[med-svn] [bbmap] 01/02: Imported Upstream version 35.85+dfsg
Andreas Tille
tille at debian.org
Sat Mar 19 08:13:07 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository bbmap.
commit e608f9369a964480d8ba04874b90e93b8d32cbc0
Author: Andreas Tille <tille at debian.org>
Date: Sat Mar 19 09:09:29 2016 +0100
Imported Upstream version 35.85+dfsg
---
README.md | 5 +
a_sample_mt.sh | 84 +
addadapters.sh | 79 +
bbcountunique.sh | 95 +
bbduk.sh | 289 +
bbduk2.sh | 288 +
bbest.sh | 68 +
bbfakereads.sh | 92 +
bbmap.sh | 340 ++
bbmapskimmer.sh | 59 +
bbmask.sh | 110 +
bbmerge-auto.sh | 63 +
bbmerge.sh | 220 +
bbnorm.sh | 172 +
bbsplit.sh | 131 +
bbsplitpairs.sh | 83 +
bbwrap.sh | 81 +
build.xml | 56 +
calcmem.sh | 132 +
calctruequality.sh | 88 +
callpeaks.sh | 79 +
clumpify.sh | 91 +
commonkmers.sh | 72 +
config/cluster16s.txt | 19 +
config/filter16s.txt | 5 +
config/histograms.txt | 16 +
config/recalibrate.txt | 32 +
config/rnaseq.txt | 4 +
config/trimadapters.txt | 5 +
countbarcodes.sh | 81 +
countgc.sh | 62 +
countsharedlines.sh | 81 +
crossblock.sh | 11 +
crosscontaminate.sh | 95 +
current/align2/AbstractIndex.java | 227 +
current/align2/AbstractMapThread.java | 3189 +++++++++++
current/align2/AbstractMapper.java | 2740 +++++++++
current/align2/BBIndex.java | 3302 +++++++++++
current/align2/BBIndex5.java | 2643 +++++++++
current/align2/BBIndexAcc.java | 2804 ++++++++++
current/align2/BBIndexPacBio.java | 2603 +++++++++
current/align2/BBIndexPacBioSkimmer.java | 2287 ++++++++
current/align2/BBMap.java | 561 ++
current/align2/BBMap5.java | 540 ++
current/align2/BBMapAcc.java | 540 ++
current/align2/BBMapPacBio.java | 529 ++
current/align2/BBMapPacBioSkimmer.java | 527 ++
current/align2/BBMapThread.java | 1364 +++++
current/align2/BBMapThread5.java | 1292 +++++
current/align2/BBMapThreadAcc.java | 1397 +++++
current/align2/BBMapThreadPacBio.java | 1301 +++++
current/align2/BBMapThreadPacBioSkimmer.java | 1758 ++++++
current/align2/BBSplitter.java | 1225 +++++
current/align2/BBWrap.java | 195 +
current/align2/BandedAligner.java | 167 +
current/align2/BandedAlignerConcrete.java | 551 ++
current/align2/BandedAlignerJNI.java | 179 +
current/align2/Blacklist.java | 101 +
current/align2/Block.java | 171 +
current/align2/ChromLoadThread.java | 112 +
current/align2/CompareSamFiles.java | 383 ++
current/align2/CompressString.java | 269 +
current/align2/Evaluate.java | 56 +
current/align2/GapTools.java | 211 +
current/align2/GradeSamFile.java | 364 ++
current/align2/Heap.java | 140 +
current/align2/Index.java | 12 +
current/align2/IndexMaker4.java | 525 ++
current/align2/IndexMaker5.java | 516 ++
current/align2/IntList.java | 165 +
current/align2/IntList2.java | 83 +
current/align2/KeyRing.java | 510 ++
current/align2/ListNum.java | 59 +
current/align2/LongList.java | 255 +
current/align2/LongM.java | 62 +
current/align2/MSA.java | 870 +++
current/align2/MakeQualityHistogram.java | 113 +
current/align2/MakeRocCurve.java | 326 ++
current/align2/MultiStateAligner10ts.java | 3577 ++++++++++++
current/align2/MultiStateAligner11ts.java | 2576 +++++++++
current/align2/MultiStateAligner11tsJNI.java | 1665 ++++++
current/align2/MultiStateAligner9Flat.java | 2541 +++++++++
current/align2/MultiStateAligner9PacBio.java | 2541 +++++++++
.../align2/MultiStateAligner9PacBioAdapter.java | 1756 ++++++
.../align2/MultiStateAligner9PacBioAdapter2.java | 1756 ++++++
...ltiStateAligner9PacBioAdapter_WithBarriers.java | 2594 +++++++++
current/align2/MultiStateAligner9XFlat.java | 2423 ++++++++
current/align2/MultiStateAligner9ts.java | 2433 ++++++++
current/align2/NeedlemanWunsch.java | 111 +
current/align2/PackedHeap.java | 186 +
current/align2/Pointer.java | 38 +
current/align2/PrintTime.java | 37 +
current/align2/Quad.java | 33 +
current/align2/Quad64.java | 35 +
current/align2/Quad64Heap.java | 219 +
current/align2/QuadHeap.java | 229 +
current/align2/QualityTools.java | 566 ++
current/align2/RandomReads3.java | 1757 ++++++
current/align2/ReadComparatorID.java | 26 +
current/align2/ReadComparatorMapping.java | 174 +
current/align2/ReadComparatorName.java | 28 +
current/align2/ReadComparatorTopological.java | 77 +
current/align2/ReadErrorComparator.java | 38 +
current/align2/ReadLengthComparator.java | 48 +
current/align2/ReadStats.java | 1304 +++++
current/align2/RefToIndex.java | 166 +
current/align2/ReformatBatchOutput.java | 217 +
current/align2/ReformatBatchOutput2.java | 61 +
current/align2/Shared.java | 146 +
current/align2/Solver.java | 243 +
current/align2/SortReadsByID.java | 283 +
current/align2/SortReadsByMapping.java | 1972 +++++++
current/align2/SortReadsTopologically.java | 605 ++
current/align2/SplitMappedReads.java | 320 ++
current/align2/Tools.java | 2319 ++++++++
current/align2/TranslateColorspaceRead.java | 2139 ++++++++
current/align2/TrimRead.java | 540 ++
current/assemble/AbstractBuildThread.java | 43 +
current/assemble/AbstractExploreThread.java | 78 +
current/assemble/AbstractRemoveThread.java | 187 +
current/assemble/AbstractShaveThread.java | 33 +
current/assemble/KmerCompressor.java | 877 +++
current/assemble/Postfilter.java | 238 +
current/assemble/ShaveObject.java | 31 +
current/assemble/Shaver.java | 225 +
current/assemble/Shaver1.java | 568 ++
current/assemble/Shaver2.java | 419 ++
current/assemble/Tadpole.java | 1433 +++++
current/assemble/Tadpole1.java | 1314 +++++
current/assemble/Tadpole2.java | 1250 +++++
current/assemble/TadpoleWrapper.java | 88 +
current/bloom/KCountArray.java | 510 ++
current/bloom/KCountArray2.java | 227 +
current/bloom/KCountArray3.java | 141 +
current/bloom/KCountArray4.java | 366 ++
current/bloom/KCountArray4MT.java | 529 ++
current/bloom/KCountArray5MT.java | 501 ++
current/bloom/KCountArray6MT.java | 505 ++
current/bloom/KCountArray7MT.java | 559 ++
current/bloom/KCountArray7MTA.java | 660 +++
current/bloom/KCountArray8MT.java | 585 ++
current/bloom/KmerCount3.java | 173 +
current/bloom/KmerCount4.java | 359 ++
current/bloom/KmerCount5.java | 462 ++
current/bloom/KmerCount6.java | 437 ++
current/bloom/KmerCount6MT.java | 705 +++
current/bloom/KmerCount7MT.java | 882 +++
current/bloom/KmerCount7MTA.java | 989 ++++
current/bloom/KmerCountAbstract.java | 53 +
current/bloom/LargeKmerCount.java | 242 +
current/bloom/LargeKmerCount2.java | 333 ++
current/bloom/TestLargeKmer.java | 192 +
current/clump/Clump.java | 124 +
current/clump/ClumpList.java | 114 +
current/clump/ClumpTools.java | 14 +
current/clump/Clumpify.java | 141 +
current/clump/Condensor.java | 27 +
current/clump/KmerComparator.java | 279 +
current/clump/KmerComparator_original.java | 129 +
current/clump/KmerReduce.java | 427 ++
current/clump/KmerSort.java | 430 ++
current/clump/KmerSplit.java | 420 ++
current/cluster/Cluster.java | 270 +
current/cluster/ClusterTools.java | 180 +
current/cluster/MergeReadHeaders.java | 332 ++
current/cluster/ReadTag.java | 107 +
current/cluster/ReclusterByKmer.java | 616 +++
current/dna/AminoAcid.java | 721 +++
current/dna/ChromArrayMaker.java | 573 ++
current/dna/ChromToFasta.java | 142 +
current/dna/ChromosomeArray.java | 429 ++
current/dna/Coverage.java | 26 +
current/dna/CoverageArray.java | 136 +
current/dna/CoverageArray2.java | 225 +
current/dna/CoverageArray3.java | 235 +
current/dna/Data.java | 1636 ++++++
current/dna/Exon.java | 175 +
current/dna/FastaToChromArrays2.java | 587 ++
current/dna/Gene.java | 1056 ++++
current/dna/GeneSet.java | 132 +
current/dna/IntMap.java | 106 +
current/dna/IntMapFlex.java | 135 +
current/dna/Matrix.java | 93 +
current/dna/Motif.java | 231 +
current/dna/MotifMulti.java | 59 +
current/dna/MotifProbsN.java | 269 +
current/dna/MotifSimple.java | 94 +
current/dna/Parser.java | 1000 ++++
current/dna/Range.java | 149 +
current/dna/ScafLoc.java | 20 +
current/dna/Scaffold.java | 84 +
current/dna/Timer.java | 28 +
current/driver/A_Sample_Textfile.java | 186 +
current/driver/ClearRam.java | 64 +
current/driver/CollateSpikeIn.java | 46 +
current/driver/CompareReferenceGenomes.java | 47 +
current/driver/CompareSequences.java | 62 +
current/driver/ConcatenateFiles.java | 92 +
current/driver/ConcatenateTextFiles.java | 189 +
current/driver/Concatenator.java | 60 +
current/driver/ConvertSamToAln.java | 73 +
current/driver/CorrelateIdentity.java | 189 +
current/driver/CountRNAs.java | 32 +
current/driver/CountSharedLines.java | 247 +
current/driver/EstherFilter.java | 167 +
current/driver/FilterLines.java | 263 +
current/driver/FilterReadsByName.java | 437 ++
current/driver/FindMotifs.java | 362 ++
current/driver/FixDumbFile.java | 70 +
current/driver/GenerateNoCallsFromCoverage.java | 426 ++
current/driver/GetSequence.java | 101 +
current/driver/Grep.java | 20 +
current/driver/LineCount.java | 16 +
current/driver/LookAtID.java | 48 +
current/driver/MakeTestScript.java | 368 ++
current/driver/MakeTestScriptScoreOnly.java | 210 +
current/driver/MeasureGene.java | 230 +
current/driver/MergeBigelow.java | 245 +
current/driver/MergeCoverageOTU.java | 66 +
current/driver/MergeTextFiles.java | 96 +
current/driver/MergeTextFiles2.java | 96 +
current/driver/MoveFiles.java | 92 +
current/driver/PrintEnv.java | 36 +
current/driver/ReduceSilva.java | 330 ++
current/driver/Sample.java | 75 +
current/driver/Search.java | 167 +
current/driver/SelectReads.java | 74 +
current/driver/SniffSplices.java | 198 +
current/driver/SummarizeCoverage.java | 113 +
current/driver/SummarizeMSDIN.java | 122 +
current/driver/SummarizeSealStats.java | 210 +
current/driver/TestCompressionSpeed.java | 79 +
current/driver/TestLockSpeed.java | 162 +
current/driver/Translator.java | 213 +
current/driver/Translator2.java | 65 +
current/driver/TransposeTextFile.java | 53 +
current/driver/TrimSamFile.java | 65 +
current/fileIO/ArrayFile.java | 73 +
current/fileIO/ByteFile.java | 93 +
current/fileIO/ByteFile1.java | 235 +
current/fileIO/ByteFile2.java | 402 ++
current/fileIO/ByteStreamWriter.java | 455 ++
current/fileIO/ChainBlock.java | 224 +
current/fileIO/ChainLine.java | 123 +
current/fileIO/CompressFiles.java | 71 +
current/fileIO/CopyFile.java | 114 +
current/fileIO/CopyFiles.java | 64 +
current/fileIO/CopyFiles2.java | 162 +
current/fileIO/FileFormat.java | 667 +++
current/fileIO/FindFiles.java | 113 +
current/fileIO/GenericTextFile.java | 36 +
current/fileIO/LoadThread.java | 139 +
current/fileIO/MatrixFile.java | 89 +
current/fileIO/PipeThread.java | 88 +
current/fileIO/ReadWrite.java | 1613 ++++++
current/fileIO/RenameFiles.java | 158 +
current/fileIO/SummaryFile.java | 172 +
current/fileIO/TextFile.java | 266 +
current/fileIO/TextStreamWriter.java | 315 ++
current/jgi/A_Sample.java | 422 ++
current/jgi/A_Sample2.java | 160 +
current/jgi/A_SampleD.java | 168 +
current/jgi/A_SampleMT.java | 529 ++
current/jgi/A_Sample_Unpaired.java | 268 +
current/jgi/AddAdapters.java | 765 +++
current/jgi/AssemblyStats2.java | 1839 +++++++
current/jgi/AssemblyStatsWrapper.java | 73 +
current/jgi/BBDuk2.java | 3772 +++++++++++++
current/jgi/BBDukF.java | 3839 +++++++++++++
current/jgi/BBMask.java | 1396 +++++
current/jgi/BBMerge.java | 2135 +++++++
current/jgi/BBMergeOverlapper.java | 845 +++
current/jgi/BBQC.java | 1091 ++++
current/jgi/BBTool_ST.java | 486 ++
current/jgi/CalcTrueQuality.java | 1699 ++++++
current/jgi/CalcTrueQuality_single.java | 1439 +++++
current/jgi/CalcUniqueness.java | 610 ++
current/jgi/CallPeaks.java | 879 +++
current/jgi/CorrelateBarcodes.java | 471 ++
current/jgi/CountBarcodes.java | 514 ++
current/jgi/CountGC.java | 389 ++
current/jgi/CountUniqueness.java | 110 +
current/jgi/CovStatsLine.java | 92 +
current/jgi/CoveragePileup.java | 1801 ++++++
current/jgi/CrossContaminate.java | 513 ++
current/jgi/CutPrimers.java | 241 +
current/jgi/DecontaminateByNormalization.java | 618 +++
current/jgi/Dedupe.java | 5796 ++++++++++++++++++++
current/jgi/Dedupe2.java | 5663 +++++++++++++++++++
current/jgi/DedupeByMapping.java | 452 ++
current/jgi/DemuxByName.java | 498 ++
current/jgi/Difference.java | 39 +
current/jgi/ErrorCorrect.java | 849 +++
current/jgi/FakeReads.java | 367 ++
current/jgi/FilterByCoverage.java | 474 ++
current/jgi/FilterBySequence.java | 844 +++
current/jgi/FilterReadsWithSubs.java | 129 +
current/jgi/FindPrimers.java | 297 +
current/jgi/FindString.java | 25 +
current/jgi/FungalRelease.java | 468 ++
current/jgi/FuseSequence.java | 187 +
current/jgi/GetReads.java | 329 ++
current/jgi/GradeMergedReads.java | 301 +
current/jgi/GreedyBarCodeFinder.java | 101 +
current/jgi/IdentityMatrix.java | 319 ++
current/jgi/Info.java | 195 +
current/jgi/KmerCountExact.java | 458 ++
current/jgi/KmerCoverage.java | 1219 ++++
current/jgi/KmerNormalize.java | 3619 ++++++++++++
current/jgi/KmerSample.java | 123 +
current/jgi/LogLog.java | 415 ++
current/jgi/MakeChimeras.java | 412 ++
current/jgi/MakeCoverageHistogram.java | 280 +
current/jgi/MakeLengthHistogram.java | 231 +
current/jgi/MateReadsMT.java | 1600 ++++++
current/jgi/MergeBarcodes.java | 487 ++
current/jgi/NormAndCorrectWrapper.java | 77 +
current/jgi/Orf.java | 111 +
current/jgi/PhylipToFasta.java | 219 +
current/jgi/RQCFilter.java | 2185 ++++++++
current/jgi/RandomGenome.java | 54 +
current/jgi/ReadKmerDepthDistribution.java | 1078 ++++
current/jgi/RedirectTest.java | 78 +
current/jgi/ReformatReads.java | 1332 +++++
current/jgi/RemapQuality.java | 115 +
current/jgi/RemoveBadBarcodes.java | 82 +
current/jgi/RenameReads.java | 364 ++
current/jgi/SamToEst.java | 487 ++
current/jgi/Seal.java | 3117 +++++++++++
current/jgi/Shred.java | 391 ++
current/jgi/Shuffle.java | 494 ++
current/jgi/SmallKmerFrequency.java | 216 +
current/jgi/SplitNexteraLMP.java | 670 +++
current/jgi/SplitPairsAndSingles.java | 809 +++
current/jgi/SplitSam4Way.java | 122 +
current/jgi/SplitSamFile.java | 87 +
current/jgi/SynthMDA.java | 447 ++
current/jgi/TranslateSixFrames.java | 455 ++
current/kmer/AbstractKmerTable.java | 494 ++
current/kmer/AbstractKmerTableSet.java | 432 ++
current/kmer/AtomicShortArray.java | 27 +
current/kmer/DumpThread.java | 73 +
current/kmer/HashArray.java | 485 ++
current/kmer/HashArray1D.java | 219 +
current/kmer/HashArray2D.java | 222 +
current/kmer/HashArrayHybrid.java | 356 ++
current/kmer/HashBuffer.java | 269 +
current/kmer/HashForest.java | 476 ++
current/kmer/KmerBuffer.java | 53 +
current/kmer/KmerLink.java | 288 +
current/kmer/KmerNode.java | 361 ++
current/kmer/KmerNode1D.java | 161 +
current/kmer/KmerNode2D.java | 237 +
current/kmer/KmerTable.java | 339 ++
current/kmer/KmerTableSet.java | 1213 ++++
current/kmer/Primes.java | 163 +
current/kmer/TableLoaderLockFree.java | 823 +++
current/kmer/TableReader.java | 645 +++
current/pacbio/CalcCoverageFromSites.java | 526 ++
current/pacbio/GenerateMultiChrom.java | 169 +
current/pacbio/MakePacBioScript.java | 443 ++
current/pacbio/MergeFastaContigs.java | 532 ++
current/pacbio/MergeReadsAndGenome.java | 189 +
current/pacbio/PartitionFastaFile.java | 90 +
current/pacbio/PartitionReads.java | 225 +
current/pacbio/ProcessStackedSitesNormalized.java | 499 ++
current/pacbio/RemoveAdapters2.java | 665 +++
current/pacbio/RemoveAdapters3.java | 623 +++
current/pacbio/RemoveNFromChromosome.java | 55 +
current/pacbio/SiteR.java | 95 +
current/pacbio/SortSites.java | 298 +
current/pacbio/SplitOffPerfectContigs.java | 392 ++
current/pacbio/StackSites.java | 312 ++
current/pacbio/StackSites2.java | 501 ++
current/stream/ArrayListSet.java | 198 +
current/stream/ByteBuilder.java | 405 ++
.../ConcurrentCollectionReadInputStream.java | 310 ++
current/stream/ConcurrentDepot.java | 35 +
.../stream/ConcurrentGenericReadInputStream.java | 806 +++
.../stream/ConcurrentGenericReadOutputStream.java | 240 +
.../stream/ConcurrentLegacyReadInputStream.java | 279 +
current/stream/ConcurrentReadInputStream.java | 252 +
current/stream/ConcurrentReadInputStreamD.java | 500 ++
current/stream/ConcurrentReadListDepot.java | 35 +
current/stream/ConcurrentReadOutputStream.java | 121 +
current/stream/ConcurrentReadOutputStreamD.java | 323 ++
current/stream/ConcurrentReadStreamInterface.java | 65 +
current/stream/CrisWrapper.java | 96 +
current/stream/DualCris.java | 223 +
current/stream/FASTQ.java | 888 +++
current/stream/FastaQualReadInputStream.java | 340 ++
current/stream/FastaReadInputStream.java | 572 ++
current/stream/FastaReadInputStream2.java | 302 +
current/stream/FastqReadInputStream.java | 164 +
current/stream/KillSwitch.java | 204 +
current/stream/MultiCros.java | 190 +
current/stream/RTextInputStream.java | 301 +
current/stream/RandomReadInputStream3.java | 188 +
current/stream/Read.java | 3395 ++++++++++++
current/stream/ReadInputStream.java | 46 +
current/stream/ReadStreamByteWriter.java | 594 ++
current/stream/ReadStreamStringWriter.java | 364 ++
current/stream/ReadStreamWriter.java | 416 ++
current/stream/SamHeader.java | 360 ++
current/stream/SamLine.java | 2230 ++++++++
current/stream/SamReadInputStream.java | 213 +
current/stream/ScaffoldCoordinates.java | 85 +
current/stream/ScarfReadInputStream.java | 138 +
current/stream/SequentialReadInputStream.java | 193 +
current/stream/SiteScore.java | 1021 ++++
current/stream/SiteScoreR.java | 285 +
.../stream/mpi/ConcurrentReadInputStreamMPI.java | 77 +
.../stream/mpi/ConcurrentReadOutputStreamMPI.java | 69 +
current/stream/mpi/MPIWrapper.java | 311 ++
current/tax/FilterByTaxa.java | 441 ++
current/tax/GiToNcbi.java | 245 +
current/tax/PrintTaxonomy.java | 301 +
current/tax/RenameGiToNcbi.java | 270 +
current/tax/SortByTaxa.java | 582 ++
current/tax/SplitByTaxa.java | 426 ++
current/tax/TaxFilter.java | 242 +
current/tax/TaxNode.java | 101 +
current/tax/TaxTree.java | 661 +++
current/ukmer/AbstractKmerTableU.java | 563 ++
current/ukmer/DumpThreadU.java | 73 +
current/ukmer/HashArrayU.java | 586 ++
current/ukmer/HashArrayU1D.java | 298 +
current/ukmer/HashArrayU2D.java | 226 +
current/ukmer/HashArrayUHybrid.java | 314 ++
current/ukmer/HashBufferU.java | 312 ++
current/ukmer/HashForestU.java | 540 ++
current/ukmer/Kmer.java | 347 ++
current/ukmer/KmerBufferU.java | 67 +
current/ukmer/KmerNodeU.java | 387 ++
current/ukmer/KmerNodeU1D.java | 161 +
current/ukmer/KmerNodeU2D.java | 237 +
current/ukmer/KmerTableSetU.java | 1141 ++++
current/var/ApplyVarsToReference.java | 325 ++
current/var/GenerateConsensusVariations.java | 247 +
current/var/GenerateVarlets.java | 664 +++
current/var/GenerateVarlets2.java | 647 +++
current/var/GenerateVarlets3.java | 865 +++
current/var/StackVariations.java | 738 +++
current/var/StackVariations2.java | 833 +++
current/var/VarLine.java | 249 +
current/var/Variation.java | 869 +++
current/var/Varlet.java | 403 ++
cutprimers.sh | 79 +
decontaminate.sh | 121 +
dedupe.sh | 154 +
dedupe2.sh | 66 +
dedupebymapping.sh | 87 +
demuxbyname.sh | 94 +
docs/Legal.txt | 9 +
docs/Legal_Illumina.txt | 3 +
docs/ToolDescriptions.txt | 520 ++
docs/UsageGuide.txt | 306 ++
docs/changelog.txt | 2134 +++++++
docs/compiling.txt | 9 +
docs/guides/A_SampleGuide.txt | 18 +
docs/guides/AddAdaptersGuide.txt | 23 +
docs/guides/BBDukGuide.txt | 178 +
docs/guides/BBMapGuide.txt | 186 +
docs/guides/BBMap_old_readme.txt | 237 +
docs/guides/BBMaskGuide.txt | 45 +
docs/guides/BBMergeGuide.txt | 68 +
docs/guides/BBNormGuide.txt | 112 +
docs/guides/CalcUniquenessGuide.txt | 43 +
docs/guides/ClumpifyGuide.txt | 40 +
docs/guides/DedupeGuide.txt | 136 +
docs/guides/PreprocessingGuide.txt | 39 +
docs/guides/ReformatGuide.txt | 140 +
docs/guides/RepairGuide.txt | 29 +
docs/guides/SealGuide.txt | 97 +
docs/guides/SplitNexteraGuide.txt | 22 +
docs/guides/StatsGuide.txt | 33 +
docs/guides/TadpoleGuide.txt | 66 +
docs/guides/TaxonomyGuide.txt | 91 +
docs/readme.txt | 34 +
docs/readme_config.txt | 30 +
docs/readme_filetypes.txt | 34 +
ecc.sh | 62 +
estherfilter.sh | 64 +
filterbarcodes.sh | 83 +
filterbycoverage.sh | 92 +
filterbyname.sh | 93 +
filterbysequence.sh | 91 +
filterbytaxa.sh | 95 +
filterlines.sh | 87 +
filtersubs.sh | 61 +
fungalrelease.sh | 94 +
fuse.sh | 89 +
getreads.sh | 61 +
gi2taxid.sh | 79 +
gitable.sh | 72 +
grademerge.sh | 52 +
gradesam.sh | 61 +
idmatrix.sh | 84 +
jni/BBMergeOverlapper.c | 523 ++
jni/BandedAlignerJNI.c | 758 +++
jni/CMakeLists.txt | 55 +
jni/MultiStateAligner11tsJNI.c | 813 +++
jni/README.txt | 15 +
jni/align2_BandedAlignerJNI.h | 47 +
jni/align2_MultiStateAligner11tsJNI.h | 179 +
jni/jgi_BBMergeOverlapper.h | 45 +
jni/makefile.linux | 16 +
jni/makefile.osx | 16 +
kcompress.sh | 105 +
khist.sh | 61 +
kmercountexact.sh | 129 +
kmercoverage.sh | 106 +
license.txt | 25 +
loglog.sh | 83 +
makechimeras.sh | 78 +
mapPacBio.sh | 57 +
matrixtocolumns.sh | 68 +
mergeOTUs.sh | 55 +
mergebarcodes.sh | 89 +
msa.sh | 79 +
phylip2fasta.sh | 76 +
pileup.sh | 144 +
postfilter.sh | 98 +
printtime.sh | 46 +
randomreads.sh | 159 +
readlength.sh | 66 +
reducesilva.sh | 73 +
reformat.sh | 206 +
removebadbarcodes.sh | 71 +
removesmartbell.sh | 63 +
rename.sh | 94 +
repair.sh | 93 +
resources/adapters.fa | 304 +
resources/contents.txt | 33 +
resources/nextera.fa.gz | Bin 0 -> 755 bytes
resources/nextera_LMP_adapter.fa.gz | Bin 0 -> 124 bytes
resources/nextera_LMP_linker.fa.gz | Bin 0 -> 117 bytes
resources/phix174_ill.ref.fa.gz | Bin 0 -> 1825 bytes
resources/phix_adapters.fa.gz | Bin 0 -> 130 bytes
resources/primes.txt.gz | Bin 0 -> 65889 bytes
resources/sample1.fq.gz | Bin 0 -> 11136 bytes
resources/sample2.fq.gz | Bin 0 -> 11110 bytes
resources/truseq.fa.gz | Bin 0 -> 269 bytes
resources/truseq_rna.fa.gz | Bin 0 -> 646 bytes
samtoroc.sh | 69 +
seal.sh | 225 +
shred.sh | 64 +
shuffle.sh | 84 +
sortbytaxa.sh | 85 +
splitbytaxa.sh | 91 +
splitnextera.sh | 101 +
splitsam.sh | 49 +
stats.sh | 93 +
statswrapper.sh | 79 +
summarizescafstats.sh | 62 +
summarizeseal.sh | 73 +
synthmda.sh | 87 +
tadpole.sh | 175 +
tadwrapper.sh | 70 +
taxonomy.sh | 80 +
taxtree.sh | 71 +
testformat.sh | 58 +
textfile.sh | 47 +
translate6frames.sh | 97 +
564 files changed, 242337 insertions(+)
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..5c67c31
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# BBTools bioinformatics tools, including BBMap.
+# Author: Brian Bushnell, Jon Rood
+# Language: Java
+# Information about documentation is in /docs/readme.txt.
+# Version 35.85
diff --git a/a_sample_mt.sh b/a_sample_mt.sh
new file mode 100755
index 0000000..21d2bbb
--- /dev/null
+++ b/a_sample_mt.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#a_sample_mt in=<infile> out=<outfile>
+
+usage(){  # Print the help text for a_sample_mt.sh to stdout.
+echo "
+Written by Brian Bushnell
+Last modified November 19, 2015
+
+Description: Does nothing. Should be fast.
+
+Usage: a_sample_mt.sh in=<input file> out=<output file>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Standard parameters:
+in=<file> Primary input, or read 1 input.
+in2=<file> Read 2 input if reads are in two files.
+out=<file> Primary output, or read 1 output.
+out2=<file> Read 2 output if reads are in two files.
+overwrite=f (ow) Set to false to force the program to abort rather than
+ overwrite an existing file.
+showspeed=t (ss) Set to 'f' to suppress display of processing speed.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression
+ level; lower compression is faster.
+
+Processing parameters:
+None yet!
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"  # start from the path this script was invoked as
+while [ -h "$DIR" ]; do  # keep resolving while DIR is a symbolic link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # directory containing the resolved script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath handed to java -cp by the launcher function
+
+z="-Xmx4g"  # default max-heap option; calcXmx may overwrite it
+z2="-Xms4g"  # default initial-heap option; assigned here and in calcXmx, but the launcher in this script does not pass it to java
+EA="-ea"  # JVM flag passed to java by the launcher (enables assertions)
+set=0  # tested by calcXmx; presumably set to 1 by parseXmx in calcmem.sh -- confirm there
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help: show help and stop
+ usage
+ exit
+fi
+
+calcXmx () {  # Decide the JVM heap options (z, z2) for this run from the script arguments.
+ source "$DIR""/calcmem.sh"  # defines parseXmx and freeRam (file not shown here)
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then  # presumably means the caller supplied an explicit -Xmx; confirm in calcmem.sh
+ return
+ fi
+ freeRam 4000m 84  # assumed to leave a megabyte count in RAM; argument meanings are defined in calcmem.sh -- confirm there
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+a_sample_mt() {  # Run jgi.A_SampleMT under java with heap option $z, classpath $CP, and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then  # only on the NERSC genepool host: switch environment modules
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ # Invoke java directly instead of eval-ing a flattened string, so arguments and paths containing spaces or shell metacharacters reach Java intact.
+ echo "java $EA $z -cp $CP jgi.A_SampleMT $*" >&2
+ java $EA $z -cp "$CP" jgi.A_SampleMT "$@"
+}
+
+a_sample_mt "$@"
diff --git a/addadapters.sh b/addadapters.sh
new file mode 100755
index 0000000..018346a
--- /dev/null
+++ b/addadapters.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#addadapters in=<infile> out=<outfile>
+
+function usage(){  # Print the help text for addadapters.sh to stdout.
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Randomly adds adapters to a file, or grades a trimmed file.
+The input is a set of reads, paired or unpaired.
+The output is those same reads with adapter sequence replacing some of the bases in some reads.
+For paired reads, adapters are located in the same position in read1 and read2.
+This is designed for benchmarking adapter-trimming software, and evaluating methodology.
+
+Usage: addadapters.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> adapters=<file>
+
+in2 and out2 are for paired reads and are optional.
+If input is paired and there is only one output file, it will be written interleaved.
+
+
+Parameters:
+ow=f (overwrite) Overwrites files that already exist.
+int=f (interleaved) Determines whether INPUT file is considered interleaved.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+add Add adapters to input files. Default mode.
+grade Evaluate trimmed input files.
+adapters=<file> Fasta file of adapter sequences.
+literal=<sequence> Comma-delimited list of adapter sequences.
+left Adapters are on the left (5') end of the read.
+right Adapters are on the right (3') end of the read. Default mode.
+adderrors=t Add errors to adapters based on the quality scores.
+addpaired=t Add adapters to the same location for read 1 and read 2.
+arc=f Add reverse-complemented adapters as well as forward.
+rate=0.5 Add adapters to this fraction of reads.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"  # start from the path this script was invoked as
+while [ -h "$DIR" ]; do  # keep resolving while DIR is a symbolic link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # directory containing the resolved script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath handed to java -cp by the launcher function
+
+z="-Xmx200m"  # default max-heap option passed to java by the launcher
+EA="-ea"  # JVM flag passed to java by the launcher (enables assertions)
+set=0  # not read in this script; presumably consumed by parseXmx in calcmem.sh -- confirm there
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help: show help and stop
+ usage
+ exit
+fi
+
+calcXmx () {  # Load calcmem.sh and hand the script arguments to parseXmx; no free-memory probing in this script.
+ source "$DIR""/calcmem.sh"  # defines parseXmx (file not shown here)
+ parseXmx "$@"  # presumably overrides z when the caller passes -Xmx; confirm in calcmem.sh
+}
+calcXmx "$@"
+
+function addadapters() {  # Run jgi.AddAdapters under java with heap option $z, classpath $CP, and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then  # only on the NERSC genepool host: load environment modules
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ # Invoke java directly instead of eval-ing a flattened string, so arguments and paths containing spaces or shell metacharacters reach Java intact.
+ echo "java $EA $z -cp $CP jgi.AddAdapters $*" >&2
+ java $EA $z -cp "$CP" jgi.AddAdapters "$@"
+}
+
+addadapters "$@"
diff --git a/bbcountunique.sh b/bbcountunique.sh
new file mode 100755
index 0000000..f6d81a6
--- /dev/null
+++ b/bbcountunique.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+#bbcountunique in=<infile> out=<outfile>
+
+usage(){  # Print the help text for bbcountunique.sh to stdout.
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Generates a kmer uniqueness histogram, binned by file position.
+There are 3 columns for single reads, 6 columns for paired:
+count number of reads or pairs processed
+r1_first percent unique 1st kmer of read 1
+r1_rand percent unique random kmer of read 1
+r2_first percent unique 1st kmer of read 2
+r2_rand percent unique random kmer of read 2
+pair percent unique concatenated kmer from read 1 and 2
+
+Usage: bbcountunique.sh in=<input> out=<output>
+
+
+Optional parameters (and their defaults)
+
+Input parameters:
+in2=null Second input file for paired reads
+interleaved=auto Set true/false to override autodetection of the input file as paired interleaved.
+samplerate=1 Set to below 1 to sample a fraction of input reads.
+reads=-1 Only process this number of reads, then quit (-1 means all)
+
+Output parameters:
+out=<file> File for output stats
+
+Processing parameters:
+k=20 Kmer length (range 1-31).
+interval=25000 Print one line to the histogram per this many reads.
+cumulative=f Show cumulative numbers rather than per-interval numbers.
+percent=t Show percentages of unique reads.
+count=f Show raw counts of unique reads.
+printlastbin=f (plb) Print a line for the final undersized bin.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"  # start from the path this script was invoked as
+while [ -h "$DIR" ]; do  # keep resolving while DIR is a symbolic link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # directory containing the resolved script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath handed to java -cp by the launcher function
+
+z="-Xmx1g"  # default max-heap option; calcXmx may overwrite it
+z2="-Xms1g"  # default initial-heap option; assigned here and in calcXmx, but the launcher in this script does not pass it to java
+EA="-ea"  # JVM flag passed to java by the launcher (enables assertions)
+set=0  # tested by calcXmx; presumably set to 1 by parseXmx in calcmem.sh -- confirm there
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help: show help and stop
+ usage
+ exit
+fi
+
+calcXmx () {  # Decide the JVM heap options (z, z2) for this run from the script arguments.
+ source "$DIR""/calcmem.sh"  # defines parseXmx and freeRam (file not shown here)
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then  # presumably means the caller supplied an explicit -Xmx; confirm in calcmem.sh
+ return
+ fi
+ freeRam 3200m 84  # assumed to leave a megabyte count in RAM; argument meanings are defined in calcmem.sh -- confirm there
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+bbcountunique() {  # Run jgi.CalcUniqueness under java with heap option $z, classpath $CP, and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then  # only on the NERSC genepool host: switch environment modules
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ # Invoke java directly instead of eval-ing a flattened string, so arguments and paths containing spaces or shell metacharacters reach Java intact.
+ echo "java $EA $z -cp $CP jgi.CalcUniqueness $*" >&2
+ java $EA $z -cp "$CP" jgi.CalcUniqueness "$@"
+}
+
+bbcountunique "$@"
diff --git a/bbduk.sh b/bbduk.sh
new file mode 100755
index 0000000..83fcd4f
--- /dev/null
+++ b/bbduk.sh
@@ -0,0 +1,289 @@
+#!/bin/bash
+#bbduk in=<file> out=<file> ref=<ref file>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 10, 2015
+
+Description: Compares reads to the kmers in a reference dataset, optionally
+allowing an edit distance. Splits the reads into two outputs - those that
+match the reference, and those that don't. Can also trim (remove) the matching
+parts of the reads rather than binning the reads.
+
+Usage: bbduk.sh in=<input file> out=<output file> ref=<contaminant files>
+
+Input may be stdin or a fasta or fastq file, compressed or uncompressed.
+If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
+fasta input, set in=stdin.fa.gz
+
+
+Input parameters:
+in=<file> Main input. in=stdin.fq will pipe from stdin.
+in2=<file> Input for 2nd read of pairs in a different file.
+ref=<file,file> Comma-delimited list of reference files.
+literal=<seq,seq> Comma-delimited list of literal reference sequences.
+touppercase=f (tuc) Change all bases upper-case.
+interleaved=auto (int) t/f overrides interleaved autodetection.
+qin=auto Input quality offset: 33 (Sanger), 64, or auto.
+reads=-1 If positive, quit after processing X reads or pairs.
+copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all
+ possible unambiguous copies. Intended for short motifs
+ or adapter barcodes, as time/memory use is exponential.
+samplerate=1 Set lower to only process a fraction of input reads.
+
+Output parameters:
+out=<file> (outnonmatch) Write reads here that do not contain
+ kmers matching the database. 'out=stdout.fq' will pipe
+ to standard out.
+out2=<file> (outnonmatch2) Use this to write 2nd read of pairs to a
+ different file.
+outm=<file> (outmatch) Write reads here that contain kmers matching
+ the database.
+outm2=<file> (outmatch2) Use this to write 2nd read of pairs to a
+ different file.
+outs=<file> (outsingle) Use this to write singleton reads whose mate
+ was trimmed shorter than minlen.
+stats=<file> Write statistics about which contaminants were detected.
+refstats=<file> Write statistics on a per-reference-file basis.
+rpkm=<file> Write RPKM for each reference sequence (for RNA-seq).
+dump=<file> Dump kmer tables to a file, in fasta format.
+duk=<file> Write statistics in duk's format. *DEPRECATED*
+nzo=t Only write statistics about ref sequences with nonzero hits.
+overwrite=t (ow) Grant permission to overwrite files.
+showspeed=t (ss) 'f' suppresses display of processing speed.
+ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
+fastawrap=70 Length of lines in fasta output.
+qout=auto Output quality offset: 33 (Sanger), 64, or auto.
+statscolumns=3 (cols) Number of columns for stats output, 3 or 5.
+ 5 includes base counts.
+rename=f Rename reads to indicate which sequences they matched.
+refnames=f Use names of reference files rather than scaffold IDs.
+trd=f Truncate read and ref names at the first whitespace.
+ordered=f Set to true to output reads in same order as input.
+maxbasesout=-1 If positive, quit after writing approximately this many
+ bases to out (outu/outnonmatch).
+maxbasesoutm=-1 If positive, quit after writing approximately this many
+ bases to outm (outmatch).
+
+Histogram output parameters:
+bhist=<file> Base composition histogram by position.
+qhist=<file> Quality histogram by position.
+qchist=<file> Count of bases with each quality value.
+aqhist=<file> Histogram of average read quality.
+bqhist=<file> Quality histogram designed for box plots.
+lhist=<file> Read length histogram.
+gchist=<file> Read GC content histogram.
+gcbins=100 Number of gchist bins. Set to 'auto' to use read length.
+
+Histograms for sam files only (requires sam format 1.4 or higher):
+ehist=<file> Errors-per-read histogram.
+qahist=<file> Quality accuracy histogram of error rates versus quality
+ score.
+indelhist=<file> Indel length histogram.
+mhist=<file> Histogram of match, sub, del, and ins rates by read location.
+idhist=<file> Histogram of read count versus percent identity.
+idbins=100 Number of idhist bins. Set to 'auto' to use read length.
+
+Processing parameters:
+k=27 Kmer length used for finding contaminants. Contaminants
+ shorter than k will not be found. k must be at least 1.
+rcomp=t Look for reverse-complements of kmers in addition to
+ forward kmers.
+maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to
+ increase sensitivity in the presence of errors.
+minkmerhits=1 (mkh) Reads need at least this many matching kmers
+ to be considered as matching the reference.
+minkmerfraction=0.0 (mkf) A read needs at least this fraction of its total
+ kmers to hit a ref, in order to be considered a match.
+ If this and minkmerhits are set, the greater is used.
+mincovfraction=0.0 (mcf) A read needs at least this fraction of its total
+ bases to be covered by ref kmers to be considered a match.
+ If specified, mcf overrides mkh and mkf.
+hammingdistance=0 (hdist) Maximum Hamming distance for ref kmers (subs only).
+ Memory use is proportional to (3*K)^hdist.
+qhdist=0 Hamming distance for query kmers; impacts speed, not memory.
+editdistance=0 (edist) Maximum edit distance from ref kmers (subs
+ and indels). Memory use is proportional to (8*K)^edist.
+hammingdistance2=0 (hdist2) Sets hdist for short kmers, when using mink.
+qhdist2=0 Sets qhdist for short kmers, when using mink.
+editdistance2=0 (edist2) Sets edist for short kmers, when using mink.
+forbidn=f (fn) Forbids matching of read kmers containing N.
+ By default, these will match a reference 'A' if
+ hdist>0 or edist>0, to increase sensitivity.
+removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is
+ match (or either is trimmed shorter than minlen).
+ Set to false to require both.
+findbestmatch=f (fbm) If multiple matches, associate read with sequence
+ sharing most kmers. Reduces speed.
+skipr1=f Don't do kmer-based operations on read 1.
+skipr2=f Don't do kmer-based operations on read 2.
+ecco=f For overlapping paired reads only. Performs error-
+ correction with BBMerge prior to kmer operations.
+recalibrate=f (recal) Recalibrate quality scores. Requires calibration
+ matrices generated by CalcTrueQuality.
+sam=<file,file> If recalibration is desired, and matrices have not already
+ been generated, BBDuk will create them from the sam file.
+
+Speed and Memory parameters:
+threads=auto (t) Set number of threads to use; default is number of
+ logical processors.
+prealloc=f Preallocate memory in table. Allows faster table loading
+ and more efficient memory usage, for a large reference.
+monitor=f Kill this process if it crashes. monitor=600,0.01 would
+ kill after 600 seconds under 1% usage.
+minrskip=1 (mns) Force minimal skip interval when indexing reference
+ kmers. 1 means use all, 2 means use every other kmer, etc.
+maxrskip=1 (mxs) Restrict maximal skip interval when indexing
+ reference kmers. Normally all are used for scaffolds<100kb,
+ but with longer scaffolds, up to maxrskip-1 are skipped.
+rskip= Set both minrskip and maxrskip to the same value.
+ If not set, rskip will vary based on sequence length.
+qskip=1 Skip query kmers to increase speed. 1 means use all.
+speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both
+ reads and reference. Increases speed and reduces memory.
+Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
+
+Trimming/Filtering/Masking parameters:
+Note - if neither ktrim nor kmask is set, the default behavior is kfilter.
+All three are mutually exclusive.
+
+ktrim=f Trim reads to remove bases matching reference kmers.
+ Values:
+ f (don't trim),
+ r (trim to the right),
+ l (trim to the left)
+kmask=f Replace bases matching ref kmers with another symbol.
+ Allows any non-whitespace character other than t or f,
+ and processes short kmers on both ends. 'kmask=lc' will
+ convert masked bases to lowercase.
+maskfullycovered=f (mfc) Only mask bases that are fully covered by kmers.
+mink=0 Look for shorter kmers at read tips down to this length,
+ when k-trimming or masking. 0 means disabled. Enabling
+ this will disable maskmiddle.
+qtrim=f Trim read ends to remove bases with quality below trimq.
+ Performed AFTER looking for kmers.
+ Values:
+ rl (trim both ends),
+ f (neither end),
+ r (right end only),
+ l (left end only),
+ w (sliding window).
+trimq=6 Regions with average quality BELOW this will be trimmed.
+minlength=10 (ml) Reads shorter than this after trimming will be
+ discarded. Pairs will be discarded if both are shorter.
+mlf=0 (minlengthfraction) Reads shorter than this fraction of
+ original length after trimming will be discarded.
+maxlength= Reads longer than this after trimming will be discarded.
+ Pairs will be discarded only if both are longer.
+minavgquality=0 (maq) Reads with average quality (after trimming) below
+ this will be discarded.
+maqb=0 If positive, calculate maq from this many initial bases.
+chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
+barcodefilter=f Remove reads with unexpected barcodes if barcodes is set,
+ or barcodes containing 'N' otherwise. A barcode must be
+ the last part of the read header.
+barcodes= Comma-delimited list of barcodes or files of barcodes.
+maxns=-1 If non-negative, reads with more Ns than this
+ (after trimming) will be discarded.
+mcb=0 (minconsecutivebases) Discard reads without at least
+ this many consecutive called bases.
+ottm=f (outputtrimmedtomatch) Output reads trimmed to shorter
+ than minlength to outm rather than discarding.
+tp=0 (trimpad) Trim this much extra around matching kmers.
+tbo=f (trimbyoverlap) Trim adapters based on where paired
+ reads overlap.
+strictoverlap=t Adjust sensitivity for trimbyoverlap mode.
+minoverlap=14 Require this many bases of overlap for detection.
+mininsert=40 Require insert size of at least this for overlap.
+ Should be reduced to 16 for small RNA sequencing.
+tpe=f (trimpairsevenly) When kmer right-trimming, trim both
+ reads to the minimum length of either.
+forcetrimleft=0 (ftl) If positive, trim bases to the left of this position
+ (exclusive, 0-based).
+forcetrimright=0 (ftr) If positive, trim bases to the right of this position
+ (exclusive, 0-based).
+forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
+forcetrimmod=0 (ftm) If positive, right-trim length to be equal to zero,
+ modulo this number.
+restrictleft=0 If positive, only look for kmer matches in the
+ leftmost X bases.
+restrictright=0 If positive, only look for kmer matches in the
+ rightmost X bases.
+mingc=0 Discard reads with GC content below this.
+maxgc=1 Discard reads with GC content above this.
+
+Entropy/Complexity parameters:
+entropy=-1 Set between 0 and 1 to filter reads with entropy below
+ that value. Higher is more stringent.
+entropywindow=50 Calculate entropy using a sliding window of this length.
+entropyk=5 Calculate entropy using kmers of this length.
+minbasefrequency=0 Discard reads with a minimum base frequency below this.
+
+Cardinality estimation:
+cardinality=f (loglog) Count unique kmers using the LogLog algorithm.
+loglogk=31 Use this kmer length for counting.
+loglogbuckets=1999 Use this many buckets for counting.
+
+Java Parameters:
+
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will
+ specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+There is a changelog at /bbmap/docs/changelog_bbduk.txt
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked as
+while [ -h "$DIR" ]; do # keep following while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory holding the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # Java classpath: the current/ directory next to this script
+NATIVELIBDIR="$DIR""jni/" # handed to java as java.library.path by the launcher
+
+z="-Xmx1g" # default Java max-heap flag; calcXmx below may replace it
+z2="-Xms1g" # default Java initial-heap flag; calcXmx below may replace it
+EA="-ea" # passed to java by the launcher (enables assertions)
+set=0 # calcXmx skips automatic sizing when this is 1; presumably set by parseXmx in calcmem.sh
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit
+fi
+
+calcXmx () {
+ # Choose the Java heap flags; calcmem.sh presumably provides parseXmx and freeRam (confirm there).
+ source "${DIR}/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set != 1 ]]; then
+ freeRam 1400m 42
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+ fi
+}
+calcXmx "$@"
+
+bbduk() {
+ # Run jgi.BBDukF; arguments are forwarded verbatim ("$@"), not re-parsed through eval.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ echo "java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDukF $*" >&2
+ java -Djava.library.path="$NATIVELIBDIR" $EA $z $z2 -cp "$CP" jgi.BBDukF "$@"
+}
+
+bbduk "$@"
diff --git a/bbduk2.sh b/bbduk2.sh
new file mode 100755
index 0000000..0c2a8f9
--- /dev/null
+++ b/bbduk2.sh
@@ -0,0 +1,288 @@
+#!/bin/bash
+#bbduk2 in=<file> out=<file> fref=<file>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 10, 2015
+
+BBDuk2 is like BBDuk but can kfilter, kmask, and ktrim in a single pass.
+It does not replace BBDuk, and is only provided to allow maximally efficient
+pipeline integration when multiple steps will be performed. The syntax is
+slightly different.
+
+Description: Compares reads to the kmers in a reference dataset, optionally
+allowing an edit distance. Splits the reads into two outputs - those that
+match the reference, and those that don't. Can also trim (remove) the matching
+parts of the reads rather than binning the reads.
+
+Usage: bbduk2.sh in=<input file> out=<output file> fref=<contaminant files>
+
+Input may be stdin or a fasta or fastq file, compressed or uncompressed.
+If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
+fasta input, set in=stdin.fa.gz
+
+
+Input parameters:
+in=<file> Main input. in=stdin.fq will pipe from stdin.
+in2=<file> Input for 2nd read of pairs in a different file.
+fref=<file,file> Comma-delimited list of fasta reference files for filtering.
+rref=<file,file> Comma-delimited list of fasta reference files for right-trimming.
+lref=<file,file> Comma-delimited list of fasta reference files for left-trimming.
+mref=<file,file> Comma-delimited list of fasta reference files for masking.
+fliteral=<seq,seq> Comma-delimited list of literal sequences for filtering.
+rliteral=<seq,seq> Comma-delimited list of literal sequences for right-trimming.
+lliteral=<seq,seq> Comma-delimited list of literal sequences for left-trimming.
+mliteral=<seq,seq> Comma-delimited list of literal sequences for masking.
+touppercase=f (tuc) Change all bases upper-case.
+interleaved=auto (int) t/f overrides interleaved autodetection.
+qin=auto Input quality offset: 33 (Sanger), 64, or auto.
+reads=-1 If positive, quit after processing X reads or pairs.
+copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all
+ possible unambiguous copies. Intended for short motifs
+ or adapter barcodes, as time/memory use is exponential.
+
+Output parameters:
+out=<file> (outnonmatch) Write reads here that do not contain
+ kmers matching the database. 'out=stdout.fq' will pipe
+ to standard out.
+out2=<file> (outnonmatch2) Use this to write 2nd read of pairs to a
+ different file.
+outm=<file> (outmatch) Write reads here that contain kmers matching
+ the database.
+outm2=<file> (outmatch2) Use this to write 2nd read of pairs to a
+ different file.
+outs=<file> (outsingle) Use this to write singleton reads whose mate
+ was trimmed shorter than minlen.
+stats=<file> Write statistics about which contaminants were detected.
+refstats=<file> Write statistics on a per-reference-file basis.
+rpkm=<file> Write RPKM for each reference sequence (for RNA-seq).
+dump=<file> Dump kmer tables to a file, in fasta format.
+nzo=t Only write statistics about ref sequences with nonzero hits.
+overwrite=t (ow) Grant permission to overwrite files.
+showspeed=t (ss) 'f' suppresses display of processing speed.
+ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
+fastawrap=80 Length of lines in fasta output.
+qout=auto Output quality offset: 33 (Sanger), 64, or auto.
+statscolumns=3 (cols) Number of columns for stats output, 3 or 5.
+ 5 includes base counts.
+rename=f Rename reads to indicate which sequences they matched.
+refnames=f Use names of reference files rather than scaffold IDs.
+trd=f Truncate read and ref names at the first whitespace.
+ordered=f Set to true to output reads in same order as input.
+
+Histogram output parameters:
+bhist=<file> Base composition histogram by position.
+qhist=<file> Quality histogram by position.
+qchist=<file> Count of bases with each quality value.
+aqhist=<file> Histogram of average read quality.
+bqhist=<file> Quality histogram designed for box plots.
+lhist=<file> Read length histogram.
+gchist=<file> Read GC content histogram.
+gcbins=100 Number of gchist bins. Set to 'auto' to use read length.
+
+Histograms for sam files only (requires sam format 1.4 or higher):
+
+ehist=<file> Errors-per-read histogram.
+qahist=<file> Quality accuracy histogram of error rates versus quality
+ score.
+indelhist=<file> Indel length histogram.
+mhist=<file> Histogram of match, sub, del, and ins rates by read location.
+idhist=<file> Histogram of read count versus percent identity.
+idbins=100 Number of idhist bins. Set to 'auto' to use read length.
+
+Processing parameters:
+k=27 Kmer length used for finding contaminants. Contaminants
+ shorter than k will not be found. k must be at least 1.
+rcomp=t Look for reverse-complements of kmers in addition to
+ forward kmers.
+maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to
+ increase sensitivity in the presence of errors.
+minkmerhits=1 (mkh) Reads need at least this many matching kmers
+ to be considered as matching the reference.
+hammingdistance=0 (hdist) Maximum Hamming distance for ref kmers (subs only).
+ Memory use is proportional to (3*K)^hdist.
+qhdist=0 Hamming distance for query kmers; impacts speed, not memory.
+editdistance=0 (edist) Maximum edit distance from ref kmers (subs
+ and indels). Memory use is proportional to (8*K)^edist.
+hammingdistance2=0 (hdist2) Sets hdist for short kmers, when using mink.
+qhdist2=0 Sets qhdist for short kmers, when using mink.
+editdistance2=0 (edist2) Sets edist for short kmers, when using mink.
+forbidn=f (fn) Forbids matching of read kmers containing N.
+ By default, these will match a reference 'A' if
+ hdist>0 or edist>0, to increase sensitivity.
+removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is
+ match (or either is trimmed shorter than minlen).
+ Set to false to require both.
+findbestmatch=f (fbm) If multiple matches, associate read with sequence
+ sharing most kmers. Reduces speed.
+skipr1=f Don't do kmer-based operations on read 1.
+skipr2=f Don't do kmer-based operations on read 2.
+ecco=f For overlapping paired reads only. Performs error-
+ correction with BBMerge prior to kmer operations.
+recalibrate=f (recal) Recalibrate quality scores. Requires calibration
+ matrices generated by CalcTrueQuality.
+sam=<file,file> If recalibration is desired, and matrices have not already
+ been generated, BBDuk will create them from the sam file.
+
+Speed and Memory parameters:
+threads=auto (t) Set number of threads to use; default is number of
+ logical processors.
+prealloc=f Preallocate memory in table. Allows faster table loading
+ and more efficient memory usage, for a large reference.
+monitor=f Kill this process if it crashes. monitor=600,0.01 would
+ kill after 600 seconds under 1% usage.
+minrskip=1 (mns) Force minimal skip interval when indexing reference
+ kmers. 1 means use all, 2 means use every other kmer, etc.
+maxrskip=1 (mxs) Restrict maximal skip interval when indexing
+ reference kmers. Normally all are used for scaffolds<100kb,
+ but with longer scaffolds, up to maxrskip-1 are skipped.
+rskip= Set both minrskip and maxrskip to the same value.
+ If not set, rskip will vary based on sequence length.
+qskip=1 Skip query kmers to increase speed. 1 means use all.
+speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both
+ reads and reference. Increases speed and reduces memory.
+Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
+
+Trimming/Filtering/Masking parameters:
+Note - for BBDuk2, kmer filtering, trimming, and masking are independent,
+and all can be performed at the same time.
+
+ktrim=f Trim reads to remove bases matching reference kmers.
+ Values:
+ f (don't trim),
+ r (trim to the right),
+ l (trim to the left)
+kmask=f Replace bases matching ref kmers with another symbol.
+ Allows any non-whitespace character other than t or f,
+ and processes short kmers on both ends. 'kmask=lc' will
+ convert masked bases to lowercase.
+mink=0 Look for shorter kmers at read tips down to this length,
+ when k-trimming or masking. 0 means disabled. Enabling
+ this will disable maskmiddle.
+qtrim=f Trim read ends to remove bases with quality below trimq.
+ Performed AFTER looking for kmers.
+ Values:
+ rl (trim both ends),
+ f (neither end),
+ r (right end only),
+ l (left end only),
+ w (sliding window)
+trimq=6 Regions with average quality BELOW this will be trimmed.
+minlength=10 (ml) Reads shorter than this after trimming will be
+ discarded. Pairs will be discarded if both are shorter.
+mlf=0 (minlengthfraction) Reads shorter than this fraction of
+ original length after trimming will be discarded.
+maxlength= Reads longer than this after trimming will be discarded.
+ Pairs will be discarded only if both are longer.
+minavgquality=0 (maq) Reads with average quality (after trimming) below
+ this will be discarded.
+maqb=0 If positive, calculate maq from this many initial bases.
+chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
+barcodefilter=f Remove reads with unexpected barcodes if barcodes is set,
+ or barcodes containing 'N' otherwise. A barcode must be
+ the last part of the read header.
+barcodes= Comma-delimited list of barcodes or files of barcodes.
+maxns=-1 If non-negative, reads with more Ns than this
+ (after trimming) will be discarded.
+mcb=0 (minconsecutivebases) Discard reads without at least
+ this many consecutive called bases.
+ottm=f (outputtrimmedtomatch) Output reads trimmed to shorter
+ than minlength to outm rather than discarding.
+tp=0 (trimpad) Trim this much extra around matching kmers.
+tbo=f (trimbyoverlap) Trim adapters based on where paired
+ reads overlap.
+strictoverlap=t Adjust sensitivity for trimbyoverlap mode.
+minoverlap=14 Require this many bases of overlap for detection.
+mininsert=50 Require insert size of at least this for overlap.
+ Should be reduced to 16 for small RNA sequencing.
+tpe=f (trimpairsevenly) When kmer right-trimming, trim both
+ reads to the minimum length of either.
+forcetrimleft=0 (ftl) If positive, trim bases to the left of this position
+ (exclusive, 0-based).
+forcetrimright=0 (ftr) If positive, trim bases to the right of this position
+ (exclusive, 0-based).
+forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
+forcetrimmod=0 (ftm) If positive, right-trim length to be equal to zero,
+ modulo this number.
+restrictleft=0 If positive, only look for kmer matches in the
+ leftmost X bases.
+restrictright=0 If positive, only look for kmer matches in the
+ rightmost X bases.
+mingc=0 Discard reads with GC content below this.
+maxgc=1 Discard reads with GC content above this.
+
+Entropy/Complexity parameters:
+entropy=-1 Set between 0 and 1 to filter reads with entropy below
+ that value. Higher is more stringent.
+entropywindow=50 Calculate entropy using a sliding window of this length.
+entropyk=5 Calculate entropy using kmers of this length.
+minbasefrequency=0 Discard reads with a minimum base frequency below this.
+
+Cardinality estimation:
+cardinality=f (loglog) Count unique kmers using the LogLog algorithm.
+loglogk=31 Use this kmer length for counting.
+loglogbuckets=1999 Use this many buckets for counting.
+
+Java Parameters:
+
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will
+ specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+There is a changelog at /bbmap/docs/changelog_bbduk.txt
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked as
+while [ -h "$DIR" ]; do # keep following while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory holding the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # Java classpath: the current/ directory next to this script
+NATIVELIBDIR="$DIR""jni/" # handed to java as java.library.path by the launcher
+
+z="-Xmx1g" # default Java max-heap flag; calcXmx below may replace it
+z2="-Xms1g" # default Java initial-heap flag; calcXmx below may replace it
+EA="-ea" # passed to java by the launcher (enables assertions)
+set=0 # calcXmx skips automatic sizing when this is 1; presumably set by parseXmx in calcmem.sh
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit
+fi
+
+calcXmx () {
+ # Choose the Java heap flags; calcmem.sh presumably provides parseXmx and freeRam (confirm there).
+ source "${DIR}/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set != 1 ]]; then
+ freeRam 1400m 42
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+ fi
+}
+calcXmx "$@"
+
+bbduk2() {
+ # Run jgi.BBDuk2; arguments are forwarded verbatim ("$@"), not re-parsed through eval.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ echo "java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDuk2 $*" >&2
+ java -Djava.library.path="$NATIVELIBDIR" $EA $z $z2 -cp "$CP" jgi.BBDuk2 "$@"
+}
+
+bbduk2 "$@"
diff --git a/bbest.sh b/bbest.sh
new file mode 100755
index 0000000..3a7f4a3
--- /dev/null
+++ b/bbest.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#bbest in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 4, 2015
+
+Description: Calculates EST (expressed sequence tags) capture by an assembly from a sam file.
+Designed to use BBMap output generated with these flags: k=13 maxindel=100000 customtag ordered
+
+Usage: bbest.sh in=<sam file> out=<stats file>
+
+Parameters:
+in=<file> Specify a sam file (or stdin) containing mapped ests.
+out=<file> Specify the output stats file (default is stdout).
+ref=<file> Specify the reference file (optional).
+est=<file> Specify the est fasta file (optional).
+fraction=<0.98> Min fraction of bases mapped to ref to be
+ considered 'all mapped'.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked as
+while [ -h "$DIR" ]; do # keep following while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory holding the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # Java classpath: the current/ directory next to this script
+
+z="-Xmx120m" # default Java max-heap flag passed by the launcher
+z2="-Xms120m" # default Java initial-heap flag; NOTE(review): the launcher below never passes $z2 to java
+EA="-ea" # passed to java by the launcher (enables assertions)
+set=0 # not read anywhere in this script; presumably consumed by calcmem.sh - confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit
+fi
+
+calcXmx () {
+ . "${DIR}/calcmem.sh" # load the shared helper script (presumably defines parseXmx)
+ parseXmx "$@" # hand every command-line argument to parseXmx
+}
+calcXmx "$@"
+
+function bbest() {
+ # Run jgi.SamToEst; arguments are forwarded verbatim ("$@"), not re-parsed through eval.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load samtools
+ module load pigz
+ fi
+# echo "java $EA $z -cp $CP jgi.SamToEst $*" >&2
+ java $EA $z -cp "$CP" jgi.SamToEst "$@"
+}
+
+bbest "$@"
diff --git a/bbfakereads.sh b/bbfakereads.sh
new file mode 100755
index 0000000..7230ff2
--- /dev/null
+++ b/bbfakereads.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+#fakereads in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Generates fake read pairs from ends of contigs or single reads.
+
+Usage: bbfakereads.sh in=<file> out=<outfile> out2=<outfile2>
+
+Out2 is optional; if there is only one output file, it will be written interleaved.
+
+
+Standard parameters:
+ow=f (overwrite) Overwrites files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+fastawrap=100 Length of lines in fasta output.
+tuc=f (touppercase) Change lowercase letters in reads to uppercase.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+qfin=<.qual file> Read qualities from this qual file, for the reads coming from 'in=<fasta file>'
+qfout=<.qual file> Write qualities to this qual file, for the reads going to 'out=<fasta file>'
+qfout2=<.qual file> Write qualities to this qual file, for the reads going to 'out2=<fasta file>'
+verifyinterleaved=f (vint) When true, checks a file to see if the names look paired. Prints an error message if not.
+tossbrokenreads=f (tbr) Discard reads that have different numbers of bases and qualities. By default this will be detected and cause a crash.
+
+Faking parameters:
+length=250 Generate reads of this length.
+minlength=1 Don't generate reads shorter than this.
+overlap=0 If you set overlap, then reads will be variable length, overlapping by 'overlap' in the middle.
+identifier=null (id) Output read names are prefixed with this.
+addspace=t Set to false to omit the space before /1 and /2 of paired reads.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Supported input formats are fastq, fasta, fast+qual, scarf, and bread (BBMap's native format)
+Supported output formats are fastq, fasta, fast+qual, bread
+Supported compression formats are gz, zip, and bz2
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked as
+while [ -h "$DIR" ]; do # keep following while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory holding the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # Java classpath: the current/ directory next to this script
+
+z="-Xmx600m" # default Java max-heap flag passed by the launcher
+EA="-ea" # passed to java by the launcher (enables assertions)
+set=0 # not read anywhere in this script; presumably consumed by calcmem.sh - confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit
+fi
+
+
+calcXmx () {
+ . "${DIR}/calcmem.sh" # load the shared helper script (presumably defines parseXmx)
+ parseXmx "$@" # hand every command-line argument to parseXmx
+}
+calcXmx "$@"
+
+function fakereads() {
+ # Run jgi.FakeReads; arguments are forwarded verbatim ("$@"), not re-parsed through eval.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ echo "java $EA $z -cp $CP jgi.FakeReads $*" >&2
+ java $EA $z -cp "$CP" jgi.FakeReads "$@"
+}
+
+fakereads "$@"
diff --git a/bbmap.sh b/bbmap.sh
new file mode 100755
index 0000000..019e9d1
--- /dev/null
+++ b/bbmap.sh
@@ -0,0 +1,340 @@
+#!/bin/bash
+#bbmap in=<infile> out=<outfile>
+
+usage(){
+echo "
+BBMap v35.x
+Written by Brian Bushnell, from Dec. 2010 - present
+Last modified December 15, 2015
+
+Description: Fast and accurate splice-aware read aligner.
+
+To index: bbmap.sh ref=<reference fasta>
+To map: bbmap.sh in=<reads> out=<output sam>
+To map without writing an index:
+ bbmap.sh ref=<reference fasta> in=<reads> out=<output sam> nodisk
+
+in=stdin will accept reads from standard in, and out=stdout will write to
+standard out, but file extensions are still needed to specify the format of the
+input and output files e.g. in=stdin.fa.gz will read gzipped fasta from
+standard in; out=stdout.sam.gz will write gzipped sam.
+
+Indexing Parameters (required when building the index):
+
+nodisk=f Set to true to build index in memory and write nothing
+ to disk except output.
+ref=<file> Specify the reference sequence. Only do this ONCE,
+ when building the index (unless using 'nodisk').
+build=1 If multiple references are indexed in the same directory,
+ each needs a unique numeric ID (unless using 'nodisk').
+k=13 Kmer length, range 8-15. Longer is faster but uses
+ more memory. Shorter is more sensitive.
+ If indexing and mapping are done in two steps, K should
+ be specified each time.
+path=<.> Specify the location to write the index, if you don't
+ want it in the current working directory.
+usemodulo=f Throw away ~80% of kmers based on remainder modulo a
+ number (reduces RAM by 50% and sensitivity slightly).
+ Should be enabled both when building the index AND
+ when mapping.
+rebuild=f Force a rebuild of the index (ref= should be set).
+
+Input Parameters:
+
+build=1 Designate index to use. Corresponds to the number
+ specified when building the index.
+in=<file> Primary reads input; required parameter.
+in2=<file> For paired reads in two files.
+interleaved=auto True forces paired/interleaved input; false forces
+ single-ended mapping. If not specified, interleaved
+ status will be autodetected from read names.
+fastareadlen=500 Break up FASTA reads longer than this. Max is 500 for
+ BBMap and 6000 for BBMapPacBio. Only works for FASTA
+ input (use 'maxlen' for FASTQ input). The default for
+ bbmap.sh is 500, and for mapPacBio.sh is 6000.
+unpigz=f Spawn a pigz (parallel gzip) process for faster
+ decompression than using Java.
+ Requires pigz to be installed.
+touppercase=t (tuc) Convert lowercase letters in reads to upper case
+ (otherwise they will not match the reference).
+
+Sampling Parameters:
+
+reads=-1 Set to a positive number N to only process the first N
+ reads (or pairs), then quit. -1 means use all reads.
+samplerate=1 Set to a number from 0 to 1 to randomly select that
+ fraction of reads for mapping. 1 uses all reads.
+skipreads=0 Set to a number N to skip the first N reads (or pairs),
+ then map the rest.
+
+Mapping Parameters:
+
+fast=f This flag is a macro which sets other parameters to run
+ faster, at reduced sensitivity. Bad for RNA-seq.
+slow=f This flag is a macro which sets other parameters to run
+ slower, at greater sensitivity. 'vslow' is even slower.
+maxindel=16000 Don't look for indels longer than this. Lower is faster.
+ Set to >=100k for RNAseq with long introns like mammals.
+strictmaxindel=f When enabled, do not allow indels longer than 'maxindel'.
+ By default these are not sought, but may be found anyway.
+minid=0.76 Approximate minimum alignment identity to look for.
+ Higher is faster and less sensitive.
+minhits=1 Minimum number of seed hits required for candidate sites.
+ Higher is faster.
+local=f Set to true to use local, rather than global, alignments.
+ This will soft-clip ugly ends of poor alignments.
+perfectmode=f Allow only perfect mappings when set to true (very fast).
+semiperfectmode=f Allow only perfect and semiperfect (perfect except for
+ N's in the reference) mappings.
+threads=auto (t) Set to number of threads desired. By default, uses
+ all cores available.
+ambiguous=best (ambig) Set behavior on ambiguously-mapped reads (with
+ multiple top-scoring mapping locations).
+ best (use the first best site)
+ toss (consider unmapped)
+ random (select one top-scoring site randomly)
+ all (retain all top-scoring sites)
+samestrandpairs=f (ssp) Specify whether paired reads should map to the
+ same strand or opposite strands.
+requirecorrectstrand=t (rcs) Forbid pairing of reads without correct strand
+ orientation. Set to false for long-mate-pair libraries.
+killbadpairs=f (kbp) If a read pair is mapped with an inappropriate
+ insert size or orientation, the read with the lower
+ mapping quality is marked unmapped.
+pairedonly=f (po) Treat unpaired reads as unmapped. Thus they will
+ be sent to 'outu' but not 'outm'.
+rcomp=f Reverse complement both reads prior to mapping (for LMP
+ outward-facing libraries).
+rcompmate=f Reverse complement read2 prior to mapping.
+pairlen=32000 Set max allowed distance between paired reads.
+ (insert size)=(pairlen)+(read1 length)+(read2 length)
+rescuedist=1200 Don't try to rescue paired reads if avg. insert size
+ greater than this. Lower is faster.
+rescuemismatches=32 Maximum mismatches allowed in a rescued read. Lower
+ is faster.
+averagepairdist=100 (apd) Initial average distance between paired reads.
+ Varies dynamically; does not need to be specified.
+bandwidthratio=0 (bwr) If above zero, restrict alignment band to this
+ fraction of read length. Faster but less accurate.
+usejni=f (jni) Do alignments faster, in C code. Requires
+ compiling the C code; details are in /jni/README.txt.
+maxsites2=800 Don't analyze (or print) more than this many alignments
+ per read.
+monitor=f Kill this process if CPU usage drops to zero for
+ a long time. monitor=600,0.01 would kill after 600
+ seconds under 1% usage.
+
+Quality and Trimming Parameters:
+
+qin=auto Set to 33 or 64 to specify input quality value ASCII
+ offset. 33 is Sanger, 64 is old Solexa.
+qout=auto Set to 33 or 64 to specify output quality value ASCII
+ offset (only if output format is fastq).
+qtrim=f Quality-trim ends before mapping. Options are:
+ 'f' (false), 'l' (left), 'r' (right), and 'lr' (both).
+untrim=f Undo trimming after mapping. Untrimmed bases will be
+ soft-clipped in cigar strings.
+trimq=6 Trim regions with average quality below this
+ (phred algorithm).
+mintrimlength=60 (mintl) Don't trim reads to be shorter than this.
+fakefastaquality=-1 (ffq) Set to a positive number 1-50 to generate fake
+ quality strings for fasta input reads.
+ignorebadquality=f (ibq) Keep going, rather than crashing, if a read has
+ out-of-range quality values.
+usequality=t Use quality scores when determining which read kmers
+ to use as seeds.
+minaveragequality=0 (maq) Discard reads with average quality below this.
+maqb=0 If positive, calculate maq from this many initial bases.
+
+Output Parameters:
+
+out=<file> Write all reads to this file.
+outu=<file> Write only unmapped reads to this file. Does not
+ include unmapped paired reads with a mapped mate.
+outm=<file> Write only mapped reads to this file. Includes
+ unmapped paired reads with a mapped mate.
+mappedonly=f If true, treats 'out' like 'outm'.
+bamscript=<file> (bs) Write a shell script to <file> that will turn
+ the sam output into a sorted, indexed bam file.
+ordered=f Set to true to output reads in same order as input.
+ Slower and uses more memory.
+overwrite=f (ow) Allow process to overwrite existing files.
+secondary=f Print secondary alignments.
+sssr=0.95 (secondarysitescoreratio) Print only secondary alignments
+ with score of at least this fraction of primary.
+ssao=f (secondarysiteasambiguousonly) Only print secondary
+ alignments for ambiguously-mapped reads.
+maxsites=5 Maximum number of total alignments to print per read.
+ Only relevant when secondary=t.
+quickmatch=f Generate cigar strings more quickly.
+trimreaddescriptions=f (trd) Truncate read and ref names at the first whitespace,
+ assuming that the remainder is a comment or description.
+ziplevel=2 (zl) Compression level for zip or gzip output.
+pigz=f Spawn a pigz (parallel gzip) process for faster
+ compression than Java. Requires pigz to be installed.
+machineout=f Set to true to output statistics in machine-friendly
+ 'key=value' format.
+printunmappedcount=f Print the total number of unmapped reads and bases.
+ If input is paired, the number will be of pairs
+ for which both reads are unmapped.
+showprogress=0 If positive, print a '.' every X reads.
+showprogress2=0 If positive, print the number of seconds since the
+ last progress update (instead of a '.').
+
+Post-Filtering Parameters:
+
+idfilter=0 Independent of minid; sets exact minimum identity
+ allowed for alignments to be printed. Range 0 to 1.
+subfilter=-1 Ban alignments with more than this many substitutions.
+insfilter=-1 Ban alignments with more than this many insertions.
+delfilter=-1 Ban alignments with more than this many deletions.
+indelfilter=-1 Ban alignments with more than this many indels.
+editfilter=-1 Ban alignments with more than this many edits.
+inslenfilter=-1 Ban alignments with an insertion longer than this.
+dellenfilter=-1 Ban alignments with a deletion longer than this.
+
+Sam flags and settings:
+
+noheader=f Disable generation of header lines.
+sam=1.4 Set to 1.4 to write Sam version 1.4 cigar strings,
+ with = and X, or 1.3 to use M.
+saa=t (secondaryalignmentasterisks) Use asterisks instead of
+ bases for sam secondary alignments.
+cigar=t Set to 'f' to skip generation of cigar strings (faster).
+keepnames=f Keep original names of paired reads, rather than
+ ensuring both reads have the same name.
+intronlen=999999999 Set to a lower number like 10 to change 'D' to 'N' in
+ cigar strings for deletions of at least that length.
+rgid= Set readgroup ID. All other readgroup fields
+ can be set similarly, with the flag rgXX=
+mdtag=f Write MD tags.
+nhtag=f Write NH tags.
+xmtag=f Write XM tags (may only work correctly with ambig=all).
+amtag=f Write AM tags.
+nmtag=f Write NM tags.
+xstag=f Set to 'xs=fs', 'xs=ss', or 'xs=us' to write XS tags
+ for RNAseq using firststrand, secondstrand, or
+ unstranded libraries. Needed by Cufflinks.
+ JGI mainly uses 'firststrand'.
+stoptag=f Write a tag indicating read stop location, prefixed by YS:i:
+lengthtag=f Write a tag indicating (query,ref) alignment lengths,
+ prefixed by YL:Z:
+idtag=f Write a tag indicating percent identity, prefixed by YI:f:
+inserttag=f Write a tag indicating insert size, prefixed by X8:Z:
+scoretag=f Write a tag indicating BBMap's raw score, prefixed by YR:i:
+timetag=f Write a tag indicating this read's mapping time, prefixed by X0:i:
+boundstag=f Write a tag indicating whether either read in the pair
+ goes off the end of the reference, prefixed by XB:Z:
+notags=f Turn off all optional tags.
+
+Histogram and statistics output parameters:
+
+scafstats=<file> Statistics on how many reads mapped to which scaffold.
+refstats=<file> Statistics on how many reads mapped to which reference
+ file; only for BBSplit.
+sortscafs=t Sort scaffolds or references by read count.
+bhist=<file> Base composition histogram by position.
+qhist=<file> Quality histogram by position.
+aqhist=<file> Histogram of average read quality.
+bqhist=<file> Quality histogram designed for box plots.
+lhist=<file> Read length histogram.
+ihist=<file> Write histogram of insert sizes (for paired reads).
+ehist=<file> Errors-per-read histogram.
+qahist=<file> Quality accuracy histogram of error rates versus
+ quality score.
+indelhist=<file> Indel length histogram.
+mhist=<file> Histogram of match, sub, del, and ins rates by
+ read location.
+gchist=<file> Read GC content histogram.
+gcbins=100 Number of gchist bins. Set to 'auto' to use read length.
+idhist=<file> Histogram of read count versus percent identity.
+idbins=100 Number of idhist bins. Set to 'auto' to use read length.
+statsfile=stderr Mapping statistics are printed here.
+
+Coverage output parameters (these may reduce speed and use more RAM):
+
+covstats=<file> Per-scaffold coverage info.
+rpkm=<file> Per-scaffold RPKM/FPKM counts.
+covhist=<file> Histogram of # occurrences of each depth level.
+basecov=<file> Coverage per base location.
+bincov=<file> Print binned coverage per location (one line per X bases).
+covbinsize=1000 Set the binsize for binned coverage output.
+nzo=t Only print scaffolds with nonzero coverage.
+twocolumn=f Change to true to print only ID and Avg_fold instead of
+ all 6 columns to the 'out=' file.
+32bit=f Set to true if you need per-base coverage over 64k.
+strandedcov=f Track coverage for plus and minus strand independently.
+startcov=f Only track start positions of reads.
+secondarycov=t Include coverage of secondary alignments.
+physcov=f Calculate physical coverage for paired reads.
+ This includes the unsequenced bases.
+delcoverage=t (delcov) Count bases covered by deletions as covered.
+ True is faster than false.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage,
+ overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx800m
+ will specify 800 megs. The max is typically 85% of
+ physical memory. The human genome requires around 24g,
+ or 12g with the 'usemodulo' flag. The index uses
+ roughly 6 bytes per reference base.
+
+This list is not complete. For more information, please consult
+$DIR""docs/readme.txt.
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter
+any problems, or post at: http://seqanswers.com/forums/showthread.php?t=41057
+"
+}
+
+pushd . > /dev/null # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do # while DIR is a symlink, step into its directory and follow the link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp in the launcher function
+NATIVELIBDIR="$DIR""jni/" # handed to java as -Djava.library.path in the launcher function
+
+z="-Xmx1g" # initial heap flags; overwritten in calcXmx unless it returns early
+z2="-Xms1g" # NOTE(review): assigned here and in calcXmx, but the bbmap launcher passes only $z
+EA="-ea"
+set=0 # checked in calcXmx after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+calcXmx () { # pick JVM heap flags: keep what parseXmx decided if set==1, otherwise derive them from $RAM
+ source "$DIR""/calcmem.sh" # DIR already ends in '/', so this path contains a harmless double slash
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then # NOTE(review): presumably parseXmx sets set=1 when -Xmx was given explicitly - confirm in calcmem.sh
+ return
+ fi
+ freeRam 3200m 84 # NOTE(review): arguments look like (fallback size, percent of memory to use) - confirm in calcmem.sh; result is read from $RAM
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+
+bbmap() { # Run align2.BBMap in a JVM with the script's default flags, forwarding every script argument unchanged.
+ if [[ $NERSC_HOST == genepool ]]; then # module setup applied only when NERSC_HOST is genepool
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local -a CMD=(java -Djava.library.path="$NATIVELIBDIR" $EA $z -cp "$CP" align2.BBMap build=1 overwrite=true fastareadlen=500 "$@") # argv array keeps each argument one word; $EA/$z stay unquoted so they may expand to zero or several flags
+ echo "${CMD[*]}" >&2 # log the command line on stderr
+ "${CMD[@]}" # execute directly (no eval) so arguments are never re-parsed by the shell
+}
+
+bbmap "$@"
diff --git a/bbmapskimmer.sh b/bbmapskimmer.sh
new file mode 100755
index 0000000..7735110
--- /dev/null
+++ b/bbmapskimmer.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+#This is a version of BBMap designed to find all sites above a given threshold,
+#rather than the single best site.
+
+usage(){ # Show help by delegating to bbmap.sh, which prints its own usage when run without arguments.
+ bash "$DIR"bbmap.sh # relies on DIR (script directory with trailing slash) being set before usage is called
+}
+
+pushd . > /dev/null # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do # while DIR is a symlink, step into its directory and follow the link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp in the launcher function
+NATIVELIBDIR="$DIR""jni/" # handed to java as -Djava.library.path in the launcher function
+
+z="-Xmx1g" # initial heap flags; overwritten in calcXmx unless it returns early
+z2="-Xms1g" # NOTE(review): assigned here and in calcXmx, but the launcher passes only $z
+EA="-ea"
+set=0 # checked in calcXmx after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+calcXmx () { # pick JVM heap flags: keep what parseXmx decided if set==1, otherwise derive them from $RAM
+ source "$DIR""/calcmem.sh" # DIR already ends in '/', so this path contains a harmless double slash
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then # NOTE(review): presumably parseXmx sets set=1 when -Xmx was given explicitly - confirm in calcmem.sh
+ return
+ fi
+ freeRam 3200m 84 # NOTE(review): arguments look like (fallback size, percent of memory to use) - confirm in calcmem.sh; result is read from $RAM
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+mapPacBioSkimmer() { # Run align2.BBMapPacBioSkimmer in a JVM with the script's default flags, forwarding every script argument unchanged.
+ if [[ $NERSC_HOST == genepool ]]; then # module setup applied only when NERSC_HOST is genepool
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local -a CMD=(java -Djava.library.path="$NATIVELIBDIR" $EA $z -cp "$CP" align2.BBMapPacBioSkimmer build=1 overwrite=true minratio=0.40 fastareadlen=6000 ambig=all minscaf=100 startpad=10000 stoppad=10000 midpad=6000 "$@") # argv array keeps each argument one word; $EA/$z stay unquoted so they may expand to zero or several flags
+ echo "${CMD[*]}" >&2 # log the command line on stderr
+ "${CMD[@]}" # execute directly (no eval) so arguments are never re-parsed by the shell
+}
+
+mapPacBioSkimmer "$@"
diff --git a/bbmask.sh b/bbmask.sh
new file mode 100755
index 0000000..df4f552
--- /dev/null
+++ b/bbmask.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+#bbmask in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified May 19, 2015
+
+Description: Masks sequences of low-complexity, or containing repeat kmers, or covered by mapped reads.
+By default this program will mask using entropy with a window=80 and entropy=0.75
+
+Usage: bbmask.sh in=<file> out=<file> sam=<file,file,...file>
+
+Input may be stdin or a fasta or fastq file, raw or gzipped.
+sam is optional, but may be a comma-delimited list of sam files to mask.
+If you pipe via stdin/stdout, please include the file type; e.g. for gzipped fasta input, set in=stdin.fa.gz
+
+
+Input parameters:
+in=<file> Input sequences to mask. 'in=stdin.fa' will pipe from standard in.
+sam=<file,file> Comma-delimited list of sam files. Optional. Their mapped coordinates will be masked.
+touppercase=f (tuc) Change all letters to upper-case.
+interleaved=auto (int) If true, forces fastq input to be paired and interleaved.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+
+Output parameters:
+out=<file> Write masked sequences here. 'out=stdout.fa' will pipe to standard out.
+overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
+fastawrap=70 Length of lines in fasta output.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+
+Processing parameters:
+threads=auto (t) Set number of threads to use; default is number of logical processors.
+maskrepeats=f (mr) Mask areas covered by exact repeat kmers.
+kr=5 Kmer size to use for repeat detection (1-15). Use minkr and maxkr to sweep a range of kmers.
+minlen=40 Minimum length of repeat area to mask.
+mincount=4 Minimum number of repeats to mask.
+masklowentropy=t (mle) Mask areas with low complexity by calculating entropy over a window for a fixed kmer size.
+ke=5 Kmer size to use for entropy calculation (1-15). Use minke and maxke to sweep a range. Large ke uses more memory.
+window=80 (w) Window size for entropy calculation.
+entropy=0.70 (e) Mask windows with entropy under this value (0-1). 0.0001 will mask only homopolymers and 1 will mask everything.
+lowercase=f (lc) Convert masked bases to lower case. Default is to convert them to N.
+split=f Split into unmasked pieces and discard masked pieces.
+
+Coverage parameters (only relevant if sam files are specified):
+mincov=-1 If nonnegative, mask bases with coverage outside this range.
+maxcov=-1 If nonnegative, mask bases with coverage outside this range.
+delcov=t Include deletions when calculating coverage.
+NOTE: If neither mincov nor maxcov are set, all covered bases will be masked.
+
+Other parameters:
+pigz=t Use pigz to compress. If argument is a number, that will set the number of pigz threads.
+unpigz=t Use pigz to decompress.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do # while DIR is a symlink, step into its directory and follow the link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp in the launcher function
+
+z="-Xmx1g" # initial heap flags; overwritten in calcXmx unless it returns early
+z2="-Xms1g" # NOTE(review): assigned here and in calcXmx, but the launcher passes only $z
+EA="-ea"
+set=0 # checked in calcXmx after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+calcXmx () { # pick JVM heap flags: keep what parseXmx decided if set==1, otherwise derive them from $RAM
+ source "$DIR""/calcmem.sh" # DIR already ends in '/', so this path contains a harmless double slash
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then # NOTE(review): presumably parseXmx sets set=1 when -Xmx was given explicitly - confirm in calcmem.sh
+ return
+ fi
+ freeRam 3200m 42 # NOTE(review): arguments look like (fallback size, percent of memory to use) - confirm in calcmem.sh; result is read from $RAM
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+bbmask() { # Run jgi.BBMask in a JVM, forwarding every script argument unchanged.
+ if [[ $NERSC_HOST == genepool ]]; then # module setup applied only when NERSC_HOST is genepool
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local -a CMD=(java $EA $z -cp "$CP" jgi.BBMask "$@") # argv array keeps each argument one word; $EA/$z stay unquoted so they may expand to zero or several flags
+ echo "${CMD[*]}" >&2 # log the command line on stderr
+ "${CMD[@]}" # execute directly (no eval) so arguments are never re-parsed by the shell
+}
+
+bbmask "$@"
diff --git a/bbmerge-auto.sh b/bbmerge-auto.sh
new file mode 100755
index 0000000..2999e86
--- /dev/null
+++ b/bbmerge-auto.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#merge in=<infile> out=<outfile>
+
+function usage(){ # Print a short description of this wrapper; the text refers the reader to bbmerge.sh for parameters.
+echo "
+bbmerge-auto.sh is a wrapper for BBMerge that attempts to use all available
+memory, instead of a fixed amount. This is for use with the Tadpole options
+of error-correction (ecct) and extension, which require more memory.
+For merging by overlap only, please use bbmerge.sh. If you set memory
+manually with the -Xmx flag, bbmerge.sh and bbmerge-auto.sh are equivalent.
+
+For information about usage and parameters, please run bbmerge.sh.
+"
+}
+
+pushd . > /dev/null # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do # while DIR is a symlink, step into its directory and follow the link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp in the launcher function
+NATIVELIBDIR="$DIR""jni/" # handed to java as -Djava.library.path in the launcher function
+
+z="-Xmx14g" # initial heap flags; overwritten in calcXmx unless it returns early
+z2="-Xms14g" # unlike the sibling scripts, this launcher passes both $z and $z2 to java
+EA="-ea"
+set=0 # checked in calcXmx after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+calcXmx () { # pick JVM heap flags: keep what parseXmx decided if set==1, otherwise derive them from $RAM
+ source "$DIR""/calcmem.sh" # DIR already ends in '/', so this path contains a harmless double slash
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then # NOTE(review): presumably parseXmx sets set=1 when -Xmx was given explicitly - confirm in calcmem.sh
+ return
+ fi
+ freeRam 15000m 84 # NOTE(review): arguments look like (fallback size, percent of memory to use) - confirm in calcmem.sh; result is read from $RAM
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+function merge() { # Run jgi.BBMerge in a JVM with both -Xmx and -Xms flags, forwarding every script argument unchanged.
+ if [[ $NERSC_HOST == genepool ]]; then # module setup applied only when NERSC_HOST is genepool
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local -a CMD=(java -Djava.library.path="$NATIVELIBDIR" $EA $z $z2 -cp "$CP" jgi.BBMerge "$@") # argv array keeps each argument one word; $EA/$z/$z2 stay unquoted so they may expand to zero or several flags
+ echo "${CMD[*]}" >&2 # log the command line on stderr
+ "${CMD[@]}" # execute directly (no eval) so arguments are never re-parsed by the shell
+}
+
+merge "$@"
diff --git a/bbmerge.sh b/bbmerge.sh
new file mode 100755
index 0000000..880a267
--- /dev/null
+++ b/bbmerge.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+#merge in=<infile> out=<outfile>
+
+function usage(){
+echo "
+BBMerge v8.82
+Written by Brian Bushnell and Jonathan Rood
+Last modified October 27, 2015
+
+Description: Merges paired reads into single reads by overlap detection.
+With sufficient coverage, can also merge nonoverlapping reads by kmer extension.
+
+Usage for interleaved files: bbmerge.sh in=<reads> out=<merged reads> outu=<unmerged reads>
+Usage for paired files: bbmerge.sh in1=<read1> in2=<read2> out=<merged reads> outu1=<unmerged1> outu2=<unmerged2>
+
+Input may be stdin or a fasta, fastq, or scarf file, raw or gzipped.
+
+
+Input parameters:
+in=null Primary input. 'in2' will specify a second file.
+interleaved=auto May be set to true or false to override autodetection of
+ whether the input file is interleaved.
+reads=-1 Quit after this many read pairs (-1 means all).
+
+
+Output parameters:
+out=<file> File for merged reads. 'out2' will specify a second file.
+outu=<file> File for unmerged reads. 'outu2' will specify a second file.
+outinsert=<file> (outi) File to write read names and insert sizes.
+outadapter=<file> (outa) File to write consensus adapter sequences.
+outc=<file> File to write input read kmer cardinality estimate.
+ihist=<file> (hist) Insert length histogram output file.
+nzo=t Only print histogram bins with nonzero values.
+showhiststats=t Print extra header lines with statistical information.
+ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression
+ level; lower compression is faster.
+ordered=f Output reads in same order as input.
+mix=f Output both the merged (or mergable) and unmerged reads
+ in the same file (out=). Useful for ecco mode.
+
+
+Trimming/Filtering parameters:
+qtrim=f Trim read ends to remove bases with quality below minq.
+ Trims BEFORE merging.
+ Values: t (trim both ends),
+ f (neither end),
+ r (right end only),
+ l (left end only).
+qtrim2=f May be specified instead of qtrim to perform trimming
+ only if merging is unsuccessful, then retry merging.
+trimq=10 Trim quality threshold. This may be a comma-delimited
+ list (ascending) to try multiple values.
+minlength=1 (ml) Reads shorter than this after trimming, but before
+ merging, will be discarded. Pairs will be discarded only
+ if both are shorter.
+maxlength=-1 Reads with longer insert sizes will be discarded.
+tbo=f (trimbyoverlap) Trim overlapping reads to remove
+ rightmost (3') non-overlapping portion, instead of joining.
+minavgquality=0 (maq) Reads with average quality below this, after
+ trimming, will not be attempted to be merged.
+maxexpectederrors=0 (mee) If positive, reads with more combined expected
+ errors than this will not be attempted to be merged.
+forcetrimleft=0 (ftl) If nonzero, trim left bases of the read to
+ this position (exclusive, 0-based).
+forcetrimright=0 (ftr) If nonzero, trim right bases of the read
+ after this position (exclusive, 0-based).
+forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
+forcetrimmod=5 (ftm) If positive, trim length to be equal to
+ zero modulo this number.
+
+
+Processing Parameters:
+usejni=f (jni) Do overlapping in C code, which is faster. Requires
+ compiling the C code; details are in /jni/README.txt.
+merge=t Create merged reads. If set to false, you can still
+ generate an insert histogram.
+ecco=f Error-correct the overlapping part, but don't merge.
+useoverlap=t Attempt to find the insert size using read overlap.
+mininsert=35 Minimum insert size to merge reads.
+mininsert0=35 Insert sizes less than this will not be considered.
+ Must be less than or equal to mininsert.
+minoverlap=12 Minimum number of overlapping bases to allow merging.
+minoverlap0=8 Overlaps shorter than this will not be considered.
+ Must be less than or equal to minoverlap.
+minq=9 Ignore bases with quality below this.
+maxq=41 Cap output quality scores at this.
+entropy=t Increase the minimum overlap requirement for low-
+ complexity reads.
+efilter=6 Ban overlaps with over this many times the expected
+ number of errors. Lower is more strict.
+pfilter=0.00002 Ban improbable overlaps. Higher is more strict. 0 will
+ disable the filter; 1 will allow only perfect overlaps.
+kfilter=0 Ban overlaps that create kmers with count below
+ this value (0 disables). Does not seem to help.
+lowercase=f Expect lowercase letters to signify adapter sequence.
+ouq=f Calculate best overlap using quality values.
+owq=t Calculate best overlap without using quality values.
+usequality=t If disabled, quality values are completely ignored,
+ both for overlap detection and filtering. May be useful
+ for data with inaccurate quality values.
+iupacton=f (itn) Change ambiguous IUPAC symbols to N.
+
+
+Normal Mode:
+normalmode=f Original BBMerge algorithm. Faster, but lower overall
+ merge rate.
+margin=2 The best overlap must have at least 'margin' fewer
+ mismatches than the second best.
+mismatches=3 Do not allow more than this many mismatches.
+requireratiomatch=f (rrm) Require the answer from normal mode and ratio mode
+ to agree, reducing false positives if both are enabled.
+trimonfailure=t (tof) If detecting insert size by overlap fails,
+ the reads will be trimmed and this will be re-attempted.
+
+
+Ratio Mode:
+ratiomode=t Newer algorithm. Slower, but higher merge rate.
+ Much better for long overlaps and high error rates.
+maxratio=0.09 Max error rate; higher increases merge rate.
+ratiomargin=5.5 Lower increases merge rate; min is 1.
+ratiooffset=0.55 Lower increases merge rate; min is 0.
+ratiominoverlapreduction=3 This is the difference between minoverlap in
+ normal mode and minoverlap in ratio mode; generally,
+ minoverlap should be lower in ratio mode.
+
+*** Ratio Mode and Normal Mode may be used alone or simultaneously. ***
+*** Ratio Mode is much more accurate and is now the default mode. ***
+
+
+Strictness (these are mutually exclusive macros that set other parameters):
+strict=f Decrease false positive rate and merging rate.
+verystrict=f (vstrict) Greatly decrease FP and merging rate.
+ultrastrict=f (ustrict) Decrease FP and merging rate even more.
+maxstrict=f (xstrict) Maximally decrease FP and merging rate.
+loose=f Increase false positive rate and merging rate.
+veryloose=f (vloose) Greatly increase FP and merging rate.
+ultraloose=f (uloose) Increase FP and merging rate even more.
+maxloose=f (xloose) Maximally increase FP and merging rate.
+fast=f Fastest possible mode; less accurate.
+
+
+Tadpole Parameters (for read extension and error-correction):
+extend=0 Extend reads to the right this much before merging.
+ Requires sufficient (>5x) kmer coverage.
+extend2=0 Extend reads only after a failed merge attempt.
+iterations=1 (ei) Iteratively attempt to extend by extend2 distance
+ and merge up to this many times.
+ecctadpole=f (ecct) If reads fail to merge, error-correct with Tadpole
+ and try again. This happens prior to extend2.
+removedeadends (shave) Remove kmers leading to dead ends.
+removebubbles (rinse) Remove kmers in error bubbles.
+mindepthseed=3 (mds) Minimum kmer depth to begin extension.
+mindepthextend=2 (mde) Minimum kmer depth to continue extension.
+branchmult1=20 Min ratio of 1st to 2nd-greatest path depth at high depth.
+branchmult2=3 Min ratio of 1st to 2nd-greatest path depth at low depth.
+branchlower=3 Max value of 2nd-greatest path depth to be considered low.
+ibb=t Ignore backward branches when extending.
+extra=<file> A file or comma-delimited list of files of reads to use
+ for kmer counting, but not for merging or output.
+k=31 Kmer length (1-31 is fastest).
+prealloc=f Pre-allocate memory rather than dynamically growing;
+ faster and more memory-efficient for large datasets.
+ A float fraction (0-1) may be specified, default 1.
+prefilter=0 If set to a positive integer, use a countmin sketch to
+ ignore kmers with depth of that value or lower, to
+ reduce memory usage.
+minprob=0.5 Ignore kmers with overall probability of correctness
+ below this, to reduce memory usage.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage,
+ overriding the program's automatic memory detection.
+ For example, -Xmx400m will specify 400 MB RAM.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # save the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do # while DIR is a symlink, step into its directory and follow the link
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp in the launcher function
+NATIVELIBDIR="$DIR""jni/" # handed to java as -Djava.library.path in the launcher function
+
+z="-Xmx1000m" # heap flag handed to java; NOTE(review): presumably replaced by parseXmx when -Xmx is passed - confirm in calcmem.sh
+EA="-ea"
+set=0 # not read in this script; NOTE(review): presumably consumed by calcmem.sh - confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+calcXmx () { # source calcmem.sh and let parseXmx inspect the command-line arguments
+ source "$DIR""/calcmem.sh" # DIR already ends in '/', so this path contains a harmless double slash
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+function merge() { # Run jgi.BBMerge in a JVM, forwarding every script argument unchanged.
+ if [[ $NERSC_HOST == genepool ]]; then # module setup applied only when NERSC_HOST is genepool
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local -a CMD=(java -Djava.library.path="$NATIVELIBDIR" $EA $z -cp "$CP" jgi.BBMerge "$@") # argv array keeps each argument one word; $EA/$z stay unquoted so they may expand to zero or several flags
+ echo "${CMD[*]}" >&2 # log the command line on stderr
+ "${CMD[@]}" # execute directly (no eval) so arguments are never re-parsed by the shell
+}
+
+merge "$@"
diff --git a/bbnorm.sh b/bbnorm.sh
new file mode 100755
index 0000000..9bd17c2
--- /dev/null
+++ b/bbnorm.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+#bbnorm in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 14, 2015
+
+Description: Normalizes read depth based on kmer counts.
+Can also error-correct, bin reads by kmer depth, and generate a kmer depth histogram.
+
+Usage: bbnorm.sh in=<input> out=<reads to keep> outt=<reads to toss> hist=<histogram output>
+
+Input may be fasta or fastq, compressed or uncompressed.
+'out' and 'hist' are both optional.
+
+
+Optional parameters (and their defaults)
+
+Input parameters:
+in=null Primary input. Use in2 for paired reads in a second file
+in2=null Second input file for paired reads in two files
+extra=null Additional files to use for input (generating hash table) but not for output
+fastareadlen=2^31 Break up FASTA reads longer than this. Can be useful when processing scaffolded genomes
+tablereads=-1 Use at most this many reads when building the hashtable (-1 means all)
+kmersample=1 Process every nth kmer, and skip the rest
+readsample=1 Process every nth read, and skip the rest
+interleaved=auto May be set to true or false to override autodetection of whether the input file is paired and interleaved.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+
+Output parameters:
+out=<file> File for normalized or corrected reads. Use out2 for paired reads in a second file
+outt=<file> (outtoss) File for reads that were excluded from primary output
+reads=-1 Only process this number of reads, then quit (-1 means all)
+sampleoutput=t Use sampling on output as well as input (not used if sample rates are 1)
+keepall=f Set to true to keep all reads (e.g. if you just want error correction).
+zerobin=f Set to true if you want kmers with a count of 0 to go in the 0 bin instead of the 1 bin in histograms.
+ Default is false, to prevent confusion about how there can be 0-count kmers.
+ The reason is that based on the 'minq' and 'minprob' settings, some kmers may be excluded from the bloom filter.
+tmpdir=$TMPDIR This will specify a directory for temp files (only needed for multipass runs). If null, they will be written to the output directory.
+usetempdir=t Allows enabling/disabling of temporary directory; if disabled, temp files will be written to the output directory.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+rename=f Rename reads based on their kmer depth.
+
+Hashing parameters:
+k=31 Kmer length (values under 32 are most efficient, but arbitrarily high values are supported)
+bits=32 Bits per cell in bloom filter; must be 2, 4, 8, 16, or 32. Maximum kmer depth recorded is 2^cbits. Automatically reduced to 16 in 2-pass.
+ Large values decrease accuracy for a fixed amount of memory, so use the lowest number you can that will still capture highest-depth kmers.
+hashes=3 Number of times each kmer is hashed and stored. Higher is slower.
+ Higher is MORE accurate if there is enough memory, and LESS accurate if there is not enough memory.
+prefilter=f True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable. The prefilter is more memory-efficient because it uses 2-bit cells.
+prehashes=2 Number of hashes for prefilter.
+prefilterbits=2 (pbits) Bits per cell in prefilter.
+prefiltersize=0.35 Fraction of memory to allocate to prefilter.
+buildpasses=1 More passes can sometimes increase accuracy by iteratively removing low-depth kmers
+minq=6 Ignore kmers containing bases with quality below this
+minprob=0.5 Ignore kmers with overall probability of correctness below this
+threads=auto (t) Spawn exactly X hashing threads (default is number of logical processors). Total active threads may exceed X due to I/O threads.
+rdk=t (removeduplicatekmers) When true, a kmer's count will only be incremented once per read pair, even if that kmer occurs more than once.
+
+Normalization parameters:
+fixspikes=f (fs) Do a slower, high-precision bloom filter lookup of kmers that appear to have an abnormally high depth due to collisions.
+target=100 (tgt) Target normalization depth. NOTE: All depth parameters control kmer depth, not read depth.
+ For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))
+maxdepth=-1 (max) Reads will not be downsampled when below this depth, even if they are above the target depth.
+mindepth=5 (min) Kmers with depth below this number will not be included when calculating the depth of a read.
+minkmers=15 (mgkpr) Reads must have at least this many kmers over min depth to be retained. Aka 'mingoodkmersperread'.
+percentile=54.0 (dp) Read depth is by default inferred from the 54th percentile of kmer depth, but this may be changed to any number 1-100.
+uselowerdepth=t (uld) For pairs, use the depth of the lower read as the depth proxy.
+deterministic=t (dr) Generate random numbers deterministically to ensure identical output between multiple runs. May decrease speed with a huge number of threads.
+passes=2 (p) 1 pass is the basic mode. 2 passes (default) allows greater accuracy, error detection, and better control of output depth.
+
+Error detection parameters:
+hdp=90.0 (highdepthpercentile) Position in sorted kmer depth array used as proxy of a read's high kmer depth.
+ldp=25.0 (lowdepthpercentile) Position in sorted kmer depth array used as proxy of a read's low kmer depth.
+tossbadreads=f (tbr) Throw away reads detected as containing errors.
+requirebothbad=f (rbb) Only toss bad pairs if both reads are bad.
+errordetectratio=125 (edr) Reads with a ratio of at least this much between their high and low depth kmers will be classified as error reads.
+highthresh=12 (ht) Threshold for high kmer. Kmers at this or above are considered non-error.
+lowthresh=3 (lt) Threshold for low kmer. Kmers at this and below are always considered errors.
+
+Error correction parameters:
+ecc=f Set to true to correct errors.
+ecclimit=3 Correct up to this many errors per read. If more are detected, the read will remain unchanged.
+errorcorrectratio=140 (ecr) Adjacent kmers with a depth ratio of at least this much between will be classified as an error.
+echighthresh=22 (echt) Threshold for high kmer. A kmer at this or above may be considered non-error.
+eclowthresh=2 (eclt) Threshold for low kmer. Kmers at this and below are considered errors.
+eccmaxqual=127 Do not correct bases with quality above this value.
+aec=f (aggressiveErrorCorrection) Sets more aggressive values of ecr=100, ecclimit=7, echt=16, eclt=3.
+cec=f (conservativeErrorCorrection) Sets more conservative values of ecr=180, ecclimit=2, echt=30, eclt=1, sl=4, pl=4.
+meo=f (markErrorsOnly) Marks errors by reducing quality value of suspected errors; does not correct anything.
+mue=t (markUncorrectableErrors) Marks errors only on uncorrectable reads; requires 'ecc=t'.
+overlap=f (ecco) Error correct by read overlap.
+
+Depth binning parameters:
+lowbindepth=10 (lbd) Cutoff for low depth bin.
+highbindepth=80 (hbd) Cutoff for high depth bin.
+outlow=<file> Pairs in which both reads have a median below lbd go into this file.
+outhigh=<file> Pairs in which both reads have a median above hbd go into this file.
+outmid=<file> All other pairs go into this file.
+
+Histogram parameters:
+hist=<file> Specify a file to write the input kmer depth histogram.
+histout=<file> Specify a file to write the output kmer depth histogram.
+histcol=3 (histogramcolumns) Number of histogram columns, 2 or 3.
+pzc=f (printzerocoverage) Print lines in the histogram with zero coverage.
+histlen=1048576 Max kmer depth displayed in histogram. Also affects statistics displayed, but does not affect normalization.
+
+Peak calling parameters:
+peaks=<file> Write the peaks to this file. Default is stdout.
+minHeight=2 (h) Ignore peaks shorter than this.
+minVolume=2 (v) Ignore peaks with less area than this.
+minWidth=2 (w) Ignore peaks narrower than this.
+minPeak=2 (minp) Ignore peaks with an X-value below this.
+maxPeak=BIG (maxp) Ignore peaks with an X-value above this.
+maxPeakCount=8 (maxpc) Print up to this many peaks (prioritizing height).
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx31g"
+z2="-Xms31g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 31000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+normalize() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z $z2 -cp $CP jgi.KmerNormalize bits=32 $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+normalize "$@"
diff --git a/bbsplit.sh b/bbsplit.sh
new file mode 100755
index 0000000..8d35256
--- /dev/null
+++ b/bbsplit.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+function usage(){ #Prints the BBSplit help text; reads DIR, which is set before usage is called.
+echo "
+BBSplit / BBMap v35.x
+Written by Brian Bushnell, from Dec. 2010 - present
+Last modified October 9, 2015
+
+Description: Maps reads to multiple references simultaneously.
+Outputs reads to a file for the reference they best match, with multiple options for dealing with ambiguous mappings.
+
+To index: bbsplit.sh build=<1> ref_x=<reference fasta> ref_y=<another reference fasta>
+To map: bbsplit.sh build=<1> in=<reads> out_x=<output file> out_y=<another output file>
+
+To be concise, and do everything in one command:
+bbsplit.sh ref=x.fa,y.fa in=reads.fq basename=o%.fq
+
+that is equivalent to
+bbsplit.sh build=1 in=reads.fq ref_x=x.fa ref_y=y.fa out_x=ox.fq out_y=oy.fq
+
+By default paired reads will yield interleaved output, but you can use the # symbol to produce twin output files.
+For example, basename=o%_#.fq will produce ox_1.fq, ox_2.fq, oy_1.fq, and oy_2.fq.
+
+
+Indexing Parameters (required when building the index):
+ref=<file,file> A list of references, or directories containing fasta files.
+ref_<name>=<ref.fa> Alternate, longer way to specify references. e.g., ref_ecoli=ecoli.fa
+ These can also be comma-delimited lists of files; e.g., ref_a=a1.fa,a2.fa,a3.fa
+build=<1> If multiple references are indexed in the same directory, each needs a unique build ID.
+path=<.> Specify the location to write the index, if you don't want it in the current working directory.
+
+Input Parameters:
+build=<1> Designate index to use. Corresponds to the number specified when building the index.
+in=<reads.fq> Primary reads input; required parameter.
+in2=<reads2.fq> For paired reads in two files.
+qin=<auto> Set to 33 or 64 to specify input quality value ASCII offset.
+interleaved=<auto> True forces paired/interleaved input; false forces single-ended mapping.
+ If not specified, interleaved status will be autodetected from read names.
+
+Mapping Parameters:
+maxindel=<20> Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq.
+minratio=<0.65> Fraction of max alignment score required to keep a site. Higher is faster.
+minhits=<1> Minimum number of seed hits required for candidate sites. Higher is faster.
+ambiguous=<best> Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations).
+ best (use the first best site)
+ toss (consider unmapped)
+ random (select one top-scoring site randomly)
+ all (retain all top-scoring sites. Does not work yet with SAM output)
+ambiguous2=<best> Set behavior only for reads that map ambiguously to multiple different references.
+ Normal 'ambiguous=' controls behavior on all ambiguous reads;
+ Ambiguous2 excludes reads that map ambiguously within a single reference.
+ best (use the first best site)
+ toss (consider unmapped)
+ all (write a copy to the output for each reference to which it maps)
+ split (write a copy to the AMBIGUOUS_ output for each reference to which it maps)
+trim=<true> Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both).
+untrim=<true> Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings.
+
+Output Parameters:
+out_<name>=<file> Output reads that map to the reference <name> to <file>.
+basename=prefix%suffix Equivalent to multiple out_%=prefix%suffix expressions, in which each % is replaced by the name of a reference file.
+bs=<file> Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file.
+scafstats=<file> Write statistics on how many reads mapped to which scaffold to this file.
+refstats=<file> Write statistics on how many reads mapped to which reference to this file.
+nzo=t Only print lines with nonzero coverage.
+
+***** Notes *****
+Almost all BBMap parameters can be used; run bbmap.sh for more details.
+Exceptions include the 'nodisk' flag, which BBSplit does not support.
+BBSplit is recommended for fastq and fasta output, not for sam/bam output.
+When the reference sequences are shorter than read length, use Seal instead of BBSplit.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+This list is not complete. For more information, please consult ${DIR}docs/readme.txt
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+function bbsplit() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBSplitter ow=t fastareadlen=500 minhits=1 minratio=0.56 maxindel=20 qtrim=rl untrim=t trimq=6 $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+bbsplit "$@"
diff --git a/bbsplitpairs.sh b/bbsplitpairs.sh
new file mode 100755
index 0000000..a111b5a
--- /dev/null
+++ b/bbsplitpairs.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#splitpairs in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Separates paired reads into files of 'good' pairs and 'good' singletons by removing 'bad' reads that are shorter than a min length.
+Designed to handle situations where reads become too short to be useful after trimming. This program also optionally performs quality trimming.
+
+Usage: bbsplitpairs.sh in=<input file> out=<pair output file> outs=<singleton output file> minlen=<minimum read length, an integer>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+Optional parameters (and their defaults)
+
+in=<file> The 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in.
+in2=<file> Use this if 2nd read of pairs are in a different file.
+out=<file> The 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out.
+out2=<file> Use this to write 2nd read of pairs to a different file.
+outsingle=<file> (outs) Write singleton reads here.
+
+overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
+showspeed=t (ss) Set to 'f' to suppress display of processing speed.
+interleaved=auto (int) If true, forces fastq input to be paired and interleaved.
+qtrim=f Trim read ends to remove bases with quality below trimq.
+ Values: rl (trim both ends), f (neither end), r (right end only), l (left end only).
+trimq=6 Trim quality threshold.
+minlen=20 (ml) Reads shorter than this after trimming will be discarded.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
+fixinterleaving=f (fint) Fixes corrupted interleaved files by examining pair names. Only use on files with broken interleaving.
+repair=f (rp) Fixes arbitrarily corrupted paired reads by examining read names. High memory.
+ain=f (allowidenticalnames) When detecting pair names, allows identical names, instead of requiring /1 and /2 or 1: and 2:
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+splitpairs() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.SplitPairsAndSingles $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+splitpairs "$@"
diff --git a/bbwrap.sh b/bbwrap.sh
new file mode 100755
index 0000000..8e6ef6d
--- /dev/null
+++ b/bbwrap.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#bbwrap in=<infile> out=<outfile>
+
+usage(){
+echo "
+BBWrap v34.x
+Last modified April 21, 2015
+
+Description: Wrapper for BBMap to allow multiple input and output files for the same reference.
+
+To index: bbwrap.sh ref=<reference fasta>
+To map: bbwrap.sh in=<file,file,...> out=<file,file,...>
+To map without an index: bbwrap.sh ref=<reference fasta> in=<file,file,...> out=<file,file,...> nodisk
+To map pairs and singletons and output them into the same file:
+bbwrap.sh in1=read1.fq,singleton.fq in2=read2.fq,null out=mapped.sam append
+
+BBWrap will not work with stdin and stdout, or histogram output.
+
+Other Parameters:
+
+in=<file,file> Input sequences to map.
+mapper=bbmap Select mapper. May be BBMap, BBMapPacBio,
+ or BBMapPacBioSkimmer.
+append=f Append to files rather than overwriting them.
+ If append is enabled, and there is exactly one output file,
+ all output will be written to that file.
+
+***** All BBMap parameters can be used; see bbmap.sh for more details. *****
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+bbwrap() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBWrap build=1 overwrite=true fastareadlen=500 $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+bbwrap "$@"
diff --git a/build.xml b/build.xml
new file mode 100755
index 0000000..7aa32b9
--- /dev/null
+++ b/build.xml
@@ -0,0 +1,56 @@
+<project name="bbtools" default="dist" basedir=".">
+ <description>
+ Brian Bushnell's tools!
+ </description>
+
+ <!-- genepool specific values; you can override these from the command line -->
+ <property name="jcompiler" value="org.eclipse.jdt.core.JDTCompilerAdapter"/>
+ <property name="mpijar" location="/usr/common/usg/hpc/openmpi/gnu4.6/sge/1.8.1/ib_2.1-1.0.0/lib/mpi.jar"/>
+
+ <!-- set global properties for this build -->
+ <property name="src" location="current"/>
+ <property name="build" location="build"/>
+ <property name="dist" location="dist"/>
+ <property name="resources" location="resources"/>
+
+ <path id="class.path">
+ <pathelement location="${mpijar}"/>
+ </path>
+
+ <target name="init">
+ <!-- Create the time stamp -->
+ <tstamp/>
+ <!-- Create the build directory structure used by compile -->
+ <mkdir dir="${build}"/>
+ </target>
+
+ <target name="compile" depends="init"
+ description="compile the source " >
+
+ <!-- Compile the java code from ${src} into ${build} -->
+ <javac srcdir="${src}" destdir="${build}" compiler="${jcompiler}" nowarn="true" includeantruntime="false" source="1.7" target="1.7" debug="true" debuglevel="lines,vars,source" >
+ <classpath refid="class.path" />
+ <exclude name="jgi/KmerNormalizeExact.java" />
+ </javac>
+ </target>
+
+ <target name="dist" depends="compile"
+ description="generate the distribution" >
+
+ <!-- Create the distribution directory -->
+ <mkdir dir="${dist}/lib"/>
+ <!-- Put everything in ${build} and ${resources} into ${dist}/lib/BBTools.jar -->
+ <jar jarfile="${dist}/lib/BBTools.jar">
+ <fileset dir="${build}"/>
+ <fileset dir="${resources}"/>
+ </jar>
+ </target>
+
+ <target name="clean"
+ description="clean up" >
+
+ <!-- Delete the ${build} and ${dist} directory trees -->
+ <delete dir="${build}"/>
+ <delete dir="${dist}"/>
+ </target>
+</project>
diff --git a/calcmem.sh b/calcmem.sh
new file mode 100755
index 0000000..5b67e21
--- /dev/null
+++ b/calcmem.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+#calcmem
+
+#function usage(){
+# echo "CalcMem v1.03"
+# echo "Written by Brian Bushnell, Doug Jacobsen, Alex Copeland"
+# echo "Calculates available memory in megabytes"
+# echo "Last modified June 4, 2015"
+#}
+
+function parseXmx () {
+
+ local setxmx=0
+ local setxms=0
+
+ for arg in "$@"
+ do
+ if [[ "$arg" == -Xmx* ]]; then
+ z="$arg"
+ setxmx=1
+ elif [[ "$arg" == Xmx* ]]; then
+ z="-$arg"
+ setxmx=1
+ elif [[ "$arg" == -Xms* ]]; then
+ z2="$arg"
+ setxms=1
+ elif [[ "$arg" == Xms* ]]; then
+ z2="-$arg"
+ setxms=1
+ elif [[ "$arg" == -da ]] || [[ "$arg" == -ea ]]; then
+ EA="$arg"
+ fi
+ done
+
+ if [[ $setxmx == 1 ]] && [[ $setxms == 0 ]]; then
+ local substring=`echo $z| cut -d'x' -f 2`
+ z2="-Xms$substring"
+ setxms=1
+ elif [[ $setxmx == 0 ]] && [[ $setxms == 1 ]]; then
+ local substring=`echo $z2| cut -d's' -f 2`
+ z="-Xmx$substring"
+ setxmx=1
+ fi
+
+ set=$setxmx
+
+}
+
+
+RAM=0;
+
+function freeRam(){
+ #Memory is in kilobytes.
+ local defaultMem=3200000
+ if [ $# -gt 0 ]; then
+ defaultMem=$1;
+ case $defaultMem in
+ *g)
+ defaultMem=`echo $defaultMem| cut -d'g' -f 1`
+ defaultMem=$(( $defaultMem * $(( 1024 * 1024 )) ))
+ ;;
+ *m)
+ defaultMem=`echo $defaultMem| cut -d'm' -f 1`
+ defaultMem=$(( $defaultMem * 1024 ))
+ ;;
+ *k)
+ defaultMem=`echo $defaultMem| cut -d'k' -f 1`
+ ;;
+ esac
+ fi
+
+ local mult=84
+ if [ $# -gt 1 ]; then
+ mult=$2;
+ fi
+
+ #echo "mult = $mult"
+ #echo "default = $defaultMem"
+
+ local ulimit=$(ulimit -v)
+ ulimit="${ulimit:-0}"
+ if [ "$ulimit" = "unlimited" ]; then ulimit=0; fi
+ local x=$ulimit
+
+ if [ -e /proc/meminfo ]; then
+ local vfree=$(cat /proc/meminfo | awk -F: 'BEGIN{total=-1;used=-1} /^CommitLimit:/ { total=$2 }; /^Committed_AS:/ { used=$2 } END{ print (total-used) }')
+ local pfree=$(cat /proc/meminfo | awk -F: 'BEGIN{free=-1;cached=-1;buffers=-1} /^MemFree:/ { free=$2 }; /^Cached:/ { cached=$2}; /^Buffers:/ { buffers=$2} END{ print (free+cached+buffers) }')
+
+ #echo "vfree = $vfree"
+ #echo "pfree = $pfree"
+ #echo "ulimit = $ulimit"
+
+ local x2=0;
+
+ if [ $vfree -gt 0 ] && [ $pfree -gt 0 ]; then
+ if [ $vfree -gt $pfree ]; then x2=$pfree;
+ else x2=$vfree; fi
+ elif [ $vfree -gt 0 ]; then x2=$vfree;
+ elif [ $pfree -gt 0 ]; then x2=$pfree;
+ fi
+
+ #echo $x
+ #echo $x2
+ #echo $vfree
+ #echo $pfree
+
+ if [ "$x" = "unlimited" ] || (("$x" > $x2)); then x=$x2; fi
+ if [ $x -lt 1 ]; then x=$x2; fi
+ fi
+
+ #echo "x=$x"
+ local HOSTNAME=`hostname`
+ if [ $x -lt 1 ] || [[ $HOSTNAME == genepool* ]]; then
+ #echo "branch for unknown memory"
+ #echo $x
+ #echo "ram is unlimited"
+ RAM=$((defaultMem/1024))
+ echo "Max memory cannot be determined. Attempting to use $RAM MB." 1>&2
+ echo "If this fails, please add the -Xmx flag (e.g. -Xmx24g) to your command, " 1>&2
+ echo "or run this program qsubbed or from a qlogin session on Genepool, or set ulimit to an appropriate value." 1>&2
+ else
+ #echo "branch for known memory"
+ #echo $x
+
+ RAM=$(( ((x-500000)*mult/100)/1024 ))
+ #echo $RAM
+ fi
+ #local z="-Xmx${RAM}m"
+ return 0
+}
+
+#freeRam "$@"
diff --git a/calctruequality.sh b/calctruequality.sh
new file mode 100755
index 0000000..65036ab
--- /dev/null
+++ b/calctruequality.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+#calctruequality in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 11, 2015
+
+Description: Calculates the observed quality scores from a sam file.
+Generates matrices for use in recalibrating quality scores.
+
+Usage: calctruequality.sh in=<file,file,...file> path=<directory>
+
+
+Parameters (and their defaults)
+
+Input parameters:
+in=<file,file> Sam file or comma-delimited list of files. Must use = and X cigar symbols.
+reads=-1 Stop after processing this many reads (if positive).
+
+Output parameters:
+overwrite=t (ow) Set to true to allow overwriting of existing files.
+path=. Directory to write quality matrices (within /ref subdir).
+write=t Write matrices.
+showstats=t Print a summary.
+
+Other parameters:
+t=auto
+pigz=f Use pigz to compress. If argument is a number, that will set the number of pigz threads.
+unpigz=t Use pigz to decompress.
+passes=2 Number of recalibration passes to generate matrices for; set to 1 for 1-pass recalibration only. Max is 2.
+recalqmax=42 Adjust max quality scores tracked.
+loadq102= For each recalibration matrix, enable or disable that matrix with t/f.
+ You can specify pass1 or pass2 like this: loadq102_p1=f loadq102_p2=t.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+}
+calcXmx "$@"
+
+calctruequality() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP jgi.CalcTrueQuality $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+calctruequality "$@"
diff --git a/callpeaks.sh b/callpeaks.sh
new file mode 100755
index 0000000..1efe804
--- /dev/null
+++ b/callpeaks.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#callpeaks in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified September 17, 2015
+
+Description: Calls peaks from a 2-column (x, y) tab-delimited histogram.
+
+Usage: callpeaks.sh in=<histogram file> out=<output file>
+
+
+Peak-calling parameters:
+in=<file> 'in=stdin.fq' will pipe from standard in.
+out=<file> Write the peaks to this file. Default is stdout.
+minHeight=2 (h) Ignore peaks shorter than this.
+minVolume=2 (v) Ignore peaks with less area than this.
+minWidth=2 (w) Ignore peaks narrower than this.
+minPeak=2 (minp) Ignore peaks with an X-value below this.
+ Useful when low-count kmers are filtered.
+maxPeak=BIG (maxp) Ignore peaks with an X-value above this.
+maxPeakCount=8 (maxpc) Print up to this many peaks (prioritizing height).
+countColumn=1 (col) For multi-column input, this column, zero-based,
+ contains the counts.
+
+Smoothing parameters:
+smoothradius=0 Radius of triangle filter. Set above zero to smooth data
+ prior to peak-calling. Higher values are smoother.
+smoothprogressive=f Set to true to widen the filter as the x-coordinate
+ increases. Useful for kmer-frequency histograms.
+maxradius=10 Maximum radius of progressive smoothing function.
+progressivemult=2 Increment radius each time depth increases by this factor.
+
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+stats() { #Runs jgi.CallPeaks with the heap flag in $z (script default, or the user's -Xmx via parseXmx).
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.CallPeaks $@"
+# echo $CMD >&2
+ eval $CMD
+}
+
+stats "$@"
diff --git a/clumpify.sh b/clumpify.sh
new file mode 100755
index 0000000..e8f39c8
--- /dev/null
+++ b/clumpify.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#clumpify in=<infile> out=<outfile> groups=<number>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 16, 2015
+
+Description: Sorts sequences to put similar reads near each other.
+This is a wrapper for KmerSplit and KmerSort.
+Works best with single-ended data.
+
+Usage: clumpify.sh in=<file> out=<file> groups=<number>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+Optional parameters (and their defaults)
+
+in=<file> Input file.
+out=<file> Output file. May not be standard out.
+groups=16 Use this many intermediate files (to save memory).
+rcomp=t Give read clumps the same orientation.
+ Should be disabled for paired reads.
+rename=t Add kmer information to the name.
+consensus=t Generate consensus reads from clumps.
+divisor=80m (div) Use a prime number at least this big as the divisor.
+k=31 Use kmers of this length (1-31).
+mincount=0 Ignore pivot kmers with count less than this.
+prefilter=t Use a prefilter if counting kmers.
+overwrite=f (ow) Set to false to force the program to abort rather
+ than overwrite an existing file.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change
+ compression level; lower compression is faster.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will
+ specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx2g"
+z2="-Xms2g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Choose the JVM heap flags: keep the parsed value if one was set, otherwise derive it from $RAM.
+ source "$DIR""/calcmem.sh" # helper script next to this one; $DIR already ends in "/"
+ parseXmx "$@" # expected to come from calcmem.sh; presumably sets set=1 when the user passed -Xmx -- confirm there
+ if [[ $set == 1 ]]; then # a value was already chosen; leave $z/$z2 alone
+ return
+ fi
+ freeRam 2000m 84 # helper call; $RAM is read right after, so it presumably stores a megabyte figure there (arguments look like a floor and a percentage -- confirm)
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m" # NOTE(review): z2 is assigned here but the java command line in this script only uses $z
+}
+calcXmx "$@"
+
+clumpify() { # Launch clump.Clumpify with the selected JVM options and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ # Run java directly with "$@": eval on a flattened string re-split and re-interpreted arguments (spaces, quotes, metacharacters).
+ echo "java $EA $z -cp $CP clump.Clumpify $*" >&2
+ java $EA $z -cp "$CP" clump.Clumpify "$@"
+}
+
+clumpify "$@"
diff --git a/commonkmers.sh b/commonkmers.sh
new file mode 100755
index 0000000..575e311
--- /dev/null
+++ b/commonkmers.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#commonkmers in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 20, 2015
+
+Description: Prints the most common kmers in each sequence.
+This is intended for short kmers only!
+
+Usage: commonkmers.sh in=<file> out=<file>
+
+
+Parameters:
+k=2 Kmer length, 0-12.
+display=3 Print this many kmers per sequence.
+count=f Print the kmer counts as well.
+
+ow=f (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx800m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Load the shared memory helpers and hand them the command line.
+ source "$DIR""/calcmem.sh" # helper script next to this one; $DIR already ends in "/"
+ parseXmx "$@" # not defined in this script, so expected to come from calcmem.sh; presumably updates $z/$set from a user-supplied -Xmx -- confirm there
+}
+calcXmx "$@"
+
+function commonkmers() { # Launch jgi.SmallKmerFrequency with the selected JVM options and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ # Run java directly with "$@": eval on a flattened string re-split and re-interpreted arguments (spaces, quotes, metacharacters).
+ echo "java $EA $z -cp $CP jgi.SmallKmerFrequency $*" >&2
+ java $EA $z -cp "$CP" jgi.SmallKmerFrequency "$@"
+}
+
+commonkmers "$@"
diff --git a/config/cluster16s.txt b/config/cluster16s.txt
new file mode 100755
index 0000000..734a997
--- /dev/null
+++ b/config/cluster16s.txt
@@ -0,0 +1,19 @@
+#This is for clustering PacBio 16s reads of insert with Dedupe
+csf=stats_e26.txt
+outbest=best_e26.fq
+qin=33
+usejni=t
+am=f
+ac=f
+fo
+c
+rnc=f
+mcs=3
+k=27
+mo=1420
+ow
+unpigz
+cc
+pto
+nam=4
+e=26
diff --git a/config/filter16s.txt b/config/filter16s.txt
new file mode 100755
index 0000000..04d85df
--- /dev/null
+++ b/config/filter16s.txt
@@ -0,0 +1,5 @@
+#Filter PacBio 16s Reads of Insert with reformat.sh before clustering
+minlen=1420
+maxlen=1640
+maq=20
+qin=33
diff --git a/config/histograms.txt b/config/histograms.txt
new file mode 100755
index 0000000..4ec3d44
--- /dev/null
+++ b/config/histograms.txt
@@ -0,0 +1,16 @@
+#Histograms, for BBMap
+bhist=bhist.txt
+qhist=qhist.txt
+aqhist=aqhist.txt
+qahist=qahist.txt
+bqhist=bqhist.txt
+qchist=qchist.txt
+lhist=lhist.txt
+ihist=ihist.txt
+ehist=ehist.txt
+indelhist=indelhist.txt
+mhist=mhist.txt
+gchist=gchist.txt
+gcbins=auto
+idhist=idhist.txt
+idbins=auto
diff --git a/config/recalibrate.txt b/config/recalibrate.txt
new file mode 100755
index 0000000..85603f3
--- /dev/null
+++ b/config/recalibrate.txt
@@ -0,0 +1,32 @@
+#Quality recalibration parameters for CalcTrueQuality and BBDuk
+recalpasses=2
+observationcutoff_p1=100
+observationcutoff_p2=200
+recalqmax=41
+recalqmin=2
+
+#first pass matrices
+loadq102_p1=f
+loadqbp_p1=t
+loadq10_p1=f
+loadq12_p1=f
+loadqb12_p1=f
+loadqb012_p1=f
+loadqb123_p1=t
+loadqb234_p1=f
+loadq12b12_p1=f
+loadqp_p1=f
+loadq_p1=f
+
+#second pass matrices
+loadq102_p2=f
+loadqbp_p2=t
+loadq10_p2=f
+loadq12_p2=f
+loadqb12_p2=f
+loadqb012_p2=f
+loadqb123_p2=f
+loadqb234_p2=f
+loadq12b12_p2=f
+loadqp_p2=f
+loadq_p2=f
diff --git a/config/rnaseq.txt b/config/rnaseq.txt
new file mode 100755
index 0000000..3080e42
--- /dev/null
+++ b/config/rnaseq.txt
@@ -0,0 +1,4 @@
+#Vertebrate RNA-seq settings for BBMap
+maxindel=200000
+intronlen=10
+xs=us
diff --git a/config/trimadapters.txt b/config/trimadapters.txt
new file mode 100755
index 0000000..7cae3de
--- /dev/null
+++ b/config/trimadapters.txt
@@ -0,0 +1,5 @@
+k=23
+mink=11
+hdist=1
+tbo
+tpe
diff --git a/countbarcodes.sh b/countbarcodes.sh
new file mode 100755
index 0000000..5c39fb6
--- /dev/null
+++ b/countbarcodes.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#countbarcodes in=<infile> counts=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified October 16, 2015
+
+Description: Counts the number of reads with each barcode.
+
+Usage: countbarcodes.sh in=<file> counts=<file>
+
+Input may be stdin or a fasta or fastq file, raw or gzipped.
+If you pipe via stdin/stdout, please include the file type; e.g. for gzipped fasta input, set in=stdin.fa.gz
+
+
+Optional parameters (and their defaults)
+
+Input parameters:
+in=<file> Input reads, whose names end in a colon then barcode.
+counts=<file> Output of counts.
+interleaved=auto (int) If true, forces fastq input to be paired and interleaved.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+unpigz=t Use pigz to decompress.
+expected= Comma-delimited list of expected bar codes.
+valid= Comma-delimited list of valid bar codes.
+countundefined=t Count barcodes that contain non-ACGT symbols.
+printheader=t Print a header.
+maxrows=-1 Optionally limit the number of rows printed.
+
+Output parameters:
+out=<file> Write bar codes and counts here. 'out=stdout' will pipe to standard out.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Load the shared memory helpers and hand them the command line.
+ source "$DIR""/calcmem.sh" # helper script next to this one; $DIR already ends in "/"
+ parseXmx "$@" # not defined in this script, so expected to come from calcmem.sh; presumably updates $z/$set from a user-supplied -Xmx -- confirm there
+}
+calcXmx "$@"
+
+countbarcodes() { # Launch jgi.CountBarcodes with the selected JVM options and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ # Run java directly with "$@": eval on a flattened string re-split and re-interpreted arguments (spaces, quotes, metacharacters).
+ echo "java $EA $z -cp $CP jgi.CountBarcodes $*" >&2
+ java $EA $z -cp "$CP" jgi.CountBarcodes "$@"
+}
+
+countbarcodes "$@"
diff --git a/countgc.sh b/countgc.sh
new file mode 100755
index 0000000..a33bb88
--- /dev/null
+++ b/countgc.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#countgc in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified January 21, 2015
+
+Description: Counts GC content of reads or scaffolds.
+
+Usage: countgc in=<input> out=<output> format=<format>
+
+Input may be stdin or a fasta or fastq file, compressed or uncompressed.
+Output (which is optional) is tab-delimited.
+format=1: name length A C G T N
+format=2: name GC
+format=4: name length GC
+Note that in format 1, A+C+G+T=1 even when N is nonzero.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx120m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Load the shared memory helpers and hand them the command line.
+ source "$DIR""/calcmem.sh" # helper script next to this one; $DIR already ends in "/"
+ parseXmx "$@" # not defined in this script, so expected to come from calcmem.sh; presumably updates $z/$set from a user-supplied -Xmx -- confirm there
+}
+calcXmx "$@"
+
+countgc() { # Launch jgi.CountGC with the selected JVM options and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ fi
+ # Run java directly with "$@": eval on a flattened string re-split and re-interpreted arguments (spaces, quotes, metacharacters).
+ echo "java $EA $z -cp $CP jgi.CountGC $*" >&2
+ java $EA $z -cp "$CP" jgi.CountGC "$@"
+}
+
+countgc "$@"
diff --git a/countsharedlines.sh b/countsharedlines.sh
new file mode 100755
index 0000000..c930ea1
--- /dev/null
+++ b/countsharedlines.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#countsharedlines in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified September 15, 2015
+
+Description: Counts the number of lines shared between sets of files.
+One output file will be printed for each input file. For example,
+an output file for a file in the 'in1' set will contain one line per
+file in the 'in2' set, indicating how many lines are shared.
+
+Usage: countsharedlines.sh in1=<file,file...> in2=<file,file...>
+
+
+Parameters:
+include=f Set to 'true' to include the filtered names rather than excluding them.
+prefix=f Allow matching of only the line's prefix (all characters up to first whitespace).
+casesensitive=t (case) Match case also.
+ow=t (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx800m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Choose the JVM heap flags: keep the parsed value if one was set, otherwise derive it from $RAM.
+ source "$DIR""/calcmem.sh" # helper script next to this one; $DIR already ends in "/"
+ parseXmx "$@" # expected to come from calcmem.sh; presumably sets set=1 when the user passed -Xmx -- confirm there
+ if [[ $set == 1 ]]; then # a value was already chosen; leave $z alone
+ return
+ fi
+ freeRam 800m 84 # helper call; $RAM is read right after, so it presumably stores a megabyte figure there (arguments look like a floor and a percentage -- confirm)
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m" # NOTE(review): z2 is assigned only here and the java command line in this script only uses $z
+}
+calcXmx "$@"
+
+function countsharedlines() { # Launch driver.CountSharedLines with the selected JVM options and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ # Run java directly with "$@": eval on a flattened string re-split and re-interpreted arguments (spaces, quotes, metacharacters).
+ echo "java $EA $z -cp $CP driver.CountSharedLines $*" >&2
+ java $EA $z -cp "$CP" driver.CountSharedLines "$@"
+}
+
+countsharedlines "$@"
diff --git a/crossblock.sh b/crossblock.sh
new file mode 100755
index 0000000..5cc5f7b
--- /dev/null
+++ b/crossblock.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+#For usage information, please see decontaminate.sh
+
+function crossblock(){ # Thin alias: forwards every argument to decontaminate.sh (resolved via PATH).
+ # Pass "$@" straight through; eval on a flattened string re-split and re-interpreted the arguments.
+ decontaminate.sh "$@"
+}
+
+crossblock "$@"
+
diff --git a/crosscontaminate.sh b/crosscontaminate.sh
new file mode 100755
index 0000000..595a8ed
--- /dev/null
+++ b/crosscontaminate.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+#crosscontaminate in=<file,file,...> out=<file,file,...>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Generates synthetic cross-contaminated files from clean files.
+Intended for use with synthetic reads generated by SynthMDA or RandomReads.
+
+Usage: crosscontaminate.sh in=<file,file,...> out=<file,file,...>
+
+
+Parameters and their defaults:
+
+Input parameters:
+in=<file,file,...> Clean input reads.
+innamefile=<file> A file containing the names of input files, one name per line.
+interleaved=auto (int) t/f overrides interleaved autodetection.
+qin=auto Input quality offset: 33 (Sanger), 64, or auto.
+reads=-1 If positive, quit after processing X reads or pairs.
+
+Processing Parameters:
+minsinks=1 Min contamination destinations from one source.
+maxsinks=8 Max contamination destinations from one source.
+minprob=0.000005 Min allowed contamination rate (geometric distribution).
+maxprob=0.025 Max allowed contamination rate.
+
+Output parameters:
+out=<file,file,...> Contaminated output reads.
+outnamefile=<file> A file containing the names of output files, one name per line.
+overwrite=t (ow) Grant permission to overwrite files.
+#showspeed=t (ss) 'f' suppresses display of processing speed.
+ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
+threads=auto (t) Set number of threads to use; default is number of logical processors.
+qout=auto Output quality offset: 33 (Sanger), 64, or auto.
+shuffle=f Shuffle contents of output files.
+shufflethreads=3 Use this many threads for shuffling (requires more memory).
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+There is a changelog at docs/changelog_crosscontaminate.txt
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx4g"
+z2="-Xms4g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Choose the JVM heap flag: keep the parsed value if one was set, otherwise derive it from $RAM.
+ source "$DIR""/calcmem.sh" # helper script next to this one; $DIR already ends in "/"
+ parseXmx "$@" # expected to come from calcmem.sh; presumably sets set=1 when the user passed -Xmx -- confirm there
+ if [[ $set == 1 ]]; then # a value was already chosen; leave $z alone
+ return
+ fi
+ freeRam 4000m 42 # helper call; $RAM is read right after, so it presumably stores a megabyte figure there (arguments look like a floor and a percentage -- confirm)
+ z="-Xmx${RAM}m" # NOTE(review): z2 keeps its initial -Xms4g here; it is not used on the java command line in this script
+}
+calcXmx "$@"
+
+crosscontaminate() { # Launch jgi.CrossContaminate with the selected JVM options and the caller's arguments.
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ # Run java directly with "$@": eval on a flattened string re-split and re-interpreted arguments (spaces, quotes, metacharacters).
+ echo "java $EA $z -cp $CP jgi.CrossContaminate $*" >&2
+ java $EA $z -cp "$CP" jgi.CrossContaminate "$@"
+}
+
+crosscontaminate "$@"
diff --git a/current/align2/AbstractIndex.java b/current/align2/AbstractIndex.java
new file mode 100755
index 0000000..4973e3d
--- /dev/null
+++ b/current/align2/AbstractIndex.java
@@ -0,0 +1,227 @@
+package align2;
+
+import java.util.ArrayList;
+
+import stream.SiteScore;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 15, 2013
+ *
+ */
+public abstract class AbstractIndex {
+
+ AbstractIndex(int keylen, int kfilter, int pointsMatch, int minChrom_, int maxChrom_, MSA msa_){ //Stores key length, filter, aligner and chromosome range; derives KEYSPACE and BASE_KEY_HIT_SCORE.
+ KEYLEN=keylen;
+ KEYSPACE=1<<(2*KEYLEN); //Equals 4^KEYLEN
+ BASE_KEY_HIT_SCORE=pointsMatch*KEYLEN;
+ KFILTER=kfilter;
+ msa=msa_;
+ 
+ minChrom=minChrom_;
+ maxChrom=maxChrom_;
+ assert(minChrom==MINCHROM); //Instance range must agree with the static MINCHROM/MAXCHROM
+ assert(maxChrom==MAXCHROM);
+ assert(minChrom<=maxChrom);
+ }
+
+ final int count(int key){ //Returns COUNTS[key] when COUNTS is loaded; otherwise derives the count from block 0 of the index.
+// assert(false);
+ if(COUNTS!=null){return COUNTS[key];} //TODO: Benchmark speed and memory usage with counts=null. Probably only works for single-block genomes.
+// assert(false);
+ final Block b=index[0]; //Fallback path looks at the first block only
+ final int rkey=KeyRing.reverseComplementKey(key, KEYLEN);
+ int a=b.length(key);
+ return key==rkey ? a : a+b.length(rkey); //Add the reverse-complement key's length unless the key is its own reverse complement
+ }
+
+ static final boolean overlap(int a1, int b1, int a2, int b2){ //True when closed ranges [a1, b1] and [a2, b2] share at least one point
+ assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+ return !(a2>b1 || b2<a1);
+ }
+
+ /** Returns true when range (a1, b1) lies entirely inside range (a2, b2), endpoints included. */
+ static final boolean isWithin(int a1, int b1, int a2, int b2){
+ assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+ return a2<=a1 && b2>=b1;
+ }
+
+
+ /** Generates a term that increases score with how far apart the two farthest perfect matches are.
+ * Assumes that the centerIndex corresponds to the leftmost perfect match. Returns offsets[rightIndex]-offsets[centerIndex]. */
+ final int scoreY(int[] locs, int centerIndex, int offsets[]){
+ int center=locs[centerIndex]; //Location value that the other entries are compared against
+// int rightIndex=centerIndex;
+// for(int i=centerIndex; i<offsets.length; i++){
+// if(locs[i]==center){
+// rightIndex=i;
+// }
+// }
+ 
+ int rightIndex=-1;
+ for(int i=offsets.length-1; rightIndex<centerIndex; i--){ //Scan from the right; stops at the rightmost index whose location equals center (at the latest at centerIndex itself)
+ if(locs[i]==center){
+ rightIndex=i;
+ }
+ }
+ 
+ //Assumed to not be necessary.
+// for(int i=0; i<centerIndex; i++){
+// if(locs[i]==center){
+// centerIndex=i;
+// }
+// }
+ 
+ return offsets[rightIndex]-offsets[centerIndex]; //Zero when centerIndex is also the rightmost match
+ }
+
+ abstract float[] keyProbArray();
+ abstract byte[] getBaseScoreArray(int len, int strand);
+ abstract int[] getKeyScoreArray(int len, int strand);
+
+ abstract int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality);
+ public abstract ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id);
+
+ long callsToScore=0;
+ long callsToExtendScore=0;
+ long initialKeys=0;
+ long initialKeyIterations=0;
+ long initialKeys2=0;
+ long initialKeyIterations2=0;
+ long usedKeys=0;
+ long usedKeyIterations=0;
+
+ static final int HIT_HIST_LEN=40;
+ final long[] hist_hits=new long[HIT_HIST_LEN+1];
+ final long[] hist_hits_score=new long[HIT_HIST_LEN+1];
+ final long[] hist_hits_extend=new long[HIT_HIST_LEN+1];
+
+ final int minChrom;
+ final int maxChrom;
+
+ static int MINCHROM=1;
+ static int MAXCHROM=Integer.MAX_VALUE;
+
+ static final boolean SUBSUME_SAME_START_SITES=true; //Not recommended if slow alignment is disabled.
+ static final boolean SUBSUME_SAME_STOP_SITES=true; //Not recommended if slow alignment is disabled.
+
+ /**
+ * True: Slightly slower.<br>
+ * False: Faster, but may mask detection of some ambiguously mapping reads.
+ */
+ static final boolean LIMIT_SUBSUMPTION_LENGTH_TO_2X=true;
+
+ /** Not recommended if slow alignment is disabled. Can conceal sites that should be marked as ambiguous. */
+ static final boolean SUBSUME_OVERLAPPING_SITES=false;
+
+ static final boolean SHRINK_BEFORE_WALK=true;
+
+ /** More accurate but uses chromosome arrays while mapping */
+ static final boolean USE_EXTENDED_SCORE=true; //Calculate score more slowly by extending keys
+
+ /** Even more accurate but even slower than normal extended score calculation.
+ * Scores are compatible with slow-aligned scores. */
+ static final boolean USE_AFFINE_SCORE=true && USE_EXTENDED_SCORE; //Calculate score even more slowly
+
+
+ public static final boolean RETAIN_BEST_SCORES=true;
+ public static final boolean RETAIN_BEST_QCUTOFF=true;
+
+ public static boolean QUIT_AFTER_TWO_PERFECTS=true;
+ static final boolean DYNAMICALLY_TRIM_LOW_SCORES=true;
+
+
+ static final boolean REMOVE_CLUMPY=true; //Remove keys like AAAAAA or GCGCGC that self-overlap and thus occur in clumps
+
+
+ /** If no hits are found, search again with slower parameters (less of genome excluded) */
+ static final boolean DOUBLE_SEARCH_NO_HIT=false;
+ /** Only this fraction of the originally removed genome fraction (FRACTION_GENOME_TO_EXCLUDE)
+ * is removed for the second pass */
+ static final float DOUBLE_SEARCH_THRESH_MULT=0.25f; //Must be less than 1.
+
+ static boolean PERFECTMODE=false;
+ static boolean SEMIPERFECTMODE=false;
+ static final boolean REMOVE_FREQUENT_GENOME_FRACTION=true; //Default true; false is more accurate
+
+ /** Ignore longest site list(s) when doing a slow walk. */
+ static final boolean TRIM_LONG_HIT_LISTS=false; //Increases speed with tiny loss of accuracy. Default: true for clean or synthetic, false for noisy real data
+
+
+ public static final boolean TRIM_BY_GREEDY=true; //default: true
+
+ public static int MIN_APPROX_HITS_TO_KEEP=1; //Default 2 for skimmer, 1 otherwise, min 1; lower is more accurate
+
+
+ public static final boolean TRIM_BY_TOTAL_SITE_COUNT=false; //default: false
+ /** Length histogram index of maximum average hit list length to use.
+ * The max number of sites to search is calculated by (#keys)*(lengthHistogram[chrom][MAX_AVERAGE_SITES_TO_SEARCH]).
+ * Then, while the actual number of sites exceeds this, the longest hit list should be removed.
+ */
+
+ static int MAX_USABLE_LENGTH=Integer.MAX_VALUE;
+ static int MAX_USABLE_LENGTH2=Integer.MAX_VALUE;
+
+
+ public static void clear(){ //Drops the static references held by this class: COUNTS, lengthHistogram and index
+ COUNTS=null;
+ lengthHistogram=null;
+ index=null;
+ }
+
+ static Block[] index;
+ static int[] lengthHistogram=null;
+ static int[] COUNTS=null;
+
+ final int KEYLEN; //default 12, suggested 10 ~ 13, max 15; bigger is faster but uses more RAM
+ final int KEYSPACE;
+ /** Site must have at least this many contiguous matches */
+ final int KFILTER;
+ final MSA msa;
+ final int BASE_KEY_HIT_SCORE;
+
+
+ boolean verbose=false;
+ static boolean verbose2=false;
+
+ static boolean SLOW=false;
+ static boolean VSLOW=false;
+
+ static int NUM_CHROM_BITS=3;
+ static int CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS));
+
+ static final int MINGAP=Shared.MINGAP;
+ static final int MINGAP2=(MINGAP+128); //Depends on read length...
+
+ static boolean USE_CAMELWALK=false;
+
+ static final boolean ADD_LIST_SIZE_BONUS=false;
+ static final byte[] LIST_SIZE_BONUS=new byte[100];
+
+ public static boolean GENERATE_KEY_SCORES_FROM_QUALITY=true; //True: Much faster and more accurate.
+ public static boolean GENERATE_BASE_SCORES_FROM_QUALITY=true; //True: Faster, and at least as accurate.
+
+ static final int calcListSizeBonus(int[] array){ //Looks up the bonus for array.length; 0 for null or for lengths beyond the table
+ final boolean inTable=(array!=null && array.length<LIST_SIZE_BONUS.length);
+ return inTable ? LIST_SIZE_BONUS[array.length] : 0;
+ }
+
+ static final int calcListSizeBonus(int size){ //Looks up the bonus for the given size; 0 for sizes beyond the table
+ final boolean inTable=(size<LIST_SIZE_BONUS.length);
+ return inTable ? LIST_SIZE_BONUS[size] : 0;
+ }
+
+ static{ //Fills LIST_SIZE_BONUS: sizes 0, 1, 2 get bonuses 3, 2, 1; every other slot stays 0
+ final int len=LIST_SIZE_BONUS.length;
+// for(int i=1; i<len; i++){
+// int x=(int)((len/(Math.sqrt(i)))/5)-1;
+// LIST_SIZE_BONUS[i]=(byte)(x/2);
+// }
+ LIST_SIZE_BONUS[0]=3;
+ LIST_SIZE_BONUS[1]=2;
+ LIST_SIZE_BONUS[2]=1;
+ LIST_SIZE_BONUS[len-1]=0; //Already 0 by default for a new byte[]; written explicitly
+// System.err.println(Arrays.toString(LIST_SIZE_BONUS));
+ }
+
+}
diff --git a/current/align2/AbstractMapThread.java b/current/align2/AbstractMapThread.java
new file mode 100755
index 0000000..db0ca3f
--- /dev/null
+++ b/current/align2/AbstractMapThread.java
@@ -0,0 +1,3189 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import jgi.CoveragePileup;
+import jgi.ReformatReads;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SamLine;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+
+
+
+/**
+ * @author Brian Bushnell
+ * @date Feb 27, 2013
+ *
+ */
+public abstract class AbstractMapThread extends Thread {
+
+ AbstractMapThread(ConcurrentReadInputStream cris_, //Stores the supplied streams and settings in fields, derives dependent thresholds, and builds the MSA aligner when slow alignment or match strings are requested
+ ConcurrentReadOutputStream outStream_, ConcurrentReadOutputStream outStreamMapped_, ConcurrentReadOutputStream outStreamUnmapped_, ConcurrentReadOutputStream outStreamBlack_,
+ CoveragePileup pileup_, boolean SLOW_ALIGN_, boolean LOCAL_ALIGN_, boolean AMBIGUOUS_TOSS_,
+ boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, int MIN_TRIM_LEN_, int THRESH_,
+ int minChrom_, int maxChrom_, int KFILTER_, float IDFILTER_, boolean KILL_BAD_PAIRS_, boolean SAVE_AMBIGUOUS_XY_,
+ boolean REQUIRE_CORRECT_STRANDS_PAIRS_,
+ boolean SAME_STRAND_PAIRS_, boolean DO_RESCUE_, boolean STRICT_MAX_INDEL_, int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_,
+ String MSA_TYPE_, int keylen_, boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, boolean RCOMP_MATE_,
+ boolean MAKE_MATCH_STRING_, boolean OUTPUT_MAPPED_ONLY_, boolean DONT_OUTPUT_BLACKLISTED_READS_, boolean PRINT_SECONDARY_ALIGNMENTS_,
+ boolean QUICK_MATCH_STRINGS_, int MAX_SITESCORES_TO_PRINT_, float MINIMUM_ALIGNMENT_SCORE_RATIO_,
+ float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_,
+ int MIN_APPROX_HITS_TO_KEEP_, boolean USE_EXTENDED_SCORE_, int BASE_HIT_SCORE_, boolean USE_AFFINE_SCORE_, int MAX_INDEL_,
+ boolean TRIM_LIST_, int TIP_DELETION_SEARCH_RANGE_){
+
+
+ cris=cris_; //Input stream; the four output streams below are each null-checked in writeList() before use
+ outStream=outStream_;
+ outStreamMapped=outStreamMapped_;
+ outStreamUnmapped=outStreamUnmapped_;
+ outStreamBlack=outStreamBlack_;
+ pileup=pileup_; //Coverage is accumulated in run() only when this is non-null
+
+ SLOW_ALIGN=SLOW_ALIGN_;
+ LOCAL_ALIGN=LOCAL_ALIGN_;
+ AMBIGUOUS_TOSS=AMBIGUOUS_TOSS_;
+ AMBIGUOUS_RANDOM=AMBIGUOUS_RANDOM_;
+ AMBIGUOUS_ALL=AMBIGUOUS_ALL_;
+ TRIM_LEFT=TRIM_LEFT_;
+ TRIM_RIGHT=TRIM_RIGHT_;
+ UNTRIM=UNTRIM_;
+ TRIM_QUAL=TRIM_QUAL_;
+ TRIM_MIN_LENGTH=MIN_TRIM_LEN_;
+ THRESH=THRESH_;
+ minChrom=minChrom_;
+ maxChrom=maxChrom_;
+ KFILTER=KFILTER_;
+ IDFILTER=IDFILTER_; //processIDFilter() is a no-op when this is <=0
+
+ KILL_BAD_PAIRS=KILL_BAD_PAIRS_;
+ SAVE_AMBIGUOUS_XY=SAVE_AMBIGUOUS_XY_;
+// GEN_MATCH_FAST=GEN_MATCH_FAST_;
+ SLOW_ALIGN_PADDING=SLOW_ALIGN_PADDING_;
+ SLOW_RESCUE_PADDING=SLOW_RESCUE_PADDING_;
+ DO_RESCUE=DO_RESCUE_;
+ STRICT_MAX_INDEL=STRICT_MAX_INDEL_;
+ BANDWIDTH=MSA.bandwidth; //Read from the static MSA setting rather than from a constructor argument
+ PAIRED=cris.paired(); //Pairing is determined by the input stream
+ REQUIRE_CORRECT_STRANDS_PAIRS=REQUIRE_CORRECT_STRANDS_PAIRS_;
+ SAME_STRAND_PAIRS=SAME_STRAND_PAIRS_;
+
+ /* ------------ */
+
+ TRIM_LIST=TRIM_LIST_;
+ TIP_DELETION_SEARCH_RANGE=TIP_DELETION_SEARCH_RANGE_;
+ FIND_TIP_DELETIONS=TIP_DELETION_SEARCH_RANGE>0; //Tip-deletion search is enabled by any positive range
+
+ MIN_APPROX_HITS_TO_KEEP=MIN_APPROX_HITS_TO_KEEP_;
+ USE_EXTENDED_SCORE=USE_EXTENDED_SCORE_;
+ BASE_HIT_SCORE=BASE_HIT_SCORE_;
+ BASE_KEY_HIT_SCORE=BASE_HIT_SCORE*keylen_; //Per-base hit score multiplied by the key length
+ USE_AFFINE_SCORE=USE_AFFINE_SCORE_;
+ EXPECTED_LEN_LIMIT=(ALIGN_COLUMNS()*17)/20-(2*(SLOW_ALIGN_PADDING+10)); //TODO: Due to some bug in expected length calculation, this is low. NOTE(review): ALIGN_COLUMNS() is abstract and is invoked during construction - confirm implementations do not depend on subclass state set later.
+ MAX_INDEL=MAX_INDEL_;
+ ALIGN_COLUMNS=ALIGN_COLUMNS(); //Caches the subclass-provided value in a field
+
+ /* ------------ */
+
+
+ KEYLEN=keylen_;
+ keyDensity=keyDensity_;
+ maxKeyDensity=maxKeyDensity_;
+ minKeyDensity=minKeyDensity_;
+ maxDesiredKeys=maxDesiredKeys_;
+
+ MINIMUM_ALIGNMENT_SCORE_RATIO=MINIMUM_ALIGNMENT_SCORE_RATIO_;
+ MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED=Tools.max(MINIMUM_ALIGNMENT_SCORE_RATIO*.80f, 1-((1-MINIMUM_ALIGNMENT_SCORE_RATIO)*1.4f)); //Relaxed ratio: the larger of 0.8x the base ratio and 1-1.4*(1-ratio)
+ MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE=Tools.max(MINIMUM_ALIGNMENT_SCORE_RATIO*.60f, 1-((1-MINIMUM_ALIGNMENT_SCORE_RATIO)*1.8f)); //Further relaxed: the larger of 0.6x the base ratio and 1-1.8*(1-ratio)
+// TRIM_LIST=TRIM_LIST_;
+ MAKE_MATCH_STRING=(MAKE_MATCH_STRING_ || STRICT_MAX_INDEL_); //STRICT_MAX_INDEL_ forces match strings on
+ assert(SLOW_ALIGN_PADDING>=0);
+
+ OUTPUT_MAPPED_ONLY=OUTPUT_MAPPED_ONLY_;
+ DONT_OUTPUT_BLACKLISTED_READS=DONT_OUTPUT_BLACKLISTED_READS_;
+ MAX_SITESCORES_TO_PRINT=MAX_SITESCORES_TO_PRINT_;
+ PRINT_SECONDARY_ALIGNMENTS=PRINT_SECONDARY_ALIGNMENTS_;
+ QUICK_MATCH_STRINGS=((QUICK_MATCH_STRINGS_ || STRICT_MAX_INDEL_) && MAKE_MATCH_STRING); //Only meaningful when match strings are being made at all
+
+ RCOMP_MATE=RCOMP_MATE_;
+ PERFECTMODE=PERFECTMODE_;
+ SEMIPERFECTMODE=SEMIPERFECTMODE_;
+ FORBID_SELF_MAPPING=FORBID_SELF_MAPPING_;
+// assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO";
+
+// TIP_DELETION_SEARCH_RANGE=TIP_DELETION_SEARCH_RANGE_;
+// FIND_TIP_DELETIONS=TIP_DELETION_SEARCH_RANGE>0;
+// EXPECTED_LEN_LIMIT=(ALIGN_COLUMNS*17)/20-(2*(SLOW_ALIGN_PADDING+10)); //TODO: Due to some bug in expected length calculation, this is low.
+ MSA_TYPE=MSA_TYPE_;
+ EXTRA_PADDING=(BANDWIDTH<1 && (MSA.bandwidthRatio<=0 || MSA.bandwidthRatio>=0.2f) ? //Keeps the existing EXTRA_PADDING unless a bandwidth or a bandwidth ratio below 0.2 is in effect
+ EXTRA_PADDING : Tools.min(EXTRA_PADDING, Tools.max(BANDWIDTH/4, (int)(MSA.bandwidthRatio*60)))); //Otherwise caps it at max(BANDWIDTH/4, bandwidthRatio*60)
+
+ AVERAGE_PAIR_DIST=INITIAL_AVERAGE_PAIR_DIST;
+
+ if(SLOW_ALIGN || MAKE_MATCH_STRING){ //An aligner is only built when something will use it
+ msa=MSA.makeMSA(ALIGN_ROWS(), ALIGN_COLUMNS(), MSA_TYPE);
+ POINTS_MATCH=msa.POINTS_MATCH();
+ POINTS_MATCH2=msa.POINTS_MATCH2();
+// CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2);
+// CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2);
+// CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2);
+// CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2);
+// CLEARZONE3=PENALIZE_AMBIG ? (int)(CLEARZONE_RATIO3*POINTS_MATCH2) : 0;
+ CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1;
+ }else{
+ POINTS_MATCH=70; //Hard-coded fallback values used when no aligner exists
+ POINTS_MATCH2=100;
+ msa=null;
+// CLEARZONE1=0;
+// CLEARZONE1b=0;
+// CLEARZONE1c=0;
+// CLEARZONEP=0;
+// CLEARZONE3=0;
+ CLEARZONE1e=0;
+ }
+
+// CLEARZONE1b_CUTOFF_FLAT=CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2;
+// CLEARZONE1c_CUTOFF_FLAT=CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2;
+// INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3);
+
+// index=new BBIndex(KEYLEN, minChrom, maxChrom, KFILTER, msa);
+ GENERATE_KEY_SCORES_FROM_QUALITY=AbstractIndex.GENERATE_KEY_SCORES_FROM_QUALITY; //Local copy of the static index setting
+ readstats=(ReadStats.collectingStats() ? new ReadStats() : null); //Null means no statistics are collected; run() checks this before building each histogram
+
+ PROCESS_EDIT_FILTER=(SUBFILTER>=0 || DELFILTER>=0 || INSFILTER>=0 || INDELFILTER>=0 || DELLENFILTER>=0 || INSLENFILTER>=0 || EDITFILTER>=0); //Any non-negative limit switches the edit filter on
+ }
+
+ public abstract int ALIGN_COLUMNS(); //Column count passed to MSA.makeMSA() by the constructor, which also uses it to derive EXPECTED_LEN_LIMIT
+ public abstract int ALIGN_ROWS(); //Row count passed to MSA.makeMSA() by the constructor
+ abstract int CLEARZONE1(); //Value handed to BBSplitter.printReads() from writeList()
+
+ abstract AbstractIndex index(); //The index that quickMap() uses to look up candidate sites
+
+ public final void postFilterRead(Read r, byte[] basesM, int maxImperfectSwScore, int maxSwScore){ //Applies the identity and edit filters to a mapped, imperfect read; if the top site was removed, regenerates match strings and demotes the surviving sites
+ if(!r.mapped() || r.perfect()){return;} //Unmapped and perfect reads are never filtered
+ assert(Read.CHECKSITES(r, basesM));
+ ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore); //May clear the mapping, hence the repeated check on the next line
+ if(!r.mapped() || r.perfect()){return;}
+ assert(r.match!=null) : "Postfiltering does not work with cigar strings disabled.";
+ boolean removedTop=false;
+ if(verbose && (PROCESS_EDIT_FILTER || IDFILTER>0)){
+ System.err.println("\nBefore filtering: sites=\n"+r.sites);
+// new Exception().printStackTrace(System.err);
+ }
+ removedTop=processIDFilter(r, basesM, maxImperfectSwScore, maxSwScore) | removedTop; //Filter call is the left operand, so it always executes
+ removedTop=processEditFilter(r, basesM, maxImperfectSwScore, maxSwScore) | removedTop;
+ if(verbose && (PROCESS_EDIT_FILTER || IDFILTER>0)){
+ System.err.println("\nAfter filtering: removedTop="+removedTop+", sites=\n"+r.sites);
+ }
+ if(removedTop && r.mapped() && r.match==null){ //The new top site has no match string yet: generate one, then filter again
+ ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore);
+ ensureMatchStringsOnSiteScores(r, basesM, maxImperfectSwScore, maxSwScore);
+ postFilterRead(r, basesM, maxImperfectSwScore, maxSwScore); //Recursive pass over the new top site
+ if(verbose){
+ System.err.println("\nAfter filtering2: sites=\n"+r.sites);
+ }
+ }
+ if(removedTop && r.mapped()){ //The original best site was rejected: demote everything that is left
+ for(SiteScore ss : r.sites){
+ ss.setScore(Tools.min(ss.score, ss.score/4)); //min(x, x/4) quarters a positive score and leaves a negative one unchanged
+ int newSlowScore=Tools.min(ss.slowScore, ss.slowScore/4);
+ int newPairedScore=(ss.pairedScore<=ss.slowScore ? 0 : Tools.max(newSlowScore+1, Tools.min(ss.pairedScore, ss.pairedScore/4))); //Zero unless the paired score exceeded the slow score; otherwise kept strictly above the new slow score
+ ss.setSlowPairedScore(newSlowScore, newPairedScore);
+ }
+ SiteScore top=r.topSite();
+ assert(top!=null) : r;
+ r.mapScore=top.score;
+ if(verbose){
+ System.err.println("\nAfter filtering3: sites=\n"+r.sites);
+ }
+ r.setAmbiguous(true); //A read whose top site was filtered out is flagged ambiguous
+ }
+ }
+
+ final int ensureMatchStringsOnSiteScores(Read r, byte[] basesM, int maxImperfectSwScore, int maxSwScore){ //Generates missing match strings for the secondary sites (index 1 onward), dropping sites for which none could be made; returns the number dropped
+ if(!r.mapped() || r.numSites()<1){return 0;}
+ int removed=0;
+ int generated=0;
+ final SiteScore top=r.topSite(); //Remembered so that a change of top site can be detected afterwards
+ for(int i=1, lim=MAX_SITESCORES_TO_PRINT; i<lim+removed && i<r.numSites(); i++){ //Starts at 1 to skip the primary site; the limit is extended by one for every site nulled so far
+ SiteScore ss=r.sites.get(i);
+ if(ss.match==null){
+ genMatchStringForSite(r.numericID, ss, r.bases, basesM, maxImperfectSwScore, maxSwScore, r.mate, PRINT_SECONDARY_ALIGNMENTS);
+ if(ss.match==null){
+ r.sites.set(i, null); //Null placeholder keeps the indices stable during the loop; presumably compacted by condenseStrict() below
+ removed++;
+ }else{
+ generated++;
+ }
+ }
+ }
+ if(removed>0){Tools.condenseStrict(r.sites);}
+ if(generated>0){Collections.sort(r.sites);} //Re-sort only when some site gained a match string
+ if(r.topSite()!=top){ //A different site is now on top: reload the read from it and drop the pairing flags
+ r.setFromTopSite();
+ if(r.mate!=null){
+ r.setPaired(false);
+ r.mate.setPaired(false);
+ }
+ }
+ return removed;
+ }
+
+ final int ensureMatchStringOnPrimary(final Read r, final byte[] basesM, final int maxImperfectSwScore, final int maxSwScore){ //Makes sure the top site carries a match string, discarding candidate sites for which none can be generated; returns the number discarded
+ if(!r.mapped() || r.numSites()<1){return 0;}
+ if(r.match!=null){ //Already has a match string; nothing to do
+ assert(r.numSites()>0);
+ assert(r.sites.get(0).match==r.match);
+ return 0;
+ }
+ int removed=0;
+ int generated=0;
+ final SiteScore top=r.topSite();
+
+ boolean success=false;
+ for(int i=0, lim=r.numSites(); i<lim && !success; i++){ //Walks the sites in order and stops at the first one that has, or gains, a match string
+ SiteScore ss=r.sites.get(i);
+ if(ss.match==null){
+ genMatchStringForSite(r.numericID, ss, r.bases, basesM, maxImperfectSwScore, maxSwScore, r.mate, false);
+ if(ss.match==null){
+ r.sites.set(i, null); //Null placeholder; presumably compacted by condenseStrict() below
+ removed++;
+ }else{
+ generated++;
+ }
+ }
+ success=ss.match!=null;
+ }
+ if(removed>0){Tools.condenseStrict(r.sites);}
+ if(generated>0){Collections.sort(r.sites);}
+
+ if(r.numSites()<1){ //No candidate survived: the read becomes unmapped
+ r.clearMapping();
+ return removed;
+ }
+
+ if(r.sites.get(0).match==null){return removed+ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore);} //Sorting can put a site without a match string on top; retry recursively
+
+ r.setFromTopSite();
+ if(r.topSite()!=top && r.mate!=null){ //Top site changed: drop the pairing flags on both mates
+ r.setPaired(false);
+ r.mate.setPaired(false);
+ }
+
+ return removed;
+ }
+
+
+ public final boolean processIDFilter(Read r, byte[] basesM, int maxImperfectSwScore, int maxSwScore){ //Identity filter: unmaps an unpaired read whose primary alignment is below IDFILTER and drops secondary sites below IDFILTER; returns removedTop
+ if(IDFILTER<=0){return false;} //Filter disabled
+ if(!r.mapped() || r.perfect()){return false;}
+ assert(r.match!=null) : "Identity Filter does not work with cigar strings disabled.";
+ if(!r.paired() && Read.identityFlat(r.match)<IDFILTER){ //Only unpaired reads are unmapped on a low-identity primary alignment
+ r.clearMapping();
+ if(r.mate!=null){
+ r.setPaired(false);
+ r.mate.setPaired(false);
+ }
+ }
+ boolean removedTop=false;
+ if(r.sites!=null){
+ int removed=0;
+ for(int i=r.sites.size()-1; i>0; i--){ //NOTE(review): the loop stops before index 0, so the top site is never examined here and removedTop can never become true - confirm this is intended (processEditFilter uses i>=0)
+ SiteScore ss=r.sites.get(i);
+ if(removedTop && ss.match==null){ //Unreachable as written, since removedTop is still false throughout the loop
+ genMatchStringForSite(r.numericID, ss, r.bases, basesM, maxImperfectSwScore, maxSwScore, r.mate, PRINT_SECONDARY_ALIGNMENTS);
+ }
+ if(ss.match!=null && !ss.perfect() && Read.identityFlat(ss.match)<IDFILTER){ //Sites without a match string are left alone
+ r.sites.set(i, null); //Null placeholder; presumably compacted by condenseStrict() below
+ removed++;
+ if(i==0){removedTop=true;}
+ }
+ }
+ if(removed>0){Tools.condenseStrict(r.sites);}
+ }
+ if(removedTop){ //See the note on the loop: not reachable with the current loop bound
+ if(r.mate!=null){
+ r.setPaired(false);
+ r.mate.setPaired(false);
+ }
+ if(r.sites==null || r.sites.isEmpty() || r.topSite().match==null){
+ r.clearMapping();
+ }else{
+ r.setFromTopSite();
+ }
+ }
+ return removedTop;
+ }
+
+ public final boolean processEditFilter(Read r, byte[] basesM, int maxImperfectSwScore, int maxSwScore){ //Edit filter: drops sites whose match string exceeds any enabled substitution/insertion/deletion limit; returns true if the top site (index 0) was dropped
+ if(!PROCESS_EDIT_FILTER || !r.mapped() || r.match==null || r.perfect()){return false;} //Nothing to do when disabled, unmapped, lacking a match string, or perfect
+ assert(r.match!=null) : "Edit Filter does not work with cigar strings disabled.";
+ boolean removedTop=false;
+ if(r.sites!=null){
+ int removed=0;
+ for(int i=r.sites.size()-1; i>=0; i--){ //Walks from the last site down to the top site
+ SiteScore ss=r.sites.get(i);
+ if(removedTop && ss.match==null){ //removedTop is only set at i==0, the final iteration, so this branch cannot fire inside this loop
+ genMatchStringForSite(r.numericID, ss, r.bases, basesM, maxImperfectSwScore, maxSwScore, r.mate, PRINT_SECONDARY_ALIGNMENTS);
+ }
+ if(ss.match!=null && !ss.semiperfect()){ //Semiperfect sites and sites without a match string are never filtered
+ final int sub=Read.countSubs(ss.match);
+ final int ins=Read.countInsertions(ss.match);
+ final int del=Read.countDeletions(ss.match);
+ final int inscount=Read.countInsertionEvents(ss.match);
+ final int delcount=Read.countDeletionEvents(ss.match);
+
+ boolean bad=false; //Each limit applies only when it is >=0
+ bad=bad||(SUBFILTER>=0 && sub>SUBFILTER);
+// System.err.println(SUBFILTER>=0 && sub>SUBFILTER);
+ bad=bad||(INSFILTER>=0 && inscount>INSFILTER); //Compares the insertion event count, not ins
+// System.err.println(INSFILTER>=0 && ins>INSFILTER);
+ bad=bad||(DELFILTER>=0 && delcount>DELFILTER); //Compares the deletion event count, not del
+// System.err.println(DELFILTER>=0 && del>DELFILTER);
+ bad=bad||(INSLENFILTER>=0 && hasLongInsertion(ss.match, INSLENFILTER));
+// System.err.println(INSLENFILTER>=0 && hasLongInsertion(ss.match, INSLENFILTER));
+ bad=bad||(DELLENFILTER>=0 && hasLongDeletion(ss.match, DELLENFILTER));
+// System.err.println(DELLENFILTER>=0 && hasLongDeletion(ss.match, DELLENFILTER));
+ bad=bad||(INDELFILTER>=0 && inscount+delcount>INDELFILTER); //Combined insertion and deletion event count
+// System.err.println(EDITFILTER>=0 && sub+ins+del>DELFILTER);
+ bad=bad||(EDITFILTER>=0 && sub+ins+del>EDITFILTER); //Sum of the sub, ins and del counts
+// System.err.println(EDITFILTER>=0 && sub+ins+del>DELFILTER);
+ if(bad){
+ r.sites.set(i, null); //Null placeholder; presumably compacted by condenseStrict() below
+ removed++;
+ if(i==0){removedTop=true;}
+ }
+
+// assert(false) : SUBFILTER+", "+PROCESS_EDIT_FILTER+", "+sub+", "+ins+", "+del+", "+bad;
+ }
+ }
+ if(removed>0){Tools.condenseStrict(r.sites);}
+ }
+ if(removedTop){ //Top site rejected: unpair, then either unmap or promote the next site
+ if(r.mate!=null){
+ r.setPaired(false);
+ r.mate.setPaired(false);
+ }
+ if(r.sites==null || r.sites.isEmpty()){
+ r.clearMapping();
+ }else{
+ r.setFromTopSite();
+ }
+ }
+ return removedTop;
+ }
+
+ /**
+ * Main loop of a mapping thread. Repeatedly fetches a list of reads from cris, optionally skips
+ * reads whose numericID is below SKIP_INITIAL, then for each remaining read: resets prior answers,
+ * collects the enabled statistics, optionally trims, maps it via processRead() (unpaired) or
+ * processReadPair() (paired), caps the site list, optionally untrims, and records post-mapping
+ * statistics. Each processed list is added to the coverage pileup (if any), passed to writeList(),
+ * and returned to cris. The loop ends on the first empty list, after which finish() is called.
+ */
+ @Override
+ public final void run() {
+ //System.err.println("Waiting on a list... (initial)");
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> readlist=ln.list;
+
+// long count=System.currentTimeMillis();
+// String os=System.getProperty("os.name");
+// int procs=Runtime.getRuntime().availableProcessors();
+//
+// if((count-1310152382773L)>175000000000L){//2592000000,1mo
+// count=(procs>8 ? 1 : 2)+((hashCode()&0xFFFFFFF)%5);
+// }
+ //All statistics flags are false when no ReadStats object was created in the constructor.
+ final boolean black=(Blacklist.hasBlacklist());
+ final boolean MAKE_QUALITY_HISTOGRAM=(readstats==null ? false : ReadStats.COLLECT_QUALITY_STATS);
+ final boolean MAKE_MATCH_HISTOGRAM=(readstats==null ? false : ReadStats.COLLECT_MATCH_STATS);
+ final boolean MAKE_INSERT_HISTOGRAM=(readstats==null ? false : ReadStats.COLLECT_INSERT_STATS);
+ final boolean MAKE_BASE_HISTOGRAM=(readstats==null ? false : ReadStats.COLLECT_BASE_STATS);
+ final boolean MAKE_QUALITY_ACCURACY=(readstats==null ? false : ReadStats.COLLECT_QUALITY_ACCURACY);
+
+ final boolean MAKE_EHIST=(readstats==null ? false : ReadStats.COLLECT_ERROR_STATS);
+ final boolean MAKE_INDELHIST=(readstats==null ? false : ReadStats.COLLECT_INDEL_STATS);
+ final boolean MAKE_LHIST=(readstats==null ? false : ReadStats.COLLECT_LENGTH_STATS);
+ final boolean MAKE_GCHIST=(readstats==null ? false : ReadStats.COLLECT_GC_STATS);
+ final boolean MAKE_IDHIST=(readstats==null ? false : ReadStats.COLLECT_IDENTITY_STATS);
+ final boolean MAKE_TIMEHIST=(readstats==null ? false : ReadStats.COLLECT_TIME_STATS);
+ final boolean MAKE_COVERAGE=(pileup==null ? false : true);
+
+ if(SKIP_INITIAL>0){
+ while(!readlist.isEmpty()){
+
+ if(readlist.get(readlist.size()-1).numericID<SKIP_INITIAL){
+ //Do nothing
+ }else{
+ //This list straddles the boundary: drop only the leading reads that are still below it.
+ while(readlist.get(0).numericID<SKIP_INITIAL){readlist.remove(0);}
+ break;
+ }
+
+ //An empty list is still written so that the output streams see every list ID.
+ writeList(new ArrayList<Read>(1), black, ln.id);
+
+ cris.returnList(ln.id, readlist.isEmpty());
+// if(count>0){
+// cris.returnList(ln.id, readlist.isEmpty());
+// count--;
+// }
+
+ //System.err.println("Waiting on a list...");
+ ln=cris.nextList();
+ readlist=ln.list;
+ }
+ }
+
+ while(!readlist.isEmpty()){
+
+ if(MAX_READ_LENGTH>0 || MIN_READ_LENGTH>0){
+ ReformatReads.breakReads(readlist, MAX_READ_LENGTH, MIN_READ_LENGTH);
+ }
+
+ //System.err.println("Got a list of size "+readlist.size());
+ for(int i=0; i<readlist.size(); i++){
+
+ long startTime=0;
+ if(TIME_TAG){startTime=System.nanoTime();}
+
+ Read r=readlist.get(i);
+ assert(r.mate==null || (r.pairnum()==0 && r.mate.pairnum()==1)) : r.pairnum()+", "+r.mate.pairnum();
+
+ // System.out.println("Got read: "+r.toText(false));
+ // System.out.println("Synthetic: "+r.synthetic());
+
+
+
+
+ if(r.synthetic()){
+ syntheticReads++;
+ if(r.originalSite==null){r.makeOriginalSite();}
+ r.clearSite();
+ if(r.mate!=null){
+ assert(r.mate.synthetic());
+ if(r.mate.originalSite==null){r.mate.makeOriginalSite();}
+ r.mate.clearSite();
+ }
+ }
+
+ //Clear these in case output is being re-used
+ r.clearAnswers(true);
+
+ assert(r.bases==null || r.length()<=maxReadLength()) : "Read "+r.numericID+", length "+r.length()+", exceeds the limit of "+maxReadLength()+"\n"+
+ "You can map the reads in chunks by reformatting to fasta, then mapping with the setting 'fastareadlen="+maxReadLength()+"'";
+ final Read r2=r.mate;
+
+ if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r);}
+ if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r);}
+
+ if(MAKE_LHIST){readstats.addToLengthHistogram(r);}
+ if(MAKE_GCHIST){readstats.addToGCHistogram(r);}
+
+ if(TRIM_LEFT || TRIM_RIGHT){
+ TrimRead.trim(r, TRIM_LEFT, TRIM_RIGHT, TRIM_QUAL, TRIM_MIN_LENGTH);
+ TrimRead.trim(r2, TRIM_LEFT, TRIM_RIGHT, TRIM_QUAL, TRIM_MIN_LENGTH);
+ }
+
+ if(RCOMP){r.reverseComplement();}
+
+ if(r2==null){
+ final byte[] basesP=r.bases;
+ final byte[] basesM=AminoAcid.reverseComplementBases(basesP);
+ basesUsed1+=(basesM==null ? 0 : basesM.length);
+ processRead(r, basesM);
+ capSiteList(r, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS);
+ assert(Read.CHECKSITES(r, basesM));
+ }else{
+ if(RCOMP_MATE!=RCOMP){r2.reverseComplement();}
+ final byte[] basesP1=r.bases;
+ final byte[] basesM1=AminoAcid.reverseComplementBases(basesP1);
+ final byte[] basesP2=r2.bases;
+ final byte[] basesM2=AminoAcid.reverseComplementBases(basesP2);
+ basesUsed1+=(basesM1==null ? 0 : basesM1.length);
+ basesUsed2+=(basesM2==null ? 0 : basesM2.length);
+ //Same bound as for read 1 above: a mate of exactly maxReadLength() does not exceed the limit.
+ assert(r2.bases==null || r2.length()<=maxReadLength()) :
+ "Read "+r2.numericID+", length "+r2.length()+", exceeds the limit of "+maxReadLength()+"\n"+
+ "You can map the reads in chunks by reformatting to fasta, then mapping with the setting 'fastareadlen="+maxReadLength()+"'";
+ processReadPair(r, basesM1, basesM2);
+ capSiteList(r, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS);
+ capSiteList(r2, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS);
+// if(!LOCAL_ALIGN){//TODO: This can fail in local mode; see bug#0001
+ assert(Read.CHECKSITES(r, basesM1));
+ assert(Read.CHECKSITES(r2, basesM2));
+// }
+ }
+
+ if(UNTRIM && (TRIM_LEFT || TRIM_RIGHT)){
+ TrimRead.untrim(r);
+ TrimRead.untrim(r2);
+ }
+
+ if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r);}
+ if(MAKE_INSERT_HISTOGRAM && r.paired()){readstats.addToInsertHistogram(r, (SAME_STRAND_PAIRS || !REQUIRE_CORRECT_STRANDS_PAIRS));}
+ if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r);}
+
+ if(MAKE_EHIST){readstats.addToErrorHistogram(r);}
+ if(MAKE_INDELHIST){readstats.addToIndelHistogram(r);}
+ if(MAKE_IDHIST){readstats.addToIdentityHistogram(r);}
+
+ if(TIME_TAG){
+ //Elapsed nanoseconds, rounded to the nearest microsecond, attached to both mates.
+ final Long time=(System.nanoTime()-startTime+500)/1000;
+ r.obj=time;
+ if(r2!=null){r2.obj=time;}
+ assert(r.obj!=null && r.obj.getClass()==Long.class);
+ if(MAKE_TIMEHIST){readstats.addToTimeHistogram(r);}
+ }
+ }
+
+
+ if(MAKE_COVERAGE){
+ synchronized(pileup){//TODO: Potential bottleneck
+ for(Read r : readlist){
+ pileup.processRead(r);
+ if(r.mate!=null){pileup.processRead(r.mate);}
+ }
+ }
+ }
+
+// System.err.println("Returning a list..."+"\n"+readlist);
+
+ writeList(readlist, black, ln.id);
+
+
+ //System.err.println("Left from adding list "+readlist.get(0).numericID);
+
+ cris.returnList(ln.id, false);
+// if(count>0){
+// cris.returnList(ln.id, readlist.isEmpty());
+// count--;
+// }
+ //System.err.println("Waiting on a list...");
+ ln=cris.nextList();
+ readlist=ln.list;
+ }
+
+
+
+ //System.err.println("Returning a list... (final)");
+ assert(readlist.isEmpty());
+ cris.returnList(ln.id, true);
+ finish();
+ }
+
+ /**
+ * Distributes one processed list of reads to every configured output.
+ * The mapped, blacklist and unmapped streams each receive a freshly built sub-list; BBSplitter gets
+ * the full list when it has streams or statistics tracking enabled; the main stream gets the list
+ * itself, filtered in place. Every non-null stream receives a list (possibly empty) for listNumID.
+ * @param readlist Reads to write; modified in place when the main output stream filters it
+ * @param black True if a blacklist is in use
+ * @param listNumID ID of this list, forwarded unchanged to every stream
+ */
+ private final void writeList(ArrayList<Read> readlist, boolean black, long listNumID){
+ if(outStreamMapped!=null){
+ final ArrayList<Read> mappedReads=new ArrayList<Read>(readlist.size());
+ for(Read read : readlist){
+ if(read==null){continue;}
+ final Read mate=read.mate;
+ //A pair counts as mapped when either mate is mapped.
+ final boolean eitherMapped=(read.mapped() || (mate!=null && mate.mapped()));
+ if(eitherMapped && (!black || !Blacklist.inBlacklist(read))){mappedReads.add(read);}
+ }
+ outStreamMapped.add(mappedReads, listNumID);
+ }
+
+ if(outStreamBlack!=null){
+ final ArrayList<Read> blackReads=new ArrayList<Read>(readlist.size());
+ if(black){
+ for(Read read : readlist){
+ if(Blacklist.inBlacklist(read)){blackReads.add(read);}
+ }
+ }
+ outStreamBlack.add(blackReads, listNumID);
+ }
+
+ final boolean splitterActive=(BBSplitter.streamTable!=null || BBSplitter.TRACK_SET_STATS || BBSplitter.TRACK_SCAF_STATS);
+ if(splitterActive){
+ BBSplitter.printReads(readlist, listNumID, null, CLEARZONE1());
+ }
+
+ if(outStreamUnmapped!=null){
+ final ArrayList<Read> unmappedReads=new ArrayList<Read>(readlist.size());
+ for(Read read : readlist){
+ if(read==null){continue;}
+ final Read mate=read.mate;
+ if(!read.mapped() && (mate==null || !mate.mapped())){
+ unmappedReads.add(read);
+ }
+ }
+ outStreamUnmapped.add(unmappedReads, listNumID);
+ }
+
+ //Important to send all lists to output, even empty ones, to keep list IDs straight.
+ if(outStream==null){return;}
+ if(OUTPUT_MAPPED_ONLY){removeUnmapped(readlist);}
+ if(black && DONT_OUTPUT_BLACKLISTED_READS){removeBlacklisted(readlist);}
+ for(Read read : readlist){
+ if(read==null){continue;}
+ if(CLEAR_ATTACHMENT){read.obj=null;}
+ assert(read.bases!=null);
+ //An empty site list is replaced by null before output.
+ if(read.sites!=null && read.sites.isEmpty()){read.sites=null;}
+ }
+ outStream.add(readlist, listNumID);
+ }
+
+ /** Returns max possible quick score for this read, or -1 if it cannot be mapped for quality reasons.
+ * A positive score will be returned if it CAN be mapped, but no hits are found. Returns 0 if the read is shorter than KEYLEN. On a normal return, r.sites holds the candidate sites found by the index, or null if there were none. */
+ public final int quickMap(final Read r, final byte[] basesM){
+ final AbstractIndex index=index();
+ byte[] basesP=r.bases;
+ if(basesP.length<KEYLEN){return 0;} //Too short to contain even one key
+ assert(basesP.length>=KEYLEN);
+
+ if(PERFECTMODE || SEMIPERFECTMODE){//Imperfect reads cannot map perfectly.
+ if(r.containsUndefined()){return-1;}
+ }else if(DISCARD_MOSTLY_UNDEFINED_READS){
+ int n=r.countUndefined();
+ if(n>25 && basesP.length-n<n){return -1;} //More than 25 undefined bases and more undefined than defined
+ }
+ if(MIN_AVERAGE_QUALITY>0){
+ if(r.avgQualityByProbability(false, MIN_AVERAGE_QUALITY_BASES)<MIN_AVERAGE_QUALITY){return -1;}
+ }
+
+ final int keyProbLen=basesP.length-KEYLEN+1; //Number of possible key start positions in the read
+ final float[] keyProbs=index.keyProbArray(); //Buffer supplied by the index - NOTE(review): presumably reused between calls; confirm
+ int[] offsets;
+
+ float keyDen2=((maxDesiredKeys*KEYLEN)/(float)basesP.length); //Density that would yield maxDesiredKeys keys, then clamped to [minKeyDensity, keyDensity]
+ keyDen2=Tools.max(minKeyDensity, keyDen2);
+ keyDen2=Tools.min(keyDensity, keyDen2);
+
+ float keyDen3; //Upper density: maxKeyDensity up to 50bp, falling linearly to maxKeyDensity-0.5 at 200bp and beyond
+ if(basesP.length<=50){
+ keyDen3=maxKeyDensity;
+ }else if(basesP.length>=200){
+ keyDen3=maxKeyDensity-0.5f;
+ }else{
+ keyDen3=maxKeyDensity-0.003333333333f*(basesP.length-50); //0.003333... = 0.5/150
+ }
+
+ keyDen3=Tools.max(keyDensity, keyDen3); //Never below the configured keyDensity
+
+ if(GENERATE_KEY_SCORES_FROM_QUALITY || r.quality==null){
+ QualityTools.makeKeyProbs(r.quality, r.bases, KEYLEN, keyProbs, USE_MODULO);
+
+ boolean offsetsMode3=true; //Hard-coded, so the else branch below is currently dead
+ if(offsetsMode3){
+ offsets=KeyRing.makeOffsets3(keyProbs, r.length(), KEYLEN, keyDen2, keyDen3, 2, (PERFECTMODE || SEMIPERFECTMODE));
+ }else{
+ //Old version; usually worse.
+ offsets=KeyRing.makeOffsets2(keyProbs, r.length(), KEYLEN, keyDen2, keyDen3, 2);
+ int numKeys=(offsets==null ? 0 : offsets.length+1);
+ int maxRounds=0;//(PERFECTMODE || SEMIPERFECTMODE) ? 0 : 9999;//(numKeys)/2;
+ while(maxRounds>0 && offsets!=null && offsets.length<numKeys){
+ numKeys=offsets.length;
+ offsets=QualityTools.modifyOffsets(offsets, keyProbs);
+ maxRounds--;
+ }
+ }
+ }else{
+ offsets=KeyRing.makeOffsets(r.quality, KEYLEN, keyDensity, 2);
+ }
+ if(verbose){System.err.println("Made offsets: "+Arrays.toString(offsets));}
+
+ if(offsets==null || offsets.length<AbstractIndex.MIN_APPROX_HITS_TO_KEEP || (r.quality!=null && r.avgQuality(false, 0)<2)){return -1;} //Too few keys, or average quality below 2
+
+
+ final byte[] baseScoresP=index.getBaseScoreArray(basesP.length, 0);
+ final int[] keyScoresP=index.getKeyScoreArray(offsets.length, 0);
+
+ if(AbstractIndex.USE_EXTENDED_SCORE){
+ if(AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY){
+ QualityTools.makeByteScoreArray(r.quality, 100, baseScoresP, true);
+ }
+ }
+
+ if(GENERATE_KEY_SCORES_FROM_QUALITY){
+ int a=BASE_KEY_HIT_SCORE;
+ int baseKeyScore=a/8; //Arguments to makeKeyScores: a base of 1/8 of the key hit score and a range covering the remainder
+ int range=a-baseKeyScore;
+ final int[] keyScoresAll=new int[keyProbLen];
+ QualityTools.makeKeyScores(keyProbs, keyProbLen, range, baseKeyScore, keyScoresAll);
+
+ float probAllErrors=1f; //Product of keyProbs over the chosen keys
+ for(int i=0; i<offsets.length; i++){
+ keyScoresP[i]=keyScoresAll[offsets[i]];
+ probAllErrors*=keyProbs[offsets[i]];
+ }
+ if(probAllErrors>0.50f){return -1;} //Default .5f; higher gives more false positives, lower gives more false negatives
+ if(verbose){System.err.println("Prob all errors = "+probAllErrors+"\n\n");}
+ }else{
+ Arrays.fill(keyScoresP, BASE_KEY_HIT_SCORE); //Every key gets the same flat score
+ }
+ if(verbose){System.err.println("Made key scores: "+Arrays.toString(keyScoresP));}
+
+ keysUsed+=offsets.length;
+ int maxScore=index.maxScore(offsets, baseScoresP, keyScoresP, basesP.length, true);
+ if(verbose){System.err.println("Max Score: "+maxScore);}
+ assert(maxScore>0);
+
+ ArrayList<SiteScore> list=index.findAdvanced(basesP, basesM, r.quality, baseScoresP, keyScoresP, offsets, r.numericID);
+ if(verbose){System.err.println("list: "+list);}
+
+ r.sites=list; //Assigned first so that removeOutOfBounds() can work on the read
+ removeOutOfBounds(r, OUTPUT_MAPPED_ONLY, OUTPUT_SAM, EXPECTED_LEN_LIMIT);
+ assert(Read.CHECKSITES(list, r.bases, basesM, r.numericID, false));
+ if(FORBID_SELF_MAPPING){forbidSelfMapping(list, r.originalSite);}
+
+ if(list==null || list.isEmpty()){
+ r.sites=null; //No candidates: leave the read without a site list
+ }else{
+ r.sites=list;
+ if(!SLOW_ALIGN && AbstractIndex.USE_AFFINE_SCORE){
+ for(SiteScore ss : list){ss.setSlowScore(ss.quickScore);} //Without slow alignment the quick score stands in as the slow score
+ }
+ }
+// assert(r.list!=null); //Less efficient, but easier to code later.
+
+ return maxScore;
+ }
+
+
+ /**
+ * Rescores every site of the read with an indel-free alignment and returns the number of sites scoring at least maxImperfectSwScore (sites already flagged perfect are counted too).
+ * If problems are encountered such that it is prudent to do slow-alignment, the count is negated, so a number lower than 1 will be returned. Returns 0 immediately when SLOW_ALIGN is off or the read has no sites.
+ */
+ final int scoreNoIndels(final Read r, final byte[] basesP, final byte[] basesM, final int maxSwScore, final int maxImperfectSwScore){
+
+ if(!SLOW_ALIGN || r.numSites()==0){return 0;}
+
+ int numPerfectScores=0; //Counted but not part of the return value
+ int numNearPerfectScores=0;
+ int bestScoreNoIndel=Integer.MIN_VALUE;
+
+ boolean forceSlow=false; //Set when some site appears to need full slow alignment
+
+ for(int j=0; j<r.sites.size(); j++){
+
+ SiteScore ss=r.sites.get(j);
+ int oldScore=ss.score;
+ int sslen=ss.stop()-ss.start()+1; //Reference span currently covered by the site
+// assert(false) : ss+", "+ss.quickScore+", "+ss.score+", "+ss.slowScore+", "+ss.pairedScore;
+
+ final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM); //Use the bases for the strand the site is on
+
+ if(AbstractIndex.USE_AFFINE_SCORE && ss.quickScore==maxSwScore){
+ assert(ss.stop()==ss.start()+r.length()-1) : ss.toText()+", "+maxSwScore+", "+maxImperfectSwScore+
+ ", "+r.length()+", "+(ss.start()+r.length()-1);
+ }
+
+ if(verbose){System.err.print("C) Changing SiteScore from "+ss+"\n");}
+
+ int slowScoreNoIndel;
+ if(ss.perfect){ //Already known perfect: assign the maximum score without realigning
+ if(verbose){System.err.print("C1");}
+ numNearPerfectScores++;
+ assert(ss.semiperfect);
+ assert(ss.stop()-ss.start()+1==bases.length);
+// assert(maxSwScore==msa.scoreNoIndels((ss.strand==Gene.PLUS ? basesP : basesM), ss)); //TODO Disable this very slow assertion
+ slowScoreNoIndel=maxSwScore;
+ ss.setSlowScore(slowScoreNoIndel);
+ ss.setScore(slowScoreNoIndel);
+ ss.gaps=null;
+ }else{
+ if(verbose){System.err.print("C2");}
+ ChromosomeArray cha=Data.getChromosome(ss.chrom);
+ slowScoreNoIndel=msa.scoreNoIndels(bases, cha.array, ss.start(), (sslen==bases.length ? ss : null)); //The site is passed along only when its span equals the read length
+
+ //This block is to correct situations where slow align does not get called,
+ //so one near-perfect alignment is found and one missed, because the read should align to stop, not start.
+ if(slowScoreNoIndel<oldScore && oldScore>=maxImperfectSwScore && ss.stop()-ss.start()+1!=bases.length){
+ int slowScoreNoIndel2=msa.scoreNoIndels(bases, cha.array, ss.stop()-bases.length+1, null); //Retry with the read anchored at the site's stop
+ if(slowScoreNoIndel2>=maxImperfectSwScore){
+ slowScoreNoIndel=slowScoreNoIndel2;
+ ss.setStart(ss.stop()-bases.length+1);
+ ss.setPerfect(bases);
+ }
+ }
+
+ ss.setSlowScore(slowScoreNoIndel);
+ ss.setScore(slowScoreNoIndel);
+
+ //This is the problem section.
+ if(slowScoreNoIndel>=maxImperfectSwScore){
+ if(verbose){System.err.print("C3");}
+ numNearPerfectScores++;
+
+ ss.setStop(ss.start()+bases.length-1); //An indel-free alignment spans exactly the read length
+ ss.gaps=null;
+ if(slowScoreNoIndel>=maxSwScore){
+ if(verbose){System.err.print("C4");}
+ assert(slowScoreNoIndel==maxSwScore) : slowScoreNoIndel+">"+maxSwScore;
+ numPerfectScores++;
+ ss.perfect=ss.semiperfect=true;
+ }else{
+ if(verbose){System.err.print("C5");}
+ assert(!ss.perfect);
+ ss.setPerfect(bases);
+ assert(!ss.perfect);
+ }
+ if(QUICK_MATCH_STRINGS && !ss.perfect && (PRINT_SECONDARY_ALIGNMENTS || slowScoreNoIndel>=bestScoreNoIndel)){ //Cheap match string for the best site so far, or for every site when secondaries are printed
+ ss.match=msa.genMatchNoIndels(bases, cha.array, ss.start());
+ }
+ }else if(oldScore>=maxImperfectSwScore){ //Quick score looked near-perfect but the indel-free score is not
+ if(verbose){System.err.print("C6");}
+ forceSlow=true;
+ }else if(PRINT_SECONDARY_ALIGNMENTS){
+ if(verbose){System.err.print("C7");}
+ forceSlow=true;
+ }else if(verbose){
+ if(verbose){System.err.print("C8");} //May need slow alignment for sitescore.
+ }
+ }
+
+ if(verbose){System.err.print("\nto "+ss+"\n");}
+
+ bestScoreNoIndel=Tools.max(ss.slowScore, bestScoreNoIndel);
+// assert(CHECKSITE(ss, bases));
+ }
+ return (forceSlow ? -numNearPerfectScores : numNearPerfectScores); //A negated count tells the caller that slow alignment is advisable
+ }
+
+
+ /**
+ * Generates match strings for the sites of a read and then copies the top site's coordinates,
+ * strand, match string, gaps, score and flags onto the read. Assumes list is sorted.
+ * Sites after the first are only visited while their slowScore exceeds the best seen so far,
+ * unless PRINT_SECONDARY_ALIGNMENTS is set. Non-primary sites of unpaired reads for which no
+ * match string could be generated are removed. If any score changed and the list is no longer
+ * in order, it is de-duplicated, re-sorted and the leading sites are revisited.
+ * @param r Read to process; if it has no sites, its chrom is set to -1 and nothing else happens
+ * @param basesP Plus-strand bases
+ * @param basesM Minus-strand (reverse-complemented) bases
+ * @param maxImperfectSwScore Forwarded to genMatchStringForSite()
+ * @param maxSwScore Forwarded to genMatchStringForSite()
+ * @param setSSScore If true, each site's score is overwritten with its slowScore after generation
+ * @param recur Not used in this method body
+ */
+ public final void genMatchString(final Read r, final byte[] basesP, final byte[] basesM, final int maxImperfectSwScore, final int maxSwScore, boolean setSSScore, final boolean recur){
+ if(verbose){System.err.println("\n\n\n\n\ngenMatchString for read\n"+r+"\n\n\n\n\n");}
+ assert(Read.CHECKSITES(r, basesM));
+ assert(checkTopSite(r));
+
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; //Came from BBMapAcc; not sure if it is correct
+ assert(msa!=null);
+ if(r.numSites()==0){
+ r.chrom=-1;
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ return;
+ }
+
+ if(PRINT_SECONDARY_ALIGNMENTS){
+ capSiteList(r, MAX_SITESCORES_TO_PRINT+3, PRINT_SECONDARY_ALIGNMENTS);
+ }
+
+ if(QUICK_MATCH_STRINGS && PRINT_SECONDARY_ALIGNMENTS && USE_SS_MATCH_FOR_PRIMARY){} //TODO What was this line for?
+
+ int best=Integer.MIN_VALUE;
+ int scoreChanged=0;
+
+ for(int i=0; i<r.sites.size(); i++){
+ SiteScore ss=r.sites.get(i);
+
+ if(verbose){System.err.println("**************** best="+best+", scoreChanged="+scoreChanged+"\nconsidering ss "+ss);}
+
+ if(i>0){
+ if(best>=ss.slowScore && !PRINT_SECONDARY_ALIGNMENTS){
+ if(verbose){System.err.println("break triggered by low score: ");}
+ break;
+ }
+ }
+ final int oldSlowScore=ss.slowScore, oldScore=ss.score;
+ if(ss.match==null || (i==0 && !USE_SS_MATCH_FOR_PRIMARY)){
+ genMatchStringForSite(r.numericID, ss, basesP, basesM, maxImperfectSwScore, maxSwScore, r.mate, PRINT_SECONDARY_ALIGNMENTS);
+ if(setSSScore){ss.setScore(ss.slowScore);}
+ }
+ if(i>0 && ss.match==null && !r.paired()){
+ if(verbose){System.err.println("Removed site "+ss);}
+ r.sites.remove(i);
+ //remove() shifts the following sites left, so step back to visit the one now at index i.
+ i--;
+ }else{
+ if(oldScore!=ss.score || oldSlowScore!=ss.slowScore){scoreChanged++;}
+ best=Tools.max(ss.slowScore, best);
+ }
+
+ if(verbose){System.err.println("**************** best="+best+", scoreChanged="+scoreChanged+"\nconsidered ss "+ss);}
+ }
+
+ if(verbose){System.err.println("Finished basic match generation. best="+best+", scoreChanged="+scoreChanged+", AMBIGUOUS_RANDOM="+AMBIGUOUS_RANDOM+", ambiguous="+r.ambiguous());}
+
+ boolean needsSorting=(scoreChanged>0 && !Read.CHECKORDER(r.sites));
+ if(verbose){
+ System.err.println("needsSorting="+needsSorting+", scoreChanged="+scoreChanged+", "+Read.CHECKORDER(r.sites));
+// for(SiteScore ss : r.sites){System.err.println("score="+ss.score);}
+ }
+ while(needsSorting){
+ needsSorting=false;
+ final SiteScore top=r.topSite();
+ if(verbose){System.err.println("GMS 1");}
+ Tools.mergeDuplicateSites(r.sites, false, false);
+ Collections.sort(r.sites);
+ for(int i=0; i<r.sites.size() && i<MAX_SITESCORES_TO_PRINT; i++){
+ if(verbose){System.err.println("GMS 2");}
+ SiteScore ss=r.sites.get(i);
+ if(ss.match==null){
+ if(verbose){System.err.println("GMS 3");}
+ genMatchStringForSite(r.numericID, ss, basesP, basesM, maxImperfectSwScore, maxSwScore, r.mate, PRINT_SECONDARY_ALIGNMENTS);
+ if(setSSScore){ss.setScore(ss.slowScore);}
+ if(i>0 && ss.match==null){r.sites.remove(i);}
+ else{needsSorting=true;}
+ i--;
+ }
+ if(i>0 || !PRINT_SECONDARY_ALIGNMENTS){
+ if(verbose){System.err.println("GMS 4");}
+ break;
+ }
+ }
+
+ //A different top site invalidates the pairing of both mates.
+ if(r.paired() && r.topSite()!=top){
+ r.setPaired(false);
+ r.mate.setPaired(false);
+ }
+ }
+
+
+ final SiteScore ss=r.topSite();
+ assert(ss==r.topSite());
+
+// assert(ss.slowScore>0) : ss.slowScore+", "+best+", "+r.mapScore;
+
+ //Copy the top site's alignment onto the read itself.
+ r.start=ss.start();
+ r.stop=ss.stop();
+ r.chrom=ss.chrom;
+ r.setStrand(ss.strand);
+ r.match=ss.match;
+ r.gaps=ss.gaps;
+ r.mapScore=ss.slowScore;
+ r.setPerfect(ss.perfect());
+ r.setRescued(ss.rescued());
+
+ assert(checkTopSite(r)) : r;
+ //The mate is null for unpaired reads; guard it so a failure reports the sites instead of throwing a NullPointerException.
+ assert(Read.CHECKSITES(r, basesM)) : "\n\n"+ss.mappedLength()+", "+ss.mappedLength()+"\n\n"+r+"\n\n"+r.mate+"\n\n"+r.toFastq()+"\n\n"+(r.mate==null ? null : r.mate.toFastq())+"\n\n";
+
+// assert(false) : r.numericID+", "+ss.slowScore+", "+r.mapScore;
+ }
+
+
+ /**
+ * Generates the match string for a single site and returns the site's resulting slowScore.
+ * With GEN_MATCH_FAST set: a site flagged perfect (when SLOW_ALIGN or USE_EXTENDED_SCORE is on) gets a
+ * synthetic perfect match string; any other site is realigned via TranslateColorspaceRead.realign_new,
+ * and realigned a second time with larger padding if the score dropped or the site reports that it
+ * needs more left/right padding. Gaps are normalized with GapTools.fixGaps after each realignment.
+ * Without GEN_MATCH_FAST: only perfect sites are handled; the imperfect path is guarded by assert(false).
+ * Tip indels are clipped before returning.
+ * @param id Numeric read ID; used only in assertions, verbose output, and calls that take an ID
+ * @param ss Site to process; its match, gaps, limits, score and perfect flags may be modified
+ * @param basesP Bases used when ss.plus() is true
+ * @param basesM Bases used when ss.plus() is false
+ * @param maxImperfectSwScore Only referenced by an empty TODO branch in the non-fast path
+ * @param maxSwScore Score treated as a perfect alignment; also scales the minimum alignment limit
+ * @param mate Not referenced in this method body
+ * @param secondary If true, the minimum limit is additionally multiplied by SECONDARY_SITE_SCORE_RATIO
+ * @return ss.slowScore after processing
+ */
+ protected final int genMatchStringForSite(final long id, final SiteScore ss, final byte[] basesP, final byte[] basesM,
+ final int maxImperfectSwScore, final int maxSwScore, final Read mate, final boolean secondary){
+ final byte[] bases=ss.plus() ? basesP : basesM;
+ assert(Read.CHECKSITE(ss, bases, id));
+ assert(msa!=null);
+
+
+ //Lowest alignment score passed to realign_new as its limit
+ final int minMsaLimit;
+ {
+ final float mult=(PAIRED ? MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED : MINIMUM_ALIGNMENT_SCORE_RATIO)*(secondary ? SECONDARY_SITE_SCORE_RATIO : 1f);
+ minMsaLimit=-1+(int)(mult*maxSwScore);
+ }
+
+ if(GEN_MATCH_FAST){
+
+ assert(!(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE) || AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY ||
+ (ss.slowScore==maxSwScore) == ss.perfect()) :
+ bases.length+", "+ss.toText()+", "+maxSwScore+", "+ss.slowScore+", "+ss.perfect()+", "+ss.semiperfect();
+
+ //TODO: This WAS disabled because I saw a read marked perfect with a sub in it, probably with quality 0 at that point.
+ if((SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE) && ss.perfect()){
+ assert(ss.stop()-ss.start()==(bases.length-1));
+ ss.match=makePerfectMatchString(bases.length);
+ assert(ss.isPerfect(bases)) : id+", "+ss; //TODO: Slow assertion
+ }else{
+ int oldScore=ss.slowScore;
+ assert(ss.gaps==null || ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop()) : "\nrid="+id+"; ss="+ss+"\n"+new String(basesP)+"\n";
+ //Perfect and semiperfect sites are realigned with no padding
+ int padding=(ss.perfect || ss.semiperfect ? 0 : Tools.max(SLOW_ALIGN_PADDING, 6));
+
+ if(verbose){System.err.println("Attempting to realign read:\n"+id+", "+ss+"\npadding="+padding+"\nrescued="+ss.rescued());}
+
+ TranslateColorspaceRead.realign_new(ss, bases, msa, padding, 1, minMsaLimit, MAX_INDEL<1, false, id); //Also generates the match string
+ ss.gaps=GapTools.fixGaps(ss.start(), ss.stop(), ss.gaps, Shared.MINGAP);
+
+ if(verbose){System.err.println("Realigned read:\n"+id+", "+ss+"\npadding="+padding+"\nrescued="+ss.rescued()+"\nreflen="+(ss.stop()-ss.start()+1));}
+ assert(Read.CHECKSITE(ss, bases, id));
+
+ int leftPaddingNeeded=ss.leftPaddingNeeded(4, 5), rightPaddingNeeded=ss.rightPaddingNeeded(4, 5);
+ //Second attempt with more padding if the first realignment got worse or ran out of room
+ if(ss.slowScore<oldScore || leftPaddingNeeded>0 || rightPaddingNeeded>0){
+ if(verbose){System.err.println("---- A ----");}
+ if(verbose){
+ System.err.print("Read "+id+": "+ss.start()+","+ss.stop()+": "+oldScore+">"+ss.slowScore);
+ }
+
+ int extra=(MAX_INDEL>0 ? 80 : 20)+SLOW_ALIGN_PADDING;
+ int expectedLen=GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps); //TODO Gaps should be correct here!!!
+ //Limit the extra padding to half of the columns left in the aligner
+ int remaining=(msa.maxColumns-expectedLen-2);
+ extra=Tools.max(0, Tools.min(remaining/2, extra));
+ TranslateColorspaceRead.realign_new(ss, bases, msa, extra, 2, minMsaLimit, false, true, id);
+ ss.gaps=GapTools.fixGaps(ss.start(), ss.stop(), ss.gaps, Shared.MINGAP);
+ assert(Read.CHECKSITE(ss, bases, id));
+
+ if(verbose){
+ System.err.println("\n-> "+ss.start()+","+ss.stop()+","+ss.slowScore+
+ /*(r.originalSite==null ? "" : "\t*"+r.originalSite)+*/"\t(extra = "+extra+")");
+ }
+ }
+ if(verbose){System.err.println("---- B ----");}
+ assert(Read.CHECKSITE(ss, bases, id));
+
+ if(verbose){
+ System.err.println("---- D3 ----");
+ System.err.println(ss);
+ System.err.println("Checking perfect status: ss.perfect="+ss.perfect()+", ss.semi="+ss.semiperfect()+
+ ", maxSwScore="+maxSwScore+", ss.slowScore="+ss.slowScore);
+ }
+ ss.setPerfectFlag(maxSwScore, bases);
+ if(verbose){
+ System.err.println("---- E ----");
+ System.err.println("Checking perfect status: ss.perfect="+ss.perfect()+", ss.semi="+ss.semiperfect()+
+ ", maxSwScore="+maxSwScore+", ss.slowScore="+ss.slowScore);
+ }
+
+ assert(Read.CHECKSITE(ss, bases, id));
+ }
+ }else{
+ if(verbose){System.err.println("---- F ----");}
+ ChromosomeArray cha=Data.getChromosome(ss.chrom);
+
+ if(ss.perfect()){
+ ss.match=makePerfectMatchString(bases.length);
+ }else{
+ //With assertions enabled this path always fails; see the messages below
+ assert(false) : "TODO: This does not take strand into account";
+ if(ss.slowScore>=maxImperfectSwScore){
+ //TODO
+ }
+
+ if(msa!=null){
+ assert(false) : "0 is not good here; try a non-indel match string.";
+ int[] max=msa.fillLimited(bases, cha.array, ss.start(), ss.stop(), 0, ss.gaps);
+ // System.err.print("*");
+ ss.match=msa.traceback(bases, cha.array, ss.start(), ss.stop(), max[0], max[1], max[2], ss.gaps!=null);
+ }
+ }
+ }
+ if(verbose){System.err.println("---- G ----");}
+ ss.clipTipIndels(bases, basesM, 4, 10, msa);
+
+ assert(Read.CHECKSITE(ss, bases, id));
+ return ss.slowScore;
+ }
+
+
+
+ /** Looks for tip deletions on every site of a read and widens the affected sites in place.
+ * Each side of the read is searched only if the read has no quality array, or if both the minimum and
+ * average quality of the TIP_DELETION_MAX_TIPLEN bases on that side meet the TIP_DELETION_* thresholds.
+ * Only sites that are not semiperfect and score below maxImperfectScore are examined. When a site changes,
+ * its match string is cleared, it is rescored without indels, and its perfect/semiperfect flags are refreshed.
+ * This method returns nothing.
+ * This should probably be called between quickMap and slowAlign, only on
+ * sites where stop-start<=bases.length-1
+ * @param r Read whose sites are examined
+ * @param basesP Bases used for plus-strand sites
+ * @param basesM Bases used for all other sites
+ * @param maxSwScore A no-indel score equal to this marks the site perfect
+ * @param maxImperfectScore Sites scoring at or above this are skipped */
+ final void findTipDeletions(final Read r, final byte[] basesP, final byte[] basesM, final int maxSwScore, final int maxImperfectScore){
+
+ boolean findRight=r.quality==null || (r.minQualityLastNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_MIN_QUALITY &&
+ r.avgQualityLastNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY);
+ boolean findLeft=r.quality==null || (r.minQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_MIN_QUALITY &&
+ r.avgQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY);
+ if(!findRight && !findLeft){
+// System.err.print(".");
+ return;
+ }
+// System.err.print("*");
+
+ for(SiteScore ss : r.sites){
+ final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
+ if(!ss.semiperfect && ss.slowScore<maxImperfectScore){
+ boolean changed=findTipDeletions(ss, bases, maxImperfectScore, findRight, findLeft);
+ if(changed){
+ //Limits moved, so the old match string and score no longer apply
+ ss.match=null;
+ ss.setSlowScore(msa.scoreNoIndels(bases, ss.chrom, ss.start()));
+ assert(!ss.perfect);
+ if(ss.slowScore==maxSwScore){
+ ss.setStop(ss.start()+bases.length-1);
+ ss.perfect=ss.semiperfect=true;
+ }else{
+ ss.perfect=false;
+ ss.setPerfect(bases, true);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Tries to widen one site so that it spans a deletion near the right and/or left tip of the read.
+ * The right side is probed first; if it extends the site, the remaining search budget is recomputed
+ * before the left side is probed.
+ * @param ss Site to adjust; its stop and/or start may be moved outward
+ * @param bases Read bases for this site
+ * @param maxImperfectScore Sites with slowScore at or above this are left untouched
+ * @param lookRight Probe beyond the site's stop
+ * @param lookLeft Probe before the site's start
+ * @return true if the site's limits were changed
+ */
+ final boolean findTipDeletions(SiteScore ss, final byte[] bases, final int maxImperfectScore, boolean lookRight, boolean lookLeft){
+ if(ss.slowScore>=maxImperfectScore /*&& ss.stop()-ss.start()<=basesP.length-1*/){return false;}
+ assert(lookRight || lookLeft);
+ assert(TIP_DELETION_MAX_TIPLEN>2);
+ if(bases.length<=2*TIP_DELETION_MAX_TIPLEN){return false;}
+ assert(TIP_DELETION_MAX_TIPLEN<bases.length);
+ assert(TIP_DELETION_SEARCH_RANGE>0);
+
+ //Search budget: the configured range, capped by the columns left in the aligner
+ int searchBudget=Tools.min(TIP_DELETION_SEARCH_RANGE, ALIGN_COLUMNS-(SLOW_RESCUE_PADDING+8+Tools.max(bases.length, ss.stop()-ss.start())));
+ if(searchBudget<1){return false;}
+
+ boolean modified=false;
+
+ if(lookRight){
+ final int rightShift=findTipDeletionsRight(bases, ss.chrom, ss.stop(), searchBudget, TIP_DELETION_MAX_TIPLEN);
+ if(rightShift>0){
+ assert(rightShift+ss.stop()-ss.start()<ALIGN_COLUMNS);
+ ss.setStop(ss.stop()+rightShift);
+ modified=true;
+ //The site is longer now, so fewer columns remain for a left-side extension
+ searchBudget=Tools.min(searchBudget, ALIGN_COLUMNS-(SLOW_RESCUE_PADDING+8+Tools.max(bases.length, ss.stop()-ss.start())));
+ if(searchBudget<1){return true;}
+ }
+ }
+
+ if(lookLeft){
+ final int leftShift=findTipDeletionsLeft(bases, ss.chrom, ss.start(), searchBudget, TIP_DELETION_MAX_TIPLEN);
+ if(leftShift>0){
+ assert(leftShift+ss.stop()-ss.start()<ALIGN_COLUMNS);
+ ss.setStart(ss.start()-leftShift);
+ modified=true;
+ }
+ }
+ return modified;
+ }
+
+
+ /**
+ * Searches near the anchor read's best unpaired sites for placements of the loose read, and appends
+ * any sufficiently good placements to loose.sites.
+ * Returns immediately if mating appears to be failing (few mated reads relative to mappedRetained2),
+ * if searchDist exceeds MAX_RESCUE_DIST, if the anchor has no sites, or if both reads already have
+ * perfect top scores and the anchor's top site is paired.
+ * For each anchor site scoring at least 95% of the anchor's best and not yet paired or rescued, a
+ * candidate is located with quickRescue and scored with slowRescue; it is kept if its score exceeds
+ * retainScoreLimit, and marked as paired with the anchor site if it also exceeds retainScoreLimit2.
+ * @param anchor Read whose sites seed the search
+ * @param loose Read to be placed; its site list is created if null and may gain entries
+ * @param basesP Plus-strand bases; presumably those of the loose read, since the loose max scores are derived from basesP.length (confirm against callers)
+ * @param basesM Minus-strand counterpart of basesP
+ * @param searchDist Search distance passed to quickRescue (extended by the overlap into the anchor)
+ */
+ final void rescue(Read anchor, Read loose, byte[] basesP, byte[] basesM, int searchDist){
+
+ if(mappedRetained2>1000 && numMated*20L<mappedRetained2){return;}//skip rescue; mating is not working.
+ if(searchDist>MAX_RESCUE_DIST){return;}//too slow
+
+ //Lists should be sorted at this point, and have a paired score if they are paired.
+
+ if(anchor.sites==null || anchor.sites.isEmpty()){return;}
+ if(loose.sites==null){
+ loose.sites=new ArrayList<SiteScore>(anchor.sites.size());
+ }
+
+ final int maxLooseSwScore=msa.maxQuality(basesP.length);
+ final int maxAnchorSwScore=msa.maxQuality(anchor.length());
+ final int maxImperfectScore=msa.maxImperfectScore(basesP.length);
+
+ final int bestLooseScore=loose.sites.isEmpty() ? 0 : loose.topSite().slowScore;
+ final int bestAnchorScore=anchor.topSite().slowScore;
+
+ //Nothing to gain when both reads are already perfect and paired
+ if(bestLooseScore==maxLooseSwScore && bestAnchorScore==maxAnchorSwScore
+ && anchor.topSite().pairedScore>0){return;}
+
+ int rescueScoreLimit=(int)(0.95f*bestAnchorScore);
+// int retainScoreLimit=(int)(bestLooseScore>0 ? 0.58f*bestLooseScore : 0.58f*maxLooseSwScore);
+ int retainScoreLimit=Tools.max((int)(0.68f*bestLooseScore), (int)(0.4f*maxLooseSwScore));
+ int retainScoreLimit2=Tools.max((int)(0.95f*bestLooseScore), (int)(0.55f*maxLooseSwScore));
+ final int maxMismatches=(PERFECTMODE || SEMIPERFECTMODE) ? 0 :
+ (bestLooseScore>maxImperfectScore) ? 5 : Tools.min(MAX_RESCUE_MISMATCHES, (int)(0.60f*basesP.length-1)); //Higher number is more lenient
+ assert(PERFECTMODE || SEMIPERFECTMODE || maxMismatches>1 || loose.length()<16) : loose; //Added the <16 qualifier when a 4bp read failed this assertion
+
+ final boolean findTipDeletions=FIND_TIP_DELETIONS && bestLooseScore<maxImperfectScore;
+
+ //Data for finding tip deletions
+ final boolean findRight=findTipDeletions && (loose.quality==null || (loose.minQualityLastNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_MIN_QUALITY
+ && loose.avgQualityLastNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY));
+ final boolean findLeft=findTipDeletions && (loose.quality==null || (loose.minQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_MIN_QUALITY
+ && loose.avgQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY));
+
+// int searchIntoAnchor=Tools.max(20, Tools.min(anchor.length(), loose.length()));
+ for(SiteScore ssa : anchor.sites){
+ //Stops at the first anchor site below the limit (the list is expected to be sorted)
+ if(ssa.slowScore<rescueScoreLimit){break;}
+ if(ssa.pairedScore==0 && !ssa.rescued){
+// int searchIntoAnchor=ssa.stop-ssa.start-1+(anchor.length()/2); //Allows rescue of fragments half the length of a read
+ int searchIntoAnchor=ssa.stop()-ssa.start()-1+(anchor.length()*11/16); //Allows rescue of fragments 68% the length of a read
+ int loc;
+ int idealStart;
+ byte[] bases;
+ //Expected strand of the loose read relative to this anchor site
+ byte strand=(SAME_STRAND_PAIRS ? ssa.strand : (byte)(ssa.strand^1));
+ boolean searchRight=(SAME_STRAND_PAIRS ? strand==Gene.PLUS : strand==Gene.MINUS);
+ assert(strand==0 || strand==1);
+
+ if(SAME_STRAND_PAIRS){
+ if(ssa.strand==Gene.MINUS){
+ bases=basesM;
+ loc=ssa.start()+searchIntoAnchor;
+ idealStart=ssa.start()-AVERAGE_PAIR_DIST;
+ }else{
+ bases=basesP;
+ loc=ssa.stop()-searchIntoAnchor;
+ idealStart=ssa.stop()+AVERAGE_PAIR_DIST;
+ }
+ }else{
+ if(ssa.strand==Gene.PLUS){
+ bases=basesM;//opposite strand
+ loc=ssa.stop()-searchIntoAnchor;
+ idealStart=ssa.stop()+AVERAGE_PAIR_DIST;
+ }else{
+ bases=basesP;//opposite strand
+ loc=ssa.start()+searchIntoAnchor;
+ idealStart=ssa.start()-AVERAGE_PAIR_DIST;
+ }
+ }
+// loc-=20; //Search for overlapping read ends
+// assert(anchor.numericID<360000) : "loc="+loc+", searchDist="+searchDist+", idealStart="+idealStart+", searchIntoAnchor="+searchIntoAnchor+", maxMismatches="+maxMismatches;
+// System.err.println("loc="+loc+", searchDist="+searchDist+", idealStart="+idealStart+", searchIntoAnchor="+searchIntoAnchor+", maxMismatches="+maxMismatches);
+ SiteScore ss=quickRescue(bases, ssa.chrom, strand, loc, searchDist+searchIntoAnchor, searchRight,
+ idealStart, maxMismatches, POINTS_MATCH, POINTS_MATCH2);
+
+ if(ss!=null && ss.isInBounds()){
+ //The slowScore of the quickRescue result is read here as a mismatch count, then reset
+ int mismatches=ss.slowScore;
+ ss.setSlowScore(0);
+ if(mismatches<=maxMismatches){
+ slowRescue(bases, ss, maxLooseSwScore, maxImperfectScore, findRight, findLeft);
+ if(ss.score>retainScoreLimit && ss.isInBounds()){
+ if(ss.score>retainScoreLimit2){//Set them as paired to make them more resistant to being discarded
+ ss.setPairedScore(Tools.max(ss.pairedScore, ss.slowScore+ssa.slowScore/4));
+ ssa.setPairedScore(Tools.max(ssa.pairedScore, ssa.slowScore+ss.slowScore/4));
+ assert(ss.pairedScore>0);
+ assert(ssa.pairedScore>0);
+ }
+ loose.sites.add(ss);
+ }
+ }
+ }
+ }else{
+ assert(ssa.pairedScore>0);
+ assert(ssa.pairedScore>ssa.quickScore || ssa.pairedScore>ssa.slowScore) : ssa.toText();
+ }
+ }
+ }
+
+
+ /**
+ * Scores a rescued candidate site and updates its limits, scores, match string and perfect flags in place.
+ * The site is first scored without indels. If that score is below maxImperfectScore and MAX_INDEL>0,
+ * tip deletions are optionally searched for and a limited alignment (msa.fillAndScoreLimited) is run;
+ * on success the site takes the aligned score and limits, and a match string may be generated via
+ * traceback when QUICK_MATCH_STRINGS is enabled. Otherwise the no-indel score is used and the stop is
+ * set to start+bases.length-1. pairedScore is always set to score+1 before returning.
+ * @param bases Read bases in the orientation of the site
+ * @param ss Candidate site; modified in place
+ * @param maxScore Score of a perfect alignment; scales the minimum limit and decides the perfect flag
+ * @param maxImperfectScore No-indel scores at or above this skip the indel-aware alignment
+ * @param findTipDeletionsRight Whether to look for a tip deletion on the right side
+ * @param findTipDeletionsLeft Whether to look for a tip deletion on the left side
+ */
+ final void slowRescue(final byte[] bases, SiteScore ss, final int maxScore, final int maxImperfectScore,
+ boolean findTipDeletionsRight, boolean findTipDeletionsLeft){
+
+ int swscoreNoIndel=msa.scoreNoIndels(bases, ss.chrom, ss.start());
+ final int oldStart=ss.start();
+
+ if(swscoreNoIndel<maxImperfectScore && MAX_INDEL>0){
+ ss.setSlowScore(swscoreNoIndel);
+ if(findTipDeletionsRight || findTipDeletionsLeft){
+ boolean changed=findTipDeletions(ss, bases, maxImperfectScore, findTipDeletionsRight, findTipDeletionsLeft);
+ if(changed){
+ ss.match=null;
+ swscoreNoIndel=msa.scoreNoIndels(bases, ss.chrom, ss.start());
+ }
+ }
+
+ final int minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxScore);
+
+ //The indel-aware alignment has to at least match the no-indel score to be accepted
+ final int minscore=Tools.max(swscoreNoIndel, minMsaLimit);
+ final int[] swscoreArray=msa.fillAndScoreLimited(bases, ss.chrom, ss.start(), ss.stop(), SLOW_RESCUE_PADDING, minscore, ss.gaps);
+
+ if(swscoreArray!=null){
+ //Indices 0, 1, 2 are used as score, start and stop; 3-5 are passed to traceback
+ ss.setSlowScore(swscoreArray[0]);
+ ss.setScore(ss.slowScore);
+ ss.setStart(swscoreArray[1]);
+ ss.setStop(swscoreArray[2]);
+
+ if(verbose){System.err.println("ss="+ss);}
+ if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore && (PRINT_SECONDARY_ALIGNMENTS || USE_SS_MATCH_FOR_PRIMARY)){
+ assert(swscoreArray.length==6) : swscoreArray.length;
+ assert(swscoreArray[0]>=minscore) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore;
+ ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start()-SLOW_RESCUE_PADDING, ss.stop()+SLOW_RESCUE_PADDING,
+ swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null); //TODO: This failed once on a semiperfect low-complexity homo-5-mer sequence.
+ if(ss.match!=null){
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ if(verbose){System.err.println("After quick match: ss="+ss+"\nbases="+new String(bases)+"\nref= "+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop));}
+ assert(ss.lengthsAgree());
+ ss.fixXY(bases, true, msa);
+ ss.clipTipIndels(bases, 4, 10, msa);
+ assert(ss.lengthsAgree());
+ if(verbose){System.err.println("After clipping: ss="+ss);}
+ }
+ }else{ss.match=null;}
+
+ }else{
+ //Limited alignment produced nothing; fall back to the ungapped placement at the original start
+ ss.setSlowScore(swscoreNoIndel);
+ ss.setScore(ss.slowScore);
+ ss.setStart(oldStart);
+ ss.setStop(ss.start()+bases.length-1);
+ }
+ }else{
+ ss.setSlowScore(swscoreNoIndel);
+ ss.setScore(ss.slowScore);
+ ss.setStop(ss.start()+bases.length-1);
+ }
+ ss.setPairedScore(ss.score+1);
+ assert(ss.slowScore<=maxScore);
+ ss.perfect=(ss.slowScore==maxScore);
+ if(ss.perfect){ss.semiperfect=true;}
+ else{ss.setPerfect(bases);}
+ }
+
+
+ /**
+ * Truncates a read's site list to at most cap entries and, when secondary alignments are printed,
+ * also drops non-top sites whose slowScore falls below a floor derived from the top site's slowScore.
+ * Does nothing if r or r.sites is null or cap is negative; a cap of 0 sets r.sites to null.
+ * @param r Read whose site list is trimmed in place
+ * @param cap Maximum number of sites to keep
+ * @param printSecondary If false, only the length cap is applied
+ */
+ protected static final void capSiteList(Read r, int cap, boolean printSecondary){
+ if(r==null || r.sites==null || cap<0){return;}
+ if(cap==0){r.sites=null;}
+ else{
+ for(int i=r.sites.size()-1; i>=cap; i--){r.sites.remove(i);}
+ }
+ if(!printSecondary || r.numSites()<2){return;}
+ int max=r.topSite().slowScore;
+ //Floor is the lower of (top-500) and (top*SECONDARY_SITE_SCORE_RATIO)
+ int min=Tools.min(max-500, (int)(max*SECONDARY_SITE_SCORE_RATIO));
+ //NOTE(review): Tools.max can only raise the floor, and it uses .score while the filter below compares .slowScore;
+ //confirm this really guarantees that the second site survives, as the comment on the next line says.
+ if(r.ambiguous()){//Ensures ambiguous reads will have at least one secondary site
+ min=Tools.max(min, r.sites.get(1).score);
+ }
+ //Iterates from the end so removals do not shift unvisited indices; index 0 is never removed
+ for(int i=r.sites.size()-1; i>0; i--){
+ if(r.sites.get(i).slowScore<min){r.sites.remove(i);}
+ }
+// assert(false) : r.mapScore+", "+max+", "+cap+", "+r.list;
+// assert(r.list.size()<2) : "\n"+max+", "+min+", "+r.list+"\n";
+ }
+
+ /**
+ * Removes trailing sites that occupy exactly the same location as the top site
+ * (same chrom, strand, start and stop). Such duplicates may exist as a result of realignment.
+ * Scans from the end of the list and stops at the first site that differs from the top site.
+ * @param r Read whose site list is cleaned in place
+ * @return Number of sites removed
+ */
+ protected final int removeDuplicateBestSites(Read r){
+ if(r.numSites()<2){return 0;}
+
+ final SiteScore best=r.topSite();
+ int removed=0;
+ int idx=r.sites.size()-1;
+ while(idx>0){
+ final SiteScore candidate=r.sites.get(idx);
+ final boolean sameLocation=(best.chrom==candidate.chrom && best.strand==candidate.strand
+ && best.start()==candidate.start() && best.stop()==candidate.stop());
+ if(!sameLocation){break;}
+
+ if(!Shared.anomaly){
+ System.err.println("Ignoring anomalous duplicate site for rid="+r.numericID);
+ }
+ r.sites.remove(idx);
+ removed++;
+ idx--;
+ }
+ return removed;
+ }
+
+ protected final void removeUnmapped(ArrayList<Read> list){
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r.numSites()==0){
+ if(r.mate==null || r.mate.numSites()==0){
+ list.set(i, null);
+ }
+ }
+ }
+ }
+
+ protected final void removeBlacklisted(ArrayList<Read> list){
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(Blacklist.inBlacklist(r)){
+ list.set(i, null);
+ }
+ }
+ }
+
+ /**
+ * Replaces with null every list entry whose read is mapped.
+ * An entry is cleared when the read has at least one site and its mate, if present, also has at least
+ * one site. This is the mirror image of removeUnmapped, which clears an entry only when neither the
+ * read nor its mate has a site.
+ * The previous body was a verbatim copy of removeUnmapped (it tested numSites()==0) and therefore
+ * removed unmapped reads instead of mapped ones.
+ * The list keeps its length; entries are overwritten rather than deleted.
+ * @param list Reads to filter in place
+ */
+ protected final void removeMapped(ArrayList<Read> list){
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r.numSites()>0){
+ if(r.mate==null || r.mate.numSites()>0){
+ list.set(i, null);
+ }
+ }
+ }
+ }
+
+
+ /** Trims a list of sites; the policy is defined by the implementing subclass and is not visible here.
+ * NOTE(review): the return value is presumably the highest score in the list, as in trimListAdvanced - confirm against implementations. */
+ public abstract int trimList(ArrayList<SiteScore> list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain);
+
+ /**
+ * Trims a site list in two stages: a ratio-based trim via Tools.trimSiteList, then a cutoff derived from
+ * how much each successive site's score contributes to the running sum of scores.
+ * Returns -99999 for a null or empty list and the single site's score for a one-element list.
+ * When indexUsesExtendedScore is set and the highest score equals maxScore with specialCasePerfect,
+ * stricter ratio trims are applied and the method returns early.
+ * @param list Sites to trim in place
+ * @param retainPaired Passed through to the Tools trimming helpers
+ * @param retainSemiperfect Not referenced in this method body
+ * @param maxScore Score treated as perfect for the special case
+ * @param specialCasePerfect Enables the stricter trimming when the top score equals maxScore
+ * @param minSitesToRetain Passed through to the Tools trimming helpers
+ * @param maxSitesToRetain Passed through to the Tools trimming helpers
+ * @param indexUsesExtendedScore Selects the initial trim ratio (.6 if true, .4 otherwise)
+ * @param thresh Minimum fraction of the running score sum a site must contribute; halved when indexUsesExtendedScore is false
+ * @return The highest score as reported by the first Tools.trimSiteList call
+ */
+ public final int trimListAdvanced(ArrayList<SiteScore> list, boolean retainPaired, boolean retainSemiperfect, int maxScore, boolean specialCasePerfect,
+ int minSitesToRetain, int maxSitesToRetain, boolean indexUsesExtendedScore, float thresh){
+ if(list==null || list.size()==0){return -99999;}
+ if(list.size()==1){return list.get(0).score;}
+
+ final int highestScore;
+ if(indexUsesExtendedScore){
+
+ highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ if(highestScore==maxScore && specialCasePerfect){
+ Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ return highestScore;
+ }
+
+ }else{
+ highestScore=Tools.trimSiteList(list, .4f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ thresh=thresh*0.5f;
+ }
+
+ //Walk down the list until a site is negative or adds less than thresh of the cumulative score
+ int lim, lastScore=list.get(0).score;
+ long area=lastScore;
+ for(lim=1; lim<list.size(); lim++){
+ SiteScore ss=list.get(lim);
+ lastScore=ss.score;
+ area=area+lastScore;
+ if(lastScore<0 || (lastScore/(float)area)<thresh){break;}
+ }
+ //NOTE(review): lim is not read after this assignment; the cutoff below uses lastScore only
+ lim=Tools.max(minSitesToRetain, lim);
+ Tools.trimSitesBelowCutoff(list, lastScore, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+ return highestScore;
+ }
+
+
+ /** Scores the sites in the list; the algorithm is defined by the implementing subclass and is not visible here.
+ * NOTE(review): presumably fills in slowScore for each site using plus-strand (basesP) or minus-strand (basesM) bases - confirm against implementations. */
+ public abstract void scoreSlow(final ArrayList<SiteScore> list, final byte[] basesP, final byte[] basesM,
+ final int maxSwScore, final int maxImperfectSwScore);
+
+ /** This is only for saving ambiguous xy which is now irrelevant.
+ * Returns true immediately when save_xy is false. Otherwise the method hits assert(false); if assertions
+ * are disabled it treats the list as ambiguous, clears it when removeAmbiguous is set, and returns true.
+ * The original X/Y chromosome logic is retained below as commented-out code.
+ * The primary and clearzone parameters are referenced only by that commented-out code.
+ * @return Whether the list is considered ambiguous (always true in the active code) */
+ public final boolean processAmbiguous(ArrayList<SiteScore> list, boolean primary, boolean removeAmbiguous, int clearzone, boolean save_xy){
+ if(!save_xy){return true;}
+ assert(false) : "TODO: Needs to be redone with contig names.";
+
+ assert(list.size()>1);
+ boolean ambiguous=true;
+// if(save_xy && minChrom<=24 && maxChrom>=24){
+// int best=list.get(0).score;
+//
+// //Remove everything outside of the clearzone
+// for(int i=list.size()-1; i>0; i--){
+// assert(best>=list.get(i).score);
+// if(best-list.get(i).score>clearzone){
+//// assert(i>1); //No longer true because of clearzone/clearzone2
+// list.remove(i);
+// }else{
+//// assert(i>0); //Maybe no longer true because of clearzone/clearzone2
+// break;
+// }
+// }
+//
+//
+// assert(list.size()>1);
+// int Xcount=0;
+// int Ycount=0;
+// for(SiteScore ss : list){
+// assert(ss.score-list.get(0).score<=clearzone);
+// if(ss.chrom==23){
+// Xcount++;
+// }else if(ss.chrom==24){
+// Ycount++;
+// }
+// }
+// if(Xcount>1 || Ycount>2 || (Xcount+Ycount)<list.size()){
+// ambiguous=true;
+// }else{
+// ambiguous=false;
+// for(int i=list.size()-1; i>0; i--){list.remove(i);}
+// assert(list.size()==1);
+// }
+// }
+ assert(list.size()>=1);
+
+ if(ambiguous){
+ assert(list.size()>1);
+ if(removeAmbiguous){
+ list.clear();
+ }
+ }
+
+ return ambiguous;
+ }
+
+
+ /**
+ * Updates the read-1 mapping statistics counters (and the pair-level counters) for a processed read.
+ * Counts ambiguous reads, pairs in which neither read is usably mapped, per-operation match-string
+ * totals, rescued reads, mated pairs and their inner/outer/insert lengths, bad pairs, perfect and
+ * semiperfect matches, and the correctness categories reported by calcCorrectness.
+ * If OUTPUT_PAIRED_ONLY is set and the read has a mate but is not paired, pair mapping is cleared first.
+ * A read with no elements is counted as a low-quality discard (and flagged discarded) when
+ * maxPossibleQuickScore is -1, and as a no-hit otherwise.
+ * Fix: len2 is now the mate's length. It was previously computed from r.length(), which
+ * miscounted numMatedBases and badPairBases whenever the two reads differ in length.
+ * @param r Read 1; r.mate (if any) is consulted for pair statistics
+ * @param maxSwScore Slow-alignment score of a perfect match; ignored for the perfect test if not positive
+ * @param maxPossibleQuickScore Quick score of a perfect hit, or -1 to mark a low-quality discarded read
+ */
+ public void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore){
+ final Read r2=r.mate;
+ final int len1=r.length();
+ final int len2=(r2==null ? 0 : r2.length());
+
+ if(OUTPUT_PAIRED_ONLY && r.mate!=null && !r.paired() && (r.mapped() || r.mate.mapped())){r.clearPairMapping();}
+ if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){
+ ambiguousBestAlignment1++;
+ ambiguousBestAlignmentBases1+=len1;
+ }
+
+ //A read counts as unmapped here if it has no mapping or is ambiguous while ambiguous reads are tossed
+ if((!r.mapped() || (r.ambiguous() && AMBIGUOUS_TOSS)) && (r2==null || !r2.mapped() || (r2.ambiguous() && AMBIGUOUS_TOSS))){
+ bothUnmapped++;
+ bothUnmappedBases+=r.length();
+ if(r2!=null){
+ bothUnmapped++;
+ bothUnmappedBases+=r2.length();
+ }
+ }
+
+ int[] correctness=calcCorrectness(r, THRESH);
+ int correctGroup=correctness[0];
+ int correctGroupSize=correctness[1];
+ int numGroups=correctness[2];
+ int elements=correctness[3];
+ int correctScore=correctness[4];
+ int topScore=correctness[5];
+ int sizeOfTopGroup=correctness[6];
+ int numCorrect=correctness[7];
+ boolean firstElementCorrect=(correctness[8]==1);
+ boolean firstElementCorrectLoose=(correctness[9]==1);
+ boolean firstGroupCorrectLoose=(correctness[10]==1);
+
+ assert(elements>0 == r.mapped());
+
+ if(elements>0){
+
+ if(r.match!=null){
+ int[] errors=r.countErrors(SamLine.INTRON_LIMIT);
+ matchCountM1+=errors[0];
+ matchCountS1+=errors[1];
+ matchCountD1+=errors[2];
+ matchCountI1+=errors[3];
+ matchCountN1+=errors[4];
+
+ readCountS1+=(errors[1]>0 ? 1 : 0);
+ readCountD1+=(errors[2]>0 ? 1 : 0);
+ readCountI1+=(errors[3]>0 ? 1 : 0);
+// assert(errors[3]==0) : "\n"+r+"\n"+r2+"\n";
+ readCountN1+=(errors[4]>0 ? 1 : 0);
+ readCountSplice1+=(errors[5]>0 ? 1 : 0);
+ readCountE1+=((errors[1]>0 || errors[2]>0 || errors[3]>0)? 1 : 0);
+ }
+
+
+ mappedRetained1++;
+ mappedRetainedBases1+=len1;
+ if(r.rescued()){
+ if(r.strand()==Gene.PLUS){
+ rescuedP1++;
+ }else{
+ rescuedM1++;
+ }
+ }
+ if(r.paired()){
+ numMated++;
+ numMatedBases+=(len1+len2);
+ int inner;
+ int outer;
+ if(r.start<=r2.start){
+ inner=r2.start-r.stop;
+ outer=r2.stop-r.start;
+ }else{
+ inner=r.start-r2.stop;
+ outer=r.stop-r2.start;
+ }
+
+ //Clamp the inner distance to [MIN_PAIR_DIST, MAX_PAIR_DIST] before accumulating
+ inner=Tools.min(MAX_PAIR_DIST, inner);
+ inner=Tools.max(MIN_PAIR_DIST, inner);
+ innerLengthSum+=inner;
+ outerLengthSum+=outer;
+ insertSizeSum+=(inner+r.length()+r2.length());
+ }else if(r2!=null && r2.mapped()/*&& r.list!=null && r.list.size()>0*/){
+ badPairs++;
+ badPairBases+=(len1+len2);
+ }
+
+ if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){
+ perfectMatch1++;
+ perfectMatchBases1+=len1;
+ }else if(SLOW_ALIGN){
+ assert(r.topSite().slowScore<maxSwScore) : maxSwScore+"\t"+r.topSite().toText();
+ }
+
+ int foundSemi=0;
+ for(SiteScore ss : r.sites){
+ if(ss.perfect){
+ perfectHitCount1++;
+ assert(ss.semiperfect);
+ }
+ if(ss.semiperfect){
+ semiPerfectHitCount1++;
+ foundSemi=1;
+ }
+ }
+ semiperfectMatch1+=foundSemi;
+ if(foundSemi>0){semiperfectMatchBases1+=len1;}
+
+ if(firstElementCorrect){
+ if(r.strand()==Gene.PLUS){firstSiteCorrectP1++;}
+ else{firstSiteCorrectM1++;}
+ if(r.paired()){firstSiteCorrectPaired1++;}
+ else{firstSiteCorrectSolo1++;}
+ if(r.rescued()){firstSiteCorrectRescued1++;}
+ }else{
+ firstSiteIncorrect1++;
+// System.out.println("********");
+// System.out.println(r.toText(false));
+// System.out.println(r2.toText(false));
+ }
+
+ if(firstElementCorrectLoose){
+ firstSiteCorrectLoose1++;
+ }else{
+ firstSiteIncorrectLoose1++;
+ }
+
+ siteSum1+=elements;
+ topSiteSum1+=sizeOfTopGroup;
+
+ if(topScore==maxPossibleQuickScore){perfectHit1++;}
+ if(sizeOfTopGroup==1){uniqueHit1++;}
+
+ if(correctGroup>0){
+
+ if(r.strand()==Gene.PLUS){truePositiveP1++;}
+ else{truePositiveM1++;}
+ totalCorrectSites1+=numCorrect;
+
+ if(correctGroup==1){
+ if(sizeOfTopGroup==1){
+ correctUniqueHit1++;
+ }else{
+ correctMultiHit1++;
+ }
+ }else{
+ correctLowHit1++;
+ }
+
+ }else{
+
+ falsePositive1++;
+// System.out.println("********");
+// System.out.println(r.toText(false));
+// System.out.println(r2.toText(false));
+ }
+ }else if(maxPossibleQuickScore==-1){
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=len1;
+ r.setDiscarded(true);
+ }else{
+ noHit1++;
+ }
+ }
+
+
+ /**
+ * Updates the read-2 mapping statistics counters for a processed read.
+ * Counts ambiguous reads, per-operation match-string totals, rescued reads, perfect and semiperfect
+ * matches, and the correctness categories reported by calcCorrectness.
+ * This method does not touch the pair-level counters (numMated, badPairs, bothUnmapped, length sums).
+ * A read with no elements is counted as a low-quality discard when maxPossibleQuickScore is -1,
+ * and as a no-hit otherwise.
+ * @param r Read 2
+ * @param maxSwScore Slow-alignment score of a perfect match; ignored for the perfect test if not positive
+ * @param maxPossibleQuickScore Quick score of a perfect hit, or -1 to mark a low-quality discarded read
+ */
+ public void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore){
+ final int len=r.length();
+
+ if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){
+ ambiguousBestAlignment2++;
+ ambiguousBestAlignmentBases2+=len;
+ }
+
+ int[] correctness=calcCorrectness(r, THRESH);
+ int correctGroup=correctness[0];
+ int correctGroupSize=correctness[1];
+ int numGroups=correctness[2];
+ int elements=correctness[3];
+ int correctScore=correctness[4];
+ int topScore=correctness[5];
+ int sizeOfTopGroup=correctness[6];
+ int numCorrect=correctness[7];
+ boolean firstElementCorrect=(correctness[8]==1);
+ boolean firstElementCorrectLoose=(correctness[9]==1);
+ boolean firstGroupCorrectLoose=(correctness[10]==1);
+
+ if(elements>0){
+
+ if(r.match!=null){
+ int[] errors=r.countErrors(SamLine.INTRON_LIMIT);
+ matchCountM2+=errors[0];
+ matchCountS2+=errors[1];
+ matchCountD2+=errors[2];
+ matchCountI2+=errors[3];
+ matchCountN2+=errors[4];
+
+ readCountS2+=(errors[1]>0 ? 1 : 0);
+ readCountD2+=(errors[2]>0 ? 1 : 0);
+ readCountI2+=(errors[3]>0 ? 1 : 0);
+ readCountN2+=(errors[4]>0 ? 1 : 0);
+ readCountSplice2+=(errors[5]>0 ? 1 : 0);
+ readCountE2+=((errors[1]>0 || errors[2]>0 || errors[3]>0)? 1 : 0);
+ }
+
+ mappedRetained2++;
+ mappedRetainedBases2+=len;
+ if(r.rescued()){
+ if(r.strand()==Gene.PLUS){
+ rescuedP2++;
+ }else{
+ rescuedM2++;
+ }
+ }
+
+ if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){
+ perfectMatch2++;
+ perfectMatchBases2+=len;
+ }else if(SLOW_ALIGN){
+ assert(r.topSite().slowScore<maxSwScore) : maxSwScore+"\t"+r.topSite().toText();
+ }
+
+ //foundSemi records whether any site of this read is semiperfect
+ int foundSemi=0;
+ for(SiteScore ss : r.sites){
+ if(ss.perfect){
+ perfectHitCount2++;
+ assert(ss.semiperfect);
+ }
+ if(ss.semiperfect){
+ semiPerfectHitCount2++;
+ foundSemi=1;
+ }
+ }
+ semiperfectMatch2+=foundSemi;
+ if(foundSemi>0){semiperfectMatchBases2+=len;}
+
+ if(firstElementCorrect){
+ if(r.strand()==Gene.PLUS){firstSiteCorrectP2++;}
+ else{firstSiteCorrectM2++;}
+ if(r.paired()){firstSiteCorrectPaired2++;}
+ else{firstSiteCorrectSolo2++;}
+ if(r.rescued()){firstSiteCorrectRescued2++;}
+ }else{
+ firstSiteIncorrect2++;
+// System.out.println("********");
+// System.out.println(r.toText(false));
+// System.out.println(r.mate.toText(false));
+ }
+
+ if(firstElementCorrectLoose){
+ firstSiteCorrectLoose2++;
+ }else{
+ firstSiteIncorrectLoose2++;
+ }
+
+ siteSum2+=elements;
+ topSiteSum2+=sizeOfTopGroup;
+
+ if(topScore==maxPossibleQuickScore){perfectHit2++;}
+ if(sizeOfTopGroup==1){uniqueHit2++;}
+
+ if(correctGroup>0){
+
+ if(r.strand()==Gene.PLUS){truePositiveP2++;}
+ else{truePositiveM2++;}
+ totalCorrectSites2+=numCorrect;
+
+ if(correctGroup==1){
+ if(sizeOfTopGroup==1){
+ correctUniqueHit2++;
+ }else{
+ correctMultiHit2++;
+ }
+ }else{
+ correctLowHit2++;
+ }
+
+ }else{
+
+ falsePositive2++;
+// System.out.println("********");
+// System.out.println(r.toText(false));
+// System.out.println(r.mate.toText(false));
+ }
+ }else if(maxPossibleQuickScore==-1){
+ lowQualityReadsDiscarded2++;
+ lowQualityBasesDiscarded2+=len;
+ }else{
+ noHit2++;
+ }
+ }
+
+ /** Processes a single read; the behavior is defined by the implementing subclass and is not visible here.
+ * NOTE(review): basesM is presumably the minus-strand (reverse-complement) bases of r - confirm against implementations. */
+ public abstract void processRead(Read r, final byte[] basesM);
+
+ /**
+ * Deprecated predecessor of applyClearzone3.
+ * For an unpaired, mapped, non-ambiguous, non-discarded read with at least two sites, lowers the score
+ * and slowScore of every site (and r.mapScore) when the second-best site scores within CLEARZONE3 of
+ * the best. The penalty grows as the gap shrinks; a third site within CLEARZONE3 adds a quarter-weight term.
+ * @param r Read to penalize in place
+ * @param CLEARZONE3 Score gap below which a penalty applies
+ * @param INV_CLEARZONE3 Multiplier applied to (CLEARZONE3-gap); presumably 1/CLEARZONE3 - confirm against callers
+ * @return true if a positive penalty was subtracted
+ */
+ @Deprecated
+ protected final boolean applyClearzone3_old(Read r, int CLEARZONE3, float INV_CLEARZONE3){
+
+ assert(!r.paired()); //This is currently for unpaired reads
+ if(!r.mapped() || r.ambiguous() || r.discarded() || r.numSites()<2){return false;}
+
+ final int score1=r.topSite().slowScore;
+ final int score2=r.sites.get(1).slowScore;
+ final int score3=(r.sites.size()>2 ? r.sites.get(2).slowScore : -1);
+ int dif=score1-score2;
+
+ assert(r.mapScore==score1) : r.mapScore+", "+r.topSite().toText();
+
+ assert(score1==r.mapScore);
+ assert(score1>=score2) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n";
+ if(dif>=CLEARZONE3){return false;}
+
+// final int dif2=40+(CLEARZONE3-dif)/3;
+// final int dif2=(CLEARZONE3-dif)/2;
+ int dif2=(CLEARZONE3-dif);
+
+ float f=dif2*INV_CLEARZONE3;
+
+ int sub=(dif2+2*(int)(f*dif2));
+
+ //score3 is -1 when there is no third site
+ if(score3!=-1){
+ assert(score1>=score3);
+ dif=score1-score3;
+ assert(score1>=score3);
+ if(dif<CLEARZONE3){
+ dif2=(CLEARZONE3-dif);
+
+ f=dif2*INV_CLEARZONE3;
+ sub=sub+(dif2+2*(int)(f*dif2))/4;
+
+// sub=sub+(dif2)/2;
+ }
+ }
+
+ for(SiteScore ss : r.sites){
+ ss.setSlowScore(ss.slowScore-sub);
+ ss.setScore(ss.score-sub);
+ }
+ r.mapScore-=sub;
+ return sub>0;
+ }
+
+
+ /**
+ * Penalizes the scores of an unpaired read whose lower-ranked sites score close to its top site.
+ * For up to CZ3_MULTS.length sites, accumulates calcCZ3_fraction(top, site) weighted by CZ3_MULTS[i],
+ * stopping at the first site that contributes nothing. The sum is scaled by 1.8 and passed through a
+ * saturating curve whose asymptote depends on read length, then multiplied by CLEARZONE3 and rounded.
+ * The penalty is capped so that r.mapScore does not fall below 300, and is subtracted from the score
+ * and slowScore of every site as well as from r.mapScore.
+ * @param r Read to penalize in place; must be unpaired
+ * @param CLEARZONE3 Score gap below which a site contributes to the penalty
+ * @param INV_CLEARZONE3 Passed to calcCZ3_fraction; presumably 1/CLEARZONE3 - confirm against callers
+ * @return true if a positive penalty was applied
+ */
+ protected final boolean applyClearzone3(Read r, int CLEARZONE3, float INV_CLEARZONE3){
+
+ assert(!r.paired()); //This is currently for unpaired reads
+ final ArrayList<SiteScore> list=r.sites;
+ if(!r.mapped() || r.ambiguous() || r.discarded() || list==null || list.size()<2){return false;}
+
+ final int score1=list.get(0).slowScore;
+ assert(r.mapScore==score1) : r.mapScore+", "+list.get(0).toText()+"\n"+r;
+
+ float sub=0;
+ final int max=Tools.min(CZ3_MULTS.length, list.size());
+ for(int i=1; i<max; i++){
+ final SiteScore ss2=list.get(i);
+ assert(ss2!=null) : r;
+ if(ss2!=null){
+ //Beyond the third site, stop as soon as the scores start decreasing
+ if(i>2 && ss2.slowScore<list.get(i-1).slowScore){break;}
+// final int score2=list.get(i).slowScore;
+// final int dif=score1-score2;
+// if(dif>=CLEARZONE3){break;}
+// int dif2=(CLEARZONE3-dif);
+// float f=dif2*INV_CLEARZONE3;
+// sub+=(dif2+2*(f*dif2))*CZ3_MULTS[i];
+ float f=calcCZ3_fraction(score1, ss2.slowScore, CLEARZONE3, INV_CLEARZONE3);
+ if(f<=0){break;}
+ sub+=(f*CZ3_MULTS[i]);
+ }
+ }
+ assert(sub>=0);
+ if(sub<=0){return false;}
+
+ float sub2;
+// float asymptote=8f+0.0267f*r.length();
+ float asymptote=4f+0.03f*r.length();
+ sub=sub*1.8f;
+ //Saturating transform: approaches CLEARZONE3*asymptote as sub grows
+ sub2=CLEARZONE3*((asymptote*sub)/(sub+asymptote));
+// sub2=CLEARZONE3*sub;
+// System.out.println("sub="+sub+", sub2="+sub2+", CLEARZONE3="+CLEARZONE3+", (5*sub)="+(5*sub)+", (sub+5*CLEARZONE3)="+(sub+5*CLEARZONE3));
+ int subi=(int)(sub2+0.5f);
+ //Never push mapScore below 300
+ if(subi>=r.mapScore-300){
+ subi=r.mapScore-300;
+ }
+ if(subi<=0){return false;}
+
+ for(SiteScore ss : list){
+ ss.setSlowScore(ss.slowScore-subi);
+ ss.setScore(ss.score-subi);
+ }
+ r.mapScore-=subi;
+ assert(r.mapScore>200);
+ return true;
+ }
+
+
+// protected float calcCZ3(int score1, int score2, int CLEARZONE3, float INV_CLEARZONE3){
+//
+// int dif=score1-score2;
+// if(dif>=CLEARZONE3){return 0;}
+// //Now dif is between 0 and CZ3
+//
+//// final int dif2=40+(CLEARZONE3-dif)/3;
+//// final int dif2=(CLEARZONE3-dif)/2;
+// int dif2=(CLEARZONE3-dif); //dif2 is higher if the scores are closer.
+//
+// float f=dif2*INV_CLEARZONE3; //f ranges linearly from 1 (if the scores are identical) to 0 (when score2 is maximally below score1)
+//
+// float f2=f*f;
+// float f7=(float)Math.pow(f, .7);
+//
+//// return (dif2+2f*f*dif2+2f*Tools.min(f2,0.5f)*dif2);
+// return (CLEARZONE3*f7+2f*f*dif2+2f*Tools.min(f2,0.5f)*dif2);
+// }
+
+
+ protected float calcCZ3_fraction(int score1, int score2, int CLEARZONE3, float INV_CLEARZONE3){
+
+ int dif=score1-score2;
+ if(dif>=CLEARZONE3){return 0;}
+ //Now dif is between 0 and CZ3
+
+// final int dif2=40+(CLEARZONE3-dif)/3;
+// final int dif2=(CLEARZONE3-dif)/2;
+ int dif2=(CLEARZONE3-dif); //dif2 is higher if the scores are closer.
+
+ float f=dif2*INV_CLEARZONE3; //f ranges linearly from 1 (if the scores are identical) to 0 (when score2 is maximally below score1)
+
+ float f2=f*f;
+// float f7=(float)Math.pow(f, .7);
+
+// return (dif2+2f*f*dif2+2f*Tools.min(f2,0.5f)*dif2);
+ return f+2f*f2+2f*f2*f;
+ }
+
+ /** Returns number of perfect pairs.
+ * Abstract: the pairing itself is implemented by the concrete subclass.
+ * @param r First read
+ * @param r2 Second read; presumably the mate of r - TODO confirm
+ * @param trim NOTE(review): meaning not visible here; pairSiteScoresFinal uses a same-named flag to trim low-scoring sites
+ * @return Number of perfect pairs */
+ public abstract int pairSiteScoresInitial(Read r, Read r2, boolean trim);
+
+
+
+
+
+ /**
+ * Recomputes the paired score of every site of r and r2 from the sites that can pair with each other.
+ * All pairedScore fields are first reset to 0. Both site lists are then sorted with SiteScore.PCOMP
+ * (presumably by chromosome and position, which the sweep below relies on - TODO confirm) and swept together.
+ * A combination is accepted when outerdist>=outerDistLimit and innerdist<=MAX_PAIR_DIST, and either the strand
+ * relationship matches SAME_STRAND_PAIRS or REQUIRE_CORRECT_STRANDS_PAIRS is false.
+ * @param r First read
+ * @param r2 Second read
+ * @param trim If true, trim both lists with Tools.trimSitesBelowCutoff using a cutoff derived from the best site that paired
+ * @param setScore If true, raise each site's score to its pairedScore when the latter is larger
+ * @param MAX_PAIR_DIST Largest inner distance at which two sites are paired
+ * @param AVERAGE_PAIR_DIST Expected inner distance; deviation from it reduces the pairing bonus
+ * @param SAME_STRAND_PAIRS Whether a correctly oriented pair has both sites on the same strand
+ * @param REQUIRE_CORRECT_STRANDS_PAIRS If true, combinations with the wrong strand relationship are not paired
+ * @param maxTrimSitesToRetain Passed to Tools.trimSitesBelowCutoff
+ */
+ protected static void pairSiteScoresFinal(Read r, Read r2, boolean trim, boolean setScore, int MAX_PAIR_DIST, int AVERAGE_PAIR_DIST,
+ boolean SAME_STRAND_PAIRS, boolean REQUIRE_CORRECT_STRANDS_PAIRS, int maxTrimSitesToRetain){
+
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){ss.setPairedScore(0);}
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){ss.setPairedScore(0);}
+ }
+
+ if(r.numSites()<1 || r2.numSites()<1){return;}
+
+ SiteScore.PCOMP.sort(r.sites);
+ SiteScore.PCOMP.sort(r2.sites);
+
+ int maxPairedScore1=-1;
+ int maxPairedScore2=-1;
+
+
+// if(verbose){
+// System.out.println(r.list.size()+", "+r2.list.size());
+// System.out.println();
+// for(SiteScore ss : r.list){
+// System.out.println(ss.toText());
+// }
+// System.out.println();
+// for(SiteScore ss : r2.list){
+// System.out.println(ss.toText());
+// }
+// System.out.println();
+// }
+
+ //Fraction of the mate's score granted as a pairing bonus; clamped to [1/4, 1/2] and scaled by relative read length.
+ final float mult1=Tools.min(1/2f, Tools.max(1/4f, (r.length()/(4f*r2.length()))));
+ final float mult2=Tools.min(1/2f, Tools.max(1/4f, (r2.length()/(4f*r.length()))));
+
+ final int ilimit=r.sites.size()-1;
+ final int jlimit=r2.sites.size()-1;
+
+ final int outerDistLimit=(Tools.max(r.length(), r2.length())*OUTER_DIST_MULT)/OUTER_DIST_DIV; //Minimum pairing distance
+ final int expectedFragLength=AVERAGE_PAIR_DIST+r.length()+r2.length();
+
+ if(verboseS){
+ System.err.println("************************** PAIRING ********************************");
+ System.err.println("outerDistLimit="+outerDistLimit+", MAX_PAIR_DIST="+MAX_PAIR_DIST);
+ }
+
+ //i walks the sites of r; j is the first site of r2 that may still be within range of site i.
+ for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){
+ SiteScore ss1=r.sites.get(i);
+ SiteScore ss2=r2.sites.get(j);
+
+ //Skip r2 sites on an earlier chromosome or too far to the left of ss1.
+ while(j<jlimit && (ss2.chrom<ss1.chrom || (ss2.chrom==ss1.chrom && ss1.start()-ss2.stop()>MAX_PAIR_DIST))){
+ j++;
+// if(verbose){System.err.println("a.Incrementing j->"+j);}
+ ss2=r2.sites.get(j);
+ }
+
+ for(int k=j; k<=jlimit; k++){
+ ss2=r2.sites.get(k);
+
+ if(verboseS){
+ System.err.println("Considering sites:\n"+ss1+"\n"+ss2);
+ }
+
+ if(ss2.chrom>ss1.chrom){break;}
+ // if(verbose){System.err.println("Same chrom");}
+ if(ss2.start()-ss1.stop()>MAX_PAIR_DIST){break;}
+
+ final int innerdist;
+ final int outerdist;
+
+ //assert(!SAME_STRAND_PAIRS) : "TODO";
+
+ //Opposite strands with strand checking on: the plus-strand site is treated as the left one.
+ //Every other case orders the sites by start coordinate (the two branches below are identical).
+ if(REQUIRE_CORRECT_STRANDS_PAIRS){
+ if(ss1.strand!=ss2.strand){
+ if(ss1.strand==Gene.PLUS){
+ innerdist=ss2.start()-ss1.stop();
+ outerdist=ss2.stop()-ss1.start();
+ }else{
+ innerdist=ss1.start()-ss2.stop();
+ outerdist=ss1.stop()-ss2.start();
+ }
+ }else{
+ if(ss1.start()<=ss2.start()){
+ innerdist=ss2.start()-ss1.stop();
+ outerdist=ss2.stop()-ss1.start();
+ }else{
+ innerdist=ss1.start()-ss2.stop();
+ outerdist=ss1.stop()-ss2.start();
+ }
+ }
+ }else{
+ if(ss1.start()<=ss2.start()){
+ innerdist=ss2.start()-ss1.stop();
+ outerdist=ss2.stop()-ss1.start();
+ }else{
+ innerdist=ss1.start()-ss2.stop();
+ outerdist=ss1.stop()-ss2.start();
+ }
+ }
+
+ if(verboseS){
+ System.err.println("innerdist="+innerdist+", outerdist="+outerdist);
+ }
+
+// if(ss1.start()<=ss2.start()){
+// innerdist=ss2.start()-ss1.stop();
+// outerdist=ss2.stop()-ss1.start();
+// }else{
+// innerdist=ss1.start()-ss2.stop();
+// outerdist=ss1.stop()-ss2.start();
+// }
+ assert(outerdist>=innerdist) : "outerdist<innerdist:\n"+innerdist+", "+outerdist+", "+ss1+", "+ss2;
+
+ if(outerdist>=outerDistLimit && innerdist<=MAX_PAIR_DIST){
+
+ boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS);
+ // if(verbose){System.err.println("strandOK="+strandOK);}
+
+ if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){
+
+ int deviation=absdif(AVERAGE_PAIR_DIST, innerdist);
+
+ final int pairedScore1;
+ final int pairedScore2;
+ if(strandOK){
+ // pairedScore1=ss1.score+(int)(ss2.score*mult1);
+ // pairedScore2=ss2.score+(int)(ss1.score*mult2);
+
+ //Own score + 1 + a bonus of at least 1: a fraction of the mate's score, reduced in proportion to the deviation.
+ pairedScore1=ss1.score+1+
+ Tools.max(1, (int)(ss2.score*mult1)-(((deviation)*ss2.score)/Tools.max(100,(10*expectedFragLength+100))));
+ pairedScore2=ss2.score+1+
+ Tools.max(1, (int)(ss1.score*mult2)-(((deviation)*ss1.score)/Tools.max(100,(10*expectedFragLength+100))));
+
+
+ }else{//e.g. a junction
+ pairedScore1=ss1.score+ss2.score/16;
+ pairedScore2=ss2.score+ss1.score/16;
+ }
+
+ if(verboseS){
+ System.err.println("strandOK="+strandOK+"\tpairedScore1="+pairedScore1+", pairedScore2="+pairedScore2);
+ System.err.println(" \tscore1="+ss1.score+", score2="+ss2.score);
+ }
+
+ ss1.setPairedScore(Tools.max(ss1.pairedScore, pairedScore1));
+ ss2.setPairedScore(Tools.max(ss2.pairedScore, pairedScore2));
+ //Despite the names, these track the best unpaired score (ss.score) among the sites that paired.
+ maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+ maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ // if(verbose){System.err.println("Paired:\nss1="+ss1.toText()+", ss2="+ss2.toText());}
+ }
+ }else{
+ // if(verbose){System.err.println("Out of range");}
+ }
+ }
+ // if(verbose){System.err.println("\nss1="+ss1.toText()+", ss2="+ss2.toText());}
+
+ }
+
+ if(setScore){
+ for(SiteScore ss : r.sites){
+ if(ss.pairedScore>ss.score){ss.setScore(ss.pairedScore);}
+ else{assert(ss.pairedScore==0);}
+ }
+ for(SiteScore ss : r2.sites){
+ if(ss.pairedScore>ss.score){ss.setScore(ss.pairedScore);}
+ else{assert(ss.pairedScore==0);}
+ }
+ }
+
+ if(trim){
+// Tools.trimSitesBelowCutoffInplace(r.list, (int)(maxPairedScore1*.95f), false);
+// Tools.trimSitesBelowCutoffInplace(r2.list, (int)(maxPairedScore2*.95f), false);
+ float f=Tools.min(SECONDARY_SITE_SCORE_RATIO, 0.95f);
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*f), false, true, 1, maxTrimSitesToRetain);
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*f), false, true, 1, maxTrimSitesToRetain);
+ }
+ }
+
+ /**
+ * Tests whether two sites are close enough, and correctly oriented, to be treated as a pair.
+ * @param ss1 Site of the first read
+ * @param ss2 Site of the second read
+ * @param len1 Length of the first read
+ * @param len2 Length of the second read
+ * @param REQUIRE_CORRECT_STRANDS_PAIRS If true, the strand relationship must match SAME_STRAND_PAIRS
+ * @param SAME_STRAND_PAIRS Whether a correct pair has both sites on the same strand
+ * @param MAX_PAIR_DIST Largest allowed inner distance
+ * @return true if the sites share a chromosome, pass the optional strand test, and satisfy
+ * outerdist>=outerDistLimit and innerdist<=MAX_PAIR_DIST
+ */
+ protected final boolean canPair(SiteScore ss1, SiteScore ss2, int len1, int len2,
+         boolean REQUIRE_CORRECT_STRANDS_PAIRS, boolean SAME_STRAND_PAIRS, int MAX_PAIR_DIST){
+     if(ss1.chrom!=ss2.chrom){return false;}
+
+     final boolean sameStrand=(ss1.strand==ss2.strand);
+     if(REQUIRE_CORRECT_STRANDS_PAIRS && sameStrand!=SAME_STRAND_PAIRS){return false;}
+
+     final int outerDistLimit=(Tools.max(len1, len2)*(OUTER_DIST_MULT))/OUTER_DIST_DIV;
+
+     if(verboseS){
+         System.err.println("canPair: outerDistLimit="+outerDistLimit);
+     }
+
+     //Decide which site is treated as the left one: by strand when strands differ and strand
+     //checking is on, otherwise by start coordinate.
+     final boolean ss1IsLeft;
+     if(REQUIRE_CORRECT_STRANDS_PAIRS && !sameStrand){
+         ss1IsLeft=(ss1.strand==Gene.PLUS);
+     }else{
+         ss1IsLeft=(ss1.start()<=ss2.start());
+     }
+
+     final int innerdist;
+     final int outerdist;
+     if(ss1IsLeft){
+         innerdist=ss2.start()-ss1.stop();
+         outerdist=ss2.stop()-ss1.start();
+     }else{
+         innerdist=ss1.start()-ss2.stop();
+         outerdist=ss1.stop()-ss2.start();
+     }
+
+     return (outerdist>=outerDistLimit && innerdist<=MAX_PAIR_DIST);
+ }
+
+
+// /** Returns the number of additional bases away that should be searched for slow align.
+// * This should probably be called between quickMap and slowAlign, only on
+// * sites where stop-start<=bases.length-1 */
+// public abstract void findTipDeletions(final Read r, final byte[] basesP, final byte[] basesM, final int maxSwScore, final int maxImperfectScore);
+//
+// public abstract boolean findTipDeletions(SiteScore ss, final byte[] bases, final int maxImperfectScore, boolean lookRight, boolean lookLeft);
+
+
+ /** Returns the number of additional bases away that should be searched for slow align.
+ * This should probably be called between quickMap and slowAlign, only on
+ * sites where stop-start<=bases.length-1.
+ * Compares the last tiplen read bases against the reference ending at originalStop. If that tip has at
+ * least 3 mismatches, the tip (shortened to end at its last mismatch) is slid rightward for up to
+ * searchDist positions to find a placement with fewer mismatches.
+ * @param bases Read bases
+ * @param chrom Chromosome number passed to Data.getChromosome
+ * @param originalStop Reference coordinate aligned to the last read base
+ * @param searchDist Maximum distance to slide; further limited to 30 per original mismatch
+ * @param tiplen Number of tip bases to examine
+ * @return Distance from originalStop to the better stop, or 0 if the site is too close to either end of
+ * the reference array, the tip is already good, or no placement with at most 2 mismatches and at least
+ * 2 fewer mismatches than the original was found
+ */
+ protected final int findTipDeletionsRight(final byte[] bases, final int chrom,
+         int originalStop, int searchDist, int tiplen){
+     ChromosomeArray cha=Data.getChromosome(chrom);
+     byte[] ref=cha.array;
+     if(originalStop<cha.minIndex+tiplen-1){return 0;} //fail
+     //Mirror of the upper-bound test in findTipDeletionsLeft: ref[originalStop] must exist.
+     if(originalStop>=ref.length){return 0;} //fail
+
+     int bestStart=originalStop;
+
+     final int tipCoord=bases.length-1;
+
+     //Count mismatches in the tip, giving up after 5 consecutive matches.
+     int lastMismatch=0;
+     int originalMismatches=0;
+     int contig=0;
+     for(int i=0; i<tiplen && contig<5; i++){
+         if(bases[tipCoord-i]!=ref[originalStop-i]){
+             originalMismatches++;
+             lastMismatch=i;
+             contig=0;
+         }else{
+             contig++;
+         }
+     }
+     if(originalMismatches<3){return 0;}
+     int minMismatches=originalMismatches;
+     tiplen=lastMismatch+1; //Only the part of the tip up to its last mismatch is re-tested
+     assert(tiplen>1);
+     if(tiplen<4){return 0;}
+
+     searchDist=Tools.min(searchDist, 30*originalMismatches);
+     int lastIndexToStart=Tools.min(ref.length-1, originalStop+searchDist);
+     for(int start=originalStop+1; start<=lastIndexToStart && minMismatches>0; start++){
+         int mismatches=0;
+         for(int j=0; j<tiplen && mismatches<minMismatches; j++){
+             if(bases[tipCoord-j]!=ref[start-j]){
+                 mismatches++;
+             }
+         }
+         if(mismatches<minMismatches){
+             bestStart=start;
+             minMismatches=mismatches;
+         }
+     }
+     if(minMismatches>2 || originalMismatches-minMismatches<2){
+         return 0;
+     }
+     return bestStart-originalStop;
+ }
+
+
+ /** Returns the number of additional bases away that should be searched for slow align.
+ * This should probably be called between quickMap and slowAlign, only on
+ * sites where stop-start<=bases.length-1.
+ * Compares the first tiplen read bases against the reference starting at originalStart. If that tip has
+ * at least 3 mismatches, the tip (shortened to end at its last mismatch) is slid leftward for up to
+ * searchDist positions to find a placement with fewer mismatches.
+ * @param bases Read bases
+ * @param chrom Chromosome number passed to Data.getChromosome
+ * @param originalStart Reference coordinate aligned to the first read base
+ * @param searchDist Maximum distance to slide; further limited to 16+16*mismatches+8*tiplen
+ * @param tiplen Number of tip bases to examine
+ * @return Distance from the better start to originalStart, or 0 if the site is too close to either end
+ * of the reference array, the tip is already good, or no placement with at most 2 mismatches and at
+ * least 2 fewer mismatches than the original was found */
+ protected final int findTipDeletionsLeft(final byte[] bases, final int chrom,
+ final int originalStart, int searchDist, int tiplen){
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ byte[] ref=cha.array;
+ if(originalStart+tiplen>=ref.length){return 0;} //fail
+
+ if(cha.minIndex>=originalStart){return 0;} //fail
+
+ int minMismatches=tiplen;
+ int bestStart=originalStart;
+
+ //Count mismatches in the tip, giving up after 5 consecutive matches.
+ int lastMismatch=0;
+ int originalMismatches=0;
+ int contig=0;
+ for(int i=0; i<tiplen && contig<5; i++){
+ if(bases[i]!=ref[originalStart+i]){
+ originalMismatches++;
+ lastMismatch=i;
+ contig=0;
+ }else{
+ contig++;
+ }
+ }
+// System.err.print("!");
+ if(originalMismatches<3){return 0;}
+ minMismatches=originalMismatches;
+ //Only the part of the tip up to its last mismatch is re-tested.
+ tiplen=lastMismatch+1;
+ assert(tiplen>1);
+ if(tiplen<4){return 0;}
+// System.err.println("Tiplen="+tiplen+", mismatches="+originalMismatches);
+// System.err.print("* ");
+
+ searchDist=Tools.min(searchDist, 16+16*originalMismatches+8*tiplen);
+ int lastIndexToStart=Tools.max(cha.minIndex, originalStart-searchDist);
+ for(int start=originalStart-1; start>=lastIndexToStart && minMismatches>0; start--){
+// System.err.print("_");
+ int mismatches=0;
+ for(int j=0; j<tiplen && mismatches<minMismatches; j++){
+ if(bases[j]!=ref[start+j]){
+ mismatches++;
+ }
+ }
+// System.err.print(mismatches+" ");
+ if(mismatches<minMismatches){
+ bestStart=start;
+ minMismatches=mismatches;
+ }
+ }
+// System.err.println("\noriginalStop+1:"+(originalStop+1)+"\nlastIndexToStart:"+(lastIndexToStart)+"\ntiplen: "+tiplen+"\noriginalMismatches: "+originalMismatches+"\nminMismatches: "+minMismatches+"\n");
+ if(minMismatches>2 || originalMismatches-minMismatches<2){
+ return 0;
+ }
+// System.err.println(" $$$ ");
+ return originalStart-bestStart;
+ }
+
+
+// public abstract void rescue(Read anchor, Read loose, byte[] basesP, byte[] basesM, int searchDist);
+
+
+// public abstract void slowRescue(final byte[] bases, SiteScore ss, final int maxScore, final int maxImperfectScore,
+// boolean findTipDeletionsRight, boolean findTipDeletionsLeft);
+
+
+ /** Assumes bases/colors are already on the correct strand.
+ * Brute-force scan for the best ungapped placement of bases on chrom near loc.
+ * Each candidate start is scored as (number of matching positions)+(longest match run that was ended by
+ * a mismatch); ties are broken by distance from idealStart. An 'N' in the read counts as a mismatch.
+ * @param bases Read bases; null or fewer than 10 bases returns null
+ * @param chrom Chromosome number passed to Data.getChromosome
+ * @param strand Strand stored in the returned SiteScore
+ * @param loc Anchor coordinate of the scan
+ * @param searchDist Width of the scanned window
+ * @param searchRight If true scan [loc, loc+searchDist] ascending, otherwise [loc-searchDist, loc] descending;
+ * both windows are clamped to the reference array
+ * @param idealStart Preferred start; used for tie-breaking and to narrow the window after a mismatch-free hit
+ * @param maxAllowedMismatches Initial mismatch budget
+ * @param POINTS_MATCH Used in the affine dummy score
+ * @param POINTS_MATCH2 Used in the affine dummy score
+ * @return A SiteScore flagged as rescued whose slowScore temporarily holds the mismatch count,
+ * or null if no candidate was accepted */
+ public final SiteScore quickRescue(final byte[] bases, final int chrom, final byte strand, final int loc, final int searchDist,
+ final boolean searchRight, final int idealStart, final int maxAllowedMismatches, int POINTS_MATCH, int POINTS_MATCH2){
+ if(bases==null || bases.length<10){return null;}
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ byte[] ref=cha.array;
+
+ int lowerBound, upperBound;
+ if(searchRight){
+ lowerBound=Tools.max(cha.minIndex, loc);
+ upperBound=Tools.min(ref.length-bases.length, loc+searchDist);
+ }else{
+ lowerBound=Tools.max(cha.minIndex, loc-searchDist);
+ upperBound=Tools.min(ref.length-bases.length, loc);
+ }
+// assert(false) : lowerBound+", "+upperBound;
+// int minMismatches=(int)(bases.length*.6f); //Default: .75f. Lower numbers are faster with lower quality.
+ //NOTE(review): candidates are accepted while mismatches<=minMismatches, so the first accepted
+ //candidate may have maxAllowedMismatches+1 mismatches - confirm this is intended.
+ int minMismatches=maxAllowedMismatches+1;
+ //For situations like RNASEQ with lots of deletions, a higher value of at least .75 should be used.
+
+ int maxContigMatches=0;
+ int bestScore=0;
+ int bestStart=-1;
+ int bestAbsdif=Integer.MAX_VALUE;
+
+ if(searchRight){
+ for(int start=lowerBound; start<=upperBound/* && minMismatches>0*/; start++){
+ int mismatches=0;
+ int contig=0;
+ int currentContig=0;
+ //NOTE(review): contig is only updated when a mismatch ends a run, so the run after the last
+ //mismatch (and the whole read when there are no mismatches) is not counted - confirm intended.
+ for(int j=0; j<bases.length && mismatches<=minMismatches; j++){
+ final byte c=bases[j], r=ref[start+j];
+ if(c!=r || c=='N'){
+ mismatches++;
+ contig=Tools.max(contig, currentContig);
+ currentContig=0;
+ }else{
+ currentContig++;
+ }
+ }
+
+ int score=(bases.length-mismatches)+contig;
+ int absdif=absdif(start, idealStart);
+ if(mismatches<=minMismatches && (score>bestScore || (score==bestScore && absdif<bestAbsdif))){
+ bestStart=start;
+ minMismatches=mismatches;
+ maxContigMatches=contig;
+ bestScore=score;
+ bestAbsdif=absdif;
+ //After a mismatch-free hit, only starts at least as close to idealStart are still worth scanning.
+ if(mismatches==0){upperBound=Tools.min(upperBound, idealStart+absdif);}
+// assert(upperBound>=start && lowerBound<=start);
+// assert(upperBound>=idealStart);
+// assert(lowerBound<=idealStart);
+ }
+ }
+ }else{
+ //Mirror image of the loop above, scanning from upperBound down to lowerBound.
+ for(int start=upperBound; start>=lowerBound/* && minMismatches>0*/; start--){
+ int mismatches=0;
+ int contig=0;
+ int currentContig=0;
+ for(int j=0; j<bases.length && mismatches<=minMismatches; j++){
+ final byte c=bases[j], r=ref[start+j];
+ if(c!=r || c=='N'){
+ mismatches++;
+ contig=Tools.max(contig, currentContig);
+ currentContig=0;
+ }else{
+ currentContig++;
+ }
+ }
+
+ int score=(bases.length-mismatches)+contig;
+ int absdif=absdif(start, idealStart);
+ if(mismatches<=minMismatches && (score>bestScore || (score==bestScore && absdif<bestAbsdif))){
+ bestStart=start;
+ minMismatches=mismatches;
+ maxContigMatches=contig;
+ bestScore=score;
+ bestAbsdif=absdif;
+ if(mismatches==0){lowerBound=Tools.max(lowerBound, idealStart-absdif);}
+// assert(upperBound>=start && lowerBound<=start);
+// assert(upperBound>=idealStart);
+// assert(lowerBound<=idealStart);
+ }
+ }
+ }
+
+ if(bestStart<0){return null;}
+
+ //These scores are dummies and will not quite match the normally generated scores.
+ final int scoreOut;
+ if(USE_AFFINE_SCORE){
+ scoreOut=POINTS_MATCH+(POINTS_MATCH2*(bases.length-1-minMismatches));
+ }else{
+ scoreOut=maxContigMatches+(BASE_HIT_SCORE*(bases.length-minMismatches));
+ }
+
+ SiteScore ss=new SiteScore(chrom, strand, bestStart, bestStart+bases.length-1, 0, scoreOut);
+ ss.setPerfect(bases);
+ ss.rescued=true;
+ ss.setSlowScore(minMismatches); //TODO: Clear this field later!
+ return ss;
+ }
+
+
+ /** Assumes bases/colors are already on the correct strand.
+ * Scans rightward from loc (raised to the chromosome's minIndex if lower) for the ungapped placement
+ * of bases with the fewest mismatches, stopping early at a perfect match.
+ * @param bases Read bases
+ * @param chrom Chromosome number passed to Data.getChromosome
+ * @param loc First start position to try
+ * @param searchDist Number of start positions to try
+ * @return {bestStart, bestStart+bases.length-1, mismatches}; bestStart is -1 if no placement had
+ * fewer than bases.length mismatches */
+ protected final int[] quickerRescue(final byte[] bases, final int chrom, int loc, final int searchDist){
+     final ChromosomeArray chromArray=Data.getChromosome(chrom);
+     final byte[] reference=chromArray.array;
+     final int first=(loc<chromArray.minIndex ? chromArray.minIndex : loc);
+
+     //Exclusive upper bound on start positions: limited by searchDist and by the end of the array.
+     final int stopBefore=Tools.min(first+searchDist-1, reference.length-bases.length)+1;
+
+     int fewest=bases.length;
+     int winner=-1;
+     int candidate=first;
+     while(candidate<stopBefore && fewest>0){
+         int misses=0;
+         int offset=0;
+         while(offset<bases.length && misses<fewest){
+             if(bases[offset]!=reference[candidate+offset]){misses++;}
+             offset++;
+         }
+         if(misses<fewest){
+             winner=candidate;
+             fewest=misses;
+         }
+         candidate++;
+     }
+
+     return new int[] {winner, winner+bases.length-1, fewest};
+ }
+
+
+ /** Processes a read pair; implemented by the concrete mapper thread.
+ * @param r Read to process; presumably its mate is reached through r.mate - TODO confirm
+ * @param basesM1 NOTE(review): presumably the minus-strand bases of the first read - confirm against implementations
+ * @param basesM2 NOTE(review): presumably the minus-strand bases of the second read - confirm against implementations */
+ public abstract void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2);
+
+ /** Drops sites that fall outside their chromosome and clamps sites that span too much reference.
+ * A site is removed when start()<0 or stop() exceeds the chromosome's maxIndex, or, when SAM_OUT is
+ * set, when Data.isSingleScaffold reports that it is not within a single scaffold. A surviving site whose
+ * GapTools.calcGrefLen is at least EXPECTED_LEN_LIMIT gets a new stop and, if it has gaps, GapTools.fixGaps.
+ * TODO: Iterate through loop backwards when removing sites.
+ * @param r Read whose site list is filtered in place
+ * @param DONT_OUTPUT_UNMAPPED_READS Currently unused
+ * @param SAM_OUT Enables the single-scaffold test
+ * @param EXPECTED_LEN_LIMIT Reference length at which a site is clamped
+ * @return Number of sites removed
+ */
+ protected final int removeOutOfBounds(Read r, boolean DONT_OUTPUT_UNMAPPED_READS, boolean SAM_OUT, int EXPECTED_LEN_LIMIT){
+     final ArrayList<SiteScore> sites=r.sites;
+     if(sites==null){return 0;}
+     final int before=sites.size();
+
+     int idx=0;
+     while(idx<sites.size()){
+         final SiteScore site=sites.get(idx);
+         final ChromosomeArray chromArray=Data.getChromosome(site.chrom);
+         final int chromMax=chromArray.maxIndex;
+
+         final boolean outside=(site.start()<0 || site.stop()>chromMax);
+         if(outside || (SAM_OUT && !Data.isSingleScaffold(site.chrom, site.start(), site.stop()))){
+             //TODO: Attempt to trim instead of removing
+             sites.remove(idx); //The next site slides into idx, so idx is not advanced
+             continue;
+         }
+
+         if(GapTools.calcGrefLen(site)>=EXPECTED_LEN_LIMIT){
+             //TODO: Alternately, I could kill the site.
+             site.setStop(site.start()+Tools.min(r.length()+40, EXPECTED_LEN_LIMIT));
+             if(site.gaps!=null){GapTools.fixGaps(site);}
+         }
+         idx++;
+     }
+
+     return before-sites.size();
+ }
+
+ /** Removes every site that overlaps the read's original site.
+ * Overlapping entries are first nulled out, then squeezed out of the list with Tools.condenseStrict.
+ * @param ssl Site list, modified in place; may be null or empty
+ * @param original Site to exclude; may be null
+ * @return Number of sites removed, 0 if either argument is null or the list is empty */
+ protected static final int forbidSelfMapping(ArrayList<SiteScore> ssl, SiteScore original){
+     if(original==null || ssl==null || ssl.isEmpty()){return 0;}
+
+     int nulled=0;
+     final int size=ssl.size(); //set() never changes the size, so it can be read once
+     for(int idx=0; idx<size; idx++){
+         if(ssl.get(idx).overlaps(original, true)){
+             ssl.set(idx, null);
+             nulled++;
+         }
+     }
+
+     if(nulled!=0){Tools.condenseStrict(ssl);}
+     return nulled;
+ }
+
+
+ /** Generate a score penalty based on the presence of errors near the read tips.
+ * The match string is walked inward from each end until more than tiplen read positions have been
+ * consumed. 'm' costs nothing; 'N' or 'C' costs (tiplen+2-cpos); the first symbol of a 'D' run and any
+ * other symbol (asserted to be 'I' or 'S') cost twice that, so errors nearer the tip cost more.
+ * One extra point is added per base that extends a homopolymer of length at least 2 at either tip.
+ * The points are saturated toward 80 and scaled by .0022*maxScore.
+ * Side effect: if a digit is found in the match string, r.match is replaced by
+ * Read.toLongMatchString(r.match) and the calculation restarts.
+ * @param r Read to examine
+ * @param maxScore Score used to scale and to cap the penalty
+ * @param tiplen Number of read positions at each end that are examined
+ * @return 0 if the read is unmapped, has no match string, is shorter than 2*tiplen, earned no points,
+ * or r.mapScore-maxScore/10 is not positive; otherwise the penalty, capped at r.mapScore-maxScore/10 */
+ public static int calcTipScorePenalty(final Read r, final int maxScore, final int tiplen){
+ if(!r.mapped() || r.match==null || r.length()<2*tiplen){return 0;}
+
+ int points=0;
+ final byte[] match=r.match;
+ final byte[] bases=r.bases;
+ final int last=r.length()-1;
+ byte prev='m';
+ //Left tip. cpos counts read positions consumed; deletions do not advance it.
+ for(int i=0, cpos=0; cpos<=tiplen; i++){
+ byte b=match[i];
+ if(b=='m'){
+ cpos++;
+ }else if(b=='D'){
+ if(prev!='D'){points+=2*(tiplen+2-cpos);}
+ }else if(b=='N' || b=='C'){
+ points+=(tiplen+2-cpos);
+ cpos++;
+ }else{
+ if(Character.isDigit(b)){
+ r.match=Read.toLongMatchString(r.match);
+ return calcTipScorePenalty(r, maxScore, tiplen);
+ }
+ assert(b=='I' || b=='S') : ((char)b)+"\n"+new String(match)+"\n"+new String(bases)+"\n";
+ points+=2*(tiplen+2-cpos);
+ cpos++;
+ }
+ prev=b;
+ }
+
+ //Right tip, walking the match string backwards.
+ prev='m';
+ for(int i=match.length-1, cpos=0; cpos<=tiplen; i--){
+ byte b=match[i];
+ if(b=='m'){
+ cpos++;
+ }else if(b=='D'){
+ if(prev!='D'){points+=2*(tiplen+2-cpos);}
+ }else if(b=='N' || b=='C'){
+ points+=(tiplen+2-cpos);
+ cpos++;
+ }else{
+ assert(b=='I' || b=='S');
+ points+=2*(tiplen+2-cpos);
+ cpos++;
+ }
+ prev=b;
+ }
+
+ byte b=bases[0];
+ //homopolymer tip penalty
+ if(b!='N' && b==bases[1]){
+ for(int i=2; i<=tiplen && bases[i]==b; i++){points++;}
+ }
+
+ //homopolymer tip penalty
+ b=bases[last];
+ if(b!='N' && b==bases[last-1]){
+ for(int i=last-2; i>=(last-tiplen) && bases[i]==b; i--){points++;}
+ }
+
+ //Did not seem to help
+// int hits=r.list.get(0).hits;
+// float desired=Tools.min(6, bases.length/12f);
+// if(hits<desired){points+=20*(1-(hits/desired));}
+
+ if(points<1){return 0;}
+// points=Tools.min(points, 40);
+
+ //Saturating transform: f approaches asymptote as points grows.
+ float asymptote=80;
+ float f=((asymptote*points)/(points+asymptote));
+
+ int penalty=(int)(f*.0022f*maxScore);
+ int maxPenalty=r.mapScore-maxScore/10;
+ if(maxPenalty<=0){return 0;}
+ return Tools.min(penalty, maxPenalty);
+
+// final int len=7;
+// int dist1=len+1, dist2=len+1;
+// for(int i=0; i<=len; i++){
+// if(r.match[i]!='m'){
+// dist1=i;
+//// System.out.println("dist1="+dist1+"\n"+new String(r.match));
+// break;
+// }
+// }
+// for(int i=0, tip=r.match.length-1; i<=len; i++){
+// if(r.match[tip-i]!='m'){
+// dist2=i;
+//// System.out.println("dist2="+dist2+"\n"+new String(r.match));
+// break;
+// }
+// }
+// float penalty=0;
+// if(dist1<len){
+// penalty+=(len-dist1+1)*.005f*maxScore;
+// }
+// if(dist2<len){
+// penalty+=(len-dist2+1)*.005f*maxScore;
+// }
+// return (int)penalty;
+ }
+
+
+ /** Subtracts a penalty from a read's mapScore and from the slowScore and score of each of its sites.
+ * Does nothing when penalty is not positive. A read without a site list only has its mapScore lowered.
+ * @param r Read to penalize
+ * @param penalty Amount to subtract */
+ public static void applyScorePenalty(Read r, int penalty){
+     if(penalty<=0){return;}
+     r.mapScore-=penalty;
+     //r.sites may be null; other methods in this class test for that before iterating.
+     if(r.sites==null){return;}
+     for(SiteScore ss : r.sites){
+         ss.setSlowScore(ss.slowScore-penalty);
+         ss.setScore(ss.score-penalty);
+     }
+ }
+
+
+ /** {group of correct hit (or -1), size of correct group, number of groups,
+ * number of elements, correctScore, maxScore, size of top group, num correct, firstElementCorrect,
+ * firstElementCorrectLoose, firstGroupCorrectLoose}
+ * <br>Sites are assumed to be sorted by score. A group is a run of consecutive sites with equal score;
+ * groups are numbered from 1, so the top-scoring group is group 1.
+ * The reference position is r.originalSite, or the first site when the read has no original site.
+ * @param r Read whose sites are evaluated
+ * @param thresh Distance tolerance for a strict hit; the loose test uses thresh+20
+ * @return The 11-element array described above; {-1, 0, ...} when the read has no sites */
+ protected int[] calcCorrectness(Read r, int thresh){
+     //assume sorted.
+     ArrayList<SiteScore> ssl=r.sites;
+
+     if(ssl==null || ssl.isEmpty()){
+         return new int[] {-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+     }
+
+     SiteScore original=r.originalSite;
+     assert((original==null) != (r.synthetic()));
+     if(original==null){
+         original=ssl.get(0);
+     }
+
+     int group=0;
+     int correctGroup=-1;
+     int groupSize=0;
+     int correctGroupSize=-1;
+     int prevScore=Integer.MAX_VALUE;
+     int sizeOfTopGroup=0;
+     SiteScore correct=null;
+
+     int firstElementCorrect=0;
+     int firstElementCorrectLoose=0;
+     int firstGroupCorrectLoose=0;
+
+     int numCorrect=0;
+
+     for(int i=0; i<ssl.size(); i++){
+         SiteScore ss=ssl.get(i);
+         if(ss.score==ssl.get(0).score){sizeOfTopGroup++;}
+
+         //A change in score starts a new group; this fires on the first site, making it group 1.
+         if(prevScore!=ss.score){
+             assert(prevScore>ss.score || (AMBIGUOUS_RANDOM && r.ambiguous()) || r.mate!=null) : "prevScore="+prevScore+", score="+ss.score+
+                 ", i="+i+", r="+r+"\n\nss"+i+" = "+ss+"\n\n"+(i==0 ? "" : "ss"+(i-1)+" = "+ssl.get(i-1));
+
+             //The group that just ended contained the correct site, so its size is now final.
+             if(correctGroup==group){
+                 correctGroupSize=groupSize;
+             }
+
+             group++;
+             groupSize=0;
+             prevScore=ss.score;
+         }
+         groupSize++;
+
+         boolean b=isCorrectHit(ss, original.chrom, original.strand, original.start(), original.stop(), thresh);
+         boolean b2=isCorrectHitLoose(ss, original.chrom, original.strand, original.start(), original.stop(), thresh+20);
+         if(b){
+             if(i==0){firstElementCorrect=1;}
+             numCorrect++;
+             if(correct==null){
+                 correct=ss;
+                 correctGroup=group;
+             }
+         }
+         if(b2){
+             if(i==0){firstElementCorrectLoose=1;}
+             //group has already been incremented for this site, so the top group is 1, not 0.
+             if(group==1){firstGroupCorrectLoose=1;}
+         }
+     }
+     if(correctGroup==group){
+         correctGroupSize=groupSize;
+     }
+
+     assert(correctGroup!=0 && correctGroup<=group);
+     assert(group<=ssl.size());
+     assert(sizeOfTopGroup>0 && sizeOfTopGroup<=ssl.size());
+     assert((correctGroup>0) == (correctGroupSize>0));
+     return new int[] {correctGroup, correctGroupSize, group, ssl.size(),
+         correct==null ? 0 : correct.score, ssl.get(0).score, sizeOfTopGroup, numCorrect, firstElementCorrect,
+         firstElementCorrectLoose, firstGroupCorrectLoose};
+ }
+
+
+ /** Strict correctness test: the site must be on the true chromosome and strand, and both its start
+ * and its stop must lie within thresh of the true coordinates.
+ * @param ss Site to test
+ * @param trueChrom True chromosome
+ * @param trueStrand True strand
+ * @param trueStart True start coordinate
+ * @param trueStop True stop coordinate
+ * @param thresh Maximum allowed coordinate difference
+ * @return true if the site matches under the strict rule */
+ public static final boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh){
+     final boolean sameLocus=(ss.chrom==trueChrom && ss.strand==trueStrand);
+     if(!sameLocus){return false;}
+
+     assert(ss.stop()>ss.start()) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     if(absdif(ss.start(), trueStart)>thresh){return false;}
+     return absdif(ss.stop(), trueStop)<=thresh;
+ }
+
+
+ /** Loose correctness test: the site must be on the true chromosome and strand, and either its start
+ * or its stop must lie within thresh of the true coordinate.
+ * @param ss Site to test
+ * @param trueChrom True chromosome
+ * @param trueStrand True strand
+ * @param trueStart True start coordinate
+ * @param trueStop True stop coordinate
+ * @param thresh Maximum allowed coordinate difference
+ * @return true if the site matches under the loose rule */
+ public static final boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh){
+     final boolean sameLocus=(ss.chrom==trueChrom && ss.strand==trueStrand);
+     if(!sameLocus){return false;}
+
+     assert(ss.stop()>ss.start()) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     if(absdif(ss.start(), trueStart)<=thresh){return true;}
+     return absdif(ss.stop(), trueStop)<=thresh;
+ }
+
+ /** Creates a match string of the given length consisting only of 'm' symbols.
+ * @param len Length of the array to create
+ * @return New array of len bytes, each equal to 'm' */
+ protected static final byte[] makePerfectMatchString(int len){
+     final byte[] matchString=new byte[len];
+     for(int i=0; i<len; i++){
+         matchString[i]=(byte)'m';
+     }
+     return matchString;
+ }
+
+ /** Returns the difference between two ints, larger minus smaller.
+ * @param a First value
+ * @param b Second value
+ * @return a-b if a is greater than b, otherwise b-a */
+ protected static final int absdif(int a, int b){
+     if(a>b){return a-b;}
+     return b-a;
+ }
+
+ /** Returns maximum read length supported by this mapper.
+ * Abstract: each concrete mapper thread supplies its own limit.
+ * @return Maximum supported read length */
+ public abstract int maxReadLength();
+
+ /** Ensure top site is congruent with read.
+ * An unmapped read passes trivially. A mapped read passes only if it has at least one site and its
+ * top site has the same start, stop, strand and chrom as the read and refers to the same match array
+ * (compared by reference, not by content). With assertions enabled, a mismatch raises an AssertionError
+ * instead of returning false.
+ * @param r Read to check
+ * @return true if the read is unmapped or consistent with its top site */
+ protected static final boolean checkTopSite(Read r){
+ if(!r.mapped()){return true;}
+ if(r.numSites()==0){return false;}
+ SiteScore ss=r.topSite();
+ if(ss==null){return false;}
+ boolean b=(ss.start()==r.start) && (ss.stop()==r.stop) && (ss.strand==r.strand()) && (ss.chrom==r.chrom) && (ss.match==r.match);
+ //ss cannot be null here (see the return above), so the "ss is null" alternative in the message is never taken.
+ assert(b) : "\nread="+r+"\nmate="+r.mate+"\nss="+ss+"\n"+(ss==null ? "ss is null" :
+ ((ss.start()==r.start)+", "+(ss.stop()==r.stop)+", "+(ss.strand==r.strand())+", "+(ss.chrom==r.chrom)+", "+(ss.match==r.match))+"\nlist="+r.sites);
+ return b;
+ }
+
+
+ /** Removes every site whose match string contains an indel run longer than maxlen.
+ * @param list Site list, modified in place; may be null or empty
+ * @param maxlen Longest indel run that is still accepted
+ * @return Number of sites removed */
+ protected static final int removeLongIndels(ArrayList<SiteScore> list, int maxlen){
+     if(list==null || list.isEmpty()){return 0;}
+     final int before=list.size();
+     //Walk from the back so that removals do not shift the indices still to be visited.
+     int idx=before;
+     while(--idx>=0){
+         if(hasLongIndel(list.get(idx).match, maxlen)){
+             list.remove(idx);
+         }
+     }
+     return before-list.size();
+ }
+
+ protected static final boolean hasLongIndel(byte[] match, int maxlen){
+ if(match==null || match.length<maxlen){return false;}
+ byte prev='0';
+ int len=0;
+ for(byte b : match){
+ if(b=='D' || b=='I' || b=='X' || b=='Y'){
+ if(b==prev){len++;}
+ else{len=1;}
+ if(len>maxlen){return true;}
+ }else{
+ len=0;
+ }
+ prev=b;
+ }
+ return false;
+ }
+
+ protected static final boolean hasLongInsertion(byte[] match, int maxlen){
+ if(match==null || match.length<maxlen){return false;}
+ byte prev='0';
+ int len=0;
+ for(byte b : match){
+ if(b=='I' || b=='X' || b=='Y'){
+ if(b==prev){len++;}
+ else{len=1;}
+ if(len>maxlen){return true;}
+ }else{
+ len=0;
+ }
+ prev=b;
+ }
+ return false;
+ }
+
+ protected static final boolean hasLongDeletion(byte[] match, int maxlen){
+ if(match==null || match.length<maxlen){return false;}
+ byte prev='0';
+ int len=0;
+ for(byte b : match){
+ if(b=='D'){
+ if(b==prev){len++;}
+ else{len=1;}
+ if(len>maxlen){return true;}
+ }else{
+ len=0;
+ }
+ prev=b;
+ }
+ return false;
+ }
+
+ /** TODO: unfinished.
+ * A read no longer than maxlen is handed to processRead unchanged. A longer read is split with
+ * r.split(minlen, maxlen), but the resulting subreads are not used yet, so such a read is currently
+ * not processed at all.
+ * @param r Read to process
+ * @param basesM Passed through to processRead
+ * @param minlen Minimum subread length; asserted to be at least KEYLEN
+ * @param maxlen Maximum subread length; asserted to be at least minlen */
+ final void processReadSplit(Read r, byte[] basesM, int minlen, int maxlen){
+ assert(minlen>=KEYLEN && maxlen>=minlen) : KEYLEN+", "+maxlen+", "+minlen;
+ int len=r.length();
+ if(len<=maxlen){
+ processRead(r, basesM);
+ return;
+ }
+ //NOTE(review): subreads is never read; mapping of the pieces is still to be implemented.
+ ArrayList<Read> subreads=r.split(minlen, maxlen);
+ }
+
+ /** Returns the finished flag; synchronized on this object, like finish(), which sets it. */
+ public final synchronized boolean finished(){return finished;}
+
+ /** Returns the negation of the finished flag; synchronized on this object. */
+ public final synchronized boolean working(){return !finished;}
+
+ /** Sets the finished flag and wakes every thread waiting on this object's monitor.
+ * Asserts that the flag was not already set, i.e. this is expected to be called once. */
+ final synchronized void finish(){
+ assert(!finished);
+ finished=true;
+ notifyAll();
+ }
+
+ /** Set to true by finish(); read through finished() and working(). */
+ private boolean finished=false;
+
+ /** Per-rank weights used by applyClearzone3, indexed by position in the site list.
+ * Index 0 (the top site itself) is never read because that loop starts at 1. */
+ private static final float[] CZ3_MULTS=new float[] {0f, 1f, .75f, 0.5f, 0.25f, 0.125f, 0.0625f};
+
+ /*--------------------------------------------------------------*/
+
+ /** Input read source. */
+ protected final ConcurrentReadInputStream cris;
+
+
+ /** All reads go here. <br>
+ * If outputunmapped=false, omit unmapped single reads and double-unmapped paired reads. */
+ protected final ConcurrentReadOutputStream outStream;
+ /** All mapped reads (and half-mapped pairs) go here except reads that only map to the blacklist. */
+ protected final ConcurrentReadOutputStream outStreamMapped;
+ /** All unmapped reads (and double-unmapped pairs) go here. */
+ protected final ConcurrentReadOutputStream outStreamUnmapped;
+ /** All reads (and half-mapped pairs) that map best to the blacklist go here. */
+ protected final ConcurrentReadOutputStream outStreamBlack;
+
+
+ /*--------------------------------------------------------------*/
+
+
+ /** NOTE(review): presumably names the MSA implementation held in msa - confirm against the constructor. */
+ public final String MSA_TYPE;
+ /** Aligner instance; not used in the methods of this section. */
+ final MSA msa;
+// final TranslateColorspaceRead tcr;
+ public final ReadStats readstats;
+ public final CoveragePileup pileup;
+ /** NOTE(review): quickRescue takes same-named parameters for its affine dummy score; presumably callers pass these fields - confirm. */
+ public final int POINTS_MATCH, POINTS_MATCH2;
+ /** Key length; processReadSplit asserts that subreads are at least this long. */
+ public final int KEYLEN;
+
+ protected final boolean PERFECTMODE; //Only look for perfect matches
+ protected final boolean SEMIPERFECTMODE; //Only look for perfect and semiperfect matches
+ protected final boolean FORBID_SELF_MAPPING; //Do not allow reads to map to their official origin. Allows you to find next-best matches (when supported)
+ protected final boolean RCOMP_MATE; //Reverse-complement mate prior to mapping
+ protected static boolean RCOMP=false;
+ /** True if this thread should generate a match string for the best match */
+ protected final boolean MAKE_MATCH_STRING;
+
+ protected final boolean OUTPUT_MAPPED_ONLY;
+ protected final boolean DONT_OUTPUT_BLACKLISTED_READS;
+ protected final boolean PRINT_SECONDARY_ALIGNMENTS;
+ protected final boolean QUICK_MATCH_STRINGS;
+ protected final boolean USE_SS_MATCH_FOR_PRIMARY=true;
+
+ protected final int MAX_SITESCORES_TO_PRINT;
+
+ /** Scores below the (max possible alignment score)*(MINIMUM_ALIGNMENT_SCORE_RATIO) will be discarded.
+ * Default: 0.4 for synthetic data. */
+ protected final float MINIMUM_ALIGNMENT_SCORE_RATIO;
+ protected final float MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE;
+ protected final float MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED;
+
+ protected final float keyDensity;
+ protected final float maxKeyDensity;
+ protected final float minKeyDensity;
+ protected final int maxDesiredKeys;
+
+ /*--------------------------------------------------------------*/
+
+ final int CLEARZONE1e;
+
+ /*--------------------------------------------------------------*/
+
+ final int MIN_APPROX_HITS_TO_KEEP;
+ final boolean USE_EXTENDED_SCORE;
+ public static final boolean GENERATE_BASE_SCORES_FROM_QUALITY=AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY;
+ final int BASE_HIT_SCORE;
+ final int BASE_KEY_HIT_SCORE;
+ final boolean USE_AFFINE_SCORE;
+ final int EXPECTED_LEN_LIMIT;
+ final int MAX_INDEL;
+
+ final boolean TRIM_LIST;
+ final int TIP_DELETION_SEARCH_RANGE;
+ final boolean FIND_TIP_DELETIONS;
+ final int ALIGN_COLUMNS;
+
+ /*--------------------------------------------------------------*/
+
+
+ /** Use dynamic programming slow-alignment phase to increase quality. Program may not run anymore if this is disabled. */
+ protected final boolean SLOW_ALIGN;
+ /** Produce local alignments instead of global alignments */
+ protected final boolean LOCAL_ALIGN;
+ /** Discard reads with ambiguous alignments (consider them unmapped). */
+ protected final boolean AMBIGUOUS_TOSS;
+ /** Choose a random site for reads with ambiguous alignments. */
+ protected final boolean AMBIGUOUS_RANDOM;
+ /** Output all sites for reads with ambiguous alignments. */
+ protected final boolean AMBIGUOUS_ALL;
+ /** Quality-trim left side of reads before mapping. */
+ protected final boolean TRIM_LEFT;
+ /** Quality-trim right side of reads before mapping. */
+ protected final boolean TRIM_RIGHT;
+ /** Undo quality trimming after mapping. */
+ protected final boolean UNTRIM;
+ /** Trim until 2 consecutive bases are encountered with at least this quality. */
+ protected final byte TRIM_QUAL;
+ /** Don't trim reads to be shorter than this */
+ protected final int TRIM_MIN_LENGTH;
+ /** Distance cutoff for classifying a read as loosely correct */
+ protected final int THRESH;
+ /** Semi-deprecated. Minimum chrom to index or load. */
+ protected final int minChrom;
+ /** Semi-deprecated. Maximum chrom to index or load. */
+ protected final int maxChrom;
+ /** Disallow sites that do not have at least k consecutive matching bases. */
+ protected final int KFILTER;
+ /** Disallow sites with identity below this. */
+ protected final float IDFILTER;
+ /** Do advanced filtering on number of specific types of edits */
+ protected final boolean PROCESS_EDIT_FILTER;
+
+
+ /** When reads are not in valid pairing orientation, eliminate (mark unmapped) the lower-scoring read. */
+ protected final boolean KILL_BAD_PAIRS;
+ /** For human genome, map ambiguous reads in the PAR to the X chromosome. */
+ protected final boolean SAVE_AMBIGUOUS_XY;
+ /** Deprecated. Must be set to true. */
+ protected final boolean GEN_MATCH_FAST=true;
+
+ /** Padding for dynamic-programming slow alignment. */
+ protected final int SLOW_ALIGN_PADDING;
+ /** Padding for dynamic-programming slow alignment for rescued reads (which typically may need more padding). */
+ protected final int SLOW_RESCUE_PADDING;
+ /** If a site is unpaired, search nearby for a possible site for the other read. */
+ protected final boolean DO_RESCUE;
+ /** Forbid alignments with indels longer than MAX_INDEL */
+ protected final boolean STRICT_MAX_INDEL;
+ /** Bandwidth of banded MSA */
+ protected final int BANDWIDTH;
+
+ protected final boolean PAIRED;
+ protected final boolean REQUIRE_CORRECT_STRANDS_PAIRS;
+ protected final boolean SAME_STRAND_PAIRS;
+
+ /*--------------------------------------------------------------*/
+
+ static int INITIAL_AVERAGE_PAIR_DIST=100;
+ protected int AVERAGE_PAIR_DIST;
+ protected float AVERAGE_PAIRING_RATE=0;
+
+ /** Extra padding for when slow alignment fails. */
+ protected int EXTRA_PADDING=10;
+
+ protected final boolean GENERATE_KEY_SCORES_FROM_QUALITY;
+
+ /*--------------------------------------------------------------*/
+
+ protected static boolean PENALIZE_AMBIG=true;
+ protected static int SUBFILTER=-1;
+ protected static int DELFILTER=-1;
+ protected static int INSFILTER=-1;
+ protected static int INDELFILTER=-1;
+ protected static int DELLENFILTER=-1;
+ protected static int INSLENFILTER=-1;
+ protected static int EDITFILTER=-1;
+
+ protected static boolean OUTPUT_SAM=false;
+
+ protected static float SECONDARY_SITE_SCORE_RATIO=.95f;
+ protected static boolean PRINT_SECONDARY_ALIGNMENTS_ONLY_FOR_AMBIGUOUS_READS=false;
+
+ protected static boolean CALC_STATISTICS=true;
+ protected static int MIN_PAIR_DIST=-160;
+ protected static int MAX_PAIR_DIST=32000;
+ protected static int MAX_RESCUE_DIST=1200;
+ protected static int MAX_RESCUE_MISMATCHES=32;
+ /** IMPORTANT!!!! This option causes non-deterministic output. */
+ protected static final boolean DYNAMIC_INSERT_LENGTH=true;
+ /** Counts undefined bases. */
+ protected static final boolean DISCARD_MOSTLY_UNDEFINED_READS=true;
+ protected static byte MIN_AVERAGE_QUALITY=0;
+ protected static int MIN_AVERAGE_QUALITY_BASES=0;
+ protected static boolean TIME_TAG=false;
+ protected static boolean CLEAR_ATTACHMENT=true;
+
+ protected static final byte TIP_DELETION_MIN_QUALITY=6;
+ protected static final byte TIP_DELETION_AVG_QUALITY=14;
+ protected static final int TIP_DELETION_MAX_TIPLEN=8;
+
+ protected static final int OUTER_DIST_MULT=14;
+// protected static final int OUTER_DIST_MULT2=OUTER_DIST_MULT-1;
+ protected static final int OUTER_DIST_DIV=32;
+
+ protected static long SKIP_INITIAL=0;
+
+ protected static boolean OUTPUT_PAIRED_ONLY=false;
+
+ protected static int MAX_READ_LENGTH=0;
+ protected static int MIN_READ_LENGTH=0;
+
+ protected static boolean USE_MODULO=false;
+
+ protected static int MAX_TRIM_SITES_TO_RETAIN=800;
+
+// static{if(OUTER_DIST_MULT2<1){throw new RuntimeException();}}
+
+ /*--------------------------------------------------------------*/
+
+ public long totalNumCorrect1=0;
+ public long totalNumIncorrect1=0;
+ public long totalNumIncorrectPrior1=0;
+ public long totalNumCapturedAllCorrect1=0;
+ public long totalNumCapturedAllCorrectTop1=0;
+ public long totalNumCapturedAllCorrectOnly1=0;
+
+ public long totalNumCorrect2=0;
+ public long totalNumIncorrect2=0;
+ public long totalNumIncorrectPrior2=0;
+ public long totalNumCapturedAllCorrect2=0;
+ public long totalNumCapturedAllCorrectTop2=0;
+ public long totalNumCapturedAllCorrectOnly2=0;
+
+ /*--------------------------------------------------------------*/
+
+ public boolean verbose=false;
+ public static final boolean verboseS=false;
+
+ public long readsUsed1=0;
+ public long readsUsed2=0;
+ public long basesUsed1=0;
+ public long basesUsed2=0;
+ public long numMated=0;
+ public long numMatedBases=0;
+ public long badPairs=0;
+ public long badPairBases=0;
+ public long innerLengthSum=0;
+ public long outerLengthSum=0;
+ public long insertSizeSum=0;
+ public long keysUsed=0;
+ public long syntheticReads=0;
+ public long bothUnmapped=0;
+ public long bothUnmappedBases=0;
+
+ public long mapped1=0;
+ public long mappedRetained1=0;
+ public long mappedRetainedBases1=0;
+ public long rescuedP1=0;
+ public long rescuedM1=0;
+ public long truePositiveP1=0;
+ public long truePositiveM1=0;
+ public long falsePositive1=0;
+ public long totalCorrectSites1=0;
+
+ public long firstSiteCorrectP1=0;
+ public long firstSiteCorrectM1=0;
+ public long firstSiteIncorrect1=0;
+ public long firstSiteCorrectLoose1=0;
+ public long firstSiteIncorrectLoose1=0;
+ public long firstSiteCorrectPaired1=0;
+ public long firstSiteCorrectSolo1=0;
+ public long firstSiteCorrectRescued1=0;
+
+ public long matchCountS1=0;
+ public long matchCountI1=0;
+ public long matchCountD1=0;
+ public long matchCountM1=0;
+ public long matchCountN1=0;
+
+ public long readCountE1=0;
+ public long readCountS1=0;
+ public long readCountI1=0;
+ public long readCountD1=0;
+ public long readCountN1=0;
+ public long readCountSplice1=0;
+
+ public long perfectHit1=0; //Highest quick score is max quick score
+ public long uniqueHit1=0; //Only one hit has highest score
+ public long correctUniqueHit1=0; //unique highest hit on answer site
+ public long correctMultiHit1=0; //non-unique highest hit on answer site
+ public long correctLowHit1=0; //hit on answer site, but not highest scorer
+ public long noHit1=0;
+
+ /** Number of perfect hit sites found */
+ public long perfectHitCount1=0;
+ /** Number of sites found that are perfect except for no-ref */
+ public long semiPerfectHitCount1=0;
+
+
+ public long perfectMatch1=0; //Highest slow score is max slow score
+ public long semiperfectMatch1=0;
+ public long perfectMatchBases1=0;
+ public long semiperfectMatchBases1=0;
+ public long ambiguousBestAlignment1=0;
+ public long ambiguousBestAlignmentBases1=0;
+
+ public long initialSiteSum1=0;
+ public long postTrimSiteSum1=0;
+ public long postRescueSiteSum1=0;
+ public long siteSum1=0;
+ public long topSiteSum1=0;
+
+ public long lowQualityReadsDiscarded1=0;
+ public long lowQualityBasesDiscarded1=0;
+
+ public long mapped2=0;
+ public long mappedRetained2=0;
+ public long mappedRetainedBases2=0;
+ public long rescuedP2=0;
+ public long rescuedM2=0;
+ public long truePositiveP2=0;
+ public long truePositiveM2=0;
+ public long falsePositive2=0;
+ public long totalCorrectSites2=0;
+
+ public long firstSiteCorrectP2=0;
+ public long firstSiteCorrectM2=0;
+ public long firstSiteIncorrect2=0;
+ public long firstSiteCorrectLoose2=0;
+ public long firstSiteIncorrectLoose2=0;
+ public long firstSiteCorrectPaired2=0;
+ public long firstSiteCorrectSolo2=0;
+ public long firstSiteCorrectRescued2=0;
+
+ public long matchCountS2=0;
+ public long matchCountI2=0;
+ public long matchCountD2=0;
+ public long matchCountM2=0;
+ public long matchCountN2=0;
+
+ public long readCountE2=0;
+ public long readCountS2=0;
+ public long readCountI2=0;
+ public long readCountD2=0;
+ public long readCountN2=0;
+ public long readCountSplice2=0;
+
+ public long perfectHit2=0; //Highest quick score is max quick score
+ public long uniqueHit2=0; //Only one hit has highest score
+ public long correctUniqueHit2=0; //unique highest hit on answer site
+ public long correctMultiHit2=0; //non-unique highest hit on answer site
+ public long correctLowHit2=0; //hit on answer site, but not highest scorer
+ public long noHit2=0;
+
+ /** Number of perfect hit sites found */
+ public long perfectHitCount2=0;
+ /** Number of sites found that are perfect except for no-ref */
+ public long semiPerfectHitCount2=0;
+
+ public long perfectMatch2=0; //Highest slow score is max slow score
+ public long semiperfectMatch2=0;
+ public long perfectMatchBases2=0;
+ public long semiperfectMatchBases2=0;
+ public long ambiguousBestAlignment2=0;
+ public long ambiguousBestAlignmentBases2=0;
+
+ public long initialSiteSum2=0;
+ public long postTrimSiteSum2=0;
+ public long postRescueSiteSum2=0;
+ public long siteSum2=0;
+ public long topSiteSum2=0;
+
+ public long lowQualityReadsDiscarded2=0;
+ public long lowQualityBasesDiscarded2=0;
+
+ /*--------------------------------------------------------------*/
+
+ int idmodulo;
+}
diff --git a/current/align2/AbstractMapper.java b/current/align2/AbstractMapper.java
new file mode 100755
index 0000000..66c51ab
--- /dev/null
+++ b/current/align2/AbstractMapper.java
@@ -0,0 +1,2740 @@
+package align2;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import jgi.CoveragePileup;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.RandomReadInputStream3;
+import stream.Read;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+import stream.SequentialReadInputStream;
+import stream.SiteScore;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * Abstract superclass created from BBMap variants.
+ * Handles argument parsing, I/O stream initialization and shutdown,
+ * thread management, statistics collection and formatting.
+ * @author Brian Bushnell
+ * @date Oct 15, 2013
+ *
+ */
+public abstract class AbstractMapper {
+
+ public AbstractMapper(String[] args){
+ if(Shared.COMMAND_LINE==null){
+ Shared.COMMAND_LINE=(args==null ? null : args.clone());
+ Shared.BBMAP_CLASS=this.getClass().getName();
+ int x=Shared.BBMAP_CLASS.lastIndexOf('.');
+ if(x>=0){Shared.BBMAP_CLASS=Shared.BBMAP_CLASS.substring(x+1);}
+ }
+ setDefaults();
+ String[] args2=preparse0(args);
+ String[] args3=preparse(args2);
+ parse(args3);
+ postparse(args3);
+ setup();
+ checkFiles();
+ }
+
+ void printOptions(){
+ sysout.println("For help, please consult readme.txt or run the shellscript with no parameters.");
+ }
+
+ /**
+  * Calls closeStreams on the read input stream (cris) and the four output streams
+  * (rosA, rosM, rosU, rosB), then passes the message to KillSwitch.kill.
+  * @param mtts Worker threads; not used by the current implementation.
+  * @param message Text handed to KillSwitch.kill; null is replaced by the empty string.
+  */
+ final void abort(AbstractMapThread[] mtts, String message){
+ 	closeStreams(cris, rosA, rosM, rosU, rosB);
+ 	final String text=(message!=null ? message : "");
+ 	KillSwitch.kill(text);
+ }
+
+ /**
+  * Lowers the configured thread count when the memory the JVM can still obtain is not enough
+  * to give every thread its budget. All quantities are in megabytes (bytes/1000000).
+  * If not even one thread fits, abort() is called with a message suggesting a minimum memory size.
+  * @param threadMem Memory budget per thread, in megabytes.
+  */
+ final void adjustThreadsforMemory(long threadMem){
+ 	final Runtime runtime=Runtime.getRuntime();
+ 	final long maxMB=runtime.maxMemory()/1000000;
+ 	final long totalMB=runtime.totalMemory()/1000000;
+ 	final long freeMB=runtime.freeMemory()/1000000;
+ 	final long usedMB=totalMB-freeMB;
+ 	final long availableMB=maxMB-usedMB;
+ 	//100 MB is held back before the remainder is divided among the threads.
+ 	final int threadLimit=(int)((availableMB-100)/threadMem);
+ 	if(Shared.threads()<=threadLimit){return;}
+
+ 	System.err.println("\nMax Memory = "+maxMB+" MB\nAvailable Memory = "+availableMB+" MB");
+ 	if(threadLimit<1){
+ 		//Suggested size: current usage plus the reserve plus one thread's budget, with a 15% margin.
+ 		final long suggestedMB=(long)((usedMB+100+threadMem)*1.15);
+ 		abort(null, "\n\nNot enough memory. Please run on a node with at least "+suggestedMB+" MB.\n");
+ 	}
+ 	System.err.println("Reducing threads from "+Shared.threads()+" to "+threadLimit+" due to low system memory.");
+ 	Shared.setThreads(threadLimit);
+ }
+
+ /** Subclass hook; the first call the constructor makes after recording the command line, before any argument parsing. */
+ abstract void setDefaults();
+
+ /**
+  * Subclass hook for argument preprocessing. The constructor passes in the result of preparse0
+  * and hands the returned array to both parse and postparse.
+  * @param args Arguments as returned by preparse0.
+  * @return The argument array to be parsed.
+  */
+ abstract String[] preparse(String[] args);
+
+ /**
+  * Subclass hook called by the constructor immediately after parse.
+  * @param args The same array that parse received (the return value of preparse).
+  */
+ abstract void postparse(String[] args);
+
+ /** Subclass hook called by the constructor after postparse and before checkFiles. */
+ abstract void setup();
+
+ /** Subclass hook; presumably loads or builds the reference index. NOTE(review): not called in the visible code; confirm against callers. */
+ abstract void loadIndex();
+
+ /** Subclass hook; NOTE(review): likely tied to the ambiguous2/ambig2 option, which sets BBSplitter.AMBIGUOUS2_MODE; confirm. */
+ abstract void processAmbig2();
+
+ /**
+  * Subclass hook; NOTE(review): by its name this looks like the main mapping/benchmark entry point; confirm against callers.
+  * @param args Command-line arguments.
+  */
+ abstract void testSpeed(String[] args);
+
+ /** Subclass hook; presumably applies the settings for the semiperfectmode option. TODO confirm. */
+ abstract void setSemiperfectMode();
+
+ /** Subclass hook; presumably applies the settings for the perfectmode option. TODO confirm. */
+ abstract void setPerfectMode();
+
+ /**
+  * Subclass hook; presumably reports the active settings.
+  * @param k NOTE(review): probably the key length (compare the k/keylen option); confirm.
+  */
+ abstract void printSettings(int k);
+
+ private final void parse(String[] args){
+
+
+ sysout.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+ sysout.println("BBMap version "+Shared.BBMAP_VERSION_STRING);
+
+ Timer t=new Timer();
+
+ Read.TO_UPPER_CASE=true;
+
+ boolean setMaxIndel1=false, setMaxIndel2=false;
+ boolean forceRebuild=false;
+ Parser parser=new Parser();
+ parser.minTrimLength=minTrimLength;
+
+
+ for(int i=0; i<args.length; i++){
+ final String arg=(args[i]==null ? "null" : args[i]);
+ final String[] split=arg.split("=");
+ final String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+// System.err.println("Processing "+arg);
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ if(a.equals("ziplevel") || a.equals("zl")){//Handle conflated term
+ ziplevel=Integer.parseInt(b);
+ }
+ }else if(Parser.parseHist(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseSam(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCommon(arg, a, b)){
+ //do nothing
+ }else if(parser.parseMapping(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("printtoerr")){
+ if(Tools.parseBoolean(b)){
+ sysout=System.err;
+ Data.sysout=System.err;
+ }
+ }else if(a.equals("path") || a.equals("root")){
+ Data.setPath(b);
+ }else if(a.equals("ref") || a.equals("reference") || a.equals("fasta")){
+ reference=b;
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("qfin") || a.equals("qfin1")){
+ qfin1=b;
+ }else if(a.equals("qfin2")){
+ qfin2=b;
+ }else if(a.equals("out")){
+ if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")){
+ outFile=null;
+ }else{
+ outFile=b;
+// outFile=b.replace('#', '1');
+// outFile2=(b.contains("#") ? b.replace('#', '2') : null);
+ }
+ }else if(a.equals("out1")){
+ outFile=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ if(outFile==null){
+ outFile=null;
+ }
+ }else if(a.equals("out2")){
+ outFile2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ }else if(a.equals("outm") || a.equals("outm1") || a.equals("outmapped") || a.equals("outmapped1")){
+ outFileM=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ }else if(a.equals("outm2") || a.equals("outmapped2")){
+ outFileM2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ }else if(a.equals("outu") || a.equals("outu1") || a.equals("outunmapped") || a.equals("outunmapped1")){
+ outFileU=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ }else if(a.equals("outu2") || a.equals("outunmapped2")){
+ outFileU2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ }else if(a.equals("outb") || a.equals("outb1") || a.equals("outblack") || a.equals("outblack1") || a.equals("outblacklist") || a.equals("outblacklist1")){
+ outFileB=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ }else if(a.equals("outb2") || a.equals("outblack2") || a.equals("outblacklist2")){
+ outFileB2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+ }else if(a.equals("blacklist") && !Data.scaffoldPrefixes){
+ if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")){blacklist=null;}
+ else{
+ if(blacklist==null){blacklist=new ArrayList<String>();}
+ if(b.indexOf(',')<0 || new File(b).exists()){blacklist.add(b);}
+ else{
+ String[] temp=b.split(",");
+ for(String tmp : temp){blacklist.add(tmp);}
+ }
+ }
+ }else if(a.startsWith("out_") && b!=null){
+ //ignore, it will be processed later
+ if(splitterOutputs==null){splitterOutputs=new ArrayList<String>();}
+ splitterOutputs.add(b);
+ }else if(a.equals("bamscript") || a.equals("bs")){
+ bamscript=b;
+ }else if(a.equals("local")){
+ LOCAL_ALIGN=Tools.parseBoolean(b);
+ }else if(a.equals("averagepairdist") || a.equals("apd")){
+ AbstractMapThread.INITIAL_AVERAGE_PAIR_DIST=(int)Tools.parseKMG(b);
+ }else if(a.equals("skipreads")){
+ AbstractMapThread.SKIP_INITIAL=Tools.parseKMG(b);
+ }else if(a.equals("readlen") || a.equals("length") || a.equals("len")){
+ synthReadlen=Integer.parseInt(b);
+ }else if(a.equals("kfilter")){
+ KFILTER=Integer.parseInt(b);
+ }else if(a.equals("msa")){
+ MSA_TYPE=b;
+ }else if(a.equals("bandwidth") || a.equals("bw")){
+ int x=Tools.max(0, Integer.parseInt(b));
+ MSA.bandwidth=x;
+ }else if(a.equals("bandwidthratio") || a.equals("bwr")){
+ float x=Tools.max(0, Float.parseFloat(b));
+ MSA.bandwidthRatio=x;
+ assert(x>=0) : "Bandwidth ratio should be at least 0.";
+ }else if(a.equals("eono") || a.equals("erroronnooutput")){
+ ERROR_ON_NO_OUTPUT=Tools.parseBoolean(b);
+ }else if(a.equals("log")){
+ RefToIndex.LOG=Tools.parseBoolean(b);
+ }else if(a.equals("sitesonly") || a.equals("outputsitesonly")){
+ outputSitesOnly=Tools.parseBoolean(b);
+ sysout.println("Set outputSitesOnly to "+outputSitesOnly);
+ }else if(a.equals("discardambiguous") || a.equals("tossambiguous")){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=Tools.parseBoolean(b);
+ sysout.println("Set REMOVE_DUPLICATE_BEST_ALIGNMENTS to "+REMOVE_DUPLICATE_BEST_ALIGNMENTS);
+ }else if(a.equals("ambiguous") || a.equals("ambig")){
+ if(b==null){
+ throw new RuntimeException(arg);
+ }else if(b.equalsIgnoreCase("keep") || b.equalsIgnoreCase("best") || b.equalsIgnoreCase("first")){
+ ambigMode=AMBIG_BEST;
+ }else if(b.equalsIgnoreCase("all")){
+ ambigMode=AMBIG_ALL;
+ }else if(b.equalsIgnoreCase("random")){
+ ambigMode=AMBIG_RANDOM;
+ }else if(b.equalsIgnoreCase("toss") || b.equalsIgnoreCase("discard") || b.equalsIgnoreCase("remove")){
+ ambigMode=AMBIG_TOSS;
+ }else{
+ throw new RuntimeException(arg);
+ }
+// sysout.println("Set REMOVE_DUPLICATE_BEST_ALIGNMENTS to "+REMOVE_DUPLICATE_BEST_ALIGNMENTS);
+ }else if(a.equals("penalizeambiguous") || a.equals("penalizeambig") || a.equals("pambig")){
+ AbstractMapThread.PENALIZE_AMBIG=SamLine.PENALIZE_AMBIG=Tools.parseBoolean(b);
+ }else if(a.equals("maxsites")){
+ int x=Integer.parseInt(b);
+ assert(x>0) : "maxsites must be at least 1.";
+ MAX_SITESCORES_TO_PRINT=Tools.max(x, 1);
+ AbstractMapThread.MAX_TRIM_SITES_TO_RETAIN=Tools.max(MAX_SITESCORES_TO_PRINT*2, AbstractMapThread.MAX_TRIM_SITES_TO_RETAIN);
+ }else if(a.equals("maxsites2")){
+ int x=Integer.parseInt(b);
+ assert(x>1) : "maxsites2 must be at least 2.";
+ AbstractMapThread.MAX_TRIM_SITES_TO_RETAIN=Tools.max(x, 2);
+ }else if(a.equals("secondary")){
+ PRINT_SECONDARY_ALIGNMENTS=Tools.parseBoolean(b);
+ ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=PRINT_SECONDARY_ALIGNMENTS;
+ }else if(a.equals("sssr") || a.equals("secondarysitescoreratio")){
+ AbstractMapThread.SECONDARY_SITE_SCORE_RATIO=Float.parseFloat(b);
+ }else if(a.equals("ssao") || a.equals("secondarysiteasambiguousonly")){
+ AbstractMapThread.PRINT_SECONDARY_ALIGNMENTS_ONLY_FOR_AMBIGUOUS_READS=Tools.parseBoolean(b);
+ }else if(a.equals("quickmatch")){
+ QUICK_MATCH_STRINGS=Tools.parseBoolean(b);
+ }else if(a.equals("ambiguous2") || a.equals("ambig2")){
+ if(b==null){
+ throw new RuntimeException(arg);
+ }else if(b.equalsIgnoreCase("split") || b.equalsIgnoreCase("stream")){
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_SPLIT;
+ }else if(b.equalsIgnoreCase("keep") || b.equalsIgnoreCase("best") || b.equalsIgnoreCase("first")){
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST;
+ }else if(b.equalsIgnoreCase("toss") || b.equalsIgnoreCase("discard") || b.equalsIgnoreCase("remove")){
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_TOSS;
+ }else if(b.equalsIgnoreCase("random")){
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_RANDOM;
+ }else if(b.equalsIgnoreCase("all")){
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_ALL;
+ }else{
+ throw new RuntimeException(arg);
+ }
+ }else if(a.equals("forbidselfmapping")){
+ FORBID_SELF_MAPPING=Tools.parseBoolean(b);
+ sysout.println("Set FORBID_SELF_MAPPING to "+FORBID_SELF_MAPPING);
+ }else if(a.equals("match") || a.equals("cigar")){
+ if(b!=null){b=b.toLowerCase();}else{b="true";}
+ if(b.equals("long") || b.equals("normal")){
+ MAKE_MATCH_STRING=true;
+ Read.COMPRESS_MATCH_BEFORE_WRITING=false;
+// sysout.println("Writing long match strings.");
+ }else if(b.equals("short") || b.equals("compressed")){
+ MAKE_MATCH_STRING=true;
+ Read.COMPRESS_MATCH_BEFORE_WRITING=true;
+// sysout.println("Writing short match strings.");
+ }else{
+ MAKE_MATCH_STRING=Tools.parseBoolean(b);
+ }
+
+ if(MAKE_MATCH_STRING){
+ sysout.println("Cigar strings enabled.");
+ }else{
+ sysout.println("Cigar strings disabled.");
+ }
+ }else if(a.equals("semiperfectmode")){
+ SEMIPERFECTMODE=Tools.parseBoolean(b);
+ if(ziplevel==-1){ziplevel=2;}
+ }else if(a.equals("perfectmode")){
+ PERFECTMODE=Tools.parseBoolean(b);
+ if(ziplevel==-1){ziplevel=2;}
+ }else if(a.equals("trimlist")){
+ TRIM_LIST=Tools.parseBoolean(b);
+ }else if(a.equals("pairedrandom")){
+ PAIRED_RANDOM_READS=Tools.parseBoolean(b);
+ }else if(a.equals("ordered") || a.equals("ord")){
+ OUTPUT_ORDERED_READS=Tools.parseBoolean(b);
+ sysout.println("Set OUTPUT_ORDERED_READS to "+OUTPUT_ORDERED_READS);
+ }else if(a.equals("outputunmapped")){
+ OUTPUT_MAPPED_ONLY=!Tools.parseBoolean(b);
+ sysout.println("Set OUTPUT_MAPPED_ONLY to "+OUTPUT_MAPPED_ONLY);
+ }else if(a.equals("mappedonly")){
+ OUTPUT_MAPPED_ONLY=Tools.parseBoolean(b);
+ sysout.println("Set OUTPUT_MAPPED_ONLY to "+OUTPUT_MAPPED_ONLY);
+ }else if(a.equals("outputblacklisted")){
+ DONT_OUTPUT_BLACKLISTED_READS=!Tools.parseBoolean(b);
+ sysout.println("Set DONT_OUTPUT_BLACKLISTED_READS to "+DONT_OUTPUT_BLACKLISTED_READS);
+ }else if(a.equals("indexloaded")){
+ INDEX_LOADED=Tools.parseBoolean(b);
+ }else if(a.equals("build") || a.equals("genome") || a.equals("index")){
+ build=Integer.parseInt(b);
+ }else if(a.equals("minchrom")){
+ minChrom=Integer.parseInt(b);
+ maxChrom=Tools.max(minChrom, maxChrom);
+ }else if(a.equals("maxchrom")){
+ maxChrom=Byte.parseByte(b);
+ minChrom=Tools.min(minChrom, maxChrom);
+ }else if(a.equals("expectedsites")){
+ expectedSites=Integer.parseInt(b);
+ }else if(a.equals("targetsize")){
+ targetGenomeSize=Tools.parseKMG(b);
+ }else if(a.equals("fgte")){
+ fractionGenomeToExclude=Float.parseFloat(b);
+ sysout.println("Set fractionGenomeToExclude to "+String.format("%.4f",fractionGenomeToExclude));
+ }else if(a.equals("minratio")){
+ MINIMUM_ALIGNMENT_SCORE_RATIO=Float.parseFloat(b);
+ sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO));
+ minid=-1;
+ }else if(a.equals("minidentity") || a.equals("minid")){
+ if(b.lastIndexOf('%')==b.length()-1){minid=Double.parseDouble(b.substring(b.length()-1))/100;}
+ else{minid=Double.parseDouble(b);}
+ assert(minid>=0 && minid<=100) : "min identity must be between 0 and 1. Values from 1 to 100 will be assumed percent and divided by 100.";
+ }else if(a.equals("rcompmate") || a.equals("reversecomplementmate")){
+ rcompMate=Tools.parseBoolean(b);
+ sysout.println("Set RCOMP_MATE to "+rcompMate);
+ }else if(a.equals("rcomp") || a.equals("reversecomplement")){
+ AbstractMapThread.RCOMP=Tools.parseBoolean(b);
+ sysout.println("Set RCOMP to "+rcompMate);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ Read.verbose=verbose;
+ SiteScore.verbose=verbose;
+ TranslateColorspaceRead.verbose=verbose;
+ AbstractIndex.verbose2=verbose;
+ }else if(a.equals("verbosestats")){
+ if(Character.isDigit(b.charAt(0))){
+ verbose_stats=Integer.parseInt(b);
+ }else{
+ verbose_stats=Tools.parseBoolean(b) ? 9 : 0;
+ }
+ }else if(a.equals("maxdellen")){
+ maxDelLen=Integer.parseInt(b);
+ }else if(a.equals("maxinslen")){
+ maxInsLen=Integer.parseInt(b);
+ }else if(a.equals("maxsublen")){
+ maxSubLen=Integer.parseInt(b);
+ }else if(a.equals("minqual")){
+ minQuality=Byte.parseByte(b);
+ midQuality=Tools.max(minQuality, midQuality);
+ maxQuality=Tools.max(midQuality, maxQuality);
+ }else if(a.equals("midqual")){
+ midQuality=Byte.parseByte(b);
+ maxQuality=Tools.max(midQuality, maxQuality);
+ minQuality=Tools.min(minQuality, midQuality);
+ }else if(a.equals("maxqual")){
+ maxQuality=Byte.parseByte(b);
+ midQuality=Tools.min(maxQuality, midQuality);
+ minQuality=Tools.min(minQuality, midQuality);
+ }else if(a.equals("matelen") || a.equals("pairlen")){
+ int x=Integer.parseInt(b);
+ AbstractMapThread.MAX_PAIR_DIST=x;
+ }else if(a.equals("s") || a.equals("snps")){
+ maxSnps=Integer.parseInt(b);
+ baseSnpRate=1;
+ }else if(a.equals("u") || a.equals("subs")){
+ maxInss=Integer.parseInt(b);
+ baseInsRate=1;
+ }else if(a.equals("d") || a.equals("dels")){
+ maxDels=Integer.parseInt(b);
+ baseDelRate=1;
+ }else if(a.equals("i") || a.equals("inss")){
+ maxSubs=Integer.parseInt(b);
+ baseSubRate=1;
+ }else if(a.equals("sequentialoverlap")){
+ sequentialOverlap=Integer.parseInt(b);
+ }else if(a.equals("sequentialstrandalt")){
+ sequentialStrandAlt=Tools.parseBoolean(b);
+ }else if(a.equals("k") || a.equals("keylen")){
+ keylen=Integer.parseInt(b);
+ assert(keylen>0 && keylen<16) : "k must lie between 1 and 15, inclusive.";
+ }else if(a.equals("genscaffoldinfo")){
+ RefToIndex.genScaffoldInfo=Tools.parseBoolean(b);
+ }else if(a.equals("loadscaffolds")){
+ Data.LOAD_SCAFFOLDS=Tools.parseBoolean(b);
+ }else if(a.equals("autoRefToIndex.chrombits")){
+ if("auto".equalsIgnoreCase(b)){RefToIndex.AUTO_CHROMBITS=true;}
+ else{RefToIndex.AUTO_CHROMBITS=Tools.parseBoolean(b);}
+ }else if(a.equals("RefToIndex.chrombits") || a.equals("cbits")){
+ if("auto".equalsIgnoreCase(b)){RefToIndex.AUTO_CHROMBITS=true;}
+ else{
+ RefToIndex.AUTO_CHROMBITS=false;
+ RefToIndex.chrombits=Integer.parseInt(b);
+ }
+ }else if(a.equals("requirecorrectstrand") || a.equals("rcs")){
+ REQUIRE_CORRECT_STRANDS_PAIRS=Tools.parseBoolean(b);
+ }else if(a.equals("samestrandpairs") || a.equals("ssp")){
+ SAME_STRAND_PAIRS=Tools.parseBoolean(b);
+ if(SAME_STRAND_PAIRS){sysout.println("Warning! SAME_STRAND_PAIRS=true mode is not fully tested.");}
+ }else if(a.equals("killbadpairs") || a.equals("kbp")){
+ KILL_BAD_PAIRS=Tools.parseBoolean(b);
+ }else if(a.equals("pairedonly") || a.equals("po")){
+ AbstractMapThread.OUTPUT_PAIRED_ONLY=Tools.parseBoolean(b);
+ }else if(a.equals("idmodulo") || a.equals("idmod")){
+ idmodulo=Integer.parseInt(b);
+ }else if(a.equals("minhits") || a.equals("minapproxhits")){
+ minApproxHits=Integer.parseInt(b);
+ }else if(a.equals("maxindel")){
+ maxIndel1=(int)Tools.max(0, Tools.parseKMG(b));
+ if(!setMaxIndel2){maxIndel2=2*maxIndel1;}
+ }else if(a.equals("maxindel1") || a.equals("maxindelsingle")){
+ maxIndel1=(int)Tools.max(0, Tools.parseKMG(b));
+ maxIndel2=Tools.max(maxIndel1, maxIndel2);
+ setMaxIndel1=true;
+ }else if(a.equals("maxindel2") || a.equals("maxindelsum")){
+ maxIndel2=(int)Tools.max(0, Tools.parseKMG(b));
+ maxIndel1=Tools.min(maxIndel1, maxIndel2);
+ setMaxIndel2=true;
+ }else if(a.equals("strictmaxindel")){
+ if(b!=null && Character.isDigit(b.charAt(0))){
+ maxIndel1=(int)Tools.max(0, Tools.parseKMG(b));
+ if(!setMaxIndel2){maxIndel2=2*maxIndel1;}
+ STRICT_MAX_INDEL=true;
+ }else{
+ STRICT_MAX_INDEL=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("padding")){
+ SLOW_ALIGN_PADDING=Integer.parseInt(b);
+ SLOW_RESCUE_PADDING=SLOW_ALIGN_PADDING;
+ }else if(a.equals("rescue")){
+ RESCUE=Tools.parseBoolean(b);
+ }else if(a.equals("rescuemismatches")){
+ AbstractMapThread.MAX_RESCUE_MISMATCHES=Integer.parseInt(b);
+ }else if(a.equals("rescuedist")){
+ AbstractMapThread.MAX_RESCUE_DIST=(int)Tools.parseKMG(b);
+ }else if(a.equals("tipsearch")){
+ if(b!=null && ("f".equalsIgnoreCase(b) || "false".equalsIgnoreCase(b))){TIP_SEARCH_DIST=0;}
+ else{TIP_SEARCH_DIST=Tools.max(0, Integer.parseInt(b));}
+ }else if(a.equals("dper") || a.equals("dprr")){
+ DOUBLE_PRINT_ERROR_RATE=Tools.parseBoolean(b);
+ }else if(a.equals("chromgz")){
+ Data.CHROMGZ=Tools.parseBoolean(b);
+ }else if(a.equals("nodisk")){
+ RefToIndex.NODISK=Tools.parseBoolean(b);
+ }else if(a.equals("maxchromlen")){
+ RefToIndex.maxChromLen=Tools.parseKMG(b);
+ }else if(a.equals("minscaf") || a.equals("mincontig")){
+ RefToIndex.minScaf=Integer.parseInt(b);
+ }else if(a.equals("midpad") || a.equals("interpad")){
+ RefToIndex.midPad=Integer.parseInt(b);
+ }else if(a.equals("startpad")){
+ RefToIndex.startPad=Integer.parseInt(b);
+ }else if(a.equals("stoppad")){
+ RefToIndex.stopPad=Integer.parseInt(b);
+ }else if(a.equals("forceanalyze")){
+ forceanalyze=Tools.parseBoolean(b);
+ }else if(a.equals("machineoutput") || a.equals("machineout")){
+ MACHINE_OUTPUT=Tools.parseBoolean(b);
+ }else if(a.equals("showprogress") || a.equals("showprogress2")){
+ if(b!=null && Character.isDigit(b.charAt(0))){
+ long x=Tools.parseKMG(b);
+ ConcurrentReadInputStream.PROGRESS_INCR=x;
+ ConcurrentReadInputStream.SHOW_PROGRESS=(x>0);
+ }else{
+ ConcurrentReadInputStream.PROGRESS_INCR=ConcurrentReadInputStream.PROGRESS_INCR<1 ? 1000000 : ConcurrentReadInputStream.PROGRESS_INCR;
+ ConcurrentReadInputStream.SHOW_PROGRESS=Tools.parseBoolean(b);
+ }
+ if(a.equals("showprogress2")){ConcurrentReadInputStream.SHOW_PROGRESS2=ConcurrentReadInputStream.SHOW_PROGRESS;}
+ }else if(a.equals("scafstats") || a.equals("scaffoldstats")){
+ if(b==null && arg.indexOf('=')<0){b="stdout";}
+ if(b==null || b.equalsIgnoreCase("false") || b.equalsIgnoreCase("f") || b.equalsIgnoreCase("none") || b.equalsIgnoreCase("null")){
+ BBSplitter.TRACK_SCAF_STATS=false;
+ BBSplitter.SCAF_STATS_FILE=null;
+ sysout.println("No file specified; not tracking scaffold statistics.");
+ }else{
+ BBSplitter.TRACK_SCAF_STATS=true;
+ BBSplitter.SCAF_STATS_FILE=b;
+ sysout.println("Scaffold statistics will be written to "+b);
+ }
+ }else if(a.equals("setstats") || a.equals("refstats")){
+ if(b==null && arg.indexOf('=')<0){b="stdout";}
+ if(b==null || b.equalsIgnoreCase("false") || b.equalsIgnoreCase("f") || b.equalsIgnoreCase("none") || b.equalsIgnoreCase("null")){
+ BBSplitter.TRACK_SET_STATS=false;
+ BBSplitter.SET_STATS_FILE=null;
+ sysout.println("No file specified; not tracking reference set statistics.");
+ }else{
+ BBSplitter.TRACK_SET_STATS=true;
+ BBSplitter.SET_STATS_FILE=b;
+ sysout.println("Reference set statistics will be written to "+b);
+ }
+ }else if(a.equals("camelwalk")){
+ AbstractIndex.USE_CAMELWALK=Tools.parseBoolean(b);
+ }else if(a.equals("usequality") || a.equals("uq")){
+ AbstractIndex.GENERATE_KEY_SCORES_FROM_QUALITY=AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY=Tools.parseBoolean(b);
+ }else if(a.equals("ignorequality")){
+ AbstractIndex.GENERATE_KEY_SCORES_FROM_QUALITY=AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY=!Tools.parseBoolean(b);
+ }else if(a.equals("keepbadkeys") || a.equals("kbk")){
+ KeyRing.KEEP_BAD_KEYS=Tools.parseBoolean(b);
+ }else if(a.equals("usemodulo") || a.equals("um")){
+ USE_MODULO=AbstractMapThread.USE_MODULO=IndexMaker4.USE_MODULO=IndexMaker5.USE_MODULO=Tools.parseBoolean(b);
+ }else if(a.equals("lowmem") || a.equals("lowram") || a.equals("lowmemory")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){
+ Shared.LOW_MEMORY=true;
+ USE_MODULO=AbstractMapThread.USE_MODULO=IndexMaker4.USE_MODULO=IndexMaker5.USE_MODULO=Tools.parseBoolean(b);
+ }else{
+ Shared.LOW_MEMORY=false;
+ }
+ }else if(a.equals("coveragestats") || a.equals("covstats")){
+ coverageStats=b;
+ }else if(a.equals("coverageminscaf") || a.equals("covminscaf")){
+ coverageMinScaf=Integer.parseInt(b);
+ }else if(a.equals("binnedcoverage") || a.equals("bincov")){
+ coverageBinned=b;
+ }else if(a.equals("coverage") || a.equals("basecov")){
+ coverageBase=b;
+ }else if(a.equals("secondarycoverage") || a.equals("secondarycov")){
+ CoveragePileup.USE_SECONDARY=Tools.parseBoolean(b);
+ }else if(a.equals("coveragehistogram") || a.equals("covhist")){
+ coverageHist=b;
+ }else if(a.equals("normcov")){
+ normcov=b;
+ }else if(a.equals("normcovo")){
+ normcovOverall=b;
+ }else if(a.equals("normb") || a.equals("normbins")){
+ CoveragePileup.NORMALIZE_LENGTH_BINS=Integer.parseInt(b);
+ }else if(a.equals("rpkm") || a.equals("fpkm")){
+ coverageRPKM=b;
+ }else if(a.equals("physicalcoverage") || a.equals("physcov")){
+ coveragePhysical=Tools.parseBoolean(b);
+ }else if(a.equals("32bit") || a.equals("32bits") || a.equals("bits32")){
+ cov32bit=Tools.parseBoolean(b);
+ }else if(a.equals("bitset")){
+ covBitset=Tools.parseBoolean(b);
+ covSetbs=true;
+ }else if(a.equals("arrays")){
+ covArrays=Tools.parseBoolean(b);
+ covSetbs=true;
+ }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+ covNzo=scafNzo=Tools.parseBoolean(b);
+ }else if(a.equals("sortstats") || a.equals("sortscafs")){
+ sortStats=Tools.parseBoolean(b);
+ }else if(a.equals("twocolumn")){
+ covTwocolumn=Tools.parseBoolean(b);
+ }else if(a.equals("ksb") || a.equals("keepshortbins")){
+ covKsb=Tools.parseBoolean(b);
+ }else if(a.equals("covbinsize")){
+ covBinSize=Integer.parseInt(b);
+ }else if(a.equals("strandedcoverage") || a.equals("strandedcov") || a.equals("covstranded")){
+ covStranded=Tools.parseBoolean(b);
+ }else if(a.equals("startcov") || a.equals("covstart")){
+ covStartOnly=Tools.parseBoolean(b);
+ }else if(a.equals("concisecov")){
+ CoveragePileup.CONCISE=Tools.parseBoolean(b);
+ }else if(a.equals("covwindow")){
+ if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
+ CoveragePileup.USE_WINDOW=Tools.parseBoolean(b);
+ }else{
+ CoveragePileup.LOW_COV_WINDOW=Integer.parseInt(b);
+ CoveragePileup.USE_WINDOW=(CoveragePileup.LOW_COV_WINDOW>0);
+ }
+ }else if(a.equals("covwindowavg")){
+ CoveragePileup.LOW_COV_DEPTH=Double.parseDouble(b);
+ }else if(a.equals("delcov") || a.equals("includedels") || a.equals("includedeletions") || a.equals("delcoverage")){
+ CoveragePileup.INCLUDE_DELETIONS=Tools.parseBoolean(b);
+ }else if(a.equals("rebuild")){
+ forceRebuild=Tools.parseBoolean(b);
+ }else if(a.equals("printunmappedcount")){
+ PRINT_UNMAPPED_COUNT=Tools.parseBoolean(b);
+ }else if(a.equals("timetag")){
+ boolean x=Tools.parseBoolean(b);
+ AbstractMapThread.TIME_TAG=x;
+ SamLine.MAKE_TIME_TAG=x;
+ if(x){AbstractMapThread.CLEAR_ATTACHMENT=false;}
+ }else if(a.equals("correctthresh")){
+ CORRECT_THRESH=Integer.parseInt(b);
+ }else if(a.equals("statsfile")){
+ statsOutputFile=b;
+ }else{
+ throw new RuntimeException("Unknown parameter: "+arg);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ TRIM_QUALITY=parser.trimq;
+ AbstractMapThread.MIN_AVERAGE_QUALITY=parser.minAvgQuality;
+ AbstractMapThread.MIN_AVERAGE_QUALITY_BASES=parser.minAvgQualityBases;
+ AbstractMapThread.MIN_READ_LENGTH=parser.minReadLength;
+ AbstractMapThread.MAX_READ_LENGTH=parser.maxReadLength;
+ minTrimLength=parser.minTrimLength;
+ untrim=parser.untrim;
+
+ maxReads=parser.maxReads;
+ overwrite=ReadStats.overwrite=CoveragePileup.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ setintron=SamLine.setintron;
+
+ samplerate=parser.samplerate;
+ sampleseed=parser.sampleseed;
+ IDFILTER=parser.idFilter;
+ build=parser.build;
+ if(IDFILTER>0){
+ if(IDFILTER==1f){PERFECTMODE=true;}
+ MAKE_MATCH_STRING=true;
+ }
+
+ if(parser.subfilter>-1){AbstractMapThread.SUBFILTER=parser.subfilter;}
+ if(parser.delfilter>-1){AbstractMapThread.DELFILTER=parser.delfilter;}
+ if(parser.insfilter>-1){AbstractMapThread.INSFILTER=parser.insfilter;}
+ if(parser.indelfilter>-1){AbstractMapThread.INDELFILTER=parser.indelfilter;}
+ if(parser.dellenfilter>-1){AbstractMapThread.DELLENFILTER=parser.dellenfilter;}
+ if(parser.inslenfilter>-1){AbstractMapThread.INSLENFILTER=parser.inslenfilter;}
+ if(parser.editfilter>-1){AbstractMapThread.EDITFILTER=parser.editfilter;}
+
+ if(ReadStats.COLLECT_TIME_STATS){AbstractMapThread.TIME_TAG=true;}
+ }
+
+ if(forceRebuild){
+ String sf=RefToIndex.summaryLoc(build);
+ if(sf!=null){
+ File f=new File(sf);
+ if(f.exists() && f.isFile()){f.delete();}
+ }
+ }
+
+ ChromosomeArray.CHANGE_UNDEFINED_TO_N_ON_READ=(!INDEX_LOADED);
+
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT && splitterOutputs!=null){
+ ArrayList<String> clone=(ArrayList<String>) splitterOutputs.clone();
+ for(String s : clone){
+ splitterOutputs.add("AMBIGUOUS_"+s);
+ }
+ }
+ }
+
+ private final void checkFiles(){
+ if(in1!=null && in1.contains("#") && !new File(in1).exists()){
+ int pound=in1.lastIndexOf('#');
+ String a=in1.substring(0, pound);
+ String b=in1.substring(pound+1);
+ in1=a+1+b;
+ in2=a+2+b;
+ }
+ if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){sysout.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(outFile!=null && outFile2==null && outFile.contains("#") && !outFile.contains(".sam") && !outFile.contains(".bam") && outFile.contains(".")){
+ int pound=outFile.lastIndexOf('#');
+ String a=outFile.substring(0, pound);
+ String b=outFile.substring(pound+1);
+ outFile=a+1+b;
+ outFile2=a+2+b;
+ }
+
+ if(outFileM!=null && outFileM2==null && outFileM.contains("#") && !outFileM.contains(".sam") && !outFileM.contains(".bam") && outFileM.contains(".")){
+ int pound=outFileM.lastIndexOf('#');
+ String a=outFileM.substring(0, pound);
+ String b=outFileM.substring(pound+1);
+ outFileM=a+1+b;
+ outFileM2=a+2+b;
+ }
+
+ if(outFileU!=null && outFileU2==null && outFileU.contains("#") && !outFileU.contains(".sam") && !outFileU.contains(".bam") && outFileU.contains(".")){
+ int pound=outFileU.lastIndexOf('#');
+ String a=outFileU.substring(0, pound);
+ String b=outFileU.substring(pound+1);
+ outFileU=a+1+b;
+ outFileU2=a+2+b;
+ }
+
+ if(OUTPUT_READS && !Tools.testOutputFiles(overwrite, append, false, outFile, outFile2)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+outFile+", "+outFile2+"\n");
+ }
+
+ if(maxReads>0 && maxReads<Long.MAX_VALUE){sysout.println("Max reads: "+maxReads);}
+
+ ReadStats.testFiles(false);
+
+ assert(synthReadlen<0 || synthReadlen>=keylen);
+ }
+
+ private final String[] preparse0(String[] args){
+ int nulls=0;
+ for(int i=0; i<args.length; i++){
+ if(args[i]==null){nulls++;}
+ else{
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ assert(split.length>0) : "\n= symbol must be adjacent to 2 terms, with no spaces. E.g. 'out=mapped.sam'";
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1].toLowerCase() : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ if(b!=null && (b.equals("stdout") || b.startsWith("stdout."))){
+ sysout=System.err;
+ Data.sysout=System.err;
+ }else if(a.equals("printtoerr")){
+ if(Tools.parseBoolean(b)){sysout=System.err; Data.sysout=System.err;}
+ }else if(b!=null && (b.equals("stdin") || b.startsWith("stdin."))){
+ SYSIN=true;
+ }else if(a.equals("fast")){
+ fast=Tools.parseBoolean(b);
+ if(fast){slow=false;}
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("slow")){
+ slow=Tools.parseBoolean(b);
+ if(slow){fast=false;}
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("vslow")){
+ vslow=Tools.parseBoolean(b);
+ if(vslow){fast=false;slow=true;}
+ args[i]=null;
+ nulls++;
+ }
+ }
+ }
+ if(nulls>0){args=Tools.condenseStrict(args);}
+ return args;
+ }
+
+ /**
+  * Formats a number with a fixed count of decimal places and left-pads it with
+  * spaces so that values up to 100 line up in a column.
+  * The number is formatted with Locale.ROOT so the decimal separator is always
+  * '.', regardless of the JVM default locale.
+  * @param value Number to format
+  * @param places Digits after the decimal point
+  * @return The formatted number, at least 3 characters wide when places is below 1, otherwise at least 4+places wide
+  */
+ static final String padPercent(double value, int places){
+     final String digits=String.format(java.util.Locale.ROOT, "%."+places+"f", value);
+     //Three integer digits, plus the point and the fraction when there is one.
+     final int desired=3+(places<1 ? 0 : 1+places);
+     if(digits.length()>=desired){return digits;}
+     final StringBuilder sb=new StringBuilder(desired);
+     for(int i=digits.length(); i<desired; i++){sb.append(' ');}
+     return sb.append(digits).toString();
+ }
+
+ /**
+  * Right-aligns an integer in a field of the given width using leading spaces.
+  * @param value Number to print
+  * @param places Minimum width of the result
+  * @return The decimal form of value, unchanged if it is already at least places characters long
+  */
+ static final String pad(long value, int places){
+     final String digits=Long.toString(value);
+     if(digits.length()>=places){return digits;}
+     final StringBuilder sb=new StringBuilder(places);
+     for(int i=digits.length(); i<places; i++){sb.append(' ');}
+     return sb.append(digits).toString();
+ }
+
+ /**
+  * Formats a number with a fixed count of decimal places and no padding.
+  * Locale.ROOT is used so the decimal separator is always '.', whatever the JVM
+  * default locale is.
+  * @param value Number to format
+  * @param places Digits after the decimal point
+  * @return The formatted number
+  */
+ static final String padPercentMachine(double value, int places){
+     return String.format(java.util.Locale.ROOT, "%."+places+"f", value);
+ }
+
+
+ /**
+  * Opens the read input stream and every requested output stream, builds the
+  * optional BBSplitter statistics tables, then requests a GC and waits briefly.
+  * Progress messages with elapsed times are printed to sysout.
+  * @param t Timer that is stopped to report each stage and restarted afterwards; it is left stopped on return
+  * @param args Command-line arguments, forwarded to BBSplitter.makeOutputStreams
+  * @return The value reported by cris.paired()
+  */
+ boolean openStreams(Timer t, String[] args){
+
+     cris=getReadInputStream(in1, in2, qfin1, qfin2);
+     final boolean paired=cris.paired();
+     cris.setSampleRate(samplerate, sampleseed);
+
+     //Ordered output gets a larger buffer, scaled by thread count.
+     final int buff=(OUTPUT_ORDERED_READS ? Tools.max(32, 2*Shared.threads()) : 12);
+
+     if(OUTPUT_READS){
+         ReadStreamWriter.MINCHROM=minChrom;
+         ReadStreamWriter.MAXCHROM=maxChrom;
+
+         //Becomes true if any primary output file is sam or bam.
+         AbstractMapThread.OUTPUT_SAM=false;
+
+         if(outFile!=null){
+             final FileFormat first=FileFormat.testOutput(outFile, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS);
+             final FileFormat second=(outFile2!=null ? FileFormat.testOutput(outFile2, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS) : null);
+             rosA=ConcurrentReadOutputStream.getStream(first, second, qfout, qfout2, buff, null, false);
+             rosA.start();
+             t.stop();
+             sysout.println("Started output stream:\t"+t);
+             t.start();
+             AbstractMapThread.OUTPUT_SAM|=first.samOrBam();
+         }
+
+         if(outFileM!=null){
+             final FileFormat first=FileFormat.testOutput(outFileM, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS);
+             final FileFormat second=(outFileM2!=null ? FileFormat.testOutput(outFileM2, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS) : null);
+             rosM=ConcurrentReadOutputStream.getStream(first, second, qfoutM, qfoutM2, buff, null, false);
+             rosM.start();
+             t.stop();
+             sysout.println("Started output stream:\t"+t);
+             t.start();
+             AbstractMapThread.OUTPUT_SAM|=first.samOrBam();
+         }
+
+         if(outFileU!=null){
+             final FileFormat first=FileFormat.testOutput(outFileU, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS);
+             final FileFormat second=(outFileU2!=null ? FileFormat.testOutput(outFileU2, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS) : null);
+             rosU=ConcurrentReadOutputStream.getStream(first, second, qfoutU, qfoutU2, buff, null, false);
+             rosU.start();
+             t.stop();
+             sysout.println("Started output stream:\t"+t);
+             t.start();
+             AbstractMapThread.OUTPUT_SAM|=first.samOrBam();
+         }
+
+         //This stream is only opened here when scaffold prefixes are not in use.
+         if(outFileB!=null && !Data.scaffoldPrefixes){
+             final FileFormat first=FileFormat.testOutput(outFileB, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS);
+             final FileFormat second=(outFileB2!=null ? FileFormat.testOutput(outFileB2, FileFormat.SAM, 0, 0, true, overwrite, append, OUTPUT_ORDERED_READS) : null);
+             rosB=ConcurrentReadOutputStream.getStream(first, second, qfoutB, qfoutB2, buff, null, false);
+             rosB.start();
+             t.stop();
+             sysout.println("Started output stream:\t"+t);
+             t.start();
+             AbstractMapThread.OUTPUT_SAM|=first.samOrBam();
+         }
+     }
+
+     if(!Data.scaffoldPrefixes){
+         //Set statistics are switched off when there are no scaffold prefixes.
+         BBSplitter.TRACK_SET_STATS=false;
+     }else{
+         BBSplitter.streamTable=BBSplitter.makeOutputStreams(args, OUTPUT_READS, OUTPUT_ORDERED_READS, buff, paired, overwrite, append, false);
+         if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){
+             BBSplitter.streamTableAmbiguous=BBSplitter.makeOutputStreams(args, OUTPUT_READS, OUTPUT_ORDERED_READS, buff, paired, overwrite, append, true);
+         }
+     }
+
+     if(BBSplitter.TRACK_SET_STATS){
+         sysout.print("Creating ref-set statistics table: ");
+         BBSplitter.makeSetCountTable();
+         t.stop();
+         sysout.println(" \t"+t);
+         t.start();
+     }
+     if(BBSplitter.TRACK_SCAF_STATS){
+         sysout.print("Creating scaffold statistics table:");
+         BBSplitter.makeScafCountTable();
+         t.stop();
+         sysout.println(" \t"+t);
+         t.start();
+     }
+
+     {
+         //A private monitor object is used as a timed pause after the GC request.
+         final String monitor=new String("syncObj");
+         synchronized(monitor){
+             System.gc();
+             Thread.yield();
+             final long pauseMillis=(waitForMemoryClear ? 1000 : 100);
+             try{
+                 monitor.wait(pauseMillis);
+             }catch(InterruptedException e){
+                 e.printStackTrace();
+             }
+         }
+
+         t.stop();
+         sysout.println("Cleared Memory: \t"+t);
+     }
+
+     return paired;
+ }
+
+ static final int shutDownThreads(AbstractMapThread[] mtts, boolean force){
+ int broken=0;
+ long millis=force ? 500 : 8000;
+ for(int i=0; i<mtts.length; i++){
+ AbstractMapThread mtt=mtts[i];
+ if(mtt==null){broken++;}
+ else{
+ synchronized(mtt){
+ while(mtt.working()){
+ State st=mtt.getState();
+ if(st==State.TERMINATED){
+ if(mtt.working()){
+ broken++;
+ break;
+ }
+ }
+ try {
+ mtt.wait(millis);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ if(force && mtt.working()){
+ mtt.interrupt();
+ broken++;
+ break;
+ }
+ }
+ }
+ if(i==0){
+ sysout.print("Detecting finished threads: 0");
+ }else{
+ sysout.print(", "+i);
+ }
+ }
+ }
+
+ if(broken>0){
+ System.err.println("\n\n**************************************************************************\n" +
+ "Warning! "+broken+" mapping thread"+(broken==1 ? "" : "s")+" did not terminate normally.\n" +
+ "Check the error log; the output may be corrupt or incomplete.\n" +
+ "Please submit the full stderr output as a bug report, not just this message.\n" +
+ "**************************************************************************\n\n");
+ }
+ return broken;
+ }
+
+ /**
+  * Closes the input stream, the four primary output streams, and every stream held
+  * in BBSplitter.streamTable and BBSplitter.streamTableAmbiguous.
+  * Each close result is OR-ed into the static errorState flag.
+  * @param cris Read input stream
+  * @param rosA Output stream passed to ReadWrite.closeStreams
+  * @param rosM Output stream passed to ReadWrite.closeStreams
+  * @param rosU Output stream passed to ReadWrite.closeStreams
+  * @param rosB Output stream passed to ReadWrite.closeStreams
+  * @return The accumulated errorState
+  */
+ static final boolean closeStreams(ConcurrentReadInputStream cris, ConcurrentReadOutputStream rosA, ConcurrentReadOutputStream rosM, ConcurrentReadOutputStream rosU, ConcurrentReadOutputStream rosB){
+     //Primary streams first.
+     errorState|=ReadWrite.closeStreams(cris, rosA, rosM, rosU, rosB);
+
+     //Then the per-reference streams, if any were created.
+     if(BBSplitter.streamTable!=null){
+         for(final ConcurrentReadOutputStream splitStream : BBSplitter.streamTable.values()){
+             errorState|=ReadWrite.closeStream(splitStream);
+         }
+     }
+
+     //And the streams for ambiguously mapped reads.
+     if(BBSplitter.streamTableAmbiguous!=null){
+         for(final ConcurrentReadOutputStream ambigStream : BBSplitter.streamTableAmbiguous.values()){
+             errorState|=ReadWrite.closeStream(ambigStream);
+         }
+     }
+
+     return errorState;
+ }
+
+ /**
+  * Creates the read input stream that matches the format detected for in1.
+  * fastq, fasta, sam/bam, scarf and bread inputs go through
+  * ConcurrentReadInputStream.getReadInputStream; "sequential" and "random" inputs
+  * are served by a SequentialReadInputStream or RandomReadInputStream3 wrapped in
+  * a ConcurrentLegacyReadInputStream.
+  * Side effects: the sequential branch may raise the static maxReads to
+  * Long.MAX_VALUE, and the random branch sets useRandomReads and
+  * RandomReads3.PERFECT_READ_RATIO.
+  * @param in1 Primary input; must not be null
+  * @param in2 Secondary input, or null
+  * @param qf1 Passed through to ConcurrentReadInputStream.getReadInputStream
+  * @param qf2 Passed through to ConcurrentReadInputStream.getReadInputStream
+  * @return The new input stream
+  * @throws RuntimeException if the format of in1 matches none of the cases above
+  */
+ static final ConcurrentReadInputStream getReadInputStream(String in1, String in2, String qf1, String qf2){
+
+     assert(in1!=null);
+     assert(!in1.equalsIgnoreCase(in2)) : in1+", "+in2;
+
+     final FileFormat fmt1=FileFormat.testInput(in1, FileFormat.FASTQ, 0, 0, true, true, false);
+     final FileFormat fmt2=FileFormat.testInput(in2, FileFormat.FASTQ, 0, 0, true, true, false);
+
+     //Ordinary sequence files.
+     if(fmt1.fastq() || fmt1.fasta() || fmt1.samOrBam() || fmt1.scarf() || fmt1.bread()){
+         return ConcurrentReadInputStream.getReadInputStream(maxReads, fmt1.samOrBam(), fmt1, fmt2, qf1, qf2);
+     }
+
+     if(fmt1.sequential()){
+         //A negative limit means unlimited.
+         if(maxReads<0){maxReads=Long.MAX_VALUE;}
+         final SequentialReadInputStream source=new SequentialReadInputStream(maxReads, synthReadlen, Tools.max(50, synthReadlen/2), sequentialOverlap, sequentialStrandAlt);
+         return new ConcurrentLegacyReadInputStream(source, maxReads);
+     }
+
+     if(fmt1.random()){
+         useRandomReads=true;
+         assert(synthReadlen>0);
+
+         RandomReads3.PERFECT_READ_RATIO=PERFECT_READ_RATIO;
+
+         final RandomReadInputStream3 source=new RandomReadInputStream3(maxReads, synthReadlen, synthReadlen,
+                 maxSnps, maxInss, maxDels, maxSubs,
+                 baseSnpRate, baseInsRate, baseDelRate, baseSubRate,
+                 maxInsLen, maxDelLen, maxSubLen,
+                 minChrom, maxChrom, PAIRED_RANDOM_READS,
+                 minQuality, midQuality, maxQuality);
+         return new ConcurrentLegacyReadInputStream(source, maxReads);
+     }
+
+     throw new RuntimeException("Can't determine read input source: ff1="+fmt1+", ff2="+fmt2);
+ }
+
+
+ static void printOutput(final AbstractMapThread[] mtts, final Timer t, final int keylen, final boolean paired, final boolean SKIMMER, final CoveragePileup pile,
+ boolean nzoStats, boolean sortStats, String dest){
+ if(MACHINE_OUTPUT){
+ printOutput_Machine(mtts, t, keylen, paired, SKIMMER, pile, nzoStats, sortStats, dest);
+ return;
+ }
+ if(dest==null){dest="stderr.txt";}
+ TextStreamWriter tswStats=new TextStreamWriter(dest, overwrite, append, false);
+ tswStats.start();
+
+ long readsUsed1=0;
+ long readsUsed2=0;
+ long lowQualityReadsDiscarded1=0;
+ long lowQualityReadsDiscarded2=0;
+ long lowQualityBasesDiscarded1=0;
+ long lowQualityBasesDiscarded2=0;
+
+ long msaIterationsLimited=0;
+ long msaIterationsUnlimited=0;
+
+ long basesUsed1=0;
+ long basesUsed2=0;
+ long keysUsed=0;
+ long bothUnmapped=0;
+ long bothUnmappedBases=0;
+
+ long syntheticReads=0;
+ long numMated=0;
+ long numMatedBases=0;
+ long badPairs=0;
+ long badPairBases=0;
+ long innerLengthSum=0;
+ long outerLengthSum=0;
+ long insertSizeSum=0;
+
+ long callsToScore=0;
+ long callsToExtend=0;
+ long initialKeys=0;
+ long initialKeyIterations=0;
+ long usedKeys=0;
+ long usedKeyIterations=0;
+
+ long[] hist_hits=new long[41];
+ long[] hist_hits_score=new long[41];
+ long[] hist_hits_extend=new long[41];
+
+ long initialSiteSum1=0;
+ long postTrimSiteSum1=0;
+ long postRescueSiteSum1=0;
+ long siteSum1=0;
+ long topSiteSum1=0;
+
+ long matchCountS1=0;
+ long matchCountI1=0;
+ long matchCountD1=0;
+ long matchCountM1=0;
+ long matchCountN1=0;
+
+ long readCountS1=0;
+ long readCountI1=0;
+ long readCountD1=0;
+ long readCountN1=0;
+ long readCountSplice1=0;
+ long readCountE1=0;
+
+
+ long mapped1=0;
+ long mappedRetained1=0;
+ long mappedRetainedBases1=0;
+ long rescuedP1=0;
+ long rescuedM1=0;
+ long truePositiveP1=0;
+ long truePositiveM1=0;
+ long falsePositive1=0;
+ long totalCorrectSites1=0;
+ long firstSiteCorrectP1=0;
+ long firstSiteCorrectM1=0;
+ long firstSiteIncorrect1=0;
+ long firstSiteCorrectLoose1=0;
+ long firstSiteIncorrectLoose1=0;
+ long firstSiteCorrectPaired1=0;
+ long firstSiteCorrectSolo1=0;
+ long firstSiteCorrectRescued1=0;
+ long perfectHit1=0; //Highest score is max score
+ long uniqueHit1=0; //Only one hit has highest score
+ long correctUniqueHit1=0; //unique highest hit on answer site
+ long correctMultiHit1=0; //non-unique highest hit on answer site (non-skimmer only)
+ long correctLowHit1=0; //hit on answer site, but not highest scorer
+ long noHit1=0;
+ long perfectMatch1=0; //Highest slow score is max slow score
+ long semiperfectMatch1=0;
+ long perfectMatchBases1=0;
+ long semiperfectMatchBases1=0;
+ long perfectHitCount1=0;
+ long semiPerfectHitCount1=0;
+ long duplicateBestAlignment1=0;
+ long duplicateBestAlignmentBases1=0;
+
+ long totalNumCorrect1=0; //Only for skimmer
+ long totalNumIncorrect1=0; //Only for skimmer
+ long totalNumIncorrectPrior1=0; //Only for skimmer
+ long totalNumCapturedAllCorrect1=0; //Only for skimmer
+ long totalNumCapturedAllCorrectTop1=0; //Only for skimmer
+ long totalNumCapturedAllCorrectOnly1=0; //Only for skimmer
+
+ long initialSiteSum2=0;
+ long postTrimSiteSum2=0;
+ long postRescueSiteSum2=0;
+ long siteSum2=0;
+ long topSiteSum2=0;
+
+ long mapped2=0;
+ long mappedRetained2=0;
+ long mappedRetainedBases2=0;
+ long rescuedP2=0;
+ long rescuedM2=0;
+ long truePositiveP2=0;
+ long truePositiveM2=0;
+ long falsePositive2=0;
+ long totalCorrectSites2=0;
+ long firstSiteCorrectP2=0;
+ long firstSiteCorrectM2=0;
+ long firstSiteIncorrect2=0;
+ long firstSiteCorrectLoose2=0;
+ long firstSiteIncorrectLoose2=0;
+ long firstSiteCorrectPaired2=0;
+ long firstSiteCorrectSolo2=0;
+ long firstSiteCorrectRescued2=0;
+ long perfectHit2=0; //Highest score is max score
+ long perfectHitCount2=0;
+ long semiPerfectHitCount2=0;
+
+ long uniqueHit2=0; //Only one hit has highest score
+ long correctUniqueHit2=0; //unique highest hit on answer site
+ long correctMultiHit2=0; //non-unique highest hit on answer site (non-skimmer only)
+ long correctLowHit2=0; //hit on answer site, but not highest scorer
+ long noHit2=0;
+ long perfectMatch2=0; //Highest slow score is max slow score
+ long semiperfectMatch2=0;
+ long perfectMatchBases2=0;
+ long semiperfectMatchBases2=0;
+ long duplicateBestAlignment2=0;
+ long duplicateBestAlignmentBases2=0;
+
+ long totalNumCorrect2=0; //Only for skimmer
+ long totalNumIncorrect2=0; //Only for skimmer
+ long totalNumIncorrectPrior2=0; //Only for skimmer
+ long totalNumCapturedAllCorrect2=0; //Only for skimmer
+ long totalNumCapturedAllCorrectTop2=0; //Only for skimmer
+ long totalNumCapturedAllCorrectOnly2=0; //Only for skimmer
+
+ long matchCountS2=0;
+ long matchCountI2=0;
+ long matchCountD2=0;
+ long matchCountM2=0;
+ long matchCountN2=0;
+
+ long readCountS2=0;
+ long readCountI2=0;
+ long readCountD2=0;
+ long readCountN2=0;
+ long readCountSplice2=0;
+ long readCountE2=0;
+
+ readsUsed1=0;
+ for(int i=0; i<mtts.length; i++){
+ AbstractMapThread mtt=mtts[i];
+
+ if(mtt.msa!=null){
+ msaIterationsLimited+=mtt.msa.iterationsLimited;
+ msaIterationsUnlimited+=mtt.msa.iterationsUnlimited;
+ }
+
+ readsUsed1+=mtt.readsUsed1;
+ readsUsed2+=mtt.readsUsed2;
+ syntheticReads+=mtt.syntheticReads;
+ numMated+=mtt.numMated;
+ numMatedBases+=mtt.numMatedBases;
+ badPairs+=mtt.badPairs;
+ badPairBases+=mtt.badPairBases;
+ innerLengthSum+=mtt.innerLengthSum;
+ outerLengthSum+=mtt.outerLengthSum;
+ insertSizeSum+=mtt.insertSizeSum;
+ basesUsed1+=mtt.basesUsed1;
+ basesUsed2+=mtt.basesUsed2;
+ keysUsed+=mtt.keysUsed;
+ bothUnmapped+=mtt.bothUnmapped;
+ bothUnmappedBases+=mtt.bothUnmappedBases;
+
+ mapped1+=mtt.mapped1;
+ mappedRetained1+=mtt.mappedRetained1;
+ mappedRetainedBases1+=mtt.mappedRetainedBases1;
+ rescuedP1+=mtt.rescuedP1;
+ rescuedM1+=mtt.rescuedM1;
+ lowQualityReadsDiscarded1+=mtt.lowQualityReadsDiscarded1;
+ lowQualityBasesDiscarded1+=mtt.lowQualityBasesDiscarded1;
+ truePositiveP1+=mtt.truePositiveP1;
+ truePositiveM1+=mtt.truePositiveM1;
+ falsePositive1+=mtt.falsePositive1;
+// System.err.println("Adding "+mtt.falsePositive+" false positives -> "+falsePositive);
+ totalCorrectSites1+=mtt.totalCorrectSites1;
+
+ firstSiteCorrectP1+=mtt.firstSiteCorrectP1;
+ firstSiteCorrectM1+=mtt.firstSiteCorrectM1;
+ firstSiteIncorrect1+=mtt.firstSiteIncorrect1;
+ firstSiteCorrectLoose1+=mtt.firstSiteCorrectLoose1;
+ firstSiteIncorrectLoose1+=mtt.firstSiteIncorrectLoose1;
+ firstSiteCorrectPaired1+=mtt.firstSiteCorrectPaired1;
+ firstSiteCorrectSolo1+=mtt.firstSiteCorrectSolo1;
+ firstSiteCorrectRescued1+=mtt.firstSiteCorrectRescued1;
+
+ perfectHit1+=mtt.perfectHit1; //Highest score is max score
+ perfectHitCount1+=mtt.perfectHitCount1;
+ semiPerfectHitCount1+=mtt.semiPerfectHitCount1;
+ uniqueHit1+=mtt.uniqueHit1; //Only one hit has highest score
+ correctUniqueHit1+=mtt.correctUniqueHit1; //unique highest hit on answer site
+ correctMultiHit1+=mtt.correctMultiHit1; //non-unique highest hit on answer site
+ correctLowHit1+=mtt.correctLowHit1; //hit on answer site, but not highest scorer
+ noHit1+=mtt.noHit1;
+
+ totalNumCorrect1+=mtt.totalNumCorrect1; //Skimmer only
+ totalNumIncorrect1+=mtt.totalNumIncorrect1; //Skimmer only
+ totalNumIncorrectPrior1+=mtt.totalNumIncorrectPrior1; //Skimmer only
+ totalNumCapturedAllCorrect1+=mtt.totalNumCapturedAllCorrect1; //Skimmer only
+ totalNumCapturedAllCorrectTop1+=mtt.totalNumCapturedAllCorrectTop1; //Skimmer only
+ totalNumCapturedAllCorrectOnly1+=mtt.totalNumCapturedAllCorrectOnly1; //Skimmer only
+
+ perfectMatch1+=mtt.perfectMatch1; //Highest slow score is max slow score
+ semiperfectMatch1+=mtt.semiperfectMatch1; //A semiperfect mapping was found
+ perfectMatchBases1+=mtt.perfectMatchBases1;
+ semiperfectMatchBases1+=mtt.semiperfectMatchBases1;
+
+ duplicateBestAlignment1+=mtt.ambiguousBestAlignment1;
+ duplicateBestAlignmentBases1+=mtt.ambiguousBestAlignmentBases1;
+
+ initialSiteSum1+=mtt.initialSiteSum1;
+ postTrimSiteSum1+=mtt.postTrimSiteSum1;
+ postRescueSiteSum1+=mtt.postRescueSiteSum1;
+ siteSum1+=mtt.siteSum1;
+ topSiteSum1+=mtt.topSiteSum1;
+
+ AbstractIndex index=mtt.index();
+ callsToScore+=index.callsToScore;
+ callsToExtend+=index.callsToExtendScore;
+ initialKeys+=index.initialKeys;
+ initialKeyIterations+=index.initialKeyIterations;
+ usedKeys+=index.usedKeys;
+ usedKeyIterations+=index.usedKeyIterations;
+
+ for(int j=0; j<index.hist_hits.length; j++){
+ int x=Tools.min(hist_hits.length-1, j);
+ hist_hits[x]+=index.hist_hits[j];
+ hist_hits_score[x]+=index.hist_hits_score[j];
+ hist_hits_extend[x]+=index.hist_hits_extend[j];
+ }
+
+ matchCountS1+=mtt.matchCountS1;
+ matchCountI1+=mtt.matchCountI1;
+ matchCountD1+=mtt.matchCountD1;
+ matchCountM1+=mtt.matchCountM1;
+ matchCountN1+=mtt.matchCountN1;
+
+ readCountS1+=mtt.readCountS1;
+ readCountI1+=mtt.readCountI1;
+ readCountD1+=mtt.readCountD1;
+ readCountN1+=mtt.readCountN1;
+ readCountSplice1+=mtt.readCountSplice1;
+ readCountE1+=mtt.readCountE1;
+
+ mapped2+=mtt.mapped2;
+ mappedRetained2+=mtt.mappedRetained2;
+ mappedRetainedBases2+=mtt.mappedRetainedBases2;
+ rescuedP2+=mtt.rescuedP2;
+ rescuedM2+=mtt.rescuedM2;
+ lowQualityReadsDiscarded2+=mtt.lowQualityReadsDiscarded2;
+ lowQualityBasesDiscarded2+=mtt.lowQualityBasesDiscarded2;
+ truePositiveP2+=mtt.truePositiveP2;
+ truePositiveM2+=mtt.truePositiveM2;
+ falsePositive2+=mtt.falsePositive2;
+// System.err.println("Adding "+mtt.falsePositive+" false positives -> "+falsePositive);
+ totalCorrectSites2+=mtt.totalCorrectSites2;
+
+ firstSiteCorrectP2+=mtt.firstSiteCorrectP2;
+ firstSiteCorrectM2+=mtt.firstSiteCorrectM2;
+ firstSiteIncorrect2+=mtt.firstSiteIncorrect2;
+ firstSiteCorrectLoose2+=mtt.firstSiteCorrectLoose2;
+ firstSiteIncorrectLoose2+=mtt.firstSiteIncorrectLoose2;
+ firstSiteCorrectPaired2+=mtt.firstSiteCorrectPaired2;
+ firstSiteCorrectSolo2+=mtt.firstSiteCorrectSolo2;
+ firstSiteCorrectRescued2+=mtt.firstSiteCorrectRescued2;
+
+ perfectHit2+=mtt.perfectHit2; //Highest score is max score
+ perfectHitCount2+=mtt.perfectHitCount2;
+ semiPerfectHitCount2+=mtt.semiPerfectHitCount2;
+ uniqueHit2+=mtt.uniqueHit2; //Only one hit has highest score
+ correctUniqueHit2+=mtt.correctUniqueHit2; //unique highest hit on answer site
+ correctMultiHit2+=mtt.correctMultiHit2; //non-unique highest hit on answer site
+ correctLowHit2+=mtt.correctLowHit2; //hit on answer site, but not highest scorer
+ noHit2+=mtt.noHit2;
+
+ totalNumCorrect2+=mtt.totalNumCorrect2; //Skimmer only
+ totalNumIncorrect2+=mtt.totalNumIncorrect2; //Skimmer only
+ totalNumIncorrectPrior2+=mtt.totalNumIncorrectPrior2; //Skimmer only
+ totalNumCapturedAllCorrect2+=mtt.totalNumCapturedAllCorrect2; //Skimmer only
+ totalNumCapturedAllCorrectTop2+=mtt.totalNumCapturedAllCorrectTop2; //Skimmer only
+ totalNumCapturedAllCorrectOnly2+=mtt.totalNumCapturedAllCorrectOnly2; //Skimmer only
+
+ perfectMatch2+=mtt.perfectMatch2; //Highest slow score is max slow score
+ semiperfectMatch2+=mtt.semiperfectMatch2; //A semiperfect mapping was found
+ perfectMatchBases2+=mtt.perfectMatchBases2;
+ semiperfectMatchBases2+=mtt.semiperfectMatchBases2;
+
+ duplicateBestAlignment2+=mtt.ambiguousBestAlignment2;
+ duplicateBestAlignmentBases2+=mtt.ambiguousBestAlignmentBases2;
+
+ initialSiteSum2+=mtt.initialSiteSum2;
+ postTrimSiteSum2+=mtt.postTrimSiteSum2;
+ postRescueSiteSum2+=mtt.postRescueSiteSum2;
+ siteSum2+=mtt.siteSum2;
+ topSiteSum2+=mtt.topSiteSum2;
+
+ matchCountS2+=mtt.matchCountS2;
+ matchCountI2+=mtt.matchCountI2;
+ matchCountD2+=mtt.matchCountD2;
+ matchCountM2+=mtt.matchCountM2;
+ matchCountN2+=mtt.matchCountN2;
+
+ readCountS2+=mtt.readCountS2;
+ readCountI2+=mtt.readCountI2;
+ readCountD2+=mtt.readCountD2;
+ readCountN2+=mtt.readCountN2;
+ readCountSplice2+=mtt.readCountSplice2;
+ readCountE2+=mtt.readCountE2;
+
+ }
+ maxReads=readsUsed1;
+ if(syntheticReads>0){SYNTHETIC=true;}
+
+ t.stop();
+ long nanos=t.elapsed;
+
+ if(verbose_stats>1){
+ StringBuilder sb=new StringBuilder(1000);
+ sb.append("\n\n###################\n#hits\tcount\tscore\textend\n");
+ for(int i=0; i<hist_hits.length; i++){
+ sb.append(i+"\t"+hist_hits[i]+"\t"+hist_hits_score[i]+"\t"+hist_hits_extend[i]+"\n");
+ }
+ try {
+ ReadWrite.writeString(sb, "hist_hits.txt", true);
+ } catch (Throwable e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ final long basesUsed=basesUsed1+basesUsed2;
+
+ final double invTrials=1d/maxReads;
+ final double invTrials100=100d/maxReads;
+ final double invBases100=100d/(basesUsed);
+ final double invBases100_1=100d/basesUsed1;
+ final double invBases100_2=100d/basesUsed2;
+ double invSites100=100d/siteSum1;
+
+ final double matedPercent=(numMated*invTrials100);
+ ReadStats.matedPercent=matedPercent;
+ final double badPairsPercent=(badPairs*invTrials100);
+ final double matedPercentBases=(numMatedBases*invBases100);
+ final double badPairsPercentBases=(badPairBases*invBases100);
+ final double innerLengthAvg=(innerLengthSum*1d/numMated);
+ final double outerLengthAvg=(outerLengthSum*1d/numMated);
+ final double insertSizeAvg=(insertSizeSum*1d/numMated);
+
+ final double readsPerSecond=((readsUsed1+readsUsed2)*1000000000d)/nanos;
+ final double fragsPerSecond=(keysUsed*1000000000d)/nanos;
+ final double kiloBasesPerSecond=(basesUsed*1000000d)/nanos;
+
+ double perfectHitPercent=(perfectHit1*invTrials100); //Highest score is max score
+ double perfectMatchPercent=(perfectMatch1*invTrials100);
+ double semiperfectMatchPercent=(semiperfectMatch1*invTrials100);
+ double perfectMatchPercentBases=(perfectMatchBases1*invBases100_1);
+ double semiperfectMatchPercentBases=(semiperfectMatchBases1*invBases100_1);
+
+ double perfectHitCountPercent=perfectHitCount1*invSites100;
+ double semiPerfectHitCountPercent=semiPerfectHitCount1*invSites100;
+
+ double uniqueHitPercent=(uniqueHit1*invTrials100); //Only one hit has highest score
+ double correctUniqueHitPercent=(correctUniqueHit1*invTrials100); //unique highest hit on answer site
+ double correctMultiHitPercent=(correctMultiHit1*invTrials100); //non-unique highest hit on answer site
+ double correctLowHitPercent=(correctLowHit1*invTrials100); //hit on answer site, but not highest scorer
+ double ambiguousFound=(duplicateBestAlignment1*invTrials100);
+ double ambiguousBasesFound=(duplicateBestAlignmentBases1*invBases100_1);
+ double correctHighHitPercent=((correctMultiHit1+correctUniqueHit1)*invTrials100);
+ double correctHitPercent=((correctLowHit1+correctMultiHit1+correctUniqueHit1)*invTrials100);
+
+ double mappedB=(mapped1*invTrials100);
+ double mappedRetainedB=(mappedRetained1*invTrials100);
+ double mappedRetainedBasesB=(mappedRetainedBases1*invBases100_1);
+ double rescuedPB=(rescuedP1*invTrials100);
+ double rescuedMB=(rescuedM1*invTrials100);
+ double falsePositiveB=(firstSiteIncorrect1*invTrials100);
+ double falsePositiveLooseB=(firstSiteIncorrectLoose1*invTrials100);
+ double truePositivePB=(firstSiteCorrectP1*invTrials100);
+ double truePositiveMB=(firstSiteCorrectM1*invTrials100);
+ double truePositiveStrict=((firstSiteCorrectP1+firstSiteCorrectM1)*invTrials100);
+ double truePositiveLoose=(firstSiteCorrectLoose1*invTrials100);
+ double snrStrict=10*Math.log10((firstSiteCorrectM1+firstSiteCorrectP1+0.1)/(firstSiteIncorrect1+0.1));
+ double snrLoose=10*Math.log10((firstSiteCorrectLoose1+0.1)/(firstSiteIncorrectLoose1+0.1));
+ double truePositivePMRatio=(truePositivePB/truePositiveMB);
+ double truePositivePairedB=(firstSiteCorrectPaired1*100d/numMated);
+ double truePositiveSoloB=(firstSiteCorrectSolo1*100d/(mappedRetained1-numMated));
+ double truePositiveRescuedB=(firstSiteCorrectRescued1*100d/(rescuedP1+rescuedM1));
+ double noHitPercent=(noHit1*invTrials100);
+
+ long mappedReads, unambiguousReads, mappedBases, unambiguousBases;
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ mappedReads=mappedRetained1+duplicateBestAlignment1;
+ unambiguousReads=mappedRetained1;
+ mappedBases=mappedRetainedBases1+duplicateBestAlignmentBases1;
+ unambiguousBases=mappedRetainedBases1;
+ }else{
+ mappedReads=mappedRetained1;
+ unambiguousReads=mappedRetained1-duplicateBestAlignment1;
+ mappedBases=mappedRetainedBases1;
+ unambiguousBases=mappedRetainedBases1-duplicateBestAlignmentBases1;
+ }
+
+ double avgNumCorrect=(SKIMMER ? totalNumCorrect1*invTrials : (totalCorrectSites1/(1d*(truePositiveP1+truePositiveM1))));
+ double avgNumIncorrect=totalNumIncorrect1*invTrials; //Skimmer only
+ double avgNumIncorrectPrior=totalNumIncorrectPrior1*invTrials; //Skimmer only
+
+ double rateCapturedAllCorrect=totalNumCapturedAllCorrect1*invTrials100; //Skimmer only
+ double rateCapturedAllTop=totalNumCapturedAllCorrectTop1*invTrials100; //Skimmer only
+ double rateCapturedAllOnly=totalNumCapturedAllCorrectOnly1*invTrials100; //Skimmer only
+
+ double avgCallsToScore=(callsToScore*invTrials);
+ double avgCallsToExtendScore=(callsToExtend*invTrials);
+ double avgInitialKeys=(initialKeys*1d/initialKeyIterations);
+ double avgUsedKeys=(usedKeys*1d/usedKeyIterations);
+
+ double avgInitialSites=(initialSiteSum1*invTrials);
+ double avgPostTrimSites=(postTrimSiteSum1*invTrials);
+ double avgPostRescueSites=(postRescueSiteSum1*invTrials);
+ double avgSites=(siteSum1*invTrials);
+ double avgPerfectSites=(perfectHitCount1*invTrials);
+ double avgSemiPerfectSites=(semiPerfectHitCount1*invTrials);
+ double avgTopSites=(topSiteSum1*invTrials);
+ double lowQualityReadsDiscardedPercent=(lowQualityReadsDiscarded1*invTrials100);
+ double lowQualityBasesDiscardedPercent=(lowQualityBasesDiscarded1*invBases100_1);
+
+ long matchErrors=matchCountS1+matchCountI1+matchCountD1;
+ long baseLen=matchCountM1+matchCountI1+matchCountS1+matchCountN1;
+ long matchLen=matchCountM1+matchCountI1+matchCountS1+matchCountN1+matchCountD1;
+ long refLen=matchCountM1+matchCountS1+matchCountN1+matchCountD1;
+ double errorRate=matchErrors*100d/matchLen;
+ double matchRate=matchCountM1*100d/matchLen;
+ double subRate=matchCountS1*100d/matchLen;
+ double delRate=matchCountD1*100d/matchLen;
+ double insRate=matchCountI1*100d/matchLen;
+ double nRate=matchCountN1*100d/matchLen;
+ double readSubRate=readCountS1*100d/mapped1;
+ double readDelRate=readCountD1*100d/mapped1;
+ double readInsRate=readCountI1*100d/mapped1;
+ double readNRate=readCountN1*100d/mapped1;
+ double readSpliceRate=readCountSplice1*100d/mapped1;
+ double readErrorRate=readCountE1*100d/mapped1;
+
+ if(SYNTHETIC && verbose_stats==-1){verbose_stats=Tools.max(verbose_stats,9);}
+
+ tswStats.println("Reads Used: \t"+(readsUsed1+readsUsed2)+"\t("+(basesUsed)+" bases)");
+ tswStats.println();
+
+ if(useRandomReads){
+ tswStats.println("Read Length: \t"+synthReadlen);
+ tswStats.println("SNP rate: \t"+baseSnpRate+"\t(max = "+maxSnps+")");
+ tswStats.println("INS rate: \t"+baseInsRate+"\t(max = "+maxInss+", maxLen = "+maxInsLen+")");
+ tswStats.println("DEL rate: \t"+baseDelRate+"\t(max = "+maxDels+", maxLen = "+maxDelLen+")");
+ tswStats.println("SUB rate: \t"+baseSubRate+"\t(max = "+maxSubs+", maxLen = "+maxSubLen+")");
+ tswStats.println("minQuality: \t"+minQuality);
+ tswStats.println("midQuality: \t"+midQuality);
+ tswStats.println("maxQuality: \t"+maxQuality);
+ tswStats.println("prefect fraction: \t"+PERFECT_READ_RATIO);
+ tswStats.println();
+ }
+
+ tswStats.println("Mapping: \t"+t);
+ tswStats.println(String.format("Reads/sec: \t%.2f", readsPerSecond));
+ tswStats.println(String.format("kBases/sec: \t%.2f", kiloBasesPerSecond));
+ double milf=msaIterationsLimited*invTrials;
+ double milu=msaIterationsUnlimited*invTrials;
+ if(verbose_stats>=1){tswStats.println("MSA iterations: \t"+String.format("%.2fL + %.2fU = %.2f", milf,milu,milf+milu));}
+
+ if(paired){
+ tswStats.println("\n\nPairing data: \tpct reads\tnum reads \tpct bases\t num bases");
+ tswStats.println();
+ if(paired){
+ tswStats.println("mated pairs: \t"+padPercent(matedPercent,4)+"% \t"+pad(numMated,9)+" \t"+padPercent(matedPercentBases,4)+"% \t"+pad(numMatedBases,12));
+ tswStats.println("bad pairs: \t"+padPercent(badPairsPercent,4)+"% \t"+pad(badPairs,9)+" \t"+padPercent(badPairsPercentBases,4)+"% \t"+pad(badPairBases,12));
+ }
+
+ tswStats.println("insert size avg: \t "+padPercent(insertSizeAvg,2));
+ if(ReadStats.COLLECT_INSERT_STATS){
+ if(ReadStats.merged==null){ReadStats.mergeAll();}
+ long[] array=ReadStats.merged.insertHist.array;
+ double median=Tools.median(array);
+ double q1=Tools.percentile(array, 0.25);
+ double q3=Tools.percentile(array, 0.75);
+ double stdev=Tools.standardDeviationHistogram(array);
+ //TODO: Quartiles
+ tswStats.println("insert 25th %: \t "+padPercent(q1,2));
+ tswStats.println("insert median: \t "+padPercent(median,2));
+ tswStats.println("insert 75th %: \t "+padPercent(q3,2));
+ tswStats.println("insert std dev: \t "+padPercent(stdev,2));
+ tswStats.println("insert mode: \t "+Tools.calcMode(array));
+ }
+ if(verbose_stats>=1){
+ tswStats.println(String.format("avg inner length:\t %.2f", innerLengthAvg));
+ tswStats.println(String.format("avg insert size: \t %.2f", outerLengthAvg));
+ }
+ }
+
+ if(PRINT_UNMAPPED_COUNT){
+ double invReadsUsed100=100.0/(readsUsed1+readsUsed2);
+ double invBasesUsed100=100.0/basesUsed;
+ double x=bothUnmapped*invReadsUsed100;
+ double y=bothUnmappedBases*invBasesUsed100;
+ if(!paired){tswStats.println();}
+ tswStats.println("unmapped: \t"+padPercent(x,4)+"% \t"+pad(bothUnmapped,9)+" \t"+padPercent(y,4)+"% \t"+pad(bothUnmappedBases,12));
+ }
+
+ tswStats.println();
+ tswStats.println("\nRead 1 data: \tpct reads\tnum reads \tpct bases\t num bases");
+ if(verbose_stats>=1){
+ if(avgInitialKeys>0){tswStats.println(String.format("Avg Initial Keys: \t"+(avgInitialKeys<100?" ":"")+"%.3f",
+ avgInitialKeys));}
+ if(avgUsedKeys>0){tswStats.println(String.format("Avg Used Keys: \t"+(avgUsedKeys<100?" ":"")+"%.3f",
+ avgUsedKeys));}
+ if(avgCallsToScore>0){tswStats.println(String.format("Avg Calls to Score: \t"+(avgCallsToScore<100?" ":"")+"%.3f",
+ avgCallsToScore));}
+ if(avgCallsToExtendScore>0){tswStats.println(String.format("Avg Calls to Extend:\t"+(avgCallsToExtendScore<100?" ":"")+"%.3f",
+ avgCallsToExtendScore));}
+ tswStats.println();
+
+ tswStats.println(String.format("Avg Initial Sites: \t"+(avgInitialSites<10?" ":"")+"%.3f", avgInitialSites));
+ if(TRIM_LIST){tswStats.println(String.format("Avg Post-Trim: \t"+(avgPostTrimSites<10?" ":"")+"%.3f", avgPostTrimSites));}
+ if(paired){tswStats.println(String.format("Avg Post-Rescue: \t"+(avgPostRescueSites<10?" ":"")+"%.3f", avgPostRescueSites));}
+ tswStats.println(String.format("Avg Final Sites: \t"+(avgSites<10?" ":"")+"%.3f", avgSites));
+ tswStats.println(String.format("Avg Top Sites: \t"+(avgTopSites<10?" ":"")+"%.3f", avgTopSites));
+ if(verbose_stats>1){
+ tswStats.println(String.format("Avg Perfect Sites: \t"+(avgPerfectSites<10?" ":"")+"%.3f \t"+
+ (perfectHitCountPercent<10?" ":"")+"%.3f%%", avgPerfectSites, perfectHitCountPercent));
+ tswStats.println(String.format("Avg Semiperfect Sites:\t"+(avgSemiPerfectSites<10?" ":"")+"%.3f \t"+
+ (semiPerfectHitCountPercent<10?" ":"")+"%.3f%%", avgSemiPerfectSites, semiPerfectHitCountPercent));
+ }
+
+ if(SYNTHETIC){
+ tswStats.println(String.format("Avg Correct Sites: \t"+(avgNumCorrect<10?" ":"")+"%.3f", avgNumCorrect));
+ if(SKIMMER){
+ tswStats.println(String.format("Avg Incorrect Sites:\t"+(avgNumIncorrect<10?" ":"")+"%.3f", avgNumIncorrect));
+ tswStats.println(String.format("Avg IncorrectP Sites:\t"+(avgNumIncorrectPrior<10?" ":"")+"%.3f", avgNumIncorrectPrior));
+ }
+ }
+ }
+
+ tswStats.println();
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ double x=ambiguousFound+mappedRetainedB;
+ double y=ambiguousBasesFound+mappedRetainedBasesB;
+ tswStats.println("mapped: \t"+padPercent(x,4)+"% \t"+pad(mappedReads,9)+" \t"+padPercent(y,4)+"% \t"+pad(mappedBases,12));
+ tswStats.println("unambiguous: \t"+padPercent(mappedRetainedB,4)+"% \t"+pad(unambiguousReads,9)+" \t"+padPercent(mappedRetainedBasesB,4)+"% \t"+pad(unambiguousBases,12));
+ }else{
+ double x=mappedRetainedB-ambiguousFound;
+ double y=mappedRetainedBasesB-ambiguousBasesFound;
+ tswStats.println("mapped: \t"+padPercent(mappedRetainedB,4)+"% \t"+pad(mappedReads,9)+" \t"+padPercent(mappedRetainedBasesB,4)+"% \t"+pad(mappedBases,12));
+ tswStats.println("unambiguous: \t"+padPercent(x,4)+"% \t"+pad(unambiguousReads,9)+" \t"+padPercent(y,4)+"% \t"+pad(unambiguousBases,12));
+ }
+ tswStats.println("ambiguous: \t"+padPercent(ambiguousFound,4)+"% \t"+pad(duplicateBestAlignment1,9)+
+ " \t"+padPercent(ambiguousBasesFound,4)+"% \t"+pad(duplicateBestAlignmentBases1,12));
+ tswStats.println("low-Q discards: \t"+padPercent(lowQualityReadsDiscardedPercent,4)+"% \t"+pad(lowQualityReadsDiscarded1,9)+
+ " \t"+padPercent(lowQualityBasesDiscardedPercent,4)+"% \t"+pad(lowQualityBasesDiscarded1,12));
+
+ tswStats.println();
+ tswStats.println("perfect best site:\t"+padPercent(perfectMatchPercent,4)+"% \t"+pad(perfectMatch1,9)+
+ " \t"+padPercent(perfectMatchPercentBases,4)+"% \t"+pad(perfectMatchBases1,12));
+ tswStats.println("semiperfect site:\t"+padPercent(semiperfectMatchPercent,4)+"% \t"+pad(semiperfectMatch1,9)+
+ " \t"+padPercent(semiperfectMatchPercentBases,4)+"% \t"+pad(semiperfectMatchBases1,12));
+ if(paired){
+ tswStats.println("rescued: \t"+padPercent(rescuedPB+rescuedMB,4)+"% \t"+pad(rescuedP1+rescuedM1,9));
+ }
+
+ if(MAKE_MATCH_STRING){
+
+ tswStats.println();
+// tswStats.println(" \tpct reads\tnum reads \tpct bases\t num bases");
+ tswStats.println("Match Rate: \t NA \t NA \t"+padPercent(matchRate,4)+"% \t"+pad(matchCountM1,12));
+ tswStats.println("Error Rate: \t"+padPercent(readErrorRate,4)+"% \t"+pad(readCountE1,9)+" \t"+padPercent(errorRate,4)+"% \t"+pad(matchErrors,12));
+ tswStats.println("Sub Rate: \t"+padPercent(readSubRate,4)+"% \t"+pad(readCountS1,9)+" \t"+padPercent(subRate,4)+"% \t"+pad(matchCountS1,12));
+ tswStats.println("Del Rate: \t"+padPercent(readDelRate,4)+"% \t"+pad(readCountD1,9)+" \t"+padPercent(delRate,4)+"% \t"+pad(matchCountD1,12));
+ tswStats.println("Ins Rate: \t"+padPercent(readInsRate,4)+"% \t"+pad(readCountI1,9)+" \t"+padPercent(insRate,4)+"% \t"+pad(matchCountI1,12));
+ tswStats.println("N Rate: \t"+padPercent(readNRate,4)+"% \t"+pad(readCountN1,9)+" \t"+padPercent(nRate,4)+"% \t"+pad(matchCountN1,12));
+ if(SamLine.INTRON_LIMIT<Integer.MAX_VALUE){
+ tswStats.println("Splice Rate: \t"+padPercent(readSpliceRate,4)+"% \t"+pad(readCountSplice1,9)+" \t(splices at least "+SamLine.INTRON_LIMIT+" bp)");
+ }
+
+ if(DOUBLE_PRINT_ERROR_RATE){
+ System.err.println();
+ System.err.println(String.format("Match Rate: \t"+(matchRate<10?" ":"")+"%.4f", matchRate)+"% \t"+matchCountM1);
+ System.err.println(String.format("Error Rate: \t"+(errorRate<10?" ":"")+"%.4f", errorRate)+"% \t"+matchErrors);
+ System.err.println(String.format("Sub Rate: \t"+(subRate<10?" ":"")+"%.4f", subRate)+"% \t"+matchCountS1);
+ System.err.println(String.format("Del Rate: \t"+(delRate<10?" ":"")+"%.4f", delRate)+"% \t"+matchCountD1);
+ System.err.println(String.format("Ins Rate: \t"+(insRate<10?" ":"")+"%.4f", insRate)+"% \t"+matchCountI1);
+ System.err.println(String.format("N Rate: \t"+(nRate<10?" ":"")+"%.4f", nRate)+"% \t"+matchCountN1);
+ }
+ }
+
+ if(SYNTHETIC){
+ tswStats.println();
+ tswStats.println("true positive: \t"+padPercent(truePositiveStrict,4)+"%\t(loose: "+padPercent(truePositiveLoose,4)+"%)");
+ tswStats.println("false positive: \t"+padPercent(falsePositiveB,4)+"%\t(loose: "+padPercent(falsePositiveLooseB,4)+"%)");
+ tswStats.println("false negative: \t"+padPercent(noHitPercent,4)+"%");
+ tswStats.println("SNR: \t"+padPercent(snrStrict,4)+" \t(loose: "+padPercent(snrLoose,4)+")");
+ if(verbose_stats>0){
+ tswStats.println("correctLowHit: \t"+padPercent(correctLowHitPercent,4)+"%");
+ tswStats.println(String.format("Plus/Minus ratio:\t %1.4f", truePositivePMRatio));
+ }
+
+ if(paired){
+ tswStats.println("correct pairs: \t"+padPercent(truePositivePairedB,4)+"%\t(of mated)");
+ tswStats.println("correct singles: \t"+padPercent(truePositiveSoloB,4)+"%");
+ tswStats.println("correct rescued: \t"+padPercent(truePositiveRescuedB,4)+"%");
+ }
+
+ if(SKIMMER){
+ tswStats.println("found all correct:\t"+padPercent(rateCapturedAllCorrect,4)+"%)");
+ tswStats.println("all correct top: \t"+padPercent(rateCapturedAllTop,4)+"%)");
+ tswStats.println("all correct only: \t"+padPercent(rateCapturedAllOnly,4)+"%)");
+ }
+ }
+
+ if(paired){
+
+ invSites100=100d/siteSum2;
+
+ perfectHitPercent=(perfectHit2*invTrials100); //Highest score is max score
+ perfectMatchPercent=(perfectMatch2*invTrials100);
+ semiperfectMatchPercent=(semiperfectMatch2*invTrials100);
+ perfectMatchPercentBases=(perfectMatchBases2*invBases100_2);
+ semiperfectMatchPercentBases=(semiperfectMatchBases2*invBases100_2);
+
+ perfectHitCountPercent=perfectHitCount2*invSites100;
+ semiPerfectHitCountPercent=semiPerfectHitCount2*invSites100;
+
+ uniqueHitPercent=(uniqueHit2*invTrials100); //Only one hit has highest score
+ correctUniqueHitPercent=(correctUniqueHit2*invTrials100); //unique highest hit on answer site
+ correctMultiHitPercent=(correctMultiHit2*invTrials100); //non-unique highest hit on answer site
+ correctLowHitPercent=(correctLowHit2*invTrials100); //hit on answer site, but not highest scorer
+ ambiguousFound=(duplicateBestAlignment2*invTrials100);
+ ambiguousBasesFound=(duplicateBestAlignmentBases2*invBases100_2);
+ correctHighHitPercent=((correctMultiHit2+correctUniqueHit2)*invTrials100);
+ correctHitPercent=((correctLowHit2+correctMultiHit2+correctUniqueHit2)*invTrials100);
+
+ mappedB=(mapped2*invTrials100);
+ mappedRetainedB=(mappedRetained2*invTrials100);
+ mappedRetainedBasesB=(mappedRetainedBases2*invBases100_2);
+ rescuedPB=(rescuedP2*invTrials100);
+ rescuedMB=(rescuedM2*invTrials100);
+ falsePositiveB=(firstSiteIncorrect2*invTrials100);
+ falsePositiveLooseB=(firstSiteIncorrectLoose2*invTrials100);
+ truePositivePB=(firstSiteCorrectP2*invTrials100);
+ truePositiveMB=(firstSiteCorrectM2*invTrials100);
+ truePositiveStrict=((firstSiteCorrectP2+firstSiteCorrectM2)*invTrials100);
+ truePositiveLoose=(firstSiteCorrectLoose2*invTrials100);
+ snrStrict=10*Math.log10((firstSiteCorrectM2+firstSiteCorrectP2+0.2)/(firstSiteIncorrect2+0.2));
+ snrLoose=10*Math.log10((firstSiteCorrectLoose2+0.2)/(firstSiteIncorrectLoose2+0.2));
+ truePositivePMRatio=(truePositivePB/truePositiveMB);
+ truePositivePairedB=(firstSiteCorrectPaired2*100d/numMated);
+ truePositiveSoloB=(firstSiteCorrectSolo2*100d/(mappedRetained2-numMated));
+ truePositiveRescuedB=(firstSiteCorrectRescued2*100d/(rescuedP2+rescuedM2));
+ noHitPercent=(noHit2*invTrials100);
+
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ mappedReads=mappedRetained2+duplicateBestAlignment2;
+ unambiguousReads=mappedRetained2;
+ mappedBases=mappedRetainedBases2+duplicateBestAlignmentBases2;
+ unambiguousBases=mappedRetainedBases2;
+ }else{
+ mappedReads=mappedRetained2;
+ unambiguousReads=mappedRetained2-duplicateBestAlignment2;
+ mappedBases=mappedRetainedBases2;
+ unambiguousBases=mappedRetainedBases2-duplicateBestAlignmentBases2;
+ }
+
+ avgNumCorrect=(SKIMMER ? totalNumCorrect2*invTrials : (totalCorrectSites2/(2d*(truePositiveP2+truePositiveM2))));
+ avgNumIncorrect=totalNumIncorrect2*invTrials; //Skimmer only
+ avgNumIncorrectPrior=totalNumIncorrectPrior2*invTrials; //Skimmer only
+
+ rateCapturedAllCorrect=totalNumCapturedAllCorrect2*invTrials100; //Skimmer only
+ rateCapturedAllTop=totalNumCapturedAllCorrectTop2*invTrials100; //Skimmer only
+ rateCapturedAllOnly=totalNumCapturedAllCorrectOnly2*invTrials100; //Skimmer only
+
+ avgCallsToScore=(callsToScore*invTrials);
+ avgCallsToExtendScore=(callsToExtend*invTrials);
+ avgInitialKeys=(initialKeys*2d/initialKeyIterations);
+ avgUsedKeys=(usedKeys*2d/usedKeyIterations);
+
+ avgInitialSites=(initialSiteSum2*invTrials);
+ avgPostTrimSites=(postTrimSiteSum2*invTrials);
+ avgPostRescueSites=(postRescueSiteSum2*invTrials);
+ avgSites=(siteSum2*invTrials);
+ avgPerfectSites=(perfectHitCount2*invTrials);
+ avgSemiPerfectSites=(semiPerfectHitCount2*invTrials);
+ avgTopSites=(topSiteSum2*invTrials);
+ lowQualityReadsDiscardedPercent=(lowQualityReadsDiscarded2*invTrials100);
+ lowQualityBasesDiscardedPercent=(lowQualityBasesDiscarded2*invBases100_2);
+
+ matchErrors=matchCountS2+matchCountI2+matchCountD2;
+ baseLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2;
+ matchLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2+matchCountD2;
+ refLen=matchCountM2+matchCountS2+matchCountN2+matchCountD2;
+ errorRate=matchErrors*100d/matchLen;
+ matchRate=matchCountM2*100d/matchLen;
+ subRate=matchCountS2*100d/matchLen;
+ delRate=matchCountD2*100d/matchLen;
+ insRate=matchCountI2*100d/matchLen;
+ nRate=matchCountN2*100d/matchLen;
+ readSubRate=readCountS2*100d/mapped2;
+ readDelRate=readCountD2*100d/mapped2;
+ readInsRate=readCountI2*100d/mapped2;
+ readNRate=readCountN2*100d/mapped2;
+ readSpliceRate=readCountSplice2*100d/mapped2;
+ readErrorRate=readCountE2*100d/mapped2;
+
+ tswStats.println();
+ tswStats.println("\nRead 2 data: \tpct reads\tnum reads \tpct bases\t num bases");
+ if(verbose_stats>=1){
+ if(avgInitialKeys>0){tswStats.println(String.format("Avg Initial Keys: \t"+(avgInitialKeys<100?" ":"")+"%.3f",
+ avgInitialKeys));}
+ if(avgUsedKeys>0){tswStats.println(String.format("Avg Used Keys: \t"+(avgUsedKeys<100?" ":"")+"%.3f",
+ avgUsedKeys));}
+ if(avgCallsToScore>0){tswStats.println(String.format("Avg Calls to Score: \t"+(avgCallsToScore<100?" ":"")+"%.3f",
+ avgCallsToScore));}
+ if(avgCallsToExtendScore>0){tswStats.println(String.format("Avg Calls to Extend:\t"+(avgCallsToExtendScore<100?" ":"")+"%.3f",
+ avgCallsToExtendScore));}
+ tswStats.println();
+
+ tswStats.println(String.format("Avg Initial Sites: \t"+(avgInitialSites<10?" ":"")+"%.3f", avgInitialSites));
+ if(TRIM_LIST){tswStats.println(String.format("Avg Post-Trim: \t"+(avgPostTrimSites<10?" ":"")+"%.3f", avgPostTrimSites));}
+ if(paired){tswStats.println(String.format("Avg Post-Rescue: \t"+(avgPostRescueSites<10?" ":"")+"%.3f", avgPostRescueSites));}
+ tswStats.println(String.format("Avg Final Sites: \t"+(avgSites<10?" ":"")+"%.3f", avgSites));
+ tswStats.println(String.format("Avg Top Sites: \t"+(avgTopSites<10?" ":"")+"%.3f", avgTopSites));
+ if(verbose_stats>1){
+ tswStats.println(String.format("Avg Perfect Sites: \t"+(avgPerfectSites<10?" ":"")+"%.3f \t"+
+ (perfectHitCountPercent<10?" ":"")+"%.3f%%", avgPerfectSites, perfectHitCountPercent));
+ tswStats.println(String.format("Avg Semiperfect Sites:\t"+(avgSemiPerfectSites<10?" ":"")+"%.3f \t"+
+ (semiPerfectHitCountPercent<10?" ":"")+"%.3f%%", avgSemiPerfectSites, semiPerfectHitCountPercent));
+ }
+
+ if(SYNTHETIC){
+ tswStats.println(String.format("Avg Correct Sites: \t"+(avgNumCorrect<10?" ":"")+"%.3f", avgNumCorrect));
+ if(SKIMMER){
+ tswStats.println(String.format("Avg Incorrect Sites:\t"+(avgNumIncorrect<10?" ":"")+"%.3f", avgNumIncorrect));
+ tswStats.println(String.format("Avg IncorrectP Sites:\t"+(avgNumIncorrectPrior<10?" ":"")+"%.3f", avgNumIncorrectPrior));
+ }
+ }
+ }
+
+ tswStats.println();
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ double x=ambiguousFound+mappedRetainedB;
+ double y=ambiguousBasesFound+mappedRetainedBasesB;
+ tswStats.println("mapped: \t"+padPercent(x,4)+"% \t"+pad(mappedReads,9)+" \t"+padPercent(y,4)+"% \t"+pad(mappedBases,12));
+ tswStats.println("unambiguous: \t"+padPercent(mappedRetainedB,4)+"% \t"+pad(unambiguousReads,9)+" \t"+padPercent(mappedRetainedBasesB,4)+"% \t"+pad(unambiguousBases,12));
+ }else{
+ double x=mappedRetainedB-ambiguousFound;
+ double y=mappedRetainedBasesB-ambiguousBasesFound;
+ tswStats.println("mapped: \t"+padPercent(mappedRetainedB,4)+"% \t"+pad(mappedReads,9)+" \t"+padPercent(mappedRetainedBasesB,4)+"% \t"+pad(mappedBases,12));
+ tswStats.println("unambiguous: \t"+padPercent(x,4)+"% \t"+pad(unambiguousReads,9)+" \t"+padPercent(y,4)+"% \t"+pad(unambiguousBases,12));
+ }
+ tswStats.println("ambiguous: \t"+padPercent(ambiguousFound,4)+"% \t"+pad(duplicateBestAlignment2,9)+
+ " \t"+padPercent(ambiguousBasesFound,4)+"% \t"+pad(duplicateBestAlignmentBases2,12));
+ tswStats.println("low-Q discards: \t"+padPercent(lowQualityReadsDiscardedPercent,4)+"% \t"+pad(lowQualityReadsDiscarded2,9)+
+ " \t"+padPercent(lowQualityBasesDiscardedPercent,4)+"% \t"+pad(lowQualityBasesDiscarded2,12));
+
+ tswStats.println();
+ tswStats.println("perfect best site:\t"+padPercent(perfectMatchPercent,4)+"% \t"+pad(perfectMatch2,9)+
+ " \t"+padPercent(perfectMatchPercentBases,4)+"% \t"+pad(perfectMatchBases2,12));
+ tswStats.println("semiperfect site:\t"+padPercent(semiperfectMatchPercent,4)+"% \t"+pad(semiperfectMatch2,9)+
+ " \t"+padPercent(semiperfectMatchPercentBases,4)+"% \t"+pad(semiperfectMatchBases2,12));
+ if(paired){
+ tswStats.println("rescued: \t"+padPercent(rescuedPB+rescuedMB,4)+"% \t"+pad(rescuedP2+rescuedM2,9));
+ }
+
+ if(MAKE_MATCH_STRING){
+
+ tswStats.println();
+// tswStats.println(" \tpct reads\tnum reads \tpct bases\t num bases");
+ tswStats.println("Match Rate: \t NA \t NA \t"+padPercent(matchRate,4)+"% \t"+pad(matchCountM2,12));
+ tswStats.println("Error Rate: \t"+padPercent(readErrorRate,4)+"% \t"+pad(readCountE2,9)+" \t"+padPercent(errorRate,4)+"% \t"+pad(matchErrors,12));
+ tswStats.println("Sub Rate: \t"+padPercent(readSubRate,4)+"% \t"+pad(readCountS2,9)+" \t"+padPercent(subRate,4)+"% \t"+pad(matchCountS2,12));
+ tswStats.println("Del Rate: \t"+padPercent(readDelRate,4)+"% \t"+pad(readCountD2,9)+" \t"+padPercent(delRate,4)+"% \t"+pad(matchCountD2,12));
+ tswStats.println("Ins Rate: \t"+padPercent(readInsRate,4)+"% \t"+pad(readCountI2,9)+" \t"+padPercent(insRate,4)+"% \t"+pad(matchCountI2,12));
+ tswStats.println("N Rate: \t"+padPercent(readNRate,4)+"% \t"+pad(readCountN2,9)+" \t"+padPercent(nRate,4)+"% \t"+pad(matchCountN2,12));
+ if(SamLine.INTRON_LIMIT<Integer.MAX_VALUE){
+ tswStats.println("Splice Rate: \t"+padPercent(readSpliceRate,4)+"% \t"+pad(readCountSplice2,9)+" \t(splices at least "+SamLine.INTRON_LIMIT+" bp)");
+ }
+
+ if(DOUBLE_PRINT_ERROR_RATE){
+ System.err.println();
+ System.err.println(String.format("Match Rate: \t"+(matchRate<10?" ":"")+"%.4f", matchRate)+"% \t"+matchCountM2);
+ System.err.println(String.format("Error Rate: \t"+(errorRate<10?" ":"")+"%.4f", errorRate)+"% \t"+matchErrors);
+ System.err.println(String.format("Sub Rate: \t"+(subRate<10?" ":"")+"%.4f", subRate)+"% \t"+matchCountS2);
+ System.err.println(String.format("Del Rate: \t"+(delRate<10?" ":"")+"%.4f", delRate)+"% \t"+matchCountD2);
+ System.err.println(String.format("Ins Rate: \t"+(insRate<10?" ":"")+"%.4f", insRate)+"% \t"+matchCountI2);
+ System.err.println(String.format("N Rate: \t"+(nRate<10?" ":"")+"%.4f", nRate)+"% \t"+matchCountN2);
+ }
+ }
+
+ if(SYNTHETIC){
+ tswStats.println();
+ tswStats.println("true positive: \t"+padPercent(truePositiveStrict,4)+"%\t(loose: "+padPercent(truePositiveLoose,4)+"%)");
+ tswStats.println("false positive: \t"+padPercent(falsePositiveB,4)+"%\t(loose: "+padPercent(falsePositiveLooseB,4)+"%)");
+ tswStats.println("false negative: \t"+padPercent(noHitPercent,4)+"%");
+ tswStats.println("SNR: \t"+padPercent(snrStrict,4)+" \t(loose: "+padPercent(snrLoose,4)+")");
+ if(verbose_stats>0){
+ tswStats.println("correctLowHit: \t"+padPercent(correctLowHitPercent,4)+"%");
+ tswStats.println(String.format("Plus/Minus ratio:\t %2.4f", truePositivePMRatio));
+ }
+
+ if(paired){
+ tswStats.println("correct pairs: \t"+padPercent(truePositivePairedB,4)+"%\t(of mated)");
+ tswStats.println("correct singles: \t"+padPercent(truePositiveSoloB,4)+"%");
+ tswStats.println("correct rescued: \t"+padPercent(truePositiveRescuedB,4)+"%");
+ }
+
+ if(SKIMMER){
+ tswStats.println("found all correct:\t"+padPercent(rateCapturedAllCorrect,4)+"%)");
+ tswStats.println("all correct top: \t"+padPercent(rateCapturedAllTop,4)+"%)");
+ tswStats.println("all correct only: \t"+padPercent(rateCapturedAllOnly,4)+"%)");
+ }
+ }
+ }
+ errorState|=tswStats.poisonAndWait();
+
+ if(BBSplitter.TRACK_SCAF_STATS){
+ BBSplitter.printCounts(BBSplitter.SCAF_STATS_FILE, BBSplitter.scafCountTable, true, readsUsed1+readsUsed2, nzoStats, sortStats);
+ }
+
+ if(BBSplitter.TRACK_SET_STATS){
+ BBSplitter.printCounts(BBSplitter.SET_STATS_FILE, BBSplitter.setCountTable, true, readsUsed1+readsUsed2, nzoStats, sortStats);
+ }
+
+ ReadStats.writeAll();
+ if(pile!=null){
+ CoveragePileup.overwrite=overwrite;
+ CoveragePileup.append=append;
+ pile.printOutput();
+ }
+
+ assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1==maxReads) :
+ "\nThe number of reads out does not add up to the number of reads in.\nThis may indicate that a mapping thread crashed." +
+ "\nIf you submit a bug report, include the entire console output, not just this error message.\n"+
+ truePositiveP1+"+"+truePositiveM1+"+"+falsePositive1+"+"+noHit1+"+"+lowQualityReadsDiscarded1+" = "+
+ (truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1)+" != "+maxReads;
+ if(!SKIMMER){
+ assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctMultiHit1+correctUniqueHit1);
+ }else{
+ assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctUniqueHit1);
+ }
+ }
+
+
+ static void printOutput_Machine(final AbstractMapThread[] mtts, final Timer t, final int keylen, final boolean paired, final boolean SKIMMER,
+ final CoveragePileup pile, boolean nzoStats, boolean sortStats, String dest){
+ if(dest==null){dest="stderr.txt";}
+ TextStreamWriter tswStats=new TextStreamWriter(dest, overwrite, append, false);
+ tswStats.start();
+
+ long readsUsed1=0;
+ long readsUsed2=0;
+ long lowQualityReadsDiscarded1=0;
+ long lowQualityReadsDiscarded2=0;
+ long lowQualityBasesDiscarded1=0;
+ long lowQualityBasesDiscarded2=0;
+
+ long msaIterationsLimited=0;
+ long msaIterationsUnlimited=0;
+
+ long basesUsed1=0;
+ long basesUsed2=0;
+ long basesAtQuickmap=0;
+ long keysUsed=0;
+ long bothUnmapped=0;
+ long bothUnmappedBases=0;
+
+ long syntheticReads=0;
+ long numMated=0;
+ long badPairs=0;
+ long innerLengthSum=0;
+ long outerLengthSum=0;
+ long insertSizeSum=0;
+
+ long callsToScore=0;
+ long callsToExtend=0;
+ long initialKeys=0;
+ long initialKeyIterations=0;
+ long usedKeys=0;
+ long usedKeyIterations=0;
+
+ long[] hist_hits=new long[41];
+ long[] hist_hits_score=new long[41];
+ long[] hist_hits_extend=new long[41];
+
+ long initialSiteSum1=0;
+ long postTrimSiteSum1=0;
+ long postRescueSiteSum1=0;
+ long siteSum1=0;
+ long topSiteSum1=0;
+
+ long matchCountS1=0;
+ long matchCountI1=0;
+ long matchCountD1=0;
+ long matchCountM1=0;
+ long matchCountN1=0;
+
+
+ long mapped1=0;
+ long mappedRetained1=0;
+ long rescuedP1=0;
+ long rescuedM1=0;
+ long truePositiveP1=0;
+ long truePositiveM1=0;
+ long falsePositive1=0;
+ long totalCorrectSites1=0;
+ long firstSiteCorrectP1=0;
+ long firstSiteCorrectM1=0;
+ long firstSiteIncorrect1=0;
+ long firstSiteCorrectLoose1=0;
+ long firstSiteIncorrectLoose1=0;
+ long firstSiteCorrectPaired1=0;
+ long firstSiteCorrectSolo1=0;
+ long firstSiteCorrectRescued1=0;
+ long perfectHit1=0; //Highest score is max score
+ long uniqueHit1=0; //Only one hit has highest score
+ long correctUniqueHit1=0; //unique highest hit on answer site
+ long correctMultiHit1=0; //non-unique highest hit on answer site (non-skimmer only)
+ long correctLowHit1=0; //hit on answer site, but not highest scorer
+ long noHit1=0;
+ long perfectMatch1=0; //Highest slow score is max slow score
+ long semiperfectMatch1=0;
+ long perfectMatchBases1=0;
+ long semiperfectMatchBases1=0;
+ long perfectHitCount1=0;
+ long semiPerfectHitCount1=0;
+ long duplicateBestAlignment1=0;
+
+ long totalNumCorrect1=0; //Only for skimmer
+ long totalNumIncorrect1=0; //Only for skimmer
+ long totalNumIncorrectPrior1=0; //Only for skimmer
+ long totalNumCapturedAllCorrect1=0; //Only for skimmer
+ long totalNumCapturedAllCorrectTop1=0; //Only for skimmer
+ long totalNumCapturedAllCorrectOnly1=0; //Only for skimmer
+
+ long initialSiteSum2=0;
+ long postTrimSiteSum2=0;
+ long postRescueSiteSum2=0;
+ long siteSum2=0;
+ long topSiteSum2=0;
+
+ long mapped2=0;
+ long mappedRetained2=0;
+ long rescuedP2=0;
+ long rescuedM2=0;
+ long truePositiveP2=0;
+ long truePositiveM2=0;
+ long falsePositive2=0;
+ long totalCorrectSites2=0;
+ long firstSiteCorrectP2=0;
+ long firstSiteCorrectM2=0;
+ long firstSiteIncorrect2=0;
+ long firstSiteCorrectLoose2=0;
+ long firstSiteIncorrectLoose2=0;
+ long firstSiteCorrectPaired2=0;
+ long firstSiteCorrectSolo2=0;
+ long firstSiteCorrectRescued2=0;
+ long perfectHit2=0; //Highest score is max score
+ long perfectHitCount2=0;
+ long semiPerfectHitCount2=0;
+
+ long uniqueHit2=0; //Only one hit has highest score
+ long correctUniqueHit2=0; //unique highest hit on answer site
+ long correctMultiHit2=0; //non-unique highest hit on answer site (non-skimmer only)
+ long correctLowHit2=0; //hit on answer site, but not highest scorer
+ long noHit2=0;
+ long perfectMatch2=0; //Highest slow score is max slow score
+ long semiperfectMatch2=0;
+ long perfectMatchBases2=0;
+ long semiperfectMatchBases2=0;
+ long duplicateBestAlignment2=0;
+
+ long totalNumCorrect2=0; //Only for skimmer
+ long totalNumIncorrect2=0; //Only for skimmer
+ long totalNumIncorrectPrior2=0; //Only for skimmer
+ long totalNumCapturedAllCorrect2=0; //Only for skimmer
+ long totalNumCapturedAllCorrectTop2=0; //Only for skimmer
+ long totalNumCapturedAllCorrectOnly2=0; //Only for skimmer
+
+ long matchCountS2=0;
+ long matchCountI2=0;
+ long matchCountD2=0;
+ long matchCountM2=0;
+ long matchCountN2=0;
+
+ readsUsed1=0;
+ for(int i=0; i<mtts.length; i++){
+ AbstractMapThread mtt=mtts[i];
+
+ if(mtt.msa!=null){
+ msaIterationsLimited+=mtt.msa.iterationsLimited;
+ msaIterationsUnlimited+=mtt.msa.iterationsUnlimited;
+ }
+// if(mtt.tcr!=null){
+// if(mtt.tcr.msaBS!=null){
+// msaIterationsLimited+=mtt.tcr.msaBS.iterationsLimited;
+// msaIterationsUnlimited+=mtt.tcr.msaBS.iterationsUnlimited;
+// }
+// if(mtt.tcr.msaCS!=null){
+// msaIterationsLimited+=mtt.tcr.msaCS.iterationsLimited;
+// msaIterationsUnlimited+=mtt.tcr.msaCS.iterationsUnlimited;
+// }
+// }
+
+ readsUsed1+=mtt.readsUsed1;
+ readsUsed2+=mtt.readsUsed2;
+ syntheticReads+=mtt.syntheticReads;
+ numMated+=mtt.numMated;
+ badPairs+=mtt.badPairs;
+ innerLengthSum+=mtt.innerLengthSum;
+ outerLengthSum+=mtt.outerLengthSum;
+ insertSizeSum+=mtt.insertSizeSum;
+ basesUsed1+=mtt.basesUsed1;
+ basesUsed2+=mtt.basesUsed2;
+ keysUsed+=mtt.keysUsed;
+ bothUnmapped+=mtt.bothUnmapped;
+ bothUnmappedBases+=mtt.bothUnmappedBases;
+
+ mapped1+=mtt.mapped1;
+ mappedRetained1+=mtt.mappedRetained1;
+ rescuedP1+=mtt.rescuedP1;
+ rescuedM1+=mtt.rescuedM1;
+ lowQualityReadsDiscarded1+=mtt.lowQualityReadsDiscarded1;
+ truePositiveP1+=mtt.truePositiveP1;
+ truePositiveM1+=mtt.truePositiveM1;
+ falsePositive1+=mtt.falsePositive1;
+// System.err.println("Adding "+mtt.falsePositive+" false positives -> "+falsePositive);
+ totalCorrectSites1+=mtt.totalCorrectSites1;
+
+ firstSiteCorrectP1+=mtt.firstSiteCorrectP1;
+ firstSiteCorrectM1+=mtt.firstSiteCorrectM1;
+ firstSiteIncorrect1+=mtt.firstSiteIncorrect1;
+ firstSiteCorrectLoose1+=mtt.firstSiteCorrectLoose1;
+ firstSiteIncorrectLoose1+=mtt.firstSiteIncorrectLoose1;
+ firstSiteCorrectPaired1+=mtt.firstSiteCorrectPaired1;
+ firstSiteCorrectSolo1+=mtt.firstSiteCorrectSolo1;
+ firstSiteCorrectRescued1+=mtt.firstSiteCorrectRescued1;
+
+ perfectHit1+=mtt.perfectHit1; //Highest score is max score
+ perfectHitCount1+=mtt.perfectHitCount1;
+ semiPerfectHitCount1+=mtt.semiPerfectHitCount1;
+ uniqueHit1+=mtt.uniqueHit1; //Only one hit has highest score
+ correctUniqueHit1+=mtt.correctUniqueHit1; //unique highest hit on answer site
+ correctMultiHit1+=mtt.correctMultiHit1; //non-unique highest hit on answer site
+ correctLowHit1+=mtt.correctLowHit1; //hit on answer site, but not highest scorer
+ noHit1+=mtt.noHit1;
+
+ totalNumCorrect1+=mtt.totalNumCorrect1; //Skimmer only
+ totalNumIncorrect1+=mtt.totalNumIncorrect1; //Skimmer only
+ totalNumIncorrectPrior1+=mtt.totalNumIncorrectPrior1; //Skimmer only
+ totalNumCapturedAllCorrect1+=mtt.totalNumCapturedAllCorrect1; //Skimmer only
+ totalNumCapturedAllCorrectTop1+=mtt.totalNumCapturedAllCorrectTop1; //Skimmer only
+ totalNumCapturedAllCorrectOnly1+=mtt.totalNumCapturedAllCorrectOnly1; //Skimmer only
+
+ perfectMatch1+=mtt.perfectMatch1; //Highest slow score is max slow score
+ semiperfectMatch1+=mtt.semiperfectMatch1; //A semiperfect mapping was found
+ perfectMatchBases1+=mtt.perfectMatchBases1;
+ semiperfectMatchBases1+=mtt.semiperfectMatchBases1;
+
+ duplicateBestAlignment1+=mtt.ambiguousBestAlignment1;
+
+ initialSiteSum1+=mtt.initialSiteSum1;
+ postTrimSiteSum1+=mtt.postTrimSiteSum1;
+ postRescueSiteSum1+=mtt.postRescueSiteSum1;
+ siteSum1+=mtt.siteSum1;
+ topSiteSum1+=mtt.topSiteSum1;
+
+ AbstractIndex index=mtt.index();
+ callsToScore+=index.callsToScore;
+ callsToExtend+=index.callsToExtendScore;
+ initialKeys+=index.initialKeys;
+ initialKeyIterations+=index.initialKeyIterations;
+ usedKeys+=index.usedKeys;
+ usedKeyIterations+=index.usedKeyIterations;
+
+ for(int j=0; j<index.hist_hits.length; j++){
+ int x=Tools.min(hist_hits.length-1, j);
+ hist_hits[x]+=index.hist_hits[j];
+ hist_hits_score[x]+=index.hist_hits_score[j];
+ hist_hits_extend[x]+=index.hist_hits_extend[j];
+ }
+
+ matchCountS1+=mtt.matchCountS1;
+ matchCountI1+=mtt.matchCountI1;
+ matchCountD1+=mtt.matchCountD1;
+ matchCountM1+=mtt.matchCountM1;
+ matchCountN1+=mtt.matchCountN1;
+
+ mapped2+=mtt.mapped2;
+ mappedRetained2+=mtt.mappedRetained2;
+ rescuedP2+=mtt.rescuedP2;
+ rescuedM2+=mtt.rescuedM2;
+ lowQualityReadsDiscarded2+=mtt.lowQualityReadsDiscarded2;
+ truePositiveP2+=mtt.truePositiveP2;
+ truePositiveM2+=mtt.truePositiveM2;
+ falsePositive2+=mtt.falsePositive2;
+// System.err.println("Adding "+mtt.falsePositive+" false positives -> "+falsePositive);
+ totalCorrectSites2+=mtt.totalCorrectSites2;
+
+ firstSiteCorrectP2+=mtt.firstSiteCorrectP2;
+ firstSiteCorrectM2+=mtt.firstSiteCorrectM2;
+ firstSiteIncorrect2+=mtt.firstSiteIncorrect2;
+ firstSiteCorrectLoose2+=mtt.firstSiteCorrectLoose2;
+ firstSiteIncorrectLoose2+=mtt.firstSiteIncorrectLoose2;
+ firstSiteCorrectPaired2+=mtt.firstSiteCorrectPaired2;
+ firstSiteCorrectSolo2+=mtt.firstSiteCorrectSolo2;
+ firstSiteCorrectRescued2+=mtt.firstSiteCorrectRescued2;
+
+ perfectHit2+=mtt.perfectHit2; //Highest score is max score
+ perfectHitCount2+=mtt.perfectHitCount2;
+ semiPerfectHitCount2+=mtt.semiPerfectHitCount2;
+ uniqueHit2+=mtt.uniqueHit2; //Only one hit has highest score
+ correctUniqueHit2+=mtt.correctUniqueHit2; //unique highest hit on answer site
+ correctMultiHit2+=mtt.correctMultiHit2; //non-unique highest hit on answer site
+ correctLowHit2+=mtt.correctLowHit2; //hit on answer site, but not highest scorer
+ noHit2+=mtt.noHit2;
+
+ totalNumCorrect2+=mtt.totalNumCorrect2; //Skimmer only
+ totalNumIncorrect2+=mtt.totalNumIncorrect2; //Skimmer only
+ totalNumIncorrectPrior2+=mtt.totalNumIncorrectPrior2; //Skimmer only
+ totalNumCapturedAllCorrect2+=mtt.totalNumCapturedAllCorrect2; //Skimmer only
+ totalNumCapturedAllCorrectTop2+=mtt.totalNumCapturedAllCorrectTop2; //Skimmer only
+ totalNumCapturedAllCorrectOnly2+=mtt.totalNumCapturedAllCorrectOnly2; //Skimmer only
+
+ perfectMatch2+=mtt.perfectMatch2; //Highest slow score is max slow score
+ semiperfectMatch2+=mtt.semiperfectMatch2; //A semiperfect mapping was found
+ perfectMatchBases1+=mtt.perfectMatchBases1;
+ semiperfectMatchBases1+=mtt.semiperfectMatchBases1;
+
+ duplicateBestAlignment2+=mtt.ambiguousBestAlignment2;
+
+ initialSiteSum2+=mtt.initialSiteSum2;
+ postTrimSiteSum2+=mtt.postTrimSiteSum2;
+ postRescueSiteSum2+=mtt.postRescueSiteSum2;
+ siteSum2+=mtt.siteSum2;
+ topSiteSum2+=mtt.topSiteSum2;
+
+ matchCountS2+=mtt.matchCountS2;
+ matchCountI2+=mtt.matchCountI2;
+ matchCountD2+=mtt.matchCountD2;
+ matchCountM2+=mtt.matchCountM2;
+ matchCountN2+=mtt.matchCountN2;
+
+ }
+ maxReads=readsUsed1;
+ if(syntheticReads>0){SYNTHETIC=true;}
+
+ t.stop();
+ long nanos=t.elapsed;
+
+ if(verbose_stats>1){
+ StringBuilder sb=new StringBuilder(1000);
+ sb.append("\n\n###################\n#hits\tcount\tscore\textend\n");
+ for(int i=0; i<hist_hits.length; i++){
+ sb.append(i+"\t"+hist_hits[i]+"\t"+hist_hits_score[i]+"\t"+hist_hits_extend[i]+"\n");
+ }
+ try {
+ ReadWrite.writeString(sb, "hist_hits.txt", true);
+ } catch (Throwable e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ final long basesUsed=(basesUsed1+basesUsed2);
+
+ final double invTrials=1d/maxReads;
+ final double invTrials100=100d/maxReads;
+ double invSites100=100d/siteSum1;
+
+ final double matedPercent=(numMated*invTrials100);
+ ReadStats.matedPercent=matedPercent;
+ final double badPairsPercent=(badPairs*invTrials100);
+ final double innerLengthAvg=(innerLengthSum*1d/numMated);
+ final double outerLengthAvg=(outerLengthSum*1d/numMated);
+ final double insertSizeAvg=(insertSizeSum*1d/numMated);
+
+ final double readsPerSecond=((readsUsed1+readsUsed2)*1000000000d)/nanos;
+ final double fragsPerSecond=(keysUsed*1000000000d)/nanos;
+ final double kiloBasesPerSecond=(basesUsed*1000000d)/nanos;
+
+ double perfectHitPercent=(perfectHit1*invTrials100); //Highest score is max score
+ double perfectMatchPercent=(perfectMatch1*invTrials100);
+ double semiperfectMatchPercent=(semiperfectMatch1*invTrials100);
+
+ double perfectHitCountPercent=perfectHitCount1*invSites100;
+ double semiPerfectHitCountPercent=semiPerfectHitCount1*invSites100;
+
+ double uniqueHitPercent=(uniqueHit1*invTrials100); //Only one hit has highest score
+ double correctUniqueHitPercent=(correctUniqueHit1*invTrials100); //unique highest hit on answer site
+ double correctMultiHitPercent=(correctMultiHit1*invTrials100); //non-unique highest hit on answer site
+ double correctLowHitPercent=(correctLowHit1*invTrials100); //hit on answer site, but not highest scorer
+ double ambiguousFound=(duplicateBestAlignment1*invTrials100);
+ double correctHighHitPercent=((correctMultiHit1+correctUniqueHit1)*invTrials100);
+ double correctHitPercent=((correctLowHit1+correctMultiHit1+correctUniqueHit1)*invTrials100);
+
+ double mappedB=(mapped1*invTrials100);
+ double mappedRetainedB=(mappedRetained1*invTrials100);
+ double rescuedPB=(rescuedP1*invTrials100);
+ double rescuedMB=(rescuedM1*invTrials100);
+ double falsePositiveB=(firstSiteIncorrect1*invTrials100);
+ double falsePositiveLooseB=(firstSiteIncorrectLoose1*invTrials100);
+ double truePositivePB=(firstSiteCorrectP1*invTrials100);
+ double truePositiveMB=(firstSiteCorrectM1*invTrials100);
+ double truePositiveStrict=((firstSiteCorrectP1+firstSiteCorrectM1)*invTrials100);
+ double truePositiveLoose=(firstSiteCorrectLoose1*invTrials100);
+ double snrStrict=10*Math.log10((firstSiteCorrectM1+firstSiteCorrectP1+0.1)/(firstSiteIncorrect1+0.1));
+ double snrLoose=10*Math.log10((firstSiteCorrectLoose1+0.1)/(firstSiteIncorrectLoose1+0.1));
+ double truePositivePMRatio=(truePositivePB/truePositiveMB);
+ double truePositivePairedB=(firstSiteCorrectPaired1*100d/numMated);
+ double truePositiveSoloB=(firstSiteCorrectSolo1*100d/(mappedRetained1-numMated));
+ double truePositiveRescuedB=(firstSiteCorrectRescued1*100d/(rescuedP1+rescuedM1));
+ double noHitPercent=(noHit1*invTrials100);
+
+ long mappedReads, unambiguousReads;
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ mappedReads=mappedRetained1+duplicateBestAlignment1;
+ unambiguousReads=mappedRetained1;
+ }else{
+ mappedReads=mappedRetained1;
+ unambiguousReads=mappedRetained1-duplicateBestAlignment1;
+ }
+
+ double avgNumCorrect=(SKIMMER ? totalNumCorrect1*invTrials : (totalCorrectSites1/(1d*(truePositiveP1+truePositiveM1))));
+ double avgNumIncorrect=totalNumIncorrect1*invTrials; //Skimmer only
+ double avgNumIncorrectPrior=totalNumIncorrectPrior1*invTrials; //Skimmer only
+
+ double rateCapturedAllCorrect=totalNumCapturedAllCorrect1*invTrials100; //Skimmer only
+ double rateCapturedAllTop=totalNumCapturedAllCorrectTop1*invTrials100; //Skimmer only
+ double rateCapturedAllOnly=totalNumCapturedAllCorrectOnly1*invTrials100; //Skimmer only
+
+ double avgCallsToScore=(callsToScore*invTrials);
+ double avgCallsToExtendScore=(callsToExtend*invTrials);
+ double avgInitialKeys=(initialKeys*1d/initialKeyIterations);
+ double avgUsedKeys=(usedKeys*1d/usedKeyIterations);
+
+ double avgInitialSites=(initialSiteSum1*invTrials);
+ double avgPostTrimSites=(postTrimSiteSum1*invTrials);
+ double avgPostRescueSites=(postRescueSiteSum1*invTrials);
+ double avgSites=(siteSum1*invTrials);
+ double avgPerfectSites=(perfectHitCount1*invTrials);
+ double avgSemiPerfectSites=(semiPerfectHitCount1*invTrials);
+ double avgTopSites=(topSiteSum1*invTrials);
+ double lowQualityReadsDiscardedPercent=(lowQualityReadsDiscarded1*invTrials100);
+
+ long matchErrors=matchCountS1+matchCountI1+matchCountD1;
+ long baseLen=matchCountM1+matchCountI1+matchCountS1+matchCountN1;
+ long matchLen=matchCountM1+matchCountI1+matchCountS1+matchCountN1+matchCountD1;
+ long refLen=matchCountM1+matchCountS1+matchCountN1+matchCountD1;
+ double errorRate=matchErrors*100d/matchLen;
+ double matchRate=matchCountM1*100d/matchLen;//baseLen;
+ double subRate=matchCountS1*100d/matchLen;//baseLen;
+ double delRate=matchCountD1*100d/matchLen;
+ double insRate=matchCountI1*100d/matchLen;//baseLen;
+ double nRate=matchCountN1*100d/matchLen;//baseLen;
+
+ if(SYNTHETIC && verbose_stats==-1){verbose_stats=Tools.max(verbose_stats,9);}
+
+ tswStats.println("Reads_Used"+DELIMITER+(readsUsed1+readsUsed2));
+ tswStats.println("Bases_Used"+DELIMITER+(basesUsed));
+ tswStats.println(String.format("Reads/sec"+DELIMITER+"%.2f", readsPerSecond));
+ tswStats.println(String.format("kBases/sec"+DELIMITER+"%.2f", kiloBasesPerSecond));
+ double milf=msaIterationsLimited*invTrials;
+ double milu=msaIterationsUnlimited*invTrials;
+ if(verbose_stats>=1){tswStats.println("MSA_iterations"+DELIMITER+String.format("%.2fL + %.2fU = %.2f", milf,milu,milf+milu));}
+
+// tswStats.println();
+// tswStats.println("\nRead 1 data:");
+
+ tswStats.println();
+
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ double x=ambiguousFound+mappedRetainedB;
+ tswStats.println("R1_Mapped_Percent"+DELIMITER+padPercentMachine(x,4)+"%");
+ tswStats.println("R1_Unambiguous_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%");
+ tswStats.println("R1_Mapped_Reads"+DELIMITER+mappedReads);
+ tswStats.println("R1_Unambiguous_Reads"+DELIMITER+unambiguousReads);
+ }else{
+ double x=mappedRetainedB-ambiguousFound;
+ tswStats.println("R1_Mapped_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%");
+ tswStats.println("R1_Unambiguous_Percent"+DELIMITER+padPercentMachine(x,4)+"%");
+ tswStats.println("R1_Mapped_Reads"+DELIMITER+mappedReads);
+ tswStats.println("R1_Unambiguous_Reads"+DELIMITER+unambiguousReads);
+ }
+
+ tswStats.println();
+ if(paired){
+ tswStats.println(String.format("Mated_Pairs"+DELIMITER+"%.4f%%", matedPercent));
+ tswStats.println(String.format("Bad_Pairs"+DELIMITER+"%.3f%%", badPairsPercent));
+ }
+ if(paired){
+ tswStats.println(String.format("R1_Rescued"+DELIMITER+"%.3f", rescuedPB+rescuedMB)+"%");
+ tswStats.println(String.format("Avg_Insert_Size"+DELIMITER+"%.2f", insertSizeAvg));
+ }
+ tswStats.println();
+ tswStats.println(String.format("R1_Perfect_Best_Site"+DELIMITER+"%.4f", perfectMatchPercent)+"%");
+ tswStats.println(String.format("R1_Semiperfect_Site"+DELIMITER+"%.4f", semiperfectMatchPercent)+"%");
+ tswStats.println(String.format("R1_Ambiguous_Mapping"+DELIMITER+"%.4f", ambiguousFound)+"%");
+// +(REMOVE_DUPLICATE_BEST_ALIGNMENTS ? " (Removed)" : " (Kept)"));
+ tswStats.println(String.format("R1_Low_Quality_Discards"+DELIMITER+"%.4f", lowQualityReadsDiscardedPercent)+"%");
+
+ if(MAKE_MATCH_STRING){
+ tswStats.println();
+ tswStats.println("R1_Match_Rate"+DELIMITER+padPercentMachine(matchRate,4)+"%");
+ tswStats.println("R1_Error_Rate"+DELIMITER+padPercentMachine(errorRate,4)+"%");
+ tswStats.println("R1_Sub_Rate"+DELIMITER+padPercentMachine(subRate,4)+"%");
+ tswStats.println("R1_Del_Rate"+DELIMITER+padPercentMachine(delRate,4)+"%");
+ tswStats.println("R1_Ins_Rate"+DELIMITER+padPercentMachine(insRate,4)+"%");
+ tswStats.println("R1_N_Rate"+DELIMITER+padPercentMachine(nRate,4)+"%");
+
+ tswStats.println("R1_Match_Count"+DELIMITER+matchCountM1);
+ tswStats.println("R1_Error_Count"+DELIMITER+matchErrors);
+ tswStats.println("R1_Sub_Count"+DELIMITER+matchCountS1);
+ tswStats.println("R1_Del_Count"+DELIMITER+matchCountD1);
+ tswStats.println("R1_Ins_Count"+DELIMITER+matchCountI1);
+ tswStats.println("R1_N_Count"+DELIMITER+matchCountN1);
+ }
+
+ if(paired){
+ invSites100=100d/siteSum2;
+
+ perfectHitPercent=perfectHit2*invTrials100; //Highest score is max score
+ perfectMatchPercent=perfectMatch2*invTrials100;
+ semiperfectMatchPercent=semiperfectMatch2*invTrials100;
+
+ perfectHitCountPercent=perfectHitCount2*invSites100;
+ semiPerfectHitCountPercent=semiPerfectHitCount2*invSites100;
+
+ uniqueHitPercent=uniqueHit2*invTrials100; //Only one hit has highest score
+ correctUniqueHitPercent=correctUniqueHit2*invTrials100; //unique highest hit on answer site
+ correctMultiHitPercent=correctMultiHit2*invTrials100; //non-unique highest hit on answer site
+ correctLowHitPercent=correctLowHit2*invTrials100; //hit on answer site, but not highest scorer
+ ambiguousFound=(duplicateBestAlignment2*invTrials100);
+ correctHighHitPercent=(correctMultiHit2+correctUniqueHit2)*invTrials100;
+ correctHitPercent=(correctLowHit2+correctMultiHit2+correctUniqueHit2)*invTrials100;
+
+ mappedB=(mapped2*invTrials100);
+ mappedRetainedB=(mappedRetained2*invTrials100);
+ rescuedPB=(rescuedP2*invTrials100);
+ rescuedMB=(rescuedM2*invTrials100);
+ falsePositiveB=(firstSiteIncorrect2*invTrials100);
+ falsePositiveLooseB=(firstSiteIncorrectLoose2*invTrials100);
+ truePositivePB=(firstSiteCorrectP2*invTrials100);
+ truePositiveMB=(firstSiteCorrectM2*invTrials100);
+ truePositiveStrict=((firstSiteCorrectP2+firstSiteCorrectM2)*invTrials100);
+ truePositiveLoose=(firstSiteCorrectLoose2*invTrials100);
+ snrStrict=10*Math.log10((firstSiteCorrectM2+firstSiteCorrectP2+0.1)/(firstSiteIncorrect2+0.1));
+ snrLoose=10*Math.log10((firstSiteCorrectLoose2+0.1)/(firstSiteIncorrectLoose2+0.1));
+ truePositivePMRatio=(truePositivePB/truePositiveMB);
+ truePositivePairedB=(firstSiteCorrectPaired2*100d/numMated);
+ truePositiveSoloB=(firstSiteCorrectSolo2*100d/(mappedRetained2-numMated));
+ truePositiveRescuedB=(firstSiteCorrectRescued2*100d/(rescuedP2+rescuedM2));
+ avgNumCorrect=(totalCorrectSites2/(1d*(truePositiveP2+truePositiveM2)));
+ noHitPercent=noHit2*invTrials100;
+
+ avgNumCorrect=(SKIMMER ? totalNumCorrect2*invTrials : (totalCorrectSites2/(1d*(truePositiveP2+truePositiveM2))));
+ avgNumIncorrect=totalNumIncorrect1*invTrials; //Skimmer only
+ avgNumIncorrectPrior=totalNumIncorrectPrior1*invTrials; //Skimmer only
+
+ rateCapturedAllCorrect=totalNumCapturedAllCorrect2*invTrials100; //Skimmer only
+ rateCapturedAllTop=totalNumCapturedAllCorrectTop2*invTrials100; //Skimmer only
+ rateCapturedAllOnly=totalNumCapturedAllCorrectOnly2*invTrials100; //Skimmer only
+
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ mappedReads=mappedRetained2+duplicateBestAlignment2;
+ unambiguousReads=mappedRetained2;
+ }else{
+ mappedReads=mappedRetained2;
+ unambiguousReads=mappedRetained2-duplicateBestAlignment2;
+ }
+
+ avgInitialSites=initialSiteSum2*invTrials;
+ avgPostTrimSites=postTrimSiteSum2*invTrials;
+ avgPostRescueSites=postRescueSiteSum2*invTrials;
+ avgSites=siteSum2*invTrials;
+ avgPerfectSites=(perfectHitCount1*invTrials);
+ avgSemiPerfectSites=(semiPerfectHitCount1*invTrials);
+ avgTopSites=topSiteSum2*invTrials;
+ lowQualityReadsDiscardedPercent=lowQualityReadsDiscarded2*invTrials100;
+
+ matchErrors=matchCountS2+matchCountI2+matchCountD2;
+ baseLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2;
+ matchLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2+matchCountD2;
+ refLen=matchCountM2+matchCountS2+matchCountN2+matchCountD2;
+ errorRate=matchErrors*100d/matchLen;
+ matchRate=matchCountM2*100d/matchLen;//baseLen;
+ subRate=matchCountS2*100d/matchLen;//baseLen;
+ delRate=matchCountD2*100d/matchLen;
+ insRate=matchCountI2*100d/matchLen;//baseLen;
+ nRate=matchCountN2*100d/matchLen;//baseLen;
+
+// tswStats.println("\n\nRead 2 data:");
+ tswStats.println();
+// tswStats.println(String.format("perfectHit"+DELIMITER+"%.2f", perfectHitPercent)+"%");
+// tswStats.println(String.format("uniqueHit"+DELIMITER+"%.2f", uniqueHitPercent)+"%");
+// tswStats.println(String.format("correctUniqueHit"+DELIMITER+"%.2f", correctUniqueHitPercent)+"%");
+// tswStats.println(String.format("correctMultiHit"+DELIMITER+"%.2f", correctMultiHitPercent)+"%");
+// tswStats.println(String.format("correctHighHit"+DELIMITER+"%.2f", correctHighHitPercent)+"%");
+// tswStats.println(String.format("correctHit"+DELIMITER+"%.2f", correctHitPercent)+"%");
+
+ //tswStats.println(String.format("mapped"+DELIMITER+(mappedB<10?" ":"")+"%.3f", mappedB)+"%");
+ if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){
+ double x=ambiguousFound+mappedRetainedB;
+ tswStats.println("R2_Mapped_Percent"+DELIMITER+padPercentMachine(x,4)+"%");
+ tswStats.println("R2_Unambiguous_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%");
+ tswStats.println("R2_Mapped_Reads"+DELIMITER+mappedReads);
+ tswStats.println("R2_Unambiguous_Reads"+DELIMITER+unambiguousReads);
+ }else{
+ double x=mappedRetainedB-ambiguousFound;
+ tswStats.println("R2_Mapped_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%");
+ tswStats.println("R2_Unambiguous_Percent"+DELIMITER+padPercentMachine(x,4)+"%");
+ tswStats.println("R2_Mapped_Reads"+DELIMITER+mappedReads);
+ tswStats.println("R2_Unambiguous_Reads"+DELIMITER+unambiguousReads);
+ }
+ tswStats.println();
+ if(paired){
+ tswStats.println(String.format("R2_Rescued"+DELIMITER+"%.3f", rescuedPB+rescuedMB)+"%");
+ }
+ tswStats.println();
+ tswStats.println(String.format("R2_Perfect_Best_Site"+DELIMITER+"%.4f", perfectMatchPercent)+"%");
+ tswStats.println(String.format("R2_Semiperfect_Site"+DELIMITER+"%.4f", semiperfectMatchPercent)+"%");
+ tswStats.println(String.format("R2_Ambiguous_Mapping"+DELIMITER+"%.4f", ambiguousFound)+"%");
+ //(REMOVE_DUPLICATE_BEST_ALIGNMENTS ? "(Removed)" : "(Kept)"));
+ tswStats.println(String.format("R2_Low_Quality_Discards"+DELIMITER+"%.4f", lowQualityReadsDiscardedPercent)+"%");
+
+ if(MAKE_MATCH_STRING){
+ tswStats.println();
+ tswStats.println("R2_Match_Rate"+DELIMITER+padPercentMachine(matchRate,4)+"%");
+ tswStats.println("R2_Error_Rate"+DELIMITER+padPercentMachine(errorRate,4)+"%");
+ tswStats.println("R2_Sub_Rate"+DELIMITER+padPercentMachine(subRate,4)+"%");
+ tswStats.println("R2_Del_Rate"+DELIMITER+padPercentMachine(delRate,4)+"%");
+ tswStats.println("R2_Ins_Rate"+DELIMITER+padPercentMachine(insRate,4)+"%");
+ tswStats.println("R2_N_Rate"+DELIMITER+padPercentMachine(nRate,4)+"%");
+
+ tswStats.println("R2_Match_Count"+DELIMITER+matchCountM2);
+ tswStats.println("R2_Error_Count"+DELIMITER+matchErrors);
+ tswStats.println("R2_Sub_Count"+DELIMITER+matchCountS2);
+ tswStats.println("R2_Del_Count"+DELIMITER+matchCountD2);
+ tswStats.println("R2_Ins_Count"+DELIMITER+matchCountI2);
+ tswStats.println("R2_N_Count"+DELIMITER+matchCountN2);
+ }
+ }
+ errorState|=tswStats.poisonAndWait();
+
+ if(BBSplitter.TRACK_SCAF_STATS){
+ BBSplitter.printCounts(BBSplitter.SCAF_STATS_FILE, BBSplitter.scafCountTable, true, readsUsed1+readsUsed2, nzoStats, sortStats);
+ }
+
+ if(BBSplitter.TRACK_SET_STATS){
+ BBSplitter.printCounts(BBSplitter.SET_STATS_FILE, BBSplitter.setCountTable, true, readsUsed1+readsUsed2, nzoStats, sortStats);
+ }
+
+ errorState|=ReadStats.writeAll();
+
+ if(pile!=null){
+ CoveragePileup.overwrite=overwrite;
+ CoveragePileup.append=append;
+ pile.printOutput();
+ }
+
+ assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1==maxReads) :
+ "\nThe number of reads out does not add up to the number of reads in.\nThis may indicate that a mapping thread crashed." +
+ "\nIf you submit a bug report, include the entire console output, not just this error message.\n"+
+ truePositiveP1+"+"+truePositiveM1+"+"+falsePositive1+"+"+noHit1+"+"+lowQualityReadsDiscarded1+" = "+
+ (truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1)+" != "+maxReads;
+ if(!SKIMMER){
+ assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctMultiHit1+correctUniqueHit1);
+ }else{
+ assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctUniqueHit1);
+ }
+ }
+
+ /**
+  * Prints the genome build and the main mapping parameters to sysout.
+  * When MACHINE_OUTPUT is set each line is written as name+DELIMITER+value;
+  * otherwise a human-readable label followed by a tab is used.
+  * @param k Value reported as the key length
+  * @param maxindel Value reported as the max indel
+  * @param minratio Value reported as the minimum score ratio
+  */
+ static final void printSettings0(int k, int maxindel, float minratio){
+     //The mapping mode label is the same for both output formats
+     final String mode;
+     if(PERFECTMODE){
+         mode="perfect";
+     }else if(SEMIPERFECTMODE){
+         mode="semiperfect";
+     }else{
+         mode="normal";
+     }
+
+     if(!MACHINE_OUTPUT){
+         sysout.println("Genome: \t"+Data.GENOME_BUILD);
+         sysout.println("Key Length: \t"+k);
+         sysout.println("Max Indel: \t"+maxindel);
+         sysout.println("Minimum Score Ratio: \t"+minratio);
+         sysout.println("Mapping Mode: \t"+mode);
+         return;
+     }
+
+     sysout.println("Genome"+DELIMITER+Data.GENOME_BUILD);
+     sysout.println("Key_Length"+DELIMITER+k);
+     sysout.println("Max_Indel"+DELIMITER+maxindel);
+     sysout.println("Minimum_Score_Ratio"+DELIMITER+minratio);
+     sysout.println("Mapping_Mode"+DELIMITER+mode);
+ }
+
+
+ /**
+  * Returns the larger argument minus the smaller one, using plain int arithmetic
+  * (so the result wraps around if the true difference does not fit in an int).
+  * @param a First value
+  * @param b Second value
+  * @return a-b when a is greater than b, otherwise b-a
+  */
+ static final int absdif(int a, int b){
+     if(a>b){
+         return a-b;
+     }
+     return b-a;
+ }
+
+ /**
+  * Resets static state to its initial values: maxReads, outputBaseName (regenerated from
+  * System.nanoTime()), every outFile* name, the blacklist, and errorState.
+  */
+ protected static void clearStatics(){
+     maxReads=-1;
+// readsUsed=0;
+// readsUsed2=0;
+// lowQualityReadsDiscarded1=0;
+// lowQualityReadsDiscarded2=0;
+// lowQualityBasesDiscarded1=0;
+// lowQualityBasesDiscarded2=0;
+
+     outputBaseName="readsOut_"+(System.nanoTime()&0x1FFFF);
+     outFile=null;
+     outFile2=null;
+     outFileM=null;
+     outFileM2=null;
+     outFileU=null;
+     outFileU2=null;
+     outFileB=null;
+     outFileB2=null;
+     //Assign the static field. This line used to read "ArrayList<String> blacklist=null;",
+     //which declared a local variable shadowing the field, so the static blacklist was never cleared.
+     blacklist=null;
+
+     errorState=false;
+ }
+
+ /* ------------ Non-static fields ----------- */
+
+
+ // Per-instance mapper settings and stream handles.
+ // NOTE(review): the code that assigns these is outside this chunk; the -1 initializers look like "not set" sentinels — confirm.
+ ConcurrentReadInputStream cris;
+ // NOTE(review): presumably the output streams that go with outFile / outFileM / outFileU / outFileB — confirm where they are opened.
+ ConcurrentReadOutputStream rosA=null, rosM=null, rosU=null, rosB=null;
+
+ float fractionGenomeToExclude=-1;
+ int maxIndel1=-1;
+ int maxIndel2=-1;
+ int minApproxHits=-1;
+ int expectedSites=-1;
+ // Holds one of the AMBIG_* constants declared among the static fields; starts as AMBIG_BEST.
+ int ambigMode=AMBIG_BEST;
+// int ambigMode2=AMBIG_BEST;
+ boolean fast=false;
+ boolean slow=false;
+ boolean vslow=false;
+ boolean verbose=false;
+ boolean rcompMate=false;
+ boolean outputSitesOnly=false;
+ long targetGenomeSize=-1;
+ int ziplevel=-1;
+ int build=1;
+ String reference=null;
+ int keylen=13;
+ int idmodulo=1;
+ float samplerate=1f;
+ double minid=-1;
+ long sampleseed=1;
+ boolean ambiguousRandom=false, ambiguousAll=false;
+ boolean forceanalyze=false;
+// private boolean gunzip=false;
+// private boolean gzip=false;
+// private boolean pigz=false;
+// private boolean unpigz=false;
+ boolean setxs=false, setintron=false;
+ String bamscript=null;
+ // NOTE(review): in1/in2 look like the read input paths and the qf* names like matching quality-file paths — confirm against the argument parser.
+ String in1=null, in2=null, qfin1=null, qfin2=null;
+ String qfout=null, qfout2=null, qfoutM=null, qfoutM2=null, qfoutU=null, qfoutU2=null, qfoutB=null, qfoutB2=null;
+
+ /** Scores below the (max possible alignment score)*(MINIMUM_ALIGNMENT_SCORE_RATIO) will be discarded.
+ * Default: 0.4 ~ 0.5 for clean data against raw PacBio data.
+ * Very sensitive! A value of 0.2 will potentially produce many false positives. */
+ float MINIMUM_ALIGNMENT_SCORE_RATIO;
+
+ float keyDensity;//Normal key density
+ float maxKeyDensity; //For situations where some of the read is too low quality, this is the max for the rest of the read.
+ float minKeyDensity;
+ int maxDesiredKeys; //Don't go above this number of keys except to maintain minKeyDensity.
+
+ /** Additional ref bases on each end of site mapping location in alignment window.
+ * If there are no insertions or deletions, 0 is fine. */
+ int SLOW_ALIGN_PADDING;
+ int SLOW_RESCUE_PADDING;
+ int TIP_SEARCH_DIST;
+
+ /** Class name of MSA to use */
+ String MSA_TYPE;
+ // The two fields below have no initializer here, so they start at the Java defaults (0 and false) until assigned elsewhere.
+ int MAX_SITESCORES_TO_PRINT;
+ boolean PRINT_SECONDARY_ALIGNMENTS;
+
+ /* ------------ Coverage ----------- */
+
+ // Per-instance coverage settings.
+ // NOTE(review): these look like options handed to CoveragePileup, but the code that consumes them is outside this chunk — confirm.
+ CoveragePileup pileup;
+ // NOTE(review): presumably output file names for the individual coverage reports; null appears to mean "not requested" — confirm.
+ String coverageStats=null, coverageBinned=null, coverageBase=null, coverageHist=null, coverageRPKM=null, normcov=null, normcovOverall=null;
+ int coverageMinScaf=0;
+ boolean coveragePhysical=false;
+ boolean cov32bit=false;
+ boolean covBitset=false;
+ boolean covSetbs=false;
+ boolean covArrays=true;
+ boolean covNzo=false;
+ boolean scafNzo=true;
+ boolean sortStats=true;
+ boolean covTwocolumn=false;
+ boolean covKsb=true;
+ boolean covStranded=false;
+ boolean covStartOnly=false;
+ int covBinSize=1000;
+
+
+ /* ------------ Static fields ----------- */
+
+ // Class-level (static) settings and state. Comments added below describe only what code in this file shows.
+
+ // Values for the ambigMode instance field, which is initialized to AMBIG_BEST.
+ static final int AMBIG_BEST=0;
+ static final int AMBIG_TOSS=1;
+ static final int AMBIG_RANDOM=2;
+ static final int AMBIG_ALL=3;
+
+ static int CORRECT_THRESH=0; //Threshold for calculating true positives on synthetic data, or something.
+
+ static int synthReadlen=150;
+
+ static int maxInsLen=30; //Default 40
+ static int maxSubLen=30; //Default 40
+ static int maxDelLen=40; //Default 8000
+
+ static byte minQuality=3;
+ static byte midQuality=23;
+ static byte maxQuality=35;
+
+ static int maxSnps=4;//4;
+ static int maxInss=3;//2;
+ static int maxDels=3;
+ static int maxSubs=3;//2;
+
+ static float baseSnpRate=0.50f;
+ static float baseInsRate=0.30f;
+ static float baseDelRate=0.30f;
+ static float baseSubRate=0.30f;//0.3f;
+ static float PERFECT_READ_RATIO=0.0f;//0.2f;//0.8f
+
+ //Extra work for rare cases in human only.
+ static boolean SAVE_AMBIGUOUS_XY=false;
+
+
+ static boolean TRIM_LIST=true; //Increases speed many times; reduces accuracy a bit
+
+ static boolean PAIRED_RANDOM_READS=false;
+ static boolean REQUIRE_CORRECT_STRANDS_PAIRS=true;
+ static boolean SAME_STRAND_PAIRS=false;
+ static boolean KILL_BAD_PAIRS=false;
+
+ static boolean INDEX_LOADED=false;
+ static final boolean SLOW_ALIGN=true; //Do a more accurate scoring pass with MSA
+ // Initialized from SLOW_ALIGN. When true, the statistics output also includes the R1_/R2_ match, error, sub, del, ins and N rates and counts.
+ static boolean MAKE_MATCH_STRING=SLOW_ALIGN;
+
+ /** Rescue paired reads by searching near mate */
+ static boolean RESCUE=true;
+
+ /** Generally should be set to false unless SLOW_ALIGN==true */
+ static boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+
+ /** Forbid alignments with indels longer than MAX_INDEL */
+ static boolean STRICT_MAX_INDEL=false;
+ /** Don't allow reads to map to their origin location in the reference. Useful for self-correcting reads. */
+ static boolean FORBID_SELF_MAPPING=false;
+ /** Only allow perfect and semiperfect mappings */
+ static boolean SEMIPERFECTMODE=false;
+ /** Only allow perfect mappings */
+ static boolean PERFECTMODE=false;
+ /** Only allow sites with at least this many contiguous matches */
+ static int KFILTER=-1;
+ /** Only allow sites with identity of at least this */
+ static float IDFILTER=0f;
+
+ /** Quality-trim left side of read before mapping */
+ static boolean qtrimLeft=false;
+ /** Quality-trim right side of read before mapping */
+ static boolean qtrimRight=false;
+ /** Restore read to untrimmed state after mapping (and destroy match string) */
+ static boolean untrim=false;
+ /** Trim bases with quality less than or equal to this value */
+ static byte TRIM_QUALITY=6;
+ /** Don't trim reads to be shorter than this */
+ static int minTrimLength=60;
+ /** Produce local alignments instead of global alignments */
+ static boolean LOCAL_ALIGN=false;
+
+ public static int minChrom=1;
+ public static int maxChrom=Integer.MAX_VALUE;
+
+ // Reset to -1 by clearStatics(); the statistics code overwrites it with the summed readsUsed1 before computing percentages.
+ static long maxReads=-1;
+
+ // Gates the end-of-run assertions that check the read counters add up to maxReads.
+ protected static boolean CALC_STATISTICS=true;
+
+ static boolean QUICK_MATCH_STRINGS=false;
+ static boolean OUTPUT_READS=false;
+ static boolean OUTPUT_MAPPED_ONLY=false;
+ static boolean DONT_OUTPUT_BLACKLISTED_READS=false;
+
+ static boolean OUTPUT_ORDERED_READS=false;
+ static boolean DOUBLE_PRINT_ERROR_RATE=false;
+ static boolean PRINT_UNMAPPED_COUNT=false;
+
+ // "readsOut_" followed by the low 17 bits of System.nanoTime(); clearStatics() regenerates it with the same expression.
+ static String outputBaseName="readsOut_"+(System.nanoTime()&0x1FFFF);
+ // Output file names; clearStatics() sets all eight back to null.
+ static String outFile=null;
+ static String outFile2=null;
+ static String outFileM=null;
+ static String outFileM2=null;
+ static String outFileU=null;
+ static String outFileU2=null;
+ static String outFileB=null;
+ static String outFileB2=null;
+ static ArrayList<String> blacklist=null;
+ static ArrayList<String> splitterOutputs=null;
+
+ static boolean useRandomReads=false;
+ static int sequentialOverlap=5;
+ static boolean sequentialStrandAlt=false;
+
+ static boolean overwrite=false;
+ static boolean append=false;
+ // Set to true by the statistics code when the summed syntheticReads counter is greater than 0.
+ static boolean SYNTHETIC=false;
+ static boolean ERROR_ON_NO_OUTPUT=false;
+ // When true, printSettings0 prints name+DELIMITER+value lines instead of the tab-separated labels.
+ static boolean MACHINE_OUTPUT=false;
+ static boolean USE_MODULO=false;
+ static String statsOutputFile="stderr.txt";
+ // Separator placed between a field name and its value in machine-readable output lines.
+ final static String DELIMITER="=";
+
+ // Stream written to by printSettings0; defaults to System.err.
+ static PrintStream sysout=System.err;
+ static boolean SYSIN=false;
+ // In the statistics code: >=1 adds the MSA_iterations line, >1 also writes the hit histogram to hist_hits.txt; -1 is raised to 9 when SYNTHETIC is set.
+ static int verbose_stats=0;
+ static boolean waitForMemoryClear=false;
+
+ // OR-ed with the results of tswStats.poisonAndWait() and ReadStats.writeAll() in the statistics code; cleared by clearStatics().
+ public static boolean errorState=false;
+
+}
diff --git a/current/align2/BBIndex.java b/current/align2/BBIndex.java
new file mode 100755
index 0000000..19083fa
--- /dev/null
+++ b/current/align2/BBIndex.java
@@ -0,0 +1,3302 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+
+/**
+ * Based on Index11f
+ * Index stored in single array per block.
+ *
+ *
+ *
+ * @author Brian Bushnell
+ * @date Dec 22, 2012
+ *
+ */
+public final class BBIndex extends AbstractIndex {
+
+
+ /** Command-line entry point: parses key=value arguments, then builds the index through IndexMaker4.makeIndex.
+ * Recognized keys (compared after lowercasing): build/b, minchrom, maxchrom, keylen/k. Arguments without '=' and unrecognized keys are ignored. */
+ public static void main(String[] args){
+ int keyLength=13; //Used unless overridden by keylen= or k=
+
+ for(final String rawArg : args){
+ final String arg=rawArg.toLowerCase();
+ if(!arg.contains("=")){continue;} //Only key=value arguments are examined
+
+ final String[] parts=arg.split("=");
+ final String name=parts[0];
+ final String value=parts[1];
+ final boolean isBuild=(name.equals("build") || name.equals("b"));
+ final boolean isKeylen=(name.equals("keylen") || name.equals("k"));
+
+ if(isBuild){
+ Data.setGenome(Integer.parseInt(value));
+ }else if(name.equals("minchrom")){
+ MINCHROM=Integer.parseInt(value);
+ }else if(name.equals("maxchrom")){
+ MAXCHROM=Integer.parseInt(value);
+ }else if(isKeylen){
+ keyLength=Integer.parseInt(value);
+ }
+ }
+
+ //A bound that is still -1 at this point is replaced by a default
+ if(MINCHROM==-1){MINCHROM=1;}
+ if(MAXCHROM==-1){
+ assert(Data.numChroms<=Byte.MAX_VALUE) : "TODO";
+ MAXCHROM=Data.numChroms;
+ }
+
+ final String banner="Writing build "+Data.GENOME_BUILD+" "+"BASESPACE index, keylen="+keyLength+", chrom bits="+NUM_CHROM_BITS;
+ System.err.println(banner);
+
+ final int firstChrom=(NUM_CHROM_BITS==0 ? 1 : 0);
+ Data.sysout.println("Loading index for chunk "+firstChrom+"-"+MAXCHROM+", build "+Data.GENOME_BUILD);
+ index=IndexMaker4.makeIndex(Data.GENOME_BUILD, firstChrom, MAXCHROM, keyLength, NUM_CHROM_BITS,
+ MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, true, false, index);
+
+ System.err.println("Finished all chroms, may still be writing.");
+ }
+
+
+ /** Forwards its arguments (plus BASE_HIT_SCORE) to the AbstractIndex constructor, derives scoring constants from BASE_KEY_HIT_SCORE and KEYLEN, and sizes the prescan arrays. */
+ public BBIndex(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_){
+ super(k_, kfilter_, BASE_HIT_SCORE, minChrom_, maxChrom_, msa_);
+ INV_BASE_KEY_HIT_SCORE=1f/BASE_KEY_HIT_SCORE;
+ INDEL_PENALTY=(BASE_KEY_HIT_SCORE/2)-1; //default (HIT_SCORE/2)-1
+ INDEL_PENALTY_MULT=20; //default 20; penalty for indel length
+ MAX_PENALTY_FOR_MISALIGNED_HIT=BASE_KEY_HIT_SCORE-(1+BASE_KEY_HIT_SCORE/8);
+ SCOREZ_1KEY=Z_SCORE_MULT*KEYLEN;
+ //Step through the chromosome range one block at a time, reserving two slots for each block visited
+ int slots=0;
+ for(int c=minChrom; c<=maxChrom; c=((c&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){slots+=2;}
+ cycles=slots;
+ prescoreArray=new int[slots];
+ precountArray=new int[slots];
+ }
+
+ /** Populates the shared index for chromosomes minChrom..maxChrom (inclusive) with key length k; the range may span several blocks.
+ * The requested range is clamped to [1, Data.numChroms] before IndexMaker4.makeIndex is invoked.
+ * Should only be called once in a process. */
+ public static final synchronized void loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid){
+ final int lo=(minChrom<1 ? 1 : minChrom);
+ final int hi=(maxChrom>Data.numChroms ? Data.numChroms : maxChrom);
+ assert(lo<=hi) : lo+", "+hi;
+ Data.sysout.println("Loading index for chunk "+lo+"-"+hi+", build "+Data.GENOME_BUILD);
+ index=IndexMaker4.makeIndex(Data.GENOME_BUILD, lo, hi, k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX,
+ CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index);
+ }
+
+ /** Calculate statistics of index, such as list lengths, and find clumpy keys.
+ * Fills COUNTS with the number of sites per key, summed over the scanned blocks and saturating at Integer.MAX_VALUE,
+ * then derives lengthHistogram, MAX_USABLE_LENGTH, MAX_USABLE_LENGTH2 and Solver.POINTS_PER_SITE from it.
+ * @param minChrom First chromosome to scan
+ * @param maxChrom Last chromosome to scan; normalized with maxChrom(int) before use
+ * @param fractionToExclude Selects the lengthHistogram entries used as usable-length limits (only when REMOVE_FREQUENT_GENOME_FRACTION is set)
+ * @param k Key length; the key space has 1<<(2*k) entries */
+ public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){
+ assert(lengthHistogram==null);
+ assert(COUNTS==null);
+
+ int KEYSPACE=1<<(2*k);
+ COUNTS=new int[KEYSPACE];
+ maxChrom=maxChrom(maxChrom);
+
+ HashMap<Integer, LongM> cmap=new HashMap<Integer, LongM>();
+ //Pass 1: add up list lengths per key across blocks; with REMOVE_CLUMPY, also count adjacent sites at most CLUMPY_MAX_DIST apart
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ Block b=index[chrom];
+ final int[] sites=b.sites;
+ final int[] starts=b.starts;
+ for(int key=0; key<KEYSPACE; key++){
+ long clumps=0;
+ final int start1=starts[key];
+ final int stop1=starts[key+1];
+ final int len1=stop1-start1;
+ COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)len1); //Add as long so the total saturates instead of wrapping negative
+
+ if(REMOVE_CLUMPY){
+ for(int i=start1+1; i<stop1; i++){
+ int dif=sites[i]-sites[i-1];
+ assert(dif!=0);
+ if(dif>0 && dif<=CLUMPY_MAX_DIST){
+ clumps++;
+ }
+ }
+ if(clumps>0){
+ final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k)); //Clumps are tallied under the smaller of key and its reverse complement
+ final Integer ko=x;
+ LongM lm=cmap.get(ko);
+ if(lm==null){
+ lm=new LongM(0);
+ cmap.put(ko, lm);
+ }
+ lm.increment(clumps);
+ }
+ }
+ }
+ }
+ //Pass 2: give each key and its reverse complement the same combined count
+ for(int key=0; key<COUNTS.length; key++){
+ int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+ if(key<rkey){
+ int x=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)COUNTS[rkey]);
+ COUNTS[key]=COUNTS[rkey]=x;
+ }
+ }
+ //Pass 3: zero the counts of keys whose clump total is large relative to their list length
+ if(REMOVE_CLUMPY){
+ Integer[] keys=cmap.keySet().toArray(new Integer[cmap.size()]);
+ Arrays.sort(keys);
+
+ for(Integer key : keys){
+ long clumps=cmap.get(key).value();
+ long len=COUNTS[key];
+ if((len>CLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){
+ int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+ assert(key<=rkey);
+ assert(key==KeyRing.reverseComplementKey(rkey, k));
+ COUNTS[key]=0;
+ COUNTS[rkey]=0;
+ }
+ }
+ }
+
+ lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2);
+ //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));}
+
+ if(REMOVE_FREQUENT_GENOME_FRACTION){
+ int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1));
+ int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1));
+ MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]);
+ MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]);
+
+ if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);}
+ }
+
+ Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH]));
+ if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;}
+ if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);}
+ assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE;
+ }
+
+
+ /** Returns the filename for the block holding this chrom: the result of IndexMaker4.fname for the range minChrom(chrom)..maxChrom(chrom), key length k and NUM_CHROM_BITS. */
+ public static final String fname(int chrom, int k){
+ return IndexMaker4.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS);
+ }
+
+ /** Returns true if every offset is greater than the one before it; arrays of length 0 or 1 pass trivially. */
+ private static boolean checkOffsets(int[] offsets){
+ boolean ascending=true;
+ for(int next=1; ascending && next<offsets.length; next++){
+ ascending=(offsets[next-1]<offsets[next]);
+ }
+ return ascending;
+ }
+
+ @Deprecated
+ private final int trimExcessHitLists(int[] keys, int[][] hits){
+ //Legacy list-trimming routine, guarded by an assertion that always fails when assertions are enabled. Nulls out entries of hits until the total and average list lengths are within limits taken from lengthHistogram, and returns the number of lists kept.
+ assert(false) : "Needs to be redone because hits are no longer sorted by length.";
+
+ assert(hits.length==keys.length);
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+
+ int sum=0;
+ int initialHitCount=0;
+ //shortest and shortest2 track the smallest and second-smallest nonzero list lengths seen
+ int shortest=Integer.MAX_VALUE-1;
+ int shortest2=Integer.MAX_VALUE;
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=count(key);
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} //Too few lists to trim
+ if(shortest>limit3 && !SLOW){ //Even the shortest list exceeds limit3: discard every list
+ for(int i=0; i<hits.length; i++){hits[i]=null;}
+ return 0;
+ }
+ if(sum<=limit && sum/initialHitCount<=limit2){return initialHitCount;} //Already within both limits
+
+ Pointer[] ptrs=Pointer.loadMatrix(hits);
+// ptrs[0].value/=2;
+// ptrs[ptrs.length-1].value/=2;
+ Arrays.sort(ptrs);
+ //Drop lists starting from the end of the sorted Pointer array until both limits are satisfied
+ int finalHitCount=initialHitCount;
+ for(int i=ptrs.length-1; sum>limit || sum/finalHitCount>limit2; i--){
+ Pointer p=ptrs[i];
+ sum-=hits[p.key].length;
+ hits[p.key]=null;
+ finalHitCount--;
+ }
+
+ return finalHitCount;
+ }
+
+ /** Remove least useful keys to accelerate search. Dropped keys are overwritten with -1 in keys; returns the number of keys that still have a hit list (0 when every key was discarded). */
+ private final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){
+ //Convert key scores to weights by scaling with INV_BASE_KEY_HIT_SCORE
+ float[] keyWeights=getKeyWeightArray(keyScores.length);
+ for(int i=0; i<keyScores.length; i++){
+ keyWeights[i]=keyScores[i]*INV_BASE_KEY_HIT_SCORE;
+ }
+
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+// final int limitS=lengthHistogram[chrom][MAX_SINGLE_LIST_TO_SEARCH];
+
+ int sum=0;
+ int initialHitCount=0;
+ //shortest and shortest2 track the smallest and second-smallest nonzero list lengths seen
+ int shortest=Integer.MAX_VALUE-1;
+ int shortest2=Integer.MAX_VALUE;
+
+// for(int i=0; i<hits.length; i++){
+// if(hits[i]!=null && hits[i].length>limitS){hits[i]=null;}
+// }
+
+ final int[] lengths=getGenericArray(keys.length);
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=count(key);
+ lengths[i]=x;
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+// assert(false) : limit+", "+limit2+", "+limit3+", "+shortest2+", "+shortest+", "+initialHitCount+", "+MIN_APPROX_HITS_TO_KEEP+"\n"+Arrays.toString(lengths);
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} //Too few lists to trim
+ if(shortest>limit3 && !SLOW){ //Even the shortest list exceeds limit3: discard every key
+ for(int i=0; i<keys.length; i++){keys[i]=-1;}
+ return 0;
+ }
+ //Greedy removal: while a limit is exceeded, let Solver.findWorstGreedy pick a list and drop it, stopping early once the pick has a positive value or a length below SMALL_GENOME_LIST
+ int hitsCount=initialHitCount;
+ int worstValue=Integer.MIN_VALUE;
+
+ while(hitsCount>=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ //NOTE(review): the average divides by initialHitCount, not the shrinking hitsCount - confirm this is intended
+ final int[] lists=getGreedyListArray(hitsCount);
+ for(int i=0, j=0; j<lists.length; i++){ //Collect the indices of keys whose list is still present
+ if(lengths[i]>0){
+ lists[j]=i;
+ j++;
+ }
+ }
+
+ Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn);
+ int worstIndex=greedyReturn[0]; //Position within lists, not within keys
+ int worst=lists[worstIndex];
+ worstValue=greedyReturn[1];
+ sum-=lengths[worst];
+
+// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]<excessListLimit)){return hitsCount;}
+ if(worstValue>0 || lengths[worst]<SMALL_GENOME_LIST){return hitsCount;} //This line increases accuracy at expense of speed. Lower constant = more accurate, default 0.
+ hitsCount--;
+ lengths[worst]=0;
+ keys[worst]=-1;
+ }
+ return hitsCount;
+ }
+
+
+ /** Looks up each key in the block at index[chrom]. For a usable key, starts[i]/stops[i] receive the bounds of its site list (stop exclusive); otherwise both are set to -1.
+ * A key is usable when it is non-negative, count(key) lies strictly between 0 and maxLen, and the block holds at least one site for it. Returns the number of usable keys. */
+ private final int getHits(final int[] keys, final int chrom, final int maxLen, final int[] starts, final int[] stops){
+ final Block block=index[chrom];
+ int found=0;
+ for(int slot=0; slot<keys.length; slot++){
+ final int key=keys[slot];
+ starts[slot]=-1; //Marks the slot as having no hit list unless the checks below pass
+ stops[slot]=-1;
+ if(key<0){continue;}
+ final int totalLen=count(key);
+ if(totalLen<=0 || totalLen>=maxLen){continue;}
+ final int blockLen=block.length(key);
+ if(blockLen<=0){continue;}
+ final int begin=block.starts[key];
+ starts[slot]=begin;
+ stops[slot]=begin+blockLen;
+ found++;
+ }
+ return found;
+ }
+
+
+ /** Counts keys that are non-negative and whose count(key) lies strictly between 0 and maxLen.
+ * When clearBadKeys is true, non-negative keys failing that test are overwritten with -1. */
+ private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){
+ int usable=0;
+ for(int pos=0; pos<keys.length; pos++){
+ if(keys[pos]<0){continue;} //Negative keys are neither counted nor rewritten
+ final int listLen=count(keys[pos]);
+ if(listLen>0 && listLen<maxLen){
+ usable++;
+ }else if(clearBadKeys){
+ keys[pos]=-1;
+ }
+ }
+ return usable;
+ }
+
+
+ /** Searches with limits obeyed; if that yields null or an empty list and DOUBLE_SEARCH_NO_HIT is set, searches once more with obeyLimits=false. */
+ public final ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){
+ assert(minChrom<=maxChrom && minChrom>=0);
+ final ArrayList<SiteScore> strict=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id);
+ final boolean retry=(DOUBLE_SEARCH_NO_HIT && (strict==null || strict.isEmpty()));
+ return (retry ? find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id) : strict);
+ }
+
+
+ public final ArrayList<SiteScore> find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){
+ //Whole-read search: builds keys from basesP at offsetsP, discards keys that are absent or too common, optionally prescans the blocks, then searches every block on both strands. Returns the result list, which may be empty.
+ assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN);
+ int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length);
+
+ initialKeys+=offsetsP.length;
+ initialKeyIterations++;
+
+ final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2);
+
+ int numHits=0;
+ numHits=countHits(keysP, maxLen, true);
+
+ if(verbose){
+ System.err.println("initial hits: "+numHits);
+ System.err.println("initial keys: "+keysP.length+"\n"+Arrays.toString(keysP));
+ }
+ //With very few usable keys, restore the original keys and recount under progressively larger length limits
+ if(numHits>0){ //TODO: Change these to higher numbers
+ int trigger=(3*keysP.length)/4;
+ if(numHits<4 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, (maxLen*3)/2, true);
+ }
+ if(numHits<3 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*2, true);
+ }
+ if(numHits<3 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*3, true);
+ }
+ if(numHits<2 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*5, true);
+ }
+ }
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){ //Some keys were cleared to -1: compact offsets, keys and key scores together
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ initialKeys2+=numHits;
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+ if(TRIM_BY_GREEDY && obeyLimits){
+ int maxLists=Tools.max((int)(HIT_FRACTION_TO_RETAIN*keysP.length), MIN_HIT_LISTS_TO_RETAIN);
+ numHits=trimExcessHitListsByGreedy(offsetsP, keyScoresP, maxLists, keysP);
+ }
+// System.out.println("After greedy: numHits = "+numHits);
+
+ if(verbose){
+ System.err.println("final hits: "+numHits);
+ System.err.println("final keys: "+keysP.length+"\n"+Arrays.toString(keysP));
+ }
+
+ if(TRIM_BY_TOTAL_SITE_COUNT && obeyLimits){
+ throw new RuntimeException("Needs to be redone.");
+// numHits=trimExcessHitLists(keys, hits);
+ }
+
+ if(TRIM_LONG_HIT_LISTS && obeyLimits && numHits>MIN_APPROX_HITS_TO_KEEP){
+ int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits);
+
+ int zeroes=keysP.length-numHits;
+ int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1));
+ cutoffIndex=Tools.max(cutoffIndex, altMinIndex);
+
+ assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits;
+
+ if(cutoffIndex<(keysP.length-1)){
+ int[] lens=getGenericArray(keysP.length);
+ for(int i=0; i<keysP.length; i++){lens[i]=count(keysP[i]);} //NOTE(review): keysP may hold -1 entries here if the greedy trim above removed keys - confirm count() tolerates that
+ Arrays.sort(lens);
+ int cutoff=lens[cutoffIndex];
+
+ cutoff=Tools.max(lengthHistogram[MIN_INDEX_TO_DROP_LONG_HIT_LIST], cutoff);
+
+ int removed=0; //Incremented below but never read in this method
+
+ for(int i=0; i<keysP.length; i++){
+ int key=keysP[i];
+ if(count(key)>cutoff){
+ keysP[i]=-1;
+ removed++;
+ numHits--;
+ }
+ }
+ }
+ }
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+ final ArrayList<SiteScore> result=new ArrayList<SiteScore>(8);
+ if(numHits<MIN_APPROX_HITS_TO_KEEP){return result;}
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){ //Compact again after the trimming steps above
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ assert(keysP.length==numHits);
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ //Reverse the offsets for minus-strand mapping, since they are generated based on quality
+ int[] offsetsM=KeyRing.reverseOffsets(offsetsP, KEYLEN, basesP.length);
+ if(verbose){
+ System.err.println("Reversed offsets: \n"+Arrays.toString(offsetsP)+" ->\n"+Arrays.toString(offsetsM));
+ }
+ final int[] keysM=KeyRing.reverseComplementKeys(keysP, KEYLEN);
+
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+// assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM);
+
+ assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length)));
+ assert(keyScoresP!=null);
+ assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP);
+ final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1));
+ final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1));
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+ assert(offsetsM.length==offsetsP.length);
+ assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM));
+
+ /*
+ * bestScores:
+ *
+ * bestScores[0] currentTopScore
+ * bestScores[1] maxHits
+ * bestScores[2] qcutoff
+ * bestScores[3] bestqscore
+ * bestScores[4] maxQuickScore
+ * bestScores[5] perfectsFound
+ */
+ final int[] bestScores=new int[6];
+
+ //This prevents filtering by qscore when a low-quality read only uses a few keys.
+ //In that case, extending is more important.
+ final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5);
+
+ int[][] prescanResults=null;
+ int[] precounts=null;
+ int[] prescores=null;
+
+ int hitsCutoff=0;
+ int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore);
+ //allBasesCovered: the first key starts at base 0, the last key ends at the final base, and consecutive keys leave no gap between them
+ boolean allBasesCovered=true;
+ {
+ if(offsetsP[0]!=0){allBasesCovered=false;}
+ else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;}
+ else{
+ for(int i=1; i<offsetsP.length; i++){
+ if(offsetsP[i]>offsetsP[i-1]+KEYLEN){
+ allBasesCovered=false;
+ break;
+ }
+ }
+ }
+ }
+
+ //TODO I don't understand this logic
+ final boolean pretendAllBasesAreCovered=//false;
+ (allBasesCovered ||
+ keysP.length>=keysOriginal.length-4 ||
+ (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f))));
+
+// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP));
+// assert(allBasesCovered);
+
+ if(prescan_qscore){
+ prescanResults=prescanAllBlocks(bestScores,
+ keysP, keyScoresP, offsetsP,
+ keysM, keyScoresM, offsetsM,
+ pretendAllBasesAreCovered);
+
+ if(prescanResults!=null){
+ precounts=prescanResults[0];
+ prescores=prescanResults[1];
+ }
+
+ if(bestScores[1]<MIN_APPROX_HITS_TO_KEEP){return result;}
+ if(bestScores[3]<maxQuickScore*MIN_QSCORE_MULT2){return result;} //if(bestScores[3]<maxQuickScore(offsetsP, keyScoresP)*.10f){return result;}
+
+ if(bestScores[3]>=maxQuickScore && pretendAllBasesAreCovered){
+ assert(bestScores[3]==maxQuickScore);
+ assert(bestScores[1]==numHits);
+
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }else{
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH));
+ }
+ }
+
+ final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true);
+ final boolean fullyDefined=AminoAcid.isFullyDefined(basesP);
+ assert(bestScores[2]<=0) : Arrays.toString(bestScores);
+ //Visit every block; each block uses two consecutive cycles (plus strand, then minus strand). A block/strand is skipped only when a prescan ran and both its hit count and its score fall below the cutoffs.
+ int cycle=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS,
+ offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human
+ }
+ cycle++;
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS,
+ offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human
+ }
+ cycle++;
+ }
+
+// assert(Read.CHECKSITES(result, basesP, basesM, id, false)); //TODO: Comment out once checked
+
+ return result;
+ }
+
+ /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. Raises bestScores[1] (max hits) and bestScores[3] (best qscore) and returns prescanReturn holding {counts, scores}, each indexed by cycle (two per block: plus strand, then minus strand). */
+ private final int[][] prescanAllBlocks(int[] bestScores,
+ int[] keysP, int[] keyScoresP, int[] offsetsP,
+ int[] keysM, int[] keyScoresM, int[] offsetsM,
+ final boolean allBasesCovered){
+
+ int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; //pm[0] is the plus-strand triple, pm[1] the minus-strand triple
+
+ int bestqscore=0;
+ int maxHits=0;
+ int minHitsToScore=MIN_APPROX_HITS_TO_KEEP;
+
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+ final int[] counts=precountArray;
+ final int[] scores=prescoreArray;
+ final int[][] ret=prescanReturn;
+ Arrays.fill(counts, keysP.length);
+ Arrays.fill(scores, maxQuickScore);
+ ret[0]=counts;
+ ret[1]=scores;
+ //Entries start at their maximum (keysP.length, maxQuickScore), so any cycle not reached because of the early return below keeps those values
+ int cycle=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ final int baseChrom=baseChrom(chrom);
+ for(int pmi=0; pmi<2; pmi++, cycle++){
+
+ int[] keys=pm[pmi][0];
+ int[] keyScores=pm[pmi][1];
+ int[] offsets=pm[pmi][2];
+// int[][] hits=getHitArray(offsets.length);
+
+ int[] starts=startArray;
+ int[] stops=stopArray;
+ int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops);
+
+ if(numHits<minHitsToScore){ //Too few keys hit in this block/strand: record sentinel values
+ scores[cycle]=-9999;
+ counts[cycle]=0;
+ }else{
+
+// final int maxQuickScore=maxQuickScore(offsets, keyScores);
+ // System.err.println("maxScore = "+maxScore);
+
+ if(numHits<keys.length){ //Drop the keys that have no hits in this block before scoring
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+ heap.clear();
+ final Quad[] triples=tripleStorage;
+ final int[] values=valueArray;
+
+ int[] temp=findMaxQscore2(starts, stops, offsets, keyScores, baseChrom, triples, values, minHitsToScore, true,
+ bestqscore>=maxQuickScore && allBasesCovered);
+
+ scores[cycle]=temp[0]; //temp[0] is stored as this cycle's score, temp[1] as its hit count
+ counts[cycle]=temp[1];
+
+ bestqscore=Tools.max(temp[0], bestqscore);
+ maxHits=Tools.max(maxHits, temp[1]);
+ if(bestqscore>=maxQuickScore && allBasesCovered){
+ assert(bestqscore==maxQuickScore);
+ assert(maxHits==keysP.length) :
+ "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" +
+ "Scores: \t"+Arrays.toString(scores)+
+ "Counts: \t"+Arrays.toString(counts)+
+ "bestqscore: \t"+bestqscore+
+ "maxHits: \t"+maxHits+
+ "maxQuickScore: \t"+maxQuickScore+
+ "numHits: \t"+numHits+
+ "minHitsToScore: \t"+minHitsToScore+
+ "keys.length: \t"+keys.length;
+
+ minHitsToScore=Tools.max(minHitsToScore, maxHits);
+
+ {
+ //This early exit is optional. Does not seem to impact speed much either way.
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+ return ret;
+ }
+ }
+ }
+ }
+ }
+
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} //Not executed when the early return above is taken
+
+ return ret;
+ }
+
+
+ /** Search a single block and strand.
+ * Gathers the hit range of every key with getHits; if fewer than MIN_APPROX_HITS_TO_KEEP keys hit, ssl is returned as passed in.
+ * Otherwise the work is delegated to camelWalk3, slowWalk3 or slowWalk2, as selected by USE_SLOWALK3 and USE_CAMELWALK,
+ * and that walker's return value is returned. */
+ public final ArrayList<SiteScore> find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores,
+ final int chrom, final byte strand,
+ int[] offsets, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores,
+ final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+
+ assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+ //hitStarts[i]: index of the first location of key i
+ final int[] hitStarts=startArray;
+ //hitStops[i]: index just past the last location of key i
+ final int[] hitStops=stopArray;
+
+ final int keysWithHits=getHits(keys, chrom, Integer.MAX_VALUE, hitStarts, hitStops);
+ if(keysWithHits<MIN_APPROX_HITS_TO_KEEP){return ssl;}
+
+ if(!USE_SLOWALK3){
+ //Combining USE_CAMELWALK with this path is marked "TODO" and fails the assertion
+ assert(!USE_CAMELWALK) : "TODO";
+ return slowWalk2(hitStarts, hitStops, bases, baseScores, keyScores, offsets,
+ chrom, strand, obeyLimits, ssl, fullyDefined);
+ }
+
+ //Both remaining walkers take bestScores, which is zeroed first unless RETAIN_BEST_SCORES is set
+ if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);}
+
+ if(USE_CAMELWALK){
+ return camelWalk3(hitStarts, hitStops, bases, baseScores, keyScores, offsets,
+ chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+
+ return slowWalk3(hitStarts, hitStops, bases, baseScores, keyScores, offsets,
+ chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+
+ /** Compacts the hit arrays, keeping only the entries i (i<len) whose starts[i] is non-negative.
+ * Returns null when the number kept equals offsets.length. Otherwise returns shrinkReturn3 holding [0] starts and [1] stops (written into the shared startArray/stopArray),
+ * [2] offsets (array obtained from getOffsetArray) and [4] key scores (newly allocated). Slot [3] is not written here. */
+ private final int[][] shrink(int[] starts, int[] stops, int[] offsets, int[] keyScores, final int len){
+ int kept=0;
+ for(int i=0; i<len; i++){
+ if(starts[i]>=0){kept++;}
+ }
+ if(kept==offsets.length){return null;}
+
+ final int[][] out=shrinkReturn3;
+ final int[] packedStarts=startArray;
+ final int[] packedStops=stopArray;
+ final int[] packedOffsets=getOffsetArray(kept);
+ final int[] packedScores=new int[kept];
+
+ int dst=0;
+ for(int src=0; src<len; src++){
+ if(starts[src]<0){continue;}
+ packedStarts[dst]=starts[src];
+ packedStops[dst]=stops[src];
+ packedOffsets[dst]=offsets[src];
+ packedScores[dst]=keyScores[src];
+ dst++;
+ }
+
+ out[0]=packedStarts;
+ out[1]=packedStops;
+ out[2]=packedOffsets;
+ out[4]=packedScores;
+ return out;
+ }
+
+ /** Removes "-1" keys: builds copies of offsets, keys and keyScores containing only the positions where keys[i] is non-negative.
+ * Returns null when no key is negative; otherwise returns shrinkReturn2 with [0] offsets (array obtained from getOffsetArray), [1] keys and [2] key scores (both newly allocated). */
+ private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){
+ int kept=0;
+ for(final int key : keys){
+ if(key>=0){kept++;}
+ }
+
+ assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+ if(kept==keys.length){return null;}
+
+ final int[][] out=shrinkReturn2;
+ final int[] packedOffsets=getOffsetArray(kept);
+ //The destination is expected to be a different, shorter array than the input
+ assert(packedOffsets!=offsets);
+ assert(packedOffsets.length<offsets.length);
+ final int[] packedKeys=new int[kept];
+ final int[] packedScores=new int[kept];
+
+ //Copy the surviving entries in their original order
+ int dst=0;
+ for(int src=0; src<keys.length; src++){
+ if(keys[src]<0){continue;}
+ packedOffsets[dst]=offsets[src];
+ packedKeys[dst]=keys[src];
+ packedScores[dst]=keyScores[src];
+ dst++;
+ }
+
+ assert(checkOffsets(packedOffsets)) : "\nnumHits="+kept+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(packedOffsets)+"\n"+
+ "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(packedKeys)+"\n";
+
+ out[0]=packedOffsets;
+ out[1]=packedKeys;
+ out[2]=packedScores;
+ return out;
+ }
+
+
+ /** This uses a heap to track next column to increment */
+ private final ArrayList<SiteScore> slowWalk2(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, final boolean fullyDefined){
+
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true);
+// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets));
+// System.err.println("maxScore = "+maxScore);
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f));
+// final int minScore=(int)(MIN_SCORE_MULT*maxScore);
+// System.err.println("minScore = "+minScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+
+ heap.clear();
+ final Quad[] triples=tripleStorage;
+
+ final Block b=index[baseChrom];
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=-999999999;
+
+ int cutoff=minScore;
+
+ int maxHits=0;
+ int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println();
+
+ SiteScore prevSS=null;
+ while(!heap.isEmpty()){
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+ int approxHits=0;
+
+ {//Inner loop
+ final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int score;
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ if(USE_EXTENDED_SCORE){
+ final int chrom=numberToChrom(site, baseChrom);
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ if(true/*USE_AFFINE_SCORE*/){
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+
+ chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+
+ Arrays.toString(locArray)+"\n"+
+ Arrays.toString(values)+"\n"+
+ new String(bases)+"\nstrand="+strand+"\n");
+ System.err.println();
+ }
+ score=-99999;
+ }
+
+
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+
+
+// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+
+// if(chrom==17 && absdif(min, 30354420)<2000){
+// System.err.println("\n*****\n");
+// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+
+// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+// System.err.println();
+// System.err.println(Arrays.toString(locArray));
+// System.err.println();
+// System.err.println("chrom="+chrom);
+// System.err.println("score="+score);
+// }
+ }
+ }else{
+ score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ score+=scoreZ;
+ }
+ }
+
+ if(score>=cutoff){
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+
+ // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH));
+ if(USE_EXTENDED_SCORE && score>=maxScore){
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ //Note: I could also do this as soon as score is calculated.
+// if(ADD_SCORE_Z){
+// int scoreZ=scoreZ2(values, centerIndex, offsets);
+// score+=scoreZ;
+// }
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+// SiteScore prevSS=(ssl.size()<1 ? null : ssl.get(ssl.size()-1));
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ gapArray=makeGapArray(locArray, mapStart, MINGAP);
+ if(gapArray!=null){
+ int sub=site2-mapStart;//thus site2=mapStart+sub
+ for(int i=0; i<gapArray.length; i++){
+ gapArray[i]+=sub;
+ }
+ assert(gapArray[0]==mapStart) : gapArray[0]+", "+mapStart;
+ assert(gapArray[gapArray.length-1]==mapStop);
+ }
+ assert(false) : Arrays.toString(locArray);
+ }
+
+ if(gapArray==null && prevSS!=null && prevSS.gaps==null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ int betterScore=Tools.max(score, prevSS.score);
+ int minStart=Tools.min(prevSS.start, site2);
+ int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStop(maxStop);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStart(minStart);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2){
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ ss.gaps=gapArray;
+ if(verbose && gapArray!=null){
+ System.err.println(ss.toText()+"\t"+Arrays.toString(gapArray)+"\n"+Arrays.toString(locArray)+"\n");
+ }
+ }
+
+ if(ss!=null){
+// System.out.println("Added site "+ss.toText());
+ ssl.add(ss);
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+
+// if(prevSS!=null && prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+// int betterScore=Tools.max(score, prevSS.score);
+// if(prevSS.start==site2 && prevSS.stop==site3){
+// prevSS.score=prevSS.quickScore=betterScore;
+// }else if(prevSS.start==site2
+// /*isWithin(prevSS.start, prevSS.stop, site2, site3) ||
+// isWithin(site2, site3, prevSS.start, prevSS.stop)*/){
+// prevSS.score=prevSS.quickScore=betterScore;
+// assert(prevSS.start<prevSS.stop);
+//// prevSS.start=Tools.min(prevSS.start, site2);
+// prevSS.stop=Tools.max(prevSS.stop, site3);
+// assert(prevSS.start<prevSS.stop);
+// }else{
+// SiteScore ss=new SiteScore(chrom, strand, site2, site3, score);
+// ssl.add(ss);
+// prevSS=ss;
+// }
+// }else{
+// SiteScore ss=new SiteScore(chrom, strand, site2, site3, score);
+// ssl.add(ss);
+// prevSS=ss;
+// }
+
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }
+ if(heap.isEmpty()){break;}
+ }
+
+ }
+
+ return ssl;
+ }
+
+ /** This uses a heap to track next column to increment */
+ private final ArrayList<SiteScore> slowWalk3(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl,
+ int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+ assert(USE_EXTENDED_SCORE);
+
+ final int numKeys=offsets.length; //Before shrink
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ usedKeys+=numHits;
+ usedKeyIterations++;
+
+ final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+ final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+ heap.clear();
+
+ final Quad[] triples=tripleStorage;
+
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+ final Block b=index[baseChrom];
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=bestScores[0];
+ int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH));
+
+ int qcutoff=Tools.max(bestScores[2], minQuickScore);
+ int bestqscore=bestScores[3];
+ int maxHits=bestScores[1];
+ int perfectsFound=bestScores[5];
+ assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits+", "+new String(bases);
+ int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore);
+ if(approxHitsCutoff>numHits){return ssl;}
+
+ final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore);
+
+
+ assert(USE_EXTENDED_SCORE);
+
+ if(currentTopScore>=maxScore){
+ assert(currentTopScore==maxScore);
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }
+
+
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff);
+
+
+ SiteScore prevSS=null;
+ while(!heap.isEmpty()){
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+
+ {//Inner loop
+ final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int score;
+ int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ assert(USE_EXTENDED_SCORE);
+
+ boolean locArrayValid=false;
+ if(qscore<qcutoff){
+ score=-1;
+ }else{
+
+ final int chrom=numberToChrom(site, baseChrom);
+
+ //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results).
+ //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run.
+ //May have something to do with compiler loop optimizations.
+ if(shortCircuit && qscore==maxQuickScore){
+ assert(approxHits==numKeys);
+ score=maxScore;
+ }else{
+ if(verbose){
+ System.err.println("numHits="+numHits+", approxHits="+approxHits+", keys="+numKeys+", centerIndex="+centerIndex+
+ ", qscore="+qscore+", qcutoff="+qcutoff+", filter_by_qscore="+filter_by_qscore);
+ System.err.println("Extending "+Arrays.toString(values));
+ }
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ locArrayValid=true;
+
+ if(verbose){
+ System.err.println("score: "+score);
+ System.err.println("locArray: "+Arrays.toString(locArray));
+ }
+
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+ if(score>=maxScore){
+ assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+
+ // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+
+ "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+
+ "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+
+ "numHits="+approxHits+", approxHits="+approxHits+"\n"+
+ "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+
+ "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+
+ "locArray:\t"+Arrays.toString(locArray)+"\n"+
+ "values:\t"+Arrays.toString(values)+"\n"+
+ "bases:\t"+new String(bases));
+ System.err.println();
+ assert(false);
+ }
+ score=-99999;
+ }
+
+ //mapStart and mapStop are indices
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+ if(score>=maxScore){
+ assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+ }
+
+ if(score==maxScore){
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true);
+ }
+
+ if(score>=cutoff){
+ qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH));
+ bestqscore=Tools.max(qscore, bestqscore);
+ }
+ }
+
+ if(score>=cutoff){
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore);
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+ if(score>=maxScore){
+ assert(USE_EXTENDED_SCORE);
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ assert(locArrayValid) : "Loc array was not filled.";
+// System.err.println("****\n"+Arrays.toString(locArray)+"\n");
+// int[] clone=locArray.clone();
+ gapArray=makeGapArray(locArray, site2, MINGAP);
+ if(gapArray!=null){
+// System.err.println(Arrays.toString(locArray)+"\n");
+// System.err.println(Arrays.toString(gapArray));
+//
+//// int sub=site2-mapStart;//thus site2=mapStart+sub
+//// for(int i=0; i<gapArray.length; i++){
+//// gapArray[i]+=sub;
+//// }
+//// System.err.println(Arrays.toString(gapArray));
+//
+// System.err.println(mapStart+" -> "+site2);
+// System.err.println(mapStop+" -> "+site3);
+
+ assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length);
+ assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+
+ mapStart+" -> "+site2+"\n"+
+ mapStop+" -> "+site3+"\n\n"+
+ Arrays.toString(gapArray)+"\n\n"+
+// Arrays.toString(clone)+"\n\n"+
+ Arrays.toString(locArray)+"\n"+
+ "numHits="+numHits+", "+
+ "heap.size="+heap.size()+", "+
+ "numHits="+numHits+", "+
+ "approxHits="+approxHits+"\n";
+ gapArray[0]=Tools.min(gapArray[0], site2);
+ gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3);
+ }
+ if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));}
+// assert(false) : Arrays.toString(locArray);
+ }
+
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+ final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]);
+// if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);}
+
+ if(verbose){
+ System.err.println("Considering site chr"+chrom+", site2="+site2+", site3="+site3+", mapStart="+mapStart+", mapStop="+mapStop);
+ if(!inbounds){System.err.println("Index tossed out-of-bounds site.");}
+ assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop;
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+
+ if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ if(verbose){System.err.println("Considering overlapping site chr"+chrom+", "+site2+"-"+site3);}
+
+ final int betterScore=Tools.max(score, prevSS.score);
+ final int minStart=Tools.min(prevSS.start, site2);
+ final int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ if(verbose){System.err.println("Class 1: Same bounds as last site.");}
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.perfect=(prevSS.perfect || perfect1 || perfect2);
+ if(prevSS.perfect){prevSS.semiperfect=true;}
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){
+ if(verbose){System.err.println("Class 2: Same start as last site.");}
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStop(maxStop);
+ prevSS.setPerfect(bases);
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){
+ if(verbose){System.err.println("Class 3: Same stop as last site.");}
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStart(minStart);
+ prevSS.setPerfect(bases);
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2 && !prevSS.semiperfect){
+ if(verbose){System.err.println("Class 4.");}
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.setPerfect(bases);
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ }else{
+ if(verbose){System.err.println("Class 5: Making new site");}
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+// assert(Read.CHECKSITE(ss, bases));
+ if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else if(inbounds){
+ if(verbose){System.err.println("Considering new site chr"+chrom+", "+site2+"-"+site3);}
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+// assert(Read.CHECKSITE(ss, bases));
+ ss.gaps=gapArray;
+ if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ assert(ss==null || !ss.perfect || ss.semiperfect) : ss;
+ assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS;
+ if(ss!=null && (SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect)){ss=null;}
+
+
+ if(ss!=null){
+// System.out.println("Added site "+ss.toText()+", qscore="+qscore);
+ ssl.add(ss);
+ if(ss.perfect){
+
+ if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){
+ if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));}
+ perfectsFound++;
+
+ //Human-specific code
+// if(QUIT_AFTER_TWO_PERFECTS){
+// if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;}
+// }
+
+ if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;}
+ }
+ }
+
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(heap.size()<approxHitsCutoff || PERFECTMODE){
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound; //***$ fixed by adding this line
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+ if(heap.isEmpty()){
+ assert(false) : heap.size()+", "+approxHitsCutoff;
+ break;
+ }
+ }
+
+ }
+
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound;
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+
+
+ /** Uses dual heaps - reserve (big) and active (small). */
+ private final ArrayList<SiteScore> camelWalk3(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl,
+ int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+ assert(USE_EXTENDED_SCORE);
+
+ final int numKeys=offsets.length; //Before shrink
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ usedKeys+=numHits;
+ usedKeyIterations++;
+
+ final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+ final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+ heap.clear();
+ active.clear();
+
+ final Quad[] triples=tripleStorage;
+
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+ final Block b=index[baseChrom];
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=bestScores[0];
+ int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH));
+
+ int qcutoff=Tools.max(bestScores[2], minQuickScore);
+ int bestqscore=bestScores[3];
+ int maxHits=bestScores[1];
+ int perfectsFound=bestScores[5];
+ assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits;
+ int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore);
+ if(approxHitsCutoff>numHits){return ssl;}
+
+ final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore);
+
+ if(currentTopScore>=maxScore){
+ assert(currentTopScore==maxScore);
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }
+
+
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ assert(numHits>0);
+ assert(heap.size()==numHits);
+
+ /* Tracks largest element allowed in 'active' */
+
+// System.err.println("\nEntering SS loop:");
+// System.err.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.err.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff);
+
+// int iter=0;
+ SiteScore prevSS=null;
+ int maxNearbySite=0;
+ int site=0;
+ int horizon=0;
+ assert(active.isEmpty());
+ while(!heap.isEmpty() || !active.isEmpty()){
+// iter++;
+
+ do{
+ while(!active.isEmpty() && active.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=active.poll();
+ final int row=t2.row+1, col=t2.column;
+
+ //This is called the "increment" operation. Very messy and slow due to rare cases at beginning of a chrom.
+ if(row<stops[col]){//then increment and return to the heap
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ if(a2<=horizon){
+ active.add(t2);
+ maxNearbySite=Tools.max(t2.site, maxNearbySite);
+ }else{heap.add(t2);}
+ }else if((heap.size()+active.size())<approxHitsCutoff || PERFECTMODE){ //Then there are not enough keys remaining for a site
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound; //***$ fixed by adding this line
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+// if(heap.isEmpty() && active.isEmpty()){
+// assert(false) : heap.size()+", "+active.size()+", "+approxHitsCutoff;
+// break;
+// }
+ }
+
+ final Quad t;
+ if(active.isEmpty()){
+ t=heap.poll();
+ active.add(t);
+ maxNearbySite=t.site;
+ }else{
+ t=active.peek();
+ }
+ site=t.site;
+ horizon=(int)Tools.min(Integer.MAX_VALUE, site+(long)MAX_INDEL2);
+ while(!heap.isEmpty() && heap.peek().site<=horizon){
+ Quad t2=heap.poll();
+ active.add(t2);
+ maxNearbySite=t2.site;
+ }
+
+ if(verbose){System.err.println("\nFinished loop iteration. active="+active+"\nheap="+heap+"\nfloor="+site+", horizon="+horizon+", maxNearbySite="+maxNearbySite);}
+
+ }while(active.size()<approxHitsCutoff);
+
+ final int approxHits=active.size();
+ final Quad t=active.peek();
+ final int centerIndex=t.column;
+
+ if(verbose){
+ System.err.println("\nLeft loop. active="+active+"\nheap="+heap+"\n" +
+ "floor="+site+", maxNearbySite="+maxNearbySite+", approxHits="+approxHits+", approxHitsCutoff="+approxHitsCutoff+"\n");
+ }
+
+
+
+// approxHits=0;
+// {//Inner loop
+// final int minsite=site, maxsite=site+MAX_INDEL2;
+// for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+// final int x=values[column];
+// assert(x==triples[column].site);
+// if(x>=minsite && x<=maxsite){
+// maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+// approxHits++;
+// }else{chances--;}
+//// if(verbose){
+//// System.err.println("column="+column+", numHits="+numHits+", approxHits="+approxHits+
+//// ", approxHitsCutoff="+approxHitsCutoff+", chances="+chances);
+//// }
+// }
+// //Invalid assertion due to loop early exit
+//// assert(approxHits>0) : "\niter="+iter+", maxHits="+maxHits+", numHits="+numHits+", approxHitsCutoff="+approxHitsCutoff+
+//// "\nheap.size()="+heap.size()+", minsite="+minsite+", maxsite="+maxsite+", values[center]="+values[centerIndex]+", t="+t;
+// }
+// assert(approxHits<=active.size()) : "approxHits="+approxHits+", active.size()="+active.size()+", maxNearbySite="+maxNearbySite+"\nvalues="+Arrays.toString(values);
+
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+// if(verbose){System.err.println("A");}
+
+ int score;
+ int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ int mapStart=site, mapStop=maxNearbySite;
+ assert(mapStart<=mapStop);
+
+ assert(USE_EXTENDED_SCORE);
+
+ boolean locArrayValid=false;
+ if(qscore<qcutoff){
+ score=-1;
+// if(verbose){System.err.println("B");}
+ }else{
+// if(verbose){System.err.println("C");}
+ final int chrom=numberToChrom(site, baseChrom);
+
+ //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results).
+ //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run.
+ //May have something to do with compiler loop optimizations.
+ if(shortCircuit && qscore==maxQuickScore){
+// if(verbose){System.err.println("D");}
+ assert(approxHits==numKeys);
+ score=maxScore;
+ }else{
+// if(verbose){System.err.println("E");}
+ if(verbose){
+ System.err.println("numHits="+numHits+", approxHits="+approxHits+", qscore="+qscore+", qcutoff="+qcutoff+", filter_by_qscore="+filter_by_qscore);
+ System.err.println("Extending "+Arrays.toString(values));
+ }
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ locArrayValid=true;
+
+ if(verbose){
+ System.err.println("score: "+score);
+ System.err.println("locArray: "+Arrays.toString(locArray));
+ }
+
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+ if(score>=maxScore){
+ assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+
+ // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+
+ "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+
+ "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+
+ "numHits="+approxHits+", approxHits="+approxHits+"\n"+
+ "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+
+ "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+
+ "locArray:\t"+Arrays.toString(locArray)+"\n"+
+ "values:\t"+Arrays.toString(values)+"\n"+
+ "bases:\t"+new String(bases));
+ System.err.println();
+ assert(false);
+ }
+ score=-99999;
+ }
+
+ //mapStart and mapStop are indices
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+ assert(mapStart<=mapStop);
+
+ if(score>=maxScore){
+ assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+ }
+// if(verbose){System.err.println("F");}
+
+ if(score==maxScore){
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true);
+ }
+
+ if(score>=cutoff){
+ qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH));
+ bestqscore=Tools.max(qscore, bestqscore);
+ }
+ }
+// if(verbose){System.err.println("G");}
+
+ if(score>=cutoff){
+// if(verbose){System.err.println("H");}
+
+ if(score>currentTopScore){
+// if(verbose){System.err.println("I");}
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore);
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+ if(score>=maxScore){
+ assert(USE_EXTENDED_SCORE);
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.err.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+// if(verbose){System.err.println("J");}
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "\nchrom="+chrom+", strand="+strand+", site="+site+", maxNearbySite="+maxNearbySite+"\n"+
+ "mapStart="+mapStart+", mapStop="+mapStop+", site2="+site2+", site3="+site3+", read.length="+bases.length+"\n"+
+ "numberToSite("+mapStart+")="+numberToSite(mapStart)+", numberToSite("+mapStop+")="+numberToSite(mapStop)+"\n"+
+ "\n"+new String(bases)+"\n";
+
+ if(verbose){
+ System.err.println("chrom="+chrom+", strand="+strand+", site="+site+", maxNearbySite="+maxNearbySite+"\n"+
+ "mapStart="+mapStart+", mapStop="+mapStop+", site2="+site2+", site3="+site3+", read.length="+bases.length);
+ }
+
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ assert(locArrayValid) : "Loc array was not filled.";
+// System.err.println("****\n"+Arrays.toString(locArray)+"\n");
+// int[] clone=locArray.clone();
+ gapArray=makeGapArray(locArray, site2, MINGAP);
+ if(gapArray!=null){
+// System.err.println(Arrays.toString(locArray)+"\n");
+// System.err.println(Arrays.toString(gapArray));
+//
+//// int sub=site2-mapStart;//thus site2=mapStart+sub
+//// for(int i=0; i<gapArray.length; i++){
+//// gapArray[i]+=sub;
+//// }
+//// System.err.println(Arrays.toString(gapArray));
+//
+// System.err.println(mapStart+" -> "+site2);
+// System.err.println(mapStop+" -> "+site3);
+
+ assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length);
+ assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+
+ mapStart+" -> "+site2+"\n"+
+ mapStop+" -> "+site3+"\n\n"+
+ Arrays.toString(gapArray)+"\n\n"+
+// Arrays.toString(clone)+"\n\n"+
+ Arrays.toString(locArray)+"\n"+
+ "numHits="+numHits+", "+
+ "heap.size="+heap.size()+", "+
+ "numHits="+numHits+", "+
+ "approxHits="+approxHits+"\n";
+ gapArray[0]=Tools.min(gapArray[0], site2);
+ gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3);
+ }
+ if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));}
+// assert(false) : Arrays.toString(locArray);
+ }
+
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+ final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]);
+// if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);}
+
+ if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ final int betterScore=Tools.max(score, prevSS.score);
+ final int minStart=Tools.min(prevSS.start, site2);
+ final int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.perfect=(prevSS.perfect || perfect1 || perfect2);
+ if(prevSS.perfect){prevSS.semiperfect=true;}
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStop(maxStop);
+ prevSS.setPerfect(bases);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStart(minStart);
+ prevSS.setPerfect(bases);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2 && !prevSS.semiperfect){
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.setPerfect(bases);
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+// assert(Read.CHECKSITE(ss, bases));
+ if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else if(inbounds){
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+// assert(Read.CHECKSITE(ss, bases));
+ ss.gaps=gapArray;
+ if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ }
+
+ assert(ss==null || !ss.perfect || ss.semiperfect) : ss;
+ assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS;
+ if(ss!=null && (SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect)){ss=null;}
+
+
+ if(ss!=null){
+// System.err.println("Added site "+ss.toText()+", qscore="+qscore);
+ ssl.add(ss);
+ if(ss.perfect){
+
+ if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){
+ if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));}
+ perfectsFound++;
+
+ //Human-specific code
+// if(QUIT_AFTER_TWO_PERFECTS){
+// if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;}
+// }
+
+ if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;}
+ }
+ }
+
+ prevSS=ss;
+ }else{
+// System.err.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+ }
+ }
+
+// while(!active.isEmpty() && active.peek().site==site){ //Remove all identical elements, and add subsequent elements
+// final Quad t2=active.poll();
+// final int row=t2.row+1, col=t2.column;
+//
+// //This is called the "increment" operation. Very messy and slow due to rare cases at beginning of a chrom.
+// if(row<stops[col]){//then increment and return to the heap
+// t2.row=row;
+//
+// int a=t2.list[row];
+// int a2;
+// if((a&SITE_MASK)>=offsets[col]){
+// a2=a-offsets[col];
+//
+// assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+// "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+// ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+// }else{
+// int ch=numberToChrom(a, baseChrom);
+// int st=numberToSite(a);
+// int st2=Tools.max(st-offsets[col], 0);
+// a2=toNumber(st2, ch);
+//
+// assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+// "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+// ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+// }
+//
+// assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+// "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+// ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+//
+// t2.site=a2;
+// values[col]=a2;
+// if()
+// heap.add(t2);
+// }else if((heap.size()+active.size())<approxHitsCutoff || PERFECTMODE){ //Then there are not enough keys remaining for a site
+// assert(USE_EXTENDED_SCORE);
+// bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+// bestScores[1]=Tools.max(bestScores[1], maxHits);
+// bestScores[2]=Tools.max(bestScores[2], qcutoff);
+// bestScores[3]=Tools.max(bestScores[3], bestqscore);
+//
+// bestScores[4]=maxQuickScore;
+// bestScores[5]=perfectsFound; //***$ fixed by adding this line
+// if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+//
+// return ssl;
+// }
+// if(heap.isEmpty() && active.isEmpty()){
+// assert(false) : heap.size()+", "+active.size()+", "+approxHitsCutoff;
+// break;
+// }
+// }
+
+ }
+
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound;
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+
+
+ /**
+ * Pre-scan of the hit lists for one index block: visits candidate sites in heap order and records the
+ * best quick score (plus scoreZ2 when ADD_SCORE_Z is set) and the largest approximate hit count seen.
+ * No extension against the reference is performed here.
+ * @param starts Per-key index of the first entry to read from the block's site list
+ * @param stops Per-key end index; a key is exhausted once row+1 reaches stops[col]
+ * @param offsets Per-key offset subtracted from each listed site; offsets.length is used as the number of hits
+ * @param keyScores Per-key scores, passed through to quickScore and maxQuickScore
+ * @param baseChrom_ Chromosome number, converted with baseChrom() to select the index block
+ * @param triples Preallocated Quad per key (asserted non-null with column==index); row, site and list are overwritten
+ * @param values Scratch/output array; values[i] is kept equal to triples[i].site
+ * @param prevMaxHits Lower bound for the initial approxHitsCutoff (not used when perfectOnly is true)
+ * @param earlyExit If true, return as soon as maxQuickScore is reached or an exhausted key leaves too few keys in the heap
+ * @param perfectOnly If true, every key must hit (approxHitsCutoff=numHits) and indelCutoff is 0
+ * @return {topQscore, maxHits}; topQscore remains -999999999 if no site reached the hit cutoff
+ */
+ private final int[] findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores,
+ final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits,
+ boolean earlyExit, boolean perfectOnly){
+
+ final int numHits=offsets.length;
+ assert(numHits>=prevMaxHits);
+
+ final int baseChrom=baseChrom(baseChrom_);
+ final Block b=index[baseChrom];
+ final int[] sizes=sizeArray;
+
+ //Seed the heap with the first site of every key's list, shifted left by that key's offset.
+ heap.clear();
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ //The site part is smaller than the offset: decode, clamp the site at 0, and re-encode with the same chrom.
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ int topQscore=-999999999;
+
+ int maxHits=0;
+// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+
+ int approxHitsCutoff;
+ final int indelCutoff;
+ if(perfectOnly){
+ approxHitsCutoff=numHits;
+ indelCutoff=0;
+ }else{
+ approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy
+ indelCutoff=MAX_INDEL2;
+ }
+
+
+ while(!heap.isEmpty()){
+ //NOTE(review): presumably the heap orders Quads by ascending site, so this is the lowest pending site - confirm.
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+ {//Inner loop
+ //Count keys whose current site lies in [minsite, maxsite]; 'chances' lets the scan stop
+ //once more keys have missed than the cutoff can tolerate.
+ final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ if(qscore>topQscore){
+
+// maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan
+
+ //A new best score tightens the cutoff to one less than this site's hit count.
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan
+
+ topQscore=qscore;
+
+ if(qscore>=maxQuickScore){
+ assert(qscore==maxQuickScore);
+ assert(approxHits==numHits);
+ if(earlyExit){
+ return new int[] {topQscore, maxHits};
+ }
+ }
+ }
+ }
+
+ //Advance every key currently sitting at 'site' to its next list entry (same offset/clamp logic as the seeding loop).
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(earlyExit && (perfectOnly || heap.size()<approxHitsCutoff)){
+ //This key's list is exhausted (it is not re-added); with earlyExit, stop if the cutoff can no longer be met.
+ return new int[] {topQscore, maxHits};
+ }
+ //Guard for the peek() in the loop condition: the heap can drain when exhausted keys are dropped.
+ if(heap.isEmpty()){break;}
+ }
+
+ }
+
+
+
+ return new int[] {topQscore, maxHits};
+ }
+
+
+ /** Returns the larger argument minus the smaller one (plain int arithmetic, no overflow handling). */
+ private static final int absdif(int a, int b){
+     if(a>b){return a-b;}
+     return b-a;
+ }
+
+
+ /**
+  * Returns the maximum score for a read under the active scoring mode.
+  * Affine mode asks msa for it; extended mode uses readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5),
+  * plus the sum of baseScores when useQuality is set; otherwise falls back to maxQuickScore.
+  */
+ final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){
+
+     if(USE_AFFINE_SCORE){
+         //The quality-aware variants apparently MUST be used if quality is used later on for slow align.
+         if(useQuality){return msa.maxQuality(baseScores);}
+         return msa.maxQuality(readlen);
+     }
+
+     if(USE_EXTENDED_SCORE){
+         if(useQuality){
+             return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sumInt(baseScores);
+         }
+         return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);
+     }
+
+     return maxQuickScore(offsets, keyScores);
+ }
+
+
+ /**
+  * Upper bound on the quick score for a key set: the sum of all key scores, plus
+  * Y_SCORE_MULT times the distance between the first and last offset, plus maxScoreZ(offsets)
+  * when ADD_SCORE_Z is enabled.
+  */
+ public final int maxQuickScore(int[] offsets, int[] keyScores){
+
+     int keyTotal=Tools.intSum(keyScores);
+     final int lastIndex=offsets.length-1;
+     final int spanTerm=Y_SCORE_MULT*(offsets[lastIndex]-offsets[0]);
+
+     if(ADD_SCORE_Z){keyTotal+=maxScoreZ(offsets);}
+
+     return keyTotal+spanTerm;
+ }
+
+
+ /**
+  * Quick score of the site defined by the key at centerIndex.
+  * With a single approximate hit the result is just that key's score. Otherwise it is the center key's
+  * score plus the scores of aligning leftward and rightward from it, minus centerIndex, plus
+  * Y_SCORE_MULT*scoreY and (when ADD_LIST_SIZE_BONUS is set) the list-size bonus of the center key.
+  * Also increments the hist_hits_score histogram bucket for numApproxHits.
+  */
+ private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[],
+         int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){
+
+     hist_hits_score[Tools.min(HIT_HIST_LEN, numApproxHits)]++;
+
+     final int centerKeyScore=keyScores[centerIndex];
+     if(numApproxHits==1){return centerKeyScore;}
+
+     final int leftScore=scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels);
+     final int rightScore=scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits);
+
+     //"-centerIndex" is a disambiguating term: given otherwise identical match patterns
+     //(for example, a small indel will generate two valid site candidates), the lower site is chosen.
+     int total=centerKeyScore+leftScore+rightScore-centerIndex;
+
+     final int spanScore=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets);
+     if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[centerIndex]);}
+
+     return total+spanScore;
+ }
+
+
+// /** Generates a term that increases score with how many bases in the read match the ref. */
+// private static final int scoreZ(int[] locs, int centerIndex, int offsets[]){
+// final int center=locs[centerIndex];
+//
+// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE];
+//
+// final int maxLoc=center+MAX_INDEL2;
+// final int minLoc=Tools.max(0, center-MAX_INDEL);
+//
+// int score=0;
+//
+// for(int i=0; i<locs.length; i++){
+// int loc=locs[i];
+//// int dif=absdif(loc, center);
+// if(loc>=minLoc && loc<=maxLoc){
+//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+
+//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets);
+//
+// int offset=offsets[i];
+// int max=CHUNKSIZE+offset;
+//
+// for(int j=offset; j<max; j++){
+// int old=refLoc[j];
+// if(old==0){
+// refLoc[j]=loc;
+// score+=4;
+// }else if(old>loc){
+// refLoc[j]=loc;
+// score-=2;
+// }else if(old==loc){
+// score-=1;
+// //do nothing, perhaps, or add 1?
+// }else{
+// score-=2;
+// assert(old<loc);
+// }
+// }
+// }
+// }
+// return score;
+// }
+
+
+
+ /**
+ * Builds a per-read-base location array from the key hits near the center hit, then scores it.
+ * After the fills, locArray[i] holds the reference site aligned to read position 0 by the key credited
+ * with read base i (so the reference position of base i is locArray[i]+i), -1 if no key was credited,
+ * or -2 if the read base is 'N'.
+ * @param bases Read bases
+ * @param baseScores Per-base scores; added per located base in the non-affine path and passed to msa otherwise
+ * @param offsets Per-key offsets within the read
+ * @param values Per-key site numbers; only keys with centerVal-MAX_INDEL <= value <= centerVal+MAX_INDEL2 are used
+ * @param chrom Chromosome whose sequence is fetched with Data.getChromosome
+ * @param centerIndex Index of the key that defines the center site
+ * @param locArray Output array, overwritten; must cover at least bases.length entries (presumably exactly that - confirm with caller)
+ * @param numHits Number of entries of values/offsets to examine
+ * @param numApproxHits Used only for the hist_hits_extend histogram and verbose output
+ * @return msa.calcAffineScore(...) when USE_AFFINE_SCORE is set, otherwise the simple extended score computed at the end
+ */
+ private final int extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values,
+ final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits){
+ callsToExtendScore++;
+ hist_hits_extend[Tools.min(HIT_HIST_LEN, numApproxHits)]++;
+
+ final int centerVal=values[centerIndex];
+ final int centerLoc=numberToSite(centerVal);
+
+ final int minLoc=Tools.max(0, centerLoc-MAX_INDEL); //Legacy, for assertions
+ final int maxLoc=centerLoc+MAX_INDEL2; //Legacy, for assertions
+
+ final int minVal=centerVal-MAX_INDEL;
+ final int maxVal=centerVal+MAX_INDEL2;
+
+ final byte[] ref=Data.getChromosome(chrom).array;
+
+ if(verbose){
+ System.err.println("\n");
+ System.err.println("minLoc="+minLoc+", maxLoc="+ maxLoc+", centerIndex="+centerIndex+", centerVal="+centerVal+", centerLoc="+centerLoc);
+ System.err.println("minVal="+minVal+", maxVal="+ maxVal+", numHits="+numHits+", numApproxHits="+numApproxHits);
+ System.err.println("offsets:\t"+Arrays.toString(offsets));
+ System.err.println("values:\t"+Arrays.toString(values));
+ System.err.println();
+ int centerOffset=offsets[centerIndex];
+
+ for(int i=0; i<centerOffset; i++){System.err.print(" ");}
+ for(int i=centerOffset; i<centerOffset+KEYLEN; i++){System.err.print((char)bases[i]);}
+ System.err.println();
+
+ System.err.println(new String(bases));
+ System.err.println(new String(Arrays.copyOfRange(ref, centerLoc, centerLoc+bases.length)));
+ System.err.println();
+ }
+
+// int[] locArray=new int[bases.length];
+ Arrays.fill(locArray, -1);
+
+
+// if(verbose){
+// System.err.println("Reverse fill:");
+// }
+
+ //First fill in reverse
+ //For each in-window key, walk from the key's last base back toward read position 0, comparing read to ref.
+ for(int i=0, keynum=0; i<numHits; i++){
+ final int value=values[i];
+// if(verbose){System.err.println("values["+i+"]="+value);}
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+// if(verbose){System.err.println("refbase="+refbase);}
+ assert(refbase>=minLoc && refbase<=maxLoc);
+
+ // System.err.println("Reverse: Trying key "+refbase+" @ "+offsets[i]);
+ // System.err.println("Passed!");
+ keynum++;
+ final int callbase=offsets[i];
+
+ int misses=0;
+ //cloc indexes the read, rloc the reference; both step left together.
+ for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc<ref.length; cloc--, rloc--){
+ int old=locArray[cloc];
+ if(old==refbase){
+// if(verbose){System.err.println("Broke because old="+old+", refbase="+refbase);}
+ break;
+ } //Already filled with present value
+ if(misses>0 && old>=0){
+// if(verbose){System.err.println("Broke because old="+old+", misses="+misses);}
+ break;
+ } //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ //Only extends first key all the way back. Others stop at the first error.
+ if(old>=0 || keynum>1){
+// if(verbose){System.err.println("Broke because old="+old+", keynum="+keynum);}
+ break;
+ }
+ }
+ }
+ }
+ }
+
+// if(verbose){
+// System.err.println("locArray:\t"+Arrays.toString(locArray));
+// System.err.println("Forward fill:");
+// }
+
+ //Then fill forward
+ //Starts just past each in-window key (callbase+KEYLEN) and walks right to the end of the read or reference.
+ for(int i=0; i<numHits; i++){
+ final int value=values[i];
+// if(verbose){System.err.println("values["+i+"]="+value);}
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+// if(verbose){System.err.println("refbase="+refbase);}
+ assert(refbase>=minLoc && refbase<=maxLoc);
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN, rloc=refbase+cloc; cloc<bases.length && rloc<ref.length; cloc++, rloc++){
+ int old=locArray[cloc];
+ if(old==refbase){break;} //Already filled with present value
+ if(misses>0 && old>=0){break;} //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ if(old>=0){break;} //Already filled with something that has no errors
+ }
+ }
+ }
+ }
+
+ //Try to subsume out-of-order locs where higher numbers come before lower numbers. Made things worse.
+// for(int i=1; i<locArray.length; i++){
+// final int loc=locArray[i];
+// final int last=locArray[i-1];
+// if(loc<last && last>-1){
+// final byte c=bases[i];
+//// final int rloc1=loc+i;
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(r2==c){
+// locArray[i]=last;
+// }
+// }
+// }
+
+// //Change 'N' to -2. A bit slow.
+// {
+// int firstMatch=0;
+// while(firstMatch<locArray.length && locArray[firstMatch]<0){firstMatch++;}
+// assert(firstMatch<locArray.length) : new String(bases);
+// int last=locArray[firstMatch];
+// for(int i=firstMatch-1; i>=0; i--){
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else{
+// assert(locArray[i]==-1);
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// for(int i=firstMatch; i<locArray.length; i++){
+// final int loc=locArray[i];
+// if(last<1){last=loc;}
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else if(loc==-1 && last>0){
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// }
+
+ //Change 'N' to -2, but only for nocalls, not norefs. Much faster.
+ {
+ final byte nb=(byte)'N';
+ for(int i=0; i<bases.length; i++){
+ if(bases[i]==nb){locArray[i]=-2;}
+ }
+ }
+
+//
+// {
+// int last=locArray[0];
+// for(int i=1; i<locArray.length; i++){
+// final int loc=locArray[i];
+// if(loc>0){
+// if(last<1){last=loc;}
+// }else{
+// if(last>0){
+// final byte c=bases[i];
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(c=='N'){
+//
+// }else if(c==r2){
+// locArray[i]=last;
+// }
+// }
+// last=-1;
+// }
+// }
+// }
+// {
+// int last=locArray[locArray.length-1];
+// for(int i=locArray.length-2; i>=0; i--){
+// final int loc=locArray[i];
+// if(loc>0){
+// if(last<1){last=loc;}
+// }else{
+// if(last>0){
+// final byte c=bases[i];
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(c=='N'){
+//
+// }else if(c==r2){
+// locArray[i]=last;
+// }
+// }
+// last=-1;
+// }
+// }
+// }
+
+// for(int i=locArray.length-2; i>=0; i--){
+// final int loc=locArray[i];
+// final int last=locArray[i+1];
+// if(loc>last && last>-1){
+// final byte c=bases[i];
+//// final int rloc1=loc+i;
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(r2==c){
+// locArray[i]=last;
+// }
+// }
+// }
+
+ //Sanity check, run only in verbose mode: no base of the center key may still be unassigned (-1).
+ if(verbose){
+// System.err.println("locArray:\t"+Arrays.toString(locArray));
+
+ int centerOffset=offsets[centerIndex];
+ int lim=centerOffset+KEYLEN;
+ for(int i=centerOffset; i<lim; i++){
+ assert(locArray[i]!=-1) : " ( "+centerOffset+" < "+i+" < "+lim+" ) "+
+ "\nlocArray: "+Arrays.toString(locArray)+
+ "\nvalues: "+Arrays.toString(values)+
+ "\noffsets: "+Arrays.toString(offsets);
+ }
+ }
+
+ if(USE_AFFINE_SCORE){
+ /* TODO - sometimes returns a higher score than actual alignment. This should never happen. */
+ int score=(KFILTER<2 ? msa.calcAffineScore(locArray, baseScores, bases) :
+ msa.calcAffineScore(locArray, baseScores, bases, KFILTER));
+ return score;
+ }
+
+ //Non-affine path: each located base earns BASE_HIT_SCORE plus its base score, with a bonus when it sits
+ //on the center diagonal and a capped penalty whenever the location changes from the previous located base.
+ int score=0;
+ int lastLoc=-1;
+ int centerBonus=BASE_HIT_SCORE/5;
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+ if(loc>=0){
+ score+=BASE_HIT_SCORE+baseScores[i];
+ if(loc==centerLoc){score+=centerBonus;}
+ if(loc!=lastLoc && lastLoc>=0){
+ int dif=absdif(loc, lastLoc);
+ int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT);
+ score-=penalty;
+ }
+ lastLoc=loc;
+ }
+ }
+
+// System.err.println("Extended score: "+score);
+// System.err.println(Arrays.toString(locArray));
+
+
+ return score;
+ }
+
+
+ /** Finds the gaps longer than minGap between adjusted key locations; returns null when there are none.
+ * WARNING: locArray is overwritten (and possibly sorted) in place, so pass a copy if it is still needed. */
+ private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){
+ final int n=locArray.length;
+ boolean outOfOrder=false;
+
+ //Fill in undefined (negative) entries from the left neighbor; shift defined entries by their index.
+ if(locArray[0]<0){locArray[0]=minLoc;}
+ for(int idx=1; idx<n; idx++){
+ final int left=locArray[idx-1];
+ if(locArray[idx]>=0){locArray[idx]+=idx;}
+ else{locArray[idx]=left+1;}
+ outOfOrder|=(locArray[idx]<left);
+ }
+ //Sorting is only needed when some adjusted location precedes its left neighbor.
+ if(outOfOrder){Arrays.sort(locArray);}
+
+ //First pass: count neighbor pairs that are farther apart than minGap.
+ int numGaps=0;
+ for(int idx=1; idx<n; idx++){
+ final int delta=locArray[idx]-locArray[idx-1];
+ assert(delta>=0);
+ if(delta>minGap){numGaps++;}
+ }
+ if(numGaps<1){return null;}
+
+ //Layout: first location, then (last location before gap, first location after gap) per gap, then last location.
+ final int[] result=new int[2+numGaps*2];
+ result[0]=locArray[0];
+ result[result.length-1]=locArray[n-1];
+
+ //Second pass: record both edges of every gap.
+ for(int idx=1, cursor=1; idx<n; idx++){
+ final int delta=locArray[idx]-locArray[idx-1];
+ assert(delta>=0);
+ if(delta>minGap){
+ result[cursor]=locArray[idx-1];
+ result[cursor+1]=locArray[idx];
+ cursor+=2;
+ }
+ }
+ return result;
+ }
+
+
+ /**
+ * Z-score term: counts the read bases covered by keys whose hits fall near the center location, times Z_SCORE_MULT.
+ * A hit is near when center-MAX_INDEL (floored at 0) <= loc <= center+MAX_INDEL2. Returns SCOREZ_1KEY when numApproxHits is 1.
+ */
+ private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){
+ if(numApproxHits==1){return SCOREZ_1KEY;}
+
+ final int center=locs[centerIndex];
+ final int lowest=Tools.max(0, center-MAX_INDEL);
+ final int highest=center+MAX_INDEL2;
+
+ //[runStart, runStop) is the current run of overlapping or adjacent key intervals in read coordinates.
+ //The run logic visits offsets in array order, so it expects them to be ascending.
+ int runStart=-1, runStop=-1;
+ int covered=0;
+
+ for(int i=0; i<numHits; i++){
+ final int loc=locs[i];
+ if(loc<lowest || loc>highest){continue;}
+
+ final int keyStart=offsets[i];
+ if(keyStart>runStop){
+ //This key begins past the current run, so bank the run's length and open a new one.
+ covered+=runStop-runStart;
+ runStart=keyStart;
+ }
+ runStop=keyStart+KEYLEN;
+ }
+
+ covered+=runStop-runStart; //Bank the final run (adds 0 if no hit qualified)
+ final int score=covered*Z_SCORE_MULT;
+ //scoreZslow computes the same quantity base by base and was used to verify this method.
+ return score;
+ }
+
+ @Deprecated
+ /** Slow reference version of the scoreZ2 coverage count, kept only to verify scoreZ2: flags every read base covered by an in-range key hit and counts the flagged bases. */
+ private final int scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display, int numHits){
+ final int center=locs[centerIndex];
+
+ final int maxLoc=center+MAX_INDEL2;
+ final int minLoc=Tools.max(0, center-MAX_INDEL);
+
+ byte[] array=new byte[offsets[offsets.length-1]+KEYLEN]; //One flag per read base; length is the last offset plus KEYLEN
+ int score=0;
+
+ for(int i=0; i<numHits; i++){
+ int loc=locs[i];
+// int dif=absdif(loc, center);
+ if(loc>=minLoc && loc<=maxLoc){
+ int pos=offsets[i];
+// if(true){
+// System.err.println("\ni="+i+", pos="+pos+", array=["+array.length+"], limit="+(pos+CHUNKSIZE-1));
+// }
+ for(int j=pos; j<pos+KEYLEN; j++){
+ if(array[j]==0){score++;} //Count each base only the first time it is covered
+ array[j]=1;
+ }
+ }
+ }
+
+ if(display){System.err.println("\n"+Arrays.toString(array)+"\n");}
+
+ return score*Z_SCORE_MULT;
+ }
+
+ /** Largest possible Z-score term: the read bases covered when the key at every offset is counted, times Z_SCORE_MULT. */
+ private final int maxScoreZ(int offsets[]){
+ //[runStart, runStop) tracks the current run of overlapping or adjacent key intervals.
+ int runStart=-1, runStop=-1;
+ int covered=0;
+
+ for(final int keyStart : offsets){
+ if(keyStart>runStop){
+ //The current run has ended; bank its length and begin a new run at this key.
+ covered+=runStop-runStart;
+ runStart=keyStart;
+ }
+ runStop=keyStart+KEYLEN;
+ }
+
+ covered+=runStop-runStart; //Bank the last run
+ return covered*Z_SCORE_MULT;
+ }
+
+
+ /**
+ * Scores the hits to the right of the center key by chaining outward from it.
+ * @param locs Hit location per key; negative entries are skipped
+ * @param keyScores Score added for each accepted key
+ * @param centerIndex Index of the key whose location starts the chain
+ * @param sizes Per-key values passed to calcListSizeBonus when ADD_LIST_SIZE_BONUS is set
+ * @param penalizeIndels If true, subtract a capped penalty whenever an accepted hit is shifted from the previous one
+ * @param numHits Exclusive upper bound on the key indices examined
+ * @return Sum of accepted key scores and bonuses, minus penalties
+ */
+ private final int scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits){
+ int total=0;
+ int anchor=locs[centerIndex]; //Location of the most recently accepted hit
+
+ for(int i=centerIndex+1; i<numHits; i++){
+ final int candidate=locs[i];
+ if(candidate<0){continue;}
+
+ final int shift=absdif(candidate, anchor);
+ if(shift>MAX_INDEL){continue;} //Too far to chain; the anchor stays where it was
+ anchor=candidate;
+
+ total+=keyScores[i];
+ if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[i]);}
+
+ if(penalizeIndels && shift!=0){
+ //The penalty grows linearly with the shift, up to a fixed cap.
+ final int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*shift, MAX_PENALTY_FOR_MISALIGNED_HIT);
+ total-=penalty;
+ }
+ }
+
+ return total;
+ }
+
+ /**
+ * Scores the hits to the left of the center key by chaining outward from it; also increments callsToScore.
+ * @param locs Hit location per key; negative entries are skipped
+ * @param keyScores Score added for each accepted key
+ * @param centerIndex Index of the key whose location starts the chain
+ * @param sizes Per-key values passed to calcListSizeBonus when ADD_LIST_SIZE_BONUS is set
+ * @param penalizeIndels If true, subtract a capped penalty whenever an accepted hit is shifted from the previous one
+ * @return Sum of accepted key scores and bonuses, minus penalties
+ */
+ private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){
+ callsToScore++;
+ int total=0;
+ int anchor=locs[centerIndex]; //Location of the most recently accepted hit
+
+ for(int i=centerIndex-1; i>=0; i--){
+ final int candidate=locs[i];
+ if(candidate<0){continue;}
+
+ final int shift=absdif(candidate, anchor);
+ if(shift>MAX_INDEL){continue;} //Too far to chain; the anchor stays where it was
+ anchor=candidate;
+
+ total+=keyScores[i];
+ if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[i]);}
+
+ if(penalizeIndels && shift!=0){
+ //The penalty grows linearly with the shift, up to a fixed cap.
+ final int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*shift, MAX_PENALTY_FOR_MISALIGNED_HIT);
+ total-=penalty;
+ }
+ }
+
+ return total;
+ }
+
+ /** Packs a (site, chrom) pair into one int: the chrom's within-block bits are shifted up by SHIFT_LENGTH and OR'd with the site. */
+ private static final int toNumber(int site, int chrom){
+ final int withinBlock=(chrom&CHROM_MASK_LOW);
+ final int highBits=(withinBlock<<SHIFT_LENGTH);
+ //The site is not masked here; it is OR'd in as given.
+ return (highBits|site);
+ }
+
+ /** Recovers the chromosome from a packed number: its bits above SHIFT_LENGTH plus the block-aligned baseChrom. */
+ private static final int numberToChrom(int number, int baseChrom){
+ //baseChrom must be non-negative and block-aligned (none of the low chrom bits set).
+ assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+ assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+ final int withinBlock=(number>>>SHIFT_LENGTH);
+ return withinBlock+(baseChrom&CHROM_MASK_HIGH);
+ }
+
+ /** Extracts the site (location) held in the low bits of a packed number by applying SITE_MASK. */
+ private static final int numberToSite(int number){
+ final int site=(number&SITE_MASK); return site;
+ }
+
+ private static final int minChrom(int chrom){return Tools.max(MINCHROM, chrom&CHROM_MASK_HIGH);} //chrom with its low (within-block) bits cleared, but never below MINCHROM
+ private static final int baseChrom(int chrom){return Tools.max(0, chrom&CHROM_MASK_HIGH);} //chrom with its low (within-block) bits cleared, floored at 0
+ private static final int maxChrom(int chrom){return Tools.max(MINCHROM, Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW));} //chrom with all low bits set, clamped to [MINCHROM, MAXCHROM]
+
+
+ private final int[] getOffsetArray(int len){ //Returns an int[len] cached per length and reused without clearing
+ if(len>=offsetArrays.length){return new int[len];} //Too long to cache (this used to throw ArrayIndexOutOfBoundsException); same guard as getLocArray
+ return (offsetArrays[len]==null ? (offsetArrays[len]=new int[len]) : offsetArrays[len]);
+ }
+ private final int[] getLocArray(int len){ //Returns an int[len] cached per length; lengths beyond the cache get a fresh array
+ final boolean cacheable=(len<locArrays.length);
+ if(cacheable && locArrays[len]==null){locArrays[len]=new int[len];}
+ return (cacheable ? locArrays[len] : new int[len]);
+ }
+ private final int[] getGreedyListArray(int len){ //Returns an int[len] cached per length and reused without clearing
+ if(len>=greedyListArrays.length){return new int[len];} //Too long to cache (this used to throw ArrayIndexOutOfBoundsException); same guard as getLocArray
+ return (greedyListArrays[len]==null ? (greedyListArrays[len]=new int[len]) : greedyListArrays[len]);
+ }
+ private final int[] getGenericArray(int len){ //Returns an int[len] cached per length and reused without clearing
+ if(len>=genericArrays.length){return new int[len];} //Too long to cache (this used to throw ArrayIndexOutOfBoundsException); same guard as getLocArray
+ return (genericArrays[len]==null ? (genericArrays[len]=new int[len]) : genericArrays[len]);
+ }
+
+ final byte[] getBaseScoreArray(int len, int strand){ //Returns a byte[len] cached per strand and length; lengths beyond the cache get a fresh array
+ final boolean cacheable=(len<baseScoreArrays[0].length);
+ if(cacheable && baseScoreArrays[strand][len]==null){baseScoreArrays[strand][len]=new byte[len];}
+ return (cacheable ? baseScoreArrays[strand][len] : new byte[len]);
+ }
+ final int[] getKeyScoreArray(int len, int strand){ //Returns an int[len] cached per strand and length and reused without clearing
+ if(len>=keyScoreArrays[0].length){return new int[len];} //Test the per-strand cache size, as getBaseScoreArray does; keyScoreArrays.length is only the strand count (2)
+ if(keyScoreArrays[strand][len]==null){keyScoreArrays[strand][len]=new int[len];}
+ return keyScoreArrays[strand][len];
+ }
+ private final float[] getKeyWeightArray(int len){ //Returns a float[len] cached per length; lengths beyond the cache get a fresh array
+ final boolean cacheable=(len<keyWeightArrays.length);
+ if(cacheable && keyWeightArrays[len]==null){keyWeightArrays[len]=new float[len];}
+ return (cacheable ? keyWeightArrays[len] : new float[len]);
+ }
+ @Override
+ float[] keyProbArray() { //Accessor for this index's keyProbArray field
+ return this.keyProbArray;
+ }
+
+
+ private final int[][] locArrays=new int[601][]; //Cache for getLocArray, indexed by requested length
+ private final int[] valueArray=new int[128];
+ private final int[] sizeArray=new int[128];
+ private final int[][] offsetArrays=new int[128][]; //Cache for getOffsetArray, indexed by requested length
+ private final int[][] greedyListArrays=new int[128][]; //Cache for getGreedyListArray, indexed by requested length
+ private final int[][] genericArrays=new int[128][]; //Cache for getGenericArray, indexed by requested length
+ private final int[] startArray=new int[128];
+ private final int[] stopArray=new int[128];
+ private final Quad[] tripleStorage=makeQuadStorage(128);
+ private final int[] greedyReturn=new int[2];
+ private final int[][] shrinkReturn2=new int[3][];
+ private final int[][] shrinkReturn3=new int[5][];
+ private final int[][] prescanReturn=new int[2][];
+ private final int[] prescoreArray;
+ private final int[] precountArray;
+
+ private final byte[][][] baseScoreArrays=new byte[2][601][]; //Cache for getBaseScoreArray, indexed [strand][length]
+ private final int[][][] keyScoreArrays=new int[2][128][]; //Cache for getKeyScoreArray, indexed [strand][length]
+ final float[] keyProbArray=new float[601]; //Returned by keyProbArray()
+ private final float[][] keyWeightArrays=new float[128][]; //Cache for getKeyWeightArray, indexed by requested length
+
+
+ private final Quad[] makeQuadStorage(int number){ //Preallocates 'number' Quads, each built as new Quad(index, 0, 0)
+ final Quad[] storage=new Quad[number];
+ int slot=0;
+ while(slot<number){storage[slot]=new Quad(slot, 0, 0); slot++;}
+ return storage;}
+
+
+ private final QuadHeap heap=new QuadHeap(127);
+ private final QuadHeap active=new QuadHeap(127);
+ //Bit layout of packed (chrom, site) numbers follows; setChromBits recomputes all of these values.
+ static int SHIFT_LENGTH=(32-1-NUM_CHROM_BITS); //Number of bits the within-block chrom index is shifted up by (see toNumber)
+ static int MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH); //Value with the low SHIFT_LENGTH bits all set
+
+ /** Mask the number to get the site, which is in the lower bits */
+ static int SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1));
+
+ /** Mask the chromosome's high bits to get the low bits */
+ static int CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+
+ /** Mask the chromosome's lower bits to get the high bits */
+ static int CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+ static void setChromBits(int x){
+ //Sets the number of chrom bits and recomputes every packing constant derived from it; the assertions below sanity-check the result.
+ NUM_CHROM_BITS=x;
+ CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS));
+ SHIFT_LENGTH=(32-1-NUM_CHROM_BITS);
+ MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH);
+ SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1));
+ CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+ CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+// assert(NUM_CHROM_BITS<30);
+ assert(NUM_CHROM_BITS>=0); //max is 3 for human; perhaps more for other organisms
+// assert((1<<(NUM_CHROM_BITS))>=CHROMSPERBLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMSPERBLOCK;
+ assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMS_PER_BLOCK; //Checks equality, although the message prints " < "
+ assert(Integer.bitCount(CHROMS_PER_BLOCK)==1); //CHROMS_PER_BLOCK must be a power of two
+ assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS+1)) : Integer.toHexString(SITE_MASK);
+ }
+
+ private final int cycles;
+
+ public static final int BASE_HIT_SCORE=100;
+ public static final int ALIGN_COLUMNS=3000;
+ public static int MAX_INDEL=16000; //Max indel length, min 0; longer is more accurate. (An older comment gave the default as 400; the value set here is 16000.)
+ public static int MAX_INDEL2=2*MAX_INDEL; //Computed once from MAX_INDEL; setSemiperfectMode and setPerfectMode set both to 0
+
+ private final float INV_BASE_KEY_HIT_SCORE;
+ private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1
+ private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length
+ private final int MAX_PENALTY_FOR_MISALIGNED_HIT; //Cap applied to the indel penalty in scoreLeft and scoreRight
+ private final int SCOREZ_1KEY; //Returned by scoreZ2 when there is exactly one approximate hit
+
+ public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed
+ public static final int Z_SCORE_MULT=20; //Multiplier applied to the covered-base count in scoreZ2 and maxScoreZ
+ public static final int Y_SCORE_MULT=10;
+
+
+ /**
+ * Switches to semiperfect mode: return only sites that match completely or with partial no-reference.
+ * Asserts that perfect mode is off, disables PRESCAN_QSCORE and zeroes both indel limits.
+ */
+ public static void setSemiperfectMode() {
+ assert(!PERFECTMODE);
+
+ //Both indel limits are zeroed in this mode.
+ MAX_INDEL2=0;
+ MAX_INDEL=0;
+
+ PRESCAN_QSCORE=false;
+ SEMIPERFECTMODE=true;
+ }
+
+ /**
+ * Switches to perfect mode: return only sites that match completely.
+ * Asserts that semiperfect mode is off, disables PRESCAN_QSCORE and zeroes both indel limits.
+ */
+ public static void setPerfectMode() {
+ assert(!SEMIPERFECTMODE);
+
+ //Both indel limits are zeroed in this mode.
+ MAX_INDEL2=0;
+ MAX_INDEL=0;
+
+ PRESCAN_QSCORE=false;
+ PERFECTMODE=true;
+ }
+
+ static float FRACTION_GENOME_TO_EXCLUDE=0.03f; //Default .03; lower is slower and more accurate. For perfect reads and small genomes, lower is FASTER.
+
+ public static final void setFractionToExclude(float f){ //Sets the excluded genome fraction and rederives the five length-histogram indices that depend on it
+ assert(f>=0 && f<1);
+ FRACTION_GENOME_TO_EXCLUDE=f; //Indices below are floored at 0: above f~0.29 the raw formulas turn negative and would break lengthHistogram lookups
+ MIN_INDEX_TO_DROP_LONG_HIT_LIST=Tools.max(0, (int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE))); //895 at the default f=.03
+ MAX_AVERAGE_LIST_TO_SEARCH=Tools.max(0, (int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE))); //lower is faster; 931 at f=.03
+ MAX_AVERAGE_LIST_TO_SEARCH2=Tools.max(0, (int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE))); //958 at f=.03
+ MAX_SINGLE_LIST_TO_SEARCH=Tools.max(0, (int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE))); //970 at f=.03
+ MAX_SHORTEST_LIST_TO_SEARCH=Tools.max(0, (int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE))); //916 at f=.03
+ }
+
+
+ /** Fraction of hit lists to retain. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */
+ static final float HIT_FRACTION_TO_RETAIN=0.85f; //currently .85; older comments cite defaults of .75 and .8
+ /** Range: 0 to 1000. Lower should be faster and less accurate. */
+ static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //895 when FRACTION_GENOME_TO_EXCLUDE=.03 (older comment said 810)
+ /** Range: 2 to infinity. Lower should be faster and less accurate. */
+ static final int MIN_HIT_LISTS_TO_RETAIN=6;
+
+ static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster; 931 when FRACTION_GENOME_TO_EXCLUDE=.03 (older comment said 840)
+ //lower is faster
+ static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //958 when FRACTION_GENOME_TO_EXCLUDE=.03 (older comment said 910)
+ //lower is faster
+ static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //970 when FRACTION_GENOME_TO_EXCLUDE=.03 (older comment said 935)
+ //lower is faster
+ static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //916 when FRACTION_GENOME_TO_EXCLUDE=.03 (older comment said 860)
+
+ /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */
+ public static final int SMALL_GENOME_LIST=20;
+
+ static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";}
+
+ static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy.
+
+ /** Minimum length of list before clumpiness is considered. NOTE(review): described as a length-histogram index from 0 to 1000, but the value is 2000, and BBIndex5.analyzeIndex compares its same-named constant directly against a list length - confirm which is intended. */
+ static final int CLUMPY_MIN_LENGTH_INDEX=2000;
+ static final float CLUMPY_FRACTION=0.75f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy.
+
+ static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; //Captured once at class initialization; later changes to MAX_INDEL2 (e.g. setPerfectMode) do not update it
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */
+ public static final int MAX_HITS_REDUCTION1=0;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */
+ public static int MAX_HITS_REDUCTION2=2; //currently 2 (older default 1); higher is more accurate (more mapping and less FP) but slower
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */
+ public static final int MAX_HITS_REDUCTION_PERFECT=0;
+
+ public static int MAXIMUM_MAX_HITS_REDUCTION=3; //Used in calcApproxHitsCutoff: the reduction is capped at max(this, keys/8)
+ public static int HIT_REDUCTION_DIV=5; //Used in calcApproxHitsCutoff: the reduction starts from max(hits/this, MAX_HITS_REDUCTION2)
+
+ private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$
+ assert(keys>=hits) : keys+", "+hits;
+ assert(hits>=0);
+
+ int mahtk=MIN_APPROX_HITS_TO_KEEP;
+ if(SEMIPERFECTMODE || PERFECTMODE){
+ if(keys==1){return 1;}
+ else if(MIN_APPROX_HITS_TO_KEEP<keys){
+ mahtk++;
+ if(currentCutoff==MIN_APPROX_HITS_TO_KEEP){currentCutoff++;}
+ }
+ }
+
+ int reduction=Tools.min(Tools.max((hits)/HIT_REDUCTION_DIV, MAX_HITS_REDUCTION2), Tools.max(MAXIMUM_MAX_HITS_REDUCTION, keys/8));
+// if(hits<3){reduction=0;}
+// else if(hits<4){reduction=Tools.min(1, reduction);}
+// else if(hits<5){reduction=Tools.min(2, reduction);}
+// else if(hits<6){reduction=Tools.min(3, reduction);}
+ assert(reduction>=0);
+ int r=hits-reduction;
+
+ r=Tools.max(mahtk, currentCutoff, r);
+
+ if(perfect){
+ r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT);
+ }
+ return r;
+ }
+
+ public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE;
+ public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed; cleared by setSemiperfectMode and setPerfectMode
+ public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast.
+ public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.15f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT=0.025f; //Fraction of max score to use as cutoff; max is 1; lower is more accurate. (The value here is 0.025, not the 0.15 an older comment cited.)
+ public static final float MIN_QSCORE_MULT2=0.1f;
+ static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.84f : USE_EXTENDED_SCORE ? .84f : 0.6f); //Default .85f; lower is more accurate
+ static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f
+ static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$
+ static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false
+ static{
+ assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1);
+ assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1);
+ }
+
+
+}
diff --git a/current/align2/BBIndex5.java b/current/align2/BBIndex5.java
new file mode 100755
index 0000000..2cb9f4e
--- /dev/null
+++ b/current/align2/BBIndex5.java
@@ -0,0 +1,2643 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+
+/**
+ * Based on Index11f
+ * Index stored in single array per block.
+ * Supports 32-bit unsigned index.
+ *
+ * @author Brian Bushnell
+ * @date Jan 3, 2013
+ *
+ */
+public final class BBIndex5 extends AbstractIndex {
+
+
+ /** Builds the index for the selected genome build via IndexMaker5. Arguments are key=value pairs: build|b, minchrom, maxchrom, keylen|k.
+ * Arguments without '=' and unrecognized keys are ignored; an argument with '=' but no key or no value is rejected. */
+ public static void main(String[] args){
+ int k=13;
+
+ for(int i=0; i<args.length; i++){
+ String s=args[i].toLowerCase();
+ if(s.contains("=")){
+ String[] split=s.split("=");
+ //"k=" or a bare "=" splits into fewer than two tokens; report that clearly instead of failing with an ArrayIndexOutOfBoundsException.
+ if(split.length<2){throw new IllegalArgumentException("Malformed argument (expected key=value): "+args[i]);}
+ String a=split[0];
+ String b=split[1];
+ if(a.equals("build") || a.equals("b")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(a.equals("minchrom")){
+ MINCHROM=Integer.parseInt(b);
+ }else if(a.equals("maxchrom")){
+ MAXCHROM=Integer.parseInt(b);
+ }else if(a.equals("keylen") || a.equals("k")){
+ k=Integer.parseInt(b);
+ }
+ }
+ }
+
+ if(MINCHROM==-1){MINCHROM=1;}
+ if(MAXCHROM==-1){
+ assert(Data.numChroms<=Byte.MAX_VALUE) : "TODO";
+ MAXCHROM=Data.numChroms;
+ }
+
+ System.err.println("Writing build "+Data.GENOME_BUILD+" "+
+ "BASESPACE index, keylen="+k+", chrom bits="+NUM_CHROM_BITS);
+
+ int first=(NUM_CHROM_BITS==0 ? 1 : 0);
+// first=24;
+
+ Data.sysout.println("Loading index for chunk "+first+"-"+MAXCHROM+", build "+Data.GENOME_BUILD);
+ index=IndexMaker5.makeIndex(Data.GENOME_BUILD, first, MAXCHROM,
+ k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, true, false, index);
+
+ System.err.println("Finished all chroms, may still be writing.");
+ }
+
+
+ /** Sets the scoring constants derived from BASE_KEY_HIT_SCORE and KEYLEN, and sizes the prescan buffers at two cycles per index block in [minChrom, maxChrom]. */
+ public BBIndex5(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_){
+ super(k_, kfilter_, BASE_HIT_SCORE, minChrom_, maxChrom_, msa_);
+ SCOREZ_1KEY=Z_SCORE_MULT*KEYLEN;
+ INDEL_PENALTY_MULT=20; //default 20; penalty for indel length
+ INDEL_PENALTY=(BASE_KEY_HIT_SCORE/2)-1; //default (HIT_SCORE/2)-1
+ MAX_PENALTY_FOR_MISALIGNED_HIT=BASE_KEY_HIT_SCORE-(1+BASE_KEY_HIT_SCORE/8);
+ INV_BASE_KEY_HIT_SCORE=1f/BASE_KEY_HIT_SCORE;
+ //Step from block to block across the chrom range, counting the blocks touched.
+ int blocks=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){blocks++;}
+ cycles=2*blocks;
+ prescoreArray=new int[cycles];
+ precountArray=new int[cycles];
+ }
+
+ /** Loads or generates the index covering minChrom through maxChrom, inclusive, with key length k.
+ * The range may span several blocks; minChrom is raised to at least 1 and maxChrom lowered to at most Data.numChroms.
+ * Should only be called once in a process. */
+ public static final synchronized void loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid){
+ final int first=(minChrom<1 ? 1 : minChrom);
+ final int last=(maxChrom>Data.numChroms ? Data.numChroms : maxChrom);
+ assert(first<=last);
+ Data.sysout.println("Loading index for chunk "+first+"-"+last+", build "+Data.GENOME_BUILD);
+ index=IndexMaker5.makeIndex(Data.GENOME_BUILD, first, last,
+ k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index);
+
+ }
+
+ /** Calculate statistics of index, such as list lengths, and find clumpy keys. Fills COUNTS and lengthHistogram, then sets MAX_USABLE_LENGTH, MAX_USABLE_LENGTH2 and Solver.POINTS_PER_SITE. Asserts that COUNTS and lengthHistogram are still null on entry. */
+ public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){
+
+ assert(lengthHistogram==null);
+ assert(COUNTS==null);
+
+ int KEYSPACE=1<<(2*k);
+ COUNTS=new int[KEYSPACE];
+
+ maxChrom=maxChrom(maxChrom);
+ //Visit each key together with its reverse complement; the key<=rkey test makes each pair get processed once.
+ for(int key=0; key<KEYSPACE; key++){
+ int rkey=KeyRing.reverseComplementKey(key, k);
+ assert(key==KeyRing.reverseComplementKey(rkey, k));
+
+ if(key<=rkey){
+
+ long clumps=0;
+ long len=0;
+ //Sum the list lengths of key and rkey over every block; chrom steps to the start of the next block each iteration.
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ Block b=index[chrom];
+
+ final int[] sites=b.sites;
+ final int start1=b.starts[key];
+ final int stop1=start1+b.length(key);
+ final int start2=(rkey==key ? -1 : b.starts[rkey]); //When key is its own reverse complement, start2=stop2=-1 so len2 is 0
+ final int stop2=(rkey==key ? -1 : start2+b.length(rkey));
+ final int len1=stop1-start1;
+ final int len2=stop2-start2;
+
+ len=len+len1+len2;
+// System.out.println(len+", len1="+len1+", len2="+len2+", start1="+start1+", stop1="+stop1+", start2="+start2+", stop2="+stop2);
+
+ if(REMOVE_CLUMPY){
+ for(int i=start1+1; i<stop1; i++){
+ int dif=sites[i]-sites[i-1];
+ assert(dif!=0);
+ if(dif>0 && dif<=CLUMPY_MAX_DIST){
+ clumps++;
+ }
+ }
+
+ for(int i=start2+1; i<stop2; i++){
+ int dif=sites[i]-sites[i-1];
+ assert(dif!=0);
+ if(dif>0 && dif<=CLUMPY_MAX_DIST){
+ clumps++;
+ }
+ }
+ }
+
+ }
+ //Both keys of the pair get the same combined count, saturated at Integer.MAX_VALUE.
+ COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len);
+ if(key!=rkey){COUNTS[rkey]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[rkey]+len);}
+ assert(COUNTS[key]==COUNTS[rkey]) : key+", "+rkey;
+
+ if(REMOVE_CLUMPY && len>CLUMPY_MIN_LENGTH_INDEX && clumps>(CLUMPY_FRACTION*len)){
+ COUNTS[key]=0;
+ COUNTS[rkey]=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ Block b=index[chrom];
+ final int[] sites=b.sites;
+ sites[b.starts[key]]=-1; //NOTE(review): if this key has no sites in this block, b.starts[key] may address another key's list or the end of sites - confirm against Block
+ sites[b.starts[rkey]]=-1;
+ }
+ }
+
+// System.err.println("COUNTS["+key+"] = "+COUNTS[key]+", COUNTS["+rkey+"] = "+COUNTS[rkey]);
+ }
+ }
+
+ lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2);
+
+ //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));}
+
+ if(REMOVE_FREQUENT_GENOME_FRACTION){
+ //Look up the usable-length limits in lengthHistogram at positions scaled by (1-fractionToExclude).
+ int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1));
+ int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1));
+
+ MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]);
+ MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]);
+
+ if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);}
+ }
+
+ Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH]));
+ if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;}
+ if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);}
+ assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; //The per-site value is required to end up negative
+ }
+
+
+ /** Returns the filename of the index block holding this chrom, for key length k. */
+ public static final String fname(int chrom, int k){
+ final int first=minChrom(chrom), last=maxChrom(chrom); return IndexMaker5.fname(first, last, k, NUM_CHROM_BITS);
+ }
+
+ /** Returns true iff the key offsets are strictly ascending (trivially true for fewer than two offsets). */
+ private static boolean checkOffsets(int[] offsets){
+ int idx=offsets.length-1;
+ //Walk backward from the end and stop at the first adjacent pair that fails to increase.
+ while(idx>0 && offsets[idx]>offsets[idx-1]){idx--;}
+ return idx<=0;
+ }
+
+ @Deprecated
+ private final int trimExcessHitLists(int[] keys, int[][] hits){
+ //Disabled: the assertion below fails unconditionally whenever assertions are enabled.
+ assert(false) : "Needs to be redone because hits are no longer sorted by length.";
+
+ assert(hits.length==keys.length);
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+
+ int sum=0;
+ int initialHitCount=0;
+ //shortest and shortest2 track the smallest and second-smallest nonzero list lengths.
+ int shortest=Integer.MAX_VALUE-1;
+ int shortest2=Integer.MAX_VALUE;
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=count(key);
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;}
+ if(shortest>limit3 && !SLOW){
+ for(int i=0; i<hits.length; i++){hits[i]=null;}
+ return 0;
+ }
+ if(sum<=limit && sum/initialHitCount<=limit2){return initialHitCount;}
+ //Over budget: sort the lists and null them out from the end of the sorted order until both limits are met.
+ Pointer[] ptrs=Pointer.loadMatrix(hits);
+// ptrs[0].value/=2;
+// ptrs[ptrs.length-1].value/=2;
+ Arrays.sort(ptrs);
+
+ int finalHitCount=initialHitCount;
+ for(int i=ptrs.length-1; sum>limit || sum/finalHitCount>limit2; i--){ //NOTE(review): no explicit guard against i<0 or finalHitCount reaching 0
+ Pointer p=ptrs[i];
+ sum-=hits[p.key].length;
+ hits[p.key]=null;
+ finalHitCount--;
+ }
+
+ return finalHitCount;
+ }
+
+ /** Remove least useful keys to accelerate search. Dropped keys are overwritten with -1 in keys; returns the number of keys still in use. */
+ private final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){
+ //Key weights are the key scores scaled by 1/BASE_KEY_HIT_SCORE.
+ float[] keyWeights=getKeyWeightArray(keyScores.length);
+ for(int i=0; i<keyScores.length; i++){
+ keyWeights[i]=keyScores[i]*INV_BASE_KEY_HIT_SCORE;
+ }
+
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+// final int limitS=lengthHistogram[chrom][MAX_SINGLE_LIST_TO_SEARCH];
+
+ int sum=0;
+ int initialHitCount=0;
+ //shortest and shortest2 track the smallest and second-smallest nonzero list lengths.
+ int shortest=Integer.MAX_VALUE-1;
+ int shortest2=Integer.MAX_VALUE;
+
+// for(int i=0; i<hits.length; i++){
+// if(hits[i]!=null && hits[i].length>limitS){hits[i]=null;}
+// }
+
+ final int[] lengths=getGenericArray(keys.length);
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=count(key);
+ lengths[i]=x;
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;}
+ if(shortest>limit3 && !SLOW){
+ for(int i=0; i<keys.length; i++){keys[i]=-1;}
+ return 0;
+ }
+
+ int hitsCount=initialHitCount;
+ int worstValue=Integer.MIN_VALUE;
+ //NOTE(review): the average test divides by initialHitCount, not the shrinking hitsCount (trimExcessHitLists uses the current count) - confirm this is intended.
+ while(hitsCount>=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){
+ final int[] lists=getGreedyListArray(hitsCount);
+ for(int i=0, j=0; j<lists.length; i++){ //Collect the indices of the keys that still have a nonzero length
+ if(lengths[i]>0){
+ lists[j]=i;
+ j++;
+ }
+ }
+
+ Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn);
+ int worstIndex=greedyReturn[0]; //Position within lists of the worst key
+ int worst=lists[worstIndex];
+ worstValue=greedyReturn[1];
+ sum-=lengths[worst];
+
+// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]<excessListLimit)){return hitsCount;}
+ if(worstValue>0 || lengths[worst]<SMALL_GENOME_LIST){return hitsCount;} //This line increases accuracy at expense of speed. Lower constant = more accurate, default 0.
+ hitsCount--;
+ lengths[worst]=0;
+ keys[worst]=-1;
+ }
+ return hitsCount;
+ }
+
+
+ /** For every usable key, records the [start, stop) range of its site list within chrom's block.
+ * A key is usable when it is non-negative, count(key) lies in (0, maxLen), and its list in this block is non-empty;
+ * every other key gets start=stop=-1. Returns the number of usable keys. */
+ private final int getHits(final int[] keys, final int chrom, final int maxLen, final int[] starts, final int[] stops){
+ final Block block=index[chrom];
+ int found=0;
+ for(int i=0; i<keys.length; i++){
+ final int key=keys[i];
+ starts[i]=-1;
+ stops[i]=-1;
+ if(key<0){continue;}
+ final int total=count(key);
+ if(total<=0 || total>=maxLen){continue;}
+ final int local=block.length(key);
+ if(local<=0){continue;}
+ starts[i]=block.starts[key];
+ stops[i]=starts[i]+local;
+ found++;
+ }
+ return found;
+ }
+
+
+ /** Counts the non-negative keys whose count(key) lies in (0, maxLen).
+ * When clearBadKeys is set, non-negative keys that fail this test are overwritten with -1. */
+ private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){
+ int usable=0;
+ for(int i=0; i<keys.length; i++){
+ if(keys[i]<0){continue;} //Negative keys are neither counted nor touched
+
+ final int listLen=count(keys[i]);
+ final boolean good=(listLen>0 && listLen<maxLen);
+ if(good){usable++;}
+ else if(clearBadKeys){keys[i]=-1;}
+ }
+
+ return usable;
+ }
+
+
+ /** Runs find with obeyLimits=true; if DOUBLE_SEARCH_NO_HIT is set and that returned null or nothing, runs find again with obeyLimits=false. */
+ public final ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){
+ assert(minChrom<=maxChrom && minChrom>=0);
+ final ArrayList<SiteScore> strict=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id);
+ final boolean foundSomething=(strict!=null && !strict.isEmpty());
+ return (foundSomething || !DOUBLE_SEARCH_NO_HIT) ? strict : find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);
+ }
+
+
+ public final ArrayList<SiteScore> find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){
+
+ assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN);
+ int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length);
+
+ initialKeys+=offsetsP.length;
+ initialKeyIterations++;
+
+ final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2);
+
+ int numHits=0;
+ numHits=countHits(keysP, maxLen, true);
+ if(numHits>0){ //TODO: Change these to higher numbers
+ int trigger=(3*keysP.length)/4;
+ if(numHits<4 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, (maxLen*3)/2, true);
+ }
+ if(numHits<3 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*2, true);
+ }
+ if(numHits<3 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*3, true);
+ }
+ if(numHits<2 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*5, true);
+ }
+ }
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ initialKeys2+=numHits;
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+ if(TRIM_BY_GREEDY && obeyLimits){
+ int maxLists=Tools.max((int)(HIT_FRACTION_TO_RETAIN*keysP.length), MIN_HIT_LISTS_TO_RETAIN);
+ numHits=trimExcessHitListsByGreedy(offsetsP, keyScoresP, maxLists, keysP);
+ }
+// System.out.println("After greedy: numHits = "+numHits);
+
+ if(TRIM_BY_TOTAL_SITE_COUNT && obeyLimits){
+ throw new RuntimeException("Needs to be redone.");
+// numHits=trimExcessHitLists(keys, hits);
+ }
+
+ if(TRIM_LONG_HIT_LISTS && obeyLimits && numHits>MIN_APPROX_HITS_TO_KEEP){
+ int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits);
+
+ int zeroes=keysP.length-numHits;
+ int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1));
+ cutoffIndex=Tools.max(cutoffIndex, altMinIndex);
+
+ assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits;
+
+ if(cutoffIndex<(keysP.length-1)){
+ int[] lens=getGenericArray(keysP.length);
+ for(int i=0; i<keysP.length; i++){lens[i]=count(keysP[i]);}
+ Arrays.sort(lens);
+ int cutoff=lens[cutoffIndex];
+
+ cutoff=Tools.max(lengthHistogram[MIN_INDEX_TO_DROP_LONG_HIT_LIST], cutoff);
+
+ int removed=0;
+
+ for(int i=0; i<keysP.length; i++){
+ int key=keysP[i];
+ if(count(key)>cutoff){
+ keysP[i]=-1;
+ removed++;
+ numHits--;
+ }
+ }
+ }
+ }
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+ final ArrayList<SiteScore> result=new ArrayList<SiteScore>(8);
+ if(numHits<MIN_APPROX_HITS_TO_KEEP){return result;}
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ assert(keysP.length==numHits);
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ //Reverse the offsets for minus-strand mapping, since they are generated based on quality
+ int[] offsetsM=KeyRing.reverseOffsets(offsetsP, KEYLEN, basesP.length);
+ final int[] keysM=KeyRing.reverseComplementKeys(keysP, KEYLEN);
+
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+// assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM);
+
+ assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length)));
+ assert(keyScoresP!=null);
+ assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP);
+ final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1));
+ final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1));
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+ assert(offsetsM.length==offsetsP.length);
+ assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM));
+
+ final int[] bestScores=new int[6];
+
+ //This prevents filtering by qscore when a low-quality read only uses a few keys.
+ //In that case, extending is more important.
+ final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5);
+
+ int[][] prescanResults=null;
+ int[] precounts=null;
+ int[] prescores=null;
+
+ int hitsCutoff=0;
+ int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ boolean allBasesCovered=true;
+ {
+ if(offsetsP[0]!=0){allBasesCovered=false;}
+ else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;}
+ else{
+ for(int i=1; i<offsetsP.length; i++){
+ if(offsetsP[i]>offsetsP[i-1]+KEYLEN){
+ allBasesCovered=false;
+ break;
+ }
+ }
+ }
+ }
+
+ //TODO I don't understand this logic
+ final boolean pretendAllBasesAreCovered=(allBasesCovered ||
+ keysP.length>=keysOriginal.length-4 ||
+ (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f))));
+
+// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP));
+// assert(allBasesCovered);
+
+ if(prescan_qscore){
+ prescanResults=prescanAllBlocks(bestScores,
+ keysP, keyScoresP, offsetsP,
+ keysM, keyScoresM, offsetsM,
+ pretendAllBasesAreCovered);
+
+ if(prescanResults!=null){
+ precounts=prescanResults[0];
+ prescores=prescanResults[1];
+ }
+
+ if(bestScores[1]<MIN_APPROX_HITS_TO_KEEP){return result;}
+ if(bestScores[3]<maxQuickScore*MIN_QSCORE_MULT2){return result;} //if(bestScores[3]<maxQuickScore(offsetsP, keyScoresP)*.10f){return result;}
+
+ if(bestScores[3]>=maxQuickScore && pretendAllBasesAreCovered){
+ assert(bestScores[3]==maxQuickScore);
+ assert(bestScores[1]==numHits);
+
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }else{
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH));
+ }
+ }
+
+ final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true);
+ final boolean fullyDefined=AminoAcid.isFullyDefined(basesP);
+ assert(bestScores[2]<=0) : Arrays.toString(bestScores);
+
+ int cycle=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS,
+ offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human
+ }
+ cycle++;
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS,
+ offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human
+ }
+ cycle++;
+ }
+
+ assert(Read.CHECKSITES(result, basesP, basesM, id, false)); //TODO: Comment out once checked
+
+ return result;
+ }
+
+ /** Quickly scans every block on both strands to find the best hit count and best quick score, recording a count and score per cycle so the caller may skip blocks. Updates bestScores[1] (max hits) and bestScores[3] (best quick score). Returns {counts, scores}; these arrays come from fields (precountArray, prescoreArray, prescanReturn) rather than being freshly allocated. */
+ private final int[][] prescanAllBlocks(int[] bestScores,
+ int[] keysP, int[] keyScoresP, int[] offsetsP,
+ int[] keysM, int[] keyScoresM, int[] offsetsM,
+ final boolean allBasesCovered){
+ //Index 0 holds the plus-strand arrays and index 1 the minus-strand arrays.
+ int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}};
+
+ int bestqscore=0;
+ int maxHits=0;
+ int minHitsToScore=MIN_APPROX_HITS_TO_KEEP;
+
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+ //Results go into field-held arrays; every cycle is pre-filled with keysP.length and maxQuickScore.
+ final int[] counts=precountArray;
+ final int[] scores=prescoreArray;
+ final int[][] ret=prescanReturn;
+ Arrays.fill(counts, keysP.length);
+ Arrays.fill(scores, maxQuickScore);
+ ret[0]=counts;
+ ret[1]=scores;
+ //Each block contributes two consecutive cycles: plus strand first, then minus strand.
+ int cycle=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ final int baseChrom=baseChrom(chrom);
+ for(int pmi=0; pmi<2; pmi++, cycle++){
+
+ int[] keys=pm[pmi][0];
+ int[] keyScores=pm[pmi][1];
+ int[] offsets=pm[pmi][2];
+// int[][] hits=getHitArray(offsets.length);
+
+ int[] starts=startArray;
+ int[] stops=stopArray;
+ final int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops);
+ //Fewer hits than currently required: mark this cycle with sentinel values.
+ if(numHits<minHitsToScore){
+ scores[cycle]=-9999;
+ counts[cycle]=0;
+ }else{
+
+// final int maxQuickScore=maxQuickScore(offsets, keyScores);
+ // System.err.println("maxScore = "+maxScore);
+
+ if(numHits<keys.length){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+// assert(numHits==offsets.length) : "\n"+Arrays.toString(starts)+"\n\n"+Arrays.toString(starts2)+"\n\n"+Arrays.toString(stops2)
+// +"\n\n"+numHits+", "+offsets.length;
+ }
+ }
+
+ assert(numHits==offsets.length) : numHits+", "+offsets.length+", "+starts.length+", "+keys.length;
+ assert(numHits==keyScores.length);
+ heap.clear();
+ final Quad64[] triples=tripleStorage;
+ final int[] values=valueArray;
+
+ int[] temp=findMaxQscore2(starts, stops, offsets, keyScores, baseChrom, triples, values, minHitsToScore, true,
+ bestqscore>=maxQuickScore && allBasesCovered);
+
+ scores[cycle]=temp[0];
+ counts[cycle]=temp[1];
+
+ bestqscore=Tools.max(temp[0], bestqscore);
+ maxHits=Tools.max(maxHits, temp[1]);
+ if(bestqscore>=maxQuickScore && allBasesCovered){
+ assert(bestqscore==maxQuickScore);
+ assert(maxHits==keysP.length) :
+ "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" +
+ "Scores: \t"+Arrays.toString(scores)+
+ "Counts: \t"+Arrays.toString(counts)+
+ "bestqscore: \t"+bestqscore+
+ "maxHits: \t"+maxHits+
+ "maxQuickScore: \t"+maxQuickScore+
+ "numHits: \t"+numHits+
+ "minHitsToScore: \t"+minHitsToScore+
+ "keys.length: \t"+keys.length;
+
+ minHitsToScore=Tools.max(minHitsToScore, maxHits);
+ //Cycles not yet scanned keep their pre-filled counts and scores; bestScores[2] is not touched on this path.
+ {
+ //This early exit is optional. Does not seem to impact speed much either way.
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+ return ret;
+ }
+ }
+ }
+ }
+ }
+ //No early exit occurred: publish the best values found across all cycles.
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ret;
+ }
+
+
+ /** Looks up the hits for the given keys in one block and strand, then walks them, collecting candidate sites into ssl (which is returned). */
+ public final ArrayList<SiteScore> find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores,
+ final int chrom, final byte strand,
+ int[] offsets, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores,
+ final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+
+ assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+ final int[] hitStarts=startArray;
+ final int[] hitStops=stopArray;
+
+ //When getHits reports fewer hits than MIN_APPROX_HITS_TO_KEEP, the site list is returned untouched.
+ final int hitCount=getHits(keys, chrom, Integer.MAX_VALUE, hitStarts, hitStops);
+ if(hitCount<MIN_APPROX_HITS_TO_KEEP){return ssl;}
+
+ if(!USE_SLOWALK3){
+ return slowWalk2(hitStarts, hitStops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, fullyDefined);
+ }
+
+ //bestScores is zeroed before each slowWalk3 call unless RETAIN_BEST_SCORES is set.
+ if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);}
+ return slowWalk3(hitStarts, hitStops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined);
+
+ }
+
+ /** Compacts the parallel arrays, keeping only entries whose start index is non-negative. Returns null when the kept count equals offsets.length; otherwise returns {starts, stops, offsets, (slot 3 not written here), keyScores}. */
+ private final int[][] shrink(int[] starts, int[] stops, int[] offsets, int[] keyScores, final int len){
+
+ //Count the entries that will survive.
+ int kept=0;
+ for(int idx=0; idx<len; idx++){
+ if(starts[idx]>=0){kept++;}
+ }
+
+ if(kept==offsets.length){return null;}
+
+ final int[][] out=shrinkReturn3;
+ final int[] packedStarts=startArray;
+ final int[] packedStops=stopArray;
+ final int[] packedOffsets=getOffsetArray(kept);
+ final int[] packedKeyScores=new int[kept];
+
+ //Copy survivors forward in order; the write index never passes the read index.
+ int dst=0;
+ for(int src=0; src<len; src++){
+ if(starts[src]<0){continue;}
+ packedStarts[dst]=starts[src];
+ packedStops[dst]=stops[src];
+ packedOffsets[dst]=offsets[src];
+ packedKeyScores[dst]=keyScores[src];
+ dst++;
+ }
+
+ out[0]=packedStarts;
+ out[1]=packedStops;
+ out[2]=packedOffsets;
+ out[4]=packedKeyScores;
+ return out;
+ }
+
+ /** Drops entries whose key is negative (keys marked -1) from the parallel arrays. Returns null when no key is negative; otherwise returns {offsets, keys, keyScores}. */
+ private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){
+
+ //Count the keys that remain valid.
+ int kept=0;
+ for(final int key : keys){
+ if(key>=0){kept++;}
+ }
+
+
+ assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+ if(kept==keys.length){return null;}
+
+ final int[][] out=shrinkReturn2;
+ final int[] packedOffsets=getOffsetArray(kept);
+ assert(packedOffsets!=offsets);
+ assert(packedOffsets.length<offsets.length);
+ final int[] packedKeys=new int[kept];
+ final int[] packedKeyScores=new int[kept];
+
+ //Copy the surviving entries, preserving their order.
+ int dst=0;
+ for(int src=0; src<keys.length; src++){
+ if(keys[src]<0){continue;}
+ packedOffsets[dst]=offsets[src];
+ packedKeys[dst]=keys[src];
+ packedKeyScores[dst]=keyScores[src];
+ dst++;
+ }
+ assert(checkOffsets(packedOffsets)) : "\nnumHits="+kept+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(packedOffsets)+"\n"+
+ "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(packedKeys)+"\n";
+
+ out[0]=packedOffsets;
+ out[1]=packedKeys;
+ out[2]=packedKeyScores;
+ return out;
+ }
+
+
+ /** Walks the hit lists of one block and strand, using a heap to track the next column to increment. Each site with enough nearby hits is scored; sites that reach the score cutoff are appended to ssl or merged into the previously added site. Returns ssl, which is allocated here if it was null. */
+ private final ArrayList<SiteScore> slowWalk2(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, final boolean fullyDefined){
+ //Optionally compact the arrays so only entries with a non-negative start remain (see shrink).
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+//// System.out.println("After SHRINK_BEFORE_WALK: numHits = "+hits.length);
+// Block b=index[baseChrom_];
+// int[][] hits=b.getHitLists(starts, stops);
+// if(SHRINK_BEFORE_WALK){
+// Object[] r=shrink(hits, offsets, keyScores);
+// if(r!=null){
+// hits=(int[][])r[0];
+// offsets=(int[])r[1];
+// keyScores=(int[])r[3];
+// }
+// }
+//
+// final int numHits=hits.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true);
+// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets));
+// System.err.println("maxScore = "+maxScore);
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f));
+// final int minScore=(int)(MIN_SCORE_MULT*maxScore);
+// System.err.println("minScore = "+minScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+
+// final PriorityQueue<Quad64> heap=new PriorityQueue<Quad64>(numHits);
+ heap.clear();
+// final Quad64[] triples=new Quad64[numHits];
+ final Quad64[] triples=tripleStorage;
+
+ final Block b=index[baseChrom];
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+ //Seed the heap with the first site of each column, shifted back by that column's offset (clamped to site 0 if needed).
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad64 t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=((long)a2)&0xFFFFFFFFL;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=-999999999;
+
+ int cutoff=minScore;
+
+ int maxHits=0;
+ int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println();
+ //Repeatedly examine the site at the top of the heap until the heap is empty.
+ SiteScore prevSS=null;
+ while(!heap.isEmpty()){
+ Quad64 t=heap.peek();
+ final int site=(int)t.site; //*** TODO
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+ //Count columns whose current value lies within [site-MAX_INDEL, site+MAX_INDEL2]; stop once too many columns miss.
+ {//Inner loop
+ final int minsite=subUnsigned(site, MAX_INDEL);
+ final int maxsite=addUnsigned(site, MAX_INDEL2);
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==(int)triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+ //Enough nearby hits: score this candidate site.
+ int score;
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ if(USE_EXTENDED_SCORE){
+ final int chrom=numberToChrom(site, baseChrom);
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ if(true/*USE_AFFINE_SCORE*/){
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+
+ chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+
+ Arrays.toString(locArray)+"\n"+
+ Arrays.toString(values)+"\n"+
+ new String(bases)+"\nstrand="+strand+"\n");
+ System.err.println();
+ }
+ score=-99999;
+ }
+
+
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+
+
+// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+
+// if(chrom==17 && absdif(min, 30354420)<2000){
+// System.err.println("\n*****\n");
+// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+
+// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+// System.err.println();
+// System.err.println(Arrays.toString(locArray));
+// System.err.println();
+// System.err.println("chrom="+chrom);
+// System.err.println("score="+score);
+// }
+ }
+ }else{
+ score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ score+=scoreZ;
+ }
+ }
+
+
+// score=score(values, centerIndex, offsets, hits);
+// if(ADD_SCORE_Z){
+// int scoreZ=scoreZ2(values, centerIndex, offsets);
+// score+=scoreZ;
+// }
+//
+// if(USE_EXTENDED_SCORE){
+// if(score>minQuickScore){
+//// System.out.println(score+" > "+minQuickScore);
+// score=extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex, locArray);
+// }else{
+//// System.out.print(".");
+// score=-1;
+// }
+// }
+
+
+// System.err.println("maxScore = "+maxScore);
+// System.err.println("hits = "+approxHits+" / "+approxHitsCutoff);
+// System.err.println("score = "+score+" / "+cutoff);
+
+ if(score>=cutoff){
+
+// System.err.println("Passed!");
+
+// System.out.println("approxHits="+approxHits+" / "+approxHitsCutoff);
+// System.out.println("score="+score+" / "+cutoff);
+// System.out.println("strand="+Gene.strandCodes[strand]);
+// System.out.println("center="+values[centerIndex]);
+// System.out.println("values="+Arrays.toString(values));
+// extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex);
+// System.out.println();
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+ //A new best score may raise both the hit-count cutoff and the score cutoff for later sites.
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+
+ // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH));
+ if(USE_EXTENDED_SCORE && score>=maxScore){
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+// final int chrom=numberToChrom(site, baseChrom);
+// final int site2=numberToSite(site);
+// final int site3=numberToSite(maxNearbySite)+read.length;
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ //Note: I could also do this as soon as score is calculated.
+// if(ADD_SCORE_Z){
+// int scoreZ=scoreZ2(values, centerIndex, offsets);
+// score+=scoreZ;
+// }
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+// SiteScore prevSS=(ssl.size()<1 ? null : ssl.get(ssl.size()-1));
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+ //NOTE(review): the gap branch below ends in assert(false), so it appears not to be expected in slowWalk2 - confirm.
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ gapArray=makeGapArray(locArray, mapStart, MINGAP);
+ if(gapArray!=null){
+ int sub=site2-mapStart;//thus site2=mapStart+sub
+ for(int i=0; i<gapArray.length; i++){
+ gapArray[i]+=sub;
+ }
+ assert(gapArray[0]==mapStart);
+ assert(gapArray[gapArray.length-1]==mapStop);
+ }
+ assert(false) : Arrays.toString(locArray);
+ }
+ //Try to merge this site into the previous one when both are ungapped and overlap on the same chrom and strand.
+ if(gapArray==null && prevSS!=null && prevSS.gaps==null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ int betterScore=Tools.max(score, prevSS.score);
+ int minStart=Tools.min(prevSS.start, site2);
+ int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStop(maxStop);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStart(minStart);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2){
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ ss.gaps=gapArray;
+ if(gapArray!=null){
+ System.err.println(ss.toText()+"\t"+Arrays.toString(gapArray)+"\n"+Arrays.toString(locArray)+"\n");
+ }
+ }
+ //ss is non-null only when a new site was created rather than merged into prevSS.
+ if(ss!=null){
+// System.out.println("Added site "+ss.toText());
+ ssl.add(ss);
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+
+// if(prevSS!=null && prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+// int betterScore=Tools.max(score, prevSS.score);
+// if(prevSS.start==site2 && prevSS.stop==site3){
+// prevSS.score=prevSS.quickScore=betterScore;
+// }else if(prevSS.start==site2
+// /*isWithin(prevSS.start, prevSS.stop, site2, site3) ||
+// isWithin(site2, site3, prevSS.start, prevSS.stop)*/){
+// prevSS.score=prevSS.quickScore=betterScore;
+// assert(prevSS.start<prevSS.stop);
+//// prevSS.start=Tools.min(prevSS.start, site2);
+// prevSS.stop=Tools.max(prevSS.stop, site3);
+// assert(prevSS.start<prevSS.stop);
+// }else{
+// SiteScore ss=new SiteScore(chrom, strand, site2, site3, score);
+// ssl.add(ss);
+// prevSS=ss;
+// }
+// }else{
+// SiteScore ss=new SiteScore(chrom, strand, site2, site3, score);
+// ssl.add(ss);
+// prevSS=ss;
+// }
+
+ }
+ }
+ //Advance every column currently at this site to its next list entry, re-adding it to the heap if one remains.
+ while(site==(int)heap.peek().site){ //Remove all identical elements, and add subsequent elements
+ final Quad64 t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=((long)a2)&0xFFFFFFFFL;
+ values[col]=a2;
+ heap.add(t2);
+ }
+ if(heap.isEmpty()){break;}
+ }
+
+ }
+
+ return ssl;
+ }
+
+ /** This uses a heap to track next column to increment */
+ private final ArrayList<SiteScore> slowWalk3(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl,
+ int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+ assert(USE_EXTENDED_SCORE);
+
+ final int numKeys=offsets.length; //Before shrink
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ usedKeys+=numHits;
+ usedKeyIterations++;
+
+ final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+ final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+ heap.clear();
+
+ final Quad64[] triples=tripleStorage;
+
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+ final Block b=index[baseChrom];
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=bestScores[0];
+ int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH));
+
+ int qcutoff=Tools.max(bestScores[2], minQuickScore);
+ int bestqscore=bestScores[3];
+ int maxHits=bestScores[1];
+ int perfectsFound=bestScores[5];
+ assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits;
+ int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore);
+ if(approxHitsCutoff>numHits){return ssl;}
+
+ final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore);
+
+
+ assert(USE_EXTENDED_SCORE);
+
+ if(currentTopScore>=maxScore){
+ assert(currentTopScore==maxScore);
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }
+
+
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad64 t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=((long)a2)&0xFFFFFFFFL;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff);
+
+
+ SiteScore prevSS=null;
+ while(!heap.isEmpty()){
+ Quad64 t=heap.peek();
+ final int site=(int)t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+
+ {//Inner loop
+ final int minsite=subUnsigned(site, MAX_INDEL);
+ final int maxsite=addUnsigned(site, MAX_INDEL2);
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==(int)triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int score;
+ int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ assert(USE_EXTENDED_SCORE);
+
+ boolean locArrayValid=false;
+ if(qscore<qcutoff){
+ score=-1;
+ }else{
+
+ final int chrom=numberToChrom(site, baseChrom);
+
+ //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results).
+ //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run.
+ //May have something to do with compiler loop optimizations.
+ if(shortCircuit && qscore==maxQuickScore){
+ assert(approxHits==numKeys);
+// assert(maxScore==extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray)) :
+// maxScore+", "+extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray)+
+// ", "+approxHits+", "+numKeys+", "+qscore+", "+maxQuickScore+", "+
+// (scoreZ2(values, centerIndex, offsets, approxHits)+quickScore(values, keyScores, centerIndex, offsets, hits, false, approxHits))+", "+
+// (scoreZ2(values, centerIndex, offsets, approxHits)+quickScore(values, keyScores, centerIndex, offsets, hits, true, approxHits))
+// +"\noffsets="+Arrays.toString(offsets)+"\n";
+ score=maxScore;
+ }else{
+ if(verbose){
+ System.err.println("Extending "+Arrays.toString(values));
+ }
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ locArrayValid=true;
+
+ if(verbose){
+ System.err.println("score: "+score);
+ System.err.println("locArray: "+Arrays.toString(locArray));
+ }
+
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+ if(score>=maxScore){
+ assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+
+ // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+
+ chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+
+ Arrays.toString(locArray)+"\n"+
+ Arrays.toString(values)+"\n"+
+ new String(bases)+"\nstrand="+strand+"\n");
+ System.err.println();
+ }
+ score=-99999;
+ }
+
+ //mapStart and mapStop are indices
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+ if(score>=maxScore){
+ assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+ }
+
+ if(score==maxScore){
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true);
+ }
+
+ if(score>=cutoff){
+ qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH));
+ bestqscore=Tools.max(qscore, bestqscore);
+ }
+ }
+
+// System.err.println("maxScore = "+maxScore);
+// System.err.println("hits = "+approxHits+" / "+approxHitsCutoff);
+// System.err.println("score = "+score+" / "+cutoff);
+
+ if(score>=cutoff){
+
+// System.err.println("Passed!");
+
+// System.out.println("approxHits="+approxHits+" / "+approxHitsCutoff);
+// System.out.println("score="+score+" / "+cutoff);
+// System.out.println("strand="+Gene.strandCodes[strand]);
+// System.out.println("center="+values[centerIndex]);
+// System.out.println("values="+Arrays.toString(values));
+// extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex);
+// System.out.println();
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore);
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+ if(score>=maxScore){
+ assert(USE_EXTENDED_SCORE);
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ assert(locArrayValid) : "Loc array was not filled.";
+// System.err.println("****\n"+Arrays.toString(locArray)+"\n");
+// int[] clone=locArray.clone();
+ gapArray=makeGapArray(locArray, site2, MINGAP);
+ if(gapArray!=null){
+// System.err.println(Arrays.toString(locArray)+"\n");
+// System.err.println(Arrays.toString(gapArray));
+//
+//// int sub=site2-mapStart;//thus site2=mapStart+sub
+//// for(int i=0; i<gapArray.length; i++){
+//// gapArray[i]+=sub;
+//// }
+//// System.err.println(Arrays.toString(gapArray));
+//
+// System.err.println(mapStart+" -> "+site2);
+// System.err.println(mapStop+" -> "+site3);
+
+ assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length);
+ assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+
+ mapStart+" -> "+site2+"\n"+
+ mapStop+" -> "+site3+"\n\n"+
+ Arrays.toString(gapArray)+"\n\n"+
+// Arrays.toString(clone)+"\n\n"+
+ Arrays.toString(locArray)+"\n"+
+ "numHits="+numHits+", "+
+ "heap.size="+heap.size()+", "+
+ "numHits="+numHits+", "+
+ "approxHits="+approxHits+"\n";
+ gapArray[0]=Tools.min(gapArray[0], site2);
+ gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3);
+ }
+ if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));}
+// assert(false) : Arrays.toString(locArray);
+ }
+
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+ final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]);
+// if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);}
+
+ if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ final int betterScore=Tools.max(score, prevSS.score);
+ final int minStart=Tools.min(prevSS.start, site2);
+ final int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.perfect=(prevSS.perfect || perfect1 || perfect2);
+ if(prevSS.perfect){prevSS.semiperfect=true;}
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStop(maxStop);
+ prevSS.setPerfect(bases);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStart(minStart);
+ prevSS.setPerfect(bases);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2 && !prevSS.semiperfect){
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.setPerfect(bases);
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else if(inbounds){
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ ss.gaps=gapArray;
+ if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ }
+
+ assert(ss==null || !ss.perfect || ss.semiperfect) : ss;
+ assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS;
+ if(ss!=null && (SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect)){ss=null;}
+
+
+ if(ss!=null){
+// System.out.println("Added site "+ss.toText()+", qscore="+qscore);
+ ssl.add(ss);
+ if(ss.perfect){
+
+ if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){
+ if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));}
+ perfectsFound++;
+
+ //Human-specific code
+// if(QUIT_AFTER_TWO_PERFECTS){
+// if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;}
+// }
+
+ if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;}
+ }
+ }
+
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+ }
+ }
+
+ while(site==(int)heap.peek().site){ //Remove all identical elements, and add subsequent elements
+ final Quad64 t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=((long)a2)&0xFFFFFFFFL;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(heap.size()<approxHitsCutoff || PERFECTMODE){
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound; //***$ fixed by adding this line
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+ if(heap.isEmpty()){
+ assert(false) : heap.size()+", "+approxHitsCutoff;
+ break;
+ }
+ }
+
+ }
+
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound;
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+// bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+// bestScores[1]=Tools.max(bestScores[1], maxHits);
+// bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ return ssl;
+ }
+
+
+ /**
+ * Pre-scan over the hit lists of all keys that reports the best quick score seen and the
+ * largest number of approximate hits seen, without building any SiteScore objects.
+ * @param starts First index into the block's site array for each key's hit list
+ * @param stops Exclusive end index into the site array for each key's hit list
+ * @param offsets Position of each key within the read; subtracted from each hit site
+ * @param keyScores Score awarded per key
+ * @param baseChrom_ Chromosome used to select the index block (masked via baseChrom())
+ * @param triples Pre-allocated heap nodes, one per key; triples[i].column must equal i
+ * @param values Scratch array; on each step values[i] holds the current offset-adjusted site of key i
+ * @param prevMaxHits Lower bound used to seed the approximate-hit cutoff
+ * @param earlyExit If true, return as soon as the maximum possible quick score is reached,
+ * or when a list is exhausted and too few lists remain (see loop tail)
+ * @param perfectOnly If true, require every key to hit (cutoff = number of keys)
+ * @return A new two-element array {topQscore, maxHits}
+ */
+ private final int[] findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores,
+ final int baseChrom_, final Quad64[] triples, final int[] values, final int prevMaxHits,
+ boolean earlyExit, boolean perfectOnly){
+
+ final int numHits=offsets.length;
+ assert(numHits>=prevMaxHits);
+
+ final int baseChrom=baseChrom(baseChrom_);
+ final Block b=index[baseChrom];
+ final int[] sizes=sizeArray;
+
+ //Seed the heap with the first hit of every key's list, shifted left by the key's offset in the read
+ heap.clear();
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ //Subtracting the offset would borrow from the chrom bits, so clamp the site portion at 0 and re-encode
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad64 t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ //Stored as a non-negative long (low 32 bits of a2)
+ t.site=((long)a2)&0xFFFFFFFFL;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ int topQscore=-999999999;
+
+ int maxHits=0;
+// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+
+ int approxHitsCutoff;
+ //NOTE(review): indelCutoff is assigned below but never read within this method
+ final int indelCutoff;
+ if(perfectOnly){
+ approxHitsCutoff=numHits;
+ indelCutoff=0;
+ }else{
+ approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy
+ indelCutoff=MAX_INDEL2;
+ }
+
+
+ //Each iteration evaluates the site at the top of the heap, then advances every list currently at that site.
+ //Presumably Quad64Heap is a min-heap on site, so sites are visited in ascending order - TODO confirm.
+ while(!heap.isEmpty()){
+ Quad64 t=heap.peek();
+ final int site=(int)t.site; //*** TODO
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+
+ {//Inner loop
+ //Count keys whose current site lies within [site-MAX_INDEL, site+MAX_INDEL2];
+ //'chances' aborts the count once the cutoff can no longer be reached.
+ final int minsite=subUnsigned(site, MAX_INDEL);
+ final int maxsite=addUnsigned(site, MAX_INDEL2);
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==(int)triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ if(qscore>topQscore){
+
+// maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan
+
+ //A new best site raises the bar for later sites to one hit fewer than this one
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan
+
+ topQscore=qscore;
+
+ if(qscore>=maxQuickScore){
+ assert(qscore==maxQuickScore);
+ assert(approxHits==numHits);
+ if(earlyExit){
+ return new int[] {topQscore, maxHits};
+ }
+ }
+ }
+ }
+
+ while(site==(int)heap.peek().site){ //Remove all identical elements, and add subsequent elements
+ final Quad64 t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ //Same offset adjustment as in the seeding loop above
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=((long)a2)&0xFFFFFFFFL;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(earlyExit && (perfectOnly || heap.size()<approxHitsCutoff)){
+ //This key's list is exhausted and was not re-added; with too few lists left
+ //(or any list gone in perfect-only mode) the cutoff cannot be met again.
+ return new int[] {topQscore, maxHits};
+ }
+ //Guard the peek() in the loop condition once the last list has been drained
+ if(heap.isEmpty()){break;}
+ }
+
+ }
+
+
+
+ return new int[] {topQscore, maxHits};
+ }
+
+
+ /**
+  * Absolute difference between two encoded values, computed only when they share a sign.
+  * Values of differing sign are treated as maximally distant.
+  * @return |a-b| when a and b are both negative or both non-negative; otherwise Integer.MAX_VALUE
+  */
+ private static final int absdif(int a, int b){
+     final boolean sameSign=((a<0)==(b<0));
+     if(!sameSign){return Integer.MAX_VALUE;}
+     return (a>b) ? (a-b) : (b-a);
+ }
+
+
+ /**
+  * Upper bound on the score a read can receive under the active scoring mode.
+  * @param offsets Key positions within the read (only used by the quick-score fallback)
+  * @param baseScores Per-base scores; only consulted when useQuality is true
+  * @param keyScores Per-key scores (only used by the quick-score fallback)
+  * @param readlen Read length in bases
+  * @param useQuality Whether the per-base scores contribute to the maximum
+  * @return The maximum score for the affine mode, else the extended mode, else the quick-score maximum
+  */
+ final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){
+
+     if(USE_AFFINE_SCORE){
+         //The quality-aware variant apparently MUST be used if quality is used later on for slow align.
+         if(useQuality){return msa.maxQuality(baseScores);}
+         return msa.maxQuality(readlen);
+     }
+
+     if(USE_EXTENDED_SCORE){
+         final int perfectBases=readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);
+         if(useQuality){return perfectBases+Tools.sumInt(baseScores);}
+         return perfectBases;
+     }
+
+     return maxQuickScore(offsets, keyScores);
+ }
+
+
+ /**
+  * Highest quick score obtainable for a set of keys: the sum of all key scores, plus the optional
+  * Z term from maxScoreZ, plus Y_SCORE_MULT times the distance between the first and last key offsets.
+  * @param offsets Key positions within the read; must be non-empty
+  * @param keyScores Score awarded per key
+  */
+ public final int maxQuickScore(int[] offsets, int[] keyScores){
+
+     int keyTerm=Tools.intSum(keyScores);
+     final int span=offsets[offsets.length-1]-offsets[0];
+     final int spanTerm=Y_SCORE_MULT*span;
+
+     if(ADD_SCORE_Z){keyTerm+=maxScoreZ(offsets);}
+
+     return keyTerm+spanTerm;
+ }
+
+
+ /**
+  * Quick score for the candidate site anchored on the key at centerIndex.
+  * The score is the center key's score plus the scores of keys aligned to its left and right,
+  * plus a Y term (scoreY scaled by Y_SCORE_MULT) and an optional list-size bonus.
+  * With a single approximate hit the center key's score is returned directly.
+  */
+ private final int quickScore(final int[] values, final int[] keyScores, final int centerIndex, final int offsets[],
+         int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){
+
+     if(numApproxHits==1){return keyScores[centerIndex];}
+
+     //Start from the key that exactly hits the center, then align leftward and rightward of it.
+     final int centerScore=keyScores[centerIndex];
+     final int leftScore=scoreLeft(values, keyScores, centerIndex, sizes, penalizeIndels);
+     final int rightScore=scoreRight(values, keyScores, centerIndex, sizes, penalizeIndels, numHits);
+
+     //"-centerIndex" is a tie-breaker: given otherwise identical match patterns
+     //(for example, a small indel will generate two valid site candidates), it favors the lower site.
+     int keyTerm=centerScore+leftScore+rightScore-centerIndex;
+
+     final int spanTerm=Y_SCORE_MULT*scoreY(values, centerIndex, offsets);
+     if(ADD_LIST_SIZE_BONUS){keyTerm+=calcListSizeBonus(sizes[centerIndex]);}
+     return keyTerm+spanTerm;
+ }
+
+
+// /** Generates a term that increases score with how many bases in the read match the ref. */
+// private static final int scoreZ(int[] locs, int centerIndex, int offsets[]){
+// final int center=locs[centerIndex];
+//
+// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE];
+//
+// final int maxLoc=center+MAX_INDEL2;
+// final int minLoc=Tools.max(0, center-MAX_INDEL);
+//
+// int score=0;
+//
+// for(int i=0; i<locs.length; i++){
+// int loc=locs[i];
+//// int dif=absdif(loc, center);
+// if(loc>=minLoc && loc<=maxLoc){
+//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+
+//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets);
+//
+// int offset=offsets[i];
+// int max=CHUNKSIZE+offset;
+//
+// for(int j=offset; j<max; j++){
+// int old=refLoc[j];
+// if(old==0){
+// refLoc[j]=loc;
+// score+=4;
+// }else if(old>loc){
+// refLoc[j]=loc;
+// score-=2;
+// }else if(old==loc){
+// score-=1;
+// //do nothing, perhaps, or add 1?
+// }else{
+// score-=2;
+// assert(old<loc);
+// }
+// }
+// }
+// }
+// return score;
+// }
+
+
+
+ /**
+ * Scores a candidate site by extending each nearby key hit base-by-base against the reference.
+ * Fills locArray so that locArray[i] holds the reference start location (refbase) of the key whose
+ * extension matched read base i, -1 where nothing matched, and -2 where the read base is 'N'.
+ * The score is then computed from locArray, either by msa.calcAffineScore (USE_AFFINE_SCORE)
+ * or by the per-base summation at the end of this method.
+ * @param bases Read bases
+ * @param baseScores Per-base score added for each matched position
+ * @param offsets Position of each key within the read
+ * @param values Current offset-adjusted, encoded site of each key
+ * @param chrom Chromosome whose reference array is compared against
+ * @param centerIndex Index of the key that defines the center site
+ * @param locArray Output/scratch array; overwritten. Indexed by read position in the loops below
+ * @param numHits Number of keys to examine
+ * @param numApproxHits Not read by this method
+ * @return The extended (or affine) score for this site
+ */
+ private final int extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values,
+ final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits){
+ callsToExtendScore++;
+
+ final int centerVal=values[centerIndex];
+ final int centerLoc=numberToSite(centerVal);
+
+ final int minLoc=Tools.max(0, centerLoc-MAX_INDEL); //Legacy, for assertions
+ final int maxLoc=centerLoc+MAX_INDEL2; //Legacy, for assertions
+
+ //Window of encoded values considered part of this site
+ final int minVal=subUnsigned(centerVal, MAX_INDEL);
+ final int maxVal=addUnsigned(centerVal, MAX_INDEL2);
+
+// System.out.println("Min, center, max = "+minLoc+", "+center+", "+ maxLoc);
+// System.out.println("centerIndex = "+centerIndex);
+
+ final byte[] ref=Data.getChromosome(chrom).array;
+
+// int[] locArray=new int[bases.length];
+ Arrays.fill(locArray, -1);
+
+
+ //First fill in reverse
+ //For each in-window key, walk from the key's last base toward the start of the read.
+ for(int i=0, keynum=0; i<numHits; i++){
+ final int value=values[i];
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+ assert(refbase>=minLoc && refbase<=maxLoc);
+
+ // System.out.println("Reverse: Trying key "+refbase+" @ "+offsets[i]);
+ // System.out.println("Passed!");
+ keynum++;
+ final int callbase=offsets[i];
+
+ int misses=0;
+ //cloc indexes the read, rloc the reference; both step backward together
+ for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc<ref.length; cloc--, rloc--){
+ int old=locArray[cloc];
+ if(old==refbase){
+ // System.out.println("Broke because old="+old+", refbase="+refbase);
+ break;
+ } //Already filled with present value
+ if(misses>0 && old>=0){
+ // System.out.println("Broke because old="+old+", misses="+misses);
+ break;
+ } //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ //Only extends first key all the way back. Others stop at the first error.
+ if(old>=0 || keynum>1){
+ // System.out.println("Broke because old="+old+", keynum="+keynum);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+
+
+ //Then fill forward
+ //For each in-window key, walk from the first base after the key toward the end of the read.
+ for(int i=0; i<numHits; i++){
+ final int value=values[i];
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+ assert(refbase>=minLoc && refbase<=maxLoc);
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN, rloc=refbase+cloc; cloc<bases.length && rloc<ref.length; cloc++, rloc++){
+ int old=locArray[cloc];
+ if(old==refbase){break;} //Already filled with present value
+ if(misses>0 && old>=0){break;} //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ if(old>=0){break;} //Already filled with something that has no errors
+ }
+ }
+ }
+ }
+
+// //Change 'N' to -2. A bit slow.
+// {
+// int firstMatch=0;
+// while(firstMatch<locArray.length && locArray[firstMatch]<0){firstMatch++;}
+// assert(firstMatch<locArray.length) : new String(bases);
+// int last=locArray[firstMatch];
+// for(int i=firstMatch-1; i>=0; i--){
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else{
+// assert(locArray[i]==-1);
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// for(int i=firstMatch; i<locArray.length; i++){
+// final int loc=locArray[i];
+// if(last<1){last=loc;}
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else if(loc==-1 && last>0){
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// }
+
+ //Change 'N' to -2, but only for nocalls, not norefs. Much faster.
+ {
+ final byte nb=(byte)'N';
+ for(int i=0; i<bases.length; i++){
+ if(bases[i]==nb){locArray[i]=-2;}
+ }
+ }
+
+ if(USE_AFFINE_SCORE){
+ /* TODO - sometimes returns a higher score than actual alignment. This should never happen. */
+ int score=(KFILTER<2 ? msa.calcAffineScore(locArray, baseScores, bases) :
+ msa.calcAffineScore(locArray, baseScores, bases, KFILTER));
+ return score;
+ }
+
+ //Non-affine scoring: each matched base earns BASE_HIT_SCORE plus its base score, bases attributed
+ //to the center location earn a bonus, and each change of location between consecutive matched
+ //bases costs a capped penalty that grows with the size of the jump.
+ int score=0;
+ int lastLoc=-1;
+ int centerBonus=BASE_HIT_SCORE/5;
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+ if(loc>=0){
+ score+=BASE_HIT_SCORE+baseScores[i];
+ if(loc==centerLoc){score+=centerBonus;}
+ if(loc!=lastLoc && lastLoc>=0){
+ int dif=absdif(loc, lastLoc);
+ int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT);
+ score-=penalty;
+ }
+ lastLoc=loc;
+ }
+ }
+
+// System.err.println("Extended score: "+score);
+// System.err.println(Arrays.toString(locArray));
+
+
+ return score;
+ }
+
+
+ /** NOTE! This destroys the locArray, so use a copy if needed. */
+ private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){
+ int gaps=0;
+ boolean doSort=false;
+
+ if(locArray[0]<0){locArray[0]=minLoc;}
+ for(int i=1; i<locArray.length; i++){
+ if(locArray[i]<0){locArray[i]=locArray[i-1]+1;}
+ else{locArray[i]+=i;}
+ if(locArray[i]<locArray[i-1]){doSort=true;}
+ }
+
+// System.err.println(Arrays.toString(locArray)+"\n");
+
+ if(doSort){
+// System.err.println("*");
+ Arrays.sort(locArray);
+ }
+// System.err.println(Arrays.toString(locArray)+"\n");
+
+ for(int i=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ gaps++;
+ }
+ }
+ if(gaps<1){return null;}
+ int[] out=new int[2+gaps*2];
+ out[0]=locArray[0];
+ out[out.length-1]=locArray[locArray.length-1];
+
+ for(int i=1, j=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ out[j]=locArray[i-1];
+ out[j+1]=locArray[i];
+ j+=2;
+ }
+ }
+ return out;
+ }
+
+
+ /**
+  * Generates a term that increases score with how many bases in the read match the ref.
+  * Counts the read positions covered by the union of the intervals [offset, offset+KEYLEN) of every
+  * key whose current value lies in the window around the center value, then scales by Z_SCORE_MULT.
+  * The window is computed with subUnsigned/addUnsigned, matching the window used to count approximate
+  * hits, so that encoded values with the high bit set (negative ints) are handled the same way as
+  * non-negative ones instead of producing an empty window.
+  * @param values Current offset-adjusted, encoded site of each key
+  * @param centerIndex Index of the key defining the center of the window
+  * @param offsets Position of each key within the read; the merge below assumes ascending order - TODO confirm
+  * @param numApproxHits Number of keys in the window; 1 short-circuits to SCOREZ_1KEY
+  * @param numHits Number of keys to examine
+  */
+ private final int scoreZ2(int[] values, int centerIndex, int offsets[], int numApproxHits, int numHits){
+
+     if(numApproxHits==1){return SCOREZ_1KEY;}
+
+     final int center=values[centerIndex];
+
+     //Clamped, sign-aware window; identical to the old signed math whenever that math did not overflow
+     //and center was non-negative.
+     final int minVal=subUnsigned(center, MAX_INDEL);
+     final int maxVal=addUnsigned(center, MAX_INDEL2);
+
+     int score=0;
+
+     //[a0, b0) is the run of covered read positions currently being extended
+     int a0=-1, b0=-1;
+
+     for(int i=0; i<numHits; i++){
+         final int loc=values[i];
+         if(loc>=minVal && loc<=maxVal){
+             final int a=offsets[i];
+
+             if(b0<a){
+                 //This key starts past the current run: bank the run and start a new one
+                 score+=b0-a0;
+                 a0=a;
+             }
+             b0=a+KEYLEN;
+         }
+     }
+     score+=b0-a0;
+     score=score*Z_SCORE_MULT;
+     return score;
+ }
+
+ /**
+  * This was just to verify scoreZ2.
+  * Marks every read position covered by an in-window key in a scratch array and counts
+  * the positions that were newly marked, scaled by Z_SCORE_MULT.
+  */
+ @Deprecated
+ private final int scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display, int numHits){
+     final int center=locs[centerIndex];
+
+     final int maxLoc=center+MAX_INDEL2;
+     final int minLoc=Tools.max(0, center-MAX_INDEL);
+
+     //One flag per read position, sized to reach the end of the last key
+     final byte[] covered=new byte[offsets[offsets.length-1]+KEYLEN];
+     int newlyCovered=0;
+
+     for(int i=0; i<numHits; i++){
+         final int loc=locs[i];
+         if(loc<minLoc || loc>maxLoc){continue;}
+
+         final int keyStart=offsets[i];
+         final int keyEnd=keyStart+KEYLEN;
+         for(int j=keyStart; j<keyEnd; j++){
+             if(covered[j]==0){newlyCovered++;}
+             covered[j]=1;
+         }
+     }
+
+     if(display){System.err.println("\n"+Arrays.toString(covered)+"\n");}
+
+     return newlyCovered*Z_SCORE_MULT;
+ }
+
+ /**
+  * Maximum possible Z term: the number of read positions covered by the union of all key
+  * intervals [offset, offset+KEYLEN), scaled by Z_SCORE_MULT.
+  * @param offsets Position of each key within the read
+  */
+ private final int maxScoreZ(int offsets[]){
+     int covered=0;
+     //[runStart, runEnd) is the run of covered positions currently being extended
+     int runStart=-1, runEnd=-1;
+
+     for(final int keyStart : offsets){
+         if(runEnd<keyStart){
+             //Key begins past the current run: bank it and open a new run
+             covered+=runEnd-runStart;
+             runStart=keyStart;
+         }
+         runEnd=keyStart+KEYLEN;
+     }
+     covered+=runEnd-runStart;
+     return covered*Z_SCORE_MULT;
+ }
+
+
+ /**
+  * Sums the scores of keys to the right of the center key that chain onto it.
+  * A key chains when its value is within MAX_INDEL (per absdif) of the last chained key;
+  * keys that do not chain, or whose value is -1, are skipped.
+  * @param penalizeIndels If true, a nonzero distance between chained keys costs a capped penalty
+  */
+ private final int scoreRight(int[] values, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits){
+
+     int total=0;
+
+     //Value of the most recently chained key; starts at the center
+     int anchor=values[centerIndex];
+
+     for(int i=centerIndex+1; i<numHits; i++){
+         final int candidate=values[i];
+         if(candidate==-1){continue;}
+
+         final int offset=absdif(candidate, anchor);
+         if(offset>MAX_INDEL){continue;} //Too far away; keep the old anchor
+
+         anchor=candidate;
+         total+=keyScores[i];
+         if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[i]);}
+
+         if(penalizeIndels && offset!=0){
+             final int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT);
+             total-=penalty;
+         }
+     }
+     return total;
+
+ }
+
+ /**
+  * Sums the scores of keys to the left of the center key that chain onto it, walking outward
+  * from the center. Also increments the callsToScore counter.
+  * A key chains when its value is within MAX_INDEL (per absdif) of the last chained key;
+  * keys that do not chain, or whose value is -1, are skipped.
+  * @param penalizeIndels If true, a nonzero distance between chained keys costs a capped penalty
+  */
+ private final int scoreLeft(int[] values, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){
+
+     callsToScore++;
+
+     int total=0;
+
+     //Value of the most recently chained key; starts at the center
+     int anchor=values[centerIndex];
+
+     for(int i=centerIndex-1; i>=0; i--){
+         final int candidate=values[i];
+         if(candidate==-1){continue;}
+
+         final int offset=absdif(candidate, anchor);
+         if(offset>MAX_INDEL){continue;} //Too far away; keep the old anchor
+
+         anchor=candidate;
+         total+=keyScores[i];
+         if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[i]);}
+
+         if(penalizeIndels && offset!=0){
+             final int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT);
+             total-=penalty;
+         }
+     }
+     return total;
+
+ }
+
+ /**
+  * Encode a (location, chrom) pair to an index.
+  * The chromosome's low bits (chrom&amp;CHROM_MASK_LOW) are shifted above the site bits and OR'd with the site.
+  */
+ private static final int toNumber(int site, int chrom){
+     final int chromBits=(chrom&CHROM_MASK_LOW)<<SHIFT_LENGTH;
+     return chromBits|site;
+ }
+
+ /**
+  * Decode an (index, baseChrom) pair to a chromosome.
+  * The bits above the site field give the chromosome's low bits; the high bits come from baseChrom,
+  * which must be non-negative and have no low bits set.
+  */
+ private static final int numberToChrom(int number, int baseChrom){
+     assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     final int lowBits=(number>>>SHIFT_LENGTH);
+     final int highBits=(baseChrom&CHROM_MASK_HIGH);
+     return lowBits+highBits;
+ }
+
+ /** Decode an index to a location by keeping only the bits selected by SITE_MASK. */
+ private static final int numberToSite(int number){
+     final int site=(number&SITE_MASK);
+     return site;
+ }
+
+ private static int subUnsigned(int a, int b){
+ int c=a-b;
+ if((a<0)==(c<0)){return c;}
+ if(a>=0){return b>=0 ? 0 : Integer.MAX_VALUE-10000;}
+ return b>=0 ? Integer.MIN_VALUE : -10000;
+ }
+
+ private static int addUnsigned(int a, int b){
+ int c=a+b;
+ if((a<0)==(c<0)){return c;}
+ if(a>=0){return b>=0 ? Integer.MAX_VALUE-10000 : 0;}
+ return b>=0 ? -10000 : Integer.MIN_VALUE;
+ }
+
+ private static final int minChrom(int chrom){return Tools.max(MINCHROM, chrom&CHROM_MASK_HIGH);}
+ private static final int baseChrom(int chrom){return Tools.max(0, chrom&CHROM_MASK_HIGH);}
+ private static final int maxChrom(int chrom){return Tools.max(MINCHROM, Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW));}
+
+
+ /**
+  * Returns a reusable int array of exactly the requested length.
+  * Arrays are cached per length in offsetArrays; the contents are NOT cleared between calls.
+  * Lengths beyond the cache get a fresh, uncached array, consistent with getLocArray,
+  * instead of failing with an ArrayIndexOutOfBoundsException.
+  * @param len Required array length
+  */
+ private final int[] getOffsetArray(int len){
+     if(len>=offsetArrays.length){return new int[len];}
+     if(offsetArrays[len]==null){offsetArrays[len]=new int[len];}
+     return offsetArrays[len];
+ }
+ private final int[] getLocArray(int len){
+ if(len>=locArrays.length){return new int[len];}
+ if(locArrays[len]==null){locArrays[len]=new int[len];}
+ return locArrays[len];
+ }
+ /**
+  * Returns a reusable int array of exactly the requested length.
+  * Arrays are cached per length in greedyListArrays; the contents are NOT cleared between calls.
+  * Lengths beyond the cache get a fresh, uncached array, consistent with getLocArray,
+  * instead of failing with an ArrayIndexOutOfBoundsException.
+  * @param len Required array length
+  */
+ private final int[] getGreedyListArray(int len){
+     if(len>=greedyListArrays.length){return new int[len];}
+     if(greedyListArrays[len]==null){greedyListArrays[len]=new int[len];}
+     return greedyListArrays[len];
+ }
+ /**
+  * Returns a reusable int array of exactly the requested length.
+  * Arrays are cached per length in genericArrays; the contents are NOT cleared between calls.
+  * Lengths beyond the cache get a fresh, uncached array, consistent with getLocArray,
+  * instead of failing with an ArrayIndexOutOfBoundsException.
+  * @param len Required array length
+  */
+ private final int[] getGenericArray(int len){
+     if(len>=genericArrays.length){return new int[len];}
+     if(genericArrays[len]==null){genericArrays[len]=new int[len];}
+     return genericArrays[len];
+ }
+
+ final byte[] getBaseScoreArray(int len, int strand){
+ if(len>=baseScoreArrays[0].length){return new byte[len];}
+ if(baseScoreArrays[strand][len]==null){baseScoreArrays[strand][len]=new byte[len];}
+ return baseScoreArrays[strand][len];
+ }
+ /**
+  * Returns a reusable int array of exactly the requested length for the given strand.
+  * Arrays are cached per (strand, length) in keyScoreArrays; lengths beyond the cache get a fresh,
+  * uncached array.
+  * The size guard checks the length dimension (keyScoreArrays[0].length), as getBaseScoreArray does.
+  * Checking keyScoreArrays.length compared len with the number of strands, so every request with
+  * len>=2 skipped the cache and allocated a new array.
+  * NOTE(review): cached arrays are returned with whatever contents the previous user left;
+  * callers are assumed to overwrite every element - confirm against callers.
+  * @param len Required array length
+  * @param strand First-dimension index into the cache
+  */
+ final int[] getKeyScoreArray(int len, int strand){
+     if(len>=keyScoreArrays[0].length){return new int[len];}
+     if(keyScoreArrays[strand][len]==null){keyScoreArrays[strand][len]=new int[len];}
+     return keyScoreArrays[strand][len];
+ }
+ private final float[] getKeyWeightArray(int len){
+ if(len>=keyWeightArrays.length){return new float[len];}
+ if(keyWeightArrays[len]==null){keyWeightArrays[len]=new float[len];}
+ return keyWeightArrays[len];
+ }
+ /** Exposes this instance's shared key-probability buffer (the same array on every call). */
+ @Override
+ float[] keyProbArray() {
+     return this.keyProbArray;
+ }
+
+
+ //Per-instance scratch buffers. The [][] caches below are indexed by requested length and are
+ //filled lazily by the corresponding get*Array methods; their contents are reused, not cleared.
+ //Cache for getLocArray; lengths of 601 or more are allocated fresh instead.
+ private final int[][] locArrays=new int[601][];
+ private final int[] valueArray=new int[128];
+ //Used as the 'sizes' buffer in findMaxQscore2
+ private final int[] sizeArray=new int[128];
+ //Cache for getOffsetArray
+ private final int[][] offsetArrays=new int[128][];
+ //Cache for getGreedyListArray
+ private final int[][] greedyListArrays=new int[128][];
+ //Cache for getGenericArray
+ private final int[][] genericArrays=new int[128][];
+ private final int[] startArray=new int[128];
+ private final int[] stopArray=new int[128];
+ //Pre-allocated Quad64 nodes, one per index (see makeQuad64Storage)
+ private final Quad64[] tripleStorage=makeQuad64Storage(128);
+ private final int[] greedyReturn=new int[2];
+ private final int[][] shrinkReturn2=new int[3][];
+ private final int[][] shrinkReturn3=new int[5][];
+ private final int[][] prescanReturn=new int[2][];
+ private final int[] prescoreArray;
+ private final int[] precountArray;
+
+ //Cache for getBaseScoreArray, indexed [strand][length]
+ private final byte[][][] baseScoreArrays=new byte[2][601][];
+ //Cache for getKeyScoreArray, indexed [strand][length]
+ private final int[][][] keyScoreArrays=new int[2][128][];
+ //Returned directly by keyProbArray()
+ final float[] keyProbArray=new float[601];
+ //Cache for getKeyWeightArray
+ private final float[][] keyWeightArrays=new float[128][];
+
+
+ /**
+  * Allocates an array of Quad64 objects, each constructed as Quad64(i, 0, 0) for its own index i.
+  * @param number How many objects to create
+  */
+ private final Quad64[] makeQuad64Storage(int number){
+     final Quad64[] storage=new Quad64[number];
+     int i=0;
+     while(i<number){
+         storage[i]=new Quad64(i, 0, 0);
+         i++;
+     }
+     return storage;
+ }
+
+
+ private final Quad64Heap heap=new Quad64Heap(127);
+
+ static int SHIFT_LENGTH=(32-NUM_CHROM_BITS);
+ static int MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH);
+
+ /** Mask the number to get the site, which is in the lower bits */
+ static int SITE_MASK=((-1)>>>(NUM_CHROM_BITS));
+
+ /** Mask the chromosome's high bits to get the low bits */
+ static int CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+
+ /** Mask the chromosome's lower bits to get the high bits */
+ static int CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+ static void setChromBits(int x){
+
+ NUM_CHROM_BITS=x;
+ CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS));
+ SHIFT_LENGTH=(32-NUM_CHROM_BITS);
+ MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH);
+ SITE_MASK=((-1)>>>(NUM_CHROM_BITS));
+ CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+ CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+// assert(false) : Integer.bitCount(SITE_MASK)+", "+MAX_ALLOWED_CHROM_INDEX+", "+CHROMS_PER_BLOCK;
+
+// assert(NUM_CHROM_BITS<30);
+ assert(NUM_CHROM_BITS>=0); //max is 3 for human; perhaps more for other organisms
+// assert((1<<(NUM_CHROM_BITS))>=CHROMSPERBLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMSPERBLOCK;
+ assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMS_PER_BLOCK;
+ assert(Integer.bitCount(CHROMS_PER_BLOCK)==1);
+ assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS)) : Integer.toHexString(SITE_MASK);
+ }
+
+ private final int cycles;
+
+ public static final int BASE_HIT_SCORE=100;
+ public static final int ALIGN_COLUMNS=3000;
+ public static int MAX_INDEL=16000; //Max indel length, min 0, default 400; longer is more accurate
+ public static int MAX_INDEL2=2*MAX_INDEL;
+
+ private final float INV_BASE_KEY_HIT_SCORE;
+ private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1
+ private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length
+ private final int MAX_PENALTY_FOR_MISALIGNED_HIT;
+ private final int SCOREZ_1KEY;
+
+ public static final boolean GENERATE_KEY_SCORES_FROM_QUALITY=true; //True: Much faster and more accurate.
+ public static final boolean GENERATE_BASE_SCORES_FROM_QUALITY=true; //True: Faster, and at least as accurate.
+ public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed
+ public static final int Z_SCORE_MULT=20;
+ public static final int Y_SCORE_MULT=10;
+
+
+ /**
+ * Return only sites that match completely or with partial no-reference
+ */
+ public static void setSemiperfectMode() {
+     //Mutually exclusive with perfect mode
+     assert(!PERFECTMODE);
+
+     SEMIPERFECTMODE=true;
+     PRESCAN_QSCORE=false;
+
+     //No indels are allowed in this mode
+     MAX_INDEL=0;
+     MAX_INDEL2=0;
+ }
+
+ /**
+  * Switches to perfect mode: return only sites that match completely.
+  * Semiperfect mode must not already be set (asserted).
+  * Also zeroes both indel limits and turns the qscore prescan off.
+  */
+ public static void setPerfectMode() {
+     assert(!SEMIPERFECTMODE);
+
+     //Indel limits are zeroed in this mode
+     MAX_INDEL=0;
+     MAX_INDEL2=0;
+     PRESCAN_QSCORE=false;
+     PERFECTMODE=true;
+ }
+
+ static float FRACTION_GENOME_TO_EXCLUDE=0.03f; //Default .03; lower is slower and more accurate. For perfect reads and small genomes, lower is FASTER.
+ /** Stores f in FRACTION_GENOME_TO_EXCLUDE (asserted to satisfy 0<=f<1) and recomputes the five limits below, each 1000*(1-c*f) truncated to int. */
+ public static final void setFractionToExclude(float f){
+ assert(f>=0 && f<1);
+ FRACTION_GENOME_TO_EXCLUDE=f;
+ MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //895 at the initial 0.03f; the earlier "default 810" note does not match this formula
+ MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster; 931 at the initial 0.03f (note formerly said 840)
+ MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //958 at the initial 0.03f (note formerly said 910)
+ MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //970 at the initial 0.03f (note formerly said 935)
+ MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //916 at the initial 0.03f (note formerly said 860)
+ }
+
+
+ /** Fraction of hit lists to retain; set to 0.85 below (the ".75" and ".8" default notes appear to be stale). Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */
+ static final float HIT_FRACTION_TO_RETAIN=0.85f; //default: .8
+ /** Range: 0 to 1000. Lower should be faster and less accurate. */
+ static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810
+ /** Range: 2 to infinity. Lower should be faster and less accurate. */
+ static final int MIN_HIT_LISTS_TO_RETAIN=6;
+
+ static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840
+ //lower is faster
+ static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910
+ //lower is faster
+ static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935
+ //lower is faster
+ static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860
+
+ /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */
+ public static final int SMALL_GENOME_LIST=20;
+
+ static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";}
+
+ static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy.
+
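+ /** Minimum length of list before clumpiness is considered. NOTE(review): the value 2000 lies outside a 0 to 1000 histogram-index range, so this is presumably a raw list length rather than a histogram index - confirm against the code that reads it. */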
+ static final int CLUMPY_MIN_LENGTH_INDEX=2000;
+ static final float CLUMPY_FRACTION=0.75f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy.
+
+ static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */
+ public static final int MAX_HITS_REDUCTION1=0;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */
+ public static int MAX_HITS_REDUCTION2=2; //default 1; higher is more accurate (more mapping and less FP) but slower
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */
+ public static final int MAX_HITS_REDUCTION_PERFECT=0;
+
+ public static int MAXIMUM_MAX_HITS_REDUCTION=3;
+ public static int HIT_REDUCTION_DIV=5;
+
+ private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$
+ assert(keys>=hits) : keys+", "+hits;
+ assert(hits>=0);
+
+ int mahtk=MIN_APPROX_HITS_TO_KEEP;
+ if(SEMIPERFECTMODE || PERFECTMODE){
+ if(keys==1){return 1;}
+ else if(MIN_APPROX_HITS_TO_KEEP<keys){
+ mahtk++;
+ if(currentCutoff==MIN_APPROX_HITS_TO_KEEP){currentCutoff++;}
+ }
+ }
+
+ int reduction=Tools.min(Tools.max((hits)/HIT_REDUCTION_DIV, MAX_HITS_REDUCTION2), Tools.max(MAXIMUM_MAX_HITS_REDUCTION, keys/8));
+// if(hits<3){reduction=0;}
+// else if(hits<4){reduction=Tools.min(1, reduction);}
+// else if(hits<5){reduction=Tools.min(2, reduction);}
+// else if(hits<6){reduction=Tools.min(3, reduction);}
+ assert(reduction>=0);
+ int r=hits-reduction;
+
+ r=Tools.max(mahtk, currentCutoff, r);
+
+ if(perfect){
+ r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT);
+ }
+ return r;
+ }
+
+ public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE;
+ public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed
+ public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast.
+ public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.15f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT=0.025f; //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT2=0.1f;
+ static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.84f : USE_EXTENDED_SCORE ? .84f : 0.6f); //Default .85f; lower is more accurate
+ static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f
+ static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$
+ static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false
+ static{
+ assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1);
+ assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1);
+ }
+
+
+}
diff --git a/current/align2/BBIndexAcc.java b/current/align2/BBIndexAcc.java
new file mode 100755
index 0000000..729a66c
--- /dev/null
+++ b/current/align2/BBIndexAcc.java
@@ -0,0 +1,2804 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+
+/**
+ * Based on Index11a
+ *
+ *
+ *
+ *
+ * @author Brian Bushnell
+ * @date Jul 11, 2012
+ *
+ */
+public final class BBIndexAcc extends AbstractIndex {
+
+
+ public static void main(String[] args){
+     //Parses key=value arguments (build/b, minchrom, maxchrom, keylen/k), then builds the index with disk writing enabled.
+     //Arguments without '=' are ignored; all arguments are lower-cased first.
+     int keylen=13;
+
+     for(final String arg : args){
+         final String lower=arg.toLowerCase();
+         if(!lower.contains("=")){continue;}
+
+         final String[] pair=lower.split("=");
+         final String name=pair[0];
+         final String value=pair[1];
+
+         if(name.equals("build") || name.equals("b")){
+             Data.setGenome(Integer.parseInt(value));
+         }else if(name.equals("minchrom")){
+             MINCHROM=Integer.parseInt(value);
+         }else if(name.equals("maxchrom")){
+             MAXCHROM=Integer.parseInt(value);
+         }else if(name.equals("keylen") || name.equals("k")){
+             keylen=Integer.parseInt(value);
+         }
+     }
+
+     //A value of -1 is replaced by a default
+     if(MINCHROM==-1){MINCHROM=1;}
+     if(MAXCHROM==-1){
+         assert(Data.numChroms<=Byte.MAX_VALUE) : "TODO";
+         MAXCHROM=Data.numChroms;
+     }
+
+     System.err.println("Writing build "+Data.GENOME_BUILD+" "+"BASESPACE index, keylen="+keylen+", chrom bits="+NUM_CHROM_BITS);
+
+     //Start at chrom 1 when there are no chrom bits, otherwise at 0
+     final int firstChrom=(NUM_CHROM_BITS==0 ? 1 : 0);
+
+     Data.sysout.println("Loading index for chunk "+firstChrom+"-"+MAXCHROM+", build "+Data.GENOME_BUILD);
+     final boolean writeToDisk=true, diskInvalid=false;
+     index=IndexMaker4.makeIndex(Data.GENOME_BUILD, firstChrom, MAXCHROM, keylen, NUM_CHROM_BITS,
+             MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index);
+
+     System.err.println("Finished all chroms, may still be writing.");
+ }
+
+
+ public BBIndexAcc(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_){
+ super(k_, kfilter_, BASE_HIT_SCORE, minChrom_, maxChrom_, msa_);
+ INV_BASE_KEY_HIT_SCORE=1f/BASE_KEY_HIT_SCORE;
+ INDEL_PENALTY=(BASE_KEY_HIT_SCORE/2)-1; //default (HIT_SCORE/2)-1
+ INDEL_PENALTY_MULT=20; //default 20; penalty for indel length
+ MAX_PENALTY_FOR_MISALIGNED_HIT=BASE_KEY_HIT_SCORE-(1+BASE_KEY_HIT_SCORE/8);
+ SCOREZ_1KEY=Z_SCORE_MULT*KEYLEN;
+ {
+ int cyc=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){cyc+=2;}
+ cycles=cyc;
+ }
+ prescoreArray=new int[cycles];
+ precountArray=new int[cycles];
+ }
+
+ /** Loads or generates the index for chroms minChrom through maxChrom, inclusive, with key length k.
+  * The range is first clamped to [1, Data.numChroms]; it can encompass multiple blocks.
+  * Should only be called once in a process. */
+ public static final synchronized void loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid){
+     final int first=Math.max(1, minChrom);
+     final int last=Math.min(Data.numChroms, maxChrom);
+     assert(first<=last);
+     Data.sysout.println("Loading index for chunk "+first+"-"+last+", build "+Data.GENOME_BUILD);
+     index=IndexMaker4.makeIndex(Data.GENOME_BUILD, first, last, k, NUM_CHROM_BITS,
+             MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index);
+ }
+
+ /** Calculate statistics of index, such as list lengths, and find clumpy keys.
+  * Fills COUNTS with per-key site totals (summed over blocks, then combined with the reverse-complement key),
+  * builds lengthHistogram, and derives MAX_USABLE_LENGTH, MAX_USABLE_LENGTH2 and Solver.POINTS_PER_SITE.
+  * COUNTS and lengthHistogram must still be null on entry (asserted). */
+ public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){
+     assert(lengthHistogram==null);
+     assert(COUNTS==null);
+
+     int KEYSPACE=1<<(2*k);
+     COUNTS=new int[KEYSPACE];
+     maxChrom=maxChrom(maxChrom);
+
+     //Clump tallies, filed under the smaller of each key and its reverse complement
+     HashMap<Integer, LongM> cmap=new HashMap<Integer, LongM>();
+
+     for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+         Block b=index[chrom];
+         final int[] sites=b.sites;
+         final int[] starts=b.starts;
+
+         for(int key=0; key<KEYSPACE; key++){
+             long clumps=0;
+             final int start1=starts[key];
+             final int stop1=starts[key+1];
+             final int len1=stop1-start1;
+             //Add as long: an int sum could wrap negative before the clamp to Integer.MAX_VALUE takes effect
+             COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)len1);
+
+             if(REMOVE_CLUMPY){
+                 for(int i=start1+1; i<stop1; i++){
+                     int dif=sites[i]-sites[i-1];
+                     assert(dif!=0);
+                     if(dif>0 && dif<=CLUMPY_MAX_DIST){
+                         clumps++;
+                     }
+                 }
+                 if(clumps>0){
+                     final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k));
+                     final Integer ko=x;
+                     LongM lm=cmap.get(ko);
+                     if(lm==null){
+                         lm=new LongM(0);
+                         cmap.put(ko, lm);
+                     }
+                     lm.increment(clumps);
+                 }
+             }
+         }
+     }
+
+     //Give each key and its reverse complement the same combined count
+     for(int key=0; key<COUNTS.length; key++){
+         int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+         if(key<rkey){
+             int x=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)COUNTS[rkey]);
+             COUNTS[key]=COUNTS[rkey]=x;
+         }
+     }
+
+     if(REMOVE_CLUMPY){
+         Integer[] keys=cmap.keySet().toArray(new Integer[cmap.size()]);
+         Arrays.sort(keys);
+
+         //Zero the counts of long lists in which most adjacent sites are within CLUMPY_MAX_DIST
+         for(Integer key : keys){
+             long clumps=cmap.get(key).value();
+             long len=COUNTS[key];
+             if((len>CLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){
+                 int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+                 assert(key<=rkey);
+                 assert(key==KeyRing.reverseComplementKey(rkey, k));
+                 COUNTS[key]=0;
+                 COUNTS[rkey]=0;
+             }
+         }
+     }
+
+     lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2);
+
+     if(REMOVE_FREQUENT_GENOME_FRACTION){
+         int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1));
+         int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1));
+
+         MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]);
+         MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]);
+         if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);}
+     }
+
+     Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH]));
+     if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;}
+     if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);}
+     assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE;
+ }
+
+// /** Calculate statistics of index, such as list lengths, and find clumpy keys */
+// public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){
+//
+// assert(lengthHistogram==null);
+// assert(COUNTS==null);
+//
+// int KEYSPACE=1<<(2*k);
+// COUNTS=new int[KEYSPACE];
+//
+// maxChrom=maxChrom(maxChrom);
+//
+// for(int key=0; key<KEYSPACE; key++){
+// int rkey=KeyRing.reverseComplementKey(key, k, cs);
+// assert(key==KeyRing.reverseComplementKey(rkey, k));
+//
+// if(key<=rkey){
+//
+// long clumps=0;
+// long len=0;
+//
+// for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+// Block b=index[chrom];
+//
+// final int[] sites=b.sites;
+// final int start1=b.starts[key];
+// final int stop1=start1+b.length(key);
+// final int start2=(rkey==key ? -1 : b.starts[rkey]);
+// final int stop2=(rkey==key ? -1 : start2+b.length(rkey));
+// final int len1=stop1-start1;
+// final int len2=stop2-start2;
+//
+// len=len+len1+len2;
+//
+// if(REMOVE_CLUMPY){
+// for(int i=start1+1; i<stop1; i++){
+// int dif=sites[i]-sites[i-1];
+// assert(dif!=0);
+// if(dif>0 && dif<=CLUMPY_MAX_DIST){
+// clumps++;
+// }
+// }
+//
+// for(int i=start2+1; i<stop2; i++){
+// int dif=sites[i]-sites[i-1];
+// assert(dif!=0);
+// if(dif>0 && dif<=CLUMPY_MAX_DIST){
+// clumps++;
+// }
+// }
+// }
+//
+// }
+//
+// COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len);
+// if(key!=rkey){COUNTS[rkey]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[rkey]+len);}
+// assert(COUNTS[key]==COUNTS[rkey]) : key+", "+rkey;
+//
+// if(REMOVE_CLUMPY && len>CLUMPY_MIN_LENGTH_INDEX && clumps>(CLUMPY_FRACTION*len)){
+// COUNTS[key]=0;
+// COUNTS[rkey]=0;
+// for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+// Block b=index[chrom];
+// final int[] sites=b.sites;
+// sites[b.starts[key]]=-1;
+// sites[b.starts[rkey]]=-1;
+// }
+// }
+//
+//// System.err.println("COUNTS["+key+"] = "+COUNTS[key]+", COUNTS["+rkey+"] = "+COUNTS[rkey]);
+// }
+// }
+//
+// lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2);
+//
+// //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));}
+//
+// if(REMOVE_FREQUENT_GENOME_FRACTION){
+//
+// int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1));
+// int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1));
+//
+// MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]);
+// MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]);
+//
+// if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);}
+// }
+//
+// Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH]));
+// if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;}
+// if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);}
+// assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE;
+// }
+
+ /** Returns the filename for the block holding this chrom, at key length k */
+ public static final String fname(int chrom, int k){
+     final int blockFirst=minChrom(chrom), blockLast=maxChrom(chrom);
+     return IndexMaker4.fname(blockFirst, blockLast, k, NUM_CHROM_BITS);
+ }
+
+ /** Reports whether the key offsets are strictly ascending (true for arrays of length 0 or 1). */
+ private static boolean checkOffsets(int[] offsets){
+     boolean ascending=true;
+     //Walk adjacent pairs and stop at the first pair that is not increasing
+     for(int i=offsets.length-1; ascending && i>0; i--){ascending=(offsets[i-1]<offsets[i]);}
+     return ascending;
+ }
+ /** Obsolete trimming routine that nulls out hit lists until the total and average list lengths fit their limits. Its first statement is assert(false), so it fails immediately when assertions are enabled. */
+ @Deprecated
+ private final int trimExcessHitLists(int[] keys, int[][] hits){
+
+ assert(false) : "Needs to be redone because hits are no longer sorted by length.";
+
+ assert(hits.length==keys.length);
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+
+ int sum=0;
+ int initialHitCount=0;
+
+ int shortest=Integer.MAX_VALUE-1; //smallest nonzero count seen so far
+ int shortest2=Integer.MAX_VALUE; //second-smallest nonzero count seen so far
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=COUNTS[key];
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;}
+ if(shortest>limit3 && !SLOW){
+ for(int i=0; i<hits.length; i++){hits[i]=null;}
+ return 0;
+ }
+ if(sum<=limit && sum/initialHitCount<=limit2){return initialHitCount;}
+
+ Pointer[] ptrs=Pointer.loadMatrix(hits);
+// ptrs[0].value/=2;
+// ptrs[ptrs.length-1].value/=2;
+ Arrays.sort(ptrs);
+
+ int finalHitCount=initialHitCount;
+ for(int i=ptrs.length-1; sum>limit || sum/finalHitCount>limit2; i--){ //NOTE(review): nothing here stops i going below 0 or finalHitCount reaching 0 (division) - confirm before reviving
+ Pointer p=ptrs[i];
+ sum-=hits[p.key].length;
+ hits[p.key]=null;
+ finalHitCount--;
+ }
+
+ return finalHitCount;
+ }
+
+ /** Remove least useful keys to accelerate search. Dropped keys are overwritten with -1 in keys; returns the number of hit lists still in use. */
+ private final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){
+
+ float[] keyWeights=getKeyWeightArray(keyScores.length);
+ for(int i=0; i<keyScores.length; i++){
+ keyWeights[i]=keyScores[i]*INV_BASE_KEY_HIT_SCORE;
+ }
+
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+// final int limitS=lengthHistogram[chrom][MAX_SINGLE_LIST_TO_SEARCH];
+
+ int sum=0;
+ int initialHitCount=0;
+
+ int shortest=Integer.MAX_VALUE-1; //smallest nonzero count seen so far
+ int shortest2=Integer.MAX_VALUE; //second-smallest nonzero count seen so far
+
+// for(int i=0; i<hits.length; i++){
+// if(hits[i]!=null && hits[i].length>limitS){hits[i]=null;}
+// }
+
+ final int[] lengths=getGenericArray(keys.length);
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=count(key);
+ lengths[i]=x;
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;}
+ if(shortest>limit3 && !SLOW){
+ for(int i=0; i<keys.length; i++){keys[i]=-1;}
+ return 0;
+ }
+
+ int hitsCount=initialHitCount;
+ int worstValue=Integer.MIN_VALUE;
+
+ while(hitsCount>=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ //NOTE(review): the average divides by initialHitCount, not the shrinking hitsCount - confirm intended
+ final int[] lists=getGreedyListArray(hitsCount);
+ for(int i=0, j=0; j<lists.length; i++){ //collect the indices of the lists that are still nonzero
+ if(lengths[i]>0){
+ lists[j]=i;
+ j++;
+ }
+ }
+
+ Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn);
+ int worstIndex=greedyReturn[0];
+ int worst=lists[worstIndex];
+ worstValue=greedyReturn[1];
+ sum-=lengths[worst];
+
+// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]<excessListLimit)){return hitsCount;}
+ if(worstValue>0 || lengths[worst]<SMALL_GENOME_LIST){return hitsCount;} //This line increases accuracy at expense of speed. Lower constant = more accurate, default 0.
+ hitsCount--;
+ lengths[worst]=0;
+ keys[worst]=-1;
+ }
+ return hitsCount;
+ }
+
+ /** Fills starts/stops with the [start, stop) range of each key's list in the block at index[chrom], or -1/-1 when the key is unused; returns how many keys received a range. */
+ private final int getHits(final int[] keys, final int chrom, final int maxLen, final int[] starts, final int[] stops){
+     final Block block=index[chrom];
+     int found=0;
+     for(int i=0; i<keys.length; i++){
+         final int key=keys[i];
+         starts[i]=-1;
+         stops[i]=-1;
+         if(key<0){continue;}
+         //Only keys whose count() is positive and below maxLen are looked up
+         final int total=count(key);
+         if(total<=0 || total>=maxLen){continue;}
+         final int inBlock=block.length(key);
+         if(inBlock>0){
+             final int from=block.starts[key];
+             starts[i]=from;
+             stops[i]=from+inBlock;
+             found++;
+         }
+     }
+     return found;
+ }
+
+ /** Counts the keys whose list length, per count(), is positive and below maxLen.
+  *  When clearBadKeys is set, every other non-negative key is overwritten with -1. */
+ private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){
+     int usable=0;
+     for(int i=0; i<keys.length; i++){
+         final int key=keys[i];
+         if(key<0){continue;} //negative keys are ignored
+
+         final int listLen=count(key);
+         final boolean inRange=(listLen>0 && listLen<maxLen);
+         if(inRange){
+             usable++;
+         }else if(clearBadKeys){keys[i]=-1;}
+     }
+     return usable;
+ }
+
+
+ public final ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){
+     assert(minChrom<=maxChrom && minChrom>=0);
+     //First pass obeys the limits; if it finds nothing and DOUBLE_SEARCH_NO_HIT is set, search again without them
+     final ArrayList<SiteScore> limited=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id);
+     final boolean retry=DOUBLE_SEARCH_NO_HIT && (limited==null || limited.isEmpty());
+     return retry ? find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id) : limited;
+ }
+
+ /** Searches every index block from minChrom to maxChrom on both strands for sites of one read. obeyLimits selects MAX_USABLE_LENGTH over MAX_USABLE_LENGTH2 and enables the list-trimming steps. Returns a list that may be empty. */
+ public final ArrayList<SiteScore> find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){
+
+ assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN);
+ int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); //working copy; countHits overwrites rejected keys with -1
+
+ initialKeys+=offsetsP.length;
+ initialKeyIterations++;
+
+ final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2);
+
+ int numHits=0;
+ numHits=countHits(keysP, maxLen, true);
+ if(numHits>0){ //TODO: Change these to higher numbers
+ int trigger=(3*keysP.length)/4; //retry with a longer list limit while fewer than this many keys (and fewer than 6/5/4/3) are usable
+ if(numHits<6 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, (maxLen*3)/2, true);
+ }
+ if(numHits<5 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*2, true);
+ }
+ if(numHits<4 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*3, true);
+ }
+ if(numHits<3 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*5, true);
+ }
+ }
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ initialKeys2+=numHits;
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+ if(TRIM_BY_GREEDY && obeyLimits){
+ int maxLists=Tools.max((int)(HIT_FRACTION_TO_RETAIN*keysP.length), MIN_HIT_LISTS_TO_RETAIN);
+ numHits=trimExcessHitListsByGreedy(offsetsP, keyScoresP, maxLists, keysP);
+ }
+// System.out.println("After greedy: numHits = "+numHits);
+
+ if(TRIM_BY_TOTAL_SITE_COUNT && obeyLimits){
+ throw new RuntimeException("Needs to be redone.");
+// numHits=trimExcessHitLists(keys, hits);
+ }
+
+ if(TRIM_LONG_HIT_LISTS && obeyLimits && numHits>MIN_APPROX_HITS_TO_KEEP){
+ int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits);
+
+ int zeroes=keysP.length-numHits;
+ int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1));
+ cutoffIndex=Tools.max(cutoffIndex, altMinIndex);
+
+ assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits;
+
+ if(cutoffIndex<(keysP.length-1)){
+ int[] lens=getGenericArray(keysP.length);
+ for(int i=0; i<keysP.length; i++){lens[i]=count(keysP[i]);} //NOTE(review): keysP may hold -1 after greedy trimming; confirm count() tolerates that
+ Arrays.sort(lens);
+ int cutoff=lens[cutoffIndex];
+
+ cutoff=Tools.max(lengthHistogram[MIN_INDEX_TO_DROP_LONG_HIT_LIST], cutoff);
+
+ int removed=0; //NOTE(review): incremented below but never read in this method
+
+ for(int i=0; i<keysP.length; i++){
+ int key=keysP[i];
+ if(count(key)>cutoff){
+ keysP[i]=-1;
+ removed++;
+ numHits--;
+ }
+ }
+ }
+ }
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+ final ArrayList<SiteScore> result=new ArrayList<SiteScore>(8);
+ if(numHits<MIN_APPROX_HITS_TO_KEEP){return result;}
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ assert(keysP.length==numHits);
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ //Reverse the offsets for minus-strand mapping, since they are generated based on quality
+ int[] offsetsM=KeyRing.reverseOffsets(offsetsP, KEYLEN, basesP.length);
+ if(verbose){
+ System.err.println("Reversed offsets: \n"+Arrays.toString(offsetsP)+" ->\n"+Arrays.toString(offsetsM));
+ }
+ final int[] keysM=KeyRing.reverseComplementKeys(keysP, KEYLEN);
+
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+// assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM);
+
+ assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length)));
+ assert(keyScoresP!=null);
+ assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP);
+ final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1));
+ final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1));
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+ assert(offsetsM.length==offsetsP.length);
+ assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM));
+
+ /*
+ * bestScores:
+ *
+ * bestScores[0] currentTopScore
+ * bestScores[1] maxHits
+ * bestScores[2] qcutoff
+ * bestScores[3] bestqscore
+ * bestScores[4] maxQuickScore
+ * bestScores[5] perfectsFound
+ */
+ final int[] bestScores=new int[6];
+
+ //This prevents filtering by qscore when a low-quality read only uses a few keys.
+ //In that case, extending is more important.
+ final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5);
+
+ int[][] prescanResults=null;
+ int[] precounts=null;
+ int[] prescores=null;
+
+ int hitsCutoff=0;
+ int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ boolean allBasesCovered=true; //true only if the keys start at base 0, end at the last base, and leave no gap between neighbours
+ {
+ if(offsetsP[0]!=0){allBasesCovered=false;}
+ else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;}
+ else{
+ for(int i=1; i<offsetsP.length; i++){
+ if(offsetsP[i]>offsetsP[i-1]+KEYLEN){
+ allBasesCovered=false;
+ break;
+ }
+ }
+ }
+ }
+
+ //TODO I don't understand this logic
+ final boolean pretendAllBasesAreCovered=//false;
+ (allBasesCovered ||
+ keysP.length>=keysOriginal.length-4 ||
+ (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f))));
+
+// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP));
+// assert(allBasesCovered);
+
+ if(prescan_qscore){
+ prescanResults=prescanAllBlocks(bestScores,
+ keysP, keyScoresP, offsetsP,
+ keysM, keyScoresM, offsetsM,
+ pretendAllBasesAreCovered);
+
+ if(prescanResults!=null){
+ precounts=prescanResults[0];
+ prescores=prescanResults[1];
+ }
+
+ if(bestScores[1]<MIN_APPROX_HITS_TO_KEEP){return result;}
+ if(bestScores[3]<maxQuickScore*MIN_QSCORE_MULT2){return result;} //if(bestScores[3]<maxQuickScore(offsetsP, keyScoresP)*.10f){return result;}
+
+ if(bestScores[3]>=maxQuickScore && pretendAllBasesAreCovered){
+ assert(bestScores[3]==maxQuickScore);
+ assert(bestScores[1]==numHits);
+
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }else{
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH));
+ }
+ }
+
+ final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true);
+ final boolean fullyDefined=AminoAcid.isFullyDefined(basesP);
+ assert(bestScores[2]<=0) : Arrays.toString(bestScores); //Note - I am not sure what this assertion does, or if it is valid for acc
+
+ int cycle=0; //index into precounts/prescores: two cycles per block, plus strand first, then minus
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS,
+ offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human
+ }
+ cycle++;
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS,
+ offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human
+ }
+ cycle++;
+ }
+
+ assert(Read.CHECKSITES(result, basesP, basesM, id, false)); //TODO: Comment out once checked
+
+ return result;
+ }
+
+ /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. Returns the shared prescanReturn with [0]=per-cycle hit counts and [1]=per-cycle quick scores, and raises bestScores[1] (max hits) and bestScores[3] (best qscore). */
+ private final int[][] prescanAllBlocks(int[] bestScores,
+ int[] keysP, int[] keyScoresP, int[] offsetsP,
+ int[] keysM, int[] keyScoresM, int[] offsetsM,
+ final boolean allBasesCovered){
+
+ int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; //index 0 = plus strand, 1 = minus strand
+
+ int bestqscore=0;
+ int maxHits=0;
+ int minHitsToScore=MIN_APPROX_HITS_TO_KEEP;
+
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+ final int[] counts=precountArray;
+ final int[] scores=prescoreArray;
+ final int[][] ret=prescanReturn;
+ Arrays.fill(counts, keysP.length); //start every cycle at the maximum; presumably so cycles left unscanned by the early return are not skipped later
+ Arrays.fill(scores, maxQuickScore);
+ ret[0]=counts;
+ ret[1]=scores;
+
+ int cycle=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ final int baseChrom=baseChrom(chrom);
+ for(int pmi=0; pmi<2; pmi++, cycle++){
+
+ int[] keys=pm[pmi][0];
+ int[] keyScores=pm[pmi][1];
+ int[] offsets=pm[pmi][2];
+// int[][] hits=getHitArray(offsets.length);
+
+ int[] starts=startArray;
+ int[] stops=stopArray;
+ int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops);
+
+ if(numHits<minHitsToScore){
+ scores[cycle]=-9999;
+ counts[cycle]=0;
+ }else{
+
+// final int maxQuickScore=maxQuickScore(offsets, keyScores);
+ // System.err.println("maxScore = "+maxScore);
+
+ if(numHits<keys.length){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4]; //shrink() stores the key scores at index 4
+ }
+ }
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+ heap.clear();
+ final Quad[] triples=tripleStorage;
+ final int[] values=valueArray;
+
+ int[] temp=findMaxQscore2(starts, stops, offsets, keyScores, baseChrom, triples, values, minHitsToScore, true,
+ bestqscore>=maxQuickScore && allBasesCovered);
+
+ scores[cycle]=temp[0]; //temp[0] is stored as this cycle's score and temp[1] as its count
+ counts[cycle]=temp[1];
+
+ bestqscore=Tools.max(temp[0], bestqscore);
+ maxHits=Tools.max(maxHits, temp[1]);
+ if(bestqscore>=maxQuickScore && allBasesCovered){
+ assert(bestqscore==maxQuickScore);
+ assert(maxHits==keysP.length) :
+ "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" +
+ "Scores: \t"+Arrays.toString(scores)+
+ "Counts: \t"+Arrays.toString(counts)+
+ "bestqscore: \t"+bestqscore+
+ "maxHits: \t"+maxHits+
+ "maxQuickScore: \t"+maxQuickScore+
+ "numHits: \t"+numHits+
+ "minHitsToScore: \t"+minHitsToScore+
+ "keys.length: \t"+keys.length;
+
+ minHitsToScore=Tools.max(minHitsToScore, maxHits);
+
+ {
+ //This early exit is optional. Does not seem to impact speed much either way.
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+ return ret;
+ }
+ }
+ }
+ }
+ }
+
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ret;
+ }
+
+
+ /** Search a single block and strand.
+  *  Returns ssl unchanged when fewer than MIN_APPROX_HITS_TO_KEEP keys have sites in this block;
+  *  otherwise returns whatever slowWalk3 (or slowWalk2 when USE_SLOWALK3 is off) returns. */
+ public final ArrayList<SiteScore> find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores,
+         final int chrom, final byte strand,
+         int[] offsets, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores,
+         final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+
+     assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+     //Per-key ranges in this block: keyStarts[i] is the first location, keyStops[i] is (last location)+1
+     final int[] keyStarts=startArray;
+     final int[] keyStops=stopArray;
+     final int hitCount=getHits(keys, chrom, Integer.MAX_VALUE, keyStarts, keyStops);
+     if(hitCount<MIN_APPROX_HITS_TO_KEEP){return ssl;}
+
+     if(!USE_SLOWALK3){
+         return slowWalk2(keyStarts, keyStops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, fullyDefined);
+     }
+
+     //bestScores is zeroed before each walk unless scores are being retained
+     if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);}
+     return slowWalk3(keyStarts, keyStops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, bestScores,
+             allBasesCovered, maxScore, fullyDefined);
+ }
+
+ /**
+  * Compacts the per-key arrays so that only entries with a non-negative start remain.
+  * Returns null when the number of such entries equals offsets.length (nothing to remove);
+  * otherwise returns the shared shrinkReturn3 holder with slots 0, 1, 2 and 4 set to the
+  * compacted starts, stops, offsets and keyScores.
+  */
+ private final int[][] shrink(int[] starts, int[] stops, int[] offsets, int[] keyScores, final int len){
+     int kept=0;
+     for(int idx=0; idx<len; idx++){
+         if(starts[idx]>=0){kept++;}
+     }
+
+     if(kept==offsets.length){return null;}
+
+     final int[][] packed=shrinkReturn3;
+     final int[] keptStarts=startArray;
+     final int[] keptStops=stopArray;
+     final int[] keptOffsets=getOffsetArray(kept);
+     final int[] keptScores=new int[kept];
+
+     //The write index never passes the read index, so compaction stays correct even if the
+     //destination arrays are the same objects as the inputs.
+     int dst=0;
+     for(int src=0; src<len; src++){
+         if(starts[src]<0){continue;}
+         keptStarts[dst]=starts[src];
+         keptStops[dst]=stops[src];
+         keptOffsets[dst]=offsets[src];
+         keptScores[dst]=keyScores[src];
+         dst++;
+     }
+
+     packed[0]=keptStarts;
+     packed[1]=keptStops;
+     packed[2]=keptOffsets;
+     packed[4]=keptScores;
+     return packed;
+ }
+
+ /**
+  * Drops every negative key together with its offset and key score.
+  * Returns null when no key is negative; otherwise returns the shared shrinkReturn2 holder
+  * with slots 0, 1 and 2 set to the compacted offsets, keys and keyScores.
+  */
+ private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){
+
+     final int total=keys.length;
+     int valid=0;
+     for(int src=0; src<total; src++){
+         valid+=(keys[src]>=0 ? 1 : 0);
+     }
+
+     assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+     if(valid==total){return null;}
+
+     final int[][] packed=shrinkReturn2;
+     final int[] keptOffsets=getOffsetArray(valid);
+     assert(keptOffsets!=offsets);
+     assert(keptOffsets.length<offsets.length);
+     final int[] keptKeys=new int[valid];
+     final int[] keptScores=new int[valid];
+
+     int dst=0;
+     for(int src=0; src<total; src++){
+         if(keys[src]<0){continue;}
+         keptOffsets[dst]=offsets[src];
+         keptKeys[dst]=keys[src];
+         keptScores[dst]=keyScores[src];
+         dst++;
+     }
+
+     assert(checkOffsets(keptOffsets)) : "\nnumHits="+valid+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(keptOffsets)+"\n"+
+         "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keptKeys)+"\n";
+
+     packed[0]=keptOffsets;
+     packed[1]=keptKeys;
+     packed[2]=keptScores;
+     return packed;
+ }
+
+
+ /**
+ * Walks the hit lists of all key columns of one block and collects candidate sites.
+ * This uses a heap to track next column to increment: one Quad per column holds that column's
+ * current (offset-adjusted) site, and values[] mirrors those sites by column.
+ * For each site taken from the heap, the columns whose current site lies within
+ * [site-MAX_INDEL, site+MAX_INDEL2] are counted; if that count reaches approxHitsCutoff the site is
+ * scored (extendScore when USE_EXTENDED_SCORE, otherwise quickScore plus optional scoreZ2) and,
+ * if the score reaches the cutoff, either merged into the previous SiteScore or appended to ssl.
+ *
+ * @param starts per-column index of the first hit in the block's site list
+ * @param stops per-column exclusive end index in the block's site list
+ * @param bases read bases; bases.length is used as the read length
+ * @param baseScores passed through to maxScore and extendScore
+ * @param keyScores per-column key scores, passed to maxScore and quickScore
+ * @param offsets per-column offset subtracted from each hit location
+ * @param baseChrom_ chromosome identifier, converted with baseChrom() to select the index block
+ * @param strand strand stored in created SiteScores and compared when merging
+ * @param obeyLimits selects which multiplier is used to derive the minimum score
+ * @param ssl list to append to; a new list is created when null
+ * @param fullyDefined required (with score==maxScore) for a site to be flagged perfect
+ * @return ssl, or the newly created list
+ */
+ private final ArrayList<SiteScore> slowWalk2(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, final boolean fullyDefined){
+
+ if(SHRINK_BEFORE_WALK){ //Remove columns with a negative start before walking
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); //ADD_SCORE_Z is only permitted together with SHRINK_BEFORE_WALK
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true);
+// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets));
+// System.err.println("maxScore = "+maxScore);
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); //Initial score cutoff; the non-obeyLimits case uses a 1.25x multiplier
+// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f));
+// final int minScore=(int)(MIN_SCORE_MULT*maxScore);
+// System.err.println("minScore = "+minScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+
+ heap.clear();
+ final Quad[] triples=tripleStorage; //Preallocated Quads, one per column (asserted below via t.column)
+
+ final Block b=index[baseChrom];
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); //null when USE_EXTENDED_SCORE is off
+ //Seed the heap with the first hit of every column, shifted back by that column's offset.
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){ //Masked site part is at least the offset: subtract directly
+ a2=a-offsets[i];
+ }else{ //Otherwise decode, clamp the site at 0, and re-encode with the same chrom
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=-999999999;
+
+ int cutoff=minScore; //Only raised, and only when DYNAMICALLY_TRIM_LOW_SCORES is set
+
+ int maxHits=0;
+ int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println();
+
+ SiteScore prevSS=null; //Most recently added SiteScore; candidate for merging with the next site
+ while(!heap.isEmpty()){
+ Quad t=heap.peek(); //Next candidate; presumably the lowest site if heap is a min-heap on site - TODO confirm
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+ int approxHits=0;
+
+ {//Inner loop: count columns whose current site falls inside the window around 'site'
+ final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ //'chances' stops the scan once approxHitsCutoff can no longer be reached
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ boolean locArrayValid=false; //Set after extendScore below; not otherwise read in this method
+ if(approxHits>=approxHitsCutoff){
+
+ int score;
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ if(USE_EXTENDED_SCORE){
+ final int chrom=numberToChrom(site, baseChrom);
+ if(verbose){
+ System.err.println(new String(bases));
+ System.err.println("numHits="+numHits+", approxHits="+approxHits+/*", keys="+numKeys+*/", centerIndex="+centerIndex);
+ System.err.println("Extending "+Arrays.toString(values));
+ }
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ locArrayValid=true;
+ if(true/*USE_AFFINE_SCORE*/){
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){ //Entries of -1 or less are ignored
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){ //True when locArray held no non-negative entry (max stays Integer.MIN_VALUE)
+ //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries
+ if(!Shared.anomaly){ //Report only once
+ Shared.anomaly=true;
+ System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+
+ chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+
+ Arrays.toString(locArray)+"\n"+
+ Arrays.toString(values)+"\n"+
+ new String(bases)+"\nstrand="+strand+"\n");
+ System.err.println();
+ }
+ score=-99999; //Sentinel score; note mapStart/mapStop are still recomputed from min/max below
+ }
+
+
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+
+
+// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+
+// if(chrom==17 && absdif(min, 30354420)<2000){
+// System.err.println("\n*****\n");
+// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+
+// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+// System.err.println();
+// System.err.println(Arrays.toString(locArray));
+// System.err.println();
+// System.err.println("chrom="+chrom);
+// System.err.println("score="+score);
+// }
+ }
+ }else{
+ score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ score+=scoreZ;
+ }
+ }
+
+ if(score>=cutoff){
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){ //Tighten hit-count and score cutoffs based on the new best
+
+ maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+
+ // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH));
+ if(USE_EXTENDED_SCORE && score>=maxScore){
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart); //Decoded start of the candidate site
+ final int site3=numberToSite(mapStop)+bases.length-1; //Decoded stop: last start position plus read length minus 1
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ //Note: I could also do this as soon as score is calculated.
+// if(ADD_SCORE_Z){
+// int scoreZ=scoreZ2(values, centerIndex, offsets);
+// score+=scoreZ;
+// }
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+// SiteScore prevSS=(ssl.size()<1 ? null : ssl.get(ssl.size()-1));
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){ //Span is at least MINGAP longer than the read
+ gapArray=makeGapArray(locArray, mapStart, MINGAP); //NOTE(review): locArray is null here when USE_EXTENDED_SCORE is off - confirm makeGapArray tolerates that
+ if(gapArray!=null){
+ int sub=site2-mapStart;//thus site2=mapStart+sub
+ for(int i=0; i<gapArray.length; i++){
+ gapArray[i]+=sub;
+ }
+ assert(gapArray[0]==mapStart) : gapArray[0]+", "+mapStart;
+ assert(gapArray[gapArray.length-1]==mapStop);
+ }
+ assert(false) : Arrays.toString(locArray); //NOTE(review): unconditional; with assertions enabled this branch always throws - gapped sites look unsupported in this method, confirm
+ }
+
+ //This code comes from slowWalk3
+// int[] gapArray=null;
+// if(site3-site2>=MINGAP+bases.length){
+// assert(locArrayValid) : "Loc array was not filled.";
+// gapArray=makeGapArray(locArray, site2, MINGAP);
+// if(gapArray!=null){
+// gapArray[0]=Tools.min(gapArray[0], site2);
+// gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3);
+// }
+// if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));}
+// }
+//
+
+ if(gapArray==null && prevSS!=null && prevSS.gaps==null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+ //Ungapped site overlapping the previous one on the same chrom and strand: try to merge instead of adding
+ int betterScore=Tools.max(score, prevSS.score);
+ int minStart=Tools.min(prevSS.start, site2);
+ int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){ //Identical bounds: keep the better score
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2){ //Same start: adjust the stop
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStop(maxStop);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3){ //Same stop: adjust the start
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStart(minStart);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2){
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else{ //No merge rule applied: create a separate site
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ ss.gaps=gapArray;
+ if(verbose && gapArray!=null){
+ System.err.println(ss.toText()+"\t"+Arrays.toString(gapArray)+"\n"+Arrays.toString(locArray)+"\n");
+ }
+ }
+
+ if(ss!=null){ //A new site was created (not merged into prevSS)
+// System.out.println("Added site "+ss.toText());
+ ssl.add(ss);
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+
+// if(prevSS!=null && prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+// int betterScore=Tools.max(score, prevSS.score);
+// if(prevSS.start==site2 && prevSS.stop==site3){
+// prevSS.score=prevSS.quickScore=betterScore;
+// }else if(prevSS.start==site2
+// /*isWithin(prevSS.start, prevSS.stop, site2, site3) ||
+// isWithin(site2, site3, prevSS.start, prevSS.stop)*/){
+// prevSS.score=prevSS.quickScore=betterScore;
+// assert(prevSS.start<prevSS.stop);
+//// prevSS.start=Tools.min(prevSS.start, site2);
+// prevSS.stop=Tools.max(prevSS.stop, site3);
+// assert(prevSS.start<prevSS.stop);
+// }else{
+// SiteScore ss=new SiteScore(chrom, strand, site2, site3, score);
+// ssl.add(ss);
+// prevSS=ss;
+// }
+// }else{
+// SiteScore ss=new SiteScore(chrom, strand, site2, site3, score);
+// ssl.add(ss);
+// prevSS=ss;
+// }
+
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){ //Column still has hits: advance it and re-insert; otherwise it leaves the heap for good
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2; //Keep values[] in sync with the Quad (asserted in the inner loop)
+ heap.add(t2);
+ }
+ if(heap.isEmpty()){break;} //Guards the heap.peek() dereference in the loop condition
+ }
+
+ }
+
+ return ssl;
+ }
+
+ /** This uses a heap to track next column to increment */
+ private final ArrayList<SiteScore> slowWalk3(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl,
+ int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+ assert(USE_EXTENDED_SCORE);
+
+ final int numKeys=offsets.length; //Before shrink
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ usedKeys+=numHits;
+ usedKeyIterations++;
+
+ final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+ final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+ heap.clear();
+
+ final Quad[] triples=tripleStorage;
+
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+ final Block b=index[baseChrom];
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=bestScores[0];
+ int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH));
+
+ int qcutoff=Tools.max(bestScores[2], minQuickScore);
+ int bestqscore=bestScores[3];
+ int maxHits=bestScores[1];
+ int perfectsFound=bestScores[5];
+ assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits+", "+new String(bases);
+ int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore);
+ if(approxHitsCutoff>numHits){return ssl;}
+
+ final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore);
+
+
+ assert(USE_EXTENDED_SCORE);
+
+ if(currentTopScore>=maxScore){
+ assert(currentTopScore==maxScore);
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }
+
+
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff);
+
+
+ SiteScore prevSS=null;
+ while(!heap.isEmpty()){
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+
+ {//Inner loop
+ final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int score;
+ int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ assert(USE_EXTENDED_SCORE);
+
+ boolean locArrayValid=false;
+ if(qscore<qcutoff){
+ score=-1;
+ }else{
+
+ final int chrom=numberToChrom(site, baseChrom);
+
+ //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results).
+ //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run.
+ //May have something to do with compiler loop optimizations.
+ if(shortCircuit && qscore==maxQuickScore){
+ assert(approxHits==numKeys);
+ score=maxScore;
+ }else{
+ if(verbose){
+ System.err.println("numHits="+numHits+", approxHits="+approxHits+", keys="+numKeys+", centerIndex="+centerIndex+
+ ", qscore="+qscore+", qcutoff="+qcutoff+", filter_by_qscore="+filter_by_qscore);
+ System.err.println("Extending "+Arrays.toString(values));
+ }
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ locArrayValid=true;
+
+ if(verbose){
+ System.err.println("score: "+score);
+ System.err.println("locArray: "+Arrays.toString(locArray));
+ }
+
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+ if(score>=maxScore){
+ assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+
+ // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+
+ "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+
+ "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+
+ "numHits="+approxHits+", approxHits="+approxHits+"\n"+
+ "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+
+ "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+
+ "locArray:\t"+Arrays.toString(locArray)+"\n"+
+ "values:\t"+Arrays.toString(values)+"\n"+
+ "bases:\t"+new String(bases));
+ System.err.println();
+ assert(false);
+ }
+ score=-99999;
+ }
+
+ //mapStart and mapStop are indices
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+ if(score>=maxScore){
+ assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+ }
+
+ if(score==maxScore){
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true);
+ }
+
+ if(score>=cutoff){
+ qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH));
+ bestqscore=Tools.max(qscore, bestqscore);
+ }
+ }
+
+ if(score>=cutoff){
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore);
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+ if(score>=maxScore){
+ assert(USE_EXTENDED_SCORE);
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ assert(locArrayValid) : "Loc array was not filled.";
+// System.err.println("****\n"+Arrays.toString(locArray)+"\n");
+// int[] clone=locArray.clone();
+ gapArray=makeGapArray(locArray, site2, MINGAP);
+ if(gapArray!=null){
+// System.err.println(Arrays.toString(locArray)+"\n");
+// System.err.println(Arrays.toString(gapArray));
+//
+//// int sub=site2-mapStart;//thus site2=mapStart+sub
+//// for(int i=0; i<gapArray.length; i++){
+//// gapArray[i]+=sub;
+//// }
+//// System.err.println(Arrays.toString(gapArray));
+//
+// System.err.println(mapStart+" -> "+site2);
+// System.err.println(mapStop+" -> "+site3);
+
+ assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length);
+ assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+
+ mapStart+" -> "+site2+"\n"+
+ mapStop+" -> "+site3+"\n\n"+
+ Arrays.toString(gapArray)+"\n\n"+
+// Arrays.toString(clone)+"\n\n"+
+ Arrays.toString(locArray)+"\n"+
+ "numHits="+numHits+", "+
+ "heap.size="+heap.size()+", "+
+ "numHits="+numHits+", "+
+ "approxHits="+approxHits+"\n";
+ gapArray[0]=Tools.min(gapArray[0], site2);
+ gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3);
+ }
+ if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));}
+// assert(false) : Arrays.toString(locArray);
+ }
+
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+ final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]);
+// if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);}
+
+ if(verbose){
+ System.err.println("Considering site chr"+chrom+", site2="+site2+", site3="+site3+", mapStart="+mapStart+", mapStop="+mapStop);
+ if(!inbounds){System.err.println("Index tossed out-of-bounds site.");}
+ assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop;
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+
+ if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ if(verbose){System.err.println("Considering overlapping site chr"+chrom+", "+site2+"-"+site3);}
+
+ final int betterScore=Tools.max(score, prevSS.score);
+ final int minStart=Tools.min(prevSS.start, site2);
+ final int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ if(verbose){System.err.println("Class 1: Same bounds as last site.");}
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.perfect=(prevSS.perfect || perfect1 || perfect2);
+ if(prevSS.perfect){prevSS.semiperfect=true;}
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){
+ if(verbose){System.err.println("Class 2: Same start as last site.");}
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStop(maxStop);
+ prevSS.setPerfect(bases);
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){
+ if(verbose){System.err.println("Class 3: Same stop as last site.");}
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStart(minStart);
+ prevSS.setPerfect(bases);
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2 && !prevSS.semiperfect){
+ if(verbose){System.err.println("Class 4.");}
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.setPerfect(bases);
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ }else{
+ if(verbose){System.err.println("Class 5: Making new site");}
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+// assert(Read.CHECKSITE(ss, bases));
+ if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else if(inbounds){
+ if(verbose){System.err.println("Considering new site chr"+chrom+", "+site2+"-"+site3);}
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+// assert(Read.CHECKSITE(ss, bases));
+ ss.gaps=gapArray;
+ if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ assert(ss==null || !ss.perfect || ss.semiperfect) : ss;
+ assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS;
+ if(ss!=null && (SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect)){ss=null;}
+
+
+ if(ss!=null){
+// System.out.println("Added site "+ss.toText()+", qscore="+qscore);
+ ssl.add(ss);
+ if(ss.perfect){
+
+ if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){
+ if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));}
+ perfectsFound++;
+
+ //Human-specific code
+// if(QUIT_AFTER_TWO_PERFECTS){
+// if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;}
+// }
+
+ if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;}
+ }
+ }
+
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+ //assert((ss==null || !ss.semiperfect) && (prevSS==null || !prevSS.semiperfect)) : (ss==null ? false : ss.semiperfect)+", "+(prevSS==null ? false : prevSS.semiperfect); //***
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(heap.size()<approxHitsCutoff || PERFECTMODE){
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound; //***$ fixed by adding this line
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+ if(heap.isEmpty()){
+ assert(false) : heap.size()+", "+approxHitsCutoff;
+ break;
+ }
+ }
+
+ }
+
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound;
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+
+
+ /**
+  * Pre-scan of one chromosome block: merges the per-key site lists through the shared heap and
+  * reports the best quick score seen, without creating any SiteScore objects.
+  *
+  * @param starts Per-key first row in the block's site array.
+  * @param stops Per-key end row (exclusive; a list is consumed while row is below stops[col]).
+  * @param offsets Per-key amount subtracted from each listed site
+  *        (presumably the key's position in the read - TODO confirm against caller).
+  * @param keyScores Per-key scores, passed through to quickScore and maxQuickScore.
+  * @param baseChrom_ A chromosome in the target block; reduced via baseChrom().
+  * @param triples Preallocated Quads, one per key; triples[i].column must equal i.
+  * @param values Scratch array kept in sync with triples[i].site for every key.
+  * @param prevMaxHits Lower bound on the approximate-hit cutoff (unused when perfectOnly is set).
+  * @param earlyExit If true, return as soon as the maximum quick score is reached, or when a list
+  *        runs out and either perfectOnly is set or fewer lists remain than the current cutoff.
+  * @param perfectOnly If true, every key must hit (cutoff=numHits) with zero leftward slack.
+  * @return A new two-element array {topQscore, maxHits}; topQscore stays at -999999999 and maxHits
+  *         at 0 if no site reaches the cutoff.
+  */
+ private final int[] findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores,
+ final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits,
+ boolean earlyExit, boolean perfectOnly){
+
+ final int numHits=offsets.length;
+ assert(numHits>=prevMaxHits);
+
+ final int baseChrom=baseChrom(baseChrom_);
+ final Block b=index[baseChrom];
+ final int[] sizes=sizeArray;
+
+ //Seed the heap with the first (offset-adjusted) site of every key's list.
+ heap.clear();
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ //Subtracting would borrow from the chromosome bits, so clamp the site at 0 and re-encode.
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ int topQscore=-999999999;
+
+ int maxHits=0;
+// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+
+ int approxHitsCutoff;
+ final int indelCutoff;
+ if(perfectOnly){
+ approxHitsCutoff=numHits;
+ indelCutoff=0;
+ }else{
+ approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy
+ indelCutoff=MAX_INDEL2;
+ }
+
+
+ //Each pass examines the heap's head site (the loop below treats equal sites as adjacent,
+ //so QuadHeap presumably orders by site - TODO confirm).
+ while(!heap.isEmpty()){
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+ {//Inner loop
+ //Count keys whose current site lies within the indel window around this site;
+ //'chances' stops the count once the cutoff can no longer be reached.
+ final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ if(qscore>topQscore){
+
+// maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan
+
+ //A new best score raises the cutoff to one below this site's hit count (never lowers it).
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan
+
+ topQscore=qscore;
+
+ if(qscore>=maxQuickScore){
+ assert(qscore==maxQuickScore);
+ assert(approxHits==numHits);
+ if(earlyExit){
+ return new int[] {topQscore, maxHits};
+ }
+ }
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ //Same clamp-and-re-encode path as in the seeding loop.
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(earlyExit && (perfectOnly || heap.size()<approxHitsCutoff)){
+ //This key's list is exhausted; when the early-exit condition holds, stop scanning.
+ return new int[] {topQscore, maxHits};
+ }
+ //Guards the heap.peek() in the loop condition against an empty heap.
+ if(heap.isEmpty()){break;}
+ }
+
+ }
+
+
+
+ return new int[] {topQscore, maxHits};
+ }
+
+
+ /** Returns the absolute difference of a and b (plain int arithmetic, no overflow guard). */
+ private static final int absdif(int a, int b){
+     if(a>b){return a-b;}
+     return b-a;
+ }
+
+
+ /**
+  * Returns the highest score a read could receive under the active scoring scheme.
+  * Affine scoring defers to msa.maxQuality; extended scoring uses readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5),
+  * plus the sum of baseScores when useQuality is set; otherwise falls back to maxQuickScore.
+  */
+ final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){
+
+     if(USE_AFFINE_SCORE){
+         //The quality-aware form apparently MUST be used if quality is used later on for slow align.
+         return useQuality ? msa.maxQuality(baseScores) : msa.maxQuality(readlen);
+     }
+
+     if(USE_EXTENDED_SCORE){
+         final int perfectBases=readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);
+         return useQuality ? perfectBases+Tools.sumInt(baseScores) : perfectBases;
+     }
+
+     return maxQuickScore(offsets, keyScores);
+ }
+
+
+ /**
+  * Upper bound on the quick score: the sum of all key scores, plus Y_SCORE_MULT times the span
+  * between the first and last offset, plus maxScoreZ(offsets) when ADD_SCORE_Z is enabled.
+  */
+ public final int maxQuickScore(int[] offsets, int[] keyScores){
+
+     int keyTotal=Tools.intSum(keyScores);
+
+     final int lastIndex=offsets.length-1;
+     final int spanTerm=Y_SCORE_MULT*(offsets[lastIndex]-offsets[0]);
+
+     if(ADD_SCORE_Z){
+         keyTotal+=maxScoreZ(offsets);
+     }
+
+     return keyTotal+spanTerm;
+ }
+
+
+ /**
+  * Quick score for the candidate site anchored at key centerIndex.
+  * The anchor key's score is combined with scoreLeft and scoreRight, then centerIndex is subtracted
+  * as a tie-breaker: given otherwise identical match patterns (for example, a small indel yields two
+  * valid site candidates), the lower site wins. Y_SCORE_MULT*scoreY and, if ADD_LIST_SIZE_BONUS is
+  * set, the anchor's list-size bonus are added on top.
+  * Also increments the hist_hits_score histogram bucket for numApproxHits.
+  */
+ private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[],
+         int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){
+
+     hist_hits_score[Tools.min(HIT_HIST_LEN, numApproxHits)]++;
+
+     final int anchorScore=keyScores[centerIndex];
+     if(numApproxHits==1){
+         return anchorScore; //A lone key has nothing to align against
+     }
+
+     final int leftScore=scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels);
+     final int rightScore=scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits);
+     int keyTerm=anchorScore+leftScore+rightScore-centerIndex;
+
+     final int spanTerm=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets);
+     if(ADD_LIST_SIZE_BONUS){
+         keyTerm+=calcListSizeBonus(sizes[centerIndex]);
+     }
+     return keyTerm+spanTerm;
+ }
+
+
+// /** Generates a term that increases score with how many bases in the read match the ref. */
+// private static final int scoreZ(int[] locs, int centerIndex, int offsets[]){
+// final int center=locs[centerIndex];
+//
+// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE];
+//
+// final int maxLoc=center+MAX_INDEL2;
+// final int minLoc=Tools.max(0, center-MAX_INDEL);
+//
+// int score=0;
+//
+// for(int i=0; i<locs.length; i++){
+// int loc=locs[i];
+//// int dif=absdif(loc, center);
+// if(loc>=minLoc && loc<=maxLoc){
+//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+
+//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets);
+//
+// int offset=offsets[i];
+// int max=CHUNKSIZE+offset;
+//
+// for(int j=offset; j<max; j++){
+// int old=refLoc[j];
+// if(old==0){
+// refLoc[j]=loc;
+// score+=4;
+// }else if(old>loc){
+// refLoc[j]=loc;
+// score-=2;
+// }else if(old==loc){
+// score-=1;
+// //do nothing, perhaps, or add 1?
+// }else{
+// score-=2;
+// assert(old<loc);
+// }
+// }
+// }
+// }
+// return score;
+// }
+
+
+
+ /**
+  * Scores a candidate site by extending each nearby key hit base-by-base against the reference.
+  * locArray is overwritten: entry i receives the reference start implied by the key that covers
+  * read base i, -1 if no key covers it, or -2 if the read base is 'N'.
+  * Increments callsToExtendScore and the hist_hits_extend histogram.
+  *
+  * @param bases Read bases.
+  * @param baseScores Per-base score added for every covered base (non-affine path) or passed to msa.
+  * @param offsets Per-key start position within the read.
+  * @param values Per-key encoded site; only keys within [centerVal-MAX_INDEL, centerVal+MAX_INDEL2] are used.
+  * @param chrom Chromosome whose reference array is compared against.
+  * @param centerIndex Index of the key that defines the candidate site.
+  * @param locArray Scratch output, indexed by read position (filled over its whole length).
+  * @param numHits Number of keys to examine.
+  * @param numApproxHits Used only for the histogram bucket.
+  * @return The affine score from msa when USE_AFFINE_SCORE is set, otherwise the extended score.
+  */
+ private final int extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values,
+ final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits){
+ callsToExtendScore++;
+ hist_hits_extend[Tools.min(HIT_HIST_LEN, numApproxHits)]++;
+
+ final int centerVal=values[centerIndex];
+ final int centerLoc=numberToSite(centerVal);
+
+ final int minLoc=Tools.max(0, centerLoc-MAX_INDEL); //Legacy, for assertions
+ final int maxLoc=centerLoc+MAX_INDEL2; //Legacy, for assertions
+
+ final int minVal=centerVal-MAX_INDEL;
+ final int maxVal=centerVal+MAX_INDEL2;
+
+ final byte[] ref=Data.getChromosome(chrom).array;
+
+ if(verbose){
+ System.err.println("\n");
+ System.err.println("minLoc="+minLoc+", maxLoc="+ maxLoc+", centerIndex="+centerIndex+", centerVal="+centerVal+", centerLoc="+centerLoc);
+ System.err.println("minVal="+minVal+", maxVal="+ maxVal+", numHits="+numHits+", numApproxHits="+numApproxHits);
+ System.err.println("offsets:\t"+Arrays.toString(offsets));
+ System.err.println("values:\t"+Arrays.toString(values));
+ System.err.println();
+ int centerOffset=offsets[centerIndex];
+
+ for(int i=0; i<centerOffset; i++){System.err.print(" ");}
+ for(int i=centerOffset; i<centerOffset+KEYLEN; i++){System.err.print((char)bases[i]);}
+ System.err.println();
+
+ System.err.println(new String(bases));
+ System.err.println(new String(Arrays.copyOfRange(ref, centerLoc, centerLoc+bases.length)));
+ System.err.println();
+ }
+
+// int[] locArray=new int[bases.length];
+ Arrays.fill(locArray, -1);
+
+
+// if(verbose){
+// System.err.println("Reverse fill:");
+// }
+
+ //First fill in reverse
+ //For each in-window key, walk from the key's last base toward the read start, comparing
+ //read base cloc with reference base refbase+cloc.
+ for(int i=0, keynum=0; i<numHits; i++){
+ final int value=values[i];
+// if(verbose){System.err.println("values["+i+"]="+value);}
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+// if(verbose){System.err.println("refbase="+refbase);}
+ assert(refbase>=minLoc && refbase<=maxLoc);
+
+ // System.err.println("Reverse: Trying key "+refbase+" @ "+offsets[i]);
+ // System.err.println("Passed!");
+ keynum++;
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc<ref.length; cloc--, rloc--){
+ int old=locArray[cloc];
+ if(old==refbase){
+// if(verbose){System.err.println("Broke because old="+old+", refbase="+refbase);}
+ break;
+ } //Already filled with present value
+ if(misses>0 && old>=0){
+// if(verbose){System.err.println("Broke because old="+old+", misses="+misses);}
+ break;
+ } //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ //Only extends first key all the way back. Others stop at the first error.
+ if(old>=0 || keynum>1){
+// if(verbose){System.err.println("Broke because old="+old+", keynum="+keynum);}
+ break;
+ }
+ }
+ }
+ }
+ }
+
+// if(verbose){
+// System.err.println("locArray:\t"+Arrays.toString(locArray));
+// System.err.println("Forward fill:");
+// }
+
+ //Then fill forward
+ //Starts just past each key (the key's own bases were handled by the reverse pass) and keeps
+ //extending through mismatches until it reaches a cell that is already filled.
+ for(int i=0; i<numHits; i++){
+ final int value=values[i];
+// if(verbose){System.err.println("values["+i+"]="+value);}
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+// if(verbose){System.err.println("refbase="+refbase);}
+ assert(refbase>=minLoc && refbase<=maxLoc);
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN, rloc=refbase+cloc; cloc<bases.length && rloc<ref.length; cloc++, rloc++){
+ int old=locArray[cloc];
+ if(old==refbase){break;} //Already filled with present value
+ if(misses>0 && old>=0){break;} //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ if(old>=0){break;} //Already filled with something that has no errors
+ }
+ }
+ }
+ }
+
+ //Try to subsume out-of-order locs where higher numbers come before lower numbers. Made things worse.
+// for(int i=1; i<locArray.length; i++){
+// final int loc=locArray[i];
+// final int last=locArray[i-1];
+// if(loc<last && last>-1){
+// final byte c=bases[i];
+//// final int rloc1=loc+i;
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(r2==c){
+// locArray[i]=last;
+// }
+// }
+// }
+
+// //Change 'N' to -2. A bit slow.
+// {
+// int firstMatch=0;
+// while(firstMatch<locArray.length && locArray[firstMatch]<0){firstMatch++;}
+// assert(firstMatch<locArray.length) : new String(bases);
+// int last=locArray[firstMatch];
+// for(int i=firstMatch-1; i>=0; i--){
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else{
+// assert(locArray[i]==-1);
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// for(int i=firstMatch; i<locArray.length; i++){
+// final int loc=locArray[i];
+// if(last<1){last=loc;}
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else if(loc==-1 && last>0){
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// }
+
+ //Change 'N' to -2, but only for nocalls, not norefs. Much faster.
+ {
+ final byte nb=(byte)'N';
+ for(int i=0; i<bases.length; i++){
+ if(bases[i]==nb){locArray[i]=-2;}
+ }
+ }
+
+//
+// {
+// int last=locArray[0];
+// for(int i=1; i<locArray.length; i++){
+// final int loc=locArray[i];
+// if(loc>0){
+// if(last<1){last=loc;}
+// }else{
+// if(last>0){
+// final byte c=bases[i];
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(c=='N'){
+//
+// }else if(c==r2){
+// locArray[i]=last;
+// }
+// }
+// last=-1;
+// }
+// }
+// }
+// {
+// int last=locArray[locArray.length-1];
+// for(int i=locArray.length-2; i>=0; i--){
+// final int loc=locArray[i];
+// if(loc>0){
+// if(last<1){last=loc;}
+// }else{
+// if(last>0){
+// final byte c=bases[i];
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(c=='N'){
+//
+// }else if(c==r2){
+// locArray[i]=last;
+// }
+// }
+// last=-1;
+// }
+// }
+// }
+
+// for(int i=locArray.length-2; i>=0; i--){
+// final int loc=locArray[i];
+// final int last=locArray[i+1];
+// if(loc>last && last>-1){
+// final byte c=bases[i];
+//// final int rloc1=loc+i;
+// final int rloc2=last+i;
+// byte r2=ref[rloc2];
+// if(r2==c){
+// locArray[i]=last;
+// }
+// }
+// }
+
+ if(verbose){
+// System.err.println("locArray:\t"+Arrays.toString(locArray));
+
+ //Sanity check (verbose runs only): no base of the center key may still be unfilled (-1).
+ int centerOffset=offsets[centerIndex];
+ int lim=centerOffset+KEYLEN;
+ for(int i=centerOffset; i<lim; i++){
+ assert(locArray[i]!=-1) : " ( "+centerOffset+" < "+i+" < "+lim+" ) "+
+ "\nlocArray: "+Arrays.toString(locArray)+
+ "\nvalues: "+Arrays.toString(values)+
+ "\noffsets: "+Arrays.toString(offsets);
+ }
+ }
+
+ if(USE_AFFINE_SCORE){
+ /* TODO - sometimes returns a higher score than actual alignment. This should never happen. */
+ int score=(KFILTER<2 ? msa.calcAffineScore(locArray, baseScores, bases) :
+ msa.calcAffineScore(locArray, baseScores, bases, KFILTER));
+ return score;
+ }
+
+ //Non-affine path: every covered base earns BASE_HIT_SCORE plus its base score, bases that map
+ //to the center site earn a bonus, and each change of implied site costs a capped penalty.
+ int score=0;
+ int lastLoc=-1;
+ int centerBonus=BASE_HIT_SCORE/5;
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+ if(loc>=0){
+ score+=BASE_HIT_SCORE+baseScores[i];
+ if(loc==centerLoc){score+=centerBonus;}
+ if(loc!=lastLoc && lastLoc>=0){
+ int dif=absdif(loc, lastLoc);
+ int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT);
+ score-=penalty;
+ }
+ lastLoc=loc;
+ }
+ }
+
+// System.err.println("Extended score: "+score);
+// System.err.println(Arrays.toString(locArray));
+
+
+ return score;
+ }
+
+
+ /** NOTE! This destroys the locArray, so use a copy if needed. */
+ private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){
+ int gaps=0;
+ boolean doSort=false;
+
+ if(locArray[0]<0){locArray[0]=minLoc;}
+ for(int i=1; i<locArray.length; i++){
+ if(locArray[i]<0){locArray[i]=locArray[i-1]+1;}
+ else{locArray[i]+=i;}
+ if(locArray[i]<locArray[i-1]){doSort=true;}
+ }
+
+// System.err.println(Arrays.toString(locArray)+"\n");
+
+ if(doSort){
+// System.err.println("*");
+ Arrays.sort(locArray);
+ }
+// System.err.println(Arrays.toString(locArray)+"\n");
+
+ for(int i=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ gaps++;
+ }
+ }
+ if(gaps<1){return null;}
+ int[] out=new int[2+gaps*2];
+ out[0]=locArray[0];
+ out[out.length-1]=locArray[locArray.length-1];
+
+ for(int i=1, j=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ out[j]=locArray[i-1];
+ out[j+1]=locArray[i];
+ j+=2;
+ }
+ }
+ return out;
+ }
+
+
+ /**
+  * Generates a term that increases score with how many bases in the read match the ref.
+  * Sums the lengths of the merged read intervals [offset, offset+KEYLEN) over all keys whose
+  * location is within [max(0, center-MAX_INDEL), center+MAX_INDEL2], times Z_SCORE_MULT.
+  * A single approximate hit short-circuits to SCOREZ_1KEY.
+  */
+ private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){
+
+     if(numApproxHits==1){return SCOREZ_1KEY;}
+
+     final int anchor=locs[centerIndex];
+     final int upper=anchor+MAX_INDEL2;
+     final int lower=Tools.max(0, anchor-MAX_INDEL);
+
+     int covered=0;
+     int spanStart=-1, spanEnd=-1; //Current merged interval; both -1 until the first key is seen
+
+     for(int i=0; i<numHits; i++){
+         final int where=locs[i];
+         if(where<lower || where>upper){continue;}
+
+         final int keyStart=offsets[i];
+         if(spanEnd<keyStart){
+             //Disjoint from the running interval: bank it and open a new one
+             covered+=spanEnd-spanStart;
+             spanStart=keyStart;
+         }
+         spanEnd=keyStart+KEYLEN;
+     }
+     covered+=spanEnd-spanStart;
+     return covered*Z_SCORE_MULT;
+ }
+
+ /**
+  * This was just to verify scoreZ2.
+  * Marks every read position covered by an in-window key in a byte array and counts the
+  * positions marked for the first time; returns that count times Z_SCORE_MULT.
+  */
+ @Deprecated
+ private final int scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display, int numHits){
+     final int anchor=locs[centerIndex];
+     final int upper=anchor+MAX_INDEL2;
+     final int lower=Tools.max(0, anchor-MAX_INDEL);
+
+     final byte[] covered=new byte[offsets[offsets.length-1]+KEYLEN];
+     int count=0;
+
+     for(int i=0; i<numHits; i++){
+         final int where=locs[i];
+         if(where<lower || where>upper){continue;}
+
+         final int from=offsets[i];
+         final int to=from+KEYLEN;
+         for(int pos=from; pos<to; pos++){
+             if(covered[pos]==0){count++;}
+             covered[pos]=1;
+         }
+     }
+
+     if(display){System.err.println("\n"+Arrays.toString(covered)+"\n");}
+
+     return count*Z_SCORE_MULT;
+ }
+
+ /**
+  * Largest value scoreZ2 can produce for these offsets: the total length of the merged
+  * intervals [offset, offset+KEYLEN) over every key, times Z_SCORE_MULT.
+  */
+ private final int maxScoreZ(int offsets[]){
+     int covered=0;
+     int spanStart=-1, spanEnd=-1;
+
+     for(final int keyStart : offsets){
+         if(spanEnd<keyStart){
+             covered+=spanEnd-spanStart;
+             spanStart=keyStart;
+         }
+         spanEnd=keyStart+KEYLEN;
+     }
+     covered+=spanEnd-spanStart;
+     return covered*Z_SCORE_MULT;
+ }
+
+
+ /**
+  * Sums the contribution of the keys to the right of centerIndex.
+  * Walks outward from the center; a key counts only if its location is non-negative and within
+  * MAX_INDEL of the last accepted location, and an accepted key becomes the new reference point.
+  * With penalizeIndels set, any nonzero shift costs a penalty capped at MAX_PENALTY_FOR_MISALIGNED_HIT.
+  */
+ private final int scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits){
+
+     int total=0;
+     int anchor=locs[centerIndex];
+
+     for(int i=centerIndex+1; i<numHits; i++){
+         final int candidate=locs[i];
+         if(candidate<0){continue;}
+
+         final int shift=absdif(candidate, anchor);
+         if(shift>MAX_INDEL){continue;} //Too far away; keep the previous anchor
+
+         anchor=candidate;
+         total+=keyScores[i];
+         if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[i]);}
+
+         if(penalizeIndels && shift!=0){
+             total-=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*shift, MAX_PENALTY_FOR_MISALIGNED_HIT);
+         }
+     }
+     return total;
+ }
+
+ /**
+  * Sums the contribution of the keys to the left of centerIndex; the mirror image of scoreRight.
+  * Also increments the callsToScore counter once per call.
+  */
+ private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){
+
+     callsToScore++;
+
+     int total=0;
+     int anchor=locs[centerIndex];
+
+     for(int i=centerIndex-1; i>=0; i--){
+         final int candidate=locs[i];
+         if(candidate<0){continue;}
+
+         final int shift=absdif(candidate, anchor);
+         if(shift>MAX_INDEL){continue;} //Too far away; keep the previous anchor
+
+         anchor=candidate;
+         total+=keyScores[i];
+         if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[i]);}
+
+         if(penalizeIndels && shift!=0){
+             total-=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*shift, MAX_PENALTY_FOR_MISALIGNED_HIT);
+         }
+     }
+     return total;
+ }
+
+ /**
+  * Encode a (location, chrom) pair to an index: the low chromosome bits are shifted left by
+  * SHIFT_LENGTH and OR'd with the site.
+  */
+ private static final int toNumber(int site, int chrom){
+     final int chromField=(chrom&CHROM_MASK_LOW)<<SHIFT_LENGTH;
+     return chromField|site;
+ }
+
+ /**
+  * Decode an (index, baseChrom) pair to a chromosome: the bits above SHIFT_LENGTH give the
+  * chromosome's position within its block, which is added to the block's base chromosome.
+  * baseChrom must be non-negative with its low (within-block) bits clear.
+  */
+ private static final int numberToChrom(int number, int baseChrom){
+     assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     final int withinBlock=number>>>SHIFT_LENGTH;
+     return withinBlock+(baseChrom&CHROM_MASK_HIGH);
+ }
+
+ /** Decode an index to a location by keeping only the SITE_MASK bits. */
+ private static final int numberToSite(int number){
+     final int site=number&SITE_MASK;
+     return site;
+ }
+
+ private static final int minChrom(int chrom){return Tools.max(MINCHROM, chrom&CHROM_MASK_HIGH);}
+ private static final int baseChrom(int chrom){return Tools.max(0, chrom&CHROM_MASK_HIGH);}
+ private static final int maxChrom(int chrom){return Tools.max(MINCHROM, Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW));}
+
+
+ /**
+  * Returns an int array of exactly len elements for holding key offsets.
+  * Arrays are cached per length in offsetArrays and handed out again on later calls, so the
+  * contents are whatever the previous user left behind. Lengths beyond the cache get a fresh,
+  * uncached array (same policy as getLocArray) instead of an ArrayIndexOutOfBoundsException.
+  */
+ private final int[] getOffsetArray(int len){
+     if(len>=offsetArrays.length){return new int[len];}
+     if(offsetArrays[len]==null){offsetArrays[len]=new int[len];}
+     return offsetArrays[len];
+ }
+ private final int[] getLocArray(int len){
+ if(len>=locArrays.length){return new int[len];}
+ if(locArrays[len]==null){locArrays[len]=new int[len];}
+ return locArrays[len];
+ }
+ /**
+  * Returns an int array of exactly len elements from the greedyListArrays cache.
+  * Cached arrays are reused across calls and not cleared. Lengths beyond the cache get a fresh,
+  * uncached array (same policy as getLocArray) instead of an ArrayIndexOutOfBoundsException.
+  */
+ private final int[] getGreedyListArray(int len){
+     if(len>=greedyListArrays.length){return new int[len];}
+     if(greedyListArrays[len]==null){greedyListArrays[len]=new int[len];}
+     return greedyListArrays[len];
+ }
+ /**
+  * Returns an int array of exactly len elements from the genericArrays cache.
+  * Cached arrays are reused across calls and not cleared. Lengths beyond the cache get a fresh,
+  * uncached array (same policy as getLocArray) instead of an ArrayIndexOutOfBoundsException.
+  */
+ private final int[] getGenericArray(int len){
+     if(len>=genericArrays.length){return new int[len];}
+     if(genericArrays[len]==null){genericArrays[len]=new int[len];}
+     return genericArrays[len];
+ }
+
+ final byte[] getBaseScoreArray(int len, int strand){
+ if(len>=baseScoreArrays[0].length){return new byte[len];}
+ if(baseScoreArrays[strand][len]==null){baseScoreArrays[strand][len]=new byte[len];}
+ return baseScoreArrays[strand][len];
+ }
+ /**
+  * Returns an int array of exactly len elements for the given strand index.
+  * Lengths that fit in the per-strand cache are reused across calls (contents are not cleared);
+  * longer requests get a fresh array each time.
+  */
+ final int[] getKeyScoreArray(int len, int strand){
+     //Compare against the per-strand cache size. The outer dimension of keyScoreArrays is the
+     //strand count (2), so testing keyScoreArrays.length sent every len>=2 down the allocation
+     //path and the cache was never used. getBaseScoreArray tests the inner length the same way.
+     if(len>=keyScoreArrays[strand].length){return new int[len];}
+     if(keyScoreArrays[strand][len]==null){keyScoreArrays[strand][len]=new int[len];}
+     return keyScoreArrays[strand][len];
+ }
+ private final float[] getKeyWeightArray(int len){
+ if(len>=keyWeightArrays.length){return new float[len];}
+ if(keyWeightArrays[len]==null){keyWeightArrays[len]=new float[len];}
+ return keyWeightArrays[len];
+ }
+ /** Exposes this index's shared keyProbArray buffer (the array itself, not a copy). */
+ @Override
+ float[] keyProbArray() {
+     return this.keyProbArray;
+ }
+
+
+ //Per-instance scratch buffers. Those handed out by the get*Array methods are cached by
+ //requested length and reused across calls without being cleared.
+
+ /** Cache behind getLocArray; slot [len] holds an array of length len, for len up to 600. */
+ private final int[][] locArrays=new int[601][];
+ private final int[] valueArray=new int[256];
+ /** Used by findMaxQscore2 as its per-key list-size buffer. */
+ private final int[] sizeArray=new int[256];
+ /** Cache behind getOffsetArray. */
+ private final int[][] offsetArrays=new int[256][];
+ /** Cache behind getGreedyListArray. */
+ private final int[][] greedyListArrays=new int[256][];
+ /** Cache behind getGenericArray. */
+ private final int[][] genericArrays=new int[256][];
+ private final int[] startArray=new int[256];
+ private final int[] stopArray=new int[256];
+ /** One preallocated Quad per slot; makeQuadStorage constructs element i as new Quad(i, 0, 0). */
+ private final Quad[] tripleStorage=makeQuadStorage(256);
+ private final int[] greedyReturn=new int[2];
+ private final int[][] shrinkReturn2=new int[3][];
+ private final int[][] shrinkReturn3=new int[5][];
+ private final int[][] prescanReturn=new int[2][];
+ //No initializer here, so these two are assigned outside this section (final fields).
+ private final int[] prescoreArray;
+ private final int[] precountArray;
+
+ /** Cache behind getBaseScoreArray, indexed [strand][len]. */
+ private final byte[][][] baseScoreArrays=new byte[2][601][];
+ /** Cache behind getKeyScoreArray, indexed [strand][len]. */
+ private final int[][][] keyScoreArrays=new int[2][256][];
+ /** Returned directly by keyProbArray(). */
+ final float[] keyProbArray=new float[601];
+ /** Cache behind getKeyWeightArray. */
+ private final float[][] keyWeightArrays=new float[256][];
+
+
+ /** Allocates an array of 'number' Quads, constructing element i as new Quad(i, 0, 0). */
+ private final Quad[] makeQuadStorage(int number){
+     final Quad[] storage=new Quad[number];
+     int slot=0;
+     while(slot<number){
+         storage[slot]=new Quad(slot, 0, 0);
+         slot++;
+     }
+     return storage;
+ }
+
+
+ /** Heap shared by the site-list merge loops (255 is presumably its capacity - TODO confirm in QuadHeap). */
+ private final QuadHeap heap=new QuadHeap(255);
+
+ //Layout of an encoded index entry: the within-block chromosome number sits above SHIFT_LENGTH,
+ //the site below it (see toNumber). All five statics are recomputed by setChromBits.
+
+ /** Number of low bits that hold the site; toNumber shifts the chromosome left by this amount. */
+ static int SHIFT_LENGTH=(32-1-NUM_CHROM_BITS);
+ /** A value with the low SHIFT_LENGTH bits set, i.e. the largest value that fits in the site field. */
+ static int MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH);
+
+ /** Mask the number to get the site, which is in the lower bits */
+ static int SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1));
+
+ /** Mask the chromosome's high bits to get the low bits */
+ static int CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+
+ /** Mask the chromosome's lower bits to get the high bits */
+ static int CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+ /**
+  * Sets the number of chromosome bits per index entry and recomputes every derived static
+  * (CHROMS_PER_BLOCK, SHIFT_LENGTH, MAX_ALLOWED_CHROM_INDEX, SITE_MASK and both chromosome masks),
+  * then sanity-checks the results with assertions.
+  */
+ static void setChromBits(int x){
+
+     NUM_CHROM_BITS=x;
+     CHROMS_PER_BLOCK=1<<x;
+     SHIFT_LENGTH=31-x;
+     MAX_ALLOWED_CHROM_INDEX=~(-1<<SHIFT_LENGTH);
+     SITE_MASK=-1>>>(x+1);
+     CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+     CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+     assert(NUM_CHROM_BITS>=0); //max is 3 for human; perhaps more for other organisms
+     assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMS_PER_BLOCK;
+     assert(Integer.bitCount(CHROMS_PER_BLOCK)==1);
+     assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS+1)) : Integer.toHexString(SITE_MASK);
+ }
+
+ private final int cycles;
+
+ /** Score per matching base in extendScore; also the basis of the extended-score maximum in maxScore. */
+ public static final int BASE_HIT_SCORE=100;
+ public static final int ALIGN_COLUMNS=3000;
+ //NOTE(review): the inline "default 400" does not match the 16000 assigned here - confirm which is intended.
+ public static int MAX_INDEL=16000; //Max indel length, min 0, default 400; longer is more accurate
+ //Computed once at class initialization; later writes to MAX_INDEL do not update it
+ //(setSemiperfectMode and setPerfectMode assign both explicitly).
+ public static int MAX_INDEL2=2*MAX_INDEL;
+
+ //The following finals have no initializer, so they are assigned outside this section.
+ private final float INV_BASE_KEY_HIT_SCORE;
+ private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1
+ private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length
+ /** Cap applied to the per-hit indel penalty in scoreLeft, scoreRight and extendScore. */
+ private final int MAX_PENALTY_FOR_MISALIGNED_HIT;
+ /** Value returned by scoreZ2 when there is exactly one approximate hit. */
+ private final int SCOREZ_1KEY;
+
+ public static final boolean GENERATE_KEY_SCORES_FROM_QUALITY=true; //True: Much faster and more accurate.
+ public static final boolean GENERATE_BASE_SCORES_FROM_QUALITY=true; //True: Faster, and at least as accurate.
+ public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed
+ /** Multiplier applied to the covered-base count in scoreZ2 and maxScoreZ. */
+ public static final int Z_SCORE_MULT=20;
+ /** Multiplier applied to the scoreY term in quickScore and to the offset span in maxQuickScore. */
+ public static final int Y_SCORE_MULT=10;
+
+
+ /**
+  * Return only sites that match completely or with partial no-reference.
+  * Turns on SEMIPERFECTMODE, turns off PRESCAN_QSCORE and forbids indels by zeroing
+  * MAX_INDEL and MAX_INDEL2. Must not be combined with perfect mode.
+  */
+ public static void setSemiperfectMode() {
+     assert(!PERFECTMODE);
+
+     MAX_INDEL=0;
+     MAX_INDEL2=0;
+
+     PRESCAN_QSCORE=false;
+     SEMIPERFECTMODE=true;
+ }
+
+ /**
+  * Return only sites that match completely.
+  * Turns on PERFECTMODE, turns off PRESCAN_QSCORE and forbids indels by zeroing
+  * MAX_INDEL and MAX_INDEL2. Must not be combined with semiperfect mode.
+  */
+ public static void setPerfectMode() {
+     assert(!SEMIPERFECTMODE);
+
+     MAX_INDEL=0;
+     MAX_INDEL2=0;
+
+     PRESCAN_QSCORE=false;
+     PERFECTMODE=true;
+ }
+
+ //Drives the five list-length thresholds recomputed in setFractionToExclude.
+ //NOTE(review): the inline "Default .04" does not match the 0.003f assigned here - confirm which is intended.
+ static float FRACTION_GENOME_TO_EXCLUDE=0.003f; //Default .04; lower is slower and more accurate
+
+ /**
+  * Sets FRACTION_GENOME_TO_EXCLUDE and recomputes the five list-length thresholds derived from it.
+  * Each threshold is (int)(1000*(1-c*f)) for its own constant c.
+  *
+  * @param f Fraction in the range [0, 1).
+  */
+ public static final void setFractionToExclude(float f){
+     assert(f>=0 && f<1);
+     FRACTION_GENOME_TO_EXCLUDE=f;
+
+     final double frac=FRACTION_GENOME_TO_EXCLUDE; //Widened once; the original expressions widen per use
+     MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*frac)); //default 810
+     MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*frac)); //lower is faster, default 840
+     MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*frac)); //default 910
+     MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*frac)); //default 935
+     MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*frac)); //Default 860
+ }
+
+
+ /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */
+ //NOTE(review): the doc comment says default .75 while the inline note says .85; the value in effect is 0.85f.
+ static final float HIT_FRACTION_TO_RETAIN=0.85f; //default: .85
+ /** Range: 0 to 1000. Lower should be faster and less accurate. */
+ static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810
+ /** Range: 2 to infinity. Lower should be faster and less accurate. */
+ static final int MIN_HIT_LISTS_TO_RETAIN=8;
+
+ //The next four thresholds use the same formula as setFractionToExclude, which recomputes them.
+ //NOTE(review): the "default NNN" notes do not match what the formulas yield with the 0.003f
+ //initial fraction (for example 1000*(1-2.3*0.003) is about 993) - confirm.
+ static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840
+ //lower is faster
+ static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910
+ //lower is faster
+ static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935
+ //lower is faster
+ static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860
+
+ /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */
+ public static final int SMALL_GENOME_LIST=80;
+
+ //Class-load sanity check: the two trimming strategies are mutually exclusive.
+ static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";}
+
+ static final int CLUMPY_MAX_DIST=4; //Keys repeating over intervals of this or less are clumpy.
+
+ /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. */
+ //NOTE(review): 6000 lies outside the documented 0 to 1000 range - confirm whether this is meant to disable the check.
+ static final int CLUMPY_MIN_LENGTH_INDEX=6000;
+ static final float CLUMPY_FRACTION=0.8f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy.
+
+ //Captured once at class initialization; later changes to MAX_INDEL2 do not affect it.
+ static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */
+ public static final int MAX_HITS_REDUCTION1=3;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */
+ public static int MAX_HITS_REDUCTION2=3;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */
+ public static int MAX_HITS_REDUCTION_PERFECT=5;
+
+ /** Together with keys/8, sets the upper limit on the reduction in calcApproxHitsCutoff. */
+ public static int MAXIMUM_MAX_HITS_REDUCTION=7;
+ /** Divisor applied to the hit count when calcApproxHitsCutoff derives the reduction. */
+ public static int HIT_REDUCTION_DIV=4;
+
+ private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$
+ assert(keys>=hits) : keys+", "+hits;
+ assert(hits>=0);
+
+ int mahtk=MIN_APPROX_HITS_TO_KEEP;
+ if(SEMIPERFECTMODE || PERFECTMODE){
+ if(keys==1){return 1;}
+ else if(MIN_APPROX_HITS_TO_KEEP<keys){
+ mahtk++;
+ if(currentCutoff==MIN_APPROX_HITS_TO_KEEP){currentCutoff++;}
+ }
+ }
+
+ int reduction=Tools.min(Tools.max((hits)/HIT_REDUCTION_DIV, MAX_HITS_REDUCTION2), Tools.max(MAXIMUM_MAX_HITS_REDUCTION, keys/8));
+ assert(reduction>=0);
+ int r=hits-reduction;
+
+ r=Tools.max(mahtk, currentCutoff, r);
+
+ if(perfect){
+ r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT);
+ }
+ return r;
+ }
+
+ //Both flags below evaluate to false as written, because of the leading "false &&".
+ public static final boolean USE_SLOWALK3=false && USE_EXTENDED_SCORE;
+ //Not final: setSemiperfectMode and setPerfectMode also force this to false.
+ public static boolean PRESCAN_QSCORE=false && USE_EXTENDED_SCORE; //Decrease quality and increase speed
+ public static final boolean FILTER_BY_QSCORE=false; //Slightly lower quality, but very fast.
+ public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.08f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT=0.01f; //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT2=0.1f;
+ static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.5f : USE_EXTENDED_SCORE ? .84f : 0.6f); //Default .85f; lower is more accurate
+ static final float DYNAMIC_QSCORE_THRESH=0.40f; //default .58f
+ static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.6f; //***$
+ //Derived: 80% of DYNAMIC_QSCORE_THRESH.
+ static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.8f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false
+ //Class-load sanity check: both score multipliers must lie in [0, 1).
+ static{
+ assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1);
+ assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1);
+ }
+
+
+}
diff --git a/current/align2/BBIndexPacBio.java b/current/align2/BBIndexPacBio.java
new file mode 100755
index 0000000..07f4365
--- /dev/null
+++ b/current/align2/BBIndexPacBio.java
@@ -0,0 +1,2603 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+
+/**
+ * Based on Index11f
+ *
+ *
+ *
+ *
+ * @author Brian Bushnell
+ * @date Jul 11, 2012
+ *
+ */
+public final class BBIndexPacBio extends AbstractIndex {
+
+
+ public static void main(String[] args){
+     //Builds the index for the configured genome build and chrom range, asking IndexMaker4 to write it to disk.
+     int keyLength=12;
+
+     //Only "name=value" arguments are interpreted; anything without '=' is skipped.
+     for(final String rawArg : args){
+         final String arg=rawArg.toLowerCase();
+         if(!arg.contains("=")){continue;}
+
+         final String[] pair=arg.split("=");
+         final String name=pair[0];
+         final String value=pair[1];
+
+         if("build".equals(name) || "b".equals(name)){
+             Data.setGenome(Integer.parseInt(value));
+         }else if("minchrom".equals(name)){
+             MINCHROM=Integer.parseInt(value);
+         }else if("maxchrom".equals(name)){
+             MAXCHROM=Integer.parseInt(value);
+         }else if("keylen".equals(name) || "k".equals(name)){
+             keyLength=Integer.parseInt(value);
+         }
+     }
+
+     //A value of -1 is replaced by a default: 1 for the lower bound, Data.numChroms for the upper bound.
+     if(MINCHROM==-1){MINCHROM=1;}
+     if(MAXCHROM==-1){
+         assert(Data.numChroms<=Byte.MAX_VALUE) : "TODO";
+         MAXCHROM=Data.numChroms;
+     }
+
+     final String banner="Writing build "+Data.GENOME_BUILD+" "+"BASESPACE index, keylen="+keyLength+", chrom bits="+NUM_CHROM_BITS;
+     System.err.println(banner);
+
+     final int firstChrom=(NUM_CHROM_BITS==0 ? 1 : 0);
+     Data.sysout.println("Loading index for chunk "+firstChrom+"-"+MAXCHROM+", build "+Data.GENOME_BUILD);
+
+     //The two boolean arguments sit in the positions loadIndex names writeToDisk and diskInvalid.
+     index=IndexMaker4.makeIndex(Data.GENOME_BUILD, firstChrom, MAXCHROM,
+             keyLength, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, true, false, index);
+
+     System.err.println("Finished all chroms, may still be writing.");
+ }
+
+
+ public BBIndexPacBio(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_){
+     super(k_, kfilter_, BASE_HIT_SCORE, minChrom_, maxChrom_, msa_);
+
+     //Scoring parameters derived from BASE_KEY_HIT_SCORE and KEYLEN.
+     INV_BASE_KEY_HIT_SCORE=1f/BASE_KEY_HIT_SCORE;
+     INDEL_PENALTY=(BASE_KEY_HIT_SCORE/8)-1; //default (HIT_SCORE/2)-1
+     INDEL_PENALTY_MULT=25; //default 20; penalty for indel length
+     MAX_PENALTY_FOR_MISALIGNED_HIT=BASE_KEY_HIT_SCORE-(1+BASE_KEY_HIT_SCORE/8);
+     SCOREZ_1KEY=Z_SCORE_MULT*KEYLEN;
+
+     //Two cycles are counted for every block stepped through between minChrom and maxChrom.
+     int strandPasses=0;
+     int chrom=minChrom;
+     while(chrom<=maxChrom){
+         strandPasses+=2;
+         chrom=(chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK;
+     }
+     cycles=strandPasses;
+
+     //One slot per cycle in each prescan array.
+     prescoreArray=new int[cycles];
+     precountArray=new int[cycles];
+ }
+
+ /** Loads or generates the index covering minChrom through maxChrom (inclusive) at key length k.
+ * The requested range is first clamped to [1, Data.numChroms]; it may span multiple blocks.
+ * Should only be called once in a process. */
+ public static final synchronized void loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid){
+     final int firstChrom=Math.max(minChrom, 1);
+     final int lastChrom=Math.min(maxChrom, Data.numChroms);
+     assert(firstChrom<=lastChrom);
+
+     final String notice="Loading index for chunk "+firstChrom+"-"+lastChrom+", build "+Data.GENOME_BUILD;
+     Data.sysout.println(notice);
+
+     index=IndexMaker4.makeIndex(Data.GENOME_BUILD, firstChrom, lastChrom,
+             k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index);
+ }
+
+ /** Calculate statistics of index, such as list lengths, and find clumpy keys */
+ public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){
+ assert(lengthHistogram==null);
+ assert(COUNTS==null);
+
+ int KEYSPACE=1<<(2*k);
+ COUNTS=new int[KEYSPACE];
+ maxChrom=maxChrom(maxChrom);
+
+ HashMap<Integer, LongM> cmap=new HashMap<Integer, LongM>();
+
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ Block b=index[chrom];
+ final int[] sites=b.sites;
+ final int[] starts=b.starts;
+
+ for(int key=0; key<KEYSPACE; key++){
+
+ long clumps=0;
+
+ final int start1=starts[key];
+ final int stop1=starts[key+1];
+ final int len1=stop1-start1;
+ COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len1);
+
+ if(REMOVE_CLUMPY){
+ for(int i=start1+1; i<stop1; i++){
+ int dif=sites[i]-sites[i-1];
+ assert(dif!=0);
+ if(dif>0 && dif<=CLUMPY_MAX_DIST){
+ clumps++;
+ }
+ }
+ if(clumps>0){
+ final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k));
+ final Integer ko=x;
+ LongM lm=cmap.get(ko);
+ if(lm==null){
+ lm=new LongM(0);
+ cmap.put(ko, lm);
+ }
+ lm.increment(clumps);
+ }
+ }
+ }
+ }
+
+ for(int key=0; key<COUNTS.length; key++){
+ int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+ if(key<rkey){
+ int x=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)COUNTS[rkey]);
+ COUNTS[key]=COUNTS[rkey]=x;
+ }
+ }
+
+ if(REMOVE_CLUMPY){
+ Integer[] keys=cmap.keySet().toArray(new Integer[cmap.size()]);
+ Arrays.sort(keys);
+
+ for(Integer key : keys){
+ long clumps=cmap.get(key).value();
+ long len=COUNTS[key];
+ if((len>CLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){
+ int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+ assert(key<=rkey);
+ assert(key==KeyRing.reverseComplementKey(rkey, k));
+ COUNTS[key]=0;
+ COUNTS[rkey]=0;
+ }
+ }
+ }
+
+ lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2);
+
+ //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));}
+
+ if(REMOVE_FREQUENT_GENOME_FRACTION){
+
+ int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1));
+ int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1));
+
+ MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]);
+ MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]);
+
+ if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);}
+ }
+
+ Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH]));
+ if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;}
+ if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);}
+ assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE;
+ }
+
+
+ /** Returns the index filename, for key length k, of the block holding the given chrom. */
+ public static final String fname(int chrom, int k){
+     final int blockFirst=minChrom(chrom);
+     final int blockLast=maxChrom(chrom);
+     return IndexMaker4.fname(blockFirst, blockLast, k, NUM_CHROM_BITS);
+ }
+
+ /** Returns true if every offset is strictly greater than the one before it (trivially true for 0 or 1 offsets). */
+ private static boolean checkOffsets(int[] offsets){
+     int idx=1;
+     //Advance while each element exceeds its predecessor.
+     while(idx<offsets.length && offsets[idx]>offsets[idx-1]){
+         idx++;
+     }
+     //Stopping before the end means an equal or descending pair was found.
+     return idx>=offsets.length;
+ }
+
+ @Deprecated
+ private final int trimExcessHitLists(int[] keys, int[][] hits){
+ //Legacy trimmer, disabled: the assert below fails whenever assertions are enabled. As written it nulls out entries of hits[] until the summed and mean COUNTS of the keys fit histogram-derived limits, and returns the number of lists kept.
+ assert(false) : "Needs to be redone because hits are no longer sorted by length.";
+
+ assert(hits.length==keys.length);
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+ //limit caps the summed list length (scaled by key count), limit2 the mean list length, limit3 the shortest list.
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+
+ int sum=0;
+ int initialHitCount=0;
+ //shortest and shortest2 track the smallest and second-smallest nonzero COUNTS among the keys.
+ int shortest=Integer.MAX_VALUE-1;
+ int shortest2=Integer.MAX_VALUE;
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=COUNTS[key];
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} //Too few nonempty lists to trim
+ if(shortest>limit3 && !SLOW){ //Even the shortest list exceeds limit3: discard every list
+ for(int i=0; i<hits.length; i++){hits[i]=null;}
+ return 0;
+ }
+ if(sum<=limit && sum/initialHitCount<=limit2){return initialHitCount;} //Already within both limits
+ //Over a limit: sort Pointers built from hits (ordering is defined by Pointer, presumably by list length - confirm) and drop lists from the end.
+ Pointer[] ptrs=Pointer.loadMatrix(hits);
+// ptrs[0].value/=2;
+// ptrs[ptrs.length-1].value/=2;
+ Arrays.sort(ptrs);
+ //NOTE(review): the loop below has no i>=0 or finalHitCount>0 guard; confirm it cannot index below 0 or divide by zero.
+ int finalHitCount=initialHitCount;
+ for(int i=ptrs.length-1; sum>limit || sum/finalHitCount>limit2; i--){
+ Pointer p=ptrs[i];
+ sum-=hits[p.key].length;
+ hits[p.key]=null;
+ finalHitCount--;
+ }
+
+ return finalHitCount;
+ }
+
+ /** Remove least useful keys to accelerate search. Dropped keys are overwritten with -1 in keys[]; returns the number of hit lists still retained (0 when every key is discarded). */
+ public final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){
+ //Per-key weights: each key score multiplied by INV_BASE_KEY_HIT_SCORE.
+ float[] keyWeights=getKeyWeightArray(keyScores.length);
+ for(int i=0; i<keyScores.length; i++){
+ keyWeights[i]=keyScores[i]*INV_BASE_KEY_HIT_SCORE;
+ }
+
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+ //limit caps the summed list length (scaled by key count), limit2 the mean list length, limit3 the shortest list.
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+// final int limitS=lengthHistogram[chrom][MAX_SINGLE_LIST_TO_SEARCH];
+
+ int sum=0;
+ int initialHitCount=0;
+ //shortest and shortest2 track the smallest and second-smallest nonzero list lengths.
+ int shortest=Integer.MAX_VALUE-1;
+ int shortest2=Integer.MAX_VALUE;
+
+// for(int i=0; i<hits.length; i++){
+// if(hits[i]!=null && hits[i].length>limitS){hits[i]=null;}
+// }
+
+ final int[] lengths=getGenericArray(keys.length);
+ //Record each key's list length from count(), tallying the total and the number of nonempty lists.
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=count(key);
+ lengths[i]=x;
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} //Too few nonempty lists to trim; keys are left untouched
+ if(shortest>limit3 && !SLOW){ //Even the shortest list exceeds limit3: discard every key
+ for(int i=0; i<keys.length; i++){keys[i]=-1;}
+ return 0;
+ }
+
+ int hitsCount=initialHitCount;
+ int worstValue=Integer.MIN_VALUE;
+ //Drop one list per pass while any limit is exceeded. NOTE(review): the mean-length test divides by initialHitCount, not the shrinking hitsCount - confirm this is intended.
+ while(hitsCount>=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){
+ final int[] lists=getGreedyListArray(hitsCount);
+ for(int i=0, j=0; j<lists.length; i++){ //Collect the indices of keys whose lists are still active (length>0); assumes lists.length equals the number of active lists
+ if(lengths[i]>0){
+ lists[j]=i;
+ j++;
+ }
+ }
+ //findWorstGreedy reports through greedyReturn: [0] is an index into lists, [1] is that list's value.
+ Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn);
+ int worstIndex=greedyReturn[0];
+ int worst=lists[worstIndex];
+ worstValue=greedyReturn[1];
+ sum-=lengths[worst];
+
+// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]<excessListLimit)){return hitsCount;}
+ if(worstValue>0 || lengths[worst]<SMALL_GENOME_LIST){return hitsCount;} //This line increases accuracy at expense of speed. Lower constant = more accurate, default 0.
+ hitsCount--;
+ lengths[worst]=0;
+ keys[worst]=-1;
+ }
+ return hitsCount;
+ }
+
+
+ private final int getHits(final int[] keys, final int chrom, final int maxLen, final int[] starts, final int[] stops){
+     //Fills starts[]/stops[] with each key's site-list range in the given block (-1 when unused) and returns how many keys have a range.
+     final Block block=index[chrom];
+     int found=0;
+
+     for(int i=0; i<keys.length; i++){
+         final int key=keys[i];
+
+         //Default to "no list" until this key is shown to have usable sites in the block.
+         starts[i]=-1;
+         stops[i]=-1;
+
+         if(key<0){continue;} //Negative keys are skipped.
+
+         final int total=count(key);
+         if(total<=0 || total>=maxLen){continue;} //Overall list is empty or not shorter than maxLen.
+
+         final int local=block.length(key);
+         if(local<=0){continue;} //No sites for this key in this block.
+
+         final int first=block.starts[key];
+         starts[i]=first;
+         stops[i]=first+local;
+         found++;
+     }
+     return found;
+ }
+
+
+ private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){
+     //Counts the non-negative keys whose count() is positive and below maxLen; with clearBadKeys, the other non-negative keys are overwritten with -1.
+     int usableKeys=0;
+     for(int i=0; i<keys.length; i++){
+         final int key=keys[i];
+         if(key<0){continue;}
+
+         final int listLen=count(key);
+         final boolean usable=(listLen>0 && listLen<maxLen);
+         if(usable){
+             usableKeys++;
+         }else if(clearBadKeys){
+             keys[i]=-1;
+         }
+     }
+     return usableKeys;
+ }
+
+
+ public final ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){
+     //Searches with limits obeyed first; if that yields nothing and DOUBLE_SEARCH_NO_HIT is set, searches once more with obeyLimits=false.
+     assert(minChrom<=maxChrom && minChrom>=0);
+
+     final ArrayList<SiteScore> limited=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id);
+     final boolean foundSites=(limited!=null && !limited.isEmpty());
+     if(foundSites || !DOUBLE_SEARCH_NO_HIT){
+         return limited;
+     }
+     return find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);
+ }
+
+
+public final ArrayList<SiteScore> find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){
+
+ assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN);
+ int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length);
+
+ initialKeys+=offsetsP.length;
+ initialKeyIterations++;
+
+ final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2);
+
+ int numHits=0;
+ numHits=countHits(keysP, maxLen, true);
+ if(numHits>0){ //TODO: Change these to higher numbers
+ int trigger=(3*keysP.length)/4;
+ if(numHits<20 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, (maxLen*3)/2, true);
+ }
+ if(numHits<18 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*2, true);
+ }
+ if(numHits<16 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*3, true);
+ }
+ if(numHits<14 && numHits<trigger){
+ for(int i=0; i<keysP.length; i++){keysP[i]=keysOriginal[i];}
+ numHits=countHits(keysP, maxLen*5, true);
+ }
+ }
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ initialKeys2+=numHits;
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+ if(TRIM_BY_GREEDY && obeyLimits){
+ int maxLists=Tools.max((int)(HIT_FRACTION_TO_RETAIN*keysP.length), MIN_HIT_LISTS_TO_RETAIN);
+ numHits=trimExcessHitListsByGreedy(offsetsP, keyScoresP, maxLists, keysP);
+ }
+// System.out.println("After greedy: numHits = "+numHits);
+
+ if(TRIM_BY_TOTAL_SITE_COUNT && obeyLimits){
+ throw new RuntimeException("Needs to be redone.");
+// numHits=trimExcessHitLists(keys, hits);
+ }
+
+ if(TRIM_LONG_HIT_LISTS && obeyLimits && numHits>MIN_APPROX_HITS_TO_KEEP){
+ int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits);
+
+ int zeroes=keysP.length-numHits;
+ int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1));
+ cutoffIndex=Tools.max(cutoffIndex, altMinIndex);
+
+ assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits;
+
+ if(cutoffIndex<(keysP.length-1)){
+ int[] lens=getGenericArray(keysP.length);
+ for(int i=0; i<keysP.length; i++){lens[i]=count(keysP[i]);}
+ Arrays.sort(lens);
+ int cutoff=lens[cutoffIndex];
+
+ cutoff=Tools.max(lengthHistogram[MIN_INDEX_TO_DROP_LONG_HIT_LIST], cutoff);
+
+ int removed=0;
+
+ for(int i=0; i<keysP.length; i++){
+ int key=keysP[i];
+ if(count(key)>cutoff){
+ keysP[i]=-1;
+ removed++;
+ numHits--;
+ }
+ }
+ }
+ }
+// assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+ final ArrayList<SiteScore> result=new ArrayList<SiteScore>(8);
+ if(numHits<MIN_APPROX_HITS_TO_KEEP){return result;}
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ if(numHits<keysP.length){
+ int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+ assert(r!=null);
+ if(r!=null){
+ offsetsP=r[0];
+ keysP=r[1];
+ keyScoresP=r[2];
+ }
+ }else{
+ assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+ }
+ assert(keysP.length==numHits);
+ //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+ //Reverse the offsets for minus-strand mapping, since they are generated based on quality
+ int[] offsetsM=KeyRing.reverseOffsets(offsetsP, KEYLEN, basesP.length);
+ final int[] keysM=KeyRing.reverseComplementKeys(keysP, KEYLEN);
+
+// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+// assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM);
+
+ assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length)));
+ assert(keyScoresP!=null);
+ assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP);
+ final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1));
+ final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1));
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+ assert(offsetsM.length==offsetsP.length);
+ assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM));
+
+ /*
+ * bestScores:
+ *
+ * bestScores[0] currentTopScore
+ * bestScores[1] maxHits
+ * bestScores[2] qcutoff
+ * bestScores[3] bestqscore
+ * bestScores[4] maxQuickScore
+ * bestScores[5] perfectsFound
+ */
+ final int[] bestScores=new int[6];
+
+ //This prevents filtering by qscore when a low-quality read only uses a few keys.
+ //In that case, extending is more important.
+ final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5);
+
+ int[][] prescanResults=null;
+ int[] precounts=null;
+ int[] prescores=null;
+
+ int hitsCutoff=0;
+ int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ boolean allBasesCovered=true;
+ {
+ if(offsetsP[0]!=0){allBasesCovered=false;}
+ else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;}
+ else{
+ for(int i=1; i<offsetsP.length; i++){
+ if(offsetsP[i]>offsetsP[i-1]+KEYLEN){
+ allBasesCovered=false;
+ break;
+ }
+ }
+ }
+ }
+
+ //TODO I don't understand this logic
+ final boolean pretendAllBasesAreCovered=(allBasesCovered ||
+ keysP.length>=keysOriginal.length-4 ||
+ (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f))));
+
+// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP));
+// assert(allBasesCovered);
+
+ if(prescan_qscore){
+ prescanResults=prescanAllBlocks(bestScores,
+ keysP, keyScoresP, offsetsP,
+ keysM, keyScoresM, offsetsM,
+ pretendAllBasesAreCovered);
+
+ if(prescanResults!=null){
+ precounts=prescanResults[0];
+ prescores=prescanResults[1];
+ }
+
+ if(bestScores[1]<MIN_APPROX_HITS_TO_KEEP){return result;}
+ if(bestScores[3]<maxQuickScore*MIN_QSCORE_MULT2){return result;}
+
+ if(bestScores[3]>=maxQuickScore && pretendAllBasesAreCovered){
+ assert(bestScores[3]==maxQuickScore);
+ assert(bestScores[1]==numHits);
+
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }else{
+ hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false);
+ qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH));
+ }
+ }
+
+ final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true);
+ final boolean fullyDefined=AminoAcid.isFullyDefined(basesP);
+ assert(bestScores[2]<=0) : Arrays.toString(bestScores);
+
+ int cycle=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS,
+ offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;}
+ }
+ cycle++;
+ if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+ find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS,
+ offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+ if(QUIT_AFTER_TWO_PERFECTS){
+ if(bestScores[5]>=2){break;}
+ }
+ cycle++;
+ }
+
+ assert(Read.CHECKSITES(result, basesP, basesM, id, false)); //TODO: Comment out once checked
+
+ return result;
+ }
+
+ /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. Returns prescanReturn, whose [0] holds per-cycle hit counts and [1] per-cycle quick scores (one cycle per block and strand, plus before minus); also raises bestScores[1] and bestScores[3] to the best hit count and quick score seen. */
+ private final int[][] prescanAllBlocks(int[] bestScores,
+ int[] keysP, int[] keyScoresP, int[] offsetsP,
+ int[] keysM, int[] keyScoresM, int[] offsetsM,
+ final boolean allBasesCovered){
+ //pm[0] holds the plus-strand {keys, keyScores, offsets}; pm[1] holds the minus-strand equivalents.
+ int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}};
+
+ int bestqscore=0;
+ int maxHits=0;
+ int minHitsToScore=MIN_APPROX_HITS_TO_KEEP;
+
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+ //Every cycle starts at the maximum count and score; cycles not visited before an early return keep these values.
+ final int[] counts=precountArray;
+ final int[] scores=prescoreArray;
+ final int[][] ret=prescanReturn;
+ Arrays.fill(counts, keysP.length);
+ Arrays.fill(scores, maxQuickScore);
+ ret[0]=counts;
+ ret[1]=scores;
+
+ int cycle=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ final int baseChrom=baseChrom(chrom);
+ for(int pmi=0; pmi<2; pmi++, cycle++){ //pmi 0 = plus strand, 1 = minus strand
+
+ int[] keys=pm[pmi][0];
+ int[] keyScores=pm[pmi][1];
+ int[] offsets=pm[pmi][2];
+// int[][] hits=getHitArray(offsets.length);
+
+ int[] starts=startArray;
+ int[] stops=stopArray;
+ int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops);
+
+ if(numHits<minHitsToScore){ //Too few keys hit this block: record a sentinel score and a zero count
+ scores[cycle]=-9999;
+ counts[cycle]=0;
+ }else{
+
+// final int maxQuickScore=maxQuickScore(offsets, keyScores);
+ // System.err.println("maxScore = "+maxScore);
+
+ if(numHits<keys.length){ //Some keys have no list in this block; compact the parallel arrays
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+ heap.clear();
+ final Quad[] triples=tripleStorage;
+ final int[] values=valueArray;
+ //temp[0] is stored as this cycle's score and temp[1] as its hit count.
+ int[] temp=findMaxQscore2(starts, stops, offsets, keyScores, baseChrom, triples, values, minHitsToScore, true,
+ bestqscore>=maxQuickScore && allBasesCovered);
+
+ scores[cycle]=temp[0];
+ counts[cycle]=temp[1];
+
+ bestqscore=Tools.max(temp[0], bestqscore);
+ maxHits=Tools.max(maxHits, temp[1]);
+ if(bestqscore>=maxQuickScore && allBasesCovered){ //Best possible quick score reached with all bases covered
+ assert(bestqscore==maxQuickScore);
+ assert(maxHits==keysP.length) :
+ "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" +
+ "Scores: \t"+Arrays.toString(scores)+
+ "Counts: \t"+Arrays.toString(counts)+
+ "bestqscore: \t"+bestqscore+
+ "maxHits: \t"+maxHits+
+ "maxQuickScore: \t"+maxQuickScore+
+ "numHits: \t"+numHits+
+ "minHitsToScore: \t"+minHitsToScore+
+ "keys.length: \t"+keys.length;
+
+ minHitsToScore=Tools.max(minHitsToScore, maxHits);
+
+ {
+ //This early exit is optional. Does not seem to impact speed much either way.
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+ return ret;
+ }
+ }
+ }
+ }
+ }
+ //Normal exit: publish the best hit count and quick score found across all cycles.
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ret;
+ }
+
+
+ /** Searches one block on one strand, adding any sites found to ssl.
+ * Returns ssl unchanged when fewer than MIN_APPROX_HITS_TO_KEEP keys have a site list in the block;
+ * otherwise returns the list produced by slowWalk3 (if USE_SLOWALK3) or slowWalk2. */
+ public final ArrayList<SiteScore> find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores,
+         final int chrom, final byte strand,
+         int[] offsets, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores,
+         final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+
+     assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+     //Look up each key's site-list range in this block, using the shared scratch arrays.
+     final int[] listStarts=startArray;
+     final int[] listStops=stopArray;
+     final int hitLists=getHits(keys, chrom, Integer.MAX_VALUE, listStarts, listStops);
+     if(hitLists<MIN_APPROX_HITS_TO_KEEP){return ssl;}
+
+     if(!USE_SLOWALK3){
+         return slowWalk2(listStarts, listStops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, fullyDefined);
+     }
+
+     //bestScores is cleared before each walk unless RETAIN_BEST_SCORES is set.
+     if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);}
+     return slowWalk3(listStarts, listStops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined);
+ }
+
+ /** Compacts the parallel arrays, keeping only positions whose start is non-negative.
+ * Returns null when the number of kept positions equals offsets.length; otherwise returns shrinkReturn3
+ * with [0]=starts, [1]=stops, [2]=offsets and [4]=keyScores (slot 3 is not written here). */
+ private final int[][] shrink(int[] starts, int[] stops, int[] offsets, int[] keyScores, final int len){
+     int live=0;
+     for(int i=0; i<len; i++){
+         live+=(starts[i]>=0 ? 1 : 0);
+     }
+     if(live==offsets.length){return null;}
+
+     final int[][] out=shrinkReturn3;
+     //Starts and stops are packed into the shared scratch arrays; the write position never passes the read position.
+     final int[] packedStarts=startArray;
+     final int[] packedStops=stopArray;
+     final int[] packedOffsets=getOffsetArray(live);
+     final int[] packedScores=new int[live];
+
+     int dst=0;
+     for(int src=0; src<len; src++){
+         if(starts[src]<0){continue;}
+         packedStarts[dst]=starts[src];
+         packedStops[dst]=stops[src];
+         packedOffsets[dst]=offsets[src];
+         packedScores[dst]=keyScores[src];
+         dst++;
+     }
+
+     out[0]=packedStarts;
+     out[1]=packedStops;
+     out[2]=packedOffsets;
+     out[4]=packedScores;
+     return out;
+ }
+
+ /** Drops every position whose key is negative from the three parallel arrays.
+ * Returns null when no key is negative; otherwise returns shrinkReturn2 with
+ * [0]=offsets, [1]=keys and [2]=keyScores for the surviving positions. */
+ private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){
+     int kept=0;
+     for(int i=0; i<keys.length; i++){
+         kept+=(keys[i]>=0 ? 1 : 0);
+     }
+
+     assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+     if(kept==keys.length){return null;}
+
+     final int[][] out=shrinkReturn2;
+     final int[] packedOffsets=getOffsetArray(kept);
+     assert(packedOffsets!=offsets);
+     assert(packedOffsets.length<offsets.length);
+     final int[] packedKeys=new int[kept];
+     final int[] packedScores=new int[kept];
+
+     int dst=0;
+     for(int src=0; src<keys.length; src++){
+         if(keys[src]<0){continue;}
+         packedOffsets[dst]=offsets[src];
+         packedKeys[dst]=keys[src];
+         packedScores[dst]=keyScores[src];
+         dst++;
+     }
+
+     //The surviving offsets must still be strictly ascending.
+     assert(checkOffsets(packedOffsets)) : "\nnumHits="+kept+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(packedOffsets)+"\n"+
+             "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(packedKeys)+"\n";
+
+     out[0]=packedOffsets;
+     out[1]=packedKeys;
+     out[2]=packedScores;
+     return out;
+ }
+
+
+ /** This uses a heap to track next column to increment */
+ private final ArrayList<SiteScore> slowWalk2(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl, final boolean fullyDefined){
+
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+//// System.out.println("After SHRINK_BEFORE_WALK: numHits = "+hits.length);
+// Block b=index[baseChrom_];
+// int[][] hits=b.getHitLists(starts, stops);
+// if(SHRINK_BEFORE_WALK){
+// Object[] r=shrink(hits, offsets, keyScores);
+// if(r!=null){
+// hits=(int[][])r[0];
+// offsets=(int[])r[1];
+// keyScores=(int[])r[3];
+// }
+// }
+//
+// final int numHits=hits.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true);
+// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets));
+// System.err.println("maxScore = "+maxScore);
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f));
+// final int minScore=(int)(MIN_SCORE_MULT*maxScore);
+// System.err.println("minScore = "+minScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+
+// final PriorityQueue<Quad> heap=new PriorityQueue<Quad>(numHits);
+ heap.clear();
+// final Quad[] triples=new Quad[numHits];
+ final Quad[] triples=tripleStorage;
+
+ final Block b=index[baseChrom];
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ int currentTopScore=-999999999;
+
+ int cutoff=minScore;
+
+ int maxHits=0;
+ int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println();
+
+ SiteScore prevSS=null;
+ while(!heap.isEmpty()){
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+
+ {//Inner loop
+ final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int score;
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ if(USE_EXTENDED_SCORE){
+ final int chrom=numberToChrom(site, baseChrom);
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ if(true/*USE_AFFINE_SCORE*/){
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+
+ chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+
+ Arrays.toString(locArray)+"\n"+
+ Arrays.toString(values)+"\n"+
+ new String(bases)+"\nstrand="+strand+"\n");
+ System.err.println();
+ }
+ score=-99999;
+ }
+
+
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+
+
+// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+
+// if(chrom==17 && absdif(min, 30354420)<2000){
+// System.err.println("\n*****\n");
+// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+
+// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+
+// ", mapStart="+mapStart+", mapStop="+mapStop);
+// System.err.println();
+// System.err.println(Arrays.toString(locArray));
+// System.err.println();
+// System.err.println("chrom="+chrom);
+// System.err.println("score="+score);
+// }
+ }
+ }else{
+ score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ score+=scoreZ;
+ }
+ }
+
+
+// score=score(values, centerIndex, offsets, hits);
+// if(ADD_SCORE_Z){
+// int scoreZ=scoreZ2(values, centerIndex, offsets);
+// score+=scoreZ;
+// }
+//
+// if(USE_EXTENDED_SCORE){
+// if(score>minQuickScore){
+//// System.out.println(score+" > "+minQuickScore);
+// score=extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex, locArray);
+// }else{
+//// System.out.print(".");
+// score=-1;
+// }
+// }
+
+
+// System.err.println("maxScore = "+maxScore);
+// System.err.println("hits = "+approxHits+" / "+approxHitsCutoff);
+// System.err.println("score = "+score+" / "+cutoff);
+
+ if(score>=cutoff){
+
+// System.err.println("Passed!");
+
+// System.out.println("approxHits="+approxHits+" / "+approxHitsCutoff);
+// System.out.println("score="+score+" / "+cutoff);
+// System.out.println("strand="+Gene.strandCodes[strand]);
+// System.out.println("center="+values[centerIndex]);
+// System.out.println("values="+Arrays.toString(values));
+// extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex);
+// System.out.println();
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+
+ // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH));
+ if(USE_EXTENDED_SCORE && score>=maxScore){
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+// final int chrom=numberToChrom(site, baseChrom);
+// final int site2=numberToSite(site);
+// final int site3=numberToSite(maxNearbySite)+read.length;
+
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ //Note: I could also do this as soon as score is calculated.
+// if(ADD_SCORE_Z){
+// int scoreZ=scoreZ2(values, centerIndex, offsets);
+// score+=scoreZ;
+// }
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+// SiteScore prevSS=(ssl.size()<1 ? null : ssl.get(ssl.size()-1));
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ gapArray=makeGapArray(locArray, mapStart, MINGAP);
+ if(gapArray!=null){
+ int sub=site2-mapStart;//thus site2=mapStart+sub
+ for(int i=0; i<gapArray.length; i++){
+ gapArray[i]+=sub;
+ }
+ assert(gapArray[0]==mapStart);
+ assert(gapArray[gapArray.length-1]==mapStop);
+ }
+ assert(false) : Arrays.toString(locArray);
+ }
+
+ if(gapArray==null && prevSS!=null && prevSS.gaps==null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ int betterScore=Tools.max(score, prevSS.score);
+ int minStart=Tools.min(prevSS.start, site2);
+ int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStop(maxStop);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ prevSS.setPerfect();
+ }else{
+ prevSS.setStart(minStart);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2){
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else{
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ ss.gaps=gapArray;
+ if(gapArray!=null){
+ System.err.println(ss.toText()+"\t"+Arrays.toString(gapArray)+"\n"+Arrays.toString(locArray)+"\n");
+ }
+ }
+
+ if(ss!=null){
+// System.out.println("Added site "+ss.toText());
+ ssl.add(ss);
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }
+ if(heap.isEmpty()){break;}
+ }
+
+ }
+
+ return ssl;
+ }
+
+ /**
+ * Walks the hit lists of all keys in ascending site order, using a heap to track the next column to increment.
+ * For each distinct site it counts how many keys land nearby, optionally filters by quick score, runs
+ * extendScore, and appends a SiteScore to ssl for every candidate that passes the current cutoffs.
+ * Cutoffs are tightened as better sites are found. A candidate overlapping the previously emitted
+ * SiteScore may be merged into it instead of being added.
+ * Requires USE_EXTENDED_SCORE.
+ *
+ * @param starts Per-key first row in the block's site list (inclusive).
+ * @param stops Per-key end row in the block's site list (exclusive; a column is exhausted when row reaches it).
+ * @param bases Read bases.
+ * @param baseScores Passed through to extendScore (presumably per-base qualities; confirm against extendScore).
+ * @param keyScores Per-key scores used by quickScore and maxQuickScore.
+ * @param offsets Per-key offsets; each raw site has its key's offset subtracted (clamped at site 0).
+ * @param baseChrom_ Chromosome used to select the index block via baseChrom().
+ * @param strand Strand stored in generated SiteScores.
+ * @param obeyLimits If false, the minimum score is scaled by an extra factor of 1.25.
+ * @param ssl Output list; allocated if null.
+ * @param bestScores In/out state: [0]=top score, [1]=max hits, [2]=quick-score cutoff, [3]=best quick score,
+ * [4]=max quick score (written only), [5]=number of perfect sites found.
+ * @param allBasesCovered One of the conditions enabling the short-circuit that skips extendScore.
+ * @param maxScore Maximum attainable score for this read.
+ * @param fullyDefined Required (together with score==maxScore) for a new site to be flagged perfect.
+ * @return ssl, with any newly found sites appended.
+ */
+ private final ArrayList<SiteScore> slowWalk3(int[] starts, int[] stops, final byte[] bases,
+ final byte[] baseScores, int[] keyScores, int[] offsets,
+ final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl,
+ int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+ assert(USE_EXTENDED_SCORE);
+
+ final int numKeys=offsets.length; //Before shrink
+
+ //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ //shrink() may return replacement arrays; when it does, the local parameters are rebound to them.
+ if(SHRINK_BEFORE_WALK){
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ final int numHits=offsets.length; //After shrink
+
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+
+ usedKeys+=numHits;
+ usedKeyIterations++;
+
+ //Quick-score filtering is only applied when the read had at least 5 keys before shrinking.
+ final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5);
+
+ assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+
+// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+ final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+ final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+ final int baseChrom=baseChrom(baseChrom_);
+
+ heap.clear();
+
+ //Shared scratch storage owned by this object; contents are overwritten below.
+ final Quad[] triples=tripleStorage;
+
+ final int[] values=valueArray;
+ final int[] sizes=sizeArray;
+ final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+ final Block b=index[baseChrom];
+
+ if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+ //Resume from the state left in bestScores by earlier calls.
+ int currentTopScore=bestScores[0];
+ int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH));
+
+ int qcutoff=Tools.max(bestScores[2], minQuickScore);
+ int bestqscore=bestScores[3];
+ int maxHits=bestScores[1];
+ int perfectsFound=bestScores[5];
+ assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits;
+ int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore);
+ //Not enough keys remain to ever reach the required hit count.
+ if(approxHitsCutoff>numHits){return ssl;}
+
+ final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore);
+
+ if(currentTopScore>=maxScore){
+ assert(currentTopScore==maxScore);
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ }
+
+
+// assert(false) : "numHits="+numHits+", maxHits="+maxHits+", MIN_APPROX_HITS_TO_KEEP="+MIN_APPROX_HITS_TO_KEEP+", approxHitsCutoff="+approxHitsCutoff+", maxHits="+maxHits;
+
+
+ //Seed the heap with the first (offset-adjusted) site of every key's hit list.
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ //Subtracting the offset would go below site 0; clamp to 0 on the same chromosome.
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+// System.out.println("\nEntering SS loop:");
+// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff);
+
+
+ //Most recently added SiteScore; candidate for merging with the next overlapping site.
+ SiteScore prevSS=null;
+ while(!heap.isEmpty()){
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+
+ {//Inner loop
+ //Count columns whose current value lies in [site-MAX_INDEL, site+MAX_INDEL2].
+ //"chances" is the number of misses still allowed; the scan stops once it drops below zero.
+ final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int score;
+ //When quick-score filtering is off, qscore is set to qcutoff so the filter below always passes.
+ int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff);
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ int mapStart=site, mapStop=maxNearbySite;
+
+ assert(USE_EXTENDED_SCORE);
+
+ //True only when extendScore has filled locArray for this site.
+ boolean locArrayValid=false;
+ if(qscore<qcutoff){
+ score=-1;
+ }else{
+
+ final int chrom=numberToChrom(site, baseChrom);
+
+ //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results).
+ //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run.
+ //May have something to do with compiler loop optimizations.
+ if(shortCircuit && qscore==maxQuickScore){
+ assert(approxHits==numKeys);
+ score=maxScore;
+ }else{
+ if(verbose){
+ System.err.println("Extending "+Arrays.toString(values));
+ }
+ score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+ locArrayValid=true;
+
+ if(verbose){
+ System.err.println("score: "+score);
+ System.err.println("locArray: "+Arrays.toString(locArray));
+ }
+
+ //Correct begin and end positions if they changed.
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ for(int i=0; i<locArray.length; i++){
+ int x=locArray[i];
+ if(x>-1){
+ if(x<min){min=x;}
+ if(x>max){max=x;}
+ }
+ }
+
+ if(score>=maxScore){
+ assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+
+ // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+ if(min<0 || max<0){
+ //No location was filled in (max stays at Integer.MIN_VALUE); report once and reject the site.
+ if(!Shared.anomaly){
+ Shared.anomaly=true;
+ System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+
+ chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+
+ Arrays.toString(locArray)+"\n"+
+ Arrays.toString(values)+"\n"+
+ new String(bases)+"\nstrand="+strand+"\n");
+ System.err.println();
+ }
+ score=-99999;
+ }
+
+ //mapStart and mapStop are indices
+ mapStart=toNumber(min, chrom);
+ mapStop=toNumber(max, chrom);
+
+ if(score>=maxScore){
+ assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+
+ ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+ }
+ }
+
+ //A perfect score raises the quick-score cutoff and recomputes the hit cutoff in "perfect" mode.
+ if(score==maxScore){
+ qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true);
+ }
+
+ if(score>=cutoff){
+ qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH));
+ bestqscore=Tools.max(qscore, bestqscore);
+ }
+ }
+
+ if(score>=cutoff){
+
+ if(score>currentTopScore){
+// System.err.println("New top score!");
+
+ if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore);
+
+ cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH));
+ if(score>=maxScore){
+ assert(USE_EXTENDED_SCORE);
+ cutoff=Tools.max(cutoff, (int)(score*0.95f));
+ }
+ }
+
+ currentTopScore=score;
+
+// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+ }
+
+ //Convert packed numbers back to chromosome and site coordinates; site3 is the inclusive stop.
+ final int chrom=numberToChrom(mapStart, baseChrom);
+ final int site2=numberToSite(mapStart);
+ final int site3=numberToSite(mapStop)+bases.length-1;
+
+ assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+ "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+ assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+ //A gap array is only attempted when the site spans at least MINGAP more than the read length.
+ int[] gapArray=null;
+ if(site3-site2>=MINGAP+bases.length){
+ assert(locArrayValid) : "Loc array was not filled.";
+// System.err.println("****\n"+Arrays.toString(locArray)+"\n");
+// int[] clone=locArray.clone();
+ gapArray=makeGapArray(locArray, site2, MINGAP);
+ if(gapArray!=null){
+// System.err.println(Arrays.toString(locArray)+"\n");
+// System.err.println(Arrays.toString(gapArray));
+//
+//// int sub=site2-mapStart;//thus site2=mapStart+sub
+//// for(int i=0; i<gapArray.length; i++){
+//// gapArray[i]+=sub;
+//// }
+//// System.err.println(Arrays.toString(gapArray));
+//
+// System.err.println(mapStart+" -> "+site2);
+// System.err.println(mapStop+" -> "+site3);
+
+ assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length);
+ assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+
+ mapStart+" -> "+site2+"\n"+
+ mapStop+" -> "+site3+"\n\n"+
+ Arrays.toString(gapArray)+"\n\n"+
+// Arrays.toString(clone)+"\n\n"+
+ Arrays.toString(locArray)+"\n"+
+ "numHits="+numHits+", "+
+ "heap.size="+heap.size()+", "+
+ "numHits="+numHits+", "+
+ "approxHits="+approxHits+"\n";
+ //Stretch the outer gap endpoints so they cover the full site range.
+ gapArray[0]=Tools.min(gapArray[0], site2);
+ gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3);
+ }
+ if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));}
+// assert(false) : Arrays.toString(locArray);
+ }
+
+
+ //This block is optional, but tries to eliminate multiple identical alignments
+
+ SiteScore ss=null;
+ final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+ //Sites extending outside the chromosome are dropped: ss stays null for them.
+ final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]);
+// if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);}
+
+ if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null &&
+ prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+ final int betterScore=Tools.max(score, prevSS.score);
+ final int minStart=Tools.min(prevSS.start, site2);
+ final int maxStop=Tools.max(prevSS.stop, site3);
+ final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+ assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+ final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+ if(prevSS.start==site2 && prevSS.stop==site3){
+ //Identical coordinates: keep one entry with the better score.
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.perfect=(prevSS.perfect || perfect1 || perfect2);
+ if(prevSS.perfect){prevSS.semiperfect=true;}
+ }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStop(site3);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStop(maxStop);
+ prevSS.setPerfect(bases);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){
+ if(perfect2){
+ //do nothing
+ }else if(perfect1){
+ prevSS.setStart(site2);
+ if(!prevSS.perfect){perfectsFound++;}//***$
+ prevSS.perfect=prevSS.semiperfect=true;
+ }else{
+ prevSS.setStart(minStart);
+ prevSS.setPerfect(bases);
+ }
+ prevSS.score=prevSS.quickScore=betterScore;
+ }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+ && !perfect1 && !perfect2 && !prevSS.semiperfect){
+ prevSS.setLimits(minStart, maxStop);
+ prevSS.score=prevSS.quickScore=betterScore;
+ prevSS.setPerfect(bases);
+ }else{
+ //Overlapping but not mergeable under the enabled rules: emit as a separate site.
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+ }
+ assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+ }else if(inbounds){
+ ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+ if(!perfect1){ss.setPerfect(bases);}
+ ss.gaps=gapArray;
+ if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+ }
+
+ assert(ss==null || !ss.perfect || ss.semiperfect) : ss;
+ assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS;
+ //The null check must guard BOTH mode tests: ss is null for out-of-bounds sites, and && binds
+ //tighter than ||, so the two mode tests are grouped explicitly to avoid dereferencing null in PERFECTMODE.
+ if(ss!=null && ((SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect))){ss=null;}
+
+ if(ss!=null){
+// System.out.println("Added site "+ss.toText()+", qscore="+qscore);
+ ssl.add(ss);
+ if(ss.perfect){
+
+ //Only count a perfect site if it does not overlap a preceding perfect site.
+ if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){
+ if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));}
+ perfectsFound++;
+
+ //Human-specific code
+// if(QUIT_AFTER_TWO_PERFECTS){
+// if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;}
+// }
+
+ //Note: this break exits the walk and falls through to the bestScores write-back at the end.
+ if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;}
+ }
+ }
+
+ prevSS=ss;
+ }else{
+// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+ }
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(heap.size()<approxHitsCutoff || PERFECTMODE){
+ //This column is exhausted and too few columns remain to reach the hit cutoff
+ //(or, in PERFECTMODE, any exhausted column ends the walk): save state and stop.
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound; //***$ fixed by adding this line
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+ if(heap.isEmpty()){
+ assert(false) : heap.size()+", "+approxHitsCutoff;
+ break;
+ }
+ }
+
+ }
+
+ //Normal exit: publish the updated state for subsequent calls.
+ assert(USE_EXTENDED_SCORE);
+ bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[2]=Tools.max(bestScores[2], qcutoff);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ bestScores[4]=maxQuickScore;
+ bestScores[5]=perfectsFound;
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ssl;
+ }
+
+
+ /**
+ * Pre-scan over the hit lists that finds the highest quick score (and the largest nearby-hit count seen
+ * while the quick score was improving) without running extendScore. Uses the shared heap and sizeArray,
+ * and updates the hist_hits histogram for every site visited.
+ *
+ * @param starts Per-key first row in the block's site list (inclusive).
+ * @param stops Per-key end row in the block's site list (exclusive).
+ * @param offsets Per-key offsets; each raw site has its key's offset subtracted (clamped at site 0).
+ * @param keyScores Per-key scores used by quickScore and maxQuickScore.
+ * @param baseChrom_ Chromosome used to select the index block via baseChrom().
+ * @param triples Scratch Quads, one per key; triples[i].column must equal i.
+ * @param values Scratch array receiving the current adjusted site of each column.
+ * @param prevMaxHits Lower bound for the initial hit cutoff (ignored when perfectOnly); must not exceed the number of keys.
+ * @param earlyExit If true, return as soon as the maximum quick score is reached, or when a list is
+ * exhausted and either perfectOnly is set or too few columns remain.
+ * @param perfectOnly If true, require all keys to hit and allow no leftward indel window.
+ * @return A two-element array: {top quick score, max hits}. The top quick score is -999999999 if no site qualified.
+ */
+ private final int[] findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores,
+ final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits,
+ boolean earlyExit, boolean perfectOnly){
+
+ final int numHits=offsets.length;
+ assert(numHits>=prevMaxHits);
+
+ final int baseChrom=baseChrom(baseChrom_);
+ final Block b=index[baseChrom];
+ final int[] sizes=sizeArray;
+
+ //Seed the heap with the first (offset-adjusted) site of every key's hit list.
+ heap.clear();
+ for(int i=0; i<numHits; i++){
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){
+ a2=a-offsets[i];
+ }else{
+ //Subtracting the offset would go below site 0; clamp to 0 on the same chromosome.
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0);
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ //Sentinel meaning "no qualifying site seen yet".
+ int topQscore=-999999999;
+
+ int maxHits=0;
+// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+
+ int approxHitsCutoff;
+ final int indelCutoff;
+ if(perfectOnly){
+ //Every key must hit, and the window does not extend left of the current site.
+ approxHitsCutoff=numHits;
+ indelCutoff=0;
+ }else{
+ approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy
+ indelCutoff=MAX_INDEL2;
+ }
+
+
+ while(!heap.isEmpty()){
+ Quad t=heap.peek();
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+ {//Inner loop
+ //Count columns whose current value lies in [minsite, maxsite].
+ //"chances" is the number of misses still allowed; the scan stops once it drops below zero.
+ final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+ //Hit counts above HIT_HIST_LEN are all recorded in the last bin.
+ hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++;
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ if(qscore>topQscore){
+
+// maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan
+
+ //maxHits is only updated when the quick score improves; the cutoff trails the new hit count by one.
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan
+
+ topQscore=qscore;
+
+ if(qscore>=maxQuickScore){
+ assert(qscore==maxQuickScore);
+ assert(approxHits==numHits);
+ if(earlyExit){
+ return new int[] {topQscore, maxHits};
+ }
+ }
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(earlyExit && (perfectOnly || heap.size()<approxHitsCutoff)){
+ //This column is exhausted; with perfectOnly or too few remaining columns, no better site can qualify.
+ return new int[] {topQscore, maxHits};
+ }
+ //Guard the peek() in the loop condition against an emptied heap.
+ if(heap.isEmpty()){break;}
+ }
+
+ }
+
+
+
+ return new int[] {topQscore, maxHits};
+ }
+
+
+ /** Returns the larger argument minus the smaller one, using plain int subtraction. */
+ private static final int absdif(int a, int b){
+ if(a>b){return a-b;}
+ return b-a;
+ }
+
+
+ /**
+ * Returns the maximum attainable score for a read under the active scoring mode.
+ * Affine scoring takes priority over extended scoring; if neither is enabled the
+ * key-based maxQuickScore is returned.
+ *
+ * @param offsets Key offsets (only used by the quick-score fallback).
+ * @param baseScores Per-base scores; only read when useQuality is true.
+ * @param keyScores Key scores (only used by the quick-score fallback).
+ * @param readlen Read length.
+ * @param useQuality Whether baseScores contribute to the maximum.
+ */
+ final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){
+
+ if(USE_AFFINE_SCORE){
+ //The quality-aware form apparently MUST be used if quality is used later on for slow align.
+ return useQuality ? msa.maxQuality(baseScores) : msa.maxQuality(readlen);
+ }
+
+ if(USE_EXTENDED_SCORE){
+ final int allBasesHit=readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);
+ return useQuality ? allBasesHit+Tools.sumInt(baseScores) : allBasesHit;
+ }
+
+ return maxQuickScore(offsets, keyScores);
+ }
+
+
+ /**
+ * Returns the highest quick score a site could receive: the sum of all key scores, plus
+ * Y_SCORE_MULT times the distance between the first and last key offsets, plus maxScoreZ(offsets)
+ * when ADD_SCORE_Z is enabled.
+ *
+ * @param offsets Key offsets; must be non-empty.
+ * @param keyScores Key scores.
+ */
+ public final int maxQuickScore(int[] offsets, int[] keyScores){
+
+ int keyTotal=Tools.intSum(keyScores);
+
+ final int lastKey=offsets.length-1;
+ final int spanBonus=Y_SCORE_MULT*(offsets[lastKey]-offsets[0]);
+
+ if(ADD_SCORE_Z){
+ keyTotal+=maxScoreZ(offsets);
+ }
+
+ return keyTotal+spanBonus;
+ }
+
+
+ /**
+ * Computes the quick (key-based) score for the site of the key at centerIndex, and records
+ * numApproxHits in the hist_hits_score histogram (capped at HIT_HIST_LEN).
+ * With a single nearby hit the score is just that key's score. Otherwise it is the center key's
+ * score plus the scoreLeft and scoreRight contributions, minus centerIndex, plus
+ * Y_SCORE_MULT*scoreY and, if ADD_LIST_SIZE_BONUS is set, a bonus from the center list's size.
+ *
+ * @param locs Current adjusted site of each key.
+ * @param keyScores Per-key scores.
+ * @param centerIndex Index of the key defining the candidate site.
+ * @param offsets Per-key offsets.
+ * @param sizes Per-key hit-list sizes.
+ * @param penalizeIndels Passed through to scoreLeft and scoreRight.
+ * @param numApproxHits Number of keys hitting near the site.
+ * @param numHits Number of keys in use.
+ */
+ private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[],
+ int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){
+
+ final int histBin=Tools.min(HIT_HIST_LEN, numApproxHits);
+ hist_hits_score[histBin]++;
+
+ final int centerKeyScore=keyScores[centerIndex];
+ if(numApproxHits==1){
+ return centerKeyScore;
+ }
+
+ //Score outward from the center key in both directions.
+ final int leftPart=scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels);
+ final int rightPart=scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits);
+
+ //"-centerIndex" is a disambiguating term: given otherwise identical match patterns
+ //(for example, a small indel will generate two valid site candidates), it favors the lower site.
+ int total=centerKeyScore+leftPart+rightPart-centerIndex;
+
+ final int spanPart=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets);
+
+ if(ADD_LIST_SIZE_BONUS){
+ total+=calcListSizeBonus(sizes[centerIndex]);
+ }
+
+ return total+spanPart;
+ }
+
+
+// /** Generates a term that increases score with how many bases in the read match the ref. */
+// public static final int scoreZ(int[] locs, int centerIndex, int offsets[]){
+// final int center=locs[centerIndex];
+//
+// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE];
+//
+// final int maxLoc=center+MAX_INDEL2;
+// final int minLoc=Tools.max(0, center-MAX_INDEL);
+//
+// int score=0;
+//
+// for(int i=0; i<locs.length; i++){
+// int loc=locs[i];
+//// int dif=absdif(loc, center);
+// if(loc>=minLoc && loc<=maxLoc){
+//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+
+//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets);
+//
+// int offset=offsets[i];
+// int max=CHUNKSIZE+offset;
+//
+// for(int j=offset; j<max; j++){
+// int old=refLoc[j];
+// if(old==0){
+// refLoc[j]=loc;
+// score+=4;
+// }else if(old>loc){
+// refLoc[j]=loc;
+// score-=2;
+// }else if(old==loc){
+// score-=1;
+// //do nothing, perhaps, or add 1?
+// }else{
+// score-=2;
+// assert(old<loc);
+// }
+// }
+// }
+// }
+// return score;
+// }
+
+
+
+ private final int extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values,
+ final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits){
+ callsToExtendScore++;
+ hist_hits_extend[Tools.min(HIT_HIST_LEN, numApproxHits)]++;
+
+ final int centerVal=values[centerIndex];
+ final int centerLoc=numberToSite(centerVal);
+
+ final int minLoc=Tools.max(0, centerLoc-MAX_INDEL); //Legacy, for assertions
+ final int maxLoc=centerLoc+MAX_INDEL2; //Legacy, for assertions
+
+ final int minVal=centerVal-MAX_INDEL;
+ final int maxVal=centerVal+MAX_INDEL2;
+
+ final byte[] ref=Data.getChromosome(chrom).array;
+
+ if(verbose){
+ System.err.println("\n");
+ System.err.println("minLoc="+minLoc+", maxLoc="+ maxLoc+", centerIndex="+centerIndex+", centerVal="+centerVal+", centerLoc="+centerLoc);
+ System.err.println("minVal="+minVal+", maxVal="+ maxVal+", numHits="+numHits+", numApproxHits="+numApproxHits);
+ System.err.println("offsets:\t"+Arrays.toString(offsets));
+ System.err.println("values:\t"+Arrays.toString(values));
+ System.err.println();
+ int centerOffset=offsets[centerIndex];
+
+ for(int i=0; i<centerOffset; i++){System.err.print(" ");}
+ for(int i=centerOffset; i<centerOffset+KEYLEN; i++){System.err.print((char)bases[i]);}
+ System.err.println();
+
+ System.err.println(new String(bases));
+ System.err.println(new String(Arrays.copyOfRange(ref, centerLoc, centerLoc+bases.length)));
+ System.err.println();
+ }
+
+// int[] locArray=new int[bases.length];
+ Arrays.fill(locArray, -1);
+
+
+ //First fill in reverse
+ for(int i=0, keynum=0; i<numHits; i++){
+ final int value=values[i];
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+ assert(refbase>=minLoc && refbase<=maxLoc);
+
+// System.out.println("numApproxHits="+numApproxHits+", numHits="+numHits+", i="+i+", minVal="+minVal+", value="+value+", maxVal="+maxVal+
+// ", refbase="+refbase+", minLoc="+minLoc+", maxLoc="+maxLoc+", keynum="+keynum);
+// System.out.println("Reverse: Trying key "+refbase+" @ "+offsets[i]);
+// System.out.println("Passed!");
+//
+// System.out.println("Number: \t"+Long.toHexString(value|(1l<<63)));
+// System.out.println("Mask: \t"+Long.toHexString(SITE_MASK|(1l<<63)));
+// System.out.println("Both: \t"+Long.toHexString((value&SITE_MASK)|(1l<<63)));
+
+ keynum++;
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc<ref.length; cloc--, rloc--){
+ int old=locArray[cloc];
+ if(old==refbase){
+// System.out.println("Broke because old="+old+", refbase="+refbase);
+ break;
+ } //Already filled with present value
+ if(misses>0 && old>=0){
+// System.out.println("Broke because old="+old+", misses="+misses);
+ break;
+ } //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ //Only extends first key all the way back. Others stop at the first error.
+ if(old>=0 || keynum>1){
+// System.out.println("Broke because old="+old+", keynum="+keynum);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+
+
+ //Then fill forward
+ for(int i=0; i<numHits; i++){
+ final int value=values[i];
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+ assert(refbase>=minLoc && refbase<=maxLoc);
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN, rloc=refbase+cloc; cloc<bases.length && rloc<ref.length; cloc++, rloc++){
+ int old=locArray[cloc];
+ if(old==refbase){break;} //Already filled with present value
+ if(misses>0 && old>=0){break;} //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ if(old>=0){break;} //Already filled with something that has no errors
+ }
+ }
+ }
+ }
+
+// //Change 'N' to -2. A bit slow.
+// {
+// int firstMatch=0;
+// while(firstMatch<locArray.length && locArray[firstMatch]<0){firstMatch++;}
+// assert(firstMatch<locArray.length) : new String(bases);
+// int last=locArray[firstMatch];
+// for(int i=firstMatch-1; i>=0; i--){
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else{
+// assert(locArray[i]==-1);
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// for(int i=firstMatch; i<locArray.length; i++){
+// final int loc=locArray[i];
+// if(last<1){last=loc;}
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else if(loc==-1 && last>0){
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// }
+
+// System.out.println("$$$\n"+Arrays.toString(locArray));
+// assert(false) : "hits="+numHits+", centerIndex="+centerIndex+", centerVal="+centerVal+", centerLoc="+centerLoc+
+// ", minLoc="+minLoc+", maxLoc="+maxLoc+", minVal="+minVal+", maxVal="+maxVal;
+
+ //Change 'N' to -2, but only for nocalls, not norefs. Much faster.
+ {
+ final byte nb=(byte)'N';
+ for(int i=0; i<bases.length; i++){
+ if(bases[i]==nb){locArray[i]=-2;}
+ }
+ }
+
+ if(USE_AFFINE_SCORE){
+ /* TODO - sometimes returns a higher score than actual alignment. This should never happen. */
+ int score=(KFILTER<2 ? msa.calcAffineScore(locArray, baseScores, bases) :
+ msa.calcAffineScore(locArray, baseScores, bases, KFILTER));
+ return score;
+ }
+
+ int score=0;
+ int lastLoc=-1;
+ int centerBonus=BASE_HIT_SCORE/5;
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+ if(loc>=0){
+ score+=BASE_HIT_SCORE+baseScores[i];
+ if(loc==centerLoc){score+=centerBonus;}
+ if(loc!=lastLoc && lastLoc>=0){
+ int dif=absdif(loc, lastLoc);
+ int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT);
+ score-=penalty;
+ }
+ lastLoc=loc;
+ }
+ }
+
+// System.err.println("Extended score: "+score);
+// System.err.println(Arrays.toString(locArray));
+
+
+ return score;
+ }
+
+
+ /** NOTE! This destroys the locArray, so use a copy if needed. Returns null if no adjacent pair differs by more than minGap, else {first, gapStart, gapStop, ..., last}. */
+ private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){
+ int gaps=0;
+ boolean doSort=false;
+ //Rewrite locArray in place: an unset (negative) cell becomes previous+1 (minLoc for cell 0); a set cell has its own index added.
+ if(locArray[0]<0){locArray[0]=minLoc;}
+ for(int i=1; i<locArray.length; i++){
+ if(locArray[i]<0){locArray[i]=locArray[i-1]+1;}
+ else{locArray[i]+=i;}
+ if(locArray[i]<locArray[i-1]){doSort=true;} //Out of order; the gap scans below assert non-negative differences
+ }
+
+// System.err.println(Arrays.toString(locArray)+"\n");
+
+ if(doSort){
+// System.err.println("*");
+ Arrays.sort(locArray);
+ }
+// System.err.println(Arrays.toString(locArray)+"\n");
+ //Pass 1: count adjacent differences larger than minGap so the output can be sized.
+ for(int i=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ gaps++;
+ }
+ }
+ if(gaps<1){return null;}
+ int[] out=new int[2+gaps*2];
+ out[0]=locArray[0];
+ out[out.length-1]=locArray[locArray.length-1];
+ //Pass 2: store the two values bounding each gap as consecutive entries between the first and last element.
+ for(int i=1, j=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ out[j]=locArray[i-1];
+ out[j+1]=locArray[i];
+ j+=2;
+ }
+ }
+ return out;
+ }
+
+
+ /** Generates a term that increases score with how many bases in the read match the ref: Z_SCORE_MULT times the read bases covered by keys whose hit lies near the center hit. */
+ private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){
+ //With a single approximate hit the answer is the precomputed one-key constant.
+ if(numApproxHits==1){return SCOREZ_1KEY;}
+
+ final int center=locs[centerIndex];
+ //Only hits inside [center-MAX_INDEL, center+MAX_INDEL2] are counted; the lower bound is clamped at 0.
+ final int maxLoc=center+MAX_INDEL2;
+ final int minLoc=Tools.max(0, center-MAX_INDEL);
+
+ int score=0;
+ //a0/b0 are the start and end of the current merged run of key intervals; -1/-1 means no run yet.
+ int a0=-1, b0=-1;
+
+ for(int i=0; i<numHits; i++){
+ int loc=locs[i];
+// int dif=absdif(loc, center);
+ if(loc>=minLoc && loc<=maxLoc){
+// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+
+// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets);
+ int a=offsets[i];
+ //This key starts past the end of the current run: bank the run's length and start a new one. (Presumably relies on ascending offsets - confirm with caller.)
+ if(b0<a){
+ score+=b0-a0;
+ a0=a;
+ }
+ b0=a+KEYLEN;
+ }
+ }
+ score+=b0-a0; //Bank the last open run (adds 0 if no hit qualified)
+ score=score*Z_SCORE_MULT;
+// assert(score==scoreZslow(locs, centerIndex, offsets, false)) : scoreZslow(locs, centerIndex, offsets, true)+" != "+score;
+ return score;
+ }
+
+ /** This was just to verify scoreZ2. Marks every read position covered by a key whose hit is near the center and counts the marked positions. */
+ @Deprecated
+ private final int scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display){
+ final int center=locs[centerIndex];
+ //Same acceptance window as scoreZ2.
+ final int maxLoc=center+MAX_INDEL2;
+ final int minLoc=Tools.max(0, center-MAX_INDEL);
+ //One flag per read position, sized to reach the end of the key at the last offset.
+ byte[] array=new byte[offsets[offsets.length-1]+KEYLEN];
+ int score=0;
+ //Unlike scoreZ2, this walks all of locs rather than the first numHits entries.
+ for(int i=0; i<locs.length; i++){
+ int loc=locs[i];
+// int dif=absdif(loc, center);
+ if(loc>=minLoc && loc<=maxLoc){
+ int pos=offsets[i];
+// if(true){
+// System.err.println("\ni="+i+", pos="+pos+", array=["+array.length+"], limit="+(pos+CHUNKSIZE-1));
+// }
+ for(int j=pos; j<pos+KEYLEN; j++){
+ if(array[j]==0){score++;} //Count each position only the first time it is covered
+ array[j]=1;
+ }
+ }
+ }
+
+ if(display){System.err.println("\n"+Arrays.toString(array)+"\n");}
+
+ return score*Z_SCORE_MULT;
+ }
+
+ /** Z term obtained when every key counts: read bases covered by at least one key interval (overlaps merged, assuming ascending offsets), times Z_SCORE_MULT. */
+ private final int maxScoreZ(int offsets[]){
+ //Sweep the key intervals [offset, offset+KEYLEN) in array order, merging intervals that touch or overlap.
+ int runStart=-1, runEnd=-1;
+ int covered=0;
+ for(final int keyStart : offsets){
+ final boolean startsNewRun=(runEnd<keyStart);
+ if(startsNewRun){
+ //Bank the finished run; the initial -1/-1 sentinel banks 0.
+ covered+=(runEnd-runStart);
+ runStart=keyStart;
+ }
+ runEnd=keyStart+KEYLEN;
+ }
+ //Bank the run that is still open after the last key.
+ covered+=(runEnd-runStart);
+ return covered*Z_SCORE_MULT;
+ }
+
+
+ private final int scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits){
+ //Sums the contribution of the keys after centerIndex (up to numHits), chaining each accepted hit to the previously accepted one.
+ int score=0;
+ //loc is the last accepted location, starting from the center hit; prev is the one before it.
+ int prev, loc=locs[centerIndex];
+
+ for(int i=centerIndex+1; i<numHits; i++){
+
+ if(locs[i]>=0){ //Negative entries are skipped
+ prev=loc;
+ loc=locs[i];
+ //Distance from the last accepted hit; 0 means the key is collinear with it.
+ int offset=absdif(loc, prev);
+
+ if(offset<=MAX_INDEL){
+ score+=keyScores[i];
+ if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);}
+
+// if(i==locs.length-1){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key
+
+ if(penalizeIndels && offset!=0){
+ int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); //Grows with the shift, capped
+ score-=penalty;
+// score-=(INDEL_PENALTY+Tools.min(INDEL_PENALTY_MULT*offset, 1+HIT_SCORE/4));
+ }
+ }else{
+ loc=prev; //Too far away: ignore this hit and keep chaining from the last accepted location
+ }
+ }
+
+ }
+ return score;
+
+ }
+
+ private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){
+ //Mirror of scoreRight for the keys before centerIndex, walking down to index 0; also bumps the callsToScore counter.
+ callsToScore++;
+
+ int score=0;
+ //loc is the last accepted location, starting from the center hit; prev is the one before it.
+ int prev, loc=locs[centerIndex];
+
+ for(int i=centerIndex-1; i>=0; i--){
+
+ if(locs[i]>=0){ //Negative entries are skipped
+ prev=loc;
+ loc=locs[i];
+ //Distance from the last accepted hit; 0 means the key is collinear with it.
+ int offset=absdif(loc, prev);
+
+ if(offset<=MAX_INDEL){
+ score+=keyScores[i];
+ if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);}
+
+// if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key
+ if(penalizeIndels && offset!=0){
+ int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); //Grows with the shift, capped
+ score-=penalty;
+ }
+ }else{
+ loc=prev; //Too far away: ignore this hit and keep chaining from the last accepted location
+ }
+ }
+
+ }
+ return score;
+
+ }
+
+ /** Pack a (site, chrom) pair into one int: the chrom's low bits are shifted up by SHIFT_LENGTH and the site is OR-ed in below them. */
+ private static final int toNumber(int site, int chrom){
+ //Only the bits selected by CHROM_MASK_LOW (CHROMS_PER_BLOCK-1) are stored.
+ final int chromInBlock=chrom&CHROM_MASK_LOW;
+ final int highBits=chromInBlock<<SHIFT_LENGTH;
+ return highBits|site;
+ }
+
+ /** Recover the chromosome from a packed number: the bits above SHIFT_LENGTH added to the high bits of baseChrom. */
+ private static final int numberToChrom(int number, int baseChrom){
+ assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+ assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+ final int chromInBlock=number>>>SHIFT_LENGTH;
+ final int blockBase=baseChrom&CHROM_MASK_HIGH;
+ return chromInBlock+blockBase;
+ }
+
+ /** Extract the site (location) held in the bits of a packed number selected by SITE_MASK. */
+ private static final int numberToSite(int number){
+ return SITE_MASK&number;
+ }
+
+ public static final int minChrom(int chrom){final int blockStart=chrom&CHROM_MASK_HIGH; return Tools.max(MINCHROM, blockStart);} //Start of chrom's block, but never below MINCHROM
+ public static final int baseChrom(int chrom){final int blockStart=chrom&CHROM_MASK_HIGH; return Tools.max(0, blockStart);} //Start of chrom's block, but never below 0
+ public static final int maxChrom(int chrom){final int blockEnd=chrom|CHROM_MASK_LOW; return Tools.max(MINCHROM, Tools.min(MAXCHROM, blockEnd));} //End of chrom's block, capped at MAXCHROM, then raised to at least MINCHROM
+
+
+ private final int[] getOffsetArray(int len){ //Scratch int[len]: cached per length and reused (not cleared) when len fits the cache
+ if(len>=offsetArrays.length){return new int[len];} //Too long to cache; allocate a fresh array
+ final int[] cached=offsetArrays[len];
+ return cached!=null ? cached : (offsetArrays[len]=new int[len]);
+ }
+ private final int[] getLocArray(int len){ //Scratch int[len]: cached per length and reused (not cleared) when len fits the cache
+ if(len>=locArrays.length){return new int[len];} //Too long to cache; allocate a fresh array
+ final int[] cached=locArrays[len];
+ return cached!=null ? cached : (locArrays[len]=new int[len]);
+ }
+ private final int[] getGreedyListArray(int len){ //Scratch int[len]: cached per length and reused (not cleared) when len fits the cache
+ if(len>=greedyListArrays.length){return new int[len];} //Too long to cache; allocate a fresh array
+ final int[] cached=greedyListArrays[len];
+ return cached!=null ? cached : (greedyListArrays[len]=new int[len]);
+ }
+ private final int[] getGenericArray(int len){ //Scratch int[len]: cached per length and reused (not cleared) when len fits the cache
+ if(len>=genericArrays.length){return new int[len];} //Too long to cache; allocate a fresh array
+ final int[] cached=genericArrays[len];
+ return cached!=null ? cached : (genericArrays[len]=new int[len]);
+ }
+
+ final byte[] getBaseScoreArray(int len, int strand){ //Scratch byte[len], cached separately for each strand index and reused (not cleared)
+ if(len>=baseScoreArrays[0].length){return new byte[len];} //Too long to cache; allocate a fresh array
+ final byte[][] perStrand=baseScoreArrays[strand];
+ return perStrand[len]!=null ? perStrand[len] : (perStrand[len]=new byte[len]);
+ }
+ final int[] getKeyScoreArray(int len, int strand){ //Scratch int[len], cached separately for each strand index; a reused array keeps whatever the previous user wrote
+ if(len>=keyScoreArrays[0].length){return new int[len];} //Bound is the per-length cache size, as in getBaseScoreArray; keyScoreArrays.length is only the strand dimension (2)
+ if(keyScoreArrays[strand][len]==null){keyScoreArrays[strand][len]=new int[len];}
+ return keyScoreArrays[strand][len];
+ }
+ private final float[] getKeyWeightArray(int len){ //Scratch float[len]: cached per length and reused (not cleared) when len fits the cache
+ if(len>=keyWeightArrays.length){return new float[len];} //Too long to cache; allocate a fresh array
+ final float[] cached=keyWeightArrays[len];
+ return cached!=null ? cached : (keyWeightArrays[len]=new float[len]);
+ }
+ @Override
+ float[] keyProbArray() { //Accessor for the single shared keyProbArray field (same instance on every call)
+ return keyProbArray;
+ }
+
+ public static final int KMER_ARRAY_LENGTH=1201; //Size of the per-length caches for offset/greedy/generic/key-weight arrays; longer requests are allocated fresh
+ public static final int HEAP_LENGTH=2047; //Capacity of the heap and of the value/size/start/stop scratch arrays
+ public static final int BASE_ARRAY_LENGTH=6001; //Size of the per-length caches for loc and base-score arrays
+ //Scratch storage owned by this instance. The [][] caches are indexed by requested length and filled lazily by the getXArray methods.
+ private final int[][] locArrays=new int[BASE_ARRAY_LENGTH][];
+ private final int[] valueArray=new int[HEAP_LENGTH];
+ private final int[] sizeArray=new int[HEAP_LENGTH];
+ private final int[][] offsetArrays=new int[KMER_ARRAY_LENGTH][];
+ private final int[][] greedyListArrays=new int[KMER_ARRAY_LENGTH][];
+ private final int[][] genericArrays=new int[KMER_ARRAY_LENGTH][];
+ private final int[] startArray=new int[HEAP_LENGTH];
+ private final int[] stopArray=new int[HEAP_LENGTH];
+ private final Quad[] tripleStorage=makeQuadStorage(HEAP_LENGTH); //Despite the name, holds preallocated Quad objects
+ private final int[] greedyReturn=new int[2];
+ private final int[][] shrinkReturn2=new int[3][];
+ private final int[][] shrinkReturn3=new int[5][];
+ private final int[][] prescanReturn=new int[2][];
+ private final int[] prescoreArray; //Assigned outside this declaration (final, no initializer)
+ private final int[] precountArray; //Assigned outside this declaration (final, no initializer)
+ //The two 3-D caches are indexed [strand][length].
+ private final byte[][][] baseScoreArrays=new byte[2][BASE_ARRAY_LENGTH][];
+ private final int[][][] keyScoreArrays=new int[2][KMER_ARRAY_LENGTH][];
+ final float[] keyProbArray=new float[BASE_ARRAY_LENGTH];
+ private final float[][] keyWeightArrays=new float[KMER_ARRAY_LENGTH][];
+
+
+ private final Quad[] makeQuadStorage(int number){ //Preallocates number Quads; each is constructed with its own array index as first argument
+ final Quad[] storage=new Quad[number];
+ for(int slot=0; slot<storage.length; slot++){storage[slot]=new Quad(slot, 0, 0);}
+ return storage;
+ }
+
+
+ private final QuadHeap heap=new QuadHeap(HEAP_LENGTH);
+ //Bit layout of a packed (chrom, site) number. All five values below are recomputed by setChromBits().
+ static int SHIFT_LENGTH=(32-1-NUM_CHROM_BITS); //Number of low bits that hold the site; one bit of the 32 is not used for either field
+ static int MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH); //The low SHIFT_LENGTH bits set, i.e. the largest value that fits in the site field
+
+ /** Mask the number to get the site, which is in the lower bits */
+ static int SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1));
+
+ /** Mask the chromosome's high bits to get the low bits */
+ static int CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+
+ /** Mask the chromosome's lower bits to get the high bits */
+ static int CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+ static void setChromBits(int x){
+ //Sets NUM_CHROM_BITS to x and rederives every dependent constant, in dependency order, then sanity-checks them.
+ NUM_CHROM_BITS=x;
+ CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS));
+ SHIFT_LENGTH=(32-1-NUM_CHROM_BITS);
+ MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH);
+ SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1));
+ CHROM_MASK_LOW=CHROMS_PER_BLOCK-1; //Must follow CHROMS_PER_BLOCK
+ CHROM_MASK_HIGH=~CHROM_MASK_LOW; //Must follow CHROM_MASK_LOW
+
+// assert(NUM_CHROM_BITS<30);
+ assert(NUM_CHROM_BITS>=0); //max is 3 for human; perhaps more for other organisms
+// assert((1<<(NUM_CHROM_BITS))>=CHROMSPERBLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMSPERBLOCK;
+ assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMS_PER_BLOCK;
+ assert(Integer.bitCount(CHROMS_PER_BLOCK)==1); //Power of two, so CHROMS_PER_BLOCK-1 is a contiguous low-bit mask
+ assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS+1)) : Integer.toHexString(SITE_MASK);
+ }
+
+ private final int cycles;
+
+ public static final int BASE_HIT_SCORE=100;
+ public static final int ALIGN_COLUMNS=7600;
+ public static int MAX_INDEL=100; //Max indel length, min 0, default 400; longer is more accurate
+ public static int MAX_INDEL2=8*MAX_INDEL; //Upper-side window (center+MAX_INDEL2) in the scoring methods; evaluated once here, so later writes to MAX_INDEL do not update it
+ //Per-instance scoring parameters: final fields without initializers, so they are assigned elsewhere.
+ private final float INV_BASE_KEY_HIT_SCORE;
+ private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1
+ private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length
+ private final int MAX_PENALTY_FOR_MISALIGNED_HIT; //Cap applied to the per-hit indel penalty in scoreLeft/scoreRight
+ private final int SCOREZ_1KEY; //Returned by scoreZ2 when numApproxHits==1
+
+ public static final boolean GENERATE_KEY_SCORES_FROM_QUALITY=true; //True: Much faster and more accurate.
+ public static final boolean GENERATE_BASE_SCORES_FROM_QUALITY=true; //True: Faster, and at least as accurate.
+ public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed
+ public static final int Z_SCORE_MULT=25; //Multiplier applied to the covered-base count in scoreZ2/maxScoreZ
+ public static final int Y_SCORE_MULT=10;
+
+
+ /**
+ * Switch to semiperfect mode: return only sites that match completely or with partial no-reference.
+ * Turns off PRESCAN_QSCORE and sets both indel limits to zero; must not be combined with perfect mode.
+ */
+ public static void setSemiperfectMode() {
+ assert(!PERFECTMODE);
+ //Both indel limits drop to zero in this mode.
+ MAX_INDEL2=0;
+ MAX_INDEL=0;
+
+// MIN_APPROX_HITS_TO_KEEP++;
+ PRESCAN_QSCORE=false;
+ SEMIPERFECTMODE=true;
+ }
+
+ /**
+ * Switch to perfect mode: return only sites that match completely.
+ * Turns off PRESCAN_QSCORE and sets both indel limits to zero; must not be combined with semiperfect mode.
+ */
+ public static void setPerfectMode() {
+ assert(!SEMIPERFECTMODE);
+ //Both indel limits drop to zero in this mode.
+ MAX_INDEL2=0;
+ MAX_INDEL=0;
+
+// MIN_APPROX_HITS_TO_KEEP++;
+ PRESCAN_QSCORE=false;
+ PERFECTMODE=true;
+ }
+
+ static float FRACTION_GENOME_TO_EXCLUDE=0.005f; //Default .04; lower is slower and more accurate
+ //Stores f (must be in [0,1)) and recomputes the five list-length thresholds with the same formulas as their field initializers.
+ public static final void setFractionToExclude(float f){
+ assert(f>=0 && f<1);
+ FRACTION_GENOME_TO_EXCLUDE=f;
+ MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810
+ MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840
+ MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910
+ MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935
+ MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860
+ }
+
+
+ /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */
+ static final float HIT_FRACTION_TO_RETAIN=.97f; //default: .85
+ /** Range: 0 to 1000. Lower should be faster and less accurate. */
+ static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810
+ /** Range: 2 to infinity. Lower should be faster and less accurate. */
+ static final int MIN_HIT_LISTS_TO_RETAIN=12;
+ //NOTE(review): the "default" figures on these lines appear to describe an older FRACTION_GENOME_TO_EXCLUDE (0.005f gives roughly 982 for the first formula); setFractionToExclude() recomputes all of them.
+ static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840
+ //lower is faster
+ static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910
+ //lower is faster
+ static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935
+ //lower is faster
+ static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860
+
+ /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */
+ public static final int SMALL_GENOME_LIST=80;
+
+ static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";}
+
+ static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy.
+
+ /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. NOTE(review): 2800 is outside that stated range - confirm whether it is compared as a raw list length. */
+ static final int CLUMPY_MIN_LENGTH_INDEX=2800;
+ static final float CLUMPY_FRACTION=0.8f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy.
+
+ static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; //Final: captured once at class initialization, so later writes to MAX_INDEL2 do not change it
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */
+ public static final int MAX_HITS_REDUCTION1=2;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */
+ public static int MAX_HITS_REDUCTION2=3;
+
+ /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */
+ public static final int MAX_HITS_REDUCTION_PERFECT=2;
+ //The next two feed calcApproxHitsCutoff: the reduction is min(max(hits/HIT_REDUCTION_DIV, MAX_HITS_REDUCTION2), max(MAXIMUM_MAX_HITS_REDUCTION, keys/8)).
+ public static int MAXIMUM_MAX_HITS_REDUCTION=6;
+ public static int HIT_REDUCTION_DIV=4;
+
+ private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$
+ assert(keys>=hits) : keys+", "+hits;
+ assert(hits>=0);
+
+ int mahtk=MIN_APPROX_HITS_TO_KEEP;
+ if(SEMIPERFECTMODE || PERFECTMODE){
+ if(keys==1){return 1;}
+ else if(MIN_APPROX_HITS_TO_KEEP<keys){
+ mahtk++;
+ if(currentCutoff==MIN_APPROX_HITS_TO_KEEP){currentCutoff++;}
+ }
+ }
+
+ int reduction=Tools.min(Tools.max((hits)/HIT_REDUCTION_DIV, MAX_HITS_REDUCTION2), Tools.max(MAXIMUM_MAX_HITS_REDUCTION, keys/8));
+ assert(reduction>=0);
+ int r=hits-reduction;
+
+ r=Tools.max(mahtk, currentCutoff, r);
+
+ if(perfect){
+ r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT);
+ }
+ return r;
+ }
+
+ public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE; //Effective only when USE_EXTENDED_SCORE is on
+ public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed
+ public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast.
+ public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.02f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT=0.005f; //Fraction of max score to use as cutoff. Default 0.025, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT2=0.005f;
+ static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.64f : USE_EXTENDED_SCORE ? .74f : 0.6f); //Default .85f; lower is more accurate
+ static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f
+ static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$
+ static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false
+ static{ //With assertions enabled, verify at class load that both score fractions lie in [0,1)
+ assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1);
+ assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1);
+ }
+
+
+}
diff --git a/current/align2/BBIndexPacBioSkimmer.java b/current/align2/BBIndexPacBioSkimmer.java
new file mode 100755
index 0000000..b5bdecb
--- /dev/null
+++ b/current/align2/BBIndexPacBioSkimmer.java
@@ -0,0 +1,2287 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+
+/**
+ * Based on Index11f
+ * Designed to skim and retain all sites above a threshold.
+ *
+ *
+ *
+ * @author Brian Bushnell
+ * @date Jul 11, 2012
+ *
+ */
+public final class BBIndexPacBioSkimmer extends AbstractIndex {
+
+
+ public static void main(String[] args){
+ //Command-line entry point: parses key=value arguments (build/b, minchrom, maxchrom, keylen/k) and builds the index via IndexMaker4.
+ int k=12;
+
+ for(int i=0; i<args.length; i++){
+ String s=args[i].toLowerCase();
+ if(s.contains("=")){ //Arguments without '=' are silently ignored
+ String[] split=s.split("=");
+ String a=split[0];
+ String b=split[1]; //NOTE(review): an argument ending in '=' (no value) would index past the end of split here
+ if(a.equals("build") || a.equals("b")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(a.equals("minchrom")){
+ MINCHROM=Integer.parseInt(b);
+ }else if(a.equals("maxchrom")){
+ MAXCHROM=Integer.parseInt(b);
+ }else if(a.equals("keylen") || a.equals("k")){
+ k=Integer.parseInt(b);
+ }
+ }
+ }
+ //-1 is treated as "not set": default to chrom 1 through Data.numChroms.
+ if(MINCHROM==-1){MINCHROM=1;}
+ if(MAXCHROM==-1){
+ assert(Data.numChroms<=Byte.MAX_VALUE) : "TODO";
+ MAXCHROM=Data.numChroms;
+ }
+
+
+ System.err.println("Writing build "+Data.GENOME_BUILD+" "+
+ "BASESPACE index, keylen="+k+", chrom bits="+NUM_CHROM_BITS);
+
+ //NOTE(review): the build starts at `first`, not at MINCHROM; MINCHROM is parsed above but not passed to makeIndex.
+ int first=(NUM_CHROM_BITS==0 ? 1 : 0);
+
+
+ Data.sysout.println("Loading index for chunk "+first+"-"+MAXCHROM+", build "+Data.GENOME_BUILD);
+ index=IndexMaker4.makeIndex(Data.GENOME_BUILD, first, MAXCHROM,
+ k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, true, false, index); //true/false occupy the writeToDisk/diskInvalid positions used by loadIndex
+
+
+ System.err.println("Finished all chroms, may still be writing.");
+ }
+
+
+ public BBIndexPacBioSkimmer(int k_, int minChrom_, int maxChrom_, int kfilter_, MSA msa_){ //Passes the arguments to AbstractIndex, then derives this instance's scoring constants and scratch arrays
+ super(k_, kfilter_, BASE_HIT_SCORE, minChrom_, maxChrom_, msa_);
+ INV_BASE_KEY_HIT_SCORE=1f/BASE_KEY_HIT_SCORE;
+ INDEL_PENALTY=(BASE_KEY_HIT_SCORE/8)-1; //default (HIT_SCORE/2)-1
+ INDEL_PENALTY_MULT=25; //default 20; penalty for indel length
+ MAX_PENALTY_FOR_MISALIGNED_HIT=BASE_KEY_HIT_SCORE-(1+BASE_KEY_HIT_SCORE/8);
+ SCOREZ_1KEY=Z_SCORE_MULT*KEYLEN; //Z score of exactly one key's worth of covered bases
+ { //Count 2 cycles per index block between minChrom and maxChrom (presumably one per strand - confirm)
+ int cyc=0;
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){cyc+=2;}
+ cycles=cyc;
+ }
+ prescoreArray=new int[cycles];
+ precountArray=new int[cycles];
+ }
+
+ /** Load or generate the index covering minChrom to maxChrom, inclusive, with key length k.
+ * The range may span multiple blocks; minChrom is raised to at least 1 and maxChrom capped at Data.numChroms.
+ * Should only be called once in a process. */
+ public static final synchronized void loadIndex(int minChrom, int maxChrom, int k, boolean writeToDisk, boolean diskInvalid){
+ final int firstChrom=Math.max(1, minChrom);
+ final int lastChrom=Math.min(Data.numChroms, maxChrom);
+ assert(firstChrom<=lastChrom);
+ Data.sysout.println("Loading index for chunk "+firstChrom+"-"+lastChrom+", build "+Data.GENOME_BUILD);
+ index=IndexMaker4.makeIndex(Data.GENOME_BUILD, firstChrom, lastChrom,
+ k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, writeToDisk, diskInvalid, index);
+
+ }
+
+ /** Calculate statistics of index, such as list lengths, and find clumpy keys */
+ public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){
+ assert(lengthHistogram==null);
+ assert(COUNTS==null);
+
+ int KEYSPACE=1<<(2*k);
+ COUNTS=new int[KEYSPACE];
+ maxChrom=maxChrom(maxChrom);
+
+ HashMap<Integer, LongM> cmap=new HashMap<Integer, LongM>();
+
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ Block b=index[chrom];
+ final int[] sites=b.sites;
+ final int[] starts=b.starts;
+
+ for(int key=0; key<KEYSPACE; key++){
+
+ long clumps=0;
+
+ final int start1=starts[key];
+ final int stop1=starts[key+1];
+ final int len1=stop1-start1;
+ COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len1);
+
+ if(REMOVE_CLUMPY){
+ for(int i=start1+1; i<stop1; i++){
+ int dif=sites[i]-sites[i-1];
+ assert(dif!=0);
+ if(dif>0 && dif<=CLUMPY_MAX_DIST){
+ clumps++;
+ }
+ }
+ if(clumps>0){
+ final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k));
+ final Integer ko=x;
+ LongM lm=cmap.get(ko);
+ if(lm==null){
+ lm=new LongM(0);
+ cmap.put(ko, lm);
+ }
+ lm.increment(clumps);
+ }
+ }
+ }
+ }
+
+ for(int key=0; key<COUNTS.length; key++){
+ int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+ if(key<rkey){
+ int x=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+(long)COUNTS[rkey]);
+ COUNTS[key]=COUNTS[rkey]=x;
+ }
+ }
+
+ if(REMOVE_CLUMPY){
+ Integer[] keys=cmap.keySet().toArray(new Integer[cmap.size()]);
+ Arrays.sort(keys);
+
+ for(Integer key : keys){
+ long clumps=cmap.get(key).value();
+ long len=COUNTS[key];
+ if((len>CLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){
+ int rkey=AminoAcid.reverseComplementBinaryFast(key, k);
+ assert(key<=rkey);
+ assert(key==KeyRing.reverseComplementKey(rkey, k));
+ COUNTS[key]=0;
+ COUNTS[rkey]=0;
+ }
+ }
+ }
+
+ lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2);
+
+ //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));}
+
+ if(REMOVE_FREQUENT_GENOME_FRACTION){
+
+ int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1));
+ int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1));
+
+ MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]);
+ MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]);
+
+ if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);}
+ }
+
+ Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH]));
+ if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;}
+ if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);}
+ assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE;
+ }
+
+// /** Calculate statistics of index, such as list lengths, and find clumpy keys */
+// public static final synchronized void analyzeIndex(int minChrom, int maxChrom, float fractionToExclude, int k){
+//
+// assert(lengthHistogram==null);
+// assert(COUNTS==null);
+//
+// int KEYSPACE=1<<(2*k);
+// COUNTS=new int[KEYSPACE];
+//
+// maxChrom=maxChrom(maxChrom);
+//
+// for(int key=0; key<KEYSPACE; key++){
+// int rkey=KeyRing.reverseComplementKey(key, k, cs);
+// assert(key==KeyRing.reverseComplementKey(rkey, k));
+//
+// if(key<=rkey){
+//
+// long clumps=0;
+// long len=0;
+//
+// for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+// Block b=index[chrom];
+//
+// final int[] sites=b.sites;
+// final int start1=b.starts[key];
+// final int stop1=start1+b.length(key);
+// final int start2=(rkey==key ? -1 : b.starts[rkey]);
+// final int stop2=(rkey==key ? -1 : start2+b.length(rkey));
+// final int len1=stop1-start1;
+// final int len2=stop2-start2;
+//
+// len=len+len1+len2;
+//
+// if(REMOVE_CLUMPY){
+// for(int i=start1+1; i<stop1; i++){
+// int dif=sites[i]-sites[i-1];
+// assert(dif!=0);
+// if(dif>0 && dif<=CLUMPY_MAX_DIST){
+// clumps++;
+// }
+// }
+//
+// for(int i=start2+1; i<stop2; i++){
+// int dif=sites[i]-sites[i-1];
+// assert(dif!=0);
+// if(dif>0 && dif<=CLUMPY_MAX_DIST){
+// clumps++;
+// }
+// }
+// }
+//
+// }
+//
+// COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len);
+// if(key!=rkey){COUNTS[rkey]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[rkey]+len);}
+// assert(COUNTS[key]==COUNTS[rkey]) : key+", "+rkey;
+//
+// if(REMOVE_CLUMPY && len>CLUMPY_MIN_LENGTH_INDEX && clumps>(CLUMPY_FRACTION*len)){
+// COUNTS[key]=0;
+// COUNTS[rkey]=0;
+// for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+// Block b=index[chrom];
+// final int[] sites=b.sites;
+// sites[b.starts[key]]=-1;
+// sites[b.starts[rkey]]=-1;
+// }
+// }
+//
+//// System.err.println("COUNTS["+key+"] = "+COUNTS[key]+", COUNTS["+rkey+"] = "+COUNTS[rkey]);
+// }
+// }
+//
+// lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2);
+//
+// //if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));}
+//
+// if(REMOVE_FREQUENT_GENOME_FRACTION){
+//
+// int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1));
+// int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1));
+//
+// MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]);
+// MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]);
+//
+// if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);}
+// }
+//
+// Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH]));
+// if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;}
+// if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);}
+// assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE;
+// }
+
+
+ /** Name of the index file, as produced by IndexMaker4.fname, for the block that contains chrom with key length k. */
+ public static final String fname(int chrom, int k){
+ final int blockMin=minChrom(chrom), blockMax=maxChrom(chrom); return IndexMaker4.fname(blockMin, blockMax, k, NUM_CHROM_BITS);
+ }
+
+ /** Returns true only when every offset is strictly greater than the one before it (empty and single-element arrays pass). */
+ private static boolean checkOffsets(int[] offsets){
+ int idx=offsets.length-1;
+ //Walk backwards while each adjacent pair is in strictly ascending order.
+ while(idx>0 && offsets[idx]>offsets[idx-1]){idx--;}
+ return idx<=0; //Reached the front (or had nothing to compare) without finding a violation
+ }
+
+ @Deprecated
+ private final int trimExcessHitLists(int[] keys, int[][] hits){
+ //Disabled: with assertions on, the next line always fails. The rest nulls out entries of hits until total and average list length fit the limits, and returns how many lists remain.
+ assert(false) : "Needs to be redone because hits are no longer sorted by length.";
+
+ assert(hits.length==keys.length);
+// assert(false) : "modify this function so that it gives more weight to trimming lists over highly covered baits";
+ //And also, incorporate the "remove the longest list" function
+ //limit: cap on total sites (scaled by key count); limit2: cap on average list length; limit3: cap on the shortest list. All are floored at SMALL_GENOME_LIST.
+ final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+ final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+ final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+
+ int sum=0;
+ int initialHitCount=0;
+ //Track the smallest and second-smallest nonzero list lengths.
+ int shortest=Integer.MAX_VALUE-1;
+ int shortest2=Integer.MAX_VALUE;
+
+ for(int i=0; i<keys.length; i++){
+ int key=keys[i];
+ int x=COUNTS[key];
+ sum+=x;
+ initialHitCount+=(x==0 ? 0 : 1);
+ if(x>0){
+ if(x<shortest2){
+ shortest2=x;
+ if(shortest2<shortest){
+ shortest2=shortest;
+ shortest=x;
+ }
+ }
+ }
+ }
+ assert(shortest2>=shortest);
+ if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;} //Too few lists to afford trimming
+ if(shortest>limit3 && !SLOW){ //Even the shortest list is too long: discard everything
+ for(int i=0; i<hits.length; i++){hits[i]=null;}
+ return 0;
+ }
+ if(sum<=limit && sum/initialHitCount<=limit2){return initialHitCount;} //Already within both limits
+ //Sort the lists and drop from the end of the sorted order (presumably the longest lists - depends on Pointer's ordering).
+ Pointer[] ptrs=Pointer.loadMatrix(hits);
+// ptrs[0].value/=2;
+// ptrs[ptrs.length-1].value/=2;
+ Arrays.sort(ptrs);
+ //NOTE(review): sum was accumulated from COUNTS but is reduced by hits[...].length; the loop has no guard for i<0 or finalHitCount==0.
+ int finalHitCount=initialHitCount;
+ for(int i=ptrs.length-1; sum>limit || sum/finalHitCount>limit2; i--){
+ Pointer p=ptrs[i];
+ sum-=hits[p.key].length;
+ hits[p.key]=null;
+ finalHitCount--;
+ }
+
+ return finalHitCount;
+ }
+
+ /**
+  * Remove least useful keys to accelerate search.
+  * Repeatedly asks Solver.findWorstGreedy for the worst remaining key and marks it as removed
+  * (keys[i]=-1) while too many or too long hit lists remain.
+  * @param offsets Key offsets, handed to Solver.findWorstGreedy
+  * @param keyScores Per-key scores; multiplied by INV_BASE_KEY_HIT_SCORE to form the key weights
+  * @param maxHitLists Trimming continues while more than this many lists remain
+  * @param keys Kmer keys; removed keys are overwritten with -1 in place
+  * @return Number of keys that still have a hit list
+  */
+ public final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){
+
+     final float[] keyWeights=getKeyWeightArray(keyScores.length);
+     for(int k=0; k<keyScores.length; k++){
+         keyWeights[k]=keyScores[k]*INV_BASE_KEY_HIT_SCORE;
+     }
+
+     //Open ideas from the original author: weight trimming toward lists over highly covered
+     //baits, and incorporate the "remove the longest list" function.
+
+     final int limit=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])*keys.length;
+     final int limit2=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH2]);
+     final int limit3=Tools.max(SMALL_GENOME_LIST, lengthHistogram[MAX_SHORTEST_LIST_TO_SEARCH]);
+
+     final int[] lengths=getGenericArray(keys.length);
+
+     int sum=0;
+     int initialHitCount=0;
+
+     //Shortest and second-shortest non-empty list lengths; shortest<=shortest2 always holds
+     int shortest=Integer.MAX_VALUE-1;
+     int shortest2=Integer.MAX_VALUE;
+
+     for(int k=0; k<keys.length; k++){
+         final int listLen=count(keys[k]);
+         lengths[k]=listLen;
+         sum+=listLen;
+         if(listLen!=0){initialHitCount++;}
+         if(listLen>0){
+             if(listLen<shortest){
+                 shortest2=shortest;
+                 shortest=listLen;
+             }else if(listLen<shortest2){
+                 shortest2=listLen;
+             }
+         }
+     }
+     assert(shortest2>=shortest);
+     if(initialHitCount<MIN_APPROX_HITS_TO_KEEP){return initialHitCount;}
+     if(shortest>limit3 && !SLOW){
+         //Even the shortest list is too long: give up on every key
+         Arrays.fill(keys, -1);
+         return 0;
+     }
+
+     int hitsCount=initialHitCount;
+
+     while(hitsCount>=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists)){
+         //Collect the indices of the keys that still have a list
+         final int[] lists=getGreedyListArray(hitsCount);
+         int filled=0;
+         for(int k=0; filled<lists.length; k++){
+             if(lengths[k]>0){
+                 lists[filled]=k;
+                 filled++;
+             }
+         }
+
+         Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn);
+         final int worst=lists[greedyReturn[0]];
+         final int worstValue=greedyReturn[1];
+         sum-=lengths[worst];
+
+         //This line increases accuracy at expense of speed. Lower constant = more accurate, default 0.
+         if(worstValue>0 || lengths[worst]<SMALL_GENOME_LIST){return hitsCount;}
+
+         hitsCount--;
+         lengths[worst]=0;
+         keys[worst]=-1;
+     }
+     return hitsCount;
+ }
+
+
+ /**
+  * Fills starts/stops with the hit-list range of each key inside the block index[chrom].
+  * A key gets the range [starts[i], stops[i]) only if it is non-negative, its count(key) is
+  * positive and below maxLen, and the block itself holds at least one entry for it;
+  * otherwise both starts[i] and stops[i] are set to -1.
+  * @param keys Kmer keys; negative entries are treated as removed
+  * @param chrom Index into the block array
+  * @param maxLen Exclusive upper bound on count(key)
+  * @param starts Output: start of each key's range, or -1
+  * @param stops Output: end (exclusive) of each key's range, or -1
+  * @return Number of keys that received a range
+  */
+ private final int getHits(final int[] keys, final int chrom, final int maxLen, final int[] starts, final int[] stops){
+     final Block block=index[chrom];
+     int found=0;
+     for(int i=0; i<keys.length; i++){
+         starts[i]=-1;
+         stops[i]=-1;
+
+         final int key=keys[i];
+         if(key<0){continue;}
+
+         final int total=count(key);
+         if(total<=0 || total>=maxLen){continue;}
+
+         final int blockLen=block.length(key);
+         if(blockLen<=0){continue;}
+
+         final int begin=block.starts[key];
+         starts[i]=begin;
+         stops[i]=begin+blockLen;
+         found++;
+     }
+     return found;
+ }
+
+
+ /**
+  * Counts the keys whose count(key) is positive and below maxLen.
+  * @param keys Kmer keys; negative entries are skipped
+  * @param maxLen Exclusive upper bound on count(key)
+  * @param clearBadKeys If true, non-negative keys that fail the test are overwritten with -1
+  * @return Number of keys that pass the test
+  */
+ private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){
+     int usable=0;
+     for(int i=0; i<keys.length; i++){
+         final int key=keys[i];
+         if(key<0){continue;}
+
+         final int len=count(key);
+         final boolean ok=(len>0 && len<maxLen);
+         if(ok){
+             usable++;
+         }else if(clearBadKeys){
+             keys[i]=-1;
+         }
+     }
+     return usable;
+ }
+
+
+ /**
+  * Runs find() with limits obeyed and, if DOUBLE_SEARCH_NO_HIT is set and nothing was found,
+  * runs it a second time with limits relaxed.
+  * @return Result of the last search performed
+  */
+ public final ArrayList<SiteScore> findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){
+     assert(minChrom<=maxChrom && minChrom>=0);
+
+     ArrayList<SiteScore> sites=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id);
+     if(DOUBLE_SEARCH_NO_HIT){
+         if(sites==null || sites.isEmpty()){
+             //Second pass without limits
+             sites=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);
+         }
+     }
+     return sites;
+ }
+
+
+ /**
+  * Finds candidate sites for a read on both strands across all index blocks.
+  * Outline: make keys from the read, discard keys with unusable hit lists (relaxing the length
+  * limit stepwise when few keys survive), optionally trim further (greedy / long-list trimming),
+  * derive the minus-strand keys, offsets and scores, optionally prescan all blocks to obtain
+  * cutoffs, then search each block on both strands.
+  * @param basesP Plus-strand bases of the read
+  * @param basesM Bases used for the minus-strand search (NOTE(review): presumably the reverse complement of basesP - confirm with callers)
+  * @param qual Qualities; only used here in an assertion on the length of baseScoresP; may be null
+  * @param baseScoresP Per-base scores for the plus strand
+  * @param keyScoresP Per-key scores, parallel to offsetsP
+  * @param offsetsP Start position of each key in the read; must pass checkOffsets
+  * @param obeyLimits true: use MAX_USABLE_LENGTH and enable the trimming steps; false: use MAX_USABLE_LENGTH2 and skip trimming
+  * @param id Read id; not used inside this method
+  * @return List of candidate sites; empty if too few keys remain or the prescan rules the read out
+  */
+ public final ArrayList<SiteScore> find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){
+
+     assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+     final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN);
+     int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length);
+
+     initialKeys+=offsetsP.length;
+     initialKeyIterations++;
+
+     final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2);
+
+     //countHits overwrites unusable keys with -1, so keysP is restored from keysOriginal
+     //before each retry with a more permissive length limit.
+     int numHits=countHits(keysP, maxLen, true);
+     if(numHits>0){ //TODO: Change these to higher numbers
+         final int trigger=(3*keysP.length)/4;
+         if(numHits<20 && numHits<trigger){
+             System.arraycopy(keysOriginal, 0, keysP, 0, keysP.length);
+             numHits=countHits(keysP, (maxLen*3)/2, true);
+         }
+         if(numHits<18 && numHits<trigger){
+             System.arraycopy(keysOriginal, 0, keysP, 0, keysP.length);
+             numHits=countHits(keysP, maxLen*2, true);
+         }
+         if(numHits<16 && numHits<trigger){
+             System.arraycopy(keysOriginal, 0, keysP, 0, keysP.length);
+             numHits=countHits(keysP, maxLen*3, true);
+         }
+         if(numHits<14 && numHits<trigger){
+             System.arraycopy(keysOriginal, 0, keysP, 0, keysP.length);
+             numHits=countHits(keysP, maxLen*5, true);
+         }
+     }
+     // assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+     if(numHits<keysP.length){
+         //Drop the -1 keys together with their offsets and scores
+         int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+         assert(r!=null);
+         if(r!=null){
+             offsetsP=r[0];
+             keysP=r[1];
+             keyScoresP=r[2];
+         }
+     }else{
+         assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+     }
+     initialKeys2+=numHits;
+     //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+
+     // assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+     if(TRIM_BY_GREEDY && obeyLimits){
+         int maxLists=Tools.max((int)(HIT_FRACTION_TO_RETAIN*keysP.length), MIN_HIT_LISTS_TO_RETAIN);
+         numHits=trimExcessHitListsByGreedy(offsetsP, keyScoresP, maxLists, keysP);
+     }
+     // System.out.println("After greedy: numHits = "+numHits);
+
+     if(TRIM_BY_TOTAL_SITE_COUNT && obeyLimits){
+         throw new RuntimeException("Needs to be redone.");
+         // numHits=trimExcessHitLists(keys, hits);
+     }
+
+     if(TRIM_LONG_HIT_LISTS && obeyLimits && numHits>MIN_APPROX_HITS_TO_KEEP){
+         int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits);
+
+         //Keys already removed (-1) count as zero-length lists and sort to the front of lens
+         int zeroes=keysP.length-numHits;
+         int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1));
+         cutoffIndex=Tools.max(cutoffIndex, altMinIndex);
+
+         assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits;
+
+         if(cutoffIndex<(keysP.length-1)){
+             int[] lens=getGenericArray(keysP.length);
+             for(int i=0; i<keysP.length; i++){
+                 //Greedy trimming may have set keys to -1; never pass those to count(),
+                 //matching the key>=0 guard used by getHits and countHits.
+                 final int key=keysP[i];
+                 lens[i]=(key>=0 ? count(key) : 0);
+             }
+             Arrays.sort(lens);
+             int cutoff=lens[cutoffIndex];
+
+             cutoff=Tools.max(lengthHistogram[MIN_INDEX_TO_DROP_LONG_HIT_LIST], cutoff);
+
+             for(int i=0; i<keysP.length; i++){
+                 final int key=keysP[i];
+                 //Skip keys that are already removed so numHits is not decremented twice
+                 if(key>=0 && count(key)>cutoff){
+                     keysP[i]=-1;
+                     numHits--;
+                 }
+             }
+         }
+     }
+     // assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+     final ArrayList<SiteScore> result=new ArrayList<SiteScore>(8);
+     if(numHits<MIN_APPROX_HITS_TO_KEEP){return result;}
+     //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+     if(numHits<keysP.length){
+         int[][] r=shrink2(offsetsP, keysP, keyScoresP);
+         assert(r!=null);
+         if(r!=null){
+             offsetsP=r[0];
+             keysP=r[1];
+             keyScoresP=r[2];
+         }
+     }else{
+         assert(shrink2(offsetsP, keysP, keyScoresP)==null);
+     }
+     assert(keysP.length==numHits);
+     //assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+     //Reverse the offsets for minus-strand mapping, since they are generated based on quality
+     int[] offsetsM=KeyRing.reverseOffsets(offsetsP, KEYLEN, basesP.length);
+     final int[] keysM=KeyRing.reverseComplementKeys(keysP, KEYLEN);
+
+     // assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP);
+     // assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM);
+
+     assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length)));
+     assert(keyScoresP!=null);
+     assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP);
+     final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1));
+     final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1));
+     final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+     assert(offsetsM.length==offsetsP.length);
+     assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM));
+
+     final int[] bestScores=new int[6];
+
+     //This prevents filtering by qscore when a low-quality read only uses a few keys.
+     //In that case, extending is more important.
+     final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5);
+
+     int[][] prescanResults=null;
+     int[] precounts=null;
+     int[] prescores=null;
+
+     int hitsCutoff=0;
+     int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+     //True only if the first key starts at base 0, the last key ends at the last base,
+     //and no two consecutive keys leave a gap between them.
+     boolean allBasesCovered=true;
+     {
+         if(offsetsP[0]!=0){allBasesCovered=false;}
+         else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;}
+         else{
+             for(int i=1; i<offsetsP.length; i++){
+                 if(offsetsP[i]>offsetsP[i-1]+KEYLEN){
+                     allBasesCovered=false;
+                     break;
+                 }
+             }
+         }
+     }
+
+     //TODO I don't understand this logic
+     final boolean pretendAllBasesAreCovered=(allBasesCovered ||
+             keysP.length>=keysOriginal.length-4 ||
+             (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f))));
+
+     // System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP));
+     // assert(allBasesCovered);
+
+     if(prescan_qscore){
+         prescanResults=prescanAllBlocks(bestScores,
+                 keysP, keyScoresP, offsetsP,
+                 keysM, keyScoresM, offsetsM,
+                 pretendAllBasesAreCovered);
+
+         if(prescanResults!=null){
+             precounts=prescanResults[0];
+             prescores=prescanResults[1];
+         }
+
+         if(bestScores[1]<MIN_APPROX_HITS_TO_KEEP){return result;}
+         if(bestScores[3]<maxQuickScore*MIN_QSCORE_MULT2){return result;}
+
+         if(bestScores[3]>=maxQuickScore && pretendAllBasesAreCovered){
+             assert(bestScores[3]==maxQuickScore);
+             assert(bestScores[1]==numHits);
+         }
+
+         hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true);
+         qscoreCutoff=calcQScoreCutoff(maxQuickScore, bestScores[3]/2, qscoreCutoff);
+     }
+
+     final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true);
+     final boolean fullyDefined=AminoAcid.isFullyDefined(basesP);
+     assert(bestScores[2]<=0) : Arrays.toString(bestScores);
+
+     //cycle numbers the (block, strand) pairs in the same order as prescanAllBlocks:
+     //plus strand first, then minus strand, for each block.
+     int cycle=0;
+     for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+         if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+             find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS,
+                     offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+         }
+         cycle++;
+         if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){
+             find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS,
+                     offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined);
+         }
+         cycle++;
+     }
+
+     // assert(Read.CHECKSITES(result, basesP));
+
+     return result;
+ }
+
+ /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped.
+  * Visits every block from minChrom to maxChrom and, within each block, the plus-strand arrays and
+  * then the minus-strand arrays; each (block, strand) pair gets its own slot, numbered in visiting order.
+  * A slot where fewer than minHitsToScore keys have hits receives score -9999 and count 0; otherwise it
+  * receives the two values returned by findMaxQscore2 (element 0 as score, element 1 as count).
+  * Side effects: raises bestScores[1] to the largest count seen and bestScores[3] to the largest score
+  * seen; on the normal exit path only, sets bestScores[2] to -9999 unless RETAIN_BEST_QCUTOFF is set.
+  * The returned arrays are the precountArray, prescoreArray and prescanReturn fields, not fresh copies.
+  * @param bestScores Running best values; elements 1, 2 and 3 may be modified
+  * @param keysP Plus-strand keys (keyScoresP and offsetsP are parallel to it)
+  * @param keysM Minus-strand keys (keyScoresM and offsetsM are parallel to it)
+  * @param allBasesCovered If true, the scan returns early once a slot reaches maxQuickScore
+  * @return Two-element array: [0]=count per slot, [1]=score per slot
+  */
+ private final int[][] prescanAllBlocks(int[] bestScores,
+ int[] keysP, int[] keyScoresP, int[] offsetsP,
+ int[] keysM, int[] keyScoresM, int[] offsetsM,
+ final boolean allBasesCovered){
+
+ int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; //pm[0]=plus-strand arrays, pm[1]=minus-strand arrays
+
+ int bestqscore=0;
+ int maxHits=0;
+ int minHitsToScore=MIN_APPROX_HITS_TO_KEEP;
+
+ final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP);
+
+ final int[] counts=precountArray;
+ final int[] scores=prescoreArray;
+ final int[][] ret=prescanReturn;
+ Arrays.fill(counts, keysP.length); //Slots start at the maximum; slots never scanned (early exit) keep it
+ Arrays.fill(scores, maxQuickScore);
+ ret[0]=counts;
+ ret[1]=scores;
+
+ int cycle=0; //Slot number of the current (block, strand) pair
+ for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){
+ final int baseChrom=baseChrom(chrom);
+ for(int pmi=0; pmi<2; pmi++, cycle++){
+
+ int[] keys=pm[pmi][0];
+ int[] keyScores=pm[pmi][1];
+ int[] offsets=pm[pmi][2];
+// int[][] hits=getHitArray(offsets.length);
+
+ int[] starts=startArray;
+ int[] stops=stopArray;
+ int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops);
+
+ if(numHits<minHitsToScore){ //Too few keys have hits in this block
+ scores[cycle]=-9999;
+ counts[cycle]=0;
+ }else{
+
+// final int maxQuickScore=maxQuickScore(offsets, keyScores);
+ // System.err.println("maxScore = "+maxScore);
+
+ if(numHits<keys.length){ //Some keys have no hits in this block: compact the arrays
+ int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+ if(r!=null){
+ starts=r[0];
+ stops=r[1];
+ offsets=r[2];
+ keyScores=r[4];
+ }
+ }
+
+ assert(numHits==offsets.length);
+ assert(numHits==keyScores.length);
+ heap.clear();
+ final Quad[] triples=tripleStorage;
+ final int[] values=valueArray;
+
+ int[] temp=findMaxQscore2(starts, stops, offsets, keyScores, baseChrom, triples, values, minHitsToScore, true,
+ bestqscore>=maxQuickScore && allBasesCovered);
+
+ scores[cycle]=temp[0];
+ counts[cycle]=temp[1];
+
+ bestqscore=Tools.max(temp[0], bestqscore);
+ maxHits=Tools.max(maxHits, temp[1]);
+ if(bestqscore>=maxQuickScore && allBasesCovered){
+ assert(bestqscore==maxQuickScore);
+ assert(maxHits==keysP.length) :
+ "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" +
+ "Scores: \t"+Arrays.toString(scores)+
+ "Counts: \t"+Arrays.toString(counts)+
+ "bestqscore: \t"+bestqscore+
+ "maxHits: \t"+maxHits+
+ "maxQuickScore: \t"+maxQuickScore+
+ "numHits: \t"+numHits+
+ "minHitsToScore: \t"+minHitsToScore+
+ "keys.length: \t"+keys.length;
+
+ minHitsToScore=Tools.max(minHitsToScore, maxHits);
+
+ {
+ //This early exit is optional. Does not seem to impact speed much either way.
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+ return ret; //NOTE(review): unlike the normal exit below, bestScores[2] is left untouched here - confirm intended
+ }
+ }
+ }
+ }
+ }
+
+ bestScores[1]=Tools.max(bestScores[1], maxHits);
+ bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+ if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+ return ret;
+ }
+
+
+ /**
+  * Search a single block and strand.
+  * Looks up the hit-list ranges of all keys in the block for chrom and, if at least
+  * MIN_APPROX_HITS_TO_KEEP keys have hits, walks them with slowWalk3.
+  * @param ssl List that receives the sites; may be replaced by the list slowWalk3 returns
+  * @param bestScores Running best values; zeroed first unless RETAIN_BEST_SCORES is set
+  * @return ssl unchanged if too few keys have hits, otherwise the list returned by slowWalk3
+  */
+ public final ArrayList<SiteScore> find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores,
+         final int chrom, final byte strand,
+         int[] offsets, final boolean obeyLimits, ArrayList<SiteScore> ssl, int[] bestScores,
+         final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+
+     assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+
+     final int[] rangeStarts=startArray;
+     final int[] rangeStops=stopArray;
+     final int keysWithHits=getHits(keys, chrom, Integer.MAX_VALUE, rangeStarts, rangeStops);
+
+     if(keysWithHits>=MIN_APPROX_HITS_TO_KEEP){
+         if(!RETAIN_BEST_SCORES){Arrays.fill(bestScores, 0);}
+         ssl=slowWalk3(rangeStarts, rangeStops, bases, baseScores, keyScores, offsets, chrom, strand, obeyLimits, ssl, bestScores, allBasesCovered, maxScore, fullyDefined);
+     }
+     return ssl;
+ }
+
+ /**
+  * Compress arrays by removing null/empty lists, i.e. entries whose start is negative.
+  * The compacted starts and stops are written into the startArray/stopArray fields, offsets into
+  * the array supplied by getOffsetArray, and key scores into a newly allocated array.
+  * @param len Number of leading entries of the input arrays to examine
+  * @return null if the number of kept entries equals offsets.length; otherwise the shrinkReturn3
+  * field with [0]=starts, [1]=stops, [2]=offsets, [4]=keyScores (slot 3 is not written here)
+  */
+ private final int[][] shrink(int[] starts, int[] stops, int[] offsets, int[] keyScores, final int len){
+     int kept=0;
+     for(int i=0; i<len; i++){
+         if(starts[i]>=0){kept++;}
+     }
+     if(kept==offsets.length){return null;}
+
+     final int[][] packed=shrinkReturn3;
+     final int[] starts2=startArray;
+     final int[] stops2=stopArray;
+     final int[] offsets2=getOffsetArray(kept);
+     final int[] keyScores2=new int[kept];
+
+     //dest never runs ahead of src, so compaction is safe even when an output array
+     //is the same object as the corresponding input array.
+     int dest=0;
+     for(int src=0; src<len; src++){
+         if(starts[src]<0){continue;}
+         starts2[dest]=starts[src];
+         stops2[dest]=stops[src];
+         offsets2[dest]=offsets[src];
+         keyScores2[dest]=keyScores[src];
+         dest++;
+     }
+
+     packed[0]=starts2;
+     packed[1]=stops2;
+     packed[2]=offsets2;
+     packed[4]=keyScores2;
+     return packed;
+ }
+
+ /**
+  * Removes "-1" keys (any negative key) together with their offsets and key scores.
+  * @return null if no key is negative; otherwise the shrinkReturn2 field with
+  * [0]=offsets, [1]=keys, [2]=keyScores, each holding only the surviving entries
+  */
+ private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){
+
+     int kept=0;
+     for(int i=0; i<keys.length; i++){
+         if(keys[i]>=0){kept++;}
+     }
+
+     assert(checkOffsets(offsets)) : Arrays.toString(offsets);
+     if(kept==keys.length){return null;}
+
+     final int[][] packed=shrinkReturn2;
+     final int[] offsets2=getOffsetArray(kept);
+     assert(offsets2!=offsets);
+     assert(offsets2.length<offsets.length);
+     final int[] keys2=new int[kept];
+     final int[] keyScores2=new int[kept];
+
+     int dest=0;
+     for(int src=0; src<keys.length; src++){
+         if(keys[src]<0){continue;}
+         offsets2[dest]=offsets[src];
+         keys2[dest]=keys[src];
+         keyScores2[dest]=keyScores[src];
+         dest++;
+     }
+     assert(checkOffsets(offsets2)) : "\nnumHits="+kept+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+
+             "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n";
+
+     packed[0]=offsets2;
+     packed[1]=keys2;
+     packed[2]=keyScores2;
+     return packed;
+ }
+
+
+ /**
+  * This uses a heap to track next column to increment.
+  * Each key ("column") contributes one hit list from the block; the walk repeatedly takes the
+  * heap's top site, counts how many columns currently sit within the indel window around it,
+  * and, if enough agree, scores the location and records a SiteScore (or merges it into the
+  * previous one when they overlap).
+  * @param starts Start of each key's hit-list range in the block's site array (negative = no hits)
+  * @param stops End (exclusive) of each key's range
+  * @param bases Read bases for this strand
+  * @param baseScores Per-base scores for this strand
+  * @param keyScores Per-key scores, parallel to offsets
+  * @param offsets Position of each key in the read
+  * @param baseChrom_ Chromosome identifying the block; normalized with baseChrom()
+  * @param strand Strand recorded in the SiteScores created
+  * @param obeyLimits Selects which multiplier is used for the minimum score
+  * @param ssl List to append to; a new list is created if null
+  * @param bestScores In/out running values: [0] top score, [1] max approximate hits, [2] quick-score
+  * cutoff, [3] best quick score, [4] max quick score, [5] number of perfect sites found
+  * @param allBasesCovered Enables the short-circuit that assigns maxScore without extending
+  * @param maxScore Highest attainable score for this read
+  * @param fullyDefined Required for a max-scoring site to be flagged perfect
+  * @return The list of sites (ssl, or the list created here)
+  */
+ private final ArrayList<SiteScore> slowWalk3(int[] starts, int[] stops, final byte[] bases,
+         final byte[] baseScores, int[] keyScores, int[] offsets,
+         final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList<SiteScore> ssl,
+         int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){
+     assert(USE_EXTENDED_SCORE);
+
+     final int numKeys=offsets.length; //Before shrink
+
+     //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc.
+     final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+     if(SHRINK_BEFORE_WALK){
+         int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length);
+         if(r!=null){
+             starts=r[0];
+             stops=r[1];
+             offsets=r[2];
+             keyScores=r[4];
+         }
+     }
+
+     final int numHits=offsets.length; //After shrink
+
+
+     assert(numHits==offsets.length);
+     assert(numHits==keyScores.length);
+
+     usedKeys+=numHits;
+     usedKeyIterations++;
+
+     final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5);
+
+     assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z));
+
+
+     // final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore));
+     final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore));
+     final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore);
+
+     final int baseChrom=baseChrom(baseChrom_);
+
+     heap.clear();
+
+     final Quad[] triples=tripleStorage;
+
+     final int[] values=valueArray;
+     final int[] sizes=sizeArray;
+     final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null);
+     final Block b=index[baseChrom];
+
+     if(ssl==null){ssl=new ArrayList<SiteScore>(8);}
+
+     int currentTopScore=bestScores[0];
+     int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH));
+
+     int qcutoff=Tools.max(bestScores[2], minQuickScore);
+     int bestqscore=bestScores[3];
+     int maxHits=bestScores[1];
+     int perfectsFound=bestScores[5];
+     assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits;
+     int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore);
+     if(approxHitsCutoff>numHits){return ssl;}
+
+     final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore);
+
+     if(currentTopScore>=maxScore){
+         assert(currentTopScore==maxScore);
+
+     }
+
+
+     //Seed the heap with the first site of every column, shifted back by the key's offset
+     //so that all columns refer to the read's start position.
+     for(int i=0; i<numHits; i++){
+         final int[] sites=b.sites;
+         final int start=starts[i];
+         sizes[i]=b.length(start, stops[i]);
+         assert(sizes[i]>0);
+
+         int a=sites[start];
+         int a2;
+         if((a&SITE_MASK)>=offsets[i]){
+             a2=a-offsets[i];
+         }else{
+             //Subtracting the offset would cross the chromosome boundary: clamp the site at 0
+             int ch=numberToChrom(a, baseChrom);
+             int st=numberToSite(a);
+             int st2=Tools.max(st-offsets[i], 0);
+             a2=toNumber(st2, ch);
+         }
+         assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+         Quad t=triples[i];
+         assert(t!=null) : "Should be using tripleStorage";
+         assert(i==t.column);
+         t.row=start;
+         t.site=a2;
+         t.list=sites;
+         values[i]=a2;
+
+         heap.add(t);
+     }
+
+     // System.out.println("\nEntering SS loop:");
+     // System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" +
+     // "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff);
+     // System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff);
+
+
+     SiteScore prevSS=null;
+     while(!heap.isEmpty()){
+         Quad t=heap.peek();
+         final int site=t.site;
+         final int centerIndex=t.column;
+
+         int maxNearbySite=site;
+
+
+         int approxHits=0;
+
+         {//Inner loop
+             //Count columns whose current site lies within the window; stop early once too
+             //many columns have missed for the cutoff to be reachable.
+             final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2;
+             for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){
+                 final int x=values[column];
+                 assert(x==triples[column].site);
+                 if(x>=minsite && x<=maxsite){
+                     maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+                     approxHits++;
+                 }else{chances--;}
+             }
+         }
+
+         assert(centerIndex>=0) : centerIndex;
+         assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+         if(approxHits>=approxHitsCutoff){
+
+             int score;
+             int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff);
+             if(ADD_SCORE_Z){
+                 int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+                 qscore+=scoreZ;
+             }
+
+             int mapStart=site, mapStop=maxNearbySite;
+
+             assert(USE_EXTENDED_SCORE);
+
+             boolean locArrayValid=false;
+             if(qscore<qcutoff){
+                 score=-1;
+             }else{
+
+                 final int chrom=numberToChrom(site, baseChrom);
+
+                 //TODO Note that disabling the shortCircuit code seems to make things run 2% faster (with identical results).
+                 //However, theoretically, shortCircuit should be far more efficient. Test both ways on cluster and on a larger run.
+                 //May have something to do with compiler loop optimizations.
+                 if(shortCircuit && qscore==maxQuickScore){
+                     assert(approxHits==numKeys);
+                     score=maxScore;
+                 }else{
+                     if(verbose){
+                         System.err.println("Extending "+Arrays.toString(values));
+                     }
+                     score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits);
+                     locArrayValid=true;
+
+                     if(verbose){
+                         System.err.println("score: "+score);
+                         System.err.println("locArray: "+Arrays.toString(locArray));
+                     }
+
+                     //Correct begin and end positions if they changed.
+                     int min=Integer.MAX_VALUE;
+                     int max=Integer.MIN_VALUE;
+                     for(int i=0; i<locArray.length; i++){
+                         int x=locArray[i];
+                         if(x>-1){
+                             if(x<min){min=x;}
+                             if(x>max){max=x;}
+                         }
+                     }
+
+                     if(score>=maxScore){
+                         assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+
+                                 ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+                     }
+
+                     // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger?
+                     if(min<0 || max<0){
+                         System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+
+                                 chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+
+                                 Arrays.toString(locArray)+"\n"+
+                                 Arrays.toString(values)+"\n"+
+                                 new String(bases)+"\nstrand="+strand+"\n");
+                         System.err.println();
+                         score=-99999;
+                     }
+
+                     //mapStart and mapStop are indices
+                     mapStart=toNumber(min, chrom);
+                     mapStop=toNumber(max, chrom);
+
+                     if(score>=maxScore){
+                         assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+
+                                 ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n";
+                     }
+                 }
+
+                 // if(score==maxScore){//Disabled for Skimmer version
+                 // qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT));
+                 // approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true);
+                 // }
+
+                 if(score>=cutoff){
+                     qcutoff=calcQScoreCutoff(maxQuickScore, qscore, qcutoff);
+                     bestqscore=Tools.max(qscore, bestqscore);
+                 }
+             }
+
+             if(score>=cutoff){
+
+                 if(score>currentTopScore){
+                     // System.err.println("New top score!");
+
+                     if(DYNAMICALLY_TRIM_LOW_SCORES){
+
+                         maxHits=Tools.max(approxHits, maxHits);
+                         approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore);
+                         cutoff=calcScoreCutoff(maxScore, currentTopScore, cutoff);
+                     }
+
+                     currentTopScore=score;
+
+                     // System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")");
+                 }
+
+                 final int chrom=numberToChrom(mapStart, baseChrom);
+                 final int site2=numberToSite(mapStart);
+                 final int site3=numberToSite(mapStop)+bases.length-1;
+
+                 assert(NUM_CHROM_BITS==0 || site2<SITE_MASK-1000) : "chrom="+chrom+", strand="+strand+", site="+site+
+                         ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length+
+                         "\n\n"+Arrays.toString(b.getHitList(centerIndex));
+                 assert(site2<site3) : "chrom="+chrom+", strand="+strand+", site="+site+
+                         ", maxNearbySite="+maxNearbySite+", site2="+site2+", site3="+site3+", read.length="+bases.length;
+
+
+                 int[] gapArray=null;
+                 if(site3-site2>=MINGAP+bases.length){
+                     assert(locArrayValid) : "Loc array was not filled.";
+                     // System.err.println("****\n"+Arrays.toString(locArray)+"\n");
+                     // int[] clone=locArray.clone();
+                     gapArray=makeGapArray(locArray, site2, MINGAP);
+                     if(gapArray!=null){
+                         // System.err.println(Arrays.toString(locArray)+"\n");
+                         // System.err.println(Arrays.toString(gapArray));
+                         //
+                         //// int sub=site2-mapStart;//thus site2=mapStart+sub
+                         //// for(int i=0; i<gapArray.length; i++){
+                         //// gapArray[i]+=sub;
+                         //// }
+                         //// System.err.println(Arrays.toString(gapArray));
+                         //
+                         // System.err.println(mapStart+" -> "+site2);
+                         // System.err.println(mapStop+" -> "+site3);
+
+                         assert(gapArray[0]>=site2 && gapArray[0]-site2<bases.length);
+                         assert(gapArray[gapArray.length-1]<=site3 && site3-gapArray[gapArray.length-1]<bases.length) : "\n"+
+                                 mapStart+" -> "+site2+"\n"+
+                                 mapStop+" -> "+site3+"\n\n"+
+                                 Arrays.toString(gapArray)+"\n\n"+
+                                 // Arrays.toString(clone)+"\n\n"+
+                                 Arrays.toString(locArray)+"\n"+
+                                 "numHits="+numHits+", "+
+                                 "heap.size="+heap.size()+", "+
+                                 "numHits="+numHits+", "+
+                                 "approxHits="+approxHits+"\n";
+                         gapArray[0]=Tools.min(gapArray[0], site2);
+                         gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3);
+                     }
+                     if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));}
+                     // assert(false) : Arrays.toString(locArray);
+                 }
+
+
+                 //This block is optional, but tries to eliminate multiple identical alignments
+
+                 SiteScore ss=null;
+                 final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined;
+                 final boolean inbounds=(site2>=0 && site3<Data.chromLengths[chrom]);
+                 // if(!inbounds){System.err.println("Index tossed out-of-bounds site chr"+chrom+", "+site2+"-"+site3);}
+
+                 if(inbounds && !SEMIPERFECTMODE && !PERFECTMODE && gapArray==null && prevSS!=null &&
+                         prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){
+
+                     final int betterScore=Tools.max(score, prevSS.score);
+                     final int minStart=Tools.min(prevSS.start, site2);
+                     final int maxStop=Tools.max(prevSS.stop, site3);
+                     final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined;
+                     assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect);
+
+                     final boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length));
+
+                     if(prevSS.start==site2 && prevSS.stop==site3){
+                         prevSS.score=prevSS.quickScore=betterScore;
+                         prevSS.perfect=(prevSS.perfect || perfect1 || perfect2);
+                         if(prevSS.perfect){prevSS.semiperfect=true;}
+                     }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2 && !prevSS.semiperfect){
+                         if(perfect2){
+                             //do nothing
+                         }else if(perfect1){
+                             prevSS.setStop(site3);
+                             if(!prevSS.perfect){perfectsFound++;}//***$
+                             prevSS.perfect=prevSS.semiperfect=true;
+                         }else{
+                             prevSS.setStop(maxStop);
+                             prevSS.setPerfect(bases);
+                         }
+                         prevSS.score=prevSS.quickScore=betterScore;
+                     }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3 && !prevSS.semiperfect){
+                         if(perfect2){
+                             //do nothing
+                         }else if(perfect1){
+                             prevSS.setStart(site2);
+                             if(!prevSS.perfect){perfectsFound++;}//***$
+                             prevSS.perfect=prevSS.semiperfect=true;
+                         }else{
+                             prevSS.setStart(minStart);
+                             prevSS.setPerfect(bases);
+                         }
+                         prevSS.score=prevSS.quickScore=betterScore;
+                     }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH)
+                             && !perfect1 && !perfect2 && !prevSS.semiperfect){
+                         prevSS.setLimits(minStart, maxStop);
+                         prevSS.score=prevSS.quickScore=betterScore;
+                         prevSS.setPerfect(bases);
+                     }else{
+                         ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+                         if(!perfect1){ss.setPerfect(bases);}
+                         if(verbose){System.err.println("A) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+                         assert(!perfect1 || ss.stop-ss.start==bases.length-1);
+                     }
+                     assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1);
+                 }else if(inbounds){
+                     ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1);
+                     if(!perfect1){ss.setPerfect(bases);}
+                     ss.gaps=gapArray;
+                     if(verbose){System.err.println("B) Index made SiteScore "+ss.toText()+", "+Arrays.toString(ss.gaps));}
+                 }
+
+                 assert(ss==null || !ss.perfect || ss.semiperfect) : ss;
+                 assert(prevSS==null || !prevSS.perfect || prevSS.semiperfect) : "\n"+SiteScore.header()+"\n"+ss+"\n"+prevSS;
+                 //The null check must guard BOTH mode tests. ss is null for out-of-bounds sites, and
+                 //without the outer parentheses "a && b || c" parses as "(a && b) || c", so
+                 //PERFECTMODE would dereference a null ss.
+                 if(ss!=null && ((SEMIPERFECTMODE && !ss.semiperfect) || (PERFECTMODE && !ss.perfect))){ss=null;}
+
+
+                 if(ss!=null){
+                     // System.out.println("Added site "+ss.toText()+", qscore="+qscore);
+                     ssl.add(ss);
+                     if(ss.perfect){
+
+                         if(prevSS==null || !prevSS.perfect || !ss.overlaps(prevSS)){
+                             if(prevSS==null){assert ssl.size()<2 || !ss.overlaps(ssl.get(ssl.size()-2));}
+                             perfectsFound++;
+
+                             //Human-specific code
+                             // if(QUIT_AFTER_TWO_PERFECTS){
+                             // if(perfectsFound>=3 || (perfectsFound>=2 && chrom<24)){break;}
+                             // }
+
+                             // if(QUIT_AFTER_TWO_PERFECTS && perfectsFound>=2){break;}
+                         }
+                     }
+
+                     prevSS=ss;
+                 }else{
+                     // System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText());
+                 }
+             }
+         }
+
+         while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+             final Quad t2=heap.poll();
+             final int row=t2.row+1, col=t2.column;
+             if(row<stops[col]){
+                 t2.row=row;
+
+                 int a=t2.list[row];
+                 int a2;
+                 if((a&SITE_MASK)>=offsets[col]){
+                     a2=a-offsets[col];
+
+                     assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+                         "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+                         ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+                 }else{
+                     int ch=numberToChrom(a, baseChrom);
+                     int st=numberToSite(a);
+                     int st2=Tools.max(st-offsets[col], 0);
+                     a2=toNumber(st2, ch);
+
+                     assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+                         "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+                         ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+                 }
+
+                 assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+                     "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+
+                     ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+                 t2.site=a2;
+                 values[col]=a2;
+                 heap.add(t2);
+             }else if(heap.size()<approxHitsCutoff || PERFECTMODE){
+                 //A column is exhausted and too few columns remain to reach the cutoff
+                 //(or perfect mode needs every column): publish the running values and stop.
+                 assert(USE_EXTENDED_SCORE);
+                 bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+                 bestScores[1]=Tools.max(bestScores[1], maxHits);
+                 bestScores[2]=Tools.max(bestScores[2], qcutoff);
+                 bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+                 bestScores[4]=maxQuickScore;
+                 bestScores[5]=perfectsFound; //***$ fixed by adding this line
+                 if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+                 return ssl;
+             }
+             if(heap.isEmpty()){
+                 assert(false) : heap.size()+", "+approxHitsCutoff;
+                 break;
+             }
+         }
+
+     }
+
+     assert(USE_EXTENDED_SCORE);
+     bestScores[0]=Tools.max(bestScores[0], currentTopScore);
+     bestScores[1]=Tools.max(bestScores[1], maxHits);
+     bestScores[2]=Tools.max(bestScores[2], qcutoff);
+     bestScores[3]=Tools.max(bestScores[3], bestqscore);
+
+     bestScores[4]=maxQuickScore;
+     bestScores[5]=perfectsFound;
+     if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;}
+
+     return ssl;
+ }
+
+ /**
+  * Pre-scan over the hit lists of one index block: walks candidate sites through the shared heap and
+  * tracks the best quick score (plus scoreZ2 when ADD_SCORE_Z is set) without building a site list.
+  *
+  * @param starts First row of each key's hit list inside the block's sites array
+  * @param stops Exclusive end row of each key's hit list (a list is consumed while row is below stops[col])
+  * @param offsets Read offset of each key; subtracted from every hit so that columns become comparable
+  * @param keyScores Per-key scores handed to quickScore and maxQuickScore
+  * @param baseChrom_ A chromosome of the block to scan; reduced with baseChrom()
+  * @param triples Preallocated Quads, one per column (asserted: triples[i].column==i)
+  * @param values Scratch array; holds the current offset-adjusted site of every column
+  * @param prevMaxHits Lower bound for the approximate-hits cutoff (asserted to be at most the number of keys)
+  * @param earlyExit If true, return as soon as the maximum quick score is reached, or when a list runs out
+  * and either perfectOnly is set or fewer lists than the cutoff remain on the heap
+  * @param perfectOnly If true, every key must hit (cutoff = number of keys) and no leftward slack is allowed
+  * @return {topQscore, maxHits}; topQscore is still -999999999 if no site met the cutoff
+  */
+ private final int[] findMaxQscore2(final int[] starts, final int[] stops, final int[] offsets, final int[] keyScores,
+ final int baseChrom_, final Quad[] triples, final int[] values, final int prevMaxHits,
+ boolean earlyExit, boolean perfectOnly){
+
+ final int numHits=offsets.length;
+ assert(numHits>=prevMaxHits);
+
+ final int baseChrom=baseChrom(baseChrom_);
+ final Block b=index[baseChrom];
+ final int[] sizes=sizeArray;
+
+ heap.clear();
+ for(int i=0; i<numHits; i++){ //Seed the heap with the first entry of every key's list
+ final int[] sites=b.sites;
+ final int start=starts[i];
+ sizes[i]=b.length(start, stops[i]);
+ assert(sizes[i]>0);
+
+ int a=sites[start];
+ int a2;
+ if((a&SITE_MASK)>=offsets[i]){ //Site part is large enough to subtract the key offset directly
+ a2=a-offsets[i];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[i], 0); //Clamp at position 0 rather than borrowing from the chrom bits
+ a2=toNumber(st2, ch);
+ }
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom));
+
+ Quad t=triples[i];
+ assert(t!=null) : "Should be using tripleStorage";
+ assert(i==t.column);
+ t.row=start;
+ t.site=a2;
+ t.list=sites;
+ values[i]=a2;
+
+ heap.add(t);
+ }
+
+ final int maxQuickScore=maxQuickScore(offsets, keyScores);
+
+ int topQscore=-999999999;
+
+ int maxHits=0;
+// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP;
+
+
+ int approxHitsCutoff;
+ final int indelCutoff;
+ if(perfectOnly){
+ approxHitsCutoff=numHits;
+ indelCutoff=0;
+ }else{
+ approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy
+ indelCutoff=MAX_INDEL2;
+ }
+
+
+ while(!heap.isEmpty()){
+ Quad t=heap.peek(); //Current candidate; presumably the lowest site on the heap - confirm against QuadHeap
+ final int site=t.site;
+ final int centerIndex=t.column;
+
+ int maxNearbySite=site;
+
+
+ int approxHits=0;
+
+ {//Inner loop
+ final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2;
+ for(int column=0, chances=numHits-approxHitsCutoff; column<numHits && chances>=0; column++){ //Stops once the cutoff is out of reach
+ final int x=values[column];
+ assert(x==triples[column].site);
+ if(x>=minsite && x<=maxsite){
+ maxNearbySite=(x>maxNearbySite ? x : maxNearbySite);
+ approxHits++;
+ }else{chances--;}
+ }
+ }
+
+ assert(centerIndex>=0) : centerIndex;
+ assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column;
+ if(approxHits>=approxHitsCutoff){
+
+ int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits);
+
+ if(ADD_SCORE_Z){
+ int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits);
+ qscore+=scoreZ;
+ }
+
+ if(qscore>topQscore){
+
+// maxHits=Tools.max(approxHits, maxHits);
+// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan
+
+ maxHits=Tools.max(approxHits, maxHits);
+ approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan
+
+ topQscore=qscore;
+
+ if(qscore>=maxQuickScore){
+ assert(qscore==maxQuickScore);
+ assert(approxHits==numHits);
+ if(earlyExit){
+ return new int[] {topQscore, maxHits};
+ }
+ }
+ }
+ }
+
+ while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements
+ final Quad t2=heap.poll();
+ final int row=t2.row+1, col=t2.column;
+ if(row<stops[col]){
+ t2.row=row;
+
+ int a=t2.list[row];
+ int a2;
+ if((a&SITE_MASK)>=offsets[col]){
+ a2=a-offsets[col];
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }else{
+ int ch=numberToChrom(a, baseChrom);
+ int st=numberToSite(a);
+ int st2=Tools.max(st-offsets[col], 0);
+ a2=toNumber(st2, ch);
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+ }
+
+ assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) :
+ "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+
+ ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col];
+
+ t2.site=a2;
+ values[col]=a2;
+ heap.add(t2);
+ }else if(earlyExit && (perfectOnly || heap.size()<approxHitsCutoff)){ //This column's list is exhausted and is not re-added
+ return new int[] {topQscore, maxHits};
+ }
+ if(heap.isEmpty()){break;} //Guards the heap.peek() in the loop condition
+ }
+
+ }
+
+
+
+ return new int[] {topQscore, maxHits};
+ }
+
+
+ /** Returns the larger argument minus the smaller one (0 when they are equal). */
+ private static final int absdif(int a, int b){
+     if(a>b){return a-b;}
+     return b-a;
+ }
+
+
+ /**
+  * Highest score a read can reach under the active scoring scheme.
+  * Affine scoring defers to msa.maxQuality; extended scoring awards BASE_HIT_SCORE plus a fifth of it
+  * per base (plus the summed base scores when useQuality is set); otherwise the quick-score maximum is used.
+  */
+ final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){
+
+     if(USE_AFFINE_SCORE){
+         //The quality-aware form apparently MUST be used if quality is used later on for slow align.
+         return useQuality ? msa.maxQuality(baseScores) : msa.maxQuality(readlen);
+     }
+
+     if(USE_EXTENDED_SCORE){
+         final int perBase=readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);
+         return useQuality ? perBase+Tools.sumInt(baseScores) : perBase;
+     }
+
+     return maxQuickScore(offsets, keyScores);
+ }
+
+
+ /**
+  * Upper bound of the quick score for a key set: the sum of all key scores, the maximum Z term when
+  * ADD_SCORE_Z is set, and both Y multipliers applied to the distance between the first and last key offset.
+  * Earlier variants (a flat per-key score, a list-size bonus, an end-match bonus) are no longer part of the bound.
+  */
+ public final int maxQuickScore(int[] offsets, int[] keyScores){
+
+     int keyTotal=Tools.intSum(keyScores);
+     final int span=offsets[offsets.length-1]-offsets[0];
+     final int spanTerm=(Y_SCORE_MULT+Y2_SCORE_MULT)*span;
+
+     if(ADD_SCORE_Z){keyTotal+=maxScoreZ(offsets);}
+
+     return keyTotal+spanTerm;
+ }
+
+
+ /**
+  * Quick score of the candidate site held by column centerIndex.
+  * A single approximate hit scores just that key. Otherwise the center key's score is combined with the
+  * scores gathered to its left and right, the two Y terms, and (if enabled) the list-size bonus of the center key.
+  */
+ private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[],
+         int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){
+
+     final int centerKey=keyScores[centerIndex];
+     if(numApproxHits==1){return centerKey;}
+
+     //Score outward from the chunk that exactly hits the center: first leftward, then rightward.
+     final int leftPart=scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels);
+     final int rightPart=scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits);
+
+     //Subtracting centerIndex breaks ties between otherwise identical match patterns
+     //(e.g. the two candidates produced by a small indel) in favor of the lower site.
+     int keyTerm=centerKey+leftPart+rightPart-centerIndex;
+
+     final int spreadTerm=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets)+Y2_SCORE_MULT*scoreY2(locs, centerIndex, offsets);
+     if(ADD_LIST_SIZE_BONUS){keyTerm+=calcListSizeBonus(sizes[centerIndex]);}
+     return keyTerm+spreadTerm;
+ }
+
+
+ /**
+  * Read-offset distance between the outermost keys whose sites lie within Y2_INDEL of the center site.
+  * The left end is the first column below centerIndex that qualifies; the right end is the last column
+  * above centerIndex that qualifies; either end falls back to centerIndex when nothing qualifies.
+  */
+ public final int scoreY2(int[] locs, int centerIndex, int offsets[]){
+     final int center=locs[centerIndex];
+
+     //Leftmost qualifying column, scanning upward from column 0.
+     int leftIndex=0;
+     while(leftIndex<centerIndex && absdif(locs[leftIndex], center)>Y2_INDEL){
+         leftIndex++;
+     }
+
+     //Rightmost qualifying column, scanning downward from the last key.
+     int rightIndex=centerIndex;
+     int probe=offsets.length-1;
+     while(probe>centerIndex){
+         if(absdif(locs[probe], center)<=Y2_INDEL){
+             rightIndex=probe;
+             break;
+         }
+         probe--;
+     }
+
+     return offsets[rightIndex]-offsets[leftIndex];
+ }
+
+
+// /** Generates a term that increases score with how many bases in the read match the ref. */
+// public static final int scoreZ(int[] locs, int centerIndex, int offsets[]){
+// final int center=locs[centerIndex];
+//
+// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE];
+//
+// final int maxLoc=center+MAX_INDEL2;
+// final int minLoc=Tools.max(0, center-MAX_INDEL);
+//
+// int score=0;
+//
+// for(int i=0; i<locs.length; i++){
+// int loc=locs[i];
+//// int dif=absdif(loc, center);
+// if(loc>=minLoc && loc<=maxLoc){
+//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+
+//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets);
+//
+// int offset=offsets[i];
+// int max=CHUNKSIZE+offset;
+//
+// for(int j=offset; j<max; j++){
+// int old=refLoc[j];
+// if(old==0){
+// refLoc[j]=loc;
+// score+=4;
+// }else if(old>loc){
+// refLoc[j]=loc;
+// score-=2;
+// }else if(old==loc){
+// score-=1;
+// //do nothing, perhaps, or add 1?
+// }else{
+// score-=2;
+// assert(old<loc);
+// }
+// }
+// }
+// }
+// return score;
+// }
+
+
+ /**
+  * Scores a candidate site by extending every nearby key hit along the reference.
+  * locArray is reset to -1 and then filled, per read base, with the site part (refbase) of the key hit
+  * whose extension matched that base: first walking backward from the last base of each key, then
+  * forward from the base after each key. Read bases equal to 'N' are finally marked -2.
+  * Only hits whose value lies in [centerVal-MAX_INDEL, centerVal+MAX_INDEL2] take part.
+  * With USE_AFFINE_SCORE the filled array is scored by msa.calcAffineScore; otherwise each filled base
+  * earns BASE_HIT_SCORE+baseScores[i], bases attributed to the center site earn BASE_HIT_SCORE/5 more,
+  * and every change of site between consecutive filled bases costs a capped indel penalty.
+  *
+  * @param bases Read bases
+  * @param baseScores Per-base score added for each matched base
+  * @param offsets Read offset of each key
+  * @param values Current site value of each key column
+  * @param chrom Chromosome whose base array is fetched with Data.getChromosome(chrom)
+  * @param centerIndex Column of the key that defines the candidate site
+  * @param locArray Scratch array, overwritten; indexed by read position
+  * @param numHits Number of key columns to examine
+  * @param numApproxHits Not referenced in this method body
+  * @return The extended (or affine) score of the site
+  */
+ private final int extendScore(final byte[] bases, final byte[] baseScores, final int[] offsets, final int[] values,
+ final int chrom, final int centerIndex, final int[] locArray, final int numHits, final int numApproxHits){
+ callsToExtendScore++;
+
+ final int centerVal=values[centerIndex];
+ final int centerLoc=numberToSite(centerVal);
+
+ final int minLoc=Tools.max(0, centerLoc-MAX_INDEL); //Legacy, for assertions
+ final int maxLoc=centerLoc+MAX_INDEL2; //Legacy, for assertions
+
+ final int minVal=centerVal-MAX_INDEL;
+ final int maxVal=centerVal+MAX_INDEL2;
+
+// System.out.println("Min, center, max = "+minLoc+", "+center+", "+ maxLoc);
+// System.out.println("centerIndex = "+centerIndex);
+
+ final byte[] ref=Data.getChromosome(chrom).array;
+
+// int[] locArray=new int[bases.length];
+ Arrays.fill(locArray, -1);
+
+
+ //First fill in reverse
+ for(int i=0, keynum=0; i<numHits; i++){
+ final int value=values[i];
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+ assert(refbase>=minLoc && refbase<=maxLoc);
+
+ // System.out.println("Reverse: Trying key "+refbase+" @ "+offsets[i]);
+ // System.out.println("Passed!");
+ keynum++;
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc<ref.length; cloc--, rloc--){ //Starts on the key's last base, so the key itself is filled too
+ int old=locArray[cloc];
+ if(old==refbase){
+ // System.out.println("Broke because old="+old+", refbase="+refbase);
+ break;
+ } //Already filled with present value
+ if(misses>0 && old>=0){
+ // System.out.println("Broke because old="+old+", misses="+misses);
+ break;
+ } //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ //Only extends first key all the way back. Others stop at the first error.
+ if(old>=0 || keynum>1){
+ // System.out.println("Broke because old="+old+", keynum="+keynum);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+
+
+ //Then fill forward
+ for(int i=0; i<numHits; i++){
+ final int value=values[i];
+
+ if(value>=minVal && value<=maxVal){
+ final int refbase=numberToSite(value);
+ assert(refbase>=minLoc && refbase<=maxLoc);
+ final int callbase=offsets[i];
+
+ int misses=0;
+ for(int cloc=callbase+KEYLEN, rloc=refbase+cloc; cloc<bases.length && rloc<ref.length; cloc++, rloc++){ //Starts on the first base after the key
+ int old=locArray[cloc];
+ if(old==refbase){break;} //Already filled with present value
+ if(misses>0 && old>=0){break;} //Already filled with something that has no errors
+ byte c=bases[cloc];
+ byte r=ref[rloc];
+
+ if(c==r){
+ if(old<0 || refbase==centerLoc){ //If the cell is empty or this key corresponds to center
+ locArray[cloc]=refbase;
+ }
+ }else{
+ misses++;
+ if(old>=0){break;} //Already filled with something that has no errors
+ }
+ }
+ }
+ }
+
+// //Change 'N' to -2. A bit slow.
+// {
+// int firstMatch=0;
+// while(firstMatch<locArray.length && locArray[firstMatch]<0){firstMatch++;}
+// assert(firstMatch<locArray.length) : new String(bases);
+// int last=locArray[firstMatch];
+// for(int i=firstMatch-1; i>=0; i--){
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else{
+// assert(locArray[i]==-1);
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// for(int i=firstMatch; i<locArray.length; i++){
+// final int loc=locArray[i];
+// if(last<1){last=loc;}
+// final byte c=bases[i];
+// if(c=='N'){locArray[i]=-2;}
+// else if(loc==-1 && last>0){
+// final int rloc=last+i;
+// byte r=ref[rloc];
+// if(r=='N'){locArray[i]=-2;}
+// }
+// }
+// }
+
+ //Change 'N' to -2, but only for nocalls, not norefs. Much faster.
+ {
+ final byte nb=(byte)'N';
+ for(int i=0; i<bases.length; i++){
+ if(bases[i]==nb){locArray[i]=-2;}
+ }
+ }
+
+ if(USE_AFFINE_SCORE){
+ /* TODO - sometimes returns a higher score than actual alignment. This should never happen. */
+ int score=(KFILTER<2 ? msa.calcAffineScore(locArray, baseScores, bases) :
+ msa.calcAffineScore(locArray, baseScores, bases, KFILTER));
+ return score;
+ }
+
+ int score=0;
+ int lastLoc=-1;
+ int centerBonus=BASE_HIT_SCORE/5;
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+ if(loc>=0){ //Skips unfilled (-1) and 'N' (-2) positions
+ score+=BASE_HIT_SCORE+baseScores[i];
+ if(loc==centerLoc){score+=centerBonus;}
+ if(loc!=lastLoc && lastLoc>=0){ //Site changed between consecutive filled bases: charge an indel
+ int dif=absdif(loc, lastLoc);
+ int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT);
+ score-=penalty;
+ }
+ lastLoc=loc;
+ }
+ }
+
+// System.err.println("Extended score: "+score);
+// System.err.println(Arrays.toString(locArray));
+
+
+ return score;
+ }
+
+
+ /** NOTE! This destroys the locArray, so use a copy if needed. */
+ private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){
+ int gaps=0;
+ boolean doSort=false;
+
+ if(locArray[0]<0){locArray[0]=minLoc;}
+ for(int i=1; i<locArray.length; i++){
+ if(locArray[i]<0){locArray[i]=locArray[i-1]+1;}
+ else{locArray[i]+=i;}
+ if(locArray[i]<locArray[i-1]){doSort=true;}
+ }
+
+// System.err.println(Arrays.toString(locArray)+"\n");
+
+ if(doSort){
+// System.err.println("*");
+ Arrays.sort(locArray);
+ }
+// System.err.println(Arrays.toString(locArray)+"\n");
+
+ for(int i=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ gaps++;
+ }
+ }
+ if(gaps<1){return null;}
+ int[] out=new int[2+gaps*2];
+ out[0]=locArray[0];
+ out[out.length-1]=locArray[locArray.length-1];
+
+ for(int i=1, j=1; i<locArray.length; i++){
+ int dif=locArray[i]-locArray[i-1];
+ assert(dif>=0);
+ if(dif>minGap){
+ out[j]=locArray[i-1];
+ out[j+1]=locArray[i];
+ j+=2;
+ }
+ }
+ return out;
+ }
+
+
+ /**
+  * Z term of the quick score: the number of read positions covered by the keys whose site lies in
+  * [max(0, center-MAX_INDEL), center+MAX_INDEL2], multiplied by Z_SCORE_MULT.
+  * Overlapping or touching key intervals are merged into runs, which assumes offsets are ascending.
+  * A single approximate hit returns SCOREZ_1KEY.
+  */
+ private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){
+
+     if(numApproxHits==1){return SCOREZ_1KEY;}
+
+     final int center=locs[centerIndex];
+     final int lowest=Tools.max(0, center-MAX_INDEL);
+     final int highest=center+MAX_INDEL2;
+
+     int covered=0;
+     int runStart=-1, runEnd=-1;
+
+     for(int k=0; k<numHits; k++){
+         final int loc=locs[k];
+         if(loc<lowest || loc>highest){continue;}
+
+         final int keyStart=offsets[k];
+         if(runEnd<keyStart){
+             //The key starts past the current run: bank the run and open a new one.
+             covered+=runEnd-runStart;
+             runStart=keyStart;
+         }
+         runEnd=keyStart+KEYLEN;
+     }
+     covered+=runEnd-runStart;
+     return covered*Z_SCORE_MULT;
+ }
+
+ /**
+  * Brute-force counterpart of scoreZ2, kept only to verify it: marks every read position covered by a
+  * key whose site lies near the center and counts the marked positions.
+  * Unlike scoreZ2 it iterates over all of locs rather than the first numHits columns.
+  */
+ @Deprecated
+ private final int scoreZslow(int[] locs, int centerIndex, int offsets[], boolean display){
+     final int center=locs[centerIndex];
+
+     final int highest=center+MAX_INDEL2;
+     final int lowest=Tools.max(0, center-MAX_INDEL);
+
+     final byte[] covered=new byte[offsets[offsets.length-1]+KEYLEN];
+     int count=0;
+
+     for(int k=0; k<locs.length; k++){
+         final int loc=locs[k];
+         if(loc<lowest || loc>highest){continue;}
+
+         final int begin=offsets[k];
+         for(int p=begin; p<begin+KEYLEN; p++){
+             if(covered[p]==0){count++;}
+             covered[p]=1;
+         }
+     }
+
+     if(display){System.err.println("\n"+Arrays.toString(covered)+"\n");}
+
+     return count*Z_SCORE_MULT;
+ }
+
+ /**
+  * Largest possible Z term: the number of read positions covered when every key hits,
+  * multiplied by Z_SCORE_MULT. Key intervals [offset, offset+KEYLEN) that overlap or touch are merged,
+  * which assumes offsets are ascending.
+  */
+ private final int maxScoreZ(int offsets[]){
+     int covered=0;
+     int runStart=-1, runEnd=-1;
+
+     for(final int keyStart : offsets){
+         if(runEnd<keyStart){
+             covered+=runEnd-runStart;
+             runStart=keyStart;
+         }
+         runEnd=keyStart+KEYLEN;
+     }
+     covered+=runEnd-runStart;
+     return covered*Z_SCORE_MULT;
+ }
+
+
+ /**
+  * Sums the contribution of the keys to the right of centerIndex (columns centerIndex+1 to numHits-1).
+  * A key counts when its site is non-negative and within MAX_INDEL of the most recently accepted site;
+  * accepted keys add their key score (plus the list-size bonus if enabled) and, when penalizeIndels is
+  * set and the site shifted, lose a capped indel penalty.
+  */
+ private final int scoreRight(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels, int numHits){
+
+     int total=0;
+     int anchor=locs[centerIndex]; //Site of the last accepted key
+
+     for(int k=centerIndex+1; k<numHits; k++){
+         final int candidate=locs[k];
+         if(candidate<0){continue;}
+
+         final int shift=absdif(candidate, anchor);
+         if(shift>MAX_INDEL){continue;} //Too far away: the anchor stays where it was
+
+         anchor=candidate;
+         total+=keyScores[k];
+         if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[k]);}
+
+         if(penalizeIndels && shift!=0){
+             total-=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*shift, MAX_PENALTY_FOR_MISALIGNED_HIT);
+         }
+     }
+     return total;
+
+ }
+
+ /**
+  * Sums the contribution of the keys to the left of centerIndex (columns centerIndex-1 down to 0)
+  * and increments callsToScore. A key counts when its site is non-negative and within MAX_INDEL of the
+  * most recently accepted site; accepted keys add their key score (plus the list-size bonus if enabled)
+  * and, when penalizeIndels is set and the site shifted, lose a capped indel penalty.
+  */
+ private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){
+
+     callsToScore++;
+
+     int total=0;
+     int anchor=locs[centerIndex]; //Site of the last accepted key
+
+     for(int k=centerIndex-1; k>=0; k--){
+         final int candidate=locs[k];
+         if(candidate<0){continue;}
+
+         final int shift=absdif(candidate, anchor);
+         if(shift>MAX_INDEL){continue;} //Too far away: the anchor stays where it was
+
+         anchor=candidate;
+         total+=keyScores[k];
+         if(ADD_LIST_SIZE_BONUS){total+=calcListSizeBonus(sizes[k]);}
+
+         if(penalizeIndels && shift!=0){
+             total-=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*shift, MAX_PENALTY_FOR_MISALIGNED_HIT);
+         }
+     }
+     return total;
+
+ }
+
+ /**
+  * Packs a (site, chrom) pair into one int: the chromosome's low bits (chrom&amp;CHROM_MASK_LOW)
+  * are shifted left by SHIFT_LENGTH and OR-ed with the site.
+  */
+ private static final int toNumber(int site, int chrom){
+     final int chromBits=(chrom&CHROM_MASK_LOW)<<SHIFT_LENGTH;
+     return chromBits|site;
+ }
+
+ /**
+  * Recovers the chromosome from a packed number: the bits above SHIFT_LENGTH give the position
+  * inside the block, which is added to the high bits of baseChrom.
+  * baseChrom is asserted to be non-negative with its low (in-block) bits clear.
+  */
+ private static final int numberToChrom(int number, int baseChrom){
+     assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     final int withinBlock=(number>>>SHIFT_LENGTH);
+     return withinBlock+(baseChrom&CHROM_MASK_HIGH);
+ }
+
+ /** Extracts the site (the bits selected by SITE_MASK) from a packed number. */
+ private static final int numberToSite(int number){
+     final int site=number&SITE_MASK;
+     return site;
+ }
+
+ public static final int minChrom(int chrom){return Tools.max(MINCHROM, chrom&CHROM_MASK_HIGH);} //chrom with its in-block bits cleared, but never below MINCHROM
+ public static final int baseChrom(int chrom){return Tools.max(0, chrom&CHROM_MASK_HIGH);} //chrom with its in-block bits cleared, but never below 0
+ public static final int maxChrom(int chrom){return Tools.max(MINCHROM, Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW));} //chrom with its in-block bits all set, clamped to [MINCHROM, MAXCHROM]
+
+
+ /**
+  * Returns a reusable int array of exactly len elements for key offsets.
+  * Arrays are created on first use and cached per length, so the contents may be left over from an
+  * earlier call. Lengths beyond the cache get a fresh, uncached array instead of an
+  * ArrayIndexOutOfBoundsException, matching getLocArray.
+  */
+ private final int[] getOffsetArray(int len){
+     if(len>=offsetArrays.length){return new int[len];}
+     if(offsetArrays[len]==null){offsetArrays[len]=new int[len];}
+     return offsetArrays[len];
+ }
+ private final int[] getLocArray(int len){
+ if(len>=locArrays.length){return new int[len];}
+ if(locArrays[len]==null){locArrays[len]=new int[len];}
+ return locArrays[len];
+ }
+ /**
+  * Returns a reusable int array of exactly len elements from the greedy-list cache.
+  * Arrays are created on first use and cached per length, so the contents may be left over from an
+  * earlier call. Lengths beyond the cache get a fresh, uncached array instead of an
+  * ArrayIndexOutOfBoundsException, matching getLocArray.
+  */
+ private final int[] getGreedyListArray(int len){
+     if(len>=greedyListArrays.length){return new int[len];}
+     if(greedyListArrays[len]==null){greedyListArrays[len]=new int[len];}
+     return greedyListArrays[len];
+ }
+ /**
+  * Returns a reusable general-purpose int array of exactly len elements.
+  * Arrays are created on first use and cached per length, so the contents may be left over from an
+  * earlier call. Lengths beyond the cache get a fresh, uncached array instead of an
+  * ArrayIndexOutOfBoundsException, matching getLocArray.
+  */
+ private final int[] getGenericArray(int len){
+     if(len>=genericArrays.length){return new int[len];}
+     if(genericArrays[len]==null){genericArrays[len]=new int[len];}
+     return genericArrays[len];
+ }
+
+ final byte[] getBaseScoreArray(int len, int strand){
+ if(len>=baseScoreArrays[0].length){return new byte[len];}
+ if(baseScoreArrays[strand][len]==null){baseScoreArrays[strand][len]=new byte[len];}
+ return baseScoreArrays[strand][len];
+ }
+ /**
+  * Returns an int array of exactly len elements for key scores, cached per strand and length.
+  * Cached arrays are created on first use and may hold values from an earlier call, so callers must
+  * overwrite every element. A len at or beyond the per-strand cache size yields a fresh uncached array.
+  */
+ final int[] getKeyScoreArray(int len, int strand){
+     //keyScoreArrays is indexed [strand][len]; its outer length is the strand count (2), so the capacity
+     //check must use the inner dimension, as getBaseScoreArray does. Testing the outer length sent every
+     //len>=2 down the allocate-a-new-array path and made the cache useless.
+     if(len>=keyScoreArrays[0].length){return new int[len];}
+     if(keyScoreArrays[strand][len]==null){keyScoreArrays[strand][len]=new int[len];}
+     return keyScoreArrays[strand][len];
+ }
+ private final float[] getKeyWeightArray(int len){
+ if(len>=keyWeightArrays.length){return new float[len];}
+ if(keyWeightArrays[len]==null){keyWeightArrays[len]=new float[len];}
+ return keyWeightArrays[len];
+ }
+ @Override
+ float[] keyProbArray() { //Accessor required by the superclass
+ return keyProbArray; //The shared 4001-element instance field itself, not a copy
+ }
+
+
+ private final int[][] locArrays=new int[4001][]; //Per-length cache behind getLocArray; slot len holds an int[len]
+ private final int[] valueArray=new int[1001];
+ private final int[] sizeArray=new int[1001]; //Per-column list lengths; filled by findMaxQscore2
+ private final int[][] offsetArrays=new int[1001][]; //Per-length cache behind getOffsetArray
+ private final int[][] greedyListArrays=new int[1001][]; //Per-length cache behind getGreedyListArray
+ private final int[][] genericArrays=new int[1001][]; //Per-length cache behind getGenericArray
+ private final int[] startArray=new int[1001];
+ private final int[] stopArray=new int[1001];
+ private final Quad[] tripleStorage=makeQuadStorage(1001); //One preallocated Quad per index 0..1000
+ private final int[] greedyReturn=new int[2];
+ private final int[][] shrinkReturn2=new int[3][];
+ private final int[][] shrinkReturn3=new int[5][];
+ private final int[][] prescanReturn=new int[2][];
+ private final int[] prescoreArray; //Assigned outside this section (blank final)
+ private final int[] precountArray; //Assigned outside this section (blank final)
+
+ private final byte[][][] baseScoreArrays=new byte[2][4001][]; //[strand][len] cache behind getBaseScoreArray
+ private final int[][][] keyScoreArrays=new int[2][1001][]; //[strand][len] cache behind getKeyScoreArray
+ final float[] keyProbArray=new float[4001]; //Exposed through keyProbArray()
+ private final float[][] keyWeightArrays=new float[1001][]; //Per-length cache behind getKeyWeightArray
+
+
+ /** Builds an array of number Quads, where element i is constructed as new Quad(i, 0, 0). */
+ private final Quad[] makeQuadStorage(int number){
+     final Quad[] storage=new Quad[number];
+     int slot=0;
+     while(slot<number){
+         storage[slot]=new Quad(slot, 0, 0);
+         slot++;
+     }
+     return storage;
+ }
+
+
+ private final QuadHeap heap=new QuadHeap(1023); //Shared by the scan methods; findMaxQscore2 clears it on entry
+
+ static int SHIFT_LENGTH=(32-1-NUM_CHROM_BITS); //Bit position of the in-block chromosome number inside a packed site (see toNumber); recomputed by setChromBits
+ static int MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH); //The low SHIFT_LENGTH bits, all set
+
+ /** Mask the number to get the site, which is in the lower bits */
+ static int SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1));
+
+ /** Mask the chromosome's high bits to get the low bits */
+ static int CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+
+ /** Mask the chromosome's lower bits to get the high bits */
+ static int CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+ /**
+  * Sets the number of bits that encode the in-block chromosome number and recomputes every static
+  * derived from it: CHROMS_PER_BLOCK, SHIFT_LENGTH, MAX_ALLOWED_CHROM_INDEX, SITE_MASK,
+  * CHROM_MASK_LOW and CHROM_MASK_HIGH.
+  *
+  * @param x New value for NUM_CHROM_BITS; asserted to be non-negative
+  */
+ static void setChromBits(int x){
+
+     NUM_CHROM_BITS=x;
+     CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS));
+     SHIFT_LENGTH=(32-1-NUM_CHROM_BITS);
+     MAX_ALLOWED_CHROM_INDEX=~((-1)<<SHIFT_LENGTH);
+     SITE_MASK=((-1)>>>(NUM_CHROM_BITS+1));
+     CHROM_MASK_LOW=CHROMS_PER_BLOCK-1;
+     CHROM_MASK_HIGH=~CHROM_MASK_LOW;
+
+//     assert(NUM_CHROM_BITS<30);
+     assert(NUM_CHROM_BITS>=0); //max is 3 for human; perhaps more for other organisms
+//     assert((1<<(NUM_CHROM_BITS))>=CHROMSPERBLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMSPERBLOCK;
+     //The check is an equality, so the failure message reports "!=" (it used to print "<").
+     assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" != "+CHROMS_PER_BLOCK;
+     assert(Integer.bitCount(CHROMS_PER_BLOCK)==1);
+     assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS+1)) : Integer.toHexString(SITE_MASK);
+ }
+
+ private final int cycles;
+
+ public static final int BASE_HIT_SCORE=100; //Score per matched base in extendScore and maxScore
+ public static final int ALIGN_COLUMNS=5500;
+ public static int MAX_INDEL=96; //Max indel length, min 0; longer is more accurate. NOTE(review): an older note said "default 400", which does not match the value 96
+ public static int MAX_INDEL2=8*MAX_INDEL; //Computed once at class initialization; it does not follow later changes to MAX_INDEL
+
+ private final float INV_BASE_KEY_HIT_SCORE;
+ private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1
+ private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length
+ private final int MAX_PENALTY_FOR_MISALIGNED_HIT; //Cap applied to INDEL_PENALTY+INDEL_PENALTY_MULT*distance
+ private final int SCOREZ_1KEY; //Value scoreZ2 returns when there is a single approximate hit
+
+ public static final boolean GENERATE_KEY_SCORES_FROM_QUALITY=true; //True: Much faster and more accurate.
+ public static final boolean GENERATE_BASE_SCORES_FROM_QUALITY=true; //True: Faster, and at least as accurate.
+ public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed
+ public static final int Z_SCORE_MULT=25; //Multiplier for the covered-base count in scoreZ2/maxScoreZ
+ public static final int Y_SCORE_MULT=5;
+
+ /** Y2 score: based on distance between hits within Y2_INDEL of center */
+ public static final int Y2_SCORE_MULT=5;
+ public static final int Y2_INDEL=4;
+
+
+ /**
+  * Switches to semiperfect mode: return only sites that match completely or with partial no-reference.
+  * Must not be combined with perfect mode (asserted). Disables the qscore prescan, raises the three
+  * skim levels and sets both indel limits to zero.
+  */
+ public static void setSemiperfectMode() {
+     assert(!PERFECTMODE);
+     SEMIPERFECTMODE=true;
+
+     //No indels are tolerated in this mode.
+     MAX_INDEL=MAX_INDEL2=0;
+
+     PRESCAN_QSCORE=false;
+//     MIN_APPROX_HITS_TO_KEEP++;
+     SKIM_LEVEL_H=SKIM_LEVEL_Q=0.15f;
+     SKIM_LEVEL=0.35f;
+ }
+
+ /**
+  * Switches to perfect mode: return only sites that match completely.
+  * Must not be combined with semiperfect mode (asserted). Disables the qscore prescan, raises the three
+  * skim levels and sets both indel limits to zero.
+  */
+ public static void setPerfectMode() {
+     assert(!SEMIPERFECTMODE);
+     PERFECTMODE=true;
+
+     //No indels are tolerated in this mode.
+     MAX_INDEL=MAX_INDEL2=0;
+
+     PRESCAN_QSCORE=false;
+//     MIN_APPROX_HITS_TO_KEEP++;
+     SKIM_LEVEL_H=SKIM_LEVEL_Q=0.15f;
+     SKIM_LEVEL=0.35f;
+ }
+
+ static float FRACTION_GENOME_TO_EXCLUDE=0.005f; //Lower is slower and more accurate (an older note gives .04 as the default)
+
+ /**
+  * Stores a new exclusion fraction and recomputes the five list-length thresholds derived from it,
+  * using the same formulas as their static initializers.
+  *
+  * @param f New fraction; asserted to be in [0, 1)
+  */
+ public static final void setFractionToExclude(float f){
+     assert(f>=0 && f<1);
+     FRACTION_GENOME_TO_EXCLUDE=f;
+
+     //Each threshold is 1000*(1-k*fraction), truncated, with its own multiplier k.
+     final double fraction=FRACTION_GENOME_TO_EXCLUDE;
+     MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*fraction));
+     MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*fraction));
+     MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*fraction)); //lower is faster
+     MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*fraction));
+     MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*fraction));
+ }
+
+
+ /** Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */
+ static final float HIT_FRACTION_TO_RETAIN=.97f; //Older notes quote defaults of .75 and .85; the value in use is .97
+ /** Range: 0 to 1000. Lower should be faster and less accurate. */
+ static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //982 when the fraction is 0.005 (older note: "default 810")
+ /** Range: 2 to infinity. Lower should be faster and less accurate. */
+ static final int MIN_HIT_LISTS_TO_RETAIN=12;
+
+ static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster; 988 when the fraction is 0.005 (older note: "default 840")
+ //lower is faster
+ static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //993 when the fraction is 0.005 (older note: "default 910")
+ //lower is faster
+ static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //995 when the fraction is 0.005 (older note: "default 935")
+ //lower is faster
+ static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //986 when the fraction is 0.005 (older note: "Default 860")
+
+ /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */
+ public static final int SMALL_GENOME_LIST=80;
+
+ static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} //The two trimming strategies are mutually exclusive
+
+ static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy.
+
+ /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000.
+  * NOTE(review): the value 2800 lies outside the stated 0-1000 range - confirm whether this is meant to switch the clumpiness check off. */
+ static final int CLUMPY_MIN_LENGTH_INDEX=2800;
+ static final float CLUMPY_FRACTION=0.8f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy.
+
+ static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; //Snapshot taken at class initialization; later changes to MAX_INDEL2 (e.g. setPerfectMode) are not reflected
+
+ /**
+  * Raises (never lowers) the qscore cutoff: the result is the larger of currentCutoff and
+  * min(SKIM_LEVEL_Q*max, DYNAMIC_SKIM_LEVEL_Q*score), both products truncated to int.
+  *
+  * @param max Maximum possible qscore; asserted to be at least score
+  * @param score Best qscore seen so far; asserted non-negative
+  * @param currentCutoff Cutoff in force; asserted positive
+  */
+ private static final int calcQScoreCutoff(final int max, final int score, final int currentCutoff){
+     assert(max>=score) : max+", "+score;
+     assert(score>=0);
+
+     assert(currentCutoff>0);
+     final int fromMax=(int)(SKIM_LEVEL_Q*max);
+     final int fromBest=(int)(DYNAMIC_SKIM_LEVEL_Q*score);
+     final int proposed=Tools.min(fromMax, fromBest);
+     return Tools.max(currentCutoff, proposed);
+ }
+
+ /**
+  * Raises (never lowers) the score cutoff: the result is the larger of currentCutoff and
+  * min(SKIM_LEVEL*max, DYNAMIC_SKIM_LEVEL*score), both products truncated to int.
+  *
+  * @param max Maximum possible score; asserted to be at least score
+  * @param score Best score seen so far; asserted non-negative
+  * @param currentCutoff Cutoff in force; asserted positive
+  */
+ private static final int calcScoreCutoff(final int max, final int score, final int currentCutoff){
+     assert(max>=score) : max+", "+score;
+     assert(score>=0);
+
+     assert(currentCutoff>0);
+     final int fromMax=(int)(SKIM_LEVEL*max);
+     final int fromBest=(int)(DYNAMIC_SKIM_LEVEL*score);
+     final int proposed=Tools.min(fromMax, fromBest);
+     return Tools.max(currentCutoff, proposed);
+ }
+
+ private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$
+ assert(keys>=hits) : keys+", "+hits;
+ assert(hits>=0);
+
+ int mahtk=MIN_APPROX_HITS_TO_KEEP;
+ if(SEMIPERFECTMODE || PERFECTMODE){
+ if(keys==1){return 1;}
+ else if(MIN_APPROX_HITS_TO_KEEP<keys){
+ mahtk++;
+ if(currentCutoff==MIN_APPROX_HITS_TO_KEEP){currentCutoff++;}
+ }
+ }
+
+ assert(currentCutoff>0);
+ return Tools.max(currentCutoff, Tools.min((int)(SKIM_LEVEL_H*keys), (int)(DYNAMIC_SKIM_LEVEL_H*hits)));
+ }
+
+ public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed. Starts equal to USE_EXTENDED_SCORE; set to false by setPerfectMode and setSemiperfectMode
+ public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast.
+ public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.03f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate
+ public static final float MIN_QSCORE_MULT=0.03f; //Fraction of max score to use as cutoff. Default 0.025, max is 1; lower is more accurate. VERY SENSITIVE.
+ public static final float MIN_QSCORE_MULT2=0.03f;
+ static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.55f : USE_EXTENDED_SCORE ? .74f : 0.6f); //Default .85f; lower is more accurate
+ static{
+ assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); //Sanity check on the fraction above
+// assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1);
+ }
+
+ //Skim Depth Settings
+ //The three non-final SKIM_LEVEL* values are overwritten by setPerfectMode and setSemiperfectMode.
+
+ /** Always retain sites with at least this fraction of max hits (to pass on to qscore) */
+ public static float SKIM_LEVEL_H=0.098f; //.08 or .09
+ /** Always retain sites with at least this fraction of best hits */
+ public static final float DYNAMIC_SKIM_LEVEL_H=0.48f; //.45
+
+ /** Always retain sites with at least this fraction of max qscore (to pass on to extend) */
+ public static float SKIM_LEVEL_Q=0.098f; //.09
+ /** Always retain sites with at least this fraction of best qscore */
+ public static final float DYNAMIC_SKIM_LEVEL_Q=0.78f; //.75
+
+ /** Always retain sites with at least this fraction of max score (to output) */
+ public static float SKIM_LEVEL=0.105f; //.10
+ /** Always retain sites with at least this fraction of best score */
+ public static final float DYNAMIC_SKIM_LEVEL=0.78f; //.75
+
+
+}
diff --git a/current/align2/BBMap.java b/current/align2/BBMap.java
new file mode 100755
index 0000000..489dce6
--- /dev/null
+++ b/current/align2/BBMap.java
@@ -0,0 +1,561 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import jgi.CoveragePileup;
+
+import stream.FastaReadInputStream;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * Based on TestIndex11f
+ *
+ * @author Brian Bushnell
+ * @date Dec 22, 2012
+ *
+ */
+public final class BBMap extends AbstractMapper {
+
+
+ /**
+  * Program entry point: builds a mapper, loads the index when it is not already loaded,
+  * runs the mapping phase, and prints the total elapsed time.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final Timer wallClock=new Timer();
+     final BBMap bbmap=new BBMap(args);
+
+     //The constructor sees the raw arguments; only the mapping phase gets the result of condenseStrict
+     final String[] condensed=Tools.condenseStrict(args);
+
+     if(!INDEX_LOADED){
+         bbmap.loadIndex();
+     }
+     if(Data.scaffoldPrefixes){
+         bbmap.processAmbig2();
+     }
+     bbmap.testSpeed(condensed);
+
+     //Wait for writing to finish before the total time is reported
+     ReadWrite.waitForWritingToFinish();
+     wallClock.stop();
+     sysout.println("\nTotal time: \t"+wallClock);
+     clearStatics();
+ }
+
+ /**
+ * Creates a mapper from command-line arguments; all work is delegated to the AbstractMapper constructor.
+ * @param args Command-line arguments, passed unchanged to the superclass.
+ */
+ public BBMap(String[] args){
+ super(args);
+ }
+
+ /**
+  * Installs this mapper's default settings. Only assigns fields; nothing is read
+  * except the slow-align padding that the rescue padding is derived from.
+  */
+ @Override
+ public void setDefaults(){
+     //I/O and match-string defaults
+     ReadWrite.ZIPLEVEL=2;
+     MAKE_MATCH_STRING=true;
+
+     //Key length used for the index
+     keylen=13;
+
+     MINIMUM_ALIGNMENT_SCORE_RATIO=0.56f;
+
+     //Key density (target, upper bound, lower bound) and the cap on desired keys
+     keyDensity=1.9f; //2.3f
+     maxKeyDensity=3f; //4f
+     minKeyDensity=1.5f; //1.8f
+     maxDesiredKeys=15;
+
+     //Rescue padding is the slow-align padding plus 4
+     final int alignPadding=4;
+     SLOW_ALIGN_PADDING=alignPadding;
+     SLOW_RESCUE_PADDING=alignPadding+4;
+     TIP_SEARCH_DIST=100;
+
+     //Aligner class name and output-related defaults
+     MSA_TYPE="MultiStateAligner11ts";
+     MAX_SITESCORES_TO_PRINT=5;
+     PRINT_SECONDARY_ALIGNMENTS=false;
+     AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1;
+ }
+
+ /**
+  * Applies the speed preset (fast, vslow, or slow - tested in that order) before normal parsing.
+  * A preset prepends its own key=value options to the user's arguments, adjusts the fraction of
+  * the genome excluded by the index, and rescales the key densities.
+  * @param args Original arguments
+  * @return The original array when no preset is active; otherwise a new array holding the
+  * preset options followed by every non-null original argument.
+  */
+ @Override
+ public String[] preparse(String[] args){
+     //No preset requested: arguments pass through untouched
+     if(!fast && !vslow && !slow){return args;}
+
+     final ArrayList<String> merged=new ArrayList<String>();
+     final float densityScale;
+
+     if(fast){
+         merged.add("tipsearch="+TIP_SEARCH_DIST/5);
+         merged.add("maxindel=80");
+         merged.add("minhits=2");
+         merged.add("bwr=0.18");
+         merged.add("bw=40");
+         merged.add("minratio=0.65");
+         merged.add("midpad=150");
+         merged.add("minscaf=50");
+         merged.add("quickmatch=t");
+         merged.add("rescuemismatches=15");
+         merged.add("rescuedist=800");
+         merged.add("maxsites=3");
+         merged.add("maxsites2=100");
+         //Disabled option: k=13
+
+         //TODO: Make these adjustable:
+         //  MIN_TRIM_SITES_TO_RETAIN_SINGLE
+         //  MIN_TRIM_SITES_TO_RETAIN_PAIRED
+         //  MAX_TRIM_SITES_TO_RETAIN
+         //TODO: Make trimLists adjustable via an offset or multiplier
+
+         BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*1.25f);
+         densityScale=0.9f;
+     }else if(vslow){
+         merged.add("tipsearch="+(TIP_SEARCH_DIST*3)/2);
+         merged.add("minhits=1");
+         merged.add("minratio=0.25");
+         merged.add("rescuemismatches=50");
+         merged.add("rescuedist=3000");
+
+         BBIndex.setFractionToExclude(0);
+
+         SLOW_ALIGN_PADDING=SLOW_ALIGN_PADDING*2+2;
+         SLOW_RESCUE_PADDING=SLOW_RESCUE_PADDING*2+2;
+
+         AbstractIndex.SLOW=true;
+         AbstractIndex.VSLOW=true;
+         densityScale=2.5f;
+     }else{
+         //slow preset (the only remaining possibility after the guard above)
+         merged.add("tipsearch="+(TIP_SEARCH_DIST*3)/2);
+         merged.add("minhits=1");
+         merged.add("minratio=0.45");
+         //Disabled options: maxindel=80, bwr=0.18, bw=40, midpad=150, minscaf=50, k=13
+
+         BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.4f);
+
+         AbstractIndex.SLOW=true;
+         densityScale=1.2f;
+     }
+
+     //User arguments follow the preset options; null entries are dropped
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         if(arg!=null){merged.add(arg);}
+     }
+
+     keyDensity*=densityScale;
+     maxKeyDensity*=densityScale;
+     minKeyDensity*=densityScale;
+
+     return merged.toArray(new String[merged.size()]);
+ }
+
+ /**
+ * Applies parsed settings to the static aligner and index configuration: caps padding and
+ * indel limits, picks up positional input files, and configures handling of ambiguous reads.
+ * @param args Arguments; the first two entries are examined as possible positional input files.
+ * @throws RuntimeException If ERROR_ON_NO_OUTPUT is set while in1 is specified and OUTPUT_READS is false,
+ * or if ambigMode is not one of the recognized AMBIG_* values.
+ */
+ @Override
+ void postparse(String[] args){
+
+ // A bandwidth ratio in (0, 0.2) caps both padding values.
+ if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 3);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 6);
+ }
+
+ // Values of -1 or lower mean "not set"; only explicit settings are pushed into BBIndex.
+ if(maxIndel1>-1){
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1);
+ BBIndex.MAX_INDEL=maxIndel1;
+ }
+ if(maxIndel2>-1){
+ BBIndex.MAX_INDEL2=maxIndel2;
+ }
+
+ if(minApproxHits>-1){
+ BBIndex.MIN_APPROX_HITS_TO_KEEP=minApproxHits;
+ }
+
+ if(expectedSites>-1){
+ BBMapThread.setExpectedSites(expectedSites);
+ sysout.println("Set EXPECTED_SITES to "+expectedSites);
+ }
+
+ if(fractionGenomeToExclude>=0){
+ BBIndex.setFractionToExclude(fractionGenomeToExclude);
+ }
+
+ {
+ // Positional inputs: an argument without '=' is taken as an input file if it exists on disk
+ // (the first one may also start with "stdin"), but only when in1/in2 were not set already.
+ final String a=(args.length>0 ? args[0] : null);
+ final String b=(args.length>1 ? args[1] : null);
+ if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;}
+ if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;}
+ if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);}
+ }
+
+ assert(synthReadlen<BBMapThread.ALIGN_ROWS);
+
+ // With a fixed bandwidth, search distance, indel limits and padding are capped by fractions of it.
+ if(MSA.bandwidth>0){
+ int halfwidth=MSA.bandwidth/2;
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2);
+ BBIndex.MAX_INDEL=Tools.min(BBIndex.MAX_INDEL, halfwidth/2);
+ BBIndex.MAX_INDEL2=Tools.min(BBIndex.MAX_INDEL2, halfwidth);
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4);
+ }
+
+ if(PRINT_SECONDARY_ALIGNMENTS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=false;
+ }
+
+ // The ambiguity mode is applied after the secondary-alignment block above, so its assignments win.
+ if(ambigMode==AMBIG_BEST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ if(!PRINT_SECONDARY_ALIGNMENTS){BBIndex.QUIT_AFTER_TWO_PERFECTS=true;}
+ sysout.println("Retaining first best site only for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_ALL){
+ // This mode also switches on secondary alignments, both here and in the SAM writer.
+ PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true;
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=false;
+ SamLine.MAKE_NH_TAG=true;
+ ambiguousAll=true;
+ sysout.println("Retaining all best sites for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=false;
+ ambiguousRandom=true;
+ sysout.println("Choosing a site randomly for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_TOSS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=true;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Ambiguously mapped reads will be considered unmapped.");
+ }else{
+ throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode);
+ }
+
+ }
+
+ /**
+  * Finalizes configuration after parsing: derives the minimum score ratio from minid, sets the
+  * XS-tag and intron defaults, decides whether reads are output, validates the build number,
+  * loads blacklists, and builds the index from a reference when one was given.
+  * @throws RuntimeException If no build number was specified (build&lt;0).
+  */
+ @Override
+ public void setup(){
+
+     assert(!useRandomReads || maxReads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use.";
+
+     if(minid!=-1){
+         MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE);
+         //Locale.ROOT keeps the decimal separator a '.' regardless of the JVM's default locale
+         sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format(java.util.Locale.ROOT, "%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO));
+     }
+
+     //Unless xs was set explicitly, XS tags are made only when the intron limit is below 1000000000
+     if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);}
+     if(setxs && !setintron){SamLine.INTRON_LIMIT=10;}
+
+     if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null
+             && outFileB==null && outFileB2==null && splitterOutputs==null && BBSplitter.streamTable==null){
+         sysout.println("No output file.");
+         OUTPUT_READS=false;
+     }else{
+         OUTPUT_READS=true;
+         if(bamscript!=null){
+             BBSplitter.makeBamScript(bamscript, splitterOutputs, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2);
+         }
+     }
+//     assert(false) : bamscript+", "+BBSplitter.streamTable+", "+OUTPUT_READS;
+
+
+
+     //The minimum fasta read length is raised to at least keylen+2
+     FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN);
+     assert(FastaReadInputStream.settingsOK());
+
+     if(build<0){throw new RuntimeException("Must specify a build number, e.g. build=1");}
+     else{Data.GENOME_BUILD=build;}
+
+     if(blacklist!=null && blacklist.size()>0){
+         Timer t=new Timer();
+         t.start();
+         for(String s : blacklist){
+             Blacklist.addToBlacklist(s);
+         }
+         t.stop();
+         sysout.println("Created blacklist:\t"+t);
+         t.start();
+     }
+
+     if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;}
+     if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);}
+ }
+
+
+ @Override
+ void processAmbig2(){
+ assert(Data.scaffoldPrefixes) : "Only process this block if there are multiple references.";
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to special output streams.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to the first reference's stream only.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Reads that map to multiple references will be considered unmapped.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to a random stream.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to all relevant output streams.");
+ }else{
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST;
+ }
+ }
+
+ /**
+ * Selects the genome build, loads the reference chromosomes when they are needed, loads the
+ * index via BBIndex.loadIndex, tunes index parameters by genome size, optionally creates the
+ * coverage pileup, and finally analyzes the index. Progress timings are printed to sysout.
+ * Returns before the analysis step when there are no reads to process and forceanalyze is false.
+ */
+ @Override
+ void loadIndex(){
+ Timer t=new Timer();
+
+ if(build>-1){
+ Data.setGenome(build);
+ AbstractIndex.MINCHROM=1;
+ AbstractIndex.MAXCHROM=Data.numChroms;
+ // Unset (negative) or out-of-range chromosome bounds fall back to the full range.
+ if(minChrom<0){minChrom=1;}
+ if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;}
+ sysout.println("Set genome to "+Data.GENOME_BUILD);
+
+ if(RefToIndex.AUTO_CHROMBITS){
+ // chrombits is derived from the leading zero bits of the longest chromosome length, capped at 16.
+ int maxLength=Tools.max(Data.chromLengths);
+ RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1;
+ RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16);
+ }
+ if(RefToIndex.chrombits!=-1){
+ BBIndex.setChromBits(RefToIndex.chrombits);
+ if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);}
+ }
+ }
+
+ assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) :
+ minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM;
+ AbstractIndex.MINCHROM=minChrom;
+ AbstractIndex.MAXCHROM=maxChrom;
+
+ if(targetGenomeSize>0){
+ // Expected sites = defined bases / target genome size, rounded (after adding 0.25), minimum 1.
+ long bases=Data.numDefinedBases;
+ long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize));
+ BBMapThread.setExpectedSites((int)x);
+ sysout.println("Set EXPECTED_SITES to "+x);
+ }
+
+ assert(!(PERFECTMODE && SEMIPERFECTMODE));
+ if(PERFECTMODE){setPerfectMode();}
+ if(SEMIPERFECTMODE){setSemiperfectMode();}
+
+ //Optional section for discrete timing of chrom array loading
+ if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){
+ sysout.println();
+ if(INDEX_LOADED){
+ //do nothing
+ }else if(RefToIndex.chromlist==null){
+ Data.loadChromosomes(minChrom, maxChrom);
+ }else{
+ // Chromosome arrays already held in RefToIndex.chromlist are installed directly.
+ assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size();
+ for(ChromosomeArray cha : RefToIndex.chromlist){
+ Data.chromosomePlusMatrix[cha.chromosome]=cha;
+ }
+ }
+ if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();}
+ t.stop();
+ sysout.println("Loaded Reference:\t"+t);
+ t.start();
+ }
+ RefToIndex.chromlist=null;
+
+ t.start();
+ BBIndex.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK);
+
+ {
+ // Genomes under 300M defined bases get raised hit-reduction limits and a smaller excluded
+ // fraction; the scaling factor depends on the size tier (<30M, <100M, otherwise).
+ long len=Data.numDefinedBases;
+ if(len<300000000){
+ BBIndex.MAX_HITS_REDUCTION2+=1;
+ BBIndex.MAXIMUM_MAX_HITS_REDUCTION+=1;
+ if(len<30000000){
+ BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.5f);
+ BBIndex.MAXIMUM_MAX_HITS_REDUCTION+=1;
+ BBIndex.HIT_REDUCTION_DIV=Tools.max(BBIndex.HIT_REDUCTION_DIV-1, 3);
+ }else if(len<100000000){
+ BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.6f);
+ }else{
+ BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.75f);
+ }
+ }
+ }
+
+ t.stop();
+ sysout.println("Generated Index:\t"+t);
+ t.start();
+
+ // Exact negation of the loading condition above: unload chromosomes when nothing needs them.
+ if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){
+ for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+ Data.unload(chrom, true);
+ }
+ }
+
+ if(ReadWrite.countActiveThreads()>0){
+ ReadWrite.waitForWritingToFinish();
+ t.stop();
+ sysout.println("Finished Writing:\t"+t);
+ t.start();
+ }
+
+ // Any coverage-related output request creates a CoveragePileup; its options are assembled as
+ // tab-separated key=value pairs and then split into an argument array.
+ if(coverageBinned!=null || coverageBase!=null || coverageHist!=null || coverageStats!=null || coverageRPKM!=null || normcov!=null || normcovOverall!=null){
+ String[] cvargs=("covhist="+coverageHist+"\tcovstats="+coverageStats+"\tbasecov="+coverageBase+"\tbincov="+coverageBinned+"\tphyscov="+coveragePhysical+
+ "\t32bit="+cov32bit+"\tnzo="+covNzo+"\ttwocolumn="+covTwocolumn+"\tsecondary="+PRINT_SECONDARY_ALIGNMENTS+"\tcovminscaf="+coverageMinScaf+
+ "\tksb="+covKsb+"\tbinsize="+covBinSize+"\tstartcov="+covStartOnly+"\tstrandedcov="+covStranded+"\trpkm="+coverageRPKM+
+ "\tnormcov="+normcov+"\tnormcovo="+normcovOverall+(in1==null ? "" : "\tin1="+in1)+(in2==null ? "" : "\tin2="+in2)+
+ (covSetbs ? ("\tbitset="+covBitset+"\tarrays="+covArrays) : "")).split("\t");
+ pileup=new CoveragePileup(cvargs);
+ pileup.createDataStructures();
+ pileup.loadScaffoldsFromIndex(minChrom, maxChrom);
+ }
+
+ // Without reads to map, index analysis is skipped unless forceanalyze is set.
+ if(!forceanalyze && (in1==null || maxReads==0)){return;}
+
+ BBIndex.analyzeIndex(minChrom, maxChrom, BBIndex.FRACTION_GENOME_TO_EXCLUDE, keylen);
+
+ t.stop();
+ sysout.println("Analyzed Index: \t"+t);
+ t.start();
+
+
+// assert(false) : RefToIndex.chrombits+", "+AbstractIndex.CHROMS_PER_BLOCK;
+ }
+
+ /**
+ * Runs the mapping phase: opens the streams, creates one BBMapThread per available thread,
+ * starts the read stream and the threads, waits for them to finish, then closes the streams
+ * and prints settings and results. Returns immediately when there are no reads to process.
+ * @param args Arguments, forwarded to openStreams.
+ * @throws RuntimeException If any thread was reported broken by shutDownThreads or errorState is set.
+ */
+ public void testSpeed(String[] args){
+
+ if(in1==null || maxReads==0){
+ sysout.println("No reads to process; quitting.");
+ return;
+ }
+
+ Timer t=new Timer();
+
+ final boolean paired=openStreams(t, args);
+ if(paired){BBIndex.QUIT_AFTER_TWO_PERFECTS=false;}
+
+ t.start();
+
+ // NOTE(review): the arguments 105 and 65 presumably describe per-thread memory needs with and
+ // without JNI - confirm against adjustThreadsforMemory.
+ if(Shared.USE_JNI){
+ final int threads=Shared.threads();
+ adjustThreadsforMemory(105);
+ // If the adjustment dropped the thread count below 90% of the original, JNI is disabled and
+ // the original thread count is restored before adjusting again below.
+ if(Shared.threads()<threads*0.9){
+ sysout.println("Disabling JNI due to low system memory.");
+ Shared.USE_JNI=false;
+ Shared.setThreads(threads);
+ }
+ }
+ if(!Shared.USE_JNI){
+ adjustThreadsforMemory(65);
+ }
+
+ AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS;
+ AbstractMapThread[] mtts=new AbstractMapThread[Shared.threads()];
+ for(int i=0; i<mtts.length; i++){
+ try {
+ mtts[i]=new BBMapThread(cris, keylen,
+ pileup, SLOW_ALIGN, CORRECT_THRESH, minChrom,
+ maxChrom, keyDensity, maxKeyDensity, minKeyDensity, maxDesiredKeys, REMOVE_DUPLICATE_BEST_ALIGNMENTS,
+ SAVE_AMBIGUOUS_XY, MINIMUM_ALIGNMENT_SCORE_RATIO, TRIM_LIST, MAKE_MATCH_STRING, QUICK_MATCH_STRINGS, rosA, rosM, rosU, rosB,
+ SLOW_ALIGN_PADDING, SLOW_RESCUE_PADDING, OUTPUT_MAPPED_ONLY, DONT_OUTPUT_BLACKLISTED_READS, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS,
+ REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, KILL_BAD_PAIRS, rcompMate,
+ PERFECTMODE, SEMIPERFECTMODE, FORBID_SELF_MAPPING, TIP_SEARCH_DIST,
+ ambiguousRandom, ambiguousAll, KFILTER, IDFILTER, qtrimLeft, qtrimRight, untrim, TRIM_QUALITY, minTrimLength, LOCAL_ALIGN, RESCUE, STRICT_MAX_INDEL, MSA_TYPE);
+ } catch (Throwable e) {
+ // NOTE(review): the statements after this catch dereference mtts[i], so abort() is
+ // presumably expected not to return normally - confirm.
+ e.printStackTrace();
+ abort(mtts, "Aborting due to prior error when making thread "+i+".");
+ }
+ mtts[i].idmodulo=idmodulo;
+ if(verbose){
+ mtts[i].verbose=verbose;
+ mtts[i].index().verbose=verbose;
+ }
+ }
+
+ cris.start(); //4567
+ sysout.println("Processing reads in "+(paired ? "paired" : "single")+"-ended mode.");
+ sysout.println("Started read stream.");
+
+ /* The threads are started after initialization to prevent resource competition between initialization and mapping */
+ for(int i=0; i<mtts.length; i++){mtts[i].start();}
+ sysout.println("Started "+mtts.length+" mapping thread"+(mtts.length==1 ? "" : "s")+".");
+
+ final int broken=shutDownThreads(mtts, false);
+
+ sysout.println("\n\n ------------------ Results ------------------ ");
+
+ closeStreams(cris, rosA, rosM, rosU, rosB);
+ sysout.println();
+ printSettings(keylen);
+ printOutput(mtts, t, keylen, paired, false, pileup, scafNzo, sortStats, statsOutputFile);
+ // The error is raised only after results have been printed.
+ if(broken>0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");}
+ }
+
+ /**
+  * Adjusts key density, key count, and the minimum score ratio for semiperfect mode.
+  * Does nothing unless SEMIPERFECTMODE is set (asserted first).
+  */
+ @Override
+ void setSemiperfectMode() {
+     assert(SEMIPERFECTMODE);
+     if(!SEMIPERFECTMODE){return;}
+
+     TRIM_LIST=false;
+
+     //Halve the density targets and key cap; the minimum density is pinned rather than scaled
+     keyDensity/=2;
+     maxKeyDensity/=2;
+     minKeyDensity=1.1f;
+     maxDesiredKeys/=2;
+
+     MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f;
+     BBIndex.setSemiperfectMode();
+ }
+
+ /**
+  * Adjusts key density, key count, and the minimum score ratio for perfect mode.
+  * Does nothing unless PERFECTMODE is set (asserted first).
+  */
+ @Override
+ void setPerfectMode() {
+     assert(PERFECTMODE);
+     if(!PERFECTMODE){return;}
+
+     TRIM_LIST=false;
+
+     //Halve the density targets and key cap; the minimum density is pinned rather than scaled
+     keyDensity/=2;
+     maxKeyDensity/=2;
+     minKeyDensity=1.1f;
+     maxDesiredKeys/=2;
+
+     //The score ratio is set to its maximum of 1.0
+     MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f;
+     BBIndex.setPerfectMode();
+ }
+
+
+ /**
+  * Prints the active settings to sysout. Common settings always go through printSettings0;
+  * key and index settings are added at verbose_stats&gt;=2, and trimming settings at verbose_stats&gt;=3.
+  * @param k Key length, forwarded to printSettings0.
+  */
+ @Override
+ void printSettings(int k){
+
+     printSettings0(k, BBIndex.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO);
+
+     if(verbose_stats>=2){
+         sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")");
+         sysout.println("Max keys: \t"+maxDesiredKeys);
+
+         sysout.println("Block Subsections: \t"+BBIndex.CHROMS_PER_BLOCK);
+         //Locale.ROOT keeps the decimal separator a '.' regardless of the JVM's default locale,
+         //matching the concatenated floats printed on the other lines
+         sysout.println("Fraction To Remove: \t"+String.format(java.util.Locale.ROOT, "%.4f", (BBIndex.REMOVE_FREQUENT_GENOME_FRACTION ? BBIndex.FRACTION_GENOME_TO_EXCLUDE : 0)));
+         // sysout.println("ADD_SCORE_Z: \t"+Index4.ADD_SCORE_Z);
+         sysout.println("Hits To Keep: \t"+BBIndex.MIN_APPROX_HITS_TO_KEEP);
+     }
+
+     if(verbose_stats>=3){
+         sysout.println("Remove Clumpy: \t"+BBIndex.REMOVE_CLUMPY);
+         if(BBIndex.REMOVE_CLUMPY){
+             sysout.println("CLUMPY_MAX_DIST: \t"+BBIndex.CLUMPY_MAX_DIST);
+             sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndex.CLUMPY_MIN_LENGTH_INDEX);
+             sysout.println("CLUMPY_FRACTION: \t"+BBIndex.CLUMPY_FRACTION);
+         }
+         sysout.println("Remove Long Lists: \t"+BBIndex.TRIM_LONG_HIT_LISTS);
+         if(BBIndex.TRIM_LONG_HIT_LISTS){
+             sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndex.HIT_FRACTION_TO_RETAIN);
+         }
+         sysout.println("Trim By Greedy: \t"+BBIndex.TRIM_BY_GREEDY);
+         sysout.println("Trim By Total Sites: \t"+BBIndex.TRIM_BY_TOTAL_SITE_COUNT);
+         if(BBIndex.TRIM_BY_TOTAL_SITE_COUNT){
+             sysout.println("MAX_AVG_SITES: \t"+BBIndex.MAX_AVERAGE_LIST_TO_SEARCH);
+             sysout.println("MAX_AVG_SITES_2: \t"+BBIndex.MAX_AVERAGE_LIST_TO_SEARCH2);
+             sysout.println("MAX_SHORTEST_SITE: \t"+BBIndex.MAX_SHORTEST_LIST_TO_SEARCH);
+         }
+         sysout.println("Index Min Score: \t"+BBIndex.MIN_SCORE_MULT);
+
+         sysout.println("Dynamic Trim: \t"+BBIndex.DYNAMICALLY_TRIM_LOW_SCORES);
+         if(BBIndex.DYNAMICALLY_TRIM_LOW_SCORES){
+             sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndex.DYNAMIC_SCORE_THRESH);
+         }
+     }
+
+ }
+
+}
diff --git a/current/align2/BBMap5.java b/current/align2/BBMap5.java
new file mode 100755
index 0000000..2f2fc0d
--- /dev/null
+++ b/current/align2/BBMap5.java
@@ -0,0 +1,540 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import jgi.CoveragePileup;
+
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * Based on TestIndex11f
+ *
+ * @author Brian Bushnell
+ * @date Jan 3, 2013
+ *
+ */
+public final class BBMap5 extends AbstractMapper {
+
+
+ /**
+  * Program entry point: builds a mapper, loads the index when it is not already loaded,
+  * runs the mapping phase, and prints the total elapsed time.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final Timer wallClock=new Timer();
+     final BBMap5 bbmap=new BBMap5(args);
+
+     //The constructor sees the raw arguments; only the mapping phase gets the result of condenseStrict
+     final String[] condensed=Tools.condenseStrict(args);
+
+     if(!INDEX_LOADED){
+         bbmap.loadIndex();
+     }
+     if(Data.scaffoldPrefixes){
+         bbmap.processAmbig2();
+     }
+     bbmap.testSpeed(condensed);
+
+     //Wait for writing to finish before the total time is reported
+     ReadWrite.waitForWritingToFinish();
+     wallClock.stop();
+     sysout.println("\nTotal time: \t"+wallClock);
+     clearStatics();
+ }
+
+ /**
+ * Creates a mapper from command-line arguments; all work is delegated to the AbstractMapper constructor.
+ * @param args Command-line arguments, passed unchanged to the superclass.
+ */
+ public BBMap5(String[] args){
+ super(args);
+ }
+
+ /**
+  * Installs this mapper's default settings. Only assigns fields; nothing is read
+  * except the slow-align padding that the rescue padding is derived from.
+  */
+ @Override
+ public void setDefaults(){
+     //I/O and match-string defaults
+     ReadWrite.ZIPLEVEL=2;
+     MAKE_MATCH_STRING=true;
+
+     //Key length used for the index
+     keylen=13;
+
+     MINIMUM_ALIGNMENT_SCORE_RATIO=0.56f;
+
+     //Key density (target, upper bound, lower bound) and the cap on desired keys
+     keyDensity=1.9f; //2.3f
+     maxKeyDensity=3f; //4f
+     minKeyDensity=1.5f; //1.8f
+     maxDesiredKeys=15;
+
+     //Rescue padding is the slow-align padding plus 4
+     final int alignPadding=4;
+     SLOW_ALIGN_PADDING=alignPadding;
+     SLOW_RESCUE_PADDING=alignPadding+4;
+     TIP_SEARCH_DIST=100;
+
+     //Aligner class name and output-related defaults
+     MSA_TYPE="MultiStateAligner11ts";
+     MAX_SITESCORES_TO_PRINT=5;
+     PRINT_SECONDARY_ALIGNMENTS=false;
+     AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1;
+ }
+
+ /**
+  * Applies the speed preset (fast, vslow, or slow - tested in that order) before normal parsing.
+  * A preset prepends its own key=value options (none for slow) to the user's arguments, adjusts
+  * the fraction of the genome excluded by the index, and rescales the key densities.
+  * @param args Original arguments
+  * @return The original array when no preset is active; otherwise a new array holding the
+  * preset options followed by every non-null original argument.
+  */
+ @Override
+ public String[] preparse(String[] args){
+     //No preset requested: arguments pass through untouched
+     if(!fast && !vslow && !slow){return args;}
+
+     final ArrayList<String> merged=new ArrayList<String>();
+     final float densityScale;
+
+     if(fast){
+         merged.add("tipsearch="+TIP_SEARCH_DIST/5);
+         merged.add("maxindel=80");
+         merged.add("minhits=2");
+         merged.add("bwr=0.18");
+         merged.add("bw=40");
+         merged.add("minratio=0.65");
+         merged.add("midpad=150");
+         merged.add("minscaf=50");
+         merged.add("quickmatch=t");
+         merged.add("rescuemismatches=15");
+         merged.add("rescuedist=800");
+         merged.add("maxsites=3");
+         merged.add("maxsites2=100");
+         //Disabled option: k=13
+
+         BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*1.25f);
+         densityScale=0.9f;
+     }else if(vslow){
+         merged.add("tipsearch="+(TIP_SEARCH_DIST*3)/2);
+         merged.add("minhits=1");
+         merged.add("minratio=0.25");
+         merged.add("rescuemismatches=50");
+         merged.add("rescuedist=3000");
+
+         BBIndex5.setFractionToExclude(0);
+
+         SLOW_ALIGN_PADDING=SLOW_ALIGN_PADDING*2+2;
+         SLOW_RESCUE_PADDING=SLOW_RESCUE_PADDING*2+2;
+
+         AbstractIndex.SLOW=true;
+         AbstractIndex.VSLOW=true;
+         densityScale=2.5f;
+     }else{
+         //slow preset (the only remaining possibility after the guard above)
+         //TODO: Unfinished - no preset options are added here
+         BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*0.4f);
+
+         AbstractIndex.SLOW=true;
+         densityScale=1.2f;
+     }
+
+     //User arguments follow the preset options; null entries are dropped
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         if(arg!=null){merged.add(arg);}
+     }
+
+     keyDensity*=densityScale;
+     maxKeyDensity*=densityScale;
+     minKeyDensity*=densityScale;
+
+     return merged.toArray(new String[merged.size()]);
+ }
+
+ /**
+ * Applies parsed settings to the static aligner and index configuration: caps padding and
+ * indel limits, picks up positional input files, and configures handling of ambiguous reads.
+ * @param args Arguments; the first two entries are examined as possible positional input files.
+ * @throws RuntimeException If ERROR_ON_NO_OUTPUT is set while in1 is specified and OUTPUT_READS is false,
+ * or if ambigMode is not one of the recognized AMBIG_* values.
+ */
+ @Override
+ void postparse(String[] args){
+
+ // A bandwidth ratio in (0, 0.2) caps both padding values.
+ if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 3);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 6);
+ }
+
+ // Values of -1 or lower mean "not set"; only explicit settings are pushed into BBIndex5.
+ if(maxIndel1>-1){
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1);
+ BBIndex5.MAX_INDEL=maxIndel1;
+ }
+ if(maxIndel2>-1){
+ BBIndex5.MAX_INDEL2=maxIndel2;
+ }
+
+ if(minApproxHits>-1){
+ BBIndex5.MIN_APPROX_HITS_TO_KEEP=minApproxHits;
+ }
+
+ if(expectedSites>-1){
+ BBMapThread5.setExpectedSites(expectedSites);
+ sysout.println("Set EXPECTED_SITES to "+expectedSites);
+ }
+
+ if(fractionGenomeToExclude>=0){
+ BBIndex5.setFractionToExclude(fractionGenomeToExclude);
+ }
+
+ {
+ // Positional inputs: an argument without '=' is taken as an input file if it exists on disk
+ // (the first one may also start with "stdin"), but only when in1/in2 were not set already.
+ final String a=(args.length>0 ? args[0] : null);
+ final String b=(args.length>1 ? args[1] : null);
+ if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;}
+ if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;}
+ if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);}
+ }
+
+ assert(synthReadlen<BBMapThread5.ALIGN_ROWS);
+
+ // With a fixed bandwidth, search distance, indel limits and padding are capped by fractions of it.
+ if(MSA.bandwidth>0){
+ int halfwidth=MSA.bandwidth/2;
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2);
+ BBIndex5.MAX_INDEL=Tools.min(BBIndex5.MAX_INDEL, halfwidth/2);
+ BBIndex5.MAX_INDEL2=Tools.min(BBIndex5.MAX_INDEL2, halfwidth);
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4);
+ }
+
+ if(PRINT_SECONDARY_ALIGNMENTS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;
+ }
+
+ // The ambiguity mode is applied after the secondary-alignment block above, so its assignments win.
+ if(ambigMode==AMBIG_BEST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ if(!PRINT_SECONDARY_ALIGNMENTS){BBIndex5.QUIT_AFTER_TWO_PERFECTS=true;}
+ sysout.println("Retaining first best site only for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_ALL){
+ // This mode also switches on secondary alignments, both here and in the SAM writer.
+ PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true;
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;
+ SamLine.MAKE_NH_TAG=true;
+ ambiguousAll=true;
+ sysout.println("Retaining all best sites for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;
+ ambiguousRandom=true;
+ sysout.println("Choosing a site randomly for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_TOSS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=true;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Ambiguously mapped reads will be considered unmapped.");
+ }else{
+ throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode);
+ }
+
+ }
+
+ /**
+  * Finalizes configuration after parsing: derives the minimum score ratio from minid, sets the
+  * XS-tag and intron defaults, decides whether reads are output, validates the build number,
+  * loads blacklists, and builds the index from a reference when one was given.
+  * @throws RuntimeException If no build number was specified (build&lt;0).
+  */
+ @Override
+ public void setup(){
+
+     assert(!useRandomReads || maxReads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use.";
+
+     if(minid!=-1){
+         MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE);
+         //Locale.ROOT keeps the decimal separator a '.' regardless of the JVM's default locale
+         sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format(java.util.Locale.ROOT, "%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO));
+     }
+
+     //Unless xs was set explicitly, XS tags are made only when the intron limit is below 1000000000
+     if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);}
+     if(setxs && !setintron){SamLine.INTRON_LIMIT=10;}
+
+     if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null
+             && outFileB==null && outFileB2==null && splitterOutputs==null && BBSplitter.streamTable==null){
+         sysout.println("No output file.");
+         OUTPUT_READS=false;
+     }else{
+         OUTPUT_READS=true;
+         if(bamscript!=null){
+             BBSplitter.makeBamScript(bamscript, splitterOutputs, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2);
+         }
+     }
+
+     //The minimum fasta read length is raised to at least keylen+2
+     FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN);
+     assert(FastaReadInputStream.settingsOK());
+
+     if(build<0){throw new RuntimeException("Must specify a build number, e.g. build=1");}
+     else{Data.GENOME_BUILD=build;}
+
+     if(blacklist!=null && blacklist.size()>0){
+         Timer t=new Timer();
+         t.start();
+         for(String s : blacklist){
+             Blacklist.addToBlacklist(s);
+         }
+         t.stop();
+         sysout.println("Created blacklist:\t"+t);
+         t.start();
+     }
+
+     if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;}
+     if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);}
+ }
+
+
+ @Override
+ void processAmbig2(){
+ assert(Data.scaffoldPrefixes) : "Only process this block if there are multiple references.";
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to special output streams.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to the first reference's stream only.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Reads that map to multiple references will be considered unmapped.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to a random stream.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to all relevant output streams.");
+ }else{
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST;
+ }
+ }
+
+ /**
+  * Selects the genome build, loads the reference chromosomes when they are needed, loads the
+  * index via BBIndex5.loadIndex, tunes index parameters by genome size, optionally creates the
+  * coverage pileup, and finally analyzes the index. Progress timings are printed to sysout.
+  * Returns before the analysis step when there are no reads to process and forceanalyze is false.
+  */
+ @Override
+ void loadIndex(){
+     Timer t=new Timer();
+
+     if(build>-1){
+         Data.setGenome(build);
+         AbstractIndex.MINCHROM=1;
+         AbstractIndex.MAXCHROM=Data.numChroms;
+         //Unset (negative) or out-of-range chromosome bounds fall back to the full range
+         if(minChrom<0){minChrom=1;}
+         if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;}
+         sysout.println("Set genome to "+Data.GENOME_BUILD);
+
+         if(RefToIndex.AUTO_CHROMBITS){
+             int maxLength=Tools.max(Data.chromLengths);
+             RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength); //Different for v5!
+             RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16);
+         }
+         if(RefToIndex.chrombits!=-1){
+             BBIndex5.setChromBits(RefToIndex.chrombits);
+             if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);}
+         }
+     }
+
+     assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) :
+         minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM;
+     AbstractIndex.MINCHROM=minChrom;
+     AbstractIndex.MAXCHROM=maxChrom;
+
+     if(targetGenomeSize>0){
+         //Expected sites = defined bases / target genome size, rounded (after adding 0.25), minimum 1
+         long bases=Data.numDefinedBases;
+         long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize));
+         BBMapThread5.setExpectedSites((int)x);
+         sysout.println("Set EXPECTED_SITES to "+x);
+     }
+
+     assert(!(PERFECTMODE && SEMIPERFECTMODE));
+     if(PERFECTMODE){setPerfectMode();}
+     if(SEMIPERFECTMODE){setSemiperfectMode();}
+
+     //Optional section for discrete timing of chrom array loading
+     if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){
+         sysout.println();
+         if(INDEX_LOADED){
+             //Index already loaded: skip reloading the reference (same guard as BBMap.loadIndex)
+         }else if(RefToIndex.chromlist==null){
+             Data.loadChromosomes(minChrom, maxChrom);
+         }else{
+             //Chromosome arrays already held in RefToIndex.chromlist are installed directly
+             assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size();
+             for(ChromosomeArray cha : RefToIndex.chromlist){
+                 Data.chromosomePlusMatrix[cha.chromosome]=cha;
+             }
+         }
+         if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();}
+         t.stop();
+         sysout.println("Loaded Reference:\t"+t);
+         t.start();
+     }
+     RefToIndex.chromlist=null;
+
+     t.start();
+     BBIndex5.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK);
+
+     {
+         //Genomes under 300M defined bases get raised hit-reduction limits and a smaller excluded
+         //fraction; the scaling factor depends on the size tier (<30M, <100M, otherwise)
+         long len=Data.numDefinedBases;
+         if(len<300000000){
+             BBIndex5.MAX_HITS_REDUCTION2+=1;
+             BBIndex5.MAXIMUM_MAX_HITS_REDUCTION+=1;
+             if(len<30000000){
+                 BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*0.5f);
+                 BBIndex5.MAXIMUM_MAX_HITS_REDUCTION+=1;
+                 BBIndex5.HIT_REDUCTION_DIV=Tools.max(BBIndex5.HIT_REDUCTION_DIV-1, 3);
+             }else if(len<100000000){
+                 BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*0.6f);
+             }else{
+                 BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*0.75f);
+             }
+         }
+     }
+
+     t.stop();
+     sysout.println("Generated Index:\t"+t);
+     t.start();
+
+     //Exact negation of the loading condition above: unload chromosomes when nothing needs them
+     if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){
+         for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+             Data.unload(chrom, true);
+         }
+     }
+
+     if(ReadWrite.countActiveThreads()>0){
+         ReadWrite.waitForWritingToFinish();
+         t.stop();
+         sysout.println("Finished Writing:\t"+t);
+         t.start();
+     }
+
+     //Any coverage-related output request creates a CoveragePileup; its options are assembled as
+     //tab-separated key=value pairs and then split into an argument array
+     if(coverageBinned!=null || coverageBase!=null || coverageHist!=null || coverageStats!=null || coverageRPKM!=null || normcov!=null || normcovOverall!=null){
+         String[] cvargs=("covhist="+coverageHist+"\tcovstats="+coverageStats+"\tbasecov="+coverageBase+"\tbincov="+coverageBinned+"\tphyscov="+coveragePhysical+
+                 "\t32bit="+cov32bit+"\tnzo="+covNzo+"\ttwocolumn="+covTwocolumn+"\tsecondary="+PRINT_SECONDARY_ALIGNMENTS+"\tcovminscaf="+coverageMinScaf+
+                 "\tksb="+covKsb+"\tbinsize="+covBinSize+"\tstartcov="+covStartOnly+"\tstrandedcov="+covStranded+"\trpkm="+coverageRPKM+
+                 "\tnormcov="+normcov+"\tnormcovo="+normcovOverall+(in1==null ? "" : "\tin1="+in1)+(in2==null ? "" : "\tin2="+in2)+
+                 (covSetbs ? ("\tbitset="+covBitset+"\tarrays="+covArrays) : "")).split("\t");
+         pileup=new CoveragePileup(cvargs);
+         pileup.createDataStructures();
+         pileup.loadScaffoldsFromIndex(minChrom, maxChrom);
+     }
+
+     //Without reads to map, index analysis is skipped unless forceanalyze is set
+     if(!forceanalyze && (in1==null || maxReads==0)){return;}
+
+     BBIndex5.analyzeIndex(minChrom, maxChrom, BBIndex5.FRACTION_GENOME_TO_EXCLUDE, keylen);
+
+     t.stop();
+     sysout.println("Analyzed Index: \t"+t);
+     t.start();
+ }
+
+ public void testSpeed(String[] args){
+ // Opens the read streams, runs one BBMapThread5 per available thread, then closes the streams and prints settings and results.
+ if(in1==null || maxReads==0){ // no primary input, or a read limit of zero: nothing to map
+ sysout.println("No reads to process; quitting.");
+ return;
+ }
+
+ Timer t=new Timer(); // handed to openStreams and, at the end, to printOutput
+
+ final boolean paired=openStreams(t, args);
+ if(paired){BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;} // paired input turns this index shortcut off
+
+ t.start();
+
+ if(Shared.USE_JNI){
+ final int threads=Shared.threads(); // thread count before the memory-based adjustment
+ adjustThreadsforMemory(105); // NOTE(review): 105 here vs. 65 below is presumably a per-thread memory estimate; units are not visible in this file
+ if(Shared.threads()<threads*0.9){ // the adjustment left fewer than 90% of the threads
+ sysout.println("Disabling JNI due to low system memory.");
+ Shared.USE_JNI=false;
+ Shared.setThreads(threads); // restore the count; it is adjusted again below with the non-JNI value
+ }
+ }
+ if(!Shared.USE_JNI){
+ adjustThreadsforMemory(65);
+ }
+
+ AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS;
+ AbstractMapThread[] mtts=new AbstractMapThread[Shared.threads()]; // one mapping thread per configured thread
+ for(int i=0; i<mtts.length; i++){
+ try {
+ mtts[i]=new BBMapThread5(cris, keylen,
+ pileup, SLOW_ALIGN, CORRECT_THRESH, minChrom,
+ maxChrom, keyDensity, maxKeyDensity, minKeyDensity, maxDesiredKeys, REMOVE_DUPLICATE_BEST_ALIGNMENTS,
+ SAVE_AMBIGUOUS_XY, MINIMUM_ALIGNMENT_SCORE_RATIO, TRIM_LIST, MAKE_MATCH_STRING, QUICK_MATCH_STRINGS, rosA, rosM, rosU, rosB,
+ SLOW_ALIGN_PADDING, SLOW_RESCUE_PADDING, OUTPUT_MAPPED_ONLY, DONT_OUTPUT_BLACKLISTED_READS, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS,
+ REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, KILL_BAD_PAIRS, rcompMate,
+ PERFECTMODE, SEMIPERFECTMODE, FORBID_SELF_MAPPING, TIP_SEARCH_DIST,
+ ambiguousRandom, ambiguousAll, KFILTER, IDFILTER, qtrimLeft, qtrimRight, untrim, TRIM_QUALITY, minTrimLength, LOCAL_ALIGN, RESCUE, STRICT_MAX_INDEL, MSA_TYPE);
+ } catch (Exception e) {
+ e.printStackTrace();
+ abort(mtts, "Aborting due to prior error."); // NOTE(review): if abort() can return normally, mtts[i] is still null on the next statement - confirm it throws or exits
+ }
+ mtts[i].idmodulo=idmodulo;
+ if(verbose){
+ mtts[i].verbose=verbose;
+ mtts[i].index().verbose=verbose;
+ }
+ }
+
+ cris.start(); //4567
+ sysout.println("Processing reads in "+(paired ? "paired" : "single")+"-ended mode.");
+ sysout.println("Started read stream.");
+
+ /* The threads are started after initialization to prevent resource competition between initialization and mapping */
+ for(int i=0; i<mtts.length; i++){mtts[i].start();}
+ sysout.println("Started "+mtts.length+" mapping thread"+(mtts.length==1 ? "" : "s")+".");
+
+ final int broken=shutDownThreads(mtts, false); // NOTE(review): presumably waits for the threads and returns how many failed - confirm in AbstractMapper
+
+ sysout.println("\n\n ------------------ Results ------------------ ");
+
+ closeStreams(cris, rosA, rosM, rosU, rosB);
+ sysout.println();
+ printSettings(keylen);
+ printOutput(mtts, t, keylen, paired, false, pileup, scafNzo, sortStats, statsOutputFile);
+ if(broken>0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");} // thrown only after the report above has been printed
+ }
+
+ @Override
+ void setSemiperfectMode() { // Adjusts key settings and the score ratio for semiperfect mode, then forwards to BBIndex5.setSemiperfectMode().
+ assert(SEMIPERFECTMODE);
+ if(SEMIPERFECTMODE){ // still guarded, so nothing changes when assertions are disabled and the flag is false
+ TRIM_LIST=false;
+ keyDensity/=2; // key density, its maximum and the desired-key cap are halved
+ maxKeyDensity/=2;
+ minKeyDensity=1.1f; // the minimum is set to a fixed value rather than halved
+ maxDesiredKeys/=2;
+ MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f;
+ BBIndex5.setSemiperfectMode();
+ }
+ }
+
+ @Override
+ void setPerfectMode() { // Adjusts key settings and the score ratio for perfect mode, then forwards to BBIndex5.setPerfectMode().
+ assert(PERFECTMODE);
+ if(PERFECTMODE){ // still guarded, so nothing changes when assertions are disabled and the flag is false
+ TRIM_LIST=false;
+ keyDensity/=2; // key density, its maximum and the desired-key cap are halved
+ maxKeyDensity/=2;
+ minKeyDensity=1.1f; // the minimum is set to a fixed value rather than halved
+ maxDesiredKeys/=2;
+ MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f; // differs from semiperfect mode, which uses 0.45f
+ BBIndex5.setPerfectMode();
+ }
+ }
+
+
+ @Override
+ void printSettings(int k){ // Prints mapper and BBIndex5 settings to sysout; the amount of detail grows with verbose_stats.
+
+ printSettings0(k, BBIndex5.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO); // always called, regardless of verbose_stats
+
+ if(verbose_stats>=2){ // second tier: key and index summary
+ sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")");
+ sysout.println("Max keys: \t"+maxDesiredKeys);
+
+ sysout.println("Block Subsections: \t"+BBIndex5.CHROMS_PER_BLOCK);
+ sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndex5.REMOVE_FREQUENT_GENOME_FRACTION ? BBIndex5.FRACTION_GENOME_TO_EXCLUDE : 0))); // reports 0 when the removal flag is off
+ // sysout.println("ADD_SCORE_Z: \t"+Index5.ADD_SCORE_Z);
+ sysout.println("Hits To Keep: \t"+BBIndex5.MIN_APPROX_HITS_TO_KEEP);
+ }
+
+ if(verbose_stats>=3){ // third tier: each detail value is printed only when its feature flag is on
+ sysout.println("Remove Clumpy: \t"+BBIndex5.REMOVE_CLUMPY);
+ if(BBIndex5.REMOVE_CLUMPY){
+ sysout.println("CLUMPY_MAX_DIST: \t"+BBIndex5.CLUMPY_MAX_DIST);
+ sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndex5.CLUMPY_MIN_LENGTH_INDEX);
+ sysout.println("CLUMPY_FRACTION: \t"+BBIndex5.CLUMPY_FRACTION);
+ }
+ sysout.println("Remove Long Lists: \t"+BBIndex5.TRIM_LONG_HIT_LISTS);
+ if(BBIndex5.TRIM_LONG_HIT_LISTS){
+ sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndex5.HIT_FRACTION_TO_RETAIN);
+ }
+ sysout.println("Trim By Greedy: \t"+BBIndex5.TRIM_BY_GREEDY);
+ sysout.println("Trim By Total Sites: \t"+BBIndex5.TRIM_BY_TOTAL_SITE_COUNT);
+ if(BBIndex5.TRIM_BY_TOTAL_SITE_COUNT){
+ sysout.println("MAX_AVG_SITES: \t"+BBIndex5.MAX_AVERAGE_LIST_TO_SEARCH);
+ sysout.println("MAX_AVG_SITES_2: \t"+BBIndex5.MAX_AVERAGE_LIST_TO_SEARCH2);
+ sysout.println("MAX_SHORTEST_SITE: \t"+BBIndex5.MAX_SHORTEST_LIST_TO_SEARCH);
+ }
+ sysout.println("Index Min Score: \t"+BBIndex5.MIN_SCORE_MULT);
+
+ sysout.println("Dynamic Trim: \t"+BBIndex5.DYNAMICALLY_TRIM_LOW_SCORES);
+ if(BBIndex5.DYNAMICALLY_TRIM_LOW_SCORES){
+ sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndex5.DYNAMIC_SCORE_THRESH);
+ }
+ }
+
+ }
+
+}
diff --git a/current/align2/BBMapAcc.java b/current/align2/BBMapAcc.java
new file mode 100755
index 0000000..c7d9444
--- /dev/null
+++ b/current/align2/BBMapAcc.java
@@ -0,0 +1,540 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import jgi.CoveragePileup;
+
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * Based on TestIndex11Ii
+ *
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public final class BBMapAcc extends AbstractMapper {
+
+
+ public static void main(String[] args){ // Command-line entry point: build the mapper, load the index if needed, map the reads, print total time.
+ Timer t=new Timer();
+ BBMapAcc mapper=new BBMapAcc(args); // the constructor receives the raw argument array
+ args=Tools.condenseStrict(args); // NOTE(review): presumably drops null entries from the array - confirm in Tools
+ if(!INDEX_LOADED){mapper.loadIndex();}
+ if(Data.scaffoldPrefixes){mapper.processAmbig2();} // processAmbig2 asserts this same flag
+ mapper.testSpeed(args);
+ ReadWrite.waitForWritingToFinish();
+ t.stop();
+ sysout.println("\nTotal time: \t"+t);
+ clearStatics();
+ }
+
+ public BBMapAcc(String[] args){ // Does nothing itself; the arguments are passed straight to the AbstractMapper constructor.
+ super(args);
+ }
+
+ @Override
+ public void setDefaults(){ // Assigns this mapper variant's default settings, including statics on ReadWrite and AbstractIndex.
+ ReadWrite.ZIPLEVEL=2;
+ MAKE_MATCH_STRING=true;
+ keylen=13;
+ // Default score ratio; setup() replaces it when minid is given.
+ MINIMUM_ALIGNMENT_SCORE_RATIO=0.56f;
+
+ keyDensity=2.3f;//2.3f;
+ maxKeyDensity=3.2f;//4f;
+ minKeyDensity=1.8f;//1.8f;
+ maxDesiredKeys=20;
+
+ SLOW_ALIGN_PADDING=20;
+ SLOW_RESCUE_PADDING=4+SLOW_ALIGN_PADDING; // evaluates to 24 with the value assigned on the previous line
+ TIP_SEARCH_DIST=200;
+
+ MSA_TYPE="MultiStateAligner11ts"; // NOTE(review): presumably the name of the aligner class to instantiate - confirm where MSA_TYPE is consumed
+ MAX_SITESCORES_TO_PRINT=8;
+ PRINT_SECONDARY_ALIGNMENTS=false;
+ AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1;
+ }
+
+ @Override
+ public String[] preparse(String[] args){ // For the fast/vslow/slow presets, returns a new array with preset options placed before the caller's non-null args; otherwise returns args unchanged.
+ if(fast){
+ ArrayList<String> list=new ArrayList<String>(); // preset options go in first, the caller's arguments are appended after them
+ list.add("tipsearch="+TIP_SEARCH_DIST/2); // half of the current tip search distance (integer division)
+ list.add("maxindel=80");
+// list.add("minhits=2");
+ list.add("bwr=0.3");
+// list.add("bw=64");
+ list.add("minratio=0.60");
+ list.add("midpad=150");
+ list.add("minscaf=50");
+ list.add("quickmatch=t");
+ list.add("rescuemismatches=15");
+ list.add("rescuedist=800");
+ list.add("maxsites=3");
+ list.add("maxsites2=100");
+// list.add("k=13");
+
+ BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*1.25f);
+
+ for(String s : args){if(s!=null){list.add(s);}} // NOTE(review): user args come last, presumably so a later parse lets them override the presets - confirm in the parser
+ args=list.toArray(new String[list.size()]);
+
+ keyDensity*=0.9f; // all three densities are scaled down by the same factor
+ maxKeyDensity*=0.9f;
+ minKeyDensity*=0.9f;
+ }else if(vslow){
+ ArrayList<String> list=new ArrayList<String>();
+ list.add("tipsearch="+(TIP_SEARCH_DIST*3)/2); // one and a half times the current tip search distance
+ list.add("minhits=1");
+ list.add("minratio=0.25");
+ list.add("rescuemismatches=50");
+ list.add("rescuedist=3000");
+
+ BBIndexAcc.setFractionToExclude(0);
+
+ for(String s : args){if(s!=null){list.add(s);}}
+ args=list.toArray(new String[list.size()]);
+
+ SLOW_ALIGN_PADDING=SLOW_ALIGN_PADDING*2+2;
+ SLOW_RESCUE_PADDING=SLOW_RESCUE_PADDING*2+2;
+
+ AbstractIndex.SLOW=true; // vslow sets both index flags
+ AbstractIndex.VSLOW=true;
+ keyDensity*=2.5f;
+ maxKeyDensity*=2.5f;
+ minKeyDensity*=2.5f;
+ }else if(slow){
+ //TODO: Unfinished
+ ArrayList<String> list=new ArrayList<String>(); // no preset options are added in this branch, so the copy below only removes null entries
+
+ BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*0.4f);
+
+ for(String s : args){if(s!=null){list.add(s);}}
+ args=list.toArray(new String[list.size()]);
+
+ AbstractIndex.SLOW=true;
+ keyDensity*=1.2f;
+ maxKeyDensity*=1.2f;
+ minKeyDensity*=1.2f;
+ }
+ return args;
+ }
+
+ @Override
+ void postparse(String[] args){ // Applies already-parsed option fields to the static settings of this mapper, BBIndexAcc and BBMapThreadAcc.
+
+ if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){ // a positive bandwidth ratio below 0.2 caps the paddings at 4 and 8
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 4);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 8);
+ }
+
+ if(maxIndel1>-1){ // values of -1 or lower are skipped here and in the checks below, so negative presumably means "not set"
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1);
+ BBIndexAcc.MAX_INDEL=maxIndel1;
+ }
+ if(maxIndel2>-1){
+ BBIndexAcc.MAX_INDEL2=maxIndel2;
+ }
+
+ if(minApproxHits>-1){
+ BBIndexAcc.MIN_APPROX_HITS_TO_KEEP=minApproxHits;
+ }
+
+ if(expectedSites>-1){
+ BBMapThreadAcc.setExpectedSites(expectedSites);
+ sysout.println("Set EXPECTED_SITES to "+expectedSites);
+ }
+
+ if(fractionGenomeToExclude>=0){
+ BBIndexAcc.setFractionToExclude(fractionGenomeToExclude);
+ }
+
+ { // positional fallback: the first two arguments may name input files when in1/in2 were not given as key=value
+ final String a=(args.length>0 ? args[0] : null);
+ final String b=(args.length>1 ? args[1] : null);
+ if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} // accepted if it has no '=' and starts with "stdin" or names an existing file
+ if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;} // the second argument must name an existing file
+ if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);} // NOTE(review): OUTPUT_READS is assigned in setup(); whether setup() has already run at this point is not visible here
+ }
+
+ assert(synthReadlen<BBMapThreadAcc.ALIGN_ROWS);
+
+ if(MSA.bandwidth>0){ // a fixed bandwidth caps the search distance, indel limits and paddings by fractions of half the bandwidth
+ int halfwidth=MSA.bandwidth/2;
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2);
+ BBIndexAcc.MAX_INDEL=Tools.min(BBIndexAcc.MAX_INDEL, halfwidth/2);
+ BBIndexAcc.MAX_INDEL2=Tools.min(BBIndexAcc.MAX_INDEL2, halfwidth);
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4); // same cap as the align padding
+ }
+
+ if(PRINT_SECONDARY_ALIGNMENTS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;
+ }
+ // Each ambiguity mode sets the duplicate-removal and early-quit flags and prints one line describing the policy.
+ if(ambigMode==AMBIG_BEST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ if(!PRINT_SECONDARY_ALIGNMENTS){BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=true;} // left at false (set above) when secondary alignments are printed
+ sysout.println("Retaining first best site only for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_ALL){
+ PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true; // chained assignment: both flags become true
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;
+ SamLine.MAKE_NH_TAG=true;
+ ambiguousAll=true;
+ sysout.println("Retaining all best sites for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;
+ ambiguousRandom=true;
+ sysout.println("Choosing a site randomly for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_TOSS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=true; // the only mode that turns duplicate removal on
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Ambiguously mapped reads will be considered unmapped.");
+ }else{
+ throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode);
+ }
+
+ }
+
+ @Override
+ public void setup(){ // Finalizes settings derived from parsed fields: score ratio, XS tag, output flag, minimum read length, build, blacklist, zip level, and index creation from a reference.
+
+ assert(!useRandomReads || maxReads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use.";
+
+ if(minid!=-1){ // -1 is the "not given" value; otherwise the score ratio is derived from the identity
+ MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE);
+ sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO));
+ }
+
+ if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);} // unless set explicitly, the XS tag is on only when the intron limit is below 1e9
+ if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} // xs set explicitly but no intron limit given: use 10
+
+ if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null
+ && outFileB==null && outFileB2==null && splitterOutputs==null && BBSplitter.streamTable==null){ // no output destination of any kind
+ sysout.println("No output file.");
+ OUTPUT_READS=false;
+ }else{
+ OUTPUT_READS=true;
+ if(bamscript!=null){
+ BBSplitter.makeBamScript(bamscript, splitterOutputs, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2);
+ }
+ }
+
+ FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN); // raise the minimum to at least keylen+2
+ assert(FastaReadInputStream.settingsOK());
+
+ if(build<0){throw new RuntimeException("Must specify a build number, e.g. build=1");}
+ else{Data.GENOME_BUILD=build;}
+
+ if(blacklist!=null && blacklist.size()>0){
+ Timer t=new Timer();
+ t.start();
+ for(String s : blacklist){
+ Blacklist.addToBlacklist(s);
+ }
+ t.stop();
+ sysout.println("Created blacklist:\t"+t);
+ t.start(); // NOTE(review): t is local to this block and is not read again after this restart
+ }
+
+ if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;} // overrides the value assigned in setDefaults()
+ if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);}
+ }
+
+
+ @Override
+ void processAmbig2(){ // Sets the duplicate-removal and early-quit flags according to BBSplitter.AMBIGUOUS2_MODE and prints the chosen policy.
+ assert(Data.scaffoldPrefixes) : "Only process this block if there are multiple references.";
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){ // SPLIT, FIRST, RANDOM and ALL set the same two flags and differ only in the message
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to special output streams.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to the first reference's stream only.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=true; // TOSS enables the early quit and leaves REMOVE_DUPLICATE_BEST_ALIGNMENTS as it was
+ sysout.println("Reads that map to multiple references will be considered unmapped.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to a random stream.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to all relevant output streams.");
+ }else{
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; // any other mode value is replaced by FIRST; no flags are changed and nothing is printed
+ }
+ }
+
+ @Override
+ void loadIndex(){ // Selects the genome build, loads the reference and BBIndexAcc, tunes index statics by genome size, optionally builds the coverage pileup, then analyzes the index.
+ Timer t=new Timer(); // reused for every timed phase reported below
+
+ if(build>-1){
+ Data.setGenome(build);
+ AbstractIndex.MINCHROM=1;
+ AbstractIndex.MAXCHROM=Data.numChroms;
+ if(minChrom<0){minChrom=1;} // a negative bound is replaced by the full range
+ if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;} // also clamps a bound beyond the last chromosome
+ sysout.println("Set genome to "+Data.GENOME_BUILD);
+
+ if(RefToIndex.AUTO_CHROMBITS){
+ int maxLength=Tools.max(Data.chromLengths);
+ RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1; // derived from the leading zero bits of the longest chromosome length
+ RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16); // capped at 16
+ }
+ if(RefToIndex.chrombits!=-1){
+ BBIndexAcc.setChromBits(RefToIndex.chrombits);
+ if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);}
+ }
+ }
+
+ assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) :
+ minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM;
+ AbstractIndex.MINCHROM=minChrom;
+ AbstractIndex.MAXCHROM=maxChrom;
+
+ if(targetGenomeSize>0){
+ long bases=Data.numDefinedBases;
+ long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize)); // defined bases per target genome size, offset by 0.25 before rounding, at least 1
+ BBMapThreadAcc.setExpectedSites((int)x);
+ sysout.println("Set EXPECTED_SITES to "+x);
+ }
+
+ assert(!(PERFECTMODE && SEMIPERFECTMODE));
+ if(PERFECTMODE){setPerfectMode();}
+ if(SEMIPERFECTMODE){setSemiperfectMode();}
+
+ //Optional section for discrete timing of chrom array loading
+ if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){
+ sysout.println();
+ if(RefToIndex.chromlist==null){
+ Data.loadChromosomes(minChrom, maxChrom);
+ }else{
+ assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size();
+ for(ChromosomeArray cha : RefToIndex.chromlist){ // reuse the arrays already held in RefToIndex.chromlist instead of loading them
+ Data.chromosomePlusMatrix[cha.chromosome]=cha;
+ }
+ }
+ if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();}
+ t.stop();
+ sysout.println("Loaded Reference:\t"+t);
+ t.start();
+ }
+ RefToIndex.chromlist=null; // the list reference is dropped whether or not it was used above
+
+ t.start();
+ BBIndexAcc.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK); // NOTE(review): the meaning of the two boolean arguments is not visible here - see BBIndexAcc.loadIndex
+
+ { // genomes under 300,000,000 defined bases get larger hit-reduction limits and a smaller excluded fraction
+ long len=Data.numDefinedBases;
+ if(len<300000000){
+ BBIndexAcc.MAX_HITS_REDUCTION2+=1;
+ BBIndexAcc.MAXIMUM_MAX_HITS_REDUCTION+=1;
+ if(len<30000000){
+ BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*0.5f);
+ BBIndexAcc.MAXIMUM_MAX_HITS_REDUCTION+=1; // second increment, applied only below 30,000,000 bases
+ BBIndexAcc.HIT_REDUCTION_DIV=Tools.max(BBIndexAcc.HIT_REDUCTION_DIV-1, 3); // decremented but never below 3
+ }else if(len<100000000){
+ BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*0.6f);
+ }else{
+ BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*0.75f);
+ }
+ }
+ }
+
+ t.stop();
+ sysout.println("Generated Index:\t"+t);
+ t.start();
+
+ if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){ // exact negation of the condition that loads the chromosomes above
+ for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+ Data.unload(chrom, true);
+ }
+ }
+
+ if(ReadWrite.countActiveThreads()>0){
+ ReadWrite.waitForWritingToFinish();
+ t.stop();
+ sysout.println("Finished Writing:\t"+t);
+ t.start();
+ }
+
+ if(coverageBinned!=null || coverageBase!=null || coverageHist!=null || coverageStats!=null || coverageRPKM!=null || normcov!=null || normcovOverall!=null){ // any coverage output requested
+ String[] cvargs=("covhist="+coverageHist+"\tcovstats="+coverageStats+"\tbasecov="+coverageBase+"\tbincov="+coverageBinned+"\tphyscov="+coveragePhysical+
+ "\t32bit="+cov32bit+"\tnzo="+covNzo+"\ttwocolumn="+covTwocolumn+"\tsecondary="+PRINT_SECONDARY_ALIGNMENTS+"\tcovminscaf="+coverageMinScaf+
+ "\tksb="+covKsb+"\tbinsize="+covBinSize+"\tstartcov="+covStartOnly+"\tstrandedcov="+covStranded+"\trpkm="+coverageRPKM+
+ "\tnormcov="+normcov+"\tnormcovo="+normcovOverall+(in1==null ? "" : "\tin1="+in1)+(in2==null ? "" : "\tin2="+in2)+
+ (covSetbs ? ("\tbitset="+covBitset+"\tarrays="+covArrays) : "")).split("\t");
+ pileup=new CoveragePileup(cvargs); // the key=value options are joined with tabs above and split back into an argument array
+ pileup.createDataStructures();
+ pileup.loadScaffoldsFromIndex(minChrom, maxChrom);
+ }
+
+ if(!forceanalyze && (in1==null || maxReads==0)){return;} // no reads to map and analysis not forced: skip analyzeIndex
+
+ BBIndexAcc.analyzeIndex(minChrom, maxChrom, BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE, keylen);
+
+ t.stop();
+ sysout.println("Analyzed Index: \t"+t);
+ t.start();
+ }
+
+ public void testSpeed(String[] args){
+ // Opens the read streams, runs one BBMapThreadAcc per available thread, then closes the streams and prints settings and results.
+ if(in1==null || maxReads==0){ // no primary input, or a read limit of zero: nothing to map
+ sysout.println("No reads to process; quitting.");
+ return;
+ }
+
+ Timer t=new Timer(); // handed to openStreams and, at the end, to printOutput
+
+ final boolean paired=openStreams(t, args);
+ if(paired){BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;} // paired input turns this index shortcut off
+
+ t.start();
+
+ if(Shared.USE_JNI){
+ final int threads=Shared.threads(); // thread count before the memory-based adjustment
+ adjustThreadsforMemory(105); // NOTE(review): 105 here vs. 65 below is presumably a per-thread memory estimate; units are not visible in this file
+ if(Shared.threads()<threads*0.9){ // the adjustment left fewer than 90% of the threads
+ sysout.println("Disabling JNI due to low system memory.");
+ Shared.USE_JNI=false;
+ Shared.setThreads(threads); // restore the count; it is adjusted again below with the non-JNI value
+ }
+ }
+ if(!Shared.USE_JNI){
+ adjustThreadsforMemory(65);
+ }
+
+ AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS;
+ AbstractMapThread[] mtts=new AbstractMapThread[Shared.threads()]; // one mapping thread per configured thread
+ for(int i=0; i<mtts.length; i++){
+ try{
+ mtts[i]=new BBMapThreadAcc(cris, keylen,
+ pileup, SLOW_ALIGN, CORRECT_THRESH, minChrom,
+ maxChrom, keyDensity, maxKeyDensity, minKeyDensity, maxDesiredKeys, REMOVE_DUPLICATE_BEST_ALIGNMENTS,
+ SAVE_AMBIGUOUS_XY, MINIMUM_ALIGNMENT_SCORE_RATIO, TRIM_LIST, MAKE_MATCH_STRING, QUICK_MATCH_STRINGS, rosA, rosM, rosU, rosB,
+ SLOW_ALIGN_PADDING, SLOW_RESCUE_PADDING, OUTPUT_MAPPED_ONLY, DONT_OUTPUT_BLACKLISTED_READS, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS,
+ REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, KILL_BAD_PAIRS, rcompMate,
+ PERFECTMODE, SEMIPERFECTMODE, FORBID_SELF_MAPPING, TIP_SEARCH_DIST,
+ ambiguousRandom, ambiguousAll, KFILTER, IDFILTER, qtrimLeft, qtrimRight, untrim, TRIM_QUALITY, minTrimLength, LOCAL_ALIGN, RESCUE, STRICT_MAX_INDEL, MSA_TYPE);
+ } catch (Exception e) {
+ e.printStackTrace();
+ abort(mtts, "Aborting due to prior error."); // NOTE(review): if abort() can return normally, mtts[i] is still null on the next statement - confirm it throws or exits
+ }
+ mtts[i].idmodulo=idmodulo;
+ if(verbose){
+ mtts[i].verbose=verbose;
+ mtts[i].index().verbose=verbose;
+ }
+ }
+
+ cris.start(); //4567
+ sysout.println("Processing reads in "+(paired ? "paired" : "single")+"-ended mode.");
+ sysout.println("Started read stream.");
+
+ /* The threads are started after initialization to prevent resource competition between initialization and mapping */
+ for(int i=0; i<mtts.length; i++){mtts[i].start();}
+ sysout.println("Started "+mtts.length+" mapping thread"+(mtts.length==1 ? "" : "s")+".");
+
+ final int broken=shutDownThreads(mtts, false); // NOTE(review): presumably waits for the threads and returns how many failed - confirm in AbstractMapper
+
+ sysout.println("\n\n ------------------ Results ------------------ ");
+
+ closeStreams(cris, rosA, rosM, rosU, rosB);
+ sysout.println();
+ printSettings(keylen);
+ printOutput(mtts, t, keylen, paired, false, pileup, scafNzo, sortStats, statsOutputFile);
+ if(broken>0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");} // thrown only after the report above has been printed
+ }
+
+ @Override
+ void setSemiperfectMode() { // Adjusts key settings and the score ratio for semiperfect mode, then forwards to BBIndexAcc.setSemiperfectMode().
+ assert(SEMIPERFECTMODE);
+ if(SEMIPERFECTMODE){ // still guarded, so nothing changes when assertions are disabled and the flag is false
+ TRIM_LIST=false;
+ keyDensity/=2; // key density, its maximum and the desired-key cap are halved
+ maxKeyDensity/=2;
+ minKeyDensity=1.1f; // the minimum is set to a fixed value rather than halved
+ maxDesiredKeys/=2;
+ MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f;
+ BBIndexAcc.setSemiperfectMode();
+ }
+ }
+
+ @Override
+ void setPerfectMode() { // Adjusts key settings and the score ratio for perfect mode, then forwards to BBIndexAcc.setPerfectMode().
+ assert(PERFECTMODE);
+ if(PERFECTMODE){ // still guarded, so nothing changes when assertions are disabled and the flag is false
+ TRIM_LIST=false;
+ keyDensity/=2; // key density, its maximum and the desired-key cap are halved
+ maxKeyDensity/=2;
+ minKeyDensity=1.1f; // the minimum is set to a fixed value rather than halved
+ maxDesiredKeys/=2;
+ MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f; // differs from semiperfect mode, which uses 0.45f
+ BBIndexAcc.setPerfectMode();
+ }
+ }
+
+
+ @Override
+ void printSettings(int k){ // Prints mapper and BBIndexAcc settings to sysout; the amount of detail grows with verbose_stats.
+
+ printSettings0(k, BBIndexAcc.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO); // always called, regardless of verbose_stats
+
+ if(verbose_stats>=2){ // second tier: key and index summary
+ sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")");
+ sysout.println("Max keys: \t"+maxDesiredKeys);
+
+ sysout.println("Block Subsections: \t"+BBIndexAcc.CHROMS_PER_BLOCK);
+ sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndexAcc.REMOVE_FREQUENT_GENOME_FRACTION ? BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE : 0))); // reports 0 when the removal flag is off
+ // sysout.println("ADD_SCORE_Z: \t"+Indexi.ADD_SCORE_Z);
+ sysout.println("Hits To Keep: \t"+BBIndexAcc.MIN_APPROX_HITS_TO_KEEP);
+ }
+
+ if(verbose_stats>=3){ // third tier: each detail value is printed only when its feature flag is on
+ sysout.println("Remove Clumpy: \t"+BBIndexAcc.REMOVE_CLUMPY);
+ if(BBIndexAcc.REMOVE_CLUMPY){
+ sysout.println("CLUMPY_MAX_DIST: \t"+BBIndexAcc.CLUMPY_MAX_DIST);
+ sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndexAcc.CLUMPY_MIN_LENGTH_INDEX);
+ sysout.println("CLUMPY_FRACTION: \t"+BBIndexAcc.CLUMPY_FRACTION);
+ }
+ sysout.println("Remove Long Lists: \t"+BBIndexAcc.TRIM_LONG_HIT_LISTS);
+ if(BBIndexAcc.TRIM_LONG_HIT_LISTS){
+ sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndexAcc.HIT_FRACTION_TO_RETAIN);
+ }
+ sysout.println("Trim By Greedy: \t"+BBIndexAcc.TRIM_BY_GREEDY);
+ sysout.println("Trim By Total Sites: \t"+BBIndexAcc.TRIM_BY_TOTAL_SITE_COUNT);
+ if(BBIndexAcc.TRIM_BY_TOTAL_SITE_COUNT){
+ sysout.println("MAX_AVG_SITES: \t"+BBIndexAcc.MAX_AVERAGE_LIST_TO_SEARCH);
+ sysout.println("MAX_AVG_SITES_2: \t"+BBIndexAcc.MAX_AVERAGE_LIST_TO_SEARCH2);
+ sysout.println("MAX_SHORTEST_SITE: \t"+BBIndexAcc.MAX_SHORTEST_LIST_TO_SEARCH);
+ }
+ sysout.println("Index Min Score: \t"+BBIndexAcc.MIN_SCORE_MULT);
+
+ sysout.println("Dynamic Trim: \t"+BBIndexAcc.DYNAMICALLY_TRIM_LOW_SCORES);
+ if(BBIndexAcc.DYNAMICALLY_TRIM_LOW_SCORES){
+ sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndexAcc.DYNAMIC_SCORE_THRESH);
+ }
+ }
+
+ }
+
+}
diff --git a/current/align2/BBMapPacBio.java b/current/align2/BBMapPacBio.java
new file mode 100755
index 0000000..32645b3
--- /dev/null
+++ b/current/align2/BBMapPacBio.java
@@ -0,0 +1,529 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import jgi.CoveragePileup;
+
+import stream.FastaReadInputStream;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+
+import dna.ChromArrayMaker;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * Based on TestIndex11f
+ *
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public final class BBMapPacBio extends AbstractMapper {
+
+
+ public static void main(String[] args){ // Command-line entry point: build the mapper, load the index if needed, map the reads, print total time.
+ Timer t=new Timer();
+ BBMapPacBio mapper=new BBMapPacBio(args); // the constructor receives the raw argument array
+ args=Tools.condenseStrict(args); // NOTE(review): presumably drops null entries from the array - confirm in Tools
+ if(!INDEX_LOADED){mapper.loadIndex();}
+ if(Data.scaffoldPrefixes){mapper.processAmbig2();} // processAmbig2 asserts this same flag
+ mapper.testSpeed(args);
+ ReadWrite.waitForWritingToFinish();
+ t.stop();
+ sysout.println("\nTotal time: \t"+t);
+ clearStatics();
+ }
+
+ public BBMapPacBio(String[] args){ // Does nothing itself; the arguments are passed straight to the AbstractMapper constructor.
+ super(args);
+ }
+
+ @Override
+ public void setDefaults(){ // Assigns this mapper variant's default settings, including statics on FastaToChromArrays2, ReadWrite, AbstractIndex and Shared.
+ FastaToChromArrays2.MID_PADDING=2000;
+ ReadWrite.ZIPLEVEL=2;
+ MAKE_MATCH_STRING=true;
+ keylen=12;
+ // Default score ratio; setup() replaces it when minid is given.
+ MINIMUM_ALIGNMENT_SCORE_RATIO=0.46f;
+
+ keyDensity=3.5f;//2.3f;
+ maxKeyDensity=4.5f;//4f;
+ minKeyDensity=2.8f;//1.8f;
+ maxDesiredKeys=63;
+
+ SLOW_ALIGN_PADDING=8;
+ SLOW_RESCUE_PADDING=8+SLOW_ALIGN_PADDING; // evaluates to 16 with the value assigned on the previous line
+ TIP_SEARCH_DIST=15;
+
+ MSA_TYPE="MultiStateAligner9PacBio"; // NOTE(review): presumably the name of the aligner class to instantiate - confirm where MSA_TYPE is consumed
+ MAX_SITESCORES_TO_PRINT=100;
+ PRINT_SECONDARY_ALIGNMENTS=false;
+ AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1;
+ Shared.READ_BUFFER_LENGTH=Tools.mid(1, Shared.READ_BUFFER_LENGTH, 20); // NOTE(review): Tools.mid presumably returns the middle of the three values, i.e. clamps the buffer length to 1..20 - confirm in Tools
+ }
+
+ @Override
+ public String[] preparse(String[] args){ // For the fast/vslow/slow presets, returns a new array with preset options placed before the caller's non-null args; otherwise returns args unchanged.
+ if(fast){
+ ArrayList<String> list=new ArrayList<String>(); // preset options go in first, the caller's arguments are appended after them
+ list.add("tipsearch="+TIP_SEARCH_DIST/5); // one fifth of the current tip search distance (integer division)
+// list.add("maxindel=100");
+// list.add("minhits=2");
+ list.add("bwr=0.16");
+// list.add("minratio=0.5");
+// list.add("k=13");
+ list.add("quickmatch=t");
+ list.add("rescuemismatches=15");
+ list.add("rescuedist=800");
+ list.add("maxsites=5");
+ list.add("maxsites2=400");
+
+ BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*1.25f);
+
+ for(String s : args){if(s!=null){list.add(s);}} // NOTE(review): user args come last, presumably so a later parse lets them override the presets - confirm in the parser
+ args=list.toArray(new String[list.size()]);
+
+ keyDensity*=0.9f; // all three densities are scaled down by the same factor
+ maxKeyDensity*=0.9f;
+ minKeyDensity*=0.9f;
+ }else if(vslow){
+ ArrayList<String> list=new ArrayList<String>();
+ list.add("tipsearch="+(TIP_SEARCH_DIST*3)/2); // one and a half times the current tip search distance
+ list.add("minhits=1");
+ list.add("minratio=0.25");
+ list.add("rescuemismatches=50");
+ list.add("rescuedist=3000");
+
+ BBIndexPacBio.setFractionToExclude(0);
+
+ for(String s : args){if(s!=null){list.add(s);}}
+ args=list.toArray(new String[list.size()]);
+
+ SLOW_ALIGN_PADDING=SLOW_ALIGN_PADDING*2+2;
+ SLOW_RESCUE_PADDING=SLOW_RESCUE_PADDING*2+2;
+
+ AbstractIndex.SLOW=true; // vslow sets both index flags
+ AbstractIndex.VSLOW=true;
+ keyDensity*=2.5f;
+ maxKeyDensity*=2.5f;
+ minKeyDensity*=2.5f;
+ }else if(slow){
+ //TODO: Unfinished
+ ArrayList<String> list=new ArrayList<String>(); // no preset options are added in this branch, so the copy below only removes null entries
+
+ BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*0.4f);
+
+ for(String s : args){if(s!=null){list.add(s);}}
+ args=list.toArray(new String[list.size()]);
+
+ AbstractIndex.SLOW=true;
+ keyDensity*=1.2f;
+ maxKeyDensity*=1.2f;
+ minKeyDensity*=1.2f;
+ }
+ return args;
+ }
+
+ @Override
+ void postparse(String[] args){ // Applies already-parsed option fields to the static settings of this mapper, BBIndexPacBio and BBMapThreadPacBio.
+
+ if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){ // a positive bandwidth ratio below 0.2 caps the paddings at 5 and 10
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 5);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 10);
+ }
+
+ if(maxIndel1>-1){ // values of -1 or lower are skipped here and in the checks below, so negative presumably means "not set"
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1);
+ BBIndexPacBio.MAX_INDEL=maxIndel1;
+ }
+ if(maxIndel2>-1){
+ BBIndexPacBio.MAX_INDEL2=maxIndel2;
+ }
+
+ if(minApproxHits>-1){
+ BBIndexPacBio.MIN_APPROX_HITS_TO_KEEP=minApproxHits;
+ }
+
+ if(expectedSites>-1){
+ BBMapThreadPacBio.setExpectedSites(expectedSites);
+ sysout.println("Set EXPECTED_SITES to "+expectedSites);
+ }
+
+ if(fractionGenomeToExclude>=0){
+ BBIndexPacBio.setFractionToExclude(fractionGenomeToExclude);
+ }
+
+ { // positional fallback: the first two arguments may name input files when in1/in2 were not given as key=value
+ final String a=(args.length>0 ? args[0] : null);
+ final String b=(args.length>1 ? args[1] : null);
+ if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} // accepted if it has no '=' and starts with "stdin" or names an existing file
+ if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;} // the second argument must name an existing file
+ if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);} // NOTE(review): OUTPUT_READS is assigned in setup(); whether setup() has already run at this point is not visible here
+ }
+
+ assert(synthReadlen<BBMapThreadPacBio.ALIGN_ROWS);
+
+ if(MSA.bandwidth>0){ // a fixed bandwidth caps the search distance, indel limits and paddings by fractions of half the bandwidth
+ int halfwidth=MSA.bandwidth/2;
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2);
+ BBIndexPacBio.MAX_INDEL=Tools.min(BBIndexPacBio.MAX_INDEL, halfwidth/2);
+ BBIndexPacBio.MAX_INDEL2=Tools.min(BBIndexPacBio.MAX_INDEL2, halfwidth);
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4); // same cap as the align padding
+ }
+
+ if(PRINT_SECONDARY_ALIGNMENTS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;
+ }
+ // Each ambiguity mode sets the duplicate-removal and early-quit flags and prints one line describing the policy.
+ if(ambigMode==AMBIG_BEST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ if(!PRINT_SECONDARY_ALIGNMENTS){BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=true;} // left at false (set above) when secondary alignments are printed
+ sysout.println("Retaining first best site only for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_ALL){
+ PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true; // chained assignment: both flags become true
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;
+ SamLine.MAKE_NH_TAG=true;
+ ambiguousAll=true;
+ sysout.println("Retaining all best sites for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;
+ ambiguousRandom=true;
+ sysout.println("Choosing a site randomly for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_TOSS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=true; // the only mode that turns duplicate removal on
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Ambiguously mapped reads will be considered unmapped.");
+ }else{
+ throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode);
+ }
+
+ }
+
+ @Override
+ public void setup(){ // Finalizes settings derived from parsed fields: score ratio, XS tag, output flag, minimum read length, build, blacklist, zip level, and index creation from a reference.
+
+ assert(!useRandomReads || maxReads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use.";
+
+ if(minid!=-1){ // -1 is the "not given" value; otherwise the score ratio is derived from the identity
+ MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE);
+ sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO));
+ }
+
+ if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);} // unless set explicitly, the XS tag is on only when the intron limit is below 1e9
+ if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} // xs set explicitly but no intron limit given: use 10
+
+ if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null
+ && outFileB==null && outFileB2==null && splitterOutputs==null && BBSplitter.streamTable==null){ // no output destination of any kind
+ sysout.println("No output file.");
+ OUTPUT_READS=false;
+ }else{
+ OUTPUT_READS=true;
+ if(bamscript!=null){
+ BBSplitter.makeBamScript(bamscript, splitterOutputs, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2);
+ }
+ }
+
+ FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN); // raise the minimum to at least keylen+2
+ assert(FastaReadInputStream.settingsOK());
+
+ if(build<0){throw new RuntimeException("Must specify a build number, e.g. build=1");}
+ else{Data.GENOME_BUILD=build;}
+
+ if(blacklist!=null && blacklist.size()>0){
+ Timer t=new Timer();
+ t.start();
+ for(String s : blacklist){
+ Blacklist.addToBlacklist(s);
+ }
+ t.stop();
+ sysout.println("Created blacklist:\t"+t);
+ t.start(); // NOTE(review): t is local to this block and is not read again after this restart
+ }
+
+ if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;} // overrides the value assigned in setDefaults()
+ if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);}
+ }
+
+
+ @Override
+ void processAmbig2(){ // Sets the duplicate-removal and early-quit flags according to BBSplitter.AMBIGUOUS2_MODE and prints the chosen policy.
+ assert(Data.scaffoldPrefixes) : "Only process this block if there are multiple references.";
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){ // SPLIT, FIRST, RANDOM and ALL set the same two flags and differ only in the message
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to special output streams.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to the first reference's stream only.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=true; // TOSS enables the early quit and leaves REMOVE_DUPLICATE_BEST_ALIGNMENTS as it was
+ sysout.println("Reads that map to multiple references will be considered unmapped.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to a random stream.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+ BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to all relevant output streams.");
+ }else{
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; // any other mode value is replaced by FIRST; no flags are changed and nothing is printed
+ }
+ }
+
+ @Override
+ void loadIndex(){ //Loads the reference and index for the chosen build, tunes index statics by genome size, sets up optional coverage tracking, then analyzes the index
+ Timer t=new Timer();
+ //Select the genome build, default the chromosome range, and choose CHROMBITS
+ if(build>-1){
+ Data.setGenome(build);
+ AbstractIndex.MINCHROM=1;
+ AbstractIndex.MAXCHROM=Data.numChroms;
+ if(minChrom<0){minChrom=1;}
+ if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;}
+ sysout.println("Set genome to "+Data.GENOME_BUILD);
+ //Auto mode: chrombits = (leading zero bits of Tools.max(Data.chromLengths)) - 1, capped at 16
+ if(RefToIndex.AUTO_CHROMBITS){
+ int maxLength=Tools.max(Data.chromLengths);
+ RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1;
+ RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16);
+ }
+ if(RefToIndex.chrombits!=-1){ //A value of -1 skips the setChromBits call entirely
+ BBIndexPacBio.setChromBits(RefToIndex.chrombits);
+ if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);}
+ }
+ }
+ //The requested range must lie inside the current index bounds before it replaces them
+ assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) :
+ minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM;
+ AbstractIndex.MINCHROM=minChrom;
+ AbstractIndex.MAXCHROM=maxChrom;
+ //With a target genome size, expected sites = max(1, round(0.25 + definedBases/targetGenomeSize))
+ if(targetGenomeSize>0){
+ long bases=Data.numDefinedBases;
+ long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize));
+ BBMapThreadPacBio.setExpectedSites((int)x);
+ sysout.println("Set EXPECTED_SITES to "+x);
+ }
+ //Perfect and semiperfect modes are mutually exclusive; apply whichever one is enabled
+ assert(!(PERFECTMODE && SEMIPERFECTMODE));
+ if(PERFECTMODE){setPerfectMode();}
+ if(SEMIPERFECTMODE){setSemiperfectMode();}
+
+ //Optional section for discrete timing of chrom array loading
+ if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){
+ sysout.println();
+ if(RefToIndex.chromlist==null){
+ Data.loadChromosomes(minChrom, maxChrom);
+ }else{ //Chromosomes already held in RefToIndex.chromlist are installed directly into Data
+ assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size();
+ for(ChromosomeArray cha : RefToIndex.chromlist){
+ Data.chromosomePlusMatrix[cha.chromosome]=cha;
+ }
+ }
+ if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();}
+ t.stop();
+ sysout.println("Loaded Reference:\t"+t);
+ t.start();
+ }
+ RefToIndex.chromlist=null; //The list reference is dropped whether or not it was used above
+ //Load the index for the selected chromosome range (reported below as "Generated Index")
+ t.start();
+ BBIndexPacBio.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK);
+ //References under 300M defined bases raise the hit-reduction limits and shrink the excluded genome fraction (x0.5 / x0.6 / x0.75 by size tier)
+ {
+ long len=Data.numDefinedBases;
+ if(len<300000000){
+ BBIndexPacBio.MAX_HITS_REDUCTION2+=1;
+ BBIndexPacBio.MAXIMUM_MAX_HITS_REDUCTION+=1;
+ if(len<30000000){
+ BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*0.5f);
+ BBIndexPacBio.MAXIMUM_MAX_HITS_REDUCTION+=1;
+ BBIndexPacBio.HIT_REDUCTION_DIV=Tools.max(BBIndexPacBio.HIT_REDUCTION_DIV-1, 3);
+ }else if(len<100000000){
+ BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*0.6f);
+ }else{
+ BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*0.75f);
+ }
+ }
+ }
+
+ t.stop();
+ sysout.println("Generated Index:\t"+t);
+ t.start();
+ //Unload chromosome arrays when none of the features that required loading them above is enabled
+ if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){
+ for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+ Data.unload(chrom, true);
+ }
+ }
+ //Wait for any writer threads that are still active before continuing
+ if(ReadWrite.countActiveThreads()>0){
+ ReadWrite.waitForWritingToFinish();
+ t.stop();
+ sysout.println("Finished Writing:\t"+t);
+ t.start();
+ }
+ //If any coverage output was requested, build a CoveragePileup from tab-separated key=value arguments
+ if(coverageBinned!=null || coverageBase!=null || coverageHist!=null || coverageStats!=null || coverageRPKM!=null || normcov!=null || normcovOverall!=null){
+ String[] cvargs=("covhist="+coverageHist+"\tcovstats="+coverageStats+"\tbasecov="+coverageBase+"\tbincov="+coverageBinned+"\tphyscov="+coveragePhysical+
+ "\t32bit="+cov32bit+"\tnzo="+covNzo+"\ttwocolumn="+covTwocolumn+"\tsecondary="+PRINT_SECONDARY_ALIGNMENTS+"\tcovminscaf="+coverageMinScaf+
+ "\tksb="+covKsb+"\tbinsize="+covBinSize+"\tstartcov="+covStartOnly+"\tstrandedcov="+covStranded+"\trpkm="+coverageRPKM+
+ "\tnormcov="+normcov+"\tnormcovo="+normcovOverall+(in1==null ? "" : "\tin1="+in1)+(in2==null ? "" : "\tin2="+in2)+
+ (covSetbs ? ("\tbitset="+covBitset+"\tarrays="+covArrays) : "")).split("\t");
+ pileup=new CoveragePileup(cvargs);
+ pileup.createDataStructures();
+ pileup.loadScaffoldsFromIndex(minChrom, maxChrom);
+ }
+ //Index analysis is skipped when there is no input or maxReads==0, unless forceanalyze is set
+ if(!forceanalyze && (in1==null || maxReads==0)){return;}
+
+ BBIndexPacBio.analyzeIndex(minChrom, maxChrom, BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE, keylen);
+
+ t.stop();
+ sysout.println("Analyzed Index: \t"+t);
+ t.start();
+ }
+
+ public void testSpeed(String[] args){ //Maps the input reads with a pool of mapping threads, then prints settings and results; throws RuntimeException if any thread broke or errorState is set
+ //Nothing to map without an input file or with maxReads==0
+ if(in1==null || maxReads==0){
+ sysout.println("No reads to process; quitting.");
+ return;
+ }
+
+ Timer t=new Timer();
+ //Open the read streams; paired input forces QUIT_AFTER_TWO_PERFECTS off
+ final boolean paired=openStreams(t, args);
+ if(paired){BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;}
+
+ t.start();
+
+ adjustThreadsforMemory(680); //NOTE(review): 680 is presumably a per-thread memory estimate - confirm meaning and units
+ //Build one mapping thread per Shared.threads(); none is started until all are constructed
+ AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS;
+ AbstractMapThread[] mtts=new AbstractMapThread[Shared.threads()];
+ for(int i=0; i<mtts.length; i++){
+ try {
+ mtts[i]=new BBMapThreadPacBio(cris, keylen,
+ pileup, SLOW_ALIGN, CORRECT_THRESH, minChrom,
+ maxChrom, keyDensity, maxKeyDensity, minKeyDensity, maxDesiredKeys, REMOVE_DUPLICATE_BEST_ALIGNMENTS,
+ SAVE_AMBIGUOUS_XY, MINIMUM_ALIGNMENT_SCORE_RATIO, TRIM_LIST, MAKE_MATCH_STRING, QUICK_MATCH_STRINGS, rosA, rosM, rosU, rosB,
+ SLOW_ALIGN_PADDING, SLOW_RESCUE_PADDING, OUTPUT_MAPPED_ONLY, DONT_OUTPUT_BLACKLISTED_READS, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS,
+ REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, KILL_BAD_PAIRS, rcompMate,
+ PERFECTMODE, SEMIPERFECTMODE, FORBID_SELF_MAPPING, TIP_SEARCH_DIST,
+ ambiguousRandom, ambiguousAll, KFILTER, IDFILTER, qtrimLeft, qtrimRight, untrim, TRIM_QUALITY, minTrimLength, LOCAL_ALIGN, RESCUE, STRICT_MAX_INDEL, MSA_TYPE);
+ } catch (Exception e) {
+ e.printStackTrace();
+ abort(mtts, "Aborting due to prior error."); //NOTE(review): mtts[i] is still null here; the next statement relies on abort not returning normally - confirm
+ }
+ mtts[i].idmodulo=idmodulo;
+ if(verbose){
+ mtts[i].verbose=verbose;
+ mtts[i].index().verbose=verbose;
+ }
+ }
+ //Start the input stream first, then the mapping threads
+ cris.start(); //4567
+ sysout.println("Processing reads in "+(paired ? "paired" : "single")+"-ended mode.");
+ sysout.println("Started read stream.");
+
+ /* The threads are started after initialization to prevent resource competition between initialization and mapping */
+ for(int i=0; i<mtts.length; i++){mtts[i].start();}
+ sysout.println("Started "+mtts.length+" mapping thread"+(mtts.length==1 ? "" : "s")+".");
+ //A nonzero return from shutDownThreads is treated as failure in the final check below
+ final int broken=shutDownThreads(mtts, false);
+
+ sysout.println("\n\n ------------------ Results ------------------ ");
+
+ closeStreams(cris, rosA, rosM, rosU, rosB);
+ sysout.println();
+ printSettings(keylen);
+ printOutput(mtts, t, keylen, paired, false, pileup, scafNzo, sortStats, statsOutputFile);
+ if(broken>0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");}
+ }
+
+	@Override
+	void setSemiperfectMode() {
+		assert(SEMIPERFECTMODE);
+		if(!SEMIPERFECTMODE){return;} //Only reachable with assertions disabled
+		//Halve the key-density targets and the key cap, pin the minimum density, then configure the index.
+		minKeyDensity=1.1f;
+		keyDensity/=2;
+		maxKeyDensity/=2;
+		maxDesiredKeys/=2;
+		TRIM_LIST=false;
+		MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f;
+		BBIndexPacBio.setSemiperfectMode();
+	}
+
+	@Override
+	void setPerfectMode() {
+		assert(PERFECTMODE);
+		if(!PERFECTMODE){return;} //Only reachable with assertions disabled
+		//Halve the key-density targets and the key cap, pin the minimum density, and require a score ratio of 1.0.
+		minKeyDensity=1.1f;
+		keyDensity/=2;
+		maxKeyDensity/=2;
+		maxDesiredKeys/=2;
+		TRIM_LIST=false;
+		MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f;
+		BBIndexPacBio.setPerfectMode();
+	}
+
+
+ @Override
+ void printSettings(int k){
+
+ printSettings0(k, BBIndexPacBio.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO);
+
+ if(verbose_stats>=2){
+ sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")");
+ sysout.println("Max keys: \t"+maxDesiredKeys);
+
+ sysout.println("Block Subsections: \t"+BBIndexPacBio.CHROMS_PER_BLOCK);
+ sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndexPacBio.REMOVE_FREQUENT_GENOME_FRACTION ? BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE : 0)));
+ // sysout.println("ADD_SCORE_Z: \t"+IndexPacBio.ADD_SCORE_Z);
+ sysout.println("Hits To Keep: \t"+BBIndexPacBio.MIN_APPROX_HITS_TO_KEEP);
+ }
+
+ if(verbose_stats>=3){
+ sysout.println("Remove Clumpy: \t"+BBIndexPacBio.REMOVE_CLUMPY);
+ if(BBIndexPacBio.REMOVE_CLUMPY){
+ sysout.println("CLUMPY_MAX_DIST: \t"+BBIndexPacBio.CLUMPY_MAX_DIST);
+ sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndexPacBio.CLUMPY_MIN_LENGTH_INDEX);
+ sysout.println("CLUMPY_FRACTION: \t"+BBIndexPacBio.CLUMPY_FRACTION);
+ }
+ sysout.println("Remove Long Lists: \t"+BBIndexPacBio.TRIM_LONG_HIT_LISTS);
+ if(BBIndexPacBio.TRIM_LONG_HIT_LISTS){
+ sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndexPacBio.HIT_FRACTION_TO_RETAIN);
+ }
+ sysout.println("Trim By Greedy: \t"+BBIndexPacBio.TRIM_BY_GREEDY);
+ sysout.println("Trim By Total Sites: \t"+BBIndexPacBio.TRIM_BY_TOTAL_SITE_COUNT);
+ if(BBIndexPacBio.TRIM_BY_TOTAL_SITE_COUNT){
+ sysout.println("MAX_AVG_SITES: \t"+BBIndexPacBio.MAX_AVERAGE_LIST_TO_SEARCH);
+ sysout.println("MAX_AVG_SITES_2: \t"+BBIndexPacBio.MAX_AVERAGE_LIST_TO_SEARCH2);
+ sysout.println("MAX_SHORTEST_SITE: \t"+BBIndexPacBio.MAX_SHORTEST_LIST_TO_SEARCH);
+ }
+ sysout.println("Index Min Score: \t"+BBIndexPacBio.MIN_SCORE_MULT);
+
+ sysout.println("Dynamic Trim: \t"+BBIndexPacBio.DYNAMICALLY_TRIM_LOW_SCORES);
+ if(BBIndexPacBio.DYNAMICALLY_TRIM_LOW_SCORES){
+ sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndexPacBio.DYNAMIC_SCORE_THRESH);
+ }
+ }
+
+ }
+
+}
diff --git a/current/align2/BBMapPacBioSkimmer.java b/current/align2/BBMapPacBioSkimmer.java
new file mode 100755
index 0000000..756c0b4
--- /dev/null
+++ b/current/align2/BBMapPacBioSkimmer.java
@@ -0,0 +1,527 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import jgi.CoveragePileup;
+
+import stream.FastaReadInputStream;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * Based on TestIndex11f
+ * Designed to skim and retain all sites above a threshold.
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public final class BBMapPacBioSkimmer extends AbstractMapper {
+
+
+	public static void main(String[] args){
+		final Timer total=new Timer(); //Reported as "Total time" once everything has finished
+		final BBMapPacBioSkimmer skimmer=new BBMapPacBioSkimmer(args);
+		final String[] condensed=Tools.condenseStrict(args); //The mapper is built from the raw args; mapping gets the condensed ones
+		if(!INDEX_LOADED){skimmer.loadIndex();}
+		if(Data.scaffoldPrefixes){skimmer.processAmbig2();}
+		skimmer.testSpeed(condensed);
+		ReadWrite.waitForWritingToFinish();
+		total.stop();
+		sysout.println("\nTotal time: \t"+total);
+		clearStatics();
+	}
+
+ public BBMapPacBioSkimmer(String[] args){ //Passes the raw command-line arguments straight to the AbstractMapper constructor
+ super(args);
+ }
+
+ @Override
+ public void setDefaults(){ //Assigns this mapper's default tuning values to inherited fields and to a few statics in other classes
+ FastaToChromArrays2.MID_PADDING=2000;
+ ReadWrite.ZIPLEVEL=2;
+ MAKE_MATCH_STRING=true;
+ keylen=12;
+ //Default minimum alignment score ratio for this mapper
+ MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f;
+ //Key density settings (see the per-line notes)
+ keyDensity=3.3f;//2.3f; //Normal key density
+ maxKeyDensity=4.3f;//4f; //For situations where some of the read is too low quality, this is the max for the rest of the read.
+ minKeyDensity=1.8f;//1.8f;
+ maxDesiredKeys=63; //Don't go above this number of keys except to maintain minKeyDensity.
+ //Rescue padding is derived from the align padding (8+8=16), so the order of the next two lines matters
+ SLOW_ALIGN_PADDING=8;
+ SLOW_RESCUE_PADDING=8+SLOW_ALIGN_PADDING;
+ TIP_SEARCH_DIST=15;
+ //Aligner class name plus reporting defaults: secondary alignments on, MAX_SITESCORES_TO_PRINT of 500
+ MSA_TYPE="MultiStateAligner9PacBio";
+ MAX_SITESCORES_TO_PRINT=500;
+ PRINT_SECONDARY_ALIGNMENTS=true;
+ AbstractIndex.MIN_APPROX_HITS_TO_KEEP=2;
+ //ambiguousAll is switched on by default in this mapper
+ ambiguousAll=true;
+ }
+
+ @Override
+ public String[] preparse(String[] args){ //Prepends preset key=value arguments and rescales tuning values for the fast/vslow/slow flags (checked in that order); returns the resulting argument array
+ if(fast){
+ ArrayList<String> list=new ArrayList<String>();
+ list.add("tipsearch="+TIP_SEARCH_DIST/5);
+// list.add("maxindel=100");
+// list.add("minhits=2");
+ list.add("bwr=0.16");
+// list.add("minratio=0.5");
+// list.add("k=13");
+ list.add("quickmatch=t");
+ list.add("rescuemismatches=15");
+ list.add("rescuedist=800");
+
+// BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*1.25f);
+ //Presets go first and the caller's non-null arguments follow (NOTE(review): presumably so explicit arguments win when parsed later - confirm)
+ for(String s : args){if(s!=null){list.add(s);}}
+ args=list.toArray(new String[list.size()]);
+ //fast: key densities are reduced by 10%
+ keyDensity*=0.9f;
+ maxKeyDensity*=0.9f;
+ minKeyDensity*=0.9f;
+ }else if(vslow){
+ ArrayList<String> list=new ArrayList<String>();
+ list.add("tipsearch="+(TIP_SEARCH_DIST*3)/2);
+ list.add("minhits=1");
+ list.add("minratio=0.25");
+ list.add("rescuemismatches=50");
+ list.add("rescuedist=3000");
+ //vslow: the genome fraction to exclude is set to 0
+ BBIndexPacBioSkimmer.setFractionToExclude(0);
+ //Presets first, then the caller's non-null arguments
+ for(String s : args){if(s!=null){list.add(s);}}
+ args=list.toArray(new String[list.size()]);
+ //vslow: both paddings become 2x+2 of their current values
+ SLOW_ALIGN_PADDING=SLOW_ALIGN_PADDING*2+2;
+ SLOW_RESCUE_PADDING=SLOW_RESCUE_PADDING*2+2;
+ //vslow: set both the SLOW and VSLOW index flags and multiply key densities by 2.5
+ AbstractIndex.SLOW=true;
+ AbstractIndex.VSLOW=true;
+ keyDensity*=2.5f;
+ maxKeyDensity*=2.5f;
+ minKeyDensity*=2.5f;
+ }else if(slow){
+ //TODO: Unfinished
+ ArrayList<String> list=new ArrayList<String>();
+ //slow: the excluded genome fraction is scaled to 40% of its current value; no preset arguments are added
+ BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*0.4f);
+ //The copy only strips null entries from the caller's arguments
+ for(String s : args){if(s!=null){list.add(s);}}
+ args=list.toArray(new String[list.size()]);
+ //slow: set the SLOW index flag and raise key densities by 20%
+ AbstractIndex.SLOW=true;
+ keyDensity*=1.2f;
+ maxKeyDensity*=1.2f;
+ minKeyDensity*=1.2f;
+ }
+ return args;
+ }
+
+ @Override
+ void postparse(String[] args){ //Pushes parsed settings into the skimmer index/thread statics, infers positional input files, and resolves the ambiguous-mapping mode
+ //A bandwidth ratio strictly between 0 and 0.2 caps the paddings at 5 and 10
+ if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 5);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 10);
+ }
+ //Overrides of -1 or below are skipped, leaving the index/thread statics as they are
+ if(maxIndel1>-1){
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1);
+ BBIndexPacBioSkimmer.MAX_INDEL=maxIndel1;
+ }
+ if(maxIndel2>-1){
+ BBIndexPacBioSkimmer.MAX_INDEL2=maxIndel2;
+ }
+
+ if(minApproxHits>-1){
+ BBIndexPacBioSkimmer.MIN_APPROX_HITS_TO_KEEP=minApproxHits;
+ }
+
+ if(expectedSites>-1){
+ BBMapThreadPacBioSkimmer.setExpectedSites(expectedSites);
+ sysout.println("Set EXPECTED_SITES to "+expectedSites);
+ }
+
+ if(fractionGenomeToExclude>=0){
+ BBIndexPacBioSkimmer.setFractionToExclude(fractionGenomeToExclude);
+ }
+ //Use the first two positional args as in1/in2 when unset: they must contain no '=' and name an existing file (in1 may also start with "stdin")
+ {
+ final String a=(args.length>0 ? args[0] : null);
+ final String b=(args.length>1 ? args[1] : null);
+ if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;}
+ if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;}
+ if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);}
+ }
+
+ assert(synthReadlen<BBMapThreadPacBioSkimmer.ALIGN_ROWS); //Synthetic read length must be below the thread's ALIGN_ROWS
+ //A fixed bandwidth caps search distance, indel limits and paddings at fractions of its half-width
+ if(MSA.bandwidth>0){
+ int halfwidth=MSA.bandwidth/2;
+ TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2);
+ BBIndexPacBioSkimmer.MAX_INDEL=Tools.min(BBIndexPacBioSkimmer.MAX_INDEL, halfwidth/2);
+ BBIndexPacBioSkimmer.MAX_INDEL2=Tools.min(BBIndexPacBioSkimmer.MAX_INDEL2, halfwidth);
+ SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4);
+ SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4);
+ }
+ //When secondary alignments are printed, duplicate best alignments are not removed
+ if(PRINT_SECONDARY_ALIGNMENTS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;
+ }
+ //Apply the ambiguous-mapping mode; an unrecognized mode is a fatal error
+ if(ambigMode==AMBIG_BEST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// if(!PRINT_SECONDARY_ALIGNMENTS){BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=true;}
+ sysout.println("Retaining first best site only for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_ALL){
+ PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true;
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;
+ SamLine.MAKE_NH_TAG=true;
+ ambiguousAll=true;
+ sysout.println("Retaining all best sites for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;
+ ambiguousRandom=true;
+ sysout.println("Choosing a site randomly for ambiguous mappings.");
+ }else if(ambigMode==AMBIG_TOSS){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=true;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Ambiguously mapped reads will be considered unmapped.");
+ }else{
+ throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode);
+ }
+
+ }
+
+ @Override
+ public void setup(){ //Validates options and finalizes global settings: score ratio, SAM tag flags, output flag, build number, blacklist, zip level and reference index
+
+ assert(!useRandomReads || maxReads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use.";
+ //An explicit minid replaces the minimum score ratio via MSA.minIdToMinRatio
+ if(minid!=-1){
+ MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE);
+ sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO));
+ }
+ //Without an explicit xs setting, XS tags are made only when INTRON_LIMIT is below 1000000000; xs without an intron setting uses limit 10
+ if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);}
+ if(setxs && !setintron){SamLine.INTRON_LIMIT=10;}
+ //Reads are output only if at least one output destination is configured
+ if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null
+ && outFileB==null && outFileB2==null && splitterOutputs==null && BBSplitter.streamTable==null){
+ sysout.println("No output file.");
+ OUTPUT_READS=false;
+ }else{
+ OUTPUT_READS=true;
+ if(bamscript!=null){
+ BBSplitter.makeBamScript(bamscript, splitterOutputs, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2);
+ }
+ }
+ //Raise the minimum FASTA read length to at least keylen+2
+ FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN);
+ assert(FastaReadInputStream.settingsOK());
+ //A non-negative build number is mandatory
+ if(build<0){throw new RuntimeException("Must specify a build number, e.g. build=1");}
+ else{Data.GENOME_BUILD=build;}
+ //Add any blacklist entries and report the time taken
+ if(blacklist!=null && blacklist.size()>0){
+ Timer t=new Timer();
+ t.start();
+ for(String s : blacklist){
+ Blacklist.addToBlacklist(s);
+ }
+ t.stop();
+ sysout.println("Created blacklist:\t"+t);
+ t.start();
+ }
+ //Apply an explicit zip level, and call RefToIndex.makeIndex when a reference was given
+ if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;}
+ if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);}
+ }
+
+
+ @Override
+ void processAmbig2(){
+ assert(Data.scaffoldPrefixes) : "Only process this block if there are multiple references.";
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to special output streams.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to the first reference's stream only.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=true;
+ sysout.println("Reads that map to multiple references will be considered unmapped.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to a random stream.");
+ }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){
+ REMOVE_DUPLICATE_BEST_ALIGNMENTS=false;
+// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;
+ sysout.println("Reads that map to multiple references will be written to all relevant output streams.");
+ }else{
+ BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST;
+ }
+ }
+
+ @Override
+ void loadIndex(){ //Loads the reference and index for the chosen build, tunes the excluded genome fraction by genome size, sets up optional coverage tracking, then analyzes the index
+ Timer t=new Timer();
+ //Select the genome build, default the chromosome range, and choose CHROMBITS
+ if(build>-1){
+ Data.setGenome(build);
+ AbstractIndex.MINCHROM=1;
+ AbstractIndex.MAXCHROM=Data.numChroms;
+ if(minChrom<0){minChrom=1;}
+ if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;}
+ sysout.println("Set genome to "+Data.GENOME_BUILD);
+ //Auto mode: chrombits = (leading zero bits of Tools.max(Data.chromLengths)) - 1, capped at 16
+ if(RefToIndex.AUTO_CHROMBITS){
+ int maxLength=Tools.max(Data.chromLengths);
+ RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1;
+ RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16);
+ }
+ if(RefToIndex.chrombits!=-1){ //A value of -1 skips the setChromBits call entirely
+ BBIndexPacBioSkimmer.setChromBits(RefToIndex.chrombits);
+ if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);}
+ }
+ }
+ //The requested range must lie inside the current index bounds before it replaces them
+ assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) :
+ minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM;
+ AbstractIndex.MINCHROM=minChrom;
+ AbstractIndex.MAXCHROM=maxChrom;
+ //With a target genome size, expected sites = max(1, round(0.25 + definedBases/targetGenomeSize))
+ if(targetGenomeSize>0){
+ long bases=Data.numDefinedBases;
+ long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize));
+ BBMapThreadPacBioSkimmer.setExpectedSites((int)x);
+ sysout.println("Set EXPECTED_SITES to "+x);
+ }
+ //Perfect and semiperfect modes are mutually exclusive; apply whichever one is enabled
+ assert(!(PERFECTMODE && SEMIPERFECTMODE));
+ if(PERFECTMODE){setPerfectMode();}
+ if(SEMIPERFECTMODE){setSemiperfectMode();}
+
+ //Optional section for discrete timing of chrom array loading
+ if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){
+ sysout.println();
+ if(RefToIndex.chromlist==null){
+ Data.loadChromosomes(minChrom, maxChrom);
+ }else{ //Chromosomes already held in RefToIndex.chromlist are installed directly into Data
+ assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size();
+ for(ChromosomeArray cha : RefToIndex.chromlist){
+ Data.chromosomePlusMatrix[cha.chromosome]=cha;
+ }
+ }
+ if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();}
+ t.stop();
+ sysout.println("Loaded Reference:\t"+t);
+ t.start();
+ }
+ RefToIndex.chromlist=null; //The list reference is dropped whether or not it was used above
+ //Load the index for the selected chromosome range (reported below as "Generated Index")
+ t.start();
+ BBIndexPacBioSkimmer.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK);
+ //References under 300M defined bases shrink the excluded genome fraction (x0.5 / x0.6 / x0.75 by size tier); the hit-reduction tweaks are commented out here
+ {
+ long len=Data.numDefinedBases;
+ if(len<300000000){
+// BBIndexPacBioSkimmer.MAX_HITS_REDUCTION2+=1;
+// BBIndexPacBioSkimmer.MAXIMUM_MAX_HITS_REDUCTION+=1;
+ if(len<30000000){
+ BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*0.5f);
+// BBIndexPacBioSkimmer.MAXIMUM_MAX_HITS_REDUCTION+=1;
+// BBIndexPacBioSkimmer.HIT_REDUCTION_DIV=Tools.max(BBIndexPacBioSkimmer.HIT_REDUCTION_DIV-1, 3);
+ }else if(len<100000000){
+ BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*0.6f);
+ }else{
+ BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*0.75f);
+ }
+ }
+ }
+
+ t.stop();
+ sysout.println("Generated Index:\t"+t);
+ t.start();
+ //Unload chromosome arrays when none of the features that required loading them above is enabled
+ if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){
+ for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+ Data.unload(chrom, true);
+ }
+ }
+ //Wait for any writer threads that are still active before continuing
+ if(ReadWrite.countActiveThreads()>0){
+ ReadWrite.waitForWritingToFinish();
+ t.stop();
+ sysout.println("Finished Writing:\t"+t);
+ t.start();
+ }
+ //If any coverage output was requested, build a CoveragePileup from tab-separated key=value arguments
+ if(coverageBinned!=null || coverageBase!=null || coverageHist!=null || coverageStats!=null || coverageRPKM!=null || normcov!=null || normcovOverall!=null){
+ String[] cvargs=("covhist="+coverageHist+"\tcovstats="+coverageStats+"\tbasecov="+coverageBase+"\tbincov="+coverageBinned+"\tphyscov="+coveragePhysical+
+ "\t32bit="+cov32bit+"\tnzo="+covNzo+"\ttwocolumn="+covTwocolumn+"\tsecondary="+PRINT_SECONDARY_ALIGNMENTS+"\tcovminscaf="+coverageMinScaf+
+ "\tksb="+covKsb+"\tbinsize="+covBinSize+"\tstartcov="+covStartOnly+"\tstrandedcov="+covStranded+"\trpkm="+coverageRPKM+
+ "\tnormcov="+normcov+"\tnormcovo="+normcovOverall+(in1==null ? "" : "\tin1="+in1)+(in2==null ? "" : "\tin2="+in2)+
+ (covSetbs ? ("\tbitset="+covBitset+"\tarrays="+covArrays) : "")).split("\t");
+ pileup=new CoveragePileup(cvargs);
+ pileup.createDataStructures();
+ pileup.loadScaffoldsFromIndex(minChrom, maxChrom);
+ }
+ //Index analysis is skipped when there is no input or maxReads==0, unless forceanalyze is set
+ if(!forceanalyze && (in1==null || maxReads==0)){return;}
+
+ BBIndexPacBioSkimmer.analyzeIndex(minChrom, maxChrom, BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE, keylen);
+
+ t.stop();
+ sysout.println("Analyzed Index: \t"+t);
+ t.start();
+ }
+
+ public void testSpeed(String[] args){ //Maps the input reads with a pool of mapping threads, then prints settings and results; throws RuntimeException if any thread broke or errorState is set
+ //Nothing to map without an input file or with maxReads==0
+ if(in1==null || maxReads==0){
+ sysout.println("No reads to process; quitting.");
+ return;
+ }
+
+ Timer t=new Timer();
+ //Open the read streams; the result is only used for reporting paired vs single-ended mode in this mapper
+ final boolean paired=openStreams(t, args);
+// if(paired){BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;}
+
+ t.start();
+
+ adjustThreadsforMemory(680); //NOTE(review): 680 is presumably a per-thread memory estimate - confirm meaning and units
+ //Build one mapping thread per Shared.threads(); none is started until all are constructed
+ AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS;
+ AbstractMapThread[] mtts=new AbstractMapThread[Shared.threads()];
+ for(int i=0; i<mtts.length; i++){
+ try {
+ mtts[i]=new BBMapThreadPacBioSkimmer(cris, keylen,
+ pileup, SLOW_ALIGN, CORRECT_THRESH, minChrom,
+ maxChrom, keyDensity, maxKeyDensity, minKeyDensity, maxDesiredKeys, REMOVE_DUPLICATE_BEST_ALIGNMENTS,
+ SAVE_AMBIGUOUS_XY, MINIMUM_ALIGNMENT_SCORE_RATIO, TRIM_LIST, MAKE_MATCH_STRING, QUICK_MATCH_STRINGS, rosA, rosM, rosU, rosB,
+ SLOW_ALIGN_PADDING, SLOW_RESCUE_PADDING, OUTPUT_MAPPED_ONLY, DONT_OUTPUT_BLACKLISTED_READS, MAX_SITESCORES_TO_PRINT, PRINT_SECONDARY_ALIGNMENTS,
+ REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, KILL_BAD_PAIRS, rcompMate,
+ PERFECTMODE, SEMIPERFECTMODE, FORBID_SELF_MAPPING, TIP_SEARCH_DIST,
+ ambiguousRandom, ambiguousAll, KFILTER, IDFILTER, qtrimLeft, qtrimRight, untrim, TRIM_QUALITY, minTrimLength, LOCAL_ALIGN, RESCUE, STRICT_MAX_INDEL, MSA_TYPE);
+ } catch (Exception e) {
+ e.printStackTrace();
+ abort(mtts, "Aborting due to prior error."); //NOTE(review): mtts[i] is still null here; the next statement relies on abort not returning normally - confirm
+ }
+ mtts[i].idmodulo=idmodulo;
+ if(verbose){
+ mtts[i].verbose=verbose;
+ mtts[i].index().verbose=verbose;
+ }
+ }
+ //Start the input stream first, then the mapping threads
+ cris.start(); //4567
+ sysout.println("Processing reads in "+(paired ? "paired" : "single")+"-ended mode.");
+ sysout.println("Started read stream.");
+
+ /* The threads are started after initialization to prevent resource competition between initialization and mapping */
+ for(int i=0; i<mtts.length; i++){mtts[i].start();}
+ sysout.println("Started "+mtts.length+" mapping thread"+(mtts.length==1 ? "" : "s")+".");
+ //A nonzero return from shutDownThreads is treated as failure in the final check below
+ final int broken=shutDownThreads(mtts, false);
+
+ sysout.println("\n\n ------------------ Results ------------------ ");
+
+ closeStreams(cris, rosA, rosM, rosU, rosB);
+ sysout.println();
+ printSettings(keylen);
+ printOutput(mtts, t, keylen, paired, true, pileup, scafNzo, sortStats, statsOutputFile);
+ if(broken>0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");}
+ }
+
+	@Override
+	void setSemiperfectMode() {
+		assert(SEMIPERFECTMODE);
+		if(!SEMIPERFECTMODE){return;} //Only reachable with assertions disabled
+		//Halve the key-density targets and the key cap, pin the minimum density, then configure the index.
+		minKeyDensity=1.1f;
+		keyDensity/=2;
+		maxKeyDensity/=2;
+		maxDesiredKeys/=2;
+		TRIM_LIST=false;
+		MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f; //Low enough to let semiperfect reads through
+		BBIndexPacBioSkimmer.setSemiperfectMode();
+	}
+
+	@Override
+	void setPerfectMode() {
+		assert(PERFECTMODE);
+		if(!PERFECTMODE){return;} //Only reachable with assertions disabled
+		//Halve the key-density targets and the key cap, pin the minimum density, and require a score ratio of 1.0.
+		minKeyDensity=1.1f;
+		keyDensity/=2;
+		maxKeyDensity/=2;
+		maxDesiredKeys/=2;
+		TRIM_LIST=false;
+		MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f;
+		BBIndexPacBioSkimmer.setPerfectMode();
+	}
+
+
+ @Override
+ void printSettings(int k){
+
+ printSettings0(k, BBIndexPacBioSkimmer.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO);
+
+ if(verbose_stats>=2){
+ sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")");
+ sysout.println("Max keys: \t"+maxDesiredKeys);
+
+ sysout.println("Block Subsections: \t"+BBIndexPacBioSkimmer.CHROMS_PER_BLOCK);
+ sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndexPacBioSkimmer.REMOVE_FREQUENT_GENOME_FRACTION ? BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE : 0)));
+ // sysout.println("ADD_SCORE_Z: \t"+IndexPacBioSkimmer.ADD_SCORE_Z);
+ sysout.println("Hits To Keep: \t"+BBIndexPacBioSkimmer.MIN_APPROX_HITS_TO_KEEP);
+ }
+
+ if(verbose_stats>=3){
+ sysout.println("Remove Clumpy: \t"+BBIndexPacBioSkimmer.REMOVE_CLUMPY);
+ if(BBIndexPacBioSkimmer.REMOVE_CLUMPY){
+ sysout.println("CLUMPY_MAX_DIST: \t"+BBIndexPacBioSkimmer.CLUMPY_MAX_DIST);
+ sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndexPacBioSkimmer.CLUMPY_MIN_LENGTH_INDEX);
+ sysout.println("CLUMPY_FRACTION: \t"+BBIndexPacBioSkimmer.CLUMPY_FRACTION);
+ }
+ sysout.println("Remove Long Lists: \t"+BBIndexPacBioSkimmer.TRIM_LONG_HIT_LISTS);
+ if(BBIndexPacBioSkimmer.TRIM_LONG_HIT_LISTS){
+ sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndexPacBioSkimmer.HIT_FRACTION_TO_RETAIN);
+ }
+ sysout.println("Trim By Greedy: \t"+BBIndexPacBioSkimmer.TRIM_BY_GREEDY);
+ sysout.println("Trim By Total Sites: \t"+BBIndexPacBioSkimmer.TRIM_BY_TOTAL_SITE_COUNT);
+ if(BBIndexPacBioSkimmer.TRIM_BY_TOTAL_SITE_COUNT){
+ sysout.println("MAX_AVG_SITES: \t"+BBIndexPacBioSkimmer.MAX_AVERAGE_LIST_TO_SEARCH);
+ sysout.println("MAX_AVG_SITES_2: \t"+BBIndexPacBioSkimmer.MAX_AVERAGE_LIST_TO_SEARCH2);
+ sysout.println("MAX_SHORTEST_SITE: \t"+BBIndexPacBioSkimmer.MAX_SHORTEST_LIST_TO_SEARCH);
+ }
+ sysout.println("Index Min Score: \t"+BBIndexPacBioSkimmer.MIN_SCORE_MULT);
+
+ sysout.println("Dynamic Trim: \t"+BBIndexPacBioSkimmer.DYNAMICALLY_TRIM_LOW_SCORES);
+ if(BBIndexPacBioSkimmer.DYNAMICALLY_TRIM_LOW_SCORES){
+ sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndexPacBioSkimmer.DYNAMIC_SCORE_THRESH);
+ }
+ }
+
+ }
+
+}
diff --git a/current/align2/BBMapThread.java b/current/align2/BBMapThread.java
new file mode 100755
index 0000000..83dd98d
--- /dev/null
+++ b/current/align2/BBMapThread.java
@@ -0,0 +1,1364 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import jgi.CoveragePileup;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * Based on MapTestThread11f
+ *
+ * @author Brian Bushnell
+ * @date Dec 22, 2012
+ *
+ */
+public final class BBMapThread extends AbstractMapThread{
+
+ static final int ALIGN_COLUMNS=BBIndex.ALIGN_COLUMNS;
+ static final int ALIGN_ROWS=601; //maxReadLength() returns ALIGN_ROWS-1
+
+
+
+ /** Don't trim for local alignments unless at least this many bases will be clipped */
+ private final int LOCAL_ALIGN_TIP_LENGTH=1;
+ /** Range is 0-1; a lower number makes trimming more aggressive */
+ private final float LOCAL_ALIGN_MATCH_POINT_RATIO=1f;
+
+ /** Ratio of the points for a match of a single base needed to declare unambiguous. 1 SNP is currently about 2.57 */
+ public final float CLEARZONE_RATIOP=1.6f; //default 1.3f, which makes read ambiguous if there is 1 N in an alternate site.
+ public final float CLEARZONE_RATIO1=2.0f;
+ public final float CLEARZONE_RATIO1b=2.6f;
+ public final float CLEARZONE_RATIO1c=4.6f;
+ public final float CLEARZONE_RATIO3=8.0f;
+ /** Max allowed number of sites within 1 edit (excluding primary site) */
+ public final int CLEARZONE_LIMIT1e=40; //Needs to be redone to assign a quality penalty rather than simply marking as ambiguous
+ public final int CLEARZONEP; //The constructor sets each CLEARZONE* int to (int)(its CLEARZONE_RATIO* times POINTS_MATCH2), or to 0 when neither SLOW_ALIGN nor MAKE_MATCH_STRING is set
+ public final int CLEARZONE1;
+ public final int CLEARZONE1b;
+ public final int CLEARZONE1c;
+ //public final int CLEARZONE1e;
+ public final int CLEARZONE3; //Also 0 whenever PENALIZE_AMBIG is false
+ public final float INV_CLEARZONE3; //1f/CLEARZONE3, or 0 when CLEARZONE3 is 0
+ public final float CLEARZONE1b_CUTOFF_FLAT_RATIO=12;//3f;
+ public final float CLEARZONE1b_CUTOFF_FLAT; //CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2, assigned in the constructor
+ public final float CLEARZONE1b_CUTOFF_SCALE=0.97f; //processRead computes cz1blimit as maxSwScore*SCALE-FLAT
+ public final float CLEARZONE1c_CUTOFF_FLAT_RATIO=26;//7f;
+ public final float CLEARZONE1c_CUTOFF_FLAT; //CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2, assigned in the constructor
+ public final float CLEARZONE1c_CUTOFF_SCALE=0.92f; //processRead computes cz1climit as maxSwScore*SCALE-FLAT
+
+ public final BBIndex index; //Created as the last step of the constructor
+
+
+ private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3; //Passed to trimList as minSitesToRetain by processRead
+ private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2;
+ /** Ignores x and only prints a warning to stderr that EXPECTED_SITES is not valid for this class; the class name is looked up via getEnclosingClass() on a throwaway anonymous Object subclass. */
+ public static void setExpectedSites(int x){
+ System.err.println("Warning: EXPECTED_SITES is not valid for "+(new Object() { }.getClass().getEnclosingClass().getName()));
+ }
+ //Overrides of supertype accessors; each one exposes the constant or field named in its body.
+ @Override
+ public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;}
+ @Override
+ public final int ALIGN_ROWS(){return ALIGN_ROWS;}
+ @Override
+ public final int maxReadLength(){return ALIGN_ROWS-1;} //600 with ALIGN_ROWS=601
+ @Override
+ final AbstractIndex index(){return index;}
+ @Override
+ final int CLEARZONE1(){return CLEARZONE1;}
+ /** Hands the mapping options to the superclass constructor (together with several BBIndex static settings), derives the CLEARZONE thresholds from POINTS_MATCH2, and creates the BBIndex used by this thread. */
+ public BBMapThread(ConcurrentReadInputStream cris_, int keylen_,
+ CoveragePileup pileup_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_,
+ int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_,
+ boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_,
+ float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_,
+ ConcurrentReadOutputStream outStream_, ConcurrentReadOutputStream outStreamMapped_, ConcurrentReadOutputStream outStreamUnmapped_, ConcurrentReadOutputStream outStreamBlack_,
+ int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_,
+ int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_,
+ boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_,
+ boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_,
+ boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, float IDFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, int TRIM_MIN_LEN_,
+ boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){
+ //Note that super(...) receives the arguments in a different order than they are declared above.
+ super(cris_,
+ outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_,
+ pileup_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_,
+ AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, TRIM_MIN_LEN_, THRESH_,
+ minChrom_, maxChrom_, KFILTER_, IDFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_,
+ REQUIRE_CORRECT_STRANDS_PAIRS_,
+ SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_,
+ MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_,
+ MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_,
+ QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_,
+ keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_,
+ BBIndex.MIN_APPROX_HITS_TO_KEEP, BBIndex.USE_EXTENDED_SCORE,
+ BBIndex.BASE_HIT_SCORE, BBIndex.USE_AFFINE_SCORE, BBIndex.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_);
+
+ assert(SLOW_ALIGN_PADDING>=0);
+ assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO";
+ //Clearzone thresholds are multiples of POINTS_MATCH2 when slow alignment or match strings are enabled; otherwise all are 0.
+ if(SLOW_ALIGN || MAKE_MATCH_STRING){
+// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, MSA_TYPE);
+// POINTS_MATCH=msa.POINTS_MATCH();
+// POINTS_MATCH2=msa.POINTS_MATCH2();
+ CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2);
+ CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2);
+ CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2);
+ CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2);
+ CLEARZONE3=PENALIZE_AMBIG ? (int)(CLEARZONE_RATIO3*POINTS_MATCH2) : 0;
+// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1;
+ }else{
+// POINTS_MATCH=70;
+// POINTS_MATCH2=100;
+// msa=null;
+ CLEARZONE1=0;
+ CLEARZONE1b=0;
+ CLEARZONE1c=0;
+ CLEARZONEP=0;
+ CLEARZONE3=0;
+// CLEARZONE1e=0;
+ }
+ //The flat cutoffs and the inverse are computed in both cases; INV_CLEARZONE3 is 0 rather than a division by zero when CLEARZONE3 is 0.
+ CLEARZONE1b_CUTOFF_FLAT=CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2;
+ CLEARZONE1c_CUTOFF_FLAT=CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2;
+ INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3);
+
+ index=new BBIndex(KEYLEN, minChrom, maxChrom, KFILTER, msa);
+ }
+
+ /** Prunes a candidate site list with repeated Tools.trimSiteList passes, each using a larger float argument and run only while the list is still above a size limit. Returns the result of the first pass, -99999 for a null or empty list, or the lone site's score for a single-site list. */
+ public int trimList(ArrayList<SiteScore> list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){
+ if(list==null || list.size()==0){return -99999;}
+ if(list.size()==1){return list.get(0).score;}
+
+ final int highestScore;
+ if(USE_AFFINE_SCORE){
+ //The first pass supplies the return value; the results of all later passes are ignored.
+ highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ if(highestScore==maxScore && specialCasePerfect){
+ Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ return highestScore;
+ }
+ //mstr2 replaces minSitesToRetain in the last two passes (.97f and .99f) of this branch.
+ final int mstr2=(minSitesToRetain<=1 ? 1 : minSitesToRetain+1);
+
+// if(list.size()>6){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// // System.out.print(", "+list.size());
+// if(list.size()>10){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// // System.out.print(", "+list.size());
+// if(list.size()>14){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// // System.out.print(", "+list.size());
+// if(list.size()>18){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// //// System.out.print(", "+list.size());
+// if(list.size()>22){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// //// System.out.print(", "+list.size());
+// if(list.size()>26){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// // System.out.print(", "+list.size());
+// if(list.size()>34){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+// if(list.size()>42){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+// if(list.size()>50){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+//// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+ //Active schedule (the commented-out block above is a variant with larger size limits).
+ if(list.size()>4){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+ if(list.size()>8){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+ if(list.size()>12){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// //// System.out.print(", "+list.size());
+ if(list.size()>20){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// //// System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+ if(list.size()>32){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>40){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, mstr2, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+
+
+ }else if(USE_EXTENDED_SCORE){
+ highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ //This branch starts at .75f and has no special case for a perfect top score.
+ if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+
+
+ }else{
+ // System.out.print("\n\nSize:\t"+list.size());
+
+ //Fallback when neither USE_AFFINE_SCORE nor USE_EXTENDED_SCORE is set; the schedule stops at .90f.
+ highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+ if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ }
+
+ return highestScore;
+ }
+
+ /** Re-scores each site in list with msa.fillAndScoreLimited unless its current slowScore already reaches maxImperfectSwScore or it is flagged semiperfect; updates score, limits, perfect/semiperfect flags and, when QUICK_MATCH_STRINGS permits, the match string in place. basesP is used for plus-strand sites and basesM for the others. */
+ public void scoreSlow(final ArrayList<SiteScore> list, final byte[] basesP, final byte[] basesM,
+ final int maxSwScore, final int maxImperfectSwScore){
+ //minMsaLimit is the floor for the score limit handed to msa.fillAndScoreLimited; paired reads use the pre-rescue ratio.
+ int minMsaLimit;
+ if(PAIRED){
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore);
+ }else{
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore);
+ }
+ assert(Read.CHECKSITES(list, basesP, basesM, -1));
+
+ int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string
+ if(verbose){
+ System.err.println("Slow-scoring. maxSwScore="+maxSwScore+", maxImperfectSwScore="+maxImperfectSwScore+", minMsaLimit="+minMsaLimit+", minMatch="+minMatch);
+ }
+ for(int i=0; i<list.size(); i++){
+ final SiteScore ss=list.get(i);
+ assert(ss.lengthsAgree());
+ final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
+
+ if(SEMIPERFECTMODE){
+ assert(ss.stop-ss.start==bases.length-1);
+ assert(ss.semiperfect);
+ }
+ //A site whose span differs from the read length has its slow score reset to 0 and loses its perfect/semiperfect flags.
+ if(verbose){System.err.println("\nSlow-scoring "+ss);}
+ if(ss.stop-ss.start!=bases.length-1){
+ assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText();
+ assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n";
+ ss.setSlowScore(0);
+ ss.semiperfect=false;
+ ss.perfect=false;
+ }
+
+ final int swscoreNoIndel=ss.slowScore;
+ int[] swscoreArray=null;
+ //clipped starts true, so the scoreNoIndels consistency assert further down is only active after fixXY and clipTipIndels both report no clipping.
+ boolean clipped=true, setLimits=false;
+ if(swscoreNoIndel<maxImperfectSwScore && !ss.semiperfect){
+ if(verbose && ss.stop-ss.start>4000){
+ System.err.println(ss.toText());
+ System.err.println(list.size());
+ System.err.println();
+ }
+
+ int expectedLen=GapTools.calcGrefLen(ss);
+ if(verbose){System.err.println("expectedLen="+expectedLen);}
+ if(expectedLen>=EXPECTED_LEN_LIMIT){
+ //TODO: Alternately, I could kill the site.
+ ss.setStop(ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT));
+ if(verbose){System.err.println("expectedLen="+expectedLen+"; ss="+ss);}
+ }
+
+ int pad=SLOW_ALIGN_PADDING;
+ final int minscore=Tools.max(swscoreNoIndel, minMsaLimit);
+ final int minscore2=Tools.max(swscoreNoIndel-MSA.MIN_SCORE_ADJUST, minMsaLimit);
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+ if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));}
+ //A result longer than 6 carries extra padding amounts in [6] and [7]; retry once with widened limits and keep the old result if the retry is null or scores lower.
+ if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen<EXPECTED_LEN_LIMIT)){
+ int[] oldArray=swscoreArray.clone();
+ assert(swscoreArray.length==8);
+ int extraPadLeft=swscoreArray[6];
+ int extraPadRight=swscoreArray[7];
+
+ if(verbose){
+ System.err.println("msa returned "+Arrays.toString(swscoreArray)+", re-running.");
+ System.err.println("Added extra padding: "+ss.toText()+", "+Arrays.toString(oldArray));
+ }
+
+ ss.setLimits(ss.start-extraPadLeft, ss.stop+extraPadRight);
+ pad=SLOW_ALIGN_PADDING+EXTRA_PADDING;
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+
+ if(verbose){System.err.println("Result of extra padding: "+ss.toText()+", "+Arrays.toString(swscoreArray));}
+ if(swscoreArray==null || swscoreArray[0]<oldArray[0]){
+ if(verbose){
+ System.err.println("Result was inferior.");
+ }
+ swscoreArray=oldArray;
+ }
+ }
+ assert(ss.lengthsAgree());
+ if(verbose){
+ System.err.println(QUICK_MATCH_STRINGS+", "+(swscoreArray==null ? "null" : (swscoreArray.length+", "+swscoreArray[0]+" >=? "+minscore)));
+ System.err.println("start="+ss.start+", stop="+ss.stop+", len="+ss.mappedLength());
+ }
+ if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore2 && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){
+ if(verbose){System.err.println("Generating match string.");}
+ assert(swscoreArray.length==6) : swscoreArray.length;
+ assert(swscoreArray[0]>=minscore2) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch;
+ ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null);
+ if(ss.match!=null){
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ setLimits=true;
+ assert(ss.lengthsAgree());
+ clipped=ss.fixXY(bases, true, msa);
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ clipped=ss.clipTipIndels(bases, basesM, 4, 10, msa) || clipped;
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ assert(ss.lengthsAgree());
+ }
+ }else{
+ ss.match=null;
+ }
+ }
+ if(swscoreArray!=null && !setLimits){
+ if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));}
+ ss.setSlowScore(swscoreArray[0]);
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ assert(ss.lengthsAgree());
+ }else{
+ assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP);
+ assert(clipped || swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) :
+ setLimits+", "+clipped+", "+(swscoreArray==null)+", "+
+ swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+
+ ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow
+ }
+ assert(ss.lengthsAgree());
+ ss.setScore(ss.slowScore);
+ minMatch=Tools.max(minMatch, ss.slowScore); //Both thresholds can only rise, so later sites in the list face limits based on the best score so far
+ minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3);
+ assert(ss.slowScore<=maxSwScore);
+ assert(!(ss.perfect && ss.slowScore<maxSwScore));
+ ss.perfect=(ss.slowScore==maxSwScore);
+ if(ss.perfect){ss.semiperfect=true;}
+ else if(!ss.semiperfect){ss.setPerfect(bases);}
+
+ if(verbose){System.err.println(" -> "+ss);}
+ }
+
+ }
+
+ /** Maps one read on its own: quick-maps it, trims and slow-scores the candidate sites, marks the read ambiguous based on the clearzone thresholds, generates the match string, and clears the mapping when the final score is too low. r.bases serves as the plus-strand bases and basesM is passed alongside it to the scoring helpers. Throws RuntimeException (with the original exception as its cause) if merging duplicate sites fails. */
+ public void processRead(final Read r, final byte[] basesM){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final byte[] basesP=r.bases;
+
+// System.err.print(" rd#"+r.numericID+" ");
+// if(r.numericID==25967){
+// verbose=true;
+// msa.verbose=true;
+// GapTools.verbose=true;
+// index.verbose=true;
+// tcr.verbose=true;
+// }
+
+ if(verbose){System.err.println("\nProcessing "+r);}
+ readsUsed1++;
+
+ final int maxPossibleQuickScore=quickMap(r, basesM);
+ if(verbose){System.err.println("\nQuick Map: \t"+r.sites);}
+ //A negative quick score means the read is counted as low quality, flagged discarded, and not processed further.
+ if(maxPossibleQuickScore<0){
+ r.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=basesP.length;
+ r.setDiscarded(true);
+ return;
+ }
+ initialSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);}
+
+ int maxSwScore=0;
+ int maxImperfectSwScore=0;
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ maxSwScore=msa.maxQuality(r.length());
+ maxImperfectSwScore=msa.maxImperfectScore(r.length());
+ }
+
+ if(TRIM_LIST && r.numSites()>1){
+ if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);}
+ trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\nAfter trim: \t"+r.sites);}
+
+ assert(Read.CHECKSITES(r, basesM));
+
+
+ if(SLOW_ALIGN && r.numSites()>0){
+
+ int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore);
+
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+ assert(Read.CHECKSITES(r, basesM));
+
+// int numPerfectScores=0;
+// if(numNearPerfectScores>0){
+// for(SiteScore ss : r.list){
+// if(ss.perfect){numPerfectScores++;}
+// else{break;}
+// }
+// }
+
+ if(verbose){
+ System.err.println("\nAfter scoreNoIndels: \t"+r.sites);
+ }
+
+ if(numNearPerfectScores<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);}
+ }
+
+ if(verbose){
+ System.err.println("\nAfter findTipDeletions: \t"+r.sites);
+ }
+
+ //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length
+ //***Above note should be resolved now, but needs to be verified.
+
+ if(numNearPerfectScores<1){
+ scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore);
+ }
+
+ if(STRICT_MAX_INDEL){
+ removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+
+ if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);}
+ assert(Read.CHECKSITES(r, basesM, false));
+ }
+
+
+ if(r.numSites()>0){
+ mapped1++;
+ try {
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ } catch (Exception e) {
+ //Rethrow with the read text as the message and the original exception chained as the cause, so its stack trace is not lost.
+ e.printStackTrace();
+ throw new RuntimeException("\n\n"+r.toText(false)+"\n\n", e);
+ }
+ Collections.sort(r.sites);
+ }
+
+ if(r.numSites()>1){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r.sites.get(1);
+ //Ensure no duplicates
+ assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false);
+ }
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(r.numSites()>=1){
+ assert(r.topSite().score==r.topSite().slowScore) : r.topSite();
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);}
+ //With more than one site left, choose a clearzone from the top score and flag the read via processAmbiguous when Tools.countTopScores reports more than one site for it.
+ if(r.numSites()>1){
+
+ final int clearzone;
+ final int score=r.topSite().score;
+ if(r.perfect()){clearzone=CLEARZONEP;}
+ else{
+ assert(score<maxSwScore);
+ final float cz1blimit=(maxSwScore*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT);
+ final float cz1climit=(maxSwScore*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT);
+ if(score>cz1blimit){
+// clearzone=CLEARZONE1;
+ clearzone=(int)(((maxSwScore-score)*CLEARZONE1b+(score-cz1blimit)*CLEARZONE1)/(maxSwScore-cz1blimit));
+ }else if(score>cz1climit){
+// clearzone=CLEARZONE1b;
+ clearzone=(int)(((cz1blimit-score)*CLEARZONE1c+(score-cz1climit)*CLEARZONE1b)/(cz1blimit-cz1climit));
+ }else{
+ clearzone=CLEARZONE1c;
+ }
+// assert(false) : x+", "+cz1blimit+", "+cz1climit+", "+CLEARZONE1b_CUTOFF_FLAT+", "+clearzone;
+ }
+
+
+// final int clearzone=r.perfect() ? CLEARZONEP :
+// r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+// (r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? (CLEARZONE1b_CUTOFF-)CLEARZONE1b : CLEARZONE1c);
+ int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); //Never gets executed anymore, so always returns true
+ r.setAmbiguous(b);
+ }else{
+ final int lim=(r.perfect() ? (int)(4f*CLEARZONE_LIMIT1e) : score+CLEARZONE1e>=maxSwScore ? 2*CLEARZONE_LIMIT1e : CLEARZONE_LIMIT1e)+1;
+ if(r.sites.size()>lim && clearzone<CLEARZONE1e){
+ numBestSites1=Tools.countTopScores(r.sites, CLEARZONE1e);
+ if(numBestSites1>lim){
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r.setAmbiguous(b);
+ }
+ }
+ }
+ }
+
+ if(verbose){System.err.println("A: "+r);}
+ //Drop all sites when the top score is below MINIMUM_ALIGNMENT_SCORE_RATIO of maxSwScore; otherwise remove only the low-scoring ones.
+ if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){
+ int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.topSite().score<lim){r.sites=null;}
+ else{Tools.removeLowQualitySitesUnpaired(r.sites, Tools.min(lim, Tools.max(1, lim-CLEARZONE3)));}
+ }
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(verbose){System.err.println("B: "+r);}
+
+ //Unimportant anomaly due to ambiguous reads that later have low quality sites removed and become unmapped.
+// assert(!r.mapped() || new SamLine(r, 0).toRead(true).ambiguous()==r.ambiguous()) : "\n"+r+"\n\n"+new SamLine(r, 0)+"\n\n"+new SamLine(r, 0).toRead(true)+"\n\n"+
+// "ambi="+ambi+", r.ambiguous()="+r.ambiguous()+", new SamLine(r, 0).toRead(true).ambiguous()="+new SamLine(r, 0).toRead(true).ambiguous()+"\n\n"+
+// "r.mapped="+r.mapped()+", sl.mapped()="+new SamLine(r, 0).mapped()+", sl.toRead(true).mapped()="+new SamLine(r, 0).toRead(true).mapped();
+// assert(r.ambiguous()==ambi) : r;
+
+ assert(r.gaps==null || r.gaps[0]==r.start && r.gaps[r.gaps.length-1]==r.stop);
+ assert(r.sites==null || r.mapScore>0) : r.sites+", "+r.mapScore+"\n"+r;
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("C: "+r);}
+
+ //***$
+ if(MAKE_MATCH_STRING && r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ if(r.sites.size()>1){
+ assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n";
+ }
+ int mapScore=r.mapScore;
+
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+
+ if(verbose){System.err.println("D: "+r);}
+
+ {
+ boolean firstIter=true;
+ do{//
+ if(!firstIter){
+ Collections.sort(r.sites);
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ }
+ genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true);
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+// TODO: Fix this; it should never happen.
+// if(mapScore>r.mapScore){
+// System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID);
+// }
+ if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){
+ SiteScore ss=r.topSite();
+ r.mapScore=Tools.min(ss.score, -9999);
+ ss.setScore(r.mapScore);
+ ss.setSlowPairedScore(ss.score, ss.score);
+ }
+ r.topSite().setScore(r.topSite().slowScore);
+ firstIter=false;
+ }while(r.sites.size()>1 && r.topSite().score<r.sites.get(1).score);
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("E: "+r);}
+ }
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ removeDuplicateBestSites(r);
+ }
+ if(r.numSites()>0){r.topSite().match=r.match;}
+
+
+ if(r.sites!=null && r.mapScore<=0){//This came from BBMapThreadPacBio; not sure if needed for other modes
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){
+ System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped.\t"+(r.match==null)+"\t"+r.mapScore+"\t"+r.topSite()+"\t"+new String(r.bases));
+ if(MSA.bandwidth>0 || MSA.bandwidthRatio>0 || MSA.flatMode){Shared.anomaly=true;}
+ }
+ r.mapScore=0;
+ r.setMapped(false);
+ r.sites=null;
+ }
+
+
+
+ //This block is to prevent an assertion from firing. Generally caused by alignment being lost during match generation.
+ //TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(BANDWIDTH<1){
+ if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ }
+ r.clearMapping();
+ }
+ assert(r.sites==null || r.mapScore>0) :
+ "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+
+ "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+
+ "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+
+ "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n";
+
+// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString();
+
+ if((CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP) && r.sites!=null && !r.ambiguous()){
+
+ assert(r.mapScore>0);
+ float cz3v2=(CLEARZONE3*Tools.min(1.25f, (maxSwScore/(float)r.mapScore)));
+
+// boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3);
+ boolean changed=applyClearzone3(r, (int)cz3v2, 1/cz3v2);
+ if(changed){
+ int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.mapScore<minScore){
+ assert(!r.ambiguous());
+ r.setAmbiguous(true);
+ }
+ }
+ }
+
+ if(r.ambiguous() && AMBIGUOUS_TOSS){r.sites=null; r.clearSite(); r.setMapped(false);}
+
+ if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM, maxImperfectSwScore, maxSwScore);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ msa.toLocalAlignment(r, r.topSite(), basesM, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ if(r.numSites()==0 || (!r.ambiguous() && r.mapScore<maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)){
+ r.clearMapping();
+ }
+
+// assert(false) : "\n\n"+r.sites+"\n\n"+r.toSam()+"\n\n"+r+"\n\n";
+
+ postFilterRead(r, basesM, maxImperfectSwScore, maxSwScore);
+ if(MAKE_MATCH_STRING){ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore);}
+
+ if(PENALIZE_AMBIG){
+ int penalty=calcTipScorePenalty(r, maxSwScore, 7);
+ applyScorePenalty(r, penalty);
+ }
+
+// if(r.ambiguous() && r.sites!=null){
+// r.setAmbiguous(false);
+// r.mapScore/=3;
+// for(SiteScore ss : r.sites){
+// ss.slowScore/=3;
+// ss.score/=3;
+// }
+// }
+
+// //Penalize quality score of long deletions, as they are less likely to be accurate. Did not seem to be useful.
+// if(r.mapped() && !r.ambiguous()){
+// int delta=absdif(r.start+r.length()-1, r.stop);
+// if(delta>100){
+// float penalty2=0.0004f*(500f*delta)/(500f+delta);
+// r.mapScore=(int)(r.mapScore*(1-penalty2));
+// }
+// }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore, maxPossibleQuickScore);
+ }
+ }
+
+
+ /** Scores candidate pairings between the sites of r and r2, raises each paired site's score to its best paired score, and optionally trims weak sites. Returns a lower bound on the number of perfect pairs (0 if either read has no sites). */
+ public int pairSiteScoresInitial(Read r, Read r2, boolean trim){
+ // Nothing to pair unless both reads have at least one candidate site.
+ if(r.numSites()<1 || r2.numSites()<1){return 0;}
+ // Order both lists with SiteScore.PCOMP so j can advance monotonically in the sweep below (presumably a positional chrom/start ordering - TODO confirm).
+ SiteScore.PCOMP.sort(r.sites);
+ SiteScore.PCOMP.sort(r2.sites);
+ // Start every site with a paired score of zero.
+ for(SiteScore ss : r.sites){ss.setPairedScore(0);}
+ for(SiteScore ss : r2.sites){ss.setPairedScore(0);}
+
+// ArrayList<SiteScorePair> pairs=new ArrayList<SiteScorePair>(Tools.min(8, Tools.min(r.list.size(), r2.list.size())));
+ // Highest ss.score (not pairedScore) among sites that improved their pairing; base for the trim cutoffs at the end. Stays -1 if nothing paired.
+ int maxPairedScore1=-1;
+ int maxPairedScore2=-1;
+
+
+// for(SiteScore ss : r.list){
+// System.out.println(ss.toText());
+// }
+
+// int i=0, j=0;
+ final int ilimit=r.sites.size()-1;
+ final int jlimit=r2.sites.size()-1;
+ final int maxReadLen=Tools.max(r.length(), r2.length());
+ // outerDistLimit is the minimum accepted outer distance; innerDistLimit is the maximum accepted inner distance (see the outerdist/innerdist test below).
+// final int outerDistLimit=MIN_PAIR_DIST+r.length()+r2.length();
+ final int outerDistLimit=(Tools.max(r.length(), r2.length())*(OUTER_DIST_MULT))/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0);
+ final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? TIP_DELETION_SEARCH_RANGE : 0);
+ final int expectedFragLength=AVERAGE_PAIR_DIST+r.length()+r2.length();
+
+ int numPerfectPairs=0;
+ // For each site of r (i): advance j past r2 sites on an earlier chrom or ending more than innerDistLimit before ss1 starts, then scan forward with k.
+ for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){
+ SiteScore ss1=r.sites.get(i);
+ SiteScore ss2=r2.sites.get(j);
+
+ while(j<jlimit && (ss2.chrom<ss1.chrom || (ss2.chrom==ss1.chrom && ss1.start-ss2.stop>innerDistLimit))){
+ j++;
+ ss2=r2.sites.get(j);
+ }
+
+ for(int k=j; k<=jlimit; k++){
+ ss2=r2.sites.get(k);
+ // Stop scanning once ss2 is on a later chrom or starts more than innerDistLimit past ss1.stop.
+ if(ss2.chrom>ss1.chrom){break;}
+ if(ss2.start-ss1.stop>innerDistLimit){break;}
+
+// int dist=0;
+//
+// if(ss1.start<=ss2.start){
+// dist=ss2.start-ss1.stop;
+// }else if(ss1.start>ss2.start){
+// dist=ss1.start-ss2.stop;
+// }
+
+
+// int innerdist=0;
+// int outerdist=0;
+//
+// if(ss1.start<=ss2.start){
+// innerdist=ss2.start-ss1.stop;
+// outerdist=ss2.stop-ss1.start;
+// }else if(ss1.start>ss2.start){
+// innerdist=ss1.start-ss2.stop;
+// outerdist=ss1.stop-ss2.start;
+// }
+ // innerdist is the gap from one site's stop to the other's start; outerdist is the span from one site's start to the other's stop.
+ final int innerdist, outerdist;
+ //assert(!SAME_STRAND_PAIRS) : "TODO";
+ // When correct strands are required and the strands differ, ss1's strand decides which site is treated as leftmost; in every other case the start coordinates decide.
+ if(REQUIRE_CORRECT_STRANDS_PAIRS){
+ if(ss1.strand!=ss2.strand){
+ if(ss1.strand==Gene.PLUS){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+
+ assert(outerdist>=innerdist);
+
+ if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){
+ // strandOK: the relative strands of the two sites agree with SAME_STRAND_PAIRS.
+ boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS);
+
+ if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){
+
+ boolean paired1=false, paired2=false;
+ // How far this pair's inner gap is from AVERAGE_PAIR_DIST.
+ int deviation=absdif(AVERAGE_PAIR_DIST, innerdist);
+
+ final int pairedScore1;
+ final int pairedScore2;
+ if(strandOK){
+// pairedScore1=ss1.score+ss2.score/2;
+// pairedScore2=ss2.score+ss1.score/2;
+ // Own score + 1 + a bonus of half the mate's score reduced in proportion to the deviation; the bonus is never less than 1.
+ pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-(((deviation)*ss2.score)/(32*expectedFragLength+100)));
+ pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-(((deviation)*ss1.score)/(32*expectedFragLength+100)));
+ }else{//e.g. a junction
+ pairedScore1=ss1.score+Tools.max(0, ss2.score/16);
+ pairedScore2=ss2.score+Tools.max(0, ss1.score/16);
+ }
+ // Keep only the best paired score seen so far for each site.
+ if(pairedScore1>ss1.pairedScore){
+ paired1=true;
+ ss1.setPairedScore(Tools.max(ss1.pairedScore, pairedScore1));
+ maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+ // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText());
+ }else{
+ // System.out.println(ss1.toText()+" already paired.");
+ }
+ if(pairedScore2>ss2.pairedScore){
+ paired2=true;
+ ss2.setPairedScore(Tools.max(ss2.pairedScore, pairedScore2));
+ maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+ // Count a perfect pair only when this pairing improved both sites, both sites are flagged perfect, and the distance checks pass.
+ if(paired1 && paired2 && outerdist>=maxReadLen && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){
+ numPerfectPairs++; //Lower bound. Some perfect pairs may be the same.
+ }
+
+// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1);
+// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2);
+// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+ }
+ }
+
+ }
+
+
+ // Promote paired scores into the main score; any site not promoted is asserted to have pairedScore==0.
+ for(SiteScore ss : r.sites){
+ if(ss.pairedScore>ss.score){ss.setScore(ss.pairedScore);}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+ for(SiteScore ss : r2.sites){
+ if(ss.pairedScore>ss.score){ss.setScore(ss.pairedScore);}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+ // Optional trim: cutoff is 94% of maxPairedScore when a perfect pair exists; otherwise 90%, and only for lists with more than 4 sites.
+ if(trim){
+ if(numPerfectPairs>0){
+// System.out.print(".");
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }else{
+ if(r.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ if(r2.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ }
+ }
+
+// if(pairs.isEmpty()){return null;}
+//
+// ArrayList<SiteScore> temp=new ArrayList<SiteScore>(Tools.max(r.list.size(), r2.list.size()));
+//
+// for(SiteScore ss : r.list){
+// if(ss.score>maxPairedScore1){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.a);
+// }
+// r.list.clear();
+// r.list.addAll(temp);
+//
+// for(SiteScore ss : r2.list){
+// if(ss.score>maxPairedScore2){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.b);
+// }
+// r2.list.clear();
+// r2.list.addAll(temp);
+//
+// return pairs;
+
+ return numPerfectPairs;
+ }
+
+ /** Maps a read pair: quick-maps r and its mate, pairs and trims their candidate sites, optionally slow-aligns and rescues, resolves ambiguity, then finalizes mapping, pairing flags, match strings and statistics. basesM1/basesM2 are forwarded to the alignment helpers next to r.bases/r.mate.bases (presumably the reverse-complemented bases - TODO confirm). */
+ public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final Read r2=r.mate;
+ assert(r2!=null);
+ final byte[] basesP1=r.bases, basesP2=r2.bases;
+ final int len1=(basesP1==null ? 0 : basesP1.length), len2=(basesP2==null ? 0 : basesP2.length);
+
+ readsUsed1++;
+ readsUsed2++;
+ // Quick-map both mates; the pair is discarded below only when both results are negative.
+ final int maxPossibleQuickScore1=quickMap(r, basesM1);
+ final int maxPossibleQuickScore2=quickMap(r2, basesM2);
+
+ if(verbose){
+ System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate);
+ }
+
+ if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){
+ r.sites=null;
+ r2.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=len1;
+ r.setDiscarded(true);
+ lowQualityReadsDiscarded2++;
+ lowQualityBasesDiscarded2+=len2;
+ r2.setDiscarded(true);
+ return;
+ }
+
+ //Not really needed due to subsumption
+// Tools.mergeDuplicateSites(r.list);
+// Tools.mergeDuplicateSites(r2.list);
+
+ initialSiteSum1+=r.numSites();
+ initialSiteSum2+=r2.numSites();
+
+ //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used.
+ //Discards need to be tracked separately for each end.
+// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;}
+ // Per-mate score ceilings for each read length; the trimming and filtering calls below take them as reference values.
+ final int maxSwScore1=msa.maxQuality(len1);
+ final int maxImperfectSwScore1=msa.maxImperfectScore(len1);
+ final int maxSwScore2=msa.maxQuality(len2);
+ final int maxImperfectSwScore2=msa.maxImperfectScore(len2);
+ // Initial pairing; the returned perfect-pair count is not used here.
+ pairSiteScoresInitial(r, r2, TRIM_LIST);
+ if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(TRIM_LIST){
+
+ if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){
+ if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);}
+ if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);}
+ }
+
+ trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ postTrimSiteSum2+=r2.numSites();
+
+ {//Reset score to non-paired score
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.setScore(ss.quickScore);
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.setScore(ss.quickScore);
+ }
+ }
+ }
+
+ if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);}
+
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN){
+ // Slow-align read 1's sites.
+ if(r.numSites()>0){
+
+ int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores1<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);}
+ }
+
+ //TODO:
+ //Note scoreSlow can be skipped under this circumstance:
+ //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites.
+ scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ if(STRICT_MAX_INDEL){
+ removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+ // Same sequence of steps for read 2.
+ if(r2.numSites()>0){
+ int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores2<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);}
+ }
+
+ scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ if(STRICT_MAX_INDEL){
+ removeLongIndels(r2.sites, index.MAX_INDEL);
+ if(r2.numSites()<1){r2.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+
+ if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ // Rescue is attempted from a read only when at least one of its sites has pairedScore==0.
+ if(DO_RESCUE){
+ int unpaired1=0;
+ int unpaired2=0;
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r.toText(false)+"\n\n"+r.toFastq()+"\n"+r2.toFastq()+"\napd="+AVERAGE_PAIR_DIST+"\n";
+ if(ss.pairedScore==0){unpaired1++;}
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r2.toText(false)+"\n\n"+r.toFastq()+"\n"+r2.toFastq()+"\napd="+AVERAGE_PAIR_DIST+"\n";
+ if(ss.pairedScore==0){unpaired2++;}
+ }
+ }
+
+ if(unpaired1>0 && r.numSites()>0){
+ Collections.sort(r.sites);
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+ if(unpaired2>0 && r2.numSites()>0){
+ Collections.sort(r2.sites);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ postRescueSiteSum1+=r.numSites();
+ postRescueSiteSum2+=r2.numSites();
+
+// if(r.list!=null){Collections.sort(r.list);}
+// if(r2.list!=null){Collections.sort(r2.list);}
+//
+// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+
+ if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+ }else{
+ Tools.mergeDuplicateSites(r.sites, true, false);
+ Tools.mergeDuplicateSites(r2.sites, true, false);
+ if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+ assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+ // The next block never runs: its condition is the constant false.
+ if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!)
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+ if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ // A read is counted as mapped when at least one site survived final pairing.
+ if(r.numSites()>0){
+ mapped1++;
+ Collections.sort(r.sites);
+ }
+ if(r2.numSites()>0){
+ mapped2++;
+ Collections.sort(r2.sites);
+ }
+ assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ r.setPerfectFlag(maxSwScore1);
+ r2.setPerfectFlag(maxSwScore2);
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+ }
+
+ // Choose a clearzone from the top site's score tier, count the sites Tools.countTopScores reports for it, and run ambiguity handling when there is more than one.
+ if(r.numSites()>1){
+ final int clearzone=r.perfect() ? CLEARZONEP :
+ r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+
+ if(r2.numSites()>1){
+ final int clearzone=r2.perfect() ? CLEARZONEP :
+ r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 :
+ (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites2=Tools.countTopScores(r2.sites, clearzone);
+ if(numBestSites2>1){
+ //Ambiguous alignment
+ assert(r2.sites.size()>1);
+
+ boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r2.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ // Flag both mates as paired when canPair accepts their top sites.
+ if(r.numSites()>0 && r2.numSites()>0){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r2.topSite();
+ if(canPair(ss1, ss2, len1, len2, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n"+
+ r.mapped()+", "+r.paired()+", "+r.strand()+", "+r.ambiguous()+"\n\n"+r2.mapped()+", "+r2.paired()+", "+r2.strand()+", "+r2.ambiguous()+"\n\n";
+ assert(SLOW_ALIGN ? ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n";
+ r.setPaired(true);
+ r.mate.setPaired(true);
+ }
+ }
+
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;}
+
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ if(KILL_BAD_PAIRS){
+ if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ int x=r.mapScore/len1;
+ int y=r2.mapScore/len2;
+ if(x>=y){
+ r2.clearAnswers(false);
+ }else{
+ r.clearAnswers(false);
+ }
+ }
+ }
+ if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ if(MAKE_MATCH_STRING){
+ if(r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false);
+ // With STRICT_MAX_INDEL, a match string that hasLongIndel rejects unmaps this read and unpairs its mate.
+ if(STRICT_MAX_INDEL && r.mapped()){
+ if(hasLongIndel(r.match, index.MAX_INDEL)){
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){
+ r2.match=r2.topSite().match;
+ }else{
+ genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false);
+ if(STRICT_MAX_INDEL && r2.mapped()){
+ if(hasLongIndel(r2.match, index.MAX_INDEL)){
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ }
+
+ assert(checkTopSite(r)); // TODO remove this
+ if(verbose){
+ System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2);
+ if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);}
+ if(r2.match!=null && r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);}
+ }
+
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ //Same guard for read 2; the diagnostics print r2, the read that is actually inconsistent. TODO: Fix cause.
+ if(r2.mapScore>0 && r2.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r2+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }else if(r2.mapScore<=0 && r2.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r2+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+
+ assert(r.sites==null || r.mapScore>0) :
+ r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+ assert(r2.sites==null || r2.mapScore>0) :
+ r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+
+ assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails.";
+ assert(checkTopSite(r)); // TODO remove this
+ removeDuplicateBestSites(r);
+ removeDuplicateBestSites(r2);
+ // Recompute AVERAGE_PAIR_DIST from the running totals once more than 1000 mated pairs have been counted and this pair is paired.
+ if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){
+ AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ if(r.ambiguous() && AMBIGUOUS_TOSS){
+ if(r.sites!=null){r.sites=null;}
+ r.clearSite();
+ r.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.ambiguous() && AMBIGUOUS_TOSS){
+ if(r2.sites!=null){r2.sites=null;}
+ r2.clearSite();
+ r2.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r2.mapped() && r2.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ assert(Read.CHECKSITES(r2, basesM2));
+ }
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ final SiteScore ss=r.topSite();
+ ss.match=r.match;
+ msa.toLocalAlignment(r, ss, basesM1, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// System.err.println("\n\n*********\n\n"+r+"\n\n*********\n\n");
+// assert(Read.CHECKSITES(r, basesM1)); //TODO: This can fail; see bug#0001
+ }
+// assert(false) : r.mapped()+", "+LOCAL_ALIGN+", "+r.containsXYC()+", "+new String(r.match);
+
+ assert(checkTopSite(r2));
+ if(r2.mapped() && (LOCAL_ALIGN || r2.containsXYC())){
+ final SiteScore ss=r2.topSite();
+ ss.match=r2.match;
+ msa.toLocalAlignment(r2, ss, basesM2, r2.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// assert(Read.CHECKSITES(r2, basesM2)); //TODO: This can fail; see bug#0001
+ }
+
+ postFilterRead(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ postFilterRead(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ if(MAKE_MATCH_STRING){
+ ensureMatchStringOnPrimary(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ ensureMatchStringOnPrimary(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1);
+ calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2);
+ }
+ }
+
+}
diff --git a/current/align2/BBMapThread5.java b/current/align2/BBMapThread5.java
new file mode 100755
index 0000000..d1d99a1
--- /dev/null
+++ b/current/align2/BBMapThread5.java
@@ -0,0 +1,1292 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import jgi.CoveragePileup;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * Based on MapTestThread11f
+ *
+ * @author Brian Bushnell
+ * @date Jan 3, 2013
+ *
+ */
+public final class BBMapThread5 extends AbstractMapThread {
+
+ static final int ALIGN_COLUMNS=BBIndex5.ALIGN_COLUMNS; //Copied from the matching index class; returned by ALIGN_COLUMNS()
+ static final int ALIGN_ROWS=601; //Returned by ALIGN_ROWS(); maxReadLength() reports ALIGN_ROWS-1
+
+ //Local-alignment trimming parameters; processRead passes both to msa.toLocalAlignment.
+
+ /** Don't trim for local alignments unless at least this many bases will be clipped */
+ private final int LOCAL_ALIGN_TIP_LENGTH=1;
+ /** Range is 0-1; a lower number makes trimming more aggressive */
+ private final float LOCAL_ALIGN_MATCH_POINT_RATIO=1f;
+ //The CLEARZONE_RATIO* values below are multiplied by POINTS_MATCH2 in the constructor to produce the CLEARZONE* margins.
+ /** Ratio of the points for a match of a single base needed to declare unambiguous. 1 SNP is currently about 2.57 */
+ public final float CLEARZONE_RATIOP=1.6f; //default 1.3f, which makes read ambiguous if there is 1 N in an alternate site.
+ public final float CLEARZONE_RATIO1=2.0f; //Scales CLEARZONE1
+ public final float CLEARZONE_RATIO1b=2.6f; //Scales CLEARZONE1b
+ public final float CLEARZONE_RATIO1c=4.6f; //Scales CLEARZONE1c
+ public final float CLEARZONE_RATIO3=8.0f; //Scales CLEARZONE3 (only when PENALIZE_AMBIG is set)
+ /** Max allowed number of sites within 1 edit (excluding primary site) */
+ public final int CLEARZONE_LIMIT1e=40; //Needs to be redone to assign a quality penalty rather than simply marking as ambiguous
+ public final int CLEARZONEP; //Margin processRead uses when the read is flagged perfect
+ public final int CLEARZONE1; //Margin used when the top score is at least CLEARZONE1b_CUTOFF of the maximum
+ public final int CLEARZONE1b; //Margin used when the top score is at least CLEARZONE1c_CUTOFF of the maximum
+ public final int CLEARZONE1c; //Margin used for all lower top scores
+ //public final int CLEARZONE1e; //NOTE(review): scoreSlow still reads CLEARZONE1e; presumably inherited from AbstractMapThread - confirm
+ public final int CLEARZONE3; //0 unless PENALIZE_AMBIG; read by scoreSlow and processRead
+ public final float INV_CLEARZONE3; //1f/CLEARZONE3, or 0 when CLEARZONE3 is 0
+ public final float CLEARZONE1b_CUTOFF=0.92f; //Fraction of maxSwScore; see CLEARZONE1
+ public final float CLEARZONE1c_CUTOFF=0.82f; //Fraction of maxSwScore; see CLEARZONE1b
+
+ public final BBIndex5 index; //Created at the end of the constructor; exposed through index()
+
+
+ private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3; //minSitesToRetain that processRead passes to trimList
+ private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2; //NOTE(review): presumably the paired-read counterpart; its use is not visible in this section
+
+ /**
+ * EXPECTED_SITES is not supported by this mapper variant, so this setter only
+ * prints a warning naming the class to stderr.
+ * @param x Requested value; ignored.
+ */
+ public static void setExpectedSites(int x){
+     final String owner=BBMapThread5.class.getName();
+     System.err.println("Warning: EXPECTED_SITES is not valid for "+owner);
+ }
+
+ @Override
+ public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;}
+ @Override
+ public final int ALIGN_ROWS(){return ALIGN_ROWS;}
+ @Override
+ public final int maxReadLength(){return ALIGN_ROWS-1;}
+ @Override
+ final AbstractIndex index(){return index;}
+ @Override
+ final int CLEARZONE1(){return CLEARZONE1;}
+
+ public BBMapThread5(ConcurrentReadInputStream cris_, int keylen_,
+ CoveragePileup pileup_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_,
+ int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_,
+ boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_,
+ float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_,
+ ConcurrentReadOutputStream outStream_, ConcurrentReadOutputStream outStreamMapped_, ConcurrentReadOutputStream outStreamUnmapped_, ConcurrentReadOutputStream outStreamBlack_,
+ int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_,
+ int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_,
+ boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_,
+ boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_,
+ boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, float IDFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, int TRIM_MIN_LEN_,
+ boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){
+
+ super(cris_,
+ outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_,
+ pileup_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_,
+ AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, TRIM_MIN_LEN_, THRESH_,
+ minChrom_, maxChrom_, KFILTER_, IDFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_,
+ REQUIRE_CORRECT_STRANDS_PAIRS_,
+ SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_,
+ MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_,
+ MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_,
+ QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_,
+ keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_,
+ BBIndex5.MIN_APPROX_HITS_TO_KEEP, BBIndex5.USE_EXTENDED_SCORE,
+ BBIndex5.BASE_HIT_SCORE, BBIndex5.USE_AFFINE_SCORE, BBIndex5.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_);
+
+ assert(SLOW_ALIGN_PADDING>=0);
+ assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO";
+
+ if(SLOW_ALIGN || MAKE_MATCH_STRING){
+// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, MSA_TYPE);
+// POINTS_MATCH=msa.POINTS_MATCH();
+// POINTS_MATCH2=msa.POINTS_MATCH2();
+ CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2);
+ CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2);
+ CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2);
+ CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2);
+ CLEARZONE3=PENALIZE_AMBIG ? (int)(CLEARZONE_RATIO3*POINTS_MATCH2) : 0;
+// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1;
+ }else{
+// POINTS_MATCH=70;
+// POINTS_MATCH2=100;
+// msa=null;
+ CLEARZONE1=0;
+ CLEARZONE1b=0;
+ CLEARZONE1c=0;
+ CLEARZONEP=0;
+ CLEARZONE3=0;
+// CLEARZONE1e=0;
+ }
+ INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3);
+
+ index=new BBIndex5(KEYLEN, minChrom, maxChrom, KFILTER, msa);
+ }
+
+
+ /** Affine-score schedule: when the list is larger than GATES[i], trim with FRACTIONS[i]. */
+ private static final int[] AFFINE_TRIM_GATES={4, 8, 12, 16, 20, 24, 32, 40, 48};
+ private static final float[] AFFINE_TRIM_FRACTIONS={.65f, .7f, .75f, .8f, .85f, .9f, .95f, .97f, .99f};
+ /** The last this-many affine stages use the raised retain floor instead of minSitesToRetain. */
+ private static final int AFFINE_RAISED_FLOOR_STAGES=2;
+
+ /** Extended-score schedule. */
+ private static final int[] EXTENDED_TRIM_GATES={8, 16, 24, 36, 40, 48, 56, 64, 80};
+ private static final float[] EXTENDED_TRIM_FRACTIONS={.8f, .85f, .90f, .92f, .94f, .96f, .97f, .98f, .99f};
+
+ /** Schedule used when neither affine nor extended scoring is enabled. */
+ private static final int[] BASIC_TRIM_GATES={12, 16, 24, 28, 32, 48};
+ private static final float[] BASIC_TRIM_FRACTIONS={.65f, .7f, .74f, .8f, .85f, .90f};
+
+ /**
+ * Runs stages [first, end) of a trimming schedule in order. A stage fires only if
+ * the list is still larger than that stage's gate at the moment it is reached.
+ */
+ private static void trimInStages(ArrayList<SiteScore> sites, int[] gates, float[] fractions, int first, int end,
+         boolean retainPaired, int minRetain, int maxRetain){
+     for(int stage=first; stage<end; stage++){
+         if(sites.size()>gates[stage]){
+             Tools.trimSiteList(sites, fractions[stage], retainPaired, true, minRetain, maxRetain);
+         }
+     }
+ }
+
+ /**
+ * Trims a candidate site list with Tools.trimSiteList, applying progressively
+ * larger fractions the longer the list remains. The schedule depends on
+ * USE_AFFINE_SCORE / USE_EXTENDED_SCORE.
+ * @param list Sites to trim in place; may be null.
+ * @param retainPaired Forwarded to Tools.trimSiteList.
+ * @param maxScore Score compared against the first trim's result to detect the special case.
+ * @param specialCasePerfect If true (affine mode only), a first-trim result equal to maxScore uses a short .94/.99 schedule.
+ * @param minSitesToRetain Forwarded to Tools.trimSiteList.
+ * @param maxSitesToRetain Forwarded to Tools.trimSiteList.
+ * @return -99999 for a null or empty list, the sole site's score for a one-element list,
+ * otherwise the value returned by the first Tools.trimSiteList call.
+ */
+ public int trimList(ArrayList<SiteScore> list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){
+     if(list==null || list.isEmpty()){return -99999;}
+     if(list.size()==1){return list.get(0).score;}
+
+     if(USE_AFFINE_SCORE){
+         final int best=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+         if(specialCasePerfect && best==maxScore){
+             Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+             if(list.size()>8){
+                 Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+             }
+             return best;
+         }
+
+         //The final stages keep one extra site when the caller asked to retain more than one.
+         final int raisedFloor=(minSitesToRetain<=1 ? 1 : minSitesToRetain+1);
+         final int split=AFFINE_TRIM_GATES.length-AFFINE_RAISED_FLOOR_STAGES;
+         trimInStages(list, AFFINE_TRIM_GATES, AFFINE_TRIM_FRACTIONS, 0, split,
+                 retainPaired, minSitesToRetain, maxSitesToRetain);
+         trimInStages(list, AFFINE_TRIM_GATES, AFFINE_TRIM_FRACTIONS, split, AFFINE_TRIM_GATES.length,
+                 retainPaired, raisedFloor, maxSitesToRetain);
+         return best;
+     }
+
+     if(USE_EXTENDED_SCORE){
+         final int best=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+         trimInStages(list, EXTENDED_TRIM_GATES, EXTENDED_TRIM_FRACTIONS, 0, EXTENDED_TRIM_GATES.length,
+                 retainPaired, minSitesToRetain, maxSitesToRetain);
+         return best;
+     }
+
+     final int best=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+     trimInStages(list, BASIC_TRIM_GATES, BASIC_TRIM_FRACTIONS, 0, BASIC_TRIM_GATES.length,
+             retainPaired, minSitesToRetain, maxSitesToRetain);
+     return best;
+ }
+
+ /**
+ * Re-scores every candidate site in the list with the aligner (msa), updating each
+ * SiteScore in place: slow score, start/stop limits, optional match string, and the
+ * perfect/semiperfect flags. Each site's final score is set to its slow score.
+ * The score floor passed to msa (minMsaLimit) and the match-string threshold
+ * (minMatch) are raised as higher-scoring sites are encountered.
+ * Result arrays from msa.fillAndScoreLimited are read as: [0] score, [1]/[2] new
+ * start/stop, [3..5] values handed to msa.traceback, and, in the longer form,
+ * [6]/[7] extra left/right padding.
+ * @param list Candidate sites; modified in place.
+ * @param basesP Bases used for sites on Gene.PLUS.
+ * @param basesM Bases used for sites on the other strand (presumably the reverse complement - confirm).
+ * @param maxSwScore Score at which a site is flagged perfect; asserted as an upper bound.
+ * @param maxImperfectSwScore Sites already at or above this score are not realigned.
+ */
+ public void scoreSlow(final ArrayList<SiteScore> list, final byte[] basesP, final byte[] basesM,
+ final int maxSwScore, final int maxImperfectSwScore){
+ //Score floor for msa: a fraction of maxSwScore (pre-rescue ratio when PAIRED), lowered by CLEARZONE1e.
+ int minMsaLimit;
+ if(PAIRED){
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore);
+ }else{
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore);
+ }
+ assert(Read.CHECKSITES(list, basesP, basesM, -1));
+
+ int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string
+ if(verbose){
+ System.err.println("Slow-scoring. maxSwScore="+maxSwScore+", maxImperfectSwScore="+maxImperfectSwScore+", minMsaLimit="+minMsaLimit+", minMatch="+minMatch);
+ }
+ for(int i=0; i<list.size(); i++){
+ final SiteScore ss=list.get(i);
+ assert(ss.lengthsAgree());
+ final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
+
+ if(SEMIPERFECTMODE){
+ assert(ss.stop-ss.start==bases.length-1);
+ assert(ss.semiperfect);
+ }
+
+ if(verbose){System.err.println("\nSlow-scoring "+ss);}
+ if(ss.stop-ss.start!=bases.length-1){ //Site span differs from the read length: reset slow score and perfect flags
+ assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText();
+ assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n";
+ ss.setSlowScore(0);
+ ss.semiperfect=false;
+ ss.perfect=false;
+ }
+
+ final int swscoreNoIndel=ss.slowScore; //Score present before realignment (presumably set by scoreNoIndels - confirm)
+ int[] swscoreArray=null;
+
+ boolean clipped=true, setLimits=false;
+ if(swscoreNoIndel<maxImperfectSwScore && !ss.semiperfect){ //Realign only sites that are neither semiperfect nor already near the maximum
+ if(verbose && ss.stop-ss.start>4000){
+ System.err.println(ss.toText());
+ System.err.println(list.size());
+ System.err.println();
+ }
+
+ int expectedLen=GapTools.calcGrefLen(ss);
+ if(verbose){System.err.println("expectedLen="+expectedLen);}
+ if(expectedLen>=EXPECTED_LEN_LIMIT){
+ //TODO: Alternately, I could kill the site.
+ ss.setStop(ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT));
+ if(verbose){System.err.println("expectedLen="+expectedLen+"; ss="+ss);}
+ }
+
+ int pad=SLOW_ALIGN_PADDING;
+ final int minscore=Tools.max(swscoreNoIndel, minMsaLimit); //Limit passed to msa
+ final int minscore2=Tools.max(swscoreNoIndel-MSA.MIN_SCORE_ADJUST, minMsaLimit); //Slightly looser limit used for the traceback decision
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+ if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));}
+
+ if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen<EXPECTED_LEN_LIMIT)){ //Longer result form: widen the site by the padding in [6]/[7] and rerun
+ int[] oldArray=swscoreArray.clone();
+ assert(swscoreArray.length==8);
+ int extraPadLeft=swscoreArray[6];
+ int extraPadRight=swscoreArray[7];
+
+ if(verbose){
+ System.err.println("msa returned "+Arrays.toString(swscoreArray)+", re-running.");
+ System.err.println("Added extra padding: "+ss.toText()+", "+Arrays.toString(oldArray));
+ }
+
+ ss.setLimits(ss.start-extraPadLeft, ss.stop+extraPadRight);
+ pad=SLOW_ALIGN_PADDING+EXTRA_PADDING;
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+
+ if(verbose){System.err.println("Result of extra padding: "+ss.toText()+", "+Arrays.toString(swscoreArray));}
+ if(swscoreArray==null || swscoreArray[0]<oldArray[0]){ //Fall back to the first result if the rerun failed or scored lower
+ if(verbose){
+ System.err.println("Result was inferior.");
+ }
+ swscoreArray=oldArray;
+ }
+ }
+ assert(ss.lengthsAgree());
+ if(verbose){
+ System.err.println(QUICK_MATCH_STRINGS+", "+(swscoreArray==null ? "null" : (swscoreArray.length+", "+swscoreArray[0]+" >=? "+minscore)));
+ System.err.println("start="+ss.start+", stop="+ss.stop+", len="+ss.mappedLength());
+ }
+ if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore2 && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){
+ if(verbose){System.err.println("Generating match string.");}
+ assert(swscoreArray.length==6) : swscoreArray.length;
+ assert(swscoreArray[0]>=minscore2) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch;
+ ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null);
+ if(ss.match!=null){
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ setLimits=true;
+ assert(ss.lengthsAgree());
+ clipped=ss.fixXY(bases, true, msa);
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ clipped=ss.clipTipIndels(bases, basesM, 4, 10, msa) || clipped;
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ assert(ss.lengthsAgree());
+ }
+ }else{
+ ss.match=null;
+ }
+ }
+ if(swscoreArray!=null && !setLimits){ //msa produced a result and the traceback branch did not already apply its limits
+ if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));}
+ ss.setSlowScore(swscoreArray[0]);
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ assert(ss.lengthsAgree());
+ }else{
+ assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP);
+ assert(clipped || swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) :
+ setLimits+", "+clipped+", "+(swscoreArray==null)+", "+
+ swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+
+ ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow
+ }
+ assert(ss.lengthsAgree());
+ ss.setScore(ss.slowScore);
+ minMatch=Tools.max(minMatch, ss.slowScore); //Later sites must beat the best slow score so far to get a quick match string
+ minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3); //Raise the floor to within CLEARZONE3 of the best slow score so far
+ assert(ss.slowScore<=maxSwScore);
+ assert(!(ss.perfect && ss.slowScore<maxSwScore));
+ ss.perfect=(ss.slowScore==maxSwScore);
+ if(ss.perfect){ss.semiperfect=true;}
+ else if(!ss.semiperfect){ss.setPerfect(bases);}
+
+ if(verbose){System.err.println(" -> "+ss);}
+ }
+
+ }
+
+ /**
+ * Maps one read without using a mate. In order: quick-maps to collect candidate
+ * sites, trims the list, slow-scores the sites, merges duplicates, decides whether
+ * the top site is ambiguous, drops sites below the minimum score ratio, generates
+ * the match string, repairs inconsistent mapScore/sites states, applies the
+ * CLEARZONE3 adjustment, optionally converts to a local alignment, post-filters,
+ * and finally updates statistics. All results are stored on the read.
+ * Returns early without touching the read when it is excluded by idmodulo, and
+ * marks the read discarded when quickMap returns a negative value.
+ * @param r Read to map; modified in place.
+ * @param basesM Bases passed to helpers alongside r.bases (presumably the reverse complement - confirm).
+ */
+ public void processRead(final Read r, final byte[] basesM){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;} //Subsampling: only reads whose numericID is 1 modulo idmodulo are processed
+ final byte[] basesP=r.bases;
+
+// System.err.print(" rd#"+r.numericID+" ");
+// if(r.numericID==25967){
+// verbose=true;
+// msa.verbose=true;
+// GapTools.verbose=true;
+// index.verbose=true;
+// tcr.verbose=true;
+// }
+
+ if(verbose){System.err.println("\nProcessing "+r);}
+ readsUsed1++;
+
+ final int maxPossibleQuickScore=quickMap(r, basesM);
+ if(verbose){System.err.println("\nQuick Map: \t"+r.sites);}
+
+ if(maxPossibleQuickScore<0){ //Negative result: count the read as low quality and discard it
+ r.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=basesP.length;
+ r.setDiscarded(true);
+ return;
+ }
+ initialSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);}
+
+ int maxSwScore=0;
+ int maxImperfectSwScore=0;
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ maxSwScore=msa.maxQuality(r.length());
+ maxImperfectSwScore=msa.maxImperfectScore(r.length());
+ }
+
+ if(TRIM_LIST && r.numSites()>1){
+ if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);}
+ int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN); //NOTE(review): the returned score is never read
+ }
+ postTrimSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\nAfter trim: \t"+r.sites);}
+
+ assert(Read.CHECKSITES(r, basesM));
+
+
+ if(SLOW_ALIGN && r.numSites()>0){
+
+ int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore);
+
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+// int numPerfectScores=0;
+// if(numNearPerfectScores>0){
+// for(SiteScore ss : r.list){
+// if(ss.perfect){numPerfectScores++;}
+// else{break;}
+// }
+// }
+
+ if(verbose){
+ System.err.println("\nAfter scoreNoIndels: \t"+r.sites);
+ }
+
+ if(numNearPerfectScores<1){ //Tip-deletion search and slow scoring are both skipped once a near-perfect site exists
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);}
+ }
+
+ if(verbose){
+ System.err.println("\nAfter findTipDeletions: \t"+r.sites);
+ }
+
+ //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length
+ //***Above note should be resolved now, but needs to be verified.
+
+ if(numNearPerfectScores<1){
+ scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore);
+ }
+
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r.sites, index.MAX_INDEL); //NOTE(review): 'removed' is never read
+ if(r.numSites()==0){r.clearMapping();}
+ }
+
+ if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);}
+ assert(Read.CHECKSITES(r, basesM, false));
+ }
+
+
+ if(r.numSites()>0){
+ mapped1++;
+ try {
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); //NOTE(review): the original exception is printed but not attached as the cause
+ }
+ Collections.sort(r.sites);
+ }
+
+ if(r.numSites()>1){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r.sites.get(1);
+ //Ensure no duplicates
+ assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false);
+ }
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore);
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);}
+
+ if(r.numSites()>1){
+ final int clearzone=r.perfect() ? CLEARZONEP : //Margin depends on how close the top score is to maxSwScore
+ r.topSite().score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ final int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r.setAmbiguous(b);
+ }
+ }
+
+ if(verbose){System.err.println("A: "+r);}
+
+ if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){
+ int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.topSite().score<lim){r.sites=null;} //Top site below the minimum ratio: drop all sites
+ else{Tools.removeLowQualitySitesUnpaired(r.sites, Tools.min(lim, Tools.max(1, lim-CLEARZONE3)));}
+ }
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ assert(Read.CHECKSITES(r, basesM));
+
+ assert(r.gaps==null || r.gaps[0]==r.start && r.gaps[r.gaps.length-1]==r.stop);
+ assert(r.sites==null || r.mapScore>0) : r.sites+", "+r.mapScore+"\n"+r;
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("C: "+r);}
+
+ //***$
+ if(MAKE_MATCH_STRING && r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ //Reuse the match string stored on the top site when allowed
+ r.match=r.topSite().match;
+ }else{
+ if(r.sites.size()>1){
+ assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n";
+ }
+ int mapScore=r.mapScore; //Only referenced by the commented-out diagnostic inside the loop below
+
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+
+ if(verbose){System.err.println("D: "+r);}
+
+ {
+ boolean firstIter=true;
+ do{//
+ if(!firstIter){ //A previous pass left the top site scoring below the second site: re-sort and reselect
+ Collections.sort(r.sites);
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ }
+ genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true);
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+// TODO: Fix this; it should never happen.
+// if(mapScore>r.mapScore){
+// System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID);
+// }
+ if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){ //Force the score to -9999 or lower so this site falls out of first place
+ SiteScore ss=r.topSite();
+ ss.score=r.mapScore=Tools.min(ss.score, -9999);
+ ss.setSlowPairedScore(ss.score, ss.score);
+ }
+ r.topSite().score=r.topSite().slowScore;
+ firstIter=false;
+ }while(r.sites.size()>1 && r.topSite().score<r.sites.get(1).score);
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("E: "+r);}
+ }
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ removeDuplicateBestSites(r);
+ }
+ if(r.numSites()>0){r.topSite().match=r.match;}
+
+
+
+ if(r.sites!=null && r.mapScore<=0){//This came from BBMapThreadPacBio; not sure if needed for other modes
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){
+ System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped.\t"+(r.match==null)+"\t"+r.mapScore+"\t"+r.topSite()+"\t"+new String(r.bases));
+ if(MSA.bandwidth>0 || MSA.bandwidthRatio>0 || MSA.flatMode){Shared.anomaly=true;}
+ }
+ r.mapScore=0;
+ r.setMapped(false);
+ r.sites=null;
+ }
+
+
+
+ //This block is to prevent an assertion from firing. Generally caused by alignment being lost during match generation.
+ //TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(BANDWIDTH<1){
+ if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ }
+ r.clearMapping();
+ }
+ assert(r.sites==null || r.mapScore>0) :
+ "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+
+ "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+
+ "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+
+ "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n";
+
+// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString();
+
+ if((CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP) && r.sites!=null && !r.ambiguous()){
+
+ assert(r.mapScore>0);
+ float cz3v2=(CLEARZONE3*Tools.min(1.25f, (maxSwScore/(float)r.mapScore))); //CLEARZONE3 scaled up by at most 1.25x for lower-scoring reads
+
+// boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3);
+ boolean changed=applyClearzone3(r, (int)cz3v2, 1/cz3v2);
+ if(changed){
+ int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.mapScore<minScore){ //Adjusted score fell below the minimum ratio: flag the read as ambiguous
+ assert(!r.ambiguous());
+ r.setAmbiguous(true);
+ }
+ }
+ }
+
+ if(r.ambiguous() && AMBIGUOUS_TOSS){r.sites=null; r.clearSite(); r.setMapped(false);}
+
+ if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM, maxImperfectSwScore, maxSwScore);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){ //Reads containing XYC symbols are converted with a tip length of 1, even without LOCAL_ALIGN
+ msa.toLocalAlignment(r, r.topSite(), basesM, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ if(r.numSites()==0 || (!r.ambiguous() && r.mapScore<maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)){
+ r.clearMapping();
+ }
+ postFilterRead(r, basesM, maxImperfectSwScore, maxSwScore);
+ if(MAKE_MATCH_STRING){ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore);}
+
+ if(PENALIZE_AMBIG){
+ int penalty=calcTipScorePenalty(r, maxSwScore, 7);
+ applyScorePenalty(r, penalty);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore, maxPossibleQuickScore);
+ }
+ }
+
+
+ /** Returns number of perfect pairs */
+ public int pairSiteScoresInitial(Read r, Read r2, boolean trim){
+
+ if(r.numSites()<1 || r2.numSites()<1){return 0;}
+
+ SiteScore.PCOMP.sort(r.sites);
+ SiteScore.PCOMP.sort(r2.sites);
+
+ for(SiteScore ss : r.sites){ss.setPairedScore(0);}
+ for(SiteScore ss : r2.sites){ss.setPairedScore(0);}
+
+// ArrayList<SiteScorePair> pairs=new ArrayList<SiteScorePair>(Tools.min(8, Tools.min(r.list.size(), r2.list.size())));
+
+ int maxPairedScore1=-1;
+ int maxPairedScore2=-1;
+
+
+// for(SiteScore ss : r.list){
+// System.out.println(ss.toText());
+// }
+
+// int i=0, j=0;
+ final int ilimit=r.sites.size()-1;
+ final int jlimit=r2.sites.size()-1;
+ final int maxReadLen=Tools.max(r.length(), r2.length());
+
+// final int outerDistLimit=MIN_PAIR_DIST+r.length()+r2.length();
+ final int outerDistLimit=(Tools.max(r.length(), r2.length())*(OUTER_DIST_MULT))/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0);
+ final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? TIP_DELETION_SEARCH_RANGE : 0);
+ final int expectedFragLength=AVERAGE_PAIR_DIST+r.length()+r2.length();
+
+ int numPerfectPairs=0;
+
+ for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){
+ SiteScore ss1=r.sites.get(i);
+ SiteScore ss2=r2.sites.get(j);
+
+ while(j<jlimit && (ss2.chrom<ss1.chrom || (ss2.chrom==ss1.chrom && ss1.start-ss2.stop>innerDistLimit))){
+ j++;
+ ss2=r2.sites.get(j);
+ }
+
+ for(int k=j; k<=jlimit; k++){
+ ss2=r2.sites.get(k);
+
+ if(ss2.chrom>ss1.chrom){break;}
+ if(ss2.start-ss1.stop>innerDistLimit){break;}
+
+// int dist=0;
+//
+// if(ss1.start<=ss2.start){
+// dist=ss2.start-ss1.stop;
+// }else if(ss1.start>ss2.start){
+// dist=ss1.start-ss2.stop;
+// }
+
+
+// int innerdist=0;
+// int outerdist=0;
+//
+// if(ss1.start<=ss2.start){
+// innerdist=ss2.start-ss1.stop;
+// outerdist=ss2.stop-ss1.start;
+// }else if(ss1.start>ss2.start){
+// innerdist=ss1.start-ss2.stop;
+// outerdist=ss1.stop-ss2.start;
+// }
+
+ final int innerdist, outerdist;
+ //assert(!SAME_STRAND_PAIRS) : "TODO";
+
+ if(REQUIRE_CORRECT_STRANDS_PAIRS){
+ if(ss1.strand!=ss2.strand){
+ if(ss1.strand==Gene.PLUS){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+
+ assert(outerdist>=innerdist);
+
+ if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){
+
+ boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS);
+
+ if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){
+
+ boolean paired1=false, paired2=false;
+
+ int deviation=absdif(AVERAGE_PAIR_DIST, innerdist);
+
+ final int pairedScore1;
+ final int pairedScore2;
+ if(strandOK){
+// pairedScore1=ss1.score+ss2.score/2;
+// pairedScore2=ss2.score+ss1.score/2;
+
+ pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-(((deviation)*ss2.score)/(32*expectedFragLength+100)));
+ pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-(((deviation)*ss1.score)/(32*expectedFragLength+100)));
+ }else{//e.g. a junction
+ pairedScore1=ss1.score+Tools.max(0, ss2.score/16);
+ pairedScore2=ss2.score+Tools.max(0, ss1.score/16);
+ }
+
+ if(pairedScore1>ss1.pairedScore){
+ paired1=true;
+ ss1.setPairedScore(Tools.max(ss1.pairedScore, pairedScore1));
+ maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+ // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText());
+ }else{
+ // System.out.println(ss1.toText()+" already paired.");
+ }
+ if(pairedScore2>ss2.pairedScore){
+ paired2=true;
+ ss2.setPairedScore(Tools.max(ss2.pairedScore, pairedScore2));
+ maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+
+ if(paired1 && paired2 && outerdist>=maxReadLen && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){
+ numPerfectPairs++; //Lower bound. Some perfect pairs may be the same.
+ }
+
+// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1);
+// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2);
+// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+ }
+ }
+
+ }
+
+
+
+ for(SiteScore ss : r.sites){
+ if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+ for(SiteScore ss : r2.sites){
+ if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+
+ if(trim){
+ if(numPerfectPairs>0){
+// System.out.print(".");
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }else{
+ if(r.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ if(r2.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ }
+ }
+
+// if(pairs.isEmpty()){return null;}
+//
+// ArrayList<SiteScore> temp=new ArrayList<SiteScore>(Tools.max(r.list.size(), r2.list.size()));
+//
+// for(SiteScore ss : r.list){
+// if(ss.score>maxPairedScore1){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.a);
+// }
+// r.list.clear();
+// r.list.addAll(temp);
+//
+// for(SiteScore ss : r2.list){
+// if(ss.score>maxPairedScore2){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.b);
+// }
+// r2.list.clear();
+// r2.list.addAll(temp);
+//
+// return pairs;
+
+ return numPerfectPairs;
+ }
+
+
+ public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){ //Maps r together with r.mate; results (sites, mapScore, match, paired/ambiguous flags) are written into both Read objects.
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;} //When idmodulo>1, only reads with numericID%idmodulo==1 are processed
+ final Read r2=r.mate;
+ assert(r2!=null);
+ final byte[] basesP1=r.bases, basesP2=r2.bases; //NOTE(review): basesM1/basesM2 are presumably the reverse complements of these - confirm with caller
+ final int len1=(basesP1==null ? 0 : basesP1.length), len2=(basesP2==null ? 0 : basesP2.length);
+
+ readsUsed1++;
+ readsUsed2++;
+
+ final int maxPossibleQuickScore1=quickMap(r, basesM1);
+ final int maxPossibleQuickScore2=quickMap(r2, basesM2);
+
+ if(verbose){
+ System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate);
+ }
+
+ if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){ //Both quick maps returned a negative score: discard both reads and count them as low quality
+ r.sites=null;
+ r2.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=len1;
+ r.setDiscarded(true);
+ lowQualityReadsDiscarded2++;
+ lowQualityBasesDiscarded2+=len2;
+ r2.setDiscarded(true);
+ return;
+ }
+
+ //Not really needed due to subsumption
+// Tools.mergeDuplicateSites(r.list);
+// Tools.mergeDuplicateSites(r2.list);
+
+ initialSiteSum1+=r.numSites();
+ initialSiteSum2+=r2.numSites();
+
+ //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used.
+ //Discards need to be tracked separately for each end.
+// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;}
+
+ final int maxSwScore1=msa.maxQuality(len1);
+ final int maxImperfectSwScore1=msa.maxImperfectScore(len1);
+ final int maxSwScore2=msa.maxQuality(len2);
+ final int maxImperfectSwScore2=msa.maxImperfectScore(len2);
+
+ pairSiteScoresInitial(r, r2, TRIM_LIST);
+ if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(TRIM_LIST){
+
+ if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){
+ if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);}
+ if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);}
+ }
+
+ trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ postTrimSiteSum2+=r2.numSites();
+
+ {//Reset score to non-paired score
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ }
+
+ if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);}
+
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN){
+
+ if(r.numSites()>0){
+
+ int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores1<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);}
+ }
+
+ //TODO:
+ //Note scoreSlow can be skipped under this circumstance:
+ //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites.
+ scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ if(r2.numSites()>0){
+ int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores2<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);}
+ }
+
+ scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r2.sites, index.MAX_INDEL);
+ if(r2.numSites()<1){r2.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+
+ if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+
+ if(DO_RESCUE){
+ int unpaired1=0; //Number of sites of r whose pairedScore is 0
+ int unpaired2=0; //Number of sites of r2 whose pairedScore is 0
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired1++;}
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r2.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired2++;}
+ }
+ }
+
+ if(unpaired1>0 && r.numSites()>0){ //Rescue is invoked with r's sites as anchors; r2's site list is merged afterwards
+ Collections.sort(r.sites);
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+ if(unpaired2>0 && r2.numSites()>0){ //Same as above with the roles of r and r2 swapped
+ Collections.sort(r2.sites);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ postRescueSiteSum1+=r.numSites();
+ postRescueSiteSum2+=r2.numSites();
+
+// if(r.list!=null){Collections.sort(r.list);}
+// if(r2.list!=null){Collections.sort(r2.list);}
+//
+// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+
+ if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+ }else{
+ Tools.mergeDuplicateSites(r.sites, true, false);
+ Tools.mergeDuplicateSites(r2.sites, true, false);
+ if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+
+ if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!)
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+ if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(r.numSites()>0){
+ mapped1++;
+ Collections.sort(r.sites);
+ }
+ if(r2.numSites()>0){
+ mapped2++;
+ Collections.sort(r2.sites);
+ }
+ assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ r.setPerfectFlag(maxSwScore1);
+ r2.setPerfectFlag(maxSwScore2);
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+ }
+
+ //Ambiguity check for read 1: the clear zone is chosen from the top site's score relative to maxSwScore1.
+ if(r.numSites()>1){
+ final int clearzone=r.perfect() ? CLEARZONEP :
+ r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+ //Ambiguity check for read 2, same logic as for read 1.
+ if(r2.numSites()>1){
+ final int clearzone=r2.perfect() ? CLEARZONEP :
+ r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites2=Tools.countTopScores(r2.sites, clearzone);
+ if(numBestSites2>1){
+ //Ambiguous alignment
+ assert(r2.sites.size()>1);
+
+ boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r2.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(r.numSites()>0 && r2.numSites()>0){ //Mark both reads as paired when their top sites can pair
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r2.topSite();
+ if(canPair(ss1, ss2, len1, len2, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n"+
+ r.mapped()+", "+r.paired()+", "+r.strand()+", "+r.ambiguous()+"\n\n"+r2.mapped()+", "+r2.paired()+", "+r2.strand()+", "+r2.ambiguous()+"\n\n";
+ assert(SLOW_ALIGN ? ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n";
+ r.setPaired(true);
+ r.mate.setPaired(true);
+ }
+ }
+
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;}
+
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ if(KILL_BAD_PAIRS){
+ if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ int x=r.mapScore/len1; //Score per base (integer division); the read with the lower value loses its answers, ties favor r
+ int y=r2.mapScore/len2;
+ if(x>=y){
+ r2.clearAnswers(false);
+ }else{
+ r.clearAnswers(false);
+ }
+ }
+ }
+ if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ if(MAKE_MATCH_STRING){
+ if(r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false);
+
+ if(STRICT_MAX_INDEL && r.mapped()){
+ if(hasLongIndel(r.match, index.MAX_INDEL)){
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){
+ r2.match=r2.topSite().match;
+ }else{
+ genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false);
+
+ if(STRICT_MAX_INDEL && r2.mapped()){
+ if(hasLongIndel(r2.match, index.MAX_INDEL)){
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ }
+
+ assert(checkTopSite(r)); // TODO remove this
+ if(verbose){
+ System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2);
+ if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);}
+ if(r2.match!=null && r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);}
+ }
+
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r2.mapScore>0 && r2.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r2+"\n");} //Print r2: it is the read in the anomalous state
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }else if(r2.mapScore<=0 && r2.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r2+"\n");} //Print r2: it is the read in the anomalous state
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+
+ assert(r.sites==null || r.mapScore>0) :
+ r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+ assert(r2.sites==null || r2.mapScore>0) :
+ r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+
+ assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails.";
+ assert(checkTopSite(r)); // TODO remove this
+ removeDuplicateBestSites(r);
+ removeDuplicateBestSites(r2);
+
+ if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){ //Re-estimate the average pair distance from the running totals once more than 1000 pairs have been counted
+ AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ if(r.ambiguous() && AMBIGUOUS_TOSS){ //Tossed ambiguous read: drop its sites and unpair both reads
+ if(r.sites!=null){r.sites=null;}
+ r.clearSite();
+ r.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.ambiguous() && AMBIGUOUS_TOSS){
+ if(r2.sites!=null){r2.sites=null;}
+ r2.clearSite();
+ r2.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r2.mapped() && r2.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ assert(Read.CHECKSITES(r2, basesM2));
+ }
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ final SiteScore ss=r.topSite();
+ ss.match=r.match;
+ msa.toLocalAlignment(r, ss, basesM1, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+ }
+
+ assert(checkTopSite(r2));
+ if(r2.mapped() && (LOCAL_ALIGN || r2.containsXYC())){
+ final SiteScore ss=r2.topSite();
+ ss.match=r2.match;
+ msa.toLocalAlignment(r2, ss, basesM2, r2.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+ }
+
+ postFilterRead(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ postFilterRead(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ if(MAKE_MATCH_STRING){
+ ensureMatchStringOnPrimary(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ ensureMatchStringOnPrimary(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1);
+ calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2);
+ }
+ }
+
+}
diff --git a/current/align2/BBMapThreadAcc.java b/current/align2/BBMapThreadAcc.java
new file mode 100755
index 0000000..3a5147d
--- /dev/null
+++ b/current/align2/BBMapThreadAcc.java
@@ -0,0 +1,1397 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import jgi.CoveragePileup;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * Based on MapTestThread11i
+ *
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public final class BBMapThreadAcc extends AbstractMapThread{
+
+ static final int ALIGN_COLUMNS=BBIndexAcc.ALIGN_COLUMNS; //Mirrors the index class constant; exposed through ALIGN_COLUMNS()
+ static final int ALIGN_ROWS=601; //maxReadLength() reports ALIGN_ROWS-1
+
+
+
+ /** Don't trim for local alignments unless at least this many bases will be clipped */
+ private final int LOCAL_ALIGN_TIP_LENGTH=1;
+ /** Range is 0-1; a lower number makes trimming more aggressive */
+ private final float LOCAL_ALIGN_MATCH_POINT_RATIO=1f;
+
+ /** Ratio of the points for a match of a single base needed to declare unambiguous. 1 SNP is currently about 2.57 */
+ public final float CLEARZONE_RATIOP=1.6f; //default 1.3f, which makes read ambiguous if there is 1 N in an alternate site.
+ public final float CLEARZONE_RATIO1=2.0f; //Multiplier for CLEARZONE1
+ public final float CLEARZONE_RATIO1b=2.6f; //Multiplier for CLEARZONE1b
+ public final float CLEARZONE_RATIO1c=4.8f; //Multiplier for CLEARZONE1c
+ public final float CLEARZONE_RATIO3=9.5f; //Multiplier for CLEARZONE3
+ /** Max allowed number of sites within 1 edit (excluding primary site) */
+ public final int CLEARZONE_LIMIT1e=50;
+ public final int CLEARZONEP; //The CLEARZONE* ints are set in the constructor: ratio*POINTS_MATCH2 truncated to int, or 0 when neither SLOW_ALIGN nor MAKE_MATCH_STRING is set
+ public final int CLEARZONE1;
+ public final int CLEARZONE1b;
+ public final int CLEARZONE1c;
+ //public final int CLEARZONE1e;
+ public final int CLEARZONE3; //Also 0 when PENALIZE_AMBIG is false
+ public final float INV_CLEARZONE3; //1/CLEARZONE3, or 0 when CLEARZONE3 is 0
+ public final float CLEARZONE1b_CUTOFF_FLAT_RATIO=12;//3f;
+ public final float CLEARZONE1b_CUTOFF_FLAT; //CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2, set in the constructor
+ public final float CLEARZONE1b_CUTOFF_SCALE=0.97f;
+ public final float CLEARZONE1c_CUTOFF_FLAT_RATIO=26;//7f;
+ public final float CLEARZONE1c_CUTOFF_FLAT; //CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2, set in the constructor
+ public final float CLEARZONE1c_CUTOFF_SCALE=0.92f;
+
+ public final BBIndexAcc index; //Created at the end of the constructor; also returned by index()
+
+
+ private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3; //NOTE(review): presumably the minSitesToRetain passed to trimList for unpaired reads - confirm at call sites
+ private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2; //NOTE(review): presumably the minSitesToRetain passed to trimList for paired reads - confirm at call sites
+
+ public static void setExpectedSites(int x){ //The argument is ignored; this class only prints a warning
+ System.err.println("Warning: EXPECTED_SITES is not valid for ".concat(new Object() { }.getClass().getEnclosingClass().getName())); //Anonymous-class idiom yields the declaring class from a static context
+ }
+
+ @Override
+ public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;} //Instance accessor for the static ALIGN_COLUMNS constant
+ @Override
+ public final int ALIGN_ROWS(){return ALIGN_ROWS;} //Instance accessor for the static ALIGN_ROWS constant
+ @Override
+ public final int maxReadLength(){return ALIGN_ROWS-1;} //One less than the number of alignment rows
+ @Override
+ final AbstractIndex index(){return index;} //Returns this thread's BBIndexAcc as the abstract type
+ @Override
+ final int CLEARZONE1(){return CLEARZONE1;} //Accessor for the per-instance CLEARZONE1 value
+
+ public BBMapThreadAcc(ConcurrentReadInputStream cris_, int keylen_,
+ CoveragePileup pileup_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_,
+ int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_,
+ boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_,
+ float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_,
+ ConcurrentReadOutputStream outStream_, ConcurrentReadOutputStream outStreamMapped_, ConcurrentReadOutputStream outStreamUnmapped_, ConcurrentReadOutputStream outStreamBlack_,
+ int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_,
+ int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_,
+ boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_,
+ boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_,
+ boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, float IDFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, int TRIM_MIN_LEN_,
+ boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){
+
+ super(cris_,
+ outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_,
+ pileup_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_,
+ AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, TRIM_MIN_LEN_, THRESH_,
+ minChrom_, maxChrom_, KFILTER_, IDFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_,
+ REQUIRE_CORRECT_STRANDS_PAIRS_,
+ SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_,
+ MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_,
+ MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_,
+ QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_,
+ keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_,
+ BBIndexAcc.MIN_APPROX_HITS_TO_KEEP, BBIndexAcc.USE_EXTENDED_SCORE,
+ BBIndexAcc.BASE_HIT_SCORE, BBIndexAcc.USE_AFFINE_SCORE, BBIndexAcc.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_);
+
+ assert(SLOW_ALIGN_PADDING>=0);
+ assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO";
+
+ if(!SLOW_ALIGN && !MAKE_MATCH_STRING){
+ //Neither slow alignment nor match-string generation is enabled,
+ //so every clear zone is set to zero.
+ //(Legacy commented-out code here set POINTS_MATCH=70, POINTS_MATCH2=100 and msa=null.)
+ CLEARZONEP=0;
+ CLEARZONE1=0;
+ CLEARZONE1b=0;
+ CLEARZONE1c=0;
+ CLEARZONE3=0;
+ //CLEARZONE1e is disabled in both branches.
+ }else{
+ //Each clear zone is its CLEARZONE_RATIO* constant multiplied by POINTS_MATCH2, truncated to int.
+ //(Legacy commented-out code here built msa with MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, MSA_TYPE)
+ //and read POINTS_MATCH and POINTS_MATCH2 from it.)
+ CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2);
+ CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2);
+ CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2);
+ CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2);
+ CLEARZONE3=!PENALIZE_AMBIG ? 0 : (int)(CLEARZONE_RATIO3*POINTS_MATCH2);
+ //Disabled: CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1;
+ }
+
+ INV_CLEARZONE3=(CLEARZONE3!=0 ? 1f/CLEARZONE3 : 0);
+ CLEARZONE1b_CUTOFF_FLAT=CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2;
+ CLEARZONE1c_CUTOFF_FLAT=CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2;
+
+ index=new BBIndexAcc(KEYLEN, minChrom, maxChrom, KFILTER, msa);
+ }
+
+
+ public int trimList(ArrayList<SiteScore> list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){
+ if(list==null || list.size()==0){return -99999;}
+ if(list.size()==1){return list.get(0).score;}
+
+ final int highestScore;
+ if(USE_AFFINE_SCORE){
+
+ highestScore=Tools.trimSiteList(list, .35f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+// System.err.println("\nTrimming list of length "+list.size()+" vs highestScore "+highestScore+", maxScore "+maxScore+", specialcasePerfect="+specialCasePerfect);
+
+ final int mstr2=(minSitesToRetain<=1 ? 1 : minSitesToRetain+1);
+ if(highestScore==maxScore && specialCasePerfect){
+ Tools.trimSiteList(list, .9f, retainPaired, true, mstr2, maxSitesToRetain);
+ if(list.size()>30){Tools.trimSiteList(list, .92f, retainPaired, true, mstr2, maxSitesToRetain);}
+ if(list.size()>60){Tools.trimSiteList(list, .94f, retainPaired, true, mstr2, maxSitesToRetain);}
+ if(list.size()>80){Tools.trimSiteList(list, .96f, retainPaired, true, mstr2, maxSitesToRetain);}
+ if(list.size()>120){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);}
+ if(list.size()>160){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);}
+ return highestScore;
+ }
+
+ if(list.size()>4){Tools.trimSiteList(list, .4f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>6){Tools.trimSiteList(list, .45f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>8){Tools.trimSiteList(list, .5f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>12){Tools.trimSiteList(list, .55f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>20){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>32){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>40){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>56){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>64){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>80){Tools.trimSiteList(list, .94f, retainPaired, true, mstr2, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>100){Tools.trimSiteList(list, .95f, retainPaired, true, mstr2, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>120){Tools.trimSiteList(list, .96f, retainPaired, true, mstr2, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>160){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>200){Tools.trimSiteList(list, .98f, retainPaired, true, mstr2, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+ if(list.size()>240){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);}
+// System.out.print(", "+list.size());
+
+
+// if(list.size()>4){Tools.trimSiteList(list, .4f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>8){Tools.trimSiteList(list, .45f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>12){Tools.trimSiteList(list, .5f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>16){Tools.trimSiteList(list, .55f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>20){Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>24){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>32){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>48){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>64){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>128){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>256){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>512){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>2048){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>4096){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>8192){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>16000){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>32000){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+// if(list.size()>32000){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+//// System.out.print(", "+list.size());
+
+
+ }else if(BBIndexAcc.USE_EXTENDED_SCORE){
+ highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+ if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+
+
+ }else{
+ // System.out.print("\n\nSize:\t"+list.size());
+
+
+ highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+ if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ }
+
+ return highestScore;
+ }
+
+ /** Re-scores every site in list, sending sites that need it to msa.fillAndScoreLimited, and updates each site's slowScore, score, limits, match string and perfect/semiperfect flags in place. */
+ public void scoreSlow(final ArrayList<SiteScore> list, final byte[] basesP, final byte[] basesM,
+ final int maxSwScore, final int maxImperfectSwScore){
+ //Initial score floor handed to msa: -CLEARZONE1e plus a fraction of maxSwScore (the pre-rescue ratio when PAIRED). It is raised at the end of each loop iteration.
+ int minMsaLimit;
+ if(PAIRED){
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore);
+ }else{
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore);
+ }
+ assert(Read.CHECKSITES(list, basesP, basesM, -1));
+
+ int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string
+ if(verbose){
+ System.err.println("Slow-scoring. maxSwScore="+maxSwScore+", maxImperfectSwScore="+maxImperfectSwScore+", minMsaLimit="+minMsaLimit+", minMatch="+minMatch);
+ }
+ for(int i=0; i<list.size(); i++){
+ final SiteScore ss=list.get(i);
+ assert(ss.lengthsAgree());
+ final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
+ //In SEMIPERFECTMODE every site is expected to span exactly the read length and already be flagged semiperfect.
+ if(SEMIPERFECTMODE){
+ assert(ss.stop-ss.start==bases.length-1);
+ assert(ss.semiperfect);
+ }
+ //A site whose span differs from the read length has its slow score reset to 0 and its perfect/semiperfect flags cleared.
+ if(verbose){System.err.println("\nSlow-scoring "+ss);}
+ if(ss.stop-ss.start!=bases.length-1){
+ assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText();
+ assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n";
+ ss.setSlowScore(0);
+ ss.semiperfect=false;
+ ss.perfect=false;
+ }
+ //NOTE(review): slowScore is presumably the result of an earlier no-indel scoring pass (see scoreNoIndels in the callers) - confirm.
+ final int swscoreNoIndel=ss.slowScore;
+ int[] swscoreArray=null;
+ //clipped and setLimits record what the traceback branch below did; they gate the score update and the consistency assertion after the msa block.
+ boolean clipped=true, setLimits=false;
+ if(swscoreNoIndel<maxImperfectSwScore && !ss.semiperfect){
+ if(verbose && ss.stop-ss.start>4000){
+ System.err.println(ss.toText());
+ System.err.println(list.size());
+ System.err.println();
+ }
+ //Cap the site's stop when GapTools.calcGrefLen(ss) reaches EXPECTED_LEN_LIMIT.
+ int expectedLen=GapTools.calcGrefLen(ss);
+ if(verbose){System.err.println("expectedLen="+expectedLen);}
+ if(expectedLen>=EXPECTED_LEN_LIMIT){
+ //TODO: Alternately, I could kill the site.
+ ss.setStop(ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT));
+ if(verbose){System.err.println("expectedLen="+expectedLen+"; ss="+ss);}
+ }
+ //minscore is the limit handed to msa; minscore2 (lowered by MSA.MIN_SCORE_ADJUST) is the threshold for generating a match string.
+ int pad=SLOW_ALIGN_PADDING;
+ final int minscore=Tools.max(swscoreNoIndel, minMsaLimit);
+ final int minscore2=Tools.max(swscoreNoIndel-MSA.MIN_SCORE_ADJUST, minMsaLimit);
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+ if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));}
+ //A result longer than 6 elements carries extra left/right padding amounts in [6] and [7]: widen the site, realign with more padding, and keep the first result if the retry is null or scores lower.
+ if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen<EXPECTED_LEN_LIMIT)){
+ int[] oldArray=swscoreArray.clone();
+ assert(swscoreArray.length==8);
+ int extraPadLeft=swscoreArray[6];
+ int extraPadRight=swscoreArray[7];
+
+ if(verbose){
+ System.err.println("msa returned "+Arrays.toString(swscoreArray)+", re-running.");
+ System.err.println("Added extra padding: "+ss.toText()+", "+Arrays.toString(oldArray));
+ }
+
+ ss.setLimits(ss.start-extraPadLeft, ss.stop+extraPadRight);
+ pad=SLOW_ALIGN_PADDING+EXTRA_PADDING;
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+
+ if(verbose){System.err.println("Result of extra padding: "+ss.toText()+", "+Arrays.toString(swscoreArray));}
+ if(swscoreArray==null || swscoreArray[0]<oldArray[0]){
+ if(verbose){
+ System.err.println("Result was inferior.");
+ }
+ swscoreArray=oldArray;
+ }
+ }
+ assert(ss.lengthsAgree());
+ if(verbose){
+ System.err.println(QUICK_MATCH_STRINGS+", "+(swscoreArray==null ? "null" : (swscoreArray.length+", "+swscoreArray[0]+" >=? "+minscore)));
+ System.err.println("start="+ss.start+", stop="+ss.stop+", len="+ss.mappedLength());
+ }
+ if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore2 && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){
+ if(verbose){System.err.println("Generating match string.");}
+ assert(swscoreArray.length==6) : swscoreArray.length;
+ assert(swscoreArray[0]>=minscore2) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch;
+ ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null);
+ if(ss.match!=null){
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ setLimits=true;
+ assert(ss.lengthsAgree());
+ clipped=ss.fixXY(bases, true, msa);
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ clipped=ss.clipTipIndels(bases, basesM, 4, 10, msa) || clipped;
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ assert(ss.lengthsAgree());
+ }
+ }else{
+ ss.match=null;
+ }
+ }
+ if(swscoreArray!=null && !setLimits){ //msa produced a result but no match string was kept: take score and limits straight from the array
+ if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));}
+ ss.setSlowScore(swscoreArray[0]);
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ assert(ss.lengthsAgree());
+ }else{
+ assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP);
+ assert(clipped || swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) :
+ setLimits+", "+clipped+", "+(swscoreArray==null)+", "+
+ swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+
+ ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow
+ }
+ assert(ss.lengthsAgree());
+ ss.setScore(ss.slowScore);
+ minMatch=Tools.max(minMatch, ss.slowScore); //Later sites must beat the best slow score so far to qualify via the minMatch test
+ minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3); //Raise the msa floor to within CLEARZONE3 of this site's score
+ assert(ss.slowScore<=maxSwScore);
+ assert(!(ss.perfect && ss.slowScore<maxSwScore));
+ ss.perfect=(ss.slowScore==maxSwScore);
+ if(ss.perfect){ss.semiperfect=true;}
+ else if(!ss.semiperfect){ss.setPerfect(bases);}
+
+ if(verbose){System.err.println(" -> "+ss);}
+ }
+
+ }
+
+ /** Maps one unpaired read: quickMap, optional list trimming, optional slow alignment, duplicate merging, low-score filtering, ambiguity detection, match-string generation and final cleanup; results are written into r. */
+ public void processRead(final Read r, final byte[] basesM){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final byte[] basesP=r.bases;
+ //basesP is the read as stored in r; basesM is supplied by the caller (presumably the reverse complement - confirm).
+// System.err.print(" rd#"+r.numericID+" ");
+// if(r.numericID==25967){
+// verbose=true;
+// msa.verbose=true;
+// GapTools.verbose=true;
+// index.verbose=true;
+// tcr.verbose=true;
+// }
+
+ if(verbose){System.err.println("\nProcessing "+r);}
+ readsUsed1++;
+
+ final int maxPossibleQuickScore=quickMap(r, basesM);
+ if(verbose){System.err.println("\nQuick Map: \t"+r.sites);}
+ //A negative return from quickMap marks the read as discarded (counted as low quality) and ends processing.
+ if(maxPossibleQuickScore<0){
+ r.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=basesP.length;
+ r.setDiscarded(true);
+ return;
+ }
+ initialSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);}
+
+ int maxSwScore=0;
+ int maxImperfectSwScore=0;
+ //The max scores are only computed when slow alignment or affine scoring is enabled; otherwise both stay 0.
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ maxSwScore=msa.maxQuality(r.length());
+ maxImperfectSwScore=msa.maxImperfectScore(r.length());
+ }
+ //Trim the candidate list when there is more than one site; the value returned by trimList is not used afterwards.
+ if(TRIM_LIST && r.numSites()>1){
+ if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);}
+ int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\nAfter trim: \t"+r.sites);}
+
+ assert(Read.CHECKSITES(r, basesM));
+
+
+ if(SLOW_ALIGN && r.numSites()>0){
+
+ int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore);
+
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+ //Count the leading perfect sites of the sorted list, stopping at the first non-perfect one.
+ int numPerfectScores=0;
+ if(numNearPerfectScores>0){
+ for(SiteScore ss : r.sites){
+ if(ss.perfect){numPerfectScores++;}
+ else{break;}
+ }
+ }
+
+ if(verbose){
+ System.err.println("\nAfter scoreNoIndels: \t"+r.sites);
+ }
+ //The tip-deletion search is skipped once there are 2+ perfect or 3+ near-perfect sites.
+ if(numPerfectScores<2 && numNearPerfectScores<3){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);}
+ }
+
+ if(verbose){
+ System.err.println("\nAfter findTipDeletions: \t"+r.sites);
+ }
+
+ //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length
+ //***Above note should be resolved now, but needs to be verified.
+ //scoreSlow only runs when scoreNoIndels reported no near-perfect sites.
+ if(numNearPerfectScores<1){
+ scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore);
+ }
+ //With STRICT_MAX_INDEL, removeLongIndels is applied with index.MAX_INDEL (presumably dropping sites with longer indels - confirm); 'removed' is not used.
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+
+ if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);}
+ assert(Read.CHECKSITES(r, basesM, false));
+ }
+
+ //Merge duplicate sites and re-sort; a failure is logged and rethrown as a RuntimeException carrying the read text.
+ if(r.numSites()>0){
+ mapped1++;
+ try {
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ throw new RuntimeException("\n\n"+r.toText(false)+"\n\n");
+ }
+ Collections.sort(r.sites);
+ }
+
+ if(r.numSites()>1){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r.sites.get(1);
+ //Ensure no duplicates
+ assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false);
+ }
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore);
+ }
+ //Drop all sites if the top site is below the minimum score ratio; otherwise call removeLowQualitySitesUnpaired with a cutoff no higher than lim.
+ if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){
+ int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.topSite().score<lim){r.sites=null;}
+ else{Tools.removeLowQualitySitesUnpaired(r.sites, Tools.min(lim, Tools.max(1, lim-CLEARZONE3)));}
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);}
+
+ if(r.numSites()>1){
+ //Choose the clearzone passed to countTopScores: CLEARZONEP for perfect reads, otherwise interpolated between CLEARZONE1, CLEARZONE1b and CLEARZONE1c according to where the top score falls relative to the two cutoffs.
+ final int clearzone;
+ final int score=r.topSite().score;
+ if(r.perfect()){clearzone=CLEARZONEP;}
+ else{
+ assert(score<maxSwScore);
+ final float cz1blimit=(maxSwScore*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT);
+ final float cz1climit=(maxSwScore*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT);
+ if(score>cz1blimit){
+// clearzone=CLEARZONE1;
+ clearzone=(int)(((maxSwScore-score)*CLEARZONE1b+(score-cz1blimit)*CLEARZONE1)/(maxSwScore-cz1blimit));
+ }else if(score>cz1climit){
+// clearzone=CLEARZONE1b;
+ clearzone=(int)(((cz1blimit-score)*CLEARZONE1c+(score-cz1climit)*CLEARZONE1b)/(cz1blimit-cz1climit));
+ }else{
+ clearzone=CLEARZONE1c;
+ }
+// assert(false) : x+", "+cz1blimit+", "+cz1climit+", "+CLEARZONE1b_CUTOFF_FLAT+", "+clearzone;
+ }
+
+ //More than one top-scoring site under this clearzone means the alignment is treated as ambiguous; otherwise a second check with CLEARZONE1e is made when the list is long.
+// final int clearzone=r.perfect() ? CLEARZONEP :
+// r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+// (r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? (CLEARZONE1b_CUTOFF-)CLEARZONE1b : CLEARZONE1c);
+ int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); //Never gets executed anymore, so always returns true
+ r.setAmbiguous(b);
+ }else{
+ final int lim=(r.perfect() ? 3*CLEARZONE_LIMIT1e : score+CLEARZONE1e>=maxSwScore ? 2*CLEARZONE_LIMIT1e : CLEARZONE_LIMIT1e)+1;
+ if(r.sites.size()>lim && clearzone<CLEARZONE1e){
+ numBestSites1=Tools.countTopScores(r.sites, CLEARZONE1e);
+ if(numBestSites1>lim){
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r.setAmbiguous(b);
+ }
+ }
+ }
+ }
+
+ if(verbose){System.err.println("A: "+r);}
+ //Same low-score filter as the one applied before the ambiguity check, repeated here.
+ if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){
+ int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.topSite().score<lim){r.sites=null;}
+ else{Tools.removeLowQualitySitesUnpaired(r.sites, Tools.min(lim, Tools.max(1, lim-CLEARZONE3)));}
+ }
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(verbose){System.err.println("B: "+r);}
+
+ //Unimportant anomaly due to ambiguous reads that later have low quality sites removed and become unmapped.
+// assert(!r.mapped() || new SamLine(r, 0).toRead(true).ambiguous()==r.ambiguous()) : "\n"+r+"\n\n"+new SamLine(r, 0)+"\n\n"+new SamLine(r, 0).toRead(true)+"\n\n"+
+// "ambi="+ambi+", r.ambiguous()="+r.ambiguous()+", new SamLine(r, 0).toRead(true).ambiguous()="+new SamLine(r, 0).toRead(true).ambiguous()+"\n\n"+
+// "r.mapped="+r.mapped()+", sl.mapped()="+new SamLine(r, 0).mapped()+", sl.toRead(true).mapped()="+new SamLine(r, 0).toRead(true).mapped();
+// assert(r.ambiguous()==ambi) : r;
+
+ assert(r.gaps==null || r.gaps[0]==r.start && r.gaps[r.gaps.length-1]==r.stop);
+ assert(r.sites==null || r.mapScore>0) : r.sites+", "+r.mapScore+"\n"+r;
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("C: "+r);}
+ //Match-string generation: reuse the top site's existing match when allowed, otherwise generate one with genMatchString.
+ //***$
+ if(MAKE_MATCH_STRING && r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ if(r.sites.size()>1){
+ assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n";
+ }
+ int mapScore=r.mapScore;
+ //NOTE(review): mapScore above is only referenced by the commented-out check inside the loop below.
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+
+ if(verbose){System.err.println("D: "+r);}
+ //Generate the match string for the top site; if afterwards its score is below the next site's, re-sort and repeat with the new top site.
+ {
+ boolean firstIter=true;
+ do{//
+ if(!firstIter){
+ Collections.sort(r.sites);
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ }
+ genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true);
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+// TODO: Fix this; it should never happen.
+// if(mapScore>r.mapScore){
+// System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID);
+// }
+ if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){
+ SiteScore ss=r.topSite();
+ ss.score=r.mapScore=Tools.min(ss.score, -9999);
+ ss.setSlowPairedScore(ss.score, ss.score);
+ }
+ r.topSite().score=r.topSite().slowScore;
+ firstIter=false;
+ }while(r.sites.size()>1 && r.topSite().score<r.sites.get(1).score);
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("E: "+r);}
+ }
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ removeDuplicateBestSites(r);
+ }
+ if(r.numSites()>0){r.topSite().match=r.match;}
+
+
+ //A read that still has sites but no positive map score is reported once (unless STRICT_MAX_INDEL or an anomaly was already flagged) and marked unmapped.
+ if(r.sites!=null && r.mapScore<=0){//This came from BBMapThreadPacBio; not sure if needed for other modes
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){
+ System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped.\t"+(r.match==null)+"\t"+r.mapScore+"\t"+r.topSite()+"\t"+new String(r.bases));
+ if(MSA.bandwidth>0 || MSA.bandwidthRatio>0 || MSA.flatMode){Shared.anomaly=true;}
+ }
+ r.mapScore=0;
+ r.setMapped(false);
+ r.sites=null;
+ }
+
+
+ //NOTE(review): the block above nulls r.sites whenever mapScore<=0, so the else-if branch below (mapScore<=0 && sites!=null) appears unreachable - confirm.
+ //This block is to prevent an assertion from firing. Generally caused by alignment being lost during match generation.
+ //TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(BANDWIDTH<1){
+ if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ }
+ r.clearMapping();
+ }
+ assert(r.sites==null || r.mapScore>0) :
+ "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+
+ "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+
+ "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+
+ "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n";
+
+// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString();
+ //Apply CLEARZONE3 scaled by maxSwScore/mapScore (factor capped at 1.1); if that changes the read and its score is below the minimum, flag it ambiguous.
+ if((CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP) && r.sites!=null && !r.ambiguous()){
+
+ assert(r.mapScore>0);
+ float cz3v2=(CLEARZONE3*Tools.min(1.1f, (maxSwScore/(float)r.mapScore)));
+
+// boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3);
+ boolean changed=applyClearzone3(r, (int)cz3v2, 1/cz3v2);
+ if(changed){
+ int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.mapScore<minScore){
+ assert(!r.ambiguous());
+ r.setAmbiguous(true);
+ }
+ }
+ }
+
+// if(CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP){
+// boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3);
+// if(changed){
+// int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+// if(r.mapScore<minScore){
+// assert(!r.ambiguous());
+// r.setAmbiguous(true);
+// }
+// }
+// }
+ //Ambiguous reads are unmapped entirely when AMBIGUOUS_TOSS is set.
+ if(r.ambiguous() && AMBIGUOUS_TOSS){r.sites=null; r.clearSite(); r.setMapped(false);}
+
+ if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM, maxImperfectSwScore, maxSwScore);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ msa.toLocalAlignment(r, r.topSite(), basesM, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+ //Final check: clear the mapping if no sites remain or a non-ambiguous read scores below the minimum ratio.
+ if(r.numSites()==0 || (!r.ambiguous() && r.mapScore<maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)){
+ r.clearMapping();
+ }
+ postFilterRead(r, basesM, maxImperfectSwScore, maxSwScore);
+ if(MAKE_MATCH_STRING){ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore);}
+ //Optional tip-score penalty, then optional statistics.
+ if(PENALIZE_AMBIG){
+ int penalty=calcTipScorePenalty(r, maxSwScore, 7);
+ applyScorePenalty(r, penalty);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore, maxPossibleQuickScore);
+ }
+ }
+
+
+ /** Pairs sites of r with sites of r2 on the same chromosome within the distance limits, raising pairedScore (and then score) of paired sites, and optionally trims both lists. Returns number of perfect pairs (a lower bound); returns 0 if either read has no sites. */
+ public int pairSiteScoresInitial(Read r, Read r2, boolean trim){
+
+ if(r.numSites()<1 || r2.numSites()<1){return 0;}
+ //Sort both lists with SiteScore.PCOMP so they can be swept together below (presumably ordered by chromosome then position - confirm).
+ SiteScore.PCOMP.sort(r.sites);
+ SiteScore.PCOMP.sort(r2.sites);
+ //Clear any previous paired scores.
+ for(SiteScore ss : r.sites){ss.setPairedScore(0);}
+ for(SiteScore ss : r2.sites){ss.setPairedScore(0);}
+
+// ArrayList<SiteScorePair> pairs=new ArrayList<SiteScorePair>(Tools.min(8, Tools.min(r.list.size(), r2.list.size())));
+
+ int maxPairedScore1=-1;
+ int maxPairedScore2=-1;
+
+
+// for(SiteScore ss : r.list){
+// System.out.println(ss.toText());
+// }
+
+// int i=0, j=0;
+ final int ilimit=r.sites.size()-1;
+ final int jlimit=r2.sites.size()-1;
+ final int maxReadLen=Tools.max(r.length(), r2.length());
+
+// final int outerDistLimit=MIN_PAIR_DIST+r.length()+r2.length();
+ final int outerDistLimit=(Tools.max(r.length(), r2.length())*(OUTER_DIST_MULT))/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0);
+ final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? TIP_DELETION_SEARCH_RANGE : 0);
+ final int expectedFragLength=AVERAGE_PAIR_DIST+r.length()+r2.length();
+
+ int numPerfectPairs=0;
+ //For each site of r: advance j past r2 sites on an earlier chromosome or too far to the left, then scan forward with k until r2 sites are on a later chromosome or too far to the right.
+ for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){
+ SiteScore ss1=r.sites.get(i);
+ SiteScore ss2=r2.sites.get(j);
+
+ while(j<jlimit && (ss2.chrom<ss1.chrom || (ss2.chrom==ss1.chrom && ss1.start-ss2.stop>innerDistLimit))){
+ j++;
+ ss2=r2.sites.get(j);
+ }
+
+ for(int k=j; k<=jlimit; k++){
+ ss2=r2.sites.get(k);
+
+ if(ss2.chrom>ss1.chrom){break;}
+ if(ss2.start-ss1.stop>innerDistLimit){break;}
+
+// int dist=0;
+//
+// if(ss1.start<=ss2.start){
+// dist=ss2.start-ss1.stop;
+// }else if(ss1.start>ss2.start){
+// dist=ss1.start-ss2.stop;
+// }
+
+
+// int innerdist=0;
+// int outerdist=0;
+//
+// if(ss1.start<=ss2.start){
+// innerdist=ss2.start-ss1.stop;
+// outerdist=ss2.stop-ss1.start;
+// }else if(ss1.start>ss2.start){
+// innerdist=ss1.start-ss2.stop;
+// outerdist=ss1.stop-ss2.start;
+// }
+ //innerdist is the gap between the two sites and outerdist their full span. Which site is treated as first depends on ss1's strand when strands differ and REQUIRE_CORRECT_STRANDS_PAIRS is set, otherwise on start position.
+ final int innerdist, outerdist;
+ //assert(!SAME_STRAND_PAIRS) : "TODO";
+
+ if(REQUIRE_CORRECT_STRANDS_PAIRS){
+ if(ss1.strand!=ss2.strand){
+ if(ss1.strand==Gene.PLUS){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+
+ assert(outerdist>=innerdist);
+ //A candidate pair must span at least outerDistLimit and have a gap of at most innerDistLimit.
+ if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){
+
+ boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS);
+
+ if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){
+
+ boolean paired1=false, paired2=false;
+
+ int deviation=absdif(AVERAGE_PAIR_DIST, innerdist);
+ //Correctly-oriented pairs gain about half the mate's score, reduced in proportion to deviation from AVERAGE_PAIR_DIST (bonus at least 1, plus 1); other pairs gain 1/16 of the mate's score.
+ final int pairedScore1;
+ final int pairedScore2;
+ if(strandOK){
+// pairedScore1=ss1.score+ss2.score/2;
+// pairedScore2=ss2.score+ss1.score/2;
+
+ pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-(((deviation)*ss2.score)/(32*expectedFragLength+100)));
+ pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-(((deviation)*ss1.score)/(32*expectedFragLength+100)));
+ }else{//e.g. a junction
+ pairedScore1=ss1.score+Tools.max(0, ss2.score/16);
+ pairedScore2=ss2.score+Tools.max(0, ss1.score/16);
+ }
+ //NOTE(review): maxPairedScore1/2 below track the largest ss.score among newly paired sites, not the pairedScore itself - confirm this is intended.
+ if(pairedScore1>ss1.pairedScore){
+ paired1=true;
+ ss1.setPairedScore(Tools.max(ss1.pairedScore, pairedScore1));
+ maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+ // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText());
+ }else{
+ // System.out.println(ss1.toText()+" already paired.");
+ }
+ if(pairedScore2>ss2.pairedScore){
+ paired2=true;
+ ss2.setPairedScore(Tools.max(ss2.pairedScore, pairedScore2));
+ maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+
+ if(paired1 && paired2 && outerdist>=maxReadLen && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){
+ numPerfectPairs++; //Lower bound. Some perfect pairs may be the same.
+ }
+
+// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1);
+// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2);
+// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+ }
+ }
+
+ }
+
+
+ //Promote paired scores into score; a site whose pairedScore does not exceed its score is asserted to be unpaired (pairedScore==0).
+ for(SiteScore ss : r.sites){
+ if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+ for(SiteScore ss : r2.sites){
+ if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+ //With perfect pairs present, trim both lists using a cutoff of 94% of maxPairedScore; otherwise trim only lists longer than 4, at 90% (the third argument to trimSitesBelowCutoff also differs between the two cases).
+ if(trim){
+ if(numPerfectPairs>0){
+// System.out.print(".");
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }else{
+ if(r.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ if(r2.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ }
+ }
+
+// if(pairs.isEmpty()){return null;}
+//
+// ArrayList<SiteScore> temp=new ArrayList<SiteScore>(Tools.max(r.list.size(), r2.list.size()));
+//
+// for(SiteScore ss : r.list){
+// if(ss.score>maxPairedScore1){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.a);
+// }
+// r.list.clear();
+// r.list.addAll(temp);
+//
+// for(SiteScore ss : r2.list){
+// if(ss.score>maxPairedScore2){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.b);
+// }
+// r2.list.clear();
+// r2.list.addAll(temp);
+//
+// return pairs;
+
+ return numPerfectPairs;
+ }
+
+
+ public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final Read r2=r.mate;
+ assert(r2!=null);
+ final byte[] basesP1=r.bases, basesP2=r2.bases;
+ final int len1=(basesP1==null ? 0 : basesP1.length), len2=(basesP2==null ? 0 : basesP2.length);
+
+ readsUsed1++;
+ readsUsed2++;
+
+ final int maxPossibleQuickScore1=quickMap(r, basesM1);
+ final int maxPossibleQuickScore2=quickMap(r2, basesM2);
+
+ if(verbose){
+ System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate);
+ }
+
+ if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){
+ r.sites=null;
+ r2.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=len1;
+ r.setDiscarded(true);
+ lowQualityReadsDiscarded2++;
+ lowQualityBasesDiscarded2+=len2;
+ r2.setDiscarded(true);
+ return;
+ }
+
+ //Not really needed due to subsumption
+// Tools.mergeDuplicateSites(r.list);
+// Tools.mergeDuplicateSites(r2.list);
+
+ initialSiteSum1+=r.numSites();
+ initialSiteSum2+=r2.numSites();
+
+ //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used.
+ //Discards need to be tracked separately for each end.
+// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;}
+
+ final int maxSwScore1=msa.maxQuality(len1);
+ final int maxImperfectSwScore1=msa.maxImperfectScore(len1);
+ final int maxSwScore2=msa.maxQuality(len2);
+ final int maxImperfectSwScore2=msa.maxImperfectScore(len2);
+
+ pairSiteScoresInitial(r, r2, TRIM_LIST);
+ if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(TRIM_LIST){
+
+ if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){
+ if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);}
+ if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);}
+ }
+
+ trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ postTrimSiteSum2+=r2.numSites();
+
+ {//Reset score to non-paired score
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ }
+
+ if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);}
+
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN){
+
+ if(r.numSites()>0){
+
+ int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores1<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);}
+ }
+
+ //TODO:
+ //Note scoreSlow can be skipped under this circumstance:
+ //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites.
+ scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ if(r2.numSites()>0){
+ int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores2<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);}
+ }
+
+ scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r2.sites, index.MAX_INDEL);
+ if(r2.numSites()<1){r2.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+
+ if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+
+ if(DO_RESCUE){
+ int unpaired1=0;
+ int unpaired2=0;
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired1++;}
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r2.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired2++;}
+ }
+ }
+
+ if(unpaired1>0 && r.numSites()>0){
+ Collections.sort(r.sites);
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+ if(unpaired2>0 && r2.numSites()>0){
+ Collections.sort(r2.sites);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ postRescueSiteSum1+=r.numSites();
+ postRescueSiteSum2+=r2.numSites();
+
+// if(r.list!=null){Collections.sort(r.list);}
+// if(r2.list!=null){Collections.sort(r2.list);}
+//
+// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+
+ if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+ }else{
+ Tools.mergeDuplicateSites(r.sites, true, false);
+ Tools.mergeDuplicateSites(r2.sites, true, false);
+ if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+
+ if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!)
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+ if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(r.numSites()>0){
+ mapped1++;
+ Collections.sort(r.sites);
+ }
+ if(r2.numSites()>0){
+ mapped2++;
+ Collections.sort(r2.sites);
+ }
+ assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ r.setPerfectFlag(maxSwScore1);
+ r2.setPerfectFlag(maxSwScore2);
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+ }
+
+
+ if(r.numSites()>1){
+ final int clearzone=r.perfect() ? CLEARZONEP :
+ r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+
+ if(r2.numSites()>1){
+ final int clearzone=r2.perfect() ? CLEARZONEP :
+ r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 :
+ (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites2=Tools.countTopScores(r2.sites, clearzone);
+ if(numBestSites2>1){
+ //Ambiguous alignment
+ assert(r2.sites.size()>1);
+
+ boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY);
+ r2.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(r.numSites()>0 && r2.numSites()>0){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r2.topSite();
+ if(canPair(ss1, ss2, len1, len2, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n"+
+ r.mapped()+", "+r.paired()+", "+r.strand()+", "+r.ambiguous()+"\n\n"+r2.mapped()+", "+r2.paired()+", "+r2.strand()+", "+r2.ambiguous()+"\n\n";
+ assert(SLOW_ALIGN ? ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n";
+ r.setPaired(true);
+ r.mate.setPaired(true);
+ }
+ }
+
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;}
+
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ if(KILL_BAD_PAIRS){
+ if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ int x=r.mapScore/len1;
+ int y=r2.mapScore/len2;
+ if(x>=y){
+ r2.clearAnswers(false);
+ }else{
+ r.clearAnswers(false);
+ }
+ }
+ }
+ if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ if(MAKE_MATCH_STRING){
+ if(r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false);
+
+ if(STRICT_MAX_INDEL && r.mapped()){
+ if(hasLongIndel(r.match, index.MAX_INDEL)){
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){
+ r2.match=r2.topSite().match;
+ }else{
+ genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false);
+
+ if(STRICT_MAX_INDEL && r2.mapped()){
+ if(hasLongIndel(r2.match, index.MAX_INDEL)){
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ }
+
+ assert(checkTopSite(r)); // TODO remove this
+ if(verbose){
+ System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2);
+ if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);}
+ if(r2.match!=null && r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);}
+ }
+
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r2.mapScore>0 && r2.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }else if(r2.mapScore<=0 && r2.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+
+ assert(r.sites==null || r.mapScore>0) :
+ r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+ assert(r2.sites==null || r2.mapScore>0) :
+ r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+
+ assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails.";
+ assert(checkTopSite(r)); // TODO remove this
+ removeDuplicateBestSites(r);
+ removeDuplicateBestSites(r2);
+
+ if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){
+ AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ if(r.ambiguous() && AMBIGUOUS_TOSS){
+ if(r.sites!=null){r.sites=null;}
+ r.clearSite();
+ r.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.ambiguous() && AMBIGUOUS_TOSS){
+ if(r2.sites!=null){r2.sites=null;}
+ r2.clearSite();
+ r2.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r2.mapped() && r2.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ assert(Read.CHECKSITES(r2, basesM2));
+ }
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ final SiteScore ss=r.topSite();
+ ss.match=r.match;
+ msa.toLocalAlignment(r, ss, basesM1, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// System.err.println("\n\n*********\n\n"+r+"\n\n*********\n\n");
+// assert(Read.CHECKSITES(r, basesM1)); //TODO: This can fail; see bug#0001
+ }
+
+ assert(checkTopSite(r2));
+ if(r2.mapped() && (LOCAL_ALIGN || r2.containsXYC())){
+ final SiteScore ss=r2.topSite();
+ ss.match=r2.match;
+ msa.toLocalAlignment(r2, ss, basesM2, r2.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// assert(Read.CHECKSITES(r2, basesM2)); //TODO: This can fail; see bug#0001
+ }
+
+ postFilterRead(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ postFilterRead(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ if(MAKE_MATCH_STRING){
+ ensureMatchStringOnPrimary(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ ensureMatchStringOnPrimary(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1);
+ calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2);
+ }
+ }
+
+}
diff --git a/current/align2/BBMapThreadPacBio.java b/current/align2/BBMapThreadPacBio.java
new file mode 100755
index 0000000..0190937
--- /dev/null
+++ b/current/align2/BBMapThreadPacBio.java
@@ -0,0 +1,1301 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import jgi.CoveragePileup;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * Based on MapTestThread11f
+ *
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public final class BBMapThreadPacBio extends AbstractMapThread{
+
+ /** Mirrors BBIndexPacBio.ALIGN_COLUMNS; exposed to the superclass through ALIGN_COLUMNS(). */
+ static final int ALIGN_COLUMNS=BBIndexPacBio.ALIGN_COLUMNS;
+ /** Exposed through ALIGN_ROWS(); maxReadLength() reports ALIGN_ROWS-1. */
+ static final int ALIGN_ROWS=6020;
+
+
+
+ /** Don't trim for local alignments unless at least this many bases will be clipped */
+ private final int LOCAL_ALIGN_TIP_LENGTH=1;
+ /** Range is 0-1; a lower number makes trimming more aggressive */
+ private final float LOCAL_ALIGN_MATCH_POINT_RATIO=0.75f;
+
+ /** Ratio of the points for a match of a single base needed to declare unambiguous */
+ //Each CLEARZONE_RATIO* below is multiplied by POINTS_MATCH2 in the constructor to produce the matching CLEARZONE* value.
+ public final float CLEARZONE_RATIOP=1.5f;
+ public final float CLEARZONE_RATIO1=2.2f;
+ public final float CLEARZONE_RATIO1b=2.8f;
+ public final float CLEARZONE_RATIO1c=4.8f;
+ public final float CLEARZONE_RATIO3=8f;
+ /** Max allowed number of sites within 1 edit (excluding primary site) */
+ public final int CLEARZONE_LIMIT1e=4;
+ //public final int CLEARZONE1e;
+ //The CLEARZONE* values are assigned once in the constructor; all are 0 when neither SLOW_ALIGN nor MAKE_MATCH_STRING is set.
+ public final int CLEARZONEP;
+ public final int CLEARZONE1;
+ public final int CLEARZONE1b;
+ public final int CLEARZONE1c;
+ //public final int CLEARZONE1e;
+ /** Also 0 when PENALIZE_AMBIG is false; scoreSlow subtracts it when deriving minMatch and when raising minMsaLimit. */
+ public final int CLEARZONE3;
+ /** 1f/CLEARZONE3, or 0 when CLEARZONE3 is 0. */
+ public final float INV_CLEARZONE3;
+ //NOTE(review): the two cutoffs below are not referenced in the visible part of this class - presumably score fractions selecting between CLEARZONE1/1b/1c; confirm at the use site.
+ public final float CLEARZONE1b_CUTOFF=0.92f;
+ public final float CLEARZONE1c_CUTOFF=0.82f;
+
+ /** Created as the last step of the constructor and returned by index(). */
+ public final BBIndexPacBio index;
+
+
+ /** Passed to trimList as minSitesToRetain when processRead trims a single read's site list. */
+ private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3;
+ //NOTE(review): presumably the paired-read counterpart of the constant above; its use is outside the visible code.
+ private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2;
+
+ /**
+  * Prints a warning to stderr stating that EXPECTED_SITES is not valid for this class.
+  * Nothing is stored.
+  * @param x Ignored.
+  */
+ public static void setExpectedSites(int x){
+     //An anonymous subclass created here reports this class as its enclosing class, which yields the name from a static context.
+     final Class<?> owner=new Object() { }.getClass().getEnclosingClass();
+     final String warning="Warning: EXPECTED_SITES is not valid for "+owner.getName();
+     System.err.println(warning);
+ }
+
+ @Override
+ public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;}
+ @Override
+ public final int ALIGN_ROWS(){return ALIGN_ROWS;}
+ @Override
+ public final int maxReadLength(){return ALIGN_ROWS-1;}
+ @Override
+ final AbstractIndex index(){return index;}
+ @Override
+ final int CLEARZONE1(){return CLEARZONE1;}
+
+ public BBMapThreadPacBio(ConcurrentReadInputStream cris_, int keylen_,
+ CoveragePileup pileup_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_,
+ int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_,
+ boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_,
+ float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_,
+ ConcurrentReadOutputStream outStream_, ConcurrentReadOutputStream outStreamMapped_, ConcurrentReadOutputStream outStreamUnmapped_, ConcurrentReadOutputStream outStreamBlack_,
+ int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_,
+ int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_,
+ boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_,
+ boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_,
+ boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, float IDFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, int TRIM_MIN_LEN_,
+ boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){
+
+ super(cris_,
+ outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_,
+ pileup_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_,
+ AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, TRIM_MIN_LEN_, THRESH_,
+ minChrom_, maxChrom_, KFILTER_, IDFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_,
+ REQUIRE_CORRECT_STRANDS_PAIRS_,
+ SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_,
+ MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_,
+ MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_,
+ QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_,
+ keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_,
+ BBIndexPacBio.MIN_APPROX_HITS_TO_KEEP, BBIndexPacBio.USE_EXTENDED_SCORE,
+ BBIndexPacBio.BASE_HIT_SCORE, BBIndexPacBio.USE_AFFINE_SCORE, BBIndexPacBio.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_);
+
+ assert(SLOW_ALIGN_PADDING>=0);
+ assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO";
+
+ if(SLOW_ALIGN || MAKE_MATCH_STRING){
+// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, MSA_TYPE);
+// POINTS_MATCH=msa.POINTS_MATCH();
+// POINTS_MATCH2=msa.POINTS_MATCH2();
+ CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2);
+ CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2);
+ CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2);
+ CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2);
+ CLEARZONE3=PENALIZE_AMBIG ? (int)(CLEARZONE_RATIO3*POINTS_MATCH2) : 0;
+// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1;
+ }else{
+// POINTS_MATCH=70;
+// POINTS_MATCH2=100;
+// msa=null;
+ CLEARZONE1=0;
+ CLEARZONE1b=0;
+ CLEARZONE1c=0;
+ CLEARZONEP=0;
+ CLEARZONE3=0;
+// CLEARZONE1e=0;
+ }
+ INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3);
+
+ index=new BBIndexPacBio(KEYLEN, minChrom, maxChrom, KFILTER, msa);
+ }
+
+
+ /** List-size gates and matching Tools.trimSiteList arguments applied, in order, after the initial trim when USE_AFFINE_SCORE is set. */
+ private static final int[] AFFINE_SIZE_GATES={4, 8, 12, 16, 20, 24, 32, 40, 48};
+ private static final float[] AFFINE_TRIM_ARGS={.65f, .7f, .75f, .8f, .85f, .9f, .95f, .97f, .99f};
+ /** Index of the first affine step that uses the raised minimum-sites value instead of minSitesToRetain. */
+ private static final int AFFINE_RAISED_MIN_FROM=7;
+
+ /** Gates and arguments used when USE_EXTENDED_SCORE is set (and USE_AFFINE_SCORE is not). */
+ private static final int[] EXTENDED_SIZE_GATES={8, 16, 24, 36, 40, 48, 56, 64, 80};
+ private static final float[] EXTENDED_TRIM_ARGS={.8f, .85f, .90f, .92f, .94f, .96f, .97f, .98f, .99f};
+
+ /** Gates and arguments used when neither scoring flag is set. */
+ private static final int[] BASIC_SIZE_GATES={12, 16, 24, 28, 32, 48};
+ private static final float[] BASIC_TRIM_ARGS={.65f, .7f, .74f, .8f, .85f, .90f};
+
+ /**
+  * Trims a site list with repeated calls to Tools.trimSiteList: one unconditional call, then
+  * further calls with increasing float arguments, each made only while the list is still
+  * longer than that step's size gate.
+  * @param list Sites to trim in place.
+  * @param retainPaired Forwarded to Tools.trimSiteList.
+  * @param maxScore Compared with the top score to detect the special-cased perfect hit (affine mode only).
+  * @param specialCasePerfect Enables the short trimming path taken when the top score equals maxScore (affine mode only).
+  * @param minSitesToRetain Forwarded to Tools.trimSiteList; the last two affine steps use 1 if this is at most 1, else minSitesToRetain+1.
+  * @param maxSitesToRetain Forwarded to Tools.trimSiteList.
+  * @return -99999 for a null or empty list, the only site's score for a one-element list,
+  * otherwise the value returned by the first Tools.trimSiteList call.
+  */
+ public int trimList(ArrayList<SiteScore> list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){
+     if(list==null || list.size()==0){return -99999;}
+     if(list.size()==1){return list.get(0).score;}
+
+     final int highestScore;
+     final int[] sizeGates;
+     final float[] trimArgs;
+     final int raisedMinFrom;
+
+     if(USE_AFFINE_SCORE){
+         highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+         if(highestScore==maxScore && specialCasePerfect){
+             //Top score equals maxScore: trim hard and skip the graduated steps.
+             Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+             if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+             return highestScore;
+         }
+         sizeGates=AFFINE_SIZE_GATES;
+         trimArgs=AFFINE_TRIM_ARGS;
+         raisedMinFrom=AFFINE_RAISED_MIN_FROM;
+     }else if(USE_EXTENDED_SCORE){
+         highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+         sizeGates=EXTENDED_SIZE_GATES;
+         trimArgs=EXTENDED_TRIM_ARGS;
+         raisedMinFrom=sizeGates.length;
+     }else{
+         highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+         sizeGates=BASIC_SIZE_GATES;
+         trimArgs=BASIC_TRIM_ARGS;
+         raisedMinFrom=sizeGates.length;
+     }
+
+     final int raisedMin=(minSitesToRetain<=1 ? 1 : minSitesToRetain+1);
+
+     //The list size is re-read at every step because each trim may shrink the list.
+     for(int step=0; step<sizeGates.length; step++){
+         if(list.size()>sizeGates[step]){
+             final int minSites=(step>=raisedMinFrom ? raisedMin : minSitesToRetain);
+             Tools.trimSiteList(list, trimArgs[step], retainPaired, true, minSites, maxSitesToRetain);
+         }
+     }
+
+     return highestScore;
+ }
+
+
+ /**
+  * Re-scores every site in the list, in list order, updating each site in place:
+  * slow score, score, limits, perfect/semiperfect flags and (when enabled) its match string.
+  * A site is sent to msa.fillAndScoreLimited only if its starting slow score is below
+  * maxImperfectSwScore and it is not flagged semiperfect.
+  * @param list Sites to score; modified in place.
+  * @param basesP Read bases used for sites on Gene.PLUS.
+  * @param basesM Read bases used for all other sites (presumably the reverse complement - confirm with callers).
+  * @param maxSwScore Asserted upper bound on a slow score; a site scoring exactly this is flagged perfect.
+  * @param maxImperfectSwScore Sites starting at or above this score skip realignment.
+  */
+ public void scoreSlow(final ArrayList<SiteScore> list, final byte[] basesP, final byte[] basesM,
+ final int maxSwScore, final int maxImperfectSwScore){
+
+ //Lower score bound handed to the aligner; the ratio used depends on whether reads are paired.
+ int minMsaLimit;
+ if(PAIRED){
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore);
+ }else{
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore);
+ }
+ assert(Read.CHECKSITES(list, basesP, basesM, -1));
+
+ int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string
+ if(verbose){
+ System.err.println("Slow-scoring. maxSwScore="+maxSwScore+", maxImperfectSwScore="+maxImperfectSwScore+", minMsaLimit="+minMsaLimit+", minMatch="+minMatch);
+ }
+ for(int i=0; i<list.size(); i++){
+ final SiteScore ss=list.get(i);
+ assert(ss.lengthsAgree());
+ final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
+
+ if(SEMIPERFECTMODE){
+ assert(ss.stop-ss.start==bases.length-1);
+ assert(ss.semiperfect);
+ }
+
+ if(verbose){System.err.println("\nSlow-scoring "+ss);}
+ //The site's span differs from the read length (asserted to be longer): discard its stored slow score and perfect flags.
+ if(ss.stop-ss.start!=bases.length-1){
+ assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText();
+ assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n";
+ ss.setSlowScore(0);
+ ss.semiperfect=false;
+ ss.perfect=false;
+ }
+
+ //Starting score for this site; it is 0 if the block above reset it.
+ final int swscoreNoIndel=ss.slowScore;
+ int[] swscoreArray=null;
+
+ boolean clipped=true, setLimits=false;
+ if(swscoreNoIndel<maxImperfectSwScore && !ss.semiperfect){
+ if(verbose && ss.stop-ss.start>4000){
+ System.err.println(ss.toText());
+ System.err.println(list.size());
+ System.err.println();
+ }
+
+ int expectedLen=GapTools.calcGrefLen(ss);
+ if(verbose){System.err.println("expectedLen="+expectedLen);}
+ if(expectedLen>=EXPECTED_LEN_LIMIT){
+ //TODO: Alternately, I could kill the site.
+ //Shorten the site so that its stop is at most EXPECTED_LEN_LIMIT past its start.
+ ss.setStop(ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT));
+ if(verbose){System.err.println("expectedLen="+expectedLen+"; ss="+ss);}
+ }
+
+ int pad=SLOW_ALIGN_PADDING;
+ //minscore is the limit given to the aligner; minscore2 (relaxed by MSA.MIN_SCORE_ADJUST) gates match-string generation below.
+ final int minscore=Tools.max(swscoreNoIndel, minMsaLimit);
+ final int minscore2=Tools.max(swscoreNoIndel-MSA.MIN_SCORE_ADJUST, minMsaLimit);
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+ if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));}
+
+ //A result with more than 6 entries (asserted to be 8) carries extra left/right padding amounts in slots 6 and 7:
+ //widen the site by those amounts and align again with EXTRA_PADDING added.
+ if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen<EXPECTED_LEN_LIMIT)){
+ int[] oldArray=swscoreArray.clone();
+ assert(swscoreArray.length==8);
+ int extraPadLeft=swscoreArray[6];
+ int extraPadRight=swscoreArray[7];
+
+ if(verbose){
+ System.err.println("msa returned "+Arrays.toString(swscoreArray)+", re-running.");
+ System.err.println("Added extra padding: "+ss.toText()+", "+Arrays.toString(oldArray));
+ }
+
+ ss.setLimits(ss.start-extraPadLeft, ss.stop+extraPadRight);
+ pad=SLOW_ALIGN_PADDING+EXTRA_PADDING;
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+
+ if(verbose){System.err.println("Result of extra padding: "+ss.toText()+", "+Arrays.toString(swscoreArray));}
+ //Keep the first result when the re-run returned nothing or a lower score.
+ if(swscoreArray==null || swscoreArray[0]<oldArray[0]){
+ if(verbose){
+ System.err.println("Result was inferior.");
+ }
+ swscoreArray=oldArray;
+ }
+ }
+ assert(ss.lengthsAgree());
+ if(verbose){
+ System.err.println(QUICK_MATCH_STRINGS+", "+(swscoreArray==null ? "null" : (swscoreArray.length+", "+swscoreArray[0]+" >=? "+minscore)));
+ System.err.println("start="+ss.start+", stop="+ss.stop+", len="+ss.mappedLength());
+ }
+ //Generate a match string now only for 6-entry results that reach minscore2, and only when secondary alignments
+ //are printed or the site's match may be reused for the primary alignment and the score beats minMatch.
+ if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore2 && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){
+ if(verbose){System.err.println("Generating match string.");}
+ assert(swscoreArray.length==6) : swscoreArray.length;
+ assert(swscoreArray[0]>=minscore2) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch;
+ ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null);
+ if(ss.match!=null){
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ setLimits=true;
+ assert(ss.lengthsAgree());
+ clipped=ss.fixXY(bases, true, msa);
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ clipped=ss.clipTipIndels(bases, basesM, 4, 10, msa) || clipped;
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ assert(ss.lengthsAgree());
+ }
+ }else{
+ ss.match=null;
+ }
+ }
+ //NOTE(review): when a match string was generated above, setLimits is true and setSlowScore(swscoreArray[0]) is
+ //not called in this method - confirm that fixXY/clipTipIndels (or traceback) are responsible for the slow score on that path.
+ if(swscoreArray!=null && !setLimits){
+ if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));}
+ ss.setSlowScore(swscoreArray[0]);
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ assert(ss.lengthsAgree());
+ }else{
+ assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP);
+ assert(clipped || swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) :
+ setLimits+", "+clipped+", "+(swscoreArray==null)+", "+
+ swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+
+ ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow
+ }
+ assert(ss.lengthsAgree());
+ ss.setScore(ss.slowScore);
+ //Both thresholds only ever rise, so sites later in the list face limits at least as strict as earlier ones.
+ minMatch=Tools.max(minMatch, ss.slowScore);
+ minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3);
+ assert(ss.slowScore<=maxSwScore);
+ assert(!(ss.perfect && ss.slowScore<maxSwScore));
+ //A slow score equal to maxSwScore marks the site perfect (and therefore semiperfect); otherwise setPerfect decides.
+ ss.perfect=(ss.slowScore==maxSwScore);
+ if(ss.perfect){ss.semiperfect=true;}
+ else if(!ss.semiperfect){ss.setPerfect(bases);}
+
+ if(verbose){System.err.println(" -> "+ss);}
+ }
+
+ }
+
+
+ /**
+ * Maps a single (unpaired) read.
+ * Stages, in order: quick mapping, optional site-list trimming, slow alignment
+ * (scoreNoIndels, findTipDeletions, scoreSlow), duplicate-site merging, ambiguity
+ * handling, low-score site removal, match-string generation, clearzone3,
+ * local-alignment conversion, post-filtering, and statistics.
+ * Results are written into the read itself (sites, mapScore, match, flags) and
+ * into this thread's counters; nothing is returned.
+ * @param r Read to map; returns immediately if idmodulo>1 and r.numericID%idmodulo!=1
+ * @param basesM Passed alongside r.bases to the alignment helpers; presumably the
+ * reverse-complemented bases of r - TODO confirm against caller
+ */
+ public void processRead(final Read r, final byte[] basesM){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final byte[] basesP=r.bases;
+
+// System.err.print(" rd#"+r.numericID+" ");
+// if(r.numericID==25967){
+// verbose=true;
+// msa.verbose=true;
+// GapTools.verbose=true;
+// index.verbose=true;
+// tcr.verbose=true;
+// }
+
+ if(verbose){System.err.println("\nProcessing "+r);}
+ readsUsed1++;
+
+ final int maxPossibleQuickScore=quickMap(r, basesM);
+ if(verbose){System.err.println("\nQuick Map: \t"+r.sites);}
+
+ //A negative return from quickMap is treated as a low-quality read: discard it.
+ if(maxPossibleQuickScore<0){
+ r.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=basesP.length;
+ r.setDiscarded(true);
+ return;
+ }
+ initialSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);}
+
+ //Both stay 0 unless slow alignment or affine scoring is enabled.
+ int maxSwScore=0;
+ int maxImperfectSwScore=0;
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ maxSwScore=msa.maxQuality(r.length());
+ maxImperfectSwScore=msa.maxImperfectScore(r.length());
+ }
+
+ if(TRIM_LIST && r.numSites()>1){
+ if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);}
+ //Return value (highest quick score) is not needed here.
+ trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\nAfter trim: \t"+r.sites);}
+
+ assert(Read.CHECKSITES(r, basesM));
+
+
+ if(SLOW_ALIGN && r.numSites()>0){
+
+ int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore);
+
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+// int numPerfectScores=0;
+// if(numNearPerfectScores>0){
+// for(SiteScore ss : r.list){
+// if(ss.perfect){numPerfectScores++;}
+// else{break;}
+// }
+// }
+
+ if(verbose){
+ System.err.println("\nAfter scoreNoIndels: \t"+r.sites);
+ }
+
+ //Tip-deletion search and full slow scoring are skipped when a near-perfect site already exists.
+ if(numNearPerfectScores<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);}
+ }
+
+ if(verbose){
+ System.err.println("\nAfter findTipDeletions: \t"+r.sites);
+ }
+
+ //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length
+ //***Above note should be resolved now, but needs to be verified.
+
+ if(numNearPerfectScores<1){
+ scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore);
+ }
+
+ if(STRICT_MAX_INDEL){
+ //Return value (number removed) is not needed here.
+ removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+
+ if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);}
+ assert(Read.CHECKSITES(r, basesM, false));
+ }
+
+
+ if(r.numSites()>0){
+ mapped1++;
+ try {
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ } catch (Exception e) {
+ e.printStackTrace();
+ //Chain the original exception so the root cause is not lost when this propagates.
+ throw new RuntimeException("\n\n"+r.toText(false)+"\n\n", e);
+ }
+ Collections.sort(r.sites);
+ }
+
+ if(r.numSites()>1){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r.sites.get(1);
+ //Ensure no duplicates
+ assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false);
+ }
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore);
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);}
+
+ if(r.numSites()>1){
+ //Choose the clearzone from how close the top score is to the maximum possible score.
+ final int clearzone=r.perfect() ? CLEARZONEP :
+ r.topSite().score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ final int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false);
+ r.setAmbiguous(b);
+ }
+ }
+
+ if(verbose){System.err.println("A: "+r);}
+
+ if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){
+ int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.topSite().score<lim){r.sites=null;}
+ else{Tools.removeLowQualitySitesUnpaired(r.sites, Tools.min(lim, Tools.max(1, lim-CLEARZONE3)));}
+ }
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(verbose){System.err.println("B: "+r);}
+
+ //Unimportant anomaly due to ambiguous reads that later have low quality sites removed and become unmapped.
+// assert(!r.mapped() || new SamLine(r, 0).toRead(true).ambiguous()==r.ambiguous()) : "\n"+r+"\n\n"+new SamLine(r, 0)+"\n\n"+new SamLine(r, 0).toRead(true)+"\n\n"+
+// "ambi="+ambi+", r.ambiguous()="+r.ambiguous()+", new SamLine(r, 0).toRead(true).ambiguous()="+new SamLine(r, 0).toRead(true).ambiguous()+"\n\n"+
+// "r.mapped="+r.mapped()+", sl.mapped()="+new SamLine(r, 0).mapped()+", sl.toRead(true).mapped()="+new SamLine(r, 0).toRead(true).mapped();
+// assert(r.ambiguous()==ambi) : r;
+
+ assert(r.gaps==null || r.gaps[0]==r.start && r.gaps[r.gaps.length-1]==r.stop);
+ assert(r.sites==null || r.mapScore>0) : r.sites+", "+r.mapScore+"\n"+r;
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("C: "+r);}
+
+ //***$
+ if(MAKE_MATCH_STRING && r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ if(r.sites.size()>1){
+ assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n";
+ }
+ //Only referenced by the disabled diagnostic inside the loop below.
+ int mapScore=r.mapScore;
+
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+
+ if(verbose){System.err.println("D: "+r);}
+
+ {
+ //Generate the match string for the top site. If that drops the top site's score
+ //below the second site's, re-sort and repeat with the new top site.
+ boolean firstIter=true;
+ do{//
+ if(!firstIter){
+ Collections.sort(r.sites);
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ }
+ genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true);
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+// TODO: Fix this; it should never happen.
+// if(mapScore>r.mapScore){
+// System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID);
+// }
+ if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){
+ //Force a very low score so this site loses to any other site and is later treated as unmapped.
+ SiteScore ss=r.topSite();
+ ss.score=r.mapScore=Tools.min(ss.score, -9999);
+ ss.setSlowPairedScore(ss.score, ss.score);
+ }
+ r.topSite().score=r.topSite().slowScore;
+ firstIter=false;
+ }while(r.sites.size()>1 && r.topSite().score<r.sites.get(1).score);
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("E: "+r);}
+ }
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ removeDuplicateBestSites(r);
+ }
+ if(r.numSites()>0){r.topSite().match=r.match;}
+
+
+ //A non-positive mapScore with sites still present means match generation failed: mark unmapped.
+ if(r.sites!=null && r.mapScore<=0){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){
+ System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped.\t"+(r.match==null)+"\t"+r.mapScore+"\t"+r.topSite()+"\t"+new String(r.bases));
+ if(MSA.bandwidth>0 || MSA.bandwidthRatio>0 || MSA.flatMode){Shared.anomaly=true;}
+ }
+ r.mapScore=0;
+ r.setMapped(false);
+ r.sites=null;
+ }
+
+
+
+ //This block is to prevent an assertion from firing. Generally caused by alignment being lost during match generation.
+ //TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(BANDWIDTH<1){
+ if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ }
+ r.clearMapping();
+ }
+ assert(r.sites==null || r.mapScore>0) :
+ "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+
+ "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+
+ "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+
+ "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n";
+
+// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString();
+
+
+
+ //clearzone3 is only applied when it is wider than the clearzones already used above.
+ if(CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP){
+ boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3);
+ if(changed){
+ int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.mapScore<minScore){
+ assert(!r.ambiguous());
+ r.setAmbiguous(true);
+ }
+ }
+ }
+
+ if(r.ambiguous() && AMBIGUOUS_TOSS){r.sites=null; r.clearSite(); r.setMapped(false);}
+ assert(checkTopSite(r));
+
+ if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM, maxImperfectSwScore, maxSwScore);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ msa.toLocalAlignment(r, r.topSite(), basesM, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ if(r.numSites()==0 || (!r.ambiguous() && r.mapScore<maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)){
+ r.clearMapping();
+ }
+ postFilterRead(r, basesM, maxImperfectSwScore, maxSwScore);
+ if(MAKE_MATCH_STRING){ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore);}
+
+ if(PENALIZE_AMBIG){
+ int penalty=calcTipScorePenalty(r, maxSwScore, 7);
+ applyScorePenalty(r, penalty);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore, maxPossibleQuickScore);
+ }
+ }
+
+
+ /**
+ * Performs the initial pairing pass between the candidate sites of two mates.
+ * Every site's paired score is reset to 0, then both site lists are swept together
+ * and each compatible (site1, site2) combination may raise the paired score of
+ * either site. Afterwards each site's score is replaced by its paired score when
+ * that is higher, and the lists are optionally trimmed.
+ * @param r First read
+ * @param r2 Second read
+ * @param trim If true, trim both site lists with Tools.trimSitesBelowCutoff afterwards
+ * @return Number of perfect pairs found (a lower bound; see note in the loop)
+ */
+ public int pairSiteScoresInitial(Read r, Read r2, boolean trim){
+
+ if(r.numSites()<1 || r2.numSites()<1){return 0;}
+
+ //The sweep below relies on the ordering produced by PCOMP
+ //(presumably chromosome, then position - TODO confirm in SiteScore).
+ SiteScore.PCOMP.sort(r.sites);
+ SiteScore.PCOMP.sort(r2.sites);
+
+ for(SiteScore site : r.sites){site.setPairedScore(0);}
+ for(SiteScore site : r2.sites){site.setPairedScore(0);}
+
+ //Highest unpaired score among sites that received a pairing; -1 if none did.
+ int maxPairedScore1=-1;
+ int maxPairedScore2=-1;
+
+ final int last1=r.sites.size()-1;
+ final int last2=r2.sites.size()-1;
+ final int maxReadLen=Tools.max(r.length(), r2.length());
+
+ final int outerDistLimit=(maxReadLen*(OUTER_DIST_MULT))/OUTER_DIST_DIV;
+ final int innerDistLimit=MAX_PAIR_DIST;
+ final int expectedFragLength=AVERAGE_PAIR_DIST+r.length()+r2.length();
+
+ int numPerfectPairs=0;
+
+ for(int i=0, j=0; i<=last1 && j<=last2; i++){
+ final SiteScore ss1=r.sites.get(i);
+ SiteScore ss2=r2.sites.get(j);
+
+ //Advance the window start past mate sites on an earlier chromosome or too far before ss1.
+ while(j<last2 && (ss2.chrom<ss1.chrom || (ss2.chrom==ss1.chrom && ss1.start-ss2.stop>innerDistLimit))){
+ j++;
+ ss2=r2.sites.get(j);
+ }
+
+ for(int k=j; k<=last2; k++){
+ ss2=r2.sites.get(k);
+
+ //Stop once mate sites are on a later chromosome or too far after ss1.
+ if(ss2.chrom>ss1.chrom || ss2.start-ss1.stop>innerDistLimit){break;}
+
+ //Decide which site is treated as the left one. When correct strands are required
+ //and the strands differ, the strand of ss1 decides; otherwise the start positions do.
+ final boolean ss1OnLeft;
+ if(REQUIRE_CORRECT_STRANDS_PAIRS && ss1.strand!=ss2.strand){
+ ss1OnLeft=(ss1.strand==Gene.PLUS);
+ }else{
+ ss1OnLeft=(ss1.start<=ss2.start);
+ }
+
+ final int innerdist=(ss1OnLeft ? ss2.start-ss1.stop : ss1.start-ss2.stop);
+ final int outerdist=(ss1OnLeft ? ss2.stop-ss1.start : ss1.stop-ss2.start);
+
+ assert(outerdist>=innerdist);
+
+ if(outerdist<outerDistLimit || innerdist>innerDistLimit){continue;}
+
+ final boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS);
+ if(!strandOK && REQUIRE_CORRECT_STRANDS_PAIRS){continue;}
+
+ final int deviation=absdif(AVERAGE_PAIR_DIST, innerdist);
+
+ final int pairedScore1, pairedScore2;
+ if(strandOK){
+ //Own score, plus up to half the mate's score reduced by a deviation penalty.
+ final int penaltyDivisor=32*expectedFragLength+100;
+ pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-(((deviation)*ss2.score)/penaltyDivisor));
+ pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-(((deviation)*ss1.score)/penaltyDivisor));
+ }else{//e.g. a junction
+ pairedScore1=ss1.score+Tools.max(0, ss2.score/16);
+ pairedScore2=ss2.score+Tools.max(0, ss1.score/16);
+ }
+
+ final boolean paired1=(pairedScore1>ss1.pairedScore);
+ if(paired1){
+ ss1.setPairedScore(Tools.max(ss1.pairedScore, pairedScore1));
+ maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+ }
+
+ final boolean paired2=(pairedScore2>ss2.pairedScore);
+ if(paired2){
+ ss2.setPairedScore(Tools.max(ss2.pairedScore, pairedScore2));
+ maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+
+ if(paired1 && paired2 && outerdist>=maxReadLen && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){
+ numPerfectPairs++; //Lower bound. Some perfect pairs may be the same.
+ }
+ }
+ }
+
+ //Promote paired scores; a site whose paired score did not exceed its score must be unpaired.
+ for(SiteScore site : r.sites){
+ if(site.pairedScore<=site.score){assert(site.pairedScore==0);}
+ else{site.score=site.pairedScore;}
+ }
+ for(SiteScore site : r2.sites){
+ if(site.pairedScore<=site.score){assert(site.pairedScore==0);}
+ else{site.score=site.pairedScore;}
+ }
+
+ if(trim){
+ //With a perfect pair: always trim, at a 0.94 cutoff. Otherwise: trim only lists
+ //longer than 4 sites, at a 0.9 cutoff, and pass true instead of false as the third argument.
+ final boolean havePerfect=(numPerfectPairs>0);
+ final float cutoffRatio=(havePerfect ? .94f : .9f);
+ if(havePerfect || r.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*cutoffRatio), !havePerfect, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ if(havePerfect || r2.sites.size()>4){
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*cutoffRatio), !havePerfect, true, 1, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ }
+
+ return numPerfectPairs;
+ }
+
+
+ /**
+ * Maps a read together with its mate (r.mate, which must be non-null).
+ * Stages, in order: quick mapping of both reads, initial pairing, optional trimming,
+ * slow alignment and optional rescue, low-score site removal, final pairing,
+ * ambiguity handling, pair flagging, bad-pair removal, match-string generation,
+ * anomaly recovery, local-alignment conversion, post-filtering, and statistics.
+ * Results are written into both reads and into this thread's counters; nothing is returned.
+ * @param r First read of the pair; returns immediately if idmodulo>1 and r.numericID%idmodulo!=1
+ * @param basesM1 Passed alongside r.bases to the alignment helpers; presumably the
+ * reverse-complemented bases of r - TODO confirm against caller
+ * @param basesM2 Same as basesM1, but for r.mate
+ */
+ public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final Read r2=r.mate;
+ assert(r2!=null);
+ final byte[] basesP1=r.bases, basesP2=r2.bases;
+ final int len1=(basesP1==null ? 0 : basesP1.length), len2=(basesP2==null ? 0 : basesP2.length);
+
+ readsUsed1++;
+ readsUsed2++;
+
+ final int maxPossibleQuickScore1=quickMap(r, basesM1);
+ final int maxPossibleQuickScore2=quickMap(r2, basesM2);
+
+ if(verbose){
+ System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate);
+ }
+
+ //The pair is discarded only when quickMap returned a negative value for BOTH reads.
+ if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){
+ r.sites=null;
+ r2.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=len1;
+ r.setDiscarded(true);
+ lowQualityReadsDiscarded2++;
+ lowQualityBasesDiscarded2+=len2;
+ r2.setDiscarded(true);
+ return;
+ }
+
+ //Not really needed due to subsumption
+// Tools.mergeDuplicateSites(r.list);
+// Tools.mergeDuplicateSites(r2.list);
+
+ initialSiteSum1+=r.numSites();
+ initialSiteSum2+=r2.numSites();
+
+ //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used.
+ //Discards need to be tracked separately for each end.
+// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;}
+
+ final int maxSwScore1=msa.maxQuality(len1);
+ final int maxImperfectSwScore1=msa.maxImperfectScore(len1);
+ final int maxSwScore2=msa.maxQuality(len2);
+ final int maxImperfectSwScore2=msa.maxImperfectScore(len2);
+
+ pairSiteScoresInitial(r, r2, TRIM_LIST);
+ if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(TRIM_LIST){
+
+ if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){
+ if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);}
+ if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);}
+ }
+
+ trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ postTrimSiteSum2+=r2.numSites();
+
+ {//Reset score to non-paired score
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ }
+
+ if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);}
+
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN){
+
+ if(r.numSites()>0){
+
+ int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores1<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);}
+ }
+
+ //TODO:
+ //Note scoreSlow can be skipped under this circumstance:
+ //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites.
+ scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ if(STRICT_MAX_INDEL){
+ //Return value (number removed) is not needed here.
+ removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ if(r2.numSites()>0){
+ int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores2<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);}
+ }
+
+ scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ if(STRICT_MAX_INDEL){
+ //Return value (number removed) is not needed here.
+ removeLongIndels(r2.sites, index.MAX_INDEL);
+ if(r2.numSites()<1){r2.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+
+ if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+
+ if(DO_RESCUE){
+ //Count sites that have no pairing (pairedScore==0); rescue runs only for a read that has some.
+ int unpaired1=0;
+ int unpaired2=0;
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired1++;}
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r2.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired2++;}
+ }
+ }
+
+ if(unpaired1>0 && r.numSites()>0){
+ Collections.sort(r.sites);
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+ if(unpaired2>0 && r2.numSites()>0){
+ Collections.sort(r2.sites);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ postRescueSiteSum1+=r.numSites();
+ postRescueSiteSum2+=r2.numSites();
+
+// if(r.list!=null){Collections.sort(r.list);}
+// if(r2.list!=null){Collections.sort(r2.list);}
+//
+// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+
+ if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+ }else{
+ Tools.mergeDuplicateSites(r.sites, true, false);
+ Tools.mergeDuplicateSites(r2.sites, true, false);
+ if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+
+ if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!)
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+ if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(r.numSites()>0){
+ mapped1++;
+ Collections.sort(r.sites);
+ }
+ if(r2.numSites()>0){
+ mapped2++;
+ Collections.sort(r2.sites);
+ }
+ assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ r.setPerfectFlag(maxSwScore1);
+ r2.setPerfectFlag(maxSwScore2);
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+ }
+
+
+ if(r.numSites()>1){
+ //Choose the clearzone from how close the top score is to the maximum possible score.
+ final int clearzone=r.perfect() ? CLEARZONEP :
+ r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false);
+ r.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+
+ if(r2.numSites()>1){
+ final int clearzone=r2.perfect() ? CLEARZONEP :
+ r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites2=Tools.countTopScores(r2.sites, clearzone);
+ if(numBestSites2>1){
+ //Ambiguous alignment
+ assert(r2.sites.size()>1);
+
+ boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, false);
+ r2.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ //Flag both reads as paired when their top sites can pair with each other.
+ if(r.numSites()>0 && r2.numSites()>0){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r2.topSite();
+ if(canPair(ss1, ss2, len1, len2, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n"+
+ r.mapped()+", "+r.paired()+", "+r.strand()+", "+r.ambiguous()+"\n\n"+r2.mapped()+", "+r2.paired()+", "+r2.strand()+", "+r2.ambiguous()+"\n\n";
+ assert(SLOW_ALIGN ? ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n";
+ r.setPaired(true);
+ r.mate.setPaired(true);
+ }
+ }
+
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;}
+
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ if(KILL_BAD_PAIRS){
+ if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ //Keep the read with the higher score per base. len1/len2 are 0 when bases are null,
+ //so clamp the divisor to avoid an ArithmeticException.
+ int x=r.mapScore/Tools.max(1, len1);
+ int y=r2.mapScore/Tools.max(1, len2);
+ if(x>=y){
+ r2.clearAnswers(false);
+ }else{
+ r.clearAnswers(false);
+ }
+ }
+ }
+ if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ if(MAKE_MATCH_STRING){
+ if(r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false);
+
+ if(STRICT_MAX_INDEL && r.mapped()){
+ if(hasLongIndel(r.match, index.MAX_INDEL)){
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){
+ r2.match=r2.topSite().match;
+ }else{
+ genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false);
+
+ if(STRICT_MAX_INDEL && r2.mapped()){
+ if(hasLongIndel(r2.match, index.MAX_INDEL)){
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ }
+
+ assert(checkTopSite(r)); // TODO remove this
+ if(verbose){
+ System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2);
+ //NOTE(review): these conversions only run in verbose mode, so verbose changes read state - confirm intended.
+ if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);}
+ if(r2.match!=null && r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);}
+ }
+
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ //These messages report r2, the read whose state is inconsistent.
+ if(r2.mapScore>0 && r2.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r2+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }else if(r2.mapScore<=0 && r2.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r2+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+
+ assert(r.sites==null || r.mapScore>0) :
+ r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+ assert(r2.sites==null || r2.mapScore>0) :
+ r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+
+ assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails.";
+ assert(checkTopSite(r)); // TODO remove this
+ removeDuplicateBestSites(r);
+ removeDuplicateBestSites(r2);
+
+ //Update the expected insert size from the running totals once more than 1000 mated pairs are counted.
+ if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){
+ AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ if(r.ambiguous() && AMBIGUOUS_TOSS){
+ if(r.sites!=null){r.sites=null;}
+ r.clearSite();
+ r.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.ambiguous() && AMBIGUOUS_TOSS){
+ if(r2.sites!=null){r2.sites=null;}
+ r2.clearSite();
+ r2.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r2.mapped() && r2.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ assert(Read.CHECKSITES(r2, basesM2));
+ }
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ final SiteScore ss=r.topSite();
+ ss.match=r.match;
+ msa.toLocalAlignment(r, ss, basesM1, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// System.err.println("\n\n*********\n\n"+r+"\n\n*********\n\n");
+// assert(Read.CHECKSITES(r, basesM1)); //TODO: This can fail; see bug#0001
+ }
+
+ assert(checkTopSite(r2));
+ if(r2.mapped() && (LOCAL_ALIGN || r2.containsXYC())){
+ final SiteScore ss=r2.topSite();
+ ss.match=r2.match;
+ msa.toLocalAlignment(r2, ss, basesM2, r2.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// assert(Read.CHECKSITES(r2, basesM2)); //TODO: This can fail; see bug#0001
+ }
+
+ postFilterRead(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ postFilterRead(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ if(MAKE_MATCH_STRING){
+ ensureMatchStringOnPrimary(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ ensureMatchStringOnPrimary(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1);
+ calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2);
+ }
+ }
+
+}
diff --git a/current/align2/BBMapThreadPacBioSkimmer.java b/current/align2/BBMapThreadPacBioSkimmer.java
new file mode 100755
index 0000000..806b33a
--- /dev/null
+++ b/current/align2/BBMapThreadPacBioSkimmer.java
@@ -0,0 +1,1758 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import jgi.CoveragePileup;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SamLine;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * Based on MapTestThread11f
+ * Designed to skim and retain all sites above a threshold.
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public final class BBMapThreadPacBioSkimmer extends AbstractMapThread{
+
+ static final int ALIGN_COLUMNS=BBIndexPacBioSkimmer.ALIGN_COLUMNS;
+ static final int ALIGN_ROWS=4020;
+
+
+
+ /** Don't trim for local alignments unless at least this many bases will be clipped */
+ private final int LOCAL_ALIGN_TIP_LENGTH=1;
+ /** Range is 0-1; a lower number makes trimming more aggressive */
+ private final float LOCAL_ALIGN_MATCH_POINT_RATIO=0.75f;
+
+ /** Ratio of the points for a match of a single base needed to declare unambiguous */
+ public final float CLEARZONE_RATIOP=1.5f; //Multiplied by POINTS_MATCH2 in the constructor to give CLEARZONEP
+ public final float CLEARZONE_RATIO1=2.2f; //Multiplied by POINTS_MATCH2 in the constructor to give CLEARZONE1
+ public final float CLEARZONE_RATIO1b=2.8f; //Multiplied by POINTS_MATCH2 in the constructor to give CLEARZONE1b
+ public final float CLEARZONE_RATIO1c=4.8f; //Multiplied by POINTS_MATCH2 in the constructor to give CLEARZONE1c
+ public final float CLEARZONE_RATIO3=8f; //Multiplied by POINTS_MATCH2 to give CLEARZONE3, but only when PENALIZE_AMBIG is set
+ /** Max allowed number of sites within 1 edit (excluding primary site) */
+ public final int CLEARZONE_LIMIT1e=4;
+ //public final int CLEARZONE1e;
+ public final int CLEARZONEP; //Clear zone used by processRead when the read is flagged perfect
+ public final int CLEARZONE1; //Clear zone used when the top score is at least maxSwScore*CLEARZONE1b_CUTOFF
+ public final int CLEARZONE1b; //Clear zone used when the top score is at least maxSwScore*CLEARZONE1c_CUTOFF
+ public final int CLEARZONE1c; //Clear zone used for all lower top scores
+ //public final int CLEARZONE1e;
+ public final int CLEARZONE3; //0 unless aligning (SLOW_ALIGN or MAKE_MATCH_STRING) with PENALIZE_AMBIG set
+ public final float INV_CLEARZONE3; //1f/CLEARZONE3, or 0 when CLEARZONE3 is 0
+ public final float CLEARZONE1b_CUTOFF=0.92f; //Fraction of maxSwScore below which CLEARZONE1b replaces CLEARZONE1
+ public final float CLEARZONE1c_CUTOFF=0.82f; //Fraction of maxSwScore below which CLEARZONE1c replaces CLEARZONE1b
+
+ public final BBIndexPacBioSkimmer index;
+
+
+ private static int MIN_TRIM_SITES_TO_RETAIN_SINGLE=2;
+ private static int MIN_TRIM_SITES_TO_RETAIN_PAIRED=1;
+
+ /** TODO - perhaps I can rewrite cz3 to penalize reads that map similarly to more than the expected number of places */
+ public static final boolean USE_CLEARZONE3=false;
+
+ private static int EXPECTED_SITES=1;
+ public static void setExpectedSites(int x){ //Sets the expected number of sites per read and rescales the trim-retention limits
+ EXPECTED_SITES=x;
+ MAX_TRIM_SITES_TO_RETAIN=x*40+80; //Upper retention limit grows linearly with the expected site count
+ MIN_TRIM_SITES_TO_RETAIN_PAIRED=Tools.max(x*4+1, MIN_TRIM_SITES_TO_RETAIN_PAIRED); //Minimums are only ever raised, never lowered
+ MIN_TRIM_SITES_TO_RETAIN_SINGLE=Tools.max(x*4+1, MIN_TRIM_SITES_TO_RETAIN_SINGLE);
+ }
+
+ @Override
+ public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;} //Exposes the static ALIGN_COLUMNS constant (copied from BBIndexPacBioSkimmer) through the overridden accessor
+ @Override
+ public final int ALIGN_ROWS(){return ALIGN_ROWS;} //Exposes the static ALIGN_ROWS constant (4020)
+ @Override
+ public final int maxReadLength(){return ALIGN_ROWS-1;} //Longest supported read is one less than the number of alignment rows
+ @Override
+ final AbstractIndex index(){return index;} //Returns this thread's BBIndexPacBioSkimmer as an AbstractIndex
+ @Override
+ final int CLEARZONE1(){return CLEARZONE1;} //Returns the CLEARZONE1 value computed in the constructor
+
+ public BBMapThreadPacBioSkimmer(ConcurrentReadInputStream cris_, int keylen_, //Forwards all settings to AbstractMapThread, then derives the clear zones and builds the index
+ CoveragePileup pileup_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_,
+ int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_,
+ boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_,
+ float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_,
+ ConcurrentReadOutputStream outStream_, ConcurrentReadOutputStream outStreamMapped_, ConcurrentReadOutputStream outStreamUnmapped_, ConcurrentReadOutputStream outStreamBlack_,
+ int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_,
+ int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_,
+ boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_,
+ boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_,
+ boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, float IDFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, int TRIM_MIN_LEN_,
+ boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){
+
+ super(cris_, //Argument order here differs from this constructor's parameter order; it must match the superclass signature
+ outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_,
+ pileup_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_,
+ AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, TRIM_MIN_LEN_, THRESH_,
+ minChrom_, maxChrom_, KFILTER_, IDFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_,
+ REQUIRE_CORRECT_STRANDS_PAIRS_,
+ SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_,
+ MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_,
+ MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_,
+ QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_,
+ keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_,
+ BBIndexPacBioSkimmer.MIN_APPROX_HITS_TO_KEEP, BBIndexPacBioSkimmer.USE_EXTENDED_SCORE, //Index-specific constants are supplied by this subclass rather than by the caller
+ BBIndexPacBioSkimmer.BASE_HIT_SCORE, BBIndexPacBioSkimmer.USE_AFFINE_SCORE, BBIndexPacBioSkimmer.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_);
+
+ assert(SLOW_ALIGN_PADDING>=0);
+ assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO"; //Mate reverse-complementing is rejected for this mapper (only checked when assertions are enabled)
+
+ if(SLOW_ALIGN || MAKE_MATCH_STRING){ //When an aligner is in use, each clear zone is its ratio times POINTS_MATCH2, truncated to int
+// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, MSA_TYPE);
+// POINTS_MATCH=msa.POINTS_MATCH();
+// POINTS_MATCH2=msa.POINTS_MATCH2();
+ CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2); //NOTE(review): msa and POINTS_MATCH2 are presumably initialized by the superclass constructor now - confirm
+ CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2);
+ CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2);
+ CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2);
+ CLEARZONE3=PENALIZE_AMBIG ? (int)(CLEARZONE_RATIO3*POINTS_MATCH2) : 0; //Zone 3 is disabled (0) unless ambiguity penalties are on
+// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1;
+ }else{ //No aligner: all clear zones are zero
+// POINTS_MATCH=70;
+// POINTS_MATCH2=100;
+// msa=null;
+ CLEARZONE1=0;
+ CLEARZONE1b=0;
+ CLEARZONE1c=0;
+ CLEARZONEP=0;
+ CLEARZONE3=0;
+// CLEARZONE1e=0;
+ }
+ INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3); //Guarded reciprocal; avoids dividing by zero when zone 3 is disabled
+
+ index=new BBIndexPacBioSkimmer(KEYLEN, minChrom, maxChrom, KFILTER, msa); //Each thread constructs its own index object over the configured chromosome range
+ }
+
+
+ public int trimList(ArrayList<SiteScore> list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){ //Trims the candidate site list in place with progressively stricter cutoffs as the list gets longer; returns the highest score reported by the trimmer
+ if(list==null || list.size()==0){return -99999;} //Sentinel result for "no sites"
+ if(list.size()==1){return list.get(0).score;} //A single site is never trimmed
+
+ boolean b=(list.size()>=minSitesToRetain); //Only read by the assertions below
+
+ final int highestScore;
+ if(USE_AFFINE_SCORE){
+
+ Tools.trimSiteList(list, .10f, retainPaired, true, EXPECTED_SITES<3 ? EXPECTED_SITES : EXPECTED_SITES*2, maxSitesToRetain); //Loose first pass; its return value is ignored. presumably the fraction is relative to the top score - TODO confirm in Tools
+
+ highestScore=Tools.trimSiteList(list, .25f, retainPaired, true, minSitesToRetain, maxSitesToRetain); //This call supplies the value returned for the affine branch
+ if(highestScore==maxScore && specialCasePerfect){ //A perfect top score allows very aggressive trimming and an early return
+ Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ return highestScore;
+ }
+
+ final int mstr2=(minSitesToRetain<=1 ? 1 : minSitesToRetain+1); //Slightly larger minimum used by the three strictest passes
+ final int mstr3=(minSitesToRetain+1)/2; //Roughly half the minimum, used by the trimSiteListByMax passes
+
+ assert(!b || list.size()>=EXPECTED_SITES);
+
+ int N=(EXPECTED_SITES+3)/4; //Size thresholds below scale with a quarter of the expected site count, rounded up
+
+ if(list.size()>2*N){
+ Tools.trimSiteListByMax(list, (int)(.12f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); //Cutoff here is a fraction of the maximum possible score, not of the observed top score
+ }
+
+ if(list.size()>3*N){
+ Tools.trimSiteList(list, .28f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.14f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+
+ if(list.size()>4*N){
+ Tools.trimSiteList(list, .32f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.16f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+
+ if(list.size()>5*N){
+ Tools.trimSiteList(list, .36f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.17f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+
+ if(list.size()>6*N){
+ Tools.trimSiteList(list, .38f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.18f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+
+ if(list.size()>8*N){
+ Tools.trimSiteList(list, .40f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.20f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+// // System.out.print(", "+list.size());
+ if(list.size()>12*N){
+ Tools.trimSiteList(list, .45f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.22f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+// // System.out.print(", "+list.size());
+ if(list.size()>16*N){
+ Tools.trimSiteList(list, .50f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.24f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+// // System.out.print(", "+list.size());
+ if(list.size()>20*N){
+ Tools.trimSiteList(list, .55f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.26f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+
+ if(list.size()>24*N){
+ Tools.trimSiteList(list, .60f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.28f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+// //// System.out.print(", "+list.size());
+ if(list.size()>32*N){
+ Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+ Tools.trimSiteListByMax(list, (int)(.30f*maxScore), retainPaired, true, mstr3, maxSitesToRetain);
+ }
+// //// System.out.print(", "+list.size());
+ if(list.size()>40*N){Tools.trimSiteList(list, .70f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+// // System.out.print(", "+list.size());
+ if(list.size()>48*N){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>64*N){Tools.trimSiteList(list, .80f, retainPaired, true, mstr2, maxSitesToRetain);} //From here on the larger minimum mstr2 is used
+ // System.out.print(", "+list.size());
+ if(list.size()>80*N){Tools.trimSiteList(list, .85f, retainPaired, true, mstr2, maxSitesToRetain);}
+ if(list.size()>96*N){Tools.trimSiteList(list, .90f, retainPaired, true, mstr2, maxSitesToRetain);}
+
+ assert(!b || list.size()>=EXPECTED_SITES);
+
+
+ }else if(BBIndexPacBioSkimmer.USE_EXTENDED_SCORE){ //Extended-score mode: fixed size thresholds, not scaled by EXPECTED_SITES
+ highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+ if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+
+
+ }else{ //Neither affine nor extended scoring: looser starting fraction and fewer passes
+ // System.out.print("\n\nSize:\t"+list.size());
+
+
+ highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);
+
+ if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ //// System.out.print(", "+list.size());
+ // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);}
+ // System.out.print(", "+list.size());
+ }
+
+ return highestScore;
+ }
+
+
+ public void scoreSlow(final ArrayList<SiteScore> list, final byte[] basesP, final byte[] basesM, //Re-scores every site in the list with the msa aligner, updating score, limits, perfect flags and (optionally) match strings in place
+ final int maxSwScore, final int maxImperfectSwScore){ //basesP is used for plus-strand sites and basesM for all others
+
+ final int minMsaLimit; //Lowest score the aligner is asked to pursue
+ if(PAIRED){
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); //Paired reads use the pre-rescue ratio
+ }else{
+ minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore);
+ }
+ assert(Read.CHECKSITES(list, basesP, basesM, -1));
+
+ int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string
+ if(verbose){
+ System.err.println("Slow-scoring. maxSwScore="+maxSwScore+", maxImperfectSwScore="+maxImperfectSwScore+", minMsaLimit="+minMsaLimit+", minMatch="+minMatch);
+ }
+ for(int i=0; i<list.size(); i++){
+ final SiteScore ss=list.get(i);
+ assert(ss.lengthsAgree());
+ final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
+
+ if(SEMIPERFECTMODE){
+ assert(ss.stop-ss.start==bases.length-1);
+ assert(ss.semiperfect);
+ }
+
+ if(verbose){System.err.println("\nSlow-scoring "+ss);}
+ if(ss.stop-ss.start!=bases.length-1){ //Site span differs from read length, so any existing no-indel score is discarded
+ assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText();
+ assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n";
+ ss.setSlowScore(0);
+ ss.semiperfect=false;
+ ss.perfect=false;
+ }
+
+ final int swscoreNoIndel=ss.slowScore; //Snapshot of the score before running the aligner
+ int[] swscoreArray=null; //As used below: [0]=score, [1]=start, [2]=stop, [3..5] are passed to traceback, [6]/[7]=extra left/right padding when length is 8
+
+ boolean clipped=true, setLimits=false;
+ if(swscoreNoIndel<maxImperfectSwScore && !ss.semiperfect){ //Only sites that are not already (semi)perfect are realigned
+ if(verbose && ss.stop-ss.start>4000){
+ System.err.println(ss.toText());
+ System.err.println(list.size());
+ System.err.println();
+ }
+
+ int expectedLen=GapTools.calcGrefLen(ss);
+ if(verbose){System.err.println("expectedLen="+expectedLen);}
+ if(expectedLen>=EXPECTED_LEN_LIMIT){ //Overlong sites are shortened rather than dropped
+ //TODO: Alternately, I could kill the site.
+ ss.setStop(ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT));
+ if(verbose){System.err.println("expectedLen="+expectedLen+"; ss="+ss);}
+ }
+
+ int pad=SLOW_ALIGN_PADDING;
+ final int minscore=Tools.max(swscoreNoIndel, minMsaLimit); //The aligner must at least match the no-indel score
+ final int minscore2=Tools.max(swscoreNoIndel-MSA.MIN_SCORE_ADJUST, minMsaLimit); //Relaxed threshold used only when deciding whether to build a match string
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+ if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));}
+
+ if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen<EXPECTED_LEN_LIMIT)){ //A longer-than-6 result carries extra padding amounts: widen the site and align again
+ int[] oldArray=swscoreArray.clone();
+ assert(swscoreArray.length==8);
+ int extraPadLeft=swscoreArray[6];
+ int extraPadRight=swscoreArray[7];
+
+ if(verbose){
+ System.err.println("msa returned "+Arrays.toString(swscoreArray)+", re-running.");
+ System.err.println("Added extra padding: "+ss.toText()+", "+Arrays.toString(oldArray));
+ }
+
+ ss.setLimits(ss.start-extraPadLeft, ss.stop+extraPadRight);
+ pad=SLOW_ALIGN_PADDING+EXTRA_PADDING;
+ if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));}
+ swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore);
+
+ if(verbose){System.err.println("Result of extra padding: "+ss.toText()+", "+Arrays.toString(swscoreArray));}
+ if(swscoreArray==null || swscoreArray[0]<oldArray[0]){ //Keep the first result when the rerun failed or scored lower
+ if(verbose){
+ System.err.println("Result was inferior.");
+ }
+ swscoreArray=oldArray;
+ }
+ }
+ assert(ss.lengthsAgree());
+ if(verbose){
+ System.err.println(QUICK_MATCH_STRINGS+", "+(swscoreArray==null ? "null" : (swscoreArray.length+", "+swscoreArray[0]+" >=? "+minscore)));
+ System.err.println("start="+ss.start+", stop="+ss.stop+", len="+ss.mappedLength());
+ }
+ if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore2 && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){ //Build the match string now only for 6-element results that score well enough
+ if(verbose){System.err.println("Generating match string.");}
+ assert(swscoreArray.length==6) : swscoreArray.length;
+ assert(swscoreArray[0]>=minscore2) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch;
+ ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null);
+ if(ss.match!=null){
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ setLimits=true; //NOTE(review): on this path swscoreArray[0] is never stored via setSlowScore here - presumably fixXY/clipTipIndels or later code handles the score; confirm
+ assert(ss.lengthsAgree());
+ clipped=ss.fixXY(bases, true, msa);
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ clipped=ss.clipTipIndels(bases, basesM, 4, 10, msa) || clipped;
+ assert(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore); //123
+ assert(ss.lengthsAgree());
+ }
+ }else{
+ ss.match=null; //Any previous match string is discarded when none is generated here
+ }
+ }
+ if(swscoreArray!=null && !setLimits){ //Aligner produced a result but no match string was kept: adopt its score and limits
+ if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));}
+ ss.setSlowScore(swscoreArray[0]);
+ ss.setLimits(swscoreArray[1], swscoreArray[2]);
+ assert(ss.lengthsAgree());
+ }else{ //No aligner result, or limits already set above: only sanity checks run
+ assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP);
+ assert(clipped || swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) :
+ setLimits+", "+clipped+", "+(swscoreArray==null)+", "+
+ swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+
+ ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow
+ }
+ assert(ss.lengthsAgree());
+ ss.setScore(ss.slowScore); //The site's working score becomes its slow score
+ minMatch=Tools.max(minMatch, ss.slowScore); //Later sites must beat the best slow score seen so far to get a primary match string
+// minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3);
+ assert(ss.slowScore<=maxSwScore);
+ assert(!(ss.perfect && ss.slowScore<maxSwScore));
+ ss.perfect=(ss.slowScore==maxSwScore);
+ if(ss.perfect){ss.semiperfect=true;} //Perfect implies semiperfect
+ else if(!ss.semiperfect){ss.setPerfect(bases);}
+
+ if(verbose){System.err.println(" -> "+ss);}
+ }
+
+ }
+
+
+ /** {group of correct hit (or -1), size of correct group, number of groups,
+ * number of elements, correctScore, maxScore, size of top group, num correct, firstElementCorrect,
+ * firstElementCorrectLoose, firstGroupCorrectLoose} */
+
+ public void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore){
+ if(OUTPUT_PAIRED_ONLY && r.mate!=null && !r.paired() && (r.mapped() || r.mate.mapped())){r.clearPairMapping();}
+ final Read r2=r.mate;
+
+ if(!r.mapped() && (r2==null || !r2.mapped())){
+ bothUnmapped++;
+ bothUnmappedBases+=r.length();
+ if(r2!=null){
+ bothUnmapped++;
+ bothUnmappedBases+=r2.length();
+ }
+ }
+
+ /* {number of correct (loose) sites, number of incorrect (loose) sites, number incorrect sites before last correct site,
+ * number of sites, correctScore, maxScore, firstElementCorrect, firstElementCorrectLoose, position of first correct element (or -1),
+ * sizeOfTopGroup, numTopCorrect} */
+ int[] correctness=calcCorrectnessSkimmer(r, THRESH);
+
+ int numCorrect=correctness[0];
+ int numIncorrect=correctness[1];
+ int numIncorrectPrior=correctness[2];
+ int numSites=correctness[3];
+ int correctScore=correctness[4];
+ int topScore=correctness[5];
+ boolean firstElementCorrect=(correctness[6]==1);
+ boolean firstElementCorrectLoose=(correctness[7]==1);
+ int positionOfFirstCorrect=correctness[8];
+ int sizeOfTopGroup=correctness[9];
+ int numTopCorrect=correctness[10];
+
+ final int len1=r.length();
+ final int len2=(r2==null ? 0 : r.length());
+
+ assert(numSites==numCorrect+numIncorrect) : numSites+", "+numCorrect+", "+numIncorrect+", "+r.numSites();
+ assert(numSites==r.numSites());
+
+ totalNumCorrect1+=numCorrect;
+ totalNumIncorrect1+=numIncorrect;
+ totalNumIncorrectPrior1+=numIncorrectPrior;
+ if(numCorrect>=EXPECTED_SITES){
+ totalNumCapturedAllCorrect1++;
+// assert(numCorrect==EXPECTED_CORRECT_SITES) : numCorrect +", "+EXPECTED_CORRECT_SITES+", "+r.list;
+ assert(r.sites.size()>=EXPECTED_SITES) : numCorrect;
+ if(numTopCorrect==numCorrect){
+ totalNumCapturedAllCorrectTop1++;
+ if(numCorrect==numSites){
+ totalNumCapturedAllCorrectOnly1++;
+ }
+ }
+ }
+
+ assert(numSites>0 == r.mapped());
+ if(numSites>0){
+
+ if(r.match!=null){
+ int[] errors=r.countErrors(SamLine.INTRON_LIMIT);
+ matchCountM1+=errors[0];
+ matchCountS1+=errors[1];
+ matchCountD1+=errors[2];
+ matchCountI1+=errors[3];
+ matchCountN1+=errors[4];
+
+ readCountS1+=(errors[1]>0 ? 1 : 0);
+ readCountD1+=(errors[2]>0 ? 1 : 0);
+ readCountI1+=(errors[3]>0 ? 1 : 0);
+ readCountN1+=(errors[4]>0 ? 1 : 0);
+ readCountSplice1+=(errors[5]>0 ? 1 : 0);
+ readCountE1+=((errors[1]>0 || errors[2]>0 || errors[3]>0)? 1 : 0);
+ }
+
+
+ mappedRetained1++;
+ if(r.rescued()){
+ if(r.strand()==Gene.PLUS){
+ rescuedP1++;
+ }else{
+ rescuedM1++;
+ }
+ }
+ if(r.paired()){
+ numMated++;
+ int inner;
+ int outer;
+ if(r.start<=r.mate.start){
+ inner=r.mate.start-r.stop;
+ outer=r.mate.stop-r.start;
+ }else{
+ inner=r.start-r.mate.stop;
+ outer=r.stop-r.mate.start;
+ }
+
+ inner=Tools.min(MAX_PAIR_DIST, inner);
+ inner=Tools.max(MIN_PAIR_DIST, inner);
+ innerLengthSum+=inner;
+ outerLengthSum+=outer;
+ insertSizeSum+=(inner+r.length()+r.mateLength());
+ }else if(r.mate!=null && r.mate.mapped()/*&& r.list!=null && r.list.size()>0*/){
+ badPairs++;
+ }
+
+ if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){
+ perfectMatch1++;
+ }else if(SLOW_ALIGN){
+ assert(r.topSite().slowScore<maxSwScore) : maxSwScore+"\t"+r.topSite().toText();
+ }
+
+ int foundSemi=0;
+ for(SiteScore ss : r.sites){
+ if(ss.perfect){
+ perfectHitCount1++;
+ assert(ss.semiperfect);
+ }
+ if(ss.semiperfect){
+ semiPerfectHitCount1++;
+ foundSemi=1;
+ }
+ }
+ semiperfectMatch1+=foundSemi;
+ if(foundSemi>0){semiperfectMatchBases1+=len1;}
+
+ if(firstElementCorrect){
+ if(r.strand()==Gene.PLUS){firstSiteCorrectP1++;}
+ else{firstSiteCorrectM1++;}
+ if(r.paired()){firstSiteCorrectPaired1++;}
+ else{firstSiteCorrectSolo1++;}
+ if(r.rescued()){firstSiteCorrectRescued1++;}
+ }else{
+ firstSiteIncorrect1++;
+// System.out.println("********");
+// System.out.println(r.toText(false));
+// System.out.println(r.mate.toText(false));
+ }
+
+ if(firstElementCorrectLoose){
+ firstSiteCorrectLoose1++;
+ }else{
+ firstSiteIncorrectLoose1++;
+ }
+
+ siteSum1+=numSites;
+ topSiteSum1+=sizeOfTopGroup;
+
+ if(topScore==maxPossibleQuickScore){perfectHit1++;}
+ if(sizeOfTopGroup==1){uniqueHit1++;}
+
+ if(numCorrect>0){
+
+ if(r.strand()==Gene.PLUS){truePositiveP1++;}
+ else{truePositiveM1++;}
+ totalCorrectSites1+=numCorrect;
+
+ if(positionOfFirstCorrect==0){
+ correctUniqueHit1++;
+ }else{
+ correctLowHit1++;
+ }
+
+ }else{
+
+ falsePositive1++;
+// System.out.println("********");
+// System.out.println(r.toText(false));
+// System.out.println(r.mate.toText(false));
+ }
+ }else if(maxPossibleQuickScore==-1){
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=len1;
+ r.setDiscarded(true);
+ }else{
+ noHit1++;
+ }
+ }
+
+
+ public void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ //Updates the read-2 statistics counters for one processed read; pair-level counters are not touched here
+
+ int[] correctness=calcCorrectnessSkimmer(r, THRESH); //Indexed below as: 0 correct, 1 incorrect, 2 incorrect-prior, 3 sites, 4 correctScore, 5 topScore, 6/7 first-correct flags, 8 first-correct position, 9 top-group size, 10 top-correct
+
+ int numCorrect=correctness[0];
+ int numIncorrect=correctness[1];
+ int numIncorrectPrior=correctness[2];
+ int numSites=correctness[3];
+ int correctScore=correctness[4];
+ int topScore=correctness[5];
+ boolean firstElementCorrect=(correctness[6]==1);
+ boolean firstElementCorrectLoose=(correctness[7]==1);
+ int positionOfFirstCorrect=correctness[8];
+ int sizeOfTopGroup=correctness[9];
+ int numTopCorrect=correctness[10];
+
+ final int len=r.length();
+
+ totalNumCorrect2+=numCorrect;
+ totalNumIncorrect2+=numIncorrect;
+ totalNumIncorrectPrior2+=numIncorrectPrior;
+ if(numCorrect>=EXPECTED_SITES){ //All expected sites were captured
+ totalNumCapturedAllCorrect2++;
+ if(numTopCorrect==numCorrect){ //...and every correct site is in the top group
+ totalNumCapturedAllCorrectTop2++;
+ if(numCorrect==numSites){ //...and no incorrect sites were retained
+ totalNumCapturedAllCorrectOnly2++;
+ }
+ }
+ }
+
+ if(numSites>0){
+
+ if(r.match!=null){ //Per-base and per-read error tallies; indices follow the order M, S, D, I, N, splice as used below
+ int[] errors=r.countErrors(SamLine.INTRON_LIMIT);
+ matchCountM2+=errors[0];
+ matchCountS2+=errors[1];
+ matchCountD2+=errors[2];
+ matchCountI2+=errors[3];
+ matchCountN2+=errors[4];
+
+ readCountS2+=(errors[1]>0 ? 1 : 0);
+ readCountD2+=(errors[2]>0 ? 1 : 0);
+ readCountI2+=(errors[3]>0 ? 1 : 0);
+ readCountN2+=(errors[4]>0 ? 1 : 0);
+ readCountSplice2+=(errors[5]>0 ? 1 : 0);
+ readCountE2+=((errors[1]>0 || errors[2]>0 || errors[3]>0)? 1 : 0);
+ }
+
+ mappedRetained2++;
+ if(r.rescued()){
+ if(r.strand()==Gene.PLUS){
+ rescuedP2++;
+ }else{
+ rescuedM2++;
+ }
+ }
+
+ if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){
+ perfectMatch2++;
+ }else if(SLOW_ALIGN){
+ assert(r.topSite().slowScore<maxSwScore) : maxSwScore+"\t"+r.topSite().toText();
+ }
+
+ int foundSemi=0; //Becomes 1 if any retained site is semiperfect
+ for(SiteScore ss : r.sites){
+ if(ss.perfect){
+ perfectHitCount2++;
+ assert(ss.semiperfect);
+ }
+ if(ss.semiperfect){
+ semiPerfectHitCount2++;
+ foundSemi=1;
+ }
+ }
+ semiperfectMatch2+=foundSemi;
+ if(foundSemi>0){semiperfectMatchBases2+=len;}
+
+ if(firstElementCorrect){
+ if(r.strand()==Gene.PLUS){firstSiteCorrectP2++;}
+ else{firstSiteCorrectM2++;}
+ if(r.paired()){firstSiteCorrectPaired2++;}
+ else{firstSiteCorrectSolo2++;}
+ if(r.rescued()){firstSiteCorrectRescued2++;}
+ }else{
+ firstSiteIncorrect2++;
+// System.out.println("********");
+// System.out.println(r.toText(false));
+// System.out.println(r.mate.toText(false));
+ }
+
+ if(firstElementCorrectLoose){
+ firstSiteCorrectLoose2++;
+ }else{
+ firstSiteIncorrectLoose2++;
+ }
+
+ siteSum2+=numSites;
+ topSiteSum2+=sizeOfTopGroup;
+
+ if(topScore==maxPossibleQuickScore){perfectHit2++;}
+ if(sizeOfTopGroup==1){uniqueHit2++;}
+
+ if(numCorrect>0){
+
+ if(r.strand()==Gene.PLUS){truePositiveP2++;}
+ else{truePositiveM2++;}
+ totalCorrectSites2+=numCorrect;
+
+ if(positionOfFirstCorrect==0){
+ correctUniqueHit2++;
+ }else{
+ correctLowHit2++;
+ }
+
+ }else{
+
+ falsePositive2++;
+ // System.out.println("********");
+ // System.out.println(r.toText(false));
+ // System.out.println(r.mate.toText(false));
+ }
+ }else if(maxPossibleQuickScore==-1){ //NOTE(review): calcStatistics1 also adds the read length to a discarded-bases counter and calls r.setDiscarded(true) in this branch; confirm the omission here is intended
+ lowQualityReadsDiscarded2++;
+ }else{
+ noHit2++;
+ }
+ }
+
+ public void processRead(final Read r, final byte[] basesM){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final byte[] basesP=r.bases;
+
+// System.err.print(" rd#"+r.numericID+" ");
+// if(r.numericID==25967){
+// verbose=true;
+// msa.verbose=true;
+// GapTools.verbose=true;
+// index.verbose=true;
+// tcr.verbose=true;
+// }
+
+ if(verbose){System.err.println("\nProcessing "+r);}
+ readsUsed1++;
+
+ final int maxPossibleQuickScore=quickMap(r, basesM);
+ if(verbose){System.err.println("\nQuick Map: \t"+r.sites);}
+
+ if(maxPossibleQuickScore<0){
+ r.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=basesP.length;
+ r.setDiscarded(true);
+ return;
+ }
+ initialSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);}
+
+ int maxSwScore=0;
+ int maxImperfectSwScore=0;
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ maxSwScore=msa.maxQuality(r.length());
+ maxImperfectSwScore=msa.maxImperfectScore(r.length());
+ }
+
+ if(TRIM_LIST && r.numSites()>1){
+ if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);}
+ int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ postTrimSiteSum1+=r.numSites();
+ if(verbose){System.err.println("\nAfter trim: \t"+r.sites);}
+
+ assert(Read.CHECKSITES(r, basesM));
+
+
+ if(SLOW_ALIGN && r.numSites()>0){
+ Tools.subsumeOverlappingSites(r.sites, true, false);
+ int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore);
+
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+// int numPerfectScores=0;
+// if(numNearPerfectScores>0){
+// for(SiteScore ss : r.list){
+// if(ss.perfect){numPerfectScores++;}
+// else{break;}
+// }
+// }
+
+ if(verbose){
+ System.err.println("\nAfter scoreNoIndels: \t"+r.sites);
+ }
+
+ if(numNearPerfectScores<1){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);}
+ }
+
+ if(verbose){
+ System.err.println("\nAfter findTipDeletions: \t"+r.sites);
+ }
+
+ //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length
+ //***Above note should be resolved now, but needs to be verified.
+
+ if(numNearPerfectScores<r.sites.size()){
+ scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore);
+ }
+
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+
+ if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);}
+ assert(Read.CHECKSITES(r, basesM, false));
+ }
+
+
+ if(r.numSites()>0){
+ mapped1++;
+ try {
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ throw new RuntimeException("\n\n"+r.toText(false)+"\n\n");
+ }
+ try {
+ Tools.removeOverlappingSites(r.sites, true);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ throw new RuntimeException("\n\n"+r.toText(false)+"\n\n");
+ }
+ Collections.sort(r.sites);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+ if(verbose){System.err.println("\nAfter merge: \t"+r.sites);}
+
+ if(r.numSites()>1){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r.sites.get(1);
+ //Ensure no duplicates
+ assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false);
+ }
+ assert(Read.CHECKSITES(r, basesM));
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore);
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);}
+
+ if(r.numSites()>1){
+ final int clearzone=r.perfect() ? CLEARZONEP :
+ r.topSite().score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ final int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false);
+ r.setAmbiguous(b);
+ }
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesUnpaired2(r.sites, maxSwScore, MINIMUM_ALIGNMENT_SCORE_RATIO, EXPECTED_SITES);
+ if(r.numSites()>1){
+ SiteScore a=r.topSite();
+ SiteScore b=r.sites.get(1);
+ assert(a.score>=b.score);
+ assert(a.score>=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)) : a;
+ assert(b.score>=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)) : "\n"+a+"\t"+b+"\n"+(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ }
+ }
+ if(verbose){System.err.println("\nAfter removal: \t"+r.sites);}
+
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ assert(Read.CHECKSITES(r, basesM));
+
+ assert(r.gaps==null || r.gaps[0]==r.start && r.gaps[r.gaps.length-1]==r.stop);
+ assert(r.sites==null || r.mapScore>0) : r.sites+", "+r.mapScore+"\n"+r;
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ }
+
+ if(verbose){System.err.println("C: "+r);}
+
+ //***$
+ if(MAKE_MATCH_STRING && r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ if(r.sites.size()>1){
+ assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n";
+ }
+ int mapScore=r.mapScore;
+
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+
+ if(verbose){System.err.println("D: "+r);}
+
+ {
+ boolean firstIter=true;
+ do{//
+ if(!firstIter){
+ Collections.sort(r.sites);
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ }
+ genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true);
+ assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+// TODO: Fix this; it should never happen.
+// if(mapScore>r.mapScore){
+// System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID);
+// }
+ if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){
+ SiteScore ss=r.topSite();
+ ss.score=r.mapScore=Tools.min(ss.score, -9999);
+ ss.setSlowPairedScore(ss.score, ss.score);
+ }
+ r.topSite().score=r.topSite().slowScore;
+ firstIter=false;
+ }while(r.sites.size()>1 && r.topSite().score<r.sites.get(1).score);
+ }
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ if(verbose){System.err.println("E: "+r);}
+ }
+ }
+ if(verbose){System.err.println("\nAfter match: \t"+r.sites);}
+
+ if(r.numSites()>1){
+ assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n";
+ assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n";
+ removeDuplicateBestSites(r);
+ }
+ if(r.numSites()>0){r.topSite().match=r.match;}
+
+
+
+ if(r.sites!=null && r.mapScore<=0){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){
+ System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped.\t"+(r.match==null)+"\t"+r.mapScore+"\t"+r.topSite()+"\t"+new String(r.bases));
+ if(MSA.bandwidth>0 || MSA.bandwidthRatio>0 || MSA.flatMode){Shared.anomaly=true;}
+ }
+ r.mapScore=0;
+ r.setMapped(false);
+ r.sites=null;
+ }
+
+
+
+ //This block is to prevent an assertion from firing. Generally caused by alignment being lost during match generation.
+ //TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(BANDWIDTH<1){
+ if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ }
+ r.clearMapping();
+ }
+ assert(r.sites==null || r.mapScore>0) :
+ "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+
+ "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+
+ "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+
+ "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n";
+
+// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString();
+
+ if(USE_CLEARZONE3 && (CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP)){
+ boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3);
+ if(changed){
+ int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO);
+ if(r.mapScore<minScore){
+ r.setAmbiguous(true);
+
+ if(AMBIGUOUS_TOSS){
+ r.sites.clear();
+ r.clearSite();
+ r.setMapped(false);
+ }
+ }
+ }
+ }
+
+ if(r.ambiguous() && AMBIGUOUS_TOSS){r.sites=null; r.clearSite(); r.setMapped(false);}
+
+ if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM, maxImperfectSwScore, maxSwScore);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ msa.toLocalAlignment(r, r.topSite(), basesM, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+ assert(Read.CHECKSITES(r, basesM));
+ }
+
+ if(r.numSites()==0 || (!r.ambiguous() && r.mapScore<maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)){
+ r.clearMapping();
+ }
+ postFilterRead(r, basesM, maxImperfectSwScore, maxSwScore);
+ if(MAKE_MATCH_STRING){ensureMatchStringOnPrimary(r, basesM, maxImperfectSwScore, maxSwScore);}
+
+ if(PENALIZE_AMBIG){
+ int penalty=calcTipScorePenalty(r, maxSwScore, 7);
+ applyScorePenalty(r, penalty);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore, maxPossibleQuickScore);
+ }
+ }
+
+
+ /** Scores candidate pairings between the sites of r and r2 and raises each paired site's score to its best paired score. Returns a lower bound on the number of perfect pairs. */
+ public int pairSiteScoresInitial(Read r, Read r2, boolean trim){
+ //r, r2: the two mates; their site lists are re-sorted and rescored in place. trim: if true, the site lists are trimmed after scoring.
+ if(r.numSites()<1 || r2.numSites()<1){return 0;}
+ //PCOMP is presumably a positional ordering (the sweep below relies on ascending chrom/start) - TODO confirm.
+ SiteScore.PCOMP.sort(r.sites);
+ SiteScore.PCOMP.sort(r2.sites);
+ //Every site starts with a paired score of 0; the post-loop asserts treat 0 as "never paired".
+ for(SiteScore ss : r.sites){ss.setPairedScore(0);}
+ for(SiteScore ss : r2.sites){ss.setPairedScore(0);}
+
+// ArrayList<SiteScorePair> pairs=new ArrayList<SiteScorePair>(Tools.min(8, Tools.min(r.list.size(), r2.list.size())));
+ //Despite the names, these hold the highest pre-pairing score (ss.score) among sites that received a pairing, or -1 if none; used as the trim reference.
+ int maxPairedScore1=-1;
+ int maxPairedScore2=-1;
+
+
+// for(SiteScore ss : r.list){
+// System.out.println(ss.toText());
+// }
+
+// int i=0, j=0;
+ final int ilimit=r.sites.size()-1;
+ final int jlimit=r2.sites.size()-1;
+ final int maxReadLen=Tools.max(r.length(), r2.length());
+ //outerDistLimit is a minimum on the outer distance and innerDistLimit a maximum on the inner distance (see the test inside the loop).
+// final int outerDistLimit=MIN_PAIR_DIST+r.length()+r2.length();
+ final int outerDistLimit=(Tools.max(r.length(), r2.length())*(OUTER_DIST_MULT))/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0);
+ final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? TIP_DELETION_SEARCH_RANGE : 0);
+ final int expectedFragLength=AVERAGE_PAIR_DIST+r.length()+r2.length();
+
+ int numPerfectPairs=0;
+ //Sweep: i walks the sites of r; j is a lower bound into the sites of r2 that only ever advances.
+ for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){
+ SiteScore ss1=r.sites.get(i);
+ SiteScore ss2=r2.sites.get(j);
+ //Skip r2 sites on an earlier chromosome, or ending more than innerDistLimit before ss1 starts.
+ while(j<jlimit && (ss2.chrom<ss1.chrom || (ss2.chrom==ss1.chrom && ss1.start-ss2.stop>innerDistLimit))){
+ j++;
+ ss2=r2.sites.get(j);
+ }
+ //Try each r2 site from j onward until one is on a later chromosome or starts more than innerDistLimit past the end of ss1.
+ for(int k=j; k<=jlimit; k++){
+ ss2=r2.sites.get(k);
+
+ if(ss2.chrom>ss1.chrom){break;}
+ if(ss2.start-ss1.stop>innerDistLimit){break;}
+
+// int dist=0;
+//
+// if(ss1.start<=ss2.start){
+// dist=ss2.start-ss1.stop;
+// }else if(ss1.start>ss2.start){
+// dist=ss1.start-ss2.stop;
+// }
+
+
+// int innerdist=0;
+// int outerdist=0;
+//
+// if(ss1.start<=ss2.start){
+// innerdist=ss2.start-ss1.stop;
+// outerdist=ss2.stop-ss1.start;
+// }else if(ss1.start>ss2.start){
+// innerdist=ss1.start-ss2.stop;
+// outerdist=ss1.stop-ss2.start;
+// }
+ //innerdist: right site's start minus left site's stop. outerdist: right site's stop minus left site's start.
+ final int innerdist, outerdist;
+ //assert(!SAME_STRAND_PAIRS) : "TODO";
+ //If strands are required to be correct and the sites are on opposite strands, the plus-strand site is treated as the left one; otherwise the lower start is.
+ if(REQUIRE_CORRECT_STRANDS_PAIRS){
+ if(ss1.strand!=ss2.strand){
+ if(ss1.strand==Gene.PLUS){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+ }else{
+ if(ss1.start<=ss2.start){
+ innerdist=ss2.start-ss1.stop;
+ outerdist=ss2.stop-ss1.start;
+ }else{
+ innerdist=ss1.start-ss2.stop;
+ outerdist=ss1.stop-ss2.start;
+ }
+ }
+
+ assert(outerdist>=innerdist);
+
+ if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){
+ //strandOK: the sites' relative orientation (same strand or not) equals SAME_STRAND_PAIRS.
+ boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS);
+
+ if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){
+
+ boolean paired1=false, paired2=false;
+ //absdif is presumably the absolute difference, i.e. how far the inner distance is from AVERAGE_PAIR_DIST - TODO confirm.
+ int deviation=absdif(AVERAGE_PAIR_DIST, innerdist);
+
+ final int pairedScore1;
+ final int pairedScore2;
+ if(strandOK){
+// pairedScore1=ss1.score+ss2.score/2;
+// pairedScore2=ss2.score+ss1.score/2;
+ //Own score, plus 1, plus half the mate's score reduced in proportion to the deviation (that bonus is never less than 1).
+ pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-(((deviation)*ss2.score)/(32*expectedFragLength+100)));
+ pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-(((deviation)*ss1.score)/(32*expectedFragLength+100)));
+ }else{//e.g. a junction
+ pairedScore1=ss1.score+Tools.max(0, ss2.score/16);
+ pairedScore2=ss2.score+Tools.max(0, ss1.score/16);
+ }
+ //A pairing is recorded only if it beats the best paired score the site already has.
+ if(pairedScore1>ss1.pairedScore){
+ paired1=true;
+ ss1.setPairedScore(Tools.max(ss1.pairedScore, pairedScore1));
+ maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+ // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText());
+ }else{
+ // System.out.println(ss1.toText()+" already paired.");
+ }
+ if(pairedScore2>ss2.pairedScore){
+ paired2=true;
+ ss2.setPairedScore(Tools.max(ss2.pairedScore, pairedScore2));
+ maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+ //Counted only when this pairing improved both sites, both sites are perfect, and the distance checks pass.
+ if(paired1 && paired2 && outerdist>=maxReadLen && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){
+ numPerfectPairs++; //Lower bound. Some perfect pairs may be the same.
+ }
+
+// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1);
+// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2);
+// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1);
+// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2);
+ }
+ }
+ }
+
+ }
+
+ //Promote each paired site's score to its paired score; sites that were never paired keep their score and are expected to still have pairedScore 0.
+ //NOTE(review): a junction pairing whose bonus (mate score/16) is 0 leaves pairedScore==score, which would trip the assert below - confirm whether that is reachable.
+ for(SiteScore ss : r.sites){
+ if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+ for(SiteScore ss : r2.sites){
+ if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;}
+ else{assert(ss.pairedScore==0);}
+// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score);
+ }
+ //Optional trimming via Tools.trimSitesBelowCutoff, with a cutoff of 10% of the reference score tracked above and the configured retain limits.
+ if(trim){
+ if(numPerfectPairs>MIN_TRIM_SITES_TO_RETAIN_PAIRED){
+// System.out.print(".");
+ Collections.sort(r.sites);
+ Collections.sort(r2.sites);
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }else{
+ if(r.sites.size()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){
+ Collections.sort(r.sites);
+ Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ if(r2.sites.size()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){
+ Collections.sort(r2.sites);
+ Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ }
+ }
+
+// if(pairs.isEmpty()){return null;}
+//
+// ArrayList<SiteScore> temp=new ArrayList<SiteScore>(Tools.max(r.list.size(), r2.list.size()));
+//
+// for(SiteScore ss : r.list){
+// if(ss.score>maxPairedScore1){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.a);
+// }
+// r.list.clear();
+// r.list.addAll(temp);
+//
+// for(SiteScore ss : r2.list){
+// if(ss.score>maxPairedScore2){temp.add(ss);}
+// }
+// for(SiteScorePair ssp : pairs){
+// temp.add(ssp.b);
+// }
+// r2.list.clear();
+// r2.list.addAll(temp);
+//
+// return pairs;
+
+ return numPerfectPairs;
+ }
+
+ /** Maps a read and its mate (r.mate, must be non-null) together: quick map, initial pairing and trimming, optional slow alignment and rescue, final pairing, ambiguity handling, match strings, post-filtering and statistics. */
+ public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){
+ if(idmodulo>1 && r.numericID%idmodulo!=1){return;}
+ final Read r2=r.mate;
+ assert(r2!=null);
+ final byte[] basesP1=r.bases, basesP2=r2.bases;
+ final int len1=(basesP1==null ? 0 : basesP1.length), len2=(basesP2==null ? 0 : basesP2.length);
+ //basesM1 and basesM2 are presumably the reverse-complemented bases of r and its mate - TODO confirm against the caller.
+ readsUsed1++;
+ readsUsed2++;
+ //Fast candidate search for each read. If both return a negative value the pair is discarded as low quality (below).
+ final int maxPossibleQuickScore1=quickMap(r, basesM1);
+ final int maxPossibleQuickScore2=quickMap(r2, basesM2);
+
+ if(verbose){
+ System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate);
+ }
+
+ if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){
+ r.sites=null;
+ r2.sites=null;
+ lowQualityReadsDiscarded1++;
+ lowQualityBasesDiscarded1+=len1;
+ r.setDiscarded(true);
+ lowQualityReadsDiscarded2++;
+ lowQualityBasesDiscarded2+=len2;
+ r2.setDiscarded(true);
+ return;
+ }
+
+ //Not really needed due to subsumption
+ Tools.mergeDuplicateSites(r.sites, true, false);
+ Tools.mergeDuplicateSites(r2.sites, true, false);
+
+ initialSiteSum1+=r.numSites();
+ initialSiteSum2+=r2.numSites();
+
+ //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used.
+ //Discards need to be tracked separately for each end.
+// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;}
+ //Score ceilings reported by msa for each read length.
+ final int maxSwScore1=msa.maxQuality(len1);
+ final int maxImperfectSwScore1=msa.maxImperfectScore(len1);
+ final int maxSwScore2=msa.maxQuality(len2);
+ final int maxImperfectSwScore2=msa.maxImperfectScore(len2);
+
+ //TODO: POSSIBLY block pairing across N blocks.
+ pairSiteScoresInitial(r, r2, TRIM_LIST);
+ if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ if(TRIM_LIST){
+
+ if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){
+ if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){
+ Collections.sort(r.sites);
+ trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){
+ Collections.sort(r2.sites);
+ trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN);
+ }
+ }
+ }
+ postTrimSiteSum1+=r.numSites();
+ postTrimSiteSum2+=r2.numSites();
+
+ {//Reset score to non-paired score
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.slowScore<=ss.quickScore);
+ ss.score=ss.quickScore;
+ }
+ }
+ }
+
+ if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);}
+
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN){
+ //Read 1: subsume overlapping sites, score without indels, optionally search for tip deletions, then slow-score and merge duplicates.
+ if(r.numSites()>0){
+ Tools.subsumeOverlappingSites(r.sites, true, false);
+ int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores1<MIN_TRIM_SITES_TO_RETAIN_PAIRED){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);}
+ }
+
+ //TODO:
+ //Note scoreSlow can be skipped under this circumstance:
+ //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites.
+ scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r.sites, index.MAX_INDEL);
+ if(r.numSites()==0){r.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+ //Read 2: same steps as read 1.
+ if(r2.numSites()>0){
+ Tools.subsumeOverlappingSites(r2.sites, true, false);
+ int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores
+
+ if(numNearPerfectScores2<MIN_TRIM_SITES_TO_RETAIN_PAIRED){
+ if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);}
+ }
+
+ scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);
+ if(STRICT_MAX_INDEL){
+ int removed=removeLongIndels(r2.sites, index.MAX_INDEL);
+ if(r2.numSites()<1){r2.clearMapping();}
+ }
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+
+ if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ //Rescue: a site with pairedScore==0 has no partner. If a read has any such site, its sites are used to try to rescue the mate.
+ if(DO_RESCUE){
+ int unpaired1=0;
+ int unpaired2=0;
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired1++;}
+ }
+ }
+ if(r2.sites!=null){
+ for(SiteScore ss : r2.sites){
+ assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) :
+ "\n"+ss.toText()+"\n"+r2.toText(false)+"\n";
+ if(ss.pairedScore==0){unpaired2++;}
+ }
+ }
+
+ if(unpaired1>0 && r.numSites()>0){
+ Collections.sort(r.sites);
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r2.sites, true, true);
+ }
+
+ if(unpaired2>0 && r2.numSites()>0){
+ Collections.sort(r2.sites);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+ rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100));
+ Tools.mergeDuplicateSites(r.sites, true, true);
+ }
+
+ postRescueSiteSum1+=r.numSites();
+ postRescueSiteSum2+=r2.numSites();
+
+// if(r.list!=null){Collections.sort(r.list);}
+// if(r2.list!=null){Collections.sort(r2.list);}
+//
+// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE);
+
+ if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+ }else{
+ Tools.mergeDuplicateSites(r.sites, true, false);
+ Tools.mergeDuplicateSites(r2.sites, true, false);
+ if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ assert(Read.CHECKSITES(r, basesM1, false) && Read.CHECKSITES(r2, basesM2, false));
+ }
+ //Failures in overlap removal are rethrown unchecked with the read text; the original exception is only printed.
+ try {
+ Tools.removeOverlappingSites(r.sites, true);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ throw new RuntimeException("\n\n"+r.toText(false)+"\n\n");
+ }
+ try {
+ Tools.removeOverlappingSites(r2.sites, true);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ throw new RuntimeException("\n\n"+r2.toText(false)+"\n\n");
+ }
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+ //The if(false) block below is disabled and never executes.
+ if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!)
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED);
+ }
+
+ pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+
+ if(r.numSites()>1){Collections.sort(r.sites);}
+ if(r2.numSites()>1){Collections.sort(r2.sites);}
+ }
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+// assert(false) : "Change to removeLowQualitySitesPaired2";
+ //TODO Verify correctness
+ Tools.removeLowQualitySitesPaired2(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, EXPECTED_SITES);
+ Tools.removeLowQualitySitesPaired2(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, EXPECTED_SITES);
+ }
+
+ //TODO
+// assert(false) : "Only verified up to this point.";
+ pairSiteScoresFinal(r, r2, false, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN);
+ //A read counts as mapped if it still has at least one site; sorting puts higher scores first.
+ if(r.numSites()>0){
+ mapped1++;
+ Collections.sort(r.sites);
+ }
+ if(r2.numSites()>0){
+ mapped2++;
+ Collections.sort(r2.sites);
+ }
+ assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){
+ r.setPerfectFlag(maxSwScore1);
+ r2.setPerfectFlag(maxSwScore2);
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+ }
+
+ //Ambiguity: choose a clearzone from how close the top score is to the maximum, then ask how many sites share the top tier.
+ if(r.numSites()>1){
+ final int clearzone=r.perfect() ? CLEARZONEP :
+ r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites1=Tools.countTopScores(r.sites, clearzone);
+ if(numBestSites1>1){
+ //Ambiguous alignment
+ assert(r.sites.size()>1);
+
+ boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false);
+ r.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+ //Same ambiguity test for read 2.
+ if(r2.numSites()>1){
+ final int clearzone=r2.perfect() ? CLEARZONEP :
+ r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF) ? CLEARZONE1 :
+ (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c);
+ int numBestSites2=Tools.countTopScores(r2.sites, clearzone);
+ if(numBestSites2>1){
+ //Ambiguous alignment
+ assert(r2.sites.size()>1);
+
+ boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, false);
+ r2.setAmbiguous(b);
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+ //Mark both reads paired if their top sites can pair under the strand and distance rules.
+ if(r.numSites()>0 && r2.numSites()>0){
+ SiteScore ss1=r.topSite();
+ SiteScore ss2=r2.topSite();
+ if(canPair(ss1, ss2, len1, len2, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n"+
+ r.mapped()+", "+r.paired()+", "+r.strand()+", "+r.ambiguous()+"\n\n"+r2.mapped()+", "+r2.paired()+", "+r2.strand()+", "+r2.ambiguous()+"\n\n";
+ assert(SLOW_ALIGN ? ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) :
+ "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n";
+ r.setPaired(true);
+ r.mate.setPaired(true);
+ }
+ }
+
+ if(r.numSites()==0){r.sites=null;r.mapScore=0;}
+ if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;}
+
+ r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST);
+ if(KILL_BAD_PAIRS){//For a bad pair, keep the read with the higher score per base and clear the other's answers
+ if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){
+ int x=r.mapScore/len1;
+ int y=r2.mapScore/len2;
+ if(x>=y){
+ r2.clearAnswers(false);
+ }else{
+ r.clearAnswers(false);
+ }
+ }
+ }
+ if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);}
+
+ assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n";
+ if(MAKE_MATCH_STRING){
+ if(r.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){
+ r.match=r.topSite().match;
+ }else{
+ genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false);
+
+ if(STRICT_MAX_INDEL && r.mapped()){
+ if(hasLongIndel(r.match, index.MAX_INDEL)){
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.numSites()>0){
+ if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){
+ r2.match=r2.topSite().match;
+ }else{
+ genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false);
+
+ if(STRICT_MAX_INDEL && r2.mapped()){
+ if(hasLongIndel(r2.match, index.MAX_INDEL)){
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+ }
+ }
+// assert(Read.CHECKSITES(r2, basesM2));
+ }
+ }
+
+ assert(checkTopSite(r)); // TODO remove this
+ if(verbose){
+ System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2);
+ if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);}
+ if(r2.match!=null && r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);}
+ }
+
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r.mapScore>0 && r.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }else if(r.mapScore<=0 && r.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");}
+ Shared.anomaly=true;
+ r.clearMapping();
+ r2.setPaired(false);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause.
+ if(r2.mapScore>0 && r2.sites==null){
+ if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r2+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }else if(r2.mapScore<=0 && r2.sites!=null){
+ if(!STRICT_MAX_INDEL && !Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r2+"\n");}
+ Shared.anomaly=true;
+ r2.clearMapping();
+ r.setPaired(false);
+ }
+ //Assertions are evaluated only when enabled; their messages re-run the aligner solely to produce diagnostics.
+ assert(r.sites==null || r.mapScore>0) :
+ r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases :
+ AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+ assert(r2.sites==null || r2.mapScore>0) :
+ r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+
+ Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases :
+ AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+
+ msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n";
+
+ assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails.";
+ assert(checkTopSite(r)); // TODO remove this
+ removeDuplicateBestSites(r);
+ removeDuplicateBestSites(r2);
+ //With dynamic insert length, after more than 1000 mated pairs the expected pair distance follows the observed mean inner length.
+ if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){
+ AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated);
+ }
+ assert(checkTopSite(r)); // TODO remove this
+ if(r.ambiguous() && AMBIGUOUS_TOSS){
+ if(r.sites!=null){r.sites=null;}
+ r.clearSite();
+ r.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r.mapped() && r.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ assert(Read.CHECKSITES(r, basesM1));
+ }
+ if(r2.ambiguous() && AMBIGUOUS_TOSS){
+ if(r2.sites!=null){r2.sites=null;}
+ r2.clearSite();
+ r2.setMapped(false);
+ r.setPaired(false);
+ r2.setPaired(false);
+ }else if(r2.mapped() && r2.numSites()>1 && PRINT_SECONDARY_ALIGNMENTS){
+ ensureMatchStringsOnSiteScores(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ assert(Read.CHECKSITES(r2, basesM2));
+ }
+// assert(Read.CHECKSITES(r, basesM1) && Read.CHECKSITES(r2, basesM2));
+ //Convert the top site to a local alignment when LOCAL_ALIGN is set or the read containsXYC().
+ assert(checkTopSite(r));
+ if(r.mapped() && (LOCAL_ALIGN || r.containsXYC())){
+ final SiteScore ss=r.topSite();
+ ss.match=r.match;
+ msa.toLocalAlignment(r, ss, basesM1, r.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// System.err.println("\n\n*********\n\n"+r+"\n\n*********\n\n");
+// assert(Read.CHECKSITES(r, basesM1)); //TODO: This can fail; see bug#0001
+ }
+
+ assert(checkTopSite(r2));
+ if(r2.mapped() && (LOCAL_ALIGN || r2.containsXYC())){
+ final SiteScore ss=r2.topSite();
+ ss.match=r2.match;
+ msa.toLocalAlignment(r2, ss, basesM2, r2.containsXYC() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO);
+// assert(Read.CHECKSITES(r2, basesM2)); //TODO: This can fail; see bug#0001
+ }
+ //Post-filter both reads, guarantee a match string on each primary site, then record statistics.
+ postFilterRead(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ postFilterRead(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ if(MAKE_MATCH_STRING){
+ ensureMatchStringOnPrimary(r, basesM1, maxImperfectSwScore1, maxSwScore1);
+ ensureMatchStringOnPrimary(r2, basesM2, maxImperfectSwScore2, maxSwScore2);
+ }
+
+ if(CALC_STATISTICS){
+ calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1);
+ calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2);
+ }
+ }
+
+ /** Grades the (sorted) site list of r against the expected site: r.originalSite, or the top site when there is none. thresh is the position tolerance. Returns 11 ints:
+ * {number of correct (loose) sites, number of incorrect (loose) sites, number incorrect sites before last correct site, number of sites, correctScore, maxScore,
+ * firstElementCorrect, firstElementCorrectLoose, position of first correct element (or -1), sizeOfTopGroup, numTopCorrect} */
+ protected int[] calcCorrectnessSkimmer(Read r, int thresh){
+ //assume sorted.
+ ArrayList<SiteScore> ssl=r.sites;
+ //No sites: every counter is 0 and the position is -1. The array has 11 entries, the same layout as the normal return at the end of the method.
+ if(ssl==null || ssl.isEmpty()){
+ return new int[] {0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0};
+ }
+ //Exactly one of "no original site" and "synthetic read" must hold; reads without an original site are graded against their own top site.
+ SiteScore original=r.originalSite;
+ assert((original==null) != (r.synthetic()));
+ if(original==null){
+ original=ssl.get(0);
+ }
+
+ int numCorrect=0;
+ int numIncorrect=0;
+ int numIncorrectPrior=0; //Prior to expected number of correct sites being found (tracked, but not part of the returned array)
+ int numIncorrectPrior2=0; //Prior to last correct element actually found
+ int numIncorrectPriorTemp=0;
+
+ int correctScore=0;
+ int maxScore=ssl.get(0).score;
+
+ int positionOfFirstCorrect=-1;
+ int firstElementCorrect=0;
+ int firstElementCorrectLoose=0;
+
+ int sizeOfTopGroup=0;
+
+ int numTopCorrect=0;
+
+ for(int i=0; i<ssl.size(); i++){
+ SiteScore ss=ssl.get(i);
+ if(ss.score==ssl.get(0).score){sizeOfTopGroup++;}
+ //b: strict hit test with tolerance thresh. b2: loose hit test with tolerance thresh+120; only b2 drives the counters.
+ //NOTE(review): ss.chrom (not original.chrom) is passed as the expected chromosome, so the chromosome comparison presumably always succeeds - confirm this is intended.
+// boolean b=isCorrectHit(ss, original.chrom, original.strand, original.start, 1, thresh);
+ boolean b=isCorrectHit(ss, ss.chrom, original.strand, original.start, original.stop, thresh);
+ boolean b2=isCorrectHitLoose(ss, ss.chrom, original.strand, original.start, original.stop, thresh+120);
+ if(b){
+ if(i==0){firstElementCorrect=1;}
+ }
+ if(b2){
+ if(i==0){firstElementCorrectLoose=1;}
+ numCorrect++;
+ numIncorrectPrior2+=numIncorrectPriorTemp;
+ numIncorrectPriorTemp=0;
+
+ if(numIncorrect==0){numTopCorrect++;}
+ if(positionOfFirstCorrect<0){
+ positionOfFirstCorrect=i;
+ correctScore=ss.score;
+ }
+ }else{
+ numIncorrect++;
+ numIncorrectPriorTemp++;
+ if(numCorrect<EXPECTED_SITES){
+ numIncorrectPrior++;
+ }
+ }
+ }
+
+ assert(sizeOfTopGroup>0 && sizeOfTopGroup<=ssl.size());
+ return new int[] {numCorrect, numIncorrect, numIncorrectPrior2,
+ ssl.size(), correctScore, maxScore, firstElementCorrect, firstElementCorrectLoose, positionOfFirstCorrect, sizeOfTopGroup, numTopCorrect};
+ }
+
+
+}
diff --git a/current/align2/BBSplitter.java b/current/align2/BBSplitter.java
new file mode 100755
index 0000000..f1aeba3
--- /dev/null
+++ b/current/align2/BBSplitter.java
@@ -0,0 +1,1225 @@
+package align2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SiteScore;
+
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 19, 2013
+ *
+ */
+public class BBSplitter {
+
+ /**
+  * Program entry point. Merges the requested references via processArgs,
+  * reports how long that took, then hands the rewritten arguments to the
+  * mapper selected by MAP_MODE.
+  * @param args Command-line arguments
+  * @throws RuntimeException if MAP_MODE holds a value with no matching mapper
+  */
+ public static void main(String[] args){
+     if(Shared.COMMAND_LINE==null){
+         Shared.COMMAND_LINE=(args==null ? null : args.clone());
+         Shared.BBMAP_CLASS="BBSplitter";
+     }
+     Timer t=new Timer();
+     String[] margs=processArgs(args);
+     ReadWrite.waitForWritingToFinish();
+     t.stop();
+     Data.sysout.println("Ref merge time: \t"+t);
+     Data.scaffoldPrefixes=true;
+     if(MAP_MODE==MAP_NORMAL){
+         BBMap.main(margs);
+     }else if(MAP_MODE==MAP_ACC){
+         BBMapAcc.main(margs);
+     }else if(MAP_MODE==MAP_PACBIO){
+         BBMapPacBio.main(margs);
+     }else if(MAP_MODE==MAP_PACBIOSKIMMER){
+         BBMapPacBioSkimmer.main(margs);
+     }else{
+         //Name the offending value so a bad mode is diagnosable from the stack trace alone
+         throw new RuntimeException("Unknown map mode: "+MAP_MODE);
+     }
+ }
+
+ /**
+  * Extracts the BBSplitter-specific arguments, merges the named reference sets into one
+  * reference file, and builds the argument list to forward to the mapper.
+  * Arguments of the form ref_NAME=files (plus the aliases blacklist, whitelist, ref and
+  * reference) define reference sets; basename/pattern defines per-set output names.
+  * Side effects: may redirect Data.sysout to stderr, and sets ReadWrite.ZIPLEVEL,
+  * Shared.FASTA_WRAP, ReadStats.append and this class's append/overwrite/verbose/forceRebuild flags.
+  * @param args Raw command-line arguments
+  * @return Unconsumed arguments, followed by "ref=" plus the merged reference, followed by any out_NAME=file entries
+  */
+ public static String[] processArgs(String[] args){
+     for(String s : args){
+         if(s.endsWith("=stdout") || s.contains("=stdout.")){Data.sysout=System.err;}
+     }
+     Data.sysout.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+     String[] oldargs=args;
+     args=remakeArgs(args);
+     if(args!=oldargs){
+         Data.sysout.println("Converted arguments to "+Arrays.toString(args));
+     }
+
+     ReadWrite.ZIPLEVEL=2;
+
+     Timer t=new Timer();
+
+     int build=1;
+
+     LinkedHashSet<String> nameSet=new LinkedHashSet<String>();
+     HashMap<String, LinkedHashSet<String>> table=new HashMap<String, LinkedHashSet<String>>();
+
+     ArrayList<String> unparsed=new ArrayList<String>();
+
+     String basename=null;
+
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a0=split[0];
+         String a=a0.toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if(b!=null && b.equalsIgnoreCase("null")){b=null;}
+
+         //Normalize the aliases so that every reference-set key has the form ref_NAME
+         if(a.equals("blacklist") || a.equals("ref_blacklist")){a="ref_blacklist";}
+         if(a.equals("whitelist") || a.equals("ref_whitelist")){a="ref_whitelist";}
+         if(a.equals("ref") || a.equals("reference")){a="ref_ref";}
+
+         if(b!=null && (a.startsWith("ref_"))){
+             //The raw key only carries the set name (with its original case) when the user typed
+             //the "ref_" prefix. For an alias such as "blacklist", slicing the raw key would yield
+             //"klist", so the name must come from the normalized key instead.
+             String setName=(a0.regionMatches(true, 0, "ref_", 0, 4) ? a0.substring(4) : a.substring(4));
+             if(setName.indexOf(',')>=0){setName=setName.replace(',', '_');}
+             if(setName.indexOf('$')>=0){setName=setName.replace('$', '_');}
+             nameSet.add(setName);
+             if(!table.containsKey(setName)){table.put(setName, new LinkedHashSet<String>());}
+             LinkedHashSet<String> set=table.get(setName);
+
+             File f=new File(b);
+             if(f.exists()){
+                 //The whole value names one existing file (which may itself contain commas)
+                 try {
+                     set.add(f.getCanonicalPath());
+                 } catch (IOException e) {
+                     e.printStackTrace();
+                 }
+             }else{
+                 //Otherwise treat the value as a comma-delimited list of files
+                 for(String x : b.split(",")){
+                     f=new File(x);
+                     if(f.exists()){
+                         try {
+                             set.add(f.getCanonicalPath());
+                         } catch (IOException e) {
+                             e.printStackTrace();
+                         }
+                     }else{
+                         assert(x.startsWith("stdin")) : "Can't find file "+x;
+                         set.add(x);
+                     }
+                 }
+             }
+         }else{
+             //Arguments added to 'unparsed' are forwarded to the mapper; the rest are consumed here
+             if(a.startsWith("-xmx") || a.startsWith("-xms")){
+                 //jvm argument; do nothing
+             }else if(a.equals("path") || a.equals("root")){
+                 Data.setPath(b);
+             }else if(a.equals("ziplevel") || a.equals("zl")){
+                 ReadWrite.ZIPLEVEL=Integer.parseInt(b);
+                 unparsed.add(args[i]);
+             }else if(a.equals("build")){
+                 build=Integer.parseInt(b);
+                 unparsed.add(args[i]);
+             }else if(a.equals("basename") || a.equals("pattern")){
+                 basename=b;
+                 assert(b==null || (b.indexOf('%')>=0 && (b.indexOf('%')<b.lastIndexOf('.')))) :
+                     "basename must contain a '%' symbol prior to file extension.";
+             }else if(a.equals("append") || a.equals("app")){
+                 append=ReadStats.append=Tools.parseBoolean(b);
+                 unparsed.add(args[i]);
+             }else if(a.equals("overwrite") || a.equals("ow")){
+                 overwrite=Tools.parseBoolean(b);
+                 unparsed.add(args[i]);
+             }else if(a.equals("verbose")){
+                 verbose=Tools.parseBoolean(b);
+                 unparsed.add(args[i]);
+             }else if(a.equals("rebuild")){
+                 forceRebuild=Tools.parseBoolean(b);
+                 unparsed.add(args[i]);
+             }else if(a.equals("fastawrap")){
+                 Shared.FASTA_WRAP=Integer.parseInt(b);
+             }else{
+                 unparsed.add(args[i]);
+             }
+         }
+     }
+
+     String refname=mergeReferences(nameSet, table, build);
+     ArrayList<String> outnames=gatherLists(nameSet, basename);
+     unparsed.add("ref="+refname);
+
+     //Concatenate the forwarded arguments and the generated out_NAME arguments
+     String[] margs=new String[unparsed.size()+(outnames==null ? 0 : outnames.size())];
+     int idx=0;
+     for(int i=0; i<unparsed.size(); i++){
+         margs[idx]=unparsed.get(i);
+         idx++;
+     }
+     if(outnames!=null){
+         for(int i=0; i<outnames.size(); i++){
+             margs[idx]=outnames.get(i);
+             idx++;
+         }
+     }
+
+     return margs;
+ }
+
+
+ public static String[] remakeArgs(String[] args){
+
+ LinkedHashSet<String> set=new LinkedHashSet<String>();
+ HashMap<String,LinkedHashSet<String>> map=new HashMap<String,LinkedHashSet<String>>();
+ int removed=0;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("mapmode") && b!=null){
+ args[i]=null;
+ removed++;
+ if(b.equalsIgnoreCase("normal")){MAP_MODE=MAP_NORMAL;}
+ else if(b.equalsIgnoreCase("accurate") || b.equalsIgnoreCase("acc")){MAP_MODE=MAP_ACC;}
+ else if(b.equalsIgnoreCase("pacbio") || b.equalsIgnoreCase("pb") || b.equalsIgnoreCase("bp")){MAP_MODE=MAP_PACBIO;}
+ else if(b.equalsIgnoreCase("pacbioskimmer") || b.equalsIgnoreCase("pbs") || b.equalsIgnoreCase("bps")){MAP_MODE=MAP_PACBIOSKIMMER;}
+ else{throw new RuntimeException("Unknown mode: "+b);}
+ }else if(a.equals("ref") && b!=null){
+ args[i]=null;
+ removed++;
+ processRef(b, set, map);
+ }
+ }
+ if(set.isEmpty() && removed==0){return args;}
+ if(MAP_MODE==MAP_ACC){removed--;}
+ String[] args2=new String[args.length+set.size()-removed];
+
+ int i=0, j=0;
+ if(MAP_MODE==MAP_ACC){
+ args2[j]="minratio=0.4"; //Increase sensitivity in accurate mode
+ j++;
+ }
+ for(; i<args.length; i++){
+ if(args[i]!=null){
+ args2[j]=args[i];
+ j++;
+ }
+ }
+ for(String key : set){
+ LinkedHashSet<String> list=map.get(key);
+ StringBuilder sb=new StringBuilder(200);
+ sb.append("ref_"+key+"=");
+ String comma="";
+ for(String s : list){
+ sb.append(comma);
+ sb.append(s);
+ comma=",";
+ }
+ args2[j]=sb.toString();
+ j++;
+ }
+ return args2;
+ }
+
+ private static void processRef(String b, LinkedHashSet<String> set, HashMap<String,LinkedHashSet<String>> map){
+
+ ArrayList<String> files=Tools.getFileOrFiles(b, null, true, false, false, false);
+ for(String file : files){
+ String name=file.replace('\\', '/');
+ int x=name.lastIndexOf('/');
+ if(x>=0){name=name.substring(x+1);}
+ while(name.endsWith(".zip") || name.endsWith(".bz2") || name.endsWith(".gz") || name.endsWith(".gzip")){
+ name=name.substring(0, name.lastIndexOf('.'));
+ }
+ if(name.lastIndexOf('.')>=0){
+ name=name.substring(0, name.lastIndexOf('.'));
+ }
+ set.add(name);
+ LinkedHashSet<String> list=map.get(name);
+ if(list==null){
+ list=new LinkedHashSet<String>();
+ map.put(name, list);
+ }
+ list.add(file);
+ }
+ }
+
+ /**
+  * Builds one "out_NAME=FILE" argument per reference set, where FILE is the basename
+  * pattern with its first '%' replaced by the set name.
+  * The name is substituted literally, so names containing '$' or '\' are safe.
+  * @param nameSet Reference set names, in output order
+  * @param basename Output pattern containing a '%' placeholder, or null for no per-set outputs
+  * @return The generated arguments, or null if basename is null
+  */
+ public static ArrayList<String> gatherLists(LinkedHashSet<String> nameSet, String basename){
+     if(basename==null){return null;}
+     final int marker=basename.indexOf('%');
+     final ArrayList<String> args=new ArrayList<String>(nameSet.size());
+     for(String name : nameSet){
+         //Splice by index rather than String.replaceFirst, which would interpret
+         //'$' and '\' in the set name as regex replacement syntax
+         final String fname=(marker<0 ? basename : basename.substring(0, marker)+name+basename.substring(marker+1));
+         args.add("out_"+name+"="+fname);
+     }
+     return args;
+ }
+
+
+ public static String mergeReferences(LinkedHashSet<String> nameSet, HashMap<String, LinkedHashSet<String>> nameToFileTable, int build){
+ LinkedHashSet<String> fnames=new LinkedHashSet<String>();
+// nameSet.remove("blacklist");
+// nameSet.remove("whitelist");
+ addNames(fnames, nameToFileTable, "whitelist");
+ for(String s : nameSet){
+ if(!s.equalsIgnoreCase("blacklist") && !s.equalsIgnoreCase("whitelist")){
+ addNames(fnames, nameToFileTable, s);
+ }
+ }
+ addNames(fnames, nameToFileTable, "blacklist");
+
+ final HashMap<String, LinkedHashSet<String>> fileToNameTable=new HashMap<String, LinkedHashSet<String>>();
+ for(String name : nameSet){
+ LinkedHashSet<String> files=nameToFileTable.get(name);
+ if(files!=null){
+ for(String f : files){
+ LinkedHashSet<String> names=fileToNameTable.get(f);
+ if(names==null){
+ names=new LinkedHashSet<String>();
+ fileToNameTable.put(f, names);
+ }
+ names.add(name);
+ }
+ }
+ }
+
+ final String root=Data.ROOT_GENOME+build;
+ {
+ File f=new File(root);
+ if(!f.exists()){f.mkdirs();}
+ }
+
+ {
+ final String reflist=root+"/reflist.txt";
+ final String namelist=root+"/namelist.txt";
+ final boolean reflistExists=new File(reflist).exists();
+ boolean writeReflist=false;
+ String[] oldrefs=null;
+ String[] oldnames=null;
+ if(reflistExists){
+ TextFile tf=new TextFile(reflist, false, false);
+ oldrefs=tf.toStringLines();
+ tf.close();
+
+ tf=new TextFile(namelist, false, false);
+ oldnames=tf.toStringLines();
+ tf.close();
+ }
+ if(fnames.size()>0){
+ writeReflist=true;
+ ArrayList<String> fl=new ArrayList<String>(fnames.size());
+ fl.addAll(fnames);
+ ArrayList<String> nl=new ArrayList<String>(nameSet.size());
+ nl.addAll(nameSet);
+ //TODO: Compare old to new
+ }else{
+ assert(oldrefs!=null) : "No reference specified, and none exists. Please regenerate the index.";
+ for(String s : oldrefs){fnames.add(s);}
+
+ assert(oldnames!=null) : "No reference specified, and none exists. Please regenerate the index.";
+ for(String s : oldnames){nameSet.add(s);}
+
+ writeReflist=false;
+ }
+ if(writeReflist){
+ {
+// assert(false) : fnames;
+// assert(fnames.size()>0);
+ TextStreamWriter tsw=new TextStreamWriter(reflist, overwrite, append, false);
+ tsw.start();
+ for(String s : fnames){tsw.println(s);}
+ tsw.poisonAndWait();
+ assert(new File(reflist).exists()) : reflist+".exists? "+new File(reflist).exists();
+ }
+ {
+// assert(nameSet.size()>0);
+ TextStreamWriter tsw=new TextStreamWriter(namelist, overwrite, append, false);
+ tsw.start();
+ for(String s : nameSet){tsw.println(s);}
+ tsw.poisonAndWait();
+ }
+ }
+ }
+
+ if(fnames.size()<1){
+ assert(false) : "No references specified." +
+ "\nTODO: This is really annoying; I need to include reference names in some auxillary file.";
+ return null;
+ }else if(fnames.size()==1){
+// Data.sysout.println("Only one reference file; skipping merge.");
+// String refname=fnames.iterator().next();
+// return refname;
+ }
+
+ long key=0;
+ for(String s : nameSet){
+ key=Long.rotateLeft(key, 21);
+ key=key^s.hashCode();
+// System.err.println("Hashed nameSet "+nameSet+" -> "+key);
+ }
+ key=(key&Long.MAX_VALUE);
+ String refname0="merged_ref_"+key+".fa.gz";
+ String refname=root+"/"+refname0;
+
+ if(!forceRebuild){
+ File f=new File(refname);
+ if(f.exists()){
+ // Data.sysout.println("Merged reference file /ref/genome/"+build+"/"+refname0+" already exists; skipping merge.");
+ Data.sysout.println("Merged reference file "+refname+" already exists; skipping merge.");
+ return refname;
+ }
+// else{
+// f=new File(root);
+// if(!f.exists()){f.mkdirs();}
+// }
+ }
+ // Data.sysout.println("Creating merged reference file /ref/genome/"+build+"/"+refname0);
+ Data.sysout.println("Creating merged reference file "+refname);
+
+ TextStreamWriter tsw=new TextStreamWriter(refname, overwrite || forceRebuild, false, true);
+ tsw.start();
+ for(String fname : fnames){
+ TextFile tf=new TextFile(fname, false, false);
+ LinkedHashSet<String> listnames=fileToNameTable.get(fname);
+// assert(false) : "\n\n"+fname+"\n\n"+listnames+"\n\n"+fileToNameTable+"\n\n"+nameSet+"\n\n"+nameToFileTable+"\n\n";
+ String prefix=null;
+ {
+ StringBuilder sb=new StringBuilder(100);
+ sb.append('>');
+ if(listnames!=null){
+ String sep="";
+ for(String s : listnames){
+ sb.append(sep);
+ sb.append(s);
+ sep=",";
+ }
+ }
+ sb.append('$');
+ prefix=sb.toString();
+ }
+// assert(false) : prefix;
+// System.err.println(prefix);
+ for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+ if(prefix!=null && line.charAt(0)=='>'){
+
+ tsw.print(prefix);
+ tsw.println(line.substring(1));
+ }else{
+ tsw.println(line);
+ }
+ }
+ tf.close();
+ }
+ tsw.poison();
+ tsw.waitForFinish();
+
+ return refname;
+ }
+
+ /** Returns the set of scaffold name prefixes or suffixes.
+ *
+ * @param prefix True to return prefixes (set names), false to return suffixes (scaffold names)
+ * @return
+ */
+ public static HashSet<String> getScaffoldAffixes(boolean getPrefixes){
+ final byte[][][] b3=Data.scaffoldNames;
+
+ int size=(int)Tools.min((10+Data.numScaffolds*3)/2, Integer.MAX_VALUE);
+ HashSet<String> set=new HashSet<String>(size);
+
+ assert(b3!=null);
+ for(byte[][] b2 : b3){
+ if(b2!=null){
+ for(byte[] bname : b2){
+ if(bname!=null){
+ int idx=Tools.indexOf(bname, (byte)'$');
+ String prefix=null, suffix=null;
+ if(idx>=0){
+ if(getPrefixes){prefix=new String(bname, 0, idx);}
+ else{suffix=new String(bname, idx+1, bname.length-idx-1);}
+ }else{
+ if(!getPrefixes){suffix=new String(bname);}
+ }
+
+ if(getPrefixes){
+ if(prefix!=null){
+ if(prefix.indexOf(',')>=0){
+ for(String s : prefix.split(",")){
+ set.add(s);
+ }
+ }else{
+ set.add(prefix);
+ }
+ }
+ }else{
+ if(suffix!=null){
+ set.add(suffix);
+ }
+ }
+ }
+ }
+ }
+ }
+ return set;
+ }
+
+ /**
+  * Opens and starts one output stream for every "out_NAME=file" argument.
+  * A '#' in a non-sam/bam file name is expanded to '1' and '2' to name the two files of a pair.
+  * Arguments whose value is missing or "null" are skipped.
+  * Side effect: sets AbstractMapThread.OUTPUT_SAM if any stream writes sam or bam.
+  * @param args Arguments to scan
+  * @param OUTPUT_READS Not used by this method
+  * @param OUTPUT_ORDERED_READS Passed to FileFormat.testOutput
+  * @param buff Passed to ConcurrentReadOutputStream.getStream
+  * @param paired True to also open the second file of each pair
+  * @param overwrite_ Passed to FileFormat.testOutput
+  * @param append_ Passed to FileFormat.testOutput
+  * @param ambiguous True to prepend "AMBIGUOUS_" to each file name (after any directory part)
+  * @return Map of set name to started stream, or null if no streams were created
+  */
+ public static synchronized HashMap<String, ConcurrentReadOutputStream> makeOutputStreams(String[] args, boolean OUTPUT_READS, boolean OUTPUT_ORDERED_READS,
+         int buff, boolean paired, boolean overwrite_, boolean append_, boolean ambiguous){
+     HashMap<String, ConcurrentReadOutputStream> table=new HashMap<String, ConcurrentReadOutputStream>();
+     for(String arg : args){
+         String[] split=arg.split("=");
+         String a=split[0];
+         String b=split.length>1 ? split[1] : null;
+         if(b!=null && b.equalsIgnoreCase("null")){b=null;}
+
+         if(arg.indexOf('=')>0 && a.toLowerCase().startsWith("out_")){
+             //No destination for this set; the file-name handling below would dereference null
+             if(b==null){continue;}
+
+             String name=a.substring(4).replace('\\', '/');
+
+             final String fname1, fname2;
+
+             if(ambiguous){
+                 if(b.indexOf('/')>=0){
+                     int x=b.lastIndexOf('/');
+                     b=b.substring(0, x+1)+"AMBIGUOUS_"+b.substring(x+1);
+                 }else{
+                     b="AMBIGUOUS_"+b;
+                 }
+             }
+
+             if(!FileFormat.hasSamOrBamExtension(b) && ReadWrite.stripExtension(b).contains("#")){
+                 fname1=b.replace('#', '1');
+                 fname2=b.replace('#', '2');
+             }else{
+                 fname1=b;
+                 fname2=null;
+             }
+
+             FileFormat ff1=FileFormat.testOutput(fname1, FileFormat.SAM, null, true, overwrite_, append_, OUTPUT_ORDERED_READS);
+             FileFormat ff2=paired ? FileFormat.testOutput(fname2, FileFormat.SAM, null, true, overwrite_, append_, OUTPUT_ORDERED_READS) : null;
+             ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, false);
+             ros.start();
+             table.put(name, ros);
+             AbstractMapThread.OUTPUT_SAM|=ff1.samOrBam();
+         }
+     }
+     return table.isEmpty() ? null : table;
+ }
+
+
+ /**
+  * Creates the static per-set counter table, with one zeroed SetCount for each
+  * reference set name found in the scaffold names. Must only be called once.
+  * @return The newly created setCountTable
+  */
+ public static synchronized LinkedHashMap<String, SetCount> makeSetCountTable(){
+     assert(setCountTable==null);
+     final HashSet<String> setNames=getScaffoldAffixes(true);
+     setCountTable=new LinkedHashMap<String, SetCount>();
+     for(final String setName : setNames){
+         final SetCount counter=new SetCount(setName);
+         setCountTable.put(setName, counter);
+     }
+     return setCountTable;
+ }
+
+
+ /**
+  * Creates the static per-scaffold counter table, with one zeroed SetCount for each
+  * scaffold name (set prefix removed). Must only be called once.
+  * @return The newly created scafCountTable
+  */
+ public static synchronized LinkedHashMap<String, SetCount> makeScafCountTable(){
+     assert(scafCountTable==null);
+     final HashSet<String> scafNames=getScaffoldAffixes(false);
+     scafCountTable=new LinkedHashMap<String, SetCount>();
+     for(final String scafName : scafNames){
+         final SetCount counter=new SetCount(scafName);
+         scafCountTable.put(scafName, counter);
+     }
+     return scafCountTable;
+ }
+
+
+ /**
+  * Sends each read to the output stream of every reference set it (or its mate) maps to.
+  * When clearzone is non-negative or set/scaffold statistics are being tracked, the work
+  * is delegated to printReadsAndProcessAmbiguous instead.
+  * @param readlist List of reads to print
+  * @param listID ID of read list, from ReadInputStream
+  * @param splitTable A temporary structure to hold sets of reads that go to the different output streams;
+  * if null a private one is used, otherwise it is cleared before returning
+  * @param clearzone Min distance between best and next-best site to be considered unambiguous
+  */
+ public static void printReads(ArrayList<Read> readlist, long listID, HashMap<String, ArrayList<Read>> splitTable, int clearzone){
+     final boolean needsAmbiguityPass=(clearzone>=0 || TRACK_SET_STATS || TRACK_SCAF_STATS);
+     if(needsAmbiguityPass){
+         printReadsAndProcessAmbiguous(readlist, listID, splitTable, null, clearzone);
+         return;
+     }
+     assert((streamTable!=null && streamTable.size()>0) || (setCountTable!=null && setCountTable.size()>0) || (scafCountTable!=null && scafCountTable.size()>0));
+
+     final boolean callerSuppliedTable=(splitTable!=null);
+     if(!callerSuppliedTable){
+         splitTable=new HashMap<String, ArrayList<Read>>();
+     }
+
+     //Bucket the reads by set name; the scratch set is emptied after every read
+     HashSet<String> listNames=new HashSet<String>(8);
+     for(Read r : readlist){
+         if(r==null){continue;}
+         listNames=toListNames(r, listNames);
+         for(String listName : listNames){
+             ArrayList<Read> bucket=splitTable.get(listName);
+             if(bucket==null){
+                 bucket=new ArrayList<Read>();
+                 splitTable.put(listName, bucket);
+             }
+             bucket.add(r);
+         }
+         listNames.clear();
+     }
+
+     //Every stream receives a list for this listID, even if it is the shared empty one
+     for(String streamName : streamTable.keySet()){
+         final ArrayList<Read> bucket=splitTable.get(streamName);
+         final ConcurrentReadOutputStream tros=streamTable.get(streamName);
+         tros.add(bucket==null ? blank : bucket, listID);
+     }
+     if(callerSuppliedTable){splitTable.clear();}
+ }
+
+
+ /**
+ * Routes each read to per-set output lists, treating reads whose qualifying sites span
+ * different reference sets as ambiguous and handling them according to AMBIGUOUS2_MODE.
+ * Also updates setCountTable and scafCountTable when those tables exist.
+ * @param readlist List of reads to print
+ * @param listID ID of read list, from ReadInputStream
+ * @param splitTable A temporary structure to hold sets of reads that go to the different output streams
+ * @param splitTableA Same as splitTable, but for the ambiguous output streams
+ * @param clearzone Min distance between best and next-best site to be considered unambiguous
+ */
+ public static void printReadsAndProcessAmbiguous(ArrayList<Read> readlist, long listID, HashMap<String, ArrayList<Read>> splitTable,
+ HashMap<String, ArrayList<Read>> splitTableA, int clearzone){
+ assert(clearzone>=0 || TRACK_SET_STATS || TRACK_SCAF_STATS);
+ assert((streamTable!=null && streamTable.size()>0) || (setCountTable!=null && setCountTable.size()>0) || (scafCountTable!=null && scafCountTable.size()>0));
+ //Tables are cleared at the end only if their stream table exists and they were not allocated here
+ boolean clear=streamTable!=null, clearA=streamTableAmbiguous!=null;
+ if(splitTable==null && streamTable!=null){
+ splitTable=new HashMap<String, ArrayList<Read>>();
+ clear=false;
+ }
+ if(splitTableA==null && streamTableAmbiguous!=null){
+ splitTableA=new HashMap<String, ArrayList<Read>>();
+ clearA=false;
+ }
+
+ //Scratch sets, allocated once and reused for every read.
+ //hssa is the array handed to getSets: {primary1, other1, primary2, other2}.
+ //hss0 is the scratch set for addToScafCounts; hsspr and hssam back primarySet and ambigSet.
+ final HashSet<String> hss0, hss1, hss2, hss3, hsspr, hssam;
+ final HashSet<String>[] hssa;
+ if(TRACK_SET_STATS || streamTable!=null){
+ hss0=new HashSet<String>(16);
+ hss1=new HashSet<String>(16);
+ hss2=new HashSet<String>(16);
+ hss3=new HashSet<String>(16);
+ hsspr=new HashSet<String>(16);
+ hssam=new HashSet<String>(16);
+ hssa=(HashSet<String>[])new HashSet[] {hss0, hss1, hss2, hss3};
+ }else if(TRACK_SCAF_STATS){
+ hss0=new HashSet<String>(16);
+ hss1=null; hss2=null; hss3=null; hsspr=null; hssam=null; hssa=null;
+ }else{
+ hss0=null; hss1=null; hss2=null; hss3=null; hsspr=null; hssam=null; hssa=null;
+ }
+
+ for(final Read r1 : readlist){
+// System.out.println("\nProcessing read "+r1.numericID);
+ final Read r2=r1==null ? null : r1.mate;
+
+ //addToScafCounts clears hss0 when it used it, so hss0 is empty again before getSets receives it via hssa
+ if(r1!=null){addToScafCounts(r1, clearzone, hss0);} //Scafstats for read 1
+ if(r2!=null){addToScafCounts(r2, clearzone, hss0);} //Scafstats for read 2
+
+ if(r1!=null){
+
+ //sets is null when neither read is mapped (see getSets); such reads are not routed or counted
+ final HashSet<String>[] sets=(TRACK_SET_STATS || streamTable!=null) ? getSets(r1, clearzone, hssa) : null;
+ boolean ambiguous=false;
+ if(sets!=null){
+ //p = set names of the top site, s = set names of the other sites within clearzone; 1/2 = read/mate.
+ //Empty sets are represented as null from here on.
+ final HashSet<String> p1=(sets[0].isEmpty() ? null : sets[0]), s1=(sets[1].isEmpty() ? null : sets[1]),
+ p2=(sets[2].isEmpty() ? null : sets[2]), s2=(sets[3].isEmpty() ? null : sets[3]);
+ assert(sets==hssa);
+// assert(p1!=null);
+// assert(s1!=null);
+// assert(p2!=null);
+// assert(s2!=null);
+
+ //Ambiguous if the mates' top sites disagree on sets, or a secondary site names a set absent from its read's top site
+ if(p1!=null && p2!=null && !p1.equals(p2)){ambiguous=true;}
+ else if(p1!=null && s1!=null && !p1.containsAll(s1)){ambiguous=true;}
+ else if(p2!=null && s2!=null && !p2.containsAll(s2)){ambiguous=true;}
+
+// System.out.println("\nambiguous="+ambiguous);
+// System.out.println(p1);
+// System.out.println(s1);
+
+ HashSet<String> primarySet=hsspr, ambigSet=hssam;
+ primarySet.clear();
+ ambigSet.clear();
+ if(AMBIGUOUS2_MODE==AMBIGUOUS2_FIRST || AMBIGUOUS2_MODE==AMBIGUOUS2_UNSET){//pick one
+ //Use the top-site sets of whichever read has the higher mapScore (read 1 on ties)
+ if(r2==null || r1.mapScore>=r2.mapScore){
+ if(p1!=null){primarySet.addAll(p1);}
+ }else{
+ if(p2!=null){primarySet.addAll(p2);}
+ }
+ }else{//merge
+ if(p1!=null){primarySet.addAll(p1);}
+ if(p2!=null){primarySet.addAll(p2);}
+ }
+
+
+ if(ambiguous){
+ if(AMBIGUOUS2_MODE==AMBIGUOUS2_SPLIT){
+ //All candidate sets go to the ambiguous outputs only; ambigSet now aliases hsspr
+ if(primarySet!=null && s1!=null){primarySet.addAll(s1);}
+ if(primarySet!=null && s2!=null){primarySet.addAll(s2);}
+ ambigSet=primarySet;
+ primarySet=null;
+ }else if(AMBIGUOUS2_MODE==AMBIGUOUS2_ALL){
+ //All candidate sets go to the normal outputs
+ if(primarySet!=null && s1!=null){primarySet.addAll(s1);}
+ if(primarySet!=null && s2!=null){primarySet.addAll(s2);}
+ ambigSet=null;
+ }else if(AMBIGUOUS2_MODE==AMBIGUOUS2_RANDOM){
+ throw new RuntimeException("AMBIGUOUS2_RANDOM: Not yet implemented.");
+ }else if(AMBIGUOUS2_MODE==AMBIGUOUS2_TOSS){
+ //The read goes to no normal output
+ primarySet=null;
+ }
+ }
+
+ //Only r1 is added to the lists; r2 is reachable from it as r1.mate
+ if(primarySet!=null && splitTable!=null){
+ for(String s : primarySet){
+ ArrayList<Read> alr=splitTable.get(s);
+ if(alr==null){
+ alr=new ArrayList<Read>();
+ splitTable.put(s, alr);
+ }
+ alr.add(r1);
+ }
+ }
+
+ if(ambigSet!=null && splitTableA!=null){
+ for(String s : ambigSet){
+ ArrayList<Read> alr=splitTableA.get(s);
+ if(alr==null){
+ alr=new ArrayList<Read>();
+ splitTableA.put(s, alr);
+ }
+ alr.add(r1);
+ }
+ }
+
+ if(setCountTable!=null){
+
+ //Statistics ignore AMBIGUOUS2_MODE: hsspr is rebuilt from both top-site sets,
+ //plus the secondary sets when the read is ambiguous
+ primarySet=hsspr;
+ primarySet.clear();
+ if(p1!=null){primarySet.addAll(p1);}
+ if(p2!=null){primarySet.addAll(p2);}
+ if(ambiguous){
+ if(s1!=null){primarySet.addAll(s1);}
+ if(s2!=null){primarySet.addAll(s2);}
+ }
+ // System.out.println(primarySet);
+ //Reads and bases are counted for the pair as a whole
+ final int incrR=1+(r2==null ? 0 : 1);
+ final int incrB=r1.length()+(r1.mateLength());
+
+ for(String s : primarySet){
+ SetCount sc=setCountTable.get(s);
+ assert(sc!=null) : s;
+ if(ambiguous){
+ synchronized(sc){
+ // System.out.println("Incrementing set "+sc);
+ sc.ambiguousReads+=incrR;
+ sc.ambiguousBases+=incrB;
+ }
+ }else{
+ synchronized(sc){
+ // System.out.println("Incrementing set "+sc);
+ sc.mappedReads+=incrR;
+ sc.mappedBases+=incrB;
+ }
+ }
+ }
+ }
+ //Empty the scratch sets so getSets can reuse them for the next read
+ for(HashSet<String> set : sets){set.clear();}
+ }
+ }
+ }
+ //Every stream receives a list for this listID; 'blank' stands in when a set got no reads
+ if(streamTable!=null){
+ for(String s : streamTable.keySet()){
+// System.err.println("Searching for "+s+" in "+splitTable.keySet());
+// System.err.println(splitTable.containsKey(s));
+ ArrayList<Read> alr=splitTable.get(s);
+// System.err.println("Adding alr "+alr+"\n");
+ if(alr==null){alr=blank;}
+ ConcurrentReadOutputStream tros=streamTable.get(s);
+ tros.add(alr, listID);
+ }
+ }
+ if(streamTableAmbiguous!=null){
+ for(String s : streamTableAmbiguous.keySet()){
+ ArrayList<Read> alr=splitTableA.get(s);
+ if(alr==null){alr=blank;}
+ ConcurrentReadOutputStream tros=streamTableAmbiguous.get(s);
+ tros.add(alr, listID);
+ }
+ }
+ if(clear){splitTable.clear();}
+ if(clearA){splitTableA.clear();}
+ }
+
+ private static void addToScafCounts(Read r, int clearzone, HashSet<String> hss0){
+ assert((scafCountTable!=null)==TRACK_SCAF_STATS) : TRACK_SCAF_STATS;
+ if(scafCountTable!=null){
+ HashSet<String> set=getScaffolds(r, clearzone, hss0, false);
+ if(set!=null && !set.isEmpty()){
+ int incrRM=0;
+ int incrRA=0;
+ int incrBM=0;
+ int incrBA=0;
+ if(r!=null){
+ if(r.ambiguous()){
+ incrRA+=1;
+ incrBA+=r.length();
+ }else{
+ incrRM+=1;
+ incrBM+=r.length();
+ }
+ }
+ for(String s : set){
+ SetCount sc=scafCountTable.get(s);
+ assert(sc!=null) : "Can't find "+s+"\nin\n"+scafCountTable.keySet()+"\n";
+
+// System.out.println(sc);
+// System.out.println("+ "+incrRM+", "+incrRA+", "+incrBM+", "+incrBA);
+ synchronized(sc){
+ // System.out.println("Incrementing scaf "+sc);
+ sc.mappedReads+=incrRM;
+ sc.mappedBases+=incrBM;
+ sc.ambiguousReads+=incrRA;
+ sc.ambiguousBases+=incrBA;
+ }
+// System.out.println(sc);
+// System.out.println();
+// assert(false) : "\n"+incrRM+", "+incrRA+", "+incrBM+", "+incrBA+"\n"+set;
+ }
+ set.clear();
+ }
+ }
+ }
+
+ //*********************************
+
+ /**
+  * Collects the reference set names hit by a read and its mate.
+  * @param r1 Read to examine; its mate is examined too
+  * @param clearzone A non-top site is included while its score plus clearzone is at least the top score
+  * @param sets Array of four sets to fill; each must be null or empty on entry
+  * @return The array {top-site sets of r1, other-site sets of r1, top-site sets of mate,
+  * other-site sets of mate}, or null if neither read is mapped
+  */
+ public static HashSet<String>[] getSets(Read r1, int clearzone, HashSet<String>[] sets){
+     final Read r2=r1.mate;
+     if(!r1.mapped() && (r2==null || !r2.mapped())){return null;}
+
+     if(sets==null){
+         assert(false);
+         sets=new HashSet[4];
+     }else{
+         for(HashSet<String> scratch : sets){
+             assert(scratch==null || scratch.isEmpty());
+         }
+     }
+
+     //Slots 2*p and 2*p+1 hold the top-site and other-site sets of read p
+     final Read[] pair={r1, r2};
+     for(int p=0; p<pair.length; p++){
+         final Read r=pair[p];
+         if(r==null || !r.mapped()){continue;}
+
+         final SiteScore top=r.topSite();
+         sets[2*p]=toListNames(top, sets[2*p]);
+         for(int i=1; i<r.sites.size(); i++){
+             final SiteScore ss=r.sites.get(i);
+             if(ss.score+clearzone<top.score){break;}
+             sets[2*p+1]=toListNames(ss, sets[2*p+1]);
+         }
+     }
+     return sets;
+ }
+
+
+ /**
+  * Collects the names of the scaffolds a read maps to, with any set prefix removed.
+  * For unambiguous reads whose scaffold names are absent or the same array, only that single
+  * scaffold is returned; otherwise every site scoring within clearzone of the top site
+  * contributes its scaffold.
+  * @param r1 Read to examine
+  * @param clearzone A site is included while its score plus clearzone is at least the top score
+  * @param set Set to fill (must be empty), or null to allocate one
+  * @param includeMate True to also examine r1.mate
+  * @return The scaffold names, or null if nothing is mapped or no scaffold name is available
+  */
+ public static HashSet<String> getScaffolds(Read r1, int clearzone, HashSet<String> set, boolean includeMate){
+     final Read r2=(includeMate ? r1.mate : null);
+     if(!r1.mapped() && (r2==null || !r2.mapped())){return null;}
+     assert(set==null || set.isEmpty());
+
+     if(!r1.ambiguous() && (r2==null || !r2.ambiguous())){
+         byte[] name1=r1.getScaffoldName(false);
+         final byte[] name2=(r2==null ? null : r2.getScaffoldName(false));
+         if(name1==null){name1=name2;}
+         if(name1==null){
+             assert(false) : r1;
+             return null;
+         }
+         if(name2==null || name1==name2){
+             final String s=scaffoldNameWithoutSetPrefix(name1);
+             if(set==null){set=new HashSet<String>(1);}
+             set.add(s);
+             return set;
+         }
+     }
+
+     if(set==null){set=new HashSet<String>(4);}
+     final Read[] pair={r1, r2};
+     for(Read r : pair){
+         if(r==null || !r.mapped()){continue;}
+         final SiteScore top=r.topSite();
+         for(SiteScore ss : r.sites){
+             if(ss.score+clearzone<top.score){break;}
+             final byte[] raw=ss.getScaffoldName(false);
+             assert(raw!=null);
+             set.add(scaffoldNameWithoutSetPrefix(raw));
+         }
+     }
+     assert(set.size()>0);
+     return set;
+ }
+
+ /** Converts a raw scaffold name to a String, dropping everything up to and including the first '$' when Data.scaffoldPrefixes is set. */
+ private static String scaffoldNameWithoutSetPrefix(byte[] raw){
+     if(!Data.scaffoldPrefixes){return new String(raw);}
+     final int idx=Tools.indexOf(raw, (byte)'$');
+     assert(idx>=0) : idx+", "+new String(raw);
+     return (idx>=0 ? new String(raw, idx+1, raw.length-idx-1) : new String(raw));
+ }
+
+
+ /**
+ * @param r
+ * @return A set of names of reference lists containing this read or its mate.
+ */
+ public static HashSet<String> toListNames(Read r, HashSet<String> set) {
+ if(r==null){return set;}
+ byte[] scaf1=r.getScaffoldName(false);
+ byte[] scaf2=(r.mate==null ? null : r.mate.getScaffoldName(false));
+ if(scaf1==null && scaf2==null){return set;}
+
+ if(set==null){set=new HashSet<String>(8);}
+ else{assert(set.isEmpty());}
+
+ int x=scaf1==null ? -1 : Tools.indexOf(scaf1, (byte)'$');
+ if(x>=0){
+ String s=new String(scaf1, 0, x);
+ if(s.indexOf(',')<0){
+ set.add(s);
+ }else{
+ for(String s2 : s.split(",")){set.add(s2);}
+ }
+ }
+
+ x=(scaf2==null || scaf2==scaf1) ? -1 : Tools.indexOf(scaf2, (byte)'$');
+ if(x>=0){
+ String s=new String(scaf2, 0, x);
+ if(s.indexOf(',')<0){
+ set.add(s);
+ }else{
+ for(String s2 : s.split(",")){set.add(s2);}
+ }
+ }
+
+ return set;
+ }
+
+
+ /**
+ * Not implemented: this method currently throws unconditionally, so the code after the
+ * throw never runs. That code is a draft that would map each reference name, taken from
+ * the text before '$' in the scaffold names of the read and its mate, to a clone of the read.
+ * @param r Read to examine
+ * @param map Map to fill (must be empty), or null to allocate one
+ * @return A mapping of reference names to read clones.
+ * @throws RuntimeException always
+ */
+ public static HashMap<String, Read> toNameMap(Read r, HashMap<String, Read> map) {
+
+ //The if(true) wrapper lets the unreachable draft below compile
+ if(true){throw new RuntimeException("TODO");}
+
+ if(r==null){return map;}
+ byte[] scaf1=r.getScaffoldName(false);
+ byte[] scaf2=(r.mate==null ? null : r.mate.getScaffoldName(false));
+ if(scaf1==null && scaf2==null){return map;}
+
+ if(map==null){map=new HashMap<String, Read>(8);}
+ else{assert(map.isEmpty());}
+
+ //Names from the read's own scaffold
+ int x=scaf1==null ? -1 : Tools.indexOf(scaf1, (byte)'$');
+ if(x>=0){
+ String s=new String(scaf1, 0, x);
+ if(s.indexOf(',')<0){
+ if(!map.containsKey(s)){
+ map.put(s, cloneRead(r, s));
+ }
+ }else{
+ for(String s2 : s.split(",")){
+ if(!map.containsKey(s2)){
+ map.put(s2, cloneRead(r, s2));
+ }
+ }
+ }
+ }
+
+ //Names from the mate's scaffold, unless it is the same array as the read's.
+ //NOTE(review): this clones r rather than r.mate - confirm the intent before enabling.
+ x=(scaf2==null || scaf2==scaf1) ? -1 : Tools.indexOf(scaf2, (byte)'$');
+ if(x>=0){
+ String s=new String(scaf2, 0, x);
+ if(s.indexOf(',')<0){
+ if(!map.containsKey(s)){
+ map.put(s, cloneRead(r, s));
+ }
+ }else{
+ for(String s2 : s.split(",")){
+ if(!map.containsKey(s2)){
+ map.put(s2, cloneRead(r, s2));
+ }
+ }
+ }
+ }
+
+ return map;
+ }
+
+ /**
+ * Placeholder for cloning a read for a given reference; not implemented.
+ * @param r Read to clone
+ * @param ref Reference name the clone is for
+ * @throws RuntimeException always
+ */
+ private static Read cloneRead(Read r, String ref){
+ throw new RuntimeException("TODO");
+ }
+
+
+ /**
+  * Extracts the reference set names from the scaffold name of a site.
+  * The set names are the comma-delimited text before the first '$' of the scaffold name.
+  * @param r Site to examine; may be null
+  * @param set Set to add to, or null to allocate one when needed
+  * @return A set of names of reference lists containing this site;
+  * the set argument unchanged if r is null or has no scaffold name
+  */
+ public static HashSet<String> toListNames(SiteScore r, HashSet<String> set) {
+     if(r==null){return set;}
+     final byte[] scaf=r.getScaffoldName(false);
+     if(scaf==null){return set;}
+     if(set==null){set=new HashSet<String>(8);}
+
+     final int dollar=Tools.indexOf(scaf, (byte)'$');
+     if(dollar<0){return set;}
+
+     final String listNames=new String(scaf, 0, dollar);
+     if(listNames.indexOf(',')>=0){
+         for(String listName : listNames.split(",")){set.add(listName);}
+     }else{
+         set.add(listNames);
+     }
+     return set;
+ }
+
+ private static void addNames(LinkedHashSet<String> fnames, HashMap<String, LinkedHashSet<String>> table, String setName){
+ LinkedHashSet<String> set=table.get(setName);
+ if(set==null){return;}
+ for(String s : set){fnames.add(s);}
+ }
+
+ /**
+  * Writes a bash script that converts each sam file to a sorted, indexed bam file using samtools.
+  * Input files are gathered from the arguments and from the current output streams; only names
+  * ending in .sam, .sam.gz or .bam are used. The script is marked executable on a best-effort basis.
+  * @param outname Path of the script to write
+  * @param list Candidate file names; may be null
+  * @param sams Additional candidate file names; may be null
+  */
+ public static void makeBamScript(String outname, ArrayList<String> list, String...sams){
+     LinkedHashSet<String> set=new LinkedHashSet<String>();
+     if(sams!=null){
+         for(String s : sams){
+             if(isSamOrBamName(s)){set.add(s);}
+         }
+     }
+     if(list!=null){
+         for(String s : list){
+             if(isSamOrBamName(s)){set.add(s);}
+         }
+     }
+     if(streamTable!=null){
+         for(ConcurrentReadOutputStream ros : streamTable.values()){
+             String s=ros.fname();
+             //isSamOrBamName also rejects a null file name
+             if(isSamOrBamName(s)){set.add(s);}
+         }
+     }
+     TextStreamWriter tsw=new TextStreamWriter(outname, overwrite, append, false);
+     tsw.start();
+
+     String memstring=null;
+     if(set.size()>0){
+         tsw.println("#!/bin/bash");
+         tsw.println("module unload samtools");
+         tsw.println("module load samtools/0.1.19");
+
+         //Sort memory is derived from the JVM's max memory, capped at 100000, and printed in M below 2048, else in G
+         long mem=Runtime.getRuntime().maxMemory()/3400000;
+         mem=Tools.min(100000, mem);
+         if(mem<2048){memstring=mem+"M";}
+         else{memstring=(mem/1024)+"G";}
+
+         tsw.println("echo \"Note: This script is designed to run with the amount of memory detected by BBMap.\"");
+         tsw.println("echo \" If Samtools crashes, please ensure you are running on the same platform as BBMap,\"");
+         tsw.println("echo \" or reduce Samtools' memory setting (the -m flag).\"");
+     }
+
+     for(String sam : set){
+         final String bam;
+         if(sam.endsWith(".sam.gz")){bam=sam.substring(0, sam.length()-6)+"bam";}
+         else if(sam.endsWith(".sam")){bam=sam.substring(0, sam.length()-3)+"bam";}
+         else{bam=sam;} //Hopefully, they must have outputted a bam file using samtools.
+         final String bam2=bam.substring(0, bam.length()-4)+"_sorted";
+
+         //Compare by value; the input needs converting exactly when it is not already a bam file
+         if(!sam.equals(bam)){
+             tsw.println("echo \"Note: Please ignore any warnings about 'EOF marker is absent'; " +
+                 "this is a bug in samtools that occurs when using piped input.\"");
+             tsw.println("samtools view -bSh1 "+sam+" | samtools sort -m "+memstring+" -@ 3 - "+bam2);
+         }else{
+             tsw.println("samtools sort -m "+memstring+" -@ 3 "+bam+" "+bam2);
+         }
+
+         tsw.println("samtools index "+bam2+".bam");
+     }
+     tsw.poison();
+     tsw.waitForFinish();
+
+     try {
+         File f=new File(outname);
+         f.setExecutable(true, false);
+     } catch (Exception e) {
+         //Best effort: the script is still usable via "bash <script>" if it cannot be made executable
+     }
+ }
+
+ /** Returns true if the name is non-null and ends with .sam, .sam.gz or .bam. */
+ private static boolean isSamOrBamName(String s){
+     return s!=null && (s.endsWith(".sam") || s.endsWith(".sam.gz") || s.endsWith(".bam"));
+ }
+
+ /**
+  * Read and base tallies for one named entry.
+  * Natural order is ascending by mappedReads, then ambiguousReads, then name.
+  * Two instances are equal when they compare as 0 under that order.
+  */
+ public static class SetCount implements Comparable<SetCount>{
+
+     public SetCount(String s){
+         name=s;
+     }
+
+     /** Type-safe equality: false for null or for any object that is not a SetCount. */
+     @Override
+     public boolean equals(Object other){
+         return (other instanceof SetCount) && equals((SetCount)other);
+     }
+
+     /** True when other is non-null and compares as equal to this. */
+     public boolean equals(SetCount other){return other!=null && compareTo(other)==0;}
+
+     /**
+      * Hash of the name only.  Equal instances always share a name, so this satisfies the
+      * equals/hashCode contract, and because name is final the hash does not change while
+      * the counters are updated.
+      */
+     @Override
+     public int hashCode(){return name==null ? 0 : name.hashCode();}
+
+     @Override
+     public int compareTo(SetCount o) {
+         if(mappedReads!=o.mappedReads){return mappedReads>o.mappedReads ? 1 : -1;}
+         if(ambiguousReads!=o.ambiguousReads){return ambiguousReads>o.ambiguousReads ? 1 : -1;}
+         return name.compareTo(o.name);
+     }
+
+     /** Comma-delimited: name, mappedReads, ambiguousReads, mappedBases, ambiguousBases. */
+     @Override
+     public String toString(){
+         return name+", "+mappedReads+", "+ambiguousReads+", "+mappedBases+", "+ambiguousBases;
+     }
+
+     public final String name;
+     public long mappedReads;
+     public long ambiguousReads;
+     public long mappedBases;
+     public long ambiguousBases;
+
+ }
+
+ /**
+  * Writes one tab-delimited line per SetCount to a file.
+  * Columns: name, percent unambiguous reads, unambiguous megabases, percent ambiguous reads,
+  * ambiguous megabases, unambiguous read count, ambiguous read count.
+  * @param fname Output file
+  * @param map Entries to print; iterated in the map's order unless sort is true
+  * @param header Print a column-header line first
+  * @param totalReads Denominator for the percentage columns; if not positive, percentages print as 0
+  * @param nzo If true, entries with neither mapped nor ambiguous reads are skipped
+  * @param sort If true, entries are printed in descending natural order of SetCount
+  */
+ public static void printCounts(String fname, LinkedHashMap<String, SetCount> map, boolean header, long totalReads, boolean nzo, boolean sort){
+     final ArrayList<SetCount> list=new ArrayList<SetCount>(map.values());
+     final TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
+     tsw.start();
+     if(sort){
+         Collections.sort(list);
+         Collections.reverse(list);
+     }
+
+     if(header){
+         tsw.print("#name\t%unambiguousReads\tunambiguousMB\t%ambiguousReads\tambiguousMB\tunambiguousReads\tambiguousReads\n");
+     }
+     final StringBuilder sb=new StringBuilder(1024);
+     //Without this guard a zero read total made the percentage columns print as NaN or Infinity.
+     final double divR=(totalReads>0 ? 100.0/totalReads : 0);
+     final double divB=1.0/1000000;
+     for(SetCount sc : list){
+         if(!nzo || sc.mappedReads>0 || sc.ambiguousReads>0){
+             sb.append(sc.name).append('\t');
+             sb.append(String.format("%.5f\t", sc.mappedReads*divR));
+             sb.append(String.format("%.5f\t", sc.mappedBases*divB));
+             sb.append(String.format("%.5f\t", sc.ambiguousReads*divR));
+             sb.append(String.format("%.5f\t", sc.ambiguousBases*divB));
+             sb.append(sc.mappedReads).append('\t');
+             sb.append(sc.ambiguousReads).append('\n');
+             tsw.print(sb.toString());
+             sb.setLength(0);
+         }
+     }
+     //NOTE(review): unlike makeBamScript, this does not call waitForFinish() after poison();
+     //confirm that callers do not need the file to be complete when this returns.
+     tsw.poison();
+ }
+
+ /** SetCount tallies by name; initially null. Presumably keyed by reference set name - confirm against the code that fills it. */
+ public static LinkedHashMap<String, SetCount> setCountTable=null;
+ /** SetCount tallies by name; initially null. Presumably keyed by scaffold name - confirm against the code that fills it. */
+ public static LinkedHashMap<String, SetCount> scafCountTable=null;
+
+ /**
+ * Holds named output streams.
+ */
+ public static HashMap<String, ConcurrentReadOutputStream> streamTable=null;
+
+ /**
+ * Holds named output streams for ambiguous (across different references) reads.
+ */
+ public static HashMap<String, ConcurrentReadOutputStream> streamTableAmbiguous=null;
+ // Values for AMBIGUOUS2_MODE.
+ // NOTE(review): the names suggest policies for reads that are ambiguous between references;
+ // the code that interprets them is outside this part of the file.
+ public static final int AMBIGUOUS2_UNSET=0;
+ public static final int AMBIGUOUS2_FIRST=1;
+ public static final int AMBIGUOUS2_SPLIT=2;
+ public static final int AMBIGUOUS2_TOSS=3;
+ public static final int AMBIGUOUS2_RANDOM=4;
+ public static final int AMBIGUOUS2_ALL=5;
+ /** One of the AMBIGUOUS2_ constants; starts as AMBIGUOUS2_UNSET. */
+ public static int AMBIGUOUS2_MODE=AMBIGUOUS2_UNSET;
+ // Statistics switches and output paths; presumably enable the set/scaffold count tables above - confirm.
+ public static boolean TRACK_SET_STATS=false;
+ public static boolean TRACK_SCAF_STATS=false;
+ public static String SCAF_STATS_FILE=null;
+ public static String SET_STATS_FILE=null;
+ // overwrite and append are passed to TextStreamWriter by makeBamScript and printCounts.
+ public static boolean overwrite=true;
+ public static boolean append=false;
+ public static boolean verbose=false;
+ public static boolean forceRebuild=false;
+ // Shared empty list of reads; its users are not visible in this part of the file.
+ private static final ArrayList<Read> blank=new ArrayList<Read>(0);
+
+ // Values for MAP_MODE; presumably select which mapper implementation is used - confirm.
+ public static final int MAP_NORMAL=1;
+ public static final int MAP_ACC=2;
+ public static final int MAP_PACBIO=3;
+ public static final int MAP_PACBIOSKIMMER=4;
+ /** One of the MAP_ constants; defaults to MAP_NORMAL. */
+ public static int MAP_MODE=MAP_NORMAL;
+
+}
diff --git a/current/align2/BBWrap.java b/current/align2/BBWrap.java
new file mode 100755
index 0000000..fd98644
--- /dev/null
+++ b/current/align2/BBWrap.java
@@ -0,0 +1,195 @@
+package align2;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.Read;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+
+/**
+ * Runs a mapper once per input file.
+ * Comma-delimited values of in, out, outu, outm, outb (and their "2" forms), the histogram
+ * outputs and bamscript are split into per-run lists; run i receives element i of each list.
+ * The first run is passed ref=...; every later run is passed indexloaded=t instead.
+ * All other arguments are handed unchanged to every run.
+ * @author Brian Bushnell
+ * @date Mar 27, 2014
+ *
+ */
+public class BBWrap {
+
+    public static void main(String[] args){
+        BBWrap wrapper=new BBWrap();
+        ArrayList<String> list=wrapper.parse(args);
+        wrapper.execute(list);
+    }
+
+    /**
+     * Pulls out the arguments this wrapper handles itself and stores them in fields.
+     * An argument is split at its first '='; the key is case-insensitive and the value is
+     * everything after that '=' (null when absent or empty).
+     * @param args Command-line arguments
+     * @return The arguments that were not consumed, in their original order
+     */
+    private final ArrayList<String> parse(String[] args){
+
+        sysout.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+        sysout.println("BBMap version "+Shared.BBMAP_VERSION_STRING);
+
+        args=Parser.parseConfig(args);
+        if(Parser.parseHelp(args, true)){
+//            printOptions();
+            System.exit(0);
+        }
+
+        Read.TO_UPPER_CASE=true;
+
+        for(int i=0; i<args.length; i++){
+            final String arg=args[i];
+            if(arg==null){continue;}
+            //Split at the first '=' only.  String.split("=") threw on the argument "=" (empty
+            //result array) and silently truncated values that themselves contain '='.
+            final int eq=arg.indexOf('=');
+            final String a=(eq<0 ? arg : arg.substring(0, eq)).toLowerCase();
+            final String b=(eq<0 || eq==arg.length()-1) ? null : arg.substring(eq+1);
+
+            if(a.equals("path") || a.equals("root")){
+                Data.setPath(b);
+                args[i]=null;
+            }else if(a.equals("mapper")){
+                mapper=b;
+                args[i]=null;
+            }else if(a.equals("ref") || a.equals("reference") || a.equals("fasta")){
+                ref=b;
+                args[i]=null;
+            }else if(a.equals("append") || a.equals("app")){
+                //Not consumed: the flag is still passed through to the mapper.
+                append=Tools.parseBoolean(b);
+            }else{
+                final ArrayList<String> target=listFor(a);
+                if(target!=null){
+                    add(b, target);
+                    args[i]=null;
+                }
+            }
+        }
+
+        ArrayList<String> list=new ArrayList<String>();
+        for(String s : args){
+            if(s!=null){
+                list.add(s);
+            }
+        }
+        return list;
+
+    }
+
+    /**
+     * Maps a lower-case argument key to the per-run list that collects its values.
+     * @return The list, or null if the key is not one that is split per run
+     */
+    private ArrayList<String> listFor(String a){
+        if(a.equals("in") || a.equals("in1")){return in1List;}
+        if(a.equals("in2")){return in2List;}
+        if(a.equals("out") || a.equals("out1")){return out1List;}
+        if(a.equals("out2")){return out2List;}
+        if(a.equals("outm") || a.equals("outm1") || a.equals("outmapped") || a.equals("outmapped1")){return outm1List;}
+        if(a.equals("outm2") || a.equals("outmapped2")){return outm2List;}
+        if(a.equals("outu") || a.equals("outu1") || a.equals("outunmapped") || a.equals("outunmapped1")){return outu1List;}
+        if(a.equals("outu2") || a.equals("outunmapped2")){return outu2List;}
+        if(a.equals("outb") || a.equals("outb1") || a.equals("outblack") || a.equals("outblack1") || a.equals("outblacklist") || a.equals("outblacklist1")){return outb1List;}
+        if(a.equals("outb2") || a.equals("outblack2") || a.equals("outblacklist2")){return outb2List;}
+        if(a.equals("qualityhistogram") || a.equals("qualityhist") || a.equals("qhist")){return qhistList;}
+        if(a.equals("matchhistogram") || a.equals("matchhist") || a.equals("mhist")){return mhistList;}
+        if(a.equals("inserthistogram") || a.equals("inserthist") || a.equals("ihist")){return ihistList;}
+        if(a.equals("bamscript") || a.equals("bs")){return bsList;}
+        return null;
+    }
+
+    /** Splits s on commas and appends the pieces to list; null and "null" (any case) add nothing. */
+    private void add(String s, ArrayList<String> list){
+        if(s!=null && !"null".equalsIgnoreCase(s)){
+            for(String ss : s.split(",")){
+                list.add(ss);
+            }
+        }
+    }
+
+    /**
+     * Runs the selected mapper once for each primary input file.
+     * @param base Arguments shared by every run
+     * @throws RuntimeException if the mapper name is not recognized
+     */
+    private void execute(ArrayList<String> base){
+        for(int i=0; i<in1List.size(); i++){
+            ArrayList<String> list=new ArrayList<String>(base);
+
+            if(i==0 && ref!=null){list.add("ref="+ref);}
+            else if(i>0){list.add("indexloaded=t");}
+
+            addToList(list, bsList, "bs", i);
+            addToList(list, qhistList, "qhist", i);
+            addToList(list, mhistList, "mhist", i);
+            addToList(list, ihistList, "ihist", i);
+            addToList(list, in1List, "in", i);
+            addToList(list, out1List, "out", i);
+            addToList(list, outu1List, "outu", i);
+            addToList(list, outm1List, "outm", i);
+            addToList(list, outb1List, "outb", i);
+            addToList(list, in2List, "in2", i);
+            addToList(list, out2List, "out2", i);
+            addToList(list, outu2List, "outu2", i);
+            addToList(list, outm2List, "outm2", i);
+            addToList(list, outb2List, "outb2", i);
+
+            String[] args=list.toArray(new String[list.size()]);
+            if(mapper==null || mapper.equalsIgnoreCase("bbmap")){
+                BBMap.main(args);
+            }else if(mapper.equalsIgnoreCase("bbmappacbio") || mapper.equalsIgnoreCase("pacbio")){
+                BBMapPacBio.main(args);
+            }else if(mapper.equalsIgnoreCase("bbmappacbioskimmer") || mapper.equalsIgnoreCase("pacbioskimmer") || mapper.equalsIgnoreCase("skimmer") || mapper.equalsIgnoreCase("bbmapskimmer")){
+                BBMapPacBioSkimmer.main(args);
+            }else if(mapper.equalsIgnoreCase("bbmap5") || mapper.equalsIgnoreCase("5")){
+                BBMap5.main(args);
+            }else if(mapper.equalsIgnoreCase("bbmapacc") || mapper.equalsIgnoreCase("acc")){
+                BBMapAcc.main(args);
+            }else if(mapper.equalsIgnoreCase("bbsplit") || mapper.equalsIgnoreCase("bbsplitter")){
+                BBSplitter.main(args);
+            }else{
+                //Previously an unrecognized name fell through and no mapper ran at all.
+                throw new RuntimeException("Unknown mapper: "+mapper);
+            }
+        }
+    }
+
+    /**
+     * Adds key=value for run i.  Uses element i of source when it exists; otherwise, when
+     * append is set and source holds exactly one value, reuses that value for every run.
+     */
+    private void addToList(ArrayList<String> list, ArrayList<String> source, String key, int i){
+        if(source.size()>i){
+            list.add(key+"="+source.get(i));
+        }else if(append && source.size()==1){
+            list.add(key+"="+source.get(0));
+        }
+    }
+
+    /** Reference passed as ref= to the first run only. */
+    private String ref;
+    /** Mapper name; null is treated the same as "bbmap". */
+    private String mapper="bbmap";
+
+    private ArrayList<String> bsList=new ArrayList<String>();
+    private ArrayList<String> qhistList=new ArrayList<String>();
+    private ArrayList<String> mhistList=new ArrayList<String>();
+    private ArrayList<String> ihistList=new ArrayList<String>();
+
+    private ArrayList<String> in1List=new ArrayList<String>();
+    private ArrayList<String> out1List=new ArrayList<String>();
+    private ArrayList<String> outu1List=new ArrayList<String>();
+    private ArrayList<String> outm1List=new ArrayList<String>();
+    private ArrayList<String> outb1List=new ArrayList<String>();
+
+    private ArrayList<String> in2List=new ArrayList<String>();
+    private ArrayList<String> out2List=new ArrayList<String>();
+    private ArrayList<String> outu2List=new ArrayList<String>();
+    private ArrayList<String> outm2List=new ArrayList<String>();
+    private ArrayList<String> outb2List=new ArrayList<String>();
+
+    /** When true, a single output name is reused for every run (see addToList). */
+    private boolean append=false;
+
+    static PrintStream sysout=System.err;
+
+}
diff --git a/current/align2/BandedAligner.java b/current/align2/BandedAligner.java
new file mode 100755
index 0000000..8432a98
--- /dev/null
+++ b/current/align2/BandedAligner.java
@@ -0,0 +1,167 @@
+package align2;
+
+import java.util.Arrays;
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 5, 2013
+ *
+ */
+public abstract class BandedAligner {
+
+ public BandedAligner(int width_){
+ maxWidth=Tools.max(width_, 3)|1;
+ assert(maxWidth>=3) : "width<3 : "+width_+" -> "+maxWidth;
+ assert(big>maxWidth/2);
+ }
+
+ public static final BandedAligner makeBandedAligner(int width_){
+ BandedAligner ba=(Shared.USE_JNI ? new BandedAlignerJNI(width_) : new BandedAlignerConcrete(width_));
+ return ba;
+ }
+
+ public final int alignQuadrupleProgressive(final byte[] query, final byte[] ref, int minEdits, int maxEdits, final boolean exact){
+ maxEdits=Tools.min(maxEdits, Tools.max(query.length, ref.length));
+ minEdits=Tools.min(minEdits, maxEdits);
+ for(long i=minEdits, me=-1; me<maxEdits; i=i*4){
+ me=Tools.min(i, maxEdits);
+ if(me*2>maxEdits){me=maxEdits;}
+ int edits=alignQuadruple(query, ref, (int)me, exact);
+// System.err.println("i="+i+", me="+me+", minEdits="+minEdits+", maxEdits="+maxEdits+", edits="+edits);
+ if(edits<me){return edits;}
+ }
+ return maxEdits;
+ }
+
+ public final int alignQuadruple(final byte[] query, final byte[] ref, final int maxEdits, final boolean exact){
+ final int a=alignForward(query, ref, 0, 0, maxEdits, exact);
+ final int b=alignReverse(query, ref, query.length-1, ref.length-1, maxEdits, exact);
+ final int me2=Tools.min(maxEdits, Tools.max(a, b));
+ if(me2==0){return 0;}
+ final int c=alignForwardRC(query, ref, query.length-1, 0, me2, exact);
+ final int d=alignReverseRC(query, ref, 0, ref.length-1, me2, exact);
+// System.err.println("a="+a+", b="+b+", c="+c+", d="+d);
+ return Tools.min(Tools.max(a, b), Tools.max(c, d));
+ }
+
+ public final int alignDouble(final byte[] query, final byte[] ref, final int maxEdits, final boolean exact){
+ final int a=alignForward(query, ref, 0, 0, maxEdits, exact);
+ if(a==0){return 0;}
+ final int c=alignForwardRC(query, ref, query.length-1, 0, a, exact);
+ return Tools.min(a, c);
+ }
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public abstract int alignForward(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact);
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public abstract int alignForwardRC(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact);
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public abstract int alignReverse(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact);
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public abstract int alignReverseRC(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact);
+
+ protected void fillBig(int[] array){
+ final int lim=array.length-1;
+ for(int i=1; i<lim; i++){array[i]=big;}
+ }
+
+ /** Score is lastRow-edits */
+ public final int score(){
+ return lastRow-lastEdits+1;
+ }
+
+ /** Position of min value in array (meaning the best alignment) relative to the middle of the array. */
+ protected int lastOffset(int[] array, int halfWidth){
+ final int center=halfWidth+1;
+ int minLoc=center;
+ for(int i=1; i<=halfWidth; i++){
+ if(array[center+i]<array[minLoc]){minLoc=center+i;}
+ if(array[center-i]<array[minLoc]){minLoc=center-i;}
+ }
+ return center-minLoc;
+ }
+
+ protected int penalizeOffCenter_old(int[] array, int halfWidth){
+ if(verbose){
+ System.err.println("penalizeOffCenter_old("+Arrays.toString(array)+", "+halfWidth);
+ }
+ final int center=halfWidth+1;
+ int edits=array[center];
+ for(int i=1; i<=halfWidth; i++){
+ array[center+i]=Tools.min(big, array[center+i]+i);
+ edits=Tools.min(edits, array[center+i]);
+ array[center-i]=Tools.min(big, array[center-i]+i);
+ edits=Tools.min(edits, array[center-i]);
+ }
+ if(verbose){
+ System.err.println("returned "+edits);
+ }
+ return edits;
+ }
+
+ protected int penalizeOffCenter(int[] array, int halfWidth){
+ if(verbose){
+ System.err.println("penalizeOffCenter("+Arrays.toString(array)+", "+halfWidth);
+ }
+ final int center=halfWidth+1;
+ int edits=array[center];
+ for(int i=1; i<=halfWidth; i++){
+ array[center+i]=Tools.min(big, Tools.max(i, array[center+i]));
+ edits=Tools.min(edits, array[center+i]);
+ array[center-i]=Tools.min(big, Tools.max(i, array[center-i]));
+ edits=Tools.min(edits, array[center-i]);
+ }
+ if(verbose){
+ System.err.println("returned "+edits);
+ }
+ return edits;
+ }
+
+ /** Final row aligned in last alignment. */
+ public int lastRow;
+ /** Final edits value in last alignment. */
+ public int lastEdits;
+
+ /** Position of min value in array (meaning the best alignment) relative to the middle of the array.
+ * Positive value is to the right (ref sequence longer than query), negative value left (ref shorter than query) */
+ protected int lastOffset;
+
+ public int lastRefLoc;
+ public int lastQueryLoc;
+
+ public final int maxWidth;
+
+ public static final int big=99999999;
+ public static boolean verbose=false;
+ /** Penalizes non-length-neutral alignments.
+ * This causes query-to-ref alignment to yield same score as ref-to-query alignment, which is useful for assertions. */
+ public static boolean penalizeOffCenter=true;
+
+}
diff --git a/current/align2/BandedAlignerConcrete.java b/current/align2/BandedAlignerConcrete.java
new file mode 100755
index 0000000..bb23b28
--- /dev/null
+++ b/current/align2/BandedAlignerConcrete.java
@@ -0,0 +1,551 @@
+package align2;
+
+import java.util.Arrays;
+
+import dna.AminoAcid;
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 5, 2013
+ *
+ */
+public class BandedAlignerConcrete extends BandedAligner{
+
+
+ /**
+  * Command-line test driver.  Runs the forward, reverse, quadruple and double alignments,
+  * each once with penalizeOffCenter on and once with it off, and prints the results.
+  * Arguments: query ref [qstart] [rstart] [maxedits] [width].  A ref of "." means "same as
+  * query"; qstart/rstart of -1 (the default) mean the natural starting end for each direction.
+  * Note that this leaves the static penalizeOffCenter flag set to false.
+  */
+ public static void main(String[] args){
+     if(args==null || args.length<2){
+         //Previously this failed with ArrayIndexOutOfBoundsException.
+         System.err.println("Usage: BandedAlignerConcrete <query> <ref|.> [qstart] [rstart] [maxedits] [width]");
+         return;
+     }
+     byte[] query=args[0].getBytes();
+     byte[] ref=(args[1].equals(".") ? args[0].getBytes() : args[1].getBytes());
+     int qstart=-1;
+     int rstart=-1;
+     int maxedits=big-1;
+     int width=5;
+     if(args.length>2){qstart=Integer.parseInt(args[2]);}
+     if(args.length>3){rstart=Integer.parseInt(args[3]);}
+     if(args.length>4){maxedits=Integer.parseInt(args[4]);}
+     if(args.length>5){width=Integer.parseInt(args[5]);}
+
+     BandedAlignerConcrete ba=new BandedAlignerConcrete(width);
+
+     //Resolve the -1 defaults: forward runs start at index 0, reverse runs at the last index.
+     final int qFirst=(qstart==-1 ? 0 : qstart);
+     final int rFirst=(rstart==-1 ? 0 : rstart);
+     final int qLast=(qstart==-1 ? query.length-1 : qstart);
+     final int rLast=(rstart==-1 ? ref.length-1 : rstart);
+
+     penalizeOffCenter=true;
+     printTestResult("Forward", ba.alignForward(query, ref, qFirst, rFirst, maxedits, true), ba);
+
+     penalizeOffCenter=false;
+     printTestResult("Forward2", ba.alignForward(query, ref, qFirst, rFirst, maxedits, true), ba);
+
+     //The alignForwardRC and alignReverseRC runs were already disabled in the original driver.
+
+     penalizeOffCenter=true;
+     printTestResult("Reverse", ba.alignReverse(query, ref, qLast, rLast, maxedits, true), ba);
+
+     penalizeOffCenter=false;
+     printTestResult("Reverse2", ba.alignReverse(query, ref, qLast, rLast, maxedits, true), ba);
+
+     penalizeOffCenter=true;
+     printTestResult("Quadruple", ba.alignQuadruple(query, ref, maxedits, true), ba);
+
+     penalizeOffCenter=false;
+     printTestResult("Quadruple2", ba.alignQuadruple(query, ref, maxedits, true), ba);
+
+     penalizeOffCenter=true;
+     printTestResult("Double", ba.alignDouble(query, ref, maxedits, true), ba);
+
+     penalizeOffCenter=false;
+     printTestResult("Double2", ba.alignDouble(query, ref, maxedits, true), ba);
+ }
+
+ /** Prints one result line for main(), followed by a separator line. */
+ private static void printTestResult(String label, int edits, BandedAlignerConcrete ba){
+     System.out.println(label+": \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score());
+     System.out.println("***********************\n");
+ }
+
+
+ public BandedAlignerConcrete(int width_){
+ super(width_);
+ array1=new int[maxWidth+2];
+ array2=new int[maxWidth+2];
+ Arrays.fill(array1, big);
+ Arrays.fill(array2, big);
+// for(int i=2; i<rows; i++){
+// matrix[i]=matrix[i-2];
+// }
+ assert(big>maxWidth/2);
+ }
+
+ /**
+ * Banded alignment of query[qstart...] against ref[rstart...], both walked left to right.
+ * Only two rows of the band are kept (array1/array2, swapped each row).
+ * Also sets lastRow, lastEdits, lastOffset, lastQueryLoc and lastRefLoc.
+ * @param query Query bases
+ * @param ref Reference bases
+ * @param qstart Index of the first query base used
+ * @param rstart Index of the first ref base used; it sits in the center cell of the band on the first row
+ * @param maxEdits Rows stop being processed once a row's minimum exceeds this; also limits the band width
+ * @param exact If false, a base for which AminoAcid.isFullyDefined is false matches any base
+ * @return Edit distance (may exceed maxEdits when the loop stopped early)
+ */
+ public int alignForward(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ assert(big>maxEdits);
+ if(verbose){System.err.println("alignForward("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");}
+ // If more query than ref remains, align with the roles swapped and swap the reported end positions back.
+ if(query.length-qstart>ref.length-rstart){
+ int x=alignForward(ref, query, rstart, qstart, maxEdits, exact);
+ int temp=lastQueryLoc;
+ lastQueryLoc=lastRefLoc;
+ lastRefLoc=temp;
+ if(verbose){
+ System.out.println("Reversed.");
+ System.out.println("Final state: lastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n");
+ }
+ return x;
+ }
+ int edits=0, row=0;
+ lastRow=-1;
+ lastEdits=0;
+ lastOffset=0;
+
+ // Band width for this call; the |1 forces it odd so there is a single center cell.
+ final int width=Tools.min(maxWidth, (maxEdits*2)+1, Tools.max(query.length, ref.length)*2+2)|1;
+ final int halfWidth=width/2;
+ final boolean inexact=!exact;
+
+ // qloc: current query index. rsloc: ref index that maps to band cell 1 on the current row.
+ int qloc=qstart;
+ int rsloc=rstart-halfWidth;
+ final int xlines=query.length-qstart;
+ final int ylines=ref.length-rstart;
+ // Number of rows to process: the shorter of the two remaining lengths.
+ final int len=Tools.min(xlines, ylines);
+ if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);}
+ if(len<1){
+ if(false){
+ throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ }
+ assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ return 0;
+ }
+
+ Arrays.fill(array1, 0, Tools.min(width, maxWidth)+1, big);
+ Arrays.fill(array2, 0, Tools.min(width, maxWidth)+1, big);
+ arrayCurrent=array1;
+ arrayPrev=array2;
+ // First row: each in-range cell is simply 0 for a match or 1 for a mismatch.
+ {
+ if(verbose){System.err.println("\nFirst row.");}
+ final byte q=query[qloc];
+ final int colStart=Tools.max(0, rsloc);
+ final int colLimit=Tools.min(rsloc+width, ref.length);
+ edits=big;
+ int mloc=1+(colStart-rsloc);
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+// assert(false) : mloc+", "+colStart+", "+rsloc;
+ for(int col=colStart; col<colLimit; mloc++, col++){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ final int score=(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score);
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; prev=0; score="+score+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ row++; qloc++; rsloc++;
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ // Remaining rows: swap the two buffers, then fill the current row from the previous one.
+ for(row=1; row<len; row++, qloc++, rsloc++){
+// if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayCurrent));}
+ arrayTemp=arrayCurrent;
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayPrev)+", current="+Arrays.toString(arrayCurrent));}
+ final byte q=query[qloc];
+ final int colStart=Tools.max(0, rsloc);
+ final int colLimit=Tools.min(rsloc+width, ref.length);
+ Arrays.fill(arrayCurrent, big);
+ edits=big;
+ int mloc=1+(colStart-rsloc);
+ // On the last row only the diagonal move is allowed.
+ boolean forceDiag=(row==len-1);
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+ for(int col=colStart; col<colLimit; mloc++, col++){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ // The band shifts one ref position per row, so the cell above is prev[mloc+1] and the diagonal is prev[mloc].
+ final int scoreUp=arrayPrev[mloc+1]+1;
+ final int scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1);
+ final int scoreLeft=arrayCurrent[mloc-1]+1;
+ // The diagonal is also forced in the last ref column.
+ final int score=(forceDiag || col==ref.length-1) ? scoreDiag : Tools.min(scoreUp, scoreDiag, scoreLeft);
+ if(verbose){System.err.println("prev=min(s["+(mloc-1)+"]="+arrayCurrent[mloc-1]+", p["+(mloc)+"]="+arrayPrev[mloc]+", p["+(mloc+1)+"]="+arrayPrev[mloc+1]+")");}
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score);
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; up="+scoreUp+"; diag="+scoreDiag+"; left="+scoreLeft+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ // Early exit; row is advanced by hand because break skips the loop's update clause.
+ // NOTE(review): qloc and rsloc are not advanced on this path, so lastQueryLoc/lastRefLoc below
+ // end up one position earlier than after normal completion - confirm this is intended.
+ if(edits>maxEdits){row++; break;}
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ // Record the final state for callers (and for score()).
+ lastRow=row-1;
+ lastEdits=edits;
+ lastQueryLoc=qloc-1;
+ lastOffset=lastOffset(arrayCurrent, halfWidth);
+ lastRefLoc=rsloc+halfWidth-lastOffset-1;
+ // Step both positions back together until they are inside their arrays.
+ while(lastRefLoc>=ref.length || lastQueryLoc>=query.length){lastRefLoc--; lastQueryLoc--;}
+ if(verbose){
+ System.out.println("\nFinal state: arrayCurrent="+Arrays.toString(arrayCurrent)+"\nlastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n");
+ }
+ return edits;
+ }
+
+ /**
+ * Banded alignment of the reverse complement of query against ref.
+ * The query is walked right to left from qstart, each base complemented through
+ * AminoAcid.baseToComplementExtended, while ref is walked left to right from rstart.
+ * Also sets lastRow, lastEdits, lastOffset, lastQueryLoc and lastRefLoc.
+ * @param query Query bases
+ * @param ref Reference bases
+ * @param qstart Index of the first query base used (the walk goes downward from here)
+ * @param rstart Index of the first ref base used; it sits in the center cell of the band on the first row
+ * @param maxEdits Rows stop being processed once a row's minimum exceeds this; also limits the band width
+ * @param exact If false, a base for which AminoAcid.isFullyDefined is false matches any base
+ * @return Edit distance (may exceed maxEdits when the loop stopped early)
+ */
+ public int alignForwardRC(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ assert(big>maxEdits);
+ if(verbose){System.err.println("alignForwardRC("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");}
+ // If more query than ref remains, delegate to alignReverseRC with the roles swapped and swap the end positions back.
+ if(qstart+1>ref.length-rstart){
+ int x=alignReverseRC(ref, query, rstart, qstart, maxEdits, exact);
+ int temp=lastQueryLoc;
+ lastQueryLoc=lastRefLoc;
+ lastRefLoc=temp;
+ if(verbose){
+ System.out.println("Reversed.");
+ System.out.println("Final state: lastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n");
+ }
+ return x;
+ }
+ int edits=0, row=0;
+ lastRow=-1;
+ lastEdits=0;
+ lastOffset=0;
+
+ // Band width for this call; the |1 forces it odd so there is a single center cell.
+ final int width=Tools.min(maxWidth, (maxEdits*2)+1, Tools.max(query.length, ref.length)*2+2)|1;
+ final int halfWidth=width/2;
+ final boolean inexact=!exact;
+
+ // qloc: current query index (decreasing). rsloc: ref index that maps to band cell 1 on the current row.
+ int qloc=qstart;
+ int rsloc=rstart-halfWidth;
+ // Query bases available when walking down from qstart to 0.
+ final int xlines=qstart+1;
+ final int ylines=ref.length-rstart;
+ final int len=Tools.min(xlines, ylines);
+ if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);}
+ if(len<1){
+ if(false){
+ throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ }
+ assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ return 0;
+ }
+
+ Arrays.fill(array1, 0, Tools.min(width, maxWidth)+1, big);
+ Arrays.fill(array2, 0, Tools.min(width, maxWidth)+1, big);
+ arrayCurrent=array1;
+ arrayPrev=array2;
+
+ // First row: each in-range cell is simply 0 for a match or 1 for a mismatch.
+ {
+ if(verbose){System.err.println("\nFirst row.");}
+ final byte q=AminoAcid.baseToComplementExtended[query[qloc]];
+ final int colStart=Tools.max(0, rsloc);
+ final int colLimit=Tools.min(rsloc+width, ref.length);
+ edits=big;
+ int mloc=1+(colStart-rsloc);
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+ for(int col=colStart; col<colLimit; mloc++, col++){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ final int score=(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score);
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; prev=0; score="+score+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ row++; qloc--; rsloc++;
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ // Remaining rows: swap the two buffers, then fill the current row from the previous one.
+ for(row=1; row<len; row++, qloc--, rsloc++){
+// if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayCurrent));}
+ arrayTemp=arrayCurrent;
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayPrev)+", current="+Arrays.toString(arrayCurrent));}
+ final byte q=AminoAcid.baseToComplementExtended[query[qloc]];
+ final int colStart=Tools.max(0, rsloc);
+ final int colLimit=Tools.min(rsloc+width, ref.length);
+ Arrays.fill(arrayCurrent, big);
+ edits=big;
+ int mloc=1+(colStart-rsloc);
+ // On the last row only the diagonal move is allowed.
+ boolean forceDiag=(row==len-1);
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+ for(int col=colStart; col<colLimit; mloc++, col++){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ // The band shifts one ref position per row, so the cell above is prev[mloc+1] and the diagonal is prev[mloc].
+ final int scoreUp=arrayPrev[mloc+1]+1;
+ final int scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1);
+ final int scoreLeft=arrayCurrent[mloc-1]+1;
+ // The diagonal is also forced in the last ref column.
+ final int score=(forceDiag || col==ref.length-1) ? scoreDiag : Tools.min(scoreUp, scoreDiag, scoreLeft);
+ if(verbose){System.err.println("prev=min(s["+(mloc-1)+"]="+arrayCurrent[mloc-1]+", p["+(mloc)+"]="+arrayPrev[mloc]+", p["+(mloc+1)+"]="+arrayPrev[mloc+1]+")");}
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score);
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; up="+scoreUp+"; diag="+scoreDiag+"; left="+scoreLeft+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ // Early exit; row is advanced by hand because break skips the loop's update clause.
+ // NOTE(review): qloc and rsloc are not advanced on this path, so the positions recorded below
+ // differ by one from those after normal completion - confirm this is intended.
+ if(edits>maxEdits){row++; break;}
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ // Record the final state for callers (and for score()).
+ lastRow=row-1;
+ lastEdits=edits;
+ lastOffset=lastOffset(arrayCurrent, halfWidth);
+ lastQueryLoc=qloc+1;
+ lastRefLoc=rsloc+halfWidth-lastOffset-1;
+ // Move both positions together (ref down, query up) until they are inside their arrays.
+ while(lastRefLoc>=ref.length || lastQueryLoc<0){lastRefLoc--; lastQueryLoc++;}
+ if(verbose){
+ System.out.println("\nFinal state: arrayCurrent="+Arrays.toString(arrayCurrent)+"\nlastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+", qloc="+qloc+"\n");
+ }
+ return edits;
+ }
+
+ /**
+ * Banded edit-distance alignment that walks backward: the query index starts at
+ * qstart and the reference band is centered on rstart, and both move one position
+ * toward index 0 per row. Only a diagonal band of odd width is scored, using two
+ * rolling row arrays (array1/array2).
+ * If qstart>rstart the method calls itself with query and ref (and their start
+ * positions) swapped, then swaps lastQueryLoc and lastRefLoc back before returning.
+ * Side effects: overwrites lastRow, lastEdits, lastOffset, lastQueryLoc and lastRefLoc.
+ * @param query Query bases
+ * @param ref Reference bases
+ * @param qstart Index in query of the first base compared (the highest index visited)
+ * @param rstart Index in ref on which the band is centered for the first row
+ * @param maxEdits Scoring stops after a row whose lowest cell exceeds this value; must be less than big
+ * @param exact If false, a base for which AminoAcid.isFullyDefined returns false is scored as a match
+ * @return Edit distance: the lowest score in the last processed row, or the result of
+ * penalizeOffCenter when that flag is set; 0 when there are no rows and assertions are disabled
+ */
+ public int alignReverse(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ assert(big>maxEdits);
+ if(verbose){System.err.println("alignReverse("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");}
+ if(qstart>rstart){ //Swap roles so the sequence with the smaller start index acts as the query
+ int x=alignReverse(ref, query, rstart, qstart, maxEdits, exact);
+ int temp=lastQueryLoc; //The recursive call recorded locations with the roles swapped; swap them back
+ lastQueryLoc=lastRefLoc;
+ lastRefLoc=temp;
+ if(verbose){
+ System.out.println("Reversed.");
+ System.out.println("Final state: lastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n");
+ }
+ return x;
+ }
+// if(true){return big;}
+ int edits=0, row=0;
+ lastRow=-1;
+ lastEdits=0;
+ lastOffset=0;
+
+ final int width=Tools.min(maxWidth, (maxEdits*2)+1, Tools.max(query.length, ref.length)*2+2)|1; //Band width; the |1 forces it to be odd
+ final int halfWidth=width/2;
+ final boolean inexact=!exact;
+
+ int qloc=qstart; //Current query index; decremented once per row
+ int rsloc=rstart-halfWidth; //Lowest ref index covered by the band for the current row; may be negative
+ final int xlines=qstart+1; //Query bases available at or below qstart
+ final int ylines=rstart+1; //Ref bases available at or below rstart
+ final int len=Tools.min(xlines, ylines); //Number of rows to process
+ if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);}
+ if(len<1){
+ if(false){ //Disabled: hard failure for the no-overlap case
+ throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ }
+ assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ return 0;
+ }
+
+ Arrays.fill(array1, 0, Tools.min(width, maxWidth)+1, big); //Resets indices 0..width only. NOTE(review): row 1 can read arrayPrev[width+1]; confirm that slot is initialized elsewhere
+ Arrays.fill(array2, 0, Tools.min(width, maxWidth)+1, big);
+ arrayCurrent=array1;
+ arrayPrev=array2;
+
+ { //First row: no predecessors, so each cell is just the match/mismatch cost
+ if(verbose){System.err.println("\nFirst row.");}
+ final byte q=query[qloc];
+ final int colStart=Tools.max(0, rsloc); //Clip the band to the start of ref
+ final int colLimit=Tools.min(rsloc+width, ref.length); //Clip the band to the end of ref (exclusive)
+ edits=big;
+ int mloc=1+width-(colLimit-rsloc); //Array slot for the highest ref column; slots increase as col decreases
+// assert(false) : width+", "+maxEdits+", "+colLimit+", "+rsloc;
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+ for(int col=colLimit-1; col>=colStart; mloc++, col--){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ final int score=(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score); //Track the row minimum
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; prev=0; score="+score+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ row++; qloc--; rsloc--;
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ for(row=1; row<len; row++, qloc--, rsloc--){
+// if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayCurrent));}
+ arrayTemp=arrayCurrent; //Rotate the two row buffers
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayPrev)+", current="+Arrays.toString(arrayCurrent));}
+ final byte q=query[qloc];
+ final int colStart=Tools.max(0, rsloc);
+ final int colLimit=Tools.min(rsloc+width, ref.length);
+ Arrays.fill(arrayCurrent, big);
+ edits=big;
+ int mloc=1+width-(colLimit-rsloc);
+ boolean forceDiag=(row==len-1); //On the final row only the diagonal move is allowed
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+ for(int col=colLimit-1; col>=colStart; mloc++, col--){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ final int scoreUp=arrayPrev[mloc+1]+1; //Previous row, next slot, plus one edit
+ final int scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1); //Previous row, same slot, plus match/mismatch cost
+ final int scoreLeft=arrayCurrent[mloc-1]+1; //Current row, previous slot, plus one edit
+ final int score=(forceDiag || col==0) ? scoreDiag : Tools.min(scoreUp, scoreDiag, scoreLeft); //Diagonal is also forced at ref column 0
+ if(verbose){System.err.println("prev=min(s["+(mloc-1)+"]="+arrayCurrent[mloc-1]+", p["+(mloc)+"]="+arrayPrev[mloc]+", p["+(mloc+1)+"]="+arrayPrev[mloc+1]+")");}
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score);
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; up="+scoreUp+"; diag="+scoreDiag+"; left="+scoreLeft+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ if(edits>maxEdits){row++; break;} //Early exit; row++ keeps lastRow=row-1 pointing at this row
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ lastRow=row-1;
+ lastEdits=edits;
+ lastOffset=lastOffset(arrayCurrent, halfWidth);
+ lastQueryLoc=qloc+1; //qloc was decremented past the last processed row
+ lastRefLoc=rsloc+halfWidth+lastOffset+1;
+ while(lastRefLoc<0 || lastQueryLoc<0){lastRefLoc++; lastQueryLoc++;} //Shift both locations together until neither is negative
+ if(verbose){
+ System.out.println("\nFinal state: arrayCurrent="+Arrays.toString(arrayCurrent)+"\nlastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+", qloc="+qloc+", rsloc="+rsloc+"\n");
+ }
+ return edits;
+ }
+
+ /**
+ * Banded edit-distance alignment of the complemented query against the reference:
+ * the query index starts at qstart and increases by one per row, each query base is
+ * mapped through AminoAcid.baseToComplementExtended, and the reference band is
+ * centered on rstart and moves one position toward index 0 per row.
+ * If query.length-qstart>rstart+1 the work is delegated to alignForwardRC with query
+ * and ref (and their start positions) swapped, and lastQueryLoc/lastRefLoc are
+ * swapped afterwards.
+ * Side effects: overwrites lastRow, lastEdits, lastOffset, lastQueryLoc and lastRefLoc.
+ * @param query Query bases (complemented base-by-base while aligning)
+ * @param ref Reference bases
+ * @param qstart Index in query of the first base compared (the lowest index visited)
+ * @param rstart Index in ref on which the band is centered for the first row
+ * @param maxEdits Scoring stops after a row whose lowest cell exceeds this value; must be less than big
+ * @param exact If false, a base for which AminoAcid.isFullyDefined returns false is scored as a match
+ * @return Edit distance: the lowest score in the last processed row, or the result of
+ * penalizeOffCenter when that flag is set; 0 when there are no rows and assertions are disabled
+ */
+ public int alignReverseRC(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ assert(big>maxEdits);
+ if(verbose){System.err.println("alignReverseRC("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");}
+ if(query.length-qstart>rstart+1){ //More query rows than ref rows available: swap roles
+ int x=alignForwardRC(ref, query, rstart, qstart, maxEdits, exact);
+ int temp=lastQueryLoc; //The delegated call recorded locations with the roles swapped; swap them back
+ lastQueryLoc=lastRefLoc;
+ lastRefLoc=temp;
+ if(verbose){
+ System.out.println("Reversed.");
+ System.out.println("Final state: lastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n");
+ }
+ return x;
+ }
+ int edits=0, row=0;
+ lastRow=-1;
+ lastEdits=0;
+ lastOffset=0;
+
+ final int width=Tools.min(maxWidth, (maxEdits*2)+1, Tools.max(query.length, ref.length)*2+2)|1; //Band width; the |1 forces it to be odd
+ final int halfWidth=width/2;
+ final boolean inexact=!exact;
+
+ int qloc=qstart; //Current query index; incremented once per row
+ int rsloc=rstart-halfWidth; //Lowest ref index covered by the band for the current row; may be negative
+ final int xlines=query.length-qstart; //Query bases available at or above qstart
+ final int ylines=rstart+1; //Ref bases available at or below rstart
+ final int len=Tools.min(xlines, ylines); //Number of rows to process
+ if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);}
+ if(len<1){
+ if(false){ //Disabled: hard failure for the no-overlap case
+ throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ }
+ assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length);
+ return 0;
+ }
+
+ Arrays.fill(array1, 0, Tools.min(width, maxWidth)+1, big); //Resets indices 0..width only. NOTE(review): row 1 can read arrayPrev[width+1]; confirm that slot is initialized elsewhere
+ Arrays.fill(array2, 0, Tools.min(width, maxWidth)+1, big);
+ arrayCurrent=array1;
+ arrayPrev=array2;
+
+ { //First row: no predecessors, so each cell is just the match/mismatch cost
+ if(verbose){System.err.println("\nFirst row.");}
+ final byte q=AminoAcid.baseToComplementExtended[query[qloc]];
+ final int colStart=Tools.max(0, rsloc); //Clip the band to the start of ref
+ final int colLimit=Tools.min(rsloc+width, ref.length); //Clip the band to the end of ref (exclusive)
+ edits=big;
+ int mloc=1+width-(colLimit-rsloc); //Array slot for the highest ref column; slots increase as col decreases
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+ for(int col=colLimit-1; col>=colStart; mloc++, col--){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ final int score=(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score); //Track the row minimum
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; prev=0; score="+score+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ row++; qloc++; rsloc--;
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ for(row=1; row<len; row++, qloc++, rsloc--){
+// if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayCurrent));}
+ arrayTemp=arrayCurrent; //Rotate the two row buffers
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ if(verbose){System.err.println("\nNew row, prev="+Arrays.toString(arrayPrev)+", current="+Arrays.toString(arrayCurrent));}
+ final byte q=AminoAcid.baseToComplementExtended[query[qloc]];
+ final int colStart=Tools.max(0, rsloc);
+ final int colLimit=Tools.min(rsloc+width, ref.length);
+ Arrays.fill(arrayCurrent, big);
+ edits=big;
+ int mloc=1+width-(colLimit-rsloc);
+ boolean forceDiag=(row==len-1); //On the final row only the diagonal move is allowed
+ if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);}
+ for(int col=colLimit-1; col>=colStart; mloc++, col--){
+ if(verbose){System.err.println("col="+col+", mloc="+mloc);}
+ final byte r=ref[col];
+ final int scoreUp=arrayPrev[mloc+1]+1; //Previous row, next slot, plus one edit
+ final int scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1); //Previous row, same slot, plus match/mismatch cost
+ final int scoreLeft=arrayCurrent[mloc-1]+1; //Current row, previous slot, plus one edit
+ final int score=(forceDiag || col==0) ? scoreDiag : Tools.min(scoreUp, scoreDiag, scoreLeft); //Diagonal is also forced at ref column 0
+ if(verbose){System.err.println("prev=min(s["+(mloc-1)+"]="+arrayCurrent[mloc-1]+", p["+(mloc)+"]="+arrayPrev[mloc]+", p["+(mloc+1)+"]="+arrayPrev[mloc+1]+")");}
+ arrayCurrent[mloc]=score;
+ edits=Tools.min(edits, score);
+ if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; up="+scoreUp+"; diag="+scoreDiag+"; left="+scoreLeft+"; scores = "+Arrays.toString(arrayCurrent));}
+ }
+ if(edits>maxEdits){row++; break;} //Early exit; row++ keeps lastRow=row-1 pointing at this row
+ }
+ if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);}
+
+ lastRow=row-1;
+ lastEdits=edits;
+ lastOffset=lastOffset(arrayCurrent, halfWidth);
+ lastQueryLoc=qloc-1; //qloc was incremented past the last processed row
+ lastRefLoc=rsloc+halfWidth+lastOffset+1;
+ while(lastRefLoc<0 || lastQueryLoc>=query.length){lastRefLoc++; lastQueryLoc--;} //Shift both locations together until each is back in range
+ if(verbose){
+ System.out.println("\nFinal state: arrayCurrent="+Arrays.toString(arrayCurrent)+"\nlastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+
+ ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n");
+ }
+ return edits;
+ }
+
+ private final int[] array1;
+ private final int[] array2;
+ private int[] arrayCurrent, arrayPrev, arrayTemp;
+
+}
diff --git a/current/align2/BandedAlignerJNI.java b/current/align2/BandedAlignerJNI.java
new file mode 100755
index 0000000..8669b40
--- /dev/null
+++ b/current/align2/BandedAlignerJNI.java
@@ -0,0 +1,179 @@
+package align2;
+
+import java.io.File;
+import dna.AminoAcid;
+
+/**
+ * @author Jonathan Rood
+ * @date Jul 18, 2014
+ *
+ */
+public class BandedAlignerJNI extends BandedAligner{
+
+ static {
+ String name = "bbtoolsjni";
+ try {
+ System.loadLibrary(name);
+ } catch (UnsatisfiedLinkError e1) {
+ // System.loadLibrary() does not work with MPI.
+ // Need to use System.load() with an explicit full
+ // path to the native library file for the MPI case.
+ boolean success = false;
+ String libpath=System.getProperty("java.library.path");
+ libpath = libpath.replace("-Djava.library.path=","");
+ String[] libpathEntries = libpath.split(File.pathSeparator);
+ for(int i = 0; i < libpathEntries.length; i++) {
+ if(success) break;
+ String lib = libpathEntries[i]+"/"+System.mapLibraryName(name);
+ try {
+ System.load(lib);
+ success = true;
+ } catch (UnsatisfiedLinkError e2) {
+ success = false;
+ if((i+1) >= libpathEntries.length) {
+ System.err.println("Native library can not be found in java.library.path. ");
+ System.exit(1);
+ }
+ }
+ }
+ }
+ }
+
+ private native int alignForwardJNI(byte[] query, byte[] ref, int qstart, int rstart, int maxEdits, boolean exact, int maxWidth, byte[] baseToNumber, int[] returnVals);
+
+ private native int alignForwardRCJNI(byte[] query, byte[] ref, int qstart, int rstart, int maxEdits, boolean exact, int maxWidth, byte[] baseToNumber, byte[] baseToComplementExtended, int[] returnVals);
+
+ private native int alignReverseJNI(byte[] query, byte[] ref, int qstart, int rstart, int maxEdits, boolean exact, int maxWidth, byte[] baseToNumber, int[] returnVals);
+
+ private native int alignReverseRCJNI(byte[] query, byte[] ref, int qstart, int rstart, int maxEdits, boolean exact, int maxWidth, byte[] baseToNumber, byte[] baseToComplementExtended, int[] returnVals);
+
+ public static void main(String[] args){
+ byte[] query=args[0].getBytes();
+ byte[] ref=args[1].getBytes();
+ int qstart=-1;
+ int rstart=-1;
+ int maxedits=big;
+ int width=5;
+ if(args.length>2){qstart=Integer.parseInt(args[2]);}
+ if(args.length>3){rstart=Integer.parseInt(args[3]);}
+ if(args.length>4){maxedits=Integer.parseInt(args[4]);}
+ if(args.length>4){width=Integer.parseInt(args[5]);}
+
+ BandedAlignerJNI ba=new BandedAlignerJNI(width);
+
+ int edits;
+
+ edits=ba.alignForward(query, ref, (qstart==-1 ? 0 : qstart), (rstart==-1 ? 0 : rstart), maxedits, true);
+ System.out.println("Forward: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score());
+ System.out.println("***********************\n");
+//
+// edits=ba.alignForwardRC(query, ref, (qstart==-1 ? query.length-1 : qstart), (rstart==-1 ? 0 : rstart), maxedits, true);
+// System.out.println("ForwardRC: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score());
+// System.out.println("***********************\n");
+
+ edits=ba.alignReverse(query, ref, (qstart==-1 ? query.length-1 : qstart), (rstart==-1 ? ref.length-1 : rstart), maxedits, true);
+ System.out.println("Reverse: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score());
+ System.out.println("***********************\n");
+
+// edits=ba.alignReverseRC(query, ref, (qstart==-1 ? 0 : qstart), (rstart==-1 ? ref.length-1 : rstart), maxedits, true);
+// System.out.println("ReverseRC: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score());
+// System.out.println("***********************\n");
+ }
+
+ public BandedAlignerJNI(int width_){
+ super(width_);
+ assert(big>maxWidth/2);
+ }
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public int alignForward(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ int[] returnVals = new int[5];
+ returnVals[0] = lastQueryLoc;
+ returnVals[1] = lastRefLoc;
+ returnVals[2] = lastRow;
+ returnVals[3] = lastEdits;
+ returnVals[4] = lastOffset;
+ int edits = alignForwardJNI(query,ref,qstart,rstart,maxEdits,exact,maxWidth,AminoAcid.baseToNumber,returnVals);
+ lastQueryLoc = returnVals[0];
+ lastRefLoc = returnVals[1];
+ lastRow = returnVals[2];
+ lastEdits = returnVals[3];
+ lastOffset = returnVals[4];
+ return edits;
+ }
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public int alignForwardRC(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ int[] returnVals = new int[5];
+ returnVals[0] = lastQueryLoc;
+ returnVals[1] = lastRefLoc;
+ returnVals[2] = lastRow;
+ returnVals[3] = lastEdits;
+ returnVals[4] = lastOffset;
+ int edits = alignForwardRCJNI(query,ref,qstart,rstart,maxEdits,exact,maxWidth,AminoAcid.baseToNumber,AminoAcid.baseToComplementExtended,returnVals);
+ lastQueryLoc = returnVals[0];
+ lastRefLoc = returnVals[1];
+ lastRow = returnVals[2];
+ lastEdits = returnVals[3];
+ lastOffset = returnVals[4];
+ return edits;
+ }
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public int alignReverse(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ int[] returnVals = new int[5];
+ returnVals[0] = lastQueryLoc;
+ returnVals[1] = lastRefLoc;
+ returnVals[2] = lastRow;
+ returnVals[3] = lastEdits;
+ returnVals[4] = lastOffset;
+ int edits = alignReverseJNI(query,ref,qstart,rstart,maxEdits,exact,maxWidth,AminoAcid.baseToNumber,returnVals);
+ lastQueryLoc = returnVals[0];
+ lastRefLoc = returnVals[1];
+ lastRow = returnVals[2];
+ lastEdits = returnVals[3];
+ lastOffset = returnVals[4];
+ return edits;
+ }
+
+ /**
+ * @param query
+ * @param ref
+ * @param qstart
+ * @param rstart
+ * @return Edit distance
+ */
+ public int alignReverseRC(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){
+ int[] returnVals = new int[5];
+ returnVals[0] = lastQueryLoc;
+ returnVals[1] = lastRefLoc;
+ returnVals[2] = lastRow;
+ returnVals[3] = lastEdits;
+ returnVals[4] = lastOffset;
+ int edits = alignReverseRCJNI(query,ref,qstart,rstart,maxEdits,exact,maxWidth,AminoAcid.baseToNumber,AminoAcid.baseToComplementExtended,returnVals);
+ lastQueryLoc = returnVals[0];
+ lastRefLoc = returnVals[1];
+ lastRow = returnVals[2];
+ lastEdits = returnVals[3];
+ lastOffset = returnVals[4];
+ return edits;
+ }
+}
diff --git a/current/align2/Blacklist.java b/current/align2/Blacklist.java
new file mode 100755
index 0000000..c0a4317
--- /dev/null
+++ b/current/align2/Blacklist.java
@@ -0,0 +1,101 @@
+package align2;
+
+import java.util.HashSet;
+
+import stream.Read;
+
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 14, 2013
+ *
+ */
+public class Blacklist {
+
+ public static boolean inWhitelist(Read r){
+ return r==null ? false : (inWhitelist2(r) || inWhitelist2(r.mate));
+ }
+
+ private static boolean inWhitelist2(Read r){
+ if(r==null || !r.mapped() || whitelist==null || whitelist.isEmpty()){return false;}
+ byte[] name=r.getScaffoldName(false);
+ return (name!=null && whitelist.contains(new String(name)));
+ }
+
+ public static boolean inBlacklist(Read r){
+ if(r==null){return false;}
+ boolean a=inBlacklist2(r);
+ boolean b=inBlacklist2(r.mate);
+ if(!a && !b){return false;}
+ if(a){
+ return b || r.mate==null || !r.mate.mapped();
+ }
+ return b && !r.mapped();
+ }
+
+ private static boolean inBlacklist2(Read r){
+ if(r==null || !r.mapped() || blacklist==null || blacklist.isEmpty()){return false;}
+ byte[] name=r.getScaffoldName(false);
+ return (name!=null && blacklist.contains(new String(name)));
+ }
+
+ public static void addToBlacklist(String fname){
+ addToSet(fname, true);
+ }
+
+ public static void addToWhitelist(String fname){
+ addToSet(fname, false);
+ }
+
+ public static synchronized int addToSet(String fname, boolean black){
+ final HashSet<String> set;
+ int added=0, overwritten=0;
+ if(black){
+ if(blacklist==null){blacklist=new HashSet<String>(4001);}
+ set=blacklist;
+ }else{
+ if(whitelist==null){whitelist=new HashSet<String>(4001);}
+ set=whitelist;
+ }
+ TextFile tf=new TextFile(fname, false, false);
+ String line=tf.nextLine();
+ if(line==null){return 0;}
+ final boolean fasta=(line.charAt(0)=='>');
+ System.err.println("Detected "+(black ? "black" : "white")+"list file "+fname+" as "+(fasta ? "" : "non-")+"fasta-formatted.");
+ while(line!=null){
+ String key=null;
+ if(fasta){
+ if(line.charAt(0)=='>'){key=new String(line.substring(1));}
+ }else{
+ key=line;
+ }
+ if(key!=null){
+ boolean b=set.add(key);
+ added++;
+ if(!b){
+ if(overwritten==0){
+ System.err.println("Duplicate "+(black ? "black" : "white")+"list key "+key);
+ System.err.println("Subsequent duplicates from this file will not be mentioned.");
+ }
+ overwritten++;
+ }
+ }
+ line=tf.nextLine();
+ }
+ if(overwritten>0){
+ System.err.println("Added "+overwritten+" duplicate keys.");
+ }
+ return added-overwritten;
+ }
+
+ public static boolean hasBlacklist(){return blacklist!=null && !blacklist.isEmpty();}
+ public static boolean hasWhitelist(){return whitelist!=null && !whitelist.isEmpty();}
+
+ public static void clearBlacklist(){blacklist=null;}
+ public static void clearWhitelist(){whitelist=null;}
+
+ private static HashSet<String> blacklist=null;
+ private static HashSet<String> whitelist=null;
+
+}
diff --git a/current/align2/Block.java b/current/align2/Block.java
new file mode 100755
index 0000000..9459968
--- /dev/null
+++ b/current/align2/Block.java
@@ -0,0 +1,171 @@
+package align2;
+
+import java.io.File;
+import java.io.Serializable;
+import java.util.Arrays;
+
+import fileIO.LoadThread;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Dec 23, 2012
+ *
+ */
+public class Block implements Serializable{
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -1638122096023589384L;
+
+ public Block(int numSites_, int numStarts_){
+ numSites=numSites_;
+ numStarts=numStarts_;
+ sites=new int[numSites];
+ starts=new int[numStarts+1];
+ assert(Integer.bitCount(numStarts)==1 && Integer.bitCount(starts.length)==2) : numStarts;
+ }
+
+ public Block(int[] sites_, int[] starts_){
+ sites=sites_;
+ starts=starts_;
+ numSites=sites.length;
+ numStarts=starts.length-1;
+ assert(Integer.bitCount(numStarts)==1 && Integer.bitCount(starts.length)==2) : numStarts;
+ }
+
+ /** For legacy support */
+ public int[] getHitList(int key){
+ int len=length(key);
+ if(len==0){return null;}
+ int start=starts[key];
+ int[] r=Arrays.copyOfRange(sites, start, start+len);
+ return r;
+ }
+
+ /** For legacy support */
+ public int[] getHitList(int start, int stop){
+ int len=length(start, stop);
+ if(len==0){return null;}
+ assert(len>0) : len+", "+start+", "+stop;
+ int[] r=Arrays.copyOfRange(sites, start, start+len);
+ return r;
+ }
+
+ /** For legacy support */
+ public int[][] getHitLists(int[] start, int[] stop){
+ int[][] r=new int[start.length][];
+ for(int i=0; i<start.length; i++){r[i]=getHitList(start[i], stop[i]);}
+ return r;
+ }
+
+ public int length(int key){
+ int x=starts[key+1]-starts[key];
+ if(x==0){return 0;}
+ return sites[starts[key]]!=-1 ? x : 0; //Lists can be removed by making the first site -1.
+ }
+
+ public int length(int start, int stop){
+ if(start==stop || sites[start]==-1){return 0;}
+ return stop-start;
+ }
+
+ public boolean write(String fname, boolean overwrite){
+ String fname2=fname+"2.gz";
+ {
+ File f=new File(fname);
+ if(f.exists()){
+ if(!overwrite){
+ assert(false) : "Tried to overwrite file "+f.getAbsolutePath();
+ return false;
+ }
+ }
+ f=new File(fname2);
+ if(f.exists()){
+ if(!overwrite){
+ assert(false) : "Tried to overwrite file "+f.getAbsolutePath();
+ return false;
+ }
+ }
+ }
+ ReadWrite.writeObjectInThread(sites, fname, allowSubprocess);
+ if(!compress){
+ ReadWrite.writeObjectInThread(starts, fname+"2.gz", allowSubprocess);
+ }else{
+ if(copyOnWrite){
+ final int[] x;
+ x=new int[starts.length];
+ for(int i=1; i<x.length; i++){
+ x[i]=starts[i]-starts[i-1];
+ }
+ ReadWrite.writeObjectInThread(starts, fname+"2.gz", allowSubprocess);
+ }else{
+ compress(starts);
+ ReadWrite.writeAsync(starts, fname+"2.gz", allowSubprocess);
+ decompress(starts);
+ }
+ }
+ return true;
+ }
+
+ private static void compress(int[] x){
+ for(int i=x.length-1; i>0; i--){
+ x[i]=x[i]-x[i-1];
+ }
+ }
+
+ private static void decompress(int[] x){
+ int sum=x[0];
+ for(int i=1; i<x.length; i++){
+ sum+=x[i];
+ x[i]=sum;
+ }
+ }
+
+ public static Block read(String fname){
+ String fname2=fname+"2.gz";
+
+ final int[] a, b;
+ {
+ LoadThread<int[]> lta=LoadThread.load(fname, int[].class);
+ b=ReadWrite.read(int[].class, fname2, false);
+ lta.waitForThisToFinish();
+ a=lta.output;
+ }
+// {
+// LoadThread<int[]> lta=LoadThread.load(fname, int[].class);
+// LoadThread<int[]> ltb=LoadThread.load(fname2, int[].class);
+// lta.waitForThisToFinish();
+// ltb.waitForThisToFinish();
+// a=lta.output;
+// b=ltb.output;
+// }
+
+// int[] a=ReadWrite.read(int[].class, fname);
+// int[] b=ReadWrite.read(int[].class, fname2);
+
+ assert(a!=null && b!=null) : a+", "+b;
+ if(compress){
+ int sum=b[0];
+ for(int i=1; i<b.length; i++){
+ sum+=b[i];
+ b[i]=sum;
+ }
+ }
+ Block r=new Block(a, b);
+ assert(r.sites==a);
+ assert(r.starts==b);
+ return r;
+ }
+
+ public final int numSites;
+ public final int numStarts;
+ public final int[] sites;
+ public final int[] starts;
+
+ private static boolean allowSubprocess=false;
+ private static final boolean compress=true;
+ private static final boolean copyOnWrite=false;
+
+}
diff --git a/current/align2/ChromLoadThread.java b/current/align2/ChromLoadThread.java
new file mode 100755
index 0000000..ce0b3ed
--- /dev/null
+++ b/current/align2/ChromLoadThread.java
@@ -0,0 +1,112 @@
+package align2;
+
+import dna.ChromosomeArray;
+
+/**
+ * @author Brian Bushnell
+ * @date Dec 31, 2012
+ *
+ */
+public class ChromLoadThread extends Thread {
+
+ public static void main(String[] args){
+
+ }
+
+ public ChromLoadThread(String fname_, int id_, ChromosomeArray[] r_){
+ fname=fname_;
+ id=id_;
+ array=r_;
+ }
+
+ public static ChromLoadThread load(String fname, int id, ChromosomeArray[] r){
+ assert(r[id]==null);
+ ChromLoadThread clt=null;
+ if(r[id]==null){
+ increment(1);
+ clt=new ChromLoadThread(fname, id, r);
+ clt.start();
+ }
+ return clt;
+ }
+
+ public static ChromosomeArray[] loadAll(String pattern, int min, int max, ChromosomeArray[] r){
+ if(r==null){r=new ChromosomeArray[max+1];}
+ assert(r.length>=max+1);
+
+ int pound=pattern.lastIndexOf('#');
+ String a=pattern.substring(0, pound);
+ String b=pattern.substring(pound+1);
+
+ ChromLoadThread[] clta=new ChromLoadThread[max];
+ for(int i=min; i<max; i++){
+ String fname=(a+i+b);
+ clta[i]=load(fname, i, r);
+ }
+
+ if(max>=min){ //Load last element in this thread instead of making a new thread.
+ increment(1);
+ r[max]=ChromosomeArray.read(a+max+b);
+ increment(-1);
+ }
+
+ for(int i=min; i<max; i++){
+ while(r[i]==null){
+ synchronized(lock){
+ while(lock[0]>0){
+ try {
+ lock.wait();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ lock.notify();
+ }
+ }
+ }
+ }
+
+ return r;
+ }
+
+ @Override
+ public void run(){
+ try {
+ array[id]=ChromosomeArray.read(fname);
+ } catch (Exception e) {
+ increment(-1);
+ throw new RuntimeException(e);
+ }
+ increment(-1);
+ }
+
+ private static final int increment(int i){
+ int r;
+ synchronized(lock){
+ if(i<=0){
+ lock[0]+=i;
+ lock.notify();
+ }else{
+ while(lock[0]>=MAX_CONCURRENT){
+ try {
+ lock.wait();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ }
+ r=lock[0];
+ }
+ return r;
+ }
+
+ private final int id;
+ private final String fname;
+ private final ChromosomeArray[] array;
+
+ public static final int[] lock=new int[1];
+ public static int MAX_CONCURRENT=Shared.threads();
+
+}
diff --git a/current/align2/CompareSamFiles.java b/current/align2/CompareSamFiles.java
new file mode 100755
index 0000000..da85d6e
--- /dev/null
+++ b/current/align2/CompareSamFiles.java
@@ -0,0 +1,383 @@
+package align2;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import stream.Read;
+import stream.SamLine;
+import stream.SiteScore;
+
+import dna.Data;
+import dna.Parser;
+
+import fileIO.TextFile;
+
+/** Generate a file containing reads mapped correctly in one file and incorrectly in another file. */
+public class CompareSamFiles {
+
+
+ public static void main(String[] args){
+
+ String in1=null;
+ String in2=null;
+ long reads=-1;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+// System.err.println("Processing "+args[i]);
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("path") || a.equals("root")){
+ Data.setPath(b);
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("parsecustom")){
+ parsecustom=Tools.parseBoolean(b);
+ }else if(a.equals("thresh")){
+ THRESH2=Integer.parseInt(b);
+ }else if(a.equals("printerr")){
+ printerr=Tools.parseBoolean(b);
+// }else if(a.equals("ssaha2") || a.equals("subtractleadingclip")){
+// SamLine.SUBTRACT_LEADING_SOFT_CLIP=Tools.parseBoolean(b);
+ }else if(a.equals("blasr")){
+ BLASR=Tools.parseBoolean(b);
+ }else if(a.equals("q") || a.equals("quality") || a.startsWith("minq")){
+ minQuality=Integer.parseInt(b);
+ }else if(in1==null && i==0 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[i]).exists())){
+ in1=args[i];
+ }else if(in2==null && i==1 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[i]).exists())){
+ in2=args[i];
+ }else if(a.equals("reads")){
+ reads=Tools.parseKMG(b);
+ }else if(i==2 && args[i].indexOf('=')<0 && Character.isDigit(a.charAt(0))){
+ reads=Tools.parseKMG(a);
+ }
+ }
+
+ assert(in1!=null) : args[0]+".exists() ? "+new File(args[0]).exists();
+// assert(in2!=null) : args[1]+".exists() ? "+new File(args[1]).exists();
+
+ if(reads<1){
+// assert(false) : "Number of expected reads was not specified. Please add a parameter reads=<number> or disable assertions.";
+ reads=100000;
+ System.err.println("Warning - number of expected reads was not specified.");
+ }
+
+ TextFile tf1=new TextFile(in1, false, false);
+ TextFile tf2=null;
+ if(in2!=null){tf2=new TextFile(in2, false, false);}
+
+ BitSet truePos1=new BitSet((int)reads);
+ BitSet falsePos1=new BitSet((int)reads);
+ BitSet truePos2=new BitSet((int)reads);
+ BitSet falsePos2=new BitSet((int)reads);
+
+ String s=null;
+
+ TextFile tf;
+ {
+ tf=tf1;
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ char c=s.charAt(0);
+ if(c!='@'/* && c!=' ' && c!='\t'*/){
+ SamLine sl=new SamLine(s);
+ if(sl.primary()){
+ Read r=sl.toRead(parsecustom);
+ if(parsecustom && r.originalSite==null){
+ assert(false);
+ System.err.println("Turned off custom parsing.");
+ parsecustom=false;
+ }
+ //System.out.println(r);
+ int type=type(r, sl);
+ int id=(int)r.numericID;
+ if(type==2){truePos1.set(id);}
+ else if(type>2){falsePos1.set(id);}
+ }
+ }
+ }
+ tf.close();
+ }
+ if(tf2!=null){
+ tf=tf2;
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ char c=s.charAt(0);
+ if(c!='@'/* && c!=' ' && c!='\t'*/){
+ SamLine sl=new SamLine(s);
+ if(sl.primary()){
+ Read r=sl.toRead(parsecustom);
+ if(parsecustom && r.originalSite==null){
+ assert(false);
+ System.err.println("Turned off custom parsing.");
+ parsecustom=false;
+ }
+ //System.out.println(r);
+ int type=type(r, sl);
+ int id=(int)r.numericID;
+ if(type==2){truePos2.set(id);}
+ else if(type>2){falsePos2.set(id);}
+ }
+ }
+ }
+ tf.close();
+ }
+
+
+
+ BitSet added=new BitSet((int)reads);
+ {
+ tf=tf1;
+ tf.reset();
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ char c=s.charAt(0);
+ if(c!='@'/* && c!=' ' && c!='\t'*/){
+ SamLine sl=new SamLine(s);
+// assert(false) : s+", "+truePos1.cardinality()+", "+truePos2.cardinality()+", "+falsePos1.cardinality()+", "+falsePos2.cardinality()+", ";
+ if(sl.primary()){
+ Read r=sl.toRead(parsecustom);
+ int id=(int)r.numericID;
+ if(!added.get(id)){
+// if(truePos1.get(id)!=truePos2.get(id) || falsePos1.get(id)!=falsePos2.get(id)){
+// System.out.println(s);
+// added.set(id);
+// }
+// if(falsePos1.get(id) && truePos2.get(id)){
+// System.out.println(s);
+// added.set(id);
+// }
+ if(falsePos1.get(id) && !falsePos2.get(id)){
+ System.out.println(s);
+ added.set(id);
+ }
+ }
+ }
+ }
+ }
+ tf.close();
+ }
+ if(tf2!=null){
+ tf=tf2;
+ tf.reset();
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ char c=s.charAt(0);
+ if(c!='@'/* && c!=' ' && c!='\t'*/){
+ SamLine sl=new SamLine(s);
+ if(sl.primary()){
+ Read r=sl.toRead(parsecustom);
+ int id=(int)r.numericID;
+ if(!added.get(id)){
+// if(truePos1.get(id)!=truePos2.get(id) || falsePos1.get(id)!=falsePos2.get(id)){
+// System.out.println(s);
+// added.set(id);
+// }
+// if(falsePos2.get(id) && truePos1.get(id)){
+// System.out.println(s);
+// added.set(id);
+// }
+ if(falsePos2.get(id) && !falsePos1.get(id)){
+ System.out.println(s);
+ added.set(id);
+ }
+ }
+ }
+ }
+ }
+ tf.close();
+ }
+ }
+
+
+ public static void calcStatistics1(final Read r, SamLine sl){
+
+ int THRESH=0;
+ primary++;
+
+ if(r.discarded()/* || r.mapScore==0*/){
+ discarded++;
+ unmapped++;
+ }else if(r.ambiguous()){
+// assert(r.mapped()) : "\n"+r+"\n"+sl+"\n";
+ if(r.mapped()){mapped++;}
+ ambiguous++;
+ }else if(r.mapScore<1){
+ unmapped++;
+ }else if(r.mapScore<=minQuality){
+ if(r.mapped()){mapped++;}
+ ambiguous++;
+ }else{
+ if(!r.mapped()){
+ unmapped++;
+ }else{
+ mapped++;
+ mappedRetained++;
+
+ if(parsecustom){
+ SiteScore os=r.originalSite;
+ assert(os!=null);
+ if(os!=null){
+ int trueChrom=os.chrom;
+ byte trueStrand=os.strand;
+ int trueStart=os.start;
+ int trueStop=os.stop;
+ SiteScore ss=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0);
+ byte[] originalContig=sl.originalContig();
+ if(BLASR){
+ originalContig=(originalContig==null || Tools.indexOf(originalContig, (byte)'/')<0 ? originalContig :
+ Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/')));
+ }
+ int cstart=sl.originalContigStart();
+
+ boolean strict=isCorrectHit(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH, originalContig, sl.rname(), cstart);
+ boolean loose=isCorrectHitLoose(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH+THRESH2, originalContig, sl.rname(), cstart);
+
+ // if(!strict){
+ // System.out.println(ss+", "+new String(originalContig)+", "+new String(sl.rname()));
+ // assert(false);
+ // }
+
+ // System.out.println("loose = "+loose+" for "+r.toText());
+
+ if(loose){
+ // System.err.println("TPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t"
+ // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop);
+ truePositiveLoose++;
+ }else{
+ // System.err.println("FPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t"
+ // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop);
+ falsePositiveLoose++;
+ }
+
+ if(strict){
+ // System.err.println("TPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop);
+ truePositiveStrict++;
+ }else{
+ // System.err.println("FPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop);
+ falsePositiveStrict++;
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+
+ /**
+  * Grades a single alignment and returns a numeric category.
+  * Note that every call also increments the static primary counter.
+  * @param r Read built from the SAM line
+  * @param sl SAM line; supplies the contig names and the original contig start
+  * @return 0 for discarded, unmapped, or ungradable reads (parsecustom off or no
+  * original site); 1 for ambiguous or low-quality reads; 2 for a strictly
+  * correct hit; 3 for a hit that is only loosely correct; 4 for an incorrect hit
+  */
+ public static int type(final Read r, SamLine sl){
+     final int strictTolerance=0;
+     primary++;
+
+     if(r.discarded()){return 0;}
+     if(r.ambiguous()){return 1;}
+     if(r.mapScore<1){return 0;}
+     if(r.mapScore<=minQuality){return 1;}
+     if(!r.mapped() || !parsecustom){return 0;}
+
+     final SiteScore truth=r.originalSite;
+     assert(truth!=null);
+     if(truth==null){return 0;}
+
+     final SiteScore called=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0);
+     byte[] originalContig=sl.originalContig();
+     if(BLASR && originalContig!=null && Tools.indexOf(originalContig, (byte)'/')>=0){
+         //Keep only the part of the name before the last '/'
+         originalContig=Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/'));
+     }
+     final int cstart=sl.originalContigStart();
+
+     //Both graders are always evaluated, exactly as before
+     final boolean strict=isCorrectHit(called, truth.chrom, truth.strand, truth.start, truth.stop,
+             strictTolerance, originalContig, sl.rname(), cstart);
+     final boolean loose=isCorrectHitLoose(called, truth.chrom, truth.strand, truth.start, truth.stop,
+             strictTolerance+THRESH2, originalContig, sl.rname(), cstart);
+
+     if(strict){return 2;}
+     return loose ? 3 : 4;
+ }
+
+
+
+ /**
+  * Strict grading: the hit must be on the true strand and sequence, and both
+  * its start and stop must lie within thresh of the expected coordinates.
+  * The expected interval begins at cstart and has the length of the true interval.
+  * @param ss Called site
+  * @param trueChrom True chromosome; compared only when originalContig is null
+  * @param trueStrand True strand
+  * @param trueStart True start; used only for the interval length
+  * @param trueStop True stop; used only for the interval length
+  * @param thresh Maximum allowed positional error at each end
+  * @param originalContig Name of the true contig, or null to compare chromosome numbers
+  * @param contig Name of the contig the read was mapped to
+  * @param cstart Expected start of the hit
+  * @return true if the hit is strictly correct
+  */
+ public static boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh,
+         byte[] originalContig, byte[] contig, int cstart){
+     if(ss.strand!=trueStrand){return false;}
+
+     final boolean sameSequence=(originalContig!=null ? Arrays.equals(originalContig, contig) : ss.chrom==trueChrom);
+     if(!sameSequence){return false;}
+
+     assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     final int expectedStop=cstart+trueStop-trueStart;
+     final int startError=absdif(ss.start, cstart);
+     final int stopError=absdif(ss.stop, expectedStop);
+     return startError<=thresh && stopError<=thresh;
+ }
+
+
+ /**
+  * Loose grading: the hit must be on the true strand and sequence, and at
+  * least one of its ends must lie within thresh of the expected coordinate.
+  * The expected interval begins at cstart and has the length of the true interval.
+  * @param ss Called site
+  * @param trueChrom True chromosome; compared only when originalContig is null
+  * @param trueStrand True strand
+  * @param trueStart True start; used only for the interval length
+  * @param trueStop True stop; used only for the interval length
+  * @param thresh Maximum allowed positional error
+  * @param originalContig Name of the true contig, or null to compare chromosome numbers
+  * @param contig Name of the contig the read was mapped to
+  * @param cstart Expected start of the hit
+  * @return true if the hit is loosely correct
+  */
+ public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh,
+         byte[] originalContig, byte[] contig, int cstart){
+     if(ss.strand!=trueStrand){return false;}
+
+     final boolean sameSequence=(originalContig!=null ? Arrays.equals(originalContig, contig) : ss.chrom==trueChrom);
+     if(!sameSequence){return false;}
+
+     assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     final int expectedStop=cstart+trueStop-trueStart;
+     final int startError=absdif(ss.start, cstart);
+     final int stopError=absdif(ss.stop, expectedStop);
+     return startError<=thresh || stopError<=thresh;
+ }
+
+ /** Returns the larger argument minus the smaller one (plain int arithmetic, no overflow check). */
+ private static final int absdif(int a, int b){
+     if(a>b){
+         return a-b;
+     }
+     return b-a;
+ }
+
+ public static int truePositiveStrict=0; //Retained reads accepted by isCorrectHit
+ public static int falsePositiveStrict=0; //Retained reads rejected by isCorrectHit
+
+ public static int truePositiveLoose=0; //Retained reads accepted by isCorrectHitLoose
+ public static int falsePositiveLoose=0; //Retained reads rejected by isCorrectHitLoose
+
+ public static int mapped=0; //Reads reported as mapped, including ambiguous and low-quality ones
+ public static int mappedRetained=0; //Mapped, non-ambiguous reads with mapScore above minQuality
+ public static int unmapped=0; //Discarded reads, reads with mapScore below 1, and reads not flagged as mapped
+
+ public static int discarded=0; //Reads for which r.discarded() is true
+ public static int ambiguous=0; //Reads flagged ambiguous or with mapScore at most minQuality
+
+ public static long lines=0; //NOTE(review): not touched by the methods visible here; presumably counts input lines
+ public static long primary=0; //Incremented once per call to calcStatistics1 and to type
+ public static long secondary=0; //NOTE(review): not touched by the methods visible here; presumably counts non-primary lines
+
+ public static int minQuality=3; //Reads with mapScore at or below this value are counted as ambiguous
+
+ public static boolean parsecustom=true; //Enables grading against r.originalSite; also passed to SamLine.toRead
+ public static boolean printerr=false; //NOTE(review): unused in the visible methods; confirm purpose against main
+
+ public static int THRESH2=20; //Extra positional tolerance used for loose grading
+ public static boolean BLASR=false; //When true, the original contig name is truncated at its last '/'
+
+}
diff --git a/current/align2/CompressString.java b/current/align2/CompressString.java
new file mode 100755
index 0000000..37f1d7a
--- /dev/null
+++ b/current/align2/CompressString.java
@@ -0,0 +1,269 @@
+package align2;
+
+import dna.ChromosomeArray;
+import dna.Data;
+
+public class CompressString {
+
+ /**
+  * Ad-hoc test driver.  Compresses args[0] three different ways, but only the
+  * last result (compressRepeatsUltra with periods 1 to 3) is kept and printed.
+  * The program then calls System.exit(0), so the chromosome benchmark that
+  * follows never runs.
+  * @param args args[0] is the string to compress
+  */
+ public static void main(String[] args){
+ 
+ String s;
+ 
+ //Each assignment overwrites the previous one; only the last value is printed
+ s=compressRepeats(args[0].getBytes(), 1);
+ s=compress(args[0]);
+ s=compressRepeatsUltra(args[0].getBytes(), 1, 3, null);
+ System.out.println(args[0]+"\n"+s);
+ 
+ //Everything below this call is never executed
+ System.exit(0);
+ 
+ ChromosomeArray cha=Data.getChromosome(1);
+ byte[] bytes=cha.array;
+ 
+ //Count the bytes that are not 'N'
+ int letters=0;
+ for(int i=0; i<bytes.length; i++){if(bytes[i]!='N'){letters++;}}
+ 
+ System.out.println("cha bytes length = "+bytes.length);
+ System.out.println("cha letters length = "+letters);
+ System.out.println("min="+cha.minIndex+", max="+cha.maxIndex+", length="+(cha.maxIndex-cha.minIndex+1));
+ 
+ //Report the compressed length for several (minPeriod, maxPeriod) settings
+ s=compressRepeatsUltra(bytes, 1, 1, null);
+ System.out.println("compress(1) length: "+s.length());
+ 
+ s=compressRepeatsUltra(bytes, 2, 2, null);
+ System.out.println("compress(2) length: "+s.length());
+ 
+ s=compressRepeatsUltra(bytes, 3, 3, null);
+ System.out.println("compress(3) length: "+s.length());
+ 
+ s=compressRepeatsUltra(bytes, 1, 2, null);
+ System.out.println("compress(1,2) length: "+s.length());
+ 
+ s=compressRepeatsUltra(bytes, 1, 3, null);
+ System.out.println("compress(1,3) length: "+s.length());
+ 
+// s=compressRepeatsMultiperiod(bytes, 1, 1, null);
+// System.out.println("compress(1) length: "+s.length());
+// 
+// s=compressRepeatsMultiperiod(bytes, 2, 2, null);
+// System.out.println("compress(2) length: "+s.length());
+// 
+// s=compressRepeatsMultiperiod(bytes, 3, 3, null);
+// System.out.println("compress(3) length: "+s.length());
+// 
+// s=compressRepeatsMultiperiod(bytes, 1, 2, null);
+// System.out.println("compress(1,2) length: "+s.length());
+// 
+// s=compressRepeatsMultiperiod(bytes, 1, 3, null);
+// System.out.println("compress(1,3) length: "+s.length());
+ }
+
+ /**
+  * Applies compressRepeats three times in sequence, with periods 1, 2 and 3,
+  * feeding each pass the output of the previous one.
+  * @param s String to compress
+  * @return The result of the period-3 pass
+  */
+ public static String compress(String s){
+     String result=s;
+     for(int period=1; period<=3; period++){
+         result=compressRepeats(result.getBytes(), period);
+     }
+     return result;
+ }
+
+ public static String compressRepeats(byte[] array, int period){
+
+ StringBuilder sb=new StringBuilder(array.length);
+
+ for(int base=0; base<array.length; base++){
+ //Test for repeats of current pattern (array[i] to array[period-1])
+ int repeats=countRepeats(array, base, period);
+ int occurances=repeats+1;
+
+// System.out.println("base = "+base+"\t, repeats = "+repeats);
+
+ if(repeats==0){
+ //Advance pointer by 1
+ sb.append((char)array[base]);
+ }else if(repeats==1){
+ //Still advance pointer by 1
+ sb.append((char)array[base]);
+ }else if(repeats==2){
+ //Jump ahead
+ base+=period-1;
+ }else{
+ //Compress and advance pointer by a factor of period
+
+ int log=(32-Integer.numberOfLeadingZeros(repeats+1))-1; //+1 is optional; gives lower compression
+// System.out.println("repeats="+repeats+
+// ", Integer.highestOneBit("+repeats+")="+Integer.highestOneBit(repeats)+
+// ", log="+log+", "+Integer.toBinaryString(repeats));
+ assert(log>0 && log<=31);
+
+
+ //Append
+ for(int i=1; i<log; i++){
+ for(int j=0; j<period; j++){
+ sb.append((char)array[base+j]);
+ }
+ }
+
+ base=base+(period*(repeats))-1;
+
+ }
+
+ }
+
+ return sb.toString();
+ }
+
+ public static String compressRepeatsMultiperiod(byte[] array, int minPeriod, int maxPeriod, IntList list){
+
+ StringBuilder sb=new StringBuilder(array.length);
+
+ for(int base=0; base<array.length; base++){
+ //Test for repeats of current pattern (array[i] to array[period-1])
+
+
+ int period=0;
+ int repeats=0;
+// for(int x=maxPeriod; x>=minPeriod; x--){
+// int temp=countRepeats(array, base, x);
+// if(temp>1){
+// repeats=temp;
+// period=x;
+// break;
+// }
+// }
+ for(int x=minPeriod; x<=maxPeriod; x++){
+ int temp=countRepeats(array, base, x);
+ if(temp>1){
+ repeats=temp;
+ period=x;
+ break;
+ }
+ }
+ int occurances=repeats+1;
+
+// System.out.println("base = "+base+"\t, repeats = "+repeats+"\t, period = "+period);
+
+ if(repeats==0){
+ //Advance pointer by 1
+ sb.append((char)array[base]);
+ if(list!=null){list.add(base);}
+ }else if(repeats==1){
+ //Still advance pointer by 1
+ sb.append((char)array[base]);
+ if(list!=null){list.add(base);}
+ }
+// else if(repeats==2){
+// for(int j=0; j<period; j++){
+// sb.append((char)array[base+j]);
+// }
+// //Jump ahead
+// base+=2*period-1;
+// }
+ else{
+ //Compress and advance pointer by a factor of period
+
+ int log=(32-Integer.numberOfLeadingZeros(repeats+1))-1; //+1 is optional; gives lower compression
+// System.out.println("repeats="+repeats+
+// ", Integer.highestOneBit("+repeats+")="+Integer.highestOneBit(repeats)+
+// ", log="+log+", "+Integer.toBinaryString(repeats));
+ assert(log>0 && log<=31);
+
+
+ //Append
+ for(int i=0; i<log; i++){
+ for(int j=0; j<period; j++){
+ sb.append((char)array[base+j]);
+ if(list!=null){list.add(base+j);}
+ }
+ }
+
+ base=base+(period*(repeats))-1;
+
+ }
+
+ }
+
+ return sb.toString();
+ }
+
+ public static String compressRepeatsUltra(byte[] array, int minPeriod, int maxPeriod, IntList list){
+
+ StringBuilder sb=new StringBuilder(array.length);
+
+ for(int base=0; base<array.length; base++){
+ //Test for repeats of current pattern (array[i] to array[period-1])
+
+
+ int period=0;
+ int repeats=0;
+
+ for(int x=minPeriod; x<=maxPeriod; x++){
+ int temp=countRepeats(array, base, x);
+// System.out.println("*** temp="+temp+" for "+base+", "+x);
+ if(temp>1){
+ repeats=temp;
+ period=x;
+ break;
+ }
+ }
+// System.err.println(repeats);
+ if(repeats==0){
+ //Advance pointer by 1
+ sb.append((char)array[base]);
+ if(list!=null){list.add(base);}
+ }else if(repeats==1){
+ //Still advance pointer by 1
+ sb.append((char)array[base]);
+ if(list!=null){list.add(base);}
+
+// System.err.println(base);
+ base=base+(period*(repeats))-1;
+// System.err.println(base);
+ }
+// else if(repeats==2){
+// for(int j=0; j<period; j++){
+// sb.append((char)array[base+j]);
+// }
+// //Jump ahead
+// base+=2*period-1;
+// }
+ else{
+ //Compress and advance pointer
+
+ //Append
+ for(int j=0; j<period; j++){
+ sb.append((char)array[base+j]);
+ if(list!=null){list.add(base+j);}
+ }
+
+ base=base+(period*(repeats))-1;
+
+ }
+
+ }
+
+ return sb.toString();
+ }
+
+ public static int countRepeats(byte[] array, int base, int period){
+
+ int max=array.length-period+1;
+
+ int matches=0;
+ boolean fail=false;
+ for(int loc=base+period; loc<max && !fail; loc+=period){
+ for(int i=0; i<period && !fail; i++){
+ if(array[base+i]==array[loc+i]){
+ matches++;
+// System.out.println("base = "+base+", loc = "+loc+", period = "+period+", and "+(base+i)+" == "+(loc+i));
+ }else{
+// System.err.println("failed");
+ fail=true;
+ }
+ }
+ }
+// System.err.println("matches = "+matches);
+ int repeats=matches/period;
+// System.err.println("repeats = "+repeats);
+ return repeats;
+ }
+
+
+}
diff --git a/current/align2/Evaluate.java b/current/align2/Evaluate.java
new file mode 100755
index 0000000..2aaaf38
--- /dev/null
+++ b/current/align2/Evaluate.java
@@ -0,0 +1,56 @@
+package align2;
+
+import dna.Data;
+import dna.Gene;
+import fileIO.TextFile;
+
+public class Evaluate {
+
+    /**
+     * Reads a tab-delimited results file and prints accuracy statistics.
+     * @param args args[0] is the results file; the optional args[1] is the
+     * number of trials, which defaults to the number of lines in the file
+     */
+    public static void main(String[] args){
+        final TextFile tf=new TextFile(args[0], false, false);
+        final String[] lines=tf.toStringLines();
+        tf.close();
+
+        final int trials=(args.length>1 ? Integer.parseInt(args[1]) : lines.length);
+
+        int correct=0;
+        for(int i=0; i<lines.length; i++){
+            if(isCorrect(lines[i])){correct++;}
+        }
+
+        final int incorrect=trials-correct;
+        final int falsePositive=lines.length-correct;
+        final int falseNegative=trials-lines.length;
+
+        Data.sysout.println("Trials: \t"+trials);
+        Data.sysout.println("Correct: \t"+correct+"\t"+percentOf(correct, trials));
+        Data.sysout.println("Incorrect: \t"+incorrect+"\t"+percentOf(incorrect, trials));
+        Data.sysout.println("False Positive:\t"+falsePositive+"\t"+percentOf(falsePositive, trials));
+        Data.sysout.println("False Negative:\t"+falseNegative+"\t"+percentOf(falseNegative, trials));
+
+    }
+
+    /** Formats count as a percentage of trials with four decimals and a trailing percent sign. */
+    private static String percentOf(int count, int trials){
+        return String.format("%.4f",count*100f/trials)+"%";
+    }
+
+    /**
+     * Decides whether one result line reports the right location.
+     * The first tab-delimited field is split on underscores; its parts 1, 2 and 3
+     * hold the true chromosome, strand and location.  Fields 1, 2 and 3 of the
+     * line hold the called strand, chromosome and location.
+     * @param s One line of the results file
+     * @return true if chromosome and strand match and the locations differ by less than 1000
+     */
+    public static boolean isCorrect(String s){
+        final String[] fields=s.split("\t");
+        final String[] truth=fields[0].split("_");
+
+        final int trueChrom=Gene.toChromosome(truth[1]);
+        final byte trueStrand=Byte.parseByte(truth[2]);
+        final int trueLoc=Integer.parseInt(truth[3]);
+
+        final int calledChrom=Gene.toChromosome(fields[2]);
+        final byte calledStrand=Gene.toStrand(fields[1]);
+        final int calledLoc=Integer.parseInt(fields[3]);
+
+        if(trueChrom!=calledChrom || trueStrand!=calledStrand){return false;}
+        return Math.abs(trueLoc-calledLoc)<1000;
+    }
+
+}
diff --git a/current/align2/GapTools.java b/current/align2/GapTools.java
new file mode 100755
index 0000000..fbf023a
--- /dev/null
+++ b/current/align2/GapTools.java
@@ -0,0 +1,211 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.SiteScore;
+
+public class GapTools {
+
+ /**
+  * Repairs the gap array of a site using Shared.MINGAP as the minimum gap,
+  * stores the result back into ss.gaps, and returns it.
+  * @param ss Site whose gaps should be fixed
+  * @return The new value of ss.gaps (possibly null)
+  */
+ public static int[] fixGaps(SiteScore ss){
+     ss.gaps=fixGaps(ss.start(), ss.stop(), ss.gaps, Shared.MINGAP);
+     return ss.gaps;
+ }
+
+ /**
+  * Renders a gap array as its values joined by '~'.
+  * @param gaps Array to render
+  * @return The joined text, or null if gaps is null
+  */
+ public static String toString(int[] gaps){
+     if(gaps==null){return null;}
+     final StringBuilder text=new StringBuilder();
+     String separator="";
+     for(int value : gaps){
+         text.append(separator).append(value);
+         separator="~";
+     }
+     return text.toString();
+ }
+
+ /**
+  * Forces a gap array to be consistent with the interval [a, b].
+  * The array is modified in place: its first and last entries are set to a
+  * and b, every entry is limited to [a, b], and the sequence is made
+  * non-decreasing.  If that leaves any (start, stop) pair with equal values,
+  * the result is rebuilt by fixGaps2.
+  * @param a Start of the site
+  * @param b Stop of the site; must be greater than a
+  * @param gaps Alternating start/stop coordinates (at least 4 entries), or null
+  * @param minGap Passed through to fixGaps2
+  * @return null if gaps is null or Tools.overlap reports no overlap with [a, b];
+  * otherwise the input array itself or whatever fixGaps2 returns
+  */
+ public static int[] fixGaps(int a, int b, int[] gaps, int minGap){
+// System.err.println("fixGaps Input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap);
+// assert(false) : "fixGaps called!";
+ if(verbose){System.err.println("fixGaps a: "+Arrays.toString(gaps));}
+ assert(b>a);
+ if(gaps==null){return null;}
+ assert(gaps.length>=4);
+ if(verbose){System.err.println("fixGaps b: "+Arrays.toString(gaps));}
+ 
+ int g0=gaps[0];
+ int gN=gaps[gaps.length-1];
+ if(!Tools.overlap(a, b, g0, gN)){return null;}
+ 
+ //Pin the endpoints and pull every coordinate into [a, b], counting modifications
+ int changed=0;
+ if(gaps[0]!=a){gaps[0]=a; changed++;}
+ if(gaps[gaps.length-1]!=b){gaps[gaps.length-1]=b; changed++;}
+ for(int i=0; i<gaps.length; i++){
+ if(gaps[i]<a){gaps[i]=a; changed++;}
+ else if(gaps[i]>b){gaps[i]=b; changed++;}
+ }
+ 
+ if(verbose){System.err.println("fixGaps c0: "+Arrays.toString(gaps));}
+ 
+ //Make the sequence non-decreasing by raising any entry below its predecessor
+ for(int i=1; i<gaps.length; i++){
+ if(gaps[i-1]>gaps[i]){gaps[i]=gaps[i-1]; changed++;}
+ }
+ 
+ //Nothing was modified, so the input was already consistent
+ if(changed==0){return gaps;}
+ 
+ if(verbose){System.err.println("fixGaps c1: "+Arrays.toString(gaps));}
+ 
+ gaps[0]=a;
+ gaps[gaps.length-1]=b;
+ if(verbose){System.err.println("fixGaps d: "+Arrays.toString(gaps));}
+ 
+ //Count (start, stop) pairs that have collapsed to a single point.
+ //NOTE(review): reads gaps[i+1], so this assumes gaps.length is even - confirm with callers
+ int remove=0;
+ for(int i=0; i<gaps.length; i+=2){
+ gaps[i]=Tools.constrict(gaps[i], a, b);
+ gaps[i+1]=Tools.constrict(gaps[i+1], a, b);
+ if(gaps[i]==gaps[i+1]){remove++;}
+ }
+ if(verbose){System.err.println("fixGaps e: "+Arrays.toString(gaps));}
+ if(remove==0){return gaps;}
+ if(verbose){System.err.println("fixGaps f: "+Arrays.toString(gaps));}
+ 
+ return fixGaps2(a, b, gaps, minGap);
+ }
+
+ /**
+  * Convenience overload that takes the start, stop and gaps from a site.
+  * This may have some off-by-one errors...
+  */
+ public static final int calcGrefLen(SiteScore ss){
+     final int start=ss.start();
+     final int stop=ss.stop();
+     return calcGrefLen(start, stop, ss.gaps);
+ }
+
+ /**
+  * Length of the interval [a, b] after every gap between consecutive segments
+  * has been shortened: each gap symbol counted by calcNumGapSymbols removes
+  * Shared.GAPLEN-1 positions.
+  * This may have some off-by-one errors...
+  * @param a Start coordinate
+  * @param b Stop coordinate
+  * @param gaps Alternating segment start/stop coordinates, or null for no gaps
+  * @return The shortened length
+  */
+ public static final int calcGrefLen(int a, int b, int[] gaps){
+     int total=b-a+1;
+     if(gaps==null){return total;}
+
+     //A gap runs from the stop of one segment to the start of the next
+     for(int stopIdx=1; stopIdx+1<gaps.length; stopIdx+=2){
+         final int symbols=calcNumGapSymbols(gaps[stopIdx], gaps[stopIdx+1]);
+         total-=symbols*(Shared.GAPLEN-1);
+     }
+
+     assert(total>0) : "total="+total+", a="+a+", b="+b+", gaps="+Arrays.toString(gaps);
+     return total;
+ }
+
+ /**
+  * Like calcGrefLen(int, int, int[]), but additionally adds Shared.GAPBUFFER2
+  * for every gap between consecutive segments.
+  * TODO: Verify.
+  * @param a Start coordinate
+  * @param b Stop coordinate
+  * @param gaps Alternating segment start/stop coordinates, or null for no gaps
+  * @return The computed buffer length
+  */
+ public static final int calcBufferNeeded(int a, int b, int[] gaps){
+     int total=b-a+1;
+     if(gaps==null){return total;}
+
+     //A gap runs from the stop of one segment to the start of the next
+     for(int stopIdx=1; stopIdx+1<gaps.length; stopIdx+=2){
+         final int symbols=calcNumGapSymbols(gaps[stopIdx], gaps[stopIdx+1]);
+         total=total-symbols*(Shared.GAPLEN-1)+Shared.GAPBUFFER2;
+     }
+
+     assert(total>0) : a+", "+b+", "+Arrays.toString(gaps);
+     return total;
+ }
+
+ /**
+  * Encoded length of the gap between a and b.  Gaps shorter than
+  * Shared.MINGAP keep their length; longer ones cost Shared.GAPBUFFER2 plus
+  * the quotient and the remainder of the rest divided by Shared.GAPLEN.
+  * TODO: Verify.
+  * @param a Start of the gap
+  * @param b End of the gap; must be greater than a
+  * @return The encoded gap length
+  */
+ public static int calcGapLen(int a, int b){
+     assert(b>a);
+     final int span=b-a;
+     if(span<Shared.MINGAP){return span;}
+
+     final int remainder=span-Shared.GAPBUFFER2;
+     final int symbols=remainder/Shared.GAPLEN;
+     final int leftover=remainder%Shared.GAPLEN;
+     return Shared.GAPBUFFER2+(symbols+leftover);
+ }
+
+ /**
+  * Number of whole gap symbols in the gap between a and b, after
+  * Shared.GAPBUFFER2 has been set aside.  Never negative.
+  * @param a Start of the gap
+  * @param b End of the gap; must be greater than a
+  * @return The number of gap symbols
+  */
+ public static int calcNumGapSymbols(int a, int b){
+     assert(b>a);
+     final int usable=b-a-Shared.GAPBUFFER2;
+     final int symbols=usable/Shared.GAPLEN;
+     return Tools.max(0, symbols);
+ }
+
+ /**
+  * Rebuilds a gap array by merging neighbouring segments that are separated
+  * by minGap or less.
+  * @param a Start of the site (used here only in the verbose trace)
+  * @param b Stop of the site (used here only in the verbose trace)
+  * @param gaps Alternating segment start/stop coordinates
+  * @param minGap Segments this close together, or closer, are merged
+  * @return null if fewer than two segments remain; otherwise the segments as a
+  * flat array, reusing the input array when the length is unchanged
+  */
+ public static final int[] fixGaps2(int a, int b, int[] gaps, int minGap){
+ if(verbose){System.err.println("Input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap);}
+ ArrayList<Range> list=toList(gaps);
+ if(verbose){System.err.println("Before fixing: "+list);}
+ assert(list.size()>1);
+ for(int i=1; i<list.size(); i++){
+ Range r1=list.get(i-1);
+ Range r2=list.get(i);
+ 
+ if(verbose){
+ System.err.println("\nRound "+i);
+ System.err.println("r1="+r1);
+ System.err.println("r2="+r2);
+ }
+ 
+ if(r1!=null){
+ if(r2.a-r1.b<=minGap){
+ //Absorb r1 into r2 and blank out r1's slot
+ r2.a=Tools.min(r1.a, r2.a);
+ r2.b=Tools.max(r1.b, r2.b);
+ list.set(i-1, null);
+ }
+ }
+ 
+ if(verbose){
+ System.err.println("->");
+ System.err.println(list.get(i-1));
+ System.err.println(list.get(i));
+ }
+ 
+ }
+ if(verbose){System.err.println("After fixing: "+list);}
+ //NOTE(review): presumably removes the null slots left by merging - confirm in Tools
+ Tools.condenseStrict(list);
+ if(verbose){System.err.println("After condensing: "+list);}
+ 
+ if(list.size()<2){return null;}
+ 
+ //Reuse the input array when the number of segments did not change
+ int[] gaps2;
+ if(gaps.length==list.size()*2){
+ gaps2=gaps;
+ }else{
+ gaps2=new int[list.size()*2];
+ }
+ for(int i=0, j=0; i<list.size(); i++, j+=2){
+ Range r=list.get(i);
+ gaps2[j]=r.a;
+ gaps2[j+1]=r.b;
+ }
+ if(verbose){System.err.println("Final gaps: "+Arrays.toString(gaps2));}
+ return gaps2;
+ }
+
+ public static final ArrayList<Range> toList(int[] gaps){
+ ArrayList<Range> list=new ArrayList<Range>(gaps.length/2);
+ for(int i=0; i<gaps.length; i+=2){list.add(new Range(gaps[i], gaps[i+1]));}
+ return list;
+ }
+
+ /**
+  * A closed interval [a, b] of int coordinates, ordered by a and then by b.
+  * Two ranges are equal exactly when both endpoints are equal.
+  */
+ public static class Range implements Comparable<Range>{
+
+     /**
+      * @param a_ Start coordinate
+      * @param b_ Stop coordinate; must not be less than a_
+      */
+     public Range(int a_, int b_){
+         assert(b_>=a_);
+         a=a_;
+         b=b_;
+     }
+
+     /**
+      * Orders by start, then by stop.  Uses comparisons rather than
+      * subtraction so that widely separated coordinates cannot overflow and
+      * flip the sign.
+      */
+     public int compareTo(Range r){
+         if(a!=r.a){return a<r.a ? -1 : 1;}
+         if(b!=r.b){return b<r.b ? -1 : 1;}
+         return 0;
+     }
+
+     @Override
+     public String toString(){
+         return "("+a+","+b+")";
+     }
+
+     /** Returns false, instead of throwing, for null and for objects of other types. */
+     @Override
+     public boolean equals(Object other){
+         return (other instanceof Range) && equals((Range)other);
+     }
+
+     /** Returns true if other is non-null and has the same endpoints. */
+     public boolean equals(Range other){
+         return other!=null && a==other.a && b==other.b;
+     }
+
+     /** Consistent with equals, so ranges behave correctly in hash-based collections. */
+     @Override
+     public int hashCode(){
+         return 31*a+b;
+     }
+
+     public int a;
+     public int b;
+ }
+
+ public static boolean verbose=false; //When true, fixGaps and fixGaps2 trace their intermediate state to System.err
+
+}
diff --git a/current/align2/GradeSamFile.java b/current/align2/GradeSamFile.java
new file mode 100755
index 0000000..34771af
--- /dev/null
+++ b/current/align2/GradeSamFile.java
@@ -0,0 +1,364 @@
+package align2;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import dna.Parser;
+
+import stream.Read;
+import stream.SamLine;
+import stream.SiteScore;
+
+import fileIO.FileFormat;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+public class GradeSamFile {
+
+
+ public static void main(String[] args){
+
+ String in=null, outl=null, outs=null;
+ long reads=-1;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in=b;
+ }else if(a.equals("reads")){
+ reads=Tools.parseKMG(b);
+ }else if(a.equals("parsecustom")){
+ parsecustom=Tools.parseBoolean(b);
+ }else if(a.equals("thresh")){
+ THRESH2=Integer.parseInt(b);
+ }else if(a.equals("printerr")){
+ printerr=Tools.parseBoolean(b);
+// }else if(a.equals("ssaha2") || a.equals("subtractleadingclip")){
+// SamLine.SUBTRACT_LEADING_SOFT_CLIP=Tools.parseBoolean(b);
+ }else if(a.equals("blasr")){
+ BLASR=Tools.parseBoolean(b);
+ }else if(a.equals("q") || a.equals("quality") || a.startsWith("minq")){
+ minQuality=Integer.parseInt(b);
+ }else if(a.equals("bitset")){
+ USE_BITSET=Tools.parseBoolean(b);
+ }else if(a.equals("outloose") || a.equals("outl")){
+ outl=b;
+ }else if(a.equals("outstrict") || a.equals("outs")){
+ outs=b;
+ }else if(i==0 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[0]).exists())){
+ in=args[0];
+ }else if(i==1 && args[i].indexOf('=')<0 && Character.isDigit(a.charAt(0))){
+ reads=Tools.parseKMG(a);
+ }else{
+ throw new RuntimeException("Unknown parameter "+arg);
+ }
+ }
+
+ if(outl!=null){
+ ffLoose=FileFormat.testOutput(outl, FileFormat.SAM, null, false, true, false, true);
+ tswLoose=new TextStreamWriter(ffLoose);
+ tswLoose.start();
+ }
+
+ if(outs!=null){
+ ffStrict=FileFormat.testOutput(outs, FileFormat.SAM, null, false, true, false, true);
+ tswStrict=new TextStreamWriter(ffStrict);
+ tswStrict.start();
+ }
+
+ if(USE_BITSET){
+ int x=400000;
+ if(reads>0 && reads<=Integer.MAX_VALUE){x=(int)reads;}
+ try {
+ seen=new BitSet(x);
+ } catch (Throwable e) {
+ seen=null;
+ e.printStackTrace();
+ System.out.println("Did not have enough memory to allocate bitset; duplicate mappings will not be detected.");
+ }
+ }
+
+ assert(in!=null) : args[0]+".exists() ? "+new File(args[0]).exists();
+
+ if(reads<1){
+ assert(false) : "Number of expected reads was not specified. Please add a parameter reads=<number> or disable assertions.";
+ System.err.println("Warning - number of expected reads was not specified.");
+ }
+
+ TextFile tf=new TextFile(in, false, false);
+
+ String s=null;
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ char c=s.charAt(0);
+// System.out.println(s);
+ if(c!='@'/* && c!=' ' && c!='\t'*/){
+ SamLine sl=new SamLine(s);
+ lines++;
+
+ int id=(parsecustom && seen!=null ? ((((int)sl.parseNumericId())<<1)|sl.pairnum()) : (int)lines);
+// System.out.println(sl.parseNumericId()+", "+sl.pairnum()+", "+id+"");
+// if(id%500==10){assert(false);}
+ if(sl.primary() && (!parsecustom || seen==null || !seen.get(id))){
+ Read r=sl.toRead(parsecustom);
+ if(seen!=null){seen.set(id);}
+ if(parsecustom && r.originalSite==null){
+ assert(false);
+ System.err.println("Turned off custom parsing.");
+ parsecustom=false;
+ }
+ //System.out.println(r);
+ calcStatistics1(r, sl);
+ }else{
+ secondary++;
+ }
+ }
+ }
+
+ if(tswLoose!=null){tswLoose.poisonAndWait();}
+ if(tswStrict!=null){tswStrict.poisonAndWait();}
+
+ if(reads<-1){reads=primary;}
+
+ double tmult=100d/reads;
+
+ double mappedB=mapped*tmult;
+ double retainedB=mappedRetained*tmult;
+ double truePositiveStrictB=truePositiveStrict*tmult;
+ double falsePositiveStrictB=falsePositiveStrict*tmult;
+ double truePositiveLooseB=truePositiveLoose*tmult;
+ double falsePositiveLooseB=falsePositiveLoose*tmult;
+ double falseNegativeB=(reads-mapped)*tmult;
+ double discardedB=discarded*tmult;
+ double ambiguousB=ambiguous*tmult;
+
+ System.out.println();
+ System.out.println("Mapping Statistics for "+args[0]+":");
+ System.out.println("primary alignments: \t"+primary+" found of "+reads+" expected");
+ System.out.println("secondary alignments: \t"+secondary+" found");
+ System.out.println(String.format("mapped: \t"+(mappedB<10?" ":"")+"%.3f", mappedB)+"%");
+ System.out.println(String.format("retained: \t"+(retainedB<10?" ":"")+"%.3f", retainedB)+"%");
+ System.out.println(String.format("discarded: \t"+(discardedB<10?" ":"")+"%.3f", discardedB)+"%");
+ System.out.println(String.format("ambiguous: \t"+(ambiguousB<10?" ":"")+"%.3f", ambiguousB)+"%");
+ if(parsecustom){
+ System.out.println();
+ System.out.println("Strict correctness (both ends exactly correct):");
+ System.out.println(String.format("true positive: \t"+(truePositiveStrictB<10?" ":"")+"%.3f", truePositiveStrictB)+"%");
+ System.out.println(String.format("false positive: \t"+(falsePositiveStrictB<10?" ":"")+"%.3f", falsePositiveStrictB)+"%");
+ System.out.println();
+ System.out.println("Loose correctness (one end approximately correct):");
+ System.out.println(String.format("true positive: \t"+(truePositiveLooseB<10?" ":"")+"%.3f", truePositiveLooseB)+"%");
+ System.out.println(String.format("false positive: \t"+(falsePositiveLooseB<10?" ":"")+"%.3f", falsePositiveLooseB)+"%");
+ }
+ System.out.println();
+ System.out.println(String.format("false negative: \t"+(falseNegativeB<10?" ":"")+"%.3f", falseNegativeB)+"%");
+
+ if(printerr){
+ System.err.println();
+ System.err.println("Mapping Statistics for "+args[0]+":");
+ System.err.println("primary alignments: \t"+primary+" found of "+reads+" expected");
+ System.err.println("secondary alignments: \t"+secondary+" found");
+ System.err.println(String.format("mapped: \t"+(mappedB<10?" ":"")+"%.3f", mappedB)+"%");
+ System.err.println(String.format("retained: \t"+(retainedB<10?" ":"")+"%.3f", retainedB)+"%");
+ System.err.println(String.format("discarded: \t"+(discardedB<10?" ":"")+"%.3f", discardedB)+"%");
+ System.err.println(String.format("ambiguous: \t"+(ambiguousB<10?" ":"")+"%.3f", ambiguousB)+"%");
+ if(parsecustom){
+ System.err.println();
+ System.err.println("Strict correctness (both ends exactly correct):");
+ System.err.println(String.format("true positive: \t"+(truePositiveStrictB<10?" ":"")+"%.3f", truePositiveStrictB)+"%");
+ System.err.println(String.format("false positive: \t"+(falsePositiveStrictB<10?" ":"")+"%.3f", falsePositiveStrictB)+"%");
+ System.err.println();
+ System.err.println("Loose correctness (one end approximately correct):");
+ System.err.println(String.format("true positive: \t"+(truePositiveLooseB<10?" ":"")+"%.3f", truePositiveLooseB)+"%");
+ System.err.println(String.format("false positive: \t"+(falsePositiveLooseB<10?" ":"")+"%.3f", falsePositiveLooseB)+"%");
+ }
+ System.err.println();
+ System.err.println(String.format("false negative: \t"+(falseNegativeB<10?" ":"")+"%.3f", falseNegativeB)+"%");
+ }
+
+
+ }
+
+
+ public static void calcStatistics1(final Read r, SamLine sl){
+
+ int THRESH=0;
+ primary++;
+
+ if(r.discarded()/* || r.mapScore==0*/){
+ discarded++;
+ unmapped++;
+ }else if(r.ambiguous()){
+// assert(r.mapped()) : "\n"+r+"\n"+sl+"\n";
+ if(r.mapped()){mapped++;}
+ ambiguous++;
+ }else if(r.mapScore<1){
+ unmapped++;
+ }else if(r.mapScore<=minQuality){
+ if(r.mapped()){mapped++;}
+ ambiguous++;
+ }else{
+ if(!r.mapped()){
+ unmapped++;
+ }else{
+ mapped++;
+ mappedRetained++;
+
+ if(parsecustom){
+ SiteScore os=r.originalSite;
+// System.out.println("A1: "+os);
+ assert(os!=null);
+ if(os!=null){
+ final int trueChrom=os.chrom;
+ final byte trueStrand=os.strand;
+ final int trueStart=os.start;
+ final int trueStop=os.stop;
+// System.err.println();
+// System.err.println(sl);
+// System.err.println();
+// System.err.println(r);
+// System.err.println();
+ SiteScore ss=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0);
+ byte[] originalContig=sl.originalContig();
+ if(BLASR){
+ originalContig=(originalContig==null || Tools.indexOf(originalContig, (byte)'/')<0 ? originalContig :
+ Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/')));
+ }
+ int cstart=sl.originalContigStart();
+
+// System.out.println("A2: "+trueStart+", "+cstart);
+ boolean strict=isCorrectHit(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH, originalContig, sl.rname(), cstart, r);
+ boolean loose=isCorrectHitLoose(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH+THRESH2, originalContig, sl.rname(), cstart);
+
+ // if(!strict){
+ // System.out.println(ss+", "+new String(originalContig)+", "+new String(sl.rname()));
+ // assert(false);
+ // }
+
+ // System.out.println("loose = "+loose+" for "+r.toText());
+
+ if(loose){
+ // System.err.println("TPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t"
+ // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop);
+ truePositiveLoose++;
+ }else{
+ // System.err.println("FPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t"
+ // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop);
+ falsePositiveLoose++;
+ if(tswLoose!=null){
+ if(ffLoose.samOrBam()){
+ tswLoose.println(sl.toText());
+ }else{
+ tswLoose.println(r);
+ }
+ }
+ }
+
+ if(strict){
+ // System.err.println("TPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop);
+ truePositiveStrict++;
+ }else{
+ // System.err.println("FPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop);
+ falsePositiveStrict++;
+ if(tswStrict!=null){
+ if(ffStrict.samOrBam()){
+ tswStrict.println(sl.toText());
+ }else{
+ tswStrict.println(r);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ }
+
+
+
+ /**
+  * Strict grading: the hit must be on the true strand and sequence, and both
+  * its start and stop must lie within thresh of the expected coordinates.
+  * The expected interval begins at cstart and has the length of the true interval.
+  * @param ss Called site
+  * @param trueChrom True chromosome; compared only when originalContig is null
+  * @param trueStrand True strand
+  * @param trueStart True start; used only for the interval length
+  * @param trueStop True stop; used only for the interval length
+  * @param thresh Maximum allowed positional error at each end
+  * @param originalContig Name of the true contig, or null to compare chromosome numbers
+  * @param contig Name of the contig the read was mapped to
+  * @param cstart Expected start of the hit
+  * @param r The read being graded; not used by the current logic
+  * @return true if the hit is strictly correct
+  */
+ public static boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh,
+         byte[] originalContig, byte[] contig, int cstart, Read r){
+
+     final int expectedStop=cstart+trueStop-trueStart;
+
+     if(ss.strand!=trueStrand){return false;}
+
+     final boolean sameSequence=(originalContig!=null ? Arrays.equals(originalContig, contig) : ss.chrom==trueChrom);
+     if(!sameSequence){return false;}
+
+     assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     final int startError=absdif(ss.start, cstart);
+     final int stopError=absdif(ss.stop, expectedStop);
+     return startError<=thresh && stopError<=thresh;
+ }
+
+
+ /**
+  * Loose grading: the hit must be on the true strand and sequence, and at
+  * least one of its ends must lie within thresh of the expected coordinate.
+  * The expected interval begins at cstart and has the length of the true interval.
+  * @param ss Called site
+  * @param trueChrom True chromosome; compared only when originalContig is null
+  * @param trueStrand True strand
+  * @param trueStart True start; used only for the interval length
+  * @param trueStop True stop; used only for the interval length
+  * @param thresh Maximum allowed positional error
+  * @param originalContig Name of the true contig, or null to compare chromosome numbers
+  * @param contig Name of the contig the read was mapped to
+  * @param cstart Expected start of the hit
+  * @return true if the hit is loosely correct
+  */
+ public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh,
+         byte[] originalContig, byte[] contig, int cstart){
+     if(ss.strand!=trueStrand){return false;}
+
+     final boolean sameSequence=(originalContig!=null ? Arrays.equals(originalContig, contig) : ss.chrom==trueChrom);
+     if(!sameSequence){return false;}
+
+     assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     final int expectedStop=cstart+trueStop-trueStart;
+     final int startError=absdif(ss.start, cstart);
+     final int stopError=absdif(ss.stop, expectedStop);
+     return startError<=thresh || stopError<=thresh;
+ }
+
+ /** Returns the larger argument minus the smaller one (plain int arithmetic, no overflow check). */
+ private static final int absdif(int a, int b){
+     if(a>b){
+         return a-b;
+     }
+     return b-a;
+ }
+
+ public static FileFormat ffLoose=null; //Format of the outloose file; set in main when outl is given
+ public static FileFormat ffStrict=null; //Format of the outstrict file; set in main when outs is given
+ public static TextStreamWriter tswLoose=null; //Receives reads that fail loose grading; null when disabled
+ public static TextStreamWriter tswStrict=null; //Receives reads that fail strict grading; null when disabled
+ 
+ public static int truePositiveStrict=0; //Retained reads accepted by isCorrectHit
+ public static int falsePositiveStrict=0; //Retained reads rejected by isCorrectHit
+ 
+ public static int truePositiveLoose=0; //Retained reads accepted by isCorrectHitLoose
+ public static int falsePositiveLoose=0; //Retained reads rejected by isCorrectHitLoose
+ 
+ public static int mapped=0; //Reads reported as mapped, including ambiguous and low-quality ones
+ public static int mappedRetained=0; //Mapped, non-ambiguous reads with mapScore above minQuality
+ public static int unmapped=0; //Discarded reads, reads with mapScore below 1, and reads not flagged as mapped
+ 
+ public static int discarded=0; //Reads for which r.discarded() is true
+ public static int ambiguous=0; //Reads flagged ambiguous or with mapScore at most minQuality
+ 
+ public static long lines=0; //Input lines that do not start with '@'
+ public static long primary=0; //Incremented once per call to calcStatistics1
+ public static long secondary=0; //Lines that are not primary, or whose read id was already seen
+ 
+ public static int minQuality=3; //Reads with mapScore at or below this value are counted as ambiguous
+ 
+ public static boolean parsecustom=true; //Enables grading against r.originalSite; also passed to SamLine.toRead
+ public static boolean printerr=false; //When true, the report is also printed to System.err
+ 
+ public static int THRESH2=20; //Extra positional tolerance used for loose grading; set by thresh=
+ public static boolean BLASR=false; //When true, the original contig name is truncated at its last '/'
+ public static boolean USE_BITSET=true; //When true, main allocates the seen bitset
+ public static BitSet seen=null; //Ids of reads already graded; null disables duplicate detection
+
+}
diff --git a/current/align2/Heap.java b/current/align2/Heap.java
new file mode 100755
index 0000000..70d3645
--- /dev/null
+++ b/current/align2/Heap.java
@@ -0,0 +1,140 @@
+package align2;
+
+import java.util.PriorityQueue;
+
+public final class Heap<T extends Comparable<? super T>> {
+
+    /*
+     * Array-backed binary min-heap: the least element (per compareTo) is array[1]; slot 0 is unused,
+     * so slot i has parent i/2 and children 2i and 2i+1. The names are historical: percDown() moves an
+     * entry toward the root, percUp() moves one toward the leaves. The commented-out "queue" lines are
+     * remnants of a java.util.PriorityQueue shadow copy once used to cross-check this class.
+     */
+
+    /** @param maxSize Intended capacity; it sizes the backing array but is not enforced by add(). */
+    @SuppressWarnings("unchecked")
+    public Heap(int maxSize){
+        //Slot 0 is unused, hence maxSize+1; the length is then rounded up to an even number
+        //(percUp() reads slot 2i+1 without a bounds check).
+        final int needed=maxSize+1;
+        final int len=((needed&1)==1 ? needed+1 : needed);
+
+        CAPACITY=maxSize;
+        array=(T[])new Comparable[len];
+//        queue=new PriorityQueue<T>(maxSize);
+    }
+
+    /** Inserts t and restores heap order. @return always true */
+    public boolean add(T t){
+        //assert(testForDuplicates());
+//        assert(queue.size()==size);
+//        queue.add(t);
+        assert(size==0 || array[size]!=null);
+        final int slot=++size; //First free slot at the bottom of the heap
+        array[slot]=t;
+        percDown(slot);
+        return true;
+    }
+
+    /** @return The least element without removing it, or null if the heap is empty */
+    public T peek(){
+//        assert(queue.size()==size);
+//        assert(array[1]==queue.peek());
+        return (size==0 ? null : array[1]);
+    }
+
+    /** Removes and returns the least element, or returns null if the heap is empty. */
+    public T poll(){
+//        assert(queue.size()==size);
+        if(size==0){return null;}
+        final T least=array[1];
+//        assert(least==queue.poll());
+        //Move the last entry into the root slot, then let it sink to where it belongs.
+        array[1]=array[size];
+        array[size]=null;
+        size--;
+        if(size>0){percUp(1);}
+//        assert(queue.peek()==peek());
+        return least;
+    }
+
+    /** Moves the entry at loc toward the root for as long as it compares less than its parent. */
+    private void percDown(int loc){
+        assert(loc>0);
+        while(loc>1){
+            final int parent=loc/2;
+            final T entry=array[loc];
+            final T above=array[parent];
+            assert(entry!=above);
+            if(entry.compareTo(above)>=0){return;} //Already in order
+            array[parent]=entry;
+            array[loc]=above;
+            loc=parent;
+        }
+    }
+
+    /** Moves the entry at loc toward the leaves for as long as it compares greater than its lesser child. */
+    private void percUp(int loc){
+        while(true){
+            assert(loc>0 && loc<=size) : loc+", "+size;
+            final int left=loc*2;
+            final int right=left+1;
+            if(left>size){return;} //No children
+            final T entry=array[loc];
+            final T l=array[left];
+            final T r=array[right]; //null when there is no right child: vacated slots are always nulled
+            assert(entry!=l);
+            assert(l!=r);
+            assert(l!=null);
+            //Ties between the children go to the left one.
+            final boolean goLeft=(r==null || l.compareTo(r)<1);
+            final int child=(goLeft ? left : right);
+            final T below=(goLeft ? l : r);
+            if(entry.compareTo(below)<=0){return;} //Heap order holds
+            array[child]=entry;
+            array[loc]=below;
+            loc=child;
+        }
+    }
+
+    /** @return true if no elements are stored */
+    public boolean isEmpty(){
+//        assert((size==0) == queue.isEmpty());
+        return size==0;
+    }
+
+    /** Removes every element by nulling the occupied slots. */
+    public void clear(){
+//        queue.clear();
+        for(int i=size; i>=1; i--){array[i]=null;}
+        size=0;
+    }
+
+    /** @return Number of elements currently stored */
+    public int size(){
+        return size;
+    }
+
+    /** @return 31 minus the number of leading zero bits of x: floor(log2(x)) for x>0, and -1 for x==0 */
+    public static int tier(int x){
+        return 31-Integer.numberOfLeadingZeros(x);
+    }
+
+    /** Debugging aid. @return false if any two slots hold the same object reference, otherwise true */
+    public boolean testForDuplicates(){
+        for(int i=0; i<array.length; i++){
+            if(array[i]==null){continue;}
+            for(int j=i+1; j<array.length; j++){
+                if(array[i]==array[j]){return false;}
+            }
+        }
+        return true;
+    }
+
+    private final T[] array;
+    private final int CAPACITY;
+    private int size=0;
+
+//    private PriorityQueue<T> queue;
+
+}
diff --git a/current/align2/Index.java b/current/align2/Index.java
new file mode 100755
index 0000000..1a675a1
--- /dev/null
+++ b/current/align2/Index.java
@@ -0,0 +1,12 @@
+package align2;
+
+/**
+ * Empty abstract placeholder: it declares no members yet (see the TODO below).
+ * @author Brian Bushnell
+ * @date Dec 19, 2012
+ */
+public abstract class Index {
+
+ //TODO: Put static methods here.
+
+}
diff --git a/current/align2/IndexMaker4.java b/current/align2/IndexMaker4.java
new file mode 100755
index 0000000..9818daa
--- /dev/null
+++ b/current/align2/IndexMaker4.java
@@ -0,0 +1,525 @@
+package align2;
+
+import java.io.File;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.KillSwitch;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+
+/**
+ * @author Brian Bushnell
+ * @date Dec 23, 2012
+ *
+ */
+public class IndexMaker4 {
+
+ public static Block[] makeIndex(final int genome, int minChrom, int maxChrom, int k, int CHROMBITS,
+ int MAX_ALLOWED_CHROM_INDEX, int CHROM_MASK_LOW, int CHROM_MASK_HIGH, int SITE_MASK, int SHIFT_LENGTH,
+ boolean WRITE, boolean DISK_INVALID, Block[] index){ //Starts one BlockMaker per chromosome group, waits for all of them, and returns index (allocated here when null)
+ Timer t=new Timer();
+ //Recomputed on every call: 1 in low-memory mode or when writing on Windows, otherwise threads/4 (at least 1).
+ MAX_CONCURRENT_BLOCKS=(Shared.LOW_MEMORY ? 1 : (Data.WINDOWS ? (WRITE ? 1 : Tools.max(1, Shared.threads()/4)) : Tools.max(1, Shared.threads()/4)));
+
+ minChrom=Tools.max(1, minChrom);
+ if(genome>=0 && Data.GENOME_BUILD!=genome){
+ Data.setGenome(genome);
+ maxChrom=Tools.min(Data.numChroms, maxChrom); //Only clamped when the build was switched on the line above
+ }
+
+ assert(minChrom<=maxChrom);
+
+ if(index==null){index=new Block[maxChrom+1];}
+
+ ArrayList<BlockMaker> list=new ArrayList<BlockMaker>();
+ //One BlockMaker covers chromosomes a..b: a=max(minChrom, i&CHROM_MASK_HIGH), b=i|CHROM_MASK_LOW clamped to [minChrom, maxChrom].
+ for(int i=1; i<=maxChrom;){
+ if(i>=minChrom){
+ int a=minChrom(i, minChrom, CHROM_MASK_HIGH);
+ int b=maxChrom(i, minChrom, maxChrom, CHROM_MASK_LOW);
+ assert(b>=i);
+
+ BlockMaker idm=new BlockMaker(a, b, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, WRITE, DISK_INVALID, index);
+ list.add(idm);
+ incrementActiveBlocks(1); //Blocks while MAX_CONCURRENT_BLOCKS makers are already running
+ idm.start();
+
+ while(idm.getState()==State.NEW){}//Busy-wait until the new thread has left the NEW state
+
+ i=b+1;
+ }else{i++;}
+ }
+ //Wait for every maker to finish.
+ for(BlockMaker cm : list){
+ while(cm.getState()!=State.TERMINATED){
+ try {
+ cm.join();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the enclosing while loop calls join() again.
+ e.printStackTrace();
+ }
+ }
+ }
+
+ t.stop();
+// Data.sysout.println("Index gen time: \t"+t);
+
+ return index;
+ }
+
+ public static Block makeBlock(int minChrom, int maxChrom, int k, int CHROMBITS, int MAX_ALLOWED_CHROM_INDEX,
+         int CHROM_MASK_LOW, int CHROM_MASK_HIGH, int SITE_MASK, int SHIFT_LENGTH, boolean WRITE, boolean DISK_INVALID, Block[] matrix){
+     //Builds a single block synchronously on the calling thread (the BlockMaker is never started as a thread).
+     //The unconditional assertions make this method fail immediately whenever assertions are enabled.
+     assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX;
+     final BlockMaker maker=new BlockMaker(minChrom, maxChrom, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX,
+             CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, WRITE, DISK_INVALID, matrix);
+     final Block result=maker.makeArrays();
+     assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX;
+     if(verbose){
+         //Dump every hit list; a missing list prints as "null".
+         for(int start=0; start<result.numStarts; start++){
+             final int[] hits=result.getHitList(start);
+             Data.sysout.println(start+": "+(hits==null ? null : Arrays.toString(hits)));
+         }
+     }
+     return result;
+ }
+
+
+
+ private static class BlockMaker extends Thread{
+
+ public BlockMaker(int minChrom_, int maxChrom_, int k, int CHROMBITS_,
+         int MAX_ALLOWED_CHROM_INDEX_, int CHROM_MASK_LOW_, int CHROM_MASK_HIGH_, int SITE_MASK_, int SHIFT_LENGTH_,
+         boolean WRITE_TO_DISK_, boolean DISK_INVALID_, Block[] matrix_){
+     //Chromosome range covered by this block, and the shared array that receives the finished Block.
+     minChrom=minChrom_;
+     maxChrom=maxChrom_;
+     matrix=matrix_;
+
+     //Key geometry: KEYSPACE is 1<<(2*KEYLEN), i.e. 4^KEYLEN for KEYLEN<=15.
+     KEYLEN=k;
+     KEYSPACE=1<<(2*k);
+     CHROMBITS=CHROMBITS_;
+     MAX_ALLOWED_CHROM_INDEX=MAX_ALLOWED_CHROM_INDEX_;
+
+     //Bit layout used by toNumber(), numberToSite() and numberToChrom().
+     CHROM_MASK_LOW=CHROM_MASK_LOW_;
+     CHROM_MASK_HIGH=CHROM_MASK_HIGH_;
+     SITE_MASK=SITE_MASK_;
+     SHIFT_LENGTH=SHIFT_LENGTH_;
+
+     WRITE_TO_DISK=WRITE_TO_DISK_; //When true, makeArrays() writes the block after building it
+     DISK_INVALID=DISK_INVALID_; //When true, makeArrays() does not look for a saved block
+ }
+
+
+ @Override
+ public void run(){
+     //Fix: release the slot in finally; an exception in makeArrays() used to leave ACTIVE_BLOCKS raised, so makeIndex() could wait forever.
+     try{makeArrays();}finally{incrementActiveBlocks(-1);}
+ }
+
+
+ private Block makeArrays(){
+ //Returns the Block for chromosomes minChrom..maxChrom: loaded from disk when allowed and present, otherwise built by four CountThreads.
+ if(!DISK_INVALID){
+ String fname=fname(minChrom, maxChrom, KEYLEN, CHROMBITS);
+ File f=new File(fname);
+ //Both files must exist for the saved block to be used (the second name is fname+"2.gz").
+ if(f.exists() && new File(fname+"2.gz").exists()){
+ Block x=Block.read(fname);
+ if(matrix!=null){
+ for(int i=baseChrom(minChrom); i<=maxChrom; i++){
+ matrix[i]=x;
+ }
+ }
+ return x;
+ }else{
+ synchronized(getClass()){
+ Data.sysout.println("No index available; generating from reference genome: "+f.getAbsolutePath());
+ if(WRITE_TO_DISK){
+ String root=ReadWrite.parseRoot2(f.getAbsolutePath());
+ File rf=new File(root);
+ if(!rf.exists()){
+ rf.mkdirs();
+ }
+ }
+ }
+ }
+ }
+
+ CountThread threads[]=new CountThread[4];
+ int[] sizes=KillSwitch.allocInt1D(KEYSPACE+1); //Per-key counts, later offsets; shared by all four threads
+ int[] intercom=new int[4]; //Shared counters and the monitor the threads synchronize on
+ Block[] indexHolder=new Block[1]; //Receives the Block that thread 0 allocates
+
+ for(int i=0; i<4; i++){
+ threads[i]=new CountThread(i, sizes, intercom, indexHolder);
+ threads[i].start();
+// while(!threads[i].isAlive()){
+// //wait for these threads to start
+// }
+ }
+ Data.sysout.println("Indexing threads started for block "+baseChrom(minChrom)+"-"+maxChrom);
+ for(int i=0; i<threads.length; i++){
+ while(threads[i].getState()!=State.TERMINATED){
+ try {
+ threads[i].join();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the enclosing while loop calls join() again.
+ e.printStackTrace();
+ }
+ }
+ }
+ Data.sysout.println("Indexing threads finished for block "+baseChrom(minChrom)+"-"+maxChrom);
+ //fillArrays() advanced sizes[key] once per stored site, so each entry now holds the END of its key's range; shift right by one to restore start offsets.
+ for(int i=sizes.length-2; i>=0; i--){
+ sizes[i+1]=sizes[i];
+ }
+ sizes[0]=0;
+
+ if(matrix!=null){
+ for(int i=baseChrom(minChrom); i<=maxChrom; i++){
+ matrix[i]=indexHolder[0];
+ }
+ }
+
+ if(WRITE_TO_DISK){
+ String fname=fname(minChrom, maxChrom, KEYLEN, CHROMBITS);
+// File f=new File(fname);
+// assert(!f.exists()) : "Tried to overwrite file "+f.getAbsolutePath();
+ indexHolder[0].write(fname, true);
+ }
+
+ return indexHolder[0];
+ }
+
+
+ private class CountThread extends Thread{
+
+ public CountThread(int id_, int[] sizes_, int[] intercom_, Block[] indexHolder_){
+ id=id_;
+ idb=AminoAcid.numberToBase[id];
+ sizes=sizes_;
+ indexHolder=indexHolder_;
+ intercom=intercom_;
+
+ minIndex=(id<<(2*KEYLEN-2));
+ maxIndex=(int)(((id+1L)<<(2*KEYLEN-2))-1);
+ //Data.sysout.println("Thread "+id+" range is "+minIndex+", "+maxIndex);
+
+ if(ALLOW_POLYMERS){
+ banmask=-1; //poly-A still slips through
+ }else{
+ int b=0;
+ for(int i=0; i<KEYLEN; i++){
+ b<<=2;
+ b=(b|id);
+ }
+ banmask=~((-1)<<((2*KEYLEN)-banshift));
+ }
+ }
+
+ private final int id; //Thread number 0..3; also the 2-bit code of the first base this thread handles
+ private final int idb; //AminoAcid.numberToBase[id]: the base byte compared against the chromosome array
+ private final int[] sizes; //Shared: per-key counts after countSizes(), then write cursors during fillArrays()
+ /** Shared counters, also the wait/notify monitor: {sizeSum, #finishedCounting, #finishedAllocating, #finishedFilling}. Only slots 1 and 2 are touched in run(). */
+ private final int[] intercom;
+ private final Block[] indexHolder; //Slot 0 receives the Block allocated by thread 0
+ private final int minIndex; //Smallest key owned by this thread
+ private final int maxIndex; //Largest key owned by this thread
+ private final int banmask; //Mask used in the banned-key test (key>>banshift)!=(key&banmask)
+ private static final int banshift=4; //4 bits, i.e. two bases at 2 bits per base
+
+ @Override
+ public void run(){
+ //Phase 1: count this thread's kmers on every chromosome of the block.
+ //Data.sysout.println("Thread "+id+" counting sizes for ("+minChrom+", "+maxChrom+")");
+ for(int i=minChrom; i<=maxChrom; i++){countSizes(i);}
+ //Phase 2: barrier on intercom. Thread 0 allocates the Block once all four threads have counted; the others wait for it.
+ final Block b;
+ synchronized(intercom){
+ //Data.sysout.println("Thread "+id+" synced on intercom: "+Arrays.toString(intercom));
+ intercom[1]++;
+ if(id==0){
+ while(intercom[1]<4){
+ //Data.sysout.println("Thread "+id+" waiting on intercom: "+Arrays.toString(intercom));
+ try {
+ intercom.wait();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the loop condition is checked again.
+ e.printStackTrace();
+ }
+ }
+ //Turn the per-key counts into start offsets (exclusive prefix sum); sum ends up as the total number of sites.
+ int sum=0;
+ for(int i=0; i<sizes.length; i++){
+ int temp=sizes[i];
+ sizes[i]=sum;
+ sum+=temp;
+ }
+
+ if(USE_ALLOC_SYNC){
+ synchronized(ALLOC_SYNC){//To allow contiguous memory allocation
+ b=new Block(KillSwitch.allocInt1D(sum), sizes);
+ }
+ }else{
+ b=new Block(KillSwitch.allocInt1D(sum), sizes);
+ }
+ indexHolder[0]=b;
+ intercom[2]++;
+ assert(intercom[2]==1);
+ intercom.notifyAll();
+ }else{
+ while(intercom[2]<1){
+ //Data.sysout.println("Thread "+id+" waiting on intercom: "+Arrays.toString(intercom));
+ try {
+ if(intercom[1]>=4){intercom.notify();} //All counted: wake one waiter (possibly thread 0); a woken non-zero thread repeats this step
+ intercom.wait();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the loop condition is checked again.
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ //Data.sysout.println("Thread "+id+" filling arrays for ("+minChrom+", "+maxChrom+")");
+ //Phase 3: store this thread's sites into the shared Block.
+
+ for(int i=minChrom; i<=maxChrom; i++){fillArrays(i);}
+ //Data.sysout.println("Thread "+id+" finished.");
+ }
+
+ private void countSizes(final int chrom){
+     //First pass over one chromosome: for every kmer that starts with this thread's base and
+     //passes the filters, increment sizes[key]. fillArrays() later revisits the same sites.
+     // System.err.println("Thread "+id+" using chr"+chrom+" for countSizes");
+     final ChromosomeArray ca=dna.Data.getChromosome(chrom);
+
+     if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){
+         throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX);
+     }
+
+     final int skip=KEYLEN-1; //Offset from the first to the last base of a kmer
+     final int max=ca.maxIndex-KEYLEN+1; //Exclusive upper bound for kmer start positions
+     assert(skip>0) : "\n*** The key length is too short. For the flag set 'k=X', X should be between 8 and 15; it was set to "+KEYLEN+" ***\n";
+
+     //Advance past leading positions where getNumber() returns -1: first in strides of skip,
+     //then one position at a time.
+     int start=ca.minIndex;
+     while(start<max && ca.getNumber(start+skip)==-1){start+=skip;}
+     while(start<max && ca.getNumber(start)==-1){start++;}
+
+     // Data.sysout.println("Entering hash loop.");
+
+     final byte[] bases=ca.array;
+     for(int site=start; site<max; site++){
+         //Kmers starting with a different base belong to one of the other three threads.
+         if(bases[site]!=idb){continue;}
+         final int last=site+skip;
+         final int key=ca.getNumber(site, last);
+
+         //A negative key is skipped (presumably the kmer contains an undefined base - confirm in ChromosomeArray).
+         if(key<0){continue;}
+         //Skip keys whose upper bits (key>>banshift) equal their banmask-ed lower bits.
+         if((key>>banshift)==(key&banmask)){continue;}
+         //Optional subsampling: keep the key only if it or its reverse complement is a multiple of MODULO.
+         if(USE_MODULO && key%MODULO!=0 && (AminoAcid.reverseComplementBinaryFast(key, KEYLEN))%MODULO!=0){continue;}
+
+         assert(key>=minIndex && key<=maxIndex) : "\n"+id+", "+ca.getNumber(site)+", "+(char)ca.get(site)+", "+key+", "+Integer.toHexString(key)+
+             ", "+ca.getString(site, last)+"\n"+minIndex+", "+maxIndex+"\n";
+         sizes[key]++;
+         // Data.sysout.println("site="+site+", last="+last+", max="+max);
+     }
+
+     // Data.sysout.println("Left hash loop.");
+ }
+
+ private void fillArrays(final int chrom){
+     //Second pass over one chromosome: revisits the sites counted by countSizes() and writes each
+     //packed (site, chrom) number into the shared sites array at the cursor held in sizes[key].
+     // System.err.println("Thread "+id+" using chr"+chrom+" for fillArrays");
+     final ChromosomeArray ca=dna.Data.getChromosome(chrom);
+     final int baseChrom=baseChrom(chrom);
+
+     if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){
+         throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX);
+     }
+
+     final int skip=KEYLEN-1; //Offset from the first to the last base of a kmer
+     final int max=ca.maxIndex-KEYLEN+1; //Exclusive upper bound for kmer start positions
+     assert(skip>0);
+
+     //Same start-up scan as countSizes(): advance past leading positions where getNumber() returns -1.
+     int start=ca.minIndex;
+     while(start<max && ca.getNumber(start+skip)==-1){start+=skip;}
+     while(start<max && ca.getNumber(start)==-1){start++;}
+
+     //Disabled rolling-key variant of the loop below, kept for reference:
+// // Data.sysout.println("Entering hash loop.");
+// // "a" is site start, "b" is site end
+// int len=KEYLEN-1;
+// int keyB=ca.getNumber(start, start+skip-1);
+// final int mask=(KEYLEN==16 ? -1 : ~((-1)<<(2*KEYLEN)));
+// final byte[] array=ca.array;
+// final byte[] btn=AminoAcid.baseToNumber;
+// for(int a=start, b=start+skip; a<max; a++, b++){
+// int c=btn[array[b]];
+// if(c>=0){
+// keyB=((keyB<<2)|c);
+// len++;
+// }else{
+// len=0;
+// }
+// int key=keyB&mask;
+// if(len>=KEYLEN && /* array[a]==idb*/ key>=minIndex && key<=maxIndex){
+//// int key=keyB&mask;
+// assert(key>=minIndex && key<=maxIndex);
+// int number=toNumber(a, chrom);
+// assert(numberToChrom(number, baseChrom)==chrom);
+// assert(numberToSite(number)==a);
+// index[key][sizes[key]]=number;
+// sizes[key]++;
+// }
+// // Data.sysout.println("a="+a+", b="+b+", max="+max);
+// }
+
+     final int[] sites=indexHolder[0].sites;
+     final byte[] bases=ca.array;
+     for(int site=start; site<max; site++){
+         if(bases[site]!=idb){continue;} //Another thread's kmer
+         final int key=ca.getNumber(site, site+skip);
+
+         //These three filters must stay identical to the ones in countSizes().
+         if(key<0){continue;}
+         if((key>>banshift)==(key&banmask)){continue;}
+         if(USE_MODULO && key%MODULO!=0 && (AminoAcid.reverseComplementBinaryFast(key, KEYLEN))%MODULO!=0){continue;}
+
+         assert(key>=minIndex && key<=maxIndex);
+         final int number=toNumber(site, chrom);
+         assert(numberToChrom(number, baseChrom)==chrom);
+         assert(numberToSite(number)==site);
+
+         //sizes[key] is the next free slot for this key; store the site, then advance the cursor.
+         final int loc=sizes[key];
+         assert(sites[loc]==0);
+         sites[loc]=number;
+         sizes[key]=loc+1;
+     }
+     // Data.sysout.println("Left hash loop.");
+
+ }
+
+ }
+
+
+ /** Packs a site and a chromosome into one int: the chromosome's CHROM_MASK_LOW bits are shifted up by SHIFT_LENGTH and the site is OR-ed in below. */
+ public final int toNumber(int site, int chrom){
+     //Only the CHROM_MASK_LOW part of the chromosome number is stored;
+     //numberToChrom() adds the block's base chromosome back.
+     final int relativeChrom=(chrom&CHROM_MASK_LOW);
+     return ((relativeChrom<<SHIFT_LENGTH)|site);
+ }
+
+ /** Extracts the site bits (those selected by SITE_MASK) from a packed number. */
+ public final int numberToSite(int number){
+     return SITE_MASK&number;
+ }
+
+ /** Recovers the chromosome from a packed number: the bits above SHIFT_LENGTH plus the block's base chromosome. */
+ public final int numberToChrom(int number, int baseChrom){
+     //baseChrom must be non-negative and have no CHROM_MASK_LOW bits set.
+     assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     // assert(baseChrom<8) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+
+     final int relativeChrom=(number>>>SHIFT_LENGTH); //Unsigned shift, so a set sign bit is treated as data
+     final int chrom=relativeChrom+(baseChrom&CHROM_MASK_HIGH);
+
+     // assert(chrom<8) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     return chrom;
+ }
+
+ public final int baseChrom(int chrom){final int base=(chrom&CHROM_MASK_HIGH); return Tools.max(0, base);} //chrom with only its CHROM_MASK_HIGH bits kept, never negative
+
+ private final int KEYLEN; //Kmer length k
+ private final int CHROMBITS; //Only used in the block file name within this class
+ private final int KEYSPACE; //1<<(2*KEYLEN); the sizes array has KEYSPACE+1 entries
+ private final int MAX_ALLOWED_CHROM_INDEX; //A chromosome whose maxIndex exceeds this makes countSizes()/fillArrays() throw
+ public final boolean WRITE_TO_DISK; //When true, makeArrays() writes the finished block
+ public final boolean DISK_INVALID; //When true, makeArrays() does not look for a saved block
+
+ private final int CHROM_MASK_LOW; //Chromosome bits stored inside a packed number (see toNumber())
+ private final int CHROM_MASK_HIGH; //Chromosome bits supplied by the block's base chromosome (see numberToChrom())
+ private final int SITE_MASK; //Selects the site bits of a packed number
+ private final int SHIFT_LENGTH; //Bit position where the chromosome part of a packed number starts
+
+ private final int minChrom; //First chromosome processed by this maker
+ private final int maxChrom; //Last chromosome processed by this maker (inclusive)
+
+ private final Block[] matrix; //May be null; otherwise slots baseChrom(minChrom)..maxChrom receive the Block
+
+ }
+
+ public static final int minChrom(int chrom, int MINCHROM, int CHROM_MASK_HIGH){final int masked=(chrom&CHROM_MASK_HIGH); return Tools.max(MINCHROM, masked);} //chrom with only its CHROM_MASK_HIGH bits kept, raised to at least MINCHROM
+ public static final int maxChrom(int chrom, int MINCHROM, int MAXCHROM, int CHROM_MASK_LOW){final int capped=Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW); return Tools.max(MINCHROM, capped);} //chrom with all CHROM_MASK_LOW bits set, capped at MAXCHROM and then raised to at least MINCHROM
+
+ public static final String fname(int minChrom, int maxChrom, int k, int chrombits){ //Block file name for the current Data.GENOME_BUILD
+     final int build=Data.GENOME_BUILD; return fname(minChrom, maxChrom, k, chrombits, build);
+ }
+
+ public static final String fname(int minChrom, int maxChrom, int k, int chrombits, int build){
+     //Layout: <ROOT_INDEX><build>/chr<min>[-<max>]_index_k<k>_c<chrombits>_b<build>.block
+     //The "-<max>" part is present only when minChrom and maxChrom differ.
+     final String chroms=(minChrom==maxChrom ? ""+minChrom : minChrom+"-"+maxChrom);
+     final String suffix="_index_k"+k+"_c"+chrombits+"_b"+build+".block";
+
+     return Data.ROOT_INDEX+build+"/chr"+chroms+suffix;
+ }
+
+ private static void incrementActiveBlocks(int i){ //Adjusts the count of running BlockMakers by i (never 0); a positive i may block until a slot is free
+ assert(i!=0);
+ synchronized(THREAD_SYNC){
+ assert(ACTIVE_BLOCKS>=0);
+ assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS);
+ //Only acquisitions (i>0) wait. The ACTIVE_BLOCKS>0 term lets the first maker through even if the limit is below 1; the 10 s timeout re-checks the condition periodically.
+ while(i>0 && ACTIVE_BLOCKS>0 && ACTIVE_BLOCKS>=MAX_CONCURRENT_BLOCKS){
+ try {
+ THREAD_SYNC.wait(10000);
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the loop condition is checked again.
+ e.printStackTrace();
+ }
+ }
+ ACTIVE_BLOCKS+=i;
+ if(ACTIVE_BLOCKS<MAX_CONCURRENT_BLOCKS || i<0){THREAD_SYNC.notifyAll();} //Wake waiters whenever a slot is (or may be) free
+
+ assert(ACTIVE_BLOCKS>=0);
+ assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS);
+ }
+ }
+
+ public static boolean verbose=false; //When true, makeBlock() prints every hit list
+
+ public static boolean USE_ALLOC_SYNC=false; //When true, thread 0 of each block allocates the sites array while holding ALLOC_SYNC
+ private static final String ALLOC_SYNC=new String("ALLOC_SYNC"); //new String(...) yields a lock object distinct from the interned literal
+ private static final String THREAD_SYNC=new String("THREAD_SYNC"); //Monitor guarding ACTIVE_BLOCKS
+
+ public static int MAX_CONCURRENT_BLOCKS=(Shared.LOW_MEMORY ? 1 : (Data.WINDOWS ? 1 : Tools.max(1, Shared.threads()/4))); //Initial value only; makeIndex() recomputes it on every call
+ private static int ACTIVE_BLOCKS=0; //Number of BlockMakers currently holding a slot; guarded by THREAD_SYNC
+
+ public static boolean ALLOW_POLYMERS=false; //When true, CountThread sets banmask to all ones
+ public static boolean USE_MODULO=false; //When true, a key is kept only if it or its reverse complement is a multiple of MODULO
+ static final int MODULO=9; //Package-private; IndexMaker5 reuses it
+
+}
diff --git a/current/align2/IndexMaker5.java b/current/align2/IndexMaker5.java
new file mode 100755
index 0000000..1719beb
--- /dev/null
+++ b/current/align2/IndexMaker5.java
@@ -0,0 +1,516 @@
+package align2;
+
+import java.io.File;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 3, 2013
+ *
+ */
+public class IndexMaker5 {
+
+
+ public static Block[] makeIndex(final int genome, int minChrom, int maxChrom, int k, int CHROMBITS,
+ int MAX_ALLOWED_CHROM_INDEX, int CHROM_MASK_LOW, int CHROM_MASK_HIGH, int SITE_MASK, int SHIFT_LENGTH, boolean WRITE, boolean DISK_INVALID, Block[] index){ //Starts one BlockMaker per chromosome group, waits for all of them, and returns index (allocated here when null)
+ Timer t=new Timer();
+ //Recomputed on every call: 1 on Windows, otherwise threads/4 (at least 1).
+ MAX_CONCURRENT_BLOCKS=(Data.WINDOWS ? 1 : Tools.max(1, Shared.threads()/4));
+
+ minChrom=Tools.max(1, minChrom);
+ if(genome>=0 && Data.GENOME_BUILD!=genome){
+ Data.setGenome(genome);
+ maxChrom=Tools.min(Data.numChroms, maxChrom); //Only clamped when the build was switched on the line above
+ }
+
+ assert(minChrom<=maxChrom);
+
+ if(index==null){index=new Block[maxChrom+1];}
+
+ ArrayList<BlockMaker> list=new ArrayList<BlockMaker>();
+ //One BlockMaker covers chromosomes a..b: a=max(minChrom, i&CHROM_MASK_HIGH), b=i|CHROM_MASK_LOW clamped to [minChrom, maxChrom].
+ for(int i=1; i<=maxChrom;){
+ if(i>=minChrom){
+ int a=minChrom(i, minChrom, CHROM_MASK_HIGH);
+ int b=maxChrom(i, minChrom, maxChrom, CHROM_MASK_LOW);
+ assert(b>=i);
+
+ BlockMaker idm=new BlockMaker(a, b, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, WRITE, DISK_INVALID, index);
+ list.add(idm);
+ incrementActiveBlocks(1); //Blocks while MAX_CONCURRENT_BLOCKS makers are already running
+ idm.start();
+
+ while(idm.getState()==State.NEW){}//Busy-wait until the new thread has left the NEW state
+
+ i=b+1;
+ }else{i++;}
+ }
+ //Wait for every maker to finish.
+ for(BlockMaker cm : list){
+ while(cm.getState()!=State.TERMINATED){
+ try {
+ cm.join();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the enclosing while loop calls join() again.
+ e.printStackTrace();
+ }
+ }
+ }
+
+ t.stop();
+// Data.sysout.println("Index gen time: \t"+t);
+
+ return index;
+ }
+
+ public static Block makeBlock(int minChrom, int maxChrom, int k, int CHROMBITS, int MAX_ALLOWED_CHROM_INDEX,
+         int CHROM_MASK_LOW, int CHROM_MASK_HIGH, int SITE_MASK, int SHIFT_LENGTH, boolean WRITE, boolean DISK_INVALID, Block[] matrix){
+     //Builds a single block synchronously on the calling thread (the BlockMaker is never started as a thread).
+     //The unconditional assertions make this method fail immediately whenever assertions are enabled.
+     assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX;
+     final BlockMaker maker=new BlockMaker(minChrom, maxChrom, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX,
+             CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, WRITE, DISK_INVALID, matrix);
+     final Block result=maker.makeArrays();
+     assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX;
+     if(verbose){
+         //Dump every hit list; a missing list prints as "null".
+         for(int start=0; start<result.numStarts; start++){
+             final int[] hits=result.getHitList(start);
+             Data.sysout.println(start+": "+(hits==null ? null : Arrays.toString(hits)));
+         }
+     }
+     return result;
+ }
+
+
+
+ private static class BlockMaker extends Thread{
+
+ public BlockMaker(int minChrom_, int maxChrom_, int k, int CHROMBITS_,
+         int MAX_ALLOWED_CHROM_INDEX_, int CHROM_MASK_LOW_, int CHROM_MASK_HIGH_, int SITE_MASK_, int SHIFT_LENGTH_,
+         boolean WRITE_TO_DISK_, boolean DISK_INVALID_, Block[] matrix_){
+     //Chromosome range covered by this block, and the shared array that receives the finished Block.
+     minChrom=minChrom_;
+     maxChrom=maxChrom_;
+     matrix=matrix_;
+
+     //Key geometry: KEYSPACE is 1<<(2*KEYLEN), i.e. 4^KEYLEN for KEYLEN<=15.
+     KEYLEN=k;
+     KEYSPACE=1<<(2*k);
+     CHROMBITS=CHROMBITS_;
+     MAX_ALLOWED_CHROM_INDEX=MAX_ALLOWED_CHROM_INDEX_;
+
+     //Bit layout used by toNumber(), numberToSite() and numberToChrom().
+     CHROM_MASK_LOW=CHROM_MASK_LOW_;
+     CHROM_MASK_HIGH=CHROM_MASK_HIGH_;
+     SITE_MASK=SITE_MASK_;
+     SHIFT_LENGTH=SHIFT_LENGTH_;
+
+     WRITE_TO_DISK=WRITE_TO_DISK_; //When true, makeArrays() writes the block after building it
+     DISK_INVALID=DISK_INVALID_;
+ }
+
+
+ @Override
+ public void run(){
+     //Fix: release the slot in finally; an exception in makeArrays() used to leave ACTIVE_BLOCKS raised, so makeIndex() could wait forever.
+     try{makeArrays();}finally{incrementActiveBlocks(-1);}
+ }
+
+
+ private Block makeArrays(){
+
+ {
+ String fname=fname(minChrom, maxChrom, KEYLEN, CHROMBITS);
+ File f=new File(fname);
+
+ if(f.exists() && new File(fname+"2.gz").exists()){
+ Block x=Block.read(fname);
+ if(matrix!=null){
+ for(int i=baseChrom(minChrom); i<=maxChrom; i++){
+ matrix[i]=x;
+ }
+ }
+ return x;
+ }else{
+ synchronized(getClass()){
+ Data.sysout.println("No index available; generating from reference genome: "+f.getAbsolutePath());
+ if(WRITE_TO_DISK){
+ String root=ReadWrite.parseRoot2(f.getAbsolutePath());
+ File rf=new File(root);
+ if(!rf.exists()){
+ rf.mkdirs();
+ }
+ }
+ }
+ }
+ }
+
+ CountThread threads[]=new CountThread[4];
+ int[] sizes=new int[KEYSPACE+1];
+ int[] intercom=new int[4];
+ Block[] indexHolder=new Block[1];
+
+ for(int i=0; i<4; i++){
+ threads[i]=new CountThread(i, sizes, intercom, indexHolder);
+ threads[i].start();
+// while(!threads[i].isAlive()){
+// //wait for these threads to start
+// }
+ }
+ Data.sysout.println("Indexing threads started.");
+ for(int i=0; i<threads.length; i++){
+ if(threads[i].getState()!=State.TERMINATED){
+ try {
+ threads[i].join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ Data.sysout.println("Threads finished.");
+
+ for(int i=sizes.length-2; i>=0; i--){
+ sizes[i+1]=sizes[i];
+ }
+ sizes[0]=0;
+
+ if(matrix!=null){
+ for(int i=baseChrom(minChrom); i<=maxChrom; i++){
+ matrix[i]=indexHolder[0];
+ }
+ }
+
+ if(WRITE_TO_DISK){
+ String fname=fname(minChrom, maxChrom, KEYLEN, CHROMBITS);
+// File f=new File(fname);
+// assert(!f.exists()) : "Tried to overwrite file "+f.getAbsolutePath();
+ indexHolder[0].write(fname, true);
+ }
+
+ return indexHolder[0];
+ }
+
+
+ private class CountThread extends Thread{
+
+ public CountThread(int id_, int[] sizes_, int[] intercom_, Block[] indexHolder_){
+ id=id_;
+ idb=AminoAcid.numberToBase[id];
+ sizes=sizes_;
+ indexHolder=indexHolder_;
+ intercom=intercom_;
+
+ minIndex=(id<<(2*KEYLEN-2));
+ maxIndex=(int)(((id+1L)<<(2*KEYLEN-2))-1);
+ //Data.sysout.println("Thread "+id+" range is "+minIndex+", "+maxIndex);
+
+ if(ALLOW_POLYMERS){
+ banned=-1;
+ banmask=-1; //poly-A still slips through
+ }else{
+ int b=0;
+ for(int i=0; i<KEYLEN; i++){
+ b<<=2;
+ b=(b|id);
+ }
+ banned=b;
+ banmask=~((-1)<<((2*KEYLEN)-banshift));
+ }
+ }
+
+ private final int id; //Thread number 0..3; also the 2-bit code of the first base this thread handles
+ private final int idb; //AminoAcid.numberToBase[id]: the base byte compared against the chromosome array
+ private final int[] sizes; //Shared: per-key counts after countSizes(), then write cursors during fillArrays()
+ /** Shared counters, also the wait/notify monitor: {sizeSum, #finishedCounting, #finishedAllocating, #finishedFilling}. Only slots 1 and 2 are touched in run(). */
+ private final int[] intercom;
+ private final Block[] indexHolder; //Slot 0 receives the Block allocated by thread 0
+ private final int minIndex; //Smallest key owned by this thread
+ private final int maxIndex; //Largest key owned by this thread
+ private final int banned; //Assigned in the constructor; not read anywhere in this class
+ private final int banmask; //Mask used in the banned-key test (key>>banshift)!=(key&banmask)
+ private static final int banshift=4; //4 bits, i.e. two bases at 2 bits per base
+
+ @Override
+ public void run(){
+ //Phase 1: count this thread's kmers on every chromosome of the block.
+ //Data.sysout.println("Thread "+id+" counting sizes for ("+minChrom+", "+maxChrom+")");
+ for(int i=minChrom; i<=maxChrom; i++){countSizes(i);}
+ //Phase 2: barrier on intercom. Thread 0 allocates the Block once all four threads have counted; the others wait for it.
+ final Block b;
+ synchronized(intercom){
+ //Data.sysout.println("Thread "+id+" synced on intercom: "+Arrays.toString(intercom));
+ intercom[1]++;
+ if(id==0){
+ while(intercom[1]<4){
+ //Data.sysout.println("Thread "+id+" waiting on intercom: "+Arrays.toString(intercom));
+ try {
+ intercom.wait();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the loop condition is checked again.
+ e.printStackTrace();
+ }
+ }
+ //Turn the per-key counts into start offsets (exclusive prefix sum); sum ends up as the total number of sites.
+ int sum=0;
+ for(int i=0; i<sizes.length; i++){
+ int temp=sizes[i];
+ sizes[i]=sum;
+ sum+=temp;
+ }
+
+ if(USE_ALLOC_SYNC){
+ synchronized(ALLOC_SYNC){//To allow contiguous memory allocation
+ b=new Block(new int[sum], sizes);
+ }
+ }else{
+ b=new Block(new int[sum], sizes);
+ }
+ indexHolder[0]=b;
+ intercom[2]++;
+ assert(intercom[2]==1);
+ intercom.notifyAll();
+ }else{
+ while(intercom[2]<1){
+ //Data.sysout.println("Thread "+id+" waiting on intercom: "+Arrays.toString(intercom));
+ try {
+ if(intercom[1]>=4){intercom.notify();} //All counted: wake one waiter (possibly thread 0); a woken non-zero thread repeats this step
+ intercom.wait();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the loop condition is checked again.
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ //Data.sysout.println("Thread "+id+" filling arrays for ("+minChrom+", "+maxChrom+")");
+ //Phase 3: store this thread's sites into the shared Block.
+ for(int i=minChrom; i<=maxChrom; i++){fillArrays(i);}
+ //Data.sysout.println("Thread "+id+" finished.");
+ }
+
+ private void countSizes(final int chrom){
+     //First pass over one chromosome: for every kmer that starts with this thread's base and
+     //passes the filters, increment sizes[key]. fillArrays() later revisits the same sites.
+     // System.err.println("Thread "+id+" using chr"+chrom+" for countSizes");
+     final ChromosomeArray ca=dna.Data.getChromosome(chrom);
+
+     if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){
+         throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX);
+     }
+
+     final int skip=KEYLEN-1; //Offset from the first to the last base of a kmer
+     final int max=ca.maxIndex-KEYLEN+1; //Exclusive upper bound for kmer start positions
+     assert(skip>0);
+
+     //Advance past leading positions where getNumber() returns -1: first in strides of skip,
+     //then one position at a time.
+     int start=ca.minIndex;
+     while(start<max && ca.getNumber(start+skip)==-1){start+=skip;}
+     while(start<max && ca.getNumber(start)==-1){start++;}
+
+     final byte[] bases=ca.array;
+     for(int site=start; site<max; site++){
+         //Kmers starting with a different base belong to one of the other three threads.
+         if(bases[site]!=idb){continue;}
+         final int last=site+skip;
+         final int key=ca.getNumber(site, last);
+
+         //A negative key is skipped (presumably the kmer contains an undefined base - confirm in ChromosomeArray).
+         if(key<0){continue;}
+         //Skip keys whose upper bits (key>>banshift) equal their banmask-ed lower bits.
+         if((key>>banshift)==(key&banmask)){continue;}
+         //Optional subsampling: keep the key only if it or its reverse complement is a multiple of MODULO.
+         if(USE_MODULO && key%MODULO!=0 && (AminoAcid.reverseComplementBinaryFast(key, KEYLEN))%MODULO!=0){continue;}
+
+         assert(key>=minIndex && key<=maxIndex) : "\n"+id+", "+ca.getNumber(site)+", "+(char)ca.get(site)+", "+key+", "+Integer.toHexString(key)+
+             ", "+ca.getString(site, last)+"\n"+minIndex+", "+maxIndex+"\n";
+         sizes[key]++;
+     }
+ }
+
+ private void fillArrays(final int chrom){
+     //Second pass over one chromosome: revisits the sites counted by countSizes() and writes each
+     //packed (site, chrom) number into the shared sites array at the cursor held in sizes[key].
+     // System.err.println("Thread "+id+" using chr"+chrom+" for fillArrays");
+     final ChromosomeArray ca=dna.Data.getChromosome(chrom);
+     final int baseChrom=baseChrom(chrom);
+
+     if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){
+         throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX);
+     }
+
+     final int skip=KEYLEN-1; //Offset from the first to the last base of a kmer
+     final int max=ca.maxIndex-KEYLEN+1; //Exclusive upper bound for kmer start positions
+     assert(skip>0);
+
+     //Same start-up scan as countSizes(): advance past leading positions where getNumber() returns -1.
+     int start=ca.minIndex;
+     while(start<max && ca.getNumber(start+skip)==-1){start+=skip;}
+     while(start<max && ca.getNumber(start)==-1){start++;}
+
+     //Disabled rolling-key variant of the loop below, kept for reference:
+// // Data.sysout.println("Entering hash loop.");
+// // "a" is site start, "b" is site end
+// int len=KEYLEN-1;
+// int keyB=ca.getNumber(start, start+skip-1);
+// final int mask=(KEYLEN==16 ? -1 : ~((-1)<<(2*KEYLEN)));
+// final byte[] array=ca.array;
+// final byte[] btn=AminoAcid.baseToNumber;
+// for(int a=start, b=start+skip; a<max; a++, b++){
+// int c=btn[array[b]];
+// if(c>=0){
+// keyB=((keyB<<2)|c);
+// len++;
+// }else{
+// len=0;
+// }
+// int key=keyB&mask;
+// if(len>=KEYLEN && /* array[a]==idb*/ key>=minIndex && key<=maxIndex){
+//// int key=keyB&mask;
+// assert(key>=minIndex && key<=maxIndex);
+// int number=toNumber(a, chrom);
+// assert(numberToChrom(number, baseChrom)==chrom);
+// assert(numberToSite(number)==a);
+// index[key][sizes[key]]=number;
+// sizes[key]++;
+// }
+// // Data.sysout.println("a="+a+", b="+b+", max="+max);
+// }
+
+     final int[] sites=indexHolder[0].sites;
+     final byte[] bases=ca.array;
+     for(int site=start; site<max; site++){
+         if(bases[site]!=idb){continue;} //Another thread's kmer
+         final int key=ca.getNumber(site, site+skip);
+
+         //These three filters must stay identical to the ones in countSizes().
+         if(key<0){continue;}
+         if((key>>banshift)==(key&banmask)){continue;}
+         if(USE_MODULO && key%MODULO!=0 && (AminoAcid.reverseComplementBinaryFast(key, KEYLEN))%MODULO!=0){continue;}
+
+         assert(key>=minIndex && key<=maxIndex);
+         final int number=toNumber(site, chrom);
+         assert(numberToChrom(number, baseChrom)==chrom);
+         assert(numberToSite(number)==site);
+
+         //sizes[key] is the next free slot for this key; store the site, then advance the cursor.
+         final int loc=sizes[key];
+         assert(sites[loc]==0);
+         sites[loc]=number;
+         sizes[key]=loc+1;
+     }
+     // Data.sysout.println("Left hash loop.");
+
+ }
+
+ }
+
+
+ /** Packs a site and a chromosome into one int: the chromosome's CHROM_MASK_LOW bits are shifted up by SHIFT_LENGTH and the site is OR-ed in below. */
+ public final int toNumber(int site, int chrom){
+     //Only the CHROM_MASK_LOW part of the chromosome number is stored;
+     //numberToChrom() adds the block's base chromosome back.
+     final int relativeChrom=(chrom&CHROM_MASK_LOW);
+     return ((relativeChrom<<SHIFT_LENGTH)|site);
+ }
+
+ /** Extracts the site bits (those selected by SITE_MASK) from a packed number. */
+ public final int numberToSite(int number){
+     return SITE_MASK&number;
+ }
+
+ /** Recovers the chromosome from a packed number: the bits above SHIFT_LENGTH plus the block's base chromosome. */
+ public final int numberToChrom(int number, int baseChrom){
+     //baseChrom must be non-negative and have no CHROM_MASK_LOW bits set.
+     assert((baseChrom&CHROM_MASK_LOW)==0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     assert(baseChrom>=0) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     // assert(baseChrom<8) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+
+     final int relativeChrom=(number>>>SHIFT_LENGTH); //Unsigned shift, so a set sign bit is treated as data
+     final int chrom=relativeChrom+(baseChrom&CHROM_MASK_HIGH);
+
+     // assert(chrom<8) : Integer.toHexString(number)+", baseChrom="+baseChrom;
+     return chrom;
+ }
+
+ public final int baseChrom(int chrom){final int base=(chrom&CHROM_MASK_HIGH); return Tools.max(0, base);} //chrom with only its CHROM_MASK_HIGH bits kept, never negative
+
+ private final int KEYLEN; //Kmer length k
+ private final int CHROMBITS; //Only used in the block file name within this class
+ private final int KEYSPACE; //1<<(2*KEYLEN); the sizes array has KEYSPACE+1 entries
+ private final int MAX_ALLOWED_CHROM_INDEX; //A chromosome whose maxIndex exceeds this makes countSizes()/fillArrays() throw
+ public final boolean WRITE_TO_DISK; //When true, makeArrays() writes the finished block
+ public final boolean DISK_INVALID;
+
+ private final int CHROM_MASK_LOW; //Chromosome bits stored inside a packed number (see toNumber())
+ private final int CHROM_MASK_HIGH; //Chromosome bits supplied by the block's base chromosome (see numberToChrom())
+ private final int SITE_MASK; //Selects the site bits of a packed number
+ private final int SHIFT_LENGTH; //Bit position where the chromosome part of a packed number starts
+
+ private final int minChrom; //First chromosome processed by this maker
+ private final int maxChrom; //Last chromosome processed by this maker (inclusive)
+
+ private final Block[] matrix; //May be null; otherwise slots baseChrom(minChrom)..maxChrom receive the Block
+
+ }
+
+ public static final int minChrom(int chrom, int MINCHROM, int CHROM_MASK_HIGH){final int masked=(chrom&CHROM_MASK_HIGH); return Tools.max(MINCHROM, masked);} //chrom with only its CHROM_MASK_HIGH bits kept, raised to at least MINCHROM
+ public static final int maxChrom(int chrom, int MINCHROM, int MAXCHROM, int CHROM_MASK_LOW){final int capped=Tools.min(MAXCHROM, chrom|CHROM_MASK_LOW); return Tools.max(MINCHROM, capped);} //chrom with all CHROM_MASK_LOW bits set, capped at MAXCHROM and then raised to at least MINCHROM
+
+ public static final String fname(int minChrom, int maxChrom, int k, int chrombits){
+     //Layout: <ROOT_INDEX><build>/chr<min>[-<max>]_index_k<k>_c<chrombits>_b<build>.blockB, using the current
+     //Data.GENOME_BUILD; the "-<max>" part is present only when minChrom and maxChrom differ.
+     final String chroms=(minChrom==maxChrom ? ""+minChrom : minChrom+"-"+maxChrom);
+     final String suffix="_index_k"+k+"_c"+chrombits+"_b"+Data.GENOME_BUILD+".blockB";
+
+     return Data.ROOT_INDEX+Data.GENOME_BUILD+"/chr"+chroms+suffix;
+ }
+
+ private static void incrementActiveBlocks(int i){ //Adjusts the count of running BlockMakers by i (never 0); a positive i may block until a slot is free
+ assert(i!=0);
+ synchronized(THREAD_SYNC){
+ assert(ACTIVE_BLOCKS>=0);
+ assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS);
+ //Only acquisitions (i>0) wait. The ACTIVE_BLOCKS>0 term lets the first maker through even if the limit is below 1; the 10 s timeout re-checks the condition periodically.
+ while(i>0 && ACTIVE_BLOCKS>0 && ACTIVE_BLOCKS>=MAX_CONCURRENT_BLOCKS){
+ try {
+ THREAD_SYNC.wait(10000);
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the loop condition is checked again.
+ e.printStackTrace();
+ }
+ }
+ ACTIVE_BLOCKS+=i;
+ if(ACTIVE_BLOCKS<MAX_CONCURRENT_BLOCKS || i<0){THREAD_SYNC.notifyAll();} //Wake waiters whenever a slot is (or may be) free
+
+ assert(ACTIVE_BLOCKS>=0);
+ assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS);
+ }
+ }
+
+ public static boolean verbose=false; //When true, makeBlock() prints every hit list
+
+ public static boolean USE_ALLOC_SYNC=false; //When true, thread 0 of each block allocates the sites array while holding ALLOC_SYNC
+ private static final String ALLOC_SYNC=new String("ALLOC_SYNC"); //new String(...) yields a lock object distinct from the interned literal
+ private static final String THREAD_SYNC=new String("THREAD_SYNC"); //Monitor guarding ACTIVE_BLOCKS
+
+ public static int MAX_CONCURRENT_BLOCKS=(Data.WINDOWS ? 1 : 2); //Initial value only; makeIndex() recomputes it on every call
+ private static int ACTIVE_BLOCKS=0; //Number of BlockMakers currently holding a slot; guarded by THREAD_SYNC
+
+ public static boolean ALLOW_POLYMERS=false; //When true, CountThread sets banmask to all ones
+ public static boolean USE_MODULO=false; //When true, a key is kept only if it or its reverse complement is a multiple of MODULO
+ private static final int MODULO=IndexMaker4.MODULO; //Same modulus as IndexMaker4
+
+}
diff --git a/current/align2/IntList.java b/current/align2/IntList.java
new file mode 100755
index 0000000..7a10191
--- /dev/null
+++ b/current/align2/IntList.java
@@ -0,0 +1,165 @@
+package align2;
+
+import java.util.Arrays;
+
+
+
+ /**
+  * A growable list of primitive ints backed by a public array.
+  * Only indices below {@link #size} are considered occupied; the backing array may be longer.
+  */
+ public final class IntList{
+
+     /** Creates a list with a capacity of 256. */
+     public IntList(){this(256);}
+
+     /** @param initial Initial capacity of the backing array; must be positive. */
+     public IntList(int initial){
+         assert(initial>0);
+         array=new int[initial];
+     }
+
+     /**
+      * Sets size to zero without touching the backing array.
+      * NOTE(review): old values stay in the array, so a later set/increment at an index above
+      * the new size makes the stale entries below it (or the stale addend) visible again.
+      */
+     public void clear(){size=0;}
+
+     /** Stores value at loc, growing the array if needed and extending size to cover loc. */
+     public final void set(int loc, int value){
+         if(loc>=array.length){
+             resize(loc*2L+1);
+         }
+         array[loc]=value;
+         size=max(size, loc+1);
+     }
+
+     /** Adds value to the element at loc, growing the array if needed and extending size to cover loc. */
+     public final void increment(int loc, int value){
+         if(loc>=array.length){
+             resize(loc*2L+1);
+         }
+         array[loc]+=value;
+         size=max(size, loc+1);
+     }
+
+     /** @return The element at loc, or 0 if loc is at or beyond size. */
+     public final int get(int loc){
+         return(loc>=size ? 0 : array[loc]);
+     }
+
+     /** Appends x at index size, growing the array if it is full. */
+     public final void add(int x){
+         if(size>=array.length){
+             resize(size*2L+1);
+         }
+         array[size]=x;
+         size++;
+     }
+
+     /** @return True if any of the first size elements equals x (linear scan). */
+     public boolean contains(int x) {
+         for(int i=0; i<size; i++){
+             if(array[i]==x){return true;}
+         }
+         return false;
+     }
+
+     /**
+      * Replaces the backing array with a copy of length size2, clamped to Integer.MAX_VALUE.
+      * @param size2 Requested capacity; computed in long by callers so doubling cannot wrap.
+      */
+     private final void resize(final long size2){
+         assert(size2>size) : size+", "+size2;
+         final int size3=(int)Tools.min(Integer.MAX_VALUE, size2);
+         //Check the clamped value; re-testing size2 here could never detect the clamp.
+         assert(size3>size) : "Overflow: "+size+", "+size2+" -> "+size3;
+         array=Arrays.copyOf(array, size3);
+     }
+
+     /** Trims the backing array to exactly size elements. */
+     public final void shrink(){
+         if(size==array.length){return;}
+         array=Arrays.copyOf(array, size);
+     }
+
+     /**
+      * Removes duplicate values and trims the backing array to the unique values.
+      * Assumes the list is sorted ascending; an empty list is simply shrunk.
+      */
+     public final void shrinkToUnique(){
+         if(size<=0){
+             shrink();
+             return;
+         }
+
+         //First pass: count the distinct values.
+         int unique=1;
+         for(int i=1; i<size; i++){
+             assert(array[i]>=array[i-1]);
+             if(array[i]!=array[i-1]){unique++;}
+         }
+         //Array is full and already unique; nothing to do.
+         if(unique==array.length){return;}
+
+         //Second pass: copy the first element of each run.
+         final int[] alt=new int[unique];
+         alt[0]=array[0];
+         for(int i=1, j=1; j<unique; i++){
+             if(array[i]!=array[i-1]){
+                 alt[j]=array[i];
+                 j++;
+             }
+         }
+
+         array=alt;
+         size=alt.length;
+     }
+
+     /** Same as {@link #toStringListView()}. */
+     @Override
+     public String toString(){
+         return toStringListView();
+     }
+
+     /** @return The nonzero elements as "[(index, value), ...]". */
+     public String toStringSetView(){
+         final StringBuilder sb=new StringBuilder();
+         sb.append('[');
+         String comma="";
+         for(int i=0; i<size; i++){
+             if(array[i]!=0){
+                 sb.append(comma).append('(').append(i).append(", ").append(array[i]).append(')');
+                 comma=", ";
+             }
+         }
+         sb.append(']');
+         return sb.toString();
+     }
+
+     /** @return The first size elements as "[a, b, ...]". */
+     public String toStringListView(){
+         final StringBuilder sb=new StringBuilder();
+         sb.append('[');
+         String comma="";
+         for(int i=0; i<size; i++){
+             sb.append(comma).append(array[i]);
+             comma=", ";
+         }
+         sb.append(']');
+         return sb.toString();
+     }
+
+     /** Sorts the first size elements ascending. */
+     public void sort() {
+         if(size>1){Arrays.sort(array, 0, size);}
+     }
+
+     /** Reverses the occupied part of the list via Tools.reverseInPlace(array, 0, size). */
+     public void reverse() {
+         Tools.reverseInPlace(array, 0, size);
+     }
+
+     /** Assumes this is sorted.
+      * Reduces the list to a set of unique values;
+      * stores their counts in a second list.
+      * @param counts Receives one count per unique value, in the same order; cleared first. */
+     public void getUniqueCounts(IntList counts) {
+         counts.size=0;
+         if(size<=0){return;}
+
+         int unique=1;
+         int count=1;
+
+         for(int i=1; i<size; i++){
+             assert(array[i]>=array[i-1]);
+             if(array[i]==array[i-1]){
+                 count++;
+             }else{
+                 //Start of a new run: compact the value forward and flush the previous run's count.
+                 array[unique]=array[i];
+                 unique++;
+                 counts.add(count);
+                 count=1;
+             }
+         }
+         //Flush the final run.
+         if(count>0){
+             counts.add(count);
+         }
+         size=unique;
+         assert(counts.size==size);
+     }
+
+     private static final int min(int x, int y){return x<y ? x : y;}
+     private static final int max(int x, int y){return x>y ? x : y;}
+
+     /** Backing storage; may be longer than size. */
+     public int[] array;
+     /** Number of occupied elements, i.e. the lowest unoccupied index. */
+     public int size=0;
+
+ }
diff --git a/current/align2/IntList2.java b/current/align2/IntList2.java
new file mode 100755
index 0000000..a8f93ff
--- /dev/null
+++ b/current/align2/IntList2.java
@@ -0,0 +1,83 @@
+package align2;
+
+import java.util.Arrays;
+
+
+
+ /**
+  * A growable list of int[] references backed by a public array.
+  * Only indices below {@link #size} are considered occupied; unset slots hold null.
+  */
+ public final class IntList2{
+
+     /** Creates a list with a capacity of 256. */
+     public IntList2(){this(256);}
+
+     /** @param initial Initial capacity of the backing array; must be positive. */
+     public IntList2(int initial){
+         assert(initial>0);
+         array=new int[initial][];
+     }
+
+     /** Stores value at loc, growing the array if needed and extending size to cover loc. */
+     public final void set(int loc, int[] value){
+         if(loc>=array.length){
+             resize(grownCapacity(loc+1L));
+         }
+         array[loc]=value;
+         size=max(size, loc+1);
+     }
+
+     /** Not supported by this class. */
+     public final void increment(int loc, int value){
+         throw new RuntimeException("Unsupported");
+     }
+
+     /** @return The array at loc, or null if loc is at or beyond size. */
+     public final int[] get(int loc){
+         return(loc>=size ? null : array[loc]);
+     }
+
+     /** Not supported by this class; use {@link #add(int[])}. */
+     @Deprecated
+     public final void add(int x){
+         throw new RuntimeException("Unsupported");
+     }
+
+     /** Appends x at index size, growing the array if it is full. */
+     public final void add(int[] x){
+         if(size>=array.length){
+             resize(grownCapacity(size));
+         }
+         array[size]=x;
+         size++;
+     }
+
+     /**
+      * Doubles the needed element count, with a floor of 1 and a ceiling of Integer.MAX_VALUE.
+      * Done in long so that doubling a large value cannot wrap to a negative capacity.
+      */
+     private static int grownCapacity(long needed){
+         final long doubled=Math.max(needed*2, 1L);
+         return (int)Math.min((long)Integer.MAX_VALUE, doubled);
+     }
+
+     /** Replaces the backing array with a copy of length size2, which must exceed size. */
+     public final void resize(int size2){
+         assert(size2>size);
+         array=Arrays.copyOf(array, size2);
+     }
+
+     /** Trims the backing array to exactly size elements. */
+     public final void shrink(){
+         if(size==array.length){return;}
+         array=Arrays.copyOf(array, size);
+     }
+
+     /** Not supported by this class. */
+     public final void shrinkToUnique(){
+         throw new RuntimeException("Unsupported");
+     }
+
+     /** @return The non-null elements as "[(index, [values]), ...]". */
+     @Override
+     public String toString(){
+         final StringBuilder sb=new StringBuilder();
+         sb.append('[');
+         String comma="";
+         for(int i=0; i<size; i++){
+             if(array[i]!=null){
+                 sb.append(comma).append('(').append(i).append(", ").append(Arrays.toString(array[i])).append(')');
+                 comma=", ";
+             }
+         }
+         sb.append(']');
+         return sb.toString();
+     }
+
+     /**
+      * Sorts the first size elements lexicographically, with a shorter array ordered before any
+      * longer array it is a prefix of, and null entries last.
+      * int[] is not Comparable, so an explicit comparator is required; sorting without one
+      * throws ClassCastException as soon as two elements are compared.
+      */
+     public void sort() {
+         if(size>1){Arrays.sort(array, 0, size, LEXICOGRAPHIC);}
+     }
+
+     /** Element order used by {@link #sort()}. */
+     private static final java.util.Comparator<int[]> LEXICOGRAPHIC=new java.util.Comparator<int[]>(){
+         @Override
+         public int compare(int[] a, int[] b){
+             if(a==b){return 0;}
+             if(a==null){return 1;}
+             if(b==null){return -1;}
+             final int lim=min(a.length, b.length);
+             for(int i=0; i<lim; i++){
+                 if(a[i]!=b[i]){return a[i]<b[i] ? -1 : 1;}
+             }
+             return a.length<b.length ? -1 : (a.length>b.length ? 1 : 0);
+         }
+     };
+
+     private static final int min(int x, int y){return x<y ? x : y;}
+     private static final int max(int x, int y){return x>y ? x : y;}
+
+     /** Backing storage; may be longer than size. */
+     public int[][] array;
+     /** Number of occupied slots, i.e. the lowest unoccupied index. */
+     public int size=0;
+
+ }
diff --git a/current/align2/KeyRing.java b/current/align2/KeyRing.java
new file mode 100755
index 0000000..4aaa6d2
--- /dev/null
+++ b/current/align2/KeyRing.java
@@ -0,0 +1,510 @@
+package align2;
+
+import java.util.Arrays;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+
+public final class KeyRing {
+
+ /**
+  * Command-line driver that prints the offsets generated for a read of uniform quality 20.
+  * @param args args[0]: read length; args[1]: key density; args[2] (optional): key length, default 13.
+  */
+ public static final void main(String[] args){
+     final int len=Integer.parseInt(args[0]);
+     final float density=(float) Double.parseDouble(args[1]);
+     final int chunksize=(args.length>2 ? Integer.parseInt(args[2]) : 13);
+
+     final byte[] qual=new byte[len];
+     Arrays.fill(qual, (byte)20);
+
+     System.out.println(Arrays.toString(makeOffsets(qual, chunksize, density, 2)));
+ }
+
+ /**
+  * Converts each offset into a numeric key by calling ChromosomeArray.toNumber on the
+  * range offset .. offset+chunksize-1 of s.
+  * @param s Bases of the read
+  * @param offsets Key start positions; may be null
+  * @param chunksize Key length; asserted to be between 1 and 15
+  * @return One key per offset, or null if offsets is null
+  */
+ public static int[] makeKeys(byte[] s, int[] offsets, int chunksize){
+     if(offsets==null){return null;}
+     assert(chunksize>0 && chunksize<16);
+     assert(offsets!=null) : s.length+", "+new String(s);
+
+     final int span=chunksize-1;
+     final int[] keys=new int[offsets.length];
+     int idx=0;
+     for(final int start : offsets){
+         keys[idx]=ChromosomeArray.toNumber(start, start+span, s);
+         idx++;
+     }
+     return keys;
+ }
+
+ /**
+  * Returns a new array holding AminoAcid.reverseComplementBinaryFast of every key,
+  * in reversed order (the last input key becomes the first output key).
+  * @param keys Keys to transform; not modified
+  * @param k Key length passed through to reverseComplementBinaryFast
+  */
+ public static int[] reverseComplementKeys(int[] keys, int k){
+     final int n=keys.length;
+     final int[] rc=new int[n];
+     for(int src=n-1, dst=0; src>=0; src--, dst++){
+         rc[dst]=AminoAcid.reverseComplementBinaryFast(keys[src], k);
+     }
+     return rc;
+ }
+
+ /** Single-key counterpart of reverseComplementKeys; delegates to AminoAcid.reverseComplementBinaryFast(key, k). */
+ public static int reverseComplementKey(int key, int k){
+     final int rc=AminoAcid.reverseComplementBinaryFast(key, k);
+     return rc;
+ }
+
+
+ /**
+  * Expands a key into text, two bits per symbol, most significant pair first, using
+  * AminoAcid.numberToBase as the lookup table.
+  * With assertions enabled, verifies that ChromosomeArray.toNumber maps the text back to key.
+  * @param key Packed key
+  * @param chunksize Number of symbols to emit
+  */
+ public static final String decode(int key, int chunksize){
+     final StringBuilder sb=new StringBuilder();
+     for(int pos=chunksize-1; pos>=0; pos--){
+         final int code=(key>>(2*pos))&3;
+         sb.append((char)AminoAcid.numberToBase[code]);
+     }
+
+     final String s=sb.toString();
+
+     assert(key==ChromosomeArray.toNumber(0, s.length()-1, s)) :
+         Integer.toHexString(key)+" -> "+s+" != "+Integer.toHexString(ChromosomeArray.toNumber(0, s.length()-1, s));
+
+     return s;
+ }
+
+ /*
+ public static final int[] makeOffsets(int readlen, int blocksize, int overlap, int minKeysDesired){
+ assert(blocksize>0);
+ assert(overlap<blocksize);
+ assert(blocksize<=readlen) : readlen+", "+blocksize+", "+overlap+", "+minKeysDesired;
+
+ int slots=readlen-blocksize+1;
+ int midslots=slots-2;
+ int spacing=blocksize-overlap;
+
+ if(slots==1){return new int[] {0};}
+ if(slots<=spacing+1){return new int[] {0, slots-1};}
+
+// int middles=(midslots/spacing);
+//
+// if(middles+2<minKeysDesired && midslots+2>=minKeysDesired){
+// while(middles+2<minKeysDesired){
+// spacing--;
+// assert(spacing>0);
+// middles=(midslots/spacing);
+// }
+// }
+
+ int middles=(midslots/spacing);
+ if(middles<minKeysDesired-2){
+ middles=Tools.max(minKeysDesired-2, midslots);
+ }
+
+ assert(middles>0); //due to the escape conditions
+
+// float fspacing=midslots/(float)(middles+1);
+ float fspacing=midslots/(float)(middles);
+ assert(fspacing>=1);
+
+ int[] offsets=new int[middles+2];
+ offsets[0]=0;
+ offsets[offsets.length-1]=slots-1;
+
+ for(int i=1; i<=middles; i++){
+ offsets[i]=Math.round(fspacing*i);
+ }
+
+// System.out.println("readlen = \t"+readlen);
+// System.out.println("blocksize = \t"+blocksize);
+// System.out.println("overlap = \t"+overlap);
+// System.out.println("slots = \t"+slots);
+// System.out.println("midslots = \t"+midslots);
+// System.out.println("spacing = \t"+spacing);
+// System.out.println("middles = \t"+middles);
+// System.out.println("fspacing = \t"+fspacing);
+// System.out.println("Offsets = \t"+Arrays.toString(offsets));
+ return offsets;
+
+ }*/
+
+ /**
+  * Mirrors key offsets onto the opposite strand: an offset x becomes readlen-(x+k), and the
+  * order of the offsets is reversed.
+  * This is only useful for low-quality reads, with no-calls. Otherwise it just wastes time...
+  * @param offsetsP Offsets on the original strand; not modified
+  * @param k Key length
+  * @param readlen Read length
+  * @return A new array of mirrored offsets
+  */
+ public static final int[] reverseOffsets(final int[] offsetsP, final int k, final int readlen){
+     final int n=offsetsP.length;
+     final int[] offsetsM=new int[n];
+     for(int src=n-1, i=0; src>=0; src--, i++){
+         final int plus=offsetsP[src];
+         assert(plus>=0);
+         assert(plus+k<=readlen);
+         final int x=readlen-(plus+k);
+         assert(x>=0);
+         assert(x+k<=readlen) : "\n"+Arrays.toString(offsetsP)+"\n"+Arrays.toString(offsetsM)+"\n"+i+"\n"+x+"\n"+readlen;
+         offsetsM[i]=x;
+     }
+     return offsetsM;
+ }
+
+ /**
+  * Generates key offsets for a read so that roughly density keys cover each base.
+  * The key count is ceil(readlen*density/blocksize), raised to minKeysDesired and capped at the
+  * number of possible start positions. The first key is at 0, the last at the final start
+  * position, and the rest are placed at rounded multiples of an even spacing.
+  * @param readlen Read length
+  * @param blocksize Key length
+  * @param density Target keys per base; asserted to be in (0, blocksize)
+  * @param minKeysDesired Lower bound on the key count, subject to the cap
+  * @return Offsets in increasing order
+  */
+ public static final int[] makeOffsetsWithDensity(int readlen, int blocksize, float density, int minKeysDesired){
+     assert(blocksize>0);
+     assert(density<blocksize);
+     assert(density>0);
+     assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired;
+
+     final int slots=readlen-blocksize+1;
+
+     final int fromDensity=(int)Math.ceil((readlen*density)/blocksize);
+     assert(fromDensity>=0);
+     final int desired=Tools.min(slots, Tools.max(minKeysDesired, fromDensity));
+
+     if(slots==1 || desired==1){return new int[] {0};}
+     if(desired==2){return new int[] {0, slots-1};}
+
+     final int middles=desired-2;
+     assert(middles>0); //due to the escape conditions
+
+     final float fspacing=(slots-2)/(float)(middles);
+     assert(fspacing>=1);
+
+     final int[] offsets=new int[desired];
+     offsets[0]=0;
+     offsets[offsets.length-1]=slots-1;
+
+     for(int i=1; i<=middles; i++){
+         offsets[i]=Math.round(fspacing*i);
+     }
+     return offsets;
+ }
+
+
+ /**
+  * Spreads up to maxKeys key start positions across a read.
+  * With 1 key (or 1 slot) the middle slot is used; with 2, the first and last slots; with 3, the
+  * first, middle and last. Otherwise the first and last slots are used and the remaining keys are
+  * placed at multiples of midslots/(middles+1), with a minimum spacing of 1.
+  * @param readlen Read length
+  * @param blocksize Key length
+  * @param maxKeys Maximum number of keys; must be positive
+  * @return Strictly increasing offsets (checked by assertion), or null if blocksize exceeds readlen
+  */
+ public static final int[] makeOffsetsWithNumberOfKeys(int readlen, int blocksize, int maxKeys){
+ assert(maxKeys>0);
+// System.err.println("readlen, blocksize, maxKeys = "+readlen+","+blocksize+","+maxKeys);
+ if(blocksize>readlen){return null;}
+ //Number of positions at which a key of length blocksize can start.
+ int slots=readlen-blocksize+1;
+// System.err.println("slots = "+slots);
+ if(slots==1 || maxKeys==1){return new int[] {slots/2};}
+ if(slots==2 || maxKeys==2){return new int[] {0, slots-1};}
+ if(slots==3 || maxKeys==3){return new int[] {0, slots/2, slots-1};}
+
+ //Slots other than the first and last, and the number of keys to place among them.
+ int midslots=slots-2;
+ maxKeys=Tools.min(maxKeys, slots);
+ int middles=Tools.min(maxKeys-2, midslots);
+// System.err.println("midslots = "+midslots);
+// System.err.println("middles = "+middles);
+
+ assert(middles>0); //due to the escape conditions
+
+// float fspacing=midslots/(float)(middles+0); //Bad - leaves 2 adjacent keys at the end.
+ float fspacing=midslots/(float)(middles+1f);
+ fspacing=Tools.max(1f, fspacing);
+ assert(fspacing>=1);
+
+ int[] offsets=new int[middles+2];
+ offsets[0]=0;
+ offsets[offsets.length-1]=slots-1;
+
+
+// for(int i=1; i<=middles; i++){
+// offsets[i]=Math.round(fspacing*i);
+// }
+
+
+
+ for(int i=1; i<=middles; i++){
+ offsets[i]=Math.round(fspacing*i);
+ }
+ //With more than two middle keys, round the first one down and the last one up instead of to nearest.
+ if(middles>2){
+ offsets[1]=(int)fspacing;
+ offsets[middles]=(int) Math.ceil(fspacing*middles);
+ }
+
+// System.out.println("readlen = \t"+readlen);
+// System.out.println("blocksize = \t"+blocksize);
+//// System.out.println("overlap = \t"+overlap);
+// System.out.println("slots = \t"+slots);
+// System.out.println("midslots = \t"+midslots);
+//// System.out.println("spacing = \t"+spacing);
+// System.out.println("middles = \t"+middles);
+// System.out.println("fspacing = \t"+fspacing);
+// System.out.println("Offsets = \t"+Arrays.toString(offsets));
+
+ //Sanity check (assertions only): no two keys may share or swap positions.
+ for(int i=1; i<offsets.length; i++){
+ if(offsets[i]<=offsets[i-1]){assert(false) : "fspacing "+fspacing+"\nmidslots "+midslots+"\nmiddles "+middles+
+ "\nmaxKeys "+maxKeys+"\nslots "+slots+"\noffsets "+Arrays.toString(offsets);}
+ }
+
+ return offsets;
+
+ }
+
+// public static final int desiredKeys(int readlen, int blocksize, int overlap, int minKeysDesired){
+// assert(blocksize>0);
+// assert(overlap<blocksize);
+// assert(blocksize<=readlen) : readlen+", "+blocksize+", "+overlap+", "+minKeysDesired;
+// assert(minKeysDesired>=2);
+//
+// int slots=readlen-blocksize+1;
+// int midslots=slots-2;
+// int spacing=blocksize-overlap;
+//
+// if(slots<=minKeysDesired){return slots;}
+// if(slots<=spacing+1){return Tools.min(3, slots);}
+//
+// int middles=(midslots/spacing);
+// if(middles<minKeysDesired-2){
+// middles=Tools.max(minKeysDesired-2, midslots);
+// }
+//
+// assert(middles>0); //due to the escape conditions
+// return middles+2;
+// }
+
+ /**
+  * Computes how many keys to generate for a read.
+  * The result is ceil(readlen*density/blocksize), raised to at least minKeysDesired and then
+  * capped at the number of possible key start positions (readlen-blocksize+1).
+  * @param readlen Read length
+  * @param blocksize Key length
+  * @param density Target keys per base; asserted to be in (0, blocksize]
+  * @param minKeysDesired Lower bound on the key count, subject to the cap
+  */
+ public static final int desiredKeysFromDensity(int readlen, int blocksize, float density, int minKeysDesired){
+     assert(blocksize>0);
+     assert(density<=blocksize) : density+", "+blocksize;
+     assert(density>0);
+     assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired;
+
+     final int slots=readlen-blocksize+1;
+     final int fromDensity=(int)Math.ceil((readlen*density)/blocksize);
+     assert(fromDensity>=0);
+
+     final int atLeastMin=Tools.max(minKeysDesired, fromDensity);
+     return Tools.min(slots, atLeastMin);
+ }
+
+ /**
+  * Generates key offsets for a read of the given length: the key count comes from
+  * desiredKeysFromDensity and the placement from makeOffsetsWithNumberOfKeys.
+  * @param readlen Read length
+  * @param blocksize Key length
+  * @param density Target keys per base
+  * @param minKeysDesired Lower bound on the key count
+  * @return Offsets in increasing order, or null if the read is shorter than one key
+  */
+ public static final int[] makeOffsets(final int readlen, int blocksize, float density, int minKeysDesired){
+     assert(blocksize>0);
+     assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired;
+
+     if(blocksize>readlen){return null;}
+
+     final int keyCount=desiredKeysFromDensity(readlen, blocksize, density, minKeysDesired);
+     assert(keyCount>0) : readlen+","+blocksize+","+density+","+minKeysDesired+","+keyCount;
+
+     final int[] result=makeOffsetsWithNumberOfKeys(readlen, blocksize, keyCount);
+     assert(result!=null) :readlen+","+blocksize+","+density+","+minKeysDesired+","+keyCount;
+     return result;
+ }
+
+ /**
+  * Generates key offsets for a read after trimming zero-quality bases from both ends.
+  * The key count is computed from the untrimmed length (qual.length), but the keys are placed
+  * within the trimmed region and then shifted by the amount trimmed on the left.
+  * @param qual Per-base quality values; a value below 1 marks a base to trim past
+  * @param blocksize Key length
+  * @param density Target keys per base
+  * @param minKeysDesired Lower bound on the key count
+  * @return Offsets relative to the untrimmed read, or null if the trimmed region is shorter than one key
+  */
+ public static final int[] makeOffsets(byte[] qual, int blocksize, float density, int minKeysDesired){
+ int readlen=qual.length;
+ assert(blocksize>0);
+ assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired;
+
+ int left=0, right=readlen-1;
+
+ //Scan from the left; each base with quality below 1 moves left just past it and resets cntr.
+ //The scan stops once cntr reaches blocksize.
+ //NOTE(review): the loop header still increments cntr on the iteration that resets it, so after a
+ //reset only blocksize-1 further clean bases are needed to stop - confirm this is intended.
+ for(int i=left, cntr=0; i<readlen && cntr<blocksize; i++, cntr++){
+ if(qual[i]<1){
+ left=i+1;
+ cntr=0;
+ }
+ }
+ //Mirror image of the loop above, scanning from the right end.
+ for(int i=right, cntr=0; i>=0 && cntr<blocksize; i--, cntr++){
+ if(qual[i]<1){
+ right=i-1;
+ cntr=0;
+ }
+ }
+
+// System.out.println("left="+left+", right="+right+", readlen="+readlen+", " +
+// "blocksize="+blocksize+", density="+density+", minKeysDesired="+minKeysDesired);
+
+ //From here on readlen is the length of the trimmed region.
+ readlen=right-left+1;
+ assert(readlen<=qual.length);
+ if(readlen<blocksize){return null;}
+
+ //Uses the untrimmed length, so trimming does not by itself lower the requested key count.
+ int desiredKeys=desiredKeysFromDensity(qual.length, blocksize, density, minKeysDesired);
+ assert(desiredKeys>0) : qual.length+","+readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys;
+
+// System.out.println("desiredKeys="+desiredKeys);
+// System.out.println("Resulting density = "+(desiredKeys*blocksize)/(float)qual.length);
+
+ int[] offsets=makeOffsetsWithNumberOfKeys(readlen, blocksize, desiredKeys);
+// System.out.println("desiredKeys="+desiredKeys+", actual="+(offsets==null ? 0 : offsets.length));
+ assert(offsets!=null) : qual.length+","+readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys;
+ //Translate offsets from the trimmed region back to coordinates of the full read.
+ if(left>0){
+ for(int i=0; i<offsets.length; i++){offsets[i]+=left;}
+ }
+ return offsets;
+ }
+
+// public static final int[] makeOffsets2(float[] keyErrorProb,
+// final int readlenOriginal, int blocksize, float density, int minKeysDesired){
+// return makeOffsets2(keyErrorProb, readlenOriginal, blocksize, density, 2*density, minKeysDesired);
+// }
+
+ /**
+  * Generates key offsets after trimming key positions with a high error probability from both ends.
+  * @param keyErrorProb Error value per key start position; indexed from 0 to readlenOriginal-blocksize here
+  * @param readlenOriginal Read length
+  * @param blocksize Key length
+  * @param density Target keys per base, applied to the original length
+  * @param maxDensity Density cap applied to the trimmed length when trimming occurred; must be at least density
+  * @param minKeysDesired Lower bound on the key count
+  * @return Offsets relative to the untrimmed read, or null if every key position was trimmed
+  */
+ public static final int[] makeOffsets2(float[] keyErrorProb,
+ final int readlenOriginal, int blocksize, float density, float maxDensity, int minKeysDesired){
+ int readlen=readlenOriginal;
+ assert(maxDensity>=density);
+ assert(blocksize>0);
+ assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired;
+
+ //left and right are the first and last candidate key start positions.
+ int left=0, right=readlen-blocksize;
+
+ //This can be set as low as .90 for long reads, if qualities are accurate.
+ //With KEEP_BAD_KEYS the limit is 2f; NOTE(review): presumably above any stored value, which would disable trimming.
+ final float errorLimit=KEEP_BAD_KEYS ? 2f : 0.94f; //Default: .95f
+
+ //Trim positions whose error value exceeds the limit, from each end.
+ while(left<=right && keyErrorProb[left]>errorLimit){left++;}
+ while(right>=left && keyErrorProb[right]>errorLimit){right--;}
+
+// System.out.println("left="+left+", right="+right+", readlen="+readlen+", " +
+// "blocksize="+blocksize+", density="+density+", minKeysDesired="+minKeysDesired);
+
+ if(right<left){return null;}
+ //Length of the region spanned by keys starting at left through right.
+ readlen=right-left+blocksize;
+ assert(readlen<=readlenOriginal);
+ if(readlen<blocksize){
+ assert(false);
+ return null;
+ }
+
+// System.out.println("Left="+left+", right="+right);
+
+ //Key count from the original length; if trimmed, capped by what maxDensity allows on the trimmed length.
+ int desiredKeys=desiredKeysFromDensity(readlenOriginal, blocksize, density, minKeysDesired);
+ if(readlen<readlenOriginal){
+ int desiredKeys2=desiredKeysFromDensity(readlen, blocksize, maxDensity, minKeysDesired);
+ desiredKeys=Tools.min(desiredKeys, desiredKeys2);
+ }
+ assert(desiredKeys>0) : readlenOriginal+","+readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys;
+
+// System.out.println("desiredKeys="+desiredKeys);
+// System.out.println("Resulting density = "+(desiredKeys*blocksize)/(float)qual.length);
+
+ int[] offsets=makeOffsetsWithNumberOfKeys(readlen, blocksize, desiredKeys);
+
+// System.out.println("offsets initial = "+Arrays.toString(offsets));
+
+// System.out.println("desiredKeys="+desiredKeys+", actual="+(offsets==null ? 0 : offsets.length));
+ assert(offsets!=null) : readlenOriginal+","+readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys;
+ //Translate offsets from the trimmed region back to coordinates of the full read.
+ if(left>0){
+ for(int i=0; i<offsets.length; i++){offsets[i]+=left;}
+ }
+ return offsets;
+ }
+
+ /**
+  * Generates key offsets like makeOffsets2, but additionally avoids individual key positions
+  * whose error value is at or above a second, stricter-to-fail limit (errorLimit2).
+  * Ends are trimmed using errorLimit1; then target positions are stepped across the remaining
+  * range at an even interval, and each target that is unusable is replaced by a nearby usable
+  * position if one is found, or dropped otherwise.
+  * @param keyErrorProb Error value per key start position; must cover indices up to readlenOriginal-blocksize
+  * @param readlenOriginal Read length
+  * @param blocksize Key length
+  * @param density Target keys per base, applied to the original length
+  * @param maxDensity Density cap applied to the trimmed length when trimming occurred; must be at least density
+  * @param minKeysDesired Lower bound on the key count
+  * @param semiperfectmode Selects 0.99f instead of 0.94f as the end-trimming limit
+  * @return Offsets relative to the untrimmed read (possibly fewer than requested), or null if no position is usable
+  */
+ public static final int[] makeOffsets3(float[] keyErrorProb,
+ final int readlenOriginal, int blocksize, float density, float maxDensity, int minKeysDesired, boolean semiperfectmode){
+ int readlen=readlenOriginal;
+ assert(maxDensity>=density);
+ assert(blocksize>0);
+ assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired;
+
+ //Last valid key start position in the untrimmed read.
+ final int maxProbIndex=readlen-blocksize;
+// assert(maxProbIndex==keyErrorProb.length-1);
+ assert(maxProbIndex<=keyErrorProb.length-1) : maxProbIndex+", "+keyErrorProb.length;
+ int left=0, right=maxProbIndex;
+
+ //Per-position limit: a position is usable as a key only if its error value is below this.
+ final float errorLimit2=KEEP_BAD_KEYS ? 2f : 0.9999f; //Default: .95f
+
+ //This can be set as low as .90 for long reads, if qualities are accurate.
+ final float errorLimit1=KEEP_BAD_KEYS ? 2f : (semiperfectmode ? 0.99f : 0.94f); //Default: .95f
+
+ //Trim positions at or above errorLimit1 from each end.
+ while(left<=right && keyErrorProb[left]>=errorLimit1){left++;}
+ while(right>=left && keyErrorProb[right]>=errorLimit1){right--;}
+
+// System.out.println("Left="+left+", right="+right);
+
+ //Count usable positions in the trimmed range; this is zero when the range is empty.
+ int potentialKeys=0;
+ for(int i=left; i<=right; i++){
+ if(keyErrorProb[i]<errorLimit2){potentialKeys++;}
+ }
+ if(potentialKeys==0){return null;}
+
+// System.out.println("left="+left+", right="+right+", readlen="+readlen+", " +
+// "blocksize="+blocksize+", density="+density+", minKeysDesired="+minKeysDesired);
+
+ if(right<left){return null;}
+ //Length of the region spanned by keys starting at left through right.
+ readlen=right-left+blocksize;
+ assert(readlen<=readlenOriginal);
+ if(readlen<blocksize){
+ assert(false);
+ return null;
+ }
+
+ //Key count from the original length, capped by maxDensity on the trimmed length and by the usable positions.
+ int desiredKeys=desiredKeysFromDensity(readlenOriginal, blocksize, density, minKeysDesired);
+ if(readlen<readlenOriginal){
+ int desiredKeys2=desiredKeysFromDensity(readlen, blocksize, maxDensity, minKeysDesired);
+ desiredKeys=Tools.min(desiredKeys, desiredKeys2);
+ }
+ desiredKeys=Tools.min(desiredKeys, potentialKeys);
+ assert(desiredKeys>0) : readlenOriginal+","+readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys;
+
+// System.out.println("desiredKeys="+desiredKeys);
+// System.out.println("Resulting density = "+(desiredKeys*blocksize)/(float)qual.length);
+
+ int[] offsets=new int[desiredKeys];
+ //Ideal spacing between consecutive keys across [left, right].
+ float interval=(right-left)/(float)(Tools.max(desiredKeys-1, 1));
+
+ //Width of the forward search window used when a target position is unusable.
+ int intervalInt=((int)interval)+1;
+
+ //f: ideal (fractional) target position; j: integer target; prev: last position accepted or ruled out.
+ float f=left;
+ int prev=-1;
+ int misses=0;
+ for(int i=0, j=left; i<offsets.length; i++){
+ int x=-1;
+
+// System.out.println("prev="+prev+", j="+j+", intervalInt="+intervalInt);
+
+ if(prev<j){
+ if(keyErrorProb[j]<errorLimit2 && (prev<0 || j-prev>0)){
+ x=j;
+// System.out.println("A: x="+x);
+ }else{
+ //Target unusable: search backward from j-1 down to prev+3 ...
+ for(int k=j-1, lim=prev+2; k>lim; k--){
+ if(keyErrorProb[k]<errorLimit2){x=k;break;}
+ }
+// System.out.println("B: x="+x);
+ //... then forward from j+1 up to, but not including, min(j+intervalInt, right).
+ if(x<0){
+ for(int k=j+1, lim=Tools.min(j+intervalInt, right); k<lim; k++){
+ if(keyErrorProb[k]<errorLimit2){x=k;break;}
+ }
+ }
+// System.out.println("C: x="+x);
+ }
+ }
+
+ //A value of -1 marks a miss; misses are removed after the loop.
+ offsets[i]=x;
+ if(x>-1){
+ assert(keyErrorProb[x]<errorLimit2);
+ prev=x;
+ }else{
+ misses++;
+ prev=Tools.max(prev, j-2);
+ }
+
+ //Advance the target by one interval, always moving at least one position and never past maxProbIndex.
+ f+=interval;
+ j=Tools.min(maxProbIndex, (Tools.max(j+1, (int)Math.round(f))));
+ }
+// System.out.println("offsets initial = "+Arrays.toString(offsets));
+
+ //Compact the array, dropping the -1 placeholders.
+ if(misses>0){
+ int[] offsets2=new int[offsets.length-misses];
+ for(int i=0, j=0; i<offsets.length; i++){
+ if(offsets[i]>=0){
+ offsets2[j]=offsets[i];
+ j++;
+ }
+ }
+ offsets=offsets2;
+ }
+// System.out.println("offsets shrunk = "+Arrays.toString(offsets));
+
+// System.out.println("desiredKeys="+desiredKeys+", actual="+(offsets==null ? 0 : offsets.length));
+ assert(offsets!=null) : readlenOriginal+","+readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys;
+ return offsets;
+ }
+
+ /** When true, makeOffsets2 and makeOffsets3 use 2f for their error limits instead of the normal thresholds. NOTE(review): presumably this keeps every key regardless of error value - confirm keyErrorProb never exceeds 2. */
+ public static boolean KEEP_BAD_KEYS=false;
+
+}
diff --git a/current/align2/ListNum.java b/current/align2/ListNum.java
new file mode 100755
index 0000000..7914be7
--- /dev/null
+++ b/current/align2/ListNum.java
@@ -0,0 +1,59 @@
+package align2;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Random;
+
+import stream.Read;
+
+ /**
+  * An ArrayList paired with a numeric id.
+  * The list reference may be null, in which case {@link #size()} is 0 and iteration yields nothing;
+  * the element accessors (get, set, remove, add, clear) still require a non-null list.
+  */
+ public final class ListNum<K extends Serializable> implements Serializable, Iterable<K> {
+
+     private static final long serialVersionUID = -7509242172010729386L;
+
+     /**
+      * Wraps list_ (no copy is made) and records id_.
+      * When deterministic random mode is on, every non-null element is cast to Read and its rand
+      * field is set from the shared generator, in list order.
+      * @param list_ Backing list; may be null
+      * @param id_ Identifier for this list
+      */
+     public ListNum(ArrayList<K> list_, long id_){
+         list=list_;
+         id=id_;
+         if(GEN_RANDOM_NUMBERS && list!=null){
+             for(K k : list){
+                 if(k!=null){
+                     ((Read)k).rand=randy.nextDouble();
+                 }
+             }
+         }
+     }
+
+     /** @return Number of elements, or 0 if the backing list is null. */
+     public final int size(){
+         return list==null ? 0 : list.size();
+     }
+
+     public final K get(int i){return list.get(i);}
+     public final K set(int i, K k){return list.set(i, k);}
+     public final K remove(int i){return list.remove(i);}
+     public final void add(K k){list.add(k);}
+     public final void clear(){list.clear();}
+
+     /**
+      * @return An iterator over the backing list, or an empty iterator if the list is null.
+      * Iterable must never hand out null, since a for-each loop dereferences the result.
+      */
+     @Override
+     public Iterator<K> iterator() {
+         if(list==null){return java.util.Collections.<K>emptyList().iterator();}
+         return list.iterator();
+     }
+
+     /** Backing list; may be null. */
+     public final ArrayList<K> list;
+     /** Identifier supplied at construction. */
+     public final long id;
+
+     /**
+      * Turns deterministic random mode on or off.
+      * Each call with true installs a new generator seeded with the current seed and then
+      * increments the seed, so successive enables use seeds 0, 1, 2, ...
+      */
+     public static synchronized void setDeterministicRandom(boolean b){
+         GEN_RANDOM_NUMBERS=b;
+         if(b){
+             randy=new Random(seed);
+             seed++;
+         }
+     }
+
+     /** @return True if deterministic random mode is on. */
+     public static boolean deterministicRandom(){
+         return GEN_RANDOM_NUMBERS;
+     }
+
+     private static boolean GEN_RANDOM_NUMBERS=false;
+     private static Random randy;
+     private static long seed=0;
+ }
diff --git a/current/align2/LongList.java b/current/align2/LongList.java
new file mode 100755
index 0000000..cde4053
--- /dev/null
+++ b/current/align2/LongList.java
@@ -0,0 +1,255 @@
+package align2;
+
+import java.util.Arrays;
+
+
+
+ /**
+  * A growable list of primitive longs backed by a public array, with a few summary statistics.
+  * Only indices below {@link #size} are considered occupied; the backing array may be longer.
+  */
+ public final class LongList{
+
+     /** Creates a list with a capacity of 256. */
+     public LongList(){this(256);}
+
+     /** @param initial Initial capacity of the backing array; must be positive. */
+     public LongList(int initial){
+         assert(initial>0);
+         array=new long[initial];
+     }
+
+     /** Sets size to zero without touching the backing array. */
+     public void clear(){
+         size=0;
+     }
+
+     /** Stores value at loc, growing the array if needed and extending size to cover loc. */
+     public final void set(int loc, long value){
+         if(loc>=array.length){
+             resize(loc*2L+1);
+         }
+         array[loc]=value;
+         size=max(size, loc+1);
+     }
+
+     /** Adds value to the element at loc, growing the array if needed and extending size to cover loc. */
+     public final void increment(int loc, long value){
+         if(loc>=array.length){
+             resize(loc*2L+1);
+         }
+         array[loc]+=value;
+         size=max(size, loc+1);
+     }
+
+     /** Adds 1 to the element at loc. */
+     public final void increment(int loc){
+         increment(loc, 1);
+     }
+
+     /** Element-wise sum: adds b[i] to this[i] for every index of b. Runs from the top index down so at most one resize occurs. */
+     public final void add(LongList b){
+         for(int i=b.size-1; i>=0; i--){
+             increment(i, b.get(i));
+         }
+     }
+
+     /** Element-wise sum: adds b[i] to this[i] for every index of b. */
+     public final void add(long[] b){
+         for(int i=b.length-1; i>=0; i--){
+             increment(i, b[i]);
+         }
+     }
+
+     /** Appends every element of b to the end of this list. */
+     public final void append(LongList b){
+         for(int i=0; i<b.size; i++){
+             add(b.get(i));
+         }
+     }
+
+     /** Appends every element of b to the end of this list. */
+     public final void append(long[] b){
+         for(int i=0; i<b.length; i++){
+             add(b[i]);
+         }
+     }
+
+     /** @return The element at loc, or 0 if loc is at or beyond size. */
+     public final long get(int loc){
+         return(loc>=size ? 0 : array[loc]);
+     }
+
+     /** Appends x at index size, growing the array if it is full. */
+     public final void add(long x){
+         if(size>=array.length){
+             resize(size*2L+1);
+         }
+         array[size]=x;
+         size++;
+     }
+
+     /**
+      * Replaces the backing array with a copy of length size2, clamped to Integer.MAX_VALUE.
+      * @param size2 Requested capacity; computed in long by callers so doubling cannot wrap.
+      */
+     private final void resize(final long size2){
+         assert(size2>size) : size+", "+size2;
+         final int size3=(int)min(Integer.MAX_VALUE, size2);
+         //Check the clamped value; re-testing size2 here could never detect the clamp.
+         assert(size3>size) : "Overflow: "+size+", "+size2+" -> "+size3;
+         array=Arrays.copyOf(array, size3);
+     }
+
+     /** Trims the backing array to exactly size elements. */
+     public final void shrink(){
+         if(size==array.length){return;}
+         array=Arrays.copyOf(array, size);
+     }
+
+     /** @return The population standard deviation (divides by size), or 0 for fewer than 2 elements. */
+     public final double stdev(){
+         if(size<2){return 0;}
+         double sum=sum();
+         double avg=sum/size;
+         double sumdev2=0;
+         for(int i=0; i<size; i++){
+             long x=array[i];
+             double dev=avg-x;
+             sumdev2+=(dev*dev);
+         }
+         return Math.sqrt(sumdev2/size);
+     }
+
+     /** @return The sum of all elements, accumulated as a long. */
+     public final long sumLong(){
+         long sum=0;
+         for(int i=0; i<size; i++){
+             sum+=array[i];
+         }
+         return sum;
+     }
+
+     /** @return The sum of all elements, accumulated as a double. */
+     public final double sum(){
+         double sum=0;
+         for(int i=0; i<size; i++){
+             sum+=array[i];
+         }
+         return sum;
+     }
+
+     /** @return The arithmetic mean, or 0 for an empty list. */
+     public final double mean(){
+         return size<1 ? 0 : sum()/size;
+     }
+
+     /**
+      * Assumes list is sorted.
+      * @return The element at index percentile(0.5), or 0 for an empty list.
+      * NOTE(review): percentile() is weighted by element value, so this is not the positional
+      * median in general - confirm that is what callers expect.
+      */
+     public final long median(){
+         if(size<1){return 0;}
+         int idx=percentile(0.5);
+         return array[idx];
+     }
+
+     /**
+      * Assumes list is sorted.
+      * @return The value with the longest run of equal elements (the earliest such value on ties),
+      * or 0 for an empty list.
+      */
+     public final long mode(){
+         if(size<1){return 0;}
+         assert(sorted());
+         int streak=1, bestStreak=0;
+         long prev=array[0];
+         long best=prev;
+         //array[0] is already counted by the initial streak of 1, so the scan starts at index 1.
+         for(int i=1; i<size; i++){
+             long x=array[i];
+             if(x==prev){streak++;}
+             else{
+                 if(streak>bestStreak){
+                     bestStreak=streak;
+                     best=prev;
+                 }
+                 streak=1;
+                 prev=x;
+             }
+         }
+         //Account for the final run.
+         if(streak>bestStreak){
+             bestStreak=streak;
+             best=prev;
+         }
+         return best;
+     }
+
+     /**
+      * Assumes list is sorted.
+      * @param fraction Fraction of the total sum to reach
+      * @return The first index at which the running sum of elements reaches fraction times the total
+      * sum; size-1 if it is never reached or if the list has fewer than 2 elements (-1 when empty).
+      */
+     public int percentile(double fraction){
+         if(size<2){return size-1;}
+         assert(sorted());
+         double target=(sum()*fraction);
+         double sum=0;
+         for(int i=0; i<size; i++){
+             sum+=array[i];
+             if(sum>=target){
+                 return i;
+             }
+         }
+         return size-1;
+     }
+
+     /**
+      * Removes duplicate values and trims the backing array to the unique values.
+      * Assumes the list is sorted ascending; an empty list is simply shrunk.
+      */
+     public final void shrinkToUnique(){
+         if(size<=0){
+             shrink();
+             return;
+         }
+
+         //First pass: count the distinct values.
+         int unique=1;
+         for(int i=1; i<size; i++){
+             assert(array[i]>=array[i-1]);
+             if(array[i]!=array[i-1]){unique++;}
+         }
+         //Array is full and already unique; nothing to do.
+         if(unique==array.length){return;}
+
+         //Second pass: copy the first element of each run.
+         final long[] alt=new long[unique];
+         alt[0]=array[0];
+         for(int i=1, j=1; j<unique; i++){
+             if(array[i]!=array[i-1]){
+                 alt[j]=array[i];
+                 j++;
+             }
+         }
+
+         array=alt;
+         size=alt.length;
+     }
+
+     /** Same as {@link #toStringListView()}. */
+     @Override
+     public String toString(){
+         return toStringListView();
+     }
+
+     /** @return The nonzero elements as "[(index, value), ...]". */
+     public String toStringSetView(){
+         final StringBuilder sb=new StringBuilder();
+         sb.append('[');
+         String comma="";
+         for(int i=0; i<size; i++){
+             if(array[i]!=0){
+                 sb.append(comma).append('(').append(i).append(", ").append(array[i]).append(')');
+                 comma=", ";
+             }
+         }
+         sb.append(']');
+         return sb.toString();
+     }
+
+     /** @return The first size elements as "[a, b, ...]". */
+     public String toStringListView(){
+         final StringBuilder sb=new StringBuilder();
+         sb.append('[');
+         String comma="";
+         for(int i=0; i<size; i++){
+             sb.append(comma).append(array[i]);
+             comma=", ";
+         }
+         sb.append(']');
+         return sb.toString();
+     }
+
+     /** @return A new array holding exactly the first size elements. */
+     public long[] toArray(){
+         return Arrays.copyOf(array, size);
+     }
+
+     /** Sorts the first size elements ascending. */
+     public void sort() {
+         if(size>1){Arrays.sort(array, 0, size);}
+     }
+
+     /** @return True if the first size elements are in nondecreasing order. */
+     public boolean sorted(){
+         for(int i=1; i<size; i++){
+             if(array[i]<array[i-1]){return false;}
+         }
+         return true;
+     }
+
+     private static final long min(long x, long y){return x<y ? x : y;}
+     private static final long max(long x, long y){return x>y ? x : y;}
+
+     private static final int min(int x, int y){return x<y ? x : y;}
+     private static final int max(int x, int y){return x>y ? x : y;}
+
+     /** Backing storage; may be longer than size. */
+     public long[] array;
+     /** Highest occupied index plus 1, i.e., lowest unoccupied index */
+     public int size=0;
+
+ }
diff --git a/current/align2/LongM.java b/current/align2/LongM.java
new file mode 100755
index 0000000..92d48f0
--- /dev/null
+++ b/current/align2/LongM.java
@@ -0,0 +1,62 @@
+package align2;
+
+/**
+ * A mutable long object
+ * @author Brian Bushnell
+ * @date Feb 8, 2013
+ *
+ */
+public class LongM implements Comparable<LongM> {
+ public LongM(){this(0L);}
+ public LongM(long v){value=v;}
+
+ /**
+ * @param key
+ * @param b
+ */
+ public LongM(long v, boolean mut) {
+ value=v;
+ mutable=mut;
+ }
+
+ public LongM iCopy(){
+ if(!mutable){return this;}
+ return new LongM(value, false);
+ }
+
+ public long value(){return value;}
+// public long longValue(){return value;}
+ public void lock(){mutable=false;}
+
+ public long set(long v){
+ if(!mutable){throw new RuntimeException("Mutating a locked LongM");}
+ return (value=v);
+ }
+ public long increment(){return set(value+1);}
+ public long increment(long x){return set(value+x);}
+
+ @Override
+ public int hashCode(){
+ return (int)((value^(value>>>32))&0xFFFFFFFFL);
+ }
+
+ @Override
+ public int compareTo(LongM b){
+ return value==b.value ? 0 : value<b.value ? -1 : 1;
+ }
+
+ public boolean equals(LongM b){
+ return value==b.value;
+ }
+
+ @Override
+ public boolean equals(Object b){
+ return equals((LongM)b);
+ }
+ public String toString(){return Long.toString(value);}
+ public String toHexString(){return Long.toHexString(value);}
+ public String toBinaryString(){return Long.toBinaryString(value);}
+
+ private boolean mutable=true;
+ private long value;
+}
\ No newline at end of file
diff --git a/current/align2/MSA.java b/current/align2/MSA.java
new file mode 100755
index 0000000..31df396
--- /dev/null
+++ b/current/align2/MSA.java
@@ -0,0 +1,870 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * @author Brian Bushnell
+ * @date Jun 20, 2013
+ *
+ */
+public abstract class MSA {
+
+ /**
+ * Dispatches to the static minIdToMinRatio of the aligner class named by classname.
+ * Names are compared case-insensitively. An unrecognized (or null) name trips an
+ * assertion when assertions are enabled; otherwise MultiStateAligner11ts is used.
+ * @param minid Minimum identity, passed through unchanged
+ * @param classname Simple name of an MSA subclass
+ * @return The value computed by the selected aligner class
+ */
+ public static final float minIdToMinRatio(double minid, String classname){
+ if("MultiStateAligner9ts".equalsIgnoreCase(classname)){return MultiStateAligner9ts.minIdToMinRatio(minid);}
+ if("MultiStateAligner10ts".equalsIgnoreCase(classname)){return MultiStateAligner10ts.minIdToMinRatio(minid);}
+ if("MultiStateAligner11ts".equalsIgnoreCase(classname)){return MultiStateAligner11ts.minIdToMinRatio(minid);}
+ if("MultiStateAligner9PacBio".equalsIgnoreCase(classname)){return MultiStateAligner9PacBio.minIdToMinRatio(minid);}
+ if("MultiStateAligner9Flat".equalsIgnoreCase(classname)){return MultiStateAligner9Flat.minIdToMinRatio(minid);}
+ if("MultiStateAligner9XFlat".equalsIgnoreCase(classname)){return MultiStateAligner9XFlat.minIdToMinRatio(minid);}
+ assert(false) : "Unhandled MSA type: "+classname;
+ return MultiStateAligner11ts.minIdToMinRatio(minid);
+ }
+
+ /**
+ * Factory for aligner instances, selected by case-insensitive class name.
+ * Resets the static flatMode flag to false, and sets it to true only when a
+ * MultiStateAligner9XFlat is requested. "MultiStateAligner11ts" yields the JNI
+ * variant when Shared.USE_JNI is set. An unrecognized name trips an assertion
+ * when assertions are enabled; otherwise a MultiStateAligner11ts is returned.
+ * @param maxRows_ Forwarded to the aligner constructor, as is maxColumns_
+ * @return A new aligner
+ */
+ public static final MSA makeMSA(int maxRows_, int maxColumns_, String classname){
+ flatMode=false;
+ if("MultiStateAligner9ts".equalsIgnoreCase(classname)){return new MultiStateAligner9ts(maxRows_, maxColumns_);}
+ if("MultiStateAligner10ts".equalsIgnoreCase(classname)){return new MultiStateAligner10ts(maxRows_, maxColumns_);}
+ if("MultiStateAligner11ts".equalsIgnoreCase(classname)){
+ return Shared.USE_JNI ? new MultiStateAligner11tsJNI(maxRows_, maxColumns_) : new MultiStateAligner11ts(maxRows_, maxColumns_);
+ }
+ if("MultiStateAligner11tsJNI".equalsIgnoreCase(classname)){return new MultiStateAligner11tsJNI(maxRows_, maxColumns_);}
+ if("MultiStateAligner9PacBio".equalsIgnoreCase(classname)){return new MultiStateAligner9PacBio(maxRows_, maxColumns_);}
+ if("MultiStateAligner9Flat".equalsIgnoreCase(classname)){return new MultiStateAligner9Flat(maxRows_, maxColumns_);}
+ if("MultiStateAligner9XFlat".equalsIgnoreCase(classname)){
+ flatMode=true;
+ return new MultiStateAligner9XFlat(maxRows_, maxColumns_);
+ }
+ assert(false) : "Unhandled MSA type: "+classname;
+ return new MultiStateAligner11ts(maxRows_, maxColumns_);
+ }
+
+ /** Records the matrix dimension limits in the final fields maxRows and maxColumns. */
+ public MSA(int maxRows_, int maxColumns_){
+ this.maxRows=maxRows_; this.maxColumns=maxColumns_;
+ }
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public abstract int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps);
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Takes no minScore. NOTE(review): presumably fills the whole requested range without score-based pruning - confirm in subclasses */
+ public abstract int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps);
+
+ @Deprecated
+ /** return new int[] {rows, maxC, maxS, max}; */
+ public abstract int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc);
+
+
+ /** Generates the match string.
+ * @return The match string */
+ public abstract byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped);
+
+
+ /** Generates the match string */
+ public abstract byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state);
+
+ /** @return {score, bestRefStart, bestRefStop} */
+ public abstract int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState, boolean gapped);
+
+ /** @return {score, bestRefStart, bestRefStop}, or {score, bestRefStart, bestRefStop, padLeft, padRight} if more padding is needed */
+ public abstract int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState);
+
+
+ /** Clamps [refStartLoc, refEndLoc] to the reference array, fills via fillLimited, then scores the result. Will not fill areas that cannot match minScore.
+ * @return {score, bestRefStart, bestRefStop}, or null if fillLimited returned null */
+ public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ int a=Tools.max(0, refStartLoc);
+ int b=Tools.min(ref.length-1, refEndLoc);
+ assert(b>=a);
+ //a and b are the clamped, inclusive reference bounds used for both filling and scoring
+ int[] score;
+ //Debug echo. NOTE(review): new String(ref, a, b-a) prints b-a bases, one fewer than the inclusive range [a, b]
+ if(verbose && b-a<500){
+ System.err.println(new String(read));
+ System.err.println(new String(ref, a, b-a));
+ }
+ //Only the ungapped path enforces the maxColumns limit
+ if(gaps==null){
+ if(verbose){
+ System.err.println("no gaps");
+ }
+ if(b-a>=maxColumns){
+ System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
+ assert(false) : refStartLoc+", "+refEndLoc; //With assertions enabled this aborts; otherwise the range is truncated below
+ b=Tools.min(ref.length-1, a+maxColumns-1);
+ }
+ int[] max=fillLimited(read, ref, a, b, minScore, gaps);
+ score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], false));
+ }else{
+ if(verbose){System.err.println("\ngaps: "+Arrays.toString(gaps)+"\n"+new String(read)+"\ncoords: "+refStartLoc+", "+refEndLoc);}
+ int[] max=fillLimited(read, ref, a, b, minScore, gaps);
+ if(verbose){System.err.println("max: "+Arrays.toString(max));}
+// score=(max==null ? null : score(read, grefbuffer, 0, greflimit, max[0], max[1], max[2], true));
+ score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], true));
+ }
+ return score;
+ }
+
+ public final int[] fillAndScoreLimited(byte[] read, SiteScore ss, int thresh, int minScore){ //Unpacks chrom, start, stop and gaps from ss, then delegates
+ return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore, ss.gaps);
+ }
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score)
+
+ public final int[] fillAndScoreLimited(byte[] read, int chrom, int start, int stop, int thresh, int minScore, int[] gaps){ //Looks up the chromosome's array and widens the window by thresh on each side
+ return fillAndScoreLimited(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, minScore, gaps);
+ }
+
+ @Deprecated
+ public final int[] fillAndScoreQ(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, byte[] baseScores){ //Clamps the range and calls fillQ, but always returns null (see below)
+ int a=Tools.max(0, refStartLoc);
+ int b=Tools.min(ref.length-1, refEndLoc);
+ assert(b>=a);
+ if(b-a>=maxColumns){
+ System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
+ b=Tools.min(ref.length-1, a+maxColumns-1);
+ }
+ int[] max=fillQ(read, ref, baseScores, a, b); //NOTE(review): the result is never used
+// int[] score=score(read, ref, a, b, max[0], max[1], max[2]);
+// return score;
+ return null; //Always null: the scoring step above is commented out
+ }
+
+ @Deprecated
+ public final int[] fillAndScoreQ(byte[] read, SiteScore ss, int thresh, byte[] baseScores){ //Unpacks chrom, start and stop from ss, then delegates
+ return fillAndScoreQ(read, ss.chrom, ss.start, ss.stop, thresh, baseScores);
+ }
+
+ @Deprecated
+ public final int[] fillAndScoreQ(byte[] read, int chrom, int start, int stop, int thresh, byte[] baseScores){ //Looks up the chromosome's array and widens the window by thresh on each side
+ return fillAndScoreQ(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, baseScores);
+ }
+
+ /** Delegates to the array-based overload using the chromosome named by ss.chrom and the start position ss.start. */
+ public final int scoreNoIndels(byte[] read, SiteScore ss){
+ return scoreNoIndels(read, Data.getChromosome(ss.chrom).array, ss.start);
+ }
+
+ /** Delegates to the array-based overload using the array of chromosome chrom and the start position refStart. */
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){
+ return scoreNoIndels(read, Data.getChromosome(chrom).array, refStart);
+ }
+
+ /** Delegates to the array-based, baseScores-aware overload using the chromosome named by ss.chrom and the start ss.start. */
+ public final int scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores){
+ return scoreNoIndels(read, Data.getChromosome(ss.chrom).array, baseScores, ss.start);
+ }
+
+ /** Delegates to the array-based, baseScores-aware overload using the array of chromosome chrom and the start refStart. */
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores){
+ return scoreNoIndels(read, Data.getChromosome(chrom).array, baseScores, refStart);
+ }
+
+// public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){
+
+ /** Calculates score based on an array from Index. */
+ public abstract int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases);
+
+ /** Calculates score based on an array from Index using a kfilter. Slightly slower. */
+ public abstract int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig);
+
+ public abstract int scoreNoIndels(byte[] read, byte[] ref, final int refStart);
+ public int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){ //Not abstract: this base version always throws, so only subclasses that override it can be called this way
+ throw new RuntimeException("Unimplemented method in class "+this.getClass());
+ }
+
+ public abstract byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart);
+
+ public abstract int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart);
+ public int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){ //Not abstract: this base version always throws, so only subclasses that override it can be called this way
+ throw new RuntimeException("Unimplemented method in class "+this.getClass());
+ }
+
+ public abstract int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn);
+
+ public abstract int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn);
+
+ /** Clips the alignment of r to its best-scoring local region: match symbols outside that region become 'C', and r.start, r.stop, r.match and r.mapScore (plus ss, which must mirror r) are updated. Assumes match string is in long format. Returns false if there is no match string, if no match run ever raised the score (r's mapping is then cleared), or if nothing needs clipping. */
+ public final boolean toLocalAlignment(Read r, SiteScore ss, byte[] basesM, int minToClip, float matchPointsMult){
+ final byte[] match=r.match, bases=(r.strand()==Gene.PLUS ? r.bases : basesM);
+ if(match==null || match.length<1){return false;}
+ //r and ss are expected to describe the same alignment and to share the same match array
+ assert(match==ss.match);
+ assert(match==r.match);
+ assert(r.start==ss.start);
+ assert(r.stop==ss.stop);
+ //When r.containsXY2() is true, ss.fixXY is applied first and r.start/r.stop are resynced from ss
+ if(r.containsXY2()){
+ if(verbose){System.err.println("\nInitial0:");}
+ if(verbose){System.err.println("0: match="+new String(match));}
+ if(verbose){System.err.println("0: r.start="+r.start+", r.stop="+r.stop+"; len="+bases.length+"; reflen="+(r.stop-r.start+1));}
+ ss.fixXY(bases, false, this);
+ r.start=ss.start;
+ r.stop=ss.stop;
+ if(verbose){System.err.println("\nAfter fixXY:");}
+ if(verbose){System.err.println("0: match="+new String(match));}
+ if(verbose){System.err.println("0: r.start="+r.start+", r.stop="+r.stop+"; len="+bases.length+"; reflen="+(r.stop-r.start+1));}
+ assert(match==ss.match);
+ assert(match==r.match);
+ assert(r.start==ss.start);
+ assert(r.stop==ss.stop);
+ assert(ss.lengthsAgree()) : ss.mappedLength()+"!="+ss.matchLength()+"\n"+ss+"\n\n"+r+"\n";
+ }
+ assert(ss.lengthsAgree()) : ss.mappedLength()+"!="+ss.matchLength()+"\n"+ss+"\n\n"+r+"\n";
+
+ int maxScore=-1;
+ //Suffixes below: C = position in the read (tracks cpos), M = index into match, R = reference position (tracks rpos)
+ int startLocC=-1;
+ int stopLocC=-1;
+ int lastZeroC=0;
+
+ int startLocM=-1;
+ int stopLocM=-1;
+ int lastZeroM=0;
+
+ int startLocR=-1;
+ int stopLocR=-1;
+ int lastZeroR=0;
+
+ byte mode=match[0], prevMode='0';
+ int current=0, prevStreak=0;
+ int cpos=0;
+ int rpos=r.start;
+ int score=0;
+
+ if(verbose){System.err.println("\nInitial:");}
+ if(verbose){System.err.println("A: r.start="+r.start+", r.stop="+r.stop+"; rpos="+rpos+"; len="+bases.length+"; reflen="+(r.stop-r.start+1));}
+ if(verbose){System.err.println("A: match=\n"+new String(match));}
+ if(verbose){System.err.println(new String(bases));}
+ if(verbose){System.err.println(Data.getChromosome(r.chrom).getString(r.start, Tools.max(r.stop, r.start+bases.length-1)));}
+
+ if(verbose){
+ int calcscore=score(match);
+ System.err.println("A: score="+r.mapScore+", ss.slowScore="+ss.slowScore+", calcscore="+calcscore);
+// assert(ss.slowScore<=calcscore); //May be lower due to ambig3. I found a case where this line fails, possibly due to long deletions?
+ }
+ //Run-length walk over match: each completed run of identical symbols is scored when the symbol changes
+ for(int mpos=0; mpos<match.length; mpos++){
+ byte c=match[mpos];
+
+ if(mode==c){
+ current++;
+ }else{
+ if(mode=='m'){
+ if(score<=0){ //Running score is clamped at zero; remember where a candidate local region would restart
+ score=0;
+ lastZeroC=cpos;
+ lastZeroM=mpos-current;
+ lastZeroR=rpos;
+ }
+ int add=calcMatchScore(current);
+ score+=(matchPointsMult*add);
+// if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N
+ cpos+=current;
+ rpos+=current;
+ if(score>maxScore){ //New best: the region runs from the last zero point to the end of this match run
+ maxScore=score;
+ startLocC=lastZeroC;
+ startLocM=lastZeroM;
+ startLocR=lastZeroR;
+ stopLocC=cpos-1;
+ stopLocM=mpos-1;
+ stopLocR=rpos-1;
+ }
+ }else if(mode=='S'){
+ score+=calcSubScore(current);
+ if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N
+ else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();}
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='D'){
+ score+=calcDelScore(current, true);
+ rpos+=current;
+ }else if(mode=='I'){
+ score+=calcInsScore(current);
+ cpos+=current;
+ }else if(mode=='C'){
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='X' || mode=='Y'){
+ score+=calcInsScore(current);//TODO: Consider changing XY to subs
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='N'){
+ score+=calcNocallScore(current);
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='R'){
+ score+=calcNorefScore(current);
+ cpos+=current;
+ rpos+=current;
+ }else{
+ assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match)+"\n"+new String(bases);
+ }
+ if(verbose){System.err.println("mode "+(char)mode+"->"+(char)c+"; rpos="+rpos);}
+ prevMode=mode;
+ prevStreak=current;
+ mode=c;
+ current=1;
+ }
+ }
+ if(current>0){ //Score the final run, which the loop above never flushes
+ assert(mode==match[match.length-1]);
+ if(mode=='m'){
+ if(score<=0){
+ score=0;
+ lastZeroC=cpos;
+ lastZeroM=match.length-current;
+ lastZeroR=rpos;
+ }
+ int add=calcMatchScore(current);
+ score+=(matchPointsMult*add);
+// if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N
+ cpos+=current;
+ rpos+=current;
+ if(score>maxScore){
+ maxScore=score;
+ startLocC=lastZeroC;
+ startLocM=lastZeroM;
+ startLocR=lastZeroR;
+ stopLocC=cpos-1;
+ stopLocM=match.length-1;
+ stopLocR=rpos-1;
+ }
+ }else if(mode=='S'){
+ score+=calcSubScore(current);
+ if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N
+ else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();}
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='D'){
+ score+=calcDelScore(current, true);
+ rpos+=current;
+ }else if(mode=='I'){
+ score+=calcInsScore(current);
+ cpos+=current;
+ }else if(mode=='C'){
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='X' || mode=='Y'){
+ score+=calcInsScore(current);
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='N'){
+ score+=calcNocallScore(current);
+ cpos+=current;
+ rpos+=current;
+ }else if(mode=='R'){
+ score+=calcNorefScore(current);
+ cpos+=current;
+ rpos+=current;
+ }else if(mode!=0){
+ assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match)+"\n"+new String(bases);
+ }
+ if(verbose){System.err.println("mode "+(char)mode+"->end; rpos="+rpos);}
+ }
+
+ if(startLocC<0 || stopLocC<0){
+ //This can happen if there are zero matches. Which would be rare, but I have seen it occur.
+ r.clearMapping();
+// assert(false) : "Failed: "+startLocC+", "+stopLocC+"\n"+r+"\n"+r.mate+"\n"+r.toFastq()+"\n"+(r.mate==null ? "null" : r.mate.toFastq());
+ return false;
+ }
+
+
+ if(verbose){System.err.println("A: r.start="+r.start+", r.stop="+r.stop+"; rpos="+rpos+"; len="+bases.length+"; reflen="+(r.stop-r.start+1));}
+
+ assert(rpos==r.stop+1) : "\n\n\n"+rpos+"!="+(r.stop+1)+"\n"+r+"\n\n"+
+ (r.topSite()==null ? "null" : r.topSite().mappedLength()+", "+r.topSite().matchLength()+", "+r.topSite().start+", "+r.topSite().stop+"\n"+r.topSite());
+
+ if(verbose){System.err.println("B: rpos="+rpos+", startLocR="+startLocR+", stopLocR="+stopLocR);}
+ //headTrimR/tailTrimR count read bases outside the best region; headTrimM/tailTrimM count match symbols outside it
+ int headTrimR=startLocC;
+ int headTrimM=startLocM;
+ int tailTrimR=bases.length-stopLocC-1;
+ int tailTrimM=match.length-stopLocM-1;
+
+ if(verbose){System.err.println("C: headTrimR="+headTrimR+", headTrimM="+headTrimM+", tailTrimR="+tailTrimR+", tailTrimM="+tailTrimM);}
+ //A trim is dropped when both of its counts are at most minToClip
+ if(headTrimR<=minToClip && headTrimM<=minToClip){
+ headTrimR=headTrimM=0;
+ }
+ if(tailTrimR<=minToClip && tailTrimM<=minToClip){
+ tailTrimR=tailTrimM=0;
+ }
+ if(headTrimR==0 && headTrimM==0 && tailTrimR==0 && tailTrimM==0){
+ return false;
+ }
+ //Do trimming
+ final int headDelta=headTrimR-headTrimM;
+ final int tailDelta=tailTrimR-tailTrimM;
+ final byte[] match2;
+
+ if(verbose){System.err.println("D: headTrimR="+headTrimR+", headTrimM="+headTrimM+", tailTrimR="+tailTrimR+", tailTrimM="+tailTrimM);}
+ if(verbose){System.err.println("D: headDelta="+headDelta+", tailDelta="+tailDelta);}
+
+ if(headDelta==0 && tailDelta==0){
+ //Length-neutral trimming
+ match2=match;
+ for(int i=0; i<headTrimM; i++){match[i]='C';}
+ for(int i=match.length-tailTrimM; i<match.length; i++){match[i]='C';}
+ }else{ //Read-base and match-symbol trim counts differ, so a resized match array is built
+ final int newlen=match.length-headTrimM-tailTrimM+headTrimR+tailTrimR;
+ match2=new byte[newlen];
+ for(int i=0; i<headTrimR; i++){match2[i]='C';}
+ for(int i=match2.length-tailTrimR; i<match2.length; i++){match2[i]='C';}
+ for(int i=headTrimM, i2=headTrimR, lim=match2.length-tailTrimR; i2<lim; i++, i2++){
+ match2[i2]=match[i];
+ }
+ }
+
+ assert(ss==null || ((ss.start==r.start) && (ss.stop==r.stop) && (ss.strand==r.strand()) && (ss.chrom==r.chrom) && (ss.match==r.match))) :
+ "\nr="+r+"\nr2="+r.mate+"\nss=\n"+ss+"\n"+(ss==null ? "ss is null" : ((ss.start==r.start)+", "+(ss.stop==r.stop)+", "+
+ (ss.strand==r.strand())+", "+(ss.chrom==r.chrom)+", "+(ss.match==r.match)));
+ //New limits extend the best region's reference span by the number of clipped read bases on each side
+ if(headTrimR!=0){r.start=startLocR-headTrimR;}
+ if(tailTrimR!=0){r.stop=stopLocR+tailTrimR;}
+ r.match=match2;
+ //NOTE(review): this rescoring uses the pre-trim array 'match', which is the same object as match2 only in the length-neutral case
+ if(matchPointsMult!=1f){
+ maxScore=score(match);
+ }
+ if(ss!=null){maxScore=Tools.max(maxScore, ss.slowScore);}
+ r.mapScore=maxScore;
+
+ if(verbose){System.err.println("E: r.start="+r.start+", r.stop="+r.stop);}
+
+ if(ss!=null){
+ assert(maxScore>=ss.slowScore) : maxScore+", "+ss.slowScore+"\n"+r.toFastq();
+ ss.match=r.match;
+ ss.setLimits(r.start, r.stop);
+ int pairedScore=ss.pairedScore>0 ? Tools.max(ss.pairedScore+(maxScore-ss.slowScore), 0) : 0; //NOTE(review): this local is never used or stored
+ }
+ //NOTE(review): ss is dereferenced unconditionally from here on, despite the ss!=null checks above
+ if(!ss.perfect && ss.isPerfect(bases)){
+ ss.perfect=ss.semiperfect=true;
+ r.setPerfect(true);
+ Arrays.fill(r.match, (byte)'m');
+ ss.setSlowScore(maxScore);
+ }else if(!ss.semiperfect && ss.isSemiPerfect(bases)){
+ ss.semiperfect=true;
+ ChromosomeArray cha=Data.getChromosome(ss.chrom);
+ r.match=ss.match=genMatchNoIndels(bases, cha.array, ss.start);
+ return toLocalAlignment(r, ss, basesM, minToClip, matchPointsMult); //Re-run on the regenerated match string
+ }
+ return true;
+ }
+
+
+ /** Scores a match string by summing the score of each run of identical symbols ('C' runs add nothing). Assumes match string is in long format. Returns 0 for a null or empty array. */
+ public final int score(byte[] match){
+ if(match==null || match.length<1){return 0;}
+
+ byte mode=match[0], prevMode='0';
+ int current=0, prevStreak=0;
+ int score=0;
+ //Each run is scored when the symbol changes; the last run is scored after the loop
+ for(int mpos=0; mpos<match.length; mpos++){
+ byte c=match[mpos];
+
+ if(mode==c){
+ current++;
+ }else{
+ if(mode=='m'){
+ score+=calcMatchScore(current);
+// if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N
+ }else if(mode=='S'){
+ score+=calcSubScore(current);
+ if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N
+ else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();}
+ }else if(mode=='D'){
+ score+=calcDelScore(current, true);
+ }else if(mode=='I'){
+ score+=calcInsScore(current);
+ }else if(mode=='C'){
+ //do nothing
+ }else if(mode=='X' || mode=='Y'){
+ score+=calcInsScore(current);
+ }else if(mode=='N'){
+ score+=calcNocallScore(current);
+ }else if(mode=='R'){
+ score+=calcNorefScore(current);
+ }else{
+ assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match);
+ }
+ if(verbose){System.err.println("mode "+(char)mode+"->"+(char)c+"\tcurrent="+current+"\tscore="+score);}
+ prevMode=mode;
+ prevStreak=current;
+ mode=c;
+ current=1;
+ }
+ }
+ if(current>0){
+ assert(mode==match[match.length-1]);
+ if(mode=='m'){
+ score+=calcMatchScore(current);
+// if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N
+ }else if(mode=='S'){
+ score+=calcSubScore(current);
+ if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N
+ else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();}
+ }else if(mode=='D'){
+ score+=calcDelScore(current, true);
+ }else if(mode=='I'){
+ score+=calcInsScore(current);
+ }else if(mode=='C'){
+ //do nothing
+ }else if(mode=='X' || mode=='Y'){
+ score+=calcInsScore(current);
+ }else if(mode=='N'){
+ score+=calcNocallScore(current);
+ }else if(mode=='R'){
+ score+=calcNorefScore(current);
+ }else if(mode!=0){ //Unlike the in-loop case, a final run of 0 bytes is tolerated silently
+ assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match);
+ }
+ if(verbose){System.err.println("mode "+(char)mode+"->end; score="+score);}
+ }
+
+ return score;
+ }
+
+// //TODO
+// public final byte[] softClipBoundsShortmatch(byte[] match, byte[] bases, int minToClip){
+// if(match==null || match.length<1){return null;}
+// int[] score=new int[bases.length];
+//
+// byte mode='0', c='0';
+// int current=0;
+// int rpos=0;
+// long currentScore;
+// for(int i=0; i<match.length; i++){
+// c=match[i];
+// if(Character.isDigit(c)){
+// current=(current*10)+(c-'0');
+// }else{
+// if(mode==c){
+// current=Tools.max(current+1, 2);
+// }else{
+// current=Tools.max(current, 1);
+//
+// if(mode=='m'){
+// msdicn[0]+=current;
+// }else if(mode=='S'){
+// msdicn[1]+=current;
+// }else if(mode=='D'){
+// msdicn[2]+=current;
+// }else if(mode=='I'){
+// msdicn[3]+=current;
+// }else if(mode=='C' || mode=='X' || mode=='Y'){
+// msdicn[4]+=current;
+// }else if(mode=='N' || mode=='R'){
+// msdicn[5]+=current;
+// }
+// mode=c;
+// current=0;
+// }
+// }
+// }
+// if(current>0 || !Character.isDigit(c)){
+// current=Tools.max(current, 1);
+// if(mode=='m'){
+// msdicn[0]+=current;
+// }else if(mode=='S'){
+// msdicn[1]+=current;
+// }else if(mode=='D'){
+// msdicn[2]+=current;
+// }else if(mode=='I'){
+// msdicn[3]+=current;
+// }else if(mode=='C' || mode=='X' || mode=='Y'){
+// msdicn[4]+=current;
+// }else if(mode=='N' || mode=='R'){
+// msdicn[5]+=current;
+// }
+// }
+// return msdicn;
+// }
+
+ public abstract int maxQuality(int numBases);
+
+ public abstract int maxQuality(byte[] baseScores);
+
+ public abstract int maxImperfectScore(int numBases);
+
+ public abstract int maxImperfectScore(byte[] baseScores);
+
+ /** Formats each element right-aligned in a 7-character column, always preceded by at least one space. */
+ public final static String toString(int[] a){
+ final int width=7;
+ final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+ for(int idx=0; idx<a.length; idx++){
+ final int num=a[idx];
+ final String s=" "+num;
+ final int spaces=width-s.length();
+ assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+ int pad=spaces;
+ while(pad>0){out.append(' '); pad--;}
+ out.append(s);
+ }
+ return out.toString();
+ }
+
+ static void printMatrix(int[][][] packed, int readlen, int reflen, int TIMEMASK, int SCOREOFFSET){
+ for(int mode=0; mode<packed.length; mode++){
+ printMatrix(packed, readlen, reflen, TIMEMASK, SCOREOFFSET, mode);
+ }
+ }
+
+ /** Prints to stdout, for one mode, the score matrix, a blank line, the time matrix, and another blank line.
+ * At most readlen+1 rows and reflen+1 columns are shown. */
+ static void printMatrix(int[][][] packed, int readlen, int reflen, int TIMEMASK, int SCOREOFFSET, int mode){
+ final int[][] matrix=packed[mode];
+ final int ylim=Tools.min(readlen+1, matrix.length);
+ //Column limit; must not be clamped to matrix.length (the ROW count). toScorePacked/toTimePacked clamp it to each row's length.
+ final int xlim=reflen+1;
+ for(int row=0; row<ylim; row++){System.out.println(toScorePacked(matrix[row], SCOREOFFSET, xlim));}
+ System.out.println();
+ for(int row=0; row<ylim; row++){System.out.println(toTimePacked(matrix[row], TIMEMASK, xlim));}
+ System.out.println();
+ }
+
+ /** Formats the time field (a[j]&TIMEMASK) of the first lim cells of a packed row,
+ * each right-aligned in a 6-character column. lim is clamped to the row length. */
+ public final static String toTimePacked(int[] a, int TIMEMASK, int lim){
+ final int width=6;
+ lim=Tools.min(lim, a.length);
+ final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+ for(int col=0; col<lim; col++){
+ final int num=a[col]&TIMEMASK;
+ final String s=" "+num;
+ final int spaces=width-s.length();
+ assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+ for(int pad=spaces; pad>0; pad--){out.append(' ');}
+ out.append(s);
+ }
+ return out.toString();
+ }
+
+ /** Formats the score field (a[j]>>SCOREOFFSET) of the first lim cells of a packed row in 6-character columns.
+ * A value too wide for its column is replaced by a '+' or '-' placeholder according to its sign. */
+ public final static String toScorePacked(int[] a, int SCOREOFFSET, int lim){
+ final int width=6;
+ lim=Tools.min(lim, a.length);
+// String minString=" -";
+// String maxString=" ";
+// while(minString.length()<width){minString+='9';}
+// while(maxString.length()<width){maxString+='9';}
+ //Placeholders for values too wide to print: a sign, then spaces out to the column width
+ String minString=" -", maxString=" +";
+ for(; minString.length()<width; minString=minString+' '){}
+ for(; maxString.length()<width; maxString=maxString+' '){}
+ final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+ for(int col=0; col<lim; col++){
+ final int num=a[col]>>SCOREOFFSET;
+ String s=" "+num;
+ if(s.length()>width){
+ if(num>0){s=maxString;}else{s=minString;}
+ }
+ final int spaces=width-s.length();
+ assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+ for(int pad=spaces; pad>0; pad--){out.append(' ');}
+ out.append(s);
+ }
+ return out.toString();
+ }
+
+ /** Formats each byte as a signed decimal number, right-aligned in a 6-character column. */
+ public final static String toString(byte[] a){
+ final int width=6;
+ final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+ for(int idx=0; idx<a.length; idx++){
+ final int num=a[idx];
+ final String s=" "+num;
+ final int spaces=width-s.length();
+ assert(spaces>=0);
+ int pad=spaces;
+ while(pad>0){out.append(' '); pad--;}
+ out.append(s);
+ }
+ return out.toString();
+ }
+
+ public final static String toString(byte[] ref, int startLoc, int stopLoc){ //Inclusive range; each byte is cast straight to a char
+ final StringBuilder out=new StringBuilder(stopLoc-startLoc+1);
+ for(int pos=startLoc; pos<=stopLoc; ++pos){final char ch=(char)ref[pos]; out.append(ch);}
+ return out.toString();
+ }
+
+ public final int calcMatchScore(int len){ //Score of a run of len matches: POINTS_MATCH() for the first, POINTS_MATCH2() for each additional one
+ assert(len>0) : len;
+ return POINTS_MATCH()+(len-1)*POINTS_MATCH2();
+ }
+
+ /** Score of a run of len consecutive substitutions: POINTS_SUB() for the first,
+ * POINTS_SUB2() for each further one while the run is within LIMIT_FOR_COST_3(),
+ * and POINTS_SUB3() for each one beyond that limit. */
+ public final int calcSubScore(int len){
+ assert(len>0) : len;
+ final int lim3=LIMIT_FOR_COST_3();
+ final int beyond=(len>lim3 ? len-lim3 : 0); //Substitutions past the cost-3 limit
+ final int within=len-beyond; //Substitutions at or under the limit, including the first
+ int total=POINTS_SUB();
+ if(beyond>0){total+=beyond*POINTS_SUB3();}
+ if(within>1){total+=(within-1)*POINTS_SUB2();}
+ return total;
+ }
+
+ public final int calcNorefScore(int len){return len*POINTS_NOREF();} //Linear: POINTS_NOREF() per position; used for 'R' runs in score(byte[])
+
+ public final int calcNocallScore(int len){return len*POINTS_NOCALL();} //Linear: POINTS_NOCALL() per position; used for 'N' runs in score(byte[])
+
+ public abstract int calcDelScore(int len, boolean approximateGaps);
+
+ public abstract int calcInsScore(int len);
+
+ static final int GAPBUFFER=Shared.GAPBUFFER;
+ static final int GAPBUFFER2=Shared.GAPBUFFER2;
+ static final int GAPLEN=Shared.GAPLEN;
+ static final int MINGAP=Shared.MINGAP;
+ static final int GAPCOST=Shared.GAPCOST;
+ static final byte GAPC=Shared.GAPC;
+
+ /** Seemingly to clear out prior data from the gref. Not sure what else it's used for. */
+ static final int GREFLIMIT2_CUSHION=128; //Tools.max(GAPBUFFER2, GAPLEN);
+
+
+ /**DO NOT MODIFY*/
+ public abstract byte[] getGrefbuffer();
+
+// public final int[] vertLimit;
+// public final int[] horizLimit;
+
+ public abstract CharSequence showVertLimit();
+ public abstract CharSequence showHorizLimit();
+
+//// public static final int MODEBITS=2;
+// public static final int TIMEBITS=11;
+// public static final int SCOREBITS=32-TIMEBITS;
+// public static final int MAX_TIME=((1<<TIMEBITS)-1);
+// public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000;
+// public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+//
+//// public static final int MODEOFFSET=0; //Always zero.
+//// public static final int TIMEOFFSET=0;
+ public abstract int SCOREOFFSET();
+//
+//// public static final int MODEMASK=~((-1)<<MODEBITS);
+//// public static final int TIMEMASK=(~((-1)<<TIMEBITS))<<TIMEOFFSET;
+// public static final int TIMEMASK=~((-1)<<TIMEBITS);
+// public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET;
+
+ static final byte MODE_MS=0;
+ static final byte MODE_DEL=1;
+ static final byte MODE_INS=2;
+ static final byte MODE_SUB=3;
+
+ public abstract int POINTS_NOREF();
+ public abstract int POINTS_NOCALL();
+ public abstract int POINTS_MATCH();
+ public abstract int POINTS_MATCH2();
+ public abstract int POINTS_COMPATIBLE();
+ public abstract int POINTS_SUB();
+ public abstract int POINTS_SUBR();
+ public abstract int POINTS_SUB2();
+ public abstract int POINTS_SUB3();
+ public abstract int POINTS_MATCHSUB();
+ public abstract int POINTS_INS();
+ public abstract int POINTS_INS2();
+ public abstract int POINTS_INS3();
+ public abstract int POINTS_INS4();
+ public abstract int POINTS_DEL();
+ public abstract int POINTS_DEL2();
+ public abstract int POINTS_DEL3();
+ public abstract int POINTS_DEL4();
+ public abstract int POINTS_DEL5();
+ public abstract int POINTS_DEL_REF_N();
+ public abstract int POINTS_GAP();
+
+ public abstract int TIMESLIP();
+ public abstract int MASK5();
+
+ abstract int BARRIER_I1();
+ abstract int BARRIER_D1();
+
+ public abstract int LIMIT_FOR_COST_3();
+ public abstract int LIMIT_FOR_COST_4();
+ public abstract int LIMIT_FOR_COST_5();
+
+ public abstract int BAD();
+
+
+// public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+// public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+// public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+// public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+// public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+// public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+// public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+// public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+// public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+// public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+// public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+// public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+// public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+// public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+// public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+// public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+// public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+// public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+// public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+// public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+// public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+// public static final int BADoff=(BAD<<SCOREOFFSET);
+// public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+// public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+
+ public final int maxRows;
+ public final int maxColumns;
+
+ public long iterationsLimited=0;
+ public long iterationsUnlimited=0;
+
+ public boolean verbose=false;
+ public boolean verbose2=false;
+
+ public static int bandwidth=0;
+ public static float bandwidthRatio=0;
+ public static boolean flatMode=false;
+
+ public static final int MIN_SCORE_ADJUST=120;
+
+}
diff --git a/current/align2/MakeQualityHistogram.java b/current/align2/MakeQualityHistogram.java
new file mode 100755
index 0000000..d488425
--- /dev/null
+++ b/current/align2/MakeQualityHistogram.java
@@ -0,0 +1,113 @@
+package align2;
+
+import java.util.ArrayList;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+
+import fileIO.ReadWrite;
+
+public class MakeQualityHistogram {
+
+ public static void main(String[] args){
+
+ String fname1=args[0];
+ String fname2=(args.length>1 ? args[1] : null);
+ assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name.";
+
+ long maxReads=0;
+ RTextInputStream rtis=new RTextInputStream(fname1, fname2, maxReads);
+ ConcurrentLegacyReadInputStream cris=new ConcurrentLegacyReadInputStream(rtis, maxReads);
+
+ int[][][] counts=process(cris);
+ printMappedHistogram(counts[0]);
+ System.out.println();
+ printPairedHistogram(counts[1]);
+// System.out.println("*** main() finished ***");
+ }
+
+ public static void printMappedHistogram(int[][] mapped){
+ System.out.println("#Error Quality Histogram");
+ System.out.println("Quality\tMapped\tUnmapped\tPercent Mapped");
+ for(int i=0; i<mapped[0].length; i++){
+ int e=mapped[0][i];
+ int m=mapped[1][i];
+ float percent=e*100f/(e+m);
+ System.out.println(i+"\t"+e+"\t"+m+"\t"+String.format("%.3f", percent));
+ }
+ }
+
+ public static void printPairedHistogram(int[][] paired){
+ System.out.println("#Error Quality Histogram");
+ System.out.println("Quality\tPaired\tSingle\tPercent Paired");
+ for(int i=0; i<paired[0].length; i++){
+ int e=paired[0][i];
+ int m=paired[1][i];
+ float percent=e*100f/(e+m);
+ System.out.println(i+"\t"+e+"\t"+m+"\t"+String.format("%.3f", percent));
+ }
+ }
+
+ public static int[][][] process(ConcurrentLegacyReadInputStream cris){
+
+ cris.start();
+
+ int[][] mapped=new int[2][50];
+ int[][] paired=new int[2][50];
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> readlist=ln.list;
+ while(!readlist.isEmpty()){
+
+ processList(readlist, mapped, paired);
+
+ cris.returnList(ln.id, readlist.isEmpty());
+ //System.err.println("Waiting on a list...");
+ ln=cris.nextList();
+ readlist=ln.list;
+ }
+
+ //System.err.println("Returning a list... (final)");
+ assert(readlist.isEmpty());
+ cris.returnList(ln.id, readlist.isEmpty());
+ ReadWrite.closeStream(cris);
+
+ return new int[][][] {mapped, paired};
+ }
+
+ private static void processList(ArrayList<Read> list, int[][] mapped, int[][] paired) {
+ for(Read r : list){
+ processRead(r, mapped, paired);
+// if(r.mate!=null){
+// processRead(r.mate, mapped, paired);
+// }
+ }
+ }
+
+ private static void processRead(Read r, int[][] mapped, int[][] paired) {
+
+ if(r.chrom<1 && r.numSites()>0){
+ SiteScore ss=r.topSite(); //Should not be necessary
+ r.start=ss.start;
+ r.stop=ss.stop;
+ r.chrom=ss.chrom;
+ r.setStrand(ss.strand);
+ }
+
+ int avgQ=r.avgQuality(true, 0);
+ if(r.chrom>0){
+ mapped[0][avgQ]++;
+ }else{
+ mapped[1][avgQ]++;
+ }
+ if(r.paired()){
+ paired[0][avgQ]++;
+ }else{
+ paired[1][avgQ]++;
+ }
+
+ }
+
+}
diff --git a/current/align2/MakeRocCurve.java b/current/align2/MakeRocCurve.java
new file mode 100755
index 0000000..6443926
--- /dev/null
+++ b/current/align2/MakeRocCurve.java
@@ -0,0 +1,326 @@
+package align2;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import stream.Read;
+import stream.SamLine;
+import stream.SiteScore;
+
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.TextFile;
+
+public class MakeRocCurve {
+
+
+ public static void main(String[] args){
+
+ Timer t=new Timer();
+ String in=null;
+ long reads=-1;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in=b;
+ }else if(a.equals("reads")){
+ reads=Tools.parseKMG(b);
+ }else if(a.equals("parsecustom")){
+ parsecustom=Tools.parseBoolean(b);
+// }else if(a.equals("ssaha2") || a.equals("subtractleadingclip")){
+// SamLine.SUBTRACT_LEADING_SOFT_CLIP=Tools.parseBoolean(b);
+ }else if(a.equals("blasr")){
+ BLASR=Tools.parseBoolean(b);
+ }else if(a.equals("bitset")){
+ USE_BITSET=Tools.parseBoolean(b);
+ }else if(a.equals("thresh")){
+ THRESH2=Integer.parseInt(b);
+ }else if(a.equals("outputerrors")){
+// OUTPUT_ERRORS=true;
+ }else if(i==0 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[0]).exists())){
+ in=args[0];
+ }else if(i==1 && args[i].indexOf('=')<0 && Character.isDigit(a.charAt(0))){
+ reads=Tools.parseKMG(a);
+ }
+ }
+
+ if(USE_BITSET){
+ int x=400000;
+ if(reads>0 && reads<=Integer.MAX_VALUE){x=(int)reads;}
+ try {
+ seen=new BitSet(x);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ System.out.println("Did not have enough memory to allocate bitset; duplicate mappings will not be detected.");
+ }
+ }
+
+ process(in);
+
+ System.out.println("ROC Curve for "+in);
+ System.out.println(header());
+ gradeList(reads);
+ t.stop();
+ System.err.println("Time: \t"+t);
+
+ }
+
+ public static void process(String samfile){
+ TextFile tf=new TextFile(samfile, false, false);
+
+ String s=null;
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ char c=s.charAt(0);
+ if(c!='@'/* && c!=' ' && c!='\t'*/){
+ SamLine sl=new SamLine(s);
+ final int id=((((int)sl.parseNumericId())<<1)|sl.pairnum());
+ assert(sl!=null);
+ Read r=sl.toRead(true);
+ if(r!=null){
+ r.obj=sl;
+ if(sl.primary() && (seen==null || !seen.get(id))){
+ if(seen!=null){seen.set(id);}
+ calcStatistics1(r, (SamLine) r.obj);
+ }
+ }else{
+ assert(false) : "'"+"'";
+ System.err.println("Bad read from line '"+s+"'");
+ }
+// calcStatistics1(r);
+ }
+ }
+ tf.close();
+ }
+
+ public static String header(){
+ return "minScore\tmapped\tretained\ttruePositiveStrict\tfalsePositiveStrict\ttruePositiveLoose" +
+ "\tfalsePositiveLoose\tfalseNegative\tdiscarded\tambiguous";
+ }
+
+ public static void gradeList(long reads){
+
+ int truePositiveStrict=0;
+ int falsePositiveStrict=0;
+
+ int truePositiveLoose=0;
+ int falsePositiveLoose=0;
+
+ int mapped=0;
+ int mappedRetained=0;
+ int unmapped=0;
+
+ int discarded=0;
+ int ambiguous=0;
+
+ int primary=0;
+
+
+ for(int q=truePositiveStrictA.length-1; q>=0; q--){
+ if(mappedA[q]>0 || unmappedA[q]>0){
+ truePositiveStrict+=truePositiveStrictA[q];
+ falsePositiveStrict+=falsePositiveStrictA[q];
+ truePositiveLoose+=truePositiveLooseA[q];
+ falsePositiveLoose+=falsePositiveLooseA[q];
+ mapped+=mappedA[q];
+ mappedRetained+=mappedRetainedA[q];
+ unmapped+=unmappedA[q];
+ discarded+=discardedA[q];
+ ambiguous+=ambiguousA[q];
+ primary+=primaryA[q];
+
+ double tmult=100d/reads;
+
+ double mappedB=mapped*tmult;
+ double retainedB=mappedRetained*tmult;
+ double truePositiveStrictB=truePositiveStrict*tmult;
+ double falsePositiveStrictB=falsePositiveStrict*tmult;
+ double truePositiveLooseB=truePositiveLoose*tmult;
+ double falsePositiveLooseB=falsePositiveLoose*tmult;
+ double falseNegativeB=(reads-mapped)*tmult;
+ double discardedB=discarded*tmult;
+ double ambiguousB=ambiguous*tmult;
+
+ StringBuilder sb=new StringBuilder();
+ sb.append(q);
+ sb.append('\t');
+ sb.append(String.format("%.4f", mappedB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", retainedB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", truePositiveStrictB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", falsePositiveStrictB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", truePositiveLooseB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", falsePositiveLooseB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", falseNegativeB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", discardedB));
+ sb.append('\t');
+ sb.append(String.format("%.4f", ambiguousB));
+
+ System.out.println(sb);
+ }else{
+ assert(truePositiveStrictA[q]==0) : q;
+ assert(falsePositiveStrictA[q]==0) : q;
+ assert(truePositiveLooseA[q]==0) : q;
+ assert(falsePositiveLooseA[q]==0) : q;
+ }
+
+ }
+ }
+
+ public static void calcStatistics1(final Read r, SamLine sl){
+
+ int q=r.mapScore;
+
+ int THRESH=0;
+ primaryA[q]++;
+ if(q<0){q=0;}
+ if(q>=discardedA.length){q=discardedA.length-1;}
+
+ if(r.discarded()/* || r.mapScore==0*/){
+ discardedA[q]++;
+ unmappedA[q]++;
+ }else if(r.ambiguous()){
+// assert(r.mapped()) : "\n"+r+"\n"+sl+"\n";
+ if(r.mapped()){mappedA[q]++;}
+ ambiguousA[q]++;
+ }else if(r.mapScore<1){
+ unmappedA[q]++;
+ }else if(!r.mapped()){
+ unmappedA[q]++;
+ }
+// else if(r.mapScore<=minQuality){
+// if(r.mapped()){mappedA[q]++;}
+// ambiguousA[q]++;
+// }
+ else{
+
+ mappedA[q]++;
+ mappedRetainedA[q]++;
+
+ if(parsecustom){
+ SiteScore os=r.originalSite;
+ assert(os!=null);
+ if(os!=null){
+ int trueChrom=os.chrom;
+ byte trueStrand=os.strand;
+ int trueStart=os.start;
+ int trueStop=os.stop;
+ SiteScore ss=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0);
+ byte[] originalContig=sl.originalContig();
+ if(BLASR){
+ originalContig=(originalContig==null || Tools.indexOf(originalContig, (byte)'/')<0 ? originalContig :
+ Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/')));
+ }
+ int cstart=sl.originalContigStart();
+
+ boolean strict=isCorrectHit(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH, originalContig, sl.rname(), cstart);
+ boolean loose=isCorrectHitLoose(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH+THRESH2, originalContig, sl.rname(), cstart);
+
+ // if(!strict){
+ // System.out.println(ss+", "+new String(originalContig)+", "+new String(sl.rname()));
+ // assert(false);
+ // }
+
+ // System.out.println("loose = "+loose+" for "+r.toText());
+
+ if(loose){
+ // System.err.println("TPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t"
+ // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop);
+ truePositiveLooseA[q]++;
+ }else{
+ // System.err.println("FPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t"
+ // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop);
+ falsePositiveLooseA[q]++;
+ }
+
+ if(strict){
+ // System.err.println("TPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop);
+ truePositiveStrictA[q]++;
+ }else{
+ // System.err.println("FPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop);
+ falsePositiveStrictA[q]++;
+ }
+ }
+ }
+ }
+
+ }
+
+
+
+ public static boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh,
+ byte[] originalContig, byte[] contig, int cstart){
+ if(ss.strand!=trueStrand){return false;}
+ if(originalContig!=null){
+ if(!Arrays.equals(originalContig, contig)){return false;}
+ }else{
+ if(ss.chrom!=trueChrom){return false;}
+ }
+
+ assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+ assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+ int cstop=cstart+trueStop-trueStart;
+// return (absdif(ss.start, trueStart)<=thresh && absdif(ss.stop, trueStop)<=thresh);
+ return (absdif(ss.start, cstart)<=thresh && absdif(ss.stop, cstop)<=thresh);
+ }
+
+
+ public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh,
+ byte[] originalContig, byte[] contig, int cstart){
+ if(ss.strand!=trueStrand){return false;}
+ if(originalContig!=null){
+ if(!Arrays.equals(originalContig, contig)){return false;}
+ }else{
+ if(ss.chrom!=trueChrom){return false;}
+ }
+
+ assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+ assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+ int cstop=cstart+trueStop-trueStart;
+// return (absdif(ss.start, trueStart)<=thresh || absdif(ss.stop, trueStop)<=thresh);
+ return (absdif(ss.start, cstart)<=thresh || absdif(ss.stop, cstop)<=thresh);
+ }
+
+ private static final int absdif(int a, int b){
+ return a>b ? a-b : b-a;
+ }
+
+ public static int truePositiveStrictA[]=new int[1000];
+ public static int falsePositiveStrictA[]=new int[1000];
+
+ public static int truePositiveLooseA[]=new int[1000];
+ public static int falsePositiveLooseA[]=new int[1000];
+
+ public static int mappedA[]=new int[1000];
+ public static int mappedRetainedA[]=new int[1000];
+ public static int unmappedA[]=new int[1000];
+
+ public static int discardedA[]=new int[1000];
+ public static int ambiguousA[]=new int[1000];
+
+ public static int primaryA[]=new int[1000];
+
+ public static boolean parsecustom=true;
+
+ public static int THRESH2=20;
+ public static boolean BLASR=false;
+ public static boolean USE_BITSET=true;
+ public static BitSet seen=null;
+
+}
diff --git a/current/align2/MultiStateAligner10ts.java b/current/align2/MultiStateAligner10ts.java
new file mode 100755
index 0000000..07f0dd6
--- /dev/null
+++ b/current/align2/MultiStateAligner10ts.java
@@ -0,0 +1,3577 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.SiteScore;
+
+import dna.AminoAcid;
+
+/**
+ * "P" for "Packed".<br>
+ * Same as MSA2P, but the "prevState" field was removed.
+ * Yields identical results to MSA2, but is faster.
+ * For very long reads (over 2000bp) the score may overflow, so MSA2 should be used instead,
+ * or the time field should be shrunk. */
+public final class MultiStateAligner10ts extends MSA{
+
+
+ public static void main(String[] args){
+     //Demo driver: args[0] is the read sequence and args[1] the reference sequence, both taken as raw bytes.
+     final byte[] query=args[0].getBytes();
+     final byte[] target=args[1].getBytes();
+     final int lastRef=target.length-1;
+
+     final MultiStateAligner10ts aligner=new MultiStateAligner10ts(query.length, target.length);
+
+     System.out.println("Initial: ");
+     printMatrix(aligner.packed, query.length, target.length, TIMEMASK, SCOREOFFSET);
+
+     //Fill against the whole reference with a minimum score of 0 and no gap array.
+     final int[] best=aligner.fillLimited(query, target, 0, lastRef, 0, null);
+     System.out.println("Max: "+Arrays.toString(best));
+
+     System.out.println("Final: ");
+     printMatrix(aligner.packed, query.length, target.length, TIMEMASK, SCOREOFFSET);
+
+     //Trace back from, then rescore, the cell reported by the fill (best[0..2]).
+     final byte[] matchString=aligner.traceback(query, target, 0, lastRef, best[0], best[1], best[2], false);
+     final int[] rescored=aligner.score(query, target, 0, lastRef, best[0], best[1], best[2], false);
+
+     System.out.println(new String(target));
+     System.out.println(new String(query));
+     System.out.println(new String(matchString));
+     System.out.println("Score: "+Arrays.toString(rescored));
+
+ }
+
+
+ public MultiStateAligner10ts(int maxRows_, int maxColumns_){
+ super(maxRows_, maxColumns_);
+ //Allocates the three score matrices and their helper arrays, then pre-loads column 0 and the BADoff sentinels.
+ packed=new int[3][maxRows+1][maxColumns+1];
+ bpacked=new int[3][maxRows+1][]; //Rows are attached in the loop below
+ startmatrix=new int[3][maxRows+1][]; //Rows are attached in the loop below
+ grefbuffer=new byte[maxColumns+2];
+ col0score=new int[maxRows+1];
+
+ vertLimit=new int[maxRows+1];
+ horizLimit=new int[maxColumns+1];
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+ for(int matrix=0; matrix<packed.length; matrix++){
+ for(int i=1; i<=maxRows; i++){
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff; //Cells start at 0, so every cell of rows 1..maxRows becomes BADoff
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ for(int i=0; i<=maxRows; i++){
+ //Column 0 holds a running insertion cost: 0, POINTSoff_INS, then the previous row's value plus INS2/INS3/INS4 by row index.
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+ int score=(i<2 ? (i*POINTSoff_INS) :
+ (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 :
+ (i<LIMIT_FOR_COST_4 ? prevScore+POINTSoff_INS3 : prevScore+POINTSoff_INS4)));
+
+ packed[matrix][i][0]=score; //Plain assignment: replaces the BADoff written above
+ }
+
+ for(int i=0; i<3; i++){
+ bpacked[matrix][i]=packed[matrix][i]; //Rows 0..2 of bpacked are the same array objects as rows 0..2 of packed
+ startmatrix[matrix][i]=new int[maxColumns+1];
+ Arrays.fill(startmatrix[matrix][i], -1);
+ }
+
+ for(int i=3; i<bpacked[matrix].length; i++){
+ bpacked[matrix][i]=bpacked[matrix][i-2]; //Every later row reuses the array two rows up, so rows 1 and 2 alternate
+ startmatrix[matrix][i]=startmatrix[matrix][i-2];
+ }
+
+ for(int i=0; i<startmatrix[matrix][0].length; i++){
+ startmatrix[matrix][0][i]=i; //TODO: This may be off by 1 or 2, and may be different for the different matrices, so check it.
+ }
+ }
+ for(int i=0; i<packed[0].length; i++){
+ col0score[i]=packed[0][i][0]; //Copy of matrix 0's column-0 scores
+ }
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+
+ if(verbose && greflimit>0 && greflimit<500){
+ System.err.println(new String(gref, 0, greflimit));
+ }
+
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillLimitedX(read, gref, 0, greflimit, minScore);
+ }
+ }
+
+
+ /** Fills the three packed matrices row by row for read vs ref[refStartLoc..refEndLoc], pruning cells that cannot match minScore. Hands off to fillBanded1 or fillUnlimited in the cases tested at the top.
+ * Returns new int[] {rows, maxCol, maxState, maxScore} for the best cell of the last row, or null when that score is below the adjusted minScore. */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ if(verbose){System.err.println("fillLimitedX");}
+ if(bandwidth>0 && bandwidth<read.length){return fillBanded1(read, ref, refStartLoc, refEndLoc, minScore);}
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(170, read.length+20)){
+// assert(false) : minScore;
+// assert(minScore>0) : minScore;
+// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length);
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc); //No positive cutoff, or the reference window is much wider than the read
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+// for(int x=0; x<packed.length; x++){
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+// }
+ for(int x=0; x<packed.length; x++){
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff); //Only the last row is reset; it is the row scanned for the maximum after the fill
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1; //First column of the previous row that reached its limit; becomes colStart of the next row
+ int maxGoodCol=columns; //Last such column; becomes colStop of the next row
+
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //One MATCH plus MATCH2 for every further read base
+ final int floor=minScore_off-maxGain;
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2; //Value written into every pruned cell
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+ //vertLimit[i]: starts at minScore_off in the last row and drops by the best per-base gain for each earlier row, never below floor.
+ vertLimit[rows]=minScore_off;
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+ //horizLimit[i]: the same construction along the reference, walking columns from last to first.
+ horizLimit[columns]=minScore_off;
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+// vertLimit[rows]=minScore_off;
+// for(int i=rows-1; i>=0; i--){
+// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+//
+// horizLimit[columns]=minScore_off;
+// for(int i=columns-1; i>=0; i--){
+// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+
+ for(int row=1; row<=rows; row++){
+
+ int colStart=minGoodCol;
+ int colStop=maxGoodCol;
+
+ minGoodCol=-1; //Reset; set again by the first cell of this row that reaches its limit
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;} //The previous row had no cell that reached its limit
+
+
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor; //Mark the cell left of the window as pruned; the first column reads it at [row][col-1]
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.out.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState; //Assigned in each branch but never stored; the cell is packed as score|time only
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit); //Holds only while the subtracted penalty is <=0
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //NOTE(review): the writes below index col+1 with col as large as columns; rows are allocated with maxColumns+1 entries, so columns==maxColumns looks out of range - confirm callers keep columns<maxColumns.
+ if(col>=colStop){
+ if(col>colStop && maxGoodCol<col){break;}
+ if(row>1){
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+ //Scan the last row of all three matrices for the highest score.
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+
+ if(verbose){
+ System.out.println("Filled matrix.");
+ printMatrix(packed, rows, columns, TIMEMASK, SCOREOFFSET);
+ }
+ if(verbose){System.err.println("maxscore="+(maxScore>>SCOREOFFSET)+", minscore="+(minScore_off>>SCOREOFFSET));}
+
+ if(maxScore<minScore_off){
+ return null; //Best score is below the cutoff (minScore after MIN_SCORE_ADJUST)
+ }
+
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /**
+ * Like fillLimitedX but additionally restricted to a band.
+ * Fills the three-state (MODE_MS, MODE_DEL, MODE_INS) matrix "packed" for read (one row per base) against
+ * ref[refStartLoc..refEndLoc] (one column per base). Each cell holds a score (SCOREMASK bits) OR'ed with a
+ * streak counter (TIMEMASK bits). Every row starts at max(first passing column of the previous row, row-halfband)
+ * and is scheduled to stop at min(last passing column of the previous row, row+halfband), where
+ * halfband=bandwidth/2; the column loop only runs past that stop column while cells keep passing their limit.
+ * Will not fill areas that cannot match minScore: such cells are stored as subfloor.
+ * Delegates to fillUnlimited when minScore<=0 or when the reference window is much longer than the read.
+ * Side effects: sets rows and columns, overwrites vertLimit, horizLimit and packed, increments iterationsLimited.
+ * @param read Query bases; rows=read.length
+ * @param ref Reference bases
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @param minScore Minimum acceptable score; MIN_SCORE_ADJUST is subtracted from it before use
+ * @return new int[] {rows, maxC, maxS, max}: the last row, the best column and state found in that row, and the
+ * best score shifted right by SCOREOFFSET. Returns null if the best last-row score is below the adjusted
+ * minScore. When delegating, returns whatever fillUnlimited returns.
+ */
+ private final int[] fillBanded1(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ if(verbose){System.err.println("fillBanded1()");}
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+ final int halfband=bandwidth/2; //Half-width of the band around the col==row diagonal
+
+ if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(170, read.length+20)){ //No usable score bound, or window much longer than the read
+// assert(false) : minScore;
+// assert(minScore>0) : minScore;
+// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length);
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1; //Used by the INS condition below to suppress insertions in the first/last rows
+ final int BARRIER_D2=rows-BARRIER_D1; //Used by the DEL condition below to suppress deletions in the first/last rows
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+// for(int x=0; x<packed.length; x++){
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+// }
+ for(int x=0; x<packed.length; x++){ //Only the last row is reset: it is the row scanned for the result, and pruning may leave parts of it unwritten
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1; //First column of the previous row in which some state met its limit
+ int maxGoodCol=columns; //Last column of the previous row in which some state met its limit
+
+ final int minScore_off=(minScore<<SCOREOFFSET); //minScore moved into the score bits of a packed cell
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //Score of a read that matches at every base
+ final int floor=minScore_off-maxGain; //Lowest value the per-row/per-column limits are clamped to
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2; //Value stored in pruned cells; below floor, so it fails every limit test
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ vertLimit[rows]=minScore_off; //vertLimit[i]: score required at row i, built backwards from the last row by subtracting the best gain per remaining read base
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+
+ horizLimit[columns]=minScore_off; //horizLimit[i]: the same bound per column, built backwards over the reference window
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+// vertLimit[rows]=minScore_off;
+// for(int i=rows-1; i>=0; i--){
+// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+//
+// horizLimit[columns]=minScore_off;
+// for(int i=columns-1; i>=0; i--){
+// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+
+ for(int row=1; row<=rows; row++){
+
+ int colStart=Tools.max(minGoodCol, row-halfband); //Intersect the previous row's passing span with the band
+ int colStop=Tools.min(maxGoodCol, row+halfband);
+
+ minGoodCol=-1; //Reset; re-established below by the cells of this row that pass
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;} //Previous row had no passing cell, or the band and the passing span do not overlap
+
+
+ if(colStart>1){ //Left sentinel: the cell just before the first visited column must read as pruned
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){ //Runs to columns, but the break at the bottom normally ends the row just after colStop
+
+
+ if(verbose2){
+ System.out.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]); //Previous read base; '?' and '!' are placeholders that never equal each other
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]); //A cell must satisfy the stricter of the row and column bounds
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); //Bound on the diagonal predecessors, relaxed by POINTSoff_MATCH2 or POINTSoff_SUB3
+
+ final int delNeeded=Tools.max(0, row-col-1); //Gap lengths (per the names) derived from the cell's offset from the diagonal and from the remaining rows/columns
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded); //The asserts on limit2 below expect these offsets to be non-positive
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; //Diagonal predecessors, feeding the match/sub state
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; //Left predecessors, feeding the deletion state
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; //Upper predecessors, feeding the insertion state
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){ //Gap symbol, or no diagonal predecessor exceeds limit3: prune
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //Streak counter stored with the diagonal MS predecessor
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState; //Assigned but never read in this method
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){ //Real substitution: cost depends on prevMatch and on the stored streak counter
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ final int limit2; //limit adjusted by the offset for the gap that is still needed
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){ //Passing cell: extend the span the next row will visit
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Keep the streak counter within the TIMEMASK bits (checked by the assert below)
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){ //No left predecessor exceeds limit, or row is outside the deletion barriers: prune
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //Length of the deletion run ending in the cell to the left
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; //Open a new deletion from the MS state
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //Assigned but never read in this method
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); //Offset for lengthening the current run from time to time+delNeeded
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){ //Gap symbol, no upper predecessor exceeds limit, or row is outside the insertion barriers (first/last columns exempt): prune
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //Length of the insertion run ending in the cell above
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS; //Open a new insertion from the MS state
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //Assigned but never read in this method
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time); //Offset for lengthening the current run from time to time+insNeeded
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ if(col>=colStop){ //At or past the scheduled stop column
+ if(col>colStop && maxGoodCol<col){break;} //Past the stop and this column did not pass: the row is finished
+ if(row>1){ //Right sentinel in the previous row, so the next column's diagonal/upper reads see a pruned cell
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+ //Scan the last row of every state for the best score; the strict > keeps the first state, then the first column, on ties
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+ if(maxScore<minScore_off){ //Nothing in the last row reached the adjusted minimum
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET; //Strip the packing offset before returning the score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /**
+ * Score-limited fill that works on bpacked and, for every cell, carries a start value in startmatrix
+ * that is inherited from the predecessor cell the winning score came from.
+ * NOTE: the method is still guarded by assert(false) : "TODO", i.e. it is marked unfinished by its author;
+ * despite the name, no band restriction is applied here - each row only visits the span of columns that
+ * still met their score limit in the previous row.
+ * Will not fill areas that cannot match minScore: such cells are stored as subfloor with start -1.
+ * Each bpacked cell holds a score (SCOREMASK bits) OR'ed with a streak counter (TIMEMASK bits).
+ * Side effects: sets rows and columns, overwrites vertLimit, horizLimit, bpacked and startmatrix,
+ * increments iterationsLimited.
+ * Fixes relative to the previous revision: the DEL and INS branches now write their start value to
+ * startmatrix[MODE_DEL] / startmatrix[MODE_INS]. They previously wrote to startmatrix[MODE_MS], which
+ * overwrote the MS start of the same cell and left the DEL/INS planes unwritten although the final scan
+ * reads startmatrix[state] per state. Loops over bpacked are now bounded by bpacked.length.
+ * @param read Query bases; rows=read.length
+ * @param ref Reference bases
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @param minScore Minimum acceptable score; MIN_SCORE_ADJUST is subtracted from it before use
+ * @return new int[] {rows, maxC, maxS, max, maxStart}: the last row, the best column and state found in that
+ * row, the best score shifted right by SCOREOFFSET, and the start value stored for that cell.
+ * Returns null if the best last-row score is below the adjusted minScore.
+ */
+ private final int[] fillBanded(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ assert(false) : "TODO";
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ //Barriers used by the INS/DEL conditions below to suppress indels in the first/last rows.
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ //Only the last row is reset: it is the row scanned for the result, and pruning may leave parts of it unwritten.
+ for(int x=0; x<bpacked.length; x++){
+ Arrays.fill(bpacked[x][rows], 1, columns+1, BADoff);
+ }
+
+ int minGoodCol=1; //First column of the previous row in which some state met its limit
+ int maxGoodCol=columns; //Last column of the previous row in which some state met its limit
+
+ final int minScore_off=(minScore<<SCOREOFFSET); //minScore moved into the score bits of a packed cell
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //Score of a read that matches at every base
+ final int floor=minScore_off-maxGain; //Lowest value the per-row/per-column limits are clamped to
+ final int subfloor=floor-5*POINTSoff_MATCH2; //Value stored in pruned cells; below floor, so it fails every limit test
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ //vertLimit[i]: score required at row i, built backwards from the last row by subtracting the best gain per remaining read base.
+ vertLimit[rows]=minScore_off;
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+
+ //horizLimit[i]: the same bound per column, built backwards over the reference window.
+ horizLimit[columns]=minScore_off;
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+ for(int row=1; row<=rows; row++){
+
+ //This row visits the span of columns that passed in the previous row.
+ int colStart=minGoodCol;
+ int colStop=maxGoodCol;
+
+ //Reset; re-established below by the cells of this row that pass.
+ minGoodCol=-1;
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;} //Previous row had no passing cell
+
+
+ if(colStart>1){
+ //Left sentinel: the cell just before the first visited column must read as pruned.
+ assert(row>0);
+ bpacked[MODE_MS][row][colStart-1]=subfloor;
+ bpacked[MODE_INS][row][colStart-1]=subfloor;
+ bpacked[MODE_DEL][row][colStart-1]=subfloor;
+ }else{
+ //Row starts at the first column: seed column 0 of this row and the previous one from col0score.
+ bpacked[MODE_MS][row-1][0]=bpacked[MODE_INS][row-1][0]=bpacked[MODE_DEL][row-1][0]=col0score[row-1];
+ bpacked[MODE_MS][row][0]=bpacked[MODE_INS][row][0]=bpacked[MODE_DEL][row][0]=col0score[row];
+ }
+
+
+ //Runs to columns, but the break at the bottom normally ends the row just after colStop.
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.out.println("\ncol "+col);
+ }
+
+ //'?' and '!' are placeholders for "no previous base"; they never equal each other.
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+ iterationsLimited++;
+ //A cell must satisfy the stricter of the row and column bounds.
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ //Bound on the diagonal predecessors, relaxed by POINTSoff_MATCH2 or POINTSoff_SUB3.
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+
+ //Gap lengths (per the names) derived from the cell's offset from the diagonal and the remaining rows/columns.
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ //The asserts on limit2 below expect these offsets to be non-positive.
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ //Diagonal predecessors, feeding the match/sub state.
+ final int scoreFromDiag_MS=bpacked[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=bpacked[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=bpacked[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ //Left predecessors, feeding the deletion state.
+ final int scoreFromDiag_DEL=bpacked[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=bpacked[MODE_DEL][row][col-1]&SCOREMASK;
+
+ //Upper predecessors, feeding the insertion state.
+ final int scoreFromDiag_INS=bpacked[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=bpacked[MODE_INS][row-1][col]&SCOREMASK;
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ //Gap symbol, or no diagonal predecessor exceeds limit3: prune.
+ bpacked[MODE_MS][row][col]=subfloor;
+ startmatrix[MODE_MS][row][col]=-1;
+ }else{//Calculate match and sub scores
+ final int streak=(bpacked[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ int start;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+ //Ties prefer MS, then DEL, then INS.
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ //Real substitution: cost depends on prevMatch and on the stored streak counter.
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+ }
+ //Inherit the start value from the diagonal predecessor in the winning state.
+ start=startmatrix[prevState][row-1][col-1];
+
+ //limit adjusted by the offset for the gap that is still needed.
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ //Passing cell: extend the span the next row will visit.
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ //Keep the streak counter within the TIMEMASK bits (checked by the assert below).
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+ bpacked[MODE_MS][row][col]=(score|time);
+ startmatrix[MODE_MS][row][col]=start;
+ assert((score&SCOREMASK)==score);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+ //No left predecessor exceeds limit, or row is outside the deletion barriers: prune.
+ bpacked[MODE_DEL][row][col]=subfloor;
+ startmatrix[MODE_DEL][row][col]=-1; //Fixed: was startmatrix[MODE_MS], which clobbered the MS start of this cell
+ }else{//Calculate DEL score
+
+ //Length of the deletion run ending in the cell to the left.
+ final int streak=bpacked[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ int score;
+ int time;
+ int start;
+ byte prevState;
+
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+ //Inherit the start value from the left predecessor in the winning state.
+ start=startmatrix[prevState][row][col-1];
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ //Offset for lengthening the current run from time to time+delNeeded.
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+ bpacked[MODE_DEL][row][col]=(score|time);
+ startmatrix[MODE_DEL][row][col]=start; //Fixed: was startmatrix[MODE_MS]
+ assert((score&SCOREMASK)==score);
+ assert((time&TIMEMASK)==time);
+ }
+
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ //Gap symbol, no upper predecessor exceeds limit, or row is outside the insertion barriers
+ //(the first and last columns are exempt from the barriers): prune.
+ bpacked[MODE_INS][row][col]=subfloor;
+ startmatrix[MODE_INS][row][col]=-1; //Fixed: was startmatrix[MODE_MS], which clobbered the MS start of this cell
+ }else{//Calculate INS score
+
+ //Length of the insertion run ending in the cell above.
+ final int streak=bpacked[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+ int score;
+ int time;
+ int start;
+ byte prevState;
+
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+ //Inherit the start value from the upper predecessor in the winning state.
+ start=startmatrix[prevState][row-1][col];
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ //Offset for lengthening the current run from time to time+insNeeded.
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+ bpacked[MODE_INS][row][col]=(score|time);
+ startmatrix[MODE_INS][row][col]=start; //Fixed: was startmatrix[MODE_MS]
+ assert((score&SCOREMASK)==score);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ if(col>=colStop){
+ //Past the stop column and this column did not pass: the row is finished.
+ if(col>colStop && maxGoodCol<col){break;}
+ if(row>1){
+ //Right sentinel in the previous row, so the next column's diagonal/upper reads see a pruned cell.
+ bpacked[MODE_MS][row-1][col+1]=subfloor;
+ bpacked[MODE_INS][row-1][col+1]=subfloor;
+ bpacked[MODE_DEL][row-1][col+1]=subfloor;
+ startmatrix[MODE_MS][row-1][col+1]=-1;
+ startmatrix[MODE_INS][row-1][col+1]=-1;
+ startmatrix[MODE_DEL][row-1][col+1]=-1;
+ }
+ }
+ }
+ }
+
+
+ //Scan the last row of every state for the best score; the strict > keeps the first state, then the first column, on ties.
+ int maxCol=-1;
+ int maxState=-1;
+ int maxStart=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<bpacked.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=bpacked[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxStart=startmatrix[state][rows][col];
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+ if(maxScore<minScore_off){
+ //Nothing in the last row reached the adjusted minimum.
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET; //Strip the packing offset before returning the score
+
+ return new int[] {rows, maxCol, maxState, maxScore, maxStart};
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit);
+ }
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Does not require a min score (ie, same as old method) */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ if(verbose){System.err.println("fillUnlimited()");}
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int subfloor=0-2*maxGain;
+ assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more.
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException();
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++;
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+ if(gap){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ if(row<BARRIER_D1 || row>BARRIER_D2){
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //Calculate INS score
+// if(gap || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /**
+ * Fills the three packed score matrices (MODE_MS, MODE_DEL, MODE_INS) for aligning read
+ * against ref[refStartLoc..refEndLoc], adding baseScores[row-1]<<SCOREOFFSET to every
+ * match/sub cell that is scored as a match. No score cutoff is applied.
+ * Each cell stores a score (SCOREMASK bits) OR'ed with a run length (TIMEMASK bits).
+ * Side effects: sets the rows and columns fields and overwrites packed[*][1..rows][1..columns].
+ * Row 0 and column 0 are read but never written here - presumably initialized elsewhere; verify.
+ * NOTE: the leading assert(false) makes this method fail whenever assertions are enabled.
+ * @param read Read bases; one matrix row per base
+ * @param ref Reference bases
+ * @param baseScores Per-read-base value added to the score on a match
+ * @param refStartLoc Reference index that corresponds to column 1
+ * @param refEndLoc Last reference index covered (inclusive)
+ * @return new int[] {rows, maxCol, maxState, maxScore}: the best cell in the last row, with
+ * maxScore already shifted right by SCOREOFFSET
+ */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; //Always fails when assertions are enabled
+ rows=read.length; //One row per read base; row 0 is only read, never written here
+ columns=refEndLoc-refStartLoc+1; //One column per reference base in the inclusive range
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]); //Bases for this cell are equal
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); //Bases of the diagonal predecessor are equal
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //Run length stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); //Continuing a match run uses MATCH2
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1); //Extend the match run, or start a new one after a sub
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Run counter exceeded MAX_TIME; step it back to MAX_TIME-MASK5
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //Score and run length share one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3)); //Sub cost depends on what preceded it and for how long
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read; its only use is commented out
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1); //Extend the sub run, or start a new one after a match
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Run counter exceeded MAX_TIME; step it back to MAX_TIME-MASK5
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //Deletion run length stored in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL; //Opening a deletion from the MS state
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //Past LIMIT_FOR_COST_5, DEL5 is charged only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read; its only use is commented out
+ if(scoreMS>=scoreD){ //Ties prefer coming from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Run counter exceeded MAX_TIME; step it back to MAX_TIME-MASK5
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //Insertion run length stored in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS; //Opening an insertion from the MS state
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read; its only use is commented out
+ if(scoreMS>=scoreI){ //Ties prefer coming from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Run counter exceeded MAX_TIME; step it back to MAX_TIME-MASK5
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //Scan the last row of every state matrix
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //Strict '>' keeps the first maximum found (lowest state, then lowest column)
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Drop the low bits so the caller gets an unshifted score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+	/**
+	 * Generates the match string for the alignment that ends at (row, col, state).
+	 * In gapped mode the reference is replaced by grefbuffer and refStartLoc/refEndLoc are
+	 * converted to gapped coordinates before delegating; otherwise the arguments are passed
+	 * straight through to traceback2.
+	 * @param gapped True to trace against the gap-compressed reference in grefbuffer
+	 * @return The match string produced by traceback2
+	 */
+	public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+		if(!gapped){
+			return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state);
+		}
+		final byte[] gappedRef=grefbuffer;
+		final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+		final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+		return traceback2(read, gappedRef, gappedStart, gappedStop, row, col, state);
+	}
+
+
+ /**
+ * Generates the match string by walking the packed matrices backward from (row, col, state)
+ * until row or col reaches 0. One symbol is emitted per step:
+ * MS state: 'm' when the read and reference bytes are equal, otherwise 'N' when either byte is
+ * not fully defined according to AminoAcid.isFullyDefined, otherwise 'S';
+ * DEL state: 'D', or '-' when the reference byte is GAPC;
+ * INS state: 'I', or 'Y' when col>=columns.
+ * Read bases left over when column 0 is reached are emitted as 'X'.
+ * The previous state is not stored in the matrix: a run length above 1 means "same state",
+ * otherwise it is recomputed from the neighboring scores with the same tie order as the fill.
+ * The symbols are then reversed, and each gap symbol is expanded to GAPLEN 'D' symbols.
+ * @param read Read bases
+ * @param ref Reference bases (gapped or ungapped, matching the matrix that was filled)
+ * @param refStartLoc Reference index that corresponds to column 1
+ * @param refEndLoc Only used in an assertion (must be >= refStartLoc)
+ * @param row Row of the final cell; asserted to equal rows
+ * @param col Column of the final cell
+ * @param state Matrix (MODE_MS, MODE_DEL or MODE_INS) of the final cell
+ * @return The match string in read order
+ */
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+// assert(false);
+ assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+ assert(row==rows);
+
+ byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
+ int outPos=0; //Next free slot in out; symbols are written in reverse (end-to-start) order
+
+ int gaps=0; //Number of '-' symbols written, each of which is expanded at the end
+
+ if(state==MODE_INS){
+ //TODO ? Maybe not needed.
+ }
+
+ while(row>0 && col>0){
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK; //Run length stored in the current cell
+ final byte prev;
+
+// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]);
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //A run longer than 1 means the predecessor was in this same state
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} //Ties prefer MS, then DEL, then INS
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+
+ byte c=read[row-1];
+ byte r=ref[refStartLoc+col-1];
+ if(c==r){
+ out[outPos]='m';
+ }else{
+ if(!AminoAcid.isFullyDefined(c)){
+ out[outPos]='N';
+ }else if(!AminoAcid.isFullyDefined(r)){
+// out[outPos]='X';
+ out[outPos]='N';
+ }else{
+ out[outPos]='S';
+ }
+ }
+
+ row--; //MS consumes one read base and one reference base
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+
+ byte r=ref[refStartLoc+col-1];
+ if(r==GAPC){
+ out[outPos]='-'; //NOTE(review): the expansion loop below tests for GAPC, so this relies on GAPC=='-' - confirm
+ gaps++;
+ }else{
+ out[outPos]='D';
+ }
+ col--; //DEL consumes a reference base only
+ }else{
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+
+ assert(state==MODE_INS) : state;
+ if(col==0){ //NOTE(review): looks unreachable; the loop condition guarantees col>0 here
+ out[outPos]='X';
+ }else if(col>=columns){
+ out[outPos]='Y';
+ }else{
+ out[outPos]='I';
+ }
+ row--; //INS consumes a read base only
+ }
+
+// assert(prev==prev0);
+ state=prev;
+ outPos++;
+ }
+
+ assert(row==0 || col==0);
+ if(col!=row){
+ while(row>0){ //Read bases remaining after column 0 was reached
+ out[outPos]='X';
+ outPos++;
+ row--;
+ col--;
+ }
+ if(col>0){
+ //do nothing
+ }
+ }
+
+
+ //Shrink and reverse the string
+ byte[] out2=new byte[outPos];
+ for(int i=0; i<outPos; i++){
+ out2[i]=out[outPos-i-1];
+ }
+ out=null;
+
+ if(gaps==0){return out2;}
+
+ //TODO Consider outputting this compressed.
+ byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)]; //Each gap symbol grows from 1 to GAPLEN symbols
+ for(int i=0, j=0; i<out2.length; i++){
+ byte c=out2[i];
+ if(c!=GAPC){
+ out3[j]=c;
+ j++;
+ }else{
+ int lim=j+GAPLEN;
+ for(; j<lim; j++){
+ out3[j]='D';
+ }
+ }
+ }
+ return out3;
+ }
+
+	/**
+	 * Scores the alignment that ends at (maxRow, maxCol, maxState) and reports the reference span it covers.
+	 * Ungapped: a direct call to score2.
+	 * Gapped: refStartLoc/refEndLoc are converted to coordinates within grefbuffer, score2 is run against
+	 * grefbuffer, and elements 1 and 2 of its result (bestRefStart, bestRefStop) are converted back to
+	 * ungapped reference coordinates before the array is returned.
+	 * @param gapped True to score against the gap-compressed reference in grefbuffer
+	 * @return The array produced by score2; it begins with {score, bestRefStart, bestRefStop}
+	 */
+	public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+			final int maxRow, final int maxCol, final int maxState, boolean gapped){
+		if(!gapped){
+			return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState);
+		}
+
+		if(verbose){
+			System.err.println("score():");
+			System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+		}
+
+		final byte[] gappedRef=grefbuffer;
+		final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+		final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+
+		//Round-trip checks on the coordinate translation
+		assert(translateFromGappedCoordinate(gappedStart, gappedRef)==refStartLoc); //TODO: Remove slow assertions
+		assert(translateFromGappedCoordinate(gappedStop, gappedRef)==refEndLoc);
+
+		assert(gappedStart==0) : gappedStart; //TODO: skip translation if this is always zero
+
+		if(verbose){System.err.println("gstart, gstop: "+gappedStart+", "+gappedStop);}
+		final int[] result=score2(read, gappedRef, gappedStart, gappedStop, maxRow, maxCol, maxState);
+		if(verbose){System.err.println("got score "+Arrays.toString(result));}
+
+		assert(result[1]==translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef)) :
+			"Verifying: "+result[1]+" -> "+translateFromGappedCoordinate(result[1], gappedRef)+" -> "+
+			translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef);
+		assert(result[2]==translateToGappedCoordinate(translateFromGappedCoordinate(result[2], gappedRef), gappedRef));
+
+		//Convert bestRefStart, then bestRefStop, back to ungapped coordinates
+		for(int k=1; k<=2; k++){
+			result[k]=translateFromGappedCoordinate(result[k], gappedRef);
+		}
+		if(verbose){System.err.println("returning score "+Arrays.toString(result));}
+		return result;
+	}
+
+ /**
+ * Traces the alignment that ends at packed[maxState][maxRow][maxCol] back to its start in order
+ * to find the reference span it covers. No match string is produced.
+ * The previous state is not stored in the matrix: a run length above 1 means "same state",
+ * otherwise it is recomputed from the neighboring scores.
+ * @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}, or those six values
+ * followed by {padLeft, padRight} when the span extends outside refStartLoc..refEndLoc and
+ * more reference padding is needed
+ */
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ if(row<rows){ //The end cell is above the last row, so some read bases are still unaligned
+ int difR=rows-row;
+ int difC=columns-col;
+
+ while(difR>difC){ //More read rows left than columns left
+ score+=POINTSoff_NOREF; //Charge one no-ref penalty per row that has no column
+ difR--;
+ }
+
+ row+=difR;
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+
+ final int bestRefStop=refStartLoc+col-1; //Reference index of the (possibly extended) end column
+
+ while(row>0 && col>0){
+// System.err.println("state="+state+", row="+row+", col="+col);
+
+
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK; //Run length stored in the current cell
+ final byte prev;
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //A run longer than 1 means the predecessor was in this same state
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} //Ties prefer MS, then DEL, then INS
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ row--; //MS consumes one read base and one reference base
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ col--; //DEL consumes a reference base only
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ row--; //INS consumes a read base only
+ }
+
+ if(col<0){ //NOTE(review): col was >0 at the top of the loop and drops by at most 1, so this looks unreachable
+ System.err.println(row);
+ break; //prevents an out of bounds access
+
+ }
+
+// assert(prev==prev0);
+ state=prev;
+
+// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n");
+ }
+// assert(false) : row+", "+col;
+ if(row>col){ //Read bases remain after the columns ran out
+ col-=row; //Move the start left by the number of leftover read bases
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ score>>=SCOREOFFSET; //Drop the low bits so the caller gets an unshifted score
+ int[] rvec;
+ if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
+ int padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ int padRight=Tools.max(0, bestRefStop-refEndLoc);
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState};
+ }
+ return rvec;
+ }
+
+	/**
+	 * Fills grefbuffer with a gap-compressed copy of ref and returns it.
+	 * <p>gaps holds inclusive reference ranges as pairs {start0, stop0, start1, stop1, ...}. Bases inside
+	 * each range are copied verbatim. The stretch between two consecutive ranges (y, z) is stored as
+	 * ref[y+1 .. y+GAPBUFFER+rem], then div GAPC symbols, then ref[z-GAPBUFFER .. z-1], where
+	 * gap=z-y-1, rem=gap%GAPLEN and div=(gap-GAPBUFFER2)/GAPLEN.
+	 * (This presumably relies on GAPBUFFER2 being 2*GAPBUFFER and a multiple of GAPLEN - confirm.)
+	 * A stretch shorter than MINGAP cannot be compressed and is copied verbatim.
+	 * <p>After the compressed reference, up to GREFLIMIT2_CUSHION further bytes are overwritten with the
+	 * bases that follow refEndLoc ('N' past the end of ref) to clear out data left by an earlier call.
+	 * <p>Side effects: sets grefRefOrigin, greflimit and greflimit2. gaps[0] and gaps[gaps.length-1]
+	 * are temporarily widened to refStartLoc/refEndLoc and restored before returning.
+	 * @param ref Reference bases
+	 * @param gaps Inclusive reference ranges, as start/stop pairs
+	 * @param refStartLoc First reference position to include; asserted to be <= gaps[0]
+	 * @param refEndLoc Last reference position to include; asserted to be >= gaps[gaps.length-1]
+	 * @return gref (the grefbuffer field)
+	 */
+	private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){
+		assert(gaps!=null && gaps.length>0);
+
+		assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps);
+		assert(refEndLoc>=gaps[gaps.length-1]);
+
+		final int g0_old=gaps[0];
+		final int gN_old=gaps[gaps.length-1];
+		gaps[0]=Tools.min(gaps[0], refStartLoc);
+		gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+		grefRefOrigin=gaps[0];
+
+		if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+		//The output length is not precomputed; grefbuffer is preallocated.
+		byte[] gref=grefbuffer;
+
+		int gpos=0;
+		for(int i=0; i<gaps.length; i+=2){
+			int x=gaps[i];
+			int y=gaps[i+1];
+
+			for(int r=x; r<=y; r++, gpos++){
+				//TODO: if out of bounds, use an 'N'
+				gref[gpos]=ref[r];
+			}
+
+			if(i+2<gaps.length){
+				int z=gaps[i+2];
+				assert(z>y);
+				int gap=z-y-1;
+				assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+				if(gap<MINGAP){
+					//Only reachable with assertions disabled. Too short to compress, so copy the bases
+					//verbatim; dropping them would shift every later coordinate.
+					for(int r=y+1; r<z; r++, gpos++){
+						gref[gpos]=ref[r];
+					}
+				}else{
+					int rem=gap%GAPLEN;
+					int lim=y+GAPBUFFER+rem;
+
+					int div=(gap-GAPBUFFER2)/GAPLEN;
+					if(verbose){
+						System.err.println("div = "+div);
+					}
+					assert(div>0);
+
+					//Leading buffer (plus the remainder that does not fill a whole gap symbol)
+					for(int r=y+1; r<=lim; r++, gpos++){
+						gref[gpos]=ref[r];
+					}
+					//One gap symbol per GAPLEN skipped bases
+					for(int g=0; g<div; g++, gpos++){
+						gref[gpos]=GAPC;
+					}
+					//Trailing buffer
+					for(int r=z-GAPBUFFER; r<z; r++, gpos++){
+						gref[gpos]=ref[r];
+					}
+				}
+			}
+		}
+
+		greflimit=gpos;
+
+		assert(gref[gpos-1]==ref[refEndLoc]);
+		//Add a cushion to the end to clear out the prior data (especially GAPC) that was there
+		{
+			//lim is clamped to the buffer length, so the cushion may be shorter than GREFLIMIT2_CUSHION
+			final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+			//Reset first so a stale value from an earlier call never survives when there is no room for a cushion
+			greflimit2=greflimit;
+			for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){
+				gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+				greflimit2=i;
+			}
+		}
+
+		if(verbose){
+			System.err.println("gref:\n"+new String(gref));
+		}
+
+		//Restore the caller's array
+		gaps[0]=g0_old;
+		gaps[gaps.length-1]=gN_old;
+
+		if(verbose){
+			System.err.println("\ngaps3: "+Arrays.toString(gaps));
+		}
+
+		return gref;
+	}
+
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score){
+//// {score, bestRefStart, bestRefStop}
+// int a=score[1];
+// int b=score[2];
+// int a2=-9999;
+// int b2=-9999;
+// for(int i=0, j=grefRefOrigin; i<grefbuffer.length; i++){
+// byte c=grefbuffer[i];
+//
+// if(i==a){a2=j;}
+// if(i==b){
+// b2=j;
+// assert(a2!=-9999);
+// score[1]=a2;
+// score[2]=b2;
+// return score;
+// }
+//
+// j+=(c==GAPC ? GAPLEN : 1);
+//// if(c!=GAPC){j++;}
+//// else{j+=GAPLEN;}
+// }
+// throw new RuntimeException("Out of bounds.");
+// }
+
+	/**
+	 * Converts an index within the gapped reference buffer into an ungapped reference coordinate.
+	 * Index 0 corresponds to grefRefOrigin; every GAPC symbol before the index advances the reference
+	 * coordinate by GAPLEN and every other symbol by 1. Indices at or below 0 are offset linearly.
+	 * @param point Index into gref
+	 * @param gref Gapped reference buffer
+	 * @return The reference coordinate that corresponds to point
+	 * @throws RuntimeException if point is not below greflimit2
+	 */
+	private final int translateFromGappedCoordinate(int point, byte[] gref){
+		if(verbose){System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+		if(point<1){return grefRefOrigin+point;}
+
+		int refPos=grefRefOrigin;
+		int gpos=0;
+		while(gpos<greflimit2){
+			final byte sym=gref[gpos];
+			assert(point>=gpos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+			if(gpos==point){
+				if(verbose){System.err.println(" -> "+refPos);}
+				return refPos;
+			}
+
+			if(sym==GAPC){
+				refPos+=GAPLEN;
+			}else{
+				refPos+=1;
+			}
+			gpos++;
+		}
+
+		//Ran off the end of the buffer: dump the state for debugging
+		System.err.println(grefRefOrigin);
+		System.err.println(point);
+		System.err.println(new String(gref));
+
+		throw new RuntimeException("Out of bounds.");
+	}
+
+	/**
+	 * Converts an ungapped reference coordinate into an index within the gapped reference buffer.
+	 * The buffer is scanned from index 0 (reference coordinate grefRefOrigin), advancing the reference
+	 * coordinate by GAPLEN for each GAPC symbol and by 1 for any other symbol, until it equals point.
+	 * Coordinates at or below grefRefOrigin are offset linearly.
+	 * @param point Reference coordinate
+	 * @param gref Gapped reference buffer
+	 * @return The index into gref that corresponds to point
+	 * @throws RuntimeException if no index below greflimit2 corresponds to point
+	 */
+	private final int translateToGappedCoordinate(int point, byte[] gref){
+		if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+
+		final int offset=point-grefRefOrigin;
+		if(offset<=0){return offset;}
+
+		int refPos=grefRefOrigin;
+		for(int gpos=0; gpos<greflimit2; gpos++){
+			//The scan must never step past the target; that would mean point lies inside a compressed gap
+			assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+			final byte sym=gref[gpos];
+
+			if(refPos==point){
+				if(verbose){System.err.println(" -> "+gpos);}
+				return gpos;
+			}
+
+			refPos+=(sym==GAPC ? GAPLEN : 1);
+		}
+
+		//Ran off the end of the buffer: dump the state for debugging
+		System.err.println(grefRefOrigin);
+		System.err.println(point);
+		System.err.println(new String(gref));
+
+		throw new RuntimeException("Out of bounds.");
+	}
+
+// public final int scoreNoIndels(byte[] read, SiteScore ss){
+//
+// ChromosomeArray cha=Data.getChromosome(ss.chrom);
+// final int refStart=ss.start;
+//
+// int score=0;
+// int mode=MODE_START;
+// int timeInMode=0;
+// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match
+//
+// for(int i=0; i<read.length; i++){
+// byte c=read[i];
+// byte r=cha.get(refStart+i);
+//
+// if(c==r){
+// if(mode==MODE_MS){
+// timeInMode++;
+// score+=POINTSoff_MATCH2;
+// }else{
+// timeInMode=0;
+// score+=POINTSoff_MATCH;
+// }
+// mode=MODE_MS;
+// }else if(c<0 || c=='N'){
+// score+=POINTSoff_NOCALL;
+// }else if(r<0 || r=='N'){
+// score+=POINTSoff_NOREF;
+// }else{
+// if(mode==MODE_SUB){timeInMode++;}
+// else{timeInMode=0;}
+//
+// if(timeInMode==0){score+=POINTSoff_SUB;}
+// else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTSoff_SUB2;}
+// else{score+=POINTSoff_SUB3;}
+// }
+// }
+//
+// return score;
+// }
+
+
+// public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){
+//
+// ChromosomeArray cha=Data.getChromosome(chrom);
+//
+// int score=0;
+// int mode=MODE_START;
+// int timeInMode=0;
+//
+// //This block handles cases where the read runs outside the reference
+// //Of course, padding the reference with 'N' would be better, but...
+// int readStart=0;
+// int readStop=read.length;
+// final int refStop=refStart+read.length;
+// if(refStart<0){
+// readStart=0-refStart;
+// score+=POINTSoff_NOREF*readStart;
+// }
+// if(refStop>cha.maxIndex+1){
+// int dif=(cha.maxIndex+1-refStop);
+// readStop-=dif;
+// score+=POINTSoff_NOREF*dif;
+// }
+//
+//// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //No longer needed.
+//
+// for(int i=readStart; i<readStop; i++){
+// byte c=read[i];
+// byte r=cha.get(refStart+i);
+//
+// if(c==r){
+// if(mode==MODE_MS){
+// timeInMode++;
+// score+=POINTSoff_MATCH2;
+// }else{
+// timeInMode=0;
+// score+=POINTSoff_MATCH;
+// }
+// mode=MODE_MS;
+// }else if(c<0 || c=='N'){
+// score+=POINTSoff_NOCALL;
+// }else if(r<0 || r=='N'){
+// score+=POINTSoff_NOREF;
+// }else{
+// if(mode==MODE_SUB){timeInMode++;}
+// else{timeInMode=0;}
+//
+// if(timeInMode==0){score+=POINTSoff_SUB;}
+// else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTSoff_SUB2;}
+// else{score+=POINTSoff_SUB3;}
+// }
+// }
+//
+// return score;
+// }
+
+
+
+ /** Calculates score based on an array from Index */
+ private final int calcAffineScore(int[] locArray){
+ int score=0;
+ int lastLoc=-2; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=POINTS_MATCH2;
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=POINTS_MATCH;
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else{//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ @Override
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases){
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ @Override
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig){
+ assert(minContig>1) : minContig;
+ //Scores a read from locArray: a positive entry is a location, -1 a substitution, -2 a no-call/no-ref; equal consecutive positive entries count as contiguous matches.
+ int contig=0; //Counter for the current run of contiguous matches (restarts at 1 on a first match or match after a sub, at 0 on an indel)
+ int maxContig=0; //Longest run recorded so far; updated whenever a run is interrupted
+
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1; //Previous locArray entry, negative codes included
+ int timeInMode=0; //Set to 1 by an indel or a fresh substitution, incremented by consecutive substitutions, 0 after an N
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ contig++;
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ maxContig=Tools.max(maxContig, contig);
+ contig=1;
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){ //Long deletion: charge POINTS_GAP per GAPLEN unit and tier-score only the remainder plus GAPBUFFER2
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5); //Insertion length is capped at 5 for scoring
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length)); //NOTE(review): this bound omits baseScores although score includes them - confirm baseScores are never positive
+ if(Tools.max(contig, maxContig)<minContig){score=Tools.min(score, -50*locArray.length);} //Best run too short: cap the score at -50 per base
+ return score;
+ }
+
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /**
+  * Builds the per-base match string for an ungapped placement of read at refStart:
+  * 'N' when either base is 'N' (reference positions outside ref count as 'N'),
+  * 'm' when the bases are equal, 'S' otherwise.
+  * @return A new array of read.length symbols, or null if read or ref is null
+  */
+ @Override
+ public final byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart){
+     if(ref==null || read==null){return null;}
+
+     final int len=read.length;
+     final byte[] out=new byte[len];
+
+     for(int q=0; q<len; q++){
+         final int t=refStart+q;
+         final boolean inRef=(t>=0 && t<ref.length);
+         final byte r=(inRef ? ref[t] : (byte)'N');
+         final byte c=read[q];
+
+         final byte symbol;
+         if(c=='N' || r=='N'){symbol='N';}
+         else{symbol=(byte)(c==r ? 'm' : 'S');}
+         out[q]=symbol;
+     }
+
+     return out;
+ }
+
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /**
+  * Scores an ungapped alignment of read at refStart, adding baseScores[i] for every matching
+  * base, and writes the per-base match string into matchReturn[0]
+  * ('m' match, 'S' substitution, 'N' no-call or no-ref).
+  * Unlike the scoreNoIndels methods, a placement that extends past either end of ref is not
+  * clipped: -99999 is returned and matchReturn is left untouched.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param baseScores Per-base bonus, indexed like read
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn Single-element holder; matchReturn[0] is replaced by a new array when it
+  * is null or not exactly read.length long
+  * @return Alignment score, or -99999 when the read does not lie entirely inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     //Reads hanging off the reference are rejected here, so every read index used below
+     //has a valid reference counterpart and no clipping is required.
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             score+=baseScores[i];
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             //First substitution of a run costs the most; later ones are cheaper
+             if(timeInMode==0){score+=POINTS_SUB;}
+             else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+             else{score+=POINTS_SUB3;}
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /**
+  * Scores an ungapped alignment of read at refStart and writes the per-base match string
+  * into matchReturn[0] ('m' match, 'S' substitution, 'N' no-call or no-ref).
+  * Unlike the scoreNoIndels methods, a placement that extends past either end of ref is not
+  * clipped: -99999 is returned and matchReturn is left untouched.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn Single-element holder; matchReturn[0] is replaced by a new array when it
+  * is null or not exactly read.length long
+  * @return Alignment score, or -99999 when the read does not lie entirely inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     //Reads hanging off the reference are rejected here, so every read index used below
+     //has a valid reference counterpart and no clipping is required.
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             //First substitution of a run costs the most; later ones are cheaper
+             if(timeInMode==0){score+=POINTS_SUB;}
+             else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+             else{score+=POINTS_SUB3;}
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /**
+  * Scores an ungapped alignment of read against ref with read[0] placed at ref[refStart].
+  * Read bases that fall outside ref are charged POINTS_NOREF each and are never compared.
+  * The semiperfect/norefs bookkeeping is tracked but currently not written to ss
+  * (see the disabled line at the end).
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStart Reference index aligned to read[0]; may be negative
+  * @param ss Site being scored; not read or modified at present
+  * @return Alignment score
+  */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+     //Number of read bases hanging off each end of the reference
+     final int leftOverhang=(refStart<0 ? 0-refStart : 0);
+     final int refStop=refStart+read.length;
+     final int rightOverhang=(refStop>ref.length ? refStop-ref.length : 0);
+     final int readStop=read.length-rightOverhang;
+
+     int total=POINTS_NOREF*leftOverhang;
+     total+=POINTS_NOREF*rightOverhang;
+
+     boolean semiperfect=true;
+     int norefs=leftOverhang+rightOverhang;
+
+     int state=-1; //MODE_MS or MODE_SUB once a defined pair has been scored
+     int streak=0; //Earlier positions scored consecutively in the current state
+
+     for(int q=leftOverhang; q<readStop; q++){
+         final byte c=read[q];
+         final byte r=ref[refStart+q];
+         final boolean same=(c==r && c!='N');
+
+         if(!same && (c<0 || c=='N')){//no-call; state and streak are left alone
+             total+=POINTS_NOCALL;
+             semiperfect=false;
+             continue;
+         }
+         if(!same && (r<0 || r=='N')){//no-ref; state and streak are left alone
+             total+=POINTS_NOREF;
+             norefs++;
+             continue;
+         }
+
+         final int next=(same ? MODE_MS : MODE_SUB);
+         final boolean continuing=(state==next);
+         streak=(continuing ? streak+1 : 0);
+
+         if(same){
+             total+=(continuing ? POINTS_MATCH2 : POINTS_MATCH);
+         }else{
+             semiperfect=false;
+             if(!continuing){total+=POINTS_SUB;}
+             else{total+=(streak<LIMIT_FOR_COST_3 ? POINTS_SUB2 : POINTS_SUB3);}
+         }
+         state=next;
+     }
+
+     //TODO: Verify this; it's in the PacBio version
+     //if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+
+     return total;
+ }
+
+ /**
+  * Scores an ungapped alignment of read against ref with read[0] placed at ref[refStart],
+  * adding baseScores[i] for every matching read base i.
+  * Read bases that fall outside ref are charged POINTS_NOREF each and are never compared.
+  * The semiperfect/norefs bookkeeping is tracked but currently not written to ss
+  * (see the disabled lines at the end).
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param baseScores Per-base bonus, indexed like read
+  * @param refStart Reference index aligned to read[0]; may be negative
+  * @param ss Site being scored; not read or modified at present
+  * @return Alignment score
+  */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+     //Number of read bases hanging off each end of the reference
+     final int leftOverhang=(refStart<0 ? 0-refStart : 0);
+     final int refStop=refStart+read.length;
+     final int rightOverhang=(refStop>ref.length ? refStop-ref.length : 0);
+     final int readStop=read.length-rightOverhang;
+
+     int total=POINTS_NOREF*leftOverhang;
+     total+=POINTS_NOREF*rightOverhang;
+
+     boolean semiperfect=true;
+     int norefs=leftOverhang+rightOverhang;
+
+     int state=-1; //MODE_MS or MODE_SUB once a defined pair has been scored
+     int streak=0; //Earlier positions scored consecutively in the current state
+
+     for(int q=leftOverhang; q<readStop; q++){
+         final byte c=read[q];
+         final byte r=ref[refStart+q];
+         final boolean same=(c==r && c!='N');
+
+         if(!same && (c<0 || c=='N')){//no-call; state and streak are left alone
+             total+=POINTS_NOCALL;
+             semiperfect=false;
+             continue;
+         }
+         if(!same && (r<0 || r=='N')){//no-ref; state and streak are left alone
+             total+=POINTS_NOREF;
+             norefs++;
+             continue;
+         }
+
+         final int next=(same ? MODE_MS : MODE_SUB);
+         final boolean continuing=(state==next);
+         streak=(continuing ? streak+1 : 0);
+
+         if(same){
+             total+=(continuing ? POINTS_MATCH2 : POINTS_MATCH);
+             total+=baseScores[q];
+         }else{
+             semiperfect=false;
+             if(!continuing){total+=POINTS_SUB;}
+             else{total+=(streak<LIMIT_FOR_COST_3 ? POINTS_SUB2 : POINTS_SUB3);}
+         }
+         state=next;
+     }
+
+     //TODO: Verify. This is in the PacBio version.
+//     if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+//     assert(Read.CHECKSITE(ss, read, -1));
+
+     return total;
+ }
+
+ /** Returns POINTS_MATCH for the first base plus POINTS_MATCH2 for each of the remaining numBases-1 bases. */
+ @Override
+ public final int maxQuality(int numBases){
+     final int followers=numBases-1;
+     return POINTS_MATCH+followers*POINTS_MATCH2;
+ }
+
+ /** Same as maxQuality(baseScores.length), plus Tools.sumInt(baseScores). */
+ @Override
+ public final int maxQuality(byte[] baseScores){
+     final int followers=baseScores.length-1;
+     final int matchPoints=POINTS_MATCH+followers*POINTS_MATCH2;
+     return matchPoints+Tools.sumInt(baseScores);
+ }
+
+ /**
+  * Returns maxQuality(numBases) plus the lower (more negative) of POINTS_DEL and
+  * POINTS_INS-POINTS_MATCH2. The result is asserted to lie below
+  * maxQuality-2*POINTS_MATCH2+POINTS_MATCH+POINTS_SUB.
+  */
+ public final int maxImperfectScore(int numBases){
+     final int perfect=maxQuality(numBases);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Returns maxQuality(baseScores) plus the lower (more negative) of POINTS_DEL and
+  * POINTS_INS-POINTS_MATCH2. The result is asserted to lie below
+  * maxQuality-2*POINTS_MATCH2+POINTS_MATCH+POINTS_SUB.
+  */
+ @Override
+ public final int maxImperfectScore(byte[] baseScores){
+     final int perfect=maxQuality(baseScores);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Score of a deletion of len bases.
+  * POINTS_DEL opens the deletion; each further base is charged POINTS_DEL2, POINTS_DEL3 or
+  * POINTS_DEL4 as the length passes 1, LIMIT_FOR_COST_3 and LIMIT_FOR_COST_4, and length beyond
+  * LIMIT_FOR_COST_5 is charged POINTS_DEL5 per TIMESLIP bases, rounded up.
+  * @param len Deletion length; values below 1 score 0
+  * @param approximateGaps When true and len exceeds MINGAP, POINTS_GAP is charged per GAPLEN
+  * unit and the tiers are applied only to the remainder plus GAPBUFFER2
+  * @return Deletion score
+  */
+ @Override
+ public int calcDelScore(int len, boolean approximateGaps){
+     if(len<1){return 0;}
+
+     int remaining=len;
+     int total=POINTS_DEL;
+
+     if(approximateGaps && remaining>MINGAP){
+         final int leftover=remaining%GAPLEN;
+         final int gapUnits=(remaining-GAPBUFFER2)/GAPLEN;
+         total+=(gapUnits*POINTS_GAP);
+         assert(leftover+GAPBUFFER2<remaining);
+         remaining=leftover+GAPBUFFER2;
+         assert(remaining>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+     }
+
+     if(remaining>LIMIT_FOR_COST_5){
+         final int excess=remaining-LIMIT_FOR_COST_5;
+         total+=((excess+MASK5)/TIMESLIP)*POINTS_DEL5;
+         remaining=LIMIT_FOR_COST_5;
+     }
+
+     //Peel off the bases belonging to each tier, longest tier first
+     final int tier4=(remaining>LIMIT_FOR_COST_4 ? remaining-LIMIT_FOR_COST_4 : 0);
+     remaining-=tier4;
+     final int tier3=(remaining>LIMIT_FOR_COST_3 ? remaining-LIMIT_FOR_COST_3 : 0);
+     remaining-=tier3;
+     final int tier2=(remaining>1 ? remaining-1 : 0);
+
+     total+=tier4*POINTS_DEL4;
+     total+=tier3*POINTS_DEL3;
+     total+=tier2*POINTS_DEL2;
+     return total;
+ }
+
+ private static int calcDelScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL;
+
+ if(len>LIMIT_FOR_COST_5){
+ score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+ len=LIMIT_FOR_COST_5;
+ }
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ /**
+  * Score of an insertion of len bases.
+  * POINTS_INS opens the insertion; each further base is charged POINTS_INS2, POINTS_INS3 or
+  * POINTS_INS4 as the length passes 1, LIMIT_FOR_COST_3 and LIMIT_FOR_COST_4.
+  * @param len Insertion length; values below 1 score 0
+  * @return Insertion score
+  */
+ @Override
+ public int calcInsScore(int len){
+     if(len<1){return 0;}
+
+     //Peel off the bases belonging to each tier, longest tier first
+     int remaining=len;
+     final int tier4=(remaining>LIMIT_FOR_COST_4 ? remaining-LIMIT_FOR_COST_4 : 0);
+     remaining-=tier4;
+     final int tier3=(remaining>LIMIT_FOR_COST_3 ? remaining-LIMIT_FOR_COST_3 : 0);
+     remaining-=tier3;
+     final int tier2=(remaining>1 ? remaining-1 : 0);
+
+     int total=POINTS_INS;
+     total+=tier4*POINTS_INS4;
+     total+=tier3*POINTS_INS3;
+     total+=tier2*POINTS_INS2;
+     return total;
+ }
+
+ private static int calcInsScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_INS;
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_INS2;
+ }
+ return score;
+ }
+
+
+ private final int[][][] packed;
+ /** Banded packed matrix */
+ private final int[][][] bpacked;
+ /** Start locations of banded matrix */
+ private final int[][][] startmatrix;
+ /** Score for column zero */
+ private final int[] col0score;
+
+ private final byte[] grefbuffer;
+ private int greflimit=-1;
+ private int greflimit2=-1;
+ private int grefRefOrigin=-1;
+
+
+ /** Returns the internal grefbuffer array itself, not a copy. DO NOT MODIFY the returned array. */
+ public final byte[] getGrefbuffer(){
+ return grefbuffer;
+ }
+
+ public final int[] vertLimit;
+ public final int[] horizLimit;
+
+ public CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+ public CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ /**
+  * Converts a minimum identity into a minimum score ratio, using a fixed weighted mix of
+  * substitution, deletion and insertion costs taken from the POINTS_* constants.
+  * @param minid Minimum identity as a fraction in (0,1]; a value above 1 is treated as a
+  * percentage and divided by 100
+  * @return Score ratio, never below 0.05
+  */
+ public static float minIdToMinRatio(double minid){
+     final double identity=(minid>1 ? minid/100 : minid);
+     assert(identity>0 && identity<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100.";
+
+     final double perMatch=POINTS_MATCH2;
+     final double firstMatchDelta=POINTS_MATCH-POINTS_MATCH2;
+
+     //Average cost of one non-matching base of each kind, relative to a run of matches
+     final double subCost=-POINTS_MATCH2+0.5*(firstMatchDelta+POINTS_SUB)+0.5*POINTS_SUB2;
+     final double delCost=0.1*(firstMatchDelta+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4;
+     final double insCost=-POINTS_MATCH2+0.4*(firstMatchDelta+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3);
+     final double avgBadCost=.7*subCost+.2*delCost+.1*insCost;
+
+     final double badFraction=1-identity;
+     final double rawRatio=(perMatch+badFraction*avgBadCost)/perMatch;
+     assert(rawRatio<=1);
+
+     final double clamped=Tools.max(0.05, rawRatio);
+     return (float)clamped;
+ }
+
+ public static final int TIMEBITS=11; //Width of the low-order (time) field of a packed int
+ public static final int SCOREBITS=32-TIMEBITS; //The remaining high-order bits hold the score
+ public static final int MAX_TIME=((1<<TIMEBITS)-1); //Largest value that fits in TIMEBITS bits
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; //Largest positive signed SCOREBITS-bit value, less a margin of 2000
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+ public static final int SCOREOFFSET=TIMEBITS; //Shift that moves a score to or from its field
+
+ public static final int TIMEMASK=~((-1)<<TIMEBITS); //Selects the low TIMEBITS bits
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; //Selects the SCOREBITS bits above the time field
+
+ private static final byte MODE_MS=0;
+ private static final byte MODE_DEL=1;
+ private static final byte MODE_INS=2;
+ private static final byte MODE_SUB=3;
+
+ public static final int POINTS_NOREF=0; //Read base whose reference base is 'N', negative, or outside the reference
+ public static final int POINTS_NOCALL=0; //Read base that is 'N' or negative
+ public static final int POINTS_MATCH=70; //Match that does not directly follow another match
+ public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-127; //Substitution that starts a run of substitutions
+ public static final int POINTS_SUBR=-147; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-51; //Later substitution while the run is shorter than LIMIT_FOR_COST_3
+ public static final int POINTS_SUB3=-25; //Substitution further into the run
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-395; //Opens an insertion
+ public static final int POINTS_INS2=-39; //Each further inserted base up to LIMIT_FOR_COST_3
+ public static final int POINTS_INS3=-23; //Each inserted base from there up to LIMIT_FOR_COST_4
+ public static final int POINTS_INS4=-8; //Each inserted base beyond LIMIT_FOR_COST_4
+ public static final int POINTS_DEL=-472; //Opens a deletion
+ public static final int POINTS_DEL2=-33; //Each further deleted base up to LIMIT_FOR_COST_3
+ public static final int POINTS_DEL3=-9; //Each deleted base from there up to LIMIT_FOR_COST_4
+ public static final int POINTS_DEL4=-1; //Each deleted base from there up to LIMIT_FOR_COST_5
+ public static final int POINTS_DEL5=-1; //Per TIMESLIP deleted bases (rounded up) beyond LIMIT_FOR_COST_5
+ public static final int POINTS_DEL_REF_N=-10;
+ public static final int POINTS_GAP=0-GAPCOST; //Per GAPLEN unit of a long deletion when gaps are approximated
+
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1;
+ static{assert(Integer.bitCount(TIMESLIP)==1);}
+
+
+ private static final int BARRIER_I1=2;
+ private static final int BARRIER_D1=3;
+
+
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=20;
+ public static final int LIMIT_FOR_COST_5=80;
+
+ public static final int BAD=MIN_SCORE-1;
+
+
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+
+ public final int POINTS_NOREF(){return POINTS_NOREF;} //This and the accessors below return the static constant of the same name
+ public final int POINTS_NOCALL(){return POINTS_NOCALL;}
+ public final int POINTS_MATCH(){return POINTS_MATCH;}
+ public final int POINTS_MATCH2(){return POINTS_MATCH2;}
+ public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;}
+ public final int POINTS_SUB(){return POINTS_SUB;}
+ public final int POINTS_SUBR(){return POINTS_SUBR;}
+ public final int POINTS_SUB2(){return POINTS_SUB2;}
+ public final int POINTS_SUB3(){return POINTS_SUB3;}
+ public final int POINTS_MATCHSUB(){return POINTS_MATCHSUB;}
+ public final int POINTS_INS(){return POINTS_INS;}
+ public final int POINTS_INS2(){return POINTS_INS2;}
+ public final int POINTS_INS3(){return POINTS_INS3;}
+ public final int POINTS_INS4(){return POINTS_INS4;}
+ public final int POINTS_DEL(){return POINTS_DEL;}
+ public final int POINTS_DEL2(){return POINTS_DEL2;}
+ public final int POINTS_DEL3(){return POINTS_DEL3;}
+ public final int POINTS_DEL4(){return POINTS_DEL4;}
+ public final int POINTS_DEL5(){return POINTS_DEL5;}
+ public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;}
+ public final int POINTS_GAP(){return POINTS_GAP;}
+
+ public final int TIMESLIP(){return TIMESLIP;}
+ public final int MASK5(){return MASK5;}
+ public final int SCOREOFFSET(){return SCOREOFFSET;} //Returns the static field; calling SCOREOFFSET() here would recurse until StackOverflowError
+
+ final int BARRIER_I1(){return BARRIER_I1;} //Declared without an access modifier, unlike the public accessors around it
+ final int BARRIER_D1(){return BARRIER_D1;}
+
+ public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;} //Length thresholds at which the per-base indel and substitution cost changes
+ public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_4;}
+ public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_5;}
+
+ public final int BAD(){return BAD;}
+
+
+ private int rows;
+ private int columns;
+
+}
diff --git a/current/align2/MultiStateAligner11ts.java b/current/align2/MultiStateAligner11ts.java
new file mode 100755
index 0000000..c356bd4
--- /dev/null
+++ b/current/align2/MultiStateAligner11ts.java
@@ -0,0 +1,2576 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.SiteScore;
+
+import dna.AminoAcid;
+
+/**
+ * Modification of MultiStateAligner9ts to replace fixed affine steps with an array */
+public final class MultiStateAligner11ts extends MSA{
+
+
+ public static void main(String[] args){
+ //Debug entry point: args[0] is a deletion length; prints calcDelScore(length, true) from a 100x100 aligner.
+ int x=Integer.parseInt(args[0]);
+ MultiStateAligner11ts msa=new MultiStateAligner11ts(100, 100);
+ System.out.println(msa.calcDelScore(x, true));
+ //Disabled demo below: aligns args[0] (read) against args[1] (ref) and prints the matrices, traceback and score.
+// byte[] read=args[0].getBytes();
+// byte[] ref=args[1].getBytes();
+//
+// byte[] original=ref;
+//
+// if(args.length>2 && args[2].equalsIgnoreCase("cs")){
+// colorspace=true;
+// read=AminoAcid.toColorspace(read);
+// ref=AminoAcid.toColorspace(ref);
+// }
+//
+// MultiStateAligner11ts msa=new MultiStateAligner11ts(read.length, ref.length);
+//
+// System.out.println("Initial: ");
+// printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+//
+// int[] max=msa.fillLimited(read, ref, 0, ref.length-1, 0, null);
+//
+// System.out.println("Max: "+Arrays.toString(max));
+//
+// System.out.println("Final: ");
+// printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+//
+// byte[] out=msa.traceback(read, ref, 0, ref.length-1, max[0], max[1], max[2], false);
+//
+// int[] score=null;
+// score=msa.score(read, ref, 0, ref.length-1, max[0], max[1], max[2], false);
+//
+// if(colorspace){System.out.println(new String(original));}
+// System.out.println(new String(ref));
+// System.out.println(new String(read));
+// System.out.println(new String(out));
+// System.out.println("Score: "+Arrays.toString(score));
+ }
+
+
+ public MultiStateAligner11ts(int maxRows_, int maxColumns_){
+ super(maxRows_, maxColumns_);
+
+ {
+ int[][][] packed0=null;
+ byte[] grefbuffer0=null;
+ int[] vertLimit0=null;
+ int[] horizLimit0=null;
+
+ try {
+ packed0=new int[3][maxRows+1][maxColumns+1];
+ grefbuffer0=new byte[maxColumns+2];
+ vertLimit0=new int[maxRows+1];
+ horizLimit0=new int[maxColumns+1];
+ } catch (OutOfMemoryError e) {
+ packed0=null;
+ grefbuffer0=null;
+ vertLimit0=null;
+ horizLimit0=null;
+ throw new RuntimeException(e.toString());
+ }
+
+ packed=packed0;
+ grefbuffer=grefbuffer0;
+ vertLimit=vertLimit0;
+ horizLimit=horizLimit0;
+ }
+
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+// for(int i=0; i<maxColumns+1; i++){
+// scores[0][i]=0-i;
+// }
+
+ for(int matrix=0; matrix<packed.length; matrix++){
+ for(int i=1; i<=maxRows; i++){
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff;
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ for(int i=0; i<=maxRows; i++){
+
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+// int score=(i<2 ? (i*POINTSoff_INS) :
+// (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 :
+// (i<LIMIT_FOR_COST_4 ? prevScore+POINTSoff_INS3 : prevScore+POINTSoff_INS4)));
+ int score=prevScore+POINTSoff_INS_ARRAY[i];
+
+ packed[matrix][i][0]=score;
+ }
+// for(int i=1; i<maxColumns+1; i++){
+// prevState[matrix][0][i]=MODE_DEL;
+// }
+// for(int i=0; i<=maxColumns; i++){
+// packed[matrix][0][i]|=MODE_MS;
+// }
+ }
+ }
+
+ @Override
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);} //No gaps: align against ref directly
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+ //Checked before gref is first dereferenced, so a null buffer reports the read instead of failing with an NPE in the verbose print
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ if(verbose && greflimit>0 && greflimit<500){
+ System.err.println(new String(gref, 0, greflimit));
+ }
+ //With gaps the alignment runs against the buffer returned by makeGref, over columns 0..greflimit
+ return fillLimitedX(read, gref, 0, greflimit, minScore);
+ }
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ if(verbose){System.err.println("fillLimitedX");}
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 :
+ Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2;
+
+ if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){
+// assert(false) : minScore;
+// assert(minScore>0) : minScore;
+// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length);
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ if(verbose){
+ System.err.println("Clearing matrix due to verbose mode.");
+ for(int x=0; x<packed.length; x++){
+ for(int y=1; y<rows+1; y++){
+ Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+ }
+ }
+ }
+
+ for(int x=0; x<packed.length; x++){
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int floor=minScore_off-maxGain;
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2;
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ vertLimit[rows]=minScore_off;
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+
+ horizLimit[columns]=minScore_off;
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+// vertLimit[rows]=minScore_off;
+// for(int i=rows-1; i>=0; i--){
+// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+//
+// horizLimit[columns]=minScore_off;
+// for(int i=columns-1; i>=0; i--){
+// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+
+ for(int row=1; row<=rows; row++){
+
+ final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband));
+ final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1));
+
+ minGoodCol=-1;
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;}
+
+
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.err.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ POINTSoff_SUB_ARRAY[streak+1]);
+// scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+// (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+POINTSoff_INS_ARRAY[streak+1];
+// int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+// streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+// streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ if(col>=colStop){
+ if(col>colStop && (maxGoodCol<col || halfband>0)){break;}
+ if(row>1){
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+
+ if(verbose){
+ System.out.println("Filled matrix.");
+ printMatrix(packed, rows, columns, TIMEMASK, SCOREOFFSET);
+ }
+ if(verbose){System.err.println("maxscore="+(maxScore>>SCOREOFFSET)+", minscore="+(minScore_off>>SCOREOFFSET));}
+
+ if(maxScore<minScore_off){
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET;
+
+ if(verbose){
+ System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ }
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Override
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ //Fill without a score cutoff; gaps==null selects the ungapped path
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} //No gap list: align directly against ref
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); //Build the gapped reference (makeGref is defined outside this block)
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit); //Align against gref from index 0 to greflimit (presumably set by makeGref - TODO confirm)
+ }
+ }
+
+
+ /** Fills the MS, DEL and INS matrices for every cell of read (rows) vs ref[refStartLoc..refEndLoc] (columns).
+ * Does not require a min score (ie, same as old method). @return new int[] {rows, maxCol, maxState, maxScore} */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length; //One matrix row per read base; rows start at index 1
+ columns=refEndLoc-refStartLoc+1; //Inclusive reference window; columns start at index 1
+
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //Score if every base matches: MATCH for the first base, MATCH2 for each one after
+ final int subfloor=0-2*maxGain; //Value written to cells whose state is not allowed (gap columns, barrier rows)
+ assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n"
+ +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more.
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1; //INS is blocked for rows>BARRIER_I2 unless col>=BARRIER_I2b (see the INS test below)
+ final int BARRIER_D2=rows-BARRIER_D1; //DEL is only computed for rows in [BARRIER_D1, BARRIER_D2]
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){ //Row 0 and column 0 are only read here, never written
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++; //Counts every cell visited by this method
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]); //Previous read base, or the placeholder '?' on the first row
+ final byte call1=read[row-1]; //Current read base
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); //Previous ref base, or the placeholder '!' on the first column
+ final byte ref1=ref[refStartLoc+col-1]; //Current ref base
+
+ final boolean match=(call1==ref1 && ref1!='N'); //An 'N' in the reference never counts as a match
+ final boolean prevMatch=(call0==ref0 && ref0!='N'); //Same test applied to the previous diagonal pair
+
+ final boolean gap=(ref1==GAPC); //Current ref position is a gap symbol
+ assert(call1!=GAPC);
+
+ if(gap){
+ packed[MODE_MS][row][col]=subfloor; //A read base is never aligned (match/sub) to a gap symbol
+ }else{//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; //Each cell packs a score (SCOREMASK bits) and a streak length (TIMEMASK bits)
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //Streak length stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); //MATCH2 is used when the previous pair also matched
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1); //Extend the match streak or start a new one
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //Store score and streak length in one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){ //True substitution between two called bases
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ POINTSoff_SUB_ARRAY[streak+1]); //In a run of mismatches the cost is looked up by the new run length
+// scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+// (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL; //Either base is 'N': use the no-call score instead
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned below but never read in this method
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1); //Here the streak counts consecutive non-matches
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ if(row<BARRIER_D1 || row>BARRIER_D2){
+ packed[MODE_DEL][row][col]=subfloor; //Deletions are disallowed in rows outside the barriers
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //Length of the deletion run ending in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; //A deletion consumes a ref base only, so it comes from the left
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL; //Open a new deletion from the MS state
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //Past LIMIT_FOR_COST_5, DEL5 is only added when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N; //Adjustment for deleting a reference 'N'
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP; //Adjustment for stepping over a gap symbol
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned below but never read in this method
+ if(scoreMS>=scoreD){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //Calculate INS score
+// if(gap || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor; //Insertions are disallowed at gap symbols and in the barrier regions
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //Length of the insertion run ending in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; //An insertion consumes a read base only, so it comes from above
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS; //Open a new insertion from the MS state
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+POINTSoff_INS_ARRAY[streak+1]; //Extension cost is looked up by the new run length
+// int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+// streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+// streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned below but never read in this method
+ if(scoreMS>=scoreI){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //Scan the last row of every state matrix for the best score
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //Strict '>' keeps the first maximum (lowest state, then lowest column)
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Shift the packed score down by SCOREOFFSET before returning it
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /** Unlimited fill that also adds baseScores[row-1]<<SCOREOFFSET to each matching MS cell. @return new int[] {rows, maxCol, maxState, maxScore} */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; //Fails immediately whenever assertions are enabled
+ rows=read.length; //One matrix row per read base; rows start at index 1
+ columns=refEndLoc-refStartLoc+1; //Inclusive reference window; columns start at index 1
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){ //Row 0 and column 0 are only read here, never written
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]); //Plain byte equality; 'N' and gap symbols get no special handling in this method
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); //False on the first row or column
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; //Each cell packs a score (SCOREMASK bits) and a streak length (TIMEMASK bits)
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //Streak length stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); //MATCH2 is used when the previous pair also matched
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1); //Extend the match streak or start a new one
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //Store score and streak length in one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ POINTSoff_SUB_ARRAY[streak+1]); //In a run of mismatches the cost is looked up by the new run length
+// int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+// (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned below but never read in this method
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1); //Here the streak counts consecutive non-matches
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //Length of the deletion run ending in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; //A deletion consumes a ref base only, so it comes from the left
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL; //Open a new deletion from the MS state
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //Past LIMIT_FOR_COST_5, DEL5 is only added when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned below but never read in this method
+ if(scoreMS>=scoreD){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //Length of the insertion run ending in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; //An insertion consumes a read base only, so it comes from above
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS; //Open a new insertion from the MS state
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+POINTSoff_INS_ARRAY[streak+1]; //Extension cost is looked up by the new run length
+// int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+// streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+// streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned below but never read in this method
+ if(scoreMS>=scoreI){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Pull the counter back so it still fits in TIMEMASK (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //Scan the last row of every state matrix for the best score
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //Strict '>' keeps the first maximum (lowest state, then lowest column)
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Shift the packed score down by SCOREOFFSET before returning it
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Override
+ /** Generates the match string for the alignment ending at (row, col, state), delegating to traceback2. */
+ /** @return the byte[] match string; if gapped, refStartLoc and refEndLoc are first translated into grefbuffer coordinates */
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+ if(gapped){
+ final byte[] gref=grefbuffer; //Gapped reference buffer; the ref argument is not used on this path
+ int gstart=translateToGappedCoordinate(refStartLoc, gref, read); //Map the window start into gapped coordinates
+ int gstop=translateToGappedCoordinate(refEndLoc, gref, read); //Map the window end into gapped coordinates
+ byte[] out=traceback2(read, gref, gstart, gstop, row, col, state);
+ return out;
+ }else{
+ return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); //Ungapped: trace back against ref directly
+ }
+ }
+
+ @Override
+ /** Generates the match string by walking the filled matrices back from (row, col, state). Symbols written: m, S, N, D, I, X, Y; each '-' is expanded to GAPLEN 'D's. */
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+// assert(false);
+ assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+ assert(row==rows); //Traceback is expected to start on the last row
+
+ byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
+ int outPos=0; //Next write position; symbols are written in reverse (end of alignment first)
+
+ int gaps=0; //Number of gap symbols emitted, used to size the expanded output
+
+ if(state==MODE_INS){
+ //TODO ? Maybe not needed.
+ }
+
+ while(row>0 && col>0){
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK; //Streak length stored in the current cell
+ final byte prev; //State of the predecessor cell
+
+// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]);
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //A stored time>1 is taken to mean the previous cell was in the same state
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; //Otherwise pick the predecessor by comparing the stored diagonal scores
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+
+ byte c=read[row-1];
+ byte r=ref[refStartLoc+col-1];
+ if(c==r){
+ out[outPos]='m'; //Identical bytes (this includes N aligned to N)
+ }else{
+ if(!AminoAcid.isFullyDefined(c)){
+ out[outPos]='N'; //Read base is not fully defined
+ }else if(!AminoAcid.isFullyDefined(r)){
+// out[outPos]='X';
+ out[outPos]='N'; //Reference base is not fully defined
+ }else{
+ out[outPos]='S'; //Substitution between two defined bases
+ }
+ }
+
+ row--; //MS consumes one read base and one ref base
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;} //Still inside the same deletion run
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; //Compare the stored scores of the cell to the left
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+
+ byte r=ref[refStartLoc+col-1];
+ if(r==GAPC){
+ out[outPos]='-'; //Gap symbol; expanded to GAPLEN 'D's at the end
+ gaps++;
+ }else{
+ out[outPos]='D'; //Ordinary single-base deletion
+ }
+ col--; //DEL consumes a ref base only
+ }else{
+ if(time>1){prev=(byte)state;} //Still inside the same insertion run
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; //Compare the stored scores of the cell above
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+
+ assert(state==MODE_INS) : state;
+ if(col==0){ //NOTE(review): cannot be true here, since the loop requires col>0 and col is unchanged in this branch
+ out[outPos]='X';
+ }else if(col>=columns){
+ out[outPos]='Y'; //Insertion at or beyond the last column
+ }else{
+ out[outPos]='I'; //Ordinary insertion
+ }
+ row--; //INS consumes a read base only
+ }
+
+// assert(prev==prev0);
+ state=prev;
+ outPos++;
+ }
+
+ assert(row==0 || col==0);
+ if(col!=row){
+ while(row>0){ //Read bases left after the columns ran out are emitted as 'X'
+ out[outPos]='X';
+ outPos++;
+ row--;
+ col--;
+ }
+ if(col>0){
+ //do nothing
+ }
+ }
+
+
+ //Shrink and reverse the string
+ byte[] out2=new byte[outPos];
+ for(int i=0; i<outPos; i++){
+ out2[i]=out[outPos-i-1];
+ }
+ out=null;
+
+ if(gaps==0){return out2;} //No gap symbols: nothing to expand
+
+ //TODO Consider outputting this compressed.
+ byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)]; //Each gap symbol grows from 1 byte to GAPLEN bytes
+ for(int i=0, j=0; i<out2.length; i++){
+ byte c=out2[i];
+ if(c!=GAPC){ //Presumably GAPC=='-', matching the symbol written above - TODO confirm
+ out3[j]=c;
+ j++;
+ }else{
+ int lim=j+GAPLEN;
+ for(; j<lim; j++){
+ out3[j]='D'; //Expand the gap into GAPLEN deletions
+ }
+ }
+ }
+ return out3;
+ }
+
+ @Override
+ /** Delegates to score2. @return score2's array, starting {score, bestRefStart, bestRefStop}; when gapped, elements 1 and 2 are translated back from gapped coordinates */
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState, boolean gapped){
+ if(gapped){
+ if(verbose){
+ System.err.println("score():");
+ System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+ }
+ final byte[] gref=grefbuffer; //Gapped reference buffer; the ref argument is not used on this path
+ int gstart=translateToGappedCoordinate(refStartLoc, gref, read); //Map the window start into gapped coordinates
+ int gstop=translateToGappedCoordinate(refEndLoc, gref, read); //Map the window end into gapped coordinates
+
+ assert(translateFromGappedCoordinate(gstart, gref)==refStartLoc); //TODO: Remove slow assertions
+ assert(translateFromGappedCoordinate(gstop, gref)==refEndLoc);
+
+ assert(gstart==0) : gstart; //TODO: skip translation if this is always zero
+
+ if(verbose){System.err.println("gstart, gstop: "+gstart+", "+gstop);}
+ int[] out=score2(read, gref, gstart, gstop, maxRow, maxCol, maxState); //Score in gapped coordinates
+ if(verbose){System.err.println("got score "+Arrays.toString(out));}
+
+ assert(out[1]==translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref, read)) :
+ "Verifying: "+out[1]+" -> "+translateFromGappedCoordinate(out[1], gref)+" -> "+
+ translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref, read);
+ assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref, read)); //Round-trip check on the stop coordinate
+
+ out[1]=translateFromGappedCoordinate(out[1], gref); //Convert the start coordinate back from gapped coordinates
+ out[2]=translateFromGappedCoordinate(out[2], gref); //Convert the stop coordinate back from gapped coordinates
+ if(verbose){System.err.println("returning score "+Arrays.toString(out));}
+ return out;
+ }else{
+ return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); //Ungapped: score against ref directly
+ }
+ }
+
+ @Override
+ /** Walks backward through packed[state][row][col] from the cell (maxState, maxRow, maxCol) to find the
+ * reference coordinate where the alignment starts, and unpacks the score stored in that starting cell.
+ * The matrix is only read here; nothing in packed is modified.
+ * @param read Read bases; used here only to build assertion messages
+ * @param ref Reference bases; used here only to build assertion messages
+ * @param refStartLoc Reference coordinate that matrix column offsets are added to
+ * @param refEndLoc Upper reference coordinate; compared with bestRefStop to decide on right padding
+ * @param maxRow Row of the cell to trace back from
+ * @param maxCol Column of the cell to trace back from
+ * @param maxState State plane (MODE_MS, MODE_DEL or MODE_INS) of the cell to trace back from
+ * @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}, <br>
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight} <br>
+ * if more padding is needed */
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+ //The score stays in its shifted (packed) form until the ">>=SCOREOFFSET" near the end
+ if(row<rows){ //Start cell is above the last row: extend the path diagonally toward row==rows
+ int difR=rows-row;
+ int difC=columns-col;
+
+ while(difR>difC){ //Each row that has no remaining column is charged POINTSoff_NOREF
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+ //NOTE(review): if the loop above ran, row+difR stays below rows, so assert(row==rows) below fails when assertions are enabled
+ row+=difR;
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+
+ final int bestRefStop=refStartLoc+col-1; //Column of the (possibly extended) end cell, as a reference coordinate
+
+ if(verbose){System.err.println("Scoring.");}
+
+ int stateTime=0; //Counts consecutive traceback steps whose predecessor state equals the current state
+
+ while(row>0 && col>0){
+
+ if(verbose){System.err.println("state="+state+", row="+row+", col="+col);}
+
+ final int time=packed[state][row][col]&TIMEMASK; //Time counter stored in the low bits of the cell
+ final byte prev; //State of the predecessor cell
+
+ if(state==MODE_MS){ //Match/sub: predecessor is the diagonal cell (row-1, col-1)
+ if(time>1){prev=(byte)state;} //time>1: the path was already in this state one step earlier
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} //Ties prefer MS, then DEL, then INS
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ row--;
+ col--;
+ }else if(state==MODE_DEL){ //Deletion: predecessor is the cell to the left (row, col-1)
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ col--;
+ }else{ //Insertion: predecessor is the cell above (row-1, col)
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ row--;
+ }
+ //NOTE(review): the loop only runs while col>0 and col drops by at most 1 per pass, so the guard below looks unreachable
+ if(col<0){
+ if(verbose){
+ System.err.println("Warning, column went below 0 at row="+row);
+ }
+ break; //prevents an out of bounds access
+ }
+
+// assert(prev==prev0);
+ if(state==prev){stateTime++;}else{stateTime=0;}
+ state=prev;
+
+ if(verbose){System.err.println("state2="+state+", time="+time+", stateTime="+stateTime+", row2="+row+", col2="+col+"\n");}
+ }
+// assert(false) : row+", "+col;
+ if(row>col){ //Rows remain after the columns ran out: the start lies that many positions before refStartLoc
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ score>>=SCOREOFFSET; //Drop the time bits, leaving the score in plain points
+
+ if(verbose){
+ System.err.println("bestRefStart="+bestRefStart+", refStartLoc="+refStartLoc);
+ System.err.println("bestRefStop="+bestRefStop+", refEndLoc="+refEndLoc);
+ }
+ //Padding suggestions: how far the alignment ran past (or ended in an insertion exactly at) each window edge
+ int padLeft=0;
+ int padRight=0;
+ if(bestRefStart<refStartLoc){
+ padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ }else if(bestRefStart==refStartLoc && state==MODE_INS){
+ padLeft=stateTime;
+ }
+ if(bestRefStop>refEndLoc){
+ padRight=Tools.max(0, bestRefStop-refEndLoc);
+ }else if(bestRefStop==refEndLoc && maxState==MODE_INS){
+ padRight=packed[maxState][maxRow][maxCol]&TIMEMASK; //Time counter of the end cell, read as the insertion run length (presumably)
+ }
+
+ int[] rvec;
+ if(padLeft>0 || padRight>0){ //Suggest extra padding in cases of overflow
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState};
+ }
+ return rvec;
+ }
+
+ /**
+ * Fills grefbuffer with a gap-compressed copy of ref between refStartLoc and refEndLoc and returns it.
+ * Each (start, stop) pair in gaps is copied verbatim; the stretch between two consecutive pairs is written as
+ * GAPBUFFER+(gap%GAPLEN) verbatim bases, then (gap-GAPBUFFER2)/GAPLEN GAPC symbols, then GAPBUFFER verbatim bases.
+ * Also sets grefRefOrigin, greflimit and greflimit2. gaps[0] and gaps[gaps.length-1] are widened temporarily
+ * and restored before the normal return.
+ * @param ref Reference bases
+ * @param gaps Inclusive (start, stop) reference coordinate pairs, in ascending order
+ * @param refStartLoc First reference coordinate to include; asserted to be <= gaps[0]
+ * @param refEndLoc Last reference coordinate to include; asserted to be >= gaps[gaps.length-1]
+ * @return grefbuffer (the shared buffer itself, not a copy)
+ */
+ private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){
+ assert(gaps!=null && gaps.length>0);
+
+ assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps);
+ assert(refEndLoc>=gaps[gaps.length-1]);
+ //Stretch the outermost pair boundaries to the requested window; the saved values are written back at the end
+ final int g0_old=gaps[0];
+ final int gN_old=gaps[gaps.length-1];
+ gaps[0]=Tools.min(gaps[0], refStartLoc);
+ gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+ grefRefOrigin=gaps[0]; //Reference coordinate of gref[0]
+
+ if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+// grefRefOrigin=Tools.min(gaps[0], refStartLoc);
+
+// //This block is no longer needed since the array is preallocated.
+// int len=0;
+// final int gb2=GAPBUFFER*2;
+// for(int i=0; i<gaps.length; i+=2){
+// int x=gaps[i];
+// int y=gaps[i+1];
+// len+=(y-x+1);
+// if(i+2<gaps.length){
+// int z=gaps[i+2];
+// assert(z>y);
+// int gap=z-y-1;
+// if(gap<MINGAP){
+// len+=gap;
+// }else{
+// len+=gb2;
+// gap-=gb2;
+// int div=gap/GAPLEN;
+// int rem=gap%GAPLEN;
+// len+=(div+rem);
+// }
+// }
+// }
+ byte[] gref=grefbuffer;
+
+ int gpos=0; //Next write position in gref
+ for(int i=0; i<gaps.length; i+=2){
+ int x=gaps[i]; //Start of this ungapped segment (inclusive)
+ int y=gaps[i+1]; //End of this ungapped segment (inclusive)
+
+ for(int r=x; r<=y; r++, gpos++){
+ //TODO: if out of bounds, use an 'N'
+ //NOTE(review): the two GapTools.calcGrefLen lines in the message below are identical
+ assert(gpos<gref.length) :
+ "\ngpos="+gpos+", gref.length="+gref.length+/*", read.length="+read.length+*/", gaps2="+Arrays.toString(gaps)+
+ "\ni="+i+", r="+r+", x="+x+", y="+y+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\n"+refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref)+"\n"/*+new String(read)+"\n"*/;
+ gref[gpos]=ref[r];
+ }
+
+ if(i+2<gaps.length){ //Another segment follows: encode the stretch between y and the next start z
+ int z=gaps[i+2];
+ assert(z>y);
+ int gap=z-y-1; //Number of reference bases strictly between the two segments
+ assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+ if(gap<MINGAP){ //NOTE(review): with assertions disabled this branch copies nothing, so the gap bases are silently dropped
+ assert(false) : "TODO - just fill in normally";
+ }else{
+ int rem=gap%GAPLEN;
+ int lim=y+GAPBUFFER+rem; //Last base copied verbatim on the left side of the gap
+
+ int div=(gap-GAPBUFFER2)/GAPLEN; //Number of GAPC symbols; presumably GAPBUFFER2==2*GAPBUFFER - TODO confirm
+ if(verbose){
+ System.err.println("div = "+div);
+ }
+ assert(div>0);
+
+ for(int r=y+1; r<=lim; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ for(int g=0; g<div; g++, gpos++){ //Each GAPC stands for GAPLEN reference bases in the translate methods
+ gref[gpos]=GAPC;
+ }
+ for(int r=z-GAPBUFFER; r<z; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ }
+ }
+ }
+
+ greflimit=gpos; //Number of symbols written, excluding the cushion added below
+
+ assert(gref[gpos-1]==ref[refEndLoc]);
+// assert(greflimit+GREFLIMIT2_CUSHION<=gref.length) : refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref);
+ //Add a cushion to the end to clear out the prior data (especially GAPC) that was there
+ {
+ final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+ if(lim>gref.length){ //NOTE(review): lim is capped at gref.length on the previous line, so this branch cannot be taken
+ System.err.println("gref buffer overflow: "+lim+" > "+gref.length);
+ return null;
+ }
+ for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){
+ gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+ greflimit2=i; //NOTE(review): ends as lim-1; if this loop never runs, greflimit2 keeps its value from an earlier call
+ }
+ }
+
+ if(verbose){
+ System.err.println("gref:\n"+new String(gref));
+ }
+ //Restore the caller's gaps array
+ gaps[0]=g0_old;
+ gaps[gaps.length-1]=gN_old;
+
+ if(verbose){
+ System.err.println("\ngaps3: "+Arrays.toString(gaps));
+ }
+
+ return gref;
+ }
+
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score){
+//// {score, bestRefStart, bestRefStop}
+// int a=score[1];
+// int b=score[2];
+// int a2=-9999;
+// int b2=-9999;
+// for(int i=0, j=grefRefOrigin; i<grefbuffer.length; i++){
+// byte c=grefbuffer[i];
+//
+// if(i==a){a2=j;}
+// if(i==b){
+// b2=j;
+// assert(a2!=-9999);
+// score[1]=a2;
+// score[2]=b2;
+// return score;
+// }
+//
+// j+=(c==GAPC ? GAPLEN : 1);
+//// if(c!=GAPC){j++;}
+//// else{j+=GAPLEN;}
+// }
+// throw new RuntimeException("Out of bounds.");
+// }
+
+ /**
+  * Converts an index in the gap-compressed reference back to a reference coordinate.
+  * Scans gref from index 0, advancing the reference coordinate by GAPLEN for each GAPC symbol and by 1 for
+  * every other symbol, until the index equals point.
+  * @param point Index into gref; values <= 0 are treated as plain offsets from grefRefOrigin
+  * @param gref Gap-compressed reference
+  * @return Reference coordinate corresponding to point
+  * @throws RuntimeException if point is not reached before index greflimit2
+  */
+ private final int translateFromGappedCoordinate(int point, byte[] gref){
+     if(verbose){System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=0){return grefRefOrigin+point;}
+
+     int refPos=grefRefOrigin;
+     int gpos=0;
+     while(gpos<greflimit2){
+         final byte sym=gref[gpos];
+         assert(point>=gpos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+         if(gpos==point){
+             if(verbose){System.err.println(" -> "+refPos);}
+             return refPos;
+         }
+
+         //A gap symbol stands for GAPLEN reference bases; anything else for exactly one
+         if(sym==GAPC){refPos+=GAPLEN;}
+         else{refPos++;}
+         gpos++;
+     }
+
+     //Not found: dump the state to stderr before failing
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+ /**
+  * Converts a reference coordinate to its index in the gap-compressed reference.
+  * Scans gref from index 0, advancing the reference coordinate by GAPLEN for each GAPC symbol and by 1 for
+  * every other symbol, until the coordinate equals point.
+  * @param point Reference coordinate; values <= grefRefOrigin are treated as plain offsets from it
+  * @param gref Gap-compressed reference
+  * @param read Read bases; only printed in the failure message
+  * @return Index into gref corresponding to point
+  * @throws RuntimeException if point is not reached before index greflimit2
+  */
+ private final int translateToGappedCoordinate(int point, byte[] gref, byte[] read){
+     if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=grefRefOrigin){return point-grefRefOrigin;}
+
+     int refPos=grefRefOrigin;
+     int gpos=0;
+     while(gpos<greflimit2){
+         assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+         final byte sym=gref[gpos];
+
+         if(refPos==point){
+             if(verbose){System.err.println(" -> "+gpos);}
+             return gpos;
+         }
+
+         //A gap symbol stands for GAPLEN reference bases; anything else for exactly one
+         if(sym==GAPC){refPos+=GAPLEN;}
+         else{refPos++;}
+         gpos++;
+     }
+
+     //Not found: dump the state to stderr before failing
+     System.err.println("\ngrefRefOrigin="+grefRefOrigin);
+     System.err.println("point="+point);
+     System.err.println("read="+new String(read));
+     System.err.println("gref="+new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+
+ /**
+  * Calculates score based on an array from Index.
+  * Positive entries are hits; any entry <= 0 is scored as a substitution. A hit equal to the previous entry
+  * continues a match streak (POINTS_MATCH2); a hit equal to the last hit, or the first hit, scores POINTS_MATCH;
+  * a hit below the last hit additionally pays a deletion penalty and a hit above it an insertion penalty.
+  * @param locArray Per-position values from the index
+  * @return Score in plain (unshifted) points
+  */
+ private final int calcAffineScore(int[] locArray){
+     int score=0;
+     int lastLoc=-2; //Last true location
+     int lastValue=-1; //Previous array entry, hit or not
+     int timeInMode=0; //Length of the current substitution run; set to 1 after an indel
+
+     for(int i=0; i<locArray.length; i++){
+         final int loc=locArray[i];
+
+         if(loc>0){//match
+             if(loc==lastValue){//contiguous match
+                 score+=POINTS_MATCH2;
+             }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+                 score+=POINTS_MATCH;
+             }else if(loc<lastLoc){//deletion
+                 assert(lastLoc>=0);
+                 score+=POINTS_MATCH;
+                 //Same tiered penalty (including gap approximation) that calcDelScore computes
+                 score+=calcDelScore(lastLoc-loc+1, true);
+                 timeInMode=1;
+             }else{//insertion (loc>lastLoc)
+                 assert(lastLoc>=0);
+                 //The index must be the positive distance loc-lastLoc; the former lastLoc-loc was always
+                 //negative here and threw ArrayIndexOutOfBoundsException. Capped at 5 like the other overloads.
+                 score+=(POINTS_MATCH+POINTS_INS_ARRAY_C[Tools.min(loc-lastLoc, 5)]);
+                 timeInMode=1;
+             }
+             lastLoc=loc;
+         }else{//substitution
+             if(lastValue<0 && timeInMode>0){//contiguous
+                 timeInMode++;
+                 score+=(POINTS_SUB_ARRAY[timeInMode]);
+             }else{
+                 score+=POINTS_SUB;
+                 timeInMode=1;
+             }
+         }
+         lastValue=loc;
+     }
+     assert(score<=maxQuality(locArray.length));
+     return score;
+ }
+
+ /**
+  * Calculates an affine score from an index location array, adding the per-base quality bonus to every hit.
+  * Entry meanings: positive = hit, -1 = substitution, -2 = no-call or no-ref.
+  * @param locArray Per-position values from the index
+  * @param baseScores Per-position bonus added for each hit
+  * @param bases Read bases; only used in the assertion message
+  * @return Score in plain (unshifted) points
+  */
+ @Override
+ public final int calcAffineScore(final int[] locArray, final byte[] baseScores, final byte bases[]){
+     int total=0;
+     int prevLoc=-3; //Last true location
+     int prevValue=-1; //Previous array entry, hit or not
+     int subRun=0; //Length of the current substitution run; set to 1 after an indel
+
+     for(int i=0; i<locArray.length; i++){
+         final int loc=locArray[i];
+
+         if(loc>0){//hit
+             if(loc==prevValue){//continues a match streak
+                 total+=(POINTS_MATCH2+baseScores[i]);
+             }else{
+                 total+=(POINTS_MATCH+baseScores[i]);
+                 final boolean plainMatch=(loc==prevLoc || prevLoc<0); //match after a sub, or first match
+                 if(!plainMatch){
+                     assert(prevLoc>=0);
+                     if(loc<prevLoc){//deletion: tiered penalty with gap approximation
+                         total+=calcDelScore(prevLoc-loc+1, true);
+                     }else{//insertion
+                         total+=POINTS_INS_ARRAY_C[Tools.min(loc-prevLoc, 5)];
+                     }
+                     subRun=1;
+                 }
+             }
+             prevLoc=loc;
+         }else if(loc==-1){//substitution
+             if(prevValue<0 && subRun>0){//contiguous
+                 subRun++;
+                 total+=(POINTS_SUB_ARRAY[subRun]);
+             }else{
+                 total+=POINTS_SUB;
+                 subRun=1;
+             }
+         }else{//N (no-call or no-ref)
+             assert(loc==-2) : "\ni="+i+", loc="+loc+", score="+total+", lastLoc="+prevLoc+", lastValue="+prevValue
+                 +", time="+subRun+", length="+locArray.length+"\nbases=\n"+new String(bases)
+                 +"\nlocs[]=\n"+Arrays.toString(locArray)+"\n"+new String(bases)+"\n"
+                 +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";
+             subRun=0;
+             total+=POINTS_NOCALL;
+         }
+         prevValue=loc;
+     }
+     assert(total<=maxQuality(locArray.length));
+     return total;
+ }
+
+ /**
+  * Calculates an affine score like the three-argument overload, and additionally tracks the longest run of
+  * contiguous hits. If that run is shorter than minContig the score is capped at -50 per array entry.
+  * @param locArray Per-position values from the index (positive = hit, -1 = substitution, -2 = no-call/no-ref)
+  * @param baseScores Per-position bonus added for each hit
+  * @param bases Read bases; only used in the assertion message
+  * @param minContig Minimum required run of contiguous hits; asserted to be greater than 1
+  * @return Score in plain (unshifted) points
+  */
+ @Override
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig){
+     assert(minContig>1) : minContig;
+
+     int run=0; //Current run of contiguous hits
+     int longestRun=0; //Longest completed run so far
+
+     int total=0;
+     int prevLoc=-3; //Last true location
+     int prevValue=-1; //Previous array entry, hit or not
+     int subRun=0; //Length of the current substitution run; set to 1 after an indel
+
+     for(int i=0; i<locArray.length; i++){
+         final int loc=locArray[i];
+
+         if(loc>0){//hit
+             if(loc==prevValue){//continues a match streak
+                 run++;
+                 total+=(POINTS_MATCH2+baseScores[i]);
+             }else{
+                 longestRun=Tools.max(longestRun, run);
+                 final boolean plainMatch=(loc==prevLoc || prevLoc<0); //match after a sub, or first match
+                 run=(plainMatch ? 1 : 0);
+                 total+=(POINTS_MATCH+baseScores[i]);
+                 if(!plainMatch){
+                     assert(prevLoc>=0);
+                     if(loc<prevLoc){//deletion: tiered penalty with gap approximation
+                         total+=calcDelScore(prevLoc-loc+1, true);
+                     }else{//insertion
+                         total+=POINTS_INS_ARRAY_C[Tools.min(loc-prevLoc, 5)];
+                     }
+                     subRun=1;
+                 }
+             }
+             prevLoc=loc;
+         }else if(loc==-1){//substitution
+             if(prevValue<0 && subRun>0){//contiguous
+                 subRun++;
+                 total+=(POINTS_SUB_ARRAY[subRun]);
+             }else{
+                 total+=POINTS_SUB;
+                 subRun=1;
+             }
+         }else{//N (no-call or no-ref)
+             assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+                 +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";
+             subRun=0;
+             total+=POINTS_NOCALL;
+         }
+         prevValue=loc;
+     }
+     assert(total<=maxQuality(locArray.length));
+     if(Tools.max(run, longestRun)<minContig){total=Tools.min(total, -50*locArray.length);}
+     return total;
+ }
+
+ /** Scores an ungapped placement of read at refStart; same as the four-argument overload with a null SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart){
+     final SiteScore noSite=null;
+     return scoreNoIndels(read, ref, refStart, noSite);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+// if(timeInMode==0){score+=POINTS_SUB;}
+// else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+// else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ //TODO: Verify this; it's in the PacBio version
+ //if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+
+ return score;
+ }
+
+ /**
+  * Builds the match string for an ungapped placement of read at refStart.
+  * Each output byte is 'N' if either base is 'N' (positions outside ref count as 'N'),
+  * 'm' if the bases are equal, and 'S' otherwise.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStart Reference index aligned to read[0]; may lie outside ref
+  * @return New array of read.length symbols, or null if read or ref is null
+  */
+ @Override
+ public final byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart){
+     if(read==null || ref==null){return null;}
+
+     final int len=read.length;
+     final byte[] out=new byte[len];
+
+     for(int i=0; i<len; i++){
+         final int j=refStart+i;
+         final boolean insideRef=(j>=0 && j<ref.length);
+         final byte r=(insideRef ? ref[j] : (byte)'N');
+         final byte c=read[i];
+
+         final byte sym;
+         if(c=='N' || r=='N'){sym='N';}
+         else if(c==r){sym='m';}
+         else{sym='S';}
+         out[i]=sym;
+     }
+
+     return out;
+ }
+
+ /** Scores an ungapped placement with per-base bonuses; same as the five-argument overload with a null SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){
+     final SiteScore noSite=null;
+     return scoreNoIndels(read, ref, baseScores, refStart, noSite);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+ int norefs=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+// if(timeInMode==0){score+=POINTS_SUB;}
+// else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+// else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ //TODO: Verify. This is in the PacBio version.
+// if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+// assert(Read.CHECKSITE(ss, read, -1));
+
+ return score;
+ }
+
+ /**
+  * Scores an ungapped placement of read at refStart (adding baseScores[i] for every match) and writes the
+  * corresponding match string ('m' match, 'S' substitution, 'N' no-call or no-ref) into matchReturn[0].
+  * A placement that extends past either end of the reference is rejected: -99999 is returned and
+  * matchReturn is left untouched. The code that used to trim such placements sat after that early return
+  * and could never run, so it has been removed.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param baseScores Per-position bonus added for each matching base
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn One-element holder; matchReturn[0] is replaced by a new array if it is null or its
+  * length differs from read.length
+  * @return Score in plain (unshifted) points, or -99999 if the read does not lie entirely inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     int score=0;
+     int mode=-1;
+     int timeInMode=0; //Consecutive steps spent in the current mode, counted from 0
+
+     //The whole read lies inside the reference here, so every read position is compared
+     for(int i=0; i<read.length; i++){
+         final byte c=read[i];
+         final byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){//match
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             score+=baseScores[i];
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){//no-call
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){//no-ref
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{//substitution
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /**
+  * Scores an ungapped placement of read at refStart and writes the corresponding match string
+  * ('m' match, 'S' substitution, 'N' no-call or no-ref) into matchReturn[0].
+  * A placement that extends past either end of the reference is rejected: -99999 is returned and
+  * matchReturn is left untouched. The code that used to trim such placements sat after that early return
+  * and could never run, so it has been removed.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn One-element holder; matchReturn[0] is replaced by a new array if it is null or its
+  * length differs from read.length
+  * @return Score in plain (unshifted) points, or -99999 if the read does not lie entirely inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     int score=0;
+     int mode=-1;
+     int timeInMode=0; //Consecutive steps spent in the current mode, counted from 0
+
+     //The whole read lies inside the reference here, so every read position is compared
+     for(int i=0; i<read.length; i++){
+         final byte c=read[i];
+         final byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){//match
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){//no-call
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){//no-ref
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{//substitution
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             if(AFFINE_ARRAYS){
+                 score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+             }else{
+                 if(timeInMode==0){score+=POINTS_SUB;}
+                 else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+                 else{score+=POINTS_SUB3;}
+             }
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /** @return Score of numBases consecutive matches: POINTS_MATCH for the first base and POINTS_MATCH2 for each further one. */
+ @Override
+ public final int maxQuality(int numBases){
+     final int streakPoints=(numBases-1)*(POINTS_MATCH2);
+     return POINTS_MATCH+streakPoints;
+ }
+
+ /** @return Score of an all-match alignment of baseScores.length bases, plus the sum of the per-base bonuses. */
+ @Override
+ public final int maxQuality(byte[] baseScores){
+     final int matchPoints=POINTS_MATCH+(baseScores.length-1)*(POINTS_MATCH2);
+     return matchPoints+Tools.sumInt(baseScores);
+ }
+
+ /**
+  * Perfect score for numBases, changed by the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+  * Asserts that the result stays below the score of a perfect alignment with one substitution.
+  * @param numBases Read length
+  * @return maxQuality(numBases) plus that (negative) indel term
+  */
+ @Override
+ public final int maxImperfectScore(int numBases){
+     final int perfect=maxQuality(numBases);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Perfect score for these base scores, changed by the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+  * Asserts that the result stays below the score of a perfect alignment with one substitution.
+  * @param baseScores Per-base bonuses of the read
+  * @return maxQuality(baseScores) plus that (negative) indel term
+  */
+ @Override
+ public final int maxImperfectScore(byte[] baseScores){
+     final int perfect=maxQuality(baseScores);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Penalty for a deletion of len bases, using a tiered per-base cost that shrinks as the deletion grows.
+  * @param len Deletion length; values <= 0 cost nothing
+  * @param approximateGaps If true and len exceeds MINGAP, the deletion is first reduced the way a
+  * gap-compressed reference represents it: POINTS_GAP per whole GAPLEN chunk, remainder scored normally
+  * @return Penalty in plain (unshifted) points
+  */
+ @Override
+ public int calcDelScore(int len, boolean approximateGaps){
+     if(len<=0){return 0;}
+
+     int remaining=len;
+     int penalty=POINTS_DEL; //opening cost, covers the first base
+
+     if(approximateGaps && remaining>MINGAP){
+         final int leftover=remaining%GAPLEN;
+         final int gapSymbols=(remaining-GAPBUFFER2)/GAPLEN;
+         penalty+=(gapSymbols*POINTS_GAP);
+         assert(leftover+GAPBUFFER2<remaining);
+         remaining=leftover+GAPBUFFER2;
+         assert(remaining>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+     }
+
+     //Peel off the tiers from the longest downward
+     if(remaining>LIMIT_FOR_COST_5){
+         penalty+=((remaining-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+         remaining=LIMIT_FOR_COST_5;
+     }
+     if(remaining>LIMIT_FOR_COST_4){
+         penalty+=(remaining-LIMIT_FOR_COST_4)*POINTS_DEL4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         penalty+=(remaining-LIMIT_FOR_COST_3)*POINTS_DEL3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         penalty+=(remaining-1)*POINTS_DEL2;
+     }
+     return penalty;
+ }
+
+ /**
+  * Penalty for a deletion of len bases in shifted (POINTSoff_*) units, using the same tiers as calcDelScore
+  * but without gap approximation.
+  * @param len Deletion length; values <= 0 cost nothing
+  * @return Penalty already shifted left by SCOREOFFSET
+  */
+ private static int calcDelScoreOffset(int len){
+     if(len<=0){return 0;}
+
+     int remaining=len;
+     int penalty=POINTSoff_DEL; //opening cost, covers the first base
+
+     //Peel off the tiers from the longest downward
+     if(remaining>LIMIT_FOR_COST_5){
+         penalty+=((remaining-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+         remaining=LIMIT_FOR_COST_5;
+     }
+     if(remaining>LIMIT_FOR_COST_4){
+         penalty+=(remaining-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         penalty+=(remaining-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         penalty+=(remaining-1)*POINTSoff_DEL2;
+     }
+     return penalty;
+ }
+
+ /**
+  * Penalty for an insertion of len bases.
+  * With AFFINE_ARRAYS enabled the value is read from POINTS_INS_ARRAY_C[len]; otherwise it is computed from
+  * the tiered POINTS_INS* constants.
+  * @param len Insertion length; values <= 0 cost nothing
+  * @return Penalty in plain (unshifted) points
+  */
+ @Override
+ public int calcInsScore(int len){
+     if(len<=0){return 0;}
+     if(AFFINE_ARRAYS){return POINTS_INS_ARRAY_C[len];}
+
+     int remaining=len;
+     int penalty=POINTS_INS; //opening cost, covers the first base
+
+     //Peel off the tiers from the longest downward
+     if(remaining>LIMIT_FOR_COST_4){
+         penalty+=(remaining-LIMIT_FOR_COST_4)*POINTS_INS4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         penalty+=(remaining-LIMIT_FOR_COST_3)*POINTS_INS3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         penalty+=(remaining-1)*POINTS_INS2;
+     }
+     return penalty;
+ }
+
+ /**
+  * Penalty for an insertion of len bases in shifted (POINTSoff_*) units.
+  * With AFFINE_ARRAYS enabled the value is read from POINTSoff_INS_ARRAY_C[len]; otherwise it is computed
+  * from the tiered POINTSoff_INS* constants.
+  * @param len Insertion length; values <= 0 cost nothing
+  * @return Penalty already shifted left by SCOREOFFSET
+  */
+ private static int calcInsScoreOffset(int len){
+     if(len<=0){return 0;}
+     if(AFFINE_ARRAYS){return POINTSoff_INS_ARRAY_C[len];}
+
+     int remaining=len;
+     int penalty=POINTSoff_INS; //opening cost, covers the first base
+
+     //Peel off the tiers from the longest downward
+     if(remaining>LIMIT_FOR_COST_4){
+         penalty+=(remaining-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         penalty+=(remaining-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         penalty+=(remaining-1)*POINTSoff_INS2;
+     }
+     return penalty;
+ }
+
+
+ private final int[][][] packed; //Alignment matrix indexed [state][row][col]; each int carries a score (SCOREMASK bits) and a time counter (TIMEMASK bits)
+ private final byte[] grefbuffer; //Preallocated buffer that makeGref fills with the gap-compressed reference
+ private int greflimit=-1; //Number of symbols makeGref wrote to grefbuffer, not counting the trailing cushion
+ private int greflimit2=-1; //Index of the last cushion symbol makeGref wrote; used as the loop bound in the translate methods
+ private int grefRefOrigin=-1; //Reference coordinate that grefbuffer[0] corresponds to; set by makeGref
+
+
+ /** Returns the internal gapped-reference buffer itself, not a copy. DO NOT MODIFY */
+ @Override
+ public final byte[] getGrefbuffer(){
+     final byte[] buffer=grefbuffer;
+     return buffer;
+ }
+
+ public final int[] vertLimit; //One entry per row (indices 0..rows), stored shifted by SCOREOFFSET; presumably per-row score cutoffs - TODO confirm
+ public final int[] horizLimit; //One entry per column (indices 0..columns), stored shifted by SCOREOFFSET; presumably per-column score cutoffs - TODO confirm
+
+ @Override
+ public CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ @Override
+ public CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ /**
+  * Converts a minimum identity into a minimum score ratio (score divided by the per-base match score).
+  * The per-error cost is a fixed blend of substitution, deletion and insertion costs (weights 0.7/0.2/0.1),
+  * each itself a fixed blend of the tiered POINTS_* constants.
+  * @param minid Minimum identity as a fraction in (0,1]; values above 1 are taken as percent and divided by 100
+  * @return Minimum ratio, never below 0.1
+  */
+ public static float minIdToMinRatio(double minid){
+     final double identity=(minid>1 ? minid/100 : minid);
+     assert(identity>0 && identity<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100.";
+
+     final double firstMatchDelta=POINTS_MATCH-POINTS_MATCH2; //cost of restarting a match streak
+     final double perMatch=POINTS_MATCH2;
+
+     //Average cost of one error of each kind, relative to the match it replaces
+     final double subCost=-POINTS_MATCH2+0.5*(firstMatchDelta+POINTS_SUB)+0.5*POINTS_SUB2;
+     final double delCost=0.1*(firstMatchDelta+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4;
+     final double insCost=-POINTS_MATCH2+0.4*(firstMatchDelta+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3);
+     final double errorCost=.7*subCost+.2*delCost+.1*insCost;
+
+     final double errorFraction=1-identity;
+     final double rawRatio=(perMatch+errorFraction*errorCost)/perMatch;
+     assert(rawRatio<=1);
+
+     final double clamped=Tools.max(0.1, rawRatio);
+     return (float)clamped;
+ }
+
+ public static final int TIMEBITS=11; //Number of low bits of a packed cell that hold the time counter
+ public static final int SCOREBITS=32-TIMEBITS; //The remaining high bits hold the score
+ public static final int MAX_TIME=((1<<TIMEBITS)-1); //Largest value that fits in TIMEBITS bits
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; //Largest positive signed SCOREBITS-bit value, less a margin of 2000
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+ //Scores are shifted left by SCOREOFFSET so that they sit above the time bits
+ public static final int SCOREOFFSET=TIMEBITS;
+ //Bit masks that separate the two fields of a packed cell
+ public static final int TIMEMASK=~((-1)<<TIMEBITS); //Low TIMEBITS bits set
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; //High SCOREBITS bits set
+ //Alignment states; MODE_MS, MODE_DEL and MODE_INS index the first dimension of packed
+ private static final byte MODE_MS=0; //Match or substitution
+ private static final byte MODE_DEL=1; //Deletion
+ private static final byte MODE_INS=2; //Insertion
+ private static final byte MODE_SUB=3; //Substitution; used as a mode marker by the no-indel scoring methods
+ //Score contributions in plain (unshifted) points
+ public static final int POINTS_NOREF=0; //Read base with no reference base, or reference 'N'
+ public static final int POINTS_NOCALL=0; //Read base is 'N'
+ public static final int POINTS_MATCH=70; //Match that does not continue a match streak
+ public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-127; //First substitution of a run
+ public static final int POINTS_SUBR=-147; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-51; //Further substitutions while the run is shorter than LIMIT_FOR_COST_3
+ public static final int POINTS_SUB3=-25; //Substitutions beyond that
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-395; //Insertion opening; INS2, INS3 and INS4 are the per-base costs of successively longer tiers
+ public static final int POINTS_INS2=-39;
+ public static final int POINTS_INS3=-23;
+ public static final int POINTS_INS4=-8;
+ public static final int POINTS_DEL=-472; //Deletion opening; DEL2 through DEL4 are the per-base costs of successively longer tiers
+ public static final int POINTS_DEL2=-33;
+ public static final int POINTS_DEL3=-9;
+ public static final int POINTS_DEL4=-1;
+ public static final int POINTS_DEL5=-1; //Charged once per TIMESLIP bases beyond LIMIT_FOR_COST_5
+ public static final int POINTS_DEL_REF_N=-10;
+ public static final int POINTS_GAP=0-GAPCOST; //Charged once per GAPLEN-sized chunk of an approximated gap
+ //Beyond LIMIT_FOR_COST_5, deletion length is counted in steps of TIMESLIP; MASK5 rounds the division up
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1;
+ static{assert(Integer.bitCount(TIMESLIP)==1);} //TIMESLIP must be a power of two
+
+ //NOTE(review): the two BARRIER constants are not referenced in this part of the file
+ private static final int BARRIER_I1=2;
+ private static final int BARRIER_D1=3;
+
+ //Run lengths at which the per-base indel/substitution cost drops to the next tier
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=20;
+ public static final int LIMIT_FOR_COST_5=80;
+
+ public static final int BAD=MIN_SCORE-1; //One point below the lowest legal score
+
+ //The same values pre-shifted by SCOREOFFSET, ready to be added to packed cells
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+ /** TODO: possibly enclose all uses of affine arrays in a branch controlled by this */
+ public static final boolean AFFINE_ARRAYS=true;
+ public static final int[] POINTS_INS_ARRAY; //Lookup tables allocated in the static initializer that follows; indexed by run length
+ public static final int[] POINTSoff_INS_ARRAY;
+ public static final int[] POINTS_INS_ARRAY_C; //Read as the total cost of an insertion of a given length by calcInsScore; presumably cumulative - TODO confirm
+ public static final int[] POINTSoff_INS_ARRAY_C;
+
+ public static final int[] POINTS_SUB_ARRAY; //Indexed by the length of the current substitution run in the scoring methods
+ public static final int[] POINTSoff_SUB_ARRAY;
+ public static final int[] POINTS_SUB_ARRAY_C;
+ public static final int[] POINTSoff_SUB_ARRAY_C;
+
+ static{
+ POINTS_INS_ARRAY=new int[604];
+ POINTSoff_INS_ARRAY=new int[604];
+ POINTS_INS_ARRAY_C=new int[604];
+ POINTSoff_INS_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_INS_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_4){
+ pts=POINTS_INS4;
+ ptsoff=POINTSoff_INS4;
+ }else if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_INS3;
+ ptsoff=POINTSoff_INS3;
+ }else if(i>1){
+ pts=POINTS_INS2;
+ ptsoff=POINTSoff_INS2;
+ }else{
+ pts=POINTS_INS;
+ ptsoff=POINTSoff_INS;
+ }
+ POINTS_INS_ARRAY[i]=pts;
+ POINTSoff_INS_ARRAY[i]=ptsoff;
+ POINTS_INS_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_INS_ARRAY_C[i-1]);
+ POINTSoff_INS_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_INS_ARRAY_C[i-1]);
+ }
+
+
+ POINTS_SUB_ARRAY=new int[604];
+ POINTSoff_SUB_ARRAY=new int[604];
+ POINTS_SUB_ARRAY_C=new int[604];
+ POINTSoff_SUB_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_SUB_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_SUB3;
+ ptsoff=POINTSoff_SUB3;
+ }else if(i>1){
+ pts=POINTS_SUB2;
+ ptsoff=POINTSoff_SUB2;
+ }else{
+ pts=POINTS_SUB;
+ ptsoff=POINTSoff_SUB;
+ }
+ POINTS_SUB_ARRAY[i]=pts;
+ POINTSoff_SUB_ARRAY[i]=ptsoff;
+ POINTS_SUB_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_SUB_ARRAY_C[i-1]);
+ POINTSoff_SUB_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_SUB_ARRAY_C[i-1]);
+ }
+ }
+
+ public final int POINTS_NOREF(){return POINTS_NOREF;} //Instance accessors: each returns the static constant of the same name declared in this class
+ public final int POINTS_NOCALL(){return POINTS_NOCALL;}
+ public final int POINTS_MATCH(){return POINTS_MATCH;}
+ public final int POINTS_MATCH2(){return POINTS_MATCH2;}
+ public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;}
+ public final int POINTS_SUB(){return POINTS_SUB;}
+ public final int POINTS_SUBR(){return POINTS_SUBR;}
+ public final int POINTS_SUB2(){return POINTS_SUB2;}
+ public final int POINTS_SUB3(){return POINTS_SUB3;}
+ public final int POINTS_MATCHSUB(){return POINTS_MATCHSUB;}
+ public final int POINTS_INS(){return POINTS_INS;}
+ public final int POINTS_INS2(){return POINTS_INS2;}
+ public final int POINTS_INS3(){return POINTS_INS3;}
+ public final int POINTS_INS4(){return POINTS_INS4;}
+ public final int POINTS_DEL(){return POINTS_DEL;}
+ public final int POINTS_DEL2(){return POINTS_DEL2;}
+ public final int POINTS_DEL3(){return POINTS_DEL3;}
+ public final int POINTS_DEL4(){return POINTS_DEL4;}
+ public final int POINTS_DEL5(){return POINTS_DEL5;}
+ public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;}
+ public final int POINTS_GAP(){return POINTS_GAP;}
+
+ public final int TIMESLIP(){return TIMESLIP;} //Accessors for the time-slip constant and its mask
+ public final int MASK5(){return MASK5;}
+ public final int SCOREOFFSET(){return SCOREOFFSET;} //Returns the SCOREOFFSET constant; the former "return SCOREOFFSET();" called this method itself and never terminated
+
+ final int BARRIER_I1(){return BARRIER_I1;} //Package-private, unlike the public accessors around it
+ final int BARRIER_D1(){return BARRIER_D1;}
+
+ public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;} //Accessors for the run-length thresholds
+ public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_4;}
+ public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_5;}
+
+ public final int BAD(){return BAD;} //BAD is MIN_SCORE-1
+
+
+ private int rows; //NOTE(review): presumably the dimensions of the most recent alignment - the code that assigns them is outside this excerpt
+ private int columns;
+
+}
diff --git a/current/align2/MultiStateAligner11tsJNI.java b/current/align2/MultiStateAligner11tsJNI.java
new file mode 100755
index 0000000..3e5afdc
--- /dev/null
+++ b/current/align2/MultiStateAligner11tsJNI.java
@@ -0,0 +1,1665 @@
+package align2;
+import java.util.Arrays;
+import stream.SiteScore;
+import dna.AminoAcid;
+import java.io.File;
+
+/**
+ * Modification of MultiStateAligner9ts to replace fixed affine steps with an array */
+public final class MultiStateAligner11tsJNI extends MSA{
+
+ static {
+ String name = "bbtoolsjni";
+ try {
+ System.loadLibrary(name);
+ } catch (UnsatisfiedLinkError e1) {
+ // System.loadLibrary() does not work with MPI.
+ // Need to use System.load() with an explicit full
+ // path to the native library file for the MPI case.
+ boolean success = false;
+ String libpath=System.getProperty("java.library.path");
+ libpath = libpath.replace("-Djava.library.path=","");
+ String[] libpathEntries = libpath.split(File.pathSeparator);
+ for(int i = 0; i < libpathEntries.length; i++) {
+ if(success) break;
+ String lib = libpathEntries[i]+"/"+System.mapLibraryName(name);
+ try {
+ System.load(lib);
+ success = true;
+ } catch (UnsatisfiedLinkError e2) {
+ success = false;
+ if((i+1) >= libpathEntries.length) {
+ System.err.println("Native library can not be found in java.library.path. ");
+ System.exit(1);
+ }
+ }
+ }
+ }
+ }
+
+ private native void fillUnlimitedJNI(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] result, long[] iterationsUnlimited, int[] packed, int[] POINTSoff_SUB_ARRAY, int[] POINTSoff_INS_ARRAY, int maxRows, int maxColumns); //Native (bbtoolsjni). The caller reads result[0..3] as {rows, maxCol, maxState, maxScore} and treats iterationsUnlimited[0] as an in/out counter
+
+ private native void fillLimitedXJNI(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] result, long[] iterationsLimited, int[] packed, int[] POINTSoff_SUB_ARRAY, int[] POINTSoff_INS_ARRAY, int maxRows, int maxColumns, int bandwidth, float bandwidthRatio, int[] vertLimit, int[] horizLimit, byte[] baseToNumber, int[] POINTSoff_INS_ARRAY_C); //Native (bbtoolsjni). As above, plus result[4]: the caller returns null when it is 1
+
+ /** Debugging entry point: aligns a read against a reference and prints the fill result, match string and score.
+ * @param args args[0] is the read, args[1] is the reference, both as plain strings */
+ public static void main(String[] args){
+ //Fail with a usage message instead of an ArrayIndexOutOfBoundsException when arguments are missing.
+ if(args==null || args.length<2){
+ System.err.println("Usage: java align2.MultiStateAligner11tsJNI <read> <ref>");
+ return;
+ }
+ byte[] read=args[0].getBytes();
+ byte[] ref=args[1].getBytes();
+
+ MultiStateAligner11tsJNI msa=new MultiStateAligner11tsJNI(read.length, ref.length);
+ System.out.println("Initial: ");
+ //printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+
+ //minScore is 0 and gaps is null, so this takes the unlimited (no score cutoff) path.
+ int[] max=msa.fillLimited(read, ref, 0, ref.length-1, 0, null);
+
+ System.out.println("Max: "+Arrays.toString(max));
+
+ System.out.println("Final: ");
+ //printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+
+ //max is {rows, maxCol, maxState, maxScore}; the first three locate the cell to trace back from.
+ byte[] out=msa.traceback(read, ref, 0, ref.length-1, max[0], max[1], max[2], false);
+
+ int[] score=msa.score(read, ref, 0, ref.length-1, max[0], max[1], max[2], false);
+
+ System.out.println(new String(ref));
+ System.out.println(new String(read));
+ System.out.println(new String(out));
+ System.out.println("Score: "+Arrays.toString(score));
+ }
+
+ public MultiStateAligner11tsJNI(int maxRows_, int maxColumns_){ //Allocates the flattened score matrices, gapped-reference buffer and limit arrays, then initializes the matrix borders
+ super(maxRows_, maxColumns_);
+ {
+ int[] packed0=null; //Temporaries, so each field is assigned exactly once after the try block
+ byte[] grefbuffer0=null;
+ int[] vertLimit0=null;
+ int[] horizLimit0=null;
+
+ try {
+ packed0=new int[3*(maxRows+1)*(maxColumns+1)]; //Three matrices of (maxRows+1) x (maxColumns+1) cells, flattened into one array
+ grefbuffer0=new byte[maxColumns+2];
+ vertLimit0=new int[maxRows+1];
+ horizLimit0=new int[maxColumns+1];
+ } catch (OutOfMemoryError e) {
+ packed0=null;
+ grefbuffer0=null;
+ vertLimit0=null;
+ horizLimit0=null;
+ throw new RuntimeException(e.toString()); //Only the error's string form is kept; the original error is not chained as the cause
+ }
+
+ packed=packed0;
+ grefbuffer=grefbuffer0;
+ vertLimit=vertLimit0;
+ horizLimit=horizLimit0;
+ }
+
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+ for(int matrix=0; matrix<3; matrix++){ //Cell (matrix, row, col) lives at matrix*(maxRows+1)*(maxColumns+1) + row*(maxColumns+1) + col
+ for(int i=1; i<=maxRows; i++){ //Rows 1..maxRows: OR BADoff into every cell; row 0 is left at zero
+ for(int j=0; j<maxColumns+1; j++){
+ packed[(matrix)*(maxRows+1)*(maxColumns+1)+(i)*(maxColumns+1)+(j)]|=BADoff;
+ }
+ }
+ for(int i=0; i<=maxRows; i++){ //Column 0: overwritten with a running sum of POINTSoff_INS_ARRAY entries down the rows
+ int prevScore=(i<2 ? 0 : packed[(matrix)*(maxRows+1)*(maxColumns+1)+(i-1)*(maxColumns+1)+(0)]);
+ int score=prevScore+POINTSoff_INS_ARRAY[i]; //NOTE(review): indexed by row, so maxRows must stay below POINTSoff_INS_ARRAY.length - confirm the table size against callers
+ packed[(matrix)*(maxRows+1)*(maxColumns+1)+(i)*(maxColumns+1)+(0)]=score;
+ }
+ }
+ }
+
+ /** Fills the matrices using a minimum-score cutoff, optionally against a gap-compressed reference.
+ * When gaps is null the reference is used directly; otherwise makeGref builds the gapped reference first.
+ * @return {rows, maxCol, maxState, maxScore} from fillLimitedX, which may be null */
+ @Override
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+
+ //Check for failure before gref is dereferenced by the verbose dump below.
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+
+ if(verbose && greflimit>0 && greflimit<500){
+ System.err.println(new String(gref, 0, greflimit));
+ }
+
+ //The gapped reference starts at index 0 and ends at greflimit, as set by makeGref.
+ return fillLimitedX(read, gref, 0, greflimit, minScore);
+ }
+ }
+
+ /** Score-limited fill, delegated to the native library.
+ * Falls back to fillUnlimited when there is no positive minScore, the problem is small, or banding would not help.
+ * @return {rows, maxCol, maxState, maxScore}, or null when the native code sets result[4] to 1 */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ if(verbose){System.err.println("fillLimitedX");}
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ //Half of the effective band width; 0 when neither bandwidth setting is active.
+ final int halfband;
+ if(bandwidth<1 && bandwidthRatio<=0){
+ halfband=0;
+ }else{
+ final int absoluteBand=(bandwidth<1 ? 9999999 : bandwidth);
+ final int relativeBand=(bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio));
+ halfband=Tools.max(Tools.min(absoluteBand, relativeBand), (columns-rows+8))/2;
+ }
+
+ final boolean bandIneffective=(halfband<1 || halfband*3>columns);
+ final boolean refMuchLonger=(columns>read.length+Tools.min(170, read.length+20));
+ if(minScore<1 || (columns+rows<90) || (bandIneffective && refMuchLonger)){
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+ final int cutoff=minScore-120; //Increases quality trivially
+
+ //Output buffer and in/out counter shared with the native library
+ final int[] result=new int[5];
+ final long[] counter=new long[] {iterationsLimited};
+
+ fillLimitedXJNI(read,ref,refStartLoc,refEndLoc,cutoff,result,counter,packed,POINTSoff_SUB_ARRAY,POINTSoff_INS_ARRAY,maxRows,maxColumns,bandwidth,bandwidthRatio,vertLimit,horizLimit,AminoAcid.baseToNumber,POINTSoff_INS_ARRAY_C);
+
+ //Copy back the counter that the native code updated
+ iterationsLimited=counter[0];
+
+ //result[4]==1 signals that there is nothing to return; otherwise {rows, maxCol, maxState, maxScore}
+ return (result[4]==1) ? null : new int[] {result[0], result[1], result[2], result[3]};
+ }
+
+ @Override
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit);
+ }
+ }
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Does not require a min score (ie, same as old method).
+ * The fill itself is done by the native method fillUnlimitedJNI. */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ //Record the matrix dimensions, as fillLimitedX and fillQ do; traceback2 and score2 read these fields.
+ //Without this, the gapped public overload reaches here with values left over from an earlier call.
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ //Create arrays for passing values to and from native library
+ int[] result = new int[4];
+ long[] iterationsUnlimitedArray = new long[1];
+ iterationsUnlimitedArray[0] = iterationsUnlimited;
+
+ fillUnlimitedJNI(read,ref,refStartLoc,refEndLoc,result,iterationsUnlimitedArray,packed,POINTSoff_SUB_ARRAY,POINTSoff_INS_ARRAY,maxRows,maxColumns);
+
+ //Retrieve variables from native library that were updated there.
+ //Previously the value went into an unused local, so the counter field never changed.
+ iterationsUnlimited = iterationsUnlimitedArray[0];
+
+ //return new int[] {rows, maxCol, maxState, maxScore};
+ return new int[] {result[0], result[1], result[2], result[3]};
+ }
+
+ @Deprecated
+ /** Pure-Java fill of the three packed matrices that adds baseScores[row-1] (shifted by SCOREOFFSET) to every match score. return new int[] {rows, maxC, maxS, max}; */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; //Always fails when assertions are enabled
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){ //Row r corresponds to read[r-1], column c to ref[refStartLoc+c-1]
+ for(int col=1; col<=columns; col++){
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ {//Calculate match and sub scores
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromDel=packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromIns=packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int streak=(packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&TIMEMASK); //Time field (low bits) of the diagonal MS cell
+ {//Calculate match/sub score
+ if(match){
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ }else{
+ score=scoreI;
+ time=1;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Keep the counter within MAX_TIME
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+ packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col)]=(score|time); //Score in the high bits, time in the low bits of one int
+ assert((score&SCOREMASK)==score);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ POINTSoff_SUB_ARRAY[streak+1]);
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read in this method
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+ packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col)]=(score|time);
+ assert((score&SCOREMASK)==score);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+ final int streak=packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col-1)]&TIMEMASK; //Reads the cell to the left: same row, previous column
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromDel=packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col-1)]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //Past LIMIT_FOR_COST_5 the penalty applies only when the low MASK5 bits of streak are zero
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+ packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col)]=(score|time);
+ assert((score&SCOREMASK)==score);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+ final int streak=packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col)]&TIMEMASK; //Reads the cell above: previous row, same column
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col)]&SCOREMASK;
+ final int scoreFromIns=packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col)]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+ int scoreI=scoreFromIns+POINTSoff_INS_ARRAY[streak+1];
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+ packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col)]=(score|time);
+ assert((score&SCOREMASK)==score);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<3; state++){ //Scan the last row of all three matrices for the best score; the first maximum found wins
+ for(int col=1; col<=columns; col++){
+ int x=packed[(state)*(maxRows+1)*(maxColumns+1)+(rows)*(maxColumns+1)+(col)]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Remove the offset so the returned score is in plain points
+
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ /** Generates the match string for the alignment ending at (row, col, state).
+ * When gapped is true, the reference coordinates are first translated into grefbuffer
+ * coordinates and the traceback runs against grefbuffer instead of ref. */
+ @Override
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+ if(!gapped){
+ return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state);
+ }
+ final byte[] gappedRef=grefbuffer;
+ final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+ final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+ return traceback2(read, gappedRef, gappedStart, gappedStop, row, col, state);
+ }
+
+ @Override
+ /** Generates the match string by walking the packed matrices backwards from (row, col, state). Symbols written: m, S, N, D, I, X, Y; each GAPC cell is expanded to GAPLEN 'D's at the end. */
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+ assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+ assert(row==rows);
+
+ byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
+ int outPos=0;
+
+ int gaps=0; //Number of GAPC symbols crossed; decides whether the expansion pass at the end runs
+
+ if(state==MODE_INS){
+ }
+
+ while(row>0 && col>0){
+ final int time=packed[(state)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col)]&TIMEMASK;
+ final byte prev; //State of the predecessor cell
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //A time above 1 means the predecessor was in the same state
+ else{
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromDel=packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromIns=packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+
+ byte c=read[row-1];
+ byte r=ref[refStartLoc+col-1];
+ if(c==r){
+ out[outPos]='m';
+ }else{
+ if(!AminoAcid.isFullyDefined(c)){
+ out[outPos]='N';
+ }else if(!AminoAcid.isFullyDefined(r)){
+ out[outPos]='N';
+ }else{
+ out[outPos]='S';
+ }
+ }
+
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromDel=packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+
+ byte r=ref[refStartLoc+col-1];
+ if(r==GAPC){ //Gap symbol from a compressed reference; written as '-' now and expanded after the walk
+ out[outPos]='-';
+ gaps++;
+ }else{
+ out[outPos]='D';
+ }
+ col--;
+ }else{
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col)]&SCOREMASK;
+ final int scoreFromIns=packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col)]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+
+ assert(state==MODE_INS) : state;
+ if(col==0){ //NOTE(review): the loop condition guarantees col>0 here, so this branch looks unreachable
+ out[outPos]='X';
+ }else if(col>=columns){
+ out[outPos]='Y';
+ }else{
+ out[outPos]='I';
+ }
+ row--;
+ }
+
+ state=prev;
+ outPos++;
+ }
+
+ assert(row==0 || col==0);
+ if(col!=row){
+ while(row>0){ //Read bases left over once the reference ran out are emitted as 'X'
+ out[outPos]='X';
+ outPos++;
+ row--;
+ col--;
+ }
+ if(col>0){
+ //do nothing
+ }
+ }
+
+ byte[] out2=new byte[outPos]; //The walk produced the string back to front; reverse it
+ for(int i=0; i<outPos; i++){
+ out2[i]=out[outPos-i-1];
+ }
+ out=null;
+
+ if(gaps==0){return out2;}
+
+ byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)]; //Each gap symbol grows from 1 to GAPLEN characters
+ for(int i=0, j=0; i<out2.length; i++){
+ byte c=out2[i];
+ if(c!=GAPC){ //NOTE(review): gaps were written as '-' above, so this presumably relies on GAPC being '-' - confirm
+ out3[j]=c;
+ j++;
+ }else{
+ int lim=j+GAPLEN;
+ for(; j<lim; j++){
+ out3[j]='D';
+ }
+ }
+ }
+ return out3;
+ }
+
+ @Override
+ /** Scores the alignment ending at (maxRow, maxCol, maxState) via score2; when gapped, works in grefbuffer coordinates and translates the reference positions back. @return {score, bestRefStart, bestRefStop}, possibly followed by further elements as returned by score2 */
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState, boolean gapped){
+ if(gapped){
+ if(verbose){
+ System.err.println("score():");
+ System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+ }
+ final byte[] gref=grefbuffer; //The ref argument is not used on this path; the buffer filled by makeGref is scored instead
+ int gstart=translateToGappedCoordinate(refStartLoc, gref);
+ int gstop=translateToGappedCoordinate(refEndLoc, gref);
+
+ assert(translateFromGappedCoordinate(gstart, gref)==refStartLoc); //TODO: Remove slow assertions
+ assert(translateFromGappedCoordinate(gstop, gref)==refEndLoc);
+
+ assert(gstart==0) : gstart; //TODO: skip translation if this is always zero
+
+ if(verbose){System.err.println("gstart, gstop: "+gstart+", "+gstop);}
+ int[] out=score2(read, gref, gstart, gstop, maxRow, maxCol, maxState);
+ if(verbose){System.err.println("got score "+Arrays.toString(out));}
+
+ assert(out[1]==translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref)) :
+ "Verifying: "+out[1]+" -> "+translateFromGappedCoordinate(out[1], gref)+" -> "+
+ translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref);
+ assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref));
+
+ out[1]=translateFromGappedCoordinate(out[1], gref); //Convert start and stop back to ungapped reference coordinates
+ out[2]=translateFromGappedCoordinate(out[2], gref);
+ if(verbose){System.err.println("returning score "+Arrays.toString(out));}
+ return out;
+ }else{
+ return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState);
+ }
+ }
+
+ @Override
+ /** Reads the score of cell (maxState, maxRow, maxCol) and walks back through the packed matrices to find where the alignment starts on the reference.
+ * @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}, <br>
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight} <br>
+ * if more padding is needed */
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<3) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ //assert(maxRow>=0 && maxRow<packed[0].length) :
+ // maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ //assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ // maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[(maxState)*(maxRows+1)*(maxColumns+1)+(maxRow)*(maxColumns+1)+(maxCol)]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ if(row<rows){ //The best cell is above the last row: extend row and col towards the end of the read
+ int difR=rows-row;
+ int difC=columns-col;
+
+ while(difR>difC){
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+
+ row+=difR;
+ col+=difR;
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+ final int bestRefStop=refStartLoc+col-1; //Column c corresponds to reference position refStartLoc+c-1
+
+ if(verbose){System.err.println("Scoring.");}
+
+ int stateTime=0; //Number of consecutive steps for which the traced state did not change
+
+ while(row>0 && col>0){
+ if(verbose){System.err.println("state="+state+", row="+row+", col="+col);}
+
+ final int time=packed[(state)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col)]&TIMEMASK;
+ final byte prev;
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //A time above 1 means the predecessor was in the same state
+ else{
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromDel=packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromIns=packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ final int scoreFromDel=packed[(MODE_DEL)*(maxRows+1)*(maxColumns+1)+(row)*(maxColumns+1)+(col-1)]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ col--;
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[(MODE_MS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col)]&SCOREMASK;
+ final int scoreFromIns=packed[(MODE_INS)*(maxRows+1)*(maxColumns+1)+(row-1)*(maxColumns+1)+(col)]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ row--;
+ }
+
+ if(col<0){
+ if(verbose){
+ System.err.println("Warning, column went below 0 at row="+row);
+ }
+ break; //prevents an out of bounds access
+ }
+
+ if(state==prev){stateTime++;}else{stateTime=0;}
+ state=prev;
+
+ if(verbose){System.err.println("state2="+state+", time="+time+", stateTime="+stateTime+", row2="+row+", col2="+col+"\n");}
+ }
+ if(row>col){ //Read bases remain after the walk: move the start left by that many positions
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ score>>=SCOREOFFSET; //Remove the offset so the returned score is in plain points
+
+ if(verbose){
+ System.err.println("bestRefStart="+bestRefStart+", refStartLoc="+refStartLoc);
+ System.err.println("bestRefStop="+bestRefStop+", refEndLoc="+refEndLoc);
+ }
+
+ int padLeft=0;
+ int padRight=0;
+ if(bestRefStart<refStartLoc){
+ padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ }else if(bestRefStart==refStartLoc && state==MODE_INS){
+ padLeft=stateTime;
+ }
+ if(bestRefStop>refEndLoc){
+ padRight=Tools.max(0, bestRefStop-refEndLoc);
+ }else if(bestRefStop==refEndLoc && maxState==MODE_INS){
+ padRight=packed[(maxState)*(maxRows+1)*(maxColumns+1)+(maxRow)*(maxColumns+1)+(maxCol)]&TIMEMASK; //Time field of the final insertion cell
+ }
+
+ int[] rvec;
+ if(padLeft>0 || padRight>0){ //Suggest extra padding in cases of overflow
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState};
+ }
+ return rvec;
+ }
+
+ /**
+ * Fills grefbuffer with a compressed copy of ref in which the middle of each long gap is replaced by GAPC symbols; also sets grefRefOrigin, greflimit and greflimit2.
+ * @param ref Reference bases
+ * @param gaps Read as consecutive {start, stop} pairs of reference ranges to copy; first and last entries are temporarily widened to refStartLoc/refEndLoc and restored before the normal return
+ * @param refStartLoc First reference position to include
+ * @param refEndLoc Last reference position to include
+ * @return gref (the shared grefbuffer); the null return is guarded by a condition that looks unreachable, see the note in the body
+ */
+ private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){
+ assert(gaps!=null && gaps.length>0);
+
+ assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps);
+ assert(refEndLoc>=gaps[gaps.length-1]);
+
+ final int g0_old=gaps[0]; //Saved so the caller's array can be restored at the end
+ final int gN_old=gaps[gaps.length-1];
+ gaps[0]=Tools.min(gaps[0], refStartLoc);
+ gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+ grefRefOrigin=gaps[0]; //Reference position that corresponds to gref[0]
+
+ if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+ byte[] gref=grefbuffer;
+
+ int gpos=0; //Next write position in gref
+ for(int i=0; i<gaps.length; i+=2){
+ int x=gaps[i];
+ int y=gaps[i+1];
+
+ for(int r=x; r<=y; r++, gpos++){ //Copy the covered range x..y unchanged
+ //TODO: if out of bounds, use an 'N'
+ assert(gpos<gref.length) :
+ "\ngpos="+gpos+", gref.length="+gref.length+/*", read.length="+read.length+*/", gaps2="+Arrays.toString(gaps)+
+ "\ni="+i+", r="+r+", x="+x+", y="+y+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\n"+refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref)+"\n"/*+new String(read)+"\n"*/;
+ gref[gpos]=ref[r];
+ }
+
+ if(i+2<gaps.length){ //There is another range after this one, so encode the gap between them
+ int z=gaps[i+2];
+ assert(z>y);
+ int gap=z-y-1;
+ assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+ if(gap<MINGAP){
+ assert(false) : "TODO - just fill in normally";
+ }else{
+ int rem=gap%GAPLEN;
+ int lim=y+GAPBUFFER+rem;
+
+ int div=(gap-GAPBUFFER2)/GAPLEN; //Number of GAPC symbols to write; the translate methods count each symbol as GAPLEN reference bases
+ if(verbose){
+ System.err.println("div = "+div);
+ }
+ assert(div>0);
+
+ for(int r=y+1; r<=lim; r++, gpos++){ //Real bases directly after the range (GAPBUFFER plus the remainder)
+ gref[gpos]=ref[r];
+ }
+ for(int g=0; g<div; g++, gpos++){
+ gref[gpos]=GAPC;
+ }
+ for(int r=z-GAPBUFFER; r<z; r++, gpos++){ //Real bases directly before the next range
+ gref[gpos]=ref[r];
+ }
+ }
+ }
+ }
+
+ greflimit=gpos;
+
+ assert(gref[gpos-1]==ref[refEndLoc]);
+ {
+ final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+ if(lim>gref.length){ //NOTE(review): lim is a min with gref.length, so this cannot be true if Tools.min is an ordinary minimum; gaps would also not be restored on this path
+ System.err.println("gref buffer overflow: "+lim+" > "+gref.length);
+ return null;
+ }
+ for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){ //Pad past the end with following reference bases, or 'N' beyond the end of ref
+ gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+ greflimit2=i; //NOTE(review): only assigned inside this loop, so it keeps its previous value when the loop does not run
+ }
+ }
+
+ if(verbose){
+ System.err.println("gref:\n"+new String(gref));
+ }
+
+ gaps[0]=g0_old;
+ gaps[gaps.length-1]=gN_old;
+
+ if(verbose){
+ System.err.println("\ngaps3: "+Arrays.toString(gaps));
+ }
+
+ return gref;
+ }
+
+ /** Converts an index into the gapped reference buffer to the matching reference position.
+ * Walks gref from index 0 (reference position grefRefOrigin), counting GAPLEN positions per GAPC symbol and 1 per other symbol.
+ * @throws RuntimeException if point is not reached before greflimit2 */
+ private final int translateFromGappedCoordinate(int point, byte[] gref){
+ if(verbose){System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+ if(point<=0){return grefRefOrigin+point;}
+
+ int gpos=0;
+ int refPos=grefRefOrigin;
+ while(gpos<greflimit2){
+ final byte symbol=gref[gpos];
+ assert(point>=gpos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+ if(gpos==point){
+ if(verbose){System.err.println(" -> "+refPos);}
+ return refPos;
+ }
+
+ //A gap symbol stands for GAPLEN reference positions.
+ if(symbol==GAPC){
+ refPos+=GAPLEN;
+ }else{
+ refPos+=1;
+ }
+ gpos++;
+ }
+
+ //Not found: dump the state before failing.
+ System.err.println(grefRefOrigin);
+ System.err.println(point);
+ System.err.println(new String(gref));
+
+ throw new RuntimeException("Out of bounds.");
+ }
+
+ /** Converts a reference position to the matching index in the gapped reference buffer.
+ * Walks gref from index 0 (reference position grefRefOrigin), counting GAPLEN positions per GAPC symbol and 1 per other symbol.
+ * @throws RuntimeException if point is not reached before greflimit2 */
+ private final int translateToGappedCoordinate(int point, byte[] gref){
+ if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+ if(point<=grefRefOrigin){return point-grefRefOrigin;}
+
+ int gpos=0;
+ int refPos=grefRefOrigin;
+ while(gpos<greflimit2){
+ assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+ final byte symbol=gref[gpos];
+
+ if(refPos==point){
+ if(verbose){System.err.println(" -> "+gpos);}
+ return gpos;
+ }
+
+ //A gap symbol stands for GAPLEN reference positions.
+ if(symbol==GAPC){
+ refPos+=GAPLEN;
+ }else{
+ refPos+=1;
+ }
+ gpos++;
+ }
+
+ //Not found: dump the state before failing.
+ System.err.println(grefRefOrigin);
+ System.err.println(point);
+ System.err.println(new String(gref));
+
+ throw new RuntimeException("Out of bounds.");
+ }
+
+ /** Calculates score based on an array from Index.
+ * Entries of locArray greater than 0 are treated as match locations; all other entries are scored as substitutions.
+ * A change in location between consecutive matches is scored as a deletion or insertion using the affine penalties.
+ * @param locArray One location per position
+ * @return The summed score, in unshifted POINTS_ units */
+ private final int calcAffineScore(int[] locArray){
+ int score=0;
+ int lastLoc=-2; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=POINTS_MATCH2;
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=POINTS_MATCH;
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){
+ //Long jumps are charged per GAPLEN block, then the remainder goes through the tiers below.
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ //Index the cumulative table by the positive jump length, clamped to the table size.
+ //The former index lastLoc-loc is always negative in this branch and threw ArrayIndexOutOfBoundsException.
+ final int jump=Tools.min(loc-lastLoc, POINTS_INS_ARRAY_C.length-1);
+ score+=(POINTS_MATCH+POINTS_INS_ARRAY_C[jump]);
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else{//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ timeInMode++;
+ score+=(POINTS_SUB_ARRAY[timeInMode]);
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ /**
+  * Calculates an affine-gap score from a per-base location array, adding a per-base bonus
+  * from baseScores for every matched base.
+  * Location codes: greater than zero is a matched location, -1 is a substitution, and -2 is
+  * a no-call or no-ref base; any other value fails the assertion in the final branch.
+  * @param locArray One location code per read base
+  * @param baseScores Per-base bonus added on matches, indexed like locArray
+  * @param bases Read bases; only used to build the assertion message
+  * @return The score, which is asserted not to exceed maxQuality(locArray.length)
+  */
+ @Override
+ public final int calcAffineScore(final int[] locArray, final byte[] baseScores, final byte bases[]){
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ final int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ //Long gaps are charged per GAPLEN-sized chunk; only the remainder goes through the tiers below.
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+ }
+ //Tiered extension penalties, applied from the longest tier down.
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ //The jump size used for the cumulative insertion penalty lookup is capped at 5.
+ score+=(POINTS_MATCH+baseScores[i]+POINTS_INS_ARRAY_C[Tools.min(loc-lastLoc, 5)]);
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ timeInMode++;
+ score+=(POINTS_SUB_ARRAY[timeInMode]);
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : "\ni="+i+", loc="+loc+", score="+score+", lastLoc="+lastLoc+", lastValue="+lastValue
+ +", time="+timeInMode+", length="+locArray.length+"\nbases=\n"+new String(bases)
+ +"\nlocs[]=\n"+Arrays.toString(locArray)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ //A no-call resets the substitution streak.
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ /**
+  * Same scoring as the three-argument calcAffineScore, but additionally tracks runs of matched
+  * bases. If the longest run is shorter than minContig, the returned score is capped
+  * at -50*locArray.length.
+  * Location codes: greater than zero is a matched location, -1 is a substitution, and -2 is
+  * a no-call or no-ref base; any other value fails the assertion in the final branch.
+  * @param locArray One location code per read base
+  * @param baseScores Per-base bonus added on matches, indexed like locArray
+  * @param bases Read bases; only used to build the assertion message
+  * @param minContig Minimum required run of matched bases; asserted to be greater than 1
+  * @return The (possibly capped) score
+  */
+ @Override
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig){
+ assert(minContig>1) : minContig;
+
+ //contig is the current run of matched bases; maxContig is the longest completed run.
+ int contig=0;
+ int maxContig=0;
+
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ contig++;
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ maxContig=Tools.max(maxContig, contig);
+ contig=1;
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ //Long gaps are charged per GAPLEN-sized chunk; only the remainder goes through the tiers below.
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+ }
+ //Tiered extension penalties, applied from the longest tier down.
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ //The jump size used for the cumulative insertion penalty lookup is capped at 5.
+ score+=(POINTS_MATCH+baseScores[i]+POINTS_INS_ARRAY_C[Tools.min(loc-lastLoc, 5)]);
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ timeInMode++;
+ score+=(POINTS_SUB_ARRAY[timeInMode]);
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ //A no-call resets the substitution streak.
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ //The run still open at the end of the read counts toward the longest run.
+ if(Tools.max(contig, maxContig)<minContig){score=Tools.min(score, -50*locArray.length);}
+ return score;
+ }
+
+ /** Scores read against ref at refStart without indels; delegates to the four-argument overload with a null SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart){
+ return scoreNoIndels(read, ref, refStart, null);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ return score;
+ }
+
+ /**
+  * Builds an ungapped match string for read placed on ref at refStart.
+  * Each position is 'N' if either base is 'N' (reference positions outside ref count as 'N'),
+  * 'm' if the bases are equal, and 'S' otherwise.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStart Reference index aligned to read[0]; may be negative
+  * @return A match string of length read.length, or null if read or ref is null
+  */
+ @Override
+ public final byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart){
+     if(read==null || ref==null){return null;}
+
+     final int len=read.length;
+     final byte[] match=new byte[len];
+
+     for(int i=0; i<len; i++){
+         final int refPos=refStart+i;
+         final byte c=read[i];
+         final byte r;
+         if(refPos>=0 && refPos<ref.length){r=ref[refPos];}
+         else{r=(byte)'N';}
+
+         match[i]=(byte)((c=='N' || r=='N') ? 'N' : (c==r ? 'm' : 'S'));
+     }
+
+     return match;
+ }
+
+ /** Scores read against ref at refStart without indels, using per-base scores; delegates to the five-argument overload with a null SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){
+ return scoreNoIndels(read, ref, baseScores, refStart, null);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+ int norefs=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ return score;
+ }
+
+ /**
+  * Scores an ungapped placement of read on ref starting at refStart, adding baseScores[i] for
+  * every matching base, and writes the match string ('m', 'S' or 'N' per base) into matchReturn[0].
+  * The read must lie entirely inside ref; otherwise -99999 is returned and matchReturn is untouched.
+  * The code that used to follow the bounds check and clip an overhanging read could never
+  * execute and has been removed.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param baseScores Per-base bonus added on matches, indexed like read
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn Single-element holder; its array is reused when it already has length read.length and replaced otherwise
+  * @return The score, or -99999 if the read does not fit inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     //Every position of match is overwritten below, so a reused buffer needs no clearing.
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             score+=baseScores[i];
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             //The penalty depends on how many substitutions precede this one in the current streak.
+             score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /**
+  * Scores an ungapped placement of read on ref starting at refStart and writes the match
+  * string ('m', 'S' or 'N' per base) into matchReturn[0].
+  * The read must lie entirely inside ref; otherwise -99999 is returned and matchReturn is untouched.
+  * The code that used to follow the bounds check and clip an overhanging read could never
+  * execute and has been removed.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn Single-element holder; its array is reused when it already has length read.length and replaced otherwise
+  * @return The score, or -99999 if the read does not fit inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     //Every position of match is overwritten below, so a reused buffer needs no clearing.
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             if(AFFINE_ARRAYS){
+                 //Table lookup: the penalty depends on the position within the substitution streak.
+                 score+=(POINTS_SUB_ARRAY[timeInMode+1]);
+             }else{
+                 if(timeInMode==0){score+=POINTS_SUB;}
+                 else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+                 else{score+=POINTS_SUB3;}
+             }
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /**
+  * Best possible score for a read of numBases bases, excluding per-base bonuses:
+  * POINTS_MATCH for the first base and POINTS_MATCH2 for each of the rest.
+  */
+ @Override
+ public final int maxQuality(int numBases){
+     final int followingBases=numBases-1;
+     return POINTS_MATCH+followingBases*POINTS_MATCH2;
+ }
+
+ /**
+  * Best possible score for a read with the given per-base bonuses: POINTS_MATCH for the first
+  * base, POINTS_MATCH2 for each of the rest, plus the sum of baseScores.
+  */
+ @Override
+ public final int maxQuality(byte[] baseScores){
+     final int positional=POINTS_MATCH+(baseScores.length-1)*POINTS_MATCH2;
+     return positional+Tools.sumInt(baseScores);
+ }
+
+ /**
+  * Upper bound used for an imperfect alignment of numBases bases: the perfect score plus
+  * the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+  */
+ @Override
+ public final int maxImperfectScore(int numBases){
+     final int perfect=maxQuality(numBases);
+     final int indelAdjustment=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelAdjustment;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Upper bound used for an imperfect alignment of a read with the given per-base bonuses:
+  * the perfect score plus the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+  */
+ @Override
+ public final int maxImperfectScore(byte[] baseScores){
+     final int perfect=maxQuality(baseScores);
+     final int indelAdjustment=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelAdjustment;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Score contribution of a deletion of the given length.
+  * @param len Deletion length; non-positive lengths score 0
+  * @param approximateGaps If true and len exceeds MINGAP, the bulk of the deletion is charged
+  * POINTS_GAP per GAPLEN-sized chunk and only the remainder goes through the tiered penalties
+  * @return The (negative or zero) score contribution
+  */
+ @Override
+ public int calcDelScore(int len, boolean approximateGaps){
+     if(len<=0){return 0;}
+
+     int remaining=len;
+     int total=POINTS_DEL;
+
+     if(approximateGaps && remaining>MINGAP){
+         final int rem=remaining%GAPLEN;
+         final int div=(remaining-GAPBUFFER2)/GAPLEN;
+         total+=(div*POINTS_GAP);
+         assert(rem+GAPBUFFER2<remaining);
+         remaining=rem+GAPBUFFER2;
+         assert(remaining>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+     }
+
+     //Tiered extension penalties, applied from the longest tier down.
+     if(remaining>LIMIT_FOR_COST_5){
+         total+=((remaining-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+         remaining=LIMIT_FOR_COST_5;
+     }
+     if(remaining>LIMIT_FOR_COST_4){
+         total+=(remaining-LIMIT_FOR_COST_4)*POINTS_DEL4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         total+=(remaining-LIMIT_FOR_COST_3)*POINTS_DEL3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         total+=(remaining-1)*POINTS_DEL2;
+     }
+     return total;
+ }
+
+ /**
+  * Score contribution of a deletion of the given length, computed from the POINTSoff_*
+  * constants (the point values pre-shifted by SCOREOFFSET). No gap approximation is applied.
+  * @param len Deletion length; non-positive lengths score 0
+  * @return The shifted score contribution
+  */
+ private static int calcDelScoreOffset(int len){
+     if(len<=0){return 0;}
+
+     int remaining=len;
+     int total=POINTSoff_DEL;
+
+     //Tiered extension penalties, applied from the longest tier down.
+     if(remaining>LIMIT_FOR_COST_5){
+         total+=((remaining-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+         remaining=LIMIT_FOR_COST_5;
+     }
+     if(remaining>LIMIT_FOR_COST_4){
+         total+=(remaining-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         total+=(remaining-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         total+=(remaining-1)*POINTSoff_DEL2;
+     }
+     return total;
+ }
+
+ /**
+  * Score contribution of an insertion of the given length.
+  * With AFFINE_ARRAYS enabled this is a lookup in the cumulative table POINTS_INS_ARRAY_C;
+  * otherwise the tiered penalties are summed directly.
+  * @param len Insertion length; non-positive lengths score 0
+  * @return The score contribution
+  */
+ @Override
+ public int calcInsScore(int len){
+     if(len<=0){return 0;}
+     if(AFFINE_ARRAYS){
+         return POINTS_INS_ARRAY_C[len];
+     }
+
+     int remaining=len;
+     int total=POINTS_INS;
+
+     if(remaining>LIMIT_FOR_COST_4){
+         total+=(remaining-LIMIT_FOR_COST_4)*POINTS_INS4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         total+=(remaining-LIMIT_FOR_COST_3)*POINTS_INS3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         total+=(remaining-1)*POINTS_INS2;
+     }
+     return total;
+ }
+
+ /**
+  * Score contribution of an insertion of the given length, computed from the POINTSoff_*
+  * constants (the point values pre-shifted by SCOREOFFSET).
+  * With AFFINE_ARRAYS enabled this is a lookup in the cumulative table POINTSoff_INS_ARRAY_C.
+  * @param len Insertion length; non-positive lengths score 0
+  * @return The shifted score contribution
+  */
+ private static int calcInsScoreOffset(int len){
+     if(len<=0){return 0;}
+     if(AFFINE_ARRAYS){
+         return POINTSoff_INS_ARRAY_C[len];
+     }
+
+     int remaining=len;
+     int total=POINTSoff_INS;
+
+     if(remaining>LIMIT_FOR_COST_4){
+         total+=(remaining-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         total+=(remaining-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         total+=(remaining-1)*POINTSoff_INS2;
+     }
+     return total;
+ }
+
+ /** Alignment matrix storage. NOTE(review): the cell layout of this flat array is defined outside this section - confirm before relying on it. */
+ private final int[]packed;
+ /** Buffer handed out by getGrefbuffer(); presumably holds the gapped reference - confirm against makeGref. */
+ private final byte[] grefbuffer;
+ /** Limit of the gapped reference; printed as "grl" in verbose output. -1 until set. */
+ private int greflimit=-1;
+ /** Exclusive upper bound used when walking the gapped reference in the translate*GappedCoordinate methods. -1 until set. */
+ private int greflimit2=-1;
+ /** Ungapped reference coordinate that corresponds to index 0 of the gapped reference. -1 until set. */
+ private int grefRefOrigin=-1;
+
+ @Override
+ /**DO NOT MODIFY*/
+ //Returns the internal grefbuffer array itself, not a copy, so any write by the caller changes this aligner's state.
+ public final byte[] getGrefbuffer(){
+ return grefbuffer;
+ }
+
+ /** Per-row limits, stored shifted left by SCOREOFFSET; showVertLimit() prints entries 0..rows. */
+ public final int[] vertLimit;
+ /** Per-column limits, stored shifted left by SCOREOFFSET; showHorizLimit() prints entries 0..columns. */
+ public final int[] horizLimit;
+
+ @Override
+ public CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ @Override
+ public CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ /**
+  * Converts a minimum identity into a minimum score ratio, using a fixed blend of
+  * substitution (70%), deletion (20%) and insertion (10%) costs as the average cost of a
+  * non-matching base.
+  * @param minid Minimum identity as a fraction in (0, 1]; values above 1 are treated as percent and divided by 100
+  * @return The minimum ratio, never below 0.1
+  */
+ public static float minIdToMinRatio(double minid){
+     if(minid>1){minid=minid/100;}
+     assert(minid>0 && minid<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100.";
+
+     final double firstMatchDelta=POINTS_MATCH-POINTS_MATCH2;
+     final double perMatch=POINTS_MATCH2;
+
+     //Average cost of each event type, relative to the match that it replaces.
+     final double subCost=-POINTS_MATCH2+0.5*(firstMatchDelta+POINTS_SUB)+0.5*POINTS_SUB2;
+     final double delCost=0.1*(firstMatchDelta+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4;
+     final double insCost=-POINTS_MATCH2+0.4*(firstMatchDelta+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3);
+
+     final double blendedCost=.7*subCost+.2*delCost+.1*insCost;
+     final double errorFraction=1-minid;
+     final double rawRatio=(perMatch+errorFraction*blendedCost)/perMatch;
+     assert(rawRatio<=1);
+
+     return (float)Tools.max(0.1, rawRatio);
+ }
+
+ //Bit layout of a packed int: the low TIMEBITS bits are the "time" field and the remaining
+ //high SCOREBITS bits are the score, so a score is stored as (score<<SCOREOFFSET).
+ public static final int TIMEBITS=11;
+ public static final int SCOREBITS=32-TIMEBITS;
+ //Largest value that fits in the time field.
+ public static final int MAX_TIME=((1<<TIMEBITS)-1);
+ //Largest score, leaving 2000 points of headroom below the signed SCOREBITS-bit maximum.
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000;
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+ public static final int SCOREOFFSET=TIMEBITS;
+
+ //Masks selecting the time field (low bits) and the score field (high bits) of a packed int.
+ public static final int TIMEMASK=~((-1)<<TIMEBITS);
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET;
+
+ //Alignment state identifiers: match/substitution, deletion, insertion, and substitution.
+ private static final byte MODE_MS=0;
+ private static final byte MODE_DEL=1;
+ private static final byte MODE_INS=2;
+ private static final byte MODE_SUB=3;
+
+ //Unshifted point values. For SUB, INS and DEL the unnumbered constant is charged for the first
+ //base of an event and the numbered ones for later bases, switching tiers at the LIMIT_FOR_COST_* lengths.
+ public static final int POINTS_NOREF=0;
+ public static final int POINTS_NOCALL=0;
+ //POINTS_MATCH is for the first match of a run; POINTS_MATCH2 is for each match that directly follows a match.
+ public static final int POINTS_MATCH=70;
+ public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-127;
+ public static final int POINTS_SUBR=-147; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-51;
+ public static final int POINTS_SUB3=-25;
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-395;
+ public static final int POINTS_INS2=-39;
+ public static final int POINTS_INS3=-23;
+ public static final int POINTS_INS4=-8;
+ public static final int POINTS_DEL=-472;
+ public static final int POINTS_DEL2=-33;
+ public static final int POINTS_DEL3=-9;
+ public static final int POINTS_DEL4=-1;
+ //Charged once per TIMESLIP bases (rounded up) of a deletion beyond LIMIT_FOR_COST_5.
+ public static final int POINTS_DEL5=-1;
+ public static final int POINTS_DEL_REF_N=-10;
+ //Charged once per GAPLEN-sized chunk of a long, approximated deletion.
+ public static final int POINTS_GAP=0-GAPCOST;
+
+ //TIMESLIP must be a power of two (asserted below); MASK5 is TIMESLIP-1.
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1;
+ static{assert(Integer.bitCount(TIMESLIP)==1);}
+
+ private static final int BARRIER_I1=2;
+ private static final int BARRIER_D1=3;
+
+ //Event lengths above which the next, cheaper per-base penalty tier applies.
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=20;
+ public static final int LIMIT_FOR_COST_5=80;
+
+ //Sentinel score one point below MIN_SCORE.
+ public static final int BAD=MIN_SCORE-1;
+
+ //The same point values pre-shifted left by SCOREOFFSET, so they can be added directly to packed ints.
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ //Shifted forms of the sentinel and of the score bounds.
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+ //When true, insertion and substitution penalties are taken from the lookup tables below instead of being summed tier by tier.
+ public static final boolean AFFINE_ARRAYS=true;
+ //Tables filled by the static initializer. Index i of a plain table is the penalty for the i-th base
+ //of an event; index i of a "_C" table is the cumulative penalty for an event of length i.
+ //The "off" variants hold the same values shifted left by SCOREOFFSET. Index 0 is left at 0.
+ public static final int[] POINTS_INS_ARRAY;
+ public static final int[] POINTSoff_INS_ARRAY;
+ public static final int[] POINTS_INS_ARRAY_C;
+ public static final int[] POINTSoff_INS_ARRAY_C;
+
+ public static final int[] POINTS_SUB_ARRAY;
+ public static final int[] POINTSoff_SUB_ARRAY;
+ public static final int[] POINTS_SUB_ARRAY_C;
+ public static final int[] POINTSoff_SUB_ARRAY_C;
+
+ static{
+ POINTS_INS_ARRAY=new int[604];
+ POINTSoff_INS_ARRAY=new int[604];
+ POINTS_INS_ARRAY_C=new int[604];
+ POINTSoff_INS_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_INS_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_4){
+ pts=POINTS_INS4;
+ ptsoff=POINTSoff_INS4;
+ }else if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_INS3;
+ ptsoff=POINTSoff_INS3;
+ }else if(i>1){
+ pts=POINTS_INS2;
+ ptsoff=POINTSoff_INS2;
+ }else{
+ pts=POINTS_INS;
+ ptsoff=POINTSoff_INS;
+ }
+ POINTS_INS_ARRAY[i]=pts;
+ POINTSoff_INS_ARRAY[i]=ptsoff;
+ POINTS_INS_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_INS_ARRAY_C[i-1]);
+ POINTSoff_INS_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_INS_ARRAY_C[i-1]);
+ }
+
+ POINTS_SUB_ARRAY=new int[604];
+ POINTSoff_SUB_ARRAY=new int[604];
+ POINTS_SUB_ARRAY_C=new int[604];
+ POINTSoff_SUB_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_SUB_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_SUB3;
+ ptsoff=POINTSoff_SUB3;
+ }else if(i>1){
+ pts=POINTS_SUB2;
+ ptsoff=POINTSoff_SUB2;
+ }else{
+ pts=POINTS_SUB;
+ ptsoff=POINTSoff_SUB;
+ }
+ POINTS_SUB_ARRAY[i]=pts;
+ POINTSoff_SUB_ARRAY[i]=ptsoff;
+ POINTS_SUB_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_SUB_ARRAY_C[i-1]);
+ POINTSoff_SUB_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_SUB_ARRAY_C[i-1]);
+ }
+ }
+
+ //Instance accessors that return this class's static scoring constants of the same name.
+ //NOTE(review): presumably these exist so code holding the superclass type can read the
+ //per-aligner values - confirm against the superclass declaration.
+ public final int POINTS_NOREF(){return POINTS_NOREF;}
+ public final int POINTS_NOCALL(){return POINTS_NOCALL;}
+ public final int POINTS_MATCH(){return POINTS_MATCH;}
+ public final int POINTS_MATCH2(){return POINTS_MATCH2;}
+ public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;}
+ public final int POINTS_SUB(){return POINTS_SUB;}
+ public final int POINTS_SUBR(){return POINTS_SUBR;}
+ public final int POINTS_SUB2(){return POINTS_SUB2;}
+ public final int POINTS_SUB3(){return POINTS_SUB3;}
+ public final int POINTS_MATCHSUB(){return POINTS_MATCHSUB;}
+ public final int POINTS_INS(){return POINTS_INS;}
+ public final int POINTS_INS2(){return POINTS_INS2;}
+ public final int POINTS_INS3(){return POINTS_INS3;}
+ public final int POINTS_INS4(){return POINTS_INS4;}
+ public final int POINTS_DEL(){return POINTS_DEL;}
+ public final int POINTS_DEL2(){return POINTS_DEL2;}
+ public final int POINTS_DEL3(){return POINTS_DEL3;}
+ public final int POINTS_DEL4(){return POINTS_DEL4;}
+ public final int POINTS_DEL5(){return POINTS_DEL5;}
+ public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;}
+ public final int POINTS_GAP(){return POINTS_GAP;}
+
+ public final int TIMESLIP(){return TIMESLIP;}
+ public final int MASK5(){return MASK5;}
+ /** Returns the SCOREOFFSET constant. Reads the field; the previous body called this method itself and could only end in StackOverflowError. */
+ public final int SCOREOFFSET(){return SCOREOFFSET;}
+
+ //Package-private accessors for the private barrier constants.
+ final int BARRIER_I1(){return BARRIER_I1;}
+ final int BARRIER_D1(){return BARRIER_D1;}
+
+ //Accessors for the tier limits and the BAD sentinel.
+ public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;}
+ public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_4;}
+ public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_5;}
+
+ public final int BAD(){return BAD;}
+
+ //Row and column counts used as the inclusive upper index by showVertLimit() and showHorizLimit().
+ //NOTE(review): presumably set by the fill methods for the current alignment - confirm.
+ private int rows;
+ private int columns;
+
+}
diff --git a/current/align2/MultiStateAligner9Flat.java b/current/align2/MultiStateAligner9Flat.java
new file mode 100755
index 0000000..7b43426
--- /dev/null
+++ b/current/align2/MultiStateAligner9Flat.java
@@ -0,0 +1,2541 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+
+/**
+ * Based on MSA9ts, with transform scores tweaked for PacBio. */
+public final class MultiStateAligner9Flat extends MSA{
+
+
+ /**
+  * Command-line demo: aligns args[0] (the read) against args[1] (the reference), printing the
+  * matrix before and after filling, the fill result, the traceback and the score.
+  * @param args args[0] is the read sequence, args[1] is the reference sequence
+  */
+ public static void main(String[] args){
+     final byte[] read=args[0].getBytes();
+     final byte[] ref=args[1].getBytes();
+     final int refEnd=ref.length-1;
+
+     final MultiStateAligner9Flat msa=new MultiStateAligner9Flat(read.length, ref.length);
+
+     System.out.println("Initial: ");
+     printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+
+     final int[] max=msa.fillLimited(read, ref, 0, refEnd, 0, null);
+
+     System.out.println("Max: "+Arrays.toString(max));
+
+     System.out.println("Final: ");
+     printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+
+     //max[0..2] are passed straight through to traceback and score.
+     final byte[] out=msa.traceback(read, ref, 0, refEnd, max[0], max[1], max[2], false);
+     final int[] score=msa.score(read, ref, 0, refEnd, max[0], max[1], max[2], false);
+
+     System.out.println(new String(ref));
+     System.out.println(new String(read));
+     System.out.println(new String(out));
+     System.out.println("Score: "+Arrays.toString(score));
+ }
+
+
+ /**
+  * Allocates and seeds the alignment matrices.
+  * Three matrices of (maxRows+1) x (maxColumns+1) packed ints are created, one per alignment
+  * state. Rows 1..maxRows are seeded with BADoff, row 0 is left at zero, and column 0 of every
+  * row is set to the cumulative shifted insertion penalty for that many bases.
+  * @param maxRows_ Passed to the superclass constructor; presumably the maximum read length - confirm
+  * @param maxColumns_ Passed to the superclass constructor; presumably the maximum reference window length - confirm
+  * @throws RuntimeException if the arrays cannot be allocated (wraps the OutOfMemoryError text)
+  */
+ public MultiStateAligner9Flat(int maxRows_, int maxColumns_){
+ super(maxRows_, maxColumns_);
+
+ {
+ //Allocate into temporaries so that each final field is assigned exactly once, after the try block.
+ int[][][] packed0=null;
+ byte[] grefbuffer0=null;
+ int[] vertLimit0=null;
+ int[] horizLimit0=null;
+
+ try {
+ packed0=new int[3][maxRows+1][maxColumns+1];
+ grefbuffer0=new byte[maxColumns+2];
+ vertLimit0=new int[maxRows+1];
+ horizLimit0=new int[maxColumns+1];
+ } catch (OutOfMemoryError e) {
+ //Drop any partial allocations before reporting the failure.
+ packed0=null;
+ grefbuffer0=null;
+ vertLimit0=null;
+ horizLimit0=null;
+ throw new RuntimeException(e.toString());
+ }
+
+ packed=packed0;
+ grefbuffer=grefbuffer0;
+ vertLimit=vertLimit0;
+ horizLimit=horizLimit0;
+ }
+
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+// for(int i=0; i<maxColumns+1; i++){
+// scores[0][i]=0-i;
+// }
+
+ for(int matrix=0; matrix<packed.length; matrix++){
+ //Freshly allocated cells are zero, so OR-ing in BADoff sets every cell of rows 1..maxRows to BADoff.
+ for(int i=1; i<=maxRows; i++){
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff;
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ //Column 0: cumulative insertion penalty for i bases, switching tiers at LIMIT_FOR_COST_3 and LIMIT_FOR_COST_4.
+ for(int i=0; i<=maxRows; i++){
+
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+ int score=(i<2 ? (i*POINTSoff_INS) :
+ (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 :
+ (i<LIMIT_FOR_COST_4 ? prevScore+POINTSoff_INS3 : prevScore+POINTSoff_INS4)));
+
+ packed[matrix][i][0]=score;
+ }
+// for(int i=1; i<maxColumns+1; i++){
+// prevState[matrix][0][i]=MODE_DEL;
+// }
+// for(int i=0; i<=maxColumns; i++){
+// packed[matrix][0][i]|=MODE_MS;
+// }
+ }
+ }
+
+ @Override
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc, read);
+
+ if(verbose && greflimit>0 && greflimit<500){
+ System.err.println(new String(gref, 0, greflimit));
+ }
+
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillLimitedX(read, gref, 0, greflimit, minScore);
+ }
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ if(verbose){System.err.println("fillLimitedX");}
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 :
+ Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2;
+
+ if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){
+// assert(false) : minScore;
+// assert(minScore>0) : minScore;
+// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length);
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ if(verbose){
+ System.err.println("Clearing matrix due to verbose mode.");
+ for(int x=0; x<packed.length; x++){
+ for(int y=1; y<rows+1; y++){
+ Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+ }
+ }
+ }
+
+ for(int x=0; x<packed.length; x++){
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int floor=minScore_off-maxGain;
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2;
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ vertLimit[rows]=minScore_off;
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+
+ horizLimit[columns]=minScore_off;
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+// vertLimit[rows]=minScore_off;
+// for(int i=rows-1; i>=0; i--){
+// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+//
+// horizLimit[columns]=minScore_off;
+// for(int i=columns-1; i>=0; i--){
+// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+
+ for(int row=1; row<=rows; row++){
+
+ final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband));
+ final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1));
+
+ minGoodCol=-1;
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;}
+
+
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.err.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ if(col>=colStop){
+ if(col>colStop && (maxGoodCol<col || halfband>0)){break;}
+ if(row>1){
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+
+ if(verbose){
+ System.out.println("Filled matrix.");
+ printMatrix(packed, rows, columns, TIMEMASK, SCOREOFFSET);
+ }
+ if(verbose){System.err.println("maxscore="+(maxScore>>SCOREOFFSET)+", minscore="+(minScore_off>>SCOREOFFSET));}
+
+ if(maxScore<minScore_off){
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET;
+
+ if(verbose){
+ System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ }
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Override
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc, read);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit);
+ }
+ }
+
+
+ /** Fills the entire rows x columns scoring matrix with no minimum score and no banding
+ * (ie, same as old method), then selects the best-scoring cell of the last row.
+ * Three states are tracked per cell: MODE_MS (match/substitution), MODE_DEL and MODE_INS.
+ * Each cell of packed holds a score (SCOREMASK bits) OR'ed with a streak counter (TIMEMASK bits).
+ * Side effects: sets the fields rows and columns, overwrites packed[state][1..rows][1..columns],
+ * and increments iterationsUnlimited once per cell.
+ * Row 0 and column 0 of packed are read but never written here; presumably they are initialized elsewhere (TODO confirm).
+ * @param read Query bases; one matrix row per base
+ * @param ref Reference bases; may contain the GAPC symbol
+ * @param refStartLoc Index in ref corresponding to matrix column 1
+ * @param refEndLoc Index in ref of the last column (inclusive)
+ * @return new int[] {rows, maxC, maxS, max}: the last row, the column and state of the best cell in that row,
+ * and that cell's score shifted right by SCOREOFFSET
+ * @throws RuntimeException if rows>maxRows or columns>maxColumns */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length; //One matrix row per read base
+ columns=refEndLoc-refStartLoc+1; //One matrix column per reference position; refEndLoc is inclusive
+
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //Score of an unbroken run of matches: POINTSoff_MATCH for the first base, POINTSoff_MATCH2 for each later one
+ final int subfloor=0-2*maxGain; //Sentinel score stored in cells that are disallowed (gap symbol or barrier)
+ assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n"
+ +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more.
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1; //Rows greater than BARRIER_I2 may only take insertions in columns >= BARRIER_I2b
+ final int BARRIER_D2=rows-BARRIER_D1; //Deletions are blocked in rows greater than this (and in rows less than BARRIER_D1)
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){ //Matrix indices are 1-based; read[row-1] is the base for this row
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){ //ref[refStartLoc+col-1] is the base for this column
+ iterationsUnlimited++;
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]); //Previous read base, or the placeholder '?' in the first row
+ final byte call1=read[row-1]; //Current read base
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); //Previous ref base, or the placeholder '!' in the first column
+ final byte ref1=ref[refStartLoc+col-1]; //Current ref base
+
+ final boolean match=(call1==ref1 && ref1!='N'); //A reference 'N' never counts as a match
+ final boolean prevMatch=(call0==ref0 && ref0!='N'); //Same test applied to the previous pair of bases
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+ if(gap){ //A gap symbol cannot be matched or substituted
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; //All three candidates come from the diagonal neighbor
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //Streak counter stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); //A match directly after a match earns POINTSoff_MATCH2
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1); //Extend the match streak, or start a new one
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Step the counter back by MASK5 so it still fits the time field (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //Score and streak counter share one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){ //True substitution; the penalty tier depends on prevMatch and the streak
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{ //Either base is 'N'
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read; the statement that stored it in packed is commented out
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1); //Start a new streak right after a match, otherwise extend the current one
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ if(row<BARRIER_D1 || row>BARRIER_D2){ //No deletions in the first or last few rows
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //Length counter of the deletion ending in the left neighbor
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; //Despite the name, this is the MS score of the left neighbor
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL; //Open a new deletion
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //At or past LIMIT_FOR_COST_5, POINTSoff_DEL5 is charged only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){ //Extra adjustment when the skipped reference base is 'N'
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){ //Extra adjustment when the skipped reference symbol is a gap symbol
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read
+ if(scoreMS>=scoreD){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //Calculate INS score
+// if(gap || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){ //Near the read start insertions are allowed only in column 1; near the read end only in columns >= BARRIER_I2b
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //Length counter of the insertion ending in the upper neighbor
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; //Despite the name, this is the MS score of the upper neighbor
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS; //Open a new insertion
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4); //Extension cost tier is chosen by the current insertion length
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read
+ if(scoreMS>=scoreI){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //Scan the last row of every state for the best score
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //Strict comparison: ties keep the earlier state and the lower column
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Convert from the shifted representation to a plain score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /** Quality-weighted matrix fill: every match cell additionally receives baseScores[row-1]<<SCOREOFFSET.
+ * Disabled: the first statement is assert(false), so with assertions enabled this always throws AssertionError.
+ * Unlike the other fill methods visible nearby, this one has no special handling of 'N' or gap symbols,
+ * applies no insertion/deletion barriers, and performs no score cutoff.
+ * Side effects: sets the fields rows and columns and overwrites packed[state][1..rows][1..columns].
+ * Row 0 and column 0 of packed are read but never written here; presumably they are initialized elsewhere (TODO confirm).
+ * @param read Query bases; one matrix row per base
+ * @param ref Reference bases
+ * @param baseScores Per-read-base score modifier, indexed like read
+ * @param refStartLoc Index in ref corresponding to matrix column 1
+ * @param refEndLoc Index in ref of the last column (inclusive)
+ * @return new int[] {rows, maxC, maxS, max}: the last row, the column and state of the best cell in that row,
+ * and that cell's score shifted right by SCOREOFFSET */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult.";
+ rows=read.length; //One matrix row per read base
+ columns=refEndLoc-refStartLoc+1; //One matrix column per reference position; refEndLoc is inclusive
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){ //Matrix indices are 1-based; read[row-1] is the base for this row
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){ //ref[refStartLoc+col-1] is the base for this column
+
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]); //Plain byte equality; 'N' is not treated specially here
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); //False in the first row and the first column
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; //All three candidates come from the diagonal neighbor
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //Streak counter stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); //A match directly after a match earns POINTSoff_MATCH2
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //Ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1); //Extend the match streak, or start a new one
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Step the counter back by MASK5 so it still fits the time field (asserted below)
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //Score and streak counter share one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3)); //Penalty tier depends on prevMatch and the streak
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read; the statement that stored it in packed is commented out
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1); //Start a new streak right after a match, otherwise extend the current one
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //Length counter of the deletion ending in the left neighbor
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; //Despite the name, this is the MS score of the left neighbor
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL; //Open a new deletion
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //At or past LIMIT_FOR_COST_5, POINTSoff_DEL5 is charged only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read
+ if(scoreMS>=scoreD){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //Length counter of the insertion ending in the upper neighbor
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; //Despite the name, this is the MS score of the upper neighbor
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS; //Open a new insertion
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4); //Extension cost tier is chosen by the current insertion length
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState; //Assigned below but never read
+ if(scoreMS>=scoreI){ //Ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //Scan the last row of every state for the best score
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //Strict comparison: ties keep the earlier state and the lower column
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Convert from the shifted representation to a plain score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ /** Generates the match string for an alignment whose matrix has already been filled.
+  * For an ungapped alignment the arguments are passed straight to traceback2.
+  * For a gapped alignment the reference coordinates are first translated into
+  * coordinates of the gapped reference buffer (grefbuffer), and traceback2 runs against that buffer.
+  * @param gapped true if the matrix was filled against the gapped reference
+  * @return the match string produced by traceback2 */
+ @Override
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+     if(!gapped){
+         return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state);
+     }
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+     return traceback2(read, gappedRef, gappedStart, gappedStop, row, col, state);
+ }
+
+ /** Generates the match string by walking backward through the filled matrix from (row, col, state).
+  * One symbol is emitted per step:
+  * 'm' match; 'S' substitution; 'N' mismatch where the read or reference base is not fully defined;
+  * 'D' deletion; 'I' insertion; 'Y' insertion at or beyond the last column;
+  * 'X' read base left over once column 0 has been reached.
+  * A deletion step over a gap symbol in ref is expanded to GAPLEN 'D' symbols in the result.
+  * The predecessor state of a cell is not stored; it is recomputed. If the cell's time field is
+  * greater than 1 the state is unchanged, otherwise the best-scoring neighboring state is chosen
+  * with the same tie-breaking order the fill uses (MS, then DEL, then INS).
+  * @param read Query bases
+  * @param ref Reference bases (the gapped reference when the alignment was gapped)
+  * @param refStartLoc Index in ref corresponding to matrix column 1
+  * @param refEndLoc Only checked to be at least refStartLoc
+  * @param row Starting row; must equal rows
+  * @param col Starting column
+  * @param state Starting state (MODE_MS, MODE_DEL or MODE_INS)
+  * @return the match string in left-to-right order */
+ @Override
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+     assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+     assert(row==rows);
+
+     //Worst case needs row+col symbols, not row+col-1: a path containing no diagonal step that
+     //reaches column 0 writes col symbols in the main loop and then one 'X' per remaining row.
+     //The buffer is trimmed to outPos below, so the extra byte never reaches the caller.
+     byte[] out=new byte[row+col];
+     int outPos=0;
+
+     int gaps=0; //Number of gap placeholders written; each is expanded to GAPLEN symbols at the end
+
+     while(row>0 && col>0){
+
+         final int time=packed[state][row][col]&TIMEMASK;
+         final byte prev;
+
+         if(state==MODE_MS){
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+                 else{prev=MODE_INS;}
+             }
+
+             byte c=read[row-1];
+             byte r=ref[refStartLoc+col-1];
+             if(c==r){
+                 out[outPos]='m';
+             }else{
+                 if(!AminoAcid.isFullyDefined(c)){
+                     out[outPos]='N';
+                 }else if(!AminoAcid.isFullyDefined(r)){
+                     out[outPos]='N';
+                 }else{
+                     out[outPos]='S';
+                 }
+             }
+
+             row--;
+             col--;
+         }else if(state==MODE_DEL){
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+                 else{prev=MODE_DEL;}
+             }
+
+             byte r=ref[refStartLoc+col-1];
+             if(r==GAPC){
+                 //Write the same constant the expansion loop below tests for
+                 out[outPos]=GAPC;
+                 gaps++;
+             }else{
+                 out[outPos]='D';
+             }
+             col--;
+         }else{
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else{prev=MODE_INS;}
+             }
+
+             assert(state==MODE_INS) : state;
+             if(col==0){
+                 //Cannot happen while the loop condition requires col>0; kept for safety
+                 out[outPos]='X';
+             }else if(col>=columns){
+                 out[outPos]='Y';
+             }else{
+                 out[outPos]='I';
+             }
+             row--;
+         }
+
+         state=prev;
+         outPos++;
+     }
+
+     assert(row==0 || col==0);
+     //Read bases remaining after column 0 was reached have no reference partner
+     if(col!=row){
+         while(row>0){
+             out[outPos]='X';
+             outPos++;
+             row--;
+             col--;
+         }
+     }
+
+
+     //Shrink and reverse the string
+     byte[] out2=new byte[outPos];
+     for(int i=0; i<outPos; i++){
+         out2[i]=out[outPos-i-1];
+     }
+     out=null;
+
+     if(gaps==0){return out2;}
+
+     //TODO Consider outputting this compressed.
+     //Replace every gap placeholder with GAPLEN deletion symbols
+     byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)];
+     for(int i=0, j=0; i<out2.length; i++){
+         byte c=out2[i];
+         if(c!=GAPC){
+             out3[j]=c;
+             j++;
+         }else{
+             int lim=j+GAPLEN;
+             for(; j<lim; j++){
+                 out3[j]='D';
+             }
+         }
+     }
+     return out3;
+ }
+
+ /** Scores the alignment ending at (maxRow, maxCol, maxState) by delegating to score2.
+  * For a gapped alignment the reference coordinates are translated into the gapped reference
+  * buffer (grefbuffer) before the call, and elements 1 and 2 of the result are translated back
+  * to ungapped coordinates afterwards.
+  * @param gapped true if the matrix was filled against the gapped reference
+  * @return {score, bestRefStart, bestRefStop} */
+ @Override
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+         final int maxRow, final int maxCol, final int maxState, boolean gapped){
+     if(!gapped){
+         return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState);
+     }
+
+     if(verbose){
+         System.err.println("score():");
+         System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+     }
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+
+     //Round-trip checks on the coordinate translation
+     assert(translateFromGappedCoordinate(gappedStart, gappedRef)==refStartLoc); //TODO: Remove slow assertions
+     assert(translateFromGappedCoordinate(gappedStop, gappedRef)==refEndLoc);
+
+     assert(gappedStart==0) : gappedStart; //TODO: skip translation if this is always zero
+
+     if(verbose){System.err.println("gstart, gstop: "+gappedStart+", "+gappedStop);}
+     final int[] result=score2(read, gappedRef, gappedStart, gappedStop, maxRow, maxCol, maxState);
+     if(verbose){System.err.println("got score "+Arrays.toString(result));}
+
+     assert(result[1]==translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef)) :
+         "Verifying: "+result[1]+" -> "+translateFromGappedCoordinate(result[1], gappedRef)+" -> "+
+         translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef);
+     assert(result[2]==translateToGappedCoordinate(translateFromGappedCoordinate(result[2], gappedRef), gappedRef));
+
+     //Report the reference start and stop in ungapped coordinates
+     result[1]=translateFromGappedCoordinate(result[1], gappedRef);
+     result[2]=translateFromGappedCoordinate(result[2], gappedRef);
+     if(verbose){System.err.println("returning score "+Arrays.toString(result));}
+     return result;
+ }
+
+ @Override
+ /** Traces back through the packed matrix, starting at cell (maxState, maxRow, maxCol),
+ * to find the reference coordinates at which the alignment starts and stops.
+ * The read and ref arrays are only used here to build assertion messages.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param refStartLoc Reference location of the window start; matrix column c maps to refStartLoc+c-1
+ * @param refEndLoc Reference location of the window end; used for assertions and padding decisions
+ * @param maxRow Row of the cell to trace back from
+ * @param maxCol Column of the cell to trace back from
+ * @param maxState State (MODE_MS, MODE_DEL or MODE_INS) of the cell to trace back from
+ * @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}, <br>
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight} <br>
+ * if more padding is needed */
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ //The score is kept in its packed (shifted) form until just before returning.
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ //If the starting cell is not in the last row, extend the end point diagonally toward the last row.
+ if(row<rows){
+ int difR=rows-row;
+ int difC=columns-col;
+
+ //Rows that cannot be matched by remaining columns are charged the no-reference score.
+ while(difR>difC){
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+
+ row+=difR;
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ //NOTE(review): if the while loop above decremented difR, row ends up below rows and this assertion fails - confirm intent.
+ assert(row==rows);
+
+
+ final int bestRefStop=refStartLoc+col-1;
+
+ if(verbose){System.err.println("Scoring.");}
+
+ //Number of consecutive traceback steps spent in the current state.
+ int stateTime=0;
+
+ while(row>0 && col>0){
+
+ if(verbose){System.err.println("state="+state+", row="+row+", col="+col);}
+
+ //The time field is the low bits of the cell; a value above 1 means the previous cell was in the same state.
+ final int time=packed[state][row][col]&TIMEMASK;
+ final byte prev;
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;}
+ else{
+ //Choose the highest-scoring predecessor; ties prefer MS, then DEL, then INS.
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ //A match/substitution step moves diagonally.
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ //A deletion step consumes a column only.
+ col--;
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ //An insertion step consumes a row only.
+ row--;
+ }
+
+ //NOTE(review): the loop guard requires col>0 and col is decremented at most once per pass,
+ //so col cannot be negative here; this branch looks unreachable.
+ if(col<0){
+ if(verbose){
+ System.err.println("Warning, column went below 0 at row="+row);
+ }
+ break; //prevents an out of bounds access
+ }
+
+// assert(prev==prev0);
+ if(state==prev){stateTime++;}else{stateTime=0;}
+ state=prev;
+
+ if(verbose){System.err.println("state2="+state+", time="+time+", stateTime="+stateTime+", row2="+row+", col2="+col+"\n");}
+ }
+// assert(false) : row+", "+col;
+ //If rows remain after the columns ran out, shift the start left by the remaining rows.
+ //This can place bestRefStart before refStartLoc, which is reported as padLeft below.
+ if(row>col){
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ //Unpack the score from its shifted form.
+ score>>=SCOREOFFSET;
+
+ if(verbose){
+ System.err.println("bestRefStart="+bestRefStart+", refStartLoc="+refStartLoc);
+ System.err.println("bestRefStop="+bestRefStop+", refEndLoc="+refEndLoc);
+ }
+
+ //Padding is suggested when the alignment runs past either end of the window,
+ //or when it touches an end while in the insertion state.
+ int padLeft=0;
+ int padRight=0;
+ if(bestRefStart<refStartLoc){
+ padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ }else if(bestRefStart==refStartLoc && state==MODE_INS){
+ padLeft=stateTime;
+ }
+ if(bestRefStop>refEndLoc){
+ padRight=Tools.max(0, bestRefStop-refEndLoc);
+ }else if(bestRefStop==refEndLoc && maxState==MODE_INS){
+ //Uses the time field of the starting cell as the amount of right padding.
+ padRight=packed[maxState][maxRow][maxCol]&TIMEMASK;
+ }
+
+ int[] rvec;
+ if(padLeft>0 || padRight>0){ //Suggest extra padding in cases of overflow
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState};
+ }
+ return rvec;
+ }
+
+ /**
+ * Fills grefbuffer with a compressed ("gapped") copy of the reference region described by gaps,
+ * and updates greflimit, greflimit2 and grefRefOrigin.
+ * The first and last entries of gaps are temporarily widened to cover refStartLoc and refEndLoc,
+ * and are restored before the normal return.
+ * @param ref Reference bases
+ * @param gaps Reference ranges, read as consecutive inclusive (start, stop) pairs
+ * @param refStartLoc First reference location that must be covered
+ * @param refEndLoc Last reference location that must be covered
+ * @param read Read bases; only used in an assertion message
+ * @return gref (the shared grefbuffer)
+ */
+ private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc, byte[] read){
+ assert(gaps!=null && gaps.length>0);
+
+ assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps);
+ assert(refEndLoc>=gaps[gaps.length-1]);
+
+ //Remember the caller's end points so they can be restored at the end.
+ final int g0_old=gaps[0];
+ final int gN_old=gaps[gaps.length-1];
+ gaps[0]=Tools.min(gaps[0], refStartLoc);
+ gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+ //Reference location that corresponds to index 0 of gref.
+ grefRefOrigin=gaps[0];
+
+ if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+// grefRefOrigin=Tools.min(gaps[0], refStartLoc);
+
+// //This block is no longer needed since the array is preallocated.
+// int len=0;
+// final int gb2=GAPBUFFER*2;
+// for(int i=0; i<gaps.length; i+=2){
+// int x=gaps[i];
+// int y=gaps[i+1];
+// len+=(y-x+1);
+// if(i+2<gaps.length){
+// int z=gaps[i+2];
+// assert(z>y);
+// int gap=z-y-1;
+// if(gap<MINGAP){
+// len+=gap;
+// }else{
+// len+=gb2;
+// gap-=gb2;
+// int div=gap/GAPLEN;
+// int rem=gap%GAPLEN;
+// len+=(div+rem);
+// }
+// }
+// }
+ byte[] gref=grefbuffer;
+
+ //Next index of gref to write.
+ int gpos=0;
+ for(int i=0; i<gaps.length; i+=2){
+ int x=gaps[i];
+ int y=gaps[i+1];
+
+ //Copy the covered range x..y verbatim.
+ for(int r=x; r<=y; r++, gpos++){
+ //TODO: if out of bounds, use an 'N'
+ assert(gpos<gref.length) :
+ "\ngpos="+gpos+", gref.length="+gref.length+", read.length="+read.length+", gaps2="+Arrays.toString(gaps)+
+ "\ni="+i+", r="+r+", x="+x+", y="+y+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\n"+refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref)+"\n"+new String(read)+"\n";
+ gref[gpos]=ref[r];
+ }
+
+ //Compress the stretch between this range and the next one.
+ if(i+2<gaps.length){
+ int z=gaps[i+2];
+ assert(z>y);
+ int gap=z-y-1;
+ assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+ if(gap<MINGAP){
+ assert(false) : "TODO - just fill in normally";
+ }else{
+ int rem=gap%GAPLEN;
+ int lim=y+GAPBUFFER+rem;
+
+ //Number of gap symbols to emit.
+ int div=(gap-GAPBUFFER2)/GAPLEN;
+ if(verbose){
+ System.err.println("div = "+div);
+ }
+ assert(div>0);
+
+ //Real bases following y: GAPBUFFER plus the remainder that does not fit a whole gap symbol.
+ for(int r=y+1; r<=lim; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ //One GAPC symbol per gap unit of GAPLEN reference bases.
+ for(int g=0; g<div; g++, gpos++){
+ gref[gpos]=GAPC;
+ }
+ //Real bases immediately preceding z.
+ for(int r=z-GAPBUFFER; r<z; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ }
+ }
+ }
+
+ //Number of symbols written so far.
+ greflimit=gpos;
+
+ assert(gref[gpos-1]==ref[refEndLoc]);
+// assert(greflimit+GREFLIMIT2_CUSHION<=gref.length) : refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref);
+ //Add a cushion to the end to clear out the prior data (especially GAPC) that was there
+ {
+ final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+ //NOTE(review): lim is already clamped to gref.length above, so this condition can never be true
+ //and the overflow branch is unreachable; the cushion is silently truncated instead.
+ if(lim>gref.length){
+ System.err.println("gref buffer overflow: "+lim+" > "+gref.length);
+ return null;
+ }
+ //Cushion positions beyond the end of ref are filled with 'N'.
+ //greflimit2 ends up as the last index written; if this loop does not run it keeps its previous value.
+ for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){
+ gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+ greflimit2=i;
+ }
+ }
+
+ if(verbose){
+ System.err.println("gref:\n"+new String(gref));
+ }
+
+ //Restore the caller's original end points.
+ gaps[0]=g0_old;
+ gaps[gaps.length-1]=gN_old;
+
+ if(verbose){
+ System.err.println("\ngaps3: "+Arrays.toString(gaps));
+ }
+
+ return gref;
+ }
+
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score){
+//// {score, bestRefStart, bestRefStop}
+// int a=score[1];
+// int b=score[2];
+// int a2=-9999;
+// int b2=-9999;
+// for(int i=0, j=grefRefOrigin; i<grefbuffer.length; i++){
+// byte c=grefbuffer[i];
+//
+// if(i==a){a2=j;}
+// if(i==b){
+// b2=j;
+// assert(a2!=-9999);
+// score[1]=a2;
+// score[2]=b2;
+// return score;
+// }
+//
+// j+=(c==GAPC ? GAPLEN : 1);
+//// if(c!=GAPC){j++;}
+//// else{j+=GAPLEN;}
+// }
+// throw new RuntimeException("Out of bounds.");
+// }
+
+ /**
+ * Converts an index into the gapped reference buffer back to a reference location.
+ * Walks gref from index 0, advancing the reference location by GAPLEN for each GAPC symbol
+ * and by 1 for any other symbol.
+ * @param point Index into gref; values at or below 0 are treated as plain offsets from grefRefOrigin
+ * @param gref Gapped reference buffer
+ * @return Reference location corresponding to point
+ * @throws RuntimeException if point is not reached before greflimit2
+ */
+ private final int translateFromGappedCoordinate(int point, byte[] gref){
+     if(verbose){
+         System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);
+     }
+     if(point<=0){
+         return grefRefOrigin+point;
+     }
+
+     int refPos=grefRefOrigin;
+     int gPos=0;
+     while(gPos<greflimit2){
+         final byte symbol=gref[gPos];
+         assert(point>=gPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+         if(gPos==point){
+             if(verbose){
+                 System.err.println(" -> "+refPos);
+             }
+             return refPos;
+         }
+
+         //A gap symbol stands for GAPLEN reference bases; anything else stands for one.
+         if(symbol==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos+=1;
+         }
+         gPos++;
+     }
+
+     //The point was never reached; dump state before failing.
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+ /**
+ * Converts a reference location to its index in the gapped reference buffer.
+ * Walks gref from index 0, advancing the reference location by GAPLEN for each GAPC symbol
+ * and by 1 for any other symbol, until the location equals point.
+ * @param point Reference location; values at or below grefRefOrigin are treated as plain offsets
+ * @param gref Gapped reference buffer
+ * @return Index into gref corresponding to point
+ * @throws RuntimeException if point is not reached before greflimit2
+ */
+ private final int translateToGappedCoordinate(int point, byte[] gref){
+     if(verbose){
+         System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);
+     }
+     if(point<=grefRefOrigin){
+         return point-grefRefOrigin;
+     }
+
+     int refPos=grefRefOrigin;
+     int gPos=0;
+     while(gPos<greflimit2){
+         assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+         final byte symbol=gref[gPos];
+
+         if(refPos==point){
+             if(verbose){
+                 System.err.println(" -> "+gPos);
+             }
+             return gPos;
+         }
+
+         //A gap symbol stands for GAPLEN reference bases; anything else stands for one.
+         if(symbol==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos+=1;
+         }
+         gPos++;
+     }
+
+     //The point was never reached; dump state before failing.
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+
+ /** Calculates score based on an array from Index.
+ * Each element of locArray is handled in order: a value above zero is scored as a match
+ * (possibly preceded by a deletion or insertion, judged by comparing it with the last positive value),
+ * and any other value is scored as a substitution.
+ * @param locArray Per-position location values (presumably one per read base - TODO confirm)
+ * @return The accumulated score */
+ private final int calcAffineScore(int[] locArray){
+ int score=0;
+ int lastLoc=-2; //Last true location
+ int lastValue=-1; //Previous element of locArray, whatever its sign
+ int timeInMode=0; //Length of the current substitution run
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=POINTS_MATCH2;
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=POINTS_MATCH;
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ //Long deletions are charged per gap unit, then the leftover length goes through the tiers below.
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ //Tiered cost: each step charges the part above its limit, then clamps dif to that limit.
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_INS;
+ //The insertion length is capped at 5 before the tiers are applied.
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else{//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ /** Calculates a score from an array of location values, adding the per-base bonus in baseScores to every match.
+ * A value above zero is scored as a match (possibly preceded by a deletion or insertion),
+ * -1 as a substitution, and any other value is asserted to be -2 and scored as a no-call.
+ * @param locArray Per-position location values (presumably one per read base - TODO confirm)
+ * @param baseScores Bonus added at index i whenever position i is a match
+ * @param bases Bases; only used in an assertion message
+ * @return The accumulated score */
+ @Override
+ public final int calcAffineScore(final int[] locArray, final byte[] baseScores, final byte bases[]){
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1; //Previous element of locArray, whatever its sign
+ int timeInMode=0; //Length of the current substitution run
+
+ for(int i=0; i<locArray.length; i++){
+ final int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ //Long deletions are charged per gap unit, then the leftover length goes through the tiers below.
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ //Tiered cost: each step charges the part above its limit, then clamps dif to that limit.
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ //The insertion length is capped at 5 before the tiers are applied.
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : "\ni="+i+", loc="+loc+", score="+score+", lastLoc="+lastLoc+", lastValue="+lastValue
+ +", time="+timeInMode+", length="+locArray.length+"\nbases=\n"+new String(bases)
+ +"\nlocs[]=\n"+Arrays.toString(locArray)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ //A no-call ends any substitution run.
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ /** Same scoring as the three-argument calcAffineScore, but also tracks the longest run of contiguous matches.
+ * If the longest run is shorter than minContig, the score is capped at -50 per element of locArray.
+ * @param locArray Per-position location values (presumably one per read base - TODO confirm)
+ * @param baseScores Bonus added at index i whenever position i is a match
+ * @param bases Bases; only used in an assertion message
+ * @param minContig Minimum required length of the longest contiguous match run; asserted to be above 1
+ * @return The accumulated (possibly capped) score */
+ @Override
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig){
+ assert(minContig>1) : minContig;
+
+ int contig=0; //Length of the current contiguous match run
+ int maxContig=0; //Longest completed run seen so far
+
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1; //Previous element of locArray, whatever its sign
+ int timeInMode=0; //Length of the current substitution run
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ contig++;
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ maxContig=Tools.max(maxContig, contig);
+ contig=1;
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ //Long deletions are charged per gap unit, then the leftover length goes through the tiers below.
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ //Tiered cost: each step charges the part above its limit, then clamps dif to that limit.
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ //The insertion length is capped at 5 before the tiers are applied.
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ //A no-call ends any substitution run.
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ //The final run is still in contig, so it is compared together with maxContig.
+ if(Tools.max(contig, maxContig)<minContig){score=Tools.min(score, -50*locArray.length);}
+ return score;
+ }
+
+ /** Scores an ungapped placement of read on ref at refStart, without recording the result in a SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart){
+     final SiteScore noSite=null;
+     return scoreNoIndels(read, ref, refStart, noSite);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+
+ return score;
+ }
+
+ @Override
+ public final byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart){
+ if(read==null || ref==null){return null;}
+
+ final byte[] match=new byte[read.length];
+
+ for(int i=0, j=refStart; i<read.length; i++, j++){
+ byte c=read[i];
+ byte r=(j<0 || j>=ref.length) ? (byte)'N' : ref[j];
+
+ if(c=='N' || r=='N'){match[i]='N';}
+ else if(c==r){match[i]='m';}
+ else{match[i]='S';}
+
+ }
+
+ return match;
+ }
+
+ /** Scores an ungapped placement using per-base bonuses, without recording the result in a SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){
+     final SiteScore noSite=null;
+     return scoreNoIndels(read, ref, baseScores, refStart, noSite);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+ int norefs=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+ assert(Read.CHECKSITE(ss, read, -1));
+
+ return score;
+ }
+
+ /**
+ * Scores an ungapped placement of read on ref at refStart, adding baseScores[i] for every matching
+ * position i, and writes the per-position match symbols ('m', 'S' or 'N') into matchReturn[0].
+ * Placements where the read would extend outside the reference are rejected up front, so the
+ * scoring loop always covers the whole read.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param baseScores Per-position bonus added on a match
+ * @param refStart Reference index aligned to read position 0
+ * @param matchReturn Single-element holder; matchReturn[0] is reused if it already has length read.length,
+ * otherwise it is replaced by a new array of that length
+ * @return The accumulated score, or -99999 if refStart is negative or the read runs past the end of ref
+ * (in which case matchReturn is not touched)
+ */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     //The read must lie entirely inside the reference.  The former clipping code that followed
+     //this guard could never execute and has been removed.
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             //Match: the first match of a run scores POINTS_MATCH, later ones POINTS_MATCH2.
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             score+=baseScores[i];
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             //No-call in the read; the mode is left unchanged.
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             //Undefined reference base; the mode is left unchanged.
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             //Substitution: cost depends on how long the substitution run has lasted.
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             if(timeInMode==0){score+=POINTS_SUB;}
+             else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+             else{score+=POINTS_SUB3;}
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /**
+ * Scores an ungapped placement of read on ref at refStart and writes the per-position match
+ * symbols ('m', 'S' or 'N') into matchReturn[0].
+ * Placements where the read would extend outside the reference are rejected up front, so the
+ * scoring loop always covers the whole read.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param refStart Reference index aligned to read position 0
+ * @param matchReturn Single-element holder; matchReturn[0] is reused if it already has length read.length,
+ * otherwise it is replaced by a new array of that length
+ * @return The accumulated score, or -99999 if refStart is negative or the read runs past the end of ref
+ * (in which case matchReturn is not touched)
+ */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     //The read must lie entirely inside the reference.  The former clipping code that followed
+     //this guard could never execute and has been removed.
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             //Match: the first match of a run scores POINTS_MATCH, later ones POINTS_MATCH2.
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             //No-call in the read; the mode is left unchanged.
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             //Undefined reference base; the mode is left unchanged.
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             //Substitution: cost depends on how long the substitution run has lasted.
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             if(timeInMode==0){score+=POINTS_SUB;}
+             else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+             else{score+=POINTS_SUB3;}
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /** Score of numBases contiguous matches: POINTS_MATCH for the first base and POINTS_MATCH2 for each later one. */
+ @Override
+ public final int maxQuality(int numBases){
+     final int laterBases=numBases-1;
+     return POINTS_MATCH+laterBases*POINTS_MATCH2;
+ }
+
+ /** Score of an all-match alignment of baseScores.length bases, including the sum of the per-base bonuses. */
+ @Override
+ public final int maxQuality(byte[] baseScores){
+     final int matchPoints=POINTS_MATCH+(baseScores.length-1)*POINTS_MATCH2;
+     return matchPoints+Tools.sumInt(baseScores);
+ }
+
+ /**
+ * Upper bound used for an alignment of numBases bases that is not perfect:
+ * the perfect score plus the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+ * Asserts that this bound is below the score of a perfect alignment with one substitution.
+ */
+ @Override
+ public final int maxImperfectScore(int numBases){
+ // int maxQ=maxQuality(numBases);
+ //// maxImperfectSwScore=maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB);
+ // int maxI=maxQ+POINTS_DEL;
+ // maxI=Tools.max(maxI, maxQ+POINTS_INS-POINTS_MATCH2);
+ // maxI=Tools.min(maxI, maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB));
+
+     final int perfect=maxQuality(numBases);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+ * Upper bound used for an alignment with the given base scores that is not perfect:
+ * the perfect score plus the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+ * Asserts that this bound is below the score of a perfect alignment with one substitution.
+ */
+ @Override
+ public final int maxImperfectScore(byte[] baseScores){
+ // int maxQ=maxQuality(numBases);
+ //// maxImperfectSwScore=maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB);
+ // int maxI=maxQ+POINTS_DEL;
+ // maxI=Tools.max(maxI, maxQ+POINTS_INS-POINTS_MATCH2);
+ // maxI=Tools.min(maxI, maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB));
+
+     final int perfect=maxQuality(baseScores);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+ * Computes the score of a deletion of the given length.
+ * @param len Deletion length; values at or below zero score 0
+ * @param approximateGaps If true and len exceeds MINGAP, whole gap units are charged POINTS_GAP each
+ * and only the leftover length goes through the tiered per-base costs
+ * @return The deletion score
+ */
+ @Override
+ public int calcDelScore(int len, boolean approximateGaps){
+     if(len<=0){return 0;}
+
+     int total=POINTS_DEL;
+     int remaining=len;
+
+     if(approximateGaps && remaining>MINGAP){
+         final int rem=remaining%GAPLEN;
+         final int div=(remaining-GAPBUFFER2)/GAPLEN;
+         total+=(div*POINTS_GAP);
+         assert(rem+GAPBUFFER2<remaining);
+         remaining=rem+GAPBUFFER2;
+         assert(remaining>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+     }
+
+     //Length falling into each cost tier, from the most distant tier inward.
+     final int tier5=(remaining>LIMIT_FOR_COST_5 ? (remaining-LIMIT_FOR_COST_5+MASK5)/TIMESLIP : 0);
+     final int upTo5=Math.min(remaining, LIMIT_FOR_COST_5);
+     final int tier4=Math.max(0, upTo5-LIMIT_FOR_COST_4);
+     final int upTo4=Math.min(upTo5, LIMIT_FOR_COST_4);
+     final int tier3=Math.max(0, upTo4-LIMIT_FOR_COST_3);
+     final int upTo3=Math.min(upTo4, LIMIT_FOR_COST_3);
+     final int tier2=Math.max(0, upTo3-1);
+
+     total+=tier5*POINTS_DEL5;
+     total+=tier4*POINTS_DEL4;
+     total+=tier3*POINTS_DEL3;
+     total+=tier2*POINTS_DEL2;
+     return total;
+ }
+
+ private static int calcDelScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL;
+
+ if(len>LIMIT_FOR_COST_5){
+ score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+ len=LIMIT_FOR_COST_5;
+ }
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ /**
+ * Computes the score of an insertion of the given length.
+ * @param len Insertion length; values at or below zero score 0
+ * @return The insertion score
+ */
+ @Override
+ public int calcInsScore(int len){
+     if(len<=0){return 0;}
+
+     //Length falling into each cost tier, from the most distant tier inward.
+     final int tier4=Math.max(0, len-LIMIT_FOR_COST_4);
+     final int upTo4=Math.min(len, LIMIT_FOR_COST_4);
+     final int tier3=Math.max(0, upTo4-LIMIT_FOR_COST_3);
+     final int upTo3=Math.min(upTo4, LIMIT_FOR_COST_3);
+     final int tier2=Math.max(0, upTo3-1);
+
+     int total=POINTS_INS;
+     total+=tier4*POINTS_INS4;
+     total+=tier3*POINTS_INS3;
+     total+=tier2*POINTS_INS2;
+     return total;
+ }
+
+ private static int calcInsScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_INS;
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_INS2;
+ }
+ return score;
+ }
+
+
+ /** Score matrix indexed [state][row][col]; each cell holds a score in the SCOREMASK bits and a time counter in the TIMEMASK bits. */
+ private final int[][][] packed;
+ /** Shared buffer that makeGref fills with the gapped reference. */
+ private final byte[] grefbuffer;
+ /** Number of symbols written to grefbuffer by makeGref, not counting the trailing cushion. */
+ private int greflimit=-1;
+ /** Last index written by makeGref's cushion loop; used as the loop bound by the coordinate translation methods. */
+ private int greflimit2=-1;
+ /** Reference location that corresponds to index 0 of grefbuffer. */
+ private int grefRefOrigin=-1;
+
+
+ @Override
+ /** Returns the internal gapped reference buffer itself, not a copy. DO NOT MODIFY */
+ public final byte[] getGrefbuffer(){
+     return this.grefbuffer;
+ }
+
+ /** Per-row limit values in shifted score form; showVertLimit prints entries 0..rows (exact meaning not visible here - TODO confirm). */
+ public final int[] vertLimit;
+ /** Per-column limit values in shifted score form; showHorizLimit prints entries 0..columns (exact meaning not visible here - TODO confirm). */
+ public final int[] horizLimit;
+
+ @Override
+ public CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ @Override
+ public CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ /**
+ * Converts a minimum identity into a minimum score ratio, using a fixed weighted mix of
+ * substitution, deletion and insertion costs for the non-identical fraction.
+ * @param minid Minimum identity as a fraction in (0, 1]; values above 1 are treated as percent and divided by 100
+ * @return The minimum ratio, never below 0.1
+ */
+ public static float minIdToMinRatio(double minid){
+     double identity=minid;
+     if(identity>1){
+         identity=identity/100;
+     }
+     assert(identity>0 && identity<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100.";
+
+     final double matchdif=POINTS_MATCH-POINTS_MATCH2;
+     final double matchCost=POINTS_MATCH2;
+
+     //Estimated average cost of each kind of error.
+     final double subCost=-POINTS_MATCH2+0.5*(matchdif+POINTS_SUB)+0.5*POINTS_SUB2;
+     final double delCost=0.8*(matchdif+POINTS_DEL)+0.1*POINTS_DEL2+0.05*POINTS_DEL3+0.05*POINTS_DEL4;
+     final double insCost=-POINTS_MATCH2+0.8*(matchdif+POINTS_INS)+0.15*(POINTS_INS2)+0.05*(POINTS_INS3);
+     final double errorCost=.2*subCost+.3*delCost+.5*insCost;
+
+     final double errorFraction=1-identity;
+     final double rawRatio=(matchCost+errorFraction*errorCost)/matchCost;
+     assert(rawRatio<=1);
+
+     final double clamped=Tools.max(0.1, rawRatio);
+     return (float)clamped;
+ }
+
+ //Layout of a packed matrix cell: the low TIMEBITS bits hold a time counter, the remaining SCOREBITS bits hold the score.
+ public static final int TIMEBITS=9;
+ public static final int SCOREBITS=32-TIMEBITS;
+ //Largest value that fits in the time field.
+ public static final int MAX_TIME=((1<<TIMEBITS)-1);
+ //Largest score, kept 2000 below the maximum signed value of the score field.
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000;
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+ //Number of bits a score is shifted left to reach the score field.
+ public static final int SCOREOFFSET=TIMEBITS;
+
+ //Mask selecting the low TIMEBITS bits.
+ public static final int TIMEMASK=~((-1)<<TIMEBITS);
+ //Mask selecting the score field above the time bits.
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET;
+
+ //Alignment states; MODE_MS, MODE_DEL and MODE_INS also index the first dimension of the packed matrix.
+ private static final byte MODE_MS=0;
+ private static final byte MODE_DEL=1;
+ private static final byte MODE_INS=2;
+ //Used by the scoreNoIndels methods to mark a substitution run.
+ private static final byte MODE_SUB=3;
+
+ //Unshifted scoring constants.  Numbered variants (e.g. POINTS_DEL2..POINTS_DEL5) are the per-base
+ //costs applied to successively longer runs, as selected by the LIMIT_FOR_COST_* thresholds below.
+ public static final int POINTS_NOREF=0;
+ public static final int POINTS_NOCALL=0;
+ //Score of the first match in a run; later matches in the run score POINTS_MATCH2.
+ public static final int POINTS_MATCH=92;
+ public static final int POINTS_MATCH2=100;
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-87;
+ public static final int POINTS_SUBR=-89; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-75;
+ public static final int POINTS_SUB3=-50;
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-100;
+ public static final int POINTS_INS2=-81;
+ public static final int POINTS_INS3=-59;
+ public static final int POINTS_INS4=-45;
+ public static final int POINTS_DEL=-140;
+ public static final int POINTS_DEL2=-73;
+ public static final int POINTS_DEL3=-58;
+ public static final int POINTS_DEL4=-44;
+ public static final int POINTS_DEL5=-30;
+ public static final int POINTS_DEL_REF_N=-10;
+ //Score charged per collapsed gap unit.
+ public static final int POINTS_GAP=0-GAPCOST;
+
+ //POINTS_DEL5 is charged once per TIMESLIP bases beyond LIMIT_FOR_COST_5; MASK5 rounds that count up.
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1;
+ //TIMESLIP must be a power of two.
+ static{assert(Integer.bitCount(TIMESLIP)==1);}
+
+ //TODO: Consider removing these barriers entirely for PacBio reads. Would make code faster, too.
+ private static final int BARRIER_I1=1;
+ private static final int BARRIER_D1=1;
+
+
+ //Run lengths above which the next cost tier applies.
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=20;
+ public static final int LIMIT_FOR_COST_5=80;
+
+ //Sentinel score one below MIN_SCORE.
+ public static final int BAD=MIN_SCORE-1;
+
+
+ //The same constants pre-shifted left by SCOREOFFSET so they can be added directly to packed cell values.
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+ /** TODO: possibly enclose all uses of affine arrays in a branch controlled by this */
+ public static final boolean AFFINE_ARRAYS=false;
+ //Lookup tables populated in the static initializer that follows.
+ public static final int[] POINTS_INS_ARRAY;
+ public static final int[] POINTSoff_INS_ARRAY;
+ public static final int[] POINTS_INS_ARRAY_C;
+ public static final int[] POINTSoff_INS_ARRAY_C;
+
+ public static final int[] POINTS_SUB_ARRAY;
+ public static final int[] POINTSoff_SUB_ARRAY;
+ public static final int[] POINTS_SUB_ARRAY_C;
+ public static final int[] POINTSoff_SUB_ARRAY_C;
+
+ static{
+ POINTS_INS_ARRAY=new int[604];
+ POINTSoff_INS_ARRAY=new int[604];
+ POINTS_INS_ARRAY_C=new int[604];
+ POINTSoff_INS_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_INS_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_4){
+ pts=POINTS_INS4;
+ ptsoff=POINTSoff_INS4;
+ }else if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_INS3;
+ ptsoff=POINTSoff_INS3;
+ }else if(i>1){
+ pts=POINTS_INS2;
+ ptsoff=POINTSoff_INS2;
+ }else{
+ pts=POINTS_INS;
+ ptsoff=POINTSoff_INS;
+ }
+ POINTS_INS_ARRAY[i]=pts;
+ POINTSoff_INS_ARRAY[i]=ptsoff;
+ POINTS_INS_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_INS_ARRAY_C[i-1]);
+ POINTSoff_INS_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_INS_ARRAY_C[i-1]);
+ }
+
+
+ POINTS_SUB_ARRAY=new int[604];
+ POINTSoff_SUB_ARRAY=new int[604];
+ POINTS_SUB_ARRAY_C=new int[604];
+ POINTSoff_SUB_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_SUB_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_SUB3;
+ ptsoff=POINTSoff_SUB3;
+ }else if(i>1){
+ pts=POINTS_SUB2;
+ ptsoff=POINTSoff_SUB2;
+ }else{
+ pts=POINTS_SUB;
+ ptsoff=POINTSoff_SUB;
+ }
+ POINTS_SUB_ARRAY[i]=pts;
+ POINTSoff_SUB_ARRAY[i]=ptsoff;
+ POINTS_SUB_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_SUB_ARRAY_C[i-1]);
+ POINTSoff_SUB_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_SUB_ARRAY_C[i-1]);
+ }
+ }
+
+ //Instance accessors for the static scoring constants of this class; each one returns the
+ //same-named static field unchanged.
+ //NOTE(review): presumably these implement getters declared by the superclass so shared code can
+ //read per-aligner constants - confirm against the class header, which is outside this excerpt.
+ public final int POINTS_NOREF(){return POINTS_NOREF;}
+ public final int POINTS_NOCALL(){return POINTS_NOCALL;}
+ public final int POINTS_MATCH(){return POINTS_MATCH;}
+ public final int POINTS_MATCH2(){return POINTS_MATCH2;}
+ public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;}
+ public final int POINTS_SUB(){return POINTS_SUB;}
+ public final int POINTS_SUBR(){return POINTS_SUBR;}
+ public final int POINTS_SUB2(){return POINTS_SUB2;}
+ public final int POINTS_SUB3(){return POINTS_SUB3;}
+ public final int POINTS_MATCHSUB(){return POINTS_MATCHSUB;}
+ public final int POINTS_INS(){return POINTS_INS;}
+ public final int POINTS_INS2(){return POINTS_INS2;}
+ public final int POINTS_INS3(){return POINTS_INS3;}
+ public final int POINTS_INS4(){return POINTS_INS4;}
+ public final int POINTS_DEL(){return POINTS_DEL;}
+ public final int POINTS_DEL2(){return POINTS_DEL2;}
+ public final int POINTS_DEL3(){return POINTS_DEL3;}
+ public final int POINTS_DEL4(){return POINTS_DEL4;}
+ public final int POINTS_DEL5(){return POINTS_DEL5;}
+ public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;}
+ public final int POINTS_GAP(){return POINTS_GAP;}
+
+ //TIMESLIP is asserted to be a power of two where it is declared; MASK5 is TIMESLIP-1.
+ public final int TIMESLIP(){return TIMESLIP;}
+ public final int MASK5(){return MASK5;}
+ public final int SCOREOFFSET(){return SCOREOFFSET;} //returns the static field; the former SCOREOFFSET() self-call recursed until StackOverflowError
+
+ //Package-private accessors for the private barrier constants (both are 1 in this class).
+ final int BARRIER_I1(){return BARRIER_I1;}
+ final int BARRIER_D1(){return BARRIER_D1;}
+
+ //Accessors for the tier thresholds that the static initializer compares table indices against.
+ public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;}
+ public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_4;}
+ public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_5;}
+
+ //BAD is declared as MIN_SCORE-1, i.e. one below the lowest representable score.
+ public final int BAD(){return BAD;}
+
+
+ private int rows;
+ private int columns;
+
+}
diff --git a/current/align2/MultiStateAligner9PacBio.java b/current/align2/MultiStateAligner9PacBio.java
new file mode 100755
index 0000000..69e9c4e
--- /dev/null
+++ b/current/align2/MultiStateAligner9PacBio.java
@@ -0,0 +1,2541 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+
+/**
+ * Based on MSA9ts, with transform scores tweaked for PacBio. */
+public final class MultiStateAligner9PacBio extends MSA{
+
+
+ /**
+ * Command-line smoke test: aligns args[0] (the read) against args[1] (the reference),
+ * printing the matrix before and after the fill, the best cell, the traceback string and the score.
+ * @param args args[0] is the read sequence, args[1] is the reference sequence
+ */
+ public static void main(String[] args){
+ final byte[] query=args[0].getBytes();
+ final byte[] reference=args[1].getBytes();
+ final int lastRefPos=reference.length-1;
+
+ final MultiStateAligner9PacBio aligner=new MultiStateAligner9PacBio(query.length, reference.length);
+
+ System.out.println("Initial: ");
+ printMatrix(aligner.packed, query.length, reference.length, TIMEMASK, SCOREOFFSET);
+
+ //A minScore of 0 and no gap array: the whole reference is the alignment window.
+ final int[] best=aligner.fillLimited(query, reference, 0, lastRefPos, 0, null);
+ System.out.println("Max: "+Arrays.toString(best));
+
+ System.out.println("Final: ");
+ printMatrix(aligner.packed, query.length, reference.length, TIMEMASK, SCOREOFFSET);
+
+ //best is {rows, maxCol, maxState, maxScore}; the first three locate the traceback start.
+ final byte[] matchString=aligner.traceback(query, reference, 0, lastRefPos, best[0], best[1], best[2], false);
+ final int[] scoreInfo=aligner.score(query, reference, 0, lastRefPos, best[0], best[1], best[2], false);
+
+ System.out.println(new String(reference));
+ System.out.println(new String(query));
+ System.out.println(new String(matchString));
+ System.out.println("Score: "+Arrays.toString(scoreInfo));
+ }
+
+
+ public MultiStateAligner9PacBio(int maxRows_, int maxColumns_){
+ super(maxRows_, maxColumns_);
+
+ {
+ int[][][] packed0=null;
+ byte[] grefbuffer0=null;
+ int[] vertLimit0=null;
+ int[] horizLimit0=null;
+
+ try {
+ packed0=new int[3][maxRows+1][maxColumns+1];
+ grefbuffer0=new byte[maxColumns+2];
+ vertLimit0=new int[maxRows+1];
+ horizLimit0=new int[maxColumns+1];
+ } catch (OutOfMemoryError e) {
+ packed0=null;
+ grefbuffer0=null;
+ vertLimit0=null;
+ horizLimit0=null;
+ throw new RuntimeException(e.toString());
+ }
+
+ packed=packed0;
+ grefbuffer=grefbuffer0;
+ vertLimit=vertLimit0;
+ horizLimit=horizLimit0;
+ }
+
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+// for(int i=0; i<maxColumns+1; i++){
+// scores[0][i]=0-i;
+// }
+
+ for(int matrix=0; matrix<packed.length; matrix++){
+ for(int i=1; i<=maxRows; i++){
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff;
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ for(int i=0; i<=maxRows; i++){
+
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+ int score=(i<2 ? (i*POINTSoff_INS) :
+ (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 :
+ (i<LIMIT_FOR_COST_4 ? prevScore+POINTSoff_INS3 : prevScore+POINTSoff_INS4)));
+
+ packed[matrix][i][0]=score;
+ }
+// for(int i=1; i<maxColumns+1; i++){
+// prevState[matrix][0][i]=MODE_DEL;
+// }
+// for(int i=0; i<=maxColumns; i++){
+// packed[matrix][0][i]|=MODE_MS;
+// }
+ }
+ }
+
+ @Override
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc, read);
+
+ if(verbose && greflimit>0 && greflimit<500){
+ System.err.println(new String(gref, 0, greflimit));
+ }
+
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillLimitedX(read, gref, 0, greflimit, minScore);
+ }
+ }
+
+
+ /**
+ * Score-limited fill of the three packed matrices (MODE_MS, MODE_DEL, MODE_INS) for
+ * read (rows) against ref[refStartLoc..refEndLoc] (columns).
+ * Cells that cannot reach minScore are overwritten with a sentinel (subfloor). Each row starts at
+ * the first column that produced a viable cell in the previous row and stops once it is past the
+ * previous row's last viable column (optionally narrowed to a band) and no longer viable itself.
+ * Delegates to fillUnlimited when minScore<1, when rows+columns<90, or when no usable band
+ * exists and the reference window is much longer than the read.
+ * Each cell packs a score (SCOREMASK bits) together with a streak length (TIMEMASK bits).
+ * @return new int[] {rows, maxCol, maxState, maxScore} for the best cell of the last row, with
+ * maxScore shifted back down by SCOREOFFSET; null if that score is below minScore
+ * (after the MIN_SCORE_ADJUST reduction).
+ */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ if(verbose){System.err.println("fillLimitedX");}
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ //halfband is 0 (banding off) when neither bandwidth nor bandwidthRatio is set; otherwise it is
+ //half of the larger of the configured band and (columns-rows+8).
+ final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 :
+ Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2;
+
+ if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){
+// assert(false) : minScore;
+// assert(minScore>0) : minScore;
+// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length);
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ if(verbose){
+ System.err.println("Clearing matrix due to verbose mode.");
+ for(int x=0; x<packed.length; x++){
+ for(int y=1; y<rows+1; y++){
+ Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+ }
+ }
+ }
+
+ //Only the last row is reset here, so the final scan cannot pick up a value left by an earlier call.
+ //NOTE(review): the other rows are not cleared (unless verbose); presumably the sentinel writes
+ //below keep stale cells from being used - confirm.
+ for(int x=0; x<packed.length; x++){
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+
+ //maxGain is the score of a read that matches at every base (POINTSoff_MATCH once, POINTSoff_MATCH2
+ //for the rest); floor is the lowest value any row/column limit may take.
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int floor=minScore_off-maxGain;
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2;
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ //vertLimit[i]: built backwards from minScore_off at the last row by subtracting the best points each
+ //remaining read base could still add (match points, or no-call points for undefined bases), never below floor.
+ vertLimit[rows]=minScore_off;
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+
+ //horizLimit[i]: the same construction per reference column, using DEL / NOREF points for undefined reference symbols.
+ horizLimit[columns]=minScore_off;
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+// vertLimit[rows]=minScore_off;
+// for(int i=rows-1; i>=0; i--){
+// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+//
+// horizLimit[columns]=minScore_off;
+// for(int i=columns-1; i>=0; i--){
+// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+
+ for(int row=1; row<=rows; row++){
+
+ //Column range for this row: the viable range recorded by the previous row, clipped to the band if banding is on.
+ final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband));
+ final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1));
+
+ //Reset; these are set again whenever a cell of this row meets its limit.
+ minGoodCol=-1;
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ //The previous row produced no viable cell (or the band is empty): nothing further can reach minScore.
+ if(colStart<0 || colStop<colStart){break;}
+
+
+ //Mark the cell just left of the start column as pruned, since the first column reads it as its left neighbor.
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.err.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ //A cell has to meet the stricter of its row limit and its column limit.
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ //These offsets are subtracted from limit to form limit2 below; the asserts there require
+ //limit2>=limit, i.e. the offsets are expected to be <=0.
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ //MS state: skipped (set to subfloor) on a gap symbol or when no diagonal predecessor exceeds limit3.
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ //A cell that meets limit2 widens the viable column range for the next row; otherwise it is pruned.
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ //DEL state: skipped when neither left-hand predecessor exceeds limit, or outside the rows allowed by the D barriers.
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ //Extending a deletion gets cheaper as the streak grows; beyond LIMIT_FOR_COST_5 only every
+ //TIMESLIP-th step (streak&MASK5==0) is charged.
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //INS state: skipped on a gap symbol, when neither upper predecessor exceeds limit, or where the I barriers forbid it.
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ //At or past colStop: stop once this row is no longer viable here (or banding is on); otherwise mark the
+ //cell above the next column as pruned before the following columns read it as a predecessor.
+ //NOTE(review): when col==columns this writes index columns+1, which is in bounds only if
+ //columns<maxColumns - confirm that callers leave at least one column of headroom.
+ if(col>=colStop){
+ if(col>colStop && (maxGoodCol<col || halfband>0)){break;}
+ if(row>1){
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+
+ //Pick the best-scoring cell of the last row across all three states.
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+
+ if(verbose){
+ System.out.println("Filled matrix.");
+ printMatrix(packed, rows, columns, TIMEMASK, SCOREOFFSET);
+ }
+ if(verbose){System.err.println("maxscore="+(maxScore>>SCOREOFFSET)+", minscore="+(minScore_off>>SCOREOFFSET));}
+
+ if(maxScore<minScore_off){
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET;
+
+ if(verbose){
+ System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ }
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Override
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc, read);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit);
+ }
+ }
+
+
+ /**
+ * Fills every cell of the three packed matrices (MODE_MS, MODE_DEL, MODE_INS) for read (rows)
+ * against ref[refStartLoc..refEndLoc] (columns). Does not require a min score (ie, same as old method).
+ * Cells a state may not occupy (MS or INS on a gap symbol, DEL/INS where the barriers forbid it)
+ * are set to subfloor, which is -2*maxGain.
+ * Each cell packs a score (SCOREMASK bits) together with a streak length (TIMEMASK bits).
+ * @return new int[] {rows, maxCol, maxState, maxScore} for the best cell of the last row, with
+ * maxScore shifted back down by SCOREOFFSET; never null.
+ * @throws RuntimeException if rows>maxRows or columns>maxColumns
+ */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ //maxGain is the score of a read that matches at every base; subfloor is twice that, negated.
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int subfloor=0-2*maxGain;
+ assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n"
+ +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more.
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ //Row 0 and column 0 are only read here, never written; they keep the values seeded by the constructor.
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++;
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ //'?' and '!' are placeholders for "no previous base"; they differ, so prevMatch is false in the first row or column.
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+ if(gap){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ if(row<BARRIER_D1 || row>BARRIER_D2){
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ //Extending a deletion gets cheaper as the streak grows; beyond LIMIT_FOR_COST_5 only every
+ //TIMESLIP-th step (streak&MASK5==0) is charged.
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //Calculate INS score
+// if(gap || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ //Pick the best-scoring cell of the last row across all three states.
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /** Quality-aware variant of the matrix fill; currently disabled by the assert(false) on its first line.
+ * Fills rows 1..read.length and columns 1..(refEndLoc-refStartLoc+1) of the MS, DEL and INS planes of packed,
+ * adding baseScores[row-1]<<SCOREOFFSET to MS cells where the read base equals the reference base,
+ * then scans the final row of every plane for the highest score.
+ * @param read Read bases (one matrix row per base)
+ * @param ref Reference bases
+ * @param baseScores Per-read-position bonus applied on matches
+ * @param refStartLoc Reference index that corresponds to column 1
+ * @param refEndLoc Last reference index covered, inclusive
+ * @return new int[] {rows, maxCol, maxState, maxScore} */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult.";
+ rows=read.length; //one matrix row per read base
+ columns=refEndLoc-refStartLoc+1; //one matrix column per reference base in the window
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); //whether the diagonal predecessor was a match
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //run length stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //ties favor MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier: per-base bonus shifted into the score field
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //clamp the streak counter
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //score and streak share one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never stored; the packed write omits it
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //clamp the streak counter
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //deletion run length in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //past LIMIT_FOR_COST_5 the penalty applies only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never stored
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //clamp the streak counter
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //insertion run length in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never stored
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //clamp the streak counter
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //scan the last row of every plane
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //strict comparison: the first best cell encountered wins ties
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //shift the score field down to a plain score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Override
+ /** Generates the match string for an alignment ending at (row, col, state).
+ * When gapped is true the reference window is first translated into grefbuffer
+ * coordinates and the traceback runs against grefbuffer instead of ref. */
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+ if(!gapped){
+ return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state);
+ }
+ final byte[] gappedRef=grefbuffer;
+ final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+ final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+ return traceback2(read, gappedRef, gappedStart, gappedStop, row, col, state);
+ }
+
+ @Override
+ /** Generates the match string by walking the packed matrices backward from (row, col, state).
+ * Symbols written: 'm' match, 'S' substitution, 'N' when either base is not fully defined,
+ * 'D' deletion, 'I' insertion, 'Y' for an insertion at col>=columns, and 'X' for read bases
+ * left over once column 0 has been reached. Every GAPC symbol crossed in ref is first recorded
+ * as '-' and then expanded to GAPLEN 'D' symbols in the returned array.
+ * @param read Read bases
+ * @param ref Reference bases (may contain GAPC symbols)
+ * @param refStartLoc Reference index that corresponds to column 1
+ * @param refEndLoc End of the reference window; only used in an assertion here
+ * @param row Starting row; asserted to equal rows
+ * @param col Starting column
+ * @param state Starting mode (MODE_MS, MODE_DEL or MODE_INS)
+ * @return Match string in read order */
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+// assert(false);
+ assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+ assert(row==rows);
+
+ byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
+ int outPos=0;
+
+ int gaps=0; //number of GAPC symbols crossed
+
+ if(state==MODE_INS){ //empty placeholder
+ //TODO ? Maybe not needed.
+ }
+
+ while(row>0 && col>0){
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK;
+ final byte prev;
+
+// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]);
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //a stored streak above 1 means the previous step was in the same mode
+ else{ //otherwise pick the best-scoring predecessor plane
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+
+ byte c=read[row-1];
+ byte r=ref[refStartLoc+col-1];
+ if(c==r){
+ out[outPos]='m';
+ }else{
+ if(!AminoAcid.isFullyDefined(c)){
+ out[outPos]='N';
+ }else if(!AminoAcid.isFullyDefined(r)){
+// out[outPos]='X';
+ out[outPos]='N';
+ }else{
+ out[outPos]='S';
+ }
+ }
+
+ row--; //diagonal step
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+
+ byte r=ref[refStartLoc+col-1];
+ if(r==GAPC){
+ out[outPos]='-'; //NOTE(review): the expansion below tests c!=GAPC, so this presumably relies on GAPC=='-' - confirm
+ gaps++;
+ }else{
+ out[outPos]='D';
+ }
+ col--; //step left only
+ }else{
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+
+ assert(state==MODE_INS) : state;
+ if(col==0){ //NOTE(review): the loop condition requires col>0, so this branch looks unreachable
+ out[outPos]='X';
+ }else if(col>=columns){
+ out[outPos]='Y';
+ }else{
+ out[outPos]='I';
+ }
+ row--; //step up only
+ }
+
+// assert(prev==prev0);
+ state=prev;
+ outPos++;
+ }
+
+ assert(row==0 || col==0);
+ if(col!=row){
+ while(row>0){ //read bases remaining after column 0 was reached
+ out[outPos]='X';
+ outPos++;
+ row--;
+ col--;
+ }
+ if(col>0){ //empty placeholder
+ //do nothing
+ }
+ }
+
+
+ //Shrink and reverse the string
+ byte[] out2=new byte[outPos];
+ for(int i=0; i<outPos; i++){
+ out2[i]=out[outPos-i-1];
+ }
+ out=null;
+
+ if(gaps==0){return out2;}
+
+ //TODO Consider outputting this compressed.
+ byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)]; //each gap symbol grows from 1 to GAPLEN symbols
+ for(int i=0, j=0; i<out2.length; i++){
+ byte c=out2[i];
+ if(c!=GAPC){
+ out3[j]=c;
+ j++;
+ }else{
+ int lim=j+GAPLEN;
+ for(; j<lim; j++){
+ out3[j]='D';
+ }
+ }
+ }
+ return out3;
+ }
+
+ @Override
+ /** Scores the alignment ending at (maxRow, maxCol, maxState).
+ * Ungapped calls go straight to score2. Gapped calls run score2 against grefbuffer and
+ * translate elements 1 and 2 of its result (bestRefStart, bestRefStop) back to
+ * ungapped reference coordinates.
+ * @return The array produced by score2: {score, bestRefStart, bestRefStop, ...} */
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState, boolean gapped){
+ if(!gapped){
+ return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState);
+ }
+
+ if(verbose){
+ System.err.println("score():");
+ System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+ }
+ final byte[] gappedRef=grefbuffer;
+ final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+ final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+
+ //Round-trip checks on the coordinate translation (slow; assertion-only)
+ assert(translateFromGappedCoordinate(gappedStart, gappedRef)==refStartLoc); //TODO: Remove slow assertions
+ assert(translateFromGappedCoordinate(gappedStop, gappedRef)==refEndLoc);
+
+ assert(gappedStart==0) : gappedStart; //TODO: skip translation if this is always zero
+
+ if(verbose){System.err.println("gstart, gstop: "+gappedStart+", "+gappedStop);}
+ final int[] result=score2(read, gappedRef, gappedStart, gappedStop, maxRow, maxCol, maxState);
+ if(verbose){System.err.println("got score "+Arrays.toString(result));}
+
+ assert(result[1]==translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef)) :
+ "Verifying: "+result[1]+" -> "+translateFromGappedCoordinate(result[1], gappedRef)+" -> "+
+ translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef);
+ assert(result[2]==translateToGappedCoordinate(translateFromGappedCoordinate(result[2], gappedRef), gappedRef));
+
+ //Convert start and stop back to ungapped reference coordinates
+ result[1]=translateFromGappedCoordinate(result[1], gappedRef);
+ result[2]=translateFromGappedCoordinate(result[2], gappedRef);
+ if(verbose){System.err.println("returning score "+Arrays.toString(result));}
+ return result;
+ }
+
+ @Override
+ /** Walks the packed matrices backward from (maxRow, maxCol, maxState) to find where the alignment starts.
+ * No match string is produced; only the start/stop reference positions and padding hints are derived.
+ * @param read Read bases; only used in assertion messages here
+ * @param ref Reference bases; only used in assertion messages here
+ * @param refStartLoc Reference index that corresponds to column 1
+ * @param refEndLoc Last reference index of the window
+ * @param maxRow Row of the best-scoring cell
+ * @param maxCol Column of the best-scoring cell
+ * @param maxState Plane (MODE_MS, MODE_DEL or MODE_INS) of the best-scoring cell
+ * @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}, <br>
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight} <br>
+ * if more padding is needed */
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ if(row<rows){ //best cell is not in the last row: extend the end point down to the last row
+ int difR=rows-row;
+ int difC=columns-col;
+
+ while(difR>difC){ //rows with no column left to pair with are charged POINTSoff_NOREF
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+
+ row+=difR; //move diagonally by the remaining rows
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+
+ final int bestRefStop=refStartLoc+col-1;
+
+ if(verbose){System.err.println("Scoring.");}
+
+ int stateTime=0; //consecutive backward steps that stayed in the same state
+
+ while(row>0 && col>0){
+
+ if(verbose){System.err.println("state="+state+", row="+row+", col="+col);}
+
+ final int time=packed[state][row][col]&TIMEMASK;
+ final byte prev;
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //a stored streak above 1 means the previous step was in the same mode
+ else{ //otherwise pick the best-scoring predecessor plane
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ row--; //diagonal step
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ col--; //step left only
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ row--; //step up only
+ }
+
+ if(col<0){ //NOTE(review): col is >0 on loop entry and decremented at most once, so this guard looks unreachable
+ if(verbose){
+ System.err.println("Warning, column went below 0 at row="+row);
+ }
+ break; //prevents an out of bounds access
+ }
+
+// assert(prev==prev0);
+ if(state==prev){stateTime++;}else{stateTime=0;}
+ state=prev;
+
+ if(verbose){System.err.println("state2="+state+", time="+time+", stateTime="+stateTime+", row2="+row+", col2="+col+"\n");}
+ }
+// assert(false) : row+", "+col;
+ if(row>col){ //read bases remain: shift the start left by that many positions
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ score>>=SCOREOFFSET; //shift the score field down to a plain score
+
+ if(verbose){
+ System.err.println("bestRefStart="+bestRefStart+", refStartLoc="+refStartLoc);
+ System.err.println("bestRefStop="+bestRefStop+", refEndLoc="+refEndLoc);
+ }
+
+ int padLeft=0;
+ int padRight=0;
+ if(bestRefStart<refStartLoc){ //alignment starts before the window
+ padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ }else if(bestRefStart==refStartLoc && state==MODE_INS){ //walk ended in an insertion at the window's left edge
+ padLeft=stateTime;
+ }
+ if(bestRefStop>refEndLoc){ //alignment ends past the window
+ padRight=Tools.max(0, bestRefStop-refEndLoc);
+ }else if(bestRefStop==refEndLoc && maxState==MODE_INS){ //alignment ends in an insertion at the window's right edge
+ padRight=packed[maxState][maxRow][maxCol]&TIMEMASK;
+ }
+
+ int[] rvec;
+ if(padLeft>0 || padRight>0){ //Suggest extra padding in cases of overflow
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState};
+ }
+ return rvec;
+ }
+
+ /**
+ * Fills grefbuffer
+ * @param ref
+ * @param a
+ * @param b
+ * @param gaps
+ * @return gref
+ */
+ private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc, byte[] read){
+ assert(gaps!=null && gaps.length>0);
+
+ assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps)+"\n"+new String(read);
+ assert(refEndLoc>=gaps[gaps.length-1]);
+
+ final int g0_old=gaps[0];
+ final int gN_old=gaps[gaps.length-1];
+ gaps[0]=Tools.min(gaps[0], refStartLoc);
+ gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+ grefRefOrigin=gaps[0];
+
+ if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+// grefRefOrigin=Tools.min(gaps[0], refStartLoc);
+
+// //This block is no longer needed since the array is preallocated.
+// int len=0;
+// final int gb2=GAPBUFFER*2;
+// for(int i=0; i<gaps.length; i+=2){
+// int x=gaps[i];
+// int y=gaps[i+1];
+// len+=(y-x+1);
+// if(i+2<gaps.length){
+// int z=gaps[i+2];
+// assert(z>y);
+// int gap=z-y-1;
+// if(gap<MINGAP){
+// len+=gap;
+// }else{
+// len+=gb2;
+// gap-=gb2;
+// int div=gap/GAPLEN;
+// int rem=gap%GAPLEN;
+// len+=(div+rem);
+// }
+// }
+// }
+ byte[] gref=grefbuffer;
+
+ int gpos=0;
+ for(int i=0; i<gaps.length; i+=2){
+ int x=gaps[i];
+ int y=gaps[i+1];
+
+ for(int r=x; r<=y; r++, gpos++){
+ //TODO: if out of bounds, use an 'N'
+ assert(gpos<gref.length) :
+ "\ngpos="+gpos+", gref.length="+gref.length+", read.length="+read.length+", gaps2="+Arrays.toString(gaps)+
+ "\ni="+i+", r="+r+", x="+x+", y="+y+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\n"+refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref)+"\n"+new String(read)+"\n";
+ gref[gpos]=ref[r];
+ }
+
+ if(i+2<gaps.length){
+ int z=gaps[i+2];
+ assert(z>y);
+ int gap=z-y-1;
+ assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+ if(gap<MINGAP){
+ assert(false) : "TODO - just fill in normally";
+ }else{
+ int rem=gap%GAPLEN;
+ int lim=y+GAPBUFFER+rem;
+
+ int div=(gap-GAPBUFFER2)/GAPLEN;
+ if(verbose){
+ System.err.println("div = "+div);
+ }
+ assert(div>0);
+
+ for(int r=y+1; r<=lim; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ for(int g=0; g<div; g++, gpos++){
+ gref[gpos]=GAPC;
+ }
+ for(int r=z-GAPBUFFER; r<z; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ }
+ }
+ }
+
+ greflimit=gpos;
+
+ assert(gref[gpos-1]==ref[refEndLoc]);
+// assert(greflimit+GREFLIMIT2_CUSHION<=gref.length) : refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref);
+ //Add a cushion to the end to clear out the prior data (especially GAPC) that was there
+ {
+ final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+ if(lim>gref.length){
+ System.err.println("gref buffer overflow: "+lim+" > "+gref.length);
+ return null;
+ }
+ for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){
+ gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+ greflimit2=i;
+ }
+ }
+
+ if(verbose){
+ System.err.println("gref:\n"+new String(gref));
+ }
+
+ gaps[0]=g0_old;
+ gaps[gaps.length-1]=gN_old;
+
+ if(verbose){
+ System.err.println("\ngaps3: "+Arrays.toString(gaps));
+ }
+
+ return gref;
+ }
+
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score){
+//// {score, bestRefStart, bestRefStop}
+// int a=score[1];
+// int b=score[2];
+// int a2=-9999;
+// int b2=-9999;
+// for(int i=0, j=grefRefOrigin; i<grefbuffer.length; i++){
+// byte c=grefbuffer[i];
+//
+// if(i==a){a2=j;}
+// if(i==b){
+// b2=j;
+// assert(a2!=-9999);
+// score[1]=a2;
+// score[2]=b2;
+// return score;
+// }
+//
+// j+=(c==GAPC ? GAPLEN : 1);
+//// if(c!=GAPC){j++;}
+//// else{j+=GAPLEN;}
+// }
+// throw new RuntimeException("Out of bounds.");
+// }
+
+ /** Converts an index into gref to the matching reference position.
+ * Walks gref from index 0, advancing the reference position by GAPLEN for each GAPC
+ * symbol and by 1 for any other symbol. Points at or below 0 are offset linearly from
+ * grefRefOrigin. Throws RuntimeException if point is not reached before greflimit2. */
+ private final int translateFromGappedCoordinate(int point, byte[] gref){
+ if(verbose){System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+ if(point<=0){return grefRefOrigin+point;}
+
+ int refPos=grefRefOrigin;
+ int gpos=0;
+ while(gpos<greflimit2){
+ final byte sym=gref[gpos];
+ assert(point>=gpos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+ if(gpos==point){
+ if(verbose){System.err.println(" -> "+refPos);}
+ return refPos;
+ }
+
+ if(sym==GAPC){refPos+=GAPLEN;}
+ else{refPos+=1;}
+ gpos++;
+ }
+
+ //Not found: dump state for debugging, then fail
+ System.err.println(grefRefOrigin);
+ System.err.println(point);
+ System.err.println(new String(gref));
+
+ throw new RuntimeException("Out of bounds.");
+ }
+
+ /** Converts a reference position to the matching index into gref.
+ * Walks gref from index 0, advancing the reference position by GAPLEN for each GAPC
+ * symbol and by 1 for any other symbol, until it equals point. Points at or below
+ * grefRefOrigin are offset linearly. Throws RuntimeException if point is not reached
+ * before greflimit2. */
+ private final int translateToGappedCoordinate(int point, byte[] gref){
+ if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+ if(point<=grefRefOrigin){return point-grefRefOrigin;}
+
+ int refPos=grefRefOrigin;
+ int gpos=0;
+ while(gpos<greflimit2){
+ assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+ final byte sym=gref[gpos];
+
+ if(refPos==point){
+ if(verbose){System.err.println(" -> "+gpos);}
+ return gpos;
+ }
+
+ if(sym==GAPC){refPos+=GAPLEN;}
+ else{refPos+=1;}
+ gpos++;
+ }
+
+ //Not found: dump state for debugging, then fail
+ System.err.println(grefRefOrigin);
+ System.err.println(point);
+ System.err.println(new String(gref));
+
+ throw new RuntimeException("Out of bounds.");
+ }
+
+
+ /** Calculates score based on an array from Index.
+ * Each entry of locArray is handled in order: a positive value is a match at that location,
+ * any other value is scored as a substitution. A positive value that differs from the last
+ * positive value is treated as a deletion (smaller) or an insertion (larger), with tiered
+ * per-base penalties selected by the LIMIT_FOR_COST_* thresholds.
+ * @param locArray Per-read-position locations
+ * @return Total score; asserted not to exceed maxQuality(locArray.length) */
+ private final int calcAffineScore(int[] locArray){
+ int score=0;
+ int lastLoc=-2; //Last true location
+ int lastValue=-1; //previous array entry, match or not
+ int timeInMode=0; //length of the current substitution run
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=POINTS_MATCH2;
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=POINTS_MATCH;
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){ //long deletion: charge whole gap symbols, then score the remainder per base
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){ //each tier below charges the bases above its threshold, then lowers dif to it
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5); //length used for scoring is capped at 5
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else{//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ @Override /** Calculates a score from a location array, adding baseScores[i] to every matching position.
+ * Entries of locArray: a positive value is a match at that location, -1 is a substitution,
+ * and anything else is asserted to be -2 and scored as POINTS_NOCALL.
+ * @param locArray Per-read-position locations
+ * @param baseScores Per-read-position bonus added on matches
+ * @param bases Read bases; only used in the assertion message
+ * @return Total score; asserted not to exceed maxQuality(locArray.length) */
+ public final int calcAffineScore(final int[] locArray, final byte[] baseScores, final byte bases[]){
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1; //previous array entry, match or not
+ int timeInMode=0; //length of the current substitution run
+
+ for(int i=0; i<locArray.length; i++){
+ final int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){ //long deletion: charge whole gap symbols, then score the remainder per base
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){ //each tier below charges the bases above its threshold, then lowers dif to it
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5); //length used for scoring is capped at 5
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : "\ni="+i+", loc="+loc+", score="+score+", lastLoc="+lastLoc+", lastValue="+lastValue
+ +", time="+timeInMode+", length="+locArray.length+"\nbases=\n"+new String(bases)
+ +"\nlocs[]=\n"+Arrays.toString(locArray)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ timeInMode=0; //a no-call ends any substitution run
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ @Override /** Calculates a score from a location array like the three-argument overload, and also
+ * tracks the longest run of contiguous matches. If that run is shorter than minContig,
+ * the returned score is capped at -50*locArray.length.
+ * @param locArray Per-read-position locations (positive = match, -1 = substitution, -2 = no-call)
+ * @param baseScores Per-read-position bonus added on matches
+ * @param bases Read bases; only used in the assertion message
+ * @param minContig Minimum contiguous-match run required; asserted to be greater than 1
+ * @return Total score, possibly capped as described above */
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig){
+ assert(minContig>1) : minContig;
+
+ int contig=0; //length of the current contiguous-match run
+ int maxContig=0; //longest completed run so far
+
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1; //previous array entry, match or not
+ int timeInMode=0; //length of the current substitution run
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ contig++;
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ maxContig=Tools.max(maxContig, contig);
+ contig=1; //this match starts a new run
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){ //long deletion: charge whole gap symbols, then score the remainder per base
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){ //each tier below charges the bases above its threshold, then lowers dif to it
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5); //length used for scoring is capped at 5
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ timeInMode=0; //a no-call ends any substitution run
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ if(Tools.max(contig, maxContig)<minContig){score=Tools.min(score, -50*locArray.length);} //the run still open at the end counts too
+ return score;
+ }
+
+ @Override
+ /** Scores an ungapped placement of read at refStart; delegates to the SiteScore overload with no site. */
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart){
+ final SiteScore noSite=null;
+ return scoreNoIndels(read, ref, refStart, noSite);
+ }
+ @Override /** Scores an ungapped placement of read against ref starting at refStart.
+ * Read positions that fall outside the reference are charged POINTS_NOREF each and skipped.
+ * If no substitution or no-call was seen and ss is non-null, ss.semiperfect is set from
+ * ss.start/ss.stop and the number of no-ref positions.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param refStart Reference index aligned to read[0]; may be negative
+ * @param ss Optional site to update; may be null
+ * @return Total score */
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+
+ int score=0;
+ int mode=-1; //no mode yet
+ int timeInMode=0; //0 on the first base of a run, incremented while the mode repeats
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0; //read positions with no usable reference base
+
+ if(refStart<0){ //read hangs off the left end
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){ //read hangs off the right end
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){ //no-call in the read; mode and timeInMode are left unchanged
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){ //undefined reference base; does not clear semiperfect
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));} //NOTE(review): ss.semiperfect is left untouched when the local flag is false
+
+ return score;
+ }
+
+ @Override
+ /** Builds an ungapped match string for read placed at refStart.
+ * Each position is 'N' if either base is 'N' (reference positions outside ref count as 'N'),
+ * 'm' if the bases are equal, otherwise 'S'. Returns null if read or ref is null. */
+ public final byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart){
+ if(read==null || ref==null){return null;}
+
+ final int len=read.length;
+ final byte[] match=new byte[len];
+
+ for(int readPos=0; readPos<len; readPos++){
+ final int refPos=refStart+readPos;
+ final byte readBase=read[readPos];
+ final byte refBase;
+ if(refPos>=0 && refPos<ref.length){refBase=ref[refPos];}
+ else{refBase=(byte)'N';}
+
+ final byte symbol;
+ if(readBase=='N' || refBase=='N'){symbol='N';}
+ else if(readBase==refBase){symbol='m';}
+ else{symbol='S';}
+ match[readPos]=symbol;
+ }
+
+ return match;
+ }
+
+ @Override
+ /** Quality-aware ungapped score of read at refStart; delegates to the SiteScore overload with no site. */
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){
+ final SiteScore noSite=null;
+ return scoreNoIndels(read, ref, baseScores, refStart, noSite);
+ }
+ @Override /** Scores an ungapped placement of read against ref starting at refStart, adding
+ * baseScores[i] to every matching position.
+ * Read positions that fall outside the reference are charged POINTS_NOREF each and skipped.
+ * If no substitution or no-call was seen and ss is non-null, ss.semiperfect is set from
+ * ss.start/ss.stop and the number of no-ref positions.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param baseScores Per-read-position bonus added on matches
+ * @param refStart Reference index aligned to read[0]; may be negative
+ * @param ss Optional site to update
+ * @return Total score */
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+
+ int score=0;
+ int mode=-1; //no mode yet
+ int timeInMode=0; //0 on the first base of a run, incremented while the mode repeats
+ int norefs=0; //read positions with no usable reference base
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){ //read hangs off the left end
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){ //read hangs off the right end
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i]; //per-base bonus applies to matches only
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){ //no-call in the read; mode and timeInMode are left unchanged
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){ //undefined reference base; does not clear semiperfect
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));} //NOTE(review): ss.semiperfect is left untouched when the local flag is false
+ assert(Read.CHECKSITE(ss, read, -1)); //NOTE(review): ss may be null here (the 4-argument overload passes null) - confirm CHECKSITE accepts that
+
+ return score;
+ }
+
+ /**
+  * Scores an ungapped placement of read against ref and writes a match string
+  * ('m' match, 'S' substitution, 'N' no-call or undefined reference base) into matchReturn[0].
+  * Placements that extend past either end of the reference are rejected with -99999.
+  * The former clipping code for such placements could never run after that check and has been removed.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param baseScores Per-base bonus added for every matching position
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn Single-element holder; element 0 is reused when its length equals read.length, otherwise replaced
+  * @return The accumulated score, or -99999 if the read does not lie fully inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     //Reads running outside the reference are not scored here.
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             score+=baseScores[i];
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             //Consecutive substitutions get progressively cheaper
+             if(timeInMode==0){score+=POINTS_SUB;}
+             else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+             else{score+=POINTS_SUB3;}
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /**
+  * Scores an ungapped placement of read against ref, without per-base bonuses, and writes a match string
+  * ('m' match, 'S' substitution, 'N' no-call or undefined reference base) into matchReturn[0].
+  * Placements that extend past either end of the reference are rejected with -99999.
+  * The former clipping code for such placements could never run after that check and has been removed.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStart Reference index aligned to read[0]
+  * @param matchReturn Single-element holder; element 0 is reused when its length equals read.length, otherwise replaced
+  * @return The accumulated score, or -99999 if the read does not lie fully inside ref
+  */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+     int score=0;
+     int mode=-1;
+     int timeInMode=0;
+
+     assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+     //Reads running outside the reference are not scored here.
+     final int refStop=refStart+read.length;
+     if(refStart<0 || refStop>ref.length){return -99999;}
+
+     assert(matchReturn!=null);
+     assert(matchReturn.length==1);
+     if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+         assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+         matchReturn[0]=new byte[read.length];
+     }
+     final byte[] match=matchReturn[0];
+
+     for(int i=0; i<read.length; i++){
+         byte c=read[i];
+         byte r=ref[refStart+i];
+
+         assert(r!='.' && c!='.');
+
+         if(c==r && c!='N'){
+             if(mode==MODE_MS){
+                 timeInMode++;
+                 score+=POINTS_MATCH2;
+             }else{
+                 timeInMode=0;
+                 score+=POINTS_MATCH;
+             }
+             match[i]='m';
+             mode=MODE_MS;
+         }else if(c<0 || c=='N'){
+             score+=POINTS_NOCALL;
+             match[i]='N';
+         }else if(r<0 || r=='N'){
+             score+=POINTS_NOREF;
+             match[i]='N';
+         }else{
+             match[i]='S';
+             if(mode==MODE_SUB){timeInMode++;}
+             else{timeInMode=0;}
+
+             //Consecutive substitutions get progressively cheaper
+             if(timeInMode==0){score+=POINTS_SUB;}
+             else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+             else{score+=POINTS_SUB3;}
+             mode=MODE_SUB;
+         }
+     }
+
+     return score;
+ }
+
+ /** Score of numBases consecutive matches: POINTS_MATCH for the first, POINTS_MATCH2 for each of the rest. */
+ @Override
+ public final int maxQuality(int numBases){
+     final int followingMatches=numBases-1;
+     return POINTS_MATCH+followingMatches*POINTS_MATCH2;
+ }
+
+ /** Score of baseScores.length consecutive matches plus the sum of all per-base bonuses. */
+ @Override
+ public final int maxQuality(byte[] baseScores){
+     final int followingMatches=baseScores.length-1;
+     final int bonus=Tools.sumInt(baseScores);
+     return POINTS_MATCH+followingMatches*POINTS_MATCH2+bonus;
+ }
+
+ /**
+  * Returns maxQuality(numBases) plus the smaller of POINTS_DEL and (POINTS_INS-POINTS_MATCH2).
+  * Asserts that the result is below the score of the same alignment with one substitution.
+  */
+ @Override
+ public final int maxImperfectScore(int numBases){
+     final int perfect=maxQuality(numBases);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Returns maxQuality(baseScores) plus the smaller of POINTS_DEL and (POINTS_INS-POINTS_MATCH2).
+  * Asserts that the result is below the score of the same alignment with one substitution.
+  */
+ @Override
+ public final int maxImperfectScore(byte[] baseScores){
+     final int perfect=maxQuality(baseScores);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Total score contribution of a deletion of the given length.
+  * The first base costs POINTS_DEL; later bases are charged by tier (POINTS_DEL2 up to
+  * LIMIT_FOR_COST_3, POINTS_DEL3 up to LIMIT_FOR_COST_4, POINTS_DEL4 up to LIMIT_FOR_COST_5,
+  * then POINTS_DEL5 once per TIMESLIP bases, rounded up).
+  * @param len Deletion length; values of 0 or less score 0
+  * @param approximateGaps If true and len exceeds MINGAP, whole GAPLEN blocks are charged POINTS_GAP each
+  * and only the remainder plus GAPBUFFER2 is scored by tier
+  * @return The deletion score
+  */
+ @Override
+ public int calcDelScore(int len, boolean approximateGaps){
+     if(len<1){return 0;}
+
+     int remaining=len;
+     int total=POINTS_DEL;
+
+     if(approximateGaps && remaining>MINGAP){
+         final int leftover=remaining%GAPLEN;
+         final int blocks=(remaining-GAPBUFFER2)/GAPLEN;
+         total+=(blocks*POINTS_GAP);
+         assert(leftover+GAPBUFFER2<remaining);
+         remaining=leftover+GAPBUFFER2;
+         assert(remaining>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+     }
+
+     //Peel off each tier from the longest down; a tier with no excess contributes 0.
+     final int over5=Math.max(0, remaining-LIMIT_FOR_COST_5);
+     total+=((over5+MASK5)/TIMESLIP)*POINTS_DEL5;
+     remaining=Math.min(remaining, LIMIT_FOR_COST_5);
+
+     final int over4=Math.max(0, remaining-LIMIT_FOR_COST_4);
+     total+=over4*POINTS_DEL4;
+     remaining=Math.min(remaining, LIMIT_FOR_COST_4);
+
+     final int over3=Math.max(0, remaining-LIMIT_FOR_COST_3);
+     total+=over3*POINTS_DEL3;
+     remaining=Math.min(remaining, LIMIT_FOR_COST_3);
+
+     final int over1=Math.max(0, remaining-1);
+     total+=over1*POINTS_DEL2;
+
+     return total;
+ }
+
+ private static int calcDelScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL;
+
+ if(len>LIMIT_FOR_COST_5){
+ score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+ len=LIMIT_FOR_COST_5;
+ }
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ /**
+  * Total score contribution of an insertion of the given length.
+  * The first base costs POINTS_INS; later bases cost POINTS_INS2 up to LIMIT_FOR_COST_3,
+  * POINTS_INS3 up to LIMIT_FOR_COST_4, and POINTS_INS4 beyond that.
+  * @param len Insertion length; values of 0 or less score 0
+  */
+ @Override
+ public int calcInsScore(int len){
+     if(len<1){return 0;}
+
+     int remaining=len;
+     int total=POINTS_INS;
+
+     //Peel off each tier from the longest down; a tier with no excess contributes 0.
+     final int over4=Math.max(0, remaining-LIMIT_FOR_COST_4);
+     total+=over4*POINTS_INS4;
+     remaining=Math.min(remaining, LIMIT_FOR_COST_4);
+
+     final int over3=Math.max(0, remaining-LIMIT_FOR_COST_3);
+     total+=over3*POINTS_INS3;
+     remaining=Math.min(remaining, LIMIT_FOR_COST_3);
+
+     final int over1=Math.max(0, remaining-1);
+     total+=over1*POINTS_INS2;
+
+     return total;
+ }
+
+ private static int calcInsScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_INS;
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_INS2;
+ }
+ return score;
+ }
+
+
+ private final int[][][] packed;
+ private final byte[] grefbuffer;
+ private int greflimit=-1;
+ private int greflimit2=-1;
+ private int grefRefOrigin=-1;
+
+
+ @Override
+ /** Returns the grefbuffer field itself, not a copy. DO NOT MODIFY the returned array. */
+ public final byte[] getGrefbuffer(){
+ return grefbuffer;
+ }
+
+ public final int[] vertLimit;
+ public final int[] horizLimit;
+
+ @Override
+ public CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ @Override
+ public CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ /**
+  * Converts a minimum identity into a minimum score ratio using the scoring constants of this class.
+  * Each non-identical base is charged a weighted average of substitution, deletion and insertion
+  * costs (weights .2, .3 and .5), relative to the POINTS_MATCH2 earned by a match.
+  * @param minid Minimum identity as a fraction in (0, 1]; values above 1 are treated as a percentage
+  * @return The ratio, never lower than 0.1
+  */
+ public static float minIdToMinRatio(double minid){
+     final double identity=(minid>1 ? minid/100 : minid);
+     assert(identity>0 && identity<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100.";
+
+     final double matchPoints=POINTS_MATCH2;
+     final double firstMatchDelta=POINTS_MATCH-POINTS_MATCH2;
+
+     final double subCost=-POINTS_MATCH2+0.5*(firstMatchDelta+POINTS_SUB)+0.5*POINTS_SUB2;
+     final double delCost=0.8*(firstMatchDelta+POINTS_DEL)+0.1*POINTS_DEL2+0.05*POINTS_DEL3+0.05*POINTS_DEL4;
+     final double insCost=-POINTS_MATCH2+0.8*(firstMatchDelta+POINTS_INS)+0.15*(POINTS_INS2)+0.05*(POINTS_INS3);
+
+     final double avgErrorCost=.2*subCost+.3*delCost+.5*insCost;
+     final double errorFraction=1-identity;
+
+     final double ratio=(matchPoints+errorFraction*avgErrorCost)/matchPoints;
+     assert(ratio<=1);
+
+     final double clamped=Tools.max(0.1, ratio);
+     return (float)clamped;
+ }
+
+ //Bit layout of a packed 32-bit value: TIMEMASK selects the low TIMEBITS bits,
+ //and SCOREMASK selects the remaining SCOREBITS bits starting at bit SCOREOFFSET.
+ public static final int TIMEBITS=9;
+ public static final int SCOREBITS=32-TIMEBITS;
+ public static final int MAX_TIME=((1<<TIMEBITS)-1);
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000;
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+ public static final int SCOREOFFSET=TIMEBITS;
+
+ public static final int TIMEMASK=~((-1)<<TIMEBITS);
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET;
+
+ //State codes: match/substitution, deletion, insertion, and substitution.
+ private static final byte MODE_MS=0;
+ private static final byte MODE_DEL=1;
+ private static final byte MODE_INS=2;
+ private static final byte MODE_SUB=3;
+
+ //Unshifted point values. For SUB, INS and DEL the numbered variants (2, 3, 4, 5)
+ //are the per-base costs applied as an event of that type gets longer.
+ public static final int POINTS_NOREF=0;
+ public static final int POINTS_NOCALL=0;
+ public static final int POINTS_MATCH=90;
+ public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-137;
+ public static final int POINTS_SUBR=-157; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-49;
+ public static final int POINTS_SUB3=-25;
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-205;
+ public static final int POINTS_INS2=-42;
+ public static final int POINTS_INS3=-23;
+ public static final int POINTS_INS4=-8;
+ public static final int POINTS_DEL=-292;
+ public static final int POINTS_DEL2=-37;
+ public static final int POINTS_DEL3=-17;
+ public static final int POINTS_DEL4=-2;
+ public static final int POINTS_DEL5=-1;
+ public static final int POINTS_DEL_REF_N=-10;
+ public static final int POINTS_GAP=0-GAPCOST;
+
+ //POINTS_DEL5 is charged once per TIMESLIP bases; MASK5 rounds that division up.
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1;
+ static{assert(Integer.bitCount(TIMESLIP)==1);}
+
+ //TODO: Consider removing these barriers entirely for PacBio reads. Would make code faster, too.
+ private static final int BARRIER_I1=1;
+ private static final int BARRIER_D1=1;
+
+
+ //Event lengths at which the next cheaper per-base cost tier starts to apply.
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=20;
+ public static final int LIMIT_FOR_COST_5=80;
+
+ public static final int BAD=MIN_SCORE-1;
+
+
+ //The same values shifted left by SCOREOFFSET, so they can be added directly to packed values.
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+ /** TODO: possibly enclose all uses of affine arrays in a branch controlled by this */
+ public static final boolean AFFINE_ARRAYS=false;
+ //Per-length cost tables, populated by the static initializer that follows.
+ //The _C variants hold running totals clamped at MIN_SCORE / MINoff_SCORE.
+ public static final int[] POINTS_INS_ARRAY;
+ public static final int[] POINTSoff_INS_ARRAY;
+ public static final int[] POINTS_INS_ARRAY_C;
+ public static final int[] POINTSoff_INS_ARRAY_C;
+
+ public static final int[] POINTS_SUB_ARRAY;
+ public static final int[] POINTSoff_SUB_ARRAY;
+ public static final int[] POINTS_SUB_ARRAY_C;
+ public static final int[] POINTSoff_SUB_ARRAY_C;
+
+ static{
+ POINTS_INS_ARRAY=new int[604];
+ POINTSoff_INS_ARRAY=new int[604];
+ POINTS_INS_ARRAY_C=new int[604];
+ POINTSoff_INS_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_INS_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_4){
+ pts=POINTS_INS4;
+ ptsoff=POINTSoff_INS4;
+ }else if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_INS3;
+ ptsoff=POINTSoff_INS3;
+ }else if(i>1){
+ pts=POINTS_INS2;
+ ptsoff=POINTSoff_INS2;
+ }else{
+ pts=POINTS_INS;
+ ptsoff=POINTSoff_INS;
+ }
+ POINTS_INS_ARRAY[i]=pts;
+ POINTSoff_INS_ARRAY[i]=ptsoff;
+ POINTS_INS_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_INS_ARRAY_C[i-1]);
+ POINTSoff_INS_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_INS_ARRAY_C[i-1]);
+ }
+
+
+ POINTS_SUB_ARRAY=new int[604];
+ POINTSoff_SUB_ARRAY=new int[604];
+ POINTS_SUB_ARRAY_C=new int[604];
+ POINTSoff_SUB_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_SUB_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_3){
+ pts=POINTS_SUB3;
+ ptsoff=POINTSoff_SUB3;
+ }else if(i>1){
+ pts=POINTS_SUB2;
+ ptsoff=POINTSoff_SUB2;
+ }else{
+ pts=POINTS_SUB;
+ ptsoff=POINTSoff_SUB;
+ }
+ POINTS_SUB_ARRAY[i]=pts;
+ POINTSoff_SUB_ARRAY[i]=ptsoff;
+ POINTS_SUB_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_SUB_ARRAY_C[i-1]);
+ POINTSoff_SUB_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_SUB_ARRAY_C[i-1]);
+ }
+ }
+
+ //Instance accessors; each returns the static constant of the same name.
+ public final int POINTS_NOREF(){return POINTS_NOREF;}
+ public final int POINTS_NOCALL(){return POINTS_NOCALL;}
+ public final int POINTS_MATCH(){return POINTS_MATCH;}
+ public final int POINTS_MATCH2(){return POINTS_MATCH2;}
+ public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;}
+ public final int POINTS_SUB(){return POINTS_SUB;}
+ public final int POINTS_SUBR(){return POINTS_SUBR;}
+ public final int POINTS_SUB2(){return POINTS_SUB2;}
+ public final int POINTS_SUB3(){return POINTS_SUB3;}
+ public final int POINTS_MATCHSUB(){return POINTS_MATCHSUB;}
+ public final int POINTS_INS(){return POINTS_INS;}
+ public final int POINTS_INS2(){return POINTS_INS2;}
+ public final int POINTS_INS3(){return POINTS_INS3;}
+ public final int POINTS_INS4(){return POINTS_INS4;}
+ public final int POINTS_DEL(){return POINTS_DEL;}
+ public final int POINTS_DEL2(){return POINTS_DEL2;}
+ public final int POINTS_DEL3(){return POINTS_DEL3;}
+ public final int POINTS_DEL4(){return POINTS_DEL4;}
+ public final int POINTS_DEL5(){return POINTS_DEL5;}
+ public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;}
+ public final int POINTS_GAP(){return POINTS_GAP;}
+
+ public final int TIMESLIP(){return TIMESLIP;}
+ public final int MASK5(){return MASK5;}
+ /** Returns the SCOREOFFSET constant. The previous body called this method again and could only end in StackOverflowError. */
+ public final int SCOREOFFSET(){return SCOREOFFSET;}
+
+ //Instance accessors; each returns the static constant of the same name.
+ final int BARRIER_I1(){return BARRIER_I1;}
+ final int BARRIER_D1(){return BARRIER_D1;}
+
+ public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;}
+ public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_4;}
+ public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_5;}
+
+ public final int BAD(){return BAD;}
+
+
+ private int rows;
+ private int columns;
+
+}
diff --git a/current/align2/MultiStateAligner9PacBioAdapter.java b/current/align2/MultiStateAligner9PacBioAdapter.java
new file mode 100755
index 0000000..5be70f6
--- /dev/null
+++ b/current/align2/MultiStateAligner9PacBioAdapter.java
@@ -0,0 +1,1756 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+
+/**
+ * Based on MSA9ts, with transform scores tweaked for PacBio. */
+public final class MultiStateAligner9PacBioAdapter {
+
+
+ /**
+  * Allocates the three packed score matrices and the two limit arrays.
+  * In every matrix, rows 1..maxRows start as BADoff and row 0 stays 0. Column 0 is then
+  * overwritten with the running cost of an insertion as long as the row index.
+  * @param maxRows_ Largest number of rows (read bases) that will be filled
+  * @param maxColumns_ Largest number of columns (reference bases) that will be filled
+  */
+ public MultiStateAligner9PacBioAdapter(int maxRows_, int maxColumns_){
+     maxRows=maxRows_;
+     maxColumns=maxColumns_;
+     packed=new int[3][maxRows+1][maxColumns+1];
+
+     vertLimit=new int[maxRows+1];
+     horizLimit=new int[maxColumns+1];
+     Arrays.fill(vertLimit, BADoff);
+     Arrays.fill(horizLimit, BADoff);
+
+     for(final int[][] matrix : packed){
+
+         //Freshly allocated cells are 0, so filling with BADoff equals OR-ing BADoff into them.
+         for(int row=1; row<=maxRows; row++){
+             Arrays.fill(matrix[row], BADoff);
+         }
+
+         //Column 0: cumulative insertion cost by row.
+         int running=0;
+         for(int row=0; row<=maxRows; row++){
+             if(row<2){
+                 running=row*POINTSoff_INS;
+             }else if(row<LIMIT_FOR_COST_3){
+                 running+=POINTSoff_INS2;
+             }else if(row<LIMIT_FOR_COST_4){
+                 running+=POINTSoff_INS3;
+             }else{
+                 running+=POINTSoff_INS4;
+             }
+             matrix[row][0]=running;
+         }
+     }
+ }
+
+
+ /**
+  * Public entry point for the score-limited fill; forwards all arguments to fillLimitedX.
+  * Areas that cannot reach minScore are not filled.
+  * @return int[] {rows, maxC, maxS, max} as produced by fillLimitedX
+  */
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+     final int[] result=fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);
+     return result;
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore.
+ * Delegates to fillUnlimited when minScore<=0 or when the reference window is wider than
+ * read.length+min(100, read.length). Otherwise minScore is lowered by 100, the three packed
+ * matrices are filled row by row within a band of columns, and the best score in the last
+ * row is located. Returns null if that best score is below the (lowered) minScore;
+ * otherwise max is the best score shifted right by SCOREOFFSET.
+ * @param read Read bases; one matrix row per base
+ * @param ref Reference bases
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @param minScore Minimum acceptable unshifted score */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){
+// assert(false) : minScore;
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+ minScore-=100; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+// for(int x=0; x<packed.length; x++){
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+// }
+ //Only the final row is reset; it is the row scanned for the best score at the end.
+ for(int x=0; x<packed.length; x++){
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ //Range of columns in the previous row that held a score at or above its limit.
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+
+ //Thresholds in shifted (packed) units. maxGain is the score of a read that matches at every base.
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int floor=minScore_off-maxGain;
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2;
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ //Per-row and per-column limits: minScore_off at the last index, lowered by
+ //POINTSoff_MATCH2 for each step toward index 0, and never below floor.
+ vertLimit[rows]=minScore_off;
+ for(int i=rows-1; i>=0; i--){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+ }
+
+ horizLimit[columns]=minScore_off;
+ for(int i=columns-1; i>=0; i--){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+ }
+
+ for(int row=1; row<=rows; row++){
+
+ //This row starts at the band found in the previous row; the band is then recomputed.
+ int colStart=minGoodCol;
+ int colStop=maxGoodCol;
+
+ minGoodCol=-1;
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ //The previous row had no cell at or above its limit, so stop filling.
+ if(colStart<0 || colStop<colStart){break;}
+
+
+ //Overwrite the cell just left of the band so stale values are not read as predecessors.
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.out.println("\ncol "+col);
+ }
+
+ //Current and previous read/reference bases; '?' and '!' are placeholders that never equal each other.
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+
+ //Indels still required to reach the last row and column from this cell, and their cost.
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ //Predecessor scores with the time bits masked off.
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ if((scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ //The penalties are zero or negative, so subtracting one raises the limit.
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 : POINTSoff_DEL4);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ if(scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ //At or past the previous row's band: stop once a column past colStop produced nothing good.
+ if(col>=colStop){
+ if(col>colStop && maxGoodCol<col){break;}
+ //NOTE(review): col can equal columns here, so col+1 can be columns+1. Each packed row has
+ //maxColumns+1 entries, so this looks out of bounds when columns==maxColumns - confirm callers leave headroom.
+ if(row>1){
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+
+ //Find the best score in the last row across all three matrices.
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+ if(maxScore<minScore_off){
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /** Fills the packed matrix for every cell in rows 1..read.length and columns 1..(refEndLoc-refStartLoc+1),
+ * with no minimum-score cutoff (ie, same as old method).
+ * For each cell the MS (match/sub), DEL and INS states are computed and stored as (score|time),
+ * where time is the streak counter kept in the low bits of the packed int.
+ * Row 0 and column 0 are read but never written here - presumably initialized elsewhere; verify.
+ * @param read Read bases; one matrix row per base
+ * @param ref Reference bases
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @return new int[] {rows, maxC, maxS, max}: best column and state in the last row, and that score shifted down by SCOREOFFSET
+ * @throws RuntimeException if rows>maxRows or columns>maxColumns */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int subfloor=0-2*maxGain; //only referenced by the assertion below in this method
+ assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more.
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++; //incremented once per cell visited
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]); //previous read base; '?' placeholder on the first row
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); //previous ref base; '!' placeholder on the first column
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean match=(call1==ref1 && ref1!='N'); //'N' never counts as a match
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //time field stored with the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read; the packed write that used it is commented out
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //length of the deletion run ending in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 : POINTSoff_DEL4);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //length of the insertion run ending in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ //Scan the last row of every state for the highest score; ties keep the first one found
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //drop the time bits so the caller gets a plain score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /** Quality-aware variant of the unlimited fill: the same three-state (MS/DEL/INS) recurrence over every cell,
+ * except that a matching cell additionally gains baseScores[row-1] shifted up by SCOREOFFSET.
+ * Currently disabled: the first statement is assert(false), so it throws AssertionError whenever assertions are enabled.
+ * Unlike fillUnlimited, 'N' bases get no special treatment here and no explicit bounds exception is thrown.
+ * @param read Read bases; one matrix row per base
+ * @param ref Reference bases
+ * @param baseScores Per-base score bonus, indexed like read
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @return new int[] {rows, maxC, maxS, max}; */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult.";
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //time field stored with the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read; the packed write that used it is commented out
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //length of the deletion run ending in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 : POINTSoff_DEL4);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //length of the insertion run ending in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ //Scan the last row of every state for the highest score; ties keep the first one found
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //drop the time bits so the caller gets a plain score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /** Generates the match string by walking back through the packed matrix from cell (row, col) in the given state
+  * until row 0 or column 0 is reached.
+  * Symbols written: 'm' identical bases, 'N' a mismatch where either base is not fully defined, 'S' any other mismatch,
+  * 'D' deletion, 'I' insertion ('Y' for an insertion at col>=columns), and 'X' for each read base still
+  * unconsumed when column 0 is reached.
+  * The previous state of a cell is not stored; it is re-derived from the time field (time>1 means the state continues)
+  * or, failing that, by comparing the scores of the candidate predecessor cells.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStartLoc Reference index corresponding to column 1
+  * @param refEndLoc Last reference index of the window; only used in an assertion
+  * @param row Starting row; asserted to equal rows
+  * @param col Starting column
+  * @param state Starting state (MODE_MS, MODE_DEL or MODE_INS)
+  * @return The match string in read order; its length is the number of traceback steps taken */
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+     assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+     assert(row==rows);
+
+     //Worst case is row+col symbols: 'col' deletions down to column 0 followed by 'row' leftover read bases.
+     //The former size of row+col-1 was one slot short for that path.
+     final byte[] out=new byte[row+col];
+     int outPos=0;
+
+     while(row>0 && col>0){
+
+         final int time=packed[state][row][col]&TIMEMASK;
+         final byte prev;
+
+         if(state==MODE_MS){
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+                 else{prev=MODE_INS;}
+             }
+
+             final byte c=read[row-1];
+             final byte r=ref[refStartLoc+col-1];
+             if(c==r){
+                 out[outPos]='m';
+             }else if(!AminoAcid.isFullyDefined(c) || !AminoAcid.isFullyDefined(r)){
+                 out[outPos]='N';
+             }else{
+                 out[outPos]='S';
+             }
+
+             row--;
+             col--;
+         }else if(state==MODE_DEL){
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+                 else{prev=MODE_DEL;}
+             }
+
+             out[outPos]='D';
+
+             col--;
+         }else{
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else{prev=MODE_INS;}
+             }
+
+             assert(state==MODE_INS) : state;
+             if(col==0){
+                 out[outPos]='X';
+             }else if(col>=columns){
+                 out[outPos]='Y';
+             }else{
+                 out[outPos]='I';
+             }
+             row--;
+         }
+
+         state=prev;
+         outPos++;
+     }
+
+     assert(row==0 || col==0);
+     //Read bases left over once column 0 is reached have no reference column to pair with.
+     while(row>0){
+         out[outPos]='X';
+         outPos++;
+         row--;
+     }
+
+     //Shrink and reverse the string
+     final byte[] out2=new byte[outPos];
+     for(int i=0; i<outPos; i++){
+         out2[i]=out[outPos-i-1];
+     }
+
+     return out2;
+ }
+
+ /** Follows the path ending at (maxRow, maxCol, maxState) back through the packed matrix to find where the alignment starts on the reference.
+ * The score is taken from the packed cell (plus POINTSoff_NOREF penalties if maxRow is below rows) rather than recomputed.
+ * The previous state of each cell is re-derived from its time field or from the scores of the candidate predecessor cells.
+ * @return {score, bestRefStart, bestRefStop}, or {score, bestRefStart, bestRefStop, padLeft, padRight}
+ * when the best range extends outside [refStartLoc, refEndLoc] */
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ if(row<rows){ //NOTE(review): fillAndScoreLimited always passes maxRow==rows, so this branch is not exercised from there
+ int difR=rows-row;
+ int difC=columns-col;
+
+ while(difR>difC){ //NOTE(review): each pass shrinks difR, so row ends below rows and the assert(row==rows) below would fire - confirm intent
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+
+ row+=difR;
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+
+ final int bestRefStop=refStartLoc+col-1; //reference index of the last aligned column
+
+ while(row>0 && col>0){
+// System.err.println("state="+state+", row="+row+", col="+col);
+
+
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK;
+ final byte prev;
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ col--;
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ row--;
+ }
+
+ if(col<0){ //NOTE(review): col is positive at loop entry and drops by at most 1 per pass, so this looks unreachable
+ System.err.println(row);
+ break; //prevents an out of bounds access
+
+ }
+
+// assert(prev==prev0);
+ state=prev;
+
+// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n");
+ }
+// assert(false) : row+", "+col;
+ if(row>col){ //read bases remain after the columns ran out: move the start left of the window by that many positions
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ score>>=SCOREOFFSET; //drop the time bits so the caller gets a plain score
+ int[] rvec;
+ if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
+ int padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ int padRight=Tools.max(0, bestRefStop-refEndLoc);
+ rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop};
+ }
+ return rvec;
+ }
+
+
+ /** Clamps the requested window to the reference, runs the score-limited fill, then scores the best path.
+  * Areas that cannot reach minScore are not filled.
+  * If the clamped window is wider than maxColumns a warning is printed and the window is truncated on the right
+  * (with assertions enabled this case throws AssertionError instead).
+  * @param minScore Lowest score of interest, passed through to fillLimited
+  * @return {score, bestRefStart, bestRefStop} as produced by score(...), or null when fillLimited returns null */
+ public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+     final int windowStart=Tools.max(0, refStartLoc);
+     int windowStop=Tools.min(ref.length-1, refEndLoc);
+     assert(windowStop>=windowStart);
+
+     if(windowStop-windowStart>=maxColumns){
+         System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(windowStop-windowStart+1)+" > "+maxColumns);
+         assert(false) : refStartLoc+", "+refEndLoc;
+         windowStop=Tools.min(ref.length-1, windowStart+maxColumns-1);
+     }
+
+     final int[] best=fillLimited(read, ref, windowStart, windowStop, minScore);
+     if(best==null){return null;}
+     return score(read, ref, windowStart, windowStop, best[0], best[1], best[2]);
+ }
+
+
+
+ /** Scores the read without indels against the chromosome and start position recorded in ss.
+  * ss is also passed on so that its semiperfect flag can be updated. */
+ public final int scoreNoIndels(byte[] read, SiteScore ss){
+     final byte[] bases=Data.getChromosome(ss.chrom).array;
+     return scoreNoIndels(read, bases, ss.start, ss);
+ }
+
+ /** Scores the read without indels against chromosome chrom, starting at refStart. No SiteScore is updated. */
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){
+     final byte[] bases=Data.getChromosome(chrom).array;
+     return scoreNoIndels(read, bases, refStart, null);
+ }
+
+ /** Quality-aware form: scores the read without indels at the site recorded in ss,
+  * adding baseScores for matching positions. */
+ public final int scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores){
+     final byte[] bases=Data.getChromosome(ss.chrom).array;
+     return scoreNoIndels(read, bases, baseScores, ss.start, ss);
+ }
+
+ /** Quality-aware form: scores the read without indels against chromosome chrom at refStart,
+  * adding baseScores for matching positions. No SiteScore is updated. */
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores){
+     final byte[] bases=Data.getChromosome(chrom).array;
+     return scoreNoIndels(read, bases, baseScores, refStart, null);
+ }
+
+
+
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+
+ return score;
+ }
+
+
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+ int norefs=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+ assert(Read.CHECKSITE(ss, read, -1));
+
+ return score;
+ }
+
+
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /** Score of a read of numBases bases that matches everywhere:
+  * POINTS_MATCH for the first base and POINTS_MATCH2 for each following base. */
+ public final static int maxQuality(int numBases){
+     final int followers=numBases-1;
+     return followers*POINTS_MATCH2+POINTS_MATCH;
+ }
+
+ /** Score of a read that matches everywhere, including the per-base bonuses:
+  * POINTS_MATCH for the first base, POINTS_MATCH2 for each following base, plus the sum of baseScores. */
+ public final static int maxQuality(byte[] baseScores){
+     final int followers=baseScores.length-1;
+     final int matchPoints=POINTS_MATCH+followers*(POINTS_MATCH2);
+     return matchPoints+Tools.sumInt(baseScores);
+ }
+
+ /** Perfect score for numBases bases plus the smaller of POINTS_DEL and (POINTS_INS-POINTS_MATCH2).
+  * The assertion checks that this stays below the perfect score adjusted for one substitution. */
+ public final static int maxImperfectScore(int numBases){
+     final int perfect=maxQuality(numBases);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /** Perfect score for these baseScores plus the smaller of POINTS_DEL and (POINTS_INS-POINTS_MATCH2).
+  * The assertion checks that this stays below the perfect score adjusted for one substitution. */
+ public final static int maxImperfectScore(byte[] baseScores){
+     final int perfect=maxQuality(baseScores);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /** Formats the array as right-aligned columns, each at least 7 characters wide including a leading space.
+  * A value too wide for its column is written in full with just the leading space; previously such a value
+  * tripped an assertion, which made this debugging helper unusable on large (e.g. packed) values with -ea.
+  * @param a Values to format
+  * @return The concatenated columns; an empty string for an empty array */
+ public final static String toString(int[] a){
+
+     final int width=7;
+
+     final StringBuilder sb=new StringBuilder((a.length+1)*width+2);
+     for(int num : a){
+         final String s=" "+num;
+         //Left-pad up to the column width; does nothing when s already fills or exceeds it
+         for(int i=s.length(); i<width; i++){sb.append(' ');}
+         sb.append(s);
+     }
+
+     return sb.toString();
+ }
+
+ /** Formats the time field (value&TIMEMASK) of each packed int as a right-aligned column 7 characters wide. */
+ public final static String toTimePacked(int[] a){
+     final int width=7;
+
+     final StringBuilder text=new StringBuilder((a.length+1)*width+2);
+     for(int idx=0; idx<a.length; idx++){
+         final int num=a[idx]&TIMEMASK;
+         final String s=" "+num;
+         final int spaces=width-s.length();
+         assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+         int pad=0;
+         while(pad<spaces){
+             text.append(' ');
+             pad++;
+         }
+         text.append(s);
+     }
+
+     return text.toString();
+ }
+
+ /** Formats the score field (value>>SCOREOFFSET) of each packed int as a right-aligned column 7 characters wide.
+  * A score too wide for the column is replaced by a run of 9s (with a minus sign unless the score is positive). */
+ public final static String toScorePacked(int[] a){
+     final int width=7;
+
+     //Build the two overflow placeholders: " -" or " " followed by 9s up to the column width
+     final StringBuilder lowFill=new StringBuilder(" -");
+     final StringBuilder highFill=new StringBuilder(" ");
+     while(lowFill.length()<width){lowFill.append('9');}
+     while(highFill.length()<width){highFill.append('9');}
+     final String minString=lowFill.toString();
+     final String maxString=highFill.toString();
+
+     final StringBuilder text=new StringBuilder((a.length+1)*width+2);
+     for(int idx=0; idx<a.length; idx++){
+         final int num=a[idx]>>SCOREOFFSET;
+         String s=" "+num;
+         if(s.length()>width){
+             if(num>0){s=maxString;}
+             else{s=minString;}
+         }
+         final int spaces=width-s.length();
+         assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+         int pad=0;
+         while(pad<spaces){
+             text.append(' ');
+             pad++;
+         }
+         text.append(s);
+     }
+
+     return text.toString();
+ }
+
+ /** Formats each byte as a signed decimal number in a right-aligned column 6 characters wide. */
+ public final static String toString(byte[] a){
+
+     final int width=6;
+
+     final StringBuilder text=new StringBuilder((a.length+1)*width+2);
+     for(int idx=0; idx<a.length; idx++){
+         final int num=a[idx];
+         final String s=" "+num;
+         final int spaces=width-s.length();
+         assert(spaces>=0);
+         int pad=0;
+         while(pad<spaces){
+             text.append(' ');
+             pad++;
+         }
+         text.append(s);
+     }
+
+     return text.toString();
+ }
+
+ public final static String toString(byte[] ref, int startLoc, int stopLoc){
+ StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
+ for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
+ return sb.toString();
+ }
+
+ /** Total score of a deletion of len bases: POINTS_DEL for the first base, POINTS_DEL2 for each further base
+  * up to LIMIT_FOR_COST_3, POINTS_DEL3 up to LIMIT_FOR_COST_4, and POINTS_DEL4 beyond that.
+  * @return 0 when len is not positive */
+ public static int calcDelScore(int len){
+     if(len<=0){return 0;}
+
+     //Split the length into tiers, highest tier first
+     final int upTo4=Math.min(len, LIMIT_FOR_COST_4);
+     final int tier4=len-upTo4;
+     final int upTo3=Math.min(upTo4, LIMIT_FOR_COST_3);
+     final int tier3=upTo4-upTo3;
+     final int tier2=Math.max(0, upTo3-1);
+
+     return POINTS_DEL+tier4*POINTS_DEL4+tier3*POINTS_DEL3+tier2*POINTS_DEL2;
+ }
+
+ private static int calcDelScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL;
+
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ /** Total score of an insertion of len bases: POINTS_INS for the first base, POINTS_INS2 for each further base
+  * up to LIMIT_FOR_COST_3, POINTS_INS3 up to LIMIT_FOR_COST_4, and POINTS_INS4 beyond that.
+  * @return 0 when len is not positive */
+ public static int calcInsScore(int len){
+     if(len<=0){return 0;}
+
+     //Split the length into tiers, highest tier first
+     final int upTo4=Math.min(len, LIMIT_FOR_COST_4);
+     final int tier4=len-upTo4;
+     final int upTo3=Math.min(upTo4, LIMIT_FOR_COST_3);
+     final int tier3=upTo4-upTo3;
+     final int tier2=Math.max(0, upTo3-1);
+
+     return POINTS_INS+tier4*POINTS_INS4+tier3*POINTS_INS3+tier2*POINTS_INS2;
+ }
+
+ private static int calcInsScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_INS;
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_INS2;
+ }
+ return score;
+ }
+
+ //Capacity fields; they are not assigned in this part of the file (presumably set by the constructor - verify).
+ public final int maxRows;
+ public final int maxColumns;
+ //NOTE(review): looks like the packed score/time matrices, one plane per alignment state - confirm against the constructor.
+ private final int[][][] packed;
+ //Per-row and per-column limit arrays; showVertLimit/showHorizLimit print their entries shifted right by SCOREOFFSET.
+ public final int[] vertLimit;
+ public final int[] horizLimit;
+
+ CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+ CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+ //Bit layout of a packed int, as defined by the masks below: low TIMEBITS bits = time field, remaining high SCOREBITS bits = score field.
+// public static final int MODEBITS=2;
+ public static final int TIMEBITS=12;
+ public static final int SCOREBITS=32-TIMEBITS; //20
+ public static final int MAX_TIME=((1<<TIMEBITS)-1); //4095, the largest value the time field can hold
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; //2000 below the largest positive value of a signed SCOREBITS-bit field
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+// public static final int MODEOFFSET=0; //Always zero.
+// public static final int TIMEOFFSET=0;
+ public static final int SCOREOFFSET=TIMEBITS; //Left shift that moves a plain score into the score field
+
+// public static final int MODEMASK=~((-1)<<MODEBITS);
+// public static final int TIMEMASK=(~((-1)<<TIMEBITS))<<TIMEOFFSET;
+ public static final int TIMEMASK=~((-1)<<TIMEBITS); //Selects the low TIMEBITS bits
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; //Selects the high SCOREBITS bits
+ //State codes. NOTE(review): presumably these index the first dimension of packed - confirm against the fill methods.
+ private static final byte MODE_MS=0;
+ private static final byte MODE_DEL=1;
+ private static final byte MODE_INS=2;
+ private static final byte MODE_SUB=3;
+ //Plain (unshifted) score values; each one has a POINTSoff_ twin further down.
+ public static final int POINTS_NOREF=-10;
+ public static final int POINTS_NOCALL=-10;
+ public static final int POINTS_MATCH=90;
+ public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-143;
+ public static final int POINTS_SUBR=-161; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-54;
+ public static final int POINTS_SUB3=-35;
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-207; //First base of an insertion (see calcInsScore)
+ public static final int POINTS_INS2=-51; //Each further base up to LIMIT_FOR_COST_3
+ public static final int POINTS_INS3=-37; //Each further base up to LIMIT_FOR_COST_4
+ public static final int POINTS_INS4=-15; //Each base beyond LIMIT_FOR_COST_4
+ public static final int POINTS_DEL=-273; //First base of a deletion (see calcDelScore)
+ public static final int POINTS_DEL2=-38; //Each further base up to LIMIT_FOR_COST_3
+ public static final int POINTS_DEL3=-27; //Each further base up to LIMIT_FOR_COST_4
+ public static final int POINTS_DEL4=-15; //Each base beyond LIMIT_FOR_COST_4
+ public static final int POINTS_DEL_REF_N=-10;
+
+ //Gap lengths at which calcDelScore/calcInsScore switch to the next, smaller per-base penalty.
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=30;
+ //Sentinel one point below MIN_SCORE.
+ public static final int BAD=MIN_SCORE-1;
+
+ //The same values pre-shifted by SCOREOFFSET so that they can be added directly to the score field of a packed int.
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+ //Used by showVertLimit/showHorizLimit as loop bounds. NOTE(review): presumably set by each fill call - confirm.
+ private int rows;
+ private int columns;
+ //Public counters; nothing in this part of the file updates them. NOTE(review): presumably cell-iteration counts - confirm.
+ public long iterationsLimited=0;
+ public long iterationsUnlimited=0;
+ //Verbosity switches; not read in this part of the file.
+ public boolean verbose=false;
+ public boolean verbose2=false;
+
+}
diff --git a/current/align2/MultiStateAligner9PacBioAdapter2.java b/current/align2/MultiStateAligner9PacBioAdapter2.java
new file mode 100755
index 0000000..8b89113
--- /dev/null
+++ b/current/align2/MultiStateAligner9PacBioAdapter2.java
@@ -0,0 +1,1756 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+
+/**
+ * Based on MSA9ts, with transform scores tweaked for PacBio. */
+public final class MultiStateAligner9PacBioAdapter2 {
+
+
+ public MultiStateAligner9PacBioAdapter2(int maxRows_, int maxColumns_){
+// assert(maxColumns_>=200);
+// assert(maxRows_>=200);
+ maxRows=maxRows_;
+ maxColumns=maxColumns_;
+ packed=new int[3][maxRows+1][maxColumns+1];
+
+ vertLimit=new int[maxRows+1];
+ horizLimit=new int[maxColumns+1];
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+// for(int i=0; i<maxColumns+1; i++){
+// scores[0][i]=0-i;
+// }
+
+ for(int matrix=0; matrix<packed.length; matrix++){
+ for(int i=1; i<=maxRows; i++){
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff;
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ for(int i=0; i<=maxRows; i++){
+
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+ int score=(i<2 ? (i*POINTSoff_INS) :
+ (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 :
+ (i<LIMIT_FOR_COST_4 ? prevScore+POINTSoff_INS3 : prevScore+POINTSoff_INS4)));
+
+ packed[matrix][i][0]=score;
+ }
+// for(int i=1; i<maxColumns+1; i++){
+// prevState[matrix][0][i]=MODE_DEL;
+// }
+// for(int i=0; i<=maxColumns; i++){
+// packed[matrix][0][i]|=MODE_MS;
+// }
+ }
+ }
+
+
+ /** Score-limited fill: forwards all arguments unchanged to fillLimitedX and hands back its result, an int[] {rows, maxCol, maxState, maxScore} or null. */
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ final int[] outcome=fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);
+ return outcome;
+ }
+
+
+ /** Fills the three packed matrices for read versus ref[refStartLoc..refEndLoc] while pruning cells that can no longer reach minScore. Falls back to fillUnlimited when minScore<=0 or the reference window is much longer than the read.
+ * @return int[] {rows, maxCol, maxState, maxScore} for the best cell of the last row, or null if that score is below the (slightly relaxed) minScore */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+ //No usable cutoff, or the window exceeds the read by more than Tools.min(100, read.length) columns: pruning is skipped.
+ if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){
+// assert(false) : minScore;
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+ minScore-=100; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+// for(int x=0; x<packed.length; x++){
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+// }
+ for(int x=0; x<packed.length; x++){
+ //Only the last row is reset: it is the row scanned for the best score once filling ends.
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+ //Column band handed from one row to the next; it is rebuilt while each row is filled.
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+ //All thresholds below live in the shifted (score-field) domain.
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //Score of a read whose every base matches
+ final int floor=minScore_off-maxGain;
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2; //Value written into every pruned cell
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+ //vertLimit[r]: lowest score at row r that could still reach minScore if every later row gained POINTSoff_MATCH2; never below floor.
+ vertLimit[rows]=minScore_off;
+ for(int i=rows-1; i>=0; i--){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+ }
+ //horizLimit[c]: the same bound computed per column.
+ horizLimit[columns]=minScore_off;
+ for(int i=columns-1; i>=0; i--){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+ }
+
+ for(int row=1; row<=rows; row++){
+ //This row starts from the band produced by the previous row; the band is rebuilt as cells pass their limits.
+ int colStart=minGoodCol;
+ int colStop=maxGoodCol;
+
+ minGoodCol=-1;
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+ //The previous row had no surviving cell, so filling stops here.
+ if(colStart<0 || colStop<colStart){break;}
+
+ //Overwrite the cell just left of the band so that stale contents are not read as a predecessor.
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.out.println("\ncol "+col);
+ }
+ //Current and previous bases; '?' and '!' stand in for the missing previous base at the matrix edge.
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+ //delNeeded/insNeeded are gap lengths derived purely from the cell position; their penalties tighten the limits below.
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+ //Predecessor scores (score field only) for the three kinds of transition.
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+ //Match/substitution plane: skip the work when no diagonal predecessor exceeds limit3.
+ if((scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+ //limit2 = limit minus the penalty of an indel this cell still needs; the assertion below requires limit2>=limit.
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ //A surviving cell extends the band for the next row; a failing one is stored as subfloor.
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ //Deletion plane: predecessors are the MS and DEL cells one column to the left in the same row.
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 : POINTSoff_DEL4);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ //Insertion plane: predecessors are the MS and INS cells one row up in the same column.
+ if(scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //At or beyond the right edge of the band: stop once a column past colStop holds no surviving cell.
+ if(col>=colStop){
+ if(col>colStop && maxGoodCol<col){break;}
+ if(row>1 && col<maxColumns){ //col+1 must stay inside the row array, whose last valid index is maxColumns
+ packed[MODE_MS][row-1][col+1]=subfloor; //These cells are read as row-1 predecessors when the loop reaches col+1
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+ //Best score field in the last row across all three planes (the first one found wins ties).
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+ if(maxScore<minScore_off){
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /** Fills every cell of rows 1..rows and columns 1..columns in all three packed matrices, with no score cutoff, then reports the best-scoring cell of the last row.
+ * @return int[] {rows, maxCol, maxState, maxScore}, where maxScore has been shifted back down by SCOREOFFSET */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+ //maxGain and subfloor are only used by the assertion below; nothing else in this method reads them.
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int subfloor=0-2*maxGain;
+ assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more.
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++;
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ //'?' and '!' stand in for the missing previous base in the first row / first column.
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+ //A reference 'N' never counts as a match.
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+ {//Calculate match and sub scores
+ //Score fields of the three diagonal predecessors; streak is the time field of the MS predecessor.
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ //Score and time share one int; the assertions check that neither field spills into the other.
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+ //The substitution cost depends on prevMatch and the streak; an 'N' on either side is charged POINTSoff_NOCALL instead.
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+ //Predecessors lie one column to the left in the same row; streak is the time field of the DEL cell there.
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 : POINTSoff_DEL4);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+ //When the reference base is 'N', POINTSoff_DEL_REF_N is added to both candidates.
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+ //Predecessors lie one row up in the same column; streak is the time field of the INS cell there.
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ //Pick the highest score field in the last row over all planes and columns (the first one found wins ties).
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /** Variant of fillUnlimited that gives 'N' bases no special treatment and, on a match, adds baseScores[row-1] shifted by SCOREOFFSET. Begins with assert(false), so it only runs when assertions are disabled. @return int[] {rows, maxCol, maxState, maxScore} */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult.";
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ //Plain byte equality here: unlike the other fill methods, 'N' is not excluded.
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ {//Calculate match and sub scores
+ //Score fields of the three diagonal predecessors; streak is the time field of the MS predecessor.
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+ //Score and time share one int; the assertions check that neither field spills into the other.
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+ //The substitution cost depends on prevMatch and the streak; no baseScores modifier is applied on this branch.
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+ //Predecessors lie one column to the left in the same row; streak is the time field of the DEL cell there.
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 : POINTSoff_DEL4);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+ //Predecessors lie one row up in the same column; streak is the time field of the INS cell there.
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ //Pick the highest score field in the last row over all planes and columns (the first one found wins ties).
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /** Generates the match string by walking the packed matrices backward from (row, col, state); symbols are collected last-to-first and reversed before returning. */
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+// assert(false);
+ assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+ assert(row==rows);
+
+ byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
+ int outPos=0; //Next free slot in out
+
+ if(state==MODE_INS){
+ //TODO ? Maybe not needed.
+ }
+
+ while(row>0 && col>0){ //Walk back until row 0 or column 0 is reached
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK; //Time field (low bits) of the current cell
+ final byte prev; //State to continue from after this step
+
+// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]);
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //time>1: stay in the same state
+ else{ //Otherwise pick the best-scoring diagonal neighbor; ties prefer MS, then DEL
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+
+ byte c=read[row-1];
+ byte r=ref[refStartLoc+col-1];
+ if(c==r){
+ out[outPos]='m'; //Match
+ }else{
+ if(!AminoAcid.isFullyDefined(c)){
+ out[outPos]='N';
+ }else if(!AminoAcid.isFullyDefined(r)){
+// out[outPos]='X';
+ out[outPos]='N';
+ }else{
+ out[outPos]='S'; //Substitution
+ }
+ }
+
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{ //Compare the two cells to the left; ties prefer MS
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+
+ byte r=ref[refStartLoc+col-1]; //r is not used below
+ out[outPos]='D';
+
+ col--; //A deletion consumes a reference column only
+ }else{
+ if(time>1){prev=(byte)state;}
+ else{ //Compare the two cells above; ties prefer MS
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+
+ assert(state==MODE_INS) : state;
+ if(col==0){ //Cannot be true here: the loop condition guarantees col>0 and col is unchanged in this branch
+ out[outPos]='X';
+ }else if(col>=columns){
+ out[outPos]='Y';
+ }else{
+ out[outPos]='I';
+ }
+ row--; //An insertion consumes a read row only
+ }
+
+// assert(prev==prev0);
+ state=prev;
+ outPos++;
+ }
+
+ assert(row==0 || col==0);
+ if(col!=row){
+ while(row>0){ //Rows still left when column 0 is reached are emitted as 'X'
+ out[outPos]='X';
+ outPos++;
+ row--;
+ col--;
+ }
+ if(col>0){
+ //do nothing
+ }
+ }
+
+
+ //Shrink and reverse the string
+ byte[] out2=new byte[outPos];
+ for(int i=0; i<outPos; i++){
+ out2[i]=out[outPos-i-1];
+ }
+ out=null;
+
+ return out2;
+ }
+
+ /** Retraces the path that ends at (maxRow, maxCol, maxState) to find where it starts. @return {score, bestRefStart, bestRefStop}, followed by {padLeft, padRight} when the path reaches outside [refStartLoc, refEndLoc] */
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ if(row<rows){ //End point lies above the last row
+ int difR=rows-row;
+ int difC=columns-col;
+
+ while(difR>difC){ //Each row with no column left to pair with is charged POINTSoff_NOREF
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+
+ row+=difR;
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+
+ final int bestRefStop=refStartLoc+col-1; //Reference index of the last column on the path
+
+ while(row>0 && col>0){
+// System.err.println("state="+state+", row="+row+", col="+col);
+
+
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK; //Time field (low bits) of the current cell
+ final byte prev;
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //time>1: stay in the same state
+ else{ //Otherwise pick the best-scoring diagonal neighbor; ties prefer MS, then DEL
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ col--; //A deletion consumes a reference column only
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ row--; //An insertion consumes a read row only
+ }
+
+ if(col<0){
+ System.err.println(row);
+ break; //prevents an out of bounds access
+
+ }
+
+// assert(prev==prev0);
+ state=prev;
+
+// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n");
+ }
+// assert(false) : row+", "+col;
+ if(row>col){ //Rows left over: move the start left by that many positions
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ score>>=SCOREOFFSET; //Shift the time bits out, leaving the plain score
+ int[] rvec;
+ if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
+ int padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ int padRight=Tools.max(0, bestRefStop-refEndLoc);
+ rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop};
+ }
+ return rvec;
+ }
+
+
+ /** Clamps the window to the reference, fills the matrix with fillLimited and scores the best path. Will not fill areas that cannot match minScore.
+ * @return {score, bestRefStart, bestRefStop}, or null when fillLimited returns null */
+ public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ int a=Tools.max(0, refStartLoc); //Window start, clamped to 0
+ int b=Tools.min(ref.length-1, refEndLoc); //Window end, clamped to the last reference index
+ assert(b>=a);
+
+ int[] score;
+
+ if(b-a>=maxColumns){ //Window is wider than maxColumns
+ System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
+ assert(false) : refStartLoc+", "+refEndLoc;
+ b=Tools.min(ref.length-1, a+maxColumns-1); //Without assertions, continue with a window of at most maxColumns columns
+ }
+ int[] max=fillLimited(read, ref, a, b, minScore);
+ score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]));
+
+ return score;
+ }
+
+
+
+ public final int scoreNoIndels(byte[] read, SiteScore ss){ //Ungapped score of read at ss.start on chromosome ss.chrom; ss is passed on, so its semiperfect flag may be set
+ ChromosomeArray cha=Data.getChromosome(ss.chrom);
+ return scoreNoIndels(read, cha.array, ss.start, ss);
+ }
+
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){ //Ungapped score of read at refStart on chromosome chrom; no SiteScore is passed on
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ return scoreNoIndels(read, cha.array, refStart, null);
+ }
+
+ public final int scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores){ //Ungapped score with per-base bonuses, at ss.start on chromosome ss.chrom; ss is passed on
+ ChromosomeArray cha=Data.getChromosome(ss.chrom);
+ return scoreNoIndels(read, cha.array, baseScores, ss.start, ss);
+ }
+
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores){ //Ungapped score with per-base bonuses, at refStart on chromosome chrom; no SiteScore is passed on
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ return scoreNoIndels(read, cha.array, baseScores, refStart, null);
+ }
+
+
+
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){ //Scores read against ref base-by-base starting at refStart, with no indels; ss may be null
+
+ int score=0;
+ int mode=-1; //MODE_MS or MODE_SUB once a match or substitution has been seen
+ int timeInMode=0; //Length of the current run in that mode, counted from 0
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0; //Read bases with no defined reference base
+
+ if(refStart<0){ //Read starts before the reference: skip those bases, charging POINTS_NOREF each
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){ //Read runs past the end of the reference
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){ //Match
+ if(mode==MODE_MS){ //Continuing a match run
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){ //Read base is negative or 'N'; mode and timeInMode are left unchanged
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){ //Reference base is negative or 'N'; mode and timeInMode are left unchanged
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{ //Substitution; the penalty depends on the length of the substitution run
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));} //ss.semiperfect is only written when no substitution or no-call was seen
+
+ return score;
+ }
+
+
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){ //Scores read against ref base-by-base starting at refStart, with no indels, adding baseScores[i] for each matching base
+
+ int score=0;
+ int mode=-1; //MODE_MS or MODE_SUB once a match or substitution has been seen
+ int timeInMode=0; //Length of the current run in that mode, counted from 0
+ int norefs=0; //Read bases with no defined reference base
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){ //Read starts before the reference: skip those bases, charging POINTS_NOREF each
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){ //Read runs past the end of the reference
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){ //Match
+ if(mode==MODE_MS){ //Continuing a match run
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i]; //Per-base bonus, added for matches only
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){ //Read base is negative or 'N'; mode and timeInMode are left unchanged
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){ //Reference base is negative or 'N'; mode and timeInMode are left unchanged
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{ //Substitution; the penalty depends on the length of the substitution run
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));} //ss.semiperfect is only written when no substitution or no-call was seen
+ assert(Read.CHECKSITE(ss, read, -1)); //NOTE(review): ss may be null here (two overloads pass null); confirm CHECKSITE accepts that
+
+ return score;
+ }
+
+
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){ //Ungapped score with per-base match bonuses; also writes one match symbol per scored base into matchReturn[0]
+ int score=0;
+ int mode=-1; //MODE_MS or MODE_SUB once a match or substitution has been seen
+ int timeInMode=0; //Length of the current run in that mode, counted from 0
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){ //Read starts before the reference: skip those bases, charging POINTS_NOREF each
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){ //Read runs past the end of the reference
+ int dif=(refStop-ref.length);
+ System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); //Diagnostic output on stderr
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){ //Allocate a new buffer unless one of exactly read.length was supplied
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){ //Only indices readStart..readStop-1 of match are written
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){ //Match
+ if(mode==MODE_MS){ //Continuing a match run
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i]; //Per-base bonus, added for matches only
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){ //Read base is negative or 'N'
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){ //Reference base is negative or 'N'
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{ //Substitution; the penalty depends on the length of the substitution run
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){ //Ungapped score of read against ref at refStart; also writes one match symbol per scored base into matchReturn[0]
+ int score=0;
+ int mode=-1; //MODE_MS or MODE_SUB once a match or substitution has been seen
+ int timeInMode=0; //Length of the current run in that mode, counted from 0
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){ //Read starts before the reference: skip those bases, charging POINTS_NOREF each
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){ //Read runs past the end of the reference
+ int dif=(refStop-ref.length);
+ System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); //Diagnostic output on stderr
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){ //Allocate a new buffer unless one of exactly read.length was supplied
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){ //Only indices readStart..readStop-1 of match are written
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){ //Match
+ if(mode==MODE_MS){ //Continuing a match run
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){ //Read base is negative or 'N'
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){ //Reference base is negative or 'N'
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{ //Substitution; the penalty depends on the length of the substitution run
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ public final static int maxQuality(int numBases){ //One POINTS_MATCH plus POINTS_MATCH2 for each of the remaining numBases-1 bases
+ return POINTS_MATCH+(numBases-1)*(POINTS_MATCH2);
+ }
+
+ public final static int maxQuality(byte[] baseScores){ //One POINTS_MATCH, POINTS_MATCH2 for each remaining base, plus Tools.sumInt(baseScores)
+ return POINTS_MATCH+(baseScores.length-1)*(POINTS_MATCH2)+Tools.sumInt(baseScores);
+ }
+
+ public final static int maxImperfectScore(int numBases){ //maxQuality(numBases) plus the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2
+// int maxQ=maxQuality(numBases);
+//// maxImperfectSwScore=maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB);
+// int maxI=maxQ+POINTS_DEL;
+// maxI=Tools.max(maxI, maxQ+POINTS_INS-POINTS_MATCH2);
+// maxI=Tools.min(maxI, maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB));
+
+ int maxQ=maxQuality(numBases);
+ int maxI=maxQ+Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+ assert(maxI<(maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB))); //The result must stay below this substitution-based expression
+ return maxI;
+ }
+
+ public final static int maxImperfectScore(byte[] baseScores){ //maxQuality(baseScores) plus the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2
+// int maxQ=maxQuality(numBases);
+//// maxImperfectSwScore=maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB);
+// int maxI=maxQ+POINTS_DEL;
+// maxI=Tools.max(maxI, maxQ+POINTS_INS-POINTS_MATCH2);
+// maxI=Tools.min(maxI, maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB));
+
+ int maxQ=maxQuality(baseScores);
+ int maxI=maxQ+Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+ assert(maxI<(maxQ-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB))); //The result must stay below this substitution-based expression
+ return maxI;
+ }
+
+ public final static String toString(int[] a){ //Formats each value right-aligned in a 7-character column
+
+ int width=7;
+
+ StringBuilder sb=new StringBuilder((a.length+1)*width+2);
+ for(int num : a){
+ String s=" "+num; //The leading space guarantees a separator between columns
+ int spaces=width-s.length();
+ assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+ for(int i=0; i<spaces; i++){sb.append(' ');}
+ sb.append(s);
+ }
+
+ return sb.toString();
+ }
+
+ public final static String toTimePacked(int[] a){ //Formats the time field (value&TIMEMASK) of each packed cell, right-aligned in 7-character columns
+ int width=7;
+
+ StringBuilder sb=new StringBuilder((a.length+1)*width+2);
+ for(int num_ : a){
+ int num=num_&TIMEMASK; //Keep only the low time bits
+ String s=" "+num;
+ int spaces=width-s.length();
+ assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+ for(int i=0; i<spaces; i++){sb.append(' ');}
+ sb.append(s);
+ }
+
+ return sb.toString();
+ }
+
+ public final static String toScorePacked(int[] a){ //Formats the score field (value>>SCOREOFFSET) of each packed cell in 7-character columns
+ int width=7;
+
+ String minString=" -";
+ String maxString=" ";
+ while(minString.length()<width){minString+='9';} //Placeholder for negative values that do not fit
+ while(maxString.length()<width){maxString+='9';} //Placeholder for positive values that do not fit
+
+ StringBuilder sb=new StringBuilder((a.length+1)*width+2);
+ for(int num_ : a){
+ int num=num_>>SCOREOFFSET; //Signed shift, so negative scores stay negative
+ String s=" "+num;
+ if(s.length()>width){s=num>0 ? maxString : minString;} //Too wide: print a run of 9s with the matching sign
+ int spaces=width-s.length();
+ assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces;
+ for(int i=0; i<spaces; i++){sb.append(' ');}
+ sb.append(s);
+ }
+
+ return sb.toString();
+ }
+
+ public final static String toString(byte[] a){ //Formats each byte as a number, right-aligned in a 6-character column
+
+ int width=6;
+
+ StringBuilder sb=new StringBuilder((a.length+1)*width+2);
+ for(int num : a){ //Each byte is widened to int, so it prints as a signed number
+ String s=" "+num;
+ int spaces=width-s.length();
+ assert(spaces>=0);
+ for(int i=0; i<spaces; i++){sb.append(' ');}
+ sb.append(s);
+ }
+
+ return sb.toString();
+ }
+
+ public final static String toString(byte[] ref, int startLoc, int stopLoc){ //Converts ref[startLoc..stopLoc], both ends inclusive, to a String with one char per byte
+ StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
+ for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
+ return sb.toString();
+ }
+
+ public static int calcDelScore(int len){ //Total score for a deletion of len bases: POINTS_DEL for the first base, then the DEL2, DEL3 and DEL4 rates for later bases; 0 when len<=0
+ if(len<=0){return 0;}
+ int score=POINTS_DEL; //First base
+
+ if(len>LIMIT_FOR_COST_4){ //Bases beyond LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){ //Bases beyond LIMIT_FOR_COST_3, up to LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){ //Bases 2 through LIMIT_FOR_COST_3
+ score+=(len-1)*POINTS_DEL2;
+ }
+ return score;
+ }
+
+ private static int calcDelScoreOffset(int len){ //Total score for a deletion of len bases, built from the POINTSoff_ (pre-shifted) constants; 0 when len<=0
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL; //First base
+
+ if(len>LIMIT_FOR_COST_4){ //Bases beyond LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){ //Bases beyond LIMIT_FOR_COST_3, up to LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){ //Bases 2 through LIMIT_FOR_COST_3
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ public static int calcInsScore(int len){ //Total score for an insertion of len bases: POINTS_INS for the first base, then the INS2, INS3 and INS4 rates for later bases; 0 when len<=0
+ if(len<=0){return 0;}
+ int score=POINTS_INS; //First base
+
+ if(len>LIMIT_FOR_COST_4){ //Bases beyond LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_4)*POINTS_INS4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){ //Bases beyond LIMIT_FOR_COST_3, up to LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_3)*POINTS_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){ //Bases 2 through LIMIT_FOR_COST_3
+ score+=(len-1)*POINTS_INS2;
+ }
+ return score;
+ }
+
+ private static int calcInsScoreOffset(int len){ //Total score for an insertion of len bases, built from the POINTSoff_ (pre-shifted) constants; 0 when len<=0
+ if(len<=0){return 0;}
+ int score=POINTSoff_INS; //First base
+ if(len>LIMIT_FOR_COST_4){ //Bases beyond LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){ //Bases beyond LIMIT_FOR_COST_3, up to LIMIT_FOR_COST_4
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){ //Bases 2 through LIMIT_FOR_COST_3
+ score+=(len-1)*POINTSoff_INS2;
+ }
+ return score;
+ }
+
+
+ public final int maxRows; //NOTE(review): presumably the largest read length the matrices were sized for - confirm against the constructor
+ public final int maxColumns; //Upper bound on the reference window width; fillAndScoreLimited truncates wider windows to this
+
+ private final int[][][] packed; //Cells indexed [state][row][col]; each int combines a score (high bits) with a time (low bits)
+
+ public final int[] vertLimit; //Indexed by row; values carry the SCOREOFFSET shift (showVertLimit shifts them back)
+ public final int[] horizLimit; //Indexed by column; values carry the SCOREOFFSET shift (showHorizLimit shifts them back)
+
+ CharSequence showVertLimit(){ //Lists vertLimit[0..rows], each shifted right by SCOREOFFSET and followed by a comma
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+ CharSequence showHorizLimit(){ //Lists horizLimit[0..columns], each shifted right by SCOREOFFSET and followed by a comma
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+// public static final int MODEBITS=2;
+ public static final int TIMEBITS=12; //Width of the time field, held in the low bits of a packed cell
+ public static final int SCOREBITS=32-TIMEBITS; //Width of the score field, held in the remaining high bits
+ public static final int MAX_TIME=((1<<TIMEBITS)-1); //Largest value the time field can hold
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; //2000 below the largest positive value of a signed SCOREBITS-bit number
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+// public static final int MODEOFFSET=0; //Always zero.
+// public static final int TIMEOFFSET=0;
+ public static final int SCOREOFFSET=TIMEBITS; //Left shift that moves a plain score into the score field
+
+// public static final int MODEMASK=~((-1)<<MODEBITS);
+// public static final int TIMEMASK=(~((-1)<<TIMEBITS))<<TIMEOFFSET;
+ public static final int TIMEMASK=~((-1)<<TIMEBITS); //Selects the time field of a packed cell
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; //Selects the score field of a packed cell
+
+ private static final byte MODE_MS=0; //Match/substitution state; also the first index into packed
+ private static final byte MODE_DEL=1; //Deletion state
+ private static final byte MODE_INS=2; //Insertion state
+ private static final byte MODE_SUB=3; //Marks a substitution run in the scoreNoIndels methods
+
+ public static final int POINTS_NOREF=-12;
+ public static final int POINTS_NOCALL=-12;
+ public static final int POINTS_MATCH=90;
+ public static final int POINTS_MATCH2=100;
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-145;
+ public static final int POINTS_SUBR=-163; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-59;
+ public static final int POINTS_SUB3=-35;
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-214;
+ public static final int POINTS_INS2=-55;
+ public static final int POINTS_INS3=-39;
+ public static final int POINTS_INS4=-17;
+ public static final int POINTS_DEL=-258;
+ public static final int POINTS_DEL2=-35;
+ public static final int POINTS_DEL3=-26;
+ public static final int POINTS_DEL4=-14;
+ public static final int POINTS_DEL_REF_N=-10;
+
+
+ public static final int LIMIT_FOR_COST_3=5; //Run length from which the third-tier (..3) per-base cost applies
+ public static final int LIMIT_FOR_COST_4=30; //Run length from which the fourth-tier (..4) per-base cost applies
+
+ public static final int BAD=MIN_SCORE-1; //Sentinel one point below the lowest valid score
+
+
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET); //Each POINTSoff_ constant is the POINTS_ value shifted into the score field
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+ private int rows; //Index of the last matrix row used by the current alignment
+ private int columns; //Index of the last matrix column used by the current alignment
+
+ public long iterationsLimited=0;
+ public long iterationsUnlimited=0;
+
+ public boolean verbose=false;
+ public boolean verbose2=false;
+
+}
diff --git a/current/align2/MultiStateAligner9PacBioAdapter_WithBarriers.java b/current/align2/MultiStateAligner9PacBioAdapter_WithBarriers.java
new file mode 100755
index 0000000..c183cf6
--- /dev/null
+++ b/current/align2/MultiStateAligner9PacBioAdapter_WithBarriers.java
@@ -0,0 +1,2594 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+
+/**
+ * Based on MSA9ts, with transform scores tweaked for PacBio. */
+public final class MultiStateAligner9PacBioAdapter_WithBarriers {
+
+
+ public static void main(String[] args){ //Debug driver: aligns args[0] (read) against args[1] (reference) and prints the matrices, match string and score
+ byte[] read=args[0].getBytes();
+ byte[] ref=args[1].getBytes();
+
+ //The whole of ref is used as the alignment window; the matrices are sized to fit exactly.
+
+ MultiStateAligner9PacBioAdapter_WithBarriers msa=new MultiStateAligner9PacBioAdapter_WithBarriers(read.length, ref.length);
+
+ System.out.println("Initial: "); //Matrices as left by the constructor
+ for(int mode=0; mode<msa.packed.length; mode++){
+ for(int row=0; row<msa.packed[mode].length; row++){
+ System.out.println(toScorePacked(msa.packed[mode][row]));
+ }
+ System.out.println();
+ for(int row=0; row<msa.packed[mode].length; row++){
+ System.out.println(toTimePacked(msa.packed[mode][row]));
+ }
+ System.out.println();
+ }
+
+ int[] max=msa.fillLimited(read, ref, 0, ref.length-1, 0, null); //null gaps: the reference is used as given
+
+ System.out.println("Max: "+Arrays.toString(max));
+
+ System.out.println("Final: "); //Matrices after the fill (this header used to repeat "Initial: ")
+ for(int mode=0; mode<msa.packed.length; mode++){
+ for(int row=0; row<msa.packed[mode].length; row++){
+ System.out.println(toScorePacked(msa.packed[mode][row]));
+ }
+ System.out.println();
+ for(int row=0; row<msa.packed[mode].length; row++){
+ System.out.println(toTimePacked(msa.packed[mode][row]));
+ }
+ System.out.println();
+ }
+
+ byte[] out=msa.traceback(read, ref, 0, ref.length-1, max[0], max[1], max[2], false); //max[0..2] are passed as row, col and state
+
+ int[] score=null;
+ score=msa.score(read, ref, 0, ref.length-1, max[0], max[1], max[2], false);
+
+ System.out.println(new String(ref));
+ System.out.println(new String(read));
+ System.out.println(new String(out));
+ System.out.println("Score: "+Arrays.toString(score));
+ }
+
+
+ public MultiStateAligner9PacBioAdapter_WithBarriers(int maxRows_, int maxColumns_){ //Allocates the matrices for reads of up to maxRows_ bases and windows of up to maxColumns_ columns, and initializes their borders
+// assert(maxColumns_>=200);
+// assert(maxRows_>=200);
+ maxRows=maxRows_;
+ maxColumns=maxColumns_;
+ packed=new int[3][maxRows+1][maxColumns+1]; //Three matrices, each with an extra row 0 and column 0
+ grefbuffer=new byte[maxColumns+2]; //NOTE(review): presumably the buffer makeGref fills - confirm
+
+ vertLimit=new int[maxRows+1];
+ horizLimit=new int[maxColumns+1];
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+// for(int i=0; i<maxColumns+1; i++){
+// scores[0][i]=0-i;
+// }
+
+ for(int matrix=0; matrix<packed.length; matrix++){
+ for(int i=1; i<=maxRows; i++){ //Rows 1..maxRows start as BADoff; row 0 keeps the zeros the array was created with
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff;
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ for(int i=0; i<=maxRows; i++){ //Column 0 is then overwritten with the cumulative cost of an insertion i bases long
+
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+ int score=(i<2 ? (i*POINTSoff_INS) :
+ (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 :
+ (i<LIMIT_FOR_COST_4 ? prevScore+POINTSoff_INS3 : prevScore+POINTSoff_INS4)));
+
+ packed[matrix][i][0]=score;
+ }
+// for(int i=1; i<maxColumns+1; i++){
+// prevState[matrix][0][i]=MODE_DEL;
+// }
+// for(int i=0; i<=maxColumns; i++){
+// packed[matrix][0][i]|=MODE_MS;
+// }
+ }
+ }
+
+
+ /** Fills the matrix for read against ref[refStartLoc..refEndLoc]; when gaps is non-null the fill runs on the array returned by makeGref instead. return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);} //No gaps: align directly against ref
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillLimitedX(read, gref, 0, greflimit, minScore); //gref is indexed from 0; NOTE(review): greflimit is presumably set by makeGref - confirm
+ }
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){
+// assert(false) : minScore;
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=100; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+// for(int x=0; x<packed.length; x++){
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+// }
+ for(int x=0; x<packed.length; x++){
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH;
+ final int floor=minScore_off-maxGain;
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2;
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ vertLimit[rows]=minScore_off;
+ for(int i=rows-1; i>=0; i--){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+ }
+
+ horizLimit[columns]=minScore_off;
+ for(int i=columns-1; i>=0; i--){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+ }
+
+ for(int row=1; row<=rows; row++){
+
+ int colStart=minGoodCol;
+ int colStop=maxGoodCol;
+
+ minGoodCol=-1;
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;}
+
+
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+
+
+ if(verbose2){
+ System.out.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ if(col>=colStop){
+ if(col>colStop && maxGoodCol<col){break;}
+ if(row>1){
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+ if(maxScore<minScore_off){
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit);
+ }
+ }
+
+
+ /** Fills every cell of the packed matrix for aligning read against ref[refStartLoc..refEndLoc].
+ * Does not require a min score (ie, same as old method), so no cells are skipped for scoring reasons.
+ * Sets the fields rows (read.length) and columns (refEndLoc-refStartLoc+1).
+ * Each cell holds a score (SCOREMASK bits) OR'd with a streak length (TIMEMASK bits), one layer per
+ * state: MODE_MS (match/substitution), MODE_DEL (deletion) and MODE_INS (insertion).
+ * Row 0 and column 0 are read but never written here; they are assumed to be initialized elsewhere.
+ * @param read Query bases; matrix row r corresponds to read[r-1]
+ * @param ref Reference bases; matrix column c corresponds to ref[refStartLoc+c-1]; GAPC symbols are treated as gaps
+ * @param refStartLoc First reference index of the window (asserted >=0)
+ * @param refEndLoc Last reference index of the window, inclusive (asserted <ref.length)
+ * @return new int[] {rows, maxC, maxS, max}: the last row index, the column and state of the best-scoring
+ * cell in the last row, and that score shifted right by SCOREOFFSET
+ * @throws RuntimeException if rows>maxRows or columns>maxColumns */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length; //one matrix row per read base
+ columns=refEndLoc-refStartLoc+1; //one matrix column per reference position in the window
+
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //score if every base matched: first base MATCH, the rest MATCH2
+ final int subfloor=0-2*maxGain; //value stored in cells that are not allowed (gap columns, barrier rows)
+ assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more.
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1; //used below: INS is blocked when row>BARRIER_I2 unless col>=BARRIER_I2b
+ final int BARRIER_D2=rows-BARRIER_D1; //used below: DEL is blocked when row>BARRIER_D2
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++; //counts every cell visited by this method
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]); //previous read base, or the placeholder '?' on the first row
+ final byte call1=read[row-1]; //current read base
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); //previous ref base, or the placeholder '!' in the first column
+ final byte ref1=ref[refStartLoc+col-1]; //current ref base
+
+ final boolean match=(call1==ref1 && ref1!='N'); //an 'N' never counts as a match
+ final boolean prevMatch=(call0==ref0 && ref0!='N'); //whether the diagonal predecessor pair matched
+
+ final boolean gap=(ref1==GAPC); //this column is a gap symbol of a gapped reference
+ assert(call1!=GAPC);
+
+ if(gap){
+ packed[MODE_MS][row][col]=subfloor; //a read base cannot be aligned to a gap symbol
+ }else{//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; //all three MS predecessors come from the diagonal cell
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //length of the run stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); //consecutive matches earn MATCH2
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1); //extend the match run or start a new one
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //clamp the streak counter; presumably keeps it inside the TIMEMASK field - TODO confirm
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //score and streak share one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3)); //substitution cost depends on the preceding run
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL; //either base is 'N'
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read in this method
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1); //a mismatch after a match starts a new run; otherwise the mismatch run grows
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ if(row<BARRIER_D1 || row>BARRIER_D2){
+ packed[MODE_DEL][row][col]=subfloor; //no deletions in the first or last rows of the read
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //length of the deletion run ending in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; //despite the name, DEL predecessors are in the same row, one column left
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL; //opening a deletion
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //past LIMIT_FOR_COST_5, DEL5 is charged only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N; //extra term when the deleted ref base is 'N'
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP; //extra term when the deleted ref symbol is a gap symbol
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read in this method
+ if(scoreMS>=scoreD){ //ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //Calculate INS score
+// if(gap || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor; //no insertions at gap symbols, nor in the first/last rows except at the edge columns
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //length of the insertion run ending in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; //despite the name, INS predecessors are in the same column, one row up
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS; //opening an insertion
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4); //extension cost depends on the current insertion run length
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read in this method
+ if(scoreMS>=scoreI){ //ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //scan the last row of every state for the best score
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //strict comparison: the first cell found wins ties
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //drop the low bits so the caller receives a plain score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /** Quality-aware variant of the unlimited matrix fill; deprecated and unfinished.
+ * The first statement is assert(false), so the method fails immediately when assertions are enabled.
+ * Differs from fillUnlimited in the visible code as follows: a match is plain byte equality (no special
+ * handling of 'N' or GAPC), there are no insertion/deletion barriers, and every score written on a match
+ * is increased by baseScores[row-1] shifted left by SCOREOFFSET.
+ * @param read Query bases; matrix row r corresponds to read[r-1]
+ * @param ref Reference bases; matrix column c corresponds to ref[refStartLoc+c-1]
+ * @param baseScores Per-read-base bonus added on matches; indexed like read
+ * @param refStartLoc First reference index of the window (asserted >=0)
+ * @param refEndLoc Last reference index of the window, inclusive (asserted <ref.length)
+ * @return new int[] {rows, maxC, maxS, max}; max is shifted right by SCOREOFFSET */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; //always fails when assertions are enabled
+ rows=read.length; //one matrix row per read base
+ columns=refEndLoc-refStartLoc+1; //one matrix column per reference position in the window
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]); //plain equality; 'N' is not special-cased here
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); //false on the first row or column
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; //all three MS predecessors come from the diagonal cell
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK); //length of the run stored in the diagonal MS cell
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); //consecutive matches earn MATCH2
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ //ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier: per-base value from baseScores, shifted into the score field
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //clamp the streak counter; presumably keeps it inside the TIMEMASK field - TODO confirm
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //score and streak share one int
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3)); //substitution cost depends on the preceding run
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read in this method
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; //length of the deletion run ending in the cell to the left
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; //despite the name, DEL predecessors are in the same row, one column left
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL; //opening a deletion
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //past LIMIT_FOR_COST_5, DEL5 is charged only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read in this method
+ if(scoreMS>=scoreD){ //ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; //length of the insertion run ending in the cell above
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; //despite the name, INS predecessors are in the same column, one row up
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS; //opening an insertion
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4); //extension cost depends on the current insertion run length
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState; //assigned below but never read in this method
+ if(scoreMS>=scoreI){ //ties prefer opening from MS
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //scan the last row of every state for the best score
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){ //strict comparison: the first cell found wins ties
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //drop the low bits so the caller receives a plain score
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+
+ /** Generates the match string for an alignment whose matrix has already been filled.
+  * For an ungapped alignment the arguments are passed straight to traceback2.
+  * For a gapped alignment the reference coordinates are first translated into coordinates
+  * of grefbuffer, and the traceback then runs over grefbuffer instead of ref.
+  * @param gapped True if the matrix was filled against the gapped reference
+  * @return The match string produced by traceback2 */
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+     if(!gapped){
+         return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state);
+     }
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+     return traceback2(read, gappedRef, gappedStart, gappedStop, row, col, state);
+ }
+
+
+ /** Generates the match string by walking the filled matrix backward from cell (row, col) in the given state.
+  * One symbol is emitted per step and the result is returned in read order:
+  * 'm' for equal bases, 'S' for a substitution, 'N' when either base is not fully defined,
+  * 'D' for a deletion, 'I' for an insertion, 'Y' for an insertion at col>=columns,
+  * and 'X' for read bases left over once column 0 has been reached.
+  * A deletion over a GAPC reference symbol is expanded to GAPLEN 'D' symbols in the output.
+  * NOTE(review): the gap marker is written as the literal '-' but expanded by comparing with GAPC;
+  * this assumes GAPC=='-' - confirm against the constant's definition.
+  * @param read Query bases
+  * @param ref Reference bases (plain or gapped); column c corresponds to ref[refStartLoc+c-1]
+  * @param refStartLoc First reference index of the aligned window
+  * @param refEndLoc Last reference index of the aligned window; only used in an assertion
+  * @param row Starting row; asserted to equal rows
+  * @param col Starting column
+  * @param state Starting state (MODE_MS, MODE_DEL or MODE_INS)
+  * @return The match string */
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+     assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+     assert(row==rows);
+
+     //Every step of the main loop lowers row+col by at least 1, and the trailing 'X' loop writes once per
+     //remaining row, so at most row+col symbols are written.  The previous size of row+col-1 was one short
+     //for a path that reaches column 0 through DEL steps while read rows remain.
+     final byte[] out=new byte[row+col];
+     int outPos=0;
+
+     int gaps=0; //number of GAPC symbols crossed; each one is expanded after the walk
+
+     while(row>0 && col>0){
+
+         final int time=packed[state][row][col]&TIMEMASK;
+         final byte prev;
+
+         if(state==MODE_MS){
+             //A streak longer than 1 means the previous cell was also MS; otherwise re-derive the
+             //predecessor with the same tie-breaking order the fill used (MS, then DEL, then INS).
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+                 else{prev=MODE_INS;}
+             }
+
+             final byte c=read[row-1];
+             final byte r=ref[refStartLoc+col-1];
+             if(c==r){
+                 out[outPos]='m';
+             }else if(!AminoAcid.isFullyDefined(c) || !AminoAcid.isFullyDefined(r)){
+                 out[outPos]='N';
+             }else{
+                 out[outPos]='S';
+             }
+
+             row--;
+             col--;
+         }else if(state==MODE_DEL){
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+                 else{prev=MODE_DEL;}
+             }
+
+             final byte r=ref[refStartLoc+col-1];
+             if(r==GAPC){
+                 out[outPos]='-';
+                 gaps++;
+             }else{
+                 out[outPos]='D';
+             }
+             col--;
+         }else{
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else{prev=MODE_INS;}
+             }
+
+             assert(state==MODE_INS) : state;
+             if(col==0){
+                 out[outPos]='X';
+             }else if(col>=columns){
+                 out[outPos]='Y';
+             }else{
+                 out[outPos]='I';
+             }
+             row--;
+         }
+
+         state=prev;
+         outPos++;
+     }
+
+     assert(row==0 || col==0);
+     if(col!=row){
+         //Column 0 was reached with read bases left over: mark each of them with 'X'
+         while(row>0){
+             out[outPos]='X';
+             outPos++;
+             row--;
+             col--;
+         }
+     }
+
+     //Shrink and reverse the string
+     final byte[] out2=new byte[outPos];
+     for(int i=0; i<outPos; i++){
+         out2[i]=out[outPos-i-1];
+     }
+
+     if(gaps==0){return out2;}
+
+     //TODO Consider outputting this compressed.
+     //Expand every gap marker into GAPLEN deletion symbols
+     final byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)];
+     for(int i=0, j=0; i<out2.length; i++){
+         final byte c=out2[i];
+         if(c!=GAPC){
+             out3[j]=c;
+             j++;
+         }else{
+             final int lim=j+GAPLEN;
+             for(; j<lim; j++){
+                 out3[j]='D';
+             }
+         }
+     }
+     return out3;
+ }
+
+ /** Scores an already-filled alignment and reports where it starts and stops on the reference.
+  * For an ungapped alignment this delegates directly to score2.
+  * For a gapped alignment the window is translated into grefbuffer coordinates, score2 is run on
+  * grefbuffer, and elements 1 and 2 of the result are translated back to ungapped coordinates.
+  * @param gapped True if the matrix was filled against the gapped reference
+  * @return {score, bestRefStart, bestRefStop}, plus any extra elements score2 appends */
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+         final int maxRow, final int maxCol, final int maxState, boolean gapped){
+     if(!gapped){
+         return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState);
+     }
+
+     if(verbose){
+         System.err.println("score():");
+         System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+     }
+
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+
+     //TODO: Remove slow assertions
+     assert(translateFromGappedCoordinate(gappedStart, gappedRef)==refStartLoc);
+     assert(translateFromGappedCoordinate(gappedStop, gappedRef)==refEndLoc);
+
+     //TODO: skip translation if this is always zero
+     assert(gappedStart==0) : gappedStart;
+
+     if(verbose){System.err.println("gstart, gstop: "+gappedStart+", "+gappedStop);}
+     final int[] result=score2(read, gappedRef, gappedStart, gappedStop, maxRow, maxCol, maxState);
+     if(verbose){System.err.println("got score "+Arrays.toString(result));}
+
+     //Both endpoints must survive a round trip through the coordinate translation
+     assert(result[1]==translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef)) :
+         "Verifying: "+result[1]+" -> "+translateFromGappedCoordinate(result[1], gappedRef)+" -> "+
+         translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef);
+     assert(result[2]==translateToGappedCoordinate(translateFromGappedCoordinate(result[2], gappedRef), gappedRef));
+
+     result[1]=translateFromGappedCoordinate(result[1], gappedRef);
+     result[2]=translateFromGappedCoordinate(result[2], gappedRef);
+     if(verbose){System.err.println("returning score "+Arrays.toString(result));}
+     return result;
+ }
+
+ /** Walks the already-filled matrix backward from cell (maxRow, maxCol) in state maxState to find
+ * where the alignment starts on the reference.  No bases are compared here; only the stored scores
+ * and streak lengths in packed are read.
+ * @param read Query bases; only used in assertion messages
+ * @param ref Reference bases; only used in assertion messages
+ * @param refStartLoc Reference index of matrix column 1
+ * @param refEndLoc Last reference index of the aligned window, inclusive
+ * @param maxRow Row of the starting cell
+ * @param maxCol Column of the starting cell
+ * @param maxState State of the starting cell (index into packed)
+ * @return {score, bestRefStart, bestRefStop}, or {score, bestRefStart, bestRefStop, padLeft, padRight}
+ * when the alignment extends outside [refStartLoc, refEndLoc]; score is shifted right by SCOREOFFSET */
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ if(row<rows){ //the starting cell is above the last row: extend it diagonally to the last row
+ int difR=rows-row; //read rows still missing
+ int difC=columns-col; //matrix columns still available
+
+ while(difR>difC){ //each missing row with no column left to pair with costs POINTSoff_NOREF
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+
+ row+=difR;
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+
+ final int bestRefStop=refStartLoc+col-1; //column col corresponds to ref index refStartLoc+col-1
+
+ while(row>0 && col>0){
+// System.err.println("state="+state+", row="+row+", col="+col);
+
+
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK; //streak length stored in the current cell
+ final byte prev;
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;} //a streak longer than 1 means the previous cell had the same state
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} //ties prefer MS, then DEL, then INS
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ row--; //MS consumes one read base and one ref base
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ col--; //DEL consumes one ref base only
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ row--; //INS consumes one read base only
+ }
+
+ if(col<0){ //NOTE(review): col is >0 on loop entry and drops by at most 1, so this looks unreachable - confirm
+ System.err.println(row);
+ break; //prevents an out of bounds access
+
+ }
+
+// assert(prev==prev0);
+ state=prev;
+
+// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n");
+ }
+// assert(false) : row+", "+col;
+ if(row>col){ //read rows remain after the walk: shift the start left by that many positions
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col; //may be smaller than refStartLoc when col is negative
+
+ score>>=SCOREOFFSET; //drop the low bits so the caller receives a plain score
+ int[] rvec;
+ if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
+ int padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ int padRight=Tools.max(0, bestRefStop-refEndLoc);
+ rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop};
+ }
+ return rvec;
+ }
+
+
+ /**
+  * Fills only the matrix cells that can still reach minScore, then scores the best cell found.
+  * The requested window is clamped to [0, ref.length-1] before filling.
+  * @param read Read bases
+  * @param ref Reference bases
+  * @param refStartLoc Requested first reference position (clamped to 0)
+  * @param refEndLoc Requested last reference position (clamped to ref.length-1)
+  * @param minScore Score threshold handed to fillLimited
+  * @param gaps Gap list handed to fillLimited; null selects the ungapped path
+  * @return {score, bestRefStart, bestRefStop} as produced by score(...), or null when fillLimited returns null
+  */
+ public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+     final int lo=Tools.max(0, refStartLoc);
+     int hi=Tools.min(ref.length-1, refEndLoc);
+     assert(hi>=lo);
+
+     final boolean gapped=(gaps!=null);
+     if(gapped){
+         if(verbose){System.err.println("\ngaps: "+Arrays.toString(gaps)+"\n"+new String(read)+"\ncoords: "+refStartLoc+", "+refEndLoc);}
+     }else{
+         if(verbose){
+             System.err.println("no gaps");
+         }
+         //Only the ungapped path narrows a window that is wider than the matrix.
+         if(hi-lo>=maxColumns){
+             System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(hi-lo+1)+" > "+maxColumns);
+             assert(false) : refStartLoc+", "+refEndLoc;
+             hi=Tools.min(ref.length-1, lo+maxColumns-1);
+         }
+     }
+
+     final int[] peak=fillLimited(read, ref, lo, hi, minScore, gaps);
+     if(gapped && verbose){System.err.println("max: "+Arrays.toString(peak));}
+
+     //peak holds the three values forwarded to score(...); the last flag tells score whether gaps were used.
+     return (peak==null ? null : score(read, ref, lo, hi, peak[0], peak[1], peak[2], gapped));
+ }
+
+ /**
+  * Convenience overload: unpacks chrom, start, stop and gaps from the site and delegates
+  * to the chromosome-number overload.
+  * @param read Read bases
+  * @param ss Site supplying chrom, start, stop and gaps
+  * @param thresh Passed through unchanged
+  * @param minScore Passed through unchanged
+  * @return Whatever the delegated overload returns
+  */
+ public final int[] fillAndScoreLimited(byte[] read, SiteScore ss, int thresh, int minScore){
+     return fillAndScoreLimited(
+             read,
+             ss.chrom, ss.start, ss.stop,
+             thresh, minScore,
+             ss.gaps);
+ }
+ /*
+ public final int[] fillAndScoreLimited_Gapped(byte[] read, SiteScore ss, int thresh, int minScore){
+ if(ss.gaps==null){return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore);}
+ int[] gaps=ss.gaps;
+ final int bound1=gaps[0]=Tools.min(ss.start, gaps[0]);
+ final int bound2=gaps[gaps.length-1]=Tools.max(ss.stop, gaps[gaps.length-1]);
+
+ //This block is no longer needed since the array is preallocated.
+ int len=0;
+ final int gb2=GAPBUFFER*2;
+ for(int i=0; i<gaps.length; i+=2){
+ int x=gaps[i];
+ int y=gaps[i+1];
+ len+=(y-x+1);
+ if(i+2<gaps.length){
+ int z=gaps[i+2];
+ assert(z>y);
+ int gap=z-y-1;
+ if(gap<MINGAP){
+ len+=gap;
+ }else{
+ len+=gb2;
+ gap-=gb2;
+ int div=gap/GAPLEN;
+ int rem=gap%GAPLEN;
+ len+=(div+rem);
+ }
+ }
+ }
+ byte[] gref=grefbuffer;
+ assert(gref.length>=len) : ss+"\t"+len+"\t"+gref.length;
+
+ ChromosomeArray cha=Data.getChromosome(ss.chrom);
+
+ for(int i=0, j=0; i<gaps.length; i+=2){
+ int x=gaps[i];
+ int y=gaps[i+1];
+
+ for(int r=x; r<=y; r++, j++){
+ gref[j]=cha.get(r);
+ }
+
+ if(i+2<gaps.length){
+ int z=gaps[i+2];
+ assert(z>y);
+ int gap=z-y-1;
+ assert(gap>=MINGAP);
+ if(gap<MINGAP){
+ assert(false) : "TODO - just fill in normally";
+ }else{
+ int div=gap/GAPLEN;
+ assert(div>0);
+ int rem=gap%GAPLEN;
+ int lim=y+GAPBUFFER;
+
+ for(int r=y+1; r<=lim; r++, j++){
+ gref[j]=cha.get(r);
+ }
+ for(int g=0; g<div; g++, j++){
+ gref[j]=GAPC;
+ }
+ for(int r=z-GAPBUFFER; r<z; r++, j++){
+ gref[j]=cha.get(r);
+ }
+ }
+ }
+ }
+// fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore)
+ int[] scoreArray=fillAndScoreLimited(read, ref, 0, ref.length-1, minScore);
+ //Need to remap coordinates.
+
+// {score, bestRefStart, bestRefStop}
+ if(scoreArray==null){return null;}
+
+ int rstart=scoreArray[1];
+ int rstop=scoreArray[2];
+
+ int rstart2=-9999;
+ int rstop2=-9999;
+
+ for(int i=0, j=bound1; i<=len; i++){
+ byte refc=ref[i];
+
+ if(i==rstart){rstart2=j;}
+ if(i==rstop){
+ rstop2=j;
+ assert(rstart2>-9999);
+ break;
+ }
+
+ if(refc!=GAPC){
+ j++;
+ }else{
+ j+=GAPLEN;
+ }
+ }
+ assert(rstart2>-9999 && rstop2>-9999);
+ scoreArray[1]=rstart2;
+ scoreArray[2]=rstop2;
+
+ return scoreArray;
+ }*/
+
+ /**
+  * Fills grefbuffer with a gap-compressed copy of the reference and records its extent.
+  * Each {gaps[i], gaps[i+1]} pair is copied verbatim (inclusive). Between two consecutive pairs,
+  * GAPBUFFER+(gap%GAPLEN) real bases are copied after the first pair, then (gap-GAPBUFFER2)/GAPLEN
+  * GAPC symbols are written, then the GAPBUFFER real bases preceding the next pair are copied.
+  * Side effects: sets grefRefOrigin (reference coordinate of gref[0]), greflimit (number of
+  * cells written) and greflimit2 (index of the last cell of the trailing cushion).
+  * gaps[0] and gaps[gaps.length-1] are widened to the window while building and are always
+  * restored before this method exits, including when an exception escapes.
+  * @param ref Reference bases
+  * @param gaps Alternating start/stop reference coordinates of the ungapped stretches
+  * @param refStartLoc First reference position to include; must be at most gaps[0]
+  * @param refEndLoc Last reference position to include; must be at least the last gaps entry
+  * @return grefbuffer
+  */
+ public final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){
+     assert(gaps!=null && gaps.length>0);
+
+     assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps);
+     assert(refEndLoc>=gaps[gaps.length-1]);
+
+     final int g0_old=gaps[0];
+     final int gN_old=gaps[gaps.length-1];
+     final byte[] gref=grefbuffer; //Preallocated, so no length pre-computation is needed.
+
+     try{
+         gaps[0]=Tools.min(gaps[0], refStartLoc);
+         gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+         grefRefOrigin=gaps[0];
+
+         if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+         int gpos=0;
+         for(int i=0; i<gaps.length; i+=2){
+             int x=gaps[i];
+             int y=gaps[i+1];
+
+             for(int r=x; r<=y; r++, gpos++){
+                 //TODO: if out of bounds, use an 'N'
+                 gref[gpos]=ref[r];
+             }
+
+             if(i+2<gaps.length){
+                 int z=gaps[i+2];
+                 assert(z>y);
+                 int gap=z-y-1;
+                 assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+                 if(gap<MINGAP){
+                     assert(false) : "TODO - just fill in normally";
+                 }else{
+                     int rem=gap%GAPLEN;
+                     int lim=y+GAPBUFFER+rem;
+
+                     int div=(gap-GAPBUFFER2)/GAPLEN;
+                     if(verbose){
+                         System.err.println("div = "+div);
+                     }
+                     assert(div>0);
+
+                     //Real bases right after the stretch, including the remainder that does not fill a whole GAPC.
+                     for(int r=y+1; r<=lim; r++, gpos++){
+                         gref[gpos]=ref[r];
+                     }
+                     //One GAPC per compressed GAPLEN-sized chunk.
+                     for(int g=0; g<div; g++, gpos++){
+                         gref[gpos]=GAPC;
+                     }
+                     //Real bases right before the next stretch.
+                     for(int r=z-GAPBUFFER; r<z; r++, gpos++){
+                         gref[gpos]=ref[r];
+                     }
+                 }
+             }
+         }
+
+         greflimit=gpos;
+
+         assert(gref[gpos-1]==ref[refEndLoc]);
+
+         //Add a cushion to the end to clear out the prior data (especially GAPC) that was there
+         {
+             //lim is clamped to the buffer, so it can never exceed gref.length.
+             final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+             for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){
+                 gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+             }
+             //Assigned unconditionally so an empty cushion cannot leave a stale value from an earlier call.
+             greflimit2=lim-1;
+         }
+
+         if(verbose){
+             System.err.println("gref:\n"+new String(gref));
+         }
+     }finally{
+         //The caller owns gaps; undo the temporary widening even on failure.
+         gaps[0]=g0_old;
+         gaps[gaps.length-1]=gN_old;
+     }
+
+     if(verbose){
+         System.err.println("\ngaps3: "+Arrays.toString(gaps));
+     }
+
+     return gref;
+ }
+
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score){
+//// {score, bestRefStart, bestRefStop}
+// int a=score[1];
+// int b=score[2];
+// int a2=-9999;
+// int b2=-9999;
+// for(int i=0, j=grefRefOrigin; i<grefbuffer.length; i++){
+// byte c=grefbuffer[i];
+//
+// if(i==a){a2=j;}
+// if(i==b){
+// b2=j;
+// assert(a2!=-9999);
+// score[1]=a2;
+// score[2]=b2;
+// return score;
+// }
+//
+// j+=(c==GAPC ? GAPLEN : 1);
+//// if(c!=GAPC){j++;}
+//// else{j+=GAPLEN;}
+// }
+// throw new RuntimeException("Out of bounds.");
+// }
+
+ /**
+  * Converts an index into the gapped reference back to a reference coordinate.
+  * Walks gref from index 0 (reference coordinate grefRefOrigin), advancing the reference
+  * coordinate by GAPLEN for each GAPC symbol and by 1 for any other symbol.
+  * @param point Index into gref; values at or below 0 are offset linearly from grefRefOrigin
+  * @param gref Gapped reference buffer
+  * @return Reference coordinate corresponding to point
+  * @throws RuntimeException if point is not reached before index greflimit2
+  */
+ private final int translateFromGappedCoordinate(int point, byte[] gref){
+     if(verbose){System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=0){return grefRefOrigin+point;}
+
+     int refPos=grefRefOrigin;
+     int idx=0;
+     while(idx<greflimit2){
+         final byte sym=gref[idx];
+         assert(point>=idx) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+         if(idx==point){
+             if(verbose){System.err.println(" -> "+refPos);}
+             return refPos;
+         }
+
+         if(sym==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos++;
+         }
+         idx++;
+     }
+
+     //Dump the state that led here before failing.
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+ /**
+  * Converts a reference coordinate to an index into the gapped reference.
+  * Walks gref from index 0 (reference coordinate grefRefOrigin), advancing the reference
+  * coordinate by GAPLEN for each GAPC symbol and by 1 for any other symbol, until the
+  * running coordinate equals point.
+  * @param point Reference coordinate; values at or below grefRefOrigin are offset linearly
+  * @param gref Gapped reference buffer
+  * @return Index into gref whose reference coordinate equals point
+  * @throws RuntimeException if no index before greflimit2 maps exactly to point
+  */
+ private final int translateToGappedCoordinate(int point, byte[] gref){
+     if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=grefRefOrigin){return point-grefRefOrigin;}
+
+     int refPos=grefRefOrigin;
+     int idx=0;
+     while(idx<greflimit2){
+         assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+         final byte sym=gref[idx];
+
+         if(refPos==point){
+             if(verbose){System.err.println(" -> "+idx);}
+             return idx;
+         }
+
+         if(sym==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos++;
+         }
+         idx++;
+     }
+
+     //Dump the state that led here before failing.
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+ /**
+  * Convenience overload: looks up the chromosome's base array and widens the window by
+  * thresh on both sides before delegating to the array-based overload.
+  * @param read Read bases
+  * @param chrom Chromosome number passed to Data.getChromosome
+  * @param start Window start before padding
+  * @param stop Window stop before padding
+  * @param thresh Padding subtracted from start and added to stop
+  * @param minScore Passed through unchanged
+  * @param gaps Passed through unchanged
+  * @return Whatever the delegated overload returns
+  */
+ public final int[] fillAndScoreLimited(byte[] read, int chrom, int start, int stop, int thresh, int minScore, int[] gaps){
+     return fillAndScoreLimited(
+             read,
+             Data.getChromosome(chrom).array,
+             start-thresh, stop+thresh,
+             minScore, gaps);
+ }
+
+ /**
+  * Deprecated quality-aware fill. Clamps the window to the reference, narrows it when it is
+  * wider than maxColumns, runs fillQ, and then discards the result.
+  * @return Always null; the scoring step is disabled
+  */
+ @Deprecated
+ public final int[] fillAndScoreQ(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, byte[] baseScores){
+     final int lo=Tools.max(0, refStartLoc);
+     int hi=Tools.min(ref.length-1, refEndLoc);
+     assert(hi>=lo);
+     if(hi-lo>=maxColumns){
+         System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(hi-lo+1)+" > "+maxColumns);
+         hi=Tools.min(ref.length-1, lo+maxColumns-1);
+     }
+     //The fill is still executed for whatever side effects it has; its return value is unused.
+     fillQ(read, ref, baseScores, lo, hi);
+     return null;
+ }
+
+ /**
+  * Deprecated convenience overload: unpacks chrom, start and stop from the site and
+  * delegates to the chromosome-number overload.
+  */
+ @Deprecated
+ public final int[] fillAndScoreQ(byte[] read, SiteScore ss, int thresh, byte[] baseScores){
+     return fillAndScoreQ(
+             read,
+             ss.chrom, ss.start, ss.stop,
+             thresh, baseScores);
+ }
+
+ /**
+  * Deprecated convenience overload: looks up the chromosome's base array and widens the
+  * window by thresh on both sides before delegating to the array-based overload.
+  */
+ @Deprecated
+ public final int[] fillAndScoreQ(byte[] read, int chrom, int start, int stop, int thresh, byte[] baseScores){
+     return fillAndScoreQ(
+             read,
+             Data.getChromosome(chrom).array,
+             start-thresh, stop+thresh,
+             baseScores);
+ }
+
+// public final int scoreNoIndels(byte[] read, SiteScore ss){
+//
+// ChromosomeArray cha=Data.getChromosome(ss.chrom);
+// final int refStart=ss.start;
+//
+// int score=0;
+// int mode=MODE_START;
+// int timeInMode=0;
+// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match
+//
+// for(int i=0; i<read.length; i++){
+// byte c=read[i];
+// byte r=cha.get(refStart+i);
+//
+// if(c==r){
+// if(mode==MODE_MS){
+// timeInMode++;
+// score+=POINTSoff_MATCH2;
+// }else{
+// timeInMode=0;
+// score+=POINTSoff_MATCH;
+// }
+// mode=MODE_MS;
+// }else if(c<0 || c=='N'){
+// score+=POINTSoff_NOCALL;
+// }else if(r<0 || r=='N'){
+// score+=POINTSoff_NOREF;
+// }else{
+// if(mode==MODE_SUB){timeInMode++;}
+// else{timeInMode=0;}
+//
+// if(timeInMode==0){score+=POINTSoff_SUB;}
+// else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTSoff_SUB2;}
+// else{score+=POINTSoff_SUB3;}
+// }
+// }
+//
+// return score;
+// }
+
+
+ /**
+  * Scores the read against the site's chromosome at ss.start without allowing indels.
+  * The site is forwarded so the array-based overload can update its semiperfect flag.
+  */
+ public final int scoreNoIndels(byte[] read, SiteScore ss){
+     return scoreNoIndels(read, Data.getChromosome(ss.chrom).array, ss.start, ss);
+ }
+
+ /**
+  * Scores the read against chromosome chrom at refStart without allowing indels.
+  * No site is supplied, so no semiperfect flag is updated.
+  */
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){
+     return scoreNoIndels(read, Data.getChromosome(chrom).array, refStart, null);
+ }
+
+ /**
+  * Base-score-aware variant: scores the read against the site's chromosome at ss.start
+  * without allowing indels, forwarding the site to the array-based overload.
+  */
+ public final int scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores){
+     return scoreNoIndels(read, Data.getChromosome(ss.chrom).array, baseScores, ss.start, ss);
+ }
+
+ /**
+  * Base-score-aware variant: scores the read against chromosome chrom at refStart
+  * without allowing indels. No site is supplied to the array-based overload.
+  */
+ public final int scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores){
+     return scoreNoIndels(read, Data.getChromosome(chrom).array, baseScores, refStart, null);
+ }
+
+
+// public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){
+//
+// ChromosomeArray cha=Data.getChromosome(chrom);
+//
+// int score=0;
+// int mode=MODE_START;
+// int timeInMode=0;
+//
+// //This block handles cases where the read runs outside the reference
+// //Of course, padding the reference with 'N' would be better, but...
+// int readStart=0;
+// int readStop=read.length;
+// final int refStop=refStart+read.length;
+// if(refStart<0){
+// readStart=0-refStart;
+// score+=POINTSoff_NOREF*readStart;
+// }
+// if(refStop>cha.maxIndex+1){
+// int dif=(cha.maxIndex+1-refStop);
+// readStop-=dif;
+// score+=POINTSoff_NOREF*dif;
+// }
+//
+//// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //No longer needed.
+//
+// for(int i=readStart; i<readStop; i++){
+// byte c=read[i];
+// byte r=cha.get(refStart+i);
+//
+// if(c==r){
+// if(mode==MODE_MS){
+// timeInMode++;
+// score+=POINTSoff_MATCH2;
+// }else{
+// timeInMode=0;
+// score+=POINTSoff_MATCH;
+// }
+// mode=MODE_MS;
+// }else if(c<0 || c=='N'){
+// score+=POINTSoff_NOCALL;
+// }else if(r<0 || r=='N'){
+// score+=POINTSoff_NOREF;
+// }else{
+// if(mode==MODE_SUB){timeInMode++;}
+// else{timeInMode=0;}
+//
+// if(timeInMode==0){score+=POINTSoff_SUB;}
+// else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTSoff_SUB2;}
+// else{score+=POINTSoff_SUB3;}
+// }
+// }
+//
+// return score;
+// }
+
+
+
+ /** Calculates score based on an array from Index */
+ public final static int calcAffineScore(int[] locArray){
+ int score=0;
+ int lastLoc=-2; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=POINTS_MATCH2;
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=POINTS_MATCH;
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else{//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ /** Calculates score based on an array from Index */
+ public final static int calcAffineScore(int[] locArray, byte[] baseScores){
+ int score=0;
+ int lastLoc=-2; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else{//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+
+ return score;
+ }
+
+
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+ int norefs=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+ assert(Read.CHECKSITE(ss, read, -1));
+
+ return score;
+ }
+
+
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /**
+  * Score of numBases consecutive matches: POINTS_MATCH for the first base and
+  * POINTS_MATCH2 for each of the remaining numBases-1.
+  */
+ public final static int maxQuality(int numBases){
+     final int followers=numBases-1;
+     return POINTS_MATCH+followers*(POINTS_MATCH2);
+ }
+
+ /**
+  * Score of baseScores.length consecutive matches (POINTS_MATCH for the first base,
+  * POINTS_MATCH2 for each later one) plus the sum of all base scores.
+  */
+ public final static int maxQuality(byte[] baseScores){
+     final int followers=baseScores.length-1;
+     return POINTS_MATCH+followers*(POINTS_MATCH2)+Tools.sumInt(baseScores);
+ }
+
+ /**
+  * Returns maxQuality(numBases) plus the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+  * Asserts that this value lies below the perfect score with two contiguous matches
+  * replaced by a fresh match plus a substitution.
+  */
+ public final static int maxImperfectScore(int numBases){
+     final int perfect=maxQuality(numBases);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Returns maxQuality(baseScores) plus the smaller of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+  * Asserts that this value lies below the perfect score with two contiguous matches
+  * replaced by a fresh match plus a substitution.
+  */
+ public final static int maxImperfectScore(byte[] baseScores){
+     final int perfect=maxQuality(baseScores);
+     final int indelDelta=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+     final int imperfect=perfect+indelDelta;
+     assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+     return imperfect;
+ }
+
+ /**
+  * Formats the values as right-aligned decimal columns, 7 characters wide, with no separators.
+  * Each value is rendered with one leading space and then left-padded with spaces to the width.
+  */
+ public final static String toString(int[] a){
+     final int width=7;
+     final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+
+     for(final int value : a){
+         final String text=" "+value;
+         final int pad=width-text.length();
+         assert(pad>=0) : width+", "+text.length()+", "+text+", "+value+", "+pad;
+         for(int k=pad; k>0; k--){out.append(' ');}
+         out.append(text);
+     }
+
+     return out.toString();
+ }
+
+ /**
+  * Formats the TIMEMASK bits of each packed value as right-aligned decimal columns,
+  * 7 characters wide, with no separators.
+  */
+ public final static String toTimePacked(int[] a){
+     final int width=7;
+     final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+
+     for(final int cell : a){
+         final int num=cell&TIMEMASK;
+         final String text=" "+num;
+         final int pad=width-text.length();
+         assert(pad>=0) : width+", "+text.length()+", "+text+", "+num+", "+pad;
+         for(int k=pad; k>0; k--){out.append(' ');}
+         out.append(text);
+     }
+
+     return out.toString();
+ }
+
+ /**
+  * Formats the score part (value shifted right by SCOREOFFSET) of each packed value as
+  * right-aligned decimal columns, 7 characters wide, with no separators.
+  * A score too wide for its column is replaced by a run of 9s, negative-signed when the
+  * score is not positive.
+  */
+ public final static String toScorePacked(int[] a){
+     final int width=7;
+
+     //Saturation markers, padded with 9s to the column width.
+     final StringBuilder lowFill=new StringBuilder(" -");
+     final StringBuilder highFill=new StringBuilder(" ");
+     while(lowFill.length()<width){lowFill.append('9');}
+     while(highFill.length()<width){highFill.append('9');}
+     final String minString=lowFill.toString();
+     final String maxString=highFill.toString();
+
+     final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+     for(final int cell : a){
+         final int num=cell>>SCOREOFFSET;
+         String text=" "+num;
+         if(text.length()>width){
+             text=(num>0 ? maxString : minString);
+         }
+         final int pad=width-text.length();
+         assert(pad>=0) : width+", "+text.length()+", "+text+", "+num+", "+pad;
+         for(int k=pad; k>0; k--){out.append(' ');}
+         out.append(text);
+     }
+
+     return out.toString();
+ }
+
+ /**
+  * Formats the bytes as right-aligned signed decimal columns, 6 characters wide, with no separators.
+  * Each value is rendered with one leading space and then left-padded with spaces to the width.
+  */
+ public final static String toString(byte[] a){
+     final int width=6;
+     final StringBuilder out=new StringBuilder((a.length+1)*width+2);
+
+     for(final byte value : a){
+         final String text=" "+value;
+         final int pad=width-text.length();
+         assert(pad>=0);
+         for(int k=pad; k>0; k--){out.append(' ');}
+         out.append(text);
+     }
+
+     return out.toString();
+ }
+
+ /**
+  * Renders ref[startLoc..stopLoc] (inclusive) as text, one char per byte.
+  * An inverted range (stopLoc below startLoc) yields an empty string instead of failing on a
+  * negative builder capacity; this matters because the method is used to build assertion messages.
+  * @param ref Bases to render
+  * @param startLoc First index, inclusive
+  * @param stopLoc Last index, inclusive
+  * @return The selected bases as a String
+  */
+ public final static String toString(byte[] ref, int startLoc, int stopLoc){
+     //Clamp the capacity hint only; the copied range itself is unchanged.
+     final StringBuilder sb=new StringBuilder(Math.max(0, stopLoc-startLoc+1));
+     for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
+     return sb.toString();
+ }
+
+ /**
+  * Penalty for a deletion of the given length, built from POINTS_DEL plus tiered
+  * per-base costs: POINTS_DEL2 up to LIMIT_FOR_COST_3, POINTS_DEL3 up to LIMIT_FOR_COST_4,
+  * POINTS_DEL4 up to LIMIT_FOR_COST_5, and POINTS_DEL5 per TIMESLIP-sized step beyond that.
+  * @param len Deletion length
+  * @return 0 when len is not positive, otherwise the summed penalty
+  */
+ public static int calcDelScore(int len){
+     if(len<1){return 0;}
+
+     int remaining=len;
+     int penalty=POINTS_DEL;
+
+     //Peel off the tiers from the longest down, capping the remainder at each limit.
+     if(remaining>LIMIT_FOR_COST_5){
+         final int excess=remaining-LIMIT_FOR_COST_5;
+         penalty+=((excess+MASK5)/TIMESLIP)*POINTS_DEL5;
+         remaining=LIMIT_FOR_COST_5;
+     }
+     if(remaining>LIMIT_FOR_COST_4){
+         penalty+=(remaining-LIMIT_FOR_COST_4)*POINTS_DEL4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         penalty+=(remaining-LIMIT_FOR_COST_3)*POINTS_DEL3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         penalty+=(remaining-1)*POINTS_DEL2;
+     }
+     return penalty;
+ }
+
+ private static int calcDelScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL;
+
+ if(len>LIMIT_FOR_COST_5){
+ score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+ len=LIMIT_FOR_COST_5;
+ }
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ /**
+  * Penalty for an insertion of the given length, built from POINTS_INS plus tiered
+  * per-base costs: POINTS_INS2 up to LIMIT_FOR_COST_3, POINTS_INS3 up to LIMIT_FOR_COST_4,
+  * and POINTS_INS4 beyond that.
+  * @param len Insertion length
+  * @return 0 when len is not positive, otherwise the summed penalty
+  */
+ public static int calcInsScore(int len){
+     if(len<1){return 0;}
+
+     int remaining=len;
+     int penalty=POINTS_INS;
+
+     //Peel off the tiers from the longest down, capping the remainder at each limit.
+     if(remaining>LIMIT_FOR_COST_4){
+         penalty+=(remaining-LIMIT_FOR_COST_4)*POINTS_INS4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         penalty+=(remaining-LIMIT_FOR_COST_3)*POINTS_INS3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         penalty+=(remaining-1)*POINTS_INS2;
+     }
+     return penalty;
+ }
+
+ /**
+  * Same tiered insertion penalty as calcInsScore, but summed from the POINTSoff_INS* constants.
+  * @param len Insertion length
+  * @return 0 when len is not positive, otherwise the summed penalty
+  */
+ private static int calcInsScoreOffset(int len){
+     if(len<1){return 0;}
+
+     int remaining=len;
+     int penalty=POINTSoff_INS;
+
+     //Peel off the tiers from the longest down, capping the remainder at each limit.
+     if(remaining>LIMIT_FOR_COST_4){
+         penalty+=(remaining-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+         remaining=LIMIT_FOR_COST_4;
+     }
+     if(remaining>LIMIT_FOR_COST_3){
+         penalty+=(remaining-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+         remaining=LIMIT_FOR_COST_3;
+     }
+     if(remaining>1){
+         penalty+=(remaining-1)*POINTSoff_INS2;
+     }
+     return penalty;
+ }
+
+
+ public final int maxRows; //presumably the row capacity of the matrix - TODO confirm against the constructor
+ public final int maxColumns; //Widest window (b-a+1) accepted by fillAndScoreLimited/fillAndScoreQ before they narrow the range
+
+ private final int[][][] packed; //Indexed [state][row][col]; each cell holds a score (SCOREMASK bits) and a time (TIMEMASK bits)
+ private final byte[] grefbuffer; //Preallocated buffer that makeGref fills with the gap-compressed reference
+ private int greflimit=-1; //Number of grefbuffer cells written by the last makeGref call, excluding the trailing cushion
+ private int greflimit2=-1; //Index of the last cushion cell written by makeGref; loop bound for the coordinate translators
+ private int grefRefOrigin=-1; //Reference coordinate that corresponds to grefbuffer[0]
+
+ public static final int GAPBUFFER=Shared.GAPBUFFER; //Real bases copied next to a compressed gap: after the preceding stretch (plus gap%GAPLEN) and before the following one
+ public static final int GAPBUFFER2=Shared.GAPBUFFER2; //Subtracted from a gap before dividing it into GAPLEN chunks; presumably 2*GAPBUFFER - confirm in Shared
+ public static final int GAPLEN=Shared.GAPLEN; //Reference bases represented by one GAPC symbol
+ public static final int MINGAP=Shared.MINGAP; //makeGref asserts every gap is at least this long
+ public static final int GAPCOST=Shared.GAPCOST*2; //Negated to form POINTS_GAP
+ public static final byte GAPC=Shared.GAPC; //Symbol written into the gapped reference for each compressed chunk
+
+ private static final int GREFLIMIT2_CUSHION=128; //Tools.max(GAPBUFFER2, GAPLEN);
+
+
+ /**
+  * Exposes the internal gapped-reference buffer itself, not a copy.
+  * DO NOT MODIFY the returned array.
+  */
+ public final byte[] getGrefbuffer(){
+     return this.grefbuffer;
+ }
+
+ public final int[] vertLimit; //Read by showVertLimit for indices 0..rows; values are shifted right by SCOREOFFSET for display
+ public final int[] horizLimit; //Read by showHorizLimit for indices 0..columns; values are shifted right by SCOREOFFSET for display
+
+ CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+ CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+// public static final int MODEBITS=2;
+ public static final int TIMEBITS=12; //Low-order bits of each packed cell that hold the time counter
+ public static final int SCOREBITS=32-TIMEBITS; //Remaining high-order bits of the 32-bit cell, which hold the score
+ public static final int MAX_TIME=((1<<TIMEBITS)-1); //Largest value that fits in TIMEBITS bits
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; //2000 below the largest positive value of a signed SCOREBITS-bit field
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+// public static final int MODEOFFSET=0; //Always zero.
+// public static final int TIMEOFFSET=0;
+ public static final int SCOREOFFSET=TIMEBITS; //Shift that moves a score between plain and packed position
+
+// public static final int MODEMASK=~((-1)<<MODEBITS);
+// public static final int TIMEMASK=(~((-1)<<TIMEBITS))<<TIMEOFFSET;
+ public static final int TIMEMASK=~((-1)<<TIMEBITS); //Selects the low TIMEBITS bits of a packed cell
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; //Selects the high SCOREBITS bits of a packed cell; the score stays shifted
+
+ private static final byte MODE_MS=0; //First-dimension index into packed; also the "last base matched" tag in scoreNoIndels
+ private static final byte MODE_DEL=1; //First-dimension index into packed
+ private static final byte MODE_INS=2; //First-dimension index into packed
+ private static final byte MODE_SUB=3; //"Last base was a substitution" tag in the scoreNoIndels methods
+
+// public final static int POINTS_NOREF=-1;
+// public final static int POINTS_MATCH=10;
+// public final static int POINTS_SUB=-15;
+// public final static int POINTS_SUB2=-10;
+// public final static int POINTS_SUB3=-2;
+// public final static int POINTS_INS=-30;
+// public final static int POINTS_INS2=-3;
+// public final static int POINTS_INS3=-1;
+// public final static int POINTS_DEL=-30;
+// public final static int POINTS_DEL2=-2;
+// public final static int POINTS_DEL3=-1;
+
+// public final static int POINTS_NOREF=-5;
+// public final static int POINTS_MATCH=10;
+// public final static int POINTS_SUB=-13;
+// public final static int POINTS_SUB2=-7;
+// public final static int POINTS_SUB3=-3;
+// public final static int POINTS_INS=-21;
+// public final static int POINTS_INS2=-2;
+// public final static int POINTS_INS3=-1;
+// public final static int POINTS_DEL=-20;
+// public final static int POINTS_DEL2=-2;
+// public final static int POINTS_DEL3=-1;
+
+ public static final int POINTS_NOREF=-8;
+ public static final int POINTS_NOCALL=-8;
+ public static final int POINTS_MATCH=90;
+ public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-141;
+ public static final int POINTS_SUBR=-159; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-49;
+ public static final int POINTS_SUB3=-27;
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-204;
+ public static final int POINTS_INS2=-42;
+ public static final int POINTS_INS3=-25;
+ public static final int POINTS_INS4=-8;
+ public static final int POINTS_DEL=-287;
+ public static final int POINTS_DEL2=-39;
+ public static final int POINTS_DEL3=-21;
+ public static final int POINTS_DEL4=-12;
+ public static final int POINTS_DEL5=-8;
+ public static final int POINTS_DEL_REF_N=-10;
+ public static final int POINTS_GAP=0-GAPCOST;
+
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1;
+ static{assert(Integer.bitCount(TIMESLIP)==1);}
+
+ //TODO: Consider removing these barriers entirely for PacBio reads. Would make code faster, too.
+ private static final int BARRIER_I1=0;
+ private static final int BARRIER_D1=0;
+
+
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=25;
+ public static final int LIMIT_FOR_COST_5=80;
+
+ public static final int BAD=MIN_SCORE-1;
+
+
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+ private int rows;
+ private int columns;
+
+ public long iterationsLimited=0;
+ public long iterationsUnlimited=0;
+
+ public boolean verbose=false;
+ public boolean verbose2=false;
+
+}
diff --git a/current/align2/MultiStateAligner9XFlat.java b/current/align2/MultiStateAligner9XFlat.java
new file mode 100755
index 0000000..98311af
--- /dev/null
+++ b/current/align2/MultiStateAligner9XFlat.java
@@ -0,0 +1,2423 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+
+/**
+ * Based on MSA9ts, with transform scores tweaked for PacBio. */
+public final class MultiStateAligner9XFlat extends MSA{
+
+
+	/** Command-line check: aligns args[0] (the read) against args[1] (the reference) and prints the
+	 * matrices before and after the fill, the traceback and the score.
+	 * @param args read sequence, then reference sequence
+	 * @throws IllegalArgumentException if fewer than two arguments are supplied */
+	public static void main(String[] args){
+		if(args==null || args.length<2){
+			throw new IllegalArgumentException("Usage: MultiStateAligner9XFlat <read> <ref>");
+		}
+		final byte[] read=args[0].getBytes();
+		final byte[] ref=args[1].getBytes();
+
+		MultiStateAligner9XFlat msa=new MultiStateAligner9XFlat(read.length, ref.length);
+
+		System.out.println("Initial: ");
+		printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+
+		//minScore=0 and gaps=null, so fillLimitedX hands this straight to the unlimited fill
+		int[] max=msa.fillLimited(read, ref, 0, ref.length-1, 0, null);
+
+		System.out.println("Max: "+Arrays.toString(max));
+
+		System.out.println("Final: ");
+		printMatrix(msa.packed, read.length, ref.length, TIMEMASK, SCOREOFFSET);
+
+		byte[] out=msa.traceback(read, ref, 0, ref.length-1, max[0], max[1], max[2], false);
+		int[] score=msa.score(read, ref, 0, ref.length-1, max[0], max[1], max[2], false);
+
+		System.out.println(new String(ref));
+		System.out.println(new String(read));
+		System.out.println(new String(out));
+		System.out.println("Score: "+Arrays.toString(score));
+	}
+
+
+ public MultiStateAligner9XFlat(int maxRows_, int maxColumns_){
+ super(maxRows_, maxColumns_);
+
+ {
+ int[][][] packed0=null;
+ byte[] grefbuffer0=null;
+ int[] vertLimit0=null;
+ int[] horizLimit0=null;
+
+ try {
+ packed0=new int[3][maxRows+1][maxColumns+1];
+ grefbuffer0=new byte[maxColumns+2];
+ vertLimit0=new int[maxRows+1];
+ horizLimit0=new int[maxColumns+1];
+ } catch (OutOfMemoryError e) {
+ packed0=null;
+ grefbuffer0=null;
+ vertLimit0=null;
+ horizLimit0=null;
+ throw new RuntimeException(e.toString());
+ }
+
+ packed=packed0;
+ grefbuffer=grefbuffer0;
+ vertLimit=vertLimit0;
+ horizLimit=horizLimit0;
+ }
+
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+// for(int i=0; i<maxColumns+1; i++){
+// scores[0][i]=0-i;
+// }
+
+ for(int matrix=0; matrix<packed.length; matrix++){
+ for(int i=1; i<=maxRows; i++){
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff;
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ for(int i=0; i<=maxRows; i++){
+
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+ int score=(i<2 ? (i*POINTSoff_INS) :
+ (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 : prevScore+POINTSoff_INS3));
+
+ packed[matrix][i][0]=score;
+ }
+// for(int i=1; i<maxColumns+1; i++){
+// prevState[matrix][0][i]=MODE_DEL;
+// }
+// for(int i=0; i<=maxColumns; i++){
+// packed[matrix][0][i]|=MODE_MS;
+// }
+ }
+ }
+
+	/** Fills the matrices without filling areas that cannot match minScore. When gaps is non-null the
+	 * reference is first replaced by the output of makeGref and the fill covers [0, greflimit] of that buffer.
+	 * @return new int[] {rows, maxC, maxS, max}; fillLimitedX may return null when minScore is not reached */
+	@Override
+	public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+		if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);}
+		else{
+			byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc, read);
+
+			//Checked before the verbose dump, which would otherwise fail first with an uninformative NullPointerException
+			assert(gref!=null) : "Excessively long read:\n"+new String(read);
+
+			if(verbose && greflimit>0 && greflimit<500){
+				System.err.println(new String(gref, 0, greflimit));
+			}
+
+			return fillLimitedX(read, gref, 0, greflimit, minScore);
+		}
+	}
+
+
+ /** Fills the three packed matrices for read vs ref[refStartLoc..refEndLoc], storing subfloor in cells that cannot match minScore; may delegate to fillUnlimited.
+ * @return new int[] {rows, maxCol, maxState, maxScore}, or null when the best last-row score is below minScore (after MIN_SCORE_ADJUST) */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+ if(verbose){System.err.println("fillLimitedX");}
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 : //0 means no banding (see colStart/colStop below)
+ Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2;
+
+ if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){ //In these cases use the unpruned fill instead
+// assert(false) : minScore;
+// assert(minScore>0) : minScore;
+// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length);
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ if(verbose){
+ System.err.println("Clearing matrix due to verbose mode.");
+ for(int x=0; x<packed.length; x++){
+ for(int y=1; y<rows+1; y++){
+ Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+ }
+ }
+ }
+
+ for(int x=0; x<packed.length; x++){
+
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff); //Reset the last row, which is the row scanned for the maximum at the end
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //One MATCH plus MATCH2 for every other read base
+ final int floor=minScore_off-maxGain; //Lower clamp for vertLimit, horizLimit and limit3
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2; //Value stored in cells that are pruned or not allowed
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+
+ vertLimit[rows]=minScore_off; //Built backwards from the last row, clamped at floor
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+
+ horizLimit[columns]=minScore_off; //Same backwards construction, over the reference window
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+// vertLimit[rows]=minScore_off;
+// for(int i=rows-1; i>=0; i--){
+// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+//
+// horizLimit[columns]=minScore_off;
+// for(int i=columns-1; i>=0; i--){
+// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+
+ for(int row=1; row<=rows; row++){
+
+ final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband)); //First good column of the previous row, narrowed by the band if banding is on
+ final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1)); //Last good column of the previous row, narrowed by the band if banding is on
+
+ minGoodCol=-1; //Both are recomputed while this row is filled
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;} //Empty column window, e.g. when no cell of the previous row reached its limit
+
+
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor; //Left neighbour of the first computed cell; it is read below as [row][col-1]
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){ //May run past colStop; the checks at the bottom of the loop decide when to stop
+
+
+ if(verbose2){
+ System.err.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]); //Stricter of the row limit and the column limit
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB)); //Threshold applied to the three diagonal predecessors
+
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){ //Gap symbol, or no diagonal predecessor exceeds limit3
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : POINTSoff_SUB);
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){ //Cell survives: widen the good-column window that the next row starts from
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //Score in the SCOREMASK bits, streak counter in the TIMEMASK bits
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 : POINTSoff_DEL3);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+
+ if(col>=colStop){
+ if(col>colStop && (maxGoodCol<col || halfband>0)){break;} //Past the window: stop unless unbanded and this column itself was good
+ if(row>1){
+ packed[MODE_MS][row-1][col+1]=subfloor; //NOTE(review): col+1 can equal columns+1 here; confirm callers keep columns<maxColumns
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //Best score in the last row, over all three states
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+
+ if(verbose){
+ System.out.println("Filled matrix.");
+ printMatrix(packed, rows, columns, TIMEMASK, SCOREOFFSET);
+ }
+ if(verbose){System.err.println("maxscore="+(maxScore>>SCOREOFFSET)+", minscore="+(minScore_off>>SCOREOFFSET));}
+
+ if(maxScore<minScore_off){
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET; //Back to plain points for the caller
+
+ if(verbose){
+ System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ }
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Override
+ /** return new int[] {rows, maxC, maxS, max};
+ * Will not fill areas that cannot match minScore */
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc, read);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit);
+ }
+ }
+
+
+ /** Fills every cell of the three packed matrices for read vs ref[refStartLoc..refEndLoc]; does not require a min score (ie, same as old method).
+ * @return new int[] {rows, maxCol, maxState, maxScore}, where maxScore is the best last-row score shifted right by SCOREOFFSET */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //One MATCH plus MATCH2 for every other read base
+ final int subfloor=0-2*maxGain; //Value stored for states that are not allowed at a cell
+ assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n"
+ +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more.
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++;
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]); //'?' and '!' are placeholders that can never equal each other
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+ if(gap){ //No match/sub state is scored against a gap symbol
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time); //Score in the SCOREMASK bits, streak counter in the TIMEMASK bits
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : POINTSoff_SUB);
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ if(row<BARRIER_D1 || row>BARRIER_D2){
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 : POINTSoff_DEL3);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //Calculate INS score
+// if(gap || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){ //Best score in the last row, over all three states
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Back to plain points for the caller
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /**
+ * Quality-weighted variant of the matrix fill: populates packed[MODE_MS], packed[MODE_DEL] and
+ * packed[MODE_INS] for every cell (row 1..rows, col 1..columns), where row indexes the read and
+ * col indexes ref starting at refStartLoc. Each cell stores (score|time), with time being the
+ * streak counter read back through TIMEMASK.
+ * On matching cells only, baseScores[row-1] (shifted left by SCOREOFFSET) is added to the score.
+ * Deprecated: the first statement is assert(false), so with assertions enabled this method always fails.
+ * Side effects: overwrites the fields rows and columns, and the packed matrices.
+ * @param read Read bases; read.length becomes rows
+ * @param ref Reference bases
+ * @param baseScores Per-read-position score modifier, indexed like read
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @return new int[] {rows, maxCol, maxState, maxScore}: the best cell in the last row over all states,
+ * with maxScore already shifted right by SCOREOFFSET
+ */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult.";
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+
+ //Row 0 and column 0 are read as predecessors but never written here;
+ //presumably they are initialized elsewhere - TODO confirm.
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+
+ //match: read base equals ref base at this cell.
+ //prevMatch: same test for the diagonal predecessor; forced false on the first row or column.
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ {//Calculate match and sub scores
+
+ //All three candidates come from the diagonal predecessor cell (row-1, col-1).
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ //A match that continues a match streak earns POINTSoff_MATCH2 instead of POINTSoff_MATCH.
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ //Ties are broken in favor of MS, then DEL, then INS.
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ //Quality modifier is applied only on this (matching) path.
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+
+ //Presumably keeps the streak counter within the bits covered by TIMEMASK - TODO confirm MASK5 semantics.
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ //A substitution right after a match streak of at most 1 uses POINTSoff_SUBR; otherwise POINTSoff_SUB.
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : POINTSoff_SUB);
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ //prevState is assigned below but never stored; the packing that used it is commented out.
+ byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ //Deletion candidates come from the cell to the left (row, col-1).
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ //Opening from MS costs POINTSoff_DEL; extending uses DEL, DEL2 or DEL3 depending on the streak length.
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 : POINTSoff_DEL3);
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ //Insertion candidates come from the cell above (row-1, col).
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+
+ //Find the best-scoring cell in the last row across all states.
+ //The strict '>' keeps the first maximum found (lowest state, then lowest column).
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ //Strip the low bits so the caller gets the score in unshifted units.
+ maxScore>>=SCOREOFFSET;
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ /**
+  * Generates the match string for the alignment that ends at (row, col, state).
+  * For gapped alignments the walk is done against grefbuffer, with the reference window
+  * first translated into gapped coordinates; otherwise ref is used directly.
+  * @return the match string produced by traceback2
+  */
+ @Override
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+     if(!gapped){
+         return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state);
+     }
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+     return traceback2(read, gappedRef, gappedStart, gappedStop, row, col, state);
+ }
+
+ /**
+  * Generates the match string by walking the packed matrices backward from (row, col, state)
+  * until the first row or first column is reached.
+  * Symbols emitted: 'm' match, 'S' substitution, 'N' when either base is not fully defined,
+  * 'D' deletion, 'I' insertion ('Y' when the insertion is at or past the last column),
+  * 'X' for read bases left over once the walk reaches column 0.
+  * A GAPC reference symbol is emitted as '-' during the walk and expanded into GAPLEN 'D' symbols at the end.
+  * The previous state is not stored in the matrix: a time field above 1 means the walk stays in the
+  * same state, otherwise the predecessor is recomputed by comparing the neighboring scores with the
+  * same tie-breaking used when filling.
+  * @param read Read bases
+  * @param ref Reference bases (possibly the gapped reference)
+  * @param refStartLoc Reference index corresponding to column 1
+  * @param refEndLoc Last reference index of the window; only used in an assertion
+  * @param row Starting row; asserted to equal rows
+  * @param col Starting column
+  * @param state Starting state (MODE_MS, MODE_DEL or MODE_INS)
+  * @return the match string in read order
+  */
+ @Override
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+     assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+     assert(row==rows);
+
+     //Scratch buffer, filled back-to-front relative to the final string.
+     //Worst case is row+col symbols: a walk with no diagonal step that ends at col==0 with rows
+     //remaining emits (row-remaining)+col symbols, and the clipping loop below adds one 'X' per
+     //remaining row. The former size of row+col-1 overflowed in that case (e.g. row=1, col=1, state=DEL).
+     final byte[] out=new byte[row+col];
+     int outPos=0;
+
+     int gaps=0;
+
+     while(row>0 && col>0){
+
+         final int time=packed[state][row][col]&TIMEMASK;
+         final byte prev;
+
+         if(state==MODE_MS){
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+                 else{prev=MODE_INS;}
+             }
+
+             final byte c=read[row-1];
+             final byte r=ref[refStartLoc+col-1];
+             if(c==r){
+                 out[outPos]='m';
+             }else if(!AminoAcid.isFullyDefined(c) || !AminoAcid.isFullyDefined(r)){
+                 out[outPos]='N';
+             }else{
+                 out[outPos]='S';
+             }
+
+             row--;
+             col--;
+         }else if(state==MODE_DEL){
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+                 final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+                 else{prev=MODE_DEL;}
+             }
+
+             final byte r=ref[refStartLoc+col-1];
+             if(r==GAPC){
+                 //Compressed gap symbol; counted so it can be expanded after the walk.
+                 out[outPos]='-';
+                 gaps++;
+             }else{
+                 out[outPos]='D';
+             }
+             col--;
+         }else{
+             if(time>1){prev=(byte)state;}
+             else{
+                 final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+                 final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+                 if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+                 else{prev=MODE_INS;}
+             }
+
+             assert(state==MODE_INS) : state;
+             if(col==0){
+                 out[outPos]='X';
+             }else if(col>=columns){
+                 out[outPos]='Y';
+             }else{
+                 out[outPos]='I';
+             }
+             row--;
+         }
+
+         state=prev;
+         outPos++;
+     }
+
+     assert(row==0 || col==0);
+     if(col!=row){
+         //Read bases that hang off the left edge of the reference window.
+         while(row>0){
+             out[outPos]='X';
+             outPos++;
+             row--;
+             col--;
+         }
+     }
+
+     //Shrink and reverse the string
+     final byte[] out2=new byte[outPos];
+     for(int i=0; i<outPos; i++){
+         out2[i]=out[outPos-i-1];
+     }
+
+     if(gaps==0){return out2;}
+
+     //Expand each compressed gap symbol into GAPLEN deletions.
+     //TODO Consider outputting this compressed.
+     final byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)];
+     for(int i=0, j=0; i<out2.length; i++){
+         final byte c=out2[i];
+         if(c!=GAPC){
+             out3[j]=c;
+             j++;
+         }else{
+             final int lim=j+GAPLEN;
+             for(; j<lim; j++){
+                 out3[j]='D';
+             }
+         }
+     }
+     return out3;
+ }
+
+ /**
+  * Scores the alignment ending at (maxRow, maxCol, maxState).
+  * Ungapped requests go straight to score2. Gapped requests are scored against grefbuffer:
+  * the reference window is translated into gapped coordinates, and the start/stop positions
+  * in the result are translated back before returning.
+  * @return {score, bestRefStart, bestRefStop, ...} as produced by score2
+  */
+ @Override
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+         final int maxRow, final int maxCol, final int maxState, boolean gapped){
+     if(!gapped){
+         return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState);
+     }
+
+     if(verbose){
+         System.err.println("score():");
+         System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+     }
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+
+     //Round-trip sanity checks. TODO: Remove slow assertions
+     assert(translateFromGappedCoordinate(gappedStart, gappedRef)==refStartLoc);
+     assert(translateFromGappedCoordinate(gappedStop, gappedRef)==refEndLoc);
+
+     assert(gappedStart==0) : gappedStart; //TODO: skip translation if this is always zero
+
+     if(verbose){System.err.println("gstart, gstop: "+gappedStart+", "+gappedStop);}
+     final int[] result=score2(read, gappedRef, gappedStart, gappedStop, maxRow, maxCol, maxState);
+     if(verbose){System.err.println("got score "+Arrays.toString(result));}
+
+     assert(result[1]==translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef)) :
+         "Verifying: "+result[1]+" -> "+translateFromGappedCoordinate(result[1], gappedRef)+" -> "+
+         translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef);
+     assert(result[2]==translateToGappedCoordinate(translateFromGappedCoordinate(result[2], gappedRef), gappedRef));
+
+     //Convert the start and stop back into ungapped reference coordinates.
+     result[1]=translateFromGappedCoordinate(result[1], gappedRef);
+     result[2]=translateFromGappedCoordinate(result[2], gappedRef);
+     if(verbose){System.err.println("returning score "+Arrays.toString(result));}
+     return result;
+ }
+
+ @Override
+ /**
+ * Walks the packed matrices backward from (maxRow, maxCol, maxState) to find where the
+ * alignment starts on the reference. No match string is built; only the start, stop and score are reported.
+ * The score is read from the starting cell (plus POINTSoff_NOREF penalties if the start had to be
+ * extended, see below) and shifted right by SCOREOFFSET before being returned.
+ * @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}, <br>
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight} <br>
+ * if more padding is needed */
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+ final int maxRow, final int maxCol, final int maxState){
+
+ int row=maxRow;
+ int col=maxCol;
+ int state=maxState;
+
+ assert(maxState>=0 && maxState<packed.length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxRow>=0 && maxRow<packed[0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+ assert(maxCol>=0 && maxCol<packed[0][0].length) :
+ maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+ int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+
+ //If the starting cell is above the last row, advance row and col together by the rows still missing.
+ //Each missing row beyond the columns still available is charged POINTSoff_NOREF and not advanced.
+ //NOTE(review): when difR>difC initially, row ends below rows and assert(row==rows) further down would fail - confirm intended.
+ if(row<rows){
+ int difR=rows-row;
+ int difC=columns-col;
+
+ while(difR>difC){
+ score+=POINTSoff_NOREF;
+ difR--;
+ }
+
+ row+=difR;
+ col+=difR;
+
+ }
+
+ assert(refStartLoc<=refEndLoc);
+ assert(row==rows);
+
+
+ //Column 1 corresponds to refStartLoc, hence the -1.
+ final int bestRefStop=refStartLoc+col-1;
+
+ if(verbose){System.err.println("Scoring.");}
+
+ //Number of consecutive steps in which the walk stayed in the same state; reset on every state change.
+ int stateTime=0;
+
+ while(row>0 && col>0){
+
+ if(verbose){System.err.println("state="+state+", row="+row+", col="+col);}
+
+ final int time=packed[state][row][col]&TIMEMASK;
+ final byte prev;
+
+ //The previous state is not stored. A time field above 1 means the walk stays in the same state;
+ //otherwise it is recomputed by comparing the scores of the predecessor cells.
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+ //Match/sub consumes one read base and one reference base.
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+ //Deletion consumes a reference base only.
+ col--;
+ }else{
+ assert(state==MODE_INS);
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+ //Insertion consumes a read base only.
+ row--;
+ }
+
+ //NOTE(review): the loop requires col>0 on entry and col is decremented at most once per pass,
+ //so col<0 looks unreachable here - confirm before relying on this guard.
+ if(col<0){
+ if(verbose){
+ System.err.println("Warning, column went below 0 at row="+row);
+ }
+ break; //prevents an out of bounds access
+ }
+
+// assert(prev==prev0);
+ if(state==prev){stateTime++;}else{stateTime=0;}
+ state=prev;
+
+ if(verbose){System.err.println("state2="+state+", time="+time+", stateTime="+stateTime+", row2="+row+", col2="+col+"\n");}
+ }
+// assert(false) : row+", "+col;
+ //Read bases left over after reaching column 0 shift the start to the left of the window;
+ //col becomes negative so that bestRefStart falls before refStartLoc.
+ if(row>col){
+ col-=row;
+ }
+
+ final int bestRefStart=refStartLoc+col;
+
+ score>>=SCOREOFFSET;
+
+ if(verbose){
+ System.err.println("bestRefStart="+bestRefStart+", refStartLoc="+refStartLoc);
+ System.err.println("bestRefStop="+bestRefStop+", refEndLoc="+refEndLoc);
+ }
+
+ //Suggest padding when the alignment runs off either end of the window,
+ //or touches an end while in the insertion state.
+ int padLeft=0;
+ int padRight=0;
+ if(bestRefStart<refStartLoc){
+ padLeft=Tools.max(0, refStartLoc-bestRefStart);
+ }else if(bestRefStart==refStartLoc && state==MODE_INS){
+ padLeft=stateTime;
+ }
+ if(bestRefStop>refEndLoc){
+ padRight=Tools.max(0, bestRefStop-refEndLoc);
+ }else if(bestRefStop==refEndLoc && maxState==MODE_INS){
+ //Uses the time field of the starting cell as the right-pad suggestion.
+ padRight=packed[maxState][maxRow][maxCol]&TIMEMASK;
+ }
+
+ int[] rvec;
+ if(padLeft>0 || padRight>0){ //Suggest extra padding in cases of overflow
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight};
+ }else{
+ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState};
+ }
+ return rvec;
+ }
+
+ /**
+ * Fills grefbuffer with a compressed ("gapped") copy of the reference.
+ * gaps is read as consecutive (start, stop) pairs of reference segments that are copied verbatim.
+ * The stretch between two segments is written as GAPBUFFER+(gap%GAPLEN) literal bases, followed by
+ * (gap-GAPBUFFER2)/GAPLEN GAPC symbols, followed by GAPBUFFER literal bases.
+ * Side effects: sets grefRefOrigin, greflimit and greflimit2. gaps[0] and gaps[gaps.length-1] are
+ * temporarily widened to cover refStartLoc..refEndLoc and restored before the normal return.
+ * @param ref Ungapped reference bases
+ * @param gaps Inclusive (start, stop) pairs of reference segments; must be non-null and non-empty
+ * @param refStartLoc First reference index that must be covered
+ * @param refEndLoc Last reference index that must be covered
+ * @param read Read bases; only used in an assertion message
+ * @return gref (the shared grefbuffer), or null from the overflow check near the end
+ */
+ private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc, byte[] read){
+ assert(gaps!=null && gaps.length>0);
+
+ assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps);
+ assert(refEndLoc>=gaps[gaps.length-1]);
+
+ //Remember the caller's endpoints so they can be restored at the end.
+ final int g0_old=gaps[0];
+ final int gN_old=gaps[gaps.length-1];
+ gaps[0]=Tools.min(gaps[0], refStartLoc);
+ gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+ grefRefOrigin=gaps[0];
+
+ if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+// grefRefOrigin=Tools.min(gaps[0], refStartLoc);
+
+// //This block is no longer needed since the array is preallocated.
+// int len=0;
+// final int gb2=GAPBUFFER*2;
+// for(int i=0; i<gaps.length; i+=2){
+// int x=gaps[i];
+// int y=gaps[i+1];
+// len+=(y-x+1);
+// if(i+2<gaps.length){
+// int z=gaps[i+2];
+// assert(z>y);
+// int gap=z-y-1;
+// if(gap<MINGAP){
+// len+=gap;
+// }else{
+// len+=gb2;
+// gap-=gb2;
+// int div=gap/GAPLEN;
+// int rem=gap%GAPLEN;
+// len+=(div+rem);
+// }
+// }
+// }
+ byte[] gref=grefbuffer;
+
+ //gpos is the write position in gref; x..y is the current segment and z the start of the next one.
+ int gpos=0;
+ for(int i=0; i<gaps.length; i+=2){
+ int x=gaps[i];
+ int y=gaps[i+1];
+
+ //Copy the segment verbatim.
+ for(int r=x; r<=y; r++, gpos++){
+ //TODO: if out of bounds, use an 'N'
+ assert(gpos<gref.length) :
+ "\ngpos="+gpos+", gref.length="+gref.length+", read.length="+read.length+", gaps2="+Arrays.toString(gaps)+
+ "\ni="+i+", r="+r+", x="+x+", y="+y+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\nGapTools.calcGrefLen("+gaps[0]+", "+gaps[gaps.length-1]+", gaps)="+GapTools.calcGrefLen(gaps[0], gaps[gaps.length-1], gaps)+
+ "\n"+refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref)+"\n"+new String(read)+"\n";
+ gref[gpos]=ref[r];
+ }
+
+ if(i+2<gaps.length){
+ int z=gaps[i+2];
+ assert(z>y);
+ int gap=z-y-1;
+ assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+ if(gap<MINGAP){
+ assert(false) : "TODO - just fill in normally";
+ }else{
+ //rem bases that do not fill a whole GAPLEN unit are kept as literal bases on the left side.
+ int rem=gap%GAPLEN;
+ int lim=y+GAPBUFFER+rem;
+
+ int div=(gap-GAPBUFFER2)/GAPLEN;
+ if(verbose){
+ System.err.println("div = "+div);
+ }
+ assert(div>0);
+
+ //Left buffer of literal bases, then the gap symbols, then the right buffer of literal bases.
+ for(int r=y+1; r<=lim; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ for(int g=0; g<div; g++, gpos++){
+ gref[gpos]=GAPC;
+ }
+ for(int r=z-GAPBUFFER; r<z; r++, gpos++){
+ gref[gpos]=ref[r];
+ }
+ }
+ }
+ }
+
+ greflimit=gpos;
+
+ assert(gref[gpos-1]==ref[refEndLoc]);
+// assert(greflimit+GREFLIMIT2_CUSHION<=gref.length) : refStartLoc+", "+refEndLoc+", "+greflimit+", "+GREFLIMIT2_CUSHION+"\n"+new String(gref);
+ //Add a cushion to the end to clear out the prior data (especially GAPC) that was there
+ {
+ final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+ //NOTE(review): lim is already capped at gref.length by the line above, so this check looks
+ //unreachable; if it did fire, gaps[0] and gaps[gaps.length-1] would not be restored - confirm.
+ if(lim>gref.length){
+ System.err.println("gref buffer overflow: "+lim+" > "+gref.length);
+ return null;
+ }
+ //Cushion is filled with the reference bases following refEndLoc, or 'N' past the end of ref.
+ //NOTE(review): greflimit2 ends up as the last index written (not one past it), and keeps its
+ //previous value if this loop does not run - confirm against the translate methods.
+ for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){
+ gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+ greflimit2=i;
+ }
+ }
+
+ if(verbose){
+ System.err.println("gref:\n"+new String(gref));
+ }
+
+ //Restore the caller's original endpoints.
+ gaps[0]=g0_old;
+ gaps[gaps.length-1]=gN_old;
+
+ if(verbose){
+ System.err.println("\ngaps3: "+Arrays.toString(gaps));
+ }
+
+ return gref;
+ }
+
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score){
+//// {score, bestRefStart, bestRefStop}
+// int a=score[1];
+// int b=score[2];
+// int a2=-9999;
+// int b2=-9999;
+// for(int i=0, j=grefRefOrigin; i<grefbuffer.length; i++){
+// byte c=grefbuffer[i];
+//
+// if(i==a){a2=j;}
+// if(i==b){
+// b2=j;
+// assert(a2!=-9999);
+// score[1]=a2;
+// score[2]=b2;
+// return score;
+// }
+//
+// j+=(c==GAPC ? GAPLEN : 1);
+//// if(c!=GAPC){j++;}
+//// else{j+=GAPLEN;}
+// }
+// throw new RuntimeException("Out of bounds.");
+// }
+
+ /**
+  * Converts an index into the gapped reference into an ungapped reference coordinate.
+  * Each GAPC symbol before the index advances the reference coordinate by GAPLEN; every other symbol by 1.
+  * Indices at or below zero are offset directly from grefRefOrigin.
+  * @param point Index into gref
+  * @param gref Gapped reference
+  * @return the reference coordinate corresponding to point
+  * @throws RuntimeException if point is not reached before greflimit2
+  */
+ private final int translateFromGappedCoordinate(int point, byte[] gref){
+     if(verbose){System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=0){return grefRefOrigin+point;}
+
+     int refPos=grefRefOrigin;
+     int gPos=0;
+     while(gPos<greflimit2){
+         final byte sym=gref[gPos];
+         assert(point>=gPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+         if(gPos==point){
+             if(verbose){System.err.println(" -> "+refPos);}
+             return refPos;
+         }
+
+         if(sym==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos+=1;
+         }
+         gPos++;
+     }
+
+     //Ran off the end without reaching the requested index; dump state before failing.
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+ /**
+  * Converts an ungapped reference coordinate into an index into the gapped reference.
+  * Walks gref from index 0, advancing the reference coordinate by GAPLEN for each GAPC symbol and by 1
+  * otherwise, until the coordinate equals point.
+  * Coordinates at or below grefRefOrigin are offset directly from it.
+  * @param point Reference coordinate
+  * @param gref Gapped reference
+  * @return the index into gref corresponding to point
+  * @throws RuntimeException if point is not reached before greflimit2
+  */
+ private final int translateToGappedCoordinate(int point, byte[] gref){
+     if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=grefRefOrigin){return point-grefRefOrigin;}
+
+     int refPos=grefRefOrigin;
+     int gPos=0;
+     while(gPos<greflimit2){
+         assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+         final byte sym=gref[gPos];
+
+         if(refPos==point){
+             if(verbose){System.err.println(" -> "+gPos);}
+             return gPos;
+         }
+
+         if(sym==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos+=1;
+         }
+         gPos++;
+     }
+
+     //Ran off the end without reaching the requested coordinate; dump state before failing.
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+
+ /**
+  * Calculates an affine score from a location array produced by the index.
+  * Positive entries are treated as matched reference locations; every other entry is scored as a substitution.
+  * A change in matched location relative to the last matched one is charged as an indel, with the
+  * per-base cost stepping from POINTS_x to POINTS_x2 and then POINTS_x3 beyond LIMIT_FOR_COST_3.
+  * @param locArray Per-base locations
+  * @return total score
+  */
+ private final int calcAffineScore(int[] locArray){
+     int total=0;
+     int prevTrueLoc=-2; //Last true location
+     int prevValue=-1; //Raw entry seen on the previous iteration
+     int runLength=0;
+
+     for(final int loc : locArray){
+         if(loc<=0){//substitution
+             final boolean contiguous=(prevValue<0 && runLength>0);
+             total+=POINTS_SUB;
+             runLength=(contiguous ? runLength+1 : 1);
+         }else if(loc==prevValue){//contiguous match
+             total+=POINTS_MATCH2;
+             prevTrueLoc=loc;
+         }else{
+             total+=POINTS_MATCH;
+             final boolean shifted=(loc!=prevTrueLoc && prevTrueLoc>=0);
+             if(shifted){//otherwise: match after a sub, or first match
+                 assert(prevTrueLoc>=0);
+                 int gapCost;
+                 if(loc<prevTrueLoc){//deletion
+                     gapCost=POINTS_DEL;
+                     int span=prevTrueLoc-loc+1;
+                     if(span>LIMIT_FOR_COST_3){
+                         gapCost+=(span-LIMIT_FOR_COST_3)*POINTS_DEL3;
+                         span=LIMIT_FOR_COST_3;
+                     }
+                     if(span>1){
+                         gapCost+=(span-1)*POINTS_DEL2;
+                     }
+                 }else{//insertion
+                     gapCost=POINTS_INS;
+                     int span=Tools.min(loc-prevTrueLoc+1, 5);
+                     assert(span>0);
+                     if(span>LIMIT_FOR_COST_3){
+                         gapCost+=(span-LIMIT_FOR_COST_3)*POINTS_INS3;
+                         span=LIMIT_FOR_COST_3;
+                     }
+                     if(span>1){
+                         gapCost+=(span-1)*POINTS_INS2;
+                     }
+                 }
+                 total+=gapCost;
+                 runLength=1;
+             }
+             prevTrueLoc=loc;
+         }
+         prevValue=loc;
+     }
+     assert(total<=maxQuality(locArray.length));
+     return total;
+ }
+
+ /**
+  * Calculates an affine score from a location array, adding baseScores[i] for every matching position.
+  * Entries above zero are matched reference locations, -1 is a substitution, and anything else is
+  * asserted to be -2 and charged POINTS_NOCALL.
+  * @param locArray Per-base locations
+  * @param baseScores Per-base score modifier, indexed like locArray
+  * @param bases Read bases; only used in the assertion message
+  * @return total score
+  */
+ @Override
+ public final int calcAffineScore(final int[] locArray, final byte[] baseScores, final byte bases[]){
+     int total=0;
+     int prevTrueLoc=-3; //Last true location
+     int prevValue=-1; //Raw entry seen on the previous iteration
+     int runLength=0;
+
+     final int len=locArray.length;
+     for(int pos=0; pos<len; pos++){
+         final int loc=locArray[pos];
+
+         if(loc>0){//match
+             final int bonus=baseScores[pos];
+             if(loc==prevValue){//contiguous match
+                 total+=(POINTS_MATCH2+bonus);
+             }else{
+                 total+=(POINTS_MATCH+bonus);
+                 final boolean shifted=(loc!=prevTrueLoc && prevTrueLoc>=0);
+                 if(shifted){//otherwise: match after a sub, or first match
+                     assert(prevTrueLoc>=0);
+                     int gapCost;
+                     if(loc<prevTrueLoc){//deletion
+                         gapCost=POINTS_DEL;
+                         int span=prevTrueLoc-loc+1;
+                         if(span>LIMIT_FOR_COST_3){
+                             gapCost+=(span-LIMIT_FOR_COST_3)*POINTS_DEL3;
+                             span=LIMIT_FOR_COST_3;
+                         }
+                         if(span>1){
+                             gapCost+=(span-1)*POINTS_DEL2;
+                         }
+                     }else{//insertion
+                         gapCost=POINTS_INS;
+                         int span=Tools.min(loc-prevTrueLoc+1, 5);
+                         assert(span>0);
+                         if(span>LIMIT_FOR_COST_3){
+                             gapCost+=(span-LIMIT_FOR_COST_3)*POINTS_INS3;
+                             span=LIMIT_FOR_COST_3;
+                         }
+                         if(span>1){
+                             gapCost+=(span-1)*POINTS_INS2;
+                         }
+                     }
+                     total+=gapCost;
+                     runLength=1;
+                 }
+             }
+             prevTrueLoc=loc;
+         }else if(loc==-1){//substitution
+             final boolean contiguous=(prevValue<0 && runLength>0);
+             total+=POINTS_SUB;
+             runLength=(contiguous ? runLength+1 : 1);
+         }else{//N (no-call or no-ref)
+             assert(loc==-2) : "\ni="+pos+", loc="+loc+", score="+total+", lastLoc="+prevTrueLoc+", lastValue="+prevValue
+                 +", time="+runLength+", length="+locArray.length+"\nbases=\n"+new String(bases)
+                 +"\nlocs[]=\n"+Arrays.toString(locArray)+"\n"+new String(bases)+"\n"
+                 +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";
+             runLength=0;
+             total+=POINTS_NOCALL;
+         }
+         prevValue=loc;
+     }
+     assert(total<=maxQuality(locArray.length));
+     return total;
+ }
+
+ /**
+  * Calculates an affine score from a location array, like the three-argument overload, while also
+  * tracking the longest run of contiguous matches. If that run is shorter than minContig the
+  * score is capped at -50*locArray.length.
+  * @param locArray Per-base locations (above zero = match, -1 = substitution, otherwise asserted to be -2)
+  * @param baseScores Per-base score modifier, indexed like locArray
+  * @param bases Read bases; only used in the assertion message
+  * @param minContig Minimum required length of the longest contiguous match run; asserted to exceed 1
+  * @return total score, possibly capped
+  */
+ @Override
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig){
+     assert(minContig>1) : minContig;
+
+     int run=0; //Length of the current contiguous match run
+     int longestRun=0; //Longest completed run so far
+
+     int total=0;
+     int prevTrueLoc=-3; //Last true location
+     int prevValue=-1; //Raw entry seen on the previous iteration
+     int runLength=0;
+
+     final int len=locArray.length;
+     for(int pos=0; pos<len; pos++){
+         final int loc=locArray[pos];
+
+         if(loc>0){//match
+             if(loc==prevValue){//contiguous match
+                 run++;
+                 total+=(POINTS_MATCH2+baseScores[pos]);
+             }else{
+                 //Any other kind of match closes the current run.
+                 longestRun=Tools.max(longestRun, run);
+                 final boolean shifted=(loc!=prevTrueLoc && prevTrueLoc>=0);
+                 if(!shifted){//match after a sub, or first match
+                     run=1;
+                     total+=(POINTS_MATCH+baseScores[pos]);
+                 }else{
+                     run=0;
+                     assert(prevTrueLoc>=0);
+                     total+=(POINTS_MATCH+baseScores[pos]);
+                     int gapCost;
+                     if(loc<prevTrueLoc){//deletion
+                         gapCost=POINTS_DEL;
+                         int span=prevTrueLoc-loc+1;
+                         if(span>LIMIT_FOR_COST_3){
+                             gapCost+=(span-LIMIT_FOR_COST_3)*POINTS_DEL3;
+                             span=LIMIT_FOR_COST_3;
+                         }
+                         if(span>1){
+                             gapCost+=(span-1)*POINTS_DEL2;
+                         }
+                     }else{//insertion
+                         gapCost=POINTS_INS;
+                         int span=Tools.min(loc-prevTrueLoc+1, 5);
+                         assert(span>0);
+                         if(span>LIMIT_FOR_COST_3){
+                             gapCost+=(span-LIMIT_FOR_COST_3)*POINTS_INS3;
+                             span=LIMIT_FOR_COST_3;
+                         }
+                         if(span>1){
+                             gapCost+=(span-1)*POINTS_INS2;
+                         }
+                     }
+                     total+=gapCost;
+                     runLength=1;
+                 }
+             }
+             prevTrueLoc=loc;
+         }else if(loc==-1){//substitution
+             final boolean contiguous=(prevValue<0 && runLength>0);
+             total+=POINTS_SUB;
+             runLength=(contiguous ? runLength+1 : 1);
+         }else{//N (no-call or no-ref)
+             assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+                 +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";
+             runLength=0;
+             total+=POINTS_NOCALL;
+         }
+         prevValue=loc;
+     }
+     assert(total<=maxQuality(locArray.length));
+     final int bestRun=Tools.max(run, longestRun);
+     if(bestRun<minContig){
+         total=Tools.min(total, -50*locArray.length);
+     }
+     return total;
+ }
+
+ /** Scores an ungapped placement of read at refStart, without a SiteScore to update. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart){
+     final SiteScore noSite=null;
+     return scoreNoIndels(read, ref, refStart, noSite);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+ int norefs=0;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ score+=POINTS_SUB;
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+
+ return score;
+ }
+
+ @Override
+ public final byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart){
+ if(read==null || ref==null){return null;}
+
+ final byte[] match=new byte[read.length];
+
+ for(int i=0, j=refStart; i<read.length; i++, j++){
+ byte c=read[i];
+ byte r=(j<0 || j>=ref.length) ? (byte)'N' : ref[j];
+
+ if(c=='N' || r=='N'){match[i]='N';}
+ else if(c==r){match[i]='m';}
+ else{match[i]='S';}
+
+ }
+
+ return match;
+ }
+
+ /** Scores an ungapped placement of read at refStart using per-base score modifiers, without a SiteScore to update. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){
+     final SiteScore noSite=null;
+     return scoreNoIndels(read, ref, baseScores, refStart, noSite);
+ }
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+ int norefs=0;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ boolean semiperfect=true;
+
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ norefs+=readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ norefs+=dif;
+ }
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ semiperfect=false;
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+ norefs++;
+ }else{
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ score+=POINTS_SUB;
+ mode=MODE_SUB;
+ semiperfect=false;
+ }
+ }
+
+
+ if(semiperfect && ss!=null){ss.semiperfect=((ss.stop==ss.start+read.length-1) && (norefs<=read.length/2));}
+ assert(Read.CHECKSITE(ss, read, -1));
+
+ return score;
+ }
+
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0 || refStop>ref.length){return -99999;}
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ score+=POINTS_SUB;
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /** Scores read against ref at a fixed offset with substitutions only (no indels, no base-quality bonus) and records one symbol per base ('m' match, 'S' substitution, 'N' no-call or undefined reference) in matchReturn[0]. Returns -99999 when the read would overhang either end of ref. */ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1; //No previous state, so the first match is scored with POINTS_MATCH rather than POINTS_MATCH2
+ int timeInMode=0; //Run length of the current mode; maintained below but never read in this method
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0 || refStop>ref.length){return -99999;} //Any overhang is rejected here, so neither clipping branch below can execute
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){ //Reuse the caller's buffer only when its length equals the read length
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length; //A supplied buffer may be too short but is not expected to be too long
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+ //Walk the read and the reference in lockstep; read base i is compared with ref[refStart+i]
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ if(mode==MODE_MS){ //Continuing a match run earns the higher POINTS_MATCH2
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){ //No-call in the read (negative byte or 'N'); mode is left unchanged
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){ //Undefined reference base; mode is left unchanged here as well
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+ //Every substitution costs the same POINTS_SUB regardless of run length
+ score+=POINTS_SUB;
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /** Highest attainable score for numBases bases without quality bonuses: one POINTS_MATCH followed by numBases-1 times POINTS_MATCH2. */ @Override
+ public final int maxQuality(int numBases){
+ return (numBases-1)*POINTS_MATCH2+POINTS_MATCH;
+ }
+
+ /** Highest attainable score when per-base bonuses apply: the plain perfect score for baseScores.length bases plus Tools.sumInt(baseScores). */ @Override
+ public final int maxQuality(byte[] baseScores){
+ return (baseScores.length-1)*POINTS_MATCH2+POINTS_MATCH+Tools.sumInt(baseScores);
+ }
+
+ /** Returns maxQuality(numBases) plus the more negative of POINTS_DEL and (POINTS_INS-POINTS_MATCH2). */ @Override
+ public final int maxImperfectScore(int numBases){
+ final int perfect=maxQuality(numBases);
+
+ //The harsher of two single-indel adjustments: a deletion opening, or an
+ //insertion opening that also forfeits one POINTS_MATCH2.
+ final int penalty=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+ final int imperfect=perfect+penalty;
+
+ //The result must stay below the perfect score with two POINTS_MATCH2 replaced by POINTS_MATCH+POINTS_SUB.
+ assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+ return imperfect;
+ }
+
+ /** Returns maxQuality(baseScores) plus the more negative of POINTS_DEL and (POINTS_INS-POINTS_MATCH2). */ @Override
+ public final int maxImperfectScore(byte[] baseScores){
+ final int perfect=maxQuality(baseScores);
+
+ //The harsher of two single-indel adjustments: a deletion opening, or an
+ //insertion opening that also forfeits one POINTS_MATCH2.
+ final int penalty=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+ final int imperfect=perfect+penalty;
+
+ //The result must stay below the perfect score with two POINTS_MATCH2 replaced by POINTS_MATCH+POINTS_SUB.
+ assert(imperfect<(perfect-(POINTS_MATCH2+POINTS_MATCH2)+(POINTS_MATCH+POINTS_SUB)));
+ return imperfect;
+ }
+
+ /** Score of a deletion of len bases, built from the POINTS_DEL* constants; 0 when len is not positive. With approximateGaps, lengths above MINGAP are charged POINTS_GAP per GAPLEN block. */ @Override
+ public int calcDelScore(int len, boolean approximateGaps){
+ if(len<1){return 0;}
+ int remaining=len;
+ int total=POINTS_DEL; //Opening cost, covers the first deleted base
+
+ if(approximateGaps && remaining>MINGAP){
+ //Charge POINTS_GAP per whole GAPLEN block; only the remainder plus GAPBUFFER2 is priced per base.
+ final int blocks=(remaining-GAPBUFFER2)/GAPLEN;
+ final int leftover=(remaining%GAPLEN)+GAPBUFFER2;
+ total+=blocks*POINTS_GAP;
+ assert(leftover<remaining);
+ remaining=leftover;
+ }
+
+ //Bases beyond LIMIT_FOR_COST_3 cost POINTS_DEL3 each; bases 2..LIMIT_FOR_COST_3 cost POINTS_DEL2 each.
+ final int tier3=(remaining>LIMIT_FOR_COST_3 ? remaining-LIMIT_FOR_COST_3 : 0);
+ total+=tier3*POINTS_DEL3;
+ remaining-=tier3;
+ if(remaining>1){total+=(remaining-1)*POINTS_DEL2;}
+ return total;
+ }
+
+ private static int calcDelScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL;
+
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ /** Score of an insertion of len bases, built from the POINTS_INS* constants; 0 when len is not positive. */ @Override
+ public int calcInsScore(int len){
+ if(len<1){return 0;}
+
+ //The first base costs POINTS_INS, bases 2..LIMIT_FOR_COST_3 cost POINTS_INS2 each, later bases POINTS_INS3 each.
+ final int beyond=(len>LIMIT_FOR_COST_3 ? len-LIMIT_FOR_COST_3 : 0);
+ final int within=len-beyond;
+
+ int total=POINTS_INS;
+ total+=beyond*POINTS_INS3;
+ if(within>1){total+=(within-1)*POINTS_INS2;}
+
+ return total;
+ }
+
+ private static int calcInsScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_INS;
+
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_INS2;
+ }
+ return score;
+ }
+
+
+ private final int[][][] packed; //Packed score matrices; presumably indexed [mode][row][col] as in the sibling aligners - confirm in the constructor
+ private final byte[] grefbuffer; //Internal buffer handed out by getGrefbuffer()
+ private int greflimit=-1; //-1 until assigned; NOTE(review): the code that sets the gref* fields is outside this view
+ private int greflimit2=-1;
+ private int grefRefOrigin=-1;
+
+
+ /** Exposes the internal grefbuffer array directly (no defensive copy). DO NOT MODIFY. */
+ @Override
+ public final byte[] getGrefbuffer(){
+ return grefbuffer;
+ }
+
+ public final int[] vertLimit; //Per-row limits stored pre-shifted by SCOREOFFSET; showVertLimit() prints entries 0..rows
+ public final int[] horizLimit; //Per-column limits stored pre-shifted by SCOREOFFSET; showHorizLimit() prints entries 0..columns
+
+ /** Renders vertLimit[0..rows], each shifted right by SCOREOFFSET, as text with a "," after every value. */ @Override
+ public CharSequence showVertLimit(){
+ final StringBuilder text=new StringBuilder();
+ for(int row=0; row<=rows; ){text.append(vertLimit[row++]>>SCOREOFFSET).append(",");}
+ return text;
+ }
+
+ /** Renders horizLimit[0..columns], each shifted right by SCOREOFFSET, as text with a "," after every value. */ @Override
+ public CharSequence showHorizLimit(){
+ final StringBuilder text=new StringBuilder();
+ for(int col=0; col<=columns; ){text.append(horizLimit[col++]>>SCOREOFFSET).append(",");}
+ return text;
+ }
+
+ /** Converts a minimum identity (a fraction, or a percentage if above 1) into a minimum score ratio using a fixed blend of per-error costs; the result is never below 0.1. */ public static float minIdToMinRatio(double minid){
+ if(minid>1){minid=minid/100;}
+ assert(minid>0 && minid<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100.";
+ double matchdif=POINTS_MATCH-POINTS_MATCH2; //Difference between a run-starting match and a run-continuing match
+ double match=POINTS_MATCH2;
+ double sub=-POINTS_MATCH2+0.5*(matchdif+POINTS_SUB)+0.5*POINTS_SUB; //Per-substitution cost estimate, including the forfeited POINTS_MATCH2
+ double del=0.8*(matchdif+POINTS_DEL)+0.1*POINTS_DEL2+0.05*POINTS_DEL3+0.05*POINTS_DEL3; //NOTE(review): POINTS_DEL3 appears twice with weight 0.05; possibly intentional (0.1 in total) - confirm
+ double ins=-POINTS_MATCH2+0.8*(matchdif+POINTS_INS)+0.15*(POINTS_INS2)+0.05*(POINTS_INS3); //Per-inserted-base cost estimate, including the forfeited POINTS_MATCH2
+ double badAvg=.2*sub+.3*del+.5*ins; //Fixed weighting: 20% substitutions, 30% deletions, 50% insertions
+ double badFraction=1-minid;
+ double minratio=(match+badFraction*badAvg)/match; //Expected per-base score relative to a perfect POINTS_MATCH2
+ assert(minratio<=1);
+ minratio=Tools.max(0.1, minratio);
+ return (float)minratio;
+ }
+
+ public static final int TIMEBITS=9; //Width of the low-order time field of a packed int
+ public static final int SCOREBITS=32-TIMEBITS; //The remaining 23 high-order bits hold the score
+ public static final int MAX_TIME=((1<<TIMEBITS)-1); //511, the largest value that fits in TIMEBITS
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; //Largest positive value of a signed SCOREBITS-bit field, less a margin of 2000
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+ //Left shift that moves a plain score into the score field
+ public static final int SCOREOFFSET=TIMEBITS;
+ //Masks isolating the time field (low bits) and the score field (high bits)
+ public static final int TIMEMASK=~((-1)<<TIMEBITS);
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET;
+ //State identifiers; scoreNoIndelsAndMakeMatchString uses MODE_MS for matches and MODE_SUB for substitutions
+ private static final byte MODE_MS=0;
+ private static final byte MODE_DEL=1;
+ private static final byte MODE_INS=2;
+ private static final byte MODE_SUB=3;
+
+ public static final int POINTS_NOREF=0; //Read base opposite an undefined reference base
+ public static final int POINTS_NOCALL=0; //Undefined read base
+ public static final int POINTS_MATCH=92; //Match that starts a run
+ public static final int POINTS_MATCH2=100; //Match that continues a run
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-87;
+ public static final int POINTS_SUBR=-89; //increased penalty if prior match streak was at most 1
+// public static final int POINTS_SUB2=-75;
+// public static final int POINTS_SUB3=-50;
+// public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-100; //First inserted base
+ public static final int POINTS_INS2=-81; //Inserted bases 2..LIMIT_FOR_COST_3
+ public static final int POINTS_INS3=-59; //Inserted bases beyond LIMIT_FOR_COST_3
+// public static final int POINTS_INS4=-45;
+ public static final int POINTS_DEL=-140; //First deleted base
+ public static final int POINTS_DEL2=-73; //Deleted bases 2..LIMIT_FOR_COST_3
+ public static final int POINTS_DEL3=-58; //Deleted bases beyond LIMIT_FOR_COST_3
+// public static final int POINTS_DEL4=-44;
+// public static final int POINTS_DEL5=-30;
+ public static final int POINTS_DEL_REF_N=-10;
+ public static final int POINTS_GAP=0-GAPCOST; //Negated GAPCOST (GAPCOST is declared outside this class)
+
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1; //Low-bit mask; valid because TIMESLIP is a power of two (asserted below)
+ static{assert(Integer.bitCount(TIMESLIP)==1);}
+
+ //TODO: Consider removing these barriers entirely for PacBio reads. Would make code faster, too.
+ private static final int BARRIER_I1=1;
+ private static final int BARRIER_D1=1;
+
+ //Indel run length above which the cheapest (tier-3) per-base cost applies
+ public static final int LIMIT_FOR_COST_3=5;
+// public static final int LIMIT_FOR_COST_4=20;
+// public static final int LIMIT_FOR_COST_5=80;
+
+ public static final int BAD=MIN_SCORE-1; //Sentinel one below the lowest legal score
+
+
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET); //Each POINTSoff_X is POINTS_X shifted left by SCOREOFFSET, i.e. aligned with the score field of a packed int
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+// public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+// public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+// public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+// public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+// public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+// public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET); //Shifted sentinel
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET; //Shifted upper bound of legal scores
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET; //Shifted lower bound of legal scores
+
+ /** TODO: possibly enclose all uses of affine arrays in a branch controlled by this */
+ public static final boolean AFFINE_ARRAYS=false;
+ public static final int[] POINTS_INS_ARRAY; //[i] = cost of the i-th base of an insertion run
+ public static final int[] POINTSoff_INS_ARRAY; //Same, pre-shifted by SCOREOFFSET
+ public static final int[] POINTS_INS_ARRAY_C; //[i] = cumulative cost of an insertion run of length i
+ public static final int[] POINTSoff_INS_ARRAY_C; //Same, pre-shifted by SCOREOFFSET
+
+ public static final int[] POINTS_SUB_ARRAY; //[i] = cost of the i-th substitution (always POINTS_SUB)
+ public static final int[] POINTSoff_SUB_ARRAY;
+ public static final int[] POINTS_SUB_ARRAY_C; //[i] = cumulative cost of i substitutions
+ public static final int[] POINTSoff_SUB_ARRAY_C;
+ //Fills the lookup tables above; index 0 of every table keeps its default value 0
+ static{
+ POINTS_INS_ARRAY=new int[604];
+ POINTSoff_INS_ARRAY=new int[604];
+ POINTS_INS_ARRAY_C=new int[604];
+ POINTSoff_INS_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_INS_ARRAY.length; i++){
+ int pts, ptsoff;
+ if(i>LIMIT_FOR_COST_3){ //Same three tiers as calcInsScore
+ pts=POINTS_INS3;
+ ptsoff=POINTSoff_INS3;
+ }else if(i>1){
+ pts=POINTS_INS2;
+ ptsoff=POINTSoff_INS2;
+ }else{
+ pts=POINTS_INS;
+ ptsoff=POINTSoff_INS;
+ }
+ POINTS_INS_ARRAY[i]=pts;
+ POINTSoff_INS_ARRAY[i]=ptsoff;
+ POINTS_INS_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_INS_ARRAY_C[i-1]); //Running sum, clamped so it never drops below MIN_SCORE
+ POINTSoff_INS_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_INS_ARRAY_C[i-1]);
+ }
+
+
+ POINTS_SUB_ARRAY=new int[604];
+ POINTSoff_SUB_ARRAY=new int[604];
+ POINTS_SUB_ARRAY_C=new int[604];
+ POINTSoff_SUB_ARRAY_C=new int[604];
+
+ for(int i=1; i<POINTS_SUB_ARRAY.length; i++){
+ int pts, ptsoff;
+ { //Substitutions are not tiered in this class
+ pts=POINTS_SUB;
+ ptsoff=POINTSoff_SUB;
+ }
+ POINTS_SUB_ARRAY[i]=pts;
+ POINTSoff_SUB_ARRAY[i]=ptsoff;
+ POINTS_SUB_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_SUB_ARRAY_C[i-1]); //Running sum, clamped so it never drops below MIN_SCORE
+ POINTSoff_SUB_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_SUB_ARRAY_C[i-1]);
+ }
+ }
+
+ public final int POINTS_NOREF(){return POINTS_NOREF;} //Instance accessors for this class's static scoring constants (presumably implementing the MSA superclass API - confirm)
+ public final int POINTS_NOCALL(){return POINTS_NOCALL;}
+ public final int POINTS_MATCH(){return POINTS_MATCH;}
+ public final int POINTS_MATCH2(){return POINTS_MATCH2;}
+ public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;}
+ public final int POINTS_SUB(){return POINTS_SUB;}
+ public final int POINTS_SUBR(){return POINTS_SUBR;}
+ public final int POINTS_SUB2(){return POINTS_SUB;} //No separate SUB2 constant in this class; reports POINTS_SUB
+ public final int POINTS_SUB3(){return POINTS_SUB;} //No separate SUB3 constant in this class; reports POINTS_SUB
+ public final int POINTS_MATCHSUB(){return POINTS_MATCH;} //No separate MATCHSUB constant in this class; reports POINTS_MATCH
+ public final int POINTS_INS(){return POINTS_INS;}
+ public final int POINTS_INS2(){return POINTS_INS2;}
+ public final int POINTS_INS3(){return POINTS_INS3;}
+ public final int POINTS_INS4(){return POINTS_INS3;} //No fourth insertion tier in this class; reports POINTS_INS3
+ public final int POINTS_DEL(){return POINTS_DEL;}
+ public final int POINTS_DEL2(){return POINTS_DEL2;}
+ public final int POINTS_DEL3(){return POINTS_DEL3;}
+ public final int POINTS_DEL4(){return POINTS_DEL3;} //No fourth deletion tier in this class; reports POINTS_DEL3
+ public final int POINTS_DEL5(){return POINTS_DEL3;} //No fifth deletion tier in this class; reports POINTS_DEL3
+ public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;}
+ public final int POINTS_GAP(){return POINTS_GAP;}
+
+ public final int TIMESLIP(){return TIMESLIP;}
+ public final int MASK5(){return MASK5;}
+ public final int SCOREOFFSET(){return SCOREOFFSET;} //Accessor for the static shift constant; it must read the field, because calling SCOREOFFSET() here would recurse until the stack overflows
+
+ final int BARRIER_I1(){return BARRIER_I1;} //Package-private accessors for the private barrier constants
+ final int BARRIER_D1(){return BARRIER_D1;}
+
+ public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;}
+ public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_3;} //No separate tier-4 limit in this class; reports LIMIT_FOR_COST_3
+ public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_3;} //No separate tier-5 limit in this class; reports LIMIT_FOR_COST_3
+
+ public final int BAD(){return BAD;}
+
+
+ private int rows; //Upper index bound used by showVertLimit(); presumably the read length of the current alignment - confirm where it is assigned
+ private int columns; //Upper index bound used by showHorizLimit(); presumably the reference window length of the current alignment - confirm where it is assigned
+
+}
diff --git a/current/align2/MultiStateAligner9ts.java b/current/align2/MultiStateAligner9ts.java
new file mode 100755
index 0000000..568b5ae
--- /dev/null
+++ b/current/align2/MultiStateAligner9ts.java
@@ -0,0 +1,2433 @@
+package align2;
+
+import java.util.Arrays;
+
+import stream.SiteScore;
+
+import dna.AminoAcid;
+
+/**
+ * "P" for "Packed".<br>
+ * Same as MSA2P, but the "prevState" field was removed.
+ * Yields identical results to MSA2, but is faster.
+ * For very long reads (over 2000bp) the score may overflow, so MSA2 should be used instead,
+ * or the time field should be shrunk. */
+public final class MultiStateAligner9ts extends MSA{
+
+
+ /** Debug driver: aligns args[0] (the read) against args[1] (the reference) and prints the matrices before and after the fill, the fill result, the traceback and the score. */ public static void main(String[] args){
+ final byte[] query=args[0].getBytes();
+ final byte[] target=args[1].getBytes();
+ final byte[] originalRef=target; //Alias of the reference; not used further
+ final int qlen=query.length, tlen=target.length, tlast=tlen-1;
+
+ final MultiStateAligner9ts aligner=new MultiStateAligner9ts(qlen, tlen);
+ //Dump the freshly initialized matrices.
+ System.out.println("Initial: ");
+ printMatrix(aligner.packed, qlen, tlen, TIMEMASK, SCOREOFFSET);
+
+ //Fill over the whole reference with minScore 0 and no gap array.
+ final int[] best=aligner.fillLimited(query, target, 0, tlast, 0, null);
+ System.out.println("Max: "+Arrays.toString(best));
+
+ //Dump the matrices again after the fill.
+ System.out.println("Final: ");
+ printMatrix(aligner.packed, qlen, tlen, TIMEMASK, SCOREOFFSET);
+
+ //The first three entries of the fill result are handed on to traceback and score.
+ final byte[] matchString=aligner.traceback(query, target, 0, tlast, best[0], best[1], best[2], false);
+ final int[] rescored=aligner.score(query, target, 0, tlast, best[0], best[1], best[2], false);
+
+ System.out.println(new String(target));
+ System.out.println(new String(query));
+ System.out.println(new String(matchString));
+ System.out.println("Score: "+Arrays.toString(rescored));
+ }
+
+
+ /** Allocates the three packed matrices, the gref buffer and the limit arrays for up to maxRows_ read bases by maxColumns_ reference bases, then initializes them. Throws RuntimeException if the allocation runs out of memory. */ public MultiStateAligner9ts(int maxRows_, int maxColumns_){
+ super(maxRows_, maxColumns_);
+ //Allocate through temporaries so that each final field is assigned exactly once, after the try block
+ {
+ int[][][] packed0=null;
+ byte[] grefbuffer0=null;
+ int[] vertLimit0=null;
+ int[] horizLimit0=null;
+
+ try {
+ packed0=new int[3][maxRows+1][maxColumns+1];
+ grefbuffer0=new byte[maxColumns+2];
+ vertLimit0=new int[maxRows+1];
+ horizLimit0=new int[maxColumns+1];
+ } catch (OutOfMemoryError e) {
+ packed0=null;
+ grefbuffer0=null;
+ vertLimit0=null;
+ horizLimit0=null;
+ throw new RuntimeException(e.toString()); //Rethrown as unchecked; only the error's toString() text is carried over
+ }
+
+ packed=packed0;
+ grefbuffer=grefbuffer0;
+ vertLimit=vertLimit0;
+ horizLimit=horizLimit0;
+ }
+
+ Arrays.fill(vertLimit, BADoff);
+ Arrays.fill(horizLimit, BADoff);
+
+// for(int i=0; i<maxColumns+1; i++){
+// scores[0][i]=0-i;
+// }
+ //Rows 1..maxRows start as BADoff in every matrix; row 0 keeps its default value 0
+ for(int matrix=0; matrix<packed.length; matrix++){
+ for(int i=1; i<=maxRows; i++){
+ for(int j=0; j<packed[matrix][i].length; j++){
+ packed[matrix][i][j]|=BADoff;
+ }
+// packed[matrix][i][0]|=MODE_INS;
+ }
+// for(int i=0; i<maxRows+1; i++){
+// scores[matrix][i][0]=(i*POINTSoff_NOREF);
+// }
+ for(int i=0; i<=maxRows; i++){
+ //Column 0 is overwritten with the cumulative cost of an insertion run of length i, tiered by run length
+ int prevScore=(i<2 ? 0 : packed[matrix][i-1][0]);
+ int score=(i<2 ? (i*POINTSoff_INS) :
+ (i<LIMIT_FOR_COST_3 ? prevScore+POINTSoff_INS2 :
+ (i<LIMIT_FOR_COST_4 ? prevScore+POINTSoff_INS3 : prevScore+POINTSoff_INS4)));
+
+ packed[matrix][i][0]=score;
+ }
+// for(int i=1; i<maxColumns+1; i++){
+// prevState[matrix][0][i]=MODE_DEL;
+// }
+// for(int i=0; i<=maxColumns; i++){
+// packed[matrix][0][i]|=MODE_MS;
+// }
+ }
+ }
+
+ @Override
+ public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){
+ if(gaps==null){return fillLimitedX(read, ref, refStartLoc, refEndLoc, minScore);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillLimitedX(read, gref, 0, greflimit, minScore);
+ }
+ }
+
+
+ /** Score-pruned (and optionally banded) fill of the packed matrices. Returns new int[] {rows, maxCol, maxState, maxScore}, or null when the best final-row score is below the adjusted minScore; hands over to fillUnlimited when pruning is not applicable.
+ * Will not fill areas that cannot match minScore */
+ private final int[] fillLimitedX(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
+// minScore=0;
+// assert(minScore>0);
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+ //halfband stays 0 (no banding) unless bandwidth or bandwidthRatio is configured
+ final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 :
+ Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2;
+ //Use the unlimited fill without a positive minScore, for tiny matrices, or when an unbanded/wide-banded window is much longer than the read
+ if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){
+// assert(false) : minScore;
+// assert(minScore>0) : minScore;
+// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length);
+ return fillUnlimited(read, ref, refStartLoc, refEndLoc);
+ }
+
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ minScore-=MIN_SCORE_ADJUST; //Increases quality trivially
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length+"\n"+
+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n";
+
+// for(int x=0; x<packed.length; x++){
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+// }
+ for(int x=0; x<packed.length; x++){
+ //Only the final row is reset; it is the row scanned for the best score after the fill
+// Arrays.fill(packed[x][1], 1, columns+1, BADoff);
+ Arrays.fill(packed[x][rows], 1, columns+1, BADoff);
+// for(int y=1; y<rows+1; y++){
+// Arrays.fill(packed[x][y], 1, columns+1, BADoff);
+// }
+ }
+
+ int minGoodCol=1;
+ int maxGoodCol=columns;
+
+ final int minScore_off=(minScore<<SCOREOFFSET);
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //Shifted score of a perfect alignment of the whole read
+ final int floor=minScore_off-maxGain; //A cell scoring below this cannot reach minScore even if every base matched from there on
+// final int subfloor=Tools.max(BADoff, floor-200*POINTSoff_MATCH2);
+ final int subfloor=floor-5*POINTSoff_MATCH2; //Value written into pruned cells; lies below floor
+ assert(subfloor>BADoff); //TODO: Actually, it needs to be substantially more.
+ assert(subfloor<minScore_off) : minScore_off+", "+floor+", "+BADoff+", "+subfloor;
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("minScore="+minScore+"\t"+minScore_off);
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println("floor="+(floor>>SCOREOFFSET)+"\t"+(floor));
+ System.out.println("subfloor="+(subfloor>>SCOREOFFSET)+"\t"+(subfloor));
+ System.out.println("BADoff="+(BADoff>>SCOREOFFSET)+"\t"+(BADoff));
+ System.out.println("maxGain="+(maxGain>>SCOREOFFSET)+"\t"+(maxGain));
+ System.out.println();
+ }
+ //vertLimit[i]: score required after i read bases so that perfect matching of the rest still reaches minScore (clamped at floor)
+ vertLimit[rows]=minScore_off;
+ boolean prevDefined=false;
+ for(int i=rows-1; i>=0; i--){
+ byte c=read[i];
+ if(AminoAcid.isFullyDefined(c)){
+ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=false;
+ }
+ }
+ //horizLimit[i]: the analogous bound computed backwards along the reference window
+ horizLimit[columns]=minScore_off;
+ prevDefined=false;
+ for(int i=columns-1; i>=0; i--){
+ byte c=ref[refStartLoc+i];
+ if(AminoAcid.isFullyDefined(c)){
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=true;
+ }else{
+ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=false;
+ }
+ }
+
+// vertLimit[rows]=minScore_off;
+// for(int i=rows-1; i>=0; i--){
+// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+//
+// horizLimit[columns]=minScore_off;
+// for(int i=columns-1; i>=0; i--){
+// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor);
+// }
+
+ for(int row=1; row<=rows; row++){
+ //Visit the columns that stayed viable in the previous row, clipped to the band when banding is on
+ final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband));
+ final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1));
+
+ minGoodCol=-1; //Reset; re-established by cells of this row that meet their limit
+ maxGoodCol=-2;
+
+ final int vlimit=vertLimit[row];
+
+ if(verbose2){
+ System.out.println();
+ System.out.println("row="+row);
+ System.out.println("colStart="+colStart);
+ System.out.println("colStop="+colStop);
+ System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit));
+ }
+
+ if(colStart<0 || colStop<colStart){break;} //No viable column was left by the previous row (or the band is empty)
+
+ //Seal the cell just left of the window so that reads of [row][col-1] see a pruned value
+ if(colStart>1){
+ assert(row>0);
+ packed[MODE_MS][row][colStart-1]=subfloor;
+ packed[MODE_INS][row][colStart-1]=subfloor;
+ packed[MODE_DEL][row][colStart-1]=subfloor;
+ }
+
+
+ for(int col=colStart; col<=columns; col++){
+ //The loop may run past colStop; the check at the bottom of the body decides when to stop
+
+ if(verbose2){
+ System.out.println("\ncol "+col);
+ }
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]);
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]);
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+ final boolean match=(call1==ref1 && ref1!='N');
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+// System.err.println("")
+
+ iterationsLimited++;
+ final int limit=Tools.max(vlimit, horizLimit[col]);
+ final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3));
+ //Indel lengths implied by this cell's position; their penalties adjust limit into limit2 further down
+ final int delNeeded=Tools.max(0, row-col-1);
+ final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1);
+
+ final int delPenalty=calcDelScoreOffset(delNeeded);
+ final int insPenalty=calcInsScoreOffset(insNeeded);
+
+ //Predecessor scores with the time field masked off
+ final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+// if(scoreFromDiag_MS<limit3 && scoreFromDel_MS<limit3 && scoreFromIns_MS<limit3
+// && scoreFromDiag_DEL<limit && scoreFromDel_DEL<limit && scoreFromDiag_INS<limit && scoreFromIns_INS<limit){
+// iterationsLimited--; //A "fast" iteration
+// }
+ //MS cell: pruned outright on a gap symbol or when no diagonal predecessor exceeds limit3
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[MODE_MS][row][col]=subfloor;
+ }else{//Calculate match and sub scores
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ int score;
+ int time;
+ byte prevState;
+
+ if(match){
+
+ int scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ int scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+// if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+//// packed[MODE_MS][row][col]=(score|prevState|time);
+// packed[MODE_MS][row][col]=(score|time);
+// assert((score&SCOREMASK)==score);
+//// assert((prevState&MODEMASK)==prevState);
+// assert((time&TIMEMASK)==time);
+ }
+ //Adjust the limit by the penalty of whichever indel is implied by this position
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit); //Holds as long as the penalties are non-positive
+
+ if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ //DEL cell: also pruned in the rows excluded by the BARRIER_D bounds
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row;
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ final int limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+ if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ //INS cell: never computed on a gap symbol; the BARRIER_I bounds exclude edge rows except at the first and last columns
+// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ int scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState;
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ final int limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded)+calcInsScoreOffset(time);
+ }else{
+ limit2=limit;
+ }
+ assert(limit2>=limit);
+
+ if(verbose2){System.err.println("INS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));}
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //At or past colStop: stop once beyond it unless unbanded cells are still qualifying; otherwise seal the cell above-right
+ if(col>=colStop){
+ if(col>colStop && (maxGoodCol<col || halfband>0)){break;}
+ if(row>1){ //NOTE(review): col+1 can equal columns+1, which is only in bounds when columns<maxColumns - confirm callers guarantee this
+ packed[MODE_MS][row-1][col+1]=subfloor;
+ packed[MODE_INS][row-1][col+1]=subfloor;
+ packed[MODE_DEL][row-1][col+1]=subfloor;
+ }
+ }
+ }
+ }
+
+ //Pick the best-scoring cell of the final row across all three matrices
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ assert(maxScore>=BADoff);
+// if(maxScore==BADoff){
+// return null;
+// }
+// if(maxScore<floor){
+// return null;
+// }
+ if(maxScore<minScore_off){ //Nothing in the final row reached the (adjusted) minimum
+ return null;
+ }
+
+ maxScore>>=SCOREOFFSET; //Back to plain score units
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Override
+ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){
+ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);}
+ else{
+ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc);
+ assert(gref!=null) : "Excessively long read:\n"+new String(read);
+ return fillUnlimited(read, gref, 0, greflimit);
+ }
+ }
+
+
+ /** Fills all three packed score matrices for the whole read-by-reference grid, with no score cutoff.
+ * Does not require a min score (ie, same as old method).
+ * Sets rows=read.length and columns=refEndLoc-refStartLoc+1, then for every row 1..rows and column 1..columns
+ * stores one (score|time) word in packed[MODE_MS], packed[MODE_DEL] and packed[MODE_INS];
+ * the score is kept in the SCOREMASK bits and the streak length ("time") in the TIMEMASK bits.
+ * Row 0 and column 0 are only read as predecessors here, never written (presumably initialized elsewhere).
+ * Also increments iterationsUnlimited once per cell.
+ * @param read Read bases; read[row-1] is the base for matrix row "row"
+ * @param ref Reference bases; ref[refStartLoc+col-1] is the base for matrix column "col"
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @return new int[] {rows, maxC, maxS, max}; maxC and maxS are the column and state holding the highest
+ * masked score in the last row, and max is that score shifted right by SCOREOFFSET
+ * @throws RuntimeException if rows exceeds maxRows or columns exceeds maxColumns */
+ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; //Offset score of a read in which every base matches
+ final int subfloor=0-2*maxGain; //Sentinel written to cells that may not be used (gap columns, barrier rows)
+ assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n"
+ +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more.
+// final int BARRIER_I2=columns-BARRIER_I1;
+ final int BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ final int BARRIER_D2=rows-BARRIER_D1;
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
+ }
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+ //Main fill: row indexes the read (1-based), col indexes the reference window (1-based)
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+ iterationsUnlimited++;
+
+// final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ final byte call0=(row<2 ? (byte)'?' : read[row-2]); //Previous read base; '?' placeholder on the first row
+ final byte call1=read[row-1];
+ final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); //Previous ref base; '!' placeholder on the first column
+ final byte ref1=ref[refStartLoc+col-1];
+
+ final boolean match=(call1==ref1 && ref1!='N'); //A reference 'N' never counts as a match
+ final boolean prevMatch=(call0==ref0 && ref0!='N');
+
+ final boolean gap=(ref1==GAPC);
+ assert(call1!=GAPC);
+
+ if(gap){
+ packed[MODE_MS][row][col]=subfloor; //A read base may not be matched/substituted against a gap symbol
+ }else{//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Streak counter exceeded MAX_TIME: step it back to MAX_TIME-MASK5
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): prevState is assigned below but never read; the store that used it is commented out
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+ //Deletions are only scored for rows BARRIER_D1..BARRIER_D2; every other row gets the sentinel
+ if(row<BARRIER_D1 || row>BARRIER_D2){
+ packed[MODE_DEL][row][col]=subfloor;
+ }else{//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //Past LIMIT_FOR_COST_5 the DEL5 cost applies only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned but never read
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ //Calculate INS score
+// if(gap || col<BARRIER_I1 || col>BARRIER_I2){
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){ //Near the read ends, insertions are only allowed at the window edges
+ packed[MODE_INS][row][col]=subfloor;
+ }else{//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ //if(match){scoreMS=subfloor;}
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned but never read
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ //Pick the highest masked score in the last row, over every state; the first maximum found wins ties
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Shift the masked score down by SCOREOFFSET before returning it
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ @Deprecated
+ /** Deprecated variant of the matrix fill that adds a per-base bonus to matches.
+ * The first statement is assert(false), so this method fails immediately whenever assertions are enabled.
+ * Sets rows=read.length and columns=refEndLoc-refStartLoc+1 and writes a (score|time) word into
+ * packed[MODE_MS], packed[MODE_DEL] and packed[MODE_INS] for every row 1..rows and column 1..columns.
+ * A matching cell additionally receives baseScores[row-1] shifted left by SCOREOFFSET.
+ * Unlike fillUnlimited, no special handling of 'N' or GAPC symbols and no barrier rows appear in this code.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param baseScores Per-read-position score modifier, indexed like read
+ * @param refStartLoc First reference index of the window (inclusive)
+ * @param refEndLoc Last reference index of the window (inclusive)
+ * @return new int[] {rows, maxC, maxS, max}; */
+ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){
+ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult.";
+ rows=read.length;
+ columns=refEndLoc-refStartLoc+1;
+
+ assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
+ assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
+
+ assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
+ assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
+ //Main fill: row indexes the read (1-based), col indexes the reference window (1-based)
+ for(int row=1; row<=rows; row++){
+
+// int minc=max(1, row-20);
+// int maxc=min(columns, row+20);
+
+ for(int col=1; col<=columns; col++){
+
+ final boolean match=(read[row-1]==ref[refStartLoc+col-1]);
+ final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]);
+
+ {//Calculate match and sub scores
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ final int streak=(packed[MODE_MS][row-1][col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ int scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ int scoreD=scoreFromDel+POINTSoff_MATCH;
+ int scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ int score;
+ int time;
+// byte prevState;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+// prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+// prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+// prevState=MODE_INS;
+ }
+ score+=(((int)baseScores[row-1])<<SCOREOFFSET); //modifier
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} //Streak counter exceeded MAX_TIME: step it back to MAX_TIME-MASK5
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+
+ }else{
+
+ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ (streak==0 ? POINTSoff_SUB : streak<LIMIT_FOR_COST_3 ? POINTSoff_SUB2 : POINTSoff_SUB3));
+ int scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ int scoreI=scoreFromIns+POINTSoff_SUB;
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): prevState is assigned below but never read; the store that used it is commented out
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_MS][row][col]=(score|prevState|time);
+ packed[MODE_MS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ {//Calculate DEL score
+
+ final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_DEL;
+ int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0)); //Past LIMIT_FOR_COST_5 the DEL5 cost applies only when (streak&MASK5)==0
+// int scoreI=scoreFromIns+POINTSoff_DEL;
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned but never read
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_DEL][row][col]=(score|prevState|time);
+ packed[MODE_DEL][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+
+ {//Calculate INS score
+
+ final int streak=packed[MODE_INS][row-1][col]&TIMEMASK;
+
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+
+ int scoreMS=scoreFromDiag+POINTSoff_INS;
+// int scoreD=scoreFromDel+POINTSoff_INS;
+ int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS :
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_INS3 : POINTSoff_INS4);
+
+// System.err.println("("+row+","+col+")\t"+scoreFromDiag+"+"+POINTSoff_INS+"="+scoreM+", "+
+// scoreFromSub+"+"+POINTSoff_INS+"="+scoreS+", "
+// +scoreD+", "+scoreFromIns+"+"+
+// (streak==0 ? POINTSoff_INS : streak<LIMIT_FOR_COST_3 ? POINTSoff_INS2 : POINTSoff_INS3)+"="+scoreI);
+
+ int score;
+ int time;
+ byte prevState; //NOTE(review): assigned but never read
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead";
+ assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead";
+// packed[MODE_INS][row][col]=(score|prevState|time);
+ packed[MODE_INS][row][col]=(score|time);
+ assert((score&SCOREMASK)==score);
+// assert((prevState&MODEMASK)==prevState);
+ assert((time&TIMEMASK)==time);
+ }
+ }
+ }
+
+ //Pick the highest masked score in the last row, over every state; the first maximum found wins ties
+ int maxCol=-1;
+ int maxState=-1;
+ int maxScore=Integer.MIN_VALUE;
+
+ for(int state=0; state<packed.length; state++){
+ for(int col=1; col<=columns; col++){
+ int x=packed[state][rows][col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; //Shift the masked score down by SCOREOFFSET before returning it
+
+// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore);
+ return new int[] {rows, maxCol, maxState, maxScore};
+ }
+
+ /**
+  * Generates the match string for an alignment ending at the given matrix cell.
+  * For a gapped alignment the reference window is first translated into
+  * grefbuffer coordinates and the walk is done over grefbuffer; otherwise the
+  * arguments are passed to traceback2 unchanged.
+  * @param read Read bases
+  * @param ref Reference bases (ignored in favor of grefbuffer when gapped is true)
+  * @param refStartLoc First reference position of the window
+  * @param refEndLoc Last reference position of the window
+  * @param row Matrix row to start from
+  * @param col Matrix column to start from
+  * @param state Matrix state to start from
+  * @param gapped True to trace against grefbuffer
+  * @return Match string produced by traceback2
+  */
+ @Override
+ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){
+     if(!gapped){
+         return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state);
+     }
+
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+     return traceback2(read, gappedRef, gappedStart, gappedStop, row, col, state);
+ }
+
+ @Override
+ /** Generates the match string */
+ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
+// assert(false);
+ assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
+ assert(row==rows);
+
+ byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
+ int outPos=0;
+
+ int gaps=0;
+
+ if(state==MODE_INS){
+ //TODO ? Maybe not needed.
+ }
+
+ while(row>0 && col>0){
+
+// byte prev0=(byte)(packed[state][row][col]&MODEMASK);
+
+ final int time=packed[state][row][col]&TIMEMASK;
+ final byte prev;
+
+// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]);
+
+ if(state==MODE_MS){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;}
+ else{prev=MODE_INS;}
+ }
+
+ byte c=read[row-1];
+ byte r=ref[refStartLoc+col-1];
+ if(c==r){
+ out[outPos]='m';
+ }else{
+ if(!AminoAcid.isFullyDefined(c)){
+ out[outPos]='N';
+ }else if(!AminoAcid.isFullyDefined(r)){
+// out[outPos]='X';
+ out[outPos]='N';
+ }else{
+ out[outPos]='S';
+ }
+ }
+
+ row--;
+ col--;
+ }else if(state==MODE_DEL){
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK;
+ final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;}
+ else{prev=MODE_DEL;}
+ }
+
+ byte r=ref[refStartLoc+col-1];
+ if(r==GAPC){
+ out[outPos]='-';
+ gaps++;
+ }else{
+ out[outPos]='D';
+ }
+ col--;
+ }else{
+ if(time>1){prev=(byte)state;}
+ else{
+ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK;
+ final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+ if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;}
+ else{prev=MODE_INS;}
+ }
+
+ assert(state==MODE_INS) : state;
+ if(col==0){
+ out[outPos]='X';
+ }else if(col>=columns){
+ out[outPos]='Y';
+ }else{
+ out[outPos]='I';
+ }
+ row--;
+ }
+
+// assert(prev==prev0);
+ state=prev;
+ outPos++;
+ }
+
+ assert(row==0 || col==0);
+ if(col!=row){
+ while(row>0){
+ out[outPos]='X';
+ outPos++;
+ row--;
+ col--;
+ }
+ if(col>0){
+ //do nothing
+ }
+ }
+
+
+ //Shrink and reverse the string
+ byte[] out2=new byte[outPos];
+ for(int i=0; i<outPos; i++){
+ out2[i]=out[outPos-i-1];
+ }
+ out=null;
+
+ if(gaps==0){return out2;}
+
+ //TODO Consider outputting this compressed.
+ byte[] out3=new byte[out2.length+gaps*(GAPLEN-1)];
+ for(int i=0, j=0; i<out2.length; i++){
+ byte c=out2[i];
+ if(c!=GAPC){
+ out3[j]=c;
+ j++;
+ }else{
+ int lim=j+GAPLEN;
+ for(; j<lim; j++){
+ out3[j]='D';
+ }
+ }
+ }
+ return out3;
+ }
+
+ /**
+  * Scores the alignment ending at (maxRow, maxCol, maxState) and reports its reference span.
+  * For a gapped alignment the window is translated into grefbuffer coordinates, score2 is run over
+  * grefbuffer, and elements 1 and 2 of its result are translated back to ungapped coordinates.
+  * @param read Read bases
+  * @param ref Reference bases (grefbuffer is used instead when gapped is true)
+  * @param refStartLoc First reference position of the window
+  * @param refEndLoc Last reference position of the window
+  * @param maxRow Row of the alignment end cell
+  * @param maxCol Column of the alignment end cell
+  * @param maxState State of the alignment end cell
+  * @param gapped True to score against grefbuffer
+  * @return Result array of score2; starts with {score, bestRefStart, bestRefStop}
+  */
+ @Override
+ public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+         final int maxRow, final int maxCol, final int maxState, boolean gapped){
+     if(!gapped){
+         return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState);
+     }
+
+     if(verbose){
+         System.err.println("score():");
+         System.err.println("origin="+grefRefOrigin+", "+refStartLoc+", "+refEndLoc+", "+maxRow+", "+maxCol);
+     }
+     final byte[] gappedRef=grefbuffer;
+     final int gappedStart=translateToGappedCoordinate(refStartLoc, gappedRef);
+     final int gappedStop=translateToGappedCoordinate(refEndLoc, gappedRef);
+
+     //Round-trip checks on the coordinate translation
+     assert(translateFromGappedCoordinate(gappedStart, gappedRef)==refStartLoc); //TODO: Remove slow assertions
+     assert(translateFromGappedCoordinate(gappedStop, gappedRef)==refEndLoc);
+
+     assert(gappedStart==0) : gappedStart; //TODO: skip translation if this is always zero
+
+     if(verbose){System.err.println("gstart, gstop: "+gappedStart+", "+gappedStop);}
+     final int[] result=score2(read, gappedRef, gappedStart, gappedStop, maxRow, maxCol, maxState);
+     if(verbose){System.err.println("got score "+Arrays.toString(result));}
+
+     assert(result[1]==translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef)) :
+         "Verifying: "+result[1]+" -> "+translateFromGappedCoordinate(result[1], gappedRef)+" -> "+
+         translateToGappedCoordinate(translateFromGappedCoordinate(result[1], gappedRef), gappedRef);
+     assert(result[2]==translateToGappedCoordinate(translateFromGappedCoordinate(result[2], gappedRef), gappedRef));
+
+     //Convert the reported span back to ungapped reference coordinates
+     result[1]=translateFromGappedCoordinate(result[1], gappedRef);
+     result[2]=translateFromGappedCoordinate(result[2], gappedRef);
+     if(verbose){System.err.println("returning score "+Arrays.toString(result));}
+     return result;
+ }
+
+ /**
+  * Reads the score stored at (maxRow, maxCol, maxState) and walks the packed matrices backward
+  * to find where the alignment starts on the reference.
+  * If maxRow is before the last row, the walk starts from a cell shifted diagonally toward the last
+  * row, and POINTSoff_NOREF is added once for every row that cannot be covered by remaining columns.
+  * @param read Read bases (only used in assertion messages)
+  * @param ref Reference bases (only used in assertion messages)
+  * @param refStartLoc Reference index corresponding to matrix column 1
+  * @param refEndLoc Last reference index of the window
+  * @param maxRow Row of the alignment end cell
+  * @param maxCol Column of the alignment end cell
+  * @param maxState State of the alignment end cell
+  * @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}, <br>
+  * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight} <br>
+  * if more padding is needed
+  */
+ @Override
+ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
+         final int maxRow, final int maxCol, final int maxState){
+
+     assert(maxState>=0 && maxState<packed.length) :
+         maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+     assert(maxRow>=0 && maxRow<packed[0].length) :
+         maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+     assert(maxCol>=0 && maxCol<packed[0][0].length) :
+         maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
+
+     int score=packed[maxState][maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated
+     int row=maxRow;
+     int col=maxCol;
+     int state=maxState;
+
+     if(row<rows){
+         int rowsLeft=rows-row;
+         final int colsLeft=columns-col;
+
+         //Rows beyond the available columns are charged as having no reference
+         for(; rowsLeft>colsLeft; rowsLeft--){
+             score+=POINTSoff_NOREF;
+         }
+
+         row+=rowsLeft;
+         col+=rowsLeft;
+     }
+
+     assert(refStartLoc<=refEndLoc);
+     assert(row==rows);
+
+     final int bestRefStop=refStartLoc+col-1;
+
+     while(row>0 && col>0){
+         //A time above 1 means the cell continued a streak in the same state
+         final boolean extending=(packed[state][row][col]&TIMEMASK)>1;
+         final byte prev;
+
+         if(state==MODE_MS){
+             if(extending){
+                 prev=(byte)state;
+             }else{
+                 final int viaMS=packed[MODE_MS][row-1][col-1]&SCOREMASK;
+                 final int viaDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK;
+                 final int viaIns=packed[MODE_INS][row-1][col-1]&SCOREMASK;
+                 if(viaMS>=viaDel && viaMS>=viaIns){
+                     prev=MODE_MS;
+                 }else if(viaDel>=viaIns){
+                     prev=MODE_DEL;
+                 }else{
+                     prev=MODE_INS;
+                 }
+             }
+             row--;
+             col--;
+         }else if(state==MODE_DEL){
+             if(extending){
+                 prev=(byte)state;
+             }else{
+                 final int viaMS=packed[MODE_MS][row][col-1]&SCOREMASK;
+                 final int viaDel=packed[MODE_DEL][row][col-1]&SCOREMASK;
+                 if(viaMS>=viaDel){
+                     prev=MODE_MS;
+                 }else{
+                     prev=MODE_DEL;
+                 }
+             }
+             col--;
+         }else{
+             assert(state==MODE_INS);
+             if(extending){
+                 prev=(byte)state;
+             }else{
+                 final int viaMS=packed[MODE_MS][row-1][col]&SCOREMASK;
+                 final int viaIns=packed[MODE_INS][row-1][col]&SCOREMASK;
+                 if(viaMS>=viaIns){
+                     prev=MODE_MS;
+                 }else{
+                     prev=MODE_INS;
+                 }
+             }
+             row--;
+         }
+
+         if(col<0){
+             System.err.println(row);
+             break; //prevents an out of bounds access
+         }
+
+         state=prev;
+     }
+
+     //Unconsumed read rows push the start to the left of column 0
+     if(row>col){
+         col-=row;
+     }
+
+     final int bestRefStart=refStartLoc+col;
+
+     score>>=SCOREOFFSET;
+     if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
+         final int padLeft=Tools.max(0, refStartLoc-bestRefStart);
+         final int padRight=Tools.max(0, bestRefStop-refEndLoc);
+         return new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight};
+     }
+     return new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState};
+ }
+
+ /**
+  * Fills grefbuffer with a gap-compressed copy of the reference.
+  * gaps is read as inclusive {start, stop} pairs, and each region ref[start..stop] is copied verbatim.
+  * The stretch between one region's stop (y) and the next region's start (z) is compressed: with
+  * gap=z-y-1, rem=gap%GAPLEN and div=(gap-GAPBUFFER2)/GAPLEN, the GAPBUFFER+rem bases after y are
+  * copied, then div GAPC symbols are written, then the GAPBUFFER bases before z are copied.
+  * After the last region, up to GREFLIMIT2_CUSHION further reference bases ('N' past the end of ref)
+  * are appended to overwrite data left from an earlier call; the cushion is truncated at the end of
+  * the buffer.
+  * While building, gaps[0] and gaps[gaps.length-1] are widened to cover refStartLoc and refEndLoc;
+  * the original values are restored before returning, including when an exception is thrown.
+  * Side effects: sets grefRefOrigin, greflimit (end of the real data) and greflimit2 (index of the
+  * last cushion byte, or greflimit when no cushion fits).
+  * @param ref Reference bases
+  * @param gaps Inclusive {start, stop} coordinate pairs; must be non-null and non-empty
+  * @param refStartLoc First reference position that must be covered
+  * @param refEndLoc Last reference position that must be covered
+  * @return grefbuffer
+  */
+ private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){
+     assert(gaps!=null && gaps.length>0);
+
+     assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps);
+     assert(refEndLoc>=gaps[gaps.length-1]);
+
+     final int g0_old=gaps[0];
+     final int gN_old=gaps[gaps.length-1];
+     final byte[] gref=grefbuffer;
+
+     try{
+         gaps[0]=Tools.min(gaps[0], refStartLoc);
+         gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc);
+         grefRefOrigin=gaps[0];
+
+         if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));}
+
+         int gpos=0;
+         for(int i=0; i<gaps.length; i+=2){
+             int x=gaps[i];
+             int y=gaps[i+1];
+
+             for(int r=x; r<=y; r++, gpos++){
+                 //TODO: if out of bounds, use an 'N'
+                 gref[gpos]=ref[r];
+             }
+
+             if(i+2<gaps.length){
+                 int z=gaps[i+2];
+                 assert(z>y);
+                 int gap=z-y-1;
+                 assert(gap>=MINGAP) : gap+"\t"+MINGAP;
+                 if(gap<MINGAP){
+                     assert(false) : "TODO - just fill in normally";
+                 }else{
+                     int rem=gap%GAPLEN;
+                     int lim=y+GAPBUFFER+rem;
+
+                     int div=(gap-GAPBUFFER2)/GAPLEN;
+                     if(verbose){
+                         System.err.println("div = "+div);
+                     }
+                     assert(div>0);
+
+                     for(int r=y+1; r<=lim; r++, gpos++){
+                         gref[gpos]=ref[r];
+                     }
+                     for(int g=0; g<div; g++, gpos++){
+                         gref[gpos]=GAPC;
+                     }
+                     for(int r=z-GAPBUFFER; r<z; r++, gpos++){
+                         gref[gpos]=ref[r];
+                     }
+                 }
+             }
+         }
+
+         greflimit=gpos;
+
+         assert(gref[gpos-1]==ref[refEndLoc]);
+         //Add a cushion to the end to clear out the prior data (especially GAPC) that was there.
+         //lim is clamped to the buffer length, so a short cushion is written rather than overflowing.
+         {
+             final int lim=Tools.min(gref.length, greflimit+GREFLIMIT2_CUSHION);
+             greflimit2=greflimit; //Do not keep a stale value from an earlier call if no cushion fits
+             for(int i=greflimit, r=refEndLoc+1; i<lim; i++, r++){
+                 gref[i]=(r<ref.length ? ref[r] : (byte)'N');
+                 greflimit2=i;
+             }
+         }
+
+         if(verbose){
+             System.err.println("gref:\n"+new String(gref));
+         }
+     }finally{
+         //The caller's array is always handed back unchanged
+         gaps[0]=g0_old;
+         gaps[gaps.length-1]=gN_old;
+     }
+
+     if(verbose){
+         System.err.println("\ngaps3: "+Arrays.toString(gaps));
+     }
+
+     return gref;
+ }
+
+
+// public final int[] translateScoreFromGappedCoordinate(int[] score){
+//// {score, bestRefStart, bestRefStop}
+// int a=score[1];
+// int b=score[2];
+// int a2=-9999;
+// int b2=-9999;
+// for(int i=0, j=grefRefOrigin; i<grefbuffer.length; i++){
+// byte c=grefbuffer[i];
+//
+// if(i==a){a2=j;}
+// if(i==b){
+// b2=j;
+// assert(a2!=-9999);
+// score[1]=a2;
+// score[2]=b2;
+// return score;
+// }
+//
+// j+=(c==GAPC ? GAPLEN : 1);
+//// if(c!=GAPC){j++;}
+//// else{j+=GAPLEN;}
+// }
+// throw new RuntimeException("Out of bounds.");
+// }
+
+ /**
+  * Converts an index into the gapped reference buffer to an ungapped reference coordinate.
+  * Index 0 corresponds to grefRefOrigin; each GAPC symbol before the index advances the
+  * reference coordinate by GAPLEN and every other symbol by 1. Indices at or below zero are
+  * offset linearly from grefRefOrigin.
+  * @param point Index into gref
+  * @param gref Gapped reference buffer
+  * @return Corresponding ungapped reference coordinate
+  * @throws RuntimeException if point is not reached before greflimit2
+  */
+ private final int translateFromGappedCoordinate(int point, byte[] gref){
+     if(verbose){System.err.println("translateFromGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=0){return grefRefOrigin+point;}
+
+     int refPos=grefRefOrigin;
+     for(int gpos=0; gpos<greflimit2; gpos++){
+         final byte sym=gref[gpos];
+         assert(point>=gpos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+
+         if(gpos==point){
+             if(verbose){System.err.println(" -> "+refPos);}
+             return refPos;
+         }
+
+         if(sym==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos+=1;
+         }
+     }
+
+     //Dump the state before failing
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+ /**
+  * Converts an ungapped reference coordinate to an index into the gapped reference buffer.
+  * Buffer index 0 corresponds to grefRefOrigin; while scanning, each GAPC symbol advances the
+  * reference coordinate by GAPLEN and every other symbol by 1. Coordinates at or below
+  * grefRefOrigin are offset linearly.
+  * @param point Ungapped reference coordinate
+  * @param gref Gapped reference buffer
+  * @return Corresponding index into gref
+  * @throws RuntimeException if point is not hit exactly before greflimit2
+  */
+ private final int translateToGappedCoordinate(int point, byte[] gref){
+     if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);}
+     if(point<=grefRefOrigin){return point-grefRefOrigin;}
+
+     int refPos=grefRefOrigin;
+     for(int gpos=0; gpos<greflimit2; gpos++){
+         assert(point>=refPos) : "\n"+grefRefOrigin+"\n"+point+"\n"+new String(gref)+"\n";
+         final byte sym=gref[gpos];
+
+         if(refPos==point){
+             if(verbose){System.err.println(" -> "+gpos);}
+             return gpos;
+         }
+
+         if(sym==GAPC){
+             refPos+=GAPLEN;
+         }else{
+             refPos+=1;
+         }
+     }
+
+     //Dump the state before failing
+     System.err.println(grefRefOrigin);
+     System.err.println(point);
+     System.err.println(new String(gref));
+
+     throw new RuntimeException("Out of bounds.");
+ }
+
+
+ /** Calculates score based on an array from Index */
+ private final int calcAffineScore(int[] locArray){
+ int score=0;
+ int lastLoc=-2; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=POINTS_MATCH2;
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=POINTS_MATCH;
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=POINTS_MATCH;
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else{//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ @Override
+ public final int calcAffineScore(final int[] locArray, final byte[] baseScores, final byte bases[]){
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1;
+ int timeInMode=0;
+
+ for(int i=0; i<locArray.length; i++){
+ final int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5);
+ assert(dif>0);
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ assert(loc==-2) : "\ni="+i+", loc="+loc+", score="+score+", lastLoc="+lastLoc+", lastValue="+lastValue
+ +", time="+timeInMode+", length="+locArray.length+"\nbases=\n"+new String(bases)
+ +"\nlocs[]=\n"+Arrays.toString(locArray)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ return score;
+ }
+
+ /**
+ * Scores an alignment described by a per-base code array and additionally enforces a minimum run of contiguous matches.
+ * Codes handled in locArray: a positive value is a match, -1 is a substitution, and -2 is a no-call/no-ref.
+ * A positive value equal to the previous array entry extends the current match run; a positive value
+ * below/above the last positive value seen is scored as a deletion/insertion respectively.
+ * @param locArray Per-base codes as described above
+ * @param baseScores Per-base bonus added to the score of every matching base
+ * @param bases Read bases; only used to build an assertion message
+ * @param minContig Required length of the longest match run; must be greater than 1
+ * @return The accumulated score, lowered to at most -50*locArray.length when no run reaches minContig
+ */
+ @Override
+ public final int calcAffineScore(int[] locArray, byte[] baseScores, byte[] bases, int minContig){
+ assert(minContig>1) : minContig;
+
+ int contig=0; //Length of the match run currently being extended
+ int maxContig=0; //Longest run recorded so far (updated whenever a run is restarted)
+
+ int score=0;
+ int lastLoc=-3; //Last true location
+ int lastValue=-1; //Previous array entry, including the negative codes
+ int timeInMode=0; //Used below to pick the cost tier for consecutive substitutions
+
+ for(int i=0; i<locArray.length; i++){
+ int loc=locArray[i];
+
+ if(loc>0){//match
+ if(loc==lastValue){//contiguous match
+ contig++;
+ score+=(POINTS_MATCH2+baseScores[i]);
+ }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match
+ maxContig=Tools.max(maxContig, contig);
+ contig=1;
+ score+=(POINTS_MATCH+baseScores[i]);
+ }else if(loc<lastLoc){//deletion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_DEL;
+ int dif=lastLoc-loc+1;
+ //Long deletions: charge POINTS_GAP per whole GAPLEN block and score only the reduced length with the tiers below
+ if(dif>MINGAP){
+ int rem=dif%GAPLEN;
+ int div=(dif-GAPBUFFER2)/GAPLEN;
+ score+=(div*POINTS_GAP);
+ assert(rem+GAPBUFFER2<dif);
+ dif=rem+GAPBUFFER2;
+ assert(dif>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+// assert(false) : div;
+ }
+ //Tiered extension costs; each tier charges the part of dif above its limit and then clamps dif to that limit
+ if(dif>LIMIT_FOR_COST_5){
+ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5;
+ dif=LIMIT_FOR_COST_5;
+ }
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_DEL2;
+ }
+ timeInMode=1;
+ }else if(loc>lastLoc){//insertion
+ maxContig=Tools.max(maxContig, contig);
+ contig=0;
+ assert(lastLoc>=0);
+ score+=(POINTS_MATCH+baseScores[i]);
+ score+=POINTS_INS;
+ int dif=Tools.min(loc-lastLoc+1, 5); //Length used for scoring is capped at 5
+ assert(dif>0);
+ //NOTE(review): with the cap of 5 above and the LIMIT_FOR_COST_3/4 values declared in this class (5 and 20),
+ //the next two tier checks cannot fire; only the POINTS_INS2 tier applies.
+ if(dif>LIMIT_FOR_COST_4){
+ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4;
+ dif=LIMIT_FOR_COST_4;
+ }
+ if(dif>LIMIT_FOR_COST_3){
+ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3;
+ dif=LIMIT_FOR_COST_3;
+ }
+ if(dif>1){
+ score+=(dif-1)*POINTS_INS2;
+ }
+ timeInMode=1;
+ }else{
+ assert(false);
+ }
+ lastLoc=loc;
+ }else if(loc==-1){//substitution
+ if(lastValue<0 && timeInMode>0){//contiguous
+ if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ timeInMode++;
+ }else{
+ score+=POINTS_SUB;
+ timeInMode=1;
+ }
+ }else{
+ //Any code other than -2 (including 0) is rejected here when assertions are enabled
+ assert(loc==-2) : loc+"\n"+Arrays.toString(locArray)+"\n"+Arrays.toString(baseScores)+"\n"+new String(bases)+"\n"
+ +"If this happens please ensure that the reference has a startpad of Ns longer than readlength.";//N (no-call or no-ref)
+ timeInMode=0;
+ score+=POINTS_NOCALL;
+ }
+ lastValue=loc;
+ }
+ assert(score<=maxQuality(locArray.length));
+ //The run still open at the end of the array counts too; failing the minimum caps the score at -50 per base
+ if(Tools.max(contig, maxContig)<minContig){score=Tools.min(score, -50*locArray.length);}
+ return score;
+ }
+
+ /** Convenience overload: scores an ungapped placement by delegating to the four-argument version with a null SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart){
+ return scoreNoIndels(read, ref, refStart, null);
+ }
+ /**
+ * Scores an ungapped placement of read on ref, with read[0] aligned to ref[refStart].
+ * Read bases that hang off either end of the reference are not compared; each is charged POINTS_NOREF.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param refStart Reference index aligned to read[0]; may be negative
+ * @param ss Not used by the current implementation
+ * @return The ungapped alignment score
+ */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){
+ 	final int refStop=refStart+read.length;
+ 	int total=0;
+
+ 	//Clip the compared window to the part of the read that lies over the reference.
+ 	int from=0;
+ 	int to=read.length;
+ 	if(refStart<0){
+ 		from=-refStart;
+ 		total+=POINTS_NOREF*from;
+ 	}
+ 	if(refStop>ref.length){
+ 		final int overhang=refStop-ref.length;
+ 		to-=overhang;
+ 		total+=POINTS_NOREF*overhang;
+ 	}
+
+ 	int state=-1; //MODE_MS or MODE_SUB once the first match/substitution has been seen
+ 	int streak=0; //Consecutive events in the current state; selects the substitution cost tier
+
+ 	for(int pos=from; pos<to; pos++){
+ 		final byte q=read[pos];
+ 		final byte t=ref[refStart+pos];
+
+ 		if(q==t && q!='N'){
+ 			//Match: the first match of a run earns POINTS_MATCH, later ones POINTS_MATCH2.
+ 			if(state==MODE_MS){
+ 				streak++;
+ 				total+=POINTS_MATCH2;
+ 			}else{
+ 				streak=0;
+ 				total+=POINTS_MATCH;
+ 			}
+ 			state=MODE_MS;
+ 		}else if(q<0 || q=='N'){
+ 			//No-call in the read; state and streak are left untouched.
+ 			total+=POINTS_NOCALL;
+ 		}else if(t<0 || t=='N'){
+ 			//Undefined reference base; state and streak are left untouched.
+ 			total+=POINTS_NOREF;
+ 		}else{
+ 			//Substitution: cost depends on how many substitutions immediately precede it.
+ 			streak=(state==MODE_SUB ? streak+1 : 0);
+ 			if(streak==0){total+=POINTS_SUB;}
+ 			else if(streak<LIMIT_FOR_COST_3){total+=POINTS_SUB2;}
+ 			else{total+=POINTS_SUB3;}
+ 			state=MODE_SUB;
+ 		}
+ 	}
+
+ 	//TODO: Verify this; it's in the PacBio version
+ 	//The PacBio version sets ss.semiperfect here when no no-call/substitution was seen, the site
+ 	//spans exactly read.length bases, and at most half of the read lies on undefined reference.
+
+ 	return total;
+ }
+
+ @Override
+ public final byte[] genMatchNoIndels(byte[] read, byte[] ref, final int refStart){
+ if(read==null || ref==null){return null;}
+
+ final byte[] match=new byte[read.length];
+
+ for(int i=0, j=refStart; i<read.length; i++, j++){
+ byte c=read[i];
+ byte r=(j<0 || j>=ref.length) ? (byte)'N' : ref[j];
+
+ if(c=='N' || r=='N'){match[i]='N';}
+ else if(c==r){match[i]='m';}
+ else{match[i]='S';}
+
+ }
+
+ return match;
+ }
+
+ /** Convenience overload: scores an ungapped placement with per-base bonuses by delegating to the five-argument version with a null SiteScore. */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){
+ return scoreNoIndels(read, ref, baseScores, refStart, null);
+ }
+ /**
+ * Scores an ungapped placement of read on ref, with read[0] aligned to ref[refStart], adding
+ * baseScores[i] for every matching base i.
+ * Read bases that hang off either end of the reference are not compared; each is charged POINTS_NOREF.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param baseScores Per-base bonus for matches, indexed like read
+ * @param refStart Reference index aligned to read[0]; may be negative
+ * @param ss Not used by the current implementation
+ * @return The ungapped alignment score
+ */
+ @Override
+ public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){
+ 	final int refStop=refStart+read.length;
+ 	int total=0;
+
+ 	//Clip the compared window to the part of the read that lies over the reference.
+ 	int from=0;
+ 	int to=read.length;
+ 	if(refStart<0){
+ 		from=-refStart;
+ 		total+=POINTS_NOREF*from;
+ 	}
+ 	if(refStop>ref.length){
+ 		final int overhang=refStop-ref.length;
+ 		to-=overhang;
+ 		total+=POINTS_NOREF*overhang;
+ 	}
+
+ 	int state=-1; //MODE_MS or MODE_SUB once the first match/substitution has been seen
+ 	int streak=0; //Consecutive events in the current state; selects the substitution cost tier
+
+ 	for(int pos=from; pos<to; pos++){
+ 		final byte q=read[pos];
+ 		final byte t=ref[refStart+pos];
+
+ 		if(q==t && q!='N'){
+ 			//Match: the first match of a run earns POINTS_MATCH, later ones POINTS_MATCH2, plus the base bonus.
+ 			if(state==MODE_MS){
+ 				streak++;
+ 				total+=POINTS_MATCH2;
+ 			}else{
+ 				streak=0;
+ 				total+=POINTS_MATCH;
+ 			}
+ 			total+=baseScores[pos];
+ 			state=MODE_MS;
+ 		}else if(q<0 || q=='N'){
+ 			//No-call in the read; state and streak are left untouched.
+ 			total+=POINTS_NOCALL;
+ 		}else if(t<0 || t=='N'){
+ 			//Undefined reference base; state and streak are left untouched.
+ 			total+=POINTS_NOREF;
+ 		}else{
+ 			//Substitution: cost depends on how many substitutions immediately precede it.
+ 			streak=(state==MODE_SUB ? streak+1 : 0);
+ 			if(streak==0){total+=POINTS_SUB;}
+ 			else if(streak<LIMIT_FOR_COST_3){total+=POINTS_SUB2;}
+ 			else{total+=POINTS_SUB3;}
+ 			state=MODE_SUB;
+ 		}
+ 	}
+
+ 	//TODO: Verify. This is in the PacBio version.
+ 	//The PacBio version sets ss.semiperfect here when no no-call/substitution was seen, the site
+ 	//spans exactly read.length bases, and at most half of the read lies on undefined reference;
+ 	//it then asserts Read.CHECKSITE(ss, read, -1).
+
+ 	return total;
+ }
+
+ /**
+ * Scores an ungapped placement of read on ref (read[0] at ref[refStart]), adding baseScores[i] for
+ * every matching base, and writes the per-base match string.
+ * Symbols written: 'm' match, 'S' substitution, 'N' read no-call or reference 'N'/negative byte.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param baseScores Per-base bonus for matches, indexed like read
+ * @param refStart Reference index aligned to read[0]
+ * @param matchReturn Single-element holder; matchReturn[0] is reused when its length equals read.length, otherwise replaced by a new array
+ * @return The score, or -99999 when any part of the read would fall outside ref (matchReturn is then left untouched)
+ */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, byte[] baseScores, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0 || refStop>ref.length){return -99999;}
+ //NOTE(review): because of the early return above, neither of the two clipping branches below can execute,
+ //so readStart stays 0 and readStop stays read.length. Confirm which of the two behaviors is intended.
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ //Reuse the caller's buffer only when it has exactly the right length; a longer buffer trips the assertion
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ //Match: the first match of a run earns POINTS_MATCH, later ones POINTS_MATCH2, plus the base bonus
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ score+=baseScores[i];
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ //Substitution: cost tier depends on how many substitutions immediately precede this one
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /**
+ * Scores an ungapped placement of read on ref (read[0] at ref[refStart]) without per-base bonuses,
+ * and writes the per-base match string.
+ * Symbols written: 'm' match, 'S' substitution, 'N' read no-call or reference 'N'/negative byte.
+ * @param read Read bases
+ * @param ref Reference bases
+ * @param refStart Reference index aligned to read[0]
+ * @param matchReturn Single-element holder; matchReturn[0] is reused when its length equals read.length, otherwise replaced by a new array
+ * @return The score, or -99999 when any part of the read would fall outside ref (matchReturn is then left untouched)
+ */
+ @Override
+ public final int scoreNoIndelsAndMakeMatchString(byte[] read, byte[] ref, final int refStart, byte[][] matchReturn){
+ int score=0;
+ int mode=-1;
+ int timeInMode=0;
+
+ assert(refStart<=ref.length) : refStart+", "+ref.length;
+
+ //This block handles cases where the read runs outside the reference
+ //Of course, padding the reference with 'N' would be better, but...
+ int readStart=0;
+ int readStop=read.length;
+ final int refStop=refStart+read.length;
+ if(refStart<0 || refStop>ref.length){return -99999;}
+ //NOTE(review): because of the early return above, neither of the two clipping branches below can execute,
+ //so readStart stays 0 and readStop stays read.length. Confirm which of the two behaviors is intended.
+ if(refStart<0){
+ readStart=0-refStart;
+ score+=POINTS_NOREF*readStart;
+ }
+ if(refStop>ref.length){
+ int dif=(refStop-ref.length);
+ System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop);
+ readStop-=dif;
+ score+=POINTS_NOREF*dif;
+ }
+ assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+
+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length;
+
+ assert(matchReturn!=null);
+ assert(matchReturn.length==1);
+ //Reuse the caller's buffer only when it has exactly the right length; a longer buffer trips the assertion
+ if(matchReturn[0]==null || matchReturn[0].length!=read.length){
+ assert(matchReturn[0]==null || matchReturn[0].length<read.length) : matchReturn[0].length+"!="+read.length;
+ matchReturn[0]=new byte[read.length];
+ }
+ final byte[] match=matchReturn[0];
+
+// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed.
+
+ for(int i=readStart; i<readStop; i++){
+ byte c=read[i];
+ byte r=ref[refStart+i];
+
+ assert(r!='.' && c!='.');
+
+ if(c==r && c!='N'){
+ //Match: the first match of a run earns POINTS_MATCH, later ones POINTS_MATCH2
+ if(mode==MODE_MS){
+ timeInMode++;
+ score+=POINTS_MATCH2;
+ }else{
+ timeInMode=0;
+ score+=POINTS_MATCH;
+ }
+ match[i]='m';
+ mode=MODE_MS;
+ }else if(c<0 || c=='N'){
+ score+=POINTS_NOCALL;
+ match[i]='N';
+ }else if(r<0 || r=='N'){
+ score+=POINTS_NOREF;
+// match[i]='m';
+ match[i]='N';
+ }else{
+ //Substitution: cost tier depends on how many substitutions immediately precede this one
+ match[i]='S';
+ if(mode==MODE_SUB){timeInMode++;}
+ else{timeInMode=0;}
+
+ if(timeInMode==0){score+=POINTS_SUB;}
+ else if(timeInMode<LIMIT_FOR_COST_3){score+=POINTS_SUB2;}
+ else{score+=POINTS_SUB3;}
+ mode=MODE_SUB;
+ }
+ }
+
+ return score;
+ }
+
+ /**
+ * Score of numBases consecutive matches without per-base bonuses: POINTS_MATCH for the first base
+ * and POINTS_MATCH2 for each of the remaining ones.
+ */
+ @Override
+ public final int maxQuality(int numBases){
+ 	final int followers=numBases-1;
+ 	final int followerPoints=followers*POINTS_MATCH2;
+ 	return POINTS_MATCH+followerPoints;
+ }
+
+ /**
+ * Score of baseScores.length consecutive matches including per-base bonuses: POINTS_MATCH for the
+ * first base, POINTS_MATCH2 for each remaining base, plus the sum of baseScores.
+ */
+ @Override
+ public final int maxQuality(byte[] baseScores){
+ 	final int followers=baseScores.length-1;
+ 	final int matchPoints=POINTS_MATCH+followers*POINTS_MATCH2;
+ 	final int bonus=Tools.sumInt(baseScores);
+ 	return matchPoints+bonus;
+ }
+
+ /**
+ * Returns maxQuality(numBases) plus the smaller (more negative) of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+ * Asserts that the result is below the all-match score in which two POINTS_MATCH2 bases are
+ * replaced by one POINTS_MATCH and one POINTS_SUB.
+ */
+ @Override
+ public final int maxImperfectScore(int numBases){
+ 	final int perfect=maxQuality(numBases);
+ 	final int indelPenalty=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+ 	final int result=perfect+indelPenalty;
+
+ 	final int singleSubDelta=(POINTS_MATCH+POINTS_SUB)-(POINTS_MATCH2+POINTS_MATCH2);
+ 	assert(result<perfect+singleSubDelta);
+ 	return result;
+ }
+
+ /**
+ * Returns maxQuality(baseScores) plus the smaller (more negative) of POINTS_DEL and POINTS_INS-POINTS_MATCH2.
+ * Asserts that the result is below the all-match score in which two POINTS_MATCH2 bases are
+ * replaced by one POINTS_MATCH and one POINTS_SUB.
+ */
+ @Override
+ public final int maxImperfectScore(byte[] baseScores){
+ 	final int perfect=maxQuality(baseScores);
+ 	final int indelPenalty=Tools.min(POINTS_DEL, POINTS_INS-POINTS_MATCH2);
+ 	final int result=perfect+indelPenalty;
+
+ 	final int singleSubDelta=(POINTS_MATCH+POINTS_SUB)-(POINTS_MATCH2+POINTS_MATCH2);
+ 	assert(result<perfect+singleSubDelta);
+ 	return result;
+ }
+
+ /**
+ * Penalty for a deletion of len bases: POINTS_DEL to open, then tiered per-base extension costs.
+ * @param len Deletion length; values below 1 score 0
+ * @param approximateGaps If true and len exceeds MINGAP, whole GAPLEN blocks are charged POINTS_GAP each
+ * and only the reduced length (len%GAPLEN+GAPBUFFER2) goes through the tiers
+ * @return The (non-positive, given the constants of this class) deletion score
+ */
+ @Override
+ public int calcDelScore(int len, boolean approximateGaps){
+ 	if(len<=0){return 0;}
+
+ 	int remaining=len;
+ 	int total=POINTS_DEL;
+
+ 	if(approximateGaps && remaining>MINGAP){
+ 		final int blocks=(remaining-GAPBUFFER2)/GAPLEN;
+ 		final int reduced=(remaining%GAPLEN)+GAPBUFFER2;
+ 		total+=(blocks*POINTS_GAP);
+ 		assert(reduced<remaining);
+ 		remaining=reduced;
+ 		assert(remaining>LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5
+ 	}
+
+ 	//Tier 5 is charged once per TIMESLIP bases (rounded up) above its limit.
+ 	if(remaining>LIMIT_FOR_COST_5){
+ 		final int excess=remaining-LIMIT_FOR_COST_5;
+ 		total+=((excess+MASK5)/TIMESLIP)*POINTS_DEL5;
+ 	}
+ 	remaining=Math.min(remaining, LIMIT_FOR_COST_5);
+
+ 	//Each remaining tier charges the bases above its limit, then the length is clamped to that limit.
+ 	total+=Math.max(0, remaining-LIMIT_FOR_COST_4)*POINTS_DEL4;
+ 	remaining=Math.min(remaining, LIMIT_FOR_COST_4);
+
+ 	total+=Math.max(0, remaining-LIMIT_FOR_COST_3)*POINTS_DEL3;
+ 	remaining=Math.min(remaining, LIMIT_FOR_COST_3);
+
+ 	total+=Math.max(0, remaining-1)*POINTS_DEL2;
+ 	return total;
+ }
+
+ private static int calcDelScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_DEL;
+
+ if(len>LIMIT_FOR_COST_5){
+ score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+ len=LIMIT_FOR_COST_5;
+ }
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_DEL2;
+ }
+ return score;
+ }
+
+ /**
+ * Penalty for an insertion of len bases: POINTS_INS to open, then tiered per-base extension costs
+ * (POINTS_INS2 up to LIMIT_FOR_COST_3, POINTS_INS3 up to LIMIT_FOR_COST_4, POINTS_INS4 beyond).
+ * @param len Insertion length; values below 1 score 0
+ */
+ @Override
+ public int calcInsScore(int len){
+ 	if(len<=0){return 0;}
+
+ 	final int tier4=Math.max(0, len-LIMIT_FOR_COST_4);
+ 	final int capped4=Math.min(len, LIMIT_FOR_COST_4);
+
+ 	final int tier3=Math.max(0, capped4-LIMIT_FOR_COST_3);
+ 	final int capped3=Math.min(capped4, LIMIT_FOR_COST_3);
+
+ 	final int tier2=Math.max(0, capped3-1);
+
+ 	return POINTS_INS+tier4*POINTS_INS4+tier3*POINTS_INS3+tier2*POINTS_INS2;
+ }
+
+ private static int calcInsScoreOffset(int len){
+ if(len<=0){return 0;}
+ int score=POINTSoff_INS;
+ if(len>LIMIT_FOR_COST_4){
+ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+ len=LIMIT_FOR_COST_4;
+ }
+ if(len>LIMIT_FOR_COST_3){
+ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+ len=LIMIT_FOR_COST_3;
+ }
+ if(len>1){
+ score+=(len-1)*POINTSoff_INS2;
+ }
+ return score;
+ }
+
+
+ //NOTE(review): none of these fields is written in this part of the file; meanings below are guesses to confirm.
+ private final int[][][] packed; //presumably the packed score/time matrices - TODO confirm
+ private final byte[] grefbuffer; //Exposed through getGrefbuffer()
+ private int greflimit=-1; //-1 until assigned elsewhere
+ private int greflimit2=-1; //-1 until assigned elsewhere
+ private int grefRefOrigin=-1; //-1 until assigned elsewhere
+
+
+ @Override
+ /**DO NOT MODIFY*/
+ //Returns the internal grefbuffer array itself, not a copy, so writes by the caller would alter this object's state.
+ public final byte[] getGrefbuffer(){
+ return grefbuffer;
+ }
+
+ //Limit values stored shifted left by SCOREOFFSET (the show*Limit methods shift them back for display).
+ public final int[] vertLimit; //Read by showVertLimit() at indices 0..rows
+ public final int[] horizLimit; //Read by showHorizLimit() at indices 0..columns
+
+ @Override
+ public CharSequence showVertLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ @Override
+ public CharSequence showHorizLimit(){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");}
+ return sb;
+ }
+
+ /**
+ * Converts a minimum identity into a minimum score ratio (score divided by the all-match score),
+ * using a fixed blend of error types: 70% substitutions, 20% deletions, 10% insertions.
+ * @param minid Minimum identity as a fraction in (0,1]; values above 1 are treated as percent
+ * @return The estimated minimum ratio, never below 0.1
+ */
+ public static float minIdToMinRatio(double minid){
+ 	final double identity=(minid>1 ? minid/100 : minid);
+ 	assert(identity>0 && identity<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100.";
+
+ 	//Per-base scores relative to a contiguous match.
+ 	final double firstMatchDelta=POINTS_MATCH-POINTS_MATCH2;
+ 	final double perMatch=POINTS_MATCH2;
+
+ 	//Average cost of one erroneous base of each kind, assuming a fixed mix of event lengths.
+ 	final double subCost=-POINTS_MATCH2+0.5*(firstMatchDelta+POINTS_SUB)+0.5*POINTS_SUB2;
+ 	final double delCost=0.1*(firstMatchDelta+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4;
+ 	final double insCost=-POINTS_MATCH2+0.4*(firstMatchDelta+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3);
+ 	final double averageErrorCost=.7*subCost+.2*delCost+.1*insCost;
+
+ 	final double errorFraction=1-identity;
+ 	final double rawRatio=(perMatch+errorFraction*averageErrorCost)/perMatch;
+ 	assert(rawRatio<=1);
+
+ 	final double flooredRatio=Tools.max(0.1, rawRatio);
+ 	return (float)flooredRatio;
+ }
+
+ //Bit layout constants: a 32-bit int is split into SCOREBITS upper bits and TIMEBITS lower bits.
+ //NOTE(review): presumably each packed cell holds a score in the upper part and a time-in-mode counter in the lower part - confirm against the fill routines.
+ public static final int TIMEBITS=11;
+ public static final int SCOREBITS=32-TIMEBITS;
+ public static final int MAX_TIME=((1<<TIMEBITS)-1); //Largest value that fits in TIMEBITS bits
+ public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; //Largest signed SCOREBITS-bit value, minus a margin of 2000
+ public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
+
+ public static final int SCOREOFFSET=TIMEBITS; //Left shift that moves a score into the upper bits
+
+ public static final int TIMEMASK=~((-1)<<TIMEBITS); //Low TIMEBITS bits set
+ public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; //Upper SCOREBITS bits set
+
+ //Alignment states; only MODE_MS and MODE_SUB are used by the scoreNoIndels* methods.
+ private static final byte MODE_MS=0;
+ private static final byte MODE_DEL=1;
+ private static final byte MODE_INS=2;
+ private static final byte MODE_SUB=3;
+
+ //Unshifted scoring values. In the scoring methods POINTS_MATCH is awarded to the first match of a run
+ //and POINTS_MATCH2 to each following match; the numbered SUB/INS/DEL variants are the cost tiers for longer events.
+ public static final int POINTS_NOREF=0;
+ public static final int POINTS_NOCALL=0;
+ public static final int POINTS_MATCH=70;
+ public static final int POINTS_MATCH2=100; //Note: Changing to 90 substantially reduces false positives
+ public static final int POINTS_COMPATIBLE=50;
+ public static final int POINTS_SUB=-127;
+ public static final int POINTS_SUBR=-147; //increased penalty if prior match streak was at most 1
+ public static final int POINTS_SUB2=-51;
+ public static final int POINTS_SUB3=-25;
+ public static final int POINTS_MATCHSUB=-10;
+ public static final int POINTS_INS=-395;
+ public static final int POINTS_INS2=-39;
+ public static final int POINTS_INS3=-23;
+ public static final int POINTS_INS4=-8;
+ public static final int POINTS_DEL=-472;
+ public static final int POINTS_DEL2=-33;
+ public static final int POINTS_DEL3=-9;
+ public static final int POINTS_DEL4=-1;
+ public static final int POINTS_DEL5=-1;
+ public static final int POINTS_DEL_REF_N=-10;
+ public static final int POINTS_GAP=0-GAPCOST;
+
+ //POINTS_DEL5 is charged once per TIMESLIP bases; MASK5 rounds that division up. TIMESLIP must be a power of two.
+ public static final int TIMESLIP=4;
+ public static final int MASK5=TIMESLIP-1;
+ static{assert(Integer.bitCount(TIMESLIP)==1);}
+
+
+ private static final int BARRIER_I1=2;
+ private static final int BARRIER_D1=3;
+
+
+ //Event lengths above which the next (cheaper per-base) cost tier applies.
+ public static final int LIMIT_FOR_COST_3=5;
+ public static final int LIMIT_FOR_COST_4=20;
+ public static final int LIMIT_FOR_COST_5=80;
+
+ public static final int BAD=MIN_SCORE-1; //One below the lowest legal score
+
+
+ //The same values pre-shifted by SCOREOFFSET, for direct arithmetic on shifted scores.
+ public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET);
+ public static final int POINTSoff_NOCALL=(POINTS_NOCALL<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET);
+ public static final int POINTSoff_MATCH2=(POINTS_MATCH2<<SCOREOFFSET);
+ public static final int POINTSoff_COMPATIBLE=(POINTS_COMPATIBLE<<SCOREOFFSET);
+ public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET);
+ public static final int POINTSoff_SUBR=(POINTS_SUBR<<SCOREOFFSET);
+ public static final int POINTSoff_SUB2=(POINTS_SUB2<<SCOREOFFSET);
+ public static final int POINTSoff_SUB3=(POINTS_SUB3<<SCOREOFFSET);
+ public static final int POINTSoff_MATCHSUB=(POINTS_MATCHSUB<<SCOREOFFSET);
+ public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET);
+ public static final int POINTSoff_INS2=(POINTS_INS2<<SCOREOFFSET);
+ public static final int POINTSoff_INS3=(POINTS_INS3<<SCOREOFFSET);
+ public static final int POINTSoff_INS4=(POINTS_INS4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET);
+ public static final int POINTSoff_DEL2=(POINTS_DEL2<<SCOREOFFSET);
+ public static final int POINTSoff_DEL3=(POINTS_DEL3<<SCOREOFFSET);
+ public static final int POINTSoff_DEL4=(POINTS_DEL4<<SCOREOFFSET);
+ public static final int POINTSoff_DEL5=(POINTS_DEL5<<SCOREOFFSET);
+ public static final int POINTSoff_GAP=(POINTS_GAP<<SCOREOFFSET);
+ public static final int POINTSoff_DEL_REF_N=(POINTS_DEL_REF_N<<SCOREOFFSET);
+ public static final int BADoff=(BAD<<SCOREOFFSET);
+ public static final int MAXoff_SCORE=MAX_SCORE<<SCOREOFFSET;
+ public static final int MINoff_SCORE=MIN_SCORE<<SCOREOFFSET;
+
+
+ //Instance accessors: each returns the static constant of the same name declared in this class.
+ public final int POINTS_NOREF(){return POINTS_NOREF;}
+ public final int POINTS_NOCALL(){return POINTS_NOCALL;}
+ public final int POINTS_MATCH(){return POINTS_MATCH;}
+ public final int POINTS_MATCH2(){return POINTS_MATCH2;}
+ public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;}
+ public final int POINTS_SUB(){return POINTS_SUB;}
+ public final int POINTS_SUBR(){return POINTS_SUBR;}
+ public final int POINTS_SUB2(){return POINTS_SUB2;}
+ public final int POINTS_SUB3(){return POINTS_SUB3;}
+ public final int POINTS_MATCHSUB(){return POINTS_MATCHSUB;}
+ public final int POINTS_INS(){return POINTS_INS;}
+ public final int POINTS_INS2(){return POINTS_INS2;}
+ public final int POINTS_INS3(){return POINTS_INS3;}
+ public final int POINTS_INS4(){return POINTS_INS4;}
+ public final int POINTS_DEL(){return POINTS_DEL;}
+ public final int POINTS_DEL2(){return POINTS_DEL2;}
+ public final int POINTS_DEL3(){return POINTS_DEL3;}
+ public final int POINTS_DEL4(){return POINTS_DEL4;}
+ public final int POINTS_DEL5(){return POINTS_DEL5;}
+ public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;}
+ public final int POINTS_GAP(){return POINTS_GAP;}
+
+ //Accessors for the deletion tier-5 divisor and its rounding mask.
+ public final int TIMESLIP(){return TIMESLIP;}
+ public final int MASK5(){return MASK5;}
+ /** Returns the SCOREOFFSET constant, i.e. the number of bits by which scores are shifted left. */
+ public final int SCOREOFFSET(){return SCOREOFFSET;} //Previously called itself, recursing until StackOverflowError
+
+ //Instance accessors: each returns the static constant of the same name declared in this class.
+ final int BARRIER_I1(){return BARRIER_I1;}
+ final int BARRIER_D1(){return BARRIER_D1;}
+
+ public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;}
+ public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_4;}
+ public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_5;}
+
+ public final int BAD(){return BAD;}
+
+
+ private int rows; //Inclusive upper index used by showVertLimit() when reading vertLimit
+ private int columns; //Inclusive upper index used by showHorizLimit() when reading horizLimit
+
+}
diff --git a/current/align2/NeedlemanWunsch.java b/current/align2/NeedlemanWunsch.java
new file mode 100755
index 0000000..b171b5d
--- /dev/null
+++ b/current/align2/NeedlemanWunsch.java
@@ -0,0 +1,111 @@
+package align2;
+
+import java.util.Arrays;
+/**
+ * Minimal global aligner with unit costs: +1 for a match, -1 for a mismatch, -1 per gap base.
+ * Rows index read bases and columns index reference bases; row 0 and column 0 are the
+ * all-gap boundaries. Matrices are allocated once and reused by fill()/traceback().
+ */
+public class NeedlemanWunsch {
+	
+	
+	/** Demo: aligns args[0] (read) to args[1] (reference) and prints the matrices and the traceback to stderr. */
+	public static void main(String[] args){
+		byte[] read=args[0].getBytes();
+		byte[] ref=args[1].getBytes();
+		NeedlemanWunsch nw=new NeedlemanWunsch(read.length, ref.length);
+		nw.fill(read, ref, 0, ref.length-1);
+		
+		for(int row=0; row<nw.scores.length; row++){
+			System.err.println(Arrays.toString(nw.scores[row]));
+			System.err.println(Arrays.toString(nw.pointers[row]));
+			System.err.println();
+		}
+		
+		byte[] out=nw.traceback(read, ref, 0, ref.length-1);
+		
+		System.err.println(new String(out));
+	}
+	
+	
+	/**
+	 * Allocates (maxRows_+1) x (maxColumns_+1) matrices and initializes the boundaries.
+	 * @param maxRows_ Longest read that will be aligned
+	 * @param maxColumns_ Longest reference window that will be aligned
+	 */
+	public NeedlemanWunsch(int maxRows_, int maxColumns_){
+		maxRows=maxRows_;
+		maxColumns=maxColumns_;
+		scores=new int[maxRows+1][maxColumns+1];
+		pointers=new byte[maxRows+1][maxColumns+1];
+		for(int i=0; i<maxColumns+1; i++){
+			scores[0][i]=0-i;
+			pointers[0][i]=LEFT; //Only reference bases remain: keep moving left
+		}
+		for(int i=0; i<maxRows+1; i++){
+			scores[i][0]=0-i;
+			//Only read bases remain: keep moving up. Without this the traceback read the
+			//default LEFT here and walked off the matrix to column -1.
+			pointers[i][0]=UP;
+		}
+	}
+	
+	/**
+	 * Fills the score and pointer matrices for read versus ref[refStartLoc..refEndLoc].
+	 * Ties prefer the diagonal, then LEFT, then UP. Progress is logged to stderr.
+	 * @param read Read bases (rows)
+	 * @param ref Reference bases
+	 * @param refStartLoc First reference index of the window (inclusive)
+	 * @param refEndLoc Last reference index of the window (inclusive)
+	 */
+	public void fill(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+		rows=read.length;
+		columns=refEndLoc-refStartLoc+1;
+		assert(rows<=maxRows) : rows+", "+maxRows;
+		assert(columns<=maxColumns) : columns+", "+maxColumns;
+		System.err.println("rows = "+rows+", columns="+columns);
+		
+		for(int row=0; row<rows; row++){
+			for(int col=0; col<columns; col++){
+				System.err.println("row = "+row+", col="+col);
+				int match=(read[row]==ref[refStartLoc+col] ? 1 : -1);
+				int diag=match+scores[row][col];
+				int left=scores[row+1][col]-1; //Gap in the read: consumes a reference base
+				int up=scores[row][col+1]-1; //Gap in the reference: consumes a read base
+				if(diag>=left && diag>=up){
+					scores[row+1][col+1]=diag;
+					pointers[row+1][col+1]=DIAG;
+				}else if(left>=up){
+					scores[row+1][col+1]=left;
+					pointers[row+1][col+1]=LEFT;
+				}else{
+					scores[row+1][col+1]=up;
+					pointers[row+1][col+1]=UP;
+				}
+			}
+		}
+		
+	}
+	
+	/**
+	 * Walks the pointer matrix produced by fill() from the bottom-right corner to the origin.
+	 * The output has one symbol per reference column, right-aligned in the array: the read base
+	 * for a diagonal step and '-' for a LEFT step. UP steps emit nothing, so when the read is
+	 * longer than the window the leading bytes stay 0.
+	 * @param read Read bases passed to fill()
+	 * @param ref Reference bases passed to fill()
+	 * @param refStartLoc First reference index of the window (inclusive)
+	 * @param refEndLoc Last reference index of the window (inclusive)
+	 * @return Array of length max(read.length, window length)
+	 */
+	public byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
+		int row=read.length;
+		int col=refEndLoc-refStartLoc+1; //Window width, not ref.length: only this many columns were filled
+		
+		byte[] out=new byte[Math.max(row, col)];
+		int outPos=out.length-1;
+		
+		while(row>0 || col>0){
+			byte ptr=pointers[row][col];
+			if(ptr==DIAG){
+				out[outPos]=read[row-1];
+				row--;
+				col--;
+				outPos--;
+			}else if(ptr==LEFT){
+				out[outPos]='-';
+				col--;
+				outPos--;
+			}else{
+				assert(ptr==UP);
+				row--;
+			}
+		}
+		return out;
+	}
+	
+	public final int maxRows;
+	public final int maxColumns;
+	private final int[][] scores;
+	private final byte[][] pointers;
+	
+	/** Traceback directions stored in the pointer matrix. */
+	public static final byte LEFT=0, DIAG=1, UP=2;
+	
+	private int rows;
+	private int columns;
+	
+}
diff --git a/current/align2/PackedHeap.java b/current/align2/PackedHeap.java
new file mode 100755
index 0000000..1079860
--- /dev/null
+++ b/current/align2/PackedHeap.java
@@ -0,0 +1,186 @@
+package align2;
+
+/**
+ * Array-backed binary min-heap of primitive longs.
+ * Slot 0 of the backing array is unused and the root lives at index 1.
+ * The value -1 is reserved: it marks vacated slots and is what peek()/poll() return when the heap is empty.
+ * Assertions in the sift routines additionally expect the stored values to be distinct.
+ */
+public final class PackedHeap {
+	
+	/**
+	 * @param maxSize Intended maximum number of elements. It is recorded but not enforced; adding
+	 * beyond the backing array throws ArrayIndexOutOfBoundsException.
+	 */
+	public PackedHeap(int maxSize){
+		
+		int len=maxSize+1;
+		if((len&1)==1){len++;} //Array size is always even, so the right child of any live node is a valid index.
+		
+		CAPACITY=maxSize;
+		array=new long[len];
+	}
+	
+	/** Inserts t and restores heap order. Always returns true. */
+	public boolean add(long t){
+		assert(size==0 || array[size]!=-1L);
+		size++;
+		array[size]=t;
+		percDown(size);
+		return true;
+	}
+	
+	/** Returns the smallest element without removing it, or -1 if empty. */
+	public long peek(){
+		if(size==0){return -1L;}
+		return array[1];
+	}
+	
+	/** Removes and returns the smallest element, or returns -1 if empty. */
+	public long poll(){
+		if(size==0){return -1L;}
+		long t=array[1];
+		array[1]=array[size];
+		array[size]=-1L; //Sentinel: percUp relies on the slot just past the end being -1
+		size--;
+		if(size>0){percUp(1);}
+		return t;
+	}
+	
+	/** Moves the element at loc toward the root (index 1) until its parent is not larger. */
+	private void percDown(int loc){
+		assert(loc>0);
+		if(loc==1){return;}
+		
+		int next=loc/2;
+		final long a=array[loc];
+		long b=array[next];
+		
+		while(loc>1 && a<b){
+			array[loc]=b;
+			loc=next;
+			next=next/2;
+			b=array[next]; //At loc==1 this reads the unused slot 0; the loop condition stops before using it
+		}
+		
+		array[loc]=a;
+	}
+	
+	/** Moves the element at loc toward the leaves by swapping with its smaller child (recursive). */
+	private void percUp(int loc){
+		assert(loc>0 && loc<=size) : loc+", "+size;
+		int next1=loc*2;
+		int next2=next1+1;
+		if(next1>size){return;}
+		long a=array[loc];
+		long b=array[next1];
+		long c=array[next2]; //-1 when there is no right child
+		assert(a!=b);
+		assert(b!=c);
+		assert(b!=-1L);
+		if(c==-1L || b<=c){
+			if(a>b){
+				array[next1]=a;
+				array[loc]=b;
+				percUp(next1);
+			}
+		}else{
+			if(a>c){
+				array[next2]=a;
+				array[loc]=c;
+				percUp(next2);
+			}
+		}
+	}
+	
+	/** Iterative variant of percUp; currently not called. */
+	private void percUpIter(int loc){
+		assert(loc>0 && loc<=size) : loc+", "+size;
+		final long a=array[loc];
+		
+		int next1=loc*2;
+		int next2=next1+1;
+		
+		while(next1<=size){
+			
+			long b=array[next1];
+			long c=array[next2];
+			assert(a!=b);
+			assert(b!=c);
+			assert(b!=-1L);
+			
+			if(c==-1L || b<=c){
+				if(a>b){
+					array[loc]=b;
+					loc=next1;
+				}else{
+					break;
+				}
+			}else{
+				if(a>c){
+					array[loc]=c;
+					loc=next2;
+				}else{
+					break;
+				}
+			}
+			next1=loc*2;
+			next2=next1+1;
+		}
+		array[loc]=a;
+	}
+	
+	public boolean isEmpty(){
+		return size==0;
+	}
+	
+	/** Empties the heap by resetting the size; the backing array is not wiped. */
+	public void clear(){
+		size=0;
+	}
+	
+	public int size(){
+		return size;
+	}
+	
+	/** Returns the zero-based heap level of array index x (floor of log2(x)) for x>0. */
+	public static int tier(int x){
+		int leading=Integer.numberOfLeadingZeros(x);
+		return 31-leading;
+	}
+	
+	/**
+	 * Debugging aid: returns true when no value occurs twice among the live elements.
+	 * Only indices 1..size are inspected. Scanning the whole array, as before, compared the unused
+	 * slot 0 and never-written or stale slots, so a fresh heap was reported as containing duplicates.
+	 */
+	public boolean testForDuplicates(){
+		for(int i=1; i<=size; i++){
+			for(int j=i+1; j<=size; j++){
+				if(array[i]!=-1L && array[i]==array[j]){return false;}
+			}
+		}
+		return true;
+	}
+	
+	final long[] array;
+	private final int CAPACITY;
+	private int size=0;
+	
+}
diff --git a/current/align2/Pointer.java b/current/align2/Pointer.java
new file mode 100755
index 0000000..d438de9
--- /dev/null
+++ b/current/align2/Pointer.java
@@ -0,0 +1,38 @@
+package align2;
+
+/**
+ * A (key, value) pair ordered by value. The loadMatrix helpers build one Pointer per row of a
+ * jagged int matrix, with key = row index and value = row length (0 for a null row).
+ */
+public class Pointer implements Comparable<Pointer>{
+	
+	/**
+	 * Creates a new array holding one Pointer per matrix row.
+	 * @param matrix Jagged matrix; null rows count as length 0
+	 * @return Array of matrix.length Pointers in row order
+	 */
+	public static Pointer[] loadMatrix(int[][] matrix){
+		Pointer[] out=new Pointer[matrix.length];
+		for(int i=0; i<out.length; i++){
+			int len=(matrix[i]==null ? 0 : matrix[i].length);
+			out[i]=new Pointer(i, len);
+		}
+		return out;
+	}
+	
+	/**
+	 * Refills an existing array with one Pointer per matrix row, reusing the Pointer objects already
+	 * present and allocating one for any slot that is still null.
+	 * @param matrix Jagged matrix; null rows count as length 0
+	 * @param out Array of the same length as matrix
+	 * @return out
+	 */
+	public static Pointer[] loadMatrix(int[][] matrix, Pointer[] out){
+		assert(out!=null);
+		assert(out.length==matrix.length);
+		for(int i=0; i<out.length; i++){
+			int len=(matrix[i]==null ? 0 : matrix[i].length);
+			Pointer p=out[i];
+			if(p==null){
+				//An unpopulated slot used to cause a NullPointerException here.
+				out[i]=new Pointer(i, len);
+			}else{
+				p.key=i;
+				p.value=len;
+			}
+		}
+		return out;
+	}
+	
+	public Pointer(int key_, int value_){
+		key=key_;
+		value=value_;
+	}
+	
+	/** Orders by value. Uses comparisons rather than subtraction, which overflows for operands of opposite sign and large magnitude. */
+	@Override
+	public int compareTo(Pointer o) {
+		return value<o.value ? -1 : value>o.value ? 1 : 0;
+	}
+	
+	public int key;
+	public int value;
+}
\ No newline at end of file
diff --git a/current/align2/PrintTime.java b/current/align2/PrintTime.java
new file mode 100755
index 0000000..7feedea
--- /dev/null
+++ b/current/align2/PrintTime.java
@@ -0,0 +1,37 @@
+package align2;
+
+import java.io.File;
+
+import fileIO.ReadWrite;
+
+/**
+ * Command-line stopwatch backed by a file.
+ * With no arguments it prints the current time in milliseconds to stderr.
+ * With a file argument it reports the time elapsed since the timestamp stored in that file
+ * (if the file exists) and then overwrites the file with the current timestamp.
+ */
+public class PrintTime {
+	
+	/**
+	 * @param args args[0]: timestamp file (optional); args[1]: boolean, whether to print the
+	 * elapsed time (optional; printing is on when it is absent)
+	 */
+	public static void main(String[] args){
+		final long now=System.currentTimeMillis();
+		
+		if(args==null || args.length<1){
+			System.err.println("Time:\t"+now);
+			return;
+		}
+		
+		final String path=args[0];
+		if(new File(path).exists()){
+			final long previous=Long.parseLong(ReadWrite.readString(path));
+			final long elapsed=now-previous;
+			final boolean report=(args.length<2 || Tools.parseBoolean(args[1]));
+			if(report){
+				//The same line goes to both stdout and stderr.
+				final String line="Elapsed:\t"+String.format("%.2f", elapsed/1000d);
+				System.out.println(line);
+				System.err.println(line);
+			}
+		}
+		ReadWrite.writeString(now+"", path);
+	}
+	
+}
diff --git a/current/align2/Quad.java b/current/align2/Quad.java
new file mode 100755
index 0000000..3801ef3
--- /dev/null
+++ b/current/align2/Quad.java
@@ -0,0 +1,33 @@
+package align2;
+
+/**
+ * Heap entry holding a column, a row, a site and an optional list.
+ * Ordering is by site, then by column. Equality and hashing consider site only.
+ */
+public class Quad implements Comparable<Quad>{
+	
+	public Quad(int col_, int row_, int val_){
+		column=col_;
+		row=row_;
+		site=val_;
+	}
+	
+	/** True when other is a Quad with the same site; false (rather than an exception) for null or foreign types. */
+	@Override
+	public boolean equals(Object other){
+		if(this==other){return true;}
+		if(!(other instanceof Quad)){return false;}
+		return site==((Quad)other).site;
+	}
+	
+	@Override
+	public int hashCode(){return site;}
+	
+	/** Orders by site, then column. Uses comparisons rather than subtraction, which can overflow. */
+	@Override
+	public int compareTo(Quad other) {
+		if(site!=other.site){return site<other.site ? -1 : 1;}
+		return column<other.column ? -1 : column>other.column ? 1 : 0;
+	}
+	
+	@Override
+	public String toString(){
+		return("("+column+","+row+","+site+")");
+	}
+	
+	public final int column;
+	public int row;
+	public int site;
+	public int list[];
+	
+}
diff --git a/current/align2/Quad64.java b/current/align2/Quad64.java
new file mode 100755
index 0000000..838688e
--- /dev/null
+++ b/current/align2/Quad64.java
@@ -0,0 +1,35 @@
+package align2;
+
+/**
+ * Variant of Quad with a 64-bit site. Ordering is by site, then by column.
+ * equals() is not meant to be called: it trips an assertion when assertions are enabled.
+ */
+public class Quad64 implements Comparable<Quad64>{
+	
+	public Quad64(int col_, int row_, int val_){
+		column=col_;
+		row=row_;
+		site=val_;
+	}
+	
+	/**
+	 * Site-only equality. With assertions enabled this always fails the assertion; with them
+	 * disabled it returns false (rather than throwing) for null or foreign types.
+	 */
+	@Override
+	public boolean equals(Object other){
+		assert(false);
+		if(this==other){return true;}
+		if(!(other instanceof Quad64)){return false;}
+		return site==((Quad64)other).site;
+	}
+	
+	/** Low 32 bits of site. */
+	@Override
+	public int hashCode(){return (int)site;}
+	
+	/** Orders by site, then column. The column tie-break uses comparisons because subtraction can overflow. */
+	@Override
+	public int compareTo(Quad64 other) {
+		if(site!=other.site){return site>other.site ? 1 : -1;}
+		return column<other.column ? -1 : column>other.column ? 1 : 0;
+	}
+	
+	@Override
+	public String toString(){
+		return("("+column+","+row+","+site+")");
+	}
+	
+	public final int column;
+	public int row;
+	public long site;
+	public int list[];
+	
+}
diff --git a/current/align2/Quad64Heap.java b/current/align2/Quad64Heap.java
new file mode 100755
index 0000000..61761d7
--- /dev/null
+++ b/current/align2/Quad64Heap.java
@@ -0,0 +1,219 @@
+package align2;
+
+/**
+ * Array-backed binary min-heap of Quad64, ordered by Quad64.compareTo.
+ * Elements occupy slots 1..size; slot 0 is unused and the children of slot i
+ * are slots 2i and 2i+1.
+ * The backing array is allocated once and never resized.
+ */
+public final class Quad64Heap {
+
+    /**
+     * @param maxSize Intended maximum number of elements; the array holds at least maxSize+1 slots
+     */
+    public Quad64Heap(int maxSize){
+
+        int len=maxSize+1;
+        if((len&1)==1){len++;} //Array size is always even, so slot 2i+1 exists whenever slot 2i does.
+
+        CAPACITY=maxSize;
+        array=new Quad64[len];
+    }
+
+    /**
+     * Appends t after the last element and sifts it toward the root.
+     * No capacity check is made; writing past the array throws ArrayIndexOutOfBoundsException.
+     * @return always true
+     */
+    public boolean add(Quad64 t){
+        assert(size==0 || array[size]!=null);
+        size++;
+        array[size]=t;
+        percDown(size);
+        return true;
+    }
+
+    /** @return the smallest element without removing it, or null when empty */
+    public Quad64 peek(){
+        if(size==0){return null;}
+        return array[1];
+    }
+
+    /** @return the smallest element after removing it, or null when empty */
+    public Quad64 poll(){
+        if(size==0){return null;}
+        Quad64 t=array[1];
+        //Move the last element to the root, clear its old slot, then restore heap order.
+        array[1]=array[size];
+        array[size]=null;
+        size--;
+        if(size>0){percUp(1);}
+        return t;
+    }
+
+    /**
+     * Moves the element at loc toward the root (slot 1) while it is smaller than its parent.
+     * Parents are shifted into the hole and the element is written once at the end.
+     */
+    private void percDown(int loc){
+        assert(loc>0);
+        if(loc==1){return;}
+
+        int next=loc/2;
+        final Quad64 a=array[loc];
+        Quad64 b=array[next];
+
+        while(loc>1 && a.compareTo(b)<0){
+            array[loc]=b;
+            loc=next;
+            next=next/2;
+            b=array[next]; //Reads unused slot 0 once loc reaches 1; the loop test then stops before using it.
+        }
+
+        array[loc]=a;
+    }
+
+    /**
+     * Moves the element at loc toward the leaves, swapping with its smaller
+     * child for as long as it is larger than that child (recursive form).
+     */
+    private void percUp(int loc){
+        assert(loc>0 && loc<=size) : loc+", "+size;
+        int next1=loc*2;
+        int next2=next1+1;
+        if(next1>size){return;}
+        Quad64 a=array[loc];
+        Quad64 b=array[next1];
+        Quad64 c=array[next2]; //May be the slot just past the last element; a null there means "no right child".
+        assert(a!=b);
+        assert(b!=c);
+        assert(b!=null);
+        if(c==null || b.compareTo(c)<1){
+            if(a.compareTo(b)>0){
+                array[next1]=a;
+                array[loc]=b;
+                percUp(next1);
+            }
+        }else{
+            if(a.compareTo(c)>0){
+                array[next2]=a;
+                array[loc]=c;
+                percUp(next2);
+            }
+        }
+    }
+
+    /** Iterative form of percUp: shifts smaller children into the hole and writes the element once. Not called within this class. */
+    private void percUpIter(int loc){
+        assert(loc>0 && loc<=size) : loc+", "+size;
+        final Quad64 a=array[loc];
+
+        int next1=loc*2;
+        int next2=next1+1;
+
+        while(next1<=size){
+
+            Quad64 b=array[next1];
+            Quad64 c=array[next2];
+            assert(a!=b);
+            assert(b!=c);
+            assert(b!=null);
+
+            if(c==null || b.compareTo(c)<1){
+                if(a.compareTo(b)>0){
+                    array[loc]=b;
+                    loc=next1;
+                }else{
+                    break;
+                }
+            }else{
+                if(a.compareTo(c)>0){
+                    array[loc]=c;
+                    loc=next2;
+                }else{
+                    break;
+                }
+            }
+            next1=loc*2;
+            next2=next1+1;
+        }
+        array[loc]=a;
+    }
+
+    public boolean isEmpty(){
+        return size==0;
+    }
+
+    /** Resets the element count to zero. Array slots are not nulled, so old references stay until overwritten. */
+    public void clear(){
+        size=0;
+    }
+
+    public int size(){
+        return size;
+    }
+
+    /** @return 31 minus the number of leading zero bits of x (the index of its highest set bit; -1 for x==0) */
+    public static int tier(int x){
+        int leading=Integer.numberOfLeadingZeros(x);
+        return 31-leading;
+    }
+
+    /** Debug check over the whole backing array. @return false if any non-null reference appears in two slots */
+    public boolean testForDuplicates(){
+        for(int i=0; i<array.length; i++){
+            for(int j=i+1; j<array.length; j++){
+                if(array[i]!=null && array[i]==array[j]){return false;}
+            }
+        }
+        return true;
+    }
+
+    /** @return the elements of slots 1..size in array order (not sorted order), as "[a, b, ...]" */
+    @Override
+    public String toString(){
+        StringBuilder sb=new StringBuilder();
+        sb.append("[");
+        for(int i=1; i<=size; i++){
+            sb.append((i==1 ? "" : ", ")+array[i]);
+        }
+        sb.append("]");
+        return sb.toString();
+    }
+
+    private final Quad64[] array;
+    /** The maxSize given at construction; recorded but not enforced. */
+    private final int CAPACITY;
+    private int size=0;
+
+}
diff --git a/current/align2/QuadHeap.java b/current/align2/QuadHeap.java
new file mode 100755
index 0000000..81c6681
--- /dev/null
+++ b/current/align2/QuadHeap.java
@@ -0,0 +1,229 @@
+package align2;
+
+/**
+ * Binary min-heap of Quad kept in a fixed array and ordered by Quad.compareTo.
+ * Slot 0 is never used; the heap occupies slots 1..size, with the children
+ * of slot i stored at 2i and 2i+1.
+ */
+public final class QuadHeap {
+
+    /**
+     * @param maxSize Intended maximum number of elements; the array holds at least maxSize+1 slots
+     */
+    public QuadHeap(int maxSize){
+        final int wanted=maxSize+1;
+        //Round an odd length up by one: array size is always even.
+        final int slots=wanted+(wanted&1);
+
+        CAPACITY=maxSize;
+        array=new Quad[slots];
+    }
+
+    /**
+     * Stores t in the first free slot and sifts it toward the root.
+     * @return always true
+     */
+    public boolean add(Quad t){
+        assert(size==0 || array[size]!=null);
+        array[++size]=t;
+        percDown(size);
+        return true;
+    }
+
+    /** @return the root (smallest) element, or null if the heap is empty */
+    public Quad peek(){
+        return (size==0 ? null : array[1]);
+    }
+
+    /** Removes and returns the root (smallest) element, or returns null if the heap is empty. */
+    public Quad poll(){
+        if(size==0){return null;}
+        final Quad smallest=array[1];
+        final int last=size--;
+        //The former last element becomes the root and is then sifted back into place.
+        array[1]=array[last];
+        array[last]=null;
+        if(size>0){percUp(1);}
+        return smallest;
+    }
+
+    /** Sifts the element at loc toward the root while it compares smaller than its parent. */
+    private void percDown(int loc){
+        assert(loc>0);
+        if(loc==1){return;}
+
+        final Quad moving=array[loc];
+        int hole=loc;
+        while(hole>1){
+            final int parentIdx=hole/2;
+            final Quad parent=array[parentIdx];
+            if(moving.compareTo(parent)>=0){break;}
+            array[hole]=parent;
+            hole=parentIdx;
+        }
+        array[hole]=moving;
+    }
+
+    /** Sifts the element at loc toward the leaves by swapping with its smaller child (recursive). */
+    private void percUp(int loc){
+        assert(loc>0 && loc<=size) : loc+", "+size;
+        final int leftIdx=loc*2;
+        final int rightIdx=leftIdx+1;
+        if(leftIdx>size){return;}
+
+        final Quad parent=array[loc];
+        final Quad left=array[leftIdx];
+        final Quad right=array[rightIdx];
+        assert(parent!=left);
+        assert(left!=right);
+        assert(left!=null);
+
+        //Prefer the left child on ties or when there is no right child.
+        final boolean goLeft=(right==null || left.compareTo(right)<=0);
+        final int childIdx=(goLeft ? leftIdx : rightIdx);
+        final Quad child=(goLeft ? left : right);
+
+        if(parent.compareTo(child)>0){
+            array[childIdx]=parent;
+            array[loc]=child;
+            percUp(childIdx);
+        }
+    }
+
+    /** Loop-based equivalent of percUp; not called within this class. */
+    private void percUpIter(int loc){
+        assert(loc>0 && loc<=size) : loc+", "+size;
+        final Quad moving=array[loc];
+        int hole=loc;
+
+        for(int leftIdx=hole*2; leftIdx<=size; leftIdx=hole*2){
+            final int rightIdx=leftIdx+1;
+            final Quad left=array[leftIdx];
+            final Quad right=array[rightIdx];
+            assert(moving!=left);
+            assert(left!=right);
+            assert(left!=null);
+
+            final boolean goLeft=(right==null || left.compareTo(right)<=0);
+            final Quad child=(goLeft ? left : right);
+            if(moving.compareTo(child)<=0){break;}
+            array[hole]=child;
+            hole=(goLeft ? leftIdx : rightIdx);
+        }
+        array[hole]=moving;
+    }
+
+    public boolean isEmpty(){
+        return size==0;
+    }
+
+    /** Forgets all elements by zeroing the count; the array slots themselves are left untouched. */
+    public void clear(){
+        size=0;
+    }
+
+    public int size(){
+        return size;
+    }
+
+    /** @return 31 minus the count of leading zero bits in x */
+    public static int tier(int x){
+        return 31-Integer.numberOfLeadingZeros(x);
+    }
+
+    /** Debug aid: scans the whole backing array and returns false if one object sits in two slots. */
+    public boolean testForDuplicates(){
+        for(int i=0; i<array.length; i++){
+            final Quad q=array[i];
+            if(q==null){continue;}
+            for(int j=i+1; j<array.length; j++){
+                if(q==array[j]){return false;}
+            }
+        }
+        return true;
+    }
+
+    /** @return slots 1..size in storage order, formatted as "[a, b, ...]" */
+    public String toString(){
+        final StringBuilder text=new StringBuilder("[");
+        String separator="";
+        for(int i=1; i<=size; i++){
+            text.append(separator).append(array[i]);
+            separator=", ";
+        }
+        return text.append("]").toString();
+    }
+
+    private final Quad[] array;
+    /** The maxSize passed to the constructor; stored only. */
+    private final int CAPACITY;
+    private int size=0;
+
+}
diff --git a/current/align2/QualityTools.java b/current/align2/QualityTools.java
new file mode 100755
index 0000000..bae3565
--- /dev/null
+++ b/current/align2/QualityTools.java
@@ -0,0 +1,566 @@
+package align2;
+
+import java.util.Arrays;
+import java.util.Random;
+
+import stream.FASTQ;
+import stream.Read;
+
+import dna.AminoAcid;
+
+import jgi.CalcTrueQuality;
+import jgi.Dedupe;
+
+
+
+/**
+ *
+ * @author Brian Bushnell
+ * @date Jul 17, 2011 12:04:06 PM
+ */
+public class QualityTools {
+
+ /*-------------------- Main --------------------*/
+
+ public static void main(String[] args){
+
+ for(int i=0; i<MATRIX_SIZE; i++){
+ for(int j=0; j<MATRIX_SIZE; j++){
+ System.err.print((int)qualsToPhredSafe((byte)i, (byte)j)+",");
+ }
+ System.err.println();
+ }
+
+// byte[] quals=new byte[] {15, 12, 20, 9, 10, 16, 14, 7, 11, 10, 10, 10, 10, 4, 4, 30, 30, 30, 30};
+// float[] probs=makeKeyProbs(quals, 4);
+// float[] probs2=makeKeyProbs(quals, 4);
+//
+// int[] scores=makeKeyScores(quals, 4, 50, 50, null);
+//
+// System.out.println(Arrays.toString(probs)+"\n");
+// System.out.println(Arrays.toString(probs2)+"\n");
+// System.out.println(Arrays.toString(scores)+"\n");
+//
+// bench(50, 20000000);
+// bench2(50, 20000000);
+// bench(50, 20000000);
+// bench2(50, 20000000);
+// bench(50, 20000000);
+// bench2(50, 20000000);
+
+// System.out.println(1-((1-.1f)*(1-.1f)*(1-.1f)));
+// System.out.println("\n"+Arrays.toString(PROB));
+// System.out.println("\n"+Arrays.toString(INVERSE));
+// System.out.println("\n"+Arrays.toString(SUB_PROB));
+// System.out.println("\n"+Arrays.toString(SUB_INVERSE));
+
+// initializeq102matrix(null);
+// for(int a=0; a<42; a++){
+// for(int b=0; b<42; b++){
+// for(int c=0; c<42; c++){
+// System.out.println(a+"\t"+b+"\t"+c+"\t"+q3ProbMatrix[a][b][c]);
+// }
+// }
+// }
+
+ }
+
+ /*-------------------- Constructors --------------------*/
+
+ /** Does nothing; every field and method declared in this class is static. */
+ public QualityTools(){
+ }
+
+ /*-------------------- Methods --------------------*/
+
+ /*-------------------- Overridden Methods --------------------*/
+
+ /*-------------------- Abstract Methods --------------------*/
+
+ /*-------------------- Static Methods --------------------*/
+
+ /**
+  * Times repeated calls to makeKeyProbs (key length 8, no bases, no modulo)
+  * on one random quality array and prints the elapsed seconds to stdout.
+  * @param length Length of the random quality array
+  * @param rounds Number of calls to time
+  */
+ public static void bench(int length, int rounds){
+
+     final long start=System.nanoTime();
+
+     //Random qualities drawn as (byte)(Math.random()*30+5).
+     final byte[] qual=new byte[length];
+     for(int pos=0; pos<qual.length; pos++){
+         qual[pos]=(byte)(Math.random()*30+5);
+     }
+
+     int round=0;
+     while(round<rounds){
+         final float[] probs=makeKeyProbs(qual, null, 8, false);
+         final float last=probs[probs.length-1];
+         //Sanity check: a probability outside [0,1] signals a broken computation.
+         if(last>1 || last<0){
+             System.err.println("Ooops! "+Arrays.toString(probs));
+         }
+         round++;
+     }
+
+     final long elapsed=System.nanoTime()-start;
+     final float seconds=(float)(elapsed/1000000000d);
+     System.out.println("Bench Time: "+String.format("%.3f",seconds)+" s");
+ }
+
+ /**
+  * Same benchmark as bench, but timing makeKeyProbs2 with key length 8.
+  * @param length Length of the random quality array
+  * @param rounds Number of calls to time
+  */
+ public static void bench2(int length, int rounds){
+
+     final long start=System.nanoTime();
+
+     final byte[] qual=new byte[length];
+     for(int pos=0; pos<qual.length; pos++){
+         qual[pos]=(byte)(Math.random()*30+5);
+     }
+
+     int round=0;
+     while(round<rounds){
+         final float[] probs=makeKeyProbs2(qual, 8);
+         final float last=probs[probs.length-1];
+         if(last>1 || last<0){
+             System.err.println("Ooops! "+Arrays.toString(probs));
+         }
+         round++;
+     }
+
+     final long elapsed=System.nanoTime()-start;
+     final float seconds=(float)(elapsed/1000000000d);
+     System.out.println("Bench2 Time: "+String.format("%.3f",seconds)+" s");
+ }
+
+ /**
+  * Computes per-key error probabilities with makeKeyProbs and converts the
+  * first qual.length-keylen+1 of them to integer scores.
+  * @param qual Quality values; dereferenced here, so must not be null
+  * @param out Optional destination array; allocated when null
+  * @return the score array
+  */
+ public static int[] makeKeyScores(byte[] qual, byte[] bases, int keylen, int range, int baseScore, int[] out, boolean useModulo){
+     final float[] keyProbs=makeKeyProbs(qual, bases, keylen, useModulo);
+     final int numKeys=qual.length-keylen+1;
+     return makeKeyScores(keyProbs, numKeys, range, baseScore, out);
+ }
+
+ public static int[] makeKeyScores(float[] probs, int numProbs, int range, int baseScore, int[] out){
+ if(out==null){out=new int[numProbs];}
+// assert(out.length==probs.length);
+ assert(out.length>=numProbs);
+ for(int i=0; i<numProbs; i++){
+ out[i]=baseScore+(int)Math.round(range*(1-(probs[i])));
+ }
+ return out;
+ }
+
+ /**
+  * Scales the probability that each base is correct to an integer score.
+  * @param qual Quality per base, used as an index into PROB_CORRECT
+  * @param maxScore Score corresponding to probability 1
+  * @param out Optional destination of the same length as qual; allocated when null
+  * @return out, or the newly allocated array
+  */
+ public static int[] makeIntScoreArray(byte[] qual, int maxScore, int[] out){
+     final int[] scores=(out==null ? new int[qual.length] : out);
+     assert(scores.length==qual.length);
+     int pos=0;
+     for(final byte q : qual){
+         final float probCorrect=PROB_CORRECT[q];
+         scores[pos++]=Math.round(maxScore*probCorrect);
+     }
+     return scores;
+ }
+
+ /**
+  * Byte-sized version of makeIntScoreArray.
+  * With a null qual this defers to the overload that takes no qualities.
+  * @param qual Quality per base, or null
+  * @param maxScore Score corresponding to probability 1
+  * @param out Optional destination of the same length as qual; allocated when null
+  * @param negative When true, maxScore is subtracted so every score is at most 0
+  * @return out, or the newly allocated array
+  */
+ public static byte[] makeByteScoreArray(byte[] qual, int maxScore, byte[] out, boolean negative){
+     if(qual==null){return makeByteScoreArray(maxScore, out, negative);}
+     final byte[] scores=(out==null ? new byte[qual.length] : out);
+     assert(scores.length==qual.length);
+
+     for(int pos=0; pos<qual.length; pos++){
+         final float probCorrect=PROB_CORRECT[qual[pos]];
+         int score=Math.round(maxScore*probCorrect);
+         assert(score>=Byte.MIN_VALUE && score<=Byte.MAX_VALUE);
+         if(!negative){
+             assert(score>=0 && score<=maxScore);
+         }else{
+             score-=maxScore;
+             assert(score<=0);
+         }
+         scores[pos]=(byte)score;
+     }
+     return scores;
+ }
+
+ /**
+  * Fallback used when no quality values exist: sets every score to 0.
+  * @param maxScore Unused
+  * @param out Destination array; must not be null
+  * @param negative Unused
+  * @return out
+  */
+ public static byte[] makeByteScoreArray(int maxScore, byte[] out, boolean negative){
+     assert(out!=null);
+     for(int pos=0; pos<out.length; pos++){
+         out[pos]=0;
+     }
+     return out;
+ }
+
+ /**
+  * Returns prob of error for each key, written to a freshly allocated array.
+  * Convenience form of the five-argument overload with no output buffer supplied.
+  */
+ public static float[] makeKeyProbs(byte[] quality, byte[] bases, int keylen, boolean useModulo){
+     final float[] noBuffer=null;
+     return makeKeyProbs(quality, bases, keylen, noBuffer, useModulo);
+ }
+
+ /**
+  * Returns prob of error for each key.
+  * A key is a window of keylen consecutive bases.
+  * Its error probability is 1 minus the product of PROB_CORRECT over the window.
+  * It is forced to 1 whenever the window contains a quality of 0 or less.
+  * With useModulo, keys whose forward and reverse kmers are both not
+  * multiples of IndexMaker4.MODULO are also forced to 1.
+  * @param quality Quality per base; when null, the bases-only overload is used
+  * @param bases Base calls; consulted only when useModulo is true
+  * @param keylen Key (window) length
+  * @param out Optional destination with at least quality.length-keylen+1 slots
+  * @param useModulo Whether to apply the modulo filter
+  * @return out, or a newly allocated array
+  */
+ public static float[] makeKeyProbs(byte[] quality, byte[] bases, int keylen, float[] out, boolean useModulo){
+     if(quality==null){return makeKeyProbs(bases, keylen, out, useModulo);}
+     if(out==null){out=new float[quality.length-keylen+1];}
+     assert(out.length>=quality.length-keylen+1) : quality.length+", "+keylen+", "+out.length;
+
+     //Running product of PROB_CORRECT over the current window.
+     float probAllCorrect=1;
+     //Number of consecutive positive qualities ending at the newest base; tracks location of N's.
+     int positiveRun=0;
+
+     for(int pos=0; pos<keylen; pos++){
+         final byte q=quality[pos];
+         positiveRun=(q>0 ? positiveRun+1 : 0);
+         assert(q<PROB_CORRECT.length) : Arrays.toString(quality);
+         probAllCorrect*=PROB_CORRECT[q];
+     }
+     out[0]=(positiveRun<keylen ? 1 : 1-probAllCorrect);
+
+     //Slide the window: divide out the base that leaves, multiply in the base that enters.
+     for(int leaving=0, entering=keylen; entering<quality.length; leaving++, entering++){
+         final byte qLeaving=quality[leaving];
+         final byte qEntering=quality[entering];
+         positiveRun=(qEntering>0 ? positiveRun+1 : 0);
+         final float undo=PROB_CORRECT_INVERSE[qLeaving];
+         final float apply=PROB_CORRECT[qEntering];
+         probAllCorrect=probAllCorrect*undo*apply;
+         out[leaving+1]=(positiveRun<keylen ? 1 : 1-probAllCorrect);
+     }
+
+     if(bases!=null && useModulo){//Rare case for large references
+         final int bits=2*keylen;
+         final int topShift=bits-2;
+         final int keyMask=~((-1)<<bits);
+         int forward=0, reverse=0;
+
+         int validRun=0;
+         for(int pos=0; pos<bases.length; pos++){
+             final byte base=bases[pos];
+             final int code=Dedupe.baseToNumber[base];
+             final int complement=Dedupe.baseToComplementNumber[base];
+             forward=((forward<<2)|code)&keyMask;
+             reverse=(reverse>>>2)|(complement<<topShift);
+
+             validRun=(base=='N' ? 0 : validRun+1);
+             if(validRun>=keylen && forward%IndexMaker4.MODULO!=0 && reverse%IndexMaker4.MODULO!=0){
+                 out[pos-keylen+1]=1f;
+             }
+         }
+     }
+
+     return out;
+ }
+
+ /**
+  * Returns prob of error for each key when no quality values are available.
+  * Every entry of out is set to 0.
+  * With useModulo and non-null bases, keys whose forward and reverse kmers
+  * are both not multiples of IndexMaker4.MODULO are then set to 1.
+  * @param bases Base calls; may be null
+  * @param keylen Key length
+  * @param out Destination array; must be supplied
+  * @param useModulo Whether to apply the modulo filter
+  * @return out
+  */
+ public static float[] makeKeyProbs(byte[] bases, int keylen, float[] out, boolean useModulo){
+     assert(out!=null) : "Must provide array if no quality vector";
+     Arrays.fill(out, 0);
+
+     if(bases==null || !useModulo){return out;}
+
+     //Rare case for large references
+     final int bits=2*keylen;
+     final int topShift=bits-2;
+     final int keyMask=~((-1)<<bits);
+     int forward=0;
+     int reverse=0;
+     int validRun=0;
+
+     for(int pos=0; pos<bases.length; pos++){
+         final byte base=bases[pos];
+         final int code=Dedupe.baseToNumber[base];
+         final int complement=Dedupe.baseToComplementNumber[base];
+         forward=((forward<<2)|code)&keyMask;
+         reverse=(reverse>>>2)|(complement<<topShift);
+
+         if(base=='N'){
+             validRun=0;
+         }else{
+             validRun++;
+         }
+         if(validRun<keylen){continue;}
+         if(forward%IndexMaker4.MODULO!=0 && reverse%IndexMaker4.MODULO!=0){
+             out[pos-keylen+1]=1f;
+         }
+     }
+     return out;
+ }
+
+ /**
+  * Alternative key-probability routine that advances two sliding windows in
+  * the same loop: one starting at key 0 and one starting at the middle key.
+  * Unlike makeKeyProbs it does no special handling of zero qualities.
+  * @param quality Quality per base
+  * @param keylen Key (window) length
+  * @return error probability for each of the quality.length-keylen+1 keys
+  */
+ public static float[] makeKeyProbs2(byte[] quality, int keylen){
+     final float[] result=new float[quality.length-keylen+1];
+     final int mid=result.length/2;
+
+     //Products of PROB_CORRECT over the first window of each half.
+     float lowerProduct=1;
+     float upperProduct=1;
+     for(int i=0; i<keylen; i++){
+         final byte qLower=quality[i];
+         lowerProduct*=PROB_CORRECT[qLower];
+         final byte qUpper=quality[mid+i];
+         upperProduct*=PROB_CORRECT[qUpper];
+     }
+     result[0]=1-lowerProduct;
+     result[mid]=1-upperProduct;
+
+     //Slide both windows until the upper one reaches the end of the read.
+     for(int step=0; mid+keylen+step<quality.length; step++){
+         final byte qa=quality[step];
+         final byte qb=quality[keylen+step];
+         final byte qc=quality[mid+step];
+         final byte qd=quality[mid+keylen+step];
+         final float undoLower=PROB_CORRECT_INVERSE[qa];
+         final float undoUpper=PROB_CORRECT_INVERSE[qc];
+         final float applyLower=PROB_CORRECT[qb];
+         final float applyUpper=PROB_CORRECT[qd];
+         lowerProduct=lowerProduct*undoLower*applyLower;
+         upperProduct=upperProduct*undoUpper*applyUpper;
+         result[step+1]=1-lowerProduct;
+         result[mid+step+1]=1-upperProduct;
+     }
+     return result;
+ }
+
+ /**
+  * Generates a synthetic quality array.
+  * Each position starts at baseQuality minus a linear slant and is then
+  * randomly raised (15 of 16 times) or lowered, and clamped to [minQual, maxQual].
+  * Reads longer than 50 get both ends lowered over the outer tenth.
+  * Finally one random offset derived from variance is added to every
+  * position and the result is clamped to [2, 41].
+  * @param length Number of qualities to generate
+  * @param randyQual Source of randomness
+  * @param minQual Lower clamp of the per-base stage
+  * @param maxQual Upper clamp of the per-base stage
+  * @param baseQuality Starting quality at position 0
+  * @param slant Total decrease applied linearly across the read
+  * @param variance When positive, bounds the whole-read offset to [-variance, variance]
+  * @return the generated qualities
+  */
+ public static byte[] makeQualityArray(int length, Random randyQual,
+         int minQual, int maxQual, byte baseQuality, byte slant, int variance) {
+     final byte[] quals=new byte[length];
+
+     for(int pos=0; pos<length; pos++){
+         byte q=(byte)(baseQuality-(slant*pos)/length);
+
+         final int coin=randyQual.nextInt();
+
+         if((coin&15)==0){
+             //1 in 16: lower the quality by the smaller of two draws.
+             final int span=Tools.max(1, q-minQual+1);
+             final int first=randyQual.nextInt(span);
+             final int second=randyQual.nextInt(span);
+             q=(byte)(q-Tools.min(first, second));
+         }else{
+             //Otherwise raise it by the average of two draws.
+             final int span=Tools.max(1, maxQual-q+1);
+             final int first=randyQual.nextInt(span);
+             final int second=randyQual.nextInt(span+1);
+             q=(byte)(q+(first+second)/2);
+         }
+         q=(byte)Tools.min(Tools.max(q, minQual), maxQual);
+         quals[pos]=q;
+     }
+
+     if(length>50){
+         //Degrade both ends, most strongly at the outermost bases.
+         final int edge=length/10;
+         for(int i=0, weight=edge; i<edge; i++, weight--){
+             final int head=i;
+             final int tail=length-i-1;
+             quals[head]=(byte)Tools.max(quals[head]-(weight+randyQual.nextInt(weight+1))/2, minQual);
+             quals[tail]=(byte)Tools.max(quals[tail]-(weight+randyQual.nextInt(weight+1))/2, minQual);
+         }
+     }
+
+     int shift=0;
+     if(variance>0){
+         final int first=randyQual.nextInt(variance+1);
+         final int second=randyQual.nextInt(variance+1);
+         shift=(byte)(first+second-variance);
+     }
+     for(int pos=0; pos<quals.length; pos++){
+         quals[pos]=(byte)Tools.mid(2, quals[pos]+shift, 41);
+     }
+
+     return quals;
+ }
+
+ /**
+  * Drops the single interior offset whose key has the highest error
+  * probability, provided that probability is at least .98.
+  * The first and last offsets are never dropped.
+  * When more than two offsets remain, one neighbour of the gap is slid
+  * toward the removed position while the key probability does not increase,
+  * stopping one short of the removed position.
+  * The neighbour is the preceding offset if the removed one was second-to-last,
+  * otherwise the following offset.
+  * @param offsets Key start positions; returned unchanged if null or shorter than 3
+  * @param keyProbs Error probability per key position
+  * @return the input array when nothing is removed, otherwise a new array one element shorter
+  */
+ public static int[] modifyOffsets(int[] offsets, float[] keyProbs) {
+     if(offsets==null || offsets.length<3){return offsets;}
+
+     //Locate the first offset with the strictly highest error probability.
+     int worst=0;
+     float worstProb=keyProbs[offsets[0]];
+     final int maxOffset=offsets[offsets.length-1];
+     for(int i=1; i<offsets.length; i++){
+         final float prob=keyProbs[offsets[i]];
+         if(prob>worstProb){
+             worstProb=prob;
+             worst=i;
+         }
+     }
+
+     if(worst==0 || worst==offsets.length-1){return offsets;}
+     if(worstProb<.98f){return offsets;}
+
+     final int removed=offsets[worst];
+     final int[] trimmed=new int[offsets.length-1];
+     System.arraycopy(offsets, 0, trimmed, 0, worst);
+     System.arraycopy(offsets, worst+1, trimmed, worst, trimmed.length-worst);
+
+     //The early returns above guarantee the removed index was strictly interior.
+     assert(worst>0 && worst<trimmed.length);
+
+     if(trimmed.length<=2){return trimmed;}
+
+     if(worst==trimmed.length-1){
+         assert(worst>1);
+         int slid=trimmed[worst-1]; //5, 7, 9, 5, 6
+         assert(slid<removed && removed<maxOffset) : slid+", "+removed+", "+maxOffset+", "+worst+", "+trimmed.length;
+         while(slid<removed-1 && keyProbs[slid+1]>=keyProbs[slid]){slid++;}
+         trimmed[worst-1]=slid;
+     }else{
+         assert(worst<trimmed.length-1 && worst>0);
+         int slid=trimmed[worst];
+         assert(slid>removed && removed>=0);
+         while(slid>removed+1 && keyProbs[slid-1]>=keyProbs[slid]){slid--;}
+         trimmed[worst]=slid;
+     }
+
+     return trimmed;
+ }
+
+ /**
+  * Looks up the combined phred score of two qualities in PHRED_MATRIX.
+  * Requires qualities under MATRIX_SIZE; no range checking is performed.
+  */
+ public static byte qualsToPhred(byte qa, byte qb){
+     final byte[] row=PHRED_MATRIX[qa];
+     return row[qb];
+ }
+
+ /**
+  * Safe version for qualities >=MATRIX_SIZE.
+  * Both qualities are clamped to [0, MATRIX_SIZE] before the lookup, and
+  * the smaller one is used as the first index.
+  */
+ public static byte qualsToPhredSafe(byte qa, byte qb){
+     final byte a=Tools.max((byte)0, Tools.min(qa, MATRIX_SIZE));
+     final byte b=Tools.max((byte)0, Tools.min(qb, MATRIX_SIZE));
+     if(a<=b){
+         return PHRED_MATRIX[a][b];
+     }
+     return PHRED_MATRIX[b][a];
+ }
+
+ /**
+  * Looks up, in ERROR_MATRIX, the probability that at least one of two
+  * bases with the given qualities is wrong. No range checking is performed.
+  */
+ public static float qualsToProbError(byte qa, byte qb){
+     final float[] row=ERROR_MATRIX[qa];
+     return row[qb];
+ }
+
+ /** Complement of qualsToProbError: the probability that both bases are correct. */
+ public static float qualsToProbCorrect(byte qa, byte qb){
+     final float probError=qualsToProbError(qa, qb);
+     return 1-probError;
+ }
+
+ /**
+  * Range-safe form of qualsToProbError.
+  * Both qualities are clamped to [0, MATRIX_SIZE] before the lookup, and
+  * the smaller one is used as the first index.
+  */
+ public static float qualsToProbErrorSafe(byte qa, byte qb){
+     final byte a=Tools.max((byte)0, Tools.min(qa, MATRIX_SIZE));
+     final byte b=Tools.max((byte)0, Tools.min(qb, MATRIX_SIZE));
+     if(a<=b){
+         return ERROR_MATRIX[a][b];
+     }
+     return ERROR_MATRIX[b][a];
+ }
+
+ /** Complement of qualsToProbErrorSafe: the probability that both bases are correct, with clamped qualities. */
+ public static float qualsToProbCorrectSafe(byte qa, byte qb){
+     final float probError=qualsToProbErrorSafe(qa, qb);
+     return 1-probError;
+ }
+
+ /**
+  * Builds a constant quality array.
+  * @param q Quality to repeat; asserted to lie in [0, 127]
+  * @param len Length of the array
+  * @return a new array of len copies of (byte)q
+  */
+ public static byte[] fakeQuality(int q, int len){
+     assert(q>=0 && q<=127);
+     final byte value=(byte)q;
+     final byte[] quals=new byte[len];
+     for(int pos=0; pos<len; pos++){
+         quals[pos]=value;
+     }
+     return quals;
+ }
+
+ /*-------------------- Fields --------------------*/
+
+ /*-------------------- Final Fields --------------------*/
+
+ /*-------------------- Static Fields --------------------*/
+
+ /** Largest quality accepted as an index into ERROR_MATRIX and PHRED_MATRIX (their dimension is MATRIX_SIZE+1) */
+ public static final byte MATRIX_SIZE=50;
+
+ /**
+  * Probability that this base is an error, indexed by quality.
+  * Holds 128 entries so that every non-negative byte value, including 127
+  * (which fakeQuality permits), is a valid index.
+  */
+ public static final float[] PROB_ERROR=makeQualityToFloat(128);
+ /** 1/PROB */
+ public static final float[] PROB_ERROR_INVERSE=makeInverse(PROB_ERROR);
+
+ /** Probability that this base is correct (1-PROB_ERROR), indexed by quality */
+ public static final float[] PROB_CORRECT=oneMinus(PROB_ERROR);
+ /** 1/PROB_CORRECT; used to divide a base back out of a running product */
+ public static final float[] PROB_CORRECT_INVERSE=makeInverse(PROB_CORRECT);
+
+ /** Probability that at least one base will be incorrect, given two quality scores */
+ public static final float[][] ERROR_MATRIX=makeErrorMatrix(PROB_ERROR, MATRIX_SIZE);
+
+ /** Combined phred score given two quality scores */
+ public static final byte[][] PHRED_MATRIX=makePhredMatrix(ERROR_MATRIX);
+
+ /*-------------------- Constants --------------------*/
+
+ /*-------------------- Initializers --------------------*/
+
+ /**
+  * Converts a phred score to an error probability, 10^(-phred/10).
+  * @param phred Phred-scaled quality
+  * @return 1 for any score below 1, otherwise the probability
+  */
+ private static final double phredToProbError(int phred){
+     return (phred>=1 ? Math.pow(10, 0-.1*phred) : 1);
+ }
+
+ /**
+  * Converts a probability of being correct to a phred score.
+  * @param prob Probability that the call is correct
+  * @return the rounded phred score of the complementary error probability
+  */
+ public static byte probCorrectToPhred(double prob){
+     final double probError=1-prob;
+     return probErrorToPhred(probError);
+ }
+
+ public static byte probErrorToPhred(double prob){
+ return probErrorToPhred(prob, true);
+ }
+
+ /**
+  * Converts an error probability to a phred score.
+  * @param prob Probability of error
+  * @param round True to round to the nearest integer, false to truncate
+  * @return the score clamped to [0, Read.MAX_CALLED_QUALITY]
+  */
+ public static byte probErrorToPhred(double prob, boolean round){
+     final double exact=probErrorToPhredDouble(prob);
+     final int q;
+     if(round){
+         q=(int)Math.round(exact);
+     }else{
+         q=(int)exact;
+     }
+     return (byte)Tools.mid(0, q, Read.MAX_CALLED_QUALITY);
+ }
+
+ /**
+  * Converts an error probability to an unrounded phred score, -10*log10(prob).
+  * @param prob Probability of error
+  * @return 0 when prob is at least 1, 60 when prob is at most 0.000001, otherwise the score
+  */
+ public static double probErrorToPhredDouble(double prob){
+     return (prob>=1 ? 0 : (prob<=0.000001 ? 60 : -10*Math.log10(prob)));
+ }
+
+ /**
+  * Builds a table mapping quality q to error probability 10^(-q/10).
+  * Entry 0 is overridden with .8 instead of the formula's value of 1.
+  * @param n Table size; must be at least 1 because entry 0 is always written
+  * @return the table
+  */
+ private static final float[] makeQualityToFloat(int n){
+     final float[] table=new float[n];
+     int q=0;
+     while(q<n){
+         table[q]=(float)Math.pow(10, 0-.1*q);
+         q++;
+     }
+     table[0]=.8f;
+     return table;
+ }
+
+ /** @return a new array holding 1/prob[i] for every element of prob */
+ private static final float[] makeInverse(float[] prob){
+     final float[] inverse=new float[prob.length];
+     int i=0;
+     for(final float p : prob){
+         inverse[i++]=1/p;
+     }
+     return inverse;
+ }
+
+ /** @return a new array holding 1-prob[i] for every element of prob */
+ private static final float[] oneMinus(float[] prob){
+     final float[] complement=new float[prob.length];
+     int i=0;
+     for(final float p : prob){
+         complement[i++]=1-p;
+     }
+     return complement;
+ }
+
+ /**
+  * Builds a square table of the probability that at least one of two bases
+  * is wrong, 1-(1-prob[i])*(1-prob[j]).
+  * @param prob Error probability per quality; needs at least maxq+1 entries
+  * @param maxq Largest quality to cover; the table has maxq+1 rows and columns
+  * @return the table
+  */
+ private static final float[][] makeErrorMatrix(float[] prob, byte maxq){
+     //Kept as byte arithmetic so the dimension is computed exactly like a byte increment.
+     final byte dim=(byte)(maxq+1);
+     final float[][] table=new float[dim][dim];
+     for(int i=0; i<dim; i++){
+         final float[] row=table[i];
+         for(int j=0; j<dim; j++){
+             final float pi=prob[i];
+             final float pj=prob[j];
+             row[j]=1-((1-pi)*(1-pj));
+         }
+     }
+     return table;
+ }
+
+ private static final byte[][] makePhredMatrix(float[][] error){
+ final int maxq=error.length;
+ byte[][] matrix=new byte[maxq][maxq];
+ for(int i=0; i<maxq; i++){
+ for(int j=0; j<maxq; j++){
+ matrix[i][j]=probCorrectToPhred(1-error[i][j]);
+ }
+ }
+ return matrix;
+ }
+
+ /*-------------------- Notes --------------------*/
+
+}
diff --git a/current/align2/RandomReads3.java b/current/align2/RandomReads3.java
new file mode 100755
index 0000000..e077af9
--- /dev/null
+++ b/current/align2/RandomReads3.java
@@ -0,0 +1,1757 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Random;
+
+import stream.ByteBuilder;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.SummaryFile;
+import fileIO.TextStreamWriter;
+
+public final class RandomReads3 {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ public static void main(String[] args){
+
+ Timer t=new Timer();
+
+ FASTQ.ADD_PAIRNUM_TO_CUSTOM_ID=false;
+
+ FastaReadInputStream.MIN_READ_LEN=1;
+ Data.GENOME_BUILD=-1;
+ int build=1;
+ String ref=null;
+ String out=null;
+
+ long maxReads=0;
+ int minlen=100;
+ int maxlen=100;
+
+ int minInsLen=1;
+ int minSubLen=2;
+ int minDelLen=1;
+ int minNLen=1;
+
+ int maxInsLen=12;
+ int maxSubLen=12;
+ int maxDelLen=400;
+ int maxNLen=1;
+
+ int minChrom=-1;
+ int maxChrom=-1;
+
+ int maxSnps=3;
+ int maxInss=2;
+ int maxDels=2;
+ int maxSubs=2;
+ int maxNs=0;
+
+ float snpRate=0;
+ float insRate=0;
+ float delRate=0;
+ float subRate=0;
+ float nRate=0;
+ PERFECT_READ_RATIO=0;
+
+// float snpRate=0.4f;
+// float insRate=0.2f;
+// float delRate=0.2f;
+// float subRate=0.2f;
+// float nRate=0.2f;
+// PERFECT_READ_RATIO=0.5f;
+
+ String pbadapter=null;
+ String fragadapter1=null;
+ String fragadapter2=null;
+
+ long seed2=Long.MIN_VALUE;
+
+ int minQuality=28;
+ int midQuality=32;
+ int maxQuality=36;
+
+ int minInsert=-1, maxInsert=-1, insertDev=-1, insert=-1;
+
+ boolean paired=false;
+ String prefix_=null;
+
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+ for(int i=0; i<args.length; i++){
+// assert(s.contains("=")) : "All arguments must be of the form word=number, e.g., reads=10000";
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ assert(split.length<=2);
+// assert(split.length==2);
+ final String a=split[0].toLowerCase();
+ final String b=(split.length<2 ? "true" : split[1]);
+
+ int x=-1;
+ try {
+ x=Integer.parseInt(b);
+ } catch (NumberFormatException e) {}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(a.equals("simplenames") || a.equals("simple") || a.equals("tagsimple")){
+ FASTQ.TAG_CUSTOM_SIMPLE=Tools.parseBoolean(b);
+ }else if(a.equals("reads")){
+ maxReads=(int)Tools.parseKMG(b);
+ }else if(a.equals("len") || a.equals("length") || a.equals("readlen") || a.equals("readlength")){
+ minlen=maxlen=x;
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.startsWith("minlen")){
+ minlen=x;
+ maxlen=Tools.max(minlen, maxlen);
+ }else if(a.startsWith("maxlen")){
+ maxlen=x;
+ minlen=Tools.min(minlen, maxlen);
+ }else if(a.equals("pbadapter") || a.equals("pacbioadapter")){
+ pbadapter=b;
+ }else if(a.equals("fragadapter") || a.equals("fragadapter1")){
+ fragadapter1=b;
+ }else if(a.equals("fragadapter2")){
+ fragadapter2=b;
+ }else if(a.equals("amp")){
+ AMP=Integer.parseInt(b);
+ }else if(a.equals("slashes") || a.equals("addslashes") || a.equals("slash") || a.equals("addslash") || a.equals("addpairnum") || a.equals("pairnum")){
+ FASTQ.ADD_PAIRNUM_TO_CUSTOM_ID=Tools.parseBoolean(b);
+ }else if(a.equals("snprate")){
+ snpRate=Float.parseFloat(b);
+ }else if(a.equals("subrate")){
+ subRate=Float.parseFloat(b);
+ }else if(a.equals("delrate")){
+ delRate=Float.parseFloat(b);
+ }else if(a.equals("insrate")){
+ insRate=Float.parseFloat(b);
+ }else if(a.equals("nrate")){
+ nRate=Float.parseFloat(b);
+ }else if(a.equals("maxsnps")){
+ maxSnps=Integer.parseInt(b);
+ }else if(a.equals("maxdels")){
+ maxDels=Integer.parseInt(b);
+ }else if(a.equals("maxsubs")){
+ maxSubs=Integer.parseInt(b);
+ }else if(a.equals("maxinss") || a.equals("maxins")){
+ maxInss=Integer.parseInt(b);
+ }else if(a.equals("banns")){
+ BAN_NS=Tools.parseBoolean(b);
+ }else if(a.equals("maxns")){
+ maxNs=Integer.parseInt(b);
+ }else if(a.startsWith("maxdellen")){
+ maxDelLen=Integer.parseInt(b);
+ }else if(a.startsWith("maxsublen")){
+ maxSubLen=Integer.parseInt(b);
+ }else if(a.startsWith("maxinslen")){
+ maxInsLen=Integer.parseInt(b);
+ }else if(a.startsWith("maxnlen")){
+ maxNLen=Integer.parseInt(b);
+ }else if(a.startsWith("mindellen")){
+ minDelLen=Integer.parseInt(b);
+ }else if(a.startsWith("minsublen")){
+ minSubLen=Integer.parseInt(b);
+ }else if(a.startsWith("mininslen")){
+ minInsLen=Integer.parseInt(b);
+ }else if(a.startsWith("minnlen")){
+ minNLen=Integer.parseInt(b);
+ }else if(a.equals("fastawrap")){
+ Shared.FASTA_WRAP=Integer.parseInt(b);
+ }else if(a.startsWith("seed")){
+ seed2=Long.parseLong(b);
+ }else if(a.equals("ref") || a.equals("reference")){
+ ref=b;
+ }else if(a.equals("path")){
+ Data.setPath(b);
+ }else if(a.equals("nodisk")){
+ assert(false) : "'nodisk' has not been implemented; please remove that flag.";
+ RefToIndex.NODISK=NODISK=Tools.parseBoolean(b);
+ }else if(a.equals("s") || a.startsWith("snp")){
+ maxSnps=x;
+ snpRate=1;
+ }else if(a.equals("i") || a.startsWith("ins")){
+ maxInss=(x>0 ? 1 : 0);
+ maxInsLen=x;
+ insRate=1;
+ }else if(a.equals("d") || a.startsWith("del")){
+ maxDels=(x>0 ? 1 : 0);
+ maxDelLen=x;
+ delRate=1;
+ }else if(a.equals("u") || a.startsWith("sub")){
+ maxSubs=(x>0 ? 1 : 0);
+ maxSubLen=x;
+ subRate=1;
+ }else if(a.equals("n")){
+ maxNs=x;
+ nRate=1;
+ minNLen=maxNLen=1;
+ }else if(a.startsWith("minchrom")){
+ minChrom=x;
+ }else if(a.equals("int") || a.equals("interleaved") || a.equals("interleave")){
+ OUTPUT_INTERLEAVED=Tools.parseBoolean(b);
+ if(OUTPUT_INTERLEAVED){paired=true;}
+ }else if(a.equals("biasedsnps")){
+ BIASED_SNPS=Tools.parseBoolean(b);
+ }else if(a.startsWith("maxchrom")){
+ maxChrom=x;
+ }else if(a.startsWith("build") || a.startsWith("genome")){
+ build=x;
+// assert(false) : "Set genome to "+x;
+ }else if(a.startsWith("minq")){
+ minQuality=x;
+ midQuality=Tools.max(midQuality, minQuality);
+ maxQuality=Tools.max(maxQuality, minQuality);
+ }else if(a.startsWith("midq")){
+ midQuality=x;
+ }else if(a.startsWith("maxq")){
+ maxQuality=x;
+ midQuality=Tools.min(midQuality, maxQuality);
+ minQuality=Tools.min(minQuality, maxQuality);
+ }else if(a.equals("q")){
+ minQuality=midQuality=maxQuality=x;
+ }else if(a.equals("qv") || a.equals("variance") || a.equals("qvariance")){
+ qVariance=x;
+ }else if(a.equals("mininsert")){
+ minInsert=x;
+ }else if(a.equals("maxinsert")){
+ maxInsert=x;
+ }else if(a.startsWith("minmid")){
+ mateMiddleMin=x;
+ }else if(a.startsWith("maxmid")){
+ mateMiddleMax=x;
+ }else if(a.startsWith("paired")){
+ paired=Tools.parseBoolean(b);
+ }else if(a.startsWith("superflat")){
+ SUPERFLAT_DIST=Tools.parseBoolean(b);
+ }else if(a.startsWith("exponential")){
+ if(b==null){EXP_DIST=true;}
+ else{
+ char c=b.charAt(0);
+ if(Character.isDigit(c) || c=='.'){
+ EXP_DIST=true;
+ EXP_LAMDA=Double.parseDouble(b);
+ }else{
+ EXP_DIST=Tools.parseBoolean(b);
+ }
+ }
+ }else if(a.startsWith("triang")){
+ if(Tools.parseBoolean(b)){
+ SUPERFLAT_DIST=FLAT_DIST=BELL_DIST=false;
+ }
+ }else if(a.startsWith("flat")){
+ FLAT_DIST=Tools.parseBoolean(b);
+ }else if(a.startsWith("bell") || a.startsWith("gauss") || a.startsWith("round")){
+ BELL_DIST=Tools.parseBoolean(b);
+ }else if(a.equals("illuminanames")){
+ ILLUMINA_NAMES=Tools.parseBoolean(b);
+ }else if(a.startsWith("unique")){
+ USE_UNIQUE_SNPS=Tools.parseBoolean(b);
+ }else if(a.startsWith("adderrors") || a.startsWith("usequality")){
+ ADD_ERRORS_FROM_QUALITY=Tools.parseBoolean(b);
+ }else if(a.equals("pacbio")){
+ if(b!=null && (b.charAt(0)=='.' || Character.isDigit(b.charAt(0)))){
+ pbMinErrorRate=pbMaxErrorRate=Float.parseFloat(b);
+ ADD_PACBIO_ERRORS=pbMinErrorRate>0;
+ }else{
+ ADD_PACBIO_ERRORS=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("pbmin") || a.equals("pbminrate")){
+ pbMinErrorRate=Float.parseFloat(b);
+ }else if(a.equals("pbmax") || a.equals("pbmaxrate")){
+ pbMaxErrorRate=Float.parseFloat(b);
+ }else if(a.startsWith("midpad")){
+ midPad=Integer.parseInt(b);
+ }else if(a.startsWith("randomscaffold")){
+ RANDOM_SCAFFOLD=Tools.parseBoolean(b);
+ }else if(a.startsWith("replacenoref")){
+ REPLACE_NOREF=Tools.parseBoolean(b);
+ }else if(a.equals("out") || a.equals("out1")){
+ out=b;
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("ext") || a.equals("extension")){
+ fileExt=b;
+ if(fileExt==null){fileExt=".fq.gz";}
+ if(!fileExt.startsWith(".")){fileExt="."+fileExt;}
+ }else if(a.equals("perfect")){
+ PERFECT_READ_RATIO=(Tools.parseBoolean(b) ? 1 : Float.parseFloat(b));
+ }else if(a.equals("singlescaffold")){
+ FORCE_SINGLE_SCAFFOLD=Tools.parseBoolean(b);
+ }else if(a.equals("samestrand")){
+ mateSameStrand=Tools.parseBoolean(b);
+ }else if(a.equals("minoverlap") || a.equals("overlap")){
+ MIN_SCAFFOLD_OVERLAP=Integer.parseInt(b);
+ }else if(a.equals("prefix")){
+ prefix_=b;
+ }else if(a.equals("slashspace") || a.equals("spaceslash")){
+ boolean y=Tools.parseBoolean(b);
+ if(y){
+ slash1=" /1";
+ slash2=" /2";
+ }else{
+ slash1="/1";
+ slash2="/2";
+ }
+ }else if(a.equals("in")){
+ in1=(b==null || b.equalsIgnoreCase("null") ? null : b);
+ }else{throw new RuntimeException("Unknown parameter "+args[i]);}
+
+ }
+// assert(false) : OUTPUT_INTERLEAVED;
+ assert(build>=0) : "Please specify a genome.";
+
+
+ if(minInsert>-1){mateMiddleMin=minInsert-2*maxlen;}
+ if(maxInsert>-1){mateMiddleMax=maxInsert-2*minlen;}
+ if(insertDev>-1){
+ mateMiddleDev=insertDev;
+ }else{
+ mateMiddleDev=Tools.absdif(mateMiddleMax, mateMiddleMin)/6;
+ }
+ assert(pbMaxErrorRate>=pbMinErrorRate) : "pbMaxErrorRate must be >= pbMinErrorRate";
+
+ ArrayList<ChromosomeArray> chromlist=null;
+ if(ref!=null){
+ chromlist=writeRef(ref, build);
+ }
+
+ Data.setGenome(build);
+ if(minChrom<1){minChrom=1;}
+ if(maxChrom<1){maxChrom=Data.numChroms;}
+
+ if(chromlist==null){
+ Data.loadChromosomes(minChrom, maxChrom);
+ }else{
+ assert(chromlist.size()==maxChrom-minChrom+1) : chromlist.size()+", "+minChrom+", "+maxChrom;
+ for(ChromosomeArray cha : chromlist){
+ Data.chromosomePlusMatrix[cha.chromosome]=cha;
+ }
+ }
+ if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();}
+
+ if(maxReads<1){
+ Data.sysout.println("No reads to generate; quitting.");
+ return;
+ }
+
+ RandomReads3 rr=(seed2==Long.MIN_VALUE ? new RandomReads3(paired) :
+ new RandomReads3((seed2==-1 ? System.nanoTime() : seed2), paired));
+ rr.prefix=prefix_;
+ if(pbadapter!=null){
+ rr.pbadapter1=pbadapter.getBytes();
+ rr.pbadapter2=AminoAcid.reverseComplementBases(rr.pbadapter1);
+ rr.pbadapter1=rr.pbadapter2; //For PacBio, since adapters never appear in plus configuration
+ }
+ if(fragadapter1!=null){
+ rr.fragadapter1=toAdapters(fragadapter1);
+ rr.fragadapter2=fragadapter2==null ? rr.fragadapter1 : toAdapters(fragadapter2);
+ }
+
+ if(REPLACE_NOREF){
+ for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ final byte[] array=cha.array;
+ final byte n='N';
+ for(int i=1; i<array.length; i++){
+ if(array[i]==n){
+ array[i]=AminoAcid.numberToBase[rr.randyNoref.nextInt()&3];
+ }
+ }
+ }
+ }
+
+ if(PERFECT_READ_RATIO>=1){
+ snpRate=insRate=delRate=subRate=0;
+ maxSnps=maxInss=maxDels=maxSubs=maxNs=0;
+ }
+
+ if(delRate<=0 || maxDelLen<=0 || maxDels<=0){
+ delRate=0;
+ maxDelLen=minDelLen=maxDels=0;
+ }
+ if(insRate<=0 || maxInsLen<=0 || maxInss<=0){
+ insRate=0;
+ maxInsLen=minInsLen=maxInss=0;
+ }
+ if(subRate<=0 || maxSubLen<=0 || maxSubs<=0){
+ subRate=0;
+ maxSubLen=minSubLen=maxSubs=0;
+ }
+ if(snpRate<=0 || maxSnps<=0){
+ snpRate=0;
+ maxSnps=0;
+ }
+ if(nRate<=0 || maxNLen<=0 || maxNs<=0){
+ nRate=0;
+ maxNLen=minNLen=maxNs=0;
+ }
+
+ System.err.println("snpRate="+snpRate+", max="+maxSnps+", unique="+USE_UNIQUE_SNPS);
+ System.err.println("insRate="+insRate+", max="+maxInss+", len=("+minInsLen+"-"+maxInsLen+")");
+ System.err.println("delRate="+delRate+", max="+maxDels+", len=("+minDelLen+"-"+maxDelLen+")");
+ System.err.println("subRate="+subRate+", max="+maxSubs+", len=("+minSubLen+"-"+maxSubLen+")");
+ System.err.println("nRate ="+nRate+", max="+maxNs+", len=("+minNLen+"-"+maxNLen+")");
+ System.err.println("genome="+Data.GENOME_BUILD);
+ System.err.println("PERFECT_READ_RATIO="+PERFECT_READ_RATIO);
+ System.err.println("ADD_ERRORS_FROM_QUALITY="+ADD_ERRORS_FROM_QUALITY);
+ System.err.println("REPLACE_NOREF="+REPLACE_NOREF);
+ System.err.println("paired="+paired);
+ System.err.println("read length="+(minlen==maxlen ? ""+minlen : minlen+"-"+maxlen));
+ System.err.println("reads="+maxReads);
+ if(paired){
+ System.err.println("insert size="+(mateMiddleMin+2*minlen)+"-"+(mateMiddleMax+2*maxlen));
+ }
+
+// assert(false) : OUTPUT_INTERLEAVED;
+ String fname1="reads_B"+Data.GENOME_BUILD+"_"+maxReads+"x"+maxlen+"bp_"
+ +(maxSnps==0 || snpRate==0 ? 0 : maxSnps)+"S_"+(maxInss==0 || insRate==0 ? 0 : +maxInsLen)+"I_"+(maxDels==0 || delRate==0 ? 0 : maxDelLen)+"D_"+
+ (maxSubs==0 || subRate==0 ? 0 : maxSubLen)+"U_"+
+ (maxNs==0 || nRate==0 ? 0 : maxNs)+"N"/*+"_chr"+minChrom+"-"+maxChrom*/+(paired ? (OUTPUT_INTERLEAVED ? "_interleaved" : "_1") : "")+fileExt;
+
+ String fname2=(!paired || OUTPUT_INTERLEAVED) ? null : "reads_B"+Data.GENOME_BUILD+"_"+maxReads+"x"+maxlen+"bp_"
+ +(maxSnps==0 || snpRate==0 ? 0 : maxSnps)+"S_"+(maxInss==0 || insRate==0 ? 0 : +maxInsLen)+"I_"+(maxDels==0 || delRate==0 ? 0 : maxDelLen)+"D_"+
+ (maxSubs==0 || subRate==0 ? 0 : maxSubLen)+"U_"+
+ (maxNs==0 || nRate==0 ? 0 : maxNs)+"N"/*+"_chr"+minChrom+"-"+maxChrom*/+"_2"+fileExt;
+
+ if(out!=null){
+ fname1=out.replaceFirst("#", "1");
+ fname2=(!out.contains("#") || !paired/* || OUTPUT_INTERLEAVED*/) ? null : out.replaceFirst("#", "2");
+ }
+ if(fname2!=null){OUTPUT_INTERLEAVED=false;}
+// assert(false) : out+", "+fname1+", "+fname2;
+ rr.writeRandomReadsX(maxReads, minlen, maxlen,
+ maxSnps, maxInss, maxDels, maxSubs, maxNs,
+ snpRate, insRate, delRate, subRate, nRate,
+ minInsLen, minDelLen, minSubLen, minNLen,
+ maxInsLen, maxDelLen, maxSubLen, maxNLen,
+ minChrom, maxChrom, minQuality, midQuality, maxQuality, fname1, fname2);
+
+ t.stop();
+ Data.sysout.println("Wrote "+fname1);
+ if(fname2!=null){Data.sysout.println("Wrote "+fname2);}
+ Data.sysout.println("Time: \t"+t);
+
+ }
+
+ private static byte[][] toAdapters(String name){
+ String[] split;
+ ArrayList<byte[]> list=new ArrayList<byte[]>();
+ if(new File(name).exists()){
+ split=new String[] {name};
+ }else{
+ split=name.split(",");
+ }
+ for(String s : split){
+ if(new File(s).exists()){
+ ArrayList<Read> reads=FastaReadInputStream.toReads(s);
+ for(Read r : reads){
+ if(r!=null && r.length()>0){list.add(r.bases);}
+ }
+ }else{
+ list.add(s.getBytes());
+ }
+ }
+ return list.toArray(new byte[list.size()][]);
+ }
+
+ private static ArrayList<ChromosomeArray> writeRef(String reference, int build){
+ ArrayList<ChromosomeArray> chromlist=null;
+ if(reference!=null){
+ {
+ File f=new File(reference);
+ if(!f.exists() || !f.isFile() || !f.canRead()){throw new RuntimeException("Cannot read file "+f.getAbsolutePath());}
+ }
+ {
+ String s=align2.IndexMaker4.fname(1, 1, 13, 1);
+ String dir=new File(s).getParent();
+ dir=dir.replace('\\', '/');
+ dir=dir.replace("ref/index/", "ref/genome/");
+ String sf=dir+"/summary.txt";
+ if(!NODISK && new File(sf).exists() && SummaryFile.compare(sf, reference)){
+ //do nothing
+ System.err.println("NOTE:\tIgnoring reference file because it already appears to have been processed.");
+ System.err.println("NOTE:\tIf you wish to regenerate the index, please manually delete "+dir+"/summary.txt");
+ return null;
+ }
+ File f=new File(dir);
+ if(f.exists()){
+ File[] f2=f.listFiles();
+ if(f2!=null && f2.length>0){
+ if(overwrite){
+ Data.sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+overwrite);
+ for(File f3 : f2){
+ if(f3.isFile()){
+ String f3n=f3.getName();
+ if((f3n.contains(".chrom") || f3n.endsWith(".txt") || f3n.endsWith(".txt.gz")) && !f3n.endsWith("list.txt")){
+ f3.delete();
+ }
+ }
+ }
+ }else{
+ Data.sysout.println(Arrays.toString(f2));
+ throw new RuntimeException("\nThere is already a reference at location '"+f.getAbsolutePath()+"'. " +
+ "Please delete it (and the associated index), or use a different build ID, " +
+ "or remove the 'reference=' parameter from the command line, or set overwrite=true.");
+ }
+ }
+ }
+ dir=dir.replace("ref/genome/", "ref/index/");
+ f=new File(dir);
+ if(f.exists()){
+ File[] f2=f.listFiles();
+ if(f2!=null && f2.length>0){
+ if(overwrite){
+ Data.sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+overwrite);
+ for(File f3 : f2){
+ if(f3.isFile()){f3.delete();}
+ }
+ }else{
+ throw new RuntimeException("\nThere is already an index at location '"+f.getAbsolutePath()+"'. " +
+ "Please delete it, or use a different build ID, or remove the 'reference=' parameter from the command line.");
+ }
+ }
+ }
+ }
+
+ Data.sysout.println("Writing reference.");
+
+ int oldzl=ReadWrite.ZIPLEVEL;
+ ReadWrite.ZIPLEVEL=Tools.max(4, ReadWrite.ZIPLEVEL);
+
+ int minScaf=-1;
+ int maxChromLen=-1;
+ boolean genScaffoldInfo=true;
+
+ maxChromLen=maxChromLen>0 ? maxChromLen : FastaToChromArrays2.MAX_LENGTH;
+ minScaf=minScaf>-1 ? minScaf : FastaToChromArrays2.MIN_SCAFFOLD;
+ midPad=midPad>-1 ? midPad : FastaToChromArrays2.MID_PADDING;
+ String[] ftcaArgs=new String[] {reference, ""+build, "writeinthread=false", "genscaffoldinfo="+genScaffoldInfo, "retain", "waitforwriting=false",
+ "gz="+(Data.CHROMGZ), "maxlen="+maxChromLen,
+ "writechroms="+(!NODISK), "minscaf="+minScaf, "midpad="+midPad, "nodisk="+NODISK};
+
+ chromlist=FastaToChromArrays2.main2(ftcaArgs);
+
+ ReadWrite.ZIPLEVEL=oldzl;
+ }
+ return chromlist;
+ }
+
+
+ /**
+  * Creates a generator using the seed returned by getSeed().
+  * Delegates to the (seed, paired_) constructor.
+  * @param paired_ Passed through unchanged to the two-argument constructor
+  */
+ public RandomReads3(boolean paired_){
+ this(getSeed(), paired_);
+ }
+
+ /**
+  * Creates a generator whose random streams are all derived from a single seed.
+  * Every stream is a separate java.util.Random seeded with seed plus a fixed offset.
+  * Mate streams exist only when paired_ is true, and the no-ref stream only when
+  * REPLACE_NOREF is set; otherwise those fields are null.
+  * The randomChrom table is filled on first use, with the null check repeated
+  * inside a block synchronized on getClass().
+  * @param seed Base seed for all streams
+  * @param paired_ Stored in the paired field; controls creation of the mate streams
+  */
+ public RandomReads3(long seed, boolean paired_){
+ 	if(randomChrom==null){
+ 		synchronized(getClass()){
+ 			if(randomChrom==null){randomChrom=fillRandomChrom();}
+ 		}
+ 	}
+
+ 	paired=paired_;
+
+ 	//Streams used for every read
+ 	randy=new Random(seed+1);
+ 	randy2=new Random(seed+2);
+ 	randyMutationType=new Random(seed+3);
+ 	randyQual=new Random(seed+5);
+ 	randyPerfectRead=new Random(seed+20);
+ 	randyLength=new Random(seed+21);
+ 	randyAmp=new Random(seed+22);
+ 	randyAdapter=new Random(seed+25);
+
+ 	//Streams used only for the second read of a pair
+ 	randyMate=(paired_ ? new Random(seed+6) : null);
+ 	randy2Mate=(paired_ ? new Random(seed+7) : null);
+ 	randyMutationTypeMate=(paired_ ? new Random(seed+8) : null);
+ 	randyQualMate=(paired_ ? new Random(seed+10) : null);
+ 	randyAdapterMate=(paired_ ? new Random(seed+30) : null);
+
+ 	//Stream used only when no-ref bases are being replaced
+ 	randyNoref=(REPLACE_NOREF ? new Random(seed+31) : null);
+ }
+
+ /**
+  * Applies quality-based errors over the whole read, i.e. positions 0 (inclusive) to r.length() (exclusive).
+  * Note that the parameter named randy shadows the field of the same name.
+  * @param r Read to modify in place
+  * @param randy Random source passed to the ranged overload
+  */
+ private final void addErrorsFromQuality(Read r, Random randy){
+ addErrorsFromQuality(r, randy, 0, r.length());
+ }
+
+ /**
+  * Applies quality-based substitution errors to positions [from, to) of a read, in place.
+  * For each fully defined base, one float is drawn and compared with QualityTools.PROB_ERROR[q],
+  * where q is the quality at that position (30 when the read has no quality array).
+  * When an error is triggered, the base is replaced by one of the three OTHER bases.
+  * The replacement offset was previously rand.nextInt(3), which is 0 a third of the time and
+  * so left the base unchanged; it is now rand.nextInt(3)+1, matching addSNP.
+  * @param r Read whose bases are modified
+  * @param rand Random source
+  * @param from First position to consider, inclusive
+  * @param to Last position to consider, exclusive
+  */
+ private final void addErrorsFromQuality(Read r, Random rand, final int from, final int to){
+ 	final byte[] quals=r.quality, bases=r.bases;
+ 	for(int i=from; i<to; i++){
+ 		final byte q=(quals==null ? 30 : quals[i]);
+ 		if(AminoAcid.isFullyDefined(bases[i]) && rand.nextFloat()<QualityTools.PROB_ERROR[q]){
+ 			final int old=AminoAcid.baseToNumber[bases[i]];
+ 			//An offset of 1..3 (mod 4) always gives a different base
+ 			bases[i]=AminoAcid.numberToBase[(old+rand.nextInt(3)+1)%4];
+ 		}
+ 	}
+ }
+
+ /**
+  * Overwrites the tail of a read, starting at loc, with a randomly chosen adapter, in place.
+  * Positions [loc, loc+adapter.length) receive adapter bases, and any remaining positions up to
+  * the end of the read receive random bases. Bases that are not fully defined are left untouched.
+  * If ADD_ERRORS_FROM_QUALITY is set, quality-based errors are then applied from loc to the end.
+  * The read is left unmodified when it has no bases or when loc lies outside [0, length).
+  * Previously the quality-error pass ran even in that case, which dereferenced a null base
+  * array or indexed from a negative position.
+  * The adapter is still drawn first in every case, so the random stream advances as before.
+  * @param r Read to modify
+  * @param loc Position in the read at which the adapter begins
+  * @param adapters Candidate adapter sequences; one is chosen uniformly
+  * @param rand Random source
+  */
+ public void addFragAdapter(Read r, final int loc, final byte[][] adapters, final Random rand){
+ 	final byte[] bases=r.bases;
+ 	final int initial=(bases==null ? 0 : bases.length);
+ 	final byte[] adapter=adapters[rand.nextInt(adapters.length)];
+
+ 	if(initial<1 || loc<0 || loc>=initial){return;}
+
+ 	final int lim=Tools.min(initial, adapter.length+loc);
+ 	for(int i=loc, j=0; i<lim; i++, j++){
+ 		if(AminoAcid.isFullyDefined(bases[i])){bases[i]=adapter[j];}
+ 	}
+ 	//Past the end of the adapter, fill with random sequence
+ 	for(int i=lim; i<initial; i++){
+ 		if(AminoAcid.isFullyDefined(bases[i])){
+ 			bases[i]=AminoAcid.numberToBase[rand.nextInt(4)];
+ 		}
+ 	}
+
+ 	if(ADD_ERRORS_FROM_QUALITY){
+ 		addErrorsFromQuality(r, rand, loc, initial);
+ 	}
+ }
+
+ /**
+  * Stamps an adapter sequence over a read at a random position, in place.
+  * One integer is drawn below max((readlen+1)/2, readlen-30-adapter.length) and shifted left by
+  * half the adapter length to give the start position. Adapter bases that would fall outside
+  * the array are skipped.
+  * @param bases Read bases; modified and returned
+  * @param locs Not used by this method
+  * @param readlen Read length; asserted to be at most bases.length
+  * @param rand Random source
+  * @param adapter Adapter sequence to write
+  * @return The same bases array
+  */
+ public byte[] addPBAdapter(byte[] bases, int[] locs, int readlen, Random rand, byte[] adapter){
+ 	assert(readlen<=bases.length);
+ 	final int range=Tools.max((readlen+1)/2, readlen-30-adapter.length);
+ 	final int start=rand.nextInt(range)-adapter.length/2;
+ 	for(int a=0; a<adapter.length; a++){
+ 		final int pos=start+a;
+ 		if(pos<0 || pos>=bases.length){continue;}
+ 		bases[pos]=adapter[a];
+ 	}
+ 	return bases;
+ }
+
+ /**
+  * Replaces one randomly chosen base within the first readlen positions with a different base, in place.
+  * A base whose AminoAcid.baseToNumber value is negative is treated as number 0.
+  * When BIASED_SNPS is set, one extra draw decides (2 times in 3) to use the number XOR 3;
+  * otherwise one of the three other numbers is chosen uniformly.
+  * @param bases Read bases; modified and returned
+  * @param locs Not used by this method
+  * @param readlen Number of leading positions eligible for the SNP
+  * @param rand Random source
+  * @return The same bases array
+  */
+ public byte[] addSNP(byte[] bases, int[] locs, int readlen, Random rand){
+ 	assert(readlen<=bases.length);
+ 	final int pos=rand.nextInt(readlen);
+ 	byte prior=AminoAcid.baseToNumber[bases[pos]];
+ 	if(prior<0){prior=0;}
+
+ 	final int replacement;
+ 	if(!BIASED_SNPS || rand.nextInt(3)<=0){
+ 		replacement=(prior+rand.nextInt(3)+1)%4;
+ 	}else{
+ 		replacement=(prior^3);
+ 	}
+ 	assert(replacement>=0 && replacement<=3 && replacement!=prior);
+ 	bases[pos]=AminoAcid.numberToBase[replacement];
+ 	return bases;
+ }
+
+ /**
+  * Replaces one randomly chosen base with a different base, in place, never reusing a position
+  * already marked in bits. The chosen position is then marked in bits.
+  * If every position in [0, readlen) is already marked, the read is returned unchanged and
+  * nothing is drawn; previously the position search looped forever in that situation.
+  * A base whose AminoAcid.baseToNumber value is negative is treated as number 0.
+  * When BIASED_SNPS is set, one extra draw decides (2 times in 3) to use the number XOR 3;
+  * otherwise one of the three other numbers is chosen uniformly.
+  * @param bases Read bases; modified and returned
+  * @param locs Not used by this method
+  * @param readlen Number of leading positions eligible for the SNP
+  * @param rand Random source
+  * @param bits Positions already used; updated with the chosen position
+  * @return The same bases array
+  */
+ public byte[] addSNP(byte[] bases, int[] locs, int readlen, Random rand, BitSet bits){
+ 	assert(readlen<=bases.length);
+
+ 	//No free position left, so rejection sampling below could never finish
+ 	if(bits.nextClearBit(0)>=readlen){return bases;}
+
+ 	int index=rand.nextInt(readlen);
+ 	while(bits.get(index)){
+ 		index=rand.nextInt(readlen);
+ 	}
+ 	bits.set(index);
+
+ 	byte old=bases[index];
+ 	byte oldNum=AminoAcid.baseToNumber[old];
+ 	if(oldNum<0){oldNum=0;}
+ 	int num;
+ 	if(BIASED_SNPS && rand.nextInt(3)>0){
+ 		num=(oldNum^3);
+ 	}else{
+ 		num=(oldNum+rand.nextInt(3)+1)%4;
+ 	}
+ 	assert(num>=0 && num<=3 && num!=oldNum) : num+", "+oldNum;
+ 	bases[index]=AminoAcid.numberToBase[num];
+ 	return bases;
+ }
+
+
+ /**
+  * Applies a multi-base substitution to a random window of the read, in place.
+  * The window length is minlen plus the minimum of three uniform draws in [0, maxlen-minlen],
+  * which biases it toward shorter lengths. The window start is uniform over positions that keep
+  * the whole window inside the first readlen bases.
+  * The first and last base of the window are always changed to a different base, so the
+  * substitution really spans the chosen length. Interior bases are redrawn freely and may stay
+  * the same. Bases that are not fully defined are left untouched.
+  * Previously the two rules were swapped (endpoints could stay unchanged while interior bases
+  * were forced to change), and for length 1 the single base was mutated twice.
+  * Output for a given seed therefore differs from the earlier implementation.
+  * @param bases Read bases; modified and returned
+  * @param locs Not used by this method
+  * @param minlen Minimum substitution length; asserted to be at least 1
+  * @param maxlen Maximum substitution length; asserted to be at least minlen
+  * @param readlen Number of leading positions in which the window may fall
+  * @param rand Random source
+  * @return The same bases array
+  */
+ public byte[] addSUB(byte[] bases, int[] locs, int minlen, int maxlen, int readlen, Random rand){
+ 	assert(readlen<=bases.length) : readlen+", "+bases.length;
+ 	assert(minlen>=1);
+ 	assert(maxlen>=minlen);
+
+ 	final int span=maxlen-minlen+1;
+ 	final int len=minlen+Tools.min(rand.nextInt(span), rand.nextInt(span), rand.nextInt(span));
+
+ 	assert(len>=minlen);
+ 	assert(len<=maxlen);
+
+ 	final int index=rand.nextInt(readlen-len+1);
+ 	final int lim=index+len-1;
+
+ 	for(int i=index; i<=lim; i++){
+ 		final byte old=bases[i];
+ 		if(!AminoAcid.isFullyDefined(old)){continue;}
+ 		final byte oldNum=AminoAcid.baseToNumber[old];
+ 		final boolean endpoint=(i==index || i==lim);
+ 		//Endpoints: offset 1..3 guarantees a change.  Interior: offset 0..3 allows any base.
+ 		final int num=(endpoint ? (oldNum+rand.nextInt(3)+1)%4 : (oldNum+rand.nextInt(4))%4);
+ 		assert(num>=0 && num<=3 && (!endpoint || num!=oldNum));
+ 		bases[i]=AminoAcid.numberToBase[num];
+ 	}
+ 	return bases;
+ }
+
+
+ /**
+  * Overwrites a random window of the read with 'N', in place.
+  * The window length is minlen plus the minimum of three uniform draws in [0, maxlen-minlen].
+  * When bits is non-null, a start position already marked in bits is redrawn up to 40 times,
+  * and the final start position is marked whether or not it was free.
+  * @param bases Read bases; modified and returned
+  * @param locs Not used by this method
+  * @param minlen Minimum window length; asserted to be at least 1
+  * @param maxlen Maximum window length; asserted to be at least minlen
+  * @param readlen Number of leading positions in which the window may fall
+  * @param rand Random source
+  * @param bits Start positions already used, or null to skip that check
+  * @return The same bases array
+  */
+ public byte[] addN(byte[] bases, int[] locs, int minlen, int maxlen, int readlen, Random rand, BitSet bits){
+ 	assert(readlen<=bases.length) : readlen+", "+bases.length;
+ 	assert(minlen>=1);
+ 	assert(maxlen>=minlen);
+
+ 	final int span=maxlen-minlen+1;
+ 	final int len=minlen+Tools.min(rand.nextInt(span), rand.nextInt(span), rand.nextInt(span));
+
+ 	assert(len>=minlen);
+ 	assert(len<=maxlen);
+
+ 	final int starts=readlen-len+1;
+ 	int first=rand.nextInt(starts);
+ 	if(bits!=null){
+ 		for(int trials=40; bits.get(first) && trials>0; trials--){
+ 			first=rand.nextInt(starts);
+ 		}
+ 		bits.set(first);
+ 	}
+
+ 	for(int pos=first, end=first+len; pos<end; pos++){
+ 		bases[pos]='N';
+ 	}
+ 	return bases;
+ }
+
+ /**
+  * Inserts a run of random bases at a random position and returns a new, longer array.
+  * The drawn length is minlen plus the minimum of three uniform draws in [0, maxlen-minlen],
+  * then capped at readlen-dif[1]-2. If the capped length is below 1 the input is returned as is.
+  * Otherwise dif[0] is decreased and dif[1] increased by the length, and the bases at and after
+  * the insertion point are shifted right. locs is shifted the same way wherever the destination
+  * index fits, and the inserted positions are set to -1 in locs.
+  * @param bases Original read bases; not modified
+  * @param locs Per-position reference locations; modified in place
+  * @param minlen Minimum insertion length; asserted to be positive
+  * @param maxlen Maximum insertion length; asserted to be at least minlen
+  * @param readlen Read length; asserted to be at most bases.length
+  * @param dif Two-element accumulator updated as described above
+  * @param rand Random source
+  * @return A new array containing the insertion, or bases itself if nothing was inserted
+  */
+ public byte[] addInsertion(byte[] bases, int[] locs, int minlen, int maxlen, int readlen, int[] dif, Random rand){
+ 	assert(readlen<=bases.length) : readlen+", "+bases.length;
+ 	assert(minlen>0);
+ 	assert(maxlen>=minlen);
+
+ 	final int span=maxlen-minlen+1;
+ 	final int drawn=minlen+Tools.min(rand.nextInt(span), rand.nextInt(span), rand.nextInt(span));
+ 	final int len=Tools.min(drawn, readlen-dif[1]-2);
+ 	if(len<1){return bases;}
+
+ 	dif[0]-=len;
+ 	dif[1]+=len;
+
+ 	//Choosing the start below readlen-len+1 keeps every inserted base within the read
+ 	final int at=rand.nextInt(readlen-len+1);
+
+ 	final byte[] grown=new byte[bases.length+len];
+ 	System.arraycopy(bases, 0, grown, 0, at);
+
+ 	//Shift the suffix right by len, walking backward so locs is not overwritten before it is read
+ 	for(int src=bases.length-1; src>=at; src--){
+ 		final int dst=src+len;
+ 		if(dst<locs.length){locs[dst]=locs[src];}
+ 		grown[dst]=bases[src];
+ 	}
+
+ 	//Fill the gap with random bases
+ 	for(int pos=at, end=at+len; pos<end; pos++){
+ 		grown[pos]=AminoAcid.numberToBase[rand.nextInt(4)];
+ 		locs[pos]=-1;
+ 	}
+
+ 	return grown;
+ }
+
+ /**
+  * Draws a length for each of dels deletions.
+  * Each length is minlen plus the smaller of two uniform draws in [0, maxlen-minlen].
+  * @param dels Number of deletions
+  * @param minlen Minimum deletion length; asserted to be positive
+  * @param maxlen Maximum deletion length; asserted to be at least minlen
+  * @param rand Random source
+  * @return Array of dels lengths, or null when dels is less than 1
+  */
+ public int[] makeDelsa(int dels, int minlen, int maxlen, Random rand){
+ 	if(dels<1){return null;}
+ 	assert(minlen>0);
+ 	assert(maxlen>=minlen);
+
+ 	final int span=maxlen-minlen+1;
+ 	final int[] lengths=new int[dels];
+ 	int slot=0;
+ 	while(slot<lengths.length){
+ 		final int first=rand.nextInt(span);
+ 		final int second=rand.nextInt(span);
+ 		lengths[slot]=minlen+Tools.min(first, second);
+ 		slot++;
+ 	}
+ 	return lengths;
+ }
+
+ /**
+  * Removes len bases at a random position and returns a new, shorter array.
+  * dif[0] is increased by len. The cut position is drawn from [1, readlen-1], so the first base
+  * is never deleted. Entries of locs from the cut onward are shifted left by len, in place.
+  * @param bases Original bases; asserted to hold at least readlen+len elements; not modified
+  * @param locs Per-position reference locations; modified in place
+  * @param len Number of bases to delete; asserted to be positive
+  * @param readlen Read length used to bound the cut position
+  * @param dif Accumulator; element 0 is increased by len
+  * @param rand Random source
+  * @return A new array of length bases.length-len
+  */
+ public byte[] addDeletion(byte[] bases, int[] locs, int len, int readlen, int[] dif, Random rand){
+ 	assert(bases.length>=readlen+len) : "bases.len="+bases.length+", readlen="+readlen+", len="+len+", dif="+Arrays.toString(dif);
+ 	assert(len>0);
+
+ 	dif[0]+=len;
+
+ 	//Starting at 1 means the first base is never the one removed
+ 	final int cut=rand.nextInt(readlen-1)+1;
+
+ 	final byte[] shrunk=new byte[bases.length-len];
+ 	int dst=0;
+ 	while(dst<cut){
+ 		shrunk[dst]=bases[dst];
+ 		dst++;
+ 	}
+ 	for(int src=cut+len; dst<shrunk.length; dst++, src++){
+ 		shrunk[dst]=bases[src];
+ 		locs[dst]=locs[src];
+ 	}
+
+ 	return shrunk;
+ }
+
+ /**
+  * Chooses a chromosome for a read.
+  * When r0 is given, its chromosome is returned. Otherwise entries of the randomChrom table are
+  * sampled with randy until one lies within [minChrom, maxChrom].
+  * @param r0 Read whose chromosome should be reused, or null
+  * @param minChrom Lowest acceptable chromosome
+  * @param maxChrom Highest acceptable chromosome
+  * @return The chosen chromosome number
+  */
+ public int randomChrom(Read r0, int minChrom, int maxChrom){
+ 	if(r0!=null){return r0.chrom;}
+
+ 	assert(minChrom<=maxChrom) : minChrom+", "+maxChrom;
+
+ 	int picked=-1;
+ 	while(picked<minChrom || picked>maxChrom){
+ 		//Clear the sign bit so the modulus is never negative
+ 		final int nonNegative=randy.nextInt()&0x7FFFFFFF;
+ 		picked=randomChrom[nonNegative%randomChrom.length];
+ 	}
+ 	return picked;
+ }
+
+ /**
+  * Chooses a strand for a read.
+  * Without r0, the low bit of one randy draw is returned. With r0, the result is r0's strand
+  * when sameStrandMate is true and that strand XOR 1 otherwise.
+  * @param r0 Read to derive the strand from, or null
+  * @param minChrom Not used by this method
+  * @param maxChrom Not used by this method
+  * @param sameStrandMate Whether to reuse r0's strand rather than flip it
+  * @return The chosen strand
+  */
+ public int randomStrand(Read r0, int minChrom, int maxChrom, boolean sameStrandMate){
+ 	if(r0==null){
+ 		return randy.nextInt()&1;
+ 	}
+ 	final int primary=r0.strand();
+ 	return sameStrandMate ? primary : primary^1;
+ }
+
+ /**
+  * Chooses a start location for a read.
+  * Uses randomLocSingle when r0 is null and randomLocPaired otherwise.
+  * @param r0 Read to place this one relative to, or null
+  * @param chrom Chromosome to place the read on
+  * @param readlen Read length
+  * @param minMiddle Passed to randomLocPaired
+  * @param maxMiddle Passed to randomLocPaired
+  * @param strand Passed to randomLocPaired
+  * @return The value returned by the selected method
+  */
+ public int randomLoc(Read r0, int chrom, int readlen, int minMiddle, int maxMiddle, int strand){
+ 	if(r0==null){
+ 		return randomLocSingle(chrom, readlen);
+ 	}
+ 	return randomLocPaired(r0, chrom, readlen, minMiddle, maxMiddle, strand);
+ }
+
+ /**
+  * Chooses a start location for a read relative to the already-placed read r0.
+  * A "middle" distance within [minMiddle, maxMiddle] is chosen by the first active mode:
+  * <ul>
+  * <li>SUPERFLAT_DIST: r0.numericID modulo the range width, with no random draw.</li>
+  * <li>FLAT_DIST: one uniform draw.</li>
+  * <li>BELL_DIST: Gaussian draws (mean 0, stdev 1) scaled by mateMiddleDev around the range
+  * midpoint, repeated until the value falls inside the range.</li>
+  * <li>Otherwise: the average of two uniform draws.</li>
+  * </ul>
+  * When EXP_DIST is set the middle is then rescaled using an exponential draw that is repeated
+  * until it is at most 64. That draw uses randy, while the others use randyMate.
+  * @param r0 The read already placed; asserted non-null
+  * @param chrom Not used by this method
+  * @param readlen Length of the read being placed
+  * @param minMiddle Smallest allowed middle
+  * @param maxMiddle Largest allowed middle
+  * @param strand Not used by this method
+  * @return r0.stop+middle+1 when r0 is on Gene.PLUS, otherwise r0.start-middle-readlen
+  */
+ public int randomLocPaired(Read r0, int chrom, int readlen, int minMiddle, int maxMiddle, int strand){
+ 	assert(r0!=null);
+
+ 	final int midRange=maxMiddle-minMiddle+1;
+ 	final int middle0=(maxMiddle+minMiddle)/2;
+ 	int middle;
+ 	if(SUPERFLAT_DIST){
+ 		middle=((int)(r0.numericID%midRange))+minMiddle;
+ 	}else if(FLAT_DIST){
+ 		middle=randyMate.nextInt(midRange)+minMiddle;
+ 	}else if(BELL_DIST){
+ 		//Redraw until the value lands inside the allowed range
+ 		do{
+ 			final double g=randyMate.nextGaussian();
+ 			middle=(int)Math.round((g*mateMiddleDev)+middle0);
+ 		}while(middle<minMiddle || middle>maxMiddle);
+ 	}else{
+ 		middle=(randyMate.nextInt(midRange)+randyMate.nextInt(midRange))/2+minMiddle;
+ 	}
+
+ 	if(EXP_DIST){
+ 		double d;
+ 		do{
+ 			d=Tools.exponential(randy, EXP_LAMDA);
+ 		}while(d>64);
+ 		middle=(int)Math.round((middle+readlen*2)*(0.4*(d*1.4+0.2)+0.6)-(readlen*2));
+ 	}
+
+ 	return (r0.strand()==Gene.PLUS) ? r0.stop+middle+1 : r0.start-middle-readlen;
+ }
+
+ /**
+  * Chooses a start location on a chromosome such that the read covers only fully defined bases.
+  * Up to 24 candidate positions are drawn from [0, cha.maxIndex-readlen); the first whose
+  * readlen bases are all fully defined is returned.
+  * @param chrom Chromosome number
+  * @param readlen Read length
+  * @return A start location, or -1 if readlen is at least maxIndex-minIndex or every attempt failed
+  */
+ public int randomLocSingle(int chrom, int readlen){
+ 	final ChromosomeArray cha=Data.getChromosome(chrom);
+ 	final byte[] array=cha.array;
+ 	if(readlen>=(cha.maxIndex-cha.minIndex)){return -1;}
+
+ 	for(int attempt=0; attempt<24; attempt++){
+ 		final int candidate=randy.nextInt(cha.maxIndex-readlen);
+ 		boolean clean=true;
+ 		for(int offset=0; clean && offset<readlen; offset++){
+ 			clean=AminoAcid.isFullyDefined(array[candidate+offset]);
+ 		}
+ 		if(clean){return candidate;}
+ 	}
+ 	return -1;
+ }
+
+ /**
+  * Chooses a random scaffold on a chromosome and a read start within it.
+  * The scaffold is drawn uniformly from Data.scaffoldLocs[chrom]. If the read is at least as
+  * long as the scaffold, the read is shortened to the scaffold length and starts at the scaffold
+  * start. Otherwise the start is the scaffold start plus a draw below scaflen-readlen.
+  * @param chrom Chromosome number
+  * @param readlen Requested read length
+  * @return A two-element array {start, readlen}, where readlen may have been shortened
+  */
+ public int[] randomScaffoldLoc(int chrom, int readlen){
+ 	final int[] starts=Data.scaffoldLocs[chrom];
+ 	final int[] sizes=Data.scaffoldLengths[chrom];
+
+ 	final int pick=randy.nextInt(starts.length);
+ 	final int scafStart=starts[pick];
+ 	final int scafLen=sizes[pick];
+
+ 	if(readlen>=scafLen){
+ 		return new int[] {scafStart, scafLen};
+ 	}
+ 	final int offset=randy.nextInt(scafLen-readlen);
+ 	return new int[] {scafStart+offset, readlen};
+ }
+
+ /**
+  * Generates numReads reads (or read pairs, when the paired field is set) with makeRead and
+  * prints them through TextStreamWriter to fname1 and, if given, fname2.
+  * When paired and fname2 is null, both mates are printed to the fname1 writer.
+  * Sets the global FASTQ.TAG_CUSTOM to true only when no prefix is set and ILLUMINA_NAMES is false.
+  * The mutation limits, rates, and length bounds are passed unchanged to makeRead.
+  * The read-level base quality and slant are drawn here from randyQual and shared by both mates.
+  * @param numReads Number of reads (or pairs) to emit
+  * @param minlen Minimum read length
+  * @param maxlen Maximum read length
+  * @param minChrom Lowest chromosome to draw from
+  * @param maxChrom Highest chromosome to draw from
+  * @param minQual Minimum quality; asserted to be at least 0 and at most midQual
+  * @param midQual Middle quality; asserted to be at most maxQual
+  * @param maxQual Maximum quality; asserted to be below 60
+  * @param fname1 Output file for read 1 (and read 2 when fname2 is null)
+  * @param fname2 Output file for read 2, or null
+  */
+ public void writeRandomReadsX(long numReads, int minlen, int maxlen,
+ int maxSnps, int maxInss, int maxDels, int maxSubs, int maxNs,
+ float snpRate, float insRate, float delRate, float subRate, float nRate,
+ int minInsLen, int minDelLen, int minSubLen, int minNLen,
+ int maxInsLen, int maxDelLen, int maxSubLen, int maxNLen,
+ int minChrom, int maxChrom,
+ int minQual, int midQual, int maxQual, String fname1, String fname2){
+ FASTQ.TAG_CUSTOM=(prefix==null && !ILLUMINA_NAMES);
+
+ TextStreamWriter tsw1=new TextStreamWriter(fname1, overwrite, false, true);
+ tsw1.start();
+ TextStreamWriter tsw2=null;
+ if(fname2!=null){
+ assert(!fname2.equalsIgnoreCase(fname1));
+ tsw2=new TextStreamWriter(fname2, overwrite, false, true);
+ tsw2.start();
+ }
+
+ assert(minQual<=midQual);
+ assert(midQual<=maxQual);
+ assert(minQual>=0 && maxQual<60);
+
+ //These three copies are only referenced by the commented-out code below.
+ final int maxQualP=maxQual;//Tools.max(35, maxQual);
+ final int midQualP=midQual;//30;
+ final int minQualP=minQual;//Tools.min(25, maxQual);
+
+ //Scratch structures shared by every makeRead call; locs is sized for the longest read plus all deletions, capped at 300000000.
+ final BitSet bits=new BitSet(maxlen+1);
+ final int[] locs=new int[(int)Tools.min(300000000, maxlen+(maxDelLen*(long)maxDels))];
+
+ //State for the AMP clustering mode: lastRead anchors a cluster, ampLevel counts remaining reads in it.
+ Read lastRead=null;
+ int ampLevel=0;
+ int ampLength=2000;
+
+ //NOTE(review): i is an int while numReads is a long; confirm numReads cannot exceed Integer.MAX_VALUE.
+ for(int i=0; i<numReads; i++){
+
+ final boolean perfect=randyPerfectRead.nextFloat()<PERFECT_READ_RATIO;
+
+ final byte baseQuality;
+ final byte slant;
+ {
+// byte baseSlant=(perfect ? (byte)5 : (byte)(maxQual-minQual+1));
+// slant=(byte)((randyQual.nextInt(baseSlant)+randyQual.nextInt(baseSlant)+1)/2);
+// if(randyQual.nextBoolean()){
+// int range=(perfect ? maxQualP-midQualP+1 : maxQual-midQual+1);
+// int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range));
+// baseQuality=(byte)((perfect ? midQualP : midQual)+delta);
+// }else{
+// int range=perfect ? midQualP-minQualP+1 : midQual-minQual+1;
+// int delta=randyQual.nextInt(range);
+// baseQuality=(byte)((perfect ? midQualP : midQual)-delta);
+// }
+
+ //slant is the rounded-up average of two draws over the full quality range.
+ byte baseSlant=(byte)(maxQual-minQual+1);
+ slant=(byte)((randyQual.nextInt(baseSlant)+randyQual.nextInt(baseSlant)+1)/2);
+ //Half the time go above midQual (min of two draws), otherwise below it (single draw).
+ if(randyQual.nextBoolean()){
+ int range=maxQual-midQual+1;
+ int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range));
+ baseQuality=(byte)(midQual+delta);
+ }else{
+ int range=midQual-minQual+1;
+ int delta=randyQual.nextInt(range);
+ baseQuality=(byte)(midQual-delta);
+ }
+ }
+
+ //-1 means the chromosome and location are not forced.
+ int forceChrom=-1, forceLoc=-1;
+
+
+ if(AMP>1 && lastRead!=null){
+ if(ampLevel>0){
+ //Still inside a cluster: place this read within ampLength of lastRead.start.
+ forceChrom=lastRead.chrom;
+ // forceLoc=lastRead.start+4-randyAmp.nextInt(9);
+ // forceLoc=lastRead.start+10-randyAmp.nextInt(21);
+
+ int a=ampLength;
+ int b=a*2+1;
+ //mode selects how many uniform draws are averaged; more draws concentrate the offset near zero.
+ int mode=randyAmp.nextInt(100);
+ if(mode>96){
+ forceLoc=lastRead.start+a-randyAmp.nextInt(b);
+ }else if(mode>85){
+ forceLoc=lastRead.start+a-(randyAmp.nextInt(b)+randyAmp.nextInt(b))/2;
+ }else if(mode>30){
+ forceLoc=lastRead.start+a-(randyAmp.nextInt(b)+randyAmp.nextInt(b)+randyAmp.nextInt(b))/3;
+ }else{
+ forceLoc=lastRead.start+a-(randyAmp.nextInt(b)+randyAmp.nextInt(b)+randyAmp.nextInt(b)+randyAmp.nextInt(b))/4;
+ }
+ }else{
+ //Cluster exhausted: draw the size (ampLevel) and width (ampLength) of the next one.
+
+ ampLevel=0;
+ int a1=AMP;
+ if(randyAmp.nextInt(30)==0){a1*=7;}
+
+ if(randyAmp.nextInt(3)>0){
+ ampLevel=Tools.min(randyAmp.nextInt(a1), randyAmp.nextInt(a1));
+ }else{
+ double log=Math.log10(a1*7);
+ ampLevel=(int)Math.round(Math.pow(10, randyAmp.nextDouble()*log));
+ }
+
+ ampLength=500+randyAmp.nextInt(3001)+(int)Tools.min(1000+randyAmp.nextInt(20000), 400*Tools.exponential(randyAmp, 0.8d));
+
+ }
+ }
+
+
+ Read r1=makeRead(null, minlen, maxlen, minChrom, maxChrom,
+ maxSnps, maxInss, maxDels, maxSubs, maxNs,
+ snpRate, insRate, delRate, subRate, nRate,
+ minInsLen, minDelLen, minSubLen, minNLen,
+ maxInsLen, maxDelLen, maxSubLen, maxNLen,
+ mateMiddleMin, mateMiddleMax, mateSameStrand,
+ minQual, midQual, maxQual, baseQuality, slant,
+ perfect, nextReadID, locs, bits, forceChrom, forceLoc);
+
+// assert(false) : r1;
+ if(paired && r1!=null){
+
+ //Try up to 100 times to generate a mate for r1.
+ Read r2=null;
+ for(int tries=0; r2==null && tries<100; tries++){
+ r2=makeRead(r1, minlen, maxlen, minChrom, maxChrom,
+ maxSnps, maxInss, maxDels, maxSubs, maxNs,
+ snpRate, insRate, delRate, subRate, nRate,
+ minInsLen, minDelLen, minSubLen, minNLen,
+ maxInsLen, maxDelLen, maxSubLen, maxNLen,
+ mateMiddleMin, mateMiddleMax, mateSameStrand,
+ minQual, midQual, maxQual, baseQuality, slant,
+ perfect, nextReadID, locs, bits, -1, -1);
+ }
+
+ if(r2!=null){
+ //Discard the pair when the read midpoints fall on different scaffolds.
+ if(FORCE_SINGLE_SCAFFOLD){
+ int scaf1=Data.scaffoldIndex(r1.chrom, (r1.start+r1.stop)/2);
+ int scaf2=Data.scaffoldIndex(r2.chrom, (r2.start+r2.stop)/2);
+ if(scaf1!=scaf2){
+ r1=r2=null;
+ }
+ }
+ }
+
+ if(r2!=null){
+ r1.mate=r2;
+ r2.mate=r1;
+ if(fragadapter1!=null){
+ r1.setMapped(true);
+ r2.setMapped(true);
+ //Where the insert size x is shorter than a read, the read's bases from position x onward are replaced by adapter.
+ int x=Read.insertSizeMapped(r1, r2, false);
+ if(x>0 && x<r1.length()){
+ addFragAdapter(r1, x, fragadapter1, randyAdapter);
+ }
+ //NOTE(review): unlike the r1 check above, this one does not require x>0 - confirm whether that is intended.
+ if(x<r2.length()){
+ addFragAdapter(r2, x, fragadapter2, randyAdapterMate);
+ }
+ }
+ }else{
+ //No usable mate, so the whole pair is dropped.
+ r1=null;
+ }
+
+// Data.sysout.println(r.strand()+"\t"+r.insertSize());
+ }
+ if(r1!=null){
+ final Read r2=r1.mate;
+// assert(false) : r1;
+ //Both mates are named from r1.numericID, differing only in the slash suffix.
+ if(prefix!=null){r1.id=prefix+"_"+r1.numericID+slash1;}
+ else if(ILLUMINA_NAMES){r1.id=r1.numericID+slash1;}
+ tsw1.println(r1);
+ if(r2!=null){
+ r2.setPairnum(1);
+ if(prefix!=null){r2.id=prefix+"_"+r1.numericID+slash2;}
+ else if(ILLUMINA_NAMES){r2.id=r1.numericID+slash2;}
+ if(tsw2!=null){tsw2.println(r2);}
+ else{tsw1.println(r2);}
+
+ }
+ nextReadID++;
+ }else{
+ //Generation failed; repeat this iteration so that numReads reads are still produced.
+ i--;
+ }
+ ampLevel=Tools.max(0, ampLevel-1);
+ if(ampLevel==0){lastRead=null;}
+
+ //Start a new cluster anchored on this read (r1 may be null here).
+ if(lastRead==null){lastRead=r1;}
+// System.err.println("Made "+r.start+" ~ "+r.stop+" = "+(r.stop-r.start));
+ }
+ //NOTE(review): presumably poison() tells each writer thread to finish; this method does not wait for it - confirm against TextStreamWriter.
+ tsw1.poison();
+ if(tsw2!=null){tsw2.poison();}
+ }
+
+
+
+ public ArrayList<Read> makeRandomReadsX(int numReads, int minlen, int maxlen,
+ int maxSnps, int maxInss, int maxDels, int maxSubs, int maxNs,
+ float snpRate, float insRate, float delRate, float subRate, float nRate,
+ int minInsLen, int minDelLen, int minSubLen, int minNLen,
+ int maxInsLen, int maxDelLen, int maxSubLen, int maxNLen,
+ int minChrom, int maxChrom,
+ int minQual, int midQual, int maxQual){
+ FASTQ.TAG_CUSTOM=(prefix==null && !ILLUMINA_NAMES);
+
+ assert(minQual<=midQual);
+ assert(midQual<=maxQual);
+ assert(minQual>=0 && maxQual<60);
+
+ if(bits_cached==null){bits_cached=new BitSet(maxlen+1);}
+ if(locs_cached==null || locs_cached.length<Tools.min(300000000, maxlen+(maxDelLen*(long)maxDels))){
+ locs_cached=new int[(int)Tools.min(300000000, maxlen+(maxDelLen*(long)maxDels))];
+ }
+ final BitSet bits=bits_cached;
+ final int[] locs=locs_cached;
+ final ArrayList<Read> list=new ArrayList<Read>(numReads);
+
+ Read lastRead=null;
+ int ampLevel=0;
+ int ampLength=2000;
+
+ for(int i=0; i<numReads; i++){
+
+ final boolean perfect=randyPerfectRead.nextFloat()<PERFECT_READ_RATIO;
+
+ final byte baseQuality;
+ final byte slant;
+ {
+// byte baseSlant=(perfect ? (byte)5 : (byte)(maxQual-minQual+1));
+// slant=(byte)((randyQual.nextInt(baseSlant)+randyQual.nextInt(baseSlant)+1)/2);
+// if(randyQual.nextBoolean()){
+// int range=(perfect ? maxQualP-midQualP+1 : maxQual-midQual+1);
+// int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range));
+// baseQuality=(byte)((perfect ? midQualP : midQual)+delta);
+// }else{
+// int range=perfect ? midQualP-minQualP+1 : midQual-minQual+1;
+// int delta=randyQual.nextInt(range);
+// baseQuality=(byte)((perfect ? midQualP : midQual)-delta);
+// }
+
+ byte baseSlant=(byte)(maxQual-minQual+1);
+ slant=(byte)((randyQual.nextInt(baseSlant)+randyQual.nextInt(baseSlant)+1)/2);
+ if(randyQual.nextBoolean()){
+ int range=maxQual-midQual+1;
+ int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range));
+ baseQuality=(byte)(midQual+delta);
+ }else{
+ int range=midQual-minQual+1;
+ int delta=randyQual.nextInt(range);
+ baseQuality=(byte)(midQual-delta);
+ }
+ }
+
+ int forceChrom=-1, forceLoc=-1;
+ if(AMP>1 && lastRead!=null){
+ if(ampLevel>0){
+ forceChrom=lastRead.chrom;
+// forceLoc=lastRead.start+4-randyAmp.nextInt(9);
+// forceLoc=lastRead.start+10-randyAmp.nextInt(21);
+
+ int a=ampLength;
+ int b=a*2+1;
+ if(randyAmp.nextBoolean()){
+ forceLoc=lastRead.start+a-randyAmp.nextInt(b);
+ }else{
+// if(randyAmp.nextBoolean()){
+// forceLoc=lastRead.start+a-(randyAmp.nextInt(b)+randyAmp.nextInt(b))/2;
+// }else{
+ forceLoc=lastRead.start+a-(randyAmp.nextInt(b)+randyAmp.nextInt(b)+randyAmp.nextInt(b))/3;
+// }
+ }
+ }else{
+
+ int a1=AMP;
+ if(randyAmp.nextInt(30)==0){a1*=7;}
+
+ if(randyAmp.nextInt(3)>0){
+ ampLevel=Tools.min(randyAmp.nextInt(a1), randyAmp.nextInt(a1));
+ }else{
+ double log=Math.log10(a1*7);
+ ampLevel=(int)Math.round(Math.pow(10, randyAmp.nextDouble()*log));
+ }
+ ampLength=500+randyAmp.nextInt(3001);
+// ampLevel=randyAmp.nextInt(AMP);
+ }
+ }
+
+ Read r1=makeRead(null, minlen, maxlen, minChrom, maxChrom,
+ maxSnps, maxInss, maxDels, maxSubs, maxNs,
+ snpRate, insRate, delRate, subRate, nRate,
+ minInsLen, minDelLen, minSubLen, minNLen,
+ maxInsLen, maxDelLen, maxSubLen, maxNLen,
+ mateMiddleMin, mateMiddleMax, mateSameStrand,
+ minQual, midQual, maxQual, baseQuality, slant,
+ perfect, nextReadID, locs, bits, forceChrom, forceLoc);
+
+// assert(false) : r1;
+ if(paired && r1!=null){
+
+ Read r2=null;
+ for(int tries=0; r2==null && tries<100; tries++){
+ r2=makeRead(r1, minlen, maxlen, minChrom, maxChrom,
+ maxSnps, maxInss, maxDels, maxSubs, maxNs,
+ snpRate, insRate, delRate, subRate, nRate,
+ minInsLen, minDelLen, minSubLen, minNLen,
+ maxInsLen, maxDelLen, maxSubLen, maxNLen,
+ mateMiddleMin, mateMiddleMax, mateSameStrand,
+ minQual, midQual, maxQual, baseQuality, slant,
+ perfect, nextReadID, locs, bits, -1, -1);
+ }
+
+ if(r2!=null){
+ if(FORCE_SINGLE_SCAFFOLD){
+ int scaf1=Data.scaffoldIndex(r1.chrom, (r1.start+r1.stop)/2);
+ int scaf2=Data.scaffoldIndex(r2.chrom, (r2.start+r2.stop)/2);
+ if(scaf1!=scaf2){
+ r1=r2=null;
+ }
+ }
+ }
+
+ if(r2!=null){
+ r1.mate=r2;
+ r2.mate=r1;
+ if(fragadapter1!=null){
+ r1.setMapped(true);
+ r2.setMapped(true);
+ int x=Read.insertSizeMapped(r1, r2, false);
+ if(x>0 && x<r1.length()){
+ addFragAdapter(r1, x, fragadapter1, randyAdapter);
+ }
+ if(x<r2.length()){
+ addFragAdapter(r2, x, fragadapter2, randyAdapterMate);
+ }
+ }
+ }else{
+ r1=null;
+ }
+
+// Data.sysout.println(r.strand()+"\t"+r.insertSize());
+ }
+ if(r1!=null){
+// assert(false) : r1;
+ if(ILLUMINA_NAMES){r1.id=r1.numericID+slash1;}
+ if(r1.mate!=null){
+ r1.mate.setPairnum(1);
+ if(ILLUMINA_NAMES){r1.mate.id=r1.numericID+slash2;}
+ }
+ list.add(r1);
+ nextReadID++;
+ }else{
+ i--;
+ }
+ ampLevel=Tools.max(0, ampLevel-1);
+ if(ampLevel==0){lastRead=null;}
+
+ if(lastRead==null){lastRead=r1;}
+// System.err.println("Made "+r1.start+" ~ "+r1.stop+" = "+(r1.stop-r1.start));
+ }
+ return list;
+ }
+
+ public Read makeRead(Read r0, int minlen, int maxlen, int minChrom, int maxChrom,
+ int maxSnps, int maxInss, int maxDels, int maxSubs, int maxNs,
+ float snpRate, float insRate, float delRate, float subRate, float nRate,
+ int minInsLen, int minDelLen, int minSubLen, int minNLen,
+ int maxInsLen, int maxDelLen, int maxSubLen, int maxNLen,
+ int minMiddle, int maxMiddle, boolean sameStrand,
+ int minQual, int midQual, int maxQual, byte baseQuality, byte slant,
+ boolean perfect, long rid, int[] locs, BitSet bits,
+ int FORCE_CHROM, int FORCE_LOC){ //Builds one synthetic read, or returns null when no valid placement was found; negative FORCE_* values mean "not forced"
+
+// verbose=(rid==3860);
+
+ int SNPs=0;
+ int INSs=0;
+ int DELs=0;
+ int SUBs=0;
+ int Ns=0;
+ int adapters=0;
+
+ while(SNPs<maxSnps && randyMutationType.nextFloat()<snpRate){SNPs++;} //Each count grows by one per random draw under its rate, up to its max
+ while(INSs<maxInss && randyMutationType.nextFloat()<insRate){INSs++;}
+ while(DELs<maxDels && randyMutationType.nextFloat()<delRate){DELs++;}
+ while(SUBs<maxSubs && randyMutationType.nextFloat()<subRate){SUBs++;}
+ while(Ns<maxNs && randyMutationType.nextFloat()<nRate){Ns++;}
+
+// final boolean perfect=randyPerfectRead.nextFloat()<PERFECT_READ_RATIO;
+ if(perfect){SNPs=INSs=DELs=SUBs=Ns=0;} //The draws above still happen, so the random stream advances either way
+
+ if(verbose){
+ System.err.println("\nMaking read with snps="+SNPs+", inss="+INSs+", dels="+DELs+", subs="+SUBs+", Ns="+Ns);
+ System.err.println("perfect="+perfect);
+ }
+
+ int[] delsa=makeDelsa(DELs, minDelLen, maxDelLen, randy2);
+
+ int readlen=(minlen==maxlen ? maxlen : minlen+randyLength.nextInt(maxlen-minlen+1)); //Fixed length, or uniform in [minlen, maxlen]
+ int inititallen0=readlen+(delsa==null ? 0 : (int)Tools.sum(delsa)); //Reference span to fetch: read length plus all deleted bases
+
+ if(verbose){
+ System.err.println("delsa="+Arrays.toString(delsa));
+ System.err.println("readlen="+readlen+", inititallen0="+inititallen0);
+ }
+
+ final int chrom=(FORCE_CHROM>=0 ? FORCE_CHROM : randomChrom(r0, minChrom, maxChrom));
+ if(chrom<0){return null;}
+ final int strand=randomStrand(r0, minChrom, maxChrom, sameStrand);
+
+ int loc;
+ if(FORCE_LOC>=0){
+ loc=FORCE_LOC;
+ }else if(RANDOM_SCAFFOLD){
+ int[] x=randomScaffoldLoc(chrom, inititallen0);
+ if(x==null){return null;}
+ loc=x[0];
+ inititallen0=x[1]; //The helper may return a different span, so readlen is recomputed from it
+ readlen=inititallen0-(delsa==null ? 0 : (int)Tools.sum(delsa));
+ }else{
+ loc=randomLoc(r0, chrom, inititallen0, minMiddle, maxMiddle, strand);
+ }
+
+ if(verbose){
+ System.err.println("chrom="+chrom+", loc="+loc+"~"+(loc+inititallen0-1)+", strand="+strand+", chalen="+Data.getChromosome(chrom).maxIndex);
+ }
+
+ if(r0!=null){ //r0 is the already-made mate; clamp this read inside the chromosome instead of rejecting it
+ int y=loc+inititallen0-1;
+ if(loc<0){loc=0; y=readlen-1; maxDels=0; delsa=null; inititallen0=readlen;} //Clamping discards the planned deletions
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ if(y>cha.maxIndex){y=cha.maxIndex; loc=y-readlen+1; maxDels=0; delsa=null; inititallen0=readlen;}
+ if(verbose){
+ System.err.println("After pair compensation:");
+ System.err.println("delsa="+Arrays.toString(delsa));
+ System.err.println("readlen="+readlen+", inititallen0="+inititallen0);
+ System.err.println("chrom="+chrom+", loc="+loc+", strand="+strand);
+ }
+ assert(y<=cha.maxIndex) : y+", "+cha.maxIndex;
+ assert(cha.get(y)>0) : cha.get(y);
+ }
+
+ if(loc<0){
+ if(verbose){
+ System.err.println("Bad values; returning null.");
+ }
+ return null;
+ }
+
+ final ChromosomeArray cha=Data.getChromosome(chrom);
+ if(readlen>=(cha.maxIndex-cha.minIndex)){
+ if(verbose){
+ System.err.println("Too long; returning null.");
+ }
+ return null;
+ }
+ if(loc>=cha.maxIndex || loc<0){return null;}
+ byte[] bases=cha.getBytes(loc, loc+inititallen0-1);
+ assert(bases[0]>0 && bases[bases.length-1]>0) : Arrays.toString(bases);
+ assert(strand==Gene.MINUS || strand==Gene.PLUS);
+
+ for(int i=0; i<bases.length; i++){
+ locs[i]=i+loc; //locs[i] records the reference coordinate behind read base i
+ }
+ if(verbose){
+ System.err.println(new String(bases));
+ System.err.println(Arrays.toString(Arrays.copyOf(locs, bases.length)));
+ }
+
+ if(BAN_NS){
+ for(byte b : bases){
+ if(!AminoAcid.isFullyDefined(b)){return null;}
+ }
+ }
+
+ if(pbadapter1!=null && (rid&3)==0){ //Adapter choice depends on the low two bits of the read ID
+ bases=addPBAdapter(bases, locs, readlen, randyAdapter, pbadapter1);
+ adapters++;
+ }else if(pbadapter2!=null && (rid&3)==1){
+ bases=addPBAdapter(bases, locs, readlen, randyAdapter, pbadapter2);
+ adapters++;
+ }
+
+ int[] dif=new int[] {0, 0};
+
+ for(int j=0; delsa!=null && j<delsa.length; j++){
+ bases=addDeletion(bases, locs, delsa[j], readlen, dif, randy2);
+ if(verbose){
+ System.err.println("After adding del "+delsa[j]+": ");
+ System.err.println(new String(bases));
+ System.err.println(Arrays.toString(Arrays.copyOf(locs, bases.length)));
+ }
+ }
+ if(bases.length>readlen){bases=Arrays.copyOf(bases, readlen);} //Trim whatever surplus remains after the deletions
+ assert(bases.length==readlen);
+
+ for(int j=0; j<INSs; j++){
+ bases=addInsertion(bases, locs, minInsLen, maxInsLen, readlen, dif, randy2);
+ if(verbose){
+ System.err.println("After adding ins: ");
+ System.err.println("'"+new String(bases)+"'");
+ System.err.println(Arrays.toString(Arrays.copyOf(locs, Tools.min(locs.length, bases.length))));
+ }
+ }
+ if(bases.length!=readlen){bases=Arrays.copyOf(bases, readlen);}
+ assert(bases.length==readlen);
+
+ if(USE_UNIQUE_SNPS){
+ bits.clear(); //The bit set is handed to addSNP; presumably it marks positions already used - confirm in addSNP
+ for(int j=0; j<SNPs; j++){bases=addSNP(bases, locs, readlen, randy2, bits);}
+ }else{
+ for(int j=0; j<SNPs; j++){bases=addSNP(bases, locs, readlen, randy2);}
+ }
+
+ for(int j=0; j<SUBs; j++){bases=addSUB(bases, locs, minSubLen, maxSubLen, readlen, randy2);}
+
+ if(USE_UNIQUE_SNPS){
+ bits.clear();
+ for(int j=0; j<Ns; j++){bases=addN(bases, locs, minNLen, maxNLen, readlen, randy2, bits);}
+ }else{
+ for(int j=0; j<Ns; j++){bases=addN(bases, locs, minNLen, maxNLen, readlen, randy2, null);}
+ }
+
+ //Fill insertions in loc array
+ for(int i=1; i<bases.length; i++){
+ if(locs[i]<0){locs[i]=locs[i-1];} //Forward pass: a negative entry inherits the location to its left
+ }
+ for(int i=bases.length-2; i>=0; i--){
+ if(locs[i]<0){locs[i]=locs[i+1];} //Backward pass covers negative entries at the very start
+ }
+ final int x=locs[0], y=locs[bases.length-1]; //These become the read's start and stop
+ if(verbose){
+ System.err.println("After adding SNPs, SUBs, Ns, and fixing locs: ");
+ System.err.println("'"+new String(bases)+"'");
+ System.err.println(Arrays.toString(Arrays.copyOf(locs, Tools.min(locs.length, bases.length))));
+ }
+
+// if(FORCE_LOC>=0 || FORCE_CHROM>=0){
+// if(y<0 || y+readlen>)
+// }
+ assert(FORCE_LOC>=0 || FORCE_CHROM>=0 || y<=cha.maxIndex) : y+", "+r0;
+ assert(FORCE_LOC>=0 || FORCE_CHROM>=0 || cha.get(y)>0) : cha.get(y);
+
+ if(strand==Gene.MINUS){
+ AminoAcid.reverseComplementBasesInPlace(bases);
+ //Reverse loc array; not really necessary
+ for(int i=0, lim=bases.length/2; i<lim; i++){
+ int tmp=locs[i];
+ locs[i]=locs[bases.length-i-1];
+ locs[bases.length-i-1]=tmp;
+ }
+ if(verbose){
+ System.err.println("After reverse-complement: ");
+ System.err.println(new String(bases));
+ System.err.println(Arrays.toString(Arrays.copyOf(locs, bases.length)));
+ }
+ }
+
+ if(verbose){
+ System.err.println("Final lineup: ");
+ System.err.println(new String(bases));
+ for(int i=0; i<bases.length; i++){
+ byte c=cha.get(locs[i]);
+ if(strand==1){c=AminoAcid.baseToComplementExtended[c];}
+ System.err.print((char)c);
+ }
+ System.err.println();
+ }
+
+ byte[] quals=null;
+ if(USE_FIXED_QUALITY){
+ quals=getFixedQualityRead(bases.length);
+ }else{
+// if(perfect){
+// quals=QualityTools.makeQualityArray(bases.length, randyQual, 30, 40, baseQuality, slant, qVariance);
+// }else{
+ quals=QualityTools.makeQualityArray(bases.length, randyQual, minQual, maxQual, baseQuality, slant, qVariance);
+// }
+ }
+ for(int j=0; j<quals.length; j++){
+ if(!AminoAcid.isFullyDefined(bases[j])){quals[j]=0;} //Bases that are not fully defined get quality 0
+ }
+
+
+// Read r=new Read(bases, chrom, (byte)strand, loc, loc+bases.length-1, rid, quals, false);
+ Read r=new Read(bases, chrom, (byte)strand, x, y, rid, quals);
+ r.setSynthetic(true);
+ assert(r.length()==readlen);
+
+ if(ADD_ERRORS_FROM_QUALITY && !perfect){addErrorsFromQuality(r, randyQual);}
+ if(ADD_PACBIO_ERRORS && !perfect){
+ addPacBioErrors(r, randyQual.nextFloat()*(pbMaxErrorRate-pbMinErrorRate)+pbMinErrorRate, (1+randyQual.nextFloat())*(pbMaxErrorRate-pbMinErrorRate)*0.25f);
+ }else{
+ assert(r.length()==readlen);
+ }
+
+// r.stop=r.start+readlen+dif[0]-1;
+
+ assert(r.stop>r.start) : r;
+
+ if(adapters>0){r.setHasAdapter(true);}
+ if(FORCE_SINGLE_SCAFFOLD && !Data.isSingleScaffold(r.chrom, r.start, r.stop)){return null;}
+ if(MIN_SCAFFOLD_OVERLAP>0 && Data.scaffoldOverlapLength(r.chrom, r.start, r.stop)<MIN_SCAFFOLD_OVERLAP){return null;}
+ return r;
+ }
+
+ public void addPacBioErrors(final Read r, final float errorRate, final float deviation){
+
+ byte[] bases=r.bases;
+ ByteBuilder bb=new ByteBuilder((int)(bases.length*1.1f));
+ ByteBuilder qq=new ByteBuilder((int)(bases.length*1.1f));
+
+ for(int i=0; i<bases.length; i++){
+ float dev2=2*deviation*randy.nextFloat()-deviation;
+ float rate=errorRate+dev2;
+ float p=randy.nextFloat();
+ byte q=QualityTools.probCorrectToPhred(1-rate);
+ if(p>rate){
+ bb.append(bases[i]);
+ qq.append(q);
+ }else{
+ float p2=randyMutationType.nextFloat();
+ if(p2<0.4){//Ins
+ byte b=AminoAcid.numberToBase[randy2.nextInt(4)];
+ bb.append(b);
+ qq.append(q);
+ i--;
+ }else if(p2<0.75){//Del
+ //do nothing
+ }else{//Sub
+ int x=AminoAcid.baseToNumber[bases[i]]+randy2.nextInt(3);
+ byte b=AminoAcid.numberToBase[x%4];
+ bb.append(b);
+ qq.append(q);
+ }
+ }
+ }
+
+ r.bases=bb.toBytes();
+ if(r.quality!=null){
+ r.quality=qq.toBytes();
+// byte q=QualityTools.probCorrectToPhred(1-errorRate);
+// byte[] qual=new byte[r.length()];
+// Arrays.fill(qual, q);
+// r.quality=qual;
+ }
+ }
+
+ /** Builds a table of chromosome indices in which each chromosome occupies ceil(length/div) slots,
+  * with div chosen so the table holds roughly 8192 entries. Presumably sampled uniformly to pick a chromosome by length. */
+ private static int[] fillRandomChrom(){
+ int[] in=Arrays.copyOf(Data.chromLengths, Data.chromLengths.length);
+ long total=Tools.sum(in);
+ int div=Tools.max(1, (int)(total/8192)); //Below 8192 total bases the quotient is 0 and the division below would throw
+ for(int i=0; i<in.length; i++){in[i]=((in[i]+div-1)/div);} //Ceiling division: every nonzero length keeps at least one slot
+
+ int sum=0;
+ for(int i=0; i<in.length; i++){sum+=in[i];}
+ int[] out=new int[sum];
+ sum=0;
+ for(int chrom=0; chrom<in.length; chrom++){
+ int size=in[chrom];
+ for(int j=0; j<size; j++){
+ out[sum+j]=chrom;
+ }
+ sum+=size;
+ }
+ return out;
+ }
+
+ public static final byte[] getFixedQualityRead(int bases){ //Returns an array of the given length filled with FIXED_QUALITY_VALUE
+ if(bases<fixedQuality.length && fixedQuality[bases]!=null){return fixedQuality[bases];} //Cached arrays are shared by all callers
+ final byte[] q=new byte[bases];
+ Arrays.fill(q, FIXED_QUALITY_VALUE);
+ if(bases<fixedQuality.length){fixedQuality[bases]=q;} //Longer lengths would index past the cache, so they stay uncached
+ return q;
+ }
+
+ private static synchronized long getSeed(){
+ long r=seed*1000;
+ seed++;
+ return r;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final Random randy;
+ private final Random randy2;
+ private final Random randyMutationType;
+ private final Random randyQual;
+ private final Random randyAdapter;
+
+ private final Random randyMate;
+ private final Random randy2Mate;
+ private final Random randyMutationTypeMate;
+ private final Random randyQualMate;
+ private final Random randyAdapterMate;
+
+ private final Random randyPerfectRead;
+ private final Random randyNoref;
+
+ private final Random randyLength;
+
+ private final Random randyAmp;
+
+ public final boolean paired;
+
+ private long nextReadID=0;
+ private byte[] pbadapter1=null;
+ private byte[] pbadapter2=null;
+
+ private byte[][] fragadapter1=null;
+ private byte[][] fragadapter2=null;
+
+ private BitSet bits_cached;
+ private int[] locs_cached;
+
+ private String prefix;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private static String slash1="/1";
+ private static String slash2="/2";
+
+ private static int[] randomChrom;
+
+ private static long seed=0;
+
+ private static final byte[][] fixedQuality=new byte[301][];
+
+ public static final boolean USE_FIXED_QUALITY=false;
+ public static final byte FIXED_QUALITY_VALUE=24;
+ public static boolean ADD_ERRORS_FROM_QUALITY=true;
+ public static boolean ADD_PACBIO_ERRORS=false;
+ public static float pbMinErrorRate=0.13f;
+ public static float pbMaxErrorRate=0.17f;
+ public static boolean REPLACE_NOREF=false;
+ public static boolean OUTPUT_INTERLEAVED=false;
+ /** Rather than choosing a random location in the concatenated genome, choose a random scaffold, without respect to length */
+ public static boolean RANDOM_SCAFFOLD=false;
+ public static String fileExt=".fq.gz";
+ public static boolean verbose=false;
+
+ public static boolean mateSameStrand=false;
+ public static int mateMiddleMin=-100; //default -25
+ public static int mateMiddleMax=100; //default 475
+ public static int mateMiddleDev=-1;
+ public static boolean SUPERFLAT_DIST=false;
+ public static boolean FLAT_DIST=false;
+ public static boolean BELL_DIST=false;
+ public static boolean EXP_DIST=false;
+ public static double EXP_LAMDA=0.8d;
+ public static boolean BIASED_SNPS=false;
+ public static boolean ILLUMINA_NAMES=false;
+ public static int midPad=500;
+
+ public static boolean NODISK=false;
+
+ public static int AMP=1;
+ public static int qVariance=4;
+
+ public static float PERFECT_READ_RATIO=0f;
+
+ /** Ban generation of reads over unspecified reference bases */
+ static boolean BAN_NS=false;
+
+ public static boolean USE_UNIQUE_SNPS=true;
+ public static boolean FORCE_SINGLE_SCAFFOLD=true;
+ public static int MIN_SCAFFOLD_OVERLAP=1;
+ public static boolean overwrite=true;
+ public static boolean append=false;
+ public static boolean errorState;
+
+ //Input file, for use as quality source
+ public static String in1;
+
+}
diff --git a/current/align2/ReadComparatorID.java b/current/align2/ReadComparatorID.java
new file mode 100755
index 0000000..b306062
--- /dev/null
+++ b/current/align2/ReadComparatorID.java
@@ -0,0 +1,26 @@
+package align2;
+
+import java.util.Comparator;
+
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 27, 2014
+ *
+ */
+
+public final class ReadComparatorID implements Comparator<Read>{
+
+ /** Orders reads by numericID, then by id; a null id sorts before any non-null id, as in ReadComparatorName. */
+ @Override
+ public int compare(Read r1, Read r2) {
+ if(r1.numericID!=r2.numericID){return r1.numericID<r2.numericID ? -1 : 1;}
+
+ if(r1.id==null || r2.id==null){return r1.id==r2.id ? 0 : r1.id==null ? -1 : 1;} //Calling compareTo on a null id would throw
+ return r1.id.compareTo(r2.id);
+ }
+
+ public static final ReadComparatorID comparator=new ReadComparatorID();
+
+}
diff --git a/current/align2/ReadComparatorMapping.java b/current/align2/ReadComparatorMapping.java
new file mode 100755
index 0000000..ecdda62
--- /dev/null
+++ b/current/align2/ReadComparatorMapping.java
@@ -0,0 +1,174 @@
+package align2;
+
+import java.util.Comparator;
+
+import stream.Read;
+import dna.Gene;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 27, 2014
+ *
+ */
+
+public class ReadComparatorMapping implements Comparator<Read> {
+ /** Orders reads by mapped position (compare2), falling back to the read-level tie-breakers of compare3; mates are consulted when a has one. */
+ @Override
+ public int compare(Read a, Read b) {
+
+ if(a.mate==null){ //Unpaired input: position first, then tie-breakers
+ int x=compare2(a, b);
+ if(x!=0){return x;}
+ return compare3(a, b);
+ }else{
+
+ if(a.mapped() && b.mapped()){
+ int x=compare2(a, b);
+ if(x!=0){return x;}
+
+ if(a.paired() && b.paired()){
+ x=compare2(a.mate, b.mate);
+ if(x!=0){return x;}
+ x=compare3(a, b);
+ if(x!=0){return x;}
+ x=compare3(a.mate, b.mate);
+ return x;
+ }else{
+ assert(!a.paired() && !b.paired()); //compare2 already returned nonzero if only one of them was paired
+ return compare3(a, b);
+ }
+ }
+
+ if(!a.mapped() && !b.mapped()){ //Neither read mapped: order by the mates instead
+ int x=compare2(a.mate, b.mate);
+ if(x!=0){return x;}
+ return compare3(a.mate, b.mate);
+ }else if(a.mapped()){ //Only a mapped: ties resolve in favor of a
+ if(a.paired()){
+ int x=compare2(a.mate, b.mate);
+ if(x!=0){return x;}
+ return -1;
+ }else{
+ int x=compareCross(a, b.mate);
+ if(x!=0){return x;}
+ return -1;
+ }
+ }else if(b.mapped()){ //Only b mapped: mirror image of the branch above
+ if(b.paired()){
+ int x=compare2(a.mate, b.mate);
+ if(x!=0){return x;}
+ return 1;
+ }else{
+ int x=compareCross(b, a.mate);
+ if(x!=0){return 0-x;}
+ return 1;
+ }
+ }else{
+ assert(false) : a.mapped()+", "+a.paired()+", "+b.mapped()+", "+b.paired()+", "+a.mate.mapped()+", "+b.mate.mapped();
+ }
+
+ //I think this is unreachable...
+ return compare3(a, b);
+ }
+ }
+
+ public int compare2(Read a, Read b) { //Positional order: mapped first, then chrom, strand, start (plus strand) or stop (otherwise), paired first
+ if(a.mapped() && !b.mapped()){return -1;}
+ if(b.mapped() && !a.mapped()){return 1;}
+ if(a.chrom!=b.chrom){return a.chrom-b.chrom;}
+ if(a.strand()!=b.strand()){return a.strand()-b.strand();}
+
+ assert(!SAME_STRAND_PAIRS) : "TODO";
+ if(a.strand()==Gene.PLUS){
+ if(a.start!=b.start){return a.start-b.start;}
+ }else{
+ if(a.stop!=b.stop){return a.stop-b.stop;}
+ }
+
+ if(a.paired()!=b.paired()){return a.paired() ? -1 : 1;}
+ return 0;
+ }
+
+ public int compareCross(Read a, Read b) { //Like compare2, but for a read against another read's mate
+ if(a.mapped() && !b.mapped()){return -1;}
+ if(b.mapped() && !a.mapped()){return 1;}
+ if(a.chrom!=b.chrom){return a.chrom-b.chrom;}
+ if(SAME_STRAND_PAIRS){
+ if(a.strand()!=b.strand()){
+ return a.strand()-b.strand();
+ }
+ }else{
+ if(a.strand()==b.strand()){ //Here equal strands count as the mismatch
+ return a.strand()==0 ? -1 : 1;
+ }
+ }
+ if(a.start!=b.start){return a.start-b.start;}
+ if(a.paired()!=b.paired()){return a.paired() ? -1 : 1;}
+ return 0;
+ }
+
+ public int compare3(Read a, Read b){ //Tie-breakers: length, perfect flag, match string, coordinates, quality, numericID, id
+ if(a.length()!=b.length()){
+ return b.length()-a.length(); //Preferentially puts longer reads first
+ }
+ if(a.perfect() != b.perfect()){return a.perfect() ? -1 : 1;}
+ int x;
+
+ if(a.match!=null && b.match!=null){
+ x=compareMatchStrings(a.match, b.match);
+ if(x!=0){return x;}
+ }
+
+ assert(!SAME_STRAND_PAIRS) : "TODO";
+ if(a.strand()==Gene.PLUS){
+ if(a.start!=b.start){return a.start-b.start;} //This line should be dead code
+ if(a.stop!=b.stop){return a.stop-b.stop;}
+ }else{
+ if(a.stop!=b.stop){return a.stop-b.stop;} //This line should be dead code
+ if(a.start!=b.start){return a.start-b.start;}
+ }
+
+ x=compareVectors(a.quality, b.quality);
+ if(x!=0){return 0-x;} //Negated, so the read with the higher first differing quality sorts first
+ if(a.numericID!=b.numericID){return a.numericID>b.numericID ? 1 : -1;}
+ if(a.id==null || b.id==null){return a.id==b.id ? 0 : a.id==null ? -1 : 1;} //Null ids sort first; compareTo on one would throw
+ return a.id.compareTo(b.id);
+ }
+
+ public int compareVectors(final byte[] a, final byte[] b){ //Compares the shared prefix only; null sorts after non-null
+ if(a==null || b==null){
+ if(a==null && b!=null){return 1;}
+ if(a!=null && b==null){return -1;}
+ return 0;
+ }
+ final int lim=Tools.min(a.length, b.length);
+ for(int i=0; i<lim; i++){
+ if(a[i]<b[i]){return -1;}
+ if(a[i]>b[i]){return 1;}
+ }
+ return 0;
+ }
+
+ public int compareMatchStrings(final byte[] a, final byte[] b){ //At the first decisive difference, 'I'/'X'/'Y' sorts after others, then 'D' does
+ if(a==null || b==null){
+ if(a==null && b!=null){return 1;}
+ if(a!=null && b==null){return -1;}
+ return 0;
+ }
+ final int lim=Tools.min(a.length, b.length);
+ for(int i=0; i<lim; i++){
+ if(a[i]!=b[i]){
+ boolean ad=(a[i]=='D');
+ boolean bd=(b[i]=='D');
+ boolean ai=(a[i]=='I' || a[i]=='X' || a[i]=='Y');
+ boolean bi=(b[i]=='I' || b[i]=='X' || b[i]=='Y');
+ if(ai!=bi){return ai ? 1 : -1;}
+ if(ad!=bd){return ad ? 1 : -1;}
+ }
+ }
+ return 0;
+ }
+
+ public static boolean SAME_STRAND_PAIRS=false;
+
+}
diff --git a/current/align2/ReadComparatorName.java b/current/align2/ReadComparatorName.java
new file mode 100755
index 0000000..69e2f25
--- /dev/null
+++ b/current/align2/ReadComparatorName.java
@@ -0,0 +1,28 @@
+package align2;
+
+import java.util.Comparator;
+
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 27, 2014
+ *
+ */
+
+public final class ReadComparatorName implements Comparator<Read>{
+
+ /** Shared instance. */
+ public static final ReadComparatorName comparator=new ReadComparatorName();
+
+ /** Sorts by id with null ids first; reads whose ids are equal (or both null) are ordered by pairnum. */
+ @Override
+ public int compare(Read r1, Read r2) {
+ final String name1=r1.id, name2=r2.id;
+ //Exactly one id missing: the read without an id goes first
+ if(name1==null ^ name2==null){return name1==null ? -1 : 1;}
+ final int byName=(name1==null ? 0 : name1.compareTo(name2));
+ return byName!=0 ? byName : r1.pairnum()-r2.pairnum();
+ }
+
+}
diff --git a/current/align2/ReadComparatorTopological.java b/current/align2/ReadComparatorTopological.java
new file mode 100755
index 0000000..f70cfe7
--- /dev/null
+++ b/current/align2/ReadComparatorTopological.java
@@ -0,0 +1,77 @@
+package align2;
+
+import java.util.Comparator;
+
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 27, 2014
+ *
+ */
+
+
+public class ReadComparatorTopological implements Comparator<Read>{
+
+ @Override
+ public int compare(Read r1, Read r2) {
+ return compare(r1, r2, true);
+ }
+ /** Orders by bases, mate bases, lengths, qualities (negated), numericID, then id. NOTE(review): compareMates is never read, so mates are always compared. */
+ public int compare(Read r1, Read r2, boolean compareMates) {
+
+ int x=compareVectors(r1.bases, r2.bases);
+ if(x!=0){return x;}
+
+ if(r1.mate!=null && r2.mate!=null){
+ x=compareVectors(r1.mate.bases, r2.mate.bases);
+ }
+ if(x!=0){return x;}
+
+ if(r1.bases!=null && r2.bases!=null && r1.length()!=r2.length()){return r1.length()-r2.length();} //Equal prefixes: the shorter read first
+ if(r1.mate!=null && r2.mate!=null && r1.mate.bases!=null && r2.mate.bases!=null
+ && r1.mateLength()!=r2.mateLength()){return r1.mateLength()-r2.mateLength();}
+
+ x=compareVectors(r1.quality, r2.quality);
+ if(x!=0){return 0-x;} //Negated, so the higher first differing quality sorts first
+
+ if(r1.mate!=null && r2.mate!=null){
+ x=compareVectors(r1.mate.quality, r2.mate.quality);
+ }
+ if(x!=0){return 0-x;}
+
+ if(r1.numericID!=r2.numericID){return r1.numericID>r2.numericID ? 1 : -1;}
+ if(r1.id==null || r2.id==null){return r1.id==r2.id ? 0 : r1.id==null ? -1 : 1;} //Null ids sort first; compareTo on one would throw
+ return r1.id.compareTo(r2.id);
+ }
+
+ public int compareVectors(final byte[] a, final byte[] b){ //Compares the shared prefix only; null sorts after non-null
+ if(a==null || b==null){
+ if(a==null && b!=null){return 1;}
+ if(a!=null && b==null){return -1;}
+ return 0;
+ }
+ final int lim=Tools.min(a.length, b.length);
+ for(int i=0; i<lim; i++){
+ if(a[i]<b[i]){return -1;}
+ if(a[i]>b[i]){return 1;}
+ }
+ return 0;
+ }
+
+ public int compareVectorsN(final byte[] a, final byte[] b){ //As compareVectors, except that 'N' sorts after every other symbol
+ if(a==null || b==null){
+ if(a==null && b!=null){return 1;}
+ if(a!=null && b==null){return -1;}
+ return 0;
+ }
+ final int lim=Tools.min(a.length, b.length);
+ for(int i=0; i<lim; i++){
+ if(a[i]=='N' && b[i]!='N'){return 1;}
+ if(a[i]!='N' && b[i]=='N'){return -1;}
+ if(a[i]<b[i]){return -1;}
+ if(a[i]>b[i]){return 1;}
+ }
+ return 0;
+ }
+}
diff --git a/current/align2/ReadErrorComparator.java b/current/align2/ReadErrorComparator.java
new file mode 100755
index 0000000..bb3b4f8
--- /dev/null
+++ b/current/align2/ReadErrorComparator.java
@@ -0,0 +1,38 @@
+package align2;
+
+import java.util.Comparator;
+
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date May 30, 2013
+ *
+ */
+public final class ReadErrorComparator implements Comparator<Read>{
+
+ /** Fewest errors first (read plus mate), then longest combined length, then fewest expected errors, then numericID, then id (null ids first). */
+ @Override
+ public int compare(Read r1, Read r2) {
+
+ int a=(r1.errors+(r1.mate==null ? 0 : r1.mate.errors));
+ int b=(r2.errors+(r2.mate==null ? 0 : r2.mate.errors));
+ if(a!=b){return a-b;}
+
+ a=(r1.length()+(r1.mate==null ? 0 : r1.mateLength()));
+ b=(r2.length()+(r2.mate==null ? 0 : r2.mateLength()));
+ if(a!=b){return b-a;} //Reversed operands: the longer pair sorts first
+
+ float a2=(r1.expectedErrors(true, 0)+(r1.mate==null ? 0 : r1.mate.expectedErrors(true, 0)));
+ float b2=(r2.expectedErrors(true, 0)+(r2.mate==null ? 0 : r2.mate.expectedErrors(true, 0)));
+ if(a2!=b2){return a2>b2 ? 1 : -1;}
+
+ if(r1.numericID!=r2.numericID){return r1.numericID<r2.numericID ? -1 : 1;}
+
+ if(r1.id==null || r2.id==null){return r1.id==r2.id ? 0 : r1.id==null ? -1 : 1;} //Calling compareTo on a null id would throw
+ return r1.id.compareTo(r2.id);
+ }
+
+ public static final ReadErrorComparator comparator=new ReadErrorComparator();
+
+}
diff --git a/current/align2/ReadLengthComparator.java b/current/align2/ReadLengthComparator.java
new file mode 100755
index 0000000..99003bd
--- /dev/null
+++ b/current/align2/ReadLengthComparator.java
@@ -0,0 +1,48 @@
+package align2;
+
+import java.util.Comparator;
+
+import stream.Read;
+
+/**
+ * Sorts longest reads first
+ * @author Brian Bushnell
+ * @date Jul 19, 2013
+ *
+ */
+public final class ReadLengthComparator implements Comparator<Read> {
+
+ private ReadLengthComparator(){}
+
+ /** Longest read first, then longest mate, then id (null ids first), then numericID. */
+ @Override
+ public int compare(Read a, Read b) {
+ int x=compare2(a, b);
+ if(x==0){x=compare2(a.mate, b.mate);}
+ if(x==0){x=compareIds(a.id, b.id);}
+ if(x==0){x=a.numericID>b.numericID ? 1 : a.numericID<b.numericID ? -1 : 0;}
+ return x;
+ }
+
+ /** Compares two reads by the length of their bases; a null read sorts last. */
+ private static int compare2(Read a, Read b) {
+ if(a==b){return 0;}
+ if(a==null){return 1;}
+ if(b==null){return -1;}
+ return compareByLength(a.bases, b.bases);
+ }
+
+ /** Longer array first; a null array sorts last. */
+ private static int compareByLength(byte[] a, byte[] b){
+ if(a==b){return 0;}
+ if(a==null){return 1;}
+ if(b==null){return -1;}
+ return b.length-a.length;
+ }
+
+ /** Null-safe id comparison: a null id sorts before any non-null id, as in ReadComparatorName. */
+ private static int compareIds(String a, String b){return (a==null || b==null) ? (a==b ? 0 : a==null ? -1 : 1) : a.compareTo(b);}
+
+ public static final ReadLengthComparator comparator=new ReadLengthComparator();
+
+}
diff --git a/current/align2/ReadStats.java b/current/align2/ReadStats.java
new file mode 100755
index 0000000..901d899
--- /dev/null
+++ b/current/align2/ReadStats.java
@@ -0,0 +1,1304 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import dna.AminoAcid;
+
+import stream.Read;
+import stream.SamLine;
+
+import fileIO.TextStreamWriter;
+
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 18, 2013
+ *
+ */
+public class ReadStats {
+
+ public ReadStats(){this(true);} //Default: the new instance is added to objectList
+
+ public ReadStats(boolean addToList){ //Allocates only the histograms whose static COLLECT_* flag or output file is set; the rest are null
+ if(addToList){
+ synchronized(ReadStats.class){ //objectList is static, so the add is guarded by the class lock
+ objectList.add(this);
+ }
+ }
+
+ if(COLLECT_QUALITY_STATS){
+ aqualArray=new long[2][127]; //[pairnum][average quality of the read]
+ qualLength=new long[2][MAXLEN]; //[pairnum][read length-1, capped at MAXLEN-1]
+ qualSum=new long[2][MAXLEN]; //[pairnum][position]: sum of quality values
+ qualSumDouble=new double[2][MAXLEN]; //[pairnum][position]: sum of QualityTools.PROB_ERROR values
+ bqualHistOverall=new long[127]; //[quality value], over all positions
+ }else{
+ aqualArray=null;
+ qualLength=null;
+ qualSum=null;
+ qualSumDouble=null;
+ bqualHistOverall=null;
+ }
+
+ if(BQUAL_HIST_FILE!=null){
+ bqualHist=new long[2][MAXLEN][127]; //[pairnum][position][quality value]
+ }else{
+ bqualHist=null;
+ }
+
+ if(QUAL_COUNT_HIST_FILE!=null){
+ qcountHist=new long[2][127]; //[pairnum][quality value]
+ }else{
+ qcountHist=null;
+ }
+
+ if(COLLECT_MATCH_STATS){
+ matchSum=new long[2][MAXLEN];
+ delSum=new long[2][MAXLEN];
+ insSum=new long[2][MAXLEN];
+ subSum=new long[2][MAXLEN];
+ nSum=new long[2][MAXLEN];
+ clipSum=new long[2][MAXLEN];
+ otherSum=new long[2][MAXLEN];
+ }else{
+ matchSum=null;
+ delSum=null;
+ insSum=null;
+ subSum=null;
+ nSum=null;
+ clipSum=null;
+ otherSum=null;
+ }
+
+ if(COLLECT_QUALITY_ACCURACY){
+ qualMatch=new long[99];
+ qualSub=new long[99];
+ qualIns=new long[99];
+ qualDel=new long[99];
+ }else{
+ qualMatch=null;
+ qualSub=null;
+ qualIns=null;
+ qualDel=null;
+ }
+
+ if(COLLECT_INSERT_STATS){
+ insertHist=new LongList(MAXLEN);
+ }else{
+ insertHist=null;
+ }
+
+ if(COLLECT_BASE_STATS){
+ baseHist=new LongList[2][5]; //A 2x5 grid of lists, each created below
+ for(int i=0; i<baseHist.length; i++){
+ for(int j=0; j<baseHist[i].length; j++){
+ baseHist[i][j]=new LongList(400);
+ }
+ }
+ }else{
+ baseHist=null;
+ }
+
+
+ if(COLLECT_INDEL_STATS){
+ insHist=new LongList(100);
+ delHist=new LongList(100);
+ delHist2=new LongList(100);
+ }else{
+ insHist=null;
+ delHist=null;
+ delHist2=null;
+ }
+
+ if(COLLECT_GC_STATS){
+ gcHist=new long[GC_BINS+1];
+ }else{
+ gcHist=null;
+ }
+
+ if(COLLECT_ERROR_STATS){
+ errorHist=new LongList(100);
+ }else{
+ errorHist=null;
+ }
+
+ if(COLLECT_LENGTH_STATS){
+ lengthHist=new LongList(501);
+ }else{
+ lengthHist=null;
+ }
+
+ if(COLLECT_IDENTITY_STATS){
+ idHist=new long[ID_BINS+1];
+ idBaseHist=new long[ID_BINS+1];
+ }else{
+ idHist=null;
+ idBaseHist=null;
+ }
+
+ if(COLLECT_TIME_STATS){
+ timeHist=new LongList(1001);
+ }else{
+ timeHist=null;
+ }
+
+ }
+
+ public static ReadStats mergeAll(){ //Sums every instance in objectList into one ReadStats, stores it in merged and returns it
+ if(objectList==null || objectList.isEmpty()){return merged=null;} //Nothing registered: merged is cleared and null returned
+ if(objectList.size()==1){return merged=objectList.get(0);} //A single instance is returned as-is, not copied
+
+ ReadStats x=new ReadStats(false); //false: the accumulator is not added to objectList
+ for(ReadStats rs : objectList){
+ x.read2Count+=rs.read2Count;
+ if(COLLECT_QUALITY_STATS){
+ for(int i=0; i<MAXLEN; i++){
+ x.qualLength[0][i]+=rs.qualLength[0][i];
+ x.qualLength[1][i]+=rs.qualLength[1][i];
+ x.qualSum[0][i]+=rs.qualSum[0][i];
+ x.qualSum[1][i]+=rs.qualSum[1][i];
+ x.qualSumDouble[0][i]+=rs.qualSumDouble[0][i];
+ x.qualSumDouble[1][i]+=rs.qualSumDouble[1][i];
+ }
+ for(int i=0; i<x.aqualArray[0].length; i++){
+ x.aqualArray[0][i]+=rs.aqualArray[0][i];
+ x.aqualArray[1][i]+=rs.aqualArray[1][i];
+ }
+ for(int i=0; i<x.bqualHistOverall.length; i++){
+ x.bqualHistOverall[i]+=rs.bqualHistOverall[i];
+ }
+ if(BQUAL_HIST_FILE!=null){ //NOTE(review): merged only when COLLECT_QUALITY_STATS is set, although the constructor allocates bqualHist independently of it
+ for(int i=0; i<x.bqualHist.length; i++){
+ for(int j=0; j<x.bqualHist[i].length; j++){
+ for(int k=0; k<x.bqualHist[i][j].length; k++){
+ x.bqualHist[i][j][k]+=rs.bqualHist[i][j][k];
+ }
+ }
+ }
+ }
+ if(QUAL_COUNT_HIST_FILE!=null){ //NOTE(review): same nesting under COLLECT_QUALITY_STATS as bqualHist above
+ for(int i=0; i<x.qcountHist.length; i++){
+ for(int j=0; j<x.qcountHist[i].length; j++){
+ x.qcountHist[i][j]+=rs.qcountHist[i][j];
+ }
+ }
+ }
+ }
+
+ if(COLLECT_MATCH_STATS){
+ for(int i=0; i<MAXLEN; i++){
+ x.matchSum[0][i]+=rs.matchSum[0][i];
+ x.matchSum[1][i]+=rs.matchSum[1][i];
+ x.delSum[0][i]+=rs.delSum[0][i];
+ x.delSum[1][i]+=rs.delSum[1][i];
+ x.insSum[0][i]+=rs.insSum[0][i];
+ x.insSum[1][i]+=rs.insSum[1][i];
+ x.subSum[0][i]+=rs.subSum[0][i];
+ x.subSum[1][i]+=rs.subSum[1][i];
+ x.nSum[0][i]+=rs.nSum[0][i];
+ x.nSum[1][i]+=rs.nSum[1][i];
+ x.clipSum[0][i]+=rs.clipSum[0][i];
+ x.clipSum[1][i]+=rs.clipSum[1][i];
+ x.otherSum[0][i]+=rs.otherSum[0][i];
+ x.otherSum[1][i]+=rs.otherSum[1][i];
+ }
+ }
+ if(COLLECT_INSERT_STATS){
+ x.insertHist.add(rs.insertHist);
+ }
+ if(COLLECT_BASE_STATS){
+ for(int i=0; i<rs.baseHist.length; i++){
+ for(int j=0; j<rs.baseHist[i].length; j++){
+ x.baseHist[i][j].add(rs.baseHist[i][j]);
+ }
+ }
+ }
+ if(COLLECT_QUALITY_ACCURACY){
+ for(int i=0; i<x.qualMatch.length; i++){
+ x.qualMatch[i]+=rs.qualMatch[i];
+ x.qualSub[i]+=rs.qualSub[i];
+ x.qualIns[i]+=rs.qualIns[i];
+ x.qualDel[i]+=rs.qualDel[i];
+ }
+ }
+
+
+ if(COLLECT_INDEL_STATS){
+ x.delHist.add(rs.delHist);
+ x.delHist2.add(rs.delHist2);
+ x.insHist.add(rs.insHist);
+ }
+
+ if(COLLECT_LENGTH_STATS){
+ x.lengthHist.add(rs.lengthHist);
+ }
+
+
+ if(COLLECT_ERROR_STATS){
+ x.errorHist.add(rs.errorHist);
+ }
+
+ if(COLLECT_GC_STATS){
+ for(int i=0; i<rs.gcHist.length; i++){
+ x.gcHist[i]+=rs.gcHist[i];
+ }
+ }
+
+ if(COLLECT_IDENTITY_STATS){
+ for(int i=0; i<rs.idHist.length; i++){
+ x.idHist[i]+=rs.idHist[i];
+ x.idBaseHist[i]+=rs.idBaseHist[i];
+ }
+ }
+
+ if(COLLECT_TIME_STATS){
+ x.timeHist.add(rs.timeHist);
+ }
+
+ x.gcMaxReadLen=Tools.max(x.gcMaxReadLen, rs.gcMaxReadLen); //These two are maxima, not sums
+ x.idMaxReadLen=Tools.max(x.idMaxReadLen, rs.idMaxReadLen);
+ }
+
+ merged=x;
+ return x;
+ }
+
+ /**
+  * Records quality statistics for a read and, when present, its mate.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToQualityHistogram(final Read r){
+     if(r!=null){
+         addToQualityHistogram2(r);
+         final Read partner=r.mate;
+         if(partner!=null){addToQualityHistogram2(partner);}
+     }
+ }
+
+ /**
+  * Tallies quality statistics for a single read (the mate is not followed).
+  * Updates the per-position quality sums, the average-quality histogram and,
+  * when their output files are configured, the per-position and per-value
+  * quality count histograms.
+  * @param r Read to tally; ignored when null or when it has no quality scores
+  */
+ private void addToQualityHistogram2(final Read r){
+     //Validate first: the previous version called r.pairnum() ahead of this null check,
+     //so the r==null test could never take effect.
+     if(r==null || r.quality==null || r.quality.length<1){return;}
+     int pairnum=r.pairnum();
+     byte[] quals=r.quality, bases=r.bases;
+     final Object obj=r.obj;
+     if(obj!=null){
+         if(obj.getClass()==SamLine.class){
+             //An attached SamLine overrides the read's own pair number.
+             pairnum=((SamLine)obj).pairnum();
+         }else if(obj.getClass()==TrimRead.class){
+             //An attached TrimRead supplies the bases/qualities to tally for this pair number.
+             quals=(pairnum==0 ? ((TrimRead)obj).qual1 : ((TrimRead)obj).qual2);
+             bases=(pairnum==0 ? ((TrimRead)obj).bases1 : ((TrimRead)obj).bases2);
+         }
+     }
+     if(pairnum==1){read2Count++;}
+     addToQualityHistogram(quals, pairnum);
+     int x=Read.avgQualityByProbability(bases, quals, true, 0);
+     aqualArray[pairnum][x]++;
+     if(BQUAL_HIST_FILE!=null){
+         addToBQualityHistogram(quals, pairnum);
+     }
+     if(QUAL_COUNT_HIST_FILE!=null){
+         addToQCountHistogram(quals, pairnum);
+     }
+ }
+
+ public void addToQualityHistogram(byte[] qual, int pairnum){
+ if(qual==null || qual.length<1){return;}
+ final int limit=Tools.min(qual.length, MAXLEN);
+ final long[] ql=qualLength[pairnum], qs=qualSum[pairnum];
+ final double[] qsd=qualSumDouble[pairnum];
+ ql[limit-1]++;
+ for(int i=0; i<limit; i++){
+ qs[i]+=qual[i];
+ qsd[i]+=QualityTools.PROB_ERROR[qual[i]];
+ }
+ for(byte q : qual){
+ bqualHistOverall[q]++;
+ }
+ }
+
+ /**
+  * Increments the per-position quality-value counts for one quality array.
+  * Positions at or beyond MAXLEN are not counted.
+  * @param qual Quality scores; null or empty arrays are ignored
+  * @param pairnum Index selecting which read's histogram to update
+  */
+ private void addToBQualityHistogram(byte[] qual, int pairnum){
+     if(qual!=null && qual.length>0){
+         final int end=Tools.min(qual.length, MAXLEN);
+         final long[][] perPosition=bqualHist[pairnum];
+         for(int pos=0; pos<end; pos++){
+             final long[] counts=perPosition[pos];
+             counts[qual[pos]]++;
+         }
+     }
+ }
+
+ /**
+  * Increments the per-value quality counts for every score in the array.
+  * @param qual Quality scores; null or empty arrays are ignored
+  * @param pairnum Index selecting which read's histogram to update
+  */
+ private void addToQCountHistogram(byte[] qual, int pairnum){
+     if(qual==null || qual.length==0){return;}
+     final long[] counts=qcountHist[pairnum];
+     for(int i=0; i<qual.length; i++){
+         counts[qual[i]]++;
+     }
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the quality-accuracy tallies.
+  * The read is passed on with pair number 0 and the mate with pair number 1.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToQualityAccuracy(final Read r){
+     if(r!=null){
+         addToQualityAccuracy(r, 0);
+         final Read partner=r.mate;
+         if(partner!=null){addToQualityAccuracy(partner, 1);}
+     }
+ }
+
+ /**
+  * Walks a mapped read's match string and counts, per claimed quality value,
+  * how many bases were matches, substitutions, insertions, or flanked a deletion.
+  * Reads that are null, unmapped, or lack quality scores or a match string are skipped.
+  * @param r Read to tally
+  * @param pairnum Not referenced by this method's body
+  */
+ public void addToQualityAccuracy(final Read r, int pairnum){
+ if(r==null || r.quality==null || r.quality.length<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
+ final byte[] bases=r.bases;
+ final byte[] qual=r.quality;
+ byte[] match=r.match;
+
+ //Expand a short-form match string before walking it.
+ if(match!=null && r.shortmatch()){match=Read.toLongMatchString(match);}
+
+ //For strand 0 the match string is read left to right; otherwise it is read from the end.
+ final boolean plus=(r.strand()==0);
+ int rpos=0;
+ byte lastm='A';
+ //NOTE(review): bases[rpos]/qual[rpos] are read at the top of every iteration without a bounds
+ //check; this presumably relies on the match string never outrunning the read - confirm.
+ for(int mpos=0; mpos<match.length/* && rpos<limit*/; mpos++){
+ byte b=bases[rpos];
+ byte q=qual[rpos];
+ byte m=match[plus ? mpos : match.length-mpos-1];
+
+ {
+ if(m=='m'){
+ qualMatch[q]++;
+ }else if(m=='S'){
+ qualSub[q]++;
+ }else if(m=='I'){
+ //Insertions are only counted when the inserted base is fully defined.
+ if(AminoAcid.isFullyDefined(b)){qualIns[q]++;}
+ }else if(m=='N'){
+ //do nothing
+ }else if(m=='C'){
+ //do nothing
+ }else if(m=='D'){
+ //Count a deletion once per run of 'D', charging it to the qualities of the
+ //read bases on either side of it (rpos and rpos-1) when those exist.
+ if(lastm!=m){
+ int x=rpos, y=rpos-1;
+ if(x<qual.length){
+ if(AminoAcid.isFullyDefined(bases[x])){
+ qualDel[qual[x]]++;
+ }
+ }
+ if(y>=0){
+ if(AminoAcid.isFullyDefined(bases[y])){
+ qualDel[qual[y]]++;
+ }
+ }
+ }
+ //A deletion consumes no read base; this cancels the rpos++ below.
+ rpos--;
+ }else{
+ assert(!Character.isDigit(m)) : ((char)m);
+ }
+ }
+
+ rpos++;
+ lastm=m;
+ }
+
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the error-count histogram.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToErrorHistogram(final Read r){
+     if(r!=null){
+         addToErrorHistogram(r, 0);
+         final Read partner=r.mate;
+         if(partner!=null){addToErrorHistogram(partner, 1);}
+     }
+ }
+
+ private void addToErrorHistogram(final Read r, int pairnum){
+ if(r==null || r.bases==null || r.length()<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
+ int x=r.countSubs();
+ errorHist.increment(x, 1);
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the read-length histogram.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToLengthHistogram(final Read r){
+     if(r!=null){
+         addToLengthHistogram(r, 0);
+         final Read partner=r.mate;
+         if(partner!=null){addToLengthHistogram(partner, 1);}
+     }
+ }
+
+ private void addToLengthHistogram(final Read r, int pairnum){
+ if(r==null || r.bases==null){return;}
+ int x=Tools.min(r.length(), MAXLENGTHLEN);
+ lengthHist.increment(x, 1);
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the GC-content histogram.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToGCHistogram(final Read r){
+     if(r!=null){
+         addToGCHistogram(r, 0);
+         final Read partner=r.mate;
+         if(partner!=null){addToGCHistogram(partner, 1);}
+     }
+ }
+
+ private void addToGCHistogram(final Read r, int pairnum){
+ if(r==null || r.bases==null){return;}
+ gcHist[Tools.min(GC_BINS, (int)(r.gc()*(GC_BINS+1)))]++;
+ gcMaxReadLen=Tools.max(r.length(), gcMaxReadLen);
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the identity histograms.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToIdentityHistogram(final Read r){
+     if(r!=null){
+         addToIdentityHistogram(r, 0);
+         final Read partner=r.mate;
+         if(partner!=null){addToIdentityHistogram(partner, 1);}
+     }
+ }
+
+ private void addToIdentityHistogram(final Read r, int pairnum){
+ if(r==null || r.bases==null || r.length()<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
+ float id=r.identity();
+ idHist[(int)(id*ID_BINS)]++;
+ idBaseHist[(int)(id*ID_BINS)]+=r.length();
+ idMaxReadLen=Tools.max(r.length(), idMaxReadLen);
+ }
+
+ /**
+  * Adds a read to the time histogram.  The mate is not followed because the
+  * time for pairs is the same.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToTimeHistogram(final Read r){
+     if(r!=null){
+         addToTimeHistogram(r, 0);
+     }
+ }
+
+ private void addToTimeHistogram(final Read r, int pairnum){
+ if(r==null){return;}
+ assert(r.obj!=null && r.obj.getClass()==Long.class);
+ int x=(int)Tools.min(((Long)r.obj).longValue(), MAXTIMELEN);
+ timeHist.increment(x, 1);
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the indel-length histograms.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToIndelHistogram(final Read r){
+     if(r!=null){
+         addToIndelHistogram(r, 0);
+         final Read partner=r.mate;
+         if(partner!=null){addToIndelHistogram(partner, 1);}
+     }
+ }
+
+ /**
+  * Scans a mapped read's match string for runs of 'D' (deletion) and 'I'
+  * (insertion) symbols and records each run's length in the indel histograms.
+  * Reads that are null, empty, unmapped, or lack a match string are skipped.
+  * Scanning stops once min(bases.length, MAXLEN) match symbols have been read.
+  * @param r Read to tally
+  * @param pairnum Not referenced by this method's body
+  */
+ private void addToIndelHistogram(final Read r, int pairnum){
+     if(r==null || r.bases==null || r.length()<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
+     final byte[] bases=r.bases;
+     byte[] match=r.match;
+     //Expand short-form match strings, as the sibling match-walking methods do;
+     //run lengths cannot be measured on the compressed form.
+     if(r.shortmatch()){match=Read.toLongMatchString(match);}
+     if(match==null){return;}
+     final int limit=Tools.min(bases.length, MAXLEN);
+
+     int rpos=0;
+     int streak=0;
+     byte lastm='A';
+     for(int mpos=0; mpos<match.length && rpos<limit; mpos++){
+         final byte m=match[mpos];
+         if(lastm!=m){
+             //The symbol changed, so the previous run is complete.
+             recordIndelRun(lastm, streak);
+             streak=0;
+         }
+         streak++;
+         rpos++;
+         lastm=m;
+     }
+     //Flush a run that extends to the end of the scanned region.
+     recordIndelRun(lastm, streak);
+ }
+
+ /** Records one completed run of match symbol {@code symbol}; symbols other than 'D' and 'I' are ignored. */
+ private void recordIndelRun(final byte symbol, final int runLength){
+     if(symbol=='D'){
+         final int len=Tools.min(runLength, MAXDELLEN2);
+         if(len<MAXDELLEN){delHist.increment(len, 1);}
+         delHist2.increment(len/DEL_BIN, 1);
+     }else if(symbol=='I'){
+         insHist.increment(Tools.min(runLength, MAXINSLEN), 1);
+     }
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the per-position match histograms.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToMatchHistogram(final Read r){
+     if(r!=null){
+         addToMatchHistogram2(r);
+         final Read partner=r.mate;
+         if(partner!=null){addToMatchHistogram2(partner);}
+     }
+ }
+
+ /**
+  * Walks a mapped read's match string and increments, per read position, the
+  * counter for that position's match symbol (match, substitution, insertion,
+  * deletion, N, clip, or other).  Only the first MAXLEN read positions are counted.
+  * Reads that are null, empty, unmapped, or lack a match string are skipped.
+  * @param r Read to tally (the mate is not followed)
+  */
+ private void addToMatchHistogram2(final Read r){
+ if(r==null || r.bases==null || r.length()<1 || !r.mapped() || r.match==null/* || r.discarded()*/){return;}
+ int pairnum=r.pairnum();
+ //An attached SamLine overrides the read's own pair number.
+ if(r.obj!=null){
+ if(r.obj.getClass()==SamLine.class){
+ pairnum=((SamLine)r.obj).pairnum();
+ }
+ }
+ if(pairnum==1){read2Count++;}
+ final byte[] bases=r.bases;
+ final int limit=Tools.min(bases.length, MAXLEN);
+ final long[] ms=matchSum[pairnum], ds=delSum[pairnum], is=insSum[pairnum],
+ ss=subSum[pairnum], ns=nSum[pairnum], cs=clipSum[pairnum], os=otherSum[pairnum];
+
+ byte[] match=r.match;
+ if(r.shortmatch() && match!=null){match=Read.toLongMatchString(match);}
+
+ //NOTE(review): r.match==null was already rejected above, so this branch appears
+ //reachable only if Read.toLongMatchString returns null - confirm.
+ if(match==null){
+ for(int i=0; i<limit; i++){
+ byte b=bases[i];
+ if(b=='N'){ns[i]++;}
+ else{os[i]++;}
+ }
+ }else{
+ //For strand 0 the match string is read left to right; otherwise it is read from the end.
+ final boolean plus=(r.strand()==0);
+ int rpos=0;
+ byte lastm='A';
+ for(int mpos=0; mpos<match.length && rpos<limit; mpos++){
+ byte b=bases[rpos];//bases[plus ? rpos : bases.length-rpos-1];
+ byte m=match[plus ? mpos : match.length-mpos-1];//match[mpos];
+ if(b=='N'){
+ if(m=='D'){
+ //Count a deletion once per run; it consumes no read base (rpos-- cancels the rpos++ below).
+ if(lastm!=m){ds[rpos]++;}
+ rpos--;
+ }else{ns[rpos]++;}
+ }else{
+ if(m=='m'){
+ ms[rpos]++;
+ }else if(m=='S'){
+ ss[rpos]++;
+ }else if(m=='I'){
+ is[rpos]++;
+ }else if(m=='N'){
+// assert(false) : "\n"+r+"\n"+new String(Data.getChromosome(r.chrom).getBytes(r.start, r.stop))+"\nrpos="+rpos+", mpos="+mpos;
+ os[rpos]++;
+ }else if(m=='C'){
+// assert(false) : r;
+ cs[rpos]++;
+ }else if(m=='D'){
+ //Same deletion handling as in the b=='N' case above.
+ if(lastm!=m){ds[rpos]++;}
+ rpos--;
+ }else{
+ //Unknown symbols are counted as "other"; with assertions enabled they also fail loudly.
+ os[rpos]++;
+ assert(false) : "For read "+r.numericID+", unknown symbol in match string: ASCII "+m+" = "+(char)m;
+ }
+ }
+ rpos++;
+ lastm=m;
+ }
+ }
+ }
+
+ /**
+  * Adds a read pair's mapped insert size to the insert-size histogram.
+  * Nothing is recorded unless the read and its mate are both present and
+  * mapped and the read reports itself as paired.  Sizes are capped at
+  * MAXINSERTLEN, and non-positive sizes are not recorded.
+  * @param r Read to tally; may be null
+  * @param ignoreMappingStrand Passed through to Read.insertSizeMapped
+  */
+ public void addToInsertHistogram(final Read r, boolean ignoreMappingStrand){
+     final boolean usable=(r!=null && r.mate!=null && r.mapped() && r.mate.mapped() && r.paired());
+     if(verbose){
+         //The previous version printed r.numericID before testing r for null.
+         System.err.print(r==null ? "null" : String.valueOf(r.numericID));
+         if(!usable){
+             System.err.println("\n");
+         }else{
+             System.err.println("\t"+r.strand()+"\t"+r.insertSizeMapped(ignoreMappingStrand)+"\t"+r.mate.insertSizeMapped(ignoreMappingStrand));
+         }
+     }
+     if(!usable){return;}
+     int x=Tools.min(MAXINSERTLEN, r.insertSizeMapped(ignoreMappingStrand));
+     if(x>0){insertHist.increment(x, 1);}
+// assert(x!=1) : "\n"+r+"\n\n"+r.mate+"\n";
+// System.out.println("Incrementing "+x);
+ }
+
+ /**
+  * Adds a read, and its mate if one exists, to the per-position base-content histogram.
+  * @param r Read to tally; a null read is ignored
+  */
+ public void addToBaseHistogram(final Read r){
+     //Guard added for consistency with the other addTo* entry points; r.mate was
+     //previously dereferenced even when r was null.
+     if(r==null){return;}
+     addToBaseHistogram2(r);
+     if(r.mate!=null){addToBaseHistogram2(r.mate);}
+ }
+
+ /**
+  * Counts each base of a single read at its position in the base-content
+  * histogram.  The list index for a base is AminoAcid.baseToNumber[base]+1.
+  * @param r Read to tally (the mate is not followed); ignored when null or without bases
+  */
+ public void addToBaseHistogram2(final Read r){
+     if(r==null || r.bases==null){return;}
+
+     int pairnum=r.pairnum();
+     final Object attachment=r.obj;
+     if(attachment!=null && attachment.getClass()==SamLine.class){
+         //An attached SamLine overrides the read's own pair number.
+         pairnum=((SamLine)attachment).pairnum();
+     }
+     if(pairnum==1){read2Count++;}
+
+     final byte[] seq=r.bases;
+     final LongList[] perBase=baseHist[pairnum];
+     int pos=0;
+     while(pos<seq.length){
+         final int listIndex=AminoAcid.baseToNumber[seq[pos]]+1;
+         perBase[listIndex].increment(pos, 1);
+         pos++;
+     }
+ }
+
+ /**
+  * Passes every configured histogram output path, together with the
+  * overwrite and append settings, to Tools.testOutputFiles.
+  * @param allowDuplicates Forwarded to Tools.testOutputFiles
+  * @return The result of Tools.testOutputFiles
+  */
+ public static boolean testFiles(boolean allowDuplicates){
+     final boolean result=Tools.testOutputFiles(overwrite, append, allowDuplicates,
+         AVG_QUAL_HIST_FILE, QUAL_HIST_FILE, BQUAL_HIST_FILE, BQUAL_HIST_OVERALL_FILE, QUAL_COUNT_HIST_FILE,
+         MATCH_HIST_FILE, INSERT_HIST_FILE, BASE_HIST_FILE, QUAL_ACCURACY_FILE, INDEL_HIST_FILE, ERROR_HIST_FILE, LENGTH_HIST_FILE,
+         GC_HIST_FILE, IDENTITY_HIST_FILE, TIME_HIST_FILE);
+     return result;
+ }
+
+ /**
+  * Merges all collected statistics and writes every histogram whose output
+  * file is configured.  Does nothing when no statistics are being collected.
+  * @return The merged object's errorState, or false when nothing was collected
+  */
+ public static boolean writeAll(){
+     if(!collectingStats()){return false;}
+
+     final ReadStats combined=mergeAll();
+     //Paired columns are written only if at least one read 2 was seen.
+     final boolean sawRead2=(combined.read2Count>0);
+
+     if(AVG_QUAL_HIST_FILE!=null){combined.writeAverageQualityToFile(AVG_QUAL_HIST_FILE, sawRead2);}
+     if(QUAL_HIST_FILE!=null){combined.writeQualityToFile(QUAL_HIST_FILE, sawRead2);}
+     if(BQUAL_HIST_FILE!=null){combined.writeBQualityToFile(BQUAL_HIST_FILE, sawRead2);}
+     if(BQUAL_HIST_OVERALL_FILE!=null){combined.writeBQualityOverallToFile(BQUAL_HIST_OVERALL_FILE);}
+     if(QUAL_COUNT_HIST_FILE!=null){combined.writeQCountToFile(QUAL_COUNT_HIST_FILE, sawRead2);}
+     if(MATCH_HIST_FILE!=null){combined.writeMatchToFile(MATCH_HIST_FILE, sawRead2);}
+     if(INSERT_HIST_FILE!=null){combined.writeInsertToFile(INSERT_HIST_FILE);}
+     if(BASE_HIST_FILE!=null){combined.writeBaseContentToFile(BASE_HIST_FILE, sawRead2);}
+     if(QUAL_ACCURACY_FILE!=null){combined.writeQualityAccuracyToFile(QUAL_ACCURACY_FILE);}
+
+     if(INDEL_HIST_FILE!=null){combined.writeIndelToFile(INDEL_HIST_FILE);}
+     if(ERROR_HIST_FILE!=null){combined.writeErrorToFile(ERROR_HIST_FILE);}
+     if(LENGTH_HIST_FILE!=null){combined.writeLengthToFile(LENGTH_HIST_FILE);}
+     if(GC_HIST_FILE!=null){combined.writeGCToFile(GC_HIST_FILE, true);}
+     if(IDENTITY_HIST_FILE!=null){combined.writeIdentityToFile(IDENTITY_HIST_FILE, true);}
+     if(TIME_HIST_FILE!=null){combined.writeTimeToFile(TIME_HIST_FILE);}
+
+     return combined.errorState;
+ }
+
+ /**
+  * Writes the average-quality histogram: one row per quality value with its
+  * count and fraction for read 1, plus read 2 when requested.  Output stops
+  * after the row that accounts for the last counted read.
+  * @param fname Output file
+  * @param writePaired Whether to include the read 2 columns
+  */
+ public void writeAverageQualityToFile(String fname, boolean writePaired){
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, append, false);
+     out.start();
+     String header="#Quality\tcount1\tfraction1";
+     if(writePaired){header+="\tcount2\tfraction2";}
+     out.print(header+"\n");
+
+     final long[] hist1=aqualArray[0], hist2=aqualArray[1];
+     final long total1=Tools.sum(hist1);
+     final long total2=Tools.sum(hist2);
+     final double inv1=1.0/Tools.max(1, total1);
+     final double inv2=1.0/Tools.max(1, total2);
+
+     //Counts not yet printed; once this reaches zero the remaining rows are all empty.
+     long remaining=total1+total2;
+     final StringBuilder line=new StringBuilder();
+     for(int q=0; q<hist1.length; q++){
+         final long c1=hist1[q], c2=hist2[q];
+         remaining-=(c1+c2);
+         line.setLength(0);
+         line.append(String.format("%d\t%d\t%.5f", q, c1, c1*inv1));
+         if(writePaired){
+             line.append(String.format("\t%d\t%.5f", c2, c2*inv2));
+         }
+         line.append("\n");
+         out.print(line.toString());
+         if(remaining<=0){break;}
+     }
+     out.poison();
+     out.waitForFinish();
+     errorState|=out.errorState;
+ }
+
+ /**
+  * Writes the per-value quality count histogram: one row per quality value
+  * with its count and fraction for read 1, plus read 2 when requested.
+  * Output stops after the row that accounts for the last counted base.
+  * @param fname Output file
+  * @param writePaired Whether to include the read 2 columns
+  */
+ public void writeQCountToFile(String fname, boolean writePaired){
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, append, false);
+     out.start();
+     final String pairedColumns=(writePaired ? "\tcount2\tfraction2" : "");
+     out.print("#Quality\tcount1\tfraction1"+pairedColumns+"\n");
+
+     final long[] hist1=qcountHist[0], hist2=qcountHist[1];
+     final long total1=Tools.sum(hist1);
+     final long total2=Tools.sum(hist2);
+     final double inv1=1.0/Tools.max(1, total1);
+     final double inv2=1.0/Tools.max(1, total2);
+
+     long remaining=total1+total2;
+     int q=0;
+     while(q<hist1.length){
+         final long c1=hist1[q], c2=hist2[q];
+         remaining=remaining-c1-c2;
+         String row=String.format("%d\t%d\t%.5f", q, c1, c1*inv1);
+         if(writePaired){
+             row+=String.format("\t%d\t%.5f", c2, c2*inv2);
+         }
+         out.print(row+"\n");
+         if(remaining<=0){break;}
+         q++;
+     }
+     out.poison();
+     out.waitForFinish();
+     errorState|=out.errorState;
+ }
+
+ /**
+  * Writes the mean quality at each read position, both as a linear average of
+  * the scores and as the Phred value of the average error probability.  When
+  * match statistics are available, a third "measured" column from
+  * calcQualityAtPosition is added.  Rows stop at the first position that no
+  * read reaches.
+  * @param fname Output file
+  * @param writePaired Whether to include the read 2 columns
+  */
+ public void writeQualityToFile(String fname, boolean writePaired){
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
+     tsw.start();
+     final boolean measure=(matchSum!=null);
+     if(measure){
+         if(writePaired){
+             tsw.print("#BaseNum\tRead1_linear\tRead1_log\tRead1_measured\tRead2_linear\tRead2_log\tRead2_measured\n");
+         }else{
+             tsw.print("#BaseNum\tRead1_linear\tRead1_log\tRead1_measured\n");
+         }
+     }else{
+         tsw.print("#BaseNum\tRead1_linear\tRead1_log"+(writePaired ? "\tRead2_linear\tRead2_log" : "")+"\n");
+     }
+
+     final long[] qs1=qualSum[0], qs2=qualSum[1];
+     final double[] qsd1=qualSumDouble[0], qsd2=qualSumDouble[1];
+
+     //qualLength[x][i] counts reads whose (clamped) last position is i.  Turn it into
+     //"reads covering position i" with a suffix sum, but on copies: summing in place
+     //would corrupt the stored counts and make a second call write wrong values.
+     final long[] ql1=Arrays.copyOf(qualLength[0], qualLength[0].length);
+     final long[] ql2=Arrays.copyOf(qualLength[1], qualLength[1].length);
+     for(int i=MAXLEN-2; i>=0; i--){
+         ql1[i]+=ql1[i+1];
+         ql2[i]+=ql2[i+1];
+     }
+
+     if(writePaired){
+         for(int i=0; i<MAXLEN && (ql1[i]>0 || ql2[i]>0); i++){
+             int a=i+1;
+             double blin, clin, blog, clog;
+             blin=qs1[i]/(double)Tools.max(1, ql1[i]);
+             clin=qs2[i]/(double)Tools.max(1, ql2[i]);
+             blog=qsd1[i]/(double)Tools.max(1, ql1[i]);
+             clog=qsd2[i]/(double)Tools.max(1, ql2[i]);
+             blog=QualityTools.probErrorToPhredDouble(blog);
+             clog=QualityTools.probErrorToPhredDouble(clog);
+             if(measure){
+                 double bcalc=calcQualityAtPosition(i, 0);
+                 double ccalc=calcQualityAtPosition(i, 1);
+                 tsw.print(String.format("%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", a, blin, blog, bcalc, clin, clog, ccalc));
+             }else{
+                 tsw.print(String.format("%d\t%.3f\t%.3f\t%.3f\t%.3f\n", a, blin, blog, clin, clog));
+             }
+         }
+     }else{
+         for(int i=0; i<MAXLEN && ql1[i]>0; i++){
+             int a=i+1;
+             double blin, blog;
+             blin=qs1[i]/(double)Tools.max(1, ql1[i]);
+             blog=qsd1[i]/(double)Tools.max(1, ql1[i]);
+             blog=QualityTools.probErrorToPhredDouble(blog);
+             if(measure){
+                 double bcalc=calcQualityAtPosition(i, 0);
+                 tsw.print(String.format("%d\t%.3f\t%.3f\t%.3f\n", a, blin, blog, bcalc));
+             }else{
+                 tsw.print(String.format("%d\t%.3f\t%.3f\n", a, blin, blog));
+             }
+         }
+     }
+     tsw.poison();
+     tsw.waitForFinish();
+     errorState|=tsw.errorState;
+ }
+
+ private double calcQualityAtPosition(int pos, int pairnum){
+ long m=matchSum[pairnum][pos];
+ long d=delSum[pairnum][pos]; //left-adjacent deletion
+ long d2=delSum[pairnum][Tools.min(pos, delSum[pairnum].length-1)]; //right-adjacent deletion
+ long i=insSum[pairnum][pos];
+ long s=subSum[pairnum][pos];
+ long n=nSum[pairnum][pos]; //Not generally useful
+ long good=Tools.max(0, m*2-d-d2);
+ long total=Tools.max(0, m*2+i*2+s*2); //not d
+ long bad=total-good;
+ if(total<1){return 0;}
+ double error=bad/(double)total;
+ return QualityTools.probErrorToPhredDouble(error);
+ }
+
+ /**
+  * Writes the overall base-quality histogram, preceded by summary lines:
+  * median, mean and standard deviation over all values, and the mean and
+  * standard deviation over a copy in which the counts for values below 30
+  * are zeroed.  Rows stop after the one that accounts for the last counted base.
+  * @param fname Output file
+  */
+ public void writeBQualityOverallToFile(String fname){
+     //Copy of the histogram with the first 30 bins cleared, for the *_30 statistics.
+     final long[] highOnly=bqualHistOverall.clone();
+     Arrays.fill(highOnly, 0, 30, 0L);
+
+     final long total=Tools.sum(bqualHistOverall);
+     final long median=Tools.percentile(bqualHistOverall, 0.5);
+     final double mean=Tools.averageHistogram(bqualHistOverall);
+     final double stdev=Tools.standardDeviationHistogram(bqualHistOverall);
+     final double mean30=Tools.averageHistogram(highOnly);
+     final double stdev30=Tools.standardDeviationHistogram(highOnly);
+     final double inv=1.0/Tools.max(1, total);
+
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, append, false);
+     out.start();
+     out.print("#Median\t"+median+"\n");
+     out.print(String.format("#Mean\t%.3f\n", mean));
+     out.print(String.format("#STDev\t%.3f\n", stdev));
+     out.print(String.format("#Mean_30\t%.3f\n", mean30));
+     out.print(String.format("#STDev_30\t%.3f\n", stdev30));
+     out.print("#Quality\tbases\tfraction\n");
+
+     long remaining=total;
+     for(int q=0; q<bqualHistOverall.length; q++){
+         final long count=bqualHistOverall[q];
+         remaining-=count;
+         out.print(String.format("%d\t%d\t%.5f\n", q, count, count*inv));
+         if(remaining<=0){break;}
+     }
+     out.poison();
+     out.waitForFinish();
+     errorState|=out.errorState;
+ }
+
+ /**
+  * Writes box-plot style statistics of the quality values at each read
+  * position: count, min, max, mean, quartiles, median and the 2nd/98th
+  * percentile whiskers, for read 1 and optionally read 2.  Rows stop at the
+  * first position with no data for either read.
+  * @param fname Output file
+  * @param writePaired Whether to include the read 2 columns
+  */
+ public void writeBQualityToFile(String fname, boolean writePaired){
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
+     tsw.start();
+     tsw.print("#BaseNum\tcount_1\tmin_1\tmax_1\tmean_1\tQ1_1\tmed_1\tQ3_1\tLW_1\tRW_1");
+     if(writePaired){tsw.print("\tcount_2\tmin_2\tmax_2\tmean_2\tQ1_2\tmed_2\tQ3_2\tLW_2\tRW_2");}
+     tsw.print("\n");
+
+     for(int i=0; i<MAXLEN; i++){
+         final long[] a1=bqualHist[0][i], a2=bqualHist[1][i];
+         final long sum1=Tools.sum(a1), sum2=Tools.sum(a2);
+         if(sum1<1 && sum2<1){break;}
+
+         tsw.print(String.format("%d", i)+formatBQualColumns(a1, sum1));
+         if(writePaired){tsw.print(formatBQualColumns(a2, sum2));}
+         tsw.print("\n");
+     }
+     tsw.poison();
+     tsw.waitForFinish();
+     errorState|=tsw.errorState;
+ }
+
+ /** Formats the nine tab-prefixed statistic columns for one position's quality histogram {@code a} holding {@code sum} observations. */
+ private static String formatBQualColumns(final long[] a, final long sum){
+     final long weightedSum=Tools.sumHistogram(a);
+     final long med=Tools.median(a), min=Tools.minHistogram(a), max=Tools.maxHistogram(a);
+     final long firstQuart=Tools.percentile(a, 0.25);
+     final long thirdQuart=Tools.percentile(a, 0.75);
+     final long leftWhisker=Tools.percentile(a, 0.02);
+     final long rightWhisker=Tools.percentile(a, 0.98);
+     //Clamp the divisor to 1, not 0: one read can have no data at a position where the
+     //other still does, and 0/0 would print NaN as the mean.
+     final double mean=weightedSum*1.0/Tools.max(sum, 1);
+     return String.format("\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\t%d", sum, min, max, mean, firstQuart, med, thirdQuart, leftWhisker, rightWhisker);
+ }
+
+ /**
+  * Writes, for each claimed quality value, the weighted match / substitution /
+  * insertion / deletion counts and the quality actually observed, both from
+  * all errors and from substitutions alone.  Two header lines give the
+  * observation-weighted mean absolute deviation between claimed and observed
+  * quality.  Trailing quality values with no observations are omitted.
+  * @param fname Output file
+  */
+ public void writeQualityAccuracyToFile(String fname){
+
+     //Trim trailing quality values that were never observed.
+     int max=qualMatch.length;
+     for(int i=max-1; i>=0; i--){
+         if(qualMatch[i]+qualSub[i]+qualIns[i]+qualDel[i]>0){break;}
+         max=i;
+     }
+
+     double devsum=0;
+     double devsumSub=0;
+     long observations=0;
+     for(int i=0; i<max; i++){
+         //Matches, substitutions and insertions are weighted double relative to deletions.
+         long qm=qualMatch[i]*2;
+         long qs=qualSub[i]*2;
+         long qi=qualIns[i]*2;
+         long qd=qualDel[i];
+
+         double phred=-1;
+         double phredSub=-1;
+
+         long sum=qm+qs+qi+qd;
+         if(sum>0){
+             double mult=1.0/sum;
+             double subRate=(qs)*mult;
+             double errorRate=(qs+qi+qd)*mult;
+
+             phredSub=QualityTools.probErrorToPhredDouble(subRate);
+             phred=QualityTools.probErrorToPhredDouble(errorRate);
+             double deviation=phred-i;
+             double deviationSub=phredSub-i;
+             //Deviations in the outward direction at the extreme called qualities are not counted.
+             if(i==Read.MIN_CALLED_QUALITY && deviation<0){deviation=0;}
+             else if(i==Read.MAX_CALLED_QUALITY && max==Read.MAX_CALLED_QUALITY+1 && deviation>0){deviation=0;}
+             if(i==Read.MIN_CALLED_QUALITY && deviationSub<0){deviationSub=0;}
+             else if(i==Read.MAX_CALLED_QUALITY && max==Read.MAX_CALLED_QUALITY+1 && deviationSub>0){deviationSub=0;}
+             devsum+=(Math.abs(deviation)*sum);
+             devsumSub+=(Math.abs(deviationSub)*sum);
+             observations+=sum;
+         }
+     }
+
+     //With no observations the sums are 0; report 0 rather than the NaN that 0/0 produces.
+     final double meanDeviation=(observations>0 ? devsum/observations : 0);
+     final double meanDeviationSub=(observations>0 ? devsumSub/observations : 0);
+
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, append, false);
+     tsw.start();
+     tsw.print(String.format("#Deviation\t%.3f\n", meanDeviation));
+     tsw.print(String.format("#DeviationSub\t%.3f\n", meanDeviationSub));
+     tsw.print("#Quality\tMatch\tSub\tIns\tDel\tTrueQuality\tTrueQualitySub\n");
+     for(int i=0; i<max; i++){
+         long qm=qualMatch[i]*2;
+         long qs=qualSub[i]*2;
+         long qi=qualIns[i]*2;
+         long qd=qualDel[i];
+
+         double phred=-1;
+         double phredSub=-1;
+
+         long sum=qm+qs+qi+qd;
+         if(sum>0){
+             double mult=1.0/sum;
+             double subRate=(qs)*mult;
+             double errorRate=(qs+qi+qd)*mult;
+
+             phredSub=QualityTools.probErrorToPhredDouble(subRate);
+             phred=QualityTools.probErrorToPhredDouble(errorRate);
+
+//             System.err.println("sub: "+qs+"/"+sum+" -> "+subRate+" -> "+phredSub);
+         }
+
+         //The observed-quality columns are left empty for values with no observations.
+         tsw.print(i+"\t"+qm+"\t"+qs+"\t"+qi+"\t"+qd);
+         tsw.print(phred>=0 ? String.format("\t%.2f", phred) : "\t");
+         tsw.print(phredSub>=0 ? String.format("\t%.2f\n", phredSub) : "\t\n");
+
+//         System.err.println(qm+"\t"+qs+"\t"+qi+"\t"+qd);
+     }
+
+     tsw.poison();
+     tsw.waitForFinish();
+     errorState|=tsw.errorState;
+ }
+
+ /**
+  * Writes, for each read position, the fraction of bases that were matches,
+  * substitutions, deletions, insertions, N, or other (clipped symbols are
+  * folded into "other"), for both reads.  Deletions are excluded from the
+  * denominator.  Delegates to writeMatchToFileUnpaired when writePaired is
+  * false.  Rows stop at the first position with no data for either read.
+  * @param fname Output file
+  * @param writePaired Whether to write read 2 columns
+  */
+ public void writeMatchToFile(String fname, boolean writePaired){
+     if(!writePaired){
+         writeMatchToFileUnpaired(fname);
+         return;
+     }
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, false, false);
+     out.start();
+     out.print("#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1\tMatch2\tSub2\tDel2\tIns2\tN2\tOther2\n");
+
+     final long[] match1=matchSum[0], del1=delSum[0], ins1=insSum[0],
+         sub1=subSum[0], n1=nSum[0], clip1=clipSum[0], other1=otherSum[0];
+     final long[] match2=matchSum[1], del2=delSum[1], ins2=insSum[1],
+         sub2=subSum[1], n2=nSum[1], clip2=clipSum[1], other2=otherSum[1];
+
+     final String sixColumns="\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f";
+     final StringBuilder row=new StringBuilder();
+     for(int pos=0; pos<MAXLEN; pos++){
+         //Denominators exclude deletions.
+         final long total1=match1[pos]+ins1[pos]+sub1[pos]+n1[pos]+clip1[pos]+other1[pos];
+         final long total2=match2[pos]+ins2[pos]+sub2[pos]+n2[pos]+clip2[pos]+other2[pos];
+         if(total1==0 && total2==0){break;}
+         final double inv1=1.0/(double)Tools.max(1, total1);
+         final double inv2=1.0/(double)Tools.max(1, total2);
+
+         row.setLength(0);
+         row.append(String.format("%d", pos+1));
+         row.append(String.format(sixColumns,
+             match1[pos]*inv1, sub1[pos]*inv1, del1[pos]*inv1, ins1[pos]*inv1, n1[pos]*inv1, (other1[pos]+clip1[pos])*inv1));
+         row.append(String.format(sixColumns,
+             match2[pos]*inv2, sub2[pos]*inv2, del2[pos]*inv2, ins2[pos]*inv2, n2[pos]*inv2, (other2[pos]+clip2[pos])*inv2));
+         row.append("\n");
+         out.print(row.toString());
+     }
+     out.poison();
+     out.waitForFinish();
+     errorState|=out.errorState;
+ }
+
+ /**
+  * Writes, for each position of read 1, the fraction of bases that were
+  * matches, substitutions, deletions, insertions, N, or other (clipped
+  * symbols are folded into "other").  Deletions are excluded from the
+  * denominator.  Rows stop at the first position with no data.
+  * @param fname Output file
+  */
+ public void writeMatchToFileUnpaired(String fname){
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, false, false);
+     out.start();
+     out.print("#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1\n");
+
+     final long[] matches=matchSum[0], dels=delSum[0], inss=insSum[0],
+         subs=subSum[0], ns=nSum[0], clips=clipSum[0], others=otherSum[0];
+
+     int pos=0;
+     while(pos<MAXLEN){
+         //Denominator excludes deletions.
+         final long total=matches[pos]+inss[pos]+subs[pos]+ns[pos]+clips[pos]+others[pos];
+         if(total==0){break;}
+         final double inv=1.0/(double)Tools.max(1, total);
+
+         final String fractions=String.format("\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f",
+             matches[pos]*inv, subs[pos]*inv, dels[pos]*inv, inss[pos]*inv, ns[pos]*inv, (others[pos]+clips[pos])*inv);
+         out.print(String.format("%d", pos+1)+fractions+"\n");
+         pos++;
+     }
+     out.poison();
+     out.waitForFinish();
+     errorState|=out.errorState;
+ }
+
+ /**
+  * Writes the insert-size histogram, preceded by header lines with its mean,
+  * median, mode and standard deviation and the mated-pair percentage.  Zero
+  * counts are printed only when skipZeroInsertCount is false.
+  * @param fname Output file
+  */
+ public void writeInsertToFile(String fname){
+     final StringBuilder header=new StringBuilder();
+     header.append("#Mean\t").append(String.format("%.3f", Tools.averageHistogram(insertHist.array))).append("\n");
+     header.append("#Median\t").append(Tools.percentile(insertHist.array, 0.5)).append("\n");
+     header.append("#Mode\t").append(Tools.calcMode(insertHist.array)).append("\n");
+     header.append("#STDev\t").append(String.format("%.3f", Tools.standardDeviationHistogram(insertHist.array))).append("\n");
+     header.append("#PercentOfPairs\t").append(String.format("%.3f", matedPercent)).append("\n");
+     header.append("#InsertSize\tCount\n");
+     final boolean printZeros=!skipZeroInsertCount;
+     writeHistogramToFile(fname, header.toString(), insertHist, printZeros);
+ }
+
+ /**
+  * Writes the per-position base composition (fractions of A, C, G, T and N).
+  * Read 1 rows come first; when paired, read 2 rows follow with their position
+  * numbers offset by the number of read 1 rows.
+  * @param fname Output file
+  * @param paired Whether to append the read 2 rows
+  */
+ public void writeBaseContentToFile(String fname, boolean paired){
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
+     tsw.start();
+//     if(paired){
+     tsw.print("#Pos\tA\tC\tG\tT\tN\n");
+//     }
+
+     //(An unused local "LongList[] lists" was removed here.)
+     int max=writeBaseContentToFile2(tsw, baseHist[0], 0);
+     if(paired){
+         writeBaseContentToFile2(tsw, baseHist[1], max);
+     }
+
+     tsw.poison();
+     tsw.waitForFinish();
+     errorState|=tsw.errorState;
+ }
+
+ private static int writeBaseContentToFile2(TextStreamWriter tsw, LongList[] lists, int offset){
+ int max=0;
+ StringBuilder sb=new StringBuilder(100);
+ for(LongList ll : lists){max=Tools.max(max, ll.size);}
+ for(int i=0; i<max; i++){
+ long a=lists[1].get(i);
+ long c=lists[2].get(i);
+ long g=lists[3].get(i);
+ long t=lists[4].get(i);
+ long n=lists[0].get(i);
+ double mult=1.0/(a+c+g+t+n);
+
+ sb.setLength(0);
+ sb.append(i+offset).append('\t');
+ sb.append(String.format("%.5f\t", a*mult));
+ sb.append(String.format("%.5f\t", c*mult));
+ sb.append(String.format("%.5f\t", g*mult));
+ sb.append(String.format("%.5f\t", t*mult));
+ sb.append(String.format("%.5f\n", n*mult));
+
+ tsw.print(sb.toString());
+ }
+ return max;
+ }
+
+ /**
+  * Writes the deletion and insertion length histograms side by side, one row
+  * per length.  Rows where both counts are zero are omitted when
+  * skipZeroIndel is set.
+  * @param fname Output file
+  */
+ public void writeIndelToFile(String fname){
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, false, false);
+     out.start();
+     out.print("#Length\tDeletions\tInsertions\n");
+
+     final int rows=Tools.max(insHist.size, delHist.size);
+
+     int len=0;
+     while(len<rows){
+         final long dels=delHist.get(len);
+         final long inss=insHist.get(len);
+         final boolean empty=(dels<=0 && inss<=0);
+         if(!empty || !skipZeroIndel){
+             out.print(len+"\t"+dels+"\t"+inss+"\n");
+         }
+         len++;
+     }
+
+     //TODO: Disabled because it was irritating when graphing. Should write to a different file.
+//     tsw.print("#Length_bin\tDeletions\n");
+//     max=delHist2.size;
+//     for(int i=0; i<max; i++){
+//         long x=delHist2.get(i);
+//         if(x>0 || !skipZeroIndel){
+//             tsw.print((i*DEL_BIN)+"\t"+x+"\n");
+//         }
+//     }
+
+     out.poison();
+     out.waitForFinish();
+     errorState|=out.errorState;
+ }
+
+ /**
+  * Writes the errors-per-read histogram, omitting zero counts.
+  * @param fname Output file
+  */
+ public void writeErrorToFile(String fname){
+     final String header="#Errors\tCount\n";
+     writeHistogramToFile(fname, header, errorHist, false);
+ }
+
+ /**
+  * Writes the read-length histogram, omitting zero counts.
+  * @param fname Output file
+  */
+ public void writeLengthToFile(String fname){
+     final String header="#Length\tCount\n";
+     writeHistogramToFile(fname, header, lengthHist, false);
+ }
+
+ /**
+  * Writes the time histogram, omitting zero counts.
+  * @param fname Output file
+  */
+ public void writeTimeToFile(String fname){
+     final String header="#Time\tCount\n";
+     writeHistogramToFile(fname, header, timeHist, false);
+ }
+
+ /**
+  * Writes a header followed by one "index TAB count" row per histogram bin.
+  * @param fname Output file
+  * @param header Text printed before the rows
+  * @param hist Histogram to print
+  * @param printZeros Whether bins with a count of zero are printed
+  */
+ public void writeHistogramToFile(String fname, String header, LongList hist, boolean printZeros){
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, false, false);
+     out.start();
+     out.print(header);
+
+     final int bins=hist.size;
+     int i=0;
+     while(i<bins){
+         final long count=hist.get(i);
+         if(printZeros || count>0){
+             out.print(i+"\t"+count+"\n");
+         }
+         i++;
+     }
+     out.poison();
+     out.waitForFinish();
+     errorState|=out.errorState;
+ }
+
+ public void writeGCToFile(String fname, boolean printZeros){
+ final long[] hist;
+ if(GC_BINS_AUTO && gcMaxReadLen+1<gcHist.length){
+ hist=Tools.downsample(gcHist, gcMaxReadLen+1);
+ }else{
+ hist=gcHist;
+ }
+ final int bins=hist.length;
+ final double gcMult=100.0/Tools.max(1, bins-1);
+ final long total=Tools.sum(hist);
+ final long max=Tools.max(hist);
+ final double countsPerX=Tools.max(1, ((max*1000.0)/40));
+ final double fractionMult=1.0/Tools.max(1, total);
+ long sum=0;
+
+ TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
+ tsw.start();
+ tsw.print("#Mean\t"+String.format("%.3f", Tools.averageHistogram(hist)*gcMult)+"\n");
+ tsw.print("#Median\t"+String.format("%.3f", Tools.percentile(hist, 0.5)*gcMult)+"\n");
+ tsw.print("#Mode\t"+String.format("%.3f", Tools.calcMode(hist)*gcMult)+"\n");
+ tsw.print("#STDev\t"+String.format("%.3f", Tools.standardDeviationHistogram(hist)*gcMult)+"\n");
+ if(GC_PLOT_X){
+ tsw.print("#GC\tCount\tCumulative\tPlot\n");
+ }else{
+ tsw.print("#GC\tCount\n");
+ }
+
+ for(int i=0; i<bins; i++){
+ long x=hist[i];
+ sum+=x;
+ if(x>0 || printZeros){
+ //This assumes GC_BINS==100
+// tsw.print(i+"\t"+x+"\n");
+ if(GC_PLOT_X){
+ tsw.print(String.format("%.1f\t%d\t", i*gcMult, x));
+
+ tsw.print(String.format("%.3f\t", sum*fractionMult));
+
+ int len=(int)((x*1000)/countsPerX);
+ for(int j=0; j<len; j++){tsw.print("X");}
+ if(len<1 && x>0){
+ if((x*1000f)/countsPerX>0.1f){tsw.print("x");}
+ else{tsw.print(".");}
+ }
+
+ tsw.print("\n");
+ }else{
+ tsw.print(String.format("%.1f\t%d\n", i*gcMult, x));
+ }
+ }
+ }
+ tsw.poison();
+ tsw.waitForFinish();
+ errorState|=tsw.errorState;
+ }
+
+ /**
+  * Writes the identity histograms (reads and bases per identity bin) with
+  * header lines for mean, median, mode and standard deviation, all scaled to
+  * percent.  When ID_BINS_AUTO is set and the longest read is shorter than
+  * the bin count, both histograms are first downsampled to (longest read
+  * length + 1) bins.
+  * @param fname Output file
+  * @param printZeros Whether bins with a read count of zero are printed
+  */
+ public void writeIdentityToFile(String fname, boolean printZeros){
+     final long[] hist, histb;
+     if(ID_BINS_AUTO && idMaxReadLen+1<idHist.length){
+         hist=Tools.downsample(idHist, idMaxReadLen+1);
+         histb=Tools.downsample(idBaseHist, idMaxReadLen+1);
+     }else{
+         hist=idHist;
+         histb=idBaseHist;
+     }
+     final int max=hist.length;
+     //Clamp the divisor as writeGCToFile does, so a single-bin histogram cannot divide by zero.
+     final double mult=100.0/Tools.max(1, max-1);
+
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
+     tsw.start();
+
+
+     tsw.print("#Mean_reads\t"+String.format("%.3f", (Tools.averageHistogram(hist)*mult))+"\n");
+     tsw.print("#Mean_bases\t"+(String.format("%.3f", Tools.averageHistogram(histb)*mult))+"\n");
+     tsw.print("#Median_reads\t"+(int)Math.round(Tools.percentile(hist, 0.5)*mult)+"\n");
+     tsw.print("#Median_bases\t"+(int)Math.round(Tools.percentile(histb, 0.5)*mult)+"\n");
+     tsw.print("#Mode_reads\t"+(int)Math.round(Tools.calcMode(hist)*mult)+"\n");
+     tsw.print("#Mode_bases\t"+(int)Math.round(Tools.calcMode(histb)*mult)+"\n");
+     tsw.print("#STDev_reads\t"+String.format("%.3f", (Tools.standardDeviationHistogram(hist)*mult))+"\n");
+     tsw.print("#STDev_bases\t"+String.format("%.3f", (Tools.standardDeviationHistogram(histb)*mult))+"\n");
+     tsw.print("#Identity\tReads\tBases\n");
+
+     for(int i=0; i<max; i++){
+         long x=hist[i], x2=histb[i];
+         if(x>0 || printZeros){
+             tsw.print(String.format("%.1f", i*mult)+"\t"+x+"\t"+x2+"\n");
+         }
+     }
+     tsw.poison();
+     tsw.waitForFinish();
+     errorState|=tsw.errorState;
+ }
+
+ /**
+  * Reports whether any statistics collection flag is switched on.
+  * @return true if at least one COLLECT_* flag is set
+  */
+ public static boolean collectingStats(){
+     final boolean[] flags={
+         COLLECT_QUALITY_STATS, COLLECT_QUALITY_ACCURACY, COLLECT_MATCH_STATS, COLLECT_INSERT_STATS, COLLECT_BASE_STATS,
+         COLLECT_INDEL_STATS, COLLECT_GC_STATS, COLLECT_ERROR_STATS, COLLECT_LENGTH_STATS, COLLECT_IDENTITY_STATS, COLLECT_TIME_STATS
+     };
+     for(final boolean flag : flags){
+         if(flag){return true;}
+     }
+     return false;
+ }
+
+ //Tracks to see if read2s have been encountered, for displaying stats.
+ private long read2Count=0;
+
+ //In the two-dimensional arrays below, the first index is the pair number.
+
+ /** Count of reads per average quality value */
+ public final long[][] aqualArray;
+ /** Count of reads per (clamped) last position; writeQualityToFile suffix-sums it into per-position coverage */
+ public final long[][] qualLength;
+ /** Sum of quality scores per read position */
+ public final long[][] qualSum;
+
+ /** Count per read position and quality value */
+ public final long[][][] bqualHist;
+ /** Count per quality value over all bases of all reads */
+ public final long[] bqualHistOverall;
+
+ /** Count per quality value, per pair number */
+ public final long[][] qcountHist;
+
+ /** Sum of QualityTools.PROB_ERROR values per read position */
+ public final double[][] qualSumDouble;
+
+ //Per-position counts of each match-string symbol, filled by addToMatchHistogram2.
+ public final long[][] matchSum;
+ public final long[][] delSum;
+ public final long[][] insSum;
+ public final long[][] subSum;
+ public final long[][] nSum;
+ public final long[][] clipSum;
+ public final long[][] otherSum;
+
+ //Counts of each alignment outcome indexed by claimed quality value, filled by addToQualityAccuracy.
+ public final long[] qualMatch;
+ public final long[] qualSub;
+ public final long[] qualIns;
+ public final long[] qualDel;
+
+ /** Reads per GC bin */
+ public final long[] gcHist;
+ /** Reads per identity bin */
+ public final long[] idHist;
+ /** Bases per identity bin */
+ public final long[] idBaseHist;
+ //Longest read lengths seen by the GC and identity histograms; used when auto-sizing bins on output.
+ private int gcMaxReadLen=1;
+ private int idMaxReadLen=1;
+
+ /** Per-position base counts; indexed by pair number, then AminoAcid.baseToNumber[base]+1 */
+ public final LongList[][] baseHist;
+
+ /** Insert size */
+ public final LongList insertHist;
+ /** Read length */
+ public final LongList lengthHist;
+ /** Number errors per read */
+ public final LongList errorHist;
+ /** Insertion length */
+ public final LongList insHist;
+ /** Deletion length */
+ public final LongList delHist;
+ /** Deletion length, binned */
+ public final LongList delHist2;
+ /** Time */
+ public final LongList timeHist;
+
+ //Caps applied to positions or values before they are used as histogram indices.
+ public static final int MAXLEN=6000;
+ public static final int MAXINSERTLEN=40000;
+ public static final int MAXLENGTHLEN=80000;
+ public static final int MAXTIMELEN=80000;
+ public static final int MAXINSLEN=1000;
+ public static final int MAXDELLEN=1000;
+ public static final int MAXDELLEN2=1000000;
+ public static final int DEL_BIN=100;
+ public static int GC_BINS=100;
+ public static int ID_BINS=100;
+ //When set, the identity / GC histograms are downsampled on output if the longest read is shorter than the bin count.
+ public static boolean ID_BINS_AUTO=false;
+ public static boolean GC_BINS_AUTO=false;
+ //When set, writeGCToFile adds cumulative-fraction and text-bar columns.
+ public static boolean GC_PLOT_X=false;
+
+ /** Set when any of this object's writers reported an error */
+ public boolean errorState=false;
+
+ /** Result of the most recent merge */
+ public static ReadStats merged=null;
+
+ /** Printed as #PercentOfPairs in the insert-size output */
+ public static double matedPercent=0;
+
+ public static ArrayList<ReadStats> objectList=new ArrayList<ReadStats>();
+ public static boolean COLLECT_QUALITY_STATS=false;
+ public static boolean COLLECT_QUALITY_ACCURACY=false;
+ public static boolean COLLECT_MATCH_STATS=false;
+ public static boolean COLLECT_INSERT_STATS=false;
+ public static boolean COLLECT_BASE_STATS=false;
+ public static boolean COLLECT_INDEL_STATS=false;
+ public static boolean COLLECT_GC_STATS=false;
+ public static boolean COLLECT_ERROR_STATS=false;
+ public static boolean COLLECT_LENGTH_STATS=false;
+ public static boolean COLLECT_IDENTITY_STATS=false;
+ public static boolean COLLECT_TIME_STATS=false;
+
+ //Output paths; writeAll skips any histogram whose path is null.
+ public static String AVG_QUAL_HIST_FILE=null;
+ public static String QUAL_HIST_FILE=null;
+ public static String BQUAL_HIST_FILE=null;
+ public static String QUAL_COUNT_HIST_FILE=null;
+ public static String BQUAL_HIST_OVERALL_FILE=null;
+ public static String QUAL_ACCURACY_FILE=null;
+ public static String MATCH_HIST_FILE=null;
+ public static String INSERT_HIST_FILE=null;
+ public static String BASE_HIST_FILE=null;
+ public static String INDEL_HIST_FILE=null;
+ public static String ERROR_HIST_FILE=null;
+ public static String LENGTH_HIST_FILE=null;
+ public static String GC_HIST_FILE=null;
+ public static String IDENTITY_HIST_FILE=null;
+ public static String TIME_HIST_FILE=null;
+
+ //Passed to the TextStreamWriter constructors and to Tools.testOutputFiles.
+ public static boolean overwrite=false;
+ public static boolean append=false;
+ public static final boolean verbose=false;
+
+ //When set, rows with zero counts are left out of the insert-size / indel outputs.
+ public static boolean skipZeroInsertCount=true;
+ public static boolean skipZeroIndel=true;
+
+
+}
diff --git a/current/align2/RefToIndex.java b/current/align2/RefToIndex.java
new file mode 100755
index 0000000..b6cfd19
--- /dev/null
+++ b/current/align2/RefToIndex.java
@@ -0,0 +1,166 @@
+package align2;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.SummaryFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 25, 2013
+ *
+ */
+public class RefToIndex {
+
+ public static final void clear(){ //Forgets the chromosome list stored by makeIndex
+ chromlist=null;
+ }
+
+	/**
+	 * Builds the path of the genome summary file for a build.
+	 * Takes the parent directory of the filename returned by IndexMaker4.fname(1, 1, 13, 1, build),
+	 * converts backslashes to forward slashes, and replaces "ref/index/" with "ref/genome/".
+	 * @param build Genome build number, passed through to IndexMaker4.fname
+	 * @return The derived directory followed by "/summary.txt"
+	 */
+	public static String summaryLoc(int build){
+		final String indexName=IndexMaker4.fname(1, 1, 13, 1, build);
+		final String genomeDir=new File(indexName).getParent().replace('\\', '/').replace("ref/index/", "ref/genome/");
+		return genomeDir+"/summary.txt";
+	}
+
+	/**
+	 * Checks the reference file and, unless a matching copy is already on disk, regenerates the
+	 * chromosome data for a build by running FastaToChromArrays2.main2; the result is stored in chromlist.
+	 * <p>
+	 * When the genome summary file exists and SummaryFile.compare(summary, reference) is true, the reference
+	 * is ignored and only notes are printed.  Otherwise, unless NODISK is set, existing genome and index files
+	 * are deleted when permitted, or a RuntimeException explains how to proceed.
+	 * ReadWrite.ZIPLEVEL is raised to at least 4 while the genome is generated and is always restored
+	 * afterwards, including when generation fails with an exception.
+	 * <p>
+	 * Side effects: unset (negative) values of maxChromLen, minScaf, midPad, startPad and stopPad are
+	 * replaced by their resolved defaults.
+	 * @param reference Path of the fasta reference; an unreadable path is tolerated only if it starts with "stdin".  Must not be null.
+	 * @param build Genome build number, used in log text and passed to FastaToChromArrays2
+	 * @param sysout Stream that receives user-facing notes
+	 * @param keylen Key length passed to IndexMaker4.fname when locating the index directory
+	 * @throws RuntimeException If the reference is unreadable or not fasta, or existing data may not be overwritten
+	 */
+	public static void makeIndex(String reference, int build, PrintStream sysout, int keylen){
+		assert(reference!=null);
+		{
+			File f=new File(reference);
+			if(!f.exists() || !f.isFile() || !f.canRead()){
+				if(!reference.startsWith("stdin")){
+					throw new RuntimeException("Cannot read file "+f.getAbsolutePath());
+				}
+			}else{
+				FileFormat ff=FileFormat.testInput(reference, FileFormat.FA, null, false, true, true);
+				if(!ff.fasta()){
+					throw new RuntimeException("Reference file is not in fasta format: "+reference+"\n"+ff);
+				}
+			}
+		}
+
+		String s=IndexMaker4.fname(1, 1, keylen, 1);
+		String dir=new File(s).getParent();
+		dir=dir.replace('\\', '/');
+		final String base=dir.substring(0, dir.length()-7);
+		final String args=(Shared.COMMAND_LINE==null ? "null" : Arrays.toString(Shared.COMMAND_LINE));
+		//Log name combines a timestamp with a hash of the command line so that concurrent runs do not collide
+		final String indexlog=base+"build"+build+"_"+
+				(System.nanoTime()&Long.MAX_VALUE)+"."+((args==null ? (reference==null ? "null" : reference) : args).hashCode()&Integer.MAX_VALUE)+".log";
+		dir=dir.replace("ref/index/", "ref/genome/");
+		String sf=dir+"/summary.txt";
+		if(!NODISK && new File(sf).exists() && SummaryFile.compare(sf, reference)){
+			//do nothing
+			if(LOG && !NODISK){
+				if(!new File(base).exists()){new File(base).mkdirs();}
+				ReadWrite.writeString(new Date()+"\nFound an already-written genome for build "+build+".\n"+args+"\n", indexlog, true);
+			}
+			sysout.println("NOTE:\tIgnoring reference file because it already appears to have been processed.");
+			sysout.println("NOTE:\tIf you wish to regenerate the index, please manually delete "+dir+"/summary.txt");
+		}else{
+			if(NODISK){}
+			else{//Delete old data if present
+				File f=new File(dir);
+				if(f.exists()){
+					File[] f2=f.listFiles();
+					if(f2!=null && f2.length>0){
+						if(overwrite || f2[0].getAbsolutePath().equals(new File(reference).getAbsolutePath())){
+							sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+overwrite);
+							if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nDeleting genome for build "+build+".\n"+args+"\n", indexlog, true);}
+							for(File f3 : f2){
+								if(f3.isFile()){
+									String f3n=f3.getName();
+									//Only chromosome and text files are removed; files ending in "list.txt" are kept
+									if((f3n.contains(".chrom") || f3n.endsWith(".txt") || f3n.endsWith(".txt.gz")) && !f3n.endsWith("list.txt")){
+										f3.delete();
+									}
+								}
+							}
+						}else{
+							sysout.println(Arrays.toString(f2));
+							if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nFailed to overwrite genome for build "+build+".\n"+args+"\n", indexlog, true);}
+							throw new RuntimeException("\nThere is already a reference at location '"+f.getAbsolutePath()+"'.  " +
+									"Please delete it (and the associated index), or use a different build ID, " +
+									"or remove the 'reference=' parameter from the command line, or set overwrite=true.");
+						}
+					}
+				}
+
+				dir=dir.replace("ref/genome/", "ref/index/");
+				f=new File(dir);
+				if(f.exists()){
+					File[] f2=f.listFiles();
+					if(f2!=null && f2.length>0){
+						if(overwrite){
+							sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+overwrite);
+							if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nDeleting index for build "+build+".\n"+args+"\n", indexlog, true);}
+							for(File f3 : f2){
+								if(f3.isFile()){f3.delete();}
+							}
+						}else{
+							if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nFailed to overwrite index for build "+build+".\n"+args+"\n", indexlog, true);}
+							throw new RuntimeException("\nThere is already an index at location '"+f.getAbsolutePath()+"'.  " +
+									"Please delete it, or use a different build ID, or remove the 'reference=' parameter from the command line.");
+						}
+					}
+				}
+			}
+
+			if(!NODISK){
+				sysout.println("Writing reference.");
+				if(LOG && !NODISK){
+					if(!new File(base).exists()){new File(base).mkdirs();}
+					ReadWrite.writeString(new Date()+"\nWriting genome for build "+build+".\n"+args+"\n", indexlog, true);
+				}
+			}
+
+			final int oldzl=ReadWrite.ZIPLEVEL;
+			ReadWrite.ZIPLEVEL=Tools.max(4, ReadWrite.ZIPLEVEL);
+			try{
+
+				//assert(false) : "minScaf="+minScaf+", midPad="+midPad+", maxChromLen="+maxChromLen+
+				//		", startPad="+startPad+", stopPad="+stopPad+", FastaToChromArrays2.END_PADDING="+FastaToChromArrays2.END_PADDING;
+
+				//Resolve unset (negative) parameters to their defaults
+				maxChromLen=maxChromLen>0 ? maxChromLen : AUTO_CHROMBITS ? FastaToChromArrays2.MAX_LENGTH : ((1L<<(31-(chrombits<0 ? 2 : chrombits)))-200000);
+				minScaf=minScaf>-1 ? minScaf : FastaToChromArrays2.MIN_SCAFFOLD;
+				midPad=midPad>-1 ? midPad : FastaToChromArrays2.MID_PADDING;
+				startPad=startPad>-1 ? startPad : FastaToChromArrays2.START_PADDING;
+				stopPad=stopPad>-1 ? stopPad : FastaToChromArrays2.END_PADDING;
+
+				String[] ftcaArgs=new String[] {reference, ""+build, "writeinthread=false", "genscaffoldinfo="+genScaffoldInfo, "retain", "waitforwriting=false",
+						"gz="+(Data.CHROMGZ), "maxlen="+maxChromLen,
+						"writechroms="+(!NODISK), "minscaf="+minScaf, "midpad="+midPad, "startpad="+startPad, "stoppad="+stopPad, "nodisk="+NODISK};
+
+				chromlist=FastaToChromArrays2.main2(ftcaArgs);
+
+			}finally{
+				//Restore the global compression level even if generation throws
+				ReadWrite.ZIPLEVEL=oldzl;
+			}
+		}
+
+	}
+
+ public static boolean AUTO_CHROMBITS=true; //When maxChromLen is unset, true selects FastaToChromArrays2.MAX_LENGTH; false derives the limit from chrombits
+ public static boolean LOG=false; //When true (and NODISK is false), makeIndex appends entries to a per-run log file
+ public static boolean NODISK=false; //When true, makeIndex skips the on-disk checks and deletions and passes nodisk=true, writechroms=false
+ public static boolean overwrite=true; //Permits makeIndex to delete existing genome and index files
+ public static boolean append=false; //Not read by any method of this class
+ public static boolean genScaffoldInfo=true; //Forwarded to FastaToChromArrays2 as genscaffoldinfo=
+
+ public static long maxChromLen=-1; //Values <=0 mean unset; makeIndex replaces them with the resolved limit
+
+ public static int minScaf=-1, midPad=-1, stopPad=-1, startPad=-1; //Negative means unset; makeIndex replaces them with the FastaToChromArrays2 defaults
+ public static int chrombits=-1; //Negative is treated as 2 when computing maxChromLen
+// public static int minScaf=FastaToChromArrays2.MIN_SCAFFOLD;
+// public static int midPad=FastaToChromArrays2.MID_PADDING;
+// public static int startPad=FastaToChromArrays2.START_PADDING;
+// public static int stopPad=FastaToChromArrays2.END_PADDING;
+
+ public static ArrayList<ChromosomeArray> chromlist=null; //Result of the last FastaToChromArrays2.main2 call made by makeIndex; reset by clear()
+
+}
diff --git a/current/align2/ReformatBatchOutput.java b/current/align2/ReformatBatchOutput.java
new file mode 100755
index 0000000..f8f2f83
--- /dev/null
+++ b/current/align2/ReformatBatchOutput.java
@@ -0,0 +1,217 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import fileIO.TextFile;
+
+public class ReformatBatchOutput {
+
+// Elapsed: 31.7
+//
+// Mapping Statistics for 0s_default.sam:
+// mapped: 100.00%
+// retained: 96.06%
+// discarded: 0.00%
+// ambiguous: 3.94%
+//
+// Strict correctness (both ends exactly correct):
+// true positive: 96.06%
+// false positive: 0.00%
+//
+// Loose correctness (one end approximately correct):
+// true positive: 96.06%
+// false positive: 0.00%
+//
+// false negative: 0.00%
+// Elapsed: 2.34
+// Elapsed: 20.51
+
+
+// Elapsed: 0.33
+//
+// Mapping Statistics for bwa_0S_0I_0D_0U_0N_r100.sam:
+// primary alignments: 100 found of 100 expected
+// secondary alignments: 0 found
+// mapped: 100.000%
+// retained: 97.000%
+// discarded: 0.000%
+// ambiguous: 3.000%
+//
+// Strict correctness (both ends exactly correct):
+// true positive: 97.000%
+// false positive: 0.000%
+//
+// Loose correctness (one end approximately correct):
+// true positive: 97.000%
+// false positive: 0.000%
+//
+// false negative: 0.000%
+
+
+	/**
+	 * Reads a batch log (path in args[0]), groups its lines into per-run records, and prints one
+	 * tab-delimited row per record to stdout, preceded by the header row.
+	 * A record starts at a line beginning with "Elapsed:" and normally ends at the line beginning with
+	 * "false negative:".  A record cut short by the next "Elapsed:" line is still reported,
+	 * and so is one cut short by the end of the file.
+	 * @param args args[0] is the file to read
+	 */
+	public static void main(String[] args){
+		TextFile tf=new TextFile(args[0], false, false);
+		String[] lines=tf.toStringLines();
+		ArrayList<String> list=new ArrayList<String>();
+
+		int mode=0; //Positive while inside a record
+
+		System.out.println(header());
+
+		for(String s : lines){
+			if(s.startsWith("Elapsed:")){
+				if(!list.isEmpty()){
+					process(list); //failure
+					list.clear();
+					mode=0;
+				}
+				mode++;
+			}
+
+			if(mode>0){
+				list.add(s);
+				if(s.startsWith("false negative:")){
+					process(list);
+					list.clear();
+					mode=0;
+				}
+			}
+		}
+
+		//Flush a record still open at end of file, the same way one is flushed by the next "Elapsed:" line
+		if(!list.isEmpty()){
+			process(list);
+			list.clear();
+		}
+	}
+
+
+	/**
+	 * @return The tab-delimited column header matching the rows printed by process()
+	 */
+	public static String header() {
+		final String[] columns={
+				"program", "file", "vartype", "count", "reads", "primary", "secondary", "time",
+				"mapped", "retained", "discarded", "ambiguous", "truePositive",
+				"falsePositive", "truePositiveL", "falsePositiveL", "falseNegative"
+		};
+		final StringBuilder sb=new StringBuilder();
+		for(int i=0; i<columns.length; i++){
+			if(i>0){sb.append('\t');}
+			sb.append(columns[i]);
+		}
+		return sb.toString();
+	}
+
+ //bwa_1S_0I_0D_0U_0N_r400000x100.sam
+	/**
+	 * Extracts the number of reads encoded in an underscore-delimited file name.
+	 * Two layouts are recognized:
+	 * <ul>
+	 * <li>the last field looks like "r400000" or "r400000x100" (as in bwa_1S_0I_0D_0U_0N_r400000x100);</li>
+	 * <li>otherwise, the first field that starts with a digit, contains 'x' and ends with "bp"
+	 * (such as "100000x100bp"); the count is its leading run of digits.</li>
+	 * </ul>
+	 * @param name File name without the ".sam" extension
+	 * @return The read count, or 0 if the name matches neither layout
+	 * @throws NumberFormatException If a matching field does not hold a parsable integer
+	 */
+	public static int getReads(String name){
+		final String[] split=name.split("_");
+		if(split.length==0){return 0;} //Name consisted only of underscores
+
+		final String tail=split[split.length-1];
+		if(tail.length()>0 && tail.charAt(0)=='r' && Character.isDigit(tail.charAt(tail.length()-1))){
+			String r=tail.substring(1);
+			final int x=r.indexOf('x');
+			if(x>=0){r=r.substring(0, x);}
+			return Integer.parseInt(r);
+		}
+
+		for(String s : split){
+			if(s.endsWith("bp") && s.contains("x") && Character.isDigit(s.charAt(0))){
+				//Parse every leading digit; cutting at indexOf('x')-1 would drop the last digit of a purely numeric count
+				int end=0;
+				while(end<s.length() && Character.isDigit(s.charAt(end))){end++;}
+				return Integer.parseInt(s.substring(0, end));
+			}
+		}
+		return 0;
+	}
+
+	/**
+	 * Finds the variation type encoded in an underscore-delimited file name.
+	 * The first field that starts with a nonzero digit and does not end with "bp" (for example "1S")
+	 * determines the result.  Empty fields, produced by consecutive underscores, are skipped.
+	 * @param name File name without the ".sam" extension
+	 * @return The last character of the matching field, or '?' if no field matches
+	 */
+	public static char getVarType(String name){
+		final String[] split=name.split("_");
+		for(String s : split){
+			if(s.length()==0){continue;} //charAt(0) would throw on an empty field
+			final char c=s.charAt(0);
+			if(Character.isDigit(c) && c!='0' && !s.endsWith("bp")){
+				return s.charAt(s.length()-1);
+			}
+		}
+		return '?';
+	}
+
+	/**
+	 * Finds the variation count encoded in an underscore-delimited file name.
+	 * The first field that starts with a nonzero digit and does not end with "bp" (for example "12S")
+	 * determines the result; everything but its last character is parsed as the count.
+	 * Empty fields, produced by consecutive underscores, are skipped.
+	 * @param name File name without the ".sam" extension
+	 * @return The parsed count, or 0 if no field matches
+	 * @throws NumberFormatException If the matching field minus its last character is not an integer
+	 */
+	public static int getCount(String name){
+		final String[] split=name.split("_");
+		for(String s : split){
+			if(s.length()==0){continue;} //charAt(0) would throw on an empty field
+			final char c=s.charAt(0);
+			if(Character.isDigit(c) && c!='0' && !s.endsWith("bp")){
+				return Integer.parseInt(s.substring(0, s.length()-1));
+			}
+		}
+		return 0;
+	}
+
+	/**
+	 * Extracts the program name, which is the part of the file name before the first underscore.
+	 * @param name File name without the ".sam" extension
+	 * @return The text before the first '_', or the whole name if it contains no underscore
+	 */
+	public static String getProgram(String name){
+		final int idx=name.indexOf('_');
+		//substring(0, -1) would throw when the name has no underscore
+		return idx<0 ? name : name.substring(0, idx);
+	}
+
+
+
+ public static void process(ArrayList<String> list){ //Turns the lines of one record into a single tab-delimited row on stdout
+
+ String name=null;
+// String count=null;
+ String time=null;
+ StringBuilder sb=new StringBuilder(); //Collects the remaining statistics columns in the order their lines appear
+
+ int primary=0;
+ int secondary=0;
+ int expected=0; //Parsed below but not included in the printed row
+
+ for(String s : list){
+ String[] split=s.split("\t"); //Each line is read as key, tab, value
+ String a=split[0];
+ String b=(split.length>1 ? split[1] : null); //null when the line has no tab
+ if(a.equals("Elapsed:")){
+ time=b;
+ }else if(a.startsWith("lines:")){
+ //do nothing
+ }else if(a.startsWith("Mapping Statistics for ")){
+ name=a.replace("Mapping Statistics for ", "").replace(".sam:", "");
+ }else if(a.startsWith("primary alignments:")){ //NOTE(review): b is dereferenced without a null check here and in the next branch
+ b=b.replace(" found of ", "_"); //"100 found of 100 expected" becomes "100_100"
+ b=b.replace(" expected", "");
+ String[] split2=b.split("_");
+ primary=Integer.parseInt(split2[0]);
+ expected=Integer.parseInt(split2[1]);
+ }else if(a.startsWith("secondary alignments:")){
+ b=b.replace(" found", "");
+ secondary=Integer.parseInt(b);
+ }else if(b!=null){ //Any other key/value line becomes one column, with percent signs removed
+ assert(!b.contains("found")) : "\na='"+a+"'\nb='"+b+"'\n"+a.equals("primary alignments:");
+ sb.append('\t').append(b.replace("%", ""));
+ }
+
+ }
+
+// if(name!=null){
+// count="";
+//
+// String[] split=name.split("_");
+//
+// for(String s : split){
+// if(s!=null && s.length()>0 && s.charAt(0)!='0'){
+// for(int i=0; i<s.length(); i++){
+// char c=s.charAt(i);
+// if(Character.isDigit(c)){count=count+c;}
+// else{break;}
+// }
+// }
+// if(count.length()>0){break;}
+// }
+// }
+
+ String prg=null; //These four defaults are what gets printed when name is null or parsing fails
+ char type='S';
+ int reads=1;
+ int vars=0;
+
+ if(name!=null){
+ try { //NOTE(review): one try covers all four parsers, so a failure in an earlier one leaves the later fields at their defaults
+ prg=getProgram(name);
+ type=getVarType(name);
+ reads=getReads(name);
+ vars=getCount(name);
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ System.out.println(prg+"\t"+name+"\t"+type+"\t"+vars+"\t"+reads+"\t"+primary+"\t"+secondary+"\t"+time+sb); //Column order matches header()
+
+ }
+
+}
diff --git a/current/align2/ReformatBatchOutput2.java b/current/align2/ReformatBatchOutput2.java
new file mode 100755
index 0000000..27f55d2
--- /dev/null
+++ b/current/align2/ReformatBatchOutput2.java
@@ -0,0 +1,61 @@
+package align2;
+
+import java.util.ArrayList;
+
+import fileIO.TextFile;
+
+public class ReformatBatchOutput2 {
+
+// Elapsed: 31.7
+//
+// Mapping Statistics for 0s_default.sam:
+// mapped: 100.00%
+// retained: 96.06%
+// discarded: 0.00%
+// ambiguous: 3.94%
+//
+// Strict correctness (both ends exactly correct):
+// true positive: 96.06%
+// false positive: 0.00%
+//
+// Loose correctness (one end approximately correct):
+// true positive: 96.06%
+// false positive: 0.00%
+//
+// false negative: 0.00%
+// Elapsed: 2.34
+// Elapsed: 20.51
+
+
+	/**
+	 * Reads a batch log (path in args[0]) and prints the header row followed by, for the lines it keeps,
+	 * the run name from each "Mapping Statistics for " line and the time from each "Mapping:" line.
+	 * A counter is incremented at every line starting with "Elapsed:"; whenever it exceeds 1 the current
+	 * line is skipped and the counter is reset.
+	 * @param args args[0] is the file to read
+	 */
+	public static void main(String[] args){
+		final TextFile tf=new TextFile(args[0], false, false);
+		final String[] lines=tf.toStringLines();
+
+		int elapsedSeen=0;
+
+		System.out.println(header());
+
+		for(int i=0; i<lines.length; i++){
+			String line=lines[i];
+			if(line.startsWith("Elapsed:")){elapsedSeen++;}
+
+			if(elapsedSeen>1){
+				//Second "Elapsed:" in a row (or any line while the counter is above 1): reset and skip
+				elapsedSeen=0;
+				continue;
+			}
+
+			if(line.startsWith("Mapping Statistics for ")){
+				final String runName=line.replace("Mapping Statistics for ", "").replace(".sam:", "");
+				System.out.println(runName+"\t");
+			}else if(line.startsWith("Mapping:")){
+				line=line.replace("Mapping:", "").replace("seconds.", "").trim();
+				System.out.print(line+"\t");
+			}
+		}
+	}
+
+
+	/**
+	 * @return The tab-delimited column header printed before any data
+	 */
+	public static String header() {
+		final String[] columns={
+				"name", "count", "time", "mapTime", "mapped", "retained", "discarded", "ambiguous",
+				"truePositive", "falsePositive", "truePositiveL", "falsePositiveL", "falseNegative"
+		};
+		final StringBuilder sb=new StringBuilder();
+		for(int i=0; i<columns.length; i++){
+			if(i>0){sb.append('\t');}
+			sb.append(columns[i]);
+		}
+		return sb.toString();
+	}
+
+}
diff --git a/current/align2/Shared.java b/current/align2/Shared.java
new file mode 100755
index 0000000..c534df6
--- /dev/null
+++ b/current/align2/Shared.java
@@ -0,0 +1,146 @@
+package align2;
+
+import java.lang.management.ManagementFactory;
+import java.util.List;
+
+import dna.Data;
+
+public class Shared { //Holder for global, mutable settings and small utility methods; every member is static
+
+ private static int THREADS=setThreads(-1); //setThreads(-1) selects Tools.max(1, Data.LOGICAL_PROCESSORS) and also calls setBuffers()
+
+ public static int READ_BUFFER_LENGTH=200;
+ private static int READ_BUFFER_NUM_BUFFERS=setBuffers(); //Declared after THREADS, so this initializer runs with the thread count assigned above
+ public static long READ_BUFFER_MAX_DATA=400000;
+
+ /** Temporary, for testing; will be made non-global */
+ public static boolean AMINO_IN=false;
+
+ //TODO: For some reason, it seems as though GAPBUFFER must equal exactly 1/2 of GAPLEN. Not good; 1/4 would be far better.
+
+ public static final int GAPBUFFER=64; //TODO: Seems to break less than 64, for some reason
+ public static final int GAPBUFFER2=2*GAPBUFFER;
+ public static final int GAPLEN=128; //TODO: May break when over 128
+ public static final int MINGAP=GAPBUFFER2+GAPLEN;
+ public static final int GAPCOST=Tools.max(1, GAPLEN/64);
+ public static final byte GAPC='-';
+
+ public static String BBMAP_VERSION_STRING="35.85";
+
+ public static boolean TRIM_READ_COMMENTS=false;
+
+ public static boolean USE_JNI=false;//Data.GENEPOOL;
+ public static boolean USE_MPI=false;
+ public static boolean MPI_KEEP_ALL=true;
+ /** Use ConcurrentReadInputStreamMPI instead of D */
+ public static boolean USE_CRISMPI=true;
+ public static int MPI_RANK=0;
+ public static int MPI_NUM_RANKS=1;
+
+ public static int FASTA_WRAP=70;
+ public static byte FAKE_QUAL=30;
+
+ public static String BBMAP_CLASS=null;
+ public static String[] COMMAND_LINE=null;
+ public static List<String> JVM_ARGS(){ //Returns the input arguments reported by the runtime MXBean
+ return ManagementFactory.getRuntimeMXBean().getInputArguments();
+ }
+
+ public static long getAvailableMemory(){ //Estimates, in bytes, how much of the maximum heap should be treated as usable
+ long usableMemory;
+ {
+ long memory=Runtime.getRuntime().maxMemory();
+ double xmsRatio=Shared.xmsRatio();
+ usableMemory=(long)Tools.max(((memory-96000000-(20*400000))*(xmsRatio>0.97 ? 0.82 : 0.75)), memory*0.45); //Fixed overhead is subtracted, then 82% is used if xmsRatio>0.97, else 75%; never less than 45% of max
+ }
+ return usableMemory;
+ }
+
+ /** Directory in which to write temp files */
+ public static String TMPDIR=(System.getenv("TMPDIR")==null ? null : (System.getenv("TMPDIR")+"/").replaceAll("//", "/")); //null when the TMPDIR environment variable is unset; otherwise a trailing slash is appended and "//" collapsed to "/"
+// static{assert(false) : "TMPDIR="+TMPDIR;}
+
+ /** Anomaly probably resolved as of v.20.1
+ * This variable should be TRUE for normal users and FALSE for me. */
+ public static boolean anomaly=!System.getProperty("user.dir").contains("/bushnell/") && !Data.WINDOWS;
+
+ public static final char[] getTLCB(int len){ //Returns a char buffer of at least len elements; the returned array may be longer than len
+ char[] buffer=TLCB.get();
+ if(buffer==null || buffer.length<len){
+ buffer=new char[len];
+ if(len<1000000){TLCB.set(buffer);} //Only buffers shorter than 1000000 chars are kept for reuse by this thread
+ }
+ return buffer;
+ }
+ private static final ThreadLocal<char[]> TLCB=new ThreadLocal<char[]>(); //Per-thread buffer handed out by getTLCB
+
+ public static int setThreads(String x){ //null or "auto" (any case) selects Data.LOGICAL_PROCESSORS; anything else is parsed as an integer
+ int y=Data.LOGICAL_PROCESSORS;
+ if(x!=null && !x.equalsIgnoreCase("auto")){
+ y=Integer.parseInt(x);
+ }
+ return setThreads(y);
+ }
+
+ public static int setThreads(int x){ //Sets the thread count (non-positive x selects the processor count, minimum 1), resizes buffers, and returns the new count
+ if(x>0){
+ THREADS=x;
+ }else{
+ THREADS=Tools.max(1, Data.LOGICAL_PROCESSORS);
+ }
+ setBuffers();
+ return THREADS;
+ }
+
+ public static int threads(){ //Current thread count
+ assert(THREADS>0);
+ return THREADS;
+ }
+
+ public static int capBuffers(int num){ //Lowers the buffer count to num if it is currently higher; setBuffers(int) still enforces its minimum of 2
+ return setBuffers(Tools.min(num, READ_BUFFER_NUM_BUFFERS));
+ }
+
+ public static int setBuffers(){ //Recomputes the buffer count from the current thread count
+ return setBuffersFromThreads(THREADS);
+ }
+
+ public static int setBuffersFromThreads(int threads){ //Uses 1.5 buffers per thread (integer arithmetic), with at least 4
+ return setBuffers(Tools.max(4, (threads*3)/2));
+ }
+
+ public static int setBuffers(int num){ //Stores and returns the buffer count, raised to 2 if smaller
+ num=Tools.max(2, num);
+ return READ_BUFFER_NUM_BUFFERS=num;
+ }
+
+ public static int numBuffers(){ //Current buffer count
+ return READ_BUFFER_NUM_BUFFERS;
+ }
+
+ public static boolean LOW_MEMORY=false;
+
+ /** Ratio of -Xms to -Xmx parameters */
+ public static final double xmsRatio(){ //Computed as totalMemory()/maxMemory() at the time of the call
+ Runtime rt=Runtime.getRuntime();
+ return rt.totalMemory()*1.0/rt.maxMemory();
+ }
+
+ /** Print statistics about current memory use and availability */
+ public static final void printMemory(){ //Writes to System.err; figures are bytes divided by 1000000
+ if(GC_BEFORE_PRINT_MEMORY){
+ System.gc();
+ System.gc();
+ }
+ Runtime rt=Runtime.getRuntime();
+ long mmemory=rt.maxMemory()/1000000;
+ long tmemory=rt.totalMemory()/1000000;
+ long fmemory=rt.freeMemory()/1000000;
+ long umemory=tmemory-fmemory; //Used = total - free
+ System.err.println("Memory: "+"max="+mmemory+/*"m, total="+tmemory+*/"m, "+"free="+fmemory+"m, used="+umemory+"m");
+ }
+
+ /** Do garbage collection prior to printing memory usage */
+ private static final boolean GC_BEFORE_PRINT_MEMORY=false;
+
+}
diff --git a/current/align2/Solver.java b/current/align2/Solver.java
new file mode 100755
index 0000000..1fdbc50
--- /dev/null
+++ b/current/align2/Solver.java
@@ -0,0 +1,243 @@
+package align2;
+
+import java.util.Arrays;
+
+public class Solver {
+
+
+ public static final long bruteForce(int[] offsets, int[] lengths, int chunk, int minLists, int maxTotalLength){ //NOTE(review): appears unfinished - minLists and maxTotalLength are never read
+
+ int bits=offsets.length;
+ int max=(1<<bits)-1; //int shift: only meaningful while offsets.length is below 31
+
+ for(long i=0; i<=max; i++){ //NOTE(review): starts at key 0, for which evaluate() indexes lists[0] of an empty array
+ long x=evaluate(offsets, lengths, chunk, i); //Score is computed but discarded
+ }
+
+ assert(false); //Fails whenever assertions are enabled and the loop completes
+ return 0;
+ }
+
+
+	/**
+	 * Scans the selected lists for the element with the lowest valueOfElement score, using a key weight of 1.
+	 * The scan stops early when a new low is found while the previous low is already below EARLY_TERMINATION_SCORE.
+	 * @param offsets Offsets, passed through to valueOfElement
+	 * @param lengths Lengths, passed through to valueOfElement
+	 * @param chunk Passed through to valueOfElement
+	 * @param lists Indices of the lists under consideration
+	 * @param r Output array of length 2: r[0] receives the position in lists of the chosen element
+	 * (-1 if none), r[1] its score clamped to the int range
+	 */
+	public static final void findWorstGreedy(final int[] offsets, final int[] lengths,
+			final int chunk, final int[] lists, int[] r){
+		assert(r!=null && r.length==2);
+
+		long lowest=Long.MAX_VALUE;
+		int lowestIndex=-1;
+		for(int idx=0; idx<lists.length; idx++){
+			final long score=valueOfElement(offsets, lengths, 1f, chunk, lists, idx);
+			if(score>=lowest){continue;}
+
+			if(lowest<EARLY_TERMINATION_SCORE){//Can speed up greedy algo
+				r[0]=idx;
+				r[1]=(int)Math.max((long)Integer.MIN_VALUE, Math.min((long)Integer.MAX_VALUE, score));
+				return;
+			}
+			lowest=score;
+			lowestIndex=idx;
+		}
+
+		r[0]=lowestIndex;
+		r[1]=(int)Math.max((long)Integer.MIN_VALUE, Math.min((long)Integer.MAX_VALUE, lowest));
+	}
+
+
+	/**
+	 * Scans the selected lists for the element with the lowest valueOfElement score, weighting the
+	 * element at position i by weights[i].
+	 * The scan stops early when a new low is found at a position other than 0 while the previous low is
+	 * already below EARLY_TERMINATION_SCORE.
+	 * @param offsets Offsets, passed through to valueOfElement
+	 * @param lengths Lengths, passed through to valueOfElement
+	 * @param weights Key weights, indexed by position in lists
+	 * @param chunk Passed through to valueOfElement
+	 * @param lists Indices of the lists under consideration
+	 * @param r Output array of length 2: r[0] receives the position in lists of the chosen element
+	 * (-1 if none), r[1] its score clamped to the int range
+	 */
+	public static final void findWorstGreedy(final int[] offsets, final int[] lengths,
+			final float[] weights, final int chunk, final int[] lists, int[] r){
+		assert(r!=null && r.length==2);
+
+		long lowest=Long.MAX_VALUE;
+		int lowestIndex=-1;
+		for(int idx=0; idx<lists.length; idx++){
+			final long score=valueOfElement(offsets, lengths, weights[idx], chunk, lists, idx);
+			if(score>=lowest){continue;}
+
+			if(lowest<EARLY_TERMINATION_SCORE && idx!=0){//Can speed up greedy algo
+				r[0]=idx;
+				r[1]=(int)Math.max((long)Integer.MIN_VALUE, Math.min((long)Integer.MAX_VALUE, score));
+				return;
+			}
+			lowest=score;
+			lowestIndex=idx;
+		}
+
+		r[0]=lowestIndex;
+		r[1]=(int)Math.max((long)Integer.MIN_VALUE, Math.min((long)Integer.MAX_VALUE, lowest));
+	}
+
+
+ public static long valueOfElement(final int[] offsets, final int[] lengths, float keyWeight, //Scores the list at position index of lists; findWorstGreedy picks the lowest score
+ final int chunk, final int[] lists, int index){
+
+ final int numlists=lists.length;
+ if(numlists<1){return 0;}
+
+ final int prospect=lists[index]; //The list being scored; used as an index into offsets and lengths
+ if(lengths[prospect]==0){return -999999;} //Fixed low score for a zero-length list; also avoids the division below
+
+ long valuep=POINTS_PER_LIST+(POINTS_PER_LIST*2/lists.length)+((POINTS_PER_LIST*10)/lengths[prospect]); //Points that get scaled by keyWeight
+ long valuem=POINTS_PER_SITE*lengths[prospect]; //Per-site points, added unscaled
+
+ if(prospect==0 || (prospect==offsets.length-1)){
+ valuep+=BONUS_POINTS_FOR_END_LIST;
+ }
+
+ if(numlists==1){ //A lone list is credited with the whole chunk
+ valuep+=(POINTS_FOR_TOTAL_LIST_WIDTH+POINTS_PER_BASE1)*chunk;
+ return ((long)(valuep*keyWeight))+valuem;
+ }
+
+
+ final int first=lists[0];
+ final int last=lists[numlists-1];
+
+ //Offsets of elements to the left and right of the prospect
+// final int offL=(prospect==first ? - : offsets[lists[index-1]]);
+// final int offP=offsets[prospect];
+// final int offR=(prospect==last ? offsets[offsets.length-1] : offsets[lists[index+1]]);
+// assert(offL<=offP);
+// assert(offP<=offR);
+// assert(offL<offR) : "\noffsets.length="+offsets.length+", lengths.length="+lengths.length+"\n"+
+// ", chunk="+chunk+", lists.length="+lists.length+", index="+index+"\n"+
+// ", offL="+offL+", offR="+offR+", prospect="+prospect+", first="+first+", last="+last+"\n"+
+// ", valuep="+valuep+", valuem="+valuem+", weight="+keyWeight+"\n"+
+// "offsets = "+Arrays.toString(offsets)+"\tlengths = "+Arrays.toString(lengths)+"\nlists = "+Arrays.toString(lists)+"\n";
+
+ final int offL=(prospect==first ? -1 : offsets[lists[index-1]]); //-1 stands in for a left neighbor when the prospect is first
+ final int offP=offsets[prospect];
+ final int offR=(prospect==last ? offsets[offsets.length-1]+1 : offsets[lists[index+1]]); //One past the final offset stands in for a right neighbor when the prospect is last
+ assert(offL<=offP);
+ assert(offP<=offR);
+ assert(offL<offR) : "\noffsets.length="+offsets.length+", lengths.length="+lengths.length+"\n"+
+ ", chunk="+chunk+", lists.length="+lists.length+", index="+index+"\n"+
+ ", offL="+offL+", offR="+offR+", prospect="+prospect+", first="+first+", last="+last+"\n"+
+ ", valuep="+valuep+", valuem="+valuem+", weight="+keyWeight+"\n"+
+ "offsets = "+Arrays.toString(offsets)+"\tlengths = "+Arrays.toString(lengths)+"\nlists = "+Arrays.toString(lists)+"\n";
+
+ int oldLeftSpace=offP-offL;
+ int oldRightSpace=offR-offP;
+ int newSpace=offR-offL; //Gap that would remain between the neighbors; equals oldLeftSpace+oldRightSpace
+
+// int oldLeftSpace=Tools.max((offP-offL)-1, 0)+1;
+// int oldRightSpace=Tools.max((offR-offP)-1, 0)+1;
+// int newSpace=Tools.max(offR-offL;
+
+ long spaceScore=((oldLeftSpace*oldLeftSpace+oldRightSpace*oldRightSpace)-(newSpace*newSpace))*MULT_FOR_SPACING_PENALTY; //Algebraically -2*oldLeftSpace*oldRightSpace times the multiplier; the squares use int arithmetic
+ assert(spaceScore>0) : "\n"+spaceScore+", "+oldLeftSpace+", "+oldRightSpace+", "+newSpace+"\n"+
+ Arrays.toString(offsets)+"\nprospect="+prospect+"\n";
+ valuep+=spaceScore;
+
+ int uniquelyCovered;
+ if(prospect==first){
+ uniquelyCovered=offR-offP; //Technically, -1 should be added
+ }else if(prospect==last){
+ uniquelyCovered=offP-offL; //Technically, -1 should be added
+ }else{
+ int a=offL+chunk;
+ int b=offR-a; //Distance between the neighbors beyond one chunk length
+ uniquelyCovered=(b>0 ? b : 0);
+ }
+
+ if(prospect==first || prospect==last){ //End lists also earn the total-width points
+ valuep+=(POINTS_PER_BASE1+POINTS_FOR_TOTAL_LIST_WIDTH)*uniquelyCovered;
+ }else{
+ valuep+=POINTS_PER_BASE1*uniquelyCovered;
+ }
+
+ return ((long)(valuep*keyWeight))+valuem; //Only valuep is scaled by the key weight
+ }
+
+ public static int[] toBitList(final int key){
+ final int numlists=Integer.bitCount(key);
+ final int[] lists=new int[numlists];
+ for(int i=0, ptr=0; ptr<numlists; i++){
+ if((masks32[i]&key)!=0){
+ lists[ptr]=i;
+ ptr++;
+ }
+ }
+ return lists;
+ }
+
+ public static int[] toBitList(final long key){
+ final int numlists=Long.bitCount(key);
+ assert(numlists>0);
+ final int[] lists=new int[numlists];
+ for(int i=0, ptr=0; ptr<numlists; i++){
+ if((masks[i]&key)!=0){
+ lists[ptr]=i;
+ ptr++;
+ }
+ }
+ return lists;
+ }
+
+	/**
+	 * Scores the set of lists selected by the set bits of key.
+	 * The score sums per-list and per-site points, end-list bonuses, points for the span from the
+	 * first to the last selected list plus chunk, and spacing terms for the gaps between consecutive
+	 * selected lists and before the first / after the last selected list.
+	 * @param offsets Offset of each list
+	 * @param lengths Length of each list
+	 * @param chunk Added to the span; gaps longer than this count as uncovered
+	 * @param key Bit mask of selected lists; bit i selects list i.  At least one bit must be set.
+	 * @return The total score
+	 */
+	public static long evaluate(int[] offsets, int[] lengths, final int chunk, final long key){
+
+		final int[] lists=toBitList(key);
+		final int numlists=lists.length;
+		final int first=lists[0];
+		final int last=lists[numlists-1];
+		final int finalIndex=offsets.length-1;
+
+		long score=0;
+		score+=numlists*POINTS_PER_LIST;
+		for(final int list : lists){
+			score+=POINTS_PER_SITE*lengths[list];
+		}
+		if(first==0){score+=BONUS_POINTS_FOR_END_LIST;}
+		if(last==finalIndex){score+=BONUS_POINTS_FOR_END_LIST;}
+
+		score+=(POINTS_FOR_TOTAL_LIST_WIDTH*(offsets[last]-offsets[first]+chunk));
+
+		//TODO: Special case both ends
+		int i=1;
+		while(i<numlists){
+			final int space=offsets[lists[i]]-offsets[lists[i-1]];
+			final int uncovered=space>chunk ? space-chunk : 0;
+
+			score+=MULT_FOR_SPACING_PENALTY*(space*space);
+			score-=POINTS_PER_BASE1*uncovered;
+			i++;
+		}
+
+		if(first>0){
+			//Gap before the first selected list
+			final long x=offsets[first];
+			score+=MULT_FOR_SPACING_PENALTY*(x*x);
+			score-=POINTS_PER_BASE1*x;
+		}
+
+		if(last<finalIndex){
+			//Gap after the last selected list
+			final long x=offsets[finalIndex]-offsets[last];
+			score+=MULT_FOR_SPACING_PENALTY*(x*x);
+			score-=POINTS_PER_BASE1*x;
+		}
+
+		return score;
+	}
+
+ public static final int BASE_POINTS_PER_SITE=-50; //Used to set POINTS_PER_SITE
+ public static long POINTS_PER_SITE=-50; //TODO: Make private with a get() and set() function
+
+ public static final long MULT_FOR_SPACING_PENALTY=-30; //Multiplies squared gaps between lists
+
+ public static long EARLY_TERMINATION_SCORE=(POINTS_PER_SITE*2000); //TODO: Should be set dynamically
+
+ public static final long POINTS_PER_LIST=30000;
+ public static final long POINTS_PER_BASE1=6000; //Points for a base covered once
+ public static final long POINTS_PER_BASE2=1000;//POINTS_PER_BASE1/4; //Points for a base covered twice
+ public static final long BONUS_POINTS_FOR_END_LIST=40000; //Extra points for the first and last list
+ public static final long POINTS_FOR_TOTAL_LIST_WIDTH=5500; //multiplier for distance between first and last list
+
+ public static final long[] masks=new long[64]; //masks[i] has only bit i set
+ public static final int[] masks32=new int[32]; //masks32[i] has only bit i set
+ static{ //Fills both single-bit lookup tables
+ for(int i=0; i<masks.length; i++){masks[i]=(1L<<i);}
+ for(int i=0; i<masks32.length; i++){masks32[i]=(1<<i);}
+ }
+}
diff --git a/current/align2/SortReadsByID.java b/current/align2/SortReadsByID.java
new file mode 100755
index 0000000..574159f
--- /dev/null
+++ b/current/align2/SortReadsByID.java
@@ -0,0 +1,283 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 1, 2012
+ *
+ */
+public class SortReadsByID {
+
+	/**
+	 * Command-line entry point.  Parses key=value arguments, validates the filenames, and runs the sort.
+	 * Recognized here: in/in1/input/input1 (a '#' expands to paired files 1 and 2), in2/input2,
+	 * out/output (must contain '#'), renumber, append, overwrite, blocksize; other arguments are
+	 * offered to the shared Parser methods first.
+	 * @param args Command-line arguments
+	 * @throws RuntimeException On an unknown parameter, a missing input or output, duplicate filenames,
+	 * or an output name without '#'
+	 */
+	public static void main(String[] args){
+
+		Parser parser=new Parser();
+		String in1=null;
+		String in2=null;
+		String out="raw_idsorted#.txt.gz";
+
+		for(int i=0; i<args.length; i++){
+			final String arg=args[i];
+			final String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			if("null".equalsIgnoreCase(b)){b=null;}
+//			System.err.println("Processing "+args[i]);
+
+			if(Parser.isJavaFlag(arg)){
+				//jvm argument; do nothing
+			}else if(Parser.parseCommonStatic(arg, a, b)){
+				//do nothing
+			}else if(Parser.parseZip(arg, a, b)){
+				//do nothing
+			}else if(Parser.parseQuality(arg, a, b)){
+				//do nothing
+			}else if(parser.parseInterleaved(arg, a, b)){
+				//do nothing
+			}else if(a.equals("i") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+				in1=b;
+				//b is null for "in" without a value or "in=null"; leave in1 null so the check below reports it
+				if(b!=null && b.indexOf('#')>=0){
+					in1=b.replaceFirst("#", "1");
+					in2=b.replaceFirst("#", "2");
+				}
+			}else if(a.equals("in2") || a.equals("input2")){
+				in2=b;
+			}else if(a.equals("o") || a.equals("out") || a.equals("output")){
+				out=b;
+			}else if(a.endsWith("renumber")){
+				RENUMBER=Tools.parseBoolean(b);
+			}else if(a.equals("append") || a.equals("app")){
+				append=ReadStats.append=Tools.parseBoolean(b);
+			}else if(a.equals("overwrite") || a.equals("ow")){
+				overwrite=Tools.parseBoolean(b);
+				Data.sysout.println("Set overwrite to "+overwrite);
+			}else if(a.endsWith("blocksize")){
+				BLOCKSIZE=Integer.parseInt(b);
+			}else{
+				throw new RuntimeException("Unknown parameter: "+args[i]);
+			}
+		}
+
+		if(in1==null){throw new RuntimeException("Please specify input file.");}
+		if(out==null){throw new RuntimeException("Please specify output file.");}
+		if(in1.equalsIgnoreCase(in2) || in1.equalsIgnoreCase(out) || (in2!=null && in2.equalsIgnoreCase(out))){
+			throw new RuntimeException("Duplicate filenames.");
+		}
+
+		if(out!=null && !out.contains("#")){
+			throw new RuntimeException("Output filename must contain '#' symbol.");
+		}
+
+		SortReadsByID srid=new SortReadsByID(in1, in2, out);
+		srid.process();
+	}
+
+
+	/**
+	 * Sorts the input reads by numeric ID in two passes.
+	 * Pass 1 streams the input and writes each read to a temporary bin file chosen by
+	 * numericID/BLOCKSIZE.  Pass 2 visits the bins in ascending order, loads each one fully,
+	 * sorts it with ReadComparatorID.comparator, appends it to the final output, and deletes the bin file.
+	 * Timing and the total read count are printed to Data.sysout.
+	 */
+	public void process(){
+
+		Timer tRead=new Timer();
+		Timer tSort=new Timer();
+		Timer tAll=new Timer();
+
+		tRead.start();
+		tAll.start();
+
+		final long maxReads=-1;
+		ConcurrentReadInputStream cris;
+		{
+			FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+			FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+			cris.start(); //4567
+		}
+
+		HashMap<Integer, Block> map=new HashMap<Integer, Block>();
+
+		{
+			ListNum<Read> ln=cris.nextList();
+			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+			while(reads!=null && reads.size()>0){
+				//System.err.println("reads.size()="+reads.size());
+				for(Read r : reads){
+
+					int bin=(int)(r.numericID/BLOCKSIZE);
+					Block b=map.get(bin);
+					if(b==null){
+						//First read of this bin: open its temporary file(s)
+						String o1=out.replaceFirst("#", "_bin"+bin+"_1");
+						String o2=(cris.paired() && !OUT_INTERLEAVED) ? out.replaceFirst("#", "_bin"+bin+"_2") : null;
+						b=new Block(o1, o2);
+						map.put(bin, b);
+					}
+					b.add(r);
+				}
+				//System.err.println("returning list");
+				cris.returnList(ln.id, ln.list.isEmpty());
+				//System.err.println("fetching list");
+				ln=cris.nextList();
+				reads=(ln!=null ? ln.list : null);
+			}
+
+			//The loop also ends when ln or ln.list is null, so neither may be dereferenced blindly
+			if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+			ReadWrite.closeStream(cris);
+		}
+
+		for(Block b : map.values()){b.close();}
+
+		tRead.stop();
+		Data.sysout.println("Read time: \t"+tRead);
+		tSort.start();
+
+		String o1=out.replaceFirst("#", "1");
+		String o2=(cris.paired() && !OUT_INTERLEAVED) ? out.replaceFirst("#", "2") : null;
+		Block sorted=new Block(o1, o2);
+
+		long count=0;
+
+		ArrayList<Integer> keys=new ArrayList<Integer>();
+		keys.addAll(map.keySet());
+		Collections.sort(keys);
+		for(Integer key : keys){
+			Block b=map.get(key);
+			b.join(); //Wait until the bin file is completely written before reading it back
+			map.remove(key);
+			{
+				FileFormat ff1=FileFormat.testInput(b.out1, FileFormat.FASTQ, null, true, true);
+				FileFormat ff2=FileFormat.testInput(b.out2, FileFormat.FASTQ, null, true, true);
+				cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+				cris.start(); //4567
+			}
+			ArrayList<Read> reads2=new ArrayList<Read>((int)b.count);
+			count+=b.count;
+
+			{
+				ListNum<Read> ln=cris.nextList();
+				ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+				while(reads!=null && reads.size()>0){
+					reads2.addAll(reads);
+					//System.err.println("returning list");
+					cris.returnList(ln.id, ln.list.isEmpty());
+					//System.err.println("fetching list");
+					ln=cris.nextList();
+					reads=(ln!=null ? ln.list : null);
+				}
+
+				//Same guard as in the first pass
+				if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+				ReadWrite.closeStream(cris);
+			}
+
+			Collections.sort(reads2, ReadComparatorID.comparator);
+			for(Read r : reads2){sorted.add(r);}
+			new File(b.out1).delete();
+			if(b.out2!=null){new File(b.out2).delete();}
+		}
+
+		sorted.close();
+		sorted.join();
+
+		tSort.stop();
+		tAll.stop();
+
+		Data.sysout.println("Total reads: \t"+count);
+		Data.sysout.println("Sort time: \t"+tSort);
+		Data.sysout.println("Total time: \t"+tAll);
+
+	}
+
+ /**
+ * @param in1_ Primary input file
+ * @param in2_ Secondary input file for paired reads; null when main was given none
+ * @param out_ Output filename pattern; its detected format selects fastq, fasta, or text output
+ */
+ public SortReadsByID(String in1_, String in2_, String out_) {
+ in1=in1_;
+ in2=in2_;
+ out=out_;
+
+ FileFormat ff=FileFormat.testOutput(out, FileFormat.BREAD, null, true, false, append, false); //Format is detected from the output name, with BREAD as the default passed in
+ outFastq=ff.fastq();
+ outFasta=ff.fasta();
+ outText=ff.bread();
+ }
+
+ public String in1; //Primary input file
+ public String in2; //Secondary input file, or null
+ public String out; //Output pattern; process() replaces its first '#' to name the bin files and the final files
+
+ private final boolean outText; //Exactly how Block.add formats a read: text is tried first, then fastq, then fasta
+ private final boolean outFasta;
+ private final boolean outFastq;
+
+ public static int BLOCKSIZE=8000000; //Width of the numeric ID range that falls into one temporary bin
+ public static boolean overwrite=true; //Passed to every TextStreamWriter opened by Block
+ public static boolean append=false; //Passed to FileFormat.testOutput in the constructor
+ public static boolean RENUMBER=false; //Set by the "renumber" argument; not read by any code in this class
+ public static boolean OUT_INTERLEAVED=false; //When true, paired reads share one output file; never assigned within this class
+
+	/**
+	 * One output destination (a temporary bin or the final sorted output), backed by one
+	 * TextStreamWriter per file, that also counts the reads added to it.
+	 */
+	private class Block{
+
+		/**
+		 * Creates the writers and then starts them.
+		 * @param out1_ File for read 1 (and for read 2 as well when out2_ is null)
+		 * @param out2_ File for read 2, or null
+		 */
+		public Block(String out1_, String out2_){
+			out1=out1_;
+			out2=out2_;
+
+			tsw1=new TextStreamWriter(out1, overwrite, false, false);
+			if(out2==null){
+				tsw2=null;
+			}else{
+				tsw2=new TextStreamWriter(out2, overwrite, false, false);
+			}
+
+			tsw1.start();
+			if(tsw2!=null){tsw2.start();}
+		}
+
+		/** Formats a read in the output format chosen by the enclosing SortReadsByID; null if no format flag is set. */
+		private StringBuilder format(Read x){
+			if(outText){return x.toText(true);}
+			if(outFastq){return x.toFastq();}
+			if(outFasta){return x.toFasta();}
+			return null;
+		}
+
+		/**
+		 * Queues a read, and its mate if present, for writing.
+		 * The mate goes to the second writer when there is one, otherwise after the read in the first writer.
+		 * @param r Read to write
+		 */
+		public void add(Read r){
+			count++;
+			final Read mate=r.mate;
+
+			final StringBuilder text1=format(r);
+			final StringBuilder text2=(mate==null ? null : format(mate));
+
+			tsw1.print(text1.append('\n'));
+			if(text2==null){return;}
+
+			//Without a second writer the pair is written interleaved
+			final TextStreamWriter mateWriter=(tsw2!=null ? tsw2 : tsw1);
+			mateWriter.print(text2.append('\n'));
+		}
+
+		/** Signals both writers that no more data will arrive. */
+		public void close(){
+			tsw1.poison();
+			if(tsw2!=null){tsw2.poison();}
+		}
+
+		/** Blocks until both writers have finished. */
+		public void join(){
+			tsw1.waitForFinish();
+			if(tsw2!=null){tsw2.waitForFinish();}
+		}
+
+		String out1;
+		String out2;
+
+		TextStreamWriter tsw1;
+		TextStreamWriter tsw2;
+
+		/** Number of reads passed to add() */
+		long count=0;
+
+	}
+
+
+}
diff --git a/current/align2/SortReadsByMapping.java b/current/align2/SortReadsByMapping.java
new file mode 100755
index 0000000..d731bf3
--- /dev/null
+++ b/current/align2/SortReadsByMapping.java
@@ -0,0 +1,1972 @@
+package align2;
+
+import java.io.File;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.ReadStreamStringWriter;
+import stream.ReadStreamWriter;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+public class SortReadsByMapping {
+
+
+ /**
+ * Command-line entry point.
+ * <p>Every argument is first scanned as an optional flag, either a bare name ("merge") or
+ * "name=value" ("merge=t"); arguments that match no flag are ignored by that scan.
+ * The first four arguments are then read positionally:
+ * args[0] = reads file 1, args[1] = reads file 2 or "null", args[2] = output name containing '#'
+ * (or "null" to derive one from args[0]), args[3] = block size.
+ * After processing, summary statistics are printed to Data.sysout.
+ * @param args Command-line arguments; at least four are required
+ * @throws IllegalArgumentException If fewer than four arguments are supplied
+ */
+ public static void main(String[] args){
+
+ for(String s_ : args){
+ String s=s_.toLowerCase();
+ String split[]=(s.contains("=") ? s.split("=") : null);
+ if(s.equalsIgnoreCase("merge")){MERGE_DUPLICATES=true;}
+ else if(s.equalsIgnoreCase("regen")){REGENERATE_MATCH_STRING=true;}
+ else if(s.equalsIgnoreCase("trim")){TRIM_LOW_QUALITY_TAILS=true;}
+ else if(s.equalsIgnoreCase("fixshort")){FIX_SHORT_PAIRED_READS=true;}
+ else if(s.equalsIgnoreCase("removesingletonduplicates")){REMOVE_SINGLETON_DUPLICATES_OF_PAIRS=true;}
+ else if(s.equalsIgnoreCase("swaptoplus") || s.equalsIgnoreCase("swap")){SWAP_READ1_TO_PLUS=true;}
+ else if(s.equalsIgnoreCase("mergeoppositestrand")){MERGE_OPPOSITE_STRAND_DUPLICATES=true;}
+ else if(s.startsWith("merge=")){
+ //Boolean flags here accept any value starting with 't', or exactly "1", as true.
+ MERGE_DUPLICATES=(split[1].startsWith("t") || split[1].equals("1"));
+ }else if(s.startsWith("regen=")){
+ REGENERATE_MATCH_STRING=(split[1].startsWith("t") || split[1].equals("1"));
+ }else if(s.startsWith("trim=")){
+ TRIM_LOW_QUALITY_TAILS=(split[1].startsWith("t") || split[1].equals("1"));
+ }else if(s.startsWith("fixshort=")){
+ FIX_SHORT_PAIRED_READS=(split[1].startsWith("t") || split[1].equals("1"));
+ }else if(s.startsWith("removesingletonduplicates=") || s.startsWith("removesingletonduplicatesofpairs=")){
+ REMOVE_SINGLETON_DUPLICATES_OF_PAIRS=(split[1].startsWith("t") || split[1].equals("1"));
+ }else if(s.startsWith("minq=") || s.startsWith("minquality=") || s.startsWith("trimquality=")){
+ TRIM_QUALITY=Byte.parseByte(split[1]);
+ }else if(s.startsWith("window=") || s.startsWith("trimwindow=")){
+ TRIM_WINDOW=Byte.parseByte(split[1]);
+ }else if(s.startsWith("swaptoplus=") || s.startsWith("swap=")){
+ SWAP_READ1_TO_PLUS=(split[1].startsWith("t") || split[1].equals("1"));
+ }else if(s.startsWith("mergeoppositestrand=")){
+ MERGE_OPPOSITE_STRAND_DUPLICATES=(split[1].startsWith("t") || split[1].equals("1"));
+ }else if(s.startsWith("readlimit=")){
+ READ_LIMIT=Long.parseLong(split[1]);
+ Data.sysout.println("Set READ_LIMIT to "+READ_LIMIT);
+ }else if(s.startsWith("threads=")){
+ REGEN_THREADS=Integer.parseInt(split[1]);
+ }else if(s.startsWith("overwrite=")){
+ overwrite=Tools.parseBoolean(split[1]);
+ }
+ }
+
+ Read.DECOMPRESS_MATCH_ON_LOAD=true;
+
+ //The four positional arguments are mandatory; fail with a readable message
+ //rather than an ArrayIndexOutOfBoundsException from the reads below.
+ if(args.length<4){
+ throw new IllegalArgumentException("Expected at least 4 arguments (reads1 reads2 outname blocksize) but got "+args.length);
+ }
+
+ SortReadsByMapping srt;
+
+ String reads1=args[0];
+ String reads2=args[1].equalsIgnoreCase("null") ? null : args[1];
+ String outname=args[2].equalsIgnoreCase("null") ? ReadWrite.parseRoot(reads1)+"mapped_sorted#.txt.gz" : args[2];
+ assert(outname.contains("#"));
+ int blocksize=Integer.parseInt(args[3]);
+
+ srt=new SortReadsByMapping(reads1, reads2, outname, blocksize);
+
+
+ srt.process();
+
+ //Multipliers that turn a count into a percentage of processed reads / initially mapped bases.
+ double rmult=100d/(srt.processed);
+ double bmult=100d/srt.basesInitiallyMapped;
+
+ float pmult=(srt.paired ? 2 : 1);
+
+ long remaining=srt.processed-srt.merged-srt.merged2-srt.removedSingletonDupe-srt.removedLQ-srt.removedShort;
+ Data.sysout.println("Processed "+srt.processed+" reads; "+remaining+" remaining"+String.format(" (%.2f%%)", remaining*rmult));
+ if(MERGE_DUPLICATES){
+ Data.sysout.println("Merged "+srt.merged2+" strict duplicates"+String.format(" (%.2f%%)", srt.merged2*rmult));
+ Data.sysout.println("Merged "+srt.merged+" duplicates"+String.format(" (%.2f%%)", srt.merged*rmult));
+ if(srt.paired && REMOVE_SINGLETON_DUPLICATES_OF_PAIRS){
+ Data.sysout.println("Removed "+srt.removedSingletonDupe+" singleton duplicates of pairs"+
+ String.format(" (%.2f%%)", srt.removedSingletonDupe*rmult));
+ }
+ }
+ if(FIX_SHORT_PAIRED_READS){
+ Data.sysout.println("Removed "+srt.removedShort+" short reads"+String.format(" (%.2f%%)", srt.removedShort*rmult));
+ Data.sysout.println("Trimmed "+srt.basesOverlapping+" overlapping bases of "+srt.basesInitiallyMapped+" initially mapped"+
+ String.format(" (%.2f%%)", srt.basesOverlapping*bmult));
+ }
+ if(TRIM_LOW_QUALITY_TAILS){
+ Data.sysout.println("Removed "+srt.removedLQ+" low-quality reads"+String.format(" (%.2f%%)", srt.removedLQ*rmult));
+ //NOTE(review): the text reports basesMapped but the percentage is relative to basesInitiallyMapped - confirm this is intended.
+ Data.sysout.println("Trimmed "+srt.basesRemoved+" low-quality bases of "+srt.basesMapped+" mapped"+
+ String.format(" (%.2f%%)", srt.basesRemoved*bmult));
+ }
+
+ Data.sysout.println("Total valid, mapped tags written: "+
+ srt.validReadsWritten+String.format(" (%.2f%%)", srt.validReadsWritten*rmult/pmult));
+ Data.sysout.println("Total valid, mapped bases written: "+
+ srt.validBasesWritten+String.format(" (%.2f%%)", srt.validBasesWritten*bmult));
+ }
+
+ public SortReadsByMapping(String fname1, String fname2, String outname_, int blocksize_){
+ assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name.";
+
+ long limit=READ_LIMIT;
+ RTextInputStream rtis=new RTextInputStream(fname1, fname2, limit);
+ outname=outname_;
+ paired=rtis.paired();
+ cris=new ConcurrentLegacyReadInputStream(rtis, limit);
+ blocksize=blocksize_;
+ assert(blocksize>200000);
+
+ blockwriter1=(fname1==null ? null : new ReadStreamStringWriter(null, true, 4, false));
+ blockwriter2=(fname2==null ? null : new ReadStreamStringWriter(null, false, 4, false));
+ }
+
+ /**
+ * Runs the complete sort.
+ * <ol>
+ * <li>Derives the output file names from outname ('#' becomes 1 and, for paired input, 2) and,
+ * when overwrite is false, refuses to continue if either already exists.</li>
+ * <li>Starts the input stream and the block-writer threads, then passes every read to addRead().
+ * With KILL_BAD_PAIRS on paired input, the lower-scoring half of each bad pair has its answers cleared first.</li>
+ * <li>Calls finishWritingBlocks(), then poisons and joins the block writers.</li>
+ * <li>Visits the blocks of table in sorted key order: reads each block back, deletes it, and - depending on
+ * the static flags - merges duplicates, fixes short pairs, trims low-quality tails and regenerates
+ * match strings before handing the list to the final writer(s).</li>
+ * <li>Poisons and joins the final writers, collects their written counts and prints timings.</li>
+ * </ol>
+ * @throws RuntimeException If overwrite is false and an output file exists, or if a block holds more than
+ * MAX_BLOCKSIZE_TO_SORT reads
+ */
+ public void process(){
+
+ final String fname1=outname.replaceFirst("#", "1");
+ final String fname2=(!paired ? null : outname.replaceFirst("#", "2"));
+ if(!overwrite){
+ if(fname1!=null && new File(fname1).exists()){throw new RuntimeException("Destination file "+fname1+" already exists.");}
+ if(fname2!=null && new File(fname2).exists()){throw new RuntimeException("Destination file "+fname2+" already exists.");}
+ }
+
+ //t times the individual phases; total times the whole call.
+ Timer t=new Timer();
+ Timer total=new Timer();
+ t.start();
+ total.start();
+
+
+ cris.start();
+ System.err.println("Started cris");
+
+ Thread bwt1=null, bwt2=null;
+ if(fname1!=null){
+ bwt1=new Thread(blockwriter1);
+ bwt1.start();
+ }
+ if(fname2!=null){
+ bwt2=new Thread(blockwriter2);
+ bwt2.start();
+ }
+ System.err.println("Started blockwriters");
+
+ {
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //Use the first read to check pairing and detect reads whose mates have a different length.
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert(paired==(r.mate!=null));
+ if(paired){
+ asymmetricReads=(r.length()!=r.mateLength());
+ }
+ }
+
+ while(reads!=null && reads.size()>0){
+ //System.err.println("reads.size()="+reads.size());
+
+
+ if(KILL_BAD_PAIRS && paired){
+ for(Read r : reads){
+
+ if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, 20000)){
+ //Keep whichever read has the higher score per base (integer division); ties keep r.
+ int x=r.mapScore/r.length();
+ int y=r.mate.mapScore/r.mateLength();
+ if(x>=y){
+ r.mate.clearAnswers(false);
+ }else{
+ r.clearAnswers(false);
+ }
+ }
+
+ addRead(r);
+ }
+ }else{
+ for(Read r : reads){addRead(r);}
+ }
+
+
+
+ //System.err.println("returning list");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ System.err.println("Finished reading");
+ //The loop also ends when nextList() returned null or a ListNum without a list, so only
+ //hand the final list back when there is one instead of dereferencing null.
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ System.err.println("Returned list");
+ ReadWrite.closeStream(cris);
+ System.err.println("Closed stream");
+ }
+
+
+ synchronized(this){this.notifyAll();}
+ System.err.println("Notified all");
+
+ finishWritingBlocks();
+ System.err.println("Wrote blocks");
+
+
+ if(bwt1!=null){blockwriter1.poison();}
+ if(bwt2!=null){blockwriter2.poison();}
+
+ //Retry join() until each block-writer thread has really terminated.
+ if(bwt1!=null){
+ while(bwt1.isAlive()){
+ try {
+ bwt1.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ if(bwt2!=null){
+ while(bwt2.isAlive()){
+ try {
+ bwt2.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ t.stop();
+ Data.sysout.println("Temp Write Time: "+t);
+ t.start();
+
+ //Final output uses a compression level of at least 2.
+ if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;}
+ ReadStreamWriter wt1=(fname1==null ? null : new ReadStreamStringWriter(fname1, true, 4, false));
+ ReadStreamWriter wt2=(fname2==null ? null : new ReadStreamStringWriter(fname2, false, 4, false));
+
+ Thread wtt1=(wt1==null ? null : new Thread(wt1));
+ Thread wtt2=(wt2==null ? null : new Thread(wt2));
+
+ if(wtt1!=null){wtt1.start();}
+ if(wtt2!=null){wtt2.start();}
+
+ //Blocks are emitted in sorted key order.
+ ArrayList<String> keys=new ArrayList<String>(table.size());
+ keys.addAll(table.keySet());
+ Collections.sort(keys);
+
+ final ReadComparatorMapping mcomp=new ReadComparatorMapping();
+
+ int lastChrom=-1;
+ for(String key : keys){
+ Block b=table.get(key);
+ table.remove(key);
+ processed+=b.added;
+
+ if(UNLOAD_CHROMS_WHEN_DONE && lastChrom>-1 && b.chrom!=lastChrom){
+ Data.unload(lastChrom, false); //Saves memory when regenerating match strings
+ }
+ lastChrom=b.chrom;
+
+ if(b.added>MAX_BLOCKSIZE_TO_SORT){
+ //Oversized blocks currently abort the run; the pass-through code below the throw is unreachable at runtime.
+ if(true){throw new RuntimeException("Skipping sorting for key "+key+" of size "+b.added);}
+ RTextInputStream temp=new RTextInputStream(b.fname1, b.fname2, -1);
+ ArrayList<Read> reads=temp.nextList();
+ while(reads!=null && reads.size()>0){
+ if(reads!=null && reads.size()>0){
+ if(wt1!=null){wt1.addList(reads);}
+ if(wt2!=null){wt2.addList(reads);}
+ }
+ b.numRead+=reads.size();
+ reads=temp.nextList();
+ }
+ temp.close();
+ temp=null;
+
+ Data.sysout.println(key+"\t"+b.added);
+ b.delete();
+ }else{
+ ArrayList<Read> list=b.readBlock();
+ Data.sysout.println(key+"\t"+list.size());
+ b.delete();
+
+ //Collections.sort(list, mcomp);
+ if(MERGE_DUPLICATES){
+ if(!paired){
+ Collections.sort(list, mcomp);
+ if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(list);}
+ findAndMergeDuplicates(list, false);
+ }else{
+
+ //Possibly, doing two passes (unswap, merge, reswap, merge) is unnecessary...
+
+ if(SWAP_READ1_TO_PLUS && paired){
+ //Unswap
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r!=null && r.mate!=null && r.swapped()){list.set(i, r.mate);}
+ }
+ //Merge
+ doPairedSplitAndMergeSeries(list, mcomp, false);
+ //Reswap
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r!=null && r.mate!=null && r.swapped()){list.set(i, r.mate);}
+ }
+
+ if(MERGE_OPPOSITE_STRAND_DUPLICATES){
+ //Merge
+ doPairedSplitAndMergeSeries(list, mcomp, true);
+ }else{
+ Collections.sort(list, mcomp);
+ }
+ }else{
+ doPairedSplitAndMergeSeries(list, mcomp, false);
+ }
+ }
+ }
+
+ //Unswap
+ if(SWAP_READ1_TO_PLUS && paired){
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r!=null && r.mate!=null && r.swapped()){
+ list.set(i, r.mate);
+ }
+ }
+ }
+
+ if(FIX_SHORT_PAIRED_READS && paired){
+ //rvector receives {removed, bases removed, bases mapped, reads needing regeneration}.
+ int[] rvector=new int[4];
+ int removedTemp=fixShort(list, rvector);
+ removedShort+=removedTemp;
+ basesOverlapping+=rvector[1];
+ basesInitiallyMapped+=rvector[2];
+ int needRegen=rvector[3];
+
+ if(REGENERATE_MATCH_STRING && needRegen>0){
+ regenMatchStrings(list);
+ }
+ }else{
+ for(Read r : list){
+ if(r!=null){
+ if(r.mapped()){basesInitiallyMapped+=r.length();}
+ if(r.mate!=null && r.mate.mapped()){basesInitiallyMapped+=r.mate.length();}
+ }
+ }
+ }
+
+ if(TRIM_LOW_QUALITY_TAILS){
+ int[] rvector=new int[4];
+ int removedTemp=trimTails(list, TRIM_WINDOW, TRIM_QUALITY, rvector);
+ removedLQ+=removedTemp;
+ basesRemoved+=rvector[1];
+ basesMapped+=rvector[2];
+ int needRegen=rvector[3];
+
+ if(REGENERATE_MATCH_STRING && needRegen>0){
+ regenMatchStrings(list);
+ }
+ }else{
+ for(Read r : list){
+ if(r!=null){
+ if(r.mapped() && !r.invalid()){basesMapped+=r.length();}
+ if(r.mate!=null && !r.mate.invalid() && r.mate.mapped()){basesMapped+=r.mateLength();}
+ }
+ }
+ }
+
+ //Reswap
+ if(SWAP_READ1_TO_PLUS && paired){
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r!=null && r.mate!=null && r.swapped()){
+ list.set(i, r.mate);
+ }
+ }
+ }
+
+ if(list!=null && list.size()>0){
+ if(wt1!=null){wt1.addList(list);}
+ if(wt2!=null){wt2.addList(list);}
+ }
+ }
+ }
+
+ //Add poison
+// if(wt1!=null){wt1.addList(null);}
+// if(wt2!=null){wt2.addList(null);}
+ if(wt1!=null){wt1.poison();}
+ if(wt2!=null){wt2.poison();}
+
+ readsWritten=0;
+ basesWritten=0;
+ validReadsWritten=0;
+ validBasesWritten=0;
+
+ //The written counts are read only after the corresponding writer thread has terminated.
+ if(wtt1!=null){
+ while(wtt1.isAlive()){
+ try {
+ wtt1.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ readsWritten+=wt1.readsWritten();
+ basesWritten+=wt1.basesWritten();
+ validReadsWritten+=wt1.validReadsWritten();
+ validBasesWritten+=wt1.validBasesWritten();
+ }
+
+ if(wtt2!=null){
+ while(wtt2.isAlive()){
+ try {
+ wtt2.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ readsWritten+=wt2.readsWritten();
+ basesWritten+=wt2.basesWritten();
+ validReadsWritten+=wt2.validReadsWritten();
+ validBasesWritten+=wt2.validBasesWritten();
+ }
+
+ t.stop();
+ total.stop();
+ Data.sysout.println("Final Sort + Write Time: "+t);
+ Data.sysout.println("Total Time: "+total);
+
+ }
+
+
+ private void doPairedSplitAndMergeSeries(ArrayList<Read> list, final ReadComparatorMapping mcomp, boolean mergeDifferentLength){
+
+ //This special section is probably not necessary.
+ //Theoretically, keeping everything in a single list should work fine.
+
+ int p=0, e1=0, e2=0, e12=0;
+ for(Read r : list){
+ if(r!=null){
+ if(r.paired()){
+ p++;
+ }else if(r.mapped() && r.mate.mapped()){
+ e12++;
+ }else if(r.mapped()){
+ e1++;
+ }else if(r.mate.mapped()){
+ e2++;
+ }
+ }
+ }
+
+ ArrayList<Read> listP=new ArrayList<Read>(p);
+ ArrayList<Read> list1=new ArrayList<Read>(e1);
+ ArrayList<Read> list2=new ArrayList<Read>(e2);
+ ArrayList<Read> list12=new ArrayList<Read>(e12);
+
+ for(Read r : list){
+ if(r!=null){
+ if(r.paired()){
+ listP.add(r);
+ }else if(r.mapped() && r.mate.mapped()){
+ list12.add(r);
+ }else if(r.mapped()){
+ list1.add(r);
+ }else if(r.mate.mapped()){
+ list2.add(r);
+ }
+ }
+ }
+ list.clear();
+
+ Collections.sort(listP, mcomp);
+ if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(listP);}
+ findAndMergeDuplicates(listP, mergeDifferentLength);
+ list.addAll(listP);
+ listP=null;
+
+ Collections.sort(list1, mcomp);
+ findAndMergeDuplicates(list1, mergeDifferentLength);
+ list.addAll(list1);
+ list1=null;
+
+ Collections.sort(list2, mcomp);
+ findAndMergeDuplicates(list2, mergeDifferentLength);
+ list.addAll(list2);
+ list2=null;
+
+ Collections.sort(list12, mcomp);
+ if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(list12);}
+ findAndMergeDuplicates(list12, mergeDifferentLength);
+ list.addAll(list12);
+ list12=null;
+
+ Tools.condense(list);
+ Collections.sort(list, mcomp);
+ if(REMOVE_SINGLETON_DUPLICATES_OF_PAIRS){
+ findAndRemoveSingletonDuplicatesOfPairs(list);
+ Tools.condense(list);
+ Collections.sort(list, mcomp);
+ }
+ }
+
+
+ private void doPairedSplitAndMergeSeries_old(ArrayList<Read> list, final ReadComparatorMapping mcomp){
+
+ //This special section is probably not necessary.
+ //Theoretically, keeping everything in a single list should work fine.
+
+ int p=0, e1=0, e2=0, e12=0;
+ for(Read r : list){
+ if(r!=null){
+ if(r.paired()){
+ p++;
+ }else if(r.mapped() && r.mate.mapped()){
+ e12++;
+ }else if(r.mapped()){
+ e1++;
+ }else if(r.mate.mapped()){
+ e2++;
+ }
+ }
+ }
+
+ ArrayList<Read> listP=new ArrayList<Read>(p);
+ ArrayList<Read> list1=new ArrayList<Read>(e1);
+ ArrayList<Read> list2=new ArrayList<Read>(e2);
+ ArrayList<Read> list12=new ArrayList<Read>(e12);
+
+ for(Read r : list){
+ if(r!=null){
+ if(r.paired()){
+ listP.add(r);
+ }else if(r.mapped() && r.mate.mapped()){
+ list12.add(r);
+ }else if(r.mapped()){
+ list1.add(r);
+ }else if(r.mate.mapped()){
+ list2.add(r);
+ }
+ }
+ }
+ list.clear();
+
+ Collections.sort(listP, mcomp);
+ if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(listP);}
+ findAndMergeDuplicates(listP, false);
+ if(asymmetricReads && SWAP_READ1_TO_PLUS){
+ Tools.condense(listP);
+ Collections.sort(listP, mcomp);
+ findAndMergeDuplicates(listP, true);
+ }
+ list.addAll(listP);
+ listP=null;
+
+ Collections.sort(list1, mcomp);
+ findAndMergeDuplicates(list1, false);
+ if(asymmetricReads && SWAP_READ1_TO_PLUS){
+ Tools.condense(list1);
+ Collections.sort(list1, mcomp);
+ findAndMergeDuplicates(list1, true);
+ }
+ list.addAll(list1);
+ list1=null;
+
+ Collections.sort(list2, mcomp);
+ findAndMergeDuplicates(list2, false);
+ if(asymmetricReads && SWAP_READ1_TO_PLUS){
+ Tools.condense(list2);
+ Collections.sort(list2, mcomp);
+ findAndMergeDuplicates(list2, true);
+ }
+ list.addAll(list2);
+ list2=null;
+
+ Collections.sort(list12, mcomp);
+ if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(list12);}
+ findAndMergeDuplicates(list12, false);
+ if(asymmetricReads && SWAP_READ1_TO_PLUS){
+ Tools.condense(list12);
+ Collections.sort(list12, mcomp);
+ findAndMergeDuplicates(list12, true);
+ }
+ list.addAll(list12);
+ list12=null;
+
+ Tools.condense(list);
+ Collections.sort(list, mcomp);
+ if(REMOVE_SINGLETON_DUPLICATES_OF_PAIRS){
+ findAndRemoveSingletonDuplicatesOfPairs(list);
+ Tools.condense(list);
+ Collections.sort(list, mcomp);
+ }
+ }
+
+
+ /**
+ * Applies trimTail to every mapped, valid read and mate in the list, and replaces an entry with null
+ * when neither the read nor its mate is still valid and mapped afterwards.
+ * @param list Reads to trim; entries may be set to null
+ * @param thresh Forwarded to trimTail
+ * @param minq Forwarded to trimTail
+ * @param rvector If non-null, receives {removed, basesRemoved, basesMapped, needRegen}
+ * @return Number of list entries set to null
+ */
+ private static int trimTails(ArrayList<Read> list, int thresh, byte minq, int[] rvector){
+
+ int removed=0, basesRemoved=0, basesMapped=0, needRegen=0;
+
+ for(int idx=0; idx<list.size(); idx++){
+ final Read read=list.get(idx);
+ if(read==null){continue;}
+ final Read mate=read.mate;
+
+ if(read.mapped() && !read.invalid()){
+ basesMapped+=read.length();
+ basesRemoved+=trimTail(read, thresh, minq);
+ if(read.match==null && read.valid()){
+ needRegen++; //Trimming dropped the match string of a read that is still valid
+ }else{
+ assert(read.invalid() || TranslateColorspaceRead.verifyMatchString2(read, true));
+ }
+ }
+
+ if(mate!=null && mate.mapped() && !mate.invalid()){
+ basesMapped+=mate.length();
+ basesRemoved+=trimTail(mate, thresh, minq);
+ if(mate.match==null && mate.valid()){
+ needRegen++;
+ }else{
+ assert(mate.invalid() || TranslateColorspaceRead.verifyMatchString2(mate, true));
+ }
+ }
+
+ //Keep the entry while the read itself is still usable.
+ if(!(read.invalid() || !read.mapped())){continue;}
+ if(mate==null || mate.invalid() || !mate.mapped()){
+ removed++;
+ list.set(idx, null);
+ }
+ }
+
+ if(rvector!=null){
+ rvector[0]=removed;
+ rvector[1]=basesRemoved;
+ rvector[2]=basesMapped;
+ rvector[3]=needRegen;
+ }
+
+ return removed;
+ }
+
+ /**
+ * Handles pairs whose two alignments overlap on the reference.
+ * <p>For each entry where the read is mapped, flagged paired() and has a mapped mate, the outer and
+ * inner reference distances are computed from the strands and coordinates. If the inner distance is
+ * not positive, or the outer distance is shorter than the two read lengths combined, the overlap is
+ * measured with countCalledBasesOnOrAfterRefLoc and either both reads are marked invalid (too little
+ * would remain) or the overlap is trimmed from the read tails with trimTailByXBases.
+ * Such an entry is set to null when afterwards both reads are invalid or unmapped.
+ * @param list Reads to examine; reads are modified and entries may be set to null
+ * @param rvector If non-null, receives {removed, basesRemoved, basesMapped, needRegen}
+ * @return Number of list entries set to null
+ * @throws RuntimeException If SAME_STRAND_PAIRS is set when a candidate pair is reached (not implemented)
+ */
+ private static int fixShort(ArrayList<Read> list, int[] rvector){
+
+ int removed=0;
+ int basesRemoved=0;
+ int basesMapped=0;
+ int needRegen=0; //Number of trimmed reads left valid but without a match string
+ for(int i=0; i<list.size(); i++){
+ final Read r=list.get(i);
+ final Read r2=(r==null ? null : r.mate);
+
+ if(r!=null){
+ if(r.mapped()){basesMapped+=r.length();} //Counted whether or not the pair is processed below
+ if(r2!=null && r2.mapped()){basesMapped+=r2.length();}
+ }
+
+ if(r!=null && r2!=null && r.mapped() && r.paired() && r2.mapped()){
+
+ final int expectedMinLengthOuter=r.length()+r2.length();
+
+ final int refLengthOuter;
+ final int refLengthInner;
+
+ if(SAME_STRAND_PAIRS){
+ throw new RuntimeException("TODO");
+ }else{
+ if(r.strand()==Gene.PLUS && r2.strand()==Gene.MINUS){
+ refLengthOuter=r2.stop-r.start+1;
+ refLengthInner=r2.start-r.stop-1;
+ }else if(r.strand()==Gene.MINUS && r2.strand()==Gene.PLUS){
+ refLengthOuter=r.stop-r2.start+1;
+ refLengthInner=r.start-r2.stop-1;
+ }else{
+ //Wrong strands - don't do anything
+ refLengthOuter=expectedMinLengthOuter;
+ refLengthInner=1;
+ }
+ }
+
+ //TODO: Merge the qualities and calls of the trimmed portions
+ if(false && refLengthOuter<expectedMinLengthOuter){ //Disabled by the constant false; this case falls through to the branch below
+ //Short read type 1: outer distance is too short
+
+ if(refLengthOuter<Tools.max(r.length()/2, 35)){
+ r.setInvalid(true);
+ r2.setInvalid(true);
+ basesRemoved+=(r.length()+r2.length());
+ }else{
+ int dif=expectedMinLengthOuter-refLengthOuter;
+ int rem1=dif/2;
+ int rem2=dif-rem1;
+ if(r2.length()-rem2<10){
+ r2.setInvalid(true);
+ r.setPaired(false);
+ r2.setPaired(false);
+ basesRemoved+=r2.length();
+ rem1=dif;
+ rem2=0;
+ }else if(r.length()-rem1<10){
+ r.setInvalid(true);
+ r.setPaired(false);
+ r2.setPaired(false);
+ basesRemoved+=r.length();
+ rem1=0;
+ rem2=dif;
+ }
+ if(rem1>0){
+ basesRemoved+=trimTailByXBases(r, rem1);
+ if(r.match==null && r.valid()){needRegen++;}
+ else{
+ assert(r.invalid() || TranslateColorspaceRead.verifyMatchString2(r, true));
+ }
+ }
+ if(rem2>0){
+ basesRemoved+=trimTailByXBases(r2, rem2);
+ if(r2.match==null && r2.valid()){needRegen++;}
+ else{
+ assert(r2.invalid() || TranslateColorspaceRead.verifyMatchString2(r2, true));
+ }
+ }
+ }
+ }else if(refLengthInner<=0 || refLengthOuter<expectedMinLengthOuter){
+// System.err.println(r.toText(false));
+ //Short read type 2: inner distance is negative
+
+ int cOverlap1=countCalledBasesOnOrAfterRefLoc(r, r.strand()==Gene.PLUS ? r2.start : r2.stop);
+ int cOverlap2=countCalledBasesOnOrAfterRefLoc(r2, r2.strand()==Gene.PLUS ? r.start : r.stop);
+
+ int overlap=Tools.max(cOverlap1, cOverlap2); //Use the larger of the two per-read estimates
+
+ if(overlap>0){
+ int toRemain=r.length()+r2.length()-overlap;
+
+ if(toRemain<Tools.max(r.length()/2, 34)){ //Too little would be left: discard both reads
+ r.setInvalid(true);
+ r2.setInvalid(true);
+ basesRemoved+=(r.length()+r2.length());
+// System.err.println("Removed read. refLengthOuter="+refLengthOuter+", refLengthInner="+
+// refLengthInner+", cOverlap1="+cOverlap1+", cOverlap2="+cOverlap2+", toRemain="+toRemain+
+// "\n"+new String(r.match)+"\n"+new String(r2.match)+"\n"+
+// r.start+"~"+r.stop+"("+Gene.strandCodes[r.strand()]+")\n"+
+// r2.start+"~"+r2.stop+"("+Gene.strandCodes[r2.strand()]+")\n");
+
+ }else{
+// System.err.println((cOverlap1==cOverlap2 ? "" : "****\t")+cOverlap1+", "+cOverlap2);
+ int rem1=overlap/2; //Split the overlap between the two reads; r2 takes the odd base
+ int rem2=overlap-rem1;
+ if(r2.length()-rem2<10){ //r2 would drop below 10 bases: invalidate it and trim the whole overlap from r
+ r2.setInvalid(true);
+ r.setPaired(false);
+ r2.setPaired(false);
+ basesRemoved+=r2.length();
+ rem1=overlap;
+ rem2=0;
+ }else if(r.length()-rem1<10){ //Same rule with the roles of r and r2 exchanged
+ r.setInvalid(true);
+ r.setPaired(false);
+ r2.setPaired(false);
+ basesRemoved+=r.length();
+ rem1=0;
+ rem2=overlap;
+ }
+ if(rem1>0){
+ basesRemoved+=trimTailByXBases(r, rem1);
+ if(r.match==null && r.valid()){needRegen++;}
+ else{
+ assert(r.invalid() || TranslateColorspaceRead.verifyMatchString2(r, true));
+ }
+ }
+ if(rem2>0){
+ basesRemoved+=trimTailByXBases(r2, rem2);
+ if(r2.match==null && r2.valid()){needRegen++;}
+ else{
+ assert(r2.invalid() || TranslateColorspaceRead.verifyMatchString2(r2, true));
+ }
+ }
+ }
+ }
+ }
+ //Drop the entry when neither read of the pair is still valid and mapped.
+ if((r.invalid() || !r.mapped()) && (r2==null || r2.invalid() || !r2.mapped())){
+ removed++;
+ list.set(i, null);
+ }
+ }
+ }
+
+ if(rvector!=null){
+ rvector[0]=removed;
+ rvector[1]=basesRemoved;
+ rvector[2]=basesMapped;
+ rvector[3]=needRegen;
+ }
+
+ return removed;
+ }
+
+
+ //TODO: Add support for deletions
+ /**
+ * Trims the low-quality or non-matching tail of a mapped read.
+ * <p>Walks the match string and quality array backwards from their ends. A position is "bad" when its
+ * quality is below minq or its match symbol is not 'm'; a 'D' symbol consumes no quality value and also
+ * marks the trim point. The walk stops once thresh consecutive good positions lie between the current
+ * position and the last bad one. Everything from the last bad position onward is cut from bases,
+ * quality and match, and start/stop are recomputed from the remaining match string.
+ * If the remaining match string contains 'X' or 'Y', the match string is set to null instead.
+ * @param r Read to trim; its match string must be non-null
+ * @param thresh Must see this many consecutive 'm' (with quality at least minq) to stop
+ * @param minq Lowest quality value treated as good
+ * @return Number of bases trimmed; 0 if nothing was bad; the full length if the read was marked invalid
+ * because fewer than 6 bases would remain
+ */
+ private static int trimTail(Read r, int thresh, byte minq){
+ byte[] bases=r.bases;
+ byte[] match=r.match;
+ byte[] quality=r.quality;
+
+ assert(match!=null);
+ if(r.strand()==Gene.MINUS){ //Remember to un-reverse later
+ Tools.reverseInPlace(match);
+ }
+
+ //lastBadLoc / lastBadMLoc: lowest quality / match index found bad so far (array length = none yet).
+ int lastBadLoc=quality.length;
+ int lastBadMLoc=match.length;
+ int qloc=quality.length-1;
+ int mloc=match.length-1;
+
+ for(; mloc>=0 && qloc>=0; mloc--){
+
+ assert(qloc<lastBadLoc) : "\n"+qloc+", "+lastBadLoc+", "+mloc+", "+lastBadMLoc+"\n"+r.toText(false)+"\n";
+ assert(mloc<lastBadMLoc) : "\n"+qloc+", "+lastBadLoc+", "+mloc+", "+lastBadMLoc+"\n"+r.toText(false)+"\n";
+ if(lastBadLoc-qloc>thresh){break;} //Enough consecutive good positions seen
+
+ byte m=match[mloc];
+ byte q=quality[qloc];
+
+ if(m=='D'){
+ //do nothing
+ lastBadLoc=qloc+1;
+ lastBadMLoc=mloc;
+ }else{
+ if(q<minq || m!='m'){
+ lastBadLoc=qloc;
+ lastBadMLoc=mloc;
+ }
+ qloc--;
+ }
+ }
+
+ if(lastBadLoc==quality.length){ //Nothing bad was found: restore match orientation and leave the read alone
+ if(r.strand()==Gene.MINUS){Tools.reverseInPlace(match);}
+ return 0;
+ }
+
+ if(lastBadLoc<6){ //Fewer than 6 bases would remain
+ if(r.strand()==Gene.MINUS){Tools.reverseInPlace(match);}
+ r.setInvalid(true);
+ return quality.length;
+ }
+
+// {
+// if(r.strand()==Gene.MINUS){ //Remember to un-reverse later
+// Tools.reverseInPlace(match);
+// }
+// System.err.println("\nBefore:\n"+r.toText(false));
+// if(r.strand()==Gene.MINUS){ //Remember to un-reverse later
+// Tools.reverseInPlace(match);
+// }
+// }
+
+ assert(lastBadLoc<quality.length);
+ assert(lastBadMLoc<match.length);
+
+ bases=Arrays.copyOf(bases, lastBadLoc);
+ quality=Arrays.copyOf(quality, lastBadLoc);
+ match=Arrays.copyOf(match, lastBadMLoc);
+
+ if(r.strand()==Gene.MINUS){Tools.reverseInPlace(match);} //Only the truncated copy is un-reversed; it replaces r.match below
+
+ boolean realign=false;
+ int lengthOfMatchString=0; //Counts the symbols m, N, s, S and D; used below as the reference span
+ for(byte m : match){
+ if(m=='m' || m=='N' || m=='s' || m=='S' || m=='D'){
+ lengthOfMatchString++;
+ }else if(m=='X' || m=='Y'){
+ realign=true;
+ }
+ }
+
+// assert(!realign) : r.toText(false);
+
+ if(realign){
+ System.err.println("Killed match string while trimming this read:\n"+r.toText(false));
+ r.match=null;
+ match=null;
+ }else{
+ if(r.strand()==Gene.PLUS){ //The untrimmed end keeps its coordinate; the trimmed end is recomputed
+ r.stop=r.start+lengthOfMatchString-1;
+ }else{
+ r.start=r.stop-lengthOfMatchString+1;
+ }
+ }
+
+ int trimmed=r.quality.length-quality.length;
+ r.quality=quality;
+ r.match=match;
+ r.bases=bases;
+
+ assert(trimmed>0);
+ assert(r.quality.length==r.length());
+ assert(r.match==null || r.match.length>=r.quality.length);
+
+// System.err.println("After:\n"+r.toText(false));
+
+ return trimmed;
+ }
+
+ /**
+ * Removes exactly x bases from the end of the read's bases and quality arrays and shortens the match
+ * string accordingly, then recomputes start/stop from the remaining match string.
+ * If the remaining match string contains 'X' or 'Y', the match string is set to null instead.
+ * @param r Read to trim; its match string must be non-null unless fewer than 6 bases would remain
+ * @param x Number of bases to remove
+ * @return Number of bases trimmed, or the full length if the read was marked invalid because fewer than
+ * 6 bases would remain
+ */
+ private static int trimTailByXBases(Read r, final int x){
+ byte[] bases=r.bases;
+ byte[] match=r.match;
+ byte[] quality=r.quality;
+
+ final int newLen=bases.length-x;
+
+ if(newLen<6){ //Checked before match is touched, so no un-reversal is needed on this path
+ r.setInvalid(true);
+ return quality.length;
+ }
+
+ assert(match!=null);
+ if(r.strand()==Gene.MINUS){ //Remember to un-reverse later
+ Tools.reverseInPlace(match);
+ }
+
+ int qloc=quality.length-1;
+ int mloc=match.length-1;
+ //Step back through match until x base-consuming symbols have been passed; 'D' consumes no base.
+ for(; mloc>=0 && qloc>=newLen; mloc--){
+
+ byte m=match[mloc];
+// byte q=quality[qloc];
+
+ if(m=='D'){
+ //do nothing
+ }else{
+ qloc--;
+ }
+ }
+
+ while(mloc>=0 && match[mloc]=='D'){mloc--;} //Do not let the kept match string end in deletions
+ assert(qloc==newLen-1);
+
+ bases=Arrays.copyOf(bases, newLen);
+ quality=Arrays.copyOf(quality, newLen);
+ match=Arrays.copyOf(match, mloc+1);
+
+ if(r.strand()==Gene.MINUS){Tools.reverseInPlace(match);} //Only the truncated copy is un-reversed; it replaces r.match below
+
+ boolean realign=false;
+ int lengthOfMatchString=0; //Counts the symbols m, N, s, S and D; used below as the reference span
+ for(byte m : match){
+ if(m=='m' || m=='N' || m=='s' || m=='S' || m=='D'){
+ lengthOfMatchString++;
+ }else if(m=='X' || m=='Y'){
+ realign=true;
+ }
+ }
+
+// assert(!realign) : r.toText(false);
+
+ if(realign){
+ System.err.println("Killed match string while trimming this read:\n"+r.toText(false));
+ r.match=null;
+ match=null;
+ }else{
+ if(r.strand()==Gene.PLUS){
+ r.stop=r.start+lengthOfMatchString-1;
+ }else{
+ r.start=r.stop-lengthOfMatchString+1;
+ }
+ }
+
+ int trimmed=r.quality.length-quality.length;
+ r.quality=quality;
+ r.match=match;
+ r.bases=bases;
+
+ assert(trimmed>0);
+ assert(r.quality.length==r.length());
+ assert(r.match==null || r.match.length>=r.quality.length);
+
+// System.err.println("After:\n"+r.toText(false));
+
+ return trimmed;
+ }
+
+ /**
+ * Estimates how many of the read's called bases lie beyond a reference position, by walking the match string.
+ * <p>Plus strand: walks forward from r.start until the reference position reaches rlimit and returns the
+ * number of called bases not yet consumed (those at or after rlimit).
+ * Minus strand: walks backward from r.stop until the reference position drops to rlimit and returns the
+ * number of called bases at array index cloc or lower (those at or before rlimit on the reference).
+ * In both cases 'D' advances only the reference position, 'X', 'Y' and 'I' advance only the base index,
+ * and every other symbol advances both.
+ * If the read already starts past rlimit (plus) or ends before it (minus), the result is the read length
+ * plus the distance to rlimit. If the walk ends without reaching rlimit, the result is 0.
+ * @param r Mapped read; r.match is dereferenced without a null check, and r.mate is used only in assertion messages
+ * @param rlimit Reference coordinate to measure against
+ * @return The count described above
+ */
+ private static int countCalledBasesOnOrAfterRefLoc(Read r, final int rlimit){
+ final int clen=r.length();
+ byte[] match=r.match;
+
+ if(r.strand()==Gene.PLUS){
+
+// final int rlimit=rlimit_0-1;
+
+ int cloc=0; //Index of the next called base
+ int mloc=0; //Index into the match string
+ int rloc=r.start; //Current reference coordinate
+ for(; mloc<match.length && rloc<rlimit; mloc++){
+ byte m=match[mloc];
+
+ if(m=='D'){
+ rloc++;
+ }else if(m=='X' || m=='Y' || m=='I'){
+ cloc++;
+ }else{
+ cloc++;
+ rloc++;
+ }
+ }
+
+ if(rloc>=rlimit){
+
+ if(rloc>rlimit){ //The loop advances rloc by at most 1 per step, so this means r.start was already past rlimit
+ return clen+(rloc-rlimit);
+ }
+
+ int ret=clen-cloc;
+ assert(rloc==rlimit) : "ret="+ret+", clen="+clen+", cloc="+cloc+",\n"+
+ "rloc="+rloc+", rlimit="+rlimit+", mloc="+mloc+", mlen="+match.length+",\n"+
+ "r.start="+r.start+", r.stop="+r.stop+", r2.start="+r.mate.start+", r2.stop="+r.mate.stop+"\n\n"+r.toText(false)+"\n\n";
+ assert(ret>=0 && ret<=clen) : "ret="+ret+", clen="+clen+", cloc="+cloc+",\n"+
+ "rloc="+rloc+", rlimit="+rlimit+", mloc="+mloc+", mlen="+match.length+",\n"+
+ "r.start="+r.start+", r.stop="+r.stop+", r2.start="+r.mate.start+", r2.stop="+r.mate.stop;
+ return ret;
+ }else{
+ assert(cloc==clen) : clen+", "+cloc+"\n"+r.toText(false)+"\n"; //Maybe cloc==clen
+ return 0;
+ }
+ }else{
+
+// final int rlimit=rlimit_0+1;
+
+ int cloc=clen-1; //Index of the next called base, moving toward 0
+ int mloc=match.length-1;
+ int rloc=r.stop; //Current reference coordinate, moving downward
+ for(; mloc>=0 && rloc>rlimit; mloc--){
+ byte m=match[mloc];
+
+ if(m=='D'){
+ rloc--;
+ }else if(m=='X' || m=='Y' || m=='I'){
+ cloc--;
+ }else{
+ cloc--;
+ rloc--;
+ }
+ }
+
+ if(rloc<=rlimit){
+ if(rloc<rlimit){ //r.stop was already below rlimit
+ return clen+(rlimit-rloc);
+ }
+
+ assert(rloc==rlimit);
+ int ret=cloc+1;
+ assert(ret>=0 && ret<=clen) : "ret="+ret+", clen="+clen+", cloc="+cloc+",\n"+
+ "rloc="+rloc+", rlimit="+rlimit+", mloc="+mloc+", mlen="+match.length+",\n"+
+ "r.start="+r.start+", r.stop="+r.stop+", r2.start="+r.mate.start+", r2.stop="+r.mate.stop;
+ return ret;
+ }else{
+ assert(cloc==-1) : clen+", "+cloc; //Maybe cloc==-1
+ return 0;
+ }
+ }
+ }
+
+
+ private void findAndMergeDuplicates(ArrayList<Read> list, boolean mergeDifferentLengthReads){
+ if(list==null || list.size()<2){return;}
+ Read current=list.get(0);
+
+ for(int i=1; i<list.size(); i++){
+ final Read r=list.get(i);
+ final Read r2=r.mate;
+ boolean merge=false;
+
+ assert(paired==(current.mate!=null));
+
+ final boolean dupeLoose=current.isDuplicateByMapping(r, false, false);
+ final boolean mdupeLoose=(current.mate==null ? false : current.mate.isDuplicateByMapping(r.mate, false, false));
+ final boolean lengthOK=(mergeDifferentLengthReads ||
+ (r.length()==current.length() && (!paired || r2.length()==current.mateLength())));
+
+ if(lengthOK && (dupeLoose || mdupeLoose)){
+ if(paired){
+
+ if(r.length()==current.length()){
+ //Normal case
+
+ if(dupeLoose && mdupeLoose){
+
+ boolean dupeStrict=current.isDuplicateByMapping(r, true, true);
+ boolean mdupeStrict=current.mate.isDuplicateByMapping(r2, true, true);
+ if(dupeStrict && mdupeStrict){
+ if(current.perfect() && current.mate.perfect() && r.perfect() && r2.perfect()){
+ // assert(!contains(current.match, 'N'));
+ // assert(!contains(r.match, 'N'));
+ current.merge(r, true, true);
+ // assert(!contains(current.match, 'N'));
+ }else{
+
+ boolean N1=contains(current.bases, 'N');
+ boolean N2=contains(current.mate.bases, 'N');
+ boolean N3=contains(r.bases, 'N');
+ boolean N4=contains(r2.bases, 'N');
+
+ current.merge(r, false, true);
+
+ if(!N1 || !N3){
+ assert(!contains(current.bases, 'N')) : N1+", "+N2+", "+N3+", "+N4+"\n"+
+ current.toText(false)+"\n"+r.toText(false)+"\n";
+ }
+ if(!N2 || !N4){
+ assert(!contains(current.mate.bases, 'N')) : N1+", "+N2+", "+N3+", "+N4+"\n"+
+ current.mate.toText(false)+"\n"+r2.toText(false)+"\n";
+ }
+
+ }
+ }else{
+ current.merge(r, false, false);
+ }
+ merge=true;
+ }else if(current.paired() && r.paired()){
+ //do nothing - not duplicates
+ }else if(current.paired() && !r.paired()){
+ // if(dupe){
+ // r2.clearAnswers();
+ // current.merge(r, false, false);
+ // merge=true;
+ // }else{
+ // assert(mdupe);
+ // r.clearAnswers();
+ // current.merge(r, false, false);
+ // merge=true;
+ // }
+ }else if(r.paired() && !current.paired()){ //This should not happen...
+ // if(dupe){
+ // current.mate.clearAnswers();
+ // r.merge(current, false, false);
+ // merge=false;
+ // merged++;
+ // list.set(cindex, null);
+ // current=r;
+ // cindex=i;
+ // }else{
+ // assert(mdupe);
+ // current.clearAnswers();
+ // r.merge(current, false, false);
+ // merge=false;
+ // merged++;
+ // list.set(cindex, null);
+ // current=r;
+ // cindex=i;
+ // }
+ }else{//Neither is paired
+ if(dupeLoose && !current.mate.mapped() && !r2.mapped()){
+ boolean dupeStrict=current.isDuplicateByMapping(r, true, true);
+ current.merge(r, false, dupeStrict);
+ merge=true;
+ }else if(mdupeLoose && !current.mapped() && !r.mapped()){
+ boolean mdupeStrict=current.mate.isDuplicateByMapping(r2, true, true);
+ current.merge(r, false, mdupeStrict);
+ merge=true;
+ }
+ }
+
+ assert(r2==null || r.numericID==r2.numericID);
+ assert(current.mate==null || current.numericID==current.mate.numericID);
+ assert(r.mate==null || r.mate.mate==r);
+ assert(current.mate==null || current.mate.mate==current);
+
+ }else{
+ //Asymmetric paired reads, such as 50-35 from Solid, where one read got swapped
+
+ if(dupeLoose && mdupeLoose){
+
+ boolean dupeStrict=current.isDuplicateByMapping(r, false, true);
+ boolean mdupeStrict=current.mate.isDuplicateByMapping(r2, false, true);
+ if(dupeStrict && mdupeStrict){
+ if(current.perfect() && current.mate.perfect() && r.perfect() && r2.perfect()){
+ // assert(!contains(current.match, 'N'));
+ // assert(!contains(r.match, 'N'));
+ current.merge(r, true, true);
+ // assert(!contains(current.match, 'N'));
+ }else{
+
+// boolean N1=contains(current.bases, 'N');
+// boolean N2=contains(current.mate.bases, 'N');
+// boolean N3=contains(r.bases, 'N');
+// boolean N4=contains(r2.bases, 'N');
+
+ current.merge(r, false, true);
+
+// if(!N1 || !N3){
+// assert(!contains(current.bases, 'N')) : N1+", "+N2+", "+N3+", "+N4+"\n"+
+// current.toText(false)+"\n"+r.toText(false)+"\n";
+// }
+// if(!N2 || !N4){
+// assert(!contains(current.mate.bases, 'N')) : N1+", "+N2+", "+N3+", "+N4+"\n"+
+// current.mate.toText(false)+"\n"+r2.toText(false)+"\n";
+// }
+
+ }
+ }else{
+ //Do nothing, but delete second copy
+// current.merge(r, false, false);
+ }
+ merge=true;
+ }else if(current.paired() && r.paired()){
+ //do nothing - not duplicates
+ }else if(current.paired() && !r.paired()){
+ // if(dupe){
+ // r2.clearAnswers();
+ // current.merge(r, false, false);
+ // merge=true;
+ // }else{
+ // assert(mdupe);
+ // r.clearAnswers();
+ // current.merge(r, false, false);
+ // merge=true;
+ // }
+ }else if(r.paired() && !current.paired()){ //This should not happen...
+ // if(dupe){
+ // current.mate.clearAnswers();
+ // r.merge(current, false, false);
+ // merge=false;
+ // merged++;
+ // list.set(cindex, null);
+ // current=r;
+ // cindex=i;
+ // }else{
+ // assert(mdupe);
+ // current.clearAnswers();
+ // r.merge(current, false, false);
+ // merge=false;
+ // merged++;
+ // list.set(cindex, null);
+ // current=r;
+ // cindex=i;
+ // }
+ }else{//Neither is paired
+
+ //In this case just remove the lower-quality read (generally the second one).
+ //Should be very, very rare.
+
+ merge=true; //Pretend to merge but really just delete the second copy.
+
+// if(dupeLoose && !current.mate.mapped() && !r2.mapped()){
+// boolean dupeStrict=current.isDuplicateByMapping(r, true, true);
+// current.merge(r, false, dupeStrict);
+// merge=true;
+// }else if(mdupeLoose && !current.mapped() && !r.mapped()){
+// boolean mdupeStrict=current.mate.isDuplicateByMapping(r2, true, true);
+// current.merge(r, false, mdupeStrict);
+// merge=true;
+// }
+ }
+ }
+
+ assert(r2==null || r.numericID==r2.numericID);
+ assert(current.mate==null || current.numericID==current.mate.numericID);
+ assert(r.mate==null || r.mate.mate==r);
+ assert(current.mate==null || current.mate.mate==current);
+
+ }else{
+ //Single-ended
+
+ assert(dupeLoose);
+ boolean dupeStrict=current.isDuplicateByMapping(r, true, true);
+ if(current.perfect() && r.perfect()){
+ current.merge(r, true, true);
+ }else{
+ current.merge(r, false, dupeStrict);
+ }
+ merge=true;
+ }
+ }
+
+ if(merge){
+ merged++;
+ list.set(i, null);
+ }else{
+ current=r;
+ }
+ }
+
+ if(REGENERATE_MATCH_STRING){regenMatchStrings(list);}
+ }
+
+
+	/**
+	 * Collapses runs of consecutive strict duplicates in the list.
+	 * Each read is pulled out of the list (its slot is nulled) and collected into the current
+	 * group while it is a strict mapping duplicate of the group's first read (and, for paired
+	 * data, while the mates are strict duplicates as well). When a read breaks the run, the
+	 * group is merged via mergeReads and the result is written back at the next free slot.
+	 * Trailing null slots are trimmed at the end.
+	 * @param list Reads to process; modified in place. Lists with fewer than 2 reads are left untouched.
+	 */
+	private void findAndMergeDuplicatesStrict(ArrayList<Read> list){
+		if(list==null || list.size()<2){return;}
+
+		int writePos=0;
+
+		final ArrayList<Read> group=new ArrayList<Read>();
+		final ArrayList<Read> mateGroup=new ArrayList<Read>();
+
+		for(int readPos=0; readPos<list.size(); readPos++){
+			final Read r=list.set(readPos, null);
+
+			if(!group.isEmpty()){
+				final Read current=group.get(0);
+				assert(paired==(current.mate!=null));
+
+				final boolean dupeStrict=current.isDuplicateByMapping(r, true, true);
+
+				//Decide whether r still belongs to the group being collected.
+				final boolean sameGroup;
+				if(paired){
+					assert(group.size()==mateGroup.size());
+					final boolean mdupeStrict=current.mate.isDuplicateByMapping(r.mate, true, true);
+					sameGroup=(dupeStrict && mdupeStrict);
+				}else{
+					sameGroup=dupeStrict;
+				}
+
+				if(!sameGroup){
+					//The run ended: emit the (possibly merged) representative and start over.
+					Read x=group.get(0);
+					if(group.size()>1){
+						merged2+=group.size()-1;
+						x=mergeReads(group, true);
+						if(paired){
+							Read y=mergeReads(mateGroup, true);
+							assert(x.mate==y);
+							assert(y.mate==x);
+							assert(x!=y);
+						}
+					}
+					assert(list.get(writePos)==null);
+					list.set(writePos, x);
+					writePos++;
+					group.clear();
+					if(paired){mateGroup.clear();}
+				}
+			}
+
+			group.add(r);
+			if(paired){mateGroup.add(r.mate);}
+		}
+
+		//Emit whatever group is still pending after the last read.
+		if(!group.isEmpty()){
+			Read x=group.get(0);
+			if(group.size()>1){
+				merged2+=group.size()-1;
+				x=mergeReads(group, true);
+				if(paired){
+					Read y=mergeReads(mateGroup, true);
+					assert(x.mate==y);
+					assert(y.mate==x);
+					assert(x!=y);
+				}
+			}
+			assert(list.get(writePos)==null);
+			list.set(writePos, x);
+			writePos++;
+		}
+
+		//Drop the null tail left behind by the compaction.
+		while(!list.isEmpty() && list.get(list.size()-1)==null){
+			list.remove(list.size()-1);
+		}
+
+		if(REGENERATE_MATCH_STRING){regenMatchStrings(list);}
+	}
+
+
+	/**
+	 * Nulls out unpaired reads that duplicate the most recently seen paired read.
+	 * Walks the list in order, remembering the last read for which paired() is true; each
+	 * following non-paired read is compared against it by mapping on one side and by bases on
+	 * the other, and removed (slot set to null, removedSingletonDupe incremented) on a match.
+	 * @param list Reads to scan; modified in place. Lists with fewer than 2 reads are left untouched.
+	 */
+	private void findAndRemoveSingletonDuplicatesOfPairs(ArrayList<Read> list){
+		if(list==null || list.size()<2){return;}
+		assert(paired);
+
+		Read lastPair=null;
+		for(int idx=0; idx<list.size(); idx++){
+			final Read r=list.get(idx);
+			if(r.paired()){
+				lastPair=r;
+				continue;
+			}
+			if(lastPair==null){continue;}
+
+			assert(r.length()==lastPair.length()) : "Merging different-length reads is supported but seems to be not useful.";
+
+			final boolean remove;
+			if(r.mapped() && !r.mate.mapped()){
+				//Only read 1 mapped: compare read 1 by mapping and the mates by bases.
+				final boolean sameLength=(lastPair.length()==r.length());
+				remove=lastPair.isDuplicateByMapping(r, sameLength, true) &&
+						lastPair.mate.isDuplicateByBases(r.mate, r.mateLength()/5, r.mateLength()/5, (byte)14, false, true);
+			}else{
+				//Otherwise compare the mates by mapping and read 1 by bases.
+				final boolean sameLength=(lastPair.mateLength()==r.mateLength());
+				remove=lastPair.mate.isDuplicateByMapping(r.mate, sameLength, true) &&
+						(!sameLength || lastPair.isDuplicateByBases(r, r.length()/5, r.length()/5, (byte)14, false, true));
+			}
+
+			if(remove){
+				list.set(idx, null);
+				removedSingletonDupe++;
+			}
+		}
+
+		if(REGENERATE_MATCH_STRING){regenMatchStrings(list);}
+	}
+
+
+ /** Assumes all reads in the list are duplicates.
+ * Only merges reads in list, not their mates.
+ * Mutates the first input read (copies, bases, quality, and possibly match) and returns it.
+ * The list itself may also be modified: when the first read is perfect, imperfect reads
+ * are nulled out and the list is condensed.
+ * @param list Reads to merge. Null or empty yields null; a single read is returned as-is.
+ * @param retainPerfect If true and the first read is perfect, a disagreeing consensus base never
+ * replaces a called base of the first read; only its quality is lowered.
+ * @return The first read of the list, updated in place. */
+ public static Read mergeReads(ArrayList<Read> list, boolean retainPerfect){
+ if(list==null || list.isEmpty()){return null;}
+ if(list.size()==1){return list.get(0);}
+
+ //This block prevents the destruction of perfect reads.
+ //The copy counts of all other reads are folded into the first read, including those that are then dropped.
+ {
+ Read a=list.get(0);
+ for(int i=1; i<list.size(); i++){
+ Read b=list.get(i);
+ a.copies+=b.copies;
+ if(a.perfect() && !b.perfect()){
+ list.set(i, null);
+ }
+ }
+ //Presumably removes the null entries created above - TODO confirm against Tools.condense.
+ Tools.condense(list);
+ if(list.size()==1){return list.get(0);}
+ }
+
+ int len=0;
+ for(Read r : list){len=Tools.max(len, r.length());}
+
+ //The first read is used as the template below, so it must be the longest.
+ assert(len==list.get(0).length());
+
+ //Per-position statistics, indexed as [base code 0-3][position].
+ int[][] count=new int[4][len];
+ int[][] qual=new int[4][len];
+ byte[][] maxQual=new byte[4][len];
+
+ for(Read r : list){
+ for(int i=0; i<r.length(); i++){
+ byte b=r.bases[i];
+ //Bases are expected as numeric codes here, not ASCII letters; only codes 0-3 are tallied.
+ assert((b!='A' && b!='C' && b!='G' && b!='T'));
+ byte q=r.quality[i];
+ if(b>=0 && b<=3){
+ count[b][i]+=r.copies;
+ qual[b][i]+=q;
+ maxQual[b][i]=Tools.max(q, maxQual[b][i]);
+ }
+ }
+ }
+
+ //Scratch arrays holding the statistics of a single position.
+ int[] carray=new int[4];
+ int[] qarray=new int[4];
+ byte[] marray=new byte[4];
+
+
+ //Consensus bases and qualities.
+ byte[] bases=new byte[len];
+ byte[] quality=new byte[len];
+
+ for(int i=0; i<len; i++){
+ for(int j=0; j<4; j++){
+ carray[j]=count[j][i];
+ qarray[j]=qual[j][i];
+ marray[j]=maxQual[j][i];
+
+ //Average of the summed quality and the maximum quality for this base.
+ qarray[j]=(qarray[j]+marray[j])/2;
+ }
+ byte best=findBest(carray, qarray, marray);
+ if(best<0){
+ //No base had a positive score at this position.
+ bases[i]='N';
+ quality[i]=0;
+ }else{
+ bases[i]=AminoAcid.numberToBase[best];
+ //Quality of the winner minus the qualities of the other three bases, clamped to [1, 48].
+ int q=2*qarray[best];
+ for(int j=0; j<4; j++){q-=qarray[j];}
+ q=Tools.min(48, Tools.max(q, 1));
+ quality[i]=(byte)q;
+ }
+ }
+
+ final Read r=list.get(0);
+
+// if(r.match!=null){
+// for(int i=0; i<len; i++){
+// if(bases[i]!=r.bases[i]){
+// r.match=null;
+// break;
+// }
+// }
+// }
+
+ //Uses the primary read as a template and merging the new data in.
+ //That way, if the new data differs from the primary (best) read, but the new quality score is very low,
+ //the old data can be retained (with quality reduced).
+ //This may or may not be a good idea.
+
+ //Set when any base of the template is replaced, which invalidates its match string.
+ boolean killMatch=false;
+ final boolean retain=retainPerfect && r.perfect();
+ for(int i=0; i<bases.length; i++){
+ byte b1=r.bases[i];
+ byte b2=bases[i];
+ byte q1=r.quality[i];
+ byte q2=quality[i];
+
+ assert(q1>=0);
+ assert(q2>=0);
+
+ if(b1==b2){
+ //Agreement: adopt the consensus quality.
+ r.quality[i]=q2;
+ if(b1=='N'){r.quality[i]=0;}
+ }else{
+ if(b2=='N'){
+ //No consensus: keep the template base but cap its quality at 2.
+ r.quality[i]=Tools.min(r.quality[i], (byte)2);
+ }else if(b1=='N'){
+ //Template has no call: take the consensus, even when retain is set.
+ r.bases[i]=b2;
+ r.quality[i]=q2;
+ killMatch=true;
+ }else{
+ if(retain){
+ //Keep the template base; lower its quality by the consensus quality (minimum 2).
+ r.quality[i]=Tools.max((byte)2, (byte)(q1-q2));
+ }else if(q2-q1>10){
+ r.bases[i]=b2;
+ r.quality[i]=q2;
+ killMatch=true;
+ }else if(q1<15 && q2>20){
+ r.bases[i]=b2;
+ r.quality[i]=q2;
+ killMatch=true;
+ }else{
+ r.quality[i]=Tools.max((byte)2, (byte)(q1-q2));
+ }
+ }
+ }
+ }
+
+ if(killMatch){r.match=null;}
+
+ return r;
+ }
+
+	/**
+	 * Picks the index with the highest score, where score = count*8 + qual*2 + maxqual.
+	 * Only strictly positive scores are eligible. Ties are broken first by higher qual,
+	 * then by higher count; otherwise the earlier index wins.
+	 * @param count Per-candidate observation counts.
+	 * @param qual Per-candidate quality values.
+	 * @param maxqual Per-candidate maximum quality values.
+	 * @return Index of the winning candidate, or -1 if no candidate has a positive score.
+	 */
+	private static final byte findBest(int[] count, int[] qual, byte[] maxqual){
+		byte winner=-1;
+		int topScore=-1;
+		for(byte idx=0; idx<count.length; idx++){
+			final int score=count[idx]*8+qual[idx]*2+maxqual[idx];
+			if(score<=0){continue;}
+
+			final boolean takeOver;
+			if(score>topScore){
+				takeOver=true;
+			}else if(score<topScore){
+				takeOver=false;
+			}else{
+				//Equal scores; topScore is positive here so winner is a valid index.
+				takeOver=(qual[idx]>qual[winner]) || (qual[idx]==qual[winner] && count[idx]>count[winner]);
+			}
+
+			if(takeOver){
+				winner=idx;
+				topScore=score;
+			}
+		}
+		return winner;
+	}
+
+
+	/**
+	 * Hands every read that is mapped but has no match string (or whose mate is) to a pool of
+	 * RegenMatchThreads via REGEN_PIPE, in batches of up to 100 reads, then sends one empty
+	 * list per thread as a termination signal and waits for all threads to exit.
+	 * @param list Reads to inspect; null entries are skipped.
+	 */
+	private void regenMatchStrings(ArrayList<Read> list){
+		if(list==null || list.isEmpty()){return;}
+
+		int needed=0;
+		for(Read r : list){
+			if(lacksMatchString(r)){needed++;}
+		}
+		if(needed<1){return;}
+
+		final int lim=100;
+
+		final int threadCount=Tools.max(1, Tools.min(REGEN_THREADS, needed/lim));
+		final RegenMatchThread[] rmt=new RegenMatchThread[threadCount];
+		for(int i=0; i<rmt.length; i++){
+			rmt[i]=new RegenMatchThread();
+			rmt[i].start();
+		}
+
+		//Feed the workers in batches of lim reads.
+		ArrayList<Read> batch=new ArrayList<Read>(lim);
+		for(Read r : list){
+			if(lacksMatchString(r)){
+				batch.add(r);
+				if(batch.size()>=lim){
+					sendToRegenPipe(batch);
+					batch=new ArrayList<Read>(lim);
+				}
+			}
+		}
+		if(batch.size()>0){sendToRegenPipe(batch);}
+
+		//Poison: one empty list per worker.
+		for(int i=0; i<rmt.length; i++){
+			sendToRegenPipe(new ArrayList<Read>(0));
+		}
+
+		for(int i=0; i<rmt.length; i++){
+			while(rmt[i].isAlive()){
+				try {
+					rmt[i].join();
+				} catch (InterruptedException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+	}
+
+	/** True if r is non-null and either r, or else its mate, is mapped without a match string. */
+	private static boolean lacksMatchString(Read r){
+		if(r==null){return false;}
+		if(r.mapped() && r.match==null){return true;}
+		return r.mate!=null && r.mate.mapped() && r.mate.match==null;
+	}
+
+	/** Puts a batch on REGEN_PIPE, retrying (after printing the stack trace) whenever the put is interrupted. */
+	private void sendToRegenPipe(ArrayList<Read> batch){
+		boolean sent=false;
+		while(!sent){
+			try {
+				REGEN_PIPE.put(batch);
+				sent=true;
+			} catch (InterruptedException e) {
+				e.printStackTrace();
+			}
+		}
+	}
+
+
+	/**
+	 * Linear search for a character in a byte array.
+	 * @param array Bytes to search; may be null.
+	 * @param c Character to look for.
+	 * @return true if some element equals c; false otherwise or if array is null.
+	 */
+	private static boolean contains(byte[] array, char c){
+		if(array!=null){
+			for(int i=0; i<array.length; i++){
+				if(array[i]==c){return true;}
+			}
+		}
+		return false;
+	}
+
+ private String makeKey(Read r){
+ String key;
+
+ if(!r.mapped() && (r.mate==null || !r.mate.mapped())){
+ key="Z";
+ }else{
+ if(r.mapped()){key=makeKey2(r, r.strand()).insert(0, 'C').toString();}
+ else{
+ byte strand=r.mate.strand();
+ if(!SAME_STRAND_PAIRS){
+ strand=(byte)(1^strand);
+ }
+ key=makeKey2(r.mate, strand).insert(0, MOVE_SINGLETONS_TO_END ? 'D' : 'C').toString();
+ }
+ }
+
+ return key;
+ }
+
+
+ private StringBuilder makeKey2(Read r, byte strand){
+ assert(r.mapped());
+
+ StringBuilder sb=new StringBuilder(32);
+
+ int num=Tools.max(r.start/blocksize, 0);
+ if(r.chrom<=9){sb.append('0');}
+ sb.append(r.chrom);
+ sb.append("_").append(strand).append("_");
+ for(int i=6-(""+num).length(); i>0; i--){
+ sb.append('0');
+ }
+ sb.append(num);
+ return sb;
+ }
+
+
+ private void addRead(Read r){
+ Read r2=r.mate;
+ assert(r2==null || r.numericID==r2.numericID);
+ boolean swap=false;
+ if(SWAP_READ1_TO_PLUS && r2!=null){
+
+ if(r.paired() && r.mapped() && r.valid() && r2.mapped() && r2.valid()){ //Ideal pair
+ if(r.strand()==Gene.MINUS && r2.strand()==Gene.PLUS){swap=true;}
+ }else if(r.mapped() && r.valid() && r2.mapped() && r2.valid()){
+ if(r.strand()==Gene.MINUS && r2.strand()==Gene.PLUS){swap=true;}
+ }else if(r.mapped() && r.valid()){
+ if(r.strand()==Gene.MINUS){swap=true;}
+ }else if(r2.mapped() && r2.valid()){
+ if(r2.strand()==Gene.PLUS){swap=true;}
+ }
+ }
+
+ if(swap){
+ r.setSwapped(true);
+ r2.setSwapped(true);
+ Read temp=r;
+ r=r2;
+ r2=temp;
+ }
+ assert(r2==null || (r.numericID==r2.numericID && r!=r2));
+
+ String key=makeKey(r);
+
+// String key=sb.toString();
+ Block b=table.get(key);
+ if(b==null){
+ //System.err.println("Created block "+key);
+ b=new Block(key, outname, r.chrom);
+ table.put(key, b);
+ }
+ b.add(r);
+ }
+
+
+	/** Calls finishWritingBuffer() on every Block in the table and asserts that all were visited. */
+	public void finishWritingBlocks(){
+		System.err.println("Called finishWritingBlocks()");
+		int numWritten=0;
+		for(Block b : table.values()){
+			b.finishWritingBuffer();
+			numWritten++;
+		}
+		assert(numWritten==table.size()) : "Only wrote "+numWritten+" of "+table.size();
+	}
+
+
+	/**
+	 * One temp-file bucket of reads sharing a key.
+	 * Reads are buffered in memory and handed to the enclosing instance's block writers in
+	 * chunks of WRITE_BUFFER; the temp files can later be read back whole and deleted.
+	 */
+	private class Block{
+
+		/**
+		 * @param name_ Key of this block; embedded in the temp file names.
+		 * @param fname_ Output name pattern; its first '#' is replaced to form the temp file names.
+		 * @param chrom_ Chromosome number stored with the block.
+		 */
+		public Block(String name_, String fname_, int chrom_){
+
+			String pattern=fname_;
+			if(DONT_COMPRESS_TEMP_FILES){
+				final boolean compressed=pattern.endsWith(".gz") || pattern.endsWith(".zip") || pattern.endsWith(".bz2");
+				if(compressed){
+					pattern=pattern.substring(0, pattern.lastIndexOf('.'));
+				}
+			}
+
+			name=name_;
+			fname1=pattern.replaceFirst("#", "_msort_tempBlock_"+name+"_1");
+			fname2=(paired ? pattern.replaceFirst("#", "_msort_tempBlock_"+name+"_2") : null);
+			chrom=chrom_;
+
+			if(fname1!=null){
+				outStream1=ReadWrite.getOutputStream(fname1, append, true, false);
+				writer1=new PrintWriter(outStream1);
+			}else{
+				assert(false);
+				outStream1=null;
+				writer1=null;
+			}
+
+			if(fname2!=null){
+				outStream2=ReadWrite.getOutputStream(fname2, append, true, false);
+				writer2=new PrintWriter(outStream2);
+			}else{
+				outStream2=null;
+				writer2=null;
+			}
+		}
+
+		/** Buffers a read, flushing the buffer to the block writers once it holds WRITE_BUFFER reads. */
+		public void add(Read r){
+			buffer.add(r);
+			added++;
+			if(buffer.size()>=WRITE_BUFFER){
+				writeBuffer(false);
+			}
+		}
+
+		/**
+		 * Hands the buffered reads to the block writers.
+		 * @param close If true, the buffer is discarded afterwards (set to null) and the list is
+		 * passed on even when empty; if false, a fresh buffer is installed and empty lists are not passed on.
+		 */
+		public void writeBuffer(boolean close){
+
+			written+=buffer.size();
+			final ArrayList<Read> pending=buffer;
+			buffer=(close ? null : new ArrayList<Read>(WRITE_BUFFER));
+
+			final boolean send=close || (pending!=null && !pending.isEmpty());
+			if(send){
+				if(blockwriter1!=null){blockwriter1.addList(pending, writer1, outStream1, close);}
+				if(blockwriter2!=null){blockwriter2.addList(pending, writer2, outStream2, close);}
+			}
+
+			assert(added==written);
+		}
+
+		/** Flushes the remaining buffered reads with the close flag set. */
+		public void finishWritingBuffer(){
+			writeBuffer(true);
+		}
+
+		/** Reads the temp file(s) of this block back into a single list, updating numRead. */
+		public synchronized ArrayList<Read> readBlock(){
+			RTextInputStream in=new RTextInputStream(fname1, fname2, -1);
+			final ArrayList<Read> out=new ArrayList<Read>((int)written);
+			for(ArrayList<Read> chunk=in.nextList(); chunk!=null && chunk.size()>0; chunk=in.nextList()){
+				out.addAll(chunk);
+				numRead+=chunk.size();
+			}
+			in.close();
+			in=null;
+			assert(numRead==written);
+
+			return out;
+		}
+
+		/** Deletes the temp file(s) of this block. */
+		public synchronized void delete() {
+			if(fname1!=null){new File(fname1).delete();}
+			if(fname2!=null){new File(fname2).delete();}
+		}
+
+		public final String name;
+		public final String fname1, fname2;
+		public final int chrom; //Necessary for unloading data
+
+		public final OutputStream outStream1, outStream2;
+		public final PrintWriter writer1, writer2;
+		private ArrayList<Read> buffer=new ArrayList<Read>(WRITE_BUFFER);
+
+		public long added=0, written=0, numRead=0;
+	}
+
+	/**
+	 * Worker that takes lists of reads from REGEN_PIPE and regenerates the match string of
+	 * every mapped, valid read (and mate) that lacks one. An empty list terminates the worker.
+	 * NOTE(review): tcr is currently null and regenMatchString contains an assert(false) TODO,
+	 * so this path does not look usable as-is - confirm before enabling REGENERATE_MATCH_STRING.
+	 */
+	private class RegenMatchThread extends Thread{
+
+		@Override
+		public void run(){
+			for(ArrayList<Read> list=take(); !list.isEmpty(); list=take()){
+				for(Read r : list){
+					if(r!=null){
+						final Read r2=r.mate;
+						if(r.mapped() && r.match==null && r.valid()){regenMatchString(r);}
+						//The mate is gated on its own validity, not on that of read 1.
+						if(r2!=null && r2.mapped() && r2.match==null && r2.valid()){regenMatchString(r2);}
+					}
+				}
+			}
+		}
+
+		/** Realigns the read's top site to rebuild its match string, then refreshes the perfect flag. */
+		private void regenMatchString(Read r){
+			assert(r.match==null);
+
+			TranslateColorspaceRead.realign_new(r.topSite(), r.bases, tcr.msaBS, 4, 1, 0, false, true, r.numericID);
+			assert(false) : "TODO: move ss locs back to read.";
+
+			r.setPerfectFlag(Integer.MAX_VALUE);
+			assert(!r.perfect() || r.stop-r.start==(r.length()-1)) :
+				"\n"+r.toText(false)+"\n"+new String(r.bases)+"\n"+new String(AminoAcid.reverseComplementBases(r.bases))+
+				"\n"+Data.getChromosome(r.chrom).getString(r.topSite().start, r.topSite().stop)+"\n";
+
+			if(r.match!=null){
+				assert(TranslateColorspaceRead.verifyMatchString2(r, true)) : r.toText(false);
+			}
+		}
+
+		/** Blocks until a list is available on REGEN_PIPE, retrying (after printing the stack trace) when interrupted. */
+		private ArrayList<Read> take(){
+			ArrayList<Read> list=null;
+			while(list==null){
+				try {
+					list=REGEN_PIPE.take();
+				} catch (InterruptedException e) {
+					e.printStackTrace();
+				}
+			}
+			return list;
+		}
+
+		private final TranslateColorspaceRead tcr=null; /*new TranslateColorspaceRead(2000, 3000);*/ //Specific type needs to be specified.
+	}
+
+ public final String outname;
+ private final ConcurrentReadInputStream cris;
+ private final ArrayBlockingQueue<ArrayList<Read>> REGEN_PIPE=new ArrayBlockingQueue<ArrayList<Read>>(40);
+ public long merged=0;
+ public long merged2=0;
+ public long removedSingletonDupe=0;
+ public long removedLQ=0;
+ public long removedShort=0;
+ public long processed=0;
+ public long basesInitiallyMapped=0;
+ public long basesOverlapping=0;
+ public long basesMapped=0;
+ public long basesRemoved=0;
+// public long numSwapped=0;
+ private long readsWritten;
+ private long basesWritten;
+ private long validReadsWritten;
+ private long validBasesWritten;
+
+ private boolean asymmetricReads=false;
+
+ private final HashMap<String, Block> table=new HashMap<String, Block>(4096);
+
+ public final boolean paired;
+ public final int blocksize;
+
+ public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads.
+ public static boolean MOVE_SINGLETONS_TO_END=false;
+
+ public static long READ_LIMIT=-1; //Max number of reads to process
+ public static final int WRITE_BUFFER=8000; //Bigger number uses more memory, for less frequent writes.
+ public static final int MAX_BLOCKSIZE_TO_SORT=8000000;
+ public static boolean overwrite=false;
+ public static boolean append=false;
+
+ public static final boolean DONT_COMPRESS_TEMP_FILES=false;
+ public static boolean MERGE_DUPLICATES=false;
+ public static final boolean KILL_BAD_PAIRS=true;
+ public static boolean SAME_STRAND_PAIRS=false;
+ public static boolean REQUIRE_CORRECT_STRANDS_PAIRS=true;
+ public static boolean REMOVE_SINGLETON_DUPLICATES_OF_PAIRS=true;
+ public static boolean USE_STRICT_MERGE=false;
+
+ public static boolean SWAP_READ1_TO_PLUS=false;
+ public static boolean MERGE_OPPOSITE_STRAND_DUPLICATES=false; //Requires SWAP_READ1_TO_PLUS=true
+
+ public static final boolean UNLOAD_CHROMS_WHEN_DONE=true;
+
+ public static boolean FIX_SHORT_PAIRED_READS=false;
+
+ public static boolean TRIM_LOW_QUALITY_TAILS=false;
+ public static byte TRIM_QUALITY=7;
+ public static byte TRIM_WINDOW=3;
+
+ public static boolean REGENERATE_MATCH_STRING=false;
+ public static int REGEN_THREADS=Shared.threads();
+
+ private final ReadStreamWriter blockwriter1;
+ private final ReadStreamWriter blockwriter2;
+
+// private final TranslateColorspaceRead tcr2=new TranslateColorspaceRead(200, 2400);
+}
diff --git a/current/align2/SortReadsTopologically.java b/current/align2/SortReadsTopologically.java
new file mode 100755
index 0000000..8db035a
--- /dev/null
+++ b/current/align2/SortReadsTopologically.java
@@ -0,0 +1,605 @@
+package align2;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.zip.ZipOutputStream;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentLegacyReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.FastqReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.ReadStreamStringWriter;
+import stream.ReadStreamWriter;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+public class SortReadsTopologically {
+
+
+ public static void main(String[] args){
+
+ Parser parser=new Parser();
+ String in1=null;
+ String in2=null;
+ String out="raw_tsorted#.txt.gz";
+ int prefix=4;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+// System.err.println("Processing "+args[i]);
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("i") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+ in1=b;
+ if(b.indexOf('#')>=0){
+ in1=b.replaceFirst("#", "1");
+ in2=b.replaceFirst("#", "2");
+ }
+ }else if(a.equals("in2") || a.equals("input2")){
+ in2=b;
+ }else if(a.equals("o") || a.equals("out") || a.equals("output")){
+ out=b;
+ }else if(a.endsWith("merge")){
+ MERGE_DUPLICATES=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ Data.sysout.println("Set overwrite to "+overwrite);
+ }else if(a.equals("prefix")){
+ prefix=Integer.parseInt(b);
+ }else if(a.endsWith("blocksize")){
+ MAX_BLOCKSIZE_TO_SORT=Integer.parseInt(b);
+ }else{
+ throw new RuntimeException("Unknown parameter: "+args[i]);
+ }
+ }
+
+ if(in1==null){throw new RuntimeException("Please specify input file.");}
+ if(out==null){throw new RuntimeException("Please specify output file.");}
+ if(in1.equalsIgnoreCase(in2) || in1.equalsIgnoreCase(out) || (in2!=null && in2.equalsIgnoreCase(out))){
+ throw new RuntimeException("Duplicate filenames.");
+ }
+
+ FileFormat ff=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, false);
+ boolean fastq=ff.fastq();
+ boolean fasta=ff.fasta();
+ boolean bread=ff.bread();
+
+ if(out!=null && !out.contains("#")){
+ throw new RuntimeException("Output filename must contain '#' symbol.");
+ }
+
+ SortReadsTopologically srt;
+ if(fasta){
+ FastaReadInputStream fris1=new FastaReadInputStream(in1, (FASTQ.FORCE_INTERLEAVED && in2==null), false, true, in2==null ? Shared.READ_BUFFER_MAX_DATA : -1);
+ FastaReadInputStream fris2=(in2==null ? null : new FastaReadInputStream(in2, false, false, true, -1));
+ ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, -1);
+ srt=new SortReadsTopologically(cris, out, prefix);
+ }else if(fastq){
+ FastqReadInputStream fris1=new FastqReadInputStream(in1, true);
+ FastqReadInputStream fris2=(in2==null ? null : new FastqReadInputStream(in2, true));
+ ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, -1);
+ srt=new SortReadsTopologically(cris, out, prefix);
+ }else{
+ srt=new SortReadsTopologically(in1, in2, out, prefix);
+ }
+
+ srt.processMT();
+ if(MERGE_DUPLICATES){
+ Data.sysout.println("Merged "+srt.merged+" duplicates of "+srt.processed+" total.");
+ if(srt.correctMerged>0 || srt.incorrectMerged>0){
+ Data.sysout.println("Merged "+srt.correctMerged+" reads from same origin (correct).");
+ Data.sysout.println("Merged "+srt.incorrectMerged+" reads from different origin (incorrect).");
+ }
+ }
+ }
+
+	/**
+	 * Creates a sorter that reads from one or two text read files.
+	 * @param fname1 First input file.
+	 * @param fname2 Second input file, or null.
+	 * @param outname_ Output name pattern containing '#'.
+	 * @param prefix_ Number of leading bases used as the block key; asserted to be at most 5.
+	 */
+	public SortReadsTopologically(String fname1, String fname2, String outname_, int prefix_){
+		assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name.";
+
+		RTextInputStream rtis=new RTextInputStream(fname1, fname2, -1);
+		outname=outname_;
+		paired=rtis.paired();
+		cris=new ConcurrentLegacyReadInputStream(rtis, -1);
+		prefix=prefix_;
+		assert(prefix<=5);
+
+		//Writers follow the pairedness of the stream, not the number of input files, matching the
+		//stream-based constructor: processMT creates and poisons the second writer whenever paired is true.
+		blockwriter1=new ReadStreamStringWriter(null, true, 4, false);
+		blockwriter2=(!paired ? null : new ReadStreamStringWriter(null, false, 4, false));
+	}
+
+	/**
+	 * Creates a sorter that reads from an existing concurrent read stream.
+	 * @param cris_ Source of reads; also determines whether the data is paired.
+	 * @param outname_ Output name pattern containing '#'.
+	 * @param prefix_ Number of leading bases used as the block key; asserted to be at most 5.
+	 */
+	public SortReadsTopologically(ConcurrentReadInputStream cris_, String outname_, int prefix_){
+		cris=cris_;
+		outname=outname_;
+		paired=cris.paired();
+		prefix=prefix_;
+		assert(prefix<=5);
+
+		blockwriter1=new ReadStreamStringWriter(null, true, 4, false);
+		if(paired){
+			blockwriter2=new ReadStreamStringWriter(null, false, 4, false);
+		}else{
+			blockwriter2=null;
+		}
+	}
+
+	/**
+	 * Runs the whole sort.
+	 * Phase 1 reads all input and distributes the reads into temp-file Blocks keyed by sequence
+	 * prefix. Phase 2 visits the Blocks in sorted key order, reads each one back, sorts it
+	 * (unless it holds more than MAX_BLOCKSIZE_TO_SORT reads, in which case it is copied
+	 * through unsorted), optionally merges duplicates, and appends it to the final output.
+	 * @throws RuntimeException If an output file already exists and overwrite is false.
+	 */
+	public void processMT(){
+
+		final String fname1=outname.replaceFirst("#", "1");
+		final String fname2=(!paired ? null : outname.replaceFirst("#", "2"));
+		if(fname1!=null && new File(fname1).exists()){
+			if(overwrite){new File(fname1).delete();}
+			else{throw new RuntimeException("Destination file "+fname1+" already exists.");}
+		}
+		if(fname2!=null && new File(fname2).exists()){
+			//Delete the second output file itself (previously fname1 was deleted here by mistake).
+			if(overwrite){new File(fname2).delete();}
+			else{throw new RuntimeException("Destination file "+fname2+" already exists.");}
+		}
+
+		Timer t=new Timer();
+		Timer total=new Timer();
+		t.start();
+		total.start();
+
+		cris.start();
+		System.err.println("Started cris");
+
+		Thread bwt1=null, bwt2=null;
+		if(fname1!=null){
+			bwt1=new Thread(blockwriter1);
+			bwt1.start();
+		}
+		if(fname2!=null){
+			bwt2=new Thread(blockwriter2);
+			bwt2.start();
+		}
+		System.err.println("Started blockwriters");
+
+		{
+			ListNum<Read> ln=cris.nextList();
+			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+			if(reads!=null && !reads.isEmpty()){
+				Read r=reads.get(0);
+				assert(paired==(r.mate!=null));
+
+				assert(prefix<=5);
+			}
+
+			while(reads!=null && reads.size()>0){
+				for(Read r : reads){addRead(r);}
+				cris.returnList(ln.id, ln.list.isEmpty());
+				ln=cris.nextList();
+				reads=(ln!=null ? ln.list : null);
+			}
+			System.err.println("Finished reading");
+			//The loop exits when ln or ln.list is null (or the list is empty), so guard both before returning it.
+			if(ln!=null){
+				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+			}
+			System.err.println("Returned list");
+			ReadWrite.closeStream(cris);
+			System.err.println("Closed stream");
+		}
+
+
+		synchronized(this){this.notifyAll();}
+		System.err.println("Notified all");
+
+		finishWritingBlocks();
+		System.err.println("Wrote blocks");
+
+		if(bwt1!=null){blockwriter1.poison();}
+		if(bwt2!=null){blockwriter2.poison();}
+
+		joinFully(bwt1);
+		joinFully(bwt2);
+
+		t.stop();
+		Data.sysout.println("Temp Write Time: "+t);
+		t.start();
+
+		if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;}
+		ReadStreamWriter wt1=(fname1==null ? null : new ReadStreamStringWriter(fname1, true, 4, false));
+		ReadStreamWriter wt2=(fname2==null ? null : new ReadStreamStringWriter(fname2, false, 4, false));
+
+		Thread wtt1=(wt1==null ? null : new Thread(wt1));
+		Thread wtt2=(wt2==null ? null : new Thread(wt2));
+
+		if(wtt1!=null){wtt1.start();}
+		if(wtt2!=null){wtt2.start();}
+
+		//Process blocks in sorted key order; the key list is a copy, so removing from the table is safe.
+		ArrayList<String> keys=new ArrayList<String>(table.size());
+		keys.addAll(table.keySet());
+		Collections.sort(keys);
+
+		ReadComparatorTopological tcomp=new ReadComparatorTopological();
+
+		for(String key : keys){
+			Block b=table.get(key);
+			table.remove(key);
+			processed+=b.added;
+
+			if(b.added>MAX_BLOCKSIZE_TO_SORT){
+				//Too large to sort in memory: stream the temp file straight to the output.
+				System.err.println("Skipping sorting for key "+key+" of size "+b.added);
+				RTextInputStream temp=new RTextInputStream(b.fname1, b.fname2, -1);
+				ArrayList<Read> reads=temp.nextList();
+				while(reads!=null && reads.size()>0){
+					if(wt1!=null){wt1.addList(reads);}
+					if(wt2!=null){wt2.addList(reads);}
+					b.numRead+=reads.size();
+					reads=temp.nextList();
+				}
+				temp.close();
+				temp=null;
+
+				b.delete();
+			}else{
+				ArrayList<Read> list=b.readBlock();
+				if(PRINT_BLOCKS){Data.sysout.println(key+"\t"+list.size());}
+				b.delete();
+
+				Collections.sort(list, tcomp);
+				if(MERGE_DUPLICATES){
+					int count;
+					//First pass with nmax=0; re-sort only if something was merged.
+					count=mergeDuplicates(list, 0, 0, (byte)-99);
+					if(count>0){
+						Tools.condense(list);
+						Collections.sort(list, tcomp);
+					}
+					//Second pass with nmax=1.
+					count=mergeDuplicates(list, 1, 0, (byte)-99);
+
+					Tools.condense(list);
+					Collections.sort(list, tcomp);
+				}
+				if(list!=null && list.size()>0){
+					if(wt1!=null){wt1.addList(list);}
+					if(wt2!=null){wt2.addList(list);}
+				}
+			}
+		}
+
+		//Add poison
+		if(wt1!=null){wt1.poison();}
+		if(wt2!=null){wt2.poison();}
+
+		joinFully(wtt1);
+		joinFully(wtt2);
+
+		t.stop();
+		total.stop();
+		Data.sysout.println("Final Sort + Write Time: "+t);
+		Data.sysout.println("Total Time: "+total);
+
+	}
+
+	/** Waits for a thread to die, retrying (after printing the stack trace) when interrupted; null is ignored. */
+	private static void joinFully(Thread thread){
+		if(thread==null){return;}
+		while(thread.isAlive()){
+			try {
+				thread.join();
+			} catch (InterruptedException e) {
+				e.printStackTrace();
+			}
+		}
+	}
+
+
+ private int mergeDuplicates(ArrayList<Read> list, int nmax, int mmax, byte qmax){
+ if(list==null || list.size()<2){return 0;}
+ Read current=list.get(0);
+
+ int correct=0;
+ int incorrect=0;
+
+ int count=0;
+ for(int i=1; i<list.size(); i++){
+ Read r=list.get(i);
+ boolean keep=false;
+ if(current.length()==r.length() && ((current.mate==null && r.mate==null) || (current.mateLength()==r.mateLength())) &&
+ current.isDuplicateByBases(r, nmax, mmax, qmax, true, false)){
+ Read r2=r.mate;
+ Read c2=current.mate;
+ if(c2==null || c2.isDuplicateByBases(r2, nmax, mmax, qmax, true, false)){
+// assert(r.synthetic()) : r;
+ if(r.synthetic()){
+ if(r.originalSite!=null && current.originalSite!=null){
+ if(r.originalSite.equals(current.originalSite)){
+ correct++;
+ }else{
+ incorrect++;
+ }
+ }else if(r.chrom>0){
+ if(r.chrom==current.chrom && r.start==current.start && r.stop==current.stop && r.strand()==current.strand()){
+ correct++;
+ }else{
+ incorrect++;
+ }
+ }
+ if(r2!=null && c2!=null && r2.originalSite!=null && c2.originalSite!=null){
+ if(r2.originalSite.equals(c2.originalSite)){
+ correct++;
+ }else{
+ incorrect++;
+ }
+ }else if(r2!=null && c2!=null && r2.chrom>0){
+ if(r2.chrom==c2.chrom && r2.start==c2.start && r2.stop==c2.stop && r2.strand()==c2.strand()){
+ correct++;
+ }else{
+ incorrect++;
+ }
+ }
+ }
+ current.merge(r, true, true);
+ list.set(i, null);
+ count++;
+ keep=true;
+ }
+ }
+ if(!keep){current=r;}
+ }
+ merged+=count;
+ correctMerged+=correct;
+ incorrectMerged+=incorrect;
+ return count;
+ }
+
+
+ private void addRead(Read r){
+ StringBuilder sb=new StringBuilder(prefix);
+ boolean bad=false;
+ for(int i=0; i<prefix && i<r.length(); i++){
+ byte b=r.bases[i];
+
+ if(b>=0 && b<=3){
+ sb.append((int)b);
+ }else{
+
+ if(AminoAcid.isFullyDefined(b)){
+ sb.append((char)b);
+ }else{
+ bad=true;
+ sb.append('N');
+ }
+ }
+
+ }
+
+ String key=bad ? "ZN" : sb.toString();
+// String key=sb.toString();
+ Block b=table.get(key);
+ if(b==null){
+ //System.err.println("Created block "+key);
+ b=new Block(key, outname);
+ table.put(key, b);
+ }
+ b.add(r);
+ }
+
+
+ /** Logs the call to stderr, then asks every Block in the table to flush and close its buffer. */
+ public void finishWritingBlocks(){
+     System.err.println("Called finishWritingBlocks()");
+     for(Block block : table.values()){
+         block.finishWritingBuffer();
+     }
+ }
+
+
+
+ /**
+  * Flushes and closes a writer and the stream underneath it.
+  * If the stream is exactly a ZipOutputStream, the current entry is closed and the archive
+  * finished first.  IOExceptions are printed, not rethrown.
+  * @param writer Writer wrapping outStream
+  * @param outStream Underlying stream
+  */
+ private static final void finishWriting(PrintWriter writer, OutputStream outStream){
+     writer.flush();
+
+     final boolean isZip=(outStream.getClass()==ZipOutputStream.class);
+     if(isZip){
+         final ZipOutputStream zip=(ZipOutputStream)outStream;
+         try{
+             zip.closeEntry();
+             zip.finish();
+         }catch(IOException e){
+             e.printStackTrace();
+         }
+     }
+
+     writer.close();
+     try{
+         outStream.close();
+     }catch(IOException e){
+         e.printStackTrace();
+     }
+ }
+
+ /**
+  * One bucket of reads, identified by a key (name) and backed by one temp file
+  * (two when the enclosing object is paired).  Reads are buffered in memory and handed
+  * to blockwriter1/blockwriter2 in batches of WRITE_BUFFER.
+  */
+ private class Block{
+
+ /**
+  * @param name_ Key of this block; embedded in the temp file names
+  * @param fname_ File name pattern; its first '#' is replaced by "_tsort_tempBlock_"+name+"_1" (and "_2")
+  */
+ public Block(String name_, String fname_){
+
+ //Optionally strip compression extensions so temp files are written uncompressed.
+ if(DONT_COMPRESS_TEMP_FILES){
+ while(fname_.endsWith(".gz") || fname_.endsWith(".zip") || fname_.endsWith(".bz2")){
+ fname_=fname_.substring(0, fname_.lastIndexOf('.'));
+ }
+ }
+
+ name=name_;
+ //Note: replaceFirst treats its first argument as a regex and '$'/'\' in the replacement specially.
+ fname1=fname_.replaceFirst("#", "_tsort_tempBlock_"+name+"_1");
+ fname2=(!paired ? null : fname_.replaceFirst("#", "_tsort_tempBlock_"+name+"_2"));
+
+ //String.replaceFirst never returns null, so this branch is not expected to run.
+ if(fname1==null){
+ assert(false);
+ outStream1=null;
+ writer1=null;
+ }else{
+ outStream1=ReadWrite.getOutputStream(fname1, append, true, false);
+ writer1=new PrintWriter(outStream1);
+ }
+
+ if(fname2==null){
+ outStream2=null;
+ writer2=null;
+ }else{
+ outStream2=ReadWrite.getOutputStream(fname2, append, true, false);
+ writer2=new PrintWriter(outStream2);
+ }
+ }
+
+ /** Buffers a read; hands the buffer to the writers once it holds WRITE_BUFFER reads. */
+ public void add(Read r){
+ buffer.add(r);
+ added++;
+ if(buffer.size()>=WRITE_BUFFER){
+ writeBuffer(false);
+ }
+ }
+
+ /**
+  * Hands the current buffer to blockwriter1/blockwriter2 and replaces it with a fresh one.
+  * @param close If true, the buffer is passed on even when empty, together with the close flag,
+  * and the buffer field becomes null, so add() or writeBuffer() must not be called again afterwards.
+  * NOTE(review): presumably the close flag tells the writer to close this block's stream - confirm in ReadStreamWriter.
+  */
+ public void writeBuffer(boolean close){
+
+ written+=buffer.size();
+ ArrayList<Read> temp=buffer;
+ buffer=(close ? null : new ArrayList<Read>(WRITE_BUFFER));
+
+ if(close){
+// System.err.println("Closing "+name+": "+ fname1+", "+fname2);
+ if(blockwriter1!=null){blockwriter1.addList(temp, writer1, outStream1, close);}
+ if(blockwriter2!=null){blockwriter2.addList(temp, writer2, outStream2, close);}
+ }else{
+ //Without the close flag there is no reason to send an empty list.
+ if(blockwriter1!=null && temp!=null && !temp.isEmpty()){blockwriter1.addList(temp, writer1, outStream1, close);}
+ if(blockwriter2!=null && temp!=null && !temp.isEmpty()){blockwriter2.addList(temp, writer2, outStream2, close);}
+ }
+
+ assert(added==written);
+// buffer.clear();
+ }
+
+ /** Flushes whatever is buffered and signals the writers to close (writeBuffer(true)). */
+ public void finishWritingBuffer(){
+ //System.err.println("Writing block "+name);
+ writeBuffer(true);
+
+// finishWriting(writer1, outStream1);
+// if(fname2!=null){
+// finishWriting(writer2, outStream2);
+// }
+
+ }
+
+ /**
+  * Reads every read back from this block's temp file(s) into one list.
+  * numRead accumulates across calls, so the closing assertion only holds for the first call.
+  * @return All reads stored in the temp file(s)
+  */
+ public synchronized ArrayList<Read> readBlock(){
+ RTextInputStream temp=new RTextInputStream(fname1, fname2, -1);
+ ArrayList<Read> out=new ArrayList<Read>((int)written);
+ ArrayList<Read> reads=temp.nextList();
+ while(reads!=null && reads.size()>0){
+ out.addAll(reads);
+ numRead+=reads.size();
+ reads=temp.nextList();
+ }
+ temp.close();
+ temp=null;
+ assert(numRead==written);
+
+ return out;
+ }
+
+ /** Deletes the temp file(s).  The result of File.delete() is ignored. */
+ public synchronized void delete() {
+ if(fname1!=null){new File(fname1).delete();}
+ if(fname2!=null){new File(fname2).delete();}
+ }
+
+ /** Key of this block */
+ public final String name;
+ /** Temp file names; fname2 is null when not paired */
+ public final String fname1, fname2;
+
+ /** Streams and writers for the temp files; the second of each is null when fname2 is null */
+ public final OutputStream outStream1, outStream2;
+ public final PrintWriter writer1, writer2;
+ /** Reads waiting to be handed to the writers; null after writeBuffer(true) */
+ private ArrayList<Read> buffer=new ArrayList<Read>(WRITE_BUFFER);
+
+ /** Counters: reads passed to add(), reads handed to the writers, reads returned by readBlock() */
+ public long added=0, written=0, numRead=0;
+ }
+
+ /** Output file name pattern; passed to each Block, which substitutes its first '#' */
+ public final String outname;
+ /** Concurrent input stream (its use is outside this section) */
+ private final ConcurrentReadInputStream cris;
+ /** Total reads merged away by mergeDuplicates */
+ public long merged=0;
+ public long processed=0;
+
+ /** Tallies kept by mergeDuplicates for synthetic reads: merged with same / different origin */
+ public long correctMerged=0;
+ public long incorrectMerged=0;
+
+ /** Maps a prefix key (see addRead) to its Block */
+ private final HashMap<String, Block> table=new HashMap<String, Block>(4096);
+
+ /** When true, each Block writes a second temp file */
+ public final boolean paired;
+ /** Number of leading bases used to build a read's block key */
+ public final int prefix;
+
+ public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads.
+
+ public static final int WRITE_BUFFER=1000; //Bigger number uses more memory, for less frequent writes.
+ public static int MAX_BLOCKSIZE_TO_SORT=16000000;
+
+ /** When true, Block strips .gz/.zip/.bz2 from temp file names */
+ public static final boolean DONT_COMPRESS_TEMP_FILES=false;
+ public static boolean MERGE_DUPLICATES=false;
+ public static boolean overwrite=false;
+ /** Passed to ReadWrite.getOutputStream when a Block opens its temp files */
+ public static boolean append=false;
+ public static boolean PRINT_BLOCKS=false;
+
+
+ /** Writers that receive buffered read lists from Block.writeBuffer; either may be null */
+ private final ReadStreamWriter blockwriter1;
+ private final ReadStreamWriter blockwriter2;
+
+
+}
diff --git a/current/align2/SplitMappedReads.java b/current/align2/SplitMappedReads.java
new file mode 100755
index 0000000..b4a6dca
--- /dev/null
+++ b/current/align2/SplitMappedReads.java
@@ -0,0 +1,320 @@
+package align2;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.zip.ZipOutputStream;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+
+import dna.Data;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+public class SplitMappedReads {
+
+
+ public static void main(String[] args){
+
+ String reads1=args[0];
+ String reads2=args[1].equalsIgnoreCase("null") ? null : args[1];
+ String outname=args[2].equalsIgnoreCase("null") ? "" : args[2];
+
+ int minChrom=1;
+ int maxChrom=25;
+ if(args.length>3){
+ minChrom=maxChrom=Byte.parseByte(args[3]);
+ if(args.length>4){
+ maxChrom=Byte.parseByte(args[4]);
+ }
+ }
+ assert(minChrom<=maxChrom && minChrom>=0);
+
+ SplitMappedReads smr=new SplitMappedReads(reads1, reads2, outname, minChrom, maxChrom);
+ smr.process();
+
+ }
+
+ public SplitMappedReads(String fname1, String fname2, String outname_, int minChrom, int maxChrom){
+ this(new RTextInputStream(fname1, fname2, -1), outname_, minChrom, maxChrom);
+ assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name.";
+ }
+
+ public SplitMappedReads(RTextInputStream stream_, String outname_, int minChrom, int maxChrom){
+ stream=stream_;
+ outname=outname_;
+ paired=stream.paired();
+// assert(outname.contains("#")) : "Output file name must contain the character '#' to be used for chromosome number.";
+
+ MIN_CHROM=minChrom;
+ MAX_CHROM=maxChrom;
+ assert(MIN_CHROM>=0);
+ assert(MAX_CHROM>=MIN_CHROM);
+
+ outArraySingle1=new OutputStream[maxChrom+1];
+ printArraySingle1=new PrintWriter[maxChrom+1];
+ bufferArraySingle1=new ArrayList[maxChrom+1];
+ for(int i=minChrom; i<outArraySingle1.length; i++){
+ bufferArraySingle1[i]=new ArrayList<Read>(WRITE_BUFFER);
+ outArraySingle1[i]=ReadWrite.getOutputStream(outname.replace("#", "single_1_chr"+i), false, true, false);
+ printArraySingle1[i]=new PrintWriter(outArraySingle1[i]);
+ printArraySingle1[i].println("#Chromosome "+i+" Read 1 Singletons");
+ printArraySingle1[i].println("#"+Read.header());
+ }
+
+ if(!paired){
+ outArraySingle2=null;
+ printArraySingle2=null;
+ bufferArraySingle2=null;
+ outArrayPaired1=null;
+ printArrayPaired1=null;
+ bufferArrayPaired1=null;
+ outArrayPaired2=null;
+ printArrayPaired2=null;
+ bufferArrayPaired2=null;
+ }else{
+
+ outArraySingle2=new OutputStream[maxChrom+1];
+ printArraySingle2=new PrintWriter[maxChrom+1];
+ bufferArraySingle2=new ArrayList[maxChrom+1];
+ for(int i=minChrom; i<outArraySingle2.length; i++){
+ bufferArraySingle2[i]=new ArrayList<Read>(WRITE_BUFFER);
+ outArraySingle2[i]=ReadWrite.getOutputStream(outname.replace("#", "single_2_chr"+i), false, true, false);
+ printArraySingle2[i]=new PrintWriter(outArraySingle2[i]);
+ printArraySingle2[i].println("#Chromosome "+i+" Read 2 Singletons");
+ printArraySingle2[i].println("#"+Read.header());
+ }
+
+ outArrayPaired1=new OutputStream[maxChrom+1];
+ printArrayPaired1=new PrintWriter[maxChrom+1];
+ bufferArrayPaired1=new ArrayList[maxChrom+1];
+ for(int i=minChrom; i<outArrayPaired1.length; i++){
+ bufferArrayPaired1[i]=new ArrayList<Read>(WRITE_BUFFER);
+ outArrayPaired1[i]=ReadWrite.getOutputStream(outname.replace("#", "paired_1_chr"+i), false, true, false);
+ printArrayPaired1[i]=new PrintWriter(outArrayPaired1[i]);
+ printArrayPaired1[i].println("#Chromosome "+i+" Read 1 Paired");
+ printArrayPaired1[i].println("#"+Read.header());
+ }
+
+ outArrayPaired2=new OutputStream[maxChrom+1];
+ printArrayPaired2=new PrintWriter[maxChrom+1];
+ bufferArrayPaired2=new ArrayList[maxChrom+1];
+ for(int i=minChrom; i<outArrayPaired2.length; i++){
+ bufferArrayPaired2[i]=new ArrayList<Read>(WRITE_BUFFER);
+ outArrayPaired2[i]=ReadWrite.getOutputStream(outname.replace("#", "paired_2_chr"+i), false, true, false);
+ printArrayPaired2[i]=new PrintWriter(outArrayPaired2[i]);
+ printArrayPaired2[i].println("#Chromosome "+i+" Read 2 Paired");
+ printArrayPaired2[i].println("#"+Read.header());
+ }
+
+ }
+
+ cris=(USE_CRIS ? new ConcurrentLegacyReadInputStream(stream, -1) : null);
+ }
+
+ public void process(){
+
+ Timer t=new Timer();
+
+ if(cris!=null){
+ cris.start();
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ while(reads!=null && reads.size()>0){
+ processReads(reads);
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ cris.returnList(ln.id, ln.list.isEmpty());
+ }else{
+ ArrayList<Read> reads=stream.nextList();
+ while(reads!=null && reads.size()>0){
+ processReads(reads);
+ reads=stream.nextList();
+ }
+ }
+
+ synchronized(this){this.notifyAll();}
+
+ finish();
+
+ t.stop();
+ Data.sysout.println("Time:\t"+t);
+ }
+
+
+
+ private void processReads(ArrayList<Read> reads){
+ for(Read r : reads){
+ addRead(r, 1);
+ if(r.mate!=null){
+ addRead(r.mate, 2);
+ }
+ }
+ }
+
+
+ private void addRead(Read r, int side){
+
+ if(r.chrom<1 && r.numSites()>0){
+ SiteScore ss=r.topSite(); //Should not be necessary
+ r.start=ss.start;
+ r.stop=ss.stop;
+ r.chrom=ss.chrom;
+ r.setStrand(ss.strand);
+ }
+
+ //Ensure no superfluous data is written
+ r.sites=null;
+ r.originalSite=null;
+ r.obj=null;
+
+// System.err.println("Adding to chrom "+r.chrom+", side "+side+", paired="+r.paired+", "+(r.list==null ? "null" : r.list.size()));
+ if(r.chrom<MIN_CHROM || r.chrom>MAX_CHROM){return;}
+
+ final PrintWriter writer;
+ final ArrayList<Read> list;
+
+ if(side==1){
+ if(r.paired()){
+ writer=printArrayPaired1[r.chrom];
+ list=bufferArrayPaired1[r.chrom];
+ }else{
+ writer=printArraySingle1[r.chrom];
+ list=bufferArraySingle1[r.chrom];
+ }
+ }else{
+ assert(side==2);
+ if(r.paired()){
+ writer=printArrayPaired2[r.chrom];
+ list=bufferArrayPaired2[r.chrom];
+ }else{
+ writer=printArraySingle2[r.chrom];
+ list=bufferArraySingle2[r.chrom];
+ }
+ }
+
+ assert(list.size()<WRITE_BUFFER);
+ list.add(r);
+
+ if(list.size()>=WRITE_BUFFER){
+ writeList((ArrayList<Read>)list.clone(), writer);
+ list.clear();
+ }
+ }
+
+
+ private void writeList(ArrayList<Read> list, PrintWriter writer){
+
+ synchronized(writer){
+ for(Read r : list){
+ writer.println(r.toText(true));
+ }
+ }
+ }
+
+
+ public void finish(){
+
+ final PrintWriter[][] writers=new PrintWriter[][] {printArraySingle1, printArraySingle2, printArrayPaired1, printArrayPaired2};
+ final OutputStream[][] streams=new OutputStream[][] {outArraySingle1, outArraySingle2, outArrayPaired1, outArrayPaired2};
+ final ArrayList<Read>[][] buffers=new ArrayList[][] {bufferArraySingle1, bufferArraySingle2, bufferArrayPaired1, bufferArrayPaired2};
+
+
+ for(int x=0; x<buffers.length; x++){
+
+
+ PrintWriter[] printArray=writers[x];
+ ArrayList<Read>[] bufferArray=buffers[x];
+
+ for(int i=0; printArray!=null && i<printArray.length; i++){
+ PrintWriter writer=printArray[i];
+ ArrayList<Read> list=bufferArray[i];
+
+ if(list!=null && !list.isEmpty()){
+ writeList(list, writer);
+ list=null;
+ }
+ }
+ }
+
+ //TODO: Wait for writing to finish, if it is done in threads.
+
+
+ for(int x=0; x<writers.length; x++){
+
+
+ PrintWriter[] printArray=writers[x];
+ OutputStream[] outArray=streams[x];
+
+ for(int i=0; printArray!=null && i<printArray.length; i++){
+ if(printArray[i]!=null){
+ synchronized(printArray[i]){
+ printArray[i].flush();
+ if(outArray[i].getClass()==ZipOutputStream.class){
+ ZipOutputStream zos=(ZipOutputStream)outArray[i];
+ try {
+ zos.closeEntry();
+ zos.finish();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ printArray[i].close();
+ try {
+ outArray[i].close();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ }
+
+// if(cris!=null){cris.shutdown();}
+// stream.shutdown();
+
+ if(cris!=null){ReadWrite.closeStream(cris);}
+ else{stream.close();}
+ }
+
+
+ public final String outname;
+ private final RTextInputStream stream;
+ private final ConcurrentLegacyReadInputStream cris;
+
+ private final OutputStream[] outArraySingle1;
+ private final PrintWriter[] printArraySingle1;
+ private final ArrayList<Read>[] bufferArraySingle1;
+
+ private final OutputStream[] outArraySingle2;
+ private final PrintWriter[] printArraySingle2;
+ private final ArrayList<Read>[] bufferArraySingle2;
+
+ private final OutputStream[] outArrayPaired1;
+ private final PrintWriter[] printArrayPaired1;
+ private final ArrayList<Read>[] bufferArrayPaired1;
+
+ private final OutputStream[] outArrayPaired2;
+ private final PrintWriter[] printArrayPaired2;
+ private final ArrayList<Read>[] bufferArrayPaired2;
+
+ private final int MIN_CHROM;
+ private final int MAX_CHROM;
+
+ public final boolean paired;
+
+ public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads.
+
+ public static final int WRITE_BUFFER=400; //Bigger number uses more memory, for less frequent writes.
+
+
+}
diff --git a/current/align2/Tools.java b/current/align2/Tools.java
new file mode 100755
index 0000000..87824ed
--- /dev/null
+++ b/current/align2/Tools.java
@@ -0,0 +1,2319 @@
+package align2;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+import java.util.concurrent.atomic.AtomicLongArray;
+import java.util.regex.Pattern;
+
+import stream.Read;
+import stream.SamLine;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.CoverageArray;
+import dna.Data;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+public final class Tools {
+
+ /**
+  * Finds the index of the second-largest element.
+  * A later element that ties the current maximum becomes the new maximum.
+  * @param array Array with at least two elements
+  * @return Index of the runner-up value
+  */
+ public static int secondHighestPosition(int[] array) {
+     int best=(array[0]>=array[1] ? 0 : 1);
+     int runnerUp=1-best;
+
+     for(int idx=2; idx<array.length; idx++){
+         final int v=array[idx];
+         if(v<=array[runnerUp]){continue;}
+         if(v>=array[best]){
+             runnerUp=best;
+             best=idx;
+         }else{
+             runnerUp=idx;
+         }
+     }
+     return runnerUp;
+ }
+
+ /**
+  * Returns the element-wise reciprocal of an array.
+  * Each denominator is floored at a small positive epsilon so that zero (or negative, or NaN)
+  * inputs cannot produce a division by zero.  The previous floor of 1000000000f forced every
+  * result to at most 1e-9 regardless of input.
+  * @param array Input values, e.g. probabilities
+  * @return A new array where out[i]=1/max(array[i], 1e-9)
+  */
+ public static float[] inverse(float[] array) {
+     final float epsilon=0.000000001f;
+     final float[] out=new float[array.length];
+     for(int i=0; i<array.length; i++){
+         final float x=array[i];
+         //Written as a comparison so that NaN also falls back to epsilon.
+         final float denominator=(x>epsilon ? x : epsilon);
+         out[i]=1f/denominator;
+     }
+     return out;
+ }
+
+ /**
+  * Tests whether a header consists solely of printable ASCII (codes 32 through 126).
+  * @param s Header to test; null is accepted
+  * @return true if s is null, empty, or contains only characters in [32, 126]
+  */
+ public static boolean checkHeader(String s){
+     if(s==null){return true;}
+     for(int i=0, len=s.length(); i<len; i++){
+         final char c=s.charAt(i);
+         if(c<32 || c>126){return false;}
+     }
+     return true;
+ }
+
+ /**
+  * Returns the header unchanged if checkHeader accepts it; otherwise returns a copy in which
+  * every character up to code 255 is mapped through the specialChars table and every
+  * character above 255 becomes 'X'.
+  * @param s Header to sanitize
+  * @return The original string, or a sanitized copy of the same length
+  */
+ public static String fixHeader(String s){
+     if(checkHeader(s)){return s;}
+
+     final int len=s.length();
+     final StringBuilder fixed=new StringBuilder(len);
+     for(int i=0; i<len; i++){
+         final char c=s.charAt(i);
+         final char replacement;
+         if(c<=255){
+             replacement=specialChars[c];
+         }else{
+             replacement='X';
+         }
+         fixed.append(replacement);
+     }
+     return fixed.toString();
+ }
+
+
+ /**
+  * Expands a comma-delimited list of paths.  A path that is a directory contributes the absolute
+  * paths of the regular files directly inside it that pass the format filter; any other path is
+  * added as given.  A directory that cannot be listed contributes nothing.
+  * @param b Comma-delimited file and/or directory names
+  * @param list Destination list; a new one is created if null
+  * @param fasta Accept files with a fasta extension
+  * @param fastq Accept files with a fastq extension
+  * @param sam Accept files with a sam or bam extension
+  * @param any Accept every file regardless of extension
+  * @return The destination list
+  */
+ public static ArrayList<String> getFileOrFiles(String b, ArrayList<String> list, boolean fasta, boolean fastq, boolean sam, boolean any){
+     if(list==null){list=new ArrayList<String>();}
+     for(String s : b.split(",")){
+         final File f=new File(s);
+         if(!f.isDirectory()){
+             list.add(s);
+             continue;
+         }
+
+         //File.listFiles() returns null on an I/O error or when the directory is unreadable.
+         final File[] children=f.listFiles();
+         if(children==null){continue;}
+
+         for(File f2 : children){
+             if(!f2.isFile()){continue;}
+             final String name=f2.getName().toLowerCase();
+             final String ext=ReadWrite.rawExtension(name);
+
+             final boolean pass=any || (fasta && FileFormat.isFasta(ext)) || (fastq && FileFormat.isFastq(ext)) || (sam && FileFormat.isSamOrBam(ext));
+             if(pass){
+                 list.add(f2.getAbsolutePath());
+             }
+         }
+     }
+     return list;
+ }
+
+ /** Add names to a collection.
+  * This can be a literal name, or a text file with one name per line,
+  * or a fastq, fasta, or sam file, in which case the read names will be added.
+  * The format of an existing file is detected with FileFormat.testFormat.
+  * @param s A literal name, or the path of an existing file
+  * @param names Collection that receives the names
+  * @param allowSubprocess Passed to ByteFile.makeByteFile
+  * @return Number of names passed to names.add (previously this was only counted for literal names)
+  */
+ public static final int addNames(String s, Collection<String> names, boolean allowSubprocess){
+     if(!new File(s).exists()){
+         names.add(s);
+         return 1;
+     }
+
+     int added=0;
+     final int[] vector=FileFormat.testFormat(s, false, false);
+     final int type=vector[0];
+     final ByteFile bf=ByteFile.makeByteFile(s, false, allowSubprocess);
+
+     try{
+         if(type==FileFormat.FASTQ){
+             //Every fourth line, starting with the first, is a header; skip its leading character.
+             int num=0;
+             for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine(), num++){
+                 if((num&3)==0 && line.length>0){
+                     names.add(new String(line, 1, line.length-1));
+                     added++;
+                 }
+             }
+         }else if(type==FileFormat.FASTA){
+             for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
+                 if(line.length>0 && line[0]=='>'){
+                     names.add(new String(line, 1, line.length-1));
+                     added++;
+                 }
+             }
+         }else if(type==FileFormat.SAM){
+             //Lines starting with '@' are header lines.
+             for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
+                 if(line.length>0 && line[0]!='@'){
+                     String name=SamLine.parseNameOnly(line);
+                     if(name!=null && name.length()>0){
+                         names.add(name);
+                         added++;
+                     }
+                 }
+             }
+         }else{
+             //Anything else is treated as one name per non-empty line.
+             for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
+                 if(line.length>0){
+                     names.add(new String(line));
+                     added++;
+                 }
+             }
+         }
+     }finally{
+         bf.close();
+     }
+     return added;
+ }
+
+ /**
+  * Expands every read that contains undefined bases into all of its fully-defined variants.
+  * Reads and mates are processed independently and their mate links are set to null.
+  * Reads shorter than minlen, or without undefined bases, are passed through unchanged.
+  * @param reads A list of reads
+  * @param minlen Minimum length of reads to replicate
+  * @return A new list holding the replicated and passed-through reads
+  */
+ public static ArrayList<Read> replicateAmbiguous(ArrayList<Read> reads, int minlen) {
+     final ArrayList<Read> out=new ArrayList<Read>();
+     for(Read r1 : reads){
+         //Capture the mate before the link is cleared; read 1 is handled first, then its mate.
+         final Read[] pair={r1, r1.mate};
+         for(Read r : pair){
+             if(r==null){continue;}
+             r.mate=null;
+             if(r.containsUndefined() && r.length()>=minlen){
+                 out.addAll(makeReplicates(r));
+             }else{
+                 out.add(r);
+             }
+         }
+     }
+     return out;
+ }
+
+ /**
+  * Recursively expands a read into every fully-defined version of itself.
+  * The first undefined base is expanded via replicateAtPosition; if the results still contain
+  * undefined bases, each is expanded in turn.
+  * @param r A read to replicate
+  * @return A list of reads with no ambiguity codes; contains only r itself if r has none
+  */
+ public static ArrayList<Read> makeReplicates(final Read r) {
+     if(!r.containsUndefined()){
+         final ArrayList<Read> single=new ArrayList<Read>();
+         single.add(r);
+         return single;
+     }
+
+     //Locate the first base that is not fully defined.
+     final byte[] bases=r.bases;
+     int pos=0;
+     while(pos<bases.length && AminoAcid.isFullyDefined(bases[pos])){pos++;}
+
+     ArrayList<Read> expanded=null;
+     if(pos<bases.length){expanded=replicateAtPosition(r, pos);}
+     assert(expanded!=null);
+
+     //All replicates share the remaining ambiguous positions, so testing the first is sufficient.
+     if(!expanded.get(0).containsUndefined()){return expanded;}
+
+     final ArrayList<Read> out=new ArrayList<Read>();
+     for(Read partial : expanded){
+         out.addAll(makeReplicates(partial));
+     }
+     return out;
+ }
+
+ /**
+  * Creates one clone of the read for each base that the ambiguity code at pos can stand for.
+  * If the read has qualities, the quality at pos is first set to Shared.FAKE_QUAL on the
+  * original read.  Each clone receives its own copy of the bases array.
+  * @param r A read
+  * @param pos The position of an ambiguous base
+  * @return A list of 2 to 4 clones, each with a defined base at pos
+  */
+ private static ArrayList<Read> replicateAtPosition(final Read r, final int pos) {
+     if(r.quality!=null){r.quality[pos]=Shared.FAKE_QUAL;}
+
+     final byte ambiguous=r.bases[pos];
+     //The low four bits flag which of the four bases the code represents.
+     final int bits=AminoAcid.baseToNumberExtended[ambiguous]&0xF;
+     assert(bits>0 && Integer.bitCount(bits)>1 && Integer.bitCount(bits)<=4) : ambiguous+", "+bits;
+
+     final ArrayList<Read> out=new ArrayList<Read>(4);
+     for(int code=0; code<4; code++){
+         if(((bits>>code)&1)==0){continue;}
+         final Read copy=r.clone();
+         copy.bases=copy.bases.clone();
+         copy.bases[pos]=AminoAcid.numberToBase[code];
+         out.add(copy);
+     }
+     return out;
+ }
+
+ /** Checks for permission to overwrite files, and output name collisions.
+  * Flattens the supplied lists (null lists are skipped) and delegates to the String... overload.
+  * @return True if no problems are detected */
+ public static boolean testOutputFiles(boolean overwrite, boolean append, boolean allowDuplicates, ArrayList<String>...args){
+     if(args==null || args.length==0){return true;}
+     final ArrayList<String> flat=new ArrayList<String>();
+     for(int i=0; i<args.length; i++){
+         if(args[i]!=null){flat.addAll(args[i]);}
+     }
+     final String[] names=flat.toArray(new String[flat.size()]);
+     return testOutputFiles(overwrite, append, allowDuplicates, names);
+ }
+
+ /** Checks for permission to overwrite files, and output name collisions.
+  * Null entries and names rejected by isOutputFileName are ignored.
+  * With assertions enabled a detected problem raises an AssertionError instead of returning false.
+  * @return True if no problems are detected */
+ public static boolean testOutputFiles(boolean overwrite, boolean append, boolean allowDuplicates, String...args){
+     if(args==null || args.length==0){return true;}
+     final HashSet<String> seen=new HashSet<String>(args.length*2);
+     for(String s : args){
+         if(s==null || !isOutputFileName(s)){continue;}
+
+         if(!overwrite && !append && new File(s).exists()){
+             assert(overwrite) : "File "+s+" exists and overwrite=false";
+             return false;
+         }
+
+         if(!allowDuplicates && seen.contains(s)){
+             assert(false) : "Duplicate file "+s+" was specified for multiple output streams.";
+             return false;
+         }
+
+         seen.add(s);
+     }
+     return true;
+ }
+
+ /** Checks for permission to read files, and input name collisions.
+  * Flattens the supplied lists (null lists are skipped) and delegates to the String... overload. */
+ public static boolean testInputFiles(boolean allowDuplicates, boolean throwException, ArrayList<String>...args){
+     if(args==null || args.length==0){return true;}
+     final ArrayList<String> flat=new ArrayList<String>();
+     for(int i=0; i<args.length; i++){
+         if(args[i]!=null){flat.addAll(args[i]);}
+     }
+     final String[] names=flat.toArray(new String[flat.size()]);
+     return testInputFiles(allowDuplicates, throwException, names);
+ }
+
+ /** Checks for permission to read files, and input name collisions.
+  * Each array is tested on its own, so collisions are only detected within a single array. */
+ public static boolean testInputFiles(boolean allowDuplicates, boolean throwException, String[]...args){
+     if(args==null || args.length==0){return true;}
+     for(int i=0; i<args.length; i++){
+         final String[] group=args[i];
+         final boolean ok=testInputFiles(allowDuplicates, throwException, group);
+         if(!ok){return false;}
+     }
+     return true;
+ }
+
+ /** Checks for permission to read files, and input name collisions.
+  * Null entries are ignored.  Names are compared for collisions in lower case.
+  * @param allowDuplicates If false, a repeated name is a failure
+  * @param throwException If true, a failure throws a RuntimeException instead of returning false
+  * @return True if no problems are detected */
+ public static boolean testInputFiles(boolean allowDuplicates, boolean throwException, String...args){
+     if(args==null || args.length==0){return true;}
+     final HashSet<String> seen=new HashSet<String>(args.length*2);
+     for(String s : args){
+         if(s==null){continue;}
+         final String lower=s.toLowerCase();
+
+         if(!canRead(s)){
+             if(throwException){throw new RuntimeException("Can't read file '"+s+"'");}
+             return false;
+         }
+
+         if(!allowDuplicates && seen.contains(lower)){
+             if(throwException){throw new RuntimeException("Duplicate file "+s+" was specified for multiple input streams.");}
+             return false;
+         }
+
+         seen.add(lower);
+     }
+     return true;
+ }
+
+ /** Checks whether any file name was given more than once, ignoring case.
+  * Null entries are skipped; "stdout" and names starting with "stdout." may repeat.
+  * @param throwException If true, a repeat throws a RuntimeException instead of returning false
+  * @return True if no problems are detected */
+ public static boolean testForDuplicateFiles(boolean throwException, String...args){
+     if(args==null || args.length==0){return true;}
+     final HashSet<String> seen=new HashSet<String>(args.length*2);
+     for(String raw : args){
+         if(raw==null){continue;}
+         final String key=raw.toLowerCase();
+         final boolean repeated=!seen.add(key);
+         final boolean exempt=key.equals("stdout") || key.startsWith("stdout.");
+         if(repeated && !exempt){
+             if(throwException){throw new RuntimeException("File '"+raw+"' was specified multiple times.");}
+             return false;
+         }
+     }
+     return true;
+ }
+
+ /**
+  * Tests whether an output destination may be written.
+  * Null-like and special names (see isNullFileName, isSpecialOutputName) are always allowed;
+  * a file that does not exist yet is allowed; an existing file requires overwrite and write permission.
+  */
+ public static final boolean canWrite(String s, boolean overwrite){
+     if(isNullFileName(s) || isSpecialOutputName(s)){return true;}
+     final File target=new File(s);
+     return !target.exists() || (overwrite && target.canWrite());
+ }
+
+// public static final boolean outputDestinationExists(String s){
+// if(isNullFileName(s)){return false;}
+// if(isSpecialOutputName(s)){return false;}
+// File f=new File(s);
+// return f.exists();
+// }
+
+ /** True if s names a real output file, i.e. it is neither a null-like name nor a special output name. */
+ public static final boolean isOutputFileName(String s){
+     return !isNullFileName(s) && !isSpecialOutputName(s);
+ }
+
+ public static final boolean isNullFileName(String s){
+ if(s==null || s.equalsIgnoreCase("null") || s.equalsIgnoreCase("none")){return true;}
+ for(int i=0; i<s.length(); i++){
+ if(!Character.isWhitespace(s.charAt(i))){return false;}
+ }
+ return true;
+ }
+
+ /**
+  * True if s (case-insensitively) is stdout, stderr, standardout, standarderr or /dev/null,
+  * or starts with "stdout." or "stderr.".  Null yields false.
+  */
+ public static final boolean isSpecialOutputName(String s){
+     if(s==null){return false;}
+     final String lower=s.toLowerCase();
+     if(lower.startsWith("stdout.") || lower.startsWith("stderr.")){return true;}
+     return lower.equals("stdout") || lower.equals("stderr")
+         || lower.equals("standardout") || lower.equals("standarderr")
+         || lower.equals("/dev/null");
+ }
+
+ /** True if s (case-insensitively) is stdin or standardin, or starts with "stdin.".  Null yields false. */
+ public static final boolean isSpecialInputName(String s){
+     if(s==null){return false;}
+     final String lower=s.toLowerCase();
+     if(lower.startsWith("stdin.")){return true;}
+     return lower.equals("stdin") || lower.equals("standardin");
+ }
+
+ /** True if s is a special input name (see isSpecialInputName) or a path that File.canRead accepts.  Null yields false. */
+ public static final boolean canRead(String s){
+     return s!=null && (isSpecialInputName(s) || new File(s).canRead());
+ }
+
+ /** Returns index of first matching location.
+  * The forward orientation is searched first; the reverse-complement is searched only if that fails.
+  * @return Start index in big, or -1 if neither orientation matches */
+ public static final int contains(final byte[] big, final byte[] small, final int maxMismatches){
+     final int forward=containsForward(big, small, maxMismatches);
+     if(forward>=0){return forward;}
+     return containsReverse(big, small, maxMismatches);
+ }
+
+ /** Returns index of first matching location */
+ public static final int containsForward(final byte[] big, final byte[] small, final int maxMismatches){
+ final int ilimit=big.length-small.length;
+// System.err.println("Entering: ilimit="+ilimit+", maxMismatches="+maxMismatches+", small.length="+small.length);
+ for(int i=0; i<=ilimit; i++){
+ int mismatches=0;
+ for(int j=0; j<small.length && mismatches<=maxMismatches; j++){
+ final byte b=big[i+j];
+ final byte s=small[j];
+ if(b!=s){mismatches++;}
+ }
+ if(mismatches<=maxMismatches){
+// System.err.println("Returning "+i+", mismatches="+mismatches);
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ /** Returns index of first matching location */
+ public static final int containsReverse(final byte[] big, final byte[] small, final int maxMismatches){
+ final int ilimit=big.length-small.length;
+ for(int i=0; i<=ilimit; i++){
+ int mismatches=0;
+ for(int j=0, k=small.length-1; j<small.length && mismatches<=maxMismatches; j++, k--){
+ final byte b=big[i+j];
+ final byte s=AminoAcid.baseToComplementExtended[small[k]];
+ if(b!=s){mismatches++;}
+ }
+ if(mismatches<=maxMismatches){return i;}
+ }
+ return -1;
+ }
+
+ /** Removes null elements by shrinking the list.  May change list order.
+  * Scans from the end; each null is overwritten with the current last element, which is then removed.
+  * Index 0 is included in the scan (it was previously skipped, leaving a leading null in place).
+  * @param list List to condense in place; may be null
+  * @return Number of nulls removed */
+ public static final <X> int condense(ArrayList<X> list){
+     if(list==null || list.size()==0){return 0;}
+     int removed=0;
+
+     for(int i=list.size()-1; i>=0; i--){
+         if(list.get(i)==null){
+             removed++;
+             //Everything above i has already been scanned, so the last element is non-null (or is i itself).
+             final int lastIndex=list.size()-1;
+             list.set(i, list.get(lastIndex));
+             list.remove(lastIndex);
+         }
+     }
+     return removed;
+ }
+
+ /** Removes null elements by shrinking the list.  Will not change list order.
+  * Non-null elements are shifted towards the front, then the trailing nulls are removed.
+  * @param list List to condense in place; may be null
+  * @return Number of nulls removed */
+ public static final <X> int condenseStrict(ArrayList<X> list){
+     if(list==null || list.size()==0){return 0;}
+
+     final int size=list.size();
+     int write=0; //Next slot to receive a non-null element
+     for(int read=0; read<size; read++){
+         final X item=list.get(read);
+         if(item==null){continue;}
+         if(write<read){
+             list.set(read, null);
+             list.set(write, item);
+         }
+         write++;
+     }
+
+     final int removed=size-write;
+     for(int i=0; i<removed; i++){
+         final X tail=list.remove(list.size()-1);
+         assert(tail==null);
+     }
+     return removed;
+ }
+
+ /** Removes null elements by shrinking the array.  Will not change array order.
+  * @param array Array to condense; may be null
+  * @return The same array if it is null or holds no nulls, otherwise a shorter copy */
+ public static final <X> X[] condenseStrict(X[] array){
+     if(array==null){return array;}
+
+     int kept=0;
+     for(int i=0; i<array.length; i++){
+         if(array[i]!=null){kept++;}
+     }
+     if(kept==array.length){return array;}
+
+     //copyOf supplies an array of the right runtime type; its contents are overwritten below.
+     final X[] packed=Arrays.copyOf(array, kept);
+     int next=0;
+     for(int i=0; i<array.length && next<kept; i++){
+         if(array[i]!=null){packed[next++]=array[i];}
+     }
+     return packed;
+ }
+
+ /** Creates a new list without null elements.  The input list is left untouched and order is kept. */
+ public static final <X> ArrayList<X> condenseNew(ArrayList<X> list){
+     final int size=list.size();
+     final ArrayList<X> kept=new ArrayList<X>(size);
+     for(int i=0; i<size; i++){
+         final X item=list.get(i);
+         if(item!=null){kept.add(item);}
+     }
+     return kept;
+ }
+
+ //This should also be correct. I'm not sure which is faster.
+// /** Removes null elements by shrinking the list. Will not change list order. */
+// public static final <X> int condenseStrict(ArrayList<X> list){
+// if(list==null || list.size()==0){return 0;}
+// int removed=0;
+// int last=0;
+//
+// for(int i=0; i<list.size(); i++){
+// X x=list.get(i);
+// if(x==null){
+// removed++;
+// }else{
+// while(last<i && list.get(last)!=null){last++;}
+// assert(last==i || list.get(last)==null);
+// if(last!=i){
+// assert(last<i);
+// list.set(last, x);
+// list.set(i, null);
+// }
+// }
+// }
+// for(int i=0; i<removed; i++){
+// X x=list.remove(list.size()-1);
+// assert(x==null);
+// }
+// return removed;
+// }
+
+
+ /**
+  * Returns the median of the differences between consecutive elements, array[i+1]-array[i].
+  * Differences are signed, so an unsorted input can yield negative values.  With an even number
+  * of differences the upper of the two middle values is returned.
+  * The loop is bounded by the number of differences; it previously ran one step too far and
+  * read past the end of the array.
+  * @param array Positions; may be null
+  * @return The median difference, or 500000000 if the array is null or has fewer than 2 elements
+  */
+ public static final int calcMedianDistance(int[] array){
+     if(array==null || array.length<2){return 500000000;}
+     final int[] dif=new int[array.length-1];
+     for(int i=0; i<dif.length; i++){
+         dif[i]=(array[i+1]-array[i]);
+     }
+     Arrays.sort(dif);
+     return dif[dif.length/2];
+ }
+
+
+
+
+// public static final int trimSiteList(ArrayList<SiteScore> ssl, float fractionOfMax, boolean retainPaired){
+//// assert(false);
+// if(ssl==null || ssl.size()==0){return -999999;}
+// if(ssl.size()==1){return ssl.get(0).score;}
+// int maxScore=-999999;
+// for(SiteScore ss : ssl){
+// maxScore=Tools.max(maxScore, ss.score);
+// }
+//
+// int cutoff=(int) (maxScore*fractionOfMax);
+// trimSitesBelowCutoff(ssl, cutoff, retainPaired);
+//// trimSitesBelowCutoffInplace(ssl, cutoff);
+// return maxScore;
+// }
+
+ /** minSitesToRetain should be set to 1 if the list is not sorted by score (for efficiency of removal).  Otherwise, it can be higher.
+  * Determines the top score, derives a cutoff as fractionOfMax of it, and passes the list to trimSitesBelowCutoff.
+  * When 1 &lt; minSitesToRetain &lt; list size, the first element's score is taken as the top score.
+  * @return The top score; -999999 for a null or empty list */
+ public static final int trimSiteList(ArrayList<SiteScore> ssl, float fractionOfMax, boolean retainPaired, boolean retainSemiperfect,
+         int minSitesToRetain, int maxSitesToRetain){
+     if(ssl==null || ssl.size()==0){return -999999;}
+     if(ssl.size()==1){return ssl.get(0).score;}
+
+     final boolean presorted=(minSitesToRetain>1 && minSitesToRetain<ssl.size());
+     int best;
+     if(presorted){
+         assert(inOrder(ssl));
+         best=ssl.get(0).score;
+     }else{
+         best=-999999;
+         for(int i=0; i<ssl.size(); i++){
+             best=Tools.max(best, ssl.get(i).score);
+         }
+     }
+
+     final int cutoff=(int)(best*fractionOfMax);
+     trimSitesBelowCutoff(ssl, cutoff, retainPaired, retainSemiperfect, minSitesToRetain, maxSitesToRetain);
+     return best;
+ }
+
+ /** minSitesToRetain should be set to 1 if the list is not sorted by score.  Otherwise, it can be higher.
+  * Passes the list to trimSitesBelowCutoff with an explicit cutoff; lists with fewer than two sites are left alone. */
+ public static final void trimSiteListByMax(ArrayList<SiteScore> ssl, int cutoff, boolean retainPaired, boolean retainSemiperfect,
+         int minSitesToRetain, int maxSitesToRetain){
+     if(ssl==null || ssl.size()<2){return;}
+     trimSitesBelowCutoff(ssl, cutoff, retainPaired, retainSemiperfect, minSitesToRetain, maxSitesToRetain);
+ }
+
+ /**
+  * Tests whether list is in non-descending order according to compareTo.
+  * @return true if no element compares greater than its successor; also true for null or lists shorter than 2.
+  */
+ public static final <X extends Comparable<? super X>> boolean inOrder(ArrayList<X> list){
+     if(list==null || list.size()<2){return true;}
+     X prev=list.get(0);
+     for(int i=1, n=list.size(); i<n; i++){
+         final X cur=list.get(i);
+         if(prev.compareTo(cur)>0){return false;}
+         prev=cur;
+     }
+     return true;
+ }
+
+
+
+ /**
+  * Sorts list with SiteScore.PCOMP, then folds each site that positionally matches the current
+  * survivor "a" into that survivor and removes it from the list.
+  * The survivor receives the maximum of the two slowScores and scores, and the OR of the perfect/semiperfect flags.
+  * @param list Sites to deduplicate; sorted and modified in place.
+  * @param doAssertions If true, throws a RuntimeException when an exact positional match disagrees on "perfect"
+  * in a way not explained by the perfect site having the higher score.
+  * @param mergeDifferentGaps If true, also merges sites matching under positionalMatch(b, false)
+  * (same outermost boundaries, different gaps), keeping the gaps of the better-scoring site.
+  * @return Number of sites removed.
+  */
+ public static final int mergeDuplicateSites(ArrayList<SiteScore> list, boolean doAssertions, boolean mergeDifferentGaps){
+ if(list==null || list.size()<2){return 0;}
+ Collections.sort(list, SiteScore.PCOMP);
+
+ int removed=0;
+
+ //"a" is the current survivor; every following match is merged into it.
+ SiteScore a=list.get(0);
+ for(int i=1; i<list.size(); i++){
+ SiteScore b=list.get(i);
+ //NOTE(review): presumably positionalMatch(b, true) also requires identical gaps - confirm in SiteScore.
+ if(a.positionalMatch(b, true)){
+
+ if(doAssertions){
+ if(!(a.perfect==b.perfect ||
+ (a.perfect && (a.score>b.score || a.slowScore>b.slowScore)))){
+ throw new RuntimeException("\n"+SiteScore.header()+"\n"+a.toText()+"\n"+b.toText()+"\n");
+ }
+
+ //Same condition as the explicit throw above, so this assert cannot fire after it.
+ assert(a.perfect==b.perfect ||
+ (a.perfect && (a.score>b.score || a.slowScore>b.slowScore))) :
+ "\n"+SiteScore.header()+"\n"+a.toText()+"\n"+b.toText()+"\n";
+ }
+
+ a.setSlowScore(max(a.slowScore, b.slowScore));
+// a.setPairedScore(a.pairedScore<=0 && b.pairedScore<=0 ? 0 : max(a.slowScore+1, a.pairedScore, b.pairedScore));
+ //Paired score is zeroed unless at least one of the two exceeds the merged slowScore.
+ a.setPairedScore((a.pairedScore<=a.slowScore && b.pairedScore<=a.slowScore) ? 0 : max(0, a.pairedScore, b.pairedScore));
+ a.setScore(max(a.score, b.score));
+ a.perfect=(a.perfect || b.perfect);
+ a.semiperfect=(a.semiperfect || b.semiperfect);
+
+ removed++;
+ //Mark the slot; nulls are handled by condenseStrict after the loop.
+ list.set(i, null);
+ }else if(mergeDifferentGaps && a.positionalMatch(b, false)){ //Same outermost boundaries, different gaps
+
+ //Pick whose gaps to keep: score, then slowScore, then pairedScore; ties keep a.
+ SiteScore better=null;
+ if(a.score!=b.score){
+ better=(a.score>b.score ? a : b);
+ }else if(a.slowScore!=b.slowScore){
+ better=(a.slowScore>b.slowScore ? a : b);
+ }else if(a.pairedScore!=b.pairedScore){
+ better=(a.pairedScore>b.pairedScore ? a : b);
+ }else{
+ better=a;
+ }
+
+ a.setSlowScore(max(a.slowScore, b.slowScore));
+ a.setPairedScore((a.pairedScore<=a.slowScore && b.pairedScore<=a.slowScore) ? 0 : max(0, a.pairedScore, b.pairedScore));
+ a.setScore(max(a.score, b.score));
+ a.perfect=(a.perfect || b.perfect);//TODO: This is not correct. And perfect sites should not have gaps anyway.
+ a.semiperfect=(a.semiperfect || b.semiperfect);
+ a.gaps=better.gaps;
+
+ removed++;
+ list.set(i, null);
+ }
+ else{
+ //No match: b starts a new run and becomes the survivor.
+ a=b;
+ }
+ }
+
+// if(removed>0){condense(list);}
+ //NOTE(review): condenseStrict presumably compacts out the null slots - confirm.
+ if(removed>0){condenseStrict(list);}
+ return removed;
+ }
+
+
+
+ /**
+  * Sorts list with SiteScore.PCOMP, then resolves pairs of overlapping sites on the same chromosome and strand.
+  * Depending on the perfect/semiperfect flags the worse site is dropped, both are retained, or (for two
+  * inexact sites) the later site is merged into the earlier one, which is widened to cover both.
+  * "Better" is decided by perfect, semiperfect, score, slowScore, pairedScore, then quickScore; full ties keep the earlier site.
+  * @param list Sites to process; sorted and modified in place.
+  * @param subsumeIfOnlyStartMatches For inexact sites, allow merging when only the start coordinates agree.
+  * @param subsumeInexact For inexact sites, allow merging of any overlapping pair.
+  * @return Number of sites removed.
+  */
+ public static final int subsumeOverlappingSites(ArrayList<SiteScore> list, boolean subsumeIfOnlyStartMatches, boolean subsumeInexact){
+     if(list==null || list.size()<2){return 0;}
+     Collections.sort(list, SiteScore.PCOMP);
+
+     int removed=0;
+
+     for(int i=0; i<list.size(); i++){
+         SiteScore a=list.get(i);
+         assert(a==null || !a.perfect || a.semiperfect);
+         if(a==null){continue;} //Slot already subsumed by an earlier site
+
+         boolean overlappingA=true;
+         //Stop scanning as soon as a non-null site fails to overlap a.
+         for(int j=i+1; overlappingA && j<list.size(); j++){
+             final SiteScore b=list.get(j);
+             assert(b==null || !b.perfect || b.semiperfect);
+             if(b==null){continue;}
+
+             overlappingA=(a.chrom==b.chrom && b.start<a.stop && b.stop>a.start);
+             if(!overlappingA || a.strand!=b.strand){continue;}
+
+             final SiteScore better;
+             if(a.perfect!=b.perfect){
+                 better=a.perfect ? a : b;
+             }else if(a.semiperfect!=b.semiperfect){
+                 better=a.semiperfect ? a : b;
+             }else if(a.score!=b.score){
+                 better=(a.score>b.score ? a : b);
+             }else if(a.slowScore!=b.slowScore){
+                 better=(a.slowScore>b.slowScore ? a : b);
+             }else if(a.pairedScore!=b.pairedScore){
+                 better=(a.pairedScore>b.pairedScore ? a : b);
+             }else if(a.quickScore!=b.quickScore){
+                 //Previously guarded by a repeated pairedScore test, which made this tiebreak unreachable.
+                 better=(a.quickScore>b.quickScore ? a : b);
+             }else{
+                 better=a;
+             }
+
+             if(a.semiperfect && b.semiperfect){
+                 //Two semiperfect sites: only collapse them when they share an end; otherwise retain both.
+                 if(a.start==b.start || a.stop==b.stop){
+                     list.set(i, better);
+                     list.set(j, null);
+                     removed++;
+                     a=better;
+                 }
+             }else if(a.perfect || b.perfect){
+                 list.set(i, better);
+                 list.set(j, null);
+                 removed++;
+                 a=better;
+             }else if(a.semiperfect || b.semiperfect){
+                 //Exactly one is semiperfect: only collapse identical intervals; otherwise retain both.
+                 if(a.start==b.start && a.stop==b.stop){
+                     list.set(i, better);
+                     list.set(j, null);
+                     removed++;
+                     a=better;
+                 }
+             }else if(subsumeInexact || (a.start==b.start && (subsumeIfOnlyStartMatches || a.stop==b.stop))){
+                 assert(!a.semiperfect && !a.perfect && !b.semiperfect && !b.perfect);
+                 //Merge b into a: widen a to the union and keep the best of each score.
+                 a.setLimits(min(a.start, b.start), max(a.stop, b.stop));
+                 a.setSlowScore(max(a.slowScore, b.slowScore));
+                 a.setPairedScore(a.pairedScore<=0 && b.pairedScore<=0 ? 0 : max(a.slowScore+1, a.pairedScore, b.pairedScore));
+                 a.quickScore=max(a.quickScore, b.quickScore);
+                 a.setScore(max(a.score, b.score, a.pairedScore));
+                 a.gaps=better.gaps;//Warning! Merging gaps would be better; this could cause out-of-bounds.
+                 //TODO: Test for a subsumption length limit.
+                 list.set(j, null);
+                 removed++;
+             }
+         }
+     }
+
+     //NOTE(review): condenseStrict presumably compacts out the null slots - confirm.
+     if(removed>0){condenseStrict(list);}
+     return removed;
+ }
+
+
+
+ /**
+  * Sorts list with SiteScore.PCOMP, then for overlapping sites on the same chromosome and strand keeps only the better one
+  * when they share at least one end, or (if requireAMatchingEnd is false) whenever their scores differ.
+  * "Better" is decided by perfect, score, slowScore, pairedScore, then quickScore; full ties keep the earlier site.
+  * @param list Sites to process; sorted and modified in place.
+  * @param requireAMatchingEnd If true, overlapping sites with no shared start or stop are both retained.
+  * @return Number of sites removed.
+  */
+ public static final int removeOverlappingSites(ArrayList<SiteScore> list, boolean requireAMatchingEnd){
+     if(list==null || list.size()<2){return 0;}
+     Collections.sort(list, SiteScore.PCOMP);
+
+     int removed=0;
+
+     for(int i=0; i<list.size(); i++){
+         SiteScore a=list.get(i);
+         if(a==null){continue;} //Slot already removed in favor of an earlier site
+
+         boolean overlappingA=true;
+         //Stop scanning as soon as a non-null site fails to overlap a.
+         for(int j=i+1; overlappingA && j<list.size(); j++){
+             final SiteScore b=list.get(j);
+             if(b==null){continue;}
+
+             overlappingA=(a.chrom==b.chrom && b.start<a.stop && b.stop>a.start);
+             if(!overlappingA || a.strand!=b.strand){continue;}
+
+             final SiteScore better;
+             if(a.perfect!=b.perfect){
+                 better=a.perfect ? a : b;
+             }else if(a.score!=b.score){
+                 better=(a.score>b.score ? a : b);
+             }else if(a.slowScore!=b.slowScore){
+                 better=(a.slowScore>b.slowScore ? a : b);
+             }else if(a.pairedScore!=b.pairedScore){
+                 better=(a.pairedScore>b.pairedScore ? a : b);
+             }else if(a.quickScore!=b.quickScore){
+                 //Previously guarded by a repeated pairedScore test, which made this tiebreak unreachable.
+                 better=(a.quickScore>b.quickScore ? a : b);
+             }else{
+                 better=a;
+             }
+
+             //The original had three branches with identical bodies (both ends match; one end matches;
+             //no end required and scores differ).  If exactly one end matches they cannot both be perfect.
+             final boolean sharedEnd=(a.start==b.start || a.stop==b.stop);
+             if(sharedEnd || (!requireAMatchingEnd && a.score!=b.score)){
+                 list.set(i, better);
+                 list.set(j, null);
+                 a=better;
+                 removed++;
+             }
+         }
+     }
+
+     //NOTE(review): condenseStrict presumably compacts out the null slots - confirm.
+     if(removed>0){condenseStrict(list);}
+     return removed;
+ }
+
+
+
+ /**
+  * Counts the sites whose score lies within "thresh" of the first site's score, including the first site itself.
+  * Sites sharing a start or a stop with the first site are treated as the same location and not counted.
+  * Assumes list is sorted descending.  This is used to determine whether a mapping is ambiguous.
+  */
+ public static final int countTopScores(ArrayList<SiteScore> list, int thresh){
+     assert(thresh>=0) : thresh;
+     if(list==null || list.isEmpty()){return 0;}
+     final SiteScore top=list.get(0);
+     final int floor=top.score-thresh;
+
+     int count=1;
+     for(int i=1, n=list.size(); i<n; i++){
+         final SiteScore other=list.get(i);
+         if(other.score<floor){break;}
+         final boolean sharesAnEnd=(top.start==other.start || top.stop==other.stop);
+         if(!sharesAnEnd){count++;}
+     }
+     return count;
+ }
+
+
+
+ /**
+  * Removes sites whose slowScore is under a fraction of maxSwScore: multPaired for sites with a positive
+  * pairedScore, multSingle for the rest.  If even the first site scores below the paired threshold the list is cleared.
+  * Assumes list is sorted by NON-PAIRED score.
+  * @return Number of sites removed.
+  */
+ public static final int removeLowQualitySitesPaired(ArrayList<SiteScore> list, int maxSwScore, float multSingle, float multPaired){
+     if(list==null || list.size()==0){return 0;}
+
+     assert(multSingle>=multPaired);
+
+     final int initialSize=list.size();
+     final int singleThresh=(int)(maxSwScore*multSingle); //Change low-quality alignments to no-hits.
+     final int pairedThresh=(int)(maxSwScore*multPaired);
+     if(list.get(0).score<pairedThresh){
+         list.clear();
+         return initialSize;
+     }
+
+     //Walk backward so removals do not shift the indices still to be visited.
+     int i=list.size();
+     while(--i>=0){
+         final SiteScore ss=list.get(i);
+         assert(ss.score==ss.slowScore) : ss.quickScore+", "+ss.slowScore+", "+ss.pairedScore+", "+ss.score+"\n"+ss;
+         assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!";
+         final int thresh;
+         if(ss.pairedScore>0){
+             assert(ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : ss;
+             thresh=pairedThresh;
+         }else{
+             assert(ss.pairedScore<=0) : ss.toText();
+             thresh=singleThresh;
+         }
+         if(ss.slowScore<thresh){list.remove(i);}
+     }
+
+     return initialSize-list.size();
+ }
+
+
+
+// /** Assumes list is sorted by NON-PAIRED score.
+// * Returns number removed. */
+// public static final int removeLowQualitySitesUnpaired(ArrayList<SiteScore> list, int maxSwScore, float multSingle){
+// if(list==null || list.size()==0){return 0;}
+//
+// int initialSize=list.size();
+// final int swScoreThresh=(int)(maxSwScore*multSingle); //Change low-quality alignments to no-hits.
+// if(list.get(0).score<swScoreThresh){list.clear(); return initialSize;}
+//
+//// for(int i=list.size()-1; i>=0; i--){
+// for(int i=list.size()-1; i>1; i--){
+// SiteScore ss=list.get(i);
+// assert(ss.score==ss.slowScore);
+// assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!";
+// assert(ss.pairedScore==0) : ss.toText();
+// if(ss.slowScore<swScoreThresh){list.remove(i);}
+// }
+//
+// return initialSize-list.size();
+// }
+
+
+ /**
+  * Removes sites with slowScore below thresh, scanning from the end of the list down to index 2;
+  * the first two sites are never removed individually.  If the first site scores below thresh the whole list is cleared.
+  * Assumes list is sorted by NON-PAIRED score.
+  * @return Number of sites removed.
+  */
+ public static final int removeLowQualitySitesUnpaired(ArrayList<SiteScore> list, int thresh){
+     if(list==null || list.size()==0){return 0;}
+
+     final int initialSize=list.size();
+     if(list.get(0).score<thresh){
+         list.clear();
+         return initialSize;
+     }
+
+     //Indices 0 and 1 are intentionally skipped by the loop bound.
+     int i=list.size()-1;
+     while(i>1){
+         final SiteScore ss=list.get(i);
+         assert(ss.score==ss.slowScore || (ss.score<=0 && ss.slowScore<=0)) : ss;
+         assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!";
+         assert(ss.pairedScore<=0) : ss.toText();
+         if(ss.slowScore<thresh){list.remove(i);}
+         i--;
+     }
+
+     return initialSize-list.size();
+ }
+
+
+
+ /**
+  * Two-pass filter.  First, sites beyond index expectedSites*2 are dropped from the tail while their slowScore is
+  * below (score of the expectedSites-th site - maxSwScore/64).  Second, every remaining site is tested against a
+  * fraction of maxSwScore: multPaired or multSingle depending on pairedScore, scaled by 1.1 or 1.2 respectively
+  * for sites at index expectedSites or later.
+  * Assumes list is sorted by NON-PAIRED score.
+  * @return Number of sites removed.
+  */
+ public static final int removeLowQualitySitesPaired2(ArrayList<SiteScore> list, int maxSwScore, float multSingle, float multPaired, int expectedSites){
+     if(list==null || list.size()==0){return 0;}
+
+     assert(multSingle>=multPaired);
+
+     final int initialSize=list.size();
+     final int singleThresh=(int)(maxSwScore*multSingle); //Change low-quality alignments to no-hits.
+     final int pairedThresh=(int)(maxSwScore*multPaired);
+     final int singleThreshStrict=(int)(maxSwScore*multSingle*1.2f);
+     final int pairedThreshStrict=(int)(maxSwScore*multPaired*1.1f);
+     if(list.get(0).score<pairedThresh){
+         list.clear();
+         return initialSize;
+     }
+     final int nthIndex=Tools.min(list.size(), expectedSites)-1;
+     final int nthBest=list.get(nthIndex).score-maxSwScore/64;
+
+     //Pass 1: trim the surplus tail until a site close enough to the nth best is reached.
+     final int keepAtLeast=expectedSites*2;
+     int tail=list.size()-1;
+     while(tail>keepAtLeast && list.get(tail).slowScore<nthBest){
+         list.remove(tail);
+         tail--;
+     }
+
+     //Pass 2: per-site thresholds, stricter beyond the expected number of sites.
+     for(int i=list.size()-1; i>=0; i--){
+         final SiteScore ss=list.get(i);
+         assert(ss.score==ss.slowScore);
+         assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!";
+         final boolean beyondExpected=(i>=expectedSites);
+         final int thresh;
+         if(ss.pairedScore>0){
+             assert(ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : ss;
+             thresh=(beyondExpected ? pairedThreshStrict : pairedThresh);
+         }else{
+             //No assertion on pairedScore here, in case of negative values.
+             thresh=(beyondExpected ? singleThreshStrict : singleThresh);
+         }
+         if(ss.slowScore<thresh){list.remove(i);}
+     }
+
+     return initialSize-list.size();
+ }
+
+
+
+ /**
+  * Unpaired counterpart of the two-pass filter: expectedSites is first increased by one for every perfect site found
+  * from index expectedSites/2 onward, then the surplus tail is trimmed and the remaining sites (all but index 0)
+  * are tested against maxSwScore*multSingle, scaled by 1.2 at index expectedSites or later.
+  * Assumes list is sorted by NON-PAIRED score.
+  * This has a couple of changes (like potentially removing the second-best site) that make it applicable to SKIMMER not MAPPER.
+  * @return Number of sites removed.
+  */
+ public static final int removeLowQualitySitesUnpaired2(ArrayList<SiteScore> list, int maxSwScore, float multSingle, int expectedSites){
+     if(list==null || list.size()==0){return 0;}
+
+     //The start index is fixed before expectedSites begins to grow.
+     for(int i=expectedSites/2, n=list.size(); i<n; i++){
+         if(list.get(i).perfect){expectedSites++;}
+     }
+
+     final int initialSize=list.size();
+     final int baseThresh=(int)(maxSwScore*multSingle); //Change low-quality alignments to no-hits.
+     final int strictThresh=(int)(maxSwScore*multSingle*1.2f); //Change low-quality alignments to no-hits.
+     if(list.get(0).score<baseThresh){
+         list.clear();
+         return initialSize;
+     }
+     final int nthIndex=Tools.min(list.size(), expectedSites)-1;
+     final int nthBest=list.get(nthIndex).score-maxSwScore/64;
+
+     //Trim the surplus tail until a site close enough to the nth best is reached.
+     final int keepAtLeast=expectedSites*2;
+     int tail=list.size()-1;
+     while(tail>keepAtLeast && list.get(tail).slowScore<nthBest){
+         list.remove(tail);
+         tail--;
+     }
+
+     //Index 0 is never removed here.
+     for(int i=list.size()-1; i>=1; i--){
+         final SiteScore ss=list.get(i);
+         assert(ss.score==ss.slowScore);
+         assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!";
+         assert(ss.pairedScore<=0) : ss.toText(); //This was "==0" but that makes the assertion fire for negative values.
+         final int thresh;
+         if(i>=expectedSites){thresh=strictThresh;}
+         else{thresh=baseThresh;}
+         if(ss.slowScore<thresh){list.remove(i);}
+     }
+
+     return initialSize-list.size();
+ }
+
+
+// public static final void trimSitesBelowCutoff(ArrayList<SiteScore> ssl, int cutoff, boolean retainPaired){
+// trimSitesBelowCutoff(ssl, cutoff, retainPaired, 1);
+// }
+
+
+// public static final void trimSitesBelowCutoff(ArrayList<SiteScore> ssl, int cutoff, boolean retainPaired, int minSitesToRetain){
+//// assert(false);
+// assert(minSitesToRetain>=1);
+// if(ssl==null || ssl.size()<minSitesToRetain){return;}
+//
+// ArrayList<SiteScore> ssl2=new ArrayList<SiteScore>(ssl.size());
+// for(SiteScore ss : ssl){
+// if(ss.score>=cutoff || (retainPaired && ss.pairedScore>0)){
+// ssl2.add(ss);
+// }
+// }
+//
+//// Collections.sort(ssl2);
+//// System.err.println("Cutoff: "+cutoff);
+//// for(SiteScore ss : ssl2){
+//// System.err.print("("+ss.chrom+", "+ss.score+"), ");
+//// }
+//// System.err.println();
+//
+// if(ssl2.size()==ssl.size()){return;}
+//// System.err.println("cutoff: "+cutoff+",\tsize: "+ssl.size()+" -> "+ssl2.size());
+// ssl.clear();
+// ssl.addAll(ssl2);
+// }
+
+
+ /**
+  * Truncates ssl to at most maxSitesToRetain entries, then removes sites scoring below cutoff, scanning from the end,
+  * until only minSitesToRetain would remain.
+  * @param retainPaired If true, sites with a positive pairedScore are kept regardless of score.
+  * @param retainSemiperfect If true, semiperfect sites are kept regardless of score.
+  * @param minSitesToRetain Lower bound on the resulting list size; must be at least 1.
+  * @param maxSitesToRetain Upper bound on the resulting list size; must exceed minSitesToRetain.
+  */
+ public static final void trimSitesBelowCutoff(ArrayList<SiteScore> ssl, int cutoff, boolean retainPaired, boolean retainSemiperfect,
+         int minSitesToRetain, int maxSitesToRetain){
+     assert(minSitesToRetain>=1);
+     assert(maxSitesToRetain>minSitesToRetain) : maxSitesToRetain+", "+minSitesToRetain+"\nError - maxsites2 must be greater than "+minSitesToRetain+"!";
+     if(ssl==null || ssl.size()<=minSitesToRetain){return;}
+     for(int last=ssl.size()-1; last>=maxSitesToRetain; last--){
+         ssl.remove(last);
+     }
+
+     final int maxToRemove=ssl.size()-minSitesToRetain;
+     int removed=0;
+
+     assert(minSitesToRetain==1 || inOrder(ssl));
+
+     for(int i=ssl.size()-1; i>=0; i--){
+         final SiteScore ss=ssl.get(i);
+         if(retainSemiperfect && ss.semiperfect){continue;}
+         if(ss.score>=cutoff){continue;}
+         if(retainPaired && ss.pairedScore>0){continue;}
+
+         //Mark for removal; the list is compacted once at the end.
+         ssl.set(i, null);
+         removed++;
+         if(removed>=maxToRemove){
+             assert(removed==maxToRemove);
+             break;
+         }
+     }
+
+     if(removed>0){
+         condenseStrict(ssl);
+     }
+     assert(ssl.size()>=minSitesToRetain);
+ }
+
+ //Messes up order
+// public static final void trimSitesBelowCutoffInplace(ArrayList<SiteScore> ssl, int cutoff, boolean retainPaired){
+//// assert(false);
+// if(ssl==null || ssl.size()<2){return;}
+//
+// for(int i=0; i<ssl.size(); i++){
+// SiteScore ss=ssl.get(i);
+// if(ss.score<cutoff && (!retainPaired || ss.pairedScore==0)){
+// SiteScore temp=ssl.remove(ssl.size()-1);
+// if(i<ssl.size()){
+// ssl.set(i, temp);
+// i--;
+// }
+// }
+// }
+// }
+
+ /**
+  * Renders a byte array as its numeric Arrays.toString form and, when the first byte is in the range 32-126,
+  * appends a newline followed by the leading run of bytes in that range as characters.
+  * @return "null" for a null array, otherwise the assembled text.
+  */
+ public static CharSequence toStringSafe(byte[] array){
+     if(array==null){return "null";}
+     final StringBuilder sb=new StringBuilder();
+     sb.append(Arrays.toString(array));
+     final boolean startsPrintable=(array.length>=1 && array[0]>=32 && array[0]<=126);
+     if(!startsPrintable){return sb;}
+     sb.append('\n');
+     for(final byte b : array){
+         if(b<32 || b>126){break;}
+         sb.append((char)b);
+     }
+     return sb;
+ }
+
+ /**
+  * Element-wise equality of two int arrays.
+  * @return true if both are the same reference (including both null) or have equal length and contents.
+  */
+ public static boolean equals(int[] a, int[] b){
+     return Arrays.equals(a, b);
+ }
+
+ /**
+  * Element-wise equality of two byte arrays.
+  * @return true if both are the same reference (including both null) or have equal length and contents.
+  */
+ public static boolean equals(byte[] a, byte[] b){
+     return Arrays.equals(a, b);
+ }
+
+ /**
+ * @param array
+ * @param s
+ * @return True if the array starts with the String.
+ */
+ public static boolean startsWith(byte[] array, String s) {
+ if(array==null || s==null || array.length<s.length()){return false;}
+ for(int i=0; i<s.length(); i++){
+ if(array[i]!=s.charAt(i)){return false;}
+ }
+ return true;
+ }
+
+ /**
+  * Lexicographic comparison of two byte arrays using signed byte values; null sorts before non-null.
+  * @return 0 for the same reference, the difference of the first mismatching bytes, or else the difference in lengths.
+  */
+ public static int compare(byte[] a, byte[] b){
+     if(a==b){return 0;}
+     if(a==null){return -1;}
+     if(b==null){return 1;}
+     final int shared=min(a.length, b.length);
+     int i=0;
+     while(i<shared && a[i]==b[i]){i++;}
+     if(i<shared){return a[i]-b[i];}
+     return a.length-b.length;
+ }
+
+ /** Sums the (signed) bytes of array into a long and returns the total narrowed to int; asserts that it fits. */
+ public static int sumInt(byte[] array){
+     long total=0;
+     for(int i=0; i<array.length; i++){
+         total+=array[i];
+     }
+     assert(total<=Integer.MAX_VALUE && total>=Integer.MIN_VALUE) : total;
+     return (int)total;
+ }
+
+ /** Returns the total of all (signed) bytes in array. */
+ public static long sum(byte[] array){
+     long total=0;
+     for(int i=0; i<array.length; i++){
+         total+=array[i];
+     }
+     return total;
+ }
+
+ /** Returns the total of the numeric values of all chars in array. */
+ public static long sum(char[] array){
+     long total=0;
+     for(int i=0; i<array.length; i++){
+         total+=array[i];
+     }
+     return total;
+ }
+
+ /** Returns the total of all shorts in array. */
+ public static long sum(short[] array){
+     long total=0;
+     for(int i=0; i<array.length; i++){
+         total+=array[i];
+     }
+     return total;
+ }
+
+ /** Returns the number of nonzero elements in array. */
+ public static int cardinality(short[] array){
+     int nonzero=0;
+     for(int i=0; i<array.length; i++){
+         if(array[i]!=0){nonzero++;}
+     }
+     return nonzero;
+ }
+
+ /** Returns the total of all ints in array, accumulated as a long. */
+ public static long sum(int[] array){
+     long total=0;
+     for(int i=0; i<array.length; i++){
+         total+=array[i];
+     }
+     return total;
+ }
+
+ /** Returns the arithmetic mean of array: its sum divided by its length, in double precision. */
+ public static double mean(int[] array){
+     final long total=sum(array);
+     final double count=array.length;
+     return total/count;
+ }
+
+ /**
+  * Returns array.length divided by the sum of reciprocals of the positive elements.
+  * Non-positive elements contribute nothing to the reciprocal sum but still count toward the length.
+  */
+ public static double harmonicMean(int[] array){
+     double reciprocals=0;
+     for(int i=0; i<array.length; i++){
+         final int x=array[i];
+         if(x>0){reciprocals+=1.0/x;}
+     }
+     return array.length/reciprocals;
+ }
+
+ /** Returns the number of nonzero elements in array. */
+ public static int cardinality(int[] array){
+     int nonzero=0;
+     for(int i=0; i<array.length; i++){
+         if(array[i]!=0){nonzero++;}
+     }
+     return nonzero;
+ }
+
+ /** Returns the total of array[from] through array[to], both ends inclusive; 0 if from is greater than to. */
+ public static long sum(int[] array, int from, int to){
+     long total=0;
+     int i=from;
+     while(i<=to){
+         total+=array[i];
+         i++;
+     }
+     return total;
+ }
+
+ /** Returns the total of all longs in array. */
+ public static long sum(long[] array){
+     long total=0;
+     for(int i=0; i<array.length; i++){
+         total+=array[i];
+     }
+     return total;
+ }
+
+ /** Returns the total of array[from] through array[to], both ends inclusive; 0 if from is greater than to. */
+ public static long sum(long[] array, int from, int to){
+     long total=0;
+     int i=from;
+     while(i<=to){
+         total+=array[i];
+         i++;
+     }
+     return total;
+ }
+
+ /** Returns the sum over all indices i of i*array[i], i.e. the total of the values a histogram of counts represents. */
+ public static long sumHistogram(long[] array){
+     long total=0;
+     int i=1; //Index 0 contributes nothing
+     while(i<array.length){
+         total+=(i*array[i]);
+         i++;
+     }
+     return total;
+ }
+
+ public static long minHistogram(long[] array){
+ for(int i=0; i<array.length; i++){
+ if(array[i]>0){return i;}
+ }
+ return 0;
+ }
+
+ public static long maxHistogram(long[] array){
+ for(int i=array.length-1; i>=0; i--){
+ if(array[i]>0){return i;}
+ }
+ return 0;
+ }
+
+ /** Returns the total of all elements, read one at a time with get(). */
+ public static long sum(AtomicIntegerArray array){
+     final int len=array.length();
+     long total=0;
+     for(int i=0; i<len; i++){
+         total+=array.get(i);
+     }
+     return total;
+ }
+
+ /** Returns the total of all elements, read one at a time with get(). */
+ public static long sum(AtomicLongArray array){
+     final int len=array.length();
+     long total=0;
+     for(int i=0; i<len; i++){
+         total+=array.get(i);
+     }
+     return total;
+ }
+
+ /** Copies the current contents of array into a new long[] of the same length. */
+ public static long[] toArray(AtomicLongArray array){
+     final int len=array.length();
+     final long[] copy=new long[len];
+     for(int i=0; i<len; i++){
+         copy[i]=array.get(i);
+     }
+     return copy;
+ }
+
+ /** Copies elements 0 through array.maxIndex (inclusive) into a new long[] of length maxIndex+1. */
+ public static long[] toArray(CoverageArray array){
+     final int len=array.maxIndex+1;
+     final long[] copy=new long[len];
+     for(int i=0; i<=array.maxIndex; i++){
+         copy[i]=array.get(i);
+     }
+     return copy;
+ }
+
+ /** Returns the smallest element of array, or Integer.MAX_VALUE if it is empty. */
+ public static int min(int[] array){
+     int smallest=Integer.MAX_VALUE;
+     for(int i=0; i<array.length; i++){
+         if(array[i]<smallest){smallest=array[i];}
+     }
+     return smallest;
+ }
+
+ /** Returns the smallest (signed) element of array, or Byte.MAX_VALUE if it is empty. */
+ public static byte min(byte[] array){
+     byte smallest=Byte.MAX_VALUE;
+     for(int i=0; i<array.length; i++){
+         if(array[i]<smallest){smallest=array[i];}
+     }
+     return smallest;
+ }
+
+ /** Returns the total of all elements using int arithmetic (no overflow check). */
+ public static int intSum(int[] array){
+     int total=0;
+     for(int i=0; i<array.length; i++){
+         total+=array[i];
+     }
+     return total;
+ }
+
+ /** Reverses the order of the elements of array in place; does nothing for null. */
+ public static void reverseInPlace(final byte[] array){
+     if(array==null){return;}
+     //Swap from both ends toward the middle.
+     for(int lo=0, hi=array.length-1; lo<hi; lo++, hi--){
+         final byte swap=array[lo];
+         array[lo]=array[hi];
+         array[hi]=swap;
+     }
+ }
+
+ /** Reverses the order of the elements of array in place; does nothing for null. */
+ public static void reverseInPlace(final char[] array){
+     if(array==null){return;}
+     //Swap from both ends toward the middle.
+     for(int lo=0, hi=array.length-1; lo<hi; lo++, hi--){
+         final char swap=array[lo];
+         array[lo]=array[hi];
+         array[hi]=swap;
+     }
+ }
+
+ /** Reverses the order of the elements of array in place; does nothing for null. */
+ public static void reverseInPlace(final int[] array){
+     if(array==null){return;}
+     //Swap from both ends toward the middle.
+     for(int lo=0, hi=array.length-1; lo<hi; lo++, hi--){
+         final int swap=array[lo];
+         array[lo]=array[hi];
+         array[hi]=swap;
+     }
+ }
+
+ /**
+  * Reverses the elements of array in the range [from, to) in place; does nothing for null.
+  * The previous implementation ignored "from" and always reversed [0, to); results for from==0 are unchanged.
+  * @param array Array to modify
+  * @param from First index of the range, inclusive
+  * @param to End of the range, exclusive (the last index swapped is to-1)
+  */
+ public static void reverseInPlace(final int[] array, final int from, final int to){
+     if(array==null){return;}
+     //Swap from both ends of the range toward its middle.
+     for(int lo=from, hi=to-1; lo<hi; lo++, hi--){
+         final int temp=array[lo];
+         array[lo]=array[hi];
+         array[hi]=temp;
+     }
+ }
+
+ /**
+  * Returns a reversed copy of array by delegating to reverseAndCopy(array, null),
+  * which allocates the output when passed null and returns null for a null input.
+  */
+ public static byte[] reverseAndCopy(final byte[] array){
+ //Earlier implementation (copy, then reverse in place), kept for reference:
+// if(array==null){return null;}
+// byte[] copy=Arrays.copyOf(array, array.length);
+// reverseInPlace(copy);
+// return copy;
+ return reverseAndCopy(array, null);
+ }
+
+ /**
+  * Returns a reversed copy of array by delegating to reverseAndCopy(array, null),
+  * which allocates the output when passed null and returns null for a null input.
+  */
+ public static int[] reverseAndCopy(final int[] array){
+ //Earlier implementation (copy, then reverse in place), kept for reference:
+// if(array==null){return null;}
+// int[] copy=Arrays.copyOf(array, array.length);
+// reverseInPlace(copy);
+// return copy;
+ return reverseAndCopy(array, null);
+ }
+
+ /**
+  * Writes the elements of array into out in reverse order.
+  * @param array Source; not modified.  If null, null is returned (and out is asserted to be null too).
+  * @param out Destination of the same length as array and distinct from it, or null to allocate a new one.
+  * @return The destination array.
+  */
+ public static byte[] reverseAndCopy(final byte[] array, byte[] out){
+     if(array==null){
+         assert(out==null);
+         return null;
+     }
+     if(out==null){out=new byte[array.length];}
+     assert(array.length==out.length && array!=out);
+     final int len=array.length;
+     for(int dst=0, src=len-1; dst<len; dst++, src--){
+         out[dst]=array[src];
+     }
+     return out;
+ }
+
+ /**
+  * Writes the elements of array into out in reverse order.
+  * @param array Source; not modified.  If null, null is returned (and out is asserted to be null too).
+  * @param out Destination of the same length as array and distinct from it, or null to allocate a new one.
+  * @return The destination array.
+  */
+ public static int[] reverseAndCopy(final int[] array, int[] out){
+     if(array==null){
+         assert(out==null);
+         return null;
+     }
+     if(out==null){out=new int[array.length];}
+     assert(array.length==out.length && array!=out);
+     final int len=array.length;
+     for(int dst=0, src=len-1; dst<len; dst++, src--){
+         out[dst]=array[src];
+     }
+     return out;
+ }
+
+ /**
+  * Nulls out the longest rows of data.  Row lengths are sorted and accumulated from the longest downward until
+  * the running total reaches (total length * fractionToExclude); every row longer than the length at the stopping
+  * position is then set to null.  The scan never goes below sorted position 1.
+  * @param data Rows to filter (null rows count as length 0); modified in place.  Null or empty input is a no-op.
+  * @param fractionToExclude Fraction of the total length to target; values at or below 0 disable culling.
+  */
+ public static void cullHighFreqEntries(int[][] data, float fractionToExclude){
+     if(fractionToExclude<=0){return;}
+     //Empty input previously fell through to count[-1].
+     if(data==null || data.length==0){return;}
+     final int[] count=new int[data.length];
+
+     long numBases=0;
+     for(int i=0; i<data.length; i++){
+         count[i]=(data[i]==null ? 0 : data[i].length);
+         numBases+=count[i];
+     }
+
+     //Kept as long: the total length of all rows can exceed the int range.
+     final long numIndicesToRemove=(long)(numBases*fractionToExclude);
+
+     Arrays.sort(count);
+
+     int pos=count.length-1;
+     //long accumulator so the running total cannot wrap negative and overrun the target.
+     for(long sum=0; pos>1 && sum<numIndicesToRemove; pos--){
+         sum+=count[pos];
+     }
+     final int maxLengthToKeep=count[pos];
+
+     for(int i=0; i<data.length; i++){
+         if(data[i]!=null && data[i].length>maxLengthToKeep){data[i]=null;}
+     }
+ }
+
+ /**
+  * Computes the row-length limit above which rows would be considered high-frequency.  Row lengths are sorted and
+  * accumulated from the longest downward until the running total reaches (total length * fractionToExclude);
+  * the length at the stopping position is returned.  The scan never goes below sorted position 1.
+  * @param data Rows to measure (null rows count as length 0); not modified.
+  * @param fractionToExclude Fraction of the total length to target.
+  * @return The maximum row length to keep, or Integer.MAX_VALUE when fractionToExclude is at or below 0
+  * or data is null or empty (nothing to exclude).
+  */
+ public static int findLimitForHighFreqEntries(int[][] data, float fractionToExclude){
+     if(fractionToExclude<=0){return Integer.MAX_VALUE;}
+     //Empty input previously fell through to count[-1].
+     if(data==null || data.length==0){return Integer.MAX_VALUE;}
+     final int[] count=new int[data.length];
+
+     long numBases=0;
+     for(int i=0; i<data.length; i++){
+         count[i]=(data[i]==null ? 0 : data[i].length);
+         numBases+=count[i];
+     }
+
+     //Kept as long: the total length of all rows can exceed the int range.
+     final long numIndicesToRemove=(long)(numBases*fractionToExclude);
+
+     Arrays.sort(count);
+
+     int pos=count.length-1;
+     //long accumulator so the running total cannot wrap negative and overrun the target.
+     for(long sum=0; pos>1 && sum<numIndicesToRemove; pos--){
+         sum+=count[pos];
+     }
+     return count[pos];
+ }
+
+ /**
+  * Sets to null every row of data that has at least minLength elements and is judged clumpy by
+  * isClumpy(row, maxDist, fraction).  Does nothing when maxDist is not positive.
+  */
+ public static void cullClumpyEntries(final int[][] data, final int maxDist, final int minLength, final float fraction){
+     if(maxDist<=0){return;}
+     //The original also tallied removed sites/keys for diagnostic printing that was commented out.
+     for(int i=0; i<data.length; i++){
+         final int[] row=data[i];
+         if(row==null || row.length<minLength){continue;}
+         if(isClumpy(row, maxDist, fraction)){
+             data[i]=null;
+         }
+     }
+ }
+
+ /**
+  * Collects the indices of every row of data that has at least minLength elements and is judged clumpy by
+  * isClumpy(row, maxDist, fraction).  The rows themselves are not modified.
+  * @return A new set of the banned row indices; empty when maxDist is not positive.
+  */
+ public static HashSet<Integer> banClumpyEntries(final int[][] data, final int maxDist, final int minLength, final float fraction){
+     final HashSet<Integer> banned=new HashSet<Integer>(128);
+     if(maxDist<=0){return banned;}
+
+     //The original also tallied banned sites/keys for diagnostic printing that was commented out.
+     for(int i=0; i<data.length; i++){
+         final int[] row=data[i];
+         if(row==null || row.length<minLength){continue;}
+         if(isClumpy(row, maxDist, fraction)){
+             banned.add(i);
+         }
+     }
+     return banned;
+ }
+
+ /**
+  * Counts adjacent pairs whose difference (later minus earlier) is at most maxDist and reports whether that
+  * count reaches array.length*fraction.
+  * @return false for a null array; otherwise the result of the comparison.
+  */
+ public static final boolean isClumpy(final int[] array, final int maxDist, final float fraction){
+     if(array==null){return false;}
+     int close=0;
+     for(int i=array.length-1; i>0; i--){
+         if(array[i]-array[i-1]<=maxDist){close++;}
+     }
+     return close>=(array.length*fraction);
+ }
+
+ /**
+  * Builds a (buckets+1)-element summary of the row lengths of x.  Lengths are sorted ascending; entry i holds the
+  * length at which the cumulative length first reaches roughly i/buckets of the total, and the final entry holds
+  * the longest length.  Null rows count as length 0.
+  */
+ public static int[] makeLengthHistogram(int[][] x, int buckets) {
+     final int[] lengths=new int[x.length];
+     long total=0;
+     for(int i=0; i<x.length; i++){
+         if(x[i]==null){continue;}
+         lengths[i]=x[i].length;
+         total+=x[i].length;
+     }
+     Arrays.sort(lengths);
+
+     final int[] hist=new int[buckets+1];
+
+     long running=0;
+     int next=0; //Index of the next sorted length to consume
+     for(int bucket=0; bucket<buckets; bucket++){
+         final long limit=((total*bucket)+buckets/2)/buckets;
+         for(; next<lengths.length && running<limit; next++){
+             running+=lengths[next];
+         }
+         hist[bucket]=lengths[Tools.max(0, next-1)];
+     }
+     hist[hist.length-1]=lengths[lengths.length-1];
+
+     return hist;
+ }
+
+ /**
+  * Formats x with two decimals and a magnitude suffix: "T" (divided by 10^12) above 10^13, "B" (10^9) above 10^10,
+  * "M" (10^6) above 10^7, "K" (10^3) above 10^5, and no suffix otherwise.
+  */
+ public static String toKMG(long x){
+     final double div;
+     final String ext;
+     if(x>10000000000000L){
+         div=1000000000000L;
+         ext="T";
+     }else if(x>10000000000L){
+         div=1000000000L;
+         ext="B";
+     }else if(x>10000000){
+         div=1000000;
+         ext="M";
+     }else if(x>100000){
+         div=1000;
+         ext="K";
+     }else{
+         div=1;
+         ext="";
+     }
+     final String number=String.format("%.2f", x/div);
+     return number+ext;
+ }
+
+ /**
+  * Builds a 128-entry byte translation table from a string of character pairs: for each pair "xy",
+  * entry x is set to y; all other entries map to themselves.
+  * @param b Pairs of characters, or null/"f"/"false" (any case) for no remapping.
+  * @return The table, or null when no remapping was requested.
+  */
+ public static byte[] parseRemap(String b){
+     if(b==null || "f".equalsIgnoreCase(b) || "false".equalsIgnoreCase(b)){
+         return null;
+     }
+     assert((b.length()&1)==0) : "Length of remap argument must be even. No whitespace is allowed.";
+
+     final byte[] table=new byte[128];
+     for(int i=0; i<table.length; i++){
+         table[i]=(byte)i; //Identity mapping by default
+     }
+     for(int i=0, len=b.length(); i<len; i+=2){
+         final char from=b.charAt(i);
+         final char to=b.charAt(i+1);
+         table[from]=(byte)to;
+     }
+     return table;
+ }
+
+ public static long parseKMG(String b){
+ if(b==null){return 0;}
+ char c=Character.toLowerCase(b.charAt(b.length()-1));
+ boolean dot=b.indexOf('.')>=0;
+ if(!Character.isLetter(c) && !dot){return Long.parseLong(b);}
+
+ long mult=1;
+ if(Character.isLetter(c)){
+ if(c=='k'){mult=1000;}
+ else if(c=='m'){mult=1000000;}
+ else if(c=='g' || c=='b'){mult=1000000000;}
+ else if(c=='t'){mult=1000000000000L;}
+ else{throw new RuntimeException(b);}
+ b=b.substring(0, b.length()-1);
+ }
+
+ if(!dot){return Long.parseLong(b)*mult;}
+
+ return (long)(Double.parseDouble(b)*mult);
+ }
+
+ /** Returns true if s is non-empty and its first character is a digit, '.', or '-'.  Only the first character is examined. */
+ public static boolean isNumber(String s){
+     if(s==null || s.length()==0){return false;}
+     final char first=s.charAt(0);
+     if(first=='.' || first=='-'){return true;}
+     return Character.isDigit(first);
+ }
+
+ /**
+  * Lenient boolean parser.  Null or empty yields true; a single character is true only for 't' (any case) or '1';
+  * "null" and "none" (any case) yield false; anything else is handed to Boolean.parseBoolean.
+  */
+ public static boolean parseBoolean(String s){
+     if(s==null || s.length()<1){return true;}
+     if(s.length()>1){
+         final boolean explicitNone=(s.equalsIgnoreCase("null") || s.equalsIgnoreCase("none"));
+         return explicitNone ? false : Boolean.parseBoolean(s);
+     }
+     final char c=Character.toLowerCase(s.charAt(0));
+     return c=='t' || c=='1';
+ }
+
+ /**
+  * Splits s on regex and parses each piece with Integer.parseInt.
+  * @return The parsed values in order, or null if s is null.
+  */
+ public static int[] parseIntArray(String s, String regex){
+     if(s==null){return null;}
+     final String[] pieces=s.split(regex);
+     final int[] values=new int[pieces.length];
+     int i=0;
+     for(final String piece : pieces){
+         values[i]=Integer.parseInt(piece);
+         i++;
+     }
+     return values;
+ }
+
+ /**
+  * Splits s on regex and parses each piece with Byte.parseByte.
+  * @return The parsed values in order, or null if s is null.
+  */
+ public static byte[] parseByteArray(String s, String regex){
+     if(s==null){return null;}
+     final String[] pieces=s.split(regex);
+     final byte[] values=new byte[pieces.length];
+     int i=0;
+     for(final String piece : pieces){
+         values[i]=Byte.parseByte(piece);
+         i++;
+     }
+     return values;
+ }
+
+ /**
+  * Parses an integer written in decimal or with a radix prefix: "0x"/"0X" (hex), "0b"/"0B" (binary), "0o"/"0O" (octal).
+  * A leading '-' or '+' may precede the prefix.  Text without a recognized prefix is parsed as decimal.
+  * The previous version examined the wrong character, so the prefix was never recognized, and it never
+  * stripped the prefix before parsing.
+  * @param s Text to parse
+  * @return The parsed value, or 0 for null or empty input.
+  * @throws NumberFormatException if the digits are not valid for the detected radix.
+  */
+ public static int parseIntHexDecOctBin(final String s){
+     if(s==null || s.length()<1){return 0;}
+     final char first=s.charAt(0);
+     final boolean negative=(first=='-');
+     final int p=((negative || first=='+') ? 1 : 0); //Position of the first character after any sign
+
+     int radix=10;
+     //A prefix needs '0', a radix letter, and at least one digit after it.
+     if(s.length()>p+2 && s.charAt(p)=='0'){
+         final char c=s.charAt(p+1);
+         if(c=='x' || c=='X'){radix=16;}
+         else if(c=='b' || c=='B'){radix=2;}
+         else if(c=='o' || c=='O'){radix=8;}
+     }
+     if(radix==10){return Integer.parseInt(s, radix);}
+
+     //Integer.parseInt does not understand prefixes; re-attach the sign so Integer.MIN_VALUE stays parseable.
+     final String digits=s.substring(p+2);
+     return Integer.parseInt(negative ? "-"+digits : digits, radix);
+ }
+
+ /**
+  * Parses the ASCII decimal digits in array[a] through array[b-1] as an int, with an optional leading '-'.
+  * Non-digit bytes are only detected when assertions are enabled; overflow is not checked.
+  * @param array Bytes holding the number
+  * @param a Start index, inclusive
+  * @param b Stop index, exclusive; must be greater than a
+  */
+ public static int parseInt(byte[] array, int a, int b){
+     assert(b>a);
+     final boolean negative=(array[a]=='-');
+     int pos=(negative ? a+1 : a);
+     int value=0;
+     while(pos<b){
+         final int digit=array[pos]-'0';
+         assert(digit<10 && digit>=0) : digit+" = "+(char)array[pos]+"\narray="+new String(array)+", start="+pos+", stop="+b;
+         value=(value*10)+digit;
+         pos++;
+     }
+     return negative ? -value : value;
+ }
+
+ /**
+  * Parses the ASCII decimal digits in array[a] through array[b-1] as a long, with an optional leading '-'.
+  * Non-digit bytes are only detected when assertions are enabled; overflow is not checked.
+  * @param array Bytes holding the number
+  * @param a Start index, inclusive
+  * @param b Stop index, exclusive; must be greater than a
+  */
+ public static long parseLong(byte[] array, int a, int b){
+     assert(b>a);
+     final boolean negative=(array[a]=='-');
+     int pos=(negative ? a+1 : a);
+     long value=0;
+     while(pos<b){
+         final int digit=array[pos]-'0';
+         assert(digit<10 && digit>=0) : digit+" = "+(char)array[pos]+"\narray="+new String(array)+", start="+pos+", stop="+b;
+         value=(value*10)+digit;
+         pos++;
+     }
+     return negative ? -value : value;
+ }
+
+ /**
+  * Builds a (buckets+1)-element summary of the values in x.  A sorted copy is walked ascending; entry i holds the
+  * value at which the cumulative sum first reaches roughly i/buckets of the total, and the final entry holds the maximum.
+  * TODO: This (temporarily) uses a lot of memory. Could be reduced by making an array of length max(x) and counting occurrences.
+  * @param verbose If true, prints size, min, median, max and total to System.out.
+  */
+ public static int[] makeLengthHistogram2(int[] x, int buckets, boolean verbose) {
+     final int[] sorted=Arrays.copyOf(x, x.length);
+     final long total=sum(x);
+     Arrays.sort(sorted);
+
+     if(verbose){
+         System.out.println("Length array size:\t"+x.length);
+         System.out.println("Min value: \t"+sorted[0]);
+         System.out.println("Med value: \t"+sorted[sorted.length/2]);
+         System.out.println("Max value: \t"+sorted[sorted.length-1]);
+         System.out.println("Total: \t"+total);
+     }
+
+     final int[] hist=new int[buckets+1];
+
+     long running=0;
+     int next=0; //Index of the next sorted value to consume
+     for(int bucket=0; bucket<buckets; bucket++){
+         final long limit=((total*bucket)+buckets/2)/buckets;
+         for(; next<sorted.length && running<limit; next++){
+             running+=sorted[next];
+         }
+         hist[bucket]=sorted[Tools.max(0, next-1)];
+     }
+     hist[hist.length-1]=sorted[sorted.length-1];
+
+     return hist;
+ }
+
+ /**
+  * Histogram builder that first tallies how often each value occurs and hands the tallies to makeLengthHistogram4.
+  * When the largest value exceeds x.length, it prints a notice and falls back to makeLengthHistogram2.
+  * Negative values are ignored when tallying.
+  */
+ public static int[] makeLengthHistogram3(int[] x, int buckets, boolean verbose) {
+     final int largest=max(x);
+     if(largest>x.length){
+         Data.sysout.println("Reverted to old histogram mode.");
+         return makeLengthHistogram2(x, buckets, verbose);
+     }
+
+     final int[] counts=new int[largest+1];
+     long total=0;
+     for(final int value : x){
+         if(value<0){continue;}
+         counts[value]++;
+         total+=value;
+     }
+
+     return makeLengthHistogram4(counts, buckets, total, verbose);
+ }
+
+ /**
+  * Builds a (buckets+1)-element summary from counts of occurrences of lengths rather than raw lengths.
+  * counts[len] is the number of items of length len.  Entry i of the result is the length at which the
+  * cumulative length first reaches roughly i/buckets of the total; the final entry is counts.length-1.
+  * @param counts Occurrence count per length
+  * @param buckets Number of quantile entries to produce
+  * @param total Sum of all lengths; recomputed from counts when not positive
+  * @param verbose If true, prints the total to System.err
+  */
+ public static int[] makeLengthHistogram4(int[] counts, int buckets, long total, boolean verbose) {
+     if(total<=0){
+         total=0;
+         for(int i=1; i<counts.length; i++){
+             //Widen before multiplying; an int product overflows once length*count passes 2^31.
+             total+=((long)i*counts[i]);
+         }
+     }
+
+     if(verbose){
+         System.err.println("Total: \t"+total);
+     }
+
+     final int[] hist=new int[buckets+1];
+
+     long sum=0;
+     int ptr=0;
+     for(int i=0; i<buckets; i++){
+         final long nextLimit=((total*i)+buckets/2)/buckets;
+         while(ptr<counts.length && sum<nextLimit){
+             sum+=(long)counts[ptr]*ptr; //Same widening as above
+             ptr++;
+         }
+
+         hist[i]=Math.max(0, ptr-1);
+     }
+     hist[hist.length-1]=counts.length-1;
+
+     return hist;
+ }
+
+ /**
+  * Mean of the array elements, truncated to an int.
+  * @param array Values to average; may be null or empty
+  * @return sum(array)/array.length, or 0 when array is null or empty
+  */
+ public static int average(short[] array) {
+     if(array==null || array.length==0){return 0;}
+     return (int)(sum(array)/array.length);
+ }
+
+ /**
+  * Mean of the array elements, truncated to an int.
+  * @param array Values to average; may be null or empty
+  * @return sum(array)/array.length, or 0 when array is null or empty
+  */
+ public static int average(int[] array) {
+     if(array==null || array.length==0){return 0;}
+     return (int)(sum(array)/array.length);
+ }
+
+ public static int median(int[] array){return percentile(array, .5);}
+
+ public static int median(long[] array){return percentile(array, .5);}
+
+ /**
+  * Finds the first index at which the cumulative sum of array reaches the given
+  * fraction of the total sum.
+  * @param array Values (for example a histogram); may be null or empty
+  * @param fraction Fraction of the total sum to reach
+  * @return The index, array.length-1 if the target is never reached, or 0 for null/empty input
+  */
+ public static int percentile(int[] array, double fraction){
+     if(array==null || array.length<1){return 0;}
+     final long target=(long)(sum(array)*fraction);
+     long cumulative=0;
+     int idx=0;
+     while(idx<array.length){
+         cumulative+=array[idx];
+         if(cumulative>=target){break;}
+         idx++;
+     }
+     return idx<array.length ? idx : array.length-1;
+ }
+
+ /**
+  * Finds the first index at which the cumulative sum of array reaches the given
+  * fraction of the total sum.
+  * @param array Values (for example a histogram); may be null or empty
+  * @param fraction Fraction of the total sum to reach
+  * @return The index, array.length-1 if the target is never reached, or 0 for null/empty input
+  */
+ public static int percentile(long[] array, double fraction){
+     if(array==null || array.length<1){return 0;}
+     final long target=(long)(sum(array)*fraction);
+     long cumulative=0;
+     int idx=0;
+     while(idx<array.length){
+         cumulative+=array[idx];
+         if(cumulative>=target){break;}
+         idx++;
+     }
+     return idx<array.length ? idx : array.length-1;
+ }
+
+ /**
+  * Returns the index holding the largest value. Ties are broken in favor of the
+  * index closest to the median index (as computed by percentile(array, 0.5)).
+  * @param array Counts indexed by value; may be null or empty
+  * @return Index of the mode, or 0 for null/empty input
+  */
+ public static int calcMode(long array[]){
+     if(array==null || array.length<1){return 0;}
+     final int median=percentile(array, 0.5);
+     int best=0;
+     long bestCount=array[0];
+     for(int i=1; i<array.length; i++){
+         final long count=array[i];
+         final boolean higher=(count>bestCount);
+         final boolean tieButCloser=(count==bestCount && absdif(i, median)<absdif(best, median));
+         if(higher || tieButCloser){
+             best=i;
+             bestCount=count;
+         }
+     }
+     return best;
+ }
+
+ /** Absolute difference of two ints (larger minus smaller). */
+ public static int absdif(int a, int b) {
+     if(a>b){return a-b;}
+     return b-a;
+ }
+
+ /** Absolute difference of two floats (larger minus smaller). */
+ public static float absdif(float a, float b) {
+     if(a>b){return a-b;}
+     return b-a;
+ }
+
+ /** Absolute difference of two doubles (larger minus smaller). */
+ public static double absdif(double a, double b) {
+     if(a>b){return a-b;}
+     return b-a;
+ }
+
+ /**
+  * Absolute difference of a and b when both have the same sign;
+  * Integer.MAX_VALUE when their signs differ.
+  */
+ public static final int absdifUnsigned(int a, int b){
+     if((a<0)!=(b<0)){return Integer.MAX_VALUE;}
+     return a>b ? a-b : b-a;
+ }
+
+ /**
+  * Tests whether the inclusive ranges [a1, b1] and [a2, b2] share at least one point.
+  * Asserts that each range is ordered (a<=b).
+  */
+ public static final boolean overlap(int a1, int b1, int a2, int b2){
+     assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+     final boolean disjoint=(a2>b1 || b2<a1);
+     return !disjoint;
+ }
+
+ /**
+  * Number of points shared by the inclusive ranges [a1, b1] and [a2, b2].
+  * @return Length of the intersection, or 0 if the ranges do not overlap
+  */
+ public static final int overlapLength(int a1, int b1, int a2, int b2){
+     if(!overlap(a1,b1,a2,b2)){return 0;}
+     //Intersection runs from the later start to the earlier end, inclusive
+     final int from=Math.max(a1, a2);
+     final int to=Math.min(b1, b2);
+     return to-from+1;
+ }
+
+ /**
+  * Is (a1, b1) within (a2, b2) ?
+  * Both ranges are treated as inclusive; asserts that each is ordered (a<=b).
+  */
+ public static final boolean isWithin(int a1, int b1, int a2, int b2){
+     assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+     if(a1<a2){return false;}
+     return b1<=b2;
+ }
+
+ /**
+  * Clamps point into the inclusive range [a, b]. Asserts a<=b.
+  * @return a if point is below a, b if point is above b, otherwise point
+  */
+ public static final int constrict(int point, int a, int b){
+     assert(a<=b);
+     if(point<a){return a;}
+     if(point>b){return b;}
+     return point;
+ }
+
+ /**
+  * Linear search for the first occurrence of b.
+  * @return Index of the first element equal to b, or -1 if absent
+  */
+ public static final int indexOf(byte[] array, byte b){
+     for(int pos=0; pos<array.length; pos++){
+         if(array[pos]==b){return pos;}
+     }
+     return -1;
+ }
+
+ /**
+  * Linear search for the first occurrence of b.
+  * @return Index of the first element equal to b, or -1 if absent
+  */
+ public static final int indexOf(char[] array, char b){
+     for(int pos=0; pos<array.length; pos++){
+         if(array[pos]==b){return pos;}
+     }
+     return -1;
+ }
+
+ /**
+  * Linear search, from the end, for the last occurrence of b.
+  * @return Index of the last element equal to b, or -1 if absent
+  */
+ public static final int lastIndexOf(byte[] array, byte b){
+     for(int pos=array.length-1; pos>=0; pos--){
+         if(array[pos]==b){return pos;}
+     }
+     return -1;
+ }
+
+ /**
+  * Number of characters needed to print x in decimal, including a leading '-'
+  * for negative values.
+  * @param x Any long
+  * @return Printed length of x
+  */
+ public static final int stringLength(long x){
+     if(x<0){
+         //-Long.MIN_VALUE overflows back to itself, so handle it explicitly (19 digits plus sign)
+         if(x==Long.MIN_VALUE){return 20;}
+         return lengthOf(-x)+1;
+     }
+     return lengthOf(x);
+ }
+
+ /**
+  * Number of characters needed to print x in decimal, including a leading '-'
+  * for negative values.
+  * @param x Any int
+  * @return Printed length of x
+  */
+ public static final int stringLength(int x){
+     if(x<0){
+         //-Integer.MIN_VALUE overflows back to itself, so handle it explicitly (10 digits plus sign)
+         if(x==Integer.MIN_VALUE){return 11;}
+         return lengthOf(-x)+1;
+     }
+     return lengthOf(x);
+ }
+
+ /**
+  * Number of decimal digits in a non-negative int, found by scanning the ilens
+  * threshold table. Asserts x>=0.
+  */
+ public static final int lengthOf(int x){
+     assert(x>=0);
+     int digits=1;
+     for(; x>ilens[digits]; digits++){
+         //Advance until the threshold for this digit count is at least x
+     }
+     return digits;
+ }
+
+ /**
+  * Number of decimal digits in a non-negative long, found by scanning the llens
+  * threshold table. Asserts x>=0.
+  */
+ public static final int lengthOf(long x){
+     assert(x>=0);
+     int digits=1;
+     for(; x>llens[digits]; digits++){
+         //Advance until the threshold for this digit count is at least x
+     }
+     return digits;
+ }
+
+ /** Largest element of a non-empty array. */
+ public static final int max(int[] array){
+     final int where=maxIndex(array);
+     return array[where];
+ }
+
+ /** Index of the first occurrence of the largest element of a non-empty array. */
+ public static final int maxIndex(int[] array){
+     int bestValue=array[0];
+     int bestIndex=0;
+     for(int i=1; i<array.length; i++){
+         final int candidate=array[i];
+         if(candidate>bestValue){
+             bestValue=candidate;
+             bestIndex=i;
+         }
+     }
+     return bestIndex;
+ }
+
+ /** Largest element of a non-empty array. */
+ public static final long max(long[] array){
+     final int where=maxIndex(array);
+     return array[where];
+ }
+
+ /** Index of the first occurrence of the largest element of a non-empty array. */
+ public static final int maxIndex(long[] array){
+     long bestValue=array[0];
+     int bestIndex=0;
+     for(int i=1; i<array.length; i++){
+         final long candidate=array[i];
+         if(candidate>bestValue){
+             bestValue=candidate;
+             bestIndex=i;
+         }
+     }
+     return bestIndex;
+ }
+
+ /** Largest element of a non-empty array. */
+ public static final double max(double[] array){
+     final int where=maxIndex(array);
+     return array[where];
+ }
+
+ /** Index of the first occurrence of the largest element of a non-empty array. */
+ public static final int maxIndex(double[] array){
+     double bestValue=array[0];
+     int bestIndex=0;
+     for(int i=1; i<array.length; i++){
+         final double candidate=array[i];
+         if(candidate>bestValue){
+             bestValue=candidate;
+             bestIndex=i;
+         }
+     }
+     return bestIndex;
+ }
+
+ /**
+  * Population standard deviation of the values (divides by the element count).
+  * @param numbers Values; may be null or empty
+  * @return Standard deviation, or 0 for null/empty input
+  */
+ public static final double standardDeviation(long[] numbers){
+     if(numbers==null || numbers.length<1){return 0;}
+     final int n=numbers.length;
+     final long total=sum(numbers);
+     final double mean=total/(double)n;
+     double squares=0;
+     for(final long value : numbers){
+         final double delta=mean-value;
+         squares+=(delta*delta);
+     }
+     return Math.sqrt(squares/n);
+ }
+
+ /**
+  * Population standard deviation of the values (divides by the element count).
+  * @param numbers Values; may be null or empty
+  * @return Standard deviation, or 0 for null/empty input
+  */
+ public static final double standardDeviation(int[] numbers){
+     if(numbers==null || numbers.length<1){return 0;}
+     final int n=numbers.length;
+     final long total=sum(numbers);
+     final double mean=total/(double)n;
+     double squares=0;
+     for(final long value : numbers){
+         final double delta=mean-value;
+         squares+=(delta*delta);
+     }
+     return Math.sqrt(squares/n);
+ }
+
+ /**
+  * Population standard deviation of the values (divides by the element count).
+  * @param numbers Values; may be null or empty
+  * @return Standard deviation, or 0 for null/empty input
+  */
+ public static final double standardDeviation(char[] numbers){
+     if(numbers==null || numbers.length<1){return 0;}
+     final int n=numbers.length;
+     final long total=sum(numbers);
+     final double mean=total/(double)n;
+     double squares=0;
+     for(final long value : numbers){
+         final double delta=mean-value;
+         squares+=(delta*delta);
+     }
+     return Math.sqrt(squares/n);
+ }
+
+ /**
+  * Population standard deviation of the values (divides by the element count).
+  * @param numbers Values; may be null or empty
+  * @return Standard deviation, or 0 for null/empty input
+  */
+ public static final double standardDeviation(short[] numbers){
+     if(numbers==null || numbers.length<1){return 0;}
+     final int n=numbers.length;
+     final long total=sum(numbers);
+     final double mean=total/(double)n;
+     double squares=0;
+     for(final long value : numbers){
+         final double delta=mean-value;
+         squares+=(delta*delta);
+     }
+     return Math.sqrt(squares/n);
+ }
+
+ /**
+  * Mean value of a histogram in which histogram[v] is the number of events with value v.
+  * The event count is clamped to at least 1 so an empty histogram yields 0 rather than NaN.
+  */
+ public static final double averageHistogram(long[] histogram){
+     final long events=max(1, sum(histogram));
+     long weighted=0;
+     for(int value=0; value<histogram.length; value++){
+         weighted+=(histogram[value]*value);
+     }
+     return weighted/(double)events;
+ }
+
+ /**
+  * Standard deviation of a histogram in which histogram[v] is the number of
+  * events with value v. The event count is clamped to at least 1.
+  * @param histogram Event counts indexed by value
+  * @return Population standard deviation of the values
+  */
+ public static final double standardDeviationHistogram(char[] histogram){
+     long sum=max(1, sum(histogram));
+     long sum2=0;
+     for(int i=0; i<histogram.length; i++){
+         //Widen before multiplying; char*int is an int product and can overflow for large indices
+         sum2+=((long)histogram[i]*i);
+     }
+     double avg=sum2/(double)sum;
+     double sumdev2=0;
+     for(int i=0; i<histogram.length; i++){
+         double dev=avg-i;
+         double dev2=dev*dev;
+         sumdev2+=(histogram[i]*dev2);
+     }
+     return Math.sqrt(sumdev2/sum);
+ }
+
+ /**
+  * Standard deviation of a histogram in which histogram[v] is the number of
+  * events with value v. The event count is clamped to at least 1.
+  * @param histogram Event counts indexed by value
+  * @return Population standard deviation of the values
+  */
+ public static final double standardDeviationHistogram(int[] histogram){
+     long sum=max(1, sum(histogram));
+     long sum2=0;
+     for(int i=0; i<histogram.length; i++){
+         //Widen before multiplying; int*int overflows once count*value exceeds 2^31-1
+         sum2+=((long)histogram[i]*i);
+     }
+     double avg=sum2/(double)sum;
+     double sumdev2=0;
+     for(int i=0; i<histogram.length; i++){
+         double dev=avg-i;
+         double dev2=dev*dev;
+         sumdev2+=(histogram[i]*dev2);
+     }
+     return Math.sqrt(sumdev2/sum);
+ }
+
+ /**
+  * Standard deviation of a histogram in which histogram[v] is the number of
+  * events with value v. The event count is clamped to at least 1.
+  * @param histogram Event counts indexed by value
+  * @return Population standard deviation of the values
+  */
+ public static final double standardDeviationHistogram(long[] histogram){
+     final long events=max(1, sum(histogram));
+     long weighted=0;
+     for(int value=0; value<histogram.length; value++){
+         weighted+=(histogram[value]*value);
+     }
+     final double mean=weighted/(double)events;
+     double squares=0;
+     for(int value=0; value<histogram.length; value++){
+         final double delta=mean-value;
+         final double delta2=delta*delta;
+         squares+=(histogram[value]*delta2);
+     }
+     return Math.sqrt(squares/events);
+ }
+
+ /**
+  * Special version that calculates standard deviation based on unique kmers rather than overall events.
+  * Each bin's count is divided by its index (index 0 is divided by 1) to weight it.
+  * Bin 0 contributes to the weight total but not to the squared-deviation sum.
+  */
+ public static final double standardDeviationHistogramKmer(long[] histogram){
+     final long events=sum(histogram);
+     double uniques=0;
+     for(int depth=0; depth<histogram.length; depth++){
+         final long count=histogram[depth];
+         uniques+=(count/(double)max(depth, 1));
+     }
+     final double mean=events/max(uniques, 1);
+     double squares=0;
+     for(int depth=1; depth<histogram.length; depth++){
+         final double delta=mean-depth;
+         final double delta2=delta*delta;
+         final long count=histogram[depth];
+         squares+=((count/(double)max(depth, 1))*delta2);
+     }
+     return Math.sqrt(squares/uniques);
+ }
+
+ /**
+  * Standard deviation of a histogram in which histogram.get(v) is the number of
+  * events with value v. The event count is clamped to at least 1.
+  * @param histogram Event counts indexed by value
+  * @return Population standard deviation of the values
+  */
+ public static final double standardDeviationHistogram(AtomicLongArray histogram){
+     final long events=max(1, sum(histogram));
+     long weighted=0;
+     for(int value=0; value<histogram.length(); value++){
+         weighted+=(histogram.get(value)*value);
+     }
+     final double mean=weighted/(double)events;
+     double squares=0;
+     for(int value=0; value<histogram.length(); value++){
+         final double delta=mean-value;
+         final double delta2=delta*delta;
+         squares+=(histogram.get(value)*delta2);
+     }
+     return Math.sqrt(squares/events);
+ }
+
+ /**
+  * Special version that calculates standard deviation based on unique kmers rather than overall events.
+  * Each bin's count is divided by its index (index 0 is divided by 1) to weight it.
+  * Bin 0 contributes to the weight total but not to the squared-deviation sum.
+  */
+ public static final double standardDeviationHistogramKmer(AtomicLongArray histogram){
+     final long events=sum(histogram);
+     double uniques=0;
+     for(int depth=0; depth<histogram.length(); depth++){
+         final long count=histogram.get(depth);
+         uniques+=(count/(double)max(depth, 1));
+     }
+     final double mean=events/max(uniques, 1);
+     double squares=0;
+     for(int depth=1; depth<histogram.length(); depth++){
+         final double delta=mean-depth;
+         final double delta2=delta*delta;
+         final long count=histogram.get(depth);
+         squares+=((count/(double)max(depth, 1))*delta2);
+     }
+     return Math.sqrt(squares/uniques);
+ }
+
+ /**
+  * Collapses array into a smaller number of bins by summing. Element i is added to
+  * output bin (int)(i*bins/array.length).
+  * @param array Source values; returned unchanged if null or already of length bins
+  * @param bins Number of output bins; asserted to be between 0 and array.length
+  * @return A new array of length bins, or the input array itself in the cases above
+  */
+ public static final long[] downsample(long[] array, int bins){
+     if(array==null || array.length==bins){return array;}
+     assert(bins<=array.length);
+     assert(bins>=0);
+     final long[] out=new long[bins];
+     if(bins==0){return out;}
+     final double scale=bins/(double)array.length;
+     int src=0;
+     for(final long value : array){
+         out[(int)(scale*src)]+=value;
+         src++;
+     }
+     return out;
+ }
+
+
+ /**
+  * Sleeps the current thread for the given number of milliseconds.
+  * An InterruptedException is not propagated; its stack trace is printed and the method returns.
+  */
+ public static final void pause(int millis){
+     try{
+         Thread.sleep(millis);
+     }catch(InterruptedException interrupted){
+         interrupted.printStackTrace();
+     }
+ }
+
+ public static final int min(int x, int y){return x<y ? x : y;}
+ public static final int max(int x, int y){return x>y ? x : y;}
+ public static final int min(int x, int y, int z){return x<y ? (x<z ? x : z) : (y<z ? y : z);}
+ public static final int max(int x, int y, int z){return x>y ? (x>z ? x : z) : (y>z ? y : z);}
+ public static final int min(int x, int y, int z, int z2){return min(min(x,y), min(z,z2));}
+ public static final int max(int x, int y, int z, int z2){return max(max(x,y), max(z,z2));}
+
+ //Median of 3
+ public static final int mid(int x, int y, int z){return x<y ? (x<z ? min(y, z) : x) : (y<z ? min(x, z) : y);}
+
+ public static final char min(char x, char y){return x<y ? x : y;}
+ public static final char max(char x, char y){return x>y ? x : y;}
+
+ public static final byte min(byte x, byte y){return x<y ? x : y;}
+ public static final byte max(byte x, byte y){return x>y ? x : y;}
+ public static final byte min(byte x, byte y, byte z){return x<y ? min(x, z) : min(y, z);}
+ public static final byte max(byte x, byte y, byte z){return x>y ? max(x, z) : max(y, z);}
+ public static final byte min(byte x, byte y, byte z, byte a){return min(min(x, y), min(z, a));}
+ public static final byte max(byte x, byte y, byte z, byte a){return max(max(x, y), max(z, a));}
+
+ public static final long min(long x, long y){return x<y ? x : y;}
+ public static final long max(long x, long y){return x>y ? x : y;}
+ public static final long min(long x, long y, long z){return x<y ? (x<z ? x : z) : (y<z ? y : z);}
+ public static final long max(long x, long y, long z){return x>y ? (x>z ? x : z) : (y>z ? y : z);}
+ public static final long min(long x, long y, long z, long z2){return min(min(x,y), min(z,z2));}
+ public static final long max(long x, long y, long z, long z2){return max(max(x,y), max(z,z2));}
+ public static final long mid(long x, long y, long z){return x<y ? (x<z ? min(y, z) : x) : (y<z ? min(x, z) : y);}
+ public static final int longToInt(long x){return x<Integer.MIN_VALUE ? Integer.MIN_VALUE : x>Integer.MAX_VALUE ? Integer.MAX_VALUE : (int)x;}
+
+ public static final double min(double x, double y){return x<y ? x : y;}
+ public static final double max(double x, double y){return x>y ? x : y;}
+ public static final double mid(double x, double y, double z){return x<y ? (x<z ? min(y, z) : x) : (y<z ? min(x, z) : y);}
+
+ public static final float min(float x, float y){return x<y ? x : y;}
+ public static final float max(float x, float y){return x>y ? x : y;}
+
+ /**
+  * Smallest element of array within the index range fromIndex..toIndex, both inclusive.
+  */
+ public static final int min(int[] array, int fromIndex, int toIndex){
+     int lowest=array[fromIndex];
+     int i=fromIndex+1;
+     while(i<=toIndex){
+         if(array[i]<lowest){lowest=array[i];}
+         i++;
+     }
+     return lowest;
+ }
+
+ /**
+  * Largest element of array within the index range fromIndex..toIndex, both inclusive.
+  */
+ public static final int max(int[] array, int fromIndex, int toIndex){
+     int highest=array[fromIndex];
+     int i=fromIndex+1;
+     while(i<=toIndex){
+         if(array[i]>highest){highest=array[i];}
+         i++;
+     }
+     return highest;
+ }
+
+ /**
+  * Index of the first occurrence of the smallest element.
+  * @param array Values; may be null or empty
+  * @return Index of the minimum, or -1 for null/empty input
+  */
+ public static int minIndex(int[] array) {
+     if(array==null || array.length<1){return -1;}
+     //Track the minimum as an int; a float cannot represent every int above 2^24,
+     //so comparing through float could miss a strictly smaller element.
+     int min=array[0];
+     int index=0;
+     for(int i=1; i<array.length; i++){
+         if(array[i]<min){
+             min=array[i];
+             index=i;
+         }
+     }
+     return index;
+ }
+
+ /**
+  * Draws a value by inverse-transform sampling: takes one uniform double p from randy
+  * and returns -ln(1-p)/lamda.
+  * @param randy Source of randomness; one call to nextDouble() is consumed
+  * @param lamda Rate parameter used as the divisor
+  * @return The sampled value
+  */
+ public static double exponential(Random randy, double lamda){
+     final double uniform=randy.nextDouble();
+     final double negLog=-Math.log(1-uniform);
+     return negLog/lamda;
+ }
+
+ /** Logarithm of d in base 2, computed as ln(d) times a precomputed 1/ln(2). */
+ public static double log2(double d){
+     final double natural=Math.log(d);
+     return natural*invlog2;
+ }
+
+ /** Logarithm of d in base sqrt(2), computed as ln(d) times a precomputed 1/ln(sqrt(2)). */
+ public static double logRoot2(double d){
+     final double natural=Math.log(d);
+     return natural*invlogRoot2;
+ }
+
+ /** Logarithm of d in base 1.2, computed as ln(d) times a precomputed 1/ln(1.2). */
+ public static double log1point2(double d){
+     final double natural=Math.log(d);
+     return natural*invlog1point2;
+ }
+
+ /** ln(2) */
+ private static final double log2=Math.log(2);
+ /** 1/ln(2) */
+ private static final double invlog2=1/log2;
+ /** ln(sqrt(2)) */
+ private static final double logRoot2=Math.log(Math.sqrt(2));
+ /** 1/ln(sqrt(2)) */
+ private static final double invlogRoot2=1/logRoot2;
+ /** ln(1.2) */
+ private static final double log1point2=Math.log(1.2);
+ /** 1/ln(1.2) */
+ private static final double invlog1point2=1/log1point2;
+
+ public static final char[] specialChars;
+
+ public static final int[] ilens;
+ public static final long[] llens;
+
+ /** A single whitespace */
+ public static final Pattern whitespace = Pattern.compile("\\s");
+ /** Multiple whitespace */
+ public static final Pattern whitespacePlus = Pattern.compile("\\s+");
+
+ static{
+ ilens=new int[Integer.toString(Integer.MAX_VALUE).length()+1];
+ llens=new long[Long.toString(Long.MAX_VALUE).length()+1];
+ for(int i=1, x=9; i<ilens.length; i++){
+ ilens[i]=x;
+ x=(x*10)+9;
+ }
+ ilens[ilens.length-1]=Integer.MAX_VALUE;
+ for(long i=1, x=9; i<llens.length; i++){
+ llens[(int)i]=x;
+ x=(x*10)+9;
+ }
+ llens[llens.length-1]=Long.MAX_VALUE;
+
+ specialChars=new char[256];
+ Arrays.fill(specialChars, 'X');
+ for(int i=0; i<32; i++){
+ specialChars[i]=' ';
+ }
+ for(int i=32; i<127; i++){
+ specialChars[i]=(char)i;
+ }
+ specialChars[127]=' ';
+ specialChars[128]='C';
+ specialChars[129]='u';
+ specialChars[130]='e';
+ specialChars[131]='a';
+ specialChars[132]='a';
+ specialChars[133]='a';
+ specialChars[134]='a';
+ specialChars[135]='c';
+ specialChars[136]='e';
+ specialChars[137]='e';
+ specialChars[138]='e';
+ specialChars[139]='i';
+ specialChars[140]='i';
+ specialChars[141]='i';
+ specialChars[142]='S';
+ specialChars[143]='S';
+ specialChars[144]='E';
+ specialChars[145]='a';
+ specialChars[146]='a';
+ specialChars[147]='o';
+ specialChars[148]='o';
+ specialChars[149]='o';
+ specialChars[150]='u';
+ specialChars[151]='u';
+ specialChars[152]='y';
+ specialChars[153]='O';
+ specialChars[154]='U';
+ specialChars[155]='c';
+ specialChars[156]='L';
+ specialChars[157]='Y';
+ specialChars[158]='P';
+ specialChars[159]='f';
+ specialChars[160]='a';
+ specialChars[161]='i';
+ specialChars[162]='o';
+ specialChars[163]='u';
+ specialChars[164]='n';
+ specialChars[165]='N';
+ specialChars[166]='a';
+ specialChars[167]='o';
+ specialChars[168]='?';
+ specialChars[224]='a';
+ specialChars[224]='B';
+ specialChars[230]='u';
+ specialChars[252]='n';
+ specialChars[253]='2';
+ }
+
+}
diff --git a/current/align2/TranslateColorspaceRead.java b/current/align2/TranslateColorspaceRead.java
new file mode 100755
index 0000000..752fec4
--- /dev/null
+++ b/current/align2/TranslateColorspaceRead.java
@@ -0,0 +1,2139 @@
+package align2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import stream.Read;
+import stream.SiteScore;
+import var.Variation;
+import var.Varlet;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+import dna.Timer;
+
+public final class TranslateColorspaceRead {
+
+ /**
+  * Creates a translator bound to the given aligner.
+  * @param msa Aligner stored in msaBS and used by the instance realign methods
+  */
+ public TranslateColorspaceRead(MSA msa){
+     this.msaBS=msa;
+ }
+
+ private static CharSequence toString(byte[][] crbmq) {
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<2; i++){
+ if(crbmq[i]==null){sb.append("null");}
+ else{
+ for(byte b : crbmq[i]){
+ if(b=='N'){sb.append('N');}
+ else{sb.append((char)(b+'0'));}
+ }
+ sb.append('\n');
+ }
+ }
+ sb.append(new String(crbmq[2]));
+ sb.append('\n');
+ sb.append(crbmq[3]==null ? "null" : new String(crbmq[3]));
+ sb.append('\n');
+ return sb;
+ }
+
+ private static String toStringCS(byte[] colors){
+ StringBuilder sb=new StringBuilder(colors.length);
+ for(byte b : colors){
+ if(b>3){sb.append((char)b);}
+ else{sb.append((char)(b+'0'));}
+ }
+ sb.append('\n');
+ return sb.toString();
+ }
+
+ /**
+  * Instance convenience wrapper: realigns r using this object's aligner (msaBS)
+  * by delegating to the static overload.
+  */
+ public void realignByReversingRef(final Read r, final int padding, final boolean recur){
+     TranslateColorspaceRead.realignByReversingRef(r, msaBS, padding, recur);
+ }
+
+ /** This aligns a read with the reference, and generates the match string.
+ * The read is updated in place: r.match, r.stop and r.mapScore are always rewritten,
+ * and r.start is rewritten when the slow (indel-aware) path runs.
+ * An ungapped score is tried first; if it reaches msa.maxImperfectScore the result is
+ * used directly, otherwise the read is aligned within a padded reference window.
+ * For reads not on the plus strand the reference window is reverse-complemented
+ * before alignment and the resulting coordinates are mapped back.
+ * @param r Read to realign; modified in place
+ * @param msa Aligner used for scoring and traceback
+ * @param padding Requested number of extra reference bases on each side; clamped internally
+ * @param recur If true, one additional attempt is made when the match string starts or ends with X/Y/I symbols
+ */
+ public static void realignByReversingRef(final Read r, final MSA msa, int padding, final boolean recur){
+ if(r.shortmatch()){ //Drop a short-form match string; a full one is generated below
+ r.match=null;
+ r.setShortMatch(false);
+ }
+// assert(r.colorspace());
+// assert(msa.colorspace);
+ padding=Tools.min(padding, (msa.maxColumns-r.length())/2-20); //Cap padding using msa.maxColumns and the read length
+ padding=Tools.max(padding, 0);
+ final ChromosomeArray chacs=Data.getChromosome(r.chrom);
+ if(verbose){
+ System.err.println("Realigning.");
+ System.err.println("Original: "+r.start+", "+r.stop+", "+Gene.strandCodes[r.strand()]);
+ }
+
+ {
+ assert(r.stop>=r.start); //Otherwise this is pointless...
+ int a=r.length();
+ int b=r.stop-r.start+1;
+ if(b<a){ //Mapped span is shorter than the read, so raise the minimum padding
+ int c=Tools.min(r.length(), a-b+10)/2;
+ padding=Tools.max(padding, c+1);
+ }
+ }
+ padding=Tools.min(padding, r.length()+10);
+ padding=Tools.min(padding, (msa.maxColumns-Tools.max(r.length(), GapTools.calcGrefLen(r.start, r.stop, r.gaps)))/2-1);
+
+// if(padding==4){System.err.print(".");}
+// else{
+// System.err.println("\npadding="+padding+", \trecur="+recur);
+// if(padding>10){
+// if(r.match!=null){System.err.print(new String(r.match));}
+// if(padding>20){System.err.print("\t"+r.start+", "+r.stop+", "+(r.stop-r.start+1));}
+// System.err.println();
+// }
+// }
+
+ final int maxQ=msa.maxQuality(r.length());
+ final int maxI=msa.maxImperfectScore(r.length());
+
+ if(r.strand()==Gene.PLUS){ //Plus strand: align directly against the chromosome array
+ assert(maxQ>maxI);
+
+ byte[][] matchR=new byte[1][];
+ if(r.match!=null && r.match.length==r.length()){
+ matchR[0]=r.match;
+ }else{
+ // System.err.println(new String(r.match));
+ matchR[0]=r.match=new byte[r.length()];
+ }
+ int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(r.bases, chacs.array, r.start, matchR);
+ r.match=matchR[0];
+
+ if(scoreNoIndel>=maxI){
+ if(verbose){System.err.println("Quick match.");}
+// assert(r.match[0]!='X') : r.toText(false);
+// assert(r.match[r.match.length-1]!='X') : r.toText(false);
+ // assert(r.stop==r.start+r.length()-1);
+ r.stop=r.start+r.length()-1;
+ r.mapScore=scoreNoIndel;
+ }else{
+ if(verbose){System.err.println("Slow match.");}
+
+// int minLoc=Tools.max(r.start-padding, chacs.minIndex);
+ int minLoc=Tools.max(r.start-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N'
+ int maxLoc=Tools.min(r.stop+padding, chacs.maxIndex);
+
+ //These assertions are not too important... they indicate the read mapped off the end of the chromosome.
+ assert(minLoc<=r.start) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+
+ ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false);
+ assert(maxLoc>=r.stop) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+
+ ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false);
+
+ // System.err.println("Aligning:\n"+new String(r.bases)+"\n"+chacs.getString(minLoc, maxLoc));
+ int[] max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, scoreNoIndel, r.gaps);
+ // System.err.println(Arrays.toString(max));
+ r.match=msa.traceback(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null);
+// System.err.println(new String(r.match));
+ int[] score=msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null);
+// System.err.println(Arrays.toString(score));
+ r.start=score[1];
+ r.stop=score[2];
+ r.mapScore=score[0];
+ // System.err.println(Arrays.toString(score));
+ // assert(false);
+ }
+ }else{ //Other strand: align against a reverse-complemented copy of the reference
+ assert(maxQ>maxI);
+
+ byte[][] matchR=new byte[1][];
+ if(r.match!=null && r.match.length==r.length()){
+ matchR[0]=r.match;
+ }else{
+ // System.err.println(new String(r.match));
+ matchR[0]=r.match=new byte[r.length()];
+ }
+
+ int scoreNoIndel=-9999; //Placeholder kept when the mapped span length differs from the read length
+ if(r.length()==(r.stop-r.start+1)){
+
+ byte[] ref=chacs.getBytes(r.start, r.stop);
+ AminoAcid.reverseComplementBasesInPlace(ref);
+ scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(r.bases, ref, 0, matchR);
+ r.match=matchR[0];
+ }
+
+ if(scoreNoIndel>=maxI){
+ if(verbose){System.err.println("Quick match.");}
+ assert(r.match[0]!='X') : r.toText(false);
+ assert(r.match[r.match.length-1]!='X') : r.toText(false);
+ r.stop=r.start+r.length()-1;
+ r.mapScore=scoreNoIndel;
+ }else{
+ if(verbose){System.err.println("Slow match.");}
+
+// int minLoc=Tools.max(r.start-padding, chacs.minIndex);
+ int minLoc=Tools.max(r.start-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N'
+ int maxLoc=Tools.min(r.stop+padding, chacs.maxIndex);
+
+ //These assertions are not too important... they indicate the read mapped off the end of the chromosome.
+ assert(minLoc<=r.start) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+
+ ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false);
+ assert(maxLoc>=r.stop) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+
+ ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false);
+
+ byte[] ref=chacs.getBytes(minLoc, maxLoc);
+ // System.err.println("Aligning:\n"+new String(r.bases)+"\n"+new String(ref));
+ AminoAcid.reverseComplementBasesInPlace(ref);
+
+ // System.err.println("Aligning:\n"+new String(r.bases)+"\n"+new String(ref));
+ int[] max=msa.fillLimited(r.bases, ref, 0, ref.length-1, scoreNoIndel, r.gaps);
+ // System.err.println(Arrays.toString(max));
+ r.match=msa.traceback(r.bases, ref, 0, ref.length-1, max[0], max[1], max[2], r.gaps!=null);
+// System.err.println(new String(r.match));
+ int[] score=msa.score(r.bases, ref, 0, ref.length-1, max[0], max[1], max[2], r.gaps!=null);
+// System.err.println(Arrays.toString(score));
+ // System.err.println(Arrays.toString(score));
+ // assert(false);
+
+ int start2=minLoc+(ref.length-score[2]-1); //Mirror offsets in the reversed window back to chromosome coordinates
+ int stop2=maxLoc-(score[1]);
+
+ r.start=start2;
+ r.stop=stop2;
+ r.mapScore=score[0];
+ }
+ }
+ if(verbose){System.err.println("Final: "+r.start+", "+r.stop+", "+Gene.strandCodes[r.strand()]);}
+
+ if(recur && r.stop<chacs.maxIndex && r.start>0 && (r.match[0]=='X' || r.match[0]=='I' ||
+ r.match[r.match.length-1]=='Y' || r.match[r.match.length-1]=='X' || r.match[r.match.length-1]=='I')){
+ int xy=0;
+ for(int i=0; i<r.match.length; i++){
+ byte b=r.match[i];
+ if(b=='X' || b=='Y' || b=='I'){xy++;}
+ }
+// System.err.println("xy = "+xy);
+ realignByReversingRef(r, msa, Tools.min(10+padding+2*xy, msa.maxColumns/2-r.length()-20), false); //Single retry (recur=false) with padding recomputed from the X/Y/I count
+ }
+// assert(r.mapScore>0) : padding+", "+recur+", "+r.mapScore+", "+r.strand()+", "+r.colorspace()+"\n"+r.toText(false);
+// assert(r.match[0]!='X') : r.toText(false);
+// assert(r.match[r.match.length-1]!='X') : r.toText(false);
+ }
+
+ /**
+  * Instance convenience wrapper: converts r to a SiteScore, realigns that site with
+  * this object's aligner (msaBS) via the static overload, then copies the result back
+  * into r. The recur flag is passed on as a depth of 1 or 0, and fixXY is always true.
+  */
+ public void realign_new(Read r, int padding, boolean recur, int minScore, boolean forbidIndels){
+     final SiteScore site=r.toSite();
+     final int recurDepth=(recur ? 1 : 0);
+     TranslateColorspaceRead.realign_new(site, r.bases, msaBS, padding, recurDepth, minScore, forbidIndels, true, r.numericID);
+     r.setFromSite(site);
+ }
+
+ /** For some reason realign was making the match string backwards... */
+ public static void realign_new(final SiteScore ss, final byte[] bases, final MSA msa, int padding, final int recur, int minValidScore,
+ boolean forbidIndels, boolean fixXY, final long id){
+ if(verbose){System.err.println("Calling realign_new on ss "+ss);}
+ if(ss.matchContainsXY()){ss.fixXY(bases, false, msa);} //This must run regardless of 'fixXY' or else an XY read could be semiperfect but not marked as such
+ ss.clipTipIndels(bases, 4, 10, msa);
+ if(verbose){System.err.println("After fixXY and clipTipIndels: "+ss);}
+ assert(Read.CHECKSITE(ss, bases, id));
+// final byte[] bases=ss.plus() ? basesP : basesM;
+
+ if(verbose){System.err.println("Padding = "+padding+"; msa.maxColumns = "+msa.maxColumns+"; maplen = "+(ss.stop()-ss.start()+1)+"; gaps = "+Arrays.toString(ss.gaps));}
+
+ assert(padding>=0) : padding+", id="+id+", "+ss;
+ padding=Tools.min(padding, (msa.maxColumns-bases.length)/2-20);
+ if(verbose){System.err.println("Padding = "+padding);}
+ assert(padding>=0) : padding+", id="+id+", "+ss;
+ padding=Tools.max(padding, 0);
+ if(verbose){System.err.println("Padding = "+padding);}
+ assert(padding>=0) : padding+", id="+id+", "+ss;
+
+
+ final ChromosomeArray chacs=Data.getChromosome(ss.chrom);
+ if(verbose){
+ System.err.println("Realigning.");
+ System.err.println("Original: "+ss.start()+", "+ss.stop()+", "+Gene.strandCodes[ss.strand()]);
+ if(verbose){System.err.println("F. Estimated greflen: "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));}
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+
+
+ {
+ int expectedLen=GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps);
+ if(expectedLen>msa.maxColumns-20){
+ //TODO: Alternately, I could kill the site.
+ ss.setStop(ss.start()+Tools.min(bases.length+40, msa.maxColumns-20));
+ if(ss.gaps!=null){ss.gaps=GapTools.fixGaps(ss.start(), ss.stop(), ss.gaps, Shared.MINGAP);} //Still needed?
+ }
+ if(verbose){System.err.println("F. Estimated greflen2: "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));}
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+ }
+
+ if(ss.start()<0){ss.setStart(0);} //Prevents assertion errors. This change should be reset by the realignment so it shouldn't mattess.
+ if(ss.stop()>chacs.maxIndex){ss.setStop(chacs.maxIndex);} //Also to prevent a potential assertion error in unpadded references
+ assert(0<=ss.start()) : "\nchr"+ss.chrom+": ss.setStart()"+ss.start()+", ss.setStop()"+ss.stop()+", padding="+padding+
+ ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText();
+ assert(chacs.maxIndex>=ss.stop()) : "\nchr"+ss.chrom+": ss.setStart()"+ss.start()+", ss.setStop()"+ss.stop()+", padding="+padding+
+ ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText();
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+
+ {
+ assert(ss.stop()>=ss.start()); //Otherwise this is pointless...
+ int a=bases.length;
+ int b=ss.stop()-ss.start()+1;
+ if(b<a){
+ int c=Tools.min(bases.length, a-b+10)/2;
+ padding=Tools.max(padding, c+1);
+// if(verbose){System.err.println("Padding = "+padding);}
+// assert(padding>=0) : padding;
+ }
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+
+ if(verbose){System.err.println("Padding = "+padding);}
+
+ {
+ int oldPadding=padding;
+ padding=Tools.max(0, Tools.min(padding, (msa.maxColumns-Tools.max(bases.length, GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps)))/2-100));
+ if(forbidIndels){padding=0;}
+ if(verbose){
+ System.err.println("oldPadding="+oldPadding+", padding="+padding);
+ System.err.println("L. calcGrefLen1 = "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));
+ System.err.println("L. calcGrefLen2 = "+GapTools.calcGrefLen(ss.start()-oldPadding, ss.stop()+oldPadding, ss.gaps));
+ System.err.println("L. calcGrefLen3 = "+GapTools.calcGrefLen(ss.start()-padding, ss.stop()+padding, ss.gaps));
+ }
+
+
+ assert(padding>=0) : id+", "+padding;
+
+// assert(GapTools.calcGrefLen(ss.start()-padding, ss.stop()+padding, ss.gaps)<msa.maxColumns) :
+// oldPadding+", "+padding+", "+bases.length+", "+GapTools.calcGrefLen(ss.start()-padding, ss.stop()+padding, ss.gaps)+", "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps);
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+
+ final int maxQ=msa.maxQuality(bases.length);
+ final int maxI=msa.maxImperfectScore(bases.length);
+
+ if(ss.strand()==Gene.PLUS){
+ assert(maxQ>maxI);
+
+ byte[][] matchR=new byte[1][];
+ if(ss.match!=null && ss.match.length==bases.length){
+ matchR[0]=ss.match;
+ }else{
+ // System.err.println(new String(ss.match));
+ matchR[0]=ss.match=new byte[bases.length];
+ }
+ int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(bases, chacs.array, ss.start(), matchR);
+ ss.match=matchR[0];
+
+ assert(0<=ss.start()) : "\nchr"+ss.chrom+": ss.setStart()"+ss.start()+", ss.setStop()"+ss.stop()+", padding="+padding+
+ ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText();
+ assert(chacs.maxIndex>=ss.stop()) : "\nchr"+ss.chrom+": ss.setStart()"+ss.start()+", ss.setStop()"+ss.stop()+", padding="+padding+
+ ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText();
+
+ if(verbose){System.err.println("G. Estimated greflen: "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));}
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+
+ if(scoreNoIndel>=maxI || forbidIndels){
+ if(verbose){System.err.println("Quick match.");}
+// assert(ss.match[0]!='X') : ss.toText();
+// assert(ss.match[ss.match.length-1]!='X') : ss.toText();
+ // assert(ss.setStop()=ss.start()+bases.length-1);
+ ss.setStop(ss.start()+bases.length-1);
+ ss.setSlowScore(scoreNoIndel);
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+ }else{
+ if(verbose){System.err.println("Slow match.");}
+
+// int minLoc=Tools.max(ss.start()-padding, chacs.minIndex);
+ int minLoc=Tools.max(ss.start()-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N'
+ int maxLoc=Tools.min(ss.stop()+padding, chacs.maxIndex);
+
+ if(verbose){System.err.println("minLoc = "+minLoc+", maxLoc = "+maxLoc);}
+ if(verbose){System.err.println("H. Estimated greflen: "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));}
+ if(verbose){System.err.println("H. Estimated greflen2: "+GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));}
+
+ //These assertions are not too important... they indicate the read mapped off the end of the chromosome.
+ assert(minLoc<=ss.start()) : "\nchr"+ss.chrom+": minloc="+minLoc+", maxLoc="+maxLoc+", ss.setStart()"+ss.start()+", ss.setStop()"+ss.stop()+", padding="+padding+
+ ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText();
+ assert(maxLoc>=ss.stop()) : "\nchr"+ss.chrom+": minloc="+minLoc+", maxLoc="+maxLoc+", ss.setStart()"+ss.start()+", ss.setStop()"+ss.stop()+", padding="+padding+
+ ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText();
+
+ // System.err.println("Aligning:\n"+new String(bases)+"\n"+chacs.getString(minLoc, maxLoc));
+
+ int[] max=null;
+ int[] score=null;
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+ try {
+ if(verbose){
+ System.err.println("Calling fillLimited(bases, chacs, "+minLoc+", "+maxLoc+", "+
+ Tools.max(scoreNoIndel, minValidScore)+", "+(ss.gaps==null ? "null" : Arrays.toString(ss.gaps))+")");
+ }
+ max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps);
+ score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null));
+ if(verbose){System.err.println("I. Estimated greflen: "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));}
+
+ if(score!=null && score.length>6){
+ int[] oldArray=score.clone();
+ assert(score.length==8);
+ int extraPadLeft=score[6];
+ int extraPadRight=score[7];
+
+ if(ss.gaps==null){
+ assert(maxLoc-minLoc+1<=msa.maxColumns);
+ int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }else{
+ int x=Tools.max(0, Tools.min(20, ((msa.maxColumns-newlen)/2)-40));
+ extraPadLeft=Tools.max(x, extraPadLeft);
+ extraPadRight=Tools.max(x, extraPadRight);
+ }
+ }else{
+ //TODO: In this case the alignment will probably be wrong.
+ int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));
+ int newlen=(greflen+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }else{
+ int x=Tools.max(0, Tools.min(20, ((msa.maxColumns-newlen)/2)-40));
+ extraPadLeft=Tools.max(x, extraPadLeft);
+ extraPadRight=Tools.max(x, extraPadRight);
+ }
+ }
+
+ assert(extraPadLeft>=0 && extraPadRight>=0) : extraPadLeft+", "+extraPadRight+"\n"+id+", "+ss+", "+new String(bases);
+ minLoc=Tools.max(0, minLoc-extraPadLeft);
+ maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight);
+
+ if(verbose){System.err.println("J. Estimated greflen: "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));}
+ if(verbose){System.err.println("J. Estimated greflen2: "+GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));}
+ max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps);
+ score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null));
+
+ if(score==null || score[0]<oldArray[0]){
+ if(!Shared.anomaly){System.err.println("Read "+id+": Padded match string alignment result was inferior. Triple-aligning. :(");}
+
+ if(ss.gaps==null){
+ assert(maxLoc-minLoc+1<=msa.maxColumns);
+ int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }else{
+ int x=Tools.max(0, Tools.min(20, ((msa.maxColumns-newlen)/2)-40));
+ extraPadLeft=Tools.max(x, extraPadLeft);
+ extraPadRight=Tools.max(x, extraPadRight);
+ }
+ }else{
+ //TODO: In this case the alignment will probably be wrong.
+ int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));
+ int newlen=(greflen+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }else{
+ int x=Tools.max(0, Tools.min(20, ((msa.maxColumns-newlen)/2)-40));
+ extraPadLeft=Tools.max(x, extraPadLeft);
+ extraPadRight=Tools.max(x, extraPadRight);
+ }
+ }
+
+ assert(extraPadLeft>=0 && extraPadRight>=0) : extraPadLeft+", "+extraPadRight+"\n"+id+", "+ss+", "+new String(bases);
+ minLoc=Tools.max(0, minLoc-extraPadLeft);
+ maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight);
+
+ max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps);
+ score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null));
+
+ if(minLoc>0 && maxLoc<chacs.maxIndex && (score==null || score[0]<oldArray[0])){
+ if(!Shared.anomaly){System.err.println("Still inferior.");}
+ minLoc=Tools.max(ss.start()-8, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N'
+ maxLoc=Tools.min(ss.stop()+8, chacs.maxIndex);
+ max=msa.fillUnlimited(bases, chacs.array, minLoc, maxLoc, ss.gaps);
+ score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null));
+ }
+ }
+ }
+ } catch (Exception e) {
+ System.err.println("Caught exception:\n");
+ e.printStackTrace();
+ assert(false) : ss.toText();
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+
+ if(max!=null){
+ ss.match=msa.traceback(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null);
+ ss.setLimits(score[1], score[2]);
+ if(verbose){System.err.println(ss.lengthsAgree()+", "+ss.start+", "+ss.stop);}
+ ss.fixLimitsXY();
+ if(verbose){System.err.println(ss.lengthsAgree()+", "+ss.start+", "+ss.stop);}
+ ss.setSlowScore(score[0]);
+ assert(ss.lengthsAgree()) : ss.matchLength()+", "+ss.mappedLength()+"\n\nss: "+ss+"\nbases: "+new String(bases);
+ }else{
+ ss.setStop(ss.start()+bases.length-1);
+ ss.setSlowScore(scoreNoIndel);
+ assert(ss.lengthsAgree()) : ss.matchLength()+", "+ss.mappedLength()+"\n\nss: "+ss+"\nbases: "+new String(bases);
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+ assert(ss.lengthsAgree()) : ss.matchLength()+", "+ss.mappedLength()+"\n\nss: "+ss+"\nbases: "+new String(bases);
+ }else{
+ assert(maxQ>maxI);
+
+ byte[][] matchR=new byte[1][];
+ if(ss.match!=null && ss.match.length==bases.length){
+ matchR[0]=ss.match;
+ }else{
+ matchR[0]=ss.match=new byte[bases.length];
+ }
+
+ int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(bases, chacs.array, ss.start(), matchR);
+ ss.match=matchR[0];
+
+ if(scoreNoIndel>=maxI || forbidIndels){
+ if(verbose){System.err.println("Quick match.");}
+ assert(ss.match[0]!='X') : ss.toText();
+ assert(ss.match[ss.match.length-1]!='X') : ss.toText();
+ ss.setStop(ss.start()+bases.length-1);
+ ss.setSlowScore(scoreNoIndel);
+ assert(ss.lengthsAgree());
+ }else{
+ if(verbose){System.err.println("Slow match.");}
+
+ int minLoc=Tools.max(ss.start()-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N'
+ int maxLoc=Tools.min(ss.stop()+padding, chacs.maxIndex);
+ if(verbose){System.err.println("Slow match "+minLoc+" ~ "+maxLoc);}
+ if(verbose){System.err.println("K. Estimated greflen: "+GapTools.calcGrefLen(ss.start(), ss.stop(), ss.gaps));}
+ if(verbose){System.err.println("K. Estimated greflen2: "+GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));}
+
+ //These assertions are not too important... they indicate the read mapped off the end of the chromosome.
+ assert(minLoc<=ss.start()) : "\nchr"+ss.chrom+": "+minLoc+", "+maxLoc+", "+ss.start()+", "+ss.stop()+
+ ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+ss.toText();
+ assert(maxLoc>=ss.stop()) : "\nchr"+ss.chrom+": "+minLoc+", "+maxLoc+", "+ss.start()+", "+ss.stop()+
+ ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+ss.toText();
+
+ if(verbose){System.err.println("Aligning:\n"+new String(bases)+"\n"+chacs.getString(minLoc, maxLoc));}
+ int[] max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps);
+ if(verbose){System.err.println("Aligned3: {rows, maxC, maxS, max} = "+Arrays.toString(max));}
+ int[] score=null;
+ score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null));
+
+ if(score!=null && score.length>6){
+ if(verbose){System.err.println("Entering condition because score="+Arrays.toString(score));}
+ int[] oldArray=score.clone();
+ assert(score.length==8);
+ int extraPadLeft=score[6];
+ int extraPadRight=score[7];
+
+ if(ss.gaps==null){
+ assert(maxLoc-minLoc+1<=msa.maxColumns);
+ int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }
+ }else{
+ //TODO: In this case the alignment will probably be wrong.
+ int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));
+ int newlen=(greflen+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }else{
+ int x=Tools.max(0, Tools.min(20, ((msa.maxColumns-newlen)/2)-40));
+ extraPadLeft=Tools.max(x, extraPadLeft);
+ extraPadRight=Tools.max(x, extraPadRight);
+ }
+ }
+
+ minLoc=Tools.max(0, minLoc-extraPadLeft);
+ maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight);
+ if(verbose){System.err.println("Set extraPadLeft="+extraPadLeft+", extraPadRight="+extraPadRight);}
+ if(verbose){System.err.println("Set minLoc="+minLoc+", maxLoc="+maxLoc);}
+
+ max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps);
+ score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null));
+
+ if(score==null || score[0]<oldArray[0]){
+ if(!Shared.anomaly){System.err.println("Read "+id+": Padded match string alignment result was inferior. Triple-aligning. :(");}
+
+ if(ss.gaps==null){
+ assert(maxLoc-minLoc+1<=msa.maxColumns);
+ int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }else{
+ int x=Tools.max(0, Tools.min(20, ((msa.maxColumns-newlen)/2)-40));
+ extraPadLeft=Tools.max(x, extraPadLeft);
+ extraPadRight=Tools.max(x, extraPadRight);
+ }
+ }else{
+ //TODO: In this case the alignment will probably be wrong.
+ int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));
+ int newlen=(greflen+1+extraPadLeft+extraPadRight);
+ if(newlen>=msa.maxColumns-80){
+ while(newlen>=msa.maxColumns-80 && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;}
+ while(newlen>=msa.maxColumns-80 && extraPadLeft<extraPadRight){newlen--;extraPadRight--;}
+ while(newlen>=msa.maxColumns-80){newlen-=2;extraPadLeft--;extraPadRight--;}
+ }else{
+ int x=Tools.max(0, Tools.min(20, ((msa.maxColumns-newlen)/2)-40));
+ extraPadLeft=Tools.max(x, extraPadLeft);
+ extraPadRight=Tools.max(x, extraPadRight);
+ }
+ }
+
+ minLoc=Tools.max(0, minLoc-extraPadLeft);
+ maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight);
+ max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps);
+ score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null));
+ }
+ }
+
+
+ if(verbose){System.err.println(Arrays.toString(max));}
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases)+"\n"+ss;
+
+ if(max!=null){
+ ss.match=msa.traceback(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null);
+ ss.setLimits(score[1], score[2]);
+ ss.fixLimitsXY();
+ ss.setSlowScore(score[0]);
+ if(verbose){System.err.println("Aligned4:\n"+new String(bases)+"\n"+chacs.getString(ss.start(), ss.stop())+"\n"+new String(ss.match));}
+ assert(ss.lengthsAgree()) : id+"\n"+new String(bases)+"\n"+ss;
+ }else{
+ assert(ss.match[0]!='X') : id+"\n"+new String(bases)+"\n"+ss;
+ assert(ss.match[ss.match.length-1]!='X' && ss.match[ss.match.length-1]!='Y') : id+"\n"+new String(bases)+"\n"+ss;
+ ss.setStop(ss.start()+bases.length-1);
+ ss.setSlowScore(scoreNoIndel);
+ assert(ss.lengthsAgree()) : id+"\n"+new String(bases)+"\n"+ss;
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases)+"\n"+ss;
+ assert(ss.lengthsAgree()) : ss.matchLength()+", "+ss.mappedLength()+"\n\nss: "+ss+"\nbases: "+new String(bases);
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+ assert(ss.lengthsAgree()) : ss.matchLength()+", "+ss.mappedLength()+"\n\nss: "+ss+"\nbases: "+new String(bases);
+ }
+ if(verbose){System.err.println("Final: "+ss.start()+", "+ss.stop()+", "+Gene.strandCodes[ss.strand()]+", recur="+recur);}
+ assert(ss.lengthsAgree()) : ss.matchLength()+", "+ss.mappedLength()+"\n\nss: "+ss+"\nbases: "+new String(bases);
+
+ final int leftPaddingNeeded=ss.leftPaddingNeeded(4, 5), rightPaddingNeeded=ss.rightPaddingNeeded(4, 5);
+ if(ss.stop()<chacs.maxIndex && ss.start()>0 && (leftPaddingNeeded>0 || rightPaddingNeeded>0)){
+ assert(ss.lengthsAgree()) : ss.matchLength()+", "+ss.mappedLength()+"\n\nss: "+ss+"\nbases: "+new String(bases);
+ if(recur>0){
+ ss.gaps=GapTools.fixGaps(ss.start(), ss.stop(), ss.gaps, Shared.MINGAP);
+
+ int p_temp=Tools.min(10+Tools.max(leftPaddingNeeded, rightPaddingNeeded), (msa.maxColumns-bases.length)/2-20);
+
+ if(verbose){System.err.println("re-calling realign_new.");}
+ realign_new(ss, bases, msa, p_temp, recur-1, minValidScore, forbidIndels, fixXY, id);
+ assert(ss.lengthsAgree());
+ }else{
+ if(verbose){System.err.println("Not recurring. fixXY="+fixXY);}
+// int len1=Read.calcMatchLength(ss.match);
+// int len2=ss.stop()-ss.start()+1;
+ if(fixXY && ss.matchContainsXY()){
+ ss.fixXY(bases, false, msa);
+ assert(ss.lengthsAgree());
+ }
+ assert(ss.gaps==null || (ss.gaps[0]==ss.start() && ss.gaps[ss.gaps.length-1]==ss.stop())) : id+"\n"+new String(bases); //123
+ }
+ }
+ ss.setPerfect(bases);
+ assert(Read.CHECKSITE(ss, bases, id));
+ }
+
+ private static final boolean checkArray(byte[] bases){
+ for(byte b : bases){
+// assert(b>0) : Arrays.toString(bases);
+ if(b<=0){return false;}
+ }
+ return true;
+ }
+
+
+ /**
+  * Builds a quality array one element longer than the input.
+  * The first and last outputs copy the first and last inputs; every interior output i is
+  * (3*min+max)/4 of inputs i-1 and i, i.e. weighted toward the lower of the two neighbors.
+  * @param qcs Input qualities; must contain at least one element.
+  * @return New array of length qcs.length+1.
+  */
+ public static byte[] translateQuality(byte[] qcs){
+     final int n=qcs.length;
+     final byte[] out=new byte[n+1];
+     out[0]=qcs[0];
+     out[n]=qcs[n-1];
+     int pos=1;
+     while(pos<n){
+         final byte left=qcs[pos-1], right=qcs[pos];
+         final int lower=Tools.min(left, right);
+         final int upper=Tools.max(left, right);
+         out[pos]=(byte)((lower*3+upper)/4);
+         pos++;
+     }
+     return out;
+ }
+
+ /**
+  * Rewrites the arrays in crbmq so that the match string no longer contains indels.
+  * crbmq is read by index: 0=colors, 1=colorRef, 2=baseRef, 3=match, 4=quality.
+  * Each 'D' is handed to fixDeletion and each 'I' to fixInsertion; both replace entries of
+  * crbmq, so match is re-read from crbmq[3] after every call.
+  * Afterwards colorRef is truncated to the length of colors (and baseRef to one more than that).
+  * @param crbmq Array bundle, modified in place (entries 1-3 may be replaced).
+  * @param r Read passed through to fixDeletion.
+  * @return Number of indel fixes attempted, or -1 if a fix reported failure.
+  */
+ private static int fixIndels(byte[][] crbmq, Read r){
+
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+ byte[] quality=crbmq[4];
+
+ //'X' and 'Y' are not expected here: the assertion fires when assertions are enabled,
+ //otherwise the symbol is treated as an insertion.
+ for(int i=0; i<match.length; i++){
+ if(match[i]=='X' || match[i]=='Y'){
+// assert(false) : "\n"+new String(colors)+"\n"+new String(colorRef)+"\n"+new String(baseRef)+"\n"+new String(match)+"\n";
+
+ assert(false) : "TODO: Truncate ends.\n"+toString(crbmq)+"\n";
+
+ match[i]='I';
+// match[i]='S';
+ }
+ }
+
+ int fixed=0;
+
+ //loc and refloc are incremented but never read; only mloc drives the scan.
+ for(int loc=0, refloc=0, mloc=0; mloc<match.length; mloc++){
+ byte b=match[mloc];
+ boolean ok=true;
+ if(b=='S' || b== 'm' || b=='N'){
+ loc++;
+ refloc++;
+ }else if(b=='D'){
+ fixed++;
+ ok=fixDeletion(crbmq, mloc, r);
+ match=crbmq[3];
+ assert(ok);
+ }
+// else if(b=='I' || b=='X' || b=='Y'){
+// ok=fixInsertion(crbmq, mloc);
+// assert(ok);
+// }
+ else if(b=='I'){
+ fixed++;
+ ok=fixInsertion(crbmq, mloc);
+ match=crbmq[3];
+// assert(ok);
+ }else{
+ assert(false) : ""+((char)b);
+ }
+ //A failed fix aborts the whole operation.
+ if(!ok){
+ return -1;
+ }
+ }
+
+ //Refresh local views, since the helpers replace the arrays inside crbmq.
+ colors=crbmq[0];
+ colorRef=crbmq[1];
+ baseRef=crbmq[2];
+ match=crbmq[3];
+
+ assert(baseRef.length==colorRef.length+1);
+ //Drop any reference tail extending beyond the read.
+ if(colorRef.length>colors.length){
+ colorRef=Arrays.copyOf(colorRef, colors.length);
+ baseRef=Arrays.copyOf(baseRef, colorRef.length+1);
+ crbmq[1]=colorRef;
+ crbmq[2]=baseRef;
+ }
+
+ return fixed;
+ }
+
+ /**
+  * Removes the run of 'D' symbols starting at match[loc] by cutting the corresponding
+  * positions out of colorRef, baseRef and match.
+  * crbmq is read by index: 0=colors, 1=colorRef, 2=baseRef, 3=match, 4=quality.
+  * After the cut, the reference color at loc is recomputed from the two bases now adjacent
+  * at loc and loc+1, and match at loc becomes 'm' if that color equals colors[loc], else 'S'.
+  * @param crbmq Array bundle; entries 1, 2 and 3 are replaced with new, shorter arrays on success.
+  * @param loc Index in match of the first 'D' of the run.
+  * @param r Read, used only in an assertion message.
+  * @return false (leaving crbmq untouched) when the run is too close to either end; true otherwise.
+  */
+ private static boolean fixDeletion(final byte[][] crbmq, int loc, Read r){
+
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+ byte[] quality=crbmq[4];
+
+ assert(match[loc]=='D') : loc;
+
+ //Measure the length of the consecutive 'D' run.
+ int len=1;
+ for(int i=loc+1; i<match.length; i++){
+ byte b=match[i];
+ if(b=='D'){
+ len++;
+ }else{
+ break;
+ }
+ }
+ //Index of the last 'D' in the run.
+ int b=loc+len-1;
+
+ //TODO
+ if(loc<=1 || b>match.length-2){return false;} //Indels on very ends need to be processed differently
+
+ //Deletion is from a to b, inclusive. Note that basespace coords are +1 from colorspace coords.
+
+ byte[] colorRef2=new byte[colorRef.length-len];
+ byte[] baseRef2=new byte[baseRef.length-len];
+ byte[] match2=new byte[match.length-len];
+
+ //NOTE(review): "+match" appends the array's default toString, not its contents.
+ assert(loc<colorRef2.length) : "TODO: Seems odd... "+loc+", "+colorRef2.length+", "+match;
+ assert(baseRef2.length==colorRef2.length+1);
+
+ //Copy everything up to and including loc unchanged.
+ for(int i=0; i<=loc; i++){
+ colorRef2[i]=colorRef[i];
+ baseRef2[i]=baseRef[i];
+ match2[i]=match[i];
+ }
+
+ //Copy the remainder, skipping len positions.
+ for(int i=loc+1; i<baseRef2.length; i++){
+ baseRef2[i]=baseRef[i+len];
+ }
+ for(int i=loc+1; i<colorRef2.length; i++){
+ colorRef2[i]=colorRef[i+len];
+ }
+ for(int i=loc+1; i<match2.length; i++){
+ match2[i]=match[i+len];
+ }
+
+
+ //Recompute the junction color from the bases that are now neighbors.
+ colorRef2[loc]=AminoAcid.baseToColor(baseRef2[loc], baseRef2[loc+1]);
+ if(colorRef2[loc]==colors[loc]){
+ match2[loc]='m';
+ }else{
+ assert(colorRef2[loc]!='N' && colors[loc]!='N') : "TODO\n"+r.toText(false)+"\n"+toString(crbmq)+"\n";
+ match2[loc]='S';
+ }
+
+ crbmq[1]=colorRef2;
+ crbmq[2]=baseRef2;
+ crbmq[3]=match2;
+ crbmq[4]=quality; //Same array as before; quality is not modified.
+
+ return true;
+ }
+
+ /**
+  * Removes the run of 'I' symbols starting at match[loc] by inserting the read's own colors
+  * into the reference, so the inserted region then reads as a match.
+  * crbmq is read by index: 0=colors, 1=colorRef, 2=baseRef, 3=match, 4=quality.
+  * colorRef and baseRef grow by the run length; match keeps its length and has positions
+  * loc through loc+len (bounded by the array end) set to 'm'.  Reference bases covering the
+  * inserted region are regenerated with AminoAcid.colorToBase: backward from the following
+  * base when loc is 0, otherwise forward from the preceding base.
+  * @param crbmq Array bundle; entries 1, 2 and 3 are replaced with new arrays on success.
+  * @param loc Index in match of the first 'I' of the run.
+  * @return false (leaving crbmq untouched) if DISCARD_NOCALLED_INSERTIONS is set and a color
+  * in the regenerated range is 'N'; true otherwise.
+  */
+ private static boolean fixInsertion(final byte[][] crbmq, int loc){
+
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+ byte[] quality=crbmq[4];
+
+ assert(match[loc]=='I');
+
+ //Measure the length of the consecutive 'I' run.
+ int len=1;
+ for(int i=loc+1; i<match.length; i++){
+ byte b=match[i];
+ if(b=='I'){
+ len++;
+ }else{
+ break;
+ }
+ }
+ //Index of the last 'I' in the run; currently unused because the end check below is disabled.
+ int b=loc+len-1;
+
+ byte[] colorRef2=new byte[colorRef.length+len];
+ byte[] baseRef2=new byte[baseRef.length+len];
+ byte[] match2=new byte[match.length]; //TODO: Unnecessary duplication
+
+ //TODO
+// if(b>match.length-2){return false;} //Indels on very ends need to be processed differently
+
+
+ //Insertion occupies loc to b, inclusive. Note that basespace coords are +1 from colorspace coords
+
+ //NOTE(review): "+match" appends the array's default toString, not its contents.
+ assert(loc<colorRef2.length) : "TODO: Seems odd... "+loc+", "+colorRef2.length+", "+match;
+ assert(baseRef2.length==colorRef2.length+1);
+
+ //Fill first half
+ for(int i=0; i<loc; i++){
+ colorRef2[i]=colorRef[i];
+ baseRef2[i]=baseRef[i];
+ match2[i]=match[i];
+ }
+ baseRef2[loc]=baseRef[loc];
+
+ //Fill last half
+ //Reference arrays are shifted right by len; match is copied without shifting.
+ for(int i=loc+1; i<colorRef.length; i++){
+ colorRef2[i+len]=colorRef[i];
+ }
+ for(int i=loc; i<baseRef.length; i++){
+ baseRef2[i+len]=baseRef[i];
+ }
+ for(int i=loc+1; i<match.length; i++){
+ match2[i]=match[i];
+ }
+
+ //Now, just fill in the inserted portion
+ if(verbose){
+ System.err.println("loc="+loc+", colorRef2="+colorRef2.length+", colors="+colors.length+", match2="+match2.length);
+ System.err.println("max="+Tools.min(loc+len, Tools.min(colorRef2.length, colors.length)-1));
+ }
+ //Both ranges are inclusive of loc+len, i.e. they cover len+1 positions where the arrays allow.
+ for(int i=loc, max=Tools.min(loc+len, Tools.min(colorRef2.length, colors.length)-1); i<=max; i++){
+ colorRef2[i]=colors[i];
+ }
+ for(int i=loc, max=Tools.min(loc+len, match2.length-1); i<=max; i++){
+ match2[i]='m';
+ }
+
+
+
+ if(loc==0){
+ //No base precedes the insertion, so regenerate bases right-to-left from the base after it.
+ for(int i=(Tools.min(loc+len, colorRef.length-1)); i>=0; i--){
+ if(DISCARD_NOCALLED_INSERTIONS && colorRef2[i]=='N'){return false;} //Fail.
+
+// if(colorRef2[i]=='N'){System.err.println("Keeping no-called insertion:\n"+toString(crbmq)+"\n");}
+
+// assert(colorRef2[i]!='N') : "TODO\n"+toString(crbmq)+"\n";
+
+ // System.err.println(""+(char)AminoAcid.colorToBase(baseRef2[i-1], colorRef2[i-1]));
+ // System.err.println(""+(char)baseRef2[i-1]);
+ // System.err.println(""+(char)colorRef2[i-1]);
+ // System.err.println(new String(baseRef)+"\t"+new String(colorRef)+"\t"+new String(baseRef2)+"\t"+new String(colorRef2));
+ // System.err.println("loc="+loc+", i="+i);
+ baseRef2[i]=AminoAcid.colorToBase(baseRef2[i+1], colorRef2[i]);
+ }
+ }else{
+
+ //Regenerate bases left-to-right from the base preceding the insertion.
+ for(int i=loc+1, max=loc+len; i<=max; i++){
+ if(DISCARD_NOCALLED_INSERTIONS && colorRef2[i-1]=='N'){return false;} //Fail.
+
+// if(colorRef2[i-1]=='N'){System.err.println("Keeping no-called insertion:\n"+toString(crbmq)+"\n");}
+
+// assert(colorRef2[i-1]!='N') : "TODO\n"+toString(crbmq)+"\n";
+
+ // System.err.println(""+(char)AminoAcid.colorToBase(baseRef2[i-1], colorRef2[i-1]));
+ // System.err.println(""+(char)baseRef2[i-1]);
+ // System.err.println(""+(char)colorRef2[i-1]);
+ // System.err.println(new String(baseRef)+"\t"+new String(colorRef)+"\t"+new String(baseRef2)+"\t"+new String(colorRef2));
+ // System.err.println("loc="+loc+", i="+i);
+ baseRef2[i]=AminoAcid.colorToBase(baseRef2[i-1], colorRef2[i-1]);
+ }
+ }
+
+
+ crbmq[1]=colorRef2;
+ crbmq[2]=baseRef2;
+ crbmq[3]=match2;
+ crbmq[4]=quality; //Same array as before; quality is not modified.
+
+ return true;
+ }
+
+
+// private static int fixNocalls(final byte[][] crbmq){
+// byte[] colors=crbmq[0];
+// byte[] colorRef=crbmq[1];
+// byte[] baseRef=crbmq[2];
+//
+// int fixedRef=0;
+// int fixedCall=0;
+// assert(colors.length==colorRef.length) : "\n"+Arrays.toString(colors)+"\n"+Arrays.toString(colorRef)+
+// "\n"+new String(baseRef)+"\n"+new String(crbmq[3])+"\n";
+// for(int i=0; i<colors.length; i++){
+// if(colors[i]=='N' || colors[i]=='.'){
+// colors[i]=colorRef[i];
+// fixedCall++;
+// }
+// if(colorRef[i]=='N' || colorRef[i]=='.'){
+// colorRef[i]=colors[i];
+// fixedRef++;
+// }
+// }
+//
+// if(fixedRef>0){
+// for(int i=1; i<colorRef.length; i++){
+// if(baseRef[i]=='N'){
+// baseRef[i]=AminoAcid.colorToBase(baseRef[i-1], colorRef[i-1]);
+// }
+// }
+// for(int i=colorRef.length-2; i>=0; i--){
+// if(baseRef[i]=='N'){
+// baseRef[i]=AminoAcid.colorToBase(baseRef[i+1], colorRef[i+1]);
+// }
+// }
+// }
+// return fixedRef+fixedCall;
+// }
+
+
+ /**
+  * Replaces no-calls ('N' or '.') in the read colors and reference colors with the value from
+  * the other side, walking the match string so indels are stepped over rather than avoided.
+  * crbmq is read by index: 0=colors, 1=colorRef, 2=baseRef, 3=match.  Arrays are edited in place.
+  * If any reference color was replaced, baseRef entries equal to 'N' are then regenerated with
+  * AminoAcid.colorToBase from an adjacent base, in a forward pass followed by a reverse pass.
+  * @param crbmq Array bundle, modified in place.
+  * @param read Read used for diagnostic output only.
+  * @return Number of colors replaced (read plus reference), or -1 if the reference index runs
+  * past the end of colorRef while match symbols remain.
+  */
+ private static int fixNocallsInline(final byte[][] crbmq, Read read){
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+
+ int fixedRef=0;
+ int fixedCall=0;
+
+// boolean indels=false;
+//
+// int indexOfIndel=colors.length;
+// for(int i=0; i<match.length; i++){
+// if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){
+// indels=true;
+// indexOfIndel=i;
+// break;
+// }
+// }
+
+
+ //mi indexes match, ci indexes colors, ri indexes colorRef.
+ for(int mi=0, ci=0, ri=0; mi<match.length; mi++){
+
+ assert(ci<colors.length) : "\n"+read.toText(false)+"\n"+toString(crbmq);
+
+ //Reference exhausted before the match string: report and give up.
+ if(ri>=colorRef.length){
+ System.err.println("Failed fixNocallsInline for read "+read.numericID);
+ System.err.println(read.toText(false));
+ System.err.println(toString(crbmq));
+ return -1;
+ }
+
+ assert(ri<colorRef.length) : "\n"+read.toText(false)+"\n"+toString(crbmq);
+
+ //c and r are snapshots taken before any replacement below.
+ final byte m=match[mi];
+ final byte c=colors[ci];
+ final byte r=colorRef[ri];
+
+
+ if(m=='m' || m=='S' || m=='N' || m=='X'){
+
+ if(c=='N' || c=='.'){
+ if(r!='N' && r!='.'){
+ colors[ci]=r;
+ fixedCall++;
+// match[mi]='m';
+ }
+ }
+ if(r=='N' || r=='.'){
+ if(c!='N' && c!='.'){
+ colorRef[ri]=c;
+ fixedRef++;
+ match[mi]='m';
+ }
+ }
+ //NOTE(review): the 'N' written below is immediately overwritten by 'm', so every 'X'
+ //ends up as 'm'.  Possibly an else was intended - confirm before changing.
+ if(m=='X'){//Not sure about this
+ if(c=='N' || c=='.'){
+ match[mi]='N';
+ }
+ match[mi]='m';
+ }
+
+ ci++;
+ ri++;
+ }else if(m=='D'){
+ ri++;
+ }else if(m=='I'){
+ ci++;
+ }else{
+ assert(false) : "m="+(char)m+"\n"+read.toText(false)+"\n"+toString(crbmq);
+ }
+
+// assert(m!='Y') : "m="+(char)m+"\n"+read.toText(false)+"\n"+toString(crbmq);
+ }
+
+ //Bases only need regenerating if a reference color changed.
+ if(fixedRef>0){
+ {//forward
+
+ for(int mi=0, ri=0; mi<match.length; mi++){
+
+ assert(ri<colorRef.length) : "\n"+read.toText(false)+"\n"+toString(crbmq);
+
+ byte m=match[mi];
+ byte r=colorRef[ri];
+
+ if(m=='m' || m=='S' || m=='N'){
+
+ //colorRef[ri] pairs baseRef[ri] with baseRef[ri+1]; fill whichever side is 'N'.
+ if(baseRef[ri]=='N'){
+ baseRef[ri]=AminoAcid.colorToBase(baseRef[ri+1], r);
+ }
+ if(baseRef[ri+1]=='N'){
+ baseRef[ri+1]=AminoAcid.colorToBase(baseRef[ri], r);
+ }
+ ri++;
+ }else if(m=='D'){
+ ri++;
+ }else if(m=='I'){
+ }else{
+ assert(false) : "m="+(char)m+"\n"+read.toText(false)+"\n"+toString(crbmq);
+ }
+ }
+ }
+
+
+ {//reverse
+
+ //NOTE(review): colorRef[ri] is read before the symbol is inspected, so ri must stay
+ //in range even for 'I' symbols; only an assertion guards this.
+ for(int mi=match.length-1, ri=colorRef.length-1; mi>=0; mi--){
+
+ assert(ri>=0) : "\n"+read.toText(false)+"\n"+toString(crbmq);
+
+ byte m=match[mi];
+ byte r=colorRef[ri];
+
+ if(m=='m' || m=='S' || m=='N'){
+
+ if(baseRef[ri]=='N'){
+ baseRef[ri]=AminoAcid.colorToBase(baseRef[ri+1], r);
+ }
+ if(baseRef[ri+1]=='N'){
+ baseRef[ri+1]=AminoAcid.colorToBase(baseRef[ri], r);
+ }
+ ri--;
+ }else if(m=='D'){
+ ri--;
+ }else if(m=='I'){
+ }else{
+ assert(false) : "m="+(char)m+"\n"+read.toText(false)+"\n"+toString(crbmq);
+ }
+ }
+ }
+ }
+ return fixedRef+fixedCall;
+ }
+
+
+ /**
+  * Replaces no-calls ('N' or '.') in the read colors and reference colors with the value from
+  * the other side, for positions before the first indel in the match string.
+  * crbmq is read by index: 0=colors, 1=colorRef, 2=baseRef, 3=match.  Arrays are edited in place.
+  * Positions at or after the first 'I', 'X', 'Y' or 'D' are left alone because read, reference
+  * and match indices no longer coincide there.
+  * If any reference color was replaced, baseRef entries equal to 'N' are regenerated from a
+  * neighboring base: forward over the indel-free prefix, and (only when the match string has no
+  * indels at all) backward over the whole reference.
+  * @param crbmq Array bundle, modified in place.
+  * @return Number of colors replaced (read plus reference).
+  */
+ private static int fixNocalls(final byte[][] crbmq){
+     byte[] colors=crbmq[0];
+     byte[] colorRef=crbmq[1];
+     byte[] baseRef=crbmq[2];
+     byte[] match=crbmq[3];
+
+     int fixedRef=0;
+     int fixedCall=0;
+
+     boolean indels=false;
+
+     //Find the first indel; with none, the whole read is processed.
+     int indexOfIndel=colors.length;
+     for(int i=0; i<match.length; i++){
+         if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){
+             indels=true;
+             indexOfIndel=i;
+             break;
+         }
+     }
+
+     for(int i=0; i<indexOfIndel; i++){
+         if(colors[i]=='N' || colors[i]=='.'){
+             if(colorRef[i]!='N' && colorRef[i]!='.'){
+                 colors[i]=colorRef[i];
+                 fixedCall++;
+                 assert(match[i]!='I' && match[i]!='D') : toString(crbmq);
+                 match[i]='m';
+             }
+         }
+         if(colorRef[i]=='N' || colorRef[i]=='.'){
+             if(colors[i]!='N' && colors[i]!='.'){
+                 colorRef[i]=colors[i];
+                 fixedRef++;
+                 assert(match[i]!='I' && match[i]!='D') : toString(crbmq);
+                 match[i]='m';
+             }
+         }
+     }
+
+     assert(indels || colors.length==colorRef.length) : "\n"+toString(crbmq)+"\n";
+
+     if(fixedRef>0){
+
+         //Forward: base i follows from base i-1 and the color between them, colorRef[i-1].
+         for(int i=1; i<indexOfIndel; i++){
+             if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){
+                 assert(false);
+                 break;
+             }
+             if(baseRef[i]=='N'){
+                 baseRef[i]=AminoAcid.colorToBase(baseRef[i-1], colorRef[i-1]);
+             }
+         }
+         if(!indels){
+             //Reverse: base i follows from base i+1 and the color between them, which is
+             //colorRef[i] (not colorRef[i+1]).  Start at the last color so the final pair is
+             //covered too; the bound also keeps baseRef[i+1] in range.
+             for(int i=Tools.min(colorRef.length, baseRef.length-1)-1; i>=0; i--){
+                 if(baseRef[i]=='N'){
+                     baseRef[i]=AminoAcid.colorToBase(baseRef[i+1], colorRef[i]);
+                 }
+             }
+         }
+     }
+     return fixedRef+fixedCall;
+ }
+
+
+ private static int fixNocallsBackward(final byte[][] crbmq){
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+
+ int fixedRef=0;
+ int fixedCall=0;
+
+ boolean indels=false;
+
+ int indexOfIndelCall=0;
+ int indexOfIndelRef=0;
+ int indexOfIndelMatch=0;
+ for(int i=match.length-1; i>=0; i--){
+ if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){
+ indels=true;
+ int safe=(match.length-1)-i;
+ indexOfIndelMatch=i;
+ indexOfIndelCall=colors.length-safe;
+ indexOfIndelRef=colorRef.length-safe;
+// System.err.println("indexOfIndelMatch="+indexOfIndelMatch+
+// "\nindexOfIndelCall="+indexOfIndelCall+
+// "\nindexOfIndelRef="+indexOfIndelRef+
+// "\nsafe="+safe);
+ break;
+ }
+ }
+
+// assert(colors.length==colorRef.length) : "\n"+Arrays.toString(colors)+"\n"+Arrays.toString(colorRef)+
+// "\n"+new String(baseRef)+"\n"+new String(crbmq[3])+"\n";
+ for(int i=colors.length-1, j=colorRef.length-1, k=match.length-1; i>=indexOfIndelCall; i--, j--, k--){
+// if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){
+// indels=true;
+// break;
+// }
+ if(colors[i]=='N' || colors[i]=='.'){
+ if(colorRef[j]!='N' && colorRef[j]!='.'){
+ colors[i]=colorRef[j];
+ fixedCall++;
+ assert(match[k]!='I' && match[k]!='D') : "i="+i+", j="+j+", k="+k+"\n"+toString(crbmq);
+ match[k]='m';
+ }
+ }
+ if(colorRef[j]=='N' || colorRef[j]=='.'){
+ if(colors[i]!='N' && colors[i]!='.'){
+ colorRef[j]=colors[i];
+ fixedRef++;
+ assert(match[k]!='I' && match[k]!='D') : "i="+i+", j="+j+", k="+k+"\n"+toString(crbmq);
+ match[k]='m';
+ }
+ }
+ }
+
+ assert(indels || colors.length==colorRef.length) : "\n"+toString(crbmq)+"\n";
+
+ if(fixedRef>0){
+ if(!indels){
+ for(int i=1; i<colorRef.length; i++){
+ if(baseRef[i]=='N'){baseRef[i]=AminoAcid.colorToBase(baseRef[i-1], colorRef[i-1]);}
+ }
+ }
+ for(int i=colorRef.length-2; i>=indexOfIndelRef; i--){
+ if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){
+ assert(false);
+ break;
+ }
+ if(baseRef[i]=='N'){
+ baseRef[i]=AminoAcid.colorToBase(baseRef[i+1], colorRef[i+1]);
+ }
+ }
+ }
+ return fixedRef+fixedCall;
+ }
+
+ /**
+  * Tests whether a match string consists solely of 'm' symbols.
+  * @param match Match string; may be null.
+  * @return false for null or if any symbol differs from 'm'; true otherwise (including empty).
+  */
+ public static boolean perfectMatch(final byte[] match){
+     if(match==null){return false;}
+     for(final byte symbol : match){
+         if(symbol!='m'){return false;}
+     }
+     return true;
+ }
+
+ /**
+  * Tests whether a match string contains any indel-type symbol.
+  * 'I', 'D', 'X' and 'Y' all count.
+  * @param match Match string; must not be null.
+  * @return true if at least one such symbol is present.
+  */
+ private static boolean containsIndels(final byte[] match){
+     for(final byte symbol : match){
+         switch(symbol){
+             case 'I':
+             case 'D':
+             case 'X':
+             case 'Y':
+                 return true;
+             default:
+                 break;
+         }
+     }
+     return false;
+ }
+
+ private static boolean containsNocalls(final byte[] match){
+ for(int i=0; i<match.length; i++){
+ byte b=match[i];
+ if(b=='N' || b=='X' || b=='Y'){return true;}
+ }
+ return false;
+ }
+
+ private static boolean containsXY(final byte[] match){
+ for(int i=0; i<match.length; i++){
+ byte b=match[i];
+ if(b=='X' || b=='Y'){return true;}
+ }
+ return false;
+ }
+
+ //TODO: Add support for deletions
+ /**
+  * Trims a poorly matching tail from the arrays in crbmq and shrinks the read's mapped range.
+  * crbmq is read by index: 0=colors, 1=colorRef, 2=baseRef, 3=match, 4=quality; all five
+  * entries are replaced with (possibly shortened) copies unless the method returns early.
+  * Nothing is trimmed when the last symbol is 'm' or 'D' and either the second-to-last symbol
+  * is 'm' or the last two quality values are both at least 22.
+  * Otherwise the match string is scanned backward; colors, match and quality are cut from the
+  * earliest 'S', 'I', 'N', 'X' or 'Y' encountered, and colorRef/baseRef are cut by the same
+  * amount minus the number of 'I', 'X' and 'Y' symbols seen.
+  * Because the cut point starts at the last index, at least one position is removed whenever
+  * the early returns are not taken.
+  * @param crbmq Array bundle; match and quality must each hold at least two elements.
+  * @param thresh Must see this many consecutive 'm' to stop.
+  * @param r Read whose stop (plus strand) or start (otherwise) is moved inward.
+  * @return Number of positions removed from colors, match and quality; 0 if nothing was trimmed.
+  */
+ private static int trimEnd(final byte[][] crbmq, int thresh, Read r){
+
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+ byte[] quality=crbmq[4];
+
+// if(match[0]=='m' && match[match.length-1]=='m'){return 0;}
+ {
+// byte a=match[0], b=match[1], c=match[match.length-1];
+// if(a=='m' && b=='m' && c=='m' || c=='S'){return 0;}
+
+// System.err.println(new String(match));
+ //a is the last match symbol, b the one before it.
+ byte a=match[match.length-1], b=match[match.length-2];
+
+// System.err.println("a="+(char)a+", b="+(char)b);
+// System.err.println("X");
+ if((a=='m' || a=='D') && (b=='m')){return 0;}
+// System.err.println("Y");
+ if((a=='m' || a=='D') && quality[quality.length-1]>=22 && quality[quality.length-2]>=22){return 0;}
+// System.err.println("Z");
+ }
+
+ int last=match.length-1;
+ int minBadIndex=last;
+ int mcount=0;
+
+ int insertions=0;
+
+ //Scan backward; indices 0 and 1 are never examined.
+ while(last>1 && mcount<thresh){
+ byte c=match[last];
+ if(c=='m'){mcount++;}
+ else if(match[last]=='S' || match[last]=='I' || match[last]=='N'
+ || match[last]=='X'|| match[last]=='Y'){
+ minBadIndex=last;
+ mcount=0;
+ if(match[last]=='I' || match[last]=='X'|| match[last]=='Y'){
+ insertions++;
+ }
+ }else{
+ //Any other symbol (for example 'D') ends the scan.
+ break;
+ }
+ last--;
+ }
+
+ final int trim=match.length-minBadIndex;
+
+ //Zero exactly when every trimmed symbol was counted as an insertion, in which case the
+ //reference arrays keep their length.
+ int trim2=insertions-trim;
+
+ colors=Arrays.copyOf(colors, colors.length-trim);
+ if(trim2!=0){
+ colorRef=Arrays.copyOf(colorRef, colorRef.length-trim+insertions);
+ baseRef=Arrays.copyOf(baseRef, baseRef.length-trim+insertions);
+ }
+ match=Arrays.copyOf(match, match.length-trim);
+ quality=Arrays.copyOf(quality, quality.length-trim);
+
+ crbmq[0]=colors;
+ crbmq[1]=colorRef;
+ crbmq[2]=baseRef;
+ crbmq[3]=match;
+ crbmq[4]=quality;
+
+ //Shrink the mapped range by the number of reference positions removed.
+ if(r.strand()==Gene.PLUS){
+ r.stop-=(trim-insertions);
+ }else{
+ r.start+=(trim-insertions);
+ }
+
+// System.err.println(new String(match));
+ return trim;
+ }
+
+ /**
+  * Unfinished counterpart of trimEnd; the first statement is assert(false) with message "TODO".
+  * crbmq is read by index: 0=colors, 1=colorRef, 2=baseRef, 3=match, 4=quality.
+  * With assertions disabled: returns 0 unless match[0] is 'S', 'I', 'N', 'X' or 'Y'; otherwise
+  * runs the same backward scan and tail truncation that trimEnd performs.
+  * NOTE(review): despite the name, the scan starts at the end of match and Arrays.copyOf drops
+  * elements from the tail, so nothing is removed from the start - confirm intent before use.
+  * @param crbmq Array bundle; all five entries are replaced when trimming occurs.
+  * @param thresh Must see this many consecutive 'm' to stop.
+  * @param r Read whose stop (plus strand) or start (otherwise) is adjusted.
+  * @return Number of positions removed from colors, match and quality; 0 if nothing was trimmed.
+  */
+ private static int trimStart(final byte[][] crbmq, int thresh, Read r){
+ assert(false) : "TODO";
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+ byte[] quality=crbmq[4];
+
+// if(match[0]=='m' && match[match.length-1]=='m'){return 0;}
+ {
+ byte a=match[0];
+ if(a=='m'){return 0;}
+ }
+ if(match[0]=='S' || match[0]=='I' || match[0]=='N'
+ || match[0]=='X'|| match[0]=='Y'){
+ int last=match.length-1;
+ int minBadIndex=last;
+ int mcount=0;
+
+ int insertions=0;
+
+ //Scan backward from the end; indices 0 and 1 are never examined.
+ while(last>1 && mcount<thresh){
+ byte c=match[last];
+ if(c=='m'){mcount++;}
+ else if(match[last]=='S' || match[last]=='I' || match[last]=='N'
+ || match[last]=='X'|| match[last]=='Y'){
+ minBadIndex=last;
+ mcount=0;
+ if(match[last]=='I' || match[last]=='X'|| match[last]=='Y'){
+ insertions++;
+ }
+ }else{
+ break;
+ }
+ last--;
+ }
+
+ final int trim=match.length-minBadIndex;
+
+ int trim2=insertions-trim;
+
+ colors=Arrays.copyOf(colors, colors.length-trim);
+ if(trim2!=0){
+ colorRef=Arrays.copyOf(colorRef, colorRef.length-trim+insertions);
+ baseRef=Arrays.copyOf(baseRef, baseRef.length-trim+insertions);
+ }
+ match=Arrays.copyOf(match, match.length-trim);
+ quality=Arrays.copyOf(quality, quality.length-trim);
+
+ crbmq[0]=colors;
+ crbmq[1]=colorRef;
+ crbmq[2]=baseRef;
+ crbmq[3]=match;
+ crbmq[4]=quality;
+
+ if(r.strand()==Gene.PLUS){
+ r.stop-=(trim-insertions);
+ }else{
+ r.start+=(trim-insertions);
+ }
+
+ return trim;
+ }
+ //match[0] was 'D' or some other symbol not handled above.
+ return 0;
+ }
+
+
+// private static int trimStart(final byte[][] crbmq, Read r){
+//
+// byte[] colors=crbmq[0];
+// byte[] colorRef=crbmq[1];
+// byte[] baseRef=crbmq[2];
+// byte[] match=crbmq[3];
+// byte[] quality=crbmq[4];
+//
+// if(match[0]=='m' || match[0]=='S'){return 0;}
+//
+// int index=0;
+// int insertions=0;
+// while(index<match.length && (match[index]=='I' || match[index]=='X')){
+// if(match[index]=='I'){insertions++;}
+// index++;
+// }
+// if(index==0){return 0;}
+// System.err.println("*** "+r.toText(false));
+// System.err.println(toString(crbmq));
+//
+// int start2=index-insertions;
+//
+// colors=Arrays.copyOfRange(colors, index, colors.length);
+// if(start2!=0){
+// colorRef=Arrays.copyOfRange(colorRef, index-insertions, colorRef.length);
+// baseRef=Arrays.copyOfRange(baseRef, index-insertions, baseRef.length);
+// }
+// match=Arrays.copyOfRange(match, index, match.length);
+// quality=Arrays.copyOfRange(quality, index, quality.length);
+//
+//
+// crbmq[0]=colors;
+// crbmq[1]=colorRef;
+// crbmq[2]=baseRef;
+// crbmq[3]=match;
+// crbmq[4]=quality;
+//
+// System.err.println("\n"+toString(crbmq));
+//
+// if(r.strand()==Gene.PLUS){
+// r.start+=(index-insertions);
+// }else{
+// r.stop-=(index-insertions);
+// }
+//
+// return index;
+// }
+
+
+ private static int fixSubs(final byte[][] crbmq){
+
+ byte[] colors=crbmq[0];
+ byte[] colorRef=crbmq[1];
+ byte[] baseRef=crbmq[2];
+ byte[] match=crbmq[3];
+ byte[] quality=crbmq[4];
+
+ assert(colors.length==colorRef.length) : "\n"+toString(crbmq);
+ assert(colors.length==match.length) : "\n"+toString(crbmq);
+ assert(colors.length==baseRef.length-1) : "\n"+toString(crbmq);
+
+ int first=match.length-1, last=0;
+
+ for(int i=0; i<match.length; i++){
+ if(match[i]=='S'){
+ first=Tools.min(first, i);
+ last=Tools.max(last, i);
+ }
+ }
+
+ if(verbose){System.err.println("First="+first+", last="+last);}
+
+ if(first>last){return 0;} //No subs
+
+
+ if(last>=colors.length-1 && first==0){
+ return -1; //Cannot decode
+ }else if(first>0){ //Go right only
+ if(verbose){System.err.println("max="+Tools.min(last+1, colors.length));}
+ for(int i=first, max=Tools.min(last+1, colors.length); i<max; i++){
+ match[i]='m';
+ if(colors[i-1]!='N' && baseRef[i-1]!='N'){
+ baseRef[i]=AminoAcid.colorToBase(baseRef[i-1], colors[i-1]);
+ }//else do nothing
+ }
+ if(last==match.length-1 && colors[last]!='N' && baseRef[last]!='N'){
+ baseRef[last+1]=AminoAcid.colorToBase(baseRef[last], colors[last]);
+ }
+ }else if(first==0){ //Left only
+ for(int i=last, min=Tools.max(0, first); i>=min; i--){
+ match[i]='m';
+ if(colors[i]!='N' && baseRef[i+1]!='N'){
+ baseRef[i]=AminoAcid.colorToBase(baseRef[i+1], colors[i]);
+ assert(baseRef[i]!='N') : i+", "+colors[i]+", "+(char)baseRef[i+1];
+ }//else do nothing
+ }
+ }
+
+ return 1;
+
+ }
+
+
+ private static int distToMismatch(byte[] colors, byte[] colorRef, int loc, int limit) {
+ int min=limit+1;
+ int left=Tools.max(0, loc-limit);
+ int right=Tools.min(colors.length, loc+limit+1);
+ for(int i=left; i<loc; i++){
+ if(colors[i]!=colorRef[i]){
+ min=Tools.min(min, loc-i);
+ }
+ }
+ for(int i=loc+1; i<right; i++){
+ if(colors[i]!=colorRef[i]){
+ min=Tools.min(min, i-loc);
+ }
+ }
+ return min;
+ }
+
+
+ /**
+  * Verifies a mapped read's match string against the reference chromosome.
+  * Minus-strand reads are reverse-complemented in place for the comparison and are
+  * always restored before this method returns or throws.
+  *
+  * @param r Mapped, valid read
+  * @param loud Passed through to verifyMatchString to enable diagnostic output
+  * @return false if the read has no match string or it is shorter than the read;
+  * true if every symbol is 'm'; otherwise the result of verifyMatchString.
+  * If verifyMatchString throws an Exception it is logged and true is returned (best effort).
+  */
+ public static boolean verifyMatchString2(Read r, boolean loud){
+     assert(r.mapped());
+     assert(r.valid());
+     if(r.match==null){return false;}
+     if(r.match.length<r.length()){return false;}
+
+     //A match string made only of 'm' needs no comparison against the reference.
+     boolean allMatch=true;
+     for(int i=0; i<r.match.length && allMatch; i++){
+         allMatch=(r.match[i]=='m');
+     }
+     if(allMatch){
+         assert(r.match.length==r.length());
+         return true;
+     }
+
+     final boolean flipped=(r.strand()==Gene.MINUS);
+     if(flipped){
+         AminoAcid.reverseComplementBasesInPlace(r.bases);
+         Tools.reverseInPlace(r.quality);
+     }
+
+     try{
+         final ChromosomeArray cha=Data.getChromosome(r.chrom);
+         try{
+             return verifyMatchString(r.bases, cha.array, r.match, r.start, loud);
+         }catch(Exception e){
+             System.err.println(e);
+             System.err.println("This read failed verifyMatchString:\n"+r.toText(false)+"\n");
+             return true;//ignores the problem.
+         }
+     }finally{
+         //Undo the flip on every exit path, including errors that are not caught above.
+         if(flipped){
+             AminoAcid.reverseComplementBasesInPlace(r.bases);
+             Tools.reverseInPlace(r.quality);
+         }
+     }
+ }
+
+
+ public static boolean verifyMatchString(byte[] call, byte[] ref, byte[] match, int rstart, boolean loud){
+
+ boolean ok=true;
+ for(int ci=0, mi=0, ri=rstart; ok && mi<match.length; mi++){
+ byte m=match[mi];
+ byte c=(m=='D' ? (byte)'?' : call[ci]);
+// byte r=((m=='I' || m=='X' || m=='Y') ? (byte)'?' : ref[ri]);
+ byte r=((m=='I' || m=='X' || m=='Y') ? (byte)'?' : ((ri>=0 && ri<ref.length) ? ref[ri] : (byte)'N'));
+
+ if(m=='m' || m=='s'){
+ ok=c==r;
+ ci++;
+ ri++;
+ }else if(m=='D'){
+ ri++;
+ }else if(m=='I' || m=='X' || m=='Y'){
+ ci++;
+ }else if(m=='S'){
+ ok=c!=r;
+ ci++;
+ ri++;
+ }else if(m=='N'){
+ ok=(c=='N' || r=='N');
+ ci++;
+ ri++;
+ }else{
+ assert(false) : (char)m;
+ }
+
+ }
+
+ if(!ok && loud){
+ System.err.println("NOT OK!");
+ if(call[0]<4){
+ if(ref.length>400){
+ System.err.println(toStringCS(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)");
+ }else{
+ System.err.println(toStringCS(ref)+" (ref)");
+ }
+ System.err.println(toStringCS(call)+" (call)");
+ System.err.println(new String(match));
+ }else{
+ if(ref.length>400){
+ System.err.println(new String(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)");
+ }else{
+ System.err.println(new String(ref)+" (ref)");
+ }
+ System.err.println(new String(call)+" (call)");
+ System.err.println(new String(match));
+ }
+ }
+
+ if(!ok){
+
+ ok=true;
+
+ if(loud){System.err.println("Attempting to fix and skip error.");}
+ for(int ci=0, mi=0, ri=rstart; mi<match.length; mi++){
+ byte m=match[mi];
+ byte c=(m=='D' ? (byte)'?' : call[ci]);
+// byte r=((m=='I' || m=='X' || m=='Y') ? (byte)'?' : ref[ri]);
+ byte r=((m=='I' || m=='X' || m=='Y') ? (byte)'?' : ((ri>=0 && ri<ref.length) ? ref[ri] : (byte)'N'));
+
+ if(m=='m' || m=='s'){
+ if(!AminoAcid.isFullyDefined(c) || !AminoAcid.isFullyDefined(r)){
+ match[mi]='N';
+ }else{
+ ok=(ok && c==r);
+ }
+ ci++;
+ ri++;
+ }else if(m=='D'){
+ ri++;
+ }else if(m=='I' || m=='X' || m=='Y'){
+ ci++;
+ }else if(m=='S'){
+ ok=(ok && c!=r);
+ ci++;
+ ri++;
+ }else if(m=='N'){
+ ok=(ok && (!AminoAcid.isFullyDefined(c) || !AminoAcid.isFullyDefined(r)));
+ ci++;
+ ri++;
+ }else{
+ assert(false) : (char)m;
+ }
+ }
+
+ if(call[0]<4){
+ if(ref.length>400){
+ System.err.println(toStringCS(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)");
+ }else{
+ System.err.println(toStringCS(ref)+" (ref)");
+ }
+ System.err.println(toStringCS(call)+" (call)");
+ System.err.println(new String(match));
+ }else{
+ if(ref.length>400){
+ System.err.println(new String(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)");
+ }else{
+ System.err.println(new String(ref)+" (ref)");
+ }
+ System.err.println(new String(call)+" (call)");
+ System.err.println(new String(match));
+ }
+
+
+
+ if(THROW_EXCEPTION_ON_VERIFY_FAILURE){
+ System.err.println("Fixed successfully?\t"+ok);
+ throw new RuntimeException("Failed VerifyMatchString()");
+ }
+
+ }
+
+ return ok;
+ }
+
+ //TODO: No-calls and no-ref are currently considered the same.
+ /**
+  * Converts every non-matching stretch of a mapped read's match string into a Varlet.
+  * When this is called, the match string should be plus-oriented.
+  * Side effects on read.match: 'X' and 'Y' are rewritten to 'I', and an 'N' that covers a
+  * called base over an 'N' in the reference is rewritten to 'R' (no-ref).
+  * Minus-strand reads are reverse-complemented in place while variations are built and are
+  * restored on every exit path.
+  *
+  * @param read Mapped read with a non-null match string
+  * @param CONDENSE Merge adjacent or overlapping variations into a single DELINS
+  * @param CONDENSE_SNPS Allow SNPs to take part in merging of non-overlapping variations
+  * @param SPLIT_SUBS Emit each substitution as its own variation
+  * @return The variations in match-string order; null if the read has no variations
+  * or an unrecognized match symbol is encountered
+  */
+ public ArrayList<Varlet> toVars(final Read read, final boolean CONDENSE, final boolean CONDENSE_SNPS, final boolean SPLIT_SUBS){
+     assert(read.match!=null);
+     final byte[] match=read.match;
+     byte[] quality=read.quality;
+     final byte[] call=read.bases;
+
+     if(quality==null){quality=Read.getFakeQuality(call.length);}
+
+     assert(checkArray(call));
+
+     //Upper bound on the number of variations: one per run of identical non-'m' symbols.
+     int maxVars=0;
+
+     byte last='m';
+     for(int i=0; i<match.length; i++){
+         byte b=match[i];
+         if(b=='X' || b=='Y'){
+             b=match[i]='I';
+         } //TODO: Should not be needed, if reads are trimmed...
+
+         if(b!='m' && b!=last){
+             maxVars++;
+         }
+         last=b;
+     }
+
+     if(maxVars==0){return null;}
+
+     final boolean flipped=(read.strand()==Gene.MINUS);
+     if(flipped){
+         AminoAcid.reverseComplementBasesInPlace(call);
+         Tools.reverseInPlace(quality);
+     }
+
+     try{
+         ArrayList<Varlet> vars=new ArrayList<Varlet>(maxVars);
+         final ChromosomeArray cha=Data.getChromosome(read.chrom);
+
+         //Called for its diagnostics and its repair of the match string; the result is not used.
+         try {
+             verifyMatchString(call, cha.array, match, read.start, true);
+         } catch (Exception e) {
+             e.printStackTrace();
+             System.err.println("in TranslateColorspace.toVars(), a read failed verification:\n"+read.toText(false)+"\n");
+         }
+
+         if(verbose){
+             System.err.println("Making vars:");
+             System.err.println(new String(call));
+             System.err.println(cha.getString(read.start, read.stop));
+             System.err.println(new String(match));
+
+         }
+
+         //Read quality: the mean, with the minimum counted two extra times.
+         final int readQuality;
+         {
+             int totalQual=0;
+             int minQual=quality[0];
+             for(int i=0; i<quality.length; i++){
+                 totalQual+=quality[i];
+                 minQual=Tools.min(minQual, quality[i]);
+             }
+             readQuality=(totalQual+2*minQual)/(read.length()+2);
+         }
+         final float expectedErrors=read.expectedErrors(false, 0);
+
+         int callPos=0;
+         int refPos=read.start;
+
+         //Make variations, then merge adjacent variations.
+         for(int matchPos=0; matchPos<match.length; matchPos++){
+
+             if(match[matchPos]=='N'){
+                 byte a=call[callPos];
+                 byte b=cha.get(refPos);
+                 if(a!='N' && b=='N'){match[matchPos]='R';}
+             }
+
+             final byte type=match[matchPos];
+
+             if(type=='m'){
+                 callPos++;
+                 refPos++;
+                 continue;
+             }
+
+             byte m;
+             int nCount=0; //"no-call": "N" in read
+             int rCount=0; //"no-ref": "N" in ref but read is called
+             int iCount=0;
+             int dCount=0;
+             int sCount=0;
+
+             //call string
+             StringBuilder cs=new StringBuilder(8);
+
+             //ref string
+             StringBuilder rs=new StringBuilder(8);
+
+             final int mstart=matchPos;
+             final int cstart=callPos;
+             final int rstart=refPos;
+
+             int qualSum=0;
+             int qualMin=quality[callPos];
+
+             //Consume one run of identical symbols.
+             while(matchPos<match.length && (m=match[matchPos])==type){
+
+                 //TODO: Not very good for deletions...
+                 qualSum+=quality[callPos];
+                 qualMin=Tools.min(qualMin, quality[callPos]);
+
+                 if(m=='I'){
+                     iCount++;
+                     cs.append((char)call[callPos]);
+                     callPos++;
+                 }else if(m=='D'){
+                     dCount++;
+                     rs.append((char)cha.get(refPos));
+                     refPos++;
+                 }else if(m=='S'){
+                     sCount++;
+                     cs.append((char)call[callPos]);
+                     rs.append((char)cha.get(refPos));
+                     assert(call[callPos]!='N');
+                     assert(cha.get(refPos)!='N');
+                     callPos++;
+                     refPos++;
+                     if(SPLIT_SUBS){
+                         matchPos++;break;//Forces all subs to be split
+                     }
+                 }else if(m=='N'){
+
+                     assert(call[callPos]=='N') : callPos+"\n"+new String(call)+"\n"+new String(match)+"\n"+cha.getString(read.start, read.stop)+"\n";
+                     nCount++;
+                     cs.append('N');
+                     rs.append((char)cha.get(refPos));
+                     callPos++;
+                     refPos++;
+
+                     //This block corrects for a rare situation when both no-calls and no-refs are mixed in a single 'N' block.
+                     {
+                         int x=matchPos+1;
+                         if(x<match.length && match[x]=='N'){
+                             byte a=call[callPos];
+                             byte b=cha.get(refPos);
+                             if(a!='N' && b=='N'){match[x]='R';}
+                         }
+                     }
+
+                 }else if(m=='R'){
+                     assert(call[callPos]!='N');
+                     assert(cha.get(refPos)=='N');
+                     rCount++;
+                     cs.append((char)call[callPos]);
+                     rs.append((char)cha.get(refPos));
+                     callPos++;
+                     refPos++;
+                     matchPos++;break; //Output no-ref individually
+                 }else{
+                     System.err.println("Detected invalid decode for read "+read.numericID+":");
+                     System.err.println((char)m+"\n"+new String(rs)+"\n"+new String(cs)+"\n"+new String(match)+"\n"
+                             +new String(call)+"\n"+cha.getString(read.start, read.stop)+"\n"+read.toText(false)+"\n");
+                     //The finally block below restores the read's orientation.
+                     return null;
+                 }
+                 matchPos++;
+             }
+             matchPos--;
+
+             final int mstop=matchPos;
+
+             final int mlen=iCount+dCount+sCount+nCount+rCount;
+
+             final int clen=iCount+sCount+nCount+rCount;
+             final int rlen=dCount+sCount+nCount+rCount;
+
+             callPos=cstart+clen;
+             refPos=rstart+rlen;
+
+             final int rstop=Tools.max(rstart, rstart+rlen-1);
+             final int cstop=cstart+clen-1;
+
+             final byte varType;
+
+             if(rlen==0){
+                 varType=Variation.INS;
+                 if(verbose){System.err.println("Setting type INS: "+Variation.varTypeMap[varType]);}
+             }else if(clen==0){varType=Variation.DEL;}
+             else if(rCount>0){varType=Variation.NOREF;}
+             else if(cs.charAt(0)=='N'){
+                 varType=Variation.NOCALL;
+                 if(verbose){System.err.println("Setting type NOCALL: "+Variation.varTypeMap[varType]);}
+             }else if(mlen==1){varType=Variation.SNP;}
+             else{varType=Variation.DELINS;}
+
+
+             //Distance of the variation from the head, tail, and nearest end of the read.
+             final int headDist, tailDist, endDist;
+             {
+                 int cstart2=cstart, cstop2=cstop;
+                 if(varType==Variation.DEL){
+                     cstart2--;
+                     cstop2++;
+                 }
+
+                 assert(cstop2>=cstart2) : Variation.varTypeMap[varType]+", "+cstop2+", "+cstart2+", "+clen+
+                     "\n'"+cs+"', '"+rs+"'\n"+new String(match);
+                 assert(cstop2<call.length);
+
+                 if(read.strand()==Gene.PLUS){
+                     headDist=cstart2;
+                     tailDist=call.length-cstop2-1;
+                 }else{
+                     tailDist=cstart2;
+                     headDist=call.length-cstop2-1;
+                 }
+                 endDist=Tools.min(headDist, tailDist);
+                 assert(headDist>=0);
+                 assert(tailDist>=0);
+             }
+
+
+             final int varQuality;
+             if(varType==Variation.DEL){
+                 varQuality=((qualSum/mlen)+(qualMin))/2;
+             }else{
+                 if(callPos<quality.length-1 && callPos>1){
+                     //Minimum over the same four-base window that is summed on the next line.
+                     qualMin=Tools.min(quality[callPos-2], quality[callPos-1], quality[callPos], quality[callPos+1]);
+                     varQuality=(quality[callPos-2]+quality[callPos-1]+quality[callPos]+quality[callPos+1]+(qualMin))/5;
+                 }else if(callPos<quality.length && callPos>0){
+                     qualMin=Tools.min(quality[callPos-1], quality[callPos]);
+                     varQuality=qualMin;
+                 }else{
+                     varQuality=((qualSum/mlen)+(qualMin))/2;
+                 }
+             }
+
+             if(verbose){
+                 System.err.println("mlen="+mlen+", rlen="+rlen+", clen="+clen+", varType="+Variation.varTypeMap[varType]+"\n"+
+                         ", cs="+cs+", nCount="+nCount+", rCount="+rCount+", iCount="+iCount+", dCount="+dCount+", sCount="+sCount);
+             }
+
+             final Varlet v=new Varlet(read.chrom, read.strand(), rstart, rstop, mstart, mstop, varType, rs.toString(), cs.toString(),
+                     varQuality, readQuality, read.mapScore, read.errors, expectedErrors, (read.paired() ? 1 : 0), read.numericID,
+                     read.length(), read.start, read.stop, read.copies, headDist, tailDist, endDist,
+                     read.pairnum());
+
+             if(v.varType==Variation.SNP){
+                 if(v.call.equals(v.ref)){
+                     System.err.println("\n"+read.toText(false));
+                     System.err.println("\n"+v.toText());
+                     System.err.println("\n"+read.strand());
+                     System.err.println("\n");
+                     System.err.println(cha.getString(read.start, read.stop));
+                     System.err.println(new String(call));
+                     System.err.println(new String(match));
+                     System.err.println("\n");
+                     assert(false);
+                 }
+
+             }
+
+             vars.add(v);
+         }
+
+         //Optionally, merge nearby variations
+         if(CONDENSE && vars.size()>1){
+             boolean condense=false;
+
+             final int mergeDistance=1; // 1 for adjacent, 2 for non-adjacent.
+
+             //First decide whether any neighboring pair qualifies for merging.
+             for(int i=1; i<vars.size() && !condense; i++){
+                 Varlet v1=vars.get(i-1);
+                 Varlet v2=vars.get(i);
+                 assert(v1.matchStop<v2.matchStart);
+
+                 if(!v1.isNR_or_NC() && !v2.isNR_or_NC()){
+                     if(v1.endLoc>=v2.beginLoc){condense=true;} //To prevent overlapping variations
+                     else if(CONDENSE_SNPS || (v1.varType!=Variation.SNP && v2.varType!=Variation.SNP)){
+                         condense|=(v1.matchStop>=v2.matchStart-mergeDistance);
+                     }
+                 }
+
+                 if(verbose){
+                     System.err.println("Compared\n"+v1+"\nand\n"+v2+"\ncondense="+condense+"\n"+v1.matchStart+", "+v2.matchStart+", "+mergeDistance);
+                 }
+             }
+
+             if(condense){
+                 if(verbose){
+                     System.err.println("Condensing:");
+                     for(Varlet v : vars){
+                         System.err.println(v);
+                     }
+                 }
+                 //Work from the tail: the last element is either merged into its predecessor
+                 //(the merged Varlet replaces both) or moved to list2.
+                 ArrayList<Varlet> list2=new ArrayList<Varlet>(vars.size()-1);
+                 for(int i=vars.size()-2; i>=0; i--){
+                     Varlet prev=vars.get(i);
+                     Varlet v=vars.remove(i+1);
+
+
+                     boolean merge=(!v.isNR_or_NC() && !prev.isNR_or_NC() && (prev.matchStop>=v.matchStart-mergeDistance || prev.endLoc>=v.beginLoc));
+                     if(merge && !CONDENSE_SNPS && prev.endLoc<v.beginLoc){
+                         if(v.varType==Variation.SNP || prev.varType==Variation.SNP){
+                             merge=false;
+                         }
+                     }
+
+                     if(merge){ //then merge.
+
+                         final byte varType=Variation.DELINS;
+
+                         int midstart=prev.endLoc+1;
+                         int midstop=v.beginLoc-1;
+
+                         if(prev.varType==Variation.INS){midstart--;}
+
+                         //Reference bases lying between the two variations, shared by both strings.
+                         String middle=(midstart>midstop ? "" : cha.getString(midstart, midstop));
+
+                         String cs=(prev.call==null ? "" : prev.call)+middle+(v.call==null ? "" : v.call);
+                         String rs=(prev.ref==null ? "" : prev.ref)+middle+(v.ref==null ? "" : v.ref);
+
+                         final int headDist=Tools.min(v.headDist, prev.headDist);
+                         final int tailDist=Tools.min(v.tailDist, prev.tailDist);
+                         final int endDist=Tools.min(v.endDist, prev.endDist);
+
+
+                         Varlet v2=new Varlet(read.chrom, read.strand(), prev.beginLoc, v.endLoc, prev.matchStart, v.matchStop, varType,
+                                 rs, cs, (prev.avgVarQuality()+v.avgVarQuality())/2, readQuality, read.mapScore, read.errors, expectedErrors,
+                                 (read.paired() ? 1 : 0), read.numericID, read.length(),
+                                 read.start, read.stop, read.copies, headDist, tailDist, endDist, read.pairnum());
+
+                         vars.remove(i); //prev
+                         vars.add(v2);
+                     }else{
+                         list2.add(v);
+                     }
+                 }
+                 assert(vars.size()==1);
+                 list2.add(vars.get(0));
+                 Collections.reverse(list2);
+                 vars=list2;
+
+                 if(verbose){
+                     System.err.println("Condensed:");
+                     for(Varlet v : vars){
+                         System.err.println(v);
+                     }
+                     System.err.println();
+                 }
+             }
+         }
+
+         return vars;
+     }finally{
+         //Restore the read's original orientation on every exit path, including early returns.
+         if(flipped){
+             AminoAcid.reverseComplementBasesInPlace(call);
+             Tools.reverseInPlace(quality);
+         }
+     }
+ }
+
+
+ public MSA msaBS;
+
+ public static boolean verbose=false;
+
+ public static boolean DISCARD_NOCALLED_INSERTIONS=false;
+ public static boolean THROW_EXCEPTION_ON_VERIFY_FAILURE=true; //Throws an exception when "verify match string" fails
+
+}
diff --git a/current/align2/TrimRead.java b/current/align2/TrimRead.java
new file mode 100755
index 0000000..66758a7
--- /dev/null
+++ b/current/align2/TrimRead.java
@@ -0,0 +1,540 @@
+package align2;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Gene;
+
+/**
+ * Helper class for processes that do inline quality trimming.
+ * @author Brian Bushnell
+ * @date Mar 15, 2013
+ *
+ */
+public final class TrimRead implements Serializable {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 8791743639124592480L;
+
+ /**
+  * Command-line demonstration: trims a read, prints it, then untrims it and prints it again.
+  * Arguments: bases [qualities [match [minq]]]. Each quality character is reduced by 32;
+  * minq defaults to 5.
+  */
+ public static void main(String[] args){
+     if(args.length<1){
+         System.err.println("Usage: TrimRead <bases> [qualities] [match] [minq]");
+         return;
+     }
+     byte[] bases=args[0].getBytes();
+     byte[] quals=(args.length<2 ? null : args[1].getBytes());
+     if(quals!=null){
+         for(int i=0; i<quals.length; i++){quals[i]-=32;}
+     }
+     byte[] match=(args.length<3 ? null : args[2].getBytes());
+     int minq=(args.length<4 ? 5 : Integer.parseInt(args[3]));
+     Read r=new Read(bases, quals, 1);
+     r.match=match;
+     System.out.println("Before trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match)));
+     System.out.println(Arrays.toString(r.quality));
+     TrimRead tr=trim(r, true, true, minq, 1);
+     System.out.println("\nAfter trim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match)));
+     if(r.match==null){
+         r.match=new byte[r.length()];
+         for(int i=0; i<r.length(); i++){r.match[i]='m';}
+     }
+     //trim() returns null when no bases needed trimming, in which case there is nothing to undo.
+     if(tr!=null){tr.untrim();}
+     System.out.println("\nAfter untrim:\n"+r.toFastq()+(r.match==null ? "" : "\n"+new String(r.match)));
+ }
+
+ public static TrimRead trim(Read r, boolean trimLeft, boolean trimRight, int trimq, int minlen){
+ if(r==null || r.bases==null){return null;}
+
+ final int a, b;
+ if(optimalMode){
+ long packed=testOptimal(r.bases, r.quality, QualityTools.PROB_ERROR[trimq]);
+ a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0;
+ b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0;
+ }else if(windowMode){
+ a=0;
+ b=(trimRight ? testRightWindow(r.bases, r.quality, (byte)trimq, windowLength) : 0);
+ }else{
+ a=(trimLeft ? testLeft(r.bases, r.quality, (byte)trimq) : 0);
+ b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0);
+ }
+ return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen));
+ }
+
+ /**
+ * Trim until at least 'minlen' consecutive bases exceed 'minq'
+ * @param r Read to trim
+ * @param trimLeft Trim left side
+ * @param trimRight Trim right side
+ * @param trimq Maximum quality to trim
+ * @param minlen Minimum consecutive bases over minq before trimming stops
+ * @return Number of bases trimmed
+ */
+ public static int trimFast(Read r, boolean trimLeft, boolean trimRight, int trimq, int minlen){
+ final byte[] bases=r.bases, qual=r.quality;
+ if(bases==null || bases.length<1){return 0;}
+ final int a, b;
+ if(optimalMode){
+ long packed=testOptimal(bases, qual, QualityTools.PROB_ERROR[trimq]);
+ a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0;
+ b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0;
+ }else if(windowMode){
+ a=0;
+ b=(trimRight ? testRightWindow(bases, qual, (byte)trimq, windowLength) : 0);
+ }else{
+ a=(trimLeft ? testLeft(bases, qual, (byte)trimq) : 0);
+ b=(trimRight ? testRight(bases, qual, (byte)trimq) : 0);
+ }
+ return trimByAmount(r, a, b, minlen);
+ }
+
+ /**
+  * Undoes a trim recorded in the read's attached object.
+  * @param r Read to restore
+  * @return false if r is null or r.obj is not a TrimRead; otherwise the result of the attached TrimRead's untrim()
+  */
+ public static boolean untrim(Read r){
+     //TrimRead is final, so instanceof is equivalent to an exact class comparison.
+     if(r==null || !(r.obj instanceof TrimRead)){return false;}
+     return ((TrimRead)r.obj).untrim();
+ }
+
+// public TrimRead(Read r_, boolean trimLeft, boolean trimRight, int trimq_, int minlen_){
+// this(r_, (trimLeft ? testLeft(r_.bases, r_.quality, (byte)trimq_) : 0), (trimRight ? testRight(r_.bases, r_.quality, (byte)trimq_) : 0), trimq_, minlen_);
+// }
+
+ /**
+  * Trims the given number of bases from each end of a read and records the originals.
+  * If anything was trimmed, the read's bases and quality are replaced by the trimmed
+  * copies, this object is stored in r_.obj, and trimMatch is applied to the read.
+  *
+  * @param r_ Read to trim
+  * @param trimLeft Number of bases to remove from the left
+  * @param trimRight Number of bases to remove from the right
+  * @param trimq_ Quality threshold, stored as a byte
+  * @param minlen_ Minimum length to leave; negative values are treated as 0
+  */
+ public TrimRead(Read r_, int trimLeft, int trimRight, int trimq_, int minlen_){
+     minlen_=Tools.max(minlen_, 0);
+     r=r_;
+     bases1=r.bases;
+     qual1=r.quality;
+     trimq=(byte)trimq_;
+     //bases1 is null whenever this assertion fires, so the message must not dereference it.
+     assert(bases1!=null || qual1==null) : "Read has quality scores but no bases: "+Arrays.toString(qual1);
+     assert(bases1==null || qual1==null || bases1.length==qual1.length) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n";
+     int trimmed=trim(trimLeft, trimRight, minlen_);
+     if(trimmed>0){
+         assert(bases2==null || bases2.length>=minlen_ || bases1.length<minlen_) : bases1.length+", "+bases2.length+", "+minlen_+", "+trimLeft+", "+trimRight;
+         r.bases=bases2;
+         r.quality=qual2;
+         r.obj=this;
+         trimMatch(r);
+     }
+ }
+
+ /** Trim the left end of the read, from left to right */
+ private int trim(final boolean trimLeft, final boolean trimRight, final int minlen){
+ final int a, b;
+ if(optimalMode){
+ long packed=testOptimal(bases1, qual1, QualityTools.PROB_ERROR[trimq]);
+ a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0;
+ b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0;
+ }else{
+ a=(trimLeft ? testLeft(bases1, qual1, (byte)trimq) : 0);
+ b=(trimRight ? testRight(bases1, qual1, (byte)trimq) : 0);
+ }
+ return trim(a, b, minlen);
+ }
+
+ /**
+  * Trims the requested number of bases from each side, reducing the request (left side
+  * first, then right) so that at least minlen bases remain. Records the amounts in
+  * leftTrimmed/rightTrimmed and stores the trimmed arrays in bases2/qual2.
+  * @return Total number of bases trimmed
+  */
+ private int trim(int trimLeft, int trimRight, final int minlen){
+     assert(trimLeft>=0 && trimRight>=0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length;
+     assert(trimLeft>0 || trimRight>0) : "trimLeft="+trimLeft+", trimRight="+trimRight+", minlen="+minlen+", len="+bases1.length;
+
+     final int maxTrim=Tools.min(bases1.length, bases1.length-minlen);
+
+     //Give back any excess over maxTrim, first from the left request and then from the right.
+     int over=trimLeft+trimRight-maxTrim;
+     if(over>0 && trimLeft>0){
+         trimLeft=Tools.max(0, trimLeft-over);
+         over=trimLeft+trimRight-maxTrim;
+     }
+     if(over>0 && trimRight>0){
+         trimRight=Tools.max(0, trimRight-over);
+     }
+
+     leftTrimmed=trimLeft;
+     rightTrimmed=trimRight;
+     final int sum=leftTrimmed+rightTrimmed;
+
+     if(verbose){
+         System.err.println("leftTrimmed="+leftTrimmed+", rightTrimmed="+rightTrimmed+", sum="+sum);
+     }
+
+     if(sum==0){
+         bases2=bases1;
+         qual2=qual1;
+         return sum;
+     }
+
+     bases2=Arrays.copyOfRange(bases1, trimLeft, bases1.length-trimRight);
+     //Qualities are dropped entirely when there are none or nothing would remain.
+     final boolean keepQual=(qual1!=null && trimLeft+trimRight<qual1.length);
+     qual2=(keepQual ? Arrays.copyOfRange(qual1, trimLeft, qual1.length-trimRight) : null);
+     return sum;
+ }
+
+ /**
+  * Trims the bases before leftLoc and after rightLoc, keeping both of those positions.
+  * @return The value returned by trimByAmount
+  */
+ public static int trimToPosition(Read r, int leftLoc, int rightLoc, int minResultingLength){
+     final int rightAmount=r.length()-rightLoc-1;
+     return trimByAmount(r, leftLoc, rightAmount, minResultingLength);
+ }
+
+ /**
+  * Removes a leading stretch containing symbols that fail AminoAcid.isACGTN.
+  * Scanning stops once more than 20 consecutive accepted symbols follow the last
+  * rejected one; everything up to and including that last rejected symbol is removed
+  * from the bases and, if present, the qualities.
+  * @return Number of leading bases removed
+  */
+ public static int trimBadSequence(Read r){
+     final byte[] seq=r.bases, qual=r.quality;
+     if(seq==null){return 0;}
+
+     final int requiredRun=20;
+     int badEnd=-1;
+     int pos=0;
+     while(pos<seq.length){
+         if(!AminoAcid.isACGTN(seq[pos])){badEnd=pos;}
+         if(pos-badEnd>requiredRun){break;}
+         pos++;
+     }
+
+     final int cut=badEnd+1;
+     if(cut>0){
+         r.bases=Arrays.copyOfRange(seq, cut, seq.length);
+         if(qual!=null){
+             r.quality=Arrays.copyOfRange(qual, cut, qual.length);
+         }
+     }
+     return cut;
+ }
+
+ /**
+  * Trims the given number of bases from each end of a read, in place.
+  * Negative amounts are treated as 0. If the request would leave fewer than
+  * minResultingLength bases, it is replaced by a right-only trim of
+  * max(1, length-minResultingLength) bases.
+  * When r.stop is greater than r.start, both are moved inward by the trimmed amounts.
+  * @return Total number of bases trimmed
+  */
+ public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength){
+
+     leftTrimAmount=Tools.max(leftTrimAmount, 0);
+     rightTrimAmount=Tools.max(rightTrimAmount, 0);
+
+     //These assertions are unnecessary if the mapping information will never be used or output.
+     assert(r.match==null) : "TODO: Handle trimming of reads with match strings.";
+     assert(r.sites==null) : "TODO: Handle trimming of reads with SiteScores.";
+
+     final byte[] seq=r.bases, scores=r.quality;
+     final int seqLen=(seq==null ? 0 : seq.length);
+     final int scoreLen=(scores==null ? 0 : scores.length);
+     if(seqLen<1){return 0;}
+
+     //Clamp the minimum length to [0, seqLen].
+     minResultingLength=Tools.min(seqLen, Tools.max(minResultingLength, 0));
+     final boolean tooMuch=(leftTrimAmount+rightTrimAmount+minResultingLength>seqLen);
+     if(tooMuch){
+         rightTrimAmount=Tools.max(1, seqLen-minResultingLength);
+         leftTrimAmount=0;
+     }
+
+     final int total=leftTrimAmount+rightTrimAmount;
+     if(total>0){
+         r.bases=Arrays.copyOfRange(seq, leftTrimAmount, seqLen-rightTrimAmount);
+         if(total>=scoreLen){
+             r.quality=null;
+         }else{
+             r.quality=Arrays.copyOfRange(scores, leftTrimAmount, scoreLen-rightTrimAmount);
+         }
+         trimMatch(r);
+         if(r.stop>r.start){ //TODO: Fixing mapped coordinates needs more work.
+             r.start+=leftTrimAmount;
+             r.stop-=rightTrimAmount;
+         }
+     }
+
+     if(verbose){
+         System.err.println("leftTrimmed="+leftTrimAmount+", rightTrimmed="+rightTrimAmount+
+                 ", sum="+total+", final length="+r.length());
+     }
+
+     return total;
+ }
+
+ /**
+  * Count number of bases that need trimming on each side, and pack into a long.
+  * Finds the stretch of the read with the highest cumulative (avgErrorRate - per-base error
+  * probability) and reports everything outside it as trimmable.
+  * The left count is in the high 32 bits of the result and the right count in the low 32 bits.
+  * If no stretch has a positive score, the result has left=0 and right=bases.length.
+  * A non-negative optimalBias replaces the avgErrorRate argument.
+  */
+ private static long testOptimal(byte[] bases, byte[] qual, float avgErrorRate){
+ if(optimalBias>=0){avgErrorRate=optimalBias;}//Override
+ assert(avgErrorRate>0 && avgErrorRate<=1) : "Average error rate ("+avgErrorRate+") must be between 0 (exclusive) and 1 (inclusive)";
+ if(bases==null || bases.length==0){return 0;}
+ //Without qualities, only N-based trimming is possible, and only when avgErrorRate is not below 1.
+ if(qual==null){return avgErrorRate<1 ? 0 : ((((long)testLeftN(bases))<<32) | (((long)testRightN(bases))&0xFFFFFFFFL));}
+
+ //Best score seen, and the end index and length of the stretch that produced it.
+ float maxScore=0;
+ //Running score of the current stretch; reset to 0 whenever it is not positive.
+ float score=0;
+ int maxLoc=-1;
+ int maxCount=-1;
+ //Length of the current stretch.
+ int count=0;
+
+ //Error probability assumed for an 'N' base.
+ final float nprob=Tools.max(Tools.min(avgErrorRate*1.1f, 1), NPROB);
+
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ byte q=qual[i];
+// float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]);
+// float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProbGeoAvg(qual, bases, i) : QualityTools.PROB_ERROR[q]);
+// float probError=(b=='N' ? nprob : ADJUST_QUALITY ? CalcTrueQuality.estimateErrorProb2(qual, bases, i) : QualityTools.PROB_ERROR[q]);
+
+// float probError=(b=='N' ? nprob : q==1 ? PROB1 : QualityTools.PROB_ERROR[q]);
+ float probError=(b=='N' ? nprob : QualityTools.PROB_ERROR[q]);
+
+// assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n";
+
+ //Positive when this base is better than the target error rate.
+ float delta=avgErrorRate-probError;
+ score=score+delta;
+ if(score>0){
+ count++;
+ //Ties in score are broken in favor of the longer stretch.
+ if(score>maxScore || (score==maxScore && count>maxCount)){
+ maxScore=score;
+ maxCount=count;
+ maxLoc=i;
+ }
+ }else{
+ score=0;
+ count=0;
+ }
+ }
+
+ final int left, right;
+ if(maxScore>0){
+ assert(maxLoc>=0);
+ assert(maxCount>0);
+ //The best stretch covers indices [maxLoc-maxCount+1, maxLoc].
+ left=maxLoc-maxCount+1;
+ assert(left>=0 && left<=bases.length);
+ right=bases.length-maxLoc-1;
+ }else{
+ left=0;
+ right=bases.length;
+ }
+ final long packed=((((long)left)<<32) | (((long)right)&0xFFFFFFFFL));
+
+ if(verbose){
+ System.err.println(Arrays.toString(qual));
+ System.err.println("After testLocal: maxScore="+maxScore+", maxLoc="+maxLoc+", maxCount="+maxCount+
+ ", left="+left+", right="+right+", returning "+Long.toHexString(packed));
+ }
+ return packed;
+ }
+
+ /**
+  * Counts the bases that need trimming on the left side.
+  * Scans from the left until minGoodInterval consecutive bases have quality above trimq;
+  * everything up to and including the last base at or below trimq is counted.
+  * Without qualities, returns 0 if trimq is positive and otherwise the result of testLeftN.
+  */
+ private static int testLeft(byte[] bases, byte[] qual, final byte trimq){
+     if(bases==null || bases.length==0){return 0;}
+     if(qual==null){return trimq>0 ? 0 : testLeftN(bases);}
+
+     int goodRun=0;
+     int toTrim=0;
+     int pos=0;
+     while(pos<bases.length && goodRun<minGoodInterval){
+         final byte q=qual[pos];
+         final byte b=bases[pos];
+         assert(q>0 || b=='N') : "index "+pos+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n";
+         if(q>trimq){
+             goodRun++;
+         }else{
+             goodRun=0;
+             toTrim=pos+1;
+         }
+         pos++;
+     }
+     if(verbose){
+         System.err.println("After testLeft: good="+goodRun+", lastBad="+(toTrim-1)+", i="+pos+", returning "+toTrim);
+     }
+     return toTrim;
+ }
+
+ /**
+  * Count number of bases that need trimming on right side using a sliding window.
+  * The window slides left to right over the qualities; at the first position where
+  * the sum of the window's qualities falls below max(window*trimq, 1), everything
+  * from the start of that window to the end of the read is counted for trimming.
+  * @param bases Read bases; null or empty yields 0
+  * @param qual Per-base qualities; if null or shorter than the window, falls back to N-only trimming (when trimq is not positive) or 0
+  * @param trimq Average quality threshold for a window
+  * @param window Window length in bases
+  * @return Number of bases to remove from the right end, or 0 if no window fails
+  */
+ private static int testRightWindow(byte[] bases, byte[] qual, final byte trimq, final int window){
+     if(bases==null || bases.length==0){return 0;}
+     if(qual==null || qual.length<window){
+         if(trimq>0){return 0;}
+         return testRightN(bases);
+     }
+
+     final int thresh=Tools.max(window*trimq, 1);
+     int windowSum=0;
+     for(int head=0; head<qual.length; head++){
+         final int tail=head-window; //index of the base that just left the window; -1 when the first full window is reached
+         windowSum+=qual[head];
+         if(tail<-1){continue;} //window not yet full
+         if(tail>=0){windowSum-=qual[tail];}
+         if(windowSum<thresh){
+             return qual.length-tail-1;
+         }
+     }
+     return 0;
+ }
+
+ /**
+  * Count number of bases that need trimming on right side.
+  * Walks leftward from the last base until minGoodInterval consecutive bases
+  * with quality above trimq have been seen; every base from the leftmost
+  * low-quality base encountered through the end of the read is counted.
+  * @param bases Read bases; null or empty yields 0
+  * @param qual Per-base qualities; when null, only N bases are considered (and only if trimq is not positive)
+  * @param trimq Qualities at or below this value are treated as bad
+  * @return Number of bases to remove from the right end
+  */
+ private static int testRight(byte[] bases, byte[] qual, final byte trimq){
+     if(bases==null || bases.length==0){return 0;}
+     if(qual==null){return trimq>0 ? 0 : testRightN(bases);}
+     int good=0;
+     int lastBad=bases.length;
+     int i=bases.length-1;
+     for(; i>=0 && good<minGoodInterval; i--){
+         final byte q=qual[i];
+         final byte b=bases[i];
+         assert(q>0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n";
+         if(q>trimq){good++;}
+         else{good=0; lastBad=i;}
+     }
+     if(verbose){
+         //The label previously read "After trimLeft", which misattributed this output.
+         System.err.println("After testRight: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(bases.length-lastBad));
+     }
+     return bases.length-lastBad;
+ }
+
+ /** Count number of bases that need trimming on left side, considering only N as bad */
+ public static int testLeftN(byte[] bases){
+ if(bases==null || bases.length==0){return 0;}
+ int good=0;
+ int lastBad=-1;
+ for(int i=0; i<bases.length && good<minGoodInterval; i++){
+ final byte b=bases[i];
+ //if(dna.AminoAcid.isFullyDefined(b)){good++;}
+ if(b!=((byte)'N')){good++;}
+ else{good=0; lastBad=i;}
+ }
+ return lastBad+1;
+ }
+
+ /** Count number of bases that need trimming on right side, considering only N as bad */
+ public static int testRightN(byte[] bases){
+ if(bases==null || bases.length==0){return 0;}
+ int good=0;
+ int lastBad=bases.length;
+ for(int i=bases.length-1; i>=0 && good<minGoodInterval; i--){
+ final byte b=bases[i];
+ //if(dna.AminoAcid.isFullyDefined(b)){good++;}
+ if(b!=((byte)'N')){good++;}
+ else{good=0; lastBad=i;}
+ }
+ return bases.length-lastBad;
+ }
+
+ /**
+ * Restores the read to its untrimmed state: puts back the original bases and
+ * qualities, widens start/stop by the trimmed amounts, pads the match string
+ * (if any) on both ends, and applies the same adjustment to every SiteScore in r.sites.
+ * @return false if nothing had been trimmed (read left untouched), true otherwise
+ */
+ public boolean untrim(){
+ if(leftTrimmed==0 && rightTrimmed==0){return false;}
+ r.setPerfect(false);
+
+ //Trim amounts are swapped when the read is not on the plus strand.
+ final int lt, rt;
+ if(r.strand()==Gene.PLUS){
+ lt=leftTrimmed;
+ rt=rightTrimmed;
+ }else{
+ lt=rightTrimmed;
+ rt=leftTrimmed;
+ }
+
+ boolean returnToShort=false;
+ if(verbose){System.err.println("Untrimming");}
+ if(r.match!=null){
+ //Padding is done on the long-form match string; the short form is restored afterward.
+ if(r.shortmatch()){
+ r.match=Read.toLongMatchString(r.match);
+ r.setShortMatch(false);
+ returnToShort=true;
+ }
+ //New match string = lt pad symbols + old match + rt pad symbols.
+ //NOTE(review): 'C' is presumably the "clipped" match symbol - confirm against the match-string format.
+ byte[] match2=new byte[r.match.length+lt+rt];
+ int i=0;
+ for(; i<lt; i++){
+ match2[i]='C';
+ }
+ for(int j=0; j<r.match.length; i++, j++){
+ match2[i]=r.match[j];
+ }
+ for(; i<match2.length; i++){
+ match2[i]='C';
+ }
+ r.match=match2;
+ }
+ r.bases=bases1;
+ r.quality=qual1;
+ r.start-=lt;
+ r.stop+=rt;
+ if(returnToShort){
+ r.match=Read.toShortMatchString(r.match);
+ r.setShortMatch(true);
+ }
+
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){untrim(ss);}
+ }
+
+ return true;
+ }
+
+ /**
+ * Applies the untrim adjustment to a single alignment site: clears its perfect and
+ * semiperfect flags, pads its match string (if any) on both ends, and widens its
+ * limits by the trimmed amounts.
+ * @param ss Site to adjust; may be null
+ * @return false if ss is null or nothing had been trimmed, true otherwise
+ */
+ private boolean untrim(SiteScore ss){
+ if(ss==null){return false;}
+ if(leftTrimmed==0 && rightTrimmed==0){return false;}
+ ss.perfect=ss.semiperfect=false;
+
+ //Trim amounts are swapped when the site is not on the plus strand.
+ final int lt, rt;
+ if(ss.strand==Gene.PLUS){
+ lt=leftTrimmed;
+ rt=rightTrimmed;
+ }else{
+ lt=rightTrimmed;
+ rt=leftTrimmed;
+ }
+
+ boolean returnToShort=false;
+ if(verbose){System.err.println("Untrimming ss "+ss);}
+ if(ss.match!=null){
+
+ //No short-match flag is consulted here; the format is detected by looking for a digit in the match string.
+ boolean shortmatch=false;
+ for(byte b : ss.match){
+ if(Character.isDigit(b)){shortmatch=true; break;}
+ }
+
+ if(shortmatch){
+ ss.match=Read.toLongMatchString(ss.match);
+ returnToShort=true;
+ }
+ //New match string = lt pad symbols + old match + rt pad symbols.
+ byte[] match2=new byte[ss.match.length+lt+rt];
+ int i=0;
+ for(; i<lt; i++){
+ match2[i]='C';
+ }
+ for(int j=0; j<ss.match.length; i++, j++){
+ match2[i]=ss.match[j];
+ }
+ for(; i<match2.length; i++){
+ match2[i]='C';
+ }
+ ss.match=match2;
+ }
+ ss.setLimits(ss.start-lt, ss.stop+rt);
+ if(returnToShort){ss.match=Read.toShortMatchString(ss.match);}
+ return true;
+ }
+
+ /**
+ * Invalidates alignment match strings after trimming. Rather than editing the
+ * strings, this simply nulls r.match and the match of every non-null SiteScore.
+ * @param r Read whose match information should be discarded
+ * @return false if the read had neither a match string nor sites, true otherwise
+ */
+ private static boolean trimMatch(Read r){
+ if(r.match==null && r.sites==null){return false;}
+
+ //Easy mode!
+ r.match=null;
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ if(ss!=null){ss.match=null;}
+ }
+ }
+ return true;
+
+ //Unfinished sketch of a version that would edit the match string instead of dropping it.
+ //TODO - need to adjust read start and stop based on this information. Also check strand!
+// byte[] match=r.match;
+// if(r.shortmatch()){match=Read.toLongMatchString(match);}
+// byte[] match2=new byte[match.length-leftTrimmed-rightTrimmed];
+// for(int mpos=0, bpos=0; bpos<leftTrimmed; mpos++){
+// byte m=match[mpos];
+// if(m=='D'){
+// //do nothing
+// }else{
+// bpos++;
+// }
+// }
+ }
+
+
+ /** The read being trimmed; untrim() writes the original bases, qualities and coordinates back into it */
+ public final Read r;
+
+ /** untrimmed bases */
+ public final byte[] bases1;
+ /** untrimmed qualities */
+ public final byte[] qual1;
+ /** trimmed bases */
+ public byte[] bases2;
+ /** trimmed qualities */
+ public byte[] qual2;
+
+
+ /** Quality threshold used for trimming (presumably the value passed to the test* methods - confirm in the constructor) */
+ public final byte trimq;
+ /** Amount removed from the left end; untrim() uses it to widen coordinates and pad match strings */
+ public int leftTrimmed;
+ /** Amount removed from the right end; untrim() uses it to widen coordinates and pad match strings */
+ public int rightTrimmed;
+
+ /** Require this many consecutive good bases to stop trimming. Minimum is 1.
+ * This is for the old trimming mode and not really used anymore */
+ public static int minGoodInterval=2;
+
+ /** Print diagnostic messages to stderr */
+ public static boolean verbose=false;
+ //NOTE(review): the three mode settings below are consumed outside this part of the file; their exact semantics should be confirmed there.
+ public static boolean optimalMode=true;
+ public static boolean windowMode=false;
+ public static float optimalBias=-1f;
+
+ /** Window length setting; presumably the default window passed to testRightWindow - confirm at the call site */
+ public static int windowLength=4;
+
+ //NOTE(review): presumably the error probability assigned to N bases - confirm where it is read.
+ private static final float NPROB=0.75f;
+// public static float PROB1=QualityTools.PROB_ERROR[1];
+
+
+}
diff --git a/current/assemble/AbstractBuildThread.java b/current/assemble/AbstractBuildThread.java
new file mode 100755
index 0000000..272d012
--- /dev/null
+++ b/current/assemble/AbstractBuildThread.java
@@ -0,0 +1,43 @@
+package assemble;
+
+import java.util.ArrayList;
+
+import align2.LongList;
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+
+/**
+ * Base class for contig-building worker threads.
+ * Holds only per-thread state (scratch buffers, result list and counters);
+ * subclasses supply run(). Fields carrying a T suffix are per-thread tallies.
+ * @author Brian Bushnell
+ * @date Jul 18, 2015
+ *
+ */
+abstract class AbstractBuildThread extends Thread {
+
+ /**
+ * @param id_ Identifier of this thread
+ * @param mode_ Build mode; its meaning is defined by the code that creates the thread
+ * @param crisa_ Input read streams; may be null
+ */
+ public AbstractBuildThread(int id_, int mode_, ConcurrentReadInputStream[] crisa_){
+ id=id_;
+ crisa=crisa_;
+ mode=mode_;
+ }
+
+ /** Input read stream */
+ final ConcurrentReadInputStream[] crisa;
+
+ /** Build mode supplied at construction */
+ final int mode;
+ /** Not initialized here; left for subclasses to set */
+ int minCountSeedCurrent;
+
+ /** Scratch arrays with one slot per base (4 entries each) */
+ final int[] leftCounts=new int[4];
+ final int[] rightCounts=new int[4];
+ /** Scratch buffer reused by this thread */
+ final ByteBuilder builderT=new ByteBuilder();
+
+ final LongList insertSizes=new LongList();
+
+ /** Contigs produced by this thread */
+ ArrayList<Read> contigs=new ArrayList<Read>();
+
+ /** Per-thread counters */
+ long readsInT=0;
+ long basesInT=0;
+ long lowqReadsT=0;
+ long lowqBasesT=0;
+ /** Identifier of this thread */
+ final int id;
+
+}
diff --git a/current/assemble/AbstractExploreThread.java b/current/assemble/AbstractExploreThread.java
new file mode 100755
index 0000000..b965439
--- /dev/null
+++ b/current/assemble/AbstractExploreThread.java
@@ -0,0 +1,78 @@
+package assemble;
+
+import stream.ByteBuilder;
+import ukmer.Kmer;
+
+/**
+ * Searches for dead ends.
+ * This is a Runnable that owns its own Thread object rather than extending Thread;
+ * start(), getState() and join() delegate to that thread.
+ * @author Brian Bushnell
+ * @date Jul 20, 2015
+ *
+ */
+abstract class AbstractExploreThread extends ShaveObject implements Runnable {
+
+ /**
+ * Constructor
+ * @param id_ Identifier of this worker
+ * @param kbig_ Kmer length used to allocate this worker's scratch Kmer
+ */
+ public AbstractExploreThread(int id_, int kbig_){
+ id=id_;
+ myKmer=new Kmer(kbig_);
+ thread=new Thread(this);
+ }
+
+ /**
+ * Processes tables until none remain, then victims until none remain, then
+ * sums the branch-to-branch cells of removeMatrixT into bubblesFoundT.
+ */
+ @Override
+ public final void run(){
+ //TODO:
+
+ //With processNextVictims enabled, the number of dead ends found drops from the first pass to the next, then stabilizes.
+ //So, they are not being reset correctly.
+
+ //Also, the number found - even with one thread - is nondeterministic if both are enabled.
+ //Unstable whether or not processNextVictims is disabled. But that's probably to be expected as the count is not exact.
+ //What should be exact is the number of kmers removed for being dead ends.
+
+ //The number is lower than expected. 65k for 600k reads with errors. Most are bubbles, but 40% should be dead ends, or 240k.
+
+ while(processNextTable(myKmer)){}
+ while(processNextVictims(myKmer)){}
+
+ //NOTE(review): the inner bound uses removeMatrixT.length, which relies on the matrix being square (it is allocated 8x8 below).
+ //FORWARD_BRANCH and BACKWARD_BRANCH are not declared in this class; presumably inherited from ShaveObject.
+ for(int i=0; i<removeMatrixT.length; i++){
+ for(int j=0; j<removeMatrixT.length; j++){
+ if((i==FORWARD_BRANCH || i==BACKWARD_BRANCH) && (j==FORWARD_BRANCH || j==BACKWARD_BRANCH)){
+ bubblesFoundT+=removeMatrixT[i][j];
+ }
+ }
+ }
+ }
+
+ /** Convenience overload using this worker's own Kmer */
+ boolean processNextTable(){return processNextTable(myKmer);}
+ /** @return true if there may be more to process; run() stops calling once this returns false */
+ abstract boolean processNextTable(final Kmer kmer);
+
+ /** Convenience overload using this worker's own Kmer */
+ boolean processNextVictims(){return processNextVictims(myKmer);}
+ /** @return true if there may be more to process; run() stops calling once this returns false */
+ abstract boolean processNextVictims(final Kmer kmer);
+
+ /*--------------------------------------------------------------*/
+
+ //Thread-like facade delegating to the wrapped thread
+ public final void start(){thread.start();}
+ public final Thread.State getState(){return thread.getState();}
+ public final void join() throws InterruptedException{thread.join();}
+
+ /*--------------------------------------------------------------*/
+
+ /** Per-worker counters */
+ long kmersTestedT=0;
+ long deadEndsFoundT=0;
+ long bubblesFoundT=0;
+
+ /** Identifier of this worker */
+ final int id;
+ /** Scratch Kmer passed to the process methods */
+ final Kmer myKmer;
+
+ /** Scratch arrays with one slot per base (4 entries each) */
+ final int[] leftCounts=new int[4];
+ final int[] rightCounts=new int[4];
+ final ByteBuilder builderT=new ByteBuilder();
+
+ /** Per-worker 8x8 tallies; run() reads removeMatrixT to count bubbles */
+ long[][] countMatrixT=new long[8][8];
+ long[][] removeMatrixT=new long[8][8];
+
+ /** The thread that executes this Runnable */
+ public final Thread thread;
+
+}
diff --git a/current/assemble/AbstractRemoveThread.java b/current/assemble/AbstractRemoveThread.java
new file mode 100755
index 0000000..335d371
--- /dev/null
+++ b/current/assemble/AbstractRemoveThread.java
@@ -0,0 +1,187 @@
+package assemble;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import dna.Timer;
+
+import ukmer.HashArrayU1D;
+import ukmer.KmerNodeU;
+import ukmer.KmerTableSetU;
+
+import kmer.AbstractKmerTableSet;
+import kmer.HashArray1D;
+import kmer.KmerNode;
+import kmer.KmerTableSet;
+
+/**
+ * Removes kmers with counts outside a certain range.
+ * @author Brian Bushnell
+ * @date Jul 20, 2015
+ */
+public abstract class AbstractRemoveThread extends Thread{
+
+ /**
+ * Constructor
+ * @param id_ Identifier of this thread
+ * @param min_ Lowest count to keep; kmers with a smaller count are zeroed by the subclasses
+ * @param max_ Highest count to keep; kmers with a larger count are zeroed by the subclasses
+ * @param nextTable_ Shared counter handing out table indices; must still be 0 at construction
+ */
+ public AbstractRemoveThread(int id_, int min_, int max_, AtomicInteger nextTable_){
+ id=id_;
+ min=min_;
+ max=max_;
+ nextTable=nextTable_;
+ //All threads are expected to be constructed before any of them starts consuming tables.
+ assert(nextTable.get()==0);
+ }
+
+ /** Keeps claiming and processing tables until processNextTable() reports none remain. */
+ @Override
+ public final void run(){
+ while(processNextTable()){}
+ }
+
+ /** Process one table; @return false once no tables remain, which ends run() */
+ abstract boolean processNextTable();
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Removes kmers whose counts lie outside [min, max] from every table, using
+  * a pool of worker threads that share one table counter.
+  * @param threads Number of worker threads to launch
+  * @param min Lowest count to keep
+  * @param max Highest count to keep
+  * @param tables Table set to clean; a KmerTableSet gets RemoveThread1 workers, anything else is cast to KmerTableSetU
+  * @param print If true, report the number removed and the elapsed time to outstream
+  * @return Total number of kmers removed, summed over all workers
+  */
+ public static long process(final int threads, final int min, final int max, AbstractKmerTableSet tables, boolean print){
+     final Timer timer=new Timer();
+     final AtomicInteger tableCounter=new AtomicInteger(0);
+
+     /* Create Removethreads */
+     final ArrayList<AbstractRemoveThread> workers=new ArrayList<AbstractRemoveThread>(threads);
+     for(int tid=0; tid<threads; tid++){
+         if(tables.getClass()==KmerTableSet.class){
+             workers.add(new RemoveThread1(tid, min, max, tableCounter, (KmerTableSet)tables));
+         }else{
+             workers.add(new RemoveThread2(tid, min, max, tableCounter, (KmerTableSetU)tables));
+         }
+     }
+     for(AbstractRemoveThread worker : workers){
+         worker.start();
+     }
+
+     /* Wait for each worker to finish, then collect its tally */
+     long removedTotal=0;
+     for(AbstractRemoveThread worker : workers){
+         //An interrupt does not abandon the wait; join is retried until the worker has terminated.
+         while(worker.getState()!=Thread.State.TERMINATED){
+             try{
+                 worker.join();
+             }catch(InterruptedException e){
+                 e.printStackTrace();
+             }
+         }
+         removedTotal+=worker.kmersRemovedT;
+     }
+
+     timer.stop();
+     if(print){
+         outstream.println("Removed "+removedTotal+" kmers.");
+         outstream.println("Remove time: "+timer);
+     }
+     return removedTotal;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Worker for KmerTableSet tables. */
+ private static class RemoveThread1 extends AbstractRemoveThread{
+
+     /**
+      * @param tables_ Table set whose tables this worker will clean
+      */
+     public RemoveThread1(int id_, int min_, int max_, AtomicInteger nextTable_, KmerTableSet tables_){
+         super(id_, min_, max_, nextTable_);
+         tables=tables_;
+     }
+
+     /**
+      * Claims the next table index, zeroes every out-of-range count in both the
+      * main array and the victim trees, then regenerates the table.
+      * @return false when all tables have already been claimed
+      */
+     @Override
+     boolean processNextTable(){
+         final int tableIndex=nextTable.getAndIncrement();
+         if(tableIndex>=tables.ways){return false;}
+         final HashArray1D table=tables.getTable(tableIndex);
+
+         //Main array
+         final int[] counts=table.values();
+         final int cells=table.arrayLength();
+         for(int c=0; c<cells; c++){
+             final int count=counts[c];
+             if(count>max || count<min){counts[c]=0;}
+         }
+
+         //Overflow trees; traverseKmerNode ignores null roots
+         for(KmerNode root : table.victims().array()){
+             traverseKmerNode(root);
+         }
+
+         table.clearOwnership();
+         kmersRemovedT+=table.regenerate();
+         return true;
+     }
+
+     /** Pre-order walk that zeroes the count of every out-of-range node. */
+     private void traverseKmerNode(KmerNode kn){
+         if(kn==null){return;}
+         final int count=kn.count();
+         if(count>max || count<min){kn.set(0);}
+         traverseKmerNode(kn.left());
+         traverseKmerNode(kn.right());
+     }
+
+     private final KmerTableSet tables;
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Worker for KmerTableSetU tables. */
+ private static class RemoveThread2 extends AbstractRemoveThread{
+
+     /**
+      * @param tables_ Table set whose tables this worker will clean
+      */
+     public RemoveThread2(int id_, int min_, int max_, AtomicInteger nextTable_, KmerTableSetU tables_){
+         super(id_, min_, max_, nextTable_);
+         tables=tables_;
+     }
+
+     /**
+      * Claims the next table index, zeroes every out-of-range count in both the
+      * main array and the victim trees, then regenerates the table.
+      * @return false when all tables have already been claimed
+      */
+     @Override
+     boolean processNextTable(){
+         final int tableIndex=nextTable.getAndIncrement();
+         if(tableIndex>=tables.ways){return false;}
+         final HashArrayU1D table=tables.getTable(tableIndex);
+
+         //Main array
+         final int[] counts=table.values();
+         final int cells=table.arrayLength();
+         for(int c=0; c<cells; c++){
+             final int count=counts[c];
+             if(count>max || count<min){counts[c]=0;}
+         }
+
+         //Overflow trees; traverseKmerNode ignores null roots
+         for(KmerNodeU root : table.victims().array()){
+             traverseKmerNode(root);
+         }
+
+         table.clearOwnership();
+         kmersRemovedT+=table.regenerate();
+         return true;
+     }
+
+     /** Pre-order walk that zeroes the count of every out-of-range node. */
+     private void traverseKmerNode(KmerNodeU kn){
+         if(kn==null){return;}
+         final int count=kn.count();
+         if(count>max || count<min){kn.set(0);}
+         traverseKmerNode(kn.left());
+         traverseKmerNode(kn.right());
+     }
+
+     private final KmerTableSetU tables;
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of kmers removed by this thread; summed by process() after the thread terminates */
+ long kmersRemovedT=0;
+
+ /** Identifier of this thread */
+ final int id;
+ /** Lowest count that is kept */
+ final int min;
+ /** Highest count that is kept */
+ final int max;
+
+ /** Counter shared by all threads of one process() call; hands out table indices */
+ final AtomicInteger nextTable;
+
+ /** Print messages to this stream */
+ static PrintStream outstream=System.err;
+
+}
\ No newline at end of file
diff --git a/current/assemble/AbstractShaveThread.java b/current/assemble/AbstractShaveThread.java
new file mode 100755
index 0000000..13da055
--- /dev/null
+++ b/current/assemble/AbstractShaveThread.java
@@ -0,0 +1,33 @@
+package assemble;
+
+/**
+ * Removes dead-end kmers.
+ * Base class only: run() repeatedly calls processNextTable(), which subclasses implement.
+ * @author Brian Bushnell
+ * @date Jul 20, 2015
+ *
+ */
+abstract class AbstractShaveThread extends Thread{
+
+ /**
+ * Constructor
+ * @param id_ Identifier of this thread
+ */
+ public AbstractShaveThread(int id_){
+ id=id_;
+ }
+
+ /** Keeps processing tables until processNextTable() reports none remain. */
+ @Override
+ public final void run(){
+ while(processNextTable()){}
+ }
+
+ /** Process one table; @return false once no tables remain, which ends run() */
+ abstract boolean processNextTable();
+
+ /*--------------------------------------------------------------*/
+
+ /** Per-thread tally; not modified in this class, so presumably updated by subclasses */
+ long kmersRemovedT=0;
+
+ /** Identifier of this thread */
+ final int id;
+
+}
\ No newline at end of file
diff --git a/current/assemble/KmerCompressor.java b/current/assemble/KmerCompressor.java
new file mode 100755
index 0000000..9de2e1b
--- /dev/null
+++ b/current/assemble/KmerCompressor.java
@@ -0,0 +1,877 @@
+package assemble;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import kmer.AbstractKmerTable;
+import kmer.AbstractKmerTableSet;
+import kmer.HashArray1D;
+import kmer.HashForest;
+import kmer.KmerNode;
+import kmer.KmerTableSet;
+
+import ukmer.Kmer;
+import ukmer.KmerTableSetU;
+
+import stream.ByteBuilder;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.IntList;
+import align2.LongList;
+import align2.ReadLengthComparator;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+
+/**
+ * Assembles kmers into a concise representation.
+ * @author Brian Bushnell
+ * @date May 15, 2015
+ *
+ */
+public class KmerCompressor {
+
+ /**
+  * Code entrance from the command line.
+  * Expands config-file arguments, prints usage and exits if help was requested,
+  * then constructs a KmerCompressor (timed separately) and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final String[] parsed=Parser.parseConfig(args);
+     if(Parser.parseHelp(parsed, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     //One timer for the whole run, one for initialization only
+     final Timer total=new Timer();
+     final Timer init=new Timer();
+     total.start();
+     init.start();
+
+     final KmerCompressor compressor=new KmerCompressor(parsed, true);
+     init.stop();
+     outstream.println("Initialization Time: \t"+init);
+
+     //And run it
+     compressor.process(total);
+ }
+
+ /**
+  * Scans the arguments for a "k=&lt;int&gt;" setting ahead of full parsing.
+  * Leading dashes on the key are stripped the same way the constructor does it.
+  * Arguments with an empty key (for example "", "-", "--" or "=") are ignored
+  * here instead of raising an index-out-of-bounds exception.
+  * @param args Command line arguments
+  * @return Kmer.getMult(k)*Kmer.getK(k) for the requested k (default 31)
+  * @throws NumberFormatException if k is present without a parseable integer value
+  */
+ public static final int preparseK(String[] args){
+     int k=31;
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         String[] split=arg.split("=");
+         //String.split drops trailing empty strings, so "=" yields an empty array.
+         if(split.length==0){continue;}
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if("null".equalsIgnoreCase(b)){b=null;}
+         //The length check keeps charAt(0) safe once all dashes have been stripped.
+         while(a.length()>0 && a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+         if(a.equals("k")){
+             k=Integer.parseInt(b);
+         }
+     }
+     return Kmer.getMult(k)*Kmer.getK(k);
+ }
+
+ /**
+ * Display usage information.
+ * The usage text is currently only a placeholder.
+ */
+ protected static final void printOptions(){
+ outstream.println("Syntax:\nTODO");
+ }
+
+ /**
+ * Constructor.
+ * Parses the command line, optionally installs global I/O defaults, verifies the
+ * output file can be written, and creates the kmer table set.
+ * @param args Command line arguments
+ * @param setDefaults If true, overwrite several static settings (compression, fasta splitting, ByteFile mode, default minprob) before parsing
+ * @throws RuntimeException on an unrecognized argument or an unwritable output file
+ */
+ public KmerCompressor(String[] args, boolean setDefaults){
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ //k must be known before the main parse loop runs
+ k=preparseK(args);
+
+ if(setDefaults){
+ /* Set global defaults */
+ ReadWrite.ZIPLEVEL=8;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=true;
+ FastaReadInputStream.SPLIT_READS=false;
+ ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+ AbstractKmerTableSet.defaultMinprob=0.5;
+ }
+
+ /* Initialize local variables with defaults */
+ Parser parser=new Parser();
+ //NOTE(review): in1/in2 are filled by the parse loop but not read again in this constructor;
+ //presumably the table set re-reads the input files from args - confirm.
+ ArrayList<String> in1=new ArrayList<String>();
+ ArrayList<String> in2=new ArrayList<String>();
+ int fuse_=0;
+
+ {
+ //The assignment inside assert only executes when assertions are enabled, so EA records that state.
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ //Each argument is key=value; leading dashes on the key are stripped.
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in1.add(ss);
+ }
+ }
+ }else if(a.equals("in2")){
+ in2.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in2.add(ss);
+ }
+ }
+ }else if(a.equals("out") || a.equals("contigs")){
+ outContigs=b;
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("fuse")){
+ //A missing or alphabetic value is read as a boolean (true means 100000); otherwise it is the fuse length itself.
+ if(b==null || Character.isLetter(b.charAt(0))){
+ fuse_=Tools.parseBoolean(b) ? 100000 : 0;
+ }else{
+ fuse_=Integer.parseInt(b);
+ }
+ }else if(a.equals("showstats") || a.equals("stats")){
+ showStats=Tools.parseBoolean(b);
+ }else if(a.equals("mincount") || a.equals("mincov") || a.equals("mindepth") || a.equals("min")){
+ minCount=(int)Tools.parseKMG(b);
+ }else if(a.equals("maxcount") || a.equals("maxcov") || a.equals("maxdepth") || a.equals("max")){
+ maxCount=(int)Tools.parseKMG(b);
+ }else if(a.equals("requiresamecount") || a.equals("rsc") || a.equals("rsd")){
+ REQUIRE_SAME_COUNT=Tools.parseBoolean(b);
+ }else if(a.equals("threads") || a.equals("t")){
+ Shared.setThreads(b);
+ }else if(a.equals("buildthreads") || a.equals("bthreads") || a.equals("bt")){
+ //NOTE(review): b is dereferenced without a null check, so "bt" given without a value throws NullPointerException.
+ if(b.equalsIgnoreCase("auto")){
+ BUILD_THREADS=Shared.threads();
+ }else{
+ BUILD_THREADS=Integer.parseInt(b);
+ }
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("verbose2")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose2=Tools.parseBoolean(b);
+ }else if(a.equals("ilb") || a.equals("ignoreleftbranches") || a.equals("ignoreleftjunctions") || a.equals("ibb") || a.equals("ignorebackbranches")){
+ extendThroughLeftJunctions=Tools.parseBoolean(b);
+ }
+
+ else if(KmerTableSetU.isValidArgument(a)){
+ //Do nothing
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ fuse=fuse_;
+ //Read after parsing so that a threads= argument takes effect
+ LOAD_THREADS=Shared.threads();
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Single-element arrays of shared counters; BuildThread uses element 0 of each to claim tables and victim forests.
+ nextTable=new AtomicInteger[1];
+ nextVictims=new AtomicInteger[1];
+ for(int i=0; i<1; i++){
+ nextTable[i]=new AtomicInteger(0);
+ nextVictims[i]=new AtomicInteger(0);
+ }
+
+ if(!Tools.testOutputFiles(overwrite, append, false, outContigs)){
+ throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
+ }
+ assert(LOAD_THREADS>0);
+ outstream.println("Using "+LOAD_THREADS+" threads.");
+
+
+ //Per-kmer byte figure handed to the table set: 12+k, plus 4
+ final int bytesPerKmer;
+ {
+ int mult=12+k; //worst case for no assembly;
+ if(true){mult+=4;}
+ bytesPerKmer=mult;
+ }
+
+ tables=new KmerTableSet(args, bytesPerKmer);
+ k2=tables.k2;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Runs the whole job: loads kmers and builds contigs via process2(), stops the
+  * supplied timer, optionally reports the total time and assembly statistics,
+  * and fails if any stage flagged an error.
+  * @param t Timer covering the entire run; stopped by this method
+  * @throws RuntimeException if errorState was set during processing
+  */
+ public final void process(Timer t){
+
+     /* Count kmers */
+     process2();
+
+     /* Stop timer and calculate speed statistics */
+     t.stop();
+
+     //The padded read/base count strings that used to be built here were never printed, so they were removed.
+     if(showSpeed){
+         outstream.println("\nTotal Time: \t"+t);
+     }
+
+     //Assembly statistics are only produced when the contigs were written to a fasta file.
+     if(showStats && outContigs!=null && FileFormat.isFasta(ReadWrite.rawExtension(outContigs))){
+         outstream.println();
+         jgi.AssemblyStats2.main(new String[] {"in="+outContigs});
+     }
+
+     /* Throw an exception if errors were detected */
+     if(errorState){
+         throw new RuntimeException(getClass().getSimpleName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+ * Fills the kmer tables by delegating to tables.process.
+ * @param t Timer passed through to the table set
+ * @return The table set's kmersLoaded value after loading
+ */
+ public long loadKmers(Timer t){
+ tables.process(t);
+ return tables.kmersLoaded;
+ }
+
+ /**
+  * Core workflow: loads kmers into the tables, then builds contigs and prints
+  * a summary of what was generated. One timer is reused for both phases.
+  */
+ public final void process2(){
+
+     /* Phase 1: fill tables with kmers */
+     final Timer phase=new Timer();
+     outstream.println("\nLoading kmers.\n");
+     loadKmers(phase);
+     phase.stop();
+//     outstream.println("Input: \t"+tables.readsIn+" reads \t\t"+tables.basesIn+" bases.");
+//     outstream.println("Unique Kmers: \t"+tables.kmersLoaded);
+//     outstream.println("Load Time: \t"+t);
+
+     /* Phase 2: build contigs */
+     phase.start();
+     outstream.println("\nBuilding contigs.\n");
+     buildContigs();
+
+     if(DISPLAY_PROGRESS){
+         outstream.println("\nAfter building contigs:");
+         Shared.printMemory();
+         outstream.println();
+     }
+     phase.stop();
+
+     /* Summary */
+     if(readsIn>0){
+         outstream.println("Input: \t"+readsIn+" reads \t\t"+basesIn+" bases.");
+     }
+     outstream.println("Bases generated: \t"+basesBuilt);
+     outstream.println("Contigs generated: \t"+contigsBuilt);
+     outstream.println("Longest contig: \t"+longestContig);
+     outstream.println("Contig-building time: \t"+phase);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Build contigs.
+ * Launches BUILD_THREADS worker threads, waits for each to terminate, merges
+ * their contigs and counters into this object, and, if an output file was
+ * given, sorts, optionally fuses, and writes the contigs.
+ */
+ private final void buildContigs(){
+
+ allContigs=new ArrayList<Read>();
+
+ tables.initializeOwnership();
+
+ /* Create ProcessThreads */
+ ArrayList<AbstractBuildThread> alpt=new ArrayList<AbstractBuildThread>(BUILD_THREADS);
+ for(int i=0; i<BUILD_THREADS; i++){alpt.add(makeBuildThread(i));}
+ for(AbstractBuildThread pt : alpt){pt.start();}
+
+ /* Wait for threads to die, and gather statistics */
+ for(AbstractBuildThread pt : alpt){
+ //An interrupt does not abandon the wait; join is retried until the thread has terminated.
+ while(pt.getState()!=Thread.State.TERMINATED){
+ try {
+ pt.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ for(Read contig : pt.contigs){
+ allContigs.add(contig);
+ contigsBuilt++;
+ basesBuilt+=contig.length();
+ longestContig=Tools.max(longestContig, contig.length());
+ }
+
+ readsIn+=pt.readsInT;
+ basesIn+=pt.basesInT;
+ lowqReads+=pt.lowqReadsT;
+ lowqBases+=pt.lowqBasesT;
+ }
+
+ //Sorting and fusing happen only when writing; without an output file allContigs keeps the gathered order.
+ if(outContigs!=null){
+ FileFormat ff=FileFormat.testOutput(outContigs, FileFormat.FA, 0, 0, true, overwrite, append, false);
+// ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff, null, null, null, 4, null, false);
+// ros.start();
+ ByteStreamWriter bsw=new ByteStreamWriter(ff);
+ bsw.start();
+ if(allContigs!=null){
+// Collections.sort(allContigs, ReadComparatorID.comparator);
+ Collections.sort(allContigs, ReadLengthComparator.comparator);
+ //fuse() is a no-op when fuse<2; otherwise it replaces the list contents with fused reads
+ fuse(allContigs, fuse);
+ for(int i=0; i<allContigs.size(); i++){
+ Read r=allContigs.get(i);
+ bsw.println(r);
+ }
+ }
+ //Closes the writer and records whether it reported an error
+ errorState|=bsw.poisonAndWait();
+ }
+ }
+
+ private static void fuse(ArrayList<Read> contigs, int fuse){
+ if(fuse<2){return;}
+ ArrayList<Read> temp=new ArrayList<Read>();
+ ByteBuilder bb=new ByteBuilder();
+ int num=0;
+ for(int i=0; i<contigs.size(); i++){
+ Read r=contigs.set(i, null);
+ if(bb.length()>0){bb.append('N');}
+ bb.append(r.bases);
+ if(bb.length()>=fuse){
+ Read fused=new Read(bb.toBytes(), -1, -1, -1, ""+num, null, num, 0);
+ num++;
+ temp.add(fused);
+ bb.clear();
+ }
+ }
+ if(bb.length()>0){
+ Read fused=new Read(bb.toBytes(), -1, -1, -1, ""+num, null, num, 0);
+ temp.add(fused);
+ bb.clear();
+ }
+ contigs.clear();
+ contigs.addAll(temp);
+ temp=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- BuildThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Factory for contig-building workers; @param id Identifier given to the new thread */
+ BuildThread makeBuildThread(int id){
+ return new BuildThread(id);
+ }
+
+ /**
+ * Builds contigs.
+ */
+ private class BuildThread extends AbstractBuildThread{
+
+ /** Creates a worker in Tadpole.contigMode with no input read streams (contigs are built from the kmer tables). */
+ public BuildThread(int id_){
+ super(id_, Tadpole.contigMode, null);
+ }
+
+ /** Claims and processes tables until none remain, then does the same for the victim forests. */
+ @Override
+ public void run(){
+ //Build from kmers
+
+ //Final pass
+ while(processNextTable(nextTable[0])){}
+ while(processNextVictims(nextVictims[0])){}
+ }
+
+ /**
+  * Claims the next unprocessed table and tries to seed a contig from each of its cells.
+  * @param aint Shared counter that hands out table indices
+  * @return false when all tables have already been claimed, true after processing one
+  */
+ private boolean processNextTable(AtomicInteger aint){
+     final int tnum=aint.getAndAdd(1);
+     if(tnum>=tables.ways){return false;}
+     final HashArray1D table=tables.getTable(tnum);
+     if(verbose && id==0){System.err.println("Processing table "+tnum+", size "+table.size());}
+     final int max=table.arrayLength();
+     for(int cell=0; cell<max; cell++){
+         //The returned contig length is not needed here; contigs are stored by processKmer.
+         processCell(table, cell);
+     }
+     return true;
+ }
+
+ /**
+  * Claims the next unprocessed table and tries to seed a contig from every node
+  * in its victim forest.
+  * @param aint Shared counter that hands out table indices
+  * @return false when all tables have already been claimed, true after processing one
+  */
+ private boolean processNextVictims(AtomicInteger aint){
+     final int tnum=aint.getAndAdd(1);
+     if(tnum>=tables.ways){return false;}
+     final HashArray1D table=tables.getTable(tnum);
+     final HashForest forest=table.victims();
+     if(verbose && id==0){System.err.println("Processing forest "+tnum+", size "+forest.size());}
+     final int max=forest.arrayLength();
+     for(int cell=0; cell<max; cell++){
+         KmerNode kn=forest.getNode(cell);
+         //The summed contig length is not needed here; contigs are stored by processKmer.
+         traverseKmerNode(kn);
+     }
+     return true;
+ }
+
+ /**
+ * Tries to start a contig from the kmer stored in one table cell.
+ * The cell is skipped if its count is outside [minCount, maxCount], if it
+ * already has an owner, or if this thread fails to become the owner.
+ * @return Length of the contig built from this kmer, or 0 if none was built
+ */
+ private int processCell(HashArray1D table, int cell){
+ int count=table.readCellValue(cell);
+ if(count<minCount || count>maxCount){return 0;}
+
+ long key=table.getKmer(cell);
+
+ if(verbose){outstream.println("id="+id+" processing cell "+cell+"; \tkmer="+key+"\t"+toText(key));}
+ int owner=table.getCellOwner(cell);
+ if(verbose){outstream.println("Owner is initially "+owner);}
+ //An owner above -1 means the kmer has already been claimed.
+ if(owner>-1){return 0;}
+ //Attempt the claim, then proceed only if this thread ended up as the owner.
+ owner=table.setOwner(key, id, cell);
+ if(verbose){outstream.println("Owner is now "+owner);}
+ if(owner!=id){return 0;}
+ return processKmer(key);
+ }
+
+ /**
+  * Pre-order walk over a victim tree, attempting to seed a contig from every node.
+  * @param kn Root of the subtree; may be null
+  * @return Total length of all contigs built from this subtree
+  */
+ private int traverseKmerNode(KmerNode kn){
+     if(kn==null){return 0;}
+     int total=processKmerNode(kn);
+     total+=traverseKmerNode(kn.left());
+     total+=traverseKmerNode(kn.right());
+     return total;
+ }
+
+ /**
+ * Tries to start a contig from the kmer held by one victim-tree node.
+ * The node is skipped if its count is outside [minCount, maxCount], if it
+ * already has an owner, or if this thread fails to become the owner.
+ * @return Length of the contig built from this kmer, or 0 if none was built
+ */
+ private int processKmerNode(KmerNode kn){
+ final long key=kn.pivot();
+ final int count=kn.getValue(key);
+ if(count<minCount || count>maxCount){return 0;}
+
+ if(verbose){outstream.println("id="+id+" processing KmerNode; \tkmer="+key+"\t"+toText(key));}
+ int owner=kn.getOwner(key);
+ if(verbose){outstream.println("Owner is initially "+owner);}
+ //An owner above -1 means the kmer has already been claimed.
+ if(owner>-1){return 0;}
+ //Attempt the claim, then proceed only if this thread ended up as the owner.
+ owner=kn.setOwner(key, id);
+ if(verbose){outstream.println("Owner is now "+owner);}
+ if(owner!=id){return 0;}
+ return processKmer(key);
+ }
+
+ /**
+  * Builds a contig seeded by the given (already claimed) kmer and, on success,
+  * wraps it in a Read and appends it to this thread's contig list.
+  * When REQUIRE_SAME_COUNT is set, the read name also records the seed kmer's count.
+  * @param key Seed kmer
+  * @return Length of the new contig, or 0 if no contig was produced
+  */
+ private int processKmer(long key){
+     final byte[] contig=makeContig(key, builderT, true);
+     if(contig==null){
+         if(verbose){System.err.println("Created null contig.");}
+         return 0;
+     }
+
+     final long num=contigNum.incrementAndGet();
+     //Named "name" rather than "id" to avoid hiding the thread's numeric id.
+     final String name=(REQUIRE_SAME_COUNT ? "n"+num+",c="+tables.getCount(key) : Long.toString(num));
+
+     contigs.add(new Read(contig, -1, -1, -1, name, null, num, 0));
+     if(verbose){System.err.println("Added "+contig.length);}
+     return contig.length;
+ }
+
+ /**
+  * Builds a contig from a seed kmer by extending it to the right, reverse-complementing,
+  * extending to the right again, and reverse-complementing back.
+  * The seed is written into the supplied builder; the method previously seeded the
+  * field builderT regardless of the argument, which was only correct because the
+  * caller passes builderT.
+  * @param key Seed kmer
+  * @param bb Builder to assemble the contig in; its previous contents are discarded
+  * @param alreadyClaimed True if this thread already owns the seed kmer; otherwise a claim is attempted
+  * @return The contig bases, or null if the seed could not be claimed or extension reported a bad seed
+  * @throws RuntimeException if extendToRight returns an unrecognized status
+  */
+ private byte[] makeContig(final long key, final ByteBuilder bb, boolean alreadyClaimed){
+     bb.setLength(0);
+     bb.appendKmer(key, k);
+     if(verbose){outstream.println("Filled builder: "+bb);}
+
+     final int initialLength=bb.length();
+     assert(initialLength==k);
+     if(initialLength<k){return null;}
+//     System.err.print("A");
+
+     {
+         boolean success=(alreadyClaimed || claim(key, id));
+         if(verbose){System.err.println("Thread "+id+" checking owner after setting: "+findOwner(bb, id));}
+         if(!success){
+             assert(bb.length()==k);
+             // release(bb, id); //no need to release
+             return null;
+         }
+     }
+//     System.err.print("B");
+     if(verbose /*|| true*/){System.err.println("Thread "+id+" building contig; initial length "+bb.length());}
+     if(verbose){System.err.println("Extending to right.");}
+     {
+         final int status=extendToRight(bb, rightCounts, id);
+
+         if(status==DEAD_END){
+             //do nothing
+         }else if(status==TOO_LONG){
+             //do nothing
+         }else if(status==BAD_SEED){
+             //Give the seed back only if nothing was added beyond it.
+             if(bb.length()<=k){
+                 release(key, id);
+                 return null;
+             }
+         }else{
+             throw new RuntimeException("Bad return value: "+status);
+         }
+     }
+//     System.err.print("C");
+     //Extending the reverse complement to the right is equivalent to extending the original to the left.
+     bb.reverseComplementInPlace();
+     if(verbose /*|| true*/){System.err.println("Extending rcomp to right; current length "+bb.length());}
+     {
+         final int status=extendToRight(bb, rightCounts, id);
+
+         if(status==DEAD_END){
+             //do nothing
+         }else if(status==TOO_LONG){
+             //do nothing
+         }else if(status==BAD_SEED){
+             if(bb.length()<=k){
+                 release(key, id);
+                 return null;
+             }
+         }else{
+             throw new RuntimeException("Bad return value: "+status);
+         }
+     }
+//     System.err.print("D");
+
+     if(verbose /*|| true*/){System.err.println("A: Final length for thread "+id+": "+bb.length());}
+
+     //TODO: Success only if this thread actually owns some kmer in the contig. And trim unowned terminal kmers.
+
+     if(bb.length()>=k){
+         //Restore the original orientation before returning.
+         bb.reverseComplementInPlace();
+         return bb.toBytes();
+     }
+     if(verbose /*|| true*/){System.err.println("A: Contig was too short for "+id+": "+bb.length());}
+//     assert(false) : bb.length()+", "+initialLength+", "+minExtension+", "+minContigLen;
+//     System.err.print("F");
+     return null;
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Extension Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Extends the bases in bb to the right, one base at a time, claiming each new kmer for thread id.
+  * The rightmost kmer of bb is used as the starting point.  It must have a count within
+  * [minCount, maxCount] and must be unowned or already owned by id; otherwise BAD_SEED is returned.
+  * At every step the candidate bases are tried in numeric order (0-3) and the first one whose
+  * kmer count is in range and whose kmer can be claimed is appended.  No junction detection is
+  * performed in this method.
+  * @param bb Bases to extend; modified in place
+  * @param rightCounts Scratch array, indexed by base number, filled by fillRightCounts
+  * @param id Thread id used as the ownership tag
+  * @return BAD_SEED if the starting kmer is unusable, DEAD_END if no candidate could be claimed,
+  * or TOO_LONG once 90000 bases have been appended
+  */
+ public int extendToRight(final ByteBuilder bb, final int[] rightCounts, final int id){
+     if(bb.length()<k){return BAD_SEED;}
+     final int shift=2*k;
+     final int shift2=shift-2;
+     final long mask=~((-1L)<<shift);
+     long kmer=0;
+     long rkmer=0;
+     int len=0;
+
+     /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
+     {
+         final int bblen=bb.length();
+         final byte[] bases=bb.array;
+         for(int i=bblen-k; i<bblen; i++){
+             final byte b=bases[i];
+             final long x=AminoAcid.baseToNumber[b];
+             final long x2=AminoAcid.baseToComplementNumber[b];
+             kmer=((kmer<<2)|x)&mask;
+             rkmer=(rkmer>>>2)|(x2<<shift2);
+             if(x<0){
+                 //Undefined base: discard everything accumulated so far
+                 len=0;
+                 kmer=rkmer=0;
+             }else{len++;}
+             if(verbose){outstream.println("A: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+         }
+     }
+
+     if(len<k){
+         if(verbose){outstream.println("Returning BAD_SEED 1");}
+         return BAD_SEED;
+     }
+     else{assert(len==k);}
+
+     /* Now the trailing kmer has been initialized. */
+
+     long key=toValue(kmer, rkmer);
+     HashArray1D table=tables.getTableForKey(key);
+     int count=table.getValue(key);
+     if(count<minCount || count>maxCount){
+         if(verbose){
+             //The test rejects counts on either side of the allowed range
+             outstream.println("Returning because count was out of range: "+count);
+             outstream.println("Returning BAD_SEED 2");
+         }
+         return BAD_SEED;
+     }
+
+     int owner=table.getOwner(key);
+     if(verbose){outstream.println("Owner: "+owner);}
+     if(owner>-1 && owner!=id){
+         if(verbose){outstream.println("Returning BAD_SEED 3");}
+         return BAD_SEED;
+     }
+
+     owner=table.setOwner(key, id);
+     if(verbose){outstream.println("A. Owner is now "+id+" for key "+key);}
+     if(owner!=id){
+         if(verbose){
+             outstream.println("Returning early because owner was "+owner+" for thread "+id+".");
+             outstream.println("Returning BAD_SEED 4");
+         }
+         return BAD_SEED;
+     }
+
+     //Upper bound on the growth allowed in a single call
+     final int maxLen=bb.length()+90000;
+
+     while(bb.length()<maxLen){
+
+         fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+         int selected=-1;
+         for(int i=0; i<4; i++){
+             final int count2=rightCounts[i];
+             if(count2>=minCount && count2<=maxCount && (!REQUIRE_SAME_COUNT || count2==count)){
+                 final long y=i;
+                 final long y2=AminoAcid.numberToComplement[i];
+                 final long kmer2=((kmer<<2)|(long)y)&mask;
+                 final long rkmer2=(rkmer>>>2)|(y2<<shift2);
+                 final long key2=toValue(kmer2, rkmer2);
+                 HashArray1D table2=tables.getTableForKey(key2);
+                 //Only step onto kmers that nobody owns yet and that this thread manages to claim
+                 if(table2.getOwner(key2)<0){
+                     if(table2.setOwner(key2, id)==id){
+                         selected=i;
+                         kmer=kmer2;
+                         rkmer=rkmer2;
+                         key=key2;
+                         count=count2;
+                         final byte b=AminoAcid.numberToBase[selected];
+                         bb.append(b);
+                         break;
+                     }
+                 }
+             }
+         }
+
+         if(verbose){
+             outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer));
+             outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts));
+         }
+
+         if(selected<0){
+             if(verbose){outstream.println("Returning DEAD_END");}
+             return DEAD_END;
+         }//TODO: Explore on failure
+     }
+     //owner was verified to equal id above and is never reassigned afterwards.
+     //The previous assertion (owner!=id) could not hold and aborted every TOO_LONG exit under -ea.
+     assert(owner==id);
+     if(verbose){
+         outstream.println("Current contig length: "+bb.length()+"\nReturning TOO_LONG");
+     }
+     return TOO_LONG;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Loads kmer.k consecutive bases, starting at index loc, into the supplied Kmer.
+  * @param bases Source bases
+  * @param loc Index of the first base to read
+  * @param kmer Kmer to clear and refill
+  * @return The same Kmer, or null if a base with a negative code was encountered
+  */
+ protected final Kmer getKmer(byte[] bases, int loc, Kmer kmer){
+     kmer.clear();
+     final int stop=loc+kmer.k;
+     int pos=loc;
+     while(pos<stop){
+         final byte base=bases[pos];
+         final int code=AminoAcid.baseToNumber[base];
+         if(code<0){return null;}
+         kmer.addRightNumeric(code);
+         pos++;
+     }
+     assert(kmer.len==kmer.k);
+     return kmer;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /* Convenience wrappers that forward to AminoAcid, AbstractKmerTable or the kmer tables, supplying this object's k where needed. */
+
+ /** Returns AminoAcid.reverseComplementBinaryFast for this kmer, using length k. */
+ private final long rcomp(long kmer){return AminoAcid.reverseComplementBinaryFast(kmer, k);}
+ /** Converts a forward/reverse kmer pair into the key used for table lookups (see extendToRight). */
+ private final long toValue(long kmer, long rkmer){return tables.toValue(kmer, rkmer);}
+ /** Forwards to tables.getCount. */
+ public final int getCount(long kmer, long rkmer){return tables.getCount(kmer, rkmer);}
+ /** Computes the reverse complement, then forwards to the three-argument claim. */
+ private final boolean claim(long kmer, int id){return claim(kmer, rcomp(kmer), id);}
+ /** Forwards to tables.claim. */
+ private final boolean claim(long kmer, long rkmer, int id){return tables.claim(kmer, rkmer, id);}
+ /** Forwards to tables.findOwner. */
+ private final int findOwner(ByteBuilder bb, int id){return tables.findOwner(bb, id);}
+ /** Forwards to tables.release. */
+ private final void release(long key, int id){tables.release(key, id);}
+ /** Fills counts via tables.fillRightCounts and passes its return value through. */
+ private final int fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){return tables.fillRightCounts(kmer, rkmer, counts, mask, shift2);}
+ /** Renders a kmer of length k as text via AbstractKmerTable.toText. */
+ private final StringBuilder toText(long kmer){return AbstractKmerTable.toText(kmer, k);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Kmer tables used for counts and ownership lookups */
+ public final KmerTableSet tables;
+
+ /** Normal kmer length */
+ private final int k;
+ /** k-1; used in some expressions */
+ private final int k2;
+
+ /* Contig storage and statistics; NOTE(review): where these are updated is not visible in this section */
+ private ArrayList<Read> allContigs;
+ private long contigsBuilt=0;
+ private long basesBuilt=0;
+ private long longestContig=0;
+
+ /* NOTE(review): not referenced by the extension code in this section - confirm whether it is still used */
+ protected boolean extendThroughLeftJunctions=true;
+
+ /** extendToRight only follows kmers whose count is at least this */
+ private int minCount=1;
+ /** extendToRight only follows kmers whose count is at most this */
+ private int maxCount=Integer.MAX_VALUE;
+
+ /** Only extend to kmers with the same count as this kmer */
+ private boolean REQUIRE_SAME_COUNT=false;
+
+ public boolean showStats=true;
+
+ /** Has this class encountered errors while processing? */
+ public boolean errorState=false;
+
+ /** Contig output file */
+ private String outContigs=null;
+
+ /* Input/output counters */
+ long readsIn=0;
+ long basesIn=0;
+ long readsOut=0;
+ long basesOut=0;
+ long lowqReads=0;
+ long lowqBases=0;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ThreadLocal Temps ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Creates the calling thread's scratch objects on first use.
+  * localRightCounts doubles as the "already initialized" marker, so repeated calls are no-ops.
+  */
+ protected final void initializeThreadLocals(){
+     final boolean firstCall=(localRightCounts.get()==null);
+     if(firstCall){
+         localRightCounts.set(new int[4]);
+         localLongList.set(new LongList());
+         localIntList.set(new IntList());
+         localByteBuilder.set(new ByteBuilder());
+         localBitSet.set(new BitSet(300));
+         localKmer.set(new Kmer(k));
+     }
+ }
+
+ /* Per-thread scratch objects; each is populated by initializeThreadLocals() */
+ protected ThreadLocal<int[]> localRightCounts=new ThreadLocal<int[]>();
+ protected ThreadLocal<LongList> localLongList=new ThreadLocal<LongList>();
+ protected ThreadLocal<IntList> localIntList=new ThreadLocal<IntList>();
+ protected ThreadLocal<ByteBuilder> localByteBuilder=new ThreadLocal<ByteBuilder>();
+ protected ThreadLocal<BitSet> localBitSet=new ThreadLocal<BitSet>();
+ protected ThreadLocal<Kmer> localKmer=new ThreadLocal<Kmer>();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** True iff java was launched with the '-ea' flag */
+ private final boolean EA;
+
+ /** For numbering contigs */
+ final AtomicLong contigNum=new AtomicLong(0);
+
+ /** For controlling access to tables for contig-building */
+ final AtomicInteger nextTable[];
+
+ /** For controlling access to victim buffers for contig-building */
+ final AtomicInteger nextVictims[];
+
+ /* NOTE(review): the meaning of fuse is not visible in this section */
+ final int fuse;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print messages to this stream */
+ protected static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Print speed statistics upon completion */
+ public static boolean showSpeed=true;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Verbose messages */
+ public static boolean verbose=false;
+ /** Debugging verbose messages */
+ public static boolean verbose2=false;
+ /** Number of load threads */
+ public static int LOAD_THREADS=Shared.threads();
+ /** Number of build threads */
+ public static int BUILD_THREADS=1;
+
+ /** Explore codes */
+ public static final int KEEP_GOING=0, DEAD_END=1, TOO_SHORT=2, TOO_LONG=3, TOO_DEEP=4;
+
+ /** Extend codes; BAD_SEED is returned by extendToRight when the starting kmer cannot be used */
+ public static final int BAD_SEED=12;
+
+ /** Status codes; these share their names and values with the constants in ShaveObject */
+ public static final int STATUS_UNEXPLORED=0, STATUS_EXPLORED=1, STATUS_REMOVE=2, STATUS_KEEP=3;
+
+}
diff --git a/current/assemble/Postfilter.java b/current/assemble/Postfilter.java
new file mode 100755
index 0000000..4c53030
--- /dev/null
+++ b/current/assemble/Postfilter.java
@@ -0,0 +1,238 @@
+package assemble;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import jgi.FilterByCoverage;
+
+import stream.FastaReadInputStream;
+import align2.BBMap;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * Filters an assembly by read coverage.
+ * The reads are first mapped to the contigs with BBMap to produce a coverage-statistics file;
+ * FilterByCoverage is then run on the contigs using those statistics.
+ * @author Brian Bushnell
+ * @date Jul 8, 2015
+ *
+ */
+public class Postfilter {
+
+    /**
+     * Code entrance from the command line.
+     * @param args Command line arguments
+     */
+    public static void main(String[] args){
+
+        args=Parser.parseConfig(args);
+        if(Parser.parseHelp(args, true)){
+            printOptions();
+            System.exit(0);
+        }
+
+        Timer t=new Timer();
+
+        //Create a new Postfilter instance
+        Postfilter pf=new Postfilter(args, true);
+
+        //And run it
+        pf.process(t);
+    }
+
+    /**
+     * Display usage information.
+     */
+    private static void printOptions(){
+        outstream.println("Please consult the shellscript for usage information.");
+    }
+
+
+    /**
+     * Constructor.  Parses the arguments this class understands and stores every
+     * other argument in mapArgs, which process() later hands to BBMap.
+     * @param args Command line arguments
+     * @param setDefaults True to overwrite several global I/O settings with this tool's defaults
+     */
+    public Postfilter(String[] args, boolean setDefaults){
+        for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}}
+        System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+        if(setDefaults){
+            /* Set global defaults */
+            ReadWrite.ZIPLEVEL=8;
+            ReadWrite.USE_UNPIGZ=true;
+            ReadWrite.USE_PIGZ=true;
+            FastaReadInputStream.SPLIT_READS=false;
+            ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+        }
+
+        /* Parse arguments */
+        for(int i=0; i<args.length; i++){
+
+            final String arg=args[i];
+            String[] split=arg.split("=");
+            String a=split[0].toLowerCase();
+            String b=split.length>1 ? split[1] : null;
+            if("null".equalsIgnoreCase(b)){b=null;}
+            //Strip leading dashes.  The length test keeps "-", "" and "=x" from
+            //triggering a StringIndexOutOfBoundsException in charAt(0).
+            while(a.length()>0 && a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+            if(Parser.isJavaFlag(arg)){
+                //jvm argument; do nothing
+            }else if(a.equals("append") || a.equals("app")){
+                append=ReadStats.append=Tools.parseBoolean(b);
+            }else if(a.equals("overwrite") || a.equals("ow")){
+                overwrite=Tools.parseBoolean(b);
+            }else if(a.equals("in") || a.equals("in1")){
+                in1=b;
+            }else if(a.equals("in2")){
+                in2=b;
+            }else if(a.equals("ref") || a.equals("contigs") || a.equals("assembly")){
+                ref=b;
+            }else if(a.equals("out") || a.equals("out1")){
+                out=b;
+            }else if(a.equals("outdirty") || a.equals("outd") || a.equals("outbad")){
+                outdirty=b;
+            }else if(a.equals("showstats")){
+                showStats=Tools.parseBoolean(b);
+            }else if(a.equals("covstats") || a.equals("cov")){
+                covstats=b;
+            }else if(a.equals("maxindel")){
+                maxIndel=Integer.parseInt(b);
+            }else if(a.equals("minhits")){
+                minHits=Integer.parseInt(b);
+            }else if(a.equals("minc") || a.equals("mincov") || a.equals("mincoverage")){
+                minCoverage=Double.parseDouble(b);
+            }else if(a.equals("minp") || a.equals("minpercent")){
+                minCoveredPercent=Double.parseDouble(b);
+            }else if(a.equals("minr") || a.equals("minreads")){
+                minReads=Long.parseLong(b);
+            }else if(a.equals("minl") || a.equals("minlen") || a.equals("minlength")){
+                minLength=Integer.parseInt(b);
+            }else if(a.equals("rescue")){
+                rescue=Tools.parseBoolean(b);
+            }else if(a.equals("trim") || a.equals("trimends")){
+                //Accepts either a boolean (true means 100) or an explicit number
+                if(b==null || Character.isLetter(b.charAt(0))){
+                    trimEnds=Tools.parseBoolean(b) ? 100 : 0;
+                }else{
+                    trimEnds=Integer.parseInt(b);
+                }
+                trimEnds=Tools.max(trimEnds, 0);
+            }else{
+                //Unknown here; passed through to BBMap
+                mapArgs.add(arg);
+            }
+        }
+
+        //Expand a '#' placeholder into a pair of file names
+        if(in2==null && in1!=null && in1.contains("#") && !new File(in1).exists()){
+            in2=in1.replaceFirst("#", "2");
+            in1=in1.replaceFirst("#", "1");
+        }
+
+        if(!Tools.testOutputFiles(overwrite, append, false, covstats, out)){
+            throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+
+                    covstats+", "+out+"\n");
+        }
+        if(!Tools.testInputFiles(false, true, in1, in2, ref)){
+            throw new RuntimeException("\nCan't read some input files.\n");
+        }
+        if(!Tools.testForDuplicateFiles(true, in1, in2, covstats, out, ref)){
+            throw new RuntimeException("\nSome file names were specified multiple times.\n");
+        }
+
+        assert(in1!=null);
+        assert(out!=null);
+        assert(ref!=null);
+        assert(covstats!=null);
+    }
+
+
+    /*--------------------------------------------------------------*/
+    /*----------------         Outer Methods        ----------------*/
+    /*--------------------------------------------------------------*/
+
+
+    /**
+     * Runs the two stages: BBMap to generate coverage statistics, then FilterByCoverage.
+     * Afterwards, assembly statistics are printed for the filtered output when it is a regular file.
+     * @param t Timer supplied by the caller; not used by this method
+     */
+    public void process(Timer t){
+
+//        bbmap.sh in=../reads.fq.gz ref=contigs.fasta nodisk ambig=all maxindel=100 covstats=covstats.txt minhits=2;
+//        filterbycoverage.sh in=contigs.fasta out=filtered.fasta cov=covstats.txt mincov=2 minr=6 minp=95 minl=400
+
+        mapArgs.add("in="+in1);
+        if(in2!=null){mapArgs.add("in2="+in2);}
+        mapArgs.add("ref="+ref);
+        mapArgs.add("covstats="+covstats);
+        mapArgs.add("ambig=all");
+        mapArgs.add("minhits="+minHits);
+        mapArgs.add("maxindel="+maxIndel);
+        mapArgs.add("nodisk");
+        mapArgs.add("append="+append);
+        mapArgs.add("ow="+overwrite);
+        mapArgs.add("bw="+bw);
+        mapArgs.add("tipsearch="+tipsearch);
+        mapArgs.add("rescue="+rescue);
+        BBMap.main(mapArgs.toArray(new String[0]));
+        Data.unloadAll();
+
+        //Reuse the list for the filtering stage
+        mapArgs.clear();
+        mapArgs.add("in="+ref);
+        mapArgs.add("out="+out);
+        if(outdirty!=null){mapArgs.add("outdirty="+outdirty);}
+        mapArgs.add("covstats="+covstats);
+        mapArgs.add("mincov="+minCoverage);
+        mapArgs.add("minr="+minReads);
+        mapArgs.add("minp="+minCoveredPercent);
+        mapArgs.add("minl="+minLength);
+        mapArgs.add("trim="+trimEnds);
+        mapArgs.add("append="+append);
+        mapArgs.add("ow="+overwrite);
+        FilterByCoverage.main(mapArgs.toArray(new String[0]));
+
+        //The filtered output is read back in, which is only possible when it is a regular file.
+        //The earlier test lacked the negation, so stats were only attempted for stdio output.
+        if(showStats && out!=null && !FileFormat.isStdio(out)){
+            outstream.println();
+            jgi.AssemblyStats2.main(new String[] {"in="+out});
+        }
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Fields            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Arguments handed to BBMap, then rebuilt for FilterByCoverage */
+    private ArrayList<String> mapArgs=new ArrayList<String>();
+
+    /** Primary read input */
+    private String in1=null;
+    /** Secondary read input */
+    private String in2=null;
+    /** Assembly to be filtered */
+    private String ref=null;
+    /** Output for the filtered assembly */
+    private String out="filtered.fa";
+    /** Optional file passed to FilterByCoverage as outdirty */
+    private String outdirty=null;
+    /** Coverage statistics file written by BBMap and read by FilterByCoverage */
+    private String covstats="covstats.txt";
+
+    /* Values passed to BBMap */
+    private int maxIndel=0;
+    private int minHits=2;
+    private int bw=20;
+    private int tipsearch=0;
+    private boolean rescue=false;
+
+    /* Values passed to FilterByCoverage */
+    private int trimEnds=0;
+
+    private double minCoverage=2;
+    private double minCoveredPercent=95;
+    private long minReads=6;
+    private int minLength=400;
+
+    /** Print assembly statistics for the filtered output */
+    boolean showStats=true;
+
+    /** Permission to append to existing files */
+    boolean append=false;
+    /** Permission to overwrite existing files */
+    boolean overwrite=false;
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Static Fields         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Print messages to this stream */
+    private static PrintStream outstream=System.err;
+
+}
diff --git a/current/assemble/ShaveObject.java b/current/assemble/ShaveObject.java
new file mode 100755
index 0000000..a5ab35a
--- /dev/null
+++ b/current/assemble/ShaveObject.java
@@ -0,0 +1,31 @@
+package assemble;
+
+import java.io.PrintStream;
+
+/**
+ * Shared constants and message settings for the shaving classes.
+ * @author Brian Bushnell
+ * @date Jul 20, 2015
+ *
+ */
+public abstract class ShaveObject {
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Static Fields         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Print messages to this stream */
+    static PrintStream outstream=System.err;
+
+    /* Explore codes: the reason a path exploration stopped */
+    public static final int KEEP_GOING=0;
+    public static final int DEAD_END=1;
+    public static final int TOO_SHORT=2;
+    public static final int TOO_LONG=3;
+    public static final int TOO_DEEP=4;
+    public static final int FORWARD_BRANCH=5;
+    public static final int BACKWARD_BRANCH=6;
+    public static final int LOOP=7;
+
+    /* Status codes */
+    public static final int STATUS_UNEXPLORED=0;
+    public static final int STATUS_EXPLORED=1;
+    public static final int STATUS_REMOVE=2;
+    public static final int STATUS_KEEP=3;
+
+    /** Print the event and removal count matrices after shaving */
+    public static boolean printEventCounts=false;
+
+    /** Verbose messages */
+    public static boolean verbose=false;
+    /** Debugging verbose messages */
+    public static boolean verbose2=false;
+
+}
diff --git a/current/assemble/Shaver.java b/current/assemble/Shaver.java
new file mode 100755
index 0000000..d226b77
--- /dev/null
+++ b/current/assemble/Shaver.java
@@ -0,0 +1,225 @@
+package assemble;
+
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import ukmer.KmerTableSetU;
+
+import kmer.AbstractKmerTableSet;
+import kmer.KmerTableSet;
+import dna.Timer;
+
+/**
+ * Designed for removal of dead ends (aka hairs).
+ * This base class holds the parameters and runs the two multithreaded passes
+ * (explore, then shave); subclasses supply the thread implementations and the tables.
+ * @author Brian Bushnell
+ * @date Jun 26, 2015
+ *
+ */
+public abstract class Shaver extends ShaveObject {
+
+
+    /*--------------------------------------------------------------*/
+    /*----------------           Factory            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Creates a Shaver for the given tables using the default parameters. */
+    public static final Shaver makeShaver(AbstractKmerTableSet tables, int threads){
+        return makeShaver(tables, threads, 1, 1, 1, 100, 100, true, true);
+    }
+
+    /**
+     * Creates the Shaver implementation matching the concrete class of the tables.
+     * @throws RuntimeException if the table class is neither KmerTableSet nor KmerTableSetU
+     */
+    public static final Shaver makeShaver(AbstractKmerTableSet tables, int threads,
+            int minCount, int maxCount, int minSeed, int maxLengthToDiscard, int maxDistanceToExplore,
+            boolean removeHair, boolean removeBubbles){
+        final Class<?> c=tables.getClass();
+        if(c==KmerTableSet.class){
+            return new Shaver1((KmerTableSet)tables, threads, minCount, maxCount, minSeed, maxLengthToDiscard, maxDistanceToExplore, removeHair, removeBubbles);
+        }else if(c==KmerTableSetU.class){
+            return new Shaver2((KmerTableSetU)tables, threads, minCount, maxCount, minSeed, maxLengthToDiscard, maxDistanceToExplore, removeHair, removeBubbles);
+        }
+        throw new RuntimeException("Unhandled class "+c);
+    }
+
+
+    /*--------------------------------------------------------------*/
+    /*----------------          Constructor         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Stores the parameters and reads kbig from the tables. */
+    public Shaver(AbstractKmerTableSet tables_, int threads_,
+            int minCount_, int maxCount_, int minSeed_, int maxLengthToDiscard_, int maxDistanceToExplore_,
+            boolean removeHair_, boolean removeBubbles_){
+        threads=threads_;
+        minCount=minCount_;
+        maxCount=maxCount_;
+        minSeed=minSeed_;
+        maxLengthToDiscard=maxLengthToDiscard_;
+        maxDistanceToExplore=maxDistanceToExplore_;
+        removeHair=removeHair_;
+        removeBubbles=removeBubbles_;
+
+        kbig=tables_.kbig();
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------         Outer Methods        ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Creates a worker for the explore pass. */
+    abstract AbstractExploreThread makeExploreThread(int id_);
+    /** Creates a worker for the shave pass. */
+    abstract AbstractShaveThread makeShaveThread(int id_);
+
+    /**
+     * Replaces the stored parameters and then runs shave().
+     * @return Number of kmers removed by this call
+     */
+    public final long shave(int minCount_, int maxCount_, int minSeed_, int maxLengthToDiscard_, int maxDistanceToExplore_, boolean removeHair_, boolean removeBubbles_){
+        minCount=minCount_;
+        maxCount=maxCount_;
+        minSeed=minSeed_;
+        maxLengthToDiscard=maxLengthToDiscard_;
+        maxDistanceToExplore=maxDistanceToExplore_;
+        removeHair=removeHair_;
+        removeBubbles=removeBubbles_;
+        return shave();
+    }
+
+    /**
+     * Runs one explore pass followed by one shave pass, each with 'threads' workers,
+     * and adds the per-thread statistics to the cumulative public counters.
+     * @return Number of kmers removed by this call
+     */
+    public final long shave(){
+        //The messages now state the condition the assertions actually test
+        assert(minSeed>=minCount) : "Required: minCount <= minSeed <= maxCount. "+minCount+", "+minSeed+", "+maxCount;
+        assert(minSeed<=maxCount) : "Required: minCount <= minSeed <= maxCount. "+minCount+", "+minSeed+", "+maxCount;
+        assert(removeHair || removeBubbles);
+
+        Timer t=new Timer();
+
+        long kmersTestedTemp=0;
+        long deadEndsFoundTemp=0;
+        long kmersRemovedTemp=0;
+        long bubblesFoundTemp=0;
+
+        tables().initializeOwnership();
+
+
+        //Sized 8x8 because the explore codes in ShaveObject range from 0 to 7
+        countMatrix=new long[8][8];
+        removeMatrix=new long[8][8];
+
+        {
+            nextTable.set(0);
+            nextVictims.set(0);
+
+            /* Create Explorethreads */
+            ArrayList<AbstractExploreThread> alpt=new ArrayList<AbstractExploreThread>(threads);
+            for(int i=0; i<threads; i++){alpt.add(makeExploreThread(i));}
+            for(AbstractExploreThread pt : alpt){pt.start();}
+
+            /* Wait for threads to die, and gather statistics */
+            for(AbstractExploreThread pt : alpt){
+                //Retry the join until the thread has really finished, even if interrupted
+                while(pt.getState()!=Thread.State.TERMINATED){
+                    try {
+                        pt.join();
+                    } catch (InterruptedException e) {
+                        // TODO Auto-generated catch block
+                        e.printStackTrace();
+                    }
+                }
+
+                kmersTestedTemp+=pt.kmersTestedT;
+                deadEndsFoundTemp+=pt.deadEndsFoundT;
+                bubblesFoundTemp+=pt.bubblesFoundT;
+
+                for(int i=0; i<countMatrix.length; i++){
+                    for(int j=0; j<countMatrix[i].length; j++){
+                        countMatrix[i][j]+=pt.countMatrixT[i][j];
+                        removeMatrix[i][j]+=pt.removeMatrixT[i][j];
+                    }
+                }
+            }
+            kmersTested+=kmersTestedTemp;
+            deadEndsFound+=deadEndsFoundTemp;
+            bubblesFound+=bubblesFoundTemp;
+
+            t.stop();
+
+            outstream.println("Tested "+kmersTestedTemp+" kmers.");
+            outstream.println("Found "+deadEndsFoundTemp+" dead ends.");
+            outstream.println("Found "+bubblesFoundTemp+" bubbles.");
+
+            outstream.println("Search time: "+t);
+        }
+
+        {
+            t.start();
+
+            nextTable.set(0);
+            nextVictims.set(0);
+
+            /* Create Shavethreads */
+            ArrayList<AbstractShaveThread> alpt=new ArrayList<AbstractShaveThread>(threads);
+            for(int i=0; i<threads; i++){alpt.add(makeShaveThread(i));}
+            for(AbstractShaveThread pt : alpt){pt.start();}
+
+            for(AbstractShaveThread pt : alpt){
+                while(pt.getState()!=Thread.State.TERMINATED){
+                    try {
+                        pt.join();
+                    } catch (InterruptedException e) {
+                        // TODO Auto-generated catch block
+                        e.printStackTrace();
+                    }
+                }
+
+                kmersRemovedTemp+=pt.kmersRemovedT;
+            }
+
+            kmersRemoved+=kmersRemovedTemp;
+
+            outstream.println("Removed "+kmersRemovedTemp+" kmers.");
+            t.stop();
+            outstream.println("Shave time: "+t);
+        }
+
+        if(printEventCounts){
+            outstream.println("\nEvent counts:");
+            for(int i=0; i<countMatrix.length; i++){
+                for(int j=0; j<countMatrix[i].length; j++){
+                    outstream.print(countMatrix[i][j]+" ");
+                }
+                outstream.println();
+            }
+            outstream.println("\nRemoval counts:");
+            for(int i=0; i<removeMatrix.length; i++){
+                for(int j=0; j<removeMatrix[i].length; j++){
+                    outstream.print(removeMatrix[i][j]+" ");
+                }
+                outstream.println();
+            }
+        }
+
+        return kmersRemovedTemp;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Fields            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /* Cumulative totals over all calls to shave() */
+    public long kmersTested=0;
+    public long deadEndsFound=0;
+    public long bubblesFound=0;
+    public long kmersRemoved=0;
+
+    /*--------------------------------------------------------------*/
+    /*----------------         Final Fields         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** The kmer tables being shaved */
+    abstract AbstractKmerTableSet tables();
+    /** Value of kbig() reported by the tables at construction */
+    final int kbig;
+    /** Number of worker threads per pass */
+    final int threads;
+
+    /* Parameters; not final because shave(...) may replace them */
+    int minCount;
+    int maxCount;
+    int minSeed;
+    int maxLengthToDiscard;
+    int maxDistanceToExplore;
+    boolean removeHair;
+    boolean removeBubbles;
+
+    /** Sum of the per-thread countMatrixT arrays from the latest explore pass */
+    private long[][] countMatrix;
+    /** Sum of the per-thread removeMatrixT arrays from the latest explore pass */
+    private long[][] removeMatrix;
+
+    /** For controlling access to tables */
+    final AtomicInteger nextTable=new AtomicInteger(0);
+
+    /** For controlling access to victim buffers */
+    final AtomicInteger nextVictims=new AtomicInteger(0);
+
+}
diff --git a/current/assemble/Shaver1.java b/current/assemble/Shaver1.java
new file mode 100755
index 0000000..26ef39f
--- /dev/null
+++ b/current/assemble/Shaver1.java
@@ -0,0 +1,568 @@
+package assemble;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+import kmer.AbstractKmerTable;
+import kmer.AbstractKmerTableSet;
+import kmer.HashArray1D;
+import kmer.HashForest;
+import kmer.KmerNode;
+import kmer.KmerTableSet;
+import stream.ByteBuilder;
+import ukmer.Kmer;
+import align2.Tools;
+import dna.AminoAcid;
+
+/**
+ * Designed for removal of dead ends (aka hairs).
+ * @author Brian Bushnell
+ * @date Jun 26, 2015
+ *
+ */
+public class Shaver1 extends Shaver {
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Constructor ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /** Creates a Shaver1 with the default parameters (1, 1, 1, 100, 100, true, true). */
+ public Shaver1(KmerTableSet tables_, int threads_){
+ this(tables_, threads_, 1, 1, 1, 100, 100, true, true);
+ }
+
+ /**
+ * Passes all parameters to the Shaver constructor, then stores the tables and their kmer length.
+ */
+ public Shaver1(KmerTableSet tables_, int threads_,
+ int minCount_, int maxCount_, int minSeed_, int maxLengthToDiscard_, int maxDistanceToExplore_,
+ boolean removeHair_, boolean removeBubbles_){
+ super(tables_, threads_, minCount_, maxCount_, minSeed_, maxLengthToDiscard_, maxDistanceToExplore_, removeHair_, removeBubbles_);
+ tables=tables_;
+ k=tables.k;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Creates an ExploreThread with the given id. */
+ @Override
+ final AbstractExploreThread makeExploreThread(int id_){return new ExploreThread(id_);}
+ /** Creates a ShaveThread with the given id. */
+ @Override
+ final AbstractShaveThread makeShaveThread(int id_){return new ShaveThread(id_);}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Dead-End Removal ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Explores the path through a kmer in both directions and claims the resulting bases
+  * with either STATUS_EXPLORED or STATUS_REMOVE.
+  * Nothing is done if the kmer already has an owner status above STATUS_UNEXPLORED.
+  * A path is marked for removal only if neither direction ended in TOO_LONG, TOO_DEEP, LOOP
+  * or FORWARD_BRANCH, it grew by at most maxLengthToDiscard bases, and it is either a hair
+  * (one end DEAD_END, the other DEAD_END or BACKWARD_BRANCH, with removeHair set) or a bubble
+  * (both ends BACKWARD_BRANCH, with removeBubbles set).
+  * @param bb Scratch builder; cleared on entry and left holding the explored path
+  * @param countMatrixT Per-thread tally of (min,max) end-code pairs seen
+  * @param removeMatrixT Per-thread tally of (min,max) end-code pairs marked for removal
+  * @param prune Not read by this method
+  * @return true if the path was marked for removal
+  */
+ public boolean exploreAndMark(long kmer, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int minCount, int maxCount,
+         int maxLengthToDiscard, int maxDistanceToExplore, boolean prune
+         , long[][] countMatrixT, long[][] removeMatrixT
+         ){
+     bb.clear();
+     if(findOwner(kmer)>STATUS_UNEXPLORED){return false;}
+
+     //Walk forward, then flip the builder and walk forward again from its new right end
+     bb.appendKmer(kmer, k);
+     final int a=explore(kmer, bb, leftCounts, rightCounts, minCount, maxCount, maxDistanceToExplore);
+
+     bb.reverseComplementInPlace();
+     kmer=tables.rightmostKmer(bb);
+     final int b=explore(kmer, bb, leftCounts, rightCounts, minCount, maxCount, maxDistanceToExplore);
+
+     final int lowCode=Tools.min(a, b);
+     final int highCode=Tools.max(a, b);
+
+     countMatrixT[lowCode][highCode]++;
+
+     final boolean aBlocks=(a==TOO_LONG || a==TOO_DEEP || a==LOOP || a==FORWARD_BRANCH);
+     final boolean bBlocks=(b==TOO_LONG || b==TOO_DEEP || b==LOOP || b==FORWARD_BRANCH);
+     final boolean tooLong=(bb.length()-k>maxLengthToDiscard);
+
+     boolean discard=false;
+     if(!aBlocks && !bBlocks && !tooLong){
+         final boolean hair=(removeHair && lowCode==DEAD_END && (highCode==DEAD_END || highCode==BACKWARD_BRANCH));
+         final boolean bubble=(removeBubbles && a==BACKWARD_BRANCH && b==BACKWARD_BRANCH);
+         discard=(hair || bubble);
+     }
+
+     if(!discard){
+         claim(bb, STATUS_EXPLORED, false);
+         return false;
+     }
+
+     removeMatrixT[lowCode][highCode]++;
+     final boolean success=claim(bb, STATUS_REMOVE, false);
+     if(verbose || verbose2){System.err.println("Claiming ("+a+","+b+") length "+bb.length()+": "+bb);}
+     assert(success);
+     return true;
+ }
+
+ /** Explores a single unbranching path in the forward direction.
+ * Bases are appended to bb as the path is followed; if bb is empty it is first seeded with the kmer.
+ * Returns reason for ending in this direction:
+ * DEAD_END, TOO_LONG, TOO_DEEP, FORWARD_BRANCH, BACKWARD_BRANCH, LOOP
+ * @param kmer Starting kmer; must equal the rightmost kmer of bb when bb is not empty
+ * @param bb Path built so far; extended in place
+ * @param leftCounts Scratch array filled by fillLeftCounts
+ * @param rightCounts Scratch array filled by fillRightCounts
+ * @param minCount Counts below this are treated as absent
+ * @param maxCount A kmer with a count above this ends the path with TOO_DEEP
+ * @param maxLength0 TOO_LONG is returned once bb holds more than maxLength0+k bases */
+ public int explore(long kmer, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int minCount, int maxCount, int maxLength0){
+ if(verbose){outstream.println("Entering explore with bb.length()="+bb.length());}
+ assert(bb.length()==0 || tables.rightmostKmer(bb)==kmer);
+ if(bb.length()==0){bb.appendKmer(kmer, k);}
+
+ //initialLength is not read again in this method
+ final int initialLength=bb.length();
+ final int maxLength=maxLength0+k;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+
+ long rkmer=AminoAcid.reverseComplementBinaryFast(kmer, k);
+
+ long key=toValue(kmer, rkmer);
+ //Remembered so that returning to the starting kmer can be reported as LOOP
+ final long firstKey=key;
+ HashArray1D table=tables.getTableForKey(key);
+ int count=table.getValue(key);
+ assert(count>=minCount && count<=maxCount);
+
+ //The return value of fillRightCounts is used as the index of the highest right count
+ int nextRightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+ int nextRightMax=rightCounts[nextRightMaxPos];
+ if(nextRightMax<minCount){
+ if(verbose){outstream.println("Returning DEAD_END: rightMax="+nextRightMax);}
+ return DEAD_END;
+ }
+
+ while(bb.length()<=maxLength){
+
+ //At this point rightCounts still describes the extensions of the current kmer
+ final int rightMaxPos=nextRightMaxPos;
+ final int rightMax=rightCounts[rightMaxPos];
+ final int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ final int rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer, k)+", "+toText(rkmer, k));
+ outstream.println("Right counts: "+count+", "+Arrays.toString(rightCounts));
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ final int prevCount=count;
+
+ //Generate the new base
+ final byte b=AminoAcid.numberToBase[rightMaxPos];
+ final long x=rightMaxPos;
+ final long x2=AminoAcid.numberToComplement[rightMaxPos];
+ kmer=((kmer<<2)|(long)x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+
+ //Now consider the next kmer
+ key=toValue(kmer, rkmer);
+ if(key==firstKey){
+ if(verbose){outstream.println("Returning LOOP");}
+ return LOOP;
+ }
+ table=tables.getTableForKey(key);
+
+ assert(table.getValue(key)==rightMax);
+ count=rightMax;
+
+ {//Fill right and look for dead end
+ //This overwrites rightCounts; rightMax and rightSecond above keep the previous kmer's values
+ nextRightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+ nextRightMax=rightCounts[nextRightMaxPos];
+ if(nextRightMax<minCount){
+ //NOTE(review): the message prints rightMax, although the value tested is nextRightMax
+ if(verbose){outstream.println("Returning DEAD_END: rightMax="+rightMax);}
+ return DEAD_END;
+ }
+ }
+
+
+ {//Look left
+ final int leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts, mask, shift2);
+ final int leftMax=leftCounts[leftMaxPos];
+ final int leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ final int leftSecond=leftCounts[leftSecondPos];
+
+// assert(leftMax==1 || leftMax==0) : prevCount+" -> "+Arrays.toString(leftCounts)+", "+count+", "+Arrays.toString(rightCounts);
+
+ if(verbose){
+ outstream.println("Left counts: "+count+", "+Arrays.toString(leftCounts));
+ outstream.println("leftMaxPos="+leftMaxPos);
+ outstream.println("leftMax="+leftMax);
+ outstream.println("leftSecondPos="+leftSecondPos);
+ outstream.println("leftSecond="+leftSecond);
+ }
+
+ //A second left neighbor with sufficient count, or a left neighbor deeper than the kmer we came from
+ if(leftSecond>=minCount || leftMax>prevCount){//Backward branch
+// assert(leftSecond==1) : prevCount+" -> "+Arrays.toString(leftCounts)+", "+count+", "+Arrays.toString(rightCounts);
+ if(leftMax>prevCount){
+ if(verbose){outstream.println("Returning BACKWARD_BRANCH_LOWER: " +
+ "count="+count+", prevCount="+prevCount+", leftMax="+leftMax+", leftSecond="+leftSecond);}
+ return BACKWARD_BRANCH;
+ }else{
+ assert(leftMax==prevCount);
+ //The other left neighbor is ignored when the strongest one is at least twice as deep
+ if(leftMax>=2*leftSecond){//This constant is adjustable
+// assert(false) : prevCount+" -> "+Arrays.toString(leftCounts)+", "+count+", "+Arrays.toString(rightCounts);
+ //keep going
+ }else{
+ if(verbose){outstream.println("Returning BACKWARD_BRANCH_SIMILAR: " +
+ "count="+count+", prevCount="+prevCount+", leftMax="+leftMax+", leftSecond="+leftSecond);}
+ return BACKWARD_BRANCH;
+ }
+ }
+ }
+
+ }
+
+ //Look right
+ //rightSecond was measured at the kmer we just left, i.e. before advancing
+ if(rightSecond>=minCount){
+ if(verbose){outstream.println("Returning FORWARD_BRANCH: rightSecond="+rightSecond);}
+ return FORWARD_BRANCH;
+ }
+
+ if(count>maxCount){
+ if(verbose){outstream.println("Returning TOO_DEEP: rightMax="+rightMax);}
+ return TOO_DEEP;
+ }
+
+ //The base is appended only after all of the checks above have passed
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+ }
+
+ assert(bb.length()>maxLength);
+ if(verbose){outstream.println("Returning TOO_LONG: length="+bb.length());}
+ return TOO_LONG;
+ }
+
+ /**
+  * Explores a single unbranching path in the forward direction, appending each accepted base to bb.
+  * If bb is empty it is first seeded with the starting kmer.
+  * @param kmer Starting kmer; when bb is non-empty this must be its rightmost kmer (asserted).
+  * @param bb Buffer holding the path; extended in place.
+  * @param leftCounts Array populated by fillLeftCounts for each new kmer.
+  * @param rightCounts Array populated by fillRightCounts for each kmer.
+  * @param minCount A best neighbor count below this ends the path; a second-best count at or above it is a branch.
+  * @param maxCount A count above this ends the path with TOO_DEEP.
+  * @param maxLength0 Length budget; the loop runs while bb.length() is at most maxLength0+k.
+  * @return Reason for ending in this direction:
+  * DEAD_END, TOO_LONG, TOO_DEEP, LOOP, FORWARD_BRANCH, or BACKWARD_BRANCH
+  */
+ public int explore2(long kmer, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int minCount, int maxCount, int maxLength0){
+     if(verbose){outstream.println("Entering explore with bb.length()="+bb.length());}
+     assert(bb.length()==0 || tables.rightmostKmer(bb)==kmer);
+     if(bb.length()==0){bb.appendKmer(kmer, k);}
+
+     final int maxLength=maxLength0+k;
+     final int shift=2*k;
+     final int shift2=shift-2;
+     final long mask=~((-1L)<<shift);
+
+     long rkmer=AminoAcid.reverseComplementBinaryFast(kmer, k);
+
+     long key=toValue(kmer, rkmer);
+     final long firstKey=key; //Seeing this key again means the path has looped back to its start
+     HashArray1D table=tables.getTableForKey(key);
+     int count=table.getValue(key);
+     assert(count>=minCount && count<=maxCount);
+
+     int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+     int rightMax=rightCounts[rightMaxPos];
+     int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+     int rightSecond=rightCounts[rightSecondPos];
+
+     if(verbose){
+         outstream.println("kmer: "+toText(kmer, k)+", "+toText(rkmer, k));
+         outstream.println("Right counts: "+count+", "+Arrays.toString(rightCounts));
+         outstream.println("rightMaxPos="+rightMaxPos);
+         outstream.println("rightMax="+rightMax);
+         outstream.println("rightSecondPos="+rightSecondPos);
+         outstream.println("rightSecond="+rightSecond);
+     }
+
+     //Classify the starting kmer before attempting any extension
+     if(rightMax<minCount){
+         if(verbose){outstream.println("Returning DEAD_END: rightMax="+rightMax);}
+         return DEAD_END;
+     }else if(rightSecond>=minCount){
+         if(verbose){outstream.println("Returning FORWARD_BRANCH: rightSecond="+rightSecond);}
+         return FORWARD_BRANCH;
+     }else if(rightMax>maxCount){
+         if(verbose){outstream.println("Returning TOO_DEEP: rightMax="+rightMax);}
+         return TOO_DEEP;
+     }
+
+     while(bb.length()<=maxLength){
+         final int prevCount=count;
+
+         //Generate the new base
+         final byte b=AminoAcid.numberToBase[rightMaxPos];
+         final long x=rightMaxPos;
+         final long x2=AminoAcid.numberToComplement[rightMaxPos];
+         kmer=((kmer<<2)|x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+
+         //Now consider the next kmer
+         key=toValue(kmer, rkmer);
+         if(key==firstKey){
+             if(verbose){outstream.println("Returning LOOP");}
+             return LOOP;
+         }
+         table=tables.getTableForKey(key);
+
+         assert(table.getValue(key)==rightMax);
+         count=rightMax;
+
+         {//Look right
+             rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+             rightMax=rightCounts[rightMaxPos];
+             rightSecondPos=Tools.secondHighestPosition(rightCounts);
+             rightSecond=rightCounts[rightSecondPos];
+
+             if(verbose){
+                 outstream.println("kmer: "+toText(kmer, k)+", "+toText(rkmer, k));
+                 outstream.println("Right counts: "+count+", "+Arrays.toString(rightCounts));
+                 outstream.println("rightMaxPos="+rightMaxPos);
+                 outstream.println("rightMax="+rightMax);
+                 outstream.println("rightSecondPos="+rightSecondPos);
+                 outstream.println("rightSecond="+rightSecond);
+             }
+
+             if(rightMax<minCount){
+                 if(verbose){outstream.println("Returning DEAD_END: rightMax="+rightMax);}
+                 return DEAD_END;
+             }else if(rightSecond>=minCount){
+                 if(verbose){outstream.println("Returning FORWARD_BRANCH: rightSecond="+rightSecond);}
+                 return FORWARD_BRANCH;
+             }
+         }
+
+         {//Look left
+             int leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts, mask, shift2);
+             int leftMax=leftCounts[leftMaxPos];
+             int leftSecondPos=Tools.secondHighestPosition(leftCounts);
+             int leftSecond=leftCounts[leftSecondPos];
+
+             if(verbose){
+                 outstream.println("Left counts: "+count+", "+Arrays.toString(leftCounts));
+                 outstream.println("leftMaxPos="+leftMaxPos);
+                 outstream.println("leftMax="+leftMax);
+                 outstream.println("leftSecondPos="+leftSecondPos);
+                 outstream.println("leftSecond="+leftSecond);
+             }
+
+             if(leftSecond>=minCount){//Backward branch
+                 if(leftMax>prevCount){
+                     if(verbose){outstream.println("Returning BACKWARD_BRANCH_LOWER: " +
+                             "count="+count+", prevCount="+prevCount+", leftMax="+leftMax+", leftSecond="+leftSecond);}
+                     return BACKWARD_BRANCH;
+                 }else{
+                     assert(leftMax==prevCount);
+                     if(leftMax>=2*leftSecond){//This constant is adjustable
+                         //keep going
+                     }else{
+                         if(verbose){outstream.println("Returning BACKWARD_BRANCH_SIMILAR: " +
+                                 "count="+count+", prevCount="+prevCount+", leftMax="+leftMax+", leftSecond="+leftSecond);}
+                         return BACKWARD_BRANCH;
+                     }
+                 }
+             }
+         }
+
+         if(count>maxCount){
+             //Report count: rightMax was already refreshed above and now describes the following kmer
+             if(verbose){outstream.println("Returning TOO_DEEP: count="+count);}
+             return TOO_DEEP;
+         }
+
+         bb.append(b);
+         if(verbose){outstream.println("Added base "+(char)b);}
+     }
+
+     assert(bb.length()>maxLength);
+     if(verbose){outstream.println("Returning TOO_LONG: length="+bb.length()+", rightMax="+rightMax);}
+     return TOO_LONG;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ExploreThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Worker that walks the kmer tables looking for unexplored seed kmers,
+  * explores each one, and tallies how many explorations reported success.
+  */
+ class ExploreThread extends AbstractExploreThread{
+
+     /**
+      * @param id_ Identifier forwarded to the superclass together with k
+      */
+     public ExploreThread(int id_){
+         super(id_, k);
+     }
+
+     /** Takes the next table index from the shared counter and processes every cell of that table.
+      * @return false once the index is past the last table, true otherwise */
+     @Override
+     boolean processNextTable(Kmer kmer){
+         final int tableIndex=nextTable.getAndAdd(1);
+         if(tableIndex>=tables.ways){return false;}
+         final HashArray1D current=tables.getTable(tableIndex);
+         final int cells=current.arrayLength();
+         int cell=0;
+         while(cell<cells){
+             final int found=processCell(current, cell);
+             deadEndsFoundT+=found;
+             cell++;
+         }
+         return true;
+     }
+
+     /** Takes the next table index from the shared counter and processes every tree in that table's victim forest.
+      * @return false once the index is past the last table, true otherwise */
+     @Override
+     boolean processNextVictims(Kmer kmer){
+         final int tableIndex=nextVictims.getAndAdd(1);
+         if(tableIndex>=tables.ways){return false;}
+         final HashArray1D current=tables.getTable(tableIndex);
+         final HashForest forest=current.victims();
+         final int roots=forest.arrayLength();
+         for(int root=0; root<roots; root++){
+             final int found=traverseKmerNode(forest.getNode(root));
+             deadEndsFoundT+=found;
+         }
+         return true;
+     }
+
+     /** Explores the kmer stored in one cell, provided its count is within [minSeed, maxCount] and it is still unexplored. */
+     private int processCell(HashArray1D table, int cell){
+         final int count=table.readCellValue(cell);
+         final boolean seedable=(count>=minSeed && count<=maxCount);
+         if(!seedable){return 0;}
+         if(table.getCellOwner(cell)>STATUS_UNEXPLORED){return 0;}
+         final long key=table.getKmer(cell);
+         if(verbose){outstream.println("id="+id+" processing cell "+cell+"; \tkmer="+key+"\t"+AminoAcid.kmerToString(key, k));}
+         return processKmer(key);
+     }
+
+     /** Visits a node, then its left subtree, then its right subtree, summing the results. */
+     private int traverseKmerNode(KmerNode kn){
+         if(kn==null){return 0;}
+         int total=processKmerNode(kn);
+         final KmerNode leftChild=kn.left();
+         if(leftChild!=null){total+=traverseKmerNode(leftChild);}
+         final KmerNode rightChild=kn.right();
+         if(rightChild!=null){total+=traverseKmerNode(rightChild);}
+         return total;
+     }
+
+     /** Explores the pivot kmer of one node, subject to the same filters as processCell. */
+     private int processKmerNode(KmerNode kn){
+         final long key=kn.pivot();
+         final int count=kn.getValue(key);
+         final boolean seedable=(count>=minSeed && count<=maxCount);
+         if(!seedable){return 0;}
+         if(kn.getOwner(key)>STATUS_UNEXPLORED){return 0;}
+         return processKmer(key);
+     }
+
+     /** Counts the kmer as tested and explores from it.
+      * @return 1 if exploreAndMark returned true, else 0 */
+     private int processKmer(long key){
+         kmersTestedT++;
+         final boolean marked=exploreAndMark(key, builderT, leftCounts, rightCounts, minCount, maxCount,
+                 maxLengthToDiscard, maxDistanceToExplore, true, countMatrixT, removeMatrixT);
+         if(marked){return 1;}
+         return 0;
+     }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ShaveThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Removes dead-end kmers.
+ */
+ class ShaveThread extends AbstractShaveThread{
+
+ /**
+ * Constructor
+ */
+ public ShaveThread(int id_){
+ super(id_);
+ }
+
+ @Override
+ boolean processNextTable(){
+ final int tnum=nextTable.getAndAdd(1);
+ if(tnum>=tables.ways){return false;}
+// long x=0;
+ final HashArray1D table=tables.getTable(tnum);
+ final AtomicIntegerArray owners=table.owners();
+ final int[] values=table.values();
+ final int max=table.arrayLength();
+ for(int cell=0; cell<max; cell++){
+ if(owners.get(cell)==STATUS_REMOVE){
+// x++;
+ values[cell]=0;
+ }
+ }
+ for(KmerNode kn : table.victims().array()){
+ if(kn!=null){traverseKmerNode(kn);}
+ }
+
+ table.clearOwnership();
+ kmersRemovedT+=table.regenerate();
+// outstream.println(x);
+ return true;
+ }
+
+ private void traverseKmerNode(KmerNode kn){
+ if(kn==null){return;}
+ if(kn.owner()==STATUS_REMOVE){kn.set(0);}
+ traverseKmerNode(kn.left());
+ traverseKmerNode(kn.right());
+ }
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /* The methods below are one-line delegates: each forwards its arguments unchanged to the
+ * identically named method of 'tables' (or of AbstractKmerTableTable's static toText), except where noted. */
+ private final long toValue(long kmer, long rkmer){return tables.toValue(kmer, rkmer);}
+ int getCount(long kmer, long rkmer){return tables.getCount(kmer, rkmer);}
+ /* Convenience overload: computes the reverse complement of kmer, then calls the three-argument claim. */
+ boolean claim(long kmer, int id){return claim(kmer, AminoAcid.reverseComplementBinaryFast(kmer, k), id);}
+ boolean claim(long kmer, long rkmer, int id){return tables.claim(kmer, rkmer, id);}
+ boolean doubleClaim(ByteBuilder bb, int id/*, long rid*/){return tables.doubleClaim(bb, id/*, rid*/);}
+ boolean claim(ByteBuilder bb, int id, /*long rid, */boolean earlyExit){return tables.claim(bb, id/*, rid*/, earlyExit);}
+ boolean claim(byte[] array, int len, int id, /*long rid, */boolean earlyExit){return tables.claim(array, len, id/*, rid*/, earlyExit);}
+ int findOwner(long kmer){return tables.findOwner(kmer);}
+ int findOwner(ByteBuilder bb, int id){return tables.findOwner(bb, id);}
+ int findOwner(byte[] array, int len, int id){return tables.findOwner(array, len, id);}
+ void release(ByteBuilder bb, int id){tables.release(bb, id);}
+ void release(byte[] array, int len, int id){tables.release(array, len, id);}
+ /* Both fill methods return an index into counts; callers in this class read counts[index] as the highest count. */
+ int fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){return tables.fillRightCounts(kmer, rkmer, counts, mask, shift2);}
+ int fillLeftCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){return tables.fillLeftCounts(kmer, rkmer, counts, mask, shift2);}
+ /* Delegates to the static AbstractKmerTable.toText rather than to 'tables'. */
+ static StringBuilder toText(long kmer, int k){return AbstractKmerTable.toText(kmer, k);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns the kmer table set held by this instance, typed as the abstract superclass. */
+ @Override
+ AbstractKmerTableSet tables(){return tables;}
+
+ /* Table set that every lookup, claim and release in this class is forwarded to. */
+ private final KmerTableSet tables;
+ /* Kmer length; this class uses it for 2-bit-per-base shifts (shift=2*k) and for reverse-complementing. */
+ private final int k;
+
+}
diff --git a/current/assemble/Shaver2.java b/current/assemble/Shaver2.java
new file mode 100755
index 0000000..505c788
--- /dev/null
+++ b/current/assemble/Shaver2.java
@@ -0,0 +1,419 @@
+package assemble;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+import kmer.AbstractKmerTableSet;
+import stream.ByteBuilder;
+import ukmer.AbstractKmerTableU;
+import ukmer.HashArrayU1D;
+import ukmer.HashForestU;
+import ukmer.Kmer;
+import ukmer.KmerNodeU;
+import ukmer.KmerTableSetU;
+import align2.Tools;
+import dna.AminoAcid;
+
+/**
+ * Designed for removal of dead ends (aka hairs).
+ * @author Brian Bushnell
+ * @date Jun 26, 2015
+ *
+ */
+public class Shaver2 extends Shaver {
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Constructor ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Convenience constructor that delegates to the full constructor with
+ * minCount=1, maxCount=1, minSeed=1, maxLengthToDiscard=100, maxDistanceToExplore=100,
+ * removeHair=true and removeBubbles=true.
+ * @param tables_ Kmer table set to operate on
+ * @param threads_ Forwarded to the full constructor
+ */
+ public Shaver2(KmerTableSetU tables_, int threads_){
+ this(tables_, threads_, 1, 1, 1, 100, 100, true, true);
+ }
+
+ /**
+ * Full constructor. Passes every argument to the Shaver superclass constructor and
+ * additionally keeps tables_ in this class's own KmerTableSetU-typed field.
+ * @param tables_ Kmer table set to operate on
+ * @param threads_ Forwarded to the superclass
+ * @param minCount_ Forwarded to the superclass
+ * @param maxCount_ Forwarded to the superclass
+ * @param minSeed_ Forwarded to the superclass
+ * @param maxLengthToDiscard_ Forwarded to the superclass
+ * @param maxDistanceToExplore_ Forwarded to the superclass
+ * @param removeHair_ Forwarded to the superclass
+ * @param removeBubbles_ Forwarded to the superclass
+ */
+ public Shaver2(KmerTableSetU tables_, int threads_,
+ int minCount_, int maxCount_, int minSeed_, int maxLengthToDiscard_, int maxDistanceToExplore_,
+ boolean removeHair_, boolean removeBubbles_){
+ super(tables_, threads_, minCount_, maxCount_, minSeed_, maxLengthToDiscard_, maxDistanceToExplore_, removeHair_, removeBubbles_);
+ tables=tables_;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Factory hook: returns a new instance of this class's inner ExploreThread with the given id. */
+ @Override
+ final AbstractExploreThread makeExploreThread(int id_){return new ExploreThread(id_);}
+ /** Factory hook: returns a new instance of this class's inner ShaveThread with the given id. */
+ @Override
+ final AbstractShaveThread makeShaveThread(int id_){return new ShaveThread(id_);}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Dead-End Removal ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Explores outward from kmer in both directions, then claims the resulting path in bb
+  * with either STATUS_EXPLORED or STATUS_REMOVE.
+  * Returns immediately, without claiming anything, if the kmer already has an owner above STATUS_UNEXPLORED.
+  * The prune parameter is not read by this method.
+  * @param countMatrixT Incremented at [min][max] of the two exploration results for every explored path
+  * @param removeMatrixT Incremented at [min][max] only for paths claimed with STATUS_REMOVE
+  * @return true if the path was claimed with STATUS_REMOVE, false otherwise
+  */
+ public boolean exploreAndMark(Kmer kmer, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int minCount, int maxCount,
+         int maxLengthToDiscard, int maxDistanceToExplore, boolean prune
+         , long[][] countMatrixT, long[][] removeMatrixT
+         ){
+     bb.clear();
+     assert(kmer.len>=kmer.kbig);
+     if(findOwner(kmer)>STATUS_UNEXPLORED){return false;}
+
+     //First direction
+     bb.appendKmer(kmer);
+     final int a=explore(kmer, bb, leftCounts, rightCounts, minCount, maxCount, maxDistanceToExplore);
+
+     //Opposite direction: flip the path and continue from its new right end
+     bb.reverseComplementInPlace();
+     kmer=tables.rightmostKmer(bb, kmer);
+     final int b=explore(kmer, bb, leftCounts, rightCounts, minCount, maxCount, maxDistanceToExplore);
+
+     final int min=Tools.min(a, b);
+     final int max=Tools.max(a, b);
+     countMatrixT[min][max]++;
+
+     //Any of these outcomes means the path is kept
+     final boolean keepForA=(a==TOO_LONG || a==TOO_DEEP || a==LOOP || a==FORWARD_BRANCH);
+     final boolean keepForB=(b==TOO_LONG || b==TOO_DEEP || b==LOOP || b==FORWARD_BRANCH);
+     final boolean tooLongToDiscard=(bb.length()-kbig>maxLengthToDiscard);
+
+     //Removal candidates
+     final boolean hair=(removeHair && min==DEAD_END && (max==DEAD_END || max==BACKWARD_BRANCH));
+     final boolean bubble=(removeBubbles && a==BACKWARD_BRANCH && b==BACKWARD_BRANCH);
+
+     if(keepForA || keepForB || tooLongToDiscard || !(hair || bubble)){
+         claim(bb, STATUS_EXPLORED, false, kmer);
+         return false;
+     }
+
+     removeMatrixT[min][max]++;
+     final boolean success=claim(bb, STATUS_REMOVE, false, kmer);
+     if(verbose || verbose2){System.err.println("Claiming ("+a+","+b+") length "+bb.length()+": "+bb);}
+     assert(success);
+     return true;
+ }
+
+ /**
+  * Explores a single unbranching path in the forward direction, appending each accepted base to bb.
+  * The supplied kmer object is advanced with addRightNumeric as the path is followed.
+  * @param kmer Starting kmer; kmer.len must be at least kmer.kbig (asserted).
+  * @param bb Buffer holding the path; asserted to already contain at least kbig bases. Extended in place.
+  * @param leftCounts Array populated by fillLeftCounts for each new kmer.
+  * @param rightCounts Array populated by fillRightCounts for each kmer.
+  * @param minCount A best right count below this is a dead end; a second-best count at or above it is a branch.
+  * @param maxCount A count above this ends the path with TOO_DEEP.
+  * @param maxLength0 Length budget; the loop runs while bb.length() is at most maxLength0+kbig.
+  * @return Reason for ending in this direction:
+  * DEAD_END, TOO_LONG, TOO_DEEP, LOOP, FORWARD_BRANCH, or BACKWARD_BRANCH
+  */
+ public int explore(Kmer kmer, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int minCount, int maxCount, int maxLength0){
+     if(verbose){outstream.println("Entering explore with bb.length()="+bb.length());}
+     assert(bb.length()>=kmer.kbig && kmer.len>=kmer.kbig);
+     if(bb.length()==0){bb.appendKmer(kmer);} //Only reachable when assertions are disabled
+
+     final int maxLength=maxLength0+kbig;
+
+     final long firstKey=kmer.xor(); //Seeing this value again means the path has looped back to its start
+     HashArrayU1D table=tables.getTable(kmer);
+     int count=table.getValue(kmer);
+     assert(count>=minCount && count<=maxCount);
+
+     int nextRightMaxPos=fillRightCounts(kmer, rightCounts);
+     int nextRightMax=rightCounts[nextRightMaxPos];
+     if(nextRightMax<minCount){
+         if(verbose){outstream.println("Returning DEAD_END: rightMax="+nextRightMax);}
+         return DEAD_END;
+     }
+
+     while(bb.length()<=maxLength){
+
+         //Snapshot the right-side statistics of the current kmer before rightCounts is refilled below
+         final int rightMaxPos=nextRightMaxPos;
+         final int rightMax=rightCounts[rightMaxPos];
+         final int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+         final int rightSecond=rightCounts[rightSecondPos];
+
+         if(verbose){
+             outstream.println("kmer: "+toText(kmer));
+             outstream.println("Right counts: "+count+", "+Arrays.toString(rightCounts));
+             outstream.println("rightMaxPos="+rightMaxPos);
+             outstream.println("rightMax="+rightMax);
+             outstream.println("rightSecondPos="+rightSecondPos);
+             outstream.println("rightSecond="+rightSecond);
+         }
+
+         final int prevCount=count;
+
+         //Generate the new base
+         final byte b=AminoAcid.numberToBase[rightMaxPos];
+         final long x=rightMaxPos;
+         kmer.addRightNumeric(x); //Return value is not needed here
+
+         //Now consider the next kmer
+         if(kmer.xor()==firstKey){
+             if(verbose){outstream.println("Returning LOOP");}
+             return LOOP;
+         }
+         table=tables.getTable(kmer);
+
+         assert(table.getValue(kmer)==rightMax);
+         count=rightMax;
+
+         {//Fill right and look for dead end
+             nextRightMaxPos=fillRightCounts(kmer, rightCounts);
+             nextRightMax=rightCounts[nextRightMaxPos];
+             if(nextRightMax<minCount){
+                 //Report the value that was actually tested, not the previous kmer's rightMax
+                 if(verbose){outstream.println("Returning DEAD_END: rightMax="+nextRightMax);}
+                 return DEAD_END;
+             }
+         }
+
+         {//Look left
+             final int leftMaxPos=fillLeftCounts(kmer, leftCounts);
+             final int leftMax=leftCounts[leftMaxPos];
+             final int leftSecondPos=Tools.secondHighestPosition(leftCounts);
+             final int leftSecond=leftCounts[leftSecondPos];
+
+             if(verbose){
+                 outstream.println("Left counts: "+count+", "+Arrays.toString(leftCounts));
+                 outstream.println("leftMaxPos="+leftMaxPos);
+                 outstream.println("leftMax="+leftMax);
+                 outstream.println("leftSecondPos="+leftSecondPos);
+                 outstream.println("leftSecond="+leftSecond);
+             }
+
+             if(leftSecond>=minCount || leftMax>prevCount){//Backward branch
+                 if(leftMax>prevCount){
+                     if(verbose){outstream.println("Returning BACKWARD_BRANCH_LOWER: " +
+                             "count="+count+", prevCount="+prevCount+", leftMax="+leftMax+", leftSecond="+leftSecond);}
+                     return BACKWARD_BRANCH;
+                 }else{
+                     assert(leftMax==prevCount);
+                     if(leftMax>=2*leftSecond){//This constant is adjustable
+                         //keep going
+                     }else{
+                         if(verbose){outstream.println("Returning BACKWARD_BRANCH_SIMILAR: " +
+                                 "count="+count+", prevCount="+prevCount+", leftMax="+leftMax+", leftSecond="+leftSecond);}
+                         return BACKWARD_BRANCH;
+                     }
+                 }
+             }
+
+         }
+
+         //Look right (uses the snapshot taken for the previous kmer at the top of this iteration)
+         if(rightSecond>=minCount){
+             if(verbose){outstream.println("Returning FORWARD_BRANCH: rightSecond="+rightSecond);}
+             return FORWARD_BRANCH;
+         }
+
+         if(count>maxCount){
+             if(verbose){outstream.println("Returning TOO_DEEP: rightMax="+rightMax);}
+             return TOO_DEEP;
+         }
+
+         bb.append(b);
+         if(verbose){outstream.println("Added base "+(char)b);}
+     }
+
+     assert(bb.length()>maxLength);
+     if(verbose){outstream.println("Returning TOO_LONG: length="+bb.length());}
+     return TOO_LONG;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ExploreThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Searches for dead ends.
+ */
+ class ExploreThread extends AbstractExploreThread{
+
+ /**
+ * Constructor
+ */
+ public ExploreThread(int id_){
+ super(id_, kbig);
+ }
+
+ @Override
+ boolean processNextTable(final Kmer kmer){
+ final int tnum=nextTable.getAndAdd(1);
+ if(tnum>=tables.ways){return false;}
+ final HashArrayU1D table=tables.getTable(tnum);
+ final int max=table.arrayLength();
+ for(int cell=0; cell<max; cell++){
+ int x=processCell(table, cell, kmer);
+ deadEndsFoundT+=x;
+ }
+ return true;
+ }
+
+ @Override
+ boolean processNextVictims(final Kmer kmer){
+ final int tnum=nextVictims.getAndAdd(1);
+ if(tnum>=tables.ways){return false;}
+ final HashArrayU1D table=tables.getTable(tnum);
+ final HashForestU forest=table.victims();
+ final int max=forest.arrayLength();
+ for(int cell=0; cell<max; cell++){
+ KmerNodeU kn=forest.getNode(cell);
+ int x=traverseKmerNodeU(kn, kmer);
+ deadEndsFoundT+=x;
+ }
+ return true;
+ }
+
+ private int processCell(HashArrayU1D table, int cell, Kmer kmer0){
+ int count=table.readCellValue(cell);
+ if(count<minSeed || count>maxCount){return 0;}
+ int owner=table.getCellOwner(cell);
+ if(owner>STATUS_UNEXPLORED){return 0;}
+ Kmer kmer=table.fillKmer(cell, kmer0);
+ if(kmer==null){return 0;}
+ if(verbose){outstream.println("id="+id+" processing cell "+cell+"; \tkmer="+kmer);}
+
+ return processKmer(kmer);
+ }
+
+ private int traverseKmerNodeU(KmerNodeU kn, Kmer kmer){
+ int sum=0;
+ if(kn!=null){
+ sum+=processKmerNodeU(kn, kmer);
+ if(kn.left()!=null){
+ sum+=traverseKmerNodeU(kn.left(), kmer);
+ }
+ if(kn.right()!=null){
+ sum+=traverseKmerNodeU(kn.right(), kmer);
+ }
+ }
+ return sum;
+ }
+
+ private int processKmerNodeU(KmerNodeU kn, Kmer kmer){
+ kmer.setFrom(kn.pivot());
+ final int count=kn.getValue(kmer);
+ if(count<minSeed || count>maxCount){return 0;}
+ int owner=kn.getOwner(kmer);
+ if(owner>STATUS_UNEXPLORED){return 0;}
+
+ return processKmer(kmer);
+ }
+
+ private int processKmer(Kmer kmer){
+ kmersTestedT++;
+ boolean b=exploreAndMark(kmer, builderT, leftCounts, rightCounts, minCount, maxCount, maxLengthToDiscard, maxDistanceToExplore, true
+ , countMatrixT, removeMatrixT
+ );
+ return b ? 1 : 0;
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ShaveThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Removes dead-end kmers.
+ */
+ private class ShaveThread extends AbstractShaveThread{
+
+ /**
+ * Constructor
+ */
+ public ShaveThread(int id_){
+ super(id_);
+ }
+
+ @Override
+ boolean processNextTable(){
+ final int tnum=nextTable.getAndAdd(1);
+ if(tnum>=tables.ways){return false;}
+// long x=0;
+ final HashArrayU1D table=tables.getTable(tnum);
+ final AtomicIntegerArray owners=table.owners();
+ final int[] values=table.values();
+ final int max=table.arrayLength();
+ for(int cell=0; cell<max; cell++){
+ if(owners.get(cell)==STATUS_REMOVE){
+// x++;
+ values[cell]=0;
+ }
+ }
+ for(KmerNodeU kn : table.victims().array()){
+ if(kn!=null){traverseKmerNodeU(kn);}
+ }
+
+ table.clearOwnership();
+ kmersRemovedT+=table.regenerate();
+// outstream.println(x);
+ return true;
+ }
+
+ private void traverseKmerNodeU(KmerNodeU kn){
+ if(kn==null){return;}
+ if(kn.owner()==STATUS_REMOVE){kn.set(0);}
+ traverseKmerNodeU(kn.left());
+ traverseKmerNodeU(kn.right());
+ }
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /* The methods below are one-line delegates: each forwards its arguments to the identically named
+ * method of 'tables' (or of AbstractKmerTableU's static toText). Note that doubleClaim passes
+ * its arguments to tables in the order (bb, id, kmer). */
+ int getCount(Kmer kmer){return tables.getCount(kmer);}
+ boolean claim(Kmer kmer, int id){return tables.claim(kmer, id);}
+ boolean doubleClaim(ByteBuilder bb, int id/*, long rid*/, Kmer kmer){return tables.doubleClaim(bb, id, kmer/*, rid*/);}
+ boolean claim(ByteBuilder bb, int id, /*long rid, */boolean earlyExit, Kmer kmer){return tables.claim(bb, id/*, rid*/, earlyExit, kmer);}
+ boolean claim(byte[] array, int len, int id, /*long rid, */boolean earlyExit, Kmer kmer){return tables.claim(array, len, id/*, rid*/, earlyExit, kmer);}
+ int findOwner(Kmer kmer){return tables.findOwner(kmer);}
+ int findOwner(ByteBuilder bb, int id, Kmer kmer){return tables.findOwner(bb, id, kmer);}
+ int findOwner(byte[] array, int len, int id, Kmer kmer){return tables.findOwner(array, len, id, kmer);}
+ void release(ByteBuilder bb, int id, Kmer kmer){tables.release(bb, id, kmer);}
+ void release(byte[] array, int len, int id, Kmer kmer){tables.release(array, len, id, kmer);}
+ /* Both fill methods return an index into counts; callers in this class read counts[index] as the highest count. */
+ int fillRightCounts(Kmer kmer, int[] counts){return tables.fillRightCounts(kmer, counts);}
+ int fillLeftCounts(Kmer kmer, int[] counts){return tables.fillLeftCounts(kmer, counts);}
+ /* Delegates to the static AbstractKmerTableU.toText rather than to 'tables'. */
+ static StringBuilder toText(Kmer kmer){return AbstractKmerTableU.toText(kmer);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns the kmer table set held by this instance, typed as the abstract superclass. */
+ @Override
+ AbstractKmerTableSet tables(){return tables;}
+
+ /* Table set assigned in the constructor; every lookup, claim and release in this class is forwarded to it. */
+ private final KmerTableSetU tables;
+
+}
diff --git a/current/assemble/Tadpole.java b/current/assemble/Tadpole.java
new file mode 100755
index 0000000..bf7d0c3
--- /dev/null
+++ b/current/assemble/Tadpole.java
@@ -0,0 +1,1433 @@
+package assemble;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import jgi.BBMerge;
+import kmer.AbstractKmerTableSet;
+
+import ukmer.Kmer;
+import ukmer.KmerTableSetU;
+
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.IntList;
+import align2.ListNum;
+import align2.LongList;
+import align2.ReadLengthComparator;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+
+/**
+ * Short-kmer assembler based on KmerCountExact.
+ * @author Brian Bushnell
+ * @date May 15, 2015
+ *
+ */
+public abstract class Tadpole {
+
+ /**
+  * Command-line entry point. Prints the options and exits if help was requested;
+  * otherwise builds a Tadpole, reports how long construction took, and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+
+     args=Parser.parseConfig(args);
+     final boolean helpRequested=Parser.parseHelp(args, true);
+     if(helpRequested){
+         printOptions();
+         System.exit(0);
+     }
+
+     //The total timer is handed to process(); the init timer covers construction only
+     final Timer total=new Timer();
+     final Timer init=new Timer();
+     total.start();
+     init.start();
+
+     final Tadpole tadpole=makeTadpole(args, true);
+     init.stop();
+     outstream.println("Initialization Time: \t"+init);
+
+     //Run the assembler
+     tadpole.process(total);
+ }
+
+ /**
+  * Factory that picks the implementation from the kmer length found in the arguments:
+  * Tadpole1 when preparseK returns at most 31, Tadpole2 otherwise.
+  * @param args Command line arguments
+  * @param setDefaults Forwarded to the chosen constructor (it was previously ignored and true was always passed)
+  * @return The constructed Tadpole
+  */
+ public static Tadpole makeTadpole(String[] args, boolean setDefaults){
+     final int k=preparseK(args);
+     if(k<=31){
+         return new Tadpole1(args, setDefaults);
+     }else{
+         return new Tadpole2(args, setDefaults);
+     }
+ }
+
+ /**
+  * Scans the arguments for "k=<int>" ahead of full argument parsing.
+  * The last occurrence wins; when no k is given, 31 is used.
+  * Arguments without a parameter name (null, empty, "=value", or made only of dashes) are skipped
+  * here instead of raising an index exception.
+  * @param args Command line arguments
+  * @return Kmer.getMult(k)*Kmer.getK(k) for the selected k
+  * @throws NumberFormatException if k is present without a parsable integer value
+  */
+ public static final int preparseK(String[] args){
+     int k=31;
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         if(arg==null || arg.isEmpty()){continue;}
+         final String[] split=arg.split("=");
+         //"=" splits to an empty array and "=5" to a leading empty name; neither can be the k parameter
+         if(split.length<1 || split[0].isEmpty()){continue;}
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if("null".equalsIgnoreCase(b)){b=null;}
+         //Strip leading dashes, except from a dotted token among the first two arguments that names an existing file.
+         //startsWith is safe once every character has been stripped, unlike charAt(0).
+         while(a.startsWith("-") && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+         if(a.equals("k")){
+             k=Integer.parseInt(b);
+         }
+     }
+     return Kmer.getMult(k)*Kmer.getK(k);
+ }
+
+ /**
+ * Display usage information.
+ * Currently a placeholder: the only text written to outstream is "Syntax:" followed by "TODO".
+ */
+ protected static final void printOptions(){
+ outstream.println("Syntax:\nTODO");
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public Tadpole(String[] args, boolean setDefaults){
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ kbig=preparseK(args);
+
+ if(setDefaults){
+ /* Set global defaults */
+ ReadWrite.ZIPLEVEL=2;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=true;
+ FastaReadInputStream.SPLIT_READS=false;
+ ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+ AbstractKmerTableSet.defaultMinprob=0.5;
+ }
+
+ /* Initialize local variables with defaults */
+ Parser parser=new Parser();
+ boolean ecc_=false, ecco_=false, setEcc_=false;
+ boolean useOwnership_=false, setUseOwnership_=false;
+
+ {
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1") || a.equals("ine") || a.equals("ine1")){
+ in1.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in1.add(ss);
+ }
+ }
+ }else if(a.equals("in2") || a.equals("ine2")){
+ in2.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in2.add(ss);
+ }
+ }
+ }else if(a.equals("out") || a.equals("oute") || a.equals("oute1")){
+ out1.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ out1.add(ss);
+ }
+ }
+ }else if(a.equals("out2") || a.equals("oute2")){
+ out2.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ out2.add(ss);
+ }
+ }
+ }else if(a.equals("outkmers") || a.equals("outk") || a.equals("dump")){
+ outKmers=b;
+ }else if(a.equals("mincounttodump")){
+ minToDump=(int)Tools.parseKMG(b);
+ }else if(a.equals("hist") || a.equals("khist")){
+ outHist=b;
+ }else if(a.equals("ihist") || a.equals("inserthistogram")){
+ outInsert=b;
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("mode")){
+ if(Character.isDigit(b.charAt(0))){
+ processingMode=(int)Tools.parseKMG(b);
+ }else if(b.equalsIgnoreCase("contig")){
+ processingMode=contigMode;
+ }else if(b.equalsIgnoreCase("extend")){
+ processingMode=extendMode;
+ }else if(b.equalsIgnoreCase("correct") || b.equalsIgnoreCase("ecc")){
+ processingMode=correctMode;
+ }else if(b.equalsIgnoreCase("insert")){
+ processingMode=insertMode;
+ }else{
+ assert(false) : "Unknown mode "+b;
+ }
+ }else if(a.equals("ownership")){
+ if("auto".equalsIgnoreCase(b)){
+ setUseOwnership_=false;
+ }else{
+ useOwnership_=Tools.parseBoolean(b);
+ setUseOwnership_=true;
+ }
+ }else if(a.equals("showstats") || a.equals("stats")){
+ showStats=Tools.parseBoolean(b);
+ }else if(a.equals("maxextension") || a.equals("maxe")){
+ extendLeft=extendRight=(int)Tools.parseKMG(b);
+ }else if(a.equals("extendright") || a.equals("er")){
+ extendRight=(int)Tools.parseKMG(b);
+ }else if(a.equals("extendleft") || a.equals("el")){
+ extendLeft=(int)Tools.parseKMG(b);
+ }else if(a.equals("minextension") || a.equals("mine")){
+ minExtension=(int)Tools.parseKMG(b);
+ }else if(a.equals("maxcontiglength") || a.equals("maxcontig") || a.equals("maxlength") || a.equals("maxlen") || a.equals("maxc")){
+ maxContigLen=(int)Tools.parseKMG(b);
+ if(maxContigLen<0){maxContigLen=1000000000;}
+ }else if(a.equals("mincontiglength") || a.equals("mincontiglen") || a.equals("mincontig") || a.equals("minc")){
+ minContigLen=(int)Tools.parseKMG(b);
+ }else if(a.equals("mincoverage") || a.equals("mincov")){
+ minCoverage=Float.parseFloat(b);
+ }else if(a.equals("branchlower") || a.equals("branchlowerconst") || a.equals("blc")){
+ branchLowerConst=(int)Tools.parseKMG(b);
+ }else if(a.equals("branchmult2") || a.equals("bm2")){
+ branchMult2=(int)Tools.parseKMG(b);
+ }else if(a.equals("branchmult") || a.equals("branchmult1") || a.equals("bm1")){
+ branchMult1=(int)Tools.parseKMG(b);
+ }else if(a.equals("mincount") || a.equals("mincov") || a.equals("mindepth") || a.equals("min")){
+ minCountSeed=minCountExtend=(int)Tools.parseKMG(b);
+ }else if(a.equals("mindepthseed") || a.equals("mds") || a.equals("mincountseed") || a.equals("mcs")){
+ minCountSeed=(int)Tools.parseKMG(b);
+ }else if(a.equals("mindepthextend") || a.equals("mde") || a.equals("mincountextend") || a.equals("mce")){
+ minCountExtend=(int)Tools.parseKMG(b);
+ }else if(a.equals("mincountretain") || a.equals("mincr") || a.equals("mindepthretain") || a.equals("mindr")){
+ kmerRangeMin=(int)Tools.parseKMG(b);
+ }else if(a.equals("maxcountretain") || a.equals("maxcr") || a.equals("maxdepthretain") || a.equals("maxdr")){
+ kmerRangeMax=(int)Tools.parseKMG(b);
+ }else if(a.equals("contigpasses")){
+ contigPasses=(int)Tools.parseKMG(b);
+ }else if(a.equals("contigpassmult")){
+ contigPassMult=Double.parseDouble(b);
+ assert(contigPassMult>=1) : "contigPassMult must be at least 1.";
+ }else if(a.equals("threads") || a.equals("t")){
+ Shared.setThreads(b);
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("verbose2")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose2=Tools.parseBoolean(b);
+ }else if(a.equals("ordered")){
+ ordered=Tools.parseBoolean(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("histcolumns")){
+ histColumns=(int)Tools.parseKMG(b);
+ }else if(a.equals("histmax")){
+ histMax=(int)Tools.parseKMG(b);
+ }else if(a.equals("histheader")){
+ histHeader=Tools.parseBoolean(b);
+ }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+ histZeros=!Tools.parseBoolean(b);
+ }else if(a.equals("ilb") || a.equals("ignoreleftbranches") || a.equals("ignoreleftjunctions") || a.equals("ibb") || a.equals("ignorebackbranches")){
+ extendThroughLeftJunctions=Tools.parseBoolean(b);
+ }else if(a.equals("ibo") || a.equals("ignorebadowner")){
+ IGNORE_BAD_OWNER=Tools.parseBoolean(b);
+ }
+
+ //Shaver
+ else if(a.equals("shave") || a.equals("removedeadends")){
+ if(b==null || Character.isLetter(b.charAt(0))){
+ removeDeadEnds=Tools.parseBoolean(b);
+ }else{
+ maxShaveDepth=Integer.parseInt(b);
+ removeDeadEnds=(maxShaveDepth>0);
+ }
+ }else if(a.equals("rinse") || a.equals("shampoo") || a.equals("removebubbles")){
+ removeBubbles=Tools.parseBoolean(b);
+ }else if(a.equals("maxshavedepth") || a.equals("shavedepth") || a.equals("msd")){
+ maxShaveDepth=Integer.parseInt(b);
+ }else if(a.equals("shavediscardlength") || a.equals("shavelength") || a.equals("discardlength") || a.equals("sdl")){
+ shaveDiscardLen=Integer.parseInt(b);
+ }else if(a.equals("shaveexploredistance") || a.equals("shaveexploredist") || a.equals("exploredist") || a.equals("sed")){
+ shaveExploreDist=Integer.parseInt(b);
+ }else if(a.equals("printeventcounts")){
+ Shaver.printEventCounts=Tools.parseBoolean(b);
+ }
+
+ //Error Correction
+ else if(a.equals("ecctail") || a.equals("eccright") || a.equals("tail")){
+ ECC_TAIL=Tools.parseBoolean(b);
+ }else if(a.equals("pincer") || a.equals("eccpincer")){
+ ECC_PINCER=Tools.parseBoolean(b);
+ }else if(a.equals("eccall") || a.equals("eccfull") || a.equals("ecccomplete") || a.equals("aecc") || a.equals("aec") || a.equals("aggressive")){
+ ECC_ALL=Tools.parseBoolean(b);
+ }else if(a.equals("ecc")){
+ ecc_=Tools.parseBoolean(b);
+ setEcc_=true;
+ }else if(a.equals("ecco")){
+ ecco_=Tools.parseBoolean(b);
+ }else if(a.equals("em1") || a.equals("errormult1")){
+ errorMult1=Float.parseFloat(b);
+ }else if(a.equals("em2") || a.equals("errormult2")){
+ errorMult2=Float.parseFloat(b);
+ }else if(a.equals("elc") || a.equals("errorlowerconst")){
+ errorLowerConst=Integer.parseInt(b);
+ }else if(a.equals("mcc") || a.equals("mincountcorrect")){
+ minCountCorrect=Integer.parseInt(b);
+ }else if(a.equals("psc") || a.equals("pathsimilarityconstant")){
+ pathSimilarityConstant=Integer.parseInt(b);
+ }else if(a.equals("psf") || a.equals("pathsimilarityfraction")){
+ pathSimilarityFraction=Float.parseFloat(b);
+ }else if(a.equals("eep") || a.equals("errorextensionpincer")){
+ errorExtensionPincer=Integer.parseInt(b);
+ }else if(a.equals("eet") || a.equals("errorextensiontail")){
+ errorExtensionTail=Integer.parseInt(b);
+ }else if(a.equals("mbb") || a.equals("markbad") || a.equals("markbadbases")){
+ if(b==null){b="1";}
+ MARK_BAD_BASES=Integer.parseInt(b);
+ }else if(a.equals("mdo") || a.equals("markdeltaonly")){
+ MARK_DELTA_ONLY=Tools.parseBoolean(b);
+ }
+
+ //Trimming
+ else if(a.equals("trim") || a.equals("trimends")){
+ if(b==null || Character.isLetter(b.charAt(0))){
+ trimEnds=Tools.parseBoolean(b) ? -1 : 0;
+ }else{
+ trimEnds=Integer.parseInt(b);
+ }
+ trimEnds=Tools.max(trimEnds, 0);
+ }
+
+ else if(KmerTableSetU.isValidArgument(a)){
+ //Do nothing
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ if(trimEnds==-1){
+ trimEnds=kbig/2;
+ }
+
+ if(verbose){
+ assert(false) : "Verbose is disabled.";
+// AbstractKmerTableU.verbose=true;
+ }
+ THREADS=Shared.threads();
+
+ assert(kmerRangeMax>=kmerRangeMin) : "kmerRangeMax must be at least kmerRangeMin: "+kmerRangeMax+", "+kmerRangeMin;
+
+ if(processingMode==extendMode || processingMode==correctMode){
+
+// {//Use in and out if ine and oute are not specified, in this mode.
+// if(ine1.isEmpty() && ine2.isEmpty()){
+// ine1.addAll(in1);
+// ine2.addAll(in2);
+// }
+// if(oute1.isEmpty() && oute2.isEmpty() && outContigs!=null){
+// oute1.add(outContigs);
+// }
+// }
+
+ if(processingMode==extendMode){
+ if(extendLeft==-1){extendLeft=100;}
+ if(extendRight==-1){extendRight=100;}
+ }else if(processingMode==correctMode){
+ extendLeft=extendRight=0;
+ if(!setEcc_){ecc_=true;}
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ if(setUseOwnership_){
+ useOwnership=useOwnership_;
+ }else{
+ useOwnership=(processingMode==contigMode || removeBubbles || removeDeadEnds);
+ }
+
+ final int extraBytesPerKmer;
+ {
+ int x=0;
+ if(useOwnership){x+=4;}
+ if(processingMode==correctMode){}
+ else if(processingMode==contigMode || processingMode==extendMode){x+=1;}
+ extraBytesPerKmer=x;
+ }
+
+ /* Set final variables; post-process and validate argument combinations */
+
+ ecc=ecc_;
+ ecco=ecco_;
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ for(int i=0; i<in1.size(); i++){
+ String s=in1.get(i);
+ if(s!=null && s.contains("#") && !new File(s).exists()){
+ int pound=s.lastIndexOf('#');
+ String a=s.substring(0, pound);
+ String b=s.substring(pound+1);
+ in1.set(i, a+1+b);
+ in2.add(a+2+b);
+ }
+ }
+
+ for(int i=0; i<out1.size(); i++){
+ String s=out1.get(i);
+ if(s!=null && s.contains("#")){
+ int pound=s.lastIndexOf('#');
+ String a=s.substring(0, pound);
+ String b=s.substring(pound+1);
+ out1.set(i, a+1+b);
+ out2.add(a+2+b);
+ }
+ }
+
+ nextTable=new AtomicInteger[contigPasses];
+ nextVictims=new AtomicInteger[contigPasses];
+ for(int i=0; i<contigPasses; i++){
+ nextTable[i]=new AtomicInteger(0);
+ nextVictims[i]=new AtomicInteger(0);
+ }
+
+ if(!Tools.testOutputFiles(overwrite, append, false, outKmers, outHist)){
+ throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
+ }
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
+ }
+ if(!Tools.testInputFiles(true, true, in1, in2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+ assert(THREADS>0);
+ outstream.println("Using "+THREADS+" threads.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Top-level driver. Runs the mode-specific work via process2(), writes the
+  * kmer histogram and/or kmer dump if requested, releases the kmer tables,
+  * stops the supplied timer and prints speed/assembly statistics.
+  * @param t Timer covering the whole run; it is stopped here.
+  * @throws RuntimeException if errorState was set during processing.
+  */
+ public final void process(Timer t){
+
+     /* Check for output file collisions */
+     Tools.testOutputFiles(overwrite, append, false, outKmers, outHist);
+
+     /* Count kmers */
+     process2(processingMode);
+
+     final boolean bothOutputs=(outHist!=null && outKmers!=null);
+     if(bothOutputs && THREADS>1){
+         //Both files were requested and spare threads exist: write them concurrently.
+         final Timer writeTimer=new Timer();
+         writeTimer.start();
+         final Thread[] writers={new DumpKmersThread(), new MakeKhistThread()};
+         for(Thread writer : writers){writer.start();}
+         for(Thread writer : writers){
+             //join() returns early when interrupted, so loop until the thread has really finished.
+             while(writer.getState()!=Thread.State.TERMINATED){
+                 try {
+                     writer.join();
+                 } catch (InterruptedException e) {
+                     e.printStackTrace();
+                 }
+             }
+         }
+         writeTimer.stop();
+         outstream.println("Write Time: \t"+writeTimer);
+     }else{
+         if(outHist!=null){makeKhist();}
+         if(outKmers!=null){dumpKmersAsText();}
+     }
+
+     clearData();
+
+     /* Stop timer and calculate speed statistics */
+     t.stop();
+
+     if(showSpeed){
+         final double readsPerNano=readsIn/(double)(t.elapsed);
+         final double basesPerNano=basesIn/(double)(t.elapsed);
+
+         //Format with k or m suffixes
+         String readCount;
+         if(readsIn<100000){
+             readCount=""+readsIn;
+         }else if(readsIn<100000000){
+             readCount=(readsIn/1000)+"k";
+         }else{
+             readCount=(readsIn/1000000)+"m";
+         }
+         String baseCount;
+         if(basesIn<100000){
+             baseCount=""+basesIn;
+         }else if(basesIn<100000000){
+             baseCount=(basesIn/1000)+"k";
+         }else{
+             baseCount=(basesIn/1000000)+"m";
+         }
+
+         //Left-pad with spaces to a width of 8
+         readCount=String.format("%8s", readCount);
+         baseCount=String.format("%8s", baseCount);
+
+         outstream.println("\nTotal Time: \t"+t);
+
+         if(processingMode==extendMode || processingMode==correctMode){
+             outstream.println("\nReads Processed: "+readCount+" \t"+String.format("%.2fk reads/sec", readsPerNano*1000000));
+             outstream.println("Bases Processed: "+baseCount+" \t"+String.format("%.2fm bases/sec", basesPerNano*1000));
+         }
+     }
+
+     //Print assembly statistics for the first output file when it holds fasta contigs.
+     final String contigFile=(out1.isEmpty() ? null : out1.get(0));
+     if(showStats && contigFile!=null && processingMode==contigMode && FileFormat.isFasta(ReadWrite.rawExtension(contigFile))){
+         outstream.println();
+         jgi.AssemblyStats2.main(new String[] {"in="+contigFile});
+     }
+
+     /* Throw an exception if errors were detected */
+     if(errorState){
+         throw new RuntimeException(getClass().getSimpleName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Writes the kmer histogram; process() calls this only when outHist is set. */
+ abstract void makeKhist();
+ /** Writes the kmers as text; process() calls this only when outKmers is set. */
+ abstract void dumpKmersAsText();
+ /**
+  * Fills the kmer tables. process2() calls this first, passing its phase timer.
+  * @param t Phase timer supplied by the caller.
+  * @return A count defined by the subclass (ignored by process2()) - TODO confirm its meaning.
+  */
+ public abstract long loadKmers(Timer t);
+
+ /** Drops the reference to the contig list and clears the kmer tables returned by tables(). */
+ public final void clearData(){
+ allContigs=null;
+ tables().clear();
+ }
+
+ /**
+  * Runs one full pass: loads kmers, optionally shaves/rinses the graph and removes
+  * kmers outside [kmerRangeMin, kmerRangeMax], then either extends/error-corrects
+  * reads (extendMode, correctMode) or builds contigs (any other mode), printing
+  * a statistics summary for the phase.
+  * @param mode Processing mode (contigMode, extendMode, correctMode or insertMode).
+  */
+ public final void process2(int mode){
+
+     /* Start phase timer */
+     Timer t=new Timer();
+
+     /* Fill tables with kmers */
+     outstream.println("\nLoading kmers.\n");
+     loadKmers(t);
+
+     t.stop();
+//     outstream.println("Input: \t"+tables.readsIn+" reads \t\t"+tables.basesIn+" bases.");
+//     outstream.println("Unique Kmers: \t"+tables.kmersLoaded);
+//     outstream.println("Load Time: \t"+t);
+
+
+     t.start();
+
+     shaveAndRinse(t, removeDeadEnds, removeBubbles, true);
+
+     //Only filter by count when the retained range was narrowed from its defaults.
+     if(kmerRangeMin>1 || kmerRangeMax<Integer.MAX_VALUE){
+         AbstractRemoveThread.process(THREADS, kmerRangeMin, kmerRangeMax, tables(), true);
+     }
+
+     if(mode==extendMode || mode==correctMode){
+         outstream.println("\nExtending/error-correcting.\n");
+
+         //Temporarily disable validation in the Read constructor; the previous setting is restored afterwards.
+         final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+         Read.VALIDATE_IN_CONSTRUCTOR=false;
+         extendReads();
+         Read.VALIDATE_IN_CONSTRUCTOR=vic;
+
+         if(DISPLAY_PROGRESS){
+             outstream.println("\nAfter extending reads:");
+             Shared.printMemory();
+             outstream.println();
+         }
+
+         t.stop();
+
+         outstream.println("Input: \t"+readsIn+" reads \t\t"+basesIn+" bases.");
+         outstream.println("Output: \t"+readsIn+" reads \t\t"+(basesIn+basesExtended)+" bases.");
+         if(extendLeft>0 || extendRight>0){
+             outstream.println("Bases extended: \t"+basesExtended);
+             outstream.println("Reads extended: \t"+readsExtended+String.format(" \t(%.2f%%)", readsExtended*100.0/readsIn));
+         }
+         if(ecc){
+             //readsCorrected counts every read with at least one correction; subtract the fully corrected ones.
+             long partial=(readsCorrected-readsFullyCorrected);
+             outstream.println("Errors detected: \t"+basesDetected);
+             outstream.println("Errors corrected: \t"+(basesCorrectedTail+basesCorrectedPincer)+" \t("+basesCorrectedPincer+" pincer, "+basesCorrectedTail+" tail)");
+             outstream.println("Reads with errors detected: \t"+readsDetected+String.format(" \t(%.2f%%)", readsDetected*100.0/readsIn));
+             //Report the fully corrected count here so it matches the percentage and does not double-count the partial reads below.
+             outstream.println("Reads fully corrected: \t"+readsFullyCorrected+String.format(" \t(%.2f%% of detected)", readsFullyCorrected*100.0/readsDetected));
+             outstream.println("Reads partly corrected: \t"+partial+String.format(" \t(%.2f%% of detected)", partial*100.0/readsDetected));
+         }
+         if(MARK_BAD_BASES>0){
+             outstream.println("Reads marked: \t"+readsMarked+String.format(" \t(%.2f%%)", readsMarked*100.0/readsIn));
+             outstream.println("Bases marked: \t"+basesMarked+String.format(" \t(%.2f%%)", basesMarked*100.0/basesIn));
+         }
+
+         outstream.println("Extend/error-correct time: \t"+t);
+     }else{
+         /* Build contigs */
+         outstream.println("\nBuilding contigs.\n");
+         buildContigs(mode);
+
+         if(DISPLAY_PROGRESS){
+             outstream.println("\nAfter building contigs:");
+             Shared.printMemory();
+             outstream.println();
+         }
+
+         t.stop();
+
+         if(readsIn>0){outstream.println("Input: \t"+readsIn+" reads \t\t"+basesIn+" bases.");}
+         outstream.println("Bases generated: \t"+basesBuilt);
+         outstream.println("Contigs generated: \t"+contigsBuilt);
+         outstream.println("Longest contig: \t"+longestContig);
+         outstream.println("Contig-building time: \t"+t);
+     }
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Optionally removes dead ends ("shave") and/or error bubbles ("rinse") by
+  * delegating to shave(boolean, boolean). The timer is stopped after the removal
+  * and restarted before returning.
+  * @param t Phase timer; untouched when neither operation is requested.
+  * @param shave Remove dead ends.
+  * @param rinse Remove error bubbles.
+  * @param print Print progress and result lines to outstream.
+  * @return The value returned by shave(), reported as kmers removed; 0 when nothing was requested.
+  */
+ public final long shaveAndRinse(Timer t, boolean shave, boolean rinse, boolean print){
+     if(!shave && !rinse){return 0;}
+
+     if(print){
+         final String banner;
+         if(shave && rinse){
+             banner="\nRemoving dead ends and error bubbles.";
+         }else if(shave){
+             banner="\nRemoving dead ends.";
+         }else{
+             banner="\nRemoving error bubbles.";
+         }
+         outstream.println(banner);
+     }
+
+     final long kmersRemoved=shave(shave, rinse);
+     t.stop();
+
+     if(print){
+         outstream.println("Kmers removed: \t"+kmersRemoved);
+         outstream.println("Removal time: \t"+t);
+     }
+
+     t.start();
+     return kmersRemoved;
+ }
+
+ /**
+  * Performs the removal requested by shaveAndRinse().
+  * @param shave Remove dead ends.
+  * @param rinse Remove error bubbles.
+  * @return Count that shaveAndRinse() reports as "Kmers removed".
+  */
+ abstract long shave(boolean shave, boolean rinse);
+ /** Called by buildContigs() in contigMode, before build threads start, when useOwnership is true. */
+ abstract void initializeOwnership();
+
+ /**
+  * Build contigs (contigMode) or gather insert sizes (insertMode) using THREADS
+  * build threads, accumulate their statistics, then write the insert-size
+  * histogram and the contig file if those outputs were requested.
+  * @param mode contigMode or insertMode.
+  * @throws RuntimeException for extendMode or any other unrecognized mode.
+  */
+ private final void buildContigs(final int mode){
+
+     if(mode==contigMode){
+         allContigs=new ArrayList<Read>();
+         allInserts=null;
+
+         if(useOwnership){
+             initializeOwnership();
+         }
+
+     }else if(mode==insertMode){
+         allContigs=null;
+         allInserts=new LongList();
+     }else if(mode==extendMode){
+         throw new RuntimeException("extendMode: TODO");
+     }else{
+         throw new RuntimeException("Unknown mode "+mode);
+     }
+
+     /* Create read input stream; contigMode works from the kmer tables and needs none */
+     final ConcurrentReadInputStream[] crisa=(mode==contigMode ? null : makeCrisArray(in1, in2));
+
+     /* Create ProcessThreads */
+     ArrayList<AbstractBuildThread> alpt=new ArrayList<AbstractBuildThread>(THREADS);
+     for(int i=0; i<THREADS; i++){alpt.add(makeBuildThread(i, mode, crisa));}
+     for(AbstractBuildThread pt : alpt){pt.start();}
+
+     /* Wait for threads to die, and gather statistics */
+     for(AbstractBuildThread pt : alpt){
+         //join() returns early when interrupted, so loop until the thread has really finished.
+         while(pt.getState()!=Thread.State.TERMINATED){
+             try {
+                 pt.join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         //allContigs is null in insertMode; only collect contigs when a list exists.
+         if(allContigs!=null){
+             for(Read contig : pt.contigs){
+                 allContigs.add(contig);
+                 contigsBuilt++;
+                 basesBuilt+=contig.length();
+                 longestContig=Tools.max(longestContig, contig.length());
+             }
+         }
+         if(allInserts!=null){
+             allInserts.add(pt.insertSizes);
+         }
+
+         readsIn+=pt.readsInT;
+         basesIn+=pt.basesInT;
+         lowqReads+=pt.lowqReadsT;
+         lowqBases+=pt.lowqBasesT;
+     }
+
+     /* Shut down I/O streams; capture error status */
+     if(crisa!=null){
+         for(ConcurrentReadInputStream cris : crisa){
+             errorState|=ReadWrite.closeStreams(cris);
+         }
+     }
+
+     //allInserts only exists in insertMode; an insert histogram requested in another mode
+     //previously caused a NullPointerException here, so it is now skipped.
+     if(outInsert!=null && allInserts!=null){
+         FileFormat ff=FileFormat.testOutput(outInsert, FileFormat.TEXT, 0, 0, true, overwrite, append, false);
+         TextStreamWriter tsw=new TextStreamWriter(ff);
+         tsw.start();
+         for(int i=0; i<allInserts.size; i++){
+             long count=allInserts.get(i);
+             if(count>0 || histZeros){
+                 tsw.print(i+"\t"+count+"\n");
+             }
+         }
+         errorState|=tsw.poisonAndWait();
+     }
+
+     String outContigs=out1.isEmpty() ? null : out1.get(0);
+     if(outContigs!=null){
+         FileFormat ff=FileFormat.testOutput(outContigs, FileFormat.FA, 0, 0, true, overwrite, append, false);
+         ByteStreamWriter bsw=new ByteStreamWriter(ff);
+         bsw.start();
+         if(allContigs!=null){
+             //Contigs are written in the order defined by ReadLengthComparator.
+             Collections.sort(allContigs, ReadLengthComparator.comparator);
+             for(Read r : allContigs){
+                 bsw.println(r);
+             }
+         }
+         errorState|=bsw.poisonAndWait();
+     }
+ }
+
+
+ /**
+  * Creates one worker thread for buildContigs(), which starts it and later reads
+  * its contigs, insertSizes and per-thread counters.
+  * @param i Thread index; buildContigs() passes 0 to THREADS-1.
+  * @param mode Processing mode passed through from buildContigs().
+  * @param crisa Input streams; null when mode is contigMode.
+  * @return A new, unstarted build thread.
+  */
+ abstract AbstractBuildThread makeBuildThread(int i, int mode, ConcurrentReadInputStream[] crisa);
+
+ /**
+ * Extend reads.
+ */
+ private final void extendReads(){
+
+ /* Create read input stream */
+ final ConcurrentReadInputStream[] crisa=makeCrisArray(in1, in2);
+
+ /* Create read input stream */
+ final ConcurrentReadOutputStream[] rosa=makeCrosArray(out1, out2);
+
+ /* Create ProcessThreads */
+ ArrayList<ExtendThread> alpt=new ArrayList<ExtendThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alpt.add(new ExtendThread(i, crisa, rosa));}
+ for(ExtendThread pt : alpt){pt.start();}
+
+ /* Wait for threads to die, and gather statistics */
+ for(ExtendThread pt : alpt){
+ while(pt.getState()!=Thread.State.TERMINATED){
+ try {
+ pt.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ readsIn+=pt.readsInT;
+ basesIn+=pt.basesInT;
+ lowqReads+=pt.lowqReadsT;
+ lowqBases+=pt.lowqBasesT;
+ readsExtended+=pt.readsExtendedT;
+ basesExtended+=pt.basesExtendedT;
+ readsCorrected+=pt.readsCorrectedT;
+ basesCorrectedPincer+=pt.basesCorrectedPincerT;
+ basesCorrectedTail+=pt.basesCorrectedTailT;
+ readsFullyCorrected+=pt.readsFullyCorrectedT;
+ readsDetected+=pt.readsDetectedT;
+ basesDetected+=pt.basesDetectedT;
+ readsMarked+=pt.readsMarkedT;
+ basesMarked+=pt.basesMarkedT;
+ }
+
+ /* Shut down I/O streams; capture error status */
+ for(ConcurrentReadInputStream cris : crisa){
+ errorState|=ReadWrite.closeStreams(cris);
+ }
+ /* Shut down I/O streams; capture error status */
+ if(rosa!=null){
+ for(ConcurrentReadOutputStream ros : rosa){
+ errorState|=ReadWrite.closeStream(ros);
+ }
+ }
+ }
+
+ /**
+  * Builds one (unstarted) read input stream per entry of list1, pairing it with
+  * the entry at the same index of list2 when one exists.
+  * @param list1 Primary input paths.
+  * @param list2 Mate input paths; may be shorter than list1.
+  * @return Array with list1.size() streams.
+  */
+ private final ConcurrentReadInputStream[] makeCrisArray(ArrayList<String> list1, ArrayList<String> list2){
+     final int count=list1.size();
+     final ConcurrentReadInputStream[] streams=new ConcurrentReadInputStream[count];
+
+     for(int idx=0; idx<count; idx++){
+         final String path1=list1.get(idx);
+         final String path2=(idx<list2.size() ? list2.get(idx) : null);
+         if(verbose){System.err.println("Creating cris for "+path1);}
+
+         final FileFormat ff1=FileFormat.testInput(path1, FileFormat.FASTA, null, true, true);
+         final FileFormat ff2;
+         if(path2==null){
+             ff2=null;
+         }else{
+             ff2=FileFormat.testInput(path2, FileFormat.FASTA, null, true, true);
+         }
+         streams[idx]=ConcurrentReadInputStream.getReadInputStream(maxReads, false, ff1, ff2);
+     }
+     return streams;
+ }
+
+ /**
+  * Builds one (unstarted) read output stream per entry of list1, pairing it with
+  * the entry at the same index of list2 when one exists.
+  * @param list1 Primary output paths.
+  * @param list2 Mate output paths; may be shorter than list1.
+  * @return Array with list1.size() streams (empty, not null, when list1 is empty).
+  */
+ private final ConcurrentReadOutputStream[] makeCrosArray(ArrayList<String> list1, ArrayList<String> list2){
+     final ConcurrentReadOutputStream[] array;
+
+     //The buffer size does not depend on the file, so compute it once; ordered output gets a larger buffer.
+     final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads()));
+
+     array=new ConcurrentReadOutputStream[list1.size()];
+     for(int i=0; i<list1.size(); i++){
+         String a=list1.get(i);
+         String b=(list2.size()>i ? list2.get(i): null);
+         //This is an output stream; the message previously said "cris" (copied from makeCrisArray).
+         if(verbose){System.err.println("Creating cros for "+a);}
+
+         FileFormat ff1=FileFormat.testOutput(a, FileFormat.FASTQ, null, true, overwrite, append, ordered);
+         FileFormat ff2=(b==null ? null : FileFormat.testOutput(b, FileFormat.FASTQ, null, true, overwrite, append, ordered));
+         array[i]=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, false);
+     }
+     return array;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ExtendThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Extends reads.
+ */
+ private final class ExtendThread extends Thread{
+
+ /**
+ * Constructor
+ * @param cris_ Read input stream
+ */
+ public ExtendThread(int id_, ConcurrentReadInputStream[] crisa_, ConcurrentReadOutputStream[] rosa_){
+ id=id_;
+ crisa=crisa_;
+ rosa=rosa_;
+ leftCounts=extendThroughLeftJunctions ? null : new int[4];
+ }
+
+ @Override
+ public void run(){
+ for(int i=0; i<crisa.length; i++){
+ ConcurrentReadInputStream cris=crisa[i];
+ ConcurrentReadOutputStream ros=(rosa!=null && rosa.length>i ? rosa[i] : null);
+ synchronized(crisa){
+ if(!cris.started()){
+ cris.start();
+ }
+ }
+ if(ros!=null){
+ synchronized(rosa){
+ if(!ros.started()){
+ ros.start();
+ }
+ }
+ }
+ run(cris, ros);
+ }
+ }
+
+ private void run(ConcurrentReadInputStream cris, ConcurrentReadOutputStream ros){
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //While there are more reads lists...
+ while(reads!=null && reads.size()>0){
+
+ final ArrayList<Read> listOut=new ArrayList<Read>(reads.size());
+
+ //For each read (or pair) in the list...
+ for(int i=0; i<reads.size(); i++){
+ final Read r1=reads.get(i);
+ final Read r2=r1.mate;
+
+ processReadPair(r1, r2);
+ listOut.add(r1);
+ }
+ if(ros!=null){ros.add(listOut, ln.id);}
+
+ //Fetch a new read list
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ cris.returnList(ln.id, ln.list.isEmpty());
+ }
+
+ private void processReadPair(Read r1, Read r2){
+ if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+ readsInT++;
+ basesInT+=r1.length();
+ if(r2!=null){
+ readsInT++;
+ basesInT+=r2.length();
+ }
+
+ if(ecco && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){BBMerge.findOverlapStrict(r1, r2, true);}
+
+ processRead(r1);
+ processRead(r2);
+ }
+
+ private void processRead(Read r){
+ if(r==null){return;}
+ if(!r.validated()){r.validate(true);}
+ if(r.discarded()){
+ lowqBasesT+=r.length();
+ lowqReadsT++;
+ return;
+ }
+ if(ecc || MARK_BAD_BASES>0){
+ final int corrected=errorCorrect(r, leftCounts, rightCounts, kmerList, countList, builderT, detectedArrayT, bitsetT, kmerT);
+ final int detected=detectedArrayT[0];
+ final int correctedPincer=detectedArrayT[1];
+ final int correctedTail=detectedArrayT[2];
+ final int marked=detectedArrayT[3];
+ assert(corrected==correctedPincer+correctedTail) : corrected+", "+Arrays.toString(detectedArrayT);
+ if(detected>0){
+ readsDetectedT++;
+ basesDetectedT+=detected;
+ if(corrected>0){
+ readsCorrectedT++;
+ basesCorrectedPincerT+=correctedPincer;
+ basesCorrectedTailT+=correctedTail;
+ }
+ if(corrected==detected){
+ readsFullyCorrectedT++;
+ }
+ }
+ if(marked>0){
+ readsMarkedT++;
+ basesMarkedT+=marked;
+ }
+ }
+ int extension=0;
+ if(extendRight>0){
+ extension+=extendRead(r, builderT, leftCounts, rightCounts, extendRight, kmerT);
+ }
+ if(extendLeft>0){
+ r.reverseComplement();
+ extension+=extendRead(r, builderT, leftCounts, rightCounts, extendLeft, kmerT);
+ r.reverseComplement();
+ }
+ basesExtendedT+=extension;
+ readsExtendedT+=(extension>0 ? 1 : 0);
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Input read stream */
+ private final ConcurrentReadInputStream[] crisa;
+ private final ConcurrentReadOutputStream[] rosa;
+
+ private final int[] leftCounts;
+ private final int[] rightCounts=new int[4];
+ private final int[] detectedArrayT=new int[4];
+ private final ByteBuilder builderT=new ByteBuilder();
+ private final Kmer kmerT=new Kmer(kbig);
+ private final BitSet bitsetT=new BitSet(300);
+ private final LongList kmerList=new LongList();
+ private final IntList countList=new IntList();
+
+ private long readsInT=0;
+ private long basesInT=0;
+ private long lowqReadsT=0;
+ private long lowqBasesT=0;
+ private long readsExtendedT=0;
+ private long basesExtendedT=0;
+ private long readsCorrectedT=0;
+ private long basesCorrectedTailT=0;
+ private long basesCorrectedPincerT=0;
+ private long readsFullyCorrectedT=0;
+ private long readsDetectedT=0;
+ private long basesDetectedT=0;
+ private long readsMarkedT=0;
+ private long basesMarkedT=0;
+
+ private final int id;
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Extension Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Extends a read. Same parameters as the overload below, without a caller-supplied Kmer.
+  * @return Presumably the number of bases added, as for the overload below - TODO confirm.
+  */
+ public abstract int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance);
+ /**
+  * Extends a read. ExtendThread calls this with extendRight (and, on the
+  * reverse-complemented read, extendLeft) as the distance and sums the return
+  * values into its bases-extended counter.
+  * @param r Read to extend
+  * @param bb Scratch buffer
+  * @param leftCounts Scratch array; ExtendThread passes null when extendThroughLeftJunctions is set
+  * @param rightCounts Scratch array of length 4 in ExtendThread
+  * @param distance Extension limit - presumably a maximum number of bases; TODO confirm
+  * @param kmer Scratch Kmer
+  * @return Number of bases added, as interpreted by ExtendThread
+  */
+ public abstract int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance, final Kmer kmer);
+// {
+// throw new RuntimeException("Must be overridden.");
+// }
+
+ /**
+  * Extend these bases to the right by at most 'distance'.
+  * Stops at right junctions only.
+  * Does not claim ownership.
+  * @param bb Bases to extend
+  * @param leftCounts Scratch array
+  * @param rightCounts Scratch array
+  * @param distance Maximum extension
+  * @param includeJunctionBase Presumably whether the base at a junction is kept - TODO confirm in subclass
+  * @return Subclass-defined result - TODO confirm
+  */
+ public abstract int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase);
+
+ /**
+  * Extend these bases to the right by at most 'distance'.
+  * Stops at right junctions only.
+  * Does not claim ownership.
+  * Variant taking a caller-supplied Kmer. This base implementation always
+  * throws; subclasses that support it must override it.
+  * @throws RuntimeException always, unless overridden
+  */
+ public int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase, Kmer kmer){
+ throw new RuntimeException("Must be overridden.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Error Correction ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Error-corrects a read; convenience form without caller-supplied scratch objects. */
+ public abstract int errorCorrect(Read r);
+
+ /**
+  * Error-corrects a read using caller-supplied scratch objects.
+  * As consumed by ExtendThread, detectedArray is filled with
+  * [0]=errors detected, [1]=corrected by pincer, [2]=corrected by tail, [3]=bases marked,
+  * and the return value is asserted to equal detectedArray[1]+detectedArray[2].
+  * @return Number of bases corrected
+  */
+ public abstract int errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts,
+ final ByteBuilder bb, final int[] detectedArray, final BitSet bs, Kmer kmer);
+
+ /**
+  * Changes to N any base covered strictly by kmers with count below minCount.
+  * A base counts as covered when it lies inside at least one kmer whose count is
+  * at least minCount. Quality scores of marked bases are set to 0.
+  * @param bases Read bases; modified in place
+  * @param quals Read qualities; may be null
+  * @param counts Count of the kmer starting at each position; size must be bases.length-kbig+1
+  * @param bs Scratch bitset; cleared and reused to hold the covered positions
+  * @param minCount Minimum count for a kmer to be trusted
+  * @param deltaOnly If true, an uncovered base is only marked when it is next to a covered
+  * base or is at least the kbig-th consecutive uncovered base
+  * @return Number of bases changed to N
+  */
+ public final int markBadBases(final byte[] bases, final byte[] quals, final IntList counts, final BitSet bs, final int minCount, boolean deltaOnly){
+     if(counts.size<1){return 0;}
+
+     bs.clear();
+     assert(counts.size==bases.length-kbig+1) : counts.size+", "+bases.length;
+
+     //Flag every position covered by a trusted kmer. After a trusted kmer the next
+     //kbig-1 kmers are skipped, since the bases they start on are already flagged.
+     int kmerPos=0;
+     while(kmerPos<counts.size){
+         if(counts.get(kmerPos)>=minCount){
+             bs.set(kmerPos, kmerPos+kbig);
+             kmerPos+=kbig;
+         }else{
+             kmerPos++;
+         }
+     }
+
+     //The skipping above can jump past the final kmer, so always test it explicitly.
+     final int lastKmer=counts.size-1;
+     if(counts.get(lastKmer)>=minCount){
+         bs.set(lastKmer, lastKmer+kbig);
+     }
+
+     final int covered=bs.cardinality();
+     assert(covered<=bases.length);
+
+     int marked=0;
+     int uncoveredRun=0;
+     for(int pos=0; pos<bases.length; pos++){
+         if(bs.get(pos)){
+             uncoveredRun=0;
+             continue;
+         }
+
+         uncoveredRun++;
+         boolean mark=(bases[pos]!='N');
+         if(mark && deltaOnly){
+             mark=(uncoveredRun>=kbig) || bs.get(pos+1) || (pos>0 && bs.get(pos-1));
+         }
+         if(mark){
+             marked++;
+             bases[pos]='N';
+             if(quals!=null){
+                 quals[pos]=0;
+             }
+         }
+         //An N (pre-existing or just written) restarts the run of uncovered bases.
+         if(bases[pos]=='N'){uncoveredRun=0;}
+     }
+
+     return marked;
+ }
+
+ protected final boolean isSimilar(final int a, int loc1, int loc2, final IntList counts){
+ loc1=Tools.max(loc1, 0);
+ loc2=Tools.min(loc2, counts.size-1);
+ for(int i=loc1; i<=loc2; i++){
+ if(!isSimilar(a, counts.get(i))){return false;}
+ }
+ return true;
+ }
+
+ /**
+  * Two counts are similar when their difference is below pathSimilarityConstant
+  * or below pathSimilarityFraction times the larger count.
+  */
+ protected final boolean isSimilar(final int a, final int b){
+     final int larger=Tools.max(a, b);
+     final int smaller=Tools.min(a, b);
+     final int gap=larger-smaller;
+     assert(gap>=0);
+     if(gap<pathSimilarityConstant){
+         return true;
+     }
+     return gap<larger*pathSimilarityFraction;
+ }
+
+ protected final boolean isError(final int a, int loc1, int loc2, final IntList counts){
+ loc1=Tools.max(loc1, 0);
+ loc2=Tools.min(loc2, counts.size-1);
+ for(int i=loc1; i<=loc2; i++){
+ if(!isError(a, counts.get(i))){return false;}
+ }
+ return true;
+ }
+
+ /**
+  * Decides whether a low count is an error relative to a high count: either the high
+  * count exceeds low*errorMult1, or low is at most errorLowerConst while high is at
+  * least the larger of minCountCorrect and low*errorMult2.
+  */
+ protected final boolean isError(final int high, final int low){
+     if(low*errorMult1<high){
+         return true;
+     }
+     return low<=errorLowerConst && high>=Tools.max(minCountCorrect, low*errorMult2);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Loads the kmer starting at bases[loc] into the supplied Kmer object.
+  * @param bases Source bases
+  * @param loc Start position of the kmer
+  * @param kmer Kmer to clear and fill; its kbig determines how many bases are read
+  * @return The filled kmer, or null if a base maps to a negative number in AminoAcid.baseToNumber
+  */
+ protected final Kmer getKmer(byte[] bases, int loc, Kmer kmer){
+     kmer.clear();
+     final int stop=loc+kmer.kbig;
+     int pos=loc;
+     while(pos<stop){
+         final int code=AminoAcid.baseToNumber[bases[pos]];
+         if(code<0){
+             return null;
+         }
+         kmer.addRightNumeric(code);
+         pos++;
+     }
+     assert(kmer.len==kmer.kbig);
+     return kmer;
+ }
+
+ /** True if either the right-side or the left-side (max, second) count pair indicates a junction. */
+ protected final boolean isJunction(int rightMax, int rightSecond, int leftMax, int leftSecond){
+     return isJunction(rightMax, rightSecond) || isJunction(leftMax, leftSecond);
+ }
+
+ /**
+  * Decides whether the second-highest count is high enough, relative to the
+  * highest, to treat the position as a junction.
+  * @param max Highest count
+  * @param second Second-highest count
+  * @return false when second is below 1, is dwarfed by max (branchMult1), or is small
+  * (at most branchLowerConst) while max is sufficiently large; true otherwise
+  */
+ private final boolean isJunction(int max, int second){
+     if(second<1){
+         return false;
+     }
+     if(second*branchMult1<max){
+         return false;
+     }
+     if(second<=branchLowerConst && max>=Tools.max(minCountExtend, second*branchMult2)){
+         return false;
+     }
+     if(verbose){
+         outstream.println("Breaking because second-highest was too high:\n" +
+                 "max="+max+", second="+second+", branchMult1="+branchMult1+"\n" +
+                 "branchLowerConst="+branchLowerConst+", minCountExtend="+minCountExtend+", branchMult2="+branchMult2+"\n" +
+                 (second*branchMult1<max)+", "+(second<=branchLowerConst)+", "+(max>=Tools.max(minCountExtend, second*branchMult2)));
+     }
+     return true;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Thread wrapper that lets process() run dumpKmersAsText() in the background. */
+ private final class DumpKmersThread extends Thread {
+
+     DumpKmersThread(){
+         super();
+     }
+
+     /** Delegates to the enclosing object's dumpKmersAsText(). */
+     @Override
+     public void run(){
+         dumpKmersAsText();
+     }
+
+ }
+
+ /** Thread wrapper that lets process() run makeKhist() in the background. */
+ private final class MakeKhistThread extends Thread {
+
+     MakeKhistThread(){
+         super();
+     }
+
+     /** Delegates to the enclosing object's makeKhist(). */
+     @Override
+     public void run(){
+         makeKhist();
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ abstract AbstractKmerTableSet tables();
+
+// int ways; //MUST be set by subclass
+ /** Big kmer length */
+ final int kbig;
+
+ private ArrayList<Read> allContigs;
+ private LongList allInserts;
+ private long contigsBuilt=0;
+ private long basesBuilt=0;
+ private long longestContig=0;
+
+ protected boolean extendThroughLeftJunctions=true;
+
+ private boolean removeBubbles=false;
+ private boolean removeDeadEnds=false;
+ /** Upper bound on shave passes: Tadpole1.shave loops at most this many times and also passes the value to Shaver.shave */
+ protected int maxShaveDepth=1;
+ /** Tadpole1.shave passes the larger of this and minContigLen to Shaver.shave */
+ protected int shaveDiscardLen=150;
+ /** Passed through to Shaver.shave by Tadpole1.shave */
+ protected int shaveExploreDist=100;
+
+ protected int kmerRangeMin=0;
+ protected int kmerRangeMax=Integer.MAX_VALUE;
+
+ /** Holds one of the *Mode constants declared in the static section; defaults to contigMode */
+ protected int processingMode=contigMode;
+
+ private int extendLeft=-1;
+ /** Rightward extension allowance; in Tadpole1.extendToRight a negative value means only maxContigLen limits the length */
+ protected int extendRight=-1;
+
+ /** Track kmer ownership */
+ public final boolean useOwnership;
+
+ /** Cap on the builder length while extending (Tadpole1.extendToRight) */
+ public int maxContigLen=1000000000;
+ /** Tadpole1's makeContig methods only return a contig that is at least this much longer than its seed */
+ public int minExtension=2;
+ /** Tadpole1's makeContig methods only return a contig of at least this length */
+ public int minContigLen=100;
+ /** Tadpole1's processKmer discards contigs whose calculated coverage is below this */
+ public float minCoverage=1;
+
+ /** When positive, passed as both arguments of ByteBuilder.trimByAmount for kmer-seeded contigs */
+ int trimEnds=0;
+
+ /** Tadpole1.extendToRight reports BAD_SEED when the trailing kmer's count is below this */
+ int minCountSeed=3;
+
+ /** Tadpole1.extendToRight reports DEAD_END when the best next-base count is below this */
+ protected int minCountExtend=2;
+ protected float branchMult1=20;
+ protected float branchMult2=3;
+ private int branchLowerConst=3;
+
+ private float errorMult1=60;
+ private float errorMult2=3;
+ private int errorLowerConst=3;//3 seems fine
+ private int minCountCorrect=4;//5 is more conservative...
+ private int pathSimilarityConstant=3;
+ private float pathSimilarityFraction=0.3f;//0.3
+ protected int errorExtensionPincer=5;//default 5; higher is more conservative
+ protected int errorExtensionTail=9;//default 9; higher is more conservative
+
+ public boolean showStats=true;
+
+ /** Has this class encountered errors while processing? */
+ public boolean errorState=false;
+
+ /** Input reads to extend */
+ private ArrayList<String> in1=new ArrayList<String>(), in2=new ArrayList<String>();
+ /** Output extended reads */
+ private ArrayList<String> out1=new ArrayList<String>(), out2=new ArrayList<String>();
+
+
+// /** Contig output file */
+// private String outContigs=null;
+ /** Insert size histogram */
+ private String outInsert=null;
+ /** Kmer count output file */
+ protected String outKmers=null;
+ /** Histogram output file */
+ protected String outHist=null;
+
+ /** Histogram columns */
+ protected int histColumns=2;
+ /** Histogram rows */
+ protected int histMax=100000;
+ /** Print a histogram header */
+ protected boolean histHeader=true;
+ /** Histogram show rows with 0 count */
+ protected boolean histZeros=false;
+
+ protected boolean smoothHist=false;
+
+ /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */
+ private long maxReads=-1;
+
+ //Running totals; presumably aggregated from the per-thread counters (e.g. readsInT) - TODO confirm
+ long readsIn=0;
+ long basesIn=0;
+ long readsOut=0;
+ long basesOut=0;
+ long lowqReads=0;
+ long lowqBases=0;
+ long basesExtended=0;
+ long readsExtended=0;
+ long readsCorrected=0;
+ long basesCorrectedTail=0;
+ long basesCorrectedPincer=0;
+ long readsFullyCorrected=0;
+ long readsDetected=0;
+ long basesDetected=0;
+ long readsMarked=0;
+ long basesMarked=0;
+
+ protected boolean ECC_PINCER=true;
+ protected boolean ECC_TAIL=true;
+ protected boolean ECC_ALL=false;
+ /** Mark bases as bad if they are completely covered by kmers with a count below this */
+ protected int MARK_BAD_BASES=0;
+ /** Only mark bad bases that are adjacent to good bases */
+ protected boolean MARK_DELTA_ONLY=true;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ThreadLocal Temps ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Populates every ThreadLocal scratch holder for the calling thread.
+  * A non-null value in localLeftCounts is taken to mean the thread was already set up,
+  * in which case nothing is replaced.
+  */
+ protected final void initializeThreadLocals(){
+     final boolean alreadyInitialized=(localLeftCounts.get()!=null);
+     if(!alreadyInitialized){
+         //Two 4-slot count arrays
+         localLeftCounts.set(new int[4]);
+         localRightCounts.set(new int[4]);
+         //Reusable collections and builders
+         localLongList.set(new LongList());
+         localIntList.set(new IntList());
+         localByteBuilder.set(new ByteBuilder());
+         localBitSet.set(new BitSet(300));
+         localKmer.set(new Kmer(kbig));
+     }
+ }
+
+ //Per-thread scratch objects; each holder is filled by initializeThreadLocals() for the calling thread.
+ /** Length-4 int array (allocated as new int[4]) */
+ protected ThreadLocal<int[]> localLeftCounts=new ThreadLocal<int[]>();
+ /** Length-4 int array (allocated as new int[4]) */
+ protected ThreadLocal<int[]> localRightCounts=new ThreadLocal<int[]>();
+ protected ThreadLocal<LongList> localLongList=new ThreadLocal<LongList>();
+ protected ThreadLocal<IntList> localIntList=new ThreadLocal<IntList>();
+ protected ThreadLocal<ByteBuilder> localByteBuilder=new ThreadLocal<ByteBuilder>();
+ /** BitSet created with an initial size of 300 */
+ protected ThreadLocal<BitSet> localBitSet=new ThreadLocal<BitSet>();
+ /** Kmer constructed with kbig */
+ protected ThreadLocal<Kmer> localKmer=new ThreadLocal<Kmer>();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Minimum kmer count to dump to text */
+ protected int minToDump=1;
+
+ /** Correct via kmers */
+ private final boolean ecc;
+
+ /** Correct via overlap */
+ final boolean ecco;
+
+ /** True iff java was launched with the '-ea' flag */
+ private final boolean EA;
+
+ /** For numbering contigs */
+ final AtomicLong contigNum=new AtomicLong(0);
+
+ /** Number of seeding passes; Tadpole1's BuildThread.run iterates pass indices contigPasses-1 down to 0 */
+ int contigPasses=16;
+ /** Base of the per-pass growth of the seed threshold (used via Math.pow(contigPassMult, pass) in BuildThread.run) */
+ double contigPassMult=1.7;
+
+ /** For controlling access to tables for contig-building; indexed by pass number in BuildThread.run */
+ final AtomicInteger nextTable[];
+
+ /** For controlling access to victim buffers for contig-building; indexed by pass number in BuildThread.run */
+ final AtomicInteger nextVictims[];
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print messages to this stream */
+ protected static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Force output reads to stay in input order */
+ public static boolean ordered=false;
+ /** Print speed statistics upon completion */
+ public static boolean showSpeed=true;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Verbose messages */
+ public static boolean verbose=false;
+ /** Debugging verbose messages */
+ public static boolean verbose2=false;
+ /** Number of ProcessThreads */
+ public static int THREADS=Shared.threads();
+ /** Do garbage collection prior to printing memory usage */
+ private static final boolean GC_BEFORE_PRINT_MEMORY=false;
+
+ /** When true, Tadpole1's kmer-seeded makeContig tolerates a BAD_OWNER status instead of abandoning the contig */
+ static boolean IGNORE_BAD_OWNER=false;
+
+ //Processing modes; compared against processingMode (and the build threads' mode).
+ public static final int contigMode=0;
+ public static final int extendMode=1;
+ public static final int correctMode=2;
+ public static final int insertMode=3;
+
+ /** Explore codes */
+ public static final int KEEP_GOING=0, DEAD_END=1, TOO_SHORT=2, TOO_LONG=3, TOO_DEEP=4, FORWARD_BRANCH=5, BACKWARD_BRANCH=6, LOOP=7;
+
+ /** Extend codes */
+ public static final int BAD_OWNER=11, BAD_SEED=12, BRANCH=13;
+
+ public static final int STATUS_UNEXPLORED=0, STATUS_EXPLORED=1, STATUS_REMOVE=2, STATUS_KEEP=3;
+
+}
diff --git a/current/assemble/Tadpole1.java b/current/assemble/Tadpole1.java
new file mode 100755
index 0000000..ee5009f
--- /dev/null
+++ b/current/assemble/Tadpole1.java
@@ -0,0 +1,1314 @@
+package assemble;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import jgi.BBMerge;
+import kmer.AbstractKmerTable;
+import kmer.HashArray1D;
+import kmer.HashForest;
+import kmer.KmerNode;
+import kmer.KmerTableSet;
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+import ukmer.Kmer;
+import align2.IntList;
+import align2.ListNum;
+import align2.LongList;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+
+
+/**
+ * Short-kmer assembler based on KmerCountExact.
+ * @author Brian Bushnell
+ * @date May 15, 2015
+ *
+ */
+public class Tadpole1 extends Tadpole {
+
+ /**
+  * Command-line entry point: parses config and help flags, constructs a Tadpole1,
+  * reports how long construction took, then runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+
+     final String[] parsedArgs=Parser.parseConfig(args);
+     final boolean helpRequested=Parser.parseHelp(parsedArgs, true);
+     if(helpRequested){
+         printOptions();
+         System.exit(0);
+     }
+
+     //Both timers are created before either is started; the overall timer starts first.
+     final Timer totalTimer=new Timer();
+     final Timer initTimer=new Timer();
+     totalTimer.start();
+     initTimer.start();
+
+     //Construct the assembler and report construction time
+     final Tadpole1 assembler=new Tadpole1(parsedArgs, true);
+     initTimer.stop();
+     outstream.println("Initialization Time: \t"+initTimer);
+
+     //Run it, handing over the overall timer
+     assembler.process(totalTimer);
+ }
+
+ /**
+  * Constructor. Delegates argument handling to the superclass, then creates the
+  * KmerTableSet with a bytes-per-kmer figure derived from the active options.
+  * @param args Command line arguments
+  * @param setDefaults Forwarded unchanged to the superclass constructor
+  */
+ public Tadpole1(String[] args, boolean setDefaults){
+     super(args, setDefaults);
+
+     //Start from 12; ownership tracking adds 4; contig and extend modes add 1 (correct mode adds nothing).
+     int perKmer=12;
+     if(useOwnership){perKmer+=4;}
+     if(processingMode==contigMode || processingMode==extendMode){perKmer+=1;}
+     final int bytesPerKmer=perKmer;
+
+     tables=new KmerTableSet(args, bytesPerKmer);
+     k=tables.k;
+     k2=tables.k2;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Delegates to {@code tables.initializeOwnership()}. */
+ @Override
+ void initializeOwnership(){
+ tables.initializeOwnership();
+ }
+
+ /**
+  * Runs up to maxShaveDepth shaving passes with a Shaver1 and totals what was removed.
+  * Stops early after a pass that removed fewer than 100, or once the pass index exceeds 2.
+  * @param shave Forwarded to Shaver.shave
+  * @param rinse Forwarded to Shaver.shave
+  * @return Sum of the values returned by the individual Shaver.shave calls
+  */
+ @Override
+ long shave(boolean shave, boolean rinse){
+     final Shaver shaver=new Shaver1(tables, THREADS);
+     final int discardLen=Tools.max(minContigLen, shaveDiscardLen);
+     long totalRemoved=0;
+
+     for(int pass=0; pass<maxShaveDepth; pass++){
+         final int passNumber=pass+1;
+         outstream.println("\nShave("+1+", "+maxShaveDepth+", "+passNumber+")");
+         final long removedThisPass=shaver.shave(1, maxShaveDepth, passNumber, discardLen, shaveExploreDist, shave, rinse);
+         totalRemoved+=removedThisPass;
+         final boolean done=(removedThisPass<100 || pass>2);
+         if(done){break;}
+     }
+
+     System.err.println();
+     return totalRemoved;
+ }
+
+ /**
+  * Runs {@code tables.process(t)} and reports how many kmers were loaded.
+  * @param t Timer handed to the table set
+  * @return The table set's kmersLoaded value after processing
+  */
+ @Override
+ public long loadKmers(Timer t){
+ tables.process(t);
+ return tables.kmersLoaded;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ //Thin forwarding wrappers: each method below delegates directly to AminoAcid, AbstractKmerTable, or the 'tables' field.
+ /** Reverse-complement of a k-length kmer, via AminoAcid.reverseComplementBinaryFast */
+ private final long rcomp(long kmer){return AminoAcid.reverseComplementBinaryFast(kmer, k);}
+ /** Table key for a kmer/reverse-complement pair (tables.toValue) */
+ private final long toValue(long kmer, long rkmer){return tables.toValue(kmer, rkmer);}
+ public final int getCount(long kmer, long rkmer){return tables.getCount(kmer, rkmer);}
+ //Ownership claims
+ private final boolean claim(long kmer, int id){return claim(kmer, rcomp(kmer), id);}
+ private final boolean claim(long kmer, long rkmer, int id){return tables.claim(kmer, rkmer, id);}
+ private final boolean doubleClaim(ByteBuilder bb, int id/*, long rid*/){return tables.doubleClaim(bb, id/*, rid*/);}
+ private final boolean claim(ByteBuilder bb, int id, /*long rid, */boolean earlyExit){return tables.claim(bb, id/*, rid*/, earlyExit);}
+ private final boolean claim(byte[] array, int len, int id, /*long rid, */boolean earlyExit){return tables.claim(array, len, id/*, rid*/, earlyExit);}
+ //Ownership queries
+ private final int findOwner(long kmer){return tables.findOwner(kmer);}
+ private final int findOwner(ByteBuilder bb, int id){return tables.findOwner(bb, id);}
+ private final int findOwner(byte[] array, int len, int id){return tables.findOwner(array, len, id);}
+ //Ownership release
+ private final void release(long key, int id){tables.release(key, id);}
+ private final void release(ByteBuilder bb, int id){tables.release(bb, id);}
+ private final void release(byte[] array, int len, int id){tables.release(array, len, id);}
+ //Neighbor counts; callers use the return value as an index into 'counts'
+ private final int fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){return tables.fillRightCounts(kmer, rkmer, counts, mask, shift2);}
+ private final int fillLeftCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){return tables.fillLeftCounts(kmer, rkmer, counts, mask, shift2);}
+ /** Text form of a k-length kmer (AbstractKmerTable.toText) */
+ private final StringBuilder toText(long kmer){return AbstractKmerTable.toText(kmer, k);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- BuildThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Factory for this class's BuildThread.
+  * @param id Forwarded to the BuildThread constructor
+  * @param mode Forwarded to the BuildThread constructor
+  * @param crisa Forwarded to the BuildThread constructor
+  * @return A new BuildThread (not started here)
+  */
+ @Override
+ BuildThread makeBuildThread(int id, int mode, ConcurrentReadInputStream[] crisa){
+ return new BuildThread(id, mode, crisa);
+ }
+
+ /**
+ * Builds contigs.
+ */
+ private class BuildThread extends AbstractBuildThread{
+
+ /** Passes all three arguments straight to the AbstractBuildThread constructor. */
+ public BuildThread(int id_, int mode_, ConcurrentReadInputStream[] crisa_){
+ super(id_, mode_, crisa_);
+ }
+
+ /**
+  * Thread body. With input streams present, each stream is started (once, under a lock on
+  * the stream array) and consumed via run(cris). Without streams, contigs are seeded from
+  * the kmer tables in contigPasses passes with a decreasing seed-count threshold, ending
+  * with a pass at minCountSeed. Thread 0 prints the thresholds to System.err.
+  */
+ @Override
+ public void run(){
+     final boolean haveStreams=(crisa!=null && crisa.length!=0);
+     if(haveStreams){
+         //Extend reads
+         for(ConcurrentReadInputStream cris : crisa){
+             synchronized(crisa){
+                 if(!cris.started()){cris.start();}
+             }
+             run(cris);
+         }
+         return;
+     }
+
+     //Build from kmers
+     final boolean reporter=(id==0);
+     if(reporter){System.err.print("Seeding with min count = ");}
+     String separator="";
+     for(int pass=contigPasses-1; pass>0; pass--){
+         final long scaled=(long)Math.floor((minCountSeed)*Math.pow(contigPassMult, pass)*0.92-0.25);
+         minCountSeedCurrent=(int)Tools.min(Integer.MAX_VALUE, Tools.max(minCountSeed+pass, scaled));
+         if(reporter){
+             System.err.print(separator+minCountSeedCurrent);
+             separator=", ";
+         }
+         while(processNextTable(nextTable[pass])){}
+         while(processNextVictims(nextVictims[pass])){}
+     }
+
+     //Final pass
+     minCountSeedCurrent=minCountSeed;
+     if(reporter){System.err.println(separator+minCountSeedCurrent);}
+     while(processNextTable(nextTable[0])){}
+     while(processNextVictims(nextVictims[0])){}
+ }
+
+ /**
+  * Takes the next table index from the shared counter and runs processCell on every cell of that table.
+  * @param aint Shared counter handing out table indices
+  * @return false when the index taken is at or past tables.ways, true after a table was processed
+  */
+ private boolean processNextTable(AtomicInteger aint){
+     final int tableIndex=aint.getAndAdd(1);
+     if(tableIndex<tables.ways){
+         final HashArray1D table=tables.getTable(tableIndex);
+         if(verbose && id==0){System.err.println("Processing table "+tableIndex+", size "+table.size());}
+         final int cellCount=table.arrayLength();
+         int cell=0;
+         while(cell<cellCount){
+             processCell(table, cell);
+             cell++;
+         }
+         return true;
+     }
+     return false;
+ }
+
+ /**
+  * Takes the next table index from the shared counter and walks every tree rooted in that table's victim forest.
+  * @param aint Shared counter handing out table indices
+  * @return false when the index taken is at or past tables.ways, true after a forest was processed
+  */
+ private boolean processNextVictims(AtomicInteger aint){
+     final int tableIndex=aint.getAndAdd(1);
+     if(tableIndex<tables.ways){
+         final HashForest forest=tables.getTable(tableIndex).victims();
+         if(verbose && id==0){System.err.println("Processing forest "+tableIndex+", size "+forest.size());}
+         final int slots=forest.arrayLength();
+         int slot=0;
+         while(slot<slots){
+             final KmerNode root=forest.getNode(slot);
+             traverseKmerNode(root);
+             slot++;
+         }
+         return true;
+     }
+     return false;
+ }
+
+ private int processCell(HashArray1D table, int cell){
+ int count=table.readCellValue(cell);
+ if(count<minCountSeedCurrent){return 0;}
+
+ long key=table.getKmer(cell);
+
+ if(verbose){outstream.println("id="+id+" processing cell "+cell+"; \tkmer="+key+"\t"+toText(key));}
+ if(useOwnership){
+ int owner=table.getCellOwner(cell);
+ if(verbose){outstream.println("Owner is initially "+owner);}
+ if(owner>-1){return 0;}
+ owner=table.setOwner(key, id, cell);
+ if(verbose){outstream.println("Owner is now "+owner);}
+ if(owner!=id){return 0;}
+ }
+ return processKmer(key);
+ }
+
+ /**
+  * Recursively visits a node, then its left subtree, then its right subtree,
+  * calling processKmerNode on each node.
+  * @param kn Root of the subtree; may be null
+  * @return Sum of the processKmerNode results over the subtree (0 for null)
+  */
+ private int traverseKmerNode(KmerNode kn){
+     if(kn==null){return 0;}
+
+     int total=processKmerNode(kn);
+     if(kn.left()!=null){total+=traverseKmerNode(kn.left());}
+     if(kn.right()!=null){total+=traverseKmerNode(kn.right());}
+     return total;
+ }
+
+ /**
+  * Tries to seed a contig from the pivot kmer of one node.
+  * The node is skipped when its value is below minCountSeedCurrent, or (with ownership
+  * enabled) when it already has an owner or this thread fails to become the owner.
+  * @return The result of processKmer for the pivot, or 0 if the node was skipped
+  */
+ private int processKmerNode(KmerNode kn){
+     final long seed=kn.pivot();
+     final int value=kn.getValue(seed);
+     if(value<minCountSeedCurrent){return 0;}
+
+     if(verbose){outstream.println("id="+id+" processing KmerNode; \tkmer="+seed+"\t"+toText(seed));}
+
+     if(useOwnership){
+         final int priorOwner=kn.getOwner(seed);
+         if(verbose){outstream.println("Owner is initially "+priorOwner);}
+         if(priorOwner>-1){return 0;}
+
+         final int newOwner=kn.setOwner(seed, id);
+         if(verbose){outstream.println("Owner is now "+newOwner);}
+         if(newOwner!=id){return 0;}
+     }
+     return processKmer(seed);
+ }
+
+ /**
+  * Builds a contig from an already-claimed seed kmer and, if its coverage reaches minCoverage,
+  * numbers it, names it, and appends it to this thread's contig list.
+  * @param key Seed kmer
+  * @return Length of the contig that was stored, or 0 if none was stored
+  */
+ private int processKmer(long key){
+     final byte[] contig=makeContig(key, builderT, true);
+     if(contig==null){
+         if(verbose){System.err.println("Created null contig.");}
+         return 0;
+     }
+
+     final float coverage=tables.calcCoverage(contig, contig.length);
+     if(coverage<minCoverage){return 0;}
+     if(verbose){System.err.println("Added "+contig.length);}
+
+     //The contig number is only consumed once the contig is known to be kept
+     final long num=contigNum.incrementAndGet();
+     final Read r=new Read(contig, -1, -1, -1, "*", null, num, 0);
+     final float gc=r.gc();
+     final String covText=String.format("%.1f", coverage);
+     final String gcText=String.format("%.3f", gc);
+     r.id="contig_"+num+",length="+contig.length+",cov="+covText+",gc="+gcText;
+     contigs.add(r);
+     return contig.length;
+ }
+
+ /**
+  * Consumes one input stream: fetches read lists until a null or empty list arrives,
+  * passes every read (with its mate) to processReadPair, and returns each list to the stream.
+  * The final list is returned only if it exists; the loop ends exactly when the list wrapper
+  * or its payload may be null, so an unguarded return would throw a NullPointerException.
+  * @param cris Stream to read from; expected to be started already
+  */
+ private void run(ConcurrentReadInputStream cris){
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     //While there are more reads lists...
+     while(reads!=null && reads.size()>0){
+
+         //For each read (or pair) in the list...
+         for(int i=0; i<reads.size(); i++){
+             final Read r1=reads.get(i);
+             final Read r2=r1.mate;
+
+             processReadPair(r1, r2);
+         }
+
+         //Fetch a new read list
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     //Hand back the terminating list, treating a missing payload as empty
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+ }
+
+ /**
+  * Handles one read and its optional mate.
+  * Input counters are always updated. In insertMode an insert size is measured (overlap first,
+  * then a graph walk) and recorded, and nothing else happens. Otherwise, optional
+  * overlap-based correction is applied and each non-discarded read is used as a contig seed.
+  * @param r1 First read; dereferenced unconditionally
+  * @param r2 Mate, or null
+  */
+ private void processReadPair(Read r1, Read r2){
+     if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+     readsInT++;
+     basesInT+=r1.length();
+     if(r2!=null){
+         readsInT++;
+         basesInT+=r2.length();
+     }
+
+     if(mode==insertMode){
+         int insert=BBMerge.findOverlapStrict(r1, r2, false);
+         if(insert<1){insert=findInsertSize(r1, r2, rightCounts);}
+         //Failures (negative values) are recorded in bin 0
+         insertSizes.increment(Tools.max(insert, 0));
+         return;
+     }
+
+     if(ecco && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){BBMerge.findOverlapStrict(r1, r2, true);}
+
+     //Both reads are submitted with r1's numeric ID
+     if(r1!=null){seedContigFromRead(r1, r1.numericID);}
+     if(r2!=null){seedContigFromRead(r2, r1.numericID);}
+ }
+
+ /**
+  * Counts a discarded read as low quality; otherwise tries to grow it into a contig
+  * and stores the contig when one is produced.
+  * @param r Read to use as a seed; must not be null
+  * @param rid Numeric ID passed on to makeContig
+  */
+ private void seedContigFromRead(final Read r, final long rid){
+     if(r.discarded()){
+         lowqBasesT+=r.length();
+         lowqReadsT++;
+         return;
+     }
+     final byte[] contig=makeContig(r.bases, builderT, rid);
+     if(contig==null){return;}
+
+     if(verbose){System.err.println("Added "+contig.length);}
+     final long num=contigNum.incrementAndGet();
+     final Read stored=new Read(contig, -1, -1, -1, "contig_"+num+"_length_"+contig.length, null, num, 0);
+     contigs.add(stored);
+ }
+
+ /**
+  * Builds a contig from a single seed kmer.
+  * The seed is written into {@code bb}, extended to the right, reverse-complemented, and
+  * extended to the right again (i.e. leftward in the original orientation). The result is
+  * then optionally double-claimed and trimmed, and returned only if it grew by at least
+  * minExtension and reaches minContigLen.
+  * The supplied builder itself is seeded. Previously the field builderT was filled while
+  * {@code bb} was read, which was only correct when the two were the same object.
+  * @param key Seed kmer
+  * @param bb Scratch builder; its prior content is discarded
+  * @param alreadyClaimed True if the caller already owns the seed, so no claim is attempted
+  * @return The contig bases, or null if the seed was rejected, ownership was lost, or the contig was too short
+  */
+ private byte[] makeContig(final long key, final ByteBuilder bb, boolean alreadyClaimed){
+     bb.setLength(0);
+     bb.appendKmer(key, k);
+     if(verbose){outstream.println("Filled builder: "+bb);}
+
+     final int initialLength=bb.length();
+     assert(initialLength==k);
+     if(initialLength<k){return null;}
+
+     boolean success=(alreadyClaimed || !useOwnership ? true : claim(key, id));
+     if(verbose){System.err.println("Thread "+id+" checking owner after setting: "+findOwner(bb, id));}
+     if(!success){
+         assert(bb.length()==k);
+         //Nothing was claimed, so there is nothing to release
+         return null;
+     }
+     if(verbose /*|| true*/){System.err.println("Thread "+id+" building contig; initial length "+bb.length());}
+     if(verbose){System.err.println("Extending to right.");}
+     {
+         final int status=extendToRight(bb, leftCounts, rightCounts, id);
+
+         if(status==DEAD_END){
+             //do nothing
+         }else if(status==LOOP){//TODO
+             //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+             //Perhaps, the last kmer should be reclassified as a junction and removed.
+         }else if(status==BAD_SEED){
+             assert(bb.length()==k);
+             release(key, id);
+             return null;
+         }else{
+             if(bb.length()==k){
+                 //No base was added, so only the seed kmer needs releasing
+                 if(status==BAD_OWNER){
+                     if(IGNORE_BAD_OWNER){
+                         //do nothing
+                     }else{
+                         release(key, id);
+                         return null;
+                     }
+                 }else if(status==BRANCH){
+                     release(key, id);
+                     return null;
+                 }else{
+                     throw new RuntimeException("Bad return value: "+status);
+                 }
+             }else{
+                 if(status==BAD_OWNER){
+                     if(IGNORE_BAD_OWNER){
+                         //Drop the last base, whose kmer belongs to another thread
+                         bb.length--;
+                     }else{
+                         release(bb, id);
+                         return null;
+                     }
+                 }else if(status==BRANCH){
+                     //do nothing
+                 }else{
+                     throw new RuntimeException("Bad return value: "+status);
+                 }
+             }
+         }
+     }
+
+     //Flip the sequence so that the second extension grows the original left side
+     bb.reverseComplementInPlace();
+     if(verbose /*|| true*/){System.err.println("Extending rcomp to right; current length "+bb.length());}
+     {
+         final int status=extendToRight(bb, leftCounts, rightCounts, id);
+
+         if(status==DEAD_END){
+             //do nothing
+         }else if(status==LOOP){//TODO
+             //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+             //Perhaps, the last kmer should be reclassified as a junction and removed.
+         }else if(status==BAD_SEED){
+             assert(false) : bb;//This should never happen.
+             assert(bb.length()==k);
+             release(key, id);
+             return null;
+         }else{
+             if(status==BAD_OWNER){
+                 if(IGNORE_BAD_OWNER){
+                     if(bb.length()>k){bb.length--;}
+                 }else{
+                     release(bb, id);
+                     return null;
+                 }
+             }else if(status==BRANCH){
+                 //do nothing
+             }else{
+                 throw new RuntimeException("Bad return value: "+status);
+             }
+         }
+     }
+
+     if(verbose /*|| true*/){System.err.println("A: Final length for thread "+id+": "+bb.length());}
+
+     success=(useOwnership ? doubleClaim(bb, id) : true);
+     if(verbose /*|| true*/){System.err.println("Success for thread "+id+": "+success);}
+
+     if(trimEnds>0){bb.trimByAmount(trimEnds, trimEnds);}
+     if(bb.length()>=initialLength+minExtension && bb.length()>=minContigLen){
+         if(success){
+             //Restore the original orientation before returning
+             bb.reverseComplementInPlace();
+             return bb.toBytes();
+         }else{
+             release(bb, id);
+             return null;
+         }
+     }
+     //Too-short contigs are dropped without releasing their kmers
+     if(verbose /*|| true*/){System.err.println("A: Contig was too short for "+id+": "+bb.length());}
+     return null;
+ }
+
+ /**
+  * Builds a contig from a seed sequence (e.g. a read).
+  * The seed's kmers are claimed, the seed is copied into bb, extended to the right,
+  * reverse-complemented, and extended to the right again. The result is returned only if it is
+  * at least minExtension longer than the seed and at least minContigLen long.
+  * @param bases Seed bases; null or shorter than k yields null
+  * @param bb Scratch builder; its prior content is discarded
+  * @param rid Numeric ID of the source read; not used in this method body
+  * @return The contig bases, or null if ownership could not be obtained or the contig was too short
+  */
+ private byte[] makeContig(final byte[] bases, final ByteBuilder bb, long rid){
+ if(bases==null || bases.length<k){return null;}
+// if(verbose /*|| true*/){System.err.println("Thread "+id+" checking owner: "+findOwner(bases, bases.length, id));}
+ //Without ownership tracking, owner is -1 and the check below never rejects the seed
+ int owner=useOwnership ? findOwner(bases, bases.length, id) : -1;
+ //Give up if the owner found has an ID equal to or higher than this thread's
+ if(owner>=id){return null;}
+ boolean success=(useOwnership ? claim(bases, bases.length, id, true) : true);
+ if(verbose /*|| true*/){System.err.println("Thread "+id+" checking owner after setting: "+findOwner(bases, bases.length, id));}
+ if(!success){
+ //Undo any partial claim
+ release(bases, bases.length, id);
+ return null;
+ }
+ if(verbose /*|| true*/){System.err.println("Thread "+id+" building contig; initial length "+bases.length);}
+ bb.setLength(0);
+ bb.append(bases);
+ if(verbose){System.err.println("Extending to right.");}
+ {
+ final int status=extendToRight(bb, leftCounts, rightCounts, id);
+
+ if(status==DEAD_END){
+ //do nothing
+ }else if(status==LOOP){//TODO
+ //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+ //Perhaps, the last kmer should be reclassified as a junction and removed.
+ }else if(status==BAD_SEED){
+ //do nothing
+ }else{
+ if(status==BAD_OWNER){
+ release(bb, id);
+ return null;
+ }else if(status==BRANCH){
+ //do nothing
+ }else{
+ throw new RuntimeException("Bad return value: "+status);
+ }
+ }
+ }
+ //Flip the sequence so that the second extension grows the original left side
+ bb.reverseComplementInPlace();
+ if(verbose /*|| true*/){System.err.println("Extending rcomp to right; current length "+bb.length());}
+ {
+ final int status=extendToRight(bb, leftCounts, rightCounts, id);
+
+ if(status==DEAD_END){
+ //do nothing
+ }else if(status==LOOP){//TODO
+ //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+ //Perhaps, the last kmer should be reclassified as a junction and removed.
+ }else if(status==BAD_SEED){
+ //do nothing
+ }else{
+ if(status==BAD_OWNER){
+ release(bb, id);
+ return null;
+ }else if(status==BRANCH){
+ //do nothing
+ }else{
+ throw new RuntimeException("Bad return value: "+status);
+ }
+ }
+ }
+ if(verbose /*|| true*/){System.err.println("B: Final length for thread "+id+": "+bb.length());}
+
+ // if(useOwnership && THREADS==1){assert(claim(bases, bases.length, id, rid));}
+ success=(useOwnership ? doubleClaim(bb, id) : true);
+ if(verbose /*|| true*/){System.err.println("Success for thread "+id+": "+success);}
+ if(bb.length()>=bases.length+minExtension && bb.length()>=minContigLen){
+ if(success){
+ //Restore the original orientation before returning
+ bb.reverseComplementInPlace();
+ return bb.toBytes();
+ }else{
+ // assert(false) : bb.length()+", "+id;
+ release(bb.array, bb.length(), id);
+ return null;
+ }
+ }
+
+ //Too-short contigs are dropped without releasing their kmers
+ if(verbose /*|| true*/){System.err.println("B: Contig was too short for "+id+": "+bb.length());}
+ return null;
+ }
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Extension Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Estimates the insert size of a pair by walking the kmer graph from the rightmost kmer
+  * of r1 to the rightmost kmer of r2 (see measureInsert).
+  * @param r1 First read
+  * @param r2 Second read (mate)
+  * @param rightCounts Scratch array for next-base counts
+  * @return r1.length()+r2.length()+distance-k, or -1 if either read is missing, lacks a
+  * usable rightmost kmer, or no path was measured
+  */
+ public int findInsertSize(Read r1, Read r2, int[] rightCounts){
+     //An unpaired read cannot have an insert size; report failure instead of throwing
+     if(r1==null || r2==null){return -1;}
+     final long kmer1=tables.rightmostKmer(r1.bases, r1.length());
+     final long kmer2=tables.rightmostKmer(r2.bases, r2.length());
+     if(kmer1<0 || kmer2<0){return -1;}
+     final long rkmer1=rcomp(kmer1);
+     final long rkmer2=rcomp(kmer2);
+     final int x=measureInsert(kmer1, rkmer1, kmer2, rkmer2, 24000, rightCounts);
+     if(x<0){return -1;}
+     return r1.length()+r2.length()+x-k;//TODO: May be off by 1.
+ }
+
+ /** Ignores {@code kmer} and delegates to the five-argument extendRead. */
+ @Override
+ public int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance, final Kmer kmer){
+ return extendRead(r, bb, leftCounts, rightCounts, distance);
+ }
+
+ /**
+  * Extends a read to the right via extendToRight2 and, on success, replaces its bases.
+  * If the read has qualities, the array is lengthened to match and the new positions are
+  * set to Shared.FAKE_QUAL.
+  * @param r Read to extend; left untouched when shorter than k or when nothing was added
+  * @param bb Scratch builder; its prior content is discarded
+  * @param leftCounts Scratch array forwarded to extendToRight2
+  * @param rightCounts Scratch array forwarded to extendToRight2
+  * @param distance Forwarded to extendToRight2
+  * @return The value returned by extendToRight2, or 0 if the read is shorter than k
+  */
+ @Override
+ public int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance){
+     final int originalLength=r.length();
+     if(originalLength<k){return 0;}
+
+     bb.setLength(0);
+     bb.append(r.bases);
+     final int added=extendToRight2(bb, leftCounts, rightCounts, distance, true);
+
+     if(added>0){
+         r.bases=bb.toBytes();
+         if(r.quality!=null){
+             final byte q=Shared.FAKE_QUAL;
+             final byte[] padded=Arrays.copyOf(r.quality, r.bases.length);
+             Arrays.fill(padded, originalLength, padded.length, q);
+             r.quality=padded;
+         }
+     }
+     assert(added==r.length()-originalLength);
+     return added;
+ }
+
+ /**
+  * Walks the kmer graph rightward from (kmer1, rkmer1), always taking the highest-count
+  * next base, until the kmer whose table key equals that of (kmer2, rkmer2) is reached.
+  * Fixes relative to the earlier revision:
+  * - the current key is recomputed after every step; it used to stay frozen at the start
+  *   kmer, so the target could never be detected;
+  * - a walk that stops at a dead end without reaching the target reports -1 instead of a
+  *   partial length;
+  * - the duplicated low-count check that made the verbose message unreachable is removed.
+  * @param kmer1 Start kmer
+  * @param rkmer1 Reverse complement of the start kmer
+  * @param kmer2 Target kmer
+  * @param rkmer2 Reverse complement of the target kmer
+  * @param maxlen Maximum number of steps; reaching it counts as failure
+  * @param rightCounts Scratch array for next-base counts
+  * @return Number of bases stepped to reach the target, or -1 if it was not reached within maxlen
+  */
+ public int measureInsert(final long kmer1, final long rkmer1, final long kmer2, final long rkmer2, final int maxlen, final int[] rightCounts){
+     final int shift=2*k;
+     final int shift2=shift-2;
+     //NOTE(review): for k=32 the shift distance wraps to 0 and the mask becomes 0 - confirm k<32 is enforced elsewhere.
+     final long mask=~((-1L)<<shift);
+     final long key2=toValue(kmer2, rkmer2);
+     long kmer=kmer1;
+     long rkmer=rkmer1;
+     int len=0;
+
+     //The target must itself be a sufficiently common kmer
+     {
+         int count=tables.getCount(key2);
+         if(count<minCountSeed){return -1;}
+     }
+
+     long key=toValue(kmer, rkmer);
+     int count=tables.getCount(key);
+     if(count<minCountSeed){
+         if(verbose){outstream.println("Returning because count was too low: "+count);}
+         return -1;
+     }
+
+     int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+     int rightMax=rightCounts[rightMaxPos];
+
+     if(rightMax<minCountExtend){return -1;}
+
+     while(key!=key2 && len<maxlen){
+
+         //Generate the new kmer from the most common next base
+         final long x=rightMaxPos;
+         final long x2=AminoAcid.numberToComplement[(int)x];
+
+         kmer=((kmer<<2)|(long)x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+
+         //Track the current position so that arrival at the target is noticed
+         key=toValue(kmer, rkmer);
+         //One base was stepped; counted here so that a dead end at the target still reports the full distance
+         len++;
+
+         assert(tables.getCount(kmer, rkmer)==rightMax);
+         count=rightMax;
+
+         assert(count>=minCountExtend) : count;
+
+         rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+         rightMax=rightCounts[rightMaxPos];
+
+         if(verbose){
+             outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer));
+             outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts));
+             outstream.println("rightMaxPos="+rightMaxPos);
+             outstream.println("rightMax="+rightMax);
+         }
+
+         if(rightMax<minCountExtend){
+             if(verbose){outstream.println("A: Breaking because highest right was too low:"+rightMax);}
+             break;
+         }
+     }
+     //Success only if the target was actually reached within the step budget
+     return (key==key2 && len<maxlen) ? len : -1;
+ }
+
+
+
+ /**
+ * Extend these bases into a contig.
+ * Stops at both left and right junctions.
+ * Claims ownership.
+ * Bases are appended to bb one at a time, always following the highest-count next base.
+ * @param bb Sequence to extend in place; must hold at least k bases
+ * @param leftCounts Scratch array for previous-base counts; if null, left-side counts are not refreshed
+ * @param rightCounts Scratch array for next-base counts
+ * @param id ID of the calling thread, used as the owner value
+ * @return A status code: BAD_SEED (trailing kmer unusable or below minCountSeed), BAD_OWNER
+ * (a kmer is owned by a different thread), DEAD_END (best next-base count below minCountExtend),
+ * BRANCH (junction or hidden branch), or LOOP (the next kmer is already owned by this thread)
+ */
+ public int extendToRight(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int id){
+ if(bb.length()<k){return BAD_SEED;}
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int len=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
+ {
+ final int bblen=bb.length();
+ final byte[] bases=bb.array;
+ for(int i=bblen-k; i<bblen; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ //A negative code restarts the kmer, which leaves len<k after the loop
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+ if(verbose){outstream.println("A: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ }
+ }
+
+ if(len<k){return BAD_SEED;}
+ else{assert(len==k);}
+
+ /* Now the trailing kmer has been initialized. */
+
+ long key=toValue(kmer, rkmer);
+ HashArray1D table=tables.getTableForKey(key);
+ int count=table.getValue(key);
+ if(count<minCountSeed){
+ if(verbose){outstream.println("Returning because count was too low: "+count);}
+ return BAD_SEED;
+ }
+
+ //Without ownership tracking, owner is simply this thread's id
+ int owner=(useOwnership ? table.getOwner(key) : id);
+ if(verbose){outstream.println("Owner: "+owner);}
+ //Only a higher-numbered owner blocks the extension at this point
+ if(owner>id){return BAD_OWNER;}
+
+ //Defaults used when leftCounts is null
+ int leftMaxPos=0;
+ int leftMax=minCountExtend;
+ int leftSecondPos=1;
+ int leftSecond=0;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts, mask, shift2);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+ int rightMax=rightCounts[rightMaxPos];
+ int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ int rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer));
+ outstream.println("Counts: "+count+", "+(leftCounts==null ? "null" : Arrays.toString(leftCounts))+", "+Arrays.toString(rightCounts));
+ outstream.println("leftMaxPos="+leftMaxPos);
+ outstream.println("leftMax="+leftMax);
+ outstream.println("leftSecondPos="+leftSecondPos);
+ outstream.println("leftSecond="+leftSecond);
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ if(rightMax<minCountExtend){return DEAD_END;}
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){
+ return BRANCH;
+ }
+
+ if(useOwnership){
+ owner=table.setOwner(key, id);
+ //NOTE(review): this message prints id, not the owner value returned by setOwner
+ if(verbose){outstream.println("A. Owner is now "+id+" for key "+key);}
+ if(owner!=id){
+ if(verbose){outstream.println("Returning early because owner was "+owner+" for thread "+id+".");}
+ return BAD_OWNER;
+ }
+ }
+
+ //Length limit: maxContigLen, further reduced to current length plus extendRight when extendRight is non-negative
+ final int maxLen=Tools.min((extendRight<0 ? maxContigLen : bb.length()+extendRight), maxContigLen);
+
+ while(owner==id && bb.length()<maxLen){
+
+ //Generate the new kmer
+ final byte b=AminoAcid.numberToBase[rightMaxPos];
+ final long x=rightMaxPos;
+ final long x2=AminoAcid.numberToComplement[(int)x];
+
+ final long evicted=(kmer>>>shift2); //Binary value that falls off the end.
+
+ //Now consider the next kmer
+ kmer=((kmer<<2)|(long)x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+
+ key=toValue(kmer, rkmer);
+ table=tables.getTableForKey(key);
+
+ assert(table.getValue(key)==rightMax);
+ count=rightMax;
+
+ assert(count>=minCountExtend) : count;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts, mask, shift2);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+ rightMax=rightCounts[rightMaxPos];
+ rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer));
+ outstream.println("Counts: "+count+", "+(leftCounts==null ? "null" : Arrays.toString(leftCounts))+", "+Arrays.toString(rightCounts));
+ outstream.println("leftMaxPos="+leftMaxPos);
+ outstream.println("leftMax="+leftMax);
+ outstream.println("leftSecondPos="+leftSecondPos);
+ outstream.println("leftSecond="+leftSecond);
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ //Junction checks run before the base is appended, so the junction base is not added to bb
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){
+ if(verbose){outstream.println("B: Breaking because isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+")");}
+ return BRANCH;
+ }
+
+ //The strongest left neighbor of the new kmer should be the base that was just shifted out
+ if(leftCounts!=null && leftMaxPos!=evicted && branchMult1>0){
+ if(verbose){outstream.println("B: Breaking because of hidden branch: leftMaxPos!=evicted ("+leftMaxPos+"!="+evicted+")" +
+ "\nleftMaxPos="+leftMaxPos+", leftMax="+leftMax+", leftSecondPos="+leftSecondPos+", leftSecond="+leftSecond);}
+ return BRANCH;
+ }
+
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+
+ if(useOwnership){
+ owner=table.getOwner(key);
+ //NOTE(review): this message prints id, not the owner value just read
+ if(verbose){outstream.println("Owner is initially "+id+" for key "+key);}
+ if(owner==id){//loop detection
+ if(verbose /*|| true*/){
+// outstream.println(new String(bb.array, bb.length()-31, 31));
+ outstream.println(bb);
+ outstream.println(toText(kmer));
+ outstream.println(toText(rkmer));
+ outstream.println("Breaking because owner was "+owner+" for thread "+id+".");
+ }
+ return LOOP;
+ }
+ owner=table.setOwner(key, id);
+ if(verbose){outstream.println("B. Owner is now "+id+" for key "+key);}
+ }
+
+ if(rightMax<minCountExtend){
+ if(verbose){outstream.println("B: Breaking because highest right was too low:"+rightMax);}
+ return DEAD_END;
+ }
+ }
+ //NOTE(review): the loop also exits when bb reaches maxLen while owner==id (always the case without
+ //ownership tracking); that trips this assert under -ea and is reported as BAD_OWNER - confirm intended.
+ assert(owner!=id);
+ if(verbose /*|| true*/){
+ outstream.println("Current contig: "+bb+"\nReturning because owner was "+owner+" for thread "+id+".");
+ }
+ return BAD_OWNER;
+ }
+
+ /**
+ * Extends the sequence in bb to the right, starting from its last k bases.
+ * Builds the forward and reverse-complement encodings of the rightmost kmer with bit shifts,
+ * then delegates to the overload of extendToRight2 that takes the kmers explicitly.
+ * @param bb Sequence to extend
+ * @param leftCounts Passed through to the delegate (which accepts null)
+ * @param rightCounts Passed through to the delegate
+ * @param distance Passed through to the delegate as the maximum extension
+ * @param includeJunctionBase Passed through to the delegate
+ * @return 0 if bb is shorter than k or its last k bases do not yield a full-length kmer; otherwise the delegate's result
+ */
+ @Override
+ public int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase){
+ if(verbose || verbose2){outstream.println("Entering extendToRight2 (no kmers).");}
+ final int initialLength=bb.length();
+ if(initialLength<k){return 0;}
+ //Each base occupies 2 bits, so a kmer spans 2*k bits; shift2 is the offset of the highest 2-bit slot.
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
+ {
+ int len=0;
+ final byte[] bases=bb.array;
+ for(int i=initialLength-k; i<initialLength; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ //A negative code restarts the run of valid bases (presumably an undefined base such as N - confirm against AminoAcid.baseToNumber).
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+ if(verbose){outstream.println("B: Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ }
+ //Only k bases were scanned, so len<k means at least one of them reset the run.
+ if(len<k){
+ if(verbose || verbose2){outstream.println("Returning because len<k: "+len+"<"+k);}
+ return 0;
+ }
+ else{assert(len==k);}
+ }
+ return extendToRight2(bb, leftCounts, rightCounts, distance, includeJunctionBase, kmer, rkmer);
+ }
+
+ /**
+ * Extend these bases to the right by at most 'distance'.
+ * Stops at right junctions only.
+ * Does not claim ownership.
+ * @param bb Sequence being extended; chosen bases are appended in place
+ * @param leftCounts Scratch array for left-neighbor counts, or null to skip the left-side lookups and the hidden-branch check
+ * @param rightCounts Scratch array that receives the right-neighbor counts
+ * @param distance Maximum number of bases to append; the total length is also capped at maxContigLen
+ * @param includeJunctionBase If true, the base leading into a junction or hidden branch is still appended, but only when the new kmer is greater than its reverse complement
+ * @param kmer Forward encoding of the kmer to extend from
+ * @param rkmer Reverse-complement encoding of that kmer
+ * @return Number of bases appended to bb; 0 if the starting kmer is rejected
+ */
+ public int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase,
+ long kmer, long rkmer){
+ if(verbose || verbose2){outstream.println("Entering extendToRight2 (with kmers).");}
+ final int initialLength=bb.length();
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+
+ /* Now the trailing kmer has been initialized. */
+
+ long key=toValue(kmer, rkmer);
+ HashArray1D table=tables.getTableForKey(key);
+ int count=table.getValue(key);
+ if(count<minCountSeed){
+ if(verbose || verbose2){outstream.println("Returning because count was too low: "+count+"<"+minCountSeed);}
+ return 0;
+ }
+
+ //Defaults handed to isJunction when leftCounts is null and no left-side lookup is made.
+ int leftMaxPos=0;
+ int leftMax=minCountExtend;
+ int leftSecondPos=1;
+ int leftSecond=0;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts, mask, shift2);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ int rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+ int rightMax=rightCounts[rightMaxPos];
+ int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ int rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer));
+ outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts));
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ //Reject the starting kmer outright if it has no sufficiently supported right neighbor or already sits at a junction.
+ if(rightMax<minCountExtend){
+ if(verbose || verbose2){outstream.println("Returning because rightMax was too low: "+rightMax+"<"+minCountExtend+"\n"+count+", "+Arrays.toString(rightCounts));}
+ return 0;
+ }
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){
+ if(verbose || verbose2){outstream.println("Returning because isJunction: "+rightMax+", "+rightSecond+"; "+leftMax+", "+leftSecond);}
+ return 0;
+ }
+
+ final int maxLen=Tools.min(bb.length()+distance, maxContigLen);
+
+ while(bb.length()<maxLen){
+
+ //Generate the new kmer
+ //The candidate base is the right neighbor with the highest count from the previous lookup.
+ final byte b=AminoAcid.numberToBase[rightMaxPos];
+ final long x=rightMaxPos;
+ final long x2=AminoAcid.numberToComplement[(int)x];
+
+ final long evicted=(kmer>>>shift2); //Binary value that falls off the end.
+
+ //Now consider the next kmer
+ kmer=((kmer<<2)|(long)x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+
+ key=toValue(kmer, rkmer);
+ table=tables.getTableForKey(key);
+
+ //The count of the new kmer is taken from the previous right-neighbor lookup; the assertion cross-checks it against the table.
+ assert(table.getValue(key)==rightMax);
+ count=rightMax;
+
+ assert(count>=minCountExtend) : count;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, rkmer, leftCounts, mask, shift2);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ rightMaxPos=fillRightCounts(kmer, rkmer, rightCounts, mask, shift2);
+ rightMax=rightCounts[rightMaxPos];
+ rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer)+", "+toText(rkmer));
+ outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts));
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){
+ if(verbose){outstream.println("B: Breaking because isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+")");}
+ if(includeJunctionBase && kmer>rkmer){
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+ }
+ break;
+ }
+
+ //Hidden branch: the best-supported left neighbor of the new kmer is not the base we just came from.
+ if(leftCounts!=null && leftMaxPos!=evicted){
+ if(verbose){outstream.println("B: Breaking because of hidden branch: leftMaxPos!=evicted ("+leftMaxPos+"!="+evicted+")" +
+ "\nleftMaxPos="+leftMaxPos+", leftMax="+leftMax+", leftSecondPos="+leftSecondPos+", leftSecond="+leftSecond);}
+ if(includeJunctionBase && kmer>rkmer){
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+ }
+ break;
+ }
+
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+
+ //The base has already been appended; stop here because nothing to its right has enough support.
+ if(rightMax<minCountExtend){
+ if(verbose || verbose2){outstream.println("C: Breaking because highest right was too low: "+rightMax+"<"+minCountExtend);}
+ break;
+ }
+ }
+ //NOTE(review): this message goes to System.err while the rest of the method uses outstream.
+ if(verbose || verbose2){System.err.println("Extended by "+(bb.length()-initialLength));}
+ return bb.length()-initialLength;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Error Correction ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Error-corrects a read using this thread's scratch buffers.
+  * Initializes the thread-local buffers, then delegates to the full overload with no detection array.
+  * @param r Read to correct
+  * @return Result of the delegate call
+  */
+ @Override
+ public int errorCorrect(Read r){
+     initializeThreadLocals();
+     return errorCorrect(r, localLeftCounts.get(), localRightCounts.get(), localLongList.get(), localIntList.get(),
+             localByteBuilder.get(), null, localBitSet.get());
+ }
+
+ /**
+  * Overload that also accepts a Kmer scratch object.
+  * The kmer argument is not used here; the call is forwarded to the overload without it.
+  * @return Result of the forwarded call
+  */
+ @Override
+ public int errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts,
+         final ByteBuilder bb, final int[] detectedArray, final BitSet bs, Kmer kmer){
+     final int fixed=errorCorrect(r, leftCounts, rightCounts, kmers, counts, bb, detectedArray, bs);
+     return fixed;
+ }
+
+ /**
+  * Error-corrects the bases of a read in place, using the pincer and/or tail strategies
+  * selected by ECC_PINCER, ECC_TAIL and ECC_ALL, and optionally marks bad bases.
+  * @param r Read to correct; its bases are modified in place
+  * @param leftCounts Scratch array passed to the correction routines
+  * @param rightCounts Scratch array passed to the correction routines
+  * @param kmers Scratch list, filled with the kmers of the read
+  * @param counts Scratch list, filled with the count of each kmer
+  * @param bb Scratch builder passed to the correction routines
+  * @param detectedArray Optional statistics array, may be null. When present, slots 0-3 are reset;
+  *        the correction routines add detections to [0], pincer corrections to [1] and tail
+  *        corrections to [2], and [3] receives the number of marked bases.
+  * @param bs Scratch BitSet passed to markBadBases
+  * @return Number of corrections made (pincer plus tail); 0 if fewer than 2 valid kmers were found
+  */
+ public int errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts,
+         final ByteBuilder bb, final int[] detectedArray, final BitSet bs){
+     final byte[] bases=r.bases;
+     final byte[] quals=r.quality;
+     if(detectedArray!=null){
+         detectedArray[0]=0;
+         detectedArray[1]=0;
+         detectedArray[2]=0;
+         detectedArray[3]=0;
+     }
+     int valid=tables.fillKmers(bases, kmers);
+     if(valid<2){return 0;}
+     tables.fillCounts(kmers, counts);
+     int correctedPincer=0;
+     int correctedTail=0;
+
+     if(ECC_PINCER){
+         correctedPincer+=errorCorrectPincer(bases, quals, leftCounts, rightCounts, kmers, counts, bb, detectedArray, errorExtensionPincer);
+     }
+
+     if(ECC_TAIL || ECC_ALL){
+         int start=(ECC_ALL ? 0 : counts.size-k-1);
+//         if(ECC_PINCER && detectedArray!=null && detectedArray[0]>correctedPincer){start=start-k;}
+         correctedTail+=errorCorrectTail(bases, quals, leftCounts, rightCounts, kmers, counts, bb, detectedArray, start, errorExtensionTail);
+         //Repeat the tail pass on the reverse complement so the other end of the read is treated as a tail too,
+         //then restore the original orientation of the read and of the counts.
+         r.reverseComplement();
+         valid=tables.fillKmers(bases, kmers);
+         counts.reverse();
+         correctedTail+=errorCorrectTail(bases, quals, leftCounts, rightCounts, kmers, counts, bb, detectedArray, start, errorExtensionTail);
+         r.reverseComplement();
+         counts.reverse();
+     }
+
+     if(MARK_BAD_BASES>0){
+         final int marked=markBadBases(bases, quals, counts, bs, MARK_BAD_BASES, MARK_DELTA_ONLY);
+         //detectedArray is optional (callers may pass null), so only record the statistic when it exists.
+         if(detectedArray!=null){detectedArray[3]=marked;}
+     }
+     assert(detectedArray==null || (correctedPincer==detectedArray[1] && correctedTail==detectedArray[2])) : correctedPincer+", "+correctedTail+", "+Arrays.toString(detectedArray);
+//     if(ECC_PINCER && correctedTail>0){
+//         valid=fillKmers(bases, kmers);
+//         counts.reverse();
+//         correctedPincer+=errorCorrectPincer(bases, quals, leftCounts, rightCounts, kmers, counts, bb, detectedArray, errorExtensionPincer);
+//     }
+
+     return correctedPincer+correctedTail;
+ }
+
+ /**
+  * Scans the kmer counts of a read for positions where the count drops on both sides of a single base,
+  * and attempts to correct each such base via correctSingleBasePincer.
+  * @param bases Read bases; may be modified by the correction routine
+  * @param quals Read qualities; passed through
+  * @param leftBuffer Scratch array; passed through
+  * @param rightBuffer Scratch array; passed through
+  * @param kmers Kmers of the read
+  * @param counts Counts of those kmers
+  * @param bb Scratch builder; passed through
+  * @param detectedArray Optional statistics array; [0] gains the number detected and [1] the number corrected
+  * @param errorExtension Passed through to the correction routine
+  * @return Number of bases corrected
+  */
+ public int errorCorrectPincer(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+         final LongList kmers, final IntList counts, final ByteBuilder bb, final int[] detectedArray, final int errorExtension){
+
+     int suspects=0;
+     int fixes=0;
+
+     //left is the index of the kmer on the left of the suspect base, right the index of the kmer on its right;
+     //the two inner counts are at left+1 and right-1, and the suspect base itself is at left+k.
+     int left=0;
+     for(int right=k+1; right<counts.size; right++, left++){
+         final int aCount=counts.get(left);
+         final int bCount=counts.get(left+1);
+         final int cCount=counts.get(right-1);
+         final int dCount=counts.get(right);
+
+         final boolean suspicious=isError(aCount, bCount) && isError(dCount, cCount) && isSimilar(aCount, dCount);
+         if(!suspicious){
+             if(verbose){
+                 System.err.println("Not an error: "+aCount+", "+bCount+", "+cCount+", "+dCount+
+                         "; "+isError(aCount, bCount)+", "+isError(dCount, cCount)+", "+isSimilar(aCount, dCount));
+             }
+             continue;
+         }
+
+         if(verbose){
+             System.err.println("Found error: "+aCount+", "+bCount+", "+cCount+", "+dCount);
+         }
+         //Treated as a 1bp substitution.
+         suspects++;
+         fixes+=correctSingleBasePincer(left, right, bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, errorExtension);
+         if(verbose){
+             System.err.println("Corrected error.");
+         }
+     }
+
+     if(detectedArray!=null){
+         detectedArray[0]+=suspects;
+         detectedArray[1]+=fixes;
+     }
+
+     return fixes;
+ }
+
+ /**
+ * Scans the kmer counts of a read from left to right for a count drop that is preceded by
+ * similar counts and followed by low counts, and attempts to correct the base after each such
+ * kmer via correctSingleBaseRight.
+ * @param bases Read bases; may be modified by the correction routine
+ * @param quals Read qualities; passed through
+ * @param leftBuffer Scratch array; passed through
+ * @param rightBuffer Scratch array; passed through
+ * @param kmers Kmers of the read
+ * @param counts Counts of those kmers
+ * @param bb Scratch builder; passed through
+ * @param detectedArray Optional statistics array; [0] gains the number detected and [2] the number corrected
+ * @param startPos First kmer index to consider; raised to errorExtension if smaller
+ * @param errorExtension Number of preceding counts compared for similarity; also passed to the correction routine
+ * @return Number of bases corrected; 0 immediately if the read is shorter than k+2*(1+errorExtension)
+ */
+ public int errorCorrectTail(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+ final LongList kmers, final IntList counts, final ByteBuilder bb, final int[] detectedArray, final int startPos, final int errorExtension){
+ if(bases.length<k+2*(1+errorExtension)){return 0;}
+ int detected=0;
+ int corrected=0;
+
+ //a is the index of the left kmer
+ //b is a+1
+ //the base between the kmers is at a+k
+ //Starting at errorExtension or later keeps the similarity window a-errorExtension..a-1 at non-negative indices.
+ for(int a=Tools.max(startPos, errorExtension), lim=counts.size-Tools.min(errorExtension, (errorExtension+3)/2); a<lim; a++){//errorExtension-1
+ final int aCount=counts.get(a);
+ final int bCount=counts.get(a+1);
+ if(isError(aCount, bCount) && isSimilar(aCount, a-errorExtension, a-1, counts) && isError(aCount, a+2, a+k, counts)){
+ if(verbose){
+ System.err.println("Found error: "+aCount+", "+bCount);
+ }
+ //Assume like a 1bp substitution; attempt to correct.
+ detected++;
+ int ret=correctSingleBaseRight(a, bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, errorExtension);
+ corrected+=ret;
+ if(verbose){
+ System.err.println("Corrected error.");
+ }
+ }else{
+ if(verbose){
+ System.err.println("Not an error: "+aCount+", "+bCount+
+ "; "+isError(aCount, bCount)+", "+isSimilar(aCount, a-errorExtension, a-1, counts)+", "+isError(aCount, a+2, a+k, counts));
+ }
+ }
+ }
+
+// if(detected==0 && counts.get(0)>2 && counts.get(counts.size-1)>2){
+// assert(!verbose);
+// verbose=true;
+// System.err.println("\n"+counts);
+// errorCorrectPincer(bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, detectedArray);
+// assert(false);
+// }
+
+ if(detectedArray!=null){
+ detectedArray[0]+=detected;
+ detectedArray[2]+=corrected;
+ }
+
+ return corrected;
+ }
+
+ /**
+ * Attempts to replace the single base at position a+k, which lies between kmer a and kmer d.
+ * Extends rightward from kmer a, and from the reverse complement of kmer d, by errorExtension bases each.
+ * The correction is made only if both extensions reach full length, agree with the read at every
+ * compared position other than the suspect base, propose the same replacement base, that base
+ * differs from the current one, and the resulting kmer's count passes isSimilar against kmer a.
+ * @param a Index of the kmer to the left of the suspect base
+ * @param d Index of the kmer to the right of the suspect base; asserted to equal a+k+1
+ * @param bases Read bases; the suspect base is overwritten on success
+ * @param quals Read qualities (not used here)
+ * @param leftBuffer Scratch array (not used here)
+ * @param rightBuffer Scratch array handed to extendToRight2
+ * @param kmers Kmers of the read; regenerated on success
+ * @param counts Counts of those kmers; regenerated on success
+ * @param bb Scratch builder; cleared and overwritten
+ * @param errorExtension Required extension length in each direction
+ * @return 1 if the base was replaced, otherwise 0
+ */
+ private int correctSingleBasePincer(final int a, final int d, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+ final LongList kmers, final IntList counts, final ByteBuilder bb, final int errorExtension){
+ final byte leftReplacement, rightReplacement;
+ final int loc=a+k;
+ {
+ //Extend from the left kmer; bb[0] is the proposed base at loc, and bb[i] must match the read at loc+i.
+ bb.clear();
+ final long kmer=kmers.get(a);
+ final long rkmer=rcomp(kmer);
+ int extension=extendToRight2(bb, null, rightBuffer, errorExtension, true, kmer, rkmer);
+ if(extension<errorExtension){return 0;}
+ for(int i=1; i<extension; i++){
+ if(bb.get(i)!=bases[loc+i]){
+ return 0;
+ }
+ }
+ leftReplacement=bb.get(0);
+ }
+ {
+ //Extend from the reverse complement of the right kmer, then flip the result back into read orientation,
+ //so its last base is the proposed base at loc and the earlier bases must match the read before loc.
+ bb.clear();
+ final long rkmer=kmers.get(d);
+ final long kmer=rcomp(rkmer);
+ int extension=extendToRight2(bb, null, rightBuffer, errorExtension, true, kmer, rkmer);
+ if(extension<errorExtension){return 0;}
+ bb.reverseComplementInPlace();
+ for(int i=0; i<extension-1; i++){
+ if(bb.get(i)!=bases[loc+i+1-extension]){
+ return 0;
+ }
+ }
+ rightReplacement=bb.get(extension-1);
+ }
+ if(leftReplacement!=rightReplacement){return 0;}
+ if(bases[loc]==leftReplacement){return 0;}
+ if(!isSimilar(a, leftReplacement, kmers, counts)){return 0;}
+
+ bases[loc]=leftReplacement;
+ assert(d==a+k+1);
+ //Refresh kmers and counts after the edit, starting from index a.
+ tables.regenerateKmers(bases, kmers, counts, a);
+ return 1;
+ }
+
+ /**
+  * Attempts to replace the single base at position a+k using only a rightward extension from kmer a.
+  * The extension must reach the required length and agree with the read at every position after the
+  * suspect base; the proposed base must differ from the current one and pass the isSimilar check.
+  * @param a Index of the kmer to the left of the suspect base
+  * @param bases Read bases; the suspect base is overwritten on success
+  * @param quals Read qualities (not used here)
+  * @param leftBuffer Scratch array (not used here)
+  * @param rightBuffer Scratch array handed to extendToRight2
+  * @param kmers Kmers of the read; regenerated on success
+  * @param counts Counts of those kmers; regenerated on success
+  * @param bb Scratch builder; cleared and overwritten
+  * @param errorExtension0 Requested extension length; limited to the bases remaining from the suspect position
+  * @return 1 if the base was replaced, otherwise 0
+  */
+ private int correctSingleBaseRight(final int a, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+         final LongList kmers, final IntList counts, final ByteBuilder bb, final int errorExtension0){
+     final int loc=a+k;
+     final int errorExtension=Tools.min(errorExtension0, bases.length-loc);
+
+     bb.clear();
+     final long forward=kmers.get(a);
+     final long reverse=rcomp(forward);
+     final int added=extendToRight2(bb, null, rightBuffer, errorExtension, true, forward, reverse);
+     if(added<errorExtension){return 0;}
+
+     //Position 0 of the extension is the suspect base itself; everything after it must match the read.
+     int offset=1;
+     while(offset<added){
+         if(bb.get(offset)!=bases[loc+offset]){return 0;}
+         offset++;
+     }
+     final byte replacement=bb.get(0);
+
+     if(bases[loc]==replacement || !isSimilar(a, replacement, kmers, counts)){return 0;}
+
+     bases[loc]=replacement;
+     tables.regenerateKmers(bases, kmers, counts, a);
+     return 1;
+ }
+
+ /**
+  * Tests whether the kmer formed by shifting newBase onto the right of kmer a
+  * has a count similar to that of kmer a.
+  * @param a Index of the reference kmer
+  * @param newBase Base to append
+  * @param kmers Kmers of the read
+  * @param counts Counts of those kmers
+  * @return Result of isSimilar(count of kmer a, count of the shifted kmer)
+  */
+ private boolean isSimilar(int a, byte newBase, LongList kmers, IntList counts){
+     final long mask=~((-1L)<<(2*k));
+     final long original=kmers.get(a);
+     final long code=AminoAcid.baseToNumber[newBase];
+     final long shifted=((original<<2)|code)&mask;
+     final int shiftedCount=getCount(shifted, rcomp(shifted));
+     return isSimilar(counts.get(a), shiftedCount);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inherited Abstract Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Delegates to tables.makeKhist, passing outHist and the hist* settings of this object. */
+ final void makeKhist(){
+ tables.makeKhist(outHist, histColumns, histMax, histHeader, histZeros, true, smoothHist, 1);
+ }
+ /** Delegates to tables.dumpKmersAsBytes_MT, passing outKmers and minToDump. */
+ final void dumpKmersAsText(){
+ tables.dumpKmersAsBytes_MT(outKmers, minToDump, true);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Accessor for the kmer table set. */
+ final KmerTableSet tables(){return tables;}
+ /** Kmer table set that the methods of this class query and update. */
+ public final KmerTableSet tables;
+
+ /** Normal kmer length */
+ private final int k;
+ /** k-1; used in some expressions */
+ private final int k2;
+
+}
diff --git a/current/assemble/Tadpole2.java b/current/assemble/Tadpole2.java
new file mode 100755
index 0000000..c65e955
--- /dev/null
+++ b/current/assemble/Tadpole2.java
@@ -0,0 +1,1250 @@
+package assemble;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import jgi.BBMerge;
+import kmer.KmerTableSet;
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+import ukmer.AbstractKmerTableU;
+import ukmer.HashArrayU1D;
+import ukmer.HashForestU;
+import ukmer.Kmer;
+import ukmer.KmerNodeU;
+import ukmer.KmerTableSetU;
+import align2.IntList;
+import align2.ListNum;
+import align2.LongList;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+
+
+/**
+ * Short-kmer assembler based on KmerCountExact.
+ * @author Brian Bushnell
+ * @date May 15, 2015
+ *
+ */
+public class Tadpole2 extends Tadpole {
+
+ /**
+  * Code entrance from the command line.
+  * Parses config and help flags, constructs a Tadpole2, reports the time that took, and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     //One timer covers the whole run, the other only construction.
+     final Timer total=new Timer();
+     final Timer init=new Timer();
+     total.start();
+     init.start();
+
+     //Create a new Tadpole2 instance
+     final Tadpole2 assembler=new Tadpole2(args, true);
+     init.stop();
+     outstream.println("Initialization Time: \t"+init);
+
+     //And run it
+     assembler.process(total);
+ }
+
+ /**
+  * Constructor.
+  * Lets the superclass handle the arguments, then creates the kmer table set with the number
+  * of extra bytes per kmer that the selected mode needs.
+  * @param args Command line arguments
+  * @param setDefaults Passed to the superclass constructor
+  */
+ public Tadpole2(String[] args, boolean setDefaults){
+     super(args, setDefaults);
+
+     //4 extra bytes when ownership is used (presumably for the owner id - confirm in KmerTableSetU).
+     int extra=(useOwnership ? 4 : 0);
+     //1 more byte in contig or extend mode; nothing extra in correct mode.
+     if(processingMode!=correctMode && (processingMode==contigMode || processingMode==extendMode)){
+         extra+=1;
+     }
+     final int extraBytesPerKmer=extra;
+
+     tables=new KmerTableSetU(args, extraBytesPerKmer);
+     assert(kbig==tables.kbig);
+     ksmall=tables.k;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Delegates to tables.initializeOwnership(). */
+ @Override
+ void initializeOwnership(){
+ tables.initializeOwnership();
+ }
+
+ /**
+  * Runs up to maxShaveDepth passes of a Shaver2 over the tables and totals what it reports as removed.
+  * Stops early once a pass removes fewer than 100 or after the pass with index 3.
+  * @param shave Passed through to Shaver2.shave
+  * @param rinse Passed through to Shaver2.shave
+  * @return Sum of the values returned by the individual passes
+  */
+ @Override
+ long shave(boolean shave, boolean rinse){
+     final Shaver2 shaver=new Shaver2(tables, THREADS);
+     long total=0;
+
+     boolean keepGoing=true;
+     for(int pass=0; keepGoing && pass<maxShaveDepth; pass++){
+         final int first=1;
+         final int second=maxShaveDepth;
+         final int third=pass+1;
+         outstream.println("\nShave("+first+", "+second+", "+third+")");
+         final long removed=shaver.shave(first, second, third, Tools.max(minContigLen, shaveDiscardLen), shaveExploreDist, shave, rinse);
+         total+=removed;
+         keepGoing=(removed>=100 && pass<=2);
+     }
+
+     System.err.println();
+     return total;
+ }
+
+ /**
+ * Runs tables.process with the given timer.
+ * @param t Timer passed to the table set
+ * @return The kmersLoaded value of the table set after processing
+ */
+ @Override
+ public long loadKmers(Timer t){
+ tables.process(t);
+ return tables.kmersLoaded;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ //Each method in this section forwards its arguments unchanged to the same-named method of 'tables',
+ //except toText, which forwards to the static AbstractKmerTableU.toText.
+ /** Forwards to tables.getCount. */
+ public final int getCount(Kmer kmer){return tables.getCount(kmer);}
+ /** Forwards to tables.claim for a single kmer. */
+ private final boolean claim(Kmer kmer, int id){return tables.claim(kmer, id);}
+ /** Forwards to tables.doubleClaim for the sequence in bb. */
+ private final boolean doubleClaim(ByteBuilder bb, int id/*, long rid*/, Kmer kmer){return tables.doubleClaim(bb, id/*, rid*/, kmer);}
+ /** Forwards to tables.claim for the sequence in bb. */
+ private final boolean claim(ByteBuilder bb, int id, /*long rid, */boolean earlyExit, Kmer kmer){return tables.claim(bb, id/*, rid*/, earlyExit, kmer);}
+ /** Forwards to tables.claim for the first len bytes of array. */
+ private final boolean claim(byte[] array, int len, int id, /*long rid, */boolean earlyExit, Kmer kmer){return tables.claim(array, len, id/*, rid*/, earlyExit, kmer);}
+ /** Forwards to tables.findOwner for a single kmer. */
+ private final int findOwner(Kmer kmer){return tables.findOwner(kmer);}
+ /** Forwards to tables.findOwner for the sequence in bb. */
+ private final int findOwner(ByteBuilder bb, int id, Kmer kmer){return tables.findOwner(bb, id, kmer);}
+ /** Forwards to tables.findOwner for the first len bytes of array. */
+ private final int findOwner(byte[] array, int len, int id, Kmer kmer){return tables.findOwner(array, len, id, kmer);}
+ /** Forwards to tables.release for a single kmer. */
+ private final void release(Kmer kmer, int id){tables.release(kmer, id);}
+ /** Forwards to tables.release for the sequence in bb. */
+ private final void release(ByteBuilder bb, int id, Kmer kmer){tables.release(bb, id, kmer);}
+ /** Forwards to tables.release for the first len bytes of array. */
+ private final void release(byte[] array, int len, int id, Kmer kmer){tables.release(array, len, id, kmer);}
+ /** Forwards to tables.fillRightCounts. */
+ private final int fillRightCounts(Kmer kmer, int[] counts){return tables.fillRightCounts(kmer, counts);}
+ /** Forwards to tables.fillLeftCounts. */
+ private final int fillLeftCounts(Kmer kmer, int[] counts){return tables.fillLeftCounts(kmer, counts);}
+ /** Forwards to AbstractKmerTableU.toText(Kmer). */
+ private final StringBuilder toText(Kmer kmer){return AbstractKmerTableU.toText(kmer);}
+ /** Forwards to AbstractKmerTableU.toText(long[], int). */
+ private final StringBuilder toText(long[] key, int k){return AbstractKmerTableU.toText(key, k);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- BuildThread ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Factory for this class's BuildThread.
+ * @param id Passed to the BuildThread constructor
+ * @param mode Passed to the BuildThread constructor
+ * @param crisa Passed to the BuildThread constructor
+ * @return A new BuildThread
+ */
+ @Override
+ BuildThread makeBuildThread(int id, int mode, ConcurrentReadInputStream[] crisa){
+ return new BuildThread(id, mode, crisa);
+ }
+
+ /**
+ * Builds contigs.
+ */
+ private class BuildThread extends AbstractBuildThread{
+
+ /** Passes all three arguments to the AbstractBuildThread constructor. */
+ public BuildThread(int id_, int mode_, ConcurrentReadInputStream[] crisa_){
+ super(id_, mode_, crisa_);
+ }
+
+ /**
+ * Thread body. With no input streams, contigs are seeded from the kmer tables in contigPasses passes,
+ * the earlier passes using a raised minimum seed count and the final pass using minCountSeed.
+ * Otherwise each input stream is started if necessary and its reads are processed.
+ */
+ @Override
+ public void run(){
+ if(crisa==null || crisa.length==0){
+ //Build from kmers
+
+ //Only the thread with id 0 prints the list of seed thresholds.
+ if(id==0){System.err.print("Seeding with min count = ");}
+ String comma="";
+ for(int i=contigPasses-1; i>0; i--){
+ //Threshold for pass i: at least minCountSeed+i, or the scaled value if larger, capped at Integer.MAX_VALUE.
+ minCountSeedCurrent=(int)Tools.min(Integer.MAX_VALUE, Tools.max(minCountSeed+i, (long)Math.floor((minCountSeed)*Math.pow(contigPassMult, i)*0.92-0.25) ));
+ if(id==0){
+ System.err.print(comma+minCountSeedCurrent);
+ comma=", ";
+ }
+ //The counters in nextTable[i] and nextVictims[i] hand out table numbers; loop until none remain for this pass.
+ while(processNextTable(nextTable[i])){}
+ while(processNextVictims(nextVictims[i])){}
+ }
+ //Final pass
+ minCountSeedCurrent=minCountSeed;
+ if(id==0){System.err.println(comma+minCountSeedCurrent);}
+ while(processNextTable(nextTable[0])){}
+ while(processNextVictims(nextVictims[0])){}
+ }else{
+ //Extend reads
+ for(ConcurrentReadInputStream cris : crisa){
+ //Locking on the array ensures each stream is started only once.
+ synchronized(crisa){
+ if(!cris.started()){
+ cris.start();
+ }
+ }
+ run(cris);
+ }
+ }
+ }
+
+ /**
+  * Takes the next table number from the counter and processes every cell of that table.
+  * @param aint Counter that hands out table numbers
+  * @return false if the number taken is not below tables.ways, otherwise true after the table has been processed
+  */
+ private boolean processNextTable(AtomicInteger aint){
+     final int tableIndex=aint.getAndIncrement();
+     if(tableIndex>=tables.ways){return false;}
+
+     final HashArrayU1D table=tables.getTable(tableIndex);
+     final int cells=table.arrayLength();
+     if(verbose && id==0){System.err.println("Processing table "+tableIndex+", size "+table.size()+", length "+cells);}
+
+     int cell=0;
+     while(cell<cells){
+         if(verbose && id==0){System.err.println("Processing cell "+cell);}
+         processCell(table, cell, myKmer);
+         cell++;
+     }
+     return true;
+ }
+
+ /**
+  * Takes the next table number from the counter and traverses every tree in that table's victim forest.
+  * @param aint Counter that hands out table numbers
+  * @return false if the number taken is not below tables.ways, otherwise true after the forest has been processed
+  */
+ private boolean processNextVictims(AtomicInteger aint){
+     final int tableIndex=aint.getAndIncrement();
+     if(tableIndex>=tables.ways){return false;}
+
+     final HashArrayU1D table=tables.getTable(tableIndex);
+     final HashForestU forest=table.victims();
+     if(verbose && id==0){System.err.println("Processing forest "+tableIndex+", size "+forest.size());}
+
+     final int cells=forest.arrayLength();
+     int cell=0;
+     while(cell<cells){
+         final KmerNodeU root=forest.getNode(cell);
+         traverseKmerNodeU(root);
+         cell++;
+     }
+     return true;
+ }
+
+ /**
+ * Tries to seed a contig from one cell of a table.
+ * @param table Table containing the cell
+ * @param cell Index of the cell
+ * @param kmer Kmer object handed to table.fillKmer; the returned Kmer is used from then on
+ * @return 0 if the cell's count is below minCountSeedCurrent, or if ownership is in use and the
+ * cell is already owned or could not be claimed by this thread; otherwise the result of processKmer
+ */
+ private int processCell(HashArrayU1D table, int cell, Kmer kmer){
+ int count=table.readCellValue(cell);
+ if(count<minCountSeedCurrent){
+ if(verbose){System.err.println("For cell "+cell+", count="+count);}
+ return 0;
+ }
+
+ kmer=table.fillKmer(cell, kmer);
+// assert(kmer.verify(false));
+// assert(kmer.verify(true));
+
+ if(verbose){outstream.println("id="+id+" processing cell "+cell+"; \tkmer="+kmer);}
+ if(useOwnership){
+ //Skip cells that already have an owner; otherwise try to claim, and give up if the resulting owner is not this thread.
+ int owner=table.getCellOwner(cell);
+ if(verbose){outstream.println("Owner is initially "+owner);}
+ if(owner>-1){return 0;}
+ owner=table.setOwner(kmer, id, cell);
+ if(verbose){outstream.println("Owner is now "+owner);}
+ if(owner!=id){return 0;}
+ }
+ return processKmer(kmer);
+ }
+
+ /**
+  * Processes a node, then its left subtree, then its right subtree.
+  * @param kn Root of the subtree; may be null
+  * @return Sum of the processKmerNodeU results over the subtree; 0 for a null node
+  */
+ private int traverseKmerNodeU(KmerNodeU kn){
+     if(kn==null){return 0;}
+     int total=processKmerNodeU(kn);
+     total+=traverseKmerNodeU(kn.left());
+     total+=traverseKmerNodeU(kn.right());
+     return total;
+ }
+
+ /**
+ * Tries to seed a contig from the pivot kmer of a single node.
+ * @param kn Node to process
+ * @return 0 if the pivot's count is below minCountSeedCurrent, or if ownership is in use and the
+ * pivot is already owned or could not be claimed by this thread; otherwise the result of processKmer
+ */
+ private int processKmerNodeU(KmerNodeU kn){
+ final long[] key=kn.pivot();
+ final int count=kn.getValue(key);
+ if(count<minCountSeedCurrent){return 0;}
+
+ if(verbose){outstream.println("id="+id+" processing KmerNodeU; \tkmer="+Arrays.toString(key)+"\t"+toText(key, ksmall));}
+ if(useOwnership){
+ //Skip kmers that already have an owner; otherwise try to claim, and give up if the resulting owner is not this thread.
+ int owner=kn.getOwner(key);
+ if(verbose){outstream.println("Owner is initially "+owner);}
+ if(owner>-1){return 0;}
+ owner=kn.setOwner(key, id);
+ if(verbose){outstream.println("Owner is now "+owner);}
+ if(owner!=id){return 0;}
+ }
+
+ //Load the key into this thread's reusable Kmer object before building.
+ myKmer.setFrom(key);
+ return processKmer(myKmer);
+ }
+
+ /**
+  * Builds a contig from a seed kmer and, if its coverage is at least minCoverage, records it.
+  * The contig header has the form contig_N,length=L,cov=C,gc=G. The numbers are formatted with
+  * Locale.ROOT so the decimal separator is always '.', because the fields are comma-separated.
+  * @param kmer Seed kmer, passed to makeContig as already claimed
+  * @return Length of the contig added, or 0 if no contig was built or its coverage was too low
+  */
+ private int processKmer(Kmer kmer){
+
+     final byte[] contig=makeContig(builderT, kmer, true);
+     if(contig==null){
+         if(verbose){System.err.println("Created null contig.");}
+         return 0;
+     }
+
+     final float coverage=tables.calcCoverage(contig, contig.length, kmer);
+     if(coverage<minCoverage){return 0;}
+     if(verbose){System.err.println("Added "+contig.length);}
+
+     final long num=contigNum.incrementAndGet();
+     final Read r=new Read(contig, -1, -1, -1, "*", null, num, 0);
+     final float gc=r.gc();
+     //A default-locale String.format could emit ',' as the decimal separator and corrupt the header.
+     r.id="contig_"+num+",length="+contig.length
+             +",cov="+String.format(java.util.Locale.ROOT, "%.1f", coverage)
+             +",gc="+String.format(java.util.Locale.ROOT, "%.3f", gc);
+     contigs.add(r);
+     return contig.length;
+ }
+
+ /**
+  * Fetches read lists from the stream until a missing or empty list is returned,
+  * processing every read (with its mate) in each list.
+  * @param cris Input stream to drain
+  */
+ private void run(ConcurrentReadInputStream cris){
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     //While there are more reads lists...
+     while(reads!=null && reads.size()>0){
+
+         //For each read (or pair) in the list...
+         for(int i=0; i<reads.size(); i++){
+             final Read r1=reads.get(i);
+             final Read r2=r1.mate;
+
+             processReadPair(r1, r2);
+         }
+
+         //Fetch a new read list
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     //The loop ends when ln is null, ln.list is null, or the list is empty.
+     //Only the last case has a list to inspect, so guard both dereferences.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+ }
+
+ /**
+  * Processes one read and its optional mate.
+  * Updates the read and base counters. In insert mode, records an insert size and returns.
+  * Otherwise optionally runs overlap-based correction on the pair, then tries to build a
+  * contig from each read that is not discarded.
+  * @param r1 First read
+  * @param r2 Mate of r1, or null
+  */
+ private void processReadPair(Read r1, Read r2){
+     if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+     readsInT++;
+     basesInT+=r1.length();
+     if(r2!=null){
+         readsInT++;
+         basesInT+=r2.length();
+     }
+
+     if(mode==insertMode){
+         //Prefer the overlap-based estimate; fall back to findInsertSize when there is none.
+         int x=BBMerge.findOverlapStrict(r1, r2, false);
+         if(x<1){
+             x=findInsertSize(r1, r2, rightCounts, myKmer, myKmer2);
+         }
+         insertSizes.increment(Tools.max(x, 0));
+         return;
+     }
+
+     if(ecco && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){BBMerge.findOverlapStrict(r1, r2, true);}
+
+     buildContigFromRead(r1);
+     buildContigFromRead(r2);
+ }
+
+ /**
+  * Counts a discarded read as low quality; otherwise tries to build a contig seeded by the read
+  * and adds it to the contig list. The read's own numericID is passed to makeContig.
+  * @param r Read to use as a seed; null is ignored
+  */
+ private void buildContigFromRead(final Read r){
+     if(r==null){return;}
+     if(r.discarded()){
+         lowqBasesT+=r.length();
+         lowqReadsT++;
+         return;
+     }
+     final byte[] contig=makeContig(r.bases, builderT, r.numericID, myKmer);
+     if(contig==null){return;}
+     if(verbose){System.err.println("Added "+contig.length);}
+     final long num=contigNum.incrementAndGet();
+     final Read temp=new Read(contig, -1, -1, -1, "contig_"+num+"_length_"+contig.length, null, num, 0);
+     contigs.add(temp);
+ }
+
+ /**
+ * From kmers.
+ * Builds a contig by loading the seed kmer into bb, extending it to the right, reverse-complementing,
+ * and extending to the right again.
+ * @param bb Scratch builder; overwritten with the contig
+ * @param kmer Seed kmer; also handed to the extension, claim and release calls
+ * @param alreadyClaimed If true, no claim on the seed is attempted here
+ * @return The contig bases, or null if the seed could not be claimed, an extension ended in a state
+ * that invalidates the contig, the final claim failed, or the result is too short
+ */
+ private byte[] makeContig(final ByteBuilder bb, Kmer kmer, boolean alreadyClaimed){
+ bb.setLength(0);
+ bb.appendKmer(kmer);
+ if(verbose){outstream.println("Filled bb: "+bb);}
+
+ final int initialLength=bb.length();
+ assert(initialLength==kbig);
+ if(initialLength<kbig){return null;}
+
+ boolean success=(alreadyClaimed || !useOwnership ? true : claim(kmer, id));
+ if(verbose){System.err.println("Thread "+id+" checking owner after setting: "+findOwner(bb, id, kmer));}
+ if(!success){
+ assert(bb.length()==kbig);
+// release(bb, id); //no need to release
+ return null;
+ }
+ if(verbose /*|| true*/){System.err.println("Thread "+id+" building contig; initial length "+bb.length());}
+ if(verbose){System.err.println("Extending to right.");}
+
+ {
+ //First extension. DEAD_END and LOOP keep whatever was built. BAD_SEED releases the seed and aborts.
+ //BAD_OWNER always aborts. BRANCH aborts only if nothing was added beyond the seed.
+ final int status=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+
+ if(status==DEAD_END){
+ //do nothing
+ }else if(status==LOOP){//TODO
+ //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+ //Perhaps, the last kmer should be reclassified as a junction and removed.
+ }else if(status==BAD_SEED){
+ assert(bb.length()==kbig);
+ release(kmer, id);
+ return null;
+ }else{
+ if(bb.length()==kbig){
+ if(status==BAD_OWNER){
+ release(kmer, id);
+ return null;
+ }else if(status==BRANCH){
+ release(kmer, id);
+ return null;
+ }else{
+ throw new RuntimeException("Bad return value: "+status);
+ }
+ }else{
+ if(status==BAD_OWNER){
+ release(bb, id, kmer);
+ return null;
+ }else if(status==BRANCH){
+ //do nothing
+ }else{
+ throw new RuntimeException("Bad return value: "+status);
+ }
+ }
+ }
+ }
+
+// success=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+// if(!success){
+// release(bb, id, kmer);
+// return null;
+// }
+ //Flip the sequence so the original left end can be extended with the same rightward routine.
+ bb.reverseComplementInPlace();
+ if(verbose /*|| true*/){System.err.println("Extending rcomp to right; current length "+bb.length());}
+
+ {
+ //Second extension. Unlike the first, BRANCH is always accepted here and BAD_SEED is asserted to be impossible.
+ final int status=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+
+ if(status==DEAD_END){
+ //do nothing
+ }else if(status==LOOP){//TODO
+ //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+ //Perhaps, the last kmer should be reclassified as a junction and removed.
+ }else if(status==BAD_SEED){
+ assert(false) : bb;//This should never happen.
+ assert(bb.length()==kbig);
+ release(kmer, id);
+ return null;
+ }else{
+ if(status==BAD_OWNER){
+ release(bb, id, kmer);
+ return null;
+ }else if(status==BRANCH){
+ //do nothing
+ }else{
+ throw new RuntimeException("Bad return value: "+status);
+ }
+ }
+ }
+// success=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+// if(!success){
+// release(bb, id, kmer);
+// return null;
+// }
+ if(verbose /*|| true*/){System.err.println("Final length for thread "+id+": "+bb.length());}
+ // if(useOwnership && THREADS==1){assert(claim(bases, bases.length, id, rid));}
+ success=(useOwnership ? doubleClaim(bb, id, kmer) : true);
+ if(verbose /*|| true*/){System.err.println("Success for thread "+id+": "+success);}
+
+ if(trimEnds>0){bb.trimByAmount(trimEnds, trimEnds);}
+ //Keep the contig only if it grew by at least minExtension beyond the seed and reaches minContigLen.
+ if(bb.length()>=initialLength+minExtension && bb.length()>=minContigLen){
+ if(success){
+ //Undo the earlier reverse complement so the result is in the seed's orientation.
+ bb.reverseComplementInPlace();
+ return bb.toBytes();
+ }else{
+ // assert(false) : bb.length()+", "+id;
+ release(bb, id, kmer);
+ return null;
+ }
+ }
+ //NOTE(review): a too-short contig returns null without releasing its kmers - confirm this is intended.
+ if(verbose /*|| true*/){System.err.println("Contig was too short for "+id+": "+bb.length());}
+ return null;
+ }
+
+ /**
+ * Builds a contig from a seed sequence.
+ * The seed is copied into bb, extended to the right, reverse-complemented, extended to the right again
+ * (which grows the other end of the original orientation), and reverse-complemented back before being returned.
+ * When useOwnership is set, claim() is called on the seed first and doubleClaim() on the finished contig.
+ * @param bases Seed bases; null or shorter than kbig yields null.
+ * @param bb Scratch buffer; its previous contents are discarded.
+ * @param rid Not read anywhere in this method body.
+ * @param kmer Scratch kmer passed to the ownership and extension helpers.
+ * @return The contig bases, or null if the seed was unusable, ownership could not be obtained, or the result was too short.
+ */
+ private byte[] makeContig(final byte[] bases, final ByteBuilder bb, long rid, final Kmer kmer){
+ if(bases==null || bases.length<kbig){return null;}
+// if(verbose /*|| true*/){System.err.println("Thread "+id+" checking owner: "+findOwner(bases, bases.length, id));}
+ //Without ownership tracking, owner is -1 and the check below compares -1 against id.
+ int owner=useOwnership ? findOwner(bases, bases.length, id, kmer) : -1;
+ if(owner>=id){return null;}
+ boolean success=(useOwnership ? claim(bases, bases.length, id, true, kmer) : true);
+ if(verbose /*|| true*/){System.err.println("Thread "+id+" checking owner after setting: "+findOwner(bases, bases.length, id, kmer));}
+ if(!success){
+ //The claim failed; undo it before giving up on this seed.
+ release(bases, bases.length, id, kmer);
+ return null;
+ }
+ if(verbose /*|| true*/){System.err.println("Thread "+id+" building contig; initial length "+bases.length);}
+ bb.setLength(0);
+ bb.append(bases);
+ if(verbose){System.err.println("Extending to right.");}
+ {
+ //First pass: extend the seed in its original orientation.
+ final int status=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+
+ if(status==DEAD_END){
+ //do nothing
+ }else if(status==LOOP){//TODO
+ //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+ //Perhaps, the last kmer should be reclassified as a junction and removed.
+ }else if(status==BAD_SEED){
+ //do nothing
+ }else{
+ if(status==BAD_OWNER){
+ //Another owner was encountered; release everything built so far and abandon the contig.
+ release(bb.array, bb.length(), id, kmer);
+ return null;
+ }else if(status==BRANCH){
+ //do nothing
+ }else{
+ throw new RuntimeException("Bad return value: "+status);
+ }
+ }
+ }
+// success=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+// if(!success){
+// release(bb.array, bb.length(), id, kmer);
+// return null;
+// }
+ //Flip the contig so that the second extendToRight call grows the opposite end.
+ bb.reverseComplementInPlace();
+ if(verbose /*|| true*/){System.err.println("Extending rcomp to right; current length "+bb.length());}
+ {
+ //Second pass: same status handling as the first pass.
+ final int status=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+
+ if(status==DEAD_END){
+ //do nothing
+ }else if(status==LOOP){//TODO
+ //special case - handle specially, for a loop with no obvious junction, e.g. long tandem repeat.
+ //Perhaps, the last kmer should be reclassified as a junction and removed.
+ }else if(status==BAD_SEED){
+ //do nothing
+ }else{
+ if(status==BAD_OWNER){
+ release(bb.array, bb.length(), id, kmer);
+ return null;
+ }else if(status==BRANCH){
+ //do nothing
+ }else{
+ throw new RuntimeException("Bad return value: "+status);
+ }
+ }
+ }
+// success=extendToRight(bb, leftCounts, rightCounts, id, kmer);
+// if(!success){
+// release(bb.array, bb.length(), id, kmer);
+// return null;
+// }
+ if(verbose /*|| true*/){System.err.println("Final length for thread "+id+": "+bb.length());}
+ // if(useOwnership && THREADS==1){assert(claim(bases, bases.length, id, rid));}
+ success=(useOwnership ? doubleClaim(bb, id, kmer) : true);
+ if(verbose /*|| true*/){System.err.println("Success for thread "+id+": "+success);}
+ //Keep the contig only if it grew by at least minExtension and reaches minContigLen.
+ if(bb.length()>=bases.length+minExtension && bb.length()>=minContigLen){
+ if(success){
+ //Undo the earlier reverse-complement so the result is in the seed's orientation.
+ bb.reverseComplementInPlace();
+ return bb.toBytes();
+ }else{
+ // assert(false) : bb.length()+", "+id;
+ release(bb.array, bb.length(), id, kmer);
+ return null;
+ }
+ }
+ //NOTE(review): nothing is released on this path, even when doubleClaim failed - confirm that this is intended.
+ if(verbose /*|| true*/){System.err.println("Contig was too short for "+id+": "+bb.length());}
+ return null;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Kmer instance constructed for length kbig; presumably reused as scratch space by this worker - confirm against its callers. */
+ private final Kmer myKmer=new Kmer(kbig);
+ /** Second Kmer instance constructed for length kbig; presumably scratch space as well - confirm against its callers. */
+ private final Kmer myKmer2=new Kmer(kbig);
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Extension Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Estimates the insert size of a read pair from the kmer graph.
+ * Takes the rightmost kmer of each read, measures the walk between them with measureInsert
+ * (limit 24000), and combines that distance with the two read lengths.
+ * @param r1 First read.
+ * @param r2 Second read.
+ * @param rightCounts Scratch array handed to measureInsert.
+ * @param kmer1 Scratch kmer used for r1.
+ * @param kmer2 Scratch kmer used for r2.
+ * @return r1.length()+r2.length()+distance-kbig, or -1 if either kmer is unavailable or no distance was found.
+ */
+ public int findInsertSize(Read r1, Read r2, int[] rightCounts, Kmer kmer1, Kmer kmer2){
+ final Kmer tail1=tables.rightmostKmer(r1.bases, r1.length(), kmer1);
+ final Kmer tail2=tables.rightmostKmer(r2.bases, r2.length(), kmer2);
+ if(tail1!=null && tail2!=null){
+ final int gap=measureInsert(tail1, tail2, 24000, rightCounts);
+ if(gap>=0){
+ return r1.length()+r2.length()+gap-kbig;//TODO: May be off by 1.
+ }
+ }
+ return -1;
+ }
+
+ /* (non-Javadoc)
+ * @see assemble.Tadpole#extendRead(stream.Read, stream.ByteBuilder, int[], int[], int)
+ *
+ * Convenience overload that supplies the calling thread's scratch kmer.
+ * Returns the number of bases appended to the read.
+ */
+ @Override
+ public int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance) {
+ //Make sure localKmer has been populated for this thread, as the other thread-local entry points
+ //(extendToRight2, errorCorrect) do before reading it.
+ initializeThreadLocals();
+ return extendRead(r, bb, leftCounts, rightCounts, distance, localKmer.get());
+ }
+
+ /**
+ * Extends a read to the right by up to 'distance' bases using the kmer graph.
+ * On success the read's bases are replaced by the extended sequence and, if the read has qualities,
+ * the new positions are filled with Shared.FAKE_QUAL.
+ * @param r Read to extend; modified in place when any bases are added.
+ * @param bb Scratch buffer; its previous contents are discarded.
+ * @param leftCounts Scratch array for left-extension counts, or null.
+ * @param rightCounts Scratch array for right-extension counts.
+ * @param distance Maximum number of bases to add.
+ * @param kmer Scratch kmer.
+ * @return Number of bases added; 0 if the read is shorter than kbig or has no usable rightmost kmer.
+ */
+ public int extendRead(Read r, ByteBuilder bb, int[] leftCounts, int[] rightCounts, int distance, final Kmer kmer){
+ final int initialLen=r.length();
+ if(initialLen<kbig){return 0;}
+ bb.setLength(0);
+ bb.append(r.bases);
+ //Use the kmer returned by rightmostKmer and require full length, as extendToRight and extendToRight2 do;
+ //extendToRight2_inner only guards this with an assertion.
+ final Kmer rightmost=tables.rightmostKmer(bb, kmer);
+ if(rightmost==null || rightmost.len<kbig){return 0;}
+ final int extension=extendToRight2_inner(bb, leftCounts, rightCounts, distance, true, rightmost);
+ if(extension>0){
+ r.bases=bb.toBytes();
+ if(r.quality!=null){
+ //Grow the quality array to match and give the appended bases a placeholder quality.
+ r.quality=Arrays.copyOf(r.quality, r.bases.length);
+ Arrays.fill(r.quality, initialLen, r.quality.length, Shared.FAKE_QUAL);
+ }
+ }
+ assert(extension==r.length()-initialLen);
+ return extension;
+ }
+
+ /**
+ * Returns distance between the two kmers, or -1.
+ * Walks right from kmer1, always following the highest-count extension, until kmer2 is reached.
+ * kmer1 is advanced in place, so it no longer holds its original value on return.
+ * @param kmer1 Start kmer; mutated by the walk.
+ * @param kmer2 Target kmer.
+ * @param maxlen Walk limit; reaching it yields -1.
+ * @param rightCounts Scratch array filled by fillRightCounts.
+ * @return Number of bases appended to kmer1 before it equaled kmer2, or -1 if either kmer has a count below
+ * minCountSeed, kmer1 cannot be extended, the walk dead-ends before reaching kmer2, or the limit is hit.
+ */
+ public int measureInsert(final Kmer kmer1, final Kmer kmer2, final int maxlen, final int[] rightCounts){
+ int len=0;
+
+ {
+ int count=tables.getCount(kmer2);
+ if(count<minCountSeed){return -1;}
+ }
+
+ int count=tables.getCount(kmer1);
+ //A duplicate silent check used to precede this one and made the verbose message unreachable.
+ if(count<minCountSeed){
+ if(verbose){outstream.println("Returning because count was too low: "+count);}
+ return -1;
+ }
+
+ int rightMaxPos=fillRightCounts(kmer1, rightCounts);
+ int rightMax=rightCounts[rightMaxPos];
+// int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+// int rightSecond=rightCounts[rightSecondPos];
+
+ if(rightMax<minCountExtend){return -1;}
+// if(isJunction(rightMax, rightSecond)){return -1;}
+
+ while(!kmer1.equals(kmer2) && len<maxlen){
+
+ //Generate the new kmer
+// final byte b=AminoAcid.numberToBase[rightMaxPos];
+ final long x=rightMaxPos;
+ kmer1.addRightNumeric(x);
+
+ assert(tables.getCount(kmer1)==rightMax);
+ count=rightMax;
+
+ assert(count>=minCountExtend) : count;
+
+ //kmer1 has moved one base, so count the step now; a later dead end must not lose it.
+ len++;
+
+ rightMaxPos=fillRightCounts(kmer1, rightCounts);
+ rightMax=rightCounts[rightMaxPos];
+// rightSecondPos=Tools.secondHighestPosition(rightCounts);
+// rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+kmer1);
+ outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts));
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+// outstream.println("rightSecondPos="+rightSecondPos);
+// outstream.println("rightSecond="+rightSecond);
+ }
+
+ if(rightMax<minCountExtend){
+ if(verbose){outstream.println("Breaking because highest right was too low:"+rightMax);}
+ break;
+ }
+
+// if(isJunction(rightMax, rightSecond)){return -1;}
+ }
+ //A dead end before reaching kmer2 is a failure, not a distance.
+ if(!kmer1.equals(kmer2)){return -1;}
+ return len>=maxlen ? -1 : len;
+ }
+
+
+
+ /**
+ * Extend these bases into a contig.
+ * Stops at both left and right junctions.
+ * Claims ownership.
+ * Bases are appended to bb one at a time, always following the highest-count right extension.
+ * @param bb Sequence to extend; appended to in place.
+ * @param leftCounts Scratch array for left-extension counts, or null to skip left-side checks.
+ * @param rightCounts Scratch array for right-extension counts.
+ * @param id Id of the calling thread, used as the owner value.
+ * @param kmer Scratch kmer; cleared and re-initialized from the end of bb.
+ * @return BAD_SEED if bb has no usable trailing kmer or its count is below minCountSeed;
+ * BAD_OWNER if a kmer belongs to a different owner (also returned when the loop falls through);
+ * DEAD_END if the best right extension is below minCountExtend;
+ * BRANCH if a junction or hidden branch is detected;
+ * LOOP if the next kmer is already owned by this thread.
+ */
+ public int extendToRight(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int id, Kmer kmer){
+ if(bb.length()<kbig){return BAD_SEED;}
+ kmer.clear();
+
+ kmer=tables.rightmostKmer(bb, kmer);
+ if(kmer==null || kmer.len<kbig){return BAD_SEED;}
+ assert(kmer.len==kbig);
+
+ /* Now the trailing kmer has been initialized. */
+
+ if(verbose){
+ System.err.println("extendToRight kmer="+kmer+", bb="+bb);
+ }
+
+ HashArrayU1D table=tables.getTable(kmer);
+ int count=table.getValue(kmer);
+ if(count<minCountSeed){
+ if(verbose){outstream.println("Returning because count was too low: "+count);}
+ return BAD_SEED;
+ }
+
+ //Without ownership tracking the owner is treated as this thread.
+ int owner=(useOwnership ? table.getOwner(kmer) : id);
+ if(verbose){outstream.println("Owner: "+owner);}
+ if(owner>id){return BAD_OWNER;}
+
+ //Defaults used when leftCounts is null: leftMax equals minCountExtend and leftSecond is 0.
+ int leftMaxPos=0;
+ int leftMax=minCountExtend;
+ int leftSecondPos=1;
+ int leftSecond=0;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, leftCounts);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ int rightMaxPos=fillRightCounts(kmer, rightCounts);
+ int rightMax=rightCounts[rightMaxPos];
+ int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ int rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer));
+ outstream.println("Counts: "+count+", "+(leftCounts==null ? "null" : Arrays.toString(leftCounts))+", "+Arrays.toString(rightCounts));
+ outstream.println("leftMaxPos="+leftMaxPos);
+ outstream.println("leftMax="+leftMax);
+ outstream.println("leftSecondPos="+leftSecondPos);
+ outstream.println("leftSecond="+leftSecond);
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ if(rightMax<minCountExtend){return DEAD_END;}
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){return BRANCH;}
+
+ if(useOwnership){
+ //Claim the trailing kmer; setOwner returns the owner now recorded for it.
+ owner=table.setOwner(kmer, id);
+ if(verbose){outstream.println("A. Owner is now "+id+" for kmer "+kmer);}
+ if(owner!=id){
+ if(verbose){outstream.println("Returning early because owner was "+owner+" for thread "+id+".");}
+ return BAD_OWNER;
+ }
+ }
+
+ //A negative extendRight means no per-call limit; maxContigLen always applies.
+ final int maxLen=Tools.min((extendRight<0 ? maxContigLen : bb.length()+extendRight), maxContigLen);
+
+ while(owner==id && bb.length()<maxLen){
+
+ //Generate the new kmer
+ final byte b=AminoAcid.numberToBase[rightMaxPos];
+
+ //Now consider the next kmer
+ final long evicted=kmer.addRightNumeric(rightMaxPos);
+
+ table=tables.getTable(kmer);
+
+ assert(table.getValue(kmer)==rightMax);
+ count=rightMax;
+
+ assert(count>=minCountExtend) : count;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, leftCounts);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ rightMaxPos=fillRightCounts(kmer, rightCounts);
+ rightMax=rightCounts[rightMaxPos];
+ rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer));
+ outstream.println("Counts: "+count+", "+(leftCounts==null ? "null" : Arrays.toString(leftCounts))+", "+Arrays.toString(rightCounts));
+ outstream.println("leftMaxPos="+leftMaxPos);
+ outstream.println("leftMax="+leftMax);
+ outstream.println("leftSecondPos="+leftSecondPos);
+ outstream.println("leftSecond="+leftSecond);
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ //The junction checks run before the base is appended, so the junction base is not added to bb.
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){
+ if(verbose){outstream.println("B: Breaking because isJunction("+rightMax+", "+rightSecond+", "+leftMax+", "+leftSecond+")");}
+ return BRANCH;
+ }
+
+ //The best left extension of the new kmer should be the base just shifted out; otherwise another path leads here.
+ if(leftCounts!=null && leftMaxPos!=evicted){
+ if(verbose){outstream.println("B: Breaking because of hidden branch: leftMaxPos!=evicted ("+leftMaxPos+"!="+evicted+")" +
+ "\nleftMaxPos="+leftMaxPos+", leftMax="+leftMax+", leftSecondPos="+leftSecondPos+", leftSecond="+leftSecond);}
+ return BRANCH;
+ }
+
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+
+ if(useOwnership){
+ owner=table.getOwner(kmer);
+ if(verbose){outstream.println("Owner is initially "+id+" for key "+kmer);}
+ //Finding our own id on a kmer we have not claimed in this step means the path has circled back.
+ if(owner==id){//loop detection
+ if(verbose /*|| true*/){
+// outstream.println(new String(bb.array, bb.length()-31, 31));
+ outstream.println(bb);
+ outstream.println(toText(kmer));
+ outstream.println("Breaking because owner was "+owner+" for thread "+id+".");
+ }
+ return LOOP;
+ }
+ owner=table.setOwner(kmer, id);
+ if(verbose){outstream.println("B. Owner is now "+id+" for kmer "+kmer);}
+ }
+
+ if(rightMax<minCountExtend){
+ if(verbose){outstream.println("B: Breaking because highest right was too low:"+rightMax);}
+ return DEAD_END;
+ }
+ }
+ //NOTE(review): the loop also exits when bb.length() reaches maxLen while owner==id. In that case the
+ //assertion below fails (when enabled) and BAD_OWNER is returned - confirm whether a distinct status is wanted.
+ assert(owner!=id);
+ if(verbose /*|| true*/){
+ outstream.println("Current contig: "+bb+"\nReturning because owner was "+owner+" for thread "+id+".");
+ }
+ return BAD_OWNER;
+ }
+
+ /**
+ * Convenience overload of extendToRight2 that uses the calling thread's scratch kmer.
+ * Calls initializeThreadLocals() first and then reads localKmer.
+ * @return Number of bases appended to bb.
+ */
+ @Override
+ public int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase){
+ initializeThreadLocals();
+ return extendToRight2(bb, leftCounts, rightCounts, distance, includeJunctionBase, localKmer.get());
+ }
+
+ /**
+ * Extends bb to the right by at most 'distance' bases, without claiming ownership.
+ * Initializes the trailing kmer from bb and delegates to extendToRight2_inner.
+ * @param bb Sequence to extend; appended to in place.
+ * @param leftCounts Scratch array for left-extension counts, or null.
+ * @param rightCounts Scratch array for right-extension counts.
+ * @param distance Maximum number of bases to add.
+ * @param includeJunctionBase Passed through to extendToRight2_inner.
+ * @param kmer Scratch kmer; cleared before use.
+ * @return Number of bases appended; 0 if bb is shorter than kbig or has no full-length trailing kmer.
+ */
+ @Override
+ public int extendToRight2(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase, Kmer kmer){
+ if(verbose || verbose2){outstream.println("Entering extendToRight2 (no kmers).");}
+ if(bb.length()<kbig){return 0;}
+
+ kmer.clear();
+ final Kmer tail=tables.rightmostKmer(bb, kmer);
+ final boolean usable=(tail!=null && tail.len>=kbig);
+ if(!usable){return 0;}
+ assert(tail.len==kbig);
+
+ return extendToRight2_inner(bb, leftCounts, rightCounts, distance, includeJunctionBase, tail);
+ }
+
+ /**
+ * Extend these bases to the right by at most 'distance'.
+ * Stops at right junctions only.
+ * Does not claim ownership.
+ * When leftCounts is non-null, left-extension counts are also passed to isJunction and a hidden-branch check is applied.
+ * @param bb Buffer that receives the appended bases; only its length is read on entry.
+ * @param leftCounts Scratch array for left-extension counts, or null.
+ * @param rightCounts Scratch array for right-extension counts.
+ * @param distance Maximum number of bases to add; the total length is also capped by maxContigLen.
+ * @param includeJunctionBase If true, the base leading into a junction may still be appended (see the key()/array2() test below).
+ * @param kmer Trailing kmer to extend from; must already have length kbig. Advanced in place.
+ * @return Number of bases appended to bb; 0 if the starting kmer is below minCountSeed, cannot be extended, or is at a junction.
+ */
+ private int extendToRight2_inner(final ByteBuilder bb, final int[] leftCounts, final int[] rightCounts, final int distance, boolean includeJunctionBase, Kmer kmer){
+ if(verbose || verbose2){outstream.println("Entering extendToRight2_inner (with kmers).");}
+ final int initialLength=bb.length();
+ assert(kmer.len==kbig) : kmer.len+", "+kbig+", "+bb.length();
+
+ HashArrayU1D table=tables.getTable(kmer);
+ int count=table.getValue(kmer);
+ if(count<minCountSeed){
+ if(verbose || verbose2){outstream.println("Returning because count was too low: "+count+"<"+minCountSeed);}
+ return 0;
+ }
+
+ //Defaults used when leftCounts is null: leftMax equals minCountExtend and leftSecond is 0.
+ int leftMaxPos=0;
+ int leftMax=minCountExtend;
+ int leftSecondPos=1;
+ int leftSecond=0;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, leftCounts);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ int rightMaxPos=fillRightCounts(kmer, rightCounts);
+ int rightMax=rightCounts[rightMaxPos];
+ int rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ int rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer));
+ outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts));
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ if(rightMax<minCountExtend){
+ if(verbose || verbose2){outstream.println("Returning because rightMax was too low: "+rightMax+"<"+minCountExtend+"\n"+count+", "+Arrays.toString(rightCounts));}
+ return 0;
+ }
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){
+ if(verbose || verbose2){outstream.println("Returning because isJunction: "+rightMax+", "+rightSecond+"; "+leftMax+", "+leftSecond);}
+ return 0;
+ }
+
+ final int maxLen=Tools.min(bb.length()+distance, maxContigLen);
+
+ while(bb.length()<maxLen){
+
+ //Generate the new kmer
+ final byte b=AminoAcid.numberToBase[rightMaxPos];
+
+ //Now consider the next kmer
+ final long evicted=kmer.addRightNumeric(rightMaxPos);
+
+ table=tables.getTable(kmer);
+
+ assert(table.getValue(kmer)==rightMax);
+ count=rightMax;
+
+ assert(count>=minCountExtend) : count;
+
+ if(leftCounts!=null){
+ leftMaxPos=fillLeftCounts(kmer, leftCounts);
+ leftMax=leftCounts[leftMaxPos];
+ leftSecondPos=Tools.secondHighestPosition(leftCounts);
+ leftSecond=leftCounts[leftSecondPos];
+ }
+
+ rightMaxPos=fillRightCounts(kmer, rightCounts);
+ rightMax=rightCounts[rightMaxPos];
+ rightSecondPos=Tools.secondHighestPosition(rightCounts);
+ rightSecond=rightCounts[rightSecondPos];
+
+ if(verbose){
+ outstream.println("kmer: "+toText(kmer));
+ outstream.println("Counts: "+count+", "+Arrays.toString(rightCounts));
+ outstream.println("rightMaxPos="+rightMaxPos);
+ outstream.println("rightMax="+rightMax);
+ outstream.println("rightSecondPos="+rightSecondPos);
+ outstream.println("rightSecond="+rightSecond);
+ }
+
+ if(isJunction(rightMax, rightSecond, leftMax, leftSecond)){
+ //key()==array2() is a reference comparison; presumably true when the forward orientation is the
+ //canonical key - confirm in ukmer.Kmer.
+ if(includeJunctionBase && kmer.key()==kmer.array2()){//TODO: Does not work on palindromes.
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+ }
+ break;
+ }
+
+ //The best left extension of the new kmer should be the base just shifted out; otherwise another path leads here.
+ if(leftCounts!=null && leftMaxPos!=evicted){
+ if(verbose){outstream.println("B: Breaking because of hidden branch: leftMaxPos!=evicted ("+leftMaxPos+"!="+evicted+")" +
+ "\nleftMaxPos="+leftMaxPos+", leftMax="+leftMax+", leftSecondPos="+leftSecondPos+", leftSecond="+leftSecond);}
+ if(includeJunctionBase && kmer.key()==kmer.array2()){//TODO: Does not work on palindromes.
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+ }
+ break;
+ }
+
+ bb.append(b);
+ if(verbose){outstream.println("Added base "+(char)b);}
+
+ //The base was already appended; stop because the new kmer has no sufficiently supported extension.
+ if(rightMax<minCountExtend){
+ if(verbose || verbose2){outstream.println("C: Breaking because highest right was too low: "+rightMax+"<"+minCountExtend);}
+ break;
+ }
+ }
+ if(verbose || verbose2){System.err.println("Extended by "+(bb.length()-initialLength));}
+ return bb.length()-initialLength;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Error Correction ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Error-corrects a read using the calling thread's scratch buffers.
+ * No detection array is supplied, so per-category statistics are not reported.
+ * @param r Read to correct.
+ * @return Number of bases corrected.
+ */
+ @Override
+ public int errorCorrect(Read r){
+ initializeThreadLocals();
+ return errorCorrect(
+ r,
+ localLeftCounts.get(),
+ localRightCounts.get(),
+ localIntList.get(),
+ localByteBuilder.get(),
+ null,
+ localBitSet.get(),
+ localKmer.get());
+ }
+
+ /**
+ * Overload matching the inherited signature; the kmers argument is ignored and the call is
+ * forwarded to the variant without it.
+ * @return Number of bases corrected.
+ */
+ @Override
+ public int errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, LongList kmers, IntList counts,
+ final ByteBuilder bb, final int[] detectedArray, final BitSet bs, Kmer kmer){
+ return errorCorrect(r, leftCounts, rightCounts, counts, bb, detectedArray, bs, kmer);
+ }
+
+ /**
+ * Corrects errors in a read using kmer counts.
+ * Runs the pincer pass if ECC_PINCER is set, the tail pass on both orientations of the read if ECC_TAIL or
+ * ECC_ALL is set, and marks bad bases if MARK_BAD_BASES is positive.
+ * @param r Read to correct; its bases are modified in place.
+ * @param leftCounts Scratch array passed to the correction passes.
+ * @param rightCounts Scratch array passed to the correction passes.
+ * @param counts Scratch list that receives the kmer counts of the read.
+ * @param bb Scratch buffer.
+ * @param detectedArray Optional statistics output, may be null: [0] errors detected, [1] corrected by pincer,
+ * [2] corrected by tail, [3] bases marked bad.
+ * @param bs Scratch bit set passed to markBadBases.
+ * @param kmer Scratch kmer.
+ * @return Number of bases corrected (pincer plus tail); 0 if fewer than 2 valid kmers were found.
+ */
+ public int errorCorrect(Read r, final int[] leftCounts, final int[] rightCounts, IntList counts,
+ final ByteBuilder bb, final int[] detectedArray, final BitSet bs, final Kmer kmer){
+ final byte[] bases=r.bases;
+ final byte[] quals=r.quality;
+ if(detectedArray!=null){
+ detectedArray[0]=0;
+ detectedArray[1]=0;
+ detectedArray[2]=0;
+ detectedArray[3]=0;
+ }
+ int valid=tables.fillCounts(bases, counts, kmer);
+ if(valid<2){return 0;}
+ int correctedPincer=0;
+ int correctedTail=0;
+
+ if(ECC_PINCER){
+ correctedPincer+=errorCorrectPincer(bases, quals, leftCounts, rightCounts, counts, bb, detectedArray, errorExtensionPincer, kmer);
+ }
+
+ if(ECC_TAIL || ECC_ALL){
+ int start=(ECC_ALL ? 0 : counts.size-kbig-1);
+// if(ECC_PINCER && detectedArray!=null && detectedArray[0]>correctedPincer){start=start-kbig;}
+ correctedTail+=errorCorrectTail(bases, quals, leftCounts, rightCounts, counts, bb, detectedArray, start, errorExtensionTail, kmer);
+ //Repeat on the reverse complement so the other end of the read is treated as a tail, then restore the orientation.
+ r.reverseComplement();
+ counts.reverse();
+ correctedTail+=errorCorrectTail(bases, quals, leftCounts, rightCounts, counts, bb, detectedArray, start, errorExtensionTail, kmer);
+ r.reverseComplement();
+ counts.reverse();
+ }
+
+ if(MARK_BAD_BASES>0){
+ int marked=markBadBases(bases, quals, counts, bs, MARK_BAD_BASES, MARK_DELTA_ONLY);
+ //detectedArray is optional (errorCorrect(Read) passes null), so only record the statistic when it is present.
+ if(detectedArray!=null){detectedArray[3]=marked;}
+ }
+ assert(detectedArray==null || (correctedPincer==detectedArray[1] && correctedTail==detectedArray[2])) : correctedPincer+", "+correctedTail+", "+Arrays.toString(detectedArray);
+// if(ECC_PINCER && correctedTail>0){
+// valid=fillKmers(bases, kmers);
+// counts.reverse();
+// correctedPincer+=errorCorrectPincer(bases, quals, leftCounts, rightCounts, kmers, counts, bb, detectedArray, errorExtensionPincer);
+// }
+
+ return correctedPincer+correctedTail;
+ }
+
+ /**
+ * Looks for isolated errors flanked on both sides by kmers with similar counts, and attempts to correct each one.
+ * A candidate is a pair of kmers a and d=a+kbig+1 for which isError(a, a+1), isError(d, d-1) and
+ * isSimilar(a, d) all hold on their counts.
+ * @param bases Read bases; modified in place when a correction is made.
+ * @param quals Read qualities; passed through to correctSingleBasePincer.
+ * @param leftBuffer Scratch array passed through.
+ * @param rightBuffer Scratch array passed through.
+ * @param counts Kmer counts for the read.
+ * @param bb Scratch buffer.
+ * @param detectedArray Optional statistics output, may be null; [0] is incremented by detections, [1] by corrections.
+ * @param errorExtension Passed through to correctSingleBasePincer.
+ * @param kmer Scratch kmer.
+ * @return Number of bases corrected.
+ */
+ public int errorCorrectPincer(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+ final IntList counts, final ByteBuilder bb, final int[] detectedArray, final int errorExtension, final Kmer kmer){
+
+ int detected=0;
+ int corrected=0;
+
+ //a is the index of the left kmer
+ //b is a+1
+ //c is d-1
+ //d is the index of the right kmer
+ //the base between the kmers is at a+k
+ for(int a=0, d=kbig+1; d<counts.size; a++, d++){
+ final int aCount=counts.get(a);
+ final int bCount=counts.get(a+1);
+ final int cCount=counts.get(d-1);
+ final int dCount=counts.get(d);
+ if(isError(aCount, bCount) && isError(dCount, cCount) && isSimilar(aCount, dCount)){
+ if(verbose){
+ System.err.println("Found error: "+aCount+", "+bCount+", "+cCount+", "+dCount);
+ }
+ //Looks like a 1bp substitution; attempt to correct.
+ detected++;
+ int ret=correctSingleBasePincer(a, d, bases, quals, leftBuffer, rightBuffer, counts, bb, errorExtension, kmer);
+ corrected+=ret;
+ //Note that this message is printed whether or not ret indicates a correction.
+ if(verbose){
+ System.err.println("Corrected error.");
+ }
+ }else{
+ if(verbose){
+ System.err.println("Not an error: "+aCount+", "+bCount+", "+cCount+", "+dCount+
+ "; "+isError(aCount, bCount)+", "+isError(dCount, cCount)+", "+isSimilar(aCount, dCount));
+ }
+ }
+ }
+
+// if(detected==0 && counts.get(0)>2 && counts.get(counts.size-1)>2){
+// assert(!verbose);
+// verbose=true;
+// System.err.println("\n"+counts);
+// errorCorrectPincer(bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, detectedArray);
+// assert(false);
+// }
+
+ if(detectedArray!=null){
+ detectedArray[0]+=detected;
+ detectedArray[1]+=corrected;
+ }
+
+ return corrected;
+ }
+
+ /**
+ * Looks for errors that have supporting kmers on the left side only, and attempts to correct each one
+ * by extending from the left.
+ * A candidate is a kmer a for which isError(a, a+1) holds, the errorExtension counts before a are similar
+ * to a, and the counts from a+2 to a+kbig are errors relative to a.
+ * @param bases Read bases; modified in place when a correction is made.
+ * @param quals Read qualities; passed through to correctSingleBaseRight.
+ * @param leftBuffer Scratch array passed through.
+ * @param rightBuffer Scratch array passed through.
+ * @param counts Kmer counts for the read.
+ * @param bb Scratch buffer.
+ * @param detectedArray Optional statistics output, may be null; [0] is incremented by detections, [2] by corrections.
+ * @param startPos First kmer index to examine; raised to errorExtension if lower.
+ * @param errorExtension Number of flanking kmers/bases required as support.
+ * @param kmer Scratch kmer.
+ * @return Number of bases corrected; 0 if the read is shorter than kbig+2*(1+errorExtension).
+ */
+ public int errorCorrectTail(final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+ final IntList counts, final ByteBuilder bb, final int[] detectedArray, final int startPos, final int errorExtension, final Kmer kmer){
+ if(bases.length<kbig+2*(1+errorExtension)){return 0;}
+ int detected=0;
+ int corrected=0;
+
+ //a is the index of the left kmer
+ //b is a+1
+ //the base between the kmers is at a+k
+ for(int a=Tools.max(startPos, errorExtension), lim=counts.size-Tools.min(errorExtension, (errorExtension+3)/2); a<lim; a++){//errorExtension-1
+ final int aCount=counts.get(a);
+ final int bCount=counts.get(a+1);
+ if(isError(aCount, bCount) && isSimilar(aCount, a-errorExtension, a-1, counts) && isError(aCount, a+2, a+kbig, counts)){
+ if(verbose){
+ System.err.println("Found error: "+aCount+", "+bCount);
+ }
+ //Assume like a 1bp substitution; attempt to correct.
+ detected++;
+ int ret=correctSingleBaseRight(a, bases, quals, leftBuffer, rightBuffer, counts, bb, errorExtension, kmer);
+ corrected+=ret;
+ //Note that this message is printed whether or not ret indicates a correction.
+ if(verbose){
+ System.err.println("Corrected error.");
+ }
+ }else{
+ if(verbose){
+ System.err.println("Not an error: "+aCount+", "+bCount+
+ "; "+isError(aCount, bCount)+", "+isSimilar(aCount, a-errorExtension, a-1, counts)+", "+isError(aCount, a+2, a+kbig, counts));
+ }
+ }
+ }
+
+// if(detected==0 && counts.get(0)>2 && counts.get(counts.size-1)>2){
+// assert(!verbose);
+// verbose=true;
+// System.err.println("\n"+counts);
+// errorCorrectPincer(bases, quals, leftBuffer, rightBuffer, kmers, counts, bb, detectedArray);
+// assert(false);
+// }
+
+ if(detectedArray!=null){
+ detectedArray[0]+=detected;
+ detectedArray[2]+=corrected;
+ }
+
+ return corrected;
+ }
+
+ /**
+ * Attempts to correct the single base at index a+kbig by extending from the kmers on both sides of it.
+ * The base is replaced only if the extension from the left kmer and the extension from the
+ * reverse-complemented right kmer propose the same base, the rest of each extension matches the read,
+ * the proposed base differs from the current one, and the resulting kmer passes isSimilar.
+ * @param a Index of the left kmer.
+ * @param d Index of the right kmer; asserted to equal a+kbig+1.
+ * @param bases Read bases; modified in place on success.
+ * @param quals Not read in this method body.
+ * @param leftBuffer Not read in this method body.
+ * @param rightBuffer Scratch array for right-extension counts.
+ * @param counts Kmer counts for the read; regenerated from position a on success.
+ * @param bb Scratch buffer; cleared before each extension.
+ * @param errorExtension Number of bases each extension must reach.
+ * @param kmer0 Scratch kmer passed to getKmer.
+ * @return 1 if the base was replaced, otherwise 0.
+ */
+ private int correctSingleBasePincer(final int a, final int d, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+ final IntList counts, final ByteBuilder bb, final int errorExtension, final Kmer kmer0){
+ final byte leftReplacement, rightReplacement;
+ //Index of the suspect base in the read.
+ final int loc=a+kbig;
+ {
+ //Extend rightward from the left kmer; bb starts empty, so bb.get(0) is the proposed base at loc.
+ bb.clear();
+ Kmer kmer=getKmer(bases, a, kmer0);
+ if(kmer==null){return 0;}
+ int extension=extendToRight2_inner(bb, null, rightBuffer, errorExtension, true, kmer);
+ if(extension<errorExtension){return 0;}
+ //Everything after the proposed base must agree with the read.
+ for(int i=1; i<extension; i++){
+ if(bb.get(i)!=bases[loc+i]){return 0;}
+ }
+ leftReplacement=bb.get(0);
+ }
+ {
+ //Extend from the reverse complement of the right kmer, then flip the result back into read orientation,
+ //so the proposed base at loc is the last one in bb.
+ bb.clear();
+ Kmer kmer=getKmer(bases, d, kmer0);
+ if(kmer==null){return 0;}
+ kmer.rcomp();
+ int extension=extendToRight2_inner(bb, null, rightBuffer, errorExtension, true, kmer);
+ if(extension<errorExtension){return 0;}
+ bb.reverseComplementInPlace();
+ for(int i=0; i<extension-1; i++){
+ if(bb.get(i)!=bases[loc+i+1-extension]){return 0;}
+ }
+ rightReplacement=bb.get(extension-1);
+ }
+ if(leftReplacement!=rightReplacement){return 0;}
+ if(bases[loc]==leftReplacement){return 0;}
+ if(!isSimilar(bases, a, leftReplacement, counts, kmer0)){return 0;}
+
+ bases[loc]=leftReplacement;
+ assert(d==a+kbig+1);
+ tables.regenerateCounts(bases, counts, a, kmer0);
+ return 1;
+ }
+
+ /**
+ * Attempts to correct the single base at index a+kbig by extending rightward from kmer a.
+ * The base is replaced only if the extension reaches the required length, the bases after the first
+ * agree with the read, the proposed base differs from the current one, and the resulting kmer passes isSimilar.
+ * @param a Index of the kmer immediately left of the suspect base.
+ * @param bases Read bases; modified in place on success.
+ * @param quals Not read in this method body.
+ * @param leftBuffer Not read in this method body.
+ * @param rightBuffer Scratch array for right-extension counts.
+ * @param counts Kmer counts for the read; regenerated from position a on success.
+ * @param bb Scratch buffer; cleared before the extension.
+ * @param errorExtension0 Requested extension length; reduced to the number of bases remaining from loc if larger.
+ * @param kmer0 Scratch kmer passed to getKmer.
+ * @return 1 if the base was replaced, otherwise 0.
+ */
+ private int correctSingleBaseRight(final int a, final byte[] bases, final byte[] quals, final int[] leftBuffer, final int[] rightBuffer,
+ final IntList counts, final ByteBuilder bb, final int errorExtension0, final Kmer kmer0){
+ final byte leftReplacement;
+ //Index of the suspect base in the read.
+ final int loc=a+kbig;
+ //Clamp so the comparison loop below cannot read past the end of bases.
+ final int errorExtension=Tools.min(errorExtension0, bases.length-loc);
+ {
+ //bb starts empty, so bb.get(0) is the proposed base at loc.
+ bb.clear();
+ Kmer kmer=getKmer(bases, a, kmer0);
+ if(kmer==null){return 0;}
+ int extension=extendToRight2_inner(bb, null, rightBuffer, errorExtension, true, kmer);
+ if(extension<errorExtension){return 0;}
+ for(int i=1; i<extension; i++){
+ if(bb.get(i)!=bases[loc+i]){
+ return 0;
+ }
+ }
+ leftReplacement=bb.get(0);
+ }
+
+ if(bases[loc]==leftReplacement){return 0;}
+ if(!isSimilar(bases, a, leftReplacement, counts, kmer0)){return 0;}
+
+ bases[loc]=leftReplacement;
+ tables.regenerateCounts(bases, counts, a, kmer0);
+ return 1;
+ }
+
+ /**
+ * Tests whether replacing the base after kmer a with newBase yields a kmer whose count is similar to a's.
+ * @param bases Read bases.
+ * @param a Index of the reference kmer.
+ * @param newBase Candidate replacement base, appended to the kmer obtained at a.
+ * @param counts Kmer counts for the read.
+ * @param kmer0 Scratch kmer passed to getKmer.
+ * @return Result of isSimilar(count at a, count of the shifted kmer); false if no kmer could be built.
+ */
+ private final boolean isSimilar(byte[] bases, int a, byte newBase, IntList counts, final Kmer kmer0){
+ final Kmer shifted=getKmer(bases, a, kmer0);
+ if(shifted!=null){
+ shifted.addRight(newBase);
+ final int candidateCount=getCount(shifted);
+ return isSimilar(counts.get(a), candidateCount);
+ }
+ assert(false); //Should never happen
+ return false;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inherited Abstract Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Delegates kmer-histogram generation to the table set, using the configured hist* settings and outHist as the destination. */
+ final void makeKhist(){
+ tables.makeKhist(outHist, histColumns, histMax, histHeader, histZeros, true, smoothHist, 1);
+ }
+ /** Delegates kmer dumping to the table set (dumpKmersAsBytes_MT), passing outKmers and minToDump. */
+ final void dumpKmersAsText(){
+ tables.dumpKmersAsBytes_MT(outKmers, minToDump, true);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Accessor for the kmer table set. */
+ final KmerTableSetU tables(){return tables;}
+ /** Kmer table set that this class queries for counts, owners and kmer generation. */
+ public final KmerTableSetU tables;
+
+ /** Normal kmer length */
+ private final int ksmall;
+
+}
diff --git a/current/assemble/TadpoleWrapper.java b/current/assemble/TadpoleWrapper.java
new file mode 100755
index 0000000..054d9bb
--- /dev/null
+++ b/current/assemble/TadpoleWrapper.java
@@ -0,0 +1,88 @@
+package assemble;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import align2.Tools;
+
+import jgi.AssemblyStats2;
+
+/**
+ * Assembles with multiple kmer lengths to find the best kmer length.
+ * Runs Tadpole once per requested kmer length and recommends the length selected by
+ * Tools.maxIndex over the L50 values reported through AssemblyStats2.
+ * @author Brian Bushnell
+ * @date Oct 15, 2015
+ *
+ */
+public class TadpoleWrapper {
+
+ /**
+ * Program entry point.
+ * Recognized arguments are k=<comma-delimited kmer lengths> (default 31) and out=<name containing %>
+ * (default contigs%.fa; % is replaced by each kmer length). All other arguments are passed to Tadpole unchanged.
+ * @param args Command-line arguments in key=value form; leading hyphens on the key are normally stripped.
+ * @throws IllegalArgumentException If k or out is given without a value.
+ */
+ public static void main(String[] args){
+ HashSet<Integer> set=new HashSet<Integer>();
+ ArrayList<String> argList=new ArrayList<String>();
+ String contigsName="contigs%.fa";
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ //"=".split("=") yields an empty array, so guard the key lookup.
+ String a=(split.length>0 ? split[0] : "").toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ //startsWith instead of charAt(0): the key can be empty, or become empty once its hyphens are stripped.
+ while(a.startsWith("-") && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(a.equals("k")){
+ if(b==null){throw new IllegalArgumentException("Parameter k requires a comma-delimited list of kmer lengths: "+arg);}
+ for(String s2 : b.split(",")){
+ set.add(Integer.parseInt(s2));
+ }
+ }else if(a.equals("out")){
+ if(b==null){throw new IllegalArgumentException("Output name must contain % symbol.");}
+ contigsName=b;
+ assert(b.contains("%")) : "Output name must contain % symbol.";
+ }else{
+ argList.add(arg);
+ }
+ }
+
+ if(set.isEmpty()){
+ kmers=new int[] {31};
+ }else{
+ //Copy the distinct kmer lengths into a sorted array.
+ kmers=new int[set.size()];
+ int i=0;
+ for(Integer x : set){
+ kmers[i]=x;
+ i++;
+ }
+ Arrays.sort(kmers);
+ }
+
+ long[] L50=new long[kmers.length];
+ long[] contiglen=new long[kmers.length];
+ long[] contigs=new long[kmers.length];
+ long[] maxContig=new long[kmers.length];
+
+ //Reserve the last two slots for the per-run k= and out= arguments.
+ argList.add("");
+ argList.add("");
+
+ for(int i=0; i<kmers.length; i++){
+ int k=kmers[i];
+ argList.set(argList.size()-2, "k="+k);
+ argList.set(argList.size()-1, "out="+contigsName.replace("%", ""+k));
+ String[] args2=argList.toArray(new String[0]);
+ System.gc();
+ Tadpole.main(args2);
+
+ //Record the statistics that the run left in AssemblyStats2's static fields.
+ L50[i]=AssemblyStats2.lastL50;
+ contiglen[i]=AssemblyStats2.lastSize;
+ contigs[i]=AssemblyStats2.lastContigs;
+ maxContig[i]=AssemblyStats2.lastMaxContig;
+ }
+
+ int index=Tools.maxIndex(L50);
+
+ System.err.println("Recommended K:\t"+kmers[index]);
+
+ }
+
+ /** Kmer lengths to try, in ascending order; assigned by main. */
+ private static int[] kmers;
+
+}
diff --git a/current/bloom/KCountArray.java b/current/bloom/KCountArray.java
new file mode 100755
index 0000000..da8c443
--- /dev/null
+++ b/current/bloom/KCountArray.java
@@ -0,0 +1,510 @@
+package bloom;
+
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+import ukmer.Kmer;
+
+import dna.AminoAcid;
+import dna.Data;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ */
+public abstract class KCountArray {
+
+ /**
+  * Convenience factory: delegates to the five-argument overload with
+  * keys_=cells_+1 and a single hash.
+  */
+ public static KCountArray makeNew(long cells_, int cbits_, int gap_){
+ return makeNew(cells_+1, cells_, cbits_, gap_, 1);
+ }
+
+ /**
+  * Convenience factory: delegates to the seven-argument overload with
+  * no prefilter (null) and a prefilter limit of 0.
+  */
+ public static KCountArray makeNew(long keys_, long cells_, int cbits_, int gap_, int hashes_){
+ return makeNew(keys_, cells_, cbits_, gap_, hashes_, null, 0);
+ }
+
+// public static KCountArray makeNew(long keys_, long cells_, int cbits_, int gap_, int hashes_, boolean prefilter_){
+// if(!prefilter_){
+// return makeNew(keys_, cells_, cbits_, gap_, hashes_, 0, 0);
+// }else{
+// long totalbits=cells_*cbits_;
+// long prebits=totalbits/4;
+// long postCells=(totalbits-prebits+cbits_-1)/cbits_;
+// int prehashes=(hashes_+1)/2;
+// return makeNew(keys_, postCells, cbits_, gap_, hashes_, prebits, prehashes);
+// }
+// }
+
+ /**
+  * Creates a KCountArray7MTA, calls initialize() on it, and returns it.
+  * <p>
+  * The former direct-mapped KCountArray3 path was guarded by {@code if(false && ...)}
+  * and could never run, so it has been removed; behavior is unchanged.
+  *
+  * @param keys_ Unused by the current implementation; retained for interface compatibility.
+  * @param cells_ Passed to the KCountArray7MTA constructor.
+  * @param cbits_ Passed to the KCountArray7MTA constructor.
+  * @param gap_ Passed to the KCountArray7MTA constructor.
+  * @param hashes_ Passed to the KCountArray7MTA constructor.
+  * @param prefilter Passed to the KCountArray7MTA constructor; may be null.
+  * @param prefilterLimit_ Passed to the KCountArray7MTA constructor.
+  * @return The initialized array
+  */
+ public static KCountArray makeNew(long keys_, long cells_, int cbits_, int gap_, int hashes_, KCountArray prefilter, int prefilterLimit_){
+     //Alternatives tried here previously, per the original author's notes:
+     //  KCountArray4    - single-threaded; most accurate
+     //  KCountArray4MT  - fast
+     //  KCountArray5MT  - less efficient than 4MT
+     //  KCountArray6MT  - fastest but substantial drop in accuracy
+     //  KCountArray7MT/8MT  - like 4MT but uses primes (8MT adds a prefilter)
+     //  KCountArray9MT/10MT - like 7MT/8MT but uses canonical kmers
+     final KCountArray kca=new KCountArray7MTA(cells_, cbits_, gap_, hashes_, prefilter, prefilterLimit_); //Like 4MT but uses primes
+     kca.initialize();
+     return kca;
+ }
+
+ /** Delegates to the three-argument constructor with a gap of 0. */
+ protected KCountArray(long cells_, int cbits_){
+ this(cells_, cbits_, 0);
+ }
+
+ /**
+  * Sets up the cell geometry for a table split across a fixed 64 arrays.
+  * @param cells_ Number of cells; asserted to be a power of two unless the concrete class is KCountArray7MT.
+  * @param cbits_ Bits per cell; asserted to be a power of two no greater than 32.
+  * @param gap_ Stored in the gap field; not otherwise used here.
+  */
+ protected KCountArray(final long cells_, int cbits_, int gap_){
+ gap=gap_;
+ assert(cbits_<=32);
+ assert(Integer.bitCount(cbits_)==1);
+ assert(Long.bitCount(cells_)==1) || this.getClass()==KCountArray7MT.class : this.getClass();
+
+ numArrays=64;
+ //log2(numArrays) when numArrays is a power of two, and the matching low-bit mask.
+ arrayBits=31-Integer.numberOfLeadingZeros(numArrays);
+ arrayMask=numArrays-1;
+
+ //With assertions enabled, a table smaller than one 32-bit word per array fails here;
+ //with assertions disabled, the cell width is doubled until the table is large enough.
+ while(cbits_*cells_<32*numArrays){
+ assert(false) : cells_+", "+cbits_+", "+numArrays+", "+(cbits_*cells_)+"<"+(32*numArrays);
+ cbits_*=2;
+ } //Increases bits per cell so that at minimum each array is size 1
+
+ assert(cbits_<=32);
+
+ cells=cells_;
+ cellBits=cbits_;
+ //Low cellBits bits set; Integer.MAX_VALUE for 32-bit cells so the mask is never negative.
+ valueMask=(cellBits==32 ? Integer.MAX_VALUE : ~((-1)<<cellBits));
+ maxValue=min(Integer.MAX_VALUE, ~((-1)<<min(cellBits,31)));
+ cellsPerWord=32/cellBits;
+ //log2(cellsPerWord) when cellsPerWord is a power of two.
+ indexShift=Integer.numberOfTrailingZeros(cellsPerWord);
+ cellMask=cellsPerWord-1;
+
+ //NOTE(review): description() calls the abstract mem()/usedFraction() path before the subclass constructor body has run - confirm subclasses tolerate this when verbose is set.
+ if(verbose){
+ System.out.println(description());
+ }
+ }
+
+ /**
+  * Sets up the cell geometry for a table split across a caller-chosen number of arrays.
+  * @param cells_ Number of cells; asserted to be a power of two unless the concrete class is KCountArray7MT, KCountArray7MTA or KCountArray8MT.
+  * @param cbits_ Bits per cell; asserted to be a power of two no greater than 32.
+  * @param gap_ Stored in the gap field; not otherwise used here.
+  * @param arrays_ Number of arrays; asserted to be a power of two.
+  */
+ protected KCountArray(final long cells_, int cbits_, int gap_, int arrays_){
+ gap=gap_;
+ assert(cbits_<=32);
+ assert(Integer.bitCount(cbits_)==1);
+ assert(Long.bitCount(cells_)==1) || this.getClass()==KCountArray7MT.class || this.getClass()==KCountArray7MTA.class || this.getClass()==KCountArray8MT.class;
+
+ numArrays=arrays_;
+ assert(Integer.bitCount(numArrays)==1) : numArrays+", "+cells_+", "+cbits_;
+ //log2(numArrays) and the matching low-bit mask; valid because numArrays is asserted to be a power of two.
+ arrayBits=31-Integer.numberOfLeadingZeros(numArrays);
+ arrayMask=numArrays-1;
+
+ //With assertions enabled, a table smaller than one 32-bit word per array fails here;
+ //with assertions disabled, the cell width is doubled until the table is large enough.
+ while(cbits_*cells_<32*numArrays){
+ assert(false) : cells_+", "+cbits_+", "+numArrays+", "+(cbits_*cells_)+"<"+(32*numArrays);
+ cbits_*=2;
+ } //Increases bits per cell so that at minimum each array is size 1
+
+ assert(cbits_<=32) : "Why?";
+
+ cells=cells_;
+ cellBits=cbits_;
+ //Low cellBits bits set; Integer.MAX_VALUE for 32-bit cells so the mask is never negative.
+ valueMask=(cellBits==32 ? Integer.MAX_VALUE : ~((-1)<<cellBits));
+ maxValue=min(Integer.MAX_VALUE, ~((-1)<<min(cellBits,31)));
+ cellsPerWord=32/cellBits;
+ //log2(cellsPerWord) when cellsPerWord is a power of two.
+ indexShift=Integer.numberOfTrailingZeros(cellsPerWord);
+ cellMask=cellsPerWord-1;
+
+ if(verbose){
+ System.out.println(description());
+ }
+ }
+
+ /** Returns the count stored for key. */
+ public abstract int read(long key);
+ /** Multi-key read; this base implementation always throws RuntimeException("Unimplemented."). */
+ public int read(long keys[]){throw new RuntimeException("Unimplemented.");}
+ /** Reads the count for key, first passing it through makeCanonical2(key, k) when makeCanonical is true. */
+ public final int read(long key, int k, boolean makeCanonical){return read(makeCanonical ? makeCanonical2(key, k) : key);}
+
+ /** Stores value as the count for key. */
+ public abstract void write(long key, int value);
+
+ /** Calls incrementAndReturn(key, 1) and discards the result. */
+ public void increment(long key){incrementAndReturn(key, 1);}
+ /** Calls decrementAndReturn(key, 1) and discards the result; throws if the subclass does not support decrementing. */
+ public void decrement(long key){decrementAndReturn(key, 1);}
+
+ /**
+  * Reads the count of a kmer, tempered by the counts of its neighbors.
+  * Returns the kmer's own count when that count is below 1 or when either
+  * neighbor reading (readLeft, readRight) is at least as large; otherwise
+  * returns the integer average of the two neighbor readings.
+  * @param key Kmer to evaluate
+  * @param k Kmer length; asserted to be at most 32
+  * @param makeCanonical True to canonicalize the key before lookup
+  */
+ public final int readPrecise(long key, int k, boolean makeCanonical){
+     assert(k<=32);
+     final int center=read(key, k, makeCanonical);
+     if(center<1){return center;}
+     final int left=readLeft(key, k, makeCanonical);
+     if(left>=center){return center;}
+     final int right=readRight(key, k, makeCanonical);
+     if(right>=center){return center;}
+     //Sum as longs so two large neighbor counts cannot overflow.
+     final long sum=(long)left+(long)right;
+     return (int)(sum/2);
+ }
+
+ /**
+  * Reads the count of a kmer and returns the minimum of that count and the
+  * readLeft/readRight neighbor readings. Exits early with the first reading
+  * that is below 1 (own count, then left).
+  * @param key Kmer to evaluate
+  * @param k Kmer length; asserted to be at most 32
+  * @param makeCanonical True to canonicalize the key before lookup
+  */
+ public final int readPreciseMin(long key, int k, boolean makeCanonical){
+     assert(k<=32);
+     final int center=read(key, k, makeCanonical);
+     if(center<1){return center;}
+     final int left=readLeft(key, k, makeCanonical);
+     if(left<1){return left;}
+     final int right=readRight(key, k, makeCanonical);
+     return Tools.min(left, center, right);
+ }
+
+ /**
+  * Optional operation; this base implementation always throws RuntimeException("Unsupported.").
+  * @param key Kmer to evaluate
+  * @param k Kmer length
+  * @param makeCanonical Whether keys should be canonicalized before lookup
+  * @return Sum of counts of all 4 possible left-adjacent kmers
+  */
+ public int readLeft(long key, int k, boolean makeCanonical){throw new RuntimeException("Unsupported.");}
+ /**
+  * Optional operation; this base implementation always throws RuntimeException("Unsupported.").
+  * @param key Kmer to evaluate
+  * @param k Kmer length
+  * @param makeCanonical Whether keys should be canonicalized before lookup
+  * @return Sum of counts of all 4 possible right-adjacent kmers
+  */
+ public int readRight(long key, int k, boolean makeCanonical){throw new RuntimeException("Unsupported.");}
+ /**
+  * Optional operation; this base implementation always throws RuntimeException("Unsupported.").
+  * @param key Kmer to evaluate
+  * @param k Kmer length
+  * @param makeCanonical Whether keys should be canonicalized before lookup
+  * @param rvec Presumably a reusable result buffer - confirm against overriding subclasses
+  * @return Array of counts of all 4 possible left-adjacent kmers
+  */
+ public int[] readAllLeft(final long key, final int k, boolean makeCanonical, int[] rvec){throw new RuntimeException("Unsupported.");}
+ /**
+  * Optional operation; this base implementation always throws RuntimeException("Unsupported.").
+  * @param key Kmer to evaluate
+  * @param k Kmer length
+  * @param makeCanonical Whether keys should be canonicalized before lookup
+  * @param rvec Presumably a reusable result buffer - confirm against overriding subclasses
+  * @return Array of counts of all 4 possible right-adjacent kmers
+  */
+ public int[] readAllRight(final long key, final int k, boolean makeCanonical, int[] rvec){throw new RuntimeException("Unsupported.");}
+
+ /** Increments every key in the array by one, in order, while holding this object's monitor. */
+ public void increment(long[] keys){
+     synchronized(this){
+         for(int i=0; i<keys.length; i++){
+             increment(keys[i]);
+         }
+     }
+ }
+
+ /** Adds incr to the count for key and returns the incremented value. Optional method. */
+ public abstract int incrementAndReturn(long key, int incr);
+
+ /**
+  * Returns decremented value. Optional method.
+  * This base implementation always throws a RuntimeException naming the concrete class.
+  */
+ public int decrementAndReturn(long key, int incr){
+ throw new RuntimeException("This class "+getClass().getName()+" does not support decrementAndReturn.");
+ }
+
+ /** Adds incr to the count for key and returns the value the count had before the increment. */
+ public abstract int incrementAndReturnUnincremented(long key, int incr);
+
+// /** Returns unincremented value */
+// public final int incrementAndReturnUnincremented(Kmer kmer, int incr){
+// return incrementAndReturnUnincremented(kmer.xor(), incr);
+// }
+
+ /** Multi-key variant; this base implementation always throws RuntimeException("Unimplemented."). */
+ public int incrementAndReturnUnincremented(long[] keys, int incr){
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ public abstract long[] transformToFrequency();
+ /**
+  * Builds a histogram of cell values over a matrix of packed words.
+  * @param matrix Packed count words
+  * @return Array of length 100000 where element v is the number of cells holding value v;
+  * values of 99999 or more are all tallied in the last element.
+  */
+ public final long[] transformToFrequency(int[][] matrix){
+     final long[] freq=new long[100000];
+     for(int[] array : matrix){
+         for(int i=0; i<array.length; i++){
+             tallyWord(array[i], freq);
+         }
+     }
+     return freq;
+ }
+
+ /**
+  * Builds a histogram of cell values over a matrix of packed words.
+  * @param matrix Packed count words
+  * @return Array of length 100000 where element v is the number of cells holding value v;
+  * values of 99999 or more are all tallied in the last element.
+  */
+ public final long[] transformToFrequency(AtomicIntegerArray[] matrix){
+     final long[] freq=new long[100000];
+     for(AtomicIntegerArray array : matrix){
+         for(int i=0; i<array.length(); i++){
+             tallyWord(array.get(i), freq);
+         }
+     }
+     return freq;
+ }
+
+ /** Adds the value of every cell packed into word to the histogram freq. */
+ private void tallyWord(int word, final long[] freq){
+     final int maxFreq=freq.length-1;
+     if(cellBits==32){
+         //One cell per word. A negative word would index out of bounds, so it is clamped to the top bin.
+         freq[word<0 ? maxFreq : min(word, maxFreq)]++;
+         return;
+     }
+     assert(cellBits>0);
+     int remaining=cellsPerWord;
+     //Consume cells from the low end until only zero cells are left.
+     for(; word!=0; remaining--){
+         freq[min(word&valueMask, maxFreq)]++;
+         word=(word>>>cellBits);
+     }
+     //Cells above the highest nonzero cell were never visited; they all hold zero.
+     freq[0]+=remaining;
+ }
+
+ /** Builds a multi-line, tab-separated summary of this table's geometry, memory size and usage. */
+ public final StringBuilder description(){
+     final long words=cells/cellsPerWord;
+     final int wordsPerArray=(int)(words/numArrays);
+     final StringBuilder sb=new StringBuilder();
+     sb.append("cells: \t").append(cells).append('\n');
+     sb.append("cellBits:\t").append(cellBits).append('\n');
+     sb.append("valueMask:\t").append(Long.toHexString(valueMask)).append('\n');
+     sb.append("maxValue:\t").append(maxValue).append('\n');
+     sb.append("cellsPerWord:\t").append(cellsPerWord).append('\n');
+     sb.append("indexShift:\t").append(indexShift).append('\n');
+     sb.append("words: \t").append(words).append('\n');
+     sb.append("wordsPerArray:\t").append(wordsPerArray).append('\n');
+     sb.append("numArrays:\t").append(numArrays).append('\n');
+     sb.append("Memory: \t").append(mem()).append('\n');
+     sb.append("Usage: \t").append(String.format("%.3f%%",usedFraction()*100));
+     return sb;
+ }
+
+ /** One-line summary: gap (only when positive), memory, cell count and used percentage. */
+ public final String toShortString(){
+     final StringBuilder sb=new StringBuilder();
+     if(gap>0){sb.append("gap = ").append(gap).append(" \t ");}
+     sb.append("mem = ").append(mem());
+     sb.append(" \tcells = ").append(toKMG(cells));
+     sb.append(" \tused = ").append(String.format("%.3f%%",usedFraction()*100));
+     return sb.toString();
+ }
+
+ /** One-line summary: gap (only when positive), the supplied hash count, memory, cell count and used percentage. */
+ public final String toShortString(int hashes){
+     final StringBuilder sb=new StringBuilder();
+     if(gap>0){sb.append("gap = ").append(gap).append(" \t ");}
+     sb.append("hashes = ").append(hashes).append(" \t ");
+     sb.append("mem = ").append(mem());
+     sb.append(" \tcells = ").append(toKMG(cells));
+     sb.append(" \tused = ").append(String.format("%.3f%%",usedFraction()*100));
+     return sb.toString();
+ }
+
+ /** Returns the text produced by description(). */
+ public final String toString(){
+ return description().toString();
+ }
+
+ /** Returns a rendering of the stored cell values (KCountArray3, for example, lists every cell as "[a, b, ...]"). */
+ public abstract String toContentsString();
+
+ /** Returns the fraction of cells in use. */
+ public abstract double usedFraction();
+
+ /** Returns the fraction of cells in use, where mindepth is a threshold applied to the cell value. */
+ public abstract double usedFraction(int mindepth);
+
+ /** Returns the number of cells in use, where mindepth is a threshold applied to the cell value. */
+ public abstract long cellsUsed(int mindepth);
+
+ /**
+  * Estimates the number of distinct keys added, from usedFraction().
+  * Computes f2=1-(1-f)^(1/hashes) and returns -cells*ln(1-f2).
+  * @param hashes Number of hash functions assumed per key
+  */
+ public final double estimateUniqueKmers(int hashes){
+     final double used=usedFraction();
+     final double perHash=(1-Math.pow(1-used, 1.0/hashes));
+     return (-cells)*Math.log(1-perHash);
+ }
+
+ /**
+  * Same estimate as the one-argument overload, but based on usedFraction(mindepth).
+  * @param hashes Number of hash functions assumed per key
+  * @param mindepth Threshold passed to usedFraction(int)
+  */
+ public final double estimateUniqueKmers(int hashes, int mindepth){
+     final double used=usedFraction(mindepth);
+     final double perHash=(1-Math.pow(1-used, 1.0/hashes));
+     return (-cells)*Math.log(1-perHash);
+ }
+
+ /** Formats cells*cellBits/8 bytes as KB (below 2^20), MB (below 2^30) or GB, with two decimals. */
+ public final String mem(){
+     final long bytes=(cells*cellBits)/8;
+     final String unit;
+     final int shift;
+     if(bytes<(1<<20)){
+         unit="KB";
+         shift=10;
+     }else if(bytes<(1<<30)){
+         unit="MB";
+         shift=20;
+     }else{
+         unit="GB";
+         shift=30;
+     }
+     return (String.format("%.2f "+unit, bytes*1d/(1<<shift)));
+ }
+
+ /**
+  * Formats a number with two decimals and a decimal-magnitude suffix:
+  * "B" (divided by 1e9) above 10,000,000,000; "M" (divided by 1e6) above 10,000,000;
+  * "K" (divided by 1e3) above 100,000; otherwise no suffix.
+  */
+ public static String toKMG(long x){
+     final double divisor;
+     final String suffix;
+     if(x>10000000000L){
+         divisor=1000000000L;
+         suffix="B";
+     }else if(x>10000000){
+         divisor=1000000;
+         suffix="M";
+     }else if(x>100000){
+         divisor=1000;
+         suffix="K";
+     }else{
+         divisor=1;
+         suffix="";
+     }
+     return String.format("%.2f", x/divisor)+suffix;
+ }
+
+ static final AtomicIntegerArray[] allocMatrix(final int numArrays, final int wordsPerArray){
+ final AtomicIntegerArray[] matrix=new AtomicIntegerArray[numArrays];
+ final AllocThread[] array=new AllocThread[Tools.min(Tools.max(Shared.threads()/2, 1), numArrays)];
+ final AtomicInteger next=new AtomicInteger(0);
+ for(int i=0; i<array.length; i++){
+ array[i]=new AllocThread(matrix, next, wordsPerArray);
+ }
+ for(int i=0; i<array.length; i++){array[i].start();}
+ for(AllocThread at : array){
+ while(at.getState()!=Thread.State.TERMINATED){
+ try {
+ at.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ return matrix;
+ }
+
+ /** Worker that claims slot numbers from a shared counter and allocates one array per claimed slot. */
+ private static class AllocThread extends Thread{
+
+     /**
+      * @param matrix_ Destination; each slot receives a new AtomicIntegerArray
+      * @param next_ Counter shared by all workers that hands out the next unclaimed slot
+      * @param wordsPerArray_ Length of every allocated array
+      */
+     AllocThread(AtomicIntegerArray[] matrix_, AtomicInteger next_, int wordsPerArray_){
+         matrix=matrix_;
+         next=next_;
+         wordsPerArray=wordsPerArray_;
+     }
+
+     @Override
+     public void run(){
+         //Keep claiming slots until the counter runs past the end of the matrix.
+         for(int slot=next.getAndIncrement(); slot<matrix.length; slot=next.getAndIncrement()){
+             matrix[slot]=new AtomicIntegerArray(wordsPerArray);
+         }
+     }
+
+     private final AtomicIntegerArray[] matrix;
+     private final AtomicInteger next;
+     private final int wordsPerArray;
+
+ }
+
+
+// long hash(long x, int y){throw new RuntimeException("Not supported.");}
+ /** Hashes x. NOTE(review): y presumably selects which hash function to apply - confirm against subclasses. */
+ abstract long hash(long x, int y);
+
+ /** Returns the smaller of two ints. */
+ public static final int min(int x, int y){return x<y ? x : y;}
+ /** Returns the larger of two ints. */
+ public static final int max(int x, int y){return x>y ? x : y;}
+ /** Returns the smaller of two longs. */
+ public static final long min(long x, long y){return x<y ? x : y;}
+ /** Returns the larger of two longs. */
+ public static final long max(long x, long y){return x>y ? x : y;}
+
+ /** Any necessary initialization. Does nothing in this base class; called by makeNew after construction. */
+ public void initialize(){}
+
+ /** Any necessary shutdown steps. Does nothing in this base class. */
+ public void shutdown(){}
+
+ /** Number of cells, as passed to the constructor. */
+ public final long cells;
+ /** Bits per cell (the constructor argument, possibly doubled to meet the minimum table size). */
+ public final int cellBits;
+ /** Originally this was different than valueMask in the case that valueMask was negative, but now they are the same. */
+ public final int maxValue;
+ public final int gap; //Set this for convenience on gapped tables to make sure you're using the right table.
+
+ /** Cells stored in one 32-bit word: 32/cellBits. */
+ protected final int cellsPerWord;
+ /** Number of trailing zeros of cellsPerWord, i.e. log2(cellsPerWord) when it is a power of two. */
+ protected final int indexShift;
+ /** cellsPerWord-1. */
+ protected final int cellMask;
+ /** Mask with the low cellBits bits set (Integer.MAX_VALUE for 32-bit cells). */
+ protected final int valueMask;
+
+ /** Shared.threads() rounded up to a power of two; see calcMinArrays(). */
+ protected static int minArrays=calcMinArrays();
+ /** 31 minus the leading-zero count of numArrays, i.e. log2(numArrays) when it is a power of two. */
+ protected final int arrayBits;
+ /** Number of arrays the table is split into. */
+ protected final int numArrays;
+ /** numArrays-1. */
+ protected final int arrayMask;
+
+// protected static final int arrayBits=6;
+// protected static final int numArrays=1<<arrayBits;
+// protected static final int arrayMask=numArrays-1;
+
+ /** When true, the constructors print description() to stdout. */
+ public static boolean verbose=false;
+
+ /** Returns Shared.threads() rounded up to the nearest value that has exactly one bit set. */
+ private static final int calcMinArrays(){
+     int candidate=Shared.threads();
+     for(; Integer.bitCount(candidate)!=1; candidate++){}
+     return candidate;
+ }
+
+// public static final boolean isCanonical(long key, int k){
+// assert(k>3 && k<=32);
+// short a=(short)(key&canonMask);
+// short b=AminoAcid.rcompBinaryTable[(int)((key>>(2*(k-4)))&canonMask)];
+//// System.out.println("x="+Long.toBinaryString(key)+"\na="+Integer.toBinaryString(a)+"\nb="+Integer.toBinaryString(b)+"\n"+(a>=b));
+//// assert(a>=b || isCanonical(AminoAcid.reverseComplementBinaryFast(key, k), k));
+// return a>=b;
+// }
+
+// public static final boolean isCanonical(long key, int k){
+// assert(k>3 && k<=32);
+// short a=(short)(key&canonMask);
+// short b=AminoAcid.rcompBinaryTable[(int)((key>>(2*(k-4)))&canonMask)];
+//// System.out.println("x="+Long.toBinaryString(key)+"\na="+Integer.toBinaryString(a)+"\nb="+Integer.toBinaryString(b)+"\n"+(a>=b));
+//// assert(a>=b || isCanonical(AminoAcid.reverseComplementBinaryFast(key, k), k));
+// return a>=b;
+// }
+
+ /**
+  * A key is canonical when it is greater than or equal to its own reverse complement.
+  * @param key Encoded kmer
+  * @param k Kmer length; asserted to be in the range 4 to 32
+  */
+ public static final boolean isCanonical(long key, int k){
+     assert(k>3 && k<=32);
+     return key>=AminoAcid.reverseComplementBinaryFast(key, k);
+ }
+
+ /**
+  * Returns the reverse complement of key unconditionally.
+  * Assumes that the key is not canonical; this is checked only by the assertion that the
+  * reverse complement is at least as large as the key. Use makeCanonical2 when the key
+  * may already be canonical.
+  * @param key Encoded kmer
+  * @param k Kmer length; asserted to be in the range 4 to 32
+  */
+ public static final long makeCanonical(final long key, final int k){
+ assert(k>3 && k<=32);
+// assert(!isCanonical(key, k));
+ final long r=AminoAcid.reverseComplementBinaryFast(key, k);
+ assert(r>=key);
+// assert(isCanonical(r, k));
+// assert(AminoAcid.reverseComplementBinaryFast(r, k)==key);
+ return r;
+ }
+
+
+ /**
+  * Returns the canonical form of key: the larger of the key and its reverse complement.
+  * Unlike makeCanonical, the key may already be canonical.
+  * The reverse complement is computed once here; previously it was computed in isCanonical and then again.
+  * @param key Encoded kmer
+  * @param k Kmer length; asserted to be in the range 4 to 32
+  */
+ public static final long makeCanonical2(final long key, final int k){
+     assert(k>3 && k<=32);
+     final long r=AminoAcid.reverseComplementBinaryFast(key, k);
+     //Same decision isCanonical makes: the key wins ties.
+     return key>=r ? key : r;
+ }
+
+ /** Optional accessor for a prefilter array; this base implementation always throws RuntimeException. */
+ public KCountArray prefilter(){
+ throw new RuntimeException("TODO: Override");
+ }
+
+ /** Optional operation on the prefilter; this base implementation always throws RuntimeException. */
+ public void purgeFilter(){
+ throw new RuntimeException("TODO: Override");
+ }
+
+// private static final short[] canonMask={0, 3, 15, 63, 255, 1023, 4095, 16383};
+ /** Kmer length used to size canonMask. */
+ private static final long canonK=4;
+ //NOTE(review): the shift is evaluated as an int (1 is an int literal), which is fine for canonK=4
+ //but would overflow if canonK were ever raised to 16 or more; 1L would be needed then.
+ static final long canonMask=(1<<(canonK*2))-1; //e.g. 255 for k=4
+
+}
diff --git a/current/bloom/KCountArray2.java b/current/bloom/KCountArray2.java
new file mode 100755
index 0000000..7bb6ac7
--- /dev/null
+++ b/current/bloom/KCountArray2.java
@@ -0,0 +1,227 @@
+package bloom;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ */
+public class KCountArray2 {
+
+    /** Smoke test: constructs a 1024-cell table with 16 bits per cell. */
+    public static void main(String[] args){
+        new KCountArray2(1024, 16);
+    }
+
+    /** Creates a table with a gap of 0. */
+    public KCountArray2(long cells_, int bits_){
+        this(cells_, bits_, 0);
+    }
+
+    /**
+     * @param cells_ Number of cells; asserted to be a power of two.
+     * @param bits_ Bits per cell; asserted to be a power of two and, after any widening, not 32.
+     * @param gap_ Stored in the gap field; not otherwise used by this class.
+     */
+    public KCountArray2(long cells_, int bits_, int gap_){
+        gap=gap_;
+        assert(bits_<=32);
+        assert(Integer.bitCount(bits_)==1);
+        assert(Long.bitCount(cells_)==1);
+
+        while(bits_*cells_<32*numArrays){
+            assert(false);
+            bits_*=2;
+        } //Increases bits per cell so that at minimum each array is size 1
+
+        assert(bits_!=32);
+
+        cells=cells_;
+        cellBits=bits_;
+        valueMask=~((-1)<<cellBits);
+        maxValue=min(Integer.MAX_VALUE, ~((-1)<<min(cellBits,31)));
+        cellsPerWord=32/cellBits;
+        indexShift=Integer.numberOfTrailingZeros(cellsPerWord);
+        long words=cells/cellsPerWord;
+        int wordsPerArray=(int)(words/numArrays);
+        matrix=new int[numArrays][wordsPerArray];
+
+        if(verbose){
+            System.out.println("cells: \t"+cells);
+            System.out.println("cellBits:\t"+cellBits);
+            System.out.println("valueMask:\t"+Long.toHexString(valueMask));
+            System.out.println("maxValue:\t"+maxValue);
+            System.out.println("cellsPerWord:\t"+cellsPerWord);
+            System.out.println("indexShift:\t"+indexShift);
+            System.out.println("words: \t"+words);
+            System.out.println("wordsPerArray:\t"+wordsPerArray);
+            System.out.println("numArrays:\t"+numArrays);
+
+
+            long mem=words*4;
+            if(mem<(1<<30)){
+                System.out.println("memory: \t"+String.format("%.2f MB", mem*1d/(1<<20)));
+            }else{
+                System.out.println("memory: \t"+String.format("%.2f GB", mem*1d/(1<<30)));
+            }
+        }
+    }
+
+    /** Returns the count stored for key. */
+    public int read(long key){
+        final int[] array=matrix[(int)(key&arrayMask)];
+        final long cell=key>>>arrayBits;
+        final int word=array[(int)(cell>>>indexShift)];
+        return (word>>>shiftOf(cell))&valueMask;
+    }
+
+    /**
+     * Stores value as the count for key.
+     * The value is clamped to [0, maxValue] so it cannot spill into neighboring cells,
+     * and the used-cell counter is kept in sync.
+     */
+    public void write(long key, int value){
+        store(key, clamp(value));
+    }
+
+    /**
+     * Adds incr (which may be negative) to the count for key, saturating at 0 and maxValue.
+     * @return The new count
+     */
+    public int increment(long key, int incr){
+        final int updated=clamp((long)read(key)+incr);
+        store(key, updated);
+        return updated;
+    }
+
+    /** Returns unincremented value */
+    public int increment2(long key, int incr){
+        final int old=read(key);
+        store(key, clamp((long)old+incr));
+        return old;
+    }
+
+    /**
+     * Bit offset of a cell inside its word. Java uses only the low five bits of an int
+     * shift distance, so with a power-of-two cell width the product reduces to
+     * cellBits*(cell mod cellsPerWord).
+     */
+    private int shiftOf(final long cell){
+        return (int)(cellBits*cell);
+    }
+
+    /** Limits a requested count to the range a cell can hold. */
+    private int clamp(final long value){
+        return (int)max(0L, min(value, (long)maxValue));
+    }
+
+    /**
+     * Overwrites the cell for key with an already-clamped value and updates cellsUsed
+     * whenever the cell changes between zero and nonzero.
+     * @return The value the cell held before the write
+     */
+    private int store(final long key, final int clamped){
+        final int[] array=matrix[(int)(key&arrayMask)];
+        final long cell=key>>>arrayBits;
+        final int index=(int)(cell>>>indexShift);
+        final int cellShift=shiftOf(cell);
+        final int word=array[index];
+        final int old=(word>>>cellShift)&valueMask;
+        if(old==0 && clamped>0){cellsUsed++;}
+        else if(old>0 && clamped==0){cellsUsed--;}
+        array[index]=(clamped<<cellShift)|(word&~(valueMask<<cellShift));
+        return old;
+    }
+
+    /**
+     * Builds a histogram of cell values.
+     * @return Array of length 100000 where element v is the number of cells holding value v;
+     * values of 99999 or more are all tallied in the last element.
+     */
+    public long[] transformToFrequency(){
+        final long[] freq=new long[100000];
+        final int maxFreq=freq.length-1;
+
+        for(int[] array : matrix){
+            for(int word : array){
+                if(cellBits==32){
+                    freq[min(word, maxFreq)]++;
+                    continue;
+                }
+                assert(cellBits>0);
+                int remaining=cellsPerWord;
+                //Consume cells from the low end until only zero cells are left.
+                for(; word!=0; remaining--){
+                    freq[min(word&valueMask, maxFreq)]++;
+                    word=(word>>>cellBits);
+                }
+                freq[0]+=remaining;
+            }
+        }
+        return freq;
+    }
+
+    /** Lists every cell value in storage order as "[a, b, ...]". */
+    public String toString(){
+        StringBuilder sb=new StringBuilder();
+        sb.append("[");
+        String comma="";
+        for(int[] array : matrix){
+            for(int i=0; i<array.length; i++){
+                int word=array[i];
+                for(int j=0; j<cellsPerWord; j++){
+                    int x=word&valueMask;
+                    sb.append(comma);
+                    sb.append(x);
+                    word>>>=cellBits;
+                    comma=", ";
+                }
+            }
+        }
+        sb.append("]");
+        return sb.toString();
+    }
+
+    /** Fraction of cells holding a nonzero count, from the running cellsUsed counter. */
+    public double usedFraction(){return cellsUsed/(double)cells;}
+
+    /** Fraction of cells counted by cellsUsed(mindepth). */
+    public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;}
+
+    /**
+     * Counts cells whose value is at least mindepth by scanning the whole table.
+     * Cells above the highest nonzero cell of a word are not visited, so for
+     * mindepth of 0 or less the result can undercount empty cells.
+     */
+    public long cellsUsed(int mindepth){
+        long count=0;
+        for(int[] array : matrix){
+            if(array!=null){
+                for(int word : array){
+                    //Test against zero, not >0: a word whose top cell has its high bit set is negative as an int.
+                    while(word!=0){
+                        int x=word&valueMask;
+                        if(x>=mindepth){count++;}
+                        word>>>=cellBits;
+                    }
+                }
+            }
+        }
+        return count;
+    }
+
+    /** Formats cells*cellBits/8 bytes as KB (below 2^20), MB (below 2^30) or GB, with two decimals. */
+    public String mem(){
+        long mem=(cells*cellBits)/8;
+        if(mem<(1<<20)){
+            return (String.format("%.2f KB", mem*1d/(1<<10)));
+        }else if(mem<(1<<30)){
+            return (String.format("%.2f MB", mem*1d/(1<<20)));
+        }else{
+            return (String.format("%.2f GB", mem*1d/(1<<30)));
+        }
+    }
+
+    public static final int min(int x, int y){return x<y ? x : y;}
+    public static final int max(int x, int y){return x>y ? x : y;}
+    public static final long min(long x, long y){return x<y ? x : y;}
+    public static final long max(long x, long y){return x>y ? x : y;}
+
+    /** Number of cells currently holding a nonzero count. */
+    private long cellsUsed;
+
+    public final long cells;
+    public final int cellBits;
+    public final int maxValue;
+    public final int gap; //Set this for convenience on gapped tables to make sure you're using the right table.
+
+    private final int cellsPerWord;
+    private final int indexShift;
+    private final int valueMask;
+    private final int[][] matrix;
+
+    private static final int arrayBits=2;
+    private static final int numArrays=1<<arrayBits;
+    private static final int arrayMask=numArrays-1;
+
+    public static boolean verbose=false;
+
+
+}
diff --git a/current/bloom/KCountArray3.java b/current/bloom/KCountArray3.java
new file mode 100755
index 0000000..2eeb61a
--- /dev/null
+++ b/current/bloom/KCountArray3.java
@@ -0,0 +1,141 @@
+package bloom;
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public class KCountArray3 extends KCountArray {
+
+    /**
+     * Creates a direct-mapped table: each key addresses exactly one cell.
+     * @param cells_ Number of cells
+     * @param bits_ Bits per cell
+     * @param gap_ Stored by the superclass
+     */
+    public KCountArray3(long cells_, int bits_, int gap_){
+        super(cells_, bits_, gap_);
+        long words=cells/cellsPerWord;
+        int wordsPerArray=(int)(words/numArrays);
+        matrix=new int[numArrays][wordsPerArray];
+    }
+
+    @Override
+    public int read(long key){
+        if(verbose){System.err.println("Reading "+key);}
+        final int value=readCell(key);
+        if(verbose){System.err.println("Read "+value);}
+        return value;
+    }
+
+    /**
+     * Stores value as the count for key.
+     * The value is clamped to [0, maxValue] so it cannot spill into neighboring cells,
+     * and the used-cell counter is kept in sync.
+     */
+    @Override
+    public void write(long key, int value){
+        if(verbose){System.err.println("Writing "+key+", "+value);}
+        store(key, clamp(value));
+    }
+
+    /**
+     * Adds incr (which may be negative) to the count for key, saturating at 0 and maxValue.
+     * @return The new count
+     */
+    @Override
+    public int incrementAndReturn(long key, int incr){
+        if(verbose){System.err.println("*** Incrementing "+key);}
+        final int value=clamp((long)readCell(key)+incr);
+        store(key, value);
+        if(verbose){System.err.println("Returning "+value);}
+        return value;
+    }
+
+    /** Returns unincremented value */
+    @Override
+    public int incrementAndReturnUnincremented(long key, int incr){
+        if(verbose){System.err.println("Incrementing2 "+key);}
+        final int value=readCell(key);
+        store(key, clamp((long)value+incr));
+        if(verbose){System.err.println("Returning "+value);}
+        return value;
+    }
+
+    /** Reads the cell for key without any verbose logging. */
+    private int readCell(final long key){
+        final int[] array=matrix[(int)(key&arrayMask)];
+        final long cell=key>>>arrayBits;
+        final int word=array[(int)(cell>>>indexShift)];
+        return (word>>>shiftOf(cell))&valueMask;
+    }
+
+    /**
+     * Bit offset of a cell inside its word. Java uses only the low five bits of an int
+     * shift distance, so with a power-of-two cell width the product reduces to
+     * cellBits*(cell mod cellsPerWord).
+     */
+    private int shiftOf(final long cell){
+        return (int)(cellBits*cell);
+    }
+
+    /** Limits a requested count to the range a cell can hold. */
+    private int clamp(final long value){
+        return (int)max(0L, min(value, (long)maxValue));
+    }
+
+    /**
+     * Overwrites the cell for key with an already-clamped value and updates cellsUsed
+     * whenever the cell changes between zero and nonzero.
+     * @return The value the cell held before the write
+     */
+    private int store(final long key, final int clamped){
+        final int[] array=matrix[(int)(key&arrayMask)];
+        final long cell=key>>>arrayBits;
+        final int index=(int)(cell>>>indexShift);
+        final int cellShift=shiftOf(cell);
+        final int word=array[index];
+        final int old=(word>>>cellShift)&valueMask;
+        if(old==0 && clamped>0){cellsUsed++;}
+        else if(old>0 && clamped==0){cellsUsed--;}
+        array[index]=(clamped<<cellShift)|(word&~(valueMask<<cellShift));
+        return old;
+    }
+
+    @Override
+    public long[] transformToFrequency(){
+        return transformToFrequency(matrix);
+    }
+
+    /** Lists every cell value in storage order as "[a, b, ...]". */
+    @Override
+    public String toContentsString(){
+        StringBuilder sb=new StringBuilder();
+        sb.append("[");
+        String comma="";
+        for(int[] array : matrix){
+            for(int i=0; i<array.length; i++){
+                int word=array[i];
+                for(int j=0; j<cellsPerWord; j++){
+                    int x=word&valueMask;
+                    sb.append(comma);
+                    sb.append(x);
+                    word>>>=cellBits;
+                    comma=", ";
+                }
+            }
+        }
+        sb.append("]");
+        return sb.toString();
+    }
+
+    /** Fraction of cells holding a nonzero count, from the running cellsUsed counter. */
+    @Override
+    public double usedFraction(){return cellsUsed/(double)cells;}
+
+    /** Fraction of cells counted by cellsUsed(mindepth). */
+    @Override
+    public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;}
+
+    /**
+     * Counts cells whose value is at least mindepth by scanning the whole table.
+     * Cells above the highest nonzero cell of a word are not visited, so for
+     * mindepth of 0 or less the result can undercount empty cells.
+     */
+    @Override
+    public long cellsUsed(int mindepth){
+        long count=0;
+        for(int[] array : matrix){
+            if(array!=null){
+                for(int word : array){
+                    //Test against zero, not >0: a word whose top cell has its high bit set is negative as an int.
+                    while(word!=0){
+                        int x=word&valueMask;
+                        if(x>=mindepth){count++;}
+                        word>>>=cellBits;
+                    }
+                }
+            }
+        }
+        return count;
+    }
+
+    /** Hashing is not used by this direct-mapped table; asserts false and returns x unchanged. */
+    @Override
+    long hash(long x, int y) {
+        assert(false) : "Unsupported.";
+        return x;
+    }
+
+    /** Number of cells currently holding a nonzero count. */
+    private long cellsUsed;
+    private final int[][] matrix;
+
+}
diff --git a/current/bloom/KCountArray4.java b/current/bloom/KCountArray4.java
new file mode 100755
index 0000000..9fc75ba
--- /dev/null
+++ b/current/bloom/KCountArray4.java
@@ -0,0 +1,366 @@
+package bloom;
+
+import java.util.Random;
+
+import dna.Timer;
+
+
+/**
+ *
+ * Uses hashing rather than direct-mapping to support longer kmers.
+ *
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public class KCountArray4 extends KCountArray {
+
+ public static void main(String[] args){
+ long cells=Long.parseLong(args[0]);
+ int bits=Integer.parseInt(args[1]);
+ int gap=Integer.parseInt(args[2]);
+ int hashes=Integer.parseInt(args[3]);
+
+ verbose=false;
+
+ KCountArray4 kca=new KCountArray4(cells, bits, gap, hashes);
+
+ System.out.println(kca.read(0));
+ kca.increment(0);
+ System.out.println(kca.read(0));
+ kca.increment(0);
+ System.out.println(kca.read(0));
+ System.out.println();
+
+ System.out.println(kca.read(1));
+ kca.increment(1);
+ System.out.println(kca.read(1));
+ kca.increment(1);
+ System.out.println(kca.read(1));
+ System.out.println();
+
+ System.out.println(kca.read(100));
+ kca.increment(100);
+ System.out.println(kca.read(100));
+ kca.increment(100);
+ System.out.println(kca.read(100));
+ kca.increment(100);
+ System.out.println(kca.read(100));
+ System.out.println();
+
+
+ System.out.println(kca.read(150));
+ kca.increment(150);
+ System.out.println(kca.read(150));
+ System.out.println();
+
+ }
+
+ public KCountArray4(long cells_, int bits_, int gap_, int hashes_){ //hashes_ = number of hash rows consulted per key
+ super(cells_, bits_, gap_);
+ long words=cells/cellsPerWord; //Number of 32-bit words needed to hold all cells
+ assert(words/numArrays<=Integer.MAX_VALUE); //Each row must be indexable by an int
+ int wordsPerArray=(int)(words/numArrays);
+ hashes=hashes_;
+// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes);
+// assert(false);
+ matrix=new int[numArrays][wordsPerArray]; //All rows are allocated eagerly here
+ assert(hashes>0 && hashes<=hashMasks.length); //hash() indexes hashMasks by row, so there must be a mask row per hash
+ }
+
+ /** Returns the smallest value among the cells rawKey hashes to, stopping early once a zero is seen. */
+ public int read(final long rawKey){
+ if(verbose){System.err.println("Reading raw key "+rawKey);}
+ long hashed=hash(rawKey, 0);
+ int lowest=readHashed(hashed);
+ for(int row=1; row<hashes && lowest>0; row++){
+ if(verbose){System.err.println("Reading. i="+row+", key2="+hashed);}
+ hashed=hash(Long.rotateRight(hashed, hashBits), row);
+ if(verbose){System.err.println("Rot/hash. i="+row+", key2="+hashed);}
+ lowest=min(lowest, readHashed(hashed));
+ }
+ return lowest;
+ }
+
+ private int readHashed(long key){ //Returns the value of the single cell addressed by an already-hashed key
+ if(verbose){System.err.print("Reading hashed key "+key);}
+ key=((key&Long.MAX_VALUE)%(cells-1)); //Clear the sign bit, then reduce to a cell number; note the modulus is cells-1
+// System.out.println("key="+key);
+ int arrayNum=(int)(key&arrayMask); //Low bits select the row of the matrix
+// System.out.println("array="+arrayNum);
+ key>>>=arrayBits; //Remaining bits address a cell within that row
+// System.out.println("key2="+key);
+ int[] array=matrix[arrayNum];
+ int index=(int)(key>>>indexShift); //Word that contains the cell
+// assert(false) : indexShift;
+// System.out.println("index="+index);
+ int word=array[index];
+// System.out.println("word="+Integer.toHexString(word));
+ assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); //Checks that the unmasked shift below picks the same cell as an explicitly masked one
+// int cellShift=(int)(cellBits*(key&cellMask));
+ int cellShift=(int)(cellBits*key); //Java uses only the low 5 bits of an int shift distance
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));}
+// System.out.println("cellShift="+cellShift);
+ return (int)((word>>>cellShift)&valueMask);
+ }
+
+ public void write(final long key, int value){
+ throw new RuntimeException("Not allowed for this class.");
+ }
+
+ public int incrementAndReturn(final long rawKey, int incr){ //Adds incr to rawKey's count and returns the new (saturated) count
+// verbose=(rawKey==32662670693L);
+ if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+// verbose=true;
+ assert(incr>0);
+
+ long key2=rawKey;
+ if(hashes==1){ //Single-hash case: one cell, no minimum to compute
+ key2=hash(key2, 0);
+ int x=incrementHashedIfAtMost(key2, incr, maxValue-1); //A cell already at maxValue is left alone
+ assert(x>=incr) : "original=?, new should be >="+(incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+ return x;
+ }
+
+ final int min=read(rawKey); //Current count = minimum over all of the key's cells
+ if(min>=maxValue){return maxValue;} //Already saturated; nothing to write
+
+ assert(key2==rawKey);
+ for(int i=0; i<hashes; i++){ //Same hash/rotate sequence that read() walks
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+ int x=incrementHashedIfAtMost(key2, incr, min); //Cells holding more than min are returned unchanged
+ assert(x>=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey; //NOTE(review): looks like this can fail when incr>1 (saturation, or a cell only slightly above min) - confirm
+ if(verbose){System.err.println("postIncr value="+readHashed(key2));}
+// assert(read(rawKey)<=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+// assert(readHashed(key2)>=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+ key2=Long.rotateRight(key2, hashBits);
+ }
+// assert(read(rawKey)==min+incr) : "original="+min+", new should be "+(min+incr)+", new="+read(rawKey)+", max="+maxValue;
+// assert(false);
+ return min(min+incr, maxValue);
+ }
+
+ /** Increments the count for rawKey by incr and returns the count as it was before this call. */
+ public int incrementAndReturnUnincremented(long rawKey, int incr){
+ if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+ assert(incr>0);
+
+ long key2=rawKey;
+ if(hashes==1){
+ key2=hash(key2, 0);
+ //incrementHashedIfAtMost returns the post-increment value, so capture the old value first.
+ final int old=readHashed(key2);
+ int x=incrementHashedIfAtMost(key2, incr, maxValue-1);
+ assert(x>=incr) : "original=?, new should be >="+(incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+ return old;
+ }
+
+ final int min=read(rawKey);
+ if(min>=maxValue){return maxValue;}
+
+ assert(key2==rawKey);
+ for(int i=0; i<hashes; i++){
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+ int x=incrementHashedIfAtMost(key2, incr, min);
+ assert(x>=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+ if(verbose){System.err.println("postIncr value="+readHashed(key2));}
+// assert(read(rawKey)<=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+// assert(readHashed(key2)>=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+ key2=Long.rotateRight(key2, hashBits);
+ }
+// assert(read(rawKey)==min+incr) : "original="+min+", new should be "+(min+incr)+", new="+read(rawKey)+", max="+maxValue;
+// assert(false);
+ return min;
+ }
+
+ private int incrementHashedIfAtMost(long key, int incr, int lim){
+ if(verbose){System.err.print("incrementing hashed key "+key);}
+ key=((key&Long.MAX_VALUE)%(cells-1));
+ int arrayNum=(int)(key&arrayMask);
+ key>>>=arrayBits;
+ int[] array=matrix[arrayNum];
+ int index=(int)(key>>>indexShift);
+ int word=array[index];
+ int cellShift=(int)(cellBits*key);
+ int value=((word>>>cellShift)&valueMask);
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+value+", limit="+lim);}
+ if(value>lim){return value;}
+ if(value==0 && incr>0){cellsUsed++;}
+ value=min(value+incr, maxValue);
+ word=(value<<cellShift)|(word&~((valueMask)<<cellShift));
+ array[index]=word;
+ return value;
+ }
+
+ private int incrementHashed(long key, int incr){ //Unconditionally adds incr to the cell for key; returns the new value
+ assert(incr>0);
+ int arrayNum=(int)(key&arrayMask); //NOTE(review): unlike incrementHashedIfAtMost, key is not reduced modulo cells-1 here - presumably callers pass an already-reduced key; confirm
+ key>>>=arrayBits;
+ int[] array=matrix[arrayNum];
+ int index=(int)(key>>>indexShift);
+ int word=array[index];
+ int cellShift=(int)(cellBits*key);
+ int value=((word>>>cellShift)&valueMask);
+ if(value==0 && incr>0){cellsUsed++;} //First write to this cell
+ value=min(value+incr, maxValue); //Saturate rather than overflow into the neighboring cell
+ word=(value<<cellShift)|(word&~((valueMask)<<cellShift)); //Splice the new value into the word
+ array[index]=word;
+ return value;
+ }
+
+ public long[] transformToFrequency(){
+ return transformToFrequency(matrix);
+ }
+
+ /** Renders every cell value, in storage order, as a bracketed comma-separated list. */
+ public String toContentsString(){
+ final StringBuilder out=new StringBuilder();
+ out.append("[");
+ boolean first=true;
+ for(final int[] row : matrix){
+ for(int packed : row){
+ //Peel cells off the low end of the word, one per iteration.
+ for(int c=0; c<cellsPerWord; c++){
+ if(!first){out.append(", ");}
+ first=false;
+ out.append(packed&valueMask);
+ packed>>>=cellBits;
+ }
+ }
+ }
+ out.append("]");
+ return out.toString();
+ }
+
+ public double usedFraction(){return cellsUsed/(double)cells;}
+
+ public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;}
+
+ /** Counts cells whose stored value is at least mindepth; null rows are skipped. */
+ public long cellsUsed(int mindepth){
+ long count=0;
+ for(int[] array : matrix){
+ if(array==null){continue;}
+ for(int word : array){
+ //Words are signed: test !=0 (not >0) so a set top bit is not skipped; the cellsPerWord bound guarantees termination.
+ for(int j=0; j<cellsPerWord && word!=0; j++){
+ if((word&valueMask)>=mindepth){count++;}
+ word>>>=cellBits;
+ }
+ }
+ }
+ return count;
+ }
+
+
+ final long hash(long key, int row){ //Scrambles key by XOR with a mask chosen from hashMasks[row]
+ int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); //Mask column; the modulus is hashArrayLength-1, so this expression never yields the last column
+// int cell=(int)(hashCellMask&(key));
+
+ if(row==0){//Doublehash only first time
+ key=key^hashMasks[(row+4)%hashMasks.length][cell]; //Pre-scramble with a different mask row
+ cell=(int)(hashCellMask&(key>>4)); //Re-pick the column from the scrambled key
+// cell=(int)(hashCellMask&(key>>hashBits));
+// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+ }
+
+ return key^hashMasks[row][cell];
+ }
+
+ /**
+ * @param rows Number of mask rows to create
+ * @param cols Number of masks per row (fillMasks asserts this equals 1<<hashBits)
+ * @return A rows x cols table filled by fillMasks from a Random seeded with a per-class counter
+ */
+ private static long[][] makeMasks(int rows, int cols) {
+
+ long seed;
+ synchronized(KCountArray4.class){ //Each call takes the next counter value as its seed
+ seed=counter;
+ counter++;
+ }
+
+ Timer t=new Timer();
+ long[][] r=new long[rows][cols];
+ Random randy=new Random(seed); //Seeded, so the mask table is reproducible for a given counter value
+ for(int i=0; i<r.length; i++){
+ fillMasks(r[i], randy);
+ }
+ t.stop();
+ if(t.elapsed>200000000L){System.out.println("Mask-creation time: "+t);} //Report only slow builds; presumably elapsed is in nanoseconds (0.2s) - TODO confirm against dna.Timer
+ return r;
+ }
+
+
+ /**
+ * Fills r with random masks that have 16 bits set in each 32-bit half (before the sign bit is cleared).
+ * @param r Destination array; its length must equal 1<<hashBits
+ * @param randy Source of random bits
+ */
+ private static void fillMasks(long[] r, Random randy) {
+// for(int i=0; i<r.length; i++){
+// long x=0;
+// while(Long.bitCount(x&0xFFFFFFFF)!=16){
+// x=randy.nextLong();
+// }
+// r[i]=(x&Long.MAX_VALUE);
+// }
+
+ final int hlen=(1<<hashBits);
+ assert(r.length==hlen);
+ int[] count1=new int[hlen]; //How many accepted masks have each value in their lowest hashBits bits
+ int[] count2=new int[hlen]; //Same, for the next hashBits bits
+ final long mask=hlen-1;
+
+ for(int i=0; i<r.length; i++){
+ long x=0;
+ int y=0;
+ int z=0;
+ while(Long.bitCount(x&0xFFFFFFFFL)!=16){ //x==0 means "no accepted candidate yet"
+ x=randy.nextLong();
+ while(Long.bitCount(x&0xFFFFFFFFL)<16){ //Set random low-half bits until 16 are set
+ x|=(1L<<randy.nextInt(32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFFL)>16){ //Clear random low-half bits until 16 are set
+ x&=(~(1L<<randy.nextInt(32)));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)<16){ //Same balancing for the high half
+ x|=(1L<<(randy.nextInt(32)+32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)>16){
+ x&=(~(1L<<(randy.nextInt(32)+32)));
+ }
+
+// System.out.print(".");
+// y=(((int)(x&mask))^i);
+ y=(((int)(x&mask)));
+ z=(int)((x>>hashBits)&mask);
+ if(count1[y]>0 || count2[z]>0){ //Reject candidates whose low bit fields repeat an earlier mask's
+ x=0;
+ }
+ }
+// System.out.println(Long.toBinaryString(x));
+ r[i]=(x&Long.MAX_VALUE); //Stored masks are non-negative
+ count1[y]++;
+ count2[z]++;
+ }
+
+ }
+
+ public long cellsUsed(){return cellsUsed;}
+
+ private long cellsUsed;
+ private final int[][] matrix;
+ private final int hashes;
+
+
+ private static final int hashBits=6;
+ private static final int hashArrayLength=1<<hashBits;
+ private static final int hashCellMask=hashArrayLength-1;
+ private final long[][] hashMasks=makeMasks(8, hashArrayLength);
+
+ private static long counter=0;
+
+}
diff --git a/current/bloom/KCountArray4MT.java b/current/bloom/KCountArray4MT.java
new file mode 100755
index 0000000..99b9da1
--- /dev/null
+++ b/current/bloom/KCountArray4MT.java
@@ -0,0 +1,529 @@
+package bloom;
+
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import dna.Timer;
+
+
+/**
+ *
+ * Uses hashing rather than direct-mapping to support longer kmers.
+ *
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public class KCountArray4MT extends KCountArray {
+
+ /** Smoke test. Args: cells, bits, gap, hashes. Increments a few keys, then prints their counts. */
+ public static void main(String[] args){
+ long cells=Long.parseLong(args[0]);
+ int bits=Integer.parseInt(args[1]);
+ int gap=Integer.parseInt(args[2]);
+ int hashes=Integer.parseInt(args[3]);
+
+ verbose=false;
+
+ KCountArray4MT kca=new KCountArray4MT(cells, bits, gap, hashes);
+
+ //The matrix rows are allocated by the writer threads and read() asserts finished, so every
+ //increment must fall between initialize() and shutdown(), and every read after shutdown().
+ kca.initialize();
+
+ kca.increment(0);
+ kca.increment(0);
+
+ kca.increment(1);
+ kca.increment(1);
+
+ kca.increment(100);
+ kca.increment(100);
+ kca.increment(100);
+
+ kca.increment(150);
+
+ kca.shutdown();
+
+ System.out.println(kca.read(0));
+ System.out.println();
+
+ System.out.println(kca.read(1));
+ System.out.println();
+
+ System.out.println(kca.read(100));
+ System.out.println();
+ System.out.println(kca.read(150));
+ System.out.println();
+ }
+
+ public KCountArray4MT(long cells_, int bits_, int gap_, int hashes_){ //hashes_ = number of hash rows consulted per key
+ super(cells_, bits_, gap_);
+// verbose=false;
+ long words=cells/cellsPerWord; //Number of 32-bit words needed to hold all cells
+ long x=(words/numArrays);
+ if(x>=Integer.MAX_VALUE){x=Integer.MAX_VALUE-3;} //Clamp the row length to something an int can index
+ assert(x<=Integer.MAX_VALUE);
+ wordsPerArray=(int)(x);
+ cellsPerArray=cells/numArrays; //NOTE(review): computed from the unclamped cell count, so it may exceed what a clamped row can hold - confirm
+ cellMod=cellsPerArray-1; //Modulus applied to hashed keys when addressing a cell
+ hashes=hashes_;
+// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes);
+// assert(false);
+ matrix=new int[numArrays][]; //Rows stay null here; each WriteThread allocates its own row in run()
+ assert(hashes>0 && hashes<=hashMasks.length);
+ }
+
+ /** Returns the smallest value among the cells rawKey hashes to, stopping early once a zero is seen. */
+ public int read(final long rawKey){
+ assert(finished);
+ if(verbose){System.err.println("Reading raw key "+rawKey);}
+ long hashed=hash(rawKey, 0);
+ int lowest=readHashed(hashed);
+ for(int row=1; row<hashes && lowest>0; row++){
+ if(verbose){System.err.println("Reading. i="+row+", key2="+hashed);}
+ hashed=hash(Long.rotateRight(hashed, hashBits), row);
+ if(verbose){System.err.println("Rot/hash. i="+row+", key2="+hashed);}
+ lowest=min(lowest, readHashed(hashed));
+ }
+ return lowest;
+ }
+
+ private int readHashed(long key){
+ if(verbose){System.err.print("Reading hashed key "+key);}
+// System.out.println("key="+key);
+ int arrayNum=(int)(key&arrayMask);
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+// System.out.println("array="+arrayNum);
+// System.out.println("key2="+key);
+ int[] array=matrix[arrayNum];
+ int index=(int)(key>>>indexShift);
+// assert(false) : indexShift;
+// System.out.println("index="+index);
+ int word=array[index];
+// System.out.println("word="+Integer.toHexString(word));
+ assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask)));
+// int cellShift=(int)(cellBits*(key&cellMask));
+ int cellShift=(int)(cellBits*key);
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));}
+// System.out.println("cellShift="+cellShift);
+ return (int)((word>>>cellShift)&valueMask);
+ }
+
+ public void write(final long key, int value){
+ throw new RuntimeException("Not allowed for this class.");
+ }
+
+// @Override
+// /** This should increase speed by doing the first hash outside the critical section, but it does not seem to help. */
+// public void increment(long[] keys){
+// for(int i=0; i<keys.length; i++){
+// keys[i]=hash(keys[i], 0);
+// }
+// synchronized(buffers){
+// for(long key : keys){
+// incrementPartiallyHashed(key);
+// }
+// }
+// }
+
+ public void increment(final long rawKey){ //Queues one increment per hash; the cells themselves are updated by the writer threads
+ if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+
+ long key2=rawKey;
+ for(int i=0; i<hashes; i++){
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+// assert(readHashed(key2)==0);
+
+ int bnum=(int)(key2&arrayMask); //Low bits pick the row, and therefore the buffer and writer
+ long[] array=buffers[bnum];
+ int loc=bufferlen[bnum];
+ array[loc]=key2; //Append the hashed key to that row's pending buffer
+ bufferlen[bnum]++;
+ if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);}
+ if(bufferlen[bnum]>=array.length){ //Buffer full: hand it to the writer and start a fresh one
+
+ if(verbose){System.err.println("Moving array.");}
+ bufferlen[bnum]=0;
+ buffers[bnum]=new long[array.length];
+
+ writers[bnum].add(array);
+ if(verbose){System.err.println("Moved.");}
+ }
+// assert(read(rawKey)<=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+// assert(readHashed(key2)>=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+ key2=Long.rotateRight(key2, hashBits); //Rotate before the next hash, matching the sequence read() walks
+ }
+ }
+
+ private void incrementPartiallyHashed(final long pKey){
+ if(verbose){System.err.println("\n*** Incrementing key "+pKey+" ***");}
+
+ long key2=pKey;
+
+ {
+ int bnum=(int)(key2&arrayMask);
+ long[] array=buffers[bnum];
+ int loc=bufferlen[bnum];
+ array[loc]=key2;
+ bufferlen[bnum]++;
+ if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);}
+ if(bufferlen[bnum]>=array.length){
+
+ if(verbose){System.err.println("Moving array.");}
+ bufferlen[bnum]=0;
+ buffers[bnum]=new long[array.length];
+
+ writers[bnum].add(array);
+ if(verbose){System.err.println("Moved.");}
+ }
+ }
+
+ for(int i=1; i<hashes; i++){
+ key2=Long.rotateRight(key2, hashBits);
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+// assert(readHashed(key2)==0);
+
+ int bnum=(int)(key2&arrayMask);
+ long[] array=buffers[bnum];
+ int loc=bufferlen[bnum];
+ array[loc]=key2;
+ bufferlen[bnum]++;
+ if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);}
+ if(bufferlen[bnum]>=array.length){
+
+ if(verbose){System.err.println("Moving array.");}
+ bufferlen[bnum]=0;
+ buffers[bnum]=new long[array.length];
+
+ writers[bnum].add(array);
+ if(verbose){System.err.println("Moved.");}
+ }
+ }
+ }
+
+ public int incrementAndReturn(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ /** Returns unincremented value */
+ public int incrementAndReturnUnincremented(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ public long[] transformToFrequency(){
+ return transformToFrequency(matrix);
+ }
+
+ /** Renders every allocated cell value, in storage order, as a bracketed comma-separated list. */
+ public String toContentsString(){
+ StringBuilder sb=new StringBuilder();
+ sb.append("[");
+ String comma="";
+ for(int[] array : matrix){
+ if(array==null){continue;} //Rows are allocated by the writer threads; skip rows that do not exist yet, as cellsUsed(int) does
+ for(int word : array){
+ for(int j=0; j<cellsPerWord; j++){
+ sb.append(comma);
+ sb.append(word&valueMask);
+ word>>>=cellBits;
+ comma=", ";
+ }
+ }
+ }
+ sb.append("]");
+ return sb.toString();
+ }
+
+ public double usedFraction(){return cellsUsed/(double)cells;}
+
+ public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;}
+
+ /** Counts cells whose stored value is at least mindepth; null rows are skipped. */
+ public long cellsUsed(int mindepth){
+ long count=0;
+ for(int[] array : matrix){
+ if(array==null){continue;}
+ for(int word : array){
+ //Words are signed: test !=0 (not >0) so a set top bit is not skipped; the cellsPerWord bound guarantees termination.
+ for(int j=0; j<cellsPerWord && word!=0; j++){
+ if((word&valueMask)>=mindepth){count++;}
+ word>>>=cellBits;
+ }
+ }
+ }
+ return count;
+ }
+
+
+ final long hash(long key, int row){
+ int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+// int cell=(int)(hashCellMask&(key));
+
+ if(row==0){//Doublehash only first time
+ key=key^hashMasks[(row+4)%hashMasks.length][cell];
+ cell=(int)(hashCellMask&(key>>4));
+// cell=(int)(hashCellMask&(key>>hashBits));
+// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+ }
+
+ return key^hashMasks[row][cell];
+ }
+
+ /**
+ * @param i
+ * @param j
+ * @return
+ */
+ private static long[][] makeMasks(int rows, int cols) {
+
+ long seed;
+ synchronized(KCountArray4MT.class){
+ seed=counter;
+ counter++;
+ }
+
+ Timer t=new Timer();
+ long[][] r=new long[rows][cols];
+ Random randy=new Random(seed);
+ for(int i=0; i<r.length; i++){
+ fillMasks(r[i], randy);
+ }
+ t.stop();
+ if(t.elapsed>200000000L){System.out.println("Mask-creation time: "+t);}
+ return r;
+ }
+
+
+ /**
+ * @param cols
+ * @param randy
+ * @return
+ */
+ private static void fillMasks(long[] r, Random randy) {
+// for(int i=0; i<r.length; i++){
+// long x=0;
+// while(Long.bitCount(x&0xFFFFFFFF)!=16){
+// x=randy.nextLong();
+// }
+// r[i]=(x&Long.MAX_VALUE);
+// }
+
+ final int hlen=(1<<hashBits);
+ assert(r.length==hlen);
+ int[] count1=new int[hlen];
+ int[] count2=new int[hlen];
+ final long mask=hlen-1;
+
+ for(int i=0; i<r.length; i++){
+ long x=0;
+ int y=0;
+ int z=0;
+ while(Long.bitCount(x&0xFFFFFFFFL)!=16){
+ x=randy.nextLong();
+ while(Long.bitCount(x&0xFFFFFFFFL)<16){
+ x|=(1L<<randy.nextInt(32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFFL)>16){
+ x&=(~(1L<<randy.nextInt(32)));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)<16){
+ x|=(1L<<(randy.nextInt(32)+32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)>16){
+ x&=(~(1L<<(randy.nextInt(32)+32)));
+ }
+
+// System.out.print(".");
+// y=(((int)(x&mask))^i);
+ y=(((int)(x&mask)));
+ z=(int)((x>>hashBits)&mask);
+ if(count1[y]>0 || count2[z]>0){
+ x=0;
+ }
+ }
+// System.out.println(Long.toBinaryString(x));
+ r[i]=(x&Long.MAX_VALUE);
+ count1[y]++;
+ count2[z]++;
+ }
+
+ }
+
+
+ public void initialize(){ //Creates and starts one WriteThread per matrix row
+ for(int i=0; i<writers.length; i++){
+ writers[i]=new WriteThread(i);
+ writers[i].start(); //The thread allocates matrix[i] itself in run()
+
+// while(!writers[i].isAlive()){
+// System.out.print(".");
+// }
+ }
+ }
+
+ public void shutdown(){ //Flushes pending keys, stops the writer threads and marks the array readable
+ if(finished){return;}
+ synchronized(this){
+ if(finished){return;} //Re-check under the lock
+
+ //Flush each row's partially filled buffer to its writer
+ for(int i=0; i<numArrays; i++){
+ long[] array=buffers[i];
+ int len=bufferlen[i];
+ buffers[i]=null;
+ bufferlen[i]=0;
+
+ if(len<array.length){
+ array=Arrays.copyOf(array, len); //Trim to the filled prefix so unused slots are not counted
+ }
+
+ if(array.length>0){
+ writers[i].add(array);
+ }
+ }
+
+ //Add poison
+ for(WriteThread wt : writers){
+ wt.add(poison); //WriteThread.run compares against this sentinel by identity and then exits
+ }
+
+ //Wait for termination
+ for(WriteThread wt : writers){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ while(wt.isAlive()){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ try {
+ wt.join(10000);
+ } catch (InterruptedException e) {
+ // Interrupted while waiting: report it and keep waiting for the writer to exit
+ e.printStackTrace();
+ }
+ if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");}
+ }
+ cellsUsed+=wt.cellsUsedPersonal; //Fold the writer's private count into the total once it has exited
+// System.out.println("cellsUsed="+cellsUsed);
+ }
+
+ assert(!finished);
+ finished=true;
+ }
+ }
+
+ private class WriteThread extends Thread{
+
+ public WriteThread(int tnum){
+ num=tnum;
+ }
+
+ @Override
+ public void run(){
+ assert(matrix[num]==null);
+ array=new int[wordsPerArray]; //Makes NUMA systems use local memory.
+
+ matrix[num]=array;
+
+ long[] keys=null;
+ while(!shutdown){
+
+ if(verbose){System.err.println(" - Reading keys for wt"+num+".");}
+ while(keys==null){
+ try {
+ keys=writeQueue.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if(keys==poison){
+// assert(false);
+ shutdown=true;
+ }else{
+ for(long key : keys){
+ incrementHashedLocal(key);
+ }
+ }
+// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length);
+ if(verbose){System.err.println(" -- Read keys for wt"+num+". (success)");}
+ keys=null;
+ if(verbose){System.err.println("shutdown="+shutdown);}
+ }
+
+// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+".");
+// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+".";
+
+ array=null;
+ }
+
+ private void add(long[] keys){
+// assert(isAlive());
+ assert(!shutdown);
+ if(shutdown){return;}
+// assert(keys!=poison);
+ if(verbose){System.err.println(" + Adding keys to wt"+num+".");}
+ boolean success=false;
+ while(!success){
+ try {
+ writeQueue.put(keys);
+ success=true;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");}
+ }
+
+ private int incrementHashedLocal(long key){
+ assert((key&arrayMask)==num);
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int word=array[index];
+ int cellShift=(int)(cellBits*key);
+ int value=((word>>>cellShift)&valueMask);
+ if(value==0){cellsUsedPersonal++;}
+ value=min(value+1, maxValue);
+ word=(value<<cellShift)|(word&~((valueMask)<<cellShift));
+ array[index]=word;
+ return value;
+ }
+
+ private int[] array;
+ private final int num;
+ public long cellsUsedPersonal=0;
+
+ public ArrayBlockingQueue<long[]> writeQueue=new ArrayBlockingQueue<long[]>(16);
+ public boolean shutdown=false;
+
+ }
+
+
+ public long cellsUsed(){return cellsUsed;}
+
+ private boolean finished=false;
+
+ private long cellsUsed;
+ private final int[][] matrix;
+ private final WriteThread[] writers=new WriteThread[numArrays];
+ private final int hashes;
+ private final int wordsPerArray;
+ private final long cellsPerArray;
+ private final long cellMod;
+ private final long[][] hashMasks=makeMasks(8, hashArrayLength);
+
+ private final long[][] buffers=new long[numArrays][1000];
+ private final int[] bufferlen=new int[numArrays];
+
+ private static final int hashBits=6;
+ private static final int hashArrayLength=1<<hashBits;
+ private static final int hashCellMask=hashArrayLength-1;
+ private static final long[] poison=new long[0];
+
+ private static long counter=0;
+
+}
diff --git a/current/bloom/KCountArray5MT.java b/current/bloom/KCountArray5MT.java
new file mode 100755
index 0000000..354b7e3
--- /dev/null
+++ b/current/bloom/KCountArray5MT.java
@@ -0,0 +1,501 @@
+package bloom;
+
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import dna.Timer;
+
+
+/**
+ *
+ * Uses hashing rather than direct-mapping to support longer kmers.
+ *
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public class KCountArray5MT extends KCountArray {
+
+ /** Smoke test. Args: cells, bits, gap, hashes. Increments a few keys, then prints their counts. */
+ public static void main(String[] args){
+ long cells=Long.parseLong(args[0]);
+ int bits=Integer.parseInt(args[1]);
+ int gap=Integer.parseInt(args[2]);
+ int hashes=Integer.parseInt(args[3]);
+
+ verbose=false;
+
+ KCountArray5MT kca=new KCountArray5MT(cells, bits, gap, hashes);
+
+ //The matrix rows are allocated by the writer threads and read() asserts finished, so every
+ //increment must fall between initialize() and shutdown(), and every read after shutdown().
+ kca.initialize();
+
+ kca.increment(0);
+ kca.increment(0);
+
+ kca.increment(1);
+ kca.increment(1);
+
+ kca.increment(100);
+ kca.increment(100);
+ kca.increment(100);
+
+ kca.increment(150);
+
+ kca.shutdown();
+
+ System.out.println(kca.read(0));
+ System.out.println();
+
+ System.out.println(kca.read(1));
+ System.out.println();
+
+ System.out.println(kca.read(100));
+ System.out.println();
+ System.out.println(kca.read(150));
+ System.out.println();
+ }
+
+ public KCountArray5MT(long cells_, int bits_, int gap_, int hashes_){
+ super(cells_, bits_, gap_);
+// verbose=false;
+ long words=cells/cellsPerWord;
+ assert(words/numArrays<=Integer.MAX_VALUE);
+ wordsPerArray=(int)(words/numArrays);
+ cellsPerArray=cells/numArrays;
+ cellMod=cellsPerArray-1;
+ hashes=hashes_;
+// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes);
+// assert(false);
+ matrix=new int[numArrays][];
+ assert(hashes>0 && hashes<=hashMasks.length);
+ }
+
+ /** Returns the smallest value among the cells rawKey hashes to, stopping early once a zero is seen. */
+ public int read(final long rawKey){
+ assert(finished);
+ if(verbose){System.err.println("Reading raw key "+rawKey);}
+ long hashed=hash(rawKey, 0);
+ int lowest=readHashed(hashed);
+ for(int row=1; row<hashes && lowest>0; row++){
+ if(verbose){System.err.println("Reading. i="+row+", key2="+hashed);}
+ hashed=hash(Long.rotateRight(hashed, hashBits), row);
+ if(verbose){System.err.println("Rot/hash. i="+row+", key2="+hashed);}
+ lowest=min(lowest, readHashed(hashed));
+ }
+ return lowest;
+ }
+
+ private int readHashed(long key){
+ assert(finished);
+ if(verbose){System.err.print("Reading hashed key "+key);}
+// System.out.println("key="+key);
+ int arrayNum=(int)(key&arrayMask);
+ key=(key>>>arrayBits)%(cellMod);
+// System.out.println("array="+arrayNum);
+// System.out.println("key2="+key);
+ int[] array=matrix[arrayNum];
+ int index=(int)(key>>>indexShift);
+// assert(false) : indexShift;
+// System.out.println("index="+index);
+ int word=array[index];
+// System.out.println("word="+Integer.toHexString(word));
+ assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask)));
+// int cellShift=(int)(cellBits*(key&cellMask));
+ int cellShift=(int)(cellBits*key);
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));}
+// System.out.println("cellShift="+cellShift);
+ return (int)((word>>>cellShift)&valueMask);
+ }
+
+ public void write(final long key, int value){
+ throw new RuntimeException("Not allowed for this class.");
+ }
+
+ public void increment(final long rawKey){ //Buffers the first-stage hash of rawKey; writers receive the buffer once it is full
+ if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+
+ buffer[bufferlen]=hash(rawKey, 0); //Only hash row 0 is applied here
+ bufferlen++;
+
+ if(bufferlen>=buffer.length){
+
+ if(verbose){System.err.println("Moving array.");}
+
+ for(int w=0; w<writers.length; w++){
+ writers[w].add(buffer); //Every writer is handed the same array instance
+ }
+ bufferlen=0;
+ buffer=new long[buffer.length]; //Start a fresh buffer; the old one now belongs to the writers
+ if(verbose){System.err.println("Moved.");}
+ }
+
+ }
+
+
+ public synchronized void increment(long[] keys){ //Bulk variant: hashes keys in place and hands the array to every writer
+ for(int i=0; i<keys.length; i++){
+ keys[i]=hash(keys[i],0); //Overwrites the caller's array with first-stage hashes
+ }
+ for(int w=0; w<writers.length; w++){
+ writers[w].add(keys); //The same array instance is queued to each writer
+ }
+ }
+
+ public int incrementAndReturn(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ /** Returns unincremented value */
+ public int incrementAndReturnUnincremented(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ public long[] transformToFrequency(){
+ return transformToFrequency(matrix);
+ }
+
+ /** Renders every allocated cell value, in storage order, as a bracketed comma-separated list. */
+ public String toContentsString(){
+ StringBuilder sb=new StringBuilder();
+ sb.append("[");
+ String comma="";
+ for(int[] array : matrix){
+ if(array==null){continue;} //Rows are allocated by the writer threads; skip rows that do not exist yet, as cellsUsed(int) does
+ for(int word : array){
+ for(int j=0; j<cellsPerWord; j++){
+ sb.append(comma);
+ sb.append(word&valueMask);
+ word>>>=cellBits;
+ comma=", ";
+ }
+ }
+ }
+ sb.append("]");
+ return sb.toString();
+ }
+
+ public double usedFraction(){return cellsUsed/(double)cells;}
+
+ public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;}
+
+ /** Counts cells whose stored value is at least mindepth; null rows are skipped. */
+ public long cellsUsed(int mindepth){
+ long count=0;
+ for(int[] array : matrix){
+ if(array==null){continue;}
+ for(int word : array){
+ //Words are signed: test !=0 (not >0) so a set top bit is not skipped; the cellsPerWord bound guarantees termination.
+ for(int j=0; j<cellsPerWord && word!=0; j++){
+ if((word&valueMask)>=mindepth){count++;}
+ word>>>=cellBits;
+ }
+ }
+ }
+ return count;
+ }
+
+
+ final long hash(long key, int row){
+ int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+// int cell=(int)(hashCellMask&(key));
+
+ if(row==0){//Doublehash only first time
+ key=key^hashMasks[(row+4)%hashMasks.length][cell];
+ cell=(int)(hashCellMask&(key>>4));
+// cell=(int)(hashCellMask&(key>>hashBits));
+// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+ }
+
+ return key^hashMasks[row][cell];
+ }
+
+ /**
+ * @param i
+ * @param j
+ * @return
+ */
+ private static long[][] makeMasks(int rows, int cols) {
+
+ long seed;
+ synchronized(KCountArray5MT.class){
+ seed=counter;
+ counter++;
+ }
+
+ Timer t=new Timer();
+ long[][] r=new long[rows][cols];
+ Random randy=new Random(seed);
+ for(int i=0; i<r.length; i++){
+ fillMasks(r[i], randy);
+ }
+ t.stop();
+ if(t.elapsed>200000000L){System.out.println("Mask-creation time: "+t);}
+ return r;
+ }
+
+
+ /**
+ * @param cols
+ * @param randy
+ * @return
+ */
+ private static void fillMasks(long[] r, Random randy) {
+// for(int i=0; i<r.length; i++){
+// long x=0;
+// while(Long.bitCount(x&0xFFFFFFFF)!=16){
+// x=randy.nextLong();
+// }
+// r[i]=(x&Long.MAX_VALUE);
+// }
+
+ final int hlen=(1<<hashBits);
+ assert(r.length==hlen);
+ int[] count1=new int[hlen];
+ int[] count2=new int[hlen];
+ final long mask=hlen-1;
+
+ for(int i=0; i<r.length; i++){
+ long x=0;
+ int y=0;
+ int z=0;
+ while(Long.bitCount(x&0xFFFFFFFFL)!=16){
+ x=randy.nextLong();
+ while(Long.bitCount(x&0xFFFFFFFFL)<16){
+ x|=(1L<<randy.nextInt(32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFFL)>16){
+ x&=(~(1L<<randy.nextInt(32)));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)<16){
+ x|=(1L<<(randy.nextInt(32)+32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)>16){
+ x&=(~(1L<<(randy.nextInt(32)+32)));
+ }
+
+// System.out.print(".");
+// y=(((int)(x&mask))^i);
+ y=(((int)(x&mask)));
+ z=(int)((x>>hashBits)&mask);
+ if(count1[y]>0 || count2[z]>0){
+ x=0;
+ }
+ }
+// System.out.println(Long.toBinaryString(x));
+ r[i]=(x&Long.MAX_VALUE);
+ count1[y]++;
+ count2[z]++;
+ }
+
+ }
+
+
+ /** Creates and starts one WriteThread per array; each thread allocates its own row of matrix in run(). */
+ public void initialize(){
+ for(int i=0; i<writers.length; i++){
+ writers[i]=new WriteThread(i);
+ writers[i].start();
+ //Do not spin on isAlive() here: once start() has returned it is only false
+ //for a thread that has already died (e.g. a failed allocation in run()),
+ //and in that case a wait loop on !isAlive() would never end.
+ }
+ }
+
+ public void shutdown(){
+ if(finished){return;}
+ synchronized(this){
+ if(finished){return;}
+
+ //Clear buffer
+ if(bufferlen<buffer.length){
+ buffer=Arrays.copyOf(buffer, bufferlen);
+ }
+
+ if(buffer.length>0){
+ for(int i=0; i<writers.length; i++)
+ writers[i].add(buffer);
+ }
+ buffer=null;
+ bufferlen=0;
+
+
+ //Add poison
+ for(WriteThread wt : writers){
+ wt.add(poison);
+ }
+
+ //Wait for termination
+ for(WriteThread wt : writers){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ while(wt.isAlive()){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ try {
+ wt.join(10000);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");}
+ }
+ cellsUsed+=wt.cellsUsedPersonal;
+// System.out.println("cellsUsed="+cellsUsed);
+ }
+
+ assert(!finished);
+ finished=true;
+ }
+ }
+
+ private class WriteThread extends Thread{
+
+ public WriteThread(int tnum){
+ num=tnum;
+ }
+
+ @Override
+ public void run(){
+ assert(matrix[num]==null);
+ array=new int[wordsPerArray]; //Makes NUMA systems use local memory.
+// assert(false);
+ matrix[num]=array;
+
+// assert(num==1);
+
+ long[] keys=null;
+ while(!shutdown){
+// assert(false);
+
+ if(verbose){System.err.println(" - Reading keys for wt"+num+".");}
+ while(keys==null){
+// System.out.println("Searching for keys.");
+ try {
+ keys=writeQueue.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+// System.out.println("*******************************************Found keys: "+keys.length);
+// assert(false);
+ }
+ if(keys==poison){
+// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~0 ";
+ shutdown=true;
+ }else{
+// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~1 ";
+ for(long rawKey : keys){
+// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~2 ";
+ if(verbose){System.err.println("Writer "+num+" considering raw key "+rawKey);}
+ long key2=rawKey;
+// int y=read(rawKey);
+ if((key2&arrayMask)==num){
+ int x=incrementHashedLocal(key2);
+ assert(x>=0) : "i="+0+", original=?, new should be >=0, new="+readHashed(key2)+", max="+maxValue+", key="+rawKey;
+ if(verbose){System.err.println("postIncr value="+readHashed(key2));}
+
+// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~4 ";
+ }
+
+ for(int i=1; i<hashes; i++){
+ key2=Long.rotateRight(key2, hashBits);
+ key2=hash(key2, i);
+// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~3 ";
+ if(verbose){System.err.println("rawKey="+rawKey+", i="+i+", key2="+key2+", value="+readHashed(key2));}
+ if((key2&arrayMask)==num){
+ int x=incrementHashedLocal(key2);
+ assert(x>=0) : "i="+i+", original=?, new should be >=0, new="+readHashed(key2)+", max="+maxValue+", key="+rawKey;
+ if(verbose){System.err.println("postIncr value="+readHashed(key2));}
+
+// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~4 ";
+ }
+
+// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~5 ";
+ }
+// int z=read(rawKey);
+// assert(hashes!=1 || !b || z==maxValue || z==y+1) : "b="+b+", y="+y+", z="+z+", rawKey="+rawKey+", num="+num;
+ }
+ }
+// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length);
+ if(verbose){System.err.println(" -- Read keys for wt"+num+". (success)");}
+ keys=null;
+ if(verbose){System.err.println("shutdown="+shutdown);}
+ }
+
+// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+".");
+// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+".";
+
+ array=null;
+ }
+
+ private void add(long[] keys){
+// assert(isAlive());
+
+// assert(!shutdown);
+// if(shutdown){return;}
+
+ if(verbose){System.err.println(" + Adding keys to wt"+num+".");}
+ boolean success=false;
+ while(!success){
+ try {
+ writeQueue.put(keys);
+ success=true;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");}
+ }
+
+ private int incrementHashedLocal(final long key_){
+ if(verbose){System.err.println("\n*** wt"+num+" incrementing hashed key "+key_+" ***");}
+ assert((key_&arrayMask)==num);
+ long key=(key_>>>arrayBits)%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int word=array[index];
+ int cellShift=(int)(cellBits*key);
+ int value=((word>>>cellShift)&valueMask);
+ if(value==0){cellsUsedPersonal++;}
+ value=min(value+1, maxValue);
+ word=(value<<cellShift)|(word&~((valueMask)<<cellShift));
+ array[index]=word;
+ if(verbose){System.err.println("\n*** wt"+num+" Incremented hashed key "+key_+". Value = "+readHashed(key_)+" ***");}
+ return value;
+ }
+
+ private int[] array;
+ private final int num;
+ public long cellsUsedPersonal=0;
+
+ public ArrayBlockingQueue<long[]> writeQueue=new ArrayBlockingQueue<long[]>(8);
+ public boolean shutdown=false;
+
+ }
+
+
+ public long cellsUsed(){return cellsUsed;}
+
+ private boolean finished=false;
+
+ private long cellsUsed;
+ private final int[][] matrix;
+ private final WriteThread[] writers=new WriteThread[numArrays];
+ private final int hashes;
+ private final int wordsPerArray;
+ private final long cellsPerArray;
+ private final long cellMod;
+ private final long[][] hashMasks=makeMasks(8, hashArrayLength);
+
+ private long[] buffer=new long[2000];
+ private int bufferlen=0;
+
+ private static final int hashBits=6;
+ private static final int hashArrayLength=1<<hashBits;
+ private static final int hashCellMask=hashArrayLength-1;
+ private static final long[] poison=new long[0];
+
+ private static long counter=0;
+
+}
diff --git a/current/bloom/KCountArray6MT.java b/current/bloom/KCountArray6MT.java
new file mode 100755
index 0000000..dd2c83d
--- /dev/null
+++ b/current/bloom/KCountArray6MT.java
@@ -0,0 +1,505 @@
+package bloom;
+
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import dna.Timer;
+
+
+/**
+ *
+ * Uses hashing rather than direct-mapping to support longer kmers.
+ *
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public class KCountArray6MT extends KCountArray {
+
+ /**
+ * Smoke test: increments a few keys and prints the resulting counts.
+ * Arguments: cells, bits per cell, gap, number of hashes.
+ *
+ * read() asserts finished and dereferences matrix rows that only exist once
+ * the writer threads have run, so the writers are started first, every
+ * increment is queued, and the array is shut down before anything is read.
+ * Counts are capped at maxValue and can exceed the number of increments
+ * when different keys collide.
+ */
+ public static void main(String[] args){
+ long cells=Long.parseLong(args[0]);
+ int bits=Integer.parseInt(args[1]);
+ int gap=Integer.parseInt(args[2]);
+ int hashes=Integer.parseInt(args[3]);
+
+ verbose=false;
+
+ KCountArray6MT kca=new KCountArray6MT(cells, bits, gap, hashes);
+ kca.initialize();
+
+ //Keys 0, 1, 100 and 150, incremented 2, 2, 3 and 1 times.
+ final long[] keys={0, 1, 100, 150};
+ final int[] increments={2, 2, 3, 1};
+ for(int i=0; i<keys.length; i++){
+ for(int j=0; j<increments[i]; j++){
+ kca.increment(keys[i]);
+ }
+ }
+
+ //Flushes the buffers and joins the writers; read() is only valid afterwards.
+ kca.shutdown();
+
+ for(int i=0; i<keys.length; i++){
+ System.out.println("key="+keys[i]+", increments="+increments[i]+", count="+kca.read(keys[i]));
+ }
+ System.out.println();
+
+
+ }
+
+ public KCountArray6MT(long cells_, int bits_, int gap_, int hashes_){ //Computes per-array sizes only; the int arrays themselves are allocated later by the writer threads.
+ super(cells_, bits_, gap_);
+// verbose=false;
+ long words=cells/cellsPerWord;
+ assert(words/numArrays<=Integer.MAX_VALUE);
+ wordsPerArray=(int)(words/numArrays);
+ cellsPerArray=cells/numArrays;
+ cellMod=cellsPerArray-1; //Modulus applied to hashed keys in readHashed and incrementHashedLocal.
+ hashes=hashes_;
+// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes);
+// assert(false);
+ matrix=new int[numArrays][]; //Rows stay null until WriteThread.run() fills them in.
+ assert(hashes>0 && hashes<=hashMasks.length); //hash(key, row) indexes hashMasks[row], so each hash needs its own mask row.
+ }
+
+ /** Returns the smallest count stored for rawKey across its hash rows; asserts that shutdown() has completed. */
+ public int read(final long rawKey){
+ assert(finished);
+ if(verbose){System.err.println("Reading raw key "+rawKey);}
+ //Row 3 selects the array; rows 0..hashes-1 select cells inside it.
+ final int arrayNum=(int)(hash(rawKey, 3)&arrayMask);
+ long hashed=hash(rawKey, 0);
+ int lowest=readHashed(hashed, arrayNum);
+ int row=1;
+ while(row<hashes && lowest>0){ //A zero cannot get any smaller, so stop early.
+ if(verbose){System.err.println("Reading. i="+row+", key2="+hashed);}
+ hashed=hash(Long.rotateRight(hashed, hashBits), row);
+ if(verbose){System.err.println("Rot/hash. i="+row+", key2="+hashed);}
+ lowest=min(lowest, readHashed(hashed, arrayNum));
+ row++;
+ }
+ return lowest;
+ }
+
+ private int readHashed(long key, int arrayNum){ //Reads the cell that an already-hashed key maps to inside matrix[arrayNum].
+ if(verbose){System.err.print("Reading hashed key "+key);}
+// System.out.println("key="+key);
+// int arrayNum=(int)(key&arrayMask);
+ key=(key&Long.MAX_VALUE)%(cellMod); //Cell number within the array; the sign bit is cleared so the remainder is non-negative.
+// key=(key>>>(arrayBits+1))%(cellMod);
+// System.out.println("array="+arrayNum);
+// System.out.println("key2="+key);
+ int[] array=matrix[arrayNum]; //Null until the matching WriteThread has started running.
+ int index=(int)(key>>>indexShift); //Index of the word containing the cell (presumably indexShift is log2 of cellsPerWord - confirm in KCountArray).
+// assert(false) : indexShift;
+// System.out.println("index="+index);
+ int word=array[index];
+// System.out.println("word="+Integer.toHexString(word));
+ assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask)));
+// int cellShift=(int)(cellBits*(key&cellMask));
+ int cellShift=(int)(cellBits*key); //Left unreduced: a shift of an int only uses the low five bits of the distance.
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));}
+// System.out.println("cellShift="+cellShift);
+ return (int)((word>>>cellShift)&valueMask);
+ }
+
+ public void write(final long key, int value){ //Direct writes are not supported by this class; always throws.
+ throw new RuntimeException("Not allowed for this class.");
+ }
+
+ /** Queues rawKey for the writer that owns its array; the counters change later, on that writer's thread. Not synchronized. */
+ public void increment(final long rawKey){
+ if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+
+ long key1=hash(rawKey, 3); //Row 3 picks the array, exactly as in read().
+
+ //read() must not be called here: it asserts finished and dereferences matrix
+ //rows that may not exist yet, so it cannot be used before shutdown().
+ if(verbose){System.err.println("key1="+key1);}
+
+ int bnum=(int)(key1&arrayMask);
+ long[] array=buffers[bnum];
+ int loc=bufferlen[bnum];
+
+ array[loc]=rawKey; //The raw key is queued; the writer hashes it in incrementRawLocal.
+ bufferlen[bnum]++;
+ if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);}
+ if(bufferlen[bnum]>=array.length){
+
+ if(verbose){System.err.println("Moving array.");}
+ bufferlen[bnum]=0;
+ buffers[bnum]=new long[array.length];
+
+ writers[bnum].add(array); //Blocks while the writer's queue is full.
+ if(verbose){System.err.println("Moved.");}
+ }
+ }
+
+ public int incrementAndReturn(long key, int incr){ //Not supported by this class; always throws.
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ /** Not supported by this class: always throws RuntimeException instead of returning the unincremented value. */
+ public int incrementAndReturnUnincremented(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ public long[] transformToFrequency(){ //Delegates to the transformToFrequency(int[][]) overload, passing this object's matrix.
+ return transformToFrequency(matrix);
+ }
+
+ /** Returns every cell value, in matrix order, as "[v0, v1, ...]"; arrays that no writer has allocated yet are skipped. */
+ public String toContentsString(){
+ StringBuilder sb=new StringBuilder();
+ sb.append("[");
+ String comma="";
+ for(int[] array : matrix){
+ if(array==null){continue;} //Rows are null until their WriteThread runs; cellsUsed(int) guards the same way.
+ for(int word : array){
+ for(int j=0; j<cellsPerWord; j++){
+ int x=word&valueMask;
+ sb.append(comma).append(x);
+ word>>>=cellBits; //Bring the next cell into the low bits.
+ comma=", ";
+ }
+ }
+ }
+ sb.append("]");
+ return sb.toString();
+ }
+
+ public double usedFraction(){return cellsUsed/(double)cells;} //Uses the tally gathered from the writers in shutdown(), so it is 0 before shutdown() has run.
+
+ public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} //Fraction of all cells whose value is at least mindepth; rescans the arrays on every call.
+
+ public long cellsUsed(int mindepth){ //Counts cells whose value is at least mindepth; meaningful for mindepth>=1, since all-zero words are never visited.
+ long count=0;
+ for(int[] array : matrix){
+ if(array!=null){
+ for(int word : array){
+ for(int j=0; j<cellsPerWord && word!=0; j++){ //word!=0, not word>0: a word whose top cell has its high bit set is negative and must still be scanned.
+ int x=word&valueMask;
+ if(x>=mindepth){count++;}
+ word>>>=cellBits;
+ }
+ }
+ }
+ }
+ return count;
+ }
+
+
+ final long hash(long key, int row){ //Scrambles key by XORing it with one mask from hashMasks[row]; the mask is selected by the key's own bits.
+ int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); //Sign bit is dropped so the remainder is non-negative; range is 0..hashArrayLength-2.
+// int cell=(int)(hashCellMask&(key));
+
+ if(row==0){//Doublehash only first time
+ key=key^hashMasks[(row+4)%hashMasks.length][cell]; //Pre-mix with a mask taken from a different row.
+ cell=(int)(hashCellMask&(key>>4)); //Re-pick the cell from hashBits bits of the pre-mixed key, starting at bit 4.
+// cell=(int)(hashCellMask&(key>>hashBits));
+// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+ }
+
+ return key^hashMasks[row][cell];
+ }
+
+ /** Builds a table of XOR masks. The Random seed is the current value of the static counter, which is then incremented.
+ * @param rows Number of mask rows to create.
+ * @param cols Number of masks per row (fillMasks asserts that this equals 1<<hashBits).
+ * @return A rows-by-cols array of masks.
+ */
+ private static long[][] makeMasks(int rows, int cols) {
+
+ long seed;
+ synchronized(KCountArray6MT.class){ //Class-level lock so two concurrent callers never read the same counter value.
+ seed=counter;
+ counter++;
+ }
+
+ Timer t=new Timer();
+ long[][] r=new long[rows][cols];
+ Random randy=new Random(seed);
+ for(int i=0; i<r.length; i++){
+ fillMasks(r[i], randy);
+ }
+ t.stop();
+ if(t.elapsed>200000000L){System.out.println("Mask-creation time: "+t);} //Only reported when slow; threshold is presumably nanoseconds (0.2 s) - confirm in dna.Timer.
+ return r;
+ }
+
+
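+ /**
+ * Fills r with random masks: 16 bits are set in each 32-bit half (before the sign bit is cleared), and bits 0-5 and bits 6-11 are each unique within r.
+ * @param r Destination row; its length must equal 1<<hashBits.
+ * @param randy Source of randomness.
+ */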
+ private static void fillMasks(long[] r, Random randy) {
+// for(int i=0; i<r.length; i++){
+// long x=0;
+// while(Long.bitCount(x&0xFFFFFFFF)!=16){
+// x=randy.nextLong();
+// }
+// r[i]=(x&Long.MAX_VALUE);
+// }
+
+ final int hlen=(1<<hashBits);
+ assert(r.length==hlen);
+ int[] count1=new int[hlen];
+ int[] count2=new int[hlen];
+ final long mask=hlen-1;
+
+ for(int i=0; i<r.length; i++){
+ long x=0;
+ int y=0;
+ int z=0;
+ while(Long.bitCount(x&0xFFFFFFFFL)!=16){
+ x=randy.nextLong();
+ while(Long.bitCount(x&0xFFFFFFFFL)<16){
+ x|=(1L<<randy.nextInt(32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFFL)>16){
+ x&=(~(1L<<randy.nextInt(32)));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)<16){
+ x|=(1L<<(randy.nextInt(32)+32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)>16){
+ x&=(~(1L<<(randy.nextInt(32)+32)));
+ }
+
+// System.out.print(".");
+// y=(((int)(x&mask))^i);
+ y=(((int)(x&mask)));
+ z=(int)((x>>hashBits)&mask);
+ if(count1[y]>0 || count2[z]>0){
+ x=0;
+ }
+ }
+// System.out.println(Long.toBinaryString(x));
+ r[i]=(x&Long.MAX_VALUE);
+ count1[y]++;
+ count2[z]++;
+ }
+
+ }
+
+
+ public void initialize(){ //Creates and starts one WriteThread per array; each thread allocates its own row of matrix in run().
+ for(int i=0; i<writers.length; i++){
+ writers[i]=new WriteThread(i);
+ writers[i].start();
+
+//Disabled wait loop, kept for reference. After start() returns, isAlive() is only false for a thread that has already ended.
+// while(!writers[i].isAlive()){
+// System.out.print("."); }
+ }
+ }
+
+ public void shutdown(){ //Flushes buffered keys, stops and joins every writer, collects their cell tallies, then sets finished.
+ if(finished){return;}
+ synchronized(this){
+ if(finished){return;} //Re-checked under the lock so a second caller does nothing.
+
+ //Clear buffers
+ for(int i=0; i<numArrays; i++){
+ long[] array=buffers[i];
+ int len=bufferlen[i];
+ buffers[i]=null; //increment() cannot be used on this object after this point.
+ bufferlen[i]=0;
+
+ if(len<array.length){
+ array=Arrays.copyOf(array, len); //Trim to the filled part so the writer does not process unused slots.
+ }
+
+ if(array.length>0){
+ writers[i].add(array);
+ }
+ }
+
+ //Add poison
+ for(WriteThread wt : writers){
+ wt.add(poison); //Queued after the data, so each writer drains its keys before stopping.
+ }
+
+ //Wait for termination
+ for(WriteThread wt : writers){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ while(wt.isAlive()){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ try {
+ wt.join(10000);
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the enclosing while loop keeps waiting until the thread has ended.
+ e.printStackTrace();
+ }
+ if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");}
+ }
+ cellsUsed+=wt.cellsUsedPersonal; //Read only after the writer has ended.
+// System.out.println("cellsUsed="+cellsUsed);
+ }
+
+ assert(!finished);
+ finished=true;
+ }
+ }
+
+ private class WriteThread extends Thread{
+
+ public WriteThread(int tnum){
+ num=tnum;
+ }
+
+ @Override
+ public void run(){ //Writer loop: allocates this thread's array, then applies queued key batches until the poison batch arrives.
+ assert(matrix[num]==null);
+ array=new int[wordsPerArray]; //Makes NUMA systems use local memory.
+
+ matrix[num]=array; //Published so readHashed can reach it through matrix.
+
+ long[] keys=null;
+ while(!shutdown){
+
+ if(verbose){System.err.println(" - Reading keys for wt"+num+".");}
+ while(keys==null){
+ try {
+ keys=writeQueue.take();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; keys is still null, so take() is retried.
+ e.printStackTrace();
+ }
+ }
+ if(keys==poison){ //Identity comparison against the shared sentinel array.
+// assert(false);
+ shutdown=true;
+ }else{
+ for(long key : keys){
+ incrementRawLocal(key);
+ }
+ }
+// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length);
+ if(verbose){System.err.println(" -- Read keys for wt"+num+". (success)");}
+ keys=null;
+ if(verbose){System.err.println("shutdown="+shutdown);}
+ }
+
+//Only the thread's own reference is dropped below;
+//matrix[num] still points at the array, so reads keep working after the thread ends.
+
+ array=null;
+ }
+
+ private void add(long[] keys){ //Called by producer threads to hand a batch (or the poison array) to this writer; blocks while the queue is full.
+// assert(isAlive());
+ assert(!shutdown);
+ if(shutdown){return;} //Batches offered after the writer has seen poison are dropped when assertions are off.
+// assert(keys!=poison);
+ if(verbose){System.err.println(" + Adding keys to wt"+num+".");}
+ boolean success=false;
+ while(!success){
+ try {
+ writeQueue.put(keys);
+ success=true;
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the put is retried until it succeeds.
+ e.printStackTrace();
+ }
+ }
+ if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");}
+ }
+
+ /**
+ * Applies one increment of rawKey to this writer's array: hashes the key once per
+ * hash row, in the same order as read(), and bumps each addressed cell.
+ * Called from run(), i.e. on the writer thread.
+ * @param rawKey Unhashed key taken from the write queue.
+ * @return The smallest of the touched cells' values after the increment
+ * (at least 1, at most maxValue).
+ */
+ private int incrementRawLocal(long rawKey){
+ if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+
+ long key2=rawKey;
+ if(hashes==1){
+ key2=hash(key2, 0);
+ int x=incrementHashedLocal(key2);
+ //The message reports x rather than read(rawKey): read() asserts finished, which is false while writers run.
+ assert(x>=1) : "new should be >=1, new="+x+", max="+maxValue+", key="+rawKey;
+ return x;
+ }
+
+ //Running minimum over the rows, so the return value reflects what is actually stored.
+ int min=Integer.MAX_VALUE;
+
+ for(int i=0; i<hashes; i++){
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2, num));}
+ int x=incrementHashedLocal(key2);
+ assert(x>=1) : "i="+i+", new should be >=1, new="+x+", max="+maxValue+", key="+rawKey;
+ if(verbose){System.err.println("postIncr value="+readHashed(key2, num));}
+ min=min(min, x);
+ key2=Long.rotateRight(key2, hashBits);
+ }
+
+ return min;
+ }
+
+ private int incrementHashedLocal(long key){ //Adds one, saturating at maxValue, to the cell an already-hashed key maps to; returns the new value.
+// assert((key&arrayMask)==num);
+ key=(key&Long.MAX_VALUE)%(cellMod); //Same cell mapping as readHashed.
+// key=(key>>>(arrayBits+1))%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int word=array[index];
+ int cellShift=(int)(cellBits*key); //An int shift only uses the low five bits of the distance.
+ int value=((word>>>cellShift)&valueMask);
+ if(value==0){cellsUsedPersonal++;} //First touch of this cell; summed into cellsUsed by shutdown().
+ value=min(value+1, maxValue);
+ word=(value<<cellShift)|(word&~((valueMask)<<cellShift)); //Clear the old cell bits, then insert the new value.
+ array[index]=word;
+ return value;
+ }
+
+ private int[] array;
+ private final int num;
+ public long cellsUsedPersonal=0;
+
+ public ArrayBlockingQueue<long[]> writeQueue=new ArrayBlockingQueue<long[]>(16);
+ public boolean shutdown=false;
+
+ }
+
+
+ public long cellsUsed(){return cellsUsed;}
+
+ private boolean finished=false;
+
+ private long cellsUsed;
+ private final int[][] matrix;
+ private final WriteThread[] writers=new WriteThread[numArrays];
+ private final int hashes;
+ private final int wordsPerArray;
+ private final long cellsPerArray;
+ private final long cellMod;
+ private final long[][] hashMasks=makeMasks(8, hashArrayLength);
+
+ private final long[][] buffers=new long[numArrays][1000];
+ private final int[] bufferlen=new int[numArrays];
+
+ private static final int hashBits=6;
+ private static final int hashArrayLength=1<<hashBits;
+ private static final int hashCellMask=hashArrayLength-1;
+ private static final long[] poison=new long[0];
+
+ private static long counter=0;
+
+}
diff --git a/current/bloom/KCountArray7MT.java b/current/bloom/KCountArray7MT.java
new file mode 100755
index 0000000..e15105b
--- /dev/null
+++ b/current/bloom/KCountArray7MT.java
@@ -0,0 +1,559 @@
+package bloom;
+
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import kmer.Primes;
+
+
+import align2.Tools;
+
+import dna.Timer;
+
+
+/**
+ *
+ * Uses prime numbers for array lengths.
+ *
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public class KCountArray7MT extends KCountArray {
+
+ /**
+ * Smoke test: increments a few keys and prints the resulting counts.
+ * Arguments: cells, bits per cell, gap, number of hashes.
+ *
+ * read() asserts finished and dereferences matrix rows that only exist once
+ * the writer threads have run, so the writers are started first, every
+ * increment is queued, and the array is shut down before anything is read.
+ * Counts are capped at maxValue and can exceed the number of increments
+ * when different keys collide.
+ */
+ public static void main(String[] args){
+ long cells=Long.parseLong(args[0]);
+ int bits=Integer.parseInt(args[1]);
+ int gap=Integer.parseInt(args[2]);
+ int hashes=Integer.parseInt(args[3]);
+
+ verbose=false;
+
+ KCountArray7MT kca=new KCountArray7MT(cells, bits, gap, hashes);
+ kca.initialize();
+
+ //Keys 0, 1, 100 and 150, incremented 2, 2, 3 and 1 times.
+ final long[] keys={0, 1, 100, 150};
+ final int[] increments={2, 2, 3, 1};
+ for(int i=0; i<keys.length; i++){
+ for(int j=0; j<increments[i]; j++){
+ kca.increment(keys[i]);
+ }
+ }
+
+ //Flushes the buffers and joins the writers; read() is only valid afterwards.
+ kca.shutdown();
+
+ for(int i=0; i<keys.length; i++){
+ System.out.println("key="+keys[i]+", increments="+increments[i]+", count="+kca.read(keys[i]));
+ }
+ System.out.println();
+
+
+ }
+
+ public KCountArray7MT(long cells_, int bits_, int gap_, int hashes_){ //Computes per-array sizes only; the int arrays themselves are allocated later by the writer threads.
+ super(getPrimeCells(cells_, bits_), bits_, gap_, getDesiredArrays(cells_, bits_));
+// verbose=false;
+// assert(false);
+// System.out.println(cells);
+ cellsPerArray=cells/numArrays;
+ wordsPerArray=(int)((cellsPerArray%cellsPerWord)==0 ? (cellsPerArray/cellsPerWord) : (cellsPerArray/cellsPerWord+1)); //Rounded up so a partly filled last word still fits.
+ cellMod=cellsPerArray; //Modulus applied to hashed keys in readHashed and incrementHashedLocal.
+ hashes=hashes_;
+// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes);
+// assert(false);
+ matrix=new int[numArrays][]; //Rows stay null until WriteThread.run() fills them in.
+ assert(hashes>0 && hashes<=hashMasks.length); //hash(key, row) indexes hashMasks[row], so each hash needs its own mask row.
+ }
+
+ private static int getDesiredArrays(long desiredCells, int bits){
+
+ long words=Tools.max((desiredCells*bits+31)/32, minArrays);
+ int arrays=minArrays;
+ while(words/arrays>=Integer.MAX_VALUE){
+ arrays*=2;
+ }
+// assert(false) : arrays;
+ return arrays;
+ }
+
+ private static long getPrimeCells(long desiredCells, int bits){
+
+ int arrays=getDesiredArrays(desiredCells, bits);
+
+ long x=(desiredCells+arrays-1)/arrays;
+ long x2=Primes.primeAtMost(x);
+ return x2*arrays;
+ }
+
+ /** Returns the smallest count stored for rawKey across its hash rows; asserts that shutdown() has completed. */
+ public int read(final long rawKey){
+ assert(finished);
+ if(verbose){System.err.println("Reading raw key "+rawKey);}
+ long hashed=hash(rawKey, 0);
+ int lowest=readHashed(hashed);
+ for(int row=1; row<hashes && lowest>0; row++){ //Stops early at zero, which cannot get any smaller.
+ if(verbose){System.err.println("Reading. i="+row+", key2="+hashed);}
+ hashed=hash(Long.rotateRight(hashed, hashBits), row);
+ if(verbose){System.err.println("Rot/hash. i="+row+", key2="+hashed);}
+ lowest=min(lowest, readHashed(hashed));
+ }
+ return lowest;
+ }
+
+ private int readHashed(long key){ //Reads the cell that an already-hashed key maps to; the key's low arrayMask bits choose the array.
+ if(verbose){System.err.print("Reading hashed key "+key);}
+// System.out.println("key="+key);
+ int arrayNum=(int)(key&arrayMask);
+ key=(key>>>arrayBits)%(cellMod); //Remaining bits choose the cell. NOTE(review): if arrayBits can be 0, a negative key would give a negative remainder - confirm numArrays>1.
+// key=(key>>>(arrayBits+1))%(cellMod);
+// System.out.println("array="+arrayNum);
+// System.out.println("key2="+key);
+ int[] array=matrix[arrayNum]; //Null until the matching WriteThread has started running.
+ int index=(int)(key>>>indexShift); //Index of the word containing the cell (presumably indexShift is log2 of cellsPerWord - confirm in KCountArray).
+// assert(false) : indexShift;
+// System.out.println("index="+index);
+ int word=array[index];
+// System.out.println("word="+Integer.toHexString(word));
+ assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask)));
+// int cellShift=(int)(cellBits*(key&cellMask));
+ int cellShift=(int)(cellBits*key); //Left unreduced: a shift of an int only uses the low five bits of the distance.
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));}
+// System.out.println("cellShift="+cellShift);
+ return (int)((word>>>cellShift)&valueMask);
+ }
+
+ public void write(final long key, int value){ //Direct writes are not supported by this class; always throws.
+ throw new RuntimeException("Not allowed for this class.");
+ }
+
+ @Override
+ /** Queues an increment for every key. The row-0 hash is done outside the critical section (the author notes this did not seem to help speed). WARNING: keys is overwritten in place with the hashed values. */
+ public void increment(long[] keys){
+ for(int i=0; i<keys.length; i++){
+ keys[i]=hash(keys[i], 0); //Caller's array is modified; the raw keys are lost.
+ }
+ synchronized(buffers){ //Serializes buffer updates among callers of this overload; increment(long) does not take this lock.
+ for(long key : keys){
+ incrementPartiallyHashed(key);
+ }
+ }
+ }
+
+ public void increment(final long rawKey){ //Hashes rawKey once per row and queues each hashed key for the writer owning its array. Takes no lock.
+ if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+
+ long key2=rawKey;
+ for(int i=0; i<hashes; i++){
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+// assert(readHashed(key2)==0);
+
+ int bnum=(int)(key2&arrayMask); //Low bits of the hashed key pick the array, and therefore the writer.
+ long[] array=buffers[bnum];
+ int loc=bufferlen[bnum];
+ array[loc]=key2;
+ bufferlen[bnum]++;
+ if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);}
+ if(bufferlen[bnum]>=array.length){
+
+ if(verbose){System.err.println("Moving array.");}
+ bufferlen[bnum]=0;
+ buffers[bnum]=new long[array.length]; //The full buffer now belongs to the writer, so a new one is started.
+
+ writers[bnum].add(array); //Blocks while the writer's queue is full.
+ if(verbose){System.err.println("Moved.");}
+ }
+//The rotate below, followed by hash(key2, i) at the top of the next pass,
+//is the same key sequence that read() walks.
+ key2=Long.rotateRight(key2, hashBits);
+ }
+ }
+
+ /**
+ * Queues the increments for one key that has already been through hash(key, 0).
+ * Keys for rows 1..hashes-1 are derived the same way read() derives them:
+ * rotate right by hashBits, then hash with the row number.
+ * Does no locking itself; increment(long[]) calls it while synchronized on buffers.
+ * @param pKey Key after the row-0 hash.
+ */
+ private void incrementPartiallyHashed(final long pKey){
+ if(verbose){System.err.println("\n*** Incrementing key "+pKey+" ***");}
+
+ long key2=pKey;
+ bufferHashedKey(key2); //Row 0 was hashed by the caller.
+
+ for(int i=1; i<hashes; i++){
+ key2=Long.rotateRight(key2, hashBits);
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+ bufferHashedKey(key2);
+ }
+ }
+
+ /**
+ * Appends a fully hashed key to the buffer of the array chosen by its low arrayMask bits.
+ * A buffer that becomes full is handed to the matching WriteThread and replaced
+ * with a new one of the same length.
+ * Factored out so the row-0 path and the loop over later rows in
+ * incrementPartiallyHashed share a single copy of this logic.
+ * @param key2 Fully hashed key.
+ */
+ private void bufferHashedKey(final long key2){
+ int bnum=(int)(key2&arrayMask);
+ long[] array=buffers[bnum];
+ int loc=bufferlen[bnum];
+ array[loc]=key2;
+ bufferlen[bnum]++;
+ if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);}
+ if(bufferlen[bnum]>=array.length){
+
+ if(verbose){System.err.println("Moving array.");}
+ bufferlen[bnum]=0;
+ buffers[bnum]=new long[array.length];
+
+ writers[bnum].add(array); //Blocks while the writer's queue is full.
+ if(verbose){System.err.println("Moved.");}
+ }
+ }
+
+ public int incrementAndReturn(long key, int incr){ //Not supported by this class; always throws.
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ /** Not supported by this class: always throws RuntimeException instead of returning the unincremented value. */
+ public int incrementAndReturnUnincremented(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ public long[] transformToFrequency(){ //Delegates to the transformToFrequency(int[][]) overload, passing this object's matrix.
+ return transformToFrequency(matrix);
+ }
+
+ /** Returns every cell value, in matrix order, as "[v0, v1, ...]"; arrays that no writer has allocated yet are skipped. */
+ public String toContentsString(){
+ StringBuilder sb=new StringBuilder();
+ sb.append("[");
+ String comma="";
+ for(int[] array : matrix){
+ if(array==null){continue;} //Rows are null until their WriteThread runs; cellsUsed(int) guards the same way.
+ for(int word : array){
+ for(int j=0; j<cellsPerWord; j++){
+ int x=word&valueMask;
+ sb.append(comma).append(x);
+ word>>>=cellBits; //Bring the next cell into the low bits.
+ comma=", ";
+ }
+ }
+ }
+ sb.append("]");
+ return sb.toString();
+ }
+
+ public double usedFraction(){return cellsUsed/(double)cells;} //Uses the tally gathered from the writers in shutdown(), so it is 0 before shutdown() has run.
+
+ public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} //Fraction of all cells whose value is at least mindepth; rescans the arrays on every call.
+
+ /**
+ * Counts cells whose value is at least mindepth, skipping arrays not yet allocated.
+ * Meaningful for mindepth>=1: all-zero words are never visited.
+ * @return Number of qualifying cells.
+ */
+ public long cellsUsed(int mindepth){
+ long count=0;
+ for(int[] array : matrix){
+ if(array!=null){
+ for(int word : array){
+ for(int j=0; j<cellsPerWord && word!=0; j++){ //word!=0, not word>0: a word whose top cell has its high bit set is negative and must still be scanned.
+ int x=word&valueMask;
+ if(x>=mindepth){count++;}
+ word>>>=cellBits;
+ }
+ }
+ }
+ }
+ return count;
+ }
+
+
+ final long hash(long key, int row){ //Scrambles key by XORing it with one mask from hashMasks[row]; the mask is selected by the key's own bits.
+ int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); //Sign bit is dropped so the remainder is non-negative; range is 0..hashArrayLength-2.
+// int cell=(int)(hashCellMask&(key));
+
+ if(row==0){//Doublehash only first time
+ key=key^hashMasks[(row+4)%hashMasks.length][cell]; //Pre-mix with a mask taken from a different row.
+ cell=(int)(hashCellMask&(key>>5)); //Re-pick the cell from hashBits bits of the pre-mixed key, starting at bit 5.
+// cell=(int)(hashCellMask&(key>>hashBits));
+// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+ }
+
+ return key^hashMasks[row][cell];
+ }
+
+ /** Builds a table of XOR masks. The Random seed is the current value of the static counter, which is then incremented.
+ * @param rows Number of mask rows to create.
+ * @param cols Number of masks per row (fillMasks asserts that this equals 1<<hashBits).
+ * @return A rows-by-cols array of masks.
+ */
+ private static long[][] makeMasks(int rows, int cols) {
+
+ long seed;
+ synchronized(KCountArray7MT.class){ //Class-level lock so two concurrent callers never read the same counter value.
+ seed=counter;
+ counter++;
+ }
+
+ Timer t=new Timer();
+ long[][] r=new long[rows][cols];
+ Random randy=new Random(seed);
+ for(int i=0; i<r.length; i++){
+ fillMasks(r[i], randy);
+ }
+ t.stop();
+ if(t.elapsed>200000000L){System.out.println("Mask-creation time: "+t);} //Only reported when slow; threshold is presumably nanoseconds (0.2 s) - confirm in dna.Timer.
+ return r;
+ }
+
+
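+ /**
+ * Fills r with random masks: 16 bits are set in each 32-bit half (before the sign bit is cleared), and bits 0-5 and bits 6-11 are each unique within r.
+ * @param r Destination row; its length must equal 1<<hashBits.
+ * @param randy Source of randomness.
+ */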
+ private static void fillMasks(long[] r, Random randy) {
+// for(int i=0; i<r.length; i++){
+// long x=0;
+// while(Long.bitCount(x&0xFFFFFFFF)!=16){
+// x=randy.nextLong();
+// }
+// r[i]=(x&Long.MAX_VALUE);
+// }
+
+ final int hlen=(1<<hashBits);
+ assert(r.length==hlen);
+ int[] count1=new int[hlen];
+ int[] count2=new int[hlen];
+ final long mask=hlen-1;
+
+ for(int i=0; i<r.length; i++){
+ long x=0;
+ int y=0;
+ int z=0;
+ while(Long.bitCount(x&0xFFFFFFFFL)!=16){
+ x=randy.nextLong();
+ while(Long.bitCount(x&0xFFFFFFFFL)<16){
+ x|=(1L<<randy.nextInt(32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFFL)>16){
+ x&=(~(1L<<randy.nextInt(32)));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)<16){
+ x|=(1L<<(randy.nextInt(32)+32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)>16){
+ x&=(~(1L<<(randy.nextInt(32)+32)));
+ }
+
+// System.out.print(".");
+// y=(((int)(x&mask))^i);
+ y=(((int)(x&mask)));
+ z=(int)((x>>hashBits)&mask);
+ if(count1[y]>0 || count2[z]>0){
+ x=0;
+ }
+ }
+// System.out.println(Long.toBinaryString(x));
+ r[i]=(x&Long.MAX_VALUE);
+ count1[y]++;
+ count2[z]++;
+ }
+
+ }
+
+
+ public void initialize(){ //Creates and starts one WriteThread per array; each thread allocates its own row of matrix in run().
+ for(int i=0; i<writers.length; i++){
+ writers[i]=new WriteThread(i);
+ writers[i].start();
+
+//Disabled wait loop, kept for reference. After start() returns, isAlive() is only false for a thread that has already ended.
+// while(!writers[i].isAlive()){
+// System.out.print("."); }
+ }
+// assert(false) : writers.length;
+ }
+
+ public void shutdown(){ //Flushes buffered keys, stops and joins every writer, collects their cell tallies, then sets finished.
+ if(finished){return;}
+ synchronized(this){
+ if(finished){return;} //Re-checked under the lock so a second caller does nothing.
+
+ //Clear buffers
+ for(int i=0; i<numArrays; i++){
+ long[] array=buffers[i];
+ int len=bufferlen[i];
+ buffers[i]=null; //increment() cannot be used on this object after this point.
+ bufferlen[i]=0;
+
+ if(len<array.length){
+ array=Arrays.copyOf(array, len); //Trim to the filled part so the writer does not process unused slots.
+ }
+
+ if(array.length>0){
+ writers[i].add(array);
+ }
+ }
+
+ //Add poison
+ for(WriteThread wt : writers){
+ wt.add(poison); //Queued after the data, so each writer drains its keys before stopping.
+ }
+
+ //Wait for termination
+ for(WriteThread wt : writers){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ while(wt.isAlive()){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ try {
+ wt.join(10000);
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the enclosing while loop keeps waiting until the thread has ended.
+ e.printStackTrace();
+ }
+ if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");}
+ }
+ cellsUsed+=wt.cellsUsedPersonal; //Read only after the writer has ended.
+// System.out.println("cellsUsed="+cellsUsed);
+// System.err.println("wt.cellsUsedPersonal="+wt.cellsUsedPersonal);
+ }
+
+ assert(!finished);
+ finished=true;
+ }
+ }
+
+ private class WriteThread extends Thread{
+
+ public WriteThread(int tnum){
+ num=tnum;
+ }
+
+ @Override
+ public void run(){ //Writer loop: allocates this thread's array, then applies queued key batches until the poison batch arrives.
+ assert(matrix[num]==null);
+ array=new int[wordsPerArray]; //Makes NUMA systems use local memory.
+
+ matrix[num]=array; //Published so readHashed can reach it through matrix.
+
+ long[] keys=null;
+ while(!shutdown){
+
+ if(verbose){System.err.println(" - Reading keys for wt"+num+".");}
+ while(keys==null){
+ try {
+ keys=writeQueue.take();
+ } catch (InterruptedException e) {
+ //Interruption is only logged; keys is still null, so take() is retried.
+ e.printStackTrace();
+ }
+ }
+ if(keys==poison){ //Identity comparison against the shared sentinel array.
+// assert(false);
+ shutdown=true;
+ }else{
+ for(long key : keys){
+ incrementHashedLocal(key); //Keys arrive fully hashed; see increment().
+ }
+ }
+// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length);
+ if(verbose){System.err.println(" -- Read keys for wt"+num+". (success)");}
+ keys=null;
+ if(verbose){System.err.println("shutdown="+shutdown);}
+ }
+
+//Only the thread's own reference is dropped below;
+//matrix[num] still points at the array, so reads keep working after the thread ends.
+
+ array=null;
+ }
+
+ private void add(long[] keys){ //Called by producer threads to hand a batch (or the poison array) to this writer; blocks while the queue is full.
+// assert(isAlive());
+ assert(!shutdown);
+ if(shutdown){return;} //Batches offered after the writer has seen poison are dropped when assertions are off.
+// assert(keys!=poison);
+ if(verbose){System.err.println(" + Adding keys to wt"+num+".");}
+ boolean success=false;
+ while(!success){
+ try {
+ writeQueue.put(keys);
+ success=true;
+ } catch (InterruptedException e) {
+ //Interruption is only logged; the put is retried until it succeeds.
+ e.printStackTrace();
+ }
+ }
+ if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");}
+ }
+
+ private int incrementHashedLocal(long key){ //Adds one, saturating at maxValue, to the cell an already-hashed key maps to; returns the new value.
+ assert((key&arrayMask)==num); //The key must have been routed to this writer's array.
+ key=(key>>>arrayBits)%(cellMod); //Same cell mapping as readHashed.
+// key=(key>>>(arrayBits+1))%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int word=array[index];
+ int cellShift=(int)(cellBits*key); //An int shift only uses the low five bits of the distance.
+ int value=((word>>>cellShift)&valueMask);
+ if(value==0){cellsUsedPersonal++;} //First touch of this cell; summed into cellsUsed by shutdown().
+ value=min(value+1, maxValue);
+ word=(value<<cellShift)|(word&~((valueMask)<<cellShift)); //Clear the old cell bits, then insert the new value.
+ array[index]=word;
+ return value;
+ }
+
+ private int[] array;
+ private final int num;
+ public long cellsUsedPersonal=0;
+
+ public ArrayBlockingQueue<long[]> writeQueue=new ArrayBlockingQueue<long[]>(16);
+ public boolean shutdown=false;
+
+ }
+
+
+ public long cellsUsed(){return cellsUsed;}
+
+ private boolean finished=false;
+
+ private long cellsUsed;
+ private final int[][] matrix;
+ private final WriteThread[] writers=new WriteThread[numArrays];
+ private final int hashes;
+ private final int wordsPerArray;
+ private final long cellsPerArray;
+ private final long cellMod;
+ private final long[][] hashMasks=makeMasks(8, hashArrayLength);
+
+ private final long[][] buffers=new long[numArrays][500];
+ private final int[] bufferlen=new int[numArrays];
+
+ private static final int hashBits=6;
+ private static final int hashArrayLength=1<<hashBits;
+ private static final int hashCellMask=hashArrayLength-1;
+ private static final long[] poison=new long[0];
+
+ private static long counter=0;
+
+}
diff --git a/current/bloom/KCountArray7MTA.java b/current/bloom/KCountArray7MTA.java
new file mode 100755
index 0000000..2baf7c1
--- /dev/null
+++ b/current/bloom/KCountArray7MTA.java
@@ -0,0 +1,660 @@
+package bloom;
+
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+import kmer.Primes;
+
+
+import align2.Tools;
+
+import dna.Timer;
+
+
+/**
+ *
+ * Uses prime numbers for array lengths.
+ * Uses atomic integers for concurrency control.
+ * Allows an optional prefilter.
+ *
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public final class KCountArray7MTA extends KCountArray {
+
+ /**
+  * Command-line smoke test: builds an array with no prefilter, then prints the count of a few
+  * fixed keys before and after each increment.
+  * @param args cells, bits, gap, hashes (in that order)
+  */
+ public static void main(String[] args){
+ 	final long cells=Long.parseLong(args[0]);
+ 	final int bits=Integer.parseInt(args[1]);
+ 	final int gap=Integer.parseInt(args[2]);
+ 	final int hashes=Integer.parseInt(args[3]);
+
+ 	verbose=false;
+
+ 	final KCountArray7MTA kca=new KCountArray7MTA(cells, bits, gap, hashes, null, 0);
+
+ 	//Each test key is read once, then incremented and re-read the listed number of times.
+ 	final int[] testKeys={0, 1, 100, 150};
+ 	final int[] incrementsPerKey={2, 2, 3, 1};
+ 	for(int t=0; t<testKeys.length; t++){
+ 		final int testKey=testKeys[t];
+ 		System.out.println(kca.read(testKey));
+ 		for(int n=0; n<incrementsPerKey[t]; n++){
+ 			kca.increment(testKey);
+ 			System.out.println(kca.read(testKey));
+ 		}
+ 		System.out.println();
+ 	}
+
+ }
+
+ /**
+ * Builds the counting array.
+ * @param cells_ Requested cell count; getPrimeCells adjusts it before it reaches the superclass.
+ * @param bits_ Bits per cell, passed to the superclass.
+ * @param gap_ Passed through to the superclass unchanged.
+ * @param hashes_ Number of hash rounds applied per key; asserted to be in 1..hashMasks.length.
+ * @param prefilter_ Optional filter consulted before this one; may be null.
+ * @param prefilterLimit_ Prefilter threshold; clamped to prefilter_.maxValue, and forced to 0 when there is no prefilter.
+ */
+ public KCountArray7MTA(long cells_, int bits_, int gap_, int hashes_, KCountArray prefilter_, int prefilterLimit_){
+ super(getPrimeCells(cells_, bits_), bits_, gap_, getDesiredArrays(cells_, bits_));
+// verbose=false;
+// System.out.println(cells);
+ cellsPerArray=cells/numArrays;
+ //Round up so a partially filled last word still gets storage.
+ wordsPerArray=(int)((cellsPerArray%cellsPerWord)==0 ? (cellsPerArray/cellsPerWord) : (cellsPerArray/cellsPerWord+1));
+ cellMod=cellsPerArray;
+ hashes=hashes_;
+ prefilter=prefilter_;
+ prefilterLimit=(prefilter==null ? 0 : Tools.min(prefilter.maxValue, prefilterLimit_));
+// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes);
+
+ matrix=allocMatrix(numArrays, wordsPerArray);
+
+// matrix=new AtomicIntegerArray[numArrays];
+// for(int i=0; i<matrix.length; i++){
+// matrix[i]=new AtomicIntegerArray(wordsPerArray);
+// }
+
+ assert(hashes>0 && hashes<=hashMasks.length);
+ }
+
+ private static int getDesiredArrays(long desiredCells, int bits){
+
+ long words=Tools.max((desiredCells*bits+31)/32, minArrays);
+ int arrays=minArrays;
+ while(words/arrays>=Integer.MAX_VALUE){
+ arrays*=2;
+ }
+// assert(false) : arrays;
+ return arrays;
+// return Tools.max(arrays, Data.LOGICAL_PROCESSORS*4);
+ }
+
+ private static long getPrimeCells(long desiredCells, int bits){
+
+ int arrays=getDesiredArrays(desiredCells, bits);
+
+ long x=(desiredCells+arrays-1)/arrays;
+ long x2=Primes.primeAtMost(x);
+ return x2*arrays;
+ }
+
+ /**
+  * Reads the count stored for a single raw key: the minimum cell value over all hash rounds.
+  * If a prefilter is present and its count is below prefilterLimit, that count is returned instead.
+  * The scan stops early once the running minimum reaches zero.
+  */
+ @Override
+ public final int read(final long rawKey){
+ 	if(verbose){System.err.println("Reading raw key "+rawKey);}
+ 	if(prefilter!=null){
+ 		final int pre=prefilter.read(rawKey);
+ 		if(pre<prefilterLimit){return pre;}
+ 	}
+ 	long hashed=hash(rawKey, 0);
+ 	int lowest=readHashed(hashed);
+ 	int row=1;
+ 	while(row<hashes && lowest>0){
+ 		if(verbose){System.err.println("Reading. i="+row+", key2="+hashed);}
+ 		hashed=hash(Long.rotateRight(hashed, hashBits), row);
+ 		if(verbose){System.err.println("Rot/hash. i="+row+", key2="+hashed);}
+ 		lowest=min(lowest, readHashed(hashed));
+ 		row++;
+ 	}
+ 	return lowest;
+ }
+
+ /**
+  * Reads the count stored for a compound key made of several longs.
+  * If a prefilter is present and its count is below prefilterLimit, that count is returned instead.
+  * Otherwise, each hash round folds every element of rawKeys into a running hash, reads that
+  * cell, and the minimum over all rounds is returned.
+  * @param rawKeys Key components; must contain at least one element.
+  * @return Minimum cell value over all hash rounds (at most maxValue).
+  */
+ @Override
+ public final int read(final long[] rawKeys){
+ 	if(verbose){System.err.println("Reading raw key "+Arrays.toString(rawKeys));}
+ 	if(prefilter!=null){
+ 		int pre=prefilter.read(rawKeys);
+ 		if(pre<prefilterLimit){return pre;}
+ 	}
+ 	//Clear the sign bit before taking the remainder: Java's % keeps the dividend's sign, so a
+ 	//negative first key used to produce a negative row and index hashMasks out of bounds.
+ 	//Non-negative keys hash exactly as before.
+ 	final int seedRow=(int)(1+((rawKeys[0]&Long.MAX_VALUE)%5));
+ 	long key2=hash(rawKeys[0], seedRow);
+ 	int min=maxValue;
+ 	for(int i=0; i<hashes; i++){
+ 		for(int keynum=0; keynum<rawKeys.length; keynum++){
+ 			if(verbose){System.err.println("Reading. i="+i+", key2="+key2);}
+ 			key2=hash(key2^rawKeys[keynum], i);
+ 			if(verbose){System.err.println("Rot/hash. i="+i+", key2="+key2);}
+ 		}
+ 		min=min(min, readHashed(key2));
+ 		key2=Long.rotateRight(key2, hashBits);
+ 	}
+ 	return min;
+ }
+
+ @Override
+ public final int readLeft(final long key, final int k, boolean makeCanonical){
+ assert(k<=32);
+ final long key2=key>>>2;
+ final int shift=2*(k-1);
+ final long akey=key2|(0L<<shift);
+ final long ckey=key2|(1L<<shift);
+ final long gkey=key2|(2L<<shift);
+ final long tkey=key2|(3L<<shift);
+ final int a=read(makeCanonical ? makeCanonical2(akey, k) : akey);
+ final int c=read(makeCanonical ? makeCanonical2(ckey, k) : ckey);
+ final int g=read(makeCanonical ? makeCanonical2(gkey, k) : gkey);
+ final int t=read(makeCanonical ? makeCanonical2(tkey, k) : tkey);
+ return a+c+g+t;
+ }
+
+ @Override
+ public final int readRight(final long key, final int k, boolean makeCanonical){
+ assert(k<=32);
+ final long mask=(k>=32 ? -1L : ~((-1L)<<(2*k)));
+ final long key2=(key<<2)&mask;
+ final long akey=key2|0L;
+ final long ckey=key2|1L;
+ final long gkey=key2|2L;
+ final long tkey=key2|3L;
+ final int a=read(makeCanonical ? makeCanonical2(akey, k) : akey);
+ final int c=read(makeCanonical ? makeCanonical2(ckey, k) : ckey);
+ final int g=read(makeCanonical ? makeCanonical2(gkey, k) : gkey);
+ final int t=read(makeCanonical ? makeCanonical2(tkey, k) : tkey);
+ return a+c+g+t;
+ }
+
+ /**
+  * Like readLeft, but returns the four counts individually instead of their sum.
+  * @param key Encoded key (2 bits per symbol).
+  * @param k Key length in symbols; at most 32.
+  * @param makeCanonical If true, each candidate is passed through makeCanonical2 before reading.
+  * @param rvec Output array of length 4, indexed by symbol; allocated here when null.
+  * @return rvec, filled in.
+  */
+ @Override
+ public final int[] readAllLeft(final long key, final int k, boolean makeCanonical, int[] rvec){
+ 	assert(k<=32);
+ 	if(rvec==null){rvec=new int[4];}
+ 	final long suffix=key>>>2;
+ 	final int topShift=2*(k-1);
+ 	for(int symbol=0; symbol<4; symbol++){
+ 		final long candidate=suffix|(((long)symbol)<<topShift);
+ 		rvec[symbol]=read(makeCanonical ? makeCanonical2(candidate, k) : candidate);
+ 	}
+ 	return rvec;
+ }
+
+ /**
+  * Like readRight, but returns the four counts individually instead of their sum.
+  * @param key Encoded key (2 bits per symbol).
+  * @param k Key length in symbols; at most 32.
+  * @param makeCanonical If true, each candidate is passed through makeCanonical2 before reading.
+  * @param rvec Output array of length 4, indexed by symbol; allocated here when null.
+  * @return rvec, filled in.
+  */
+ @Override
+ public final int[] readAllRight(final long key, final int k, boolean makeCanonical, int[] rvec){
+ 	assert(k<=32);
+ 	//Match readAllLeft: callers may pass null to request a fresh result array.
+ 	if(rvec==null){rvec=new int[4];}
+ 	final long mask=(k>=32 ? -1L : ~((-1L)<<(2*k)));
+ 	final long key2=(key<<2)&mask;
+ 	final long akey=key2|0L;
+ 	final long ckey=key2|1L;
+ 	final long gkey=key2|2L;
+ 	final long tkey=key2|3L;
+ 	rvec[0]=read(makeCanonical ? makeCanonical2(akey, k) : akey);
+ 	rvec[1]=read(makeCanonical ? makeCanonical2(ckey, k) : ckey);
+ 	rvec[2]=read(makeCanonical ? makeCanonical2(gkey, k) : gkey);
+ 	rvec[3]=read(makeCanonical ? makeCanonical2(tkey, k) : tkey);
+ 	return rvec;
+ }
+
+ /**
+ * Looks up the cell addressed by an already-hashed key.
+ * The low bits (key&arrayMask) pick the sub-array; the remaining bits, reduced modulo cellMod,
+ * pick the cell within it.
+ * @param key Hashed key.
+ * @return The cell's current value (masked with valueMask).
+ */
+ private final int readHashed(long key){
+ if(verbose){System.err.print("Reading hashed key "+key);}
+// System.out.println("key="+key);
+ int arrayNum=(int)(key&arrayMask);
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+// System.out.println("array="+arrayNum);
+// System.out.println("key2="+key);
+ AtomicIntegerArray array=matrix[arrayNum];
+ int index=(int)(key>>>indexShift);
+// assert(false) : indexShift;
+// System.out.println("index="+index);
+ int word=array.get(index);
+// System.out.println("word="+Integer.toHexString(word));
+ //Java uses only the low 5 bits of an int shift distance, so shifting by the full product
+ //is expected to select the same cell as shifting by the in-word cell number; the assert checks that.
+ assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask)));
+// int cellShift=(int)(cellBits*(key&cellMask));
+ int cellShift=(int)(cellBits*key);
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));}
+// System.out.println("cellShift="+cellShift);
+ return (int)((word>>>cellShift)&valueMask);
+ }
+
+ /** Unsupported in this class; always throws RuntimeException. */
+ @Override
+ public final void write(final long key, int value){
+ throw new RuntimeException("Not allowed for this class.");
+ }
+
+ /** Increments every key in the array, one at a time, via increment(long). */
+ @Override
+ public final void increment(long[] keys){
+ 	for(final long key : keys){
+ 		increment(key);
+ 	}
+ }
+
+ /**
+  * Adds one to the cell selected by each hash round for this key.
+  * Does nothing when a prefilter is present and its count for the key is below prefilterLimit.
+  */
+ @Override
+ public final void increment(final long rawKey){
+ 	if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+
+ 	if(prefilter!=null && prefilter.read(rawKey)<prefilterLimit){return;}
+
+ 	long hashed=rawKey;
+ 	for(int row=0; row<hashes; row++){
+ 		hashed=hash(hashed, row);
+ 		if(verbose){System.err.println("key2="+hashed+", value="+readHashed(hashed));}
+ 		incrementHashedLocal(hashed);
+ 		//Rotate so the next round draws its array/cell bits from a different part of the key.
+ 		hashed=Long.rotateRight(hashed, hashBits);
+ 	}
+ }
+
+ /**
+ * Subtracts one (floored at zero) from the cell selected by each hash round for this key.
+ * Uses the same hash/rotate sequence as increment(long), so it touches the same cells.
+ * The prefilter is not consulted here.
+ */
+ @Override
+ public final void decrement(final long rawKey){
+ if(verbose){System.err.println("\n*** Decrementing raw key "+rawKey+" ***");}
+
+ //NOTE(review): this asserts that a prefilter IS present, yet nothing below reads it - confirm the intended condition.
+ assert(prefilter!=null);
+
+ long key2=rawKey;
+ for(int i=0; i<hashes; i++){
+ key2=hash(key2, i);
+ if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+// assert(readHashed(key2)==0);
+
+// int bnum=(int)(key2&arrayMask);
+ decrementHashedLocal(key2);
+// assert(read(rawKey)<=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+// assert(readHashed(key2)>=min+incr) : "i="+i+", original="+min+", new should be <="+(min+incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey;
+ key2=Long.rotateRight(key2, hashBits);
+ }
+ }
+
+ /** Unsupported in this class; always throws RuntimeException. */
+ @Override
+ public int incrementAndReturn(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ /**
+  * Adds incr to the cell selected by each hash round and reports the count the key had before.
+  * When a prefilter is present and its count is below prefilterLimit, nothing is modified and
+  * the prefilter's count is returned.
+  * @param rawKey Key to increment.
+  * @param incr Amount to add to each cell (cells saturate at maxValue).
+  * @return Minimum pre-increment cell value over all hash rounds.
+  */
+ public int incrementAndReturnUnincremented(final long rawKey, final int incr){
+
+ 	if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+
+ 	if(prefilter!=null){
+ 		final int pre=prefilter.read(rawKey);
+ 		if(pre<prefilterLimit){return pre;}
+ 	}
+
+ 	int lowest=maxValue;
+ 	long hashed=rawKey;
+ 	for(int row=0; row<hashes; row++){
+ 		hashed=hash(hashed, row);
+ 		if(verbose){System.err.println("key2="+hashed+", value="+readHashed(hashed));}
+ 		final int before=incrementHashedLocalAndReturnUnincremented(hashed, incr);
+ 		lowest=min(lowest, before);
+ 		hashed=Long.rotateRight(hashed, hashBits);
+ 	}
+ 	return lowest;
+ }
+
+ /**
+  * Compound-key variant: adds incr to the cell selected by each hash round and reports the
+  * count the key had before. Each round folds every element of rawKeys into a running hash,
+  * using the same sequence as read(long[]).
+  * When a prefilter is present and its count is below prefilterLimit, nothing is modified and
+  * the prefilter's count is returned.
+  * @param rawKeys Key components; must contain at least one element.
+  * @param incr Amount to add to each cell (cells saturate at maxValue).
+  * @return Minimum pre-increment cell value over all hash rounds.
+  */
+ public int incrementAndReturnUnincremented(final long[] rawKeys, final int incr){
+
+ 	if(verbose){System.err.println("\n*** Incrementing raw keys "+Arrays.toString(rawKeys)+" ***");}
+
+ 	if(prefilter!=null){
+ 		int x=prefilter.read(rawKeys);
+ 		if(x<prefilterLimit){return x;}
+ 	}
+
+ 	//Clear the sign bit before taking the remainder: Java's % keeps the dividend's sign, so a
+ 	//negative first key used to produce a negative row and index hashMasks out of bounds.
+ 	//Non-negative keys hash exactly as before.
+ 	final int seedRow=(int)(1+((rawKeys[0]&Long.MAX_VALUE)%5));
+ 	long key2=hash(rawKeys[0], seedRow);
+ 	int value=maxValue;
+ 	for(int i=0; i<hashes; i++){
+ 		for(int keynum=0; keynum<rawKeys.length; keynum++){
+ 			key2=hash(key2^rawKeys[keynum], i);
+ 			if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+ 		}
+ 		int x=incrementHashedLocalAndReturnUnincremented(key2, incr);
+ 		value=min(value, x);
+ 		key2=Long.rotateRight(key2, hashBits);
+ 	}
+ 	return value;
+ }
+
+ /** Delegates to the transformToFrequency overload that takes this instance's matrix. */
+ public long[] transformToFrequency(){
+ return transformToFrequency(matrix);
+ }
+
+ /**
+  * Renders every cell of every sub-array, lowest cell of each word first, as
+  * "[v0, v1, ...]". Intended for small arrays; the output has one entry per cell.
+  */
+ public String toContentsString(){
+ 	final StringBuilder sb=new StringBuilder();
+ 	sb.append("[");
+ 	boolean first=true;
+ 	for(AtomicIntegerArray array : matrix){
+ 		for(int i=0; i<array.length(); i++){
+ 			int remaining=array.get(i);
+ 			for(int cell=0; cell<cellsPerWord; cell++){
+ 				if(!first){sb.append(", ");}
+ 				first=false;
+ 				final int cellValue=remaining&valueMask;
+ 				sb.append(cellValue);
+ 				remaining>>>=cellBits;
+ 			}
+ 		}
+ 	}
+ 	sb.append("]");
+ 	return sb.toString();
+ }
+
+ /** Returns cellsUsed() divided by the total cell count. */
+ public double usedFraction(){return cellsUsed()/(double)cells;}
+
+ /** Returns cellsUsed(mindepth) divided by the total cell count. */
+ public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;}
+
+ /** Counts cells via cellsUsedMT(mindepth); every call rescans the matrix. */
+ public long cellsUsed(int mindepth){
+ return cellsUsedMT(mindepth);
+ }
+
+ /**
+  * Counts cells using one CountUsedThread per sub-array, then sums the per-thread counts.
+  * Blocks until every thread has terminated; an interrupt while waiting is printed and the
+  * wait is resumed.
+  * @param mindepth Threshold handed to each CountUsedThread.
+  * @return Sum of the per-thread counts.
+  */
+ public long cellsUsedMT(int mindepth){
+ 	final ArrayList<CountUsedThread> workers=new ArrayList<CountUsedThread>(matrix.length);
+ 	for(int i=0; i<matrix.length; i++){
+ 		final CountUsedThread worker=new CountUsedThread(matrix[i], mindepth);
+ 		worker.start();
+ 		workers.add(worker);
+ 	}
+ 	long total=0;
+ 	for(int i=0; i<workers.size(); i++){
+ 		final CountUsedThread worker=workers.get(i);
+ 		for(boolean done=(worker.getState()==State.TERMINATED); !done; done=(worker.getState()==State.TERMINATED)){
+ 			try {
+ 				worker.join();
+ 			} catch (InterruptedException e) {
+ 				// TODO Auto-generated catch block
+ 				e.printStackTrace();
+ 			}
+ 		}
+ 		total+=worker.count;
+ 	}
+ 	return total;
+ }
+
+ /**
+ * Worker that scans one sub-array and counts cells whose value is at least mindepth.
+ * The result is left in the public count field once run() completes.
+ */
+ private class CountUsedThread extends Thread{
+ public CountUsedThread(AtomicIntegerArray a_, int mindepth_){
+ array=a_;
+ mindepth=mindepth_;
+ }
+ public void run(){
+ long temp=0;
+ if(array!=null){
+// System.out.println("C");
+// assert(false) : Integer.toBinaryString(valueMask);
+ //A Java int shift by 32 is a no-op, so the shifting loop below would never finish
+ //for 32-bit cells; they are handled as one cell per word instead.
+ if(cellBits==32){
+ for(int i=0, max=array.length(); i<max; i++){
+ int word=array.get(i);
+ if(word!=0){
+ int x=word&valueMask;
+ if(x>=mindepth){temp++;}
+ }
+ }
+ }else{
+ for(int i=0, max=array.length(); i<max; i++){
+ // System.out.println("D: "+Integer.toHexString(word));
+ int word=array.get(i);
+ //Peel cells off the low end until no set bits remain in the word.
+ while(word!=0){
+ int x=word&valueMask;
+ // System.out.println("E: "+x+", "+mindepth);
+ if(x>=mindepth){temp++;}
+ word=word>>>cellBits;
+ }
+ }
+ }
+ }
+ count=temp;
+ }
+ private final AtomicIntegerArray array;
+ private final int mindepth;
+ public long count;
+ }
+
+
+ /**
+ * Scrambles a key by XORing it with one of this instance's random masks.
+ * The mask column is chosen from the key itself; for row 0 the key is first XORed with a
+ * mask from row (0+4)%hashMasks.length and the column is re-derived from the result.
+ * @param key Key to scramble.
+ * @param row Mask row to use; asserted to be in 0..hashMasks.length-1.
+ * @return key XOR hashMasks[row][column].
+ */
+ final long hash(long key, int row){
+ //Sign bit is cleared first, so the remainder is non-negative (0..hashArrayLength-2).
+ int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+// int cell=(int)(hashCellMask&(key));
+
+ if(row==0){//Doublehash only first time
+ key=key^hashMasks[(row+4)%hashMasks.length][cell];
+ cell=(int)(hashCellMask&(key>>5));
+// cell=(int)(hashCellMask&(key>>hashBits));
+// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+ }
+
+ assert(row>=0 && row<hashMasks.length) : row+", "+hashMasks.length;
+ assert(cell>=0 && cell<hashMasks[0].length) : cell+", "+hashMasks[0].length;
+ return key^hashMasks[row][cell];
+ }
+
+ /**
+ * Creates a table of random XOR masks for hashing.
+ * The random seed is the current value of the static counter, which is then incremented
+ * under the class lock, so successive calls use seeds 0, 1, 2, ...
+ * @param rows Number of mask rows.
+ * @param cols Number of masks per row; fillMasks asserts this equals 1<<hashBits.
+ * @return The filled rows x cols mask table.
+ */
+ private static long[][] makeMasks(int rows, int cols) {
+
+ long seed;
+ synchronized(KCountArray7MTA.class){
+ seed=counter;
+ counter++;
+ }
+
+ Timer t=new Timer();
+ long[][] r=new long[rows][cols];
+ Random randy=new Random(seed);
+ for(int i=0; i<r.length; i++){
+ fillMasks(r[i], randy);
+ }
+ t.stop();
+ //Only report unusually slow mask creation (threshold presumably in nanoseconds - confirm against dna.Timer).
+ if(t.elapsed>200000000L){System.out.println("Mask-creation time: "+t);}
+ return r;
+ }
+
+
+ /**
+ * Fills one row of the mask table in place.
+ * Each mask is built so that, before the sign bit is cleared, its low 32 bits and its high
+ * 32 bits each contain exactly 16 set bits. A candidate is rejected and redrawn if its lowest
+ * hashBits bits, or the next hashBits bits, match those of a mask already placed in this row.
+ * @param r Row to fill; its length must be 1<<hashBits.
+ * @param randy Random source; the order of calls on it determines the masks.
+ */
+ private static void fillMasks(long[] r, Random randy) {
+// for(int i=0; i<r.length; i++){
+// long x=0;
+// while(Long.bitCount(x&0xFFFFFFFF)!=16){
+// x=randy.nextLong();
+// }
+// r[i]=(x&Long.MAX_VALUE);
+// }
+
+ final int hlen=(1<<hashBits);
+ assert(r.length==hlen);
+ //count1/count2 record which low-field and second-field values are already taken in this row.
+ int[] count1=new int[hlen];
+ int[] count2=new int[hlen];
+ final long mask=hlen-1;
+
+ for(int i=0; i<r.length; i++){
+ long x=0;
+ int y=0;
+ int z=0;
+ //x==0 has a bit count of 0, so this loop runs at least once and repeats after every rejection.
+ while(Long.bitCount(x&0xFFFFFFFFL)!=16){
+ x=randy.nextLong();
+ //Randomly set or clear bits until each 32-bit half has exactly 16 ones.
+ while(Long.bitCount(x&0xFFFFFFFFL)<16){
+ x|=(1L<<randy.nextInt(32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFFL)>16){
+ x&=(~(1L<<randy.nextInt(32)));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)<16){
+ x|=(1L<<(randy.nextInt(32)+32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)>16){
+ x&=(~(1L<<(randy.nextInt(32)+32)));
+ }
+
+// System.out.print(".");
+// y=(((int)(x&mask))^i);
+ y=(((int)(x&mask)));
+ z=(int)((x>>hashBits)&mask);
+ //Reject the candidate if either field collides with an earlier mask in this row.
+ if(count1[y]>0 || count2[z]>0){
+ x=0;
+ }
+ }
+// System.out.println(Long.toBinaryString(x));
+ //Store with the sign bit cleared.
+ r[i]=(x&Long.MAX_VALUE);
+ count1[y]++;
+ count2[z]++;
+ }
+
+ }
+
+
+ /** No-op in this class. */
+ public void initialize(){}
+
+ /**
+ * Marks the array as finished and refreshes the cached cellsUsed count.
+ * Only the first call does any work; later calls return immediately.
+ */
+ public void shutdown(){
+ if(finished){return;}
+ synchronized(this){
+ if(finished){return;}
+
+ //A negative value makes cellsUsed() recount the matrix and store the result.
+ cellsUsed=-1;
+// for(int i=0; i<numArrays; i++){
+// cellsUsed+=cellsUsedPersonal.get(i);
+// }
+ cellsUsed();
+
+ assert(!finished);
+ finished=true;
+ }
+ }
+
+ /**
+ * Adds one to the cell addressed by an already-hashed key, saturating at maxValue.
+ * The containing word is updated with a compare-and-set retry loop.
+ * @param key Hashed key.
+ * @return The cell's value after the increment.
+ */
+ private int incrementHashedLocal(long key){
+ final int num=(int)(key&arrayMask);
+ final AtomicIntegerArray array=matrix[num];
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int cellShift=(int)(cellBits*key);
+ int value, word, word2;
+ do{
+ word=array.get(index);
+ value=((word>>>cellShift)&valueMask);
+ value=min(value+1, maxValue);
+ word2=(value<<cellShift)|(word&~((valueMask)<<cellShift));
+ //Skip the write when nothing changes (cell already at maxValue); otherwise retry until the CAS succeeds.
+ }while(word!=word2 && !array.compareAndSet(index, word, word2));
+// if(value==1){cellsUsedPersonal.incrementAndGet(num);}
+ return value;
+ }
+
+ /**
+ * Adds incr to the cell addressed by an already-hashed key, saturating at maxValue.
+ * The containing word is updated with a compare-and-set retry loop.
+ * @param key Hashed key.
+ * @param incr Amount to add; asserted to be non-negative.
+ * @return The cell's value as read on the final loop iteration, i.e. before the increment.
+ */
+ private int incrementHashedLocalAndReturnUnincremented(long key, int incr){
+ assert(incr>=0);
+ final int num=(int)(key&arrayMask);
+ final AtomicIntegerArray array=matrix[num];
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int cellShift=(int)(cellBits*key);
+ int value, word, word2;
+ do{
+ word=array.get(index);
+ value=((word>>>cellShift)&valueMask);
+ int value2=min(value+incr, maxValue);
+ word2=(value2<<cellShift)|(word&~((valueMask)<<cellShift));
+ //Skip the write when nothing changes; otherwise retry until the CAS succeeds.
+ }while(word!=word2 && !array.compareAndSet(index, word, word2));
+// if(value==1){cellsUsedPersonal.incrementAndGet(num);}
+ return value;
+ }
+
+ /**
+ * Subtracts one from the cell addressed by an already-hashed key, never going below zero.
+ * The containing word is updated with a compare-and-set retry loop.
+ * @param key Hashed key.
+ * @return The cell's value after the decrement.
+ */
+ private int decrementHashedLocal(long key){
+ final int num=(int)(key&arrayMask);
+ final AtomicIntegerArray array=matrix[num];
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int cellShift=(int)(cellBits*key);
+ int value, word, word2;
+ do{
+ word=array.get(index);
+ value=((word>>>cellShift)&valueMask);
+ value=max(value-1, 0);
+ word2=(value<<cellShift)|(word&~((valueMask)<<cellShift));
+ //Skip the write when nothing changes (cell already zero); otherwise retry until the CAS succeeds.
+ }while(word!=word2 && !array.compareAndSet(index, word, word2));
+// if(value==1){cellsUsedPersonal.incrementAndGet(num);}
+ return value;
+ }
+
+ /**
+ * Returns the cached count of used cells, recounting with cellsUsed(1) only when the cached
+ * value is negative. The field starts at its default of 0 and is set to -1 by shutdown(),
+ * so before shutdown() this returns 0 rather than a live count.
+ */
+ public long cellsUsed(){
+ if(cellsUsed<0){
+ synchronized(this){
+ //Re-check under the lock so only one caller performs the recount.
+ if(cellsUsed<0){
+ cellsUsed=cellsUsed(1);
+ }
+ }
+ }
+ return cellsUsed;
+ }
+
+ /** Returns the prefilter, or null if none is attached. */
+ public KCountArray prefilter(){
+ return prefilter;
+ }
+
+ /** Detaches the prefilter; afterwards read and increment no longer consult it. */
+ public void purgeFilter(){
+ prefilter=null;
+ }
+
+ private boolean finished=false;
+
+ private long cellsUsed;
+ private final AtomicIntegerArray[] matrix;
+ private final int hashes;
+ private final int wordsPerArray;
+ private final long cellsPerArray;
+ private final long cellMod;
+ private final long[][] hashMasks=makeMasks(8, hashArrayLength);
+ private final int prefilterLimit;
+
+ private static final int hashBits=6;
+ private static final int hashArrayLength=1<<hashBits;
+ private static final int hashCellMask=hashArrayLength-1;
+
+ private KCountArray prefilter;
+
+ private static long counter=0;
+
+}
diff --git a/current/bloom/KCountArray8MT.java b/current/bloom/KCountArray8MT.java
new file mode 100755
index 0000000..0055cce
--- /dev/null
+++ b/current/bloom/KCountArray8MT.java
@@ -0,0 +1,585 @@
+package bloom;
+
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import kmer.Primes;
+
+
+import align2.Tools;
+
+import dna.Timer;
+
+
+/**
+ *
+ * Uses prime numbers for array lengths.
+ * Supports a prefilter that is checked before looking at the main filter.
+ *
+ * @author Brian Bushnell
+ * @date Aug 17, 2012
+ *
+ */
+public class KCountArray8MT extends KCountArray {
+
+ /**
+ * Command-line smoke test: prints the count of a few fixed keys before and after increments.
+ * NOTE(review): as written this does not fit the class's lifecycle - the constructor asserts a
+ * non-null prefilter but null is passed, read() asserts that shutdown() has completed, and the
+ * sub-arrays in matrix are only allocated by the writer threads that initialize() starts, which
+ * is never called here. Confirm whether this entry point is still meant to work.
+ * @param args cells, bits, gap, hashes (in that order)
+ */
+ public static void main(String[] args){
+ long cells=Long.parseLong(args[0]);
+ int bits=Integer.parseInt(args[1]);
+ int gap=Integer.parseInt(args[2]);
+ int hashes=Integer.parseInt(args[3]);
+
+ verbose=false;
+
+ KCountArray8MT kca=new KCountArray8MT(cells, bits, gap, hashes, null);
+
+ System.out.println(kca.read(0));
+ kca.increment(0);
+ System.out.println(kca.read(0));
+ kca.increment(0);
+ System.out.println(kca.read(0));
+ System.out.println();
+
+ System.out.println(kca.read(1));
+ kca.increment(1);
+ System.out.println(kca.read(1));
+ kca.increment(1);
+ System.out.println(kca.read(1));
+ System.out.println();
+
+ System.out.println(kca.read(100));
+ kca.increment(100);
+ System.out.println(kca.read(100));
+ kca.increment(100);
+ System.out.println(kca.read(100));
+ kca.increment(100);
+ System.out.println(kca.read(100));
+ System.out.println();
+
+
+ System.out.println(kca.read(150));
+ kca.increment(150);
+ System.out.println(kca.read(150));
+ System.out.println();
+
+ }
+
+ /**
+ * Builds the counting array. Storage for the sub-arrays is not allocated here; each row of
+ * matrix stays null until the corresponding WriteThread runs.
+ * @param cells_ Requested cell count; getPrimeCells adjusts it before it reaches the superclass.
+ * @param bits_ Bits per cell, passed to the superclass.
+ * @param gap_ Passed through to the superclass unchanged.
+ * @param hashes_ Number of hash rounds applied per key; asserted to be in 1..hashMasks.length.
+ * @param prefilter_ Filter consulted before this one; asserted non-null here, although other methods null-check it.
+ */
+ public KCountArray8MT(long cells_, int bits_, int gap_, int hashes_, KCountArray prefilter_){
+ super(getPrimeCells(cells_, bits_), bits_, gap_, getDesiredArrays(cells_, bits_));
+// verbose=false;
+// assert(false);
+ cellsPerArray=cells/numArrays;
+ //Round up so a partially filled last word still gets storage.
+ wordsPerArray=(int)((cellsPerArray%cellsPerWord)==0 ? (cellsPerArray/cellsPerWord) : (cellsPerArray/cellsPerWord+1));
+ cellMod=cellsPerArray;
+ hashes=hashes_;
+// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes);
+// assert(false);
+ matrix=new int[numArrays][];
+ prefilter=prefilter_;
+ assert(prefilter!=null);
+ assert(hashes>0 && hashes<=hashMasks.length);
+ }
+
+ private static int getDesiredArrays(long desiredCells, int bits){
+
+ long words=Tools.max((desiredCells*bits+31)/32, minArrays);
+ int arrays=minArrays;
+ while(words/arrays>=Integer.MAX_VALUE){
+ arrays*=2;
+ }
+ return arrays;
+ }
+
+ private static long getPrimeCells(long desiredCells, int bits){
+
+ int arrays=getDesiredArrays(desiredCells, bits);
+
+ long x=(desiredCells+arrays-1)/arrays;
+ long x2=Primes.primeAtMost(x);
+ return x2*arrays;
+ }
+
+ /**
+  * Reads the count stored for a raw key: the minimum cell value over all hash rounds.
+  * Asserts that shutdown() has completed. If a prefilter is present and its count is below the
+  * prefilter's maxValue, that count is returned instead. The scan stops early once the running
+  * minimum reaches zero.
+  */
+ public int read(final long rawKey){
+ 	assert(finished);
+ 	if(verbose){System.err.println("Reading raw key "+rawKey);}
+ 	int pre=0;
+ 	if(prefilter!=null){
+ 		pre=prefilter.read(rawKey);
+ 		if(pre<prefilter.maxValue){return pre;}
+ 	}
+ 	long hashed=hash(rawKey, 0);
+ 	int lowest=readHashed(hashed);
+ 	int row=1;
+ 	while(row<hashes && lowest>0){
+ 		if(verbose){System.err.println("Reading. i="+row+", key2="+hashed);}
+ 		hashed=hash(Long.rotateRight(hashed, hashBits), row);
+ 		if(verbose){System.err.println("Rot/hash. i="+row+", key2="+hashed);}
+ 		lowest=min(lowest, readHashed(hashed));
+ 		row++;
+ 	}
+ 	return lowest;
+ }
+
+ /**
+ * Looks up the cell addressed by an already-hashed key.
+ * The low bits (key&arrayMask) pick the sub-array; the remaining bits, reduced modulo cellMod,
+ * pick the cell within it.
+ * @param key Hashed key.
+ * @return The cell's current value (masked with valueMask).
+ */
+ private int readHashed(long key){
+ if(verbose){System.err.print("Reading hashed key "+key);}
+// System.out.println("key="+key);
+ int arrayNum=(int)(key&arrayMask);
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+// System.out.println("array="+arrayNum);
+// System.out.println("key2="+key);
+ int[] array=matrix[arrayNum];
+ int index=(int)(key>>>indexShift);
+// assert(false) : indexShift;
+// System.out.println("index="+index);
+ int word=array[index];
+// System.out.println("word="+Integer.toHexString(word));
+ //Java uses only the low 5 bits of an int shift distance, so shifting by the full product
+ //is expected to select the same cell as shifting by the in-word cell number; the assert checks that.
+ assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask)));
+// int cellShift=(int)(cellBits*(key&cellMask));
+ int cellShift=(int)(cellBits*key);
+ if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));}
+// System.out.println("cellShift="+cellShift);
+ return (int)((word>>>cellShift)&valueMask);
+ }
+
+ /** Unsupported in this class; always throws RuntimeException. */
+ public void write(final long key, int value){
+ throw new RuntimeException("Not allowed for this class.");
+ }
+
+ /**
+ * Increments a batch of keys.
+ * This should increase speed by doing the first hash outside the critical section, but it does not seem to help.
+ * The caller's array is overwritten: entries are replaced by their row-0 hashes, and when a
+ * prefilter is present the surviving keys are compacted to the front. With a prefilter, only
+ * keys whose prefilter count equals the prefilter's maxValue are incremented.
+ * Buffer updates happen while holding the lock on buffers.
+ * @param keys Raw keys; contents are modified.
+ */
+ @Override
+ public void increment(long[] keys){
+ if(prefilter==null){
+ for(int i=0; i<keys.length; i++){
+ keys[i]=hash(keys[i], 0);
+ }
+ synchronized(buffers){
+ for(long key : keys){
+ incrementPartiallyHashed(key);
+ }
+ }
+ }else{
+ //j counts the keys that passed the prefilter; they are packed into keys[0..j).
+ int j=0;
+ for(int i=0; i<keys.length; i++){
+ long key=keys[i];
+ int x=prefilter.read(key);
+ if(x==prefilter.maxValue){
+ keys[j]=hash(key, 0);
+ j++;
+ }
+ }
+ synchronized(buffers){
+ for(int i=0; i<j; i++){
+ incrementPartiallyHashed(keys[i]);
+ }
+ }
+ }
+ }
+
+ /**
+  * Queues one increment per hash round for this key. Each hashed key is appended to the
+  * buffer of the sub-array it maps to; a buffer that becomes full is handed to that
+  * sub-array's writer thread and replaced with a fresh one.
+  * Does nothing when a prefilter is present and its count for the key is below its maxValue.
+  */
+ public void increment(final long rawKey){
+ 	if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");}
+ 	if(prefilter!=null){
+ 		final int pre=prefilter.read(rawKey);
+ 		if(pre<prefilter.maxValue){return;}
+ 	}
+
+ 	long hashed=rawKey;
+ 	for(int row=0; row<hashes; row++){
+ 		hashed=hash(hashed, row);
+ 		if(verbose){System.err.println("key2="+hashed+", value="+readHashed(hashed));}
+
+ 		final int slot=(int)(hashed&arrayMask);
+ 		final long[] pending=buffers[slot];
+ 		final int next=bufferlen[slot];
+ 		pending[next]=hashed;
+ 		bufferlen[slot]++;
+ 		if(verbose){System.err.println("bufferlen["+slot+"] = "+bufferlen[slot]);}
+ 		final boolean full=(bufferlen[slot]>=pending.length);
+ 		if(full){
+
+ 			if(verbose){System.err.println("Moving array.");}
+ 			//Swap in an empty buffer before handing the full one to the writer.
+ 			bufferlen[slot]=0;
+ 			buffers[slot]=new long[pending.length];
+
+ 			writers[slot].add(pending);
+ 			if(verbose){System.err.println("Moved.");}
+ 		}
+ 		hashed=Long.rotateRight(hashed, hashBits);
+ 	}
+ }
+
+ /**
+  * Queues one increment per hash round for a key whose row-0 hash has already been applied
+  * by the caller (see increment(long[])). The key is buffered as given for round 0, then
+  * rotated and re-hashed for each remaining round.
+  * @param pKey Key already hashed with row 0.
+  */
+ private void incrementPartiallyHashed(final long pKey){
+ 	if(verbose){System.err.println("\n*** Incrementing key "+pKey+" ***");}
+
+ 	long key2=pKey;
+ 	bufferHashedKey(key2);
+
+ 	for(int i=1; i<hashes; i++){
+ 		key2=Long.rotateRight(key2, hashBits);
+ 		key2=hash(key2, i);
+ 		if(verbose){System.err.println("key2="+key2+", value="+readHashed(key2));}
+ 		bufferHashedKey(key2);
+ 	}
+ }
+
+ /**
+  * Appends a hashed key to the buffer of the sub-array it maps to; when that buffer becomes
+  * full it is handed to the sub-array's writer thread and replaced with a fresh one.
+  * @param key2 Fully hashed key for one round.
+  */
+ private void bufferHashedKey(final long key2){
+ 	int bnum=(int)(key2&arrayMask);
+ 	long[] array=buffers[bnum];
+ 	int loc=bufferlen[bnum];
+ 	array[loc]=key2;
+ 	bufferlen[bnum]++;
+ 	if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);}
+ 	if(bufferlen[bnum]>=array.length){
+
+ 		if(verbose){System.err.println("Moving array.");}
+ 		//Swap in an empty buffer before handing the full one to the writer.
+ 		bufferlen[bnum]=0;
+ 		buffers[bnum]=new long[array.length];
+
+ 		writers[bnum].add(array);
+ 		if(verbose){System.err.println("Moved.");}
+ 	}
+ }
+
+ /** Unsupported in this class; always throws RuntimeException. */
+ public int incrementAndReturn(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ /** Returns unincremented value. Unsupported in this class; always throws RuntimeException. */
+ public int incrementAndReturnUnincremented(long key, int incr){
+ throw new RuntimeException("Operation not supported.");
+ }
+
+ /** Delegates to the transformToFrequency overload that takes this instance's matrix. */
+ public long[] transformToFrequency(){
+ return transformToFrequency(matrix);
+ }
+
+ /**
+  * Renders every cell of every sub-array, lowest cell of each word first, as
+  * "[v0, v1, ...]". Intended for small arrays; the output has one entry per cell.
+  */
+ public String toContentsString(){
+ 	final StringBuilder sb=new StringBuilder();
+ 	sb.append("[");
+ 	boolean first=true;
+ 	for(int[] array : matrix){
+ 		for(int i=0; i<array.length; i++){
+ 			int remaining=array[i];
+ 			for(int cell=0; cell<cellsPerWord; cell++){
+ 				if(!first){sb.append(", ");}
+ 				first=false;
+ 				final int cellValue=remaining&valueMask;
+ 				sb.append(cellValue);
+ 				remaining>>>=cellBits;
+ 			}
+ 		}
+ 	}
+ 	sb.append("]");
+ 	return sb.toString();
+ }
+
+ /** Returns the stored cellsUsed field divided by the total cell count. */
+ public double usedFraction(){return cellsUsed/(double)cells;}
+
+ /** Returns cellsUsed(mindepth), which rescans the matrix, divided by the total cell count. */
+ public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;}
+
+ /**
+  * Counts cells whose value is at least mindepth by scanning every allocated sub-array.
+  * Sub-arrays that have not been allocated (null rows) are skipped.
+  * @param mindepth Minimum cell value to count.
+  * @return Number of qualifying cells.
+  */
+ public long cellsUsed(int mindepth){
+ 	long count=0;
+ 	for(int[] array : matrix){
+ 		if(array==null){continue;}
+ 		if(cellBits==32){
+ 			//One cell per word. A Java int shift by 32 is a no-op, so the peeling loop
+ 			//below would never terminate for 32-bit cells.
+ 			for(int word : array){
+ 				if(word!=0){
+ 					int x=word&valueMask;
+ 					if(x>=mindepth){count++;}
+ 				}
+ 			}
+ 		}else{
+ 			for(int word : array){
+ 				//Test for !=0 rather than >0: a word whose highest cell has its top bit set is
+ 				//negative as an int and was previously skipped entirely.
+ 				while(word!=0){
+ 					int x=word&valueMask;
+ 					if(x>=mindepth){count++;}
+ 					word>>>=cellBits;
+ 				}
+ 			}
+ 		}
+ 	}
+ 	return count;
+ }
+
+ /**
+ * Scrambles a key by XORing it with one of this instance's random masks.
+ * The mask column is chosen from the key itself; for row 0 the key is first XORed with a
+ * mask from row (0+4)%hashMasks.length and the column is re-derived from the result.
+ * @param key Key to scramble.
+ * @param row Mask row to use; must be a valid index into hashMasks.
+ * @return key XOR hashMasks[row][column].
+ */
+ final long hash(long key, int row){
+ //Sign bit is cleared first, so the remainder is non-negative (0..hashArrayLength-2).
+ int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+// int cell=(int)(hashCellMask&(key));
+
+ if(row==0){//Doublehash only first time
+ key=key^hashMasks[(row+4)%hashMasks.length][cell];
+ cell=(int)(hashCellMask&(key>>5));
+// cell=(int)(hashCellMask&(key>>hashBits));
+// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1));
+ }
+
+ return key^hashMasks[row][cell];
+ }
+
+ /**
+ * Creates a table of random XOR masks for hashing.
+ * The random seed is the current value of the static counter, which is then incremented
+ * under the class lock, so successive calls use seeds 0, 1, 2, ...
+ * @param rows Number of mask rows.
+ * @param cols Number of masks per row; fillMasks asserts this equals 1<<hashBits.
+ * @return The filled rows x cols mask table.
+ */
+ private static long[][] makeMasks(int rows, int cols) {
+
+ long seed;
+ synchronized(KCountArray8MT.class){
+ seed=counter;
+ counter++;
+ }
+
+ Timer t=new Timer();
+ long[][] r=new long[rows][cols];
+ Random randy=new Random(seed);
+ for(int i=0; i<r.length; i++){
+ fillMasks(r[i], randy);
+ }
+ t.stop();
+ //Only report unusually slow mask creation (threshold presumably in nanoseconds - confirm against dna.Timer).
+ if(t.elapsed>200000000L){System.out.println("Mask-creation time: "+t);}
+ return r;
+ }
+
+
+ /**
+ * Fills one row of the mask table in place.
+ * Each mask is built so that, before the sign bit is cleared, its low 32 bits and its high
+ * 32 bits each contain exactly 16 set bits. A candidate is rejected and redrawn if its lowest
+ * hashBits bits, or the next hashBits bits, match those of a mask already placed in this row.
+ * @param r Row to fill; its length must be 1<<hashBits.
+ * @param randy Random source; the order of calls on it determines the masks.
+ */
+ private static void fillMasks(long[] r, Random randy) {
+// for(int i=0; i<r.length; i++){
+// long x=0;
+// while(Long.bitCount(x&0xFFFFFFFF)!=16){
+// x=randy.nextLong();
+// }
+// r[i]=(x&Long.MAX_VALUE);
+// }
+
+ final int hlen=(1<<hashBits);
+ assert(r.length==hlen);
+ //count1/count2 record which low-field and second-field values are already taken in this row.
+ int[] count1=new int[hlen];
+ int[] count2=new int[hlen];
+ final long mask=hlen-1;
+
+ for(int i=0; i<r.length; i++){
+ long x=0;
+ int y=0;
+ int z=0;
+ //x==0 has a bit count of 0, so this loop runs at least once and repeats after every rejection.
+ while(Long.bitCount(x&0xFFFFFFFFL)!=16){
+ x=randy.nextLong();
+ //Randomly set or clear bits until each 32-bit half has exactly 16 ones.
+ while(Long.bitCount(x&0xFFFFFFFFL)<16){
+ x|=(1L<<randy.nextInt(32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFFL)>16){
+ x&=(~(1L<<randy.nextInt(32)));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)<16){
+ x|=(1L<<(randy.nextInt(32)+32));
+ }
+ while(Long.bitCount(x&0xFFFFFFFF00000000L)>16){
+ x&=(~(1L<<(randy.nextInt(32)+32)));
+ }
+
+// System.out.print(".");
+// y=(((int)(x&mask))^i);
+ y=(((int)(x&mask)));
+ z=(int)((x>>hashBits)&mask);
+ //Reject the candidate if either field collides with an earlier mask in this row.
+ if(count1[y]>0 || count2[z]>0){
+ x=0;
+ }
+ }
+// System.out.println(Long.toBinaryString(x));
+ //Store with the sign bit cleared.
+ r[i]=(x&Long.MAX_VALUE);
+ count1[y]++;
+ count2[z]++;
+ }
+
+ }
+
+
+ /**
+ * Creates and starts one WriteThread per sub-array. Each thread allocates its own row of
+ * matrix when it runs, so this must be called before any increments are queued.
+ */
+ public void initialize(){
+ for(int i=0; i<writers.length; i++){
+ writers[i]=new WriteThread(i);
+ writers[i].start();
+
+// while(!writers[i].isAlive()){
+// System.out.print(".");
+// }
+ }
+ }
+
+ /**
+ * Flushes all pending increments and stops the writer threads.
+ * Partially filled buffers are trimmed and queued, a poison marker is queued for every
+ * writer, and the call blocks until each writer has died; their cellsUsedPersonal counts are
+ * then added into cellsUsed. Only the first call does any work.
+ * Because the buffers are set to null here, increments must not be issued afterwards.
+ */
+ public void shutdown(){
+ if(finished){return;}
+ synchronized(this){
+ if(finished){return;}
+
+ //Clear buffers
+ for(int i=0; i<numArrays; i++){
+ long[] array=buffers[i];
+ int len=bufferlen[i];
+ buffers[i]=null;
+ bufferlen[i]=0;
+
+ //Trim so the writer does not process unused trailing slots.
+ if(len<array.length){
+ array=Arrays.copyOf(array, len);
+ }
+
+ if(array.length>0){
+ writers[i].add(array);
+ }
+ }
+
+ //Add poison
+ for(WriteThread wt : writers){
+ wt.add(poison);
+ }
+
+ //Wait for termination
+ for(WriteThread wt : writers){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ while(wt.isAlive()){
+// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive());
+ try {
+ //Wake up every 10 seconds to report a writer that has not finished yet.
+ wt.join(10000);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");}
+ }
+ cellsUsed+=wt.cellsUsedPersonal;
+// System.out.println("cellsUsed="+cellsUsed);
+ }
+
+ assert(!finished);
+ finished=true;
+ }
+ }
+
+ /**
+ * Worker that owns one sub-array. It allocates that sub-array, then applies batches of
+ * hashed keys taken from its queue until it receives the poison marker.
+ */
+ private class WriteThread extends Thread{
+
+ /** @param tnum Index of the sub-array (row of matrix) this thread owns. */
+ public WriteThread(int tnum){
+ num=tnum;
+ }
+
+ /** Allocates this thread's sub-array, then drains writeQueue until poison arrives. */
+ @Override
+ public void run(){
+ assert(matrix[num]==null);
+ array=new int[wordsPerArray]; //Makes NUMA systems use local memory.
+
+ matrix[num]=array;
+
+ long[] keys=null;
+ while(!shutdown){
+
+ if(verbose){System.err.println(" - Reading keys for wt"+num+".");}
+ //Retry the blocking take if it is interrupted.
+ while(keys==null){
+ try {
+ keys=writeQueue.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ //Poison is recognized by identity, not by content.
+ if(keys==poison){
+// assert(false);
+ shutdown=true;
+ }else{
+ for(long key : keys){
+ incrementHashedLocal(key);
+ }
+ }
+// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length);
+ if(verbose){System.err.println(" -- Read keys for wt"+num+". (success)");}
+ keys=null;
+ if(verbose){System.err.println("shutdown="+shutdown);}
+ }
+
+// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+".");
+// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+".";
+
+ //Drops only this thread's reference; matrix[num] still points at the data.
+ array=null;
+ }
+
+ /**
+ * Queues a batch of hashed keys for this writer, blocking while the queue is full and
+ * retrying if interrupted. Silently returns if the writer has already shut down.
+ * NOTE(review): shutdown is a plain (non-volatile) field written by the writer thread and
+ * read here by the caller's thread - confirm nothing depends on its visibility.
+ */
+ private void add(long[] keys){
+// assert(isAlive());
+ assert(!shutdown);
+ if(shutdown){return;}
+// assert(keys!=poison);
+ if(verbose){System.err.println(" + Adding keys to wt"+num+".");}
+ boolean success=false;
+ while(!success){
+ try {
+ writeQueue.put(keys);
+ success=true;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");}
+ }
+
+ /**
+ * Adds one to the cell addressed by an already-hashed key, saturating at maxValue, and
+ * bumps cellsUsedPersonal when the cell was previously zero.
+ * @param key Hashed key whose low (arrayMask) bits must equal this thread's number.
+ * @return The cell's value after the increment.
+ */
+ private int incrementHashedLocal(long key){
+ assert((key&arrayMask)==num);
+ key=(key>>>arrayBits)%(cellMod);
+// key=(key>>>(arrayBits+1))%(cellMod);
+ int index=(int)(key>>>indexShift);
+ int word=array[index];
+ int cellShift=(int)(cellBits*key);
+ int value=((word>>>cellShift)&valueMask);
+ if(value==0){cellsUsedPersonal++;}
+ value=min(value+1, maxValue);
+ //Clear the cell's bits in the word, then drop the new value in.
+ word=(value<<cellShift)|(word&~((valueMask)<<cellShift));
+ array[index]=word;
+ return value;
+ }
+
+ private int[] array;
+ private final int num;
+ public long cellsUsedPersonal=0;
+
+ //Bounded queue: producers block in add() once 16 batches are pending.
+ public ArrayBlockingQueue<long[]> writeQueue=new ArrayBlockingQueue<long[]>(16);
+ public boolean shutdown=false;
+
+ }
+
+
+ /** Returns the stored cellsUsed field, which shutdown() fills in from the writer threads' counts. */
+ public long cellsUsed(){return cellsUsed;}
+
+ private boolean finished=false;
+
+ private long cellsUsed;
+ private final int[][] matrix;
+ private final WriteThread[] writers=new WriteThread[numArrays];
+ private final int hashes;
+ private final int wordsPerArray;
+ private final long cellsPerArray;
+ private final long cellMod;
+ private final long[][] hashMasks=makeMasks(8, hashArrayLength);
+
+ private final long[][] buffers=new long[numArrays][500];
+ private final int[] bufferlen=new int[numArrays];
+
+ public final KCountArray prefilter;
+
+ private static final int hashBits=6;
+ private static final int hashArrayLength=1<<hashBits;
+ private static final int hashCellMask=hashArrayLength-1;
+ private static final long[] poison=new long[0];
+
+ private static long counter=0;
+
+}
diff --git a/current/bloom/KmerCount3.java b/current/bloom/KmerCount3.java
new file mode 100755
index 0000000..10cd7d2
--- /dev/null
+++ b/current/bloom/KmerCount3.java
@@ -0,0 +1,173 @@
+package bloom;
+
+import java.util.ArrayList;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class KmerCount3 extends KmerCountAbstract {
+
+	/**
+	 * Command-line entry point: counts kmers with {@link #countFastq} and prints
+	 * a histogram of how many cells hold each count.
+	 *
+	 * Arguments: args[0] is the first reads file; args[1] is taken as a second
+	 * reads file when there are more than three arguments or it contains a '.';
+	 * the last two arguments are k and cbits.
+	 *
+	 * @param args command-line arguments (at least three)
+	 */
+	public static void main(String[] args){
+		if(args.length<3){
+			throw new RuntimeException("Usage: KmerCount3 <reads1> [reads2] <k> <cbits>");
+		}
+
+		Timer t=new Timer();
+
+		String fname1=args[0];
+		String fname2=(args.length>3 || args[1].contains(".") ? args[1] : null);
+		int k=Integer.parseInt(args[args.length-2]);
+		int cbits=Integer.parseInt(args[args.length-1]);
+
+		KCountArray2 count=null;
+
+		if(fileIO.FileFormat.hasFastaExtension(fname1)){
+			FastaReadInputStream.TARGET_READ_LEN=300000000;
+			FastaReadInputStream.MIN_READ_LEN=k;
+		}
+		count=countFastq(fname1, fname2, k, cbits);
+
+
+		t.stop();
+		System.out.println("Finished counting; time = "+t);
+
+		long[] freq=count.transformToFrequency();
+
+		long sum=sum(freq);
+		System.out.println("Kmer fraction:");
+		//Counts 0..7 get one row each; higher counts are grouped into doubling ranges.
+		int lim1=8, lim2=min(16, freq.length);
+		for(int i=0; i<lim1 && i<freq.length; i++){
+			String prefix=i+"";
+			while(prefix.length()<8){prefix=prefix+" ";}
+			System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+		}
+		//Strictly less-than: when lim1 reaches freq.length there is nothing left to print.
+		while(lim1<freq.length){
+			long x=0; //long, because freq holds long counts and += into an int would silently truncate
+			for(int i=lim1; i<lim2; i++){
+				x+=freq[i];
+			}
+			String prefix=lim1+"-"+(lim2-1);
+			if(lim2>=freq.length){prefix=lim1+"+";}
+			while(prefix.length()<8){prefix=prefix+" ";}
+			System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x);
+			lim1*=2;
+			lim2=min(lim2*2, freq.length);
+		}
+	}
+
+ public static KCountArray2 countFastq(String reads1, String reads2, int k, int cbits){
+ assert(k>=1 && k<20);
+ final int kbits=2*k;
+ final long mask=~((-1L)<<(kbits));
+ final long cells=mask+1;
+ if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+ final KCountArray2 count=new KCountArray2(cells, cbits);
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+ cris.start(); //4567
+ }
+
+ assert(cris!=null) : reads1;
+ System.err.println("Started cris");
+ boolean paired=cris.paired();
+ System.err.println("Paired: "+paired);
+
+ long kmer=0; //current kmer
+ int len=0; //distance since last contig start or ambiguous base
+
+ {
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert(paired==(r.mate!=null));
+ }
+
+ while(reads!=null && reads.size()>0){
+ //System.err.println("reads.size()="+reads.size());
+ for(Read r : reads){
+ readsProcessed++;
+
+ len=0;
+ kmer=0;
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0 || quals[i]<minQuality){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+ if(len>=k){
+// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+ count.increment(kmer, 1);
+// System.out.println(" -> "+count.read(kmer));
+// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+// array[(int)kmer]++;
+// System.out.println(" -> "+array[(int)kmer]+"\n");
+// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+
+ if(r.mate!=null){
+ len=0;
+ kmer=0;
+ bases=r.mate.bases;
+ quals=r.mate.quality;
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0 || quals[i]<minQuality){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+ if(len>=k){
+ count.increment(kmer, 1);
+ }
+ }
+ }
+ }
+
+ }
+ //System.err.println("returning list");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ System.err.println("Finished reading");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ System.err.println("Returned list");
+ ReadWrite.closeStream(cris);
+ System.err.println("Closed stream");
+ System.err.println("Processed "+readsProcessed+" reads.");
+ }
+
+ return count;
+ }
+
+}
diff --git a/current/bloom/KmerCount4.java b/current/bloom/KmerCount4.java
new file mode 100755
index 0000000..e5c9d8b
--- /dev/null
+++ b/current/bloom/KmerCount4.java
@@ -0,0 +1,359 @@
+package bloom;
+
+import java.util.ArrayList;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class KmerCount4 extends KmerCountAbstract {
+
+	/**
+	 * Command-line entry point: counts kmers (optionally split around a gap)
+	 * and prints statistics.
+	 *
+	 * args[0] is the first reads file. args[1] is treated as a second reads
+	 * file only if it is present, is not a key=value parameter, and either
+	 * contains a '.' or there are more than three arguments. Remaining
+	 * arguments are key=value pairs: k/kmer, cbits/cellbits, gap.
+	 *
+	 * @param args command-line arguments
+	 */
+	public static void main(String[] args){
+
+		Timer t=new Timer();
+
+		String fname1=args[0];
+		//Guard args[1] so a single-argument call works, and never mistake "k=14" for a file name.
+		final boolean hasSecondFile=(args.length>1 && !args[1].contains("=") && (args.length>3 || args[1].contains(".")));
+		String fname2=(hasSecondFile ? args[1] : null);
+		int k=14;
+		int cbits=16;
+		int gap=0;
+
+		for(int i=(fname2==null ? 1 : 2); i<args.length; i++){
+			final String arg=args[i];
+			final String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=(split.length>1 ? split[1] : "true");
+
+			if(a.equals("k") || a.equals("kmer")){
+				k=Integer.parseInt(b);
+			}else if(a.startsWith("cbits") || a.startsWith("cellbits")){
+				cbits=Integer.parseInt(b);
+			}else if(a.startsWith("gap")){
+				gap=Integer.parseInt(b);
+			}else{
+				throw new RuntimeException("Unknown parameter "+args[i]);
+			}
+		}
+
+		KCountArray2 count=null;
+
+		if(fileIO.FileFormat.hasFastaExtension(fname1)){
+			assert(!FastaReadInputStream.SPLIT_READS);
+			FastaReadInputStream.MIN_READ_LEN=k;
+		}
+
+		if(gap==0){
+			count=count(fname1, fname2, k, cbits, true);
+		}else{
+			//Split k into a left half of (k+1)/2 bases and a right half of k/2 bases.
+			count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null);
+		}
+
+
+		t.stop();
+		System.out.println("Finished counting; time = "+t);
+
+		printStatistics(count);
+
+	}
+
+	/**
+	 * Prints a histogram of cell counts followed by summary lines (keys
+	 * counted, unique, average sites per key, singleton and useful fractions).
+	 *
+	 * @param count the count array whose transformToFrequency() result is reported
+	 */
+	public static void printStatistics(KCountArray2 count){
+		long[] freq=count.transformToFrequency();
+
+		long sum=sum(freq);
+		System.out.println("Kmer fraction:");
+		//Counts 0..7 get one row each; higher counts are grouped into doubling ranges.
+		int lim1=8, lim2=min(16, freq.length);
+		for(int i=0; i<lim1 && i<freq.length; i++){
+			String prefix=i+"";
+			while(prefix.length()<8){prefix=prefix+" ";}
+			System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+		}
+		//Strictly less-than: when lim1 reaches freq.length there is nothing left to print.
+		while(lim1<freq.length){
+			long x=0; //long, because freq holds long counts and += into an int would silently truncate
+			for(int i=lim1; i<lim2; i++){
+				x+=freq[i];
+			}
+			String prefix=lim1+"-"+(lim2-1);
+			if(lim2>=freq.length){prefix=lim1+"+";}
+			while(prefix.length()<8){prefix=prefix+" ";}
+			System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x);
+			lim1*=2;
+			lim2=min(lim2*2, freq.length);
+		}
+
+		long sum2=sum-freq[0]; //cells with a nonzero count
+		long x=freq[1];
+		System.out.println();
+		System.out.println("Keys Counted:  \t         \t"+keysCounted);
+		System.out.println("Unique:        \t         \t"+sum2);
+		System.out.println("Avg Sites/Key: \t         \t"+String.format("%.3f    ",(keysCounted*1d/sum2)));
+		System.out.println();
+		System.out.println("Singleton:     \t"+String.format("%.3f%%   ",(100l*x/(double)sum2))+"\t"+x);
+		x=sum2-x;
+		System.out.println("Useful:        \t"+String.format("%.3f%%   ",(100l*x/(double)sum2))+"\t"+x);
+	}
+
+	/**
+	 * Convenience overload that counts into a freshly allocated array by
+	 * delegating to the six-argument count with a null array.
+	 */
+	public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp){
+		final KCountArray2 fresh=null;
+		return count(reads1, reads2, k, cbits, rcomp, fresh);
+	}
+
+	/**
+	 * Counts the kmers of every read (and mate) in the given files into count,
+	 * allocating a new KCountArray2 with 4^k cells when count is null.
+	 *
+	 * @param reads1 first reads file
+	 * @param reads2 second reads file, or null
+	 * @param k kmer length; asserted to be in [1, 19]
+	 * @param cbits bits per cell, used only when a new array is allocated
+	 * @param rcomp passed to addRead; when true each read is also counted after reverse-complementing
+	 * @param count array to add to, or null to create one
+	 * @return the array that was filled
+	 */
+	public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count){
+		assert(k>=1 && k<20);
+		final int kbits=2*k;
+		final long mask=~((-1L)<<(kbits));
+
+		if(count==null){
+			final long cells=1L<<kbits;
+			if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+			count=new KCountArray2(cells, cbits);
+		}
+
+		final ConcurrentReadInputStream cris;
+		{
+			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+			cris.start(); //4567
+		}
+
+		assert(cris!=null) : reads1;
+		System.err.println("Started cris");
+		boolean paired=cris.paired();
+		if(verbose){System.err.println("Paired: "+paired);}
+
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+		if(reads!=null && !reads.isEmpty()){
+			Read r=reads.get(0);
+			assert(paired==(r.mate!=null));
+		}
+
+		while(reads!=null && reads.size()>0){
+			for(Read r : reads){
+				readsProcessed++;
+
+				addRead(r, count, k, mask, rcomp);
+				if(r.mate!=null){
+					addRead(r.mate, count, k, mask, rcomp);
+				}
+
+			}
+			cris.returnList(ln.id, ln.list.isEmpty());
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+		if(verbose){System.err.println("Finished reading");}
+		//nextList() may have returned null (or a list-less ListNum); only return what we actually hold.
+		if(ln!=null){
+			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+		}
+		if(verbose){System.err.println("Returned list");}
+		cris.close();
+		if(verbose){System.err.println("Closed stream");}
+		if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
+
+
+		return count;
+	}
+
+	/**
+	 * Counts gapped (split) kmers: a left part of k1 bases and a right part of
+	 * k2 bases separated by gap bases, packed into one key by addReadSplit.
+	 * Allocates a new KCountArray2 with 4^(k1+k2) cells when count is null.
+	 *
+	 * @param reads1 first reads file
+	 * @param reads2 second reads file, or null
+	 * @param k1 length of the left part
+	 * @param k2 length of the right part
+	 * @param gap number of bases between the two parts; asserted non-negative
+	 * @param cbits bits per cell, used only when a new array is allocated
+	 * @param rcomp passed to addReadSplit
+	 * @param count array to add to (its gap must equal gap), or null to create one
+	 * @return the array that was filled
+	 */
+	public static KCountArray2 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count){
+		assert(k1+k2>=1 && k1+k2<20);
+		assert(gap>=0);
+		final int kbits1=2*k1;
+		final int kbits2=2*k2;
+		final long mask1=~((-1L)<<(kbits1));
+		final long mask2=~((-1L)<<(kbits2));
+
+		if(count==null){
+			final long cells=1L<<(kbits1+kbits2);
+			if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+
+					", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));}
+			count=new KCountArray2(cells, cbits, gap);
+		}
+		assert(count.gap==gap);
+
+		final ConcurrentReadInputStream cris;
+		{
+			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+			cris.start(); //4567
+		}
+
+		assert(cris!=null) : reads1;
+		System.err.println("Started cris");
+		boolean paired=cris.paired();
+		if(verbose){System.err.println("Paired: "+paired);}
+
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+		if(reads!=null && !reads.isEmpty()){
+			Read r=reads.get(0);
+			assert(paired==(r.mate!=null));
+		}
+
+		while(reads!=null && reads.size()>0){
+			for(Read r : reads){
+				readsProcessed++;
+
+				addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
+				if(r.mate!=null){
+					addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
+				}
+
+			}
+			cris.returnList(ln.id, ln.list.isEmpty());
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+		if(verbose){System.err.println("Finished reading");}
+		//nextList() may have returned null (or a list-less ListNum); only return what we actually hold.
+		if(ln!=null){
+			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+		}
+		if(verbose){System.err.println("Returned list");}
+		cris.close();
+		if(verbose){System.err.println("Closed stream");}
+		if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
+
+
+		return count;
+	}
+
+	/**
+	 * Adds every kmer of a read to count. A window is restarted at any base
+	 * whose baseToNumber value is negative or whose quality (when qualities
+	 * exist) is below minQuality. keysCounted is incremented per kmer added.
+	 * When rcomp is true the read is reverse-complemented via
+	 * r.reverseComplement() and counted a second time; this method does not
+	 * reverse it back. NOTE(review): assumes reverseComplement() changes r in
+	 * place - confirm.
+	 *
+	 * @param r the read to count
+	 * @param count destination array
+	 * @param k kmer length
+	 * @param mask mask keeping the low 2*k bits of the rolling kmer
+	 * @param rcomp whether to also count the reverse complement
+	 */
+	public static void addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp){
+		final byte[] seq=r.bases;
+		final byte[] qual=r.quality;
+		long rolling=0;
+		int run=0;
+		for(int pos=0; pos<seq.length; pos++){
+			final int code=AminoAcid.baseToNumber[seq[pos]];
+			final boolean usable=(code>=0 && (qual==null || qual[pos]>=minQuality));
+			if(!usable){
+				run=0;
+				rolling=0;
+				continue;
+			}
+			rolling=((rolling<<2)|code)&mask;
+			run++;
+			if(run>=k){
+				keysCounted++;
+				count.increment(rolling, 1);
+			}
+		}
+		if(rcomp){
+			r.reverseComplement();
+			addRead(r, count, k, mask, false);
+		}
+	}
+
+ public static void addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+ int len=0;
+ int shift=k2*2;
+ long kmer1=0;
+ long kmer2=0;
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+
+ assert(kmer1>=kmer2);
+
+// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
+
+ for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
+ int x1=AminoAcid.baseToNumber[bases[i]];
+ int x2=AminoAcid.baseToNumber[bases[j]];
+ if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
+ len=0;
+ kmer1=0;
+ kmer2=0;
+ }else{
+ kmer1=((kmer1<<2)|x1)&mask1;
+ kmer2=((kmer2<<2)|x2)&mask2;
+ len++;
+ if(len>=k1){
+ keysCounted++;
+// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+
+ long key=(kmer1<<shift)|kmer2;
+// System.err.println(Long.toHexString(key));
+ count.increment(key, 1);
+// System.out.println(" -> "+count.read(kmer));
+// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+// array[(int)kmer]++;
+// System.out.println(" -> "+array[(int)kmer]+"\n");
+// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+ if(rcomp){
+ r.reverseComplement();
+ addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
+ }
+ }
+
+ public static void addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+ int len=0;
+ int shift=k2*2;
+ long kmer1=0;
+ long kmer2=0;
+ byte[] quals=null;
+
+ assert(kmer1>=kmer2);
+
+// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
+
+ for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
+ int x1=AminoAcid.baseToNumber[bases[i]];
+ int x2=AminoAcid.baseToNumber[bases[j]];
+ if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
+ len=0;
+ kmer1=0;
+ kmer2=0;
+ }else{
+ kmer1=((kmer1<<2)|x1)&mask1;
+ kmer2=((kmer2<<2)|x2)&mask2;
+ len++;
+ if(len>=k1){
+ keysCounted++;
+// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+
+ long key=(kmer1<<shift)|kmer2;
+ System.out.println(Long.toHexString(kmer1));
+ System.out.println(Long.toHexString(kmer2));
+ System.out.println(Long.toHexString(key));
+ count.increment(key, 1);
+// System.out.println(" -> "+count.read(kmer));
+// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+// array[(int)kmer]++;
+// System.out.println(" -> "+array[(int)kmer]+"\n");
+// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+ if(rcomp){
+ AminoAcid.reverseComplementBasesInPlace(bases);
+ addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
+ }
+ }
+
+}
diff --git a/current/bloom/KmerCount5.java b/current/bloom/KmerCount5.java
new file mode 100755
index 0000000..0f68e20
--- /dev/null
+++ b/current/bloom/KmerCount5.java
@@ -0,0 +1,462 @@
+package bloom;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+
+import jgi.ErrorCorrect;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class KmerCount5 extends KmerCountAbstract {
+
+	/**
+	 * Command-line entry point. args[0] is the first reads file, args[1] (if
+	 * present) the second; later arguments are key=value pairs for
+	 * k/kmer, cbits/cellbits and gap. Counts kmers and prints statistics.
+	 *
+	 * @param args command-line arguments
+	 */
+	public static void main(String[] args){
+
+		final Timer timer=new Timer();
+
+		final String in1=args[0];
+		final String in2;
+		if(args.length>1){
+			in2=args[1];
+		}else{
+			in2=null;
+		}
+		int k=14, cbits=16, gap=0;
+
+		for(int idx=2; idx<args.length; idx++){
+			final String[] parts=args[idx].split("=");
+			final String name=parts[0].toLowerCase();
+			final String value=(parts.length>1 ? parts[1] : "true");
+
+			final boolean isK=(name.equals("k") || name.equals("kmer"));
+			final boolean isCbits=(name.startsWith("cbits") || name.startsWith("cellbits"));
+			if(isK){
+				k=Integer.parseInt(value);
+			}else if(isCbits){
+				cbits=Integer.parseInt(value);
+			}else if(name.startsWith("gap")){
+				gap=Integer.parseInt(value);
+			}else{
+				throw new RuntimeException("Unknown parameter "+args[idx]);
+			}
+		}
+
+		if(fileIO.FileFormat.hasFastaExtension(in1)){
+			assert(!FastaReadInputStream.SPLIT_READS);
+			FastaReadInputStream.MIN_READ_LEN=k;
+		}
+
+		//Ungapped counting when gap is 0; otherwise split k into (k+1)/2 and k/2.
+		final KCountArray result=(gap==0 ? count(in1, in2, k, cbits, true) :
+			countFastqSplit(in1, in2, (k+1)/2, k/2, gap, cbits, true, null));
+
+		timer.stop();
+		System.out.println("Finished counting; time = "+timer);
+
+		printStatistics(result);
+
+	}
+
+	/**
+	 * Prints a histogram of cell counts followed by summary lines (keys
+	 * counted, unique, average sites per key, singleton and useful fractions).
+	 *
+	 * @param count the count array whose transformToFrequency() result is reported
+	 */
+	public static void printStatistics(KCountArray count){
+		long[] freq=count.transformToFrequency();
+
+		long sum=sum(freq);
+		System.out.println("Kmer fraction:");
+		//Counts 0..7 get one row each; higher counts are grouped into doubling ranges.
+		int lim1=8, lim2=min(16, freq.length);
+		for(int i=0; i<lim1 && i<freq.length; i++){
+			String prefix=i+"";
+			while(prefix.length()<8){prefix=prefix+" ";}
+			System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+		}
+		//Strictly less-than: when lim1 reaches freq.length there is nothing left to print.
+		while(lim1<freq.length){
+			long x=0; //long, because freq holds long counts and += into an int would silently truncate
+			for(int i=lim1; i<lim2; i++){
+				x+=freq[i];
+			}
+			String prefix=lim1+"-"+(lim2-1);
+			if(lim2>=freq.length){prefix=lim1+"+";}
+			while(prefix.length()<8){prefix=prefix+" ";}
+			System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x);
+			lim1*=2;
+			lim2=min(lim2*2, freq.length);
+		}
+
+		long sum2=sum-freq[0]; //cells with a nonzero count
+		long x=freq[1];
+		System.out.println();
+		System.out.println("Keys Counted:  \t         \t"+keysCounted);
+		System.out.println("Unique:        \t         \t"+sum2);
+		System.out.println("Avg Sites/Key: \t         \t"+String.format("%.3f    ",(keysCounted*1d/sum2)));
+		System.out.println();
+		System.out.println("Singleton:     \t"+String.format("%.3f%%   ",(100l*x/(double)sum2))+"\t"+x);
+		x=sum2-x;
+		System.out.println("Useful:        \t"+String.format("%.3f%%   ",(100l*x/(double)sum2))+"\t"+x);
+	}
+
+	/**
+	 * Convenience overload that counts into a freshly allocated array by
+	 * delegating to the six-argument count with a null array.
+	 */
+	public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp){
+		final KCountArray fresh=null;
+		return count(reads1, reads2, k, cbits, rcomp, fresh);
+	}
+
+	/**
+	 * Counts the kmers of every read (and mate) in the given files into count,
+	 * creating one with KCountArray.makeNew(4^k cells, cbits, 0) when count is null.
+	 *
+	 * @param reads1 first reads file
+	 * @param reads2 second reads file, or null
+	 * @param k kmer length; asserted in [1, 31], and below 20 when a new array must be allocated
+	 * @param cbits bits per cell, used only when a new array is allocated
+	 * @param rcomp passed to addRead; when true each read is also counted after reverse-complementing
+	 * @param count array to add to, or null to create one
+	 * @return the array that was filled
+	 */
+	public static KCountArray count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray count){
+		assert(k<32 && k>=1 && (count!=null || k<20));
+		final int kbits=2*k;
+		final long mask=~((-1L)<<(kbits));
+
+		if(count==null){
+			final long cells=1L<<kbits;
+			if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+			count=KCountArray.makeNew(cells, cbits, 0);
+		}
+
+		final ConcurrentReadInputStream cris;
+		{
+			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+			cris.start(); //4567
+		}
+
+		assert(cris!=null) : reads1;
+		System.err.println("Started cris");
+		boolean paired=cris.paired();
+		if(verbose){System.err.println("Paired: "+paired);}
+
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+		if(reads!=null && !reads.isEmpty()){
+			Read r=reads.get(0);
+			assert(paired==(r.mate!=null));
+		}
+
+		while(reads!=null && reads.size()>0){
+			for(Read r : reads){
+				readsProcessed++;
+
+				addRead(r, count, k, mask, rcomp);
+				if(r.mate!=null){
+					addRead(r.mate, count, k, mask, rcomp);
+				}
+
+			}
+			cris.returnList(ln.id, ln.list.isEmpty());
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+		if(verbose){System.err.println("Finished reading");}
+		//nextList() may have returned null (or a list-less ListNum); only return what we actually hold.
+		if(ln!=null){
+			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+		}
+		if(verbose){System.err.println("Returned list");}
+		cris.close();
+		if(verbose){System.err.println("Closed stream");}
+		if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
+
+
+		return count;
+	}
+
+
+
+
+	/**
+	 * Counts kmers into counts, optionally masking bases first. When trusted
+	 * is non-null, each read (and mate) is passed to ErrorCorrect
+	 * (detectErrorsBulk if conservative, otherwise detectTrusted) and every
+	 * position whose bit is clear in the returned BitSet is replaced by 'N'
+	 * before the read is counted.
+	 * Note that the maxReads parameter shadows the inherited static field, and
+	 * that this overload does not update readsProcessed.
+	 *
+	 * @param reads1 first reads file
+	 * @param reads2 second reads file, or null
+	 * @param k kmer length; asserted in [1, 31], and below 20 when a new array must be allocated
+	 * @param cbits bits per cell, used only when a new array is allocated
+	 * @param rcomp passed to addRead
+	 * @param counts array to add to, or null to create one
+	 * @param trusted array handed to ErrorCorrect for masking, or null to skip masking
+	 * @param maxReads passed to the read input stream
+	 * @param thresh passed to ErrorCorrect
+	 * @param detectStepsize passed to ErrorCorrect
+	 * @param conservative selects detectErrorsBulk over detectTrusted
+	 * @return the array that was filled
+	 */
+	public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp,
+			KCountArray counts, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){
+
+		assert(k<32 && k>=1 && (counts!=null || k<20));
+		final int kbits=2*k;
+		final long mask=~((-1L)<<(kbits));
+
+		if(counts==null){
+			final long cells=1L<<kbits;
+			if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+			counts=KCountArray.makeNew(cells, cbits, 0);
+		}
+
+		final ConcurrentReadInputStream cris;
+		{
+			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+			cris.start(); //4567
+		}
+
+		assert(cris!=null) : reads1;
+		System.err.println("Started cris");
+		boolean paired=cris.paired();
+		if(verbose){System.err.println("Paired: "+paired);}
+
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+		if(reads!=null && !reads.isEmpty()){
+			Read r=reads.get(0);
+			assert(paired==(r.mate!=null));
+		}
+
+
+		while(reads!=null && reads.size()>0){
+			for(Read r : reads){
+
+				//Capture the mate before r is processed, as the original code did.
+				Read r2=r.mate;
+				if(trusted!=null){
+					maskUntrustedBases(r, trusted, k, thresh, detectStepsize, conservative);
+				}
+				KmerCount5.addRead(r, counts, k, mask, rcomp);
+
+				if(r2!=null){
+					if(trusted!=null){
+						maskUntrustedBases(r2, trusted, k, thresh, detectStepsize, conservative);
+					}
+					KmerCount5.addRead(r2, counts, k, mask, rcomp);
+				}
+
+			}
+			cris.returnList(ln.id, ln.list.isEmpty());
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+
+		if(verbose){System.err.println("Finished reading");}
+		//nextList() may have returned null (or a list-less ListNum); only return what we actually hold.
+		if(ln!=null){
+			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+		}
+		if(verbose){System.err.println("Returned list");}
+		cris.close();
+		if(verbose){System.err.println("Closed stream");}
+
+		return counts;
+	}
+
+	/**
+	 * Overwrites with 'N' every position of r whose bit is clear in the BitSet
+	 * returned by ErrorCorrect, and zeroes the matching quality when the read
+	 * has qualities (quality may be null, as addRead already allows).
+	 */
+	private static void maskUntrustedBases(final Read r, final KCountArray trusted, final int k, final int thresh, final int detectStepsize, final boolean conservative){
+		final BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
+			ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
+		final byte[] quals=r.quality;
+		for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
+			r.bases[i]='N';
+			if(quals!=null){quals[i]=0;}
+		}
+	}
+
+
+	/**
+	 * Counts gapped (split) kmers: a left part of k1 bases and a right part of
+	 * k2 bases separated by gap bases, packed into one key by addReadSplit.
+	 * Creates the array with KCountArray.makeNew(4^(k1+k2) cells, cbits, gap)
+	 * when counts is null.
+	 *
+	 * @param reads1 first reads file
+	 * @param reads2 second reads file, or null
+	 * @param k1 length of the left part
+	 * @param k2 length of the right part
+	 * @param gap number of bases between the two parts; asserted non-negative
+	 * @param cbits bits per cell, used only when a new array is allocated
+	 * @param rcomp passed to addReadSplit
+	 * @param counts array to add to (its gap must equal gap), or null to create one
+	 * @return the array that was filled
+	 */
+	public static KCountArray countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray counts){
+		int k=k1+k2;
+		assert(k<32 && k>=1 && (counts!=null || k<20));
+		assert(gap>=0);
+		final int kbits1=2*k1;
+		final int kbits2=2*k2;
+		final long mask1=~((-1L)<<(kbits1));
+		final long mask2=~((-1L)<<(kbits2));
+
+		if(counts==null){
+			final long cells=1L<<(kbits1+kbits2);
+			if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+
+					", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));}
+			counts=KCountArray.makeNew(cells, cbits, gap);
+		}
+		assert(counts.gap==gap);
+
+		final ConcurrentReadInputStream cris;
+		{
+			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+			cris.start(); //4567
+		}
+
+		assert(cris!=null) : reads1;
+		System.err.println("Started cris");
+		boolean paired=cris.paired();
+		if(verbose){System.err.println("Paired: "+paired);}
+
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+		if(reads!=null && !reads.isEmpty()){
+			Read r=reads.get(0);
+			assert(paired==(r.mate!=null));
+		}
+
+		while(reads!=null && reads.size()>0){
+			for(Read r : reads){
+				readsProcessed++;
+
+				addReadSplit(r, counts, k1, k2, mask1, mask2, gap, rcomp);
+				if(r.mate!=null){
+					addReadSplit(r.mate, counts, k1, k2, mask1, mask2, gap, rcomp);
+				}
+
+			}
+			cris.returnList(ln.id, ln.list.isEmpty());
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+		if(verbose){System.err.println("Finished reading");}
+		//nextList() may have returned null (or a list-less ListNum); only return what we actually hold.
+		if(ln!=null){
+			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+		}
+		if(verbose){System.err.println("Returned list");}
+		cris.close();
+		if(verbose){System.err.println("Closed stream");}
+		if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
+
+
+		return counts;
+	}
+
+	/**
+	 * Adds every kmer of a read to count. A window is restarted at any base
+	 * whose baseToNumber value is negative or whose quality (when qualities
+	 * exist) is below minQuality. keysCounted is incremented per kmer added.
+	 * When rcomp is true the read is reverse-complemented via
+	 * r.reverseComplement() and counted a second time; this method does not
+	 * reverse it back. NOTE(review): assumes reverseComplement() changes r in
+	 * place - confirm.
+	 *
+	 * @param r the read to count
+	 * @param count destination array
+	 * @param k kmer length
+	 * @param mask mask keeping the low 2*k bits of the rolling kmer
+	 * @param rcomp whether to also count the reverse complement
+	 */
+	public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){
+		final byte[] seq=r.bases;
+		final byte[] qual=r.quality;
+		long rolling=0;
+		int run=0;
+		for(int pos=0; pos<seq.length; pos++){
+			final int code=AminoAcid.baseToNumber[seq[pos]];
+			final boolean usable=(code>=0 && (qual==null || qual[pos]>=minQuality));
+			if(!usable){
+				run=0;
+				rolling=0;
+				continue;
+			}
+			rolling=((rolling<<2)|code)&mask;
+			run++;
+			if(run>=k){
+				keysCounted++;
+				count.increment(rolling);
+			}
+		}
+		if(rcomp){
+			r.reverseComplement();
+			addRead(r, count, k, mask, false);
+		}
+	}
+
+ public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+ int len=0;
+ int shift=k2*2;
+ long kmer1=0;
+ long kmer2=0;
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+
+ assert(kmer1>=kmer2);
+
+ // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
+
+ for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
+ int x1=AminoAcid.baseToNumber[bases[i]];
+ int x2=AminoAcid.baseToNumber[bases[j]];
+ if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
+ len=0;
+ kmer1=0;
+ kmer2=0;
+ }else{
+ kmer1=((kmer1<<2)|x1)&mask1;
+ kmer2=((kmer2<<2)|x2)&mask2;
+ len++;
+ if(len>=k1){
+ keysCounted++;
+ // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+
+ long key=(kmer1<<shift)|kmer2;
+ // System.err.println(Long.toHexString(key));
+ count.increment(key);
+ // System.out.println(" -> "+count.read(kmer));
+ // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+ // array[(int)kmer]++;
+ // System.out.println(" -> "+array[(int)kmer]+"\n");
+ // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+ if(rcomp){
+ r.reverseComplement();
+ addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
+ }
+ }
+
+ public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+ int len=0;
+ int shift=k2*2;
+ long kmer1=0;
+ long kmer2=0;
+ byte[] quals=null;
+
+ assert(kmer1>=kmer2);
+
+ // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
+
+ for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
+ int x1=AminoAcid.baseToNumber[bases[i]];
+ int x2=AminoAcid.baseToNumber[bases[j]];
+ if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
+ len=0;
+ kmer1=0;
+ kmer2=0;
+ }else{
+ kmer1=((kmer1<<2)|x1)&mask1;
+ kmer2=((kmer2<<2)|x2)&mask2;
+ len++;
+ if(len>=k1){
+ keysCounted++;
+ // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+
+ long key=(kmer1<<shift)|kmer2;
+ System.out.println(Long.toHexString(kmer1));
+ System.out.println(Long.toHexString(kmer2));
+ System.out.println(Long.toHexString(key));
+ count.increment(key);
+ // System.out.println(" -> "+count.read(kmer));
+ // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+ // array[(int)kmer]++;
+ // System.out.println(" -> "+array[(int)kmer]+"\n");
+ // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+ if(rcomp){
+ AminoAcid.reverseComplementBasesInPlace(bases);
+ addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
+ }
+ }
+
+}
diff --git a/current/bloom/KmerCount6.java b/current/bloom/KmerCount6.java
new file mode 100755
index 0000000..62480e7
--- /dev/null
+++ b/current/bloom/KmerCount6.java
@@ -0,0 +1,437 @@
+package bloom;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+
+import jgi.ErrorCorrect;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class KmerCount6 extends KmerCountAbstract {
+
+ /**
+  * Command-line entry point: counts kmers of the input file(s) and prints statistics.
+  * args[0] is the first read file and args[1] (optional) the second; the remaining
+  * arguments are key=value pairs: k/kmer, cbits/cellbits, gap. Any other key throws.
+  */
+ public static void main(String[] args){
+
+     final Timer timer=new Timer();
+
+     final String in1=args[0];
+     final String in2=(args.length>1 ? args[1] : null);
+     int k=14, cbits=16, gap=0;
+
+     for(int idx=2; idx<args.length; idx++){
+         final String[] parts=args[idx].split("=");
+         final String key=parts[0].toLowerCase();
+         final String value=(parts.length>1 ? parts[1] : "true");
+
+         if(key.equals("k") || key.equals("kmer")){
+             k=Integer.parseInt(value);
+         }else if(key.startsWith("cbits") || key.startsWith("cellbits")){
+             cbits=Integer.parseInt(value);
+         }else if(key.startsWith("gap")){
+             gap=Integer.parseInt(value);
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[idx]);
+         }
+     }
+
+     // Fasta input: splitting must be off, and the minimum read length is set to k.
+     if(fileIO.FileFormat.hasFastaExtension(in1)){
+         assert(!FastaReadInputStream.SPLIT_READS);
+         FastaReadInputStream.MIN_READ_LEN=k;
+     }
+
+     // Passing null makes count() allocate the array itself.
+     final KCountArray counts=count(in1, in2, k, cbits, gap, true, null);
+
+     timer.stop();
+     System.out.println("Finished counting; time = "+timer);
+
+     printStatistics(counts);
+
+ }
+
+ /**
+  * Prints a histogram of cell values and summary statistics for a count array to stdout.
+  * The histogram comes from count.transformToFrequency(); index i of that array is reported
+  * as the row for value i (presumably the number of cells holding count i - confirm against
+  * KCountArray). Values 0-7 get one row each; higher values are pooled into doubling ranges
+  * (8-15, 16-31, ...), the last of which is labelled "N+".
+  *
+  * @param count Count array to summarise.
+  */
+ public static void printStatistics(KCountArray count){
+     final long[] freq=count.transformToFrequency();
+
+     final long sum=sum(freq);
+     System.out.println("Kmer fraction:");
+     int lim1=8, lim2=16;
+     // One row per value below lim1; bounded by freq.length so a short histogram cannot overrun.
+     for(int i=0; i<lim1 && i<freq.length; i++){
+         String prefix=i+"";
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+     }
+     lim2=Math.min(lim2, freq.length);
+     // Strict '<': once lim1 reaches freq.length the range [lim1, lim2) is empty,
+     // and '<=' used to print a meaningless extra "N+" row with a zero count.
+     while(lim1<freq.length){
+         long pooled=0; // long, because an int accumulator silently truncated large sums
+         for(int i=lim1; i<lim2; i++){
+             pooled+=freq[i];
+         }
+         String prefix=lim1+"-"+(lim2-1);
+         if(lim2>=freq.length){prefix=lim1+"+";}
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*pooled/(double)sum))+"\t"+pooled);
+         lim1*=2;
+         lim2=Math.min(lim2*2, freq.length);
+     }
+
+     // sum2 excludes index 0, i.e. cells that were never incremented.
+     final long sum2=sum-(freq.length>0 ? freq[0] : 0);
+     long x=(freq.length>1 ? freq[1] : 0);
+     System.out.println();
+     System.out.println("Keys Counted: \t \t"+keysCounted);
+     System.out.println("Unique: \t \t"+sum2);
+     System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2)));
+     System.out.println();
+     System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+     x=sum2-x;
+     System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+ }
+
+ /**
+  * Opens the given read file(s), counts their kmers, and returns the count array.
+  *
+  * @param reads1 Primary read file.
+  * @param reads2 Secondary read file; passed straight to FileFormat.testInput.
+  * @param k Kmer length; asserted to be in [1,31], and below 20 when no array is supplied.
+  * @param cbits Bits per cell, used only when a new array is allocated.
+  * @param gap Gap for split kmers; asserted to equal the array's own gap.
+  * @param rcomp Forwarded to the counting routine.
+  * @param count Array to fill, or null to allocate one with 4^k cells.
+  * @return The array that was filled.
+  */
+ public static KCountArray count(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, KCountArray count){
+     assert(k<32 && k>=1 && (count!=null || k<20));
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+
+     KCountArray kca=count;
+     if(kca==null){
+         final long cells=1L<<kbits;
+         if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+         kca=KCountArray.makeNew(cells, cbits, gap);
+     }
+     assert(gap==kca.gap);
+
+     final FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+     final FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+     final ConcurrentReadInputStream stream=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+     stream.start();
+
+     assert(stream!=null) : reads1;
+     System.err.println("Started cris");
+     final boolean paired=stream.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     count(stream, k, rcomp, kca);
+
+     stream.close();
+     if(verbose){
+         System.err.println("Closed stream");
+         System.err.println("Processed "+readsProcessed+" reads.");
+     }
+
+     return kca;
+ }
+
+
+ /**
+  * Drains a read stream and adds every read (and its mate, if present) to the count array.
+  * When count.gap is 0 contiguous kmers of length k are counted via addRead; otherwise the
+  * kmer is split into halves of (k+1)/2 and k/2 bases and counted via addReadSplit.
+  *
+  * @param cris Started read stream; each fetched list is returned to it.
+  * @param k Total kmer length, asserted to be in [1,31].
+  * @param rcomp Forwarded to addRead/addReadSplit.
+  * @param count Array to fill; must not be null.
+  */
+ public static void count(ConcurrentReadInputStream cris, int k, boolean rcomp, KCountArray count){
+     assert(k<32 && k>=1 && (count!=null || k<20));
+
+     assert(count!=null);
+
+     final boolean split=(count.gap!=0);
+
+     // Parameters for contiguous kmers.
+     final long mask=~((-1L)<<(2*k));
+
+     // Parameters for split kmers.
+     final int k1=(k+1)/2;
+     final int k2=k/2;
+     final int gap=count.gap;
+     final long mask1=~((-1L)<<(2*k1));
+     final long mask2=~((-1L)<<(2*k2));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     while(reads!=null && reads.size()>0){
+         for(Read r : reads){
+             readsProcessed++;
+
+             if(split){
+                 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
+                 if(r.mate!=null){
+                     addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
+                 }
+             }else{
+                 addRead(r, count, k, mask, rcomp);
+                 if(r.mate!=null){
+                     addRead(r.mate, count, k, mask, rcomp);
+                 }
+             }
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // The loop also exits when nextList() returned null (or a list-less ListNum);
+     // only hand back a terminal list that actually exists.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+
+
+ /**
+  * Opens the given read file(s) and counts their kmers, optionally masking bases that a
+  * previously built "trusted" array flags as untrusted before they are counted.
+  *
+  * @param reads1 Primary read file.
+  * @param reads2 Secondary read file; passed straight to FileFormat.testInput.
+  * @param k Kmer length; asserted to be in [1,31], and below 20 when no array is supplied.
+  * @param cbits Bits per cell, used only when a new array is allocated (with gap 0).
+  * @param rcomp Forwarded to the counting routine.
+  * @param count Array to fill, or null to allocate one with 4^k cells.
+  * @param trusted Array used for error detection, or null to skip masking.
+  * @param maxReads Read limit handed to the input stream (shadows the static field of the same name).
+  * @param thresh Forwarded to the ErrorCorrect detection calls.
+  * @param detectStepsize Forwarded to the ErrorCorrect detection calls.
+  * @param conservative Selects detectErrorsBulk (true) or detectTrusted (false).
+  * @return The array that was filled.
+  */
+ public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp,
+         KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){
+
+     assert(k<32 && k>=1 && (count!=null || k<20));
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+
+     KCountArray kca=count;
+     if(kca==null){
+         final long cells=1L<<kbits;
+         if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+         kca=KCountArray.makeNew(cells, cbits, 0);
+     }
+
+     final FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+     final FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+     final ConcurrentReadInputStream stream=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+     stream.start();
+
+     assert(stream!=null) : reads1;
+     System.err.println("Started cris");
+     final boolean paired=stream.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     count(stream, k, rcomp, kca, trusted, thresh, detectStepsize, conservative);
+
+     stream.close();
+     if(verbose){System.err.println("Closed stream");}
+
+     return kca;
+ }
+
+
+
+
+ /**
+  * Drains a read stream and counts contiguous kmers of every read and mate. When a trusted
+  * array is supplied, positions flagged by ErrorCorrect are first overwritten with 'N'
+  * (quality 0) so that kmers spanning them are not counted.
+  *
+  * @param cris Started read stream; each fetched list is returned to it.
+  * @param k Kmer length, asserted to be in [1,31].
+  * @param rcomp Forwarded to addRead.
+  * @param count Array to fill.
+  * @param trusted Array used for error detection, or null to skip masking.
+  * @param thresh Forwarded to the ErrorCorrect detection calls.
+  * @param detectStepsize Forwarded to the ErrorCorrect detection calls.
+  * @param conservative Selects detectErrorsBulk (true) or detectTrusted (false).
+  */
+ public static void count(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
+         final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
+
+     assert(k<32 && k>=1 && (count!=null || k<20));
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     while(reads!=null && reads.size()>0){
+         for(Read r : reads){
+             final Read r2=r.mate;
+
+             if(trusted!=null){maskUntrustedBases(r, trusted, k, thresh, detectStepsize, conservative);}
+             addRead(r, count, k, mask, rcomp);
+
+             if(r2!=null){
+                 if(trusted!=null){maskUntrustedBases(r2, trusted, k, thresh, detectStepsize, conservative);}
+                 addRead(r2, count, k, mask, rcomp);
+             }
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // The loop also exits when nextList() returned null; only return a list that exists.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+ /**
+  * Overwrites with 'N' every position of r whose bit is clear in the BitSet returned by
+  * the selected ErrorCorrect detector, and zeroes the matching quality score if the read
+  * carries qualities (reads without qualities previously caused a NullPointerException).
+  */
+ private static void maskUntrustedBases(final Read r, final KCountArray trusted, final int k,
+         final int thresh, final int detectStepsize, final boolean conservative){
+     final BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
+         ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
+     for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
+         r.bases[i]='N';
+         if(r.quality!=null){r.quality[i]=0;}
+     }
+ }
+
+
+
+ /**
+  * Adds every contiguous kmer of a read to the count array.
+  * A rolling 2-bit-per-base kmer is maintained; a base with a negative
+  * AminoAcid.baseToNumber code, or a quality below minQuality, restarts the window.
+  *
+  * @param r Read to hash. When rcomp is true it is reverse-complemented in place and left that way.
+  * @param count Array whose cells are incremented.
+  * @param k Number of consecutive usable bases required before a key is emitted.
+  * @param mask Mask applied to the rolling kmer after each shift.
+  * @param rcomp If true, the read is hashed a second time after reverse-complementing.
+  */
+ public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){
+     final byte[] seq=r.bases;
+     final byte[] qual=r.quality;
+     long kmer=0;
+     int run=0;
+     for(int pos=0; pos<seq.length; pos++){
+         final int code=AminoAcid.baseToNumber[seq[pos]];
+         final boolean usable=(code>=0 && (qual==null || qual[pos]>=minQuality));
+         if(usable){
+             kmer=((kmer<<2)|code)&mask;
+             run++;
+             if(run>=k){
+                 keysCounted++;
+                 count.increment(kmer);
+             }
+         }else{
+             run=0;
+             kmer=0;
+         }
+     }
+     if(rcomp){
+         r.reverseComplement();
+         addRead(r, count, k, mask, false);
+     }
+ }
+
+ /**
+  * Adds every split (gapped) kmer of a read to the count array.
+  * The left window ends at index "left" and the right window at left+k1+gap; each key is
+  * (leftHalf&lt;&lt;(2*k2))|rightHalf. An undefined base or a quality below minQuality at
+  * either index restarts both windows.
+  *
+  * @param r Read to hash. When rcomp is true it is reverse-complemented in place and left that way.
+  * @param count Array whose cells are incremented.
+  * @param k1 Number of consecutive usable positions required before keys are emitted.
+  * @param k2 Determines the left shift (2*k2 bits) applied to the left half.
+  * @param mask1 Mask applied to the left half after each shift.
+  * @param mask2 Mask applied to the right half after each shift.
+  * @param gap Extra offset between the two windows.
+  * @param rcomp If true, the read is hashed a second time after reverse-complementing.
+  */
+ public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+     final byte[] seq=r.bases;
+     final byte[] qual=r.quality;
+     final int shift=k2*2;
+     long leftHalf=0;
+     long rightHalf=0;
+     int run=0;
+
+     assert(leftHalf>=rightHalf);
+
+     int left=0;
+     int right=k1+gap;
+     while(right<seq.length){
+         final int c1=AminoAcid.baseToNumber[seq[left]];
+         final int c2=AminoAcid.baseToNumber[seq[right]];
+         final boolean usable=(c1>=0 && c2>=0 && (qual==null || (qual[left]>=minQuality && qual[right]>=minQuality)));
+         if(usable){
+             leftHalf=((leftHalf<<2)|c1)&mask1;
+             rightHalf=((rightHalf<<2)|c2)&mask2;
+             run++;
+             if(run>=k1){
+                 keysCounted++;
+                 final long key=(leftHalf<<shift)|rightHalf;
+                 count.increment(key);
+             }
+         }else{
+             run=0;
+             leftHalf=0;
+             rightHalf=0;
+         }
+         left++;
+         right++;
+     }
+     if(rcomp){
+         r.reverseComplement();
+         addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
+     }
+ }
+
+ /**
+  * Counts split (gapped) kmers from a raw base array that has no quality scores.
+  * Two windows are advanced in lockstep: the left one ends at index i and the right one
+  * ends at index j=i+k1+gap. Each key is (left&lt;&lt;(2*k2))|right and is handed to
+  * count.increment(). A base whose AminoAcid.baseToNumber code is negative resets both windows.
+  *
+  * @param bases Bases to hash. When rcomp is true the array is passed to
+  *        AminoAcid.reverseComplementBasesInPlace and is NOT restored afterwards.
+  * @param count Array whose cells are incremented.
+  * @param k1 Number of consecutive valid positions required before keys are emitted.
+  * @param k2 Determines the left shift (2*k2 bits) applied to the left half.
+  * @param mask1 Mask applied to the left half after each shift.
+  * @param mask2 Mask applied to the right half after each shift.
+  * @param gap Extra offset between the two windows.
+  * @param rcomp If true, the bases are hashed a second time after in-place reverse-complementing.
+  */
+ public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+     int len=0;
+     final int shift=k2*2;
+     long kmer1=0;
+     long kmer2=0;
+
+     for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
+         final int x1=AminoAcid.baseToNumber[bases[i]];
+         final int x2=AminoAcid.baseToNumber[bases[j]];
+         if(x1<0 || x2<0){
+             // Undefined base in either window: start over.
+             len=0;
+             kmer1=0;
+             kmer2=0;
+         }else{
+             kmer1=((kmer1<<2)|x1)&mask1;
+             kmer2=((kmer2<<2)|x2)&mask2;
+             len++;
+             if(len>=k1){
+                 keysCounted++;
+                 // No per-key debug printing here: it flooded stdout and dominated runtime.
+                 count.increment((kmer1<<shift)|kmer2);
+             }
+         }
+     }
+     if(rcomp){
+         AminoAcid.reverseComplementBasesInPlace(bases);
+         addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
+     }
+ }
+
+}
diff --git a/current/bloom/KmerCount6MT.java b/current/bloom/KmerCount6MT.java
new file mode 100755
index 0000000..20ee79f
--- /dev/null
+++ b/current/bloom/KmerCount6MT.java
@@ -0,0 +1,705 @@
+package bloom;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import jgi.ErrorCorrect;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class KmerCount6MT extends KmerCountAbstract {
+
+ /**
+  * Command-line entry point: counts kmers of the input file(s) with worker threads and
+  * prints statistics. args[0] is the first read file and args[1] (optional) the second;
+  * the remaining arguments are key=value pairs: k/kmer, cbits/cellbits, gap,
+  * reads/maxreads, matrixbits, hashes. Any other key throws.
+  */
+ public static void main(String[] args){
+
+     final Timer timer=new Timer();
+
+     final String in1=args[0];
+     final String in2=(args.length>1 ? args[1] : null);
+     int k=14, cbits=16, gap=0;
+     int matrixbits=-1;
+     int hashes=1;
+
+     for(int idx=2; idx<args.length; idx++){
+         final String[] parts=args[idx].split("=");
+         final String key=parts[0].toLowerCase();
+         final String value=(parts.length>1 ? parts[1] : "true");
+
+         if(key.equals("k") || key.equals("kmer")){
+             k=Integer.parseInt(value);
+         }else if(key.startsWith("cbits") || key.startsWith("cellbits")){
+             cbits=Integer.parseInt(value);
+         }else if(key.startsWith("gap")){
+             gap=Integer.parseInt(value);
+         }else if(key.startsWith("reads") || key.startsWith("maxreads")){
+             maxReads=Tools.parseKMG(value);
+         }else if(key.startsWith("matrixbits")){
+             matrixbits=Integer.parseInt(value);
+         }else if(key.startsWith("hashes")){
+             hashes=Integer.parseInt(value);
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[idx]);
+         }
+     }
+
+     // matrixbits defaults to, and is capped at, the 2*k bits of the key space.
+     final int kbits=2*k;
+     final int matrixBitsUsed=(matrixbits<0 ? kbits : Tools.min(kbits, matrixbits));
+
+     if(fileIO.FileFormat.hasFastaExtension(in1)){
+         assert(!FastaReadInputStream.SPLIT_READS);
+         FastaReadInputStream.MIN_READ_LEN=k;
+     }
+
+     final KCountArray allocated=KCountArray.makeNew(1L<<kbits, 1L<<matrixBitsUsed, cbits, gap, hashes);
+     final KCountArray counts=count(in1, in2, k, cbits, gap, true, allocated);
+     counts.shutdown();
+
+     timer.stop();
+     System.out.println("Finished counting; time = "+timer);
+
+     printStatistics(counts);
+
+ }
+
+ /**
+  * Prints a histogram of cell values and summary statistics for a count array to stdout.
+  * The histogram comes from count.transformToFrequency(); index i of that array is reported
+  * as the row for value i (presumably the number of cells holding count i - confirm against
+  * KCountArray). Values 0-7 get one row each; higher values are pooled into doubling ranges
+  * (8-15, 16-31, ...), the last of which is labelled "N+".
+  *
+  * @param count Count array to summarise.
+  */
+ public static void printStatistics(KCountArray count){
+     final long[] freq=count.transformToFrequency();
+
+     final long sum=sum(freq);
+     System.out.println("Kmer fraction:");
+     int lim1=8, lim2=16;
+     // One row per value below lim1; bounded by freq.length so a short histogram cannot overrun.
+     for(int i=0; i<lim1 && i<freq.length; i++){
+         String prefix=i+"";
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+     }
+     lim2=Math.min(lim2, freq.length);
+     // Strict '<': once lim1 reaches freq.length the range [lim1, lim2) is empty,
+     // and '<=' used to print a meaningless extra "N+" row with a zero count.
+     while(lim1<freq.length){
+         long pooled=0; // long, because an int accumulator silently truncated large sums
+         for(int i=lim1; i<lim2; i++){
+             pooled+=freq[i];
+         }
+         String prefix=lim1+"-"+(lim2-1);
+         if(lim2>=freq.length){prefix=lim1+"+";}
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*pooled/(double)sum))+"\t"+pooled);
+         lim1*=2;
+         lim2=Math.min(lim2*2, freq.length);
+     }
+
+     // sum2 excludes index 0, i.e. cells that were never incremented.
+     final long sum2=sum-(freq.length>0 ? freq[0] : 0);
+     long x=(freq.length>1 ? freq[1] : 0);
+     System.out.println();
+     System.out.println("Keys Counted: \t \t"+keysCounted);
+     System.out.println("Unique: \t \t"+sum2);
+     System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2)));
+     System.out.println();
+     System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+     x=sum2-x;
+     System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+ }
+
+ /**
+  * Convenience overload: builds a count array with no gap, one hash, reverse-complement
+  * counting on, a single pass, and matrixbits of min(2*k, 35). The current static
+  * minQuality and maxReads values are passed through.
+  */
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, int k, int cbits){
+     final int gap=0;
+     final int matrixbits=Tools.min(2*k, 35);
+     final int hashes=1;
+     final boolean rcomp=true;
+     final int passes=1, stepsize=1, thresh1=1, thresh2=2;
+     return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minQuality, rcomp, maxReads,
+             passes, stepsize, thresh1, thresh2);
+ }
+
+ /**
+  * Convenience overload: builds a count array in a single pass, using stepsize 1 and
+  * thresholds 1 and 2 for the arguments that only matter to multi-pass counting.
+  */
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+         int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads){
+     final int passes=1;
+     final int stepsize=1;
+     final int thresh1=1;
+     final int thresh2=2;
+     return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minqual, rcomp, maxreads,
+             passes, stepsize, thresh1, thresh2);
+ }
+
+ /**
+  * Builds a kmer count array from the main read file(s) plus optional extra files.
+  * With passes==1 the files are counted once. With more passes, each earlier pass builds an
+  * array that becomes the "trusted" array for the next one, and the final pass counts into a
+  * fresh array using thresh2.
+  *
+  * The static maxReads and minQuality fields are overridden for the duration of the call and
+  * are restored in a finally block, so a failure while counting no longer leaves them altered.
+  *
+  * NOTE(review): the multi-pass branch passes a literal true for rcomp and never forwards gap
+  * to the counting calls; this looks unintentional but is preserved - confirm before changing.
+  * NOTE(review): extra files are unlimited (maxReads=-1) only in the single-pass branch; the
+  * multi-pass branch hands them the maxreads argument - confirm which is intended.
+  *
+  * @param fname1 Primary read file.
+  * @param fname2 Secondary read file (passed through to count).
+  * @param extraFiles Additional files counted into the same array, or null.
+  * @param k Kmer length.
+  * @param cbits Bits per cell.
+  * @param gap Gap for split kmers.
+  * @param matrixbits The array is created with 1L&lt;&lt;matrixbits as its second size argument.
+  * @param hashes Number of hashes passed to KCountArray.makeNew.
+  * @param minqual Temporarily installed as the static minQuality.
+  * @param rcomp Forwarded to the single-pass counting calls.
+  * @param maxreads Temporarily installed as the static maxReads.
+  * @param passes Number of passes; must be at least 1.
+  * @param stepsize Detection step size for multi-pass counting.
+  * @param thresh1 Detection threshold for the earlier passes.
+  * @param thresh2 Detection threshold for the final pass.
+  * @return The array produced by the final pass, after shutdown() has been called on it.
+  */
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+         int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){
+     final int kbits=2*k;
+     if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", gap="+gap+", matrixbits="+matrixbits+", cbits="+cbits);}
+
+     // Remember the shared static settings so they can be put back afterwards.
+     final boolean oldsplit=FastaReadInputStream.SPLIT_READS;
+     final long oldmax=maxReads;
+     final byte oldq=minQuality;
+     maxReads=maxreads;
+     minQuality=(byte)minqual;
+
+     KCountArray kca;
+     try{
+         kca=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes);
+
+         if(extraFiles!=null){
+             for(String s : extraFiles){
+                 if(fileIO.FileFormat.hasFastaExtension(s)){
+                     assert(!FastaReadInputStream.SPLIT_READS);
+                 }
+             }
+         }
+
+         if(passes==1){
+
+             count(fname1, fname2, k, cbits, gap, rcomp, kca);
+             if(extraFiles!=null){
+                 maxReads=-1;
+                 for(String s : extraFiles){
+                     count(s, null, k, cbits, gap, rcomp, kca);
+                 }
+             }
+             kca.shutdown();
+
+         }else{
+             assert(passes>1);
+             KCountArray trusted=null;
+             for(int i=1; i<passes; i++){
+                 final boolean conservative=i>2;
+                 int step=(stepsize==1 ? 1 : stepsize+i%2);
+                 if(!conservative){step=Tools.min(3, (step+3)/4);}
+
+                 count(fname1, fname2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative);
+                 if(extraFiles!=null){
+                     maxReads=-1;
+                     for(String s : extraFiles){
+                         count(s, null, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative);
+                     }
+                 }
+                 kca.shutdown();
+
+                 System.out.println("Trusted: \t"+kca.toShortString());
+                 // This pass's result becomes the trusted array; the next pass counts into a fresh one.
+                 trusted=kca;
+                 kca=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes);
+
+             }
+
+             count(fname1, fname2, k, cbits, true, kca, trusted, maxreads, thresh2, stepsize, true);
+             if(extraFiles!=null){
+                 maxReads=-1;
+                 for(String s : extraFiles){
+                     count(s, null, k, cbits, true, kca, trusted, maxreads, thresh2, stepsize, true);
+                 }
+             }
+             kca.shutdown();
+         }
+     }finally{
+         minQuality=oldq;
+         maxReads=oldmax;
+         FastaReadInputStream.SPLIT_READS=oldsplit;
+     }
+
+     return kca;
+ }
+
+ /**
+ * Opens the given read file(s) and counts their kmers using THREADS CountThread workers
+ * that all pull from the same input stream and write to the same count array.
+ * Blocks until every worker has terminated, then closes the stream.
+ *
+ * @param reads1 Primary read file.
+ * @param reads2 Secondary read file; passed straight to FileFormat.testInput.
+ * @param k Kmer length; asserted to be in [1,31], and below 20 when no array is supplied.
+ * @param cbits Bits per cell, used only when a new array is allocated.
+ * @param gap Gap for split kmers; asserted to equal the array's own gap.
+ * @param rcomp Forwarded to each worker.
+ * @param count Array to fill, or null to allocate one with 4^k cells.
+ * @return The array that was filled.
+ */
+ public static KCountArray count(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, KCountArray count){
+ assert(k<32 && k>=1 && (count!=null || k<20));
+ final int kbits=2*k;
+ final long mask=~((-1L)<<(kbits)); // only used in the verbose message below
+// System.err.println("countFastq... making a new cris");
+ if(count==null){
+ final long cells=1L<<kbits;
+ if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+ count=KCountArray.makeNew(cells, cbits, gap);
+ }
+ assert(gap==count.gap);
+
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+ cris.start(); //4567
+ }
+
+ // NOTE(review): this assert runs after cris.start(), so a null stream would already have thrown.
+ assert(cris!=null) : reads1;
+ System.err.println("Started cris");
+ boolean paired=cris.paired();
+ if(verbose){System.err.println("Paired: "+paired);}
+
+// count(cris, k, rcomp, count);
+// assert(false) : THREADS;
+ // Start one worker per configured thread; all share cris and count.
+ CountThread[] cta=new CountThread[THREADS];
+ for(int i=0; i<cta.length; i++){
+ cta[i]=new CountThread(cris, k, rcomp, count);
+ cta[i].start();
+ }
+
+ // Wait for each worker in turn, re-checking liveness after every 1000 ms join timeout.
+ for(int i=0; i<cta.length; i++){
+ CountThread ct=cta[i];
+ synchronized(ct){
+ while(ct.isAlive()){
+ try {
+ ct.join(1000);
+ } catch (InterruptedException e) {
+ // The interrupt is only logged; the loop keeps waiting for the worker.
+ // NOTE(review): the thread's interrupt status is not restored - confirm this is intended.
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ cris.close();
+ if(verbose){System.err.println("Closed stream");}
+ if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
+
+
+ return count;
+ }
+
+
+
+
+
+
+ /**
+ * Opens the given read file(s) and counts their kmers using THREADS CountThread workers,
+ * each of which is handed the trusted array and the error-detection settings.
+ * Blocks until every worker has terminated, then closes the stream.
+ *
+ * @param reads1 Primary read file.
+ * @param reads2 Secondary read file; passed straight to FileFormat.testInput.
+ * @param k Kmer length; asserted to be in [1,31], and below 20 when no array is supplied.
+ * @param cbits Bits per cell, used only when a new array is allocated (with gap 0).
+ * @param rcomp Forwarded to each worker.
+ * @param count Array to fill, or null to allocate one with 4^k cells.
+ * @param trusted Array handed to the workers for error detection; may be null.
+ * @param maxReads Read limit handed to the input stream (shadows the static field of the same name).
+ * @param thresh Forwarded to each worker.
+ * @param detectStepsize Forwarded to each worker.
+ * @param conservative Forwarded to each worker.
+ * @return The array that was filled.
+ */
+ public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp,
+ KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){
+
+ assert(k<32 && k>=1 && (count!=null || k<20));
+ final int kbits=2*k;
+ final long mask=~((-1L)<<(kbits)); // only used in the verbose message below
+
+// System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh);
+// System.out.println("\ntrusted=\n"+trusted);
+// System.out.println("\ncount=\n"+count);
+
+ if(count==null){
+ final long cells=1L<<kbits;
+ if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+ count=KCountArray.makeNew(cells, cbits, 0);
+ }
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+ cris.start(); //4567
+ }
+
+ // NOTE(review): this assert runs after cris.start(), so a null stream would already have thrown.
+ assert(cris!=null) : reads1;
+ System.err.println("Started cris");
+ boolean paired=cris.paired();
+ if(verbose){System.err.println("Paired: "+paired);}
+
+
+// count(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
+
+// assert(false) : THREADS;
+ // Start one worker per configured thread; all share cris, count and trusted.
+ CountThread[] cta=new CountThread[THREADS];
+ for(int i=0; i<cta.length; i++){
+ cta[i]=new CountThread(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
+ cta[i].start();
+ }
+
+ // Wait for each worker in turn, re-checking liveness after every 1000 ms join timeout.
+ for(int i=0; i<cta.length; i++){
+ CountThread ct=cta[i];
+ synchronized(ct){
+ while(ct.isAlive()){
+ try {
+ ct.join(1000);
+ } catch (InterruptedException e) {
+ // The interrupt is only logged; the loop keeps waiting for the worker.
+ // NOTE(review): the thread's interrupt status is not restored - confirm this is intended.
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ cris.close();
+ if(verbose){System.err.println("Closed stream");}
+
+// System.out.println("*** after ***");
+// System.out.println("\ntrusted=\n"+trusted);
+// System.out.println("\ncount=\n"+count);
+
+ return count;
+ }
+
+ private static class CountThread extends Thread{
+
+ /**
+ * Creates a worker that counts without error detection: delegates to the full
+ * constructor with trusted=null, thresh=2, detectStepsize=1 and conservative=true.
+ */
+ CountThread(final ConcurrentReadInputStream cris_, final int k_, final boolean rcomp_, final KCountArray count_){
+ this(cris_, k_, rcomp_, count_, null, 2, 1, true);
+ }
+
+ /**
+ * Creates a worker that reads from cris_ and counts into count_.
+ *
+ * @param cris_ Input stream the worker pulls read lists from.
+ * @param k_ Kmer length.
+ * @param rcomp_ Forwarded to the add* methods.
+ * @param count_ Array to fill; must not be null (its class is inspected here).
+ * @param trusted_ Array for error detection; when null, run() takes the plain counting path.
+ * @param thresh_ Forwarded to the error-detection calls.
+ * @param detectStepsize_ Forwarded to the error-detection calls.
+ * @param conservative_ Selects which ErrorCorrect detector is used.
+ */
+ CountThread(final ConcurrentReadInputStream cris_, final int k_, final boolean rcomp_,
+ final KCountArray count_, final KCountArray trusted_, final int thresh_, final int detectStepsize_, final boolean conservative_){
+ cris=cris_;
+ k=k_;
+ rcomp=rcomp_;
+ counts=count_;
+ trusted=trusted_;
+ thresh=thresh_;
+ detectStepsize=detectStepsize_;
+ conservative=conservative_;
+ // True unless counts is exactly a KCountArray4MT. The add* methods allocate a fresh
+ // key buffer after each flush when this is set, instead of reusing the one just submitted.
+ MAKE_NEW_ARRAY=(counts.getClass()!=KCountArray4MT.class);
+ }
+
+ /**
+  * Thread body: allocates the key buffer, drains the input stream through the matching
+  * count method, then, under the class-wide lock, adds this worker's local totals to
+  * the shared static counters and submits whatever keys are still buffered.
+  */
+ public void run(){
+     buffer=new long[BUFFERLEN];
+
+     if(trusted!=null){
+         count(cris, k, rcomp, counts, trusted, thresh, detectStepsize, conservative);
+     }else{
+         count(cris, k, rcomp, counts);
+     }
+
+     synchronized(getClass()){
+         keysCounted+=keysCountedLocal;
+         readsProcessed+=readsProcessedLocal;
+
+         if(bufflen>0){
+             // Submit only the filled prefix when the buffer is partially used.
+             final long[] pending=(bufflen<BUFFERLEN ? Arrays.copyOf(buffer, bufflen) : buffer);
+             counts.increment(pending);
+         }
+         buffer=null;
+         bufflen=0;
+     }
+ }
+
+
+
+
+ /**
+  * Drains the read stream and buffers the kmers of every read (and its mate, if present).
+  * When count.gap is 0 contiguous kmers of length k are hashed via addRead; otherwise the
+  * kmer is split into halves of (k+1)/2 and k/2 bases and hashed via addReadSplit.
+  *
+  * @param cris Started read stream; each fetched list is returned to it.
+  * @param k Total kmer length, asserted to be in [1,31].
+  * @param rcomp Forwarded to addRead/addReadSplit.
+  * @param count Array to fill; must not be null.
+  */
+ private void count(ConcurrentReadInputStream cris, int k, boolean rcomp, KCountArray count){
+     assert(k<32 && k>=1 && count!=null);
+
+     final boolean split=(count.gap!=0);
+
+     // Parameters for contiguous kmers.
+     final long mask=~((-1L)<<(2*k));
+
+     // Parameters for split kmers.
+     final int k1=(k+1)/2;
+     final int k2=k/2;
+     final int gap=count.gap;
+     final long mask1=~((-1L)<<(2*k1));
+     final long mask2=~((-1L)<<(2*k2));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     while(reads!=null && reads.size()>0){
+         for(Read r : reads){
+             readsProcessedLocal++;
+
+             if(split){
+                 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
+                 if(r.mate!=null){
+                     addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
+                 }
+             }else{
+                 addRead(r, count, k, mask, rcomp);
+                 if(r.mate!=null){
+                     addRead(r.mate, count, k, mask, rcomp);
+                 }
+             }
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // The loop also exits when nextList() returned null (or a list-less ListNum);
+     // only hand back a terminal list that actually exists.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+
+
+ /**
+  * Drains the read stream and buffers contiguous kmers of every read and mate. When a
+  * trusted array is supplied, positions whose bit is clear in the BitSet returned by the
+  * selected ErrorCorrect detector are first overwritten with 'N' (quality 0 if present).
+  * Arrays with a positive gap are delegated to countFastqSplit and nothing else is done here.
+  *
+  * @param cris Started read stream; each fetched list is returned to it.
+  * @param k Kmer length, asserted to be in [1,31].
+  * @param rcomp Forwarded to addRead.
+  * @param count Array to fill.
+  * @param trusted Array used for error detection, or null to skip masking.
+  * @param thresh Forwarded to the ErrorCorrect detection calls.
+  * @param detectStepsize Forwarded to the ErrorCorrect detection calls.
+  * @param conservative Selects detectErrorsBulk (true) or detectTrusted (false).
+  */
+ private void count(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
+         final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
+     if(count.gap>0){
+         countFastqSplit(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
+         // The split routine drains the stream itself; previously control fell through
+         // and the contiguous path ran against the already-exhausted stream.
+         return;
+     }
+     assert(k<32 && k>=1 && (count!=null || k<20));
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     while(reads!=null && reads.size()>0){
+         for(Read r : reads){
+
+             final Read r2=r.mate;
+             if(trusted!=null){
+                 final BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
+                     ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
+                 for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
+                     r.bases[i]='N';
+                     if(r.quality!=null){r.quality[i]=0;}
+                 }
+             }
+             addRead(r, count, k, mask, rcomp);
+
+             if(r2!=null){
+                 if(trusted!=null){
+                     final BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
+                         ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
+                     for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
+                         r2.bases[i]='N';
+                         if(r2.quality!=null){r2.quality[i]=0;}
+                     }
+                 }
+                 addRead(r2, count, k, mask, rcomp);
+             }
+
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // The loop also exits when nextList() returned null; only return a list that exists.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+ /**
+  * Split-kmer counterpart of the trusted counting routine: masks flagged positions with
+  * 'N' when a trusted array is supplied, then hashes each read and mate via addReadSplit
+  * using halves of (k+1)/2 and k/2 bases separated by count.gap.
+  *
+  * The method opens with assert(false), so it fails immediately whenever assertions are
+  * enabled. NOTE(review): this looks like a deliberate "not ready" marker - confirm before
+  * relying on this path.
+  *
+  * @param cris Started read stream; each fetched list is returned to it.
+  * @param k Total kmer length, asserted to be in [1,31].
+  * @param rcomp Forwarded to addReadSplit.
+  * @param count Array to fill; its gap is asserted to be positive.
+  * @param trusted Array used for error detection, or null to skip masking.
+  * @param thresh Forwarded to the ErrorCorrect detection calls.
+  * @param detectStepsize Forwarded to the ErrorCorrect detection calls.
+  * @param conservative Selects detectErrorsBulk (true) or detectTrusted (false).
+  */
+ private void countFastqSplit(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
+         final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
+     assert(false) : cris.paired();
+     assert(count.gap>0);
+     assert(k<32 && k>=1 && (count!=null || k<20));
+
+     final int k1=(k+1)/2;
+     final int k2=k/2;
+     final int kbits1=2*k1;
+     final int kbits2=2*k2;
+     final int gap=count.gap;
+     final long mask1=~((-1L)<<(kbits1));
+     final long mask2=~((-1L)<<(kbits2));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     while(reads!=null && reads.size()>0){
+         for(Read r : reads){
+
+             final Read r2=r.mate;
+             if(trusted!=null){
+                 final BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
+                     ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
+                 for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
+                     r.bases[i]='N';
+                     // Reads without quality scores previously caused a NullPointerException here.
+                     if(r.quality!=null){r.quality[i]=0;}
+                 }
+             }
+             addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
+
+             if(r2!=null){
+                 if(trusted!=null){
+                     final BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
+                         ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
+                     for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
+                         r2.bases[i]='N';
+                         if(r2.quality!=null){r2.quality[i]=0;}
+                     }
+                 }
+                 addReadSplit(r2, count, k1, k2, mask1, mask2, gap, rcomp);
+             }
+
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // The loop also exits when nextList() returned null; only return a list that exists.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+
+ private void addRead(Read r, final KCountArray count, final int k, final long mask, boolean rcomp){
+
+ if(PREJOIN && r.mate!=null && r.insert()>0){
+ r.mate.reverseComplement();
+ r=r.joinRead();
+ }
+
+ int len=0;
+ long kmer=0;
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0 || (quals!=null && quals[i]<minQuality)){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+ if(len>=k){
+ keysCountedLocal++;
+// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+
+// System.out.println("Arrays.toString(buffer));
+ buffer[bufflen]=kmer;
+ bufflen++;
+ if(bufflen>=buffer.length){
+// assert(false) : "Submitting "+Arrays.toString(buffer);
+ count.increment(buffer);
+ bufflen=0;
+ if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];}
+ }
+// count.increment(kmer);
+
+// System.out.println(" -> "+count.read(kmer));
+// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+// array[(int)kmer]++;
+// System.out.println(" -> "+array[(int)kmer]+"\n");
+// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+ if(rcomp){
+ r.reverseComplement();
+ addRead(r, count, k, mask, false);
+ }
+ }
+
+ private void addReadSplit(Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+
+ if(PREJOIN && r.mate!=null && r.insert()>0){
+ if(verbose){System.err.println("Prejoining "+r.numericID+" at "+r.insert());}
+ r.mate.reverseComplement();
+ r=r.joinRead();
+ }
+
+ int len=0;
+ int shift=k2*2;
+ long kmer1=0;
+ long kmer2=0;
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+
+ assert(kmer1>=kmer2);
+
+// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
+
+ if(verbose){System.err.println("Hashing read "+r.numericID+"; loop limits "+(k1+gap)+"-"+(bases.length));}
+ for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
+ int x1=AminoAcid.baseToNumber[bases[i]];
+ int x2=AminoAcid.baseToNumber[bases[j]];
+
+ if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
+ len=0;
+ kmer1=0;
+ kmer2=0;
+ }else{
+ kmer1=((kmer1<<2)|x1)&mask1;
+ kmer2=((kmer2<<2)|x2)&mask2;
+ len++;
+ if(len>=k1){
+
+ keysCountedLocal++;
+// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+
+ long key=(kmer1<<shift)|kmer2;
+// System.err.println(Long.toHexString(key));
+
+ if(verbose){System.err.println("Hashing key "+Long.toHexString(key)+" at length "+len);}
+
+ buffer[bufflen]=key;
+ bufflen++;
+ if(bufflen>=buffer.length){
+ count.increment(buffer);
+ bufflen=0;
+ if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];}
+ }
+// count.increment(kmer);
+
+
+// System.out.println(" -> "+count.read(kmer));
+// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+// array[(int)kmer]++;
+// System.out.println(" -> "+array[(int)kmer]+"\n");
+// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+ if(rcomp){
+ r.reverseComplement();
+ addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
+ }
+ }
+
+ private final ConcurrentReadInputStream cris;
+ private final int k;
+ private final boolean rcomp;
+ private final KCountArray counts;
+ private final KCountArray trusted;
+ private final int thresh;
+ private final int detectStepsize;
+ private final boolean conservative;
+ private long keysCountedLocal=0;
+ private long readsProcessedLocal=0;
+ private long[] buffer;
+ private int bufflen=0;
+ private final boolean MAKE_NEW_ARRAY;
+ }
+
+}
diff --git a/current/bloom/KmerCount7MT.java b/current/bloom/KmerCount7MT.java
new file mode 100755
index 0000000..ec60326
--- /dev/null
+++ b/current/bloom/KmerCount7MT.java
@@ -0,0 +1,882 @@
+package bloom;
+
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import jgi.ErrorCorrect;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import ukmer.Kmer;
+
+import align2.ListNum;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class KmerCount7MT extends KmerCountAbstract {
+
+ public static void main(String[] args){
+     // Command-line entry point. args[0] is the first input file and args[1] the
+     // optional second file; every later argument is a key=value option.
+     final Timer timer=new Timer();
+
+     final String in1=args[0];
+     final String in2=(args.length>1 ? args[1] : null);
+
+     // Option defaults. A negative matrixbits is replaced by kbits further down.
+     int k=14, cbits=16, gap=0, matrixbits=-1, hashes=1;
+
+     int pos=2;
+     while(pos<args.length){
+         final String[] pair=args[pos].split("=");
+         final String key=pair[0].toLowerCase();
+         final String value=(pair.length>1 ? pair[1] : "true"); // a bare flag counts as "true"
+
+         // k, kmer and canonical must match exactly; the other options match by prefix.
+         if(key.equals("k") || key.equals("kmer")){
+             k=Integer.parseInt(value);
+         }else if(key.startsWith("cbits") || key.startsWith("cellbits")){
+             cbits=Integer.parseInt(value);
+         }else if(key.startsWith("gap")){
+             gap=Integer.parseInt(value);
+         }else if(key.startsWith("reads") || key.startsWith("maxreads")){
+             maxReads=Tools.parseKMG(value);
+         }else if(key.startsWith("matrixbits")){
+             matrixbits=Integer.parseInt(value);
+         }else if(key.startsWith("hashes")){
+             hashes=Integer.parseInt(value);
+         }else if(key.equals("canonical")){
+             CANONICAL=Tools.parseBoolean(value);
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[pos]);
+         }
+         pos++;
+     }
+
+     // The key width is capped at 62 bits and the matrix never gets more bits than the keys.
+     final int kbits=Tools.min(2*k, 62);
+     matrixbits=Tools.min(kbits, (matrixbits<0 ? kbits : matrixbits));
+
+     // FASTA input: SPLIT_READS is asserted to be off and MIN_READ_LEN is set to k.
+     if(fileIO.FileFormat.hasFastaExtension(in1)){
+         assert(!FastaReadInputStream.SPLIT_READS);
+         FastaReadInputStream.MIN_READ_LEN=k;
+     }
+
+     // Allocate the table, fill it from the input, then shut it down before reporting.
+     KCountArray kca=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes);
+     try {
+         kca=count(in1, in2, k, cbits, gap, true, kca);
+     } catch (Exception e) {
+         // The failure is reported and the run continues with the table as it stands.
+         e.printStackTrace();
+     }
+     kca.shutdown();
+
+     timer.stop();
+     System.out.println("Finished counting; time = "+timer);
+
+     // Histogram and summary totals go to stdout.
+     printStatistics(kca);
+
+ }
+
+ public static void printStatistics(KCountArray count){
+     // Prints a histogram of the array's frequency table to stdout, followed by summary totals.
+     // NOTE(review): freq[i] is presumably the number of cells holding the value i - confirm.
+     final long[] freq=count.transformToFrequency();
+     final long sum=sum(freq);
+     System.out.println("Kmer fraction:");
+     // Clamp both limits so that a table with fewer than 16 entries is not overrun.
+     int lim1=min(8, freq.length), lim2=min(16, freq.length);
+     for(int i=0; i<lim1; i++){
+         String prefix=i+"";
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+     }
+     // Doubling ranges; the strict bound avoids an empty trailing row once lim1 reaches freq.length.
+     while(lim1<freq.length){
+         long x=0; // long, not int: "x+=freq[i]" on an int narrows silently and can overflow
+         for(int i=lim1; i<lim2; i++){
+             x+=freq[i];
+         }
+         String prefix=lim1+"-"+(lim2-1);
+         if(lim2>=freq.length){prefix=lim1+"+";}
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x);
+         lim1*=2;
+         lim2=min(lim2*2, freq.length);
+     }
+
+     long sum2=sum-freq[0];
+     long x=(freq.length>1 ? freq[1] : 0);
+     System.out.println();
+     System.out.println("Keys Counted: \t \t"+keysCounted);
+     System.out.println("Unique: \t \t"+sum2);
+     System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2)));
+     System.out.println();
+     System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+     x=sum2-x;
+     System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+ }
+
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, int k, int cbits){ // Convenience overload that fills in a default for every remaining setting.
+ return makeKca(fname1, fname2, extraFiles, k, cbits, 0, Tools.min(2*k, 35), 1, minQuality, true, maxReads, 1, 1, 1, 2); // gap=0, matrixbits=min(2k,35), hashes=1, static minQuality/maxReads, rcomp=true, passes=1, stepsize=1, thresh1=1, thresh2=2
+ }
+
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads){ // Single-pass overload; the table size is given as a bit count.
+ assert(matrixbits<63);
+ return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minqual, rcomp, maxreads, 1, 1, 1, 2); // passes=1, stepsize=1, thresh1=1, thresh2=2
+ }
+
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){ // Overload taking the table size as a bit count.
+ assert(matrixbits<63); // keeps the shift below within a positive long
+ return makeKca(fname1, fname2, extraFiles,
+ k, cbits, gap, 1L<<matrixbits, hashes, minqual, rcomp, maxreads, passes, stepsize, thresh1, thresh2, null, 0); // cells=2^matrixbits; prefilter=null, prefilterLimit_=0
+ }
+
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){ // Overload taking the table size as a cell count, without a prefilter.
+ return makeKca(fname1, fname2, extraFiles,
+ k, cbits, gap, cells, hashes, minqual, rcomp, maxreads, passes, stepsize, thresh1, thresh2, null, 0); // prefilter=null, prefilterLimit_=0
+ }
+
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2,
+ KCountArray prefilter, int prefilterLimit_){ // Full form that the shorter makeKca overloads end up calling.
+ final int kbits=Tools.min(2*k, 62); // key width in bits, capped at 62
+ // Makes a KCountArray, fills it from fname1/fname2 plus any extra files, and returns it after shutdown().
+ if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", gap="+gap+", cells="+Tools.toKMG(cells)+", cbits="+cbits);}
+ // The static maxReads and minQuality are overridden for this call; the old values are put back at the end.
+ boolean oldsplit=FastaReadInputStream.SPLIT_READS;
+ long oldmax=maxReads;
+ byte oldq=minQuality;
+ maxReads=maxreads;
+ minQuality=(byte)minqual;
+ // System.out.println("kbits="+(kbits)+" -> "+(1L<<kbits)+", matrixbits="+(matrixbits)+" -> "+(1L<<matrixbits)+", cbits="+cbits+", gap="+gap+", hashes="+hashes);
+ KCountArray kca=KCountArray.makeNew(1L<<kbits, cells, cbits, gap, hashes, prefilter, prefilterLimit_);
+
+ // A comma-separated fname1 or fname2 keeps its first name; the remaining names become extra files.
+ {//For processing input lists
+ ArrayList<String> extra2=null;
+ if(fname1!=null && fname1.contains(",")){
+ String[] s=fname1.split(",");
+ if(extra2==null){extra2=new ArrayList<String>();}
+ for(int i=1; i<s.length; i++){extra2.add(s[i]);}
+ fname1=s[0];
+ }
+ if(fname2!=null && fname2.contains(",")){
+ String[] s=fname2.split(",");
+ if(extra2==null){extra2=new ArrayList<String>();}
+ for(int i=1; i<s.length; i++){extra2.add(s[i]);}
+ fname2=s[0];
+ }
+ if(extra2!=null){ // split-off names go first, then the caller's extraFiles
+ if(extraFiles!=null){
+ for(String s : extraFiles){
+ extra2.add(s);
+ }
+ }
+ extraFiles=extra2;
+ }
+ }
+// System.out.println("b");
+
+ if(extraFiles!=null){ // sanity check only: a FASTA extra file requires SPLIT_READS to be off
+ for(String s : extraFiles){
+ if(fileIO.FileFormat.hasFastaExtension(s)){
+ assert(!FastaReadInputStream.SPLIT_READS);
+ }
+ }
+ }
+
+ // Single pass: count the primary input and then each extra file straight into kca.
+ if(passes==1){
+// System.out.println("c1");
+ try {
+ count(fname1, fname2, k, cbits, gap, rcomp, kca);
+ } catch (Exception e) {
+ // The failure is printed and processing continues.
+ e.printStackTrace();
+ }
+ if(extraFiles!=null){
+ maxReads=-1; // presumably -1 lifts the read limit for the extra files - confirm
+ for(String s : extraFiles){
+ try {
+ count(s, null, k, cbits, gap, rcomp, kca);
+ } catch (Exception e) {
+ // The failure is printed and the next file is processed.
+ e.printStackTrace();
+ }
+ }
+ }
+ kca.shutdown();
+
+ }else{
+ // Multi-pass: passes-1 preliminary rounds, each using the previous round's table as 'trusted', then a final round.
+ assert(passes>1);
+ KCountArray trusted=null; // still null in the first round, which makes CountThread.run() count without error masking
+ for(int i=1; i<passes; i++){
+ boolean conservative=i>2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3)
+ int step=(stepsize==1 ? 1 : stepsize+i%2);
+ // if(!conservative){step=(step+3)/4;}
+ if(!conservative){step=Tools.min(3, (step+3)/4);}
+
+ try {
+ count(fname1, fname2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); // NOTE(review): rcomp is the literal true here, not the rcomp parameter - confirm intent
+ } catch (Exception e) {
+ // The failure is printed and processing continues.
+ e.printStackTrace();
+ }
+ if(extraFiles!=null){
+ maxReads=-1; // NOTE(review): the calls below pass maxreads explicitly, so this assignment may not affect them - confirm
+ for(String s : extraFiles){
+ try {
+ count(s, null, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative);
+ } catch (Exception e) {
+ // The failure is printed and the next file is processed.
+ e.printStackTrace();
+ }
+ }
+ }
+ kca.shutdown();
+ // This round's table becomes the trusted reference and a fresh table is made for the next round.
+ System.out.println("Trusted: \t"+kca.toShortString());
+ trusted=kca;
+ kca=KCountArray.makeNew(1L<<kbits, cells, cbits, gap, hashes, prefilter, prefilterLimit_);
+
+ }
+ // Final round: thresh2, the caller's stepsize, conservative=true.
+ try {
+ count(fname1, fname2, k, cbits, true, kca, trusted, maxreads, thresh2, stepsize, true);
+ } catch (Exception e) {
+ // The failure is printed and processing continues.
+ e.printStackTrace();
+ }
+ if(extraFiles!=null){
+ maxReads=-1;
+ for(String s : extraFiles){
+ try {
+ count(s, null, k, cbits, true, kca, trusted, maxreads, thresh2, stepsize, true);
+ } catch (Exception e) {
+ // The failure is printed and the next file is processed.
+ e.printStackTrace();
+ }
+ }
+ }
+ kca.shutdown();
+ }
+ // Put the overridden static settings back.
+ minQuality=oldq;
+ maxReads=oldmax;
+ FastaReadInputStream.SPLIT_READS=oldsplit; // SPLIT_READS is not changed in this method; the saved value is written back regardless
+
+
+ return kca;
+ }
+
+ public static KCountArray count(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, KCountArray count) throws Exception{
+     // Counts the kmers of reads1/reads2 into 'count' with THREADS worker threads and
+     // returns the array that was filled.
+     //   reads1, reads2 - input files, handed to FileFormat.testInput
+     //   k, rcomp       - passed on to every CountThread
+     //   cbits          - only used when 'count' is null and a new array has to be made
+     //   gap            - asserted to equal count.gap
+     assert(k>=1 && (count!=null || k<20));
+     final int kbits=Tools.min(2*k, 62);
+
+     if(count==null){
+         // No array supplied: make one with 2^kbits cells.
+         final long cells=1L<<kbits;
+         if(verbose){
+             final long mask=~((-1L)<<(kbits));
+             System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));
+         }
+         count=KCountArray.makeNew(cells, cbits, gap);
+     }
+     assert(gap==count.gap);
+
+     // Open one stream over both inputs; the static maxReads is passed to the stream factory.
+     final FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+     final FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+     cris.start();
+
+     assert(cris!=null) : reads1;
+     if(verbose){System.err.println("Started cris");}
+     final boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     // Launch all workers on the shared stream before waiting for any of them.
+     final CountThread[] workers=new CountThread[THREADS];
+     for(int i=0; i<workers.length; i++){
+         workers[i]=new CountThread(cris, k, rcomp, count);
+         workers[i].start();
+     }
+
+     // Wait until every worker has terminated.
+     for(final CountThread worker : workers){
+         synchronized(worker){
+             while(worker.getState()!=State.TERMINATED){
+                 try {
+                     worker.join(2000);
+                 } catch (InterruptedException e) {
+                     // An interrupt is logged and the wait resumes.
+                     e.printStackTrace();
+                 }
+             }
+         }
+     }
+
+     cris.close();
+     if(verbose){System.err.println("Closed stream");}
+     if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
+
+     return count;
+ }
+
+
+
+
+
+
+ public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp,
+         KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative)
+         throws Exception{
+     // Same job as the seven-argument count(...), except that every worker also receives
+     // a 'trusted' array together with thresh, detectStepsize and conservative.
+     //   reads1, reads2 - input files, handed to FileFormat.testInput
+     //   count          - array to fill; when null a new one with gap 0 is made
+     //   maxReads       - this parameter, not the static field of the same name, goes to the stream factory
+     //   k, rcomp, trusted, thresh, detectStepsize, conservative - passed on to every CountThread
+
+     // A null 'count' is only accepted together with k<20.
+     assert(k>=1 && (count!=null || k<20));
+     final int kbits=Tools.min(2*k, 62);
+
+     if(count==null){
+         // No array supplied: make one with 2^kbits cells.
+         final long cells=1L<<kbits;
+         if(verbose){
+             final long mask=~((-1L)<<(kbits));
+             System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));
+         }
+         count=KCountArray.makeNew(cells, cbits, 0);
+     }
+
+     // Open one stream over both inputs.
+     final FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+     final FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+     cris.start();
+
+     assert(cris!=null) : reads1;
+     if(verbose){System.err.println("Started cris");}
+     final boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+
+     // Launch all workers on the shared stream before waiting for any of them.
+     final CountThread[] workers=new CountThread[THREADS];
+     for(int i=0; i<workers.length; i++){
+         workers[i]=new CountThread(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
+         workers[i].start();
+     }
+
+     // Wait until no worker is alive any more.
+     for(final CountThread worker : workers){
+         synchronized(worker){
+             while(worker.isAlive()){
+                 try {
+                     worker.join(1000);
+                 } catch (InterruptedException e) {
+                     // An interrupt is logged and the wait resumes.
+                     e.printStackTrace();
+                 }
+             }
+         }
+     }
+
+     cris.close();
+     if(verbose){System.err.println("Closed stream");}
+
+
+
+     // Returned so that an array made above reaches the caller.
+     return count;
+
+ }
+
+ private static class CountThread extends Thread{
+
+ CountThread(final ConcurrentReadInputStream cris_, final int k_, final boolean rcomp_, final KCountArray count_){ // Plain counting: no trusted array is supplied.
+ this(cris_, k_, rcomp_, count_, null, 2, 1, true); // trusted=null, thresh=2, detectStepsize=1, conservative=true
+ }
+
+ CountThread(final ConcurrentReadInputStream cris_, final int k_, final boolean rcomp_,
+ final KCountArray count_, final KCountArray trusted_, final int thresh_, final int detectStepsize_, final boolean conservative_){
+ cris=cris_;
+ k=k_;
+ rcomp=rcomp_;
+ counts=count_;
+ trusted=trusted_;
+ thresh=thresh_;
+ detectStepsize=detectStepsize_;
+ conservative=conservative_;
+ MAKE_NEW_ARRAY=(counts.getClass()!=KCountArray4MT.class);
+ }
+
+ public void run(){
+     // Thread body: count everything the shared stream delivers, then publish the local
+     // totals and submit whatever keys are still waiting in the buffer.
+     buffer=new long[BUFFERLEN];
+     if(trusted!=null){
+         count(cris, k, rcomp, counts, trusted, thresh, detectStepsize, conservative);
+     }else{
+         count(cris, k, rcomp, counts);
+     }
+
+     synchronized(getClass()){ // the class object is the lock for the shared static totals
+         keysCounted+=keysCountedLocal;
+         readsProcessed+=readsProcessedLocal;
+
+         if(verbose){System.err.println(keysCounted+", "+keysCountedLocal);}
+         if(verbose){System.err.println(readsProcessed+", "+readsProcessedLocal);}
+
+         if(bufflen>0){
+             // A partly filled buffer is trimmed so that only real keys are submitted.
+             final long[] pending=(bufflen<BUFFERLEN ? Arrays.copyOf(buffer, bufflen) : buffer);
+             if(verbose){System.err.println("Incrementing buffer: "+Arrays.toString(pending));}
+             counts.increment(pending);
+         }
+         buffer=null;
+         bufflen=0;
+     }
+ }
+
+
+
+ private void count(ConcurrentReadInputStream cris, int k, boolean rcomp, KCountArray count){
+     // Drains the read stream and buffers the kmers of every read, and of its mate when
+     // there is one, for 'count'. A gap of 0 selects contiguous kmers (addRead); any
+     // other gap selects the two-part kmers built by addReadSplit.
+     assert(k>=1 && count!=null);
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     if(count.gap==0){
+         final int kbits=Tools.min(2*k, 62);
+         final long mask=~((-1L)<<(kbits));
+
+         while(reads!=null && reads.size()>0){
+             for(Read r : reads){
+                 readsProcessedLocal++;
+                 addRead(r, count, k, mask, rcomp);
+                 if(r.mate!=null){
+                     addRead(r.mate, count, k, mask, rcomp);
+                 }
+             }
+             // Hand the finished list back before asking for the next one.
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+     }else{
+         // The kmer is split into a left part of (k+1)/2 bases and a right part of k/2 bases.
+         final int k1=(k+1)/2;
+         final int k2=k/2;
+         final int kbits1=2*k1;
+         final int kbits2=2*k2;
+         final int gap=count.gap;
+         final long mask1=~((-1L)<<(kbits1));
+         final long mask2=~((-1L)<<(kbits2));
+         while(reads!=null && reads.size()>0){
+             for(Read r : reads){
+                 readsProcessedLocal++;
+                 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
+                 if(r.mate!=null){
+                     addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
+                 }
+             }
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // The loops stop on a null list, a null ln.list or an empty list. Only an existing
+     // list can be handed back; unconditionally reading ln.id here threw a
+     // NullPointerException when the stream returned null.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+
+
+ private void count(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
+         final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
+     // Counting with error masking. When 'trusted' is set, every position that the
+     // detector's bit set leaves clear is overwritten with 'N' (quality 0) before the
+     // read's kmers are buffered for 'count'.
+     if(count.gap>0){
+         // Gapped arrays are handled entirely by countFastqSplit. Without this return the
+         // ungapped loop below would run on the same stream afterwards.
+         countFastqSplit(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
+         return;
+     }
+     assert(k>=1 && (count!=null || k<20));
+     final int kbits=Tools.min(2*k, 62);
+     final long mask=~((-1L)<<(kbits));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     while(reads!=null && reads.size()>0){
+         for(Read r : reads){
+             Read r2=r.mate; // fetched before addRead(r, ...) runs
+             {
+                 if(trusted!=null){
+                     BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
+                         ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
+                     if(bs!=null){
+                         for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
+                             r.bases[i]='N';
+                             if(r.quality!=null){r.quality[i]=0;}
+                         }
+                     }
+                 }
+                 addRead(r, count, k, mask, rcomp);
+             }
+             if(r2!=null){
+                 if(trusted!=null){
+                     BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
+                         ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
+                     if(bs!=null){
+                         for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
+                             r2.bases[i]='N';
+                             if(r2.quality!=null){r2.quality[i]=0;}
+                         }
+                     }
+                 }
+                 addRead(r2, count, k, mask, rcomp);
+             }
+
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // Only an existing list can be handed back; ln is null when the stream returned null.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+ private void countFastqSplit(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
+         final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
+     // Gapped counterpart of the error-masking count(...): masks untrusted positions with
+     // 'N' and then buffers two-part kmers through addReadSplit.
+     // The leading assert(false) is kept from the original: with assertions enabled this
+     // method always fails, so the path is evidently not meant to be relied on.
+     assert(false) : cris.paired();
+     assert(count.gap>0);
+     assert(k<32 && k>=1 && (count!=null || k<20));
+
+     // The left part gets (k+1)/2 bases and the right part k/2 bases.
+     final int k1=(k+1)/2;
+     final int k2=k/2;
+     final int kbits1=2*k1;
+     final int kbits2=2*k2;
+     final int gap=count.gap;
+     final long mask1=~((-1L)<<(kbits1));
+     final long mask2=~((-1L)<<(kbits2));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     while(reads!=null && reads.size()>0){
+         for(Read r : reads){
+
+             Read r2=r.mate;
+             {
+                 if(trusted!=null){
+                     BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) :
+                         ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize));
+                     // A null bit set or a read without qualities is tolerated, as in the
+                     // ungapped count(...); the original dereferenced both unchecked.
+                     if(bs!=null){
+                         for(int i=bs.nextClearBit(0); i<r.length(); i=bs.nextClearBit(i+1)){
+                             r.bases[i]='N';
+                             if(r.quality!=null){r.quality[i]=0;}
+                         }
+                     }
+                 }
+
+                 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
+             }
+             if(r2!=null){
+                 if(trusted!=null){
+                     BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
+                         ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
+                     if(bs!=null){
+                         for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
+                             r2.bases[i]='N';
+                             if(r2.quality!=null){r2.quality[i]=0;}
+                         }
+                     }
+                 }
+                 addReadSplit(r2, count, k1, k2, mask1, mask2, gap, rcomp);
+             }
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     if(verbose){System.err.println("Finished reading");}
+     // Only an existing list can be handed back; ln is null when the stream returned null.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+
+ private void addRead(Read r, final KCountArray count, final int k, final long mask, boolean rcomp){
+     // Buffers every kmer of r that passes the quality and probability filters; with rcomp the read is then reverse-complemented in place and processed once more.
+     if(k>31){
+         addReadLong(r, count, k, mask); // kmers longer than 31 bases go through the Kmer class
+         return;
+     }
+     if(PREJOIN && r.mate!=null && r.insert()>0){
+         r.mate.reverseComplement();
+         r=r.joinRead();
+     }
+
+     int len=0; // valid bases in the current run, not counting the base being processed
+     long kmer=0;
+     float prob=1; // running product of per-base PROB_CORRECT values over the kmer window
+     byte[] bases=r.bases;
+     byte[] quals=r.quality;
+
+     if(bases==null || bases.length<k+count.gap){return;}
+
+     for(int i=0; i<bases.length; i++){
+         byte b=bases[i];
+         int x=AminoAcid.baseToNumber[b];
+
+         byte q;
+         if(quals==null){
+             q=50;
+         }else{
+             q=quals[i];
+             prob=prob*align2.QualityTools.PROB_CORRECT[q];
+             // Once k bases precede this one, base i-k drops out of the window. The test
+             // used to be len>k, which left the first base of each run in the product.
+             if(len>=k){
+                 byte oldq=quals[i-k];
+                 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
+             }
+         }
+
+         if(x<0 || q<minQuality){
+             len=0;
+             kmer=0;
+             prob=1;
+         }else{
+             kmer=((kmer<<2)|x)&mask;
+             len++;
+             if(len>=k && prob>=minProb && (!CANONICAL || KCountArray.isCanonical(kmer, k))){
+                 keysCountedLocal++;
+                 buffer[bufflen]=kmer;
+                 bufflen++;
+                 if(bufflen>=buffer.length){
+                     count.increment(buffer);
+                     bufflen=0;
+                     if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];}
+                 }
+             }
+         }
+     }
+     if(rcomp){
+         r.reverseComplement();
+         addRead(r, count, k, mask, false);
+     }
+ }
+
+
+
+ private void addReadLong(Read r, final KCountArray count, final int k, final long mask){
+     // Long-kmer (k>31) version of addRead: the window is kept in a ukmer.Kmer and the
+     // value buffered for each position is kmer.xor(). The mask argument is not used.
+     if(PREJOIN && r.mate!=null && r.insert()>0){
+         r.mate.reverseComplement();
+         r=r.joinRead();
+     }
+
+     Kmer kmer=new Kmer(k);
+     float prob=1;
+     byte[] bases=r.bases;
+     byte[] quals=r.quality;
+
+     assert(k>31) : k;
+     // addRead hands reads over before its own null check, so repeat that check here
+     // instead of failing on bases.length below.
+     if(bases==null){return;}
+     kmer.clear();
+
+     for(int i=0; i<bases.length; i++){
+         byte b=bases[i];
+         kmer.addRight(b);
+
+         byte q;
+         if(quals==null){
+             q=50;
+         }else{
+             q=quals[i];
+             prob=prob*align2.QualityTools.PROB_CORRECT[q];
+             if(kmer.len>k){
+                 byte oldq=quals[i-k];
+                 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
+             }
+         }
+
+         if(!AminoAcid.isFullyDefined(b) || q<minQuality){
+             kmer.clear();
+             prob=1;
+         }
+         if(kmer.len>=k && prob>=minProb){
+             keysCountedLocal++;
+             buffer[bufflen]=kmer.xor();
+             bufflen++;
+             if(bufflen>=buffer.length){
+                 count.increment(buffer);
+                 bufflen=0;
+                 if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];}
+             }
+         }
+     }
+ }
+
+ private void addReadSplit(Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
+     // Buffers gapped keys: a left part of k1 bases ending at i and a right part of k2
+     // bases ending at j=i+k1+gap, packed as (kmer1<<(2*k2))|kmer2. When rcomp is set the
+     // read is reverse-complemented in place and processed once more.
+
+     if(PREJOIN && r.mate!=null && r.insert()>0){
+         if(verbose){System.err.println("Prejoining "+r.numericID+" at "+r.insert());}
+         r.mate.reverseComplement();
+         r=r.joinRead();
+     }
+
+     int len=0; // valid steps in the current run, not counting the one being processed
+     int shift=k2*2;
+     long kmer1=0;
+     long kmer2=0;
+     float prob=1;
+     byte[] bases=r.bases;
+     byte[] quals=r.quality;
+
+     // The original asserted kmer1>=kmer2, which is trivially true for two zeros; the
+     // part lengths are presumably what was meant.
+     assert(k1>=k2);
+     if(bases==null){return;} // same tolerance for reads without bases as addRead
+
+     if(verbose){System.err.println("Hashing read "+r.numericID+"; loop limits "+(k1+gap)+"-"+(bases.length));}
+     for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
+         int x1=AminoAcid.baseToNumber[bases[i]];
+         int x2=AminoAcid.baseToNumber[bases[j]];
+
+         byte q1, q2;
+         if(quals==null){
+             q1=50;
+             q2=50;
+         }else{
+             q1=quals[i];
+             q2=quals[j];
+             prob=prob*align2.QualityTools.PROB_CORRECT[q1]*align2.QualityTools.PROB_CORRECT[q2];
+             // Each part has its own window: drop base i-k1 once k1 bases precede i and
+             // base j-k2 once k2 bases precede j. The original waited for len to exceed the
+             // thread's whole-kmer length k, so stale factors stayed in the product.
+             if(len>=k1){
+                 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[quals[i-k1]];
+             }
+             if(len>=k2){
+                 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[quals[j-k2]];
+             }
+         }
+
+         if(x1<0 || x2<0 || q1<minQuality || q2<minQuality){
+             len=0;
+             kmer1=0;
+             kmer2=0;
+             prob=1;
+         }else{
+             kmer1=((kmer1<<2)|x1)&mask1;
+             kmer2=((kmer2<<2)|x2)&mask2;
+             len++;
+             if(len>=k1 && prob>=minProb){
+
+                 keysCountedLocal++;
+
+                 long key=(kmer1<<shift)|kmer2;
+
+                 if(verbose){System.err.println("Hashing key "+Long.toHexString(key)+" at length "+len);}
+
+                 buffer[bufflen]=key;
+                 bufflen++;
+                 if(bufflen>=buffer.length){
+                     count.increment(buffer);
+                     bufflen=0;
+                     if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];}
+                 }
+
+             }
+         }
+     }
+
+     if(rcomp){
+         r.reverseComplement();
+         addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
+     }
+ }
+
+ private final ConcurrentReadInputStream cris; // shared input stream that read lists are pulled from
+ private final int k; // kmer length handed to the counting methods
+ private final boolean rcomp; // passed on to addRead/addReadSplit, which then also process the reverse complement
+ private final KCountArray counts; // array that buffered keys are submitted to
+ private final KCountArray trusted; // array used for error detection; null makes run() count without masking
+ private final int thresh; // passed to ErrorCorrect.detectErrorsBulk/detectTrusted
+ private final int detectStepsize; // passed to ErrorCorrect.detectErrorsBulk/detectTrusted
+ private final boolean conservative; // true selects detectErrorsBulk, false selects detectTrusted
+ private long keysCountedLocal=0; // keys buffered by this thread; added to keysCounted in run()
+ private long readsProcessedLocal=0; // reads seen by this thread; added to readsProcessed in run()
+ private long[] buffer; // keys waiting to be submitted through increment(...)
+ private int bufflen=0; // number of valid entries in buffer
+ private final boolean MAKE_NEW_ARRAY; // when true, a new buffer is allocated after each submitted one
+ }
+
+}
diff --git a/current/bloom/KmerCount7MTA.java b/current/bloom/KmerCount7MTA.java
new file mode 100755
index 0000000..44ed3bb
--- /dev/null
+++ b/current/bloom/KmerCount7MTA.java
@@ -0,0 +1,989 @@
+package bloom;
+
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import jgi.BBMerge;
+import jgi.ErrorCorrect;
+import kmer.KmerTableSet;
+
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import ukmer.Kmer;
+
+import align2.ListNum;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class KmerCount7MTA extends KmerCountAbstract {
+
+ public static void main(String[] args){ //Standalone entry point: args[0]=first reads file, args[1]=second file (if present), args[2..]=key=value options
+
+ Timer t=new Timer();
+
+ String fname1=args[0];
+ String fname2=(args.length>1 ? args[1] : null);
+ int k=14; //default kmer length
+ int cbits=16; //default bits per counter cell
+ int gap=0; //default: ungapped kmers
+ int matrixbits=-1; //negative means "derive from k" (see below)
+ int hashes=1;
+
+ for(int i=2; i<args.length; i++){ //options start at index 2; the first two arguments are always treated as file names
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase(); //option name
+ String b=(split.length>1 ? split[1] : "true"); //option value; a bare flag is read as "true"
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(a.equals("k") || a.equals("kmer")){
+ k=Integer.parseInt(b);
+ }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
+ cbits=Integer.parseInt(b);
+ }else if(a.startsWith("gap")){
+ gap=Integer.parseInt(b);
+ }else if(a.startsWith("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b); //sets the inherited static read limit
+ }else if(a.startsWith("matrixbits")){
+ matrixbits=Integer.parseInt(b);
+ }else if(a.startsWith("hashes")){
+ hashes=Integer.parseInt(b);
+ }else if(a.equals("canonical")){
+ CANONICAL=Tools.parseBoolean(b);
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ int kbits=Tools.min(2*k, 62); //2 bits per base, capped at 62 bits
+ if(matrixbits<0){
+ matrixbits=kbits;
+ }
+ matrixbits=Tools.min(kbits, matrixbits); //never request more cells than there are distinct keys
+
+ if(fileIO.FileFormat.hasFastaExtension(fname1)){
+ assert(!FastaReadInputStream.SPLIT_READS);
+ FastaReadInputStream.MIN_READ_LEN=k;
+ }
+
+ KCountArray counts=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes);
+ try {
+ counts=count(fname1, fname2, k, cbits, gap, true, false, counts); //rcomp=true, eccByOverlap=false
+ } catch (Exception e) {
+ // Counting failed: the stack trace is printed and execution continues with whatever was counted
+ e.printStackTrace();
+ }
+ counts.shutdown();
+
+// verbose=true;
+
+ t.stop();
+ System.out.println("Finished counting; time = "+t);
+
+ printStatistics(counts);
+
+ }
+
+ /**
+ * Prints a summary of the frequency histogram returned by counts.transformToFrequency() to stdout:
+ * one line for each of the first 8 indices, then buckets that double in width (8-15, 16-31, ...),
+ * followed by totals derived from the histogram and from the static keysCounted tally.
+ * Bucket totals are accumulated as long, and all indexing is bounded by the histogram length,
+ * so short histograms and very large counts are handled without overrun or overflow.
+ * @param counts Count array to summarize
+ */
+ public static void printStatistics(KCountArray counts){
+ final long[] freq=counts.transformToFrequency();
+
+ final long sum=sum(freq);
+ System.out.println("Kmer fraction:");
+ int lim1=8, lim2=min(16, freq.length);
+ //Individual lines for the lowest indices; stop early if the histogram is shorter than 8 entries.
+ for(int i=0; i<lim1 && i<freq.length; i++){
+ String prefix=i+"";
+ while(prefix.length()<8){prefix=prefix+" ";}
+ System.out.println(prefix+"\t"+String.format("%.3f%% ",(100L*freq[i]/(double)sum))+"\t"+freq[i]);
+ }
+ //Doubling buckets. Strictly less-than: with <=, a power-of-two length printed an extra empty "N+" line.
+ while(lim1<freq.length){
+ long bucket=0; //long, because "int x+=freq[i]" silently truncated large totals
+ for(int i=lim1; i<lim2; i++){
+ bucket+=freq[i];
+ }
+ String prefix=lim1+"-"+(lim2-1);
+ if(lim2>=freq.length){prefix=lim1+"+";}
+ while(prefix.length()<8){prefix=prefix+" ";}
+ System.out.println(prefix+"\t"+String.format("%.3f%% ",(100L*bucket/(double)sum))+"\t"+bucket);
+ lim1*=2;
+ lim2=min(lim2*2, freq.length);
+ }
+
+ final long sum2=sum-(freq.length>0 ? freq[0] : 0); //everything except index 0
+ long x=(freq.length>1 ? freq[1] : 0); //index 1 is reported as "Singleton"
+ System.out.println();
+ System.out.println("Keys Counted: \t \t"+keysCounted);
+ System.out.println("Unique: \t \t"+sum2);
+ System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2)));
+ System.out.println();
+ System.out.println("Singleton: \t"+String.format("%.3f%% ",(100L*x/(double)sum2))+"\t"+x);
+ x=sum2-x;
+ System.out.println("Useful: \t"+String.format("%.3f%% ",(100L*x/(double)sum2))+"\t"+x);
+ }
+
+ /**
+ * Convenience overload: ungapped, one hash, one pass, using the class-wide minQuality and maxReads.
+ * The matrix size is given as min(2*k, 35) bits.
+ */
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, int k, int cbits, boolean rcomp, boolean eccByOverlap){
+ final int gap=0, hashes=1;
+ final int passes=1, stepsize=1, thresh1=1, thresh2=2;
+ final int matrixbits=Tools.min(2*k, 35);
+ return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minQuality, rcomp, eccByOverlap, maxReads,
+ passes, stepsize, thresh1, thresh2);
+ }
+
+ /**
+ * Convenience overload taking the matrix size in bits; runs a single pass with
+ * stepsize 1 and thresholds 1 and 2.
+ */
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, boolean eccByOverlap, long maxreads){
+ assert(matrixbits<63);
+ final int passes=1, stepsize=1;
+ final int thresh1=1, thresh2=2;
+ return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minqual, rcomp, eccByOverlap, maxreads,
+ passes, stepsize, thresh1, thresh2);
+ }
+
+ /**
+ * Overload taking the matrix size in bits; converts it to a cell count (1L<<matrixbits)
+ * and delegates with no prefilter.
+ */
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, boolean eccByOverlap, long maxreads, int passes, int stepsize, int thresh1, int thresh2){
+ assert(matrixbits<63);
+ final long cells=1L<<matrixbits;
+ final KCountArray noPrefilter=null;
+ final int noPrefilterLimit=0;
+ return makeKca(fname1, fname2, extraFiles, k, cbits, gap, cells, hashes, minqual, rcomp, eccByOverlap,
+ maxreads, passes, stepsize, thresh1, thresh2, noPrefilter, noPrefilterLimit);
+ }
+
+ /**
+ * Overload taking an explicit cell count; delegates with no prefilter (null, limit 0).
+ */
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, boolean eccByOverlap, long maxreads, int passes, int stepsize, int thresh1, int thresh2){
+ final KCountArray noPrefilter=null;
+ final int noPrefilterLimit=0;
+ return makeKca(fname1, fname2, extraFiles, k, cbits, gap, cells, hashes, minqual, rcomp, eccByOverlap,
+ maxreads, passes, stepsize, thresh1, thresh2, noPrefilter, noPrefilterLimit);
+ }
+
+ /**
+ * List-based front end for makeKca. The first element of each file list becomes the primary
+ * fname1/fname2; the remaining elements of fname1, then of fname2, then all extraFiles are passed
+ * on as extra files (or null when there are none).
+ */
+ public static KCountArray makeKca_als(ArrayList<String> fname1, ArrayList<String> fname2, Iterable<String> extraFiles,
+ int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, boolean eccByOverlap, long maxreads, int passes, int stepsize, int thresh1, int thresh2,
+ KCountArray prefilter, int prefilterLimit_){
+ final ArrayList<String> overflow=new ArrayList<String>();
+ String primary1=null;
+ String primary2=null;
+
+ if(fname1!=null && !fname1.isEmpty()){
+ primary1=fname1.get(0);
+ overflow.addAll(fname1.subList(1, fname1.size()));
+ }
+ if(fname2!=null && !fname2.isEmpty()){
+ primary2=fname2.get(0);
+ overflow.addAll(fname2.subList(1, fname2.size()));
+ }
+ if(extraFiles!=null){
+ for(String extra : extraFiles){overflow.add(extra);}
+ }
+
+ final ArrayList<String> extras=(overflow.isEmpty() ? null : overflow);
+ return makeKca(primary1, primary2, extras, k, cbits, gap, cells, hashes, minqual, rcomp, eccByOverlap,
+ maxreads, passes, stepsize, thresh1, thresh2, prefilter, prefilterLimit_);
+ }
+
+ public static KCountArray makeKca(String fname1, String fname2, Iterable<String> extraFiles, //Full-featured builder that all other makeKca overloads delegate to
+ int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp,
+ boolean eccByOverlap, long maxreads, int passes, int stepsize, int thresh1, int thresh2,
+ KCountArray prefilter, int prefilterLimit_){
+ final int kbits=Tools.min(2*k, 62); //2 bits per base, capped at 62
+// verbose=true;
+ if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", gap="+gap+", cells="+Tools.toKMG(cells)+", cbits="+cbits);}
+
+ if(fname1==null && fname2==null && extraFiles==null){ //no input at all: return an array with nothing counted into it
+ return KCountArray.makeNew(1L<<kbits, cells, cbits, gap, hashes, prefilter, prefilterLimit_);
+ }
+
+ boolean oldsplit=FastaReadInputStream.SPLIT_READS; //static settings are saved here and restored before returning
+ long oldmax=maxReads;
+ byte oldq=minQuality;
+ maxReads=maxreads;
+ minQuality=(byte)minqual;
+ // System.out.println("kbits="+(kbits)+" -> "+(1L<<kbits)+", matrixbits="+(matrixbits)+" -> "+(1L<<matrixbits)+", cbits="+cbits+", gap="+gap+", hashes="+hashes);
+ KCountArray kca=KCountArray.makeNew(1L<<kbits, cells, cbits, gap, hashes, prefilter, prefilterLimit_);
+
+// System.out.println("a");
+ {//For processing input lists
+ ArrayList<String> extra2=null;
+ if(fname1!=null && fname1.contains(",")){ //comma-delimited list: first entry stays primary, the rest become extra files
+ String[] s=fname1.split(",");
+ if(extra2==null){extra2=new ArrayList<String>();}
+ for(int i=1; i<s.length; i++){extra2.add(s[i]);}
+ fname1=s[0];
+ }
+ if(fname2!=null && fname2.contains(",")){
+ String[] s=fname2.split(",");
+ if(extra2==null){extra2=new ArrayList<String>();}
+ for(int i=1; i<s.length; i++){extra2.add(s[i]);}
+ fname2=s[0];
+ }
+ if(extra2!=null){
+ if(extraFiles!=null){
+ for(String s : extraFiles){
+ extra2.add(s);
+ }
+ }
+ extraFiles=extra2; //split-off names first, followed by the caller's extra files
+ }
+ }
+// System.out.println("b");
+
+ if(extraFiles!=null){
+ for(String s : extraFiles){
+ if(fileIO.FileFormat.hasFastaExtension(s)){
+ assert(!FastaReadInputStream.SPLIT_READS);
+ }
+ }
+ }
+
+// System.out.println("c");
+ if(passes==1){ //single pass: count the primary files, then every extra file, into the same array
+// System.out.println("c1");
+ try {
+ count(fname1, fname2, k, cbits, gap, rcomp, eccByOverlap, kca);
+ } catch (Exception e) {
+ // Failure is reported via stack trace only; processing continues with the remaining inputs
+ e.printStackTrace();
+ }
+ if(extraFiles!=null){
+ maxReads=-1; //this count overload reads the static maxReads, so extra files are read with the limit set to -1
+ for(String s : extraFiles){
+ try {
+ count(s, null, k, cbits, gap, rcomp, eccByOverlap, kca);
+ } catch (Exception e) {
+ // Failure is reported via stack trace only; the remaining extra files are still processed
+ e.printStackTrace();
+ }
+ }
+ }
+ kca.shutdown();
+
+ }else{ //multi-pass: each pass's array becomes the "trusted" filter for the next pass
+// System.out.println("c2");
+ assert(passes>1);
+ KCountArray trusted=null;
+ for(int i=1; i<passes; i++){ //passes-1 preliminary passes; the final pass follows the loop
+ boolean conservative=i>2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3)
+ int step=(stepsize==1 ? 1 : stepsize+i%2);
+ // if(!conservative){step=(step+3)/4;}
+ if(!conservative){step=Tools.min(3, (step+3)/4);}
+
+ try {
+ count(fname1, fname2, k, cbits, rcomp, eccByOverlap, kca, trusted, maxreads, thresh1, step, conservative);
+ } catch (Exception e) {
+ // Failure is reported via stack trace only; processing continues
+ e.printStackTrace();
+ }
+ if(extraFiles!=null){
+ maxReads=-1; //NOTE(review): the count overload below takes maxreads as a parameter, so this static assignment does not appear to affect it - confirm intent
+ for(String s : extraFiles){
+ try {
+ count(s, null, k, cbits, rcomp, eccByOverlap, kca, trusted, maxreads, thresh1, step, conservative);
+ } catch (Exception e) {
+ // Failure is reported via stack trace only; the remaining extra files are still processed
+ e.printStackTrace();
+ }
+ }
+ }
+ kca.shutdown();
+
+ System.out.println("Trusted: \t"+kca.toShortString());
+ trusted=kca;
+ kca=KCountArray.makeNew(1L<<kbits, cells, cbits, gap, hashes, prefilter, prefilterLimit_); //fresh array for the next pass
+
+ }
+
+ try {
+ count(fname1, fname2, k, cbits, rcomp, eccByOverlap, kca, trusted, maxreads, thresh2, stepsize, true); //final pass: thresh2, caller's stepsize, always conservative
+ } catch (Exception e) {
+ // Failure is reported via stack trace only; processing continues
+ e.printStackTrace();
+ }
+ if(extraFiles!=null){
+ maxReads=-1;
+ for(String s : extraFiles){
+ try {
+ count(s, null, k, cbits, rcomp, eccByOverlap, kca, trusted, maxreads, thresh2, stepsize, true);
+ } catch (Exception e) {
+ // Failure is reported via stack trace only; the remaining extra files are still processed
+ e.printStackTrace();
+ }
+ }
+ }
+ kca.shutdown();
+ }
+// System.out.println("d");
+ minQuality=oldq; //restore the statics saved at the top
+ maxReads=oldmax;
+ FastaReadInputStream.SPLIT_READS=oldsplit;
+
+
+ return kca;
+ }
+
+ public static KCountArray count(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, boolean eccByOverlap, KCountArray counts) throws Exception{ //Counts kmers from the given read files into counts using THREADS worker threads; no trusted filter
+ assert(k>=1 && (counts!=null || k<20));
+ final int kbits=Tools.min(2*k, 62);
+ final long mask=~((-1L)<<(kbits)); //low kbits bits set; only used in the verbose message here
+// System.err.println("countFastq... making a new cris");
+ if(counts==null){ //no array supplied: allocate one with a cell for every possible key
+ final long cells=1L<<kbits;
+ if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+ counts=KCountArray.makeNew(cells, cbits, gap);
+ }
+ assert(gap==counts.gap);
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+// if(ff2!=null){ //TODO - interleaved flag
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); //uses the static maxReads limit
+ cris.start(); //4567
+ }
+
+ assert(cris!=null) : reads1;
+ if(verbose){System.err.println("Started cris");}
+ boolean paired=cris.paired();
+ if(verbose){System.err.println("Paired: "+paired);}
+
+// countFastq(cris, k, rcomp, count);
+// assert(false) : THREADS;
+ CountThread[] cta=new CountThread[THREADS];
+ for(int i=0; i<cta.length; i++){ //all workers pull from the same stream and write to the same array
+ cta[i]=new CountThread(cris, k, rcomp, eccByOverlap, counts);
+ cta[i].start();
+ }
+// System.out.println("~1");
+ for(int i=0; i<cta.length; i++){ //wait for every worker to terminate
+// System.out.println("~2");
+ CountThread ct=cta[i];
+ synchronized(ct){
+// System.out.println("~3");
+ while(ct.getState()!=State.TERMINATED){
+// System.out.println("~4");
+ try {
+ ct.join(2000); //timed join inside the loop, so the state is rechecked at least every 2 seconds
+ } catch (InterruptedException e) {
+ // Interruption is logged and the wait loop continues
+ e.printStackTrace();
+ }
+// System.out.println("~5");
+ }
+ }
+ }
+// System.out.println("~6");
+
+ ReadWrite.closeStream(cris);
+ if(verbose){System.err.println("Closed stream");}
+ if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
+
+
+ return counts;
+ }
+
+
+
+
+
+
+ public static KCountArray count(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp, final boolean eccByOverlap, //Counts kmers into counts, passing a trusted array and detection settings to each worker thread
+ KCountArray counts, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative) //the maxReads parameter shadows the inherited static of the same name
+ throws Exception{
+
+ assert(k>=1 && (counts!=null || k<20));
+ final int kbits=Tools.min(2*k, 62);
+ final long mask=~((-1L)<<(kbits)); //only used in the verbose message here
+
+// System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh);
+// System.out.println("\ntrusted=\n"+trusted);
+// System.out.println("\ncount=\n"+count);
+
+// verbose=true;
+
+ if(counts==null){ //no array supplied: allocate an ungapped one with a cell for every possible key
+ final long cells=1L<<kbits;
+ if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
+ counts=KCountArray.makeNew(cells, cbits, 0);
+ }
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+ cris.start(); //4567
+ }
+
+ assert(cris!=null) : reads1;
+ if(verbose){System.err.println("Started cris");}
+ boolean paired=cris.paired();
+ if(verbose){System.err.println("Paired: "+paired);}
+
+
+// countFastq(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
+
+// assert(false) : THREADS;
+ CountThread[] cta=new CountThread[THREADS];
+ for(int i=0; i<cta.length; i++){ //all workers share one stream, one output array and one trusted array
+ cta[i]=new CountThread(cris, k, rcomp, eccByOverlap, counts, trusted, thresh, detectStepsize, conservative);
+ cta[i].start();
+ }
+
+ for(int i=0; i<cta.length; i++){ //wait for every worker to finish
+ CountThread ct=cta[i];
+ synchronized(ct){
+ while(ct.isAlive()){
+ try {
+ ct.join(1000); //timed join inside the loop, so liveness is rechecked at least every second
+ } catch (InterruptedException e) {
+ // Interruption is logged and the wait loop continues
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ cris.close();
+ if(verbose){System.err.println("Closed stream");}
+
+// System.out.println("*** after ***");
+// System.out.println("\ntrusted=\n"+trusted);
+// System.out.println("\ncount=\n"+count);
+
+ return counts;
+ }
+
+ private static class CountThread extends Thread{
+
+ CountThread(final ConcurrentReadInputStream cris_, final int k_, final boolean rcomp_, final boolean eccByOverlap_, final KCountArray counts_){ //Worker without a trusted array; run() then takes the plain counting path
+ this(cris_, k_, rcomp_, eccByOverlap_, counts_, null, 2, 1, true); //trusted=null, thresh=2, detectStepsize=1, conservative=true
+ }
+
+ /**
+ * Creates a worker that reads from cris_ and counts into counts_.
+ * When trusted_ is non-null, run() uses the trusted-filtering count path with the
+ * given threshold, step size and conservative flag.
+ */
+ CountThread(final ConcurrentReadInputStream cris_, final int k_, final boolean rcomp_, final boolean eccByOverlap_,
+ final KCountArray counts_, final KCountArray trusted_, final int thresh_, final int detectStepsize_, final boolean conservative_){
+ //Input and output
+ this.cris=cris_;
+ this.counts=counts_;
+ this.trusted=trusted_;
+
+ //Kmer settings
+ this.k=k_;
+ this.rcomp=rcomp_;
+ this.eccByOverlap=eccByOverlap_;
+
+ //Error-detection settings
+ this.thresh=thresh_;
+ this.detectStepsize=detectStepsize_;
+ this.conservative=conservative_;
+ }
+
+ /**
+ * Counts kmers from the shared stream (with trusted filtering if a trusted array was supplied),
+ * then adds this thread's local tallies to the static totals while holding the lock on getClass().
+ */
+ public void run(){
+ final boolean filtering=(trusted!=null);
+ if(filtering){
+ count(cris, k, rcomp, counts, trusted, thresh, detectStepsize, conservative);
+ }else{
+ count(cris, k, rcomp, counts);
+ }
+
+ synchronized(getClass()){
+ keysCounted+=keysCountedLocal;
+ readsProcessed+=readsProcessedLocal;
+
+ if(verbose){
+ System.err.println(keysCounted+", "+keysCountedLocal);
+ System.err.println(readsProcessed+", "+readsProcessedLocal);
+ }
+ }
+ }
+
+
+
+ /**
+ * Pulls read lists from the stream until a null or empty list arrives, hashing the kmers of
+ * every read (and its mate) into counts.
+ * Ungapped arrays (counts.gap==0) use addRead_Advanced for k<32 and addReadBig otherwise;
+ * gapped arrays use addReadSplit on the read and then on its mate.
+ * The terminating list is returned to the stream only if it is non-null; the original
+ * dereferenced it unconditionally and threw a NullPointerException on a null terminator.
+ * @param cris Read stream shared with the other worker threads
+ * @param k Kmer length
+ * @param rcomp Forwarded to addReadSplit in the gapped path; not used in the ungapped path
+ * @param counts Destination count array
+ */
+ private final void count(ConcurrentReadInputStream cris, int k, boolean rcomp, KCountArray counts){
+ assert(k>=1 && counts!=null);
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ long[] array=null; //scratch buffer reused by addRead_Advanced
+ final Kmer kmer=new Kmer(k);
+ if(counts.gap==0){
+ final int kbits=Tools.min(2*k, 62);
+ final long mask=~((-1L)<<(kbits));
+
+ while(reads!=null && reads.size()>0){
+ for(Read r1 : reads){
+
+ final Read r2=r1.mate;
+ if(eccByOverlap && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+ readsProcessedLocal++;
+
+ if(k<32){
+ array=addRead_Advanced(r1, counts, k, mask, array);
+ }else{
+ addReadBig(r1, kmer);
+ addReadBig(r1.mate, kmer); //addReadBig ignores a null read
+ }
+ }
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ }else{
+ //Gapped kmers are split into a left half of k1 bases and a right half of k2 bases.
+ final int k1=(k+1)/2;
+ final int k2=k/2;
+ final int kbits1=2*k1;
+ final int kbits2=2*k2;
+ final int gap=counts.gap;
+ final long mask1=~((-1L)<<(kbits1));
+ final long mask2=~((-1L)<<(kbits2));
+ while(reads!=null && reads.size()>0){
+ for(Read r1 : reads){
+ final Read r2=r1.mate;
+ if(eccByOverlap && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+ readsProcessedLocal++;
+ addReadSplit(r1, counts, k1, k2, mask1, mask2, gap, rcomp);
+ if(r1.mate!=null){
+ addReadSplit(r1.mate, counts, k1, k2, mask1, mask2, gap, rcomp);
+ }
+ }
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ }
+
+ if(verbose){System.err.println("Finished reading");}
+ //The loop ends on a null list, a null ln.list or an empty list; only a non-null ln can be returned.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ if(verbose){System.err.println("Returned list");}
+ }
+
+
+
+
+ /**
+ * Counting path used when a trusted array is available. Gapped arrays are handed to
+ * countFastqSplit. Otherwise each read and its mate are screened with
+ * ErrorCorrect.detectErrorsBulk (conservative) or ErrorCorrect.detectTrusted; every position whose
+ * bit is clear in the returned BitSet is overwritten with 'N' (quality 0), and the pair is then
+ * hashed with addRead_Advanced.
+ * The terminating list is returned to the stream only if it is non-null; the original
+ * dereferenced it unconditionally and threw a NullPointerException on a null terminator.
+ * @param cris Read stream shared with the other worker threads
+ * @param k Kmer length
+ * @param rcomp Forwarded to countFastqSplit only
+ * @param count Destination count array
+ * @param trusted Array consulted by the ErrorCorrect calls; screening is skipped when null
+ * @param thresh Threshold forwarded to the ErrorCorrect calls
+ * @param detectStepsize Step size forwarded to the ErrorCorrect calls
+ * @param conservative Selects detectErrorsBulk (true) or detectTrusted (false)
+ */
+ private final void count(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
+ final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
+ if(count.gap>0){
+ countFastqSplit(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);
+ return;
+ }
+ assert(k>=1 && (count!=null || k<20));
+ final int kbits=Tools.min(2*k, 62);
+ final long mask=~((-1L)<<(kbits));
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ long[] array=null; //scratch buffer reused by addRead_Advanced
+ while(reads!=null && reads.size()>0){
+ for(Read r1 : reads){
+
+ Read r2=r1.mate;
+ if(eccByOverlap && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+ if(trusted!=null){
+ BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r1, trusted, k, thresh, detectStepsize) :
+ ErrorCorrect.detectTrusted(r1, trusted, k, thresh, detectStepsize));
+ if(bs!=null){
+ //Mask every position whose bit is clear.
+ for(int i=bs.nextClearBit(0); i<r1.length(); i=bs.nextClearBit(i+1)){
+ r1.bases[i]='N';
+ if(r1.quality!=null){r1.quality[i]=0;}
+ }
+ }
+ }
+ if(r2!=null && trusted!=null){
+ BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
+ ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
+ if(bs!=null){
+ for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
+ r2.bases[i]='N';
+ if(r2.quality!=null){r2.quality[i]=0;}
+ }
+ }
+ }
+ array=addRead_Advanced(r1, count, k, mask, array); //hashes r1 and its mate together
+
+ }
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+
+ if(verbose){System.err.println("Finished reading");}
+ //The loop ends on a null list, a null ln.list or an empty list; only a non-null ln can be returned.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ if(verbose){System.err.println("Returned list");}
+ }
+
+
+ /**
+ * Gapped-kmer variant of the trusted counting path. Each read and its mate are screened against
+ * the trusted array (positions with a clear bit in the returned BitSet become 'N' with quality 0)
+ * and then hashed with addReadSplit.
+ * The leading assert(false) is kept from the original, so with assertions enabled this method
+ * fails immediately.
+ * Compared with the original, a null BitSet and a null quality array are now tolerated (matching
+ * the ungapped path), the terminating list is returned only when non-null, and two unused locals
+ * were dropped.
+ * @param cris Read stream shared with the other worker threads
+ * @param k Total kmer length; split into k1=(k+1)/2 and k2=k/2
+ * @param rcomp Forwarded to addReadSplit
+ * @param counts Destination count array; must be gapped
+ * @param trusted Array consulted by the ErrorCorrect calls; screening is skipped when null
+ * @param thresh Threshold forwarded to the ErrorCorrect calls
+ * @param detectStepsize Step size forwarded to the ErrorCorrect calls
+ * @param conservative Selects detectErrorsBulk (true) or detectTrusted (false)
+ */
+ private void countFastqSplit(final ConcurrentReadInputStream cris, final int k, final boolean rcomp,
+ final KCountArray counts, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){
+ assert(false) : cris.paired();
+ assert(counts.gap>0);
+ assert(k<32 && k>=1 && (counts!=null || k<20));
+
+ final int k1=(k+1)/2;
+ final int k2=k/2;
+ final int kbits1=2*k1;
+ final int kbits2=2*k2;
+ final int gap=counts.gap;
+ final long mask1=~((-1L)<<(kbits1));
+ final long mask2=~((-1L)<<(kbits2));
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ while(reads!=null && reads.size()>0){
+ for(Read r1 : reads){
+
+ Read r2=r1.mate;
+ if(eccByOverlap && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+ if(trusted!=null){
+ BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r1, trusted, k, thresh, detectStepsize) :
+ ErrorCorrect.detectTrusted(r1, trusted, k, thresh, detectStepsize));
+ if(bs!=null){
+ for(int i=bs.nextClearBit(0); i<r1.length(); i=bs.nextClearBit(i+1)){
+ r1.bases[i]='N';
+ if(r1.quality!=null){r1.quality[i]=0;}
+ }
+ }
+ }
+ addReadSplit(r1, counts, k1, k2, mask1, mask2, gap, rcomp);
+
+ if(r2!=null){
+ if(trusted!=null){
+ BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r2, trusted, k, thresh, detectStepsize) :
+ ErrorCorrect.detectTrusted(r2, trusted, k, thresh, detectStepsize));
+ if(bs!=null){
+ for(int i=bs.nextClearBit(0); i<r2.length(); i=bs.nextClearBit(i+1)){
+ r2.bases[i]='N';
+ if(r2.quality!=null){r2.quality[i]=0;}
+ }
+ }
+ }
+ addReadSplit(r2, counts, k1, k2, mask1, mask2, gap, rcomp);
+ }
+
+ }
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+
+ if(verbose){System.err.println("Finished reading");}
+ //The loop ends on a null list, a null ln.list or an empty list; only a non-null ln can be returned.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ if(verbose){System.err.println("Returned list");}
+ }
+
+
+ /**
+ * Hash a read's kmers into the KCountArray.
+ * Advanced mode processes paired reads together and sorts kmers to eliminate spurious duplicates.
+ * @param r1 Read to hash; its mate, if present, is hashed into the same buffer
+ * @param counts Destination count array; must be ungapped
+ * @param k Kmer length
+ * @param mask Bitmask forwarded to fillKmerArray
+ * @param array Scratch buffer from a previous call, or null; replaced when its length differs from the number of kmer slots needed
+ * @return The scratch buffer that was used (possibly newly allocated), so the caller can pass it back in
+ */
+ private final long[] addRead_Advanced(Read r1, final KCountArray counts, final int k, final long mask, long[] array){
+ assert(counts.gap==0) : "Gapped: TODO";
+ if(PREJOIN && r1.mate!=null && r1.insert()>0){ //optionally replace the pair with r1.joinRead() before hashing
+ r1.mate.reverseComplement();
+ r1=r1.joinRead();
+ }
+ Read r2=r1.mate;
+ int len1=r1.bases==null ? 0 : Tools.max(0, r1.length()-k+1); //number of kmer slots for r1
+ int len2=(r2==null || r2.bases==null) ? 0 : Tools.max(0, r2.length()-k+1); //number of kmer slots for r2
+ int len=len1+len2;
+ if(len<1){return array;}
+ if(array==null || array.length!=len){array=new long[len];}
+ Arrays.fill(array, -1); //-1 marks a slot with no kmer
+ fillKmerArray(r1, k, mask, array, 0, len1); //r1 occupies slots [0, len1)
+ if(r2!=null){fillKmerArray(r2, k, mask, array, len1, len);} //r2 occupies slots [len1, len)
+ if(KEEP_DUPLICATE_KMERS){
+ for(long kmer : array){
+ if(kmer!=-1){
+ keysCountedLocal++;
+ counts.increment(kmer);
+ }
+ }
+ }else{
+ Arrays.sort(array); //equal kmers become adjacent, so each distinct value is incremented once per call
+ long prev=-1; //starting at -1 also skips the unfilled slots
+ for(int i=0; i<array.length; i++){
+ long kmer=array[i];
+ if(kmer!=prev){
+ keysCountedLocal++;
+ counts.increment(kmer);
+ prev=kmer;
+ }
+ }
+ }
+ return array;
+ }
+
+ private final void addReadBig(Read r, Kmer kmer){ //Hashes the kmers of one read using the caller-supplied Kmer object; a null read or null bases is ignored
+ if(r==null || r.bases==null){return;}
+ final byte[] bases=r.bases;
+ final byte[] quals=r.quality;
+ int len=0; //length of the current run of defined bases
+
+ if(bases==null || bases.length<k){return;} //k here is the thread's field, not a parameter
+ kmer.clear();
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ float prob=1; //running product of per-base PROB_CORRECT values
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+
+ //Update kmers
+ kmer.addRight(b);
+
+ if(minProb>0 && quals!=null){//Update probability
+ prob=prob*KmerTableSet.PROB_CORRECT[quals[i]];
+ if(len>k){ //divide out the quality of the base k positions back
+ byte oldq=quals[i-k];
+ prob=prob*KmerTableSet.PROB_CORRECT_INVERSE[oldq];
+ }
+ }
+
+ //Handle Ns
+ if(x<0){
+ len=0;
+ prob=1;
+ }else{len++;}
+
+ assert(len==kmer.len);
+
+ if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-k), Tools.min(i+1, k)));}
+ if(len>=k && prob>=minProb){ //a full kmer of defined bases that passes the probability cutoff
+// System.err.println("Incrementing xor()="+kmer.xor());
+ counts.incrementAndReturnUnincremented(kmer.xor(), 1); //increments the thread's counts field under key kmer.xor()
+ keysCountedLocal++;
+ }
+ }
+ }
+
+ private final void fillKmerArray(Read r, final int k, final long mask, final long[] array, final int start, final int stop){ //Writes the read's kmers into array[start, stop), keeping the larger of the forward and reverse-complement value per slot
+ if(k>31){ //long kmers are handled by the Kmer-object path
+ fillKmerArrayLong(r, k, array, start, stop);
+ return;
+ }
+ assert(counts.gap==0);
+ assert(k<32);
+ assert(!PREJOIN || r.mate==null);
+ assert(CANONICAL);
+ assert(array!=null);
+
+ final byte[] bases=r.bases;
+ final byte[] quals=r.quality;
+
+ if(bases==null || bases.length<k+counts.gap){return;}
+
+ for(int pass=0; pass<2; pass++){ //pass 0 = read as given, pass 1 = after reverseComplement(); the second call at the end restores the read
+ int len=0; //length of the current run of usable bases
+ int idx=(pass==0 ? start-k+1 : stop+k-2); //pass 0 walks slots forward from start, pass 1 walks backward from stop-1, so both strands of a kmer land in the same slot
+ long kmer=0;
+ float prob=1;
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+
+ byte q;
+ if(quals==null){
+ q=50; //placeholder quality used when the read has no quality array
+ }else{
+ q=quals[i];
+ prob=prob*align2.QualityTools.PROB_CORRECT[q];
+ if(len>k){ //divide out the quality of the base k positions back
+ byte oldq=quals[i-k];
+ prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
+ }
+ }
+
+ if(x<0 || q<minQuality){ //undefined or low-quality base: restart the kmer
+ len=0;
+ kmer=0;
+ prob=1;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+ if(len>=k && prob>=minProb){
+ array[idx]=Tools.max(array[idx], kmer); //slots start at -1 (set by the caller), so the first valid kmer always replaces it
+ }
+ }
+ if(pass==0){idx++;}else{idx--;}
+ }
+// System.out.println(Arrays.toString(array));
+ r.reverseComplement();
+ }
+ }
+
+ private final void fillKmerArrayLong(Read r, final int k, final long[] array, final int start, final int stop){
+ assert(k>31);
+ assert(counts.gap==0);
+ assert(!PREJOIN || r.mate==null);
+ assert(CANONICAL);
+ assert(array!=null);
+ Kmer kmer=new Kmer(k);
+
+ float prob=1;
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+
+ assert(k>31) : k;
+ kmer.clear();
+
+ for(int i=0, idx=start-k+1; i<bases.length; i++, idx++){
+ byte b=bases[i];
+ kmer.addRight(b);
+
+ byte q;
+ if(quals==null){
+ q=50;
+ }else{
+ q=quals[i];
+ prob=prob*align2.QualityTools.PROB_CORRECT[q];
+ if(kmer.len>k){
+ byte oldq=quals[i-k];
+ prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
+ }
+ }
+
+ if(!AminoAcid.isFullyDefined(b) || q<minQuality){
+ kmer.clear();
+ prob=1;
+ }
+ if(kmer.len>=k && prob>=minProb){
+ array[idx]=kmer.xor();
+ }
+ }
+ }
+
+ private final void addReadSplit(Read r, final KCountArray counts, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ //Hashes gapped kmers: a left half of k1 bases and a right half starting k1+gap bases later, combined into one key
+ assert(false) : "TODO"; //with assertions enabled this method always fails here
+ if(PREJOIN && r.mate!=null && r.insert()>0){
+ if(verbose){System.err.println("Prejoining "+r.numericID+" at "+r.insert());}
+ r.mate.reverseComplement();
+ r=r.joinRead();
+ }
+
+ int len=0; //length of the current run of usable base pairs
+ int shift=k2*2; //bit width of the right half; the left half is shifted above it
+ long kmer1=0;
+ long kmer2=0;
+ float prob=1;
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+
+ assert(kmer1>=kmer2);
+
+// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
+
+ if(verbose){System.err.println("Hashing read "+r.numericID+"; loop limits "+(k1+gap)+"-"+(bases.length));}
+ for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ //i indexes the left half, j the right half, always k1+gap apart
+ int x1=AminoAcid.baseToNumber[bases[i]];
+ int x2=AminoAcid.baseToNumber[bases[j]];
+
+ byte q1, q2;
+ if(quals==null){
+ q1=50; //placeholder qualities used when the read has no quality array
+ q2=50;
+ }else{
+ q1=quals[i];
+ q2=quals[j];
+ prob=prob*align2.QualityTools.PROB_CORRECT[q1]*align2.QualityTools.PROB_CORRECT[q2];
+ if(len>k){ //NOTE(review): compares against the thread's field k rather than k1/k2 - confirm intent
+ byte oldq1=quals[i-k1];
+ byte oldq2=quals[j-k2];
+ prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq1]*align2.QualityTools.PROB_CORRECT_INVERSE[oldq2];
+ }
+ }
+
+ if(x1<0 || x2<0 || q1<minQuality || q2<minQuality){ //undefined or low-quality base in either half: restart
+ len=0;
+ kmer1=0;
+ kmer2=0;
+ prob=1;
+ }else{
+ kmer1=((kmer1<<2)|x1)&mask1;
+ kmer2=((kmer2<<2)|x2)&mask2;
+ len++;
+ if(len>=k1 && prob>=minProb){
+
+ keysCountedLocal++;
+// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
+
+ long key=(kmer1<<shift)|kmer2; //left half in the high bits, right half in the low bits
+// System.err.println(Long.toHexString(key));
+
+ if(verbose){System.err.println("Hashing key "+Long.toHexString(key)+" at length "+len);}
+ counts.increment(key);
+// count.increment(kmer);
+
+
+// System.out.println(" -> "+count.read(kmer));
+// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
+// array[(int)kmer]++;
+// System.out.println(" -> "+array[(int)kmer]+"\n");
+// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
+ }
+ }
+ }
+ if(rcomp){ //hash the opposite strand once; the recursive call passes rcomp=false, and the read is left reverse-complemented
+ r.reverseComplement();
+ addReadSplit(r, counts, k1, k2, mask1, mask2, gap, false);
+ }
+ }
+
+ private final ConcurrentReadInputStream cris; //read stream this worker pulls lists from
+ private final int k; //kmer length
+ private final boolean rcomp; //forwarded to addReadSplit
+ private final boolean eccByOverlap; //when true, BBMerge.findOverlapStrict is called on each pair before hashing
+ private final KCountArray counts; //destination count array
+ private final KCountArray trusted; //array used to screen reads; null selects the plain counting path in run()
+ private final int thresh; //threshold forwarded to the ErrorCorrect calls
+ private final int detectStepsize; //step size forwarded to the ErrorCorrect calls
+ private final boolean conservative; //true selects detectErrorsBulk, false selects detectTrusted
+ private long keysCountedLocal=0; //per-thread tally, added to the static keysCounted in run()
+ private long readsProcessedLocal=0; //per-thread tally, added to the static readsProcessed in run()
+ }
+
+}
diff --git a/current/bloom/KmerCountAbstract.java b/current/bloom/KmerCountAbstract.java
new file mode 100755
index 0000000..902bcc0
--- /dev/null
+++ b/current/bloom/KmerCountAbstract.java
@@ -0,0 +1,53 @@
+package bloom;
+
+import align2.Shared;
+
+/**
+ * Shared static settings, counters and small numeric helpers for kmer-counting classes.
+ * @author Brian Bushnell
+ * @date Dec 2, 2014
+ *
+ */
+public abstract class KmerCountAbstract {
+
+ /**
+ * Builds a 2000-bin histogram of the values in count; values of 1999 or more share the last bin.
+ * @param count Values to tally
+ * @return Histogram where bin i holds the number of entries equal to i
+ */
+ protected static final long[] transformToFrequency(int[] count){
+ final long[] histogram=new long[2000];
+ final int lastBin=histogram.length-1;
+ for(final int value : count){
+ histogram[min(value, lastBin)]++;
+ }
+ return histogram;
+ }
+
+ /** Returns the sum of the array's elements as a long. */
+ protected static final long sum(int[] array){
+ long total=0;
+ for(int i=0; i<array.length; i++){total+=array[i];}
+ return total;
+ }
+
+ /** Returns the sum of the array's elements. */
+ protected static final long sum(long[] array){
+ long total=0;
+ for(int i=0; i<array.length; i++){total+=array[i];}
+ return total;
+ }
+
+ /** Smaller of two ints. */
+ protected static final int min(int x, int y){return Math.min(x, y);}
+ /** Larger of two ints. */
+ protected static final int max(int x, int y){return Math.max(x, y);}
+
+ /** Bases with quality below this value break a kmer. */
+ public static byte minQuality=6;
+ /** Running total of reads processed, accumulated across worker threads. */
+ public static long readsProcessed=0;
+ /** Read limit passed to the input stream. */
+ public static long maxReads=-1;
+ public static int BUFFERLEN=500;
+
+ /** Kmers whose product of per-base correctness probabilities falls below this value are skipped. */
+ public static float minProb=0.5f;
+
+ /** Running total of keys counted, accumulated across worker threads. */
+ public static long keysCounted=0;
+
+ /** Number of worker threads to launch. */
+ public static int THREADS=Shared.threads();
+ public static final boolean verbose=false;
+ public static boolean PREJOIN=false;
+ public static boolean CANONICAL=false;
+ public static boolean KEEP_DUPLICATE_KMERS=false;
+
+}
diff --git a/current/bloom/LargeKmerCount.java b/current/bloom/LargeKmerCount.java
new file mode 100755
index 0000000..e44958a
--- /dev/null
+++ b/current/bloom/LargeKmerCount.java
@@ -0,0 +1,242 @@
+package bloom;
+
+import java.util.ArrayList;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.FastqReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 6, 2012
+ *
+ */
+public class LargeKmerCount {
+
+public static void main(String[] args){
+     //Arguments: reads1 [reads2] indexbits cbits k. Counts kmers, then prints a count histogram and collision statistics.
+     Timer t=new Timer();
+
+     String fname1=args[0];
+     String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null); //second file only if 5+ args or args[1] contains a '.'
+     int indexbits=Integer.parseInt(args[args.length-3]);
+     int cbits=Integer.parseInt(args[args.length-2]);
+     int k=Integer.parseInt(args[args.length-1]);
+
+     KCountArray2 count=countFastq(fname1, fname2, indexbits, cbits, k);
+     t.stop();
+     System.out.println("Finished counting; time = "+t);
+
+     long[] freq=count.transformToFrequency();
+
+//     System.out.println(count+"\n");
+//     System.out.println(Arrays.toString(freq)+"\n");
+
+     long sum=sum(freq);
+     System.out.println("Kmer fraction:");
+     int lim1=8, lim2=16;
+     for(int i=0; i<lim1; i++){ //one row per exact count below lim1
+         String prefix=i+"";
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+     }
+     while(lim1<=freq.length){ //then one row per doubling range [lim1, lim2)
+         long x=0; //long, not int: freq holds longs and "int+=long" would silently truncate a large range total
+         for(int i=lim1; i<lim2; i++){
+             x+=freq[i];
+         }
+         String prefix=lim1+"-"+(lim2-1);
+         if(lim2>=freq.length){prefix=lim1+"+";}
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x);
+         lim1*=2;
+         lim2=min(lim2*2, freq.length);
+     }
+
+     long sum2=sum-freq[0]; //everything except the count-0 bin
+     long x=freq[1];
+     System.out.println();
+     System.out.println("Unique: \t \t"+sum2);
+     System.out.println("CollisionsA:\t \t"+collisionsA);
+     System.out.println("CollisionsB:\t \t"+collisionsB);
+
+     double modifier=(collisionsB)/(double)(32*collisionsA+8*collisionsB);
+
+     System.out.println("Estimate: \t \t"+(sum2+collisionsA+collisionsB-(long)(collisionsA*modifier)));
+     System.out.println();
+     System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+     x=sum2-x;
+     System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+
+ }
+
+ /** Counts kmers of length k from one or two fastq files into a new KCountArray2 with 2^indexbits cells, using a rolling rotate-and-XOR hash; resets collisionsA and collisionsB first. */
+ public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){
+     assert(indexbits>=1 && indexbits<40);
+     collisionsA=0; collisionsB=0;
+     final long cells=1L<<indexbits;
+     final int kbits=ROTATE_DIST*k;
+     final int xorShift=kbits%64; //net rotation of a base's bits after k steps in the 64-bit hash
+     final long[] rotMasks=makeRotMasks(xorShift);
+     final int[] buffer=new int[k]; //ring buffer of the most recent k base codes
+     if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);}
+     if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));}
+     final KCountArray2 count=new KCountArray2(cells, cbits);
+
+     FastqReadInputStream fris1=new FastqReadInputStream(reads1, false);
+     FastqReadInputStream fris2=(reads2==null ? null : new FastqReadInputStream(reads2, false));
+     ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, maxReads);
+
+     cris.start();
+     System.err.println("Started cris");
+     boolean paired=cris.paired();
+     System.err.println("Paired: "+paired);
+
+     long kmer=0; //current kmer
+     int len=0; //distance since last contig start or ambiguous base
+
+     {
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert(paired==(r.mate!=null));
+         }
+
+         while(reads!=null && reads.size()>0){
+             //System.err.println("reads.size()="+reads.size());
+             for(Read r : reads){
+                 readsProcessed++;
+
+                 len=0;
+                 kmer=0;
+                 byte[] bases=r.bases;
+                 //Quality scores are not consulted by this counter.
+
+                 for(int i=0; i<bases.length; i++){
+                     byte b=bases[i];
+                     int x=AminoAcid.baseToNumber[b];
+                     int x2=buffer[len%buffer.length]; //code written k steps earlier; only used once the window is full
+                     buffer[len%buffer.length]=x;
+                     if(x<0){
+                         len=0;
+                         kmer=0;
+                     }else{
+                         kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
+                         len++;
+                         if(len>=k){
+                             if(len>k){kmer=kmer^rotMasks[x2];} //cancel the contribution of the base that left the window
+                             long hashcode=kmer&0x7fffffffffffffffL;
+                             long code1=hashcode%(cells-3);
+                             long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
+                             int value=count.increment2(code1, 1); //presumably the cell's value before the increment - confirm in KCountArray2
+                             long temp=count.read(code2);
+                             if(temp>0){
+                                 if(value==0){collisionsA++;}
+                                 else{collisionsB++;}
+                             }
+                         }
+                     }
+                 }
+
+
+                 if(r.mate!=null){
+                     len=0;
+                     kmer=0;
+                     bases=r.mate.bases;
+                     //Same rolling-hash loop as above, applied to the mate.
+                     for(int i=0; i<bases.length; i++){
+                         byte b=bases[i];
+                         int x=AminoAcid.baseToNumber[b];
+                         int x2=buffer[len%buffer.length];
+                         buffer[len%buffer.length]=x;
+                         if(x<0){
+                             len=0;
+                             kmer=0;
+                         }else{
+                             kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
+                             len++;
+                             if(len>=k){
+                                 if(len>k){kmer=kmer^rotMasks[x2];}
+                                 long hashcode=kmer&0x7fffffffffffffffL;
+                                 long code1=hashcode%(cells-3);
+                                 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
+                                 int value=count.increment2(code1, 1);
+                                 long temp=count.read(code2);
+                                 if(temp>0){
+                                     if(value==0){collisionsA++;}
+                                     else{collisionsB++;}
+                                 }
+                             }
+                         }
+                     }
+                 }
+
+             }
+             //System.err.println("returning list");
+             cris.returnList(ln.id, ln.list.isEmpty());
+             //System.err.println("fetching list");
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         System.err.println("Finished reading");
+         if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} //the loop also ends when nextList() returned null
+         System.err.println("Returned list");
+         ReadWrite.closeStream(cris);
+         System.err.println("Closed stream");
+         System.err.println("Processed "+readsProcessed+" reads.");
+     }
+
+     return count;
+ }
+
+ public static final long[] makeRotMasks(int rotDist){ //Returns the values 0..3, each rotated left by rotDist bits.
+     final long[] masks=new long[4];
+     for(int code=0; code<masks.length; code++){
+         masks[code]=Long.rotateLeft((long)code, rotDist);
+     }
+     return masks;
+ }
+
+ public static long[] transformToFrequency(int[] count){ //Histogram of the values in count, 2000 bins.
+     final long[] histogram=new long[2000];
+     final int lastBin=histogram.length-1;
+     for(final int value : count){
+         //Clamp so that values of lastBin or more all land in the final bin.
+         final int bin=min(value, lastBin);
+         histogram[bin]++;
+     }
+     return histogram;
+ }
+
+ public static long sum(int[] array){ //Sum of all elements, accumulated in a long.
+     long total=0;
+     for(int i=0; i<array.length; i++){total+=array[i];}
+     return total;
+ }
+
+ public static long sum(long[] array){ //Sum of all elements; wraps around on long overflow.
+     long total=0;
+     for(int i=0; i<array.length; i++){total+=array[i];}
+     return total;
+ }
+
+ public static final int min(int x, int y){return Math.min(x, y);} //smaller of the two arguments
+ public static final int max(int x, int y){return Math.max(x, y);} //larger of the two arguments
+
+ public static boolean verbose=true; //when true, countFastq prints its parameters to stderr
+ public static byte minQuality=-5; //not referenced anywhere in this class
+ public static long readsProcessed=0; //incremented by countFastq once per Read in each list (mates are not counted separately); never reset here
+ public static long maxReads=1000000L; //passed to the ConcurrentGenericReadInputStream constructor in countFastq
+ public static final int ROTATE_DIST=2; //bits the rolling hash is rotated left per base
+
+ public static long collisionsA=0; //countFastq: cell code2 read as nonzero and increment2(code1, 1) returned 0
+ public static long collisionsB=0; //countFastq: cell code2 read as nonzero and increment2(code1, 1) returned nonzero
+
+}
diff --git a/current/bloom/LargeKmerCount2.java b/current/bloom/LargeKmerCount2.java
new file mode 100755
index 0000000..b5ba26d
--- /dev/null
+++ b/current/bloom/LargeKmerCount2.java
@@ -0,0 +1,333 @@
+package bloom;
+
+import java.util.ArrayList;
+import java.util.Random;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 6, 2012
+ *
+ */
+public class LargeKmerCount2 {
+
+public static void main(String[] args){
+     //Arguments: reads1 [reads2] indexbits cbits k. Counts kmers, then prints a count histogram and collision-based estimates.
+     Timer t=new Timer();
+
+     String fname1=args[0];
+     String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null); //second file only if 5+ args or args[1] contains a '.'
+     int indexbits=Integer.parseInt(args[args.length-3]);
+     int cbits=Integer.parseInt(args[args.length-2]);
+     int k=Integer.parseInt(args[args.length-1]);
+
+     KCountArray2 count=null;
+
+     if(fileIO.FileFormat.hasFastaExtension(fname1)){
+         FastaReadInputStream.MIN_READ_LEN=k;
+     }
+     count=countFastq(fname1, fname2, indexbits, cbits, k);
+
+     FastaReadInputStream.TARGET_READ_LEN=999999999; //NOTE(review): assigned only after counting has finished - confirm this ordering is intended
+
+     t.stop();
+     System.out.println("Finished counting; time = "+t);
+
+     long[] freq=count.transformToFrequency();
+
+//     System.out.println(count+"\n");
+//     System.out.println(Arrays.toString(freq)+"\n");
+
+     long sum=sum(freq);
+     System.out.println("Kmer fraction:");
+     int lim1=8, lim2=16;
+     for(int i=0; i<lim1; i++){ //one row per exact count below lim1
+         String prefix=i+"";
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
+     }
+     while(lim1<=freq.length){ //then one row per doubling range [lim1, lim2)
+         long x=0; //long, not int: freq holds longs and "int+=long" would silently truncate a large range total
+         for(int i=lim1; i<lim2; i++){
+             x+=freq[i];
+         }
+         String prefix=lim1+"-"+(lim2-1);
+         if(lim2>=freq.length){prefix=lim1+"+";}
+         while(prefix.length()<8){prefix=prefix+" ";}
+         System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x);
+         lim1*=2;
+         lim2=min(lim2*2, freq.length);
+     }
+
+     long estKmers=load+min(actualCollisions, (long)expectedCollisions);
+
+     long sum2=sum-freq[0]; //everything except the count-0 bin
+     long x=freq[1];
+     System.out.println();
+     System.out.println("Keys Counted: \t \t"+keysCounted);
+     System.out.println("Unique: \t \t"+sum2);
+     System.out.println("probCollisions:\t \t"+(long)probNewKeyCollisions);
+     System.out.println("EstimateP: \t \t"+(sum2+(long)probNewKeyCollisions));
+     System.out.println("expectedColl: \t \t"+(long)expectedCollisions);
+     System.out.println("actualColl: \t \t"+(long)actualCollisions);
+     System.out.println("estimateKmers: \t \t"+estKmers);
+     System.out.println();
+     System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+     x=sum2-x;
+     System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x);
+
+ }
+
+ /** Counts kmers of length k from one or two read files into a new KCountArray2 with 2^indexbits cells (rolling rotate-and-XOR hash) while maintaining the static collision estimators, which are reset on entry. */
+ public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){
+     assert(indexbits>=1 && indexbits<40);
+     final long cells=1L<<indexbits;
+     final int kbits=ROTATE_DIST*k;
+     final int xorShift=kbits%64; //net rotation of a base's bits after k steps in the 64-bit hash
+     final long[] rotMasks=makeRotMasks(xorShift);
+     final int[] buffer=new int[k]; //ring buffer of the most recent k base codes
+     if(verbose){System.err.println("k="+k+", kbits="+kbits+", indexbits="+indexbits+", cells="+cells+", cbits="+cbits);}
+     if(verbose){System.err.println("xorShift="+xorShift+", rotMasks[3]="+Long.toHexString(rotMasks[3]));}
+     final KCountArray2 count=new KCountArray2(cells, cbits);
+     load=0; actualCollisions=0; expectedCollisions=0; probNewKeyCollisions=0; //all four describe the new, empty table, so reset them together
+     invCells=1d/cells;
+     invKmerSpace=Math.pow(0.5, 2*k);
+     if(cells>=Math.pow(4, k)){invCells=0;}
+
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+         if(verbose){System.err.println("Started cris");}
+         cris.start(); //4567
+     }
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     long kmer=0; //current kmer
+     int len=0; //distance since last contig start or ambiguous base
+
+     {
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert(paired==(r.mate!=null));
+         }
+
+         while(reads!=null && reads.size()>0){
+             //System.err.println("reads.size()="+reads.size());
+             for(Read r : reads){
+                 readsProcessed++;
+
+                 len=0;
+                 kmer=0;
+                 byte[] bases=r.bases;
+                 //Quality scores are not consulted by this counter.
+
+                 for(int i=0; i<bases.length; i++){
+                     byte b=bases[i];
+                     int x=AminoAcid.baseToNumber[b];
+                     int x2=buffer[len%buffer.length]; //code written k steps earlier; only used once the window is full
+                     buffer[len%buffer.length]=x;
+                     if(x<0){
+                         len=0;
+                         kmer=0;
+                     }else{
+                         kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
+                         len++;
+                         if(len>=k){
+                             keysCounted++;
+                             if(len>k){kmer=kmer^rotMasks[x2];} //cancel the contribution of the base that left the window
+                             long hashcode=kmer&0x7fffffffffffffffL;
+//                             hashcode=randy.nextLong()&~((-1L)<<(2*k));
+                             long code1=hashcode%(cells-3);
+//                             long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
+                             int value=count.increment2(code1, 1); //presumably the cell's value before the increment - confirm in KCountArray2
+
+                             double probCollision=load*invCells;
+//                             expectedCollisions+=probCollision;
+                             expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace);
+                             if(value==0){load++;}
+                             else{
+                                 actualCollisions++;
+                                 double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions));
+                                 double estKeys=load+probNewKeyCollisions;
+                                 double probOldKey=estKeys*invKmerSpace;
+                                 probNewKeyCollisions+=probNewKey*(1-probOldKey);
+
+//                                 double estKmers=load+min(actualCollisions, expectedCollisions);
+//                                 double probOldKmer=estKmers*invKmerSpace;
+//                                 probNewKeyCollisions+=(prob*(1-prob2));
+                             }
+
+////                             probCollisions+=(load*invCells);
+//                             if(value==0){load++;}
+//                             else{
+////                                 long load2=keysCounted-load;
+//                                 double prob=Math.sqrt(load*invCells);
+//                                 double estKmers=load+probNewKeyCollisions;
+//                                 double prob2=estKmers*invKmerSpace;
+////                                 probCollisions+=(prob*(1-prob2));
+////                                 probCollisions+=Math.sqrt(prob*(1-prob2));
+//                                 probNewKeyCollisions+=Math.sqrt(prob*(1-prob2));
+////                                 probCollisions+=min(prob, 1-prob2);
+////                                 probCollisions+=(load*invCells);
+//                             }
+                         }
+                     }
+                 }
+
+
+                 if(r.mate!=null){
+                     len=0;
+                     kmer=0;
+                     bases=r.mate.bases;
+                     //Same rolling-hash loop and estimator updates as above, applied to the mate.
+                     for(int i=0; i<bases.length; i++){
+                         byte b=bases[i];
+                         int x=AminoAcid.baseToNumber[b];
+                         int x2=buffer[len%buffer.length];
+                         buffer[len%buffer.length]=x;
+                         if(x<0){
+                             len=0;
+                             kmer=0;
+                         }else{
+                             kmer=(Long.rotateLeft(kmer,ROTATE_DIST)^x);
+                             len++;
+                             if(len>=k){
+                                 keysCounted++;
+                                 if(len>k){kmer=kmer^rotMasks[x2];}
+                                 long hashcode=kmer&0x7fffffffffffffffL;
+//                                 hashcode=randy.nextLong()&~((-1L)<<(2*k));
+                                 long code1=hashcode%(cells-3);
+//                                 long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5);
+                                 int value=count.increment2(code1, 1);
+
+                                 double probCollision=load*invCells;
+//                                 expectedCollisions+=probCollision;
+                                 expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace);
+                                 if(value==0){load++;}
+                                 else{
+                                     actualCollisions++;
+                                     double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions));
+                                     double estKeys=load+probNewKeyCollisions;
+                                     double probOldKey=estKeys*invKmerSpace;
+                                     probNewKeyCollisions+=probNewKey*(1-probOldKey);
+
+//                                     double estKmers=load+min(actualCollisions, expectedCollisions);
+//                                     double probOldKmer=estKmers*invKmerSpace;
+//                                     probNewKeyCollisions+=(prob*(1-prob2));
+                                 }
+
+////                                 probCollisions+=(load*invCells);
+//                                 if(value==0){load++;}
+//                                 else{
+////                                     long load2=keysCounted-load;
+//                                     double prob=Math.sqrt(load*invCells);
+//                                     double estKmers=load+probNewKeyCollisions;
+//                                     double prob2=estKmers*invKmerSpace;
+////                                     probCollisions+=(prob*(1-prob2));
+////                                     probCollisions+=Math.sqrt(prob*(1-prob2));
+//                                     probNewKeyCollisions+=Math.sqrt(prob*(1-prob2));
+////                                     probCollisions+=min(prob, 1-prob2);
+////                                     probCollisions+=(load*invCells);
+//                                 }
+                             }
+                         }
+                     }
+                 }
+
+             }
+             //System.err.println("returning list");
+             cris.returnList(ln.id, ln.list.isEmpty());
+             //System.err.println("fetching list");
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         System.err.println("Finished reading");
+         if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} //the loop also ends when nextList() returned null
+         System.err.println("Returned list");
+         ReadWrite.closeStream(cris);
+         System.err.println("Closed stream");
+         System.err.println("Processed "+readsProcessed+" reads.");
+     }
+
+     return count;
+ }
+
+ public static final long[] makeRotMasks(int rotDist){ //Returns the values 0..3, each rotated left by rotDist bits.
+     final long[] masks=new long[4];
+     for(int code=0; code<masks.length; code++){
+         masks[code]=Long.rotateLeft((long)code, rotDist);
+     }
+     return masks;
+ }
+
+ public static long[] transformToFrequency(int[] count){ //Histogram of the values in count, 2000 bins.
+     final long[] histogram=new long[2000];
+     final int lastBin=histogram.length-1;
+     for(final int value : count){
+         //Clamp so that values of lastBin or more all land in the final bin.
+         final int bin=min(value, lastBin);
+         histogram[bin]++;
+     }
+     return histogram;
+ }
+
+ public static long sum(int[] array){ //Sum of all elements, accumulated in a long.
+     long total=0;
+     for(int i=0; i<array.length; i++){total+=array[i];}
+     return total;
+ }
+
+ public static long sum(long[] array){ //Sum of all elements; wraps around on long overflow.
+     long total=0;
+     for(int i=0; i<array.length; i++){total+=array[i];}
+     return total;
+ }
+
+ public static final int min(int x, int y){return Math.min(x, y);} //smaller of the two ints
+ public static final int max(int x, int y){return Math.max(x, y);} //larger of the two ints
+ public static final long min(long x, long y){return Math.min(x, y);} //smaller of the two longs
+ public static final long max(long x, long y){return Math.max(x, y);} //larger of the two longs
+ public static final double min(double x, double y){return !(x<y) ? y : x;} //plain comparison: yields y whenever x<y is false, including for NaN
+ public static final double max(double x, double y){return !(x>y) ? y : x;} //plain comparison: yields y whenever x>y is false, including for NaN
+
+ public static boolean verbose=true; //when true, countFastq prints its parameters and stream status to stderr
+ public static byte minQuality=-5; //not referenced anywhere in this class
+ public static long readsProcessed=0; //incremented by countFastq once per Read in each list (mates are not counted separately)
+ public static long maxReads=10000000L; //passed to ConcurrentReadInputStream.getReadInputStream in countFastq
+ public static final int ROTATE_DIST=2; //bits the rolling hash is rotated left per base
+
+ /** Non-empty cells in hash table */
+ public static long load;
+ /** Number of expected collisions */
+ public static double expectedCollisions;
+ /** Number of actual collisions (possibly by same value) */
+ public static long actualCollisions;
+ /** Number of probable collisions caused by new keys */
+ public static double probNewKeyCollisions;
+ /** Inverse of hash table size (set to 0 by countFastq when cells>=4^k) */
+ public static double invCells;
+ /** Inverse of number of potential kmers */
+ public static double invKmerSpace;
+ /** Number of kmers (keys) added to the table; incremented once per full-length kmer in countFastq */
+ public static long keysCounted;
+
+ public static final Random randy=new Random(1); //fixed seed; only referenced from commented-out code in this class
+
+}
diff --git a/current/bloom/TestLargeKmer.java b/current/bloom/TestLargeKmer.java
new file mode 100755
index 0000000..0d4fb15
--- /dev/null
+++ b/current/bloom/TestLargeKmer.java
@@ -0,0 +1,192 @@
+package bloom;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.FastqReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 5, 2012
+ *
+ */
+public class TestLargeKmer {
+
+ public static void main(String args[]){
+     //Arguments: reads1 [reads2] k cbits k2. Counts k-mers, runs countK2 on the same input, and prints the resulting array.
+     final Timer timer=new Timer();
+     final int n=args.length;
+     final String in1=args[0];
+     String in2=null;
+     if(n>4 || args[1].contains(".")){in2=args[1];}
+     final int k=Integer.parseInt(args[n-3]);
+     final int cbits=Integer.parseInt(args[n-2]);
+     final int k2=Integer.parseInt(args[n-1]);
+
+     final KCountArray2 counts=KmerCount3.countFastq(in1, in2, k, cbits);
+     final long[] bounds=countK2(in1, in2, k, counts, k2);
+     timer.stop();
+     System.out.println("Finished counting; time = "+timer+"\n");
+
+     int row=0;
+     for(long bound : bounds){System.out.println(row+":\t"+bound); row++;}
+ }
+
+ public static long[] countK2(String fname1, String fname2, int k, int cbits, int k2){
+     //Convenience overload: counts k-mers with KmerCount3.countFastq first, then delegates to the overload that takes the counts.
+     return countK2(fname1, fname2, k, KmerCount3.countFastq(fname1, fname2, k, cbits), k2);
+ }
+
+ /** Returns a histogram: entry c counts the positions with at least k2 consecutive usable bases where c is the smallest stored k-mer count (capped at BOUND_LEN-1) in the trailing window of k2-k+1 positions. */
+ public static long[] countK2(String fname1, String fname2, int k, KCountArray2 counts1, int k2){
+     assert(k>=1 && k<20);
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits)); //ones in the low 2*k bits
+     FastqReadInputStream fris1=new FastqReadInputStream(fname1, false);
+     FastqReadInputStream fris2=(fname2==null ? null : new FastqReadInputStream(fname2, false));
+     ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, KmerCount3.maxReads);
+
+     cris.start();
+     System.err.println("Started cris");
+     boolean paired=cris.paired();
+
+     long kmer=0; //current kmer
+     int len=0; //distance since last contig start or ambiguous base
+
+     final long[] upperBound=new long[BOUND_LEN]; //Lowest upper bound provable of kmer count
+     final int[] ring=new int[k2-k+1]; //the k-mer counts of the last k2-k+1 positions
+     final int[] subcount=new int[BOUND_LEN]; //how many entries of ring currently hold each value
+     final int maxValue=subcount.length-1;
+
+     {
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert(paired==(r.mate!=null));
+         }
+
+         while(reads!=null && reads.size()>0){
+             //System.err.println("reads.size()="+reads.size());
+             for(Read r : reads){
+
+                 len=0;
+                 kmer=0;
+                 Arrays.fill(subcount, 0);
+
+                 byte[] bases=r.bases;
+                 byte[] quals=r.quality;
+                 for(int i=0; i<bases.length; i++){
+                     byte b=bases[i];
+                     int x=AminoAcid.baseToNumber[b];
+
+                     int ringpos=i%ring.length;
+                     int old=ring[ringpos]; //value that is about to fall out of the window
+                     int value=0;
+
+                     if(x<0 || quals[i]<KmerCount3.minQuality){
+                         len=0;
+                         kmer=0;
+                     }else{
+                         kmer=((kmer<<2)|x)&mask;
+                         len++;
+
+                         if(len>=k){
+                             value=counts1.read(kmer);
+                         }
+                     }
+                     value=min(value, maxValue);
+
+                     ring[ringpos]=value;
+                     subcount[value]++;
+
+                     if(i>=ring.length){ //the window is full, so the overwritten entry must be un-tallied
+                         subcount[old]--;
+                     }
+
+                     if(len>=k2){
+                         int sub=0;
+                         while(sub<subcount.length && subcount[sub]==0){sub++;} //smallest value present in the window
+                         assert(sub<subcount.length);
+                         upperBound[sub]++;
+                     }
+
+                 }
+
+                 if(r.mate!=null){
+                     bases=r.mate.bases;
+                     quals=r.mate.quality;
+
+                     len=0;
+                     kmer=0;
+                     Arrays.fill(subcount, 0);
+                     for(int i=0; i<bases.length; i++){
+                         byte b=bases[i];
+                         int x=AminoAcid.baseToNumber[b];
+
+                         int ringpos=i%ring.length;
+                         int old=ring[ringpos];
+                         int value=0;
+
+                         if(x<0 || quals[i]<KmerCount3.minQuality){
+                             len=0;
+                             kmer=0;
+                         }else{
+                             kmer=((kmer<<2)|x)&mask;
+                             len++;
+
+                             if(len>=k){
+                                 value=counts1.read(kmer);
+                             }
+                         }
+                         value=min(value, maxValue);
+
+                         ring[ringpos]=value;
+                         subcount[value]++;
+
+                         if(i>=ring.length){
+                             subcount[old]--;
+                         }
+
+                         if(len>=k2){
+                             int sub=0;
+                             while(sub<subcount.length && subcount[sub]==0){sub++;}
+                             assert(sub<subcount.length);
+                             upperBound[sub]++;
+                         }
+
+                     }
+                 }
+
+             }
+             //System.err.println("returning list");
+             cris.returnList(ln.id, ln.list.isEmpty());
+             //System.err.println("fetching list");
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         System.err.println("Finished reading");
+         if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} //the loop also ends when nextList() returned null
+         System.err.println("Returned list");
+         ReadWrite.closeStreams(cris);
+         System.err.println("Closed stream");
+     }
+
+     return upperBound;
+ }
+
+ public static final int min(int x, int y){return Math.min(x, y);} //smaller of the two arguments
+ public static final int max(int x, int y){return Math.max(x, y);} //larger of the two arguments
+
+ public static final int BOUND_LEN=256; //length of the upperBound and subcount arrays in countK2; counts are capped at BOUND_LEN-1
+
+}
diff --git a/current/clump/Clump.java b/current/clump/Clump.java
new file mode 100755
index 0000000..4e99021
--- /dev/null
+++ b/current/clump/Clump.java
@@ -0,0 +1,124 @@
+package clump;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import dna.AminoAcid;
+
+import align2.Tools;
+
+import stream.Read;
+
+/**
+ * A list of reads sharing a kmer.
+ * @author Brian Bushnell
+ * @date Nov 7, 2015
+ *
+ */
+public class Clump extends ArrayList<Read> {
+
+ public Clump(long kmer_){
+ this(kmer_, 8); //delegates with an initial capacity of 8
+ }
+
+ public Clump(long kmer_, int size){ //size is handed to the ArrayList constructor as the initial capacity
+     super(size);
+     this.kmer=kmer_;
+ }
+
+ public boolean add(Read r){ //Appends r; with assertions enabled, first checks that r.obj[0] equals this clump's kmer.
+     final long[] kmerInfo=(long[]) r.obj;
+     assert(kmerInfo[0]==kmer);
+     return super.add(r);
+ }
+
+ /**
+  * Builds quality-weighted base tallies for every column of the clump.
+  * Reads are lined up so that position obj[1] of each read falls in the same column; each base with a
+  * non-negative code adds its quality score (20 if the read has no qualities) to counts[code][column].
+  * @return A 4-row array, one row per base code, as wide as the aligned reads taken together
+  */
+ public int[][] baseCounts(){
+     //Pass 1: find the widest extent to the left and to the right of the alignment position.
+     int leftSpan=-1, rightSpan=-1;
+     for(final Read read : this){
+         final int pivot=(int)((long[]) read.obj)[1];
+         leftSpan=Tools.max(leftSpan, pivot);
+         rightSpan=Tools.max(rightSpan, read.length()-pivot);
+     }
+     final int width=leftSpan+rightSpan;
+
+     //Pass 2: add each read's bases into the shared coordinate system.
+     final int[][] counts=new int[4][width];
+     for(final Read read : this){
+         final int pivot=(int)((long[]) read.obj)[1];
+         final byte[] bases=read.bases;
+         final byte[] quals=read.quality;
+         int column=leftSpan-pivot; //column of this read's first base
+         for(int i=0; i<bases.length; i++){
+             final int code=AminoAcid.baseToNumber[bases[i]];
+             if(code>=0){
+                 final int weight=(quals==null ? 20 : quals[i]);
+                 counts[code][column]+=weight;
+             }
+             column++;
+         }
+     }
+     //counts[code][column] now holds the summed weight of that base in that column.
+
+     return counts;
+ }
+
+
+ public ArrayList<Read> condense(){
+     //TODO - this needs to be expanded. Consensus is not good enough.
+     //For now the whole clump is reduced to a single consensus read.
+     final ArrayList<Read> condensed=new ArrayList<Read>();
+     condensed.add(consensus());
+     return condensed;
+ }
+
+ /** Builds one consensus read: per column the highest-weighted base (or N with quality 0), with a quality taken from the winner's margin after passing it through Tools.mid(0, q, 50). */
+ public Read consensus(){//TODO: Return single read if only 1.
+     final int[][] counts=baseCounts();
+     final int width=counts[0].length;
+     final byte[] bases=new byte[width];
+     final byte[] quals=new byte[width];
+     for(int col=0; col<width; col++){
+         final int best=getConsensus(counts, col);
+         if(best<0){//No base has positive weight in this column.
+             bases[col]='N';
+             quals[col]=0;
+             continue;
+         }
+         //Equals the winner's weight minus the combined weight of the other three bases.
+         final long q=2*counts[best][col]-counts[0][col]-counts[1][col]-counts[2][col]-counts[3][col];
+         bases[col]=AminoAcid.numberToBase[best];
+         quals[col]=(byte)Tools.mid(0, q, 50);
+     }
+     //The consensus takes the id field of the first read in the clump.
+     final Read first=get(0);
+     final Read consensusRead=new Read(bases, quals, 0, first.id);
+     //TODO: Attach the long pair, and make sure the kmer location is correct.
+     return consensusRead;
+ }
+
+ public int getConsensus(int[][] counts, int pos){
+ int xMax=0;
+ for(int x=1; x<4; x++){
+// System.err.println("x="+x+", max="+max+", Checking "+counts[x][pos]+" vs "+counts[x][max]);
+ if(counts[x][pos]>counts[xMax][pos]){xMax=x;}
+ }
+// assert(counts[max][pos]>=counts[0][pos]);
+// assert(counts[max][pos]>=counts[1][pos]);
+// assert(counts[max][pos]>=counts[2][pos]) : max+", "+counts[max][pos]+", ["+counts[0][pos]+", "+counts[1][pos]+", "+counts[2][pos]+", "+counts[3][pos]+"]";
+// assert(counts[max][pos]>=counts[3][pos]);
+ return (counts[xMax][pos]>0 ? xMax : -1);
+ }
+
+ public final long kmer; //the kmer of this clump; add() asserts that r.obj[0] of every added read equals it
+
+ public static int k=31; //not used by any live code in this class (only by a commented-out assertion)
+ private static final long serialVersionUID = 1L;
+
+}
diff --git a/current/clump/ClumpList.java b/current/clump/ClumpList.java
new file mode 100755
index 0000000..2544a45
--- /dev/null
+++ b/current/clump/ClumpList.java
@@ -0,0 +1,114 @@
+package clump;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import align2.Shared;
+
+import stream.Read;
+
+/**
+ * A list of clumps, meaning a list of lists of reads.
+ * Allows adding reads by streaming and generating new clumps as needed.
+ * The input reads must be correctly ordered.
+ * @author Brian Bushnell
+ * @date Nov 9, 2015
+ *
+ */
+public class ClumpList extends ArrayList<Clump> {
+
+ public ClumpList(){} //creates an empty list
+
+ public ClumpList(ArrayList<Read> list){ //builds the clumps immediately by passing list to addReads
+ addReads(list);
+ }
+
+ /** Appends reads in order, starting a new Clump whenever a read's kmer (obj[0]) differs from that of the previous read. */
+ public void addReads(ArrayList<Read> list){
+     assert(list.getClass()!=Clump.class) : list.getClass();
+     for(final Read r : list){
+         final long kmer=((long[])r.obj)[0];
+         if(currentClump==null || kmer!=currentKmer){ //null check: a first kmer equal to the Long.MIN_VALUE sentinel must still open a clump
+             currentKmer=kmer;
+             currentClump=new Clump(kmer);
+             add(currentClump);
+         }
+         currentClump.add(r);
+     }
+ }
+
+ public ArrayList<Read> condense(){
+     //Uses the thread count reported by Shared.threads().
+     return condense(Shared.threads());
+ }
+
+ /** Runs the given number of CondenseThreads over this list, waits for all of them, and returns their stored reads concatenated in thread order. */
+ public ArrayList<Read> condense(final int threads){
+     final ArrayList<CondenseThread> workers=new ArrayList<CondenseThread>(threads);
+     for(int i=0; i<threads; i++){workers.add(new CondenseThread());}
+
+     if(verbose){outstream.println("Starting condense threads.");}
+     for(CondenseThread worker : workers){worker.start();}
+
+     if(verbose){outstream.println("Waiting for threads.");}
+     long readsThisPass=0;
+     for(CondenseThread worker : workers){
+         //join() is retried after an interruption, until the thread has really terminated.
+         while(worker.getState()!=Thread.State.TERMINATED){
+             try {
+                 worker.join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         readsThisPass+=worker.storage.size();
+     }
+
+     if(verbose){outstream.println("Gathering reads.");}
+     final ArrayList<Read> gathered=new ArrayList<Read>((int)readsThisPass);
+     for(int i=0; i<threads; i++){
+         //Each slot is nulled as it is drained (presumably to release the worker early).
+         final CondenseThread worker=workers.set(i, null);
+         gathered.addAll(worker.storage);
+     }
+
+     assert(gathered.size()==readsThisPass);
+     return gathered;
+ }
+
+ @Override
+ public void clear(){ //Empties the list and resets the state used by addReads (current clump/kmer) and by the condense threads (ptr).
+     ptr.set(0);
+     currentKmer=Long.MIN_VALUE;
+     currentClump=null;
+     super.clear();
+ }
+
+ private class CondenseThread extends Thread{
+
+ @Override
+ public void run(){
+ final int size=size();
+ for(int i=ptr.getAndIncrement(); i<size; i=ptr.getAndIncrement()){
+ Clump c=get(i);
+ Read r=c.consensus();
+ storage.add(r);
+ c.clear();
+ set(i, null);
+ }
+ }
+
+ private ArrayList<Read> storage=new ArrayList<Read>();
+
+ }
+
+ private Clump currentClump=null; //clump that addReads is currently appending to
+ private long currentKmer=Long.MIN_VALUE; //kmer of currentClump; Long.MIN_VALUE is the initial value and the one restored by clear()
+ private final AtomicInteger ptr=new AtomicInteger(0); //next clump index to be claimed by a CondenseThread
+
+ private static final long serialVersionUID = 1L;
+ private static boolean verbose=false; //enables the progress messages in condense(int)
+ private static final PrintStream outstream=System.err;
+
+}
diff --git a/current/clump/ClumpTools.java b/current/clump/ClumpTools.java
new file mode 100755
index 0000000..29b6bb8
--- /dev/null
+++ b/current/clump/ClumpTools.java
@@ -0,0 +1,14 @@
+package clump;
+
+import kmer.KmerTableSet;
+
+/**
+ * Holder for a single static KmerTableSet reference, which is initially null.
+ * @author Brian Bushnell
+ * @date Nov 12, 2015
+ */
+public class ClumpTools {
+
+ public static KmerTableSet table=null;
+
+}
diff --git a/current/clump/Clumpify.java b/current/clump/Clumpify.java
new file mode 100755
index 0000000..1d2ae1e
--- /dev/null
+++ b/current/clump/Clumpify.java
@@ -0,0 +1,141 @@
+package clump;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 6, 2015
+ *
+ */
+public class Clumpify {
+
+ /**
+  * Command-line entry point: builds a Clumpify from the arguments and runs it under a timer.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     new Clumpify(args).process(timer);
+     //process() stops the timer and prints the total time.
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public Clumpify(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ args2=new ArrayList<String>();
+ args2.add("in");
+ args2.add("out");
+ args2.add("groups");
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("out") || a.equals("out1")){
+ out1=b;
+ }else if(a.equals("groups") || a.equals("g") || a.equals("sets")){
+ groups=Integer.parseInt(b);
+ }else if(a.equals("delete")){
+ delete=Tools.parseBoolean(b);
+ }else if(a.equals("usetmpdir")){
+ useTmpdir=Tools.parseBoolean(b);
+ }else{
+ args2.add(arg);
+ }
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Unimplemented stub that always throws RuntimeException("TODO"); the constructor calls it when Parser.parseHelp(args, true) returns true. */
+ private void printOptions(){
+ throw new RuntimeException("TODO");
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Runs the pipeline. With a single group KmerSort is invoked directly on the input; otherwise KmerSplit
+  * is run with out= set to a temp name containing '%', KmerSort is then run with in= set to that name,
+  * and one file per group ('%' replaced by the group index) is deleted if delete is set. Finally stops t and prints the total time.
+  */
+ void process(Timer t){
+     final String[] args=args2.toArray(new String[0]);
+     args[0]="in="+in1; //the first stage always reads the original input
+     args[2]="groups="+groups;
+     if(groups==1){
+         args[1]="out="+out1;
+         KmerSort.main(args);
+     }else{
+         final Random randy=new Random();
+         final String core=ReadWrite.stripToCore(out1);
+         final String path=ReadWrite.getPath(out1);
+         final String extension=ReadWrite.getExtension(out1);
+         //A random hex tag is embedded in the temp name (presumably to avoid clashes between runs).
+         final String suffix="_temp%_"+Long.toHexString((randy.nextLong()&Long.MAX_VALUE))+extension;
+         final String temp;
+         if(useTmpdir && Shared.TMPDIR!=null){temp=Shared.TMPDIR+core+suffix;}
+         else{temp=path+core+suffix;}
+         args[1]="out="+temp;
+         KmerSplit.main(args);
+
+         args[0]="in="+temp;
+         args[1]="out="+out1;
+         KmerSort.main(args);
+
+         if(delete){
+             for(int group=0; group<groups; group++){new File(temp.replaceFirst("%", ""+group)).delete();}
+         }
+     }
+     t.stop();
+     System.err.println("Total time: \t"+t);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int groups=16; //number of groups; 1 makes process() call KmerSort directly without KmerSplit
+ private boolean useTmpdir=false; //if true and Shared.TMPDIR is non-null, the temp name is prefixed with Shared.TMPDIR instead of the output path
+ private boolean delete=true; //delete the temp files after KmerSort has run
+
+ private String in1=null; //value of the in/in1 argument
+ private String out1=null; //value of the out/out1 argument
+
+ ArrayList<String> args2=new ArrayList<String>(); //arguments forwarded to KmerSplit/KmerSort; slots 0-2 are rewritten by process()
+ private PrintStream outstream=System.err;
+
+}
diff --git a/current/clump/Condensor.java b/current/clump/Condensor.java
new file mode 100755
index 0000000..84a1aa2
--- /dev/null
+++ b/current/clump/Condensor.java
@@ -0,0 +1,27 @@
+package clump;
+
+import java.util.ArrayList;
+
+import stream.Read;
+
+/**
+ * Placeholder for grouping reads into clumps and condensing them.
+ * None of its operations are implemented yet; each one throws.
+ * @author Brian Bushnell
+ * @date Nov 7, 2015
+ *
+ */
+public class Condensor {
+
+ /** Creates a Condensor; there is no state to initialize. */
+ public Condensor(){
+
+ }
+
+ /**
+  * Not implemented.
+  * @param list Reads to group
+  * @return Never returns normally
+  * @throws UnsupportedOperationException always (a RuntimeException, as before, but now self-describing)
+  */
+ public ArrayList<Clump> makeClumps(ArrayList<Read> list){
+     throw new UnsupportedOperationException("Condensor.makeClumps is not implemented.");
+ }
+
+ /**
+  * Not implemented.
+  * @param list Reads to condense
+  * @return Never returns normally
+  * @throws UnsupportedOperationException always (a RuntimeException, as before, but now self-describing)
+  */
+ public ArrayList<Read> condense(ArrayList<Read> list){
+     throw new UnsupportedOperationException("Condensor.condense is not implemented.");
+ }
+
+}
diff --git a/current/clump/KmerComparator.java b/current/clump/KmerComparator.java
new file mode 100755
index 0000000..c93acb9
--- /dev/null
+++ b/current/clump/KmerComparator.java
@@ -0,0 +1,279 @@
+package clump;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import jgi.Dedupe;
+import align2.Shared;
+import align2.Tools;
+
+import kmer.KmerTableSet;
+import kmer.Primes;
+
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 4, 2015
+ *
+ */
+public class KmerComparator implements Comparator<Read> {
+
+ public KmerComparator(int k_, long minDivisor_){
+ k=k_;
+ assert(k>0 && k<32);
+
+ shift=2*k;
+ shift2=shift-2;
+ mask=~((-1L)<<shift);
+ divisor=Primes.primeAtLeast(minDivisor_);
+ }
+
+ /**
+  * Hashes all reads of the list using Shared.threads() worker threads and
+  * returns once every worker has terminated.
+  * @param list Reads to hash; worker i handles indices i, i+threads, i+2*threads, ...
+  */
+ public void hashThreaded(ArrayList<Read> list){
+     final int threads=Shared.threads();
+     final ArrayList<HashThread> workers=new ArrayList<HashThread>(threads);
+     for(int tid=0; tid<threads; tid++){
+         final HashThread worker=new HashThread(tid, threads, list);
+         workers.add(worker);
+     }
+     for(int i=0; i<workers.size(); i++){workers.get(i).start();}
+
+     //Block on each worker in turn; an interrupt is reported and the wait resumes
+     for(final HashThread worker : workers){
+         for(;;){
+             if(worker.getState()==Thread.State.TERMINATED){break;}
+             try{
+                 worker.join();
+             }catch(InterruptedException e){
+                 e.printStackTrace();
+             }
+         }
+     }
+ }
+
+ /** Hashes every read of the list with the given table and count cutoff. */
+ public void hash(ArrayList<Read> list, KmerTableSet table, int minCount) {
+     for(int i=0, lim=list.size(); i<lim; i++){
+         hash(list.get(i), table, minCount);
+     }
+ }
+
+ /** Hashes every read of the list without a table. */
+ private void hash(ArrayList<Read> list){
+     for(int i=0, lim=list.size(); i<lim; i++){
+         hash(list.get(i));
+     }
+ }
+
+ /**
+  * Attaches a fresh 2-element array to r1.obj and fills it via fillLocalMax.
+  * @return The value returned by fillLocalMax
+  */
+ public long hash(Read r1, KmerTableSet table, int minCount){
+     final long[] pivot=new long[2];
+     r1.obj=pivot;
+     final long key=fillLocalMax(r1, pivot, table, minCount);
+     return key;
+ }
+
+ /** Same as hash(r1, table, minCount) but with no table and no count cutoff. */
+ private long hash(Read r1){
+     final long[] pivot=new long[2];
+     r1.obj=pivot;
+     final long key=fillLocalMax(r1, pivot, null, 0);
+     return key;
+ }
+
+ /**
+  * Detaches the mate of r1 and builds the concatenation r1 + 'N' + mate.
+  * Does nothing if r1 has no mate.
+  * The second copy loop previously never advanced its destination index, so every
+  * mate base was written to the same slot; both halves are now copied in full.
+  * NOTE(review): the fused array is still not stored anywhere (as in the original),
+  * so the only visible effect is r1.mate being cleared. Confirm what the fused
+  * bases were meant to be assigned to before relying on this method.
+  * @param r1 Read whose mate should be fused onto it
+  */
+ public void fuse(Read r1){
+     final Read r2=r1.mate;
+     if(r2==null){return;}
+     r1.mate=null;
+     final int len1=r1.length(), len2=r2.length();
+     final byte[] bases=new byte[len1+len2+1];
+     //Guards keep the zero-length case from touching a possibly null array, like the old loops
+     if(len1>0){System.arraycopy(r1.bases, 0, bases, 0, len1);}
+     bases[len1]='N'; //separator between the two reads
+     if(len2>0){System.arraycopy(r2.bases, 0, bases, len1+1, len2);}
+ }
+
+ /**
+  * Orders two reads by their pivot arrays. A read whose obj field is null gets
+  * its array computed with fillLocalMax and cached in obj first.
+  * Previously the array computed for b was discarded and a's array was cached on
+  * b instead, so every later comparison involving b used a's values.
+  * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+  */
+ @Override
+ public int compare(Read a, Read b) {
+     return compare(pivotOf(a), pivotOf(b));
+ }
+
+ /** Returns the array cached in r.obj, computing and caching it on first use. */
+ private long[] pivotOf(Read r){
+     if(r.obj!=null){return (long[])r.obj;}
+     final long[] kmers=new long[2];
+     r.obj=kmers;
+     fillLocalMax(r, kmers);
+     return kmers;
+ }
+
+
+ /** Finds the global maximum; shorthand for fillMax(r, kmers, null, 0), i.e. without count filtering. */
+ private long fillMax(Read r, long[] kmers){
+ return fillMax(r, kmers, null, 0);
+ }
+
+ /** Finds the global maximum: among all kmers of the read, selects the one whose larger
+ * encoding (forward vs. reverse-complement, via Tools.max) has the greatest remainder
+ * modulo divisor. That encoding is stored in kmers[0], the index of the kmer's last base
+ * in kmers[1], and kmers[0] is returned. Returns -1 if the read has no bases or fewer than k.
+ * When minCount is at least 2, candidates must also have table.getCount(...) of at least
+ * minCount; if none qualifies, the search is repeated without the filter. */
+ public long fillMax(Read r, long[] kmers, KmerTableSet table, int minCount){
+// Arrays.fill(kmers, -1);
+ kmers[0]=0;
+ kmers[1]=k-1; //Defaults that stay in place if no kmer is selected
+ final byte[] bases=r.bases;
+ long kmer=0; //Rolling forward kmer
+ long rkmer=0; //Rolling reverse-complement kmer
+ int len=0; //Bases seen since the last 'N'
+
+ if(bases==null || bases.length<k){return -1;}
+
+ long topMod=-1; //Best remainder so far; -1 means nothing selected yet
+ boolean rcomp=false; //Whether the selected encoding was the reverse-complement one
+
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;}
+ if(len>=k){ //A full kmer without 'N' ends at index i
+ final long kmax=Tools.max(kmer, rkmer);
+ final long mod=kmax%divisor;
+ if(mod>topMod){
+ if(minCount<2 || table.getCount(kmer, rkmer)>=minCount){ //table is only dereferenced when minCount>=2
+ topMod=mod;
+ kmers[0]=kmax;
+ kmers[1]=i;
+ rcomp=(kmax!=kmer);
+ }
+ }
+ }
+ }
+ rcomp&=rcompReads; //Reads are only flipped when rcompReads is enabled
+
+ if(topMod<0 && minCount>1){ //Nothing passed the count filter; retry unfiltered
+ return fillMax(r, kmers, null, 0);
+ }
+
+// r.id+=" "+kmers[1]+","+rcomp+","+(bases.length-kmers[1]+k-2);
+ if(rcomp){
+ r.reverseComplement();
+ r.setSwapped(true);
+ kmers[1]=bases.length-kmers[1]+k-2; //Remap the position to the flipped read
+ }
+ if(addName){r.id+=" "+kmers[1]+(rcomp ? ",t" : ",f")+","+kmers[0];} //Appends position, flip flag and kmer to the read id
+ assert(kmers[0]>=0 && kmers[1]>=0) : Arrays.toString(kmers)+"\n"+r;
+ return kmers[0];
+ }
+
+ /** Finds the highest local maximum, without a table or count cutoff. */
+ private long fillLocalMax(Read r, long[] kmers){
+     return fillLocalMax(r, kmers, null, 0);
+ }
+
+ /**
+  * Selects the pivot kmer of a read and stores it in kmers[0], with its position in kmers[1].
+  * When LOCAL_MAX is false this delegates to fillMax. Otherwise it looks for the kmer
+  * whose remainder modulo divisor is a local maximum (greater than both neighbours) and
+  * is the highest such maximum, falling back to fillMax when there is none.
+  * The LOCAL_MAX==false path used to call fillMax(r, kmers), which dropped table and
+  * minCount and so silently disabled count filtering; both are now forwarded.
+  * @param r Read to scan; may be reverse-complemented when rcompReads is set
+  * @param kmers Output array of length at least 2
+  * @param table Kmer counts used for filtering; may be null, in which case no filtering is done
+  * @param minCount Minimum count a pivot must have; values below 2 disable filtering
+  * @return kmers[0], or -1 if the read has no bases or fewer than k
+  */
+ public long fillLocalMax(Read r, long[] kmers, KmerTableSet table, int minCount){
+     //Counts cannot be consulted without a table, so treat that case as unfiltered
+     //rather than dereferencing null further down.
+     if(table==null){minCount=0;}
+     if(!LOCAL_MAX){return fillMax(r, kmers, table, minCount);}
+     Arrays.fill(kmers, -1);//TODO: Note! 0, 0 can be detected and allowed to fall through.
+     final byte[] bases=r.bases;
+     long kmer=0;
+     long rkmer=0;
+     int len=0;
+
+     if(bases==null || bases.length<k){return -1;}
+
+     long topMod=-1;
+     boolean rcomp=false;
+
+     //State of the previous position (suffix 1) and the one before it (suffix 2)
+     long mod1=-1, mod2=-1;
+     long kmax1=-1;//, kmax2=-1;
+     boolean rcomp1=false;//, rcomp2=false;
+
+     for(int i=0; i<bases.length; i++){
+         byte b=bases[i];
+         long x=Dedupe.baseToNumber[b];
+         long x2=Dedupe.baseToComplementNumber[b];
+         kmer=((kmer<<2)|x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+         if(b=='N'){
+             len=0;
+             mod1=mod2=-1;
+             kmax1=-1;//kmax2=-1;
+         }else{len++;}
+         if(len>=k){
+             final long kmax0=Tools.max(kmer, rkmer);
+             final long mod0=kmax0%divisor;
+             final boolean rcomp0=(kmax0!=kmer);
+             if(len>k+1 && mod1>topMod && mod1>mod2 && mod1>mod0){//Local maximum
+                 //NOTE(review): the count is looked up for the current kmer, while the
+                 //candidate being recorded is the previous one (kmax1); confirm intent.
+                 if(minCount<2 || table.getCount(kmer, rkmer)>=minCount){
+                     topMod=mod1;
+                     kmers[0]=kmax1;
+                     kmers[1]=i-1;
+                     rcomp=(rcomp1);
+                 }
+             }
+             mod2=mod1;
+             mod1=mod0;
+//             kmax2=kmax1;
+             kmax1=kmax0;
+//             rcomp2=rcomp1;
+             rcomp1=rcomp0;
+         }
+     }
+
+     if(topMod<0){//There was no local maximum
+         if(minCount>1){return fillLocalMax(r, kmers, null, 0);}
+         else{return fillMax(r, kmers, table, minCount);}
+     }
+
+     rcomp&=rcompReads;
+//     r.id+=" "+kmers[1]+","+rcomp+","+(bases.length-kmers[1]+k-2);
+     if(rcomp){
+         r.reverseComplement();
+         r.setSwapped(true);
+         kmers[1]=bases.length-kmers[1]+k-2; //Remap the position to the flipped read
+     }
+     if(addName){r.id+=" "+kmers[1]+(rcomp ? ",t" : ",f")+","+kmers[0];}
+     assert(kmers[0]>=0 && kmers[1]>=0) : Arrays.toString(kmers);
+     return kmers[0];
+ }
+
+ /**
+  * Lexicographic comparison of two arrays over the length of alist.
+  * @return -1, 0 or 1 according to the first differing element
+  */
+ private int compare(long[] alist, long[] blist){
+     int idx=0;
+     while(idx<alist.length){
+         final long left=alist[idx], right=blist[idx];
+         if(left>right){return 1;}
+         if(left<right){return -1;}
+         idx++;
+     }
+     return 0;
+ }
+
+ private class HashThread extends Thread{
+
+ HashThread(int id_, int threads_, ArrayList<Read> list_){
+ id=id_;
+ threads=threads_;
+ list=list_;
+ }
+
+ @Override
+ public void run(){
+ for(int i=id; i<list.size(); i+=threads){
+ hash(list.get(i));
+ }
+ }
+
+ final int id;
+ final int threads;
+ final ArrayList<Read> list;
+
+ }
+
+ public final int k; //Kmer length; the constructor asserts 0<k<32
+
+ final int shift; //2*k, the number of bits a kmer occupies
+ final int shift2; //shift-2, the bit offset where a new base enters the reverse kmer
+ final long mask; //Low 2*k bits set; applied to the rolling forward kmer
+
+ public final long divisor; //Value returned by Primes.primeAtLeast(minDivisor_); kmers are ranked by value%divisor
+ public boolean addName=true; //If true, position, flip flag and kmer are appended to each read's id
+ public boolean rcompReads=true; //If true, a read is reverse-complemented when its selected kmer is the reverse-complement encoding
+
+ public static final boolean LOCAL_MAX=false; //Should improve compression, but decreases compression...?
+
+}
diff --git a/current/clump/KmerComparator_original.java b/current/clump/KmerComparator_original.java
new file mode 100755
index 0000000..c0e778f
--- /dev/null
+++ b/current/clump/KmerComparator_original.java
@@ -0,0 +1,129 @@
+package clump;
+
+import java.util.Arrays;
+import java.util.Comparator;
+
+import jgi.Dedupe;
+import align2.Tools;
+
+import kmer.Primes;
+
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 4, 2015
+ *
+ */
+public class KmerComparator_original implements Comparator<Read> {
+
+ public KmerComparator_original(int k_, int comparisons_, long minDivisor_){
+ k=k_;
+ comparisons=comparisons_;
+ assert(k>0 && k<32);
+ assert(comparisons>0 && comparisons<1000);
+
+ shift=2*k;
+ shift2=shift-2;
+ mask=~((-1L)<<shift);
+
+ divisors=new long[comparisons];
+ divisors[0]=Primes.primeAtLeast(minDivisor_);
+ for(int i=1; i<comparisons; i++){
+ divisors[i]=Primes.primeAtLeast(divisors[i-1]+1);
+ }
+ }
+
+ /**
+  * Orders two reads by their kmer arrays.
+  * With useCache set, each read's array is computed once with fill and kept in its obj
+  * field; otherwise both arrays are recomputed into thread-local scratch space.
+  * Previously the cached path stored a's array on b, so every later comparison
+  * involving b used a's values.
+  * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+  */
+ @Override
+ public int compare(Read a, Read b) {
+     final long[] alist, blist;
+     if(useCache){
+         alist=cachedKmers(a);
+         blist=cachedKmers(b);
+     }else{
+         long[][] matrix=local1.get();
+         if(matrix==null){
+             matrix=new long[2][comparisons];
+             local1.set(matrix);
+         }
+         alist=matrix[0];
+         blist=matrix[1];
+         fill(a, alist);
+         fill(b, blist);
+     }
+
+     return compare(alist, blist);
+ }
+
+ /** Returns the array cached in r.obj, computing and caching it on first use. */
+ private long[] cachedKmers(Read r){
+     if(r.obj!=null){return (long[])r.obj;}
+     final long[] kmers=new long[comparisons];
+     r.obj=kmers;
+     fill(r, kmers);
+     return kmers;
+ }
+
+ /**
+  * Fills kmers[j] with the kmer of the read that has the greatest remainder modulo
+  * divisors[j], using the larger of each kmer's forward and reverse-complement encodings.
+  * The output is zeroed first. Without that, a read with no valid kmer (no bases,
+  * shorter than k, or interrupted by 'N') left the previous contents in place, which
+  * are another read's values when the array is reused (useCache==false).
+  * @param r Read to scan
+  * @param kmers Output array with one slot per divisor
+  */
+ public void fill(Read r, long[] kmers){
+     Arrays.fill(kmers, 0);
+     final byte[] bases=r.bases;
+     long kmer=0;
+     long rkmer=0;
+     int len=0;
+
+     if(bases==null || bases.length<k){return;}
+
+     //Best remainder seen so far for each divisor; -1 means none yet
+     long[] mods=local2.get();
+     if(mods==null){
+         mods=new long[comparisons];
+         local2.set(mods);
+     }
+     Arrays.fill(mods, -1);
+
+     for(int i=0; i<bases.length; i++){
+         byte b=bases[i];
+         long x=Dedupe.baseToNumber[b];
+         long x2=Dedupe.baseToComplementNumber[b];
+         kmer=((kmer<<2)|x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+         if(b=='N'){len=0;}else{len++;}
+         if(len>=k){
+             final long kmax=Tools.max(kmer, rkmer);
+             for(int j=0; j<comparisons; j++){
+                 final long div=divisors[j];
+                 final long mod=kmax%div;
+                 if(mod>mods[j]){
+                     mods[j]=mod;
+                     kmers[j]=kmax;
+                 }
+             }
+         }
+     }
+ }
+
+ /**
+  * Lexicographic comparison of the first comparisons elements of two arrays.
+  * @return -1, 0 or 1 according to the first differing element
+  */
+ private int compare(long[] alist, long[] blist){
+     int idx=0;
+     while(idx<comparisons){
+         final long left=alist[idx], right=blist[idx];
+         if(left>right){return 1;}
+         if(left<right){return -1;}
+         idx++;
+     }
+     return 0;
+ }
+
+ public final int k; //Kmer length; the constructor asserts 0<k<32
+
+ final int shift; //2*k, the number of bits a kmer occupies
+ final int shift2; //shift-2, the bit offset where a new base enters the reverse kmer
+ final long mask; //Low 2*k bits set; applied to the rolling forward kmer
+
+ public final int comparisons; //Number of divisors, and of kmers kept per read
+ public final long[] divisors; //divisors[0]=Primes.primeAtLeast(minDivisor_); each next is primeAtLeast(previous+1)
+ public static boolean useCache=true; //If true, compare() stores each read's kmer array in Read.obj instead of recomputing it
+
+ private ThreadLocal<long[][]> local1=new ThreadLocal<long[][]>(); //Per-thread pair of kmer arrays used by compare() when useCache is false
+ private ThreadLocal<long[]> local2=new ThreadLocal<long[]>(); //Per-thread array of best remainders used by fill()
+
+}
diff --git a/current/clump/KmerReduce.java b/current/clump/KmerReduce.java
new file mode 100755
index 0000000..cd6cad9
--- /dev/null
+++ b/current/clump/KmerReduce.java
@@ -0,0 +1,427 @@
+package clump;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+import kmer.KmerTableSet;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import assemble.AbstractRemoveThread;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * Reduces reads to their feature kmer.
+ * @author Brian Bushnell
+ * @date Nov 10, 2015
+ *
+ */
+public class KmerReduce {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Code entrance from the command line.
+  * The global pigz/unpigz flags are captured on entry and restored on the way out,
+  * now also when construction or processing throws.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ;
+     try{
+         Timer t=new Timer();
+         KmerReduce kr=new KmerReduce(args);
+         kr.process(t);
+     }finally{
+         //The constructor overrides these static flags; put them back for in-process callers
+         ReadWrite.USE_PIGZ=pigz;
+         ReadWrite.USE_UNPIGZ=unpigz;
+     }
+ }
+
+ /**
+  * Runs KmerReduce on the reads to write their pivot kmers to a temporary file, then
+  * loads that file with getValidKmers. The temporary file is removed afterwards, now
+  * also when one of the two stages throws.
+  * @param fname0 Input filename of reads
+  * @param k Kmer length
+  * @param cutoff Minimum count to retain
+  * @return Set of pivot kmers
+  */
+ public static KmerTableSet getValidKmersFromReads(final String fname0, int k, int cutoff){
+     final String fname=fname0+"_"+(new Random().nextLong()>>>1)+".fa.gz";
+     final File temp=new File(fname);
+     assert(!temp.exists());
+
+     final String[] args={"in="+fname0, "out="+fname, "k="+k};
+
+     try{
+         main(args);
+         return getValidKmers(fname, k, cutoff);
+     }finally{
+         if(temp.exists()){temp.delete();}
+     }
+ }
+
+ /**
+  * Loads a file of pivot kmers into a KmerTableSet and then runs
+  * AbstractRemoveThread.process on it with the cutoff.
+  * @param fname Input filename of pivot kmers
+  * @param k Kmer length
+  * @param cutoff Minimum count to retain
+  * @return Set of pivot kmers
+  */
+ public static KmerTableSet getValidKmers(final String fname, int k, int cutoff){
+     final boolean usePrefilter=(cutoff>1 && prefilter);
+     final String[] args;
+     if(usePrefilter){
+         args=new String[] {"in="+fname, "k="+k, "prefilter="+(cutoff-1)};
+     }else{
+         args=new String[] {"in="+fname, "k="+k};
+     }
+
+     final KmerTableSet tables=new KmerTableSet(args, 12);
+     final Timer timer=new Timer();
+
+     tables.process(timer);
+     assert(!tables.errorState);
+     timer.stop();
+
+     tables.prefilterArray=null;
+     AbstractRemoveThread.process(Shared.threads(), cutoff, Integer.MAX_VALUE, tables, true);
+
+     return tables;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Constructor.
+ * Parses the arguments, overrides several static settings (read buffer length,
+ * pigz/unpigz, zip threads), copies the shared Parser fields into this object,
+ * and creates the FileFormat objects for the inputs and the output.
+ * @param args Command line arguments
+ */
+ public KmerReduce(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set. Assigned from the parser below but not read again in this constructor.
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; //main() saves and restores these static flags around the run
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Only the text between the first and second '=' becomes the value
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing; the shared parser consumed this argument
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else if(a.equals("k")){
+ k=Integer.parseInt(b);
+ assert(k>0 && k<32);
+ }else if(a.equals("comparisons") || a.equals("c")){
+ //do nothing; accepted but ignored by this class
+ }else if(a.equals("divisor") || a.equals("div") || a.equals("mindivisor")){
+ minDivisor=Tools.parseKMG(b);
+ }else if(a.equals("rename") || a.equals("addname")){
+ //do nothing; accepted but ignored by this class
+ }else if(a.equals("rcomp") || a.equals("reversecomplement")){
+ //do nothing; accepted but ignored by this class
+ }else if(a.equals("condense") || a.equals("consensus")){
+ //do nothing; accepted but ignored by this class
+ }else if(a.equals("groups") || a.equals("g") || a.equals("sets")){
+ //do nothing; accepted but ignored by this class
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+
+ out1=parser.out1;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ //A '#' in a nonexistent filename expands to the pair of names with 1 and 2
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Creates the read streams, runs the hashing threads over all input, closes the
+  * streams and prints timing statistics.
+  * The same-name assertion now tests the output against both inputs; it used to test
+  * in1 twice and never in2. An unused local holding cris.paired() was removed.
+  * @param t Timer supplied by the caller; stopped here before statistics are printed
+  * @throws RuntimeException if the error state was set by any stage
+  */
+ void process(Timer t){
+
+     final ConcurrentReadInputStream cris;
+     {
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+         cris.start();
+         if(verbose){outstream.println("Started cris");}
+     }
+
+     final ConcurrentReadOutputStream ros;
+     if(out1!=null){
+         final int buff=Tools.max(4, Shared.threads());
+
+         //equalsIgnoreCase(null) is false, so a missing in2 passes this check
+         assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+
+         ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
+         ros.start();
+     }else{ros=null;}
+
+     readsProcessed=0;
+     basesProcessed=0;
+
+     //Process the read stream
+     processInner(cris, ros);
+
+     if(verbose){outstream.println("Finished; closing streams.");}
+
+     errorState|=ReadStats.writeAll();
+     errorState|=ReadWrite.closeStreams(cris, ros);
+
+     t.stop();
+
+     //Rates are per unit of t.elapsed and scaled below to k reads/sec and m bases/sec
+     double rpnano=readsProcessed/(double)(t.elapsed);
+     double bpnano=basesProcessed/(double)(t.elapsed);
+
+     String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+     String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+     //Left-pad the counts to a common width
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     outstream.println("Time: \t"+t);
+     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Manage threads */
+ public void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+ if(verbose){outstream.println("Making comparator.");}
+ KmerComparator kc=new KmerComparator(k, minDivisor);
+ kc.addName=false;
+ kc.rcompReads=false;
+
+ if(verbose){outstream.println("Making hash threads.");}
+ final int threads=Shared.threads();
+ ArrayList<HashThread> alht=new ArrayList<HashThread>(threads);
+ for(int i=0; i<threads; i++){alht.add(new HashThread(cris, ros, kc));}
+
+ if(verbose){outstream.println("Starting threads.");}
+ for(HashThread ht : alht){ht.start();}
+
+ if(verbose){outstream.println("Waiting for threads.");}
+ /* Wait for threads to die */
+ for(HashThread ht : alht){
+
+ /* Wait for a thread to die */
+ while(ht.getState()!=Thread.State.TERMINATED){
+ try {
+ ht.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ readsProcessed+=ht.readsProcessedT;
+ basesProcessed+=ht.basesProcessedT;
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Worker that pulls lists of reads from the input stream, hashes each read with the
+  * comparator, and emits one new read per input read whose hash is non-negative,
+  * with the kmer itself as its bases.
+  */
+ private class HashThread extends Thread{
+
+     HashThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, KmerComparator kc_){
+         kc=kc_;
+         ros=ros_;
+         cris=cris_;
+     }
+
+     @Override
+     public void run(){
+
+         ListNum<Read> batch=cris.nextList();
+
+         while(batch!=null && batch.list!=null && batch.list.size()>0){
+             final ArrayList<Read> reads=batch.list;
+             final ArrayList<Read> out=new ArrayList<Read>(reads.size());
+             for(int i=0; i<reads.size(); i++){
+                 final Read r=reads.get(i);
+                 final long kmer=kc.hash(r, null, 0);
+                 readsProcessedT++;
+                 basesProcessedT+=r.length();
+                 if(kmer<0){continue;} //no usable kmer for this read
+                 out.add(new Read(toBytes(kmer), null, r.numericID, header));
+             }
+             if(ros!=null){ros.add(out, batch.id);}
+             cris.returnList(batch.id, reads.isEmpty());
+             batch=cris.nextList();
+         }
+         //Hand back the final (empty or list-less) batch
+         if(batch!=null){
+             cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+         }
+     }
+
+     final ConcurrentReadInputStream cris;
+     final ConcurrentReadOutputStream ros;
+     final KmerComparator kc;
+
+     /** Reads seen by this worker */
+     protected long readsProcessedT=0;
+     /** Sum of r.length() over the reads seen by this worker */
+     protected long basesProcessedT=0;
+
+     /** Last constructor argument of every emitted Read */
+     private static final String header="1";
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Prints a usage hint to outstream. Called when help is requested and when no
+  * input file was given.
+  * This used to throw RuntimeException("TODO"), which turned a help request into a
+  * crash and masked the "at least one input file is required" error raised right
+  * after the call. Both callers already exit or throw themselves.
+  */
+ private void printOptions(){
+     outstream.println("Usage information is not available here yet; please consult the accompanying shellscript.");
+ }
+
+ /**
+  * Decodes a 2-bit-per-base kmer into a new array of k bases.
+  * @param kmer Encoded kmer
+  * @return Array of length k, filled by fill(kmer, dest, 0)
+  */
+ public byte[] toBytes(final long kmer){
+     final byte[] decoded=new byte[k];
+     fill(kmer, decoded, 0);
+     return decoded;
+ }
+
+ /**
+  * Writes the k bases of a kmer into dest starting at pos, highest-order base first.
+  * Each 2-bit code is translated through AminoAcid.numberToBase.
+  */
+ public void fill(final long kmer, final byte[] dest, int pos){
+     for(int bits=2*(k-1); bits>=0; bits-=2){
+         final int code=(int)((kmer>>bits)&3);
+         dest[pos]=AminoAcid.numberToBase[code];
+         pos++;
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int k=31; //Kmer length; settable with k=
+ private long minDivisor=80000000; //Passed to the KmerComparator constructor; settable with divisor=
+ static boolean prefilter=true; //If true and cutoff>1, getValidKmers adds a prefilter= argument; also set by KmerSort's prefilter= flag
+
+ /*--------------------------------------------------------------*/
+ /*---------------- I/O Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String in1=null; //Primary input filename
+ private String in2=null; //Second input filename; derived from in1 when in1 contains '#' and does not exist
+
+ private String out1=null; //Output filename; null disables output
+
+ private String extin=null; //Passed to FileFormat.testInput
+ private String extout=null; //Passed to FileFormat.testOutput
+
+ /*--------------------------------------------------------------*/
+
+ protected long readsProcessed=0; //Sum of the workers' readsProcessedT
+ protected long basesProcessed=0; //Sum of the workers' basesProcessedT
+
+ private long maxReads=-1; //Copied from the parser and passed to the input stream factory
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1; //Format of in1
+ private final FileFormat ffin2; //Format of in2
+
+ private final FileFormat ffout1; //Format of out1
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination of status messages
+ public static boolean verbose=false; //Enables progress messages
+ public boolean errorState=false; //Set when writing statistics or closing streams reports an error
+ private boolean overwrite=false; //Copied from the parser; passed to the output checks
+ private boolean append=false; //Copied from the parser; passed to the output checks
+
+}
diff --git a/current/clump/KmerSort.java b/current/clump/KmerSort.java
new file mode 100755
index 0000000..c84c238
--- /dev/null
+++ b/current/clump/KmerSort.java
@@ -0,0 +1,430 @@
+package clump;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import kmer.KmerTableSet;
+
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class KmerSort {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Code entrance from the command line.
+  * The global pigz/unpigz flags are captured on entry and restored on the way out,
+  * now also when construction or processing throws.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ;
+     try{
+         Timer t=new Timer();
+         KmerSort ks=new KmerSort(args);
+         ks.process(t);
+     }finally{
+         //The constructor overrides these static flags; put them back for in-process callers
+         ReadWrite.USE_PIGZ=pigz;
+         ReadWrite.USE_UNPIGZ=unpigz;
+     }
+ }
+
+ /**
+ * Constructor.
+ * Parses the arguments, overrides several static settings (read buffer length,
+ * pigz/unpigz, zip threads), copies the shared Parser fields into this object,
+ * and creates the FileFormat objects: one output format and either one input
+ * format or, when groups>1 and the input name contains %, one per group.
+ * @param args Command line arguments
+ */
+ public KmerSort(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set. Assigned from the parser below but not read again in this constructor.
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; //main() saves and restores these static flags around the run
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Only the text between the first and second '=' becomes the value
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing; the shared parser consumed this argument
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else if(a.equals("k")){
+ k=Integer.parseInt(b);
+ assert(k>0 && k<32);
+ }else if(a.equals("mincount") || a.equals("mincr")){
+ minCount=Integer.parseInt(b);
+ }else if(a.equals("comparisons") || a.equals("c")){
+ comparisons=Integer.parseInt(b); //Stored but not read anywhere else in this class
+ }else if(a.equals("divisor") || a.equals("div") || a.equals("mindivisor")){
+ minDivisor=Tools.parseKMG(b);
+ }else if(a.equals("rename") || a.equals("addname")){
+ addName=Tools.parseBoolean(b);
+// }else if(a.equals("cache")){
+// KmerComparator.useCache=Tools.parseBoolean(b);//Obsolete
+ }else if(a.equals("rcomp") || a.equals("reversecomplement")){
+ rcomp=Tools.parseBoolean(b);
+ }else if(a.equals("condense") || a.equals("consensus")){
+ condense=Tools.parseBoolean(b);
+ }else if(a.equals("prefilter")){
+ KmerReduce.prefilter=Tools.parseBoolean(b); //Sets a static field of KmerReduce
+ }else if(a.equals("groups") || a.equals("g") || a.equals("sets")){
+ groups=Integer.parseInt(b);
+ splitInput=(groups>1);
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+
+ out1=parser.out1;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ if(groups>1 && in1.contains("%") && (splitInput || !new File(in1).exists())){ //One input file per group; % is replaced by the group number
+ ffin=new FileFormat[groups];
+ for(int i=0; i<groups; i++){
+ ffin[i]=FileFormat.testInput(in1.replaceFirst("%", ""+i), FileFormat.FASTQ, extin, true, true);
+ }
+ }else{
+ assert(!in1.contains("%") && groups==1) : "The % symbol must only be present in the input filename if groups>1.";
+ ffin=new FileFormat[1];
+ ffin[0]=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ groups=1; //Single input, whatever was requested
+ }
+// if(groups>1){ReadWrite.USE_UNPIGZ=false;} //Not needed since they are not concurrent
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Count kmers.
+  * When a minimum count above 1 is requested, makes sure the shared ClumpTools.table
+  * exists (building it from the input reads if necessary) and points this object's
+  * table at it. Previously, if the shared table already existed, this object's
+  * table was left null and the hash threads were handed no table.
+  */
+ void preprocess(){
+     if(minCount<=1){return;}
+     if(ClumpTools.table==null){
+         ClumpTools.table=KmerReduce.getValidKmersFromReads(in1, k, minCount);
+     }
+     table=ClumpTools.table;
+ }
+
+ /**
+  * Runs preprocessing, opens one input stream per group and the output stream,
+  * sorts everything via processInner, and prints timing statistics.
+  * @param t Timer supplied by the caller; stopped here before statistics are printed
+  * @throws RuntimeException if the error state was set by any stage
+  */
+ void process(Timer t){
+
+     preprocess();
+
+     final ConcurrentReadInputStream[] inputs=new ConcurrentReadInputStream[groups];
+     for(int g=0; g<inputs.length; g++){
+         inputs[g]=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin[g], null, null, null);
+     }
+
+     final ConcurrentReadOutputStream ros;
+     if(out1==null){
+         ros=null;
+     }else{
+         final int buff=1;
+
+         final boolean samInput=(in1!=null && in1.contains(".sam"));
+         if(inputs[0].paired() && !samInput){
+             outstream.println("Writing interleaved.");
+         }
+
+         assert(!out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+         ros=ConcurrentReadOutputStream.getStream(ffout1, null, null, null, buff, null, false);
+         ros.start();
+     }
+
+     readsProcessed=0;
+     basesProcessed=0;
+
+     //Process the read stream
+     processInner(inputs, ros);
+
+     //Release the kmer table, both the shared reference and this object's
+     ClumpTools.table=null;
+     table=null;
+
+     errorState|=ReadStats.writeAll();
+
+     t.stop();
+
+     final double rpnano=readsProcessed/(double)(t.elapsed);
+     final double bpnano=basesProcessed/(double)(t.elapsed);
+
+     //Abbreviate large counts with a k or m suffix
+     String rpstring;
+     if(readsProcessed<100000){rpstring=""+readsProcessed;}
+     else if(readsProcessed<100000000){rpstring=(readsProcessed/1000)+"k";}
+     else{rpstring=(readsProcessed/1000000)+"m";}
+
+     String bpstring;
+     if(basesProcessed<100000){bpstring=""+basesProcessed;}
+     else if(basesProcessed<100000000){bpstring=(basesProcessed/1000)+"k";}
+     else{bpstring=(basesProcessed/1000000)+"m";}
+
+     //Left-pad the counts to a common width
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     outstream.println("Time: \t"+t);
+     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Collect and sort the reads */
+ void processInner(final ConcurrentReadInputStream[] crisArray, final ConcurrentReadOutputStream ros){
+ if(verbose){outstream.println("Making comparator.");}
+ KmerComparator kc=new KmerComparator(k, minDivisor);
+ kc.addName=addName;
+ kc.rcompReads=rcomp;
+
+ int i=0;
+ for(ConcurrentReadInputStream cris : crisArray){
+ i++;
+ if(verbose){outstream.println("Starting cris "+i+".");}
+ cris.start();
+
+ if(verbose){outstream.println("Fetching reads.");}
+ ArrayList<Read> reads=fetchReads(cris, kc);
+
+ if(verbose){outstream.println("Sorting.");}
+ Collections.sort(reads, kc);
+
+ if(condense){
+ if(verbose){outstream.println("Condensing.");}
+ reads=condenseReads(reads);
+ }
+
+ if(ros!=null){
+ if(verbose){outstream.println("Writing.");}
+ ros.add(reads, 0);
+ }
+ }
+
+ if(ros!=null){
+ if(verbose){outstream.println("Waiting for writing to complete.");}
+ errorState=ReadWrite.closeStream(ros)|errorState;
+ }
+
+ if(verbose){outstream.println("Done!");}
+ }
+
+ /**
+  * Reads one input stream to completion with one HashThread per available thread,
+  * closes the stream, and concatenates the per-thread results.
+  * The size assertion now checks this pass's read count, which is the value that is
+  * cast to int for the list capacity. It used to check the running total across all
+  * groups, which could trip spuriously even though each group fits.
+  * @param cris Input stream to drain; must already be started
+  * @param kc Comparator used to hash the reads
+  * @return All reads of the stream, as the concatenation of the per-thread sorted lists
+  */
+ public ArrayList<Read> fetchReads(final ConcurrentReadInputStream cris, final KmerComparator kc){
+     if(verbose){outstream.println("Making hash threads.");}
+     final int threads=Shared.threads();
+     ArrayList<HashThread> alht=new ArrayList<HashThread>(threads);
+     for(int i=0; i<threads; i++){alht.add(new HashThread(i, cris, kc));}
+
+     if(verbose){outstream.println("Starting threads.");}
+     for(HashThread ht : alht){ht.start();}
+
+
+     if(verbose){outstream.println("Waiting for threads.");}
+     long readsThisPass=0;
+     /* Wait for threads to die */
+     for(HashThread ht : alht){
+
+         /* Wait for a thread to die */
+         while(ht.getState()!=Thread.State.TERMINATED){
+             try {
+                 ht.join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         readsThisPass+=ht.readsProcessedT;
+         basesProcessed+=ht.basesProcessedT;
+     }
+     readsProcessed+=readsThisPass;
+
+     if(verbose){outstream.println("Closing input stream.");}
+     errorState=ReadWrite.closeStream(cris)|errorState;
+
+     if(verbose){outstream.println("Combining thread output.");}
+     assert(readsThisPass<=Integer.MAX_VALUE) : readsThisPass;
+     ArrayList<Read> list=new ArrayList<Read>((int)readsThisPass);
+     for(int i=0; i<threads; i++){
+         //Drop the reference to each worker as soon as its reads are copied
+         HashThread ht=alht.set(i, null);
+         list.addAll(ht.storage);
+     }
+
+     assert(list.size()==readsThisPass);
+     return list;
+ }
+
+ /**
+  * Builds a ClumpList from the reads, empties the input list, and returns the
+  * result of ClumpList.condense(); the ClumpList is cleared before returning.
+  * @param list Reads to condense; cleared by this call
+  * @return Output of ClumpList.condense()
+  */
+ public ArrayList<Read> condenseReads(ArrayList<Read> list){
+     final ClumpList clumps=new ClumpList(list);
+     list.clear();
+     final ArrayList<Read> condensed=clumps.condense();
+     clumps.clear();
+     return condensed;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Prints a usage hint to outstream. Called when help is requested and when no
+  * input file was given.
+  * This used to throw RuntimeException("TODO"), which turned a help request into a
+  * crash and masked the "at least one input file is required" error raised right
+  * after the call. Both callers already exit or throw themselves.
+  */
+ private void printOptions(){
+     outstream.println("Usage information is not available here yet; please consult the accompanying shellscript.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Worker that pulls lists of reads from the input stream, hashes them with the
+  * comparator, accumulates them in storage, and sorts storage when the input ends.
+  */
+ private class HashThread extends Thread{
+
+     HashThread(int id_, ConcurrentReadInputStream cris_, KmerComparator kc_){
+         storage=new ArrayList<Read>();
+         kc=kc_;
+         cris=cris_;
+         id=id_;
+     }
+
+     @Override
+     public void run(){
+
+         ListNum<Read> batch=cris.nextList();
+
+         while(batch!=null && batch.list!=null && batch.list.size()>0){
+             final ArrayList<Read> reads=batch.list;
+             kc.hash(reads, table, minCount);
+             for(int i=0; i<reads.size(); i++){
+                 readsProcessedT++;
+                 basesProcessedT+=reads.get(i).length();
+             }
+             storage.addAll(reads);
+             cris.returnList(batch.id, reads.isEmpty());
+             batch=cris.nextList();
+         }
+         //Hand back the final (empty or list-less) batch
+         if(batch!=null){
+             cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+         }
+         Collections.sort(storage, kc);//Optimization for TimSort
+     }
+
+     /** Worker number given at construction */
+     final int id;
+     final ConcurrentReadInputStream cris;
+     final KmerComparator kc;
+     /** Every read this worker consumed; sorted at the end of run() */
+     final ArrayList<Read> storage;
+
+     /** Reads seen by this worker */
+     protected long readsProcessedT=0;
+     /** Sum of r.length() over the reads seen by this worker */
+     protected long basesProcessedT=0;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int k=31; //Kmer length; settable with k=
+ private int minCount=0; //Count cutoff handed to the comparator; above 1 makes preprocess() build a kmer table
+ private int comparisons=3; //Parsed from comparisons=; not read anywhere else in this class
+ private long minDivisor=80000000; //Passed to the KmerComparator constructor; settable with divisor=
+
+ private int groups=1; //Number of input files when the input name contains %; reset to 1 otherwise
+
+ KmerTableSet table=null; //Kmer table handed to the hash threads; set in preprocess(), cleared in process()
+
+ /*--------------------------------------------------------------*/
+ /*---------------- I/O Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String in1=null; //Input filename, possibly containing a % group placeholder
+
+ private String out1=null; //Output filename; null disables output
+
+ private String extin=null; //Passed to FileFormat.testInput
+ private String extout=null; //Passed to FileFormat.testOutput
+
+ /*--------------------------------------------------------------*/
+
+ protected long readsProcessed=0; //Total reads over all groups
+ protected long basesProcessed=0; //Total bases over all groups
+
+ private long maxReads=-1; //Copied from the parser and passed to the input stream factory
+ private boolean addName=true; //Copied into KmerComparator.addName
+ private boolean rcomp=true; //Copied into KmerComparator.rcompReads
+ private boolean condense=false; //If true, sorted reads are passed through condenseReads before writing
+ private boolean splitInput=false; //Set to (groups>1) when groups= is parsed
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin[]; //One input format per group
+
+ private final FileFormat ffout1; //Format of out1
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination of status messages
+ public static boolean verbose=false; //Enables progress messages
+ public boolean errorState=false; //Set when writing statistics or closing streams reports an error
+ private boolean overwrite=false; //Copied from the parser; passed to the output checks
+ private boolean append=false; //Copied from the parser; passed to the output checks
+
+}
diff --git a/current/clump/KmerSplit.java b/current/clump/KmerSplit.java
new file mode 100755
index 0000000..351e3d0
--- /dev/null
+++ b/current/clump/KmerSplit.java
@@ -0,0 +1,420 @@
+package clump;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import kmer.KmerTableSet;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class KmerSplit {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Code entrance from the command line.
+  * Remembers the global pigz/unpigz flags, runs the splitter, and restores the
+  * flags afterwards because the constructor overrides them.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ;
+     try{
+         Timer t=new Timer();
+         KmerSplit ks=new KmerSplit(args);
+         ks.process(t);
+     }finally{
+         //Restore the static flags even when construction or processing throws,
+         //so a programmatic caller of main() is not left with altered settings.
+         ReadWrite.USE_PIGZ=pigz;
+         ReadWrite.USE_UNPIGZ=unpigz;
+     }
+ }
+
+ /**
+  * Constructor.  Parses the command line, adjusts global I/O settings, and
+  * resolves the input formats and one output format per group.
+  * The output name is a pattern in which the first '%' is replaced by the group number.
+  * @param args Command line arguments
+  * @throws RuntimeException if no input or no output is given, groups is less than 1,
+  * multiple groups are requested without a '%' in the output name, or the outputs cannot be written
+  */
+ public KmerSplit(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+     boolean setInterleaved=false; //Whether it was explicitly set.
+
+     //Global defaults; flags parsed below may override them.
+     Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+     ReadWrite.USE_PIGZ=false;
+     ReadWrite.USE_UNPIGZ=true;
+     ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+     Parser parser=new Parser();
+     for(int i=0; i<args.length; i++){
+         String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if(b==null || b.equalsIgnoreCase("null")){b=null;}
+         while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+         if(parser.parse(arg, a, b)){
+             //do nothing
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+         }else if(a.equals("parse_flag_goes_here")){
+             //Set a variable here
+         }else if(a.equals("k")){
+             k=Integer.parseInt(b);
+             assert(k>0 && k<32);
+         }else if(a.equals("mincount") || a.equals("mincr")){
+             minCount=Integer.parseInt(b);
+         }else if(a.equals("groups") || a.equals("g") || a.equals("sets")){
+             groups=Integer.parseInt(b);
+         }else if(a.equals("divisor") || a.equals("div") || a.equals("mindivisor")){
+             minDivisor=Tools.parseKMG(b);
+         }else if(a.equals("rename") || a.equals("addname")){
+             addName=Tools.parseBoolean(b);
+         }else if(a.equals("rcomp") || a.equals("reversecomplement")){
+             //ignore rcomp=Tools.parseBoolean(b);
+         }else if(a.equals("condense")){
+             //ignore condense=Tools.parseBoolean(b);
+         }else if(a.equals("prefilter")){
+             KmerReduce.prefilter=Tools.parseBoolean(b);
+         }else{
+             outstream.println("Unknown parameter "+args[i]);
+             assert(false) : "Unknown parameter "+args[i];
+             // throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     //Fail fast whether or not assertions are enabled: a non-positive group count
+     //breaks the per-group arrays and the modulo used to route reads.
+     if(groups<1){
+         throw new RuntimeException("groups must be at least 1; groups="+groups);
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+
+         maxReads=parser.maxReads;
+
+         overwrite=ReadStats.overwrite=parser.overwrite;
+         append=ReadStats.append=parser.append;
+
+         setInterleaved=parser.setInterleaved;
+
+         in1=parser.in1;
+         in2=parser.in2;
+
+         out1=parser.out1;
+
+         extin=parser.extin;
+         extout=parser.extout;
+     }
+
+     if(groups>2){ReadWrite.USE_PIGZ=false;}
+
+     //A '#' in the input name expands to a pair of files unless that literal file exists.
+     if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+         in2=in1.replace("#", "2");
+         in1=in1.replace("#", "1");
+     }
+     if(in2!=null){
+         if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+         FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+     }
+
+     assert(FastaReadInputStream.settingsOK());
+
+     if(in1==null){
+         printOptions();
+         throw new RuntimeException("Error - at least one input file is required.");
+     }
+     if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+         ByteFile.FORCE_MODE_BF2=true;
+     }
+
+     if(!setInterleaved){
+         assert(in1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\n";
+         if(in2!=null){ //If there are 2 input streams.
+             FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+             outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+         }
+     }
+
+     if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+     if(out1!=null){
+         //Without a '%' every group would resolve to the same file name, so several
+         //groups would overwrite one another; reject that even with assertions off.
+         if(groups>1 && !out1.contains("%")){
+             throw new RuntimeException("Output filename must contain '%', which is replaced by the group number: "+out1);
+         }
+         assert(out1.contains("%"));
+         outArray=new String[groups];
+         for(int i=0; i<groups; i++){
+             outArray[i]=out1.replaceFirst("%", ""+i);
+         }
+         if(!Tools.testOutputFiles(overwrite, append, false, outArray)){
+             outstream.println((out1==null)+", "+out1);
+             throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+         }
+         ffout=new FileFormat[groups];
+         for(int i=0; i<groups; i++){
+             ffout[i]=FileFormat.testOutput(outArray[i], FileFormat.FASTQ, extout, true, overwrite, append, false);
+         }
+     }else{
+         outArray=null;
+         throw new RuntimeException("out is a required parameter.");
+     }
+
+     ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+     ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Count kmers */
+ void preprocess(){
+ if(minCount>1 && ClumpTools.table==null){
+ table=KmerReduce.getValidKmersFromReads(in1, k, minCount);
+ ClumpTools.table=table;
+ }
+ }
+
+ /**
+  * Create read streams and process all data, then print timing statistics.
+  * @param t Timer started by the caller; stopped here once processing is complete
+  * @throws RuntimeException if any stage reported an error
+  */
+ void process(Timer t){
+
+     preprocess();
+
+     final ConcurrentReadInputStream cris;
+     {
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+         cris.start();
+         if(verbose){outstream.println("Started cris");}
+     }
+     boolean paired=cris.paired();
+     if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+     if(cris.paired() && (in1==null || !in1.contains(".sam"))){
+         outstream.println("Writing interleaved.");
+     }
+
+     //Checked once, before any output stream is opened; it does not depend on the group.
+     //Both inputs are compared (previously in1 was tested twice and in2 never).
+     assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+
+     //One output stream per group
+     final ConcurrentReadOutputStream ros[]=new ConcurrentReadOutputStream[groups];
+     for(int i=0; i<groups; i++){
+         final int buff=8;
+         ros[i]=ConcurrentReadOutputStream.getStream(ffout[i], null, null, null, buff, null, false);
+         ros[i].start();
+     }
+
+     readsProcessed=0;
+     basesProcessed=0;
+
+     //Process the read stream
+     processInner(cris, ros);
+
+     errorState|=ReadStats.writeAll();
+
+     t.stop();
+
+     double rpnano=readsProcessed/(double)(t.elapsed);
+     double bpnano=basesProcessed/(double)(t.elapsed);
+
+     //Abbreviate large totals with a k or m suffix
+     String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+     String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+     //Left-pad to 8 characters so the columns align
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     outstream.println("Time: \t"+t);
+     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Collect and sort the reads */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros){
+ if(verbose){outstream.println("Making comparator.");}
+ KmerComparator kc=new KmerComparator(k, minDivisor);
+ kc.addName=addName;
+ kc.rcompReads=false;
+
+ if(verbose){outstream.println("Splitting reads.");}
+ splitReads(cris, ros, kc);
+
+ if(verbose){outstream.println("Done!");}
+ }
+
+ public void splitReads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream[] ros, final KmerComparator kc){
+ if(verbose){outstream.println("Making hash threads.");}
+ final int threads=Shared.threads();
+ ArrayList<HashThread> alht=new ArrayList<HashThread>(threads);
+ for(int i=0; i<threads; i++){alht.add(new HashThread(i, cris, ros, kc));}
+
+ if(verbose){outstream.println("Starting threads.");}
+ for(HashThread ht : alht){ht.start();}
+
+
+ if(verbose){outstream.println("Waiting for threads.");}
+ /* Wait for threads to die */
+ for(HashThread ht : alht){
+
+ /* Wait for a thread to die */
+ while(ht.getState()!=Thread.State.TERMINATED){
+ try {
+ ht.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ readsProcessed+=ht.readsProcessedT;
+ basesProcessed+=ht.basesProcessedT;
+ }
+
+ if(verbose){outstream.println("Closing streams.");}
+ errorState=ReadWrite.closeStreams(cris, ros)|errorState;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** This is called if the program runs with no parameters */
+ private void printOptions(){
+ throw new RuntimeException("TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Worker thread that fetches read lists from the shared input stream, hashes them
+  * with the comparator, and routes every read to the output stream of its group.
+  */
+ private class HashThread extends Thread{
+
+     HashThread(int id_, ConcurrentReadInputStream cris_, ConcurrentReadOutputStream[] ros_, KmerComparator kc_){
+         kc=kc_;
+         ros=ros_;
+         cris=cris_;
+         id=id_;
+     }
+
+     @Override
+     public void run(){
+
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln==null ? null : ln.list);
+
+         //One pending batch of reads per output group
+         final ArrayList<Read>[] pending=new ArrayList[groups];
+         for(int g=0; g<pending.length; g++){
+             pending[g]=new ArrayList<Read>(buffer);
+         }
+
+         while(reads!=null && reads.size()>0){
+             kc.hash(reads, table, minCount);
+             for(int idx=0; idx<reads.size(); idx++){
+                 final Read r=reads.get(idx);
+                 readsProcessedT++;
+                 basesProcessedT+=r.length();
+
+                 //r.obj is read as a long[] whose first element is the kmer used for routing
+                 //(presumably stored by kc.hash above - confirm in KmerComparator)
+                 final long kmer=((long[])r.obj)[0];
+                 final int group=(int)((kmer%kc.divisor)%groups);
+
+                 final ArrayList<Read> batch=pending[group];
+                 batch.add(r);
+                 if(batch.size()>=buffer){
+                     //Hand the full batch to the writer and start a fresh one
+                     ros[group].add(batch, 0);
+                     pending[group]=new ArrayList<Read>(buffer);
+                 }
+             }
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln==null ? null : ln.list);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+
+         //Flush whatever remains in the partially filled batches
+         for(int g=0; g<groups; g++){
+             if(!pending[g].isEmpty()){
+                 ros[g].add(pending[g], 0);
+             }
+         }
+     }
+
+     final int id;
+     final ConcurrentReadInputStream cris;
+     final ConcurrentReadOutputStream[] ros;
+     final KmerComparator kc;
+     /** Number of reads collected per group before they are sent to the output stream */
+     static final int buffer=200;
+
+     protected long readsProcessedT=0;
+     protected long basesProcessedT=0;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int k=31;
+ private long minDivisor=80000000;
+ private int groups=16;
+ private int minCount=0;
+
+ KmerTableSet table=null;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- I/O Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String in2=null;
+
+ private String out1=null;
+ private String[] outArray=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ protected long readsProcessed=0;
+ protected long basesProcessed=0;
+
+ private long maxReads=-1;
+ private boolean addName=false;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat[] ffout;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+}
diff --git a/current/cluster/Cluster.java b/current/cluster/Cluster.java
new file mode 100755
index 0000000..13162e8
--- /dev/null
+++ b/current/cluster/Cluster.java
@@ -0,0 +1,270 @@
+package cluster;
+
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 24, 2014
+ *
+ */
+public class Cluster{
+
+ /**
+  * Creates an empty cluster with zeroed kmer count and probability arrays.
+  * @param id_ Cluster identifier
+  * @param k1_ Length of the 'big' kmer
+  * @param k2_ Length of the 'small' kmer
+  * @param arraylen1_ Length of the count and probability arrays for k1
+  * @param arraylen2_ Length of the count and probability arrays for k2
+  */
+ public Cluster(int id_, int k1_, int k2_, int arraylen1_, int arraylen2_){
+
+     id=id_;
+
+     //'big' kmer storage
+     k1=k1_;
+     arraylen1=arraylen1_;
+     kmerArray1=new AtomicLongArray(arraylen1_);
+     kmerProbArray1=new float[arraylen1_];
+
+     //'small' kmer storage
+     k2=k2_;
+     arraylen2=arraylen2_;
+     kmerArray2=new AtomicLongArray(arraylen2_);
+     kmerProbArray2=new float[arraylen2_];
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Recomputes the derived statistics from the atomic counters: the GC fraction
+  * and, for each enabled kmer length, the smoothed kmer probabilities.
+  * When no bases or no kmers have been counted (for example right after
+  * resetAtomics) the results are finite instead of NaN: gc becomes 0 and each
+  * probability becomes the smoothing term alone.
+  */
+ public void recalculate(){
+     final double bases=baseCount.doubleValue();
+     gc=(bases>0 ? (float)(gcCount.doubleValue()/bases) : 0f);
+
+     if(k1>0){fillProbabilities(kmerArray1, kmerProbArray1, arraylen1);}
+     if(k2>0){fillProbabilities(kmerArray2, kmerProbArray2, arraylen2);}
+ }
+
+ /**
+  * Converts raw counts into probabilities: 95% of the mass is distributed in
+  * proportion to the counts and 5% is spread evenly over all entries.
+  * @param counts Source kmer counts
+  * @param probs Destination array, overwritten for indices 0..len-1
+  * @param len Number of entries to process
+  */
+ private static void fillProbabilities(AtomicLongArray counts, float[] probs, int len){
+     long total=0;
+     for(int i=0; i<len; i++){
+         total+=counts.get(i);
+     }
+     final double extra=(0.05/len);
+     //Guard the empty case; 0.95/0 is infinite and 0*infinity would be NaN
+     final double mult=(total>0 ? 0.95/total : 0);
+     for(int i=0; i<len; i++){
+         probs[i]=(float)(counts.get(i)*mult+extra);
+     }
+ }
+
+ public void resetAtomics(){
+ for(int i=0; i<arraylen1; i++){
+ kmerArray1.set(i, 0);
+ }
+ for(int i=0; i<arraylen2; i++){
+ kmerArray2.set(i, 0);
+ }
+ depthsum1.set(0);
+ depthsum2.set(0);
+ readCount.set(0);
+ baseCount.set(0);
+ gcCount.set(0);
+ }
+
+ /**
+  * Adds a read's statistics to this cluster's atomic counters.
+  * The read's obj field must hold its ReadTag.
+  * Kmer arrays may be null (ClusterTools.toKmers returns null for reads shorter
+  * than k, and the three-argument ClusterTools.toKmerCounts always returns null);
+  * in that case the corresponding kmer counts are left unchanged.
+  * @param r Read to add; null is ignored
+  */
+ public void add(Read r){
+     if(r==null){return;}
+     ReadTag rt=(ReadTag)r.obj;
+     assert(rt!=null);
+     final byte[] bases=r.bases;
+
+     readCount.incrementAndGet();
+     baseCount.addAndGet(bases==null ? 0 : bases.length);
+     gcCount.addAndGet(rt.gcCount);
+
+     if(rt.strand==0){
+         depthsum1.addAndGet(rt.depth);
+     }else{
+         depthsum2.addAndGet(rt.depth);
+     }
+
+     if(k1>0){
+         final int[] kmers=rt.kmerArray1(k1);
+         if(kmers!=null){
+             //The array is sorted, so equal kmers are adjacent; add each run with one atomic update
+             int kmer=-1, run=0;
+             for(int i=0; i<kmers.length; i++){
+                 int x=kmers[i];
+                 if(x==kmer){
+                     run++;
+                 }else{
+                     if(run>0){kmerArray1.addAndGet(kmer, run);}
+                     kmer=x;
+                     run=1;
+                 }
+             }
+             if(run>0){kmerArray1.addAndGet(kmer, run);}
+         }
+     }
+
+     if(k2>0){
+         final int[] counts=rt.kmerArray2(k2);
+         if(counts!=null){
+             //Here the index is the kmer and the value is its count
+             for(int kmer=0; kmer<counts.length; kmer++){
+                 int x=counts[kmer];
+                 if(x>0){kmerArray2.addAndGet(kmer, x);}
+             }
+         }
+     }
+ }
+
+ /**
+  * Scores a read against this cluster, using the paired scorer when the read has a mate.
+  * @param r Read to score; may be null
+  * @return 0 for a null read, otherwise the result of scoreSingle or scorePaired
+  */
+ public float score(Read r) {
+     if(r==null){return 0;}
+     if(r.mate==null){
+         return scoreSingle(r);
+     }
+     return scorePaired(r);
+ }
+
+ /**
+  * Unfinished: intended to combine depth, GC, and kmer scores of an unpaired read
+  * with fixed weights.  With assertions enabled this always fails at the TODO assertion.
+  * @param r Read to score; may be null
+  * @return 0 for a null read, otherwise the weighted sum of the three component scores
+  */
+ public float scoreSingle(Read r) {
+     if(r==null){return 0;}
+     final ReadTag tag=(ReadTag)r.obj;
+
+     //Relative contribution of each component
+     final float depthWeight=.2f;
+     final float gcWeight=.2f;
+     final float kmerWeight=.6f;
+
+     assert(false) : "TODO";
+     final float depthScore=scoreDepthSingle(tag);
+     final float gcScore=scoreGcSingle(tag);
+     final float kmerScore=scoreKmer1(tag);
+     assert(false);
+
+     float total=depthWeight*depthScore;
+     total+=gcWeight*gcScore;
+     total+=kmerWeight*kmerScore;
+     return total;
+ }
+
+ /**
+ * @param rt
+ * @return
+ */
+ private float scoreKmer1(ReadTag rt) {
+ int[] kmers=rt.kmerArray1(k1);
+
+ float score=0;
+ if(scoreMode1==SCORE_MODE_AND){
+ float f=ClusterTools.andCount(kmers, kmerArray1);
+ assert(false);
+ }else if(scoreMode1==SCORE_MODE_MULT){
+ float f=ClusterTools.innerProduct(kmers, kmerProbArray1);
+ assert(false);
+ }else{
+ throw new RuntimeException(""+scoreMode1);
+ }
+
+ return score;
+ }
+
+ /**
+ * @param rt
+ * @return
+ */
+ private float scoreKmer2(ReadTag rt) {
+ int[] kmers=rt.kmerArray2(k2);
+ float[] probs=rt.kmerFreq2(k2);
+
+ float score=0;
+ if(scoreMode2==SCORE_MODE_AND){
+ float f=ClusterTools.andCount(kmers, kmerArray2);
+ assert(false);
+ }else if(scoreMode2==SCORE_MODE_MULT){
+ float f=ClusterTools.innerProduct(kmers, kmerProbArray2);
+ assert(false);
+ }else if(scoreMode2==SCORE_MODE_DIF){
+ float f=ClusterTools.absDif(probs, kmerProbArray2);
+ assert(false);
+ }else if(scoreMode2==SCORE_MODE_RMS){
+ float f=ClusterTools.rmsDif(probs, kmerProbArray2);
+ assert(false);
+ }else if(scoreMode2==SCORE_MODE_KS){
+ float f=ClusterTools.ksFunction(probs, kmerProbArray2);
+ assert(false);
+ }else{
+ throw new RuntimeException(""+scoreMode2);
+ }
+
+ return score;
+ }
+
+ /**
+ * @param rt
+ * @return
+ */
+ private float scoreGcSingle(ReadTag rt) {
+ assert(false) : "TODO";
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ /**
+ * @param rt
+ * @return
+ */
+ private float scoreDepthSingle(ReadTag rt) {
+ assert(false) : "TODO";
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ /**
+  * Unimplemented score for a paired read.  With assertions enabled this always
+  * fails at the TODO assertion, before the null check.
+  * @param r Read to score; may be null
+  * @return 0
+  */
+ public float scorePaired(Read r) {
+     assert(false) : "TODO";
+     if(r!=null){
+         //The tag is fetched but not used yet
+         ReadTag rt=(ReadTag)r.obj;
+     }
+     return 0;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ public final int id;
+
+ /** 'big' kmer */
+ public final int k1;
+ /** 'small' kmer */
+ public final int k2;
+
+ public final int arraylen1;
+ public final int arraylen2;
+
+ /*--------------------------------------------------------------*/
+
+ public float gc;
+ public int depth1, depth2;
+
+ final AtomicLongArray kmerArray1;
+ final float[] kmerProbArray1;
+
+ final AtomicLongArray kmerArray2;
+ final float[] kmerProbArray2;
+
+ final AtomicLong depthsum1=new AtomicLong(0);
+ final AtomicLong depthsum2=new AtomicLong(0);
+
+ final AtomicLong readCount=new AtomicLong(0);
+ final AtomicLong baseCount=new AtomicLong(0);
+// final AtomicLong kmerCount=new AtomicLong(0);
+ final AtomicLong gcCount=new AtomicLong(0);
+
+ /*--------------------------------------------------------------*/
+
+ public static final int SCORE_MODE_DIF=0;
+ public static final int SCORE_MODE_RMS=1;
+ public static final int SCORE_MODE_AND=2;
+ public static final int SCORE_MODE_MULT=3;
+ public static final int SCORE_MODE_KS=4;
+
+ public static int scoreMode1=SCORE_MODE_MULT;
+ public static int scoreMode2=SCORE_MODE_RMS;
+}
diff --git a/current/cluster/ClusterTools.java b/current/cluster/ClusterTools.java
new file mode 100755
index 0000000..aee5678
--- /dev/null
+++ b/current/cluster/ClusterTools.java
@@ -0,0 +1,180 @@
+package cluster;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import jgi.Dedupe;
+
+import dna.AminoAcid;
+
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 24, 2014
+ *
+ */
+public class ClusterTools {
+
+ /**
+  * Unimplemented placeholder; always returns null and ignores all arguments.
+  * Calls with three arguments resolve to this overload, because the counting
+  * implementation in this class takes four parameters.
+  * @param bases Sequence bases (unused)
+  * @param object Unused
+  * @param k Kmer length (unused)
+  * @return null
+  */
+ public static int[] toKmerCounts(byte[] bases, Object object, int k) {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ /**
+  * Converts a sequence into a sorted array of canonical kmers, where the canonical
+  * form is the smaller of the forward kmer and its reverse-complement.
+  * Every window is counted; there is no reset on ambiguous bases (the original
+  * note says this variant turns 'N' into 'A', which depends on Dedupe.baseToNumber).
+  * NOTE(review): kmers are packed into an int at 2 bits per base, so k presumably
+  * has to stay below 16 - confirm against callers.
+  * @param bases Sequence bases
+  * @param array_ Optional array to reuse; used only if its length is exactly bases.length-k+1
+  * @param k Kmer length
+  * @return Sorted canonical kmers, or null if bases is null or shorter than k
+  */
+ public static int[] toKmers(final byte[] bases, int[] array_, final int k){
+     if(bases==null || bases.length<k){return null;}
+     final int alen=bases.length-k+1;
+     final int[] array=(array_!=null && array_.length==alen ? array_ : new int[alen]);
+
+     final int shift=2*k;
+     final int shift2=shift-2;
+     final int mask=~((-1)<<shift);
+
+     int kmer=0;
+     int rkmer=0;
+     int len=0;
+
+     for(int i=0, j=0; i<bases.length; i++){
+         byte b=bases[i];
+         int x=Dedupe.baseToNumber[b];
+         int x2=Dedupe.baseToComplementNumber[b];
+         kmer=((kmer<<2)|x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+         //Track how many bases are in the window.  Previously len was never incremented,
+         //so the len>=k test never passed and the result was all zeros.
+         len++;
+         if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k+1), Tools.min(i+1, k)));}
+         if(len>=k){
+             array[j]=Tools.min(kmer, rkmer);
+             j++;
+         }
+     }
+
+     Arrays.sort(array);
+     return array;
+ }
+
+ /**
+  * Counts the canonical kmers of a sequence.  The result is indexed by kmer:
+  * element i is the number of windows whose canonical kmer equals i.
+  * Every window is counted; there is no reset on ambiguous bases.
+  * @param bases Sequence bases
+  * @param array_ Optional array to reuse; used (after being zeroed) only if its length is exactly alen
+  * @param k Kmer length
+  * @param alen Length of the count array; must exceed the largest canonical kmer for k
+  * @return Kmer-indexed counts, or null if bases is null or shorter than k
+  */
+ public static int[] toKmerCounts(final byte[] bases, int[] array_, final int k, final int alen){
+     if(bases==null || bases.length<k){return null;}
+     final int[] array;
+     if(array_!=null && array_.length==alen){
+         array=array_;
+         Arrays.fill(array, 0); //Discard stale counts from a previous use
+     }else{
+         array=new int[alen];
+     }
+
+     final int shift=2*k;
+     final int shift2=shift-2;
+     final int mask=~((-1)<<shift);
+
+     int kmer=0;
+     int rkmer=0;
+     int len=0;
+
+     for(int i=0; i<bases.length; i++){
+         byte b=bases[i];
+         int x=Dedupe.baseToNumber[b];
+         int x2=Dedupe.baseToComplementNumber[b];
+         kmer=((kmer<<2)|x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+         //Track how many bases are in the window.  Previously len was never incremented,
+         //so nothing was ever counted.
+         len++;
+         if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k+1), Tools.min(i+1, k)));}
+         if(len>=k){
+             array[Tools.min(kmer, rkmer)]++;
+         }
+     }
+
+     //Deliberately not sorted: the position in the array identifies the kmer,
+     //and sorting would detach each count from its kmer.
+     return array;
+ }
+
+ /**
+  * Finds the largest canonical kmer value for a kmer length by enumerating every
+  * 2k-bit kmer and taking the smaller of it and its reverse-complement.
+  * @param k Kmer length
+  * @return The maximum canonical kmer encountered
+  */
+ public static int maxCanonicalKmer(int k){
+     final int limit=(int)((1L<<(2*k))-1);
+     int best=0;
+     int kmer=0;
+     while(kmer<=limit){
+         final int canonical=Tools.min(kmer, AminoAcid.reverseComplementBinaryFast(kmer, k));
+         best=Tools.max(canonical, best);
+         kmer++;
+     }
+     return best;
+ }
+
+ /**
+  * Computes the fraction of a read's kmers that have a positive count in the cluster.
+  * @param kmers Read kmers, used as indices into counts
+  * @param counts Cluster kmer counts
+  * @return Number of kmers with a positive count divided by kmers.length
+  */
+ static final float andCount(int[] kmers, AtomicLongArray counts){
+     int present=0;
+     for(final int kmer : kmers){
+         if(counts.get(kmer)>0){
+             present++;
+         }
+     }
+     final float total=(float)kmers.length;
+     return present/total;
+ }
+
+ /**
+  * Sums the cluster frequencies of a read's kmers; negative kmers are skipped.
+  * @param kmers Read kmers, used as indices into probs
+  * @param probs Cluster kmer frequencies
+  * @return Sum of probs[kmer] over all non-negative kmers
+  */
+ static final float innerProduct(int[] kmers, float[] probs){
+     float total=0;
+     for(int i=0; i<kmers.length; i++){
+         final int kmer=kmers[i];
+         if(kmer<0){continue;}
+         total+=probs[kmer];
+     }
+     return total;
+ }
+
+ /**
+  * Sums Tools.absdif over corresponding elements of two equal-length frequency arrays.
+  * @param a Read kmer frequencies
+  * @param b Cluster kmer frequencies
+  * @return The accumulated difference, narrowed to float
+  */
+ static final float absDif(float[] a, float[] b){
+     assert(a.length==b.length);
+     double total=0;
+     int idx=0;
+     while(idx<a.length){
+         final double x=(double)a[idx];
+         final double y=(double)b[idx];
+         total+=Tools.absdif(x, y);
+         idx++;
+     }
+     return (float)total;
+ }
+
+ /**
+  * Computes the square root of the mean squared Tools.absdif between
+  * corresponding elements of two equal-length frequency arrays.
+  * @param a Read kmer frequencies
+  * @param b Cluster kmer frequencies
+  * @return The root-mean-square difference, narrowed to float
+  */
+ static final float rmsDif(float[] a, float[] b){
+     assert(a.length==b.length);
+     double squares=0;
+     int idx=0;
+     while(idx<a.length){
+         final double dif=Tools.absdif((double)a[idx], (double)b[idx]);
+         squares+=dif*dif;
+         idx++;
+     }
+     final double mean=squares/a.length;
+     return (float)Math.sqrt(mean);
+ }
+
+ /**
+  * Computes the sum over i of a[i]*ln(a[i]/b[i]) for two equal-length frequency arrays.
+  * Terms with a[i]==0 contribute nothing (the usual 0*ln(0)=0 convention);
+  * evaluating them directly would give 0*(-Infinity)=NaN and poison the sum.
+  * A term with a[i]>0 and b[i]==0 is still infinite.
+  * @param a Read kmer frequencies
+  * @param b Cluster kmer frequencies
+  * @return The accumulated sum, narrowed to float
+  */
+ static final float ksFunction(float[] a, float[] b){
+     assert(a.length==b.length);
+     double sum=0;
+     for(int i=0; i<a.length; i++){
+         if(a[i]==0){continue;} //Zero-frequency term; see method comment
+         double d=(double)a[i]*Math.log(a[i]/b[i]);
+         sum+=d;
+     }
+
+     return (float)sum;
+ }
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/cluster/MergeReadHeaders.java b/current/cluster/MergeReadHeaders.java
new file mode 100755
index 0000000..f2370c3
--- /dev/null
+++ b/current/cluster/MergeReadHeaders.java
@@ -0,0 +1,332 @@
+package cluster;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Feb 7, 2014
+ *
+ */
+public class MergeReadHeaders {
+
+ /**
+  * Code entrance from the command line.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final MergeReadHeaders merger=new MergeReadHeaders(args);
+     merger.process(timer);
+ }
+
+ /**
+  * Constructor.  Parses the command line, adjusts global I/O settings, and
+  * resolves the file formats of the read inputs, read outputs, and header file.
+  * Prints usage and exits when no arguments are given.
+  * @param args Command line arguments
+  * @throws RuntimeException if in1 or the header file is missing, out2 is given
+  * without out1, or the output files cannot be written
+  */
+ public MergeReadHeaders(String[] args){
+ if(args==null || args.length==0){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Status messages go to stderr when reads are written to stdout (stderr is also the default)
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ //Global defaults; flags parsed below may override them
+ Parser parser=new Parser();
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ //Each argument is expected in key=value form; leading hyphens on the key are stripped
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("passes")){
+ assert(false) : "'passes' is disabled.";
+ // passes=Integer.parseInt(b);
+ }else if(a.equals("verbose")){
+ //Propagate verbosity to the I/O classes as well
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+ // align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("reads") || a.equals("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("t") || a.equals("threads")){
+ Shared.setThreads(b);
+ }else if(a.equals("build") || a.equals("genome")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(a.equals("header")){
+ headerFile=b;
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+ in1=b;
+ }else if(a.equals("in2") || a.equals("input2")){
+ in2=b;
+ }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+ out1=b;
+ }else if(a.equals("out2") || a.equals("output2")){
+ out2=b;
+ }else if(a.equals("extin")){
+ extin=b;
+ }else if(a.equals("extout")){
+ extout=b;
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ //NOTE(review): unlike append above, ReadStats.overwrite is not updated here - confirm intended
+ overwrite=Tools.parseBoolean(b);
+ }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument naming stdin or an existing file is taken as the input
+ in1=arg;
+ }else if(out1==null && i==1 && !arg.contains("=")){
+ //A bare second argument is taken as the output
+ out1=arg;
+ }else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+
+ //A '#' in a file name expands to a pair of names with 1 and 2 substituted;
+ //for input this happens only when no file with the literal name exists
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null || headerFile==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file and a header file are required.");
+ }
+ //Select ByteFile2 when more than two threads are available and no mode was forced
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ // if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;}
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ //out1="stdout";
+ System.err.println("Warning: output destination not set; producing no output. To print to standard out, set 'out=stdout.fq'");
+ }
+
+ //Unless interleaving was set explicitly, derive it from the number of input and output files
+ if(!parser.setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //The literal name "null" means no output
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ Parser.processQuality();
+
+ //Resolve file formats; FASTQ is the default for reads, TEXT for the header file
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+ ffheader=FileFormat.testInput(headerFile, FileFormat.TEXT, null, true, true);
+ }
+
+ /** Prints a notice that usage information is not available; real usage text is still to be written. */
+ public static void printOptions(){
+     final PrintStream err=System.err;
+     err.println("Usage information unavailable");
+ }
+
+ /**
+  * Streams all reads, replaces each read's id with the next line of the header
+  * file (a mate takes the line after its partner's), writes the reads out, and
+  * prints timing statistics.
+  * @param t Timer started by the caller; stopped here once processing is complete
+  * @throws RuntimeException if closing the read streams reported an error
+  */
+ void process(Timer t){
+
+     final ConcurrentReadInputStream cris;
+     {
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+         if(verbose){System.err.println("Started cris");}
+         cris.start();
+     }
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));}
+
+     ConcurrentReadOutputStream ros=null;
+     if(out1!=null){
+         final int buff=4;
+
+         if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+             outstream.println("Writing interleaved.");
+         }
+
+         //Neither output may name either input (previously out1 was compared with in1 twice)
+         assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+         assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+         ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false);
+         ros.start();
+     }
+
+     long readsProcessed=0;
+     long basesProcessed=0;
+
+     //Replacement headers, consumed one line per read in input order
+     TextFile tf=new TextFile(ffheader);
+
+     {
+
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+         }
+
+         while(reads!=null && reads.size()>0){
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 final Read r1=reads.get(idx);
+                 final Read r2=r1.mate;
+
+                 {
+                     readsProcessed++;
+                     basesProcessed+=r1.length();
+                     r1.id=processHeader(tf.readLine());
+                 }
+                 if(r2!=null){
+                     readsProcessed++;
+                     basesProcessed+=r2.length();
+                     r2.id=processHeader(tf.readLine());
+                 }
+             }
+
+             if(ros!=null){ros.add(reads, ln.id);}
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     //Release the header file, which was previously left open
+     tf.close();
+
+     errorState|=ReadWrite.closeStreams(cris, ros);
+
+     t.stop();
+
+     double rpnano=readsProcessed/(double)(t.elapsed);
+     double bpnano=basesProcessed/(double)(t.elapsed);
+
+     //Abbreviate large totals with a k or m suffix
+     String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+     String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+     //Left-pad to 8 characters so the columns align
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     outstream.println("Time: \t"+t);
+     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         //Report this class rather than the class the message was copied from
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+  * Hook for transforming a header line before it becomes a read id; currently the identity.
+  * @param s Header line; asserted non-null
+  * @return s unchanged
+  */
+ public static String processHeader(String s){
+     assert(s!=null);
+     final String header=s;
+     return header;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ public boolean errorState=false;
+
+ private String headerFile=null;
+
+ private String in1=null;
+ private String in2=null;
+
+ private String out1=null;
+ private String out2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ private boolean overwrite=false;
+ private boolean append=false;
+
+ private long maxReads=-1;
+
+ private final FileFormat ffheader;
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+
+ private PrintStream outstream=System.err;
+
+ /*--------------------------------------------------------------*/
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/cluster/ReadTag.java b/current/cluster/ReadTag.java
new file mode 100755
index 0000000..f1412e0
--- /dev/null
+++ b/current/cluster/ReadTag.java
@@ -0,0 +1,107 @@
+package cluster;
+
+import java.io.Serializable;
+
+import stream.Read;
+import align2.Tools;
+
+/**
+ * Per-read bookkeeping for clustering: strand, G/C count, cluster numbers and lazily built kmer arrays of one Read.
+ * @author Brian Bushnell
+ * @date Mar 24, 2014
+ */
+class ReadTag implements Serializable{
+
+ /**
+ * Version stamp used by Java serialization.
+ */
+ private static final long serialVersionUID = -6186366525723397478L;
+
+ public ReadTag(Read r_){ //Wraps r_; requires r_.bases to be non-null because the bases are iterated immediately
+ r=r_;
+ strand=r.strand();
+ int gcCount_=0;
+ for(byte b : r.bases){
+ if(b=='G' || b=='C'){ //Only upper-case G and C are counted
+ gcCount_++;
+ }
+ }
+ gcCount=gcCount_;
+
+ processHeader(r.id);
+ }
+
+ private void processHeader(String s){ //Unfinished: s is ignored and the header-derived fields are reset to -1
+ assert(false) : "TODO"; //With assertions enabled, constructing a ReadTag therefore fails here
+ gc=-1;
+ depth=-1;
+ cluster0=-1;
+ }
+
+ Read r1(){ //The wrapped read when strand==0, otherwise its mate (may be null)
+ return strand==0 ? r : r.mate;
+ }
+
+ Read r2(){ //The wrapped read when strand==1, otherwise its mate (may be null)
+ return strand==1 ? r : r.mate;
+ }
+
+ ReadTag tag1(){ //Tag attached to r1(), or null when r1() is null (same contract as tag2)
+ Read r1=r1();
+ return r1==null ? null : (ReadTag)r1.obj;
+ }
+
+ ReadTag tag2(){ //Tag attached to r2(), or null when r2() is null
+ Read r2=r2();
+ return r2==null ? null : (ReadTag)r2.obj;
+ }
+
+// private int[] toKmers(final int k){
+// return ClusterTools.toKmers(r.bases, null, k);
+// }
+
+ int[] kmerArray1(int k1){ //Built on first call and cached; k1 is ignored on later calls
+ if(kmerArray1==null){kmerArray1=ClusterTools.toKmers(r.bases, null, k1);}
+ return kmerArray1;
+ }
+
+ int[] kmerArray2(int k2){ //Built on first call and cached; k2 is ignored on later calls
+ if(kmerArray2==null){kmerArray2=ClusterTools.toKmerCounts(r.bases, null, k2);}
+ return kmerArray2;
+ }
+
+ float[] kmerFreq2(int k2){ //Smoothed frequencies: 95% proportional to the counts plus 5% spread evenly; cached after the first call
+ if(kmerFreq2==null){
+ int[] counts=kmerArray2(k2);
+ if(counts!=null){
+ long sum=Tools.sum(counts);
+ kmerFreq2=new float[counts.length];
+ float extra=(0.05f/counts.length);
+ float mult=(sum>0 ? 0.95f/sum : 0f); //A zero sum used to give Infinity here and NaN (0*Infinity) in every slot
+ for(int i=0; i<counts.length; i++){
+ kmerFreq2[i]=counts[i]*mult+extra;
+ }
+ }
+ }
+ return kmerFreq2; //Stays null while kmerArray2 returns null
+ }
+
+ /** Sorted long kmers */
+ private int[] kmerArray1;
+
+ /** Canonically-ordered short kmer counts */
+ private int[] kmerArray2;
+
+ private float[] kmerFreq2; //Cache for kmerFreq2(int)
+
+ final Read r;
+ final byte strand;
+ final int gcCount; //Number of 'G' or 'C' bytes in r.bases
+
+ int depth;
+ int cluster0=-1; //initial cluster
+ int cluster1=-1; //final cluster
+
+ float gc;
+
+}
diff --git a/current/cluster/ReclusterByKmer.java b/current/cluster/ReclusterByKmer.java
new file mode 100755
index 0000000..51efaaf
--- /dev/null
+++ b/current/cluster/ReclusterByKmer.java
@@ -0,0 +1,616 @@
+package cluster;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.concurrent.ThreadLocalRandom;
+
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Feb 7, 2014
+ *
+ */
+public class ReclusterByKmer {
+
+ public static void main(String[] args){
+ //Entry point: the Timer is created first, then the instance is built from args and run with that Timer.
+ final Timer timer=new Timer();
+ new ReclusterByKmer(args).process(timer);
+ }
+
+ public ReclusterByKmer(String[] args){
+ if(args==null || args.length==0){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=8;
+
+ int k1_=12;
+ int k2_=3;
+ Parser parser=new Parser();
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseFiles(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCommon(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("build") || a.equals("genome")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ in1=arg;
+ }else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ overwrite=parser.overwrite;
+ append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+// qfin1=parser.qfin1;
+// qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+// qfout1=parser.qfout1;
+// qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;}
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ //out1="stdout";
+ System.err.println("Warning: output destination not set; producing no output. To print to standard out, set 'out=stdout.fq'");
+ }
+
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+ /* Check for output file collisions */
+ Tools.testOutputFiles(overwrite, append, false, out1, out2);
+
+ k1=k1_;
+ assert(k1>=-1 && k1<=15) : "k1 must lie between 1 and 15, inclusive (0 to disable)";
+ k2=k2_;
+ assert(k2>=-1 && k2<=6) : "k2 must lie between 1 and 6, inclusive (0 to disable)";
+
+ arraylen1=(k1>0 ? ClusterTools.maxCanonicalKmer(k1)+1 : 0);
+ arraylen2=(k2>0 ? ClusterTools.maxCanonicalKmer(k2)+1 : 0);
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Prints a placeholder usage line to stderr. TODO: replace with real usage text. */
+ public static void printOptions(){
+ System.err.println("Usage information unavailable");
+ }
+
+ void process(Timer t){ //Streams all reads from input to output (if any), counts reads and bases, and prints a summary
+
+
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+ if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));}
+
+ ConcurrentReadOutputStream ros=null;
+ if(out1!=null){
+ final int buff=4;
+
+ if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+ outstream.println("Writing interleaved.");
+ }
+ //Each output name is compared with both input names (the first check used to test in1 twice)
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+ assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false);
+ ros.start();
+ }
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+// System.err.println("Fetched "+reads);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ final Read r2=r1.mate;
+
+ {
+ readsProcessed++;
+ basesProcessed+=r1.length();
+ }
+ if(r2!=null){
+ readsProcessed++;
+ basesProcessed+=r2.length();
+ }
+
+ //Placeholder for a filter: remove is always false, so no read is ever nulled out
+ boolean remove=false;
+ if(remove){reads.set(idx, null);}
+ }
+
+ if(ros!=null){ros.add(reads, ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ errorState|=ReadWrite.closeStreams(cris, ros);
+
+ t.stop();
+
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ if(errorState){ //Report under this class's own name instead of the copied "ReformatReads"
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Returns the cluster numbered x, creating it and any missing lower-numbered clusters on demand.
+ * The whole lookup runs while holding the monitor of clusterList: ClusterThread.addToCluster calls this
+ * from several threads and ArrayList is not synchronized.
+ * The previous version grew the list only to size x, so get(x) threw for every index not yet present.
+ * @param x Zero-based cluster number; a negative value still makes ArrayList.get throw
+ * @return The Cluster stored at index x
+ */
+ private Cluster fetchCluster(int x){
+ synchronized(clusterList){
+ if(x>=clusterList.size()){
+ clusterList.ensureCapacity(2*x);
+ for(int i=clusterList.size(); i<=x; i++){ //Inclusive bound so that slot x itself is created
+ clusterList.add(new Cluster(i, k1, k2, arraylen1, arraylen2));
+ }
+ }
+ return clusterList.get(x);
+ }
+ }
+
+ /**
+ * Creates clusters; Generates kmer spectra for clusters. Runs THREADS ClusterThreads in CLUSTER_MODE_CREATE over a fresh input stream.
+ * @param t Timer; not used by the current body
+ */
+ private void findKmerSpectra(Timer t){
+
+ /* Create read input stream */
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2);
+ cris.start(); //4567
+ if(verbose){System.err.println("Started cris");}
+ }
+
+ /* Create ClusterThreads */
+ ArrayList<ClusterThread> alct=new ArrayList<ClusterThread>(THREADS);
+ //ambigMode is passed as -1 because it is only consulted when reclustering
+ for(int i=0; i<THREADS; i++){alct.add(new ClusterThread(i, CLUSTER_MODE_CREATE, -1, cris));}
+ for(ClusterThread ct : alct){ct.start();}
+
+ long readsIn=0, basesIn=0; //Totals are accumulated below but not reported or returned
+
+ /* Wait for threads to die, and gather statistics */
+ for(ClusterThread ct : alct){
+ while(ct.getState()!=Thread.State.TERMINATED){
+ try {
+ ct.join();
+ } catch (InterruptedException e) {
+ // Interrupted while waiting: print the trace and keep waiting until the thread has terminated
+ e.printStackTrace();
+ }
+ }
+ readsIn+=ct.readsInT;
+ basesIn+=ct.basesInT;
+ }
+
+ /* Shut down I/O streams; capture error status */
+ errorState|=ReadWrite.closeStreams(cris);
+ }
+
+ /**
+ * Assign reads to clusters using additional kmer information. Runs THREADS ClusterThreads in CLUSTER_MODE_RECLUSTER with this object's ambigMode.
+ * @param t Timer; not used by the current body
+ */
+ private void recluster(Timer t){
+
+ /* Create read input stream */
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2);
+ cris.start(); //4567
+ if(verbose){System.err.println("Started cris");}
+ }
+
+ /* Create ClusterThreads */
+ ArrayList<ClusterThread> alct=new ArrayList<ClusterThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alct.add(new ClusterThread(i, CLUSTER_MODE_RECLUSTER, ambigMode, cris));}
+ for(ClusterThread ct : alct){ct.start();}
+
+ long readsIn=0, basesIn=0; //Totals are accumulated below but not reported or returned
+
+ /* Wait for threads to die, and gather statistics */
+ for(ClusterThread ct : alct){
+ while(ct.getState()!=Thread.State.TERMINATED){
+ try {
+ ct.join();
+ } catch (InterruptedException e) {
+ // Interrupted while waiting: print the trace and keep waiting until the thread has terminated
+ e.printStackTrace();
+ }
+ }
+ readsIn+=ct.readsInT;
+ basesIn+=ct.basesInT;
+ }
+
+ /* Shut down I/O streams; capture error status */
+ errorState|=ReadWrite.closeStreams(cris);
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private class ClusterThread extends Thread{ //Worker that pulls read lists from a shared stream and builds or reassigns clusters
+
+ public ClusterThread(int id_, int clusterMode_, int ambigMode_, ConcurrentReadInputStream cris_){
+ id=id_;
+ ambigMode=ambigMode_;
+ clusterMode=clusterMode_;
+ cris=cris_;
+
+ //No random generator is captured here: the constructor runs on the creating thread, not on this worker
+ }
+
+ @Override
+ public void run(){
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //While there are more reads lists...
+ while(reads!=null && reads.size()>0){
+
+ //For each read (or pair) in the list...
+ for(int i=0; i<reads.size(); i++){
+ processRead(reads.get(i));
+ }
+
+ //Fetch a new read list
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} //Same null-safe hand-back as process(); ln or ln.list can be null here
+ }
+
+
+ private void processRead(final Read r1){ //Counts the read (and mate), tags both, then dispatches on clusterMode
+ final Read r2=r1.mate;
+
+ if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+ readsInT++;
+ basesInT+=r1.length();
+ if(r2!=null){
+ readsInT++;
+ basesInT+=r2.length();
+ }
+
+ final ReadTag rt1=new ReadTag(r1);
+ final ReadTag rt2=(r2==null ? null : new ReadTag(r2));
+
+ r1.obj=rt1;
+ if(r2!=null){r2.obj=rt2;}
+
+ if(clusterMode==CLUSTER_MODE_CREATE){
+ addToCluster(r1, r2, rt1, rt2);
+ }else if(clusterMode==CLUSTER_MODE_RECLUSTER){
+ reCluster(r1, r2, rt1, rt2);
+ }else{
+ throw new RuntimeException("Unknown mode "+clusterMode);
+ }
+ }
+
+ private void addToCluster(Read r1, Read r2, ReadTag rt1, ReadTag rt2){ //Adds the pair to the initial cluster(s) named by the tags
+ final int cn1=rt1.cluster0;
+ final int cn2=rt2==null ? cn1 : rt2.cluster0;
+ if(cn1==cn2){
+ Cluster c1=fetchCluster(cn1);
+ c1.add(r1);
+ c1.add(r2); //NOTE(review): r2 is null for unpaired reads - confirm that Cluster.add accepts null
+ }else{
+ Cluster c1=fetchCluster(cn1);
+ Cluster c2=fetchCluster(cn2);
+ c1.add(r1);
+ c1.add(r2);
+ c2.add(r1);
+ c2.add(r2);
+ }
+ }
+
+ private void reCluster(Read r1, Read r2, ReadTag rt1, ReadTag rt2){ //Picks the best-scoring cluster for each read and resolves disagreements per ambigMode
+
+ assert(false) : "TODO";
+
+ Cluster bestCluster1=null;
+ Cluster bestCluster2=null;
+
+ float bestScore1=-999999999, bestScore1_2=-999999999;
+ float bestScore2=-999999999, bestScore2_1=-999999999;
+
+ for(Cluster c : clusterList){
+
+ float score1=c.score(r1);
+ float score2=c.score(r2);
+
+ if(bestCluster1==null || score1>bestScore1){
+ bestCluster1=c;
+ bestScore1=score1;
+ bestScore1_2=score2;
+ }
+ if(bestCluster2==null || score2>bestScore2){
+ bestCluster2=c;
+ bestScore2=score2;
+ bestScore2_1=score1;
+ }
+ }
+
+ if(r2==null){
+ rt1.cluster1=bestCluster1.id;
+ }else if(bestCluster1==bestCluster2){
+ rt1.cluster1=rt2.cluster1=bestCluster1.id;
+ }else{
+ assert(r1!=null && r2!=null && bestCluster1!=bestCluster2);
+
+ float a=bestScore1+bestScore1_2; //Combined score of the pair in read 1's best cluster
+ float b=bestScore2+bestScore2_1; //Combined score of the pair in read 2's best cluster
+
+ if(ambigMode==AMBIG_MODE_BEST){
+ if(a>=b){
+ rt1.cluster1=rt2.cluster1=bestCluster1.id;
+ }else{
+ rt1.cluster1=rt2.cluster1=bestCluster2.id;
+ }
+ }else if(ambigMode==AMBIG_MODE_BOTH){
+ assert(false) : "TODO";
+ }else if(ambigMode==AMBIG_MODE_TOSS){
+ rt1.cluster1=rt2.cluster1=-1;
+ }else if(ambigMode==AMBIG_MODE_RAND){
+ if(a<0 || b<0){
+ float c=0-(Tools.min(a, b))*1.5f; //Shift that makes both weights positive
+ a=a+c;
+ b=b+c; //Was b=a+c, which threw away read 2's score
+ }
+ float coin=ThreadLocalRandom.current().nextFloat()*(a+b); //current() is called on the worker thread itself
+ if(coin<=a){
+ rt1.cluster1=rt2.cluster1=bestCluster1.id;
+ }else{
+ rt1.cluster1=rt2.cluster1=bestCluster2.id;
+ }
+ }
+ }
+
+ }
+
+ final int id;
+ final int clusterMode;
+ final int ambigMode;
+ final ConcurrentReadInputStream cris;
+
+ //The former ThreadLocalRandom field is gone; reCluster obtains the generator via ThreadLocalRandom.current()
+
+ long readsInT; //Reads seen by this thread; read by the owner after join()
+ long basesInT; //Bases seen by this thread; read by the owner after join()
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+
+ public boolean errorState=false; //OR-ed with the result of ReadWrite.closeStreams; process() throws when it is true
+
+ private final ArrayList<Cluster> clusterList=new ArrayList<Cluster>(256); //Clusters indexed by cluster number; grown by fetchCluster
+
+ /** 'big' kmer */
+ public final int k1;
+ /** 'small' kmer */
+ public final int k2;
+
+ public final int arraylen1; //ClusterTools.maxCanonicalKmer(k1)+1, or 0 when k1<=0
+ public final int arraylen2; //ClusterTools.maxCanonicalKmer(k2)+1, or 0 when k2<=0
+
+ private String in1=null;
+ private String in2=null;
+
+ private String out1=null;
+ private String out2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ private boolean overwrite=false;
+ private boolean append=false;
+
+ private long maxReads=-1;
+
+ private int ambigMode=AMBIG_MODE_RAND; //How reCluster resolves pairs whose reads prefer different clusters
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+
+ private PrintStream outstream=System.err;
+
+ private int THREADS=Shared.threads(); //Number of ClusterThreads started by findKmerSpectra and recluster
+
+ /*--------------------------------------------------------------*/
+
+ public static boolean verbose=false;
+
+ public static final int CLUSTER_MODE_CREATE=0; //ClusterThread calls addToCluster
+ public static final int CLUSTER_MODE_RECLUSTER=1; //ClusterThread calls reCluster
+ public static final int CLUSTER_MODE_REFINE=2; //Not handled by ClusterThread.processRead, which throws for it
+
+ public static final int AMBIG_MODE_BEST=0; //Both reads go to the cluster with the higher combined score
+ public static final int AMBIG_MODE_BOTH=1; //Not implemented (assert TODO)
+ public static final int AMBIG_MODE_TOSS=2; //Both reads get cluster1=-1
+ public static final int AMBIG_MODE_RAND=3; //Cluster chosen at random, weighted by the combined scores
+
+}
diff --git a/current/dna/AminoAcid.java b/current/dna/AminoAcid.java
new file mode 100755
index 0000000..8c49b2f
--- /dev/null
+++ b/current/dna/AminoAcid.java
@@ -0,0 +1,721 @@
+package dna;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import align2.QualityTools;
+import align2.Tools;
+
+import stream.Read;
+
+
+/**
+ * @author Brian Bushnell
+ * @date July 1, 2010
+ *
+ */
+public final class AminoAcid {
+
+
+ public static void main(String[] args){ //Small demo: translates a fixed base string and prints the result; args is unused
+// for(String s : stringToAA.keySet()){
+// System.out.println(s+"\t->\t"+stringToAA.get(s));
+// }
+
+ String bases="atctgatTGGcgcgatatatcg";
+ String acids=stringToAAs(bases);
+
+ System.out.println(bases+" -> "+acids);
+
+ }
+
+
+ private AminoAcid(){ //Not meant to be called
+ this(null); //The String constructor calls line.split, so a null argument fails there
+ assert(false);
+ System.exit(0);
+ }
+
+ private AminoAcid(String line){
+ String[] s2=line.split(", ");
+ String[] s3=new String[s2.length-3];
+ for(int i=3; i<s2.length; i++){
+ s3[i-3]=s2[i];
+ }
+
+ name=s2[0];
+ symbol=s2[1];
+ letter=s2[2].charAt(0);
+ codeStrings=s3;
+ }
+
+ private AminoAcid(String n, String c3, String c1, String[] bases){ //Field-wise constructor: name, 3-letter symbol, 1-letter code, codons
+ name=n;
+ symbol=c3;
+ letter=c1.charAt(0); //Only the first character of c1 is kept
+ codeStrings=bases; //The array is stored as-is, not copied
+ }
+
+ public String toString(){ //Formats as "name, symbol, letter, [codon, codon, ...]"
+ return name+", "+symbol+", "+letter+", "+Arrays.toString(codeStrings);
+ }
+
+ public static String kmerToString(long kmer, int k){
+ //Decode two bits per base; the lowest bits are the last base, so the buffer is filled right to left.
+ final char[] text=new char[k];
+ for(int pos=k-1; pos>=0; pos--){
+ text[pos]=(char)numberToBase[(int)(kmer&3)];
+ kmer>>=2;
+ }
+ return new String(text);
+ }
+
+
+ public final String name; //Full name, first field of the definition string
+ public final String symbol; //Second field of the definition string (e.g. "Ala")
+ public final char letter; //First character of the third field (e.g. 'A')
+ public final String[] codeStrings; //Remaining fields: the codons, written with U rather than T in the constants of this class
+
+
+ //a=1
+ //c=2
+ //g=4
+ //t=8
+
+// R G A (puRine)
+// Y T C (pYrimidine)
+// K G T (Ketone)
+// M A C (aMino group)
+// S G C (Strong interaction)
+// W A T (Weak interaction)
+// B G T C (not A) (B comes after A)
+// D G A T (not C) (D comes after C)
+// H A C T (not G) (H comes after G)
+// V G C A (not T, not U) (V comes after U)
+// N A G C T (aNy)
+// X masked
+// - gap of indeterminate length
+
+ public static final byte[] numberToBase={ //2-bit code to base: 0=A, 1=C, 2=G, 3=T; index 4 is N
+ 'A','C','G','T','N'
+ };
+
+ public static final byte[] numberToComplementaryBase={ //2-bit code to the complementary base letter
+ 'T','G','C','A','N'
+ };
+
+ public static final byte[] numberToComplement={ //2-bit code to the code of its complement (equals 3-x for 0..3); 4 maps to itself
+ 3,2,1,0,4
+ };
+
+ public static final byte[] numberToBaseExtended={ //Indexed by bit flags a=1, c=2, g=4, t=8 OR-ed together; yields the IUPAC symbol
+ ' ','A','C','M','G','R','S','V', //0-7
+ 'T','W','Y','H','K','D','B','N', //8-15
+ 'X',' ',' ',' ',' ',' ',' ',' ', //16-23
+ };
+
+ /** Has 'N' in position 0. Mainly for translating compressed arrays containing zeroes to bases. */
+ public static final byte[] numberToBaseExtended2={
+ 'N','A','C','M','G','R','S','V', //0-7
+ 'T','W','Y','H','K','D','B','N', //8-15
+ 'X',' ',' ',' ',' ',' ',' ',' ', //16-23
+ };
+
+ public static final byte[] degenerateBases={ //Same indexing as numberToBaseExtended, but only the 2- and 3-base ambiguity symbols are filled in
+ ' ',' ',' ','M',' ','R','S','V', //0-7
+ ' ','W','Y','H','K','D','B',' ', //8-15
+ ' ',' ',' ',' ',' ',' ',' ',' ', //16-23
+ };
+
+ public static final byte[] numberToComplementaryBaseExtended={ //Same indexing as numberToBaseExtended; yields the symbol of the complement
+ ' ','T','G','K','C','Y','W','B', //0-7
+ 'A','S','R','D','M','H','V','N', //8-15
+ 'X',' ',' ',' ',' ',' ',' ',' ', //16-23
+ };
+
+ /** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', -1 otherwise */
+ public static final byte[] baseToNumber=new byte[128];
+
+ /** Element i is: 3 for 'A', 2 for 'C', 1 for 'G', 0 for 'T', -1 otherwise */
+ public static final byte[] baseToComplementNumber=new byte[128];
+
+ /** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 4 for 'N', -1 otherwise */
+ public static final byte[] baseToNumberACGTN=new byte[128];
+
+ /** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 0 for 'N', -1 otherwise */
+ public static final byte[] baseToNumberACGTN2=new byte[128];
+
+ /** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 4 otherwise */
+ public static final byte[] baseToNumberACGTother=new byte[128];
+
+ /** A>A, C>C, G>G, T/U>T, other>N */
+ public static final byte[] baseToACGTN=new byte[128];
+
+ public static final byte[] baseToComplementExtended=new byte[128]; //ASCII base to ASCII complement; used by the reverseComplementBases methods
+
+ /** Uracil to Thymine, everything else unchanged */
+ public static final byte[] uToT=new byte[256];
+
+ /** Element i is the bitwise OR of the bit flags (a=1, c=2, g=4, t=8) of the bases an IUPAC symbol stands for.<br>
+ * For example, baseToNumberExtended['M'] = (flag of 'A' | flag of 'C') = (1 | 2) = 3 <br>
+ * Invalid characters are -1 */
+ public static final byte[] baseToNumberExtended=new byte[128];
+ public static final AminoAcid[] AlphabeticalAAs=new AminoAcid[21];
+ public static final AminoAcid[] codeToAA=new AminoAcid[66]; //Indexed by the 6-bit codon number built in toAA(char,char,char)
+ public static final char[] codeToChar=new char[66]; //Same index as codeToAA; used by the toChar methods
+ public static final byte[] codeToByte=new byte[66]; //Same index as codeToAA; used by toByte
+ public static final byte[] aminoToCode=new byte[128];
+ public static final HashMap<String, AminoAcid> stringToAA=new HashMap<String, AminoAcid>(512);
+
+ public static final AminoAcid Alanine=new AminoAcid("Alanine, Ala, A, GCU, GCC, GCA, GCG"); //Each literal is split on ", " into name, symbol, letter and then codons
+ public static final AminoAcid Arginine=new AminoAcid("Arginine, Arg, R, CGU, CGC, CGA, CGG, AGA, AGG");
+ public static final AminoAcid Asparagine=new AminoAcid("Asparagine, Asn, N, AAU, AAC");
+ public static final AminoAcid AsparticAcid=new AminoAcid("AsparticAcid, Asp, D, GAU, GAC");
+ public static final AminoAcid Cysteine=new AminoAcid("Cysteine, Cys, C, UGU, UGC");
+ public static final AminoAcid GlutamicAcid=new AminoAcid("GlutamicAcid, Glu, E, GAA, GAG");
+ public static final AminoAcid Glutamine=new AminoAcid("Glutamine, Gln, Q, CAA, CAG");
+ public static final AminoAcid Glycine=new AminoAcid("Glycine, Gly, G, GGU, GGC, GGA, GGG");
+ public static final AminoAcid Histidine=new AminoAcid("Histidine, His, H, CAU, CAC");
+ public static final AminoAcid Isoleucine=new AminoAcid("Isoleucine, Ile, I, AUU, AUC, AUA");
+ public static final AminoAcid Leucine=new AminoAcid("Leucine, Leu, L, UUA, UUG, CUU, CUC, CUA, CUG");
+ public static final AminoAcid Lysine=new AminoAcid("Lysine, Lys, K, AAA, AAG");
+ public static final AminoAcid Methionine=new AminoAcid("Methionine, Met, M, AUG");
+ public static final AminoAcid Phenylalanine=new AminoAcid("Phenylalanine, Phe, F, UUU, UUC");
+ public static final AminoAcid Proline=new AminoAcid("Proline, Pro, P, CCU, CCC, CCA, CCG");
+ public static final AminoAcid Serine=new AminoAcid("Serine, Ser, S, UCU, UCC, UCA, UCG, AGU, AGC");
+ public static final AminoAcid Threonine=new AminoAcid("Threonine, Thr, T, ACU, ACC, ACA, ACG");
+ public static final AminoAcid Tryptophan=new AminoAcid("Tryptophan, Trp, W, UGG");
+ public static final AminoAcid Tyrosine=new AminoAcid("Tyrosine, Tyr, Y, UAU, UAC");
+ public static final AminoAcid Valine=new AminoAcid("Valine, Val, V, GUU, GUC, GUA, GUG");
+ public static final AminoAcid END=new AminoAcid("End, End, *, UAA, UGA, UAG"); //The three stop codons, letter '*'
+ public static final AminoAcid ANY=new AminoAcid("Any, Any, X, XXX"); //Placeholder entry with the pseudo-codon XXX
+
+
+
+
+ public static final byte[][] COLORS=new byte[][] { //COLORS[a][b] equals a XOR b for base codes 0..3; used by baseToColor and colorToBase
+ {0, 1, 2, 3},
+ {1, 0, 3, 2},
+ {2, 3, 0, 1},
+ {3, 2, 1, 0}
+ };
+
+ /** Builds and returns a fresh array holding the reverse complement of in (ASCII bases); in itself is not modified. */
+ public static final byte[] reverseComplementBases(final byte[] in){
+ final int len=in.length;
+ final byte[] rc=new byte[len];
+ for(int src=len-1, dst=0; src>=0; src--, dst++){
+ rc[dst]=baseToComplementExtended[in[src]];
+ }
+ return rc;
+ }
+
+
+ public static final void reverseComplementBasesInPlace(final byte[] in){ //Reverse-complements the whole array in place; null is ignored
+ if(in!=null){reverseComplementBasesInPlace(in, in.length);}
+ }
+
+ public static final void reverseComplementBasesInPlace(final byte[] in, final int length){ //Reverse-complements in[0..length-1] in place; null is ignored
+ if(in==null){return;}
+ final int last=length-1;
+ final int max=length/2;
+ for(int i=0; i<max; i++){ //Walk inward from both ends, swapping and complementing each mirrored pair
+ byte a=in[i];
+ byte b=in[last-i];
+// assert(b>0 && b<baseToComplementExtended.length) : ((int)b)+"\t"+((char)b)+"\t"+Arrays.toString(in);
+// System.out.println((char)a+", "+(char)b+", "+i+", "+last);
+ in[i]=baseToComplementExtended[b];
+ in[last-i]=baseToComplementExtended[a];
+ }
+ if((length&1)==1){//Odd length; process middle
+ in[max]=baseToComplementExtended[in[max]];
+ }
+ }
+
+ public static final String reverseComplementBases(String in){ //String convenience wrapper; null in gives null out
+ return in==null ? null : new String(reverseComplementBases(in.getBytes())); //Conversion in both directions uses the platform default charset
+ }
+
+ public static final int reverseComplementBinary(int kmer, int k){
+ int rc=0;
+ int bits=~kmer; //Inverting all bits complements every 2-bit base at once (0<->3, 1<->2)
+ for(int remaining=k; remaining>0; remaining--){
+ rc=(rc<<2)|(bits&3); //The lowest remaining base of the input is appended to the output
+ bits>>=2;
+ }
+ return rc;
+ }
+
+ public static final long reverseComplementBinary(long kmer, int k){
+ long rc=0;
+ long bits=~kmer; //Inverting all bits complements every 2-bit base at once (0<->3, 1<->2)
+ for(int remaining=k; remaining>0; remaining--){
+ rc=(rc<<2)|(bits&3L); //The lowest remaining base of the input is appended to the output
+ bits>>=2;
+ }
+ return rc;
+ }
+
+ public static final int reverseComplementBinaryFast(int kmer, int k){ //Table-driven variant: handles four bases (one byte) per step
+ int out=0;
+ int extra=k&3; //Number of bases left over after grouping into fours
+ for(int i=0; i<extra; i++){ //Leftover bases are complemented one at a time
+ out=((out<<2)|((~kmer)&3));
+ kmer>>=2;
+ }
+ k-=extra;
+ for(int i=0; i<k; i+=4){ //NOTE(review): assumes rcompBinaryTable (declared outside this excerpt) maps a byte of 4 bases to its reverse complement with non-negative entries - confirm
+ out=((out<<8)|(rcompBinaryTable[kmer&0xFF]));
+ kmer>>=8;
+ }
+ return out;
+ }
+
+ public static final long reverseComplementBinaryFast(long kmer, int k){ //Table-driven variant for long kmers: handles four bases (one byte) per step
+ long out=0;
+ int extra=k&3; //Number of bases left over after grouping into fours
+ for(int i=0; i<extra; i++){ //Leftover bases are complemented one at a time
+ out=((out<<2)|((~kmer)&3L));
+ kmer>>=2;
+ }
+ k-=extra;
+ for(int i=0; i<k; i+=4){ //NOTE(review): assumes rcompBinaryTable (declared outside this excerpt) maps a byte of 4 bases to its reverse complement with non-negative entries - confirm
+ out=((out<<8)|(rcompBinaryTable[(int)(kmer&0xFFL)]));
+ kmer>>=8;
+ }
+ return out;
+ }
+
+ public static final byte baseToColor(byte base1, byte base2){
+ final byte n1=baseToNumber[base1], n2=baseToNumber[base2];
+ if(n1<0 && n2<0){return 'N';} //Neither base has a 2-bit code
+ //A single base without a code is treated as code 3
+ final int row=(n1<0 ? 3 : n1);
+ final int col=(n2<0 ? 3 : n2);
+ return COLORS[row][col];
+ }
+
+ public static final byte colorToBase(byte base1, byte color){ //Given a base and a color 0..3, returns the base that follows it
+ if(!isFullyDefined(base1) || color<0 || color>3){ //Undefined base or color outside 0..3 yields 'N'
+// System.err.println("colorToBase("+(char)base1+","+color+") = N");
+ return (byte)'N';
+ }
+ byte a=baseToNumber[base1];
+
+// System.err.println("colorToBase("+(char)base1+","+color+") = "+(char)numberToBase[COLORS[a][color]]);
+
+ return numberToBase[COLORS[a][color]];
+ }
+
+ public static final byte toNumber(String code){ //Codon number of the first three characters of code; shorter strings make charAt throw
+ return toNumber(code.charAt(0), code.charAt(1), code.charAt(2));
+ }
+
+ public static final AminoAcid toAA(String code){ //AminoAcid for the first three characters of code; shorter strings make charAt throw
+ return toAA(code.charAt(0), code.charAt(1), code.charAt(2));
+ }
+
+ public static final char toChar(String code){ //Amino-acid character for the first three characters of code; shorter strings make charAt throw
+ return toChar(code.charAt(0), code.charAt(1), code.charAt(2));
+ }
+
+ public static final char[] splitBase(char c){ //Expands an IUPAC symbol into the plain bases whose bit flags it contains
+ byte b=baseToNumberExtended[c];
+ int len=Integer.bitCount(b); //NOTE(review): for an invalid symbol (b==-1) this is 32, so the result has 28 unset slots; for b==16 one slot is allocated but never filled - confirm intent
+ char[] out=new char[len];
+
+ int index=0;
+ for(int i=0; i<4; i++){ //Bit i set means base numberToBase[i] (A, C, G, T in that order) is included
+ if(((1<<i)&b)!=0){
+ out[index]=(char)numberToBase[i];
+ index++;
+ }
+ }
+ return out;
+ }
+
+
+
+
+ /**
+ * Decodes a value packed at 2 bits per base into n ASCII bases; the lowest two bits become the last base.
+ */
+ public static final byte[] numberToBases(int code, int n){
+ final byte[] decoded=new byte[n];
+ int pos=n;
+ while(--pos>=0){
+ decoded[pos]=numberToBase[code&3];
+ code>>=2;
+ }
+ return decoded;
+ }
+
+ /**
+ * Packs tuple into an int at 2 bits per base, first base in the highest bits.
+ * Returns -1 as soon as an element maps outside 0..3 in baseToNumberACGTN.
+ */
+ public static final int baseTupleToNumber(byte[] tuple){
+ int packed=0;
+ for(final byte base : tuple){
+ final int code=baseToNumberACGTN[base];
+ if(code<0 || code>3){return -1;}
+ packed=(packed<<2)|code;
+ }
+ return packed;
+ }
+
+ public static boolean isFullyDefined(char base){
+ return baseToNumber[base]>=0;
+ }
+
+ public static boolean isFullyDefined(byte base){ //True iff base is non-negative and has a non-negative entry in baseToNumber
+ return base>=0 && baseToNumber[base]>=0; //The sign check keeps negative bytes from indexing the table
+ }
+
+ public static boolean isACGTN(char base){ //True iff base has a non-negative entry in baseToNumberACGTN
+ return base<baseToNumberACGTN.length && baseToNumberACGTN[base]>=0; //Chars past the end of the table are rejected instead of causing ArrayIndexOutOfBoundsException
+ }
+
+ public static boolean isACGTN(byte base){ //True iff base is non-negative and has a non-negative entry in baseToNumberACGTN
+ return base>=0 && baseToNumberACGTN[base]>=0; //The sign check keeps negative bytes from indexing the table
+ }
+
+ public static boolean containsOnlyACGTN(String s){
+ if(s==null || s.length()==0){return true;}
+ for(int i=0; i<s.length(); i++){
+ char c=s.charAt(i);
+ if(baseToNumberACGTN[c]<0){return false;}
+ }
+ return true;
+ }
+
+ public static boolean containsOnlyACGTNQ(String s){
+ if(s==null || s.length()==0){return true;}
+ for(int i=0; i<s.length(); i++){
+ char c=s.charAt(i);
+ if(c!='?' && baseToNumberACGTN[c]<0){return false;}
+ }
+ return true;
+ }
+
+ public static boolean containsOnlyACGTN(byte[] array){
+ if(array==null){return true;} //Null counts as valid; an empty array falls through the loop and is valid too
+ for(final byte base : array){
+ final boolean valid=(base>=0 && baseToNumberACGTN[base]>=0); //Negative bytes are rejected before they can index the table
+ if(!valid){return false;}
+ }
+ return true;
+ }
+
+ public static boolean isFullyDefined(String s){
+ for(int i=0; i<s.length(); i++){
+ if(!isFullyDefined(s.charAt(i))){return false;}
+ }
+ return true;
+ }
+
+ public static boolean isFullyDefined(byte[] s){
+ for(int i=0; i<s.length; i++){
+ if(!isFullyDefined(s[i])){return false;}
+ }
+ return true;
+ }
+
+ public static int countUndefined(byte[] s){
+ int x=0;
+ for(int i=0; i<s.length; i++){
+ if(!isFullyDefined(s[i])){x++;}
+ }
+ return x;
+ }
+
+
+ public static final byte toNumber(char c1, char c2, char c3){ //Packs three bases into a 6-bit codon number, first base in the highest bits
+ assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
+ int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
+ return (byte)x;
+ }
+
+ public static final AminoAcid toAA(char c1, char c2, char c3){ //Looks up codeToAA with the 6-bit codon number of the three bases
+ assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
+ int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
+ return codeToAA[x];
+ }
+
+ public static final char toChar(char c1, char c2, char c3){ //Looks up codeToChar with the 6-bit codon number of the three bases
+ assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
+ int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
+ return codeToChar[x];
+ }
+
+ /**
+  * Translates a codon given as three base bytes into an amino acid letter byte.
+  * Uses the strict baseToNumber table, so any base without a non-negative entry
+  * makes the whole codon translate to '.'.
+  * @param c1 First base of the codon
+  * @param c2 Second base of the codon
+  * @param c3 Third base of the codon
+  * @return The codeToByte entry for the codon, or '.' if any base is unmapped
+  */
+ public static final byte toByte(byte c1, byte c2, byte c3){
+     final int first=baseToNumber[c1];
+     final int second=baseToNumber[c2];
+     final int third=baseToNumber[c3];
+     final boolean unmapped=(first<0 || second<0 || third<0);
+     return unmapped ? (byte)'.' : codeToByte[(first<<4)|(second<<2)|third];
+ }
+
+ /**
+  * Translates a codon given as three base bytes into an amino acid letter.
+  * Bases are mapped through baseToNumberACGTN2; if any mapped value exceeds 3
+  * the codon cannot be packed into six bits and '?' is returned.
+  * @param c1 First base of the codon
+  * @param c2 Second base of the codon
+  * @param c3 Third base of the codon
+  * @return The codeToChar entry for the codon, or '?' if any base maps above 3
+  */
+ public static final char toChar(byte c1, byte c2, byte c3){
+     assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
+     final byte n1=baseToNumberACGTN2[c1];
+     final byte n2=baseToNumberACGTN2[c2];
+     final byte n3=baseToNumberACGTN2[c3];
+     final boolean unpackable=(n1>3 || n2>3 || n3>3);
+     if(unpackable){return '?';}
+//     return (x<codeToChar.length ? codeToChar[x] : '?');
+     return codeToChar[(n1<<4)|(n2<<2)|(n3)];
+ }
+
+ /**
+  * Translates a base string into amino acid letters, reading consecutive
+  * non-overlapping codons from position 0. Trailing bases that do not fill
+  * a whole codon are ignored.
+  * @param bases Nucleotide string; must not be null
+  * @return One letter per complete codon
+  */
+ public static final String stringToAAs(String bases){
+     final int len=bases.length();
+     final StringBuilder sb=new StringBuilder(len/3);
+     for(int start=0; start+2<len; start+=3){
+         final char letter=toAA(bases.charAt(start), bases.charAt(start+1), bases.charAt(start+2)).letter;
+         sb.append(letter);
+     }
+     return sb.toString();
+ }
+
+ /**
+  * Translates bases in all six reading frames: slots 0-2 hold frames 0-2 of the
+  * input, slots 3-5 hold frames 0-2 of its reverse complement.
+  * @param bases Nucleotide sequence; may be null
+  * @return A 6-element array; all slots are null if bases is null or shorter than 3
+  */
+ public static final byte[][] toAAsSixFrames(byte[] bases){
+     final byte[][] frames=new byte[6][];
+     if(bases==null || bases.length<3){return frames;}
+
+     //Forward strand first, then the reverse complement (same call order as before)
+     for(int frame=0; frame<3; frame++){
+         frames[frame]=toAAs(bases, frame);
+     }
+     final byte[] rcomp=reverseComplementBases(bases);
+     for(int frame=0; frame<3; frame++){
+         frames[frame+3]=toAAs(rcomp, frame);
+     }
+     return frames;
+ }
+
+ /**
+  * Produces per-codon quality arrays for all six reading frames: slots 0-2 use the
+  * input order, slots 3-5 use the reversed order. The input array is reversed in
+  * place for the second half and reversed back before returning.
+  * @param quals Per-base qualities; may be null
+  * @param offset Value added to every produced quality (skipped when 0)
+  * @return A 6-element array; all slots are null if quals is null or shorter than 3
+  */
+ public static final byte[][] toQualitySixFrames(byte[] quals, int offset){
+     final byte[][] frames=new byte[6][];
+     if(quals!=null && quals.length>2){
+         for(int frame=0; frame<3; frame++){
+             frames[frame]=toAAQuality(quals, frame);
+         }
+         Tools.reverseInPlace(quals); //Temporarily reverse to read the opposite strand
+         for(int frame=0; frame<3; frame++){
+             frames[frame+3]=toAAQuality(quals, frame);
+         }
+         Tools.reverseInPlace(quals); //Restore the caller's array
+     }
+
+     if(offset==0){return frames;}
+
+     for(final byte[] frame : frames){
+         if(frame==null){continue;}
+         for(int i=0; i<frame.length; i++){
+             frame[i]+=offset;
+         }
+     }
+     return frames;
+ }
+
+ /**
+  * Translates bases into amino acid letter bytes starting at the given frame offset.
+  * Trailing bases that do not fill a whole codon are ignored.
+  * @param bases Nucleotide sequence; may be null
+  * @param frame Number of leading bases to skip; expected to be 0, 1 or 2
+  * @return One byte per complete codon, or null if bases is null or fewer than
+  * three bases remain after the frame offset
+  */
+ public static final byte[] toAAs(byte[] bases, int frame){
+     assert(frame>=0 && frame<3);
+     if(bases==null){return null;}
+     final int usable=bases.length-frame;
+     if(usable<3){return null;}
+
+     final int codons=usable/3; //Integer division drops the incomplete trailing codon
+     final byte[] out=new byte[codons];
+     for(int j=0, pos=frame; j<codons; j++, pos+=3){
+         out[j]=toByte(bases[pos], bases[pos+1], bases[pos+2]);
+     }
+     return out;
+ }
+
+ /**
+  * Collapses per-base qualities into one quality per codon for the given frame.
+  * For each codon the three QualityTools.PROB_CORRECT values are multiplied and the
+  * product is converted back with QualityTools.probCorrectToPhred.
+  * Trailing qualities that do not fill a whole codon are ignored.
+  * A null input now yields null, matching the behaviour of toAAs, instead of
+  * throwing a NullPointerException.
+  * @param quals Per-base qualities; may be null
+  * @param frame Number of leading positions to skip; expected to be 0, 1 or 2
+  * @return One quality per complete codon, or null if quals is null or fewer than
+  * three positions remain after the frame offset
+  */
+ public static final byte[] toAAQuality(byte[] quals, int frame){
+     assert(frame>=0 && frame<3);
+     if(quals==null){return null;}
+     int blen=quals.length-frame;
+     if(blen<3){return null;}
+     blen=blen-(blen%3); //Round down to a whole number of codons
+     final int stop=frame+blen;
+     final int alen=blen/3;
+
+     final byte[] out=new byte[alen];
+     for(int i=2+frame, j=0; i<stop; i+=3, j++){
+         final byte qa=quals[i-2], qb=quals[i-1], qc=quals[i];
+         final float pa=QualityTools.PROB_CORRECT[qa], pb=QualityTools.PROB_CORRECT[qb], pc=QualityTools.PROB_CORRECT[qc];
+         final float p=pa*pb*pc; //Product of the three per-base values
+         out[j]=QualityTools.probCorrectToPhred(p);
+     }
+     return out;
+ }
+
+ /**
+  * Back-translates amino acid letters into bases, emitting for each amino acid the
+  * three bases encoded (two bits each) in its aminoToCode entry.
+  * NOTE(review): only the low six bits of the code are used. The static initializer
+  * assigns code 65 to X/x and -1 to unmapped letters, so X decodes to
+  * numberToBase indices 0,0,1 and unmapped letters to indices 3,3,3 - confirm that
+  * this is the intended output for those inputs.
+  * @param aminos Amino acid letters; may be null
+  * @return Array of length 3*aminos.length, or null if aminos is null
+  */
+ public static final byte[] toNTs(final byte[] aminos){
+     if(aminos==null){return null;}
+
+     final byte[] out=new byte[aminos.length*3];
+     int j=0;
+     for(final byte amino : aminos){
+         final int code=aminoToCode[amino];
+         out[j++]=numberToBase[((code>>4)&3)];
+         out[j++]=numberToBase[((code>>2)&3)];
+         out[j++]=numberToBase[(code&3)];
+     }
+     return out;
+ }
+
+ /** Precomputed reverseComplementBinary results for every 2-bit-encoded sequence of length 4. */
+ public static final short[] rcompBinaryTable=makeBinaryRcompTable(4);
+
+ /**
+  * Builds a table mapping every 2k-bit value to reverseComplementBinary(value, k).
+  * @param k Sequence length in bases (two bits per base)
+  * @return Table with 2^(2k) entries, each narrowed to a short
+  */
+ private static final short[] makeBinaryRcompTable(int k){
+     final int entries=1<<(2*k);
+     final short[] table=new short[entries];
+     for(int kmer=0; kmer<entries; kmer++){
+         table[kmer]=(short)reverseComplementBinary(kmer, k);
+     }
+     return table;
+ }
+
+ static { /*
+  * One-time population of the class's lookup tables: base-to-number and
+  * base-to-complement tables, the alphabetical amino acid list, the
+  * codon-to-amino-acid tables, and the name/symbol/codon map stringToAA.
+  * Statement order matters: later tables are derived from earlier ones.
+  */
+
+ for(int i=0; i<uToT.length; i++){uToT[i]=(byte)i;} // Identity mapping by default
+ uToT['u']='t'; // Only U/u are rewritten, preserving case
+ uToT['U']='T';
+
+ Arrays.fill(baseToACGTN, (byte)'N'); // Anything not set below maps to 'N'
+
+ Arrays.fill(baseToNumberExtended, (byte)-1); // -1 marks symbols with no extended code
+ for(int i=0; i<numberToBaseExtended.length; i++){
+ char x=(char)numberToBaseExtended[i];
+ if(!Character.isWhitespace(x)){
+ baseToNumberExtended[x]=(byte)i; // Symbol as listed in the table
+ baseToNumberExtended[Character.toLowerCase(x)]=(byte)i; // and its lower-case form
+ }
+ }
+ baseToNumberExtended['U']=8; // U/u get extended code 8
+ baseToNumberExtended['u']=8;
+
+ Arrays.fill(baseToNumberACGTN, (byte)-1); // -1 marks unrecognized symbols
+ Arrays.fill(baseToNumberACGTother, (byte)4); // Unrecognized symbols collapse to 4 in this table
+ for(int i=0; i<numberToBase.length; i++){
+ char x=(char)numberToBase[i];
+ if(!Character.isWhitespace(x)){
+ baseToNumberACGTN[x]=baseToNumberACGTother[x]=(byte)i;
+ baseToNumberACGTN[Character.toLowerCase(x)]=baseToNumberACGTother[Character.toLowerCase(x)]=(byte)i;
+ baseToACGTN[x]=baseToACGTN[Character.toLowerCase(x)]=(byte)x; // Both cases map to the symbol as listed in numberToBase
+ }
+ }
+ baseToNumberACGTN['U']=baseToNumberACGTN['u']=3; // U/u share number 3
+ baseToNumberACGTother['U']=baseToNumberACGTother['u']=3;
+ baseToACGTN['U']=baseToACGTN['u']=(byte)'T'; // and are rewritten to 'T'
+
+ for(int i=0; i<baseToNumberACGTN.length; i++){baseToNumberACGTN2[i]=baseToNumberACGTN[i];} // Start as a copy...
+ baseToNumberACGTN2['N']=0; // ...but N/n are forced to 0 in this variant
+ baseToNumberACGTN2['n']=0;
+
+ Arrays.fill(baseToNumber, (byte)-1); // Strict table: only A, C, G, T (and U/u below) get a number
+ for(int i=0; i<numberToBase.length; i++){
+ char x=(char)numberToBase[i];
+ if(x=='A' || x=='C' || x=='G' || x=='T'){
+ baseToNumber[x]=(byte)i;
+ baseToNumber[Character.toLowerCase(x)]=(byte)i;
+ }
+ }
+ baseToNumber['U']=3;
+ baseToNumber['u']=3;
+
+ Arrays.fill(baseToComplementNumber, (byte)-1); // -1 for anything other than A, C, G, T, U
+ baseToComplementNumber['A']=baseToComplementNumber['a']=3;
+ baseToComplementNumber['C']=baseToComplementNumber['c']=2;
+ baseToComplementNumber['G']=baseToComplementNumber['g']=1;
+ baseToComplementNumber['T']=baseToComplementNumber['t']=0;
+ baseToComplementNumber['U']=baseToComplementNumber['u']=0;
+
+ Arrays.fill(baseToComplementExtended, (byte)-1);
+ for(int i=0; i<numberToBaseExtended.length; i++){
+ char x=(char)numberToBaseExtended[i];
+ char x2=(char)numberToComplementaryBaseExtended[i]; // Complement listed at the same index
+ baseToComplementExtended[x]=(byte)x2;
+ baseToComplementExtended[Character.toLowerCase(x)]=(byte)Character.toLowerCase(x2); // Lower-case input yields lower-case complement
+ }
+ baseToComplementExtended['U']=(byte)'A';
+ baseToComplementExtended['u']=(byte)'a';
+ baseToComplementExtended['?']=(byte)'?'; // The following symbols map to themselves
+ baseToComplementExtended[' ']=(byte)' ';
+ baseToComplementExtended['-']=(byte)'-';
+ baseToComplementExtended['*']=(byte)'*';
+ baseToComplementExtended['.']=(byte)'.';
+
+
+ AlphabeticalAAs[0]=Alanine; // Amino acids in alphabetical order, followed by END
+ AlphabeticalAAs[1]=Arginine;
+ AlphabeticalAAs[2]=Asparagine;
+ AlphabeticalAAs[3]=AsparticAcid;
+ AlphabeticalAAs[4]=Cysteine;
+ AlphabeticalAAs[5]=GlutamicAcid;
+ AlphabeticalAAs[6]=Glutamine;
+ AlphabeticalAAs[7]=Glycine;
+ AlphabeticalAAs[8]=Histidine;
+ AlphabeticalAAs[9]=Isoleucine;
+ AlphabeticalAAs[10]=Leucine;
+ AlphabeticalAAs[11]=Lysine;
+ AlphabeticalAAs[12]=Methionine;
+ AlphabeticalAAs[13]=Phenylalanine;
+ AlphabeticalAAs[14]=Proline;
+ AlphabeticalAAs[15]=Serine;
+ AlphabeticalAAs[16]=Threonine;
+ AlphabeticalAAs[17]=Tryptophan;
+ AlphabeticalAAs[18]=Tyrosine;
+ AlphabeticalAAs[19]=Valine;
+ AlphabeticalAAs[20]=END;
+// AlphabeticalAAs[21]=ANY;
+
+ Arrays.fill(aminoToCode, (byte)-1); // -1 marks letters with no codon assigned
+ for(int i=0; i<AlphabeticalAAs.length; i++){
+ AminoAcid aa=AlphabeticalAAs[i];
+
+ stringToAA.put(aa.name, aa); // Register by name, symbol and single letter
+ stringToAA.put(aa.symbol, aa);
+ stringToAA.put(aa.letter+"", aa);
+ for(int j=0; j<aa.codeStrings.length; j++){
+ String s=aa.codeStrings[j];
+ stringToAA.put(s, aa); // Codon as originally listed
+ aa.codeStrings[j]=s.replace('U', 'T'); // Stored codon is rewritten with T in place of U
+ stringToAA.put(aa.codeStrings[j], aa); // and registered in that form as well
+
+ int x=toNumber(s); // Numeric code of the codon as originally listed
+// System.out.println("x="+x+", aa="+aa);
+ codeToAA[x]=aa;
+ codeToChar[x]=aa.letter;
+ codeToByte[x]=(byte)(aa.letter);
+ if(j==0){ // The first listed codon is the one toNTs will emit for this amino acid
+ aminoToCode[aa.letter]=(byte)x;
+ aminoToCode[Character.toLowerCase(aa.letter)]=(byte)x;
+ }
+ }
+ }
+ aminoToCode['X']=aminoToCode['x']=65; // X (ANY) gets the dedicated code 65
+ codeToAA[65]=ANY;
+ codeToChar[65]='X';
+ codeToByte[65]='X';
+
+ stringToAA.put("X", ANY); // Extra aliases
+ stringToAA.put("Start", Methionine);
+ stringToAA.put("Begin", Methionine);
+ stringToAA.put("Stop", END);
+ stringToAA.put("Aspartic Acid", AsparticAcid);
+ stringToAA.put("Glutamic Acid", GlutamicAcid);
+
+ String[] temp=stringToAA.keySet().toArray(new String[0]); // Snapshot the keys because the loop below inserts into the map
+
+ for(String s : temp){
+ AminoAcid aa=stringToAA.get(s);
+ assert(aa!=null);
+ stringToAA.put(s.toLowerCase(), aa); // Also register every key in lower case
+ }
+
+ }
+
+}
diff --git a/current/dna/ChromArrayMaker.java b/current/dna/ChromArrayMaker.java
new file mode 100755
index 0000000..266a7de
--- /dev/null
+++ b/current/dna/ChromArrayMaker.java
@@ -0,0 +1,573 @@
+package dna;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+
+import stream.CrisWrapper;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.Shared;
+import align2.Tools;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * Replaces FastaToChromArrays with a more general solution that can handle fastq.
+ * @author Brian Bushnell
+ * @date Jul 18, 2014
+ *
+ */
+public class ChromArrayMaker {
+
+// Example:
+// dna.ChromArrayMaker ecoli_K12.fa 1 writeinthread=false genscaffoldinfo=true retain waitforwriting=false
+// gzip=true chromc=false maxlen=536670912 writechroms=true minscaf=1 midpad=300 startpad=8000 stoppad=8000 nodisk=false
+
+ public static void main(String[] args){ // Command-line entry point; delegates to main2 and discards the returned list
+ main2(args);
+ }
+
+ /**
+  * Parses key=value arguments and then either writes genome info files (writeinfo=true)
+  * or converts the input sequence file into chromosome arrays via makeChroms.
+  * Legacy positional arguments are still honoured when the matching key=value argument
+  * is absent: args[0]=input file, args[1]=genome build, args[2]=number of chroms.
+  * WRITE_IN_THREAD and the FASTQ interleaving flags are overridden for the duration of
+  * the call and restored in finally blocks, so they are reset even if an exception is thrown.
+  * @param args Command-line arguments
+  * @return The list that received the generated arrays when RETAIN is true
+  * (it stays empty in writeinfo mode), otherwise null
+  * @throws RuntimeException if the build number, the chrom count, or a valid input file is missing
+  */
+ public static ArrayList<ChromosomeArray> main2(String[] args){
+     System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+     final boolean oldWIT=WRITE_IN_THREAD;
+     WRITE_IN_THREAD=true;
+
+     try{
+//         assert(false) : ReadWrite.ZIPLEVEL;
+
+         String name=null;
+
+         int genome=-1;
+         int chroms=-1;
+         String infile=null;
+         boolean writeinfo=false;
+         boolean genScaffoldInfo=true;
+         boolean writeChroms=true;
+         boolean scafprefixes=Data.scaffoldPrefixes;
+
+         for(int i=0; i<args.length; i++){
+             final String arg=args[i];
+             final String[] split=arg.split("=");
+             String a=split[0].toLowerCase();
+             String b=split.length>1 ? split[1] : null;
+             if("null".equalsIgnoreCase(b)){b=null;}
+
+             if(Parser.isJavaFlag(arg)){
+                 //jvm argument; do nothing
+             }else if(Parser.parseZip(arg, a, b)){
+                 //do nothing
+             }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){
+                 Data.setPath(b);
+             }else if(a.equals("name") || a.equals("organism")){
+                 name=b;
+             }else if(a.equals("in") || a.equals("input") || a.equals("ref") || a.equals("fasta")){
+                 infile=b; //b was already normalized to null for a literal "null"
+             }else if(a.equals("build") || a.equals("genome")){
+                 genome=Integer.parseInt(b);
+             }else if(a.equals("chroms")){
+                 chroms=Integer.parseInt(b);
+             }else if(a.equals("writeinthread")){
+                 WRITE_IN_THREAD=Tools.parseBoolean(b);
+             }else if(a.equals("nodisk")){
+                 NODISK=Tools.parseBoolean(b);
+             }else if(a.equals("writeinfo")){
+                 writeinfo=Tools.parseBoolean(b);
+             }else if(a.equals("padstart") || a.startsWith("startpad") || a.equals("padfront") || a.startsWith("frontpad")){
+                 START_PADDING=Integer.parseInt(b);
+             }else if(a.equals("padstop") || a.startsWith("stoppad") || a.equals("padend") || a.startsWith("endpad")){
+                 END_PADDING=Integer.parseInt(b);
+             }else if(a.equals("pad") || a.equals("padding")){
+                 START_PADDING=END_PADDING=Integer.parseInt(b);
+             }else if(a.equals("midpad") || a.startsWith("padmid")){
+                 MID_PADDING=Integer.parseInt(b);
+             }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){
+                 MIN_SCAFFOLD=Integer.parseInt(b);
+             }else if(a.equals("genscaffoldinfo")){
+                 genScaffoldInfo=Tools.parseBoolean(b);
+                 System.err.println("Set genScaffoldInfo="+genScaffoldInfo);
+             }else if(a.equals("append") || a.equals("app")){
+                 append=Tools.parseBoolean(b);
+             }else if(a.equals("overwrite") || a.equals("ow")){
+                 overwrite=Tools.parseBoolean(b);
+             }else if(a.equals("mergescaffolds") || a.equals("mergecontigs") || (a.equals("merge"))){
+                 MERGE_SCAFFOLDS=Tools.parseBoolean(b);
+                 System.err.println("Set MERGE_SCAFFOLDS="+MERGE_SCAFFOLDS);
+             }else if(a.startsWith("maxlen") || a.startsWith("chromlen")){
+                 long len=Tools.parseKMG(b);
+                 assert(len>0 && len<=Integer.MAX_VALUE);
+                 MAX_LENGTH=(int)len;
+             }else if(a.equals("writechroms")){
+                 writeChroms=Tools.parseBoolean(b);
+             }else if(a.equals("chromgz") || a.equals("gz")){
+                 Data.CHROMGZ=Tools.parseBoolean(b);
+             }else if(a.equals("retain")){
+                 RETAIN=Tools.parseBoolean(b);
+             }else if(a.equals("scafprefixes")){
+                 scafprefixes=Tools.parseBoolean(b);
+             }else if(a.equals("waitforwriting")){
+                 WAIT_FOR_WRITING=Tools.parseBoolean(b);
+             }else{
+                 //The first three positions may hold legacy positional arguments, so only warn beyond them
+                 if(i>2){
+                     System.err.println("Unknown parameter "+args[i]);
+//                     throw new RuntimeException("Unknown parameter "+args[i]);
+                 }
+             }
+         }
+
+         WAIT_FOR_WRITING=(WAIT_FOR_WRITING || ReadWrite.USE_GZIP || ReadWrite.USE_PIGZ);
+
+         ArrayList<ChromosomeArray> r=RETAIN ? new ArrayList<ChromosomeArray>() : null;
+
+//         assert(false) : Arrays.toString(args);
+//         assert(RETAIN);
+
+         //Legacy positional fallbacks are only consulted when the position exists,
+         //so a missing value produces the descriptive message below rather than an index error.
+         if(genome<0 && args.length>1){genome=Integer.parseInt(args[1]);} //Legacy
+         if(genome<0){throw new RuntimeException("Please specify a genome build number.");}
+
+         if(writeinfo){
+             if(chroms<0 && args.length>2){chroms=Integer.parseInt(args[2]);} //Legacy
+             if(chroms<0){throw new RuntimeException("Please specify the number of chroms.");}
+             writeInfo(genome, chroms, name, null, false, scafprefixes);
+         }else{
+             if(infile==null && args.length>0){infile=args[0].replace('\\', '/');} //Legacy
+             if(infile==null){throw new RuntimeException("Please specify an input file.");}
+             {
+                 File f=new File(infile);
+                 if(!f.exists() || f.isDirectory()){
+                     if(!infile.startsWith("stdin")){
+                         throw new RuntimeException("Not a valid file: "+f);
+                     }
+                 }
+             }
+             String outRoot=Data.ROOT_GENOME+genome+"/";
+
+             ChromArrayMaker ftca=new ChromArrayMaker();
+
+             //Interleaving detection is disabled while the reference is read
+             final boolean oldTI=FASTQ.TEST_INTERLEAVED;
+             final boolean oldFI=FASTQ.FORCE_INTERLEAVED;
+             FASTQ.TEST_INTERLEAVED=false;
+             FASTQ.FORCE_INTERLEAVED=false;
+             try{
+                 ftca.makeChroms(infile, outRoot, name, genScaffoldInfo, writeChroms, r, scafprefixes);
+             }finally{
+                 FASTQ.TEST_INTERLEAVED=oldTI;
+                 FASTQ.FORCE_INTERLEAVED=oldFI;
+             }
+         }
+
+         return r;
+     }finally{
+         WRITE_IN_THREAD=oldWIT;
+     }
+ }
+
+ private ChromArrayMaker(){} // Private constructor; main2 creates the instance it needs
+
+
+ private static int[] countInfo(ChromosomeArray ca){
+ int contigs=0;
+ int startPad=0;
+ int stopPad=0;
+ int undefined=0;
+ int defined=0;//=ca.countDefinedBases();
+
+ int lastN=-1;
+ int lastDef=-1;
+
+ for(int i=0; i<=ca.maxIndex; i++){
+ byte b=ca.get(i);
+ if(AminoAcid.isFullyDefined(b)){
+ if(defined==0){startPad=i; contigs++;}
+ else if(i-lastDef>contigTrigger){contigs++;}
+ lastDef=i;
+ defined++;
+ }else{
+ lastN=i;
+ undefined++;
+ }
+ }
+
+ if(contigs>0 && lastN==ca.maxIndex){
+ stopPad=lastN-lastDef;
+ }else{
+// System.err.println(lastN+", "+lastDef+", "+ca.maxIndex);
+ }
+
+ return new int[] {ca.chromosome, 1, contigs, (ca.maxIndex+1), defined, undefined, startPad, stopPad};
+ }
+
+ @Deprecated
+ public static void writeInfo(int genome, int chroms, String name, String source, boolean unload, boolean scafNamePrefix){
+ Data.GENOME_BUILD=genome;
+ Data.chromosomePlusMatrix=new ChromosomeArray[chroms+1];
+
+ String outRoot=Data.ROOT_GENOME+genome+"/";
+ TextStreamWriter info=new TextStreamWriter(outRoot+"info.txt", true, false, false);
+ info.start();
+ info.print("#Chromosome sizes\n");
+ try {
+ info.print("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ info.print("#Version\t"+VERSION+"\n");
+ info.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n");
+
+
+ long bases=0;
+ long definedBases=0;
+
+ long contigSum=0;
+
+ for(int chrom=1; chrom<=chroms; chrom++){
+ ChromosomeArray ca=Data.getChromosome(chrom);
+ int[] v=countInfo(ca);
+ info.print(v[0]+"\t"+v[1]+"\t"+v[2]+"\t"+v[3]+"\t"+v[4]+"\t"+v[5]+"\t"+v[6]+"\t"+v[7]+"\n");
+
+ bases+=v[3];
+ definedBases+=v[4];
+ contigSum+=v[2];
+ if(unload){Data.unload(chrom, false);}
+ }
+ info.poison();
+ StringBuilder sb=new StringBuilder();
+ sb.append("#Summary\n");
+ try {
+ sb.append("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ sb.append("#Version\t"+VERSION+"\n");
+ sb.append("chroms\t"+(chroms)+"\n");
+ sb.append("bases\t"+bases+"\n");
+ sb.append("defined\t"+definedBases+"\n");
+ sb.append("undefined\t"+(bases-definedBases)+"\n");
+ sb.append("contigs\t"+contigSum+"\n");
+ sb.append("scaffolds\t"+chroms+"\n");
+ sb.append("interpad\t"+MID_PADDING+"\n");
+ if(name!=null){sb.append("name\t"+name+"\n");}
+ if(source!=null){sb.append("source\t"+source+"\n");}
+ if(scafNamePrefix){sb.append("scafprefixes\t"+scafNamePrefix+"\n");}//else{assert(false);}
+ ReadWrite.writeString(sb, outRoot+"summary.txt", false);
+ info.waitForFinish();
+ }
+
+ private int makeChroms(String fname, String outRoot, String genomeName, boolean genScaffolds, boolean writeChroms, ArrayList<ChromosomeArray> r,
+ boolean scafNamePrefix){ /*
+  * Reads the sequences in fname and packs them into numbered ChromosomeArrays
+  * (one per makeNextChrom call), optionally writing each to outRoot, and emits the
+  * accompanying info, scaffold and summary records either to files or, when NODISK
+  * is set, to the static INFO_LIST / SCAF_LIST / SUMMARY_LIST.
+  *
+  * fname          input sequence file
+  * outRoot        output directory for chrom files, info.txt, scaffolds.txt.gz, summary.txt
+  * genomeName     optional name recorded in the summary; may be null
+  * genScaffolds   whether scaffold records are produced
+  * writeChroms    whether each ChromosomeArray is written to disk
+  * r              receives every ChromosomeArray when RETAIN is true (must then be non-null)
+  * scafNamePrefix if true, a scafprefixes line is added to the summary
+  * Returns the number of chroms produced.
+  *
+  * NOTE(review): FastaReadInputStream.SPLIT_READS is not restored if an exception
+  * is thrown while chroms are being built.
+  */
+
+ if(!NODISK){
+ File f=new File(outRoot);
+ if(!f.exists()){
+ if(!NODISK){f.mkdirs();}
+ }else if(overwrite){
+ for(File g : f.listFiles()){ // NOTE(review): listFiles() may return null on an I/O error - confirm whether that needs handling
+ String s=g.getName();
+ if(g.isFile() && s.contains(".chrom")){ // Remove chrom files left by an earlier run
+ System.err.println("Deleting "+s);
+ g.delete();
+ }
+ }
+ }
+
+ f=new File(outRoot.replace("ref/genome/", "ref/index/")); // Matching index directory, derived from the genome path
+ if(!f.exists()){
+ if(!NODISK){f.mkdirs();}
+ }else if(overwrite){
+ for(File g : f.listFiles()){
+ String s=g.getName();
+ if(g.isFile() && (s.endsWith(".int2d") || s.endsWith(".block") || s.endsWith(".block2.gz") || s.endsWith(".blockB") || s.endsWith(".blockB2.gz"))){ // Index files with these suffixes are deleted along with the chroms
+ System.err.println("Deleting "+s);
+ g.delete();
+ }
+ }
+ }
+ }
+
+ final FileFormat ffin=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true);
+
+ final boolean OLD_SPLIT_READS=FastaReadInputStream.SPLIT_READS;
+ FastaReadInputStream.SPLIT_READS=false; // makeNextChrom asserts that read splitting is off
+ final int oldNum=Shared.numBuffers();
+ Shared.setBuffers(4); // Buffer count is changed only while the input stream is constructed
+ final CrisWrapper criswrapper=new CrisWrapper(-1, false, ffin, null, null, null);
+ Shared.setBuffers(oldNum);
+
+
+ int chrom=1; // Number of the next chrom to build; chrom-1 is the count built so far
+
+ TextStreamWriter infoWriter=null, scafWriter=null; // Used when writing to disk
+ ArrayList<String> infolist=null, scaflist=null; // Used instead of the writers when NODISK is set
+
+ if(NODISK){
+ infolist=new ArrayList<String>();
+ infolist.add("#Chromosome sizes");
+ try {
+ infolist.add("#Generated on\t"+new Date());
+ } catch (Exception e1) {
+ e1.printStackTrace();
+ }
+ infolist.add("#Version\t"+VERSION);
+ infolist.add("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad");
+ }else{
+ infoWriter=new TextStreamWriter(outRoot+"info.txt", true, false, false);
+ infoWriter.start();
+ infoWriter.print("#Chromosome sizes\n");
+ try {
+ // System.err.println(new Date());
+ infoWriter.print("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e1) {
+ // TODO Auto-generated catch block
+ e1.printStackTrace();
+ }
+ infoWriter.print("#Version\t"+VERSION+"\n");
+ infoWriter.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n");
+ }
+
+ if(genScaffolds){
+ if(NODISK){
+ scaflist=new ArrayList<String>();
+ scaflist.add("#Scaffold names");
+ try {
+ scaflist.add("#Generated on\t"+new Date());
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ scaflist.add("#Version\t"+VERSION);
+ scaflist.add("#chrom\tid\tstart\tlength\tname");
+ }else{
+ //System.err.println("*123 Making ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ scafWriter=new TextStreamWriter(outRoot+"scaffolds.txt.gz", true, false, false);
+ scafWriter.start();
+ scafWriter.print("#Scaffold names\n");
+ try {
+ scafWriter.print("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ scafWriter.print("#Version\t"+VERSION+"\n");
+ scafWriter.print("#chrom\tid\tstart\tlength\tname\n");
+ }
+ }
+
+
+ for(ChromosomeArray ca=makeNextChrom(criswrapper, chrom, infoWriter, scafWriter, infolist, scaflist); ca!=null;
+ ca=makeNextChrom(criswrapper, chrom, infoWriter, scafWriter, infolist, scaflist)){ // makeNextChrom returns null once no scaffolds remain
+ if(ca.array.length>ca.maxIndex+1){ca.resize(ca.maxIndex+1);} // Trim unused capacity
+ if(RETAIN){r.add(ca);}
+
+ if(writeChroms){
+ String x=outRoot+"chr"+chrom+Data.chromExtension();
+ if(new File(x).exists() && !overwrite){throw new RuntimeException("Tried to overwrite existing file "+x+", but overwrite=false.");}
+ ReadWrite.writeObjectInThread(ca, x, false);
+ System.err.println("Writing chunk "+chrom);
+ }
+ chrom++;
+ }
+
+ FastaReadInputStream.SPLIT_READS=OLD_SPLIT_READS;
+
+ if(infoWriter!=null){infoWriter.poison();} // Signal end of input; completion is awaited further below
+ if(scafWriter!=null){
+ //System.err.println("*123 Killing ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ scafWriter.poison();
+ }
+
+ StringBuilder sb=new StringBuilder(); // Summary text, built from the totals accumulated by makeNextChrom
+ sb.append("#Summary\n");
+ try {
+ sb.append("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e1) {
+ // TODO Auto-generated catch block
+ e1.printStackTrace();
+ }
+ sb.append("#Version\t"+VERSION+"\n");
+ sb.append("chroms\t"+(chrom-1)+"\n");
+ sb.append("bases\t"+lengthSum+"\n");
+ assert((definedSum+undefinedSum)==lengthSum) : definedSum+", "+undefinedSum+", "+lengthSum;
+ sb.append("defined\t"+definedSum+"\n");
+ sb.append("undefined\t"+undefinedSum+"\n");
+ sb.append("contigs\t"+contigSum+"\n");
+ sb.append("scaffolds\t"+scaffoldSum+"\n");
+ sb.append("interpad\t"+MID_PADDING+"\n");
+ if(genomeName!=null){sb.append("name\t"+genomeName+"\n");}
+ if(fname!=null){
+ File f=new File(fname);
+ String cpath=null;
+ try {
+ cpath=f.getCanonicalPath();
+ } catch (IOException e) {
+ cpath=f.getAbsolutePath(); // Fall back to the absolute path if canonicalization fails
+ }
+ sb.append("source\t"+cpath+"\n");
+ sb.append("bytes\t"+f.length()+"\n");
+ sb.append("last modified\t"+f.lastModified()+"\n");
+ }
+ if(scafNamePrefix){sb.append("scafprefixes\t"+scafNamePrefix+"\n");}//else{assert(false);}
+ if(NODISK){
+ SUMMARY_LIST=new ArrayList<String>(); // In-memory mode: publish the summary line by line
+ String[] split=sb.toString().split("\n");
+ for(String s : split){SUMMARY_LIST.add(s);}
+ }else{
+ ReadWrite.writeString(sb, outRoot+"summary.txt", false);
+ }
+
+ if(infoWriter!=null){infoWriter.waitForFinish();}
+ if(scafWriter!=null){
+ //System.err.println("*123 Waiting For ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ scafWriter.waitForFinish();
+ //System.err.println("*123 ScafWriter Finished; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ }
+
+ if(WAIT_FOR_WRITING && ReadWrite.countActiveThreads()>0){
+ System.err.println("Waiting for writing to finish.");
+ ReadWrite.waitForWritingToFinish();
+ System.err.println("Finished.");
+ //System.err.println("*123 countActiveThreads Finished; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ }
+
+ if(infolist!=null){ // Publish (or clear) the in-memory lists and record which build they belong to
+ INFO_LIST=infolist;
+ LISTBUILD=Data.GENOME_BUILD;
+ }else{INFO_LIST=null;}
+ if(scaflist!=null){
+ SCAF_LIST=scaflist;
+ LISTBUILD=Data.GENOME_BUILD;
+ }else{SCAF_LIST=null;}
+
+ return chrom-1;
+ }
+
+ private ChromosomeArray makeNextChrom(CrisWrapper criswrapper, int chrom, TextStreamWriter infoWriter, TextStreamWriter scafWriter, ArrayList<String> infolist, ArrayList<String> scaflist){ /*
+  * Builds the next ChromosomeArray: START_PADDING Ns, then one or more scaffolds
+  * (separated by MID_PADDING Ns when merging), then trailing Ns. A scaffold that was
+  * read but did not fit in the previous chrom is kept in the field currentScaffold
+  * and placed first. Emits one scaffold record per placed scaffold and one info
+  * record per chrom to whichever of the writers/lists are non-null, and adds this
+  * chrom's counts to the running totals.
+  * Returns the new chrom, or null if no scaffold could be placed (input exhausted).
+  */
+ assert(FastaReadInputStream.SPLIT_READS==false);
+ ChromosomeArray ca=new ChromosomeArray(chrom, (byte)Gene.PLUS, 0, 120000+START_PADDING); // presumably the last argument is an initial capacity - TODO confirm
+ ca.maxIndex=-1;
+ for(int i=0; i<START_PADDING; i++){ca.set(i, 'N');} // Leading padding
+
+ if(verbose){System.err.println("chrom="+chrom+", currentScaffold="+(currentScaffold==null ? "null" : currentScaffold.id));}
+
+ int scaffolds=0; // Scaffolds placed in this chrom
+ if(currentScaffold!=null && currentScaffold.length()>0){ // Scaffold carried over from the previous call
+ assert(currentScaffold.length()>0);
+ assert(currentScaffold.length()+END_PADDING+ca.maxIndex<MAX_LENGTH);
+
+ if(verbose){System.err.println("A: Writing a scaffold because currentScaffold = "+currentScaffold);}
+ scaffoldSum++; // NOTE(review): unlike the loop below, this path does not check MIN_SCAFFOLD
+ if(scafWriter!=null){scafWriter.print(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+currentScaffold.id+"\n");}
+ if(scaflist!=null && currentScaffold!=null){
+ scaflist.add(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+currentScaffold.id);
+ if(verbose){System.err.println("A: Added to scaflist: "+scaflist.get(scaflist.size()-1));}
+ }
+ ca.set(ca.maxIndex+1, currentScaffold.bases); // Append after the current end
+ scaffolds++;
+
+ currentScaffold=null;
+ }
+
+ while((currentScaffold=criswrapper.next())!=null){
+ if(currentScaffold.length()+MID_PADDING+END_PADDING+ca.maxIndex>MAX_LENGTH){break;} // Would not fit; currentScaffold stays set for the next call
+ if(scaffolds>0 && !MERGE_SCAFFOLDS){break;} // One scaffold per chrom when merging is off
+
+ if(scaffolds>0){
+ for(int i=0; i<MID_PADDING; i++){ // NOTE(review): this padding is added before the MIN_SCAFFOLD check, so it is also added for a scaffold that is then skipped
+ ca.set(ca.maxIndex+1, 'N');
+ }
+ }
+ if(currentScaffold.length()>=MIN_SCAFFOLD){
+ if(verbose){System.err.println("B: Writing a scaffold because currentScaffold = "+currentScaffold);}
+ scaffoldSum++;
+ if(scafWriter!=null){scafWriter.print(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+currentScaffold.id+"\n");}
+ if(scaflist!=null){
+ scaflist.add(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+currentScaffold.id);
+ if(verbose){System.err.println("B: Added to scaflist: "+scaflist.get(scaflist.size()-1));}
+ }
+ ca.set(ca.maxIndex+1, currentScaffold.bases);
+ scaffolds++;
+ }
+ currentScaffold=null;
+ }
+
+// if(verbose){System.err.println("lastHeader="+lastHeader);}
+
+ if(scaffolds==0){return null;} // Nothing was placed, so there is no chrom to return
+
+ if(END_PADDING>0){
+ int terminalN=0; // Ns already present at the end, counted up to END_PADDING
+ for(int i=ca.maxIndex; i>=0 && terminalN<END_PADDING; i--){
+ if(ca.get(i)=='N'){terminalN++;}
+ else{break;}
+ }
+// System.err.println("\nAdding Ns: ref.length="+ca.maxIndex);
+ while(terminalN<=END_PADDING && ca.maxIndex<MAX_LENGTH-1){ // NOTE(review): '<=' leaves END_PADDING+1 trailing Ns - confirm whether the extra N is intended
+// System.out.print("N");
+ ca.set(ca.maxIndex+1, 'N');
+ terminalN++;
+ }
+// System.err.println("\nAdded Ns: ref.length="+ca.maxIndex);
+ }
+
+ int[] v=countInfo(ca); // {chromosome, 1, contigs, length, defined, undefined, startPad, stopPad}
+ v[6]=Tools.max(0, Tools.min(START_PADDING, v[6])); //In case input scaffolds had leading undefined bases
+ v[7]=Tools.max(0, Tools.min(END_PADDING, v[7])); //In case input scaffolds had trailing undefined bases
+ if(infoWriter!=null){
+// infoWriter.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n");
+ infoWriter.print(v[0]+"\t"+scaffolds+"\t"+v[2]+"\t"+v[3]+"\t"+v[4]+"\t"+v[5]+"\t"+v[6]+"\t"+v[7]+"\n"); // The local scaffold count replaces countInfo's constant 1
+ }
+ if(infolist!=null){
+ infolist.add(v[0]+"\t"+scaffolds+"\t"+v[2]+"\t"+v[3]+"\t"+v[4]+"\t"+v[5]+"\t"+v[6]+"\t"+v[7]);
+ }
+ lengthSum+=v[3]; // Running totals reported in the summary by makeChroms
+ definedSum+=v[4];
+ undefinedSum+=v[5];
+ contigSum+=v[2];
+
+ assert((definedSum+undefinedSum)==lengthSum) : definedSum+", "+undefinedSum+", "+lengthSum+
+ "; "+ca.countDefinedBases()+", "+(ca.maxIndex+1)+"\n"+ca.getString(0, ca.maxIndex);
+
+ return ca;
+ }
+
+// private String lastHeader;
+// private String nextHeader;
+ private Read currentScaffold; // Scaffold read from the input but not yet placed; carried between makeNextChrom calls
+ private long scaffoldSum=0; // Scaffolds placed so far, across all chroms
+ private long lengthSum=0; // Sum of chrom lengths
+ private long definedSum=0; // Sum of defined bases
+ private long undefinedSum=0; // Sum of undefined bases
+ private long contigSum=0; // Sum of contig counts
+
+
+ public static final int currentVersion(){return VERSION;} // Accessor for VERSION
+
+ public static boolean MERGE_SCAFFOLDS=true; // When false, makeNextChrom places a single scaffold per chrom
+ public static boolean WRITE_IN_THREAD=false; // Forced to true during main2, optionally overridden by writeinthread=, then restored
+ public static boolean overwrite=true; // Allows deleting old chrom/index files and replacing existing chrom files
+ public static boolean append=false; // Set by append= / app=
+ public static int START_PADDING=8000; //Always applied
+ public static int MID_PADDING=300; //Applied when merging scaffolds
+ public static int END_PADDING=8000; //Only applied if not enough terminal Ns
+ public static int MIN_SCAFFOLD=1; // Scaffolds shorter than this are skipped in makeNextChrom's read loop
+ public static int contigTrigger=10; // countInfo starts a new contig when the gap between defined bases exceeds this
+ public static int VERSION=5; // Written into the info, scaffold and summary headers
+ public static int MAX_LENGTH=(1<<29)-200000; // Upper bound on chrom length used when deciding whether a scaffold still fits
+
+ public static boolean verbose=false; // Enables diagnostic output on stderr
+ public static boolean RETAIN=false; // When true, generated chroms are collected and returned by main2
+ public static boolean WAIT_FOR_WRITING=true; // When true, makeChroms blocks until background writes have finished
+ public static boolean NODISK=false; // When true, records go to the lists below instead of files
+ public static int LISTBUILD=-1; // Genome build the lists below were generated for
+ public static ArrayList<String> INFO_LIST, SCAF_LIST, SUMMARY_LIST; // In-memory copies of info, scaffold and summary records (NODISK mode)
+
+// public static boolean GENERATE_SCAFFOLD_INFO=true;
+
+}
diff --git a/current/dna/ChromToFasta.java b/current/dna/ChromToFasta.java
new file mode 100755
index 0000000..1c38291
--- /dev/null
+++ b/current/dna/ChromToFasta.java
@@ -0,0 +1,142 @@
+package dna;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Tools;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 26, 2012
+ *
+ */
+public class ChromToFasta {
+
+ public static void main(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ Timer t=new Timer();
+
+ if(args[0].contains("=") && (args[0].startsWith("build") || args[0].startsWith("genome"))){
+ int build=Integer.parseInt(args[0].split("=")[1]);
+// Data.setGenome(build);
+// String s="", comma="";
+// for(int i=1; i<Data.numChroms; i++){
+// s=s+comma+Data.chromFname(i, build);
+// comma=",";
+// }
+// args[0]=s;
+ args[0]=Data.chromFname(1, build);
+ args[0]=args[0].substring(0, args[0].lastIndexOf('/'));
+ }
+
+ String[] chromfiles=args[0].split(",");
+
+ if(chromfiles.length==1){
+ File f=new File(chromfiles[0]);
+ if(f.isDirectory()){
+ ArrayList<String> list=new ArrayList<String>(4);
+ for(File f2 : f.listFiles()){
+ if(!f2.isDirectory() && f2.isFile()){
+ String s=f2.getAbsolutePath();
+ if(s.endsWith(".chrom") || s.endsWith(".chromC") || s.contains(".chrom.") || s.contains(".chromC.")){
+ list.add(s);
+ }
+ }
+ }
+ chromfiles=list.toArray(new String[list.size()]);
+ }
+ }
+
+ String outfile=args[1];
+ int blocklen=Integer.parseInt(args[2]);
+ int trigger=(args.length>3 ? Integer.parseInt(args[3]) : 0);
+
+ TextStreamWriter tsw=new TextStreamWriter(outfile, true, false, false);
+ tsw.start();
+
+ if(trigger<=0){ //Write normally
+ for(int i=0; i<chromfiles.length; i++){
+ ChromosomeArray cha=ChromosomeArray.read(chromfiles[i]);
+ writeChrom(cha, tsw, blocklen);
+ }
+ }else{ //Break into contigs
+ int contig=1;
+ for(int i=0; i<chromfiles.length; i++){
+ ChromosomeArray cha=ChromosomeArray.read(chromfiles[i]);
+ contig=writeContigs(cha, contig, trigger, blocklen, tsw);
+ }
+ }
+
+ tsw.poison();
+
+ try {tsw.join();}
+ catch (InterruptedException e) {e.printStackTrace();}
+
+ t.stop();
+ System.err.println("Time:\t"+t);
+ }
+
+ /**
+  * Writes a chromosome as numbered FASTA records, starting a new record whenever
+  * a run of exactly trigger consecutive 'N' bytes follows sequence data.
+  * Ns before the first non-N byte of a record are skipped, and the Ns that end a
+  * record are removed from it before it is written.
+  * @param cha Chromosome to write, scanned from minIndex to maxIndex
+  * @param contig Number to use for the first record
+  * @param trigger Length of the N run that ends a record
+  * @param fastaBlocklen Line length passed to writeContig
+  * @param tsw Destination writer
+  * @return The number the next record should use
+  */
+ public static int writeContigs(ChromosomeArray cha, int contig, int trigger, int fastaBlocklen, TextStreamWriter tsw){
+     final StringBuilder pending=new StringBuilder(4000);
+     int nRun=0; //Length of the current run of Ns
+
+     for(int aloc=cha.minIndex; aloc<=cha.maxIndex; aloc++){
+         final byte b=cha.get(aloc);
+         if(b!='N'){
+             pending.append((char)b);
+             nRun=0;
+             continue;
+         }
+
+         nRun++;
+         if(pending.length()==0){continue;} //Ns ahead of any sequence data are dropped
+         pending.append('N');
+         if(nRun!=trigger){continue;}
+
+         //The run is long enough: strip it and emit the record
+         pending.setLength(pending.length()-nRun);
+         tsw.print(">"+contig+"\n");
+         contig++;
+         writeContig(pending, tsw, fastaBlocklen);
+         pending.setLength(0);
+     }
+
+     //Emit whatever is left, minus any trailing Ns
+     if(pending.length()>0){
+         pending.setLength(pending.length()-nRun);
+         tsw.print(">"+contig+"\n");
+         contig++;
+         writeContig(pending, tsw, fastaBlocklen);
+         pending.setLength(0);
+     }
+
+     return contig;
+ }
+
+ /**
+  * Writes the contents of sb to tsw as lines of at most blocklen characters.
+  * @param sb Sequence to write
+  * @param tsw Destination writer
+  * @param blocklen Maximum line length; must be positive
+  * @throws IllegalArgumentException if blocklen is less than 1; previously a value
+  * of 0 made the loop print empty lines forever and a negative value failed with an
+  * index error
+  */
+ public static void writeContig(StringBuilder sb, TextStreamWriter tsw, int blocklen){
+     if(blocklen<1){throw new IllegalArgumentException("blocklen must be positive: "+blocklen);}
+     final int len=sb.length();
+     for(int i=0; i<len; i+=blocklen){
+         final int max=Tools.min(i+blocklen, len);
+         tsw.println(sb.substring(i, max));
+     }
+ }
+
+ /**
+  * Writes one chromosome to a new FASTA file.
+  * The header line is produced by the writer-based overload; this method used to
+  * print it as well, which put the same header into the file twice.
+  * NOTE(review): the writer is poisoned but not waited for, so the file may still
+  * be incomplete when this method returns - confirm whether callers rely on that.
+  * @param cha Chromosome to write
+  * @param fname Output file name
+  * @param blocklen FASTA line length
+  */
+ public static void writeChrom(ChromosomeArray cha, String fname, int blocklen){
+     TextStreamWriter tsw=new TextStreamWriter(fname, true, false, false);
+     tsw.start();
+     writeChrom(cha, tsw, blocklen); //Writes the ">chromosome" header followed by the sequence
+     tsw.poison();
+ }
+
+ /**
+  * Writes one chromosome as a FASTA record: a ">chromosome" header followed by the
+  * sequence from index 0 to maxIndex in lines of at most blocklen bases.
+  * @param cha Chromosome to write
+  * @param tsw Destination writer
+  * @param blocklen Maximum line length; must be positive
+  * @throws IllegalArgumentException if blocklen is less than 1; previously a value
+  * of 0 made the loop never terminate
+  */
+ public static void writeChrom(ChromosomeArray cha, TextStreamWriter tsw, int blocklen){
+     if(blocklen<1){throw new IllegalArgumentException("blocklen must be positive: "+blocklen);}
+     tsw.println(">"+cha.chromosome);
+     for(int i=0; i<=cha.maxIndex; i+=blocklen){
+         final int max=Tools.min(i+blocklen-1, cha.maxIndex); //getString is called with an inclusive end index
+         tsw.println(cha.getString(i, max));
+     }
+ }
+
+}
diff --git a/current/dna/ChromosomeArray.java b/current/dna/ChromosomeArray.java
new file mode 100755
index 0000000..df90b59
--- /dev/null
+++ b/current/dna/ChromosomeArray.java
@@ -0,0 +1,429 @@
+package dna;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import jgi.AssemblyStats2;
+
+import stream.ByteBuilder;
+import stream.KillSwitch;
+
+import align2.Tools;
+
+import fileIO.ReadWrite;
+
+
+/** Holds the bases of one strand of one chromosome in a growable byte array; minIndex and maxIndex bound the positions in use. */
+public class ChromosomeArray implements Serializable {
+
+ /**
+ * Serialization version identifier.
+ */
+ private static final long serialVersionUID = 3199182397853127842L;
+ /** Command-line entry point: args[0] is the chromosome number (parsed as a byte) and args[1] is the file to translate. */
+ public static void main(String[] args){
+ translateFile(args[1], Byte.parseByte(args[0]));
+ }
+
+ /** Loads fname as chromosome chrom, writes it to a ".chrom" file named after fname, reloads that file, and prints index bounds and timings. */
+ private static void translateFile(String fname, int chrom){
+
+ long time1=System.nanoTime();
+
+ ChromosomeArray cha=read(fname, chrom);
+ cha.chromosome=chrom;
+ long time2=System.nanoTime();
+ //Drop the ".fa..." suffix when present; a name without ".fa" is used whole instead of failing in substring(0, -1)
+ int dot=fname.lastIndexOf(".fa");
+ String outfile=(dot<0 ? fname : fname.substring(0,dot)).replace("hs_ref_", "")+".chrom";
+
+ System.out.println("Writing to "+outfile);
+
+ System.out.println("minIndex="+cha.minIndex+", maxIndex="+cha.maxIndex+", length="+cha.array.length+
+ "; time="+String.format("%.3f seconds", (time2-time1)/1000000000d));
+
+ long time3=System.nanoTime();
+ ReadWrite.write(cha, outfile, false);
+ cha=null;
+ System.gc();
+ cha=read(outfile);
+ long time4=System.nanoTime();
+
+ System.out.println("minIndex="+cha.minIndex+", maxIndex="+cha.maxIndex+", length="+cha.array.length+
+ "; time="+String.format("%.3f seconds", (time4-time3)/1000000000d));
+ }
+ /** Reads a ChromosomeArray from fname and assigns it chromosome number chrom; asserts that the stored number was below 1. */
+ public static ChromosomeArray read(String fname, int chrom){
+ ChromosomeArray cha=read(fname);
+ assert(cha.chromosome<1);
+ cha.chromosome=chrom;
+ return cha;
+ }
+ /** Reads a ChromosomeArray from fname via ReadWrite.read; if CHANGE_UNDEFINED_TO_N_ON_READ is set, also calls changeUndefinedToN(). */
+ public static ChromosomeArray read(String fname){
+
+// if(fname.endsWith(".chrom") || fname.endsWith(".chrom.gz")){}
+ ChromosomeArray ca=ReadWrite.read(ChromosomeArray.class, fname, false);
+ if(CHANGE_UNDEFINED_TO_N_ON_READ){
+ ca.changeUndefinedToN();
+ }
+ return ca;
+ }
+ /** Replaces every byte of the backing array for which AminoAcid.isACGTN is false with 'N'. */
+ public void changeUndefinedToN(){
+ for(int i=0; i<array.length; i++){
+// array[i]=AminoAcid.numberToBase[AminoAcid.baseToNumberACGTother[array[i]]];
+ if(!AminoAcid.isACGTN(array[i])){array[i]='N';}
+ }
+ }
+ /** Creates an empty array with chromosome -1 and strand Gene.PLUS. */
+ public ChromosomeArray(){
+ this((byte)-1, Gene.PLUS);
+ }
+
+ /** Returns a new array on the opposite strand holding the reverse complement of positions 0 through maxIndex. */
+ public ChromosomeArray complement(){
+ byte otherStrand=(strand==Gene.MINUS ? Gene.PLUS : Gene.MINUS);
+ ChromosomeArray ca=new ChromosomeArray(chromosome, otherStrand, 0, maxIndex);
+ for(int i=0; i<=maxIndex; i++){
+ int pos=maxIndex-i;
+ byte b=AminoAcid.baseToComplementExtended[array[i]];
+ ca.array[pos]=b;
+ }
+ return ca;
+ }
+ /** Creates an empty array (minIndex=Integer.MAX_VALUE, maxIndex=-1) for the given chromosome and strand. */
+ public ChromosomeArray(int chrom, byte strnd){
+ this(chrom, strnd, Integer.MAX_VALUE, -1);
+ }
+ /** Creates an array with the given chromosome, strand, and index bounds; the backing array has max(1000, max+1) bytes. */
+ public ChromosomeArray(int chrom, byte strnd, int min, int max){
+ chromosome=chrom;
+ strand=strnd;
+ array=KillSwitch.allocByte1D(Tools.max(1000, max+1));
+ minIndex=min;
+ maxIndex=max;
+ }
+
+ /** Stores val at loc, growing the array if needed: mapped through AminoAcid.baseToACGTN when both CHANGE_ flags are set, else uppercased with unknown symbols stored as 'N'. */
+ public void set(int loc, int val){
+
+ if(loc>=array.length){//Increase size
+ int newlen=(int)(1+(3L*max(array.length, loc))/2);
+ assert(newlen>loc) : newlen+", "+loc+", "+array.length;
+ resize(newlen);
+ assert(array.length==newlen);
+// System.err.println("Resized array to "+newlen);
+ }
+ if(CHANGE_U_TO_T && CHANGE_DEGENERATE_TO_N){
+ val=AminoAcid.baseToACGTN[val];
+ }else{
+ val=Character.toUpperCase((char)val);
+ if(AminoAcid.baseToNumberExtended[val]<0){val='N';}
+ }
+ array[loc]=(val>Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)val);
+ minIndex=min(loc, minIndex);
+ maxIndex=max(loc, maxIndex);
+ }
+
+ /** Stores the characters of s at positions loc through loc+s.length()-1, growing the array if needed; an empty s leaves minIndex and maxIndex unchanged. */
+ public void set(int loc, CharSequence s){
+ final int start=loc, loc2=loc+s.length();
+ if(loc2>array.length){//Increase size
+ int newlen=(int)(1+(3L*max(array.length, loc2))/2);
+ assert(newlen>loc2) : newlen+", "+loc2+", "+array.length;
+ resize(newlen);
+ assert(array.length==newlen);
+// System.err.println("Resized array to "+newlen);
+ }
+
+ if(CHANGE_U_TO_T && CHANGE_DEGENERATE_TO_N){
+ for(int i=0; i<s.length(); i++, loc++){
+ array[loc]=AminoAcid.baseToACGTN[s.charAt(i)];
+ }
+ }else{
+ for(int i=0; i<s.length(); i++, loc++){
+ char c=Character.toUpperCase(s.charAt(i));
+ if(AminoAcid.baseToNumberExtended[c]<0){c='N';}
+ assert(Character.isLetter(c));
+ assert(c<=Byte.MAX_VALUE);
+ array[loc]=(byte)c;
+ }
+ }
+ loc--;
+ assert(loc==loc2-1) : "loc="+loc+", loc2="+loc2+", s.len="+s.length();
+ if(loc<start){return;} //Nothing was written, so the bounds must not move
+ minIndex=min(start, minIndex); //The first position written, not the last, bounds the low end
+ maxIndex=max(loc, maxIndex);
+ }
+ /** Stores all of s starting at loc; see set(int, byte[], int). */
+ public void set(int loc, byte[] s){
+ set(loc, s, s.length);
+ }
+ /** Stores the first bb.length() bytes of bb.array starting at loc; see set(int, byte[], int). */
+ public void set(int loc, ByteBuilder bb){
+ set(loc, bb.array, bb.length());
+ }
+ /** Stores the first slen bytes of s at positions loc through loc+slen-1, growing the array if needed; slen==0 leaves minIndex and maxIndex unchanged. */
+ public void set(int loc, byte[] s, final int slen){
+ assert(slen<=s.length && slen>=0);
+ final int start=loc, loc2=loc+slen;
+ if(loc2>array.length){//Increase size
+ int newlen=(int)(1+(3L*max(array.length, loc2))/2);
+ assert(newlen>loc2) : newlen+", "+loc2+", "+array.length;
+ resize(newlen);
+ assert(array.length==newlen);
+// System.err.println("Resized array to "+newlen);
+ }
+ if(CHANGE_U_TO_T && CHANGE_DEGENERATE_TO_N){
+ for(int i=0; i<slen; i++, loc++){
+ byte b=(byte)Tools.max(0, s[i]);
+ array[loc]=AminoAcid.baseToACGTN[b];
+ }
+ }else{
+ for(int i=0; i<slen; i++, loc++){
+ char c=Tools.max((char)0, Character.toUpperCase((char)s[i]));
+ if(AminoAcid.baseToNumberExtended[c]<0){c='N';}
+ assert(Character.isLetter(c));
+ assert(c<=Byte.MAX_VALUE);
+ array[loc]=(byte)c;
+ }
+ }
+ loc--;
+ assert(loc==loc2-1) : "loc="+loc+", loc2="+loc2+", s.len="+slen;
+ if(loc<start){return;} //Nothing was written, so the bounds must not move
+ minIndex=min(start, minIndex); //The first position written, not the last, bounds the low end
+ maxIndex=max(loc, maxIndex);
+ }
+ /** Computes the GC fraction of the length positions starting at loc.
+ * @param loc First position counted
+ * @param length Number of positions counted
+ * @param counts Array of length 8 that is overwritten with the window's counts, or null to allocate one
+ * @return (C+G)/(A+C+G+T) for the window, or 0 if it contains none of those bases
+ */
+ public float calcGC(int loc, int length, int[] counts) {
+ counts=countACGTINOC(loc, length, counts);
+ long at=counts[0]+counts[3];
+ long gc=counts[1]+counts[2];
+ return gc/(float)Tools.max(at+gc, 1);
+ }
+ /** Tallies the symbols at positions loc through loc+length-1 by category, reading each one through get(int).
+ * @param loc First position counted
+ * @param length Number of positions counted
+ * @param counts Array of length 8 to clear and reuse, or null to allocate a new one
+ * @return counts: {A, C, G, T, Iupac, N, Other, Control}
+ */
+ public int[] countACGTINOC(final int loc, final int length, int[] counts) {
+ final int lim=loc+length;
+ assert(loc>=0 && lim<=maxIndex+1 && loc<=lim);
+ if(counts==null){counts=new int[8];}
+ else{Arrays.fill(counts, 0);}
+ assert(counts.length==8);
+ for(int i=loc; i<lim; i++){
+ byte b=get(i);
+ int num=charToNum[b];
+ counts[num]++;
+ }
+ return counts;
+ }
+
+ /** Returns the letter (IUPAC) representation of the base, as a byte.
+ * Positions outside minIndex..maxIndex (both inclusive) yield 'N'. */
+ public byte get(int loc){
+ return loc<minIndex || loc>maxIndex ? (byte)'N' : array[loc];
+ }
+ /** Returns positions a through b (inclusive) as a String, reading each one through get(int). */
+ public String getString(int a, int b){
+ StringBuilder sb=new StringBuilder(b-a+1);
+ for(int i=a; i<=b; i++){
+ sb.append((char)get(i));
+ }
+ return sb.toString();
+ }
+
+ /** Returns FASTA format bytes. Same as getString, but faster. */
+ public byte[] getBytes(int a, int b){
+ byte[] out=Arrays.copyOfRange(array, a, b+1);
+// assert(out[0]>0 && out[out.length-1]>0) : a+", "+b+", "+minIndex+", "+maxIndex+", "+array.length;
+ if(a<minIndex || b>maxIndex){
+ for(int i=0; i<out.length; i++){
+ if(out[i]==0){out[i]='N';}
+ }
+ }
+ return out;
+ }
+ /** Returns the AminoAcid.baseToNumberACGTN code of the byte stored at loc. */
+ public byte getNumberACGTN(int loc){
+ return AminoAcid.baseToNumberACGTN[array[loc]];
+ }
+ /** Returns the AminoAcid.baseToNumber code of the byte stored at loc. */
+ public byte getNumber(int loc){
+ return AminoAcid.baseToNumber[array[loc]];
+ }
+ /** Returns true if every position a through b (inclusive) has a non-negative AminoAcid.baseToNumber code. */
+ public boolean isFullyDefined(int a, int b){
+ for(int i=a; i<=b; i++){
+ int x=AminoAcid.baseToNumber[array[i]];
+ if(x<0){return false;}
+ }
+ return true;
+ }
+ /** Returns true if every position a through b (inclusive) has a negative AminoAcid.baseToNumber code. */
+ public boolean isFullyUndefined(int a, int b){
+ for(int i=a; i<=b; i++){
+ int x=AminoAcid.baseToNumber[array[i]];
+ if(x>=0){return false;}
+ }
+ return true;
+ }
+ /** Counts the defined bases between minIndex and maxIndex, inclusive. */
+ public int countDefinedBases(){
+ return countDefinedBases(minIndex, maxIndex);
+ }
+ /** Counts the positions a through b (inclusive) whose AminoAcid.baseToNumber code is non-negative. */
+ public int countDefinedBases(int a, int b){
+ int sum=0;
+ for(int i=a; i<=b; i++){
+ int x=AminoAcid.baseToNumber[array[i]];
+ if(x>=0){sum++;}
+ }
+ return sum;
+ }
+ /** Packs positions a through b (inclusive) of this array into an int; see toNumber(int, int, byte[]). */
+ public int getNumber(int a, int b){
+ return toNumber(a, b, array);
+ }
+ /** Packs bases[a..b] into an int, shifting in each base's AminoAcid.baseToNumber code 2 bits at a time; returns -1 if any code is negative. */
+ public static int toNumber(int a, int b, byte[] bases){
+ assert(b>=a);
+ assert(b-a<17); //<17 for unsigned, <16 for signed
+ int out=0;
+ for(int i=a; i<=b; i++){
+ int x=AminoAcid.baseToNumber[bases[i]];
+ if(x<0){return -1;}
+ out=((out<<2)|x);
+ }
+ return out;
+ }
+ /** Same packing as toNumber(int, int, byte[]) for the characters of a String, without the range assertions. */
+ public static int toNumber(int a, int b, String bases){
+ int out=0;
+ for(int i=a; i<=b; i++){
+ int x=AminoAcid.baseToNumber[bases.charAt(i)];
+ if(x<0){return -1;}
+ out=((out<<2)|x);
+ }
+ return out;
+ }
+ /** Replaces the backing array with one of length newlen, copying the first min(array.length, newlen) bytes. */
+ public void resize(int newlen){
+ byte[] temp=KillSwitch.allocByte1D(newlen);
+ int lim=min(array.length, newlen);
+ assert(lim>=maxIndex) : lim+","+maxIndex; //NOTE(review): lim==maxIndex passes although index maxIndex would not be copied
+ for(int i=0; i<lim; i++){
+ temp[i]=array[i];
+ }
+ array=temp;
+ }
+ /** Returns the entire backing array, not just minIndex..maxIndex, decoded as a String using the platform default charset. */
+ public String toBaseString(){
+ String s=new String(array);
+ return s;
+ }
+ /** For each array index, returns the distance to the nearest A, C, G, or T in either direction, saturating at Character.MAX_VALUE. */
+ public char[] nearestDefinedBase(){
+ char[] r=new char[array.length];
+ final char max=Character.MAX_VALUE;
+
+ char dist=max;
+ for(int i=0; i<r.length; i++){
+ byte b=array[i];
+ if(b=='A' || b=='C' || b=='G' || b=='T'){
+ dist=0;
+ }else{
+ dist=(dist==max ? max : (char)(dist+1));
+ }
+ r[i]=dist;
+ }
+
+ dist=r[r.length-1];
+ for(int i=r.length-1; i>=0; i--){
+ byte b=array[i];
+ if(b=='A' || b=='C' || b=='G' || b=='T'){
+ dist=0;
+ }else{
+ dist=(dist==max ? max : (char)(dist+1));
+ }
+ r[i]=Tools.min(dist, r[i]);
+ }
+ return r;
+ }
+ /** Returns the inclusive start/stop ranges of contigs between minIndex and maxIndex; a contig ends at an 'X' or once a run of N/X symbols reaches nBlockSize. */
+ public ArrayList<Range> toContigRanges(final int nBlockSize){
+ assert(nBlockSize>0);
+ ArrayList<Range> list=new ArrayList<Range>();
+
+ int start=-1;
+ int stop=-1;
+ int ns=nBlockSize+1;
+
+ boolean contig=false;
+
+ for(int i=minIndex; i<=maxIndex; i++){
+ byte b=array[i];
+ if(b=='N' || b=='X'){
+ ns++;
+ if(contig && (b=='X' || ns>=nBlockSize)){
+ Range r=new Range(start, stop);
+ list.add(r);
+ contig=false;
+ }
+ }else{
+ ns=0;
+ if(!contig){start=i;}
+ contig=true;
+ stop=i;
+ }
+ }
+ if(contig){
+ Range r=new Range(start, stop);
+ list.add(r);
+ }
+ return list;
+ }
+
+ /** Compares index bounds, chromosome, array length, and the bases in minIndex..maxIndex ignoring case; prints a letter a-e to stderr naming the first mismatch. */
+ public boolean equalsIgnoreCase(ChromosomeArray other){
+ if(minIndex!=other.minIndex){System.err.println("a");return false;}
+ if(maxIndex!=other.maxIndex){System.err.println("b");return false;}
+ if(chromosome!=other.chromosome){System.err.println("c");return false;}
+ if(array.length!=other.array.length){System.err.println("d");return false;}
+ for(int i=minIndex; i<=maxIndex; i++){
+ if(Character.toLowerCase(array[i])!=Character.toLowerCase(other.array[i])){
+ System.err.println("e");
+ return false;
+ }
+ }
+ return true;
+ }
+ //Local min/max helpers for long and int arguments
+ private static final long min(long x, long y){return x<y ? x : y;}
+ private static final long max(long x, long y){return x>y ? x : y;}
+ private static final int min(int x, int y){return x<y ? x : y;}
+ private static final int max(int x, int y){return x>y ? x : y;}
+ /** Strand of this array; complement() switches between Gene.PLUS and Gene.MINUS. */
+ public final byte strand;
+ public int chromosome;
+ public byte[] array;
+ public int maxIndex=-1; //Highest position in use; -1 while empty
+ public int minIndex=Integer.MAX_VALUE; //Lowest position in use; Integer.MAX_VALUE while empty
+
+ public static boolean CHANGE_UNDEFINED_TO_N_ON_READ=false; //Checked by read(String)
+ public static boolean CHANGE_U_TO_T=true; //Both of these must be true for the set methods to use AminoAcid.baseToACGTN
+ public static boolean CHANGE_DEGENERATE_TO_N=true;
+
+ /** Translation array for tracking base counts */
+ private static final byte[] charToNum=AssemblyStats2.makeCharToNum();
+
+
+}
diff --git a/current/dna/Coverage.java b/current/dna/Coverage.java
new file mode 100755
index 0000000..3c0d837
--- /dev/null
+++ b/current/dna/Coverage.java
@@ -0,0 +1,26 @@
+package dna;
+import java.util.HashSet;
+
+import var.VarLine;
+
+
+/** Plain holder for coverage-related fields tied to one Gene; nothing in this class computes or updates them. */
+public class Coverage{
+ public Coverage(Gene gg){
+ g=gg;
+ }
+
+ public final Gene g; //Gene passed to the constructor
+ public HashSet<VarLine> varSet; //TODO: Could change these to arrays and sort them.
+ public int min=Integer.MAX_VALUE; //Starts at the largest int; presumably lowered as coverage is observed - TODO confirm
+ public int max=0;
+ public int covered=0;
+ public int uncovered=0;
+ public long sum=0;
+ public float avg;
+ public float covRatio;
+
+ public int[] missingChromRelative; //Never assigned in this class
+ public int[] missingGeneRelative; //Never assigned in this class
+
+}
\ No newline at end of file
diff --git a/current/dna/CoverageArray.java b/current/dna/CoverageArray.java
new file mode 100755
index 0000000..3eb320f
--- /dev/null
+++ b/current/dna/CoverageArray.java
@@ -0,0 +1,136 @@
+package dna;
+import java.io.Serializable;
+import java.util.ArrayList;
+
+import align2.IntList;
+
+import fileIO.ReadWrite;
+
+
+/** Base class for per-position coverage values of one chromosome; subclasses provide the backing storage. */
+public abstract class CoverageArray implements Serializable {
+ /**
+ * Serialization version identifier.
+ */
+ private static final long serialVersionUID = -7175422489330746676L;
+
+ /** Reads a CoverageArray from the file that ReadWrite.findFileExtension resolves for fname; throws RuntimeException unless fname contains ".ca". */
+ public static final CoverageArray read(String fname){
+
+ if(!fname.contains(".ca")){
+ throw new RuntimeException();
+// ca=new CoverageArray2();
+// ca.load(new TsvCoverageFile(fname));
+// return ca;
+ }
+
+ fname=ReadWrite.findFileExtension(fname);
+// System.err.println("Found "+fname);
+
+ return ReadWrite.read(CoverageArray.class, fname, true);
+
+// if(fname.endsWith(".ca2") || fname.contains(".ca2.")){return ReadWrite.read(CoverageArray2.class, fname);}
+// else if(fname.endsWith(".ca") || fname.contains(".ca.")){return ReadWrite.read(CoverageArray1.class, fname);}
+// else{return ReadWrite.read(CoverageArray.class, fname);}
+ }
+ /** @param chrom Chromosome number stored in the chromosome field */
+ public CoverageArray(int chrom){chromosome=chrom;}
+
+ /** Adds amt to the value at loc.
+ * @param loc Position to change
+ * @param amt Amount to add
+ */
+ public abstract void increment(int loc, int amt);
+
+ /** Adds 1 to the value at loc.
+ * @param loc Position to change
+ */
+ public abstract void increment(int loc);
+ /** Adds amt to every position from min through max, inclusive. */
+ public abstract void incrementRange(int min, int max, int amt);
+ /** Adds amt to each range in ranges, which holds consecutive (start, end) pairs with end exclusive. */
+ public void incrementRanges(IntList ranges, int amt){
+ for(int i=0; i<ranges.size; i+=2){
+ int a=ranges.get(i), b=ranges.get(i+1);
+ incrementRange(a, b-1, amt); //b is exclusive, incrementRange takes an inclusive bound
+ }
+ }
+ /** Sets the value at loc to val. */
+ public abstract void set(int loc, int val);
+ /** Returns the value at loc. */
+ public abstract int get(int loc);
+ /** Changes the capacity of the backing storage to newlen. */
+ public abstract void resize(int newlen);
+ /** Averages position and value over consecutive blocks of the range [max(min, minIndex), min(max, maxIndex)].
+ * @param blocksize Positions per block; if not positive, ceil(range length/31500) is used, with a minimum of 1
+ * @return One {rounded mean position, mean value} pair per block; the last block accumulated is never flushed, so it is omitted */
+ public final double[][] toGraph(int blocksize, int min, int max){
+ min=max(min, minIndex);
+ max=min(max, maxIndex);
+ int length=max-min;
+
+ ArrayList<double[]> list=new ArrayList<double[]>();
+
+ int block;
+
+ if(blocksize<=0){
+// block=((array.length+62999)/63000);//For Excel
+// block=((length+62999)/63000);//For Excel
+ block=((length+31499)/31500);//For Excel
+ }else{
+ block=blocksize;
+ }
+ block=max(block, 1);
+
+ int current=0;
+ double[] sum=new double[2];
+ for(int loc=min; loc<=max; loc++){
+ if(current==block){
+ for(int i=0; i<sum.length; i++){
+ sum[i]=sum[i]/current;
+ }
+ sum[0]=Math.round(sum[0]);
+ list.add(sum);
+ sum=new double[2];
+ current=0;
+ }
+
+ sum[0]+=loc;
+ sum[1]+=get(loc);
+
+ current++;
+ }
+
+ return list.toArray(new double[0][]);
+
+ }
+
+ /** Prints each row of data to stdout as "position<TAB>value"; with assertions enabled it always fails first, because the smoothing step was removed. */
+ public static final void print(double[][] data){
+
+// data=stats.Smoother.weightedAveragePlank(data, 24);
+ assert(false) : "Smoother disabled in this code purely to reduce dependancies.";
+ StringBuilder sb=new StringBuilder(data.length*20);
+ for(double[] d : data){
+ sb.append(String.format("%d\t%.2f\n",(int)d[0],d[1]));
+ }
+ System.out.print(sb);
+ }
+
+ public abstract String toString();
+
+ static final long min(long x, long y){return x<y ? x : y;}
+ static final long max(long x, long y){return x>y ? x : y;}
+ static final int min(int x, int y){return x<y ? x : y;}
+ static final int max(int x, int y){return x>y ? x : y;}
+
+ public int chromosome;
+
+ public int maxIndex=-1; //-1 while empty
+ public int minIndex=Integer.MAX_VALUE; //Integer.MAX_VALUE while empty
+ public int length(){return maxIndex-minIndex+1;}
+ public abstract int arrayLength();
+
+ private static boolean OVERFLOWED=false; //Not referenced in this class; CoverageArray2 and CoverageArray3 declare their own flag
+
+}
diff --git a/current/dna/CoverageArray2.java b/current/dna/CoverageArray2.java
new file mode 100755
index 0000000..54491db
--- /dev/null
+++ b/current/dna/CoverageArray2.java
@@ -0,0 +1,225 @@
+package dna;
+import java.io.Serializable;
+
+import stream.KillSwitch;
+
+import driver.Translator2;
+
+import fileIO.ReadWrite;
+
+
+public class CoverageArray2 extends CoverageArray implements Serializable {
+
+ /**
+ * Serialization version identifier. Values are stored as chars, so they saturate at Character.MAX_VALUE.
+ */
+ private static final long serialVersionUID = 8242586595591123194L;
+ /** Runs runSpeedTest on args. */
+ public static void main(String[] args){
+ runSpeedTest(args);
+
+// translateGenomeBuild(args);
+ }
+ /** Loads the coverage file args[1], sets its chromosome from args[0], writes it as coverage-chr<N>-build<B>.ca in the same directory, reloads it, and prints timings. */
+ public static void runSpeedTest(String[] args){
+
+ long time1=System.nanoTime();
+
+ CoverageArray2 ca=(CoverageArray2)read(args[1]);
+ ca.chromosome=Byte.parseByte(args[0]);
+ long time2=System.nanoTime();
+
+// int dot=args[1].lastIndexOf(".");
+// String outfile=args[1].substring(0,dot)+".ca";
+
+ args[1]=args[1].replace('\\', '/');
+ int slash=args[1].lastIndexOf('/');
+ String outfile;
+ if(slash<1){
+ outfile="coverage-chr"+ca.chromosome+"-build"+Data.GENOME_BUILD+".ca";
+ }else{
+ outfile=args[1].substring(0,slash+1)+"coverage-chr"+ca.chromosome+"-build"+Data.GENOME_BUILD+".ca";
+ }
+
+ System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+
+ "; time="+String.format("%.3f seconds", (time2-time1)/1000000000d));
+
+ long time3=System.nanoTime();
+ ReadWrite.write(ca, outfile, false);
+ ca=null;
+ System.gc();
+ ca=(CoverageArray2)read(outfile);
+ long time4=System.nanoTime();
+
+ System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+
+ "; time="+String.format("%.3f seconds", (time4-time3)/1000000000d));
+
+
+ }
+
+ /** Legacy human code. Parses args as {inBuild, outBuild, root}, runs the translation, and prints the elapsed time. */
+ @Deprecated
+ public static void translateGenomeBuild(String[] args){
+
+ Timer t=new Timer();
+
+ int inBuild=Integer.parseInt(args[0]);
+ int outBuild=Integer.parseInt(args[1]);
+ String root=args[2];
+
+ translateGenomeBuild(inBuild, outBuild, root);
+
+ t.stop();
+ System.out.println("Time:\t"+t);
+
+ }
+
+ /** Legacy human code. For chromosomes 1-25, reads the inBuild coverage file under root, maps each position through Translator2, and writes the outBuild files. */
+ @Deprecated
+ public static void translateGenomeBuild(int inBuild, int outBuild, String root){
+ root=root.replace('\\', '/');
+ if(!root.endsWith("/")){root+="/";}
+
+ CoverageArray2[] out=new CoverageArray2[27];
+
+ for(int chrom=1; chrom<out.length; chrom++){
+ out[chrom]=new CoverageArray2(chrom, 500);
+ }
+
+ final byte PLUS=Gene.PLUS;
+
+ for(int chrom=1; chrom<=25; chrom++){
+ String infile=root+"coverage-chr"+chrom+"-build"+inBuild+".ca.zip";
+ CoverageArray2 ca1=ReadWrite.read(CoverageArray2.class, infile, true);
+ for(int loc1=ca1.minIndex; loc1<=ca1.maxIndex; loc1++){
+ char cov=(char)ca1.get(loc1);
+ int[] xform=Translator2.translate(inBuild, outBuild, chrom, PLUS, loc1);
+ if(xform!=null){
+ int chrom2=(int)xform[0];
+ int loc2=xform[2];
+ out[chrom2].set(loc2, cov);
+ }
+ }
+ ca1=null;
+ System.out.println("Read "+infile);
+ }
+
+ for(int chrom=1; chrom<=25; chrom++){
+ String outfile=root+"coverage-chr"+chrom+"-build"+outBuild+".ca.zip";
+ out[chrom].resize(out[chrom].maxIndex+1);
+ ReadWrite.write(out[chrom], outfile, false);
+ out[chrom]=null;
+ System.out.println("Wrote "+outfile);
+ }
+
+ }
+
+// public CoverageArray2(){
+// this((int)-1);
+// }
+//
+// public CoverageArray2(int chrom){
+// this(chrom, 1<<24);
+// }
+ /** Creates an array for chromosome chrom with initial capacity initialLen; the capacity grows on demand. */
+ public CoverageArray2(int chrom, int initialLen){
+ super(chrom);
+ array=KillSwitch.allocChar1D(initialLen);
+ }
+
+ /** Adds amt to the value at loc, saturating at Character.MAX_VALUE.
+ * @param loc Position to change; negative positions are ignored
+ * @param amt Amount to add
+ */
+ public void increment(int loc, int amt) {
+ set(loc, get(loc)+amt);
+ }
+
+ /** Adds 1 to the value at loc.
+ * @param loc Position to change; negative positions are ignored
+ */
+ public void increment(int loc) {
+ set(loc, get(loc)+1);
+ }
+ /** Adds amt to positions min through max (inclusive): clamps min to 0, grows the array to fit max, caps values at Character.MAX_VALUE, and widens minIndex/maxIndex. */
+ public void incrementRange(int min, int max, int amt) {
+ if(min<0){min=0;}
+ if(max>=array.length){//Increase size
+ int newlen=(int)Math.min(Integer.MAX_VALUE-8L, 1+(7L*max(array.length, max))/4); //long math: 7*max overflows int above ~306M
+ assert(newlen>max);
+ resize(newlen);
+ assert(array.length==newlen);
+ }else if(max<0){max=-1;}
+ for(int i=min; i<=max; i++){
+ int val=array[i]+amt;
+ if(val>Character.MAX_VALUE){
+ val=Character.MAX_VALUE;
+ if(!OVERFLOWED){
+ System.err.println("Note: Coverage capped at "+(int)(Character.MAX_VALUE));
+ OVERFLOWED=true;
+ }
+ }
+ array[i]=(char)val;
+ }
+ if(min>max){return;} //Empty range: nothing was written, so the bounds must not move
+ minIndex=min(min, minIndex); //Keep the bounds in step with set(), which length(), toString() and toGraph() rely on
+ maxIndex=max(max, maxIndex);
+ }
+ /** Sets the value at loc to val, capped at Character.MAX_VALUE; grows the array if needed and ignores negative loc. */
+ public void set(int loc, int val){
+ if(loc>=array.length){//Increase size
+ int newlen=(int)Math.min(Integer.MAX_VALUE-8L, 1+(7L*max(array.length, loc))/4); //long math: 7*loc overflows int above ~306M
+ assert(newlen>loc);
+ resize(newlen);
+ assert(array.length==newlen);
+ }else if(loc<0){
+// minIndex=min(0, minIndex);
+// maxIndex=max(0, maxIndex);
+ return;
+ }
+ if(val>Character.MAX_VALUE && !OVERFLOWED){
+ System.err.println("Note: Coverage capped at "+(int)(Character.MAX_VALUE));
+ OVERFLOWED=true;
+ }
+ array[loc]=(val>Character.MAX_VALUE ? Character.MAX_VALUE : (char)val);
+ minIndex=min(loc, minIndex);
+ maxIndex=max(loc, maxIndex);
+ }
+ /** Returns the value at loc, or 0 if loc is outside the backing array. */
+ public int get(int loc){
+ return loc>=array.length || loc<0 ? 0 : array[loc];
+ }
+ /** Replaces the backing array with one of length newlen, copying the first min(array.length, newlen) values; asserts that index maxIndex is kept. */
+ public void resize(int newlen){
+// System.err.println("Resized CoverageArray "+chromosome+" to "+newlen);
+ char[] temp=KillSwitch.allocChar1D(newlen);
+ int lim=min(array.length, newlen);
+ assert(lim>maxIndex) : lim+","+maxIndex;
+ for(int i=0; i<lim; i++){
+ temp[i]=array[i];
+ }
+ array=temp;
+ }
+ /** Formats the values at positions 0 through maxIndex as "[v0, v1, ...]". */
+ public String toString(){
+ StringBuilder sb=new StringBuilder();
+ sb.append('[');
+ for(int i=0; i<=maxIndex; i++){
+ if(i>0){sb.append(", ");}
+ sb.append((int)array[i]);
+ }
+ sb.append(']');
+ return sb.toString();
+ }
+
+ public char[] array; //Backing storage, one char per position
+ public int length(){return maxIndex-minIndex+1;}
+ public int arrayLength(){return array.length;}
+
+ private static boolean OVERFLOWED=false; //Set once the "capped" note has been printed
+ /**
+ *
+ */
+// private static final long serialVersionUID = -7493066925636540386L;
+
+}
diff --git a/current/dna/CoverageArray3.java b/current/dna/CoverageArray3.java
new file mode 100755
index 0000000..ac55518
--- /dev/null
+++ b/current/dna/CoverageArray3.java
@@ -0,0 +1,235 @@
+package dna;
+import java.io.Serializable;
+
+import stream.KillSwitch;
+
+import driver.Translator2;
+
+import fileIO.ReadWrite;
+
+
+public class CoverageArray3 extends CoverageArray implements Serializable {
+
+ /**
+ * Serialization version identifier. Values are stored as ints, so they saturate at Integer.MAX_VALUE.
+ */
+ private static final long serialVersionUID = -4216985130070239610L;
+ /** Runs runSpeedTest on args. */
+ public static void main(String[] args){
+ runSpeedTest(args);
+
+// translateGenomeBuild(args);
+ }
+ /** Loads the coverage file args[1], sets its chromosome from args[0], writes it as coverage-chr<N>-build<B>.ca in the same directory, reloads it, and prints timings. */
+ public static void runSpeedTest(String[] args){
+
+ long time1=System.nanoTime();
+
+ CoverageArray3 ca=(CoverageArray3)read(args[1]);
+ ca.chromosome=Byte.parseByte(args[0]);
+ long time2=System.nanoTime();
+
+// int dot=args[1].lastIndexOf(".");
+// String outfile=args[1].substring(0,dot)+".ca";
+
+ args[1]=args[1].replace('\\', '/');
+ int slash=args[1].lastIndexOf('/');
+ String outfile;
+ if(slash<1){
+ outfile="coverage-chr"+ca.chromosome+"-build"+Data.GENOME_BUILD+".ca";
+ }else{
+ outfile=args[1].substring(0,slash+1)+"coverage-chr"+ca.chromosome+"-build"+Data.GENOME_BUILD+".ca";
+ }
+
+ System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+
+ "; time="+String.format("%.3f seconds", (time2-time1)/1000000000d));
+
+ long time3=System.nanoTime();
+ ReadWrite.write(ca, outfile, false);
+ ca=null;
+ System.gc();
+ ca=(CoverageArray3)read(outfile);
+ long time4=System.nanoTime();
+
+ System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+
+ "; time="+String.format("%.3f seconds", (time4-time3)/1000000000d));
+
+
+ }
+
+ /** Legacy human code. Parses args as {inBuild, outBuild, root}, runs the translation, and prints the elapsed time. */
+ @Deprecated
+ public static void translateGenomeBuild(String[] args){
+
+ Timer t=new Timer();
+
+ int inBuild=Integer.parseInt(args[0]);
+ int outBuild=Integer.parseInt(args[1]);
+ String root=args[2];
+
+ translateGenomeBuild(inBuild, outBuild, root);
+
+ t.stop();
+ System.out.println("Time:\t"+t);
+
+ }
+
+ /** Legacy human code. For chromosomes 1-25, reads the inBuild coverage file under root, maps each position through Translator2, and writes the outBuild files. */
+ @Deprecated
+ public static void translateGenomeBuild(int inBuild, int outBuild, String root){
+ root=root.replace('\\', '/');
+ if(!root.endsWith("/")){root+="/";}
+
+ CoverageArray3[] out=new CoverageArray3[27];
+
+ for(int chrom=1; chrom<out.length; chrom++){
+ out[chrom]=new CoverageArray3(chrom, 500);
+ }
+
+ final byte PLUS=Gene.PLUS;
+
+ for(int chrom=1; chrom<=25; chrom++){
+ String infile=root+"coverage-chr"+chrom+"-build"+inBuild+".ca.zip";
+ CoverageArray3 ca1=ReadWrite.read(CoverageArray3.class, infile, true);
+ for(int loc1=ca1.minIndex; loc1<=ca1.maxIndex; loc1++){
+ int cov=(int)ca1.get(loc1);
+ int[] xform=Translator2.translate(inBuild, outBuild, chrom, PLUS, loc1);
+ if(xform!=null){
+ int chrom2=(int)xform[0];
+ int loc2=xform[2];
+ out[chrom2].set(loc2, cov);
+ }
+ }
+ ca1=null;
+ System.out.println("Read "+infile);
+ }
+
+ for(int chrom=1; chrom<=25; chrom++){
+ String outfile=root+"coverage-chr"+chrom+"-build"+outBuild+".ca.zip";
+ out[chrom].resize(out[chrom].maxIndex+1);
+ ReadWrite.write(out[chrom], outfile, false);
+ out[chrom]=null;
+ System.out.println("Wrote "+outfile);
+ }
+
+ }
+
+// public CoverageArray3(){
+// this((int)-1);
+// }
+//
+// public CoverageArray3(int chrom){
+// this(chrom, 1<<24);
+// }
+ /** Creates an array for chromosome chrom with initial capacity initialLen; the capacity grows on demand. */
+ public CoverageArray3(int chrom, int initialLen){
+ super(chrom);
+ array=KillSwitch.allocInt1D(initialLen);
+ }
+
+ /** Adds amt to the value at loc, saturating at Integer.MAX_VALUE.
+ * @param loc Position to change; negative positions are ignored
+ * @param amt Amount to add
+ */
+ public void increment(int loc, long amt) {
+ set(loc, get(loc)+amt);
+ }
+
+ /** Adds 1 to the value at loc.
+ * @param loc Position to change; negative positions are ignored
+ */
+ public void increment(int loc) {
+ set(loc, get(loc)+1L);
+ }
+ /** Widens amt to long and delegates to increment(int, long). */
+ @Override
+ public void increment(int loc, int amt) {
+ increment(loc, (long)amt);
+ }
+ /** Widens amt to long and delegates to incrementRange(int, int, long). */
+ @Override
+ public void incrementRange(int min, int max, int amt) {
+ incrementRange(min, max, (long)amt);
+ }
+ /** Adds amt to positions min through max (inclusive): clamps min to 0, grows the array to fit max, caps values at Integer.MAX_VALUE, and widens minIndex/maxIndex. */
+ public void incrementRange(int min, int max, long amt) {
+ if(min<0){min=0;}
+ if(max>=array.length){//Increase size
+ int newlen=(int)Math.min(Integer.MAX_VALUE-8L, 1+(7L*max(array.length, max))/4); //long math: 7*max overflows int above ~306M
+ assert(newlen>max);
+ resize(newlen);
+ assert(array.length==newlen);
+ }else if(max<0){max=-1;}
+ for(int i=min; i<=max; i++){
+ long val=array[i]+amt;
+ if(val>Integer.MAX_VALUE){
+ val=Integer.MAX_VALUE;
+ if(!OVERFLOWED){
+ System.err.println("Note: Coverage capped at "+Integer.MAX_VALUE);
+ OVERFLOWED=true;
+ }
+ }
+ array[i]=(int)val;
+ }
+ if(min>max){return;} //Empty range: nothing was written, so the bounds must not move
+ minIndex=min(min, minIndex); //Keep the bounds in step with set(), which length(), toString() and toGraph() rely on
+ maxIndex=max(max, maxIndex);
+ }
+ public void set(int loc, int val){ //Widens val to long and delegates to set(int, long)
+ set(loc, (long)val);
+ }
+ /** Sets the value at loc to val, capped at Integer.MAX_VALUE; grows the array if needed and ignores negative loc. */
+ public void set(int loc, long val){
+ if(loc>=array.length){//Increase size
+ int newlen=(int)Math.min(Integer.MAX_VALUE-8L, 1+(7L*max(array.length, loc))/4); //long math: 7*loc overflows int above ~306M
+ assert(newlen>loc);
+ resize(newlen);
+ assert(array.length==newlen);
+ }else if(loc<0){
+// minIndex=min(0, minIndex);
+// maxIndex=max(0, maxIndex);
+ return;
+ }
+ if(val>Integer.MAX_VALUE && !OVERFLOWED){
+ System.err.println("Note: Coverage capped at "+Integer.MAX_VALUE);
+ OVERFLOWED=true;
+ }
+ array[loc]=(val>Integer.MAX_VALUE ? Integer.MAX_VALUE : (int)val);
+ minIndex=min(loc, minIndex);
+ maxIndex=max(loc, maxIndex);
+ }
+ /** Returns the value at loc, or 0 if loc is outside the backing array. */
+ public int get(int loc){
+ return loc>=array.length || loc<0 ? 0 : array[loc];
+ }
+ /** Replaces the backing array with one of length newlen, copying the first min(array.length, newlen) values; asserts that index maxIndex is kept. */
+ public void resize(int newlen){
+// System.err.println("Resized CoverageArray "+chromosome+" to "+newlen);
+ int[] temp=KillSwitch.allocInt1D(newlen);
+ int lim=min(array.length, newlen);
+ assert(lim>maxIndex) : lim+","+maxIndex;
+ for(int i=0; i<lim; i++){
+ temp[i]=array[i];
+ }
+ array=temp;
+ }
+ /** Formats the values at positions 0 through maxIndex as "[v0, v1, ...]". */
+ public String toString(){
+ StringBuilder sb=new StringBuilder();
+ sb.append('[');
+ for(int i=0; i<=maxIndex; i++){
+ if(i>0){sb.append(", ");}
+ sb.append((int)array[i]);
+ }
+ sb.append(']');
+ return sb.toString();
+ }
+
+
+ public int[] array; //Backing storage, one int per position
+ public int length(){return maxIndex-minIndex+1;}
+ public int arrayLength(){return array.length;}
+
+ private static boolean OVERFLOWED=false; //Set once the "capped" note has been printed
+
+}
diff --git a/current/dna/Data.java b/current/dna/Data.java
new file mode 100755
index 0000000..4bc7e81
--- /dev/null
+++ b/current/dna/Data.java
@@ -0,0 +1,1636 @@
+package dna;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.net.URL;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.HashMap;
+import java.util.Map;
+
+import kmer.Primes;
+
+import var.Variation;
+
+import align2.AbstractIndex;
+import align2.AbstractMapper;
+import align2.BBSplitter;
+import align2.ChromLoadThread;
+import align2.RefToIndex;
+import align2.Tools;
+
+
+import driver.Search;
+import fileIO.ChainBlock;
+import fileIO.ChainLine;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+public class Data {
+
+
+ /** Empty entry point; the body contains no statements. */
+ public static void main(String[] args){}
+
+
+ //TODO IMPORTANT! Ensure that this unloads everything big, AND that reloading subsequently works OK.
+ /**
+  * Resets the static genome state held by this class: clears the chromosome matrix,
+  * calls AbstractIndex.clear() and RefToIndex.clear(), restores the AbstractMapper
+  * chromosome bounds, zeroes the counters, nulls the per-chromosome and scaffold
+  * tables, nulls the BBSplitter tables, and marks the genome build as unset (-1).
+  */
+ public static void unloadAll(){
+     //Loaded sequence data and index structures
+     chromosomePlusMatrix=null;
+     AbstractIndex.clear();
+     RefToIndex.clear();
+
+     //Mapper chromosome bounds
+     AbstractMapper.minChrom=1;
+     AbstractMapper.maxChrom=Integer.MAX_VALUE;
+
+     //Genome-wide counters
+     numChroms=0;
+     numBases=0;
+     numDefinedBases=0;
+     numContigs=0;
+     numScaffolds=0;
+     interScaffoldPadding=0;
+
+     //Per-chromosome statistics
+     chromLengths=null;
+     chromDefinedBases=null;
+     chromUndefinedBases=null;
+     chromContigs=null;
+     chromScaffolds=null;
+     chromStartPad=null;
+
+     //Scaffold tables
+     scaffoldNames=null;
+     scaffoldLocs=null;
+     scaffoldLengths=null;
+     scaffoldNameTable=null;
+
+     //BBSplitter tables
+     BBSplitter.setCountTable=null;
+     BBSplitter.scafCountTable=null;
+     BBSplitter.streamTable=null;
+     BBSplitter.streamTableAmbiguous=null;
+
+     //Genome identity
+     genomeSource=null;
+     name=null;
+     GENOME_BUILD=-1;
+     genome_set_to=-1;
+ }
+
+
+ //TODO IMPORTANT! Ensure that this unloads everything big, AND that reloading subsequently works OK.
+ /**
+  * Drops the reference to the chromosome array stored for chrom.
+  * The unloadSoft argument is not read by this method.
+  */
+ public static void unload(int chrom, boolean unloadSoft){
+     final ChromosomeArray[] loaded=chromosomePlusMatrix;
+     loaded[chrom]=null;
+ }
+
+ /** Nulls the gene, gene-set, and range entries stored for the given chromosome. */
+ public static void unloadGenes(int chrom){
+     final int c=chrom;
+
+     //Gene and gene-set arrays
+     geneMatrix[c]=null;
+     geneSetMatrix[c]=null;
+
+     //Range arrays derived from them
+     geneTxRangeMatrix[c]=null;
+     geneSetRangeMatrix[c]=null;
+     geneCodeRangeMatrix[c]=null;
+     geneCodeAndExonRangeMatrix[c]=null;
+     geneNearbyRangeMatrix[c]=null;
+     exonRangeMatrix[c]=null;
+ }
+
+ /**
+  * Returns the index of the first element of array equal to x, or -1 if none is found.
+  * Only the first 128 positions are examined, because the byte return type cannot
+  * represent a larger index.
+  * @param x Value to look for
+  * @param array Array to search
+  * @return Index of the first match, or -1
+  */
+ public static byte find(int x, byte[] array){
+     //The counter is an int: a byte counter wraps to -128 after 127 and then indexes out of bounds.
+     final int limit=Math.min(array.length, Byte.MAX_VALUE+1);
+     for(int i=0; i<limit; i++){
+         if(array[i]==x){return (byte)i;}
+     }
+     return -1;
+ }
+
+ /** Reverses the order of the elements of array in place. */
+ public static void reverse(byte[] array){
+     int left=0;
+     int right=array.length-1;
+     while(left<right){
+         final byte swap=array[left];
+         array[left]=array[right];
+         array[right]=swap;
+         left++;
+         right--;
+     }
+ }
+
+
+ /** Returns the gene array for chrom, calling loadGenes(chrom) first when none is stored yet. */
+ public static Gene[] getGenes(int chrom){
+     final Gene[] cached=geneMatrix[chrom];
+     if(cached!=null){return cached;}
+     loadGenes(chrom);
+     return geneMatrix[chrom];
+ }
+
+
+ /** Returns the genes of chrom whose strand field equals strand, in the order getGenes(chrom) yields them. */
+ public static Gene[] getGenes(int chrom, byte strand){
+     final Gene[] all=getGenes(chrom);
+     final ArrayList<Gene> matching=new ArrayList<Gene>();
+     for(int i=0; i<all.length; i++){
+         final Gene candidate=all[i];
+         if(candidate.strand!=strand){continue;}
+         matching.add(candidate);
+     }
+     final Gene[] result=new Gene[matching.size()];
+     return matching.toArray(result);
+ }
+
+
+ /** Returns the gene-set array for chrom, calling loadGenes(chrom) first when none is stored yet. */
+ public static GeneSet[] getGeneSets(int chrom){
+     final GeneSet[] cached=geneSetMatrix[chrom];
+     if(cached!=null){return cached;}
+     loadGenes(chrom);
+     return geneSetMatrix[chrom];
+ }
+
+
+ /**
+  * Returns the table mapping a gene id to the gene sets carrying that id, building it
+  * on first use from getGeneSets(chrom) for chromosomes 1 through 25.
+  * The table is filled in a local map inside a synchronized(GENEIDLOCK) block and
+  * assigned to the static field only once complete.
+  * With assertions enabled, every gene within one gene set must share the same id.
+  * @return The gene-id table
+  */
+ public static HashMap<Integer, ArrayList<GeneSet>> getGeneIDTable(){
+     if(geneIDTable==null){
+         synchronized(GENEIDLOCK){
+             if(geneIDTable==null){
+                 HashMap<Integer, ArrayList<GeneSet>> temp=new HashMap<Integer, ArrayList<GeneSet>>(2048);
+                 for(byte chrom=1; chrom<=25; chrom++){
+                     GeneSet[] set=getGeneSets(chrom);
+                     for(GeneSet gs : set){
+                         //Take the id from the first gene; later genes are only checked against it.
+                         int id=-1;
+                         for(Gene g : gs.genes){
+                             if(id==-1){id=g.id;}
+                             else{assert(id==g.id) : gs+"\n"+gs.genes+"\n";}
+                         }
+                         assert(id>-1);
+                         //Integer.valueOf replaces the deprecated boxing constructor.
+                         Integer key=Integer.valueOf(id);
+                         ArrayList<GeneSet> value=temp.get(key);
+                         if(value==null){
+                             value=new ArrayList<GeneSet>(2);
+                             temp.put(key, value);
+                         }
+                         value.add(gs);
+                     }
+                 }
+                 geneIDTable=temp;
+             }
+         }
+     }
+     return geneIDTable;
+ }
+
+ /**
+  * Returns the chromosome array for chrom, loading it via loadChromosome(chrom) when
+  * the slot in chromosomePlusMatrix is still null.
+  * With assertions enabled, the matrix must exist and be long enough, and the first
+  * byte of the returned array must be 'N'.
+  */
+ public static ChromosomeArray getChromosome(int chrom){
+ assert(chromosomePlusMatrix!=null);
+ assert(chromosomePlusMatrix.length>chrom) : chromosomePlusMatrix.length+", "+chrom;
+ if(chromosomePlusMatrix[chrom]==null){
+ //The lock is chosen by chrom modulo the number of locks; the null test is repeated once the lock is held.
+ synchronized(CHROMLOCKS[chrom%CHROMLOCKS.length]){
+ if(chromosomePlusMatrix[chrom]==null){loadChromosome(chrom);}
+ }
+ }
+ assert(chromosomePlusMatrix[chrom].array[0]=='N') : (char)chromosomePlusMatrix[chrom].array[0]+
+ "\nIf you see this message, please regenerate your index.\n"/*+new String(chromosomePlusMatrix[chrom].array)*/;//startpad was too low or for some reason invalid.
+ return chromosomePlusMatrix[chrom];
+ }
+
+ /**
+  * Reads the gene array for chrom from disk and fills every per-chromosome gene table:
+  * the range matrices, the gene-name and transcript-name tables, the gene-set matrix
+  * and its ranges, and finally geneMatrix itself.
+  * Does nothing when geneMatrix[chrom] is already non-null.
+  */
+ private static void loadGenes(int chrom){
+
+ if(geneMatrix[chrom]!=null){return;} //In case another thread already loaded the chromosome
+ synchronized(CHROMLOCKS[chrom%CHROMLOCKS.length]){
+ if(geneMatrix[chrom]==null){
+
+ // Gene[] genes=FindExons.readGenes(ROOT_GENE+"ref/chr"+chrom+".Ref.Table", Gene.FORMAT_NM);
+ // Gene[] genes=FindExons.readGenes(ROOT_GENE+"ref2/ccds-chr"+chrom+"-genes.txt", Gene.FORMAT_CCDS);
+ // Gene[] genes=FindExons.readGenes(ROOT_GENE+"ref3/ccds-chr"+chrom+"-genes.txt", Gene.FORMAT_CCDS);
+
+ // Gene[] genes=ReadWrite.readArray(Gene.class, ROOT_GENE+"seqGene/chr"+chrom+".ga");
+
+ //The file path is built from ROOT_GENE, GENOME_BUILD, GENE_MAP and the chromosome number.
+ String fname=ROOT_GENE+"Build"+GENOME_BUILD+"/"+GENE_MAP+"/chr"+chrom+".ga";
+
+ Gene[] genes=ReadWrite.readArray(Gene.class, fname, true);
+
+ Arrays.sort(genes);
+// geneMatrix[chrom]=genes;
+
+ //Range tables derived from the sorted genes.
+ geneTxRangeMatrix[chrom]=findGeneRanges(genes, TX_RANGE);
+ geneCodeRangeMatrix[chrom]=findGeneRanges(genes, CODE_RANGE);
+ geneCodeAndExonRangeMatrix[chrom]=findCodeAndExonRanges(genes, false, true);
+ exonRangeMatrix[chrom]=findCodeAndExonRanges(genes, false, false);
+ geneNearbyRangeMatrix[chrom]=findCodeAndExonRanges(genes, true, true);
+
+ //temp groups genes by symbol; gntable maps that key to its GeneSet; tntable maps a transcript key to a Gene.
+ HashMap<String, ArrayList<Gene>> temp=new HashMap<String, ArrayList<Gene>>();
+ HashMap<String, GeneSet> gntable=new HashMap<String, GeneSet>();
+ HashMap<String, Gene> tntable=new HashMap<String, Gene>();
+
+ for(Gene g : genes){
+
+ //Transcript key: mrnaAcc, falling back to chromosome_id.
+ String trkey=g.mrnaAcc;
+ if(trkey==null){trkey=g.chromosome+"_"+g.id;}
+ //The fallback above is a string concatenation, so trkey is never null at this point.
+ if(trkey!=null){
+ Gene old=tntable.get(trkey);
+ if(old!=null){
+ // stdout.println("For transcript '"+g.nameTranscript+"': Overwrote \n"+old+"\nwith\n"+g+"\n");
+ }
+ //A later gene with the same key replaces the earlier one.
+ tntable.put(trkey, g);
+ }
+
+ //Grouping key: symbol, falling back to mrnaAcc (which may itself be null).
+ String key=g.symbol;
+ if(key==null){key=g.mrnaAcc;}
+ ArrayList<Gene> list=temp.get(key);
+ if(list==null){
+ list=new ArrayList<Gene>(8);
+ temp.put(key, list);
+ }
+ list.add(g);
+ }
+
+ //One GeneSet per grouping key.
+ GeneSet[] gsm=new GeneSet[temp.size()];
+ String[] keys=temp.keySet().toArray(new String[temp.size()]);
+ for(int i=0; i<keys.length; i++){
+ String key=keys[i];
+ ArrayList<Gene> list=temp.get(key);
+ GeneSet gs=new GeneSet(key, list);
+ gsm[i]=gs;
+ gntable.put(key, gs);
+ }
+
+ geneNameTable[chrom]=gntable;
+ transcriptNameTable[chrom]=tntable;
+ geneSetMatrix[chrom]=gsm;
+ Arrays.sort(geneSetMatrix[chrom]);
+
+ geneSetRangeMatrix[chrom]=findGeneSetRanges(geneSetMatrix[chrom]);
+
+ //geneMatrix is assigned last; the unsynchronized check at the top of this method tests it.
+ {
+ assert(geneMatrix[chrom]==null) : "Need to sync.";
+ geneMatrix[chrom]=genes;
+ }
+ }
+ }
+ }
+
+ /**
+  * Passes the file-name pattern for the current genome build, together with min, max
+  * and chromosomePlusMatrix, to ChromLoadThread.loadAll while holding the CHROMLOCKS monitor.
+  */
+ public static void loadChromosomes(int min, int max){
+     synchronized(CHROMLOCKS){
+         ChromLoadThread.loadAll(chromFname(GENOME_BUILD), min, max, chromosomePlusMatrix);
+     }
+ }
+
+ /**
+  * Reads the chromosome file for chrom in the current genome build and stores the
+  * result in chromosomePlusMatrix[chrom], printing the file name to sysout first.
+  * With assertions enabled, the slot must be empty beforehand and the loaded array's
+  * chromosome field must equal chrom.
+  */
+ private static void loadChromosome(int chrom){
+     assert(chromosomePlusMatrix[chrom]==null);
+
+     final String path=chromFname(chrom, GENOME_BUILD);
+     sysout.println("Loading "+path);
+
+     final ChromosomeArray loaded=ReadWrite.read(ChromosomeArray.class, path, false);
+     chromosomePlusMatrix[chrom]=loaded;
+     assert(loaded.chromosome==chrom);
+ }
+
+ /** Returns ".chrom", with ".gz" appended when CHROMGZ is true. */
+ public static final String chromExtension(){
+     if(CHROMGZ){
+         return ".chrom"+".gz";
+     }
+     return ".chrom"+"";
+ }
+
+ /** Returns the chromosome file path: ROOT_GENOME, genome number, "/chr", chromosome number, extension. */
+ public static final String chromFname(int chrom, int genome){
+     final String directory=ROOT_GENOME+genome;
+     return directory+"/chr"+chrom+chromExtension();
+ }
+
+ /** Returns the chromosome file-name pattern for a genome, with '#' in place of the chromosome number. */
+ public static final String chromFname(int genome){
+     final String directory=ROOT_GENOME+genome;
+     return directory+"/chr#"+chromExtension();
+ }
+
+ /**
+  * Walks genes in array order and merges ranges that touch into a single Range.
+  * The span of each gene is txStart..txStop for TX_RANGE or codeStart..codeStop for
+  * CODE_RANGE; genes whose stop is below their start are skipped.
+  * Each returned Range carries, in obj1, a Gene[] of the genes merged into it.
+  * @param genes Genes to process
+  * @param mode TX_RANGE or CODE_RANGE
+  * @return The merged ranges
+  * @throws RuntimeException if a gene is processed with any other mode
+  */
+ public static Range[] findGeneRanges(Gene[] genes, final int mode){
+
+     final ArrayList<Range> merged=new ArrayList<Range>(8192);
+     final ArrayList<Gene> members=new ArrayList<Gene>(64);
+
+     Range open=null;
+     for(Gene g : genes){
+         final int start, stop;
+         if(mode==TX_RANGE){
+             start=g.txStart;
+             stop=g.txStop;
+         }else if(mode==CODE_RANGE){
+             start=g.codeStart;
+             stop=g.codeStop;
+         }else{
+             throw new RuntimeException();
+         }
+
+         if(stop<start){continue;}
+         final Range r=new Range(start, stop);
+
+         if(open==null){
+             open=r;
+         }else if(open.touches(r)){
+             open=open.merge(r);
+         }else{
+             //Close the finished range before starting a new one.
+             open.obj1=members.toArray(new Gene[members.size()]);
+             merged.add(open);
+             members.clear();
+             open=r;
+         }
+         members.add(g);
+     }
+
+     if(open!=null){ //i.e., if there were any genes
+         open.obj1=members.toArray(new Gene[members.size()]);
+         merged.add(open);
+     }
+
+     return merged.toArray(new Range[merged.size()]);
+ }
+
+ /**
+  * Walks the gene sets in array order and merges ranges that touch into a single Range.
+  * The span of each gene set is minStart-NEAR..maxEnd+NEAR; sets whose end falls below
+  * their start are skipped.
+  * Each returned Range carries, in obj1, a GeneSet[] of the sets merged into it.
+  * @param genes Gene sets to process
+  * @return The merged ranges
+  */
+ public static Range[] findGeneSetRanges(GeneSet[] genes){
+
+     final ArrayList<Range> merged=new ArrayList<Range>(8192);
+     final ArrayList<GeneSet> members=new ArrayList<GeneSet>(64);
+
+     Range open=null;
+     for(GeneSet gs : genes){
+         final int start=gs.minStart-NEAR;
+         final int stop=gs.maxEnd+NEAR;
+         if(stop<start){continue;}
+
+         final Range r=new Range(start, stop);
+         if(open==null){
+             open=r;
+         }else if(open.touches(r)){
+             open=open.merge(r);
+         }else{
+             //Close the finished range before starting a new one.
+             open.obj1=members.toArray(new GeneSet[members.size()]);
+             merged.add(open);
+             members.clear();
+             open=r;
+         }
+         members.add(gs);
+     }
+
+     if(open!=null){ //i.e., if there were any genes
+         open.obj1=members.toArray(new GeneSet[members.size()]);
+         merged.add(open);
+     }
+
+     return merged.toArray(new Range[merged.size()]);
+ }
+
+
+ /**
+  * Builds one Range per exon of every gene, sorts them, and merges ranges that touch.
+  * @param genes Genes whose exons are processed
+  * @param nearby When true, each exon range is widened by NEAR on both sides
+  * @param codingOnly When true, each exon is first clipped to codeStart..codeStop of its gene
+  * @return Sorted merged ranges; obj1 of each holds a Gene[] of the genes that contributed to it
+  */
+ public static Range[] findCodeAndExonRanges(Gene[] genes, boolean nearby, boolean codingOnly){
+
+     //Pass 1: one range per exon, tagged with its gene.
+     final ArrayList<Range> perExon=new ArrayList<Range>(32768);
+     for(Gene g : genes){
+         for(Exon ex : g.exons){
+
+             int start=ex.a;
+             int stop=ex.b;
+             if(codingOnly){
+                 start=max(ex.a, g.codeStart);
+                 stop=min(ex.b, g.codeStop);
+             }
+
+             assert(ex.a<=ex.b);
+             assert(g.codeStart<=g.codeStop+1) : g;
+
+             if(nearby){
+                 start-=NEAR;
+                 stop+=NEAR;
+             }
+
+             if(start>stop){continue;}
+             final Range r=new Range(start, stop);
+             r.obj1=g;
+             perExon.add(r);
+         }
+     }
+
+     final ArrayList<Range> merged=new ArrayList<Range>(perExon.size());
+     Collections.sort(perExon);
+
+     //Pass 2: merge touching ranges, collecting the distinct genes of each merged range.
+     final HashSet<Gene> owners=new HashSet<Gene>(64);
+     Range open=null;
+     for(Range r : perExon){
+         if(open==null){
+             owners.add((Gene)r.obj1);
+             open=r;
+         }else if(open.touches(r)){
+             owners.add((Gene)r.obj1);
+             open=open.merge(r);
+         }else{
+             open.obj1=owners.toArray(new Gene[owners.size()]);
+             merged.add(open);
+             owners.clear();
+             owners.add((Gene)r.obj1);
+             open=r;
+         }
+     }
+
+     if(open!=null){
+         open.obj1=owners.toArray(new Gene[owners.size()]);
+         merged.add(open);
+         Collections.sort(merged);
+     }
+
+     return merged.toArray(new Range[merged.size()]);
+ }
+
+ /** Returns the gene-set ranges for chrom, calling loadGenes(chrom) first when none are stored yet. */
+ public static Range[] geneSetRangeMatrix(int chrom){
+     Range[] cached=geneSetRangeMatrix[chrom];
+     if(cached==null){
+         loadGenes(chrom);
+         cached=geneSetRangeMatrix[chrom];
+     }
+     assert(cached!=null);
+     return cached;
+ }
+
+ /** Returns the exon ranges for chrom, calling loadGenes(chrom) first when none are stored yet. */
+ public static Range[] exonRangeMatrix(int chrom){
+     Range[] cached=exonRangeMatrix[chrom];
+     if(cached==null){
+         loadGenes(chrom);
+         cached=exonRangeMatrix[chrom];
+     }
+     assert(cached!=null);
+     return cached;
+ }
+
+ /** Returns the coding-exon ranges for chrom, calling loadGenes(chrom) first when none are stored yet. */
+ public static Range[] geneCodeAndExonRangeMatrix(int chrom){
+     Range[] cached=geneCodeAndExonRangeMatrix[chrom];
+     if(cached==null){
+         loadGenes(chrom);
+         cached=geneCodeAndExonRangeMatrix[chrom];
+     }
+     assert(cached!=null);
+     return cached;
+ }
+
+ /** Returns the NEAR-widened coding-exon ranges for chrom, calling loadGenes(chrom) first when none are stored yet. */
+ public static Range[] geneNearbyRangeMatrix(int chrom){
+     Range[] cached=geneNearbyRangeMatrix[chrom];
+     if(cached==null){
+         loadGenes(chrom);
+         cached=geneNearbyRangeMatrix[chrom];
+     }
+     assert(cached!=null);
+     return cached;
+ }
+
+ /** Returns the name-to-GeneSet table for chrom, calling loadGenes(chrom) first when none is stored yet. */
+ public static HashMap<String, GeneSet> geneNameTable(int chrom){
+     HashMap<String, GeneSet> cached=geneNameTable[chrom];
+     if(cached==null){
+         loadGenes(chrom);
+         cached=geneNameTable[chrom];
+     }
+     assert(cached!=null);
+     return cached;
+ }
+
+ /** Returns the transcript-key-to-Gene table for chrom, calling loadGenes(chrom) first when none is stored yet. */
+ public static HashMap<String, Gene> transcriptNameTable(int chrom){
+     HashMap<String, Gene> cached=transcriptNameTable[chrom];
+     if(cached==null){
+         loadGenes(chrom);
+         cached=transcriptNameTable[chrom];
+     }
+     assert(cached!=null);
+     return cached;
+ }
+
+
+ /**
+  * Returns the gene sets attached to the range that driver.Search.findPointBinary
+  * selects for loc among the gene-set ranges of chrom.
+  * When that range carries no gene sets, an assertion fails; with assertions
+  * disabled, null is returned.
+  */
+ public static GeneSet[] getNearestGeneSets(int chrom, int loc){
+     final Range[] ranges=geneSetRangeMatrix(chrom);
+     final int hit=driver.Search.findPointBinary(loc, ranges);
+     final GeneSet[] found=(GeneSet[])ranges[hit].obj1;
+     if(found!=null && found.length!=0){
+         return found;
+     }
+     assert(false);
+     return null;
+ }
+
+ /**
+  * Returns gene sets for the interval loc1..loc2 on chrom.
+  * Returns null when the chromosome has no gene-set ranges.  Otherwise, the range that
+  * driver.Search.findPointBinary selects for loc1 is returned alone when it extends to
+  * loc2, is the last range, or the following range starts after loc2; in the remaining
+  * case the gene sets of every range overlapping loc1..loc2 are concatenated.
+  */
+ public static GeneSet[] getNearestGeneSets(int chrom, int loc1, int loc2){
+ assert(loc2>=loc1);
+
+// boolean flag=(chrom==21 && loc1<38540895 && loc2>38540895);//TODO UNDO
+//
+// if(flag){
+// stdout.println(loc1+", "+loc2+", "+((loc1+loc2)/2));
+// for(GeneSet gs : Data.geneNameTable[chrom].values()){
+// if(gs.intersects(loc1, loc2)){
+// stdout.println("%%% "+gs);
+// }
+// }
+// }
+
+ Range[] ranges=geneSetRangeMatrix(chrom);
+ if(ranges==null || ranges.length==0){return null;}
+ int index=driver.Search.findPointBinary(loc1, ranges);
+
+
+// if(flag){
+// Range r0=ranges[index-1];
+// Range r1=ranges[index];
+// Range r2=ranges[index+1];
+//
+// stdout.println("r0: "+r0+"\n"+Arrays.toString((GeneSet[])r0.obj1)+"\n");
+// stdout.println("r1: "+r1+"\n"+Arrays.toString((GeneSet[])r1.obj1)+"\n");
+// stdout.println("r2: "+r2+"\n"+Arrays.toString((GeneSet[])r2.obj1)+"\n");
+//
+// }
+// if(flag){stdout.println("c");}
+
+ //r1 is the selected range; r2 is the range after it, or null when r1 is the last one.
+ Range r1=ranges[index];
+ Range r2=(index>=ranges.length-1 ? null : ranges[index+1]);
+
+ //Only r1 is relevant if it reaches loc2 or the next range begins after loc2.
+ if(ranges[index].b>=loc2 || r2==null || r2.a>loc2){
+ return (GeneSet[])r1.obj1;
+ }
+
+//// if(flag){stdout.println("e");}
+// if(ranges[index].b>=loc2 || (index==ranges.length-1) || ranges[index+1].a>loc2){
+//// if(flag){
+//// stdout.println("f");
+//// stdout.println(ranges[index].b<=loc2);
+//// stdout.println((index==ranges.length-1));
+//// stdout.println(ranges[index+1].a>loc2);
+//// stdout.println(".......");
+//// }
+// return sets1;
+// }
+
+ //NOTE(review): this branch cannot execute. The return above already covers r2.a>loc2, so loc2<r2.a is false here.
+ //NOTE(review): the comparison also looks inverted for "closer" (dist1>=dist2 returns r1, the farther range) - confirm before reviving this code.
+ if(loc1>r1.b && loc2<r2.a){
+ //No-man's land: Return closer of the bounding ranges.
+ int dist1=loc1-r1.b;
+ int dist2=r2.a-loc2;
+ if(dist1>=dist2){
+ return (GeneSet[])r1.obj1;
+ }else{
+ return (GeneSet[])r2.obj1;
+ }
+ }
+
+// assert(false) : "Test: This should be very rare, since it is slow.";
+
+ //Otherwise, return all overlapping ranges.
+ ArrayList<GeneSet> list=new ArrayList<GeneSet>(4);
+
+ while(index<ranges.length && ranges[index].b<loc1){index++;} //Spin until in range
+ //Collect the gene sets of every range that starts at or before loc2.
+ for(; index<ranges.length && loc2>=ranges[index].a; index++){
+// if(flag){stdout.println("ADDED RANGE "+ranges[index]);}
+ GeneSet[] gsa=(GeneSet[]) ranges[index].obj1;
+ for(GeneSet gs : gsa){list.add(gs);}
+ }
+ return list.toArray(new GeneSet[list.size()]);
+ }
+
+
+ /**
+  * Returns the result of Search.containsPointBinary(point, ranges, thresh), where ranges
+  * are the coding-exon ranges of chrom when isCoding is true and the exon ranges otherwise.
+  */
+ public static boolean isExonic(byte chrom, int point, int thresh, boolean isCoding){
+     final Range[] ranges;
+     if(isCoding){
+         ranges=Data.geneCodeAndExonRangeMatrix(chrom);
+     }else{
+         ranges=Data.exonRangeMatrix(chrom);
+     }
+     return Search.containsPointBinary(point, ranges, thresh);
+ }
+
+
+ /**
+  * Prepends symbol to num repeatedly until the result is at least width characters long.
+  * A string that is already long enough is returned as is.
+  */
+ public static final String padFront(String num, int width, String symbol){
+     String padded=num;
+     for(; padded.length()<width; padded=symbol+padded){
+         //Each pass adds one copy of symbol at the front.
+     }
+     return padded;
+ }
+
+ /** Returns Long.toBinaryString(num), left-padded with '0' characters to at least width characters. */
+ public static final String toBinaryString(long num, int width){
+     final String digits=Long.toBinaryString(num);
+     final StringBuilder padded=new StringBuilder();
+     for(int missing=width-digits.length(); missing>0; missing--){
+         padded.append('0');
+     }
+     return padded.append(digits).toString();
+ }
+
+ /** Formats a 2D array as a bracketed block with one Arrays.toString row per line, each followed by a comma. */
+ public static final String toString(double[][] a){
+     final StringBuilder out=new StringBuilder(256);
+     out.append("[\n");
+     for(int row=0; row<a.length; row++){
+         out.append(" ");
+         out.append(Arrays.toString(a[row]));
+         out.append(",\n");
+     }
+     return out.append("]").toString();
+ }
+
+ /**
+  * Formats an Iterable as a bracketed list, descending into elements that are
+  * themselves Iterable.  A nested Iterable is followed by a newline; any other
+  * element (including null, rendered as "null") is followed by ", ".
+  * @param a Collection to format; may be null
+  * @return "null" for a null argument, otherwise the formatted list
+  */
+ public static final <X> String toStringRecursive(Iterable<X> a){
+     if(a==null){return "null";}
+     StringBuilder sb=new StringBuilder(256);
+     sb.append("[");
+     for(X x : a){
+         //Recurse on the element, not on the collection itself; recursing on a never terminates.
+         if(x instanceof Iterable<?>){
+             sb.append(toStringRecursive((Iterable<?>)x));
+             sb.append("\n");
+         }else{
+             sb.append(String.valueOf(x));
+             sb.append(", ");
+         }
+     }
+     sb.append("]");
+     return sb.toString();
+ }
+
+ /**
+  * Returns the name-to-id table.  When it is still null, geneIdToNameTable() is called,
+  * which fills both tables.
+  */
+ public static final HashMap<String, Integer> geneNameToIdTable(){
+     if(geneNameToIdTable!=null){
+         return geneNameToIdTable;
+     }
+     geneIdToNameTable();
+     assert(geneIdToNameTable!=null);
+     assert(geneNameToIdTable!=null);
+     return geneNameToIdTable;
+ }
+
+ /**
+  * Returns the id-to-name table, building it and its inverse (geneNameToIdTable) on
+  * first use from the tab-separated file ROOT_GENE+"gene_names_37.1.txt".
+  * Lines starting with '#' and lines with fewer than three fields are skipped.
+  * Field 1 is parsed as the integer id; field 2 is the name, with field 1 used when
+  * field 2 is empty.
+  * @throws RuntimeException wrapping the NumberFormatException when field 1 is not an integer
+  */
+ public static final HashMap<Integer, String> geneIdToNameTable(){
+     if(geneIdToNameTable!=null){
+         return geneIdToNameTable;
+     }
+
+     synchronized(GENEIDLOCK){
+         if(geneIdToNameTable==null){
+
+             final TextFile source=new TextFile(ROOT_GENE+"gene_names_37.1.txt", false, false);
+             final String[] lines=source.toStringLines();
+             source.close();
+
+             final HashMap<Integer, String> byId=new HashMap<Integer, String>((lines.length*3)/2);
+             for(String s : lines){
+                 if(s.startsWith("#")){continue;}
+                 final String[] fields=s.split("\t", -1);
+                 if(fields.length<3){continue;}
+
+                 int key=-1;
+                 try {
+                     key=Integer.parseInt(fields[1]);
+                 } catch (NumberFormatException e) {
+                     System.err.println("Bad line: "+s);
+                     throw new RuntimeException(e);
+                 }
+
+                 final boolean unnamed=(fields[2]==null || fields[2].length()==0);
+                 byId.put(key, unnamed ? fields[1] : fields[2]);
+             }
+
+             //The id table is assigned before the inverse table is built.
+             geneIdToNameTable=byId;
+
+             final HashMap<String, Integer> byName=new HashMap<String, Integer>((lines.length*3)/2);
+             for(Map.Entry<Integer, String> entry : byId.entrySet()){
+                 byName.put(entry.getValue(), entry.getKey());
+             }
+             geneNameToIdTable=byName;
+         }
+     }
+     return geneIdToNameTable;
+ }
+
+
+ /**
+  * Returns the chain lines for converting between builds 36 and 37, loading the
+  * corresponding file under ROOT_CHAIN the first time each direction is requested.
+  * @throws RuntimeException for any other from/to pair
+  */
+ public static ChainLine[][] getChainLines(int from, int to){
+     final boolean forward=(from==36 && to==37);
+     final boolean backward=(from==37 && to==36);
+
+     if(forward){
+         if(chains36to37==null){
+             chains36to37=ChainBlock.loadChainLines(ROOT_CHAIN+"hg18ToHg19.over.chain");
+         }
+         return chains36to37;
+     }
+     if(backward){
+         if(chains37to36==null){
+             chains37to36=ChainBlock.loadChainLines(ROOT_CHAIN+"hg19ToHg18.over.chain");
+         }
+         return chains37to36;
+     }
+     throw new RuntimeException("Unknown chain file: "+from+" -> "+to);
+ }
+
+
+ /** Returns "null" for a null argument and a.toString() otherwise. */
+ public static final String toStringRecursive(Object a){
+     if(a==null){
+         return "null";
+     }
+     return a.toString();
+ }
+
+ /** Same as isBaited(v, 0). */
+ public static boolean isBaited(Variation v){
+     final int noExtraMargin=0;
+     return isBaited(v, noExtraMargin);
+ }
+
+ /**
+  * Tests the midpoint of the variation against the baits of its chromosome, using a
+  * threshold of half the variation's length plus thresh.
+  */
+ public static boolean isBaited(Variation v, int thresh){
+     final int span=v.endLoc-v.beginLoc+1;
+     final int center=(v.beginLoc+v.endLoc)/2;
+     final int margin=span/2+thresh;
+     return isBaited(v.chromosome, center, margin);
+ }
+
+ /**
+  * Tests point against the baits stored for chrom.  The BAITS table is read with
+  * ReadWrite.readObject the first time this method runs with BAITS still null.
+  */
+ public static boolean isBaited(int chrom, int point, int thresh){
+     if(BAITS==null){
+         final String path="UNDEFINED_ROOT"+"baits_"+"BAIT_FILE"+"_build"+GENOME_BUILD+".int3d";
+         BAITS=(int[][][]) ReadWrite.readObject(path, false);
+     }
+     final int[][] chromBaits=BAITS[chrom];
+     return isBaited(point, chromBaits, thresh);
+ }
+
+ /**
+  * Is this point within "thresh" of a bait?
+  * @param point Location to test
+  * @param baits baits[0] holds bait starts and baits[1] bait stops; baits[1] is searched
+  * with Arrays.binarySearch, so it is presumably sorted ascending - TODO confirm
+  * @param thresh Distance allowed on either side of point
+  * @return true if point equals a bait stop, or if point-thresh..point+thresh overlaps a bait
+  */
+ private static boolean isBaited(int point, int[][] baits, int thresh){
+
+     if(baits==null || baits[0].length==0){return false;}
+
+     int[] starts=baits[0];
+     int[] stops=baits[1];
+     int index=Arrays.binarySearch(stops, point);
+
+     if(index>=0){return true;} //Hit inside a bait
+     index=(-index)-1; //Insertion point: first bait whose stop is above point
+
+     if(index>=stops.length){return point<=(stops[stops.length-1]+thresh);}
+
+     final int a=point-thresh;
+     final int b=point+thresh;
+
+     if(overlap(a, b, starts[index], stops[index])){return true;}
+     //Later baits, while they start at or before b
+     for(int i=index+1; i<starts.length && b>=starts[i]; i++){
+         if(overlap(a, b, starts[i], stops[i])){return true;}
+     }
+     //Earlier baits, while they stop at or after a.  This scan must step downward;
+     //stepping upward re-tests later baits and can run past the end of the arrays.
+     for(int i=index-1; i>=0 && a<=stops[i]; i--){
+         if(overlap(a, b, starts[i], stops[i])){return true;}
+     }
+     return false;
+ }
+
+ /** Returns true if the closed intervals a1..b1 and a2..b2 share at least one point. */
+ private static boolean overlap(int a1, int b1, int a2, int b2){
+     assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+     return a2<=b1 && b2>=a1;
+ }
+
+
+ /**
+  * Selects genome build g.  Does nothing when g is already selected, and calls
+  * setGenome2(g) when no genome has been selected yet.
+  * @throws RuntimeException if a different genome is already selected
+  */
+ public static final synchronized void setGenome(int g){
+     assert(g>0) : "Genome build number must be at least 1.";
+     if(genome_set_to==g){return;}
+     if(genome_set_to>=0){
+         throw new RuntimeException("Changing genomes is not currently supported.");
+     }
+     setGenome2(g);
+ }
+
+ /**
+  * Loads the metadata of genome build g into the static fields of this class.
+  * Steps, in order:
+  * 1) reset the counters and read the summary (from FastaToChromArrays2.SUMMARY_LIST when
+  * the build equals LISTBUILD and the list is non-null, otherwise from summary.txt);
+  * 2) if the summary version is older than the current version, regenerate the info files and start over;
+  * 3) allocate the per-chromosome arrays and read the per-chromosome info (INFO_LIST or info.txt);
+  * 4) fill the scaffold name/location/length tables (SCAF_LIST or scaffolds.txt.gz), or
+  * synthesize one "chr"+i scaffold per chromosome when scaffolds are not loaded.
+  */
+ private static final synchronized void setGenome2(int g){
+ assert(genome_set_to!=g);
+ GENOME_BUILD=g;
+ genome_set_to=g;
+ //-1 marks a value not yet read from the summary.
+ numChroms=-1;
+ numBases=-1;
+ numDefinedBases=-1;
+ numContigs=-1;
+ numScaffolds=-1;
+ name=null;
+ genomeSource=null;
+ scaffoldPrefixes=false;
+ long fastabytes=-1;
+ long fastatime=-1;
+ final int currentVersion=FastaToChromArrays2.currentVersion();
+ int version=0;
+
+ //Summary: tab-separated key/value lines; lines starting with '#' are control lines.
+ if(GENOME_BUILD==FastaToChromArrays2.LISTBUILD && FastaToChromArrays2.SUMMARY_LIST!=null){
+ for(int i=0; i<FastaToChromArrays2.SUMMARY_LIST.size(); i++){
+ final String s=FastaToChromArrays2.SUMMARY_LIST.get(i);
+ //Each list entry is nulled as soon as it has been fetched.
+ FastaToChromArrays2.SUMMARY_LIST.set(i, null);
+ if(s.charAt(0)=='#'){
+ if(s.startsWith("#Version")){
+ String[] split=s.split("\t");
+ version=(split.length>1 ? Integer.parseInt(split[1]) : 0);
+ }
+ }else{
+ String[] split=s.split("\t");
+ String a=split[0];
+ String b=split[1];
+ if(a.equalsIgnoreCase("chroms")){numChroms=(int)Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("bases")){numBases=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("defined")){numDefinedBases=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("contigs")){numContigs=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("scaffolds")){numScaffolds=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("interpad")){interScaffoldPadding=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("undefined")){}
+ else if(a.equalsIgnoreCase("name")){name=b;}
+ else if(a.equalsIgnoreCase("source")){genomeSource=b;}
+ else if(a.equalsIgnoreCase("bytes")){fastabytes=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("last modified")){fastatime=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("scafprefixes")){scaffoldPrefixes=Tools.parseBoolean(b);}
+ else{assert(version<currentVersion) : "In array: Unknown term "+s;}
+ }
+ }
+ FastaToChromArrays2.SUMMARY_LIST=null;
+ }else{
+ //Same parsing as above, reading summary.txt line by line.
+ String s;
+ TextFile tf=new TextFile(ROOT_GENOME+GENOME_BUILD+"/summary.txt", false, false);
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ if(s.charAt(0)=='#'){
+ if(s.startsWith("#Version")){
+ String[] split=s.split("\t");
+ version=(split.length>1 ? Integer.parseInt(split[1]) : 0);
+ }
+ }else{
+ String[] split=s.split("\t");
+ String a=split[0];
+ String b=split[1];
+ if(a.equalsIgnoreCase("chroms")){numChroms=(int)Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("bases")){numBases=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("defined")){numDefinedBases=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("contigs")){numContigs=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("scaffolds")){numScaffolds=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("interpad")){interScaffoldPadding=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("undefined")){}
+ else if(a.equalsIgnoreCase("name")){name=b;}
+ else if(a.equalsIgnoreCase("source")){genomeSource=b;}
+ else if(a.equalsIgnoreCase("bytes")){fastabytes=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("last modified")){fastatime=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("scafprefixes")){scaffoldPrefixes=Tools.parseBoolean(b);}
+ else{assert(version<currentVersion) : "In file "+tf.name+": Unknown term "+s;}
+ }
+ }
+ tf.close();
+ }
+ //No "scaffolds" entry was read: fall back to the chromosome count.
+ if(numScaffolds==-1){numScaffolds=numChroms;}
+
+ //Outdated index: with assertions enabled this stops here; otherwise the info files are
+ //deleted and rewritten, and this method calls itself again.
+ if(version<currentVersion){
+ assert(false) : "The index format has changed in this version of BBTools. Please delete the /ref/ directory and re-index from the reference fasta," +
+ " or use an older version of BBTools.";
+ if(new File(ROOT_GENOME+GENOME_BUILD+"/info.txt").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/info.txt").delete();}
+ if(new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt").delete();}
+ if(new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz").delete();}
+ sysout.println("Regenerating genome info in new format, version "+currentVersion+".");
+ dna.FastaToChromArrays2.writeInfo(GENOME_BUILD, numChroms, name, genomeSource, true, scaffoldPrefixes);
+ //Reset so that the assertion at the top of this method passes on re-entry.
+ genome_set_to=-1;
+ setGenome2(g);
+ return;
+ }
+
+ assert(numChroms>0 || allowZeroSizedGenome) : "Genome "+g+": numChroms="+numChroms;
+ assert(numBases>0 || allowZeroSizedGenome) : "Genome "+g+": numBases="+numBases;
+ assert(numDefinedBases>0 || allowZeroSizedGenome) : "Genome "+g+": numDefinedBases="+numDefinedBases;
+ assert(numBases>=numDefinedBases) : "Genome "+g+": numBases>numDefinedBases : "+numBases+">"+numDefinedBases;
+
+ //All per-chromosome arrays have numChroms+1 slots and are indexed by chromosome number.
+ chromosomePlusMatrix=new ChromosomeArray[numChroms+1];
+ chromLengths=new int[numChroms+1];
+ chromDefinedBases=new int[numChroms+1];
+ chromUndefinedBases=new int[numChroms+1];
+ chromContigs=new int[numChroms+1];
+ chromStartPad=new int[numChroms+1];
+ chromScaffolds=new int[numChroms+1];
+
+ scaffoldNames=new byte[numChroms+1][][];
+ scaffoldLocs=new int[numChroms+1][];
+ scaffoldLengths=new int[numChroms+1][];
+
+ //Per-chromosome info columns: chrom, scaffolds, contigs, length, defined, undefined, startPad.
+ if(GENOME_BUILD==FastaToChromArrays2.LISTBUILD && FastaToChromArrays2.INFO_LIST!=null){
+ for(int i=0; i<FastaToChromArrays2.INFO_LIST.size(); i++){
+ final String s=FastaToChromArrays2.INFO_LIST.get(i);
+ FastaToChromArrays2.INFO_LIST.set(i, null);
+ if(s.charAt(0)=='#'){
+ if(s.startsWith("#Version")){
+ String[] split=s.split("\t");
+ version=(split.length>1 ? Integer.parseInt(split[1]) : 0);
+ }
+ }else{
+ assert(version==currentVersion);
+ String[] split=s.split("\t");
+ int chrom=Integer.parseInt(split[0]);
+ chromScaffolds[chrom]=Integer.parseInt(split[1]);
+ chromContigs[chrom]=(split.length>2 ? Integer.parseInt(split[2]) : -1);
+ chromLengths[chrom]=Integer.parseInt(split[3]);
+ chromDefinedBases[chrom]=Integer.parseInt(split[4]);
+ chromUndefinedBases[chrom]=(split.length>5 ? Integer.parseInt(split[5]) : -1);
+ chromStartPad[chrom]=(split.length>6 ? Integer.parseInt(split[6]) : -1);
+ // chromStopPad[chrom]=(split.length>7 ? Integer.parseInt(split[7]) : -1);
+
+ }
+ }
+ FastaToChromArrays2.INFO_LIST=null;
+ }else{
+ String s;
+ TextFile tf=new TextFile(ROOT_GENOME+GENOME_BUILD+"/info.txt", false, false);
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ if(s.charAt(0)=='#'){
+ if(s.startsWith("#Version")){
+ String[] split=s.split("\t");
+ version=(split.length>1 ? Integer.parseInt(split[1]) : 0);
+ }
+ }else{
+
+ if(version>=currentVersion){
+ String[] split=s.split("\t");
+ int chrom=Integer.parseInt(split[0]);
+ chromScaffolds[chrom]=Integer.parseInt(split[1]);
+ chromContigs[chrom]=(split.length>2 ? Integer.parseInt(split[2]) : -1);
+ chromLengths[chrom]=Integer.parseInt(split[3]);
+ chromDefinedBases[chrom]=Integer.parseInt(split[4]);
+ chromUndefinedBases[chrom]=(split.length>5 ? Integer.parseInt(split[5]) : -1);
+ chromStartPad[chrom]=(split.length>6 ? Integer.parseInt(split[6]) : -1);
+// chromStopPad[chrom]=(split.length>7 ? Integer.parseInt(split[7]) : -1);
+ }else{
+ //Old info.txt: close it, rewrite the info files, and reopen the new info.txt; the loop continues reading from the reopened file.
+ tf.close();
+ if(new File(ROOT_GENOME+GENOME_BUILD+"/info.txt").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/info.txt").delete();}
+ if(new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt").delete();}
+ if(new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz").delete();}
+ sysout.println("Regenerating genome info in new format.");
+ dna.FastaToChromArrays2.writeInfo(GENOME_BUILD, numChroms, name, genomeSource, true, scaffoldPrefixes);
+ tf=new TextFile(ROOT_GENOME+GENOME_BUILD+"/info.txt", false, false);
+ }
+ }
+
+ }
+
+ tf.close();
+ }
+
+ String fname=ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz";
+ boolean hasList=(GENOME_BUILD==FastaToChromArrays2.LISTBUILD && FastaToChromArrays2.SCAF_LIST!=null);
+
+ //Without scaffold data, each chromosome gets a single scaffold named "chr"+i.
+ if(!LOAD_SCAFFOLDS || (!hasList && !new File(fname).exists())){
+ for(int i=0; i<scaffoldNames.length; i++){
+ scaffoldNames[i]=new byte[][] {("chr"+i).getBytes()};
+ scaffoldLocs[i]=new int[] {chromStartPad[i]<0 ? 0 : chromStartPad[i]};
+ scaffoldLengths[i]=new int[] {chromLengths[i]};
+ if(!LOAD_SCAFFOLDS){chromScaffolds[i]=1;}
+
+ assert(chromScaffolds[i]==1) : "This appears to be an old index version. " +
+ "\nPlease regenerate it from the fasta file by rerunning this program,\nusing the ref=<reference file> and overwrite=true flags.\n"+i+", "+chromScaffolds[i];
+ }
+ }else{
+ //Size the scaffold arrays of each chromosome from its scaffold count.
+ for(int chrom=0; chrom<scaffoldNames.length; chrom++){
+ int num=chromScaffolds[chrom];
+ assert(chrom==0 || num>=1) : chrom+", "+num+", "+Arrays.toString(chromScaffolds);
+ if(num>0){
+ scaffoldNames[chrom]=new byte[num][];
+ scaffoldLocs[chrom]=new int[num];
+ scaffoldLengths[chrom]=new int[num];
+ }
+ }
+ //count[chrom] is the next free scaffold slot of that chromosome.
+ int[] count=new int[numChroms+1];
+
+ if(hasList){
+
+ if(verbose){System.err.println("Fetching scaffold names from list:\n\n"+FastaToChromArrays2.SCAF_LIST+"\n\n");}
+
+ for(int i=0; i<FastaToChromArrays2.SCAF_LIST.size(); i++){
+ final String s=FastaToChromArrays2.SCAF_LIST.get(i);
+
+ if(verbose){System.err.println("Processing "+s);}
+
+ FastaToChromArrays2.SCAF_LIST.set(i, null);
+ if(s.charAt(0)=='#'){
+ if(verbose){System.err.println("Control string");}
+ if(s.startsWith("#Version")){
+ assert(version==currentVersion) : "Wrong index version; please delete /ref/genome/\n"+version+", "+currentVersion;
+// String[] split=s.split("\t");
+// version=(split.length>1 ? Integer.parseInt(split[1]) : 0);
+// assert(version==currentVersion) : "Wrong version: "+version+", "+currentVersion;
+ }
+ }else{
+ //Scaffold columns: chrom, scaffold id, location, length, name.
+ String[] split=s.split("\t");
+ if(verbose){System.err.println("Split into "+Arrays.toString(split));}
+ int chrom=Integer.parseInt(split[0]);
+ int x=count[chrom];
+ count[chrom]++;
+
+ //scaffoldID is parsed but not used afterwards.
+ int scaffoldID=Integer.parseInt(split[1]);
+ scaffoldLocs[chrom][x]=Integer.parseInt(split[2]);
+ scaffoldLengths[chrom][x]=Integer.parseInt(split[3]);
+ scaffoldNames[chrom][x]=split[4].getBytes();
+ if(verbose){System.err.println("Set scaffoldNames["+chrom+"]["+x+" to "+(scaffoldNames[chrom][x]==null ? "null" : new String(scaffoldNames[chrom][x])));}
+ }
+ }
+ FastaToChromArrays2.SCAF_LIST=null;
+ }else{
+
+ //Same parsing as the list branch, reading scaffolds.txt.gz line by line.
+ String s;
+ TextFile tf=new TextFile(fname, false, false);
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ if(s.charAt(0)=='#'){
+ if(s.startsWith("#Version")){
+ assert(version==currentVersion) : "Wrong index version; please delete /ref/genome/\n"+version+", "+currentVersion;
+// String[] split=s.split("\t");
+// version=(split.length>1 ? Integer.parseInt(split[1]) : 0);
+// assert(version==currentVersion) : "Wrong version: "+version+", "+currentVersion;
+ }
+ }else{
+ String[] split=s.split("\t");
+ int chrom=Integer.parseInt(split[0]);
+ int x=count[chrom];
+ count[chrom]++;
+
+ //scaffoldID is parsed but not used afterwards.
+ int scaffoldID=Integer.parseInt(split[1]);
+ scaffoldLocs[chrom][x]=Integer.parseInt(split[2]);
+ scaffoldLengths[chrom][x]=Integer.parseInt(split[3]);
+ scaffoldNames[chrom][x]=split[4].getBytes();
+ }
+
+ }
+
+ tf.close();
+ }
+ }
+
+// assert(false) : (numChroms+1)+", "+(scaffoldLengths==null)+", "+(scaffoldLengths[0]==null)+", "+(scaffoldLengths[1]==null);
+
+// for(int i=1; i<scaffoldNames.length; i++){
+// stdout.println(Arrays.toString(scaffoldLocs[i]));
+// stdout.println(Arrays.toString(scaffoldLengths[i]));
+// stdout.println(Arrays.toString(scaffoldNames[i]));
+// }
+
+ }
+
+
+// public static String contigName(int x){return scaffoldName(x);}
+//
+// public static String scaffoldName(int x){
+// if(scaffoldNames==null){return "chr"+x;}
+// return scaffoldNames[x][0];
+// }
+
+ /**
+  * Returns the table mapping a scaffold name to its ScafLoc (name, chromosome, location),
+  * building it on first use from scaffoldNames and scaffoldLocs.
+  * Returns null when no genome build is set (GENOME_BUILD below 0).
+  * The table is filled in a local map and assigned to the static field only once it is
+  * complete, so the unsynchronized check below never returns a partially filled table.
+  * @return The scaffold name table, or null
+  */
+ public static HashMap<String, ScafLoc> scafNameTable(){
+
+     if(GENOME_BUILD<0){
+         assert(scaffoldNameTable==null);
+         return null;
+     }
+     if(scaffoldNameTable!=null){return scaffoldNameTable;}
+     synchronized(SCAFMAPLOCK){
+         if(scaffoldNameTable!=null){return scaffoldNameTable;}
+         final HashMap<String, ScafLoc> table=new HashMap<String, ScafLoc>((int)Tools.min(2L*numScaffolds+10, 1000000000));
+         for(int chrom=0; chrom<scaffoldNames.length; chrom++){
+             if(scaffoldNames[chrom]!=null){
+                 for(int scafnum=0; scafnum<scaffoldNames[chrom].length; scafnum++){
+                     byte[] name=scaffoldNames[chrom][scafnum];
+                     if(name!=null){
+                         int loc=scaffoldLocs[chrom][scafnum];
+                         ScafLoc sc=new ScafLoc(new String(name), chrom, loc);
+                         table.put(sc.name, sc);
+                     }
+                 }
+             }
+         }
+         //Publish only after every entry has been added.
+         scaffoldNameTable=table;
+     }
+     return scaffoldNameTable;
+ }
+
+
+ /**
+  * Looks up a scaffold by name, with the name supplied as bytes.
+  * @param name Scaffold name; converted with new String(name) before the lookup.
+  * @return The matching ScafLoc, or null if scafNameTable() returns null or has no such key.
+  */
+ public static ScafLoc getScafLoc(byte[] name){
+     final HashMap<String, ScafLoc> table=scafNameTable();
+     return table==null ? null : table.get(new String(name));
+ }
+ /**
+  * Looks up a scaffold by name.
+  * @param name Scaffold name used as the table key.
+  * @return The matching ScafLoc, or null if scafNameTable() returns null or has no such key.
+  */
+ public static ScafLoc getScafLoc(String name){
+     final HashMap<String, ScafLoc> table=scafNameTable();
+     return table==null ? null : table.get(name);
+ }
+
+ public static byte[] scaffoldName(int chrom, int loc, int idx){return scaffoldNames[chrom][idx];}
+ public static int scaffoldRelativeLoc(int chrom, int loc, int idx){return loc-scaffoldLocs[chrom][idx];}
+
+ /**
+  * Finds the index of the scaffold on chromosome chrom that a location belongs to.
+  * The location is first shifted right by half of interScaffoldPadding, so that a point lying
+  * between two scaffolds is assigned to the closer one.
+  * @return 0 when the chromosome has no scaffold array or fewer than two scaffolds; otherwise the
+  * index whose start equals the shifted location, or the index of the last start below it
+  * (clamped to 0).
+  */
+ public static int scaffoldIndex(int chrom, int loc){
+     final int[] starts=scaffoldLocs[chrom];
+     if(starts==null || starts.length<2){return 0;}
+
+     assert(interScaffoldPadding>0);
+     final int shifted=loc+interScaffoldPadding/2; //Puts it on closest scaffold if it is between scaffolds
+
+     final int found=Arrays.binarySearch(starts, shifted);
+     if(found>=0){return found;} //Perfect hit
+
+     //No exact match: binarySearch returned (-(insertion point)-1); use the scaffold just before the insertion point.
+     final int insertion=-1-found;
+     assert(insertion>=0 && insertion<=starts.length);
+     final int closest=max(0, insertion-1);
+     assert(closest>=0 && closest<starts.length);
+     assert(closest==0 || shifted>starts[closest]);
+     assert(closest==starts.length-1 || shifted<starts[closest+1]);
+     return closest;
+ }
+
+ /**
+  * Reports whether the interval loc1..loc2 on chromosome chrom stays on a single scaffold.
+  * Returns true without further checks when scaffoldLocs is null, when the chromosome has no
+  * scaffold array, or when it has fewer than two scaffolds.
+  * Requires loc2>=loc1 (asserted).
+  * TODO: This can be made faster
+  */
+ public static boolean isSingleScaffold(int chrom, int loc1, int loc2){
+ assert(loc2>=loc1);
+ if(scaffoldLocs==null){return true;}
+ assert(chrom>=0 && chrom<scaffoldLocs.length) : chrom+", "+scaffoldLocs.length;
+ int[] array=scaffoldLocs[chrom];
+ if(array==null || array.length<2){return true;}
+ assert(interScaffoldPadding>0);
+
+ // Locate the scaffold using loc1 shifted right by the full inter-scaffold padding.
+ int idx=Arrays.binarySearch(array, loc1+interScaffoldPadding);
+ final int scaf;
+ if(idx>=0){scaf=idx;} //Perfect hit
+ else{
+ // binarySearch returned (-(insertion point)-1); take the entry just before the insertion point, clamped to 0.
+ int insertPoint=-1-idx;
+ assert(insertPoint>=0 && insertPoint<=array.length);
+ scaf=max(0, insertPoint-1);
+ assert(scaf>=0 && scaf<array.length);
+ assert(scaf==0 || loc1+interScaffoldPadding>array[scaf]);
+ assert(scaf==array.length-1 || loc1+interScaffoldPadding<array[scaf+1]);
+ }
+ // The last scaffold has no following scaffold to spill into.
+ if(scaf==array.length-1){return true;}
+
+ // lowerBound is the scaffold start minus the padding; upperBound is the start of the next scaffold.
+ int lowerBound=array[scaf]-interScaffoldPadding;
+ int upperBound=array[scaf+1];
+
+ if(loc2<lowerBound || loc1>upperBound){return false;} //This could happen if a random read was generated in the start or stop padding.
+ assert(scaf==0 || scaf==array.length-1 || (loc1>=lowerBound && loc1<upperBound)) :
+ "chrom="+chrom+", loc1="+loc1+", lowerBound="+lowerBound+", loc2="+loc2+", upperBound="+upperBound;
+ // Single-scaffold only if the interval ends before the next scaffold's start.
+ return loc2<upperBound;
+ }
+
+ /**
+  * Returns overlap of these two points with the scaffold on which they are centered.
+  * The scaffold is chosen by searching scaffoldLocs[chrom] for loc1+(interScaffoldPadding+len)/2,
+  * where len=loc2-loc1+1. When scaffoldLocs is null, or the chromosome has no scaffold array or
+  * fewer than two scaffolds, len is returned unchanged.
+  * Requires loc2>=loc1 (asserted).
+  */
+ public static int scaffoldOverlapLength(int chrom, int loc1, int loc2){
+ assert(loc2>=loc1);
+ int len=loc2-loc1+1;
+ if(scaffoldLocs==null){return len;}
+ int[] array=scaffoldLocs[chrom];
+ if(array==null || array.length<2){return len;}
+ assert(interScaffoldPadding>0);
+
+ // Search key: loc1 shifted by half of (padding + interval length).
+ int mid=loc1+(interScaffoldPadding+len)/2;
+ int idx=Arrays.binarySearch(array, mid);
+ final int scaf;
+ if(idx>=0){scaf=idx;} //Perfect hit
+ else{
+ // binarySearch returned (-(insertion point)-1); take the entry just before the insertion point, clamped to 0.
+ int insertPoint=-1-idx;
+ assert(insertPoint>=0 && insertPoint<=array.length);
+ scaf=max(0, insertPoint-1);
+ assert(scaf>=0 && scaf<array.length);
+ assert(scaf==0 || mid>array[scaf]);
+ assert(scaf==array.length-1 || mid<array[scaf+1]) : "\nscaf="+scaf+", array.length="+array.length+"\n"+
+ "loc1="+loc1+", loc2="+loc2+", mid="+mid+", interScaffoldPadding="+interScaffoldPadding+", "+array[scaf]+", "+array[scaf+1]+"\n"+
+ (loc1+interScaffoldPadding)+", "+array[scaf+1];
+ }
+
+ // Bounds of the chosen scaffold: its start, and its start plus its recorded length.
+ int lowerBound=array[scaf];
+ int upperBound=lowerBound+scaffoldLengths[chrom][scaf];
+// assert(upperBound==array[scaf+1]) : lowerBound+", "+upperBound+", "+array[scaf+1]+", "+interScaffoldPadding; //This should fail.
+
+ return Tools.overlapLength(loc1, loc2, lowerBound, upperBound);
+ }
+
+ /**
+  * Truncates every stored scaffold name at its first whitespace byte.
+  * Names without whitespace, null names, and null per-chromosome arrays are left as they are;
+  * nothing happens when scaffoldNames itself is null.
+  */
+ public static void trimScaffoldNames(){
+     if(scaffoldNames==null){return;}
+     for(final byte[][] names : scaffoldNames){
+         if(names==null){continue;}
+         for(int j=0; j<names.length; j++){
+             final byte[] full=names[j];
+             if(full==null){continue;}
+             //Advance to the first whitespace byte, if there is one.
+             int cut=0;
+             while(cut<full.length && !Character.isWhitespace(full[cut])){cut++;}
+             if(cut<full.length){names[j]=Arrays.copyOf(full, cut);}
+         }
+     }
+ }
+
+ /**
+  * Resolves a resource file name to a usable path.
+  * A name that does not start with "?" is treated as a literal path and returned unchanged.
+  * Otherwise the "?" is stripped and these candidates are tried in order, each only if the
+  * previous one did not yield an existing file:
+  * ROOT+fname; the parent directory of ROOT plus "resources/"+fname; the part of the path after
+  * the last "/file:" (if present); the classpath resource "/"+fname located through Primes.class;
+  * and finally a hard-coded directory.
+  * @param fname File name, optionally prefixed with "?"; must not be null (asserted).
+  * @return The resolved path, or null (after printing a warning) when no candidate exists and
+  * the path does not start with "jar:".
+  */
+ public static final String findPath(String fname){
+ assert(fname!=null);
+ if(fname.startsWith("?")){//Look in standard locations
+ fname=fname.substring(1);
+ }else{//Use this as the literal path
+ return fname;
+ }
+ String path=ROOT+fname;
+ // Local switch for diagnostic output; always false here.
+ boolean vb=false;
+ {
+ File f=new File(path);
+ if(!f.exists()){
+ if(vb){System.err.println("Did not find "+fname+" at "+path);}
+ // Second candidate: a "resources" directory next to ROOT.
+ // NOTE(review): File.getParent() can return null, which would make res.length() throw - confirm ROOT always has a parent.
+ f=new File(ROOT);
+ String res=f.getParent();
+ if(res.length()>0 && !res.endsWith("/")){res=res+"/";}
+ res=res+"resources/"+fname;
+ f=new File(res);
+ if(f.exists()){path=res;}
+ else{if(vb){System.err.println("Did not find "+fname+" at "+res);}}
+ }
+ if(!f.exists()){
+ if(vb){System.err.println("Considering fixing "+path+"\n"+path.contains("/file:"));}
+ // Third candidate: drop everything up to and including the slash before the last "file:".
+ if(path.contains("/file:")){
+ String fixed=path.substring(path.lastIndexOf("/file:")+1);
+ f=new File(fixed);
+ if(f.exists()){path=fixed;}
+ else{if(vb){System.err.println("Did not find "+fname+" at "+fixed);}}
+ }
+ }
+ if(!f.exists()){
+ if(vb){System.err.println("Considering getResource");}
+ // Fourth candidate: classpath lookup. The URL string becomes the path whether or not it names an existing file.
+ URL url=Primes.class.getResource("/"+fname);
+ if(url!=null){
+ String temp=url.toString().replace("%20", " ");
+ if(vb){System.err.println("Found URL "+temp);}
+ f=new File(temp);
+ // if(f.exists()){fname=temp;}
+ // else{System.err.println("Did not find "+fname+" at "+temp);}
+ path=temp;
+ }
+ }
+ // Paths starting with "jar:" are exempt from the remaining existence checks.
+ if(!f.exists() && !path.startsWith("jar:")){
+ String hardlink="/global/projectb/sandbox/gaag/bbtools/resources/"+fname;
+ f=new File(hardlink);
+ if(f.exists()){path=hardlink;}
+ else{if(vb){System.err.println("Did not find "+fname+" at "+hardlink);}}
+ }
+ if(!f.exists() && !path.startsWith("jar:")){
+ System.err.println("Warning! Cannot find "+fname+" "+path);
+ return null;
+ }
+ }
+ if(vb){System.err.println("Found "+fname+" at "+path);}
+ return path;
+ }
+
+ public static final int min(int x, int y){return x<y ? x : y;}
+ public static final int max(int x, int y){return x>y ? x : y;}
+
+ public static final byte min(byte x, byte y){return x<y ? x : y;}
+ public static final byte max(byte x, byte y){return x>y ? x : y;}
+
+ public static final long min(long x, long y){return x<y ? x : y;}
+ public static final long max(long x, long y){return x>y ? x : y;}
+
+ public static final double min(double x, double y){return x<y ? x : y;}
+ public static final double max(double x, double y){return x>y ? x : y;}
+
+ public static final float min(float x, float y){return x<y ? x : y;}
+ public static final float max(float x, float y){return x>y ? x : y;}
+
+ public static int numChroms;
+ public static long numBases;
+ public static long numDefinedBases;
+ public static int numContigs;
+ public static int numScaffolds;
+ public static int interScaffoldPadding;
+ public static int[] chromLengths;
+ public static int[] chromDefinedBases;
+ public static int[] chromUndefinedBases;
+ public static int[] chromContigs;
+ public static int[] chromScaffolds;
+ public static int[] chromStartPad;
+
+ public static boolean allowZeroSizedGenome=true;
+
+ public static byte[][][] scaffoldNames;
+ public static int[][] scaffoldLocs;
+ /** Does NOT include interScaffoldPadding */
+ public static int[][] scaffoldLengths;
+ /** Should be true if scaffold names have extra prefixes (for BBSplitter mode), false otherwise */
+ public static boolean scaffoldPrefixes;
+
+ /** Allows translation of sam coordinates back to native coordinates */
+ public static HashMap<String, ScafLoc> scaffoldNameTable;
+
+ public static String genomeSource;
+ public static String name;
+
+ private static final GeneSet[][] geneSetMatrix=new GeneSet[63][];
+ private static final Gene[][] geneMatrix=new Gene[63][];
+ public static final Range[][] geneSetRangeMatrix=new Range[63][];
+ public static final Range[][] geneTxRangeMatrix=new Range[63][];
+ public static final Range[][] geneCodeRangeMatrix=new Range[63][];
+ private static final Range[][] geneCodeAndExonRangeMatrix=new Range[63][];
+ public static final Range[][] exonRangeMatrix=new Range[63][];
+ public static HashMap<Integer, ArrayList<GeneSet>> geneIDTable;
+
+ /** Ranges within genes and exons or within NEAR their ends */
+ public static final Range[][] geneNearbyRangeMatrix=new Range[63][];
+
+ public static ChromosomeArray[] chromosomePlusMatrix;
+
+ private static HashMap<Integer, String> geneIdToNameTable;
+ private static HashMap<String, Integer> geneNameToIdTable;
+
+ private static final HashMap<String, GeneSet>[] geneNameTable=new HashMap[63];
+ private static final HashMap<String, Gene>[] transcriptNameTable=new HashMap[63];
+
+ public static ChainLine[][] chains36to37;
+ public static ChainLine[][] chains37to36;
+
+ public static int[][][] BAITS;
+
+ private static final int TX_RANGE=0;
+ private static final int CODE_RANGE=1;
+
+
+ public static final int NEAR=200;
+
+ public static boolean ENV=(System.getenv()!=null);
+ public static boolean WINDOWS=(System.getenv().containsKey("OS") && System.getenv().get("OS").equalsIgnoreCase("Windows_NT"));
+ public static boolean GENEPOOL=(System.getenv().containsKey("NERSC_HOST") && System.getenv().get("NERSC_HOST").equalsIgnoreCase("genepool"));
+ public static int LOGICAL_PROCESSORS=CALC_LOGICAL_PROCESSORS();
+ private static String HOSTNAME;
+
+ /**
+  * Chooses the number of logical processors to use.
+  * Starts from Runtime.availableProcessors() (at least 1). If the NSLOTS environment variable
+  * parses as an integer, its value (at least 1) replaces the slot count when it is below 16;
+  * an unparsable value is ignored.
+  * @return The processor count when the slot count exceeds 8 and is either exactly half the
+  * processor count or 16 of 40 (treated as hyperthreading); otherwise the smaller of the two.
+  */
+ private static int CALC_LOGICAL_PROCESSORS(){
+     final int cores=Tools.max(1, Runtime.getRuntime().availableProcessors());
+     int slots=cores;
+     final Map<String,String> environment=System.getenv();
+     final String nslots=environment.get("NSLOTS");
+     if(nslots!=null){
+         int requested=slots;
+         try {
+             requested=Tools.max(1, Integer.parseInt(nslots));
+         } catch (NumberFormatException e) {
+             //ignore
+         }
+         if(requested<16){slots=requested;}
+     }
+     final boolean looksHyperthreaded=(slots*2==cores) || (slots==16 && cores==40);
+     if(slots>8 && looksHyperthreaded){return cores;}//hyperthreading
+     return Tools.min(slots, cores);
+ }
+
+ /**
+  * Returns the local host name, resolving it on the first call and caching it in HOSTNAME.
+  * Any failure during resolution (every Throwable is caught) yields the string "unknown".
+  */
+ public static String HOSTNAME(){
+     if(HOSTNAME!=null){return HOSTNAME;}
+     try {
+         HOSTNAME=java.net.InetAddress.getLocalHost().getHostName();
+     } catch (Throwable e) {
+         //Covers UnknownHostException, NullPointerException and everything else alike.
+         HOSTNAME="unknown";
+     }
+     return HOSTNAME;
+ }
+
+ public static String ROOT(){return ROOT;}
+
+
+ /** Should be the same as ROOT_BASE but is found dynamically */
+ private static String ROOT;
+
+ public static String ROOT_BASE;
+ public static String ROOT_REF;
+ public static String ROOT_GENOME;
+ public static String ROOT_INDEX;
+ public static String ROOT_GENE;
+ public static String ROOT_CHAIN;
+ public static String ROOT_TEMPDIR;
+ public static String ROOT_CURRENT;
+ public static String ROOT_QUALITY;
+
+ static{
+ ROOT=(new File(Data.class.getClassLoader().getResource(Data.class.getName().replace('.', '/') + ".class")
+ .getFile()).getAbsolutePath().replace('\\', '/').replace("dna/Data.class", "").replace("%20", " "));
+ setPath(WINDOWS ? "?windows" : "?unix");
+ if(!WINDOWS || true){setPath("?local");}
+ }
+
+ /**
+  * Sets the ROOT_* directory fields from a path or a mode keyword.
+  * Recognized keywords (case-insensitive): "?local", ".", "/." and "./" select local mode,
+  * "?windows" and "?unix" select the platform temp-directory rules. Any other non-null value,
+  * except the literal string "null", is used as the base directory. A null path or the literal
+  * "null" clears the fields.
+  * Backslashes in the path are converted to forward slashes first.
+  * @param path Base directory or mode keyword; may be null. A null path used to throw
+  * NullPointerException because it was dereferenced before the null check.
+  */
+ public static void setPath(String path){
+ //    System.err.println("***"+path);
+     if(path!=null && path.indexOf('\\')>=0){path=path.replace('\\', '/');}
+     String mode=(path==null ? "null" : path.toLowerCase());
+     boolean local=mode.equals("?local") || mode.equals(".") || mode.equals("/.") || mode.equals("./");
+     boolean win=mode.equals("?windows");
+     boolean unix=mode.equals("?unix");
+
+     ROOT_CURRENT=System.getProperty("user.dir");
+
+     //Defaults: paths relative to the working directory.
+     ROOT_BASE="";
+     ROOT_REF="ref/";
+     ROOT_GENOME=ROOT_REF+"genome/";
+     ROOT_INDEX=ROOT_REF+"index/";
+     ROOT_GENE=ROOT_REF+"genes/";
+     ROOT_CHAIN=ROOT_REF+"chain/";
+     ROOT_QUALITY=ROOT_REF+"qual/";
+
+     if(local){
+         ROOT_TEMPDIR=ROOT_BASE;
+     }else if(win){
+         ROOT_TEMPDIR="C:/workspace/tempdir/";
+     }else if(unix){
+         String s=System.getenv().get("TEMPDIR");
+         ROOT_TEMPDIR=(s==null ? ROOT_BASE : s+"/");
+     }else if(!"null".equals(mode)){
+         //Explicit base directory. ROOT_TEMPDIR is not assigned in this branch and keeps its previous value.
+         if(!path.endsWith("/")){path=path+"/";}
+         ROOT_BASE=path;
+         ROOT_REF=path+"ref/";
+         ROOT_GENOME=ROOT_REF+"genome/";
+         ROOT_INDEX=ROOT_REF+"index/";
+         ROOT_GENE=ROOT_REF+"genes/";
+         ROOT_CHAIN=ROOT_REF+"chain/";
+         ROOT_QUALITY=ROOT_REF+"qual/";
+     }else{
+         //NOTE(review): ROOT_INDEX is not cleared here and keeps the default assigned above - confirm whether that is intended.
+         ROOT_BASE=null;
+         ROOT_REF=null;
+         ROOT_GENOME=null;
+         ROOT_GENE=null;
+         ROOT_CHAIN=null;
+         ROOT_QUALITY=null;
+         ROOT_TEMPDIR=null;
+     }
+ }
+
+ public static final String VAR_FOLDER="VAR/";
+ public static final String GENE_FOLDER="GENE/";
+
+ public static int GENOME_BUILD=-1;
+ private static int genome_set_to=-1;
+
+ public static final boolean verbose=false;
+
+ /** seqGene, knownGene, refGene, unionGene, seqRefGene, ccs */
+ public static String GENE_MAP="seqRefGene";
+
+ private static final String GENEIDLOCK=new String("GENEIDLOCK");
+
+ private static final String[] CHROMLOCKS=new String[256];
+
+ //Fills CHROMLOCKS with one newly allocated String per index, each holding the decimal index.
+ static{
+     int slot=0;
+     while(slot<CHROMLOCKS.length){
+         CHROMLOCKS[slot]=new String(slot+"");
+         slot++;
+     }
+ }
+
+ private static final int INTERN_MAP_SIZE=(1<<20);
+ private static final int INTERN_MAP_LIMIT=(1<<19);
+
+ private static final HashMap<String, String> INTERNMAP=new HashMap<String, String>(INTERN_MAP_SIZE);
+// public static final void unloadInternMap(){
+// INTERNMAP=new HashMap<String, String>(INTERN_MAP_SIZE);
+// }
+
+ private static String condense(String s){
+ //TODO - finish this
+ StringBuilder sb=new StringBuilder(s.length());
+ for(int i=0; i<s.length(); i++){
+ sb.append('A');
+ }
+ return sb.toString();
+ }
+
+ public static final void intern(String[] s){
+ if(s==null){return;}
+ for(int i=0; i<s.length; i++){s[i]=intern(s[i]);}
+ }
+ /**
+  * Returns a canonical or compact copy of a string, chosen by length and content.
+  * Strings longer than 25 characters, and strings longer than 6 characters for which
+  * AminoAcid.containsOnlyACGTNQ returns true, are copied with new String(s) and not cached.
+  * Everything else goes through forceIntern.
+  * @param s String to intern; may be null.
+  * @return null for a null argument (this used to throw NullPointerException from
+  * new String(s)), otherwise the copy or cached instance.
+  */
+ public static String intern(String s){
+     if(s==null){return null;}
+     if(s.length()>25){return new String(s);}
+ //    calls++;
+ //
+ //    if(s.length()>0 && s.charAt(0)!='?'){
+ //        s=condense(s);
+ //    }
+
+     if(s.length()<2){
+ //        return s.intern();
+         return forceIntern(s);
+     }
+     boolean acgtn=AminoAcid.containsOnlyACGTNQ(s);
+
+     if(acgtn){
+         if(s.length()<4){
+ //            return s.intern();
+             return forceIntern(s);
+         }
+         if(s.length()>6){
+             return new String(s);
+         }
+     }
+
+     //Otherwise it is non-base string of length 2 to 25, or a base string of length 4 to 6.
+     return forceIntern(s);
+ }
+
+ /**
+  * Returns the instance stored in INTERNMAP for a string equal to s, adding a fresh copy of s
+  * when none is stored. The first lookup is made without holding the INTERNMAP monitor; the
+  * insert path runs inside synchronized(INTERNMAP). When the map holds more than
+  * INTERN_MAP_LIMIT entries it is cleared, with a message on stderr, before the insert.
+  * Increments the calls counter on every invocation.
+  */
+ public static String forceIntern(String s){
+     calls++;
+
+     final String cached=INTERNMAP.get(s);
+     if(cached!=null){return cached;}
+
+     synchronized(INTERNMAP){
+         if(INTERNMAP.size()>INTERN_MAP_LIMIT){
+             System.err.println("INTERNMAP overflow caused by "+s);
+             INTERNMAP.clear();
+         }
+         //Stored values are never null, so a null result here means the key is absent.
+         final String existing=INTERNMAP.get(s);
+         if(existing!=null){return existing;}
+         final String copy=new String(s);
+         INTERNMAP.put(copy, copy);
+         return copy;
+     }
+ }
+ static int calls=0;
+
+ public static PrintStream sysout=System.err;//System.out;
+
+ public static boolean CHROMGZ=true;
+ public static boolean LOAD_SCAFFOLDS=true;
+
+// private static final boolean GUNZIP=testExecute("gunzip --help");
+// private static final boolean GZIP=testExecute("gzip --help");
+// private static final boolean SAMTOOLS=testExecute("samtools --help");
+
+ /** While the GUNZIP flag is 0 this defers to GZIP(); otherwise it reports whether the flag is positive. */
+ public static boolean GUNZIP(){
+     if(GUNZIP==0){return GZIP();}
+     return GUNZIP>0;
+ }
+// public static boolean UNPIGZ(){return UNPIGZ==0 ? PIGZ() : UNPIGZ>0;}
+ /** Reports whether "gzip --version" could be launched; probed once, never probed when WINDOWS is true. */
+ public static boolean GZIP(){
+     if(GZIP!=0 || WINDOWS){return GZIP>0;}
+     synchronized(SUBPROCSYNC){
+         if(GZIP==0){GZIP=testExecute("gzip --version");}
+     }
+     return GZIP>0;
+ }
+ /** Reports whether "pigz --version" could be launched; probed once. */
+ public static boolean PIGZ(){
+     if(PIGZ!=0){return PIGZ>0;}
+     synchronized(SUBPROCSYNC){
+         if(PIGZ==0){PIGZ=testExecute("pigz --version");}
+     }
+     return PIGZ>0;
+ }
+ /** Reports whether "dsrc --version" could be launched; probed once. */
+ public static boolean DSRC(){
+     if(DSRC!=0){return DSRC>0;}
+     synchronized(SUBPROCSYNC){
+         if(DSRC==0){DSRC=testExecute("dsrc --version");}
+     }
+     return DSRC>0;
+ }
+ /** Reports whether "bzip2 --version" could be launched; probed once, never probed when WINDOWS is true. */
+ public static boolean BZIP2(){
+     if(BZIP2!=0 || WINDOWS){return BZIP2>0;}
+     synchronized(SUBPROCSYNC){
+         if(BZIP2==0){BZIP2=testExecute("bzip2 --version");}
+     }
+     return BZIP2>0;
+ }
+ /** Reports whether "pbzip2 --version" could be launched; probed once, never probed when WINDOWS is true. */
+ public static boolean PBZIP2(){
+     if(PBZIP2!=0 || WINDOWS){return PBZIP2>0;}
+     synchronized(SUBPROCSYNC){
+         if(PBZIP2==0){PBZIP2=testExecute("pbzip2 --version");}
+     }
+     return PBZIP2>0;
+ }
+ /**
+  * Reports whether "samtools" could be launched; never probed when WINDOWS is true.
+  * A call that finds the flag still at 0 also prints the outcome to stderr.
+  */
+ public static boolean SAMTOOLS(){
+     if(SAMTOOLS!=0 || WINDOWS){return SAMTOOLS>0;}
+     synchronized(SUBPROCSYNC){
+         if(SAMTOOLS==0){SAMTOOLS=testExecute("samtools");}
+     }
+     System.err.println(SAMTOOLS>0 ? "Found samtools." : "Could not find samtools.");
+     return SAMTOOLS>0;
+ }
+ /**
+  * Reports whether "sh --version" could be launched; never probed when WINDOWS is true.
+  * A call that finds the flag still at 0 prints a notice to stderr if the probe failed.
+  */
+ public static boolean SH(){
+     if(SH!=0 || WINDOWS){return SH>0;}
+     synchronized(SUBPROCSYNC){
+         if(SH==0){SH=testExecute("sh --version");}
+     }
+ //    System.err.println(SH>0 ? "Found sh." : "Could not find sh.");
+     if(SH<0){System.err.println("Could not find sh; won't launch I/O subprocesses.");}
+     return SH>0;
+ }
+ private static final String SUBPROCSYNC=new String("SUBPROCSYNC");
+ private static final String SCAFMAPLOCK=new String("SCAFMAPLOCK");
+
+ /* Set these to zero to enable or -1 to disable */
+ private static int GUNZIP=-1;
+// private static int UNPIGZ=0;
+ private static int GZIP=0;
+ private static int PIGZ=0;
+ private static int DSRC=0;
+ private static int BZIP2=0;
+ private static int PBZIP2=0;
+ private static int SAMTOOLS=0;
+ private static int SH=0;
+
+ /**
+  * Probes whether a command can be launched on this system.
+  * The command is started, its standard error stream is read until end of stream, and the
+  * process is then cleaned up: all three pipes are closed and the process is destroyed, so
+  * a probe no longer leaves open file handles or a lingering child process behind.
+  * @param s Command line to execute.
+  * @return 1 if the process started and its error stream was drained; -1 if an IOException
+  * occurred while starting the process or reading from it.
+  */
+ private static int testExecute(String s){
+ //    System.err.println("Testing "+s);
+     Process p=null;
+     try {
+         p = Runtime.getRuntime().exec(s);
+ //        System.err.println("Got process.");
+         while(p.getErrorStream().read()>-1){}
+ //        System.err.println("This system has "+s+" installed.");
+     } catch (IOException e) {
+ //        System.err.println("This system does not have "+s+" installed.");
+         return -1;
+     } finally {
+         if(p!=null){
+             //Best-effort cleanup; a failure to close must not change the probe result.
+             try {p.getErrorStream().close();} catch (IOException e) {/*ignore*/}
+             try {p.getInputStream().close();} catch (IOException e) {/*ignore*/}
+             try {p.getOutputStream().close();} catch (IOException e) {/*ignore*/}
+             p.destroy();
+         }
+     }
+     return 1;
+ }
+
+}
diff --git a/current/dna/Exon.java b/current/dna/Exon.java
new file mode 100755
index 0000000..7cf4a30
--- /dev/null
+++ b/current/dna/Exon.java
@@ -0,0 +1,175 @@
+package dna;
+import java.io.Serializable;
+import java.util.HashMap;
+
+
+ /**
+  * An immutable interval a..b (both ends inclusive) on a numbered chromosome and strand,
+  * flagged as UTR and/or CDS.
+  * Ordering is by chromosome, start, end, strand, then the utr and cds flags.
+  */
+ public class Exon implements Comparable<Exon>, Serializable{
+
+     /**
+      *
+      */
+     private static final long serialVersionUID = 1890833345682913235L;
+
+
+     /** Creates a placeholder exon: coordinates, chromosome and strand are -1 and both flags are false. */
+     public Exon(){
+         a=-1;
+         b=-1;
+         utr=false;
+         cds=false;
+         chromosome=-1;
+         strand=-1;
+     }
+
+ //    public Exon(String startPoint, String endPoint, String chrom){
+ //        this(startPoint, endPoint, chrom, "?");
+ //    }
+ //
+ //    public Exon(int startPoint, int endPoint, String chrom){
+ //        this(startPoint, endPoint, chrom, "?");
+ //    }
+ //
+ //    public Exon(int startPoint, int endPoint, byte chrom){
+ //        this(startPoint, endPoint, chrom, (byte)2);
+ //    }
+
+     /** Parses the coordinates, chromosome and strand from strings; see toChromosome and toStrand. */
+     public Exon(String startPoint, String endPoint, String chrom, String strnd, boolean utr_, boolean cds_){
+         this(Integer.parseInt(startPoint), Integer.parseInt(endPoint), toChromosome(chrom), toStrand(strnd), utr_, cds_);
+     }
+
+     /** Parses the chromosome and strand from strings; see toChromosome and toStrand. */
+     public Exon(int startPoint, int endPoint, String chrom, String strnd, boolean utr_, boolean cds_){
+         this(startPoint, endPoint, toChromosome(chrom), toStrand(strnd), utr_, cds_);
+     }
+
+     /**
+      * @param startPoint Value stored in a.
+      * @param endPoint Value stored in b.
+      * @param chrom Chromosome number.
+      * @param strnd Strand code as produced by toStrand.
+      * @param utr_ UTR flag.
+      * @param cds_ CDS flag.
+      */
+     public Exon(int startPoint, int endPoint, byte chrom, byte strnd, boolean utr_, boolean cds_){
+         a=startPoint;
+         b=endPoint;
+         chromosome=chrom;
+         strand=strnd;
+         utr=utr_;
+         cds=cds_;
+     }
+
+
+
+     /**
+      * Returns the smallest exon covering both arguments, on the chromosome and strand of exon1.
+      * Each flag of the result is set if it is set on either input. The flags are passed in the
+      * constructor's (utr, cds) order; they were previously passed swapped.
+      */
+     public static Exon merge(Exon exon1, Exon exon2){
+         assert(canMerge(exon1, exon2));
+         final boolean mergedUtr=exon1.utr||exon2.utr;
+         final boolean mergedCds=exon1.cds||exon2.cds;
+         return new Exon(min(exon1.a, exon2.a), max(exon1.b, exon2.b), exon1.chromosome, exon1.strand, mergedUtr, mergedCds);
+     }
+
+     /** True if both exons are on the same chromosome and their intervals overlap. */
+     public static boolean canMerge(Exon exon1, Exon exon2){
+         if(exon1.chromosome!=exon2.chromosome){return false;}
+         return overlap(exon1.a, exon1.b, exon2.a, exon2.b);
+     }
+
+
+     /** True if point lies within a..b inclusive. */
+     public boolean intersects(int point){return point>=a && point<=b;}
+     //Slow
+     /** True if a2..b2 overlaps a..b; requires a2<=b2 (asserted). */
+     public boolean intersects(int a2, int b2){
+         assert(a2<=b2);
+         return overlap(a, b, a2, b2);
+     }
+
+     /** True if a2..b2 extends across the start a or across the end b of this exon. */
+     public boolean crosses(int a2, int b2){return (a2<a && b2>=a) || (a2<=b && b2>b);}
+     /** True if a2..b2 lies entirely within a..b. */
+     public boolean contains(int a2, int b2){return (a2>=a && b2<=b);}
+
+     /** Like intersects(int, int), with the query range widened by Data.NEAR on both sides. */
+     public boolean intersectsNearby(int a, int b){
+         return intersects(a-Data.NEAR, b+Data.NEAR);
+     }
+
+     /** True if the inclusive ranges a1..b1 and a2..b2 share at least one point. */
+     private static boolean overlap(int a1, int b1, int a2, int b2){
+         assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+         return a2<=b1 && b2>=a1;
+     }
+
+     /** Returns the smaller of the distances from the range x..y to this exon's two end points a and b. */
+     public int distToSpliceSite(int x, int y){
+         int distA=distToPoint(x, y, a);
+         int distB=distToPoint(x, y, b);
+         return min(distA, distB);
+     }
+
+     /** Returns the distance from the range x..y to point, or 0 when the point lies inside the range; requires x<=y (asserted). */
+     public static int distToPoint(int x, int y, int point){
+         assert(x<=y);
+         if(y<=point){return point-y;}
+         if(x>=point){return x-point;}
+         return 0;
+     }
+
+     /** Converts "+" to 0 and "-" to 1; anything else yields 2 and is asserted to be "?". */
+     public static byte toStrand(String s){
+         byte r=2;
+         if("-".equals(s)){
+             r=1;
+         }else if("+".equals(s)){
+             r=0;
+         }else{
+             assert("?".equals(s));
+         }
+         return r;
+     }
+
+     /** Skips leading non-digit characters and parses the remainder as a byte, e.g. "chr12" gives 12. */
+     public static byte toChromosome(String s){
+         int i=0;
+ //        System.out.println(s);
+         while(!Character.isDigit(s.charAt(i))){i++;}
+         return Byte.parseByte(s.substring(i));
+     }
+
+     /** Returns b-a+1; asserted to be positive. */
+     public int length(){
+         int r=(int)(b-a+1);
+         assert(r>0);
+         return r;
+     }
+
+     public String toString(){
+ //        return "(chr"+chromosome+","+(strand==0 ? "+" : "-")+","+a+"~"+b+")";
+         return "(chr"+chromosome+", "+a+" - "+b+", len "+length()+")";
+     }
+
+     /**
+      * Orders by chromosome, then a, then b, then strand, then utr (set first), then cds (set first).
+      * The end coordinate is compared against other.b; it was previously compared against
+      * other.a, which mis-ordered exons that share a start.
+      */
+     public int compareTo(Exon other){
+         if(chromosome<other.chromosome){return -1;}
+         if(chromosome>other.chromosome){return 1;}
+
+         if(a<other.a){return -1;}
+         if(a>other.a){return 1;}
+
+         if(b<other.b){return -1;}
+         if(b>other.b){return 1;}
+
+         if(strand<other.strand){return -1;}
+         if(strand>other.strand){return 1;}
+
+         if(utr && !other.utr){return -1;}
+         if(!utr && other.utr){return 1;}
+
+         if(cds && !other.cds){return -1;}
+         if(!cds && other.cds){return 1;}
+
+         return 0;
+     }
+
+     /** Returns false for null and for objects that are not Exons, instead of throwing. */
+     public boolean equals(Object other){
+         if(this==other){return true;}
+         if(!(other instanceof Exon)){return false;}
+         return equals((Exon)other);
+     }
+
+     /** True if other is non-null and all six fields match. */
+     public boolean equals(Exon other){
+         if(other==null){return false;}
+         return a==other.a && b==other.b && chromosome==other.chromosome && strand==other.strand && utr==other.utr && cds==other.cds;
+     }
+
+     /** Combines a, b and chromosome; exons that are equal therefore produce equal hash codes. */
+     public int hashCode(){
+         int xor=a^(Integer.rotateLeft(b, 16));
+         xor^=Integer.rotateRight(chromosome, 6);
+         return xor;
+     }
+
+
+     private static final int min(int x, int y){return x<y ? x : y;}
+     private static final int max(int x, int y){return x>y ? x : y;}
+
+     /** Start coordinate (inclusive). */
+     public final int a;
+     /** End coordinate (inclusive). */
+     public final int b;
+     public final boolean utr;
+     public final boolean cds;
+     public final byte chromosome;
+     /** Strand code; see toStrand. */
+     public final byte strand;
+
+     public static final HashMap<Exon,Exon> table=new HashMap<Exon,Exon>(65536);
+ }
diff --git a/current/dna/FastaToChromArrays2.java b/current/dna/FastaToChromArrays2.java
new file mode 100755
index 0000000..a85c83d
--- /dev/null
+++ b/current/dna/FastaToChromArrays2.java
@@ -0,0 +1,587 @@
+package dna;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+
+import stream.ByteBuilder;
+
+
+import align2.Tools;
+
+import fileIO.ByteFile1;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * Uses a ByteFile instead of TextFile for better speed and lower memory use.
+ * @author Brian Bushnell
+ * @date Jul 30, 2014
+ *
+ */
+public class FastaToChromArrays2 {
+
+// Example:
+// dna.FastaToChromArrays2 ecoli_K12.fa 1 writeinthread=false genscaffoldinfo=true retain waitforwriting=false
+// gzip=true chromc=false maxlen=536670912 writechroms=true minscaf=1 midpad=300 startpad=8000 stoppad=8000 nodisk=false
+
+ public static void main(String[] args){
+ main2(args);
+ }
+
+ public static ArrayList<ChromosomeArray> main2(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ boolean oldWIT=WRITE_IN_THREAD;
+ WRITE_IN_THREAD=true;
+
+// assert(false) : ReadWrite.ZIPLEVEL;
+
+ String name=null;
+
+ int genome=-1;
+ int chroms=-1;
+ String infile=null;
+ boolean writeinfo=false;
+ boolean genScaffoldInfo=true;
+ boolean writeChroms=true;
+ boolean scafprefixes=Data.scaffoldPrefixes;
+
+ for(int i=0; i<args.length; i++){
+
+ if(true){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("null")){
+ //do nothing
+ }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){
+ Data.setPath(b);
+ }else if(a.equals("name") || a.equals("organism")){
+ name=b;
+ }else if(a.equals("in") || a.equals("input") || a.equals("ref") || a.equals("fasta")){
+ if(split.length<1 || "null".equalsIgnoreCase(b)){b=null;}
+ infile=b;
+ }else if(a.equals("build") || a.equals("genome")){
+ genome=Integer.parseInt(b);
+ }else if(a.equals("chroms")){
+ chroms=Integer.parseInt(b);
+ }else if(a.equals("writeinthread")){
+ WRITE_IN_THREAD=Tools.parseBoolean(b);
+ }else if(a.equals("nodisk")){
+ NODISK=Tools.parseBoolean(b);
+ }else if(a.equals("writeinfo")){
+ writeinfo=Tools.parseBoolean(b);
+ }else if(a.equals("padstart") || a.startsWith("startpad") || a.equals("padfront") || a.startsWith("frontpad")){
+ START_PADDING=Integer.parseInt(b);
+ }else if(a.equals("padstop") || a.startsWith("stoppad") || a.equals("padend") || a.startsWith("endpad")){
+ END_PADDING=Integer.parseInt(b);
+ }else if(a.equals("pad") || a.equals("padding")){
+ START_PADDING=END_PADDING=Integer.parseInt(b);
+ }else if(a.equals("midpad") || a.startsWith("padmid")){
+ MID_PADDING=Integer.parseInt(b);
+ }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){
+ MIN_SCAFFOLD=Integer.parseInt(b);
+ }else if(a.equals("genscaffoldinfo")){
+ genScaffoldInfo=Tools.parseBoolean(b);
+ System.err.println("Set genScaffoldInfo="+genScaffoldInfo);
+ }else if(a.equals("append") || a.equals("app")){
+ append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("mergescaffolds") || a.equals("mergecontigs") || (a.equals("merge"))){
+ MERGE_SCAFFOLDS=Tools.parseBoolean(b);
+ System.err.println("Set MERGE_SCAFFOLDS="+MERGE_SCAFFOLDS);
+ }else if(a.startsWith("maxlen") || a.startsWith("chromlen")){
+ long len=Tools.parseKMG(b);
+ assert(len>0 && len<=Integer.MAX_VALUE);
+ MAX_LENGTH=(int)len;
+ }else if(a.equals("writechroms")){
+ writeChroms=Tools.parseBoolean(b);
+ }else if(a.equals("chromgz") || a.equals("gz")){
+ Data.CHROMGZ=Tools.parseBoolean(b);
+ }else if(a.equals("retain")){
+ RETAIN=Tools.parseBoolean(b);
+ }else if(a.equals("scafprefixes")){
+ scafprefixes=Tools.parseBoolean(b);
+ }else if(a.equals("ziplevel") || a.equals("zl")){
+ ReadWrite.ZIPLEVEL=Integer.parseInt(b);
+ }else if(a.equals("waitforwriting")){
+ WAIT_FOR_WRITING=Tools.parseBoolean(b);
+ }else{
+ if(i>2){
+ System.err.println("Unknown parameter "+args[i]);
+// throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+ }
+ }
+
+ WAIT_FOR_WRITING=(WAIT_FOR_WRITING || ReadWrite.USE_GZIP || ReadWrite.USE_PIGZ);
+
+ ArrayList<ChromosomeArray> r=RETAIN ? new ArrayList<ChromosomeArray>() : null;
+
+// assert(false) : Arrays.toString(args);
+// assert(RETAIN);
+
+ if(genome<0){genome=Integer.parseInt(args[1]);} //Legacy
+ if(genome<0){throw new RuntimeException("Please specify a genome build number.");}
+
+ if(writeinfo){
+ if(chroms<0){chroms=Integer.parseInt(args[2]);} //Legacy
+ if(chroms<0){throw new RuntimeException("Please the number of chroms.");}
+ writeInfo(genome, chroms, name, null, false, scafprefixes);
+ }else{
+ if(infile==null){infile=args[0].replace('\\', '/');} //Legacy
+ if(infile==null){throw new RuntimeException("Please specify an input file.");}
+ {
+ File f=new File(infile);
+ if(!f.exists() || f.isDirectory()){
+ if(!infile.startsWith("stdin")){
+ throw new RuntimeException("Not a valid file: "+f);
+ }
+ }
+ }
+ String outRoot=Data.ROOT_GENOME+genome+"/";
+
+ FastaToChromArrays2 ftca=new FastaToChromArrays2();
+ ftca.makeChroms(infile, outRoot, name, genScaffoldInfo, writeChroms, r, scafprefixes);
+ }
+
+ WRITE_IN_THREAD=oldWIT;
+ return r;
+ }
+
+ /** Private constructor; within this view an instance is created only by main2. */
+ private FastaToChromArrays2(){
+ }
+
+
+ private static int[] countInfo(ChromosomeArray ca){
+ int contigs=0;
+ int startPad=0;
+ int stopPad=0;
+ int undefined=0;
+ int defined=0;//=ca.countDefinedBases();
+
+ int lastN=-1;
+ int lastDef=-1;
+
+ for(int i=0; i<=ca.maxIndex; i++){
+ byte b=ca.get(i);
+ if(AminoAcid.isFullyDefined(b)){
+ if(defined==0){startPad=i; contigs++;}
+ else if(i-lastDef>contigTrigger){contigs++;}
+ lastDef=i;
+ defined++;
+ }else{
+ lastN=i;
+ undefined++;
+ }
+ }
+
+ if(contigs>0 && lastN==ca.maxIndex){
+ stopPad=lastN-lastDef;
+ }else{
+// System.err.println(lastN+", "+lastDef+", "+ca.maxIndex);
+ }
+
+ return new int[] {ca.chromosome, 1, contigs, (ca.maxIndex+1), defined, undefined, startPad, stopPad};
+ }
+
+ @Deprecated
+ public static void writeInfo(int genome, int chroms, String name, String source, boolean unload, boolean scafNamePrefix){
+ Data.GENOME_BUILD=genome;
+ Data.chromosomePlusMatrix=new ChromosomeArray[chroms+1];
+
+ String outRoot=Data.ROOT_GENOME+genome+"/";
+ TextStreamWriter info=new TextStreamWriter(outRoot+"info.txt", true, false, false);
+ info.start();
+ info.print("#Chromosome sizes\n");
+ try {
+ info.print("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ info.print("#Version\t"+VERSION+"\n");
+ info.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n");
+
+
+ long bases=0;
+ long definedBases=0;
+
+ long contigSum=0;
+
+ for(int chrom=1; chrom<=chroms; chrom++){
+ ChromosomeArray ca=Data.getChromosome(chrom);
+ int[] v=countInfo(ca);
+ info.print(v[0]+"\t"+v[1]+"\t"+v[2]+"\t"+v[3]+"\t"+v[4]+"\t"+v[5]+"\t"+v[6]+"\t"+v[7]+"\n");
+
+ bases+=v[3];
+ definedBases+=v[4];
+ contigSum+=v[2];
+ if(unload){Data.unload(chrom, false);}
+ }
+ info.poison();
+ StringBuilder sb=new StringBuilder();
+ sb.append("#Summary\n");
+ try {
+ sb.append("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ sb.append("#Version\t"+VERSION+"\n");
+ sb.append("chroms\t"+(chroms)+"\n");
+ sb.append("bases\t"+bases+"\n");
+ sb.append("defined\t"+definedBases+"\n");
+ sb.append("undefined\t"+(bases-definedBases)+"\n");
+ sb.append("contigs\t"+contigSum+"\n");
+ sb.append("scaffolds\t"+chroms+"\n");
+ sb.append("interpad\t"+MID_PADDING+"\n");
+ if(name!=null){sb.append("name\t"+name+"\n");}
+ if(source!=null){sb.append("source\t"+source+"\n");}
+ if(scafNamePrefix){sb.append("scafprefixes\t"+scafNamePrefix+"\n");}//else{assert(false);}
+ ReadWrite.writeString(sb, outRoot+"summary.txt", false);
+ info.waitForFinish();
+ }
+
+ /**
+ * Splits the '>'-delimited sequence file <code>fname</code> into ChromosomeArrays by calling
+ * makeNextChrom until it returns null, and records per-chromosome, per-scaffold and summary metadata.
+ * When NODISK is false the metadata goes to info.txt, scaffolds.txt.gz and summary.txt under outRoot;
+ * when NODISK is true the same lines are collected into INFO_LIST, SCAF_LIST and SUMMARY_LIST instead.
+ * @param fname Input sequence file; also reported as "source" in the summary.
+ * @param outRoot Output path prefix; file names are appended to it directly.
+ * @param genomeName Optional name written to the summary (skipped when null).
+ * @param genScaffolds Whether to record scaffold names (scaffolds.txt.gz or SCAF_LIST).
+ * @param writeChroms Whether to write each ChromosomeArray to outRoot+"chr"+N+Data.chromExtension().
+ * @param r Receives every generated ChromosomeArray when RETAIN is true.
+ * @param scafNamePrefix When true, a "scafprefixes" line is added to the summary.
+ * @return The number of chromosomes generated.
+ * @throws RuntimeException If a chromosome file already exists and overwrite is false.
+ */
+ private int makeChroms(String fname, String outRoot, String genomeName, boolean genScaffolds, boolean writeChroms, ArrayList<ChromosomeArray> r,
+ boolean scafNamePrefix){
+
+ if(!NODISK){
+ //Create the genome directory, or clear old ".chrom" files from it when overwriting
+ File f=new File(outRoot);
+ if(!f.exists()){
+ if(!NODISK){f.mkdirs();}
+ }else if(overwrite){
+ for(File g : f.listFiles()){
+ String s=g.getName();
+ if(g.isFile() && s.contains(".chrom")){
+ System.err.println("Deleting "+s);
+ g.delete();
+ }
+ }
+ }
+
+ //Same treatment for the sibling index directory, which holds files derived from the genome
+ f=new File(outRoot.replace("ref/genome/", "ref/index/"));
+ if(!f.exists()){
+ if(!NODISK){f.mkdirs();}
+ }else if(overwrite){
+ for(File g : f.listFiles()){
+ String s=g.getName();
+ if(g.isFile() && (s.endsWith(".int2d") || s.endsWith(".block") || s.endsWith(".block2.gz") || s.endsWith(".blockB") || s.endsWith(".blockB2.gz"))){
+ System.err.println("Deleting "+s);
+ g.delete();
+ }
+ }
+ }
+ }
+
+ ByteFile1 tf=new ByteFile1(fname, false, false);
+ int chrom=1;
+
+ //Exactly one of each writer/list pair is used, depending on NODISK
+ TextStreamWriter infoWriter=null, scafWriter=null;
+ ArrayList<String> infolist=null, scaflist=null;
+
+ if(NODISK){
+ infolist=new ArrayList<String>();
+ infolist.add("#Chromosome sizes");
+ try {
+ infolist.add("#Generated on\t"+new Date());
+ } catch (Exception e1) {
+ e1.printStackTrace();
+ }
+ infolist.add("#Version\t"+VERSION);
+ infolist.add("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad");
+ }else{
+ infoWriter=new TextStreamWriter(outRoot+"info.txt", true, false, false);
+ infoWriter.start();
+ infoWriter.print("#Chromosome sizes\n");
+ try {
+ // System.err.println(new Date());
+ infoWriter.print("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e1) {
+ // TODO Auto-generated catch block
+ e1.printStackTrace();
+ }
+ infoWriter.print("#Version\t"+VERSION+"\n");
+ infoWriter.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n");
+ }
+
+ if(genScaffolds){
+ if(NODISK){
+ scaflist=new ArrayList<String>();
+ scaflist.add("#Scaffold names");
+ try {
+ scaflist.add("#Generated on\t"+new Date());
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ scaflist.add("#Version\t"+VERSION);
+ scaflist.add("#chrom\tid\tstart\tlength\tname");
+ }else{
+ //System.err.println("*123 Making ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ scafWriter=new TextStreamWriter(outRoot+"scaffolds.txt.gz", true, false, false);
+ scafWriter.start();
+ scafWriter.print("#Scaffold names\n");
+ try {
+ scafWriter.print("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ scafWriter.print("#Version\t"+VERSION+"\n");
+ scafWriter.print("#chrom\tid\tstart\tlength\tname\n");
+ }
+ }
+
+
+ //Main loop: one ChromosomeArray per iteration; makeNextChrom also updates the running sums
+ for(ChromosomeArray ca=makeNextChrom(tf, chrom, infoWriter, scafWriter, infolist, scaflist); ca!=null;
+ ca=makeNextChrom(tf, chrom, infoWriter, scafWriter, infolist, scaflist)){
+ //Trim unused capacity past the last written position
+ if(ca.array.length>ca.maxIndex+1){ca.resize(ca.maxIndex+1);}
+ if(RETAIN){r.add(ca);}
+
+ if(writeChroms){
+ String x=outRoot+"chr"+chrom+Data.chromExtension();
+ if(new File(x).exists() && !overwrite){throw new RuntimeException("Tried to overwrite existing file "+x+", but overwrite=false.");}
+ ReadWrite.writeObjectInThread(ca, x, false);
+ System.err.println("Writing chunk "+chrom);
+ }
+ chrom++;
+ }
+ lastHeader=nextHeader=null;
+
+ tf.close();
+ //Signal the writers that no more lines are coming; they are waited on further below
+ if(infoWriter!=null){infoWriter.poison();}
+ if(scafWriter!=null){
+ //System.err.println("*123 Killing ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ scafWriter.poison();
+ }
+
+ //Build the summary from the sums accumulated by makeNextChrom
+ StringBuilder sb=new StringBuilder();
+ sb.append("#Summary\n");
+ try {
+ sb.append("#Generated on\t"+new Date()+"\n");
+ } catch (Exception e1) {
+ // TODO Auto-generated catch block
+ e1.printStackTrace();
+ }
+ sb.append("#Version\t"+VERSION+"\n");
+ sb.append("chroms\t"+(chrom-1)+"\n");
+ sb.append("bases\t"+lengthSum+"\n");
+ assert((definedSum+undefinedSum)==lengthSum) : definedSum+", "+undefinedSum+", "+lengthSum;
+ sb.append("defined\t"+definedSum+"\n");
+ sb.append("undefined\t"+undefinedSum+"\n");
+ sb.append("contigs\t"+contigSum+"\n");
+ sb.append("scaffolds\t"+scaffoldSum+"\n");
+ sb.append("interpad\t"+MID_PADDING+"\n");
+ if(genomeName!=null){sb.append("name\t"+genomeName+"\n");}
+ if(fname!=null){
+ File f=new File(fname);
+ String cpath=null;
+ try {
+ cpath=f.getCanonicalPath();
+ } catch (IOException e) {
+ //Fall back to the absolute path when the canonical path cannot be resolved
+ cpath=f.getAbsolutePath();
+ }
+ sb.append("source\t"+cpath+"\n");
+ sb.append("bytes\t"+f.length()+"\n");
+ sb.append("last modified\t"+f.lastModified()+"\n");
+ }
+ if(scafNamePrefix){sb.append("scafprefixes\t"+scafNamePrefix+"\n");}//else{assert(false);}
+ if(NODISK){
+ SUMMARY_LIST=new ArrayList<String>();
+ String[] split=sb.toString().split("\n");
+ for(String s : split){SUMMARY_LIST.add(s);}
+ }else{
+ ReadWrite.writeString(sb, outRoot+"summary.txt", false);
+ }
+
+ if(infoWriter!=null){infoWriter.waitForFinish();}
+ if(scafWriter!=null){
+ //System.err.println("*123 Waiting For ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ scafWriter.waitForFinish();
+ //System.err.println("*123 ScafWriter Finished; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ }
+
+ if(WAIT_FOR_WRITING && ReadWrite.countActiveThreads()>0){
+ System.err.println("Waiting for writing to finish.");
+ ReadWrite.waitForWritingToFinish();
+ System.err.println("Finished.");
+ //System.err.println("*123 countActiveThreads Finished; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ);
+ }
+
+ //Publish the in-memory lists (non-null only in NODISK mode), tagged with the current genome build
+ if(infolist!=null){
+ INFO_LIST=infolist;
+ LISTBUILD=Data.GENOME_BUILD;
+ }else{INFO_LIST=null;}
+ if(scaflist!=null){
+ SCAF_LIST=scaflist;
+ LISTBUILD=Data.GENOME_BUILD;
+ }else{SCAF_LIST=null;}
+
+ return chrom-1;
+ }
+
+ /**
+ * Builds the next ChromosomeArray from the input file.
+ * The array starts with START_PADDING 'N's, then receives the scaffold left over from the previous call
+ * (held in currentScaffold), then further scaffolds read via nextScaffold until adding one would exceed
+ * MAX_LENGTH, or until one scaffold has been placed when MERGE_SCAFFOLDS is false.
+ * A scaffold that stops the loop stays in currentScaffold for the next call.
+ * Also emits one scaffold line per scaffold and one info line per chromosome, and updates the running sums.
+ * @param tf Open input file.
+ * @param chrom Number assigned to the chromosome being built.
+ * @param infoWriter Destination for the per-chromosome info line, or null.
+ * @param scafWriter Destination for per-scaffold lines, or null.
+ * @param infolist In-memory destination for the info line, or null.
+ * @param scaflist In-memory destination for scaffold lines, or null.
+ * @return The populated ChromosomeArray, or null if no scaffold was added.
+ */
+ private ChromosomeArray makeNextChrom(ByteFile1 tf, int chrom, TextStreamWriter infoWriter, TextStreamWriter scafWriter, ArrayList<String> infolist, ArrayList<String> scaflist){
+ ChromosomeArray ca=new ChromosomeArray(chrom, (byte)Gene.PLUS, 0, 120000+START_PADDING);
+ ca.maxIndex=-1;
+ for(int i=0; i<START_PADDING; i++){ca.set(i, 'N');}
+
+ if(verbose){System.err.println("chrom="+chrom+", lastHeader="+lastHeader+", nextHeader="+nextHeader);}
+
+ int scaffolds=0;
+ //Place the scaffold carried over from the previous call, if any
+ if(currentScaffold!=null && currentScaffold.length()>0){
+ assert(currentScaffold.length()>0);
+ assert(lastHeader!=null);
+ assert(currentScaffold.length()+END_PADDING+ca.maxIndex<MAX_LENGTH);
+
+// System.err.println("A: Writing a scaffold because currentScaffold = "+currentScaffold);
+ scaffoldSum++;
+ if(scafWriter!=null){scafWriter.print(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+lastHeader+"\n");}
+ if(scaflist!=null && lastHeader!=null){
+ scaflist.add(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+lastHeader);
+ if(verbose){System.err.println("A: Added to scaflist: "+scaflist.get(scaflist.size()-1));}
+ }
+ ca.set(ca.maxIndex+1, currentScaffold);
+ scaffolds++;
+
+ currentScaffold.setLength(0);
+ lastHeader=nextHeader;
+ }
+
+// if()
+
+ while((currentScaffold=nextScaffold(currentScaffold, tf))!=null){
+ //Breaking here leaves the scaffold in currentScaffold so the next call places it first
+ if(currentScaffold.length()+MID_PADDING+END_PADDING+ca.maxIndex>MAX_LENGTH){break;}
+ if(scaffolds>0 && !MERGE_SCAFFOLDS){break;}
+
+ //Note: inter-scaffold padding is added before the MIN_SCAFFOLD check below,
+ //so a scaffold that is skipped for being too short still contributes MID_PADDING 'N's
+ if(scaffolds>0){
+ for(int i=0; i<MID_PADDING; i++){
+ ca.set(ca.maxIndex+1, 'N');
+ }
+ }
+ if(currentScaffold.length()>=MIN_SCAFFOLD){
+// System.err.println("B: Writing a scaffold because currentScaffold = "+currentScaffold);
+ scaffoldSum++;
+ if(scafWriter!=null){scafWriter.print(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+lastHeader+"\n");}
+ if(scaflist!=null){
+ scaflist.add(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+lastHeader);
+ if(verbose){System.err.println("B: Added to scaflist: "+scaflist.get(scaflist.size()-1));}
+ }
+ ca.set(ca.maxIndex+1, currentScaffold);
+ scaffolds++;
+ }
+
+ currentScaffold.setLength(0);
+ lastHeader=nextHeader;
+ }
+
+ if(verbose){System.err.println("lastHeader="+lastHeader);}
+
+ if(scaffolds==0){return null;}
+
+ if(END_PADDING>0){
+ //Count the 'N's already at the end, then top up
+ int terminalN=0;
+ for(int i=ca.maxIndex; i>=0 && terminalN<END_PADDING; i--){
+ if(ca.get(i)=='N'){terminalN++;}
+ else{break;}
+ }
+// System.err.println("\nAdding Ns: ref.length="+ca.maxIndex);
+ //Note the <= comparison: the loop exits once terminalN exceeds END_PADDING (or MAX_LENGTH is reached)
+ while(terminalN<=END_PADDING && ca.maxIndex<MAX_LENGTH-1){
+// System.out.print("N");
+ ca.set(ca.maxIndex+1, 'N');
+ terminalN++;
+ }
+// System.err.println("\nAdded Ns: ref.length="+ca.maxIndex);
+ }
+
+ //v is indexed to match the info header: 0=chrom, 2=contigs, 3=length, 4=defined, 5=undefined, 6=startPad, 7=stopPad
+ int[] v=countInfo(ca);
+ v[6]=Tools.max(0, Tools.min(START_PADDING, v[6])); //In case input scaffolds had leading undefined bases
+ v[7]=Tools.max(0, Tools.min(END_PADDING, v[7])); //In case input scaffolds had trailing undefined bases
+ if(infoWriter!=null){
+// infoWriter.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n");
+ infoWriter.print(v[0]+"\t"+scaffolds+"\t"+v[2]+"\t"+v[3]+"\t"+v[4]+"\t"+v[5]+"\t"+v[6]+"\t"+v[7]+"\n");
+ }
+ if(infolist!=null){
+ infolist.add(v[0]+"\t"+scaffolds+"\t"+v[2]+"\t"+v[3]+"\t"+v[4]+"\t"+v[5]+"\t"+v[6]+"\t"+v[7]);
+ }
+ lengthSum+=v[3];
+ definedSum+=v[4];
+ undefinedSum+=v[5];
+ contigSum+=v[2];
+
+ assert((definedSum+undefinedSum)==lengthSum) : definedSum+", "+undefinedSum+", "+lengthSum+
+ "; "+ca.countDefinedBases()+", "+(ca.maxIndex+1)+"\n"+ca.getString(0, ca.maxIndex);
+
+ return ca;
+ }
+
+
+ /**
+ * Reads sequence lines from tf into a buffer until the next header line (one starting with '>') or end of file.
+ * Sets nextHeader to the text of that header line without its leading '>', or to null at end of file.
+ * @param sb Buffer to reuse (it is cleared first); a new one is allocated when null.
+ * @param tf Input file.
+ * @return The buffer holding the concatenated lines, or null if tf is not open
+ * or if end of file was reached without reading any bases.
+ */
+ private ByteBuilder nextScaffold(ByteBuilder sb, ByteFile1 tf){
+ if(sb!=null){
+ sb.setLength(0);
+ }else{
+ sb=new ByteBuilder(100);
+ }
+ if(!tf.isOpen()){return null;}
+
+ //Accumulate every line up to, but not including, the next header
+ byte[] line=tf.nextLine();
+ while(line!=null && !(line.length>0 && line[0]=='>')){
+ sb.append(line);
+ line=tf.nextLine();
+ }
+
+ if(line==null){
+ //End of file: no further header exists
+ nextHeader=null;
+ return sb.length()==0 ? null : sb;
+ }
+
+ nextHeader=new String(line, 1, line.length-1);
+ return sb;
+ }
+
+ /** Header text written alongside the scaffold currently being placed; advanced to nextHeader after each scaffold. */
+ private String lastHeader;
+ /** Header found by the most recent nextScaffold call (without the leading '>'), or null at end of file. */
+ private String nextHeader;
+ /** Buffer for the scaffold being read; may carry a scaffold over from one makeNextChrom call to the next. */
+ private ByteBuilder currentScaffold;
+ /** Number of scaffolds placed so far; also used as the scaffold id in scaffold lines. */
+ private long scaffoldSum=0;
+ /** Running total of the per-chromosome length values reported by countInfo. */
+ private long lengthSum=0;
+ /** Running total of the per-chromosome defined-base counts. */
+ private long definedSum=0;
+ /** Running total of the per-chromosome undefined-base counts. */
+ private long undefinedSum=0;
+ /** Running total of the per-chromosome contig counts. */
+ private long contigSum=0;
+
+
+ /** @return The format version number (VERSION) written into the generated info, scaffold and summary headers. */
+ public static final int currentVersion(){return VERSION;}
+
+ /** When false, makeNextChrom stops after placing a single scaffold, giving one scaffold per chromosome. */
+ public static boolean MERGE_SCAFFOLDS=true;
+ /** NOTE(review): not referenced in this part of the file; presumably selects threaded chromosome writing - confirm. */
+ public static boolean WRITE_IN_THREAD=false;
+ /** When true, makeChroms deletes old chrom/index files; when false, an existing chrom file causes a RuntimeException. */
+ public static boolean overwrite=true;
+ /** NOTE(review): not referenced in this part of the file. */
+ public static boolean append=false;
+ public static int START_PADDING=8000; //Always applied
+ public static int MID_PADDING=300; //Applied when merging scaffolds
+ public static int END_PADDING=8000; //Only applied if not enough terminal Ns
+ /** Scaffolds shorter than this are not added to a chromosome. */
+ public static int MIN_SCAFFOLD=1;
+ /** NOTE(review): not referenced in this part of the file; presumably used when counting contigs - confirm. */
+ public static int contigTrigger=10;
+ /** Format version number written into the generated headers. */
+ public static int VERSION=5;
+ /** Upper bound used by makeNextChrom when deciding whether another scaffold fits in the current chromosome. */
+ public static int MAX_LENGTH=(1<<29)-200000;
+// public static boolean TRANSLATE_U_TO_T;
+
+ /** Enables diagnostic messages on stderr. */
+ public static boolean verbose=false;
+ /** When true, makeChroms adds every generated ChromosomeArray to the list passed by the caller. */
+ public static boolean RETAIN=false;
+ /** When true, makeChroms waits for ReadWrite's active writer threads before returning. */
+ public static boolean WAIT_FOR_WRITING=true;
+ /** When true, no directories or metadata files are created; metadata is kept in the static lists below. */
+ public static boolean NODISK=false;
+ /** Value of Data.GENOME_BUILD at the time INFO_LIST or SCAF_LIST was last populated; -1 initially. */
+ public static int LISTBUILD=-1;
+ /** In-memory copies of the info, scaffold and summary lines, populated by makeChroms in NODISK mode. */
+ public static ArrayList<String> INFO_LIST, SCAF_LIST, SUMMARY_LIST;
+
+// public static boolean GENERATE_SCAFFOLD_INFO=true;
+
+}
diff --git a/current/dna/Gene.java b/current/dna/Gene.java
new file mode 100755
index 0000000..a77203e
--- /dev/null
+++ b/current/dna/Gene.java
@@ -0,0 +1,1056 @@
+package dna;
+import java.io.Serializable;
+import java.util.HashSet;
+
+
+public class Gene implements Comparable<Gene>, Serializable{
+
+ /** Serialization version identifier; Gene implements Serializable. */
+ private static final long serialVersionUID = -1342555621377050981L;
+
+
+ /**
+ * Creates a placeholder Gene with no data: every numeric field is -1, every reference field is null,
+ * and every flag is false except valid, which is true.
+ */
+ public Gene(){
+ //Identity and annotation
+ chromosome=-1;
+ id=-1;
+ symbol=null;
+ mrnaAcc=null;
+ proteinAcc=null;
+ description=null;
+ fullDescription=null;
+ primarySource=-1;
+ status=-1;
+ completeness=-1;
+
+ //Coordinates
+ strand=-1;
+ txStart=-1;
+ txStop=-1;
+ codeStart=-1;
+ codeStop=-1;
+ exons=null;
+ exonFrames=null;
+ cdsStartStat=-1;
+ cdsEndStat=-1;
+
+ //Derived lengths
+ txLength=-1;
+ codeLength=-1;
+ exonLength=-1;
+ exonCodeLength=-1;
+ aaLength=-1;
+ utrLength5prime=-1;
+ utrLength3prime=-1;
+
+ //Flags
+ readCorrectly=false;
+ untranslated=false;
+ pseudo=false;
+ valid=true;
+ }
+
+ /**
+ * Creates a Gene from explicit coordinates and annotation, and computes the derived length fields.
+ * Coordinates are treated as 0-based with inclusive stop positions.
+ * @param chrom Chromosome number.
+ * @param strand_ Strand code; compared against PLUS when assigning 5' and 3' UTR lengths.
+ * @param txStart_ Transcription start.
+ * @param txStop_ Transcription stop (inclusive).
+ * @param cdStart_ Coding start; asserted to be at or after txStart_.
+ * @param cdStop_ Coding stop (inclusive); asserted to be at or before txStop_.
+ * @param gid Gene id.
+ * @param name_ Gene symbol.
+ * @param trans_ mRNA accession; null, empty or "-" is stored as null.
+ * @param protTrans_ Protein accession; null, empty or "-" is stored as null.
+ * @param status_ Status name, looked up in statusCodes; null is stored as -1.
+ * @param completeness_ Completeness name, looked up in completenessCodes; null is stored as -1.
+ * @param exons_ Exon array (may be null); stored by reference.
+ * @param untran Whether the gene is untranslated; if so both UTR lengths are stored as 0.
+ * @param pseudo_ Whether the gene is a pseudogene.
+ * @param valid_ Stored as the valid flag (readCorrectly is always true here).
+ * @param primarySource_ Source name, looked up in sourceCodes; null is stored as -1.
+ * @param descript_ Short description.
+ * @param fullDescript_ Full description.
+ */
+ public Gene(byte chrom, byte strand_, int txStart_, int txStop_, int cdStart_, int cdStop_, int gid,
+ String name_, String trans_, String protTrans_, String status_, String completeness_,
+ Exon[] exons_, boolean untran, boolean pseudo_, boolean valid_,
+ String primarySource_, String descript_, String fullDescript_){
+
+ chromosome=chrom;
+// nc_accession=null;
+ symbol=name_;
+ id=gid;
+ //A missing accession may arrive as null, empty, or "-"; all are normalized to null
+ mrnaAcc=((trans_==null || trans_.length()<1 || trans_.equals("-")) ? null : trans_);
+ proteinAcc=((protTrans_==null || protTrans_.length()<1 || protTrans_.equals("-")) ? null : protTrans_);
+
+ primarySource=primarySource_==null ? -1 : (byte)find3(primarySource_, sourceCodes);
+ description=descript_;
+ fullDescription=fullDescript_;
+
+
+ status=status_==null ? -1 : (byte)find3(status_, statusCodes);
+ completeness=completeness_==null ? -1 : (byte)find3(completeness_, completenessCodes);
+ strand=strand_;
+
+ exons=exons_;
+
+ txStart=txStart_;
+ txStop=txStop_; //Assuming pure 0-based numbering.
+ codeStart=cdStart_;
+ codeStop=cdStop_; //Assuming pure 0-based numbering.
+
+ assert(codeStart>=txStart) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc;
+ assert(codeStop<=txStop) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc;
+
+
+// cdsStartStat=(byte)find("?", endStatCodes);
+// cdsEndStat=(byte)find("?", endStatCodes);
+ cdsStartStat=-1;
+ cdsEndStat=-1;
+
+ exonFrames=null;
+
+ txLength=txStop-txStart+1;
+ //Equal coding start and stop is treated as "no coding region"
+ codeLength=(codeStop==codeStart ? 0 : codeStop-codeStart+1);
+
+ untranslated=untran;
+ pseudo=pseudo_;
+
+ //eLen: total exon length; ecLen: exon length inside the coding range;
+ //utr0/utr2: exon bases before codeStart / after codeStop, in coordinate order
+ int eLen=0, ecLen=0, utr0=0, utr2=0;
+
+ if(exons!=null){
+
+ for(Exon e : exons){
+
+ utr0+=max(0, min(e.b, codeStart)-e.a);
+ utr2+=max(0, e.b-max(e.a, codeStop));
+
+ int len=e.b-e.a+1;
+ eLen+=len;
+ len=(min(e.b, codeStop)-max(e.a, codeStart));
+ len=max(0, len+1);
+ ecLen+=len;
+ }
+ }
+
+
+ exonLength=(eLen<2 ? 0 : eLen);
+ exonCodeLength=(codeLength<1 || exonLength<1 ? 0 : ecLen);
+ //Evaluates to -1 when exonCodeLength is 0
+ aaLength=exonCodeLength/3-1;
+
+ assert(exonLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n";
+ assert(codeLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n";
+
+ //assert(exonCodeLength%3 == 0); //This should be true with a correct database
+
+ //On any strand other than PLUS, the low-coordinate UTR is assigned to the 3' end
+ if(strand==PLUS){
+ utrLength5prime=untranslated ? 0 : utr0;
+ utrLength3prime=untranslated ? 0 : utr2;
+ }else{
+ utrLength5prime=untranslated ? 0 : utr2;
+ utrLength3prime=untranslated ? 0 : utr0;
+ }
+
+ //System.err.println(name+", "+exonLength+", "+exonCodeLength+(exons==null ? "" : ", "+exons.length));
+
+ readCorrectly=true;
+ valid=(readCorrectly && valid_);
+ }
+
+
+ public Gene merge(Gene g){
+
+ assert((exons==null && g.exons==null) ||
+ (exons!=null && g.exons!=null && exons.length==g.exons.length));
+// assert(exonLength==g.exonLength);
+ assert(Math.abs(exonLength-g.exonLength)<=8) : "\n\n"+this+"\n\n"+g+"\n\n";
+ assert(strand==g.strand);
+// assert(codeStart==g.codeStart);
+// assert(codeStop==g.codeStop);
+
+ String Xsymbol=symbol;
+ String XproteinAcc=proteinAcc;
+ int Xid=id;
+ String XmrnaAcc=mrnaAcc;
+ int Xstatus=status;
+ int Xcompleteness=completeness;
+ int XcodeStart=codeStart;
+ int XcodeStop=codeStop;
+ int XtxStart=txStart;
+ int XtxStop=txStop;
+ int XcdsStartStat=cdsStartStat;
+ int XcdsEndStat=cdsEndStat;
+ byte[] XexonFrames=exonFrames;
+ int XtxLength=txLength;
+ int XcodeLength=codeLength;
+ int XexonLength=exonLength;
+ int XexonCodeLength=exonCodeLength;
+ int XaaLength=aaLength;
+ int XutrLength5prime=utrLength5prime;
+ int XutrLength3prime=utrLength3prime;
+// boolean XreadCorrectly=readCorrectly;
+ boolean Xuntranslated=untranslated;
+ boolean Xpseudo=pseudo;
+ String Xdescription=description;
+ String XfullDescription=fullDescription;
+ boolean Xvalid=valid;
+
+ assert(untranslated || g.untranslated || g.codeStart>=txStart) : "\n"+this+"\n\n"+g;
+ assert(untranslated || g.untranslated || g.codeStop<=txStop) : "\n"+this+"\n\n"+g;
+
+ if(Xsymbol==null){Xsymbol=g.symbol;}
+ if(XproteinAcc==null){XproteinAcc=g.proteinAcc;}
+ if(Xid<0){Xid=g.id;}
+ if(XmrnaAcc==null){XmrnaAcc=g.mrnaAcc;}
+ if(Xstatus<0){Xstatus=g.status;}
+ if(Xcompleteness<0){Xcompleteness=g.completeness;}
+
+
+ if(XcodeStart==XcodeStop && g.codeStart<g.codeStop){
+ assert(g.codeStart>=txStart);
+ assert(g.codeStop<=txStop);
+ XcodeStart=g.codeStart;
+ XcodeStop=g.codeStop;
+ }
+
+ //These two should never happen...
+ if(XtxStart<0){XtxStart=g.txStart;}
+ if(XtxStop<0){XtxStop=g.txStop;}
+
+ if(XcdsStartStat<0){XcdsStartStat=g.cdsStartStat;}
+ if(XcdsEndStat<0){XcdsEndStat=g.cdsEndStat;}
+ if(XexonFrames==null){XexonFrames=g.exonFrames;}
+ if(XtxLength<0){XtxLength=g.txLength;}
+ if(XcodeLength<0){XcodeLength=g.codeLength;}
+ if(XexonLength<0){XexonLength=g.exonLength;}
+ if(XexonCodeLength<0){XexonCodeLength=g.exonCodeLength;}
+ if(XaaLength<0){XaaLength=g.aaLength;}
+ if(XutrLength5prime<0){XutrLength5prime=g.utrLength5prime;}
+ if(XutrLength3prime<0){XutrLength3prime=g.utrLength3prime;}
+ if(Xdescription==null){Xdescription=g.description;}
+ if(XfullDescription==null){XfullDescription=g.fullDescription;}
+
+// if(XreadCorrectly){}
+// if(Xuntranslated){}
+// if(Xpseudo){}
+// if(Xvalid){}
+
+ //TODO Note that the readCorrectly field gets lost here
+ Gene out=new Gene(chromosome, strand, XtxStart, XtxStop, XcodeStart, XcodeStop, Xid,
+ symbol, XmrnaAcc, XproteinAcc,
+ Xstatus< 0 ? null : statusCodes[Xstatus], Xcompleteness<0 ? null : completenessCodes[Xcompleteness],
+ exons, Xuntranslated, Xpseudo, Xvalid, sourceCodes[primarySource], Xdescription, XfullDescription);
+
+ return out;
+ }
+
+
+ /**
+ * Converts a strand symbol to its numeric code.
+ * @param s "+", "-", "?" or ".".
+ * @return 0 for "+", 1 for "-", 2 for "?" or ".".
+ * @throws RuntimeException For any other input, including null.
+ */
+ public static byte toStrand(String s){
+ if("+".equals(s)){return 0;}
+ if("-".equals(s)){return 1;}
+ if("?".equals(s) || ".".equals(s)){return 2;}
+ throw new RuntimeException("Unknown strand: "+s);
+ }
+
+ /**
+ * Parses a chromosome number from its name, ignoring an optional leading "chr".
+ * Only numeric names are supported; an earlier lookup of named chromosomes through chromCodes
+ * was disabled upstream and is not reproduced here.
+ * @param s Chromosome name such as "7" or "chr7".
+ * @return The chromosome number.
+ * @throws NumberFormatException If the remaining text is not an integer.
+ */
+ public static int toChromosome(final String s){
+ final String digits=(s.startsWith("chr") ? s.substring(3) : s);
+ final int loc=Integer.parseInt(digits);
+
+ assert(loc>=0) : s;
+ return loc;
+ }
+
+ /**
+ * Parses a genome build number from a label such as "37", "b37", "build37", "build=37" or "hg19".
+ * Matching is case-insensitive.
+ * @param s Build label.
+ * @return The build number.
+ * @throws NumberFormatException If no integer remains after the prefix is removed.
+ */
+ public static int toBuild(final String s){
+ String s2=s.toLowerCase();
+ if(s2.startsWith("build")){s2=s2.substring(5);}
+ else if(s2.startsWith("b")){s2=s2.substring(1);}
+ //Fix: "hg" is two characters; removing only one left "g19", which can never parse
+ else if(s2.startsWith("hg")){s2=s2.substring(2);}
+
+ if(s2.startsWith("=")){s2=s2.substring(1);}
+
+ //Length check keeps the assertion from throwing StringIndexOutOfBoundsException on empty input
+ assert(s2.length()>0 && Character.isDigit(s2.charAt(0))) : s;
+
+ return Integer.parseInt(s2);
+ }
+
+ private void fillExons(String eStarts, String eEnds, byte chr, byte str){
+ String[] s1=eStarts.split(",");
+ String[] s2=eEnds.split(",");
+
+ int last=-1;
+
+ for(int i=0; i<s1.length; i++){
+ int a=Integer.parseInt(s1[i]);
+ int b=Integer.parseInt(s2[i])-1; //Note the -1 for 0-based numbering.
+ assert(a>last) : eStarts;
+ last=a;
+
+ boolean cds=overlap(a, b, codeStart, codeStop);
+ boolean utr=(a<codeStart || b>codeStop);
+
+ Exon key=new Exon(a, b, chr, str, utr, cds);
+ Exon value=Exon.table.get(key);
+ if(value==null){
+ value=key;
+ Exon.table.put(key, key);
+ }
+ exons[i]=value;
+ }
+ }
+
+ private Exon[] fillExonsCCDS(String estring, byte chr, byte str){
+ String[] intervals=estring.replace("[","").replace("]","").replace(" ","").split(",");
+
+ int last=-1;
+
+ Exon[] array=new Exon[intervals.length];
+
+ for(int i=0; i<intervals.length; i++){
+ String[] temp=intervals[i].split("-");
+ int a=Integer.parseInt(temp[0]);
+ int b=Integer.parseInt(temp[1]); //Note the pure 0-based numbering.
+ assert(a>last) : estring;
+ last=a;
+
+ boolean cds=overlap(a, b, codeStart, codeStop);
+ boolean utr=(a<codeStart || b>codeStop);
+
+ Exon key=new Exon(a, b, chr, str, utr, cds);
+ Exon value=Exon.table.get(key);
+ if(value==null){
+ value=key;
+ Exon.table.put(key, key);
+ }
+ array[i]=value;
+ }
+ return array;
+ }
+
+ /**
+ * Converts a genomic coordinate into an offset measured along the exons only, in the direction of the strand.
+ * Exons lying wholly before the coordinate (in strand direction) contribute their full length;
+ * an exon containing the coordinate contributes the distance from its strand-relative start.
+ * @param index Genomic coordinate.
+ * @return Sum of the exonic bases preceding index in strand direction.
+ */
+ public int toGeneRelativeOffset(int index){
+
+ int total=0;
+
+ if(strand==PLUS){
+
+ //Walk exons left to right
+ for(int i=0; i<exons.length; i++){
+ final Exon e=exons[i];
+
+ int part;
+ if(e.intersects(index)){
+ part=index-e.a;
+ }else if(e.a>index){
+ break;
+ }else{
+ part=e.length();
+ }
+ assert(part<=e.length()) : index +" \t "+e+" \t "+part+" \t "+e.length();
+ assert(part>=0) : index+", "+e;
+ total+=part;
+ }
+
+ }else if(strand==MINUS){
+
+ //Walk exons right to left
+ int i=exons.length-1;
+ while(i>=0){
+ final Exon e=exons[i];
+
+ int part;
+ if(e.intersects(index)){
+ part=e.b-index;
+ }else if(e.b<index){
+ break;
+ }else{
+ part=e.length();
+ }
+ assert(part<=e.length()) : index +" \t "+e+" \t "+part+" \t "+e.length();
+ assert(part>=0) : index+", "+e;
+ total+=part;
+ i--;
+ }
+
+ }else{assert false : strand;}
+
+ return total;
+ }
+
+ /**
+ * Locates a genomic coordinate relative to the exons, in the direction of the strand.
+ * @param index Genomic coordinate.
+ * @return {number of exons passed before reaching index that do not contain it,
+ * offset of index from the strand-relative start of the last exon examined (0 if that exon does not contain it)}.
+ */
+ public int[] toExonRelativeOffset(int index){
+
+ int passed=0;
+ int offset=0;
+
+ if(strand==0){
+
+ //Walk exons left to right
+ for(int i=0; i<exons.length; i++){
+ final Exon e=exons[i];
+
+ int delta=0;
+ if(e.intersects(index)){
+ delta=index-e.a;
+ }else if(e.a>index){
+ break;
+ }else{
+ passed++;
+ }
+ assert(delta<=e.length()) : index +" \t "+e+" \t "+delta+" \t "+e.length();
+ assert(delta>=0) : index+", "+e;
+ offset=delta;
+ }
+
+ }else if(strand==1){
+
+ //Walk exons right to left
+ int i=exons.length-1;
+ while(i>=0){
+ final Exon e=exons[i];
+
+ int delta=0;
+ if(e.intersects(index)){
+ delta=e.b-index;
+ }else if(e.b<index){
+ break;
+ }else{
+ passed++;
+ }
+ assert(delta<=e.length()) : index +" \t "+e+" \t "+delta+" \t "+e.length();
+ assert(delta>=0) : index+", "+e;
+ offset=delta;
+ i--;
+ }
+
+ }else{assert false : strand;}
+
+ return new int[] {passed, offset};
+ }
+
+
+ /** @return True if this gene's symbol matches the hypothetical-gene naming patterns tested by isHypothetical(String). */
+ public boolean isHypothetical(){
+ final String name=symbol;
+ return isHypothetical(name);
+ }
+
+
+ /**
+ * Tests whether a gene symbol looks like a placeholder name.
+ * @param s Gene symbol; may be null.
+ * @return True if s starts with "C" and contains "orf", or starts with "LOC" followed by a digit; false otherwise, including for null.
+ */
+ public static boolean isHypothetical(String s){
+ if(s==null){return false;}
+ final boolean orfStyle=(s.startsWith("C") && s.contains("orf"));
+ if(orfStyle){return true;}
+ return s.length()>=4 && s.startsWith("LOC") && Character.isDigit(s.charAt(3));
+ }
+
+
+ /** @return True if this gene is valid, translated, not a pseudogene, and not hypothetical. */
+ public boolean isNormalGene(){
+ if(!valid || untranslated || pseudo){return false;}
+ return !isHypothetical();
+ }
+
+
+ /** @return True if point lies within [txStart, txStop], inclusive. */
+ public boolean intersectsTx(int point){
+ return txStart<=point && point<=txStop;
+ }
+ /**
+ * @return True if point lies within [translationStart(), translationStop()], inclusive.
+ * Always false for untranslated genes (which also trip an assertion).
+ */
+ public boolean intersectsTr(int point){
+ assert(!untranslated);
+ if(untranslated){return false;}
+ return translationStart()<=point && point<=translationStop();
+ }
+ /**
+ * @return True if point lies within [codeStart, codeStop], inclusive.
+ * For untranslated genes the transcribed range is tested instead.
+ */
+ public boolean intersectsCode(int point){
+ if(untranslated){return intersectsTx(point);}
+ return codeStart<=point && point<=codeStop;
+ }
+ /** @return True if any exon of this gene contains point. */
+ public boolean intersectsExon(int point){
+ for(int i=0; i<exons.length; i++){
+ if(exons[i].intersects(point)){return true;}
+ }
+ return false;
+ }
+
+ /**
+ * @return True if point lies inside an exon and, for translated genes, also inside the coding range.
+ * The coding-range test is skipped for untranslated genes.
+ */
+ public boolean intersectsCodeAndExon(int point){
+ if(!(untranslated || intersectsCode(point))){return false;}
+ for(int i=0; i<exons.length; i++){
+ if(exons[i].intersects(point)){return true;}
+ }
+ return false;
+ }
+
+
+ /**
+ * @return True if the interval [a, b] overlaps an exon and, for translated genes, also overlaps the coding range.
+ * The coding-range test is skipped for untranslated genes.
+ */
+ public boolean intersectsCodeAndExon(int a, int b){
+ if(!(untranslated || intersectsCode(a, b))){return false;}
+ for(int i=0; i<exons.length; i++){
+ if(exons[i].intersects(a, b)){return true;}
+ }
+ return false;
+ }
+
+ /**
+ * Tests whether the interval [a, b] overlaps any intron, i.e. the gap between two consecutive exons.
+ * @return False if there are fewer than two exons or [a, b] lies outside the span of the exons.
+ */
+ public boolean intersectsIntron(int a, int b){
+ if(exons==null || exons.length<2){return false;}
+ final Exon first=exons[0];
+ final Exon last=exons[exons.length-1];
+ if(!overlap(a, b, first.a, last.b)){return false;}
+ for(int i=0; i+1<exons.length; i++){
+ final Exon left=exons[i];
+ final Exon right=exons[i+1];
+ assert(left.b<right.a) : "\n"+left+"\n"+right+"\n"+this+"\n";
+
+ assert(a<=b && left.b+1<=right.a-1) : "\n"+left+"\n"+right+"\n"+this+"\n";
+
+ //The intron spans the positions strictly between the two exons
+ if(overlap(a, b, left.b+1, right.a-1)){return true;}
+ }
+ return false;
+ }
+
+ /**
+ * Tests whether the interval [a, b] lies between two consecutive exons while staying
+ * at least distFromEnds away from both flanking exon boundaries.
+ * @return False if exons is null or no such exon pair exists.
+ */
+ public boolean isDeepIntronic(int a, int b, int distFromEnds){
+ if(exons==null){return false;}
+ for(int i=0; i+1<exons.length; i++){
+ final Exon left=exons[i];
+ final Exon right=exons[i+1];
+ assert(left.b<right.a) : "\n"+left+"\n"+right+"\n"+this+"\n";
+ final boolean clearOfLeft=(a>=left.b+distFromEnds);
+ if(clearOfLeft && b<=right.a-distFromEnds){return true;}
+ }
+ return false;
+ }
+
+ /**
+ * Tests whether the interval [a, b] contains any exon boundary (the first or last base of an exon).
+ * @return False if there are fewer than two exons or [a, b] lies outside the transcribed range.
+ */
+ public boolean intersectsSplice(int a, int b){
+ assert(b>=a);
+ if(exons==null || exons.length<2){return false;}
+ if(b<txStart || a>txStop){return false;}
+ for(int i=0; i<exons.length; i++){
+ final Exon e=exons[i];
+ final boolean startInside=(e.a>=a && e.a<=b);
+ if(startInside || (e.b>=a && e.b<=b)){return true;}
+ }
+ return false;
+ }
+
+ /** @return The result of intersectsCodeAndExon for [a, b] widened by NEAR on both sides. */
+ public boolean intersectsNearby(int a, int b){
+ final int lo=a-NEAR;
+ final int hi=b+NEAR;
+ return intersectsCodeAndExon(lo, hi);
+ }
+
+ private static int closestToPoint(int a, int b, int point){
+ int a2=(a>point ? a-point : point-a);
+ int b2=(b>point ? b-point : point-b);
+ return a2<b2 ? a : b;
+ }
+
+ /**
+ * Finds the exon boundary (or coding start/stop) nearest to the interval [a, b], as measured by Exon.distToPoint.
+ * The coding start and stop are considered only when the interval is not strictly intronic
+ * (isDeepIntronic(a, b, 1) is false). Candidates are compared with a strict less-than,
+ * so when distances tie the candidate examined first is kept.
+ * Requires a non-null exons array.
+ * @param a Interval start.
+ * @param b Interval stop.
+ * @return {
+ * distance,<br>
+ * nearest exon number (-1 means coding start or stop),<br>
+ * side (0 means start, 1 means stop),<br>
+ * position (1 means inside, 2 means outside, 3 means both),<br>
+ * site coordinate
+ * }
+ */
+ public int[] nearestSpliceSite(int a, int b){
+
+ int bestDist=999999999;
+ int nearestExon=-1;
+ int side=-1;
+ int position=0;
+ int bestSite=-1;
+
+ boolean strictlyIntronic=this.isDeepIntronic(a, b, 1);
+
+ if(!strictlyIntronic){
+ {
+ //Candidate: coding start
+ int point=codeStart;
+ int x=Exon.distToPoint(a, b, point);
+ if(x<bestDist){
+ bestDist=x;
+ bestSite=point;
+ nearestExon=-1;
+ position=0;
+ if(a<point){position|=2;}
+ if(b>=point){position|=1;}
+ side=(strand==PLUS ? 0 : 1);
+ //This reassigns the same value for PLUS and MINUS; other strand codes keep the value set above
+ if(strand==PLUS){
+ side=0;
+ }else if(strand==MINUS){
+ side=1;
+ }
+ }
+
+ //Candidate: coding stop
+ point=codeStop;
+ x=Exon.distToPoint(a, b, point);
+ if(x<bestDist){
+ bestDist=x;
+ bestSite=point;
+ nearestExon=-1;
+ position=0;
+ if(b>point){position|=2;}
+ if(a<=point){position|=1;}
+ side=(strand==PLUS ? 1 : 0);
+ }
+ }
+ }
+
+ //Candidates: both boundaries of every exon, in array order
+ for(int i=0; i<exons.length; i++){
+ Exon e=exons[i];
+
+ int point=e.a;
+ int x=Exon.distToPoint(a, b, point);
+ if(x<bestDist){
+ bestDist=x;
+ bestSite=point;
+ nearestExon=i;
+ side=(strand==PLUS ? 0 : 1);
+ position=0;
+ if(a<point){position|=2;}
+ if(b>=point){position|=1;}
+ }
+
+ point=e.b;
+ x=Exon.distToPoint(a, b, point);
+ if(x<bestDist){
+ bestDist=x;
+ bestSite=point;
+ nearestExon=i;
+ side=(strand==PLUS ? 1 : 0);
+ position=0;
+ if(b>point){position|=2;}
+ if(a<=point){position|=1;}
+ }
+ }
+
+ //On the minus strand, report the exon number counted from the other end of the array
+ if(nearestExon>=0 && strand==MINUS){
+ nearestExon=exons.length-nearestExon-1;
+ }
+
+ return new int[] {bestDist, nearestExon, side, position, bestSite};
+ }
+
+
+
+ /** @return True if the interval [a, b] overlaps the transcribed range [txStart, txStop]. */
+ public boolean intersectsTx(int a, int b){
+ assert(a<=b);
+ final boolean hit=overlap(a, b, txStart, txStop);
+ return hit;
+ }
+ /**
+ * @return True if the interval [a, b] overlaps [translationStart(), translationStop()].
+ * Always false for untranslated genes (which also trip an assertion).
+ */
+ public boolean intersectsTr(int a, int b){
+ assert(a<=b);
+ assert(!untranslated);
+ if(untranslated){return false;}
+ return overlap(a, b, translationStart(), translationStop());
+ }
+ /**
+ * @return True if the interval [a, b] overlaps the coding range [codeStart, codeStop].
+ * For untranslated genes the transcribed range is tested instead.
+ */
+ public boolean intersectsCode(int a, int b){
+ assert(a<=b);
+ if(untranslated){return intersectsTx(a, b);}
+ return overlap(a, b, codeStart, codeStop);
+ }
+ /** @return True if the interval [a, b] overlaps any exon of this gene. */
+ public boolean intersectsExon(int a, int b){
+ assert(a<=b);
+ for(int i=0; i<exons.length; i++){
+ if(exons[i].intersects(a, b)){return true;}
+ }
+ return false;
+ }
+ /**
+ * @return True if the interval [a, b] overlaps [txStart, codeStart] or [codeStop, txStop].
+ * For untranslated genes, any overlap with the transcribed range counts.
+ */
+ public boolean intersectsUTR(int a, int b){
+ if(!intersectsTx(a,b)){return false;}
+ if(untranslated){return true;}
+ return overlap(a, b, txStart, codeStart) || overlap(a, b, codeStop, txStop);
+ }
+ /**
+ * Downstream UTR test.
+ * @return True if the interval [a, b] overlaps [txStart, codeStart] on the minus strand,
+ * or [codeStop, txStop] otherwise. Always false for untranslated genes.
+ */
+ public boolean intersectsUTR3(int a, int b){
+ if(!intersectsTx(a,b) || untranslated){return false;}
+ return strand==MINUS ? overlap(a, b, txStart, codeStart) : overlap(a, b, codeStop, txStop);
+ }
+ /**
+ * Upstream UTR test.
+ * @return True if the interval [a, b] overlaps [txStart, codeStart] on the plus strand,
+ * or [codeStop, txStop] otherwise. Always false for untranslated genes.
+ */
+ public boolean intersectsUTR5(int a, int b){
+ if(!intersectsTx(a,b) || untranslated){return false;}
+ return strand==PLUS ? overlap(a, b, txStart, codeStart) : overlap(a, b, codeStop, txStop);
+ }
+
+ private static boolean overlap(int a1, int b1, int a2, int b2){
+ assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+ return a2<=b1 && b2>=a1;
+ }
+
+ /** @return The tab-delimited column header line describing the gene text format. */
+ public static final String header(){
+ final String[] columns={
+ "#chrom", "symbol", "geneId", "mrnaAcc", "proteinAcc",
+ "strand", "codeStart", "codeStop", "txStart", "txStop",
+ "(UNTRANSLATED?)", "(PSEUDOGENE?)", "status", "completeness", "source",
+ "[exon0start-exon0stop, ...exonNstart-exonNstop]",
+ "fullName", "description"
+ };
+ final StringBuilder sb=new StringBuilder(256);
+ for(int i=0; i<columns.length; i++){
+ if(i>0){sb.append('\t');}
+ sb.append(columns[i]);
+ }
+ return sb.toString();
+ }
+
+// public CharSequence toRefSeqFormat(){
+// return driver.ToRefGeneFormat.format(this);
+// }
+
+ /**
+ * Formats this gene as one tab-delimited line: chromosome, symbol, id, mRNA accession, protein accession,
+ * strand, coding start/stop, transcription start/stop, UNTRANS and PSEUDO flags, status, completeness,
+ * source, bracketed exon list, description and full description.
+ * Null accession/description fields and negative codes are written as empty columns.
+ * Unless the line begins with whitespace, it is trimmed, so trailing empty columns are dropped.
+ */
+ public String toString(){
+
+ final StringBuilder sb=new StringBuilder(256);
+
+ sb.append(chromosome+"\t").append(symbol+"\t").append(id+"\t").append(mrnaAcc+"\t");
+ assert(proteinAcc==null || !proteinAcc.equals("null"));
+ sb.append((proteinAcc==null ? "" : proteinAcc)+"\t");
+ sb.append(strandCodes[strand]+"\t");
+ sb.append(codeStart+"\t"+codeStop+"\t"+txStart+"\t"+txStop);
+
+ sb.append("\t"+(untranslated ? "UNTRANS" : ""));
+ sb.append("\t"+(pseudo ? "PSEUDO" : ""));
+
+ sb.append("\t"+(status>=0 ? statusCodes[status] : ""));
+ sb.append("\t"+(completeness>=0 ? completenessCodes[completeness] : ""));
+ sb.append("\t"+(primarySource>=0 ? sourceCodes[primarySource] : ""));
+
+ //Exon list, formatted as [a-b, c-d, ...]
+ sb.append("\t[");
+ if(exons!=null){
+ for(int i=0; i<exons.length; i++){
+ if(i>0){sb.append(", ");}
+ sb.append(exons[i].a+"-"+exons[i].b);
+ }
+ }
+ sb.append("]");
+
+ assert(description==null || (!description.equals("null") && description.length()>0));
+ sb.append("\t"+(description==null ? "" : description));
+
+ assert(fullDescription==null || (!fullDescription.equals("null") && fullDescription.length()>0));
+ sb.append("\t"+(fullDescription==null ? "" : fullDescription));
+
+ final String line=sb.toString();
+ if(Character.isWhitespace(line.charAt(0))){return line;}
+ return line.trim();
+ }
+
+ /** @return A compact tab-delimited summary: "chr"+chromosome, symbol, mRNA accession, strand, and "(codeStart - codeStop)". */
+ public String toShortString(){
+
+ final StringBuilder sb=new StringBuilder(256);
+
+ sb.append("chr"+chromosome+"\t")
+ .append(symbol+"\t")
+ .append(mrnaAcc+"\t")
+ .append(strandCodes[strand]+"\t")
+ .append("("+codeStart+" - "+codeStop+")");
+ return sb.toString();
+ }
+
+ /**
+ * Orders genes by chromosome, txStart, txStop, codeStart, codeStop, exonLength, strand, id,
+ * symbol and finally mRNA accession. A null symbol or accession sorts before any non-null value,
+ * and two nulls compare as equal.
+ * @param other Gene to compare against; must not be null.
+ * @return A negative, zero or positive value as this gene sorts before, equal to, or after other.
+ */
+ public int compareTo(Gene other){
+ if(chromosome<other.chromosome){return -1;}
+ if(chromosome>other.chromosome){return 1;}
+
+ if(txStart<other.txStart){return -1;}
+ if(txStart>other.txStart){return 1;}
+
+ if(txStop<other.txStop){return -1;}
+ if(txStop>other.txStop){return 1;}
+
+ if(codeStart<other.codeStart){return -1;}
+ if(codeStart>other.codeStart){return 1;}
+
+ if(codeStop<other.codeStop){return -1;}
+ if(codeStop>other.codeStop){return 1;}
+
+ if(exonLength<other.exonLength){return -1;}
+ if(exonLength>other.exonLength){return 1;}
+
+ if(strand<other.strand){return -1;}
+ if(strand>other.strand){return 1;}
+
+ if(id<other.id){return -1;}
+ if(id>other.id){return 1;}
+
+ //symbol and mrnaAcc may legitimately be null (the constructor stores null for a missing accession),
+ //so they must not be dereferenced directly
+ final int bySymbol=compareNullSafe(symbol, other.symbol);
+ if(bySymbol!=0){return bySymbol;}
+ return compareNullSafe(mrnaAcc, other.mrnaAcc);
+ }
+
+ /** Compares two strings with String.compareTo, treating null as smaller than any non-null string. */
+ private static int compareNullSafe(String x, String y){
+ if(x==null){return y==null ? 0 : -1;}
+ if(y==null){return 1;}
+ return x.compareTo(y);
+ }
+
+ /**
+ * Tests structural identity: same chromosome, strand, transcription and coding coordinates,
+ * exon length, and the same exon boundaries in the same order. Names, ids and flags are not compared.
+ */
+ public boolean isIdenticalTo(Gene other){
+ if(chromosome!=other.chromosome || strand!=other.strand){return false;}
+ if(txStart!=other.txStart || txStop!=other.txStop){return false;}
+ if(codeStart!=other.codeStart || codeStop!=other.codeStop){return false;}
+ if(exonLength!=other.exonLength){return false;}
+
+ if(exons==null){return other.exons==null;}
+ if(other.exons==null || other.exons.length!=exons.length){return false;}
+
+ //Only the boundaries are compared, not Exon equality
+ for(int i=0; i<exons.length; i++){
+ final Exon mine=exons[i];
+ final Exon theirs=other.exons[i];
+ if(mine.a!=theirs.a || mine.b!=theirs.b){return false;}
+ }
+ return true;
+ }
+
+ /**
+  * Object-level equality. Returns false for null and for any argument that is not a
+  * Gene (the previous unchecked cast threw ClassCastException or NullPointerException
+  * in those cases); otherwise defers to {@link #equals(Gene)}.
+  * @param other Object to compare against; may be null
+  * @return true if other is a Gene that compares equal to this one
+  */
+ @Override
+ public boolean equals(Object other){
+     if(this==other){return true;}
+     if(!(other instanceof Gene)){return false;}
+     return equals((Gene)other);
+ }
+
+ /**
+  * Typed equality: two genes are equal when they are the same instance or when
+  * compareTo reports 0. A null argument yields false instead of a
+  * NullPointerException from compareTo.
+  * @param other Gene to compare against; may be null
+  * @return true if the genes are equal under compareTo
+  */
+ public boolean equals(Gene other){//TODO check this
+     if(this==other){return true;}
+     return other!=null && compareTo(other)==0;
+ }
+
+ /**
+  * Hash built from txStart, txStop (rotated left by 16 bits), chromosome (shifted
+  * left by 20 bits) and strand, combined with XOR.
+  * @return The hash code
+  */
+ public int hashCode(){
+     final int span=txStart^Integer.rotateLeft(txStop, 16);
+     final int location=(chromosome<<20)^strand;
+     return span^location;
+ }
+
+ /** Returns the start coordinate (a) of the first exon, or codeStart when there are no exons. */
+ public int translationStart(){ //TODO Make into a field
+     if(exons!=null && exons.length>0){return exons[0].a;}
+     return codeStart;
+ }
+
+ /** Returns the stop coordinate (b) of the last exon, or codeStop when there are no exons. */
+ public int translationStop(){ //TODO Make into a field
+     if(exons!=null && exons.length>0){return exons[exons.length-1].b;}
+     return codeStop;
+ }
+
+ /** Returns codeStart on the plus strand and codeStop on any other strand. */
+ public int codeStartStrandCompensated(){ //TODO Make into a field
+     if(strand==PLUS){return codeStart;}
+     return codeStop;
+ }
+
+ /** Returns codeStop on the plus strand and codeStart on any other strand. */
+ public int codeStopStrandCompensated(){ //TODO Make into a field
+     if(strand==PLUS){return codeStop;}
+     return codeStart;
+ }
+
+ /**
+  * Strand-aware translation start: on the plus strand this is the first exon's start
+  * (or codeStart without exons); otherwise the last exon's stop (or codeStop without exons).
+  */
+ public int translationStartStrandCompensated(){ //TODO Make into a field
+     final boolean noExons=(exons==null || exons.length==0);
+     if(strand==PLUS){
+         return noExons ? codeStart : exons[0].a;
+     }
+     return noExons ? codeStop : exons[exons.length-1].b;
+ }
+
+ /**
+  * Strand-aware translation stop: on the plus strand this is the last exon's stop
+  * (or codeStop without exons); otherwise the first exon's start (or codeStart without exons).
+  */
+ public int translationStopStrandCompensated(){ //TODO Make into a field
+     final boolean noExons=(exons==null || exons.length==0);
+     if(strand==PLUS){
+         return noExons ? codeStop : exons[exons.length-1].b;
+     }
+     return noExons ? codeStart : exons[0].a;
+ }
+
+ /**
+  * Start of exon number exNum, counted in strand order.
+  * On the plus strand this is exons[exNum].a; otherwise exons are counted from the end
+  * of the array and the exon's b coordinate is returned. Without exons the result is
+  * codeStart (plus strand) or codeStop (other strands).
+  * @param exNum Zero-based exon number in strand order
+  */
+ public int exonStartStrandCompensated(int exNum){
+     final boolean noExons=(exons==null || exons.length==0);
+     if(strand==PLUS){
+         return noExons ? codeStart : exons[exNum].a;
+     }
+     return noExons ? codeStop : exons[exons.length-exNum-1].b;
+ }
+
+ /**
+  * Stop of exon number exNum, counted in strand order.
+  * On the plus strand this is exons[exNum].b; otherwise exons are counted from the end
+  * of the array and the exon's a coordinate is returned. Without exons the result is
+  * codeStop (plus strand) or codeStart (other strands).
+  * @param exNum Zero-based exon number in strand order
+  */
+ public int exonStopStrandCompensated(int exNum){
+     final boolean noExons=(exons==null || exons.length==0);
+     if(strand==PLUS){
+         return noExons ? codeStop : exons[exNum].b;
+     }
+     return noExons ? codeStart : exons[exons.length-exNum-1].a;
+ }
+
+ /**
+  * Finds the exon whose range is closest (per calcDistance) to the range (a, b).
+  * Ties go to the exon that appears first in the exons array. The returned number is
+  * in strand order: the array index on the plus strand, or counted from the end of
+  * the array on other strands.
+  * @param a Start of the query range
+  * @param b End of the query range
+  * @return Strand-ordered exon number, or 0 when this gene has no exons
+  */
+ public int findClosestExon(int a, int b) {
+     if(exons==null || exons.length==0){return 0;}
+
+     int closest=Integer.MAX_VALUE;
+     int closestIndex=-1;
+     for(int idx=0; idx<exons.length; idx++){
+         final int dist=calcDistance(a, b, exons[idx].a, exons[idx].b);
+         if(dist<closest){
+             closest=dist;
+             closestIndex=idx;
+         }
+     }
+
+     //Translate the array index into strand order; -1 means nothing was selected.
+     final int exnum;
+     if(closestIndex<0){exnum=-1;}
+     else if(strand==PLUS){exnum=closestIndex;}
+     else{exnum=exons.length-closestIndex-1;}
+     assert(exnum>=0);
+     return exnum;
+ }
+
+
+ public static final int find(String a, String[] array){
+ for(int i=0; i<array.length; i++){
+ if(a.equals(array[i])){return i;}
+ }
+// assert(false) : "\n\nCan't find "+a+" in \n"+Arrays.toString(array);
+// System.err.println("Can't find "+a+" in "+Arrays.toString(array));
+ if(!asdf.contains(a)){
+ System.err.println("Can't find "+a);
+ asdf.add(a);
+ }
+ return -1;
+ }
+
+ private static final HashSet<String> asdf=new HashSet<String>();
+
+ public static final int find2(String a, String[] array){
+ for(int i=0; i<array.length; i++){
+ if(a.equals(array[i])){return i;}
+ }
+ return array.length-1; //No assertion
+ }
+
+
+ public static final int find3(String a, String[] array){
+// if(a==null){return -1;}
+ for(int i=0; i<array.length; i++){
+ if(a.equals(array[i])){return i;}
+ }
+ return -1;
+ }
+
+ /**
+  * Calculates the minimal distance between two ranges: (a1, b1) and (a2, b2).
+  * Ranges that overlap or touch have distance 0.
+  * @param a1 Start of the first range (a1<=b1 is asserted)
+  * @param b1 End of the first range
+  * @param a2 Start of the second range (a2<=b2 is asserted)
+  * @param b2 End of the second range
+  * @return Gap between the ranges, or 0 if they intersect
+  */
+ public static final int calcDistance(int a1, int b1, int a2, int b2){
+     assert(a1<=b1);
+     assert(a2<=b2);
+     final int gap;
+     if(a1>b2){gap=a1-b2;} //First range lies entirely after the second
+     else if(a2>b1){gap=a2-b1;} //Second range lies entirely after the first
+     else{gap=0;} //Ranges intersect
+     assert(gap>=0) : gap;
+     return gap;
+ }
+
+
+ /** Returns the smaller of two ints. */
+ private static final int min(int x, int y){return Math.min(x, y);}
+ /** Returns the larger of two ints. */
+ private static final int max(int x, int y){return Math.max(x, y);}
+
+ /** Transcription start position */
+ public final int txStart;
+ /** Transcription end position */
+ public final int txStop;
+ /** Coding region start */
+ public final int codeStart;
+ /** Coding region end */
+ public final int codeStop;
+
+ /** Length of transcribed area */
+ public final int txLength;
+
+ /** Length of coding area */
+ public final int codeLength;
+
+ /** Length of exons (summed) */
+ public final int exonLength;
+
+ /** Length of exonic coding region */
+ public final int exonCodeLength;
+
+ /** Number of amino acids (excluding stop codon) */
+ public final int aaLength;
+
+ public final int utrLength5prime;
+ public final int utrLength3prime;
+
+ /** Reference sequence chromosome or scaffold */
+ public final byte chromosome;
+ /** + or - for strand */
+ public final byte strand;
+ /** ? */
+ public final byte cdsStartStat;
+ /** ? */
+ public final byte cdsEndStat;
+
+ public final boolean readCorrectly;
+
+ /** Array of exons used by this gene */
+ public final Exon[] exons;
+
+ /** Exon frame {0,1,2}, or -1 if no frame for exon */
+ public final byte[] exonFrames;
+
+ /** Name of gene (usually transcript_id from GTF) */
+ public final String mrnaAcc;
+
+ /** Protein accession */
+ public final String proteinAcc;
+
+ /** Alternate name (e.g. gene_id from GTF) */
+ public final String symbol;
+
+ public final String description;
+
+ public final String fullDescription;
+
+ public final byte primarySource;
+
+ /* CCDS file format:
+ * chromosome nc_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type */
+
+ /* CCDS format stuff */
+
+// public final String nc_accession;
+ public final byte status;
+ public final byte completeness;
+ public final int id;
+
+
+ public final boolean untranslated;
+ public final boolean pseudo;
+ public final boolean valid;
+
+ public static final String[] sourceCodes={
+ "seqGene", "knownGene", "refGene", "unionGene",
+ "reserved1", "reserved2", "reserved3", "reserved4"
+ };
+
+ /** Index with cdsStartStat and cdsEndStat */
+ public static final String[] endStatCodes={"none", "unk", "incmpl", "cmpl"};
+
+ public static final String[] statusCodes={
+ "Unknown","Reviewed","Validated","Provisional","Predicted","Inferred","Public"
+
+// "Public", "Reviewed, update pending", "Reviewed, withdrawal pending",
+// "Withdrawn", "Withdrawn, inconsistent annotation",
+// "Under review, withdrawal", "Under review, update",
+
+ };
+
+ public static final String[] completenessCodes={
+ "Unknown","Complete5End","Complete3End","FullLength","IncompleteBothEnds","Incomplete5End","Incomplete3End","Partial"
+ };
+
+
+ /** Index with chromosome number */
+ public static final String[] chromCodes={"A", "1", "2", "3", "4", "5", "6",
+ "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18",
+ "19", "20", "21", "22", "X", "Y", "M", "U"};
+
+ /** Index with strand number */
+ public static final String[] strandCodes={"+", "-", "?"};
+ public static final char[] strandCodes2={'+', '-', '?'};
+
+ public static final byte PLUS=0;
+ public static final byte MINUS=1;
+ private static final int NEAR=Data.NEAR;
+
+ public static final byte STAT_UNKNOWN=0;
+ public static final byte STAT_REVIEWED=1;
+ public static final byte STAT_VALIDATED=2;
+ public static final byte STAT_PROVISIONAL=3;
+ public static final byte STAT_PREDICTED=4;
+ public static final byte STAT_INFERRED=5;
+ public static final byte STAT_PUBLIC=6;
+
+}
diff --git a/current/dna/GeneSet.java b/current/dna/GeneSet.java
new file mode 100755
index 0000000..d08873e
--- /dev/null
+++ b/current/dna/GeneSet.java
@@ -0,0 +1,132 @@
+package dna;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
+
+/**
+ * A named group of Gene transcripts that share one gene id.
+ * The constructor derives the chromosome, strand, overall span (minStart to maxEnd)
+ * and the pseudo/untranslated flags from the supplied transcripts.
+ */
+public class GeneSet implements Comparable<GeneSet>{
+
+    public static void main(String[] args){
+        Data.getGeneIDTable();
+    }
+
+    /**
+     * @param n Name of this set
+     * @param g Transcripts in the set; must be non-empty, and every transcript is
+     * asserted to carry the same id as the first one
+     */
+    public GeneSet(String n, ArrayList<Gene> g){
+        name=n;
+        id=g.get(0).id;
+        genes=g;
+        chrom=g.get(0).chromosome;
+        transcripts=genes.size();
+        assert(transcripts>0);
+
+        byte st=-1;
+
+        boolean pse=true, unt=true;
+
+        for(int i=0; i<transcripts; i++){
+            Gene gene=g.get(i);
+
+            assert(gene.id==id) : g;
+
+            pse=(pse&&gene.pseudo);
+            unt=(unt&&gene.untranslated);
+            minStart=min(gene.txStart, minStart);
+            maxEnd=max(gene.txStop, maxEnd);
+            //The first transcript sets the strand; any disagreement downgrades it to "?".
+            if(st==-1){st=gene.strand;}
+            else if(st!=gene.strand){st=(byte) Gene.find("?", Gene.strandCodes);}
+        }
+
+        pseudo=pse;
+        untranslated=unt;
+
+        for(Gene gene : g){
+            assert(pseudo==gene.pseudo || (!pseudo && gene.pseudo)) : g;
+            assert(untranslated==gene.untranslated || (!untranslated && gene.untranslated)) : g;
+        }
+
+        strand=st;
+    }
+
+    /** Returns name (padded to 10 characters), transcript count, chromosome, span and strand. */
+    @Override
+    public String toString(){
+        StringBuilder sb=new StringBuilder();
+        sb.append(name);
+        while(sb.length()<10){sb.append(' ');}
+        sb.append('\t');
+        sb.append(padFront(transcripts+"",2)+" transcript"+(transcripts==1 ? " " : "s"));
+
+        sb.append("\tchr"+chrom+" ("+minStart+" - "+maxEnd+"), '"+Gene.strandCodes[strand]+"'");
+
+        return sb.toString();
+    }
+
+    /** Left-pads num with spaces until it is at least width characters long. */
+    private static final String padFront(String num, int width){
+        String r=num;
+        while(r.length()<width){r=" "+r;}
+        return r;
+    }
+
+    public final String name;
+    public final int id;
+    public final byte chrom;
+    public final byte strand;
+    public final ArrayList<Gene> genes;
+    public final int transcripts;
+
+    /** True if all transcripts are untranslated */
+    public final boolean untranslated;
+    /** True if all transcripts are pseudogenes */
+    public final boolean pseudo;
+
+    /** Smallest txStart over all transcripts */
+    public int minStart=Integer.MAX_VALUE;
+    /** Largest txStop over all transcripts */
+    public int maxEnd=0;
+
+
+    /** True if point lies within minStart..maxEnd (inclusive). */
+    public boolean intersects(int point){
+        return point>=minStart && point<=maxEnd;
+    }
+
+    /** True if the range point1..point2 overlaps minStart..maxEnd (inclusive). */
+    public boolean intersects(int point1, int point2){
+        return point2>=minStart && point1<=maxEnd;
+    }
+
+
+    /** Orders by chromosome, then minStart, then name. */
+    @Override
+    public int compareTo(GeneSet other) {
+        if(chrom!=other.chrom){
+            return chrom>other.chrom ? 1 : -1;
+        }
+        int x=minStart<other.minStart ? -1 : minStart>other.minStart ? 1 : 0;
+        if(x!=0){return x;}
+        return name.compareTo(other.name);
+    }
+
+    /**
+     * Object-level equality. Returns false for null or for an argument that is not a
+     * GeneSet, instead of throwing from an unchecked cast.
+     */
+    @Override
+    public boolean equals(Object other){
+        if(this==other){return true;}
+        if(!(other instanceof GeneSet)){return false;}
+        return equals((GeneSet)other);
+    }
+
+    /** Two sets are equal when compareTo reports 0; null is never equal. */
+    public boolean equals(GeneSet other){
+        return other!=null && compareTo(other)==0;
+    }
+
+    @Override
+    public int hashCode(){
+        return Integer.rotateLeft(name.hashCode(), 5)^chrom;
+    }
+
+    private static final int min(int x, int y){return x<y ? x : y;}
+    private static final int max(int x, int y){return x>y ? x : y;}
+
+}
\ No newline at end of file
diff --git a/current/dna/IntMap.java b/current/dna/IntMap.java
new file mode 100755
index 0000000..12f83ff
--- /dev/null
+++ b/current/dna/IntMap.java
@@ -0,0 +1,106 @@
+package dna;
+import java.util.Arrays;
+
+
+/**
+ * A map from int keys in a fixed inclusive range [min, max] to int values, backed by
+ * a plain array indexed by key-min. Integer.MIN_VALUE marks an empty slot, so it
+ * cannot be stored as a value. Range checks are assertions only.
+ */
+public class IntMap {
+
+    public static void main(String[] args){
+
+    }
+
+
+    /**
+     * @param from Smallest key (inclusive)
+     * @param to Largest key (inclusive)
+     */
+    public IntMap(int from, int to){
+        reset(from, to);
+    }
+
+
+    /** Returns the value stored for key, or Integer.MIN_VALUE if the slot is empty. */
+    public int get(int key){
+        assert(key>=min && key<=max);
+        final int slot=key-min;
+        return array[slot];
+    }
+
+
+    /** True if a value is stored for key. */
+    public boolean containsKey(int key){
+        assert(key>=min && key<=max);
+        final int stored=array[key-min];
+        return stored!=INVALID;
+    }
+
+
+    /**
+     * Stores value under key.
+     * @return The previous content of the slot (Integer.MIN_VALUE if it was empty)
+     */
+    public int put(int key, int value){
+        assert(key>=min && key<=max);
+        assert(value!=INVALID);
+        final int slot=key-min;
+        final int previous=array[slot];
+        array[slot]=value;
+        return previous;
+    }
+
+
+    /**
+     * Empties the slot for key.
+     * @return The previous content of the slot (Integer.MIN_VALUE if it was empty)
+     */
+    public int remove(int key){
+        assert(key>=min && key<=max);
+        final int slot=key-min;
+        final int previous=array[slot];
+        array[slot]=INVALID;
+        return previous;
+    }
+
+
+    /** Counts the occupied slots by scanning the whole backing array. */
+    public int size(){
+        int occupied=0;
+        for(final int stored : array){
+            if(stored!=INVALID){occupied++;}
+        }
+        return occupied;
+    }
+
+
+    /** Returns all keys that have a value, in ascending order. */
+    public int[] keys(){
+        final int[] out=new int[size()];
+        int filled=0;
+        for(int slot=0; filled<out.length; slot++){
+            if(array[slot]==INVALID){continue;}
+            out[filled]=min+slot;
+            filled++;
+        }
+        return out;
+    }
+
+
+    /** Returns all stored values, ordered by ascending key. */
+    public int[] values(){
+        final int[] out=new int[size()];
+        int filled=0;
+        for(int slot=0; filled<out.length; slot++){
+            if(array[slot]==INVALID){continue;}
+            out[filled]=array[slot];
+            filled++;
+        }
+        return out;
+    }
+
+
+    /** Empties every slot. */
+    public void clear(){
+        Arrays.fill(array, INVALID);
+    }
+
+
+    /**
+     * Switches to the key range [from, to] and empties the map. The backing array is
+     * reused when it is already large enough and reallocated otherwise.
+     */
+    public void reset(int from, int to){
+        min=from;
+        max=to;
+        assert(max>=min);
+        assert(((long)max)-((long)min)<Integer.MAX_VALUE);
+
+        final int needed=max-min+1;
+        final boolean tooSmall=(array==null || array.length<needed);
+        if(tooSmall){
+            array=new int[needed];
+        }
+        clear();
+    }
+
+
+    public int min;
+    public int max;
+    public int[] array;
+
+    /** Marker for an empty slot */
+    private static final int INVALID=Integer.MIN_VALUE;
+
+}
diff --git a/current/dna/IntMapFlex.java b/current/dna/IntMapFlex.java
new file mode 100755
index 0000000..2152036
--- /dev/null
+++ b/current/dna/IntMapFlex.java
@@ -0,0 +1,135 @@
+package dna;
+import java.util.Arrays;
+
+/** TODO */
+/**
+ * Array-backed map from int keys in the inclusive range [min, max] to int values.
+ * Integer.MIN_VALUE marks an empty slot, so it cannot be stored as a value.
+ * TODO: range remapping is not implemented yet (see remap).
+ */
+public class IntMapFlex {
+
+    public static void main(String[] args){
+
+    }
+
+
+    /** Creates a map for the keys 0..initialCap-1. */
+    public IntMapFlex(int initialCap){
+        reset(0, initialCap-1);
+    }
+
+
+    /** Returns the value stored for key, or Integer.MIN_VALUE if the slot is empty. */
+    public int get(int key){
+        assert(key>=min && key<=max);
+        return array[key-min];
+    }
+
+
+    /**
+     * True if a value is stored for key. Keys outside [min, max] are simply reported
+     * as absent; previously they caused an ArrayIndexOutOfBoundsException.
+     */
+    public boolean containsKey(int key){
+        if(key<min || key>max){return false;}
+        return array[key-min]!=INVALID;
+    }
+
+
+    /**
+     * Stores value under key.
+     * @return The previous content of the slot (Integer.MIN_VALUE if it was empty)
+     */
+    public int put(int key, int value){
+        assert(key>=min && key<=max);
+        assert(value!=INVALID);
+        int index=key-min;
+        int old=array[index];
+        array[index]=value;
+        return old;
+    }
+
+
+    /**
+     * Empties the slot for key.
+     * @return The previous content of the slot (Integer.MIN_VALUE if it was empty)
+     */
+    public int remove(int key){
+        assert(key>=min && key<=max);
+        int index=key-min;
+        int old=array[index];
+        array[index]=INVALID;
+        return old;
+    }
+
+
+    /** Counts the occupied slots by scanning the whole backing array. */
+    public int size(){
+        int sum=0;
+        for(int i=0; i<array.length; i++){
+            if(array[i]!=INVALID){sum++;}
+        }
+        return sum;
+    }
+
+
+    /** Returns all keys that have a value, in ascending order. */
+    public int[] keys(){
+        int[] r=new int[size()];
+        for(int i=0, j=0; j<r.length; i++){
+            if(array[i]!=INVALID){
+                r[j]=(min+i);
+                j++;
+            }
+        }
+        return r;
+    }
+
+
+    /** Returns all stored values, ordered by ascending key. */
+    public int[] values(){
+        int[] r=new int[size()];
+        for(int i=0, j=0; j<r.length; i++){
+            if(array[i]!=INVALID){
+                r[j]=array[i];
+                j++;
+            }
+        }
+        return r;
+    }
+
+
+    /** Empties every slot. */
+    public void clear(){
+        Arrays.fill(array, INVALID);
+    }
+
+
+    /**
+     * Switches to the key range [from, to] and empties the map. The backing array is
+     * reused when it is already large enough and reallocated otherwise.
+     */
+    public void reset(int from, int to){
+        min=from;
+        max=to;
+        assert(max>=min);
+        assert(((long)max)-((long)min)<Integer.MAX_VALUE);
+
+        int size=max-min+1;
+        if(array==null || array.length<size){
+            array=new int[size];
+        }
+        clear();
+    }
+
+
+    /**
+     * Not implemented. The earlier stub could swap in a fresh zero-filled array before
+     * reaching assert(false), which silently destroyed the contents whenever assertions
+     * were disabled. It now fails up front without touching any state.
+     * @throws UnsupportedOperationException always
+     */
+    public void remap(int min2, int max2){
+        throw new UnsupportedOperationException("TODO");
+    }
+
+
+    public int min;
+    public int max;
+    public int minKey;
+    public int maxKey;
+    public int[] array;
+
+    /** Marker for an empty slot */
+    private static final int INVALID=Integer.MIN_VALUE;
+
+}
diff --git a/current/dna/Matrix.java b/current/dna/Matrix.java
new file mode 100755
index 0000000..1580439
--- /dev/null
+++ b/current/dna/Matrix.java
@@ -0,0 +1,93 @@
+package dna;
+import java.util.HashMap;
+import java.util.Set;
+
+import fileIO.MatrixFile;
+
+
+/**
+ * A named grid of floats plus a static, lazily loaded registry of matrices that is
+ * read from a fixed list of matrix files on the first call to get.
+ */
+public class Matrix {
+
+
+    /**
+     * @param g Grid of values
+     * @param pre Prefix; subGrid uses it as the row offset from which prefixLength is subtracted
+     * @param nm Name under which the matrix is registered
+     */
+    public Matrix(float[][] g, int pre, String nm){
+        grid=g;
+        prefix=pre;
+        name=nm;
+    }
+
+    /**
+     * Copies a run of rows out of the grid.
+     * @param prefixLength Number of rows to include before row index prefix
+     * @param length Number of rows to copy
+     * @return A new array holding clones of rows (prefix-prefixLength) onward
+     */
+    public float[][] subGrid(int prefixLength, int length){
+        float[][] r=new float[length][];
+        int start=prefix-prefixLength;
+        for(int i=0; i<length; i++){
+            r[i]=grid[i+start].clone();
+        }
+        return r;
+    }
+
+    public float[][] grid;
+    public int prefix;
+    public String name;
+
+
+    /** Registry of loaded matrices, keyed by name and by lower-cased name; null until first use. */
+    private static HashMap<String, Matrix> table=null;
+
+    /** Returns the registered names, or an empty set if nothing has been loaded yet. */
+    public static Set<?> keys(){
+        if(table==null){return new HashMap<String, Matrix>().keySet();}
+        return table.keySet();
+    }
+
+    /**
+     * Looks up a matrix by name, loading the matrix files on first use.
+     * @param s Name of the matrix
+     * @return The matrix; never null
+     * @throws RuntimeException if no matrix is registered under s, or the entry is null
+     */
+    public static Matrix get(String s){
+        if(table==null){
+            table=new HashMap<String, Matrix>(64);
+            //Earlier matrix sets (matrices*.txt, matrixN*.txt, asm*_sept9/16.txt,
+            //SplicePercentiles_b37_Sept16.txt, matrix_build37_N4.txt) are no longer loaded.
+            fillTable("matrix_build37_N1.txt");
+            fillTable("matrix_build37_N2.txt");
+            fillTable("matrix_build37_N3.txt");
+            fillTable("SplicePercentiles_b37_Nov24.txt");
+        }
+        Matrix m=table.get(s);
+
+        if(m==null){
+            //Callers may catch this to treat a missing matrix as optional.
+            if(!table.containsKey(s)){throw new RuntimeException("Can't find "+s+" in\n\n"+table.keySet()+"\n");}
+            throw new RuntimeException("\nValue for "+s+" is null");
+        }
+
+        return m;
+    }
+
+    /**
+     * Reads every matrix from the named file and registers each one under its name
+     * and its lower-cased name. The file is closed even if reading fails.
+     */
+    private static void fillTable(String fname){
+        MatrixFile mf=new MatrixFile(fname);
+        try{
+            Matrix mat=mf.nextMatrix();
+            while(mat!=null){
+                table.put(mat.name, mat);
+                table.put(mat.name.toLowerCase(), mat);
+                mat=mf.nextMatrix();
+            }
+        }finally{
+            mf.close();
+        }
+    }
+
+}
diff --git a/current/dna/Motif.java b/current/dna/Motif.java
new file mode 100755
index 0000000..cd9134d
--- /dev/null
+++ b/current/dna/Motif.java
@@ -0,0 +1,231 @@
+package dna;
+
+/**
+ * Base class for sequence motifs. Holds the motif name, length and center offset,
+ * counting helpers that scan a byte array, shared base-probability tables, and an
+ * optional percentile lookup. Subclasses supply the actual matching.
+ */
+public abstract class Motif {
+
+    /**
+     * @param name_ Name of the motif
+     * @param length_ Motif length
+     * @param center_ Offset of the center position within the motif
+     */
+    public Motif(String name_, int length_, int center_){
+        center=center_;
+        length=length_;
+        suffix=length-center-1;
+        name=name_;
+
+//      assert(center>=0 && center<length);
+    }
+
+
+    /** Counts exact matches over the whole string. */
+    public final int countExact(String s){
+        return countExact(s, 0, s.length());
+    }
+
+
+    /** Counts exact matches for positions a onward, bounded by b. */
+    public final int countExact(String s, int a, int b){
+        final byte[] bytes=s.getBytes();
+        return countExact(bytes, a, b);
+    }
+
+
+    /** Counts extended matches over the whole string. */
+    public final int countExtended(String s){
+        return countExtended(s, 0, s.length());
+    }
+
+
+    /** Counts extended matches for positions a onward, bounded by b. */
+    public final int countExtended(String s, int a, int b){
+        final byte[] bytes=s.getBytes();
+        return countExtended(bytes, a, b);
+    }
+
+
+    /**
+     * Counts the positions i in [a, min(b, source.length-1)-length+1] for which
+     * matchesExactly(source, i) is true.
+     */
+    public final int countExact(byte[] source, int a, int b){
+        final int last=min(b, source.length-1)-length+1;
+        int hits=0;
+        int pos=a;
+        while(pos<=last){
+            if(matchesExactly(source, pos)){hits++;}
+            pos++;
+        }
+        return hits;
+    }
+
+
+    /**
+     * Counts the positions i in [a, min(b, source.length-1)-length+1] for which
+     * matchesExtended(source, i) is true.
+     */
+    public final int countExtended(byte[] source, int a, int b){
+        final int last=min(b, source.length-1)-length+1;
+        int hits=0;
+        int pos=a;
+        while(pos<=last){
+            if(matchesExtended(source, pos)){hits++;}
+            pos++;
+        }
+        return hits;
+    }
+
+
+    /** Unsupported in the base class; always throws RuntimeException. */
+    public boolean matchesExactly(byte[] source, int a){
+        throw new RuntimeException();
+    }
+
+
+    /** Unsupported in the base class; always throws RuntimeException. */
+    public boolean matchesExtended(byte[] source, int a){
+        throw new RuntimeException();
+    }
+
+    /** Base implementation: returns the strength unchanged, narrowed to float. */
+    public float normalize(double strength){
+        final float narrowed=(float)strength;
+        return narrowed;
+    }
+
+
+    /** Base implementation: 1 for an exact match at a, otherwise 0. */
+    public float matchStrength(byte[] source, int a){
+        if(matchesExactly(source, a)){return 1;}
+        return 0;
+    }
+
+
+    /** Index of the smallest element; the first one wins ties. */
+    public static final int minPos(float[] array){
+        int best=0;
+        for(int idx=1; idx<array.length; idx++){
+            best=(array[idx]<array[best] ? idx : best);
+        }
+        return best;
+    }
+
+
+    /** Index of the largest element; the first one wins ties. */
+    public static final int maxPos(float[] array){
+        int best=0;
+        for(int idx=1; idx<array.length; idx++){
+            best=(array[idx]>array[best] ? idx : best);
+        }
+        return best;
+    }
+
+    @Override
+    public String toString(){
+        return name+", "+length+", "+center;
+    }
+
+
+    public final String name;
+    public String commonLetters;
+    public final int center;
+    public final int length;
+    /** length-center-1 */
+    public final int suffix;
+
+
+    static final int min(int x, int y){return Math.min(x, y);}
+    static final int max(int x, int y){return Math.max(x, y);}
+    //The float versions keep the plain comparison; Math.min/max treat NaN and -0.0 differently.
+    static final float min(float x, float y){if(x<y){return x;} return y;}
+    static final float max(float x, float y){if(x>y){return x;} return y;}
+
+    static final byte[] numberToBase=AminoAcid.numberToBase;
+    static final byte[] numberToBaseExtended=AminoAcid.numberToBaseExtended;
+    static final byte[] baseToNumber=AminoAcid.baseToNumberACGTN;
+    static final byte[] baseToNumberExtended=AminoAcid.baseToNumberExtended;
+
+    /** Background frequencies for single bases (4 entries) */
+    static final float[] baseProb1={0.256614f, 0.226617f, 0.238012f, 0.278756f};
+
+    //Within 200 of exon and gene ends only
+    /** Background frequencies for base pairs (16 entries) */
+    static final float[] baseProb2={
+        0.076019f, 0.046405f, 0.071754f, 0.062437f, 0.067143f, 0.066057f, 0.020333f, 0.073085f,
+        0.060553f, 0.054897f, 0.068741f, 0.053822f, 0.052896f, 0.059260f, 0.077188f, 0.089412f
+    };
+
+    //name: Overall Frequency MP3
+    /** Background frequencies for base triples (64 entries) */
+    static final float[] baseProb3={
+        0.027343f, 0.011857f, 0.018295f, 0.018524f, 0.015942f, 0.012337f, 0.003792f, 0.014333f,
+        0.019988f, 0.015837f, 0.020411f, 0.015518f, 0.014382f, 0.011355f, 0.016466f, 0.020234f,
+        0.014364f, 0.014299f, 0.022875f, 0.015605f, 0.018893f, 0.019412f, 0.006677f, 0.021076f,
+        0.003629f, 0.005854f, 0.006783f, 0.004067f, 0.010491f, 0.018413f, 0.024257f, 0.019924f,
+        0.018029f, 0.010640f, 0.019427f, 0.012458f, 0.015158f, 0.017025f, 0.006167f, 0.016547f,
+        0.018098f, 0.016891f, 0.020042f, 0.013710f, 0.010580f, 0.010773f, 0.018026f, 0.014443f,
+        0.016281f, 0.009609f, 0.011157f, 0.015849f, 0.017150f, 0.017284f, 0.003696f, 0.021130f,
+        0.018839f, 0.016316f, 0.021506f, 0.020527f, 0.017442f, 0.018720f, 0.018440f, 0.034811f
+    };
+
+//  protected static final Hashtable<String, float[]> percentTable=makePercentTable();
+//
+//  private static final Hashtable<String, float[]> makePercentTable(){
+//
+//      String[] keys={
+//              "Exon Stops MP3",
+//      };
+//
+//      float[][] values={
+//              {
+//                  0.00234f, 0.01071f, 0.02476f, 0.05155f, 0.08682f, 0.1453f, 0.22434f, 0.29615f, 0.36233f, 0.41034f,
+//                  0.46028f, 0.52224f, 0.58198f, 0.63879f, 0.68356f, 0.70622f, 0.7268f, 0.75131f, 0.77065f, 0.79546f,
+//                  0.82445f, 0.85279f, 0.86899f, 0.88287f, 0.89197f, 0.90166f, 0.91405f, 0.93129f, 0.94708f, 0.95521f,
+//                  0.96106f, 0.96293f, 0.9663f, 0.97242f, 0.97662f, 0.97866f, 0.98017f, 0.98242f, 0.98459f, 0.98703f,
+//                  0.98957f, 0.99064f, 0.99157f, 0.99286f, 0.9952f, 0.99721f, 0.99858f, 0.99914f, 0.99967f, 0.9999f, 0.99998f
+//              },
+//      };
+//
+//      Hashtable<String, float[]> r= new Hashtable<String, float[]>();
+//      for(int i=0; i<keys.length; i++){
+//          r.put(keys[i], values[i]);
+//      }
+//
+//      return r;
+//  }
+
+    //The inverse tables must be initialized after the tables they are derived from.
+    static final float[] invBaseProb1=invert(baseProb1);
+
+    static final float[] invBaseProb2=invert(baseProb2);
+
+    static final float[] invBaseProb3=invert(baseProb3);
+
+    /** Background tables indexed by tuple length (index 0 is unused) */
+    static final float[][] baseProbN={
+        null,
+        baseProb1,
+        baseProb2,
+        baseProb3
+    };
+
+    /** Inverse background tables indexed by tuple length (index 0 is unused) */
+    static final float[][] invBaseProbN={
+        null,
+        invBaseProb1,
+        invBaseProb2,
+        invBaseProb3
+    };
+
+    /** Returns a new array holding 1/x for every element x of in. */
+    private static final float[] invert(float[] in){
+        final float[] reciprocals=new float[in.length];
+        int idx=0;
+        for(final float value : in){
+            reciprocals[idx]=1f/value;
+            idx++;
+        }
+        return reciprocals;
+    }
+
+    /** Percentile lookup table; null unless a subclass or factory assigns it */
+    protected float[] percentile;
+
+    public abstract int numBases();
+
+    /**
+     * Maps a strength to a percentile using the percentile table.
+     * The table slot is (int)(strength*table.length); the last slot and beyond yield 1.
+     * @param strength Strength to look up
+     * @return The blended value of two neighbouring table entries, or 1
+     * @throws RuntimeException if no percentile table has been assigned
+     */
+    public float percentile(float strength){
+        if(percentile==null){
+            throw new RuntimeException("Can't find percentile array for "+this);
+        }
+
+        final float[] table=percentile;
+        final int slot=(int)(strength*table.length);
+
+        if(slot>=table.length-1){return 1;}
+
+        //Slot 0 blends between 0 and the first entry; other slots blend entry slot and slot+1.
+        final float lower=(slot==0 ? 0 : table[slot]);
+        final float upper=(slot==0 ? table[0] : table[slot+1]);
+
+        //NOTE(review): this weight is the raw remainder of strength, not rescaled by table.length.
+        final float weight=strength-(slot/((float)table.length));
+
+        return weight*upper+(1-weight)*lower;
+    }
+
+}
diff --git a/current/dna/MotifMulti.java b/current/dna/MotifMulti.java
new file mode 100755
index 0000000..89b6c3d
--- /dev/null
+++ b/current/dna/MotifMulti.java
@@ -0,0 +1,59 @@
+package dna;
+import java.util.Arrays;
+
+
+/**
+ * A motif made of several alternative sub-motifs: it matches when any of them
+ * matches, and its strength is the best normalized strength among them.
+ * Length and center are taken from the first sub-motif.
+ */
+public class MotifMulti extends Motif {
+
+    /**
+     * @param name_ Name of the combined motif
+     * @param args Sub-motifs; at least one is required
+     */
+    public MotifMulti(String name_, Motif...args){
+        super(name_, args[0].length, args[0].center);
+        commonLetters=Arrays.toString(args);
+        sub=args;
+    }
+
+
+    /** True if any sub-motif matches exactly at a. */
+    public boolean matchesExactly(byte[] source, int a){
+        for(final Motif candidate : sub){
+            if(candidate.matchesExactly(source, a)){return true;}
+        }
+        return false;
+    }
+
+
+    /** True if any sub-motif matches in extended mode at a. */
+    public boolean matchesExtended(byte[] source, int a){
+        for(final Motif candidate : sub){
+            if(candidate.matchesExtended(source, a)){return true;}
+        }
+        return false;
+    }
+
+    /** Returns the strength unchanged; normalization is applied per sub-motif in matchStrength. */
+    public float normalize(double strength){
+        final float narrowed=(float)strength;
+        return narrowed;
+//      throw new RuntimeException("MotifMulti can't normalize without knowing the submotif.");
+    }
+
+
+    /** Largest normalized strength over all sub-motifs at a; never below 0. */
+    public float matchStrength(byte[] source, int a){
+        float best=0;
+        for(final Motif candidate : sub){
+            final float raw=candidate.matchStrength(source, a);
+            final float normalized=candidate.normalize(raw);
+            best=max(best, normalized);
+        }
+        return best;
+    }
+
+    @Override
+    public int numBases() {
+        final Motif first=sub[0];
+        return first.numBases();
+    }
+
+    /** The alternative sub-motifs */
+    public final Motif[] sub;
+
+}
diff --git a/current/dna/MotifProbsN.java b/current/dna/MotifProbsN.java
new file mode 100755
index 0000000..8fbd059
--- /dev/null
+++ b/current/dna/MotifProbsN.java
@@ -0,0 +1,269 @@
+package dna;
+
+import java.util.Arrays;
+
+/**
+ * A probabilistic motif scored with tuples of N consecutive bases. Each motif position
+ * has one probability per possible tuple; a match strength is the product of the
+ * per-position probabilities after they have been divided by the background
+ * frequencies and sharpened by a per-position importance weight.
+ */
+public class MotifProbsN extends Motif {
+
+    public static void main(String args[]){
+
+        String s2="CTATGCCCATCTGATGGCATGAGGATGAA";
+
+        MotifProbsN m=makeMotif("Exon Stops MP3", 10, 3, 3);
+
+        System.out.println("Made motif "+m.name);
+
+        String source=s2;
+
+
+        int x=m.countExact(source);
+        System.out.println(x+" matches.");
+
+        byte[] sbytes=source.getBytes();
+
+        for(int i=0; i<s2.length(); i++){
+            String sub=s2.substring(i, min(i+m.probs.length, s2.length()));
+            float p=m.matchStrength(sbytes, i);
+            System.out.println(sub+String.format(": \t%.5f ->\t%.5f", p, m.normalize(p)));
+        }
+
+    }
+
+    /**
+     * Builds a motif from the matrix registered under name_.
+     * @param name_ Matrix name to look up via Matrix.get
+     * @param length_ Number of motif positions (rows taken from the matrix)
+     * @param center_ Offset of the center position; also the number of rows taken before the matrix prefix
+     * @param n_ Tuple length; each row is asserted to have 4^n_ columns
+     * @return The motif, with a percentile table attached when a matrix named
+     * "name, length, center" exists
+     */
+    public static MotifProbsN makeMotif(String name_, int length_, int center_, int n_){
+        Matrix mat=Matrix.get(name_);
+        assert(mat!=null) : "\nCan't find '"+name_+"' in:\n"+Matrix.keys()+"\n\n";
+        float[][] sub=mat.subGrid(center_, length_);
+
+        assert(sub[0].length==(1<<(2*n_)));
+
+        MotifProbsN r=new MotifProbsN(name_, sub, center_, n_);
+
+        Matrix percentMatrix=null;
+
+
+        try {
+            percentMatrix=Matrix.get(name_+", "+r.length+", "+r.center);
+        } catch (Exception e) {
+            //The percentile matrix is optional; a failed lookup is deliberately ignored.
+        }
+
+        if(percentMatrix!=null){
+            r.percentile=percentMatrix.grid[0];
+        }
+
+        return r;
+    }
+
+    /**
+     * @param name_ Name of the motif
+     * @param p Probability grid, one row per position and one column per tuple; modified in place
+     * @param cen Offset of the center position
+     * @param n Tuple length; selects the background table Motif.baseProbN[n]
+     */
+    public MotifProbsN(String name_, float[][] p, int cen, int n){
+        super(name_, p.length, cen);
+
+        N=n;
+        chunk=new byte[N];
+        baseProb=Motif.baseProbN[N];
+
+        probs=p;
+        //Importance must be computed from the raw probabilities, before the adjustment below.
+        importance=positionImportance(probs);
+
+        adjustForBaseProb(probs, baseProb);
+
+        double pmin=1, pmax=1;
+
+        double sum=0;
+        for(int i=0; i<p.length; i++){
+            for(int j=0; j<p[i].length; j++){
+                sum+=p[i][j];
+            }
+        }
+        matrixAvg=(float)(sum/(p.length*p[0].length));
+
+
+        //Adjusts for importance
+        for(int i=0; i<probs.length; i++){
+            for(int j=0; j<probs[i].length; j++){
+                probs[i][j]=(float)Math.pow(probs[i][j], 1+(importance[i]*.8));
+            }
+        }
+
+
+        //Collect the most likely letter per position and the extreme achievable products.
+        StringBuilder sb=new StringBuilder();
+        for(int i=0; i<probs.length; i++){
+            int x=maxPos(probs[i]);
+            int y=minPos(probs[i]);
+            sb.append((char)numberToBase[x>>(2*(N-1))]);
+
+            pmax*=probs[i][x];
+            pmin*=probs[i][y];
+        }
+
+
+        maxProb=(float)pmax;
+        minProb=(float)pmin;
+
+        invProbDif=1f/(maxProb-minProb);
+        invLength=1f/(length);
+
+        commonLetters=sb.toString();
+
+        lettersUpper=commonLetters.toUpperCase().getBytes();
+        lettersLower=commonLetters.toLowerCase().getBytes();
+
+        numbers=new byte[commonLetters.length()];
+        numbersExtended=new byte[commonLetters.length()];
+
+        for(int i=0; i<lettersUpper.length; i++){
+            byte b=lettersUpper[i];
+            numbers[i]=baseToNumber[b];
+            numbersExtended[i]=baseToNumberExtended[b];
+        }
+
+    }
+
+
+    /** Divides every grid column j by base[j], in place. */
+    public void adjustForBaseProb(float[][] grid, float[] base){
+        for(int i=0; i<grid.length; i++){
+            for(int j=0; j<grid[i].length; j++){
+                grid[i][j]/=base[j];
+            }
+        }
+    }
+
+
+    /** Rescales strength by minProb and maxProb, then raises the result to the power 1/length. */
+    public float normalize(double strength){
+        double r=strength-minProb;
+        r=r*invProbDif;
+        r=Math.pow(r, invLength);
+        return (float)r;
+    }
+
+
+    /** Log-scale alternative: (ln strength - ln minProb)/(ln maxProb - ln minProb). */
+    public float normalize2(double strength){
+        double r=Math.log(strength)-Math.log(minProb);
+
+        double r2=Math.log(maxProb)-Math.log(minProb);
+
+        r=r/r2;
+        return (float)r;
+    }
+
+
+    /** True if the most likely letters (either case) occur at a-center onward. */
+    public boolean matchesExactly(byte[] source, int a){
+
+        a=a-center;
+        if(a<0 || a+length>source.length){return false;}
+
+        for(int i=0; i<lettersUpper.length; i++){
+            int x=i+a;
+            if(source[x]!=lettersUpper[i] && source[x]!=lettersLower[i]){
+                return false;
+            }
+        }
+        return true;
+    }
+
+
+    /**
+     * Product of the per-position tuple probabilities for the motif centered at a.
+     * Returns minProb when the motif (including the N-base look-ahead of the last
+     * position) does not fit inside source, or when a tuple cannot be encoded.
+     * Uses a shared scratch buffer, so one instance must not be scored concurrently.
+     * @param source Sequence to score
+     * @param a Position of the motif center in source
+     * @return The match strength
+     */
+    public float matchStrength(byte[] source, int a){
+
+        a=a-center;
+        //The last position reads source[a+length-1 .. a+length+N-2]. The original margin of
+        //1 is kept for N<=2; for larger N it was too small and the read ran past the array.
+        final int reach=length+max(1, N-1);
+        if(a<0 || a+reach>source.length){return minProb;}
+
+        float r=1;
+
+        for(int i=0; i<probs.length; i++){
+            int x=i+a;
+
+            for(int c=0; c<N; c++){
+                chunk[c]=source[x+c];
+            }
+
+            int n=AminoAcid.baseTupleToNumber(chunk);
+            //Valid tuple codes are 0..baseProb.length-1 (the old check let baseProb.length through).
+            if(n<0 || n>=baseProb.length){return minProb;}
+
+            float p1=probs[i][n]; //Note: Assumes (A,C,G,T) only.
+
+            r=r*p1;
+        }
+        return r;
+    }
+
+
+    /**
+     * Scores how strongly each position deviates from the background frequencies:
+     * sum of |p-base|^1.5 over all tuples, raised to 0.75, then scaled so the largest
+     * position has importance 1.
+     * @param rawProbs Unadjusted probability grid
+     * @return One importance value per position
+     */
+    public float[] positionImportance(float[][] rawProbs){
+        float[] base=baseProb;
+        float[] out=new float[rawProbs.length];
+
+        double maxSum=0;
+
+        for(int i=0; i<out.length; i++){
+            float[] array=rawProbs[i];
+            double sum=0;
+            for(int code=0; code<array.length; code++){
+                double dif=Math.abs(array[code]-base[code]);
+                sum+=Math.pow(dif,1.5); //Raise to a power to increase the effect
+            }
+            sum=Math.pow(sum, 0.75);
+            out[i]=(float)sum;
+            if(sum>maxSum){
+                maxSum=sum;
+            }
+        }
+
+        for(int i=0; i<out.length; i++){
+            out[i]=(float)(out[i]/maxSum);
+        }
+
+        return out;
+    }
+
+    @Override
+    public int numBases() {
+        return N;
+    }
+
+    /** Tuple length: number of consecutive bases scored per position */
+    public final int N;
+
+    /** Adjusted probabilities, indexed [position][tuple code] */
+    public final float[][] probs;
+    /** Per-position importance, scaled so the maximum is 1 */
+    public final float[] importance;
+    /** Mean of the grid after background adjustment, before importance weighting */
+    public final float matrixAvg;
+
+    public final byte[] lettersUpper;
+    public final byte[] lettersLower;
+    public final byte[] numbers;
+    public final byte[] numbersExtended;
+
+    /** Scratch buffer for the tuple being scored */
+    private final byte[] chunk;
+    /** Background frequencies for tuples of length N */
+    private final float[] baseProb;
+
+    /** Product of the per-position maxima */
+    public float maxProb;
+    /** Product of the per-position minima */
+    public float minProb;
+
+    /** 1/(maxProb-minProb) */
+    public final float invProbDif;
+    /** 1/length */
+    public final float invLength;
+
+}
diff --git a/current/dna/MotifSimple.java b/current/dna/MotifSimple.java
new file mode 100755
index 0000000..0ea0395
--- /dev/null
+++ b/current/dna/MotifSimple.java
@@ -0,0 +1,94 @@
+package dna;
+
+/**
+ * A motif defined by a literal string of base letters. Exact matching compares
+ * letters case-insensitively; extended matching compares the numeric codes from
+ * the extended base table bitwise.
+ */
+public class MotifSimple extends Motif {
+
+    public static void main(String args[]){
+
+        String s1="ATN";
+        String s2="ATGCCCATCTGATG";
+
+        if(args.length>0){s1=args[0];}
+        if(args.length>1){s2=args[1];}
+
+        MotifSimple m=new MotifSimple(s1, 0);
+        String source=s2;
+
+
+        int x=m.countExtended(source);
+        System.out.println(x+" matches.");
+    }
+
+    /**
+     * @param s Letters of the motif; also used as its name
+     * @param cen Offset of the center position within s
+     */
+    public MotifSimple(String s, int cen){
+        super(s, s.length(), cen);
+
+        commonLetters=s;
+        lettersUpper=commonLetters.toUpperCase().getBytes();
+        lettersLower=commonLetters.toLowerCase().getBytes();
+
+        //The motif is "extended" as soon as any letter is not one of A, C, G, T.
+        boolean sawOtherLetter=false;
+        for(final byte letter : lettersUpper){
+            final boolean plain=(letter=='A' || letter=='C' || letter=='G' || letter=='T');
+            if(!plain){sawOtherLetter=true;}
+        }
+        extended=sawOtherLetter;
+
+        numbers=new byte[s.length()];
+        numbersExtended=new byte[s.length()];
+
+        for(int idx=0; idx<lettersUpper.length; idx++){
+            final byte letter=lettersUpper[idx];
+            numbers[idx]=baseToNumber[letter];
+            numbersExtended[idx]=baseToNumberExtended[letter];
+        }
+    }
+
+
+    /**
+     * True if the motif letters (either case) occur in source with the center at a.
+     * Asserts that the motif is not extended.
+     */
+    public boolean matchesExactly(byte[] source, int a){
+        assert(!extended);
+
+        final int start=a-center;
+        if(start<0 || start+length>source.length){return false;}
+
+        for(int idx=0; idx<lettersUpper.length; idx++){
+            final byte found=source[start+idx];
+            final boolean same=(found==lettersUpper[idx] || found==lettersLower[idx]);
+            if(!same){return false;}
+        }
+        return true;
+    }
+
+
+    /**
+     * True if, at every position, the extended code of the source letter has no bits
+     * outside the extended code of the motif letter. The center is placed at a.
+     */
+    public boolean matchesExtended(byte[] source, int a){
+
+        final int start=a-center;
+        if(start<0 || start+length>source.length){return false;}
+
+        for(int idx=0; idx<lettersUpper.length; idx++){
+            final byte found=source[start+idx];
+            final byte code=baseToNumberExtended[found];
+
+            if((code&numbersExtended[idx])!=code){return false;}
+        }
+        return true;
+    }
+
+    @Override
+    public int numBases() {
+        return numbers.length;
+    }
+
+
+    public final byte[] lettersUpper;
+    public final byte[] lettersLower;
+    public final byte[] numbers;
+    public final byte[] numbersExtended;
+
+    /** True if the motif contains any letter other than A, C, G, T */
+    public final boolean extended;
+
+}
diff --git a/current/dna/Parser.java b/current/dna/Parser.java
new file mode 100755
index 0000000..a1e519d
--- /dev/null
+++ b/current/dna/Parser.java
@@ -0,0 +1,1000 @@
+package dna;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import jgi.CalcTrueQuality;
+import kmer.AbstractKmerTable;
+
+import stream.ConcurrentDepot;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.KillSwitch;
+import stream.Read;
+import stream.ReadStreamByteWriter;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 21, 2014
+ *
+ */
+public class Parser {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** No-argument constructor; performs no work of its own. */
+ public Parser(){}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Offers one command-line argument to every sub-parser in a fixed order and
+  * stops at the first one that recognizes it.
+  * Java VM flags (see isJavaFlag) are accepted and ignored.
+  * The static sub-parsers are tried before the instance ones.
+  * @param arg The raw argument.
+  * @param a Key part of the argument (presumably lower-cased text before '=').
+  * @param b Value part of the argument (presumably text after '='); may be null.
+  * @return true if the argument was a Java flag or was consumed by a sub-parser.
+  */
+ public boolean parse(String arg, String a, String b){
+     if(isJavaFlag(arg)){return true;}
+
+     // Static sub-parsers; || short-circuits, so evaluation order is preserved.
+     final boolean handledStatically=parseQuality(arg, a, b)
+             || parseZip(arg, a, b)
+             || parseSam(arg, a, b)
+             || parseFasta(arg, a, b)
+             || parseCommonStatic(arg, a, b)
+             || parseHist(arg, a, b)
+             || parseQualityAdjust(arg, a, b);
+     if(handledStatically){return true;}
+
+     // Instance sub-parsers, which store results in this Parser's fields.
+     return parseFiles(arg, a, b)
+             || parseCommon(arg, a, b)
+             || parseTrim(arg, a, b)
+             || parseInterleaved(arg, a, b)
+             || parseMapping(arg, a, b)
+             || parseCardinality(arg, a, b);
+ }
+
+ /**
+  * Handles general flags: read limit, subsampling, append/overwrite,
+  * testsize, break length and quality recalibration.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public boolean parseCommon(String arg, String a, String b){
+     if(a.equals("reads") || a.equals("maxreads")){
+         maxReads=Tools.parseKMG(b);
+         return true;
+     }
+     if(a.equals("samplerate")){
+         samplerate=Float.parseFloat(b);
+         assert(samplerate<=1f && samplerate>=0f) : "samplerate="+samplerate+"; should be between 0 and 1";
+         return true;
+     }
+     if(a.equals("sampleseed")){
+         sampleseed=Long.parseLong(b);
+         return true;
+     }
+     if(a.equals("append") || a.equals("app")){
+         // The setting is mirrored into ReadStats, which is written first.
+         final boolean value=Tools.parseBoolean(b);
+         ReadStats.append=value;
+         append=value;
+         return true;
+     }
+     if(a.equals("overwrite") || a.equals("ow")){
+         overwrite=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("testsize")){
+         testsize=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("breaklen") || a.equals("breaklength")){
+         breakLength=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("recalibrate") || a.equals("recalibratequality") || a.equals("recal")){
+         recalibrateQuality=Tools.parseBoolean(b);
+         return true;
+     }
+     return false;
+ }
+
+ /**
+  * Handles the loglog/cardinality flags and stores them in the loglog* fields.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public boolean parseCardinality(String arg, String a, String b){
+     if(a.equals("cardinality") || a.equals("loglog")){
+         loglog=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("buckets") || a.equals("loglogbuckets")){
+         loglogbuckets=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("loglogbits")){
+         loglogbits=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("loglogk")){
+         loglogk=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("loglogseed")){
+         loglogseed=Long.parseLong(b);
+         return true;
+     }
+     return false;
+ }
+
+ /**
+  * Handles flags that control interleaved-input detection. Most branches
+  * write static FASTQ settings and record in setInterleaved that the user
+  * made an explicit choice; "interleaved=auto" does not set setInterleaved.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public boolean parseInterleaved(String arg, String a, String b){
+     if(a.equals("testinterleaved")){
+         FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b);
+         System.err.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED);
+         setInterleaved=true;
+         return true;
+     }
+     if(a.equals("forceinterleaved")){
+         FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b);
+         System.err.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+         setInterleaved=true;
+         return true;
+     }
+     if(a.equals("interleaved") || a.equals("int")){
+         if("auto".equalsIgnoreCase(b)){
+             // Auto: test for interleaving but do not force it.
+             FASTQ.TEST_INTERLEAVED=true;
+             FASTQ.FORCE_INTERLEAVED=false;
+         }else{
+             final boolean value=Tools.parseBoolean(b);
+             FASTQ.TEST_INTERLEAVED=value;
+             FASTQ.FORCE_INTERLEAVED=value;
+             System.err.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+             setInterleaved=true;
+         }
+         return true;
+     }
+     if(a.equals("overrideinterleaved")){
+         final boolean ignore=Tools.parseBoolean(b);
+         ReadStreamByteWriter.ignorePairAssertions=ignore;
+         if(ignore){setInterleaved=true;}
+         return true;
+     }
+     return false;
+ }
+
+ /**
+  * Handles trimming and read-filtering flags: forced trimming, quality
+  * trimming (including window and optimal modes), barcode and chastity
+  * filters, length/GC/N/quality thresholds, untrim and tossjunk.
+  * Results go to this Parser's fields and to static TrimRead/Read settings.
+  *
+  * Fixes relative to the imported version:
+  * - "qtrim=window" / "qtrim=w" without a length no longer throws
+  *   ArrayIndexOutOfBoundsException (the guard tested b.length() instead of split.length).
+  * - The maxgc assertion now validates maxGC (it validated minGC).
+  * - "forcetrimmodulo" is accepted; the misspelled "forcemrimmodulo" is kept for compatibility.
+  *
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value; may be null for some flags (e.g. a bare "qtrim").
+  * @return true if the flag was recognized.
+  */
+ public boolean parseTrim(String arg, String a, String b){
+
+     // Pair-specific aliases: record which trim set is targeted, then continue
+     // below as the generic flag name.
+     if(a.equals("qtrim1")){
+         if(b!=null && ("f".equalsIgnoreCase(b) || "false".equalsIgnoreCase(b))){qtrim1=false;}
+         else{
+             qtrim1=true;
+             qtrim2=false;
+         }
+         a="qtrim";
+     }else if(a.equals("qtrim2")){
+         if(b!=null && ("f".equalsIgnoreCase(b) || "false".equalsIgnoreCase(b))){qtrim2=false;}
+         else{
+             qtrim2=true;
+             qtrim1=false;
+         }
+         a="qtrim";
+     }else if(a.equals("trimq2")){
+         if(b!=null && ("f".equalsIgnoreCase(b) || "false".equalsIgnoreCase(b))){qtrim2=false;}
+         else{
+             qtrim2=true;
+             qtrim1=false;
+         }
+         a="trimq";
+     }
+
+     if(a.equals("forcetrimmod") || a.equals("forcetrimmodulo") || a.equals("forcemrimmodulo") || a.equals("ftm")){
+         forceTrimModulo=Integer.parseInt(b);
+     }else if(a.equals("ftl") || a.equals("forcetrimleft")){
+         forceTrimLeft=Integer.parseInt(b);
+     }else if(a.equals("ftr") || a.equals("forcetrimright")){
+         forceTrimRight=Integer.parseInt(b);
+     }else if(a.equals("ftr2") || a.equals("forcetrimright2")){
+         forceTrimRight2=Integer.parseInt(b);
+     }else if(a.equals("qtrim")/* || a.equals("trim")*/){
+         if(b==null || b.length()==0){qtrimRight=qtrimLeft=true;}
+         else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrimLeft=true;qtrimRight=false;}
+         else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrimLeft=false;qtrimRight=true;}
+         else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrimLeft=qtrimRight=true;}
+         else if(b.equalsIgnoreCase("window") || b.equalsIgnoreCase("w") || b.startsWith("window,") || b.startsWith("w,")){
+             qtrimLeft=false;
+             qtrimRight=true;
+             TrimRead.windowMode=true;
+             TrimRead.optimalMode=false;
+             String[] split=b.split(",");
+             // Only "window,<n>" carries a length; a bare "window" keeps the current windowLength.
+             if(split.length>1){
+                 TrimRead.windowLength=Integer.parseInt(split[1]);
+             }
+         }else if(Character.isDigit(b.charAt(0))){
+             // A numeric value is treated as a trim quality and enables right trimming.
+             parseTrimq(a, b);
+             qtrimRight=true;
+         }else{qtrimRight=qtrimLeft=Tools.parseBoolean(b);}
+     }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){
+         if(b!=null && (b.charAt(0)=='.' || Character.isDigit(b.charAt(0)))){
+             TrimRead.optimalMode=true;
+             TrimRead.optimalBias=Float.parseFloat(b);
+             assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1);
+         }else{
+             TrimRead.optimalMode=Tools.parseBoolean(b);
+         }
+     }else if(a.equals("trimgoodinterval")){
+         TrimRead.minGoodInterval=Integer.parseInt(b);
+     }else if(a.equals("trimright") || a.equals("qtrimright")){
+         qtrimRight=Tools.parseBoolean(b);
+     }else if(a.equals("trimleft") || a.equals("qtrimleft")){
+         qtrimLeft=Tools.parseBoolean(b);
+     }else if(a.equals("trimq") || a.equals("trimquality")){
+         // "trimq2" also arrives here; it was renamed to "trimq" above.
+         parseTrimq(a, b);
+     }else if(a.equals("trimbadsequence")){
+         trimBadSequence=Tools.parseBoolean(b);
+     }else if(a.equals("chastityfilter") || a.equals("cf")){
+         chastityFilter=Tools.parseBoolean(b);
+     }else if(a.equals("failnobarcode")){
+         failIfNoBarcode=Tools.parseBoolean(b);
+     }else if(a.equals("badbarcodes") || a.equals("barcodefilter")){
+         if(b!=null && (b.equalsIgnoreCase("crash") || b.equalsIgnoreCase("fail"))){
+             failBadBarcodes=true;
+             removeBadBarcodes=true;
+         }else{
+             removeBadBarcodes=Tools.parseBoolean(b);
+             failBadBarcodes=false;
+         }
+     }else if(a.equals("barcodes") || a.equals("barcode")){
+         if(b==null || b.length()<1){
+             barcodes=null;
+         }else{
+             barcodes=new HashSet<String>();
+             for(String s : b.split(",")){
+                 Tools.addNames(s, barcodes, false);
+             }
+         }
+         // Supplying barcodes implies filtering unless a barcode mode was already chosen.
+         if(barcodes!=null && barcodes.size()>0 && !failBadBarcodes && !removeBadBarcodes){
+             removeBadBarcodes=true;
+         }
+     }else if(a.equals("requirebothbad") || a.equals("rbb")){
+         requireBothBad=Tools.parseBoolean(b);
+     }else if(a.equals("removeifeitherbad") || a.equals("rieb")){
+         requireBothBad=!Tools.parseBoolean(b);
+     }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){
+         minReadLength=(int)Tools.parseKMG(b);
+     }else if(a.equals("maxlength") || a.equals("maxreadlength") || a.equals("maxreadlen") || a.equals("maxlen")){
+         maxReadLength=(int)Tools.parseKMG(b);
+     }else if(a.equals("mingc")){
+         minGC=Float.parseFloat(b);
+//         if(minGC>0){filterGC=true;}
+         assert(minGC>=0 && minGC<=1) : "mingc should be a decimal number between 0 and 1, inclusive.";
+     }else if(a.equals("maxgc")){
+         maxGC=Float.parseFloat(b);
+//         if(maxGC<1){filterGC=true;}
+         assert(maxGC>=0 && maxGC<=1) : "maxgc should be a decimal number between 0 and 1, inclusive.";
+     }else if(a.equals("mlf") || a.equals("minlenfrac") || a.equals("minlenfraction") || a.equals("minlengthfraction")){
+         minLenFraction=Float.parseFloat(b);
+     }else if(a.equals("maxns")){
+         maxNs=Integer.parseInt(b);
+     }else if(a.equals("minconsecutivebases") || a.equals("mcb")){
+         minConsecutiveBases=Integer.parseInt(b);
+     }else if(a.equals("minavgquality") || a.equals("maq")){
+         // Accepts "quality" or "quality,bases".
+         if(b.indexOf(',')>-1){
+             String[] split=b.split(",");
+             assert(split.length==2) : "maq should be length 1 or 2 (at most 1 comma).\nFormat: maq=quality,bases; e.g. maq=10 or maq=10,20";
+             minAvgQuality=Byte.parseByte(split[0]);
+             minAvgQualityBases=Integer.parseInt(split[1]);
+         }else{
+             minAvgQuality=Byte.parseByte(b);
+         }
+     }else if(a.equals("minavgqualitybases") || a.equals("maqb")){
+         minAvgQualityBases=Integer.parseInt(b);
+     }else if(a.equals("averagequalitybyprobability") || a.equals("aqbp")){
+         Read.AVERAGE_QUALITY_BY_PROBABILITY=Tools.parseBoolean(b);
+     }else if(a.equals("mintl") || a.equals("mintrimlen") || a.equals("mintrimlength")){
+         minTrimLength=Integer.parseInt(b);
+     }else if(a.equals("untrim")){
+         untrim=Tools.parseBoolean(b);
+     }else if(a.equals("tossjunk")){
+         boolean x=Tools.parseBoolean(b);
+         tossJunk=x;
+         // Enabling tossjunk also turns on junk flagging; disabling leaves FLAG_JUNK unchanged.
+         if(x){Read.FLAG_JUNK=x;}
+     }else{
+         return false;
+     }
+     return true;
+ }
+
+ /**
+  * Stores the trim quality threshold(s). A single value sets trimq and
+  * clears trimq2; a comma-separated list fills trimq2 and copies its first
+  * entry into trimq.
+  * @param a Flag name (unused).
+  * @param b One value, or several separated by commas.
+  */
+ private void parseTrimq(String a, String b){
+     if(b.indexOf(',')<0){
+         trimq=Byte.parseByte(b);
+         trimq2=null;
+         return;
+     }
+
+     final String[] parts=b.split(",");
+     trimq2=new byte[parts.length];
+     for(int idx=0; idx<parts.length; idx++){
+         trimq2[idx]=Byte.parseByte(parts[idx]);
+     }
+     trimq=(trimq2.length>0 ? trimq2[0] : 0);
+//     assert(false) : Arrays.toString(trimq2);
+ }
+
+ /**
+  * Handles input/output file flags. Every output-type flag also sets setOut.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b File name or extension.
+  * @return true if the flag was recognized.
+  */
+ public boolean parseFiles(String arg, String a, String b){
+     if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+         in1=b;
+         return true;
+     }
+     if(a.equals("in2") || a.equals("input2")){
+         in2=b;
+         return true;
+     }
+     if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+         out1=b;
+         setOut=true;
+         return true;
+     }
+     if(a.equals("out2") || a.equals("output2")){
+         out2=b;
+         setOut=true;
+         return true;
+     }
+     if(a.equals("qfin") || a.equals("qfin1")){
+         qfin1=b;
+         return true;
+     }
+     if(a.equals("qfout") || a.equals("qfout1")){
+         qfout1=b;
+         setOut=true;
+         return true;
+     }
+     if(a.equals("qfin2")){
+         qfin2=b;
+         return true;
+     }
+     if(a.equals("qfout2")){
+         qfout2=b;
+         setOut=true;
+         return true;
+     }
+     if(a.equals("extin")){
+         extin=b;
+         return true;
+     }
+     if(a.equals("extout")){
+         extout=b;
+         return true;
+     }
+     if(a.equals("outsingle") || a.equals("outs")){
+         outsingle=b;
+         setOut=true;
+         return true;
+     }
+     return false;
+ }
+
+ /**
+  * Handles mapping-related filter flags and the genome build number.
+  * An idfilter above 1 is divided by 100 (treated as a percentage).
+  * The build flag is also written to Data.GENOME_BUILD.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public boolean parseMapping(String arg, String a, String b){
+     if(a.equals("idfilter") || a.equals("identityfilter")){
+         idFilter=Float.parseFloat(b);
+         if(idFilter>1f){idFilter/=100;}
+         assert(idFilter<=1f) : "idfilter should be between 0 and 1.";
+         return true;
+     }
+     if(a.equals("subfilter")){
+         subfilter=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("delfilter")){
+         delfilter=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("insfilter")){
+         insfilter=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("indelfilter")){
+         indelfilter=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("dellenfilter")){
+         dellenfilter=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("inslenfilter")){
+         inslenfilter=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("editfilter")){
+         editfilter=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("build") || a.equals("genome")){
+         build=Integer.parseInt(b);
+         Data.GENOME_BUILD=build;
+         return true;
+     }
+     return false;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Expands "config=file1,file2,..." arguments. Each config file is read line
+  * by line; trimmed, non-empty lines that do not start with '#' are inserted
+  * as arguments in place of the config flag. All other arguments are kept
+  * in their original order.
+  *
+  * Fixes relative to the imported version:
+  * - Ordinary arguments are now kept. The old test was inverted and kept
+  *   only the literal "null" placeholders, discarding every real argument
+  *   whenever a config flag was present.
+  * - "config" without a value (or "config=null") is skipped instead of
+  *   throwing a NullPointerException.
+  * - The error message names the file that failed, and the original
+  *   Throwable is attached as the cause.
+  * - The config file is closed even if reading it fails.
+  *
+  * @param args Command-line arguments; entries may be null.
+  * @return args itself if no entry starts with "config=" (case-insensitive);
+  * otherwise a new array with config files expanded and null/"null" entries dropped.
+  * @throws RuntimeException if a config file cannot be read.
+  */
+ public static String[] parseConfig(String[] args){
+     boolean found=false;
+     for(String s : args){
+         if(s!=null && s.toLowerCase().startsWith("config=")){
+             found=true;
+             break;
+         }
+     }
+     if(!found){return args;}
+
+     ArrayList<String> list=new ArrayList<String>();
+     for(int i=0; i<args.length; i++){
+         final String arg=(args[i]==null ? "null" : args[i]);
+         final String[] split=arg.split("=");
+         final String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if("null".equalsIgnoreCase(b)){b=null;}
+
+         if(a.equals("config")){
+             if(b!=null){
+                 for(String bb : b.split(",")){
+                     try{
+                         TextFile tf=new TextFile(bb);
+                         try{
+                             for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+                                 String line2=line.trim();
+                                 if(line2.length()>0 && !line2.startsWith("#")){
+                                     list.add(line2);
+                                 }
+                             }
+                         }finally{
+                             tf.close();
+                         }
+                     }catch(Throwable t){
+                         throw new RuntimeException("Could not process config file "+bb+"\nCaused by:\n"+t.toString()+"\n", t);
+                     }
+                 }
+             }
+         }else if(!"null".equals(arg)){
+             // Keep every real argument; only null placeholders are dropped.
+             list.add(arg);
+         }
+     }
+     return list.toArray(new String[list.size()]);
+ }
+
+ /**
+ * Handles flags whose effect is global: each recognized flag writes a static
+ * setting on another class (Shared, Read, FASTQ, ByteFile, SamLine, ...)
+ * or launches the KillSwitch monitor. No Parser instance state is touched.
+ * @param arg The raw argument (unused here).
+ * @param a Flag name.
+ * @param b Flag value.
+ * @return true if the flag was recognized.
+ */
+ public static boolean parseCommonStatic(String arg, String a, String b){
+ if(a.equals("null")){
+ //Do nothing
+ }else if(a.equals("monitor") || a.equals("killswitch")){
+ // A value accepted by Tools.isNumber is split on commas into one or two
+ // numeric KillSwitch arguments; any other value is treated as a boolean switch.
+ if(Tools.isNumber(b)){
+ String[] pair=b.split(",");
+ if(pair.length==1){
+ KillSwitch.launch(Double.parseDouble(pair[0]));
+ }else{
+ assert(pair.length==2) : "monitor takes one or two arguments, like this: monitor=600,0.002";
+ KillSwitch.launch(Double.parseDouble(pair[0]), Double.parseDouble(pair[1]));
+ }
+ }else if(Tools.parseBoolean(b)){
+ KillSwitch.launch();
+ }
+ }else if(a.equals("trd") || a.equals("trc") || a.equals("trimreaddescription") || a.equals("trimreaddescriptions")){
+ Shared.TRIM_READ_COMMENTS=Tools.parseBoolean(b);
+ }else if(a.equals("tuc") || a.equals("touppercase")){
+ Read.TO_UPPER_CASE=Tools.parseBoolean(b);
+ }else if(a.equals("lctn") || a.equals("lowercaseton")){
+ Read.LOWER_CASE_TO_N=Tools.parseBoolean(b);
+ }else if(a.equals("changequality") || a.equals("cq")){
+ Read.CHANGE_QUALITY=Tools.parseBoolean(b);
+ }else if(a.equals("tossbrokenreads") || a.equals("tbr")){
+ // One flag drives two settings.
+ boolean x=Tools.parseBoolean(b);
+ Read.NULLIFY_BROKEN_QUALITY=x;
+ ConcurrentReadInputStream.REMOVE_DISCARDED_READS=x;
+ }else if(a.equals("flagjunk")){
+ boolean x=Tools.parseBoolean(b);
+ Read.FLAG_JUNK=x;
+ }else if(a.equals("fixjunk")){
+ boolean x=Tools.parseBoolean(b);
+ Read.FIX_JUNK=x;
+ }else if(a.equals("bf1")){
+ // bf1 and bf2 are mutually exclusive: setting one forces the other to its inverse.
+ ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b);
+ ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1;
+ }else if(a.equals("utot")){
+ Read.U_TO_T=Tools.parseBoolean(b);
+ }else if(a.equals("bf2")){
+ ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b);
+ ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2;
+ }else if(a.equals("usejni") || a.equals("jni")){
+ Shared.USE_JNI=Tools.parseBoolean(b);
+ }else if(a.equals("usempi") || a.equals("mpi")){
+ // A leading digit means a rank count (MPI is on when the count is positive); otherwise a boolean.
+ if(b!=null && Character.isDigit(b.charAt(0))){
+ Shared.MPI_NUM_RANKS=Integer.parseInt(b);
+ Shared.USE_MPI=Shared.MPI_NUM_RANKS>0;
+ }else{
+ Shared.USE_MPI=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("crismpi")){
+ Shared.USE_CRISMPI=Tools.parseBoolean(b);
+ }else if(a.equals("mpikeepall")){
+ Shared.MPI_KEEP_ALL=Tools.parseBoolean(b);
+ }else if(a.equals("readbufferlength") || a.equals("readbufferlen")){
+ Shared.READ_BUFFER_LENGTH=(int)Tools.parseKMG(b);
+ }else if(a.equals("readbufferdata")){
+ Shared.READ_BUFFER_MAX_DATA=(int)Tools.parseKMG(b);
+ }else if(a.equals("readbuffers")){
+ Shared.setBuffers(Integer.parseInt(b));
+ }else if(a.equals("rbm") || a.equals("renamebymapping")){
+ FASTQ.TAG_CUSTOM=Tools.parseBoolean(b);
+ }else if(a.equals("don") || a.equals("deleteoldname")){
+ FASTQ.DELETE_OLD_NAME=Tools.parseBoolean(b);
+ }else if(a.equals("assertcigar")){
+ ReadStreamWriter.ASSERT_CIGAR=Tools.parseBoolean(b);
+ }else if(a.equals("verbosesamline")){
+ SamLine.verbose=Tools.parseBoolean(b);
+ }else if(a.equals("parsecustom") || a.equals("fastqparsecustom")){
+ FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b);
+ System.err.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM);
+ }else if(a.equals("fairqueues")){
+ ConcurrentDepot.fair=Tools.parseBoolean(b);
+ }else if(a.equals("fixheader") || a.equals("fixheaders")){
+ Read.FIX_HEADER=Tools.parseBoolean(b);
+ }else if(a.equals("aminoin")){
+ //Note - ensure changes to this do not conflict with TranslateSixFrames "aain" flag.
+ Shared.AMINO_IN=Tools.parseBoolean(b);
+ }else if(a.equals("maxcalledquality")){
+ // Passed through Tools.mid with bounds 1 and 93 before the narrowing cast.
+ int x=Tools.mid(1, Integer.parseInt(b), 93);
+ Read.MAX_CALLED_QUALITY=(byte)x;
+ }else if(a.equals("mincalledquality")){
+ // Passed through Tools.mid with bounds 0 and 93 before the narrowing cast.
+ int x=Tools.mid(0, Integer.parseInt(b), 93);
+ Read.MIN_CALLED_QUALITY=(byte)x;
+ }else if(a.equals("t") || a.equals("threads")){
+ Shared.setThreads(b);
+ System.err.println("Set threads to "+Shared.threads());
+ }else if(a.equals("recalpairnum") || a.equals("recalibratepairnum")){
+ CalcTrueQuality.USE_PAIRNUM=Tools.parseBoolean(b);
+ }
+// else if(a.equals("aminoout")){
+// Shared.AMINO_OUT=Tools.parseBoolean(b);
+// }
+ else{
+ return false;
+ }
+ return true;
+ }
+
+ /**
+  * Handles quality-encoding flags: ASCII offsets for input and output
+  * ("sanger" is 33, "illumina" is 64, "auto" is -1 plus detection), bad-quality
+  * tolerance, and fake quality values for FASTA input.
+  * Always sets parsedQuality so callers can verify this method was invoked.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public static boolean parseQuality(String arg, String a, String b){
+     parsedQuality=true; //For internal verification that this function was indeed called.
+
+     if(a.equals("ignorebadquality") || a.equals("ibq")){
+         FASTQ.IGNORE_BAD_QUALITY=Tools.parseBoolean(b);
+         if(FASTQ.IGNORE_BAD_QUALITY){Read.CHANGE_QUALITY=false;}
+         return true;
+     }
+
+     if(a.equals("ascii") || a.equals("asciioffset") || a.equals("quality") || a.equals("qual")){
+         // Applies to both input and output.
+         final byte offset;
+         if(b.equalsIgnoreCase("sanger")){
+             offset=33;
+         }else if(b.equalsIgnoreCase("illumina")){
+             offset=64;
+         }else if(b.equalsIgnoreCase("auto")){
+             offset=-1;
+             FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;
+         }else{
+             offset=(byte)Integer.parseInt(b);
+         }
+         qin=qout=offset;
+         return true;
+     }
+
+     if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){
+         final byte offset;
+         if(b.equalsIgnoreCase("sanger")){
+             offset=33;
+         }else if(b.equalsIgnoreCase("illumina")){
+             offset=64;
+         }else if(b.equalsIgnoreCase("auto")){
+             offset=-1;
+             FASTQ.DETECT_QUALITY=true;
+         }else{
+             offset=(byte)Integer.parseInt(b);
+         }
+         qin=offset;
+         return true;
+     }
+
+     if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){
+         final byte offset;
+         if(b.equalsIgnoreCase("sanger")){
+             offset=33;
+         }else if(b.equalsIgnoreCase("illumina")){
+             offset=64;
+         }else if(b.equalsIgnoreCase("auto")){
+             offset=-1;
+             FASTQ.DETECT_QUALITY_OUT=true;
+         }else{
+             offset=(byte)Integer.parseInt(b);
+         }
+         qout=offset;
+         return true;
+     }
+
+     if(a.equals("fakequality") || a.equals("qfake")){
+         Shared.FAKE_QUAL=Byte.parseByte(b);
+         return true;
+     }
+
+     if(a.equals("fakefastaqual") || a.equals("fakefastaquality") || a.equals("ffq")){
+         // A missing value means "false"; a number below 1 disables, otherwise it
+         // enables fake quality and sets the value, capped at 50.
+         if(b==null || b.length()<1){b="f";}
+         if(Character.isLetter(b.charAt(0))){
+             FastaReadInputStream.FAKE_QUALITY=Tools.parseBoolean(b);
+         }else{
+             final int quality=Integer.parseInt(b);
+             if(quality<1){
+                 FastaReadInputStream.FAKE_QUALITY=false;
+             }else{
+                 FastaReadInputStream.FAKE_QUALITY=true;
+                 Shared.FAKE_QUAL=(byte)Tools.min(quality, 50);
+             }
+         }
+         return true;
+     }
+
+     if(a.equals("qauto")){
+         FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Reports whether no quality-related histogram output has been requested.
+  * Callers set ReadStats.COLLECT_QUALITY_STATS to the negation of this.
+  *
+  * Fix relative to the imported version: three of the five comparisons were
+  * "!=null", so the method returned false when nothing at all was configured
+  * (which switched quality-stat collection on for e.g. "qhist=null").
+  *
+  * @return true only if all five quality histogram file names in ReadStats are null.
+  */
+ private static boolean qhistsNull(){
+     return ReadStats.BQUAL_HIST_FILE==null && ReadStats.QUAL_HIST_FILE==null && ReadStats.AVG_QUAL_HIST_FILE==null && ReadStats.BQUAL_HIST_OVERALL_FILE==null
+             && ReadStats.QUAL_COUNT_HIST_FILE==null;
+ }
+
+ /**
+  * Handles histogram-output flags. For each histogram the file name is stored
+  * in ReadStats and the matching COLLECT_* switch is updated; a missing value,
+  * "null" or "none" (any case) clears the file name. Also handles the GC and
+  * identity bin-count flags and the GC plot flag.
+  *
+  * Fixes relative to the imported version:
+  * - "thist" reported the identity histogram (wrong switch and wrong file);
+  *   it now reports the time histogram.
+  * - "obqhist" printed QUAL_HIST_FILE; it now prints BQUAL_HIST_OVERALL_FILE,
+  *   and only when that file is set.
+  * - "obqhist" ignored QUAL_COUNT_HIST_FILE when recomputing
+  *   COLLECT_QUALITY_STATS, so "obqhist=null" could switch collection off
+  *   while a qchist file was still configured.
+  *
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value; may be null.
+  * @return true if the flag was recognized.
+  */
+ public static boolean parseHist(String arg, String a, String b){
+     // Normalized file name shared by all "...hist" branches; null means "no output".
+     final String file=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b;
+
+     if(a.equals("qualityhistogram") || a.equals("qualityhist") || a.equals("qhist")){
+         ReadStats.QUAL_HIST_FILE=file;
+         ReadStats.COLLECT_QUALITY_STATS=!qhistsNull();
+         if(ReadStats.COLLECT_QUALITY_STATS){System.err.println("Set quality histogram output to "+ReadStats.QUAL_HIST_FILE);}
+     }else if(a.equals("basequalityhistogram") || a.equals("basequalityhist") || a.equals("bqhist")){
+         ReadStats.BQUAL_HIST_FILE=file;
+         ReadStats.COLLECT_QUALITY_STATS=!qhistsNull();
+         if(ReadStats.BQUAL_HIST_FILE!=null){System.err.println("Set bquality histogram output to "+ReadStats.BQUAL_HIST_FILE);}
+     }else if(a.equals("qualitycounthistogram") || a.equals("qualitycounthist") || a.equals("qchist")){
+         ReadStats.QUAL_COUNT_HIST_FILE=file;
+         ReadStats.COLLECT_QUALITY_STATS=!qhistsNull();
+         if(ReadStats.QUAL_COUNT_HIST_FILE!=null){System.err.println("Set qcount histogram output to "+ReadStats.QUAL_COUNT_HIST_FILE);}
+     }else if(a.equals("averagequalityhistogram") || a.equals("aqhist")){
+         ReadStats.AVG_QUAL_HIST_FILE=file;
+         ReadStats.COLLECT_QUALITY_STATS=!qhistsNull();
+         if(ReadStats.COLLECT_QUALITY_STATS){System.err.println("Set average quality histogram output to "+ReadStats.AVG_QUAL_HIST_FILE);}
+     }else if(a.equals("overallbasequalityhistogram") || a.equals("overallbasequalityhist") || a.equals("obqhist")){
+         ReadStats.BQUAL_HIST_OVERALL_FILE=file;
+         // Spelled out rather than calling qhistsNull() so this branch is correct on its own.
+         ReadStats.COLLECT_QUALITY_STATS=(ReadStats.BQUAL_HIST_FILE!=null || ReadStats.QUAL_HIST_FILE!=null || ReadStats.AVG_QUAL_HIST_FILE!=null
+                 || ReadStats.BQUAL_HIST_OVERALL_FILE!=null || ReadStats.QUAL_COUNT_HIST_FILE!=null);
+         if(ReadStats.BQUAL_HIST_OVERALL_FILE!=null){System.err.println("Set overall base quality histogram output to "+ReadStats.BQUAL_HIST_OVERALL_FILE);}
+     }else if(a.equals("matchhistogram") || a.equals("matchhist") || a.equals("mhist")){
+         ReadStats.MATCH_HIST_FILE=file;
+         ReadStats.COLLECT_MATCH_STATS=(ReadStats.MATCH_HIST_FILE!=null);
+         if(ReadStats.COLLECT_MATCH_STATS){System.err.println("Set match histogram output to "+ReadStats.MATCH_HIST_FILE);}
+     }else if(a.equals("inserthistogram") || a.equals("inserthist") || a.equals("ihist")){
+         ReadStats.INSERT_HIST_FILE=file;
+         ReadStats.COLLECT_INSERT_STATS=(ReadStats.INSERT_HIST_FILE!=null);
+         if(ReadStats.COLLECT_INSERT_STATS){System.err.println("Set insert size histogram output to "+ReadStats.INSERT_HIST_FILE);}
+     }else if(a.equals("basehistogram") || a.equals("basehist") || a.equals("bhist")){
+         ReadStats.BASE_HIST_FILE=file;
+         ReadStats.COLLECT_BASE_STATS=(ReadStats.BASE_HIST_FILE!=null);
+         if(ReadStats.COLLECT_BASE_STATS){System.err.println("Set base content histogram output to "+ReadStats.BASE_HIST_FILE);}
+     }else if(a.equals("qualityaccuracyhistogram") || a.equals("qahist")){
+         ReadStats.QUAL_ACCURACY_FILE=file;
+         ReadStats.COLLECT_QUALITY_ACCURACY=(ReadStats.QUAL_ACCURACY_FILE!=null);
+         if(ReadStats.COLLECT_QUALITY_ACCURACY){System.err.println("Set quality accuracy histogram output to "+ReadStats.QUAL_ACCURACY_FILE);}
+     }else if(a.equals("indelhistogram") || a.equals("indelhist")){
+         ReadStats.INDEL_HIST_FILE=file;
+         ReadStats.COLLECT_INDEL_STATS=(ReadStats.INDEL_HIST_FILE!=null);
+         if(ReadStats.COLLECT_INDEL_STATS){System.err.println("Set indel histogram output to "+ReadStats.INDEL_HIST_FILE);}
+     }else if(a.equals("errorhistogram") || a.equals("ehist")){
+         ReadStats.ERROR_HIST_FILE=file;
+         ReadStats.COLLECT_ERROR_STATS=(ReadStats.ERROR_HIST_FILE!=null);
+         if(ReadStats.COLLECT_ERROR_STATS){System.err.println("Set error histogram output to "+ReadStats.ERROR_HIST_FILE);}
+     }else if(a.equals("lengthhistogram") || a.equals("lhist")){
+         ReadStats.LENGTH_HIST_FILE=file;
+         ReadStats.COLLECT_LENGTH_STATS=(ReadStats.LENGTH_HIST_FILE!=null);
+         if(ReadStats.COLLECT_LENGTH_STATS){System.err.println("Set length histogram output to "+ReadStats.LENGTH_HIST_FILE);}
+     }else if(a.equals("gchistogram") || a.equals("gchist")){
+         ReadStats.GC_HIST_FILE=file;
+         ReadStats.COLLECT_GC_STATS=(ReadStats.GC_HIST_FILE!=null);
+         if(ReadStats.COLLECT_GC_STATS){System.err.println("Set GC histogram output to "+ReadStats.GC_HIST_FILE);}
+     }else if(a.equals("gcbins") || a.equals("gchistbins")){
+         if("auto".equalsIgnoreCase(b)){
+             ReadStats.GC_BINS=750;
+             ReadStats.GC_BINS_AUTO=true;
+         }else{
+             ReadStats.GC_BINS=Integer.parseInt(b);
+             ReadStats.GC_BINS_AUTO=false;
+         }
+     }else if(a.equals("gcchart") || a.equals("gcplot")){
+         ReadStats.GC_PLOT_X=Tools.parseBoolean(b);
+     }else if(a.equals("timehistogram") || a.equals("thist")){
+         ReadStats.TIME_HIST_FILE=file;
+         ReadStats.COLLECT_TIME_STATS=(ReadStats.TIME_HIST_FILE!=null);
+         if(ReadStats.COLLECT_TIME_STATS){System.err.println("Set time histogram output to "+ReadStats.TIME_HIST_FILE);}
+     }else if(a.equals("identityhistogram") || a.equals("idhist")){
+         ReadStats.IDENTITY_HIST_FILE=file;
+         ReadStats.COLLECT_IDENTITY_STATS=(ReadStats.IDENTITY_HIST_FILE!=null);
+         if(ReadStats.COLLECT_IDENTITY_STATS){System.err.println("Set identity histogram output to "+ReadStats.IDENTITY_HIST_FILE);}
+     }else if(a.equals("idhistlen") || a.equals("idhistlength") || a.equals("idhistbins") || a.equals("idbins")){
+         if("auto".equalsIgnoreCase(b)){
+             ReadStats.ID_BINS=750;
+             ReadStats.ID_BINS_AUTO=true;
+         }else{
+             ReadStats.ID_BINS=Integer.parseInt(b);
+             ReadStats.ID_BINS_AUTO=false;
+         }
+     }else{
+         return false;
+     }
+     return true;
+ }
+
+ /**
+  * Handles compression flags and stores them as static ReadWrite settings.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public static boolean parseZip(String arg, String a, String b){
+     if(a.equals("ziplevel") || a.equals("zl")){
+         // Negative levels are ignored; levels above 9 are capped at 9.
+         final int level=Integer.parseInt(b);
+         if(level>=0){
+             ReadWrite.ZIPLEVEL=Tools.min(level, 9);
+         }
+         return true;
+     }
+     if(a.equals("usegzip") || a.equals("gzip")){
+         ReadWrite.USE_GZIP=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("usepigz") || a.equals("pigz")){
+         final boolean numeric=(b!=null && Character.isDigit(b.charAt(0)));
+         if(!numeric){
+             ReadWrite.USE_PIGZ=Tools.parseBoolean(b);
+         }else{
+             // A number is a thread count: below 1 disables pigz, otherwise it enables
+             // pigz, sets the thread limit and resets the thread divisor to 1.
+             final int zipThreads=Integer.parseInt(b);
+             if(zipThreads<1){
+                 ReadWrite.USE_PIGZ=false;
+             }else{
+                 ReadWrite.USE_PIGZ=true;
+                 ReadWrite.MAX_ZIP_THREADS=zipThreads;
+                 ReadWrite.ZIP_THREAD_DIVISOR=1;
+             }
+         }
+         return true;
+     }
+     if(a.equals("zipthreaddivisor") || a.equals("ztd")){
+         ReadWrite.ZIP_THREAD_DIVISOR=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("usegunzip") || a.equals("gunzip") || a.equals("ungzip")){
+         ReadWrite.USE_GUNZIP=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("useunpigz") || a.equals("unpigz")){
+         ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b);
+         return true;
+     }
+     return false;
+ }
+
+ /**
+ * Handles SAM-output flags. Recognized flags write static settings on SamLine,
+ * ReadStreamWriter, FastaReadInputStream or Shared; anything not matched here
+ * is offered to parseReadgroup before giving up.
+ * @param arg The raw argument (forwarded to parseReadgroup).
+ * @param a Flag name.
+ * @param b Flag value; may be null for xstag.
+ * @return true if the flag was recognized here or by parseReadgroup.
+ */
+ public static boolean parseSam(String arg, String a, String b){
+ if(a.equals("samversion") || a.equals("samv") || a.equals("sam")){
+ assert(b!=null) : "The sam flag requires a version number, e.g. 'sam=1.4'";
+ SamLine.VERSION=Float.parseFloat(b);
+ }else if(a.equals("notags")){
+ SamLine.NO_TAGS=Tools.parseBoolean(b);
+ }else if(a.equals("mdtag") || a.equals("md")){
+ SamLine.MAKE_MD_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("idtag")){
+ SamLine.MAKE_IDENTITY_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("xmtag") || a.equals("xm")){
+ SamLine.MAKE_XM_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("smtag")){
+ SamLine.MAKE_SM_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("amtag")){
+ SamLine.MAKE_AM_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("nmtag")){
+ SamLine.MAKE_NM_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("stoptag")){
+ SamLine.MAKE_STOP_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("lengthtag")){
+ SamLine.MAKE_LENGTH_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("boundstag")){
+ SamLine.MAKE_BOUNDS_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("scoretag")){
+ SamLine.MAKE_SCORE_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("sortscaffolds")){
+ SamLine.SORT_SCAFFOLDS=Tools.parseBoolean(b);
+ }else if(a.equals("customtag")){
+ SamLine.MAKE_CUSTOM_TAGS=Tools.parseBoolean(b);
+ }else if(a.equals("nhtag")){
+ SamLine.MAKE_NH_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("keepnames")){
+ SamLine.KEEP_NAMES=Tools.parseBoolean(b);
+ }else if(a.equals("saa") || a.equals("secondaryalignmentasterisks")){
+ SamLine.SECONDARY_ALIGNMENT_ASTERISKS=Tools.parseBoolean(b);
+ }else if(a.equals("inserttag")){
+ SamLine.MAKE_INSERT_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("correctnesstag")){
+ SamLine.MAKE_CORRECTNESS_TAG=Tools.parseBoolean(b);
+ }else if(a.equals("intronlen") || a.equals("intronlength")){
+ // setintron records that the limit was set explicitly.
+ SamLine.INTRON_LIMIT=Integer.parseInt(b);
+ SamLine.setintron=true;
+ }else if(a.equals("suppressheader") || a.equals("noheader")){
+ ReadStreamWriter.NO_HEADER=Tools.parseBoolean(b);
+ }else if(a.equals("noheadersequences") || a.equals("nhs") || a.equals("suppressheadersequences")){
+ ReadStreamWriter.NO_HEADER_SEQUENCES=Tools.parseBoolean(b);
+ }else if(a.equals("tophat")){
+ // Only a true value has any effect; it enables Tophat tags, MD tags and fake quality 40.
+ if(Tools.parseBoolean(b)){
+ SamLine.MAKE_TOPHAT_TAGS=true;
+ FastaReadInputStream.FAKE_QUALITY=true;
+ Shared.FAKE_QUAL=40;
+ SamLine.MAKE_MD_TAG=true;
+ }
+ }else if(a.equals("xstag") || a.equals("xs")){
+ // Enabled by default; a strandedness keyword (optionally prefixed "fr-") selects
+ // XS_SECONDSTRAND, and any other value is parsed as a boolean for the tag itself.
+ SamLine.MAKE_XS_TAG=true;
+ if(b!=null){
+ b=b.toLowerCase();
+ if(b.startsWith("fr-")){b=b.substring(3);}
+ if(b.equals("ss") || b.equals("secondstrand")){
+ SamLine.XS_SECONDSTRAND=true;
+ }else if(b.equals("fs") || b.equals("firststrand")){
+ SamLine.XS_SECONDSTRAND=false;
+ }else if(b.equals("us") || b.equals("unstranded")){
+ SamLine.XS_SECONDSTRAND=false;
+ }else{
+ SamLine.MAKE_XS_TAG=Tools.parseBoolean(b);
+ }
+ }
+ SamLine.setxs=true;
+ }else if(parseReadgroup(arg, a, b)){
+ //do nothing
+ }else{
+ return false;
+ }
+ return true;
+ }
+
+ /**
+  * Handles FASTA-related flags and stores them as static settings.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public static boolean parseFasta(String arg, String a, String b){
+     if(a.equals("fastareadlen") || a.equals("fastareadlength")){
+         // Read splitting is enabled exactly when the target length is positive.
+         FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b);
+         FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0);
+         return true;
+     }
+     if(a.equals("fastaminread") || a.equals("fastaminlen") || a.equals("fastaminlength")){
+         FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("forcesectionname")){
+         FastaReadInputStream.FORCE_SECTION_NAME=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("fastawrap")){
+         Shared.FASTA_WRAP=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("fastadump")){
+         AbstractKmerTable.FASTA_DUMP=Tools.parseBoolean(b);
+         return true;
+     }
+     return false;
+ }
+
+ /**
+  * Handles quality-recalibration flags and stores them as static
+  * CalcTrueQuality / Read settings. A flag name may end in "_p1" or "_p2" to
+  * address pass index 0 or 1 of the per-pass arrays; without a suffix,
+  * index 0 is used.
+  * @param arg The raw argument (unused here).
+  * @param a Flag name, optionally with a "_p1"/"_p2" suffix.
+  * @param b Flag value.
+  * @return true if the flag was recognized.
+  */
+ public static boolean parseQualityAdjust(String arg, String a, String b){
+     int pass=0;
+     if(a.endsWith("_p1") || a.endsWith("_p2")){
+         final int nameLength=a.length();
+         pass=Integer.parseInt(a.substring(nameLength-1))-1;
+         a=a.substring(0, nameLength-3);
+     }
+
+     if(a.equals("loadq102")){
+         CalcTrueQuality.use_q102[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadqbp")){
+         CalcTrueQuality.use_qbp[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadq10")){
+         CalcTrueQuality.use_q10[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadq12")){
+         CalcTrueQuality.use_q12[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadqb12")){
+         CalcTrueQuality.use_qb12[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadqb012")){
+         CalcTrueQuality.use_qb012[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadqb123")){
+         CalcTrueQuality.use_qb123[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadqb234")){
+         CalcTrueQuality.use_qb234[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadq12b12")){
+         CalcTrueQuality.use_q12b12[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadqp")){
+         CalcTrueQuality.use_qp[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("loadq")){
+         CalcTrueQuality.use_q[pass]=Tools.parseBoolean(b);
+         return true;
+     }
+     if(a.equals("observationcutoff")){
+         final long cutoff=Long.parseLong(b);
+         CalcTrueQuality.OBSERVATION_CUTOFF[pass]=cutoff;
+         return true;
+     }
+     if(a.equals("recalpasses")){
+         CalcTrueQuality.passes=Integer.parseInt(b);
+         return true;
+     }
+     if(a.equals("recalqmax")){
+         // Can only raise Read.MAX_CALLED_QUALITY, never lower it.
+         final int qmax=Tools.mid(1, Integer.parseInt(b), 93);
+         CalcTrueQuality.setQmax(qmax);
+         Read.MAX_CALLED_QUALITY=(byte)Tools.max(Read.MAX_CALLED_QUALITY, qmax);
+         return true;
+     }
+     if(a.equals("recalqmin")){
+         // Can only lower Read.MIN_CALLED_QUALITY, never raise it.
+         final int qmin=Tools.mid(0, Integer.parseInt(b), 93);
+         Read.MIN_CALLED_QUALITY=(byte)Tools.min(Read.MIN_CALLED_QUALITY, qmin);
+         return true;
+     }
+     if(a.equals("recalwithposition") || a.equals("recalwithpos") || a.equals("recalusepos")){
+         // Only "false" has an effect: it clears both position-based matrices for all passes.
+         final boolean usePosition=Tools.parseBoolean(b);
+         if(!usePosition){
+             Arrays.fill(CalcTrueQuality.use_qp, false);
+             Arrays.fill(CalcTrueQuality.use_qbp, false);
+         }
+         return true;
+     }
+     if(a.equals("qmatrixmode")){
+         // Unrecognized modes are accepted and leave the settings unchanged.
+         if("weighted".equalsIgnoreCase(b) || "weightedaverage".equalsIgnoreCase(b)){
+             CalcTrueQuality.USE_WEIGHTED_AVERAGE=true;
+         }else if("average".equalsIgnoreCase(b) || "avg".equalsIgnoreCase(b)){
+             CalcTrueQuality.USE_WEIGHTED_AVERAGE=false;
+             CalcTrueQuality.USE_AVERAGE=true;
+         }else if("max".equalsIgnoreCase(b)){
+             CalcTrueQuality.USE_AVERAGE=CalcTrueQuality.USE_WEIGHTED_AVERAGE=false;
+         }
+         return true;
+     }
+     return false;
+ }
+
+	/**
+	 * Tests whether an argument looks like a JVM option rather than a program parameter.
+	 * Recognized forms are "-ea", "-da", anything starting with "-Xmx", "-Xms" or "-Xmn",
+	 * and the same three memory prefixes without the hyphen when followed directly by a digit.
+	 * @param arg Argument to test; may be null
+	 * @return true if the argument is one of the recognized JVM flags
+	 */
+	public static boolean isJavaFlag(String arg){
+		if(arg==null){return false;}
+		if(arg.equals("-ea") || arg.equals("-da")){return true;}
+
+		final boolean hyphenated=arg.startsWith("-");
+		final String body=(hyphenated ? arg.substring(1) : arg);
+		final boolean memoryFlag=body.startsWith("Xmx") || body.startsWith("Xms") || body.startsWith("Xmn");
+		if(!memoryFlag){return false;}
+
+		//Without the hyphen, require a digit after the prefix so ordinary words are not mistaken for flags.
+		return hyphenated || (body.length()>3 && Character.isDigit(body.charAt(3)));
+	}
+
+
+	/**
+	 * Return true if the user seems confused: there are no arguments at all, or the
+	 * last argument is a help or version request.
+	 * Only the LAST argument is examined.  The bare word "help" counts as a help request
+	 * only when no file of that name exists.
+	 * @param args Command-line arguments; may be null or empty
+	 * @param autoExit If true and help/version was requested, call printHelp(), which exits the JVM
+	 * @return true if help should be shown instead of running normally
+	 */
+	public static boolean parseHelp(String[] args, boolean autoExit){
+		if(args==null || args.length==0){return true;}
+		final String last=args[args.length-1];
+		//A lone null argument is treated like an empty command line; a trailing null among
+		//several arguments is simply not a help request (this used to throw a NullPointerException).
+		if(last==null){return args.length==1;}
+
+		//Locale.ROOT keeps the comparison stable in locales with special case rules (e.g. Turkish dotless i).
+		final String s=last.toLowerCase(java.util.Locale.ROOT);
+
+		final boolean version=s.equals("-version") || s.equals("--version");
+		final boolean help=s.equals("-h") || s.equals("-help") || s.equals("--help")
+				|| s.equals("?") || s.equals("-?") || (s.equals("help") && !new File(s).exists());
+
+		if(version || help){
+			if(autoExit){printHelp();}
+			return true;
+		}
+		return false;
+	}
+
+	/**
+	 * Prints the BBMap version and a pointer to the documentation to stderr,
+	 * then terminates the JVM with exit code 0.  This method never returns.
+	 */
+	public static void printHelp(){
+		final String versionLine="BBMap version "+Shared.BBMAP_VERSION_STRING;
+		final String helpLine="For help, please run the shellscript with no parameters, or look in /docs/.";
+		System.err.println(versionLine);
+		System.err.println(helpLine);
+		System.exit(0);
+	}
+
+	/**
+	 * Set SamLine Readgroup Strings.
+	 * Each recognized key has a long form ("readgroupXX") and a short form ("rgXX"); the value
+	 * is stored unchanged in the matching SamLine.READGROUP_* static.  Setting the ID to a
+	 * non-null value additionally builds the "RG:Z:" tag.
+	 * @param arg The full original argument (not read by this method)
+	 * @param a The key portion of the argument
+	 * @param b The value portion of the argument; may be null
+	 * @return true if the key was a readgroup flag, false otherwise
+	 */
+	public static boolean parseReadgroup(String arg, String a, String b){
+		if(a.equals("readgroup") || a.equals("readgroupid") || a.equals("rgid")){
+			SamLine.READGROUP_ID=b;
+			if(b!=null){SamLine.READGROUP_TAG="RG:Z:"+b;}
+			return true;
+		}
+		if(a.equals("readgroupcn") || a.equals("rgcn")){
+			SamLine.READGROUP_CN=b;
+			return true;
+		}
+		if(a.equals("readgroupds") || a.equals("rgds")){
+			SamLine.READGROUP_DS=b;
+			return true;
+		}
+		if(a.equals("readgroupdt") || a.equals("rgdt")){
+			SamLine.READGROUP_DT=b;
+			return true;
+		}
+		if(a.equals("readgroupfo") || a.equals("rgfo")){
+			SamLine.READGROUP_FO=b;
+			return true;
+		}
+		if(a.equals("readgroupks") || a.equals("rgks")){
+			SamLine.READGROUP_KS=b;
+			return true;
+		}
+		if(a.equals("readgrouplb") || a.equals("rglb")){
+			SamLine.READGROUP_LB=b;
+			return true;
+		}
+		if(a.equals("readgrouppg") || a.equals("rgpg")){
+			SamLine.READGROUP_PG=b;
+			return true;
+		}
+		if(a.equals("readgrouppi") || a.equals("rgpi")){
+			SamLine.READGROUP_PI=b;
+			return true;
+		}
+		if(a.equals("readgrouppl") || a.equals("rgpl")){
+			SamLine.READGROUP_PL=b;
+			return true;
+		}
+		if(a.equals("readgrouppu") || a.equals("rgpu")){
+			SamLine.READGROUP_PU=b;
+			return true;
+		}
+		if(a.equals("readgroupsm") || a.equals("rgsm")){
+			SamLine.READGROUP_SM=b;
+			return true;
+		}
+		return false;
+	}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public boolean loglog=false;
+ public int loglogbuckets=1999;
+ public int loglogbits=8;
+ public int loglogk=31;
+ public long loglogseed=-1;
+
+ public boolean recalibrateQuality=false;
+
+ public int forceTrimModulo=-1;
+ public int forceTrimLeft=-1;
+ public int forceTrimRight=-1;
+ public int forceTrimRight2=-1;
+ public int build=1;
+
+ public long maxReads=-1;
+ public float samplerate=1f;
+ public long sampleseed=-1;
+
+ public boolean qtrimLeft=false;
+ public boolean qtrimRight=false;
+
+ public boolean qtrim1=false;
+ public boolean qtrim2=false;
+
+ public byte trimq=6;
+ public byte[] trimq2=null;
+ public byte minAvgQuality=0;
+ public int minAvgQualityBases=0;
+ public int maxNs=-1;
+ public int minConsecutiveBases=0;
+ public int minReadLength=0;
+ public int maxReadLength=-1;
+ public int minTrimLength=-1;
+ public float minLenFraction=0;
+ public float minGC=0;
+ public float maxGC=1;
+// public boolean filterGC=false;
+ public boolean untrim=false;
+ public boolean tossJunk=false;
+
+ public float idFilter=-1;
+ public int subfilter=-1;
+ public int delfilter=-1;
+ public int insfilter=-1;
+ public int indelfilter=-1;
+ public int dellenfilter=-1;
+ public int inslenfilter=-1;
+ public int editfilter=-1;
+
+ public int breakLength=0;
+ /** Toss pair only if both reads are shorter than limit */
+ public boolean requireBothBad=false;
+ public boolean trimBadSequence=false;
+ public boolean chastityFilter=false;
+ public boolean removeBadBarcodes=false;
+ public boolean failBadBarcodes=false;
+ public boolean failIfNoBarcode=false;
+
+ public HashSet<String> barcodes=null;
+
+ public boolean overwrite=false;
+ public boolean append=false;
+ public boolean testsize=false;
+
+ public boolean setInterleaved=false;
+
+ public String in1=null;
+ public String in2=null;
+
+ public String qfin1=null;
+ public String qfin2=null;
+
+ public String out1=null;
+ public String out2=null;
+ public String outsingle=null;
+ public boolean setOut=false;
+
+ public String qfout1=null;
+ public String qfout2=null;
+
+ public String extin=null;
+ public String extout=null;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private static byte qin=-1;
+ private static byte qout=-1;
+ private static boolean parsedQuality=false;
+
+ /**
+ * Copies the quality offsets held in the static fields qin and qout into the FASTQ statics.
+ * A value of -1 (the initial value of both fields) is treated as "not set"; if neither is set, nothing changes.
+ * Asserts that parsedQuality is true.
+ */
+ public static void processQuality(){
+ assert(parsedQuality);
+ if(qin!=-1 && qout!=-1){
+ //NOTE(review): unlike the qout-only branch below, this branch leaves FASTQ.DETECT_QUALITY_OUT untouched - confirm this is intended.
+ FASTQ.ASCII_OFFSET=qin;
+ FASTQ.ASCII_OFFSET_OUT=qout;
+ FASTQ.DETECT_QUALITY=false;
+ }else if(qin!=-1){
+ //Only the input offset was given.
+ FASTQ.ASCII_OFFSET=qin;
+ FASTQ.DETECT_QUALITY=false;
+ }else if(qout!=-1){
+ //Only the output offset was given.
+ FASTQ.ASCII_OFFSET_OUT=qout;
+ FASTQ.DETECT_QUALITY_OUT=false;
+ }
+ }
+
+}
diff --git a/current/dna/Range.java b/current/dna/Range.java
new file mode 100755
index 0000000..8814dac
--- /dev/null
+++ b/current/dna/Range.java
@@ -0,0 +1,149 @@
+package dna;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+
+/**
+ * An immutable closed integer interval [a, b], ordered by start and then by end.
+ * Two mutable Object slots (obj1, obj2) are provided for attaching arbitrary data;
+ * they take no part in equality, hashing or ordering.
+ */
+public class Range implements Comparable<Range>{
+	
+	/** A numeric range, assuming 0-based, base-centered numbering. */
+	public Range(int aa, int bb){
+		
+		assert(aa<=bb) : aa+">"+bb;
+		a=aa;
+		b=bb;
+		length=b-a+1;
+	}
+	
+	/**
+	 * Parses a range such as "3-5", "[3-5]", "(3 - 5)" or a single position such as "(7)".
+	 * Brackets, parentheses and commas are stripped, and whitespace around each number is
+	 * ignored so that the output of toString() can be parsed back.
+	 * Negative numbers are not supported because '-' is the delimiter.
+	 * @param s Text to parse
+	 * @return The parsed range
+	 * @throws NumberFormatException if a bound is not a valid integer
+	 */
+	public static Range toRange(String s){
+		String[] s2=s.replace("[","").replace("]","").replace("(","").replace(")","").replace(",","").split("-");
+		
+		int a, b;
+		if(s2.length==1){
+			a=b=Integer.parseInt(s2[0].trim());
+		}else{
+			a=Integer.parseInt(s2[0].trim());
+			b=Integer.parseInt(s2[1].trim());
+		}
+		return new Range(a, b);
+	}
+	
+	/** Orders by start position, breaking ties by end position. */
+	@Override
+	public int compareTo(Range other) {
+		if(a<other.a){return -1;}
+		if(a>other.a){return 1;}
+		
+		if(b<other.b){return -1;}
+		if(b>other.b){return 1;}
+		
+		return 0;
+	}
+	
+	/** @return true if p lies within [a, b] */
+	public boolean includes(int p){
+		return p>=a && p<=b;
+	}
+	
+	/** @return true if [p1, p2] shares at least one position with this range */
+	public boolean intersects(int p1, int p2){
+		return overlap(a, b, p1, p2);
+	}
+	
+	/** @return true if [p1, p2] lies entirely within this range */
+	public boolean includes(int p1, int p2){
+		assert(p1<=p2);
+		return p1>=a && p2<=b;
+	}
+	
+	public boolean intersects(Range other){
+		return intersects(other.a, other.b);
+	}
+	
+	/** @return true if the ranges overlap or are directly adjacent (no gap between them) */
+	public boolean touches(Range other){
+		if(intersects(other.a, other.b)){return true;}
+		return b==other.a-1 || a==other.b+1;
+	}
+	
+	public boolean includes(Range other){
+		return includes(other.a, other.b);
+	}
+	
+	/**
+	 * Equality on the two bounds.  Returns false, rather than throwing, for null
+	 * and for objects that are not Ranges, as the Object.equals contract requires.
+	 */
+	@Override
+	public boolean equals(Object other){
+		return (other instanceof Range) && equals((Range)other);
+	}
+	
+	/**
+	 * Returns the smallest range covering both this and other.
+	 * The two ranges must touch; this is enforced only by assertion.
+	 */
+	public Range merge(Range other){
+		assert(touches(other));
+		Range r=new Range(min(a, other.a), max(b, other.b));
+		
+		assert(r.includes(this));
+		assert(r.includes(other));
+		assert(r.length<=length+other.length);
+		return r;
+	}
+	
+	/** @return true if other is non-null and has the same bounds */
+	public boolean equals(Range other){
+		return other!=null && a==other.a && b==other.b;
+	}
+	
+	/** Same value as Long.valueOf(Long.rotateLeft(a, 16)^b).hashCode(), computed without boxing. */
+	@Override
+	public int hashCode(){
+		final long x=Long.rotateLeft(a, 16)^b;
+		return (int)(x^(x>>>32));
+	}
+	
+	/** Formats as "(a)" for a single position or "(a - b)" otherwise. */
+	@Override
+	public String toString(){
+		return "("+a+(a==b ? "" : (" - "+b))+")";
+	}
+	
+	
+	/** @return true if the closed intervals [a1, b1] and [a2, b2] share at least one position */
+	public static boolean overlap(int a1, int b1, int a2, int b2){
+		assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+		return a2<=b1 && b2>=a1;
+	}
+	
+	
+	/**
+	 * Collapses a collection of positions into the minimal list of ranges covering them;
+	 * consecutive and duplicate positions fall into the same range.
+	 * The input arrays are never modified.  A single array that is already sorted is used
+	 * directly; anything else is copied and sorted first.
+	 * @param arrays Zero or more arrays of positions
+	 * @return Ranges in ascending order; an empty array if there are no positions
+	 */
+	public static Range[] toRanges(int[] ...arrays){
+		int len=0;
+		for(int[] array : arrays){
+			len+=array.length;
+		}
+		//No positions at all: this used to fail with ArrayIndexOutOfBoundsException.
+		if(len==0){return new Range[0];}
+		
+		final int[] combined;
+		if(arrays.length==1 && isSorted(arrays[0])){
+			combined=arrays[0];
+		}else{
+			//Copy before sorting so the caller's arrays stay untouched.
+			combined=new int[len];
+			int index=0;
+			for(int[] array : arrays){
+				System.arraycopy(array, 0, combined, index, array.length);
+				index+=array.length;
+			}
+			Arrays.sort(combined);
+		}
+		
+		ArrayList<Range> list=new ArrayList<Range>(16);
+		int start=combined[0], last=combined[0];
+		
+		for(int i=1; i<len; i++){
+			int x=combined[i];
+			//Compare as long so that last==Integer.MAX_VALUE cannot overflow.
+			if(x>last+1L){
+				list.add(new Range(start, last));
+				start=x;
+			}
+			last=x;
+		}
+		list.add(new Range(start, last));
+		return list.toArray(new Range[list.size()]);
+	}
+	
+	/** @return true if the array is in non-descending order */
+	private static boolean isSorted(int[] array){
+		for(int i=1; i<array.length; i++){
+			if(array[i]<array[i-1]){return false;}
+		}
+		return true;
+	}
+	
+	
+	private static final int min(int x, int y){return x<y ? x : y;}
+	private static final int max(int x, int y){return x>y ? x : y;}
+	
+	/** Inclusive start */
+	public final int a;
+	/** Inclusive end */
+	public final int b;
+	/** Number of positions covered: b-a+1 */
+	public final int length;
+	
+	/** For attaching things */
+	public Object obj1=null;
+	/** For attaching more things */
+	public Object obj2=null;
+}
diff --git a/current/dna/ScafLoc.java b/current/dna/ScafLoc.java
new file mode 100755
index 0000000..18daeb0
--- /dev/null
+++ b/current/dna/ScafLoc.java
@@ -0,0 +1,20 @@
+package dna;
+
+/**
+ * Plain mutable holder for a name together with a chromosome number and a location.
+ * @author Brian Bushnell
+ * @date Sep 24, 2013
+ *
+ */
+public class ScafLoc {
+	
+	public String name;
+	public int chrom;
+	public int loc;
+	
+	/** Stores the three values as given; nothing is validated. */
+	public ScafLoc(String name_, int chrom_, int loc_){
+		this.name=name_;
+		this.chrom=chrom_;
+		this.loc=loc_;
+	}
+	
+}
diff --git a/current/dna/Scaffold.java b/current/dna/Scaffold.java
new file mode 100755
index 0000000..b27913f
--- /dev/null
+++ b/current/dna/Scaffold.java
@@ -0,0 +1,84 @@
+package dna;
+
+/**
+ * A named reference sequence with its length, optional assembly label and a set of
+ * public counters.  Can be built from, and printed as, a SAM "@SQ" header line.
+ * Ordering and hashing are by name.
+ * @author Brian Bushnell
+ * @date Jan 4, 2013
+ *
+ */
+public class Scaffold implements Comparable<Scaffold> {
+	
+	public String name;
+	public String assembly;
+	public int length=-1;
+	public long basehits=0;
+	public long readhits=0;
+	/** For calculating FPKM */
+	public long fraghits=0;
+	public long readhitsMinus=0;
+	
+	/** {A,C,G,T,N} */
+	public long[] basecount;
+	public float gc;
+	
+	/** For attaching things */
+	public Object obj1;
+	
+	/** For attaching more things */
+	public Object obj2;
+	
+	public Scaffold(String name_, String assembly_, int length_){
+		this.name=name_;
+		this.assembly=assembly_;
+		this.length=length_;
+	}
+	
+	public Scaffold(String name_, int length_) {
+		this.name=name_;
+		this.length=length_;
+	}
+	
+	/** Assumes SAM format.
+	 * e.g.<br> @SQ SN:scaffold_0 LN:1785514 AS:build 9 */
+	public Scaffold(byte[] s){
+		this(new String(s).split("\t"));
+	}
+	
+	/** Assumes SAM format.
+	 * e.g.<br> @SQ SN:scaffold_0 LN:1785514 AS:build 9 */
+	public Scaffold(String s){
+		this(s.split("\t"));
+	}
+	
+	/**
+	 * Assumes SAM format: the fields of one tab-split "@SQ" line.
+	 * SN: sets the name, LN: the length and AS: the assembly; other fields are ignored.
+	 */
+	public Scaffold(String[] split) {
+		assert(split.length>2 && split[0].equals("@SQ"));
+		for(final String field : split){
+			if(field.equals("@SQ")){continue;} //Record type marker; nothing to store
+			
+			if(field.startsWith("SN:")){
+				assert(name==null);
+				name=new String(field.substring(3)); //Data.forceIntern(s.substring(3));
+			}else if(field.startsWith("LN:")){
+				length=Integer.parseInt(field.substring(3));
+			}else if(field.startsWith("AS:")){
+				assembly=Data.forceIntern(field.substring(3));
+			}
+		}
+		assert(length>-1);
+		assert(name!=null);
+	}
+	
+	@Override
+	public int hashCode(){
+		return name.hashCode();
+	}
+	
+	@Override
+	public int compareTo(Scaffold other){
+		return name.compareTo(other.name);
+	}
+	
+	/** Formats as a SAM "@SQ" header line; the AS field is omitted when assembly is null. */
+	@Override
+	public String toString(){
+		final String suffix=(assembly==null ? "" : "\tAS:"+assembly);
+		return "@SQ\tSN:"+name+"\tLN:"+length+suffix;
+	}
+	
+}
diff --git a/current/dna/Timer.java b/current/dna/Timer.java
new file mode 100755
index 0000000..bdb7dc4
--- /dev/null
+++ b/current/dna/Timer.java
@@ -0,0 +1,28 @@
+package dna;
+
+/**
+ * Minimal stopwatch built on System.nanoTime().
+ * Starts running on construction; stop() records the elapsed time since the last start().
+ */
+public class Timer {
+	
+	/** System.nanoTime() reading taken by the last start() */
+	public long time1;
+	/** System.nanoTime() reading taken by the last start() or stop() */
+	public long time2;
+	/** in nanos */
+	public long elapsed;
+	
+	public Timer(){start();}
+	
+	/**
+	 * Resets the timer: both timestamps become "now" and elapsed becomes 0.
+	 * @return The new start timestamp
+	 */
+	public long start(){
+		final long now=System.nanoTime();
+		time1=now;
+		time2=now;
+		elapsed=0;
+		return now;
+	}
+	
+	/**
+	 * Records "now" as the stop timestamp and updates elapsed.  The start time is kept,
+	 * so calling stop() again measures from the same start.
+	 * @return The stop timestamp
+	 */
+	public long stop(){
+		final long now=System.nanoTime();
+		time2=now;
+		elapsed=now-time1;
+		return now;
+	}
+	
+	/** Elapsed time in seconds with three decimals, e.g. "1.500 seconds." */
+	@Override
+	public String toString(){
+		final double seconds=elapsed/1000000000d;
+		return String.format("%.3f seconds.", seconds);
+	}
+	
+}
diff --git a/current/driver/A_Sample_Textfile.java b/current/driver/A_Sample_Textfile.java
new file mode 100755
index 0000000..b1fd0e3
--- /dev/null
+++ b/current/driver/A_Sample_Textfile.java
@@ -0,0 +1,186 @@
+package driver;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Arrays;
+
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+import fileIO.TextFile;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * Template for a program that reads a text file line by line, passes every line through
+ * processLine(), writes the non-null results to an output file and reports throughput.
+ * As shipped, processLine() returns each line unchanged.
+ * @author Brian Bushnell
+ * @date Oct 17, 2014
+ *
+ */
+public class A_Sample_Textfile {
+	
+	public static void main(String[] args){
+		Timer t=new Timer();
+		A_Sample_Textfile mb=new A_Sample_Textfile(args);
+		mb.process(t);
+	}
+	
+	/**
+	 * Parses the command line and prepares the input and output file formats.
+	 * @param args Command-line arguments of the form key=value; the first argument may
+	 * instead be a bare input filename
+	 * @throws RuntimeException if no input was given or the output can't be written
+	 */
+	public A_Sample_Textfile(String[] args){
+		
+		args=Parser.parseConfig(args);
+		if(Parser.parseHelp(args, true)){
+			printOptions();
+			System.exit(0);
+		}
+		
+		for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+		outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+		
+		
+		
+		Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+		Shared.capBuffers(4);
+		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+		
+		
+		Parser parser=new Parser();
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			if(b==null || b.equalsIgnoreCase("null")){b=null;}
+			while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+			
+			if(parser.parse(arg, a, b)){
+				//do nothing
+			}else if(a.equals("verbose")){
+				verbose=Tools.parseBoolean(b);
+				ReadWrite.verbose=verbose;
+			}else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+				//A bare first argument naming stdin or an existing file is taken as the input.
+				parser.in1=arg;
+			}else{
+				outstream.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+				//				throw new RuntimeException("Unknown parameter "+args[i]);
+			}
+		}
+		
+		{//Process parser fields
+			Parser.processQuality();
+			
+			maxReads=parser.maxReads;
+			
+			overwrite=parser.overwrite;
+			append=parser.append;
+			
+			in1=parser.in1;
+			
+			out1=parser.out1;
+		}
+		
+		if(in1==null){
+			printOptions();
+			throw new RuntimeException("Error - at least one input file is required.");
+		}
+		
+		if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+		
+		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+			outstream.println((out1==null)+", "+out1);
+			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+		}
+		
+		ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, false);
+		
+		ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, true);
+	}
+	
+	/**
+	 * Streams the input through processLine() to the output, then prints timing statistics.
+	 * @param t Timer started by the caller; stopped here once all I/O has finished
+	 * @throws RuntimeException if the reader or writer reported an error
+	 */
+	void process(Timer t){
+		
+		final TextFile tf;
+		{
+			tf=new TextFile(ffin1);
+			if(verbose){outstream.println("Started tf");}
+		}
+		
+		//NOTE(review): ffout1 is presumably null when no output was requested - confirm that
+		//TextStreamWriter accepts a null FileFormat, since tsw is constructed unconditionally.
+		final TextStreamWriter tsw;
+		{
+			tsw=new TextStreamWriter(ffout1);
+			tsw.start();
+			if(verbose){outstream.println("Started tsw");}
+		}
+		
+		long linesProcessed=0;
+		long charsProcessed=0;
+		
+		{
+			String line;
+			while((line=tf.nextLine())!=null){
+				linesProcessed++;
+				charsProcessed+=line.length();
+				String result=processLine(line);
+				if(tsw!=null && result!=null){tsw.println(result);}
+				//A negative maxReads (the default is -1) means "no limit".  Without the sign
+				//check the loop stopped after the very first line whenever no limit was given.
+				if(maxReads>=0 && linesProcessed>=maxReads){break;}
+			}
+		}
+		
+		errorState|=tsw.poisonAndWait();
+		errorState|=tf.close();
+		
+		t.stop();
+		
+		//Rates per nanosecond; scaled below to thousands of lines and millions of chars per second.
+		double rpnano=linesProcessed/(double)(t.elapsed);
+		double bpnano=charsProcessed/(double)(t.elapsed);
+		
+		String rpstring=(linesProcessed<100000 ? ""+linesProcessed : linesProcessed<100000000 ? (linesProcessed/1000)+"k" : (linesProcessed/1000000)+"m");
+		String bpstring=(charsProcessed<100000 ? ""+charsProcessed : charsProcessed<100000000 ? (charsProcessed/1000)+"k" : (charsProcessed/1000000)+"m");
+		
+		//Left-pad to 8 characters so the columns line up.
+		while(rpstring.length()<8){rpstring=" "+rpstring;}
+		while(bpstring.length()<8){bpstring=" "+bpstring;}
+		
+		outstream.println("Time:                         \t"+t);
+		outstream.println("Lines Processed:    "+rpstring+" \t"+String.format("%.2fk lines/sec", rpnano*1000000));
+		outstream.println("Chars Processed:    "+bpstring+" \t"+String.format("%.2fm chars/sec", bpnano*1000));
+		
+		if(errorState){
+			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+		}
+	}
+	
+	
+	/**
+	 * Per-line hook.  Returns the text to write for this line, or null to write nothing.
+	 * This template returns the line unchanged.
+	 */
+	private String processLine(String line){
+		return line;
+	}
+	
+	
+	/*--------------------------------------------------------------*/
+	
+	/** Placeholder for the usage message; currently just fails an assertion. */
+	private void printOptions(){assert(false) : "printOptions: TODO";}
+	
+	/*--------------------------------------------------------------*/
+	
+	private String in1=null;
+	private String out1=null;
+	
+	/*--------------------------------------------------------------*/
+	
+	/** Maximum number of lines to process; negative means unlimited */
+	private long maxReads=-1;
+	
+	/*--------------------------------------------------------------*/
+	
+	private final FileFormat ffin1;
+	private final FileFormat ffout1;
+	
+	
+	/*--------------------------------------------------------------*/
+	
+	private PrintStream outstream=System.err;
+	public static boolean verbose=false;
+	public boolean errorState=false;
+	private boolean overwrite=false;
+	private boolean append=false;
+	
+}
diff --git a/current/driver/ClearRam.java b/current/driver/ClearRam.java
new file mode 100755
index 0000000..4e18a81
--- /dev/null
+++ b/current/driver/ClearRam.java
@@ -0,0 +1,64 @@
+package driver;
+
+import java.util.ArrayList;
+
+import fileIO.ReadWrite;
+
+public class ClearRam {
+
+ public static void main(String[] args){
+
+ for(int i=0; i<2; i++){
+
+ try {
+ System.gc();
+ attempt();
+ } catch(final java.lang.OutOfMemoryError e) {
+// e.printStackTrace();
+ System.err.println("Out of memory at "+((current*8)/(1<<20))+" MB");
+ }
+ }
+ }
+
+ public static void attempt(){
+ ArrayList<long[]> list=new ArrayList<long[]>(8000);
+ current=0;
+
+ while(true){
+ long[] array=null;
+
+ array=new long[1<<20];
+ list.add(array);
+
+// for(int i=0; i<array.length; i++){
+// array[i]=current;
+// current++;
+// }
+ current+=array.length;
+ }
+ }
+
+ public static void writeJunk(int megs){
+ try {
+ long[] old=(long[]) ReadWrite.readObject("JUNK"+megs+".long", false);
+ for(int i=1; i<old.length; i++){
+ assert(old[i]==old[i-1]+1);
+ }
+ } catch (Exception e) {
+
+ }
+
+
+
+ long[] array=new long[megs*(1<<17)];
+ long current=System.nanoTime();
+ for(int i=0; i<array.length; i++){
+ array[i]=current+i;
+ }
+ ReadWrite.write(array, "JUNK"+megs+".long", false);
+ System.err.println("Wrote "+((8*array.length)/(1024000))+" MB junk");
+ }
+
+ private static long current=0;
+
+}
diff --git a/current/driver/CollateSpikeIn.java b/current/driver/CollateSpikeIn.java
new file mode 100755
index 0000000..819f1a7
--- /dev/null
+++ b/current/driver/CollateSpikeIn.java
@@ -0,0 +1,46 @@
+package driver;
+
+import fileIO.TextFile;
+
+/**
+ * Condenses a log of BBMap runs into a tab-delimited table with one row per run:
+ * job ID, percent mapped and percent match rate.
+ * @author Brian Bushnell
+ * @date Oct 10, 2013
+ *
+ */
+public class CollateSpikeIn {
+	
+	/**
+	 * @param args args[0] is the path of the log file to summarize; the table goes to stdout
+	 */
+	public static void main(String[] args){
+//		Executing align2.BBMapPacBio [minratio=0.40, fastareadlen=500, out=null, in=/projectb/shared/pacbio/jobs/026/026437/data/filtered_subreads.fasta]
+		
+		System.out.println("jobID\t%Control (BBMap)\t%Accuracy (BBMap)");
+		
+		TextFile tf=new TextFile(args[0], false, false);
+		String file=null, mapped=null, acc=null;
+		String line=tf.nextLine();
+		while(line!=null){
+			if(line.startsWith("mapped:")){
+				String[] split=line.split("\\p{javaWhitespace}+");
+				mapped=split[1].replace("%", "");
+			}else if(line.startsWith("Match Rate:")){
+				//The match rate is the last value of a run's record: print the row and reset.
+				String[] split=line.split("\\p{javaWhitespace}+");
+				acc=split[2].replace("%", "");
+				System.out.println(file+"\t"+mapped+"\t"+acc);
+				file=acc=mapped=null;
+			}else if(line.startsWith("Executing align2.BBMap")){
+				//Start of a new run: pull the job ID out of the in= path.
+				String[] split=line.split("\\p{javaWhitespace}+");
+				for(String s : split){
+					if(s.startsWith("in=")){
+						file=s.replace("in=", "").replace("]", "").replace(",", "");
+						file=file.replace("/projectb/shared/pacbio/jobs/", "").replace("/data/filtered_subreads.fasta", "");
+						file=file.substring(file.indexOf('/')+1);
+						mapped=acc=null;
+						break;
+					}
+				}
+			}
+			line=tf.nextLine();
+		}
+		//Release the file handle once the whole log has been read.
+		tf.close();
+		
+	}
+	
+}
diff --git a/current/driver/CompareReferenceGenomes.java b/current/driver/CompareReferenceGenomes.java
new file mode 100755
index 0000000..887884e
--- /dev/null
+++ b/current/driver/CompareReferenceGenomes.java
@@ -0,0 +1,47 @@
+package driver;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * Compares two genome builds chromosome by chromosome (numbers 1 through 25) and
+ * prints every position at which they differ.
+ */
+public class CompareReferenceGenomes {
+	
+	/**
+	 * @param args Two filename patterns in which '#' stands for the chromosome number
+	 */
+	public static void main(String[] args){
+		compareGenomes(args[0], args[1]);
+	}
+	
+	/**
+	 * Loads chromosomes 1..25 from both patterns and reports whether each pair is identical.
+	 * @param pattern1 Filename pattern of the first genome; '#' is replaced by the chromosome number
+	 * @param pattern2 Filename pattern of the second genome; '#' is replaced by the chromosome number
+	 */
+	public static void compareGenomes(String pattern1, String pattern2){
+		for(int chrom=1; chrom<=25; chrom++){
+			final String number=Integer.toString(chrom);
+			System.out.println("Comparing chromosome "+number);
+			final String fname1=pattern1.replace("#", number);
+			final String fname2=pattern2.replace("#", number);
+			final ChromosomeArray cha=ChromosomeArray.read(fname1);
+			final ChromosomeArray chb=ChromosomeArray.read(fname2);
+			final String verdict=(compare(cha, chb) ? "identical." : "different.");
+			System.out.println("..."+verdict);
+		}
+	}
+	
+	/**
+	 * Compares two chromosomes over the index range they have in common, printing
+	 * each differing position.  Differing index bounds also count as a difference.
+	 * @return true if the bounds and every compared base are equal
+	 */
+	public static boolean compare(ChromosomeArray cha, ChromosomeArray chb){
+		final boolean sameBounds=(cha.minIndex==chb.minIndex && cha.maxIndex==chb.maxIndex);
+		if(!sameBounds){
+			System.out.println("Index mismatch in chrom "+cha.chromosome+":\n" +
+					"("+cha.minIndex+" - "+cha.maxIndex+") vs ("+chb.minIndex+" - "+chb.maxIndex+")");
+		}
+		boolean equal=sameBounds;
+		
+		final int start=Data.max(cha.minIndex, chb.minIndex);
+		final int stop=Data.min(cha.maxIndex, chb.maxIndex);
+		
+		for(int pos=start; pos<=stop; pos++){
+			final byte a=cha.get(pos);
+			final byte b=chb.get(pos);
+			if(a==b){continue;}
+			//NOTE(review): the chromosome is printed as a char here but as a plain value in the
+			//index-mismatch message above - confirm which form is intended.
+			System.out.println(((char)cha.chromosome)+"\t"+pos+"\t"+((char)a)+" "+((char)b));
+			equal=false;
+		}
+		return equal;
+		
+	}
+	
+}
diff --git a/current/driver/CompareSequences.java b/current/driver/CompareSequences.java
new file mode 100755
index 0000000..e50f6e9
--- /dev/null
+++ b/current/driver/CompareSequences.java
@@ -0,0 +1,62 @@
+package driver;
+
+import dna.ChromosomeArray;
+
+/**
+ * Compares two chromosome files base by base and prints summary counts: identical
+ * bases, real differences, N-to-base and base-to-N changes, and case-only changes.
+ */
+public class CompareSequences {
+	
+	/**
+	 * @param args args[0] and args[1] are the two chromosome files to compare
+	 */
+	public static void main(String[] args){
+		
+		ChromosomeArray cha1=ChromosomeArray.read(args[0]);
+		ChromosomeArray cha2=ChromosomeArray.read(args[1]);
+		
+		long different=0;
+		long same=0;
+		long nToBase=0;
+		long baseToN=0;
+		long caseDifferent=0;
+		long toUpper=0;
+		long toLower=0;
+		long difLen=cha2.maxIndex-cha1.maxIndex;
+		
+		//Last index present in both sequences.
+		int lim=cha2.maxIndex>cha1.maxIndex ? cha1.maxIndex : cha2.maxIndex;
+		
+		//maxIndex is an inclusive bound (CompareReferenceGenomes iterates with i<=stop), so the
+		//loop must include lim itself; with i<lim the final shared base was never compared.
+		for(int i=0; i<=lim; i++){
+			char a=(char) cha1.get(i);
+			char b=(char) cha2.get(i);
+			if(a==b){
+				same++;
+			}else{
+				different++;
+				if(a=='N' && b!='N'){
+					nToBase++;
+				}else if(a!='N' && b=='N'){
+					baseToN++;
+				}
+				
+				if(Character.toLowerCase(a)==Character.toLowerCase(b)){
+					//Same letter, different case.
+					caseDifferent++;
+					if(a==Character.toLowerCase(a)){
+						toUpper++;
+					}else{
+						toLower++;
+					}
+				}
+				
+			}
+		}
+		
+		//Case-only changes are reported separately and counted as "same" in the totals.
+		same+=caseDifferent;
+		different-=caseDifferent;
+		
+		System.out.println("Length Difference: "+difLen);
+		System.out.println("Same bases:        "+same);
+		System.out.println("Different bases:   "+different+" ("+(100f*different/(float)(different+same))+"%)");
+		System.out.println("Base-to-N:         "+baseToN);
+		System.out.println("N-To-Base:         "+nToBase);
+		System.out.println("Changed case:      "+caseDifferent);
+		System.out.println("toUpperCase:       "+toUpper);
+		System.out.println("toLowerCase:       "+toLower);
+		
+	}
+	
+}
diff --git a/current/driver/ConcatenateFiles.java b/current/driver/ConcatenateFiles.java
new file mode 100755
index 0000000..7fe8d6d
--- /dev/null
+++ b/current/driver/ConcatenateFiles.java
@@ -0,0 +1,92 @@
+package driver;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Arrays;
+
+import dna.Gene;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * Concatenates files into one output, either every regular file of a directory
+ * (byte-for-byte) or a numbered series of per-chromosome text files.
+ */
+public class ConcatenateFiles {
+	
+	/**
+	 * @param args args[0] is a directory or a filename pattern containing '#';
+	 * args[1] (optional) is the output file
+	 */
+	public static void main(String[] args){
+		
+		Timer t=new Timer();
+		final String in=args[0];
+		final String out=(args.length>1 ? args[1] : null);
+		if(new File(in).isDirectory()){
+			try {
+				concatenateDirectory(in, out);
+			} catch (IOException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
+		}else{
+			concatenatePattern(in, out);
+		}
+		t.stop();
+		System.err.println(t);
+		
+	}
+	
+	/**
+	 * Concatenates chromosome files 1 through 25 into one file, replacing each file's own
+	 * header lines (those starting with '>' or ';') with a single ">chrN" header.
+	 * Fails an assertion unconditionally when assertions are enabled, because the
+	 * chromosome range is human-specific.
+	 * @param basename Input filename pattern; '#' is replaced by the chromosome number
+	 * @param out Output filename, or null to derive it by replacing '#' with "ALL"
+	 */
+	public static void concatenatePattern(final String basename, final String out){
+		assert(false) : "This is human-specific.";
+		String outname=(out==null ? basename.replace("#", "ALL") : out);
+		
+		TextStreamWriter tsw=new TextStreamWriter(outname, true, false, true);
+		tsw.start();
+		
+		for(int chrom=1; chrom<26; chrom++){
+			String fname=basename.replace("#", ""+chrom);
+			TextFile tf=new TextFile(fname, false, true);
+			
+			tsw.print(">chr"+chrom+"\n");
+			for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+				//Blank lines carry no sequence; skipping them also keeps charAt(0) from throwing.
+				if(s.length()==0){continue;}
+				char c=s.charAt(0);
+				if(c!='>' && c!=';'){
+					tsw.println(s);
+				}
+			}
+			tf.close();
+			System.err.print(".");
+		}
+		tsw.poison();
+		tsw.waitForFinish();
+	}
+	
+	/**
+	 * Appends the raw bytes of every regular file directly inside a directory, in sorted
+	 * order, to the output.  Subdirectories are not descended into, and a file equal to
+	 * the output path is skipped.
+	 * @param in Directory to read
+	 * @param out Output filename, or null for "stdout"
+	 * @throws IOException if the directory can't be listed or a read or write fails
+	 */
+	public static void concatenateDirectory(final String in, String out) throws IOException{
+		if(out==null){out="stdout";}
+		
+		final byte[] buf=new byte[32768];
+		
+		final File dir=new File(in);
+		final File[] files=dir.listFiles();
+		//listFiles() returns null when the path is not a directory or can't be read.
+		if(files==null){throw new IOException("Can't list the contents of directory "+in);}
+		Arrays.sort(files);
+		
+		final File outfile=new File(out);
+		final OutputStream os=ReadWrite.getOutputStream(out, false, true, true);
+		
+		try {
+			for(File f : files){
+				if(f!=null && f.isFile() && !f.equals(outfile)){
+					String fname=f.getAbsolutePath();
+					System.err.println("Processing "+fname);
+					
+					InputStream is=ReadWrite.getInputStream(fname, false, false);
+					try {
+						for(int lim=is.read(buf); lim>0; lim=is.read(buf)){
+							os.write(buf, 0, lim);
+						}
+					} finally {
+						//Close the input even if copying fails part-way.
+						is.close();
+					}
+					System.err.print(".");
+				}
+			}
+		} finally {
+			//Always release the output, including on failure.
+			ReadWrite.close(os);
+		}
+	}
+	
+	
+}
diff --git a/current/driver/ConcatenateTextFiles.java b/current/driver/ConcatenateTextFiles.java
new file mode 100755
index 0000000..ba0d075
--- /dev/null
+++ b/current/driver/ConcatenateTextFiles.java
@@ -0,0 +1,189 @@
+package driver;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import align2.ReadStats;
+import align2.Tools;
+
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * Concatenates the lines of text files, or of every file under a directory, into one output file.
+ * The calling thread reads lines into fixed-size buffers and hands them through a bounded queue
+ * to a WriteThread, which writes them out.
+ */
+public class ConcatenateTextFiles {
+
+ /** Format: infile1,infile2,...infileN,outfile */
+ public static void main(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ Timer t=new Timer();
+
+ if(ReadWrite.ZIPLEVEL<6){ReadWrite.ZIPLEVEL=6;}
+
+ //Arguments are handled strictly in order: a job runs as soon as it is encountered,
+ //so flags such as append/overwrite only affect the jobs that come after them.
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=(split.length>1 ? split[1] : "true");
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else{
+ //Anything else is treated as a comma-delimited job: inputs first, output last.
+ concatenate(args[i].split(","));
+ }
+
+ }
+ t.stop();
+ System.out.println();
+ System.out.println("Time: \t"+t);
+ }
+
+ /**
+ * Runs one concatenation job.
+ * @param split Input terms followed by the output filename as the last element
+ */
+ private static void concatenate(String[] split) {
+ String outname=split[split.length-1];
+ assert(overwrite || !new File(outname).exists()) : outname+" exists.";
+
+ WriteThread wt=new WriteThread(outname);
+ wt.start();
+
+
+ //One-element array used as a mutable reference, so processTerm can swap in a fresh buffer.
+ ArrayList<String>[] bufferptr=new ArrayList[] {new ArrayList<String>(LIST_SIZE)};
+
+ for(int i=0; i<split.length-1; i++){
+ processTerm(split[i], bufferptr, wt);
+ }
+
+ //An empty list is the signal for the writer to finish; any remaining lines are sent first.
+ ArrayList<String> buffer=bufferptr[0];
+ if(buffer==null){
+ wt.add(new ArrayList<String>(1));
+ }else if(buffer.isEmpty()){
+ wt.add(buffer);
+ }else{
+ wt.add(buffer);
+ wt.add(new ArrayList<String>(1));
+ }
+
+ }
+
+ /**
+ * Reads one input term into the shared buffer, sending each full buffer to the writer.
+ * A directory is processed recursively, skipping any entry whose absolute path equals the writer's output path.
+ * @param term File or directory to read
+ * @param bufferptr One-element holder of the current line buffer; replaced whenever a buffer is sent
+ * @param wt Writer that receives full buffers
+ */
+ private static void processTerm(String term, ArrayList<String>[] bufferptr, WriteThread wt){
+
+ System.out.println("Processing term "+term);
+
+ File f=new File(term);
+ if(!f.isDirectory()){
+
+ TextFile tf=new TextFile(term, false, false);
+
+ ArrayList<String> buffer=bufferptr[0];
+
+ String s=null;
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ buffer.add(s);
+
+ // System.out.println("Added to buffer");
+ if(buffer.size()>=LIST_SIZE){
+ // System.out.println("Sent buffer");
+
+// System.out.println("****** "+term+" ******");
+// for(String b : buffer){
+// System.out.println(b);
+// }
+
+ //Hand the full buffer to the writer and continue with a new one; the old list must not be reused.
+ wt.add(buffer);
+ bufferptr[0]=buffer=new ArrayList<String>(LIST_SIZE);
+ }
+ }
+ tf.close();
+ }else{
+ assert(f.isDirectory());
+ File[] contents=f.listFiles();
+ for(File c : contents){
+ String abs=c.getAbsolutePath();
+ //NOTE(review): wt.fname is a canonical path while abs is only absolute, so the two may differ for the same file (e.g. via symlinks) - confirm.
+ if(!abs.equals(wt.fname)){
+// System.out.println(c+" == "+new File(wt.fname)+" : "+c.equals(new File(wt.fname)));
+ processTerm(abs, bufferptr, wt);
+ }
+ }
+ }
+ }
+
+ /** Consumer thread: takes lists of lines from a bounded queue and writes them to the output file until it receives an empty list. */
+ private static class WriteThread extends Thread{
+
+ /**
+ * Opens the output stream immediately.
+ * @param fname_ Output filename; stored as its canonical path when that can be resolved
+ */
+ public WriteThread(String fname_){
+ String temp=fname_;
+ try {
+ temp=new File(fname_).getCanonicalPath();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ fname=temp;
+ os=ReadWrite.getOutputStream(fname, append, true, true);
+ writer=new PrintWriter(os);
+ }
+
+ /**
+ * Queues a list for writing, blocking while the queue is full.
+ * The put is retried if the calling thread is interrupted.
+ * @param list Lines to write; an empty list tells the thread to finish
+ */
+ public void add(ArrayList<String> list){
+ assert(list!=null);
+ while(list!=null){
+// System.out.println("Adding list to queue "+queue.size());
+ try {
+ queue.put(list);
+ list=null;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+
+ /** Writes queued lists in arrival order; on an empty list, finishes writing and returns. */
+ @Override
+ public void run(){
+
+ ArrayList<String> list=null;
+ //list is reset to null at the end of every pass, so this loop only ends via the return below.
+ while(list==null){
+// System.out.println("Waiting for list...");
+ try {
+ list=queue.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+// System.out.println("Took list of size "+(list==null ? "null" : list.size()+""));
+ if(list!=null){
+ if(list.isEmpty()){
+ ReadWrite.finishWriting(writer, os, fname, allowSubprocess);
+ return;
+ }
+ for(String s : list){
+ if(s!=null){writer.println(s);}
+ }
+ }
+ list=null;
+ }
+ }
+
+ private final OutputStream os;
+ private final PrintWriter writer;
+ private final ArrayBlockingQueue<ArrayList<String>> queue=new ArrayBlockingQueue<ArrayList<String>>(MAX_LISTS);
+ private final String fname;
+
+ }
+
+ /** Capacity of the queue between reader and writer, in lists */
+ public static final int MAX_LISTS=8;
+ /** Number of lines collected before a buffer is handed to the writer */
+ public static final int LIST_SIZE=100;
+ public static boolean overwrite=true;
+ public static boolean append=false;
+ public static boolean allowSubprocess=true;
+
+}
diff --git a/current/driver/Concatenator.java b/current/driver/Concatenator.java
new file mode 100755
index 0000000..216d163
--- /dev/null
+++ b/current/driver/Concatenator.java
@@ -0,0 +1,60 @@
+package driver;
+
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+public class Concatenator {
+
+ /** Writes the lines of every comma-delimited input file named in args[0], in order, to the single output file args[1]. */
+ public static void main(String args[]){
+ //Checked explicitly because assert statements are skipped unless the JVM runs with -ea.
+ if(args.length!=2 || args[1].contains(",")){throw new RuntimeException("Usage: Concatenator <in1,in2,...> <out>");}
+ TextStreamWriter tsw=new TextStreamWriter(args[1], false, false, true);
+ tsw.start();
+ for(String s : args[0].split(",")){
+ writeFile(s, tsw);
+ }
+ tsw.poison();
+ }
+ /** Copies every line of the file fname to tsw, or to standard output when tsw is null. */
+ public static void writeFile(String fname, TextStreamWriter tsw){
+ final TextFile reader=new TextFile(fname, false, false);
+ String current=reader.nextLine();
+ while(current!=null){
+ if(tsw==null){
+ System.out.println(current);
+ }else{
+ tsw.println(current);
+ }
+ current=reader.nextLine();
+ }
+ reader.close();
+ }
+ /**
+ * Reads the named files in array order and returns all of their lines in one buffer,
+ * each line followed by '\n'. Null entries in fnames are skipped.
+ * @param fnames File names to read
+ * @return Buffer holding the concatenated lines
+ */
+ public static StringBuilder merge(String[] fnames){
+ final StringBuilder merged=new StringBuilder();
+ for(final String path : fnames){
+ if(path==null){continue;}
+ final TextFile reader=new TextFile(path, false, false);
+ final String[] content=reader.toStringLines();
+ reader.close();
+ for(int k=0; k<content.length; k++){
+ //Disabled variant from the original (i = file index, s = line): drop '#' lines from files after the second.
+// if(i<2 || !s.startsWith("#")){
+// sb.append(s);
+// sb.append('\n');
+// }
+ merged.append(content[k]).append('\n');
+ content[k]=null; //Release the array's reference to the line once it is copied
+ }
+ }
+ return merged;
+ }
+
+
+}
diff --git a/current/driver/ConvertSamToAln.java b/current/driver/ConvertSamToAln.java
new file mode 100755
index 0000000..aaffbfa
--- /dev/null
+++ b/current/driver/ConvertSamToAln.java
@@ -0,0 +1,73 @@
+package driver;
+
+import java.io.OutputStream;
+import java.io.PrintWriter;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+public class ConvertSamToAln {
+ /** Converts each file named on the command line in turn, printing a confirmation to standard output after each one. */
+ public static void main(String[] args){
+ for(String s : args){
+ convert(s);
+ System.out.println("Converted "+s);
+ }
+ }
+ /** Writes one "chrom TAB loc TAB F|R" line per mapped record of the SAM file fname to a .aln.gz file named after the input. */
+ public static final void convert(String fname){
+ TextFile tf=new TextFile(fname, false, false);
+
+ //Output name: drop a trailing .zip/.gz/.bz2, then a trailing .sam, and append .aln.gz
+
+ String outname=fname;
+ if(outname.toLowerCase().endsWith(".zip")){outname=outname.substring(0, outname.length()-4);}
+ if(outname.toLowerCase().endsWith(".gz")){outname=outname.substring(0, outname.length()-3);}
+ if(outname.toLowerCase().endsWith(".bz2")){outname=outname.substring(0, outname.length()-4);}
+ if(outname.toLowerCase().endsWith(".sam")){outname=outname.substring(0, outname.length()-4);}
+ outname=outname+".aln.gz";
+
+ String s=null;
+
+ OutputStream os=ReadWrite.getOutputStream(outname, false, true, true);
+ PrintWriter out=new PrintWriter(os);
+
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ if(!s.startsWith("@")){ //Lines starting with '@' are header lines and produce no output
+ String[] line=s.split("\t");
+ //The flag, reference name and position are fields 2-4; a shorter line is skipped instead of being indexed out of bounds.
+
+ boolean success=(line.length>3);
+ boolean nomap=false;
+ boolean reverse=false;
+
+ int flag=-1;
+ String chrom=null;
+ int loc=-1;
+ if(success){
+ try {
+ flag=Integer.parseInt(line[1]);
+ chrom=line[2];
+ loc=Integer.parseInt(line[3]);
+ nomap=((flag&0x4)!=0); //Flag bit 0x4: record is unmapped
+ reverse=((flag&0x10)!=0); //Flag bit 0x10: record is on the reverse strand
+ } catch (NumberFormatException e) {
+ success=false; //Non-numeric flag or position: skip the record
+ }
+ }
+ if(success && !nomap){
+ String aln=chrom+"\t"+loc+"\t"+(reverse ? "R" : "F")+"\n";
+ out.print(aln);
+ }
+
+
+ }
+ }
+
+ tf.close();
+ out.flush();
+ out.close();
+
+ }
+
+}
diff --git a/current/driver/CorrelateIdentity.java b/current/driver/CorrelateIdentity.java
new file mode 100755
index 0000000..ed8fc9a
--- /dev/null
+++ b/current/driver/CorrelateIdentity.java
@@ -0,0 +1,189 @@
+package driver;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Random;
+
+import align2.Tools;
+
+import dna.Parser;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 21, 2014
+ *
+ */
+public class CorrelateIdentity {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Code entrance from the command line: expands config arguments, handles help requests, then builds and runs an instance.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0); //Help was requested: print the usage pointer and stop
+ }
+
+ //Create a new CorrelateIdentity instance
+ CorrelateIdentity ci=new CorrelateIdentity(args);
+
+ //And run it
+ ci.process();
+ }
+
+ /**
+ * Display usage information: prints a one-line pointer to the shellscript on outstream.
+ */
+ private static void printOptions(){
+ outstream.println("Please consult the shellscript for usage information.");
+ }
+
+
+ /**
+ * Constructor. Parses key=value arguments into fields, then checks that the output is writable and the inputs are readable and distinct.
+ * @param args Command line arguments
+ */
+ public CorrelateIdentity(String[] args){
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ /* Set global defaults */
+ ReadWrite.ZIPLEVEL=6;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=true;
+
+ /* Initialize local variables with defaults */
+ boolean setOut=false;
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ //Strip leading hyphens; the length test stops charAt(0) from throwing once the key is empty (e.g. for the argument "-").
+ while(a.length()>0 && a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("out") || a.equals("out1")){
+ out=b;
+ setOut=true;
+ }else if(a.equals("samplerate")){
+ samplerate=Float.parseFloat(b);
+ assert(samplerate<=1f && samplerate>=0f) : "samplerate="+samplerate+"; should be between 0 and 1";
+ }else if(a.equals("sampleseed")){
+ sampleseed=Long.parseLong(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out)){
+ throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
+ }
+ if(!Tools.testInputFiles(false, true, in1, in2)){
+ throw new RuntimeException("\nCan't read some input files.\n");
+ }
+ if(!Tools.testForDuplicateFiles(true, in1, in2, out)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+ /** Reads both whitespace-delimited matrices, pairs matrix1[i][j] with matrix2[i][j] for 1<=j<=i, shuffles the pairs and writes them tab-separated to out. */
+ public void process(){
+ final String[][] matrix1, matrix2;
+
+ {
+ TextFile tf=new TextFile(in1);
+ String[] s=tf.toStringLines();
+ tf.close();
+ matrix1=tf.doublesplitWhitespace(s, true);
+ }
+
+ {
+ TextFile tf=new TextFile(in2);
+ String[] s=tf.toStringLines();
+ tf.close();
+ matrix2=tf.doublesplitWhitespace(s, true);
+ }
+ if(matrix1.length!=matrix2.length){throw new RuntimeException("Input matrices have different row counts: "+matrix1.length+" vs "+matrix2.length);}
+ ArrayList<String[]> list=new ArrayList<String[]>();
+ for(int i=0; i<matrix1.length; i++){
+ for(int j=1; j<=i; j++){ //Column 0 is never paired; row i contributes columns 1..i only
+ list.add(new String[] {matrix1[i][j], matrix2[i][j]});
+ }
+ }
+ if(sampleseed>=0){randy.setSeed((long)sampleseed);} //A non-negative sampleseed makes the shuffle order reproducible
+ Collections.shuffle(list, randy);
+
+ TextStreamWriter tsw=new TextStreamWriter(out, overwrite, append, true);
+ tsw.start();
+ for(String[] pair : list){
+ tsw.print(pair[0]+"\t"+pair[1]+"\n");
+ }
+ tsw.poisonAndWait();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Input files */
+ public String in1, in2;
+ /** Output file */
+ public String out;
+
+ private Random randy=new Random(); //Random number source owned by this instance
+
+ private float samplerate=1; //Set by the samplerate argument; an assert requires it to lie in [0,1]
+ private long sampleseed=-1; //Set by the sampleseed argument via Long.parseLong; kept as long because a float cannot hold every seed exactly
+ private int columnLength=Integer.MAX_VALUE; //NOTE(review): no reader or writer of this field is visible in the class - confirm it is still needed
+ private boolean overwrite=false; //Set by overwrite/ow; passed to the output-file check and the writer
+ private boolean append=false; //Never set from arguments; passed to the output-file check and the writer
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Verbose messages */
+ public static final boolean verbose=false; //123
+
+ /** Print messages to this stream */
+ private static PrintStream outstream=System.err;
+
+}
diff --git a/current/driver/CountRNAs.java b/current/driver/CountRNAs.java
new file mode 100755
index 0000000..74a0225
--- /dev/null
+++ b/current/driver/CountRNAs.java
@@ -0,0 +1,32 @@
+package driver;
+
+import dna.Data;
+import dna.Gene;
+
+public class CountRNAs {
+ /** Tallies the genes of chromosomes 1 through 24 as pseudogenes, untranslated or translated, and prints the totals. */
+ public static void main(String[] args){
+ //args[0] = genome build number, args[1] = gene map name
+ Data.GENOME_BUILD=Integer.parseInt(args[0]);
+ Data.GENE_MAP=args[1];
+
+ //Tally slots: 0 = translated, 1 = untranslated, 2 = pseudogene
+ final long[] tally=new long[3];
+ byte chrom=1;
+ while(chrom<=24){
+ for(Gene gene : Data.getGenes(chrom)){
+ //The pseudo flag takes precedence over the untranslated flag
+ final int slot=(gene.pseudo ? 2 : (gene.untranslated ? 1 : 0));
+ tally[slot]++;
+ }
+ chrom++;
+ }
+
+ System.out.println("Gene map: "+Data.GENE_MAP);
+ System.out.println("Pseudogenes: "+tally[2]);
+ System.out.println("Translated Genes: "+tally[0]);
+ System.out.println("Untranslated Genes: "+tally[1]);
+ }
+
+
+}
diff --git a/current/driver/CountSharedLines.java b/current/driver/CountSharedLines.java
new file mode 100755
index 0000000..88f29da
--- /dev/null
+++ b/current/driver/CountSharedLines.java
@@ -0,0 +1,247 @@
+package driver;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.LinkedHashSet;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Counts the lines that each file in one input set shares with every file in the other set.
+ * @author Brian Bushnell
+ * @date Jul 6, 2015
+ *
+ */
+public class CountSharedLines {
+ /** Program entry point: creates a Timer, builds an instance from args and runs process with that timer. */
+ public static void main(String[] args){
+ Timer t=new Timer();
+ CountSharedLines mb=new CountSharedLines(args);
+ mb.process(t);
+ }
+ /** Parses key=value arguments into fields and requires at least one file name in each of the two input sets. */
+ public CountSharedLines(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parseCommon(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ if(b!=null){
+ String[] x=b.split(",");
+ for(String s : x){
+ in1.add(s);
+ }
+ }
+ }else if(a.equals("names") || a.equals("in2")){
+ if(b!=null){
+ String[] x=b.split(",");
+ for(String s : x){
+ in2.add(s);
+ }
+ }
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("lines") || a.equals("maxlines")){
+ maxLines=Tools.parseKMG(b);
+ }else if(a.equals("substrings") || a.equals("substring")){
+ if(b==null){b="t";}
+ if(b.equals("header")){
+ lineSubstringOfName=true;
+ }else if(b.equals("name")){
+ nameSubstringOfLine=true;
+ }else{
+ nameSubstringOfLine=lineSubstringOfName=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("prefix") || a.equals("prefixmode")){
+ prefixMode=Tools.parseBoolean(b);
+ }else if(a.equals("replace")){
+ String[] split2=b.split(",");
+ assert(split2.length==2);
+ replace1=split2[0];
+ replace2=split2[1];
+ }else if(a.equals("casesensitive") || a.equals("case")){
+ ignoreCase=!Tools.parseBoolean(b);
+ }else if(a.equals("include") || a.equals("retain")){
+ exclude=!Tools.parseBoolean(b);
+ }else if(a.equals("exclude") || a.equals("remove")){
+ exclude=Tools.parseBoolean(b);
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+// {
+// String[] x=in1.toArray(new String[in1.size()]);
+// in1.clear();
+// for(String s : x){
+// Tools.addNames(s, in1);
+// }
+// x=in2.toArray(new String[in2.size()]);
+// in2.clear();
+// for(String s : x){
+// Tools.addNames(s, in2);
+// }
+// }
+
+ {//Process parser fields
+ overwrite=parser.overwrite;
+ append=parser.append;
+ }
+ //Both sets are created at declaration and are never null, so a missing input shows up as an empty set.
+ if(in1.isEmpty() || in2.isEmpty()){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required from each set.");
+ }
+ }
+
+ final String getOutputName(String fname){
+ fname=fname.replaceAll("\\\\", "/");
+ if(!fname.contains("/")){fname="./"+fname;}
+ int idx=fname.lastIndexOf('/');
+ final String out=fname.substring(0, idx+1)+"out_"+fname.substring(idx+1);
+ return out;
+ }
+ /** Compares every in1 file against all in2 files and every in2 file against all in1 files, then prints the elapsed time. */
+ void process(Timer t){
+
+ for(String fname : in1){
+ processInner(fname, getOutputName(fname), in2); //Results go to the "out_"-prefixed sibling of fname
+ }
+ for(String fname : in2){
+ processInner(fname, getOutputName(fname), in1); //Same comparison in the opposite direction
+ }
+
+ t.stop();
+
+ outstream.println("\nTime: "+t);
+
+ if(errorState){ //Set when any reader or writer reported a problem on close
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+
+ }
+ /** Loads the distinct lines of fname after applying the case, replace and prefix options; records reader errors in errorState. */
+ LinkedHashSet<String> getContents(String fname){
+ final FileFormat format=FileFormat.testInput(fname, FileFormat.TEXT, null, true, true);
+ final LinkedHashSet<String> unique=new LinkedHashSet<String>();
+ final TextFile reader=new TextFile(format);
+ String raw=reader.readLine(true);
+ while(raw!=null){
+ String key=(ignoreCase ? raw.toLowerCase() : raw);
+ if(replace1!=null){key=key.replace(replace1, replace2);}
+ if(prefixMode){
+ //Cut at the first whitespace character; one in the final position is not considered
+ for(int pos=0; pos<key.length()-1; pos++){
+ if(Character.isWhitespace(key.charAt(pos))){
+ key=key.substring(0, pos+1).trim();
+ break;
+ }
+ }
+ }
+ unique.add(key);
+ raw=reader.readLine(true);
+ }
+ errorState|=reader.close();
+ return unique;
+ }
+ /**
+ * Counts, for each file in list, how many distinct lines of fnameIn it also contains,
+ * and writes one "name TAB count" row per file to fnameOut.
+ * @param fnameIn File whose lines are looked up in the others
+ * @param fnameOut Destination of the result rows
+ * @param list Files to compare against
+ */
+ void processInner(String fnameIn, String fnameOut, Collection<String> list){
+ final LinkedHashSet<String> reference=getContents(fnameIn);
+ final FileFormat outFormat=FileFormat.testOutput(fnameOut, FileFormat.TEXT, null, true, overwrite, append, false);
+
+ //No writer exists when testOutput returns null; the files are still read and counted
+ final TextStreamWriter writer=(outFormat==null ? null : new TextStreamWriter(outFormat));
+ if(writer!=null){writer.start();}
+
+ for(String otherName : list){
+ long overlap=0;
+ final LinkedHashSet<String> other=getContents(otherName);
+ for(String entry : reference){
+ overlap+=(other.contains(entry) ? 1 : 0);
+ }
+ if(writer==null){continue;}
+ writer.print(ReadWrite.stripToCore(otherName)+"\t"+overlap+"\n");
+ }
+
+ if(writer!=null){
+ errorState|=writer.poisonAndWait();
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /** Usage text is not implemented: trips an assertion when assertions are enabled and otherwise does nothing. */
+ private void printOptions(){
+ assert(false) : "printOptions: TODO";
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private LinkedHashSet<String> in1=new LinkedHashSet<String>(); //File names given by in= / in1=
+ private LinkedHashSet<String> in2=new LinkedHashSet<String>(); //File names given by names= / in2=
+
+ /*--------------------------------------------------------------*/
+
+ private boolean exclude=true; //Set by include/exclude options; not read anywhere in this class
+ private boolean nameSubstringOfLine=false; //Set by the substrings option; not read anywhere in this class
+ private boolean lineSubstringOfName=false; //Set by the substrings option; not read anywhere in this class
+ private boolean ignoreCase=true; //When true, getContents lowercases every line before storing it
+ private boolean prefixMode=false; //When true, getContents keeps only the text up to the first whitespace
+ private long maxLines=-1; //Set by lines/maxlines; not read anywhere in this class
+
+ private String replace1=null; //Text to replace in each line; null disables replacement
+ private String replace2=null; //Replacement for replace1
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination of status messages
+ public static boolean verbose=false; //Set by the verbose option and copied to ReadWrite.verbose
+ public boolean errorState=false; //Accumulates error results from reader and writer shutdown
+ private boolean overwrite=true; //Replaced by parser.overwrite in the constructor
+ private boolean append=false; //Replaced by parser.append in the constructor
+ private boolean useSharedHeader=false; //Not referenced anywhere in this class
+
+}
diff --git a/current/driver/EstherFilter.java b/current/driver/EstherFilter.java
new file mode 100755
index 0000000..1b5ff36
--- /dev/null
+++ b/current/driver/EstherFilter.java
@@ -0,0 +1,167 @@
+package driver;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import align2.ListNum;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 29, 2014
+ *
+ */
+public class EstherFilter {
+ /** Runs blastall on args[0] (query) against args[1] (database) and reports queries scoring at least args[2]; a fourth argument "fasta" selects fasta output. */
+ public static void main(String[] args){
+ String query=args[0];
+ String ref=args[1];
+ float cutoff=Float.parseFloat(args[2]);
+ boolean outputFasta=false;
+ if(args.length>3 && args[3].equalsIgnoreCase("fasta")){
+ outputFasta=true;
+ }
+ String command="blastall -p blastn -i "+query+" -d "+ref+" -e 0.00001 -m 8"; //NOTE(review): query and ref are concatenated unquoted into the command line
+
+ ReadWrite.FORCE_KILL=true;
+
+// InputStream is=ReadWrite.getInputStreamFromProcess("stdin", command, false);
+// InputStream is=ReadWrite.getInputStreamFromProcess("", command, false);
+ InputStream is=ReadWrite.getInputStreamFromProcess(null, command, false);
+
+ InputStreamReader isr=new InputStreamReader(is);
+ BufferedReader b=new BufferedReader(isr, 32768); //32768-character read buffer over the process output
+
+// System.out.println("Before");
+
+ if(outputFasta){
+// System.out.println("A");
+ processToFasta(b, cutoff, query); //Prints the matching query sequences
+ }else{
+// System.out.println("B");
+ processToNames(b, cutoff); //Prints only the matching query names
+ }
+
+// System.out.println("Finished");
+
+// ReadWrite.finishReading(is, "stdin", true, b, isr);
+// ReadWrite.finishReading(is, "", true, b, isr);
+ ReadWrite.finishReading(is, null, true, b, isr);
+
+ }
+ /** Collects the names (column 1) of tab-delimited lines whose column 12 is at least cutoff, then prints those sequences from the query file as fasta. */
+ public static void processToFasta(BufferedReader b, float cutoff, String query){
+ String s=null;
+
+ ArrayList<String> names=new ArrayList<String>();
+// System.out.println("Reading line 0");
+ try {
+ s=b.readLine();
+ } catch (IOException e1) {
+ //Nothing could be read: s stays null and the loop below is skipped.
+ e1.printStackTrace();
+ }
+// System.out.println("Starting");
+ String prev=""; //Last name added; collapses runs of consecutive lines for the same query
+
+ while(s!=null){
+ String[] split=s.split("\t");
+ float value=0;
+ try {
+ if(split.length>11){value=Float.parseFloat(split[11].trim());} //Lines with fewer than 12 columns are malformed and never match
+ } catch (NumberFormatException e) {
+ e.printStackTrace();
+// System.err.println("Bad line:\n"+s);
+ }
+ if(split.length>11 && value>=cutoff){
+ if(!prev.equals(split[0])){
+ prev=split[0];
+ names.add(split[0]);
+ }
+ }
+// System.out.println("Reading line");
+ try {
+ s=b.readLine();
+ } catch (IOException e) {
+ s=null; //Stop reading; otherwise the same line would be processed again forever
+ e.printStackTrace();
+ }
+ }
+
+ outputFasta(query, names);
+ }
+ /** Prints the name (column 1) of every tab-delimited line whose column 12 is at least cutoff. */
+ public static void processToNames(BufferedReader b, float cutoff){
+ String s=null;
+// System.out.println("Reading line 0");
+ try {
+ s=b.readLine();
+ } catch (IOException e1) {
+ //Nothing could be read: s stays null and the loop below is skipped.
+ e1.printStackTrace();
+ }
+// System.out.println("Starting");
+ //Unlike processToFasta, repeated names are not collapsed: every qualifying line is printed.
+ while(s!=null){
+ String[] split=s.split("\t");
+ float value=0;
+ try {
+ if(split.length>11){value=Float.parseFloat(split[11].trim());} //Lines with fewer than 12 columns are malformed and never match
+ } catch (NumberFormatException e) {
+ e.printStackTrace();
+// System.err.println("Bad line:\n"+s);
+ }
+ if(split.length>11 && value>=cutoff){
+ System.out.println(split[0]);
+ }
+// System.out.println("Reading line");
+ try {
+ s=b.readLine();
+ } catch (IOException e) {
+ s=null; //Stop reading; otherwise the same line would be processed again forever
+ e.printStackTrace();
+ }
+ }
+ }
+ /** Prints, as fasta wrapped at 70 columns, every read of fname whose id appears in names; names is sorted in place. */
+ public static void outputFasta(String fname, ArrayList<String> names){
+ //Sorting is required by the binary search below.
+ Collections.sort(names);
+
+ FileFormat ff=FileFormat.testInput(fname, FileFormat.FASTA, null, false, true);
+ ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null);
+ cris.start(); //4567
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ /* Iterate through read lists from the input stream */
+ while(reads!=null && reads.size()>0){
+
+ for(Read r : reads){
+ if(Collections.binarySearch(names, r.id)>=0){
+ System.out.println(r.toFasta(70));
+ }
+ }
+
+ /* Dispose of the old list and fetch a new one */
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ /* Cleanup: the loop also ends when ln or ln.list is null, so both are checked before returning the last list */
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ }
+
+
+}
diff --git a/current/driver/FilterLines.java b/current/driver/FilterLines.java
new file mode 100755
index 0000000..a0f73fe
--- /dev/null
+++ b/current/driver/FilterLines.java
@@ -0,0 +1,263 @@
+package driver;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Filters text lines by exact match or substring.
+ * @author Brian Bushnell
+ * @date Jul 6, 2015
+ *
+ */
+public class FilterLines {
+ /** Program entry point: creates a Timer, builds an instance from args and runs process with that timer. */
+ public static void main(String[] args){
+ Timer t=new Timer();
+ FilterLines mb=new FilterLines(args);
+ mb.process(t);
+ }
+ /** Parses key=value and positional arguments, normalizes the names set, and resolves the input and output file formats. */
+ public FilterLines(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("names")){
+ if(b!=null){
+ String[] x=b.split(","); //Comma-delimited list of names
+ for(String s : x){
+ names.add(s);
+ }
+ }
+ }else if(a.equals("lines") || a.equals("maxlines")){
+ maxLines=Tools.parseKMG(b);
+ }else if(a.equals("substrings") || a.equals("substring")){
+ if(b==null){b="t";} //A bare "substrings" flag counts as true
+ if(b.equals("header")){
+ lineSubstringOfName=true;
+ }else if(b.equals("name")){
+ nameSubstringOfLine=true;
+ }else{
+ nameSubstringOfLine=lineSubstringOfName=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("prefix") || a.equals("prefixmode")){
+ prefixMode=Tools.parseBoolean(b);
+ }else if(a.equals("replace")){
+ String[] split2=b.split(","); //Expected form: replace=old,new
+ assert(split2.length==2);
+ replace1=split2[0];
+ replace2=split2[1];
+ }else if(a.equals("casesensitive") || a.equals("case")){
+ ignoreCase=!Tools.parseBoolean(b);
+ }else if(a.equals("include") || a.equals("retain")){
+ exclude=!Tools.parseBoolean(b); //include=t is stored as exclude=false
+ }else if(a.equals("exclude") || a.equals("remove")){
+ exclude=Tools.parseBoolean(b);
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ parser.in1=arg; //First positional argument: the input, if it is stdin or an existing file
+ }else if(parser.out1==null && i==1 && !arg.contains("=")){
+ parser.out1=arg; //Second positional argument: the output
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {
+ String[] x=names.toArray(new String[names.size()]);
+ names.clear();
+ for(String s : x){
+ Tools.addNames(s, names, true); //NOTE(review): presumably expands an entry that names a file into that file's contents - confirm in Tools
+ }
+ }
+ if(ignoreCase){ //Lowercase the names so they compare equal to the lowercased lines in process
+ String[] x=names.toArray(new String[names.size()]);
+ names.clear();
+ for(String s : x){
+ names.add(s.toLowerCase());
+ }
+ }
+
+ {//Process parser fields
+ overwrite=parser.overwrite;
+ append=parser.append;
+
+ in1=parser.in1;
+
+ out1=parser.out1;
+ }
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println(out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, true);
+ }
+ /** Streams the input line by line, keeps or drops each line according to the names set and the match options, and reports totals on outstream. */
+ void process(Timer t){
+
+
+ final TextFile tf=new TextFile(ffin1);
+
+ final TextStreamWriter tsw;
+ if(out1!=null){
+ tsw=new TextStreamWriter(ffout1);
+ tsw.start();
+ }else{tsw=null;} //No output file: lines are still filtered and counted, just not written
+
+ long linesProcessed=0;
+
+ long linesOut=0;
+ long bytesOut=0;
+
+ {
+ for(String line0=tf.readLine(true); line0!=null; line0=tf.readLine(true)){
+ if(maxLines>0 && linesProcessed>=maxLines){break;}
+ linesProcessed++;
+
+ String line=(ignoreCase ? line0.toLowerCase() : line0);
+ if(replace1!=null){line=line.replace(replace1, replace2);}
+ String prefix=null;
+ if(prefixMode){
+ for(int x=1; x<line.length(); x++){
+ char c=line.charAt(x-1);
+ //The prefix is the text up to the first whitespace character; one in the final position is not considered
+ if(Character.isWhitespace(c)){
+ prefix=line.substring(0, x).trim();
+ break;
+ }
+ }
+ }
+
+ boolean keepThisLine;
+ boolean match;
+ {
+ match=(names.contains(line) || (prefix!=null && names.contains(prefix)));
+ if(!match && (nameSubstringOfLine || lineSubstringOfName)){
+ for(String name : names){ //Substring modes need a scan of all names; stop at the first hit
+ if((lineSubstringOfName && name.contains(line)) || (nameSubstringOfLine && line.contains(name))){match=true; break;}
+ else if(prefix!=null && ((lineSubstringOfName && name.contains(prefix)) || (nameSubstringOfLine && prefix.contains(name)))){match=true; break;}
+ }
+ }
+ keepThisLine=(match!=exclude); //exclude=true keeps non-matching lines; exclude=false keeps matching lines
+ }
+
+ // assert(false) : names.contains(name)+", "+name+", "+prefix+", "+exclude;
+
+ if(keepThisLine){
+ if(tsw!=null){tsw.println(line0);} //The original, unmodified line is what gets written
+ linesOut++;
+ bytesOut+=line0.length();
+ }
+ }
+ }
+
+ errorState|=tf.close();
+ if(tsw!=null){errorState|=tsw.poisonAndWait();}
+
+ t.stop();
+
+ double rpnano=linesProcessed/(double)(t.elapsed);
+
+ outstream.println("\nTime: "+t);
+ outstream.println("Lines Processed: "+linesProcessed+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Lines Out: "+linesOut);
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /** Usage text is not implemented: trips an assertion when assertions are enabled and otherwise does nothing. */
+ private void printOptions(){
+ assert(false) : "printOptions: TODO";
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+
+ private String out1=null;
+
+ /*--------------------------------------------------------------*/
+
+ private boolean exclude=true;
+ private boolean nameSubstringOfLine=false;
+ private boolean lineSubstringOfName=false;
+ private boolean ignoreCase=true;
+ private boolean prefixMode=false;
+ private long maxLines=-1;
+
+ private String replace1=null;
+ private String replace2=null;
+
+ private LinkedHashSet<String> names=new LinkedHashSet<String>();
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+
+ private final FileFormat ffout1;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=true;
+ private boolean append=false;
+ private boolean useSharedHeader=false;
+
+}
diff --git a/current/driver/FilterReadsByName.java b/current/driver/FilterReadsByName.java
new file mode 100755
index 0000000..dcfca15
--- /dev/null
+++ b/current/driver/FilterReadsByName.java
@@ -0,0 +1,437 @@
+package driver;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 8, 2014
+ *
+ */
+public class FilterReadsByName {
+
+ /** Program entry point: builds a filter from the command-line arguments and runs it. */
+ public static void main(String[] args){
+ final Timer timer=new Timer(); //Created before the arguments are parsed
+ new FilterReadsByName(args).process(timer);
+ }
+
+ /** Parses the command-line arguments, normalizes the name list, and resolves the input and output file formats. */
+ public FilterReadsByName(String[] args){
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+ //NOTE(review): outstream is initialized to System.err, so the assignment on the next line does not change it.
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ SamLine.SET_FROM_OK=true;
+ ReadStreamWriter.USE_ATTACHED_SAMLINE=true;
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Only the text between the first and second '=' becomes the value
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("names")){
+ if(b!=null){
+ String[] x=b.split(",");
+ for(String s : x){
+ names.add(s);
+ }
+ }
+ }else if(a.equals("substrings") || a.equals("substring")){
+ if(b==null){b="t";}
+ if(b.equals("header")){
+ headerSubstringOfName=true;
+ }else if(b.equals("name")){
+ nameSubstringOfHeader=true;
+ }else{
+ nameSubstringOfHeader=headerSubstringOfName=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("casesensitive") || a.equals("case")){
+ ignoreCase=!Tools.parseBoolean(b);
+ }else if(a.equals("include") || a.equals("retain")){
+ exclude=!Tools.parseBoolean(b);
+ }else if(a.equals("exclude") || a.equals("remove")){
+ exclude=Tools.parseBoolean(b);
+ }else if(a.equals("minlen") || a.equals("minlength")){
+ minLength=(int)Tools.parseKMG(b);
+ }else if(a.equals("truncateheadersymbol") || a.equals("truncate") || a.equals("ths")){
+ truncateHeaderSymbol=Tools.parseBoolean(b);
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ parser.in1=arg;
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+ //Re-add every entry through Tools.addNames. NOTE(review): presumably expands entries that are files of names - confirm.
+ {
+ String[] x=names.toArray(new String[names.size()]);
+ names.clear();
+ for(String s : x){
+ Tools.addNames(s, names, true);
+ }
+ }
+ if(ignoreCase){ //Lowercase the names; process() lowercases read headers the same way
+ String[] x=names.toArray(new String[names.size()]);
+ names.clear();
+ for(String s : x){
+ names.add(s.toLowerCase());
+ }
+ }
+ if(truncateHeaderSymbol){ //Drop a leading '@' or '>' from names longer than one character
+ String[] x=names.toArray(new String[names.size()]);
+ names.clear();
+ for(String s : x){
+ String s2=s;
+ if(s.length()>1 && (s.charAt(0)=='@' || s.charAt(0)=='>')){s2=s.substring(1);}
+ names.add(s2);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+ //A '#' in a path stands for the read number: it becomes "1" for the first file and "2" for the second.
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+// if(!parser.setOut){
+// out1="stdout";
+// }
+ }
+ //Unless interleaving was set explicitly, derive it from how many input and output files were given.
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+ //An output path that is literally "null" means that output is not written.
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+// if(ffin1!=null && ffout1!=null && ffin1.samOrBam()){
+// if(ffout1.samOrBam()){
+// useSharedHeader=true;
+// }else if(ffout1.bread()){
+// SamLine.CONVERT_CIGAR_TO_MATCH=true;
+// }
+// }
+ }
+
+ /**
+ * Streams every read (or pair) from the input, keeps or discards it by name, and prints a summary.
+ * A pair is judged by the header of read 1 and kept or discarded as a unit; a read is always dropped when
+ * both its length and its mate's length are below minLength.
+ * @param t Timer that is stopped once the streams are closed and then used for the throughput figures
+ */
+ void process(Timer t){
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, useSharedHeader, ffin1, ffin2, qfin1, qfin2);
+ if(verbose){outstream.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+// if(verbose){
+ if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+// }
+
+ final ConcurrentReadOutputStream ros;
+ if(out1!=null){
+ final int buff=4;
+
+ if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+ outstream.println("Writing interleaved.");
+ }
+ //Guard against overwriting an input: out1 is compared with in2 as well (in1 used to be tested twice).
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+ assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, useSharedHeader);
+ ros.start();
+ }else{ros=null;}
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ long readsOut=0;
+ long basesOut=0;
+
+ {
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+// outstream.println("Fetched "+reads);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+
+ ArrayList<Read> retain=new ArrayList<Read>(reads.size());
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ //Only r1's header is examined; its mate stays attached to it.
+
+ final int initialLength1=r1.length();
+ final int initialLength2=(r1.mateLength());
+ readsProcessed+=1+r1.mateCount();
+ basesProcessed+=initialLength1+initialLength2;
+
+ final String header=(ignoreCase ? r1.id.toLowerCase() : r1.id);
+ String prefix=null;
+ for(int x=1; x<header.length(); x++){
+ char c=header.charAt(x-1);
+ char next=header.charAt(x);
+ if(Character.isWhitespace(c) || (c=='/' && (next=='1' || next=='2'))){
+ prefix=header.substring(0, x).trim();
+ break;
+ }
+ }
+ //NOTE(review): in the '/' case the prefix keeps its trailing slash ("name/") - confirm that is intended.
+ boolean keepThisRead=(initialLength1>=minLength || initialLength2>=minLength);
+ boolean match=false;
+ if(keepThisRead){
+ match=(names.contains(header) || (prefix!=null && names.contains(prefix)));
+ if(!match && (nameSubstringOfHeader || headerSubstringOfName)){
+ for(String name : names){ //Stops at the first matching name
+ if((headerSubstringOfName && name.contains(header)) || (nameSubstringOfHeader && header.contains(name))){match=true; break;}
+ else if(prefix!=null && ((headerSubstringOfName && name.contains(prefix)) || (nameSubstringOfHeader && prefix.contains(name)))){match=true; break;}
+ }
+ }
+ keepThisRead=(match!=exclude);
+ }
+
+// assert(false) : names.contains(name)+", "+name+", "+prefix+", "+exclude;
+
+ if(keepThisRead){
+ retain.add(r1);
+ readsOut+=1+r1.mateCount();
+ basesOut+=initialLength1+initialLength2;
+ }
+ }
+
+ final ArrayList<Read> listOut=retain;
+
+ if(ros!=null){ros.add(listOut, ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ errorState|=ReadStats.writeAll();
+
+ errorState|=ReadWrite.closeStreams(cris, ros);
+
+ t.stop();
+
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+// String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+// String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+//
+// while(rpstring.length()<8){rpstring=" "+rpstring;}
+// while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: "+t);
+ outstream.println("Reads Processed: "+readsProcessed+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+basesProcessed+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ outstream.println("Reads Out: "+readsOut);
+ outstream.println("Bases Out: "+basesOut);
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){ //Placeholder: no usage text is printed; the commented-out draft below names jgi.ReformatReads.
+ assert(false) : "printOptions: TODO"; //Fails with an AssertionError when assertions are enabled; otherwise does nothing.
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ private String out1=null;
+ private String out2=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+ private boolean exclude=true;
+ private boolean nameSubstringOfHeader=false;
+ private boolean headerSubstringOfName=false;
+ private boolean ignoreCase=true;
+ private boolean truncateHeaderSymbol=false;
+
+ private int minLength=0;
+
+ private LinkedHashSet<String> names=new LinkedHashSet<String>();
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=true;
+ private boolean append=false;
+ private boolean useSharedHeader=false;
+
+}
diff --git a/current/driver/FindMotifs.java b/current/driver/FindMotifs.java
new file mode 100755
index 0000000..1eac030
--- /dev/null
+++ b/current/driver/FindMotifs.java
@@ -0,0 +1,362 @@
+package driver;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashSet;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+import dna.GeneSet;
+import dna.Motif;
+import dna.MotifProbsN;
+
+
+public class FindMotifs {
+
+
+ public static void main(String[] args){
+
+ int chrom=1;
+ if(args.length>0){
+ chrom=Integer.parseInt(args[0]);
+ }
+
+ int maxChrom=22;
+
+
+ float[][] grid={
+
+// {0.19540f, 0.26751f, 0.34873f, 0.18835f},
+// {0.18987f, 0.33930f, 0.28953f, 0.18131f},
+// {0.19421f, 0.32921f, 0.28259f, 0.19399f},
+
+ {0.19519f, 0.23856f, 0.38961f, 0.17664f},
+ {0.17382f, 0.33995f, 0.30720f, 0.17903f},
+ {0.24452f, 0.38376f, 0.25710f, 0.11462f},
+
+ {0.46075f, 0.09954f, 0.38018f, 0.05953f},
+ {0.29028f, 0.38874f, 0.19941f, 0.12156f},
+ {0.17610f, 0.46129f, 0.28953f, 0.07309f},
+
+ {0.99859f, 0.00108f, 0.00011f, 0.00022f},
+ {0.00001f, 0.00001f, 0.00076f, 0.99924f},
+ {0.00001f, 0.00001f, 0.99924f, 0.00076f},
+//
+ {0.20993f, 0.14877f, 0.51269f, 0.12861f},
+ {0.26903f, 0.39861f, 0.18337f, 0.14899f},
+// {0.14812f, 0.26448f, 0.39286f, 0.19453f},
+// {0.23476f, 0.24073f, 0.35697f, 0.16753f},
+// {0.24886f, 0.32043f, 0.22511f, 0.20560f},
+// {0.16504f, 0.31956f, 0.34288f, 0.17252f},
+// {0.23444f, 0.26838f, 0.32531f, 0.17187f},
+ };
+
+
+ float[][] gridATG={
+ {1, 0, 0, 0},
+ {0, 0, 0, 1},
+ {0, 0, 1, 0},
+ };
+
+
+ float[][] grid2={
+// {0.03793f, 0.04397f, 0.06643f, 0.02087f, 0.06272f, 0.11378f, 0.06177f, 0.07713f, 0.06526f, 0.10277f, 0.09535f, 0.04248f, 0.02564f, 0.06590f, 0.05943f, 0.05859f},
+// {0.04534f, 0.04238f, 0.07427f, 0.02956f, 0.06897f, 0.11463f, 0.06579f, 0.07702f, 0.05594f, 0.09821f, 0.08677f, 0.04206f, 0.02363f, 0.05943f, 0.05721f, 0.05880f},
+// {0.04397f, 0.04524f, 0.07766f, 0.02702f, 0.06537f, 0.10372f, 0.07130f, 0.07427f, 0.05234f, 0.09609f, 0.09397f, 0.04164f, 0.02808f, 0.05954f, 0.06399f, 0.05583f},
+//
+// {0.04990f, 0.04164f, 0.07342f, 0.02479f, 0.06039f, 0.11294f, 0.06113f, 0.07013f, 0.06929f, 0.10234f, 0.09556f, 0.03973f, 0.02998f, 0.06102f, 0.05329f, 0.05445f},
+// {0.05075f, 0.04725f, 0.08391f, 0.02765f, 0.06590f, 0.11664f, 0.06388f, 0.07151f, 0.06770f, 0.10012f, 0.07840f, 0.03719f, 0.02458f, 0.06113f, 0.05424f, 0.04916f},
+// {0.04831f, 0.04005f, 0.08931f, 0.03125f, 0.06685f, 0.09249f, 0.09546f, 0.07035f, 0.05583f, 0.08359f, 0.10234f, 0.03867f, 0.02278f, 0.05287f, 0.06293f, 0.04693f},
+//
+// {0.04587f, 0.05117f, 0.07045f, 0.02627f, 0.05488f, 0.09450f, 0.05541f, 0.06420f, 0.06664f, 0.13328f, 0.10478f, 0.04534f, 0.02087f, 0.06208f, 0.05912f, 0.04513f},
+ {0.04598f, 0.04015f, 0.07321f, 0.02892f, 0.06261f, 0.12575f, 0.07331f, 0.07935f, 0.06282f, 0.10637f, 0.08370f, 0.03687f, 0.02246f, 0.05721f, 0.05318f, 0.04810f},
+ {0.03952f, 0.03189f, 0.09704f, 0.02543f, 0.07639f, 0.08666f, 0.10266f, 0.06378f, 0.05424f, 0.07850f, 0.11166f, 0.03899f, 0.02426f, 0.04270f, 0.07819f, 0.04810f},
+
+ {0.04312f, 0.04291f, 0.08878f, 0.01960f, 0.03666f, 0.09800f, 0.04577f, 0.05933f, 0.07098f, 0.14207f, 0.12395f, 0.05255f, 0.02129f, 0.05795f, 0.04990f, 0.04714f},
+ {0.04662f, 0.04672f, 0.06335f, 0.01536f, 0.09069f, 0.14980f, 0.05488f, 0.04556f, 0.07670f, 0.11654f, 0.09026f, 0.02490f, 0.03009f, 0.07162f, 0.04895f, 0.02797f},
+ {0.09503f, 0.01695f, 0.11802f, 0.01409f, 0.20945f, 0.03856f, 0.11622f, 0.02045f, 0.11929f, 0.02998f, 0.09440f, 0.01377f, 0.03464f, 0.01504f, 0.05255f, 0.01155f},
+
+ {0.16580f, 0.13751f, 0.11325f, 0.04185f, 0.02532f, 0.03687f, 0.01409f, 0.02426f, 0.08645f, 0.19578f, 0.05700f, 0.04195f, 0.01165f, 0.01992f, 0.01451f, 0.01377f},
+ {0.07257f, 0.06929f, 0.13042f, 0.01695f, 0.04693f, 0.25532f, 0.05943f, 0.02839f, 0.04079f, 0.06918f, 0.07649f, 0.01240f, 0.01420f, 0.06876f, 0.02373f, 0.01515f},
+ {0.17417f, 0.00021f, 0.00011f, 0.00001f, 0.46213f, 0.00021f, 0.00001f, 0.00021f, 0.28944f, 0.00064f, 0.00001f, 0.00001f, 0.07289f, 0.00001f, 0.00001f, 0.00001f},
+
+/* */ {0.00001f, 0.00001f, 0.00074f, 0.99788f, 0.00001f, 0.00001f, 0.00001f, 0.00106f, 0.00001f, 0.00001f, 0.00001f, 0.00011f, 0.00001f, 0.00001f, 0.00001f, 0.00021f},
+ {0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00074f, 0.00001f, 0.00001f, 0.99926f, 0.00001f},
+ {0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.20934f, 0.14769f, 0.51330f, 0.12893f, 0.00032f, 0.00001f, 0.00042f, 0.00001f},
+
+ {0.07416f, 0.04132f, 0.06346f, 0.03072f, 0.03252f, 0.05265f, 0.01843f, 0.04407f, 0.15447f, 0.23382f, 0.07946f, 0.04598f, 0.00795f, 0.07194f, 0.02182f, 0.02723f},
+// {0.04428f, 0.05859f, 0.11855f, 0.04767f, 0.06155f, 0.11622f, 0.12681f, 0.09514f, 0.02829f, 0.05922f, 0.06621f, 0.02945f, 0.01335f, 0.03157f, 0.08168f, 0.02140f},
+// {0.04015f, 0.02691f, 0.06166f, 0.01875f, 0.07850f, 0.06876f, 0.06314f, 0.05520f, 0.08910f, 0.09916f, 0.15319f, 0.05181f, 0.02712f, 0.04534f, 0.08020f, 0.04100f},
+// {0.07522f, 0.05785f, 0.06282f, 0.03899f, 0.05244f, 0.07151f, 0.04428f, 0.07194f, 0.10965f, 0.11601f, 0.08306f, 0.04948f, 0.01165f, 0.07416f, 0.03644f, 0.04450f},
+// {0.05149f, 0.06187f, 0.09959f, 0.03602f, 0.05996f, 0.11749f, 0.07787f, 0.06420f, 0.03094f, 0.08083f, 0.07342f, 0.04142f, 0.02214f, 0.06018f, 0.09185f, 0.03072f},
+// {0.04195f, 0.03380f, 0.06717f, 0.02161f, 0.08656f, 0.09164f, 0.07342f, 0.06876f, 0.07660f, 0.09810f, 0.12003f, 0.04799f, 0.02871f, 0.04460f, 0.06568f, 0.03337f},
+// {0.07819f, 0.05318f, 0.05710f, 0.04534f, 0.05668f, 0.08232f, 0.04471f, 0.08444f, 0.09948f, 0.10520f, 0.06590f, 0.05573f, 0.01685f, 0.06378f, 0.03697f, 0.05414f},
+ };
+
+
+ Motif gstartMotif=new MotifProbsN("Gene Starts MP1", grid, 6, 1);
+ Motif gstartATG=new MotifProbsN("ATG Gene Starts MP1", gridATG, 0, 1);
+
+ Motif gstartMotif2=new MotifProbsN("Gene Starts MP2", grid2, 8, 2);
+//
+// Motif estartMotif_ag=new MotifProbs(grid_ag, 9);
+// Motif estartMotif_ac=new MotifProbs(grid_ac, 9);
+// Motif estartMotif_atg=new MotifProbs(grid_atg, 9);
+// Motif estartMotif_nonagac=new MotifProbs(grid_nonagac, 9);
+//
+// Motif estartMotif2_ag=new MotifProbsN(grid2_ag, 10);
+// Motif estartMotif2_ac=new MotifProbsN(grid2_ac, 9);
+// Motif estartMotif2_nonagac=new MotifProbsN(grid2_nonagac, 10);
+
+// Motif estartMotif_multi=new MotifMulti(estartMotif_ag, estartMotif_ac, estartMotif_nonagac);
+// Motif estartMotif2_multi=new MotifMulti(estartMotif2_ag, estartMotif2_ac, estartMotif2_nonagac);
+
+ Motif m=gstartMotif2;
+
+
+ ArrayList<Integer> firstBeaten=new ArrayList<Integer>();
+
+ long count=0;
+ for(chrom=1; chrom<=maxChrom; chrom++){
+// count+=analyzeChromosomeGStarts(chrom, m, locations);
+// count+=analyzeChromosomeGStartsStronger(chrom, m, locations, firstBeaten);
+// count+=analyzeChromosomeGStartsStrongerInFrame(chrom, m, locations, firstBeaten, true, Gene.PLUS);
+ count+=analyzeChromosomeGStartsStrongerInFrame(chrom, m, locations, firstBeaten, true, Gene.MINUS);
+ Data.unload(chrom, true);
+ }
+
+ Collections.sort(locations);
+
+ int[] histogram=new int[CLEN+1];
+ int[] histogramBeaten=new int[CLEN+1];
+ for(Integer i : locations){
+ histogram[i]++;
+ }
+ for(Integer i : firstBeaten){
+ histogramBeaten[i]++;
+ }
+
+ System.out.println(count+" sites analyzed. ATG occurances:");
+ for(int i=0; i<histogram.length; i++){
+ System.out.println(i+"\t"+histogram[i]+(firstBeaten.size()==0 ? "" : "\t"+histogramBeaten[i]));
+ }
+
+ }
+
+
+ /**
+ * For each distinct gene start on one chromosome, scores the motif at positions start-CLEN through start
+ * and appends to list the offset of every position whose normalized score reaches THRESH.
+ * @return Number of distinct gene starts examined
+ */
+ public static long analyzeChromosomeGStarts(int chrom, Motif m, ArrayList<Integer> list, byte strand){
+ final GeneSet[] geneSets=Data.getGeneSets(chrom);
+ assert(strand==Gene.PLUS) : "TODO"; //Anything but the plus strand fails here when assertions are enabled
+ final ChromosomeArray chromArray=Data.getChromosome(chrom);
+
+ //Collect the distinct start coordinates of the gene sets on the requested strand.
+ final HashSet<Integer> startSet=new HashSet<Integer>();
+ for(GeneSet gs : geneSets){
+ if(gs.strand!=strand){continue;}
+ if(strand==Gene.PLUS){
+ startSet.add(gs.minStart);
+ }else{
+ startSet.add(chromArray.maxIndex-gs.maxEnd);
+ }
+ }
+ final ArrayList<Integer> sortedStarts=new ArrayList<Integer>(startSet);
+ Collections.sort(sortedStarts);
+
+ for(Integer geneStart : sortedStarts){
+ for(int offset=CLEN; offset>=0; offset--){ //Largest offset first
+ if(analyze(geneStart-offset, m, chromArray)>=THRESH){
+ list.add(offset);
+ }
+ }
+ }
+ return sortedStarts.size();
+ }
+
+
+ /**
+ * For each distinct gene start, scores the motif at the start itself and at every position up to CLEN
+ * before it. Each offset (0 included) whose normalized score is at least the score at the start is
+ * appended to list; the smallest such nonzero offset, if any, is also appended to listBeat.
+ * @param chrom Chromosome whose gene sets and sequence are fetched from Data
+ * @param m Motif to score
+ * @param list Receives every offset that ties or beats the gene start
+ * @param listBeat Receives, per gene start, the smallest nonzero offset that ties or beats it
+ * @param strand Strand to analyze; anything but Gene.PLUS fails an assertion
+ * @return Number of distinct gene starts examined
+ */
+ public static long analyzeChromosomeGStartsStronger(int chrom, Motif m, ArrayList<Integer> list, ArrayList<Integer> listBeat, byte strand){
+ final GeneSet[] geneSets=Data.getGeneSets(chrom);
+ assert(strand==Gene.PLUS) : "TODO";
+ final ChromosomeArray chromArray=Data.getChromosome(chrom);
+
+ //Distinct start coordinates of the gene sets on the requested strand
+ final HashSet<Integer> startSet=new HashSet<Integer>();
+ for(GeneSet gs : geneSets){
+ if(gs.strand!=strand){continue;}
+ if(strand==Gene.PLUS){
+ startSet.add(gs.minStart);
+ }else{
+ startSet.add(chromArray.maxIndex-gs.maxEnd);
+ }
+ }
+ final ArrayList<Integer> sortedStarts=new ArrayList<Integer>(startSet);
+ Collections.sort(sortedStarts);
+
+ for(Integer geneStart : sortedStarts){
+ final float baseline=analyze(geneStart, m, chromArray);
+ boolean beaten=false; //Set once the smallest competing offset has been recorded
+ for(int offset=0; offset<=CLEN; offset++){
+ final float score=analyze(geneStart-offset, m, chromArray);
+ if(score>=baseline){
+ if(offset>0 && !beaten){
+ beaten=true;
+ listBeat.add(offset);
+ }
+ list.add(offset);
+ }
+ }
+ }
+
+ return sortedStarts.size();
+ }
+
+
+ /**
+ * Variant of analyzeChromosomeGStartsStronger that scores only the offsets in one reading frame:
+ * offsets divisible by 3 when in is true, offsets with remainder 1 when in is false.
+ * With in false, offset 0 is never scored, so nothing is recorded for the gene start itself.
+ * NOTE(review): main in this file passes Gene.MINUS, which the assertion below rejects when
+ * assertions are enabled - confirm which strand is intended.
+ * @param chrom Chromosome whose gene sets and sequence are fetched from Data
+ * @param m Motif to score
+ * @param list Receives every scored offset that ties or beats the gene start
+ * @param listBeat Receives, per gene start, the smallest nonzero scored offset that ties or beats it
+ * @param in Selects the frame, as described above
+ * @param strand Strand to analyze; anything but Gene.PLUS fails an assertion
+ * @return Number of distinct gene starts examined
+ */
+ public static long analyzeChromosomeGStartsStrongerInFrame(int chrom, Motif m, ArrayList<Integer> list, ArrayList<Integer> listBeat, boolean in, byte strand){
+ final GeneSet[] geneSets=Data.getGeneSets(chrom);
+ assert(strand==Gene.PLUS) : "TODO";
+ final ChromosomeArray chromArray=Data.getChromosome(chrom);
+
+ //Distinct start coordinates of the gene sets on the requested strand
+ final HashSet<Integer> startSet=new HashSet<Integer>();
+ for(GeneSet gs : geneSets){
+ if(gs.strand!=strand){continue;}
+ if(strand==Gene.PLUS){
+ startSet.add(gs.minStart);
+ }else{
+ startSet.add(chromArray.maxIndex-gs.maxEnd);
+ }
+ }
+ final ArrayList<Integer> sortedStarts=new ArrayList<Integer>(startSet);
+ Collections.sort(sortedStarts);
+
+ final int frame=(in ? 0 : 1); //Remainder mod 3 that an offset must have to be scored
+
+ for(Integer geneStart : sortedStarts){
+ final float baseline=analyze(geneStart, m, chromArray);
+ boolean beaten=false; //Set once the smallest competing offset has been recorded
+ for(int offset=0; offset<=CLEN; offset++){
+ if(offset%3!=frame){continue;}
+ final float score=analyze(geneStart-offset, m, chromArray);
+ if(score>=baseline){
+ if(offset>0 && !beaten){
+ beaten=true;
+ listBeat.add(offset);
+ }
+ list.add(offset);
+ }
+ }
+ }
+
+ return sortedStarts.size();
+ }
+
+
+ /** Returns the strength of motif m matched against ca.array at point, passed through m.normalize. */
+ public static float analyze(int point, Motif m, ChromosomeArray ca){
+
+ final float rawStrength=m.matchStrength(ca.array, point);
+ return m.normalize(rawStrength);
+ }
+
+
+ /** Left-pads s with spaces to a width of len; a string already at least that long is returned unchanged. */
+ private static String padFront(String s, int len){
+ if(s.length()>=len){return s;}
+ StringBuilder sb=new StringBuilder(len); //One buffer instead of a new String per prepended space
+ for(int i=s.length(); i<len; i++){sb.append(' ');}
+ return sb.append(s).toString();
+ }
+
+ /** Exchanges the elements at indices x and y of a. */
+ public static void swap(long[] a, int x, int y){
+ final long atX=a[x], atY=a[y];
+ a[x]=atY; a[y]=atX;
+ }
+
+ /** Exchanges the elements at indices x and y of a. */
+ public static void swap(char[] a, int x, int y){
+ final char atX=a[x], atY=a[y];
+ a[x]=atY; a[y]=atX;
+ }
+
+
+ public static float THRESH=.2f;
+
+
+ public static long analyses=0;
+
+
+ /** Exon start */
+ public static final int ESTART=0;
+
+ /** Exon stop */
+ public static final int ESTOP=1;
+
+ /** Gene (and exon) start */
+ public static final int GSTART=2;
+
+ /** Gene (and exon) stop */
+ public static final int GSTOP=3;
+
+
+ /** Exon start using AG */
+ public static final int ESTARTAG=4;
+
+ /** Exon stop using GT */
+ public static final int ESTOPGT=5;
+
+
+ /** Exon start without AG */
+ public static final int ESTARTATG=6;
+
+ /** Exon stop without GT */
+ public static final int ESTOPNONGT=7;
+
+
+ /** Exon start without AG, AC, or ATG */
+ public static final int ESTARTNON=8;
+
+ /** Exon start using AC */
+ public static final int ESTARTAC=9;
+
+ private static final int CLEN=200;
+
+ public static ArrayList<Integer> locations=new ArrayList<Integer>();
+
+
+
+ private static final int min(int x, int y){return Math.min(x, y);} //Smaller of the two arguments
+ private static final int max(int x, int y){return Math.max(x, y);} //Larger of the two arguments
+
+}
diff --git a/current/driver/FixDumbFile.java b/current/driver/FixDumbFile.java
new file mode 100755
index 0000000..0c9b09a
--- /dev/null
+++ b/current/driver/FixDumbFile.java
@@ -0,0 +1,70 @@
+package driver;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.Set;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date May 15, 2014
+ *
+ */
+ public class FixDumbFile {
+
+ /**
+ * Pivots a tab-delimited file (args[0]) into one row per distinct first column and writes it to args[1].
+ * Lines starting with "library_name" are skipped. Rows are grouped by column 0 in first-seen order;
+ * the header takes column 2 of each row in the first group, and each output row holds columns 0 and 1
+ * of the group's first row followed by column 3 of every row in the group.
+ */
+ public static void main(String[] args){
+ String in=args[0];
+ String out=args[1];
+
+ TextFile tf=new TextFile(in);
+ LinkedHashMap<String, ArrayList<String[]>> map=new LinkedHashMap<String, ArrayList<String[]>>();
+ for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ if(!s.startsWith("library_name")){
+ String[] line=s.split("\t");
+ String key=line[0];
+ ArrayList<String[]> list=map.get(key);
+ if(list==null){
+ list=new ArrayList<String[]>();
+ map.put(key, list);
+ }
+ list.add(line);
+ }
+ }
+ tf.close();
+
+ StringBuilder sb=new StringBuilder();
+ sb.append("library_name\trun_date");
+ Set<String> keys=map.keySet();
+ if(!keys.isEmpty()){ //With no data rows there is no first group to take column names from
+ String key0=keys.iterator().next();
+ ArrayList<String[]> list0=map.get(key0);
+ for(String[] term : list0){
+ sb.append('\t').append(term[2]);
+ }
+ }
+ sb.append('\n');
+
+ for(String key : keys){
+ ArrayList<String[]> list=map.get(key);
+ String[] term0=list.get(0);
+ sb.append(term0[0]);
+ sb.append('\t').append(term0[1]);
+ for(String[] term : list){
+ sb.append('\t').append(term[3]);
+ }
+ sb.append('\n');
+ }
+
+ ReadWrite.writeString(sb, out);
+
+ }
+
+ }
diff --git a/current/driver/GenerateNoCallsFromCoverage.java b/current/driver/GenerateNoCallsFromCoverage.java
new file mode 100755
index 0000000..ca32117
--- /dev/null
+++ b/current/driver/GenerateNoCallsFromCoverage.java
@@ -0,0 +1,426 @@
+package driver;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import var.VarLine;
+import var.Variation;
+
+import dna.ChromosomeArray;
+import dna.CoverageArray2;
+import dna.Data;
+import dna.Gene;
+
+public class GenerateNoCallsFromCoverage {
+
+// @Deprecated
+// public static ArrayList<VarLine> generateOld(byte chrom, CoverageArray ca, int build, char gender){
+//
+// ArrayList<VarLine> lines=new ArrayList<VarLine>(256);
+//
+// assert(Data.GENOME_BUILD==build);
+// ChromosomeArray chra=Data.getChromosome(chrom);
+//
+// int start=-1;
+// int stop=-1;
+//
+// for(int i=chra.minIndex; i<chra.maxIndex; i++){
+// boolean nc=(ca.get(i)<minCovered);
+//
+// if(nc && start==-1){
+// start=i;
+// }
+//
+// if(!nc && start>-1){
+// stop=i-1;
+//
+// VarLine v1=new VarLine();
+// v1.ploidy=(chrom<=22 ? 2 : chrom>=24 ? 1 : (Byte)Variation.ploidyMap.get("?"));
+//
+// v1.haplotype=1;
+// v1.chromosome=chrom;
+// v1.beginLoc=start;
+// v1.endLoc=stop;
+//
+// v1.ref="=";
+// v1.call=null;
+//
+// v1.totalScore=-1;
+// v1.xRef=-2;
+// v1.xRefArray=null;
+// v1.hapLink=-1;
+// v1.varType=Variation.NOCALL;
+//
+// VarLine v2;
+// if((chrom==23 && gender=='M') || chrom==24 || chrom==25){
+// v2=null;
+// }else{
+// v2=new VarLine();
+// v2.ploidy=(chrom<=22 ? 2 : chrom>=24 ? 1 : (Byte)Variation.ploidyMap.get("?"));
+//
+// v2.haplotype=2;
+// v2.chromosome=chrom;
+// v2.beginLoc=start;
+// v2.endLoc=stop;
+//
+// v2.ref="=";
+// v2.call=null;
+//
+// v2.totalScore=-1;
+// v2.xRef=-2;
+// v2.xRefArray=null;
+// v2.hapLink=-1;
+// v2.varType=Variation.NOCALL;
+// }
+//
+//
+// start=-1;
+// stop=-1;
+// lines.add(v1);
+// if(v2!=null){lines.add(v2);}
+// }
+//
+//
+// }
+//
+// return lines;
+// }
+
+
+
+ /**
+ * Builds no-call VarLines for the stretches of a chromosome whose coverage is too low.
+ * Each scanned position gets a level: 0 when coverage is below minHalfCovered, 1 when it is below
+ * minCovered (diploid chromosomes only), otherwise 2. A run of level 0 yields a haplotype 1 line and a
+ * haplotype 2 line; a run of level 1 yields a haplotype 2 line only; a run of level 2 yields nothing.
+ * The run still open when the scan ends is emitted as well; it used to be silently dropped.
+ * NOTE(review): the scan stops one short of chra.maxIndex, as before - confirm that bound is exclusive.
+ * @param chrom Chromosome number; 23 is treated as haploid when gender is 'M', 24 and 25 always
+ * @param ca Coverage, read once per scanned position
+ * @param build Asserted to equal Data.GENOME_BUILD
+ * @param gender 'M' or 'F'; for chromosome 23 any other value takes the ploidy from Variation.ploidyMap
+ * @return The no-call lines in scan order
+ */
+ public static ArrayList<VarLine> generate(byte chrom, CoverageArray2 ca, int build, char gender){
+
+ assert(minCovered>=1);
+ assert(minHalfCovered>=1);
+ assert(minCovered>=minHalfCovered);
+
+ ArrayList<VarLine> lines=new ArrayList<VarLine>(256);
+
+ assert(Data.GENOME_BUILD==build);
+ ChromosomeArray chra=Data.getChromosome(chrom);
+
+ int start=-1;
+ byte level=-1; //-1 until the first position has been scanned
+
+ boolean haploid=(chrom==23 && gender=='M') || chrom==24 || chrom==25;
+
+ for(int i=chra.minIndex; i<chra.maxIndex; i++){
+
+ final byte newLevel;
+ final int cov=ca.get(i);
+
+ if(haploid){
+ if(cov<minHalfCovered){
+ newLevel=0;
+ }else{
+ newLevel=2;
+ }
+ }else{
+ if(cov<minHalfCovered){
+ newLevel=0;
+ }else if(cov<minCovered){
+ newLevel=1;
+ }else{
+ newLevel=2;
+ }
+ }
+
+ if(level==-1){
+ level=newLevel;
+ start=i;
+ }else if(level!=newLevel){ //The level changed; make VarLines
+ addNoCallLines(lines, chrom, gender, haploid, level, start, i-1);
+ level=newLevel;
+ start=i;
+ }
+ }
+
+ //Flush the run that reaches the end of the scan; without this a low-coverage tail produced no lines.
+ if(level!=-1){
+ addNoCallLines(lines, chrom, gender, haploid, level, start, chra.maxIndex-1);
+ }
+
+ return lines;
+ }
+
+ /** Appends the no-call lines for one run of constant level spanning start to stop, inclusive. */
+ private static void addNoCallLines(ArrayList<VarLine> lines, byte chrom, char gender, boolean haploid, byte level, int start, int stop){
+ if(level==0){
+ VarLine v1=newNoCallLine(chrom, gender, start, stop);
+ v1.haplotype=1;
+ lines.add(v1);
+ }
+ //NOTE(review): a haploid chromosome at level 0 also receives the haplotype 2 line, as before - confirm.
+ if(level==0 || (level==1 && !haploid)){
+ VarLine v2=newNoCallLine(chrom, gender, start, stop);
+ v2.haplotype=2;
+ lines.add(v2);
+ }
+ }
+
+ /** Creates a no-call VarLine covering start to stop; the caller assigns the haplotype. */
+ private static VarLine newNoCallLine(byte chrom, char gender, int start, int stop){
+ VarLine v=new VarLine();
+ v.ploidy=(chrom<=22 ? 2 : chrom>=24 ? 1 : gender=='M' ? 1 : gender=='F' ? 2 : (Byte)Variation.ploidyMap.get("?"));
+ v.chromosome=chrom;
+ v.beginLoc=start;
+ v.endLoc=stop;
+
+ v.ref="=";
+ v.call=null;
+ v.totalScore=-1;
+ v.hapLink=-1;
+ v.varType=Variation.NOCALL;
+ return v;
+ }
+
+
+ /**
+ * Splits the input with splitHaplotypes, passes each group through removeDuplicateNocallsHaplotyped,
+ * and returns the surviving lines of all groups as one sorted list.
+ * @param input Lines to clean; handed to splitHaplotypes
+ * @param copies Handed to splitHaplotypes unchanged
+ * @return A new list, ordered by Collections.sort
+ */
+ public static ArrayList<VarLine> removeDuplicateNocalls(List<VarLine> input, int copies){
+ final ArrayList<VarLine> merged=new ArrayList<VarLine>(256);
+
+ for(ArrayList<VarLine> group : splitHaplotypes(input, copies)){
+ final VarLine first=(group.isEmpty() ? null : group.get(0));
+ for(VarLine line : group){assert(line.haplotype==first.haplotype);} //Every line in a group must share one haplotype
+ //checkCopyCountHaplotyped could verify each cleaned group here, but it is very slow.
+ merged.addAll(removeDuplicateNocallsHaplotyped(group));
+ }
+
+ Collections.sort(merged);
+ return merged;
+ }
+
+ /**
+ * Verifies that no position is covered by more than one line of the list.
+ * On the first position found covered twice, fails an assertion (when enabled) and returns false.
+ * The list is scanned in order, so the position reported is the first overlap in list order.
+ * An empty list passes.
+ * @param list Lines to check; beginLoc and endLoc are treated as inclusive bounds
+ * @return true if no position is covered by two or more lines
+ */
+ public static boolean checkCopyCountHaplotyped(List<VarLine> list){
+
+ int max=0;
+ for(VarLine vl : list){
+ if(vl.endLoc>max){max=vl.endLoc;}
+ }
+
+ //One flag per position. The byte counters used previously wrapped past 127, so a position covered by
+ //many lines could pass the check; rejecting a position on its second hit cannot overflow.
+ //The separate no-call counter is gone because two no-calls on a position are also two lines on it.
+ boolean[] covered=new boolean[max+1];
+
+ for(VarLine vl : list){
+ for(int i=vl.beginLoc; i<=vl.endLoc; i++){
+ if(covered[i]){
+ assert(false) : "chr"+list.get(0).chromosome+", "+i;
+ return false;
+ }
+ covered[i]=true;
+ }
+ }
+
+ return true;
+ }
+
+
+ /** All elements of input should share haplotype.  Sorts input in place and returns a sorted list in which conflicting overlaps between neighboring lines have been trimmed, split or dropped. */
+ public static ArrayList<VarLine> removeDuplicateNocallsHaplotyped(ArrayList<VarLine> input){
+ // Makes one pass over the sorted lines, comparing each line (current) with the most recent line still pending (prev).
+ // If any conflict was found, the output of this pass is fed back into this method until a pass finds none.
+// System.err.println("B: input.size="+input.size());
+
+ Collections.sort(input);
+
+ ArrayList<VarLine> output=new ArrayList<VarLine>(256);
+ // Set when any conflicting pair is encountered; causes the recursive call at the end.
+ boolean needToReprocess=false;
+ // Line waiting to be written to output; it is only added after the following line has had a chance to modify it.
+ VarLine prev=null;
+
+ final boolean verbose=false;
+
+ for(int i=0; i<input.size(); i++){
+ VarLine current=input.get(i);
+
+ assert(current.endLoc>=current.beginLoc) : current;
+
+// final VarLine current2=current;
+ final VarLine prev2=prev;
+ // prev2 remembers prev as it was before any trimming below.
+// if(current.chromosome==2 && (current.touches(8890433) || (prev!=null && prev.touches(8890433)))){
+// verbose=true;
+// System.err.println("current="+current);
+// System.err.println("touches? "+current.touches(8890433));
+// System.err.println("intersects? "+current.intersects(8890433));
+// }else if(prev==null && verbose){
+// System.err.println("current="+current);
+// System.err.println("touches? "+current.touches(8890433));
+// System.err.println("intersects? "+current.intersects(8890433));
+// }else{
+// verbose=false;
+// }
+ // An intersection is a conflict unless one line isPoint() and its beginLoc equals the other line's beginLoc or endLoc.
+ boolean problem=prev!=null && prev.intersects(current);
+ if(problem){
+ if(prev.isPoint() && (current.endLoc==prev.beginLoc || current.beginLoc==prev.beginLoc)){
+ problem=false;
+ }
+ if(current.isPoint() && (prev.endLoc==current.beginLoc || prev.beginLoc==current.beginLoc)){
+ problem=false;
+ }
+ }
+
+ if(problem){
+ boolean ncc=current.isNoCall();
+ boolean ncp=prev.isNoCall();
+ boolean refc=current.isRef();
+ boolean refp=prev.isRef();
+ boolean varc=current.isTrueVariation();
+ boolean varp=prev.isTrueVariation();
+ if(!needToReprocess){
+// System.err.println("\nNeed to reprocess because:");
+// System.err.println("\n"+prev);
+// System.err.println("\n"+current);
+ }
+ needToReprocess=true;
+ // Case 1: both no-call, both ref, or ref after no-call: a clone of current is made to start just after prev (dropped if nothing remains).
+ if((ncc && ncp) || (refc && refp) || (refc && ncp)/* || (refc && varp) || (ncp && varp)*/){ //Un-intersect them
+ current=current.clone();
+ {
+ current.ref="=";
+ if(refc){current.call="=";}
+ else if(ncc){current.call=null;}
+ }
+ current.beginLoc=prev.endLoc+1;
+ if(current.beginLoc>current.endLoc){current=null;}
+ else{
+ assert(!prev.intersects(current)
+ || (prev.isPoint() && (current.endLoc==prev.beginLoc || current.beginLoc==prev.beginLoc))
+ || (current.isPoint() && (prev.endLoc==current.beginLoc || prev.beginLoc==current.beginLoc))) :
+ refp+", "+ncp+", "+refc+", "+ncc+"\n"+prev+"\n"+current;
+ }
+ }else if(ncc || refc){ //Case 2: remaining cases where current is no-call/ref: trim current's start; a point prev may keep sharing its coordinate
+ current=current.clone();
+ {
+ current.ref="=";
+ if(refc){current.call="=";}
+ else if(ncc){current.call=null;}
+ }
+ current.beginLoc=prev.endLoc+(prev.isPoint() ? 0 : 1);
+ if(current.beginLoc>current.endLoc){current=null;}
+ else{
+ assert(!prev.intersects(current)
+ || (prev.isPoint() && (current.endLoc==prev.beginLoc || current.beginLoc==prev.beginLoc))
+ || (current.isPoint() && (prev.endLoc==current.beginLoc || prev.beginLoc==current.beginLoc))) :
+ refp+", "+ncp+", "+refc+", "+ncc+"\n"+prev+"\n"+current;
+ }
+ }else if(ncp || refp){ //Case 3: only prev is no-call/ref: a clone of prev is cut off just before current
+ prev=prev.clone();
+ {
+ prev.ref="=";
+ if(refp){prev.call="=";}
+ else if(ncp){prev.call=null;}
+ }
+ prev.endLoc=current.beginLoc-1;
+ if(prev.beginLoc>prev.endLoc){prev=null;}
+ else{
+ assert(!prev.intersects(current) ||
+ (prev.isNoCall() && prev.lengthRef()==1 && current.isPoint()) //Corner case for intersection
+ ) : "\n"+prev+"\n\n"+current+"\n";
+ }
+ // If the untrimmed prev reached past current, the part after current is added to output as its own line (temp).
+ if(prev2.endLoc>current.endLoc || (prev2.endLoc==current.endLoc && current.isPoint())){
+ VarLine temp=prev2.clone();
+ {
+ temp.ref="=";
+ if(temp.isRef()){temp.call="=";}
+ else if(temp.isNoCall()){temp.call=null;}
+ }
+ temp.beginLoc=current.endLoc+(current.isPoint() ? 0 : 1);
+ if(temp.beginLoc<=temp.endLoc){
+
+ assert(prev==null || !temp.intersects(prev));
+ assert(!temp.intersects(current)
+ || (temp.isPoint() && (current.endLoc==temp.beginLoc || current.beginLoc==temp.beginLoc))
+ || (current.isPoint() && (temp.endLoc==current.beginLoc || temp.beginLoc==current.beginLoc))) :
+ refp+", "+ncp+", "+refc+", "+ncc+"\n"+temp+"\n"+current;
+
+ if(verbose){System.err.println("Current="+current+"\nprev="+prev+"\nAdding "+temp+"\n");}
+
+ output.add(temp);
+// needToReprocess=true;
+ }
+ }
+ }else{ //Case 4: neither line is no-call/ref: current is discarded with a warning
+ System.out.println("Warning: Deleted variation due to conflict! \n"+prev+"\n"+current+"\n");
+// assert(false) : "\n"+prev+"\n"+current+"\n";
+ current=null;
+ }
+ }
+ // Write out prev (possibly trimmed; skipped if it was eliminated) and make current the new pending line.
+ if(prev!=null){
+ if(verbose){System.err.println("Current="+current+"\nAdding "+prev+"\n");}
+ output.add(prev);
+ }
+ prev=current;
+ }
+ if(prev!=null){output.add(prev);}
+
+ if(needToReprocess){return removeDuplicateNocallsHaplotyped(output);}
+ // Reached only when this pass found no conflicts.
+ Collections.sort(output);
+ return output;
+ }
+
+
+ /** Distributes input into two lists: [0] gets haplotype 1, [1] gets haplotype 2; a haplotype-3 line is split across both when copies>1, else kept whole in [0]. */
+ public static ArrayList<VarLine>[] splitHaplotypes(List<VarLine> input, int copies){
+     final ArrayList<VarLine>[] buckets=new ArrayList[2];
+     buckets[0]=new ArrayList<VarLine>();
+     buckets[1]=new ArrayList<VarLine>();
+     final boolean splitShared=(copies>1);
+
+     for(VarLine line : input){
+         if(line.haplotype==1){buckets[0].add(line); continue;}
+         if(line.haplotype==2){buckets[1].add(line); continue;}
+
+         //Anything else is expected to carry haplotype 3.
+         assert(line.haplotype==3);
+         if(!splitShared){
+             buckets[0].add(line);
+             continue;
+         }
+         final VarLine[] halves=line.splitLine();
+         buckets[0].add(halves[0]);
+         buckets[1].add(halves[1]);
+     }
+
+     return buckets;
+ }
+
+
+ public static int minCovered=2; //In the three-way coverage test earlier in this class: cov>=minCovered gives level 2, cov in [minHalfCovered, minCovered) gives level 1
+ public static int minHalfCovered=1; //In the same test: cov<minHalfCovered gives level 0
+
+}
diff --git a/current/driver/GetSequence.java b/current/driver/GetSequence.java
new file mode 100755
index 0000000..d5b73c2
--- /dev/null
+++ b/current/driver/GetSequence.java
@@ -0,0 +1,101 @@
+package driver;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+import dna.Range;
+
+public class GetSequence {
+
+ /**
+  * Prints the bases returned by ChromosomeArray.getString, and their reverse complement, for each coordinate range
+  * on the command line, then AminoAcid.stringToAAs of the concatenated ranges and of their reverse complement.
+  * Arguments before the first range may be "build=N", a name starting with 'b' (passed to Gene.toBuild), "chr...", or a strand starting with '+' or '-'.
+  * The first argument starting with a digit, '[' or '(' and all arguments after it are parsed as ranges.
+  * @throws IllegalArgumentException if no range argument is present
+  */
+ public static void main(String[] args){
+     int chrom=-1;
+     byte strand=Gene.PLUS;
+
+     /** Change base to zero or one for the coordinates mode */
+     int base=0;
+
+     int firstLoc=-1;
+     for(int i=0; i<args.length && firstLoc<0; i++){
+         final String arg=args[i];
+         //An empty argument has no first character; use a value that falls through to the "Bad parameter" branch.
+         final char x=(arg.length()==0 ? '\0' : arg.charAt(0));
+         if(arg.startsWith("build")){
+             final String[] split=arg.split("=");
+             Data.setGenome(Integer.parseInt(split[1]));
+         }else if(Character.isDigit(x) || x=='[' || x=='('){
+             firstLoc=i;
+         }else if(arg.startsWith("chr")){
+             chrom=Gene.toChromosome(arg);
+         }else if(x=='b'){
+             Data.setGenome(Gene.toBuild(arg));
+         }else if(x=='+' || x=='-'){
+             strand=Gene.toStrand(arg);
+         }else{
+             assert(false) : "Bad parameter: "+arg;
+         }
+     }
+
+     //Without this check a missing range left firstLoc at -1 and the loops below indexed args[-1].
+     if(firstLoc<0){
+         throw new IllegalArgumentException("No coordinate range argument found (expected one starting with a digit, '[' or '(').");
+     }
+
+     assert(strand==Gene.PLUS) : "TODO";
+     ChromosomeArray cha=Data.getChromosome(chrom);
+
+     //Indexed like args, so array[i] holds the sequence for the range in args[i]; slots before firstLoc stay null.
+     String[] array=new String[args.length];
+
+     for(int i=firstLoc; i<args.length; i++){
+         //Strip brackets, parentheses and commas before handing the text to Range.toRange.
+         args[i]=args[i].replace("[","").replace("]","").replace("(","").replace(")","").replace(",","").trim();
+
+         Range r=Range.toRange(args[i]);
+         array[i]=cha.getString((int)r.a-base, (int)r.b-base);
+     }
+
+     //StringBuilder avoids re-copying the accumulated sequence for every range.
+     StringBuilder combined=new StringBuilder();
+
+     System.out.println("Chrom Bounds: "+cha.minIndex+"-"+cha.maxIndex+" ("+cha.array.length+")");
+
+     for(int i=firstLoc; i<array.length; i++){
+         System.out.println("\nchr"+chrom+" ("+args[i]+") = \n"+array[i]);
+         combined.append(array[i]);
+         String s=AminoAcid.reverseComplementBases(array[i]);
+         System.out.println("\n"+s+" (rcomp)");
+     }
+
+     final String all=combined.toString();
+     System.out.println("\nAAs:\n"+AminoAcid.stringToAAs(all));
+     System.out.println("\nAAs (reverse comp):\n"+AminoAcid.stringToAAs(AminoAcid.reverseComplementBases(all)));
+ }
+
+ /** Returns the byte at index a of chromosome chrom, as given by ChromosomeArray.get. */
+ public static byte get(int chrom, int a){
+     return Data.getChromosome(chrom).get(a);
+ }
+
+ /** Same as get(chrom, a, b, Gene.PLUS). */
+ public static String get(int chrom, int a, int b){
+     return get(chrom, a, b, Gene.PLUS);
+ }
+ /** Returns ChromosomeArray.getString(a, b) for chromosome chrom; any strand other than Gene.PLUS trips an assertion when assertions are enabled. */
+ public static String get(int chrom, int a, int b, byte strand){
+     assert(Gene.PLUS==strand) : "TODO";
+     return Data.getChromosome(chrom).getString(a, b);
+ }
+
+
+
+
+
+}
diff --git a/current/driver/Grep.java b/current/driver/Grep.java
new file mode 100755
index 0000000..e0ec69b
--- /dev/null
+++ b/current/driver/Grep.java
@@ -0,0 +1,20 @@
+package driver;
+
+import fileIO.TextFile;
+
+public class Grep {
+    /** Prints every line of the file named by args[0] that contains args[1] as a literal substring. */
+    public static void main(String[] args){
+        if(args.length<2){throw new IllegalArgumentException("Usage: Grep <file> <substring>");}
+        //The two boolean flags are passed through unchanged; their meaning is defined by fileIO.TextFile.
+        final TextFile tf=new TextFile(args[0], true, true);
+        try{
+            for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+                if(s.contains(args[1])){System.out.println(s);}
+            }
+        }finally{
+            tf.close(); //Release the file even if reading or printing fails
+        }
+    }
+
+}
diff --git a/current/driver/LineCount.java b/current/driver/LineCount.java
new file mode 100755
index 0000000..aca1241
--- /dev/null
+++ b/current/driver/LineCount.java
@@ -0,0 +1,16 @@
+package driver;
+
+import fileIO.TextFile;
+
+public class LineCount {
+    /** Prints the result of TextFile.countLines() for the file named by args[0]. */
+    public static void main(String[] args){
+        if(args.length<1){throw new IllegalArgumentException("Usage: LineCount <file>");}
+        final TextFile tf=new TextFile(args[0], false, false);
+        long lines;
+        try{lines=tf.countLines();}
+        finally{tf.close();} //Release the file even if counting fails
+        System.out.println(args[0]+" has "+lines+" non-blank lines.");
+    }
+
+}
diff --git a/current/driver/LookAtID.java b/current/driver/LookAtID.java
new file mode 100755
index 0000000..d01e509
--- /dev/null
+++ b/current/driver/LookAtID.java
@@ -0,0 +1,48 @@
+package driver;
+
+import java.util.Arrays;
+
+import stream.SiteScoreR;
+
+import align2.Tools;
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Dec 3, 2012
+ *
+ */
+public class LookAtID {
+    /** Scans the file named by args[0] for SiteScoreR entries whose numericID reaches Integer.MAX_VALUE, then prints the largest ID seen. */
+    public static void main(String[] args){
+        if(args.length<1){throw new IllegalArgumentException("Usage: LookAtID <file>");}
+
+        final TextFile tf=new TextFile(args[0], true, false);
+        long max=0;
+        long line=0; //Zero-based index of the line being examined
+
+        try{
+            for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+                final SiteScoreR[] array=SiteScoreR.fromTextArray(s);
+                final String[] split=s.split("\t");
+                for(int i=0; i<array.length; i++){
+                    final SiteScoreR ssr=array[i];
+                    max=Tools.max(ssr.numericID, max);
+                    if(ssr.numericID>=Integer.MAX_VALUE){
+                        System.out.println("Found overflow ID "+ssr.numericID+" at line "+line);
+                        System.out.println("ssr="+ssr.toText());
+                        System.out.println("raw="+split[i]); //Raw field is looked up only when it is actually reported
+                        System.out.println("All:\n"+Arrays.toString(split));
+                        System.out.println();
+                        break; //Leaves the inner loop only: at most one report per line
+                    }
+                }
+                line++;
+            }
+        }finally{
+            tf.close(); //Release the file even if parsing throws
+        }
+        System.out.println("Max ID was "+max);
+    }
+
+}
diff --git a/current/driver/MakeTestScript.java b/current/driver/MakeTestScript.java
new file mode 100755
index 0000000..140b54f
--- /dev/null
+++ b/current/driver/MakeTestScript.java
@@ -0,0 +1,368 @@
+package driver;
+
+public class MakeTestScript {
+
+
+ /**
+  * Prints, to stdout, the command lines for running one aligner over a series of read files: four preliminary
+  * blocks with a small read count (100, or 1000 for masai) and all-zero counts, then one block per row of sets using numReads.
+  * @param args numReads, readlen, mode (bwa, bwamem, bwasw, bbmap[K], bowtie2, gsnap, smalt, snap, masai or blasr) and optional extra text
+  * @throws IllegalArgumentException if fewer than three arguments are given or the mode is not recognised
+  */
+ public static void main(String[] args){
+     if(args.length<3){throw new IllegalArgumentException("Usage: MakeTestScript <numReads> <readlen> <mode> [extra]");} //args[1] and args[2] are read unconditionally
+     numReads=Integer.parseInt(args[0]);
+     readlen=Integer.parseInt(args[1]);
+
+     String mode=args[2];
+     String extra=(args.length>3 ? args[3] : "EXTRA");
+
+     String printtime="java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime ";
+     String gradesam="java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.GradeSamFile ";
+     String time=mode+"Time.txt";
+
+     String[] strings=null;
+//    strings=new String[] {
+//            "/house/homedirs/b/bushnell/ssaha2/ssaha2 -solexa -outfile #S.sam -best -1 -output sam_soft -save hg37 " +
+//            "reads_B1_#Rx#Lbp_#S.fastq",
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime defaultTime.txt",
+//            gradesam+"#S.sam #R ssaha2",
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime defaultTime.txt"
+//    };
+
+     if(mode.equalsIgnoreCase("bwa")){
+         strings=new String[] {
+//                printtime+time+" false",
+//                "memtime /house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>temp_bwa.sai",
+//                "memtime /house/homedirs/b/bushnell/bwa/bwa samse "+extra+" temp_bwa.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam",
+//                printtime+time,
+//                gradesam+"bwa_#S_r#Rx#L.sam #R",
+
+                 printtime+time+" false",
+                 "/house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sai",
+                 "/house/homedirs/b/bushnell/bwa/bwa samse "+extra+" bwa_#S_r#Rx#L.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam",
+                 printtime+time,
+                 gradesam+"bwa_#S_r#Rx#L.sam #R",
+         };
+     }
+
+     if(mode.equalsIgnoreCase("bwamem")){
+         strings=new String[] {
+//                printtime+time+" false",
+//                "memtime /house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>temp_bwa.sai",
+//                "memtime /house/homedirs/b/bushnell/bwa/bwa samse "+extra+" temp_bwa.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam",
+//                printtime+time,
+//                gradesam+"bwa_#S_r#Rx#L.sam #R",
+
+                 printtime+time+" false",
+                 "/house/homedirs/b/bushnell/bwa74/bwa mem -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>bwamem_#S_r#Rx#L.sam",
+                 printtime+time,
+                 gradesam+"bwamem_#S_r#Rx#L.sam #R",
+         };
+     }
+
+     if(mode.equalsIgnoreCase("bwasw")){
+         strings=new String[] {
+//                printtime+time+" false",
+//                "memtime /house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>temp_bwa.sai",
+//                "memtime /house/homedirs/b/bushnell/bwa/bwa samse "+extra+" temp_bwa.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam",
+//                printtime+time,
+//                gradesam+"bwa_#S_r#Rx#L.sam #R",
+
+                 printtime+time+" false",
+                 "/house/homedirs/b/bushnell/bwa/bwa bwasw -b5 -q2 -r1 -z10 -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fasta 1>bwa_#S_r#Rx#L.sam",
+                 printtime+time,
+                 gradesam+"bwa_#S_r#Rx#L.sam #R",
+         };
+     }
+
+     if(mode.startsWith("bbmap")){
+         int k=13;
+         String s2=mode.replaceFirst("bbmap", "");
+         if(s2.length()>0){
+             k=Integer.parseInt(s2);
+         }
+         strings=new String[] {
+                 printtime+time+" false",
+                 "memtime java -ea -Xmx106g -cp /house/homedirs/b/bushnell/beta18/ " +
+                 "align2.BBMap in=reads_B1_#Rx#Lbp_#S.fastq out=bbmap"+k+"_#S_r#Rx#L.sam overwrite k="+k+" printtoerr",
+                 printtime+time,
+                 gradesam+"bbmap"+k+"_#S_r#Rx#L.sam #R",
+         };
+     }
+
+     if(mode.equalsIgnoreCase("bowtie2")){
+         strings=new String[] {
+                 printtime+time+" false",
+                 "memtime bowtie2 -x bow2ref -U reads_B1_#Rx#Lbp_#S.fastq -S bowtie2_#S_r#Rx#L.sam --phred33 -p 32",
+                 printtime+time,
+                 gradesam+"bowtie2_#S_r#Rx#L.sam #R",
+         };
+     }
+
+     if(mode.equalsIgnoreCase("gsnap")){
+         strings=new String[] {
+                 printtime+time+" false",
+                 "memtime /house/homedirs/b/bushnell/gsnap/bin/gsnap -t 32 -d "+extra+" -A sam reads_B1_#Rx#Lbp_#S.fastq > gsnap_#S_r#Rx#L.sam",
+                 printtime+time,
+                 gradesam+"gsnap_#S_r#Rx#L.sam #R",
+         };
+     }
+
+//    strings=new String[] {
+//            "bowtie --best -y --chunkmbs 1024 --strata -m 1 -k 2 -v 3 -p 24 -t -q -S HG37" +
+//            " reads_B1_#Rx#Lbp_#S.fastq #S_bowtie.sam",
+//
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bowtieTime.txt",
+//            gradesam+"#S_bowtie.sam #R",
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bowtieTime.txt"
+//    };
+
+//    strings=new String[] {
+//            "bfast match -T $TMPDIR/ -n 16 -f hg19.fa -r reads_B1_#Rx#Lbp_#S.fastq > $TMPDIR/#S.bmf",
+//            "bfast localalign -n 16 -f hg19.fa -m $TMPDIR/#S.bmf > $TMPDIR/#S.baf",
+////            "bfast postprocess -n 16 -a 3 -f hg19.fa -i $TMPDIR/#S.baf > #S.sam",
+////            "bfast postprocess -n 16 -a 3 -m 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#Rx#L.sam",
+//            "bfast postprocess -n 16 -M 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#Rx#L.sam",
+//
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bfastTime.txt",
+//            gradesam+"#S_r#Rx#L.sam #R",
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bfastTime.txt"
+//    };
+
+     if(mode.equalsIgnoreCase("smalt")){
+         strings=new String[] {
+                 printtime+time+" false",
+                 "memtime /house/homedirs/b/bushnell/smalt/smalt_x86_64 map -n 32 -f sam -o smalt_#S_r#Rx#L.sam smaltindex reads_B1_#Rx#Lbp_#S.fastq",
+                 printtime+time,
+                 gradesam+"smalt_#S_r#Rx#L.sam #R ssaha2",
+         };
+     }
+
+     if(mode.equalsIgnoreCase("snap")){
+         strings=new String[] {
+                 printtime+time+" false",
+                 "memtime /house/homedirs/b/bushnell/snap/snap single snapref reads_B1_#Rx#Lbp_#S.fastq -o snap_#S_r#Rx#L.sam -t 32 -b",
+                 printtime+time,
+                 gradesam+"snap_#S_r#Rx#L.sam #R",
+         };
+     }
+
+     if(mode.equalsIgnoreCase("masai")){
+         strings=new String[] {
+                 printtime+time+" false",
+                 "memtime /house/homedirs/b/bushnell/masai/masai_mapper --output-format sam "+extra+" reads_B1_#Rx#Lbp_#S.fastq",
+                 printtime+time,
+                 gradesam+"reads_B1_#Rx#Lbp_#S.sam #R",
+         };
+     }
+
+     if(mode.equalsIgnoreCase("blasr")){
+         System.out.println("source /house/sdm/pacbio/smrtanalysis-installs/smrtanalysis-2.0.0/etc/setup.sh\n");
+         strings=new String[] {
+                 printtime+time+" false",
+                 "memtime blasr reads_B1_#Rx#Lbp_#S.fastq "+extra+" -sam -out blasr_#S_r#Rx#L.sam -bestn 1 -nproc 32",
+                 printtime+time,
+                 gradesam+"blasr_#S_r#Rx#L.sam #R blasr",
+         };
+     }
+
+//    strings=new String[] {
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt",
+//            "./soap -p 24 -a reads_B1_#Rx#Lbp_#S.fastq -D hg37.fa.index -o #S_r#Rx#L.soap",
+//            "perl soap2sam.pl -p #S_r#Rx#L.soap > #S_r#Rx#L.sam",
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt",
+//            gradesam+"#S_r#Rx#L.sam #R",
+//    };
+
+//    strings=new String[] {
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt",
+//            "./bin/gmapper-ls reads_B1_#Rx#Lbp_#S.fastq --single-best-mapping --qv-offset 33 -L hg37 -N 24 -o 5 -h 80% > #S_r#Rx#L.sam",
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt",
+//            gradesam+"#S_r#Rx#L.sam #R",
+//    };
+
+//    strings=new String[] {
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt",
+//            "./bin/MosaikBuild -q reads_B1_#Rx#Lbp_#S.fastq -out $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25.dat -st illumina",
+//            "./bin/MosaikAligner -in $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25.dat -out $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25_aligned.dat -ia hg37_ref.dat -hs 15 -bw=29 -j hg37_jumpdb -act 20 -mm 32 -mhp 100 -p 32 -m unique",
+//            "./bin/MosaikText -in $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25_aligned.dat -sam #S_r#Rx#L.sam",
+//            "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt",
+//            gradesam+"#S_r#Rx#L.sam #R",
+//    };
+     if(strings==null){throw new IllegalArgumentException("Unknown mode: "+mode);} //No branch above matched; print() would fail on a null array
+     int[] blank=new int[] {0, 0, 0, 0, 0};
+     int preload=100;
+     if(mode.equalsIgnoreCase("masai")){
+         preload=1000;
+     }
+     print(strings, blank, preload);
+     print(strings, blank, preload);
+     print(strings, blank, preload);
+     print(strings, blank, preload);
+     for(int[] array : sets){
+         print(strings, array, numReads);
+     }
+ }
+
+ private static void print(String[] array, int[] blank, int x) {
+
+ int rl=readlen;
+ if(blank.length>5){rl=blank[5];}
+
+ String counts=(blank[0]+"S_"+blank[1]+"I_"+blank[2]+"D_"+blank[3]+"U_"+blank[4]+"N");
+ String reads=""+x;
+ String len=""+rl;
+
+ for(String s : array){
+ String s2=s.replaceAll("#S", counts).replaceAll("#R", reads).replaceAll("#L", len);
+ System.out.println(s2);
+ }
+ System.out.println();
+
+ }
+
+ public static int numReads=400000; //Default read count; main overwrites it from args[0]
+ public static int readlen=150; //Default read length; main overwrites it from args[1]
+ // One row per generated command block.  print() turns elements 0-4 into the "#S" tag (nS_nI_nD_nU_nN); an optional sixth element replaces readlen.
+ public static final int[][] sets=new int[][] { //First group: rows varying only the first ("S") count
+ {0, 0, 0, 0, 0},
+ {1, 0, 0, 0, 0},
+ {2, 0, 0, 0, 0},
+ {3, 0, 0, 0, 0},
+ {4, 0, 0, 0, 0},
+ {5, 0, 0, 0, 0},
+ {6, 0, 0, 0, 0},
+ {7, 0, 0, 0, 0},
+ {8, 0, 0, 0, 0},
+ {10, 0, 0, 0, 0},
+ {12, 0, 0, 0, 0},
+ {14, 0, 0, 0, 0},
+ {16, 0, 0, 0, 0},
+ {18, 0, 0, 0, 0},
+ {20, 0, 0, 0, 0},
+ {24, 0, 0, 0, 0},
+ {28, 0, 0, 0, 0},
+ {32, 0, 0, 0, 0},
+ {36, 0, 0, 0, 0},
+ {40, 0, 0, 0, 0},
+ // Rows varying only the second ("I") count
+ {0, 1, 0, 0, 0},
+ {0, 2, 0, 0, 0},
+ {0, 3, 0, 0, 0},
+ {0, 4, 0, 0, 0},
+ {0, 5, 0, 0, 0},
+ {0, 6, 0, 0, 0},
+ {0, 7, 0, 0, 0},
+ {0, 8, 0, 0, 0},
+ {0, 10, 0, 0, 0},
+ {0, 12, 0, 0, 0},
+ {0, 14, 0, 0, 0},
+ {0, 16, 0, 0, 0},
+ {0, 18, 0, 0, 0},
+ {0, 20, 0, 0, 0},
+ {0, 24, 0, 0, 0},
+ {0, 28, 0, 0, 0},
+ {0, 32, 0, 0, 0},
+ {0, 36, 0, 0, 0},
+ {0, 40, 0, 0, 0},
+ // Rows varying only the third ("D") count
+ {0, 0, 1, 0, 0},
+ {0, 0, 2, 0, 0},
+ {0, 0, 3, 0, 0},
+ {0, 0, 4, 0, 0},
+ {0, 0, 5, 0, 0},
+ {0, 0, 6, 0, 0},
+ {0, 0, 7, 0, 0},
+ {0, 0, 8, 0, 0},
+ {0, 0, 10, 0, 0},
+ {0, 0, 12, 0, 0},
+ {0, 0, 14, 0, 0},
+ {0, 0, 16, 0, 0},
+ {0, 0, 18, 0, 0},
+ {0, 0, 20, 0, 0},
+ {0, 0, 24, 0, 0},
+ {0, 0, 28, 0, 0},
+ {0, 0, 32, 0, 0},
+ {0, 0, 36, 0, 0},
+ {0, 0, 40, 0, 0},
+ {0, 0, 48, 0, 0},
+ {0, 0, 56, 0, 0},
+ {0, 0, 64, 0, 0},
+ {0, 0, 96, 0, 0},
+ {0, 0, 128, 0, 0},
+ {0, 0, 192, 0, 0},
+ {0, 0, 256, 0, 0},
+ {0, 0, 384, 0, 0},
+ {0, 0, 512, 0, 0},
+ {0, 0, 768, 0, 0},
+ {0, 0, 1000, 0, 0},
+ {0, 0, 1500, 0, 0},
+ {0, 0, 2000, 0, 0},
+ {0, 0, 3000, 0, 0},
+ {0, 0, 4000, 0, 0},
+ {0, 0, 6000, 0, 0},
+ {0, 0, 8000, 0, 0},
+ {0, 0, 12000, 0, 0},
+ {0, 0, 16000, 0, 0},
+ {0, 0, 24000, 0, 0},
+ {0, 0, 32000, 0, 0},
+ {0, 0, 48000, 0, 0},
+ {0, 0, 64000, 0, 0},
+ {0, 0, 96000, 0, 0},
+ {0, 0, 128000, 0, 0},
+ // Rows varying only the fourth ("U") count
+ {0, 0, 0, 1, 0},
+ {0, 0, 0, 2, 0},
+ {0, 0, 0, 3, 0},
+ {0, 0, 0, 4, 0},
+ {0, 0, 0, 5, 0},
+ {0, 0, 0, 6, 0},
+ {0, 0, 0, 7, 0},
+ {0, 0, 0, 8, 0},
+ {0, 0, 0, 10, 0},
+ {0, 0, 0, 12, 0},
+ {0, 0, 0, 14, 0},
+ {0, 0, 0, 16, 0},
+ {0, 0, 0, 18, 0},
+ {0, 0, 0, 20, 0},
+ {0, 0, 0, 24, 0},
+ {0, 0, 0, 28, 0},
+ {0, 0, 0, 32, 0},
+ {0, 0, 0, 36, 0},
+ {0, 0, 0, 40, 0},
+ // Rows varying only the fifth ("N") count
+ {0, 0, 0, 0, 1},
+ {0, 0, 0, 0, 2},
+ {0, 0, 0, 0, 3},
+ {0, 0, 0, 0, 4},
+ {0, 0, 0, 0, 5},
+ {0, 0, 0, 0, 6},
+ {0, 0, 0, 0, 7},
+ {0, 0, 0, 0, 8},
+ {0, 0, 0, 0, 10},
+ {0, 0, 0, 0, 12},
+ {0, 0, 0, 0, 14},
+ {0, 0, 0, 0, 16},
+ {0, 0, 0, 0, 18},
+ {0, 0, 0, 0, 20},
+ {0, 0, 0, 0, 24},
+ {0, 0, 0, 0, 28},
+ {0, 0, 0, 0, 32},
+ {0, 0, 0, 0, 36},
+ {0, 0, 0, 0, 40},
+ // Six-element rows: mixed counts with a read-length override of 400
+ {0, 0, 0, 0, 0, 400},
+ {2, 2, 2, 2, 0, 400},
+ {4, 2, 2, 2, 0, 400},
+ {6, 3, 3, 3, 0, 400},
+ {8, 4, 4, 4, 0, 400},
+ {10, 4, 4, 4, 0, 400},
+ {12, 4, 4, 4, 0, 400},
+ {14, 4, 4, 4, 0, 400},
+ {16, 4, 4, 4, 0, 400},
+ {18, 4, 4, 4, 0, 400},
+ {20, 5, 5, 5, 0, 400},
+ };
+
+}
diff --git a/current/driver/MakeTestScriptScoreOnly.java b/current/driver/MakeTestScriptScoreOnly.java
new file mode 100755
index 0000000..b83201d
--- /dev/null
+++ b/current/driver/MakeTestScriptScoreOnly.java
@@ -0,0 +1,210 @@
+package driver;
+
+public class MakeTestScriptScoreOnly {
+
+
+ /**
+  * Prints the gmapper-ls command blocks: four blocks with a read count of 100 and an all-zero count row,
+  * then one block per row of sets using numReads.  Command lines for other aligners are kept below as comments.
+  * @param args Exactly one element: the number of reads (stored in numReads)
+  */
+ public static void main(String[] args){
+     assert(args.length==1) : "Please enter number of reads.";
+     numReads=Integer.parseInt(args[0]);
+
+     //Inactive template sets, kept for reference.
+//    String[] strings=new String[] {
+//            "/work/bbushnell/ssaha2/ssaha2 -solexa -outfile #S.sam -best -1 -output sam_soft -save hg37 " +
+//            "/work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S.sam #R ssaha2",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt"
+//    };
+
+//    String[] strings=new String[] {
+//            "bwa aln -t 22 bs_ /work/bbushnell/synth/reads_B1_100000x100bp_#S_chr1-25.fq > temp_default.sai",
+//            "bwa samse bs_ temp_default.sai /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq > #S_default.sam",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_default.sam #R",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt"
+//    };
+
+//    String[] strings=new String[] {
+//            "java -ea -Xms24g -Xmx31g -server -XX:+UseNUMA -XX:+AggressiveOpts -XX:+UseCompressedOops " +
+//            "align.TestIndex11f 1 25 100 0 /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq null " +
+//            "outfile=#S_bbmap11f.sam cs=false threads=22 paired=false pairlen=100 build=37 match=short " +
+//            "removeambiguous=false fastqparsecustom overwrite savepar=false",
+//
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bbmap11fTime.txt",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_bbmap11f.sam #R",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bbmap11fTime.txt"
+//    };
+
+//    String[] strings=new String[] {
+//            "bowtie --best -y --chunkmbs 1024 --strata -m 1 -k 2 -v 3 -p 24 -t -q -S HG37" +
+//            " /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq #S_bowtie.sam",
+//
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bowtieTime.txt",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_bowtie.sam #R",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bowtieTime.txt"
+//    };
+
+//    String[] strings=new String[] {
+//            "bfast match -T $TMPDIR/ -n 16 -f hg19.fa -r /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq > $TMPDIR/#S.bmf",
+//            "bfast localalign -n 16 -f hg19.fa -m $TMPDIR/#S.bmf > $TMPDIR/#S.baf",
+////            "bfast postprocess -n 16 -a 3 -f hg19.fa -i $TMPDIR/#S.baf > #S.sam",
+////            "bfast postprocess -n 16 -a 3 -m 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#R.sam",
+//            "bfast postprocess -n 16 -M 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#R.sam",
+//
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bfastTime.txt",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bfastTime.txt"
+//    };
+
+//    String[] strings=new String[] {
+//            "smalt_x86_64 map -n 8 -a -f samsoft -o #S_r#R.sam hg37 /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R ssaha2",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
+//    };
+
+//    String[] strings=new String[] {
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
+//            "./soap -p 24 -a /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq -D hg37.fa.index -o #S_r#R.soap",
+//            "perl soap2sam.pl -p #S_r#R.soap > #S_r#R.sam",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
+//            "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R",
+//    };
+
+     final String[] templates={
+             "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
+             "./bin/gmapper-ls /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq --single-best-mapping --qv-offset 33 -L hg37 -N 24 -o 5 -h 80% > #S_r#R.sam",
+             "java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
+             "java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R",
+     };
+
+     final int[] allZero={0, 0, 0, 0};
+     //Four identical preliminary blocks with a fixed read count of 100.
+     for(int rep=0; rep<4; rep++){
+         print(templates, allZero, 100);
+     }
+     //One block per configured row, using the requested read count.
+     for(int i=0; i<sets.length; i++){
+         print(templates, sets[i], numReads);
+     }
+ }
+
+ private static void print(String[] array, int[] blank, int x) {
+
+ String counts=(blank[0]+"S_"+blank[1]+"I_"+blank[2]+"D_"+blank[3]+"U");
+ String reads=""+x;
+
+ for(String s : array){
+ String s2=s.replaceAll("#S", counts).replaceAll("#R", reads);
+ System.out.println(s2);
+ }
+ System.out.println();
+
+ }
+
+ public static int numReads=400000; //Default read count; main overwrites it from args[0]
+ // One row per generated command block; print() turns the four elements into the "#S" tag (nS_nI_nD_nU).
+ public static final int[][] sets=new int[][] { //First group: rows varying only the first ("S") count
+ {0, 0, 0, 0},
+ {1, 0, 0, 0},
+ {2, 0, 0, 0},
+ {3, 0, 0, 0},
+ {4, 0, 0, 0},
+ {5, 0, 0, 0},
+ {6, 0, 0, 0},
+ {7, 0, 0, 0},
+ {8, 0, 0, 0},
+ {10, 0, 0, 0},
+ {12, 0, 0, 0},
+ {14, 0, 0, 0},
+ {16, 0, 0, 0},
+ {18, 0, 0, 0},
+ {20, 0, 0, 0},
+ {24, 0, 0, 0},
+ {28, 0, 0, 0},
+ {32, 0, 0, 0},
+ {36, 0, 0, 0},
+ {40, 0, 0, 0},
+ // Rows varying only the second ("I") count
+ {0, 1, 0, 0},
+ {0, 2, 0, 0},
+ {0, 3, 0, 0},
+ {0, 4, 0, 0},
+ {0, 5, 0, 0},
+ {0, 6, 0, 0},
+ {0, 7, 0, 0},
+ {0, 8, 0, 0},
+ {0, 10, 0, 0},
+ {0, 12, 0, 0},
+ {0, 14, 0, 0},
+ {0, 16, 0, 0},
+ {0, 20, 0, 0},
+ {0, 24, 0, 0},
+ {0, 28, 0, 0},
+ {0, 32, 0, 0},
+ {0, 36, 0, 0},
+ {0, 40, 0, 0},
+ // Rows varying only the third ("D") count
+ {0, 0, 1, 0},
+ {0, 0, 2, 0},
+ {0, 0, 3, 0},
+ {0, 0, 4, 0},
+ {0, 0, 5, 0},
+ {0, 0, 6, 0},
+ {0, 0, 7, 0},
+ {0, 0, 8, 0},
+ {0, 0, 10, 0},
+ {0, 0, 12, 0},
+ {0, 0, 14, 0},
+ {0, 0, 16, 0},
+ {0, 0, 20, 0},
+ {0, 0, 24, 0},
+ {0, 0, 28, 0},
+ {0, 0, 32, 0},
+ {0, 0, 48, 0},
+ {0, 0, 64, 0},
+ {0, 0, 128, 0},
+ {0, 0, 192, 0},
+ {0, 0, 256, 0},
+ {0, 0, 512, 0},
+ {0, 0, 1000, 0},
+ {0, 0, 2000, 0},
+ {0, 0, 3000, 0},
+ {0, 0, 4000, 0},
+ {0, 0, 6000, 0},
+ {0, 0, 8000, 0},
+ {0, 0, 10000, 0},
+ {0, 0, 12000, 0},
+ {0, 0, 14000, 0},
+ {0, 0, 16000, 0},
+ {0, 0, 20000, 0},
+ {0, 0, 24000, 0},
+ {0, 0, 28000, 0},
+ {0, 0, 32000, 0},
+ // Rows varying only the fourth ("U") count
+ {0, 0, 0, 1},
+ {0, 0, 0, 2},
+ {0, 0, 0, 3},
+ {0, 0, 0, 4},
+ {0, 0, 0, 5},
+ {0, 0, 0, 6},
+ {0, 0, 0, 7},
+ {0, 0, 0, 8},
+ {0, 0, 0, 10},
+ {0, 0, 0, 12},
+ {0, 0, 0, 14},
+ {0, 0, 0, 16},
+ {0, 0, 0, 20},
+ {0, 0, 0, 24},
+ {0, 0, 0, 28},
+ {0, 0, 0, 32},
+ {0, 0, 0, 36},
+ {0, 0, 0, 40}
+ };
+
+}
diff --git a/current/driver/MeasureGene.java b/current/driver/MeasureGene.java
new file mode 100755
index 0000000..8638dcd
--- /dev/null
+++ b/current/driver/MeasureGene.java
@@ -0,0 +1,230 @@
+package driver;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Exon;
+import dna.Gene;
+import dna.MotifMulti;
+import dna.MotifProbsN;
+
+public class MeasureGene {
+
+
+	/**
+	 * Scores every normal plus-strand gene on chromosomes 19 through 22 with
+	 * {@link #analyzeGene(Gene)} and prints the sum, count and mean of the per-gene scores.
+	 * @param args Unused
+	 */
+	public static void main(String[] args){
+		
+		final byte minChrom=19;
+		final byte maxChrom=22;
+		
+		double sum=0;
+		long count=0;
+		
+		for(byte chrom=minChrom; chrom<=maxChrom; chrom++){
+			Data.getChromosome(chrom);
+			final Gene[] genes=toNormalGenes(Data.getGenes(chrom, Gene.PLUS));
+			
+			for(Gene g : genes){
+				//The score used to be discarded, so the summary below always printed 0, 0 and NaN.
+				final float score=analyzeGene(g);
+				//A gene without exons yields NaN (0/0); keep it out of the totals.
+				if(!Float.isNaN(score)){
+					sum+=score;
+					count++;
+				}
+			}
+		}
+		
+		System.out.println("Sum: "+sum);
+		System.out.println("Count: "+count);
+		System.out.println("Average: "+(count>0 ? sum/count : 0));
+		
+	}
+
+
+	/**
+	 * Prints and returns the mean boundary-motif strength over all exons of a gene.
+	 * The start of the first exon is scored with the gene-start motif and the end of the last
+	 * exon with the gene-stop motif; every other boundary uses the exon-start or exon-stop motif.
+	 * @param g Gene to score; asserted to be on the plus strand
+	 * @return Sum of all start and stop strengths divided by twice the number of exons
+	 */
+	public static float analyzeGene(Gene g){
+		assert(g.strand==Gene.PLUS) : "TODO";
+		final ChromosomeArray chromArray=Data.getChromosome(g.chromosome);
+		
+		System.out.println("\nchr"+g.chromosome+"\t"+g.symbol+"\t"+g.mrnaAcc);
+		
+		final int last=g.exons.length-1;
+		double total=0;
+		
+		for(int idx=0; idx<=last; idx++){
+			final Exon exon=g.exons[idx];
+			
+			//The result is not used, but the call is kept so its chromosome lookup and checks still run.
+			measureExonFrequency(exon.a, exon.b, exon.chromosome, exon.strand);
+			
+			final MotifMulti startMotif=(idx==0 ? mGStart : mEStart);
+			final MotifMulti stopMotif=(idx==last ? mGStop : mEStop);
+			
+			final float startStrength=startMotif.matchStrength(chromArray.array, exon.a);
+			final float stopStrength=stopMotif.matchStrength(chromArray.array, exon.b);
+			
+			total+=startStrength;
+			total+=stopStrength;
+		}
+		
+		final float avg=(float)(total/(2*g.exons.length));
+		
+		System.out.println(String.format("Average: %.3f", avg));
+		return avg;
+	}
+
+
+	/**
+	 * Filters an array of genes down to those for which {@code isNormalGene()} is true.
+	 * @param genes Genes to filter
+	 * @return New array holding the accepted genes in their original order
+	 */
+	public static Gene[] toNormalGenes(Gene[] genes){
+		final ArrayList<Gene> kept=new ArrayList<Gene>(genes.length);
+		for(int i=0; i<genes.length; i++){
+			final Gene candidate=genes[i];
+			if(!candidate.isNormalGene()){continue;}
+			kept.add(candidate);
+		}
+		final Gene[] result=new Gene[kept.size()];
+		return kept.toArray(result);
+	}
+
+
+	/**
+	 * Collects the distinct exons of the given genes.
+	 * @param genes Genes whose exons should be gathered
+	 * @return The exons with duplicates removed, sorted by their natural ordering
+	 */
+	public static ArrayList<Exon> getExons(Gene...genes){
+		final HashSet<Exon> unique=new HashSet<Exon>();
+		for(Gene gene : genes){
+			for(Exon exon : gene.exons){
+				unique.add(exon);
+			}
+		}
+		final ArrayList<Exon> sorted=new ArrayList<Exon>(unique);
+		Collections.sort(sorted);
+		return sorted;
+	}
+
+
+ public static float measureExonFrequency(int a, int b, byte chrom, byte strand){
+// assert e.strand==Gene.PLUS;
+
+ int start=a;
+ int stop=b-1;
+
+ double sum=0;
+ int count=0;
+
+ assert(strand==Gene.PLUS) : "TODO";
+ ChromosomeArray ca=Data.getChromosome(chrom);
+
+ for(int i=start; i<stop; i++){
+ int number=0;
+ boolean invalid=false;
+ for(int j=0; j<length; j++){
+ int code=AminoAcid.baseToNumberACGTN[ca.get(i+j)];
+ invalid=invalid || (code<0 || code>3);
+ number=((number<<2)|code);
+ }
+ if(!invalid){
+ count++;
+ sum+=freqDif[number];
+ }else{
+ return 0;
+ }
+ }
+
+ return count>0 ? (float)(sum/count) : 0;
+ }
+
+
+
+
+
+ //Exon-start motifs. mATG is created here but is not part of any MotifMulti below.
+ //NOTE(review): the meaning of the numeric makeMotif arguments is not visible here - confirm in MotifProbsN.
+ private static final MotifProbsN mAG=MotifProbsN.makeMotif("AG Exon Starts MP2", 13, 11, 2);
+ private static final MotifProbsN mAC=MotifProbsN.makeMotif("AC Exon Starts MP2", 13, 11, 2);
+ private static final MotifProbsN mATG=MotifProbsN.makeMotif("ATG Exon Starts MP2", 13, 11, 2);
+
+ //Exon-stop motifs
+ private static final MotifProbsN mGT=MotifProbsN.makeMotif("GT Exon Stops MP2", 10, 3, 2);
+ private static final MotifProbsN mGC=MotifProbsN.makeMotif("GC Exon Stops MP2", 10, 3, 2);
+
+ //Gene-start motif
+ private static final MotifProbsN mGStartATG=MotifProbsN.makeMotif("Gene Starts MP2", 13, 11, 2);
+
+ //Gene-stop motifs, one per stop codon named in the motif title
+ private static final MotifProbsN mGStopTAA=MotifProbsN.makeMotif("TAA Gene Stops MP2", 13, 11, 2);
+ private static final MotifProbsN mGStopTAG=MotifProbsN.makeMotif("TAG Gene Stops MP2", 13, 11, 2);
+ private static final MotifProbsN mGStopTGA=MotifProbsN.makeMotif("TGA Gene Stops MP2", 13, 11, 2);
+
+ //Combined motifs used by analyzeGene to score gene and exon boundaries
+ private static final MotifMulti mGStart=new MotifMulti("Gene Starts MP2", mGStartATG);
+ private static final MotifMulti mEStart=new MotifMulti("Exon Starts MP2", mAG, mAC);
+ private static final MotifMulti mEStop=new MotifMulti("Exon Stops MP2", mGT, mGC);
+ private static final MotifMulti mGStop=new MotifMulti("Gene Stops MP2", mGStopTAA, mGStopTAG, mGStopTGA);
+
+
+ //Number of bases per k-mer in measureExonFrequency; also selects which table pair feeds freqDif
+ private static final int length=2;
+
+ //Overall Frequency Exonic (4 entries, used when length==1)
+
+ public static final float[] exonicFreq1={0.259195f, 0.260530f, 0.260441f, 0.219835f};
+
+ //Overall Frequency Non-Exonic (4 entries, used when length==1)
+
+ public static final float[] nonExonicFreq1={0.277111f, 0.204189f, 0.213443f, 0.305257f};
+
+ //Overall Frequency Exonic (16 entries, used when length==2)
+
+ public static final float[] exonicFreq2={0.071395f, 0.055355f, 0.077256f, 0.052618f,
+ 0.079593f, 0.077505f, 0.032685f, 0.071248f, 0.075189f, 0.070017f, 0.070666f,
+ 0.045554f, 0.032210f, 0.057977f, 0.079080f, 0.051651f};
+
+ //Overall Frequency Non-Exonic (16 entries, used when length==2)
+
+ public static final float[] nonExonicFreq2={0.086472f, 0.047310f, 0.070451f, 0.072291f,
+ 0.069003f, 0.055260f, 0.011722f, 0.071913f, 0.058469f, 0.045772f, 0.056984f,
+ 0.054175f, 0.062555f, 0.059560f, 0.076273f, 0.101790f};
+
+ //Exonic minus non-exonic frequency for the table pair matching length; null for any other length.
+ //Indexed in measureExonFrequency by the k-mer packed at 2 bits per base.
+ public static final float[] freqDif=(
+ length==2 ? makeDif(exonicFreq2, nonExonicFreq2) :
+ length==1 ? makeDif(exonicFreq1, nonExonicFreq1) :
+ null);
+
+	/**
+	 * Computes the element-wise difference of two arrays.
+	 * @param a Minuend; determines the length of the result
+	 * @param b Subtrahend; must be at least as long as a
+	 * @return New array whose element i is a[i]-b[i]
+	 */
+	public static final float[] makeDif(float[] a, float[] b){
+		final int n=a.length;
+		final float[] result=new float[n];
+		int idx=0;
+		while(idx<n){
+			result[idx]=a[idx]-b[idx];
+			idx++;
+		}
+		return result;
+	}
+
+}
diff --git a/current/driver/MergeBigelow.java b/current/driver/MergeBigelow.java
new file mode 100755
index 0000000..f6d2c9e
--- /dev/null
+++ b/current/driver/MergeBigelow.java
@@ -0,0 +1,245 @@
+package driver;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+import fileIO.TextFile;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 17, 2014
+ *
+ */
+public class MergeBigelow {
+
+	/**
+	 * Program entry point: starts a timer, parses the arguments, and runs the merge.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		final Timer timer=new Timer();
+		new MergeBigelow(args).process(timer);
+	}
+
+	/**
+	 * Parses the command line, validates the file names, and creates the file formats.
+	 * Arguments the shared Parser does not recognize are handled here: "verbose", and a bare
+	 * first argument that starts with "stdin" or names an existing file, which becomes the first input.
+	 * @param args Command line arguments
+	 * @throws RuntimeException if fewer than two inputs are given, an input fails validation,
+	 * a file name is repeated, or the output cannot be written
+	 */
+	public MergeBigelow(String[] args){
+		
+		args=Parser.parseConfig(args);
+		if(Parser.parseHelp(args, true)){
+			printOptions();
+			System.exit(0);
+		}
+		
+		for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+		outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+		
+		Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+		Shared.capBuffers(4);
+		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+		
+		Parser parser=new Parser();
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			if(b==null || b.equalsIgnoreCase("null")){b=null;}
+			while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+			
+			if(parser.parse(arg, a, b)){
+				//do nothing
+			}else if(a.equals("verbose")){
+				verbose=Tools.parseBoolean(b);
+				ReadWrite.verbose=verbose;
+			}else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+				parser.in1=arg;
+			}else{
+				outstream.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+				//				throw new RuntimeException("Unknown parameter "+args[i]);
+			}
+		}
+		
+		{//Process parser fields
+			Parser.processQuality();
+			
+			maxReads=parser.maxReads;
+			
+			overwrite=parser.overwrite;
+			append=parser.append;
+			
+			in1=parser.in1;
+			in2=parser.in2;
+			
+			out1=parser.out1;
+		}
+		
+		if(in1==null || in2==null){
+			printOptions();
+			throw new RuntimeException("Error - two input files are required.");
+		}
+		
+		if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+		
+		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+			outstream.println((out1==null)+", "+out1);
+			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+		}
+		
+		//These checks used to live inside assert(...), so they were skipped entirely whenever the
+		//JVM ran without -ea. They are now always performed.
+		if(!Tools.testInputFiles(false, true, in1, in2)){
+			throw new RuntimeException("\nCan't read some input files.\n");
+		}
+		if(!Tools.testForDuplicateFiles(true, in1, in2, out1)){
+			throw new RuntimeException("\nSome file names were specified multiple times.\n");
+		}
+		
+		ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, false);
+		
+		ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, true);
+		ffin2=FileFormat.testInput(in2, FileFormat.TEXT, null, true, true);
+	}
+
+	/**
+	 * Loads the second input into a lookup table, then streams the first input line by line,
+	 * writes each merged line to the output (when there is one), and prints throughput statistics.
+	 * @param t Timer started by the caller; stopped here after the streams are closed
+	 * @throws RuntimeException if any stream reported an error
+	 */
+	void process(Timer t){
+		
+		table=hash(ffin2);
+		
+		final TextFile tf=new TextFile(ffin1);
+		if(verbose){outstream.println("Started tf");}
+		
+		//The loop below already treats the writer as optional, so creation, start and shutdown
+		//are guarded the same way instead of being attempted on a missing output format.
+		final TextStreamWriter tsw=(ffout1==null ? null : new TextStreamWriter(ffout1));
+		if(tsw!=null){
+			tsw.start();
+			if(verbose){outstream.println("Started tsw");}
+		}
+		
+		long linesProcessed=0;
+		long charsProcessed=0;
+		
+		{
+			String line;
+			while((line=tf.nextLine())!=null){
+				linesProcessed++;
+				charsProcessed+=line.length();
+				CharSequence result=processLine(line);
+				if(tsw!=null && result!=null){tsw.println(result);}
+				if(maxReads>0 && linesProcessed>=maxReads){break;}
+			}
+		}
+		
+		if(tsw!=null){errorState|=tsw.poisonAndWait();}
+		errorState|=tf.close();
+		
+		t.stop();
+		
+		//Rates are per unit of t.elapsed; the multipliers below scale them for display.
+		double rpnano=linesProcessed/(double)(t.elapsed);
+		double bpnano=charsProcessed/(double)(t.elapsed);
+		
+		String rpstring=(linesProcessed<100000 ? ""+linesProcessed : linesProcessed<100000000 ? (linesProcessed/1000)+"k" : (linesProcessed/1000000)+"m");
+		String bpstring=(charsProcessed<100000 ? ""+charsProcessed : charsProcessed<100000000 ? (charsProcessed/1000)+"k" : (charsProcessed/1000000)+"m");
+		
+		while(rpstring.length()<8){rpstring=" "+rpstring;}
+		while(bpstring.length()<8){bpstring=" "+bpstring;}
+		
+		outstream.println("Time:                         \t"+t);
+		outstream.println("Lines Processed:    "+rpstring+" \t"+String.format("%.2fk lines/sec", rpnano*1000000));
+		outstream.println("Chars Processed:    "+bpstring+" \t"+String.format("%.2fm chars/sec", bpnano*1000));
+		
+		if(errorState){
+			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+		}
+	}
+
+
+ private CharSequence processLine(String line){
+ String[] split=line.split(delimiter);
+ String[] split2=table.get(split[0]);
+ if(split2==null){return line;} //Header
+ StringBuilder sb=new StringBuilder();
+ String tab="";
+// assert(false) : split.length+", "+split2.length;
+// System.err.println(split[1]);
+ if(split.length>1){
+ if(split[1].contains(" SCGC")){
+ split[1]=split[1].substring(0, split[1].indexOf(" SCGC"));
+// System.err.println(split[1]);
+ }
+ if(split[1].contains(" "+split[0])){
+ split[1]=split[1].substring(0, split[1].indexOf(" "+split[0]));
+// System.err.println(split[1]);
+ }
+ split[1]=split[1].toLowerCase();
+// System.err.println(split[1]);
+ }
+ for(int i=0; i<split.length; i++){
+ sb.append(tab);
+ sb.append(split[i].replace(',','_'));
+ tab="\t";
+ }
+ for(int i=1; i<split2.length; i++){
+ sb.append(tab);
+ sb.append(split2[i].replace(',','_'));
+ tab="\t";
+ }
+ return sb;
+ }
+
+	/**
+	 * Reads a delimited text file into a map from each line's first field to all of its fields.
+	 * When a key occurs more than once, the last line wins.
+	 * The file is closed before returning, and a failed close is recorded in errorState.
+	 * @param ff Format of the file to load
+	 * @return Map of first field to the split line
+	 */
+	private HashMap<String, String[]> hash(FileFormat ff){
+		final HashMap<String, String[]> map=new HashMap<String, String[]>();
+		final TextFile tf=new TextFile(ff);
+		if(verbose){outstream.println("Started tf");}
+		try{
+			for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+				final String[] split=line.split(delimiter);
+				map.put(split[0], split);
+			}
+		}finally{
+			//The file was previously left open.
+			errorState|=tf.close();
+		}
+		return map;
+	}
+
+
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Prints a short usage message to the message stream.
+	 * Replaces an assert(false) placeholder that aborted help requests when assertions were
+	 * enabled and printed nothing when they were disabled.
+	 */
+	private void printOptions(){
+		outstream.println("Usage: "+getClass().getName()+" in=<file> in2=<file> out=<file>");
+		outstream.println("Appends the columns of each in2 line to the in lines that share its first column.");
+	}
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null; //Primary input; streamed line by line in process()
+ private String in2=null; //Secondary input; loaded into the lookup table by hash()
+ private String out1=null; //Output path; the literal "null" is converted to null in the constructor
+
+ private String delimiter="\t"; //Pattern used to split the lines of both inputs
+ private HashMap<String, String[]> table; //First field of each in2 line mapped to all fields of that line
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1; //Stop after this many input lines; values <=0 mean no limit
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1; //Format for in1
+ private final FileFormat ffin2; //Format for in2
+ private final FileFormat ffout1; //Format for out1
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination for status messages
+ public static boolean verbose=false; //Print progress messages; also copied to ReadWrite.verbose
+ public boolean errorState=false; //Set when a stream reports an error; process() then throws
+ private boolean overwrite=false; //Copied from the parser; passed to the output-file checks
+ private boolean append=false; //Copied from the parser; passed to the output-file checks
+
+}
diff --git a/current/driver/MergeCoverageOTU.java b/current/driver/MergeCoverageOTU.java
new file mode 100755
index 0000000..06cc68a
--- /dev/null
+++ b/current/driver/MergeCoverageOTU.java
@@ -0,0 +1,66 @@
+package driver;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+
+import jgi.CovStatsLine;
+
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 13, 2014
+ *
+ */
+public class MergeCoverageOTU {
+	
+	/**
+	 * Reads a coverage-statistics file, adds together the lines that share an OTU name, and
+	 * writes one combined line per OTU in order of first appearance.
+	 * The first line and every line starting with '#' are copied through as headers.
+	 * @param args Either "in=&lt;file&gt;" and "out=&lt;file&gt;" in either order, or the input
+	 * path followed by the output path
+	 * @throws IllegalArgumentException if fewer than two arguments are given or a value is empty
+	 */
+	public static void main(String[] args){
+		if(args.length<2){
+			throw new IllegalArgumentException("Usage: MergeCoverageOTU in=<file> out=<file>");
+		}
+		final String a=args[0];
+		final String b=args[1];
+		final String aLower=a.toLowerCase();
+		final String in, out;
+		if(aLower.startsWith("in=")){
+			in=argValue(a);
+			out=argValue(b);
+		}else if(aLower.startsWith("out=")){
+			in=argValue(b);
+			out=argValue(a);
+		}else{
+			in=a;
+			out=b;
+		}
+		
+		final TextFile tf=new TextFile(in);
+		final LinkedHashMap<String, CovStatsLine> map=new LinkedHashMap<String, CovStatsLine>();
+		final ArrayList<String> headers=new ArrayList<String>();
+		boolean firstLine=true;
+		for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+			if(firstLine || s.startsWith("#")){
+				headers.add(s);
+			}else{
+				final String otu=parseOtu(s);
+				final CovStatsLine csl=new CovStatsLine(s);
+				final CovStatsLine old=map.get(otu);
+				if(old==null){
+					map.put(otu, csl);
+				}else{
+					old.add(csl);
+				}
+			}
+			firstLine=false;
+		}
+		tf.close();
+		
+		final TextStreamWriter tsw=new TextStreamWriter(out, true, false, false);
+		tsw.start();
+		for(String s : headers){tsw.println(s);}
+		for(String otu : map.keySet()){
+			final CovStatsLine csl=map.get(otu);
+			csl.id=otu;
+			tsw.println(csl.toString());
+		}
+		tsw.poisonAndWait();
+	}
+	
+	/**
+	 * Returns the text after the first '=' of an argument, or the whole argument if it has none.
+	 * The previous split("=")[1] failed on a bare path or an empty value, and cut off values
+	 * that themselves contain '='.
+	 */
+	private static String argValue(String arg){
+		final int eq=arg.indexOf('=');
+		final String value=(eq<0 ? arg : arg.substring(eq+1));
+		if(value.length()==0){
+			throw new IllegalArgumentException("Missing value in argument '"+arg+"'");
+		}
+		return value;
+	}
+	
+	/**
+	 * Extracts the OTU name from a data line: the part of the first tab-delimited column that
+	 * follows its first space, or the whole column if it contains no space.
+	 * A space located after the first tab is ignored, and a line without a tab is treated as a
+	 * single column; both cases previously threw StringIndexOutOfBoundsException.
+	 */
+	static String parseOtu(String line){
+		int tab=line.indexOf('\t');
+		if(tab<0){tab=line.length();}
+		int space=line.indexOf(' ');
+		if(space>tab){space=-1;}
+		return line.substring(space+1, tab);
+	}
+	
+}
diff --git a/current/driver/MergeTextFiles.java b/current/driver/MergeTextFiles.java
new file mode 100755
index 0000000..75c2f01
--- /dev/null
+++ b/current/driver/MergeTextFiles.java
@@ -0,0 +1,96 @@
+package driver;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Hashtable;
+
+import dna.Data;
+
+import fileIO.TextFile;
+
+public class MergeTextFiles {
+
+	/**
+	 * Merges two tab-delimited files and prints the result to standard output.
+	 * The first file is keyed on column 0 and the second on column 1.
+	 * @param args Paths of the first and second file
+	 */
+	public static void main(String[] args){
+		final String fname1=args[0];
+		final String fname2=args[1];
+		System.out.println(mergeWithHeader(fname1, fname2, 0, 1));
+	}
+
+	/**
+	 * Joins two tab-delimited files that each start with a header line.
+	 * Output starts with the two headers side by side, followed by one row per distinct key in
+	 * sorted order. Each half of a row is padded with tabs to the widest line of its file. When
+	 * a key is missing from the first file, its left half holds only the key, placed in column col1.
+	 * @param fname1 Path of the first file
+	 * @param fname2 Path of the second file
+	 * @param col1 Key column of the first file
+	 * @param col2 Key column of the second file
+	 * @return The merged text, one newline-terminated row per line
+	 */
+	public static StringBuilder mergeWithHeader(String fname1, String fname2, int col1, int col2){
+		
+		TextFile reader=new TextFile(fname1, false, false);
+		final String[][] left=TextFile.doublesplitTab(reader.toStringLines(), false);
+		reader.close();
+		
+		reader=new TextFile(fname2, false, false);
+		final String[][] right=TextFile.doublesplitTab(reader.toStringLines(), false);
+		reader.close();
+		reader=null;
+		
+		final int leftWidth=findMaxWidth(left);
+		final int rightWidth=findMaxWidth(right);
+		
+		final Hashtable<String, String[]> leftByKey=makeTable(left, col1, 1);
+		final Hashtable<String, String[]> rightByKey=makeTable(right, col2, 1);
+		
+		final HashSet<String> union=new HashSet<String>(leftByKey.keySet());
+		union.addAll(rightByKey.keySet());
+		final String[] sortedKeys=union.toArray(new String[0]);
+		Arrays.sort(sortedKeys);
+		
+		final StringBuilder merged=new StringBuilder();
+		merged.append(toString(left[0], right[0], leftWidth, rightWidth)).append('\n');
+		
+		for(int k=0; k<sortedKeys.length; k++){
+			final String key=sortedKeys[k];
+			String[] leftRow=leftByKey.get(key);
+			final String[] rightRow=rightByKey.get(key);
+			
+			if(leftRow==null){
+				//The key came from the second file only; show it in the key column of the left half.
+				leftRow=new String[col1+1];
+				leftRow[col1]=rightRow[col2];
+			}
+			
+			merged.append(toString(leftRow, rightRow, leftWidth, rightWidth)).append('\n');
+		}
+		
+		return merged;
+	}
+
+ private static StringBuilder toString(String[] a, String[] b, int alen, int blen){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<alen; i++){
+ if(a!=null && a.length>i && a[i]!=null){
+ sb.append(a[i]);
+ }
+ sb.append('\t');
+ }
+ for(int i=0; i<blen; i++){
+ if(b!=null && b.length>i && b[i]!=null){
+ sb.append(b[i]);
+ }
+ sb.append('\t');
+ }
+ return sb;
+ }
+
+ private static Hashtable<String, String[]> makeTable(String[][] lines, int col, int firstLine) {
+ Hashtable<String, String[]> table=new Hashtable<String, String[]>();
+ for(int i=firstLine; i<lines.length; i++){
+ String[] line=lines[i];
+ table.put(line[col], line);
+ }
+ return table;
+ }
+
+ private static int findMaxWidth(String[][] matrix){
+ int max=0;
+ for(String[] line : matrix){
+ if(line!=null && max<line.length){max=line.length;}
+ }
+ return max;
+ }
+
+}
diff --git a/current/driver/MergeTextFiles2.java b/current/driver/MergeTextFiles2.java
new file mode 100755
index 0000000..414cbe3
--- /dev/null
+++ b/current/driver/MergeTextFiles2.java
@@ -0,0 +1,96 @@
+package driver;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Hashtable;
+
+import dna.Data;
+
+import fileIO.TextFile;
+
+public class MergeTextFiles2 {
+
+	/**
+	 * Merges two tab-delimited files and prints the result to standard output.
+	 * The first file is keyed on column 0 and the second on column 1.
+	 * @param args Paths of the first and second file
+	 */
+	public static void main(String[] args){
+		final String fname1=args[0];
+		final String fname2=args[1];
+		System.out.println(mergeWithHeader(fname1, fname2, 0, 1));
+	}
+
+	/**
+	 * Joins two tab-delimited files that each start with a header line.
+	 * Output starts with the two headers side by side, followed by one row per distinct key in
+	 * sorted order. Each half of a row is padded with tabs to the widest line of its file. When
+	 * a key is missing from the first file, its left half holds only the key, placed in column col1.
+	 * @param fname1 Path of the first file
+	 * @param fname2 Path of the second file
+	 * @param col1 Key column of the first file
+	 * @param col2 Key column of the second file
+	 * @return The merged text, one newline-terminated row per line
+	 */
+	public static StringBuilder mergeWithHeader(String fname1, String fname2, int col1, int col2){
+		
+		TextFile reader=new TextFile(fname1, false, false);
+		final String[][] left=TextFile.doublesplitTab(reader.toStringLines(), false);
+		reader.close();
+		
+		reader=new TextFile(fname2, false, false);
+		final String[][] right=TextFile.doublesplitTab(reader.toStringLines(), false);
+		reader.close();
+		reader=null;
+		
+		final int leftWidth=findMaxWidth(left);
+		final int rightWidth=findMaxWidth(right);
+		
+		final Hashtable<String, String[]> leftByKey=makeTable(left, col1, 1);
+		final Hashtable<String, String[]> rightByKey=makeTable(right, col2, 1);
+		
+		final HashSet<String> union=new HashSet<String>(leftByKey.keySet());
+		union.addAll(rightByKey.keySet());
+		final String[] sortedKeys=union.toArray(new String[0]);
+		Arrays.sort(sortedKeys);
+		
+		final StringBuilder merged=new StringBuilder();
+		merged.append(toString(left[0], right[0], leftWidth, rightWidth)).append('\n');
+		
+		for(int k=0; k<sortedKeys.length; k++){
+			final String key=sortedKeys[k];
+			String[] leftRow=leftByKey.get(key);
+			final String[] rightRow=rightByKey.get(key);
+			
+			if(leftRow==null){
+				//The key came from the second file only; show it in the key column of the left half.
+				leftRow=new String[col1+1];
+				leftRow[col1]=rightRow[col2];
+			}
+			
+			merged.append(toString(leftRow, rightRow, leftWidth, rightWidth)).append('\n');
+		}
+		
+		return merged;
+	}
+
+ private static StringBuilder toString(String[] a, String[] b, int alen, int blen){
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<alen; i++){
+ if(a!=null && a.length>i && a[i]!=null){
+ sb.append(a[i]);
+ }
+ sb.append('\t');
+ }
+ for(int i=0; i<blen; i++){
+ if(b!=null && b.length>i && b[i]!=null){
+ sb.append(b[i]);
+ }
+ sb.append('\t');
+ }
+ return sb;
+ }
+
+ private static Hashtable<String, String[]> makeTable(String[][] lines, int col, int firstLine) {
+ Hashtable<String, String[]> table=new Hashtable<String, String[]>();
+ for(int i=firstLine; i<lines.length; i++){
+ String[] line=lines[i];
+ table.put(line[col], line);
+ }
+ return table;
+ }
+
+ private static int findMaxWidth(String[][] matrix){
+ int max=0;
+ for(String[] line : matrix){
+ if(line!=null && max<line.length){max=line.length;}
+ }
+ return max;
+ }
+
+}
diff --git a/current/driver/MoveFiles.java b/current/driver/MoveFiles.java
new file mode 100755
index 0000000..5f269fe
--- /dev/null
+++ b/current/driver/MoveFiles.java
@@ -0,0 +1,92 @@
+package driver;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+
+public class MoveFiles {
+
+
+	/**
+	 * Copies the files of a directory into per-chromosome subdirectories "chr1" through "chr22".
+	 * A file belongs to chromosome N when its name, with the extension and any trailing non-digit
+	 * characters removed and lower-cased, ends with "chrN". Missing subdirectories are created.
+	 * Despite the class name, the source files are copied, not moved.
+	 * @param args args[0] is the directory to process
+	 * @throws IllegalArgumentException if no directory is given or it cannot be listed
+	 */
+	public static void main(String[] args){
+		
+		if(args.length<1){
+			throw new IllegalArgumentException("Usage: MoveFiles <directory>");
+		}
+		final String root=args[0].replace('\\', '/');
+		
+		//listFiles() returns null for a missing path or a non-directory. This used to be guarded
+		//only by assertions, so without -ea it surfaced as a NullPointerException in the loop.
+		final File[] files=new File(root).listFiles();
+		if(files==null){
+			throw new IllegalArgumentException("Not a readable directory: "+root);
+		}
+		
+		for(int chrom=1; chrom<=22; chrom++){
+			
+			final String key="chr"+chrom;
+			
+			final File dest=new File(root+"/"+key);
+			if(!dest.exists()){
+				dest.mkdir();
+			}
+			
+			for(File f : files){
+				final String fileName=f.getName();
+				if(f.isFile() && chromKey(fileName).endsWith(key)){
+					copyFile(f.getAbsolutePath(), dest.getAbsolutePath()+"/"+fileName);
+				}
+			}
+		}
+		
+	}
+	
+	/** Strips the extension and any trailing non-digit characters from a file name and lower-cases it. */
+	private static String chromKey(String name){
+		final int dot=name.lastIndexOf('.');
+		if(dot>=0){name=name.substring(0, dot);}
+		while(name.length()>1 && !Character.isDigit(name.charAt(name.length()-1))){
+			name=name.substring(0, name.length()-1);
+		}
+		return name.toLowerCase();
+	}
+
+
+ /**
+ * @param srFile
+ * @param dtFile
+ * {@link from http://www.roseindia.net/java/beginners/CopyFile.shtml}
+ */
+ private static void copyFile(String src, String dst){
+// assert(false) : src+" -> "+dst;
+ try{
+ File f1 = new File(src);
+ File f2 = new File(dst);
+ InputStream in = new FileInputStream(f1);
+ //For Append the file.
+ // OutputStream out = new FileOutputStream(f2,true);
+
+ //For Overwrite the file.
+ OutputStream out = new FileOutputStream(f2);
+
+ byte[] buf = new byte[16384];
+ int len;
+ while ((len = in.read(buf)) > 0){
+ out.write(buf, 0, len);
+ }
+ in.close();
+ out.close();
+ }catch(FileNotFoundException e){
+ throw new RuntimeException(e);
+ }catch(IOException e){
+ throw new RuntimeException(e);
+ }
+ }
+
+
+}
diff --git a/current/driver/PrintEnv.java b/current/driver/PrintEnv.java
new file mode 100755
index 0000000..03099e4
--- /dev/null
+++ b/current/driver/PrintEnv.java
@@ -0,0 +1,36 @@
+package driver;
+
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.Map;
+
+/**
+ * @author Brian Bushnell
+ * @date Apr 4, 2013
+ *
+ */
+public class PrintEnv {
+	
+	/**
+	 * Prints the current time, every environment variable sorted by name, and the local hostname.
+	 * @param args Unused
+	 */
+	public static void main(String[] args){
+		
+		final Date now=new Date();
+		System.out.println("Time: "+now.getTime()+" = "+now+"\n");
+		
+		final Map<String, String> env=System.getenv();
+		final ArrayList<String> names=new ArrayList<String>(env.size());
+		names.addAll(env.keySet());
+		Collections.sort(names);
+		for(int i=0; i<names.size(); i++){
+			final String name=names.get(i);
+			System.out.println(name+"\t"+env.get(name));
+		}
+		
+		try{
+			final String host=java.net.InetAddress.getLocalHost().getHostName();
+			System.out.println("Hostname of local machine: " + host);
+		}catch(UnknownHostException e){
+			//The hostname is informational only, so report the failure and finish normally.
+			e.printStackTrace();
+		}
+	}
+	
+}
diff --git a/current/driver/ReduceSilva.java b/current/driver/ReduceSilva.java
new file mode 100755
index 0000000..b6c2d44
--- /dev/null
+++ b/current/driver/ReduceSilva.java
@@ -0,0 +1,330 @@
+package driver;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class ReduceSilva {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Command-line entry point: starts a timer, parses the arguments, and runs the filter.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		final Timer timer=new Timer();
+		new ReduceSilva(args).process(timer);
+	}
+
+ /**
+ * Constructor. Parses the command line, applies global stream settings, and creates
+ * the input and output file formats.
+ * Flags handled here, after the shared Parser has had its chance: "verbose" and "column".
+ * @param args Command line arguments
+ * @throws RuntimeException if no input file is given or the output cannot be written
+ */
+ public ReduceSilva(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+ //Global (static) settings; these affect every stream created afterwards in this JVM.
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Only the text between the first and second '=' is used as the value.
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("column")){
+ column=Integer.parseInt(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ //NOTE(review): this branch looks like a leftover template placeholder; it accepts the flag and does nothing.
+ }else{
+ //With assertions disabled, an unknown parameter is only reported, not fatal.
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ //The overwrite and append settings are also published to the static ReadStats fields.
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ in1=parser.in1;
+
+ out1=parser.out1;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ //Unless a ByteFile mode was already forced, force BF2 when more than 2 threads are available.
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+
+ //FASTQ is the format passed to both calls; extin/extout are the extensions taken from the parser.
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ }
+
+	/**
+	 * Handles the read-limit flags "reads" and "maxreads".
+	 * NOTE(review): "some_argument" also sets the read limit, which looks like a leftover template
+	 * placeholder; no caller of this method is visible in this class.
+	 * @param arg Full argument text (unused)
+	 * @param a Lower-cased flag name
+	 * @param b Flag value
+	 * @return True if the flag was recognized and applied
+	 */
+	public boolean parseArgument(String arg, String a, String b){
+		final boolean recognized=(a.equals("reads") || a.equals("maxreads") || a.equals("some_argument"));
+		if(recognized){
+			maxReads=Tools.parseKMG(b);
+		}
+		return recognized;
+	}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Create read streams and process all data, then print a summary of reads and bases
+	 * processed and retained.
+	 * @param t Timer started by the caller; stopped here after the streams are closed
+	 * @throws RuntimeException if any stream or statistics writer reported an error
+	 */
+	void process(Timer t){
+		
+		final ConcurrentReadInputStream cris;
+		{
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+			cris.start();
+			if(verbose){outstream.println("Started cris");}
+		}
+		boolean paired=cris.paired();
+		if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+		
+		final ConcurrentReadOutputStream ros;
+		if(out1!=null){
+			final int buff=4;
+			
+			//There is only one input, so one comparison suffices (the condition was listed twice).
+			assert(!out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+			
+			ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
+			ros.start();
+		}else{ros=null;}
+		
+		readsProcessed=0;
+		basesProcessed=0;
+		
+		readsOut=0;
+		basesOut=0;
+		
+		//Process the read stream
+		processInner(cris, ros);
+		
+		//Close the streams once and keep the result. The streams used to be closed twice, and
+		//the outcome of the first close was discarded.
+		errorState|=ReadWrite.closeStreams(cris, ros);
+		if(verbose){outstream.println("Finished.");}
+		
+		errorState|=ReadStats.writeAll();
+		
+		t.stop();
+		
+		//Rates are per unit of t.elapsed; the multipliers below scale them for display.
+		double rpnano=readsProcessed/(double)(t.elapsed);
+		double bpnano=basesProcessed/(double)(t.elapsed);
+		
+		String rpstring=""+readsProcessed;
+		String bpstring=""+basesProcessed;
+		String rostring=""+readsOut;
+		String bostring=""+basesOut;
+		
+		final int digits=10;
+		while(rpstring.length()<digits){rpstring=" "+rpstring;}
+		while(bpstring.length()<digits){bpstring=" "+bpstring;}
+		while(rostring.length()<digits){rostring=" "+rostring;}
+		while(bostring.length()<digits){bostring=" "+bostring;}
+		
+		outstream.println("Time:                         \t"+t);
+		outstream.println("Reads Processed:    "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+		outstream.println("Bases Processed:    "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+		outstream.println();
+		outstream.println("Reads Out:          "+rostring+" \t"+String.format("%.2f%%", readsOut*100.0/readsProcessed));
+		outstream.println("Bases Out:          "+bostring+" \t"+String.format("%.2f%%", basesOut*100.0/basesProcessed));
+		
+		if(errorState){
+			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+		}
+	}
+
+ /**
+ * Iterate through the reads.
+ * Fetches lists from the input stream until an empty or null list arrives, counts every read,
+ * nulls out the reads rejected by processRead, and passes each list to the output stream (if any).
+ * @param cris Input stream; every fetched list is handed back through returnList
+ * @param ros Output stream, or null to only count
+ */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+
+ //The counters are reset here as well as in process().
+ readsProcessed=0;
+ basesProcessed=0;
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //Sanity check on the first read: unless the input is sam/bam, having a mate must match cris.paired().
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+
+ final int initialLength1=r1.length();
+
+ //Only r1 is counted; a mate, if present, is not examined here.
+ {
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ }
+
+ boolean keep=processRead(r1);
+ if(keep){
+ readsOut++;
+ basesOut+=initialLength1;
+ }else{
+ //Rejected reads are replaced by null in the list that is passed on.
+ reads.set(idx, null);
+ }
+
+ }
+
+ //The list is handed to the writer under the same id it was fetched with.
+ if(ros!=null){ros.add(reads, ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //Return the final list that ended the loop, if there was one.
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Process a single read pair.
+ * @param r1 Read 1
+ * @return True if the read should be kept, false if it should be discarded.
+ */
+ boolean processRead(final Read r1){
+ String[] split=r1.id.split(";");
+ if(split.length<=column){return true;}
+ String taxa=split[split.length-column-1];
+ boolean present=table.contains(taxa);
+ if(present){return false;}
+ table.add(taxa);
+ return true;
+ }
+
+	/**
+	 * Prints a short usage message to the message stream.
+	 * Called for help requests and when no input file was given. The previous placeholder threw
+	 * RuntimeException("TODO"), which broke the help flag and hid the missing-input error.
+	 */
+	private void printOptions(){
+		outstream.println("Usage: "+getClass().getName()+" in=<file> out=<file> column=<int>");
+		outstream.println("Keeps only the first read seen for each value of one semicolon-delimited field of the read name.");
+		outstream.println("column=1       Field to use, counted back from the last field (0 is the last field).");
+		outstream.println("verbose=f      Print progress messages.");
+	}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private HashSet<String> table=new HashSet<String>(); //Labels already seen by processRead
+
+ private String in1=null; //Input path
+
+ private String out1=null; //Output path; the literal "null" is converted to null in the constructor
+
+ private String extin=null; //Input extension taken from the parser
+ private String extout=null; //Output extension taken from the parser
+
+ private int column=1; //Which semicolon-delimited field of the read id is the label, counted back from the last field
+
+ /*--------------------------------------------------------------*/
+
+ protected long readsProcessed=0; //Reads examined
+ protected long basesProcessed=0; //Sum of the lengths of the examined reads
+
+ protected long readsOut=0; //Reads kept
+ protected long basesOut=0; //Sum of the lengths of the kept reads
+
+ private long maxReads=-1; //Read limit passed to the input stream
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1; //Format for in1
+
+ private final FileFormat ffout1; //Format for out1
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination for status messages
+ public static boolean verbose=false; //Print progress messages
+ public boolean errorState=false; //Set when a stream reports an error; process() then throws
+ private boolean overwrite=false; //Copied from the parser; passed to the output-file checks
+ private boolean append=false; //Copied from the parser; passed to the output-file checks
+
+}
diff --git a/current/driver/Sample.java b/current/driver/Sample.java
new file mode 100755
index 0000000..92a0235
--- /dev/null
+++ b/current/driver/Sample.java
@@ -0,0 +1,75 @@
+package driver;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 13, 2015
+ *
+ * This class will read a file and write it to another file.
+ *
+ */
+public class Sample {
+
+ /** Primary method, called by java */
+ public static void main(String[] args){
+
+ String fnameIn=args[0];
+ String fnameOut=args[1];
+
+ BufferedReader br=getReader(fnameIn);
+ PrintWriter pw=getWriter(fnameOut);
+
+ try {
+ processData(br, pw);
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+
+ }
+
+ /** Do stuff */
+ static void processData(BufferedReader br, PrintWriter pw) throws IOException{
+ for(String s=br.readLine(); s!=null; s=br.readLine()){
+ //Parsing goes here
+ pw.println(s);
+ }
+ }
+
+ /** Fetches a BufferedReader, which allows line-by-line String iteration over text files */
+ static BufferedReader getReader(String fname){
+ FileInputStream fis=null;
+ try {
+ fis=new FileInputStream(fname);
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+ InputStreamReader isr=new InputStreamReader(fis);
+ BufferedReader br=new BufferedReader(isr);
+ return br;
+ }
+
+ /** Fetches a PrintWriter, which transforms Strings into a byte stream. */
+ static PrintWriter getWriter(String fname){
+ FileOutputStream fos=null;
+ try {
+ fos=new FileOutputStream(fname);
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+ BufferedOutputStream bos=new BufferedOutputStream(fos);
+ PrintWriter pw=new PrintWriter(bos);
+ return pw;
+ }
+
+}
diff --git a/current/driver/Search.java b/current/driver/Search.java
new file mode 100755
index 0000000..6170a7c
--- /dev/null
+++ b/current/driver/Search.java
@@ -0,0 +1,167 @@
+package driver;
+import java.util.ArrayList;
+import java.util.List;
+
+import dna.Data;
+import dna.Gene;
+import dna.Range;
+
+
+public class Search {
+
+ /** Find genes in the array that overlap point "p" */
+ public static List<Gene> findGenes(int p, Gene[] genes){
+ ArrayList<Gene> list=new ArrayList<Gene>(16);
+
+ for(int i=0; i<genes.length; i++){
+ Gene g=genes[i];
+ if(g.intersectsCode(p)){
+ list.add(g);
+ }
+ }
+
+ return list;
+ }
+
+ /** Find genes in the array that overlap point "p" */
+ public static List<Gene> findGenesBinary(int p, Range[] ranges, boolean nearby){
+ ArrayList<Gene> list=null;
+ int a=findPointBinary(p, ranges);
+
+ Range r=ranges[a];
+
+// System.out.println("Searching for "+p+" in "+r+"; previous range was "+ranges[a-1]);
+ if(!r.includes(p)){return list;}
+
+ list=new ArrayList<Gene>(16);
+
+ Gene[] genes2=(Gene[])r.obj1;
+ assert(genes2.length>0);
+
+// System.out.println("Found "+genes2.length+" to consider.");
+
+
+ //TODO: Specify whether tx or code (etc) coverage is needed.
+ for(int i=0; i<genes2.length; i++){
+ Gene g=genes2[i];
+// System.out.print("Does p overlap gene "+g.codeStart+" - "+g.codeEnd+"?");
+ if(g.txStart>r.b+Data.NEAR){break;}
+ if(nearby){
+ if(g.intersectsNearby(p, p)){list.add(g);}
+ }else{
+ if(g.intersectsCode(p)){list.add(g);}
+ }
+ }
+
+ return list;
+ }
+
+ /** Find genes in the array that overlap point "p" */
+ public static List<Gene> findGenesLinear(int p, Gene[] genes, Range[] ranges){
+ ArrayList<Gene> list=null;
+ int a=findPointLinear(p, ranges);
+
+ Range r=ranges[a];
+
+// System.out.println("Searching for "+p+" in "+r+"; previous range was "+ranges[a-1]);
+ if(!r.includes(p)){return list;}
+
+ list=new ArrayList<Gene>(16);
+
+ Gene[] genes2=(Gene[])r.obj1;
+ assert(genes2.length>0);
+
+// System.out.println("Found "+genes2.length+" to consider.");
+
+
+ //TODO: Specify whether tx or code (etc) coverage is needed.
+ for(int i=0; i<genes2.length; i++){
+ Gene g=genes2[i];
+// System.out.print("Does p overlap gene "+g.codeStart+" - "+g.codeEnd+"?");
+ if(g.txStart>r.b){break;}
+ if(g.intersectsCode(p)){
+ list.add(g);
+// System.out.println(" Yes.");
+ }
+ }
+
+ return list;
+ }
+
+ public static int findPointLinear(int p, Range[] array){
+ for(int i=0; i<array.length; i++){
+ Range r=array[i];
+ if(r.a>p){return i;} //Fail.
+ if(r.includes(p)){return i;} //Success.
+ }
+ return array.length-1;
+ }
+
+ public static int findPointBinary(int p, Range[] array){
+ assert(array!=null);
+ if(array.length==0){return 0;}
+ int result=findPointBinary(p, 0, max(0, array.length-1), array);
+
+ //TODO: Assertions
+
+ return result;
+ }
+
+ public static boolean containsPointBinary(int p, Range[] array, int thresh){
+ assert(array!=null);
+ if(array.length==0){return false;}
+ int rnum=findPointBinary(p, 0, max(0, array.length-1), array);
+
+ int p1=p-thresh, p2=p+thresh;
+ Range r=array[rnum];
+ if(p2>=r.a && p1<=r.b){return true;}
+
+ if(rnum==0 && p<r.a){return false;}
+
+ assert(p>r.b) : "\n\n"+p+"\t"+rnum+"/"+array.length+"\t"+r+"\n\n"; //Otherwise, it violated the search contract.
+ if(array.length<=rnum+1){return false;}
+
+ Range r2=array[rnum+1];
+ assert(r2.a>p) : "\n\n"+p+"\t"+rnum+"/"+array.length+"\t"+r+"\n\n"; //Otherwise, it violated the search contract.
+ return (p2>=r.a && p1<=r.b);
+ }
+
+ public static int findPointBinary(int p, int a, int b, Range[] array){
+ if(a>=b){
+
+ //This line should ensure that p>array[a] when p is not within any range.
+ //Except, of course, when p<(all ranges).
+ //In other words, the return is strictly the LEFT (LOWER) index when p is between two ranges.
+ if(a>0 && p<array[a].a){a--;}
+
+ assert(a>=0);
+ assert(a<array.length);
+ assert(array[a].includes(p) || (a==0 && p<array[a].a) ||
+ (p>array[a].b && (a==array.length-1 || p<array[a+1].a))) :
+ "a="+a+", b="+b+", p="+p+", array[a]="+array[a];
+ return a;
+ }
+
+ int mid=(a+b)/2;
+ Range r=array[mid];
+
+ if(r.a>p){
+ return findPointBinary(p, a, mid-1, array);
+ }else if(r.b<p){
+ return findPointBinary(p, mid+1, b, array);
+ }else{
+ return mid;
+ }
+ }
+
+
+ public static boolean overlaps(int a, Gene g){
+ return a>=g.txStart && a<=g.txStop;
+ }
+ private static final int min(int x, int y){return x<y ? x : y;}
+ private static final int max(int x, int y){return x>y ? x : y;}
+ private static final long min(long x, long y){return x<y ? x : y;}
+ private static final long max(long x, long y){return x>y ? x : y;}
+
+
+}
diff --git a/current/driver/SelectReads.java b/current/driver/SelectReads.java
new file mode 100755
index 0000000..e86b298
--- /dev/null
+++ b/current/driver/SelectReads.java
@@ -0,0 +1,74 @@
+package driver;
+
+import stream.SamLine;
+import align2.Shared;
+import align2.Tools;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ *
+ * Selects only reads with long deletions
+ *
+ * @author Brian Bushnell
+ * @date Jun 21, 2013
+ *
+ */
+public final class SelectReads {
+    /**
+     * Copies header lines ('@') and the alignment lines that pass testLine from a sam file to a new file.
+     * @param args input, output, then optionally: symbol (default D), minimum length (default 1), maximum lines to test
+     */
+    public static void main(String[] args){
+        assert(args.length>=2) : "Need 2 file names: <input> <output>";
+        assert(!args[0].equalsIgnoreCase(args[1])) : "File names must be different.";
+
+        ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+        ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+        ReadWrite.ZIP_THREAD_DIVISOR=1;
+
+        int minlen=1;
+        long reads=Long.MAX_VALUE;
+        char symbol='D';
+        if(args.length>2){symbol=args[2].charAt(0);}
+        if(args.length>3){minlen=Integer.parseInt(args[3]);}
+        if(args.length>4){reads=Tools.parseKMG(args[4]);}
+
+        //Fold cigar symbols onto the five classes M, S, D, I, C.  The tests must be mutually exclusive:
+        //X means substitution (S), but a literal S is a cigar soft-clip (C), so X must not be re-tested as S.
+        symbol=Character.toUpperCase(symbol);
+        if(symbol=='='){symbol='M';}
+        else if(symbol=='X'){symbol='S';}
+        else if(symbol=='N'){symbol='D';}
+        else if(symbol=='S' || symbol=='H' || symbol=='P'){symbol='C';}
+
+        final int index=Tools.indexOf(new char[] {'M','S','D','I','C'}, symbol);
+        assert(index>=0) : "Symbol (3rd argument) must be M, S, D, I, C (for match string symbols) or M, =, X, D, N, I, S, H, P (for cigar symbols).";
+
+        TextFile tf=new TextFile(args[0], true, false);
+        TextStreamWriter tsw=new TextStreamWriter(args[1], false, false, true);
+        tsw.start();
+
+        for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+            if(line.charAt(0)=='@'){
+                tsw.println(line);
+            }else{
+                if((reads=reads-1)<0){break;}
+                SamLine sl=new SamLine(line);
+                if(testLine(sl, minlen, index)){tsw.println(line);}
+            }
+        }
+        tf.close();
+        tsw.poison();
+        tsw.waitForFinish();
+    }
+
+    /** Returns true if the line is mapped, has a cigar, and cigarToMdsiMax reports at least minlen at the given index. */
+    private static boolean testLine(SamLine sl, int minlen, int index){
+        assert(sl!=null);
+        if(!sl.mapped() || sl.cigar==null){return false;}
+        int[] msdic=sl.cigarToMdsiMax(sl.cigar);
+        return (msdic!=null && msdic[index]>=minlen);
+    }
+}
diff --git a/current/driver/SniffSplices.java b/current/driver/SniffSplices.java
new file mode 100755
index 0000000..4dad5d2
--- /dev/null
+++ b/current/driver/SniffSplices.java
@@ -0,0 +1,198 @@
+package driver;
+
+import java.util.ArrayList;
+
+import dna.AminoAcid;
+import dna.Motif;
+import dna.MotifProbsN;
+
+public class SniffSplices {
+    /** Prints, for each sequence given on the command line, the strength of the selected motif at every position. */
+    public static void main(String[] args){
+
+// MotifProbsN mAG=MotifProbsN.makeMotif("AG Exon Starts MP2", 11, 13, 11, 2);
+// MotifProbsN mGT=MotifProbsN.makeMotif("GT Exon Stops MP2", 3, 10, 3, 2);
+//
+// MotifProbsN eStarts2=MotifProbsN.makeMotif("Exon Starts MP2", 9, 11, 9, 2);
+// MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP2", 3, 10, 3, 2);
+//
+// MotifProbsN gStarts2=MotifProbsN.makeMotif("Gene Starts MP2", 9, 11, 9, 2);
+// MotifProbsN gStops2=MotifProbsN.makeMotif("Gene Stops MP2", 3, 10, 3, 2);
+
+        //Default motif; a selector on the command line replaces it.
+        Motif m=eStops2;
+// Motif m=eStarts2;
+// Motif m=eStarts2_15;
+
+        //Arguments that are neither the rcomp flag nor a motif selector are collected here as sequences.
+        ArrayList<String> list=new ArrayList<String>();
+
+        boolean rcomp=false;
+        if(args.length>0){
+            for(String s1 : args){
+                String s=s1.toLowerCase();
+                //One chain, so that the "rcomp" flag is not also added to the list as if it were a sequence.
+                if(s.equalsIgnoreCase("rcomp")){rcomp=true;}
+                else if(s.contains("estart_ac")){m=eStarts2_AC;}
+                else if(s.contains("estart_15")){m=eStarts2_15;}
+                else if(s.contains("estart")){m=eStarts2;}
+                else if(s.contains("estop_gc")){m=eStops2_GC;}
+                else if(s.contains("estop")){m=eStops2;}
+                else if(s.contains("gstart")){m=gStarts2;}
+                else if(s.contains("gstop")){m=gStops2;}
+                else{list.add(s.toUpperCase());}
+            }
+        }
+
+
+        System.out.println("Using motif "+m);
+
+        int initialLoc=0;
+        int increment=1; //1 for plus strand, -1 for minus strand
+
+// String s="NNNNNNNNAGCGGGAATCGGGGGGTCCTTCTGCTCCCCTGAGCGTCCTTCCTGTGTTCCCAGGC"+
+// "ACTATCGCCTACCTGTTTTTCACCAACCGCCACGAGGTGAGGAAGATGACCCTGGACCGAAGCGAATACACCAGCCTCAT"+
+// "CCCAAACTTGAAGAACGTGGTCGCCCTGGACACCGAGGTGGCCAGCAACAGAATATACTGGTCCGACCTGTCCCAAAGGA"+
+// "AGATCTACAGGTGAGCCTTGGAGCCACACCCAGCGCTCAACCCCCGGTGGCGCGGGGGCCCCTCTCACTGACGCTCTCCT"+
+// "TCCCCTGCTCCTCCCCCTCAGCACCCAAATCGACAGAGCCCCCGGCTTCTCCTCCTATGACACCGTCGTCAGCGAGGACC"+
+// "TCCAGGCCCCTGATGGGCTGGCGGTGGACTGGATCCACAGCAACATATACTGGACAGACTCCATCCTGGGCACCGTCTCC"+
+// "GTGGCCGACACCAAGGGCGTGAAGAGAAAGACGCTCTTCAAGGAGAAAGGCTCTAAGCCACGTGCCATCGTGGTGGATCC"+
+// "CGTTCACGGGTGGGTGCTGCTAAAGCCGAGGGCCACGGAAGGAANNNNNNNN";
+
+ // "AAGTACAGGAATTATATGCCCCCAGGTAA * AGTACAGGAATTATATGCCCCCAGGTAAC"
+// String[] array={
+// "GCCTACTTTGTATGATGACCCTGTCCT",
+// "AGCCCTGGCCGCCTACTTTGTATGATGACCCTGTCCTCCCTCACCCA",
+// };
+// String[] array={
+// "TGGCCGCCGCCGACCGTAAGTTTTGCGCGCAAACTCCC",
+// "TGGCCGCCGCCGACCGTTAAGTTTTGCGCGCAAACTCCC",
+// };
+// String[] array={
+// "CAACTGCCAAGGGAAGGGCACGGTTAGCGGCACCCTCATAGGTAAGTGATGGCCCCAGACGCTGGTCTCTCTCCATCTGGACCTGGCCTGGGAGGTGGCTTGG",
+// "CAACTGCCAAGGGAAGGGCACGGTTAGCGGCACCCTCATAGGTGAGTGATGGCCCCAGACGCTGGTCTCTCTCCATCTGGACCTGGCCTGGGAGGTGGCTTGG",
+// };
+
+// String[] array={
+// "GTCTTTCTCATGTGGTCCTTGTGTTCGTCGAGCAGGCCAGCAAGTGTGACAGTCATGGCACCCACCTGGCAGGGG",
+// "GTCTTTCTCATGTGGTCCTTGTGTTCGTTGAGCAGGCCAGCAAGTGTGACAGTCATGGCACCCACCTGGCAGGGG",
+// };
+
+// String[] array={
+// "GCAGGGTCATGGTCACCGACTTCGAGAATGTGCCCGAGGAGGACGGGACCCGCCTCCACAGACAGGTAAGCACAGCCGTCTGATGGGAGGGCTGCCTCTGCCCATATCCCCATCCTGGAG",
+// "GCAGGGTCATGGTCACCGACTTCGAGAATGTGCCCGAGGAGGACGGGACCCGCTTCCACAGACAGGTAAGCACGGCCGTCTGATGGGAGGGCTGCCTCTGCCCATATCCCCATCCTGGAG",
+// };
+
+
+// String[] array={
+// "RTGTTTTCACTCCAGCCACGGAGCTGGGTCTCTGGTCTCGGGGGCAGCTGTGTGACAGAGCGT" +
+// "GCCTCTCCCTACAGTGCTCTTCGTCTTCCTTTGCCTGGGGGTCTTCCTTCTATGGAAGAACTG",
+// "RTGTTTTCACTCCAGCCACGGAGCTGGGTCTCTGGTCTCGGGGGCAGCTGTGTGACAGAGCGT" +
+// "GCCTCTCCTTACAGTGCTCTTCGTCTTCCTTTGCCTGGGGGTCTTCCTTCTATGGAAGAACTG",
+// };
+
+// String[] array={
+//// "CAGCGAAGATGCGAAGGTGATTCCCGGGTGGG",
+//// "CAGCGAAGATGCGAAGGTGATTTCCGGGTGGG",
+// "GCGGCCGAAGCGGGCCATGGACGCGCTCAAGT",
+// "GCGGCCGGAGCGGGCCATGGACGCGCTCAAGT",
+// };
+
+
+// String[] array={
+// "AAGTATGTTTTTGCTTTTAGGAGGATTCTCT",
+// "AAGTATGTTTTTGTTTTTAGGAGGATTCTCT",
+// };
+
+// String[] array={
+// "TTAGGTTGCTGGTGTCTGTATAATGTGTGT"+
+// "A"+
+// "TCTTTGTTGCAGGTTTGTTTTTTATTCTGC",
+//
+// "TTAGGTTGCTGGTGTCTGTATAATGTGTGT"+
+// "G"+
+// "TCTTTGTTGCAGGTTTGTTTTTTATTCTGC"
+// };
+
+// ATGTATTCTACTTTT[TCTTTT]AAGTATGTTTTTGTTTTTAGGAGGATTCTCTATGG
+
+// String[] array={
+// "CAGGTCCTCGAGATCCTGGGATACAGGAAA",
+// "CAGGTCCTCGAGATCCTGGGATATAGGAAA"
+// };
+
+// String[] array={
+// "TGTTTTTGCTTTTAGGAGGATTCTCTATG",
+// "TGTTTTTGTTTTTAGGAGGATTCTCTATG"
+// };
+
+
+
+        for(String s : list){
+            if(rcomp){s=AminoAcid.reverseComplementBases(s);}
+            System.out.println("For string "+s+":");
+
+            if(!s.startsWith("N") || !s.endsWith("N")){
+                s="NNNN"+s+"NNNN";
+            }
+            byte[] code=s.getBytes();
+
+            for(int i=0; i<s.length(); i++){
+
+                float strength=m.matchStrength(code, i);
+                float norm=m.normalize(strength);
+                float percent=-1;
+                try {
+                    percent=m.percentile(norm);
+                } catch (Exception e) {
+                    //Deliberately ignored: percent stays at -1,
+                    //and the percentile column below is then simply not printed.
+                }
+
+                System.out.print((initialLoc+i*increment)+"\t");
+
+                System.out.print(s.charAt(i)+" Strength = "+String.format("%.4f ",norm));
+                if(percent!=-1){System.out.print(String.format("-> %.4f ",percent));}
+                float norm2=norm;
+                while(norm2>0.1f){
+                    norm2-=.1f;
+                    System.out.print("*");
+                }
+
+// System.out.print("\t"+String.format("%.3f ",m.percentile(norm)));
+
+                System.out.println();
+
+            }
+
+        }
+
+    }
+
+
+    private static final int N_MOTIF=2;
+
+// private static final MotifProbsN eStarts2=MotifProbsN.makeMotif("Exon Starts MP"+N_MOTIF, 12, 9, 2);
+//// private static final MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP"+N_MOTIF, 3, 11, 3, 2);
+// private static final MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP"+N_MOTIF, 12, 3, 2);
+//
+// private static final MotifProbsN gStarts2=MotifProbsN.makeMotif("Gene Starts MP"+N_MOTIF, 13, 9, 2);
+// private static final MotifProbsN gStops2=MotifProbsN.makeMotif("Gene Stops MP"+N_MOTIF, 11, 3, 2);
+//
+// private static final MotifProbsN trStarts2=MotifProbsN.makeMotif("Tr Starts MP"+N_MOTIF, 12, 7, 2);
+// private static final MotifProbsN trStops2=MotifProbsN.makeMotif("Tr Stops MP"+N_MOTIF, 11, 6, 2);
+
+    private static final MotifProbsN eStarts2=MotifProbsN.makeMotif("Exon Starts MP"+N_MOTIF, 13, 9, 2);
+    private static final MotifProbsN eStarts2_AC=MotifProbsN.makeMotif("AC Exon Starts MP"+N_MOTIF, 13, 9, 2);
+    private static final MotifProbsN eStarts2_15=MotifProbsN.makeMotif("Exon Starts MP"+N_MOTIF, 19, 15, 2);
+    private static final MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP"+N_MOTIF, 13, 4, 2);
+    private static final MotifProbsN eStops2_GC=MotifProbsN.makeMotif("GC Exon Stops MP"+N_MOTIF, 13, 4, 2);
+
+    private static final MotifProbsN gStarts2=MotifProbsN.makeMotif("Gene Starts MP"+N_MOTIF, 13, 9, 2);
+    private static final MotifProbsN gStops2=MotifProbsN.makeMotif("Gene Stops MP"+N_MOTIF, 13, 4, 2);
+
+    private static final MotifProbsN trStarts2=MotifProbsN.makeMotif("Tr Starts MP"+N_MOTIF, 13, 7, 2);
+    private static final MotifProbsN trStops2=MotifProbsN.makeMotif("Tr Stops MP"+N_MOTIF, 13, 7, 2);
+
+
+}
diff --git a/current/driver/SummarizeCoverage.java b/current/driver/SummarizeCoverage.java
new file mode 100755
index 0000000..96f286d
--- /dev/null
+++ b/current/driver/SummarizeCoverage.java
@@ -0,0 +1,113 @@
+package driver;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import align2.Tools;
+
+import dna.Parser;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Apr 29, 2015
+ *
+ */
+public class SummarizeCoverage {
+
+    /**
+     * Code entrance from the command line.
+     * @param args Command line arguments
+     */
+    public static void main(String[] args){
+        args=Parser.parseConfig(args);
+        if(Parser.parseHelp(args, true)){
+            assert(false);
+            System.exit(0);
+        }
+        //Build a SummarizeCoverage instance from the arguments and run it
+        new SummarizeCoverage(args).process();
+    }
+
+    /**
+     * Parses the arguments and resolves them into the list of input files.
+     * An argument without '=' that the Parser does not recognize is taken as input name(s).
+     * @param args Command line arguments
+     */
+    public SummarizeCoverage(String[] args){
+        final Parser parser=new Parser();
+        final ArrayList<String> names=new ArrayList<String>();
+
+        /* Parse arguments */
+        for(int i=0; i<args.length; i++){
+            final String arg=args[i];
+            final String[] split=arg.split("=");
+            String a=split[0].toLowerCase();
+            String b=(split.length>1 ? split[1] : null);
+            if("null".equalsIgnoreCase(b)){b=null;}
+            //Strip leading hyphens, unless the key contains '.', is one of the first two arguments, and names an existing file
+            while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+            if(parser.parse(arg, a, b)){continue;}
+            if(arg.contains("=")){throw new RuntimeException("Unknown parameter "+arg);}
+            addNames(arg, names);
+        }
+
+        //Process parser fields
+        out=(parser.out1==null ? "stdout" : parser.out1);
+        if(parser.in1!=null){addNames(parser.in1, names);}
+
+        in=new ArrayList<String>();
+        for(String name : names){
+            Tools.getFileOrFiles(name, in, false, false, false, true);
+        }
+    }
+
+    /** Adds s itself if it names an existing file; otherwise adds each of its comma-delimited parts. */
+    private static void addNames(String s, ArrayList<String> names){
+        final String[] parts=(new File(s).exists() ? new String[] {s} : s.split(","));
+        for(String part : parts){names.add(part);}
+    }
+
+    /**
+     * Writes a header, then one summary line per input file.
+     * A data line becomes the primary when no primary with a nonzero count exists yet, or when its
+     * field 2 value (0-based) is larger (ties: larger field 5 count); everything else is summed as "other".
+     */
+    public void process(){
+        final TextStreamWriter tsw=new TextStreamWriter(out, true, false, false);
+        tsw.start();
+        tsw.print("#File\tPrimary_Name\tPrimary_Count\tOther_Count\tPrimary_MB\tOther_MB\n");
+        for(String fname : in){
+            String pname=null;
+            long pcount=0, ocount=0;
+            double pmb=0, omb=0;
+            final TextFile tf=new TextFile(fname);
+            for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+                if(line.startsWith("#")){continue;} //Lines starting with '#' carry no data
+                final String[] fields=line.split("\t");
+                final long count=Long.parseLong(fields[5]);
+                final double mb=Double.parseDouble(fields[2]);
+                final boolean newPrimary=(pcount==0 || mb>pmb || (mb==pmb && count>pcount));
+                if(newPrimary){
+                    //The previous primary is demoted into the "other" totals
+                    ocount+=pcount;
+                    omb+=pmb;
+                    pname=fields[0];
+                    pcount=count;
+                    pmb=mb;
+                }else{
+                    ocount+=count;
+                    omb+=mb;
+                }
+            }
+            tf.close();
+            tsw.print(String.format("%s\t%s\t%d\t%d\t%.5f\t%.5f\n", fname, pname, pcount, ocount, pmb, omb));
+        }
+        tsw.poisonAndWait();
+    }
+
+    final ArrayList<String> in;
+    final String out;
+}
diff --git a/current/driver/SummarizeMSDIN.java b/current/driver/SummarizeMSDIN.java
new file mode 100755
index 0000000..e749427
--- /dev/null
+++ b/current/driver/SummarizeMSDIN.java
@@ -0,0 +1,122 @@
+package driver;
+
+import fileIO.TextFile;
+
+/**
+ * Summarizes match/sub/ins/del/N rates for consecutive BBMap runs
+ * @author Brian Bushnell
+ * @date Jan 8, 2014
+ *
+ */
+public class SummarizeMSDIN {
+
+ public static void main(String[] args){
+ String fname=args[0];
+ boolean M=false;
+ boolean E=false;
+ boolean S=true;
+ boolean D=false;
+ boolean I=false;
+ boolean N=false;
+ boolean B=false;
+ boolean MS=true;
+
+ long mcount=0;
+ long ecount=0;
+ long scount=0;
+ long dcount=0;
+ long icount=0;
+ long ncount=0;
+ long bcount=0;
+
+ TextFile tf=new TextFile(fname);
+ StringBuilder sb=new StringBuilder();
+ for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ String[] split=s.split("\t");
+ if(s.startsWith("Total time:")){
+ if(B){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append(bcount);
+ }
+ if(MS){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append((mcount+scount));
+ }
+ if(M){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append(mcount);
+ }
+ if(E){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append(ecount);
+ }
+ if(S){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append(scount);
+ }
+ if(D){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append(dcount);
+ }
+ if(I){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append(icount);
+ }
+ if(N){
+ if(sb.length()>0){sb.append('\t');}
+ sb.append(ncount);
+ }
+ System.out.println(sb);
+ sb.setLength(0);
+ mcount=ecount=scount=dcount=icount=ncount=bcount=0;
+ }else if(s.startsWith("Match Rate:")){
+ String x=split[split.length-1];
+ try{mcount=(Long.parseLong(x));}catch(Exception e){}
+ }else if(E && s.startsWith("Error Rate:")){
+ String x=split[split.length-1];
+// if(E){
+// if(sb.length()>0){sb.append('\t');}
+// sb.append(x);
+// }
+ try{ecount=(Long.parseLong(x));}catch(Exception e){}
+ }else if(s.startsWith("Sub Rate:")){
+ String x=split[split.length-1];
+// if(S){
+// if(sb.length()>0){sb.append('\t');}
+// sb.append(x);
+// }
+ try{scount=(Long.parseLong(x));}catch(Exception e){}
+ }else if(s.startsWith("Del Rate:")){
+ String x=split[split.length-1];
+// if(D){
+// if(sb.length()>0){sb.append('\t');}
+// sb.append(x);
+// }
+ try{dcount=(Long.parseLong(x));}catch(Exception e){}
+ }else if(s.startsWith("Ins Rate:")){
+ String x=split[split.length-1];
+// if(I){
+// if(sb.length()>0){sb.append('\t');}
+// sb.append(x);
+// }
+ try{icount=(Long.parseLong(x));}catch(Exception e){}
+ }else if(s.startsWith("N Rate:")){
+ String x=split[split.length-1];
+// if(N){
+// if(sb.length()>0){sb.append('\t');}
+// sb.append(x);
+// }
+ try{ncount=(Long.parseLong(x));}catch(Exception e){}
+ }else if(s.startsWith("Reads Used:")){
+ String x=split[split.length-1].replace("(", "").replace(" bases)", "");
+// if(B){
+// if(sb.length()>0){sb.append('\t');}
+// sb.append(x);
+// }
+ try{bcount=(Long.parseLong(x));}catch(Exception e){}
+ }
+ }
+
+ }
+
+}
diff --git a/current/driver/SummarizeSealStats.java b/current/driver/SummarizeSealStats.java
new file mode 100755
index 0000000..d123dba
--- /dev/null
+++ b/current/driver/SummarizeSealStats.java
@@ -0,0 +1,210 @@
+package driver;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import align2.Tools;
+
+import dna.Parser;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date May 8, 2015
+ *
+ */
+public class SummarizeSealStats {
+    /**
+     * Code entrance from the command line.
+     * @param args Command line arguments
+     */
+    public static void main(String[] args){
+        args=Parser.parseConfig(args);
+        if(Parser.parseHelp(args, true)){
+            assert(false);
+            System.exit(0);
+        }
+        //Build a SummarizeSealStats instance from the arguments and run it
+        new SummarizeSealStats(args).summarize();
+    }
+
+    /**
+     * Parses the arguments and resolves them into the list of input files.
+     * Recognized flags: ignoresametaxa, ignoresamebarcode (ignoresameindex), ignoresamelocation (ignoresameloc),
+     * and usetotal (totaldenominator, totald, td).  Other arguments without '=' are taken as input names.
+     * @param args Command line arguments
+     */
+    public SummarizeSealStats(String[] args){
+        final Parser parser=new Parser();
+        final ArrayList<String> names=new ArrayList<String>();
+
+        /* Parse arguments */
+        for(int i=0; i<args.length; i++){
+            final String arg=args[i];
+            final String[] split=arg.split("=");
+            String a=split[0].toLowerCase();
+            String b=(split.length>1 ? split[1] : null);
+            if("null".equalsIgnoreCase(b)){b=null;}
+            //Strip leading hyphens, unless the key contains '.', is one of the first two arguments, and names an existing file
+            while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+            if(parser.parse(arg, a, b)){
+                //do nothing
+            }else if(a.equals("ignoresametaxa")){
+                ignoreSameTaxa=Tools.parseBoolean(b);
+            }else if(a.equals("ignoresamebarcode") || a.equals("ignoresameindex")){
+                ignoreSameBarcode=Tools.parseBoolean(b);
+            }else if(a.equals("ignoresamelocation") || a.equals("ignoresameloc")){
+                ignoreSameLocation=Tools.parseBoolean(b);
+            }else if(a.equals("usetotal") || a.equals("totaldenominator") || a.equals("totald") || a.equals("td")){
+                totalDenominator=Tools.parseBoolean(b);
+            }else if(!arg.contains("=")){
+                addNames(arg, names);
+            }else{
+                throw new RuntimeException("Unknown parameter "+arg);
+            }
+        }
+
+        //Process parser fields
+        out=(parser.out1==null ? "stdout" : parser.out1);
+        if(parser.in1!=null){addNames(parser.in1, names);}
+
+        in=new ArrayList<String>();
+        for(String name : names){
+            Tools.getFileOrFiles(name, in, false, false, false, true);
+        }
+    }
+
+    /** Adds s itself if it names an existing file; otherwise adds each of its comma-delimited parts. */
+    private static void addNames(String s, ArrayList<String> names){
+        final String[] parts=(new File(s).exists() ? new String[] {s} : s.split(","));
+        for(String part : parts){names.add(part);}
+    }
+
+    /**
+     * Writes a header line, then one summary line per input file.
+     * cleanAndSummarizeFile is used when any of the ignoreSame* flags is set; summarizeFile otherwise.
+     */
+    public void summarize(){
+        final TextStreamWriter tsw=new TextStreamWriter(out, true, false, false);
+        tsw.start();
+        tsw.print("#File\tPrimary_Name\tPrimary_Count\tOther_Count\tPrimary_Bases\tOther_Bases\tOther_ppm\n");
+        for(String fname : in){
+            final boolean clean=(ignoreSameTaxa || ignoreSameBarcode || ignoreSameLocation);
+            tsw.print(clean ? cleanAndSummarizeFile(fname) : summarizeFile(fname));
+        }
+        tsw.poisonAndWait();
+    }
+
+    /**
+     * Summarizes one stats file, counting every non-primary entry as "other".
+     * A data line becomes the primary when no primary with a nonzero count exists yet, or when it has more bases (ties: larger count).
+     * @param fname File to read
+     * @return One tab-delimited, newline-terminated summary line
+     */
+    public String summarizeFile(String fname){
+        String pname=null;
+        long pcount=0, ocount=0, tcount=0;
+        long pbases=0, obases=0, tbases=0;
+        final TextFile tf=new TextFile(fname);
+        for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+            final boolean header=line.startsWith("#");
+            if(header && !line.startsWith("#Total")){continue;}
+            final String[] split=line.split("\t");
+            if(header){
+                tcount=Long.parseLong(split[1]);
+                tbases=Long.parseLong(split[2]);
+                continue;
+            }
+            final long count=Long.parseLong(split[1]);
+            final long bases=Long.parseLong(split[3]);
+            if(pcount==0 || bases>pbases || (bases==pbases && count>pcount)){
+                //The previous primary is demoted into the "other" totals
+                ocount+=pcount;
+                obases+=pbases;
+                pname=split[0];
+                pcount=count;
+                pbases=bases;
+            }else{
+                ocount+=count;
+                obases+=bases;
+            }
+        }
+        tf.close();
+        return formatSummary(fname, pname, pcount, ocount, pbases, obases, tbases);
+    }
+
+    /**
+     * Like summarizeFile, but entries matching the current primary under an enabled ignoreSame* test are left out of the "other" totals.
+     * Names are lowercased and split on ','; the first part is split again on '-' to obtain the barcode halves.
+     * @param fname File to read
+     * @return One tab-delimited, newline-terminated summary line
+     */
+    public String cleanAndSummarizeFile(String fname){
+        String pname=null;
+        long pcount=0, ocount=0, tcount=0;
+        long pbases=0, obases=0, tbases=0;
+        final TextFile tf=new TextFile(fname);
+        String[] name0=null, barcode0=null;
+        for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+            final boolean header=line.startsWith("#");
+            if(header && !line.startsWith("#Total")){continue;}
+            final String[] split=line.split("\t");
+            if(header){
+                tcount=Long.parseLong(split[1]);
+                tbases=Long.parseLong(split[2]);
+                continue;
+            }
+            final String[] name=split[0].toLowerCase().split(",");
+            final String[] barcode=name[0].split("-");
+            final long count=Long.parseLong(split[1]);
+            final long bases=Long.parseLong(split[3]);
+            if(pcount==0 || bases>pbases || (bases==pbases && count>pcount)){
+                name0=name;
+                barcode0=barcode;
+                pname=split[0];
+                ocount+=pcount;
+                obases+=pbases;
+                pcount=count;
+                pbases=bases;
+                continue;
+            }
+            //Each enabled test can veto the entry; all enabled tests are still evaluated.
+            boolean process=true;
+            if(ignoreSameTaxa && (name[2].contains(name0[2]) || name0[2].contains(name[2]))){process=false;}
+            if(ignoreSameBarcode && (barcode[0].equals(barcode0[0]) || barcode[1].equals(barcode0[1]))){process=false;}
+            if(ignoreSameLocation){
+                assert(name.length==4) : "Too many delimiters: "+name.length+"\n"+line+"\n";
+                if(name[3].equals(name0[3])){process=false;}
+            }
+            if(process){
+                ocount+=count;
+                obases+=bases;
+            }
+        }
+        tf.close();
+        return formatSummary(fname, pname, pcount, ocount, pbases, obases, tbases);
+    }
+
+    /**
+     * Computes Other_ppm and formats the summary line.
+     * The denominator is tbases when totalDenominator is set and tbases is positive; otherwise primary plus other bases.
+     */
+    private String formatSummary(String fname, String pname, long pcount, long ocount, long pbases, long obases, long tbases){
+        final double ppm;
+        if(totalDenominator && tbases>0){
+            ppm=obases*1000000.0/tbases;
+        }else{
+            ppm=(obases==0 ? 0 : obases*1000000.0/(obases+pbases));
+        }
+        return String.format("%s\t%s\t%d\t%d\t%d\t%d\t%.2f\n", fname, pname, pcount, ocount, pbases, obases, ppm);
+    }
+
+    final ArrayList<String> in;
+    final String out;
+    boolean ignoreSameTaxa=false;
+    boolean ignoreSameBarcode=false;
+    boolean ignoreSameLocation=false;
+    boolean totalDenominator=false;
+}
diff --git a/current/driver/TestCompressionSpeed.java b/current/driver/TestCompressionSpeed.java
new file mode 100755
index 0000000..2eca4c4
--- /dev/null
+++ b/current/driver/TestCompressionSpeed.java
@@ -0,0 +1,79 @@
+package driver;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.zip.ZipOutputStream;
+
+import dna.Timer;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+public class TestCompressionSpeed {
+
+    /**
+     * Times writing and then reading back a text file at zip levels 0 through 9.
+     * @param args args[0] is the input text file; args[1] is the output name, whose first '#' is replaced by the level
+     */
+    public static void main(String[] args){
+        final String[] lines=read(args[0]);
+        final Timer t=new Timer();
+        for(int level=0; level<=9; level++){
+            t.start();
+            final String fname=args[1].replaceFirst("#", ""+level);
+            compress(lines, fname, level);
+            t.stop();
+            System.out.println("Level "+level+" compress: "+t+" \tsize: "+new File(fname).length());
+        }
+
+        for(int level=0; level<=9; level++){
+            t.start();
+            final String fname=args[1].replaceFirst("#", ""+level);
+            final String[] lines2=read(fname);
+            assert(lines2.length>=lines.length);
+            t.stop();
+            System.out.println("Level "+level+" decompress: "+t);
+        }
+    }
+
+    /**
+     * Writes the text four times over to fname.
+     * @param text Lines to write
+     * @param fname Output file name
+     * @param level Value assigned to ReadWrite.ZIPLEVEL before the stream is opened
+     */
+    public static void compress(String[] text, String fname, int level){
+        ReadWrite.ZIPLEVEL=level;
+        final OutputStream os=ReadWrite.getOutputStream(fname, false, true, true);
+        final PrintWriter writer=new PrintWriter(os);
+
+        //Four passes over the same text
+        for(int pass=0; pass<4; pass++){
+            for(String s : text){writer.println(s);}
+        }
+
+        try {
+            writer.flush();
+            if(os.getClass()==ZipOutputStream.class){
+                //For a zip stream, close the entry and finish the archive before closing the writer
+                final ZipOutputStream zos=(ZipOutputStream)os;
+                zos.closeEntry();
+                zos.finish();
+            }
+            writer.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /** Reads all lines of a file into an array, then closes the file. */
+    public static String[] read(String fname){
+        final TextFile tf=new TextFile(fname, false, false);
+        final String[] s=tf.toStringLines();
+        tf.close();
+        return s;
+    }
+
+}
diff --git a/current/driver/TestLockSpeed.java b/current/driver/TestLockSpeed.java
new file mode 100755
index 0000000..31aa1f2
--- /dev/null
+++ b/current/driver/TestLockSpeed.java
@@ -0,0 +1,162 @@
+package driver;
+
+import java.io.File;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+
+import align2.Shared;
+import align2.Tools;
+
+import dna.Parser;
+import dna.Timer;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 25, 2014
+ *
+ */
+public class TestLockSpeed {
+ /** Runs the selected counting mode on every thread and prints the elapsed time, the final count and a rate. Arguments: mode=, threads= (or t=), max=. */
+ public static void main(String[] args){
+
+ int mode=0; //One of the mode constants declared at the bottom of the class
+ long max=1000000000; //Number of increments performed by each thread
+ int threads=Shared.threads();
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //do nothing
+ }else if(a.equals("mode")){
+ mode=Integer.parseInt(b);
+ }else if(a.equals("threads") || a.equals("t")){
+ threads=Integer.parseInt(b);
+ }else if(a.equals("max")){
+ max=Tools.parseKMG(b);
+ }else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+ //Pick the counter implementation that matches the mode; CountThread.run casts to the same type.
+ CountBox box;
+ if(mode==UNLOCKED || mode==LOCKED){
+ box=new LockBox();
+ }else if(mode==ATOMIC){
+ box=new AtomBox();
+ }else if(mode==VOLATILE || mode==FIELD || mode==STATICFIELD){
+ box=new VolatileBox();
+ }else{
+ throw new RuntimeException("Unknown mode "+mode);
+ }
+
+ ArrayList<CountThread> list=new ArrayList<CountThread>(threads);
+ for(int i=0; i<threads; i++){
+ list.add(new CountThread(box, max, mode));
+ }
+
+ Timer t=new Timer();
+ //Start every thread, then wait until each one has terminated.
+ for(CountThread ct : list){ct.start();}
+ for(CountThread ct : list){
+ while(ct.getState()!=State.TERMINATED){
+ try {
+ ct.join();
+ } catch (InterruptedException e) {
+ //Interrupted while waiting: report it; the enclosing loop keeps waiting for this thread.
+ e.printStackTrace();
+ }
+ }
+ }
+
+ t.stop();
+ System.out.println("Time: \t"+t);
+ System.out.println("Count: \t"+box.value());
+ System.out.println("Speed: \t"+String.format("%.3f", (threads*max*1.0)/(t.elapsed))+" giga per second"); //NOTE(review): "giga" presumably assumes t.elapsed is in nanoseconds - confirm
+
+ }
+ /** Thread that applies the increment operation selected by mode, max times, to the shared CountBox. */
+ static class CountThread extends Thread{
+
+ public CountThread(CountBox box_, long max_, int mode_){
+ box0=box_;
+ max=max_;
+ mode=mode_;
+ }
+ /** Runs the loop for this thread's mode; an unrecognized mode does nothing. */
+ public void run(){
+ if(mode==UNLOCKED){
+ final LockBox box=(LockBox)box0;
+ for(long i=0; i<max; i++){
+ box.counter++; //Plain field increment with no synchronization
+ }
+ }else if(mode==LOCKED){
+ final LockBox box=(LockBox)box0;
+ for(long i=0; i<max; i++){
+ box.increment(); //synchronized method
+ }
+ }else if(mode==ATOMIC){
+ final AtomBox box=(AtomBox)box0;
+ for(long i=0; i<max; i++){
+ box.increment(); //AtomicLong.incrementAndGet
+ }
+ }else if(mode==VOLATILE){
+ final VolatileBox box=(VolatileBox)box0;
+ for(long i=0; i<max; i++){
+ box.counter++; //Read-modify-write on a volatile field; the ++ itself is not atomic
+ }
+ }else if(mode==FIELD){
+ final VolatileBox box=(VolatileBox)box0;
+ final AtomicLongFieldUpdater<VolatileBox> updater=AtomicLongFieldUpdater.newUpdater(VolatileBox.class, "counter"); //One updater per thread
+ for(long i=0; i<max; i++){
+ updater.incrementAndGet(box);
+ }
+ }else if(mode==STATICFIELD){
+ final VolatileBox box=(VolatileBox)box0;
+ for(long i=0; i<max; i++){
+ box.increment(); //Goes through VolatileBox's shared static updater
+ }
+ }
+ }
+
+ final CountBox box0;
+ final long max;
+ final int mode;
+ }
+ /** A counter that can be incremented and read. */
+ abstract static class CountBox{
+ abstract void increment();
+ abstract long value();
+ }
+ /** Plain long counter; increment() is synchronized, but the field can also be modified directly. */
+ static class LockBox extends CountBox{
+ synchronized void increment(){counter++;}
+ long value(){return counter;}
+ long counter;
+ }
+ /** Counter backed by an AtomicLong. */
+ static class AtomBox extends CountBox{
+ void increment(){counter.incrementAndGet();}
+ long value(){return counter.longValue();}
+ AtomicLong counter=new AtomicLong(0);
+ }
+ /** Volatile long counter; increment() goes through a static AtomicLongFieldUpdater. */
+ static class VolatileBox extends CountBox{
+ void increment(){updater.incrementAndGet(this);}
+ long value(){return counter;}
+ volatile long counter;
+ static final AtomicLongFieldUpdater<VolatileBox> updater=AtomicLongFieldUpdater.newUpdater(VolatileBox.class, "counter");
+ }
+ /** Values accepted for mode; each selects one branch of CountThread.run. */
+ static final int UNLOCKED=0, LOCKED=1, ATOMIC=2, VOLATILE=3, FIELD=4, STATICFIELD=5;
+
+}
diff --git a/current/driver/Translator.java b/current/driver/Translator.java
new file mode 100755
index 0000000..8c2e781
--- /dev/null
+++ b/current/driver/Translator.java
@@ -0,0 +1,213 @@
+package driver;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import var.VarLine;
+import var.Variation;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+import dna.Timer;
+import fileIO.ChainLine;
+import fileIO.ReadWrite;
+
+public class Translator {
+
+
+
+
+ /**
+  * Creates a translator between two genome builds and loads the chain lines
+  * for that build pair from Data.
+  * @param from_ Source build
+  * @param to_ Destination build
+  */
+ public Translator(int from_, int to_){
+     this.fromBuild=from_;
+     this.toBuild=to_;
+     this.lines=Data.getChainLines(this.fromBuild, this.toBuild);
+ }
+
+
+ /**
+  * Translates every VarLine in a per-chromosome matrix and regroups the
+  * results by their translated chromosome.
+  * @param in Input rows; null rows are skipped
+  * @return A matrix with in.length rows, each sorted; untranslatable entries are dropped
+  */
+ public VarLine[][] translate(VarLine[][] in){
+     final int buckets=in.length;
+     ArrayList<VarLine>[] perChrom=new ArrayList[buckets];
+     for(int c=0; c<buckets; c++){
+         perChrom[c]=new ArrayList<VarLine>();
+     }
+
+     for(int r=0; r<in.length; r++){
+         final VarLine[] row=in[r];
+         if(row==null){continue;}
+         for(int j=0; j<row.length; j++){
+             final VarLine moved=translate(row[j]);
+             if(moved==null){continue;}
+             //File under the destination chromosome, which may differ from the source row.
+             perChrom[moved.chromosome].add(moved);
+         }
+     }
+
+     VarLine[][] out=new VarLine[buckets][];
+     for(int c=0; c<buckets; c++){
+         final ArrayList<VarLine> bucket=perChrom[c];
+         final VarLine[] sorted=bucket.toArray(new VarLine[bucket.size()]);
+         Arrays.sort(sorted);
+         out[c]=sorted;
+         perChrom[c]=null; //Release the list as soon as it has been copied.
+     }
+
+     return out;
+ }
+
+
+ /**
+  * Translates every Variation in a per-chromosome matrix and regroups the
+  * results by their translated chromosome.
+  * @param in Input rows; null rows are skipped
+  * @return A matrix with in.length rows, each sorted; untranslatable entries are dropped
+  */
+ public Variation[][] translate(Variation[][] in){
+     final int buckets=in.length;
+     ArrayList<Variation>[] perChrom=new ArrayList[buckets];
+     for(int c=0; c<buckets; c++){
+         perChrom[c]=new ArrayList<Variation>();
+     }
+
+     for(int r=0; r<in.length; r++){
+         final Variation[] row=in[r];
+         if(row==null){continue;}
+         for(int j=0; j<row.length; j++){
+             final Variation moved=translate(row[j]);
+             if(moved==null){continue;}
+             //File under the destination chromosome, which may differ from the source row.
+             perChrom[moved.chromosome].add(moved);
+         }
+     }
+
+     Variation[][] out=new Variation[buckets][];
+     for(int c=0; c<buckets; c++){
+         final ArrayList<Variation> bucket=perChrom[c];
+         final Variation[] sorted=bucket.toArray(new Variation[bucket.size()]);
+         Arrays.sort(sorted);
+         out[c]=sorted;
+         perChrom[c]=null; //Release the list as soon as it has been copied.
+     }
+
+     return out;
+ }
+
+
+ /**
+  * Translates a single VarLine using the chain line that contains it.
+  * @param v Line to translate; it is not modified (a clone is returned)
+  * @return The translated, interned clone, or null if no single chain line
+  * contains [beginLoc, endLoc] or either endpoint fails to translate
+  */
+ public VarLine translate(VarLine v){
+
+ ChainLine[] array=lines[v.chromosome];
+ int index=ChainLine.binarySearch(v.beginLoc, array);
+ if(index<0){return null;}
+ ChainLine cl=array[index];
+ //Both endpoints must lie in the same chain line.
+ if(!cl.contains(v.beginLoc, v.endLoc)){return null;}
+
+// System.out.println(cl);
+
+ //Index 0 of each result is used below as the chromosome and index 2 as the location.
+ int[] dest1=cl.translate(v.beginLoc);
+ int[] dest2=cl.translate(v.endLoc);
+
+ if(dest1==null || dest2==null){return null;}
+
+ VarLine v2=v.clone();
+
+ //NOTE(review): v was already dereferenced above, so this first assertion can never fire.
+ assert(v!=null);
+ assert(v2!=null) : v;
+
+ if(cl.qStrand==Gene.PLUS){
+ v2.chromosome=(byte)dest1[0];
+ v2.beginLoc=dest1[2];
+ v2.endLoc=dest2[2];
+ }else{
+// assert(false) : "TODO";
+
+ //Other strand: the endpoints swap so that beginLoc<=endLoc still holds.
+ v2.chromosome=(byte)dest1[0];
+ if(v.isPoint()){
+ //Point variations get both coordinates set to one less than the translated begin.
+ v2.beginLoc=v2.endLoc=dest1[2]-1;
+ }else{
+ v2.beginLoc=dest2[2];
+ v2.endLoc=dest1[2];
+ }
+
+ //Reverse-complement base strings; values not starting with a letter, and the literal "ref", are left alone.
+ if(v2.call!=null && Character.isLetter(v2.call.charAt(0)) && !v2.call.equalsIgnoreCase("ref")){
+ v2.call=AminoAcid.reverseComplementBases(v2.call);
+ }
+
+ if(v2.ref!=null && Character.isLetter(v2.ref.charAt(0)) && !v2.ref.equalsIgnoreCase("ref")){
+ v2.ref=AminoAcid.reverseComplementBases(v2.ref);
+ }
+
+ }
+
+ //Translation must preserve the span length...
+ assert(v2.endLoc-v2.beginLoc==v.endLoc-v.beginLoc) : "\n\n"+v.toSourceString()+"\n\n"+v2.toSourceString()+
+ "\n\n"+v.beginLoc+" -> "+Arrays.toString(dest1)+
+ "\n\n"+v.endLoc+" -> "+Arrays.toString(dest2)+
+ "\n\n"+cl+"\n\n";
+
+ //...and the coordinate ordering.
+ assert(v2.beginLoc<=v2.endLoc) : "\n\n"+v.toSourceString()+"\n\n"+v2.toSourceString()+
+ "\n\n"+v.beginLoc+" -> "+Arrays.toString(dest1)+
+ "\n\n"+v.endLoc+" -> "+Arrays.toString(dest2)+
+ "\n\n"+cl+"\n\n";
+
+ v2.intern();
+ return v2;
+ }
+
+
+ /**
+  * Translates a single Variation using the chain line that contains it.
+  * Objects whose runtime class is VarLine are routed to translate(VarLine).
+  * @param v Variation to translate; it is not modified (a clone is returned)
+  * @return The translated, interned clone, or null if no single chain line
+  * contains [beginLoc, endLoc] or either endpoint fails to translate
+  */
+ public Variation translate(Variation v){
+
+ if(v.getClass()==VarLine.class){
+ return translate((VarLine)v);
+ }
+ //Subclasses other than VarLine are not expected here.
+ assert(v.getClass()==Variation.class);
+
+ ChainLine[] array=lines[v.chromosome];
+ int index=ChainLine.binarySearch(v.beginLoc, array);
+ if(index<0){return null;}
+ ChainLine cl=array[index];
+ //Both endpoints must lie in the same chain line.
+ if(!cl.contains(v.beginLoc, v.endLoc)){return null;}
+
+ //Index 0 of each result is used below as the chromosome and index 2 as the location.
+ int[] dest1=cl.translate(v.beginLoc);
+ int[] dest2=cl.translate(v.endLoc);
+ if(dest1==null || dest2==null){return null;}
+
+ Variation v2=v.clone();
+
+ if(cl.qStrand==Gene.PLUS){
+ v2.chromosome=(byte)dest1[0];
+ v2.beginLoc=dest1[2];
+ v2.endLoc=dest2[2];
+ }else{
+// assert(false) : "TODO";
+
+ //Other strand: the endpoints swap so that beginLoc<=endLoc still holds.
+ v2.chromosome=(byte)dest1[0];
+ if(v.isPoint()){
+ //Point variations get both coordinates set to one less than the translated begin.
+ v2.beginLoc=v2.endLoc=dest1[2]-1;
+ }else{
+ v2.beginLoc=dest2[2];
+ v2.endLoc=dest1[2];
+ }
+
+ //Reverse-complement base strings; values not starting with a letter, and the literal "ref", are left alone.
+ if(v2.call!=null && Character.isLetter(v2.call.charAt(0)) && !v2.call.equalsIgnoreCase("ref")){
+ v2.call=AminoAcid.reverseComplementBases(v2.call);
+ }
+
+ if(v2.ref!=null && Character.isLetter(v2.ref.charAt(0)) && !v2.ref.equalsIgnoreCase("ref")){
+ v2.ref=AminoAcid.reverseComplementBases(v2.ref);
+ }
+
+ }
+
+ //Translation must preserve the span length...
+ assert(v2.endLoc-v2.beginLoc==v.endLoc-v.beginLoc) : "\n\n"+v.toSourceString()+"\n\n"+v2.toSourceString()+
+ "\n\n"+v.beginLoc+" -> "+Arrays.toString(dest1)+
+ "\n\n"+v.endLoc+" -> "+Arrays.toString(dest2)+
+ "\n\n"+cl+"\n\n";
+
+ //...and the coordinate ordering.
+ assert(v2.beginLoc<=v2.endLoc) : "\n\n"+v.toSourceString()+"\n\n"+v2.toSourceString()+
+ "\n\n"+v.beginLoc+" -> "+Arrays.toString(dest1)+
+ "\n\n"+v.endLoc+" -> "+Arrays.toString(dest2)+
+ "\n\n"+cl+"\n\n";
+
+ v2.intern();
+ return v2;
+ }
+
+
+ public final int fromBuild;
+ public final int toBuild;
+ public final ChainLine[][] lines;
+
+}
diff --git a/current/driver/Translator2.java b/current/driver/Translator2.java
new file mode 100755
index 0000000..8244b0e
--- /dev/null
+++ b/current/driver/Translator2.java
@@ -0,0 +1,65 @@
+package driver;
+
+import dna.Data;
+import dna.Gene;
+import fileIO.ChainLine;
+
+public class Translator2 {
+
+
+ /**
+  * Command-line entry point: translates one or more locations between builds.
+  * Arguments: source build, destination build, chromosome, then any number of
+  * integer locations. One line is printed to stdout per location.
+  */
+ public static void main(String[] args){
+
+     int from=Gene.toBuild(args[0]);
+     int to=Gene.toBuild(args[1]);
+
+     //Builds 18 and 19 are remapped to 36 and 37 respectively.
+     from=(from==18 ? 36 : (from==19 ? 37 : from));
+     to=(to==18 ? 36 : (to==19 ? 37 : to));
+     assert(from!=to);
+     assert(from==36 || from==37);
+     assert(to==36 || to==37);
+
+     final int chrom=Gene.toChromosome(args[2]);
+     final ChainLine[][] lines=Data.getChainLines(from, to);
+
+     int argIndex=3;
+     while(argIndex<args.length){
+         final int loc=Integer.parseInt(args[argIndex]);
+         final int[] result=ChainLine.translate(loc, lines[chrom]);
+         System.out.print("(build"+from+", chr"+chrom+", +, "+loc+") -> ");
+         if(result==null){
+             System.out.println("null");
+         }else{
+             System.out.println("(build"+to+", chr"+result[0]+", "+Gene.strandCodes[result[1]]+", "+result[2]+")");
+         }
+         argIndex++;
+     }
+ }
+
+ /**
+  * Translates a single location between builds.
+  * @return {chrom, strand, loc} in the destination build, or null if the
+  * location cannot be translated. The strand element is Gene.PLUS when the
+  * chain's strand equals the input strand and Gene.MINUS otherwise.
+  */
+ public static final int[] translate(int fromBuild, int toBuild, int chrom, int strand, int loc){
+     final ChainLine[][] chains=Data.getChainLines(fromBuild, toBuild);
+     final int[] translated=ChainLine.translate(loc, chains[chrom]);
+     if(translated!=null){
+         final boolean sameStrand=(translated[1]==strand);
+         if(sameStrand){
+             translated[1]=Gene.PLUS;
+         }else{
+             translated[1]=Gene.MINUS;
+         }
+     }
+     return translated;
+ }
+
+}
diff --git a/current/driver/TransposeTextFile.java b/current/driver/TransposeTextFile.java
new file mode 100755
index 0000000..271e440
--- /dev/null
+++ b/current/driver/TransposeTextFile.java
@@ -0,0 +1,53 @@
+package driver;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+public class TransposeTextFile {
+
+ /**
+  * Transposes one text file, or, when the file name contains '#', one file
+  * per chromosome number 1..22 with '#' replaced by that number.
+  * Arguments: file name pattern, optional count of leading lines to skip.
+  */
+ public static void main(String[] args){
+
+     final int skipLines=(args.length>1 ? Integer.parseInt(args[1]) : 0);
+     final String pattern=args[0];
+
+     if(!pattern.contains("#")){
+         //No wildcard: a single file.
+         process(pattern, skipLines);
+         return;
+     }
+
+     final int minChrom=1, maxChrom=22;
+     int chrom=minChrom;
+     while(chrom<=maxChrom){
+         process(pattern.replace("#", ""+chrom), skipLines);
+         chrom++;
+     }
+ }
+
+ /**
+  * Reads a whitespace-delimited text file, transposes it (rows become
+  * columns), and writes the tab-delimited result to fname+".transposed".
+  * The column count is taken from the first non-skipped row; a row with
+  * fewer fields contributes empty cells. If skipLines is at or beyond the
+  * number of rows, an empty file is written.
+  * @param fname Input file name
+  * @param skipLines Number of leading rows to ignore
+  */
+ public static void process(String fname, int skipLines){
+     TextFile tf=new TextFile(fname, false, false);
+     String[] lines=tf.toStringLines();
+     tf.close();
+     String[][] lines2=TextFile.doublesplitWhitespace(lines, true);
+
+     StringBuilder sb=new StringBuilder(4096);
+
+     //Bound every loop by the array that is actually indexed (lines2, not lines).
+     final int rows=(lines2==null ? 0 : lines2.length);
+     final int columns=(skipLines<rows && lines2[skipLines]!=null ? lines2[skipLines].length : 0);
+
+     for(int column=0; column<columns; column++){
+         String tab="";
+         for(int row=skipLines; row<rows; row++){
+             sb.append(tab);
+             final String[] cells=lines2[row];
+             //Leave the cell empty for short rows instead of running off the end.
+             if(cells!=null && column<cells.length){
+                 sb.append(cells[column]);
+             }
+             tab="\t";
+         }
+         sb.append("\n");
+     }
+
+     ReadWrite.writeString(sb, fname+".transposed");
+
+ }
+
+
+
+
+}
diff --git a/current/driver/TrimSamFile.java b/current/driver/TrimSamFile.java
new file mode 100755
index 0000000..6761472
--- /dev/null
+++ b/current/driver/TrimSamFile.java
@@ -0,0 +1,65 @@
+package driver;
+
+import java.util.HashSet;
+
+import stream.SamLine;
+
+import align2.Tools;
+
+import fileIO.TextFile;
+
+public class TrimSamFile {
+
+ /**
+  * Prints a SAM file to stdout, omitting every read name flagged by
+  * findBadLines. The file is read twice: once to collect names, once to print.
+  * Arguments: sam file, scaffold name, range start, range end.
+  */
+ public static void main(String[] args){
+     String fname=args[0];
+     String scaf=args[1];
+     int from=Integer.parseInt(args[2]);
+     int to=Integer.parseInt(args[3]);
+     TextFile tf=new TextFile(fname, false, false);
+     try{
+         HashSet<String> set=findBadLines(tf, scaf, from, to);
+         tf.reset(); //Rewind for the second pass.
+         printExcludingSet(tf, set);
+     }finally{
+         tf.close(); //Previously the file was never closed.
+     }
+ }
+
+
+ /**
+  * Scans the alignment (non-header) lines of a SAM file and collects the
+  * names of reads to remove. A read name is collected when, on any of its lines:
+  * the read's position is in [from, to] on the scaffold; or its mate's
+  * position is; or both are on the scaffold and exactly one of the two
+  * positions is below from; or the read or mate is unmapped, or the pair is
+  * not on the same chromosome.
+  * @return Set of read names to exclude
+  */
+ public static HashSet<String> findBadLines(TextFile tf, String scafS, int from, int to){
+     final byte[] scaf=scafS.getBytes();
+     final HashSet<String> names=new HashSet<String>(16000);
+
+     String s=tf.nextLine();
+     while(s!=null){
+         if(s.charAt(0)!='@'){//Not a header line
+             final SamLine sl=new SamLine(s);
+
+             final boolean bad=
+                 (sl.pos>=from && sl.pos<=to && Tools.equals(sl.rname(), scaf))
+                 || (sl.pnext>=from && sl.pnext<=to && Tools.equals(sl.rnext(), scaf))
+                 || (Tools.equals(sl.rname(), scaf) && Tools.equals(sl.rnext(), scaf) && ((sl.pos<from) != (sl.pnext<from)))
+                 || (!sl.mapped() || !sl.nextMapped() || !sl.pairedOnSameChrom());
+
+             if(bad){
+                 names.add(sl.qname);
+             }
+         }
+         s=tf.nextLine();
+     }
+     return names;
+ }
+
+
+ public static void printExcludingSet(TextFile tf, HashSet<String> set){
+
+ for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ if(s.charAt(0)=='@'){//header
+ System.out.println(s);
+ }else{
+ SamLine sl=new SamLine(s);
+
+ if(!set.contains(sl.qname)){
+ System.out.println(s);
+ }
+ }
+ }
+ }
+
+
+}
+
diff --git a/current/fileIO/ArrayFile.java b/current/fileIO/ArrayFile.java
new file mode 100755
index 0000000..3a12157
--- /dev/null
+++ b/current/fileIO/ArrayFile.java
@@ -0,0 +1,73 @@
+package fileIO;
+
+
+public class ArrayFile extends TextFile{
+
+ /**
+  * Prints every line of the file named by args[0] to stdout.
+  * Any exception is rethrown wrapped in a RuntimeException.
+  */
+ public static void main(String[] args){
+
+     try {
+         //Name of mat file
+         final ArrayFile mat=new ArrayFile(args[0]);
+
+         String row=mat.readLine();
+         while(row!=null){
+             System.out.println(row);
+             row=mat.readLine();
+         }
+     } catch (Exception e) {
+         throw new RuntimeException(e);
+     }
+
+ }
+
+
+ public ArrayFile(String name){super(name, false, false);}
+
+ /**
+  * Returns the next line that starts with '{' or '/', skipping all others
+  * (including empty lines).
+  * @return The line, or null at end of file
+  */
+ public String nextLine(){
+     String line=readLine();
+     //Test for null BEFORE looking at the first character: the original read
+     //line.charAt(0) first and threw at end of file or on an empty line.
+     while(line!=null && !line.startsWith("{") && !line.startsWith("/")){
+         line=readLine();
+     }
+     return line;
+ }
+
+ /**
+  * Parses the next array record, which consists of three lines:
+  * "//name: ...", "//size: N", and "{v1, v2, ...}" (optionally followed by a comma).
+  * @return The parsed values, or null at end of file or at a "//end" line
+  */
+ public float[] nextArray(){
+     String line;
+     String[] split;
+
+     line=nextLine();
+     if(line==null || line.startsWith("//end")){return null;}
+
+     //The name line is only validated; its value is not used.
+     assert(line.startsWith("//name: ")) : line;
+
+     line=nextLine();
+     assert(line.startsWith("//size: ")) : line;
+     //Trim so that stray whitespace around the number does not break parseInt.
+     line=line.replace("//size: ","").trim();
+     int length=Integer.parseInt(line);
+
+
+     float[] grid=new float[length];
+
+     line=nextLine();
+     assert(line.startsWith("{"));
+     if(line.endsWith(",")){line=line.substring(0, line.length()-1);}
+     assert(line.endsWith("}"));
+     line=line.replace("{", "").replace("}", "").replace(" ", "");
+     split=line.split(",");
+     assert(split.length==length);
+     for(int i=0; i<split.length; i++){
+         grid[i]=Float.parseFloat(split[i]);
+     }
+
+     return grid;
+ }
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/fileIO/ByteFile.java b/current/fileIO/ByteFile.java
new file mode 100755
index 0000000..1c3f550
--- /dev/null
+++ b/current/fileIO/ByteFile.java
@@ -0,0 +1,93 @@
+package fileIO;
+import java.io.File;
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import align2.Shared;
+
+
+public abstract class ByteFile {
+
+// public static final ByteFile makeByteFile(String fname){
+// return makeByteFile(fname, false, true);
+// }
+
+ /**
+  * Creates a ByteFile for a file name by first wrapping it in a text-input
+  * FileFormat and then delegating to makeByteFile(FileFormat, boolean).
+  */
+ public static final ByteFile makeByteFile(String fname, boolean tryAllExtensions, boolean allowSubprocess){
+     return makeByteFile(FileFormat.testInput(fname, FileFormat.TEXT, null, allowSubprocess, false), tryAllExtensions);
+ }
+
+ /**
+  * Chooses the implementation. ByteFile2 is used when not in low-memory mode
+  * and either FORCE_MODE_BF2 is set, or FORCE_MODE_BF1 is unset and more than
+  * four threads are available; otherwise ByteFile1 is used.
+  */
+ public static final ByteFile makeByteFile(FileFormat ff, boolean tryAllExtensions){
+     final boolean useBF2=!Shared.LOW_MEMORY && (FORCE_MODE_BF2 || (!FORCE_MODE_BF1 && Shared.threads()>4));
+     if(useBF2){
+         return new ByteFile2(ff, tryAllExtensions);
+     }else{
+         return new ByteFile1(ff, tryAllExtensions);
+     }
+ }
+
+// protected ByteFile(String fname, boolean tryAllExtensions, boolean allowSubprocess_){
+// allowSubprocess=allowSubprocess_;
+// fname=fname.replace('\\', '/');
+// File f=new File(fname);
+//
+// if(tryAllExtensions && !fname.startsWith("jar:") && !f.exists()){
+// name=ReadWrite.findFileExtension(fname);
+// f=new File(name);
+// }else{
+// name=fname;
+// }
+// }
+
+ /**
+  * Stores the file format, which must be flagged for reading.
+  * @param ff_ Format describing the input
+  * @param tryAllExtensions Not used by this constructor
+  */
+ protected ByteFile(FileFormat ff_, boolean tryAllExtensions){
+ ff=ff_;
+ assert(ff.read()) : ff;
+ }
+
+ /** Reads all remaining lines into a list. The file is not reset or closed here. */
+ public final ArrayList<byte[]> toByteLines(){
+
+     final ArrayList<byte[]> list=new ArrayList<byte[]>(4096);
+
+     byte[] line=nextLine();
+     while(line!=null){
+         list.add(line);
+         line=nextLine();
+     }
+
+     return list;
+ }
+
+ /** Counts the remaining lines, then resets the file. */
+ public final long countLines(){
+     long count=0;
+     while(nextLine()!=null){
+         count++;
+     }
+     reset();
+
+     return count;
+ }
+
+ public abstract void reset();
+
+ /**
+  * Returns true for stdin (or "stdin."-prefixed names), for names starting
+  * with "jar:", or when a file with this name exists on disk.
+  */
+ public final boolean exists(){
+     //TODO Ugly and unsafe hack for files in jars
+     if(name().equals("stdin")){return true;}
+     if(name().startsWith("stdin.")){return true;}
+     if(name().startsWith("jar:")){return true;}
+     return new File(name()).exists();
+ }
+
+ public abstract InputStream is();
+ public abstract long lineNum();
+
+ /** Returns true if there was an error */
+ public abstract boolean close();
+
+ public abstract byte[] nextLine();
+
+ public abstract boolean isOpen();
+
+ public final String name(){return ff.name();}
+ public final boolean allowSubprocess(){return ff.allowSubprocess();}
+
+ public final FileFormat ff;
+
+ public static boolean FORCE_MODE_BF1=false;
+ public static boolean FORCE_MODE_BF2=false;
+
+ protected final static byte slashr='\r', slashn='\n', carrot='>', plus='+', at='@';//, tab='\t';
+
+}
diff --git a/current/fileIO/ByteFile1.java b/current/fileIO/ByteFile1.java
new file mode 100755
index 0000000..58c3cfa
--- /dev/null
+++ b/current/fileIO/ByteFile1.java
@@ -0,0 +1,235 @@
+package fileIO;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import dna.Data;
+import dna.Timer;
+
+
+/**
+ * @author Brian Bushnell
+ *
+ */
+public class ByteFile1 extends ByteFile {
+
+
+ /**
+  * Test driver. Arguments: [file (default "stdin")] [first line | "speedtest"] [last line].
+  * Without "speedtest" the selected line window is echoed to stdout.
+  */
+ public static void main(String[] args) throws IOException{
+     final String fname=(args.length>0 ? args[0] : "stdin");
+     ByteFile1 tf=new ByteFile1(fname, false, true);
+
+     final boolean speedtest=(args.length>1 && args[1].equalsIgnoreCase("speedtest"));
+     long first=0, last=100;
+     if(speedtest){
+         last=Long.MAX_VALUE;
+     }else if(args.length>1){
+         first=Integer.parseInt(args[1]);
+         last=first+100;
+     }
+     if(args.length>2){
+         last=Integer.parseInt(args[2]);
+     }
+     speedtest(tf, first, last, !speedtest);
+
+     //Exercise close/reset/close.
+     tf.close();
+     tf.reset();
+     tf.close();
+ }
+
+ private static void speedtest(ByteFile1 tf, long first, long last, boolean reprint){
+ Timer t=new Timer();
+ long lines=0;
+ long bytes=0;
+ for(long i=0; i<first; i++){tf.nextLine();}
+ if(reprint){
+ for(long i=first; i<last; i++){
+ byte[] s=tf.nextLine();
+ if(s==null){break;}
+
+ lines++;
+ bytes+=s.length;
+ System.out.println(new String(s));
+ }
+
+ System.err.println("\n");
+ System.err.println("Lines: "+lines);
+ System.err.println("Bytes: "+bytes);
+ }else{
+ for(long i=first; i<last; i++){
+ byte[] s=tf.nextLine();
+ if(s==null){break;}
+ lines++;
+ bytes+=s.length;
+ }
+ }
+ t.stop();
+
+ if(!reprint){
+ double rpnano=lines/(double)(t.elapsed);
+ double bpnano=bytes/(double)(t.elapsed);
+
+ String rpstring=(lines<100000 ? ""+lines : lines<100000000 ? (lines/1000)+"k" : (lines/1000000)+"m");
+ String bpstring=(bytes<100000 ? ""+bytes : bytes<100000000 ? (bytes/1000)+"k" : (bytes/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ System.err.println("Time: \t"+t);
+ System.err.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk lines/sec", rpnano*1000000));
+ System.err.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bytes/sec", bpnano*1000));
+ }
+ }
+
+// public ByteFile1(String name){this(name, false);}
+
+ /** Wraps fname in a text-input FileFormat and delegates to the FileFormat constructor. */
+ public ByteFile1(String fname, boolean tryAllExtensions, boolean allowSubprocess_){
+ this(FileFormat.testInput(fname, FileFormat.TEXT, null, allowSubprocess_, false), tryAllExtensions);
+ }
+
+ /** Opens the input stream immediately. */
+ public ByteFile1(FileFormat ff, boolean tryAllExtensions){
+ super(ff, tryAllExtensions);
+ if(verbose){System.err.println("ByteFile1("+ff+", "+tryAllExtensions+")");}
+ is=open();
+ }
+
+ /** Closes the stream (if open) and reopens it. */
+ public final void reset(){
+ close();
+ is=open();
+ }
+
+ /**
+  * Closes the underlying stream if it is open; calling it again is a no-op.
+  * @return true if an error has been recorded for this file
+  */
+ public synchronized final boolean close(){
+     if(verbose){System.err.println("Closing "+this.getClass().getName()+" for "+name()+"; open="+open+"; errorState="+errorState);}
+     if(open){
+         open=false;
+         assert(is!=null);
+         errorState|=ReadWrite.finishReading(is, name(), allowSubprocess());
+
+         is=null;
+         lineNum=-1;
+         if(verbose){System.err.println("Closed "+this.getClass().getName()+" for "+name()+"; open="+open+"; errorState="+errorState);}
+     }
+     return errorState;
+ }
+
+ /**
+  * Returns the next line without its terminator. Lines end at '\r' or '\n';
+  * a '\n' directly following a '\r' terminator is skipped. An empty line is
+  * returned as a shared zero-length array. At end of input the file is closed
+  * and null is returned.
+  */
+ @Override
+ public byte[] nextLine(){
+ if(verbose){System.err.println("Reading line "+this.getClass().getName()+" for "+name()+"; open="+open+"; errorState="+errorState);}
+
+ if(!open || is==null){
+ //The warning is only printed when Data.WINDOWS is set.
+ if(Data.WINDOWS){System.err.println("Attempting to read from a closed file: "+name());}
+ return null;
+ }
+
+// System.out.println("\nCalled nextLine() for line "+lineNum);
+// System.out.println("A: bstart="+bstart+", bstop="+bstop);
+
+ //Skip the '\n' half of a "\r\n" pair left over from the previous line.
+ if(bstart<bstop && lasteol==slashr && buffer[bstart]==slashn){bstart++;}
+ //NOTE(review): the second operand is always true (a byte cannot equal both values), so this assertion checks nothing.
+ assert(bstart>=bstop || (buffer[bstart]!=slashr || buffer[bstart]!=slashn)/*buffer[bstart]>slashr || buffer[bstart]==slashn*/);
+ int nlpos=bstart;
+
+// System.out.println("B: bstart="+bstart+", bstop="+bstop+", nlpos="+nlpos);
+// while(nlpos<bstop && (buffer[nlpos]>slashr || buffer[nlpos]==tab)){nlpos++;}
+ //Scan for the next terminator in the data already buffered.
+ while(nlpos<bstop && (buffer[nlpos]!=slashr && buffer[nlpos]!=slashn)){nlpos++;}
+// System.out.println("C: bstart="+bstart+", bstop="+bstop+", nlpos="+nlpos);
+ if(nlpos>=bstop){
+ //No terminator buffered; fillBuffer() moves the partial line to index 0 and reads more.
+ nlpos=fillBuffer();
+// System.out.println("Filled buffer.");
+ }else{
+ lasteol=buffer[nlpos];
+ }
+// System.out.println("D: bstart="+bstart+", bstop="+bstop+", nlpos="+nlpos);
+
+ //Nothing left to return: end of input.
+ if(nlpos<0 || bstop<1){
+ close();
+ return null;
+ }
+
+ lineNum++;
+ if(bstart==nlpos){//Empty line.
+ bstart=nlpos+1;
+ return blankLine;
+ }
+ byte[] line=Arrays.copyOfRange(buffer, bstart, nlpos);
+ assert(line.length>0) : bstart+", "+nlpos;
+ //Continue after the terminator on the next call.
+ bstart=nlpos+1;
+// System.out.println("E: bstart="+bstart+", bstop="+bstop+", nlpos="+nlpos);
+ return line;
+ }
+
+ /**
+  * Moves any unconsumed bytes to the front of the buffer, then reads from the
+  * stream (doubling the buffer when it is full) until a line terminator is
+  * buffered or the stream is exhausted. A read failure is reported, recorded
+  * in errorState, and treated as end of input.
+  * @return Index of the first terminator found, or bstop if input ended first
+  */
+ private int fillBuffer(){
+     if(bstart<bstop){ //Shift end bytes to beginning
+         assert(bstart>0);
+ //        assert(bstop==buffer.length);
+         int extra=bstop-bstart;
+         for(int i=0; i<extra; i++, bstart++){
+             buffer[i]=buffer[bstart];
+ //            assert(buffer[i]>=slashr || buffer[i]==tab);
+             assert(buffer[i]!=slashr && buffer[i]!=slashn);
+         }
+         bstop=extra;
+     }else{
+         bstop=0;
+     }
+
+     bstart=0;
+     int len=bstop;
+     int r=-1;
+     while(len==bstop){//hit end of input without encountering a newline
+         if(bstop==buffer.length){
+             buffer=Arrays.copyOf(buffer, buffer.length*2);
+         }
+         //Reset before every read; otherwise a failed read would reuse the byte
+         //count of the previous successful one and advance bstop over garbage.
+         r=-1;
+         try {
+             r=is.read(buffer, bstop, buffer.length-bstop);
+         } catch (IOException e) {
+             errorState=true; //So that close() reports the failure.
+             e.printStackTrace();
+             System.err.println("open="+open);
+         }
+         if(r>0){
+             bstop=bstop+r;
+ //            while(len<bstop && (buffer[len]>slashr || buffer[len]==tab)){len++;}
+             while(len<bstop && (buffer[len]!=slashr && buffer[len]!=slashn)){len++;}
+         }else{
+             len=bstop;
+             break;
+         }
+     }
+
+ //    System.out.println("Filled buffer; r="+r+", returning "+len);
+     assert(r==-1 || buffer[len]<=slashr);
+     if(len>0){lasteol=buffer[len];}
+     return len;
+ }
+
+ /**
+  * Opens the input stream and resets the buffer cursors.
+  * @return The newly opened stream (also stored in {@code is})
+  * @throws RuntimeException if the file is already open
+  */
+ private final synchronized InputStream open(){
+     if(!open){
+         open=true;
+         is=ReadWrite.getInputStream(name(), false, allowSubprocess());
+         //Mark the buffer as holding no data and no previous terminator.
+         bstart=bstop=-1;
+         lasteol=-1;
+         return is;
+     }
+     throw new RuntimeException("Attempt to open already-opened TextFile "+name());
+ }
+
+ /** True between a successful open() and the next close(). */
+ public boolean isOpen(){return open;}
+
+ /** The current input stream, or null when closed. */
+ public final InputStream is(){return is;}
+
+ /** Number of lines returned so far minus one; -1 before the first line and after close(). */
+ public final long lineNum(){return lineNum;}
+
+ private boolean open=false;
+ private byte[] buffer=new byte[16384];
+ private static final byte[] blankLine=new byte[0];
+ private int bstart=0, bstop=0;
+ public InputStream is;
+ public long lineNum=-1;
+
+ private byte lasteol=-1;
+
+ public static boolean verbose=false;
+
+ private boolean errorState=false;
+
+}
diff --git a/current/fileIO/ByteFile2.java b/current/fileIO/ByteFile2.java
new file mode 100755
index 0000000..97e5696
--- /dev/null
+++ b/current/fileIO/ByteFile2.java
@@ -0,0 +1,402 @@
+package fileIO;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import dna.Timer;
+
+
+/**
+ * Runs a ByteFile1 in a separate thread. Can speed up disk reading, particularly of compressed files, at cost of slightly more work done.
+ * Drop-in compatible with ByteFile1.
+ * @author Brian Bushnell
+ * @date Sep 23, 2013
+ *
+ */
+public class ByteFile2 extends ByteFile {
+
+
+ /**
+  * Test driver. Arguments: [file (default "stdin")] [first line | "speedtest"] [last line].
+  * Without "speedtest" the selected line window is echoed to stdout.
+  */
+ public static void main(String[] args) throws IOException{
+     final String fname=(args.length>0 ? args[0] : "stdin");
+     ByteFile2 tf=new ByteFile2(fname, false, true);
+
+     final boolean speedtest=(args.length>1 && args[1].equalsIgnoreCase("speedtest"));
+     long first=0, last=100;
+     if(speedtest){
+         last=Long.MAX_VALUE;
+     }else if(args.length>1){
+         first=Integer.parseInt(args[1]);
+         last=first+100;
+     }
+     if(args.length>2){
+         last=Integer.parseInt(args[2]);
+     }
+     speedtest(tf, first, last, !speedtest);
+
+     //Exercise close/reset/close.
+     tf.close();
+     tf.reset();
+     tf.close();
+ }
+
+ private static void speedtest(ByteFile2 tf, long first, long last, boolean reprint){
+ Timer t=new Timer();
+ long lines=0;
+ long bytes=0;
+ for(long i=0; i<first; i++){tf.nextLine();}
+ if(reprint){
+ for(long i=first; i<last; i++){
+ byte[] s=tf.nextLine();
+ if(s==null){break;}
+
+ lines++;
+ bytes+=s.length;
+ System.out.println(new String(s));
+ }
+
+ System.err.println("\n");
+ System.err.println("Lines: "+lines);
+ System.err.println("Bytes: "+bytes);
+ }else{
+ for(long i=first; i<last; i++){
+ byte[] s=tf.nextLine();
+ if(s==null){break;}
+ lines++;
+ bytes+=s.length;
+ }
+ }
+ t.stop();
+
+ if(!reprint){
+ double rpnano=lines/(double)(t.elapsed);
+ double bpnano=bytes/(double)(t.elapsed);
+
+ String rpstring=(lines<100000 ? ""+lines : lines<100000000 ? (lines/1000)+"k" : (lines/1000000)+"m");
+ String bpstring=(bytes<100000 ? ""+bytes : bytes<100000000 ? (bytes/1000)+"k" : (bytes/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ System.err.println("Time: \t"+t);
+ System.err.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk lines/sec", rpnano*1000000));
+ System.err.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bytes/sec", bpnano*1000));
+ }
+ }
+
+// public ByteFile2(String name()){this(name(), false);}
+
+ /** Wraps fname in a text-input FileFormat and delegates to the FileFormat constructor. */
+ public ByteFile2(String fname, boolean tryAllExtensions, boolean allowSubprocess_){
+ this(FileFormat.testInput(fname, FileFormat.TEXT, null, allowSubprocess_, false), tryAllExtensions);
+ }
+
+ /** Creates and starts the background reader thread immediately. */
+ public ByteFile2(FileFormat ff, boolean tryAllExtensions){
+ super(ff, tryAllExtensions);
+ if(verbose){System.err.println("ByteFile2("+ff+", "+tryAllExtensions+")");}
+ open();
+ }
+
+ /** Shuts down the current reader thread and starts a fresh one. */
+ public final void reset(){
+ close();
+ open();
+ }
+
+ /**
+  * Stops the reader thread, waits for it to terminate, closes the wrapped
+  * ByteFile1, and clears all buffered state. Safe to call when already closed.
+  * @return true if an error has been recorded for this file
+  */
+ public synchronized final boolean close(){
+     if(verbose){System.err.println("ByteFile2("+name()+").close()");}
+     //Work on a local copy and test it directly, so a missing thread can never be dereferenced.
+     final BF1Thread bft=thread;
+     if(bft!=null){
+ //        errorState|=ReadWrite.killProcess(name());
+         bft.shutdown();
+         while(bft.getState()!=Thread.State.TERMINATED){
+             try {
+                 bft.join();
+             } catch (InterruptedException e) {
+                 //Keep waiting; the loop condition rechecks the thread state.
+                 e.printStackTrace();
+             }
+         }
+         //Propagate the wrapped file's error flag; it was previously discarded,
+         //so this method always returned false.
+         errorState|=bft.bf1.close();
+     }
+     thread=null;
+     currentList=null;
+     currentLoc=0;
+ //    assert(numIn==numOut) : numIn+", "+numOut;
+     if(verbose){System.err.println("ByteFile2("+name()+").close() returned "+errorState);}
+     return errorState;
+ }
+
+ /**
+  * Returns the next line from the current buffer, fetching a new buffer from
+  * the reader thread when the current one is used up.
+  * @return The next line, or null when no further buffer is available
+  */
+ @Override
+ public byte[] nextLine(){
+     final boolean exhausted=(currentList==null || currentLoc>=currentList.length || currentList[currentLoc]==null);
+     if(exhausted && !getBuffer()){
+         if(verbose2){System.err.println("nextLine()->getBuffer() returned false.");}
+         return null;
+     }
+
+     //TODO: This is a race condition; currentList can be changed to null. A defensive copy could be created.
+     assert(currentList!=null && currentList!=poison);
+     assert(currentLoc<currentList.length);
+     assert(currentList[currentLoc]!=null);
+     final byte[] line=currentList[currentLoc];
+     assert(line!=null);
+     currentLoc++;
+     return line;
+ }
+
+ /**
+  * Returns the used-up buffer to the reader thread's empty queue and takes
+  * the next filled buffer from its full queue, blocking until one is available.
+  * @return false if there is no reader thread or the poison marker was
+  * received (no more data); true if currentList now holds a fresh buffer
+  */
+ private boolean getBuffer(){
+ if(verbose2){System.err.println("Getting new buffer.");}
+ currentLoc=0;
+ //Snapshot the field; close() may set it to null.
+ final BF1Thread bft=thread;
+ if(bft==null){
+ currentList=null;
+ if(verbose2){System.err.println("No buffers available. thread="+thread+", shutdown="+(thread==null ? "X" : ""+thread.shutdown));}
+ return false;
+ }
+ //Poison was already received on an earlier call.
+ if(currentList==poison){
+ if(verbose2){System.err.println("A: Current list is poison.");}
+ return false;
+ }
+ if(currentList!=null){
+ Arrays.fill(currentList, null); //MUST be done or lines get recycled at end of file.
+ //Retry until the put succeeds; an interrupt only restarts the attempt.
+ while(currentList!=null){
+ try {
+ if(verbose2){System.err.println("adding to qEmpty list size "+currentList.length+"\n"+toString(currentList));}
+ bft.qEmpty.put(currentList);
+ currentList=null;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ assert(currentList==null);
+ //Block until the reader thread supplies a filled buffer (or poison).
+ while(currentList==null){
+ try {
+ assert(bft!=null);
+ if(verbose2){System.err.println("C: qFull.size()="+bft.qFull.size());}
+ currentList=bft.qFull.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if(verbose2){
+ if(currentList==poison){
+ System.err.println("B: Current list is poison.");
+ }else{
+ System.err.println("getBuffer fetched a new buffer of size "+currentList.length);
+ }
+ }
+ return currentList!=poison;
+ }
+
+ /**
+  * Clears the buffered state, then creates and starts a new reader thread.
+  * Must only be called when no reader thread exists.
+  * @return The started thread
+  */
+ private final synchronized BF1Thread open(){
+     if(verbose2){System.err.println("ByteFile2("+name()+").open()");}
+     assert(thread==null);
+     currentLoc=0;
+     currentList=null;
+ //    numIn=0;
+ //    numOut=0;
+     final BF1Thread reader=new BF1Thread(ff);
+     thread=reader;
+     reader.start();
+     return reader;
+ }
+
+ /**
+  * Reader thread. It pulls lines from a ByteFile1, packs them into byte[][]
+  * buffers taken from qEmpty, and hands filled buffers to the consumer via
+  * qFull. The shared 'poison' array is placed in both queues on shutdown.
+  */
+ private class BF1Thread extends Thread{
+
+// public BF1Thread(String fname){
+// bf1=new ByteFile1(fname, false, allowSubprocess);
+// qFull=new ArrayBlockingQueue<byte[][]>(buffs+2);
+// qEmpty=new ArrayBlockingQueue<byte[][]>(buffs+2);
+// for(int i=0; i<buffs; i++){
+// try {
+// qEmpty.put(new byte[bufflen][]);
+// } catch (InterruptedException e) {
+// // TODO Auto-generated catch block
+// e.printStackTrace();
+// }
+// }
+// }
+
+ /** Opens the wrapped ByteFile1 and preloads qEmpty with 'buffs' empty buffers of 'bufflen' slots each. */
+ public BF1Thread(FileFormat ff){
+ bf1=new ByteFile1(ff, false);
+ //Each queue has room for every buffer plus two spare slots.
+ qFull=new ArrayBlockingQueue<byte[][]>(buffs+2);
+ qEmpty=new ArrayBlockingQueue<byte[][]>(buffs+2);
+ for(int i=0; i<buffs; i++){
+ try {
+ qEmpty.put(new byte[bufflen][]);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+
+ /** Fills buffers until the input ends or shutdown is requested, then calls shutdown(). */
+ @Override
+ public void run(){
+ if(verbose){System.err.println("ByteFile2("+name()+").run()");}
+ byte[] s=null;
+ byte[][] list=null;
+ //Take the first empty buffer, retrying if interrupted.
+ while(list==null){
+ try {
+ list = qEmpty.take();
+ } catch (InterruptedException e1) {
+ // TODO Auto-generated catch block
+ e1.printStackTrace();
+ }
+ }
+ //Shutdown was requested before any reading started.
+ synchronized(this){
+ if(list==poison || shutdown){
+ shutdown();
+ return;
+ }
+ }
+
+ int loc=0;
+ long bases=0;
+
+ //At this point, list is not null
+ for(s=bf1.nextLine(); s!=null; s=bf1.nextLine()){
+ bases+=s.length;
+ assert(list!=null) : "Somehow the list became null for "+bf1.name()+" at line "+cntr;
+ list[loc]=s;
+ loc++;
+// numIn++;
+// if(verbose){System.err.println("Added line "+numIn);}
+ //Hand the buffer over once it holds bufflen lines or at least buffcapacity bytes.
+ if(loc>=bufflen || bases>=buffcapacity){
+ if(verbose2){System.err.println("Capacity exceeded.");}
+ while(list!=null){
+ try {
+// synchronized(this){
+// if(!shutdown){
+ if(verbose2){
+ System.err.println("A: Adding to qFull list of size "+loc);
+ System.err.println(ByteFile2.toString(list));
+ }
+ //Note: this adds the buffer's slot count (list.length), not the number of lines stored (loc).
+ cntr+=list.length;
+ qFull.put(list);
+ if(verbose2){System.err.println("A: qFull.size()="+qFull.size());}
+// }
+// }
+ list=null;
+ loc=0;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ //At this point, list is null
+ if(shutdown){
+ if(verbose2){System.err.println("Break 1");}
+ break;
+ }
+ while(list==null){
+ if(verbose2){System.err.println("Taking empty list.");}
+ try {
+ list = qEmpty.take();
+ } catch (InterruptedException e1) {
+ // TODO Auto-generated catch block
+ e1.printStackTrace();
+ }
+ }
+ //At this point, list is not null
+ bases=0;
+ if(list==poison){
+ if(verbose2){System.err.println("Break 2");}
+ break;
+ }
+ //At this point, list is not null
+ }
+ }
+ if(verbose2){System.err.println("Run loop exit.");}
+
+ //Flush a partially filled final buffer.
+ while(list!=null && loc>0){
+ try {
+// synchronized(this){
+// if(!shutdown){
+ if(verbose2){System.err.println("B: Adding list of size "+loc);}
+ qFull.put(list);
+ if(verbose2){System.err.println("B: qFull.size()="+qFull.size());}
+// }
+// }
+ list=null;
+ loc=0;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ //At this point, list is null
+ shutdown();
+
+ if(verbose){System.err.println("ByteFile2("+name()+").run() finished");}
+ }
+
+ /** Sets the shutdown flag and adds the poison marker to both queues; only the first call has any effect. */
+ private synchronized void shutdown(){
+ if(verbose || verbose2){System.err.println("ByteFile2("+name()+").shutdown()");}
+ if(shutdown){return;}
+ shutdown=true;
+ if(verbose2){System.err.println("Adding poison.");}
+ //add() does not block; it throws IllegalStateException if the queue is full.
+ qFull.add(poison);
+ qEmpty.add(poison);
+ if(verbose2){System.err.println("D: qFull.size()="+qFull.size());}
+ if(verbose || verbose2){System.err.println("ByteFile2("+name()+").shutdown() finished");}
+ }
+
+ /** Set once by shutdown(). NOTE(review): run() reads it once without holding the lock. */
+ private boolean shutdown=false;
+ /** The file actually being read. */
+ final ByteFile1 bf1;
+ /** Filled buffers travelling from this thread to the consumer. */
+ final ArrayBlockingQueue<byte[][]> qFull;
+ /** Emptied buffers travelling from the consumer back to this thread. */
+ final ArrayBlockingQueue<byte[][]> qEmpty;
+
+ }
+
+ /**
+  * True while unread lines remain in the current buffer or a reader thread exists.
+  * The queue contents are deliberately not consulted: per the original note,
+  * qFull.size() does not return a correctly synchronized value.
+  */
+ public boolean isOpen(){
+     final boolean pending=(currentList!=null && currentLoc<currentList.length && currentList[currentLoc]!=null);
+     if(pending){return true;}
+     final BF1Thread bft=thread;
+     return bft!=null;
+ }
+
+ /** For debugging */
+ private static String toString(byte[][] x){
+ StringBuilder sb=new StringBuilder();
+ for(byte[] z : x){
+ sb.append(z==null ? "null" : new String(z)).append('\n');
+ }
+ return sb.toString();
+ }
+
+ /** The wrapped ByteFile1's input stream, or null when there is no reader thread. */
+ public final InputStream is(){return thread==null ? null : thread.bf1.is();}
+
+ /** The wrapped ByteFile1's line number (the reader's position, not the consumer's), or -1 when there is no reader thread. */
+ public final long lineNum(){return thread==null ? -1 : thread.bf1.lineNum();}
+
+ private long cntr;
+ private BF1Thread thread=null;
+ private byte[][] currentList=null;
+ private int currentLoc=0;
+// private int currentSize=0;
+
+// private long numIn=0, numOut=0;
+
+ private static final byte[][] poison=new byte[0][];
+ public static boolean verbose=false;
+ private static final boolean verbose2=false;
+ private static final int bufflen=1000;
+ private static final int buffs=4;
+ private static final int buffcapacity=256000;
+
+ private boolean errorState=false;
+
+}
diff --git a/current/fileIO/ByteStreamWriter.java b/current/fileIO/ByteStreamWriter.java
new file mode 100755
index 0000000..cb980a6
--- /dev/null
+++ b/current/fileIO/ByteStreamWriter.java
@@ -0,0 +1,455 @@
+package fileIO;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import align2.Shared;
+
+import kmer.AbstractKmerTable;
+
+import stream.ByteBuilder;
+import stream.Read;
+import ukmer.AbstractKmerTableU;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Timer;
+
+
+
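+/**
+ * Writes bytes to an output stream on a dedicated thread. Callers append data with the
+ * print/println methods; buffers are handed to the writer thread through a bounded queue.
+ * @author Brian Bushnell
+ * @date Oct 21, 2014
+ *
+ */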
+public class ByteStreamWriter extends Thread {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Simple write benchmark: prints a fixed 1000-byte line repeatedly and reports the timing.
+  * @param args args[0] is the output file name; args[1] is the number of lines to write
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final int alen=1000;
+     final byte[] line=new byte[alen];
+     int pos=0;
+     while(pos<line.length){
+         line[pos]=AminoAcid.numberToBase[pos%4];
+         pos++;
+     }
+     line[line.length-1]='\n';
+     //args[1] is parsed before args[0] is read.
+     final long iters=Long.parseLong(args[1]);
+     final String fname=args[0];
+     final ByteStreamWriter writer=new ByteStreamWriter(fname, true, false, true);
+     writer.start();
+     long written=0;
+     while(written<iters){
+         writer.print(line);
+         written++;
+     }
+     writer.poisonAndWait();
+     timer.stop();
+     System.err.println("MB/s: \t"+String.format("%.2f", ((alen*iters)/(timer.elapsed/1000.0))));
+     System.err.println("Time: \t"+timer);
+ }
+
+ /**
+  * Creates a writer for the named file, passing 0 as the format argument
+  * of the five-argument constructor.
+  */
+ public ByteStreamWriter(String fname_, boolean overwrite_, boolean append_, boolean allowSubprocess_){
+ this(fname_, overwrite_, append_, allowSubprocess_, 0);
+ }
+
+ /**
+  * Creates a writer for the named file by building a FileFormat with FileFormat.testOutput
+  * and delegating to the FileFormat constructor.
+  * NOTE(review): the meaning of the FileFormat.TEXT, 0 and true arguments passed to testOutput
+  * is not visible here; confirm against FileFormat.
+  */
+ public ByteStreamWriter(String fname_, boolean overwrite_, boolean append_, boolean allowSubprocess_, int format){
+ this(FileFormat.testOutput(fname_, FileFormat.TEXT, format, 0, allowSubprocess_, overwrite_, append_, true));
+ }
+
+ public ByteStreamWriter(FileFormat ff){
+ FASTQ=ff.fastq() || ff.text();
+ FASTA=ff.fasta();
+ BREAD=ff.bread();
+ SAM=ff.samOrBam();
+ BAM=ff.bam();
+ SITES=ff.sites();
+ INFO=ff.attachment();
+ OTHER=(!FASTQ && !FASTA && !BREAD && !SAM && !BAM && !SITES && !INFO);
+
+
+ fname=ff.name();
+ overwrite=ff.overwrite();
+ append=ff.append();
+ allowSubprocess=ff.allowSubprocess();
+ assert(!(overwrite&append));
+ assert(ff.canWrite()) : "File "+fname+" exists and overwrite=="+overwrite;
+ if(append && !(ff.raw() || ff.gzip())){throw new RuntimeException("Can't append to compressed files.");}
+
+ if(!BAM || !Data.SAMTOOLS() || !Data.SH()){
+ outstream=ReadWrite.getOutputStream(fname, append, true, allowSubprocess);
+ }else{
+ outstream=ReadWrite.getOutputStreamFromProcess(fname, "samtools view -S -b -h - ", true, append, true, true);
+ }
+
+ queue=new ArrayBlockingQueue<ByteBuilder>(5);
+ buffer=new ByteBuilder(initialLen);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Primary Method ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Writer loop. Signals that the thread has started, then takes buffers from the queue and
+  * writes them to the output stream until the poison marker arrives, and finally finishes
+  * the stream. A failed write is printed to stderr and recorded in errorState, so that
+  * poisonAndWait() can report it; the remaining buffers are still processed.
+  */
+ @Override
+ public void run() {
+     if(verbose){System.err.println("running");}
+     assert(open) : fname;
+
+     //Wake up start(), which blocks until this flag is set.
+     synchronized(this){
+         started=true;
+         this.notify();
+     }
+
+     if(verbose){System.err.println("waiting for jobs");}
+     ByteBuilder job=takeJob();
+
+     if(verbose){System.err.println("processing jobs");}
+     while(job!=POISON2){
+         if(job.length()>0){
+             try {
+                 outstream.write(job.array, 0, job.length());
+             } catch (IOException e) {
+                 //Record the failure instead of only printing it.
+                 errorState=true;
+                 e.printStackTrace();
+             }
+         }
+         job=takeJob();
+     }
+     if(verbose){System.err.println("null/poison job");}
+     open=false;
+     ReadWrite.finishWriting(null, outstream, fname, allowSubprocess);
+     if(verbose){System.err.println("finish writing");}
+     synchronized(this){notifyAll();}
+     if(verbose){System.err.println("done");}
+ }
+
+ /** Blocks until a buffer is available, retrying if the wait is interrupted. Never returns null. */
+ private ByteBuilder takeJob(){
+     ByteBuilder job=null;
+     while(job==null){
+         try {
+             job=queue.take();
+         } catch (InterruptedException e) {
+             e.printStackTrace();
+         }
+     }
+     return job;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Control and Helpers ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Starts the writer thread and blocks until run() has set the started flag,
+  * so the writer can be used as soon as this method returns.
+  */
+ @Override
+ public void start(){
+ super.start();
+ if(verbose){System.err.println(this.getState());}
+ synchronized(this){
+ //Re-check the flag at least every 20 ms; run() also calls notify() after setting it.
+ while(!started){
+ try {
+ this.wait(20);
+ } catch (InterruptedException e) {
+ //An interrupt is reported and the wait resumes.
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+
+ public synchronized void poison(){
+ //Don't allow thread to shut down before it has started
+ while(!started || this.getState()==Thread.State.NEW){
+ try {
+ this.wait(20);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ if(!open){return;}
+ addJob(buffer);
+ buffer=null;
+// System.err.println("Poisoned!");
+// assert(false);
+
+// assert(false) : open+", "+this.getState()+", "+started;
+ open=false;
+ addJob(POISON2);
+ }
+
+ /** Blocks until the writer thread has terminated, re-checking its state at least once per second. */
+ public void waitForFinish(){
+     for(Thread.State state=this.getState(); state!=Thread.State.TERMINATED; state=this.getState()){
+         try {
+             this.join(1000);
+         } catch (InterruptedException e) {
+             e.printStackTrace();
+         }
+     }
+ }
+
+ /**
+  * Poisons the writer and waits for the writer thread to terminate.
+  * @return true if there was an error, false otherwise
+  */
+ public boolean poisonAndWait(){
+     poison();
+     waitForFinish();
+     final boolean hadError=errorState;
+     return hadError;
+ }
+
+ /**
+  * Places a buffer on the queue for the writer thread, blocking while the queue is full.
+  * An interrupt during the wait is reported and the put is retried until it succeeds.
+  * @param bb Buffer to queue
+  */
+ //TODO Why is this synchronized?
+ public synchronized void addJob(ByteBuilder bb){
+// System.err.println("Got job "+(j.list==null ? "null" : j.list.size()));
+
+ assert(started) : "Wait for start() to return before using the writer.";
+// while(!started || this.getState()==Thread.State.NEW){
+// try {
+// this.wait(20);
+// } catch (InterruptedException e) {
+// // TODO Auto-generated catch block
+// e.printStackTrace();
+// }
+// }
+
+ boolean success=false;
+ while(!success){
+ try {
+ queue.put(bb);
+ success=true;
+ } catch (InterruptedException e) {
+ //Interrupted while waiting for space; report it and try again.
+ e.printStackTrace();
+ assert(!queue.contains(bb)); //Hopefully it was not added.
+ }
+ }
+ }
+
+ /**
+  * Called after every write to the buffer.
+  * Queues the buffer and starts a fresh one when its length has reached maxLen,
+  * or when force is set and the buffer is not empty.
+  */
+ private final void flushBuffer(boolean force){
+     final int used=buffer.length();
+     final boolean full=(used>=maxLen);
+     final boolean forced=(force && used>0);
+     if(!full && !forced){return;}
+     addJob(buffer);
+     buffer=new ByteBuilder(initialLen);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Print ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /*
+  * All print overloads below follow the same pattern: optionally echo the value to stderr when
+  * verbose is set, assert that the writer is still open, append the value to the pending buffer
+  * with the ByteBuilder.append overload matching the argument type, and then call
+  * flushBuffer(false), which queues the buffer once its length has reached maxLen.
+  */
+
+ @Deprecated
+ /** Avoid using this if possible. */
+ public void print(CharSequence x){
+ if(verbose){System.err.println("Added line '"+x+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ @Deprecated
+ /** Avoid using this if possible. */
+ public void print(StringBuilder x){
+ if(verbose){System.err.println("Added line '"+x+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ @Deprecated
+ /** Avoid using this if possible. */
+ public void print(String x){
+ if(verbose){System.err.println("Added line '"+x+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends an int via ByteBuilder.append(int). */
+ public void print(int x){
+ if(verbose){System.err.println("Added line '"+(x)+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends a long via ByteBuilder.append(long). */
+ public void print(long x){
+ if(verbose){System.err.println("Added line '"+(x)+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends a float via ByteBuilder.append(float). */
+ public void print(float x){
+ if(verbose){System.err.println("Added line '"+(x)+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends a double via ByteBuilder.append(double). */
+ public void print(double x){
+ if(verbose){System.err.println("Added line '"+(x)+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends a single byte; the verbose and assertion messages show it as a char. */
+ public void print(byte x){
+ if(verbose){System.err.println("Added line '"+((char)x)+"'");}
+ assert(open) : ((char)x);
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends a single char. */
+ public void print(char x){
+ if(verbose){System.err.println("Added line '"+(x)+"'");}
+ assert(open) : (x);
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends the contents of a byte array. */
+ public void print(byte[] x){
+ if(verbose){System.err.println("Added line '"+new String(x)+"'");}
+ assert(open) : new String(x);
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends the contents of a char array. */
+ public void print(char[] x){
+ if(verbose){System.err.println("Added line '"+new String(x)+"'");}
+ assert(open) : new String(x);
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ /** Appends the contents of another ByteBuilder to the pending buffer. */
+ public void print(ByteBuilder x){
+ if(verbose){System.err.println("Added line '"+x+"'");}
+ assert(open) : x;
+ buffer.append(x);
+ flushBuffer(false);
+ }
+
+ public void print(ByteBuilder x, boolean destroy){
+ if(!destroy || buffer.length()>0){print(x);}
+ else{
+ if(verbose){System.err.println("Added line '"+x+"'");}
+ assert(open) : x;
+ addJob(x);
+ }
+ }
+
+ /**
+  * Appends a read to the pending buffer in the writer's output format, checked in the order
+  * FASTQ, FASTA, SAM, SITES, INFO, and falling back to Read.toText otherwise.
+  */
+ public void print(Read r){
+     assert(!OTHER);
+     if(FASTQ){
+         r.toFastq(buffer);
+     }else if(FASTA){
+         r.toFasta(FASTA_WRAP, buffer);
+     }else if(SAM){
+         r.toSam(buffer);
+     }else if(SITES){
+         r.toSitesB(buffer);
+     }else if(INFO){
+         r.toInfoB(buffer);
+     }else{
+         r.toText(true, buffer);
+     }
+     flushBuffer(false);
+ }
+
+ /** Appends a kmer and its count to the buffer via AbstractKmerTable.toBytes, then flushes if full. */
+ public void printKmer(long kmer, int count, int k){
+ AbstractKmerTable.toBytes(kmer, count, k, buffer);
+ flushBuffer(false);
+ }
+
+ /** Appends a kmer and its values to the buffer via AbstractKmerTable.toBytes, then flushes if full. */
+ public void printKmer(long kmer, int[] values, int k){
+ AbstractKmerTable.toBytes(kmer, values, k, buffer);
+ flushBuffer(false);
+ }
+
+ /** Appends an array-encoded kmer and its count via AbstractKmerTableU.toBytes, then flushes if full. */
+ public void printKmer(long[] array, int count, int k){
+ AbstractKmerTableU.toBytes(array, count, k, buffer);
+ flushBuffer(false);
+ }
+
+ /** Appends an array-encoded kmer and its values via AbstractKmerTableU.toBytes, then flushes if full. */
+ public void printKmer(long[] array, int[] values, int k){
+ AbstractKmerTableU.toBytes(array, values, k, buffer);
+ flushBuffer(false);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Println ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ //Each println variant prints its argument with the matching print method and then a '\n'.
+ public void println(){print('\n');}
+ public void println(CharSequence x){print(x); print('\n');}
+ public void println(String x){print(x); print('\n');}
+ public void println(StringBuilder x){print(x); print('\n');}
+ public void println(int x){print(x); print('\n');}
+ public void println(long x){print(x); print('\n');}
+ public void println(float x){print(x); print('\n');}
+ public void println(double x){print(x); print('\n');}
+ public void println(byte x){print(x); print('\n');}
+ public void println(char x){print(x); print('\n');}
+ public void println(byte[] x){print(x); print('\n');}
+ public void println(char[] x){print(x); print('\n');}
+ public void println(ByteBuilder x){print(x); print('\n');}
+ /** When destroy is set, the newline is appended to x itself, so x is modified. */
+ public void println(ByteBuilder x, boolean destroy){
+ if(destroy){print(x.append('\n'));}else{print(x); print('\n');}
+ }
+ public void printlnKmer(long kmer, int count, int k){printKmer(kmer, count, k); print('\n');}
+ public void printlnKmer(long kmer, int[] values, int k){printKmer(kmer, values, k); print('\n');}
+ public void printlnKmer(long[] array, int count, int k){printKmer(array, count, k); print('\n');}
+ public void printlnKmer(long[] array, int[] values, int k){printKmer(array, values, k); print('\n');}
+ public void println(Read r){print(r); print('\n');}
+
+
+ /** Prints a read on its own line and, when paired is set and a mate exists, the mate on the next line. */
+ public void println(Read r, boolean paired){
+     println(r);
+     if(!paired){return;}
+     if(r.mate==null){return;}
+     println(r.mate);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Pending output; replaced by a new ByteBuilder each time it is queued, and set to null by poison(). */
+ private ByteBuilder buffer;
+
+ /** Argument passed to the ByteBuilder constructor for each new buffer. */
+ public int initialLen=36000;
+ /** Buffer length at which flushBuffer() queues the buffer for writing. */
+ public int maxLen=32768;
+ public final boolean overwrite;
+ public final boolean append;
+ public final boolean allowSubprocess;
+ public final String fname;
+ private final OutputStream outstream;
+ /** Hand-off queue between callers and the writer thread; created with capacity 5. */
+ private final ArrayBlockingQueue<ByteBuilder> queue;
+ /** Cleared by poison() and by run() once the poison marker has been received. */
+ private boolean open=true;
+ /** Set by run(); start() and poison() wait for it. */
+ private volatile boolean started=false;
+
+ /** Error flag returned by poisonAndWait(). */
+ public boolean errorState=false;
+
+ /*--------------------------------------------------------------*/
+
+ //Output format flags copied from the FileFormat in the constructor.
+ private final boolean BAM;
+ private final boolean SAM;
+ private final boolean FASTQ;
+ private final boolean FASTA;
+ private final boolean BREAD;
+ private final boolean SITES;
+ private final boolean INFO;
+ /** True when none of the other format flags is set. */
+ private final boolean OTHER;
+
+ private final int FASTA_WRAP=Shared.FASTA_WRAP;
+
+ /*--------------------------------------------------------------*/
+
+// private static final ByteBuilder POISON=new ByteBuilder("POISON_ByteStreamWriter");
+ /** Sentinel job, compared by identity in run(), that tells the writer thread to stop. */
+ private static final ByteBuilder POISON2=new ByteBuilder(1);
+
+ /** Enables diagnostic messages on stderr. */
+ public static boolean verbose=false;
+
+}
diff --git a/current/fileIO/ChainBlock.java b/current/fileIO/ChainBlock.java
new file mode 100755
index 0000000..81bdda7
--- /dev/null
+++ b/current/fileIO/ChainBlock.java
@@ -0,0 +1,224 @@
+package fileIO;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import dna.Gene;
+
+/** For loading UCSC .chain files that convert one genome build to another. */
+public class ChainBlock implements Comparable<ChainBlock>{
+
+
+ /**
+  * Prints the chain lines of chromosomes 1 through 22 from a chain file, with a blank line
+  * after each chromosome.
+  * @param args args[0] is the path of the chain file
+  */
+ public static void main(String args[]){
+     ChainLine[][] lines=loadChainLines(args[0]);
+     for(int i=1; i<=22 && i<lines.length; i++){
+         //loadChainLines leaves the entry null for a chromosome without chain lines.
+         if(lines[i]!=null){
+             for(ChainLine line : lines[i]){
+                 System.out.println(line);
+             }
+         }
+         System.out.println();
+     }
+ }
+
+
+ public ChainBlock(List<String[]> list){
+
+ String[] head=list.get(0);
+ assert("chain".equals(head[0]));
+
+ score=Long.parseLong(head[1]);
+
+ tName=head[2];
+ tChrom=toChromosome(head[2]);
+ tSize=Integer.parseInt(head[3]);
+ tStrand=Gene.toStrand(head[4]);
+ tStart=Integer.parseInt(head[5]);
+ tStop=Integer.parseInt(head[6]);
+
+ qName=head[7];
+ qChrom=toChromosome(head[7]);
+ qSize=Integer.parseInt(head[8]);
+ qStrand=Gene.toStrand(head[9]);
+ qStart=Integer.parseInt(head[10]);
+ qStop=Integer.parseInt(head[11]);
+
+ chainID=Integer.parseInt(head[12]);
+
+ chunks=new int[list.size()-1][];
+ for(int i=1; i<list.size(); i++){
+ String[] line=list.get(i);
+ assert((i==list.size()-1) == (line.length==1));
+ assert((i!=list.size()-1) == (line.length==3));
+ chunks[i-1]=new int[line.length];
+ for(int j=0; j<line.length; j++){
+ chunks[i-1][j]=Integer.parseInt(line[j]);
+ }
+ }
+
+ }
+
+
+ /**
+  * Converts a sequence name to a chromosome number with Gene.toChromosome.
+  * If that throws any exception, the number for "U" is returned instead.
+  */
+ private int toChromosome(String s){
+     try{
+         return Gene.toChromosome(s);
+     }catch(Exception e){
+         return Gene.toChromosome("U");
+     }
+ }
+
+
+ /**
+  * Expands this block into one ChainLine per alignment row, with inclusive start and stop
+  * coordinates. Target coordinates always count up from tStart. Query coordinates count up
+  * from qStart on the plus strand and down from qStop-1 otherwise.
+  * @return One ChainLine per row of chunks, in row order
+  */
+ public ChainLine[] toLines(){
+     final ChainLine[] out=new ChainLine[chunks.length];
+     final boolean plus=(qStrand==Gene.PLUS);
+     //Direction in which query coordinates advance.
+     final int step=(plus ? 1 : -1);
+
+     int tloc=tStart;
+     int qloc=(plus ? qStart : qStop-1);
+     for(int i=0; i<chunks.length; i++){
+         final int[] chunk=chunks[i];
+         final int span=chunk[0]-1;
+         final int tloc2=tloc+span;
+         final int qloc2=qloc+step*span;
+         out[i]=new ChainLine(tChrom, tStrand, tloc, tloc2, qChrom, qStrand, qloc, qloc2);
+         if(chunk.length>1){
+             //Skip the gap that follows this row on each side.
+             tloc=tloc2+chunk[1]+1;
+             qloc=qloc2+step*(chunk[2]+1);
+         }
+     }
+     return out;
+ }
+
+
+ /**
+  * Loads a chain file and returns its chain lines grouped by target chromosome.
+  * @param fname Path of the chain file
+  * @return Array indexed by target chromosome number; each entry is sorted, and is null
+  * for a chromosome that has no chain lines
+  */
+ public static ChainLine[][] loadChainLines(String fname){
+     final ChainBlock[][] byChrom=splitChain(loadChainBlocks(fname));
+     final ChainLine[][] out=new ChainLine[byChrom.length][];
+     for(int chrom=0; chrom<byChrom.length; chrom++){
+         final ArrayList<ChainLine> collected=new ArrayList<ChainLine>();
+         for(ChainBlock block : byChrom[chrom]){
+             collected.addAll(Arrays.asList(block.toLines()));
+         }
+         if(!collected.isEmpty()){
+             out[chrom]=collected.toArray(new ChainLine[collected.size()]);
+             Arrays.sort(out[chrom]);
+         }
+     }
+     return out;
+ }
+
+
+ /**
+  * Reads a chain file and groups its lines into blocks. A block ends at each line that
+  * splits into exactly one field; lines after the last such line are not returned.
+  * @param fname Path of the chain file
+  * @return All blocks, sorted by their natural order
+  */
+ public static ArrayList<ChainBlock> loadChainBlocks(String fname){
+     final TextFile tf=new TextFile(fname, false, true);
+     final String[] lines=tf.toStringLines();
+     tf.close();
+     final String[][] text=TextFile.doublesplitWhitespace(lines, true);
+
+     final ArrayList<ChainBlock> out=new ArrayList<ChainBlock>();
+     final ArrayList<String[]> pending=new ArrayList<String[]>(40);
+     for(String[] line : text){
+         pending.add(line);
+         if(line.length!=1){continue;}
+         //A single-field line closes the current block.
+         out.add(new ChainBlock(pending));
+         pending.clear();
+     }
+     Collections.sort(out);
+     return out;
+ }
+
+
+ /**
+  * Distributes blocks into one array per target chromosome, keeping the list order.
+  * @param list Blocks to distribute
+  * @return Array with Gene.chromCodes.length entries, indexed by tChrom; entries may be empty
+  */
+ public static ChainBlock[][] splitChain(ArrayList<ChainBlock> list){
+     final int numChroms=Gene.chromCodes.length;
+
+     //First pass: count the blocks of each chromosome.
+     final int[] counts=new int[numChroms];
+     for(ChainBlock cb : list){counts[cb.tChrom]++;}
+
+     final ChainBlock[][] out=new ChainBlock[numChroms][];
+     for(int chrom=0; chrom<numChroms; chrom++){
+         out[chrom]=new ChainBlock[counts[chrom]];
+     }
+
+     //Second pass: fill each array using a per-chromosome cursor.
+     final int[] next=new int[numChroms];
+     for(ChainBlock cb : list){
+         out[cb.tChrom][next[cb.tChrom]]=cb;
+         next[cb.tChrom]++;
+     }
+     return out;
+ }
+
+
+ /** Orders blocks by target chromosome, then target name, then target start, then target stop. */
+ @Override
+ public int compareTo(ChainBlock other) {
+     if(tChrom!=other.tChrom){return tChrom-other.tChrom;}
+
+     final int byName=tName.compareTo(other.tName);
+     if(byName!=0){return byName;}
+
+     assert(tStrand==other.tStrand);
+
+     if(tStart!=other.tStart){return tStart-other.tStart;}
+     return tStop-other.tStop;
+ }
+
+
+ //Header fields, assigned by the constructor from the "chain" line in the order listed below.
+ public long score;
+ public String tName;
+ /** Chromosome number derived from tName by toChromosome(). */
+ public int tChrom;
+ public int tSize;
+ public byte tStrand;
+ public int tStart;
+ public int tStop;
+
+ public String qName;
+ /** Chromosome number derived from qName by toChromosome(). */
+ public int qChrom;
+ public int qSize;
+ public byte qStrand;
+ public int qStart;
+ public int qStop;
+
+ public int chainID;
+
+ /**
+  * Alignment rows. Every row but the last has 3 values and the last has 1. toLines() uses
+  * value 0 as the aligned length and values 1 and 2 as the following target and query gaps.
+  */
+ public int[][] chunks;
+
+ //Example header line:
+ //chain 3303 chr1 247249719 + 13192499 13192587 chr1 249250621 - 236203315 236203403 109
+
+ //Header fields (tEnd and qEnd are stored in tStop and qStop):
+// * score -- chain score
+// * tName -- chromosome (reference sequence)
+// * tSize -- chromosome size (reference sequence)
+// * tStrand -- strand (reference sequence)
+// * tStart -- alignment start position (reference sequence)
+// * tEnd -- alignment end position (reference sequence)
+// * qName -- chromosome (query sequence)
+// * qSize -- chromosome size (query sequence)
+// * qStrand -- strand (query sequence)
+// * qStart -- alignment start position (query sequence)
+// * qEnd -- alignment end position (query sequence)
+// * id -- chain ID
+
+}
diff --git a/current/fileIO/ChainLine.java b/current/fileIO/ChainLine.java
new file mode 100755
index 0000000..c1cc983
--- /dev/null
+++ b/current/fileIO/ChainLine.java
@@ -0,0 +1,123 @@
+package fileIO;
+
+import dna.Data;
+import dna.Gene;
+
+public class ChainLine implements Comparable<ChainLine> {
+
+
+ /**
+  * Translates locations on one chromosome using the hg18ToHg19 chain file under
+  * Data.ROOT_CHAIN, printing one result line per location.
+  * @param args args[0] is the chromosome; every further argument is an integer location
+  */
+ public static void main(String[] args){
+     final int chrom=Gene.toChromosome(args[0]);
+     final ChainLine[][] lines=ChainBlock.loadChainLines(Data.ROOT_CHAIN+"hg18ToHg19.over.chain");
+
+     int argIndex=1;
+     while(argIndex<args.length){
+         final int loc=Integer.parseInt(args[argIndex]);
+         final int[] result=translate(loc, lines[chrom]);
+         System.out.print(chrom+"\t+\t"+loc+"\t->\t");
+         final String mapped=(result==null ? "null" : result[0]+"\t"+Gene.strandCodes[result[1]]+"\t"+result[2]);
+         System.out.println(mapped);
+         argIndex++;
+     }
+ }
+
+
+ /**
+  * Creates a line that maps a target interval onto a query interval.
+  * The T parameters fill the target fields and the Q parameters fill the query fields.
+  */
+ public ChainLine(int chromT, byte strandT, int startT, int stopT, int chromQ, byte strandQ, int startQ, int stopQ){
+ tChrom=chromT;
+ tStrand=strandT;
+ tStart=startT;
+ tStop=stopT;
+
+ qChrom=chromQ;
+ qStrand=strandQ;
+ qStart=startQ;
+ qStop=stopQ;
+ }
+
+
+ /** @return the target fields followed by the query fields, tab-delimited, with strands shown via Gene.strandCodes */
+ public String toString(){
+     final String target=tChrom+"\t"+Gene.strandCodes[tStrand]+"\t"+tStart+"\t"+tStop;
+     final String query=qChrom+"\t"+Gene.strandCodes[qStrand]+"\t"+qStart+"\t"+qStop;
+     return target+"\t"+query;
+ }
+
+
+ /**
+  * Finds the line whose target interval contains loc.
+  * @param loc Target location
+  * @param array Lines sorted by target position; may be null, which
+  * ChainBlock.loadChainLines uses for a chromosome without lines
+  * @return Index of a line containing loc, or -1 if there is none or array is null
+  */
+ public static int binarySearch(int loc, ChainLine[] array){
+     if(array==null){return -1;}
+     return binarySearch(loc, array, 0, array.length-1);
+ }
+
+
+ public static int binarySearch(int loc, ChainLine[] array, int first, int last){
+// if(first>=last){
+// if(first>last){return -1;}
+// assert(first==last && first<array.length);
+// return (array[first].tStart<=loc && array[first].tStop>=loc) ? first : -1;
+// }
+// System.out.println("BinarySearch "+loc+", "+first+", "+last);
+ if(first>last){return -1;}
+ int mid=(first+last)/2;
+ ChainLine midcl=array[mid];
+// System.out.println("mid = "+midcl);
+ if(loc<midcl.tStart){return binarySearch(loc, array, first, mid-1);}
+ else if(loc>midcl.tStop){return binarySearch(loc, array, mid+1, last);}
+ return mid;
+ }
+
+ /**
+  * Translates a target location using the line in array that contains it.
+  * Returns {chrom, strand, loc}, or null if no line contains loc or the line cannot translate it.
+  */
+ public static int[] translate(int loc, ChainLine[] array){
+     final int index=binarySearch(loc, array);
+     return (index<0 ? null : array[index].translate(loc));
+ }
+
+ /**
+  * Translates a target location to the query side of this line.
+  * @param loc Target location
+  * @return {qChrom, qStrand, query location}, or null if loc lies outside [tStart, tStop]
+  * or qChrom is outside the range 1 to 25
+  */
+ public int[] translate(int loc){
+     final boolean covered=(loc>=tStart && loc<=tStop);
+     if(!covered){return null;}
+     final boolean chromInRange=(qChrom>=1 && qChrom<=25);
+     if(!chromInRange){return null;}
+
+     final int offset=loc-tStart;
+     if(qStrand!=Gene.PLUS){
+         //On the other strand the query coordinate decreases as the target coordinate increases.
+         assert(qStart>=qStop) : this;
+         return new int[] {qChrom, qStrand, qStart-offset};
+     }
+     return new int[] {qChrom, qStrand, qStart+offset};
+ }
+
+
+ /** @return true if the range a..b (b must not be less than a) lies entirely within [tStart, tStop] */
+ public boolean contains(int a, int b){
+     assert(b>=a);
+     if(a<tStart){return false;}
+     return b<=tStop;
+ }
+
+
+ /** @return true if a lies within [tStart, tStop] */
+ public boolean contains(int a){
+     return !(a<tStart || a>tStop);
+ }
+
+
+ /** Orders lines by target chromosome, then target start, then target stop. */
+ @Override
+ public int compareTo(ChainLine other) {
+     if(tChrom!=other.tChrom){return tChrom-other.tChrom;}
+
+     assert(tStrand==other.tStrand);
+
+     if(tStart!=other.tStart){return tStart-other.tStart;}
+     return tStop-other.tStop;
+ }
+
+ //Target side; contains() and translate() treat [tStart, tStop] as an inclusive interval.
+ public int tChrom;
+ public byte tStrand;
+ public int tStart;
+ public int tStop;
+
+ //Query side; translate() asserts qStart>=qStop when qStrand is not Gene.PLUS.
+ public int qChrom;
+ public byte qStrand;
+ public int qStart;
+ public int qStop;
+
+}
diff --git a/current/fileIO/CompressFiles.java b/current/fileIO/CompressFiles.java
new file mode 100755
index 0000000..d9ab67c
--- /dev/null
+++ b/current/fileIO/CompressFiles.java
@@ -0,0 +1,71 @@
+package fileIO;
+
+import java.io.File;
+
+
+/**
+ * Command-line utility that walks files and directories and processes each regular file that
+ * is not already compressed. While PRINT_7Z_BATCH is set it only prints a 7-Zip command line
+ * per file; otherwise it copies the file to a ".zip" or ".gz" sibling with ReadWrite.copyFile.
+ */
+public class CompressFiles {
+
+
+    /**
+     * @param args Each argument is either a mode switch ("zip", "gzip" or "gz", case-insensitive),
+     * which applies to the paths that follow it, or a file or directory path to process
+     */
+    public static void main(String[] args){
+        for(String s : args){
+            if(s.equalsIgnoreCase("zip")){
+                zip=true;
+                gzip=false;
+            }else if(s.equalsIgnoreCase("gzip") || s.equalsIgnoreCase("gz")){
+                zip=false;
+                gzip=true;
+            }else{
+                compressFiles(s);
+            }
+        }
+    }
+
+
+    /** Processes the file or directory tree at the given path. */
+    public static void compressFiles(String path){
+        File f=new File(path);
+        compressFiles(f);
+    }
+
+    /** Recurses into directories and passes every other path to compress(). */
+    public static void compressFiles(File path){
+
+        if(path.isDirectory()){
+            File[] array=path.listFiles();
+            //listFiles() returns null when the directory cannot be listed.
+            if(array==null){
+                System.err.println("Unable to list directory "+path.getAbsolutePath());
+                return;
+            }
+            for(File f : array){compressFiles(f);}
+        }else{
+            compress(path);
+        }
+
+    }
+
+    /**
+     * Processes a single file. Files ending in .gz, .zip or .bz2 are skipped, as are files
+     * whose path contains "familytree" (case-insensitive).
+     */
+    public static void compress(File in){
+        assert(in.exists());
+        assert(in.isFile());
+        String abs=in.getAbsolutePath();
+        if(abs.endsWith(".gz") || abs.endsWith(".zip") || abs.endsWith(".bz2")){return;}
+
+        System.err.println(abs);
+        if(abs.toLowerCase().contains("familytree")){return;} //TODO ***TEMPORARY***
+
+        if(PRINT_7Z_BATCH){
+            //-mx=4 is fast; -mx=5 or 6 is slow; 7+ is very slow.
+            //NOTE(review): the printed command always uses a ".gz" target, regardless of the zip flag - confirm intended.
+            System.out.println("C:\\\"Program Files\"\\7-Zip\\7z a -mx=4 "+abs+".gz "+abs);
+        }else{
+            System.out.println("Compressing "+abs+" to "+(zip ? "zip" : "gz"));
+            ReadWrite.copyFile(abs, abs+(zip ? ".zip" : ".gz"));
+        }
+
+    }
+
+
+    /** Selects ".zip" (true) or ".gz" (false) as the output extension. */
+    public static boolean zip=true;
+    public static boolean gzip=!zip;
+
+    /** When true, compress() only prints 7-Zip commands instead of compressing. */
+    public static boolean PRINT_7Z_BATCH=true;
+
+}
diff --git a/current/fileIO/CopyFile.java b/current/fileIO/CopyFile.java
new file mode 100755
index 0000000..6c5df46
--- /dev/null
+++ b/current/fileIO/CopyFile.java
@@ -0,0 +1,114 @@
+package fileIO;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.zip.ZipOutputStream;
+
+import align2.ReadStats;
+import align2.Tools;
+
+import dna.Parser;
+import dna.Timer;
+
+
+/**
+ * Unlike ReadWrite's version, this one forces compression and decompression even with same extensions.
+ * Mainly for benchmarking.
+ * @author Brian Bushnell
+ * @date Jan 23, 2013
+ *
+ */
+public class CopyFile {
+
+ /**
+  * Copies one file and reports the elapsed time and speed on stderr.
+  * @param args key=value arguments (in, out, append/app, overwrite/ow, plus those accepted by
+  * Parser); alternatively the first two arguments may be bare input and output file names
+  */
+ public static void main(String[] args){
+
+     String in=null, out=null;
+     boolean overwrite=true;
+     boolean append=false;
+
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         final String a=split[0].toLowerCase();
+         String b=(split.length>1 ? split[1] : null);
+         if("null".equalsIgnoreCase(b)){b=null;}
+         final boolean bare=!arg.contains("=");
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(a.equals("in")){
+             in=b;
+         }else if(a.equals("out")){
+             out=b;
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+         }else if(in==null && i==0 && bare){
+             in=arg;
+         }else if(out==null && i==1 && bare){
+             out=arg;
+         }
+     }
+     assert(in!=null && out!=null);
+
+     final long bytes=new File(in).length();
+     final Timer timer=new Timer();
+     copyFile(in, out, false, overwrite);
+     timer.stop();
+     final double speed=bytes*1000d/timer.elapsed;
+     System.err.println("Time: \t"+timer);
+     System.err.println(String.format("Speed: \t%.2f MB/s", speed));
+ }
+
+
+ /**
+  * Copies source to dest through the streams supplied by ReadWrite.
+  * Both streams are closed even when the copy fails part-way.
+  * @param source Input file name
+  * @param dest Output file name
+  * @param createPathIfNeeded Create missing parent directories of dest
+  * @param overwrite Only checked by an assertion: dest must not exist unless this is true
+  * @throws RuntimeException wrapping any IOException raised while copying
+  */
+ public static synchronized void copyFile(String source, String dest, boolean createPathIfNeeded, boolean overwrite){
+
+     assert(overwrite || !new File(dest).exists()) : "Destination file already exists: "+dest;
+     if(createPathIfNeeded){
+         File parent=new File(dest).getParentFile();
+         if(parent!=null && !parent.exists()){
+             parent.mkdirs();
+         }
+     }
+
+     InputStream in=null;
+     OutputStream out=null;
+     try{
+         in=ReadWrite.getInputStream(source, false, true);
+         out=ReadWrite.getOutputStream(dest, false, false, true);
+
+         final byte[] buffer=new byte[16384];
+         int len;
+
+         while((len = in.read(buffer)) > 0){
+             out.write(buffer, 0, len);
+         }
+
+         in.close();
+         in=null;
+         out.flush();
+         if(out.getClass()==ZipOutputStream.class){
+             ZipOutputStream zos=(ZipOutputStream)out;
+             zos.closeEntry();
+             zos.finish();
+         }
+         out.close();
+         out=null;
+
+     }catch(IOException e){
+         //FileNotFoundException is an IOException, so it is wrapped here as well.
+         throw new RuntimeException(e);
+     }finally{
+         //Only reached with open streams on the failure path; the original exception takes precedence.
+         if(in!=null){
+             try{in.close();}catch(IOException e){/* ignored: already failing */}
+         }
+         if(out!=null){
+             try{out.close();}catch(IOException e){/* ignored: already failing */}
+         }
+     }
+ }
+
+}
diff --git a/current/fileIO/CopyFiles.java b/current/fileIO/CopyFiles.java
new file mode 100755
index 0000000..071f43e
--- /dev/null
+++ b/current/fileIO/CopyFiles.java
@@ -0,0 +1,64 @@
+package fileIO;
+
+import java.io.File;
+
+import dna.Data;
+
+
+/**
+ * Command-line utility that walks files and directories and, for every file named
+ * "chr*.txt", writes a copy next to it with the extension changed to ".flow".
+ * Despite the method names, files are copied with ReadWrite.copyFile and not renamed.
+ */
+public class CopyFiles {
+
+
+    /** @param args File or directory paths to process */
+    public static void main(String[] args){
+        for(String s : args){
+            renameFiles(s);
+        }
+    }
+
+
+    /** Processes the file or directory tree at the given path. */
+    public static void renameFiles(String path){
+        File f=new File(path);
+        renameFiles(f);
+    }
+
+    /** Recurses into directories and passes every other path to rename(). */
+    public static void renameFiles(File path){
+
+        if(path.isDirectory()){
+            File[] array=path.listFiles();
+            //listFiles() returns null when the directory cannot be listed.
+            if(array==null){
+                System.err.println("Unable to list directory "+path.getAbsolutePath());
+                return;
+            }
+            for(File f : array){renameFiles(f);}
+        }else{
+            rename(path);
+        }
+
+    }
+
+    /**
+     * Copies a file whose name starts with "chr" and ends with ".txt" to the same path with a
+     * ".flow" extension. Other files are ignored.
+     */
+    public static void rename(File in){
+        assert(in.exists());
+        assert(in.isFile());
+        final String abs=in.getAbsolutePath();
+
+        //File.getName() is separator-independent, unlike splitting the path on '/'.
+        final String fname=in.getName();
+
+        if(fname.startsWith("chr") && fname.endsWith(".txt")){
+
+            //Replace only the trailing extension, so a ".txt" elsewhere in the path is left untouched.
+            final String out=abs.substring(0, abs.length()-".txt".length())+".flow";
+            assert(!out.equals(abs)) : out+", "+abs;
+
+            System.out.println("Renaming "+abs+" to "+out);
+            ReadWrite.copyFile(abs, out);
+        }
+    }
+
+}
diff --git a/current/fileIO/CopyFiles2.java b/current/fileIO/CopyFiles2.java
new file mode 100755
index 0000000..3d0bb2e
--- /dev/null
+++ b/current/fileIO/CopyFiles2.java
@@ -0,0 +1,162 @@
+package fileIO;
+
+import java.io.File;
+
+import dna.Data;
+import dna.Timer;
+
+
+public class CopyFiles2 {
+
+
+ /**
+  * Copies every input root into the output root and prints the elapsed time.
+  * @param args Either empty, which keeps the built-in inRoots and outRoot, or exactly
+  * {input root, output root}
+  */
+ public static void main(String[] args){
+
+     final Timer timer=new Timer();
+
+     if(args.length>0){
+         assert(args.length==2);
+         inRoots=new String[] {args[0]};
+         outRoot=args[1];
+     }
+
+     final String[] roots=inRoots;
+     for(int i=0; i<roots.length; i++){
+         copyFiles(roots[i], outRoot);
+     }
+
+     timer.stop();
+     System.out.println("Time:\t"+timer);
+ }
+
+
+ /** Path-based convenience overload of copyFiles(File, File). */
+ public static void copyFiles(String in, String out){
+     copyFiles(new File(in), new File(out));
+ }
+
+ /**
+  * Recursively copies in to out. Paths matching any badNames pattern are skipped, and an
+  * "\ASM" component is removed from the destination path. Directories are created as
+  * needed; regular files are passed to copyFile(File, File).
+  * @param in Source file or directory
+  * @param out Destination file or directory
+  */
+ public static void copyFiles(File in, File out){
+
+     String abs=in.getAbsolutePath();
+     for(String s : badNames){
+         if(abs.matches(s)){
+             return;
+         }
+     }
+
+     {
+         String temp=out.getAbsolutePath();
+         if(temp.endsWith("\\ASM")){
+             temp=temp.replace("\\ASM", "");
+         }else if(temp.contains("\\ASM\\")){
+             //NOTE(review): this also removes the separator after "ASM", which joins the neighbouring path components - confirm intended.
+             temp=temp.replace("\\ASM\\", "");
+         }
+         out=new File(temp);
+     }
+
+     if(in.isDirectory()){
+         if(!out.exists()){
+             out.mkdir();
+         }
+
+         File[] array=in.listFiles();
+         //listFiles() returns null when the directory cannot be listed.
+         if(array==null){
+             System.err.println("Unable to list directory "+abs);
+             return;
+         }
+         for(File f : array){
+             String outname=out.getAbsolutePath()+"\\"+f.getName();
+
+             File f2=new File(outname);
+             copyFiles(f, f2);
+         }
+     }
+
+     else{
+         copyFile(in, out);
+     }
+
+ }
+
+ /**
+  * Copies a single file if it is wanted. Nothing is copied when out already exists, when the
+  * file name matches a badNames pattern, or when neither the path contains one of dirNames
+  * nor the file name matches one of fileNames. Sources ending in ".tsv" are written to
+  * out with ".zip" appended.
+  */
+ public static void copyFile(File in, File out){
+     assert(in.exists());
+     assert(in.isFile());
+
+     if(out.exists()){
+         System.out.println("Skipping existing file "+out.getAbsolutePath());
+         return;
+     }
+
+     final String abs=in.getAbsolutePath();
+     final String fname=in.getName();
+
+     for(String pattern : badNames){
+         if(fname.matches(pattern)){return;}
+     }
+
+     //Wanted if the path contains a listed directory or the name matches a listed pattern.
+     boolean wanted=false;
+     for(int i=0; i<dirNames.length && !wanted; i++){
+         wanted=abs.contains(dirNames[i]);
+     }
+     for(int i=0; i<fileNames.length && !wanted; i++){
+         wanted=fname.matches(fileNames[i]);
+     }
+     if(!wanted){return;}
+
+     final File target=(abs.endsWith(".tsv") ? new File(out.getAbsolutePath()+".zip") : out);
+
+     System.out.println("Copying file to "+target.getAbsolutePath());
+     ReadWrite.copyFile(in.getAbsolutePath(), target.getAbsolutePath());
+ }
+
+ //Default source roots and destination root; main() replaces them when two arguments are given.
+// public static String[] inRoots={"F:\\UTSW_batch_1\\", "F:\\UTSW_batch_2\\"};
+ public static String[] inRoots={"F:\\UTSW_second_set\\"};
+ public static String outRoot="C:\\Data\\OCT_8\\";
+
+ /** A file is copied if its absolute path contains any of these substrings. */
+ public static final String[] dirNames={"\\CNV\\", "\\SV\\"};
+
+ //Full-path versions of the fileNames patterns. NOTE(review): no use is visible in this class.
+ public static final String[] fileNamesAbsolute={
+ ".*\\\\gene-GS.+-ASM.*\\.tsv.*",
+ ".*\\\\geneVarSummary-GS.+-ASM.*\\.tsv.*",
+ ".*\\\\summary-GS.+-ASM.*\\.tsv.*",
+ ".*\\\\var-GS.+-ASM.*\\.tsv.*",
+ ".*\\\\manifest\\.all",
+ ".*\\\\README\\..*",
+ ".*\\\\version",
+ };
+
+ /** Regular expressions; a file is copied if its name matches any of them. */
+ public static final String[] fileNames={
+ "gene-GS.+-ASM.*\\.tsv.*",
+ "geneVarSummary-GS.+-ASM.*\\.tsv.*",
+ "summary-GS.+-ASM.*\\.tsv.*",
+ "var-GS.+-ASM.*\\.tsv.*",
+ "manifest\\.all",
+ "README\\..*",
+ "version",
+ };
+
+ /** Regular expressions; a path or file name matching any of them is never copied. */
+ public static final String[] badNames={
+ ".*AppleDouble.*",
+ ".*DS_Store.*",
+ ".*EVIDENCE.*"
+ };
+
+
+}
diff --git a/current/fileIO/FileFormat.java b/current/fileIO/FileFormat.java
new file mode 100755
index 0000000..a2ee3a4
--- /dev/null
+++ b/current/fileIO/FileFormat.java
@@ -0,0 +1,667 @@
+package fileIO;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Tools;
+
+import dna.Gene;
+import dna.Parser;
+
+/**
+ * @author Brian Bushnell
+ * @date Dec 19, 2012
+ *
+ */
+public final class FileFormat {
+
+ /**
+  * Command-line entry point: prints the detected format of each file argument.
+  * <p>
+  * Recognised arguments: JVM flags (ignored), {@code verbose=<bool>},
+  * {@code in...=<file>} and bare file names. Leading dashes are stripped from the key.
+  *
+  * @param args Command-line arguments.
+  */
+ public static void main(String[] args){
+     stream.FASTQ.warnQualityChange=false;
+     PRINT_WARNING=false;
+     for(int i=0; i<args.length; i++){
+
+         final String arg=args[i];
+         if(arg==null || arg.length()<1){continue;} //Nothing to test
+
+         String[] split=arg.split("=");
+         //split() drops trailing empty strings, so "=" yields a zero-length array.
+         String a=(split.length>0 ? split[0] : "").toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if("null".equalsIgnoreCase(b)){b=null;}
+         //The length test keeps charAt(0) from throwing once the key has been reduced to "" (e.g. "-" or "=x").
+         while(a.length()>0 && a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+         }else if(a!=null && b!=null){
+             assert(a.startsWith("in")) : "Unknown parameter "+arg;
+             test(b, true);
+         }else{
+             test(arg, true);
+         }
+     }
+
+ }
+
+ /**
+  * Prints a one-line, tab-delimited report for a file: quality encoding, format,
+  * compression, interleaving, and (when known) read length. If the format implied by
+  * the name differs from the format found by reading the file, both are reported.
+  *
+  * @param fname File to examine; a null name prints "null".
+  * @param forceFileRead Not read by this method (contents are always examined).
+  */
+ private static void test(String fname, boolean forceFileRead){
+     if(fname==null){
+         //testInput returns null for a null name; report it instead of dereferencing.
+         System.out.println("null");
+         return;
+     }
+     FileFormat ffName=testInput(fname, FASTQ, null, false, false, false);
+     FileFormat ffContent=testInput(fname, ffName.format(), null, false, true, true);
+     FileFormat ff=ffContent;
+//     assert(false) : ffName+"\n"+ffContent;
+     if(ff==null){
+         System.out.println("null");
+     }else{
+         int q=33;
+         int len=-1;
+         boolean i=false;
+         if(ff.fastq()){
+             byte qold=stream.FASTQ.ASCII_OFFSET;
+             stream.FASTQ.ASCII_OFFSET=33;
+             try{
+                 int[] qi=testInterleavedAndQuality(fname, false);
+                 q=qi[0];
+                 i=(qi[1]==INTERLEAVED);
+                 len=qi[2];
+             }finally{
+                 //Restore the global offset even if detection throws.
+                 stream.FASTQ.ASCII_OFFSET=qold;
+             }
+         }else if(ff.fasta()){
+             i=stream.FASTQ.testInterleavedFasta(fname, false);
+         }
+         String qs=(q==33 ? "sanger" : q==64 ? "illumina" : ""+q);
+         System.out.print(qs+"\t"+FORMAT_ARRAY[ff.format()]+"\t"+COMPRESSION_ARRAY[ff.compression()]+"\t"+(i ? "interleaved" : "single-ended"));
+         if(len>0){System.out.print("\t"+len+"bp");}
+         if(ffName.format()!=ff.format()){System.out.print("\t"+FORMAT_ARRAY[ffName.format()]+"\t(File extension differs from contents)");}
+         System.out.println();
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Convenience factory for an input FileFormat: uses FASTQ as the default format
+  * and permits reading the file.
+  *
+  * @param fname Input name; null yields null.
+  * @param overrideExtension Optional extension whose format/compression replace the detected ones.
+  * @param allowSubprocess Stored on the resulting FileFormat.
+  * @return The FileFormat, or null if fname is null.
+  */
+ public static FileFormat testInput(String fname, String overrideExtension, boolean allowSubprocess){
+     if(verbose){
+         System.err.println("testInputA("+fname+", "+overrideExtension+", "+allowSubprocess+")");
+     }
+     final FileFormat result=testInput(fname, FASTQ, overrideExtension, allowSubprocess, true);
+     return result;
+ }
+
+ /**
+  * Factory for an input FileFormat that never forces a file read
+  * (delegates with forceFileRead=false).
+  *
+  * @param fname Input name; null yields null.
+  * @param defaultFormat Format to fall back on when none is detected.
+  * @param overrideExtension Optional extension whose format/compression replace the detected ones.
+  * @param allowSubprocess Stored on the resulting FileFormat.
+  * @param allowFileRead Whether the file may be read to detect its format.
+  * @return The FileFormat, or null if fname is null.
+  */
+ public static FileFormat testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){
+     if(verbose){
+         System.err.println("testInputB("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+")");
+     }
+     final boolean forceFileRead=false;
+     return testInput(fname, defaultFormat, overrideExtension, allowSubprocess, allowFileRead, forceFileRead);
+ }
+
+ /**
+  * Factory for an input FileFormat. Translates an optional override extension into
+  * numeric format/compression overrides (via testFormat, without reading any file)
+  * and delegates to the int-based overload. A RAW compression in the override is
+  * treated as "no compression override".
+  *
+  * @param fname Input name; null yields null.
+  * @param defaultFormat Format to fall back on when none is detected.
+  * @param overrideExtension Optional extension; null or empty means no override.
+  * @param allowSubprocess Stored on the resulting FileFormat.
+  * @param allowFileRead Whether the file may be read to detect its format.
+  * @param forceFileRead Whether the file should be read even if the name determines a format.
+  * @return The FileFormat, or null if fname is null.
+  */
+ public static FileFormat testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead){
+     if(verbose){System.err.println("testInputC("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+", "+forceFileRead+")");}
+     if(fname==null){return null;}
+
+     final boolean hasOverride=(overrideExtension!=null && overrideExtension.length()>0);
+     final int[] parsed=(hasOverride ? testFormat(overrideExtension, false, false) : null);
+     final int overrideFormat=(parsed==null ? 0 : parsed[0]);
+     final int overrideCompression=((parsed==null || parsed[1]==RAW) ? 0 : parsed[1]);
+
+     return testInput(fname, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, allowFileRead, forceFileRead);
+ }
+
+ /**
+  * Factory for an input FileFormat with numeric overrides; constructs the object in READ mode
+  * with overwrite, append and ordered all false.
+  *
+  * @param fname Input name; null yields null.
+  * @param defaultFormat Format to fall back on when none is detected.
+  * @param overrideFormat If positive, replaces the detected format.
+  * @param overrideCompression If positive, replaces the detected compression.
+  * @param allowSubprocess Stored on the resulting FileFormat.
+  * @param allowFileRead Whether the file may be read to detect its format.
+  * @param forceFileRead Whether the file should be read even if the name determines a format.
+  * @return The FileFormat, or null if fname is null.
+  */
+ public static FileFormat testInput(String fname, int defaultFormat, int overrideFormat,
+         int overrideCompression, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead){
+     if(verbose){
+         System.err.println("testInputD("+fname+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess+", "+allowFileRead+", "+forceFileRead+")");
+     }
+     return (fname==null) ? null
+             : new FileFormat(fname, READ, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, allowFileRead, forceFileRead, false, false, false);
+ }
+
+ /**
+  * Factory for an output FileFormat. Translates an optional override extension into
+  * numeric format/compression overrides (via testFormat, without reading any file)
+  * and delegates to the int-based overload. A RAW compression in the override is
+  * treated as "no compression override".
+  *
+  * @param fname Output name; null yields null.
+  * @param defaultFormat Format to fall back on when the name does not determine one.
+  * @param overrideExtension Optional extension; null or empty means no override.
+  * @param allowSubprocess Stored on the resulting FileFormat.
+  * @param overwrite Stored overwrite flag.
+  * @param append Stored append flag.
+  * @param ordered Stored ordered flag.
+  * @return The FileFormat, or null if fname is null.
+  */
+ public static FileFormat testOutput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered){
+     if(fname==null){return null;}
+
+     final boolean hasOverride=(overrideExtension!=null && overrideExtension.length()>0);
+     final int[] parsed=(hasOverride ? testFormat(overrideExtension, false, false) : null);
+     final int overrideFormat=(parsed==null ? 0 : parsed[0]);
+     final int overrideCompression=((parsed==null || parsed[1]==RAW) ? 0 : parsed[1]);
+
+     return testOutput(fname, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, overwrite, append, ordered);
+ }
+
+ /**
+  * Factory for an output FileFormat with numeric overrides; constructs the object in WRITE mode
+  * with file reading disabled.
+  *
+  * @param fname Output name; null yields null.
+  * @param defaultFormat Format to fall back on when the name does not determine one.
+  * @param overrideFormat If positive, replaces the detected format.
+  * @param overrideCompression If positive, replaces the detected compression.
+  * @param allowSubprocess Stored on the resulting FileFormat.
+  * @param overwrite Stored overwrite flag.
+  * @param append Stored append flag.
+  * @param ordered Stored ordered flag.
+  * @return The FileFormat, or null if fname is null.
+  */
+ public static FileFormat testOutput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered){
+     return (fname==null) ? null
+             : new FileFormat(fname, WRITE, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, false, false, overwrite, append, ordered);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Constructor ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Builds an immutable description of one input or output stream.
+ * Detection is delegated to testFormat; overrides are applied afterwards.
+ *
+ * @param fname File or stream name; trimmed, with backslashes converted to forward slashes.
+ * @param mode_ READ or WRITE.
+ * @param defaultFormat Format used when detection yields UNKNOWN and no override is given.
+ * @param overrideFormat If positive, replaces the detected format.
+ * @param overrideCompression If positive, replaces the detected compression.
+ * @param allowSubprocess_ Stored; exposed by allowSubprocess().
+ * @param allowFileRead Permit testFormat to read the file; only honored when mode_ is READ.
+ * @param forceFileRead Passed to testFormat; asserted to imply allowFileRead.
+ * @param overwrite_ Stored overwrite flag; cleared when append_ is also true.
+ * @param append_ Stored append flag.
+ * @param ordered_ Stored; exposed by ordered().
+ */
+ private FileFormat(String fname, int mode_, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess_,
+ boolean allowFileRead, boolean forceFileRead, boolean overwrite_, boolean append_, boolean ordered_){
+// , boolean interleaved_, long maxReads_){
+
+ if(verbose){
+// new Exception().printStackTrace(System.err);
+ System.err.println("FileFormat(fname="+fname+", mode="+mode_+", dFormat="+defaultFormat+", oFormat="+overrideFormat+", oCompression="+overrideCompression+
+ ", allowSub="+allowSubprocess_+", allowRead="+allowFileRead+", forceFileRead="+forceFileRead+
+ ", ow="+overwrite_+", append="+append_+", ordered="+ordered_+")");
+ }
+ assert(!forceFileRead || allowFileRead);
+
+// assert(!overwrite_ || !append_) : "Both overwrite and append may not be set to true.";
+ // When both are requested, append wins.
+ if(overwrite_ && append_){overwrite_=false;}
+
+ assert(fname!=null);
+ fname=fname.trim().replace('\\', '/');
+ assert(fname.trim().length()>0) : fname;
+
+ // Without a usable default, fall back to fastq unless the caller insists on reading the file.
+ if(defaultFormat<1 && !forceFileRead){defaultFormat=FQ;}
+ // Output files are never read for detection.
+ allowFileRead&=(mode_==READ);
+ int[] a=testFormat(fname, allowFileRead, forceFileRead);
+
+ if(verbose){System.err.println(Arrays.toString(a));}
+
+ if(a[0]==UNKNOWN && overrideFormat<1){
+ a[0]=defaultFormat;
+ if(defaultFormat!=TEXT && PRINT_WARNING){
+ // NOTE(review): fname was already dereferenced above, so the null branch of this ternary cannot be taken.
+ System.err.println("Unspecified format for "+(mode_==READ ? "input" : "output")+" "+(fname==null ? "stream" : fname)+"; defaulting to "+FORMAT_ARRAY[a[0]]+".");
+ }
+ }
+ if(verbose){System.err.println(Arrays.toString(a));}
+
+ // Explicit overrides take precedence over anything detected or defaulted.
+ if(overrideFormat>0){a[0]=overrideFormat;}
+ if(overrideCompression>0){a[1]=overrideCompression;}
+
+ if(verbose){System.err.println(Arrays.toString(a));}
+
+
+// {format, compression, type, interleaved, quality, length}
+ name=fname;
+ format=a[0];
+ compression=a[1];
+ type=a[2];
+ interleaved=a[3];
+ asciiOffset=a[4];
+ length=a[5];
+ mode=mode_;
+
+ overwrite=overwrite_;
+ append=append_;
+ allowSubprocess=allowSubprocess_;
+ ordered=ordered_;
+
+// interleaved=interleaved_;
+// maxReads=write() ? -1 : maxReads_;
+
+ // Sanity checks (only active with -ea). An unknown format is tolerated when a file read was forced.
+ assert(forceFileRead || !unknownFormat()) : "Unknown file format for "+fname+"\n"+
+ mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
+ assert(!unknownCompression()) : "Unknown compression for "+fname+"\n"+
+ mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
+ assert(!unknownType()) : "Unknown stream type for "+fname+"\n"+
+ mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
+ assert(!unknownMode()) : "Unknown I/O mode for "+fname+"\n"+
+ mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Comma-delimited summary: name, then format, compression, type and mode each written as
+  * {@code number(label)}, followed by the ow/app/sub/ordered flags as t or f.
+  */
+ @Override
+ public String toString(){
+     final StringBuilder text=new StringBuilder();
+     text.append(name).append(',');
+     text.append(format).append('(').append(FORMAT_ARRAY[format]).append(')').append(',');
+     text.append(compression).append('(').append(COMPRESSION_ARRAY[compression]).append(')').append(',');
+     text.append(type).append('(').append(TYPE_ARRAY[type]).append(')').append(',');
+//     text.append("ascii"+asciiOffset).append(',');
+     text.append(mode).append('(').append(MODE_ARRAY[mode]).append(')').append(',');
+     text.append("ow=").append(overwrite ? "t" : "f").append(',');
+     text.append("app=").append(append ? "t" : "f").append(',');
+     text.append("sub=").append(allowSubprocess ? "t" : "f").append(',');
+     text.append("ordered=").append(ordered ? "t" : "f");
+     return text.toString();
+ }
+
+ /**
+  * Formats the first four entries of a testFormat result vector
+  * ({format, compression, type, interleaving}) as comma-delimited {@code number(label)} pairs.
+  *
+  * @param vector Array with at least four elements.
+  * @return The formatted description.
+  */
+ public static String toString(int[] vector){
+     final int fmt=vector[0];
+     final int comp=vector[1];
+     final int streamType=vector[2];
+     final int pairing=vector[3];
+     final StringBuilder text=new StringBuilder();
+     text.append(fmt).append('(').append(FORMAT_ARRAY[fmt]).append(')').append(',');
+     text.append(comp).append('(').append(COMPRESSION_ARRAY[comp]).append(')').append(',');
+     text.append(streamType).append('(').append(TYPE_ARRAY[streamType]).append(')').append(',');
+     text.append(pairing).append('(').append(INTERLEAVING_ARRAY[pairing]).append(')');
+     return text.toString();
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Determines format, compression and stream type from a file name and, optionally, from the file's first lines.
+ * Returns an int array: {format, compression, type, interleaved, quality, length}
+ * @param fname File or stream name; null or empty yields type STDIO with everything else at its default.
+ * @param allowFileRead Permit opening the file when the name alone does not determine the format.
+ * @param forceFileRead Inspect the file contents even when the name determines a format (never for BAM).
+ */
+ public static final int[] testFormat(String fname, boolean allowFileRead, boolean forceFileRead){
+ if(verbose){System.err.println("testFormat("+fname+", "+allowFileRead+", "+forceFileRead+")");}
+ final int[] r=new int[] {UNKNOWN, RAW, FILE, UNKNOWN, -1, -1};
+ if(fname==null || fname.length()<1){
+ r[2]=STDIO;
+ return r;
+ }
+ String slc=fname.trim().toLowerCase();
+ // NOTE(review): this only runs when there is NO '/', in which case substring(0) is a no-op;
+ // the condition looks inverted (directory prefixes are never stripped). Confirm before changing:
+ // the "/dev/null" comparison below relies on the full string.
+ if(slc.indexOf('/')<0){slc=slc.substring(slc.lastIndexOf('/')+1);}
+ // NOTE(review): a dot-less name gets a leading '.', so a bare "stdin"/"stdout" becomes ".stdin"/".stdout"
+ // and "/dev/null" becomes "./dev/null"; none of those can match the stdio/devnull tests further down. Confirm intent.
+ if(slc.indexOf('.')<0){slc="."+slc;}
+ String comp=ReadWrite.compressionType(slc);
+ String ext=ReadWrite.rawExtension(slc);
+
+ // Map the raw (uncompressed) extension to a format constant.
+ if(ext==null){}
+ else if(ext.equals("fq") || ext.equals("fastq")){r[0]=FASTQ;}
+ else if(isFasta(ext)){r[0]=FASTA;}
+ else if(/*ext.equals("txt") || */ext.equals("bread")){r[0]=BREAD;}
+ else if(ext.equals("sam")){r[0]=SAM;}
+ else if(ext.equals("csfasta")){r[0]=CSFASTA;}
+ else if(ext.equals("qual")){r[0]=QUAL;}
+ else if(ext.equals("bam")){r[0]=BAM;}
+ else if(ext.equals("sites") || ext.equals("sitesonly")){r[0]=SITES;}
+ else if(ext.equals("info") || ext.equals("attachment")){r[0]=ATTACHMENT;}
+ else if(ext.equals("scarf")){r[0]=SCARF;}
+ else if(ext.equals("phylip")){r[0]=PHYLIP;}
+ else if(ext.equals("header")){r[0]=HEADER;}
+
+ // The compression name is looked up in COMPRESSION_ARRAY; its index is the compression constant.
+ if(comp!=null){
+ r[1]=Gene.find3(comp, COMPRESSION_ARRAY);
+ assert(r[1]>0) : "Unhandled compression type: "+comp;
+ }
+
+ // Cheap prefilter on "st" before the string comparisons for stdin/stdout names.
+ if(slc.length()>2 && slc.charAt(0)=='s' && slc.charAt(1)=='t'){
+ if(slc.equals("stdin") || slc.startsWith("stdin.") || slc.equals("standardin")){r[2]=STDIO;}
+ else if(slc.equals("stdout") || slc.startsWith("stdout.") || slc.equals("standardout")){r[2]=STDIO;}
+ }else if("/dev/null".equalsIgnoreCase(slc)){
+ r[2]=DEVNULL;
+ }
+
+ if(verbose){System.err.println("Before reading: \t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM));}
+
+ // Look inside the file when the name was inconclusive, or when forced (BAM is never sniffed).
+ if(r[0]==UNKNOWN || (forceFileRead && r[0]!=BAM)){
+ File f=(allowFileRead && r[2]==FILE ? new File(fname) : null);
+ if(f!=null && f.exists() && !f.isDirectory()){
+
+// int b1=-1, b2=-1, b3=-1, len=0;
+// try {
+// InputStream is=ReadWrite.getInputStream(fname, false, r[1]==BZ2);
+// int x=is.read();
+// b1=x;
+// while(x>=0 && x!='\n' && x!='\r'){x=is.read();}//read first line
+// while(x=='\n' || x=='\r'){x=is.read();}//finish first line
+// b2=x;//first char of second line
+// while(x>=0 && x!='\n' && x!='\r'){x=is.read(); len++;}
+// while(x=='\n' || x=='\r'){x=is.read();}
+// b3=x;
+// is.close();
+// ReadWrite.finishReading(is, fname, true);
+// } catch (Exception e) {
+// // TODO Auto-generated catch block
+// e.printStackTrace();
+// }
+// if(b1=='>'){r[0]=FA;}
+// else if(b1=='@'){
+// if(b3=='+'){r[0]=FQ;}
+// else if(b2<0 || b2=='@'){r[0]=SAM;}
+// else{if(!forceFileRead){r[0]=FQ;}} //probably a truncated fastq file?
+// }
+// else{if(!forceFileRead){r[0]=BREAD;}} //or possibly scarf
+//
+// if(r[0]==FQ){r[4]=len;}
+//// System.err.println((char)b1+", "+(char)b2+", "+(char)b3+", ");
+
+
+// //a: {quality, interleaved, length, format}
+// //r: {format, compression, type, interleaved, quality, length}
+ try {
+ int[] a=testInterleavedAndQuality(fname, false);
+ if(a!=null){
+ final int aq=a[0], ai=a[1], al=a[2], af=a[3];
+ if(aq>-1){r[4]=aq;}
+ if(ai!=UNKNOWN){r[3]=ai;}
+ // A content guess of BREAD does not displace a name-derived HEADER or TEXT format.
+ if(af!=UNKNOWN && (af!=BREAD || (r[0]!=HEADER && r[0]!=TEXT))){r[0]=af;}
+ if(al>1 && r[5]==-1){r[5]=al;}
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ // Detection failures are logged and the name-based result is kept.
+ e.printStackTrace();
+ }
+
+ if(verbose){System.err.println("After reading: \t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM));}
+ }else{
+ // No readable file: recognize the synthetic source names (compared case-sensitively against fname).
+ if(fname.equals("sequential")){r[0]=SEQUENTIAL;}
+ else if(fname.equals("random")){r[0]=RANDOM;}
+ else if(fname.equals("sitesonly")){r[0]=SITES;}
+ }
+ }
+
+
+ // A real file that happens to be named like a stdio stream is treated as a file.
+ if(r[2]==STDIO && allowFileRead){
+ File f=new File(fname);
+ if(f.exists() && !f.isDirectory()){r[2]=FILE;}
+ }
+ if(verbose){System.err.println("testFormat return:\t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM)+", "+r[4]);}
+ return r;
+ }
+
+ /** Returns true if testFormat, judging by the name alone (no file read), classifies fname as fasta. */
+ public static boolean hasFastaExtension(String fname){
+     final int detected=testFormat(fname, false, false)[0];
+     return detected==FA;
+ }
+
+ /** Returns true if testFormat, judging by the name alone (no file read), classifies fname as fastq. */
+ public static boolean hasFastqExtension(String fname){
+     final int detected=testFormat(fname, false, false)[0];
+     return detected==FQ;
+ }
+
+ /** Returns true if testFormat, judging by the name alone (no file read), classifies fname as sam or bam. */
+ public static boolean hasSamOrBamExtension(String fname){
+     final int detected=testFormat(fname, false, false)[0];
+     if(detected==SAM){return true;}
+     return detected==BAM;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ??????? ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Reads up to the first eight lines of a file and classifies them.
+  *
+  * @param fname File to read
+  * @param forceFastq Run the fastq quality/interleaving tests even if the lines do not look like fastq
+  * @return {quality, interleaved, length, format}
+  */
+ public static int[] testInterleavedAndQuality(String fname, boolean forceFastq){
+     return testInterleavedAndQuality(getFirstOctet(fname), fname, forceFastq);
+ }
+
+ /**
+  * Returns up to the first eight lines of a file.
+  * An IOException while reading is logged and whatever was read so far is returned.
+  *
+  * @param fname File to read.
+  * @return The lines read, or null if fname is null or names stdin ("stdin" or "stdin.*", any case).
+  */
+ public static ArrayList<String> getFirstOctet(String fname){
+     if(fname==null){return null;}
+     final String lower=fname.toLowerCase();
+     if(fname.equalsIgnoreCase("stdin") || lower.startsWith("stdin.")){return null;}
+
+     final ArrayList<String> lines=new ArrayList<String>(8);
+
+     final InputStream in=ReadWrite.getInputStream(fname, false, lower.endsWith(".bz2"));
+     final BufferedReader reader=new BufferedReader(new InputStreamReader(in));
+     try {
+         String line=reader.readLine();
+         while(line!=null && lines.size()<8){
+             lines.add(line);
+             line=reader.readLine();
+         }
+     } catch (IOException e) {
+         // TODO Auto-generated catch block
+         e.printStackTrace();
+     }
+     ReadWrite.finishReading(in, fname, true, reader);
+
+     return lines;
+ }
+
+ /**
+ * @param oct First 8 lines of file
+ * @param fname File to read
+ * @return {quality, interleaved, length, format}
+ */
+ public static int[] testInterleavedAndQuality(final ArrayList<String> oct, String fname, boolean forceFastq){
+ int len=-1, format=UNKNOWN;
+ byte q=-1, i=UNKNOWN;
+ if(oct==null || oct.size()<1){
+ return new int[] {q, i, len, format};
+ }
+ {
+ String s1=oct.size()>0 ? oct.get(0) : "";
+ String s2=oct.size()>1 ? oct.get(1) : "";
+ String s3=oct.size()>2 ? oct.get(2) : "";
+ int b1=(s1.length()>0 ? s1.charAt(0) : -1);
+ int b2=(s2.length()>0 ? s2.charAt(0) : -1);
+ int b3=(s3.length()>0 ? s3.charAt(0) : -1);
+
+ if(b1=='>'){format=FA;}
+ else if(b1=='@'){
+ if(b3=='+'){format=FQ;}
+ else if(b2<0 || b2=='@'){format=SAM;}
+ else{format=UNKNOWN;} //probably a truncated fastq file?
+ }
+ else{format=BREAD;} //or possibly scarf
+
+ if(format!=FQ){len=-1;}
+ }
+
+ if(format==FQ || forceFastq){
+ boolean old=stream.FASTQ.DETECT_QUALITY;
+ stream.FASTQ.DETECT_QUALITY=true;
+ q=stream.FASTQ.testQuality(oct);
+ i=(byte)(stream.FASTQ.testInterleaved(oct, fname, false) ? INTERLEAVED : SINGLE);
+ // stream.FASTQ.DETECT_QUALITY=old;
+ {
+ String a=oct.size()>1 ? oct.get(1) : null;
+ String b=oct.size()>5 ? oct.get(5) : null;
+ if(a!=null){len=Tools.max(a.length(), len);}
+ if(b!=null){len=Tools.max(b.length(), len);}
+ if(len<2){len=-1;}
+ }
+ }
+
+ int[] r=new int[] {q, i, len, format};
+ if(verbose){System.err.println(Arrays.toString(r));}
+ return r;
+ }
+
+ /**
+  * Tests whether an extension (lower case, without the dot) denotes a fasta file.
+  *
+  * @param ext Extension to test; may be null.
+  * @return true for fa, fasta, fas, fna, ffn, frn, seq, fsa or faa.
+  */
+ public static boolean isFasta(String ext){
+     if(ext==null){return false;}
+     final String[] fastaExtensions={"fa", "fasta", "fas", "fna", "ffn", "frn", "seq", "fsa", "faa"};
+     for(String candidate : fastaExtensions){
+         if(candidate.equals(ext)){return true;}
+     }
+     return false;
+ }
+
+ /**
+  * Tests whether a name refers to a standard stream rather than a file.
+  * An existing file with that name always wins.
+  *
+  * @param s Name to test; may be null.
+  * @return true if the part before the first '.' is stdin, stdout or stderr (any case) and no such file exists.
+  */
+ public static boolean isStdio(String s){
+     if(s==null || new File(s).exists()){return false;}
+     final int dot=s.indexOf('.');
+     final String base=(dot<0 ? s : s.substring(0, dot));
+     return base.equalsIgnoreCase("stdin") || base.equalsIgnoreCase("stdout") || base.equalsIgnoreCase("stderr");
+ }
+
+ /** Returns true if ext (lower case, without the dot) is "fq" or "fastq"; null yields false. */
+ public static boolean isFastq(String ext){
+     return "fq".equals(ext) || "fastq".equals(ext);
+ }
+
+ /** Returns true if ext (lower case, without the dot) is "sam" or "bam"; null yields false. */
+ public static boolean isSamOrBam(String ext){
+     return "sam".equals(ext) || "bam".equals(ext);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Getters ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Normalized name given to the constructor (trimmed, forward slashes). */
+ public final String name(){return name;}
+ /** One of the format constants (FASTA, FASTQ, ...), or UNKNOWN. */
+ public final int format(){return format;}
+ /** One of the compression constants (RAW, GZIP, ...). */
+ public final int compression(){return compression;}
+ /** One of FILE, STDIO or DEVNULL. */
+ public final int type(){return type;}
+ /** READ or WRITE. */
+ public final int mode(){return mode;}
+
+ /** True if a name is present. */
+ public final boolean hasName(){return name!=null;}
+ /**
+  * Tests whether this output may be written. Stdio and devnull are always writable;
+  * a nonexistent file is writable; an existing file must be writable by the OS
+  * and either overwrite or append must be set.
+  */
+ public final boolean canWrite(){
+     assert(write());
+     if(stdio() || devnull()){return true;}
+     assert(hasName());
+     final File target=new File(name);
+     final boolean writable;
+     if(!target.exists()){
+         writable=true;
+     }else if(!target.canWrite()){
+         writable=false;
+     }else{
+         writable=(overwrite() || append());
+     }
+     return writable;
+ }
+ /** Tests whether this input may be read: always true for stdio, otherwise File.canRead() on the name. */
+ public final boolean canRead(){
+     assert(read());
+     if(stdio()){return true;}
+     assert(hasName());
+     return new File(name).canRead();
+ }
+
+ /** True if any of format, compression, type or mode is unknown. */
+ public final boolean unknownField(){return unknownFormat() || unknownCompression() || unknownType() || unknownMode();}
+
+ /* Format predicates: each compares the stored format against one constant. */
+ public final boolean unknownFormat(){return format<=UNKNOWN;}
+ public final boolean fasta(){return format==FASTA;}
+ public final boolean fastq(){return format==FASTQ;}
+ public final boolean bread(){return format==BREAD;}
+ public final boolean sam(){return format==SAM;}
+ public final boolean samOrBam(){return format==SAM || format==BAM;}
+ public final boolean csfasta(){return format==CSFASTA;}
+ public final boolean qual(){return format==QUAL;}
+ public final boolean sequential(){return format==SEQUENTIAL;}
+ public final boolean random(){return format==RANDOM;}
+ public final boolean sites(){return format==SITES;}
+ public final boolean attachment(){return format==ATTACHMENT;}
+ public final boolean header(){return format==HEADER;}
+ public final boolean bam(){return format==BAM;}
+ public final boolean scarf(){return format==SCARF;}
+ public final boolean text(){return format==TEXT;}
+
+ /* Compression predicates. */
+ public final boolean unknownCompression(){return compression<=UNKNOWN;}
+ public final boolean raw(){return compression==RAW;}
+ public final boolean gzip(){return compression==GZIP;}
+ public final boolean zip(){return compression==ZIP;}
+ public final boolean bz2(){return compression==BZ2;}
+ public final boolean xz(){return compression==XZ;}
+ public final boolean sevenz(){return compression==SEVENZ;}
+ public final boolean dsrc(){return compression==DSRC;}
+
+ /* Stream-type predicates. */
+ public final boolean unknownType(){return type<=UNKNOWN;}
+ public final boolean file(){return type==FILE;}
+ public final boolean stdio(){return type==STDIO;}
+ public final boolean devnull(){return type==DEVNULL;}
+
+ /* Mode predicates. */
+ public final boolean unknownMode(){return mode<=UNKNOWN;}
+ public final boolean read(){return mode==READ;}
+ public final boolean write(){return mode==WRITE;}
+
+ /* Flags stored by the constructor. */
+ public final boolean overwrite(){return overwrite;}
+ public final boolean append(){return append;}
+ public final boolean allowSubprocess(){return allowSubprocess;}
+ public final boolean ordered(){return ordered;}
+
+ /**
+  * Tests whether the underlying data is present.
+  * For non-file types the result is read(). For files, the result is true only when the
+  * file's length exceeds 10 bytes; a missing file returns false immediately unless the
+  * compression is gzip, in which case the length test decides.
+  */
+ public final boolean exists(){
+     if(!file()){return read();}
+     final File target=new File(name);
+     final boolean missing=!target.exists();
+     if(missing && !gzip()){return false;}
+     return target.length()>10;
+ }
+
+// public final boolean interleaved(){return interleaved;}
+// public final long maxReads(){return maxReads;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Normalized file or stream name. */
+ private final String name;
+ /** Format constant (FASTA, FASTQ, ...). */
+ private final int format;
+ /** Quality value reported by testFormat (element 4), or -1 when not determined. No getter is visible in this class. */
+ private final int asciiOffset;
+ /** Compression constant (RAW, GZIP, ...). */
+ private final int compression;
+ /** Stream type constant (FILE, STDIO, DEVNULL). */
+ private final int type;
+ /** READ or WRITE. */
+ private final int mode;
+ /** Interleaving reported by testFormat (element 3): UNKNOWN, SINGLE or INTERLEAVED. No getter is visible in this class. */
+ private final int interleaved;
+ /** Read length reported by testFormat (element 5), or -1 when not determined. No getter is visible in this class. */
+ private final int length;
+
+ /** Existing output may be replaced; forced to false when append is also requested. */
+ private final boolean overwrite;
+ /** Existing output may be appended to. */
+ private final boolean append;
+ /** Stored from the factory argument; its effect is decided by code outside this class. */
+ private final boolean allowSubprocess;
+ /** Stored from the factory argument; its effect is decided by code outside this class. */
+ private final boolean ordered;
+
+// private final long maxReads;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Statics ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** When true, the factories, constructor and testFormat print debug traces to stderr. */
+ public static boolean verbose=false;
+ /** When true, the constructor warns on stderr when it falls back to a non-TEXT default format. */
+ public static boolean PRINT_WARNING=true;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Constants ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Shared "not determined" value for format, compression, type, mode and interleaving. */
+ public static final int UNKNOWN=0;
+
+ /* Format */
+ /* Each value is also the index of its label in FORMAT_ARRAY. */
+
+ public static final int FA=1, FASTA=1;
+ public static final int FQ=2, FASTQ=2;
+ public static final int BREAD=3;
+ public static final int SAM=4;
+ public static final int CSFASTA=5;
+ public static final int QUAL=6;
+ public static final int SEQUENTIAL=7;
+ public static final int RANDOM=8;
+ public static final int SITES=9;
+ public static final int ATTACHMENT=10;
+ public static final int BAM=11;
+ public static final int SCARF=12;
+ public static final int TEXT=13, TXT=13;
+ public static final int PHYLIP=14;
+ public static final int HEADER=15;
+
+ /** Labels for the format constants, indexed by constant value. */
+ private static final String[] FORMAT_ARRAY=new String[] {
+ "unknown", "fasta", "fastq", "bread", "sam", "csfasta",
+ "qual", "sequential", "random", "sites", "attachment",
+ "bam", "scarf", "text", "phylip", "header"
+ };
+
+ /** Known format and compression extensions. Not referenced elsewhere in this class. */
+ public static final String[] EXTENSION_LIST=new String[] {
+ "fq", "fastq", "fa", "fasta", "fas", "fna",
+ "ffn", "frn", "seq", "fsa", "faa",
+ "bread", "sam", "csfasta", "qual", "bam",
+ "scarf", "phylip", "txt",
+ "gz", "gzip", "bz2", "zip", "xz", "dsrc", "header"
+ };
+
+ /* Compression */
+ /* Each value is also the index of its label in COMPRESSION_ARRAY; testFormat relies on this when it looks a name up there. */
+
+ public static final int RAW=1;
+ public static final int GZ=2, GZIP=2;
+ public static final int ZIP=3;
+ public static final int BZ2=4;
+ public static final int XZ=5;
+ // NOTE(review): lower-case name, unlike the sibling constants; corresponds to the "c4" label at index 6.
+ public static final int c4=6;
+ public static final int SEVENZ=7;
+ public static final int DSRC=8;
+
+ /** Labels for the compression constants, indexed by constant value. */
+ private static final String[] COMPRESSION_ARRAY=new String[] {
+ "unknown", "raw", "gz", "zip", "bz2", "xz",
+ "c4", "7z", "dsrc"
+ };
+
+ /* Type */
+
+ public static final int FILE=1;
+ // STDIN and STDOUT are aliases of STDIO; the stream direction comes from the mode.
+ public static final int STDIO=2, STDIN=2, STDOUT=2;
+ public static final int DEVNULL=3;
+// public static final int NULL=4;
+
+ /** Labels for the type constants, indexed by constant value. */
+ private static final String[] TYPE_ARRAY=new String[] {
+ "unknown", "file", "stdio", "devnull"
+ };
+
+ /* Mode */
+
+ public static final int READ=1, WRITE=2;
+
+ /** Labels for the mode constants, indexed by constant value. */
+ private static final String[] MODE_ARRAY=new String[] {
+ "unknown", "read", "write"
+ };
+
+ /* Interleaving */
+
+ public static final int SINGLE=1, INTERLEAVED=2;
+
+ /** Labels for the interleaving constants, indexed by constant value. */
+ private static final String[] INTERLEAVING_ARRAY=new String[] {
+ "unknown", "single-ended", "interleaved"
+ };
+
+}
diff --git a/current/fileIO/FindFiles.java b/current/fileIO/FindFiles.java
new file mode 100755
index 0000000..e22e70a
--- /dev/null
+++ b/current/fileIO/FindFiles.java
@@ -0,0 +1,113 @@
+package fileIO;
+
+import java.io.File;
+import java.util.ArrayList;
+
+
+/**
+ * Recursively collects the absolute paths of files under a root whose names match an
+ * optional prefix, suffix and middle substring. Matching is done on the lower-cased
+ * file name, so it is case-insensitive. A null, "#" or "*" pattern matches everything.
+ */
+public class FindFiles {
+
+    /**
+     * Command-line entry point.
+     * Usage: FindFiles &lt;root&gt; &lt;prefix&gt; &lt;suffix&gt; [middle]
+     * The literal string "null" disables the suffix or middle filter.
+     * Prints the matches one per line, then a blank line, then the same matches
+     * space-separated on a single line.
+     *
+     * @param args root, prefix, suffix and optional middle.
+     */
+    public static void main(String[] args){
+
+        if(args==null || args.length<3){
+            //Previously this failed with an ArrayIndexOutOfBoundsException.
+            System.err.println("Usage: FindFiles <root> <prefix> <suffix> [middle]   (use # as a wildcard)");
+            System.exit(1);
+        }
+
+        String root=args[0];
+//        if(root.equals(".")){root=null;}
+        String prefix=args[1];
+        String suffix=(args[2].equals("null") ? null : args[2]);
+        String middle=null;
+
+        if(args.length>3){
+            middle=(args[3].equals("null") ? null : args[3]);
+        }
+
+        ArrayList<String> results=findFiles(root, prefix, suffix, middle);
+
+        //One per line
+        for(String s : results){
+            System.out.println(s);
+        }
+
+        //Then all on one line
+        System.out.println();
+        for(String s : results){
+            System.out.print(s+" ");
+        }
+    }
+
+
+    /**
+     * @param pre Required name prefix, or null/"#" for any.
+     * @param suf Required name suffix, or null/"#" for any.
+     * @param mid Required name substring, or null/"#" for any.
+     */
+    public FindFiles(String pre, String suf, String mid){
+        assert(!"*".equals(pre)) : "Use # instead of *, which has problems from the command line";
+        assert(!"*".equals(suf)) : "Use # instead of *, which has problems from the command line";
+        prefix=((pre==null || pre.equals("*") || pre.equals("#")) ? null : pre.toLowerCase());
+        suffix=((suf==null || suf.equals("*") || suf.equals("#")) ? null : suf.toLowerCase());
+        middle=((mid==null || mid.equals("*") || mid.equals("#")) ? null : mid.toLowerCase());
+    }
+
+    /** Finds files under root matching prefix and suffix. */
+    public static ArrayList<String> findFiles(String root, String prefix, String suffix){
+        return findFiles(root, prefix, suffix, null);
+    }
+
+    /** Finds files under root matching prefix, suffix and middle. */
+    public static ArrayList<String> findFiles(String root, String prefix, String suffix, String mid){
+        FindFiles ff=new FindFiles(prefix, suffix, mid);
+        return ff.findFiles(root);
+    }
+
+    /**
+     * Searches the given path (the current directory if null) and returns the accumulated results.
+     * Results accumulate across calls on the same instance.
+     */
+    public ArrayList<String> findFiles(String path){
+        findFiles(new File(path==null ? "." : path));
+        return results;
+    }
+
+    /** Recurses into directories and tests every non-directory entry with consider(). */
+    public ArrayList<String> findFiles(File path){
+
+        if(path.isDirectory()){
+            File[] array=path.listFiles();
+            if(array==null){System.err.println("null contents for "+path.getAbsolutePath());}
+            else{for(File f : array){findFiles(f);}}
+        }else{
+            consider(path);
+        }
+        return results;
+    }
+
+    /** Adds the file's absolute path to the results if its lower-cased name passes all filters. */
+    public void consider(File in){
+//        System.out.println("Considering "+in.getAbsolutePath()+" versus '"+prefix+"' '"+suffix+"'");
+        if(!in.exists()){return;}
+        assert(in.exists()) : in;
+        assert(in.isFile());
+        String abs=in.getAbsolutePath();
+//        System.out.println("Considering "+abs);
+
+        //Reduce to the bare, lower-cased file name
+        String abs2=abs.toLowerCase();
+        int slashLoc=abs2.lastIndexOf(slash);
+        if(slashLoc>-1){
+            abs2=abs2.substring(slashLoc+1);
+        }
+
+        if(prefix!=null && !abs2.startsWith(prefix)){return;}
+        if(suffix!=null && !abs2.endsWith(suffix)){return;}
+        if(middle!=null && !abs2.contains(middle)){return;}
+
+        results.add(abs);
+    }
+
+
+    /** Absolute paths of all matches found so far by this instance. */
+    public ArrayList<String> results=new ArrayList<String>();
+    /** Lower-cased filters; null means "match anything". */
+    public String prefix;
+    public String suffix;
+    public String middle;
+    /** Platform file separator. */
+    public static final char slash=System.getProperty("file.separator").charAt(0);
+
+}
diff --git a/current/fileIO/GenericTextFile.java b/current/fileIO/GenericTextFile.java
new file mode 100755
index 0000000..0b7cb2b
--- /dev/null
+++ b/current/fileIO/GenericTextFile.java
@@ -0,0 +1,36 @@
+package fileIO;
+
+import java.util.ArrayList;
+
+/**
+ * A TextFile that hands back every line it reads, without filtering.
+ */
+public class GenericTextFile extends TextFile {
+
+    /**
+     * @param name Name of the file to read; forwarded to the TextFile constructor
+     * with both of its boolean options set to false.
+     */
+    public GenericTextFile(String name) {
+        super(name, false, false);
+    }
+
+    /**
+     * Reads every remaining line via nextLine().
+     * @return The lines in file order; an empty array when nothing is left.
+     */
+    public String[] toLines(){
+        final ArrayList<String> lines=new ArrayList<String>(4096);
+        String line=nextLine();
+        while(line!=null){
+            lines.add(line);
+            line=nextLine();
+        }
+        return lines.toArray(new String[lines.size()]);
+    }
+
+    /**
+     * @return The next line from readLine(), or null once readLine() returns null.
+     */
+    public String nextLine(){
+        return readLine();
+    }
+
+}
diff --git a/current/fileIO/LoadThread.java b/current/fileIO/LoadThread.java
new file mode 100755
index 0000000..897ef47
--- /dev/null
+++ b/current/fileIO/LoadThread.java
@@ -0,0 +1,139 @@
+package fileIO;
+
+import java.util.Arrays;
+
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 2, 2013
+ *
+ */
+public class LoadThread<X> extends Thread{
+
+ /**
+  * Creates a loader for the given file and starts it immediately.
+  *
+  * @param fname File to load.
+  * @param c Class of the object expected from the file.
+  * @return The started thread; its result appears in {@code output} once it finishes.
+  */
+ public static <Y> LoadThread<Y> load(String fname, Class<Y> c){
+     final LoadThread<Y> loader=new LoadThread<Y>(fname, c);
+     loader.start();
+     return loader;
+ }
+
+ /**
+  * Records the target file and class, and registers this loader as active/waiting
+  * in the shared counters before it is started.
+  */
+ private LoadThread(String fname_, Class<X> c_){
+     this.c=c_;
+     this.fname=fname_;
+     addThread(1);
+ }
+
+ /**
+  * Waits for a free running slot, loads the file into {@code output}, then releases the slot
+  * and notifies anything waiting on this thread object.
+  * The slot is released in a finally block so that a failed load cannot leave the shared
+  * counters permanently incremented (which would block other loaders and
+  * waitForReadingToFinish()).
+  */
+ @Override
+ public void run(){
+     addRunningThread(1);
+     try{
+         output=ReadWrite.read(c, fname, false);
+     }finally{
+         addRunningThread(-1);
+         synchronized(this){this.notify();}
+     }
+ }
+
+
+ /**
+ * Registers new loaders (x&gt;0: adds x to both the active and the waiting counters)
+ * or, for x&lt;0, delegates to addRunningThread(x). The constructor calls this with 1.
+ * @param x Nonzero change in thread count.
+ * @return The active count after the change.
+ */
+ private static final int addThread(int x){
+ // Only used by the consistency assertion below.
+ final int lim=(Shared.LOW_MEMORY ? 1 : LIMIT);
+ synchronized(activeThreads){
+ assert(x!=0);
+ if(x>0){
+ activeThreads[0]+=x;
+ activeThreads[1]+=x;
+ }else{
+ addRunningThread(x);
+ }
+ // Invariant: active == waiting + running, nothing negative, running never above the limit.
+ assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 &&
+ activeThreads[2]>=0 && activeThreads[2]<=lim) : Arrays.toString(activeThreads);
+
+ return activeThreads[0];
+ }
+ }
+
+ /**
+ * Moves loaders between the waiting and running states.
+ * For x&gt;0 the caller blocks until the running count is below the limit
+ * (1 when Shared.LOW_MEMORY, otherwise LIMIT), then x is moved from waiting to running.
+ * For x&lt;0 the active and running counts are both reduced by |x|.
+ * @param x Nonzero change in running thread count.
+ * @return The running count after the change.
+ */
+ private static final int addRunningThread(int x){
+ final int lim=(Shared.LOW_MEMORY ? 1 : LIMIT);
+ synchronized(activeThreads){
+ assert(x!=0);
+ if(x>0){
+ assert(activeThreads[1]>=x);
+ // Block until a running slot is free. An interrupt is logged and the wait resumes.
+ while(activeThreads[2]>=lim){
+ try {
+ activeThreads.wait();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ activeThreads[1]-=x; //Remove from waiting
+ }else{
+ activeThreads[0]+=x; //Remove from active
+ }
+ activeThreads[2]+=x; //Change number running
+
+ assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 &&
+ activeThreads[2]>=0 && activeThreads[2]<=lim) : Arrays.toString(activeThreads);
+
+ // Wake one waiter when nothing is running, or when a slot is free and a loader is waiting.
+ // NOTE(review): notify() wakes a single arbitrary waiter, and waitForReadingToFinish() waits on the
+ // same monitor; that method re-notifies after its timed wait, which appears to be what keeps things moving - confirm.
+ if(activeThreads[2]==0 || (activeThreads[2]<lim && activeThreads[1]>0)){activeThreads.notify();}
+// System.err.println(activeThreads[2]);
+// try {
+// activeThreads.wait(5000);
+// } catch (InterruptedException e) {
+// // TODO Auto-generated catch block
+// e.printStackTrace();
+// }
+ return activeThreads[2];
+ }
+ }
+
+ /**
+ * @return The number of active (waiting plus running) loaders, read under the counter lock.
+ */
+ public static final int countActiveThreads(){
+ // Only used by the consistency assertion below.
+ final int lim=(Shared.LOW_MEMORY ? 1 : LIMIT);
+ synchronized(activeThreads){
+ assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 &&
+ activeThreads[2]>=0 && activeThreads[2]<=lim) : Arrays.toString(activeThreads);
+ return activeThreads[0];
+ }
+ }
+
+ /**
+ * Blocks until the active count reaches zero, i.e. every registered loader has finished.
+ * Wakes at least every 8 seconds to re-check, and passes the wake-up on to one other
+ * waiter when a running slot is available.
+ */
+ public static final void waitForReadingToFinish(){
+ final int lim=(Shared.LOW_MEMORY ? 1 : LIMIT);
+ synchronized(activeThreads){
+ while(activeThreads[0]>0){
+ assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 &&
+ activeThreads[2]>=0 && activeThreads[2]<=lim) : Arrays.toString(activeThreads);
+ try {
+ // Timed wait: the loop re-checks the counter even if no notification arrives.
+ activeThreads.wait(8000);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ // The interrupt is logged and waiting continues.
+ e.printStackTrace();
+ }
+ // This thread may have consumed a notify() meant for a waiting loader; forward it.
+ if(activeThreads[2]==0 || (activeThreads[2]<lim && activeThreads[1]>0)){activeThreads.notify();}
+ }
+ }
+ }
+
+ /**
+  * Blocks until this loader has terminated, unless {@code output} is already set.
+  * Interrupts are logged and the join is retried.
+  */
+ public final void waitForThisToFinish(){
+     if(output!=null){return;}
+     while(this.getState()!=State.TERMINATED){
+         try {
+             this.join();
+         } catch (InterruptedException e) {
+             // TODO Auto-generated catch block
+             e.printStackTrace();
+         }
+     }
+ }
+
+ /** {active, waiting, running} <br>
+ * Active means running or waiting.
+ * The array object also serves as the lock and wait/notify monitor for the counters.
+ */
+ public static int[] activeThreads={0, 0, 0};
+
+ /** File to load. */
+ private final String fname;
+ /** Class passed to ReadWrite.read for the loaded object. */
+ private final Class<X> c;
+ /** Result of the load; null until run() has stored it. */
+ public X output=null;
+
+ /** Not referenced anywhere in this class. */
+ private static final int[] RUNNING=new int[1];
+ /** Maximum number of loaders running at once (unless Shared.LOW_MEMORY, which uses 1): Shared.threads() clamped to the range 1..8. */
+ public static int LIMIT=Tools.min(8, Tools.max(Shared.threads(), 1));
+
+}
diff --git a/current/fileIO/MatrixFile.java b/current/fileIO/MatrixFile.java
new file mode 100755
index 0000000..1a4b0c1
--- /dev/null
+++ b/current/fileIO/MatrixFile.java
@@ -0,0 +1,89 @@
+package fileIO;
+import dna.Matrix;
+
+
+
+ public class MatrixFile extends TextFile{
+
+     /** Prints every line of the file named by args[0] to standard output. */
+     public static void main(String[] args){
+         try {
+             MatrixFile mat=new MatrixFile(args[0]);
+             for(String s=mat.readLine(); s!=null; s=mat.readLine()){
+                 System.out.println(s);
+             }
+         } catch (Exception e) {
+             throw new RuntimeException(e);
+         }
+     }
+
+     /** Opens the named file by calling the TextFile constructor with (name, false, false). */
+     public MatrixFile(String name){super(name, false, false);}
+
+     /**
+      * Returns the next line beginning with '{' or '/', or null at end of file.
+      * Empty lines are skipped explicitly, because calling charAt(0) on them would throw.
+      */
+     public String nextLine(){
+         String line=readLine();
+         while(line!=null && (line.isEmpty() || (line.charAt(0)!='{' && line.charAt(0)!='/'))){
+             line=readLine();
+         }
+         return line;
+     }
+
+     /**
+      * Parses the next matrix: the //name, //size (length x width), //prefix and //count header lines,
+      * followed by 'length' rows of comma-separated floats enclosed in braces.
+      * @return The parsed Matrix, or null at end of file or at a line starting with "//end".
+      * @throws RuntimeException if the file ends partway through a matrix.
+      */
+     public Matrix nextMatrix(){
+         String line=nextLine();
+         if(line==null || line.startsWith("//end")){return null;}
+
+         assert(line.startsWith("//name: ")) : line;
+         final String name=line.replace("//name: ","").trim();
+
+         line=nextRequiredLine("size");
+         assert(line.startsWith("//size: ")) : line;
+         final String[] dims=line.replace("//size: ","").split("x");
+         final int length=Integer.parseInt(dims[0].trim());
+         final int width=Integer.parseInt(dims[1].trim());
+
+         line=nextRequiredLine("prefix");
+         assert(line.startsWith("//prefix: ")) : line;
+         final int prefix=Integer.parseInt(line.replace("//prefix: ","").trim());
+
+         line=nextRequiredLine("count");
+         assert(line.startsWith("//count: ")) : line;
+         Integer.parseInt(line.replace("//count: ","").trim()); //Validated only; the value is not passed to Matrix
+
+         final float[][] grid=new float[length][width];
+         for(int i=0; i<length; i++){
+             line=nextRequiredLine("row "+i);
+             while(line.startsWith("//")){line=nextRequiredLine("row "+i);}
+
+             assert(line.startsWith("{"));
+             if(line.endsWith(",")){line=line.substring(0, line.length()-1);}
+             assert(line.endsWith("}"));
+             final String[] split=line.replace("{", "").replace("}", "").replace(" ", "").split(",");
+             assert(split.length==width);
+             for(int j=0; j<split.length; j++){
+                 grid[i][j]=Float.parseFloat(split[j]);
+             }
+         }
+
+         return new Matrix(grid, prefix, name);
+     }
+
+     /** Like nextLine(), but throws with a descriptive message instead of returning null at end of file. */
+     private String nextRequiredLine(String what){
+         final String line=nextLine();
+         if(line==null){throw new RuntimeException("Unexpected end of matrix file while reading "+what);}
+         return line;
+     }
+
+     public static boolean verbose=false;
+
+ }
diff --git a/current/fileIO/PipeThread.java b/current/fileIO/PipeThread.java
new file mode 100755
index 0000000..d6cefd2
--- /dev/null
+++ b/current/fileIO/PipeThread.java
@@ -0,0 +1,88 @@
+package fileIO;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * Listens to an output stream and copies it to an input stream.
+ * For example, redirects the error stream of some process to stderr.
+ * @author Brian Bushnell
+ * @date Jan 22, 2013
+ *
+ */
+public class PipeThread extends Thread {
+
+// public PipeThread(InputStream is_){this(is_, System.err);}
+
+ public PipeThread(InputStream is_, OutputStream os_){
+ is=is_;
+ os=os_;
+ if(is==null){throw new RuntimeException("Null input stream.");}
+ if(os==null){throw new RuntimeException("Null output stream.");}
+// synchronized(list){list.add(this);}
+ }
+
+ public void run(){
+ final byte[] buf=new byte[8196];
+ try {
+ for(int len=is.read(buf); !finished && len>0; len=is.read(buf)){
+ os.write(buf, 0, len);
+ }
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ if(is!=System.in){
+ try {
+ is.close();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ if(os!=System.out && os!=System.err){
+ ReadWrite.close(os);
+ }
+
+ synchronized(this){
+ finished=true;
+ this.notify();
+ }
+ }
+
+ public boolean finished(){
+ synchronized(this){
+ return finished;
+ }
+ }
+
+ public void terminate(){
+ synchronized(this){
+ if(!finished){
+ finished=true;
+ interrupt();
+ }
+ }
+ }
+
+// public static void killList(){
+// System.err.println("Kill list.");
+// synchronized(list){
+// for(PipeThread pt : list){
+// if(!pt.finished){
+// pt.terminate();
+// }
+// }
+// }
+// }
+
+ public final InputStream is;
+ public final OutputStream os;
+ private volatile boolean finished=false;
+
+// private static ArrayList<PipeThread> list=new ArrayList<PipeThread>(8);
+
+}
diff --git a/current/fileIO/ReadWrite.java b/current/fileIO/ReadWrite.java
new file mode 100755
index 0000000..62c5c13
--- /dev/null
+++ b/current/fileIO/ReadWrite.java
@@ -0,0 +1,1613 @@
+package fileIO;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.io.Reader;
+import java.lang.ProcessBuilder.Redirect;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipOutputStream;
+
+import stream.ConcurrentReadStreamInterface;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.MultiCros;
+
+import align2.Shared;
+import align2.Tools;
+
+import dna.Data;
+
+public class ReadWrite {
+
+
+ public static void main(String[] args){ //Copies file args[0] to args[1] using copyFile
+     final File destination=new File(args[1]);
+     assert(!destination.exists()) : "Destination file already exists.";
+     copyFile(args[0], args[1]);
+ }
+
+ public static void writeStringInThread(CharSequence x, String fname){writeStringInThread(x, fname, false);} //append=false
+ /** Calls addThread(1), then starts a new thread running a WriteStringThread for x, fname and append. */
+ public static void writeStringInThread(CharSequence x, String fname, boolean append){
+     addThread(1);
+     final Thread writer=new Thread(new WriteStringThread(x, fname, append));
+     writer.start();
+ }
+ /** Calls addThread(1), then starts a new thread running a WriteObjectThread for x, fname and allowSubprocess. */
+ public static void writeObjectInThread(Object x, String fname, boolean allowSubprocess){
+     addThread(1);
+     final Thread writer=new Thread(new WriteObjectThread(x, fname, allowSubprocess));
+     writer.start();
+ }
+
+ private static class WriteStringThread implements Runnable{ //Writes a string via writeStringAsync, then calls addThread(-1)
+     private final CharSequence x;
+     private final String fname;
+     private final boolean append;
+     WriteStringThread(CharSequence x_, String fname_, boolean append_){
+         x=x_;
+         fname=fname_;
+         append=append_;
+     }
+     @Override
+     public void run() {
+         if(verbose){System.err.println("WriteStringThread.run() started for fname "+fname);}
+         addRunningThread(1);
+         try {
+             writeStringAsync(x, fname, append);
+         } finally {
+             addThread(-1); //Always balance the addThread(1) made in writeStringInThread, even if the write throws
+         }
+         if(verbose){System.err.println("WriteStringThread.run() finished for fname "+fname);}
+     }
+ }
+
+ /** Runnable that serializes an object with writeAsync and then calls addThread(-1). */
+ private static class WriteObjectThread implements Runnable{
+     private final Object x;
+     private final String fname;
+     private final boolean allowSubprocess;
+
+     WriteObjectThread(Object x_, String fname_, boolean allowSubprocess_){
+         x=x_;
+         fname=fname_;
+         allowSubprocess=allowSubprocess_;
+     }
+
+     @Override
+     public void run() {
+         if(verbose){System.err.println("WriteObjectThread.run() started for fname "+fname);}
+         addRunningThread(1);
+         try {
+             writeAsync(x, fname, allowSubprocess);
+         } finally {
+             addThread(-1); //Always balance the addThread(1) made in writeObjectInThread, even if the write throws
+         }
+         if(verbose){System.err.println("WriteObjectThread.run() finished for fname "+fname);}
+     }
+ }
+
+ /** Applies the given permissions to fname; returns false if the file does not exist or a setter throws, true otherwise. */
+ public static boolean setPermissions(String fname, boolean read, boolean write, boolean execute, boolean ownerOnly){
+     final File target=new File(fname);
+     if(!target.exists()){return false;}
+     try {
+         //The boolean results of the individual setters are not checked
+         target.setReadable(read, ownerOnly);
+         target.setWritable(write, ownerOnly);
+         target.setExecutable(execute, ownerOnly);
+         return true;
+     } catch (Exception e) {return false;}
+ }
+
+ public static void writeString(CharSequence x, String fname){writeString(x, fname, false);}
+
+ /**
+  * Writes x as text to fname while holding the diskSync lock.
+  * When fname resolves to System.out or System.err the stream is flushed but left open, matching close(OutputStream).
+  * @param x Text to write
+  * @param fname Destination, as understood by getOutputStream
+  * @param append Passed to getOutputStream
+  * @throws RuntimeException wrapping any IOException raised while finishing a zip entry.
+  */
+ public static void writeString(CharSequence x, String fname, boolean append){
+     if(verbose){System.err.println("writeString(x, "+fname+", "+append+")");}
+     final OutputStream os=getOutputStream(fname, append, true, false);
+     final boolean console=(os==System.out || os==System.err);
+     try {
+         synchronized(diskSync){
+             final PrintWriter out=new PrintWriter(os);
+             try {
+                 out.print(x);
+                 out.flush();
+                 if(os.getClass()==ZipOutputStream.class){
+                     final ZipOutputStream zos=(ZipOutputStream)os;
+                     zos.closeEntry();
+                     zos.finish();
+                 }
+             } finally {
+                 //Never close the shared console streams; anything else is closed even if finishing the zip entry fails
+                 if(!console){out.close();}
+             }
+         }
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     } catch (OutOfMemoryError e) {
+         KillSwitch.memKill(e);
+     }
+ }
+
+ public static void writeStringAsync(CharSequence x, String fname){writeStringAsync(x, fname, false);}
+
+ /**
+  * Writes x as text to fname while holding the diskSync lock; called from WriteStringThread.run().
+  * When fname resolves to System.out or System.err the stream is flushed but left open, matching close(OutputStream).
+  * @param x Text to write
+  * @param fname Destination, as understood by getOutputStream
+  * @param append Passed to getOutputStream
+  * @throws RuntimeException wrapping any IOException raised while finishing a zip entry.
+  */
+ public static void writeStringAsync(CharSequence x, String fname, boolean append){
+     if(verbose){System.err.println("writeStringAsync(x, "+fname+", "+append+")");}
+
+     final OutputStream os=getOutputStream(fname, append, true, false);
+     final boolean console=(os==System.out || os==System.err);
+     try {
+         synchronized(diskSync){
+             final PrintWriter out=new PrintWriter(os);
+             try {
+                 out.print(x);
+                 out.flush();
+                 if(os.getClass()==ZipOutputStream.class){
+                     final ZipOutputStream zos=(ZipOutputStream)os;
+                     zos.closeEntry();
+                     zos.finish();
+                 }
+             } finally {
+                 //Never close the shared console streams; anything else is closed even if finishing the zip entry fails
+                 if(!console){out.close();}
+             }
+         }
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     } catch (OutOfMemoryError e) {
+         KillSwitch.memKill(e);
+     }
+ }
+
+ /**
+  * Serializes x to fname with an ObjectOutputStream while holding the diskSync lock; the file is overwritten.
+  * @throws RuntimeException wrapping any IOException.
+  */
+ public static <X> void write(X x, String fname, boolean allowSubprocess){
+     if(verbose){System.err.println("write(x, "+fname+", "+allowSubprocess+")");}
+     final OutputStream destination=getOutputStream(fname, false, true, allowSubprocess);
+     try {
+         synchronized(diskSync){
+             final ObjectOutputStream serializer=new ObjectOutputStream(destination);
+             serializer.writeObject(x);
+             close(serializer);
+         }
+     } catch (FileNotFoundException e) {
+         throw new RuntimeException(e);
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     } catch (OutOfMemoryError e) {
+         KillSwitch.memKill(e);
+     }
+ }
+
+ /**
+  * Serializes x to fname with an ObjectOutputStream without taking the diskSync lock; the file is overwritten.
+  * @throws RuntimeException wrapping any IOException.
+  */
+ public static <X> void writeAsync(X x, String fname, boolean allowSubprocess){
+     if(verbose){System.err.println("writeAsync(x, "+fname+", "+allowSubprocess+")");}
+     final OutputStream destination=getOutputStream(fname, false, true, allowSubprocess);
+     try {
+         final ObjectOutputStream serializer=new ObjectOutputStream(destination);
+         serializer.writeObject(x);
+         close(serializer);
+     } catch (FileNotFoundException e) {
+         throw new RuntimeException(e);
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     } catch (OutOfMemoryError e) {
+         KillSwitch.memKill(e);
+     }
+ }
+
+ public static final boolean finishReading(InputStream is, String fname, boolean killProcess, Reader...ra){
+ if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+", "+ra.length+")");}
+ boolean error=false;
+ if(ra!=null){
+ for(Reader r : ra){
+ try {
+ r.close();
+ } catch (IOException e) {
+ error=true;
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ error|=finishReading(is, fname, killProcess);
+ if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+", "+ra.length+") returned "+error);}
+ return error;
+ }
+
+ public static final boolean finishReading(InputStream is, String fname, boolean killProcess){
+ if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+")");}
+ boolean error=false;
+ if(is!=System.in){
+ try {
+ is.close();
+ } catch (IOException e) {
+ error=true;
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if(killProcess && fname!=null && is!=System.in){error|=ReadWrite.killProcess(fname);}
+ if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+") returned "+error);}
+ return error;
+ }
+
+// public static final boolean finishWriting(PrintWriter writer, OutputStream outStream, String fname){
+// return finishWriting(writer, outStream, fname, fname!=null);
+// }
+
+ public static final boolean finishWriting(PrintWriter writer, OutputStream outStream, String fname, boolean killProcess){
+ if(verbose){System.err.println("finishWriting("+writer+", "+outStream+" , "+fname+", "+killProcess+")");}
+ boolean error=false;
+ if(writer!=null){writer.flush();}
+ close(outStream);
+ if(writer!=null && outStream!=System.out && outStream!=System.err){writer.close();}
+ if(killProcess && fname!=null && outStream!=System.err && outStream!=System.out){error|=ReadWrite.killProcess(fname);}
+ if(verbose){System.err.println("finishWriting("+writer+", "+outStream+" , "+fname+", "+killProcess+") returned "+error);}
+ return error;
+ }
+
+ public static final boolean close(OutputStream os, String fname){
+ if(verbose){System.err.println("close("+os+", "+fname+")");}
+ boolean error=false;
+ if(os!=null){error|=close(os);}
+ if(fname!=null && os!=System.err && os!=System.out){error|=killProcess(fname);}
+ if(verbose){System.err.println("close("+os+", "+fname+") returned "+error);}
+ return error;
+ }
+
+ /**
+  * Flushes and closes os, first finishing the current entry when os is exactly a ZipOutputStream.
+  * System.out and System.err are flushed but never closed.
+  * @param os Stream to close; must not be null.
+  * @return true if any step threw an IOException (each one is printed, not rethrown).
+  */
+ public static final boolean close(OutputStream os){
+     if(verbose){System.err.println("close("+os+")");}
+     boolean error=false;
+     //Step 1: flush
+     try {
+         os.flush();
+     } catch (IOException e) {
+         e.printStackTrace();
+         error=true;
+     }
+
+     //Step 2: finish the zip entry, if applicable
+     if(os.getClass()==ZipOutputStream.class){
+         final ZipOutputStream zos=(ZipOutputStream)os;
+         try {
+             zos.closeEntry();
+             zos.finish();
+         } catch (IOException e) {
+             e.printStackTrace();
+             error=true;
+         }
+     }
+
+     //Step 3: close everything except the console streams
+     final boolean console=(os==System.out || os==System.err);
+     if(!console){
+         try {
+             os.close();
+         } catch (IOException e) {
+             e.printStackTrace();
+             error=true;
+         }
+     }
+     if(verbose){System.err.println("close("+os+") returned "+error);}
+     return error;
+ }
+
+ public static OutputStream getOutputStream(FileFormat ff, boolean buffered){
+ return getOutputStream(ff.name(), ff.append(), buffered, ff.allowSubprocess());
+ }
+
+ public static OutputStream getOutputStream(String fname, boolean append, boolean buffered, boolean allowSubprocess){
+
+ if(verbose){
+ System.err.println("getOutputStream("+fname+", "+append+", "+buffered+", "+allowSubprocess+")");
+ new Exception().printStackTrace(System.err);
+ }
+
+// assert(false) : fname; //TODO: for testing
+// fname=fname.replaceAll("\\\\", "/");
+ fname=fname.replace('\\', '/');
+ assert(fname.indexOf('\\')<0);
+// assert(!fname.contains("//"));
+
+ {//Create directories if needed.
+ final int index=fname.lastIndexOf('/');
+ if(index>0){
+ File f=new File(fname.substring(0, index+1));
+ if(!f.exists()){f.mkdirs();}
+ }
+ }
+
+ boolean gzipped=fname.endsWith(".gz") || fname.endsWith(".gzip");
+ boolean zipped=fname.endsWith(".zip");
+ boolean bzipped=PROCESS_BZ2 && fname.endsWith(".bz2");
+ boolean xz=PROCESS_XZ && fname.endsWith(".xz");
+ boolean dsrced=fname.endsWith(".dsrc");
+
+// assert(false) : fname;
+
+ allowSubprocess=(allowSubprocess && Shared.threads()>1);
+
+ if(gzipped){
+// assert(!append);
+ return getGZipOutputStream(fname, append, allowSubprocess);
+ }else if(zipped){
+ assert(!append) : "Append is not allowed for zip archives.";
+ return getZipOutputStream(fname, buffered, allowSubprocess);
+ }else if(bzipped){
+ assert(!append) : "Append is not allowed for bz2 archives.";
+ return getBZipOutputStream(fname, buffered, append, allowSubprocess);
+ }else if(xz){
+ assert(!append) : "Append is not allowed for xz archives.";
+ return getXZOutputStream(fname, buffered, allowSubprocess);
+ }else if(dsrced){
+ assert(!append) : "Append is not allowed for dsrc archives.";
+ return getDsrcOutputStream(fname, buffered, allowSubprocess);
+ }
+ return getRawOutputStream(fname, append, buffered);
+ }
+
+ /**
+  * Opens an uncompressed stream for fname. "stdout"/"stdout.*" and "stderr"/"stderr.*" map to System.out and
+  * System.err; any '|' in the name is replaced with '_'. If the first open fails, the parent directory is
+  * created (when missing) and the open is retried once.
+  * @throws RuntimeException if the retry also fails.
+  */
+ public static OutputStream getRawOutputStream(String fname, boolean append, boolean buffered){
+     if(verbose){System.err.println("getRawOutputStream("+fname+", "+append+", "+buffered+")");}
+
+     if(fname.equals("stdout") || fname.startsWith("stdout.")){return System.out;}
+     if(fname.equals("stderr") || fname.startsWith("stderr.")){return System.err;}
+
+     if(fname.indexOf('|')>=0){fname=fname.replace('|', '_');}
+
+     FileOutputStream fos=null;
+     try {
+         fos=new FileOutputStream(fname, append);
+     } catch (FileNotFoundException e) {
+         //Retry once after trying to create the parent directory
+         synchronized(ReadWrite.class){
+             try {
+                 final String parent=new File(fname).getParent();
+                 if(parent!=null){
+                     final File dir=new File(parent);
+                     if(!dir.exists() && !dir.mkdirs()){
+                         System.err.println("Warning - could not create directory "+dir.getAbsolutePath());
+                     }
+                 }
+                 fos=new FileOutputStream(fname, append);
+             } catch (Exception e2) {
+                 throw new RuntimeException(e2);
+             }
+         }
+     }
+
+     assert(fos!=null);
+     return buffered ? new BufferedOutputStream(fos) : fos;
+ }
+
+ /**
+  * Opens an XZ output stream. XZ compression is not implemented here, so outside of RAWMODE this always throws;
+  * in RAWMODE the raw file stream is returned.
+  * The raw stream is opened only in RAWMODE, so the unsupported case neither creates/truncates fname nor leaks a handle.
+  * @param allowSubprocess Unused
+  * @throws RuntimeException when not in RAWMODE.
+  */
+ public static OutputStream getXZOutputStream(String fname, boolean buffered, boolean allowSubprocess){
+     if(RAWMODE){return getRawOutputStream(fname, false, buffered);}
+     throw new RuntimeException("Unsupported format: XZ");
+ //  Disabled implementation, kept for reference ('raw' is the raw output stream for fname):
+ //  org.tukaani.xz.LZMA2Options options = new org.tukaani.xz.LZMA2Options();
+ //  options.setPreset(ZIPLEVEL);
+ //  org.tukaani.xz.XZOutputStream out=new org.tukaani.xz.XZOutputStream(raw, options);
+ //  return out;
+ }
+
+ /**
+  * Opens a bz2 output stream. In RAWMODE the raw file stream is returned; otherwise the data goes through
+  * pbzip2 or bzip2, whichever is enabled and available, tried in that order.
+  * @param fname Destination path
+  * @param buffered Used only for the RAWMODE stream
+  * @param append Passed to the external compressor stream; the RAWMODE stream always overwrites
+  * @param allowSubprocess Not consulted by this method
+  * @return The opened stream
+  * @throws RuntimeException if neither pbzip2 nor bzip2 can be used.
+  */
+ public static OutputStream getBZipOutputStream(String fname, boolean buffered, boolean append, boolean allowSubprocess){
+     if(verbose){System.err.println("getBZipOutputStream("+fname+", "+buffered+", "+append+", "+allowSubprocess+")");}
+
+     if(RAWMODE){
+         return getRawOutputStream(fname, false, buffered);
+     }
+
+     final boolean pbzip2=(USE_PBZIP2 && Data.PBZIP2());
+     if(pbzip2){return getPbzip2Stream(fname, append);}
+     final boolean bzip2=(USE_BZIP2 && Data.BZIP2());
+     if(bzip2){return getBzip2Stream(fname, append);}
+
+ //  In-process fallback, disabled and kept for reference:
+ //  raw.write('B');
+ //  raw.write('Z');
+ //  CBZip2OutputStream out=new CBZip2OutputStream(raw, 8192);
+ //  return out;
+
+     throw new RuntimeException("bz2 compression not supported in this version, unless bzip2 or pbzip2 is installed.");
+ }
+
+ /**
+  * Opens a dsrc output stream: the raw file stream in RAWMODE, otherwise getDsrcOutputStream2 when dsrc is enabled and available.
+  * @throws RuntimeException if dsrc cannot be used. */
+ public static OutputStream getDsrcOutputStream(String fname, boolean buffered, boolean append){
+     if(verbose){System.err.println("getDsrcOutputStream("+fname+", "+buffered+", "+append+")");}
+     if(RAWMODE){return getRawOutputStream(fname, false, buffered);}
+
+     final boolean dsrcUsable=(USE_DSRC && Data.DSRC());
+     if(!dsrcUsable){throw new RuntimeException("dsrc compression requires dsrc in the path.");}
+     return getDsrcOutputStream2(fname, append);
+ }
+
+ /** Opens a zip stream on fname containing one entry named basename(fname); in RAWMODE returns the raw stream instead. */
+ public static OutputStream getZipOutputStream(String fname, boolean buffered, boolean allowSubprocess){
+     if(verbose){System.err.println("getZipOutputStream("+fname+", "+buffered+", "+allowSubprocess+")");}
+     final OutputStream raw=getRawOutputStream(fname, false, buffered);
+     if(RAWMODE){return raw;}
+     try {
+         final ZipOutputStream zos=new ZipOutputStream(raw);
+         zos.setLevel(ZIPLEVEL);
+         zos.putNextEntry(new ZipEntry(basename(fname)));
+         return zos;
+     } catch (IOException e) {
+         e.printStackTrace();
+     }
+     //Reached only if the entry could not be started
+     assert(false);
+     return null;
+ }
+
+ /**
+  * Opens a gzip output stream for fname. With allowSubprocess and at least 2 threads, pigz and then gzip are
+  * tried as external processes; otherwise (or if neither is enabled and available) an in-process
+  * GZIPOutputStream at compression level ZIPLEVEL is used. In RAWMODE the raw stream is returned instead.
+  */
+ public static OutputStream getGZipOutputStream(String fname, boolean append, boolean allowSubprocess){
+     if(verbose){System.err.println("getGZipOutputStream("+fname+", "+append+", "+allowSubprocess+")");}
+
+     final boolean useProcess=(allowSubprocess && Shared.threads()>=2);
+     if(useProcess && USE_PIGZ && Data.PIGZ()){return getPigzStream(fname, append);}
+     if(useProcess && USE_GZIP && Data.GZIP()){return getGzipStream(fname, append);}
+
+     final OutputStream raw=getRawOutputStream(fname, append, false);
+     if(RAWMODE){return raw;}
+     try {
+         return new GZIPOutputStream(raw, 8192){
+             {
+                 def.setLevel(ZIPLEVEL); //Instance initializer: apply the configured compression level
+             }
+         };
+     } catch (IOException e) {
+         e.printStackTrace();
+     }
+     assert(false);
+     return null;
+ }
+
+ /**
+  * Starts "pigz -c -p <threads> -<level>" through getOutputStreamFromProcess and returns its stream.
+  * The thread count is (Shared.threads()+1)/ZIP_THREAD_DIVISOR, capped by MAX_ZIP_THREADS and by
+  * Shared.threads()-1, with a floor of 1. Compression levels 1-3 are raised to 4 when
+  * ALLOW_ZIPLEVEL_CHANGE is set and at least 4 threads are used.
+  * @param fname Destination name
+  * @param append Passed through to getOutputStreamFromProcess
+  * @return The stream returned by getOutputStreamFromProcess
+  */
+ public static OutputStream getPigzStream(String fname, boolean append){
+     if(verbose){System.err.println("getPigzStream("+fname+")");}
+     final int divisor=Tools.max(ZIP_THREAD_DIVISOR, 1);
+     int threads=Tools.min(MAX_ZIP_THREADS, Tools.max((Shared.threads()+1)/divisor, 1));
+     threads=Tools.max(1, Tools.min(Shared.threads()-1, threads));
+
+     int zl=ZIPLEVEL;
+     if(ALLOW_ZIPLEVEL_CHANGE && threads>=4 && zl>0 && zl<4){zl=4;}
+
+     final String command="pigz -c -p "+threads+" -"+zl;
+     return getOutputStreamFromProcess(fname, command, true, append, true, true);
+ }
+
+ /** Starts "gzip -c -<ZIPLEVEL>" through getOutputStreamFromProcess, with its output directed to fname, and returns the stream. */
+ public static OutputStream getGzipStream(String fname, boolean append){
+     if(verbose){System.err.println("getGzipStream("+fname+")");}
+     return getOutputStreamFromProcess(fname, "gzip -c -"+ZIPLEVEL, true, append, true, true);
+ }
+
+ /** Starts "bzip2 -c -<ZIPLEVEL>" through getOutputStreamFromProcess, with its output directed to fname, and returns the stream. */
+ public static OutputStream getBzip2Stream(String fname, boolean append){
+     if(verbose){System.err.println("getBzip2Stream("+fname+")");}
+     return getOutputStreamFromProcess(fname, "bzip2 -c -"+ZIPLEVEL, true, append, true, true);
+ }
+
+ /** Starts "pbzip2 -c -p<threads> -<ZIPLEVEL>" through getOutputStreamFromProcess and returns the stream. */
+ public static OutputStream getPbzip2Stream(String fname, boolean append){
+     if(verbose){System.err.println("getPbzip2Stream("+fname+")");}
+     int threads=Tools.min(MAX_ZIP_THREADS, Tools.max((Shared.threads()+1)/Tools.max(ZIP_THREAD_DIVISOR, 1), 1));
+     threads=Tools.max(1, Tools.min(Shared.threads()-1, threads)); //Cap at Shared.threads()-1, with a floor of 1
+     return getOutputStreamFromProcess(fname, "pbzip2 -c -p"+threads+" -"+ZIPLEVEL, true, append, true, true);
+ }
+
+ public static OutputStream getDsrcOutputStream2(String fname, boolean append){ //Builds a "dsrc c" command line and launches it via getOutputStreamFromProcess
+ if(verbose){System.err.println("getDsrcOutpustream2("+fname+")");}
+ int threads=Tools.min(MAX_ZIP_THREADS, Tools.max((Shared.threads()+1)/Tools.max(ZIP_THREAD_DIVISOR, 1), 1));
+ threads=Tools.max(1, Tools.min(Shared.threads()-1, threads)); //Cap at Shared.threads()-1, with a floor of 1
+ String params=null;
+ if(ZIPLEVEL<=2){ //Higher ZIPLEVEL ranges select larger -d and -b values
+ params="-d0 -q0 -b8";
+ }else if(ZIPLEVEL<=4){
+ params="-d1 -q1 -b16";
+ }else if(ZIPLEVEL<=8){
+ params="-d2 -q2 -b32";
+ }else{
+ params="-d3 -q2 -b64";
+ }
+ String command="dsrc c -t"+threads+" "+params+" -s";
+ if(fname.equals("stdout") || fname.startsWith("stdout.")){
+ //No dsrc command-line form for writing to stdout is defined here
+ assert(false) : "Undefined dsrc option.";
+ }else{
+ command+=" "+fname;
+ }
+ System.err.println(command);//Always printed, regardless of 'verbose'
+// OutputStream out=getOutputStreamFromProcess(fname, command, true, append, true);
+ OutputStream out=getOutputStreamFromProcess(fname, command+" "+fname, true, append, true, false); //NOTE(review): fname was already appended above, so it appears twice in the executed command (the printed one has it once) - confirm dsrc expects this
+ return out;
+ }
+
+ /**
+  * Starts an external command and returns the stream that feeds its standard input.
+  * The process is registered via addProcess, and a PipeThread is started to copy its error stream to System.err.
+  * On the ProcessBuilder path the child's stderr is additionally set to inherit from this process.
+  * @param fname Destination name; "stdout" or "stdout.*" routes the command's output to this program's stdout
+  * @param command Command line; on the ProcessBuilder path it is split on single spaces
+  * @param sh On the Runtime.exec path, run the command through "sh -c"; the non-sh path asserts false
+  * @param append With useFname, append to fname instead of overwriting it
+  * @param useProcessBuilder Launch with ProcessBuilder instead of Runtime.exec
+  * @param useFname Redirect the command's stdout to fname
+  * @return The stream connected to the process's standard input
+  * @throws RuntimeException if the process cannot be started
+  */
+ public static OutputStream getOutputStreamFromProcess(String fname, String command, boolean sh, boolean append, boolean useProcessBuilder, boolean useFname){
+     if(verbose){System.err.println("getOutputStreamFromProcess("+fname+", "+command+", "+sh+", "+useProcessBuilder+")");}
+
+     final boolean toStdout=(fname.equals("stdout") || fname.startsWith("stdout."));
+     Process p=null;
+     if(useProcessBuilder){
+         ProcessBuilder pb=new ProcessBuilder();
+         pb.redirectError(Redirect.INHERIT);
+
+         if(toStdout){
+             pb.redirectOutput(Redirect.INHERIT);
+         }else if(useFname){
+             if(append){
+                 pb.redirectOutput(ProcessBuilder.Redirect.appendTo(new File(fname)));
+             }else{
+                 pb.redirectOutput(new File(fname));
+             }
+         }
+         //NOTE(review): splitting on spaces breaks any argument that itself contains a space
+         pb.command(command.split(" "));
+
+         try {
+             p=pb.start();
+         } catch (IOException e) {
+             //Fail here with the cause attached, rather than with a NullPointerException on p below
+             throw new RuntimeException("Could not start process: "+command, e);
+         }
+
+         //Presumably recorded so that killProcess(fname) can find the process later - confirm against addProcess
+         addProcess(fname, p);
+         final OutputStream out=p.getOutputStream();
+         InputStream es=p.getErrorStream();
+         assert(es!=null);
+         PipeThread et=new PipeThread(es, System.err);
+         addPipeThread(fname, et);
+         et.start();
+         return out;
+     }
+
+     if(toStdout){
+         try {
+             p=Runtime.getRuntime().exec(command);
+         } catch (IOException e) {
+             throw new RuntimeException("Could not start process: "+command, e);
+         }
+
+         //Copy the command's stdout to this program's stdout
+         InputStream is=p.getInputStream();
+         PipeThread it=new PipeThread(is, System.out);
+         addPipeThread(fname, it);
+         it.start();
+     }else{
+         try {
+             if(sh){
+                 //NOTE(review): fname is appended to the shell command unquoted
+                 String[] cmd = {
+                     "sh",
+                     "-c",
+                     command+(useFname ? " 1"+(append ? ">>" : ">")+fname : "")
+                 };
+                 p=Runtime.getRuntime().exec(cmd);
+             }else{
+                 //TODO: append won't work here...
+                 assert(false) : command;
+                 p=Runtime.getRuntime().exec(command);
+             }
+         } catch (IOException e) {
+             throw new RuntimeException("Could not start process: "+command, e);
+         }
+     }
+
+     //Same registration and stderr forwarding as on the ProcessBuilder path
+     addProcess(fname, p);
+     final OutputStream out=p.getOutputStream();
+     InputStream es=p.getErrorStream();
+     assert(es!=null);
+     PipeThread et=new PipeThread(es, System.err);
+     addPipeThread(fname, et);
+     et.start();
+
+     return out;
+ }
+
+ /**
+  * Reads the whole of fname as text, line by line, using the platform default charset.
+  * @return The contents with every line terminated by '\n'; null only if KillSwitch.memKill(e) returns
+  *         normally after an OutOfMemoryError.
+  * @throws RuntimeException wrapping any IOException.
+  */
+ public static String readString(String fname){
+     if(verbose){System.err.println("readString("+fname+")");}
+     final InputStream is=getInputStream(fname, false, false);
+     String text=null;
+
+     try {
+         final StringBuilder sb=new StringBuilder();
+         final BufferedReader in=new BufferedReader(new InputStreamReader(is), INBUF);
+         //readLine() strips line terminators, so each line is re-terminated with '\n'
+         for(String line=in.readLine(); line!=null; line=in.readLine()){
+             sb.append(line).append('\n');
+         }
+         in.close();
+         text=sb.toString();
+     } catch (FileNotFoundException e) {
+         throw new RuntimeException(e);
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     } catch (OutOfMemoryError e) {
+         KillSwitch.memKill(e);
+     }
+
+     return text;
+ }
+
+ /**
+  * Deserializes one object from fname. NOTE(review): allowSubprocess is not used; the stream is always opened
+  * with getInputStream(fname, true, false) - confirm whether that is intentional.
+  */
+ public static Object readObject(String fname, boolean allowSubprocess){
+     if(verbose){System.err.println("readObject("+fname+")");}
+     final InputStream is=getInputStream(fname, true, false);
+     Object result=null;
+     try {
+         final ObjectInputStream in=new ObjectInputStream(is);
+         result=in.readObject();
+         in.close();
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     } catch (ClassNotFoundException e) {
+         throw new RuntimeException(e);
+     } catch (OutOfMemoryError e) {
+         KillSwitch.memKill(e);
+     }
+     return result;
+ }
+
+ /**
+  * Opens fname for reading, choosing a decompressor from the extension (.zip, .gz/.gzip, .bz2, .dsrc) or
+  * "samtools view -h" for .bam when samtools is available. In RAWMODE, and for every other extension
+  * (including .xz), the raw stream is returned. allowSubprocess is forced to false unless Shared.threads()>1.
+  */
+ public static InputStream getInputStream(String fname, boolean buffer, boolean allowSubprocess){
+     if(verbose){System.err.println("getInputStream("+fname+", "+buffer+", "+allowSubprocess+")");}
+     final boolean gz=fname.endsWith(".gz") || fname.endsWith(".gzip");
+     final boolean zip=fname.endsWith(".zip");
+     final boolean bz2=PROCESS_BZ2 && fname.endsWith(".bz2");
+     final boolean dsrc=fname.endsWith(".dsrc");
+     final boolean bam=fname.endsWith(".bam") && Data.SAMTOOLS();
+     allowSubprocess=(allowSubprocess && Shared.threads()>1);
+     if(RAWMODE){return getRawInputStream(fname, buffer);}
+     if(zip){return getZipInputStream(fname);}
+     if(gz){return getGZipInputStream(fname, allowSubprocess);}
+     if(bz2){return getBZipInputStream(fname, allowSubprocess);}
+     if(dsrc){return getDsrcInputStream(fname);}
+     if(bam){return getInputStreamFromProcess(fname, "samtools view -h", false);}
+     return getRawInputStream(fname, buffer);
+ }
+
+ public static InputStream getRawInputStream(String fname, boolean buffer){ //Opens fname uncompressed; may return System.in or read a "jar:" URL
+ if(verbose){System.err.println("getRawInputStream("+fname+", "+buffer+")");}
+
+ assert(fname!=null);
+ fname=fname.replace('\\', '/'); //Normalize backslash separators to forward slashes
+ assert(fname.indexOf('\\')<0);
+ assert(!fname.contains("\\\\"));
+// assert(!fname.contains("//")) : fname;
+
+ final boolean jar=fname.startsWith("jar:"); //Names starting with "jar:" are opened as URLs below, skipping the file checks
+
+ if(!jar){
+ boolean failed=false;
+ File f=new File(fname);
+ if(!f.exists()){
+ String f2=fname.toLowerCase();
+ if(f2.equals("stdin") || f2.startsWith("stdin.")){ //Case-insensitive; only reached when no file of that name exists
+ // System.err.println("Returning stdin: A");
+ return System.in;
+ }
+
+ if(fname.indexOf('/')<0){ //A name without any '/' gets a second chance under Data.ROOT_CURRENT
+ f2=Data.ROOT_CURRENT+"/"+fname;
+ if(!new File(f2).exists()){
+ failed=true;
+ }else{
+ fname=f2;
+ }
+ }else{
+ failed=true;
+ }
+ }
+ if(failed){throw new RuntimeException("Can't find file "+fname);}
+ }
+
+// System.err.println("Getting input stream for "+fname);
+// assert(!fname.contains("\\"));
+// assert(!loadedFiles.contains(fname)) : "Already loaded "+fname;
+// loadedFiles.add(fname);
+
+ InputStream in=null;
+ if(jar){
+ try {
+
+ URL url=new URL(fname);
+
+ InputStream is=url.openStream();
+
+ if(buffer){
+ BufferedInputStream bis=new BufferedInputStream(is, INBUF); //INBUF is the buffer size
+ in=bis;
+ }else{
+ in=is;
+ }
+
+ } catch (FileNotFoundException e) { //All three handlers log the name and rethrow as RuntimeException
+ System.err.println("Error when attempting to read "+fname);
+ throw new RuntimeException(e);
+ } catch (MalformedURLException e) {
+ System.err.println("Error when attempting to read "+fname);
+ throw new RuntimeException(e);
+ } catch (IOException e) {
+ System.err.println("Error when attempting to read "+fname);
+ throw new RuntimeException(e);
+ }
+ }else{
+ try {
+
+ FileInputStream fis=new FileInputStream(fname);
+
+ if(buffer){
+ BufferedInputStream bis=new BufferedInputStream(fis, INBUF);
+ in=bis;
+ }else{
+ in=fis;
+ }
+
+ } catch (FileNotFoundException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ return in;
+ }
+
+ /** Same as getZipInputStream(fname, true). */
+ public static InputStream getZipInputStream(String fname){return getZipInputStream(fname, true);}
+ /**
+  * Opens fname as a zip archive positioned at its first entry.
+  * With assertions enabled, the entry must exist and be named basename(fname).
+  * @throws RuntimeException wrapping any IOException, after logging the file name to stderr.
+  */
+ public static InputStream getZipInputStream(String fname, boolean buffer){
+     if(verbose){System.err.println("getZipInputStream("+fname+", "+buffer+")");}
+     final InputStream raw=getRawInputStream(fname, buffer);
+     final String expectedEntry=basename(fname);
+     try {
+         final ZipInputStream zis=new ZipInputStream(raw);
+         final ZipEntry first=zis.getNextEntry();
+         assert(first!=null);
+         assert(expectedEntry.equals(first.getName())) : expectedEntry+" != "+first.getName();
+         return zis;
+     } catch (FileNotFoundException e) {
+         System.err.println("Error when attempting to read "+fname);
+         throw new RuntimeException(e);
+     } catch (IOException e) {
+         System.err.println("Error when attempting to read "+fname);
+         throw new RuntimeException(e);
+     }
+ }
+
+ /**
+  * Opens a gzip-compressed file for reading.
+  * With allowSubprocess, more than 2 threads, and a name that does not start with "jar:", pigz and then gzip
+  * are tried as external decompressors (subject to USE_UNPIGZ/Data.PIGZ() and USE_GUNZIP/Data.GUNZIP()).
+  * Otherwise an in-process GZIPInputStream with buffer size INBUF wraps the unbuffered raw stream.
+  * @throws RuntimeException wrapping any IOException, after logging the file name to stderr.
+  */
+ public static InputStream getGZipInputStream(String fname, boolean allowSubprocess){
+     if(verbose){System.err.println("getGZipInputStream("+fname+", "+allowSubprocess+")");}
+
+     final boolean useProcess=(allowSubprocess && Shared.threads()>2 && !fname.startsWith("jar:"));
+     if(useProcess){
+         if(verbose){System.err.println("Fetching gzip input stream: "+fname+", "+allowSubprocess+", "+USE_UNPIGZ+", "+Data.PIGZ());}
+         if(USE_UNPIGZ && Data.PIGZ()){return getUnpigzStream(fname);}
+         if(USE_GUNZIP && Data.GUNZIP()){return getGunzipStream(fname);}
+     }
+
+     final InputStream raw=getRawInputStream(fname, false);
+     try {
+         return new GZIPInputStream(raw, INBUF);
+     } catch (FileNotFoundException e) {
+         System.err.println("Error when attempting to read "+fname);
+         throw new RuntimeException(e);
+     } catch (IOException e) {
+         System.err.println("Error when attempting to read "+fname);
+         throw new RuntimeException(e);
+     }
+ }
+
+ /** Returns the stdout of an external "gzip -c -d" process run on fname. */
+ public static InputStream getGunzipStream(String fname){
+     if(verbose){
+         System.err.println("getGunzipStream("+fname+")");
+     }
+     final String decompressCommand="gzip -c -d";
+     return getInputStreamFromProcess(fname, decompressCommand, false);
+ }
+
+ /** Returns the stdout of an external "pigz -c -d" process run on fname. */
+ public static InputStream getUnpigzStream(String fname){
+     if(verbose){
+         System.err.println("getUnpigzStream("+fname+")");
+     }
+     final String decompressCommand="pigz -c -d";
+     return getInputStreamFromProcess(fname, decompressCommand, false);
+ }
+
+ /** Returns the stdout of an external "pbzip2 -c -d" process run on fname. */
+ public static InputStream getUnpbzip2Stream(String fname){
+     if(verbose){
+         System.err.println("getUnpbzip2Stream("+fname+")");
+     }
+     final String decompressCommand="pbzip2 -c -d";
+     return getInputStreamFromProcess(fname, decompressCommand, false);
+ }
+
+ /** Returns the stdout of an external "bzip2 -c -d" process run on fname. */
+ public static InputStream getUnbzip2Stream(String fname){
+     if(verbose){
+         System.err.println("getUnbzip2Stream("+fname+")");
+     }
+     final String decompressCommand="bzip2 -c -d";
+     return getInputStreamFromProcess(fname, decompressCommand, false);
+ }
+
+ /**
+  * Returns the stdout of an external "dsrc d -s -tN" process run on fname.
+  * N is (Shared.threads()+1) divided by ZIP_THREAD_DIVISOR (divisor treated as at least 1),
+  * limited to MAX_ZIP_THREADS and to Shared.threads()-1, and never below 1.
+  */
+ public static InputStream getUnDsrcStream(String fname){
+     if(verbose){System.err.println("getUnDsrcStream("+fname+")");}
+     final int divisor=Tools.max(ZIP_THREAD_DIVISOR, 1);
+     final int share=Tools.max((Shared.threads()+1)/divisor, 1);
+     final int capped=Tools.min(MAX_ZIP_THREADS, share);
+     final int threads=Tools.max(1, Tools.min(Shared.threads()-1, capped));
+     return getInputStreamFromProcess(fname, "dsrc d -s -t"+threads, false);
+ }
+
+
+ /**
+  * Starts an external command and returns a stream over its standard output.
+  * <ul>
+  * <li>fname==null: runs command alone; the command string becomes the registry key.</li>
+  * <li>fname is "stdin" or starts with "stdin.": runs command and pumps System.in into the process via a PipeThread.</li>
+  * <li>otherwise: runs command with fname appended as the last argument.</li>
+  * </ul>
+  * The process is registered with addProcess, and a PipeThread forwarding its stderr to System.err is
+  * registered with addPipeThread, both under the same key.
+  * @param fname File to read, "stdin"/"stdin.*" for standard input, or null to run the bare command
+  * @param command Command line to execute
+  * @param cat If true, feed the file to the command through "cat fname | command" (untested; unsupported for stdin)
+  * @return The standard output of the started process
+  * @throws RuntimeException if the process cannot be started, or if cat is requested for stdin
+  */
+ public static InputStream getInputStreamFromProcess(String fname, String command, boolean cat){
+     if(verbose){System.err.println("getInputStreamFromProcess("+fname+", "+command+", "+cat+")");}
+
+     final boolean fromStdin=(fname!=null && (fname.equals("stdin") || fname.startsWith("stdin.")));
+     if(fromStdin && cat){
+         throw new RuntimeException("cat mode is not supported when reading from stdin.");
+     }
+
+     final Process p;
+     try {
+         if(fname==null || fromStdin){
+             p=Runtime.getRuntime().exec(command);
+         }else if(cat){
+             assert(false) : "This mode is untested.";
+             //A pipeline needs a shell, and sh only interprets a command string when it follows -c.
+             //NOTE(review): fname is pasted into the shell command unescaped; do not use with untrusted names.
+             p=Runtime.getRuntime().exec(new String[] {"sh", "-c", "cat "+fname+" | "+command});
+         }else{
+             p=Runtime.getRuntime().exec(command+" "+fname);
+         }
+     } catch (IOException e) {
+         //Formerly the exception was only printed and the null Process caused a NullPointerException below.
+         System.err.println("Error when attempting to run "+command+(fname==null ? "" : " on "+fname));
+         throw new RuntimeException(e);
+     }
+
+     //Key under which killProcess can later find the process and its pipe threads.
+     final String key=(fname==null ? command : fname);
+
+     if(fromStdin){
+         OutputStream os=p.getOutputStream();
+         PipeThread it=new PipeThread(System.in, os);
+         addPipeThread(key, it);
+         it.start();
+     }
+
+     addProcess(key, p);
+     final InputStream in=p.getInputStream();
+     final InputStream es=p.getErrorStream();
+     assert(es!=null);
+     PipeThread et=new PipeThread(es, System.err);
+     addPipeThread(key, et);
+     et.start();
+
+     return in;
+ }
+
+
+ public static InputStream getBZipInputStream(String fname, boolean allowSubprocess){
+ if(verbose){System.err.println("getBZipInputStream("+fname+")");}
+ InputStream in=null;
+
+ try {in=getBZipInputStream2(fname, allowSubprocess);}
+ catch (IOException e) {
+ System.err.println("Error when attempting to read "+fname);
+ throw new RuntimeException(e);
+ }catch (NullPointerException e) {
+ System.err.println("Error when attempting to read "+fname);
+ throw new RuntimeException(e);
+ }
+
+ assert(in!=null);
+ return in;
+ }
+
+ private static InputStream getBZipInputStream2(String fname, boolean allowSubprocess) throws IOException{
+ if(verbose){
+ if(verbose){System.err.println("getBZipInputStream("+fname+")");}
+ }
+
+ if(!fname.startsWith("jar:")){
+ if(verbose){System.err.println("Fetching bz2 input stream: "+fname+", "+USE_PBZIP2+", "+USE_BZIP2+", "+Data.PBZIP2()+Data.BZIP2());}
+ if(USE_PBZIP2 && Data.PBZIP2()){return getUnpbzip2Stream(fname);}
+ if(USE_BZIP2 && Data.BZIP2()){return getUnbzip2Stream(fname);}
+ }
+
+ throw new IOException("\nbzip2 or pbzip2 must be in the path to read bz2 files:\n"+fname+"\n");
+ }
+
+ public static InputStream getDsrcInputStream(String fname){
+ if(verbose){System.err.println("getDsrcInputStream("+fname+")");}
+ InputStream in=null;
+
+ try {in=getDsrcInputStream2(fname);}
+ catch (IOException e) {
+ System.err.println("Error when attempting to read "+fname);
+ throw new RuntimeException(e);
+ }catch (NullPointerException e) {
+ System.err.println("Error when attempting to read "+fname);
+ throw new RuntimeException(e);
+ }
+
+ assert(in!=null);
+ return in;
+ }
+
+ private static InputStream getDsrcInputStream2(String fname) throws IOException{
+ if(verbose){
+ if(verbose){System.err.println("getDsrcInputStream2("+fname+")");}
+ }
+
+ if(USE_DSRC && Data.DSRC()){return getUnDsrcStream(fname);}
+
+ throw new IOException("\nDsrc must be in the path to read Dsrc files:\n"+fname+"\n");
+ }
+
+ /**
+  * Placeholder for XZ input; the implementation is commented out, so this always returns null.
+  * @param fname Path of the xz file (unused)
+  * @return null
+  */
+ public static InputStream getXZInputStream(String fname){
+     //Disabled implementation, kept for reference:
+// if(PROCESS_XZ){
+//     InputStream raw=getRawInputStream(fname, true);
+//     try {
+//         in=new org.tukaani.xz.XZInputStream(raw);
+//     } catch (FileNotFoundException e) {
+//         throw new RuntimeException(e);
+//     } catch (IOException e) {
+//         throw new RuntimeException(e);
+//     }
+// }
+     return null;
+ }
+
+
+ /**
+  * Reads an object via readObject and returns it cast (unchecked) to X.
+  * @param cx Type token; only used to bind X
+  * @param fname File to read
+  * @param allowSubprocess Forwarded to readObject
+  */
+ @SuppressWarnings("unchecked")
+ public static <X> X read(Class<X> cx, String fname, boolean allowSubprocess){
+     final Object loaded=readObject(fname, allowSubprocess);
+     return (X)loaded;
+ }
+
+ /**
+  * Reads an object via readObject and returns it cast to X[].
+  * @param cx Type token; only used to bind X
+  * @param fname File to read
+  * @param allowSubprocess Forwarded to readObject
+  */
+ @SuppressWarnings("unchecked")
+ public static <X> X[] readArray(Class<X> cx, String fname, boolean allowSubprocess){
+     final Object loaded=readObject(fname, allowSubprocess);
+     return (X[])loaded;
+ }
+
+ /**
+  * Reads an object via readObject and returns it cast to X[][].
+  * @param cx Type token; only used to bind X
+  * @param fname File to read
+  * @param allowSubprocess Forwarded to readObject
+  */
+ @SuppressWarnings("unchecked")
+ public static <X> X[][] readArray2(Class<X> cx, String fname, boolean allowSubprocess){
+     final Object loaded=readObject(fname, allowSubprocess);
+     return (X[][])loaded;
+ }
+
+ /**
+  * Reads an object via readObject and returns it cast to X[][][].
+  * @param cx Type token; only used to bind X
+  * @param fname File to read
+  * @param allowSubprocess Forwarded to readObject
+  */
+ @SuppressWarnings("unchecked")
+ public static <X> X[][][] readArray3(Class<X> cx, String fname, boolean allowSubprocess){
+     final Object loaded=readObject(fname, allowSubprocess);
+     return (X[][][])loaded;
+ }
+
+
+ public static String basename(String fname){
+ fname=fname.replace('\\', '/');
+ boolean xz=fname.endsWith(".xz");
+ boolean gzipped=fname.endsWith(".gz");
+ boolean zipped=fname.endsWith(".zip");
+ boolean bzipped=PROCESS_BZ2 && fname.endsWith(".bz2");
+ boolean dsrced=fname.endsWith(".dsrc");
+ String basename=fname;
+// if(basename.contains("\\")){basename=basename.substring(basename.lastIndexOf("\\")+1);}
+ if(basename.contains("/")){basename=basename.substring(basename.lastIndexOf('/')+1);}
+ if(zipped || bzipped){basename=basename.substring(0, basename.length()-4);}
+ else if(gzipped){basename=basename.substring(0, basename.length()-3);}
+ else if(dsrced){basename=basename.substring(0, basename.length()-5);}
+ return basename;
+ }
+
+ public static String rawName(String fname){
+ for(String s : compressedExtensions){
+ while(fname.endsWith(s)){fname=fname.substring(0, fname.length()-s.length());}
+ }
+ return fname;
+ }
+
+ public static String stripExtension(String fname){
+ if(fname==null){return null;}
+ for(String s0 : FileFormat.EXTENSION_LIST){
+ String s="."+s0;
+ if(fname.endsWith(s)){return stripExtension(fname.substring(0, fname.length()-s.length()));}
+ }
+ return fname;
+ }
+
+ /**
+  * Returns the part of fname that stripExtension would remove, including the leading dot.
+  * @param fname File name, possibly null
+  * @return The recognized extension(s), "" if there are none, or null if fname is null
+  */
+ public static String getExtension(String fname){
+     if(fname==null){return null;}
+     final String core=stripExtension(fname);
+     if(core==null){return fname;}
+     final int coreLength=core.length();
+     return (coreLength==fname.length() ? "" : fname.substring(coreLength));
+ }
+
+ /** Removes the directory portion (stripPath) and then recognized extensions (stripExtension) from fname. */
+ public static String stripToCore(String fname){
+     final String leaf=stripPath(fname);
+     return stripExtension(leaf);
+ }
+
+ /**
+  * Returns the text after the last path separator; backslashes count as separators.
+  * @param fname File name or path, possibly null
+  * @return The final path element (with backslashes converted to '/'), or null if fname is null
+  */
+ public static String stripPath(String fname){
+     if(fname==null){return null;}
+     final String normalized=fname.replace('\\', '/');
+     final int lastSlash=normalized.lastIndexOf('/');
+     return (lastSlash<0 ? normalized : normalized.substring(lastSlash+1));
+ }
+
+ /**
+  * Returns the directory portion of fname: everything before the last path separator,
+  * without the trailing separator. Backslashes are converted to '/' first.
+  * Previously the computed directory was discarded and "" was always returned.
+  * @param fname File name or path, possibly null
+  * @return The directory portion, "" if fname has no separator, or null if fname is null
+  */
+ public static String getPath(String fname){
+     if(fname==null){return null;}
+     fname=fname.replace('\\', '/');
+     final int lastSlash=fname.lastIndexOf('/');
+     return (lastSlash<0 ? "" : fname.substring(0, lastSlash));
+ }
+
+ /**
+  * Identifies the compression suffix of fname, case-insensitively.
+  * @param fname File name; must not be null
+  * @return The first matching entry of compressedExtensions without its leading dot, or null if none matches
+  */
+ public static String compressionType(String fname){
+     final String lower=fname.toLowerCase(Locale.ENGLISH);
+     for(int i=0; i<compressedExtensions.length; i++){
+         final String ext=compressedExtensions[i];
+         if(lower.endsWith(ext)){
+             return ext.substring(1);
+         }
+     }
+     return null;
+ }
+
+ /** Returns true when compressionType recognizes a compression suffix on fname. */
+ public static boolean isCompressed(String fname){
+     final String type=compressionType(fname);
+     return type!=null;
+ }
+
+ /**
+  * Tests, case-insensitively, whether fname ends with ".sam", optionally followed by one compression suffix.
+  * @param fname File name; must not be null
+  * @return true for names such as "x.sam" or "x.sam.gz"
+  */
+ public static boolean isSam(String fname){
+     final String lower=fname.toLowerCase(Locale.ENGLISH);
+     if(lower.endsWith(".sam")){return true;}
+     if(compressionType(lower)==null){return false;}
+     //Drop the final (compression) extension and look again
+     final String withoutLast=lower.substring(0, lower.lastIndexOf('.'));
+     return withoutLast.endsWith(".sam");
+ }
+
+ /**
+  * Returns the lowercase extension of fname after rawName has removed compression suffixes.
+  * @param fname File name; must not be null
+  * @return Text after the last '.', lowercased, or "" if there is no '.'
+  */
+ public static String rawExtension(String fname){
+     final String uncompressedName=rawName(fname);
+     final int dot=uncompressedName.lastIndexOf('.');
+     return (dot<0 ? "" : uncompressedName.substring(dot+1).toLowerCase(Locale.ENGLISH));
+ }
+
+ /**
+  * Returns the directory of an existing path, ending with FILESEP.
+  * For a directory, the path itself (with FILESEP appended if missing); for a file, everything up to and
+  * including the last FILESEP, or "" if there is none.
+  * @param path Existing file or directory
+  * @throws RuntimeException if path is neither a directory nor a regular file
+  */
+ public static String parseRoot(String path){
+     final File f=new File(path);
+     if(f.isDirectory()){
+         return (path.endsWith(FILESEP) ? path : path+FILESEP);
+     }
+     if(!f.isFile()){
+         throw new RuntimeException("Can't find "+path); //Try using parseRoot2 instead.
+     }
+     final int slash=path.lastIndexOf(FILESEP);
+     return (slash<0 ? "" : path.substring(0, slash+1));
+ }
+
+ /**
+  * Like parseRoot, but does not throw for paths that do not exist: such a path is returned unchanged when
+  * it ends with FILESEP, and otherwise truncated after its last FILESEP ("" if it has none).
+  * @param path File or directory, existing or not
+  * @throws RuntimeException if path exists but is neither a directory nor a regular file
+  */
+ public static String parseRoot2(String path){
+     final File f=new File(path);
+     final boolean exists=f.exists();
+
+     if(!exists && path.endsWith(FILESEP)){return path;}
+
+     if(exists){
+         if(f.isDirectory()){
+             return (path.endsWith(FILESEP) ? path : path+FILESEP);
+         }
+         if(!f.isFile()){
+             throw new RuntimeException("Can't find "+path);
+         }
+     }
+
+     //Nonexistent path or regular file: keep everything through the last separator
+     final int slash=path.lastIndexOf(FILESEP);
+     return (slash<0 ? "" : path.substring(0, slash+1));
+ }
+
+ /**
+  * Looks for an existing file under fname or a variant of it.
+  * Tried in order: fname itself; fname without its trailing .zip/.gz/.bz2/.xz (bz2 and xz only when
+  * PROCESS_BZ2/PROCESS_XZ); then that base name plus ".gz", ".zip", ".bz2" (if PROCESS_BZ2), ".xz" (if PROCESS_XZ).
+  * @param fname Requested file name
+  * @return The first candidate that exists, or fname if none does
+  */
+ public static String findFileExtension(final String fname){
+     if(new File(fname).exists()){return fname;}
+
+     final boolean hasCompressionSuffix=fname.endsWith(".zip") || fname.endsWith(".gz")
+             || (PROCESS_BZ2 && fname.endsWith(".bz2")) || (PROCESS_XZ && fname.endsWith(".xz"));
+     final String base=(hasCompressionSuffix ? fname.substring(0, fname.lastIndexOf('.')) : fname);
+     if(new File(base).exists()){return base;}
+
+     final String[] suffixes={".gz", ".zip", ".bz2", ".xz"};
+     final boolean[] enabled={true, true, PROCESS_BZ2, PROCESS_XZ};
+     for(int i=0; i<suffixes.length; i++){
+         if(!enabled[i]){continue;}
+         final String candidate=base+suffixes[i];
+         if(new File(candidate).exists()){return candidate;}
+     }
+
+     return fname;
+ }
+
+ /** Copies source to dest without creating missing parent directories. */
+ public static synchronized void copyFile(String source, String dest){copyFile(source, dest, false);}
+
+ /**
+  * Copies source to dest through getInputStream/getOutputStream.
+  * When both names end with the same compression suffix (.zip, .gz, .bz2 or .xz), RAWMODE is switched on
+  * for the duration of the copy. RAWMODE is always restored and both streams are always closed,
+  * including when the copy fails.
+  * @param source File to read
+  * @param dest File to write; asserted not to exist yet
+  * @param createPathIfNeeded Create dest's parent directories when they are missing
+  * @throws RuntimeException wrapping any IOException raised during the copy
+  */
+ public static synchronized void copyFile(String source, String dest, boolean createPathIfNeeded){
+
+     assert(!new File(dest).exists()) : "Destination file already exists: "+dest;
+     if(createPathIfNeeded){
+         File parent=new File(dest).getParentFile();
+         if(parent!=null && !parent.exists()){
+             parent.mkdirs();
+         }
+     }
+
+     final boolean oldRawmode=RAWMODE;
+     final boolean sameCompression=(source.endsWith(".zip") && dest.endsWith(".zip"))
+             || (source.endsWith(".gz") && dest.endsWith(".gz"))
+             || (source.endsWith(".bz2") && dest.endsWith(".bz2"))
+             || (source.endsWith(".xz") && dest.endsWith(".xz"));
+     if(sameCompression){
+         RAWMODE=true;
+     }
+
+     InputStream in=null;
+     OutputStream out=null;
+     try{
+         in=getInputStream(source, false, false);
+         out=getOutputStream(dest, false, false, true);
+
+         final byte[] buffer=new byte[INBUF];
+         for(int len=in.read(buffer); len>0; len=in.read(buffer)){
+             out.write(buffer, 0, len);
+         }
+
+         in.close();
+         in=null;
+         out.flush();
+         if(out.getClass()==ZipOutputStream.class){
+             ZipOutputStream zos=(ZipOutputStream)out;
+             zos.closeEntry();
+             zos.finish();
+         }
+         out.close();
+         out=null;
+
+     }catch(IOException e){
+         //FileNotFoundException is an IOException and was handled identically.
+         throw new RuntimeException(e);
+     }finally{
+         RAWMODE=oldRawmode;
+         //Only reached with open streams when the copy failed; secondary close errors are ignored.
+         if(in!=null){
+             try{in.close();}catch(IOException e){/* already failing */}
+         }
+         if(out!=null){
+             try{out.close();}catch(IOException e){/* already failing */}
+         }
+     }
+ }
+
+ public static void copyDirectoryContents(String from, String to){
+ assert(!from.equalsIgnoreCase(to));
+
+ if(to.indexOf('\\')>0){to=to.replace('\\', '/');}
+
+ File d1=new File(from);
+ assert(d1.exists());
+ assert(d1.isDirectory());
+
+ File d2=new File(to);
+ assert(!d1.equals(d2));
+ if(d2.exists()){
+ assert(d2.isDirectory());
+ }else{
+ d2.mkdirs();
+ }
+ if(!to.endsWith("/")){to=to+"/";}
+
+ File[] array=d1.listFiles();
+
+ for(File f : array){
+ String name=f.getName();
+ String dest=to+name;
+ if(f.isFile()){
+ copyFile(f.getAbsolutePath(), dest);
+ }else{
+ assert(f.isDirectory());
+ File f2=new File(dest);
+ if(!f2.exists()){
+ f2.mkdir();
+ }else{
+ assert(f2.isDirectory());
+ }
+ copyDirectoryContents(f.getAbsolutePath(), f2.getAbsolutePath());
+ }
+ }
+
+ }
+
+
+ /**
+  * Updates the counters in activeThreads ({active, waiting, running}).
+  * A positive x adds x threads to both the active and waiting counts; a negative x is passed to
+  * addRunningThread, which lowers the active and running counts.
+  * @param x Nonzero change in thread count
+  * @return The active count after the update
+  */
+ private static final int addThread(int x){
+     if(verbose){System.err.println("addThread("+x+")");}
+     synchronized(activeThreads){
+         assert(x!=0);
+         if(x<=0){
+             addRunningThread(x);
+         }else{
+             activeThreads[0]+=x;
+             activeThreads[1]+=x;
+         }
+         assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 &&
+                 activeThreads[2]>=0 && activeThreads[2]<=maxWriteThreads) : Arrays.toString(activeThreads);
+
+         return activeThreads[0];
+     }
+ }
+
+ /**
+  * Moves threads between the waiting and running counts of activeThreads ({active, waiting, running}).
+  * For x>0: blocks until the running count is below the limit, then moves x threads from waiting to running.
+  * For x<0: removes |x| threads from both the active and running counts.
+  * The limit is 1 when Shared.LOW_MEMORY is set, otherwise maxWriteThreads.
+  * @param x Nonzero change in the number of running threads
+  * @return The running count after the update
+  */
+ private static final int addRunningThread(int x){
+ if(verbose){System.err.println("addRunningThread("+x+")");}
+ final int max=(Shared.LOW_MEMORY ? 1 : maxWriteThreads);
+ synchronized(activeThreads){
+ assert(x!=0);
+ if(x>0){
+ assert(activeThreads[1]>=x);
+ //Wait for a free running slot; the condition is rechecked after every wakeup
+ while(activeThreads[2]>=max){
+ try {
+ activeThreads.wait();
+ } catch (InterruptedException e) {
+ // Interruption is only logged; the loop keeps waiting
+ e.printStackTrace();
+ }
+ }
+ activeThreads[1]-=x; //Remove from waiting
+ }else{
+ activeThreads[0]+=x; //Remove from active
+ }
+ activeThreads[2]+=x; //Change number running
+
+ assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 &&
+ activeThreads[2]>=0 && activeThreads[2]<=max) : Arrays.toString(activeThreads);
+
+ //Wake one waiter when nothing is running, or when there is spare capacity and a thread is waiting
+ if(activeThreads[2]==0 || (activeThreads[2]<max && activeThreads[1]>0)){activeThreads.notify();}
+ return activeThreads[2];
+ }
+ }
+
+ /**
+  * Returns the active entry of activeThreads ({active, waiting, running}), read under the array's lock.
+  * With assertions enabled, also checks that the three counters are consistent.
+  */
+ public static final int countActiveThreads(){
+     if(verbose){System.err.println("countActiveThreads()");}
+     synchronized(activeThreads){
+         final int active=activeThreads[0];
+         final int waiting=activeThreads[1];
+         final int running=activeThreads[2];
+         assert(active==(waiting+running) && active>=0 && waiting>=0 &&
+                 running>=0 && running<=maxWriteThreads) : Arrays.toString(activeThreads);
+         return active;
+     }
+ }
+
+ /**
+  * Blocks until the active count in activeThreads ({active, waiting, running}) reaches zero.
+  * Waits on the array's monitor in intervals of at most 8 seconds and, after each wakeup,
+  * passes a notification on under the same condition used by addRunningThread.
+  */
+ public static final void waitForWritingToFinish(){
+ if(verbose){System.err.println("waitForWritingToFinish()");}
+ synchronized(activeThreads){
+ while(activeThreads[0]>0){
+ assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 &&
+ activeThreads[2]>=0 && activeThreads[2]<=maxWriteThreads) : Arrays.toString(activeThreads);
+ try {
+ //Timed wait, so the loop condition is rechecked even without a notification
+ activeThreads.wait(8000);
+ } catch (InterruptedException e) {
+ // Interruption is only logged; waiting continues while threads remain active
+ e.printStackTrace();
+ }
+ //This thread may have consumed a notify() meant for a waiting thread, so pass one along
+ if(activeThreads[2]==0 || (activeThreads[2]<maxWriteThreads && activeThreads[1]>0)){activeThreads.notify();}
+ }
+ }
+ }
+
+
+ public static final boolean closeStream(ConcurrentReadStreamInterface cris){return closeStreams(cris, (ConcurrentReadOutputStream[])null);}
+ public static final boolean closeStream(ConcurrentReadOutputStream ross){return closeStreams(null, ross);}
+
+ /**
+  * Closes every output stream in mc.streamList.
+  * @param mc Stream collection, possibly null
+  * @return true if any stream reported an error; false if mc is null
+  */
+ public static final boolean closeStreams(MultiCros mc){
+     if(mc==null){return false;}
+     final ConcurrentReadOutputStream[] streams=mc.streamList.toArray(new ConcurrentReadOutputStream[0]);
+     return closeStreams(null, streams);
+ }
+
+ public static final boolean closeStreams(ConcurrentReadStreamInterface cris, ConcurrentReadOutputStream...ross){
+ if(verbose){
+ System.err.println("closeStreams("+cris+", "+(ross==null ? "null" : ross.length)+")");
+ new Exception().printStackTrace(System.err);
+ }
+ boolean errorState=false;
+ if(cris!=null){
+ if(verbose){System.err.println("Closing cris; error="+errorState);}
+ cris.close();
+ errorState|=cris.errorState();
+// Object[] prods=cris.producers();
+// for(Object o : prods){
+// if(o!=null && o.getClass()==ReadInputStream.class){
+// ReadInputStream ris=(ReadInputStream)o;
+// ris.
+// }
+// }
+ if(verbose){System.err.println("Closed cris; error="+errorState);}
+ }
+ if(ross!=null){
+ for(ConcurrentReadOutputStream ros : ross){
+ if(verbose){System.err.println("Closing ros "+ros+"; error="+errorState);}
+ if(ros!=null){
+ ros.close();
+ ros.join();
+ errorState|=(ros.errorState() || !ros.finishedSuccessfully());
+ }
+ if(verbose){System.err.println("Closed ros; error="+errorState);}
+ }
+ }
+ return errorState;
+ }
+
+ /**
+  * Waits for, and if necessary destroys, the subprocess registered under fname, then terminates
+  * the PipeThreads registered under the same name.
+  * Nothing is done (and false is returned) when fname is null, or when it is neither a compressed
+  * file name nor a ".bam" name and FORCE_KILL is off.
+  * @param fname Key under which the process was registered by addProcess
+  * @return true if the process exited with a nonzero status or could not be waited for
+  */
+ public static boolean killProcess(String fname){
+ if(verbose){
+ System.err.println("killProcess("+fname+")");
+// new Exception().printStackTrace(System.err);
+ System.err.println("processMap: "+processMap.keySet());
+ }
+ if(fname==null || (!isCompressed(fname) && !fname.endsWith(".bam") && !FORCE_KILL)){return false;}
+
+ boolean error=false;
+ //Note: waitFor() below runs while the processMap lock is held
+ synchronized(processMap){
+ Process p=processMap.remove(fname);
+ if(p!=null){
+ if(verbose){System.err.println("Found Process for "+fname);}
+ int x=-1, tries=0;
+ //Retry only when waitFor() is interrupted; a normal return leaves the loop immediately
+ for(; tries<20; tries++){
+ if(verbose){System.err.println("Trying p.waitFor()");}
+ try {
+ x=p.waitFor();
+ if(verbose){System.err.println("success; return="+x);}
+ break;
+ } catch (InterruptedException e) {
+ if(verbose){System.err.println("Failed.");}
+ // Interrupted while waiting; log and try again
+ e.printStackTrace();
+ }
+ }
+ //Error if every attempt was interrupted or the exit status was nonzero
+ error|=(tries>=20 || x!=0);
+ if(tries>=20){
+ if(verbose){System.err.println("Calling p.destroy because tries=="+tries+"; error="+error);}
+ p.destroy();
+ if(verbose){System.err.println("destroyed");}
+ }
+ }
+ }
+ synchronized(pipeThreadMap){
+ ArrayList<PipeThread> atp=pipeThreadMap.remove(fname);
+ if(atp!=null){
+ for(PipeThread p : atp){
+ if(p!=null){
+ if(verbose){System.err.println("Found PipeThread for "+fname);}
+ p.terminate();
+ if(verbose){System.err.println("Terminated PipeThread");}
+ }
+ }
+ }
+ }
+ if(verbose){System.err.println("killProcess("+fname+") returned "+error);}
+ return error;
+ }
+
+ private static void addProcess(String fname, Process p){
+ if(verbose){
+ System.err.println("addProcess("+fname+", "+p+")");
+ new Exception().printStackTrace();
+ }
+ synchronized(processMap){
+// System.err.println("Adding Process for "+fname);
+ Process old=processMap.put(fname, p);
+ if(old!=null){
+ old.destroy();
+ throw new RuntimeException("Duplicate process for file "+fname);
+ }
+ }
+ }
+
+ /**
+  * Appends pt to the list of PipeThreads registered under fname in pipeThreadMap,
+  * creating the list on first use.
+  */
+ private static void addPipeThread(String fname, PipeThread pt){
+     if(verbose){System.err.println("addPipeThread("+fname+", "+pt+")");}
+     synchronized(pipeThreadMap){
+         ArrayList<PipeThread> registered=pipeThreadMap.get(fname);
+         if(registered==null){
+             registered=new ArrayList<PipeThread>(2);
+             pipeThreadMap.put(fname, registered);
+         }
+         registered.add(pt);
+     }
+ }
+
+ /** {active, waiting, running} <br>
+ * Active means running or waiting.
+ * Also serves as the monitor for all updates to these counters.
+ */
+ public static int[] activeThreads={0, 0, 0};
+ /** Upper bound on the running count (addRunningThread uses 1 instead when Shared.LOW_MEMORY is set) */
+ public static int maxWriteThreads=Shared.threads();
+
+ /** Print diagnostic messages to stderr */
+ public static boolean verbose=false;
+
+ public static boolean RAWMODE=false; //Does not automatically compress and decompress when true
+
+ //For killing subprocesses that are neither compression nor samtools
+ public static boolean FORCE_KILL=false;
+
+ //Switches for external (de)compression programs; each input path also checks availability via Data
+ public static boolean USE_GZIP=false;
+ public static boolean USE_PIGZ=false;
+ public static boolean USE_GUNZIP=false; //gzip subprocess for reading .gz
+ public static boolean USE_UNPIGZ=false; //pigz subprocess for reading .gz; tried before gzip
+ public static boolean USE_BZIP2=true; //bzip2 subprocess for reading .bz2
+ public static boolean USE_PBZIP2=true; //pbzip2 subprocess for reading .bz2; tried before bzip2
+ public static boolean USE_DSRC=true; //dsrc subprocess for reading .dsrc
+
+ //Whether .bz2 / .xz are treated as compression suffixes by basename and findFileExtension
+ public static boolean PROCESS_BZ2=true;
+ public static final boolean PROCESS_XZ=false;
+
+ /** Buffer size, in bytes, used for buffered, gzip and copy input */
+ public static final int INBUF=16384;
+ public static final int OUTBUF=16384;
+
+ public static int ZIPLEVEL=4;
+ /** Cap on the thread count passed to dsrc */
+ public static int MAX_ZIP_THREADS=8;
+ /** Shared.threads()+1 is divided by this to choose the dsrc thread count */
+ public static int ZIP_THREAD_DIVISOR=2;
+ public static boolean ALLOW_ZIPLEVEL_CHANGE=true;
+
+ /** Platform file separator, used by parseRoot and parseRoot2 */
+ public static final String FILESEP=System.getProperty("file.separator");
+
+ //NOTE(review): new String(...) presumably gives a unique, non-interned lock object - confirm against its users
+ private static final String diskSync=new String("DISKSYNC");
+
+ public static final HashSet<String> loadedFiles=new HashSet<String>();
+
+ /** Suffixes recognized as compression by compressionType and rawName */
+ private static final String[] compressedExtensions=new String[] {".gz", ".gzip", ".zip", ".bz2", ".xz", ".dsrc"};
+
+// private static HashMap<String, Process> inputProcesses=new HashMap<String, Process>(8);
+// private static HashMap<String, Process> outputProcesses=new HashMap<String, Process>(8);
+ /** Subprocesses by registry key (file name, or the command when no file was given); guarded by its own monitor */
+ private static HashMap<String, Process> processMap=new HashMap<String, Process>(8);
+ /** PipeThreads by the same registry key; guarded by its own monitor */
+ private static HashMap<String, ArrayList<PipeThread>> pipeThreadMap=new HashMap<String, ArrayList<PipeThread>>(8);
+
+}
diff --git a/current/fileIO/RenameFiles.java b/current/fileIO/RenameFiles.java
new file mode 100755
index 0000000..f9f5cc0
--- /dev/null
+++ b/current/fileIO/RenameFiles.java
@@ -0,0 +1,158 @@
+package fileIO;
+
+import java.io.File;
+
+import dna.Data;
+
+
+public class RenameFiles {
+
+
+ public static void main(String[] args){
+ for(String s : args){
+ renameFiles(s);
+ }
+ }
+
+
+ public static void renameFiles(String path){
+ File f=new File(path);
+ renameFiles(f);
+ }
+
+ public static void renameFiles(File path){
+
+ if(path.isDirectory()){
+ File[] array=path.listFiles();
+ for(File f : array){renameFiles(f);}
+ }else{
+ rename(path);
+ }
+
+ }
+
+ public static void rename(File in){
+ assert(in.exists()) : in.toString();
+ assert(in.isFile()) : in.toString();
+ String abs=in.getAbsolutePath();
+
+
+ int dot=abs.lastIndexOf('.');
+ int slash=abs.lastIndexOf('/');
+
+// String[] split=Person.parsePath(abs.substring(0, slash));
+// String name=split[0];
+// String out=abs.substring(0, dot)+"_"+name+".txt";
+
+
+
+ String fname=abs.substring(slash+1);
+
+// System.out.println(fname);
+
+
+// if(fname.startsWith("chr") && fname.endsWith(".txt")){
+//
+// String out=abs.replace(".txt", ".flow");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+
+ int build=36;
+ if(abs.contains("FL5-") || abs.contains("630-") || abs.contains("618-")){
+ build=37;
+ }
+
+// if(fname.startsWith("var") && fname.endsWith(".vla") && !fname.contains("build")){
+//
+// String out=abs.replace(".vla", "-build"+build+".vla");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+//
+// if(fname.startsWith("gene") && fname.endsWith(".gvla") && !fname.contains("build")){
+//
+// String out=abs.replace(".gvla", "-build"+build+".gvla");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+//
+// if(fname.endsWith(".tsv.zip") && !fname.contains("build")){
+//
+// String out=abs.replace(".tsv.zip", "-build"+build+".tsv.zip");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+//
+// if(fname.endsWith(".tsv.gz") && !fname.contains("build")){
+//
+// String out=abs.replace(".tsv.gz", "-build"+build+".tsv.gz");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+//
+// if(fname.endsWith(".tsv") && !fname.contains("build")){
+//
+// String out=abs.replace(".tsv", "-build"+build+".tsv");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+//
+// if(fname.endsWith(".ca") && !fname.contains("build")){
+//
+// String out=abs.replace(".ca", "-build"+build+".ca");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+//
+// if(fname.endsWith(".ca.zip") && !fname.contains("build")){
+//
+// String out=abs.replace(".ca.zip", "-build"+build+".ca.zip");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+//
+// if(fname.contains("-ASM-") && fname.contains("build36")){
+//
+// String out=abs.replace("-build36", "");
+// assert(!out.equals(abs)) : out+", "+abs;
+//
+// System.out.println("Renaming "+abs+" to "+out);
+// in.renameTo(new File(out));
+// }
+
+ if(fname.contains("READMEtxt")){
+ String out=abs.replace("READMEtxt", "README.txt");
+ assert(!out.equals(abs)) : out+", "+abs;
+
+ System.out.println("Renaming "+abs+" to "+out);
+ in.renameTo(new File(out));
+ }
+
+ if(fname.contains("-1.8.0.")){
+
+ String out=abs.replace("-1.8.0.", ".");
+ assert(!out.equals(abs)) : out+", "+abs;
+
+ System.out.println("Renaming "+abs+" to "+out);
+ in.renameTo(new File(out));
+ }
+ }
+
+}
diff --git a/current/fileIO/SummaryFile.java b/current/fileIO/SummaryFile.java
new file mode 100755
index 0000000..d67ba46
--- /dev/null
+++ b/current/fileIO/SummaryFile.java
@@ -0,0 +1,172 @@
+package fileIO;
+
+import java.io.File;
+
+import align2.Tools;
+
+import dna.Data;
+import dna.Parser;
+
+/**
+ * Tests to see if a summary file matches a reference fasta file, based on date, size, and name
+ * @author Brian Bushnell
+ * @date Mar 11, 2013
+ *
+ */
+public class SummaryFile {
+
+ public static void main(String[] args){
+ if(args.length==0){
+ System.out.println("Usage: SummaryFile <summary file> <reference fasta>");
+ System.exit(0);
+ }
+
+ String summary=null, ref=null;
+
+ for(int i=0; i<args.length; i++){
+
+ if(args[i].contains("=")){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("summary")){
+ summary=b;
+ }else if(a.equals("ref") || a.equals("reference")){
+ ref=b;
+ }else{
+ throw new RuntimeException("Unknown parameter: "+args[i]);
+ }
+
+ }else{
+ if(args[i].endsWith("summary.txt")){
+ summary=args[i];
+ }else{
+ ref=args[i];
+ }
+ }
+ }
+
+ if(summary==null && args.length>0){
+ summary=args[0];
+ }
+
+ if(summary==null){
+ System.out.println("Usage: SummaryFile <summary file> <reference fasta>");
+ System.exit(0);
+ }
+
+ if(ref==null){
+
+ }
+ }
+
+ public boolean compare(final String refName){
+ try {
+ File ref=new File(refName);
+ if(!ref.exists()){
+ if(refName.startsWith("stdin")){return false;}
+ else{
+ assert(false) : "No such file: "+refName;
+ }
+ }
+// if(!refName.equals(source) && !Files.isSameFile(ref.toPath(), new File(source).toPath())){ //This is Java-7 specific.
+//// assert(false) : refName+", "+source+": "+(Files.isSameFile(ref.toPath(), new File(source).toPath()))+
+//// "\n"+ref.getCanonicalPath()+", "+new File(source).getCanonicalPath()+": "+(ref.getCanonicalPath().equals(new File(source).getCanonicalPath()));
+// return false;
+//
+// }
+ if(!refName.equals(source) && !ref.getCanonicalPath().equals(new File(source).getCanonicalPath())){
+// assert(false) : refName+", "+source+": "+(Files.isSameFile(ref.toPath(), new File(source).toPath()))+
+// "\n"+ref.getCanonicalPath()+", "+new File(source).getCanonicalPath()+": "+(ref.getCanonicalPath().equals(new File(source).getCanonicalPath()));
+ return false;
+
+ }
+ if(bytes!=ref.length()){
+// assert(false) : bytes+", "+ref.length();
+ return false;
+ }
+ if(modified!=ref.lastModified()){
+// assert(false) : modified+", "+ref.lastModified();
+ return false;
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ return false;
+ }
+ return true;
+ }
+
+ public static boolean compare(final String summaryName, final String refName){
+ assert(refName!=null) : "Null reference file name.";
+ if(!new File(summaryName).exists()){
+// assert(false);
+ return false;
+ }
+ SummaryFile sf=new SummaryFile(summaryName);
+ return sf.compare(refName);
+ }
+
+ public static String getName(){
+ return getName(Data.GENOME_BUILD);
+ }
+
+ public static String getName(int build){
+ return Data.ROOT_GENOME+build+"/summary.txt";
+ }
+
+ public SummaryFile(String path){
+ summaryFname=path;
+ String s;
+ TextFile tf=new TextFile(summaryFname, false, false);
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ if(s.charAt(0)=='#'){
+ if(s.startsWith("#Version")){
+ String[] split=s.split("\t");
+ version=(split.length>1 ? Integer.parseInt(split[1]) : 0);
+ }
+ }else{
+ String[] split=s.split("\t");
+ String a=split[0];
+ String b=split[1];
+ if(a.equalsIgnoreCase("chroms")){chroms=(int)Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("bases")){bases=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("version")){version=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("defined")){definedBases=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("contigs")){contigs=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("scaffolds")){scaffolds=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("interpad")){interpad=Integer.parseInt(b);}
+ else if(a.equalsIgnoreCase("undefined")){undefinedBases=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("name")){name=b;}
+ else if(a.equalsIgnoreCase("source")){source=b;}
+ else if(a.equalsIgnoreCase("bytes")){bytes=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("last modified")){modified=Long.parseLong(b);}
+ else if(a.equalsIgnoreCase("scafprefixes")){scafprefixes=Tools.parseBoolean(b);}
+ else{throw new RuntimeException("In file "+tf.name+": Unknown term "+s);}
+ }
+ }
+ tf.close();
+ }
+
+ public final String summaryFname;
+
+ public int chroms;
+ public long contigs;
+ public long scaffolds;
+ public int interpad;
+ public long bases;
+ public long definedBases;
+ public long undefinedBases;
+ public String name;
+ public String source;
+ public int version;
+ public long bytes;
+ public long modified;
+ public boolean scafprefixes;
+
+}
diff --git a/current/fileIO/TextFile.java b/current/fileIO/TextFile.java
new file mode 100755
index 0000000..93ec5b7
--- /dev/null
+++ b/current/fileIO/TextFile.java
@@ -0,0 +1,266 @@
+package fileIO;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+
+import dna.Data;
+
+
+public class TextFile {
+
+
+	/**
+	 * Prints a range of lines from a text file to stdout, then reports the
+	 * number of lines and characters printed on stderr.
+	 * @param args [0] file name (default "stdin"); [1] number of lines to skip
+	 * (default 0); [2] exclusive upper line index (default first+100)
+	 */
+	public static void main(String[] args){
+		final String fname=(args.length>0 ? args[0] : "stdin");
+		TextFile tf=new TextFile(fname, false, false);
+		int first=0, last=100;
+		if(args.length>1){
+			first=Integer.parseInt(args[1]);
+			last=first+100;
+		}
+		if(args.length>2){
+			last=Integer.parseInt(args[2]);
+		}
+		
+		//Discard the lines before the requested range
+		int skipped=0;
+		while(skipped<first){
+			tf.readLine();
+			skipped++;
+		}
+		
+		//Echo lines until the range or the file is exhausted
+		long lines=0, bytes=0;
+		String s;
+		int i=first;
+		while(i<last && (s=tf.readLine())!=null){
+			lines++;
+			bytes+=s.length();
+			System.out.println(s);
+			i++;
+		}
+		
+		System.err.println("\n");
+		System.err.println("Lines: "+lines);
+		System.err.println("Bytes: "+bytes);
+		tf.close();
+		tf.reset();
+		tf.close();
+	}
+
+ /** Opens the named file with allowSubprocess and tryAllExtensions both false. */
+ public TextFile(String name){this(name, false, false);}
+
+ /** Opens the file described by ff without searching for alternative file extensions. */
+ public TextFile(FileFormat ff){this(ff, false);}
+
+ public TextFile(FileFormat ff, boolean tryAllExtensions){
+ file=new File(ff.name());
+ allowSubprocess=ff.allowSubprocess();
+
+ if(tryAllExtensions && !ff.name().startsWith("jar:") && !file.exists()){
+ name=ReadWrite.findFileExtension(ff.name());
+ file=new File(name);
+ }else{
+ name=ff.name();
+ }
+
+ br=open();
+ }
+
+ public TextFile(String fname, boolean allowSubprocess_, boolean tryAllExtensions){
+ fname=fname.replace('\\', '/');
+ file=new File(fname);
+ allowSubprocess=allowSubprocess_;
+
+ if(tryAllExtensions && !fname.startsWith("jar:") && !file.exists()){
+ name=ReadWrite.findFileExtension(fname);
+ file=new File(name);
+ }else{
+ name=fname;
+ }
+
+
+// assert(file.exists()) : "Can't find "+fname;
+
+// if(!file.exists()){
+// throw new RuntimeException("Can't find "+fname);
+// }
+
+ br=open();
+ }
+
+	/**
+	 * Reads all non-blank lines of a file into an array.
+	 * The file is closed even if reading throws.
+	 * @param fname File to read
+	 * @return The file's non-blank lines, in order
+	 */
+	public static final String[] toStringLines(String fname){
+		final TextFile tf=new TextFile(fname);
+		try{
+			return tf.toStringLines();
+		}finally{
+			tf.close();
+		}
+	}
+
+	/**
+	 * Reads every remaining line returned by nextLine() into an array.
+	 * The file is left open afterwards.
+	 * @return Remaining lines, in order
+	 */
+	public final String[] toStringLines(){
+		final ArrayList<String> collected=new ArrayList<String>(4096);
+		
+		String line=nextLine();
+		while(line!=null){
+			collected.add(line);
+			line=nextLine();
+		}
+		
+		final String[] array=new String[collected.size()];
+		return collected.toArray(array);
+	}
+
+	/**
+	 * Counts the lines remaining from the current position, as returned by
+	 * nextLine(), then calls reset().
+	 * @return Number of lines read
+	 */
+	public final long countLines(){
+		long count=0;
+		while(nextLine()!=null){
+			count++;
+		}
+		reset();
+		return count;
+	}
+
+	/**
+	 * Splits each line on tab characters, keeping trailing empty fields.
+	 * @param lines Lines to split
+	 * @param trim If true, each line is trimmed before splitting
+	 * @return One array of fields per input line
+	 */
+	public static String[][] doublesplitTab(String[] lines, boolean trim){
+		final String[][] fields=new String[lines.length][];
+		int row=0;
+		for(final String line : lines){
+			final String text=(trim ? line.trim() : line);
+			fields[row]=text.split("\t", -1);
+			row++;
+		}
+		return fields;
+	}
+
+
+	/**
+	 * Splits each line on runs of whitespace (regex "\\p{javaWhitespace}+").
+	 * @param lines Lines to split
+	 * @param trim If true, each line is trimmed before splitting
+	 * @return One array of fields per input line
+	 */
+	public static String[][] doublesplitWhitespace(String[] lines, boolean trim){
+		final String[][] fields=new String[lines.length][];
+		int row=0;
+		for(final String line : lines){
+			final String text=(trim ? line.trim() : line);
+			fields[row]=text.split("\\p{javaWhitespace}+");
+			row++;
+		}
+		return fields;
+	}
+
+ /** Closes the current reader, if open, then calls open() again to obtain a fresh reader for the same name. */
+ public final void reset(){
+ close();
+ br=open();
+ }
+
+	/**
+	 * @return true if the name is "stdin", starts with "stdin." or "jar:",
+	 * or if the underlying File exists
+	 */
+	public boolean exists(){
+		if(name.equals("stdin") || name.startsWith("stdin.")){return true;}
+		//TODO Ugly and unsafe hack for files in jars
+		if(name.startsWith("jar:")){return true;}
+		return file.exists();
+	}
+
+	/**
+	 * Closes the reader and its underlying streams via ReadWrite.finishReading,
+	 * and resets lineNum to -1.  Calling this on an already-closed file does
+	 * nothing.
+	 * @return The accumulated errorState: true if an error was recorded
+	 * (previously this method always returned false, hiding such errors)
+	 */
+	public final boolean close(){
+		if(!open){return errorState;}
+		open=false;
+		assert(br!=null);
+		
+		errorState|=ReadWrite.finishReading(is, name, allowSubprocess, br, isr);
+		
+		br=null;
+		is=null;
+		isr=null;
+		lineNum=-1;
+		return errorState;
+	}
+
+ /** Returns the next non-blank line, or null if none remain or the file is closed; delegates to readLine(true). */
+ public String nextLine(){
+ return readLine(true);
+ }
+
+ /** Same as readLine(true): returns the next non-blank line, or null if none remain or the file is closed. */
+ public final String readLine(){
+ return readLine(true);
+ }
+
+	/**
+	 * Reads the next line from the file.
+	 * Blank lines are skipped with a loop rather than recursion, so a long run
+	 * of consecutive blank lines cannot overflow the stack.
+	 * @param skipBlank If true, lines that are empty or contain only whitespace are skipped
+	 * @return The next line, or null at end of file or if the file is closed
+	 * @throws RuntimeException wrapping any exception thrown by the underlying reader
+	 */
+	public final String readLine(boolean skipBlank){
+		if(!open || br==null){
+			if(Data.WINDOWS){System.err.println("Attempting to read from a closed file: "+name);}
+			return null;
+		}
+		
+		while(true){
+			final String currentLine;
+			try{
+				lineNum++;
+				currentLine=br.readLine();
+			}catch(Exception e){
+				System.err.println("Oops!  Bad read in file "+name+" at line "+lineNum);
+				System.err.println(""+open+", "+(br==null));
+				try {
+					File f=new File(name);
+					System.err.println("path and length: \t"+f.getAbsolutePath()+"\t"+f.length());
+				} catch (Exception e1) {
+					//Diagnostics only; ignore failures here
+				}
+				throw new RuntimeException(e);
+			}
+			if(currentLine==null){return null;}
+			if(!skipBlank){return currentLine;}
+			
+			//Test the first and last characters before calling trim(),
+			//to avoid allocating a new String for ordinary lines.
+			final int len=currentLine.length();
+			final boolean blank=(len==0 ||
+					(Character.isWhitespace(currentLine.charAt(0)) &&
+					Character.isWhitespace(currentLine.charAt(len-1)) &&
+					currentLine.trim().length()==0));
+			if(!blank){return currentLine;}
+			//Blank line: loop and read the next one
+		}
+	}
+
+	/**
+	 * Marks the file open and creates the stream, stream reader and buffered reader.
+	 * @return A BufferedReader with a 32768-character buffer over the new stream
+	 * @throws RuntimeException if the file is already open
+	 */
+	private final BufferedReader open(){
+		if(open){
+			throw new RuntimeException("Attempt to open already-opened TextFile "+name);
+		}
+		open=true;
+		
+		final int bufferChars=32768;
+		is=ReadWrite.getInputStream(name, true, allowSubprocess);
+		isr=new InputStreamReader(is);
+		return new BufferedReader(isr, bufferChars);
+	}
+
+ /** @return The open flag: set by open(), cleared by close() */
+ public boolean isOpen(){return open;}
+
+ private boolean open=false;
+ public boolean errorState=false;
+
+ public final String name;
+ public File file;
+ private final boolean allowSubprocess;
+
+ public InputStream is;
+ public InputStreamReader isr;
+ public BufferedReader br;
+
+ public long lineNum=-1;
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/fileIO/TextStreamWriter.java b/current/fileIO/TextStreamWriter.java
new file mode 100755
index 0000000..27cd780
--- /dev/null
+++ b/current/fileIO/TextStreamWriter.java
@@ -0,0 +1,315 @@
+package fileIO;
+
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import align2.Shared;
+
+import stream.Read;
+
+import dna.Data;
+
+
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 23, 2010
+ *
+ */
+public class TextStreamWriter extends Thread {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Creates a writer for the named file, passing 0 as the format argument of the five-argument constructor. */
+ public TextStreamWriter(String fname_, boolean overwrite_, boolean append_, boolean allowSubprocess_){
+ this(fname_, overwrite_, append_, allowSubprocess_, 0);
+ }
+
+ /** Builds a FileFormat with FileFormat.testOutput (default type FileFormat.TEXT) and delegates to the FileFormat constructor. */
+ public TextStreamWriter(String fname_, boolean overwrite_, boolean append_, boolean allowSubprocess_, int format){
+ this(FileFormat.testOutput(fname_, FileFormat.TEXT, format, 0, allowSubprocess_, overwrite_, append_, true));
+ }
+
+ /**
+ * Creates a writer for the file described by ff: records the format flags,
+ * opens the output stream, wraps it in a PrintWriter, and allocates the job
+ * queue and the first buffer.  The thread is not started here.
+ * @param ff Output file description
+ * @throws RuntimeException if append is set and the file is neither raw nor gzip
+ */
+ public TextStreamWriter(FileFormat ff){
+ //Note that FASTQ is also set for plain-text formats.
+ FASTQ=ff.fastq() || ff.text();
+ FASTA=ff.fasta();
+ BREAD=ff.bread();
+ SAM=ff.samOrBam();
+ BAM=ff.bam();
+ SITES=ff.sites();
+ INFO=ff.attachment();
+ OTHER=(!FASTQ && !FASTA && !BREAD && !SAM && !BAM && !SITES && !INFO);
+
+
+ fname=ff.name();
+ overwrite=ff.overwrite();
+ append=ff.append();
+ allowSubprocess=ff.allowSubprocess();
+ assert(!(overwrite&append));
+ assert(ff.canWrite()) : "File "+fname+" exists and overwrite=="+overwrite;
+ if(append && !(ff.raw() || ff.gzip())){throw new RuntimeException("Can't append to compressed files.");}
+
+ //BAM output is piped through samtools only when Data.SAMTOOLS() and Data.SH() are both true;
+ //otherwise an ordinary output stream is used.
+ if(!BAM || !Data.SAMTOOLS() || !Data.SH()){
+ myOutstream=ReadWrite.getOutputStream(fname, append, true, allowSubprocess);
+ if(verbose){System.err.println("Created output stream for "+fname+", "+append+", "+true+", "+allowSubprocess);}
+ }else{
+ myOutstream=ReadWrite.getOutputStreamFromProcess(fname, "samtools view -S -b -h - ", true, append, true, true);
+ }
+ myWriter=new PrintWriter(myOutstream);
+ if(verbose){System.err.println("Created PrintWriter for "+myOutstream);}
+
+ //At most 5 jobs may be pending; queue.put() in addJob() blocks beyond that.
+ queue=new ArrayBlockingQueue<ArrayList<CharSequence>>(5);
+ buffer=new ArrayList<CharSequence>(buffersize);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Primary Method ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+	/**
+	 * Writer loop.  Signals that the thread has started, then takes jobs from
+	 * the queue and prints every CharSequence in each job until the POISON2
+	 * job is received.  It then records any write error latched by the
+	 * PrintWriter in errorState, finishes writing, and notifies waiters.
+	 */
+	@Override
+	public void run() {
+		if(verbose){System.err.println("running");}
+		assert(open) : fname;
+		
+		//Wake up start(), which waits for this flag
+		synchronized(this){
+			started=true;
+			this.notify();
+		}
+		
+		ArrayList<CharSequence> job=null;
+		
+		if(verbose){System.err.println("waiting for jobs");}
+		while(job==null){
+			try {
+				job=queue.take();
+				if(verbose){System.err.println("grabbed first job of size "+job.size());}
+			} catch (InterruptedException e) {
+				//Retry; the loop continues until a job is obtained
+				e.printStackTrace();
+			}
+		}
+		
+		if(verbose){System.err.println("processing jobs");}
+		while(job!=null && job!=POISON2){
+			if(!job.isEmpty()){
+				for(final CharSequence cs : job){
+					assert(cs!=POISON);
+					myWriter.print(cs);
+				}
+			}
+			
+			job=null;
+			while(job==null){
+				try {
+					job=queue.take();
+				} catch (InterruptedException e) {
+					//Retry; the loop continues until a job is obtained
+					e.printStackTrace();
+				}
+			}
+		}
+		if(verbose){System.err.println("null/poison job");}
+		open=false;
+		
+		//PrintWriter never throws IOException; it only latches an error flag.
+		//checkError() flushes and reports that flag, so failed writes become
+		//visible through errorState and poisonAndWait().
+		errorState|=myWriter.checkError();
+		
+		if(verbose){System.err.println("call finish writing");}
+		ReadWrite.finishWriting(myWriter, myOutstream, fname, allowSubprocess);
+		if(verbose){System.err.println("finished writing");}
+		synchronized(this){notifyAll();}
+		if(verbose){System.err.println("done");}
+	}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Control and Helpers ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Starts the thread, then blocks until run() has set the started flag,
+ * re-checking every 20 ms.
+ */
+ @Override
+ public void start(){
+ super.start();
+ if(verbose){System.err.println(this.getState());}
+ synchronized(this){
+ while(!started){
+ try {
+ this.wait(20);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Requests shutdown of the writer.  Waits until the thread has started,
+ * then, if the writer is still open, enqueues the current buffer followed
+ * by the POISON2 sentinel that makes run() exit.  The buffer field is set
+ * to null afterwards.  Does nothing beyond the wait if already closed.
+ */
+ public synchronized void poison(){
+ //Don't allow thread to shut down before it has started
+ while(!started || this.getState()==Thread.State.NEW){
+ if(verbose){System.err.println("waiting for start.");}
+ try {
+ this.wait(20);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ if(verbose){System.err.println("testing if open.");}
+ if(!open){return;}
+// if(verbose){System.err.println("adding buffer: "+buffer.size());}
+ //Flush whatever has been buffered before sending the sentinel
+ addJob(buffer);
+ buffer=null;
+// System.err.println("Poisoned!");
+// assert(false);
+
+// assert(false) : open+", "+this.getState()+", "+started;
+ open=false;
+ addJob(POISON2);
+ }
+
+	/** Blocks until this thread reaches the TERMINATED state, joining in 1000 ms slices. */
+	public void waitForFinish(){
+		if(verbose){System.err.println("waiting for finish.");}
+		for(Thread.State state=this.getState(); state!=Thread.State.TERMINATED; state=this.getState()){
+			if(verbose){System.err.println("attempting join.");}
+			try {
+				this.join(1000);
+			} catch (InterruptedException e) {
+				e.printStackTrace();
+			}
+		}
+	}
+
+ /**
+ * Calls poison() and then waitForFinish(), so it returns only after the
+ * writer thread has terminated.
+ * @return The errorState field (true if there was an error, false otherwise)
+ */
+ public boolean poisonAndWait(){
+ poison();
+ waitForFinish();
+ assert(buffer==null || buffer.isEmpty());
+ return errorState;
+ }
+
+ /**
+ * Puts a job on the queue for the writer thread, blocking while the queue
+ * is full and retrying if interrupted.
+ * @param j List of CharSequences to write; POISON2 signals shutdown
+ */
+ //TODO Why is this synchronized?
+ public synchronized void addJob(ArrayList<CharSequence> j){
+ if(verbose){System.err.println("Got job "+(j==null ? "null" : j.size()));}
+
+ assert(started) : "Wait for start() to return before using the writer.";
+// while(!started || this.getState()==Thread.State.NEW){
+// try {
+// this.wait(20);
+// } catch (InterruptedException e) {
+// // TODO Auto-generated catch block
+// e.printStackTrace();
+// }
+// }
+
+ boolean success=false;
+ while(!success){
+ try {
+ queue.put(j);
+ success=true;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ assert(!queue.contains(j)); //Hopefully it was not added.
+ }
+ }
+ if(verbose){System.err.println("Put job in queue: "+success);}
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Print ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+	/**
+	 * Appends text to the current buffer.  When the buffer reaches buffersize
+	 * entries or maxBufferLen characters, it is handed to the writer thread
+	 * and replaced by a fresh buffer.
+	 * @param cs Text to write
+	 */
+	public void print(CharSequence cs){
+		assert(open) : cs;
+		buffer.add(cs);
+		bufferLen+=cs.length();
+		
+		final boolean full=(buffer.size()>=buffersize || bufferLen>=maxBufferLen);
+		if(!full){return;}
+		
+		addJob(buffer);
+		buffer=new ArrayList<CharSequence>(buffersize);
+		bufferLen=0;
+	}
+
+	/**
+	 * Formats a read according to this writer's format flags and prints it.
+	 * The flags are tested in the order FASTQ, FASTA, SAM, SITES, INFO, with
+	 * toText(true) as the fallback.
+	 * @param r Read to write
+	 */
+	public void print(Read r){
+		assert(!OTHER);
+		final StringBuilder sb;
+		if(FASTQ){
+			sb=r.toFastq();
+		}else if(FASTA){
+			sb=r.toFasta(FASTA_WRAP);
+		}else if(SAM){
+			sb=r.toSam();
+		}else if(SITES){
+			sb=r.toSites();
+		}else if(INFO){
+			sb=r.toInfo();
+		}else{
+			sb=r.toText(true);
+		}
+		print(sb);
+	}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Println ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Prints a single newline character. */
+ public void println(){
+ print("\n");
+ }
+
+ /** Prints cs followed by a newline; the two are added to the buffer as separate entries. */
+ public void println(CharSequence cs){
+ print(cs);
+ print("\n");
+ }
+
+	/**
+	 * Formats a read according to this writer's format flags, appends a
+	 * newline, and prints it.  The flags are tested in the order FASTQ, FASTA,
+	 * SAM, SITES, INFO, with toText(true) as the fallback.
+	 * @param r Read to write
+	 */
+	public void println(Read r){
+		assert(!OTHER);
+		final StringBuilder formatted;
+		if(FASTQ){
+			formatted=r.toFastq();
+		}else if(FASTA){
+			formatted=r.toFasta(FASTA_WRAP);
+		}else if(SAM){
+			formatted=r.toSam();
+		}else if(SITES){
+			formatted=r.toSites();
+		}else if(INFO){
+			formatted=r.toInfo();
+		}else{
+			formatted=r.toText(true);
+		}
+		final StringBuilder sb=formatted.append('\n');
+		print(sb);
+	}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private ArrayList<CharSequence> buffer;
+
+ public int buffersize=100;
+ public int maxBufferLen=60000;
+ private int bufferLen=0;
+ public final boolean overwrite;
+ public final boolean append;
+ public final boolean allowSubprocess;
+ public final String fname;
+ private final OutputStream myOutstream;
+ private final PrintWriter myWriter;
+ private final ArrayBlockingQueue<ArrayList<CharSequence>> queue;
+ private boolean open=true;
+ private volatile boolean started=false;
+
+ /** TODO */
+ public boolean errorState=false;
+
+ /*--------------------------------------------------------------*/
+
+ private final boolean BAM;
+ private final boolean SAM;
+ private final boolean FASTQ;
+ private final boolean FASTA;
+ private final boolean BREAD;
+ private final boolean SITES;
+ private final boolean INFO;
+ private final boolean OTHER;
+
+ private final int FASTA_WRAP=Shared.FASTA_WRAP;
+
+ /*--------------------------------------------------------------*/
+
+ private static final String POISON=new String("POISON_TextStreamWriter");
+ private static final ArrayList<CharSequence> POISON2=new ArrayList<CharSequence>(1);
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/A_Sample.java b/current/jgi/A_Sample.java
new file mode 100755
index 0000000..78eb77e
--- /dev/null
+++ b/current/jgi/A_Sample.java
@@ -0,0 +1,422 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * This class does nothing.
+ * It is designed to be easily modified into a program
+ * that processes reads in a single thread.
+ *
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class A_Sample {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Command-line entry point: creates a Timer, builds an A_Sample from the
+	 * arguments, and runs it.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		final Timer timer=new Timer();
+		final A_Sample sample=new A_Sample(args);
+		sample.process(timer);
+	}
+
+ /**
+ * Constructor.  Parses the arguments, copies the parsed settings into
+ * fields, expands '#' in file names into 1/2 pairs, adjusts the static
+ * FASTQ interleaving flags, validates the input and output files, and
+ * creates the FileFormat objects.  Note that several static settings in
+ * Shared, ReadWrite, ReadStats, FASTQ and ByteFile are modified.
+ * @param args Command line arguments
+ * @throws RuntimeException if no input file is given, out2 is given without
+ * out1, or a file check fails
+ */
+ public A_Sample(String[] args){
+
+ //Process any config files
+ args=Parser.parseConfig(args);
+
+ //Detect whether the user needs help
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Print the program name and arguments
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether interleaved was explicitly set.
+
+ //Set some shared static variables regarding PIGZ
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ //Create a parser object
+ Parser parser=new Parser();
+
+ //Parse each argument
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+
+ //Break arguments into their constituent parts, in the form of "a=b"
+ //Only the text between the first and second '=' is used as the value.
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+
+ if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else{
+ //With assertions disabled, an unknown parameter is only reported, not fatal.
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ //Do input file # replacement
+ //Skipped when a file whose name literally contains '#' exists.
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+
+ //Do output file # replacement
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+
+ //Adjust interleaved detection based on the number of input files
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Ensure there is an input file
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ //Adjust the number of threads for input file reading
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //Ensure out2 is not set without out1
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+
+ //Adjust interleaved settings based on number of output files
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //Ensure output files can be written
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Ensure input files can be read
+ if(!Tools.testInputFiles(false, true, in1, in2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+
+ //Ensure that no file was specified multiple times
+ if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ //Create output FileFormat objects
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+
+ //Create input FileFormat objects
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Create read streams and process all data, then close the streams and
+ * print timing statistics to outstream.
+ * @param t Timer that is stopped here and used for the rate calculations
+ * @throws RuntimeException if errorState is set after the streams are closed
+ */
+ void process(Timer t){
+
+ //Create a read input stream
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ cris.start(); //Start the stream
+ if(verbose){outstream.println("Started cris");}
+ }
+ boolean paired=cris.paired();
+ if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ //Optionally create a read output stream
+ final ConcurrentReadOutputStream ros;
+ if(ffout1!=null){
+ final int buff=4;
+
+ if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){
+ outstream.println("Writing interleaved.");
+ }
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+ ros.start(); //Start the stream
+ }else{ros=null;}
+
+ //Reset counters
+ readsProcessed=0;
+ basesProcessed=0;
+
+ //Process the read stream
+ processInner(cris, ros);
+
+ if(verbose){outstream.println("Finished; closing streams.");}
+
+ //Write anything that was accumulated by ReadStats
+ errorState|=ReadStats.writeAll();
+ //Close the read streams
+ errorState|=ReadWrite.closeStreams(cris, ros);
+
+ //Report timing and results
+ {
+ t.stop();
+
+ //Calculate units per nanosecond
+ //NOTE(review): assumes t.elapsed is in nanoseconds - confirm against Timer
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ //Add "k" and "m" for large numbers
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ //Left-pad the strings to 8 characters so that they are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ }
+
+ //Throw an exception if there was an error in a thread
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /**
+ * Iterate through the reads.  Each list fetched from cris is passed read
+ * by read to processReadPair; discarded reads are replaced by null in the
+ * list, which is then sent to ros (if present) and returned to cris.
+ * Updates readsProcessed and basesProcessed.
+ * @param cris Input stream; must already be started
+ * @param ros Output stream; may be null
+ */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+
+ //Do anything necessary prior to processing
+
+ {
+ //Grab the first ListNum of reads
+ ListNum<Read> ln=cris.nextList();
+ //Grab the actual read list from the ListNum
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //Check to ensure pairing is as expected
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ //As long as there is a nonempty read list...
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ //Loop through each read in the list
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ final Read r2=r1.mate;
+
+ //Track the initial length for statistics
+ final int initialLength1=r1.length();
+ final int initialLength2=(r1.mateLength());
+
+ //Increment counters
+ readsProcessed+=1+r1.mateCount();
+ basesProcessed+=initialLength1+initialLength2;
+
+ //A discarded pair is marked by nulling its slot rather than removing it
+ boolean keep=processReadPair(r1, r2);
+ if(!keep){reads.set(idx, null);}
+ }
+
+ //Output reads to the output stream
+ if(ros!=null){ros.add(reads, ln.id);}
+
+ //Notify the input stream that the list was used
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+
+ //Fetch a new list
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+
+ //Notify the input stream that the final list was used
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ //Do anything necessary after processing
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Process a single read pair.
+ * This is a template placeholder: as written it always throws.
+ * @param r1 Read 1
+ * @param r2 Read 2 (may be null)
+ * @return True if the reads should be kept, false if they should be discarded.
+ * @throws RuntimeException always, until an implementation is supplied
+ */
+ boolean processReadPair(final Read r1, final Read r2){
+ throw new RuntimeException("TODO");
+ }
+
+ /**
+ * Called by the constructor when help is requested, when no input file is
+ * given, or when out2 is given without out1.
+ * This is a template placeholder: as written it always throws a RuntimeException.
+ */
+ private void printOptions(){
+ throw new RuntimeException("TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file path */
+ private String in1=null;
+ /** Secondary input file path */
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ /** Primary output file path */
+ private String out1=null;
+ /** Secondary output file path */
+ private String out2=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ /** Override input file extension */
+ private String extin=null;
+ /** Override output file extension */
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of reads processed */
+ protected long readsProcessed=0;
+ /** Number of bases processed */
+ protected long basesProcessed=0;
+
+ /** Quit after processing this many input reads; -1 means no limit */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file */
+ private final FileFormat ffin1;
+ /** Secondary input file */
+ private final FileFormat ffin2;
+
+ /** Primary output file */
+ private final FileFormat ffout1;
+ /** Secondary output file */
+ private final FileFormat ffout2;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+ /** Overwrite existing output files */
+ private boolean overwrite=false;
+ /** Append to existing output files */
+ private boolean append=false;
+ /** This flag has no effect on singlethreaded programs */
+ private final boolean ordered=false;
+
+}
diff --git a/current/jgi/A_Sample2.java b/current/jgi/A_Sample2.java
new file mode 100755
index 0000000..7853973
--- /dev/null
+++ b/current/jgi/A_Sample2.java
@@ -0,0 +1,160 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.ListNum;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 6, 2014
+ *
+ */
+public class A_Sample2 {
+
+	/**
+	 * Command-line entry point: creates a Timer, builds an A_Sample2 from the
+	 * arguments, and runs it.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		final Timer timer=new Timer();
+		final A_Sample2 sample=new A_Sample2(args);
+		sample.process(timer);
+	}
+
+ /**
+ * Constructor.  Parses the arguments, copies maxReads, in1 and out1 from
+ * the parser, and creates the input and output FileFormat objects.
+ * @param args Command line arguments
+ */
+ public A_Sample2(String[] args){
+
+ //Process any config files, then handle a help request
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Split "a=b" into key and value; only the text between the first and second '=' is used as the value
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else{
+ //With assertions disabled, an unknown parameter is only reported, not fatal.
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ in1=parser.in1;
+ out1=parser.out1;
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, true, false, false);
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ }
+
+ /**
+ * Creates the read streams, iterates over all reads (the per-read
+ * processing is a placeholder), forwards each list to the output stream if
+ * there is one, closes the streams, and prints timing to outstream.
+ * @param t Timer that is stopped here and used for the rate calculation
+ */
+ void process(Timer t){
+
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+ cris.start();
+ }
+ //NOTE(review): this local is never read below
+ boolean paired=cris.paired();
+
+ final ConcurrentReadOutputStream ros;
+ if(out1!=null){
+ final int buff=4;
+
+ if(cris.paired() && (in1==null || !in1.contains(".sam"))){
+ outstream.println("Writing interleaved.");
+ }
+
+ //NOTE(review): both operands of && are identical, so the second test is redundant
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
+ ros.start();
+ }else{ros=null;}
+
+ long readsProcessed=0;
+ {
+
+ //Fetch the first list of reads
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //Check that pairing is as expected
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ //As long as there is a nonempty read list...
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(int idx=0; idx<reads.size(); idx++){
+ //NOTE(review): r1 is unused until processing code is added below
+ final Read r1=reads.get(idx);
+
+ // ********* Process reads here *********
+
+ readsProcessed++;
+ }
+
+ if(ros!=null){ros.add(reads, ln.id);}
+
+ //Notify the input stream that the list was used, then fetch the next one
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //Notify the input stream that the final list was used
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+ ReadWrite.closeStreams(cris, ros);
+ if(verbose){outstream.println("Finished.");}
+
+ t.stop();
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+readsProcessed+" \t"+String.format("%.2fk reads/sec", (readsProcessed/(double)(t.elapsed))*1000000));
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Called by the constructor when help is requested.
+ * This is a template placeholder: as written it always throws a RuntimeException.
+ */
+ private void printOptions(){
+ throw new RuntimeException("printOptions: TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String out1=null;
+
+ private final FileFormat ffin1;
+ private final FileFormat ffout1;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private java.io.PrintStream outstream=System.err;
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/A_SampleD.java b/current/jgi/A_SampleD.java
new file mode 100755
index 0000000..9d5fbf6
--- /dev/null
+++ b/current/jgi/A_SampleD.java
@@ -0,0 +1,168 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadInputStreamD;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.mpi.MPIWrapper;
+
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 6, 2014
+ *
+ */
+public class A_SampleD {
+
+ public static void main(String[] args){ //Command-line entry point
+ Timer t=new Timer(); //Created first; process() stops and prints it
+ A_SampleD as=new A_SampleD(args); //Parses the arguments
+ MPIWrapper.mpiInit(args); //NOTE(review): runs after the constructor - confirm argument parsing does not need MPI state
+ as.process(t);
+ MPIWrapper.mpiFinalize(); //Not reached if process() throws
+ }
+
+ public A_SampleD(String[] args){ //Parses the command line and creates the input/output FileFormat objects
+ //Config-file expansion and help detection happen before any other parsing
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions(); //Currently always throws, so the exit below is not reached
+ System.exit(0);
+ }
+
+ outstream.println("A_SampleD: Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Arguments are expected in the form "a=b"
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null; //Only the text between the first and second '=' is kept
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing; the shared parser consumed this argument
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ConcurrentReadInputStreamD.verbose=verbose;
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else{
+ outstream.println("A_SampleD: Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ in1=parser.in1;
+ out1=parser.out1;
+ }
+ //Both formats default to FASTQ; no extension override is passed (third argument is null)
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, true, false, false);
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ }
+
+ void process(Timer t){ //Streams every read from the input to the optional output, then prints timing to outstream
+ //t is the timer started in main(); it is stopped here once the streams are closed
+ final ConcurrentReadInputStream crisD=ConcurrentReadInputStream.getReadInputStream(
+ maxReads, false, ffin1, null, Shared.USE_MPI, Shared.MPI_KEEP_ALL);
+ crisD.start();
+ final boolean paired=crisD.paired();
+
+ final ConcurrentReadOutputStream ros;
+ if(out1!=null){
+ final int buff=4;
+
+ if(paired && (in1==null || !in1.contains(".sam"))){
+ outstream.println("A_SampleD: Writing interleaved.");
+ }
+ //There is only one input and one output here, so a single comparison is sufficient
+ assert(!out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
+ ros.start();
+ }else{ros=null;}
+
+ long readsProcessed=0;
+ {
+
+ ListNum<Read> ln=crisD.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+ if(verbose){outstream.println("A_SampleD: Initial A_SampleD list: "+(reads==null ? 0 : reads.size()));} //reads is null when no list was returned
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==crisD.paired());
+ }
+ //Loop until the stream hands back a null or empty list
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("A_SampleD: Fetched "+reads.size()+" reads.");}
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+
+ // ********* Process reads here *********
+
+ readsProcessed++;
+ }
+
+ if(ros!=null){ros.add(reads, ln.id);}
+ //Return the list before requesting the next one
+ crisD.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("A_SampleD: Returned a list.");}
+ ln=crisD.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){ //The terminating list, if any, is returned as well
+ crisD.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+ ReadWrite.closeStreams(crisD, ros);
+ if(verbose){outstream.println("A_SampleD: Finished.");}
+
+ t.stop();
+ outstream.println("A_SampleD: Time: \t"+t);
+ outstream.println("A_SampleD: Rank "+ Shared.MPI_RANK + ": Reads Processed: "+readsProcessed+" \t"+String.format("%.2fk reads/sec", (readsProcessed/(double)(t.elapsed))*1000000));
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){ //Placeholder for usage text; not implemented, so every call throws
+ throw new RuntimeException("printOptions: TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String out1=null;
+
+ private final FileFormat ffin1;
+ private final FileFormat ffout1;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private java.io.PrintStream outstream=System.err;
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/A_SampleMT.java b/current/jgi/A_SampleMT.java
new file mode 100755
index 0000000..31d625c
--- /dev/null
+++ b/current/jgi/A_SampleMT.java
@@ -0,0 +1,529 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * This class does nothing.
+ * It is designed to be easily modified into a program
+ * that processes reads in multiple threads, by
+ * filling in the processReadPair method.
+ *
+ * @author Brian Bushnell
+ * @date November 19, 2015
+ *
+ */
+public class A_SampleMT {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Command-line entry point.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ //The timer is created before anything else so it also covers argument parsing
+ final Timer timer=new Timer();
+
+ //Build the program object from the arguments
+ final A_SampleMT sample=new A_SampleMT(args);
+
+ //Read, process, and write the data
+ sample.process(timer);
+ }
+
+ /**
+ * Constructor. Parses the arguments, validates the file names, and creates the FileFormat objects.
+ * @param args Command line arguments
+ */
+ public A_SampleMT(String[] args){
+
+ //Process any config files
+ args=Parser.parseConfig(args);
+
+ //Detect whether the user needs help
+ if(Parser.parseHelp(args, true)){
+ printOptions(); //Currently always throws, so the exit below is not reached
+ System.exit(0);
+ }
+
+ //Print the program name and arguments
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether interleaved was explicitly set.
+
+ //Set some shared static variables regarding PIGZ
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ //Create a parser object
+ Parser parser=new Parser();
+
+ //Parse each argument
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+
+ //Break arguments into their constituent parts, in the form of "a=b"
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null; //Only the text between the first and second '=' is kept
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+
+ if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("ordered")){
+ ordered=Tools.parseBoolean(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ long fake_variable=Tools.parseKMG(b); //Placeholder showing how a numeric flag is parsed; the value is unused
+ //Set a variable here
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ //Do input file # replacement, unless a file literally named with '#' exists
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+
+ //Do output file # replacement
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+
+ //Adjust interleaved detection based on the number of input files
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Ensure there is an input file
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ //Adjust the number of threads for input file reading
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //Ensure out2 is not set without out1
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+
+ //Adjust interleaved settings based on number of output files
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){ //One input but two outputs: treat the input as interleaved
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //Ensure output files can be written
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Ensure input files can be read
+ if(!Tools.testInputFiles(false, true, in1, in2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+
+ //Ensure that no file was specified multiple times
+ if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ //Create output FileFormat objects
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+
+ //Create input FileFormat objects
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Create read streams, process all data in worker threads, and report timing; throws if any step set errorState */
+ void process(Timer t){
+
+ //Validate reads in the input threads only when fewer than 4 threads are in use
+ final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+ Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4;
+ try{
+ //Create a read input stream
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ cris.start(); //Start the stream
+ if(verbose){outstream.println("Started cris");}
+ }
+ boolean paired=cris.paired();
+ if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ //Optionally create a read output stream
+ final ConcurrentReadOutputStream ros;
+ if(ffout1!=null){
+ //Select output buffer size based on whether it needs to be ordered
+ final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8);
+
+ //Notify user of output mode
+ if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){
+ outstream.println("Writing interleaved.");
+ }
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+ ros.start(); //Start the stream
+ }else{ros=null;}
+
+ //Reset counters
+ readsProcessed=0;
+ basesProcessed=0;
+
+ //Process the reads in separate threads
+ spawnThreads(cris, ros);
+
+ if(verbose){outstream.println("Finished; closing streams.");}
+
+ //Write anything that was accumulated by ReadStats
+ errorState|=ReadStats.writeAll();
+ //Close the read streams
+ errorState|=ReadWrite.closeStreams(cris, ros);
+ }finally{
+ //Restore the global read validation setting, even if processing threw
+ Read.VALIDATE_IN_CONSTRUCTOR=vic;
+ }
+ //Report timing and results
+ {
+ t.stop();
+
+ //Calculate units per nanosecond
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ //Add "k" and "m" for large numbers
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ //Pad the strings on the left so they are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ }
+
+ //Throw an exception if there was an error in a thread or while closing streams
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /** Launch one worker per available thread, wait for all of them, and merge their results */
+ private void spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+
+ //Do anything necessary prior to processing
+
+ //One worker is created for each thread the program may use
+ final int workerCount=Shared.threads();
+
+ //Build the workers, giving each a sequential id
+ final ArrayList<ProcessThread> workers=new ArrayList<ProcessThread>(workerCount);
+ for(int tid=0; tid<workerCount; tid++){
+ workers.add(new ProcessThread(cris, ros, tid));
+ }
+
+ //Launch every worker before waiting on any of them
+ for(final ProcessThread worker : workers){
+ worker.start();
+ }
+
+ //Collect the workers one at a time
+ boolean allSucceeded=true;
+ for(final ProcessThread worker : workers){
+
+ //An interrupted join is retried until the worker has really finished
+ while(worker.getState()!=Thread.State.TERMINATED){
+ try {
+ //Block until the worker exits
+ worker.join();
+ } catch (InterruptedException e) {
+ //Potentially handle this, if it is expected to occur
+ e.printStackTrace();
+ }
+ }
+
+ //Fold this worker's counters and status into the totals
+ readsProcessed+=worker.readsProcessedT;
+ basesProcessed+=worker.basesProcessedT;
+ if(!worker.success){allSucceeded=false;}
+ }
+
+ //Track whether any threads failed
+ if(!allSucceeded){errorState=true;}
+
+ //Do anything necessary after processing
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Intended to print usage information; not implemented yet, so every call throws */
+ private void printOptions(){
+ throw new RuntimeException("TODO"); //TODO
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Worker thread that pulls read lists from the input stream until it is exhausted. This class is static to
+ * prevent accidental writing to shared variables. It is safe to remove the static modifier. */
+ private static class ProcessThread extends Thread {
+
+ //Constructor; stores the shared streams and this worker's id
+ ProcessThread(final ConcurrentReadInputStream cris_, final ConcurrentReadOutputStream ros_, final int tid_){
+ cris=cris_;
+ ros=ros_;
+ tid=tid_;
+ }
+
+ //Called by start()
+ public void run(){
+ //Do anything necessary prior to processing
+
+ //Process the reads
+ processInner();
+
+ //Do anything necessary after processing
+
+ //Indicate successful exit status; not reached if processInner() throws
+ success=true;
+ }
+
+ /** Iterate through the reads */
+ void processInner(){
+
+ //Grab the first ListNum of reads
+ ListNum<Read> ln=cris.nextList();
+ //Grab the actual read list from the ListNum
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //Check to ensure pairing is as expected (the check itself is currently commented out)
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+// assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access
+ }
+
+ //As long as there is a nonempty read list...
+ while(reads!=null && reads.size()>0){
+// if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
+
+ //Loop through each read in the list
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ final Read r2=r1.mate;
+
+ //Validate reads in worker threads
+ if(!r1.validated()){r1.validate(true);}
+ if(r2!=null && !r2.validated()){r2.validate(true);}
+
+ //Track the initial length for statistics
+ final int initialLength1=r1.length();
+ final int initialLength2=r1.mateLength(); //Queried through r1, so r2 is never dereferenced here
+
+ //Increment counters
+ readsProcessedT+=1+r1.mateCount();
+ basesProcessedT+=initialLength1+initialLength2;
+
+ {
+ //Reads are processed in this block.
+ boolean keep=processReadPair(r1, r2);
+ if(!keep){reads.set(idx, null);} //Discarded pairs leave a null slot in the list
+ }
+ }
+
+ //Output reads to the output stream
+ if(ros!=null){ros.add(reads, ln.id);}
+
+ //Notify the input stream that the list was used
+ cris.returnList(ln.id, ln.list.isEmpty());
+// if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
+
+ //Fetch a new list
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+
+ //Notify the input stream that the final list was used
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ /**
+ * Process a read or a read pair. Not implemented in this template; it always throws.
+ * @param r1 Read 1
+ * @param r2 Read 2 (may be null)
+ * @return True if the reads should be kept, false if they should be discarded.
+ */
+ boolean processReadPair(final Read r1, final Read r2){
+ throw new RuntimeException("TODO: Implement this method."); //TODO
+// return true;
+ }
+
+ /** Number of reads processed by this thread */
+ protected long readsProcessedT=0;
+ /** Number of bases processed by this thread */
+ protected long basesProcessedT=0;
+
+ /** True only if this thread has completed successfully */
+ boolean success=false;
+
+ /** Shared input stream */
+ private final ConcurrentReadInputStream cris;
+ /** Shared output stream */
+ private final ConcurrentReadOutputStream ros;
+ /** Thread ID */
+ final int tid;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file path */
+ private String in1=null;
+ /** Secondary input file path */
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ /** Primary output file path */
+ private String out1=null;
+ /** Secondary output file path */
+ private String out2=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ /** Override input file extension */
+ private String extin=null;
+ /** Override output file extension */
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of reads processed */
+ protected long readsProcessed=0;
+ /** Number of bases processed */
+ protected long basesProcessed=0;
+
+ /** Quit after processing this many input reads; -1 means no limit */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file */
+ private final FileFormat ffin1;
+ /** Secondary input file */
+ private final FileFormat ffin2;
+
+ /** Primary output file */
+ private final FileFormat ffout1;
+ /** Secondary output file */
+ private final FileFormat ffout2;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+ /** Overwrite existing output files */
+ private boolean overwrite=false;
+ /** Append to existing output files */
+ private boolean append=false;
+ /** Reads are output in input order */
+ private boolean ordered=false;
+
+}
diff --git a/current/jgi/A_Sample_Unpaired.java b/current/jgi/A_Sample_Unpaired.java
new file mode 100755
index 0000000..171d13a
--- /dev/null
+++ b/current/jgi/A_Sample_Unpaired.java
@@ -0,0 +1,268 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ByteBuilder;
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 17, 2014
+ *
+ */
+public class A_Sample_Unpaired {
+
+ public static void main(String[] args){ //Command-line entry point
+ final Timer timer=new Timer(); //Started before parsing so the reported time covers it
+ final A_Sample_Unpaired sample=new A_Sample_Unpaired(args);
+ sample.process(timer);
+ }
+
+ public A_Sample_Unpaired(String[] args){ //Parses the command line, adjusts shared static settings, and creates the FileFormat objects
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+ //outstream is initialized to System.err, so the assignment below currently changes nothing
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+ //Shared static settings: cap the read buffer length and buffer count, enable pigz/unpigz
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+ //Interleaving is switched off before parsing begins
+ FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Arguments are expected in the form "a=b"
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null; //Only the text between the first and second '=' is kept
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ parser.in1=arg; //A bare first argument is taken as the input file
+ }else if(parser.out1==null && i==1 && !arg.contains("=")){
+ parser.out1=arg; //A bare second argument is taken as the output file
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ in1=parser.in1;
+ qfin1=parser.qfin1;
+
+ out1=parser.out1;
+ qfout1=parser.qfout1;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){ //No mode was forced, so default to BF1
+ ByteFile.FORCE_MODE_BF2=false;
+ ByteFile.FORCE_MODE_BF1=true;
+ }
+ //Covers a positional output argument of "null", which bypasses the a=b handling above
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ }
+
+ void process(Timer t){ //Streams every read from input to the optional output, counting reads and bases, then reports timing
+ //t is the timer started in main(); it is stopped here once the streams are closed
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null, qfin1, null);
+ cris.start();
+ if(verbose){outstream.println("Started cris");}
+ }
+ boolean paired=cris.paired();
+// if(verbose){
+ if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+// }
+
+ final ConcurrentReadOutputStream ros;
+ if(out1!=null){
+ final int buff=4;
+ //There is only one input and one output here, so a single comparison is sufficient
+ assert(!out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, null, qfout1, null, buff, null, false);
+ ros.start();
+ }else{ros=null;}
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+// outstream.println("Fetched "+reads);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+ //Loop until the stream hands back a null or empty list
+ while(reads!=null && reads.size()>0){
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+
+ final int initialLength1=r1.length();
+
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ }
+ //The same list is written out unchanged
+ final ArrayList<Read> listOut=reads;
+
+ if(ros!=null){ros.add(listOut, ln.id);}
+ //Return the list before requesting the next one
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){ //The terminating list, if any, is returned as well
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ errorState|=ReadStats.writeAll();
+
+ errorState|=ReadWrite.closeStreams(cris, ros);
+
+ t.stop();
+ //Units per nanosecond
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+ //Abbreviate large counts with "k" or "m"
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+ //Pad on the left so the numbers are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){ //Placeholder for usage text; the commented-out draft below names jgi.ReformatReads and in2/out2 flags
+ assert(false) : "printOptions: TODO"; //Has no effect when assertions are disabled; the method then returns silently
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+
+ private String qfin1=null;
+
+ private String out1=null;
+
+ private String qfout1=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+
+ private final FileFormat ffout1;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+}
diff --git a/current/jgi/AddAdapters.java b/current/jgi/AddAdapters.java
new file mode 100755
index 0000000..824df8f
--- /dev/null
+++ b/current/jgi/AddAdapters.java
@@ -0,0 +1,765 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.QualityTools;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 16, 2014
+ *
+ */
+public class AddAdapters {
+
+
+
+ /**
+  * Program entry point. Starts a timer, builds an AddAdapters instance from the
+  * arguments, and then runs either write mode (add adapters to reads) or
+  * read mode (grade previously generated reads), depending on writeMode.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final AddAdapters instance=new AddAdapters(args);
+     if(!instance.writeMode){
+         instance.read(timer);
+         return;
+     }
+     instance.write(timer);
+ }
+
+ private void printOptions(){
+ System.err.println("Please consult the shellscript for usage information.");
+ }
+
+ public AddAdapters(String[] args){
+
+ if(args==null || args.length==0){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ Parser parser=new Parser();
+
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+ ReadWrite.ZIP_THREAD_DIVISOR=1;
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("reads") || a.equals("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("t") || a.equals("threads")){
+ Shared.setThreads(b);
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+ in1=b;
+ }else if(a.equals("in2") || a.equals("input2")){
+ in2=b;
+ }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+ out1=b;
+ }else if(a.equals("out2") || a.equals("output2")){
+ out2=b;
+ }else if(a.equals("extin")){
+ extin=b;
+ }else if(a.equals("extout")){
+ extout=b;
+ }else if(a.equals("adapter") || a.equals("adapters") || a.equals("ref")){
+ adapterFile=b;
+ }else if(a.equals("literal") || a.equals("literals")){
+ literals=(b==null ? null : b.split(","));
+ }else if(a.equals("rate") || a.equals("prob")){
+ adapterProb=Float.parseFloat(b);
+ }else if(a.equals("minlength") || a.equals("minlen") || a.equals("ml")){
+ minlen=Integer.parseInt(b);
+ }else if(a.equals("3'") || a.equalsIgnoreCase("3prime") || a.equalsIgnoreCase("3-prime") || a.equalsIgnoreCase("right") || a.equalsIgnoreCase("r")){
+ right=Tools.parseBoolean(b);
+ }else if(a.equals("5'") || a.equalsIgnoreCase("5prime") || a.equalsIgnoreCase("5-prime") || a.equalsIgnoreCase("left") || a.equalsIgnoreCase("l")){
+ right=!Tools.parseBoolean(b);
+ }else if(a.equals("end")){
+ if(b.equals("3'") || b.equalsIgnoreCase("3prime") || b.equalsIgnoreCase("3-prime") || b.equalsIgnoreCase("right") || a.equalsIgnoreCase("r")){
+ right=true;
+ }else if(b.equals("5'") || b.equalsIgnoreCase("5prime") || b.equalsIgnoreCase("5-prime") || b.equalsIgnoreCase("left") || a.equalsIgnoreCase("l")){
+ right=true;
+ }
+ }else if(a.equals("addslash")){
+ addslash=Tools.parseBoolean(b);
+ }else if(a.equals("adderrors")){
+ adderrors=Tools.parseBoolean(b);
+ }else if(a.equals("addreversecomplement") || a.equals("arc")){
+ addRC=Tools.parseBoolean(b);
+ }else if(a.equals("addpaired")){
+ addPaired=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("write")){
+ writeMode=Tools.parseBoolean(b);
+ }else if(a.equals("grade")){
+ writeMode=!Tools.parseBoolean(b);
+ }else if(a.equals("mode")){
+ if("grade".equalsIgnoreCase(b) || "read".equalsIgnoreCase(b)){
+ writeMode=false;
+ }else if("generate".equalsIgnoreCase(b) || "write".equalsIgnoreCase(b) || "add".equalsIgnoreCase(b)){
+ writeMode=true;
+ }else{
+ throw new RuntimeException("Unknown mode "+b);
+ }
+ }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ in1=arg;
+ }else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+// if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;}
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;}
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(writeMode && out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+// out1="stdout";
+ }
+
+ if(!parser.setInterleaved){
+ assert(in1!=null && (!writeMode || out1!=null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else if(writeMode){ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+ ffa=FileFormat.testInput(adapterFile, FileFormat.FASTA, null, true, true);
+
+ adapters=makeAdapterList();
+
+ if(writeMode){
+ if(adapters==null || adapters.isEmpty()){
+ throw new RuntimeException("\n\nPlease specify adapters with 'adapters=file.fa' or 'literal=AGCTACGT'\n");
+ }
+ randy=new Random();
+ }
+ }
+
+ /**
+  * Builds the adapter list with the static FASTQ interleaving flags temporarily
+  * set to false, then restores their previous values.
+  * The restore happens in a finally block so that a failure while loading
+  * adapters cannot leave the static flags altered.
+  * @return The list from makeAdapterList2(), or null if there are no adapters
+  */
+ private final ArrayList<byte[]> makeAdapterList(){
+     final boolean oldTI=FASTQ.TEST_INTERLEAVED;
+     final boolean oldFI=FASTQ.FORCE_INTERLEAVED;
+     FASTQ.TEST_INTERLEAVED=false;
+     FASTQ.FORCE_INTERLEAVED=false;
+     try{
+         return makeAdapterList2();
+     }finally{
+         FASTQ.TEST_INTERLEAVED=oldTI;
+         FASTQ.FORCE_INTERLEAVED=oldFI;
+     }
+ }
+
+ /**
+  * Collects adapter sequences from the adapter file (ffa) and from the
+  * comma-separated literals, optionally followed by the reverse complement
+  * of every sequence collected so far.
+  * @return The adapter sequences, or null when none were found
+  */
+ private final ArrayList<byte[]> makeAdapterList2(){
+     final boolean haveFile=(ffa!=null);
+     final boolean haveLiterals=(literals!=null);
+     if(!haveFile && !haveLiterals){return null;}
+
+     final ArrayList<byte[]> sequences=new ArrayList<byte[]>();
+
+     if(haveFile){
+         final FastaReadInputStream adapterStream=new FastaReadInputStream(ffa, false, false, -1);
+         Read current=adapterStream.next();
+         while(current!=null){
+             //Records without bases contribute nothing.
+             if(current.bases!=null){
+                 sequences.add(current.bases);
+             }
+             current=adapterStream.next();
+         }
+         adapterStream.close();
+     }
+
+     if(haveLiterals){
+         for(int i=0; i<literals.length; i++){
+             final String literal=literals[i];
+             //Skip missing entries and the word "null" in any letter case.
+             if(literal==null || "null".equalsIgnoreCase(literal)){continue;}
+             sequences.add(literal.getBytes());
+         }
+     }
+
+     if(addRC){
+         //Fix the bound first; the list grows while the reverse complements are appended.
+         final int forwardCount=sequences.size();
+         for(int i=0; i<forwardCount; i++){
+             final byte[] forward=sequences.get(i);
+             sequences.add(AminoAcid.reverseComplementBases(forward));
+         }
+     }
+
+     if(sequences.isEmpty()){return null;}
+     return sequences;
+ }
+
+ /**
+  * Write mode. Streams the input reads, adds adapters to them, and renames each read
+  * so that its header is numericID_initial_remaining (for pairs, the lengths of both
+  * mates are appended to a shared header, optionally followed by " /1" or " /2").
+  * The reads are written to the output stream if one was specified, and summary
+  * statistics are printed to outstream.
+  * @param t Timer created by the caller; stopped here and used for the speed report
+  * @throws RuntimeException If closing the streams reported an error
+  */
+ void write(Timer t){
+
+     final ConcurrentReadInputStream cris;
+     {
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+         if(verbose){System.err.println("Started cris");}
+         cris.start(); //4567
+     }
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));}
+
+     ConcurrentReadOutputStream ros=null;
+     if(out1!=null){
+         final int buff=4;
+
+         if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+             outstream.println("Writing interleaved.");
+         }
+
+         //Neither output may share a name with either input.
+         assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+         assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+         ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false);
+         ros.start();
+     }
+
+     {
+
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+//         System.err.println("Fetched "+reads);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+         }
+
+         while(reads!=null && reads.size()>0){
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 final Read r1=reads.get(idx);
+                 final Read r2=r1.mate;
+
+                 //With addPaired, this call also processes the mate at the same location.
+                 addAdapter(r1, addPaired);
+                 if(r2!=null && !addPaired){
+                     addAdapter(r2, addPaired);
+                 }
+
+                 //addAdapter has already replaced each id with initial_remaining.
+                 if(r2==null){
+                     r1.id=r1.numericID+"_"+r1.id;
+                 }else{
+                     String base=r1.numericID+"_"+r1.id+"_"+r2.id;
+                     if(addslash){
+                         r1.id=base+" /1";
+                         r2.id=base+" /2";
+                     }else{
+                         r1.id=base;
+                         r2.id=base;
+                     }
+                 }
+             }
+
+             if(ros!=null){ros.add(reads, ln.id);}
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     errorState|=ReadWrite.closeStreams(cris, ros);
+
+//     System.err.println(cris.errorState()+", "+(ros==null ? "null" : (ros.errorState()+", "+ros.finishedSuccessfully())));
+//     if(ros!=null){
+//         ReadStreamWriter rs1=ros.getRS1();
+//         ReadStreamWriter rs2=ros.getRS2();
+//         System.err.println(rs1==null ? "null" : rs1.finishedSuccessfully());
+//         System.err.println(rs2==null ? "null" : rs2.finishedSuccessfully());
+//     }
+//     assert(false);
+
+     t.stop();
+
+     double rpnano=readsProcessed/(double)(t.elapsed);
+     double bpnano=basesProcessed/(double)(t.elapsed);
+
+     String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+     String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+     //Left-pad the counts to 8 characters.
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     outstream.println("Adapters Added: \t"+adaptersAdded+" reads ("+String.format("%.2f",adaptersAdded*100.0/readsProcessed)+"%) \t"+
+             adapterBasesAdded+" bases ("+String.format("%.2f",adapterBasesAdded*100.0/basesProcessed)+"%)");
+
+     outstream.println("Valid Output: \t"+validReads+" reads ("+String.format("%.2f",validReads*100.0/readsProcessed)+"%) \t"+
+             validBases+" bases ("+String.format("%.2f",validBases*100.0/basesProcessed)+"%)");
+
+
+     outstream.println("\nTime: \t"+t);
+     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         throw new RuntimeException("AddAdapters terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+  * Overwrites part of a read with a randomly chosen adapter at the given location,
+  * updates the running statistics, and sets the read id to initial_remaining,
+  * where initial is the read length and remaining is the number of bases not
+  * covered by adapter or random filler.
+  * <br>In 3' (right) mode the adapter starts at loc and everything after it is
+  * replaced by random bases. In 5' (left) mode the adapter ends at loc and
+  * everything before it is replaced by random bases.
+  * Positions for which AminoAcid.isFullyDefined returns false are left unchanged.
+  * @param r Read to modify in place
+  * @param loc Adapter location; a value outside [0, length) means no adapter is added
+  */
+ private void addAdapter(Read r, final int loc){
+     final byte[] bases=r.bases;
+     final byte[] quals=r.quality;
+     final int remaining, initial=(bases==null ? 0 : bases.length);
+     final byte[] adapter;
+     int ab=0, rb=0; //Adapter bases and random bases written to this read
+
+     readsProcessed++;
+     basesProcessed+=initial;
+     if(initial>0 && loc>=0 && loc<initial){
+         adapter=adapters.get(randy.nextInt(adapters.size()));
+         adaptersAdded++;
+
+         if(right){
+             //The adapter occupies [loc, lim); it is truncated at the end of the read.
+             final int lim=Tools.min(initial, adapter.length+loc);
+             for(int i=loc, j=0; i<lim; i++, j++){
+                 if(AminoAcid.isFullyDefined(bases[i])){
+                     bases[i]=adapter[j];
+                     if(adderrors){substituteByQuality(bases, quals, i);}
+                 }
+                 ab++;
+             }
+             for(int i=lim; i<initial; i++){
+                 if(AminoAcid.isFullyDefined(bases[i])){
+                     bases[i]=AminoAcid.numberToBase[randy.nextInt(4)];
+                 }
+                 rb++;
+             }
+             remaining=loc;
+         }else{
+             //The adapter occupies (lim, loc], aligned so that its last base lands on loc.
+             final int lim=Tools.max(-1, loc-adapter.length);
+             for(int i=loc, j=adapter.length-1; i>lim; i--, j--){
+                 if(AminoAcid.isFullyDefined(bases[i])){
+                     bases[i]=adapter[j];
+                     if(adderrors){substituteByQuality(bases, quals, i);}
+                 }
+                 ab++;
+             }
+             for(int i=lim; i>-1; i--){
+                 if(AminoAcid.isFullyDefined(bases[i])){
+                     bases[i]=AminoAcid.numberToBase[randy.nextInt(4)];
+                 }
+                 rb++;
+             }
+             remaining=initial-loc-1;
+         }
+         assert(remaining<initial) : "\nremaining="+remaining+", initial="+initial+", rb="+rb+", ab="+ab+
+             ", loc="+loc+", adapter.length="+(adapter==null ? 0 : adapter.length)+"\n";
+     }else{
+         adapter=null;
+         remaining=initial;
+     }
+
+     assert(remaining==initial-(rb+ab));
+     assert(remaining>=0);
+
+     adapterBasesAdded+=ab;
+     randomBasesAdded+=rb;
+     r.id=initial+"_"+remaining;
+     if(remaining>=minlen){
+         validReads++;
+         validBases+=remaining;
+     }
+ }
+
+ /**
+  * Draws one random float and, if it is below QualityTools.PROB_ERROR for the
+  * quality at position i (quality 30 is used when the read has no qualities),
+  * replaces bases[i] with a base at a different index.
+  * The shift is 1 to 3, so a drawn error always changes the base; a shift of
+  * 0 to 2 would leave one third of the drawn errors as no-ops.
+  * Assumes baseToNumber and numberToBase are inverse mappings over 0-3 - TODO confirm.
+  * @param bases Base array to modify in place
+  * @param quals Quality array, or null
+  * @param i Position of the base
+  */
+ private void substituteByQuality(final byte[] bases, final byte[] quals, final int i){
+     final byte q=(quals==null ? 30 : quals[i]);
+     if(randy.nextFloat()<QualityTools.PROB_ERROR[q]){
+         final int old=AminoAcid.baseToNumber[bases[i]];
+         bases[i]=AminoAcid.numberToBase[(old+1+randy.nextInt(3))%4];
+     }
+ }
+
+ /**
+  * Decides at random, with probability adapterProb, whether this read gets an
+  * adapter, picks a random location within the read if so, and applies it.
+  * @param r Read to modify
+  * @param addPaired If true and the read has a mate, the mate is processed with the same location
+  */
+ private void addAdapter(Read r, boolean addPaired){
+     final int length=(r.bases==null ? 0 : r.bases.length);
+
+     //-1 means "no adapter"; the int overload then leaves the bases untouched.
+     int position=-1;
+     if(length>0 && randy.nextFloat()<adapterProb){
+         position=randy.nextInt(length);
+     }
+
+     addAdapter(r, position);
+     if(addPaired && r.mate!=null){
+         addAdapter(r.mate, position);
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Read (grading) mode. Streams the input reads, grades each read or pair against
+  * the lengths encoded in its header, and prints accuracy statistics to outstream.
+  * @param t Timer created by the caller; stopped here
+  * @throws RuntimeException If closing the stream reported an error, or if grading
+  * encounters headers it cannot interpret
+  */
+ void read(Timer t){
+
+     final ConcurrentReadInputStream cris;
+     {
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+         if(verbose){System.err.println("Started cris");}
+         cris.start(); //4567
+     }
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));}
+
+     {
+
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+//         System.err.println("Fetched "+reads);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+         }
+
+         while(reads!=null && reads.size()>0){
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 final Read r1=reads.get(idx);
+                 final Read r2=r1.mate;
+
+                 grade(r1, r2);
+             }
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     errorState|=ReadWrite.closeStream(cris);
+
+     t.stop();
+
+     long validBasesRemoved=validBasesExpected-validBasesCounted;
+     long incorrect=readsProcessed-correct;
+     long incorrectBases=basesProcessed-correctBases;
+
+     //The next four values are only consumed by the disabled timing report below.
+     double rpnano=readsProcessed/(double)(t.elapsed);
+     double bpnano=basesProcessed/(double)(t.elapsed);
+
+     String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+     String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     outstream.println("Total output: \t"+readsProcessed+" reads \t"+basesProcessed+" bases ");
+     outstream.println("Perfectly Correct (% of output): \t"+correct+" reads ("+String.format("%.3f",correct*100.0/readsProcessed)+
+             "%) \t"+correctBases+" bases ("+String.format("%.3f",correctBases*100.0/basesProcessed)+"%)");
+     outstream.println("Incorrect (% of output): \t"+incorrect+" reads ("+String.format("%.3f",incorrect*100.0/readsProcessed)+
+             "%) \t"+incorrectBases+" bases ("+String.format("%.3f",incorrectBases*100.0/basesProcessed)+"%)");
+     outstream.println();
+//     outstream.println("Too Short: \t"+tooShort+" reads ("+String.format("%.3f",tooShort*100.0/readsProcessed)+"%) \t"+
+//             tooShortBases+" bases ("+String.format("%.3f",tooShortBases*100.0/basesProcessed)+"%)");
+//     outstream.println("Too Long: \t"+tooLong+" reads ("+String.format("%.3f",tooLong*100.0/readsProcessed)+"%) \t"+
+//             tooLongBases+" bases ("+String.format("%.3f",tooLongBases*100.0/basesProcessed)+"%)");
+
+     //NOTE(review): the read percentage is relative to adapterReadsTotal, but the base
+     //percentage is relative to basesProcessed rather than adapterBasesTotal, although
+     //the label says "% of adapters" - confirm which denominator is intended.
+     outstream.println("Adapters Remaining (% of adapters): \t"+(adapterReadsRemaining)+" reads ("+String.format("%.3f",adapterReadsRemaining*100.0/adapterReadsTotal)+
+             "%) \t"+adapterBasesRemaining+" bases ("+String.format("%.3f",adapterBasesRemaining*100.0/basesProcessed)+"%)");
+     outstream.println("Non-Adapter Removed (% of valid): \t"+tooShort+" reads ("+String.format("%.4f",tooShort*100.0/readsProcessed)+
+             "%) \t"+validBasesRemoved+" bases ("+String.format("%.4f",validBasesRemoved*100.0/validBasesExpected)+"%)");
+
+     if(broken>0 || mispaired>0){
+         outstream.println("Broken: \t"+broken+" reads ("+String.format("%.2f",broken*100.0/readsProcessed)+"%)");
+         outstream.println("Mispaired: \t"+mispaired+" reads ("+String.format("%.2f",mispaired*100.0/readsProcessed)+"%)");
+     }
+
+
+//     outstream.println("\nTime: \t"+t);
+//     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+//     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         throw new RuntimeException("AddAdapters terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+  * Validates the header layout of a read or pair, counts mispaired reads,
+  * sets pair numbers where the header calls for it, and grades each read.
+  * The header is the part of the id before the first space, split on '_':
+  * 3 fields for a single-read header, 5 fields for a shared pair header.
+  * @param r1 First read
+  * @param r2 Mate of r1, or null for unpaired input
+  * @throws RuntimeException If the number of header fields does not fit the pairing
+  */
+ private void grade(Read r1, Read r2){
+     final String header1=r1.id.split(" ")[0];
+     final int fieldCount=header1.split("_").length;
+
+     if(r2==null){
+         if(fieldCount!=3){
+             throw new RuntimeException("Headers are corrupt, or paired reads are being processed as unpaired. Try running with 'int=t' or with 'in1=' and 'in2='");
+         }
+     }else{
+         final String header2=r2.id.split(" ")[0];
+         final boolean firstMarkedSecond=r1.id.endsWith(" /2");
+         final boolean secondMarkedFirst=r2.id.endsWith(" /1");
+
+         //Swapped slash suffixes or differing headers mean both reads are mispaired.
+         if(firstMarkedSecond || secondMarkedFirst || !header1.equals(header2)){
+             mispaired+=2;
+         }
+
+         if(fieldCount==3){
+             r2.setPairnum(0);
+         }else if(fieldCount==5){
+             if(firstMarkedSecond){r1.setPairnum(1);}
+             if(secondMarkedFirst){r2.setPairnum(0);}
+         }else{
+             throw new RuntimeException("Headers are corrupt. They must be generated by AddAdapters or RenameReads.");
+         }
+     }
+
+     grade(r1);
+     grade(r2); //Returns immediately when r2 is null
+ }
+
+ /**
+  * Grades one read by comparing its actual length with the lengths encoded in
+  * its header, and updates the accuracy counters.
+  * The header (id up to the first space) is split on '_': field 0 is a numeric id,
+  * followed by initial and remaining lengths; for pair number 1 the lengths are
+  * read two fields further along.
+  * @param r Read to grade; null is ignored
+  * @throws NumberFormatException If a header field is not numeric
+  */
+ private void grade(Read r){
+     if(r==null){return;}
+     final int offset=(2*r.pairnum());
+
+     String[] sa=r.id.split(" ")[0].split("_");
+     final long id=Long.parseLong(sa[0]); //Value unused; parsing rejects a non-numeric first field
+     final int initial=Integer.parseInt(sa[1+offset]);
+     final int remaining=Integer.parseInt(sa[2+offset]);
+     final int actual=r.length();
+
+     readsProcessed++;
+     basesProcessed+=actual;
+
+     assert(initial>=remaining);
+
+     //A read longer than its recorded original length cannot be a trimmed version of it.
+     if(actual>initial){broken++;}
+
+     validBasesExpected+=remaining;
+
+     if(initial==remaining){//Should not have trimmed
+         //Also counted as correct: a read reduced to under 2 bases whose expected length is below 1 or minlen.
+         if(actual==remaining || (actual<2 && (remaining<1 || remaining<minlen))){
+             correct++;
+             correctBases+=remaining;
+             validBasesCounted+=remaining;
+             trueNeg++;
+         }else if(actual<remaining){
+             tooShort++;
+             tooShortReadBases+=actual;
+             tooShortBases+=(remaining-actual);
+             validBasesCounted+=actual;
+             falsePos++;
+         }else if(actual>remaining){
+             tooLong++;
+             //Counts the bases in the too-long read, matching the branch below and tooShortReadBases.
+             tooLongReadBases+=actual;
+             tooLongBases+=(actual-remaining);
+             validBasesCounted+=remaining;
+             falseNeg++;
+         }
+     }else{//Should have trimmed
+
+         adapterBasesTotal+=(initial-remaining);
+         adapterReadsTotal++;
+
+         if(actual==remaining || (actual<2 && (remaining<1 || remaining<minlen))){
+             correct++;
+             correctBases+=remaining;
+             validBasesCounted+=remaining;
+             truePos++;
+         }else if(actual<remaining){
+             tooShort++;
+             tooShortReadBases+=actual;
+             tooShortBases+=(remaining-actual);
+             validBasesCounted+=actual;
+             truePos++;
+         }else if(actual>remaining){
+             tooLong++;
+             tooLongReadBases+=actual;
+             tooLongBases+=(actual-remaining);
+             adapterBasesRemaining+=(actual-remaining);
+             validBasesCounted+=remaining;
+             falseNeg++;
+             adapterReadsRemaining++;
+         }
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Set when closing the streams reports an error; write() and read() then throw */
+ public boolean errorState=false;
+
+ /** Primary and secondary input file names, from the in/in2 flags */
+ private String in1=null;
+ private String in2=null;
+
+ /** Primary and secondary output file names, from the out/out2 flags; "null" is treated as no output */
+ private String out1=null;
+ private String out2=null;
+
+ /** Extension overrides passed to FileFormat.testInput and FileFormat.testOutput */
+ private String extin=null;
+ private String extout=null;
+
+ /** Adapter file name, from the adapter/adapters/ref flags */
+ private String adapterFile=null;
+ /** Adapter sequences given on the command line as a comma-separated list */
+ private String[] literals=null;
+
+ private boolean overwrite=false;
+ private boolean append=false;
+
+ /** Add /1 and /2 to paired reads */
+ private boolean addslash=true;
+ /** Encode correct answer in read ID field */
+ // NOTE(review): no use of this field is visible in this class; ids are always rewritten.
+ private boolean changename=true;
+ /** Add errors from quality value */
+ private boolean adderrors=true;
+
+ /** Add adapters to the same location for read 1 and read 2 */
+ private boolean addPaired=true;
+ /** Add reverse-complemented adapters also */
+ private boolean addRC=false;
+ /** aka 3' */
+ private boolean right=true;
+
+ /** Passed to the read input stream as the maximum number of reads; -1 by default */
+ private long maxReads=-1;
+ /** Reads with fewer remaining bases than this are not counted as valid output */
+ private int minlen=1;
+
+ /** True: main() calls write() to add adapters. False: main() calls read() to grade */
+ private boolean writeMode=true;
+ /** A read receives an adapter when a random float falls below this value */
+ private float adapterProb=0.5f;
+
+ // Counters shared by write and read modes
+ private long readsProcessed=0;
+ private long basesProcessed=0;
+ // Counters updated in write mode only
+ private long adaptersAdded=0;
+ private long adapterBasesAdded=0;
+ private long randomBasesAdded=0;
+ private long validReads=0;
+ private long validBases=0;
+
+ // Grading counters, updated in read mode only
+ private long truePos=0;
+ private long trueNeg=0;
+ private long falsePos=0;
+ private long falseNeg=0;
+ /** Reads longer than the original length recorded in their header */
+ private long broken=0;
+ /** Reads whose pair headers differ or whose /1 and /2 suffixes are swapped */
+ private long mispaired=0;
+
+ private long tooShort=0;
+ private long tooLong=0;
+ private long correct=0;
+ // NOTE(review): no use of this counter is visible in this class.
+ private long fullyRemoved=0;
+
+ private long tooShortBases=0;
+ private long tooLongBases=0;
+ private long tooShortReadBases=0;
+ private long tooLongReadBases=0;
+ private long correctBases=0;
+
+ private long validBasesCounted=0;
+ private long validBasesExpected=0;
+
+// private long invalidBasesCounted=0;
+ private long adapterBasesTotal=0;
+ private long adapterReadsTotal=0;
+ private long adapterReadsRemaining=0;
+ private long adapterBasesRemaining=0;
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+
+ /** Format of the adapter file */
+ private final FileFormat ffa;
+
+ /** Adapter sequences from the file and literals; null when there are none */
+ private final ArrayList<byte[]> adapters;
+
+ /*--------------------------------------------------------------*/
+
+ /** Destination for status and summary messages */
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+
+ /** Random source; only created by the constructor in write mode */
+ private java.util.Random randy;
+
+}
diff --git a/current/jgi/AssemblyStats2.java b/current/jgi/AssemblyStats2.java
new file mode 100755
index 0000000..eb44fc3
--- /dev/null
+++ b/current/jgi/AssemblyStats2.java
@@ -0,0 +1,1839 @@
+package jgi;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import align2.IntList;
+import align2.LongList;
+import align2.Tools;
+
+import dna.AminoAcid;
+import dna.FastaToChromArrays2;
+import dna.Parser;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 16, 2013
+ *
+ */
+public final class AssemblyStats2 {
+
+ /*--------------------------------------------------------------*/
+
+ private static void printOptions(){
+ System.err.println("Please consult the shellscript for usage information.");
+// System.out.println("\nUsage: java -Xmx120m jgi.AssemblyStats2 <input file>");
+// System.out.println("\nOptional flags:");
+// System.out.println("in=<file> \tThe 'in=' flag is only needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in.");
+// System.out.println("format=1 \tUses variable units like MB and KB, and is designed for compatibility with existing tools.");
+// System.out.println("format=2 \tUses only whole numbers of bases, with no commas in numbers, and is designed for machine parsing.");
+// System.out.println("format=3 \tOutputs stats in 2 rows of tab-delimited columns: a header row and a data row.");
+// System.out.println("format=4 \tLike 3 but with scaffold data only.");
+// System.out.println("format=5 \tLike 3 but with contig data only.");
+// System.out.println("format=6 \tLike 3 but the header starts with a #.");
+// System.out.println("format=7 \tLike 1 but without scaffold data.");
+// System.out.println("gc=<file> \tPrint gc statistics per scaffold to a file (or stdout).");
+// System.out.println("gcformat=1 \tid start stop A C G T N GC");
+// System.out.println("gcformat=2 \tid gc");
+// System.out.println("gcformat=3 \tid length A C G T N GC");
+// System.out.println("gcformat=4 \tid length gc");
+// System.out.println("gchist=<file>\tPrint gc content histogram to this file.");
+// System.out.println("gcbins=200 \tNumber of bins in gc histogram.");
+// System.out.println("n=10 \tMinimum number of consecutive Ns between contigs.");
+// System.out.println("k=13 \tDisplay BBMap's estimated memory usage for this genome with specified kmer length.");
+// System.out.println("showspeed=t \tSet to 'f' to suppress display of processing speed.");
+// System.out.println("minscaf=0 \tIgnore scaffolds shorter than this.");
+// System.out.println("n_=t \tThis flag will prefix the terms 'contigs' and 'scaffolds' with 'n_' in formats 3-6.");
+// System.out.println("verbose=t \tSet to false to remove superfluous info.");
+// System.out.println("Output is always tab-delimited. AGCT are fractions of defined bases; N is fraction of total bases.");
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Program entry point: builds an AssemblyStats2 from the arguments and runs it.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     new AssemblyStats2(args).process();
+ }
+
+ /**
+  * Parses command-line arguments and allocates the length lists and GC
+  * histogram arrays. Prints a usage pointer and exits if help was requested.
+  * @param args Command-line arguments; the first one may be a bare input file name
+  * @throws RuntimeException On an unknown parameter or an out-of-range format or gcformat
+  */
+ public AssemblyStats2(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     ReadWrite.USE_UNPIGZ=ReadWrite.USE_PIGZ=true;
+
+     int GCBINS_=200;
+     int MINSCAF_=0;
+     int gchistdecimals_=-1;
+
+     for(int i=0; i<args.length; i++){
+
+         final String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if("null".equalsIgnoreCase(b)){b=null;}
+         //Strip one leading hyphen, unless this is the first argument and contains a '.',
+         //in which case it may be a file name. startsWith is safe on an empty key
+         //(argument "" or "=x"), where charAt(0) would throw.
+         if(a.startsWith("-") && (a.indexOf('.')<0 || i>0)){a=a.substring(1);}
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(arg.contains("=") && (a.equals("in") || a.equals("ref"))){
+             in=b;
+         }else if(a.equals("gc") || a.equals("gcout")){
+             gc=b;
+             if(b==null || "summaryonly".equalsIgnoreCase(b) || "none".equalsIgnoreCase(b)){
+                 gc=null;
+             }
+         }else if(a.equals("gchist")){
+             gchistFile=b;
+             if(b==null || "none".equalsIgnoreCase(b)){
+                 gchistFile=null;
+             }
+         }else if(a.equals("gchistdecimals")){
+             gchistdecimals_=Integer.parseInt(b);
+         }else if(a.equals("gcbins")){
+             //Non-positive values are ignored and the default is kept.
+             final int x=Integer.parseInt(b);
+             if(x>0){GCBINS_=x;}
+         }else if(a.equals("shist") || a.equals("scaffoldhist")){
+             scaffoldHistFile=b;
+             if(b==null || "none".equalsIgnoreCase(b)){
+                 scaffoldHistFile=null;
+             }
+         }else if(a.equals("out")){
+             out=b;
+             if(b==null || "summaryonly".equalsIgnoreCase(b) || "none".equalsIgnoreCase(b)){
+                 out=null;
+             }else if("benchmark".equalsIgnoreCase(b)){
+                 benchmark=true;
+                 out=null;
+                 gc=null;
+             }
+         }else if(a.equals("benchmark")){
+             benchmark=Tools.parseBoolean(b);
+             if(benchmark){
+                 out=null;
+                 gc=null;
+             }
+         }else if(a.equals("format")){
+             FORMAT=Integer.parseInt(b);
+             //NOTE(review): the check accepts 0 but the message says 1 through 7 - confirm which is intended.
+             if(FORMAT<0 || FORMAT>7){
+                 throw new RuntimeException("\nUnknown format: "+FORMAT+"; valid values are 1 through 7.\n");
+             }
+         }else if(a.equals("gcformat")){
+             GCFORMAT=Integer.parseInt(b);
+             if(GCFORMAT<0 || GCFORMAT>4){
+                 throw new RuntimeException("\nUnknown gcformat: "+GCFORMAT+"; valid values are 0 through 4.\n");
+             }
+         }else if(a.equals("cutoff")){
+             cutoff=Tools.parseKMG(b);
+         }else if(a.equals("k") || a.equals("bbmapkmer")){
+             bbmapkmer=Integer.parseInt(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+         }else if(a.equals("n_")){
+             N_UNDERSCORE=Tools.parseBoolean(b);
+         }else if(a.equals("header") || a.equals("useheader")){
+             useheader=Tools.parseBoolean(b);
+         }else if(a.equals("addfilename") || a.equals("addname")){
+             addfilename=Tools.parseBoolean(b);
+         }else if(a.equals("minscaf") || a.equals("mincontig") || a.equals("minlen") || a.equals("min")){
+             MINSCAF_=Integer.parseInt(b);
+         }else if(a.equals("showspeed") || a.equals("ss")){
+             showspeed=Tools.parseBoolean(b);
+         }else if(a.equals("printheadersize") || a.equals("phs")){
+             printheadersize=Tools.parseBoolean(b);
+         }else if(a.equals("skipduplicatelines") || a.equals("sdl")){
+             skipDuplicateLines=Tools.parseBoolean(b);
+         }else if(a.equals("printduplicatelines") || a.equals("pdl")){
+             skipDuplicateLines=!Tools.parseBoolean(b);
+         }else if(a.equals("showbbmap")){
+             if(!Tools.parseBoolean(b)){bbmapkmer=0;}
+         }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){
+             maxNs=Integer.parseInt(b);
+         }else if(i>0 && (a.equals("n") || a.equals("-n")) && b!=null){
+             maxNs=Integer.parseInt(b);
+         }else if(in==null && i==0 && !arg.contains("=")){
+             //A bare first argument is taken as the input file name.
+             in=arg;
+         }else{
+             throw new RuntimeException("Unknown parameter "+arg);
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+     }
+
+     minScaffold=MINSCAF_;
+     gcbins=GCBINS_;
+
+     //When no positive decimal count was given, choose one from the bin count.
+     if(gchistdecimals_<1){
+         if(gcbins==2 || gcbins==5 || gcbins==10){
+             gchistdecimals_=1;
+         }else if(gcbins==20 || gcbins==25 || gcbins==50 || gcbins==100){
+             gchistdecimals_=2;
+         }
+         if(gcbins>1000 && gchistdecimals_<4){gchistdecimals_=4;}
+         if(gcbins>10000 && gchistdecimals_<5){gchistdecimals_=5;}
+     }
+     gchistDecimals1=gchistdecimals_;
+
+     if(maxNs<0){maxNs=10;}
+
+     if(out==null || out.equalsIgnoreCase("stdout") || out.equalsIgnoreCase("standardout")){out=null;}
+
+     clist=new LongList((int)Tools.min(1<<15, cutoff+1)); //Number of contigs of length x
+     slist=new LongList((int)Tools.min(1<<15, cutoff+1)); //Number of scaffolds of length x
+     sclist1=new LongList((int)Tools.min(1<<15, cutoff+1)); //Sum of contigs per scaffold of length x
+     sclist2=new LongList((int)Tools.min(1<<15, cutoff+1)); //Sum of contig lengths per scaffold of length x
+
+     llist=new LongList(64); //List of contig lengths for contigs at least cutoff in length
+     tlist=new ArrayList<Triple>(64); //List of scaf len, contigs, contig sum for scaffolds at least cutoff in length
+
+     gcbins2=(gcbins>=1000 ? gcbins : gcbins*10);
+
+     gchistArray=new long[gcbins2];
+     gchist_by_base=new long[gcbins2];
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ public void process(){
+ Timer t=new Timer();
+
+ InputStream is=null;
+ {
+ if(in==null){throw new RuntimeException("No input file.");}
+ if(in.equalsIgnoreCase("stdin") || in.equalsIgnoreCase("standardin")){
+ is=System.in;
+ }else{
+ File f=new File(in);
+ if((!f.exists() || f.isDirectory()) && !in.toLowerCase().startsWith("stdin")){
+ throw new RuntimeException("Input file does not appear to be valid: "+in);
+ }
+ }
+ }
+
+ long[] counts=null;
+ long sum=0;
+
+ boolean fastqMode=false;
+ if(is!=System.in){
+ FileFormat ff=null;
+ try {
+ ff=FileFormat.testInput(in, FileFormat.FA, null, false, true, true);
+ } catch (Throwable e) {
+ //Ignore
+ }
+ if(ff!=null){
+ fastqMode=ff.fastq();
+ }
+// assert(ff==null || (!ff.fastq())) : "AssemblyStats only supports fasta files. To override this message, use the -da flag.";
+ }
+
+ if(is==null){is=ReadWrite.getInputStream(in, false, true);}
+ try {
+ if(benchmark){sum=bench(is);}
+ else{
+ if(fastqMode){
+ counts=countFastq(is, gc);
+ }else{
+ counts=countFasta(is, gc);
+ }
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ try {
+ if(is!=System.in){is.close();}
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ if(tlist!=null && tlist.size()>0){Collections.sort(tlist);}
+ if(llist!=null && llist.size>0){Arrays.sort(llist.array, 0, llist.size);}
+
+ t.stop();
+
+ if(benchmark){
+ printBenchResults(t, counts, sum, in);
+ }else{
+// System.err.println("\nclist="+clist+"\nslist="+slist+"\nsclist1="+sclist1+"\nsclist2="+sclist2+"\nllist="+llist+"\ntlist="+tlist+"\n"); //***
+// System.err.println("\nclist.size="+clist.size+"\nslist.size="+slist.size+"\nsclist1.size="+sclist1.size+"\nsclist2.size="+sclist2.size+"\nllist.size="+llist.size+"\ntlist.size()="+tlist.size()+"\n"); //***
+ printResults(t, counts, sum, gc_std, in, clist, slist, sclist1, sclist2, llist, tlist, out);
+// System.err.println("Printed results to "+out);
+ writeHistFile(scaffoldHistFile, slist, tlist, false);
+
+ if(gchistFile!=null){printGCHist(gchistFile);}
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Streams a fasta file once, byte by byte, and accumulates per-scaffold and
+  * overall statistics into the instance histograms (clist, slist, sclist1,
+  * sclist2, llist, tlist, gchistArray, gchist_by_base).
+  *
+  * A scaffold is the sequence under one header. It is split into contigs
+  * wherever {@code maxNs} consecutive symbols map to code 5 in charToNum.
+  * Scaffolds shorter than {@code minScaffold} are ignored.
+  *
+  * Side effects: sets LIMSUM, HEADERLENSUM, gc_std, gc_bb_std and the two
+  * downsampled GC histograms; writes one line per scaffold to {@code gcout}
+  * when it is non-null.
+  *
+  * @param is    Raw (already decompressed) fasta bytes.
+  * @param gcout Destination for per-scaffold base content, or null for none.
+  * @return Symbol counts summed over all reported scaffolds (8 classes, indexed by charToNum code).
+  * @throws IOException if reading {@code is} fails.
+  */
+ public long[] countFasta(final InputStream is, String gcout) throws IOException{
+
+     long limsum=0;
+     long headerlen=0;
+     final byte[] buf=new byte[32768];
+     final TextStreamWriter tswgc=(gcout==null ? null : new TextStreamWriter(gcout, overwrite, false, false));
+     if(tswgc!=null){
+         tswgc.start();
+         if(GCFORMAT==0 || GCFORMAT==1){
+             tswgc.println("#Name\tLength\tA\tC\tG\tT\tN\tIUPAC\tOther\tGC");
+         }else if(GCFORMAT==2){
+             tswgc.println("#Name\tGC");
+         }else if(GCFORMAT==3){
+             tswgc.println("#Name\tLength\tA\tC\tG\tT\tN\tIUPAC\tOther\tGC");
+         }else if(GCFORMAT==4){
+             tswgc.println("#Name\tLength\tGC");
+         }else{
+             throw new RuntimeException("Unknown format.");
+         }
+     }
+     final long[] counts=new long[8];   //Symbol counts of the scaffold being read
+     final long[] overall=new long[8];  //Symbol counts of all reported scaffolds
+     final StringBuilder hdr=(gcout==null ? null : new StringBuilder());
+     boolean hdmode=false;
+
+     int i=0;
+     int lim=is.read(buf);
+     if(lim>0){limsum+=lim;} //read() returns -1 at end of stream; that sentinel is not a byte count
+
+     int contigs=0;
+     int contiglen=0;
+     int scaffoldlen=0;
+     int ns=0;
+
+     final IntList currentContigs=new IntList(10000);
+
+     while(lim>0){
+         if(hdmode){//Scan to end of header.
+             while(i<lim){
+                 final byte c=buf[i];
+                 i++;
+                 if(c<=slashr){
+                     hdmode=false;
+                     contiglen=0;
+                     scaffoldlen=0;
+                     ns=0;
+                     contigs=0;
+                     break;
+                 }
+                 if(hdr==null){headerlen++;}//Header is not being stored, only measured
+                 else{hdr.append((char)c);}
+             }
+         }
+
+         if(!hdmode){//Scan bases
+             while(i<lim){
+                 final byte c=buf[i];
+                 final byte cnum=charToNum[c];
+                 i++;
+
+                 if(c==carrot){//Start of a new header
+                     hdmode=true;
+                     headerlen+=finishFastaScaffold(currentContigs, contigs, contiglen, scaffoldlen, counts, overall, hdr, tswgc);
+                     //Reset now rather than only at header end, so that a file ending inside
+                     //a header line cannot report the previous scaffold a second time.
+                     contiglen=0;
+                     scaffoldlen=0;
+                     ns=0;
+                     contigs=0;
+                     break;
+                 }
+
+                 if(c>slashr){
+                     counts[cnum]++;
+                     scaffoldlen++;
+
+                     if(cnum!=5){
+                         ns=0;
+                         contiglen++;
+                     }else{
+                         ns++;
+                         if(ns==maxNs && contiglen>0){//Gap is long enough to end the current contig
+                             currentContigs.set(contigs, contiglen);
+                             contiglen=0;
+                             contigs++;
+                         }
+                     }
+                 }
+             }
+         }
+         if(i>=lim){
+             i=0;
+             lim=is.read(buf);
+             if(lim>0){limsum+=lim;}
+         }
+     }
+
+     //The last scaffold has no following header to trigger it.
+     headerlen+=finishFastaScaffold(currentContigs, contigs, contiglen, scaffoldlen, counts, overall, hdr, tswgc);
+
+     if(tswgc!=null){
+         if(tswgc.fname.equalsIgnoreCase("stdout") || tswgc.fname.startsWith("stdout.")){
+             if(FORMAT>0 && (out==null || out.equalsIgnoreCase("stdout") || out.startsWith("stdout."))){
+                 tswgc.print("\n");
+             }
+         }
+         tswgc.poison();
+         tswgc.waitForFinish();
+     }
+     LIMSUM=limsum;
+     HEADERLENSUM=headerlen;
+
+
+     gc_std=Tools.standardDeviationHistogram(gchistArray)/gcbins2;
+     gchistArray_downsampled=Tools.downsample(gchistArray, gcbins);
+
+     gc_bb_std=Tools.standardDeviationHistogram(gchist_by_base)/gcbins2;
+     gchist_by_base_downsampled=Tools.downsample(gchist_by_base, gcbins);
+
+     return overall;
+ }
+
+ /**
+  * Records one finished fasta scaffold in the length and GC histograms, then
+  * clears the per-scaffold state ({@code counts} and {@code hdr}).
+  * Does nothing but the clearing when the scaffold is empty or shorter than minScaffold.
+  *
+  * @param currentContigs Lengths of the contigs already closed in this scaffold.
+  * @param contigs        Number of valid entries in {@code currentContigs}.
+  * @param contiglen      Length of the still-open trailing contig (may be 0).
+  * @param scaffoldlen    Total symbols in the scaffold, gaps included.
+  * @param counts         Symbol counts of this scaffold; zeroed on return.
+  * @param overall        Running totals; receives {@code counts} when the scaffold is reported.
+  * @param hdr            Stored header text, or null when headers are not stored; emptied on return.
+  * @param tswgc          Per-scaffold output; only used when {@code hdr} is non-null.
+  * @return Number of header characters of the reported scaffold (0 if not stored or not reported).
+  */
+ private long finishFastaScaffold(final IntList currentContigs, int contigs, final int contiglen, final int scaffoldlen,
+         final long[] counts, final long[] overall, final StringBuilder hdr, final TextStreamWriter tswgc){
+     long headerChars=0;
+     if(scaffoldlen>0){//if the scaffold was not blank
+
+         //Close the trailing contig. A scaffold with no contig at all still counts as one (zero-length) contig.
+         if(contiglen>0 || contigs==0){
+             currentContigs.set(contigs, contiglen);
+             contigs++;
+         }
+
+         if(scaffoldlen>=minScaffold){
+
+             int contiglensum=0;
+             for(int j=0; j<contigs; j++){
+                 final int cl=currentContigs.get(j);
+                 if(cl>0){
+                     contiglensum+=cl;
+                     if(cl<cutoff){
+                         clist.increment(cl, 1);
+                     }else{
+                         llist.add(cl);
+                     }
+                 }
+             }
+
+             if(scaffoldlen<cutoff){
+                 slist.increment(scaffoldlen, 1);
+                 sclist1.increment(scaffoldlen, contigs);
+                 sclist2.increment(scaffoldlen, contiglensum);
+             }else{
+                 tlist.add(new Triple(scaffoldlen, contigs, contiglensum));
+             }
+
+             if(hdr!=null){
+                 //Measure first: toString2 appends the statistics text to hdr itself.
+                 headerChars=hdr.length();
+                 tswgc.print(toString2(hdr, counts));
+             }
+
+             final long gcCount=counts[1]+counts[2];
+             final long acgt=gcCount+counts[0]+counts[3];
+             if(acgt>0){
+                 int index=Tools.min((int)((gcCount*gcbins2)/acgt),gcbins2-1);
+                 gchistArray[index]++;
+                 gchist_by_base[index]+=scaffoldlen;
+             }
+
+             for(int j=0; j<counts.length; j++){
+                 overall[j]+=counts[j];
+             }
+         }
+     }
+     //Always clear, so a record without sequence cannot leak its name into the next one.
+     Arrays.fill(counts, 0);
+     if(hdr!=null){hdr.setLength(0);}
+     return headerChars;
+ }
+
+ /**
+  * Streams a fastq file once and accumulates the same statistics as the fasta
+  * counter, treating every read as one scaffold. Records are assumed to be
+  * exactly four lines: {@code line} cycles 0 (header), 1 (bases), 2 and 3
+  * (skipped).
+  *
+  * Side effects: updates the instance length/GC histograms and sets LIMSUM,
+  * HEADERLENSUM, gc_std, gc_bb_std and the two downsampled GC histograms;
+  * writes one line per read to {@code gcout} when it is non-null.
+  *
+  * @param is    Raw (already decompressed) fastq bytes.
+  * @param gcout Destination for per-read base content, or null for none.
+  * @return Symbol counts summed over all reported reads (8 classes, indexed by charToNum code).
+  * @throws IOException if reading {@code is} fails.
+  */
+ public long[] countFastq(final InputStream is, String gcout) throws IOException{
+
+     long limsum=0;
+     long headerlen=0;
+     final byte[] buf=new byte[32768];
+     final TextStreamWriter tswgc=(gcout==null ? null : new TextStreamWriter(gcout, overwrite, false, false));
+     if(tswgc!=null){
+         tswgc.start();
+         if(GCFORMAT==0 || GCFORMAT==1){
+             tswgc.println("#Name\tLength\tA\tC\tG\tT\tN\tIUPAC\tOther\tGC");
+         }else if(GCFORMAT==2){
+             tswgc.println("#Name\tGC");
+         }else if(GCFORMAT==3){
+             tswgc.println("#Name\tLength\tA\tC\tG\tT\tN\tIUPAC\tOther\tGC");
+         }else if(GCFORMAT==4){
+             tswgc.println("#Name\tLength\tGC");
+         }else{
+             throw new RuntimeException("Unknown format.");
+         }
+     }
+     final long[] counts=new long[8];   //Symbol counts of the read being parsed
+     final long[] overall=new long[8];  //Symbol counts of all reported reads
+     final StringBuilder hdr=(gcout==null ? null : new StringBuilder());
+     int line=0;
+
+     int i=0;
+     int lim=is.read(buf);
+     if(lim>0){limsum+=lim;} //read() returns -1 at end of stream; that sentinel is not a byte count
+
+     int contigs=0;
+     int contiglen=0;
+     int scaffoldlen=0;
+     int ns=0;
+
+     final IntList currentContigs=new IntList(10000);
+
+     while(lim>0){
+         if(line==0){//Scan to end of header.
+             while(i<lim){
+                 final byte c=buf[i];
+                 i++;
+                 if(c<=slashr){
+                     line++;
+                     contiglen=0;
+                     scaffoldlen=0;
+                     ns=0;
+                     contigs=0;
+                     break;
+                 }
+                 if(hdr==null){headerlen++;}//Header is not being stored, only measured
+                 else{hdr.append((char)c);}
+             }
+         }
+
+         if(line==1){//Scan bases
+             while(i<lim){
+                 final byte c=buf[i];
+                 final byte cnum=charToNum[c];
+                 i++;
+
+                 if(c<=slashr){//End of the bases line: finish the read
+                     line=(line+1)&3;
+                     if(scaffoldlen>0){//if the read was not blank
+
+                         //Close the trailing contig. A read with no contig at all still counts as one (zero-length) contig.
+                         if(contiglen>0 || contigs==0){
+                             currentContigs.set(contigs, contiglen);
+                             contigs++;
+                         }
+
+                         if(scaffoldlen>=minScaffold){
+
+                             int contiglensum=0;
+                             for(int j=0; j<contigs; j++){
+                                 final int cl=currentContigs.get(j);
+                                 if(cl>0){
+                                     contiglensum+=cl;
+                                     if(cl<cutoff){
+                                         clist.increment(cl, 1);
+                                     }else{
+                                         llist.add(cl);
+                                     }
+                                 }
+                             }
+
+                             if(scaffoldlen<cutoff){
+                                 slist.increment(scaffoldlen, 1);
+                                 sclist1.increment(scaffoldlen, contigs);
+                                 sclist2.increment(scaffoldlen, contiglensum);
+                             }else{
+                                 tlist.add(new Triple(scaffoldlen, contigs, contiglensum));
+                             }
+
+                             if(hdr!=null){
+                                 //Measure first: toString2 appends the statistics text to hdr itself.
+                                 final int nameLen=hdr.length();
+                                 tswgc.print(toString2(hdr, counts));
+                                 headerlen+=nameLen;
+                             }
+
+                             final long gcCount=counts[1]+counts[2];
+                             final long acgt=gcCount+counts[0]+counts[3];
+                             if(acgt>0){
+                                 int index=Tools.min((int)((gcCount*gcbins2)/acgt),gcbins2-1);
+                                 gchistArray[index]++;
+                                 gchist_by_base[index]+=scaffoldlen;
+                             }
+
+                             for(int j=0; j<counts.length; j++){
+                                 overall[j]+=counts[j];
+                             }
+                         }
+                     }
+                     //Always clear, so a read without bases cannot leak its name into the next one.
+                     Arrays.fill(counts, 0);
+                     if(hdr!=null){hdr.setLength(0);}
+
+                     break;
+                 }
+
+                 //Only symbols above slashr reach this point.
+                 counts[cnum]++;
+                 scaffoldlen++;
+
+                 if(cnum!=5){
+                     ns=0;
+                     contiglen++;
+                 }else{
+                     ns++;
+                     if(ns==maxNs && contiglen>0){//Gap is long enough to end the current contig
+                         currentContigs.set(contigs, contiglen);
+                         contiglen=0;
+                         contigs++;
+                     }
+                 }
+             }
+         }
+
+         if(i>=lim){
+             i=0;
+             lim=is.read(buf);
+             if(lim>0){limsum+=lim;}
+         }
+
+         //Skip the third and fourth lines of the record; line wraps back to 0 after the fourth.
+         assert(line>1 || lim<=i || i==0) : line+", "+i+", "+lim;
+         while(i<lim && line>1){
+             final byte c=buf[i];
+             i++;
+             if(c<=slashr){
+                 line=(line+1)&3;
+             }
+         }
+     }
+
+     if(tswgc!=null){
+         if(tswgc.fname.equalsIgnoreCase("stdout") || tswgc.fname.startsWith("stdout.")){
+             if(FORMAT>0 && (out==null || out.equalsIgnoreCase("stdout") || out.startsWith("stdout."))){
+                 tswgc.print("\n");
+             }
+         }
+         tswgc.poison();
+         tswgc.waitForFinish();
+     }
+     LIMSUM=limsum;
+     HEADERLENSUM=headerlen;
+
+
+     gc_std=Tools.standardDeviationHistogram(gchistArray)/gcbins2;
+     gchistArray_downsampled=Tools.downsample(gchistArray, gcbins);
+
+     gc_bb_std=Tools.standardDeviationHistogram(gchist_by_base)/gcbins2;
+     gchist_by_base_downsampled=Tools.downsample(gchist_by_base, gcbins);
+
+     return overall;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printGCHist(String gchistFile){
+ if(!Tools.canWrite(gchistFile, overwrite)){
+ System.err.println("Can't write gc histogram because file exists and overwrite="+overwrite);
+ assert(false);
+ }else{
+ long gchistFilesum=Tools.sum(gchistArray_downsampled);
+ long gchistFilesumbb=Tools.sum(gchist_by_base_downsampled);
+ double invsum=(gchistFilesum==0 ? 0 : 1.0/gchistFilesum);
+ double invsumbb=(gchistFilesum==0 ? 0 : 1.0/gchistFilesumbb);
+ double invbins=1.0/(gcbins==0 ? 1 : gcbins);
+// assert(false) : Arrays.toString(gchistArray);
+ StringBuilder sb=new StringBuilder();
+ sb.append(String.format("#GC\tscaffolds\tfraction\tlength\tlen_fraction\n"));
+ for(int i=0; i<gcbins; i++){
+ sb.append(String.format("%."+gchistDecimals1+"f\t%d\t%.5f\t%d\t%.5f\n",
+ i*invbins, gchistArray_downsampled[i], gchistArray_downsampled[i]*invsum, gchist_by_base_downsampled[i], gchist_by_base_downsampled[i]*invsumbb));
+ }
+ if(gchistFile.equalsIgnoreCase("stdout")){
+ System.out.println(sb);
+ }else{
+ ReadWrite.writeString(sb, gchistFile);
+ }
+ }
+ }
+
+
+ /**
+  * Reports benchmark throughput on standard error: elapsed time, speed
+  * relative to the on-disk file size, and speed relative to the number of
+  * bytes actually read.
+  *
+  * @param t      Timer that was stopped after the read finished.
+  * @param counts Unused in benchmark mode.
+  * @param sum    Number of bytes read from the (decompressed) stream.
+  * @param in     Input path; when its file length is below 1 the LIMSUM field is used instead.
+  */
+ public static void printBenchResults(Timer t, long[] counts, long sum, String in){
+     System.err.println("Time: \t"+t);
+
+     final long onDisk=new File(in).length();
+     final long rawBytes=(onDisk<1 ? LIMSUM : onDisk);
+
+     final double rawRate=rawBytes*1000d/t.elapsed;
+     final double streamRate=sum*1000d/t.elapsed;
+
+     final String rawLine=String.format("Raw Speed: \t%.2f MBytes/s",rawRate);
+     final String streamLine=String.format("Uncompressed Speed:\t%.2f MBytes/s",streamRate);
+     System.err.println(rawLine);
+     System.err.println(streamLine);
+ }
+
+
+ public static void printResults(Timer t, long[] counts, long sum, double gc_std, String in, LongList clist, LongList slist, LongList sclist1, LongList sclist2,
+ LongList llist, ArrayList<Triple> tlist, String out){
+
+ String name=in;
+ if(in!=null && !in.toLowerCase().startsWith("stdin")){
+ try {
+ File f=new File(in);
+ name=f.getCanonicalPath();
+ } catch (IOException e) {}
+ }
+
+ long contigs=0;
+ long scaffolds=0;
+ long contiglen=0;
+ long scaflen=0;
+ long contigs1;
+ long contiglen2;
+ long maxScaf=0, maxContig=0;
+ long[] carray=clist.array;
+ long[] sarray=slist.array;
+ long[] scarray1=sclist1.array;
+ long[] scarray2=sclist2.array;
+
+ long[] larray=llist.array;
+
+ StringBuilder sb=new StringBuilder(), sb2=new StringBuilder();
+
+ for(int i=0; i<carray.length; i++){
+ long x=carray[i];
+ if(x>0){
+ contigs+=x;
+ contiglen+=(x*i);
+ maxContig=i;
+ }
+ }
+
+ for(int i=0; i<sarray.length; i++){
+ long x=sarray[i];
+ if(x>0){
+ scaffolds+=x;
+ scaflen+=(x*i);
+ maxScaf=i;
+ }
+ }
+
+ contigs+=llist.size;
+ for(int i=0; i<llist.size; i++){
+ long x=larray[i];
+ assert(x>0);
+ contiglen+=x;
+ maxContig=Tools.max(maxContig, x);
+ }
+
+ scaffolds+=tlist.size();
+ for(Triple tp : tlist){
+ scaflen+=tp.length;
+ maxScaf=Tools.max(maxScaf, tp.length);
+ }
+
+ if(FORMAT<3){
+ sb.append("Main genome scaffold total: \t"+scaffolds+"\n");
+ sb.append("Main genome contig total: \t"+contigs+"\n");
+ }else if(FORMAT==7){
+ sb.append("Main genome contig total: \t"+contigs+"\n");
+ }
+
+ if(FORMAT==0){
+ }else if(FORMAT==1){
+ sb.append("Main genome scaffold sequence total:\t"+String.format("%.3f MB",scaflen/1000000f)+"\n");
+ sb.append("Main genome contig sequence total: \t"+String.format("%.3f MB \t%.3f%% gap",contiglen/1000000f,(scaflen-contiglen)*100f/scaflen)+"\n");
+ }else if(FORMAT==2){
+ sb.append("Main genome scaffold sequence total:\t"+scaflen+"\n");
+ sb.append("Main genome contig sequence total: \t"+String.format("%d \t%.3f%% gap",contiglen,(scaflen-contiglen)*100f/scaflen)+"\n");
+ }else if(FORMAT==3 || FORMAT==6){
+
+ }else if(FORMAT==4){
+
+ }else if(FORMAT==5){
+
+ }else if(FORMAT==7){
+ sb.append("Main genome contig sequence total: \t"+String.format("%.3f MB",contiglen/1000000f)+"\n");
+ }else{throw new RuntimeException("Unknown format");}
+
+ if(FORMAT<3){
+ sb2.append("\n");
+ sb2.append("Minimum \tNumber \tNumber \tTotal \tTotal \tScaffold\n");
+ sb2.append("Scaffold\tof \tof \tScaffold \tContig \tContig \n");
+ sb2.append("Length \tScaffolds \tContigs \tLength \tLength \tCoverage\n");
+ sb2.append("--------\t--------------\t--------------\t--------------\t--------------\t--------\n");
+ }else if(FORMAT==7){
+ sb2.append("\n");
+ sb2.append("Minimum \tNumber \tTotal \n");
+ sb2.append("Contig \tof \tContig \n");
+ sb2.append("Length \tContigs \tLength \n");
+ sb2.append("--------\t--------------\t--------------\n");
+ }
+
+ final int[] lims;
+
+ if(FORMAT==7){
+ int minScaf=-1;
+ for(int i=0; i<carray.length && minScaf<0; i++){
+ long x=sarray[i];
+ if(x>0){minScaf=i;}
+ }
+ if(minScaf<0 && !tlist.isEmpty()){
+ minScaf=(int)Tools.min(tlist.get(0).length, 250000000);
+ }
+ if(minScaf<1){minScaf=1;}
+ int[] temp=new int[] {0, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000,
+ 250000, 500000, 1000000, 2500000, 5000000, 10000000, 25000000, 50000000, 100000000, 250000000};
+ ArrayList<Integer> tempList=new ArrayList<Integer>();
+ tempList.add(0);
+ for(int i=1; i<temp.length; i++){
+ int x=temp[i];
+ int prev=temp[i-1];
+ if(x>=minScaf || i==temp.length-1){
+ if(prev<minScaf){
+ tempList.add(minScaf);
+ }else{
+ tempList.add(x);
+ }
+ }
+ }
+ lims=new int[tempList.size()];
+ for(int i=0; i<lims.length; i++){
+ lims[i]=tempList.get(i);
+ }
+ }else{
+ lims=new int[] {0, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000,
+ 250000, 500000, 1000000, 2500000, 5000000, 10000000, 25000000, 50000000, 100000000, 250000000};
+ }
+
+ int lidx=0;
+ int next=0;
+ long csum=contigs;
+ long ssum=scaffolds;
+ long clen=contiglen;
+ long slen=scaflen;
+
+ long cn50=-1;
+ long ln50=-1;
+ long cl50=-1;
+ long ll50=-1;
+
+ long shalf=slen/2;
+ long chalf=clen/2;
+
+ final int numOverCutoff=(FORMAT==7 ? 1000 : 50000);
+ final int numOverCutoffPlusOne=numOverCutoff+1;
+ long numOver50=0;
+ float fractionOver50=0;
+
+
+ //Disable printing of 50~500 when not needed
+// {
+// boolean b=true;
+// for(int i=0; i<500 && i<sarray.length && b; i++){
+// b=(sarray[i]==0);
+// }
+// if(b){
+// lidx=Arrays.binarySearch(lims, 1000)-1;
+// lims[lidx]=0;
+// }
+// }
+ if(skipDuplicateLines){
+ int maxZero=-1;
+ for(int i=0; i<sarray.length && sarray[i]==0; i++){
+ maxZero=i;
+ }
+ if(maxZero>=50){
+ for(int i=1; i<lims.length; i++){
+ if(lims[i]<=maxZero){
+ lidx=i-1;
+ lims[lidx]=0;
+ }else{break;}
+ }
+ }
+ }
+
+ final int lim=slist.size;
+ assert(lim<=sarray.length);
+
+ {
+ //These two loops generate the scaffold table
+ long prevSsum=-1, prevCsum=-1, prevSlen=-1, prevClen=-1;
+ String prevLine=null;
+ for(int i=0; i<lim && slen>0; i++){
+ // System.out.println("\n<A1>\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen);
+
+ if(i==next){
+ prevLine=formatX(next, ssum, csum, slen, clen);
+// if(!skipDuplicateLines || ssum!=prevSsum || csum!=prevCsum || slen!=prevSlen || clen!=prevClen){
+ if(prevLine!=null){
+ sb2.append(prevLine);
+ sb2.append('\n');
+ prevLine=null;
+ }
+// }
+ prevSsum=ssum; prevCsum=csum; prevSlen=slen; prevClen=clen;
+
+ lidx++;
+ if(lidx<lims.length){next=lims[lidx];}
+ else{next=-1;}
+ // System.out.println("<A2>\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen);
+ // System.out.prontln(sb2);
+ }
+ // System.out.println("<A3>\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen);
+
+ if(i==numOverCutoffPlusOne){
+ numOver50=ssum;
+ fractionOver50=slen*100f/scaflen;
+ }
+
+ // long a=carray[i];
+ long b=sarray[i];
+ long c=scarray1[i];
+ long d=scarray2[i];
+
+ if(b>0){
+ csum-=c;
+ ssum-=b;
+ clen-=d;
+ slen-=(b*i);
+ }
+
+ if(ln50==-1 && slen<=shalf){
+ ln50=i;
+ ll50=ssum+b;
+ }
+ // System.out.println("<A4>\tb="+b+", c="+c+", d="+d+", csum="+csum+", ssum="+ssum+", clen="+clen+", slen="+slen);
+
+ // System.out.println("<A5>\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen);
+
+ }
+
+ for(Triple tp : tlist){
+ // assert(false) : tlist;
+ while(tp.length>=next && lidx<lims.length){
+
+ prevLine=formatX(next, ssum, csum, slen, clen);
+// if(!skipDuplicateLines || ssum!=prevSsum || csum!=prevCsum || slen!=prevSlen || clen!=prevClen){
+ if(prevLine!=null){
+ sb2.append(prevLine);
+ sb2.append('\n');
+ prevLine=null;
+ }
+// }
+ prevSsum=ssum; prevCsum=csum; prevSlen=slen; prevClen=clen;
+
+ lidx++;
+ if(lidx<lims.length){next=lims[lidx];}
+ // System.out.println("<B>\n"+sb2+"\ni="+"?"+", lidx="+lidx+", lims.length="+lims.length+", next="+next);
+ // else{next=-1;}
+ }
+
+ if(numOver50==0 && tp.length>numOverCutoff){
+ numOver50=ssum;
+ fractionOver50=slen*100f/scaflen;
+ }
+
+ // long a=carray[i];
+ long b=tp.length;
+ long c=tp.contigs;
+ long d=tp.contiglen;
+
+ if(b>0){
+ csum-=c;
+ ssum-=1;
+ clen-=d;
+ slen-=b;
+ }
+
+ if(ln50==-1 && slen<=shalf){
+ ln50=b;
+ ll50=ssum+1;
+ }
+
+ }
+ if(prevLine!=null){
+ sb2.append(prevLine);
+ prevLine=null;
+ }
+ }
+
+ clen=contiglen;
+ csum=contigs;
+ for(int i=0; i<carray.length && clen>0; i++){
+ long a=carray[i];
+
+ csum-=a;
+ clen-=a*i;
+
+ if(cn50==-1 && clen<=chalf){
+ cn50=i;
+ cl50=csum+a;
+ }
+ }
+
+ for(int i=0; i<llist.size && clen>0; i++){
+ long a=larray[i];
+
+ csum-=1;
+ clen-=a;
+
+ if(cn50==-1 && clen<=chalf){
+ cn50=a;
+ cl50=csum+1;
+ }
+ }
+
+ cn50=Tools.max(cn50, 0);
+ ln50=Tools.max(ln50, 0);
+ cl50=Tools.max(cl50, 0);
+ ll50=Tools.max(ll50, 0);
+
+// ByteStreamWriter tsw=new ByteStreamWriter((out==null ? "stdout" : out) , overwrite, append, false);
+ TextStreamWriter tsw=new TextStreamWriter((out==null ? "stdout" : out) , overwrite, append, false);
+ tsw.start();
+
+ lastL50=ln50;
+ lastSize=contiglen;
+ lastContigs=contigs;
+ lastMaxContig=maxContig;
+
+ if(FORMAT<1){
+ //Do nothing
+ }else if(FORMAT<3){
+
+ if(addfilename){sb.append("Filename: \t"+name+"\n");}
+ sb.append("Main genome scaffold N/L50: \t"+ll50+"/"+formatKB(ln50, 3, 0)+"\n");
+ sb.append("Main genome contig N/L50: \t"+cl50+"/"+formatKB(cn50, 3, 0)+"\n");
+ sb.append("Max scaffold length: \t"+formatKB(maxScaf, 3, 0)+"\n");
+ sb.append("Max contig length: \t"+formatKB(maxContig, 3, 0)+"\n");
+ sb.append("Number of scaffolds > 50 KB: \t"+numOver50+"\n");
+ sb.append("% main genome in scaffolds > 50 KB: \t"+String.format("%.2f%%", fractionOver50)+"\n");
+ if(printheadersize){sb.append("Header:\t"+formatKB(HEADERLENSUM, 3, 0)+(HEADERLENSUM<1000 ? " bytes" : ""));}
+
+ // System.out.println();
+ // System.out.println("Scaffolds: "+Tools.sum(slist.array));
+ // for(int i=0; i<slist.size; i++){
+ // if(slist.array[i]>0){System.out.print(i+":"+slist.array[i]+", ");}
+ // }
+ // System.out.println();
+ // System.out.println("Contigs:"+Tools.sum(clist.array));
+ // for(int i=0; i<clist.size; i++){
+ // if(clist.array[i]>0){System.out.print(i+":"+clist.array[i]+", ");}
+ // }
+
+ if(GCFORMAT==0){
+ //Print nothing
+ }else{
+ if(GCFORMAT==1 || GCFORMAT==3 || GCFORMAT==4){
+ tsw.println("A\tC\tG\tT\tN\tIUPAC\tOther\tGC\tGC_stdev");
+ }else{
+ tsw.println("GC\tGC_stdev");
+ }
+ tsw.println(toString3(new StringBuilder(/*"Base Content"*/), counts, gc_std));
+ }
+
+ tsw.println(sb);
+ tsw.println(sb2);
+ }else if(FORMAT==3 || FORMAT==6){
+
+ if(useheader){
+ if(FORMAT==6){sb.append('#');}
+ if(N_UNDERSCORE){sb.append("n_");}
+ sb.append("scaffolds\t");
+ if(N_UNDERSCORE){sb.append("n_");}
+ sb.append("contigs\t");
+ sb.append("scaf_bp\t");
+ sb.append("contig_bp\t");
+ sb.append("gap_pct\t");
+ sb.append("scaf_N50\t");
+ sb.append("scaf_L50\t");
+ sb.append("ctg_N50\t");
+ sb.append("ctg_L50\t");
+ sb.append("scaf_max\t");
+ sb.append("ctg_max\t");
+ sb.append("scaf_n_gt50K\t");
+ sb.append("scaf_pct_gt50K\t");
+ sb.append("gc_avg\t");
+ sb.append("gc_std");
+ if(addfilename){sb.append("\tfilename");}
+
+ sb.append("\n");
+ }
+
+ sb.append(scaffolds+"\t");
+ sb.append(contigs+"\t");
+ sb.append(scaflen+"\t");
+ sb.append(String.format("%d",contiglen)+"\t");
+ sb.append(String.format("%.3f",(scaflen-contiglen)*100f/scaflen)+"\t");
+ sb.append(ll50+"\t");
+ sb.append(formatKB(ln50, 3, 0)+"\t");
+ sb.append(cl50+"\t");
+ sb.append(formatKB(cn50, 3, 0)+"\t");
+ sb.append(formatKB(maxScaf, 3, 0)+"\t");
+ sb.append(formatKB(maxContig, 3, 0)+"\t");
+ sb.append(numOver50+"\t");
+ sb.append(String.format("%.3f", fractionOver50)+"\t");
+ sb.append(String.format("%.5f", (counts[1]+counts[2])*1.0/(counts[0]+counts[1]+counts[2]+counts[3]))+"\t");
+ sb.append(String.format("%.5f", gc_std));
+ if(addfilename){sb.append('\t').append(name);}
+
+ tsw.println(sb);
+ }else if(FORMAT==4){
+
+ if(useheader){
+
+ if(N_UNDERSCORE){sb.append("n_");}
+ sb.append("scaffolds\t");
+// sb.append("contigs\t");
+ sb.append("scaf_bp\t");
+// sb.append("contig_bp\t");
+// sb.append("gap_pct\t");
+ sb.append("scaf_N50\t");
+ sb.append("scaf_L50\t");
+// sb.append("ctg_N50\t");
+// sb.append("ctg_L50\t");
+ sb.append("scaf_max\t");
+// sb.append("ctg_max\t");
+ sb.append("scaf_n_gt50K\t");
+ sb.append("scaf_pct_gt50K");
+// sb.append("gc_avg");
+// sb.append("gc_std");
+ if(addfilename){sb.append("\tfilename");}
+
+ sb.append("\n");
+ }
+
+ sb.append(scaffolds+"\t");
+// sb.append(contigs+"\t");
+ sb.append(scaflen+"\t");
+// sb.append(String.format("%d",contiglen)+"\t");
+// sb.append(String.format("%.3f",(scaflen-contiglen)*100f/scaflen)+"\t");
+ sb.append(ll50+"\t");
+ sb.append(formatKB(ln50, 3, 0)+"\t");
+// sb.append(cl50+"\t");
+// sb.append(formatKB(cn50, 3, 0)+"\t");
+ sb.append(formatKB(maxScaf, 3, 0)+"\t");
+// sb.append(formatKB(maxContig, 3, 0)+"\t");
+ sb.append(numOver50+"\t");
+ sb.append(String.format("%.3f", fractionOver50));
+// sb.append(String.format("%.5f", (counts[1]+counts[2])*1.0/(counts[0]+counts[1]+counts[2]+counts[3])));
+ if(addfilename){sb.append('\t').append(name);}
+ tsw.println(sb);
+ }else if(FORMAT==5){
+
+ if(useheader){
+// sb.append("scaffolds\t");
+ if(N_UNDERSCORE){sb.append("n_");}
+ sb.append("contigs\t");
+// sb.append("scaf_bp\t");
+ sb.append("contig_bp\t");
+ sb.append("gap_pct\t");
+// sb.append("scaf_N50\t");
+// sb.append("scaf_L50\t");
+ sb.append("ctg_N50\t");
+ sb.append("ctg_L50\t");
+// sb.append("scaf_max\t");
+ sb.append("ctg_max\t");
+// sb.append("scaf_n_gt50K\t");
+// sb.append("scaf_pct_gt50K\t");
+ sb.append("gc_avg\t");
+ sb.append("gc_std");
+ if(addfilename){sb.append("\tfilename");}
+
+ sb.append("\n");
+ }
+
+// sb.append(scaffolds+"\t");
+ sb.append(contigs+"\t");
+// sb.append(scaflen+"\t");
+ sb.append(String.format("%d",contiglen)+"\t");
+ sb.append(String.format("%.3f",(scaflen-contiglen)*100f/scaflen)+"\t");
+// sb.append(ll50+"\t");
+// sb.append(formatKB(ln50, 3, 0)+"\t");
+ sb.append(cl50+"\t");
+ sb.append(formatKB(cn50, 3, 0)+"\t");
+// sb.append(formatKB(maxScaf, 3, 0)+"\t");
+ sb.append(formatKB(maxContig, 3, 0)+"\t");
+// sb.append(numOver50+"\t");
+// sb.append(String.format("%.3f", fractionOver50)+"\t");
+ sb.append(String.format("%.5f", (counts[1]+counts[2])*1.0/(counts[0]+counts[1]+counts[2]+counts[3]))+"\t");
+ sb.append(String.format("%.5f", gc_std));
+ if(addfilename){sb.append('\t').append(name);}
+ tsw.println(sb);
+ }else if(FORMAT==7){
+
+ if(addfilename){sb.append("Filename: \t"+name+"\n");}
+ sb.append("Main genome contig N/L50: \t"+cl50+"/"+formatKB(cn50, 3, 0)+"\n");
+ sb.append("Max contig length: \t"+formatKB(maxContig, 3, 0)+"\n");
+ sb.append("Number of contigs > 1 KB: \t"+numOver50+"\n");
+ sb.append("% main genome in contigs > 1 KB: \t"+String.format("%.2f%%", fractionOver50)+"\n");
+ if(printheadersize){sb.append("Header:\t"+formatKB(HEADERLENSUM, 3, 0)+(HEADERLENSUM<1000 ? " bytes" : ""));}
+
+ if(GCFORMAT==0){
+ //Print nothing
+ }else{
+ if(GCFORMAT==1 || GCFORMAT==3 || GCFORMAT==4){
+ tsw.println("A\tC\tG\tT\tGC\tGC_stdev");
+ }else{
+ tsw.println("GC\tGC_stdev");
+ }
+ tsw.println(toString3(new StringBuilder(/*"Base Content"*/), counts, gc_std));
+ }
+
+ tsw.println(sb);
+ tsw.println(sb2);
+ }
+ tsw.poisonAndWait();
+
+ if(showspeed){
+ if(!printheadersize){System.err.println("Header:\t"+formatKB(HEADERLENSUM, 3, 0)+(HEADERLENSUM<1000 ? " bytes" : ""));}
+ System.err.println("Time: \t"+t);
+ long bytes=new File(in).length();
+ if(bytes<1){bytes=LIMSUM;}
+ double mbps=bytes*1000d/t.elapsed;
+ double mbpps=Tools.sum(counts)*1000d/t.elapsed;
+ System.err.println(String.format("Speed:\t%.2f MBytes/s",mbps));
+ System.err.println(String.format(" \t%.2f MBases/s",mbpps));
+ }
+
+ if(bbmapkmer>0){
+ System.err.println("BBMap minimum memory estimate at k="+bbmapkmer+": "+estimateBBMapMemory(counts, scaffolds, HEADERLENSUM, bbmapkmer));
+ }
+
+
+ }
+
+ /**
+  * Estimates the number of bytes BBMap needs to index a reference with the
+  * given composition at kmer length {@code k}.
+  *
+  * @param acgtn     Symbol counts indexed by class. Indices 0-3 are the defined bases;
+  *                  elsewhere in this file index 4 is the IUPAC count and index 5 the N count.
+  * @param scaffolds Number of scaffolds in the reference.
+  * @param headerlen Total length of all scaffold names, in characters.
+  * @param k         Index kmer length.
+  * @return Estimated memory in bytes.
+  */
+ private static long bbmapMemoryBytes(long[] acgtn, long scaffolds,
+         long headerlen, int k) {
+
+     long keyspace=(1L<<(2*k));
+     long defined=acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3];
+     //Every non-ACGT base still occupies a reference position. Index 4 alone is the
+     //IUPAC count under this file's symbol classes, so the N count (index 5) is added too.
+     long undefined=acgtn[4]+(acgtn.length>5 ? acgtn[5] : 0);
+     long midpad=(scaffolds*(FastaToChromArrays2.MID_PADDING));
+     long total=defined+undefined+midpad;
+     int chromlen=FastaToChromArrays2.MAX_LENGTH-FastaToChromArrays2.END_PADDING-FastaToChromArrays2.START_PADDING;
+     int chroms=(int)(total/chromlen);
+     int chromsperblock=Integer.MAX_VALUE/chromlen;
+     int blocks=(chroms+chromsperblock-1)/chromsperblock;
+     long memperblock=keyspace*4;
+     long memforcounts=keyspace*4;
+
+     long mem=0;
+     mem+=total; //reference bulk, including inter-scaffold padding
+     mem+=(chroms*(FastaToChromArrays2.END_PADDING+FastaToChromArrays2.START_PADDING)); //reference tip padding
+     mem+=headerlen; //Header name byte arrays
+     mem+=(scaffolds*(4+4+4+16+8)); //Other structures for scaffold info
+     mem+=(blocks*(memperblock)); //start array for each block
+     mem+=memforcounts; //count array
+     mem+=(defined*4); //key lists
+     mem=(long)(mem/0.66); //Expand to compensate for garbage collection
+     if(k>13){mem=mem+1000000000;}
+     return mem;
+ }
+
+ private static CharSequence estimateBBMapMemory(long[] acgtn, long scaffolds,
+ long headerlen, int k) {
+ long mem=180+bbmapMemoryBytes(acgtn, scaffolds, headerlen, k)/1000000; //in megabytes
+ if(mem>4000){
+ return "-Xmx"+((mem+1500)/1000)+"g \t(at least "+(long)Math.ceil((((mem+1500)/0.85)/1000))+" GB physical RAM)";
+ }else if(mem>2100){
+ return "-Xmx"+((mem+999)/1000)+"g \t(at least "+(long)Math.ceil((((mem+999)/0.85)/1000))+" GB physical RAM)";
+ }else{
+ return "-Xmx"+(((((mem*11)/8+50))/10)*10)+"m \t(at least "+((((long)(((mem*10)/8+50)/0.82))/10)*10)+" MB physical RAM)";
+ }
+ }
+
+
+ /**
+  * Drains the stream through a 32 KiB buffer and reports how many bytes were
+  * read. Reading stops at the first read that returns no bytes.
+  *
+  * @param is Stream to drain; it is not closed.
+  * @return Total number of bytes read.
+  * @throws IOException if a read fails.
+  */
+ public static long bench(InputStream is) throws IOException{
+     final byte[] scratch=new byte[32768];
+     long total=0;
+     int got=is.read(scratch);
+     while(got>0){
+         total+=got;
+         got=is.read(scratch);
+     }
+     return total;
+ }
+
+
+
+ /**
+  * Writes a cumulative scaffold histogram: after each distinct scaffold
+  * length, one line with the running scaffold count and running total length.
+  *
+  * Short scaffolds come from {@code slist} (count per length), long ones from
+  * {@code tlist}. Together they form one sequence, slist entries first, which
+  * is walked front-to-back when {@code ascending} and back-to-front otherwise.
+  *
+  * @param fname     Output file; nothing is written when null.
+  * @param slist     Number of scaffolds of each length.
+  * @param tlist     Individually stored scaffolds; may be null.
+  * @param ascending Direction of the walk.
+  */
+ private static void writeHistFile(String fname, LongList slist, ArrayList<Triple> tlist, boolean ascending){
+     if(fname==null){return;}
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
+     tsw.start();
+     tsw.print("#scaffolds\tlength\n");
+
+     final long[] perLength=slist.array;
+     final int shortSlots=slist.size;
+     final int longEntries=(tlist==null ? 0 : tlist.size());
+     final long slots=(long)shortSlots+longEntries;
+
+     long runningCount=0, runningLength=0;
+     for(long step=0; step<slots; step++){
+         //Position in the combined sequence: [0, shortSlots) is slist, the rest is tlist.
+         final long pos=(ascending ? step : slots-1-step);
+         if(pos<shortSlots){
+             final long n=perLength[(int)pos];
+             if(n<=0){continue;} //No scaffolds of this length, so no line
+             runningLength+=(n*pos);
+             runningCount+=n;
+         }else{
+             final Triple entry=tlist.get((int)(pos-shortSlots));
+             runningLength+=entry.length;
+             runningCount++;
+         }
+         tsw.print(runningCount+"\t"+runningLength+"\n");
+     }
+     tsw.poisonAndWait();
+ }
+
+ /**
+  * Appends one tab-delimited composition row to sb and returns sb's full contents.
+  * A, C, G, T and GC fractions are relative to the defined bases (A+C+G+T); N, IUPAC and other
+  * fractions are relative to all counted symbols. Non-zero IUPAC/other fractions below 0.0001 are raised to 0.0001.
+  * The columns depend on GCFORMAT: 0, 1 and 3 print length plus all fractions, 2 prints only GC, 4 prints length and GC.
+  * @param sb Buffer to append to
+  * @param counts Symbol counts indexed as A, C, G, T, IUPAC, N, other, control
+  * @return sb.toString() after the row was appended
+  * @throws RuntimeException If GCFORMAT is not in 0..4
+  */
+ private static String toString2(StringBuilder sb, long[] counts){
+ 	final long a=counts[0], c=counts[1], g=counts[2], t=counts[3];
+ 	final long iupac=counts[4], n=counts[5], other=counts[6];
+ 	final long control=counts[7]; //read but not reported
+
+ 	final long defined=a+c+g+t;
+ 	final long all=defined+iupac+n+other;
+ 	final double perDefined=1.0/defined, perAll=1.0/all;
+
+ 	final double rawIupac=iupac*perAll, rawOther=other*perAll;
+ 	final double iupacFrac=((iupac>0 && rawIupac<0.0001) ? 0.0001 : rawIupac);
+ 	final double otherFrac=((other>0 && rawOther<0.0001) ? 0.0001 : rawOther);
+ 	final double gcFrac=(g+c)*perDefined;
+
+ 	final String row;
+ 	switch(GCFORMAT){
+ 		case 0:
+ 		case 1:
+ 		case 3:
+ 			row=String.format("\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n",
+ 					all, a*perDefined, c*perDefined, g*perDefined, t*perDefined, n*perAll, iupacFrac, otherFrac, gcFrac);
+ 			break;
+ 		case 2:
+ 			row=String.format("\t%.4f\n", gcFrac);
+ 			break;
+ 		case 4:
+ 			row=String.format("\t%d\t%.4f\n", all, gcFrac);
+ 			break;
+ 		default:
+ 			throw new RuntimeException("Unknown format.");
+ 	}
+ 	return sb.append(row).toString();
+ }
+
+ /**
+  * Appends one tab-delimited composition row, ending with the GC standard deviation, and returns sb's full contents.
+  * Unlike toString2 there is no leading tab and no length column.
+  * FORMAT 7 takes precedence and prints A, C, G, T, GC and gc_std; otherwise GCFORMAT 0, 1, 3 and 4 print
+  * all fractions plus gc_std, and GCFORMAT 2 prints only GC and gc_std.
+  * @param sb Buffer to append to
+  * @param counts Symbol counts indexed as A, C, G, T, IUPAC, N, other, control
+  * @param gc_std Value printed in the last column
+  * @return sb.toString() after the row was appended
+  * @throws RuntimeException If FORMAT is not 7 and GCFORMAT is not in 0..4
+  */
+ private static String toString3(StringBuilder sb, long[] counts, double gc_std){
+ 	final long a=counts[0], c=counts[1], g=counts[2], t=counts[3];
+ 	final long iupac=counts[4], n=counts[5], other=counts[6];
+ 	final long control=counts[7]; //read but not reported
+
+ 	final long defined=a+c+g+t;
+ 	final long all=defined+iupac+n+other;
+ 	final double perDefined=1.0/defined, perAll=1.0/all;
+
+ 	final double rawIupac=iupac*perAll, rawOther=other*perAll;
+ 	final double iupacFrac=((iupac>0 && rawIupac<0.0001) ? 0.0001 : rawIupac);
+ 	final double otherFrac=((other>0 && rawOther<0.0001) ? 0.0001 : rawOther);
+ 	final double gcFrac=(g+c)*perDefined;
+
+ 	final String row;
+ 	if(FORMAT==7){
+ 		row=String.format("%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n",
+ 				a*perDefined, c*perDefined, g*perDefined, t*perDefined, gcFrac, gc_std);
+ 	}else{
+ 		switch(GCFORMAT){
+ 			case 0:
+ 			case 1:
+ 			case 3:
+ 			case 4:
+ 				row=String.format("%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n",
+ 						a*perDefined, c*perDefined, g*perDefined, t*perDefined, n*perAll, iupacFrac, otherFrac, gcFrac, gc_std);
+ 				break;
+ 			case 2:
+ 				row=String.format("%.4f\t%.4f\n", gcFrac, gc_std);
+ 				break;
+ 			default:
+ 				throw new RuntimeException("Unknown format.");
+ 		}
+ 	}
+ 	return sb.append(row).toString();
+ }
+
+
+ /**
+  * Formats one row of the scaffold-length table.
+  * With FORMAT 7 the row holds only the length label, csum and clen, and no row is produced for next==0.
+  * For every other FORMAT the row holds the label, ssum, csum, slen, clen and clen as a percentage of slen.
+  * @param next Length threshold for the label column (0 is labelled "All" by formatKB_all)
+  * @param ssum Value for the second column
+  * @param csum Value for the third column
+  * @param slen Value for the fourth column; denominator of the percentage
+  * @param clen Value for the fifth column; numerator of the percentage
+  * @return The formatted row, or null when FORMAT is 7 and next is 0
+  */
+ private static final String formatX(int next, long ssum, long csum, long slen, long clen){
+ 	if(FORMAT==7){
+ 		if(next==0){return null;}
+ 		return formatKB_all(next, 1, 7)+" \t"+formatComma(csum, 14)+"\t"+formatComma(clen, 14);
+ 	}
+ 	//All remaining FORMAT values share one layout
+ 	final float coverage=clen*100f/slen;
+ 	return formatKB_all(next, 1, 7)+" \t"
+ 			+formatComma(ssum, 14)+"\t"
+ 			+formatComma(csum, 14)+"\t"
+ 			+formatComma(slen, 14)+"\t"
+ 			+formatComma(clen, 14)+"\t"
+ 			+formatPercent(coverage);
+ }
+
+ private static final String formatKB(long x, int precision, int width){
+ String s;
+ if(FORMAT>=2 || x<1000){
+ s=Long.toString(x);
+ }else if(x<1000000){
+ s=String.format("%."+precision+"f",x/1000f);
+ while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);}
+ s=s+" KB";
+ }else{
+ s=String.format("%."+precision+"f",x/1000000f);
+ while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);}
+ s=s+" MB";
+ }
+ while(s.length()<width){s=" "+s;}
+ return s;
+ }
+
+ private static final String formatKB_all(long x, int precision, int width){
+ String s;
+
+ if(x==0){s="All";}
+ else if(FORMAT>=2 || x<1000){
+ s=Long.toString(x);
+ }else if(x<1000000){
+ s=String.format("%."+precision+"f",x/1000f);
+ while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);}
+ s=s+" KB";
+ }else{
+ s=String.format("%."+precision+"f",x/1000000f);
+ while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);}
+ s=s+" MB";
+ }
+
+ while(s.length()<width){s=" "+s;}
+ return s;
+ }
+
+ /**
+  * Renders a number right-aligned in a field of at least width characters.
+  * When FORMAT is 1 or lower, digits are grouped in threes with commas (e.g. 1,234,567);
+  * otherwise the plain number is printed.
+  * Negative values keep their sign and are grouped like positive ones. The previous implementation
+  * only ran its grouping loop for positive quotients, so -1234 was rendered as "-234".
+  * @param x Value to format
+  * @param width Minimum width; shorter results are left-padded with spaces
+  * @return A new StringBuilder holding the formatted, padded number
+  */
+ private static final StringBuilder formatComma(long x, int width){
+ 	final StringBuilder sb=new StringBuilder(width);
+ 	sb.append(x);
+ 	if(FORMAT<=1){
+ 		final int firstDigit=(x<0 ? 1 : 0); //skip the minus sign
+ 		//Walk leftward from the end, inserting a separator before each complete group of three digits
+ 		for(int pos=sb.length()-3; pos>firstDigit; pos-=3){sb.insert(pos, ',');}
+ 	}
+ 	while(sb.length()<width){
+ 		sb.insert(0, ' ');
+ 	}
+ 	return sb;
+ }
+
+ /**
+  * Formats a value with two decimals and a trailing percent sign, left-padded with spaces to 8 characters.
+  * @param x Value to print; it is not rescaled
+  * @return The formatted, padded string
+  */
+ private static final String formatPercent(float x){
+ 	final String text=String.format("%.2f%%", x);
+ 	final StringBuilder padded=new StringBuilder(8);
+ 	for(int i=text.length(); i<8; i++){padded.append(' ');}
+ 	return padded.append(text).toString();
+ }
+
+ protected void reset(){
+// clist=null;
+// slist=null;
+// sclist1=null;
+// sclist2=null;
+//
+// gchistArray=null;
+// gc_std=0;
+//
+// llist=null;
+// tlist=null;
+
+ clist.clear();
+ slist.clear();
+ sclist1.clear();
+ sclist2.clear();
+ llist.clear();
+ tlist.clear();
+
+ Arrays.fill(gchistArray, 0);
+ Arrays.fill(gchist_by_base, 0);
+
+ gchistArray_downsampled=null;
+ gchist_by_base_downsampled=null;
+
+ gc_std=0;
+ gc_bb_std=0;
+
+ LIMSUM=0;
+ HEADERLENSUM=0;
+ }
+
+ /**
+  * Builds the 256-entry table that maps a character code to a symbol class.
+  * Classes: 0=A, 1=C, 2=G, 3=T or U, 4=X and the non-space entries of AminoAcid.degenerateBases,
+  * 5=N, 6=everything else, 7=newline, carriage return, '&gt;', '@' and '+'.
+  * Letters are mapped in both upper and lower case. Later assignments win, so an entry of
+  * degenerateBases overrides an earlier letter assignment.
+  * @return A new table indexed by character code
+  */
+ public static final byte[] makeCharToNum() {
+ 	final byte[] table=new byte[256];
+ 	Arrays.fill(table, (byte)6);
+
+ 	final String acgt="ACGT";
+ 	for(int code=0; code<acgt.length(); code++){
+ 		final char upper=acgt.charAt(code);
+ 		table[upper]=(byte)code;
+ 		table[Character.toLowerCase(upper)]=(byte)code;
+ 	}
+ 	table['U']=table['u']=3;
+ 	table['N']=table['n']=5;
+ 	table['X']=table['x']=4;
+
+ 	for(byte b : AminoAcid.degenerateBases){
+ 		if(b==' '){continue;}
+ 		table[b]=table[Character.toLowerCase(b)]=4;
+ 	}
+
+ 	final char[] controls={'\n', '\r', '>', '@', '+'};
+ 	for(char ctrl : controls){table[ctrl]=7;}
+ 	return table;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Character code to symbol class lookup table, built once by makeCharToNum() */
+ private static final byte[] charToNum=makeCharToNum();
+ /** Selects the columns printed by toString2 and toString3; those methods accept values 0 through 4 */
+ public static int GCFORMAT=1;
+ /** Output style used by the format* methods: 1 or lower groups digits with commas and scales to KB/MB, 2 or higher prints plain numbers, 7 selects a reduced column set */
+ public static int FORMAT=1;
+ /** NOTE(review): presumably the length threshold that the llist and tlist comments call "cutoff" - confirm */
+ private static long cutoff=1000000;
+
+ /** Zeroed by reset(); NOTE(review): meaning not visible here - confirm at the point of use */
+ private static long LIMSUM=0;
+ /** Zeroed by reset(); NOTE(review): presumably a running total of header lengths - confirm */
+ private static long HEADERLENSUM=0;
+
+ private static int bbmapkmer=0;//13;
+ /** Passed to TextStreamWriter by writeHistFile */
+ public static boolean overwrite=false;
+ public static boolean append=false;
+ public static boolean useheader=true;
+ public static boolean addfilename=false;
+ public static boolean showspeed=false;//true;
+ public static boolean printheadersize=false;
+ public static boolean skipDuplicateLines=true;
+ public static boolean N_UNDERSCORE=true;
+
+ /** Byte constants for carriage return, newline, '>' (named carrot), '@', 'N' and 'n' */
+ private final static byte slashr='\r', slashn='\n', carrot='>', at='@', noref='N', noref2='n';
+
+ /*--------------------------------------------------------------*/
+
+ private boolean benchmark=false;
+ private String in=null, out=null, gc=null, gchistFile=null, scaffoldHistFile=null;
+ private int maxNs=-1;
+
+ /** Number of decimal places for GC histogram */
+ private final int gchistDecimals1;
+
+ /** Number of bins for output (subsampled) GC content histogram */
+ private final int gcbins;
+
+ /** Number of bins for internal GC content histogram */
+ private final int gcbins2;
+
+ /** Minimum scaffold length to count */
+ private final int minScaffold;
+
+ /** Number of contigs of length x */
+ private final LongList clist;
+
+ /** Number of scaffolds of length x */
+ private final LongList slist;
+
+ /** Sum of contigs per scaffold of length x */
+ private final LongList sclist1;
+
+ /** Sum of contig lengths per scaffold of length x */
+ private final LongList sclist2;
+
+ /** List of contig lengths for contigs at least cutoff in length */
+ private final LongList llist;
+
+ /** List of scaf len, contigs, contig sum for scaffolds at least cutoff in length */
+ private final ArrayList<Triple> tlist;
+
+ /** GC histogram that reset() zeroes in place; NOTE(review): presumably the internal, not yet downsampled histogram with gcbins2 bins - confirm */
+ private final long[] gchistArray;
+
+ /** Downsampled gc histogram */
+ private long[] gchistArray_downsampled;
+
+ /** gc standard deviation */
+ private double gc_std;
+
+ /** GC histogram using base counts rather than scaffold counts, zeroed in place by reset(); NOTE(review): presumably the internal, not yet downsampled version - confirm */
+ private final long[] gchist_by_base;
+
+ /** Downsampled gc histogram, using base counts rather than scaffold counts */
+ private long[] gchist_by_base_downsampled;
+
+ /** gc standard deviation, using base counts rather than scaffold counts */
+ private double gc_bb_std;
+
+ /* NOTE(review): the four static fields below presumably publish results of the most recent run for other classes - confirm where they are assigned */
+ public static long lastL50;
+ public static long lastSize;
+ public static long lastContigs;
+ public static long lastMaxContig;
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Immutable record of a scaffold: its length, its number of contigs, and the summed length of those contigs.
+  * Ordering and equality use length first and contiglen second; the contig count is ignored.
+  */
+ private static class Triple implements Comparable<Triple>{
+
+ 	Triple(long len_, long contigs_, long contiglen_){
+ 		length=len_;
+ 		contigs=contigs_;
+ 		contiglen=contiglen_;
+ 	}
+
+ 	/**
+ 	 * Orders by length, then by contiglen.
+ 	 * Fields are compared directly instead of casting their long difference to int,
+ 	 * which reported the wrong sign once the difference reached 2^31.
+ 	 * @return -1, 0 or 1
+ 	 */
+ 	@Override
+ 	public int compareTo(Triple o) {
+ 		if(length!=o.length){return length>o.length ? 1 : -1;}
+ 		if(contiglen!=o.contiglen){return contiglen>o.contiglen ? 1 : -1;}
+ 		return 0;
+ 	}
+
+ 	/** Returns false, rather than throwing ClassCastException, for null or for objects of another type. */
+ 	@Override
+ 	public boolean equals(Object o){
+ 		return (o instanceof Triple) && equals((Triple)o);
+ 	}
+
+ 	/** Two triples are equal when length and contiglen match; a null argument is never equal. */
+ 	public boolean equals(Triple o){
+ 		return o!=null && length==o.length && contiglen==o.contiglen;
+ 	}
+
+ 	/** Hash over the same two fields as equals, so equal triples share a hash code. */
+ 	@Override
+ 	public int hashCode(){
+ 		final long mixed=length*31+contiglen;
+ 		return (int)(mixed^(mixed>>>32));
+ 	}
+
+ 	@Override
+ 	public String toString(){return length+","+contigs+","+contiglen;}
+
+ 	public final long length;
+ 	public final long contigs;
+ 	public final long contiglen;
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+
+
+ /*
+
+ fasta_stats2.linux -n <number of N between contigs> contigs.fa
+ e.g.
+
+ fasta_stats2.linux -n 0 contigs.fa # for aplg assemblies
+ fasta_stats2.linux -n 10 contigs.fa # for velvet
+
+
+
+ Main genome scaffold total: 1610
+ Main genome contig total: 7844
+ Main genome scaffold sequence total: 726.6 MB
+ Main genome contig sequence total: 689.4 MB (-> 5.1% gap)
+ Main genome scaffold N/L50: 6/62.2 MB
+ Main genome contig N/L50: 331/429.0 KB
+ Number of scaffolds > 50 KB: 122
+ % main genome in scaffolds > 50 KB: 98.9%
+
+ Minimum Number Number Total Total Scaffold
+ Scaffold of of Scaffold Contig Contig
+ Length Scaffolds Contigs Length Length Coverage
+ -------- --------- ------- ----------- ----------- --------
+ All 1,610 7,844 726,616,606 689,442,341 94.88%
+ 1 kb 1,610 7,844 726,616,606 689,442,341 94.88%
+ 2.5 kb 1,468 7,677 726,334,758 689,171,164 94.88%
+ 5 kb 537 6,496 723,058,922 685,949,825 94.87%
+ 10 kb 321 6,176 721,557,480 684,511,419 94.87%
+ 25 kb 138 5,900 718,873,396 681,879,275 94.85%
+ 50 kb 122 5,854 718,322,923 681,420,273 94.86%
+ 100 kb 83 5,660 715,543,850 679,452,337 94.96%
+ 250 kb 47 5,326 709,779,897 675,162,461 95.12%
+ 500 kb 32 5,073 704,645,704 671,472,605 95.29%
+ 1 mb 19 4,735 695,996,631 664,862,860 95.53%
+ 2.5 mb 15 4,587 689,883,367 659,102,480 95.54%
+ 5 mb 13 4,463 681,669,379 651,024,951 95.50%
+ */
+
+}
diff --git a/current/jgi/AssemblyStatsWrapper.java b/current/jgi/AssemblyStatsWrapper.java
new file mode 100755
index 0000000..4c18a7f
--- /dev/null
+++ b/current/jgi/AssemblyStatsWrapper.java
@@ -0,0 +1,73 @@
+package jgi;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Apr 17, 2013
+ *
+ */
+public class AssemblyStatsWrapper {
+
+ public static void main(String[] args){
+ ArrayList<String> alist=new ArrayList<String>();
+ ArrayList<String> ilist=new ArrayList<String>();
+
+ alist.add("");
+ alist.add("header=t");
+ alist.add("showspeed=f");
+ alist.add("addname=t");
+ alist.add("k=0");
+
+ for(String arg : args){
+ if(!arg.contains("=") && Tools.canRead(arg)){
+ ilist.add("in="+arg);
+ }else{
+ String[] split=arg.split("=");
+ if(split[0].equalsIgnoreCase("in") || split[0].equalsIgnoreCase("ref")){
+ if(split.length>1){
+ if(new File(split[1]).exists()){
+ ilist.add(arg);
+ }else{
+ String[] split2=split[1].split(",");
+ for(String s : split2){
+ ilist.add("in="+s);
+ }
+ }
+ }
+ }else{
+ alist.add(arg);
+ }
+ }
+ }
+
+ String[] args2=alist.toArray(new String[alist.size()]);
+ for(int i=0; i<ilist.size(); i++){
+ String s=ilist.get(i);
+// System.err.println("Processing "+s);
+ args2[0]=s;
+ if(i>0){
+ args2[1]="header=f";
+// AssemblyStats2.reset();
+ System.gc();
+ synchronized(AssemblyStatsWrapper.class){
+ try {
+ AssemblyStatsWrapper.class.wait(100);
+ } catch (InterruptedException e) {}
+ }
+ Thread.yield();
+ }
+ AssemblyStats2 as2=new AssemblyStats2(args2);
+ if(i>0){
+ AssemblyStats2.overwrite=false;
+ AssemblyStats2.append=true;
+ }
+ as2.process();
+ }
+
+ }
+
+}
diff --git a/current/jgi/BBDuk2.java b/current/jgi/BBDuk2.java
new file mode 100755
index 0000000..7471d3b
--- /dev/null
+++ b/current/jgi/BBDuk2.java
@@ -0,0 +1,3772 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import kmer.AbstractKmerTable;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.Read;
+import stream.SamLine;
+
+import align2.IntList;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * Separates or trims reads based on matching kmers in a reference.
+ * Supports arbitrary K and inexact matches.
+ * Supersedes BBDukF by allowing simultaneous filtering, left- and right-trimming with different references.
+ * @author Brian Bushnell
+ * @date Aug 30, 2013
+ *
+ */
+public class BBDuk2 {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Command-line entry point.
+  * Passes the arguments through Parser.parseConfig, prints the usage pointer and exits with
+  * status 0 if Parser.parseHelp reports a help request, and otherwise builds a BBDuk2 and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+
+ 	final String[] expanded=Parser.parseConfig(args);
+ 	if(Parser.parseHelp(expanded, true)){
+ 		printOptions();
+ 		System.exit(0);
+ 	}
+
+ 	//Construct from the expanded arguments, then run
+ 	final BBDuk2 instance=new BBDuk2(expanded);
+ 	instance.process();
+ }
+
+ /**
+ * Display usage information.
+ */
+ private static void printOptions(){
+ System.err.println("Please consult the shellscript for usage information.");
+ }
+
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public BBDuk2(String[] args){
+ for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}}
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ System.err.println("BBDuk2 version "+Shared.BBMAP_VERSION_STRING);
+
+ /* Set global defaults */
+ ReadWrite.ZIPLEVEL=2;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=8;
+
+
+ ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+
+ /* Initialize local variables with defaults */
+ boolean setOut=false, setOutb=false;
+ boolean ktrimRight_=false, ktrimLeft_=false, ktrimN_=false, ktrimExclusive_=false, kfilter_=false;
+ boolean findBestMatch_=false;
+ boolean addTrimmedToBad_=true;
+ boolean rcomp_=true;
+ boolean forbidNs_=false;
+ boolean useForest_=false, useTable_=false, useArray_=true, prealloc_=false;
+ int k_=27, kbig_=-1;
+ int mink_=-1;
+ int ways_=-1; //Currently disabled
+ int maxBadKmers_=0;
+ long skipreads_=0;
+ byte TRIM_SYMBOL_='N';
+ boolean kmaskLowercase_=false;
+
+ Parser parser=new Parser();
+ parser.trimq=6;
+ parser.minAvgQuality=0;
+ parser.minReadLength=10;
+ parser.maxReadLength=Integer.MAX_VALUE;
+ parser.minLenFraction=0f;
+ parser.requireBothBad=false;
+ parser.maxNs=-1;
+ boolean trimByOverlap_=false, useQualityForOverlap_=false, strictOverlap_=true;
+ boolean trimPairsEvenly_=false;
+ boolean ordered_=false;
+ int minoverlap_=-1, mininsert_=-1;
+ int restrictLeft_=0, restrictRight_=0, speed_=0, qSkip_=1;
+ boolean printNonZeroOnly_=true;
+ boolean rename_=false, useRefNames_=false;
+ boolean skipr1_=false, skipr2_=false;
+ boolean ecc_=false;
+ float minBaseFrequency_=0;
+
+
+ String[] literal=null;
+ String[] ref=null;
+
+ scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero
+ scaffoldLengths.add(0);
+
+ {
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseHist(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQualityAdjust(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCommon(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCardinality(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("qfin") || a.equals("qfin1")){
+ qfin1=b;
+ }else if(a.equals("qfin2")){
+ qfin2=b;
+ }else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") ||
+ a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){
+ out1=b;
+ setOut=true;
+ }else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") ||
+ a.equals("outnonmatched2") || a.equals("outunmatched2")){
+ out2=b;
+ }else if(a.equals("outb") || a.equals("outm") || a.equals("outb1") || a.equals("outm1") || a.equals("outbad") ||
+ a.equals("outbad1") || a.equals("outmatch") || a.equals("outmatch1")){
+ outb1=b;
+ setOut=true;
+ }else if(a.equals("outb2") || a.equals("outm2") || a.equals("outbad2") || a.equals("outmatch2")){
+ outb2=b;
+ }else if(a.equals("outs") || a.equals("outsingle")){
+ outsingle=b;
+ }else if(a.equals("stats") || a.equals("scafstats")){
+ outstats=b;
+ }else if(a.equals("refstats")){
+ outrefstats=b;
+ }else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("cov") || a.equals("coverage")){
+ outrpkm=b;
+ }else if(a.equals("sam") || a.equals("bam")){
+ samFile=b;
+ }else if(a.equals("duk") || a.equals("outduk")){
+ outduk=b;
+ }else if(a.equals("rqc")){
+ outrqc=b;
+ }else if(a.equals("ref")){
+ ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("filterref") || a.equals("fref")){
+ refFilter=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("maskref") || a.equals("mref") || a.equals("nref")){
+ refMask=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("rightref") || a.equals("trref") || a.equals("rref")){
+ refRight=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("leftref") || a.equals("tlref") || a.equals("lref")){
+ refLeft=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("literal")){
+ literal=(b==null) ? null : b.split(",");
+// assert(false) : b+", "+Arrays.toString(literal);
+ }else if(a.equals("filterliteral") || a.equals("fliteral")){
+ literalFilter=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("maskliteral") || a.equals("mliteral") || a.equals("nliteral")){
+ literalMask=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("rightliteral") || a.equals("trliteral") || a.equals("rliteral")){
+ literalRight=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("leftliteral") || a.equals("tlliteral") || a.equals("lliteral")){
+ literalLeft=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("forest")){
+ useForest_=Tools.parseBoolean(b);
+ if(useForest_){useTable_=useArray_=false;}
+ }else if(a.equals("table")){
+ useTable_=Tools.parseBoolean(b);
+ if(useTable_){useForest_=useArray_=false;}
+ }else if(a.equals("array")){
+ useArray_=Tools.parseBoolean(b);
+ if(useArray_){useTable_=useForest_=false;}
+ }else if(a.equals("ways")){
+ ways_=Integer.parseInt(b);
+ }else if(a.equals("ordered") || a.equals("ord")){
+ ordered_=Tools.parseBoolean(b);
+ System.err.println("Set ORDERED to "+ordered_);
+ }else if(a.equals("skipr1")){
+ skipr1_=Tools.parseBoolean(b);
+ }else if(a.equals("skipr2")){
+ skipr2_=Tools.parseBoolean(b);
+ }else if(a.equals("k")){
+ assert(b!=null) : "\nThe k key needs an integer value greater than 0, such as k=27\n";
+ k_=Integer.parseInt(b);
+ if(k_>31){
+ kbig_=k_;
+ k_=31;
+ }else{
+ kbig_=-1;
+ }
+ assert(k_>0 && k_<32) : "k must be at least 1; default is 27.";
+ }else if(a.equals("mink") || a.equals("kmin")){
+ mink_=Integer.parseInt(b);
+ assert(mink_<0 || (mink_>0 && mink_<32)) : "kmin must be between 1 and 31; default is 4, negative numbers disable it.";
+ }else if(a.equals("useshortkmers") || a.equals("shortkmers") || a.equals("usk")){
+ useShortKmers=Tools.parseBoolean(b);
+ }else if(a.equals("trimextra") || a.equals("trimpad") || a.equals("tp")){
+ trimPad=Integer.parseInt(b);
+ }else if(a.equals("hdist") || a.equals("hammingdistance")){
+ hammingDistance=Integer.parseInt(b);
+ assert(hammingDistance>=0 && hammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("qhdist") || a.equals("queryhammingdistance")){
+ qHammingDistance=Integer.parseInt(b);
+ assert(qHammingDistance>=0 && qHammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){
+ editDistance=Integer.parseInt(b);
+ assert(editDistance>=0 && editDistance<3) : "edit distance must be between 0 and 2; default is 0.";
+ }else if(a.equals("hdist2") || a.equals("hammingdistance2")){
+ hammingDistance2=Integer.parseInt(b);
+ assert(hammingDistance2>=0 && hammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("qhdist2") || a.equals("queryhammingdistance2")){
+ qHammingDistance2=Integer.parseInt(b);
+ assert(qHammingDistance2>=0 && qHammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("edits2") || a.equals("edist2") || a.equals("editdistance2")){
+ editDistance2=Integer.parseInt(b);
+ assert(editDistance2>=0 && editDistance2<3) : "edit distance must be between 0 and 2; default is 0.";
+ }else if(a.equals("maxskip") || a.equals("maxrskip") || a.equals("mxs")){
+ maxSkip=Integer.parseInt(b);
+ }else if(a.equals("minskip") || a.equals("minrskip") || a.equals("mns")){
+ minSkip=Integer.parseInt(b);
+ }else if(a.equals("skip") || a.equals("refskip") || a.equals("rskip")){
+ minSkip=maxSkip=Integer.parseInt(b);
+ }else if(a.equals("qskip")){
+ qSkip_=Integer.parseInt(b);
+ }else if(a.equals("speed")){
+ speed_=Integer.parseInt(b);
+ assert(speed_>=0 && speed_<=15) : "Speed range is 0 to 15. Value: "+speed_;
+ }else if(a.equals("skipreads")){
+ skipreads_=Tools.parseKMG(b);
+ }else if(a.equals("maxbadkmers") || a.equals("mbk")){
+ maxBadKmers_=Integer.parseInt(b);
+ }else if(a.equals("minhits") || a.equals("minkmerhits") || a.equals("mkh")){
+ maxBadKmers_=Integer.parseInt(b)-1;
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ assert(WAYS>1) : "WAYS=1 is for debug mode.";
+// verbose=Tools.parseBoolean(b); //123
+ if(verbose){outstream=System.err;} //For some reason System.out does not print in verbose mode.
+ }else if(a.equals("mm") || a.equals("maskmiddle")){
+ maskMiddle=Tools.parseBoolean(b);
+ }else if(a.equals("rcomp")){
+ rcomp_=Tools.parseBoolean(b);
+ }else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){
+ forbidNs_=Tools.parseBoolean(b);
+ }else if(a.equals("findbestmatch") || a.equals("fbm")){
+ findBestMatch_=Tools.parseBoolean(b);
+ }else if(a.equals("kfilter")){
+ kfilter_=Tools.parseBoolean(b);
+ }else if(a.equals("kmask") || a.equals("mask")){
+ if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){
+ kmaskLowercase_=true;
+ ktrimN_=true;
+ }else{
+ if(b==null){b="";}
+ if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
+ ktrimN_=true;
+ TRIM_SYMBOL_=(byte)b.charAt(0);
+ }else{
+ ktrimN_=Tools.parseBoolean(b);
+ }
+ }
+ }else if(a.equals("ktrim")){
+ if(b==null){b="";}
+ if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;}
+ else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimRight_=true;}
+ else if(b.equalsIgnoreCase("n") || b.equalsIgnoreCase("mask")){ktrimN_=true;}
+ else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
+ ktrimN_=true;
+ TRIM_SYMBOL_=(byte)b.charAt(0);
+ }else{
+ boolean x=Tools.parseBoolean(b);
+ if(!x){
+ ktrimRight_=ktrimLeft_=false;
+ }else{
+ throw new RuntimeException("\nInvalid setting for ktrim: "+b+"\nvalues must be f (false), l (left), r (right), or n");
+ }
+ }
+ }else if(a.equals("ktrimright")){
+ ktrimRight_=Tools.parseBoolean(b);
+ ktrimLeft_=ktrimN_=!(ktrimRight_);
+ }else if(a.equals("ktrimleft")){
+ ktrimLeft_=Tools.parseBoolean(b);
+ ktrimRight_=ktrimN_=!(ktrimLeft_);
+ }else if(a.equals("ktrimn")){
+ ktrimN_=Tools.parseBoolean(b);
+ ktrimLeft_=ktrimRight_=!(ktrimN_);
+ }else if(a.equals("ktrimexclusive")){
+ ktrimExclusive_=Tools.parseBoolean(b);
+ }else if(a.equals("tbo") || a.equals("trimbyoverlap")){
+ trimByOverlap_=Tools.parseBoolean(b);
+ }else if(a.equals("strictoverlap")){
+ strictOverlap_=Tools.parseBoolean(b);
+ }else if(a.equals("usequality")){
+ useQualityForOverlap_=Tools.parseBoolean(b);
+ }else if(a.equals("tpe") || a.equals("tbe") || a.equals("trimpairsevenly")){
+ trimPairsEvenly_=Tools.parseBoolean(b);
+ }else if(a.equals("ottm") || a.equals("outputtrimmedtomatch")){
+ addTrimmedToBad_=Tools.parseBoolean(b);
+ }else if(a.equals("minoverlap")){
+ minoverlap_=Integer.parseInt(b);
+ }else if(a.equals("mininsert")){
+ mininsert_=Integer.parseInt(b);
+ }else if(a.equals("prealloc") || a.equals("preallocate")){
+ if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
+ prealloc_=Tools.parseBoolean(b);
+ }else{
+ preallocFraction=Tools.max(0, Double.parseDouble(b));
+ prealloc_=(preallocFraction>0);
+ }
+ }else if(a.equals("restrictleft")){
+ restrictLeft_=Integer.parseInt(b);
+ }else if(a.equals("restrictright")){
+ restrictRight_=Integer.parseInt(b);
+ }else if(a.equals("statscolumns") || a.equals("columns") || a.equals("cols")){
+ STATS_COLUMNS=Integer.parseInt(b);
+ assert(STATS_COLUMNS==3 || STATS_COLUMNS==5) : "statscolumns bust be either 3 or 5. Invalid value: "+STATS_COLUMNS;
+ }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+ printNonZeroOnly_=Tools.parseBoolean(b);
+ }else if(a.equals("rename")){
+ rename_=Tools.parseBoolean(b);
+ }else if(a.equals("refnames") || a.equals("userefnames")){
+ useRefNames_=Tools.parseBoolean(b);
+ }else if(a.equals("initialsize")){
+ initialSize=(int)Tools.parseKMG(b);
+ }else if(a.equals("dump")){
+ dump=b;
+ }else if(a.equals("entropyk") || a.equals("ek")){
+ entropyK=Integer.parseInt(b);
+ }else if(a.equals("entropywindow") || a.equals("ew")){
+ entropyWindow=Integer.parseInt(b);
+ }else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){
+ entropyCutoff=Float.parseFloat(b);
+ }else if(a.equals("verifyentropy")){
+ verifyEntropy=Tools.parseBoolean(b);
+ }else if(a.equals("minbasefrequency")){
+ minBaseFrequency_=Float.parseFloat(b);
+ }else if(a.equals("ecco") || a.equals("ecc")){
+ ecc_=Tools.parseBoolean(b);
+ }else if(a.equals("copyundefined") || a.equals("cu")){
+ REPLICATE_AMBIGUOUS=Tools.parseBoolean(b);
+ }else if(a.equals("path")){
+ Data.setPath(b);
+ }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ in1=args[i];
+ }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ out1=args[i];
+ setOut=true;
+ }else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(","));
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ if(hammingDistance2==-1){hammingDistance2=hammingDistance;}
+ if(qHammingDistance2==-1){qHammingDistance2=qHammingDistance;}
+ if(editDistance2==-1){editDistance2=editDistance;}
+ minBaseFrequency=minBaseFrequency_;
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ samplerate=parser.samplerate;
+ sampleseed=parser.sampleseed;
+ recalibrateQuality=parser.recalibrateQuality;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+// testsize=parser.testsize;
+// trimBadSequence=parser.trimBadSequence;
+// breakLength=parser.breakLength;
+
+ forceTrimModulo=parser.forceTrimModulo;
+ forceTrimLeft=parser.forceTrimLeft;
+ forceTrimRight=parser.forceTrimRight;
+ forceTrimRight2=parser.forceTrimRight2;
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ trimq=parser.trimq;
+ minLenFraction=parser.minLenFraction;
+ minAvgQuality=parser.minAvgQuality;
+ minAvgQualityBases=parser.minAvgQualityBases;
+ chastityFilter=parser.chastityFilter;
+ failBadBarcodes=parser.failBadBarcodes;
+ removeBadBarcodes=parser.removeBadBarcodes;
+ failIfNoBarcode=parser.failIfNoBarcode;
+ barcodes=parser.barcodes;
+ minReadLength=parser.minReadLength;
+ maxReadLength=parser.maxReadLength;
+ maxNs=parser.maxNs;
+ minConsecutiveBases=parser.minConsecutiveBases;
+// untrim=parser.untrim;
+// minTrimLength=(parser.minTrimLength>=0 ? parser.minTrimLength : minTrimLength);
+// requireBothBad=parser.requireBothBad;
+ removePairsIfEitherBad=!parser.requireBothBad;
+
+ minGC=parser.minGC;
+ maxGC=parser.maxGC;
+ filterGC=(minGC>0 || maxGC<1);
+
+ loglog=(parser.loglog ? new LogLog(parser) : null);
+
+ THREADS=Shared.threads();
+ }
+
+ if(prealloc_){
+ System.err.println("Note - if this program runs out of memory, please disable the prealloc flag.");
+ }
+
+ if(minoverlap_>=0){
+ minOverlap=Tools.max(minoverlap_, 1);
+ minOverlap0=Tools.min(minOverlap0, minOverlap);
+ }
+
+ if(mininsert_>=0){
+ minInsert=Tools.max(mininsert_, 1);
+ minInsert0=Tools.min(minInsert0, minInsert);
+ }
+
+ if(refFilter!=null || literalFilter!=null){kfilter_=true;}
+ if(refMask!=null || literalMask!=null){ktrimN_=true;}
+ if(refRight!=null || literalRight!=null){ktrimRight_=true;}
+ if(refLeft!=null || literalLeft!=null){ktrimLeft_=true;}
+
+ if(ref!=null){
+ if(!ktrimN_ && !ktrimRight_ && !ktrimLeft_){kfilter_=true;}
+ if(kfilter_ && refFilter==null){
+ refFilter=ref;
+ }else if(ktrimN_ && refMask==null){
+ refMask=ref;
+ }else if(ktrimRight_ && refRight==null){
+ refRight=ref;
+ }else if(ktrimLeft_ && refLeft==null){
+ refLeft=ref;
+ }
+ }
+ if(literal!=null){
+ if(!ktrimN_ && !ktrimRight_ && !ktrimLeft_){kfilter_=true;}
+ if(kfilter_ && literalFilter==null){
+ literalFilter=literal;
+ }else if(ktrimN_ && literalMask==null){
+ literalMask=literal;
+ }else if(ktrimRight_ && literalRight==null){
+ literalRight=literal;
+ }else if(ktrimLeft_ && literalLeft==null){
+ literalLeft=literal;
+ }
+ }
+
+ kfilter_=(refFilter!=null || literalFilter!=null);
+ ktrimN_=(refMask!=null || literalMask!=null);
+ ktrimRight_=(refRight!=null || literalRight!=null);
+ ktrimLeft_=(refLeft!=null || literalLeft!=null);
+
+ kfilter=kfilter_;
+ ktrimN=ktrimN_;
+ ktrimRight=ktrimRight_;
+ ktrimLeft=ktrimLeft_;
+ ktrimExclusive=ktrimExclusive_;
+ findBestMatch=findBestMatch_;
+
+ if(refFilter!=null){
+ for(String s : refFilter){refNames.add(s);}
+ }
+ if(literalFilter!=null){refNames.add("literal");}
+ refScafCounts=new int[refNames.size()];
+
+ /* Set final variables; post-process and validate argument combinations */
+
+ useForest=useForest_;
+ useTable=useTable_;
+ useArray=useArray_;
+ hammingDistance=Tools.max(editDistance, hammingDistance);
+ hammingDistance2=Tools.max(editDistance2, hammingDistance2);
+ minSkip=Tools.max(1, Tools.min(minSkip, maxSkip));
+ maxSkip=Tools.max(minSkip, maxSkip);
+ addTrimmedToBad=addTrimmedToBad_;
+ rcomp=rcomp_;
+ forbidNs=(forbidNs_ || hammingDistance<1);
+ trimSymbol=TRIM_SYMBOL_;
+ kmaskLowercase=kmaskLowercase_;
+ skipreads=skipreads_;
+ trimByOverlap=trimByOverlap_;
+ useQualityForOverlap=useQualityForOverlap_;
+ strictOverlap=strictOverlap_;
+ trimPairsEvenly=trimPairsEvenly_;
+ ORDERED=ordered_;
+ restrictLeft=Tools.max(restrictLeft_, 0);
+ restrictRight=Tools.max(restrictRight_, 0);
+ printNonZeroOnly=printNonZeroOnly_;
+ rename=rename_;
+ useRefNames=useRefNames_;
+ speed=speed_;
+ qSkip=qSkip_;
+ noAccel=(speed<1 && qSkip<2);
+ skipR1=skipr1_;
+ skipR2=skipr2_;
+ ecc=ecc_;
+
+ if(strictOverlap){
+ maxRatio=0.05f;
+ ratioMargin=9f;
+ ratioOffset=0.5f;
+ efilterRatio=3.5f;
+ efilterOffset=0.05f;
+ pfilterRatio=0.001f;
+ meeFilter=15f;
+ }else{
+ maxRatio=0.10f;
+ ratioMargin=5f;
+ ratioOffset=0.4f;
+ efilterRatio=6f;
+ efilterOffset=0.05f;
+ pfilterRatio=0.00005f;
+ meeFilter=999999999;
+ }
+
+ MAKE_QUALITY_HISTOGRAM=ReadStats.COLLECT_QUALITY_STATS;
+ MAKE_QUALITY_ACCURACY=ReadStats.COLLECT_QUALITY_ACCURACY;
+ MAKE_MATCH_HISTOGRAM=ReadStats.COLLECT_MATCH_STATS;
+ MAKE_BASE_HISTOGRAM=ReadStats.COLLECT_BASE_STATS;
+ MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS;
+ MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS;
+ MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS;
+ MAKE_GCHIST=ReadStats.COLLECT_GC_STATS;
+ MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS;
+
+ {
+ long usableMemory;
+ long tableMemory;
+ long numTables=Tools.max(1, (kfilter ? 1 : 0)+(ktrimLeft ? 1 : 0)+(ktrimRight ? 1 : 0)+(ktrimN ? 1 : 0));
+
+ {
+ long memory=Runtime.getRuntime().maxMemory();
+ double xmsRatio=Shared.xmsRatio();
+ usableMemory=(long)Tools.max(((memory-96000000-(20*400000 /* for atomic arrays */))*(xmsRatio>0.97 ? 0.82 : 0.75)), memory*0.45);
+ tableMemory=(long)(usableMemory*.95);
+ }
+
+ if(initialSize<1){
+ final long memOverWays=tableMemory/(12*WAYS*numTables);
+ final double mem2=(prealloc_ ? preallocFraction : 1)*tableMemory;
+ initialSize=(prealloc_ || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(12*WAYS*numTables))) : initialSizeDefault);
+ if(initialSize!=initialSizeDefault){
+ System.err.println("Initial size set to "+initialSize);
+ }
+ }
+ }
+
+ if(ktrimLeft_ || ktrimRight_ || ktrimN_){
+ if(kbig_>k_){
+ System.err.println("*********************** WARNING ***********************");
+ System.err.println("WARNING: When kmer-trimming, the maximum value of K is "+k_+".");
+ System.err.println("K has been reduced from "+kbig_+" to "+k_+".");
+ System.err.println("***********************************************************");
+ kbig_=k_;
+ }
+ }
+
+ if((speed>0 || qSkip>1) && kbig_>k_){
+ System.err.println("*********************** WARNING ***********************");
+ System.err.println("WARNING: When speed>0 or qskip>1, the maximum value of K is "+k_+".");
+ System.err.println("K has been reduced from "+kbig_+" to "+k_+".");
+ System.err.println("***********************************************************");
+ kbig_=k_;
+ }
+
+ if((speed>0 && qSkip>1) || (qSkip>1 && maxSkip>1) || (speed>0 && maxSkip>1)){
+ System.err.println("WARNING: It is not recommended to use more than one of 'qskip', 'speed', and 'rskip/maxskip' together.");
+ System.err.println("qskip="+qSkip+", speed="+speed+", maxskip="+maxSkip);
+ }
+
+ k=k_;
+ k2=k-1;
+ kbig=kbig_;
+ if(kbig>k){
+ minSkip=maxSkip=0;
+ if(maskMiddle){
+ System.err.println("maskMiddle was disabled because kbig>k");
+ maskMiddle=false;
+ }
+ }
+ mink=Tools.min((mink_<1 ? 6 : mink_), k);
+ maxBadKmers=maxBadKmers_;
+ if(mink_>0 && mink_<k){useShortKmers=true;}
+ if(useShortKmers){
+ if(maskMiddle){
+ System.err.println("maskMiddle was disabled because useShortKmers=true");
+ maskMiddle=false;
+ }
+ }
+ assert(findBestMatch==false || kfilter==false || kbig<=k) : "K must be less than 32 in 'findBestMatch' mode";
+
+ assert(!useShortKmers || ktrimRight || ktrimLeft || ktrimN) : "\nSetting mink or useShortKmers also requires setting a ktrim mode, such as 'r', 'l', or 'n'\n";
+
+ middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L;
+
+ if(kfilter || ktrimN || ktrimRight || ktrimLeft){
+ System.err.println("k="+k);
+ if(kbig>k){System.err.println("kbig="+kbig);}
+ if(mink<k && (ktrimN || ktrimRight || ktrimLeft)){System.err.println("mink="+mink);}
+ if(maskMiddle){System.err.println("maskMiddle="+maskMiddle);}
+ if(hammingDistance>0){System.err.println("hamming distance="+hammingDistance);}
+ if(editDistance>0){System.err.println("edit distance="+editDistance);}
+ }
+ if(kfilter){printFilterPlan("kfiltering", refFilter, literalFilter);}
+ if(ktrimN){printFilterPlan(((char)trimSymbol)+"-masking", refMask, literalMask);}
+ if(ktrimRight){printFilterPlan("right-ktrimming", refRight, literalRight);}
+ if(ktrimLeft){printFilterPlan("left-ktrimming", refLeft, literalLeft);}
+ if(qtrimRight || qtrimLeft){
+ System.err.println("quality-trimming "+((qtrimRight && qtrimLeft) ? "both ends" : qtrimRight ? "right end" : " left end")+" to Q"+trimq);
+ }
+ System.err.println();
+
+ hitCounts=(outduk==null ? null : new long[HITCOUNT_LEN+1]);
+
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ if(in1!=null && in1.contains("#") && !new File(in1).exists()){
+ int pound=in1.lastIndexOf('#');
+ String a=in1.substring(0, pound);
+ String b=in1.substring(pound+1);
+ in1=a+1+b;
+ in2=a+2+b;
+ }
+ if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(qfin1!=null && qfin1.contains("#") && in2!=null && !new File(qfin1).exists()){
+ int pound=qfin1.lastIndexOf('#');
+ String a=qfin1.substring(0, pound);
+ String b=qfin1.substring(pound+1);
+ qfin1=a+1+b;
+ qfin2=a+2+b;
+ }
+
+ if(out1!=null && out1.contains("#")){
+ int pound=out1.lastIndexOf('#');
+ String a=out1.substring(0, pound);
+ String b=out1.substring(pound+1);
+ out1=a+1+b;
+ out2=a+2+b;
+ }
+
+ if(outb1!=null && outb1.contains("#")){
+ int pound=outb1.lastIndexOf('#');
+ String a=outb1.substring(0, pound);
+ String b=outb1.substring(pound+1);
+ outb1=a+1+b;
+ outb2=a+2+b;
+ }
+
+ if((out2!=null || outb2!=null) && (in1!=null && in2==null)){
+ if(!FASTQ.FORCE_INTERLEAVED){System.err.println("Forcing interleaved input because paired output was specified for a single input file.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true;
+ }
+
+ if(!setOut){
+ System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+// out1="stdout";
+// outstream=System.err;
+// out2=null;
+ out1=out2=null;
+ }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){
+ out1="stdout.fq";
+ outstream=System.err;
+ out2=null;
+ }
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
+ throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
+ }
+ if(!Tools.testInputFiles(false, true, in1, in2, qfin1, qfin2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+ if(!Tools.testInputFiles(true, true, refFilter, refMask, refRight, refLeft)){
+ throw new RuntimeException("\nCan't read from some reference files.\n");
+ }
+ if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ assert(THREADS>0) : "THREADS must be greater than 0.";
+
+ assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1;
+ assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2;
+
+ if(!(kfilter || ktrimN || ktrimRight || ktrimLeft || qtrimLeft || qtrimRight || minAvgQuality>0 || maxNs>=0 || trimByOverlap ||
+ MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY ||
+ MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST ||
+ forceTrimLeft>0 || forceTrimRight>0 || forceTrimModulo>0 || minBaseFrequency>0 || recalibrateQuality)){
+ System.err.println("NOTE: No reference files specified, no trimming mode, no min avg quality, no histograms - read sequences will not be changed.");
+ }
+
+ if(recalibrateQuality){
+ SamLine.SET_FROM_OK=true;//TODO: Should ban other operations
+ }
+
+ if(ref!=null){
+ for(String s0 : ref){
+ assert(s0!=null) : "Specified a null reference.";
+ String s=s0.toLowerCase();
+ assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0;
+ }
+ }
+
+ //Initialize tables
+ final int tableType=(useForest ? AbstractKmerTable.FOREST1D : useTable ? AbstractKmerTable.TABLE : useArray ? AbstractKmerTable.ARRAY1D : 0);
+ if(kfilter){
+ filterMaps=AbstractKmerTable.preallocate(WAYS, tableType, initialSize, (!prealloc_ || preallocFraction<1));
+ }else{filterMaps=null;}
+ if(ktrimN){
+ maskMaps=AbstractKmerTable.preallocate(WAYS, tableType, initialSize, (!prealloc_ || preallocFraction<1));
+ }else{maskMaps=null;}
+ if(ktrimRight){
+ trimRightMaps=AbstractKmerTable.preallocate(WAYS, tableType, initialSize, (!prealloc_ || preallocFraction<1));
+ }else{trimRightMaps=null;}
+ if(ktrimLeft){
+ trimLeftMaps=AbstractKmerTable.preallocate(WAYS, tableType, initialSize, (!prealloc_ || preallocFraction<1));
+ }else{trimLeftMaps=null;}
+
+ //Initialize entropy
+ calcEntropy=(entropyCutoff>0);
+ if(calcEntropy){
+ assert(entropyWindow>0 && entropyCutoff>=0 && entropyCutoff<=1);
+ entropy=new double[entropyWindow+2];
+ final double mult=1d/entropyWindow;
+ for(int i=0; i<entropy.length; i++){
+ double pk=i*mult;
+ entropy[i]=pk*Math.log(pk);
+ }
+ entropyMult=-1/Math.log(entropyWindow);
+ entropyKmerspace=(1<<(2*entropyK));
+ }else{
+ entropy=null;
+ entropyMult=0;
+ entropyKmerspace=1;
+ }
+ }
+
+
+ private void printFilterPlan(String action, String[] refs, String[] lits){
+ String and=(refs!=null && lits!=null ? " and " : "");
+ String s1=(lits==null || lits.length!=1 ? "s" : "");
+ String s2=(refs==null || refs.length!=1 ? "s" : "");
+ System.err.println(action+" using "+(lits!=null ? lits.length+" literal"+s1 : "")+and+(refs!=null ? refs.length+" reference"+s2 : "")+".");
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Runs the whole job: optional quality-recalibration setup, a final output-file check,
+  * the main pipeline in process2(), and an optional speed report written to outstream.
+  * @throws RuntimeException if an output file is duplicated or cannot be written,
+  * or if errorState was set while processing
+  */
+ public void process(){
+
+     if(recalibrateQuality){
+         if(samFile!=null){
+             CalcTrueQuality.main2(new String[] {"in="+samFile, "showstats=f"});
+         }
+         CalcTrueQuality.initializeMatrices();
+     }
+
+     /* Check for output file collisions. outsingle is listed so that this check covers the same files as the constructor's check. */
+     if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
+         throw new RuntimeException("One or more output files were duplicate or could not be written to. Check the names or set the 'overwrite=true' flag.");
+     }
+
+     /* Start overall timer */
+     Timer t=new Timer();
+
+     process2(t.time1);
+
+     /* Stop timer and calculate speed statistics */
+     t.stop();
+
+     if(showSpeed){
+         double rpnano=readsIn/(double)(t.elapsed);
+         double bpnano=basesIn/(double)(t.elapsed);
+
+         //Format with k or m suffixes
+         String rpstring=(readsIn<100000 ? ""+readsIn : readsIn<100000000 ? (readsIn/1000)+"k" : (readsIn/1000000)+"m");
+         String bpstring=(basesIn<100000 ? ""+basesIn : basesIn<100000000 ? (basesIn/1000)+"k" : (basesIn/1000000)+"m");
+
+         //Left-pad both counts to a common width so the columns line up
+         while(rpstring.length()<8){rpstring=" "+rpstring;}
+         while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+         outstream.println("\nTime: \t\t\t"+t);
+         outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+         outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+     }
+
+     /* Throw an exception if errors were detected */
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+
+ public void process2(long startTime){
+
+ /* Start phase timer */
+ Timer t=new Timer();
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Initial:");
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ /* Fill tables with reference kmers */
+ {
+ final boolean oldTI=FASTQ.TEST_INTERLEAVED; //TODO: This needs to be changed to a non-static field, or somehow 'read mode' and 'ref mode' need to be distinguished.
+ final boolean oldFI=FASTQ.FORCE_INTERLEAVED;
+ final boolean oldSplit=FastaReadInputStream.SPLIT_READS;
+ final int oldML=FastaReadInputStream.MIN_READ_LEN;
+
+ FASTQ.TEST_INTERLEAVED=false;
+ FASTQ.FORCE_INTERLEAVED=false;
+ FastaReadInputStream.SPLIT_READS=false;
+ FastaReadInputStream.MIN_READ_LEN=1;
+
+ if(kfilter){
+ storedKmersFilter=spawnLoadThreads(refFilter, literalFilter, filterMaps, true);
+ }
+ if(ktrimN){
+ storedKmersMask=spawnLoadThreads(refMask, literalMask, maskMaps, false);
+ }
+ if(ktrimRight){
+ storedKmersRight=spawnLoadThreads(refRight, literalRight, trimRightMaps, false);
+ }
+ if(ktrimLeft){
+ storedKmersLeft=spawnLoadThreads(refLeft, literalLeft, trimLeftMaps, false);
+ }
+
+ FASTQ.TEST_INTERLEAVED=oldTI;
+ FASTQ.FORCE_INTERLEAVED=oldFI;
+ FastaReadInputStream.SPLIT_READS=oldSplit;
+ FastaReadInputStream.MIN_READ_LEN=oldML;
+
+ if(useRefNames){toRefNames();}
+ t.stop();
+ }
+
+ {
+ long ram=freeMemory();
+ ALLOW_LOCAL_ARRAYS=(scaffoldNames!=null && Tools.max(THREADS, 1)*3*8*scaffoldNames.size()<ram*5);
+ }
+
+ /* Dump kmers to text */
+ if(dump!=null && filterMaps!=null){
+ ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true);
+ bsw.start();
+ for(AbstractKmerTable set : filterMaps){
+ set.dumpKmersAsBytes(bsw, k, 0);
+ }
+ bsw.poisonAndWait();
+ }
+
+ final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+ Read.VALIDATE_IN_CONSTRUCTOR=THREADS<4;
+
+ /* Do kmer matching of input reads */
+ spawnProcessThreads(t);
+
+ Read.VALIDATE_IN_CONSTRUCTOR=vic;
+
+ /* Write legacy duk statistics (which requires tables) */
+ writeDuk(System.nanoTime()-startTime);
+
+ /* Unload kmers to save memory */
+ if(RELEASE_TABLES){unloadKmers();}
+
+ /* Write statistics to files */
+ writeStats();
+ writeRPKM();
+ writeRefStats();
+ writeRqc();
+
+ /* Unload sequence data to save memory */
+ if(RELEASE_TABLES){unloadScaffolds();}
+
+ outstream.println("\nInput: \t"+readsIn+" reads \t\t"+basesIn+" bases.");
+
+ if(kfilter){
+ outstream.println("Contaminants: \t"+readsKFiltered+" reads ("+String.format("%.2f",readsKFiltered*100.0/readsIn)+"%) \t"+
+ basesKFiltered+" bases ("+String.format("%.2f",basesKFiltered*100.0/basesIn)+"%)");
+ outstream.flush();
+ }
+ if(qtrimLeft || qtrimRight){
+ outstream.println("QTrimmed: \t"+readsQTrimmed+" reads ("+String.format("%.2f",readsQTrimmed*100.0/readsIn)+"%) \t"+
+ basesQTrimmed+" bases ("+String.format("%.2f",basesQTrimmed*100.0/basesIn)+"%)");
+ }
+ if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
+ outstream.println("FTrimmed: \t"+readsFTrimmed+" reads ("+String.format("%.2f",readsFTrimmed*100.0/readsIn)+"%) \t"+
+ basesFTrimmed+" bases ("+String.format("%.2f",basesFTrimmed*100.0/basesIn)+"%)");
+ }
+ if(ktrimLeft || ktrimRight || ktrimN){
+ outstream.println("KTrimmed: \t"+readsKTrimmed+" reads ("+String.format("%.2f",readsKTrimmed*100.0/readsIn)+"%) \t"+
+ basesKTrimmed+" bases ("+String.format("%.2f",basesKTrimmed*100.0/basesIn)+"%)");
+ }
+ if(trimByOverlap){
+ outstream.println("Trimmed by overlap: \t"+readsTrimmedByOverlap+" reads ("+String.format("%.2f",readsTrimmedByOverlap*100.0/readsIn)+"%) \t"+
+ basesTrimmedByOverlap+" bases ("+String.format("%.2f",basesTrimmedByOverlap*100.0/basesIn)+"%)");
+ }
+ if(filterGC){
+ outstream.println("Filtered by GC: \t"+badGcReads+" reads ("+String.format("%.2f",badGcReads*100.0/readsIn)+"%) \t"+
+ badGcBases+" bases ("+String.format("%.2f",badGcBases*100.0/basesIn)+"%)");
+ }
+ if(minAvgQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){
+ outstream.println("Low quality discards: \t"+readsQFiltered+" reads ("+String.format("%.2f",readsQFiltered*100.0/readsIn)+"%) \t"+
+ basesQFiltered+" bases ("+String.format("%.2f",basesQFiltered*100.0/basesIn)+"%)");
+ }
+ if(calcEntropy){
+ outstream.println("Low entropy discards: \t"+readsEFiltered+" reads ("+String.format("%.2f",readsEFiltered*100.0/readsIn)+"%) \t"+
+ basesEFiltered+" bases ("+String.format("%.2f",basesEFiltered*100.0/basesIn)+"%)");
+ }
+
+ outstream.println("Result: \t"+readsOut+" reads ("+String.format("%.2f",readsOut*100.0/readsIn)+"%) \t"+
+ basesOut+" bases ("+String.format("%.2f",basesOut*100.0/basesIn)+"%)");
+
+ if(loglog!=null){
+ outstream.println("Unique "+loglog.k+"-mers: \t"+loglog.cardinality());
+ }
+ }
+
+ /**
+  * Releases the stored kmers by nulling every slot of each non-null table array.
+  * The arrays themselves are kept.
+  */
+ public void unloadKmers(){
+     final AbstractKmerTable[][] tableSets={filterMaps, maskMaps, trimRightMaps, trimLeftMaps};
+     for(final AbstractKmerTable[] tables : tableSets){
+         if(tables==null){continue;}
+         Arrays.fill(tables, null);
+     }
+ }
+
+ /**
+  * Releases per-scaffold data: empties and shrinks the scaffold name list and
+  * drops the references to the count, length and hit-count structures.
+  */
+ public void unloadScaffolds(){
+     final boolean hasNames=(scaffoldNames!=null && !scaffoldNames.isEmpty());
+     if(hasNames){
+         scaffoldNames.clear();
+         scaffoldNames.trimToSize();
+     }
+     scaffoldReadCounts=null;
+     scaffoldBaseCounts=null;
+     scaffoldLengths=null;
+     hitCounts=null;
+ }
+
+ /**
+  * Writes statistics about how many reads matched each reference scaffold to outstats.
+  * Does nothing when outstats is null. With STATS_COLUMNS==3 only read counts are written;
+  * otherwise read and base counts are written.
+  */
+ private void writeStats(){
+     if(outstats==null){return;}
+     final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false);
+     tsw.start();
+
+     long rsum=0, bsum=0;
+
+     /* Create StringNum list of scaffold names and hitcounts; the loop starts at index 1 */
+     ArrayList<StringNum> list=new ArrayList<StringNum>();
+     for(int i=1; i<scaffoldNames.size(); i++){
+         final long num1=scaffoldReadCounts.get(i), num2=scaffoldBaseCounts.get(i);
+         if(num1>0 || !printNonZeroOnly){
+             rsum+=num1;
+             bsum+=num2;
+             final String s=scaffoldNames.get(i);
+             final int len=scaffoldLengths.get(i);
+             final StringNum sn=new StringNum(s, len, num1, num2);
+             list.add(sn);
+         }
+     }
+     Collections.sort(list);
+
+     /* Percent multipliers; the denominator is clamped to 1 to avoid dividing by zero */
+     final double rmult=100.0/(readsIn>0 ? readsIn : 1);
+     final double bmult=100.0/(basesIn>0 ? basesIn : 1);
+
+     tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+
+     if(STATS_COLUMNS==3){
+         tsw.print(String.format("#Total\t%d\n",readsIn));
+         tsw.print(String.format("#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum));
+         tsw.print("#Name\tReads\tReadsPct\n");
+         for(StringNum sn : list){
+             tsw.print(String.format("%s\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult)));
+         }
+     }else{
+         tsw.print(String.format("#Total\t%d\t%d\n",readsIn,basesIn));
+         /* The format string previously had only two specifiers, so bsum and its percentage were silently dropped */
+         tsw.print(String.format("#Matched\t%d\t%.5f%%\t%d\t%.5f%%\n",rsum,rmult*rsum,bsum,bsum*bmult));
+         tsw.print("#Name\tReads\tReadsPct\tBases\tBasesPct\n");
+         for(StringNum sn : list){
+             tsw.print(String.format("%s\t%d\t%.5f%%\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult),sn.bases,(sn.bases*bmult)));
+         }
+     }
+     tsw.poisonAndWait();
+ }
+
+ /**
+  * Writes per-scaffold length, bases, coverage, read count and RPKM to outrpkm.
+  * Does nothing when outrpkm is null.
+  */
+ private void writeRPKM(){
+     if(outrpkm==null){return;}
+     final TextStreamWriter tsw=new TextStreamWriter(outrpkm, overwrite, false, false);
+     tsw.start();
+
+     /* Total reads assigned over all slots, including slot 0 */
+     long mapped=0;
+     for(int slot=0; slot<scaffoldReadCounts.length(); slot++){
+         mapped+=scaffoldReadCounts.get(slot);
+     }
+
+     /* Header */
+     final String fileLine="#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n";
+     tsw.print(fileLine);
+     tsw.print(String.format("#Reads\t%d\n",readsIn));
+     tsw.print(String.format("#Mapped\t%d\n",mapped));
+     tsw.print(String.format("#RefSequences\t%d\n",Tools.max(0, scaffoldNames.size()-1)));
+     tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\n");
+
+     final float mult=1000000000f/Tools.max(1, mapped);
+
+     /* One row per scaffold, starting at index 1 */
+     for(int scaf=1; scaf<scaffoldNames.size(); scaf++){
+         final long reads=scaffoldReadCounts.get(scaf);
+         final long bases=scaffoldBaseCounts.get(scaf);
+         final String name=scaffoldNames.get(scaf);
+         final int len=scaffoldLengths.get(scaf);
+         if(reads<=0 && printNonZeroOnly){continue;}
+         final double invlen=1.0/Tools.max(1, len);
+         final double mult2=mult*invlen;
+         tsw.print(String.format("%s\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,bases,bases*invlen,reads,reads*mult2));
+     }
+     tsw.poisonAndWait();
+ }
+
+ /**
+  * Writes statistics aggregated per reference file to outrefstats.
+  * Consecutive scaffolds, starting at index 1, are attributed to each reference
+  * according to refScafCounts. Does nothing when outrefstats is null.
+  */
+ private void writeRefStats(){
+     if(outrefstats==null){return;}
+     final TextStreamWriter tsw=new TextStreamWriter(outrefstats, overwrite, false, false);
+     tsw.start();
+
+     /* Total reads assigned over all slots */
+     long mapped=0;
+     for(int slot=0; slot<scaffoldReadCounts.length(); slot++){
+         mapped+=scaffoldReadCounts.get(slot);
+     }
+
+     final int numRefs=refNames.size();
+     final long[] refReadCounts=new long[numRefs];
+     final long[] refBaseCounts=new long[numRefs];
+     final long[] refLengths=new long[numRefs];
+
+     /* Fold each reference's run of scaffolds into per-reference totals */
+     int scaf=1;
+     for(int r=0; r<numRefs; r++){
+         final int end=scaf+refScafCounts[r];
+         for(; scaf<end; scaf++){
+             refReadCounts[r]+=scaffoldReadCounts.get(scaf);
+             refBaseCounts[r]+=scaffoldBaseCounts.get(scaf);
+             refLengths[r]+=scaffoldLengths.get(scaf);
+         }
+     }
+
+     /* Header */
+     final String fileLine="#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n";
+     tsw.print(fileLine);
+     tsw.print(String.format("#Reads\t%d\n",readsIn));
+     tsw.print(String.format("#Mapped\t%d\n",mapped));
+     tsw.print(String.format("#References\t%d\n",Tools.max(0, refNames.size())));
+     tsw.print("#Name\tLength\tScaffolds\tBases\tCoverage\tReads\tRPKM\n");
+
+     final float mult=1000000000f/Tools.max(1, mapped);
+
+     /* One row per reference */
+     for(int r=0; r<refNames.size(); r++){
+         final long reads=refReadCounts[r];
+         final long bases=refBaseCounts[r];
+         final long len=refLengths[r];
+         final int scafs=refScafCounts[r];
+         final String name=ReadWrite.stripToCore(refNames.get(r));
+         if(reads<=0 && printNonZeroOnly){continue;}
+         final double invlen=1.0/Tools.max(1, len);
+         final double mult2=mult*invlen;
+         tsw.print(String.format("%s\t%d\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,scafs,bases,bases*invlen,reads,reads*mult2));
+     }
+     tsw.poisonAndWait();
+ }
+
+ /**
+  * Writes the DUK-format processing report to outduk, if that file was requested.
+  * @param time Elapsed time, nanoseconds
+  */
+ private void writeDuk(long time){
+     if(outduk!=null){
+         final TextStreamWriter writer=new TextStreamWriter(outduk, overwrite, false, false);
+         writer.start();
+         final String report=dukString(time, refFilter);
+         writer.println(report);
+         writer.poisonAndWait();
+     }
+ }
+
+ /**
+  * Records RQCFilter stats in the shared map and, unless outrqc ends with "hashmap",
+  * also writes them to the outrqc file. Does nothing when outrqc is null.
+  */
+ private void writeRqc(){
+     if(outrqc==null){return;}
+     addToRqcMap();
+     final boolean mapOnly=outrqc.endsWith("hashmap");
+     if(!mapOnly){
+         final TextStreamWriter writer=new TextStreamWriter(outrqc, overwrite, false, false);
+         writer.start();
+         writer.println(rqcString());
+         writer.poisonAndWait();
+     }
+ }
+
+ /**
+  * Formats the RQC statistics map as "key=value" lines in a fixed key order.
+  * Keys with no stored value are skipped.
+  * @return The formatted text, or null if the map has not been created
+  */
+ public static String rqcString(){
+     if(RQC_MAP==null){return null;}
+
+     final String[] orderedKeys={
+         "inputReads", "inputBases", "qtrimmedReads", "qtrimmedBases", "qfilteredReads", "qfilteredBases",
+         "ktrimmedReads", "ktrimmedBases", "kfilteredReads", "kfilteredBases", "outputReads", "outputBases"
+     };
+
+     final StringBuilder out=new StringBuilder();
+     for(int i=0; i<orderedKeys.length; i++){
+         final String key=orderedKeys[i];
+         final String value=RQC_MAP.get(key);
+         if(value==null){continue;}
+         out.append(key).append("=").append(value).append("\n");
+     }
+     return out.toString();
+ }
+
+ private void addToRqcMap(){
+ putRqc("inputReads", readsIn, false);
+ putRqc("inputBases", basesIn, false);
+ if(qtrimLeft || qtrimRight){
+ putRqc("qtrimmedReads", readsQTrimmed, false);
+ putRqc("qtrimmedBases", basesQTrimmed, false);
+ }
+ putRqc("qfilteredReads", readsQFiltered, false);
+ putRqc("qfilteredBases", basesQFiltered, false);
+
+ if(ktrimLeft || ktrimRight || ktrimN){
+ putRqc("ktrimmedReads", readsKTrimmed, true);
+ putRqc("ktrimmedBases", basesKTrimmed, true);
+ }else{
+ putRqc("kfilteredReads", readsKFiltered, false);
+ putRqc("kfilteredBases", basesKFiltered, false);
+ }
+ putRqc("outputReads", readsOut, true);
+ putRqc("outputBases", basesOut, true);
+ }
+
+ /** Stores a numeric value in the RQC map as its decimal string. */
+ private static void putRqc(String key, long value, boolean evict){
+     putRqc(key, Long.toString(value), evict);
+ }
+
+ /**
+  * Stores a value in the RQC map, creating the map on first use.
+  * @param key Statistic name
+  * @param value Statistic value
+  * @param evict If true, replace any existing value; if false, keep an existing value
+  */
+ private static void putRqc(String key, String value, boolean evict){
+     if(RQC_MAP==null){
+         RQC_MAP=new HashMap<String,String>();
+     }
+     if(!evict && RQC_MAP.containsKey(key)){return;}
+     RQC_MAP.put(key, value);
+ }
+
+ /**
+ * Helper method; formats statistics to be duk-compatible
+ * @param time Elapsed time, nanoseconds
+ * @return duk output string
+ */
+ private String dukString(long time, String[] ref){
+ StringBuilder sb=new StringBuilder();
+ sb.append("##INPUT PARAMETERS##\n");
+ sb.append("#Reference file: "+(ref==null || ref.length<1 ? null : ref.length==1 ? ref[0] : Arrays.toString(ref))+"\n");
+ sb.append("#Query file: "+in1+(in2==null ? "" : ","+in2)+"\n");
+ sb.append("#Not matched reads file: "+out1+(out2==null ? "" : ","+out2)+"\n");
+ sb.append("#Matched reads file: "+outb1+(outb2==null ? "" : ","+outb2)+"\n");
+ sb.append("#Output file (duk): "+outduk+"\n");
+ sb.append("#Output file (stats): "+outstats+"\n");
+ sb.append("#Mer size: "+k+"\n");
+ long size=0;
+ if(kfilter){
+ for(AbstractKmerTable x : filterMaps){size+=x.size();}
+ }
+ if(ktrimN){
+ for(AbstractKmerTable x : maskMaps){size+=x.size();}
+ }
+ if(ktrimRight){
+ for(AbstractKmerTable x : trimRightMaps){size+=x.size();}
+ }
+ if(ktrimLeft){
+ for(AbstractKmerTable x : trimLeftMaps){size+=x.size();}
+ }
+ sb.append("#Avg step size: "+String.format("%.1f", refKmers/(double)(Tools.max(1, size)))+"\n");
+ sb.append("#Cut off: "+maxBadKmers+"\n");
+ sb.append("#Mask middle: "+maskMiddle+"\n");
+ sb.append("#Quality trim: "+((qtrimLeft || qtrimRight) ? trimq : "false")+"\n");
+ sb.append("\n");
+
+ sb.append("##REFERENCE STAT##\n");
+ sb.append("#Total Reads: "+refReads+"\n");
+ sb.append("#Total Bases: "+refBases+"\n");
+ sb.append("#Total kmers: "+refKmers+"\n");
+ sb.append("#Total stored kmers: "+size+"\n");
+ sb.append("\n");
+
+ sb.append("## ELAPSED TIME##\n");
+ sb.append("# Time: "+String.format("%.2f", time/1000000000.0)+" seconds\n");
+ sb.append("\n");
+
+ sb.append("##QUERY FILE STAT##\n");
+ sb.append("# Total number of reads: "+readsIn+"\n");
+ sb.append("# Total number of matched reads: "+readsKFiltered+"\n");
+ sb.append("# Match ratio: "+String.format("%.6f", readsKFiltered*1.0/readsIn)+"\n");
+ sb.append("\n");
+
+ sb.append("##P-VALUE##\n");
+ sb.append("#Avg number of Kmer for each read: "+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n");
+// sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means
+ sb.append("\n");
+
+ sb.append("## Histogram of kmer occurance for reads with at least one occurance ##\n");
+ sb.append("#NumOcc\tNumReads\tPercentage\n");
+
+ long sum=Tools.sum(hitCounts);
+ double mult=100.0/(sum<1 ? 1 : sum);
+ for(int i=0; i<hitCounts.length; i++){
+ long x=hitCounts[i];
+ if(x>0){
+ sb.append(i).append('\t').append(x).append('\t').append(String.format("%.4f",(x*mult))).append('\n');
+ }
+ }
+
+ return sb.toString();
+ }
+
+ /**
+  * Replaces scaffold names with the core name of the reference they came from.
+  * Scaffolds are taken consecutively from index 1, refScafCounts[r] of them per reference.
+  */
+ private void toRefNames(){
+     final int numRefs=refNames.size();
+     int scaf=1;
+     for(int r=0; r<numRefs; r++){
+         final int end=scaf+refScafCounts[r];
+         final String name=ReadWrite.stripToCore(refNames.get(r));
+         for(; scaf<end; scaf++){
+             scaffoldNames.set(scaf, name);
+         }
+     }
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Fills tables with kmers from references, using multiple LoadThread.
+ * One LoadThread is started per table (WAYS threads in total). Every list of
+ * reference reads is handed to every thread; each thread stores only the kmers
+ * whose key maps to its own thread number.
+ * As a side effect, each scaffold gets an ID equal to its index in scaffoldNames,
+ * its length is appended to scaffoldLengths, and scaffoldReadCounts/scaffoldBaseCounts
+ * are reallocated to match the number of scaffolds once loading is complete.
+ * The process is killed via KillSwitch if any loader did not finish successfully.
+ * @param ref Reference file names; may be null or empty
+ * @param literal Literal reference sequences; may be null or empty
+ * @param maps Destination tables; maps[i] is filled by loader i
+ * @param countRef If true, count scaffolds per reference in refScafCounts
+ * @return Number of kmers stored.
+ */
+ private long spawnLoadThreads(String[] ref, String[] literal, AbstractKmerTable[] maps, boolean countRef){
+ Timer t=new Timer();
+ if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;}
+ long added=0;
+
+ /* Create load threads */
+ LoadThread[] loaders=new LoadThread[WAYS];
+ for(int i=0; i<loaders.length; i++){
+ loaders[i]=new LoadThread(i, maps[i]);
+ loaders[i].start();
+ }
+
+ /* For each reference file... */
+ int refNum=0;
+ if(ref!=null){
+ for(String refname : ref){
+
+ /* Start an input stream */
+ FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true);
+ ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true);
+ cris.start(); //4567
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ /* Iterate through read lists from the input stream */
+ while(reads!=null && reads.size()>0){
+ {
+ /* Assign a unique ID number to each scaffold */
+ ArrayList<Read> reads2=new ArrayList<Read>(reads);
+ for(Read r1 : reads2){
+ final Read r2=r1.mate;
+ final Integer id=scaffoldNames.size();
+ if(countRef){refScafCounts[refNum]++;}
+ scaffoldNames.add(r1.id==null ? id.toString() : r1.id);
+ int len=r1.length();
+ r1.obj=id;
+ if(r2!=null){
+ r2.obj=id;
+ len+=r2.length();
+ }
+ scaffoldLengths.add(len);
+ }
+
+ if(REPLICATE_AMBIGUOUS){
+ reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink));
+ }
+
+ /* Send a pointer to the read list to each LoadThread */
+ for(LoadThread lt : loaders){
+ boolean b=true;
+ while(b){
+ try {
+ lt.queue.put(reads2);
+ b=false;
+ } catch (InterruptedException e) {
+ //TODO: This will hang due to still-running threads.
+ throw new RuntimeException(e);
+ }
+ }
+ }
+ }
+
+ /* Dispose of the old list and fetch a new one */
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ /* Cleanup */
+ //The loop above also ends when ln or ln.list is null, so guard before dereferencing.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ errorState|=ReadWrite.closeStream(cris);
+ refNum++;
+ }
+ }
+
+ /* If there are literal sequences to use as references */
+ if(literal!=null){
+ ArrayList<Read> list=new ArrayList<Read>(literal.length);
+ if(verbose){System.err.println("Adding literals "+Arrays.toString(literal));}
+
+ /* Assign a unique ID number to each literal sequence */
+ for(int i=0; i<literal.length; i++){
+ final Integer id=scaffoldNames.size();
+ final Read r=new Read(literal[i].getBytes(), null, id);
+ //All literals are counted under index refNum, i.e. the slot after the last reference file.
+ if(countRef){refScafCounts[refNum]++;}
+ scaffoldNames.add(id.toString());
+ scaffoldLengths.add(r.length());
+ r.obj=id;
+ list.add(r);
+ }
+
+ if(REPLICATE_AMBIGUOUS){
+ list=Tools.replicateAmbiguous(list, Tools.min(k, mink));
+ }
+
+ /* Send a pointer to the read list to each LoadThread */
+ for(LoadThread lt : loaders){
+ boolean b=true;
+ while(b){
+ try {
+ lt.queue.put(list);
+ b=false;
+ } catch (InterruptedException e) {
+ //TODO: This will hang due to still-running threads.
+ throw new RuntimeException(e);
+ }
+ }
+ }
+ }
+
+ /* Signal loaders to terminate */
+ for(LoadThread lt : loaders){
+ boolean b=true;
+ while(b){
+ try {
+ lt.queue.put(POISON);
+ b=false;
+ } catch (InterruptedException e) {
+ //TODO: This will hang due to still-running threads.
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ /* Wait for loaders to die, and gather statistics */
+ boolean success=true;
+ for(LoadThread lt : loaders){
+ //join() is retried until the thread has really terminated, even if this thread is interrupted.
+ while(lt.getState()!=Thread.State.TERMINATED){
+ try {
+ lt.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ added+=lt.added;
+ refKmers+=lt.refKmersT;
+ refBases+=lt.refBasesT;
+ refReads+=lt.refReadsT;
+ success&=lt.success;
+ }
+ if(!success){KillSwitch.kill("Failed loading ref kmers; aborting.");}
+
+ //Correct statistics for number of threads, since each thread processes all reference data
+ refKmers/=WAYS;
+ refBases/=WAYS;
+ refReads/=WAYS;
+
+ scaffoldReadCounts=new AtomicLongArray(scaffoldNames.size());
+ scaffoldBaseCounts=new AtomicLongArray(scaffoldNames.size());
+
+ t.stop();
+ if(DISPLAY_PROGRESS){
+ outstream.println("Added "+added+" kmers; time: \t"+t);
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ /* In verbose mode, dump the contents of every active table set as text */
+ if(verbose){
+ TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT);
+ tsw.start();
+
+ if(kfilter){
+ tsw.println("kfilter tables:");
+ for(AbstractKmerTable x : filterMaps){x.dumpKmersAsText(tsw, k, 1);}
+ }
+ if(ktrimN){
+ tsw.println("ktrimN tables:");
+ for(AbstractKmerTable x : maskMaps){x.dumpKmersAsText(tsw, k, 1);}
+ }
+ if(ktrimRight){
+ tsw.println("ktrimRight tables:");
+ for(AbstractKmerTable x : trimRightMaps){x.dumpKmersAsText(tsw, k, 1);}
+ }
+ if(ktrimLeft){
+ tsw.println("ktrimLeft tables:");
+ for(AbstractKmerTable x : trimLeftMaps){x.dumpKmersAsText(tsw, k, 1);}
+ }
+
+ tsw.poisonAndWait();
+ }
+
+ return added;
+ }
+
+ /**
+ * Match reads against reference kmers, using multiple ProcessThread.
+ * Opens the read input stream and the optional output streams (out1/out2, outb1/outb2,
+ * outsingle), optionally skips the first skipreads reads, runs THREADS ProcessThreads
+ * over the input, then sums each thread's counters into the corresponding fields
+ * and closes all streams, recording failures in errorState.
+ * Calls System.exit(0) if skipping consumes the entire input.
+ * @param t Timer that is started on entry and stopped on exit; it is also stopped
+ * and restarted to report how long opening the output streams took
+ */
+ private void spawnProcessThreads(Timer t){
+ t.start();
+
+ /* Create read input stream */
+ final ConcurrentReadInputStream cris;
+ final boolean paired;
+ {
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2, qfin1, qfin2);
+ cris.setSampleRate(samplerate, sampleseed);
+ cris.start(); //4567
+ paired=cris.paired();
+ if(!ff1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+ }
+
+ /* Create read output streams */
+ final ConcurrentReadOutputStream ros, rosb, ross;
+ if(out1!=null){
+ final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+ FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ ros=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
+ ros.start();
+ }else{ros=null;}
+ if(outb1!=null){
+ final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+ FileFormat ff1=FileFormat.testOutput(outb1, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ FileFormat ff2=FileFormat.testOutput(outb2, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ rosb=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
+ rosb.start();
+ }else{rosb=null;}
+ if(outsingle!=null){
+ final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+ FileFormat ff=FileFormat.testOutput(outsingle, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ ross=ConcurrentReadOutputStream.getStream(ff, null, null, null, buff, null, true);
+ ross.start();
+ }else{ross=null;}
+ if(ros!=null || rosb!=null || ross!=null){
+ t.stop();
+ outstream.println("Started output streams:\t"+t);
+ t.start();
+ }
+
+ /* Optionally skip the first reads, since initial reads may have lower quality */
+ if(skipreads>0){
+ long skipped=0;
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ while(skipped<skipreads && reads!=null && reads.size()>0){
+ skipped+=reads.size();
+
+ //An empty list is sent for each skipped list id.
+ if(rosb!=null){rosb.add(new ArrayList<Read>(1), ln.id);}
+ if(ros!=null){ros.add(new ArrayList<Read>(1), ln.id);}
+ if(ross!=null){ross.add(new ArrayList<Read>(1), ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //NOTE(review): when the loop ends because enough reads were skipped, the list fetched last
+ //is handed back here without being processed or forwarded - confirm this is intended.
+ //The loop also ends when ln or ln.list is null, so guard before dereferencing.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ if(reads==null || reads.isEmpty()){
+ ReadWrite.closeStreams(cris, ros, rosb, ross);
+ System.err.println("Skipped all of the reads.");
+ System.exit(0);
+ }
+ }
+
+ /* Create ProcessThreads */
+ ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alpt.add(new ProcessThread(cris, ros, rosb, ross, ALLOW_LOCAL_ARRAYS));}
+ for(ProcessThread pt : alpt){pt.start();}
+
+ /* Wait for threads to die, and gather statistics */
+ for(ProcessThread pt : alpt){
+ //join() is retried until the thread has really terminated, even if this thread is interrupted.
+ while(pt.getState()!=Thread.State.TERMINATED){
+ try {
+ pt.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ readsIn+=pt.readsInT;
+ basesIn+=pt.basesInT;
+ readsOut+=pt.readsOutT;
+ basesOut+=pt.basesOutT;
+ readsKFiltered+=pt.readsKFilteredT;
+ basesKFiltered+=pt.basesKFilteredT;
+ readsQTrimmed+=pt.readsQTrimmedT;
+ basesQTrimmed+=pt.basesQTrimmedT;
+ readsFTrimmed+=pt.readsFTrimmedT;
+ basesFTrimmed+=pt.basesFTrimmedT;
+ readsKTrimmed+=pt.readsKTrimmedT;
+ basesKTrimmed+=pt.basesKTrimmedT;
+ readsTrimmedByOverlap+=pt.readsTrimmedByOverlapT;
+ basesTrimmedByOverlap+=pt.basesTrimmedByOverlapT;
+ badGcReads+=pt.badGcReadsT;
+ badGcBases+=pt.badGcBasesT;
+ readsQFiltered+=pt.readsQFilteredT;
+ basesQFiltered+=pt.basesQFilteredT;
+ readsEFiltered+=pt.readsEFilteredT;
+ basesEFiltered+=pt.basesEFilteredT;
+
+ //Merge the thread-local arrays, then drop the references to them.
+ if(hitCounts!=null){
+ for(int i=0; i<hitCounts.length; i++){hitCounts[i]+=pt.hitCountsT[i];}
+ pt.hitCountsT=null;
+ }
+ if(pt.scaffoldReadCountsT!=null && scaffoldReadCounts!=null){
+ for(int i=0; i<pt.scaffoldReadCountsT.length; i++){scaffoldReadCounts.addAndGet(i, pt.scaffoldReadCountsT[i]);}
+ pt.scaffoldReadCountsT=null;
+ }
+ if(pt.scaffoldBaseCountsT!=null && scaffoldBaseCounts!=null){
+ for(int i=0; i<pt.scaffoldBaseCountsT.length; i++){scaffoldBaseCounts.addAndGet(i, pt.scaffoldBaseCountsT[i]);}
+ pt.scaffoldBaseCountsT=null;
+ }
+ }
+
+ /* Shut down I/O streams; capture error status */
+ errorState|=ReadWrite.closeStreams(cris, ros, rosb, ross);
+ errorState|=ReadStats.writeAll();
+
+ t.stop();
+ if(showSpeed){
+ outstream.println("Processing time: \t\t"+t);
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum.
+ */
+ private class LoadThread extends Thread{
+
+ /**
+ * Creates a loader bound to a single kmer table.
+ * @param tnum_ Thread number; only kmers whose key satisfies key%WAYS==tnum_ are stored
+ * @param map_ Table that receives this thread's kmers
+ */
+ public LoadThread(final int tnum_, final AbstractKmerTable map_){
+ map=map_;
+ tnum=tnum_;
+ }
+
+ /**
+ * Blocks until the next list of reads (or scaffolds) is available on this thread's queue.
+ * An interrupt while waiting is reported with a stack trace and the wait is retried.
+ * @return The next list taken from the queue
+ */
+ private ArrayList<Read> fetch(){
+ ArrayList<Read> next=null;
+ do{
+ try {
+ next=queue.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }while(next==null);
+ return next;
+ }
+
+ /**
+ * Consumes read lists from the queue until the POISON list arrives, adding the kmers
+ * of each read (and its mate, if present) to the table. Longer sequences are given a
+ * larger requested skip. Afterwards the table is rebalanced if it supports that and
+ * holds more than twice as many entries as its array length, and success is set.
+ */
+ @Override
+ public void run(){
+ for(ArrayList<Read> batch=fetch(); batch!=POISON; batch=fetch()){
+ for(Read r1 : batch){
+ assert(r1.pairnum()==0);
+ final Read r2=r1.mate;
+
+ final int len1=(r1==null ? 0 : r1.length());
+ final int len2=r1.mateLength();
+
+ //Requested skip grows with sequence length
+ final int skip1=(len1>20000000 ? k : len1>5000000 ? 11 : len1>500000 ? 2 : 0);
+ added+=addToMap(r1, skip1);
+ if(r2!=null){
+ final int skip2=(len2>20000000 ? k : len2>5000000 ? 11 : len2>500000 ? 2 : 0);
+ added+=addToMap(r2, skip2);
+ }
+ }
+ }
+
+ final boolean overfull=(map.canRebalance() && map.size()>2L*map.arrayLength());
+ if(overfull){
+ map.rebalance();
+ }
+ success=true;
+ }
+
+ /**
+ * Adds the kmers of one reference sequence to this thread's table.
+ * A forward kmer and a reverse kmer are updated base by base; a kmer is only
+ * considered once at least k consecutive bases other than 'N' have been seen.
+ * Also updates the counters refReadsT, refBasesT and refKmersT.
+ * @param r The current read to process; r.obj is cast to Integer and used as the scaffold id
+ * @param skip Number of bases to skip between kmers; limited to maxSkip and raised to at least minSkip
+ * @return Number of kmers stored (0 if the read has no bases or fewer than k bases)
+ */
+ private long addToMap(Read r, int skip){
+ skip=Tools.max(minSkip, Tools.min(maxSkip, skip));
+ final byte[] bases=r.bases;
+ final int shift=2*k; //Two bits per base
+ final int shift2=shift-2; //Bit offset at which a new base enters the reverse kmer
+ final long mask=~((-1L)<<shift); //Keeps the low 2*k bits - assumes k<32, TODO confirm
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ long added=0;
+ int len=0; //Number of consecutive non-'N' bases ending at the current position
+
+ if(bases!=null){
+ refReadsT++;
+ refBasesT+=bases.length;
+ }
+ if(bases==null || bases.length<k){return 0;}
+
+ final int id=(Integer)r.obj;
+
+ if(skip>1){ //Process while skipping some kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ refKmersT++; //Counted whether or not the kmer is stored
+ if(len%skip==0){ //Only every skip-th position of the current run is stored
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); //Following base, or -1 at the end
+ added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
+ if(useShortKmers){
+ if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} //k2 is presumably k-1, i.e. the first full kmer - TODO confirm
+ if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} //Last kmer of the sequence
+ }
+ }
+ }
+ }
+ }else{ //Process all kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ refKmersT++;
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); //Following base, or -1 at the end
+ final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
+ added+=atm;
+// assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask));
+ if(useShortKmers){
+ if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} //k2 is presumably k-1, i.e. the first full kmer - TODO confirm
+ if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} //Last kmer of the sequence
+ }
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Adds short kmers on the left end of the read.
+ * For each length i from k-1 down to mink, the forward kmer is masked with
+ * rightMasks[i] and the reverse kmer is shifted right by one base; the result is
+ * stored using hammingDistance2 and editDistance2.
+ * NOTE(review): addToMap(Read,int) calls this for the last kmer of a sequence
+ * (i==bases.length-1); confirm which end "left" refers to.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param extraBase Base added to end in case of deletions
+ * @param id Scaffold number
+ * @return Number of kmers stored
+ */
+ private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){
+ if(verbose){System.err.println("addToMapLeftShift");}
+ long added=0;
+ for(int i=k-1; i>=mink; i--){
+ kmer=kmer&rightMasks[i];
+ rkmer=rkmer>>>2;
+ long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
+ added+=x;
+ if(verbose){
+ if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ //Only report kmers that belong to this thread
+ System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
+ System.err.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i));
+ final long value=toValue(kmer, rkmer, lengthMasks[i]);
+ if(map.contains(value)){System.err.println("Found "+value);}
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Adds short kmers on the right end of the read.
+ * For each length i from k-1 down to mink, the lowest base of the forward kmer is
+ * removed (and passed on as extraBase) and the reverse kmer is masked with
+ * rightMasks[i]; the result is stored using hammingDistance2 and editDistance2.
+ * NOTE(review): addToMap(Read,int) calls this when i==k2, which looks like the
+ * first full kmer of a sequence; confirm which end "right" refers to.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param id Scaffold number
+ * @return Number of kmers stored
+ */
+ private long addToMapRightShift(long kmer, long rkmer, final int id){
+ if(verbose){System.err.println("addToMapRightShift");}
+ long added=0;
+ for(int i=k-1; i>=mink; i--){
+ long extraBase=kmer&3L; //The base about to be shifted out
+ kmer=kmer>>>2;
+ rkmer=rkmer&rightMasks[i];
+// assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i];
+// assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i];
+ long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
+ added+=x;
+ if(verbose){
+ if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ //Only report kmers that belong to this thread
+ System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
+ System.err.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i));
+ final long value=toValue(kmer, rkmer, lengthMasks[i]);
+ if(map.contains(value)){System.err.println("Found "+value);}
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Stores one kmer in this thread's table, together with its mutated variants when a
+ * nonzero Hamming distance is requested.
+ * With hdist==0 the kmer is stored directly, but only if it passes the speed filter and
+ * its key belongs to this thread (key%WAYS==tnum). Otherwise mutate is used, with edist
+ * and extraBase when edist is positive, or with hdist and no extra base when it is not.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param len Kmer length
+ * @param extraBase Base added to end in case of deletions
+ * @param id Scaffold number
+ * @param kmask0 Length mask for this kmer length; must equal lengthMasks[len]
+ * @param hdist Hamming distance to apply
+ * @param edist Edit distance to apply
+ * @return Number of kmers stored
+ */
+ private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){
+
+ assert(kmask0==lengthMasks[len]) : kmask0+", "+len+", "+lengthMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(lengthMasks[len]);
+
+ if(verbose){System.err.println("addToMap_A; len="+len+"; kMasks[len]="+lengthMasks[len]);}
+ assert((kmer&kmask0)==0);
+ final long stored;
+ if(hdist!=0){
+ //Mutation path: edit distance takes precedence over plain Hamming distance
+ if(edist>0){
+ stored=mutate(kmer, rkmer, len, id, edist, extraBase);
+ }else{
+ stored=mutate(kmer, rkmer, len, id, hdist, -1);
+ }
+ }else{
+ //Exact path: subsample by speed, then keep only keys owned by this thread
+ final long key=toValue(kmer, rkmer, kmask0);
+ if(speed>0 && ((key/WAYS)&15)<speed){return 0;}
+ if(key%WAYS!=tnum){return 0;}
+ if(verbose){System.err.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
+ stored=map.setIfNotPresent(key, id);
+ }
+ if(verbose){System.err.println("addToMap added "+stored+" keys.");}
+ return stored;
+ }
+
+ /**
+ * Mutate and store this kmer through 'dist' recursions.
+ * The kmer itself is stored first, if its key belongs to this thread (key%WAYS==tnum).
+ * Then, while dist>0, every single-position substitution is generated and recursed
+ * on with dist-1. If the editDistance field is positive, the deletion and insertion
+ * variants are generated as well; deletions need extraBase to be in the range 0..3.
+ * Note that the indel branch tests the editDistance field, not the dist argument.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param len Kmer length
+ * @param id Scaffold number
+ * @param dist Number of mutations
+ * @param extraBase Base added to end in case of deletions
+ * @return Number of kmers stored
+ */
+ private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){
+ long added=0;
+
+ final long key=toValue(kmer, rkmer, lengthMasks[len]);
+
+ if(verbose){System.err.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+lengthMasks[len]);}
+ if(key%WAYS==tnum){ //Only store keys owned by this thread; the recursion below still runs for all keys
+ if(verbose){System.err.println("mutate_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
+ int x=map.setIfNotPresent(key, id);
+ if(verbose){System.err.println("mutate_B added "+x+" keys.");}
+ added+=x;
+ assert(map.contains(key));
+ }
+
+ if(dist>0){
+ final int dist2=dist-1; //Remaining mutation budget for the recursive calls
+
+ //Sub
+ for(int j=0; j<4; j++){
+ for(int i=0; i<len; i++){
+ final long temp=(kmer&clearMasks[i])|setMasks[j][i];
+ if(temp!=kmer){ //Skip the substitution that reproduces the original kmer
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, extraBase);
+ }
+ }
+ }
+
+ if(editDistance>0){
+ //Del
+ if(extraBase>=0 && extraBase<=3){
+ for(int i=1; i<len; i++){
+ final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase;
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, -1); //No extra base is available after a deletion
+ }
+ }
+ }
+
+ //Ins
+ final long eb2=kmer&3; //Lowest base of the kmer; passed on as the extra base
+ for(int i=1; i<len; i++){
+ final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2);
+ for(int j=0; j<4; j++){
+ final long temp=temp0|setMasks[j][i-1];
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, eb2);
+ }
+ }
+ }
+ }
+
+ }
+
+ return added;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of kmers stored by this thread; read by spawnLoadThreads after the thread has terminated */
+ public long added=0;
+ /** Number of reference kmers, reads and bases encountered by this thread (every thread sees all reference data) */
+ public long refKmersT=0, refReadsT=0, refBasesT=0;
+ /** Thread number; used to determine which kmers to store (those with key%WAYS==tnum) */
+ public final int tnum;
+ /** Buffer of input read lists, with a capacity of 32 lists; filled by spawnLoadThreads and ended by the POISON list */
+ public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32);
+
+ /** Destination for storing kmers */
+ private final AbstractKmerTable map;
+
+ /** Completed successfully; set to true only as the last statement of run() */
+ boolean success=false;
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output.
+ */
+ private class ProcessThread extends Thread{
+
+ /**
+ * Creates a worker and allocates its thread-local buffers and counters.
+ * Buffers belonging to disabled features are left null.
+ * @param cris_ Read input stream
+ * @param ros_ Unmatched read output stream (optional)
+ * @param rosb_ Matched read output stream (optional)
+ * @param ross_ Singleton read output stream (optional)
+ * @param localArrays If true, and there are between 1 and 9999 scaffolds, use thread-local per-scaffold count arrays
+ */
+ public ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays){
+ cris=cris_;
+ ros=ros_;
+ rosb=rosb_;
+ ross=ross_;
+
+ //A ReadStats object is only needed if at least one histogram is requested
+ final boolean needStats=(MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY ||
+ MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST);
+ if(needStats){
+ readstats=new ReadStats();
+ }else{
+ readstats=null;
+ }
+
+ final int alen;
+ if(scaffoldNames!=null){
+ alen=scaffoldNames.size();
+ }else{
+ alen=0;
+ }
+
+ //Scratch structures used only by best-match mode
+ countArray=(findBestMatch ? new int[alen] : null);
+ idList=(findBestMatch ? new IntList() : null);
+ countList=(findBestMatch ? new IntList() : null);
+
+ if(trimByOverlap){
+ overlapVector=new int[5];
+ }else{
+ overlapVector=null;
+ }
+
+ if(hitCounts!=null){
+ hitCountsT=new long[hitCounts.length];
+ }else{
+ hitCountsT=null;
+ }
+
+ final boolean useLocal=(localArrays && alen>0 && alen<10000);
+ scaffoldReadCountsT=(useLocal ? new long[alen] : null);
+ scaffoldBaseCountsT=(useLocal ? new long[alen] : null);
+
+ if(!calcEntropy){
+ entropyCounts=null;
+ entropyCountCounts=null;
+ }else{
+ entropyCounts=new short[entropyKmerspace];
+ entropyCountCounts=new short[entropyWindow+2];
+ entropyCountCounts[0]=(short)entropyWindow;
+ }
+ }
+
+ /**
+ * Main loop of the worker: fetches read lists from the input stream until an empty or
+ * null list is received, runs every read (pair) through the enabled filtering and
+ * trimming stages, and forwards the results.
+ * Per list, removed reads go to rosb, surviving reads to ros, and reads whose mate was
+ * discarded to ross, each only if that stream exists. Thread-local counters are
+ * updated throughout and are collected by spawnProcessThreads.
+ */
+ @Override
+ public void run(){
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+ ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+ ArrayList<Read> single=new ArrayList<Read>(Shared.READ_BUFFER_LENGTH);
+
+ //While there are more reads lists...
+ while(reads!=null && reads.size()>0){
+
+ int removed=0;
+
+ //For each read (or pair) in the list...
+ for(int i=0; i<reads.size(); i++){
+ final Read r1=reads.get(i);
+ final Read r2=r1.mate;
+
+ if(!r1.validated()){r1.validate(true);}
+ if(r2!=null && !r2.validated()){r2.validate(true);}
+
+ //Histograms are gathered from r1 before any filtering or trimming
+ if(readstats!=null){
+ if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r1);}
+ if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r1);}
+ if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r1);}
+ if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r1);}
+
+ if(MAKE_EHIST){readstats.addToErrorHistogram(r1);}
+ if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);}
+ if(MAKE_LHIST){readstats.addToLengthHistogram(r1);}
+ if(MAKE_GCHIST){readstats.addToGCHistogram(r1);}
+ if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);}
+ }
+
+ if(loglog!=null){loglog.hash(r1);}
+
+ final int initialLength1=r1.length();
+ final int initialLength2=r1.mateLength();
+
+ //Minimum allowed length after trimming: the larger of the fractional and the absolute limit
+ final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength);
+ final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength);
+
+ if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+ readsInT++;
+ basesInT+=initialLength1;
+ if(r2!=null){
+ readsInT++;
+ basesInT+=initialLength2;
+ }
+
+ if(chastityFilter){
+ if(r1!=null && r1.failsChastity()){
+ r1.setDiscarded(true);
+ if(r2!=null){r2.setDiscarded(true);}
+ }
+ }
+
+ if(removeBadBarcodes){
+ if(r1!=null && !r1.discarded() && r1.failsBarcode(barcodes, failIfNoBarcode)){
+ if(failBadBarcodes){KillSwitch.kill("Invalid barcode detected: "+r1.id+"\nThis can be disabled with the flag barcodefilter=f");}
+ r1.setDiscarded(true);
+ if(r2!=null){r2.setDiscarded(true);}
+ }
+ }
+
+ if(recalibrateQuality){
+ if(r1!=null && !r1.discarded()){
+ CalcTrueQuality.recalibrate(r1);
+ }
+ if(r2!=null && !r2.discarded()){
+ CalcTrueQuality.recalibrate(r2);
+ }
+ }
+
+ if(filterGC && (initialLength1>0 || initialLength2>0)){
+ final float gc;
+ if(r2==null){
+ gc=r1.gc();
+ }else{
+ //Length-weighted average GC of the pair
+ gc=(r1.gc()*initialLength1+r2.gc()*initialLength2)/(initialLength1+initialLength2);
+ }
+ if(gc<minGC || gc>maxGC){
+ if(r1!=null && !r1.discarded()){
+ r1.setDiscarded(true);
+ badGcBasesT+=initialLength1;
+ badGcReadsT++;
+ }
+ if(r2!=null && !r2.discarded()){
+ r2.setDiscarded(true);
+ badGcBasesT+=initialLength2;
+ badGcReadsT++;
+ }
+ }
+ }
+
+ //Forced trimming to fixed positions, independent of sequence content
+ if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
+ if(r1!=null && !r1.discarded()){
+ final int len=r1.length();
+ final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+ final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+ final int b1=forceTrimRight>0 ? forceTrimRight : len;
+ final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+ final int b=Tools.min(b0, b1, b2);
+ final int x=TrimRead.trimToPosition(r1, a, b, 1);
+ basesFTrimmedT+=x;
+ readsFTrimmedT+=(x>0 ? 1 : 0);
+ if(r1.length()<minlen1){r1.setDiscarded(true);}
+ }
+ if(r2!=null && !r2.discarded()){
+ final int len=r2.length();
+ final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+ final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+ final int b1=forceTrimRight>0 ? forceTrimRight : len;
+ final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+ final int b=Tools.min(b0, b1, b2);
+ final int x=TrimRead.trimToPosition(r2, a, b, 1);
+ basesFTrimmedT+=x;
+ readsFTrimmedT+=(x>0 ? 1 : 0);
+ if(r2.length()<minlen2){r2.setDiscarded(true);}
+ }
+ }
+
+ //A pair is removed if either read is bad (removePairsIfEitherBad) or only if both are
+ boolean remove;
+ if(removePairsIfEitherBad){remove=r1.discarded() || (r2!=null && r2.discarded());}
+ else{remove=r1.discarded() && (r2==null || r2.discarded());}
+
+ if(remove){
+ if(r1!=null){
+ basesQFilteredT+=r1.length();
+ readsQFilteredT++;
+ }
+ if(r2!=null){
+ basesQFilteredT+=r2.length();
+ readsQFilteredT++;
+ }
+ if(bad!=null){bad.add(r1);}
+ }else{
+
+ if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+
+ //Process kmers
+ if(kfilter && storedKmersFilter>0){
+ //Do kmer matching
+
+ if(!findBestMatch){
+ final int a=(kbig<=k ? countSetKmers(r1, filterMaps) : countSetKmersBig(r1, filterMaps));
+ final int b=(kbig<=k ? countSetKmers(r2, filterMaps) : countSetKmersBig(r2, filterMaps));
+
+ if(r1!=null && a>maxBadKmers){r1.setDiscarded(true);}
+ if(r2!=null && b>maxBadKmers){r2.setDiscarded(true);}
+ }else{
+ final int a=findBestMatch(r1, filterMaps);
+ final int b=findBestMatch(r2, filterMaps);
+
+ if(r1!=null && a>0){r1.setDiscarded(true);}
+ if(r2!=null && b>0){r2.setDiscarded(true);}
+ }
+
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ remove=true;
+ if(r1!=null){
+ readsKFilteredT++;
+ basesKFilteredT+=r1.length();
+ }
+ if(r2!=null){
+ readsKFilteredT++;
+ basesKFilteredT+=r2.length();
+ }
+ if(bad!=null){bad.add(r1);}
+ }
+
+ }
+
+ //ktrim0 is skipped (short-circuit) once the pair has already been marked for removal
+ if(ktrimN && storedKmersMask>0){
+ remove=remove || ktrim0(r1, r2, bad, maskMaps, NMODE, minlen1, minlen2);
+ }
+
+ if(ktrimRight && storedKmersRight>0){
+ remove=remove || ktrim0(r1, r2, bad, trimRightMaps, RIGHTMODE, minlen1, minlen2);
+
+ //If something was trimmed and the mates now differ in length, cut the longer one to match
+ if(trimPairsEvenly && xsum>0 && r2!=null && r1.length()!=r2.length()){
+ int x;
+ if(r1.length()>r2.length()){
+ x=TrimRead.trimToPosition(r1, 0, r2.length()-1, 1);
+ }else{
+ x=TrimRead.trimToPosition(r2, 0, r1.length()-1, 1);
+ }
+ if(rktsum<2){readsKTrimmedT++;}
+ basesKTrimmedT+=x;
+
+ assert(r1.length()==r2.length()) : r1.length()+", "+r2.length();
+ }
+ }
+
+ if(!remove && trimByOverlap && r2!=null && expectedErrors(r1, r2)<meeFilter){
+
+ if(aprob==null || aprob.length<r1.length()){aprob=new float[r1.length()];}
+ if(bprob==null || bprob.length<r2.length()){bprob=new float[r2.length()];}
+
+ //Do overlap trimming
+ //r2 is reverse-complemented for the overlap search and restored afterwards
+ r2.reverseComplement();
+// int bestInsert=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
+// overlapMargin, overlapMaxMismatches0, overlapMaxMismatches, overlapMinq);
+ int bestInsert=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
+ minInsert0, minInsert, maxRatio, ratioMargin, ratioOffset, 0.95f, 0.95f, useQualityForOverlap);
+
+ if(bestInsert<minInsert){bestInsert=-1;}
+ boolean ambig=(overlapVector[4]==1);
+ final int bestBad=overlapVector[2];
+
+ if(bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null && useQualityForOverlap){
+ if(efilterRatio>0 && bestInsert>0 && !ambig){
+ float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
+ if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;}
+ }
+ if(pfilterRatio>0 && bestInsert>0 && !ambig){
+ float probability=BBMergeOverlapper.probability(r1, r2, bestInsert);
+ if(probability<pfilterRatio){bestInsert=-1;}
+ }
+ }
+
+ r2.reverseComplement();
+
+ //Trim each read to the insert size when the insert is shorter than the read
+ if(bestInsert>0 && !ambig){
+ if(bestInsert<r1.length()){
+ if(verbose){System.err.println("Overlap right trimming r1 to "+0+", "+(bestInsert-1));}
+ int x=TrimRead.trimToPosition(r1, 0, bestInsert-1, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r1.bases));}
+ readsTrimmedByOverlapT++;
+ basesTrimmedByOverlapT+=x;
+ }
+ if(bestInsert<r2.length()){
+ if(verbose){System.err.println("Overlap right trimming r2 to "+0+", "+(bestInsert-1));}
+ int x=TrimRead.trimToPosition(r2, 0, bestInsert-1, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r2.bases));}
+ readsTrimmedByOverlapT++;
+ basesTrimmedByOverlapT+=x;
+ }
+ }
+ }
+
+ if(ktrimLeft && storedKmersLeft>0){
+ remove=remove || ktrim0(r1, r2, bad, trimLeftMaps, LEFTMODE, minlen1, minlen2);
+ }
+
+ }
+
+ if(!remove){
+ //Do quality trimming
+
+ int rlen1=0, rlen2=0;
+ if(r1!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ rlen1=r1.length();
+ if(rlen1<minlen1 || rlen1>maxReadLength){r1.setDiscarded(true);}
+ }
+ if(r2!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ rlen2=r2.length();
+ if(rlen2<minlen2 || rlen2>maxReadLength){r2.setDiscarded(true);}
+ }
+
+ //Discard reads if too short
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ basesQTrimmedT+=(r1.length()+r1.mateLength());
+ remove=true;
+ if(addTrimmedToBad && bad!=null){bad.add(r1);}
+ }
+
+ }
+
+ if(!remove){
+ //Do quality filtering
+
+ //Determine whether to discard the reads based on average quality
+ if(minAvgQuality>0){
+ if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);}
+ if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);}
+ }
+ //Determine whether to discard the reads based on the presence of Ns
+ if(maxNs>=0){
+ if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);}
+ if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);}
+ }
+ //Determine whether to discard the reads based on a lack of useful kmers
+ if(minConsecutiveBases>0){
+ if(r1!=null && !r1.discarded() && !r1.hasMinConsecutiveBases(minConsecutiveBases)){r1.setDiscarded(true);}
+ if(r2!=null && !r2.discarded() && !r2.hasMinConsecutiveBases(minConsecutiveBases)){r2.setDiscarded(true);}
+ }
+ //Determine whether to discard the reads based on minimum base frequency
+ if(minBaseFrequency>0){
+ if(r1!=null && r1.minBaseCount()<minBaseFrequency*r1.length()){r1.setDiscarded(true);}
+ if(r2!=null && r2.minBaseCount()<minBaseFrequency*r2.length()){r2.setDiscarded(true);}
+ }
+
+ //Discard reads if too short
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ basesQFilteredT+=(r1.length()+r1.mateLength());
+ readsQFilteredT+=1+r1.mateCount();
+ remove=true;
+ if(addTrimmedToBad && bad!=null){bad.add(r1);}
+ }
+ }
+
+ if(!remove && calcEntropy){
+ //Test entropy
+
+ if(r1!=null && !r1.discarded() && entropyCutoff>averageEntropy(r1.bases, entropyK, entropyWindow,
+ entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r1.setDiscarded(true);}
+ if(r2!=null && !r2.discarded() && entropyCutoff>averageEntropy(r2.bases, entropyK, entropyWindow,
+ entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r2.setDiscarded(true);}
+
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ basesEFilteredT+=(r1.length()+r1.mateLength());
+ readsEFilteredT+=(r1==null ? 0 : 1)+(r2==null ? 0 : 1);
+ remove=true;
+ if(bad!=null){bad.add(r1);}
+ }
+ }
+
+ //Singleton output: a mate-less clone of the read that was not discarded
+ if(ross!=null){
+ if(!r1.discarded() && (r2==null || r2.discarded())){
+ Read clone=r1.clone();
+ clone.mate=null;
+ single.add(clone);
+ }else if(r2!=null && r1.discarded() && !r2.discarded()){
+ Read clone=r2.clone();
+ clone.mate=null;
+ single.add(clone);
+ }
+ }
+
+ if(remove){
+ //Evict read
+ removed++;
+ if(r2!=null){removed++;}
+ reads.set(i, null);
+// System.err.println("X1\t"+removed);
+ }else{
+ //Track statistics
+
+ if(r1!=null){
+ readsOutT++;
+ basesOutT+=r1.length();
+ }
+ if(r2!=null){
+ readsOutT++;
+ basesOutT+=r2.length();
+ }
+// System.err.println("X2\t"+readsOutT);
+ }
+ }
+
+ //Send matched list to matched output stream
+ if(rosb!=null){
+ rosb.add(bad, ln.id);
+ bad.clear();
+ }
+
+ //Send unmatched list to unmatched output stream
+ if(ros!=null){
+ ros.add((removed>0 ? Tools.condenseNew(reads) : reads), ln.id); //Creates a new list if old one became empty, to prevent shutting down the cris.
+ }
+
+ if(ross!=null){
+ ross.add(single, ln.id);
+ single.clear();
+ }
+
+ //Fetch a new read list
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //The loop above also ends when ln or ln.list is null, so guard before dereferencing.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Kmer-trims (or kmer-masks, when mode is NMODE) each read, flags reads that end up
+  * shorter than their minimum length as discarded, and decides whether the pair is removed.
+  * Overwrites the scratch fields xsum and rktsum, then adds them to basesKTrimmedT and readsKTrimmedT.
+  * @param r1 Read 1; it is dereferenced unconditionally when the removal test is evaluated
+  * @param r2 Read 2, or null
+  * @param bad Receives r1 when the pair is removed and addTrimmedToBad is set; may be null
+  * @param maps Kmer tables handed to kmask/ktrim
+  * @param mode NMODE selects kmask; any other value is forwarded to ktrim
+  * @param minlen1 Minimum post-trim length of r1
+  * @param minlen2 Minimum post-trim length of r2
+  * @return true if the read (or pair) should be removed
+  */
+ private boolean ktrim0(final Read r1, final Read r2, ArrayList<Read> bad, AbstractKmerTable[] maps, int mode, int minlen1, int minlen2){
+     xsum=0;
+     rktsum=0;
+     int len1=0, len2=0;
+
+     if(r1!=null){
+         final int trimmed1=(mode==NMODE ? kmask(r1, maps) : ktrim(r1, maps, mode));
+         xsum+=trimmed1;
+         if(trimmed1>0){rktsum++;}
+         len1=r1.length();
+         if(len1<minlen1){r1.setDiscarded(true);}
+     }
+     if(r2!=null){
+         final int trimmed2=(mode==NMODE ? kmask(r2, maps) : ktrim(r2, maps, mode));
+         xsum+=trimmed2;
+         if(trimmed2>0){rktsum++;}
+         len2=r2.length();
+         if(len2<minlen2){r2.setDiscarded(true);}
+     }
+
+     final boolean bad1=r1.discarded();
+     final boolean bad2=(r2!=null && r2.discarded());
+     final boolean remove=(removePairsIfEitherBad && (bad1 || bad2)) || (bad1 && (r2==null || bad2));
+
+     if(remove){
+         if(!ktrimN){
+             //The remaining bases of the read/pair are counted as trimmed as well.
+             xsum+=(len1+len2);
+             rktsum=(r1==null ? 0 : 1)+(r2==null ? 0 : 1);
+         }
+         if(addTrimmedToBad && bad!=null){bad.add(r1);}
+     }
+     basesKTrimmedT+=xsum;
+     readsKTrimmedT+=rktsum;
+
+     return remove;
+ }
+
+ /**
+ * Transforms a kmer into all canonical values for a given Hamming distance.
+ * Returns the related id stored in the tables.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param lengthMask Bitmask with single '1' set to left of kmer
+ * @param qPos Position of kmer in query
+ * @param len kmer length
+ * @param qHDist Hamming distance
+ * @param sets Kmer hash tables
+ * @return Value stored in table, or -1
+ */
+ private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets){
+     //Exact lookup first; stop here on a hit or when no substitutions remain.
+     int id=getValue(kmer, rkmer, lengthMask, qPos, sets);
+     if(id>=1 || qHDist<=0){return id;}
+
+     final int remaining=qHDist-1;
+
+     //Try every single-base substitution (symbol outer, position inner), recursing with one fewer allowed mismatch.
+     for(int symbol=0; symbol<4; symbol++){
+         for(int pos=0; pos<len; pos++){
+             final long mutant=(kmer&clearMasks[pos])|setMasks[symbol][pos];
+             if(mutant==kmer){continue;}
+             final long rmutant=AminoAcid.reverseComplementBinaryFast(mutant, len);
+             id=getValue(mutant, rmutant, lengthMask, qPos, len, remaining, sets);
+             if(id>=1){return id;}
+         }
+     }
+     return id;
+ }
+
+ /**
+ * Transforms a kmer into a canonical value stored in the table and search.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param lengthMask Bitmask with single '1' set to left of kmer
+ * @param qPos Position of kmer in query
+ * @param sets Kmer hash tables
+ * @return Value stored in table
+ */
+ private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final AbstractKmerTable[] sets){
+     assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+
+     //When qSkip>1, only query positions that are multiples of qSkip are looked up.
+     if(qSkip>1 && (qPos%qSkip!=0)){return -1;}
+
+     final long canonical;
+     if(rcomp){canonical=Tools.max(kmer, rkmer);}
+     else{canonical=kmer;}
+     final long key=(canonical&middleMask)|lengthMask;
+
+     //Unless noAccel is set, keys whose ((key/WAYS)&15) is below speed are not looked up.
+     final boolean lookup=(noAccel || ((key/WAYS)&15)>=speed);
+     if(!lookup){return -1;}
+
+     if(verbose){System.err.println("Testing key "+key);}
+     final AbstractKmerTable table=sets[(int)(key%WAYS)];
+     return table.getValue(key);
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Counts the number of kmer hits for a read.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of hits
+ */
+ private final int countSetKmers(final Read r, final AbstractKmerTable sets[]){
+     if(r==null || r.length()<k){return 0;}
+     if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+
+     final byte[] seq=r.bases;
+     final int firstFullIndex=k-1;
+     final int requiredRun=(maskMiddle ? k/2 : k);
+     final int bits=2*k;
+     final int topShift=bits-2;
+     final long fwdMask=~((-1L)<<bits);
+     final long lenMask=lengthMasks[k];
+
+     //Scan window: restrictRight limits to the last bases, restrictLeft to the first bases.
+     final int from=(restrictRight<1 ? 0 : Tools.max(0, seq.length-restrictRight));
+     final int to=(restrictLeft<1 ? seq.length : Tools.min(seq.length, restrictLeft));
+
+     long fwd=0;
+     long rev=0;
+     int hits=0;
+     int run=0;
+
+     /* Roll the forward and reverse-complement kmers across the read */
+     for(int pos=from; pos<to; pos++){
+         final byte b=seq[pos];
+         final long code=Dedupe.baseToNumber[b];
+         final long rcode=Dedupe.baseToComplementNumber[b];
+         fwd=((fwd<<2)|code)&fwdMask;
+         rev=(rev>>>2)|(rcode<<topShift);
+         if(b=='N' && forbidNs){run=0;}else{run++;}
+         if(verbose){System.err.println("Scanning6 i="+pos+", kmer="+fwd+", rkmer="+rev+", bases="+new String(seq, Tools.max(0, pos-k2), Tools.min(pos+1, k)));}
+         if(run<requiredRun || pos<firstFullIndex){continue;}
+
+         final int id=getValue(fwd, rev, lenMask, pos, k, qHammingDistance, sets);
+         if(verbose){System.err.println("Testing kmer "+fwd+"; id="+id);}
+         if(id<1){continue;}
+
+         if(verbose){System.err.println("Found = "+(hits+1)+"/"+maxBadKmers);}
+         if(hits==maxBadKmers){
+             //This hit pushes the read over maxBadKmers; credit the scaffold once.
+             if(scaffoldReadCountsT!=null){
+                 scaffoldReadCountsT[id]++;
+                 scaffoldBaseCountsT[id]+=seq.length;
+             }else{
+                 scaffoldReadCounts.addAndGet(id, 1);
+                 scaffoldBaseCounts.addAndGet(id, seq.length);
+             }
+             //Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
+             if(hitCounts==null){return hits+1;}
+         }
+         hits++;
+     }
+
+     if(hitCountsT!=null){hitCountsT[Tools.min(hits, HITCOUNT_LEN)]++;}
+     return hits;
+ }
+
+ /**
+ * Returns the id of the sequence with the most kmer matches to this read, or -1 if none are over maxBadKmers.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return id of best match
+ */
+ private final int findBestMatch(final Read r, final AbstractKmerTable sets[]){
+     idList.size=0;
+     if(r==null || r.length()<k){return -1;}
+     if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return -1;}
+     final byte[] bases=r.bases;
+     final int minlen=k-1;
+     final int minlen2=(maskMiddle ? k/2 : k);
+     final int shift=2*k;
+     final int shift2=shift-2;
+     final long mask=~((-1L)<<shift);
+     final long kmask=lengthMasks[k];
+     long kmer=0;
+     long rkmer=0;
+     int len=0;
+     int found=0;
+
+     final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+     final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+     /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+     for(int i=start; i<stop; i++){
+         byte b=bases[i];
+         long x=Dedupe.baseToNumber[b];
+         long x2=Dedupe.baseToComplementNumber[b];
+         kmer=((kmer<<2)|x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+         if(b=='N' && forbidNs){len=0;}else{len++;}
+         if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+         if(len>=minlen2 && i>=minlen){
+             final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+             if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+             if(id>0){
+                 //countArray[id] tallies hits per scaffold; idList remembers which entries were touched.
+                 countArray[id]++;
+                 if(countArray[id]==1){idList.add(id);}
+                 found++;
+                 if(verbose){System.err.println("Found = "+found+"/"+maxBadKmers);}
+             }
+         }
+     }
+
+     int id=-1;
+     if(found>0){
+         //Always condense when anything was counted: condenseLoose is what zeroes the touched
+         //countArray entries.  Skipping it for reads with 1..maxBadKmers hits would leave stale
+         //counts behind, so later reads would never see countArray[id]==1 and never list that id.
+         final int max=condenseLoose(countArray, idList, countList);
+
+         if(found>maxBadKmers){
+             //Pick the first listed scaffold that has the maximal hit count.
+             for(int i=0; i<countList.size; i++){
+                 if(countList.get(i)==max){
+                     id=idList.get(i); break;
+                 }
+             }
+             if(rename){rename(r, idList, countList);}
+
+             //Guard against an unresolved id so the counters are never indexed with -1.
+             if(id>0){
+                 if(scaffoldReadCountsT!=null){
+                     scaffoldReadCountsT[id]++;
+                     scaffoldBaseCountsT[id]+=bases.length;
+                 }else{
+                     scaffoldReadCounts.addAndGet(id, 1);
+                     scaffoldBaseCounts.addAndGet(id, bases.length);
+                 }
+             }
+         }
+     }
+
+     if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
+     return id;
+ }
+
+ /** Estimates kmer hit counts for kmers longer than k using consecutive matches.
+ * Hits at consecutive tested positions are grouped into runs; each run contributes
+ * (last hit index - first hit index - (kbig-k-1)) to the total when that value is positive.
+ * The scaffold counters are credited once, to the id of the most recent hit, when the total first exceeds maxBadKmers.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of sets of consecutive hits of exactly length kbig
+ */
+ private final int countSetKmersBig(final Read r, final AbstractKmerTable sets[]){
+ if(r==null || r.length()<kbig){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ assert(kbig>k);
+ // A run from bkStart to bkStop spans bkStop-bkStart+k bases, which holds bkStop-bkStart-sub windows of length kbig.
+ final int sub=kbig-k-1;
+ assert(sub>=0) : kbig+", "+sub;
+ final byte[] bases=r.bases;
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+
+ // Index of the first and last hit of the current run; -1 means no run is open.
+ int bkStart=-1;
+ int bkStop=-1;
+ // lastId keeps the id of the most recent hit so it can be credited after the run has closed.
+ int id=-1, lastId=-1;
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning7 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+ if(id>0){
+ // Hit: open a run if none is open, and extend it to this position.
+ lastId=id;
+ if(bkStart==-1){bkStart=i;}
+ bkStop=i;
+ }else{
+ // Miss: close any open run and add its contribution.
+ if(bkStart>-1){
+ int dif=bkStop-bkStart-sub;
+ bkStop=bkStart=-1;
+ if(dif>0){
+ int old=found;
+ found+=dif;
+ // Credit the scaffold only on the transition from <=maxBadKmers to >maxBadKmers.
+ if(found>maxBadKmers && old<=maxBadKmers){
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[lastId]++;
+ scaffoldBaseCountsT[lastId]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(lastId, 1);
+ scaffoldBaseCounts.addAndGet(lastId, bases.length);
+ }
+ if(hitCounts==null){
+ return found;
+ }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // This catches the case where valid kmers extend to the end of the read
+ if(bkStart>-1){
+ int dif=bkStop-bkStart-sub;
+ bkStop=bkStart=-1;
+ if(dif>0){
+ int old=found;
+ found+=dif;
+ if(found>maxBadKmers && old<=maxBadKmers){
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[lastId]++;
+ scaffoldBaseCountsT[lastId]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(lastId, 1);
+ scaffoldBaseCounts.addAndGet(lastId, bases.length);
+ }
+ }
+ }
+ }
+
+ // Histogram of hit totals; totals above HITCOUNT_LEN share the HITCOUNT_LEN bin.
+ if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
+ return found;
+ }
+
+ /**
+  * Trim a read to remove matching kmers and everything to their left or right.
+  * If no full-length kmer matches and useShortKmers is set, shorter kmers (at least mink bases)
+  * anchored at the scan boundary on the side being trimmed are tested as well.
+  * The scaffold counters are credited to the id of the first kmer that matched.
+  * @param r Read to process
+  * @param sets Kmer tables
+  * @param mode Trim direction; must be RIGHTMODE or LEFTMODE
+  * @return Number of bases trimmed
+  */
+ private final int ktrim(final Read r, final AbstractKmerTable[] sets, int mode){
+     assert(mode==RIGHTMODE || mode==LEFTMODE);
+     if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k))){return 0;}
+     if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+     if(verbose){System.err.println("KTrimming read "+r.id);}
+     final byte[] bases=r.bases;
+     final int minlen=k-1;
+     final int minlen2=(maskMiddle ? k/2 : k);
+     final int shift=2*k;
+     final int shift2=shift-2;
+     final long mask=~((-1L)<<shift);
+     final long kmask=lengthMasks[k];
+     long kmer=0;
+     long rkmer=0;
+     int found=0;
+     int len=0;
+     int id0=-1; //ID of first kmer found.
+
+     //minLoc/maxLoc bound the matched region inclusively; the *Exclusive variants are
+     //the bounds used when ktrimExclusive is set.  The initial values act as "unset" sentinels.
+     int minLoc=999999999, minLocExclusive=999999999;
+     int maxLoc=-1, maxLocExclusive=-1;
+
+     final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+     final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+     //Scan for normal kmers
+     for(int i=start; i<stop; i++){
+         byte b=bases[i];
+         long x=Dedupe.baseToNumber[b];
+         long x2=Dedupe.baseToComplementNumber[b];
+         kmer=((kmer<<2)|x)&mask;
+         rkmer=(rkmer>>>2)|(x2<<shift2);
+         if(b=='N' && forbidNs){len=0;}else{len++;}
+         if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+         if(len>=minlen2 && i>=minlen){
+             final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+             if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+             if(id>0){
+                 if(id0<0){id0=id;}
+                 minLoc=Tools.min(minLoc, i-k+1);
+                 assert(minLoc>=0);
+                 maxLoc=i;
+                 found++;
+             }
+         }
+     }
+
+     //Only derive the exclusive bounds if the inclusive ones moved off their sentinels.
+     if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;}
+     if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;}
+
+     //If nothing was found, scan for short kmers.  Only used for trimming.
+     if(useShortKmers && found==0){
+         //Fixed: the message previously omitted the value of k.
+         assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+k+", mink="+mink;
+
+         //Look for short kmers on left side
+         //(the NMODE alternative is unreachable here given the assertion at the top)
+         if(mode==LEFTMODE || mode==NMODE){
+             kmer=0;
+             rkmer=0;
+             len=0;
+             final int lim=Tools.min(k, stop);
+             for(int i=start; i<lim; i++){
+                 byte b=bases[i];
+                 long x=Dedupe.baseToNumber[b];
+                 long x2=Dedupe.baseToComplementNumber[b];
+                 kmer=((kmer<<2)|x)&mask;
+                 rkmer=rkmer|(x2<<(2*len));
+                 len++;
+                 if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+                 if(len>=mink){
+
+                     if(verbose){
+                         System.err.println("Looking for left kmer "+AminoAcid.kmerToString(kmer, len));
+                         System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len));
+                     }
+                     final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+                     if(id>0){
+                         if(id0<0){id0=id;}
+                         if(verbose){System.err.println("Found "+kmer);}
+                         minLoc=0;
+                         minLocExclusive=Tools.min(minLocExclusive, i+1);
+                         maxLoc=Tools.max(maxLoc, i);
+                         maxLocExclusive=Tools.max(maxLocExclusive, 0);
+                         found++;
+                     }
+                 }
+             }
+         }
+
+         //Look for short kmers on right side
+         if(mode==RIGHTMODE || mode==NMODE){
+             kmer=0;
+             rkmer=0;
+             len=0;
+             final int lim=Tools.max(-1, stop-k);
+             for(int i=stop-1; i>lim; i--){
+                 byte b=bases[i];
+                 long x=Dedupe.baseToNumber[b];
+                 long x2=Dedupe.baseToComplementNumber[b];
+                 kmer=kmer|(x<<(2*len));
+                 rkmer=((rkmer<<2)|x2)&mask;
+                 len++;
+                 if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+                 if(len>=mink){
+
+                     final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+                     if(verbose){System.err.println("Looking for right kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; id="+id+"; kmask="+lengthMasks[len]);}
+                     if(id>0){
+                         if(id0<0){id0=id;}
+                         if(verbose){System.err.println("Found "+kmer);}
+                         minLoc=i;
+                         minLocExclusive=Tools.min(minLocExclusive, bases.length);
+                         maxLoc=bases.length-1;
+                         maxLocExclusive=Tools.max(maxLocExclusive, i-1);
+                         found++;
+                     }
+                 }
+             }
+         }
+     }
+
+
+     if(verbose){System.err.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);}
+
+     if(found==0){return 0;}
+     assert(found>0) : "Overflow in 'found' variable.";
+
+     {//Increment counter for the scaffold whose kmer was first detected
+         if(scaffoldReadCountsT!=null){
+             scaffoldReadCountsT[id0]++;
+             scaffoldBaseCountsT[id0]+=bases.length;
+         }else{
+             scaffoldReadCounts.addAndGet(id0, 1);
+             scaffoldBaseCounts.addAndGet(id0, bases.length);
+         }
+     }
+
+     //Widen the matched region by trimPad on both sides, clamped to [0, bases.length].
+     if(trimPad!=0){
+         maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length);
+         minLoc=Tools.mid(0, minLoc-trimPad, bases.length);
+         maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length);
+         minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length);
+     }
+
+     if(mode==LEFTMODE){ //Trim from the read start to the rightmost kmer base
+         if(verbose){System.err.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);}
+         int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1);
+         if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));}
+         return x;
+     }else{ //Trim from the leftmost kmer base to the read stop
+         assert(mode==RIGHTMODE);
+         if(verbose){System.err.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));}
+         int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1);
+         if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));}
+         return x;
+     }
+ }
+
+
+ /**
+ * Mask a read to cover matching kmers.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of bases masked
+ */
+ private final int kmask(final Read r, final AbstractKmerTable[] sets){
+ assert(ktrimN);
+ if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k))){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ if(verbose){System.err.println("KMasking read "+r.id);}
+ final byte[] bases=r.bases, quals=r.quality;
+ if(bases==null || bases.length<k){return 0;}
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+ int id0=-1; //ID of first kmer found.
+
+ BitSet bs=new BitSet(bases.length+trimPad+1);
+
+ final int minus=k-1-trimPad;
+ final int plus=trimPad+1;
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ //Scan for normal kmers
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("a: Found "+kmer);
+ System.err.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus));
+ System.err.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k);
+ }
+ bs.set(Tools.max(0, i-minus), i+plus);
+ found++;
+ }
+ }
+ }
+
+ //If nothing was found, scan for short kmers.
+ if(useShortKmers){
+ assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
+
+ //Look for short kmers on left side
+ {
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.min(k, stop);
+ for(int i=start; i<lim; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=rkmer|(x2<<(2*len));
+ len++;
+ if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+
+ if(verbose){
+ System.err.println("Looking for left kmer "+AminoAcid.kmerToString(kmer, len));
+ System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len));
+ }
+ final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("b: Found "+kmer);
+ System.err.println("Setting "+0+", "+(i+plus));
+ }
+ bs.set(0, i+plus);
+ found++;
+ }
+ }
+ }
+ }
+
+ //Look for short kmers on right side
+ {
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.max(-1, stop-k);
+ for(int i=stop-1; i>lim; i--){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=kmer|(x<<(2*len));
+ rkmer=((rkmer<<2)|x2)&mask;
+ len++;
+ if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+
+ final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ if(verbose){System.err.println("Looking for right kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; id="+id+"; kmask="+lengthMasks[len]);}
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("c: Found "+kmer);
+ System.err.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length);
+ }
+ bs.set(Tools.max(0, i-trimPad), bases.length);
+ found++;
+ }
+ }
+ }
+ }
+ }
+
+
+ if(verbose){System.err.println("found="+found+", bitset="+bs);}
+
+ if(found==0){return 0;}
+ assert(found>0) : "Overflow in 'found' variable.";
+
+ {//Increment counter for the scaffold whose kmer was first detected
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[id0]++;
+ scaffoldBaseCountsT[id0]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(id0, 1);
+ scaffoldBaseCounts.addAndGet(id0, bases.length);
+ }
+ }
+
+ int cardinality=bs.cardinality();
+ assert(cardinality>0);
+
+ //Replace kmer hit zone with the trim symbol
+ for(int i=0; i<bases.length; i++){
+ if(bs.get(i)){
+ if(kmaskLowercase){
+ bases[i]=(byte)Character.toLowerCase(bases[i]);
+ }else{
+ bases[i]=trimSymbol;
+ if(quals!=null && trimSymbol=='N'){quals[i]=0;}
+ }
+ }
+ }
+ return cardinality;
+ }
+
+ /**
+ * @param r
+ * @param idList
+ * @param countList
+ */
+ private void rename(Read r, IntList idList, IntList countList) {
+ if(r==null || idList.size<1){return;}
+ StringBuilder sb=new StringBuilder();
+ if(r.id==null){sb.append(r.numericID);}
+ else{sb.append(r.id);}
+ for(int i=0; i<idList.size; i++){
+ int id=idList.get(i);
+ int count=countList.get(i);
+ sb.append('\t');
+ sb.append(scaffoldNames.get(id));
+ sb.append('=');
+ sb.append(count);
+ }
+ r.id=sb.toString();
+ }
+
+ /**
+  * Pack a list of counts from an array to an IntList, zeroing each array entry that is read.
+  * @param loose Counter array, indexed by the values in packed
+  * @param packed Unique values
+  * @param counts Cleared, then filled with loose[value] for each value in packed, in order
+  * @return The largest count copied, or 0 if packed is empty
+  */
+ private int condenseLoose(int[] loose, IntList packed, IntList counts){
+     counts.size=0;
+
+     int best=0;
+     for(int idx=0; idx<packed.size; idx++){
+         final int key=packed.get(idx);
+         final int tally=loose[key];
+         loose[key]=0; //reset so the counter array can be reused
+         counts.add(tally);
+         if(tally>best){best=tally;}
+     }
+     return best;
+ }
+
+ /**
+  * Returns the larger of the two reads' expectedErrors(false, -1) values; a null read counts as 0.
+  */
+ private float expectedErrors(Read r1, Read r2){
+     final float first, second;
+     if(r1!=null){first=r1.expectedErrors(false, -1);}
+     else{first=0;}
+     if(r2!=null){second=r2.expectedErrors(false, -1);}
+     else{second=0;}
+     return Tools.max(first, second);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Entropy Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Averages calcEntropy over a sliding window moved along the sequence.
+ * A leading index adds one kmer per base and a trailing index, window positions behind,
+ * removes the kmer that was added at that earlier position.
+ * @param bases Sequence to measure; bases that are not fully defined are treated as 'A'
+ * @param k Kmer length; must be greater than 0
+ * @param window Window length; countCounts[0] must equal this on entry
+ * @param counts Scratch array indexed by kmer value; must be all zero on entry (checked when verify is set)
+ * @param countCounts Scratch histogram, adjusted each time a kmer's count changes; entries 1 and up must be zero on entry (checked when verify is set)
+ * @param kmerspace Passed through to calcEntropy
+ * @param verify Enables additional consistency assertions
+ * @return Sum of the per-position entropy values divided by the number of measurements (divisor is at least 1)
+ */
+ private float averageEntropy(final byte[] bases, final int k,
+ final int window, final short[] counts, final short[] countCounts, final int kmerspace, boolean verify){
+ assert(k>0) : "k must be greater than 0";
+// Arrays.fill(counts, 0);
+
+ assert(countCounts[0]==window);
+ if(verify){
+ for(int c : counts){assert(c==0);}
+ for(int i=1; i<countCounts.length; i++){assert(countCounts[i]==0);}
+ }
+
+ // For k>15 the shift 2*k would reach 32 bits or more, so all bits are kept instead.
+ final int mask=(k>15 ? -1 : ~((-1)<<(2*k)));
+ // Number of kmers whose count is currently nonzero.
+ int current=0;
+ //int ns=0;
+ // kmer follows the leading index i; kmer2 replays the same sequence at the trailing index i2.
+ int kmer=0, kmer2=0;
+
+ double entropySum=0;
+ int entropyMeasurements=0;
+
+ // i2 trails i by exactly window positions; the loop runs until the trailing index has passed the last base.
+ for(int i=0, i2=-window; i2<bases.length; i++, i2++){
+
+// System.err.println("\nStart: i="+i+", current="+current+", ns="+ns+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts));
+
+ // Leading edge: add the kmer ending at i.
+ if(i<bases.length){
+ byte b=bases[i];
+ if(!AminoAcid.isFullyDefined(b)){
+// ns++;
+ b='A';
+ }
+ final int n=Dedupe.baseToNumber[b];
+ kmer=((kmer<<2)|n)&mask;
+
+ if(counts[kmer]<1){
+ assert(counts[kmer]==0);
+ current++;
+ }
+ // Move this kmer from its old count bin to the next one.
+ countCounts[counts[kmer]]--;
+ assert(countCounts[counts[kmer]]>=-1): i+", "+current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
+ counts[kmer]++;
+ assert(counts[kmer]<=window+1) : Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
+ countCounts[counts[kmer]]++;
+ if(verify){
+ assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
+ assert(Tools.sum(countCounts)>0 && (Tools.sum(countCounts)<=window+1)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+// System.err.println("Added "+kmer+"; counts["+kmer+"]="+counts[kmer]);
+ }
+
+ // Trailing edge: remove the kmer that was added window positions earlier.
+ if(i2>=0){
+ byte b2=bases[i2];
+ if(!AminoAcid.isFullyDefined(b2)){
+// ns--;
+ b2='A';
+ }
+ final int n2=Dedupe.baseToNumber[b2];
+ kmer2=((kmer2<<2)|n2)&mask;
+
+ // Move this kmer from its old count bin to the previous one.
+ countCounts[counts[kmer2]]--;
+ assert(countCounts[counts[kmer2]]>=0);
+ counts[kmer2]--;
+ countCounts[counts[kmer2]]++;
+ if(counts[kmer2]<1){
+ assert(counts[kmer2]==0) : Arrays.toString(counts);
+ current--;
+ }
+ if(verify){
+ assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
+ assert(Tools.sum(countCounts)>=0 && (Tools.sum(countCounts)<=window)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+// System.err.println("Removed "+kmer2+"; count="+counts[kmer2]);
+ }
+
+ if(verify && i2>-1 && i<bases.length){
+ assert(Tools.sum(counts)==window);
+ assert(Tools.sum(countCounts)==window): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+ // Measure only when window bases have been added (i>=window-1) and the leading edge is still inside the sequence.
+ if(i2>=-1 && i<bases.length){
+ float e=calcEntropy(countCounts, window, kmerspace);
+ entropySum+=e;
+ entropyMeasurements++;
+ }
+ }
+
+// System.err.println(" *** ");
+// System.err.println(entropySum+", "+entropyMeasurements+", "+(entropySum/(Tools.max(1, entropyMeasurements))));
+// System.err.println(window+", "+k+", "+kmerspace+", "+counts.length+", "+countCounts.length);
+// System.err.println(" *** ");
+
+ // Sequences shorter than the window yield no measurements, so the result is then 0.
+ return (float)(entropySum/(Tools.max(1, entropyMeasurements)));
+ }
+
+ private float calcEntropy(short[] countCounts, int window, int kmerspace){
+ double sum=0;
+ for(int i=1; i<countCounts.length; i++){
+ int cc=countCounts[i];
+ double pklogpk=entropy[i];
+ sum+=(cc*pklogpk);
+ }
+// System.err.println("sum = "+sum);
+// System.err.println("entropy = "+(sum*entropyMult));
+ return (float)(sum*entropyMult);
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Input read stream */
+ private final ConcurrentReadInputStream cris;
+ /** Output read streams */
+ private final ConcurrentReadOutputStream ros, rosb, ross;
+
+ private final ReadStats readstats;
+ private final int[] overlapVector;
+ private final int[] countArray;
+
+ private final IntList idList;
+ private final IntList countList;
+
+ long[] hitCountsT;
+ long[] scaffoldReadCountsT;
+ long[] scaffoldBaseCountsT;
+
+ final short[] entropyCounts;
+ final short[] entropyCountCounts;
+
+ private float[] aprob, bprob;
+
+ private long readsInT=0;
+ private long basesInT=0;
+ private long readsOutT=0;
+ private long basesOutT=0;
+
+ private long readsQTrimmedT=0;
+ private long basesQTrimmedT=0;
+ private long readsFTrimmedT=0;
+ private long basesFTrimmedT=0;
+ private long readsQFilteredT=0;
+ private long basesQFilteredT=0;
+ private long readsEFilteredT=0;
+ private long basesEFilteredT=0;
+
+ private long readsKTrimmedT=0;
+ private long basesKTrimmedT=0;
+ private long readsKFilteredT=0;
+ private long basesKFilteredT=0;
+
+ private long readsTrimmedByOverlapT=0;
+ private long basesTrimmedByOverlapT=0;
+
+ private long badGcBasesT=0;
+ private long badGcReadsT=0;
+
+ private int xsum=0, rktsum=0;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Object holding a String and numbers, for tracking the number of read and base hits per scaffold.
+ */
+ private static class StringNum implements Comparable<StringNum>{
+
+ public StringNum(String name_, int len_, long reads_, long bases_){
+ name=name_;
+ length=len_;
+ reads=reads_;
+ bases=bases_;
+ }
+ public final int compareTo(StringNum o){
+ if(bases!=o.bases){return o.bases>bases ? 1 : -1;}
+ if(reads!=o.reads){return o.reads>reads ? 1 : -1;}
+ return name.compareTo(o.name);
+ }
+ public final boolean equals(StringNum o){
+ return compareTo(o)==0;
+ }
+ public final String toString(){
+ return name+"\t"+length+"\t"+reads+"\t"+bases;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ public final String name;
+ public final int length;
+ public final long reads, bases;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Current available memory */
+ private static final long freeMemory(){
+     //Value reported by the JVM runtime for this process
+     return Runtime.getRuntime().freeMemory();
+ }
+
+ /**
+ * Transforms a kmer into a canonical value stored in the table. Expected to be inlined.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param lengthMask Bitmask with single '1' set to left of kmer
+ * @return Canonical value
+ */
+ private final long toValue(long kmer, long rkmer, long lengthMask){
+     assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+     //With rcomp, the larger of the forward and reverse kmer is the canonical one.
+     final long canonical;
+     if(rcomp){
+         canonical=Tools.max(kmer, rkmer);
+     }else{
+         canonical=kmer;
+     }
+     return (canonical&middleMask)|lengthMask;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** For calculating kmer cardinality */
+ private final LogLog loglog;
+
+ /** Has this class encountered errors while processing? */
+ public boolean errorState=false;
+
+ /** Fraction of available memory preallocated to arrays */
+ private double preallocFraction=1.0;
+ /** Initial size of data structures */
+ private int initialSize=-1;
+
+ /** Hold kmers for filtering. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
+ private final AbstractKmerTable[] filterMaps;
+ /** Hold kmers for masking */
+ private final AbstractKmerTable[] maskMaps;
+ /** Hold kmers for trimming right (3') */
+ private final AbstractKmerTable[] trimRightMaps;
+ /** Hold kmers for trimming left (5') */
+ private final AbstractKmerTable[] trimLeftMaps;
+
+ /** A scaffold's name is stored at scaffoldNames.get(id).
+ * scaffoldNames[0] is reserved, so the first id is 1. */
+ private final ArrayList<String> scaffoldNames=new ArrayList<String>();
+ /** Names of reference files (refNames[0] is valid). */
+ private final ArrayList<String> refNames=new ArrayList<String>();
+ /** Number of scaffolds per reference. */
+ private final int[] refScafCounts;
+ /** scaffoldReadCounts[id] stores the number of reads with kmer matches to that scaffold */
+ private AtomicLongArray scaffoldReadCounts;
+ /** scaffoldBaseCounts[id] stores the number of bases with kmer matches to that scaffold */
+ private AtomicLongArray scaffoldBaseCounts;
+ /** Set to false to force threads to share atomic counter arrays. */
+ private boolean ALLOW_LOCAL_ARRAYS=true;
+ /** scaffoldLengths[id] stores the length of that scaffold */
+ private IntList scaffoldLengths=new IntList();
+ /** hitCounts[x] stores the number of reads with exactly x kmer matches */
+ private long[] hitCounts;
+ /** Array of reference files from which to load kmers */
+ private String[] refFilter=null;
+ /** Array of reference files from which to load kmers */
+ private String[] refMask=null;
+ /** Array of reference files from which to load kmers */
+ private String[] refRight=null;
+ /** Array of reference files from which to load kmers */
+ private String[] refLeft=null;
+ /** Array of literal strings from which to load kmers */
+ private String[] literalFilter=null;
+ /** Array of literal strings from which to load kmers */
+ private String[] literalMask=null;
+ /** Array of literal strings from which to load kmers */
+ private String[] literalRight=null;
+ /** Array of literal strings from which to load kmers */
+ private String[] literalLeft=null;
+
+ /** Input reads */
+ private String in1=null, in2=null;
+ /** Input qual files */
+ private String qfin1=null, qfin2=null;
+ /** Output reads (unmatched and at least minlen) */
+ private String out1=null, out2=null;
+ /** Output reads (matched or shorter than minlen) */
+ private String outb1=null, outb2=null;
+ /** Output reads whose mate was discarded */
+ private String outsingle=null;
+ /** Statistics output files */
+ private String outstats=null, outduk=null, outrqc=null, outrpkm=null, outrefstats=null;
+
+ /** Optional file for quality score recalibration */
+ private String samFile=null;
+
+ /** Dump kmers here. */
+ private String dump=null;
+
+ /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */
+ private long maxReads=-1;
+ /** Process this fraction of input reads. */
+ private float samplerate=1f;
+ /** Set samplerate seed to this value. */
+ private long sampleseed=-1;
+
+ /** Output reads in input order. May reduce speed. */
+ private final boolean ORDERED;
+ /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */
+ private boolean useShortKmers=false;
+ /** Make the middle base in a kmer a wildcard to improve sensitivity */
+ private boolean maskMiddle=true;
+
+ /** Store reference kmers with up to this many substitutions */
+ private int hammingDistance=0;
+ /** Search for query kmers with up to this many substitutions */
+ private int qHammingDistance=0;
+ /** Store reference kmers with up to this many edits (including indels) */
+ private int editDistance=0;
+ /** Store short reference kmers with up to this many substitutions */
+ private int hammingDistance2=-1;
+ /** Search for short query kmers with up to this many substitutions */
+ private int qHammingDistance2=-1;
+ /** Store short reference kmers with up to this many edits (including indels) */
+ private int editDistance2=-1;
+ /** Never skip more than this many consecutive kmers when hashing reference. */
+ private int maxSkip=1;
+ /** Always skip at least this many consecutive kmers when hashing reference.
+ * 1 means every kmer is used, 2 means every other, etc. */
+ private int minSkip=1;
+
+ /** Trim this much extra around matched kmers */
+ private int trimPad;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Entropy Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int entropyK=5;
+ private int entropyWindow=50;
+ private float entropyCutoff=-1;
+ private boolean verifyEntropy=false;
+
+ private final boolean calcEntropy;
+ private final int entropyKmerspace;
+ private final double entropyMult;
+ private final double[] entropy;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Statistics ----------------*/
+ /*--------------------------------------------------------------*/
+
+ long readsIn=0;
+ long basesIn=0;
+ long readsOut=0;
+ long basesOut=0;
+
+ long readsQTrimmed=0;
+ long basesQTrimmed=0;
+ long readsFTrimmed=0;
+ long basesFTrimmed=0;
+ long readsQFiltered=0;
+ long basesQFiltered=0;
+ long readsEFiltered=0;
+ long basesEFiltered=0;
+
+ long readsKTrimmed=0;
+ long basesKTrimmed=0;
+ long readsKFiltered=0;
+ long basesKFiltered=0;
+
+ long badGcReads;
+ long badGcBases;
+
+ long readsTrimmedByOverlap;
+ long basesTrimmedByOverlap;
+
+ long refReads=0;
+ long refBases=0;
+ long refKmers=0;
+
+ long storedKmersFilter=0;
+ long storedKmersMask=0;
+ long storedKmersRight=0;
+ long storedKmersLeft=0;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Don't look for kmers in read 1 */
+ private final boolean skipR1;
+ /** Don't look for kmers in read 2 */
+ private final boolean skipR2;
+ /** Correct errors via read overlap */
+ private final boolean ecc;
+
+ /** Look for reverse-complements as well as forward kmers. Default: true */
+ private final boolean rcomp;
+ /** Don't allow a read 'N' to match a reference 'A'.
+ * Reduces sensitivity when hdist>0 or edist>0. Default: false. */
+ private final boolean forbidNs;
+ /** AND bitmask with 0's at the middle base */
+ private final long middleMask;
+ /** Use HashForest data structure */
+ private final boolean useForest;
+ /** Use KmerTable data structure */
+ private final boolean useTable;
+ /** Use HashArray data structure (default) */
+ private final boolean useArray;
+
+ /** Normal kmer length */
+ private final int k;
+ /** k-1; used in some expressions */
+ private final int k2;
+ /** Emulated kmer greater than k */
+ private final int kbig;
+ /** Shortest kmer to use for trimming */
+ private final int mink;
+ /** A read may contain up to this many kmers before being considered a match. Default: 0 */
+ private final int maxBadKmers;
+
+ /** Recalibrate quality scores using matrices */
+ private final boolean recalibrateQuality;
+ /** Quality-trim the left side */
+ private final boolean qtrimLeft;
+ /** Quality-trim the right side */
+ private final boolean qtrimRight;
+ /** Trim bases at this quality or below. Default: 4 */
+ private final byte trimq;
+ /** Throw away reads below this average quality before trimming. Default: 0 */
+ private final byte minAvgQuality;
+ /** If positive, calculate average quality from the first X bases only. Default: 0 */
+ private final int minAvgQualityBases;
+ /** Throw away reads failing chastity filter (:Y: in read header) */
+ private final boolean chastityFilter;
+ /** Crash if a barcode is encountered that contains Ns or is not in the table */
+ private final boolean failBadBarcodes;
+ /** Remove reads with Ns in barcodes or that are not in the table */
+ private final boolean removeBadBarcodes;
+ /** Fail reads missing a barcode */
+ private final boolean failIfNoBarcode;
+ /** A set of valid barcodes; null if unused */
+ private final HashSet<String> barcodes;
+ /** Throw away reads containing more than this many Ns. Default: -1 (disabled) */
+ private final int maxNs;
+ /** Throw away reads without at least this many consecutive called bases. */
+ private int minConsecutiveBases=0;
+ /** Throw away reads containing fewer than this fraction of any particular base. */
+ private final float minBaseFrequency;
+ /** Throw away reads shorter than this after trimming. Default: 10 */
+ private final int minReadLength;
+ /** Throw away reads longer than this after trimming. Default: Integer.MAX_VALUE */
+ private final int maxReadLength;
+ /** Toss reads shorter than this fraction of initial length, after trimming */
+ private final float minLenFraction;
+ /** Filter reads by whether or not they have matching kmers */
+ private final boolean kfilter;
+ /** Trim matching kmers and all bases to the left */
+ private final boolean ktrimLeft;
+ /** Trim matching kmers and all bases to the right */
+ private final boolean ktrimRight;
+ /** Don't trim, but replace matching kmers with a symbol (default N) */
+ private final boolean ktrimN;
+ /** Exclude kmer itself when ktrimming */
+ private final boolean ktrimExclusive;
+ /** Replace bases covered by matched kmers with this symbol */
+ private final byte trimSymbol;
+ /** Convert masked bases to lowercase */
+ private final boolean kmaskLowercase;
+ /** Output over-trimmed reads to outbad (outmatch). If false, they are discarded. */
+ private final boolean addTrimmedToBad;
+ /** Find the sequence that shares the most kmer matches when filtering. */
+ private final boolean findBestMatch;
+ /** Trim pairs to the same length, when adapter-trimming */
+ private final boolean trimPairsEvenly;
+ /** Trim left bases of the read to this position (exclusive, 0-based) */
+ private final int forceTrimLeft;
+ /** Trim right bases of the read after this position (exclusive, 0-based) */
+ private final int forceTrimRight;
+ /** Trim this many rightmost bases of the read */
+ private final int forceTrimRight2;
+ /** Trim right bases of the read modulo this value.
+ * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */
+ private final int forceTrimModulo;
+
+ /** Discard reads with GC below this. */
+ private final float minGC;
+ /** Discard reads with GC above this. */
+ private final float maxGC;
+ /** Discard reads outside of GC bounds. */
+ private final boolean filterGC;
+
+ /** If positive, only look for kmer matches in the leftmost X bases */
+ private int restrictLeft;
+ /** If positive, only look for kmer matches in the rightmost X bases */
+ private int restrictRight;
+
+ /** Trim implied adapters based on overlap, for reads with insert size shorter than read length */
+ private final boolean trimByOverlap;
+ private final boolean useQualityForOverlap;
+ private final boolean strictOverlap;
+
+// private int minOverlap0=11;
+// private int minOverlap=24;
+// private final int overlapMargin=2;
+// private final int overlapMaxMismatches0=4;
+// private final int overlapMaxMismatches=4;
+// private final int overlapMinq=13;
+
+ private int minOverlap0=7;
+ private int minOverlap=14;
+ private int minInsert0=16;
+ private int minInsert=50;
+
+ private final float maxRatio;
+ private final float ratioMargin;
+ private final float ratioOffset;
+ private final float efilterRatio;
+ private final float efilterOffset;
+ private final float pfilterRatio;
+ private final float meeFilter;
+
+ /** True iff java was launched with the '-ea' flag */
+ private final boolean EA;
+ /** Skip this many initial input reads */
+ private final long skipreads;
+
+ /** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad.
+ * Default: true. */
+ private final boolean removePairsIfEitherBad;
+
+ /** Print only statistics for scaffolds that matched at least one read
+ * Default: true. */
+ private final boolean printNonZeroOnly;
+
+ /** Rename reads to indicate what they matched.
+ * Default: false. */
+ private final boolean rename;
+ /** Use names of reference files instead of scaffolds.
+ * Default: false. */
+ private final boolean useRefNames;
+
+ /** Fraction of kmers to skip, 0 to 15 out of 16 */
+ private final int speed;
+
+ /** Skip this many kmers when examining the read. Default 1.
+ * 1 means every kmer is used, 2 means every other, etc. */
+ private final int qSkip;
+
+ /** True if speed and qSkip are disabled. */
+ private final boolean noAccel;
+
+ private final boolean MAKE_QUALITY_ACCURACY;
+ private final boolean MAKE_QUALITY_HISTOGRAM;
+ private final boolean MAKE_MATCH_HISTOGRAM;
+ private final boolean MAKE_BASE_HISTOGRAM;
+
+ private final boolean MAKE_EHIST;
+ private final boolean MAKE_INDELHIST;
+ private final boolean MAKE_LHIST;
+ private final boolean MAKE_GCHIST;
+ private final boolean MAKE_IDHIST;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Number of tables (and threads, during loading) */
+ private static final int WAYS=7; //123
+ /** Default initial size of data structures */
+ private static final int initialSizeDefault=128000;
+ /** Verbose messages */
+ public static final boolean verbose=false; //123
+
+ /** Print messages to this stream */
+ private static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=true;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Print speed statistics upon completion */
+ public static boolean showSpeed=true;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Number of ProcessThreads */
+ public static int THREADS=Shared.threads();
+ /** Indicates end of input stream */
+ private static final ArrayList<Read> POISON=new ArrayList<Read>(0);
+ /** Number of columns for statistics output, 3 or 5 */
+ public static int STATS_COLUMNS=3;
+ /** Release memory used by kmer storage after processing reads */
+ public static boolean RELEASE_TABLES=true;
+ /** Max value of hitCount array */
+ public static final int HITCOUNT_LEN=1000;
+ /** Make unambiguous copies of ref sequences with ambiguous bases */
+ public static boolean REPLICATE_AMBIGUOUS=false;
+
+ /** x&clearMasks[i] will clear base i (zeros the two bits at positions 2*i and 2*i+1) */
+ private static final long[] clearMasks;
+ /** x|setMasks[j][i] will set base i to j, provided base i is already clear (first index: 2-bit value 0-3; second index: base position 0-31) */
+ private static final long[][] setMasks;
+ /** x&leftMasks[i] will clear all bases to the right of i (exclusive); bits 2*i and above are kept */
+ private static final long[] leftMasks;
+ /** x&rightMasks[i] will clear all bases to the left of i (inclusive); only bits below 2*i are kept */
+ private static final long[] rightMasks;
+ /** x|lengthMasks[i] will set bit 2*i, the bit just left of the leftmost base when x holds i bases */
+ private static final long[] lengthMasks;
+
+ public static HashMap<String,String> RQC_MAP=null;
+
+ public static final int FILTERMODE=0, RIGHTMODE=1, LEFTMODE=2, NMODE=3;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Initializers ----------------*/
+ /*--------------------------------------------------------------*/
+
+ static{
+ clearMasks=new long[32];
+ leftMasks=new long[32];
+ rightMasks=new long[32];
+ lengthMasks=new long[32];
+ setMasks=new long[4][32];
+ for(int i=0; i<32; i++){
+ clearMasks[i]=~(3L<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ leftMasks[i]=((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ rightMasks[i]=~((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ lengthMasks[i]=((1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ for(long j=0; j<4; j++){
+ setMasks[(int)j][i]=(j<<(2*i));
+ }
+ }
+ }
+
+}
diff --git a/current/jgi/BBDukF.java b/current/jgi/BBDukF.java
new file mode 100755
index 0000000..e74e46b
--- /dev/null
+++ b/current/jgi/BBDukF.java
@@ -0,0 +1,3839 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import kmer.AbstractKmerTable;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.Read;
+import stream.SamLine;
+
+import align2.IntList;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * Separates, trims, or masks sequences based on matching kmers in a reference.
+ * Supports Hamming and edit distance.
+ * Supports K 1-31 and emulated K>31.
+ * @author Brian Bushnell
+ * @date Aug 30, 2013
+ *
+ */
+public class BBDukF {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Command-line entry point: constructs a BBDukF from the arguments and runs it.
+ * If Parser.parseHelp returns true, prints the usage notice and exits with status 0.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ //Let Parser.parseConfig pre-process the raw argument list
+ final String[] expanded=Parser.parseConfig(args);
+
+ //Help request: show the usage notice and stop
+ final boolean wantsHelp=Parser.parseHelp(expanded, true);
+ if(wantsHelp){
+ printOptions();
+ System.exit(0);
+ }
+
+ new BBDukF(expanded).process();
+ }
+
+ /**
+ * Display usage information.
+ */
+ private static void printOptions(){
+ System.err.println("Please consult the shellscript for usage information.");
+ }
+
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public BBDukF(String[] args){
+ for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}}
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ System.err.println("BBDuk version "+Shared.BBMAP_VERSION_STRING);
+
+ /* Set global defaults */
+ ReadWrite.ZIPLEVEL=2;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=8;
+
+
+ ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+
+ /* Initialize local variables with defaults */
+ boolean setOut=false, setOutb=false;
+ boolean ktrimRight_=false, ktrimLeft_=false, ktrimN_=false, ktrimExclusive_=false;
+ boolean findBestMatch_=false;
+ boolean addTrimmedToBad_=true;
+ boolean rcomp_=true;
+ boolean forbidNs_=false;
+ boolean useForest_=false, useTable_=false, useArray_=true, prealloc_=false;
+ int k_=27, kbig_=-1;
+ int mink_=-1;
+ int ways_=-1; //Currently disabled
+ int maxBadKmers_=0;
+ long skipreads_=0;
+ byte TRIM_SYMBOL_='N';
+ boolean kmaskLowercase_=false;
+ boolean kmaskFullyCovered_=false;
+
+
+ Parser parser=new Parser();
+ parser.trimq=6;
+ parser.minAvgQuality=0;
+ parser.minReadLength=10;
+ parser.maxReadLength=Integer.MAX_VALUE;
+ parser.minLenFraction=0f;
+ parser.requireBothBad=false;
+ parser.maxNs=-1;
+ boolean trimByOverlap_=false, useQualityForOverlap_=false, strictOverlap_=true;
+ boolean trimPairsEvenly_=false;
+ boolean ordered_=false;
+ int minoverlap_=-1, mininsert_=-1;
+ int restrictLeft_=0, restrictRight_=0, speed_=0, qSkip_=1;
+ boolean printNonZeroOnly_=true;
+ boolean rename_=false, useRefNames_=false;
+ boolean skipr1_=false, skipr2_=false;
+ boolean ecc_=false;
+ float minBaseFrequency_=0;
+ float minKmerFraction_=0;
+ float minCoveredFraction_=0;
+
+ scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero
+ scaffoldLengths.add(0);
+
+ {
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseHist(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQualityAdjust(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCommon(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCardinality(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("qfin") || a.equals("qfin1")){
+ qfin1=b;
+ }else if(a.equals("qfin2")){
+ qfin2=b;
+ }else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") ||
+ a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){
+ out1=b;
+ setOut=true;
+ }else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") ||
+ a.equals("outnonmatched2") || a.equals("outunmatched2")){
+ out2=b;
+ }else if(a.equals("outb") || a.equals("outm") || a.equals("outb1") || a.equals("outm1") || a.equals("outbad") ||
+ a.equals("outbad1") || a.equals("outmatch") || a.equals("outmatch1")){
+ outb1=b;
+ setOut=true;
+ }else if(a.equals("outb2") || a.equals("outm2") || a.equals("outbad2") || a.equals("outmatch2")){
+ outb2=b;
+ }else if(a.equals("outs") || a.equals("outsingle")){
+ outsingle=b;
+ }else if(a.equals("stats") || a.equals("scafstats")){
+ outstats=b;
+ }else if(a.equals("refstats")){
+ outrefstats=b;
+ }else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("cov") || a.equals("coverage")){
+ outrpkm=b;
+ }else if(a.equals("sam") || a.equals("bam")){
+ samFile=b;
+ }else if(a.equals("duk") || a.equals("outduk")){
+ outduk=b;
+ }else if(a.equals("rqc")){
+ outrqc=b;
+ }else if(a.equals("ref")){
+ ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("literal")){
+ literal=(b==null) ? null : b.split(",");
+// assert(false) : b+", "+Arrays.toString(literal);
+ }else if(a.equals("forest")){
+ useForest_=Tools.parseBoolean(b);
+ if(useForest_){useTable_=useArray_=false;}
+ }else if(a.equals("table")){
+ useTable_=Tools.parseBoolean(b);
+ if(useTable_){useForest_=useArray_=false;}
+ }else if(a.equals("array")){
+ useArray_=Tools.parseBoolean(b);
+ if(useArray_){useTable_=useForest_=false;}
+ }else if(a.equals("ways")){
+ ways_=Integer.parseInt(b);
+ }else if(a.equals("ordered") || a.equals("ord")){
+ ordered_=Tools.parseBoolean(b);
+ System.err.println("Set ORDERED to "+ordered_);
+ }else if(a.equals("skipr1")){
+ skipr1_=Tools.parseBoolean(b);
+ }else if(a.equals("skipr2")){
+ skipr2_=Tools.parseBoolean(b);
+ }else if(a.equals("k")){
+ assert(b!=null) : "\nThe k key needs an integer value greater than 0, such as k=27\n";
+ k_=Integer.parseInt(b);
+ if(k_>31){
+ kbig_=k_;
+ k_=31;
+ }else{
+ kbig_=-1;
+ }
+ assert(k_>0 && k_<32) : "k must be at least 1; default is 27.";
+ }else if(a.equals("mink") || a.equals("kmin")){
+ mink_=Integer.parseInt(b);
+ assert(mink_<0 || (mink_>0 && mink_<32)) : "kmin must be between 1 and 31; default is 4, negative numbers disable it.";
+ }else if(a.equals("useshortkmers") || a.equals("shortkmers") || a.equals("usk")){
+ useShortKmers=Tools.parseBoolean(b);
+ }else if(a.equals("trimextra") || a.equals("trimpad") || a.equals("tp")){
+ trimPad=Integer.parseInt(b);
+ }else if(a.equals("hdist") || a.equals("hammingdistance")){
+ hammingDistance=Integer.parseInt(b);
+ assert(hammingDistance>=0 && hammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("qhdist") || a.equals("queryhammingdistance")){
+ qHammingDistance=Integer.parseInt(b);
+ assert(qHammingDistance>=0 && qHammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){
+ editDistance=Integer.parseInt(b);
+ assert(editDistance>=0 && editDistance<3) : "edit distance must be between 0 and 2; default is 0.\n" +
+ "You can bypass this error message with the -da flag, but edist=3 at K=31" +
+ "requires 15,000,000x the time and memory for indexing compared to edist=0.";
+ }else if(a.equals("hdist2") || a.equals("hammingdistance2")){
+ hammingDistance2=Integer.parseInt(b);
+ assert(hammingDistance2>=0 && hammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("qhdist2") || a.equals("queryhammingdistance2")){
+ qHammingDistance2=Integer.parseInt(b);
+ assert(qHammingDistance2>=0 && qHammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("edits2") || a.equals("edist2") || a.equals("editdistance2")){
+ editDistance2=Integer.parseInt(b);
+ assert(editDistance2>=0 && editDistance2<3) : "edit distance must be between 0 and 2; default is 0.";
+ }else if(a.equals("maxskip") || a.equals("maxrskip") || a.equals("mxs")){
+ maxSkip=Integer.parseInt(b);
+ }else if(a.equals("minskip") || a.equals("minrskip") || a.equals("mns")){
+ minSkip=Integer.parseInt(b);
+ }else if(a.equals("skip") || a.equals("refskip") || a.equals("rskip")){
+ minSkip=maxSkip=Integer.parseInt(b);
+ }else if(a.equals("qskip")){
+ qSkip_=Integer.parseInt(b);
+ }else if(a.equals("speed")){
+ speed_=Integer.parseInt(b);
+ assert(speed_>=0 && speed_<=15) : "Speed range is 0 to 15. Value: "+speed_;
+ }else if(a.equals("skipreads")){
+ skipreads_=Tools.parseKMG(b);
+ }else if(a.equals("maxbadkmers") || a.equals("mbk")){
+ maxBadKmers_=Integer.parseInt(b);
+ }else if(a.equals("minhits") || a.equals("minkmerhits") || a.equals("mkh")){
+ maxBadKmers_=Integer.parseInt(b)-1;
+ }else if(a.equals("minkmerfraction") || a.equals("minfraction") || a.equals("mkf")){
+ minKmerFraction_=Float.parseFloat(b);
+ }else if(a.equals("mincoveredfraction") || a.equals("mincovfraction") || a.equals("mcf")){
+ minCoveredFraction_=Float.parseFloat(b);
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ assert(WAYS>1) : "WAYS=1 is for debug mode.";
+// verbose=Tools.parseBoolean(b); //123
+ if(verbose){outstream=System.err;} //For some reason System.out does not print in verbose mode.
+ }else if(a.equals("mm") || a.equals("maskmiddle")){
+ maskMiddle=Tools.parseBoolean(b);
+ }else if(a.equals("rcomp")){
+ rcomp_=Tools.parseBoolean(b);
+ }else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){
+ forbidNs_=Tools.parseBoolean(b);
+ }else if(a.equals("findbestmatch") || a.equals("fbm")){
+ findBestMatch_=Tools.parseBoolean(b);
+ }else if(a.equals("kfilter")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){ktrimLeft_=ktrimRight_=ktrimN_=false;}
+ }else if(a.equals("ktrim")){
+ if(b==null){b="";}
+ if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;ktrimRight_=false;ktrimN_=false;}
+ else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimLeft_=false;ktrimRight_=true;ktrimN_=false;}
+ else if(b.equalsIgnoreCase("n")){ktrimLeft_=false;ktrimRight_=false;ktrimN_=true;}
+ else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){
+ ktrimLeft_=false;ktrimRight_=false;ktrimN_=true;
+ TRIM_SYMBOL_=(byte)b.charAt(0);
+ }else{
+ boolean x=Tools.parseBoolean(b);
+ assert(!x) : "\nInvalid setting for ktrim - values must be f (false), l (left), r (right), or n.";
+ ktrimRight_=ktrimLeft_=ktrimN_=x;
+ }
+ }else if(a.equals("kmask") || a.equals("mask")){
+ if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){
+ kmaskLowercase_=true;
+ ktrimLeft_=false;ktrimRight_=false;ktrimN_=true;
+ }else{
+ if(Tools.parseBoolean(b)){b="N";}
+ if(b.length()==1 && !b.equalsIgnoreCase("f")){
+ ktrimLeft_=false;ktrimRight_=false;ktrimN_=true;
+ TRIM_SYMBOL_=(byte)b.charAt(0);
+ }else{
+ boolean x=Tools.parseBoolean(b);
+ assert(!x) : "\nInvalid setting for kmask - values must be f (false), t (true), or a single character for replacement.";
+ ktrimRight_=ktrimLeft_=ktrimN_=x;
+ }
+ }
+ }else if(a.equals("kmaskfullycovered") || a.equals("maskfullycovered") || a.equals("mfc")){
+ kmaskFullyCovered_=Tools.parseBoolean(b);
+ }else if(a.equals("ktrimright")){
+ ktrimRight_=Tools.parseBoolean(b);
+ ktrimLeft_=ktrimN_=!(ktrimRight_);
+ }else if(a.equals("ktrimleft")){
+ ktrimLeft_=Tools.parseBoolean(b);
+ ktrimRight_=ktrimN_=!(ktrimLeft_);
+ }else if(a.equals("ktrimn")){
+ ktrimN_=Tools.parseBoolean(b);
+ ktrimLeft_=ktrimRight_=!(ktrimN_);
+ }else if(a.equals("ktrimexclusive")){
+ ktrimExclusive_=Tools.parseBoolean(b);
+ }else if(a.equals("tbo") || a.equals("trimbyoverlap")){
+ trimByOverlap_=Tools.parseBoolean(b);
+ }else if(a.equals("strictoverlap")){
+ strictOverlap_=Tools.parseBoolean(b);
+ }else if(a.equals("usequality")){
+ useQualityForOverlap_=Tools.parseBoolean(b);
+ }else if(a.equals("tpe") || a.equals("tbe") || a.equals("trimpairsevenly")){
+ trimPairsEvenly_=Tools.parseBoolean(b);
+ }else if(a.equals("ottm") || a.equals("outputtrimmedtomatch")){
+ addTrimmedToBad_=Tools.parseBoolean(b);
+ }else if(a.equals("minoverlap")){
+ minoverlap_=Integer.parseInt(b);
+ }else if(a.equals("mininsert")){
+ mininsert_=Integer.parseInt(b);
+ }else if(a.equals("prealloc") || a.equals("preallocate")){
+ if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
+ prealloc_=Tools.parseBoolean(b);
+ }else{
+ preallocFraction=Tools.max(0, Double.parseDouble(b));
+ prealloc_=(preallocFraction>0);
+ }
+ }else if(a.equals("restrictleft")){
+ restrictLeft_=Integer.parseInt(b);
+ }else if(a.equals("restrictright")){
+ restrictRight_=Integer.parseInt(b);
+ }else if(a.equals("statscolumns") || a.equals("columns") || a.equals("cols")){
+ STATS_COLUMNS=Integer.parseInt(b);
+ assert(STATS_COLUMNS==3 || STATS_COLUMNS==5) : "statscolumns bust be either 3 or 5. Invalid value: "+STATS_COLUMNS;
+ }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+ printNonZeroOnly_=Tools.parseBoolean(b);
+ }else if(a.equals("rename")){
+ rename_=Tools.parseBoolean(b);
+ }else if(a.equals("refnames") || a.equals("userefnames")){
+ useRefNames_=Tools.parseBoolean(b);
+ }else if(a.equals("initialsize")){
+ initialSize=(int)Tools.parseKMG(b);
+ }else if(a.equals("dump")){
+ dump=b;
+ }else if(a.equals("entropyk") || a.equals("ek")){
+ entropyK=Integer.parseInt(b);
+ }else if(a.equals("entropywindow") || a.equals("ew")){
+ entropyWindow=Integer.parseInt(b);
+ }else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){
+ entropyCutoff=Float.parseFloat(b);
+ }else if(a.equals("verifyentropy")){
+ verifyEntropy=Tools.parseBoolean(b);
+ }else if(a.equals("minbasefrequency")){
+ minBaseFrequency_=Float.parseFloat(b);
+ }else if(a.equals("ecco") || a.equals("ecc")){
+ ecc_=Tools.parseBoolean(b);
+ }else if(a.equals("copyundefined") || a.equals("cu")){
+ REPLICATE_AMBIGUOUS=Tools.parseBoolean(b);
+ }else if(a.equals("path")){
+ Data.setPath(b);
+ }else if(a.equals("maxbasesoutm")){
+ maxBasesOutm=Tools.parseKMG(b);
+ }else if(a.equals("maxbasesoutu") || a.equals("maxbasesout")){
+ maxBasesOutu=Tools.parseKMG(b);
+ }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ in1=args[i];
+ }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ out1=args[i];
+ setOut=true;
+ }else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(","));
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ if(hammingDistance2==-1){hammingDistance2=hammingDistance;}
+ if(qHammingDistance2==-1){qHammingDistance2=qHammingDistance;}
+ if(editDistance2==-1){editDistance2=editDistance;}
+ minBaseFrequency=minBaseFrequency_;
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ samplerate=parser.samplerate;
+ sampleseed=parser.sampleseed;
+ recalibrateQuality=parser.recalibrateQuality;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+// testsize=parser.testsize;
+// trimBadSequence=parser.trimBadSequence;
+// breakLength=parser.breakLength;
+
+ forceTrimModulo=parser.forceTrimModulo;
+ forceTrimLeft=parser.forceTrimLeft;
+ forceTrimRight=parser.forceTrimRight;
+ forceTrimRight2=parser.forceTrimRight2;
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ trimq=parser.trimq;
+ minLenFraction=parser.minLenFraction;
+ minAvgQuality=parser.minAvgQuality;
+ minAvgQualityBases=parser.minAvgQualityBases;
+ chastityFilter=parser.chastityFilter;
+ failBadBarcodes=parser.failBadBarcodes;
+ removeBadBarcodes=parser.removeBadBarcodes;
+ failIfNoBarcode=parser.failIfNoBarcode;
+ barcodes=parser.barcodes;
+ minReadLength=parser.minReadLength;
+ maxReadLength=parser.maxReadLength;
+ maxNs=parser.maxNs;
+ minConsecutiveBases=parser.minConsecutiveBases;
+// untrim=parser.untrim;
+// minTrimLength=(parser.minTrimLength>=0 ? parser.minTrimLength : minTrimLength);
+// requireBothBad=parser.requireBothBad;
+ removePairsIfEitherBad=!parser.requireBothBad;
+
+ minGC=parser.minGC;
+ maxGC=parser.maxGC;
+ filterGC=(minGC>0 || maxGC<1);
+
+ loglog=(parser.loglog ? new LogLog(parser) : null);
+
+ THREADS=Shared.threads();
+ }
+
+ if(ref!=null){
+ for(String s : ref){refNames.add(s);}
+ }
+ if(literal!=null){refNames.add("literal");}
+ refScafCounts=new int[refNames.size()];
+
+ if(minoverlap_>=0){
+ minOverlap=Tools.max(minoverlap_, 1);
+ minOverlap0=Tools.min(minOverlap0, minOverlap);
+ }
+
+ if(mininsert_>=0){
+ minInsert=Tools.max(mininsert_, 1);
+ minInsert0=Tools.min(minInsert0, minInsert);
+ }
+
+ /* Set final variables; post-process and validate argument combinations */
+
+ useForest=useForest_;
+ useTable=useTable_;
+ useArray=useArray_;
+ hammingDistance=Tools.max(editDistance, hammingDistance);
+ hammingDistance2=Tools.max(editDistance2, hammingDistance2);
+ minSkip=Tools.max(1, Tools.min(minSkip, maxSkip));
+ maxSkip=Tools.max(minSkip, maxSkip);
+ addTrimmedToBad=addTrimmedToBad_;
+ rcomp=rcomp_;
+ forbidNs=(forbidNs_ || hammingDistance<1);
+ trimSymbol=TRIM_SYMBOL_;
+ kmaskLowercase=kmaskLowercase_;
+ kmaskFullyCovered=kmaskFullyCovered_;
+ skipreads=skipreads_;
+ trimByOverlap=trimByOverlap_;
+ useQualityForOverlap=useQualityForOverlap_;
+ strictOverlap=strictOverlap_;
+ trimPairsEvenly=trimPairsEvenly_;
+ ORDERED=ordered_;
+ restrictLeft=Tools.max(restrictLeft_, 0);
+ restrictRight=Tools.max(restrictRight_, 0);
+ printNonZeroOnly=printNonZeroOnly_;
+ rename=rename_;
+ useRefNames=useRefNames_;
+ speed=speed_;
+ qSkip=qSkip_;
+ noAccel=(speed<1 && qSkip<2);
+ accel=!noAccel;
+ skipR1=skipr1_;
+ skipR2=skipr2_;
+ ecc=ecc_;
+
+ if(strictOverlap){
+ maxRatio=0.05f;
+ ratioMargin=9f;
+ ratioOffset=0.5f;
+ efilterRatio=3.5f;
+ efilterOffset=0.05f;
+ pfilterRatio=0.001f;
+ meeFilter=15f;
+ }else{
+ maxRatio=0.10f;
+ ratioMargin=5f;
+ ratioOffset=0.4f;
+ efilterRatio=6f;
+ efilterOffset=0.05f;
+ pfilterRatio=0.00005f;
+ meeFilter=999999999;
+ }
+
+ MAKE_QUALITY_HISTOGRAM=ReadStats.COLLECT_QUALITY_STATS;
+ MAKE_QUALITY_ACCURACY=ReadStats.COLLECT_QUALITY_ACCURACY;
+ MAKE_MATCH_HISTOGRAM=ReadStats.COLLECT_MATCH_STATS;
+ MAKE_BASE_HISTOGRAM=ReadStats.COLLECT_BASE_STATS;
+ MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS;
+ MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS;
+ MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS;
+ MAKE_GCHIST=ReadStats.COLLECT_GC_STATS;
+ MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS;
+
+ {
+ long usableMemory;
+ long tableMemory;
+
+ {
+ long memory=Runtime.getRuntime().maxMemory();
+ double xmsRatio=Shared.xmsRatio();
+ usableMemory=(long)Tools.max(((memory-96000000-(20*400000 /* for atomic arrays */))*(xmsRatio>0.97 ? 0.82 : 0.75)), memory*0.45);
+ tableMemory=(long)(usableMemory*.95);
+ }
+
+ if(initialSize<1){
+ final long memOverWays=tableMemory/(12*WAYS);
+ final double mem2=(prealloc_ ? preallocFraction : 1)*tableMemory;
+ initialSize=(prealloc_ || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(12*WAYS))) : initialSizeDefault);
+ if(initialSize!=initialSizeDefault){
+ System.err.println("Initial size set to "+initialSize);
+ }
+ }
+ }
+
+ if(ktrimLeft_ || ktrimRight_ || ktrimN_){
+ if(kbig_>k_){
+ System.err.println("*********************** WARNING ***********************");
+ System.err.println("WARNING: When kmer-trimming, the maximum value of K is "+k_+".");
+ System.err.println("K has been reduced from "+kbig_+" to "+k_+".");
+ System.err.println("***********************************************************");
+ kbig_=k_;
+ }
+ }
+
+ if((speed>0 || qSkip>1) && kbig_>k_){
+ System.err.println("*********************** WARNING ***********************");
+ System.err.println("WARNING: When speed>0 or qskip>1, the maximum value of K is "+k_+".");
+ System.err.println("K has been reduced from "+kbig_+" to "+k_+".");
+ System.err.println("***********************************************************");
+ kbig_=k_;
+ }
+
+ if((speed>0 && qSkip>1) || (qSkip>1 && maxSkip>1) || (speed>0 && maxSkip>1)){
+ System.err.println("WARNING: It is not recommended to use more than one of 'qskip', 'speed', and 'rskip/maxskip' together.");
+ System.err.println("qskip="+qSkip+", speed="+speed+", maxskip="+maxSkip);
+ }
+
+ k=k_;
+ k2=k-1;
+ kbig=kbig_;
+ keff=Tools.max(k, kbig);
+ if(kbig>k){
+ minSkip=maxSkip=0;
+ if(maskMiddle){
+ System.err.println("maskMiddle was disabled because kbig>k");
+ maskMiddle=false;
+ }
+ }
+ mink=Tools.min((mink_<1 ? 6 : mink_), k);
+ maxBadKmers0=maxBadKmers_;
+
+ minKmerFraction=Tools.max(minKmerFraction_, 0);
+ assert(minKmerFraction<=1) : "minKmerFraction must range from 0 to 1; value="+minKmerFraction;
+
+ minCoveredFraction=Tools.max(minCoveredFraction_, 0);
+ assert(minCoveredFraction<=1) : "minCoveredFraction must range from 0 to 1; value="+minCoveredFraction;
+
+ if(mink_>0 && mink_<k){useShortKmers=true;}
+ if(useShortKmers){
+ if(maskMiddle){
+ System.err.println("maskMiddle was disabled because useShortKmers=true");
+ maskMiddle=false;
+ }
+ }
+
+ ktrimRight=ktrimRight_;
+ ktrimLeft=ktrimLeft_;
+ ktrimN=ktrimN_;
+ ktrimExclusive=ktrimExclusive_;
+ findBestMatch=findBestMatch_;
+ kfilter=(ref!=null || literal!=null) && !(ktrimRight || ktrimLeft || ktrimN);
+ assert(findBestMatch==false || kfilter==false || kbig<=k) : "K must be less than 32 in 'findBestMatch' mode";
+
+ assert(!useShortKmers || ktrimRight || ktrimLeft || ktrimN) : "\nSetting mink or useShortKmers also requires setting a ktrim mode, such as 'r', 'l', or 'n'\n";
+
+ middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L;
+
+ hitCounts=(outduk==null ? null : new long[HITCOUNT_LEN+1]);
+
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ if(in1!=null && in1.contains("#") && !new File(in1).exists()){
+ int pound=in1.lastIndexOf('#');
+ String a=in1.substring(0, pound);
+ String b=in1.substring(pound+1);
+ in1=a+1+b;
+ in2=a+2+b;
+ }
+ if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(qfin1!=null && qfin1.contains("#") && in2!=null && !new File(qfin1).exists()){
+ int pound=qfin1.lastIndexOf('#');
+ String a=qfin1.substring(0, pound);
+ String b=qfin1.substring(pound+1);
+ qfin1=a+1+b;
+ qfin2=a+2+b;
+ }
+
+ if(out1!=null && out1.contains("#")){
+ int pound=out1.lastIndexOf('#');
+ String a=out1.substring(0, pound);
+ String b=out1.substring(pound+1);
+ out1=a+1+b;
+ out2=a+2+b;
+ }
+
+ if(outb1!=null && outb1.contains("#")){
+ int pound=outb1.lastIndexOf('#');
+ String a=outb1.substring(0, pound);
+ String b=outb1.substring(pound+1);
+ outb1=a+1+b;
+ outb2=a+2+b;
+ }
+
+ if((out2!=null || outb2!=null) && (in1!=null && in2==null)){
+ if(!FASTQ.FORCE_INTERLEAVED){System.err.println("Forcing interleaved input because paired output was specified for a single input file.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true;
+ }
+
+ if(!setOut){
+ System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+// out1="stdout";
+// outstream=System.err;
+// out2=null;
+ out1=out2=null;
+ }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){
+ out1="stdout.fq";
+ outstream=System.err;
+ out2=null;
+ }
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
+ throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
+ }
+ if(!Tools.testInputFiles(false, true, in1, in2, qfin1, qfin2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+ if(!Tools.testInputFiles(true, true, ref)){
+ throw new RuntimeException("\nCan't read to some reference files.\n");
+ }
+ if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ assert(THREADS>0) : "THREADS must be greater than 0.";
+
+ assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1;
+ assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2;
+
+ if(!((ref!=null || literal!=null) || qtrimLeft || qtrimRight || minAvgQuality>0 || maxNs>=0 || trimByOverlap ||
+ MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY ||
+ MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST ||
+ forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0 || minBaseFrequency>0 || recalibrateQuality)){
+ System.err.println("NOTE: No reference files specified, no trimming mode, no min avg quality, no histograms - read sequences will not be changed.");
+ }
+
+ if(recalibrateQuality){
+ SamLine.SET_FROM_OK=true;//TODO: Should ban other operations
+ }
+
+ if(ref!=null){
+ for(String s0 : ref){
+ assert(s0!=null) : "Specified a null reference.";
+ String s=s0.toLowerCase();
+ assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0;
+ }
+ }
+
+ //Initialize tables
+ final int tableType=(useForest ? AbstractKmerTable.FOREST1D : useTable ? AbstractKmerTable.TABLE : useArray ? AbstractKmerTable.ARRAY1D : 0);
+ keySets=AbstractKmerTable.preallocate(WAYS, tableType, initialSize, (!prealloc_ || preallocFraction<1));
+
+ //Initialize entropy
+ calcEntropy=(entropyCutoff>0);
+ if(calcEntropy){
+ assert(entropyWindow>0 && entropyCutoff>=0 && entropyCutoff<=1);
+ entropy=new double[entropyWindow+2];
+ final double mult=1d/entropyWindow;
+ for(int i=0; i<entropy.length; i++){
+ double pk=i*mult;
+ entropy[i]=pk*Math.log(pk);
+ }
+ entropyMult=-1/Math.log(entropyWindow);
+ entropyKmerspace=(1<<(2*entropyK));
+ }else{
+ entropy=null;
+ entropyMult=0;
+ entropyKmerspace=1;
+ }
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Top-level driver.  Optionally prepares quality recalibration, re-validates the output
+  * file names, runs process2 under a timer, prints throughput when showSpeed is set,
+  * and finally throws if errorState was raised during processing.
+  * @throws RuntimeException If output files are unusable or processing ended in an error state.
+  */
+ public void process(){
+
+ 	if(recalibrateQuality){
+ 		if(samFile!=null){
+ 			CalcTrueQuality.main2(new String[] {"in="+samFile, "showstats=f"});
+ 		}
+ 		CalcTrueQuality.initializeMatrices();
+ 	}
+
+ 	/* Check for output file collisions */
+ 	//outsingle is included so that this check covers the same files as the one in the constructor.
+ 	if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){
+ 		throw new RuntimeException("One or more output files were duplicate or could not be written to. Check the names or set the 'overwrite=true' flag.");
+ 	}
+
+ 	/* Start overall timer */
+ 	Timer t=new Timer();
+
+ //	boolean dq0=FASTQ.DETECT_QUALITY;
+ //	boolean ti0=FASTQ.TEST_INTERLEAVED;
+ //	int rbl0=Shared.READ_BUFFER_LENGTH;
+ //	FASTQ.DETECT_QUALITY=false;
+ //	FASTQ.TEST_INTERLEAVED=false;
+ //	Shared.READ_BUFFER_LENGTH=16;
+
+ 	process2(t.time1);
+
+ //	FASTQ.DETECT_QUALITY=dq0;
+ //	FASTQ.TEST_INTERLEAVED=ti0;
+ //	Shared.READ_BUFFER_LENGTH=rbl0;
+
+ 	/* Stop timer and calculate speed statistics */
+ 	t.stop();
+
+
+ 	if(showSpeed){
+ 		//Reads and bases per nanosecond of elapsed time
+ 		double rpnano=readsIn/(double)(t.elapsed);
+ 		double bpnano=basesIn/(double)(t.elapsed);
+
+ 		//Format with k or m suffixes
+ 		String rpstring=(readsIn<100000 ? ""+readsIn : readsIn<100000000 ? (readsIn/1000)+"k" : (readsIn/1000000)+"m");
+ 		String bpstring=(basesIn<100000 ? ""+basesIn : basesIn<100000000 ? (basesIn/1000)+"k" : (basesIn/1000000)+"m");
+
+ 		//Left-pad both counts to a common width of 8 characters
+ 		while(rpstring.length()<8){rpstring=" "+rpstring;}
+ 		while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ 		outstream.println("\nTime: \t\t\t"+t);
+ 		outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ 		outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ 	}
+
+ 	/* Throw an exception if errors were detected */
+ 	if(errorState){
+ 		throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+
+ public void process2(long startTime){
+
+ /* Start phase timer */
+ Timer t=new Timer();
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Initial:");
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ /* Fill tables with reference kmers */
+ if((ref!=null && ref.length>0) || (literal!=null && literal.length>0)){
+ final boolean oldTI=FASTQ.TEST_INTERLEAVED; //TODO: This needs to be changed to a non-static field, or somehow 'read mode' and 'ref mode' need to be distinguished.
+ final boolean oldFI=FASTQ.FORCE_INTERLEAVED;
+ final boolean oldSplit=FastaReadInputStream.SPLIT_READS;
+ final int oldML=FastaReadInputStream.MIN_READ_LEN;
+
+ FASTQ.TEST_INTERLEAVED=false;
+ FASTQ.FORCE_INTERLEAVED=false;
+ FastaReadInputStream.SPLIT_READS=false;
+ FastaReadInputStream.MIN_READ_LEN=1;
+
+ storedKmers=spawnLoadThreads();
+
+ FASTQ.TEST_INTERLEAVED=oldTI;
+ FASTQ.FORCE_INTERLEAVED=oldFI;
+ FastaReadInputStream.SPLIT_READS=oldSplit;
+ FastaReadInputStream.MIN_READ_LEN=oldML;
+
+ if(useRefNames){toRefNames();}
+ t.stop();
+ }
+
+ {
+ long ram=freeMemory();
+ ALLOW_LOCAL_ARRAYS=(scaffoldNames!=null && Tools.max(THREADS, 1)*3*8*scaffoldNames.size()<ram*5);
+ }
+
+ /* Dump kmers to text */
+ if(dump!=null){
+ ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true);
+ bsw.start();
+ for(AbstractKmerTable set : keySets){
+ set.dumpKmersAsBytes(bsw, k, 0);
+ }
+ bsw.poisonAndWait();
+ }
+
+ if(storedKmers<1 && (ktrimRight || ktrimLeft || ktrimN)){
+ System.err.println("****** WARNING! A KMER OPERATION WAS CHOSEN BUT NO KMERS WERE LOADED. ******");
+ if(ref==null && literal==null){
+ System.err.println("****** YOU NEED TO SPECIFY A REFERENCE FILE OR LITERAL SEQUENCE. ******\n");
+ }else{
+ System.err.println("****** PLEASE ENSURE K IS LESS THAN OR EQUAL TO REF SEQUENCE LENGTHS. ******\n");
+ }
+ assert(false) : "You can bypass this assertion with the -da flag.";
+ }
+
+ final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+ Read.VALIDATE_IN_CONSTRUCTOR=THREADS<4;
+
+ /* Do kmer matching of input reads */
+ spawnProcessThreads(t);
+
+ Read.VALIDATE_IN_CONSTRUCTOR=vic;
+
+ /* Write legacy duk statistics (which requires tables) */
+ writeDuk(System.nanoTime()-startTime);
+
+ /* Unload kmers to save memory */
+ if(RELEASE_TABLES){unloadKmers();}
+
+ /* Write statistics to files */
+ writeStats();
+ writeRPKM();
+ writeRefStats();
+ writeRqc();
+
+ /* Unload sequence data to save memory */
+ if(RELEASE_TABLES){unloadScaffolds();}
+
+ outstream.println("\nInput: \t"+readsIn+" reads \t\t"+basesIn+" bases.");
+
+ if((ref!=null || literal!=null) && !(ktrimLeft || ktrimRight || ktrimN)){
+ outstream.println("Contaminants: \t"+readsKFiltered+" reads ("+String.format("%.2f",readsKFiltered*100.0/readsIn)+"%) \t"+
+ basesKFiltered+" bases ("+String.format("%.2f",basesKFiltered*100.0/basesIn)+"%)");
+ outstream.flush();
+ }
+ if(qtrimLeft || qtrimRight){
+ outstream.println("QTrimmed: \t"+readsQTrimmed+" reads ("+String.format("%.2f",readsQTrimmed*100.0/readsIn)+"%) \t"+
+ basesQTrimmed+" bases ("+String.format("%.2f",basesQTrimmed*100.0/basesIn)+"%)");
+ }
+ if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
+ outstream.println("FTrimmed: \t"+readsFTrimmed+" reads ("+String.format("%.2f",readsFTrimmed*100.0/readsIn)+"%) \t"+
+ basesFTrimmed+" bases ("+String.format("%.2f",basesFTrimmed*100.0/basesIn)+"%)");
+ }
+ if(ktrimLeft || ktrimRight || ktrimN){
+ String x=(ktrimN ? "KMasked: " : "KTrimmed:");
+ outstream.println(x+" \t"+readsKTrimmed+" reads ("+String.format("%.2f",readsKTrimmed*100.0/readsIn)+"%) \t"+
+ basesKTrimmed+" bases ("+String.format("%.2f",basesKTrimmed*100.0/basesIn)+"%)");
+ }
+ if(trimByOverlap){
+ outstream.println("Trimmed by overlap: \t"+readsTrimmedByOverlap+" reads ("+String.format("%.2f",readsTrimmedByOverlap*100.0/readsIn)+"%) \t"+
+ basesTrimmedByOverlap+" bases ("+String.format("%.2f",basesTrimmedByOverlap*100.0/basesIn)+"%)");
+ }
+ if(filterGC){
+ outstream.println("Filtered by GC: \t"+badGcReads+" reads ("+String.format("%.2f",badGcReads*100.0/readsIn)+"%) \t"+
+ badGcBases+" bases ("+String.format("%.2f",badGcBases*100.0/basesIn)+"%)");
+ }
+ if(minAvgQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){
+ outstream.println("Low quality discards: \t"+readsQFiltered+" reads ("+String.format("%.2f",readsQFiltered*100.0/readsIn)+"%) \t"+
+ basesQFiltered+" bases ("+String.format("%.2f",basesQFiltered*100.0/basesIn)+"%)");
+ }
+ if(calcEntropy){
+ outstream.println("Low entropy discards: \t"+readsEFiltered+" reads ("+String.format("%.2f",readsEFiltered*100.0/readsIn)+"%) \t"+
+ basesEFiltered+" bases ("+String.format("%.2f",basesEFiltered*100.0/basesIn)+"%)");
+ }
+
+ outstream.println("Result: \t"+readsOut+" reads ("+String.format("%.2f",readsOut*100.0/readsIn)+"%) \t"+
+ basesOut+" bases ("+String.format("%.2f",basesOut*100.0/basesIn)+"%)");
+
+ if(loglog!=null){
+ outstream.println("Unique "+loglog.k+"-mers: \t"+loglog.cardinality());
+ }
+ }
+
+ /**
+  * Drops the references to all kmer tables by setting every slot of keySets to null.
+  * The keySets array itself is retained.
+  */
+ public void unloadKmers(){
+ 	if(keySets==null){return;}
+ 	Arrays.fill(keySets, null);
+ }
+
+ /**
+  * Releases per-scaffold data: the count and length containers and hitCounts are set to
+  * null, and a non-empty scaffoldNames list is emptied and trimmed.
+  */
+ public void unloadScaffolds(){
+ 	scaffoldReadCounts=null;
+ 	scaffoldBaseCounts=null;
+ 	scaffoldLengths=null;
+ 	hitCounts=null;
+
+ 	final boolean nothingToClear=(scaffoldNames==null || scaffoldNames.isEmpty());
+ 	if(nothingToClear){return;}
+ 	scaffoldNames.clear();
+ 	scaffoldNames.trimToSize();
+ }
+
+ /**
+  * Write statistics about how many reads matched each reference scaffold.
+  * Does nothing when outstats is null.  Scaffolds are read from index 1 onward, sorted
+  * via Collections.sort, and written in either a 3-column (reads only) or a 5-column
+  * (reads and bases) layout depending on STATS_COLUMNS.
+  */
+ private void writeStats(){
+ 	if(outstats==null){return;}
+ 	final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false);
+ 	tsw.start();
+
+ 	long rsum=0, bsum=0;
+
+ 	/* Create StringNum list of scaffold names and hitcounts */
+ 	ArrayList<StringNum> list=new ArrayList<StringNum>();
+ 	for(int i=1; i<scaffoldNames.size(); i++){
+ 		final long num1=scaffoldReadCounts.get(i), num2=scaffoldBaseCounts.get(i);
+ 		if(num1>0 || !printNonZeroOnly){
+ 			rsum+=num1;
+ 			bsum+=num2;
+ 			final String s=scaffoldNames.get(i);
+ 			final int len=scaffoldLengths.get(i);
+ 			final StringNum sn=new StringNum(s, len, num1, num2);
+ 			list.add(sn);
+ 		}
+ 	}
+ 	Collections.sort(list);
+
+ 	//Percent multipliers; a zero total is replaced by 1 to avoid dividing by zero
+ 	final double rmult=100.0/(readsIn>0 ? readsIn : 1);
+ 	final double bmult=100.0/(basesIn>0 ? basesIn : 1);
+
+ 	tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+
+ 	if(STATS_COLUMNS==3){
+ 		tsw.print(String.format("#Total\t%d\n",readsIn));
+ 		tsw.print(String.format("#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum));
+ 		tsw.print("#Name\tReads\tReadsPct\n");
+ 		for(int i=0; i<list.size(); i++){
+ 			StringNum sn=list.get(i);
+ 			tsw.print(String.format("%s\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult)));
+ 		}
+ 	}else{
+ 		tsw.print(String.format("#Total\t%d\t%d\n",readsIn,basesIn));
+ 		//The format now consumes all four arguments; previously the matched bases and their percentage were dropped.
+ 		tsw.print(String.format("#Matched\t%d\t%.5f%%\t%d\t%.5f%%\n",rsum,rmult*rsum,bsum,bsum*bmult));
+ 		tsw.print("#Name\tReads\tReadsPct\tBases\tBasesPct\n");
+ 		for(int i=0; i<list.size(); i++){
+ 			StringNum sn=list.get(i);
+ 			tsw.print(String.format("%s\t%d\t%.5f%%\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult),sn.bases,(sn.bases*bmult)));
+ 		}
+ 	}
+ 	tsw.poisonAndWait();
+ }
+
+ /**
+  * Writes one line per reference scaffold (length, bases, coverage, reads, RPKM) to the
+  * outrpkm file, preceded by a header.  Does nothing when outrpkm is null.
+  */
+ private void writeRPKM(){
+ 	if(outrpkm==null){return;}
+ 	final TextStreamWriter writer=new TextStreamWriter(outrpkm, overwrite, false, false);
+ 	writer.start();
+
+ 	/* Total reads over every counter slot */
+ 	long mapped=0;
+ 	for(int slot=0; slot<scaffoldReadCounts.length(); slot++){mapped+=scaffoldReadCounts.get(slot);}
+
+ 	/* Header */
+ 	writer.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+ 	writer.print(String.format("#Reads\t%d\n",readsIn));
+ 	writer.print(String.format("#Mapped\t%d\n",mapped));
+ 	writer.print(String.format("#RefSequences\t%d\n",Tools.max(0, scaffoldNames.size()-1)));
+ 	writer.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\n");
+
+ 	final float perBillionMapped=1000000000f/Tools.max(1, mapped);
+
+ 	/* Body; scaffold numbering starts at 1 */
+ 	for(int scaf=1; scaf<scaffoldNames.size(); scaf++){
+ 		final long reads=scaffoldReadCounts.get(scaf);
+ 		final long bases=scaffoldBaseCounts.get(scaf);
+ 		final String name=scaffoldNames.get(scaf);
+ 		final int len=scaffoldLengths.get(scaf);
+ 		final double inverseLength=1.0/Tools.max(1, len);
+ 		final double rpkmFactor=perBillionMapped*inverseLength;
+ 		if(reads<=0 && printNonZeroOnly){continue;}
+ 		writer.print(String.format("%s\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,bases,bases*inverseLength,reads,reads*rpkmFactor));
+ 	}
+ 	writer.poisonAndWait();
+ }
+
+ /**
+  * Aggregates scaffold-level counts into per-reference totals, using refScafCounts to
+  * decide how many consecutive scaffolds belong to each reference, and writes them to
+  * the outrefstats file.  Does nothing when outrefstats is null.
+  */
+ private void writeRefStats(){
+ 	if(outrefstats==null){return;}
+ 	final TextStreamWriter writer=new TextStreamWriter(outrefstats, overwrite, false, false);
+ 	writer.start();
+
+ 	/* Total reads over every counter slot */
+ 	long mapped=0;
+ 	for(int slot=0; slot<scaffoldReadCounts.length(); slot++){mapped+=scaffoldReadCounts.get(slot);}
+
+ 	final int numRefs=refNames.size();
+ 	final long[] readTotals=new long[numRefs];
+ 	final long[] baseTotals=new long[numRefs];
+ 	final long[] lengthTotals=new long[numRefs];
+
+ 	/* Walk scaffolds (numbered from 1) in order, assigning each run to its reference */
+ 	int scaf=1;
+ 	for(int refIdx=0; refIdx<numRefs; refIdx++){
+ 		final int end=scaf+refScafCounts[refIdx];
+ 		for(; scaf<end; scaf++){
+ 			readTotals[refIdx]+=scaffoldReadCounts.get(scaf);
+ 			baseTotals[refIdx]+=scaffoldBaseCounts.get(scaf);
+ 			lengthTotals[refIdx]+=scaffoldLengths.get(scaf);
+ 		}
+ 	}
+
+ 	/* Header */
+ 	writer.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+ 	writer.print(String.format("#Reads\t%d\n",readsIn));
+ 	writer.print(String.format("#Mapped\t%d\n",mapped));
+ 	writer.print(String.format("#References\t%d\n",Tools.max(0, refNames.size())));
+ 	writer.print("#Name\tLength\tScaffolds\tBases\tCoverage\tReads\tRPKM\n");
+
+ 	final float perBillionMapped=1000000000f/Tools.max(1, mapped);
+
+ 	/* Body */
+ 	for(int refIdx=0; refIdx<refNames.size(); refIdx++){
+ 		final long reads=readTotals[refIdx];
+ 		final long bases=baseTotals[refIdx];
+ 		final long len=lengthTotals[refIdx];
+ 		final int scafs=refScafCounts[refIdx];
+ 		final String name=ReadWrite.stripToCore(refNames.get(refIdx));
+ 		final double inverseLength=1.0/Tools.max(1, len);
+ 		final double rpkmFactor=perBillionMapped*inverseLength;
+ 		if(reads<=0 && printNonZeroOnly){continue;}
+ 		writer.print(String.format("%s\t%d\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,scafs,bases,bases*inverseLength,reads,reads*rpkmFactor));
+ 	}
+ 	writer.poisonAndWait();
+ }
+
+ /**
+  * Writes the duk-format report produced by dukString to the outduk file.
+  * Does nothing when outduk is null.
+  * @param time Elapsed time, nanoseconds
+  */
+ private void writeDuk(long time){
+ 	if(outduk!=null){
+ 		final TextStreamWriter writer=new TextStreamWriter(outduk, overwrite, false, false);
+ 		writer.start();
+ 		final String report=dukString(time);
+ 		writer.println(report);
+ 		writer.poisonAndWait();
+ 	}
+ }
+
+ /**
+  * Records this run's counters in the RQC map and, unless the outrqc name ends with
+  * "hashmap", also writes the rqcString report to the outrqc file.
+  * Does nothing when outrqc is null.
+  */
+ private void writeRqc(){
+ 	if(outrqc==null){return;}
+ 	addToRqcMap();
+ 	final boolean mapOnly=outrqc.endsWith("hashmap");
+ 	if(!mapOnly){
+ 		final TextStreamWriter writer=new TextStreamWriter(outrqc, overwrite, false, false);
+ 		writer.start();
+ 		final String report=rqcString();
+ 		writer.println(report);
+ 		writer.poisonAndWait();
+ 	}
+ }
+
+ /**
+  * Formats the entries of RQC_MAP as key=value lines, in a fixed key order.
+  * Keys absent from the map are skipped.
+  * @return The formatted text, or null if RQC_MAP has not been created.
+  */
+ public static String rqcString(){
+ 	if(RQC_MAP==null){return null;}
+
+ 	final String[] orderedKeys={
+ 			"inputReads", "inputBases", "qtrimmedReads", "qtrimmedBases", "qfilteredReads", "qfilteredBases",
+ 			"ktrimmedReads", "ktrimmedBases", "kfilteredReads", "kfilteredBases", "outputReads", "outputBases"
+ 	};
+
+ 	final StringBuilder text=new StringBuilder();
+ 	for(int idx=0; idx<orderedKeys.length; idx++){
+ 		final String key=orderedKeys[idx];
+ 		final String value=RQC_MAP.get(key);
+ 		if(value==null){continue;}
+ 		text.append(key).append("=").append(value).append("\n");
+ 	}
+ 	return text.toString();
+ }
+
+ private void addToRqcMap(){
+ putRqc("inputReads", readsIn, false);
+ putRqc("inputBases", basesIn, false);
+ if(qtrimLeft || qtrimRight){
+ putRqc("qtrimmedReads", readsQTrimmed, false);
+ putRqc("qtrimmedBases", basesQTrimmed, false);
+ }
+ putRqc("qfilteredReads", readsQFiltered, false);
+ putRqc("qfilteredBases", basesQFiltered, false);
+
+ if(ktrimLeft || ktrimRight || ktrimN){
+ putRqc("ktrimmedReads", readsKTrimmed, true);
+ putRqc("ktrimmedBases", basesKTrimmed, true);
+ }else{
+ putRqc("kfilteredReads", readsKFiltered, false);
+ putRqc("kfilteredBases", basesKFiltered, false);
+ }
+ putRqc("outputReads", readsOut, true);
+ putRqc("outputBases", basesOut, true);
+ }
+
+ private static void putRqc(String key, long value, boolean evict){putRqc(key, value+"", evict);}
+
+ private static void putRqc(String key, String value, boolean evict){
+ if(RQC_MAP==null){RQC_MAP=new HashMap<String,String>();}
+ if(evict || !RQC_MAP.containsKey(key)){RQC_MAP.put(key, value);}
+ }
+
+ /**
+ * Helper method; formats statistics to be duk-compatible
+ * @param time Elapsed time, nanoseconds
+ * @return duk output string
+ */
+ private String dukString(long time){
+ StringBuilder sb=new StringBuilder();
+ sb.append("##INPUT PARAMETERS##\n");
+ sb.append("#Reference file: "+(ref==null || ref.length<1 ? null : ref.length==1 ? ref[0] : Arrays.toString(ref))+"\n");
+ sb.append("#Query file: "+in1+(in2==null ? "" : ","+in2)+"\n");
+ sb.append("#Not matched reads file: "+out1+(out2==null ? "" : ","+out2)+"\n");
+ sb.append("#Matched reads file: "+outb1+(outb2==null ? "" : ","+outb2)+"\n");
+ sb.append("#Output file (duk): "+outduk+"\n");
+ sb.append("#Output file (stats): "+outstats+"\n");
+ sb.append("#Mer size: "+k+"\n");
+ long size=0;
+ for(AbstractKmerTable x : keySets){size+=x.size();}
+ sb.append("#Avg step size: "+String.format("%.1f", refKmers/(double)(Tools.max(1, size)))+"\n");
+ sb.append("#Cut off: "+maxBadKmers0+"\n");
+ sb.append("#Mask middle: "+maskMiddle+"\n");
+ sb.append("#Quality trim: "+((qtrimLeft || qtrimRight) ? trimq : "false")+"\n");
+ sb.append("\n");
+
+ sb.append("##REFERENCE STAT##\n");
+ sb.append("#Total Reads: "+refReads+"\n");
+ sb.append("#Total Bases: "+refBases+"\n");
+ sb.append("#Total kmers: "+refKmers+"\n");
+ sb.append("#Total stored kmers: "+size+"\n");
+ sb.append("\n");
+
+ sb.append("## ELAPSED TIME##\n");
+ sb.append("# Time: "+String.format("%.2f", time/1000000000.0)+" seconds\n");
+ sb.append("\n");
+
+ sb.append("##QUERY FILE STAT##\n");
+ sb.append("# Total number of reads: "+readsIn+"\n");
+ sb.append("# Total number of matched reads: "+readsKFiltered+"\n");
+ sb.append("# Match ratio: "+String.format("%.6f", readsKFiltered*1.0/readsIn)+"\n");
+ sb.append("\n");
+
+ sb.append("##P-VALUE##\n");
+ sb.append("#Avg number of Kmer for each read: "+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n");
+// sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means
+ sb.append("\n");
+
+ sb.append("## Histogram of kmer occurance for reads with at least one occurance ##\n");
+ sb.append("#NumOcc\tNumReads\tPercentage\n");
+
+ long sum=Tools.sum(hitCounts);
+ double mult=100.0/(sum<1 ? 1 : sum);
+ for(int i=0; i<hitCounts.length; i++){
+ long x=hitCounts[i];
+ if(x>0){
+ sb.append(i).append('\t').append(x).append('\t').append(String.format("%.4f",(x*mult))).append('\n');
+ }
+ }
+
+ return sb.toString();
+ }
+
+ /**
+  * Overwrites scaffold names with the name of the reference each scaffold came from.
+  * Scaffolds are numbered from 1, and refScafCounts gives how many consecutive
+  * scaffolds belong to each reference; the name stored is the reference name passed
+  * through ReadWrite.stripToCore.
+  */
+ private void toRefNames(){
+ 	final int numRefs=refNames.size();
+ 	int scaf=1;
+ 	for(int refIdx=0; refIdx<numRefs; refIdx++){
+ 		final int end=scaf+refScafCounts[refIdx];
+ 		final String coreName=ReadWrite.stripToCore(refNames.get(refIdx));
+ 		for(; scaf<end; scaf++){
+ 			scaffoldNames.set(scaf, coreName);
+ 		}
+ 	}
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Fills tables with kmers from references, using multiple LoadThread.
+  * Reference files listed in ref are read first, followed by any literal sequences.
+  * Each sequence is assigned an ID equal to its index in scaffoldNames, and every list
+  * of reads is placed on the queue of every LoadThread.  After all input is queued, a
+  * POISON list is sent to each thread and their per-thread statistics are summed.
+  * @return Number of kmers stored.
+  */
+ private long spawnLoadThreads(){
+ 	Timer t=new Timer();
+ 	if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;}
+ 	long added=0;
+
+ 	/* Create load threads */
+ 	LoadThread[] loaders=new LoadThread[WAYS];
+ 	for(int i=0; i<loaders.length; i++){
+ 		loaders[i]=new LoadThread(i);
+ 		loaders[i].start();
+ 	}
+
+ 	/* For each reference file... */
+ 	int refNum=0;
+ 	if(ref!=null){
+ 		for(String refname : ref){
+
+ 			/* Start an input stream */
+ 			FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true);
+ 			ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true);
+ 			cris.start(); //4567
+ 			ListNum<Read> ln=cris.nextList();
+ 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ 			/* Iterate through read lists from the input stream */
+ 			while(reads!=null && reads.size()>0){
+ 				{
+ 					/* Assign a unique ID number to each scaffold */
+ 					ArrayList<Read> reads2=new ArrayList<Read>(reads);
+ 					for(Read r1 : reads2){
+ 						final Read r2=r1.mate;
+ 						final Integer id=scaffoldNames.size();
+ 						refScafCounts[refNum]++;
+ 						scaffoldNames.add(r1.id==null ? id.toString() : r1.id);
+ 						int len=r1.length();
+ 						r1.obj=id;
+ 						if(r2!=null){
+ 							//A mate shares the scaffold ID and contributes to its length
+ 							r2.obj=id;
+ 							len+=r2.length();
+ 						}
+ 						scaffoldLengths.add(len);
+ 					}
+
+ 					if(REPLICATE_AMBIGUOUS){
+ 						reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink));
+ 					}
+
+ 					/* Send a pointer to the read list to each LoadThread */
+ 					for(LoadThread lt : loaders){
+ 						try {
+ 							lt.queue.put(reads2);
+ 						} catch (InterruptedException e) {
+ 							//TODO: This will hang due to still-running threads.
+ 							throw new RuntimeException(e);
+ 						}
+ 					}
+ 				}
+
+ 				/* Dispose of the old list and fetch a new one */
+ 				cris.returnList(ln.id, ln.list.isEmpty());
+ 				ln=cris.nextList();
+ 				reads=(ln!=null ? ln.list : null);
+ 			}
+ 			/* Cleanup */
+ 			//The loop also exits when nextList() returned null or a null list; guard both to avoid a NullPointerException.
+ 			if(ln!=null){
+ 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ 			}
+ 			errorState|=ReadWrite.closeStream(cris);
+ 			refNum++;
+ 		}
+ 	}
+
+ 	/* If there are literal sequences to use as references */
+ 	if(literal!=null){
+ 		ArrayList<Read> list=new ArrayList<Read>(literal.length);
+ 		if(verbose){System.err.println("Adding literals "+Arrays.toString(literal));}
+
+ 		/* Assign a unique ID number to each literal sequence */
+ 		for(int i=0; i<literal.length; i++){
+ 			final Integer id=scaffoldNames.size();
+ 			final Read r=new Read(literal[i].getBytes(), null, id);
+ 			refScafCounts[refNum]++;
+ 			scaffoldNames.add(id.toString());
+ 			scaffoldLengths.add(r.length());
+ 			r.obj=id;
+ 			list.add(r);
+ 		}
+
+ 		if(REPLICATE_AMBIGUOUS){
+ 			list=Tools.replicateAmbiguous(list, Tools.min(k, mink));
+ 		}
+
+ 		/* Send a pointer to the read list to each LoadThread */
+ 		for(LoadThread lt : loaders){
+ 			try {
+ 				lt.queue.put(list);
+ 			} catch (InterruptedException e) {
+ 				//TODO: This will hang due to still-running threads.
+ 				throw new RuntimeException(e);
+ 			}
+ 		}
+ 	}
+
+ 	/* Signal loaders to terminate */
+ 	for(LoadThread lt : loaders){
+ 		try {
+ 			lt.queue.put(POISON);
+ 		} catch (InterruptedException e) {
+ 			//TODO: This will hang due to still-running threads.
+ 			throw new RuntimeException(e);
+ 		}
+ 	}
+
+ 	/* Wait for loaders to die, and gather statistics */
+ 	boolean success=true;
+ 	for(LoadThread lt : loaders){
+ 		while(lt.getState()!=Thread.State.TERMINATED){
+ 			try {
+ 				lt.join();
+ 			} catch (InterruptedException e) {
+ 				//Interruption is reported and the wait is retried until the thread has terminated.
+ 				e.printStackTrace();
+ 			}
+ 		}
+ 		added+=lt.addedT;
+ 		refKmers+=lt.refKmersT;
+ 		refBases+=lt.refBasesT;
+ 		refReads+=lt.refReadsT;
+ 		modsum+=lt.modsumT;
+ 		success&=lt.success;
+ 	}
+ 	if(!success){KillSwitch.kill("Failed loading ref kmers; aborting.");}
+
+ 	//Correct statistics for number of threads, since each thread processes all reference data
+ 	refKmers/=WAYS;
+ 	refBases/=WAYS;
+ 	refReads/=WAYS;
+
+ 	//One counter slot per entry in scaffoldNames
+ 	scaffoldReadCounts=new AtomicLongArray(scaffoldNames.size());
+ 	scaffoldBaseCounts=new AtomicLongArray(scaffoldNames.size());
+
+ 	t.stop();
+ 	if(DISPLAY_PROGRESS){
+ 		outstream.println("Added "+added+" kmers; time: \t"+t);
+ 		Shared.printMemory();
+ 		outstream.println();
+ 	}
+
+ 	if(verbose){
+ 		TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT);
+ 		tsw.start();
+ 		for(AbstractKmerTable table : keySets){
+ 			table.dumpKmersAsText(tsw, k, 1);
+ 		}
+ 		tsw.poisonAndWait();
+ 	}
+
+ 	return added;
+ }
+
+ /**
+ * Match reads against reference kmers, using multiple ProcessThread.
+ * Steps: open the read input stream, open whichever of the three output streams
+ * were requested, optionally skip the first skipreads reads, run THREADS
+ * ProcessThreads to completion, add their per-thread counters to the totals of
+ * the enclosing object, then close the streams and record any error state.
+ * @param t Timer; started on entry and stopped once the streams have been closed
+ */
+ private void spawnProcessThreads(Timer t){
+ t.start();
+
+ /* Create read input stream */
+ final ConcurrentReadInputStream cris;
+ final boolean paired;
+ {
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2, qfin1, qfin2);
+ cris.setSampleRate(samplerate, sampleseed);
+ cris.start(); //4567
+ paired=cris.paired();
+ if(!ff1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+ }
+
+ /* Create read output streams */
+ //ros receives reads that were kept, rosb receives removed reads, ross receives singletons (see ProcessThread).
+ final ConcurrentReadOutputStream ros, rosb, ross;
+ if(out1!=null){
+ final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+ FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ ros=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
+ ros.start();
+ }else{ros=null;}
+ if(outb1!=null){
+ final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+ FileFormat ff1=FileFormat.testOutput(outb1, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ FileFormat ff2=FileFormat.testOutput(outb2, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ rosb=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
+ rosb.start();
+ }else{rosb=null;}
+ if(outsingle!=null){
+ final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+ FileFormat ff=FileFormat.testOutput(outsingle, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+ ross=ConcurrentReadOutputStream.getStream(ff, null, null, null, buff, null, true);
+ ross.start();
+ }else{ross=null;}
+ if(ros!=null || rosb!=null || ross!=null){
+ //Report the stream start-up time separately, then restart the timer for processing.
+ t.stop();
+ outstream.println("Started output streams:\t"+t);
+ t.start();
+ }
+
+ /* Optionally skip the first reads, since initial reads may have lower quality */
+ if(skipreads>0){
+ long skipped=0;
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ while(skipped<skipreads && reads!=null && reads.size()>0){
+ skipped+=reads.size();
+
+ //Each open output stream gets an empty list under the skipped list's id.
+ if(rosb!=null){rosb.add(new ArrayList<Read>(1), ln.id);}
+ if(ros!=null){ros.add(new ArrayList<Read>(1), ln.id);}
+ if(ross!=null){ross.add(new ArrayList<Read>(1), ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //NOTE(review): the list fetched by the final nextList() is returned here without being
+ //processed or forwarded to the output streams - confirm that this is intended.
+ //The null checks above treat a null ListNum as possible, so guard before dereferencing it.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ if(reads==null || reads.isEmpty()){
+ ReadWrite.closeStreams(cris, ros, rosb, ross);
+ System.err.println("Skipped all of the reads.");
+ System.exit(0);
+ }
+ }
+
+ /* Create ProcessThreads */
+ ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alpt.add(new ProcessThread(cris, ros, rosb, ross, ALLOW_LOCAL_ARRAYS));}
+ for(ProcessThread pt : alpt){pt.start();}
+
+ /* Wait for threads to die, and gather statistics */
+ for(ProcessThread pt : alpt){
+
+ /* Wait for a thread to die */
+ while(pt.getState()!=Thread.State.TERMINATED){
+ try {
+ pt.join();
+ } catch (InterruptedException e) {
+ //An interrupt is only reported; the enclosing loop joins again until the thread has terminated.
+ e.printStackTrace();
+ }
+ }
+
+ /* Accumulate data from per-thread counters */
+ readsIn+=pt.readsInT;
+ basesIn+=pt.basesInT;
+ readsOut+=pt.readsOutuT;
+ basesOut+=pt.basesOutuT;
+ readsKFiltered+=pt.readsKFilteredT;
+ basesKFiltered+=pt.basesKFilteredT;
+ readsQTrimmed+=pt.readsQTrimmedT;
+ basesQTrimmed+=pt.basesQTrimmedT;
+ readsFTrimmed+=pt.readsFTrimmedT;
+ basesFTrimmed+=pt.basesFTrimmedT;
+ readsKTrimmed+=pt.readsKTrimmedT;
+ basesKTrimmed+=pt.basesKTrimmedT;
+ readsTrimmedByOverlap+=pt.readsTrimmedByOverlapT;
+ basesTrimmedByOverlap+=pt.basesTrimmedByOverlapT;
+ badGcReads+=pt.badGcReadsT;
+ badGcBases+=pt.badGcBasesT;
+ readsQFiltered+=pt.readsQFilteredT;
+ basesQFiltered+=pt.basesQFilteredT;
+ readsEFiltered+=pt.readsEFilteredT;
+ basesEFiltered+=pt.basesEFilteredT;
+
+ //Fold the optional per-thread arrays into the shared ones, then drop the thread's references.
+ if(hitCounts!=null){
+ for(int i=0; i<hitCounts.length; i++){hitCounts[i]+=pt.hitCountsT[i];}
+ pt.hitCountsT=null;
+ }
+ if(pt.scaffoldReadCountsT!=null && scaffoldReadCounts!=null){
+ for(int i=0; i<pt.scaffoldReadCountsT.length; i++){scaffoldReadCounts.addAndGet(i, pt.scaffoldReadCountsT[i]);}
+ pt.scaffoldReadCountsT=null;
+ }
+ if(pt.scaffoldBaseCountsT!=null && scaffoldBaseCounts!=null){
+ for(int i=0; i<pt.scaffoldBaseCountsT.length; i++){scaffoldBaseCounts.addAndGet(i, pt.scaffoldBaseCountsT[i]);}
+ pt.scaffoldBaseCountsT=null;
+ }
+ }
+
+ /* Shut down I/O streams; capture error status */
+ errorState|=ReadWrite.closeStreams(cris, ros, rosb, ross);
+ errorState|=ReadStats.writeAll();
+
+ t.stop();
+ if(showSpeed){
+ outstream.println("Processing time: \t\t"+t);
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum.
+ * Lists of reads arrive through the queue field; receiving the POISON list ends the thread.
+ * The per-thread counters (addedT, refKmersT, refReadsT, refBasesT) and the success flag
+ * are public or package-visible fields that the spawning code reads after the thread terminates.
+ */
+ private class LoadThread extends Thread{
+
+ public LoadThread(final int tnum_){ //tnum_ is both the thread number and the index of this thread's table in keySets
+ tnum=tnum_;
+ map=keySets[tnum];
+ }
+
+ /**
+ * Get the next list of reads (or scaffolds) from the queue.
+ * Blocks in queue.take(); if the wait is interrupted the exception is printed
+ * and the take is retried, so this never returns null.
+ * @return List of reads
+ */
+ private ArrayList<Read> fetch(){
+ ArrayList<Read> list=null;
+ while(list==null){
+ try {
+ list=queue.take();
+ } catch (InterruptedException e) {
+ //Interrupted while waiting: report it; list is still null so the loop retries
+ e.printStackTrace();
+ }
+ }
+ return list;
+ }
+
+ @Override
+ public void run(){ //Consumes lists until POISON arrives, adding every read (and its mate, if any) to the table
+ ArrayList<Read> reads=fetch();
+ while(reads!=POISON){
+ for(Read r1 : reads){
+ assert(r1.pairnum()==0);
+ final Read r2=r1.mate;
+
+ final int rblen=(r1==null ? 0 : r1.length()); //r1.mate was already read above, so the null branch cannot be taken here
+ final int rblen2=r1.mateLength();
+
+ addedT+=addToMap(r1, rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0); //Longer sequences request a larger skip; addToMap clamps it between minSkip and maxSkip
+ if(r2!=null){
+ addedT+=addToMap(r2, rblen2>20000000 ? k : rblen2>5000000 ? 11 : rblen2>500000 ? 2 : 0);
+ }
+ }
+ reads=fetch();
+ }
+
+ if(map.canRebalance() && map.size()>2L*map.arrayLength()){ //After loading, rebalance if the table holds more than twice its array length
+ map.rebalance();
+ }
+ success=true; //Only reached when no exception escaped the loop above
+ }
+
+ /**
+ * Store the read's kmers in a table.
+ * Walks the bases once, maintaining a rolling forward kmer and reverse kmer
+ * with 2 bits per base, and restarts the valid-length counter at every 'N'.
+ * Also updates refReadsT, refBasesT and refKmersT.
+ * @param r The current read to process
+ * @param skip Number of bases to skip between kmers
+ * @return Number of kmers stored
+ */
+ private long addToMap(Read r, int skip){
+ skip=Tools.max(minSkip, Tools.min(maxSkip, skip)); //Clamp the requested skip between minSkip and maxSkip
+ final byte[] bases=r.bases;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift); //For shift<64 this keeps the low 2*k bits of the forward kmer
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ long added=0;
+ int len=0;
+
+ if(bases!=null){
+ refReadsT++;
+ refBasesT+=bases.length;
+ }
+ if(bases==null || bases.length<k){return 0;}
+
+ final int id=(Integer)r.obj; //r.obj holds the scaffold number as an Integer
+
+ if(skip>1){ //Process while skipping some kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;} //len counts the bases seen since the last 'N'
+ if(verbose){System.err.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ refKmersT++;
+ if(len%skip==0){ //Only positions where len is a multiple of skip are stored
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); //Base following the kmer, or -1 at the end of the sequence
+ added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
+ if(useShortKmers){
+ if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} //k2 is defined outside this view; presumably k-1 - TODO confirm
+ if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
+ }
+ }
+ }
+ }
+ }else{ //Process all kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;} //len counts the bases seen since the last 'N'
+ if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ refKmersT++;
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); //Base following the kmer, or -1 at the end of the sequence
+ final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
+ added+=atm;
+// assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask));
+ if(useShortKmers){
+ if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} //k2 is defined outside this view; presumably k-1 - TODO confirm
+ if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
+ }
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Adds short kmers on the left end of the read.
+ * For each length i from k-1 down to mink, the forward kmer is masked with
+ * rightMasks[i], the reverse kmer is shifted right by one base, and the result
+ * is stored using hammingDistance2 and editDistance2.
+ * NOTE(review): the caller invokes this when i==bases.length-1, i.e. at the last
+ * base of the sequence - confirm whether "left" refers to the shift direction
+ * rather than to the end of the read.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param extraBase Base added to end in case of deletions
+ * @param id Scaffold number
+ * @return Number of kmers stored
+ */
+ private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){
+ if(verbose){System.err.println("addToMapLeftShift");}
+ long added=0;
+ for(int i=k-1; i>=mink; i--){
+ kmer=kmer&rightMasks[i];
+ rkmer=rkmer>>>2;
+ long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
+ added+=x;
+ if(verbose){
+ if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ //Debug output only for keys owned by this thread
+ System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
+ System.err.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i));
+ final long value=toValue(kmer, rkmer, lengthMasks[i]);
+ if(map.contains(value)){System.err.println("Found "+value);}
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Adds short kmers on the right end of the read.
+ * For each length i from k-1 down to mink, the lowest base of the forward kmer is
+ * split off and passed on as the extra base, the forward kmer is shifted right by
+ * one base, the reverse kmer is masked with rightMasks[i], and the result is stored
+ * using hammingDistance2 and editDistance2.
+ * NOTE(review): the caller invokes this when i==k2 - confirm whether "right" refers
+ * to the shift direction rather than to the end of the read.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param id Scaffold number
+ * @return Number of kmers stored
+ */
+ private long addToMapRightShift(long kmer, long rkmer, final int id){
+ if(verbose){System.err.println("addToMapRightShift");}
+ long added=0;
+ for(int i=k-1; i>=mink; i--){
+ long extraBase=kmer&3L; //The base about to be shifted out becomes the extra base
+ kmer=kmer>>>2;
+ rkmer=rkmer&rightMasks[i];
+// assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i];
+// assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i];
+ long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2);
+ added+=x;
+ if(verbose){
+ if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ //Debug output only for keys owned by this thread
+ System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
+ System.err.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i));
+ final long value=toValue(kmer, rkmer, lengthMasks[i]);
+ if(map.contains(value)){System.err.println("Found "+value);}
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance.
+ * With hdist==0 only the exact key is stored, and only if it belongs to this thread
+ * (key%WAYS==tnum) and passes the speed filter. Otherwise mutate() is used: with
+ * edist and extraBase when edist>0, else with hdist and no extra base.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param len Kmer length
+ * @param extraBase Base added to end in case of deletions
+ * @param id Scaffold number
+ * @param kmask0 Length mask for this kmer; asserted to equal lengthMasks[len]
+ * @param hdist Hamming distance to apply
+ * @param edist Edit distance to apply; only consulted when hdist is nonzero
+ * @return Number of kmers stored
+ */
+ private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){
+
+ assert(kmask0==lengthMasks[len]) : kmask0+", "+len+", "+lengthMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(lengthMasks[len]);
+
+ if(verbose){System.err.println("addToMap_A; len="+len+"; kMasks[len]="+lengthMasks[len]);}
+ assert((kmer&kmask0)==0);
+ final long added;
+ if(hdist==0){
+ final long key=toValue(kmer, rkmer, kmask0);
+ if(speed>0 && ((key/WAYS)&15)<speed){return 0;} //When speed>0, keys whose ((key/WAYS)&15) is below speed are not stored
+ if(key%WAYS!=tnum){return 0;} //Key belongs to a different LoadThread
+ if(verbose){System.err.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
+ added=map.setIfNotPresent(key, id);
+ }else if(edist>0){
+// long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
+ added=mutate(kmer, rkmer, len, id, edist, extraBase);
+ }else{
+ added=mutate(kmer, rkmer, len, id, hdist, -1);
+ }
+ if(verbose){System.err.println("addToMap added "+added+" keys.");}
+ return added;
+ }
+
+ /**
+ * Mutate and store this kmer through 'dist' recursions.
+ * The kmer itself is stored first (if its key belongs to this thread); then, while
+ * dist>0, every single-base substitution is generated and recursed into with dist-1.
+ * Deletion and insertion variants are generated only when the instance-level
+ * editDistance is positive (the dist argument is not what enables them), and
+ * deletions additionally require extraBase to be in the range 0..3.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param len Kmer length
+ * @param id Scaffold number
+ * @param dist Number of mutations
+ * @param extraBase Base added to end in case of deletions
+ * @return Number of kmers stored
+ */
+ private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){
+ long added=0;
+
+ final long key=toValue(kmer, rkmer, lengthMasks[len]);
+
+ if(verbose){System.err.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+lengthMasks[len]);}
+ if(key%WAYS==tnum){
+ if(verbose){System.err.println("mutate_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
+ int x=map.setIfNotPresent(key, id);
+ if(verbose){System.err.println("mutate_B added "+x+" keys.");}
+ added+=x;
+ assert(map.contains(key));
+ }
+
+ if(dist>0){
+ final int dist2=dist-1;
+
+ //Sub
+ for(int j=0; j<4; j++){
+ for(int i=0; i<len; i++){
+ final long temp=(kmer&clearMasks[i])|setMasks[j][i]; //Replace the base at position i with base j
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, extraBase);
+ }
+ }
+ }
+
+ if(editDistance>0){
+ //Del
+ if(extraBase>=0 && extraBase<=3){
+ for(int i=1; i<len; i++){
+ final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase; //Combines the leftMasks[i] part, the shifted rightMasks[i] part, and extraBase in the lowest bits
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, -1);
+ }
+ }
+ }
+
+ //Ins
+ final long eb2=kmer&3; //Lowest base of the kmer; passed on as the extra base of each insertion variant
+ for(int i=1; i<len; i++){
+ final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2);
+ for(int j=0; j<4; j++){
+ final long temp=temp0|setMasks[j][i-1];
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, eb2);
+ }
+ }
+ }
+ }
+
+ }
+
+ return added;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of kmers stored by this thread */
+ public long addedT=0;
+ /** Number of items encountered by this thread */
+ public long refKmersT=0, refReadsT=0, refBasesT=0;
+ /** Thread number; used to determine which kmers to store */
+ public final int tnum;
+ /** Buffer of input read lists */
+ public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32);
+ /** Used to trick compiler */
+ public long modsumT=0; //123
+
+ /** Destination for storing kmers */
+ private final AbstractKmerTable map;
+
+ /** Completed successfully */
+ boolean success=false;
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output.
+ */
+ private class ProcessThread extends Thread{
+
+ /**
+ * Constructor. Stores the shared streams and allocates the per-thread work buffers
+ * that the currently enabled options require; buffers for disabled options stay null.
+ * @param cris_ Read input stream
+ * @param ros_ Unmatched read output stream (optional)
+ * @param rosb_ Matched read output stream (optional)
+ * @param ross_ Singleton read output stream (optional)
+ * @param localArrays Permit per-thread scaffold count arrays when the scaffold count is between 1 and 9999
+ */
+ public ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays){
+ cris=cris_;
+ ros=ros_;
+ rosb=rosb_;
+ ross=ross_;
+
+ //A ReadStats object is only needed when at least one histogram is requested.
+ final boolean wantStats=MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY ||
+ MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST;
+ if(wantStats){
+ readstats=new ReadStats();
+ }else{
+ readstats=null;
+ }
+
+ //Number of scaffolds known at construction time (0 if there is no name list).
+ final int scaffolds;
+ if(scaffoldNames==null){scaffolds=0;}else{scaffolds=scaffoldNames.size();}
+
+ //Best-match mode needs a count per scaffold plus two scratch lists.
+ countArray=(findBestMatch ? new int[scaffolds] : null);
+ idList=(findBestMatch ? new IntList() : null);
+ countList=(findBestMatch ? new IntList() : null);
+
+ if(trimByOverlap){
+ overlapVector=new int[5];
+ }else{
+ overlapVector=null;
+ }
+
+ if(hitCounts!=null){
+ hitCountsT=new long[hitCounts.length];
+ }else{
+ hitCountsT=null;
+ }
+
+ //Thread-local scaffold counters are used only when allowed, the scaffold count is in (0, 10000), and both shared arrays exist.
+ final boolean useLocal=localArrays && scaffolds>0 && scaffolds<10000 && scaffoldReadCounts!=null && scaffoldBaseCounts!=null;
+ scaffoldReadCountsT=(useLocal ? new long[scaffolds] : null);
+ scaffoldBaseCountsT=(useLocal ? new long[scaffolds] : null);
+
+ //Entropy buffers; slot 0 of the count-of-counts array starts at the window size.
+ if(!calcEntropy){
+ entropyCounts=null;
+ entropyCountCounts=null;
+ }else{
+ entropyCounts=new short[entropyKmerspace];
+ entropyCountCounts=new short[entropyWindow+2];
+ entropyCountCounts[0]=(short)entropyWindow;
+ }
+
+ //Split the global output base limits across threads (at least 1 each); -1 means no limit.
+ if(maxBasesOutm>0){
+ maxBasesOutmT=Tools.max(1, maxBasesOutm/THREADS);
+ }else{
+ maxBasesOutmT=-1;
+ }
+ if(maxBasesOutu>0){
+ maxBasesOutuT=Tools.max(1, maxBasesOutu/THREADS);
+ }else{
+ maxBasesOutuT=-1;
+ }
+ }
+
+ /**
+ * Main loop of the thread: repeatedly takes a list of reads from cris, runs every
+ * read (or pair) through the enabled filters and trimmers, updates the per-thread
+ * counters, and hands the kept, removed and singleton reads to ros, rosb and ross.
+ * The loop ends when the input yields no more reads or when a per-thread output
+ * base limit (maxBasesOutmT / maxBasesOutuT) has been reached.
+ */
+ @Override
+ public void run(){
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+ //Reads destined for rosb; only allocated when that stream exists.
+ ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+ //Reads whose mate was discarded; only filled when ross exists.
+ ArrayList<Read> single=new ArrayList<Read>(Shared.READ_BUFFER_LENGTH);
+
+ //While there are more reads lists...
+ while(reads!=null && reads.size()>0){
+
+ int removed=0;
+
+ //For each read (or pair) in the list...
+ for(int i=0; i<reads.size(); i++){
+ final Read r1=reads.get(i);
+ final Read r2=r1.mate;
+
+ if(!r1.validated()){r1.validate(true);}
+ if(r2!=null && !r2.validated()){r2.validate(true);}
+
+ if(readstats!=null){
+ if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r1);}
+ if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r1);}
+ if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r1);}
+ if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r1);}
+
+ if(MAKE_EHIST){readstats.addToErrorHistogram(r1);}
+ if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);}
+ if(MAKE_LHIST){readstats.addToLengthHistogram(r1);}
+ if(MAKE_GCHIST){readstats.addToGCHistogram(r1);}
+ if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);}
+ }
+
+ if(loglog!=null){loglog.hash(r1);}
+
+ final int initialLength1=r1.length();
+ final int initialLength2=r1.mateLength();
+
+ //Minimum acceptable length after trimming: the larger of the fractional and the absolute limit.
+ final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength);
+ final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength);
+
+ if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+ readsInT++;
+ basesInT+=initialLength1;
+ if(r2!=null){
+ readsInT++;
+ basesInT+=initialLength2;
+ }
+
+ if(chastityFilter){
+ if(r1!=null && r1.failsChastity()){
+ r1.setDiscarded(true);
+ if(r2!=null){r2.setDiscarded(true);}
+ }
+ }
+
+ if(removeBadBarcodes){
+ if(r1!=null && !r1.discarded() && r1.failsBarcode(barcodes, failIfNoBarcode)){
+ if(failBadBarcodes){KillSwitch.kill("Invalid barcode detected: "+r1.id+"\nThis can be disabled with the flag barcodefilter=f");}
+ r1.setDiscarded(true);
+ if(r2!=null){r2.setDiscarded(true);}
+ }
+ }
+
+ if(recalibrateQuality){
+ if(r1!=null && !r1.discarded()){
+ CalcTrueQuality.recalibrate(r1);
+ }
+ if(r2!=null && !r2.discarded()){
+ CalcTrueQuality.recalibrate(r2);
+ }
+ }
+
+ if(filterGC && (initialLength1>0 || initialLength2>0)){
+ //For pairs the GC fraction is the length-weighted mean of both reads.
+ final float gc;
+ if(r2==null){
+ gc=r1.gc();
+ }else{
+ gc=(r1.gc()*initialLength1+r2.gc()*initialLength2)/(initialLength1+initialLength2);
+ }
+ if(gc<minGC || gc>maxGC){
+ if(r1!=null && !r1.discarded()){
+ r1.setDiscarded(true);
+ badGcBasesT+=initialLength1;
+ badGcReadsT++;
+ }
+ if(r2!=null && !r2.discarded()){
+ r2.setDiscarded(true);
+ badGcBasesT+=initialLength2;
+ badGcReadsT++;
+ }
+ }
+ }
+
+ if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
+ //Forced trimming: keep positions a..b, where b is the tightest of the enabled right-side limits.
+ if(r1!=null && !r1.discarded()){
+ final int len=r1.length();
+ final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+ final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+ final int b1=forceTrimRight>0 ? forceTrimRight : len;
+ final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+ final int b=Tools.min(b0, b1, b2);
+ final int x=TrimRead.trimToPosition(r1, a, b, 1);
+ basesFTrimmedT+=x;
+ readsFTrimmedT+=(x>0 ? 1 : 0);
+ if(r1.length()<minlen1){r1.setDiscarded(true);}
+ }
+ if(r2!=null && !r2.discarded()){
+ final int len=r2.length();
+ final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+ final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+ final int b1=forceTrimRight>0 ? forceTrimRight : len;
+ final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+ final int b=Tools.min(b0, b1, b2);
+ final int x=TrimRead.trimToPosition(r2, a, b, 1);
+ basesFTrimmedT+=x;
+ readsFTrimmedT+=(x>0 ? 1 : 0);
+ if(r2.length()<minlen2){r2.setDiscarded(true);}
+ }
+ }
+
+ //A pair is removed if either read is bad (removePairsIfEitherBad) or, otherwise, only if both are.
+ boolean remove;
+ if(removePairsIfEitherBad){remove=r1.discarded() || (r2!=null && r2.discarded());}
+ else{remove=r1.discarded() && (r2==null || r2.discarded());}
+
+ if(remove){
+ if(r1!=null){
+ basesQFilteredT+=r1.length();
+ readsQFilteredT++;
+ }
+ if(r2!=null){
+ basesQFilteredT+=r2.length();
+ readsQFilteredT++;
+ }
+ if(bad!=null){bad.add(r1);}
+ }else{
+
+ if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+
+ //Process kmers
+ if(ktrimLeft || ktrimRight || ktrimN){
+
+ int rlen1=0, rlen2=0;
+ int xsum=0;
+ int rktsum=0;
+
+ if(ktrimN){
+ if(r1!=null){
+ int x=kmask(r1, keySets);
+ xsum+=x;
+ rktsum+=(x>0 ? 1 : 0);
+ rlen1=r1.length();
+ if(rlen1<minlen1){r1.setDiscarded(true);}
+ }
+ if(r2!=null){
+ int x=kmask(r2, keySets);
+ xsum+=x;
+ rktsum+=(x>0 ? 1 : 0);
+ rlen2=r2.length();
+ if(rlen2<minlen2){r2.setDiscarded(true);}
+ }
+ }else{
+ if(r1!=null){
+ int x=ktrim(r1, keySets);
+ xsum+=x;
+ rktsum+=(x>0 ? 1 : 0);
+ rlen1=r1.length();
+ if(rlen1<minlen1){r1.setDiscarded(true);}
+ }
+ if(r2!=null){
+ int x=ktrim(r2, keySets);
+ xsum+=x;
+ rktsum+=(x>0 ? 1 : 0);
+ rlen2=r2.length();
+ if(rlen2<minlen2){r2.setDiscarded(true);}
+ }
+ }
+
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ if(!ktrimN){
+ //When the read or pair is dropped after trimming, its remaining bases are counted as trimmed as well.
+ xsum+=(rlen1+rlen2);
+ rktsum=(r1==null ? 0 : 1)+(r2==null ? 0 : 1);
+ }
+ remove=true;
+ if(addTrimmedToBad && bad!=null){bad.add(r1);}
+ }else if(ktrimRight && trimPairsEvenly && xsum>0 && r2!=null && r1.length()!=r2.length()){
+ //Trim the longer read of the pair down to the length of the shorter one.
+ int x;
+ if(r1.length()>r2.length()){
+ x=TrimRead.trimToPosition(r1, 0, r2.length()-1, 1);
+ }else{
+ x=TrimRead.trimToPosition(r2, 0, r1.length()-1, 1);
+ }
+ if(rktsum<2){rktsum++;}
+ xsum+=x;
+ assert(r1.length()==r2.length()) : r1.length()+", "+r2.length();
+ }
+ basesKTrimmedT+=xsum;
+ readsKTrimmedT+=rktsum;
+
+ }else{
+ //Do kmer matching
+
+ if(minCoveredFraction>0){
+ //Coverage mode: discard a read once at least minCoveredFraction of its bases are covered.
+ if(r1!=null && !r1.discarded()){
+ final int minCoveredBases=(int)Math.ceil(minCoveredFraction*r1.length());
+ final int covered=countCoveredBases(r1, keySets, minCoveredBases);
+ if(covered>=minCoveredBases){r1.setDiscarded(true);}
+ }
+ if(r2!=null && !r2.discarded()){
+ final int minCoveredBases=(int)Math.ceil(minCoveredFraction*r2.length());
+ final int covered=countCoveredBases(r2, keySets, minCoveredBases);
+ if(covered>=minCoveredBases){r2.setDiscarded(true);}
+ }
+ }else{
+
+ //Threshold of matching kmers per read; optionally scaled by the number of valid kmers.
+ final int maxBadKmersR1, maxBadKmersR2;
+ if(minKmerFraction==0){
+ maxBadKmersR1=maxBadKmersR2=maxBadKmers0;
+ }else{
+ final int vk1=r1.numValidKmers(keff), vk2=(r2==null ? 0 : r2.numValidKmers(keff));
+ maxBadKmersR1=Tools.max(maxBadKmers0, (int)((vk1-1)*minKmerFraction));
+ maxBadKmersR2=Tools.max(maxBadKmers0, (int)((vk2-1)*minKmerFraction));
+ }
+
+ if(!findBestMatch){
+ final int a=(kbig<=k ? countSetKmers(r1, keySets, maxBadKmersR1) : countSetKmersBig(r1, keySets, maxBadKmersR1));
+ final int b=(kbig<=k ? countSetKmers(r2, keySets, maxBadKmersR2) : countSetKmersBig(r2, keySets, maxBadKmersR2));
+
+ if(r1!=null && a>maxBadKmersR1){r1.setDiscarded(true);}
+ if(r2!=null && b>maxBadKmersR2){r2.setDiscarded(true);}
+
+ }else{
+ final int a=findBestMatch(r1, keySets, maxBadKmersR1);
+ final int b=findBestMatch(r2, keySets, maxBadKmersR2);
+
+ if(r1!=null && a>0){r1.setDiscarded(true);}
+ if(r2!=null && b>0){r2.setDiscarded(true);}
+ }
+ }
+
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ remove=true;
+ if(r1!=null){
+ readsKFilteredT++;
+ basesKFilteredT+=r1.length();
+ }
+ if(r2!=null){
+ readsKFilteredT++;
+ basesKFilteredT+=r2.length();
+ }
+ if(bad!=null){bad.add(r1);}
+ }
+
+ }
+ }
+
+// assert(false) : remove+", "+trimByOverlap+", "+(r2!=null);
+
+ if(!remove && trimByOverlap && r2!=null && expectedErrors(r1, r2)<meeFilter){
+
+ //Reusable probability buffers, grown on demand to the read lengths.
+ if(aprob==null || aprob.length<r1.length()){aprob=new float[r1.length()];}
+ if(bprob==null || bprob.length<r2.length()){bprob=new float[r2.length()];}
+
+ //Do overlap trimming
+ r2.reverseComplement();
+// int bestInsert=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
+// overlapMargin, overlapMaxMismatches0, overlapMaxMismatches, overlapMinq);
+ int bestInsert=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap,
+ minInsert0, minInsert, maxRatio, ratioMargin, ratioOffset, 0.95f, 0.95f, useQualityForOverlap);
+
+ if(bestInsert<minInsert){bestInsert=-1;}
+ boolean ambig=(overlapVector[4]==1);
+ final int bestBad=overlapVector[2];
+
+ if(bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null && useQualityForOverlap){
+ if(efilterRatio>0 && bestInsert>0 && !ambig){
+ float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
+ if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;}
+ }
+ if(pfilterRatio>0 && bestInsert>0 && !ambig){
+ float probability=BBMergeOverlapper.probability(r1, r2, bestInsert);
+ if(probability<pfilterRatio){bestInsert=-1;}
+ }
+ if(meeFilter>=0 && bestInsert>0 && !ambig){
+ float expected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
+ if(expected>meeFilter){bestInsert=-1;}
+ }
+ }
+
+ //Undo the reverse-complement applied before the overlap search.
+ r2.reverseComplement();
+
+ if(bestInsert>0 && !ambig){
+ if(bestInsert<r1.length()){
+ if(verbose){System.err.println("Overlap right trimming r1 to "+0+", "+(bestInsert-1));}
+ int x=TrimRead.trimToPosition(r1, 0, bestInsert-1, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r1.bases));}
+ readsTrimmedByOverlapT++;
+ basesTrimmedByOverlapT+=x;
+ }
+ if(bestInsert<r2.length()){
+ if(verbose){System.err.println("Overlap right trimming r2 to "+0+", "+(bestInsert-1));}
+ int x=TrimRead.trimToPosition(r2, 0, bestInsert-1, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r2.bases));}
+ readsTrimmedByOverlapT++;
+ basesTrimmedByOverlapT+=x;
+ }
+ }
+ }
+
+ if(!remove){
+ //Do quality trimming
+
+ int rlen1=0, rlen2=0;
+ if(r1!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ rlen1=r1.length();
+ if(rlen1<minlen1 || rlen1>maxReadLength){r1.setDiscarded(true);}
+ }
+ if(r2!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ rlen2=r2.length();
+ if(rlen2<minlen2 || rlen2>maxReadLength){r2.setDiscarded(true);}
+ }
+
+ //Discard reads if too short
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ basesQTrimmedT+=(r1.length()+r1.mateLength());
+ remove=true;
+ if(addTrimmedToBad && bad!=null){bad.add(r1);}
+ }
+
+ }
+
+ if(!remove){
+ //Do quality filtering
+
+ //Determine whether to discard the reads based on average quality
+ if(minAvgQuality>0){
+ if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);}
+ if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);}
+ }
+ //Determine whether to discard the reads based on the presence of Ns
+ if(maxNs>=0){
+ if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);}
+ if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);}
+ }
+ //Determine whether to discard the reads based on a lack of useful kmers
+ if(minConsecutiveBases>0){
+ if(r1!=null && !r1.discarded() && !r1.hasMinConsecutiveBases(minConsecutiveBases)){r1.setDiscarded(true);}
+ if(r2!=null && !r2.discarded() && !r2.hasMinConsecutiveBases(minConsecutiveBases)){r2.setDiscarded(true);}
+ }
+ //Determine whether to discard the reads based on minimum base frequency
+ if(minBaseFrequency>0){
+ if(r1!=null && r1.minBaseCount()<minBaseFrequency*r1.length()){r1.setDiscarded(true);}
+ if(r2!=null && r2.minBaseCount()<minBaseFrequency*r2.length()){r2.setDiscarded(true);}
+ }
+
+ //Remove the read or pair if the quality filters above discarded it
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ basesQFilteredT+=(r1.length()+r1.mateLength());
+ readsQFilteredT+=1+r1.mateCount();
+ remove=true;
+ if(addTrimmedToBad && bad!=null){bad.add(r1);}
+ }
+ }
+
+ if(!remove && calcEntropy){
+ //Test entropy
+
+ if(r1!=null && !r1.discarded() && entropyCutoff>averageEntropy(r1.bases, entropyK, entropyWindow,
+ entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r1.setDiscarded(true);}
+ if(r2!=null && !r2.discarded() && entropyCutoff>averageEntropy(r2.bases, entropyK, entropyWindow,
+ entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r2.setDiscarded(true);}
+
+ if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+ (r1.discarded() && (r2==null || r2.discarded()))){
+ basesEFilteredT+=(r1.length()+r1.mateLength());
+ readsEFilteredT+=(r1==null ? 0 : 1)+(r2==null ? 0 : 1);
+ remove=true;
+ if(bad!=null){bad.add(r1);}
+ }
+ }
+
+ if(ross!=null){
+ //Exactly one read of the pair survived: emit an unpaired clone of the survivor.
+ if(!r1.discarded() && (r2==null || r2.discarded())){
+ Read clone=r1.clone();
+ clone.mate=null;
+ single.add(clone);
+ }else if(r2!=null && r1.discarded() && !r2.discarded()){
+ Read clone=r2.clone();
+ clone.mate=null;
+ single.add(clone);
+ }
+ }
+
+ if(remove){
+ //Evict read
+ removed++;
+ if(r2!=null){removed++;}
+ reads.set(i, null);
+
+ readsOutmT+=1+r1.mateCount();
+ basesOutmT+=r1.length()+r1.mateLength();
+ }else{
+ readsOutuT+=1+r1.mateCount();
+ basesOutuT+=r1.length()+r1.mateLength();
+ }
+ }
+
+ //Send matched list to matched output stream
+ if(rosb!=null){
+ rosb.add(bad, ln.id);
+ bad.clear();
+ }
+
+ //Send unmatched list to unmatched output stream
+ if(ros!=null){
+ ros.add((removed>0 ? Tools.condenseNew(reads) : reads), ln.id); //Creates a new list if old one became empty, to prevent shutting down the cris.
+ }
+
+ if(ross!=null){
+ ross.add(single, ln.id);
+ single.clear();
+ }
+
+ //Stop early once this thread's share of the output base limit is reached; the current list is returned after the loop.
+ if(maxBasesOutmT>=0 && basesOutmT>=maxBasesOutmT){break;}
+ if(maxBasesOutuT>=0 && basesOutuT>=maxBasesOutuT){break;}
+
+ //Fetch a new read list
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //Return the last list fetched. The loop ends exactly when ln or its list may be null
+ //(the checks above allow for both), so guard before dereferencing.
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Looks up a kmer and, if it is not found, every variant reachable by up to
+ * qHDist single-base substitutions, stopping at the first lookup that returns
+ * a value of at least 1.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param lengthMask Bitmask with single '1' set to left of kmer
+ * @param qPos Position of kmer in query
+ * @param len kmer length
+ * @param qHDist Hamming distance
+ * @param sets Kmer hash tables
+ * @return Value stored in table, or -1
+ */
+ private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets){
+ assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+
+ //Exact lookup first; finish here on a hit or when no substitutions are allowed.
+ int found=getValue(kmer, rkmer, lengthMask, qPos, sets);
+ if(found>=1 || qHDist<=0){return found;}
+
+ final int remaining=qHDist-1;
+
+ //Sub: try each of the 4 bases at each position, recursing with one fewer substitution allowed.
+ for(int base=0; base<4 && found<1; base++){
+ for(int pos=0; pos<len && found<1; pos++){
+ final long mutant=(kmer&clearMasks[pos])|setMasks[base][pos];
+ if(mutant==kmer){continue;} //Same base as the original; nothing new to look up
+ final long rmutant=AminoAcid.reverseComplementBinaryFast(mutant, len);
+ found=getValue(mutant, rmutant, lengthMask, qPos, len, remaining, sets);
+ }
+ }
+ return found;
+ }
+
+ /**
+ * Converts a kmer to the key form used by the tables and looks that key up.
+ * The key is the larger of kmer and rkmer when rcomp is set (otherwise kmer),
+ * ANDed with middleMask and ORed with lengthMask.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param lengthMask Bitmask with single '1' set to left of kmer
+ * @param qPos Position of kmer in query
+ * @param sets Kmer hash tables
+ * @return Value stored in table for the key, or -1 if the lookup was skipped because of qSkip or the speed setting
+ */
+ private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final AbstractKmerTable[] sets){
+ assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+
+ //With qSkip>1, only query positions that are multiples of qSkip are looked up
+ final boolean skipPosition=(qSkip>1 && (qPos%qSkip!=0));
+ if(skipPosition){return -1;}
+
+ final long canonical;
+ if(rcomp){
+ canonical=Tools.max(kmer, rkmer);
+ }else{
+ canonical=kmer;
+ }
+ final long key=(canonical&middleMask)|lengthMask;
+
+ //Unless noAccel is set, keys whose ((key/WAYS)&15) is below speed are not looked up
+ if(!noAccel && ((key/WAYS)&15)<speed){return -1;}
+
+ if(verbose){System.err.println("Testing key "+key);}
+ final AbstractKmerTable table=sets[(int)(key%WAYS)];
+ return table.getValue(key);
+ }
+
+
+ /**
+ * Counts the number of kmer hits for a read.
+ * Scans bases[start, stop) while maintaining a rolling forward kmer and reverse kmer,
+ * and looks each window up via getValue with qHammingDistance.
+ * On the hit that takes the count past maxBadKmers, one read and bases.length bases are
+ * credited to the scaffold id of that hit; if the shared hitCounts array is null the
+ * method then returns immediately instead of scanning the rest of the read.
+ * When the scan runs to completion and hitCountsT is non-null, the slot for the hit
+ * count (capped at HITCOUNT_LEN) is incremented in the per-thread histogram.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @param maxBadKmers Number of hits tolerated before the read is credited to a scaffold
+ * @return Number of hits; 0 if the read is null, shorter than k, excluded by skipR1/skipR2, or storedKmers is below 1
+ */
+ private final int countSetKmers(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){
+ if(r==null || r.length()<k || storedKmers<1){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ final byte[] bases=r.bases;
+ final int minlen=k-1; //Lookups begin once index i has reached k-1
+ final int minlen2=(maskMiddle ? k/2 : k); //Required run of bases since the last forbidden N
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); //restrictRight>0 limits the scan to the last restrictRight bases
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); //restrictLeft>0 limits the scan to the first restrictLeft bases
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+ if(id>0){
+ if(verbose){System.err.println("Found = "+(found+1)+"/"+maxBadKmers);}
+ if(found==maxBadKmers){ //This hit takes the count to maxBadKmers+1; happens at most once per read
+ if(scaffoldReadCountsT!=null){ //Per-thread arrays when present, otherwise the shared atomic arrays
+ scaffoldReadCountsT[id]++;
+ scaffoldBaseCountsT[id]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(id, 1);
+ scaffoldBaseCounts.addAndGet(id, bases.length);
+ }
+ if(hitCounts==null){ //Tests the shared hitCounts, not the per-thread hitCountsT
+ return (found=found+1);
+ }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
+ }
+ found++;
+ }
+ }
+ }
+
+ if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
+ return found;
+ }
+
+
+ /**
+ * Counts the number of read bases covered by kmer hits.
+ * Scans bases[start, stop) with a rolling forward and reverse kmer. Each hit at index i
+ * adds min(k, i-lastFound) to the total, so bases shared by overlapping hits are
+ * counted once.
+ * The first time the total reaches minCoveredBases, one read and bases.length bases are
+ * credited to the scaffold id of that hit. The credit is given at most once per read.
+ * If the shared hitCounts array is null the method returns as soon as the threshold is
+ * reached; otherwise it scans the whole read and, when hitCountsT is non-null, increments
+ * the per-thread histogram slot for the total (capped at HITCOUNT_LEN).
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @param minCoveredBases Number of covered bases at which the read is credited to a scaffold
+ * @return Number of covered bases counted; 0 if the read is null, shorter than k, excluded by skipR1/skipR2, or storedKmers is below 1
+ */
+ private final int countCoveredBases(final Read r, final AbstractKmerTable[] sets, final int minCoveredBases){
+ if(r==null || r.length()<k || storedKmers<1){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ final byte[] bases=r.bases;
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+ int lastFound=-1;
+ boolean recorded=false; //Becomes true once this read has been credited to a scaffold
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+ if(id>0){
+
+ //Only the bases not already covered by the previous hit are new
+ int extra=Tools.min(k, i-lastFound);
+ found+=extra;
+ lastFound=i;
+
+ if(verbose){System.err.println("Found = "+found+"/"+minCoveredBases);}
+ if(found>=minCoveredBases){
+ if(!recorded){
+ //Without this flag being set, every later hit credited the scaffold again
+ //whenever the scan continued (hitCounts!=null), inflating the per-scaffold counts.
+ recorded=true;
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[id]++;
+ scaffoldBaseCountsT[id]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(id, 1);
+ scaffoldBaseCounts.addAndGet(id, bases.length);
+ }
+ }
+ if(hitCounts==null){
+ return found; //No histogram requested, so the rest of the read need not be scanned
+ }
+ }
+ }
+ }
+ }
+
+ if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
+ return found;
+ }
+
+ /**
+ * Returns the id of the sequence with the most kmer matches to this read, or -1 if the
+ * number of matches does not exceed maxBadKmers.
+ * Hits are tallied per id in countArray, with idList recording which ids were touched.
+ * When the total exceeds maxBadKmers the tallies are packed into countList by
+ * condenseLoose (which also zeroes the touched countArray slots), the first id holding
+ * the maximum tally is chosen, the read is optionally renamed, and one read plus
+ * bases.length bases are credited to that id.
+ * When the total does not exceed maxBadKmers the touched countArray slots are zeroed
+ * here, so that tallies from this read cannot carry over into the next call.
+ * If hitCountsT is non-null, the histogram slot for the total (capped at HITCOUNT_LEN)
+ * is incremented.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @param maxBadKmers Number of hits that must be exceeded for a match to be reported
+ * @return id of best match, or -1
+ */
+ private final int findBestMatch(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){
+ idList.size=0;
+ if(r==null || r.length()<k || storedKmers<1){return -1;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return -1;}
+ final byte[] bases=r.bases;
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int len=0;
+ int found=0;
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(id>0){
+ countArray[id]++;
+ if(countArray[id]==1){idList.add(id);} //First hit to this id in the current read
+ found++;
+ if(verbose){System.err.println("Found = "+found+"/"+maxBadKmers);}
+ }
+ }
+ }
+
+ int id=-1;
+ if(found>maxBadKmers){
+ final int max=condenseLoose(countArray, idList, countList);
+ for(int i=0; i<countList.size; i++){
+ if(countList.get(i)==max){
+ id=idList.get(i); break;
+ }
+ }
+ if(rename){rename(r, idList, countList);}
+ }else{
+ //condenseLoose was not run, so clear the tallies by hand. Leaving them in place made
+ //the next read start from stale counts: its ids were not re-added to idList
+ //(countArray[id] was already past 1) and the best id could stay at -1.
+ for(int i=0; i<idList.size; i++){countArray[idList.get(i)]=0;}
+ }
+
+ //id stays -1 when nothing was selected; never index the counters with it
+ if(id>0){
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[id]++;
+ scaffoldBaseCountsT[id]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(id, 1);
+ scaffoldBaseCounts.addAndGet(id, bases.length);
+ }
+ }
+
+ if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
+ return id;
+ }
+
+ /** Estimates kmer hit counts for kmers longer than k using consecutive matches.
+ * A run of consecutive length-k hits whose first and last hit end at bkStart and bkStop
+ * contributes bkStop-bkStart-(kbig-k-1) to the total when that value is positive, i.e.
+ * the number of length-kbig windows the run spans.
+ * A run is closed by a lookup that returns no id, or by the end of the scan.
+ * NOTE(review): positions where no lookup is attempted (len below minlen2 after a
+ * forbidden N) do not close a run, so a run can continue across them - confirm intended.
+ * When the total first exceeds maxBadKmers, one read and bases.length bases are credited
+ * to the id of the most recent hit; if the shared hitCounts array is null and this
+ * happens inside the loop, the method returns immediately.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @param maxBadKmers Number of hits tolerated before the read is credited to a scaffold
+ * @return Estimated number of length-kbig hits; 0 if the read is null, shorter than kbig, excluded by skipR1/skipR2, or storedKmers is below 1
+ */
+ private final int countSetKmersBig(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){
+ if(r==null || r.length()<kbig || storedKmers<1){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ assert(kbig>k);
+ final int sub=kbig-k-1; //Subtracted from each run's span to convert it to a count of kbig-length windows
+ assert(sub>=0) : kbig+", "+sub;
+ final byte[] bases=r.bases;
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+
+ int bkStart=-1; //Index of the first hit in the current run, or -1 when no run is open
+ int bkStop=-1; //Index of the latest hit in the current run
+ int id=-1, lastId=-1;
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning7 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+ if(id>0){
+ lastId=id;
+ if(bkStart==-1){bkStart=i;} //Open a new run
+ bkStop=i;
+ }else{
+ if(bkStart>-1){ //A miss closes the open run
+ int dif=bkStop-bkStart-sub;
+ bkStop=bkStart=-1;
+ if(dif>0){
+ int old=found;
+ found+=dif;
+ if(found>maxBadKmers && old<=maxBadKmers){ //Threshold crossed by this run; true at most once
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[lastId]++;
+ scaffoldBaseCountsT[lastId]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(lastId, 1);
+ scaffoldBaseCounts.addAndGet(lastId, bases.length);
+ }
+ if(hitCounts==null){
+ return found;
+ }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1.
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // This catches the case where valid kmers extend to the end of the read
+ if(bkStart>-1){
+ int dif=bkStop-bkStart-sub;
+ bkStop=bkStart=-1;
+ if(dif>0){
+ int old=found;
+ found+=dif;
+ if(found>maxBadKmers && old<=maxBadKmers){
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[lastId]++;
+ scaffoldBaseCountsT[lastId]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(lastId, 1);
+ scaffoldBaseCounts.addAndGet(lastId, bases.length);
+ }
+ }
+ }
+ }
+
+ if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;}
+ return found;
+ }
+
+ /**
+ * Trim a read to remove matching kmers and everything to their left or right.
+ * Phase 1 scans bases[start, stop) for full-length (k) kmer hits and records the leftmost
+ * covered index (minLoc) and the index of the last base of the rightmost hit (maxLoc).
+ * Phase 2 runs only when useShortKmers is set and phase 1 found nothing: kmers of length
+ * mink and up, anchored at the scan start (when ktrimLeft) and at the scan end (when
+ * ktrimRight), are looked up with qHammingDistance2.
+ * If anything was found, one read and bases.length bases are credited to the scaffold id
+ * of the first hit, the locations are widened by trimPad, and TrimRead.trimToPosition
+ * is called with the resulting bounds (the *Exclusive bounds when ktrimExclusive is set).
+ * NOTE(review): the locals quals and initialLength are never read, and the message of the
+ * assertion in phase 2 appends "k=" without the value of k.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of bases trimmed, as reported by TrimRead.trimToPosition; 0 if nothing was found or the read was not eligible
+ */
+ private final int ktrim(final Read r, final AbstractKmerTable[] sets){
+ assert(ktrimLeft || ktrimRight);
+ if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ if(verbose){System.err.println("KTrimming read "+r.id);}
+ final byte[] bases=r.bases, quals=r.quality;
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+ int id0=-1; //ID of first kmer found.
+
+ int minLoc=999999999, minLocExclusive=999999999; //Sentinels meaning nothing found yet
+ int maxLoc=-1, maxLocExclusive=-1; //Sentinels meaning nothing found yet
+ final int initialLength=r.length();
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ //Scan for normal kmers
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ minLoc=Tools.min(minLoc, i-k+1); //First base of the kmer ending at i
+ assert(minLoc>=0);
+ maxLoc=i; //Last base of the kmer ending at i
+ found++;
+ }
+ }
+ }
+
+ if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;} //Differs from the shared sentinel only if a full-length hit was found
+ if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;} //Differs from the shared sentinel only if a full-length hit was found
+
+ //If nothing was found, scan for short kmers. Only used for trimming.
+ if(useShortKmers && found==0){
+ assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
+
+ //Look for short kmers on left side
+ if(ktrimLeft){
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.min(k, stop); //Only indices below k are examined
+ for(int i=start; i<lim; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=rkmer|(x2<<(2*len)); //The new complement base goes above the bits already present
+ len++;
+ if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+
+ if(verbose){
+ System.err.println("Looking for left kmer "+AminoAcid.kmerToString(kmer, len));
+ System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len));
+ }
+
+ final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){System.err.println("Found "+kmer);}
+ minLoc=0;
+ minLocExclusive=Tools.min(minLocExclusive, i+1);
+ maxLoc=Tools.max(maxLoc, i);
+ maxLocExclusive=Tools.max(maxLocExclusive, 0);
+ found++;
+ }
+ }
+ }
+ }
+
+ //Look for short kmers on right side
+ if(ktrimRight){
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.max(-1, stop-k); //Only the last k indices before stop are examined
+ for(int i=stop-1; i>lim; i--){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=kmer|(x<<(2*len)); //Walking right to left, so the new base goes above the bits already present
+ rkmer=((rkmer<<2)|x2)&mask;
+ len++;
+ if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+ if(verbose){
+ System.err.println("Looking for right kmer "+
+ AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
+ }
+ final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){System.err.println("Found "+kmer);}
+ minLoc=i;
+ minLocExclusive=Tools.min(minLocExclusive, bases.length);
+ maxLoc=bases.length-1;
+ maxLocExclusive=Tools.max(maxLocExclusive, i-1);
+ found++;
+ }
+ }
+ }
+ }
+ }
+
+
+ if(verbose){System.err.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);}
+
+ if(found==0){return 0;}
+ assert(found>0) : "Overflow in 'found' variable.";
+
+ {//Increment counter for the scaffold whose kmer was first detected
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[id0]++;
+ scaffoldBaseCountsT[id0]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(id0, 1);
+ scaffoldBaseCounts.addAndGet(id0, bases.length);
+ }
+ }
+
+ if(trimPad!=0){ //Widen the hit zone by trimPad on each side; Tools.mid presumably clamps the middle argument to [0, bases.length] - confirm
+ maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length);
+ minLoc=Tools.mid(0, minLoc-trimPad, bases.length);
+ maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length);
+ minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length);
+ }
+
+ if(ktrimLeft){ //Trim from the read start to the rightmost kmer base
+ if(verbose){System.err.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);}
+ int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));}
+ return x;
+ }else{ //Trim from the leftmost kmer base to the read stop
+ assert(ktrimRight);
+ if(verbose){System.err.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));}
+ int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));}
+ return x;
+ }
+ }
+
+
+ /**
+ * Mask a read to cover matching kmers.
+ * A BitSet marks the positions to mask. In the default mode it starts empty and every
+ * hit sets the span of that kmer widened by trimPad. When kmaskFullyCovered is set it
+ * starts with every base marked and every examined window that is not a hit clears its
+ * span instead, so only positions never touched by a non-matching window stay marked.
+ * Full-length kmers are scanned first; when useShortKmers is set, kmers of length mink
+ * and up anchored at the scan start and at the scan end are examined as well.
+ * If at least one hit was found, one read and bases.length bases are credited to the
+ * scaffold id of the first hit, and each marked base is either lowercased (when
+ * kmaskLowercase is set) or replaced by trimSymbol; in the latter case its quality is
+ * set to 0 when trimSymbol is 'N' and qualities are present.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of bases masked (the cardinality of the BitSet); 0 if nothing was found or the read was not eligible
+ */
+ private final int kmask(final Read r, final AbstractKmerTable[] sets){
+ assert(ktrimN);
+ if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ if(verbose){System.err.println("KMasking read "+r.id);}
+ final byte[] bases=r.bases, quals=r.quality;
+ if(bases==null || bases.length<k){return 0;}
+ final int minlen=k-1;
+ final int minminlen=mink-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+ int id0=-1; //ID of first kmer found.
+
+ final BitSet bs=new BitSet(bases.length+trimPad+1);
+ if(kmaskFullyCovered){bs.set(0, bases.length);} //Start fully marked; non-matching windows will clear their spans
+
+ final int minus=k-1-trimPad; //Span of a kmer ending at i starts at i-minus
+ final int plus=trimPad+1; //Span of a kmer ending at i ends (exclusive) at i+plus
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ //Scan for normal kmers
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+
+ if(i>=minlen){
+ final int id;
+ if(len>=minlen2){
+ id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ }else{
+ id=-1; //Too few bases since the last forbidden N; treated as a miss
+ }
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("a: Found "+kmer);
+ System.err.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus));
+ System.err.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k);
+ }
+ if(!kmaskFullyCovered){bs.set(Tools.max(0, i-minus), i+plus);}
+ found++;
+ }else if(kmaskFullyCovered){
+ bs.clear(Tools.max(0, i-minus), i+plus);
+ }
+ }
+ }
+
+ //Scan for short kmers at both ends. Unlike ktrim, this runs whenever useShortKmers is set, even if full-length hits were found.
+ if(useShortKmers){
+ assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
+
+ //Look for short kmers on left side
+ {
+ kmer=0;
+ rkmer=0;
+ len=0;
+ int len2=0;
+ final int lim=Tools.min(k, stop); //Only indices below k are examined
+ for(int i=start; i<lim; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=rkmer|(x2<<(2*len));
+ len++;
+ len2++;
+ if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+
+ if(len2>=minminlen){ //Entered from length mink-1, one base before lookups become possible
+ if(verbose){
+ System.err.println("Looking for left kmer "+AminoAcid.kmerToString(kmer, len));
+ System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len));
+ }
+ final int id;
+ if(len>=mink){
+ id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ }else{
+ id=-1; //Length mink-1 is never looked up; it counts as a miss
+ }
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("b: Found "+kmer);
+ System.err.println("Setting "+0+", "+(i+plus));
+ }
+ if(!kmaskFullyCovered){bs.set(0, i+plus);}
+ found++;
+ }else if(kmaskFullyCovered){
+ bs.clear(0, i+plus);
+ }
+ }
+ }
+ }
+
+ //Look for short kmers on right side
+ {
+ kmer=0;
+ rkmer=0;
+ len=0;
+ int len2=0;
+ final int lim=Tools.max(-1, stop-k); //Only the last k indices before stop are examined
+ for(int i=stop-1; i>lim; i--){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=kmer|(x<<(2*len));
+ rkmer=((rkmer<<2)|x2)&mask;
+ len++;
+ len2++;
+ if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+
+ if(len2>=minminlen){
+ if(verbose){
+ System.err.println("Looking for right kmer "+
+ AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
+ }
+ final int id;
+ if(len>=mink){
+ id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets);
+ }else{
+ id=-1;
+ }
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("c: Found "+kmer);
+ System.err.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length);
+ }
+ if(!kmaskFullyCovered){bs.set(Tools.max(0, i-trimPad), bases.length);}
+ found++;
+ }else if(kmaskFullyCovered){
+ bs.clear(Tools.max(0, i-trimPad), bases.length);
+ }
+ }
+ }
+ }
+ }
+
+
+ if(verbose){System.err.println("found="+found+", bitset="+bs);}
+
+ if(found==0){return 0;}
+ assert(found>0) : "Overflow in 'found' variable.";
+
+ {//Increment counter for the scaffold whose kmer was first detected
+ if(scaffoldReadCountsT!=null){
+ scaffoldReadCountsT[id0]++;
+ scaffoldBaseCountsT[id0]+=bases.length;
+ }else{
+ scaffoldReadCounts.addAndGet(id0, 1);
+ scaffoldBaseCounts.addAndGet(id0, bases.length);
+ }
+ }
+// int y=r.countNocalls();
+ int cardinality=bs.cardinality(); //Counts every set bit, including any set beyond bases.length by trimPad
+// assert(cardinality>0);
+
+ //Replace kmer hit zone with the trim symbol
+ for(int i=0; i<bases.length; i++){
+ if(bs.get(i)){
+ if(kmaskLowercase){
+ bases[i]=(byte)Character.toLowerCase(bases[i]);
+ }else{
+ bases[i]=trimSymbol;
+ if(quals!=null && trimSymbol=='N'){quals[i]=0;}
+ }
+ }
+ }
+// assert(cardinality==r.countNocalls() || y>0) : cardinality+", "+r.countNocalls()+"\n"+r.length()+"\n"+bs+"\n"+r;//123
+ return cardinality;
+ }
+
+ /**
+ * @param r
+ * @param idList
+ * @param countList
+ */
+ private void rename(Read r, IntList idList, IntList countList) {
+ if(r==null || idList.size<1){return;}
+ StringBuilder sb=new StringBuilder();
+ if(r.id==null){sb.append(r.numericID);}
+ else{sb.append(r.id);}
+ for(int i=0; i<idList.size; i++){
+ int id=idList.get(i);
+ int count=countList.get(i);
+ sb.append('\t');
+ sb.append(scaffoldNames.get(id));
+ sb.append('=');
+ sb.append(count);
+ }
+ r.id=sb.toString();
+ }
+
+ /**
+ * Pack a list of counts from an array to an IntList.
+ * For each value p in packed, loose[p] is appended to counts and then reset to 0,
+ * so counts ends up parallel to packed and the touched slots of loose are cleared.
+ * @param loose Counter array
+ * @param packed Unique values
+ * @param counts Counts of values; emptied before being filled
+ * @return Largest count that was packed, or 0 if packed is empty
+ */
+ private int condenseLoose(int[] loose, IntList packed, IntList counts){
+ counts.size=0;
+
+ //An empty packed list skips the loop entirely and yields 0
+ int largest=0;
+ int idx=0;
+ while(idx<packed.size){
+ final int slot=packed.get(idx);
+ final int tally=loose[slot];
+ loose[slot]=0;
+ counts.add(tally);
+ largest=Tools.max(largest, tally);
+ idx++;
+ }
+ return largest;
+ }
+
+ /** Returns the larger of the two reads' expectedErrors(false, -1) values; a null read counts as 0. */
+ private float expectedErrors(Read r1, Read r2){
+ float first=0;
+ if(r1!=null){first=r1.expectedErrors(false, -1);}
+ float second=0;
+ if(r2!=null){second=r2.expectedErrors(false, -1);}
+ return Tools.max(first, second);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Entropy Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Returns the average entropy over all windows for the read.
+ * A window of the given length slides over the read one base at a time. counts[kmer]
+ * holds how often each kmer occurs in the current window and countCounts[c] how many
+ * kmers occur exactly c times; both are updated incrementally as a kmer enters at the
+ * leading edge and another leaves at the trailing edge. calcEntropy is evaluated on
+ * countCounts for each window position and the results are averaged.
+ * Bases for which AminoAcid.isFullyDefined is false are treated as 'A'.
+ * On entry countCounts[0] must equal window (asserted); with verify set, counts and the
+ * rest of countCounts are also asserted to be zero.
+ * @param bases Read sequence
+ * @param k Kmer length
+ * @param window Window length.
+ * @param counts An array of counts indexed by kmer
+ * @param countCounts An array of counts of counts indexed by count
+ * @param kmerspace Size of the kmer space (2^k per the original note); only passed on to calcEntropy here
+ * @param verify Check consistency of data structures (slow)
+ * @return Entropy averaged over the measured windows; 0 if no window was measured
+ */
+ private float averageEntropy(final byte[] bases, final int k,
+ final int window, final short[] counts, final short[] countCounts, final int kmerspace, boolean verify){
+ assert(k>0) : "k must be greater than 0";
+// Arrays.fill(counts, 0);
+
+ assert(countCounts[0]==window);
+ if(verify){
+ for(int c : counts){assert(c==0);}
+ for(int i=1; i<countCounts.length; i++){assert(countCounts[i]==0);}
+ }
+
+ final int mask=(k>15 ? -1 : ~((-1)<<(2*k))); //Low 2*k bits; all 32 bits when k exceeds 15
+ int current=0; //Number of distinct kmers in the window
+ //int ns=0;
+ int kmer=0, kmer2=0; //kmer follows the leading edge, kmer2 the trailing edge
+
+ double entropySum=0;
+ int entropyMeasurements=0;
+
+ for(int i=0, i2=-window; i2<bases.length; i++, i2++){ //i2 trails i by window; the loop runs until the trailing edge leaves the read
+
+// System.err.println("\nStart: i="+i+", current="+current+", ns="+ns+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts));
+
+ if(i<bases.length){ //Add the kmer ending at the leading edge
+ byte b=bases[i];
+ if(!AminoAcid.isFullyDefined(b)){
+// ns++;
+ b='A';
+ }
+ final int n=Dedupe.baseToNumber[b];
+ kmer=((kmer<<2)|n)&mask;
+
+ if(counts[kmer]<1){
+ assert(counts[kmer]==0);
+ current++;
+ }
+ countCounts[counts[kmer]]--; //The kmer leaves its old occupancy bucket...
+ assert(countCounts[counts[kmer]]>=-1): i+", "+current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
+ counts[kmer]++;
+ assert(counts[kmer]<=window+1) : Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
+ countCounts[counts[kmer]]++; //...and joins the next one up
+ if(verify){
+ assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
+ assert(Tools.sum(countCounts)>0 && (Tools.sum(countCounts)<=window+1)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+// System.err.println("Added "+kmer+"; counts["+kmer+"]="+counts[kmer]);
+ }
+
+ if(i2>=0){ //Remove the kmer ending at the trailing edge
+ byte b2=bases[i2];
+ if(!AminoAcid.isFullyDefined(b2)){
+// ns--;
+ b2='A';
+ }
+ final int n2=Dedupe.baseToNumber[b2];
+ kmer2=((kmer2<<2)|n2)&mask;
+
+ countCounts[counts[kmer2]]--;
+ assert(countCounts[counts[kmer2]]>=0);
+ counts[kmer2]--;
+ countCounts[counts[kmer2]]++;
+ if(counts[kmer2]<1){
+ assert(counts[kmer2]==0) : Arrays.toString(counts);
+ current--;
+ }
+ if(verify){
+ assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
+ assert(Tools.sum(countCounts)>=0 && (Tools.sum(countCounts)<=window)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+// System.err.println("Removed "+kmer2+"; count="+counts[kmer2]);
+ }
+
+ if(verify && i2>-1 && i<bases.length){
+ assert(Tools.sum(counts)==window);
+ assert(Tools.sum(countCounts)==window): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+ if(i2>=-1 && i<bases.length){ //Measure only while a full window lies inside the read (i at least window-1)
+ float e=calcEntropy(countCounts, window, kmerspace);
+ entropySum+=e;
+ entropyMeasurements++;
+ }
+ }
+
+// System.err.println(" *** ");
+// System.err.println(entropySum+", "+entropyMeasurements+", "+(entropySum/(Tools.max(1, entropyMeasurements))));
+// System.err.println(window+", "+k+", "+kmerspace+", "+counts.length+", "+countCounts.length);
+// System.err.println(" *** ");
+
+ return (float)(entropySum/(Tools.max(1, entropyMeasurements))); //Dividing by at least 1 avoids 0/0 when nothing was measured
+ }
+
+ /**
+ * Calculate the entropy of the countCounts for a particular window.
+ * @param countCounts element i holds the number of unique kmers occurring i times in the window.
+ * @param window Window length.
+ * @param kmerspace 2^k
+ * @return Entropy
+ */
+ private float calcEntropy(short[] countCounts, int window, int kmerspace){
+ double sum=0;
+ for(int i=1; i<countCounts.length; i++){
+ int cc=countCounts[i];
+ double pklogpk=entropy[i];
+ sum+=(cc*pklogpk);
+ }
+// System.err.println("sum = "+sum);
+// System.err.println("entropy = "+(sum*entropyMult));
+ return (float)(sum*entropyMult);
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Input read stream; read lists are fetched with nextList() and handed back with returnList(). */
+ private final ConcurrentReadInputStream cris;
+ /** Output read streams: ros receives the reads that were kept, rosb the reads that were removed,
+ * and ross the reads whose mate was discarded. Any of them may be null. */
+ private final ConcurrentReadOutputStream ros, rosb, ross;
+
+ private final ReadStats readstats;
+ private final int[] overlapVector;
+ private final int[] countArray; //Per-id hit tallies for findBestMatch; zeroed again by condenseLoose
+
+ private final IntList idList; //Ids touched while processing the current read in findBestMatch
+ private final IntList countList; //Tallies parallel to idList, filled by condenseLoose
+
+ //These "*T" fields are used to store counts on a per-thread basis.
+
+ long[] hitCountsT; //Histogram of hits per read, indexed by min(hits, HITCOUNT_LEN); may be null
+ long[] scaffoldReadCountsT; //Reads credited per scaffold id; when null the shared scaffoldReadCounts is used
+ long[] scaffoldBaseCountsT; //Bases credited per scaffold id; used together with scaffoldReadCountsT
+
+ final short[] entropyCounts;
+ final short[] entropyCountCounts;
+
+ private float[] aprob, bprob;
+
+ private long readsInT=0;
+ private long basesInT=0;
+ private long readsOutuT=0; //Reads that were not removed
+ private long basesOutuT=0; //Bases of reads that were not removed
+
+ private long readsOutmT=0; //Reads that were removed
+ private long basesOutmT=0; //Bases of reads that were removed
+
+ private final long maxBasesOutmT; //When non-negative, processing stops once basesOutmT reaches it
+ private final long maxBasesOutuT; //When non-negative, processing stops once basesOutuT reaches it
+
+ private long readsQTrimmedT=0;
+ private long basesQTrimmedT=0;
+ private long readsFTrimmedT=0;
+ private long basesFTrimmedT=0;
+ private long readsQFilteredT=0;
+ private long basesQFilteredT=0;
+ private long readsEFilteredT=0;
+ private long basesEFilteredT=0;
+
+ private long readsKTrimmedT=0;
+ private long basesKTrimmedT=0;
+ private long readsKFilteredT=0;
+ private long basesKFilteredT=0;
+
+ private long readsTrimmedByOverlapT=0;
+ private long basesTrimmedByOverlapT=0;
+
+ private long badGcBasesT=0;
+ private long badGcReadsT=0;
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Object holding a String and numbers, for tracking the number of read and base hits per sequence.
+ */
+ private static class StringNum implements Comparable<StringNum>{
+
+ public StringNum(String name_, int len_, long reads_, long bases_){
+ name=name_;
+ length=len_;
+ reads=reads_;
+ bases=bases_;
+ }
+ public final int compareTo(StringNum o){
+ if(bases!=o.bases){return o.bases>bases ? 1 : -1;}
+ if(reads!=o.reads){return o.reads>reads ? 1 : -1;}
+ return name.compareTo(o.name);
+ }
+ public final boolean equals(StringNum o){
+ return compareTo(o)==0;
+ }
+ public final String toString(){
+ return name+"\t"+length+"\t"+reads+"\t"+bases;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Sequence name */
+ public final String name;
+ /** Sequence length */
+ public final int length;
+ /** Number of reads or bases mapped to this sequence */
+ public final long reads, bases;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Free memory currently reported by the JVM, i.e. the value of Runtime.freeMemory() */
+ private static final long freeMemory(){
+ return Runtime.getRuntime().freeMemory();
+ }
+
+ /**
+ * Transforms a kmer into a canonical value stored in the table. Expected to be inlined.
+ * The larger of kmer and rkmer is used when rcomp is set, otherwise kmer itself;
+ * the result is ANDed with middleMask and ORed with lengthMask.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param lengthMask Bitmask with single '1' set to left of kmer
+ * @return Canonical value
+ */
+ private final long toValue(long kmer, long rkmer, long lengthMask){
+ assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+ final long canonical;
+ if(rcomp){
+ canonical=Tools.max(kmer, rkmer);
+ }else{
+ canonical=kmer;
+ }
+ final long masked=canonical&middleMask;
+ return masked|lengthMask;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** For calculating kmer cardinality */
+ private final LogLog loglog;
+
+ /** Has this class encountered errors while processing? */
+ public boolean errorState=false;
+
+ /** Fraction of available memory preallocated to arrays */
+ private double preallocFraction=1.0;
+ /** Initial size of data structures */
+ private int initialSize=-1;
+
+ /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
+ private final AbstractKmerTable[] keySets;
+ /** A scaffold's name is stored at scaffoldNames.get(id).
+ * scaffoldNames[0] is reserved, so the first id is 1. */
+ private final ArrayList<String> scaffoldNames=new ArrayList<String>();
+ /** Names of reference files (refNames[0] is valid). */
+ private final ArrayList<String> refNames=new ArrayList<String>();
+ /** Number of scaffolds per reference. */
+ private final int[] refScafCounts;
+ /** scaffoldReadCounts[id] stores the number of reads with kmer matches to that scaffold */
+ private AtomicLongArray scaffoldReadCounts;
+ /** scaffoldBaseCounts[id] stores the number of bases with kmer matches to that scaffold */
+ private AtomicLongArray scaffoldBaseCounts;
+ /** Set to false to force threads to share atomic counter arrays. */
+ private boolean ALLOW_LOCAL_ARRAYS=true;
+ /** scaffoldLengths[id] stores the length of that scaffold */
+ private IntList scaffoldLengths=new IntList();
+ /** hitCounts[x] stores the number of reads with exactly x kmer matches */
+ private long[] hitCounts;
+ /** Array of reference files from which to load kmers */
+ private String[] ref=null;
+ /** Array of literal strings from which to load kmers */
+ private String[] literal=null;
+
+ /** Input reads */
+ private String in1=null, in2=null;
+ /** Input qual files */
+ private String qfin1=null, qfin2=null;
+ /** Output reads (unmatched and at least minlen) */
+ private String out1=null, out2=null;
+ /** Output reads (matched or shorter than minlen) */
+ private String outb1=null, outb2=null;
+ /** Output reads whose mate was discarded */
+ private String outsingle=null;
+ /** Statistics output files */
+ private String outstats=null, outrqc=null, outrpkm=null, outrefstats=null;
+ /** duk-style statistics */
+ @Deprecated
+ private String outduk=null;
+
+ /** Optional file for quality score recalibration */
+ private String samFile=null;
+
+ /** Dump kmers here. */
+ private String dump=null;
+
+ /** Quit after this many bases written to outm */
+ private long maxBasesOutm=-1;
+ /** Quit after this many bases written to outu */
+ private long maxBasesOutu=-1;
+
+ /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */
+ private long maxReads=-1;
+ /** Process this fraction of input reads. */
+ private float samplerate=1f;
+ /** Set samplerate seed to this value. */
+ private long sampleseed=-1;
+
+ /** Output reads in input order. May reduce speed. */
+ private final boolean ORDERED;
+ /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */
+ private boolean useShortKmers=false;
+ /** Make the middle base in a kmer a wildcard to improve sensitivity */
+ private boolean maskMiddle=true;
+
+ /** Store reference kmers with up to this many substitutions */
+ private int hammingDistance=0;
+ /** Search for query kmers with up to this many substitutions */
+ private int qHammingDistance=0;
+ /** Store reference kmers with up to this many edits (including indels) */
+ private int editDistance=0;
+ /** Store short reference kmers with up to this many substitutions */
+ private int hammingDistance2=-1;
+ /** Search for short query kmers with up to this many substitutions */
+ private int qHammingDistance2=-1;
+ /** Store short reference kmers with up to this many edits (including indels) */
+ private int editDistance2=-1;
+ /** Never skip more than this many consecutive kmers when hashing reference. */
+ private int maxSkip=1;
+ /** Always skip at least this many consecutive kmers when hashing reference.
+ * 1 means every kmer is used, 2 means every other, etc. */
+ private int minSkip=1;
+
+ /** Trim this much extra around matched kmers */
+ private int trimPad;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Entropy Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Kmer length for entropy calculation */
+ private int entropyK=5;
+ /** Window length for entropy calculation */
+ private int entropyWindow=50;
+ /** Minimum entropy to be considered "complex", on a scale of 0-1 */
+ private float entropyCutoff=-1;
+ /** Verify consistency of related data structures (slow) */
+ private boolean verifyEntropy=false;
+
+ /** Perform entropy calculation */
+ private final boolean calcEntropy;
+ /** Number of possible unique kmers */
+ private final int entropyKmerspace;
+ /** A precalculated constant */
+ private final double entropyMult;
+ /** Array of precalculated constants */
+ private final double[] entropy;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Statistics ----------------*/
+ /*--------------------------------------------------------------*/
+
+ long readsIn=0;
+ long basesIn=0;
+ long readsOut=0;
+ long basesOut=0;
+
+ long readsQTrimmed=0;
+ long basesQTrimmed=0;
+ long readsFTrimmed=0;
+ long basesFTrimmed=0;
+ long readsQFiltered=0;
+ long basesQFiltered=0;
+ long readsEFiltered=0;
+ long basesEFiltered=0;
+
+ long readsKTrimmed=0;
+ long basesKTrimmed=0;
+ long readsKFiltered=0;
+ long basesKFiltered=0;
+
+ long badGcReads;
+ long badGcBases;
+
+ long readsTrimmedByOverlap;
+ long basesTrimmedByOverlap;
+
+ long refReads=0;
+ long refBases=0;
+ long refKmers=0;
+
+ public long modsum=0; //123
+
+ long storedKmers=0;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Don't look for kmers in read 1 */
+ private final boolean skipR1;
+ /** Don't look for kmers in read 2 */
+ private final boolean skipR2;
+ /** Correct errors via read overlap */
+ private final boolean ecc;
+
+ /** Look for reverse-complements as well as forward kmers. Default: true */
+ private final boolean rcomp;
+ /** Don't allow a read 'N' to match a reference 'A'.
+ * Reduces sensitivity when hdist>0 or edist>0. Default: false. */
+ private final boolean forbidNs;
+ /** AND bitmask with 0's at the middle base */
+ private final long middleMask;
+ /** Use HashForest data structure */
+ private final boolean useForest;
+ /** Use KmerTable data structure */
+ private final boolean useTable;
+ /** Use HashArray data structure (default) */
+ private final boolean useArray;
+
+ /** Normal kmer length */
+ private final int k;
+ /** k-1; used in some expressions */
+ private final int k2;
+ /** Emulated kmer greater than k */
+ private final int kbig;
+ /** Effective kmer size */
+ private final int keff;
+ /** Shortest kmer to use for trimming */
+ private final int mink;
+ /** A read may contain up to this many kmers before being considered a match. Default: 0 */
+ private final int maxBadKmers0;
+ /** A read must share at least this fraction of its kmers to be considered a match. Default: 0 */
+ private final float minKmerFraction;
+ /** Reference kmers must cover at least this fraction of read bases to be considered a match. Default: 0 */
+ private final float minCoveredFraction;
+
+ /** Recalibrate quality scores using matrices */
+ private final boolean recalibrateQuality;
+ /** Quality-trim the left side */
+ private final boolean qtrimLeft;
+ /** Quality-trim the right side */
+ private final boolean qtrimRight;
+ /** Trim bases at this quality or below. Default: 4 */
+ private final byte trimq;
+ /** Throw away reads below this average quality before trimming. Default: 0 */
+ private final byte minAvgQuality;
+ /** If positive, calculate average quality from the first X bases only. Default: 0 */
+ private final int minAvgQualityBases;
+ /** Throw away reads failing chastity filter (:Y: in read header) */
+ private final boolean chastityFilter;
+ /** Crash if a barcode is encountered that contains Ns or is not in the table */
+ private final boolean failBadBarcodes;
+ /** Remove reads with Ns in barcodes or that are not in the table */
+ private final boolean removeBadBarcodes;
+ /** Fail reads missing a barcode */
+ private final boolean failIfNoBarcode;
+ /** A set of valid barcodes; null if unused */
+ private final HashSet<String> barcodes;
+ /** Throw away reads containing more than this many Ns. Default: -1 (disabled) */
+ private final int maxNs;
+ /** Throw away reads that do not contain at least this many consecutive called bases. */
+ private int minConsecutiveBases=0;
+ /** Throw away reads containing fewer than this fraction of any particular base. */
+ private final float minBaseFrequency;
+ /** Throw away reads shorter than this after trimming. Default: 10 */
+ private final int minReadLength;
+ /** Throw away reads longer than this after trimming. Default: Integer.MAX_VALUE */
+ private final int maxReadLength;
+ /** Toss reads shorter than this fraction of initial length, after trimming */
+ private final float minLenFraction;
+ /** Filter reads by whether or not they have matching kmers */
+ private final boolean kfilter;
+ /** Trim matching kmers and all bases to the left */
+ private final boolean ktrimLeft;
+ /** Trim matching kmers and all bases to the right */
+ private final boolean ktrimRight;
+ /** Don't trim, but replace matching kmers with a symbol (default N) */
+ private final boolean ktrimN;
+ /** Exclude kmer itself when ktrimming */
+ private final boolean ktrimExclusive;
+ /** Replace bases covered by matched kmers with this symbol */
+ private final byte trimSymbol;
+ /** Convert masked bases to lowercase */
+ private final boolean kmaskLowercase;
+ /** Only mask fully-covered bases **/
+ private final boolean kmaskFullyCovered;
+ /** Output over-trimmed reads to outbad (outmatch). If false, they are discarded. */
+ private final boolean addTrimmedToBad;
+ /** Find the sequence that shares the most kmer matches when filtering. */
+ private final boolean findBestMatch;
+ /** Trim pairs to the same length, when adapter-trimming */
+ private final boolean trimPairsEvenly;
+ /** Trim left bases of the read to this position (exclusive, 0-based) */
+ private final int forceTrimLeft;
+ /** Trim right bases of the read after this position (exclusive, 0-based) */
+ private final int forceTrimRight;
+ /** Trim this many rightmost bases of the read */
+ private final int forceTrimRight2;
+ /** Trim right bases of the read modulo this value.
+ * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */
+ private final int forceTrimModulo;
+
+ /** Discard reads with GC below this. */
+ private final float minGC;
+ /** Discard reads with GC above this. */
+ private final float maxGC;
+ /** Discard reads outside of GC bounds. */
+ private final boolean filterGC;
+
+ /** If positive, only look for kmer matches in the leftmost X bases */
+ private int restrictLeft;
+ /** If positive, only look for kmer matches in the rightmost X bases */
+ private int restrictRight;
+
+ /** True iff java was launched with the '-ea' flag */
+ private final boolean EA;
+ /** Skip this many initial input reads */
+ private final long skipreads;
+
+ /** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad.
+ * Default: true. */
+ private final boolean removePairsIfEitherBad;
+
+ /** Print only statistics for scaffolds that matched at least one read.
+ * Default: true. */
+ private final boolean printNonZeroOnly;
+
+ /** Rename reads to indicate what they matched.
+ * Default: false. */
+ private final boolean rename;
+ /** Use names of reference files instead of scaffolds.
+ * Default: false. */
+ private final boolean useRefNames;
+
+ /** Fraction of kmers to skip, 0 to 15 out of 16 */
+ private final int speed;
+
+ /** Skip this many kmers when examining the read. Default 1.
+ * 1 means every kmer is used, 2 means every other, etc. */
+ private final int qSkip;
+
+ /** noAccel is true if speed and qSkip are disabled, accel is the opposite. */
+ private final boolean noAccel, accel;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- BBMerge Flags ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Trim implied adapters based on overlap, for reads with insert size shorter than read length */
+ private final boolean trimByOverlap;
+ private final boolean useQualityForOverlap;
+ private final boolean strictOverlap;
+
+ private int minOverlap0=7;
+ private int minOverlap=14;
+ private int minInsert0=16;
+ private int minInsert=40;
+
+ private final float maxRatio;
+ private final float ratioMargin;
+ private final float ratioOffset;
+ private final float efilterRatio;
+ private final float efilterOffset;
+ private final float pfilterRatio;
+ private final float meeFilter;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Histogram Flags ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final boolean MAKE_QUALITY_ACCURACY;
+ private final boolean MAKE_QUALITY_HISTOGRAM;
+ private final boolean MAKE_MATCH_HISTOGRAM;
+ private final boolean MAKE_BASE_HISTOGRAM;
+
+ private final boolean MAKE_EHIST;
+ private final boolean MAKE_INDELHIST;
+ private final boolean MAKE_LHIST;
+ private final boolean MAKE_GCHIST;
+ private final boolean MAKE_IDHIST;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Number of tables (and threads, during loading) */
+ private static final int WAYS=7; //123
+ /** Default initial size of data structures */
+ private static final int initialSizeDefault=128000;
+ /** Verbose messages */
+ public static final boolean verbose=false; //123
+
+ /** Print messages to this stream */
+ private static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=true;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Print speed statistics upon completion */
+ public static boolean showSpeed=true;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Number of ProcessThreads */
+ public static int THREADS=Shared.threads();
+ /** Indicates end of input stream */
+ private static final ArrayList<Read> POISON=new ArrayList<Read>(0);
+ /** Number of columns for statistics output, 3 or 5 */
+ public static int STATS_COLUMNS=3;
+ /** Release memory used by kmer storage after processing reads */
+ public static boolean RELEASE_TABLES=true;
+ /** Max value of hitCount array */
+ public static final int HITCOUNT_LEN=1000;
+ /** Make unambiguous copies of ref sequences with ambiguous bases */
+ public static boolean REPLICATE_AMBIGUOUS=false;
+
+ /** x&clearMasks[i] will clear base i */
+ private static final long[] clearMasks;
+ /** x|setMasks[j][i] will set base i to j (first index is the 2-bit base value, second is the position) */
+ private static final long[][] setMasks;
+ /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
+ private static final long[] leftMasks;
+ /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
+ private static final long[] rightMasks;
+ /** x|lengthMasks[i] will set the bit to the left of the leftmost base */
+ private static final long[] lengthMasks;
+
+ public static HashMap<String,String> RQC_MAP=null;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Initializers ----------------*/
+ /*--------------------------------------------------------------*/
+
+ static{
+ clearMasks=new long[32];
+ leftMasks=new long[32];
+ rightMasks=new long[32];
+ lengthMasks=new long[32];
+ setMasks=new long[4][32];
+ for(int i=0; i<32; i++){
+ clearMasks[i]=~(3L<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ leftMasks[i]=((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ rightMasks[i]=~((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ lengthMasks[i]=((1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ for(long j=0; j<4; j++){
+ setMasks[(int)j][i]=(j<<(2*i));
+ }
+ }
+ }
+
+}
diff --git a/current/jgi/BBMask.java b/current/jgi/BBMask.java
new file mode 100755
index 0000000..5ed8a96
--- /dev/null
+++ b/current/jgi/BBMask.java
@@ -0,0 +1,1396 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SamLine;
+import align2.IntList;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.CoverageArray;
+import dna.CoverageArray2;
+import dna.CoverageArray3;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * Masks a fasta file by inserting 'N' in place of low-complexity short repeats,
+ * and anything covered by mapped reads in a sam file.
+ *
+ * @author Brian Bushnell
+ * @date Feb 18, 2014
+ *
+ */
+public class BBMask{
+
+ /**
+ * Program entry point. The timer is created before the BBMask instance is constructed.
+ * @param args Command-line arguments, passed unchanged to the constructor
+ */
+ public static void main(String[] args){
+ final Timer timer=new Timer();
+ final BBMask instance=new BBMask(args);
+ instance.process(timer);
+ }
+
+ /**
+ * Parses key=value command-line arguments, applies them to instance and static settings,
+ * validates the input and output paths, and precomputes the entropy lookup table.
+ * @param args Command-line arguments; if null or empty, usage is printed and the JVM exits.
+ * @throws RuntimeException if no input reference was given or the output file cannot be written
+ */
+ public BBMask(String[] args){
+
+ if(args==null || args.length==0){
+ printOptions();
+ System.exit(0);
+ }
+
+ //When sequence output is sent to stdout, log messages go to stderr.
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=16;
+ ReadWrite.ZIP_THREAD_DIVISOR=1;
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+
+ Parser parser=new Parser();
+ //True once the user sets entropymode explicitly; 'ratio' and 'entropy' then stop changing the mode implicitly.
+ boolean setEntropyMode=false;
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+ // align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("reads") || a.equals("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("t") || a.equals("threads")){
+ Shared.setThreads(b);
+ }else if(a.equals("sampad") || a.equals("sampadding") || a.equals("sp")){
+ samPad=Integer.parseInt(b);
+ }else if(a.equals("entropymode")){
+ entropyMode=Tools.parseBoolean(b);
+ setEntropyMode=true;
+ }else if(a.equals("maskrepeats") || a.equals("mr")){
+ processRepeats=Tools.parseBoolean(b);
+ }else if(a.equals("masklowentropy") || a.equals("masklowcomplexity") || a.equals("mlc") || a.equals("mle") || a.equals("me")){
+ processEntropy=Tools.parseBoolean(b);
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1") || a.equals("ref")){
+ inRef=b;
+ }else if(a.equals("insam") || a.equals("samin") || a.equals("sam")){
+ inSam=(b==null || b.equalsIgnoreCase("null")) ? null : b.split(",");
+ }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+ outRef=b;
+ }else if(a.equals("qfin") || a.equals("qfin1")){
+ qfinRef=b;
+ }else if(a.equals("qfout") || a.equals("qfout1")){
+ qfoutRef=b;
+ }else if(a.equals("extin")){
+ extinRef=b;
+ }else if(a.equals("extout")){
+ extoutRef=b;
+ }else if(a.equals("split")){
+ splitMode=Tools.parseBoolean(b);
+ }else if(a.equals("mink") || a.equals("kmin")){
+ mink=mink2=Integer.parseInt(b);
+ }else if(a.equals("maxk") || a.equals("kmax")){
+ maxk=maxk2=Integer.parseInt(b);
+ }else if(a.equals("k")){
+ mink=maxk=mink2=maxk2=Integer.parseInt(b);
+ }else if(a.equals("minkr") || a.equals("krmin")){
+ mink=Integer.parseInt(b);
+ }else if(a.equals("maxkr") || a.equals("krmax")){
+ maxk=Integer.parseInt(b);
+ }else if(a.equals("kr")){
+ mink=maxk=Integer.parseInt(b);
+ }else if(a.equals("mink2") || a.equals("kmin2") || a.equals("minke") || a.equals("kemin")){
+ mink2=Integer.parseInt(b);
+ }else if(a.equals("maxk2") || a.equals("kmax2") || a.equals("maxke") || a.equals("kemax")){
+ maxk2=Integer.parseInt(b);
+ }else if(a.equals("k2") || a.equals("ke")){
+ mink2=maxk2=Integer.parseInt(b);
+ }else if(a.equals("mincov")){
+ mincov=Integer.parseInt(b);
+ }else if(a.equals("maxcov")){
+ maxcov=Integer.parseInt(b);
+ }else if(a.equals("delcov") || a.equals("delcoverage")){
+ includeDeletionCoverage=Tools.parseBoolean(b);
+ }else if(a.equals("window") || a.equals("w")){
+ window=Integer.parseInt(b);
+ }else if(a.equals("ratio")){
+ ratio=Float.parseFloat(b);
+ if(!setEntropyMode){entropyMode=false;}
+ }else if(a.equals("entropy") || a.equals("e")){
+ entropyCutoff=Float.parseFloat(b);
+ if(!setEntropyMode){entropyMode=true;}
+ }else if(a.equals("lowercase") || a.equals("lc")){
+ MaskByLowercase=Tools.parseBoolean(b);
+ }else if(a.equals("minlen")){
+ minlen=Integer.parseInt(b);
+ }else if(a.equals("mincount")){
+ mincount=Integer.parseInt(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){
+ stream.FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
+ }
+ else if(inRef==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument naming stdin or an existing file is taken as the input reference.
+ inRef=arg;
+ }
+// else if(outRef==null && i==1 && !arg.contains("=")){
+// outRef=arg;
+// }
+ else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ //32-bit coverage is selected when either threshold reaches Character.MAX_VALUE.
+ bits32=(mincov>=Character.MAX_VALUE || maxcov>=Character.MAX_VALUE);
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(inRef==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(outRef!=null && outRef.equalsIgnoreCase("null")){outRef=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, outRef)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+outRef+"\n");
+ }
+
+ FASTQ.PARSE_CUSTOM=parsecustom;
+
+ ffoutRef=FileFormat.testOutput(outRef, FileFormat.FASTA, extoutRef, true, overwrite, append, false);
+
+ ffinRef=FileFormat.testInput(inRef, FileFormat.FASTA, extinRef, true, true);
+
+ if(inSam!=null && inSam.length>0){
+ ffinSam=new FileFormat[inSam.length];
+ for(int i=0; i<inSam.length; i++){
+ ffinSam[i]=FileFormat.testInput(inSam[i], FileFormat.SAM, null, true, false);
+ }
+ }else{
+ ffinSam=null;
+ }
+
+ SamLine.CONVERT_CIGAR_TO_MATCH=false;
+
+
+ if(window>0){
+ //entropy[i] holds p*ln(p) for p=i/window; entropyMult is -1/ln(window).
+ entropy=new double[window+2];
+ double mult=1d/window;
+ for(int i=0; i<entropy.length; i++){
+ double pk=i*mult;
+ //p*ln(p) tends to 0 as p approaches 0; evaluating it directly at 0 gives 0*(-Infinity)=NaN.
+ entropy[i]=(pk>0 ? pk*Math.log(pk) : 0);
+ }
+ entropyMult=-1/Math.log(window);
+ }else{
+ entropy=null;
+ entropyMult=0;
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Runs the masking pipeline: loads the reference, optionally masks repeats,
+ * low-entropy/low-complexity regions and sam-covered regions, converts the recorded
+ * bitsets into masked (or split) sequences, writes the output, and prints statistics.
+ * @param t0 Timer for the whole run; it is stopped here just before the final summary.
+ * @throws RuntimeException if errorState is set once processing has finished
+ */
+ public void process(Timer t0){
+
+ Timer t=new Timer();
+ {
+ t.start();
+ outstream.println("Loading input");
+
+ //Interleaving flags are forced off while the reference is loaded, then restored.
+ boolean oldTI=FASTQ.TEST_INTERLEAVED;
+ boolean oldFI=FASTQ.FORCE_INTERLEAVED;
+ FASTQ.TEST_INTERLEAVED=false;
+ FASTQ.FORCE_INTERLEAVED=false;
+ map=hashRef();
+ FASTQ.TEST_INTERLEAVED=oldTI;
+ FASTQ.FORCE_INTERLEAVED=oldFI;
+ t.stop();
+
+ outstream.println("Loading Time: \t"+t);
+ }
+
+ long repeats=0, lowcomplexity=0;
+ long mapping=0;
+
+ if(processRepeats && maxk>0){
+ t.start();
+ outstream.println("\nMasking repeats (to disable, set 'mr=f')");
+// repeats=maskRepeats_ST();
+ repeats=maskRepeats();
+ t.stop();
+
+ double bpnano=refBases/(double)(t.elapsed);
+
+ outstream.println("Repeat Masking Time: \t"+t);
+ outstream.println("Ref Bases: "+padLeft12(refBases)+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ outstream.println("Repeat Bases Masked: "+padLeft12(repeats));
+ }
+
+ if(processEntropy && maxk2>0){
+ t.start();
+ if(entropyMode){
+ outstream.println("\nMasking low-entropy (to disable, set 'mle=f')");
+// lowcomplexity=maskLowEntropy_ST(null);
+ lowcomplexity=maskLowEntropy();
+ }else{
+ outstream.println("\nMasking low-complexity (to disable, set 'mlc=f')");
+ lowcomplexity=maskLowComplexity(null);
+ }
+ t.stop();
+
+ double bpnano=refBases/(double)(t.elapsed);
+
+ outstream.println("Low Complexity Masking Time: \t"+t);
+ outstream.println("Ref Bases: "+padLeft12(refBases)+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ outstream.println("Low Complexity Bases: "+padLeft12(lowcomplexity));
+ }
+
+ if(ffinSam!=null){
+ t.start();
+ outstream.println("\nMasking from sam");
+ mapping=maskSam();
+ t.stop();
+
+ double rpnano=samReads/(double)(t.elapsed);
+ double bpnano=samBases/(double)(t.elapsed);
+
+ outstream.println("Sam Masking Time: \t"+t);
+ outstream.println("Sam Reads Processed: "+padLeft12(samReads)+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Sam Bases Processed: "+padLeft12(samBases)+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ outstream.println("Sam Bases Masked: "+padLeft12(mapping));
+ }
+ long total=repeats+mapping+lowcomplexity, masked=0;
+
+ {
+ //Conversion always runs, even when nothing was flagged, because maskFromBitsets
+ //also rewrites non-ACGTN symbols when CONVERT_NON_ACGTN is set.
+ t.start();
+ if(splitMode){
+ masked=splitFromBitsets();
+ }else{
+ masked=maskFromBitsets(MaskByLowercase);
+ }
+ t.stop();
+ outstream.println("Conversion Time: \t"+t);
+ }
+
+ assert(total==masked) : repeats+", "+mapping+", "+lowcomplexity+", "+total+", "+masked;
+
+ if(outRef!=null){
+ t.start();
+ outstream.println("\nWriting output");
+ writeOutput();
+ t.stop();
+ outstream.println("Writing Time: \t"+t);
+ }
+ {
+ t0.stop();
+ //Avoid printing NaN when the reference contained no bases.
+ final double percent=(refBases>0 ? total*100.0/refBases : 0.0);
+ outstream.println("\nTotal Bases Masked: "+padLeft12(total)+"/"+refBases+String.format("\t%.3f%%", percent));
+ outstream.println("Total Time: \t"+t0);
+ }
+
+
+
+ if(errorState){
+ throw new RuntimeException("\nBBMask terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /** Formats value as decimal text, left-padded with spaces to at least 12 characters. */
+ private static String padLeft12(long value){
+ final String digits=Long.toString(value);
+ final StringBuilder sb=new StringBuilder(12);
+ for(int i=digits.length(); i<12; i++){sb.append(' ');}
+ return sb.append(digits).toString();
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private int setHighCoverage(BitSet bs, CoverageArray ca, int maxAllowedCoverage, int maxLen){
+ int numSet=0;
+ for(int i=0; i<maxLen; i++){
+ if(ca.get(i)>maxAllowedCoverage){
+ bs.set(i);
+ numSet++;
+ }
+ }
+ return numSet;
+ }
+
+ private int setLowCoverage(BitSet bs, CoverageArray ca, int minAllowedCoverage, int maxLen){
+ int numSet=0;
+ for(int i=0; i<maxLen; i++){
+ if(ca.get(i)<minAllowedCoverage){
+ bs.set(i);
+ numSet++;
+ }
+ }
+ return numSet;
+ }
+
+ private long maskFromBitsets(final boolean lowercase){
+ System.err.println("\nConverting masked bases to "+(lowercase ? "lower case" : "N")); //123
+ long sum=0;
+ if(!lowercase){
+ for(Read r : map.values()){
+// System.err.println(r.id); //123
+ BitSet bs=((BitSet)r.obj);
+ byte[] bases=r.bases;
+ for(int i=0; i<bases.length; i++){
+ if(bs.get(i)){
+ if(bases[i]!='N'){sum++;}
+ bases[i]='N';
+ }else if(CONVERT_NON_ACGTN && !AminoAcid.isACGTN(bases[i])){
+ bases[i]='N';
+ }
+ }
+ }
+ }else{
+ for(Read r : map.values()){
+ BitSet bs=((BitSet)r.obj);
+ byte[] bases=r.bases;
+ for(int i=0; i<bases.length; i++){
+ if(bs.get(i)){
+ if(!Character.isLowerCase(bases[i]) && bases[i]!='N'){sum++;}
+ bases[i]=(byte)Character.toLowerCase(bases[i]);
+ }else if(CONVERT_NON_ACGTN && !AminoAcid.isACGTN(bases[i])){
+ bases[i]='N';
+ }
+ }
+ }
+ }
+ System.err.println("Done Masking");
+ return sum;
+ }
+
+ private long splitFromBitsets(){
+ System.err.println("\nSplitting reads by removing masked bases"); //123
+ long sum=0;
+
+ LinkedHashMap<String, Read> map2=new LinkedHashMap<String, Read>();
+
+ for(String key : map.keySet()){
+ Read r=map.get(key);
+ BitSet bs=((BitSet)r.obj);
+ int rnum=0;
+ if(bs.isEmpty()){
+ map2.put(key, r);
+ }else{
+ byte[] bases=r.bases;
+ byte[] quals=r.quality;
+
+ int lastGood, lastBad;
+ if(bs.get(0)){
+ lastGood=-1;
+ lastBad=0;
+ }else{
+ lastGood=0;
+ lastBad=-1;
+ }
+
+ int i=1;
+ for(; i<bases.length; i++){
+ if(bs.get(i)){
+ if(lastGood==i-1){
+ int len=lastGood-lastBad;
+ if(len>0){
+ byte[] bases2=Arrays.copyOfRange(bases, lastBad+1, i);
+ byte[] quals2=(quals==null ? null : Arrays.copyOfRange(quals, lastBad+1, i));
+ Read r2=new Read(bases2, -1, -1, -1, r.id+"_"+rnum, quals2, r.numericID, r.flags);
+ Read old=map2.put(r2.id, r2);
+ assert(old==null) : "Duplicate id "+r2.id; //TODO: This can easily be resolved by making a new ID string.
+ }
+ }
+ lastBad=i;
+ }else{
+ lastGood=i;
+ }
+ }
+ if(lastGood==i-1){
+ int len=lastGood-lastBad;
+ if(len>0){
+ byte[] bases2=Arrays.copyOfRange(bases, lastBad+1, i);
+ byte[] quals2=(quals==null ? null : Arrays.copyOfRange(quals, lastBad+1, i));
+ Read r2=new Read(bases2, -1, -1, -1, r.id+"_"+rnum, quals2, r.numericID, r.flags);
+ Read old=map2.put(r2.id, r2);
+ assert(old==null) : "Duplicate id "+r2.id; //TODO: This can easily be resolved by making a new ID string.
+ }
+ }
+ }
+ }
+
+ map.clear();
+ map.putAll(map2);
+ map2.clear();
+
+ System.err.println("Done Splitting");
+ return sum;
+ }
+
+ /**
+ * Writes every read in map to the output stream described by ffoutRef, one read per list,
+ * numbered in iteration order. Does nothing when no output format was configured.
+ * Any error reported on close is recorded in errorState.
+ */
+ private void writeOutput(){
+
+ //Without an output format no stream can be opened; previously this fell through to a NullPointerException.
+ if(ffoutRef==null){return;}
+
+ final int buff=16;
+ final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffoutRef, null, qfoutRef, null, buff, null, false);
+ ros.start();
+
+ long i=0;
+ for(Read r : map.values()){
+ ArrayList<Read> list=new ArrayList<Read>(1);
+ list.add(r);
+ ros.add(list, i);
+ i++;
+ }
+ errorState|=ReadWrite.closeStream(ros);
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Masks the reference using every SAM input file.
+  * All files are processed before coverage thresholds are applied, so the coverage
+  * arrays stay available to each file and coverage accumulates across files.
+  * @return Number of newly masked bases (total mask cardinality after minus before)
+  */
+ private long maskSam(){
+     long before=0, after=0;
+     for(Read r : map.values()){
+         before+=((BitSet)r.obj).cardinality();
+     }
+     for(FileFormat ff : ffinSam){
+         //maskSam_ST(ff);
+         maskSam_MT(ff);
+     }
+     applyCoverageMasks();
+     for(Read r : map.values()){
+         after+=((BitSet)r.obj).cardinality();
+     }
+     return after-before;
+ }
+
+ /**
+  * Moves coverage array information to the BitSets.
+  * For each sequence in {@code map}, its coverage array is removed from {@code covmap}
+  * and passed to setHighCoverage (when maxcov is nonnegative) and setLowCoverage
+  * (when mincov is nonnegative).  Does nothing when coverage is not being tracked.
+  */
+ private void applyCoverageMasks(){
+     if(covmap==null){return;}
+     for(String rs : map.keySet()){
+         final Read r=map.get(rs);
+         final BitSet bs=(BitSet)r.obj;
+         final CoverageArray ca=covmap.remove(rs);
+         if(ca==null){continue;}//No coverage array left for this sequence; nothing to evaluate
+         if(maxcov>-1){
+             setHighCoverage(bs, ca, maxcov, r.length());
+         }
+         if(mincov>-1){
+             setLowCoverage(bs, ca, mincov, r.length());
+         }
+     }
+ }
+
+ /**
+  * Processes one SAM file with Shared.threads() worker threads that all pull from
+  * the same input stream, then closes the stream.
+  * @param ff SAM file to read
+  * @throws RuntimeException if errorState is set once the stream has been closed
+  */
+ private void maskSam_MT(FileFormat ff){
+
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff, null, null, null);
+     if(verbose){System.err.println("Started cris");}
+     cris.start();
+
+     MaskSamThread[] threads=new MaskSamThread[Shared.threads()];
+     for(int i=0; i<threads.length; i++){threads[i]=new MaskSamThread(cris);}
+     for(int i=0; i<threads.length; i++){threads[i].start();}
+     for(int i=0; i<threads.length; i++){
+         //Keep waiting until the worker has really finished, even if this thread is interrupted
+         while(threads[i].getState()!=Thread.State.TERMINATED){
+             try {
+                 threads[i].join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+
+     errorState|=ReadWrite.closeStreams(cris);
+
+     if(errorState){
+         throw new RuntimeException("BBMask terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Worker thread that runs maskSam(cris) on the input stream it was given. */
+ private class MaskSamThread extends Thread{
+
+     /** Stream this worker pulls read lists from */
+     final ConcurrentReadInputStream cris;
+
+     MaskSamThread(ConcurrentReadInputStream cris_){
+         this.cris=cris_;
+     }
+
+     @Override
+     public void run(){
+         maskSam(this.cris);
+     }
+
+ }
+
+ /**
+  * Single-threaded variant: reads one SAM file on the calling thread.
+  * @param ff SAM file to read
+  * @throws RuntimeException if errorState is set once the stream has been closed
+  */
+ private void maskSam_ST(FileFormat ff){
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff, null, null, null);
+     if(verbose){System.err.println("Started cris");}
+     cris.start();
+
+     maskSam(cris);
+
+     final boolean closeFailed=ReadWrite.closeStreams(cris);
+     errorState=(errorState | closeFailed);
+     if(!errorState){return;}
+     throw new RuntimeException("BBMask terminated in an error state; the output may be corrupt.");
+ }
+
+ /**
+  * Drains the given SAM stream.  Each mapped read is looked up by reference name in
+  * {@code map}; it then either sets mask bits directly (no coverage map) or increments
+  * the reference's coverage array.  Unknown reference names go to handleNoRef.
+  * Read and base totals are added to the instance counters under a lock at the end.
+  * @param cris Started input stream of SAM reads
+  */
+ private void maskSam(ConcurrentReadInputStream cris){
+
+     long readsSeen=0;
+     long basesSeen=0;
+     final IntList ranges=new IntList(16);
+
+     ListNum<Read> ln=cris.nextList();
+     while(ln!=null && ln.list!=null && ln.list.size()>0){
+         final ArrayList<Read> reads=ln.list;
+
+         for(int idx=0; idx<reads.size(); idx++){
+             final Read r=reads.get(idx);
+             assert(r.mate==null);
+
+             readsSeen++;
+             basesSeen+=r.length();
+             if(!r.mapped()){continue;}
+
+             final SamLine sl=(SamLine)r.obj;
+             assert(sl!=null) : "No sam line for read "+r;
+             final byte[] rname=sl.rname();
+             assert(rname!=null) : "No rname for sam line "+sl;
+             final String rs=new String(rname);
+             final Read ref=map.get(rs);
+             if(ref==null){
+                 handleNoRef(rs);
+                 continue;
+             }
+
+             final int reflen=ref.length();
+             assert(ref!=null) : "Could not find reference scaffold '"+rs+"' for samline \n"+sl+"\n in set \n"+map.keySet()+"\n";
+             if(covmap!=null){
+                 final CoverageArray ca=covmap.get(rs);
+                 increment(ca, sl, r.match, reflen, ranges, includeDeletionCoverage, samPad);
+             }else{
+                 final BitSet bs=(BitSet)ref.obj;
+                 mask(bs, sl, reflen);
+             }
+         }
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+     }
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+
+     //Fold this caller's local totals into the shared counters
+     synchronized(this){
+         this.samBases+=basesSeen;
+         this.samReads+=readsSeen;
+     }
+ }
+
+ /**
+  * Sets the mask bits spanned by one alignment, widened by samPad on each side and
+  * clamped to [0, reflen).
+  * @param bs Mask of the reference sequence; locked while it is modified
+  * @param sl Alignment supplying the start and stop positions
+  * @param reflen Length of the reference sequence
+  */
+ private void mask(BitSet bs, SamLine sl, int reflen){
+     final int from=Tools.max(0, sl.start(true, false)-samPad);
+     final int to=Tools.min(sl.stop(from, true, false)+1+samPad, reflen);
+     if(to<=from){return;}
+     synchronized(bs){//Potential bottleneck
+         bs.set(from, to);
+     }
+ }
+
+ /**
+  * Adds one alignment's covered ranges to a coverage array.
+  * The match string is taken from {@code match}, or derived from the cigar when
+  * {@code match} is null, and expanded to long format if it contains digits.
+  * fillRanges then converts it to ranges, which are optionally padded by samPad.
+  * @param ca Coverage array to increment; locked during the update
+  * @param sl Alignment
+  * @param match Match string of the read, or null to derive one from sl.cigar
+  * @param reflen Length of the reference sequence
+  * @param ranges Scratch list, cleared and refilled here
+  * @param includeDels Passed through to fillRanges
+  * @param samPad Amount by which each range is widened; 0 disables padding
+  */
+ public static void increment(CoverageArray ca, SamLine sl, byte[] match, int reflen, IntList ranges, boolean includeDels, int samPad){
+     final int start=Tools.max(0, sl.start(true, false));
+     final int stop=Tools.min(sl.stop(start, true, false)+1, reflen);
+     if(stop<=start){return;}
+
+     ranges.clear();
+     boolean shortFormat=false;
+     if(match!=null){
+         for(int i=0; i<match.length && !shortFormat; i++){
+             shortFormat=Character.isDigit(match[i]);
+         }
+     }else{
+         assert(sl.cigar!=null);
+         match=SamLine.cigarToShortMatch(sl.cigar, true);
+         shortFormat=true;
+     }
+     if(shortFormat){match=Read.toLongMatchString(match);}
+
+     fillRanges(match, start, stop, ranges, includeDels);
+     if(ranges.size<=0){return;}
+
+     if(samPad!=0){//Pad the ranges, but don't let them overlap
+         ranges.set(0, Tools.mid(0, reflen, ranges.get(0)-samPad));
+         ranges.set(1, Tools.mid(0, reflen, ranges.get(1)+samPad));
+         for(int i=2; i<ranges.size; i+=2){
+             final int prevEnd=ranges.get(i-1);
+             ranges.set(i, Tools.mid(prevEnd, reflen, ranges.get(i)-samPad));
+             ranges.set(i+1, Tools.mid(0, reflen, ranges.get(i+1)+samPad));
+         }
+     }
+     synchronized(ca){//Potential bottleneck
+         ca.incrementRanges(ranges, 1);
+     }
+ }
+
+ /**
+  * Converts a long-format match string into the reference ranges that count as covered.
+  * The match string is walked while tracking the reference position.  A run is a maximal
+  * stretch of symbols of one mode: match-like (m, s, S, N, B), insertion (I, X, Y),
+  * deletion (D) or clip (C).  When a run ends, a (start, end) pair is appended to
+  * {@code ranges} if the run was match-like, or if it was a deletion and includeDels
+  * is true.  The end value is the reference position just past the run's last symbol.
+  * The decision is made from the mode of the run that ENDED, so deletions followed by
+  * more alignment are emitted.
+  * @param longmatch Match string in long format (one symbol per position)
+  * @param start Reference position of the first symbol
+  * @param stop Expected reference position after the last symbol; checked by assertion only
+  * @param ranges Output list; must be empty on entry
+  * @param includeDels True to emit ranges for deletion runs
+  * @throws RuntimeException if the match string contains an unrecognized symbol
+  */
+ public static void fillRanges(byte[] longmatch, int start, int stop, IntList ranges, boolean includeDels){
+     assert(ranges.size==0);
+     byte mode='?', lastMode='?';
+     int rpos=start;
+     int lastRpos=start;
+     int rstart=start;
+     for(int mpos=0; mpos<longmatch.length; mpos++){
+         final byte m=longmatch[mpos];
+         if(m=='m' || m=='s' || m=='S' || m=='N' || m=='B'){//Little 's' is for a match classified as a sub to improve the affine score.
+             mode='m';
+             rpos++;
+         }else if(m=='I' || m=='X' || m=='Y'){
+             mode='I';//Does not advance the reference position
+         }else if(m=='D'){
+             mode='D';
+             rpos++;
+         }else if(m=='C'){
+             mode='C';
+             rpos++;
+         }else{
+             throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii). " +
+                     "Match string should be in long format here.");
+         }
+         if(mode!=lastMode){
+             //The previous run just ended at lastRpos; emit it according to its own mode
+             if(mpos>0 && (lastMode=='m' || (lastMode=='D' && includeDels))){
+                 ranges.add(rstart);
+                 ranges.add(lastRpos);
+             }
+             rstart=lastRpos;
+         }
+         lastMode=mode;
+         lastRpos=rpos;
+     }
+     //Final cycle: emit the run that was still open at the end of the match string
+     if(lastMode=='m' || (lastMode=='D' && includeDels)){
+         ranges.add(rstart);
+         ranges.add(lastRpos);
+     }
+     assert(rpos==stop) : start+", "+stop+", "+rpos+", "+rstart+", "+lastRpos+"\n"+new String(longmatch)+"\n"+ranges;
+     assert((ranges.size&1)==0);
+ }
+
+ private void handleNoRef(String rname){
+ assert(rname!=null);
+ String ret=norefSet.putIfAbsent(rname, rname);
+ if(ret==null){
+ System.err.println("Warning! Scaffold not found in assembly: "+rname);
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Loads the reference into an insertion-ordered map keyed by sequence id.
+  * Each Read gets a fresh BitSet in {@code obj} in which N bases (and lowercase bases
+  * when MaskByLowercase is set) are already masked.  When SAM input is present and
+  * mincov or maxcov is nonnegative, {@code covmap} is created and given one coverage
+  * array per sequence.  Also updates refReads and refBases.
+  * @return Map from sequence id to Read
+  * @throws RuntimeException if errorState is set once the stream has been closed
+  */
+ private LinkedHashMap<String, Read> hashRef(){
+
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffinRef, null, qfinRef, null);
+     if(verbose){System.err.println("Started cris");}
+     cris.start();
+
+     final LinkedHashMap<String, Read> hmr=new LinkedHashMap<String, Read>();
+     final boolean trackCoverage=(ffinSam!=null && (mincov>=0 || maxcov>=0));
+     if(trackCoverage){
+         covmap=new HashMap<String, CoverageArray>();
+     }
+
+     ListNum<Read> ln=cris.nextList();
+     while(ln!=null && ln.list!=null && ln.list.size()>0){
+         final ArrayList<Read> reads=ln.list;
+
+         for(int idx=0; idx<reads.size(); idx++){
+             final Read r=reads.get(idx);
+             final int len=r.length();
+             final byte[] bases=r.bases;
+
+             final BitSet bs=new BitSet(len);
+             r.obj=bs;
+
+             if(covmap!=null){
+                 //The array's first argument is the number of arrays created so far
+                 if(bits32){
+                     covmap.put(r.id, new CoverageArray3(covmap.size(), len));
+                 }else{
+                     covmap.put(r.id, new CoverageArray2(covmap.size(), len));
+                 }
+             }
+
+             for(int i=0; i<len; i++){
+                 final byte b=bases[i];
+                 if(b=='N' || (MaskByLowercase && Character.isLowerCase(b))){bs.set(i);}
+             }
+
+             refReads++;
+             refBases+=len;
+             final Read old=hmr.put(r.id, r);
+             assert(old==null) : "Duplicate reference scaffold name "+r.id;
+         }
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+     }
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+
+     errorState|=ReadWrite.closeStreams(cris);
+     if(errorState){
+         throw new RuntimeException("BBMask terminated in an error state; the output may be corrupt.");
+     }
+
+     return hmr;
+ }
+
+ /*--------------------------------------------------------------*/
+
+
+ private long maskLowComplexity(short[][] matrix){
+ long sum=0;
+ if(matrix==null){matrix=new short[16][];}
+ for(Read r : map.values()){
+ sum+=maskLowComplexity(r, mink2, maxk2, window, ratio, matrix);
+ }
+ return sum;
+ }
+
+ /**
+  * Masks low-complexity windows of one sequence for every k from mink to maxk.
+  * For each k the minimum number of distinct k-mers required in a window is
+  * ceil(ratio*min(window, 4^k)).
+  * @param r Sequence whose {@code obj} is its BitSet mask
+  * @param matrix Holder of per-k count tables; missing tables are allocated here
+  * @return Number of newly masked bases in this sequence
+  */
+ private static int maskLowComplexity(Read r, int mink, int maxk, int window, float ratio, short[][] matrix){
+
+     final byte[] bases=r.bases;
+     final BitSet bs=(BitSet)r.obj;
+     final int maskedBefore=bs.cardinality();
+
+     //Allocate every table that will be needed before any masking starts
+     for(int k=mink; k<=maxk; k++){
+         if(matrix[k]!=null){continue;}
+         matrix[k]=new short[(1<<(2*k))];
+     }
+
+     for(int k=mink; k<=maxk; k++){
+         final int kmerspace=(1<<(2*k));
+         final int mincount=(int)Math.ceil(ratio*Tools.min(window, kmerspace));
+         maskLowComplexity(bases, bs, k, window, mincount, matrix[k]);
+     }
+
+     return bs.cardinality()-maskedBefore;
+ }
+
+
+ private static void maskLowComplexity(final byte[] bases, final BitSet bs, final int k, final int window, final int mincount, final short[] counts){ //Masks each full window holding fewer than mincount distinct k-mers and no undefined bases
+ assert(k>0) : "k must be greater than 0";
+ //counts is indexed by rolling k-mer value and is expected to be all zero on entry (checked only when verify is set)
+ if(verify){
+ for(int c : counts){assert(c==0);}
+ }
+
+ final int mask=(k>15 ? -1 : ~((-1)<<(2*k))); //Keeps the low 2*k bits of a rolling k-mer
+ int current=0, ns=0; //current=number of k-mers with a nonzero count; ns=bases in the window that are not fully defined
+ int kmer=0, kmer2=0; //kmer rolls along the leading edge (i), kmer2 along the trailing edge (i2)
+
+ for(int i=0, i2=-window; i2<bases.length; i++, i2++){ //i2 always trails i by exactly window positions
+
+// System.err.println("\nStart: i="+i+", current="+current+", ns="+ns+"\n"+Arrays.toString(counts));
+ //Leading edge: count the k-mer ending at position i
+ if(i<bases.length){
+ final byte b=bases[i];
+ final int n=Dedupe.baseToNumber[b];
+ kmer=((kmer<<2)|n)&mask;
+
+ if(!AminoAcid.isFullyDefined(b)){ns++;}
+ if(counts[kmer]<1){
+ assert(counts[kmer]==0);
+ current++;
+ }
+ counts[kmer]++;
+ if(verify){assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);}
+
+// System.err.println("Added "+kmer+"; counts["+kmer+"]="+counts[kmer]);
+ }
+ //Trailing edge: uncount the k-mer ending at position i2, which was counted window iterations earlier
+ if(i2>=0){
+ final byte b2=bases[i2];
+ final int n2=Dedupe.baseToNumber[b2];
+ kmer2=((kmer2<<2)|n2)&mask;
+
+ if(!AminoAcid.isFullyDefined(b2)){
+ ns--;
+ assert(ns>=0);
+ }
+ counts[kmer2]--;
+ if(counts[kmer2]<1){
+ assert(counts[kmer2]==0) : Arrays.toString(counts);
+ current--;
+ }
+ if(verify){assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);}
+
+// System.err.println("Removed "+kmer2+"; count="+counts[kmer2]);
+ }
+
+ if(verify && i2>-1 && i<bases.length){
+ assert(Tools.sum(counts)==window);
+ }
+ //i2>=-1 means window bases have been added; the window then spans [i2+1, i+1)
+ if(current<mincount && ns<1 && i2>=-1 && i<bases.length){
+// System.err.println("Masked ("+(i2+1)+", "+(i+1)+")");
+ bs.set(i2+1, i+1);
+ }
+ }
+ //Every k-mer added at the leading edge has also been removed at the trailing edge by the time the loop ends
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+
+
+ private long maskLowEntropy_ST(short[][] matrix){
+ long sum=0;
+ if(matrix==null){matrix=new short[maxk2+1][];}
+ short[] countCounts=new short[window+2];
+ countCounts[0]=(short)window;
+ for(Read r : map.values()){
+ sum+=maskLowEntropy(r, mink2, maxk2, window, entropyCutoff, matrix, countCounts);
+ }
+ return sum;
+ }
+
+ /**
+  * Multithreaded low-entropy masking.  All sequences in {@code map} are placed in a
+  * queue that is drained by up to Shared.threads() workers.
+  * An empty map returns 0 immediately, because ArrayBlockingQueue rejects a capacity
+  * below 1 with an IllegalArgumentException.
+  * @return Total number of newly masked bases, summed over all workers
+  */
+ private long maskLowEntropy(){
+     if(map.isEmpty()){return 0;}//Nothing to mask, and a zero-capacity queue cannot be constructed
+     ArrayBlockingQueue<Read> queue=new ArrayBlockingQueue<Read>(map.size());
+     for(Read r : map.values()){queue.add(r);}
+     int numThreads=Tools.min(Shared.threads(), queue.size());
+     MaskLowEntropyThread[] threads=new MaskLowEntropyThread[numThreads];
+     long sum=0;
+ //    outstream.println("Spawning "+numThreads+" threads.");
+     for(int i=0; i<threads.length; i++){threads[i]=new MaskLowEntropyThread(queue, mink2, maxk2, window, entropyCutoff);}
+     for(int i=0; i<threads.length; i++){threads[i].start();}
+     for(int i=0; i<threads.length; i++){
+         //Keep waiting until the worker has really finished, so its total is final when read
+         while(threads[i].getState()!=Thread.State.TERMINATED){
+             try {
+                 threads[i].join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         sum+=threads[i].masked;
+     }
+     return sum;
+ }
+
+ private class MaskLowEntropyThread extends Thread{
+
+ MaskLowEntropyThread(ArrayBlockingQueue<Read> queue_, int mink_, int maxk_, int window_, float cutoff_){
+ queue=queue_;
+ mink=mink_;
+ maxk=maxk_;
+ window=(short)window_;
+ cutoff=cutoff_;
+ countCounts=new short[window+2];
+ matrix=new short[maxk+1][];
+ countCounts[0]=window;
+ }
+
+ @Override
+ public void run(){
+ for(Read r=queue.poll(); r!=null; r=queue.poll()){
+ masked+=maskLowEntropy(r, mink, maxk, window, cutoff, matrix, countCounts);
+ }
+ }
+
+ final ArrayBlockingQueue<Read> queue;
+ final int mink;
+ final int maxk;
+ final short window;
+ final float cutoff;
+ final short[] countCounts;
+ final short[][] matrix;
+ long masked=0;
+
+ }
+
+ /**
+  * Masks low-entropy windows of one sequence for every k from mink to maxk.
+  * @param r Sequence whose {@code obj} is its BitSet mask
+  * @param cutoff Windows whose calculated entropy is below this are masked
+  * @param matrix Holder of per-k count tables; missing tables are allocated here
+  * @param countCounts Histogram of k-mer counts shared by all k values
+  * @return Number of newly masked bases in this sequence
+  */
+ private int maskLowEntropy(Read r, int mink, int maxk, int window, float cutoff, short[][] matrix, short[] countCounts){
+     final byte[] bases=r.bases;
+     final BitSet bs=(BitSet)r.obj;
+     final int maskedBefore=bs.cardinality();
+
+     //Allocate every table that will be needed before any masking starts
+     for(int k=mink; k<=maxk; k++){
+         if(matrix[k]!=null){continue;}
+         matrix[k]=new short[(1<<(2*k))];
+     }
+
+     for(int k=mink; k<=maxk; k++){
+         final int kmerspace=(1<<(2*k));
+         maskLowEntropy(bases, bs, k, window, matrix[k], countCounts, cutoff, kmerspace);
+     }
+
+     return bs.cardinality()-maskedBefore;
+ }
+
+
+ private void maskLowEntropy(final byte[] bases, final BitSet bs, final int k, final int window, final short[] counts, final short[] countCounts, float cutoff, int kmerspace){ //Masks each full window without undefined bases whose calcEntropy value is below cutoff
+ assert(k>0) : "k must be greater than 0";
+// Arrays.fill(counts, 0);
+ //counts[x] is the number of times k-mer x occurs in the window; countCounts[c] is adjusted in step so that it follows how many k-mers have count c
+ assert(countCounts[0]==window);
+ if(verify){
+ for(int c : counts){assert(c==0);}
+ for(int i=1; i<countCounts.length; i++){assert(countCounts[i]==0);}
+ }
+
+ final int mask=(k>15 ? -1 : ~((-1)<<(2*k))); //Keeps the low 2*k bits of a rolling k-mer
+ int current=0, ns=0; //current=number of k-mers with a nonzero count; ns=bases in the window that are not fully defined
+ int kmer=0, kmer2=0; //kmer rolls along the leading edge (i), kmer2 along the trailing edge (i2)
+
+ for(int i=0, i2=-window; i2<bases.length; i++, i2++){ //i2 always trails i by exactly window positions
+
+// System.err.println("\nStart: i="+i+", current="+current+", ns="+ns+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts));
+ //Leading edge: count the k-mer ending at position i and move it up one histogram slot
+ if(i<bases.length){
+ final byte b=bases[i];
+ final int n=Dedupe.baseToNumber[b];
+ kmer=((kmer<<2)|n)&mask;
+
+ if(!AminoAcid.isFullyDefined(b)){ns++;}
+ if(counts[kmer]<1){
+ assert(counts[kmer]==0);
+ current++;
+ }
+ countCounts[counts[kmer]]--;
+ assert(countCounts[counts[kmer]]>=-1): i+", "+current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
+ counts[kmer]++;
+ assert(counts[kmer]<=window+1) : Arrays.toString(counts)+"\n"+Arrays.toString(countCounts);
+ countCounts[counts[kmer]]++;
+ if(verify){
+ assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
+ assert(Tools.sum(countCounts)>0 && (Tools.sum(countCounts)<=window+1)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+// System.err.println("Added "+kmer+"; counts["+kmer+"]="+counts[kmer]);
+ }
+ //Trailing edge: uncount the k-mer ending at position i2 and move it down one histogram slot
+ if(i2>=0){
+ final byte b2=bases[i2];
+ final int n2=Dedupe.baseToNumber[b2];
+ kmer2=((kmer2<<2)|n2)&mask;
+
+ if(!AminoAcid.isFullyDefined(b2)){
+ ns--;
+ assert(ns>=0);
+ }
+ countCounts[counts[kmer2]]--;
+ assert(countCounts[counts[kmer2]]>=0);
+ counts[kmer2]--;
+ countCounts[counts[kmer2]]++;
+ if(counts[kmer2]<1){
+ assert(counts[kmer2]==0) : Arrays.toString(counts);
+ current--;
+ }
+ if(verify){
+ assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts);
+ assert(Tools.sum(countCounts)>=0 && (Tools.sum(countCounts)<=window)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+
+// System.err.println("Removed "+kmer2+"; count="+counts[kmer2]);
+ }
+
+ if(verify && i2>-1 && i<bases.length){
+ assert(Tools.sum(counts)==window);
+ assert(Tools.sum(countCounts)==window): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts);
+ }
+ //i2>=-1 means window bases have been added; the window then spans [i2+1, i+1). calcEntropy is evaluated last because it loops over the histogram
+ if(ns<1 && i2>=-1 && i<bases.length && calcEntropy(countCounts, window, kmerspace)<cutoff){
+// System.err.println("Masked ("+(i2+1)+", "+(i+1)+")");
+ bs.set(i2+1, i+1);
+ }
+ }
+ //Every k-mer added at the leading edge has also been removed at the trailing edge by the time the loop ends
+ }
+
+ private float calcEntropy(int[] countCounts, int window, int kmerspace){
+ double sum=0;
+ for(int i=1; i<countCounts.length; i++){
+ int cc=countCounts[i];
+ double pklogpk=entropy[i];
+ sum+=(cc*pklogpk);
+ }
+// System.err.println("sum = "+sum);
+// System.err.println("entropy = "+(sum*entropyMult));
+ return (float)(sum*entropyMult);
+ }
+
+ private float calcEntropy(short[] countCounts, int window, int kmerspace){
+ double sum=0;
+ for(int i=1; i<countCounts.length; i++){
+ int cc=countCounts[i];
+ double pklogpk=entropy[i];
+ sum+=(cc*pklogpk);
+ }
+// System.err.println("sum = "+sum);
+// System.err.println("entropy = "+(sum*entropyMult));
+ return (float)(sum*entropyMult);
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+
+
+ private long maskRepeats_ST(){
+ long sum=0;
+ for(Read r : map.values()){
+ sum+=maskRepeats(r, mink, maxk, mincount, minlen);
+ }
+ return sum;
+ }
+
+ /**
+  * Multithreaded repeat masking.  All sequences in {@code map} are placed in a queue
+  * that is drained by up to Shared.threads() workers.
+  * An empty map returns 0 immediately, because ArrayBlockingQueue rejects a capacity
+  * below 1 with an IllegalArgumentException.
+  * @return Total number of newly masked bases, summed over all workers
+  */
+ private long maskRepeats(){
+     if(map.isEmpty()){return 0;}//Nothing to mask, and a zero-capacity queue cannot be constructed
+     ArrayBlockingQueue<Read> queue=new ArrayBlockingQueue<Read>(map.size());
+     for(Read r : map.values()){queue.add(r);}
+     int numThreads=Tools.min(Shared.threads(), queue.size());
+     MaskRepeatThread[] threads=new MaskRepeatThread[numThreads];
+     long sum=0;
+     for(int i=0; i<threads.length; i++){threads[i]=new MaskRepeatThread(queue, mink, maxk, mincount, minlen);}
+     for(int i=0; i<threads.length; i++){threads[i].start();}
+     for(int i=0; i<threads.length; i++){
+         //Keep waiting until the worker has really finished, so its total is final when read
+         while(threads[i].getState()!=Thread.State.TERMINATED){
+             try {
+                 threads[i].join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         sum+=threads[i].masked;
+     }
+     return sum;
+ }
+
+ private class MaskRepeatThread extends Thread{
+
+ MaskRepeatThread(ArrayBlockingQueue<Read> queue_, int mink_, int maxk_, int mincount_, int minlen_){
+ queue=queue_;
+ mink=mink_;
+ maxk=maxk_;
+ mincount=mincount_;
+ minlen=minlen_;
+ }
+
+ @Override
+ public void run(){
+ for(Read r=queue.poll(); r!=null; r=queue.poll()){
+ masked+=maskRepeats(r, mink, maxk, mincount, minlen);
+ }
+ }
+
+ final ArrayBlockingQueue<Read> queue;
+ final int mink;
+ final int maxk;
+ final int mincount;
+ final int minlen;
+ long masked=0;
+
+ }
+
+ /**
+  * Masks repeats of one sequence for every unit length k from mink to maxk.
+  * For each k, the minimum masked span is max(minlen, k*mincount).
+  * @param r Sequence whose {@code obj} is its BitSet mask
+  * @return Number of newly masked bases in this sequence
+  */
+ private static int maskRepeats(Read r, int mink, int maxk, int mincount, int minlen){
+     final byte[] bases=r.bases;
+     final BitSet bs=(BitSet)r.obj;
+     final int maskedBefore=bs.cardinality();
+
+     int k=mink;
+     while(k<=maxk){
+         final int required=Tools.max(minlen, k*mincount);
+         maskRepeats(bases, bs, k, required);
+         k++;
+     }
+
+     return bs.cardinality()-maskedBefore;
+ }
+
+
+ /**
+  * Scans one sequence for repeats with unit length k and masks every repeat whose
+  * length, as reported by repeatLength, is at least minlen.
+  * The masked span starts k bases before the scan position.
+  * @param bases Sequence
+  * @param bs Mask to update
+  * @param k Repeat unit length
+  * @param minlen Minimum repeat length to mask
+  */
+ private static void maskRepeats(final byte[] bases, final BitSet bs, final int k, final int minlen){
+     final int lim=bases.length-k;
+     final int mask=(k>15 ? -1 : ~((-1)<<(2*k)));
+     int loc=0;
+     while(loc<lim){
+         final int len=repeatLength(bases, k, mask, loc);
+         if(len>=minlen){
+             final int from=loc-k;
+             final int to=from+len;
+             bs.set(from, to);
+             //Skip ahead, but never backwards, before the normal step below
+             loc=Tools.max(loc, to-minlen);
+         }
+         loc++;
+     }
+ }
+
+
+ /**
+  * Measures how far the k-mer immediately preceding loc keeps recurring.
+  * Starting at loc, a rolling k-mer is extended one base at a time; the scan stops
+  * at the end of the sequence or after k consecutive positions at which the rolling
+  * k-mer differs from the key.
+  * @param bases Sequence
+  * @param k K-mer length
+  * @param mask Bit mask applied to the rolling k-mer
+  * @param loc Position just past the key k-mer
+  * @return 0 if there is no key or it never recurs; otherwise lastMatch-loc+k+1
+  */
+ private static int repeatLength(final byte[] bases, final int k, final int mask, final int loc){
+
+     final int end=bases.length;
+     final int key=getInitialKey(bases, loc, k);
+     if(key<0){return 0;}
+
+     int rolling=key;
+     int lastHit=-1;
+     int misses=0;
+     int pos=loc;
+     while(pos<end && misses<k){
+         final int n=Dedupe.baseToNumber[bases[pos]];
+         rolling=((rolling<<2)|n)&mask;
+         if(rolling!=key){
+             misses++;
+         }else{
+             lastHit=pos;
+             misses=0;
+         }
+         pos++;
+     }
+
+     if(lastHit<0){return 0;}
+     return lastHit-loc+k+1;
+ }
+
+ /**
+  * Packs the k bases immediately before loc into an int, 2 bits per base, using
+  * Dedupe.baseToNumber for the per-base codes.
+  * @param bases Sequence
+  * @param loc Position just past the k-mer
+  * @param k K-mer length; asserted to be below 16
+  * @return The packed k-mer, or -1 when fewer than k bases precede loc
+  */
+ private static int getInitialKey(byte[] bases, int loc, int k){
+     assert(k<16);
+     final int from=loc-k;
+     if(from<0){return -1;}
+     int key=0;
+     for(int pos=from; pos<loc; pos++){
+         final int n=Dedupe.baseToNumber[bases[pos]];
+         key=(key<<2)|n;
+     }
+     assert(key>=0);
+     return key;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){
+ outstream.println("Syntax:\n");
+ outstream.println("java -ea -Xmx15g -cp <path> jgi.BBMask ref=<file> sam=<file,file,...file> out=<file>");
+ outstream.println("sam and out are optional.\n");
+ outstream.println("Other parameters and their defaults:\n");
+ outstream.println("overwrite=false \tOverwrites files that already exist");
+ outstream.println("ziplevel=2 \tSet compression level, 1 (low) to 9 (max)");
+ outstream.println("fastawrap=70 \tLength of lines in fasta output");
+ outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+ outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+
+ private LinkedHashMap<String, Read> map=null; //Sequences keyed by name; the masking methods treat each Read.obj as that sequence's BitSet of masked positions
+ private ConcurrentHashMap<String, String> norefSet=new ConcurrentHashMap<String, String>(256, .75f, 16); //SAM reference names not found in map; lets handleNoRef warn once per name
+ private HashMap<String, CoverageArray> covmap=null; //Coverage array per sequence name; created in hashRef only when SAM input and a coverage cutoff are both present
+// private IntList refLengths=new IntList();
+
+ private long refReads=0; //Sequences loaded by hashRef
+ private long refBases=0; //Sum of the lengths of the sequences loaded by hashRef
+// private long repeatsMasked=0;
+
+ private long samReads=0; //SAM reads seen by maskSam, summed over all worker threads
+ private long samBases=0; //Sum of the lengths of the SAM reads seen by maskSam
+// private long samMasked=0;
+
+ public boolean errorState=false; //Set when closing a stream reports a failure
+
+ private String inRef=null;
+ private String inSam[]=null;
+
+ private String qfinRef=null;
+
+ private String outRef=null;
+
+ private String qfoutRef=null;
+
+ private String extinRef=null;
+ private String extoutRef=null;
+
+ private boolean parsecustom=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+ private long maxReads=-1; //Passed to every input stream that is opened
+
+ private boolean processRepeats=false;
+ private int mink=5; //Smallest repeat unit length tried by maskRepeats
+ private int maxk=5; //Largest repeat unit length tried by maskRepeats
+ private int minlen=40; //Repeats must span at least max(minlen, k*mincount) bases to be masked
+ private int mincount=4;
+
+ private boolean processEntropy=true;
+ private boolean entropyMode=true;
+ private boolean splitMode=false;
+ private int mink2=5; //Smallest k-mer length used for complexity and entropy masking
+ private int maxk2=5; //Largest k-mer length used for complexity and entropy masking
+ private int window=80; //Window length, in bases, for complexity and entropy masking
+ private float ratio=0.35f; //For complexity, if not in entropyMode
+ private float entropyCutoff=0.70f; //Windows whose calculated entropy is below this are masked
+
+ /** Use 32-bit coverage arrays */
+ private boolean bits32=true;
+ /** Include deletions when calculating coverage */
+ private boolean includeDeletionCoverage=true;
+
+ /** If nonnegative, mask bases with coverage outside this range. */
+ private int mincov=-1;
+ /** If nonnegative, mask bases with coverage outside this range. */
+ private int maxcov=-1;
+
+ private int samPad=0; //Amount by which each alignment's span is widened on both sides before masking or counting coverage
+
+ private final FileFormat ffinRef;
+ private final FileFormat[] ffinSam;
+
+ private final FileFormat ffoutRef;
+
+ private PrintStream outstream=System.err;
+
+ private final double[] entropy; //Per-count weights; calcEntropy multiplies entropy[i] by countCounts[i]
+ private final double entropyMult; //Scale factor applied to the weighted sum in calcEntropy
+
+ /*--------------------------------------------------------------*/
+
+ public static boolean verbose=false;
+ public static boolean CONVERT_NON_ACGTN=true;
+ private static boolean verify=false; //Enables the extra consistency assertions in the sliding-window masking methods
+ private static boolean MaskByLowercase=false; //When true, hashRef also pre-masks lowercase bases
+
+
+}
diff --git a/current/jgi/BBMerge.java b/current/jgi/BBMerge.java
new file mode 100755
index 0000000..b0474cd
--- /dev/null
+++ b/current/jgi/BBMerge.java
@@ -0,0 +1,2135 @@
+package jgi;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import kmer.KmerTableSet;
+
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.ReadStreamWriter;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.LongList;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import assemble.Tadpole;
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 14, 2012
+ *
+ */
+public class BBMerge {
+
+
+ /**
+  * Program entry point.  Expands config-file arguments, prints usage and exits if
+  * help was requested, otherwise builds a BBMerge and runs it.
+  * Read.VALIDATE_IN_CONSTRUCTOR is set to true once processing has finished.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final String[] parsed=Parser.parseConfig(args);
+     final boolean helpRequested=Parser.parseHelp(parsed, true);
+     if(helpRequested){
+         printOptions();
+         System.exit(0);
+     }
+     final BBMerge merger=new BBMerge(parsed);
+     merger.process();
+     Read.VALIDATE_IN_CONSTRUCTOR=true;
+ }
+
+
+ private static void printOptions(){
+ System.err.println("Please consult the shellscript for usage information.");
+ }
+
+
+ private static String[] preparse(String[] args){ //Consumes the preset flags (strict, loose and fast families) and expands them into explicit parameter strings
+ if(args==null){return new String[0];} //Null input becomes an empty array
+ int nulls=0; //Number of args consumed (set to null) by the loop below
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading dashes from the key
+ //Each preset flag sets its static field and is removed from args.
+ //The jni flag is the exception: it sets Shared.USE_JNI but stays in args.
+ if(a.equals("jni") || a.equals("usejni")){
+ Shared.USE_JNI=Tools.parseBoolean(b);
+ }else if(a.equals("showfullargs") || a.equalsIgnoreCase("showFullArgs")){
+ showFullArgs=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("vstrict") || a.equals("verystrict")){
+ vstrict=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("ustrict") || a.equals("ultrastrict")){
+ ustrict=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("xstrict") || a.equals("hstrict") || a.equals("hyperstrict") || a.equals("maxstrict")){
+ xstrict=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("strict")){
+ strict=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("loose")){
+ loose=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("vloose") || a.equals("veryloose")){
+ vloose=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("uloose") || a.equals("ultraloose")){
+ uloose=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("xloose") || a.equals("hloose") || a.equals("hyperloose") || a.equals("maxloose")){
+ xloose=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("fast")){
+ fast=Tools.parseBoolean(b);
+ args[i]=null;
+ nulls++;
+ }else if(a.equals("default")){
+ if(Tools.parseBoolean(b)){
+ xstrict=ustrict=vstrict=strict=loose=vloose=uloose=xloose=fast=false; //Clears every preset flag, including ones set by earlier arguments
+ }
+ args[i]=null;
+ nulls++;
+ }
+ }
+ //Nothing was consumed, so hand back the caller's own array
+ if(nulls==0){return args;}
+ ArrayList<String> args2=new ArrayList<String>(args.length-nulls+5); //Preset parameters go in first; the surviving original args are appended after them
+ if(strict || vstrict || ustrict || xstrict){ //The strict family is tested first, so it takes precedence over loose and fast
+ strict=true;
+ loose=vloose=uloose=xloose=false;
+ //Parameters shared by every strict level
+ args2.add("maxbad=4");
+ args2.add("margin=3");
+ args2.add("minqo=8");
+ args2.add("qualiters=2");
+
+ if(xstrict){ //Strictest level is checked first
+ args2.add("ratiomode=t");
+ args2.add("normalmode=t");
+ args2.add("requireratiomatch=t");
+
+ args2.add("minentropy=56");
+ args2.add("minoverlap=14");
+ args2.add("minoverlap0=3");
+
+ args2.add("maxratio=0.055");
+ args2.add("ratiomargin=12");
+ args2.add("ratiooffset=0.65");
+ args2.add("ratiominoverlapreduction=4");
+ args2.add("efilter=2");
+ args2.add("pfilter=0.25");
+ }else if(ustrict){
+ args2.add("ratiomode=t");
+ args2.add("normalmode=t");
+ args2.add("requireratiomatch=t");
+
+ args2.add("minentropy=56");
+ args2.add("minoverlap=14");
+ args2.add("minoverlap0=3");
+
+ args2.add("maxratio=0.045");
+ args2.add("ratiomargin=12");
+ args2.add("ratiooffset=0.5");
+ args2.add("ratiominoverlapreduction=4");
+ args2.add("efilter=2");
+ args2.add("pfilter=0.03");
+ }else if(vstrict){
+ args2.add("ratiomode=t");
+ args2.add("normalmode=f");
+
+ args2.add("minentropy=52");
+ args2.add("minoverlap=12");
+ args2.add("minoverlap0=4");
+
+ args2.add("maxratio=0.05");
+ args2.add("ratiomargin=12");
+ args2.add("ratiooffset=0.5");
+ args2.add("ratiominoverlapreduction=4");
+ args2.add("efilter=2");
+ args2.add("pfilter=0.008");
+ }else{ //Plain strict
+ args2.add("ratiomode=t");
+ args2.add("normalmode=f");
+
+ args2.add("minentropy=42");
+ args2.add("minoverlap0=7");
+ args2.add("minoverlap=11");
+
+ args2.add("maxratio=0.075");
+ args2.add("ratiomargin=7.5");
+ args2.add("ratiooffset=0.55");
+ args2.add("ratiominoverlapreduction=4");
+ args2.add("efilter=4");
+ args2.add("pfilter=0.0008");
+ }
+ }else if(loose || vloose || uloose || xloose){ //Loose family
+ loose=true;
+ strict=vstrict=ustrict=xstrict=false;
+ args2.add("minoverlap=8");
+ args2.add("minoverlap0=9");
+ args2.add("qualiters=4");
+ args2.add("mismatches=3");
+ args2.add("margin=2");
+ //The entries above and below are shared by every loose level;
+ args2.add("ratiooffset=0.4");
+ //the level-specific lists that follow repeat some of these keys with different values, later in the list.
+ if(xloose){
+ args2.add("owq=t");
+ args2.add("ouq=t");
+ args2.add("minentropy=22");
+ args2.add("minoverlap=8");
+ args2.add("minoverlap0=7");
+ args2.add("maxratio=0.2");
+ args2.add("mismatches=3");
+ args2.add("ratiomargin=2");
+ args2.add("normalmode=t");
+ args2.add("pfilter=0.0000001");
+ args2.add("efilter=8");
+ args2.add("margin=2");
+ args2.add("ratiominoverlapreduction=2");
+ }else if(vloose || uloose){
+ args2.add("owq=t");
+ args2.add("ouq=t");
+ if(uloose){
+// args2.add("maxratio=0.14");
+// args2.add("ratiomargin=2");
+// args2.add("normalmode=t");
+// args2.add("pfilter=0.0000001");
+
+
+ args2.add("minoverlap=8");
+ args2.add("minoverlap0=7");
+ args2.add("mismatches=3");
+ args2.add("margin=2");
+
+ args2.add("ratiominoverlapreduction=2");
+ args2.add("efilter=8");
+ args2.add("maxratio=0.16");
+ args2.add("ratiomargin=2.2");
+ args2.add("pfilter=0.0000002");
+ args2.add("minentropy=24");
+ }else{ //vloose
+ args2.add("ratiominoverlapreduction=3");
+ args2.add("maxratio=0.12");
+ args2.add("ratiomargin=3");
+ args2.add("pfilter=0.000004");
+ args2.add("minentropy=28");
+ args2.add("efilter=7.5");
+ args2.add("ratiooffset=0.45");
+ }
+ }else{ //Plain loose
+ args2.add("maxratio=0.11");
+ args2.add("ratiomargin=4.7");
+ args2.add("ratiominoverlapreduction=2");
+ args2.add("pfilter=0.00002");
+ args2.add("efilter=8");
+ args2.add("minentropy=30");
+ }
+ }else if(fast){ //Only reached when no strict or loose flag is set
+ args2.add("maxratio=0.08");
+ args2.add("ratiomargin=2.5");
+ args2.add("ratiominoverlapreduction=3");
+ args2.add("pfilter=0.0002");
+ args2.add("efilter=8");
+ args2.add("minentropy=39");
+ args2.add("mininsert0=50");
+ }
+ //Append the original arguments that were not consumed, after any preset values
+ for(String s : args){
+ if(s!=null){args2.add(s);}
+ }
+ return args2.toArray(new String[args2.size()]);
+ }
+
+ /**
+  * Builds a BBMerge instance from command-line arguments.
+  * <p>
+  * Steps, in order: echo the arguments, expand them through preparse(), pick up optional
+  * positional input files (args[0]/args[1] when they are not in key=value form), parse
+  * every key=value flag, reconcile dependent settings (insert and mismatch limits, trimming),
+  * expand '#' in file names, optionally create a Tadpole instance, and validate the
+  * input/output file names.
+  *
+  * @param args Command-line arguments; flags use key=value form and leading '-' is ignored.
+  * @throws RuntimeException on an unknown flag, identical input files, or unusable file names.
+  */
+ public BBMerge(String[] args){
+     System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+     System.err.println("BBMerge version "+version);
+
+     {
+         String[] args0=args;
+         args=preparse(args);
+
+         if(args0!=args && showFullArgs){
+             System.err.println("Revised arguments: "+Arrays.toString(args)+"\n");
+         }
+     }
+
+     Timer ttotal=new Timer();
+     ttotal.start();
+
+     //Positional inputs: the first one or two arguments, when they are not key=value pairs.
+     in1=(args[0].indexOf('=')>0 ? null : args[0]);
+     in2=(in1!=null && args.length>1 && args[1].indexOf('=')<0 ? args[1] : null);
+     if(in2!=null && "null".equalsIgnoreCase(in2)){in2=null;}
+
+     {
+         if(in1!=null && !in1.contains(",") && !in1.startsWith("stdin.") && !in1.equals("stdin")){
+             File f=new File(in1);
+             if(!f.exists() || !f.isFile()){
+                 in1=null;
+ //                throw new RuntimeException(in1+" does not exist.");
+             }
+         }
+         if(in2!=null && !in2.contains(",")){
+             File f=new File(in2);
+             if(!f.exists() || !f.isFile()){
+                 in2=null;
+ //                throw new RuntimeException(in2+" does not exist.");
+             }else if(in2.equalsIgnoreCase(in1)){
+                 //in1 may have been nulled just above; comparing from in2 avoids a NullPointerException.
+                 throw new RuntimeException("Both input files are the same.");
+             }
+         }
+     }
+
+     ReadWrite.MAX_ZIP_THREADS=Shared.threads()-1;
+
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+     Shared.READ_BUFFER_LENGTH=Tools.max(Shared.READ_BUFFER_LENGTH, 400);
+
+     //Tracks whether MAX_MISMATCHES0 was set explicitly, so a default can be derived otherwise.
+     boolean mm0set=false;
+
+     Parser parser=new Parser();
+     parser.trimq2=trimq;
+     parser.minAvgQuality=minAvgQuality;
+     parser.minReadLength=minReadLength;
+     parser.maxReadLength=maxReadLength;
+
+     for(int i=0; i<args.length; i++){
+         String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if(b==null || b.equalsIgnoreCase("null")){b=null;}
+         while(a.startsWith("-")){a=a.substring(1);}
+
+         //Note: 'a' is lower-case here, so every key literal below must be lower-case too.
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQualityAdjust(arg, a, b)){
+             //do nothing
+         }else if(parser.parseInterleaved(arg, a, b)){
+             //do nothing
+         }else if(parser.parseTrim(arg, a, b)){
+             //do nothing
+         }else if(a.equals("in") || a.equals("in1")){
+             in1=b;
+         }else if(a.equals("in2")){
+             in2=b;
+         }else if(a.equals("extra")){
+             //A bare "extra" or "extra=null" carries no file names; skip it instead of dereferencing null.
+             if(b!=null){
+                 for(String s : b.split(",")){
+                     extra.add(s);
+                 }
+             }
+         }else if(a.equals("useratio") || a.equals("ratio") || a.equals("ratiomode")){
+             useRatioMode=Tools.parseBoolean(b);
+         }else if(a.equals("usenormalmode") || a.equals("normalmode")){
+             useNormalMode=Tools.parseBoolean(b);
+         }else if(a.equals("requireratiomatch") || a.equals("rrm")){
+             requireRatioMatch=Tools.parseBoolean(b);
+         }else if(a.equals("maxratio")){
+             MAX_RATIO=Float.parseFloat(b);
+ //            useRatioMode=true;
+         }else if(a.equals("ratiomargin")){
+             RATIO_MARGIN=Float.parseFloat(b);
+ //            useRatioMode=true;
+         }else if(a.equals("ratiooffset")){
+             RATIO_OFFSET=Float.parseFloat(b);
+ //            useRatioMode=true;
+         }else if(a.equals("ratiominoverlapreduction")){
+             MIN_OVERLAPPING_BASES_RATIO_REDUCTION=Integer.parseInt(b);
+ //            useRatioMode=true;
+         }else if(a.equals("minentropy") || a.equals("entropy")){
+             //A numeric value sets the score; anything else is treated as an on/off switch.
+             if(b!=null && Character.isDigit(b.charAt(0))){
+                 minEntropyScore=Integer.parseInt(b);
+             }else{
+                 useEntropy=Tools.parseBoolean(b);
+             }
+         }else if(a.equals("minoverlappingbases") || a.equals("minoverlapbases") || a.equals("minoverlap")){
+             MIN_OVERLAPPING_BASES=Integer.parseInt(b);
+         }else if(a.equals("minoverlappingbases0") || a.equals("minoverlapbases0") || a.equals("minoverlap0")){
+             MIN_OVERLAPPING_BASES_0=Integer.parseInt(b);
+         }else if(a.equals("minqo") || a.equals("minq")){
+             MIN_QUALITY=(byte)Integer.parseInt(b);
+         }else if(a.equals("maxq")){
+             Read.MAX_MERGE_QUALITY=(byte)Integer.parseInt(b);
+         }else if(a.equals("qualiters")){
+             QUAL_ITERS=Tools.max(1, Integer.parseInt(b));
+         }else if(a.equals("maxbadbases") || a.equals("maxbad") || a.equals("mismatches")){
+             MAX_MISMATCHES=Integer.parseInt(b);
+         }else if(a.equals("maxbadbases0") || a.equals("maxbad0") || a.equals("mismatches0")){
+             MAX_MISMATCHES0=Integer.parseInt(b);
+             mm0set=true;
+         }else if(a.equals("margin")){
+             MISMATCH_MARGIN=Integer.parseInt(b);
+         }else if(a.equals("usemapping")){
+             USE_MAPPING=Tools.parseBoolean(b);
+         }else if(a.equals("bin")){
+             bin=Integer.parseInt(b);
+         }else if(a.equals("threads") || a.equals("t")){
+             THREADS=Shared.setThreads(b);
+         }else if(a.equals("reads") || a.startsWith("maxreads")){
+             maxReads=Tools.parseKMG(b);
+         }else if(a.equals("outgood") || a.equals("outmerged") || a.equals("outm") || a.equals("out")){
+             out1=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("outgood1") || a.equals("outmerged1") || a.equals("outm1") || a.equals("out1")){
+             out1=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("outgood2") || a.equals("outmerged2") || a.equals("outm2") || a.equals("out2")){
+             out2=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("outb") || a.equals("outu") || a.equals("outunmerged") || a.equals("outbad")){
+             outb1=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("outb1") || a.equals("outu1") || a.equals("outunmerged1") || a.equals("outbad1")){
+             outb1=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("outb2") || a.equals("outu2") || a.equals("outunmerged2") || a.equals("outbad2")){
+             outb2=(b==null || b.equals("null") ? null : b);
+         }else if(a.startsWith("outinsert") || a.startsWith("outi") || a.startsWith("outlength")){
+             outinsert=(b==null || b.equals("null") ? null : b);
+         }else if(a.startsWith("outhist") || a.equals("hist") || a.equals("histogram") || a.equals("ihist")){
+             ihist=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("outa") || a.equals("outadapter")){
+             outAdapter=b;
+             findAdapterSequence=(outAdapter!=null);
+         }else if(a.equals("outc") || a.equals("outcardinality")){
+             outCardinality=b;
+ //        }else if(a.equals("outputfailed")){
+ //            OUTPUT_FAILED=Tools.parseBoolean(b);outCardinality
+         }else if(a.equals("mix")){
+             MIX_BAD_AND_GOOD=Tools.parseBoolean(b);
+         }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+             NONZERO_ONLY=Tools.parseBoolean(b);
+         }else if(a.equals("showhiststats")){
+             showHistStats=Tools.parseBoolean(b);
+         }else if(a.equals("verbose")){
+             assert(false) : "verbose flag is static final; recompile to change it.";
+ //            verbose=Tools.parseBoolean(b);
+         }else if(a.equals("join") || a.equals("merge")){
+             //join and ecco are mutually exclusive; enabling one disables the other.
+             join=Tools.parseBoolean(b);
+             if(join){ecco=false;}
+         }else if(a.equals("ecco") || a.equals("ecc") || a.equals("errorcorrect")){
+             ecco=Tools.parseBoolean(b);
+             if(ecco){join=false;}
+         }else if(a.equals("tbo") || a.equals("trimbyoverlap")){
+             trimByOverlap=Tools.parseBoolean(b);
+         }else if(a.equals("useoverlap") || a.equals("usebases") || a.equals("matebyoverlap") || a.equals("matebybases")){
+             MATE_BY_OVERLAP=Tools.parseBoolean(b);
+ //        }else if(a.startsWith("skipmated")){
+ //            SKIP_MATED_READS=Tools.parseBoolean(b);
+         }else if(a.equals("lowercase")){
+             lowercaseAdapters=Tools.parseBoolean(b);
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+         }else if(a.equals("trimonfailure") || a.equals("tof")){
+             if(b!=null && Character.isDigit(b.charAt(0))){
+                 TRIM_ON_OVERLAP_FAILURE=Integer.parseInt(b);
+             }else{
+                 TRIM_ON_OVERLAP_FAILURE=(Tools.parseBoolean(b) ? 1 : 0);
+             }
+         }else if(a.equals("overlapusingquality") || a.equals("ouq")){
+             overlapUsingQuality=Tools.parseBoolean(b);
+         }else if(a.equals("overlapwithoutquality") || a.equals("owoq") || a.equals("owuq") || a.equals("owq")){
+             overlapWithoutQuality=Tools.parseBoolean(b);
+         }else if(a.equals("maxexpectederrors") || a.equals("mee") || a.equals("meefilter")){
+             //The long alias was previously spelled in camelCase and therefore unreachable.
+             maxExpectedErrors=Float.parseFloat(b);
+         }else if(a.equals("mi") || a.equals("minins") || a.equals("mininsert")){
+             minInsert=Integer.parseInt(b);
+         }else if(a.equals("mi0") || a.equals("mininsert0")){
+             minInsert0=Integer.parseInt(b);
+         }else if(a.equals("minprob")){
+             minProb=Float.parseFloat(b);
+             assert(minProb<1) : "minprob must be less than 1. At 1, even kmers with 100% probablity of being error-free will be discarded.";
+         }else if(a.equals("prealloc")){
+             prealloc=Tools.parseBoolean(b);
+         }else if(a.equals("prefilter")){
+             prefilter=Tools.parseBoolean(b);
+         }else if(a.equals("k")){
+             kmerLength=Integer.parseInt(b);
+         }else if(a.equals("efilter")){
+             //Boolean form can only switch the filter off (ratio 0); numeric form sets the ratio.
+             if(b==null || Character.isLetter(b.charAt(0))){
+                 boolean x=Tools.parseBoolean(b);
+                 if(!x){efilterRatio=0;}
+             }else{
+                 efilterRatio=Float.parseFloat(b);
+             }
+             useEfilter=efilterRatio>0;
+         }else if(a.equals("pfilter")){
+             if(b==null || Character.isLetter(b.charAt(0))){
+                 boolean x=Tools.parseBoolean(b);
+                 if(!x){pfilterRatio=0;}
+             }else{
+                 pfilterRatio=Float.parseFloat(b);
+             }
+         }else if(a.equals("efilteroffset")){
+             efilterOffset=Float.parseFloat(b);
+         }else if(a.equals("kfilter")){
+             if(b!=null && Character.isDigit(b.charAt(0))){
+                 filterCutoff=Integer.parseInt(b);
+                 useKFilter=filterCutoff>0;
+             }else{
+                 useKFilter=Tools.parseBoolean(b);
+             }
+         }else if(a.equals("usequality")){
+             useQuality=Tools.parseBoolean(b);
+         }else if(a.equals("ignorequality")){
+             useQuality=!Tools.parseBoolean(b);
+         }else if(a.equals("ordered")){
+             ordered=Tools.parseBoolean(b);
+         }else if(a.equals("samplerate")){
+             samplerate=Float.parseFloat(b);
+             assert(samplerate<=1f && samplerate>=0f) : "samplerate="+samplerate+"; should be between 0 and 1";
+         }else if(a.equals("sampleseed")){
+             sampleseed=Long.parseLong(b);
+         }else if(a.equals("recalibrate") || a.equals("recalibratequality") || a.equals("recal")){
+             recalibrateQuality=Tools.parseBoolean(b);
+         }else if(a.equals("recalpairnum") || a.equals("recalibratepairnum")){
+             CalcTrueQuality.USE_PAIRNUM=Tools.parseBoolean(b);
+         }else if(a.equals("path")){
+             Data.setPath(b);
+         }else if(a.equals("iupacton") || a.equals("itn")){
+             iupacToN=Tools.parseBoolean(b);
+         }
+
+         //Extension parameters
+
+         else if(a.equals("extendright") || a.equals("er") || a.equals("extend") || a.equals("extendright1") || a.equals("er1") || a.equals("extend1")){
+             extendRight1=(int)Tools.parseKMG(b);
+         }else if(a.equals("extendright2") || a.equals("er2") || a.equals("extend2")){
+             extendRight2=(int)Tools.parseKMG(b);
+         }else if(a.equals("extenditerations") || a.equals("iterations") || a.equals("ei") || a.equals("iters")){
+             extendIterations=Tools.max(1, (int)Tools.parseKMG(b));
+         }else if(a.equals("ecctadpole") || a.equals("ecct")){
+             eccTadpole=Tools.parseBoolean(b);
+         }else if(a.equals("shave") || a.equals("removedeadends")){
+             shave=Tools.parseBoolean(b);
+         }else if(a.equals("rinse") || a.equals("shampoo") || a.equals("removebubbles")){
+             rinse=Tools.parseBoolean(b);
+         }else if(a.equals("branchlower") || a.equals("branchlowerconst")){
+             branchLowerConst=(int)Tools.parseKMG(b);
+         }else if(a.equals("branchmult2")){
+             branchMult2=(int)Tools.parseKMG(b);
+         }else if(a.equals("branchmult1")){
+             branchMult1=(int)Tools.parseKMG(b);
+         }else if(a.equals("mincount") || a.equals("mincov") || a.equals("mindepth") || a.equals("min")){
+             minCountSeed=minCountExtend=(int)Tools.parseKMG(b);
+         }else if(a.equals("mindepthseed") || a.equals("mds") || a.equals("mincountseed") || a.equals("mcs")){
+             minCountSeed=(int)Tools.parseKMG(b);
+         }else if(a.equals("mindepthextend") || a.equals("mde") || a.equals("mincountextend") || a.equals("mce")){
+             minCountExtend=(int)Tools.parseKMG(b);
+         }else if(a.equals("ilb") || a.equals("ignoreleftbranches") || a.equals("ignoreleftjunctions") || a.equals("ibb") || a.equals("ignorebackbranches")){
+             extendThroughLeftJunctions=Tools.parseBoolean(b);
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+ //    assert(false) : ecco;
+
+     //minInsert can never be below the minimum overlap; minInsert0 defaults to a capped fraction of it.
+     minInsert=Tools.max(minInsert, MIN_OVERLAPPING_BASES);
+     if(minInsert0<1){
+         minInsert0=(Tools.max((int)(minInsert*0.75), 5, MIN_OVERLAPPING_BASES_0));
+         int cap=(loose ? 50 : 35);
+         minInsert0=Tools.min(cap, minInsert0);
+     }
+     minInsert0=Tools.min(minInsert, minInsert0);
+
+     if(MATE_BY_OVERLAP && !useNormalMode && !useRatioMode){
+         System.err.println("\n*** WARNING! Both normal and ratio mode were disabled; using normal mode. ***\n");
+         useNormalMode=true;
+     }
+
+     loglog=(outCardinality==null ? null : new LogLog(1999, 8, 31, -1));
+
+     {//Process parser fields
+         Parser.processQuality();
+
+         qtrimLeft=parser.qtrimLeft;
+         qtrimRight=parser.qtrimRight;
+         trimq=(parser.trimq2!=null ? parser.trimq2 : new byte[] {parser.trimq});
+         qtrim1=parser.qtrim1;
+         qtrim2=(parser.qtrim2 || (parser.trimq2!=null && parser.trimq2.length>1));
+         if(qtrim1==false && qtrim2==false){
+             qtrim1=((qtrimLeft||qtrimRight)&&trimq[0]>=0);
+         }
+         minAvgQuality=parser.minAvgQuality;
+         minAvgQualityBases=parser.minAvgQualityBases;
+         minReadLength=Tools.max(1, parser.minReadLength);
+         maxReadLength=(parser.maxReadLength<0 ? Integer.MAX_VALUE : parser.maxReadLength);
+ //        untrim=parser.untrim;
+
+         forceTrimModulo=parser.forceTrimModulo;
+         forceTrimLeft=parser.forceTrimLeft;
+         forceTrimRight=parser.forceTrimRight;
+         forceTrimRight2=parser.forceTrimRight2;
+     }
+     parseCustom=FASTQ.PARSE_CUSTOM;
+     if(verbose){
+ //        assert(false) : "verbose flag is static final; recompile to change it.";
+ //        BBMergeOverlapper.verbose=true;
+     }
+
+     //Trimming by overlap keeps the reads separate, so it overrides joining.
+     if(trimByOverlap){
+         join=false;
+     }
+
+     if(!mm0set){
+         MAX_MISMATCHES0=MAX_MISMATCHES+(loose ? 2 : 0);
+     }
+
+     if(MAX_MISMATCHES0<MAX_MISMATCHES){
+         MAX_MISMATCHES0=MAX_MISMATCHES+(loose ? 2 : 0);
+         System.err.println("MAX_MISMATCHES0 was set to "+MAX_MISMATCHES0+" to remain >=MAX_MISMATCHES");
+     }
+
+     if(MISMATCH_MARGIN>MAX_MISMATCHES){
+         MISMATCH_MARGIN=MAX_MISMATCHES;
+         //The margin is clamped down to MAX_MISMATCHES, i.e. it stays <= that limit.
+         System.err.println("MISMATCH_MARGIN was set to "+MISMATCH_MARGIN+" to remain <=MAX_MISMATCHES");
+     }
+
+     if(recalibrateQuality){CalcTrueQuality.initializeMatrices();}
+
+     if(findAdapterSequence){
+         for(int i=0; i<adapterCounts.length; i++){
+             for(int j=0; j<adapterCounts[i].length; j++){
+                 adapterCounts[i][j]=new LongList(150);
+             }
+         }
+     }
+
+     //A '#' in a file name stands for the pair number and is expanded to 1 and 2.
+     if(in2==null && in1!=null && in1.contains("#") && !new File(in1).exists()){
+         in2=in1.replaceFirst("#", "2");
+         in1=in1.replaceFirst("#", "1");
+     }
+
+     if(out2==null && out1!=null && out1.contains("#")){
+         out2=out1.replaceFirst("#", "2");
+         out1=out1.replaceFirst("#", "1");
+     }
+
+     if(outb2==null && outb1!=null && outb1.contains("#")){
+         outb2=outb1.replaceFirst("#", "2");
+         outb1=outb1.replaceFirst("#", "1");
+     }
+
+     //Tadpole is only created when extension, the kmer filter, or Tadpole error correction is requested.
+     if(extendRight1>0 || extendRight2>0 || useKFilter || eccTadpole){
+         ArrayList<String> list=new ArrayList<String>();
+         list.add("in1="+in1);
+         list.add("in2="+in2);
+         if(extra.size()>0){
+             StringBuilder sb=new StringBuilder("in=");
+             String comma="";
+             for(String s : extra){
+                 sb.append(comma);
+                 sb.append(s);
+                 comma=",";
+             }
+             list.add(sb.toString());
+         }
+         list.add("branchlower="+branchLowerConst);
+         list.add("branchmult1="+branchMult1);
+         list.add("branchmult2="+branchMult2);
+         list.add("mincountseed="+minCountSeed);
+         list.add("mincountextend="+minCountExtend);
+         list.add("minprob="+minProb);
+         list.add("k="+kmerLength);
+         list.add("prealloc="+prealloc);
+         list.add("prefilter="+prefilter);
+         tadpole=Tadpole.makeTadpole(list.toArray(new String[0]), false);
+     }else{
+         tadpole=null;
+     }
+
+     if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outinsert, ihist, outCardinality, outAdapter)){
+         //List every file that was tested so the offending one can be identified.
+         throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+
+                 out1+", "+out2+", "+outb1+", "+outb2+", "+outinsert+", "+ihist+", "+outCardinality+", "+outAdapter+"\n");
+     }
+     if(!Tools.testInputFiles(false, true, in1, in2)){
+         throw new RuntimeException("\nCan't read some input files.\n");
+     }
+     if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2, outb1, outb2, outinsert, ihist)){
+         throw new RuntimeException("\nSome file names were specified multiple times.\n");
+     }
+
+     //Two input files means the reads are not interleaved; a single file is forced to be read as interleaved.
+     if(in2!=null){
+         assert(!in2.equalsIgnoreCase(in1));
+         FASTQ.TEST_INTERLEAVED=false;
+         FASTQ.FORCE_INTERLEAVED=false;
+     }else{
+         FASTQ.TEST_INTERLEAVED=true;
+         FASTQ.FORCE_INTERLEAVED=true;
+     }
+
+     if(THREADS<1){THREADS=Shared.threads();}
+
+     useMEEfilter=maxExpectedErrors>0;
+
+     Read.VALIDATE_IN_CONSTRUCTOR=(THREADS<16);
+ }
+
+ /**
+  * Runs the complete job: optionally loads (and shaves/rinses) Tadpole kmers, runs the
+  * merge phase, writes the insert-size histogram plus the optional adapter and cardinality
+  * files, and prints the summary statistics to stderr.
+  */
+ void process(){
+     final Timer overallTimer=new Timer();
+     overallTimer.start();
+
+     if(tadpole!=null){
+         final Timer loadTimer=new Timer();
+         Tadpole.showSpeed=false;
+         KmerTableSet.showSpeed=false;
+         tadpole.loadKmers(loadTimer);
+         loadTimer.stop();
+         System.err.println();
+
+         if(shave || rinse){
+             loadTimer.start();
+             tadpole.shaveAndRinse(loadTimer, shave, rinse, true);
+             loadTimer.stop();
+             System.err.println();
+         }
+     }
+
+     runPhase(join, maxReads, false);
+
+     final double stdev=(histTotal==null ? 0 : Tools.standardDeviationHistogram(histTotal));
+
+     //Multipliers that turn a raw count into a percentage of all pairs / of merged pairs.
+     final long merged=correctCountTotal+incorrectCountTotal;
+     final double perPair=100d/readsProcessedTotal;
+     final double perMerged=100d/merged;
+
+     writeHistogram(ihist, merged*perPair);
+
+     if(outAdapter!=null){
+         assert(findAdapterSequence);
+         writeAdapterConsensus(outAdapter, adapterCounts);
+     }
+
+     if(outCardinality!=null){
+         ReadWrite.writeString(loglog.cardinality()+"\n", outCardinality);
+     }
+
+     overallTimer.stop();
+     System.err.println("Total time: "+overallTimer+"\n");
+
+     System.err.println("Pairs: \t"+readsProcessedTotal);
+     System.err.println("Joined: \t"+countWithPercent(merged, perPair));
+     System.err.println("Ambiguous: \t"+countWithPercent(ambiguousCountTotal, perPair));
+     System.err.println("No Solution: \t"+countWithPercent(noSolutionCountTotal, perPair));
+     if(minInsert>0){
+         System.err.println("Too Short: \t"+countWithPercent(tooShortCountTotal, perPair));
+     }
+     if(maxReadLength<Integer.MAX_VALUE){
+         System.err.println("Too Long: \t"+countWithPercent(tooLongCountTotal, perPair));
+     }
+
+     if(extendRight1>0 || extendRight2>0){
+         final double perAttempt=100d/extensionsAttempted;
+         System.err.println("Fully Extended: \t"+countWithPercent(fullyExtendedTotal, perAttempt));
+         System.err.println("Partly Extended: \t"+countWithPercent(partlyExtendedTotal, perAttempt));
+         System.err.println("Not Extended: \t"+countWithPercent(notExtendedTotal, perAttempt));
+     }
+
+     if(parseCustom){
+         System.err.println();
+         System.err.println("Correct: \t"+countWithPercent(correctCountTotal, perPair)
+                 +String.format(" \t%.3f%% of merged", correctCountTotal*perMerged));
+         System.err.println("Incorrect: \t"+countWithPercent(incorrectCountTotal, perPair)
+                 +String.format(" \t%.3f%% of merged", incorrectCountTotal*perMerged));
+         //Both counts are floored at 0.001 so the ratio is always finite, then clamped to [-20, 80] dB.
+         final double ratio=Tools.max(correctCountTotal, 0.001)/(Tools.max(incorrectCountTotal, 0.001));
+         final double ratioDB=Tools.mid(-20, 80, 10*Math.log10(ratio));
+         System.err.println("SNR: \t"+String.format("%.3f dB", ratioDB));
+         System.err.println();
+         System.err.println("Avg Insert Correct: \t"+String.format("%.1f", (insertSumCorrectTotal)*1d/(correctCountTotal)));
+         System.err.println("Avg Insert Incorrect:\t"+String.format("%.1f", (insertSumIncorrectTotal)*1d/(incorrectCountTotal)));
+     }
+
+     System.err.println("\nAvg Insert: \t"+String.format("%.1f", (insertSumCorrectTotal+insertSumIncorrectTotal)*1d/(correctCountTotal+incorrectCountTotal)));
+     System.err.println("Standard Deviation: \t"+String.format("%.1f", stdev));
+     System.err.println("Mode: \t"+Tools.calcMode(histTotal));
+
+     System.err.println();
+     System.err.println("Insert range: \t"+insertMinTotal+" - "+insertMaxTotal);
+     System.err.println("90th percentile: \t"+Tools.percentile(histTotal, .9));
+     System.err.println("75th percentile: \t"+Tools.percentile(histTotal, .75));
+     System.err.println("50th percentile: \t"+Tools.percentile(histTotal, .5));
+     System.err.println("25th percentile: \t"+Tools.percentile(histTotal, .25));
+     System.err.println("10th percentile: \t"+Tools.percentile(histTotal, .1));
+ }
+
+ /** Formats a count followed by its share, e.g. "123 \t4.567%"; scale converts the count to a percentage. */
+ private static String countWithPercent(long count, double scale){
+     return count+String.format((count<10000 ? " " : " ")+"\t%.3f%%", count*scale);
+ }
+
+ /**
+  * Writes the binned insert-size histogram (histTotal) to a file as tab-delimited text.
+  * Each output row holds the first insert size of a bin and the average count across that
+  * bin, rounded up. Optional '#'-prefixed summary rows come first when showHistStats is set.
+  *
+  * @param fname Output file name; nothing is written when null.
+  * @param percentMerged Value reported on the "#PercentOfPairs" header line.
+  */
+ public static void writeHistogram(String fname, double percentMerged){
+     if(fname==null){return;}
+     StringBuilder sb=new StringBuilder();
+
+     if(showHistStats){
+         sb.append("#Mean\t"+String.format("%.3f", Tools.averageHistogram(histTotal))+"\n");
+         sb.append("#Median\t"+Tools.percentile(histTotal, 0.5)+"\n");
+         sb.append("#Mode\t"+Tools.calcMode(histTotal)+"\n");
+         sb.append("#STDev\t"+String.format("%.3f", Tools.standardDeviationHistogram(histTotal))+"\n");
+         sb.append("#PercentOfPairs\t"+String.format("%.3f", percentMerged)+"\n");
+     }
+     sb.append("#InsertSize\tCount\n");
+
+     //A non-positive bin width would never advance the loop; fall back to single-size bins.
+     final int step=Tools.max(1, bin);
+     for(int i=0; i<histTotal.length && i<=insertMaxTotal; i+=step){
+         long total=0;//long, so large per-bin counts cannot overflow
+         int width=0;
+         for(int j=i; j<i+step && j<histTotal.length; j++){
+             total+=histTotal[j];
+             width++;
+         }
+         //Ceiling of total/width. Using the actual width (not the nominal bin size) keeps a
+         //truncated final bin from reporting a non-zero count when it is empty.
+         final long average=(total+width-1)/width;
+         if(average>0 || !NONZERO_ONLY){
+             sb.append(i+"\t"+average+"\n");
+         }
+     }
+     ReadWrite.writeStringInThread(sb, fname);
+ }
+
+ public static void writeAdapterConsensus(String fname, LongList[][] matrix){
+ StringBuilder sb=new StringBuilder();
+ {
+ sb.append(">Read1_adapter\n");
+ StringBuilder adapter=new StringBuilder();
+ LongList[] lists=matrix[0];
+ long max=0;
+ int lastBase=-1;
+ for(int i=0; true; i++){
+ long a=lists[0].get(i);
+ long c=lists[1].get(i);
+ long g=lists[2].get(i);
+ long t=lists[3].get(i);
+ long sum=(a+c+g+t);
+ max=Tools.max(max, sum);
+ if(sum==0 || (sum<10 && sum<=max/1000) || (max>100 && sum<8)){break;}
+ long thresh=(max>100 ? 4+(sum*2)/3 : (sum*2)/3);
+ if(a>thresh){
+ adapter.append('A');
+ lastBase=i;
+ }else if(c>thresh){
+ adapter.append('C');
+ lastBase=i;
+ }else if(g>thresh){
+ adapter.append('G');
+ lastBase=i;
+ }else if(t>thresh){
+ adapter.append('T');
+ lastBase=i;
+ }else{
+ adapter.append('N');
+ }
+ }
+ if(lastBase<0){sb.append('N');}
+ else{
+ for(int i=0; i<=lastBase; i++){
+ sb.append(adapter.charAt(i));
+ }
+ }
+ sb.append('\n');
+ }
+ if(matrix.length>1){
+ sb.append(">Read2_adapter\n");
+ StringBuilder adapter=new StringBuilder();
+ LongList[] lists=matrix[1];
+ long max=0;
+ int lastBase=-1;
+ for(int i=0; true; i++){
+ long a=lists[0].get(i);
+ long c=lists[1].get(i);
+ long g=lists[2].get(i);
+ long t=lists[3].get(i);
+ long sum=(a+c+g+t);
+ max=Tools.max(max, sum);
+ if(sum==0 || (sum<10 && sum<=max/1000) || (max>100 && sum<8)){break;}
+ long thresh=(max>100 ? 5+(sum*2)/3 : (sum*2)/3);
+ if(a>thresh){
+ adapter.append('A');
+ lastBase=i;
+ }else if(c>thresh){
+ adapter.append('C');
+ lastBase=i;
+ }else if(g>thresh){
+ adapter.append('G');
+ lastBase=i;
+ }else if(t>thresh){
+ adapter.append('T');
+ lastBase=i;
+ }else{
+ adapter.append('N');
+ }
+ }
+ if(lastBase<0){sb.append('N');}
+ else{
+ for(int i=0; i<=lastBase; i++){
+ sb.append(adapter.charAt(i));
+ }
+ }
+ sb.append('\n');
+ }
+ ReadWrite.writeString(sb, fname);
+ }
+
+ /**
+  * Runs one merging pass: opens the requested output streams and the input stream, starts
+  * THREADS MateThread workers, waits for them, and sums their per-thread statistics,
+  * histograms and adapter counts into the instance totals before closing all streams.
+  *
+  * @param join True to write merged reads; if a second output file was given it is ignored.
+  * @param maxReads Passed to the input stream as the read limit.
+  * @param perfectonly Not used by this method.
+  */
+ public void runPhase(boolean join, long maxReads, boolean perfectonly){
+
+     Timer talign=new Timer();
+
+     ConcurrentReadOutputStream rosgood=null;
+     ConcurrentReadOutputStream rosbad=null;
+     ConcurrentReadOutputStream rosinsert=null;
+
+     if(out1!=null){
+         if(join==true){
+             if(out2==null){System.err.println("Writing mergable reads merged.");}
+             else{
+                 System.err.println("WARNING: 2 output files specified even though 'merge=true'. out2 will be ignored.");
+                 out2=null;
+             }
+         }else{
+             if(out2==null){System.err.println("Writing mergable reads interleaved.");}
+             else{System.err.println("Writing mergable reads unmerged in two files.");}
+         }
+
+         final FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, ordered);
+         final FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, ordered);
+         assert(!ff1.samOrBam()) : "Sam files need reference info for the header.";
+
+         final int buff=Tools.max(16, 2*THREADS);
+         rosgood=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, false);
+         rosgood.start();
+     }
+
+     if(outb1!=null){
+
+         final FileFormat ff1=FileFormat.testOutput(outb1, FileFormat.FASTQ, null, true, overwrite, append, ordered);
+         final FileFormat ff2=FileFormat.testOutput(outb2, FileFormat.FASTQ, null, true, overwrite, append, ordered);
+         assert(!ff1.samOrBam()) : "Sam files need reference info for the header.";
+
+         final int buff=Tools.max(16, 2*THREADS);
+         rosbad=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, false);
+         rosbad.start();
+     }
+
+     if(outinsert!=null){
+         final int buff=Tools.max(16, 2*THREADS);
+
+         //Named so it does not shadow the out1 field.
+         final String insertFile=outinsert.replaceFirst("#", "1");
+
+         //The insert file must not overwrite either input (the check used to test in1 twice).
+         assert(!insertFile.equalsIgnoreCase(in1) && !insertFile.equalsIgnoreCase(in2));
+
+         ReadStreamWriter.HEADER=header();
+         final FileFormat ff=FileFormat.testOutput(insertFile, FileFormat.ATTACHMENT, ".info", true, overwrite, append, ordered);
+         rosinsert=ConcurrentReadOutputStream.getStream(ff, null, null, null, buff, null, false);
+         rosinsert.start();
+     }
+
+
+     if(rosgood!=null || rosbad!=null || rosinsert!=null){
+         System.err.println("Started output threads.");
+     }
+
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+         cris.setSampleRate(samplerate, sampleseed);
+         if(verbose){System.err.println("Started cris");}
+         cris.start(); //4567
+     }
+     boolean paired=cris.paired();
+ //    assert(paired);//Fails on empty files.
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     talign.start();
+
+
+     MateThread[] pta=new MateThread[THREADS];
+     for(int i=0; i<pta.length; i++){
+         pta[i]=new MateThread(cris, rosgood, rosbad, rosinsert, join, trimByOverlap);
+         pta[i].start();
+     }
+
+     //Reset all totals before accumulating the per-thread results.
+     insertMinTotal=999999999;
+     insertMaxTotal=0;
+
+     readsProcessedTotal=0;
+     matedCountTotal=0;
+     correctCountTotal=0;
+     ambiguousCountTotal=0;
+     tooShortCountTotal=0;
+     tooLongCountTotal=0;
+     incorrectCountTotal=0;
+     noSolutionCountTotal=0;
+     insertSumCorrectTotal=0;
+     insertSumIncorrectTotal=0;
+
+     //The extension counters are accumulated below as well, so they are reset with the rest.
+     fullyExtendedTotal=0;
+     partlyExtendedTotal=0;
+     notExtendedTotal=0;
+     extensionsAttempted=0;
+
+     Arrays.fill(histTotal, 0);
+
+     for(int i=0; i<pta.length; i++){
+         MateThread ct=pta[i];
+         synchronized(ct){
+             //Wait for this worker to finish; an interrupt is logged and the wait resumes.
+             while(ct.isAlive()){
+                 try {
+                     ct.join(1000);
+                 } catch (InterruptedException e) {
+                     e.printStackTrace();
+                 }
+             }
+
+             readsProcessedTotal+=ct.pairsProcessed;
+             matedCountTotal+=ct.matedCount;
+             correctCountTotal+=ct.correctCount;
+             ambiguousCountTotal+=ct.ambiguousCount;
+             tooShortCountTotal+=ct.tooShortCount;
+             tooLongCountTotal+=ct.tooLongCount;
+             incorrectCountTotal+=ct.incorrectCount;
+             noSolutionCountTotal+=ct.noSolutionCount;
+             insertSumCorrectTotal+=ct.insertSumCorrect;
+             insertSumIncorrectTotal+=ct.insertSumIncorrect;
+
+             fullyExtendedTotal+=ct.fullyExtendedT;
+             partlyExtendedTotal+=ct.partlyExtendedT;
+             notExtendedTotal+=ct.notExtendedT;
+             extensionsAttempted+=ct.extensionsAttemptedT;
+
+             insertMinTotal=Tools.min(ct.insertMin, insertMinTotal);
+             insertMaxTotal=Tools.max(ct.insertMax, insertMaxTotal);
+
+ //            System.err.println(ct.insertMin+", "+ct.insertMax);
+
+             if(ct.hist!=null){
+                 for(int h=0; h<ct.hist.length; h++){
+                     histTotal[h]+=ct.hist[h];
+                 }
+             }
+
+             if(findAdapterSequence){
+                 LongList[][] adapterCountsT=ct.adapterCountsT;
+                 for(int x=0; x<adapterCounts.length; x++){
+                     for(int y=0; y<adapterCounts[x].length; y++){
+                         adapterCounts[x][y].add(adapterCountsT[x][y]);
+                     }
+                 }
+             }
+         }
+     }
+
+ //    System.err.println("Finished reading");
+     errorState|=ReadWrite.closeStreams(cris, rosgood, rosbad, rosinsert);
+
+     talign.stop();
+ //    System.err.println("Align time: "+talign);
+ }
+
+ /**
+  * Estimates the fraction of read pairs that overlap, using the histogram produced by
+  * makeInsertHistogram (slot 0 counts the pairs for which no overlap was found).
+  *
+  * @return (total - hist[0]) / total, or 0 when no usable histogram or no pairs are available.
+  */
+ public static final float mergeableFraction(String fname1, String fname2, long numReads, float samplerate){
+     final long[] insertCounts=makeInsertHistogram(fname1, fname2, numReads, samplerate);
+     if(insertCounts!=null && insertCounts.length>=2){
+         final long total=Tools.sum(insertCounts);
+         if(total>=1){
+             return (total-insertCounts[0])/(float)total;
+         }
+     }
+     return 0;
+ }
+
+ /**
+  * Builds an insert-size histogram for paired input by running findOverlapLoose on every pair.
+  * Index x of the result counts pairs with insert size x; index 0 counts pairs for which no
+  * positive insert size was found.
+  *
+  * @param fname1 First input file; must not be null.
+  * @param fname2 Second input file, passed on to the input stream together with fname1.
+  * @param numReads Passed to the input stream as the read limit.
+  * @param samplerate Sampling rate handed to the input stream (with a fixed seed of 1).
+  * @return The histogram, or null when the input is stdin or is not paired.
+  */
+ public static final long[] makeInsertHistogram(String fname1, String fname2, long numReads, float samplerate){
+     assert(fname1!=null);
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true);
+         //Standard in is not allowed as input when calculating insert size distributions for files.
+         if(ff1.stdio()){return null;}
+         cris=ConcurrentReadInputStream.getReadInputStream(numReads, true, ff1, ff2);
+         cris.setSampleRate(samplerate, 1);
+         if(verbose){System.err.println("Started cris");}
+         cris.start(); //4567
+         if(!cris.paired()){
+             ReadWrite.closeStreams(cris);
+             return null;
+         }
+     }
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     if(reads!=null && !reads.isEmpty()){
+         Read r=reads.get(0);
+         assert(r.mate!=null);
+     }
+
+     LongList ll=new LongList(500);
+     while(reads!=null && reads.size()>0){
+
+         for(Read r1 : reads){
+             int x=findOverlapLoose(r1, r1.mate, false);
+             if(x>0){ll.increment(x, 1);}
+             else{ll.increment(0, 1);}
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     //The loop also ends when nextList() returned null or a list-less ListNum, so guard both
+     //before handing the final list back.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     ReadWrite.closeStreams(cris);
+     return ll.toArray();
+ }
+
+ /**
+  * Overlap detection with a fixed, conservative parameter set.
+  * Delegates to findOverlap.
+  * @return The insert size as calculated by overlap, or -1
+  */
+ public static final int findOverlapStrict(final Read r1, final Read r2, boolean ecc){
+     return findOverlap(r1, r2, ecc,
+             8,      //minOverlap
+             4,      //minOverlap0
+             50,     //minInsert
+             35,     //minInsert0
+             42,     //entropy
+             0.06f,  //maxRatio
+             10f,    //ratioMargin
+             0.5f,   //ratioOffset
+             2f,     //efilterRatio
+             0.45f,  //efilterOffset
+             0.008f  //pfilterRatio
+             );
+ }
+
+ /**
+  * Overlap detection with a fixed, permissive parameter set.
+  * Delegates to findOverlap.
+  * @return The insert size as calculated by overlap, or -1
+  */
+ public static final int findOverlapLoose(final Read r1, final Read r2, boolean ecc){
+     return findOverlap(r1, r2, ecc,
+             5,          //minOverlap
+             6,          //minOverlap0
+             16,         //minInsert
+             16,         //minInsert0
+             28,         //entropy
+             0.12f,      //maxRatio
+             3f,         //ratioMargin
+             0.45f,      //ratioOffset
+             7.5f,       //efilterRatio
+             0.55f,      //efilterOffset
+             0.000004f   //pfilterRatio
+             );
+ }
+
+ /**
+ * Determines the insert size of a read pair from the overlap of r1 with r2.
+ * r2 is reverse-complemented in place for the duration of the call and restored before a
+ * normal return. On an unambiguous result with insert > -1 the insert is also stored on r1
+ * via setInsert().
+ * NOTE(review): the minOverlap argument is overwritten before it is read (from the entropy
+ * calculation or MIN_OVERLAPPING_BASES), so the value passed by callers has no effect.
+ * @param r1 First read; must not be null.
+ * @param r2 Second read (mate); must not be null.
+ * @param ecc If true, error-correct both reads from the overlap when a usable insert is found.
+ * @param entropy If positive, used to derive the minimum overlap from the read sequences.
+ * @return The insert size as calculated by overlap, or -1 if none was found or it was ambiguous
+ */
+ public static final int findOverlap(final Read r1, final Read r2, final boolean ecc,
+ int minOverlap, final int minOverlap0, final int minInsert, final int minInsert0, final int entropy,
+ final float maxRatio, final float ratioMargin, final float ratioOffset,
+ final float efilterRatio, final float efilterOffset, final float pfilterRatio){
+
+ assert(r1!=null && r2!=null);
+ if(!r1.validated()){r1.validate(true);}
+ if(!r2.validated()){r2.validate(true);}
+
+ final int len1=r1.length(), len2=r2.length();
+ final int minlen=Tools.min(len1, len2);
+
+ // The shorter read must be able to hold both the minimum overlap and the minimum insert.
+ if(minlen<MIN_OVERLAPPING_BASES || minlen<minInsert){
+ return -1;
+ }
+
+ // Reuse a scratch array obtained from localRvector (presumably a ThreadLocal - confirm), created on first use.
+ int[] rvector=localRvector.get();
+ if(rvector==null){
+ rvector=new int[5];
+ localRvector.set(rvector);
+ }
+
+ // r2 is flipped here and flipped back near the end of the method.
+ r2.reverseComplement();
+
+ int bestInsert=-1;
+ int bestBad=999999;
+ boolean ambig, tooShort=false;
+
+ // When both reads are mapped to the same chrom, take the insert size from the mapping instead of the overlap.
+ if(USE_MAPPING && r1.chrom==r2.chrom && r1.start<r1.stop && r1.mapped() && r2.mapped()){
+ bestBad=0;
+ bestInsert=Read.insertSizeMapped(r1, r2, ignoreMappingStrand);
+ ambig=false;
+ }else{
+ if(entropy>0){
+ int a=BBMergeOverlapper.calcMinOverlapByEntropy(r1.bases, 3, null, entropy);
+ int b=BBMergeOverlapper.calcMinOverlapByEntropy(r2.bases, 3, null, entropy);
+ minOverlap=Tools.max(MIN_OVERLAPPING_BASES, Tools.max(a, b));
+ }else{minOverlap=MIN_OVERLAPPING_BASES;}
+ if(verbose){System.err.println("minOverlap: "+minOverlap);}
+
+ rvector[4]=0;
+
+ // After this call rvector[2] is read as the mismatch count and rvector[4]==1 as the ambiguity flag.
+ int x=BBMergeOverlapper.mateByOverlapRatio(r1, r2, null, null, rvector, minOverlap0, minOverlap,
+ minInsert0, minInsert, maxRatio, ratioMargin, ratioOffset, 0.95f, 0.95f, false);
+ bestInsert=x;
+ bestBad=rvector[2];
+ ambig=(x>-1 ? rvector[4]==1 : false);
+ }
+
+ // efilter: mark the result ambiguous when the observed mismatches exceed (expected+efilterOffset)*efilterRatio.
+ //TODO: Crucial! This line can vastly reduce merge rate, particularly if quality values are inaccurate.
+ if(bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null){
+ float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
+ if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;}
+ if(verbose){System.err.println("Result after efilter: \tinsert="+bestInsert+", bad="+bestBad+", ambig="+ambig);}
+ }
+
+ // pfilter: discard the insert (set to -1) when the overlap probability is below pfilterRatio.
+ //TODO: Crucial! This line can vastly reduce merge rate, particularly if quality values are inaccurate.
+ if(pfilterRatio>0 && bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null){
+ float probability=BBMergeOverlapper.probability(r1, r2, bestInsert);
+ if(probability<pfilterRatio){bestInsert=-1;}
+ if(verbose){System.err.println("Result after pfilter: \tinsert="+bestInsert+", bad="+bestBad+", ambig="+ambig);}
+ }
+
+ // tooShort only suppresses error correction below; a too-short insert is still stored and returned.
+ tooShort=(!ambig && bestInsert>0 && bestInsert<minInsert);
+
+ if(ecc && bestInsert>-1 && !ambig && !tooShort){
+ errorCorrectWithInsert(r1, r2, bestInsert);
+ }
+
+ // Restore r2 to its original orientation.
+ if(r2!=null){r2.reverseComplement();}
+ if(!ambig && bestInsert>-1){r1.setInsert(bestInsert);}
+
+ return ambig ? -1 : bestInsert;
+ }
+
+ /**
+  * Overwrites both reads with the matching ends of the read returned by r1.joinRead(insert).
+  * r1 receives the leading bases and r2 the trailing bases, each capped at its own current length.
+  * Qualities are replaced the same way, but only for a read that already has a quality array.
+  * Callers in this file reverse-complement r2 before calling and flip it back afterwards.
+  * @param r1 Read 1
+  * @param r2 Read 2
+  * @param insert Insert size to join at; must be positive
+  * @return Always 0; no correction count is tracked
+  */
+ public static int errorCorrectWithInsert(Read r1, Read r2, int insert){
+     assert(insert>0);
+     final Read merged=r1.joinRead(insert);
+
+     //Nothing to copy back if the join failed or produced an empty read
+     if(merged==null || merged.length()<=0){return 0;}
+
+     final int mergedLen=merged.length();
+     final int headEnd=Tools.min(mergedLen, r1.length());
+     final int tailStart=mergedLen-Tools.min(r2.length(), mergedLen);
+
+     //Leading slice goes to r1
+     r1.bases=Arrays.copyOfRange(merged.bases, 0, headEnd);
+     if(r1.quality!=null){
+         r1.quality=Arrays.copyOfRange(merged.quality, 0, headEnd);
+     }
+
+     //Trailing slice goes to r2
+     r2.bases=Arrays.copyOfRange(merged.bases, tailStart, mergedLen);
+     if(r2.quality!=null){
+         r2.quality=Arrays.copyOfRange(merged.quality, tailStart, mergedLen);
+     }
+     return 0;
+ }
+
+ /**
+  * Returns the tab-delimited, newline-terminated column header.
+  * The columns correspond to the per-read fields assembled in processReads
+  * (id, numeric id, insert, status letter, mismatch count).
+  */
+ public static String header(){
+     final String columns="#id\tnumericID\tinsert\tstatus\tmismatches\n";
+     return columns;
+ }
+
+ private void qtrim(Read r1, Read r2, int iter){
+ if(false /*untrim*/){
+ TrimRead.trim(r1, qtrimLeft, qtrimRight, trimq[iter], 1);
+ TrimRead.trim(r2, qtrimLeft, qtrimRight, trimq[iter], 1);
+ }else{
+ TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq[iter], 1);
+ TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq[iter], 1);
+ }
+ }
+
+ private class MateThread extends Thread{
+
+
+ /**
+  * Creates a worker bound to one input stream and up to three output streams.
+  * Allocates the kmer-count scratch array only when entropy is in use, and the
+  * per-thread adapter count lists only when adapter discovery is enabled.
+  * @param cris_ Source of read lists
+  * @param rosgood_ Stream fed from the "good" list in processReads; may be null
+  * @param rosbad_ Stream fed from the "bad" list in processReads; may be null
+  * @param rosi_ Stream receiving the per-read insert report; may be null
+  * @param joinReads_ Whether pairs with a positive insert are joined into one read
+  * @param trimByOverlap_ Whether reads are trimmed to the detected insert size
+  */
+ public MateThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream rosgood_, ConcurrentReadOutputStream rosbad_, ConcurrentReadOutputStream rosi_,
+         boolean joinReads_, boolean trimByOverlap_) {
+     this.cris=cris_;
+     this.rosgood=rosgood_;
+     this.rosbad=rosbad_;
+     this.rosi=rosi_;
+     this.joinReads=joinReads_;
+     this.trimReadsByOverlap=trimByOverlap_;
+
+     //One counter per possible kmer of length entropyK (4^entropyK entries)
+     this.kmerCounts=(useEntropy ? new short[1<<(2*entropyK)] : null);
+
+     if(findAdapterSequence){
+         for(LongList[] row : adapterCountsT){
+             for(int col=0; col<row.length; col++){
+                 row[col]=new LongList(150);
+             }
+         }
+     }
+ }
+
+
+ /** Thread entry point; all of the work is done by processReads(). */
+ @Override
+ public void run(){
+ processReads();
+ }
+
+ /**
+  * Main worker loop: fetches lists of read pairs from cris, runs findOverlapInThread on every pair,
+  * and forwards the resulting lists to whichever output streams are non-null.
+  * When rosi is set, a tab-delimited report line (see header()) is attached to each read's obj field
+  * before the input list is sent to rosi.
+  * Fix: the final returnList call is now skipped when the last fetched list is null; previously a null
+  * list from cris.nextList() caused a NullPointerException after the loop.
+  */
+ private void processReads() {
+     assert(USE_MAPPING || MATE_BY_OVERLAP);
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     if(reads!=null && !reads.isEmpty()){
+         Read r=reads.get(0);
+         assert(r.mate!=null);
+     }
+
+     //Scratch slots for the pre-merge bases/qualities; only needed when unmerged reads may be written out
+     final byte[][] originals=(rosbad!=null || (rosgood!=null && (!join || MIX_BAD_AND_GOOD))) ? new byte[4][] : null;
+     while(reads!=null && reads.size()>0){
+
+         ArrayList<Read> listg=(rosgood==null /*&& rosi==null*/ ? null : new ArrayList<Read>(reads.size()));
+         ArrayList<Read> listb=(rosbad==null ? null : new ArrayList<Read>(reads.size()));
+
+         if(loglog!=null){
+             for(Read r1 : reads){loglog.hash(r1);}
+         }
+
+         //Results are routed into listg/listb and recorded on the reads; the return value is not needed here
+         for(Read r1 : reads){
+             findOverlapInThread(r1, originals, listg, listb);
+         }
+
+         if(rosgood!=null){rosgood.add(listg, ln.id);}
+         if(rosi!=null){
+             //This prints both merged and unmerged reads
+             for(Read r1 : reads){//Legacy outinsert support
+                 StringBuilder sb=new StringBuilder(40);
+                 sb.append(r1.id==null ? r1.numericID+"" : r1.id).append('\t');
+                 sb.append(r1.numericID).append('\t');
+                 final int bestInsert=r1.insert();
+                 sb.append(bestInsert>=0 ? bestInsert : -1);
+                 sb.append('\t');
+
+                 if(bestInsert==RET_NO_SOLUTION){sb.append('F');}//Failed
+                 else if(bestInsert==RET_AMBIG){sb.append('A');} //Ambiguous
+                 else if(bestInsert==RET_SHORT){sb.append('S');} //Short
+                 else{
+                     if(r1.errors>0){sb.append('I');}//Imperfect
+                     else{sb.append('P');}//Perfect
+                     sb.append('\t');
+                     sb.append(r1.errors);
+                 }
+                 r1.obj=sb;
+             }
+             rosi.add(reads, ln.id);
+         }
+         if(rosbad!=null){rosbad.add(listb, ln.id);}
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+
+     //Hand back the terminating list, if there is one; ln is null when the stream returned no list
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+ }
+
+ /**
+ * Processes one read pair: determines the insert size, updates this thread's statistics,
+ * optionally joins / error-corrects / trims the reads, and routes the result to listg or listb.
+ * The final result code is also stored on r1 via setInsert.
+ * @param r1 Read 1; its mate is used as read 2
+ * @param originals Scratch array of 4 slots (r1 bases, r1 quality, r2 bases, r2 quality), or null
+ * @param listg Destination for pairs with a positive insert (or all pairs when MIX_BAD_AND_GOOD); may be null
+ * @param listb Destination for the remaining pairs; may be null
+ * @return The insert size if positive, otherwise a RET_ code
+ */
+ private int findOverlapInThread(final Read r1, final byte[][] originals, ArrayList<Read> listg, ArrayList<Read> listb){
+
+ final Read r2=r1.mate;
+ //NOTE(review): captured before processReadPair, which may call r1.setInsert when parseCustom is set - confirm intended
+ final int trueSize=r1.insert();
+
+ final int bestInsert=processReadPair(r1, r2);
+
+ //Snapshot the read contents as they are after processReadPair
+ if(originals!=null){
+ if(eccTadpole){
+ //Arrays are shared, not copied
+ originals[0]=r1.bases;
+ originals[1]=r1.quality;
+ originals[2]=r2.bases;
+ originals[3]=r2.quality;
+ }else{
+ originals[0]=(r1.bases==null ? null : r1.bases.clone());
+ originals[1]=(r1.quality==null ? null : r1.quality.clone());
+ originals[2]=(r2.bases==null ? null : r2.bases.clone());
+ originals[3]=(r2.quality==null ? null : r2.quality.clone());
+ }
+ }
+
+ Read joined=null;
+
+ if(bestInsert>0){
+ if(bestInsert==trueSize){correctCount++;insertSumCorrect+=bestInsert;}
+ else{incorrectCount++;insertSumIncorrect+=bestInsert;}
+ r1.setInsert(bestInsert);
+ insertMin=Tools.min(bestInsert, insertMin);
+ insertMax=Tools.max(bestInsert, insertMax);
+ //Inserts beyond the histogram length are counted in the last bin
+ hist[Tools.min(bestInsert, hist.length-1)]++;
+ if(joinReads){
+ //r2 is flipped only for the duration of the join
+ r2.reverseComplement();
+ joined=r1.joinRead(bestInsert);
+ r2.reverseComplement();
+ assert(joined.length()==bestInsert);
+ }else if(ecco){
+ //Error-correct by overlap without merging; r2 is flipped only for the duration of the call
+ r2.reverseComplement();
+ errorCorrectWithInsert(r1, r2, bestInsert);
+ r2.reverseComplement();
+ }
+ }else if(bestInsert==RET_AMBIG){ambiguousCount++;}
+ else if(bestInsert==RET_SHORT){tooShortCount++;}
+ else if(bestInsert==RET_LONG){tooLongCount++;}
+ else if(bestInsert==RET_NO_SOLUTION){noSolutionCount++;}
+
+ //Record the outcome (insert size or RET_ code) on the read; processReads reads it back for the insert report
+ r1.setInsert(bestInsert);
+
+ if(findAdapterSequence && bestInsert>0){
+ storeAdapterSequence(r1, bestInsert);
+ r2.reverseComplement();
+ storeAdapterSequence(r2, bestInsert);
+ r2.reverseComplement();
+ }
+
+ //Put the snapshot back unless overlap error-correction produced a result that should be kept
+ if(originals!=null && (!ecco || bestInsert<1)){
+ r1.bases=originals[0];
+ r1.quality=originals[1];
+ r2.bases=originals[2];
+ r2.quality=originals[3];
+ }
+
+ //Trim each read to positions 0..bestInsert-1 when it is longer than the insert
+ if(trimReadsByOverlap && bestInsert>0){
+ int trimLim=bestInsert-1;
+ if(trimLim<r1.length()){
+ if(verbose){System.err.println("Overlap right trimming r1 to "+0+", "+(trimLim));}
+ int x=TrimRead.trimToPosition(r1, 0, trimLim, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r1.bases));}
+ }
+ if(trimLim<r2.length()){
+ if(verbose){System.err.println("Overlap right trimming r2 to "+0+", "+(trimLim));}
+ int x=TrimRead.trimToPosition(r2, 0, trimLim, 1);
+ if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r2.bases));}
+ }
+ }
+
+ //Route the pair: the joined read if one was made, otherwise r1
+ if(bestInsert>0 || MIX_BAD_AND_GOOD){
+ if(listg!=null){
+ if(joined!=null){
+ listg.add(joined);
+ }else{
+ listg.add(r1);
+ }
+ }
+ }else if(listb!=null){
+ listb.add(r1);
+ }
+ return bestInsert;
+ }
+
+ /**
+  * Prepares a pair for merging: validation, optional IUPAC-to-N conversion, quality recalibration,
+  * forced trimming, quality trimming and pre-merge extension, followed by length and quality filters.
+  * Increments pairsProcessed for every pair that has a mate.
+  * @param r1 Read 1; must not be null
+  * @param r2 Read 2; a null mate yields RET_BAD
+  * @param qtrim Whether to quality-trim both reads using trimq[0]
+  * @return 1 if the pair should be processed further, otherwise RET_BAD or RET_AMBIG
+  */
+ private final int preprocess(final Read r1, final Read r2, boolean qtrim){
+     assert(r1!=null);
+     if(!r1.validated()){r1.validate(true);}
+     if(r2==null){return RET_BAD;}
+     if(!r2.validated()){r2.validate(true);}
+
+     //Both reads are known to be non-null from here on
+     if(iupacToN){
+         r1.convertUndefinedTo((byte)'N');
+         r2.convertUndefinedTo((byte)'N');
+     }
+
+     if(recalibrateQuality){
+         CalcTrueQuality.recalibrate(r1);
+         CalcTrueQuality.recalibrate(r2);
+     }
+
+     pairsProcessed++;
+
+     final boolean forceTrim=(forceTrimLeft>0 || forceTrimRight>0 || forceTrimModulo>0 || forceTrimRight2>0);
+     if(forceTrim){
+         if(!r1.discarded()){forceTrimRead(r1);}
+         if(!r2.discarded()){forceTrimRead(r2);}
+     }
+
+     if(qtrim){qtrim(r1, r2, 0);}
+
+     if(tadpole!=null && extendRight1>0){
+         extendAndMerge(r1, r2, extendRight1, 1, false);
+     }
+
+     final int len1=r1.length(), len2=r2.length();
+
+     if(len1<minReadLength && len2<minReadLength){return RET_BAD;}
+     if(len1<2 || len2<2){return RET_AMBIG;}
+
+     //The quality-based filters only run when at least one read carries qualities
+     final boolean anyQuality=(r1.quality!=null || r2.quality!=null);
+
+     if(anyQuality && minAvgQuality>0){
+         final boolean lowQ1=r1.avgQuality(false, minAvgQualityBases)<minAvgQuality;
+         if(lowQ1 || r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){
+             return RET_BAD;//Failed quality filter
+         }
+     }
+
+     if(anyQuality && useMEEfilter && useQuality){
+         final int maxBasesToConsider=Tools.min(Tools.max(len1, len2), len1+len2-minInsert);
+         final boolean tooMany1=r1.expectedTipErrors(false, maxBasesToConsider)>maxExpectedErrors;
+         if(tooMany1 || r2.expectedTipErrors(false, maxBasesToConsider)>maxExpectedErrors){
+             return RET_BAD;//Failed MEEFilter
+         }
+     }
+     return 1;
+ }
+
+ /** Applies the forceTrimLeft / forceTrimRight / forceTrimRight2 / forceTrimModulo settings to one read. */
+ private final void forceTrimRead(final Read r){
+     final int len=r.length();
+     final int left=(forceTrimLeft>0 ? forceTrimLeft : 0);
+     final int byModulo=(forceTrimModulo>0 ? len-1-len%forceTrimModulo : len);
+     final int byRight=(forceTrimRight>0 ? forceTrimRight : len);
+     final int byRight2=(forceTrimRight2>0 ? len-1-forceTrimRight2 : len);
+     //The most restrictive right-hand limit wins
+     final int right=Tools.min(byModulo, byRight, byRight2);
+     TrimRead.trimToPosition(r, left, right, 1);
+ }
+
+ /**
+ * Extends both reads to the right via extendRead and, when merge is true, retries merging after each round.
+ * r2 is reverse-complemented around its own extension call and flipped back immediately afterwards.
+ * A read is only extended in a round if every previous round extended it by the full amount.
+ * Extension statistics for both reads are updated once per call.
+ * @param r1 Read 1
+ * @param r2 Read 2
+ * @param amt Number of bases requested per round
+ * @param iters Maximum number of rounds; must be positive, and must be 1 unless merge is true
+ * @param merge If true, call processReadPair_inner after each round in which at least one read grew
+ * @return The result of the last merge attempt, or RET_AMBIG if no merge was attempted
+ */
+ private final int extendAndMerge(Read r1, Read r2, int amt, int iters, boolean merge){
+ assert(iters>0);
+ assert(merge || iters==1);
+
+
+ //sum1/sum2: total bases added to each read; attempted: total bases requested per read so far
+ int sum1=0, sum2=0, attempted=0;
+ int bestInsert=RET_AMBIG;
+ //Keep going only while the pair is still unmerged (ambiguous or no solution)
+ for(int i=0; i<iters && (bestInsert==RET_AMBIG || bestInsert==RET_NO_SOLUTION); i++){
+
+ //sum==attempted means the read has been fully extended in every round so far
+ int e1=(sum1==attempted ? extendRead(r1, amt) : 0);
+ r2.reverseComplement();
+ int e2=(sum2==attempted ? extendRead(r2, amt) : 0);
+ r2.reverseComplement();
+
+ attempted+=amt;
+ sum1+=e1;
+ sum2+=e2;
+
+ if(merge){
+ if(e1>0 || e2>0){
+ bestInsert=processReadPair_inner(r1, r2);
+ }else{
+ //Neither read grew, so another merge attempt would see identical input
+ break;
+ }
+ }
+ }
+ //Todo: un-extend.
+
+ //One attempt is counted per read, regardless of the number of rounds
+ extensionsAttemptedT+=2;
+
+ if(sum1==attempted){fullyExtendedT++;}
+ else if(sum1>0){partlyExtendedT++;}
+ else{notExtendedT++;}
+
+ if(sum2==attempted){fullyExtendedT++;}
+ else if(sum2>0){partlyExtendedT++;}
+ else{notExtendedT++;}
+
+ return bestInsert;
+ }
+
+ private final int extendRead(Read r, int amt){
+ bb.clear();
+ bb.append(r.bases);
+ final int initialLen=r.length();
+ final int extension=tadpole.extendToRight2(bb, leftCounts, rightCounts, amt, false);
+
+// extensionsAttemptedT++;
+// if(extension==amt){
+// fullyExtendedT++;
+// }else if(extension>0){
+// partlyExtendedT++;
+// }else{
+// notExtendedT++;
+// }
+
+ if(extension>0){
+ r.bases=bb.toBytes();
+ if(r.quality!=null){
+ r.quality=Arrays.copyOf(r.quality, r.bases.length);
+ for(int i=initialLen; i<r.quality.length; i++){
+ r.quality[i]=qfake;
+ }
+ }
+ }
+ return extension;
+ }
+
+ /**
+  * Finds the insert size with BBMergeOverlapper.mateByOverlapRatio.
+  * A quality-aware pass runs first when overlapUsingQuality is set and both reads have qualities.
+  * A quality-free pass runs when the first pass was skipped, or when overlapWithoutQuality is set
+  * and the first pass returned a negative or ambiguous result.
+  * Details of the last pass are left in rvector (slot 2 mismatches, slot 4 ambiguity flag).
+  * @param r1 Read 1
+  * @param r2 Read 2
+  * @param minOverlap Minimum overlap before the ratio-mode reduction is subtracted
+  * @return The value returned by the last overlapper call
+  */
+ private final int mateByOverlap_ratioMode(Read r1, Read r2, int minOverlap){
+     assert(useRatioMode);
+     final int reducedMin0=MIN_OVERLAPPING_BASES_0-MIN_OVERLAPPING_BASES_RATIO_REDUCTION;
+     final int reducedMin=minOverlap-MIN_OVERLAPPING_BASES_RATIO_REDUCTION;
+     rvector[4]=0;
+
+     final float margin=RATIO_MARGIN;
+     final float ratioLimit=MAX_RATIO;
+
+     final boolean qualityPass=(overlapUsingQuality && r1.quality!=null && r2.quality!=null);
+     int result=-1;
+
+     if(qualityPass){
+         result=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, rvector,
+                 reducedMin0, reducedMin, minInsert0, minInsert, ratioLimit, margin, RATIO_OFFSET, 0.95f, 0.95f, true);
+         if(verbose){System.err.println("Result from ratiomode1: \tinsert="+result+", bad="+rvector[2]+", ambig="+(rvector[4]==1));}
+     }
+
+     final boolean retryWithoutQuality=!qualityPass || (overlapWithoutQuality && (result<0 || rvector[4]==1));
+     if(retryWithoutQuality){
+         result=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, rvector,
+                 reducedMin0, reducedMin, minInsert0, minInsert, ratioLimit, margin, RATIO_OFFSET, 0.95f, 0.95f, false);
+         if(verbose){System.err.println("Result from ratiomode2: \tinsert="+result+", bad="+rvector[2]+", ambig="+(rvector[4]==1));}
+     }
+     return result;
+ }
+
+ /**
+ * Finds the insert size with BBMergeOverlapper.mateByOverlap in up to three stages, each tried only
+ * while no insert has been found: (1) up to QUAL_ITERS passes with parameters that shift by the pass index,
+ * (2) one extra pass with relaxed mismatch limits when loose is set, and (3) quality trimming of the reads
+ * followed by a retry.
+ * Stage 3 trims r1 and r2 in place.
+ * On return rvector[0], rvector[2] and rvector[4] hold the insert, mismatch count and ambiguity flag.
+ * @param r1 Read 1
+ * @param r2 Read 2
+ * @param minOverlap Minimum overlap
+ * @return The insert size, or -1 if none was found
+ */
+ private final int mateByOverlap_normalMode(Read r1, Read r2, int minOverlap){
+ final int len1=r1.length(), len2=r2.length();
+ boolean ambigNM=false;
+ int bestInsertNM=-1;
+ int bestBadNM=999999;
+
+ assert(QUAL_ITERS>0);
+ //Without qualities on both reads, only one pass is made and no trimming is attempted
+ final int maxQualIters=(r1.quality==null || r2.quality==null ? 1 : QUAL_ITERS);
+ final int maxTrims=(r1.quality==null || r2.quality==null ? 0 : TRIM_ON_OVERLAP_FAILURE);
+
+ //Stage 1: each pass lowers the first overlap argument and the quality cutoff and raises the second overlap argument
+ for(int i=0; i<maxQualIters && bestInsertNM<0 /*&& !ambigNM*/; i++){
+
+ int x=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, rvector, MIN_OVERLAPPING_BASES_0-i, minOverlap+i,
+ minInsert0, MISMATCH_MARGIN, MAX_MISMATCHES0, MAX_MISMATCHES, (byte)(MIN_QUALITY-2*i));
+ if(x>-1){
+ bestInsertNM=x;
+ bestBadNM=rvector[2];
+ ambigNM=(rvector[4]==1);
+ break;
+ }
+ }
+
+
+ //Stage 2: one more attempt allowing one additional mismatch
+ if(loose && bestInsertNM<0){//TODO check for estimated number of overlap errors
+ int x=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, rvector, MIN_OVERLAPPING_BASES_0-1, minOverlap+2,
+ minInsert0, MISMATCH_MARGIN, MAX_MISMATCHES0+1, MAX_MISMATCHES+1, MIN_QUALITY-1);
+ if(x>-1){
+ bestInsertNM=x;
+ bestBadNM=rvector[2];
+ ambigNM=(rvector[4]==1);
+ }
+ }
+
+ //Stage 3: trim and retry, raising the trim threshold by 8 per round; skipped when qtrim1 is set
+ for(int trims=0, q=trimq[0]; trims<maxTrims && !qtrim1 && bestInsertNM<0 /*&& !ambigNM*/; trims++, q+=8){
+ int tr1=TrimRead.trimFast(r1, false, true, q, 1+len1*4/10); //r1.length());
+ int tr2=TrimRead.trimFast(r2, true, false, q, 1+len2*4/10); //r2.length());
+ //Only retry if trimming actually changed one of the reads
+ if(tr1>0 || tr2>0){
+ int x=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, rvector, MIN_OVERLAPPING_BASES_0-1, minOverlap,
+ minInsert0, MISMATCH_MARGIN, MAX_MISMATCHES0, MAX_MISMATCHES, MIN_QUALITY);
+ if(x>-1){
+ bestInsertNM=x;
+ bestBadNM=rvector[2];
+ ambigNM=(rvector[4]==1);
+ //Forces the loop condition to fail
+ trims=maxTrims;
+ }
+ }
+ }
+ if(verbose){System.err.println("Result from normalmode: \tinsert="+bestInsertNM+", bad="+bestBadNM+", ambig="+ambigNM);}
+
+ //Publish the chosen result so the caller can read it from rvector
+ rvector[0]=bestInsertNM;
+ rvector[2]=bestBadNM;
+ rvector[4]=(ambigNM ? 1 : 0);
+ return bestInsertNM;
+ }
+
+ /**
+  * Calculates the minimum overlap required for this pair from sequence entropy.
+  * The result is never below MIN_OVERLAPPING_BASES, which is also returned directly when entropy is disabled.
+  * In loose mode the whole-read entropy result is scaled up by the reads' combined expected error rate
+  * (capped at 0.04, giving a factor of at most 1.16).
+  * Otherwise the tail of r1 and the head of r2 are evaluated and no scaling is applied.
+  * @param r1 Read 1
+  * @param r2 Read 2
+  * @return Minimum overlap to require
+  */
+ private final int calcMinOverlapFromEntropy(final Read r1, final Read r2){
+     if(!useEntropy){return MIN_OVERLAPPING_BASES;}
+
+     if(!loose){
+         final int fromTail=BBMergeOverlapper.calcMinOverlapByEntropyTail(r1.bases, entropyK, kmerCounts, minEntropyScore);
+         final int fromHead=BBMergeOverlapper.calcMinOverlapByEntropyHead(r2.bases, entropyK, kmerCounts, minEntropyScore);
+         return Tools.max(MIN_OVERLAPPING_BASES, Tools.max(fromTail, fromHead));
+     }
+
+     final int len1=r1.length(), len2=r2.length();
+     final int need1=BBMergeOverlapper.calcMinOverlapByEntropy(r1.bases, entropyK, kmerCounts, minEntropyScore);
+     final int need2=BBMergeOverlapper.calcMinOverlapByEntropy(r2.bases, entropyK, kmerCounts, minEntropyScore);
+     final float errorRate=r1.expectedErrors(false, len1)/len1+r2.expectedErrors(false, len2)/len2;
+     return (int)(Tools.max(MIN_OVERLAPPING_BASES, Tools.max(need1, need2))*(1+Tools.min(0.04f, errorRate)*4f));
+ }
+
+ private final int lookForAdapters(final Read r1, final Read r2){
+ assert(lowercaseAdapters);
+ if(!lowercaseAdapters){return -1;}
+ if(!Character.isLowerCase(r1.bases[r1.length()-1]) || !Character.isLowerCase(r2.bases[0])){return -1;}
+
+ final int lower1=r1.trailingLowerCase(), lower2=r2.leadingLowerCase();
+
+ final int upper1=r1.length()-lower1, upper2=r2.length()-lower2;
+ final int newlen=Tools.min(upper1, upper2);
+ int good=0, bad=0;
+
+ for(int i=0; i<newlen; i++){
+ byte a=r1.bases[i];
+ byte b=r2.bases[i+lower2];
+ if(a!='N' && b!='N'){
+ if(a==b){good++;}
+ else{bad++;}
+ }
+ }
+ if(bad*4<=good){
+ rvector[0]=newlen;
+ rvector[2]=bad;
+ rvector[4]=0;
+ return newlen;
+ }
+ return -1;
+ }
+
+ /**
+  * Determines the insert size of a pair by overlap, combining ratio mode and normal mode
+  * according to useRatioMode, useNormalMode and requireRatioMatch, then applying the
+  * mismatch cap and the quality-based efilter / pfilter.
+  * On success the mismatch count of the chosen overlap is stored in r1.errors.
+  * Fix: the adapter lookup now receives (r1, r2); it was previously called with r2 for both arguments.
+  * @param r1 Read 1
+  * @param r2 Read 2, in the orientation expected by the overlapper (the caller reverse-complements it)
+  * @return A positive insert size, or RET_AMBIG / RET_NO_SOLUTION
+  */
+ private final int mateByOverlap(Read r1, Read r2){
+     final int len1=r1.length(), len2=r2.length();
+
+     final int minOverlap=calcMinOverlapFromEntropy(r1, r2);
+     if(verbose){System.err.println("minOverlap: "+minOverlap);}
+
+     //TODO: Currently this is not used for anything.
+     final int bestInsertAD;
+     final int bestBadAD;
+     if(lowercaseAdapters){
+         bestInsertAD=lookForAdapters(r1, r2);
+         bestBadAD=(bestInsertAD>=0 ? rvector[2] : 0);
+     }
+
+     //Grow the per-thread scratch arrays if this pair is longer than any seen so far
+     if(aprob==null || aprob.length<Tools.max(len1, len2)){aprob=new float[Tools.max(len1, len2)];}
+     if(bprob==null || bprob.length<Tools.max(len1, len2)){bprob=new float[Tools.max(len1, len2)];}
+
+     final boolean ambigRM;
+     final int bestBadRM, bestInsertRM;
+     if(useRatioMode){
+         bestInsertRM=mateByOverlap_ratioMode(r1, r2, minOverlap);
+         bestBadRM=rvector[2];
+         ambigRM=(bestInsertRM>-1 ? rvector[4]==1 : false);
+     }else{
+         bestInsertRM=-1;
+         bestBadRM=0;
+         ambigRM=false;
+     }
+
+     //Normal mode runs either as a fallback (ratio mode failed) or as a confirmation (requireRatioMatch)
+     final boolean ambigNM;
+     final int bestInsertNM, bestBadNM;
+     if(useNormalMode && ((!requireRatioMatch && (bestInsertRM<0 || ambigRM)) || (requireRatioMatch && (bestInsertRM>0 && !ambigRM)))){
+         bestInsertNM=mateByOverlap_normalMode(r1, r2, minOverlap);
+         bestBadNM=rvector[2];
+         ambigNM=(bestInsertNM>-1 ? rvector[4]==1 : false);
+     }else{
+         ambigNM=false;
+         bestInsertNM=-1;
+         bestBadNM=99999;
+     }
+
+     boolean ambig;
+     int bestBad, bestInsert;
+     if(requireRatioMatch && useNormalMode && useRatioMode){
+         //Both modes must agree on the insert size
+         ambig=ambigRM || ambigNM;
+         bestBad=bestBadRM;
+         bestInsert=(bestInsertNM==bestInsertRM ? bestInsertNM : -1);
+
+         if(verbose){System.err.println("Result after rrm: \tinsert="+bestInsertNM+", bad="+bestBadNM+", ambig="+ambigNM);}
+     }else if(useRatioMode && bestInsertRM>-1 && !ambigRM){
+         ambig=ambigRM;
+         bestBad=bestBadRM;
+         bestInsert=bestInsertRM;
+     }else{
+         ambig=ambigNM;
+         bestBad=bestBadNM;
+         bestInsert=bestInsertNM;
+     }
+
+     //Too many mismatches is treated as ambiguous regardless of which mode produced the result
+     if(bestBad>MAX_MISMATCHES_R){ambig=true;}
+
+     if(ambig){return RET_AMBIG;}
+     else if(bestInsert<0){return RET_NO_SOLUTION;}
+
+     //TODO: Crucial! This block can vastly reduce merge rate, particularly if quality values are inaccurate.
+     if(useQuality && r1.quality!=null && r2.quality!=null){
+         //efilter: observed mismatches above the scaled expectation make the result ambiguous
+         if(useEfilter && bestInsert>0 && !ambig){
+             float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert);
+             if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;}
+             if(verbose){System.err.println("Result after efilter: \tinsert="+bestInsert+", bad="+bestBad+", ambig="+ambig);}
+         }
+
+         //pfilter: an overlap whose probability is too low is discarded rather than flagged ambiguous
+         if(pfilterRatio>0 && bestInsert>0 && !ambig){
+             float probability=BBMergeOverlapper.probability(r1, r2, bestInsert);
+             if(probability<pfilterRatio){bestInsert=-1;}
+             if(verbose){System.err.println("Result after pfilter: \tinsert="+bestInsert+", bad="+bestBad+", ambig="+ambig);}
+         }
+     }
+
+     if(ambig){return RET_AMBIG;}
+     r1.errors=bestBad;
+     return bestInsert>0 ? bestInsert : RET_NO_SOLUTION;
+ }
+
+ /**
+ * Runs the full merge pipeline for one pair: preprocessing, the initial merge attempt, then optional
+ * retries after quality trimming (qtrim2), Tadpole error correction, and Tadpole extension, followed by
+ * the optional kmer-coverage filter.
+ * r2 is reverse-complemented for the duration of the merge attempts and flipped back before returning,
+ * except on the early return taken when preprocessing rejects the pair (which happens before the flip).
+ * @param r1 Read1
+ * @param r2 Read2
+ * @return A return code (RET_)
+ */
+ private final int processReadPair(final Read r1, final Read r2){
+
+ {
+ //Negative results from preprocess are RET_ codes and end processing immediately
+ final int x=preprocess(r1, r2, (qtrim1 && !qtrim2));
+ if(x<0){return x;}
+ }
+
+ //Store the known insert size on r1, taken from the read id or from mapping coordinates
+ if(parseCustom){
+ int trueSize=-1;
+ if(r1.id.startsWith("insert=")){
+ trueSize=GradeMergedReads.parseInsert(r1.id);
+ }else{
+ r1.setMapped(true);
+ r2.setMapped(true);
+ trueSize=Read.insertSizeMapped(r1, r2, ignoreMappingStrand);
+ }
+ if(verbose){System.err.println("True Insert: "+trueSize);}
+ r1.setInsert(trueSize);
+ }
+
+ r2.reverseComplement();
+
+ //Keep references to the qualities so they can be put back after being stripped
+ byte[] qual1=r1.quality, qual2=r2.quality;
+ if(!useQuality){//strip qualities
+ r1.quality=r2.quality=null;
+ }
+
+ int bestInsert=processReadPair_inner(r1, r2);
+ if(qtrim2){
+ //Retry with each trim threshold in turn until a non-negative result is obtained
+ for(int iter=0; iter<trimq.length && bestInsert<0; iter++){
+ //NOTE(review): qualities are reattached here even when useQuality is false, so the retry sees them - confirm intended
+ r1.quality=qual1;
+ r2.quality=qual2;
+ // r2.reverseComplement();
+ // qtrim(r1, r2);
+ // r2.reverseComplement();
+
+ TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq[iter], 1);
+ TrimRead.trimFast(r2, qtrimRight, qtrimLeft, trimq[iter], 1);//Reversed because read is rcomped
+
+ //Trimming may have replaced the arrays; track the current ones
+ qual1=r1.quality;
+ qual2=r2.quality;
+ bestInsert=processReadPair_inner(r1, r2);
+ }
+ }
+
+ if(tadpole!=null){
+ //Error-correct and retry, but only if something was actually corrected
+ if(eccTadpole && (bestInsert==RET_AMBIG || bestInsert==RET_NO_SOLUTION)){
+ int c1=tadpole.errorCorrect(r1);
+ int c2=tadpole.errorCorrect(r2);
+ if(c1>0 || c2>0){
+ bestInsert=processReadPair_inner(r1, r2);
+ }
+ }
+
+ if(extendRight2>0 && (bestInsert==RET_AMBIG || bestInsert==RET_NO_SOLUTION)){
+ bestInsert=extendAndMerge(r1, r2, extendRight2, extendIterations, true);
+ }
+ }
+
+ //Reject merges whose joined read has minimum coverage below filterCutoff
+ if(useKFilter && bestInsert>kmerLength){
+ Read joined=r1.joinRead(bestInsert);
+ //The inner check repeats the outer condition and is always true here
+ if(useKFilter){
+ int cov=BBMergeOverlapper.minCoverage(joined, tadpole, kmerLength, filterCutoff);
+ if(cov<filterCutoff){bestInsert=RET_NO_SOLUTION;}
+ if(verbose){System.err.println("Result after kfilter: \tinsert="+bestInsert);}
+ }
+ }
+
+ if(!useQuality){//restore qualities
+ r1.quality=qual1;
+ r2.quality=qual2;
+ }
+ r2.reverseComplement();
+ return bestInsert;
+ }
+
+ /**
+ *
+ * @param r1 Read1
+ * @param r2 Read2
+ * @return A return code (RET_)
+ */
+ private final int processReadPair_inner(final Read r1, final Read r2){
+ int bestInsert=-1;
+ boolean ambig;
+
+ if(USE_MAPPING && r1.chrom==r2.chrom && r1.start<r1.stop && ((r1.mapped() || r1.synthetic()) && (r2.mapped() || r2.synthetic()))){
+ bestInsert=r1.insert();
+ ambig=false;
+ }else{
+ if(MATE_BY_OVERLAP){
+ bestInsert=mateByOverlap(r1, r2);
+ ambig=(bestInsert==RET_AMBIG);
+ }else{
+ ambig=false;
+ bestInsert=-1;
+ }
+ }
+
+ if(ambig){return RET_AMBIG;}
+ else if(bestInsert>0){
+ if(bestInsert<minInsert){return RET_SHORT;}
+ else if(bestInsert>maxReadLength){return RET_LONG;}
+ return bestInsert;
+ }
+ else{return RET_NO_SOLUTION;}
+ }
+
+ private void storeAdapterSequence(Read r, int insert){
+ LongList[] lists=adapterCountsT[r.pairnum()];
+ byte[] bases=r.bases;
+ for(int i=insert, j=0; i<bases.length; i++, j++){
+ byte b=bases[i];
+ int num=AminoAcid.baseToNumber[b];
+ if(num>=0){
+ lists[num].increment(j);
+ }
+ }
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ /** Per-thread adapter base counts, indexed [pair number][base number]; filled by storeAdapterSequence */
+ final LongList[][] adapterCountsT=new LongList[2][4];
+
+ /** Quality value assigned to bases appended by extendRead */
+ final byte qfake=Shared.FAKE_QUAL;
+
+ /** Scratch result vector shared with the overlapper; this class uses slot 0 (insert), 2 (mismatches) and 4 (1 means ambiguous) */
+ private final int[] rvector=new int[5];
+
+ /** Count buffers passed to tadpole.extendToRight2; leftCounts is null when extendThroughLeftJunctions is set */
+ private final int[] rightCounts=new int[4];
+ private final int[] leftCounts=(extendThroughLeftJunctions ? null : new int[4]);
+
+ /** Reusable buffer for extendRead */
+ private final ByteBuilder bb=new ByteBuilder();
+
+ /** Insert size histogram for this thread; inserts of hist.length-1 or more share the last bin */
+ final long[] hist=new long[histlen];
+ /** Scratch array for entropy calculation; null when useEntropy is false */
+ final short[] kmerCounts;
+
+ /** Scratch arrays handed to the overlapper; grown on demand in mateByOverlap */
+ private float[] aprob, bprob;
+
+ /* Per-thread statistics. matedCount is not modified anywhere inside MateThread. */
+ long pairsProcessed=0;
+ long matedCount=0;
+ long correctCount=0;
+ long ambiguousCount=0;
+ long tooShortCount=0;
+ long tooLongCount=0;
+ long incorrectCount=0;
+ long noSolutionCount=0;
+ long insertSumCorrect=0;
+ long insertSumIncorrect=0;
+ int insertMax=0;
+ int insertMin=999999999;
+
+ /* Extension statistics, updated once per read in extendAndMerge */
+ long fullyExtendedT=0;
+ long partlyExtendedT=0;
+ long notExtendedT=0;
+ long extensionsAttemptedT=0;
+
+ /* Input and output streams; any of the outputs may be null */
+ private final ConcurrentReadInputStream cris;
+ private final ConcurrentReadOutputStream rosgood;
+ private final ConcurrentReadOutputStream rosbad;
+ private final ConcurrentReadOutputStream rosi;
+
+ /** If true, pairs with a positive insert are joined into a single read */
+ private final boolean joinReads;
+ /** If true, reads are trimmed to the detected insert size */
+ private final boolean trimReadsByOverlap;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /* Input and output paths */
+ private String in1;
+ private String in2;
+
+ private ArrayList<String> extra=new ArrayList<String>();
+
+ private String out1=null;
+ private String out2=null;
+ private String outb1=null;
+ private String outb2=null;
+ private String outinsert=null;
+ private String ihist=null;
+ private String outAdapter=null;
+ private String outCardinality=null;
+
+ /** When non-null, every input read is hashed into this by the worker threads */
+ private final LogLog loglog;
+
+ private long maxReads=-1;
+ /** Consulted by the worker threads when deciding whether pre-merge read contents must be kept */
+ private boolean join=true;
+ /** If true (and reads are not being joined), reads are error-corrected from the overlap instead */
+ private boolean ecco=false;
+ private boolean trimByOverlap=false;
+
+ /* Thresholds for the probability (pfilter), expected-mismatch (efilter) and expected-error (MEE) filters */
+ private float pfilterRatio=0.00002f;
+ private float efilterRatio=6f;
+ private float efilterOffset=0.05f;
+ private boolean useEfilter=true;
+ private boolean useMEEfilter=false;
+
+ private boolean ordered=false;
+ /** Ratio mode: try a quality-aware overlap first when both reads have qualities */
+ private boolean overlapUsingQuality=false;
+ /** Ratio mode: retry without qualities when the quality-aware pass fails or is ambiguous */
+ private boolean overlapWithoutQuality=true;
+ /** If true, merges longer than kmerLength are rejected when minimum kmer coverage is below filterCutoff */
+ private boolean useKFilter=false;
+ private int filterCutoff=1;
+ private int kmerLength=31;
+ private boolean prealloc=false;
+ private boolean prefilter=false;
+ private ArrayList<String> extraFiles;
+
+ /* Entropy-based minimum overlap settings */
+ private boolean useEntropy=true;
+ private int entropyK=3;
+ private int minEntropyScore=39;//30 loose;//39 normal;//44 strict;
+
+ private long sampleseed=-1;
+ private float samplerate=1;
+
+ /** If true, bases beyond the insert size are tallied per thread by storeAdapterSequence */
+ private boolean findAdapterSequence=false;
+
+ private final LongList[][] adapterCounts=new LongList[2][4];
+
+ /** Used by the worker threads for extension, error correction and the kmer filter; checked for null before extension and error correction */
+ private final Tadpole tadpole;
+ /** Extension applied to every pair during preprocessing, when positive */
+ private int extendRight1=0;
+ /** Extension applied per iteration to pairs that failed to merge, when positive */
+ private int extendRight2=0;
+ private int extendIterations=1;
+ /** If true, unmerged pairs are error-corrected with tadpole and retried */
+ private boolean eccTadpole=false;
+ private boolean shave=false;
+ private boolean rinse=false;
+
+ private boolean extendThroughLeftJunctions=true;
+ private int minCountSeed=3, minCountExtend=2;
+ private float branchMult1=20;
+ private float branchMult2=3;
+ private float minProb=0.5f;
+ private int branchLowerConst=3;
+
+ /*--------------------------------------------------------------*/
+
+ /** Per-thread scratch result vector for the static findOverlap method */
+ private static ThreadLocal<int[]> localRvector=new ThreadLocal<int[]>();
+
+ static boolean errorState=false;
+
+ private static boolean showFullArgs=true;
+
+ /** Recalibrate quality scores using matrices */
+ static boolean recalibrateQuality=false;
+ /** If false, qualities are detached from the reads before the initial merge attempt */
+ static boolean useQuality=true;
+ static boolean qtrimRight=false;
+ static boolean qtrimLeft=false;
+// static boolean untrim=false;
+ /** Quality-trim thresholds; with qtrim2 each is tried in order until the pair merges */
+ static byte[] trimq=new byte[] {6};
+ static byte minAvgQuality=0;
+ static int minAvgQualityBases=0;
+ static float maxExpectedErrors=0;
+ static int minReadLength=1;
+ /** Inserts above this are reported as RET_LONG. NOTE(review): the default of -1 is presumably replaced before use - confirm */
+ static int maxReadLength=-1;
+ /** Inserts below this are reported as RET_SHORT */
+ static int minInsert=35;
+ static int minInsert0=-1;
+ /** Quality-trim during preprocessing (only when qtrim2 is not also set) */
+ static boolean qtrim1=false;
+ /** Quality-trim and retry only after a failed merge attempt */
+ static boolean qtrim2=false;
+ static int TRIM_ON_OVERLAP_FAILURE=1;
+ static int QUAL_ITERS=3;
+ /** If true, the known insert size is taken from the read id or mapping coordinates and stored on read 1 */
+ static boolean parseCustom=false;
+
+ static int forceTrimLeft;
+ static int forceTrimRight;
+ static int forceTrimRight2;
+ /** Trim right bases of the read modulo this value.
+ * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */
+ static int forceTrimModulo;
+
+ static boolean strict=false;
+ static boolean vstrict=false;
+ static boolean ustrict=false;
+ static boolean xstrict=false;
+ static boolean loose=false;
+ static boolean vloose=false;
+ static boolean uloose=false;
+ static boolean xloose=false;
+ static boolean fast=false;
+
+ /** If true, interpret lowercase bases as adapter sequence */
+ static boolean lowercaseAdapters=false;
+
+ /** Length of the insert size histograms */
+ private static final int histlen=2000;
+ static long[] histTotal=new long[histlen];
+ static int bin=1;
+
+ /* Totals; the per-thread counters in MateThread have matching names */
+ static long readsProcessedTotal=0;
+ static long matedCountTotal=0;
+ static long correctCountTotal=0;
+ static long ambiguousCountTotal=0;
+ static long tooShortCountTotal=0;
+ static long tooLongCountTotal=0;
+ static long incorrectCountTotal=0;
+ static long noSolutionCountTotal=0;
+ static long insertSumCorrectTotal=0;
+ static long insertSumIncorrectTotal=0;
+ static long fullyExtendedTotal=0;
+ static long partlyExtendedTotal=0;
+ static long notExtendedTotal=0;
+ static long extensionsAttempted=0;
+
+ static int insertMinTotal=999999999;
+ static int insertMaxTotal=0;
+
+ /* Overlap length limits for the overlapper */
+ private static int MIN_OVERLAPPING_BASES=11;
+ private static int MIN_OVERLAPPING_BASES_0=8;
+ private static int MISMATCH_MARGIN=2;
+ /** Subtracted from both minimum-overlap values in ratio mode */
+ private static int MIN_OVERLAPPING_BASES_RATIO_REDUCTION=3;
+
+ static boolean useRatioMode=true;
+ static boolean useNormalMode=false;
+ /** If true (with both modes enabled), a merge is accepted only when both modes give the same insert */
+ static boolean requireRatioMatch=false;
+ /** Overlaps with more mismatches than this are treated as ambiguous */
+ static int MAX_MISMATCHES_R=20;
+ static float MAX_RATIO=0.09f;
+ static float RATIO_MARGIN=5.5f;
+ static float RATIO_OFFSET=0.55f;
+
+ public static int MAX_MISMATCHES=3;
+ public static int MAX_MISMATCHES0=3;
+ public static byte MIN_QUALITY=10;
+
+ /* Negative result codes returned in place of an insert size */
+ public static final int RET_NO_SOLUTION=-1;
+ public static final int RET_AMBIG=-2;
+ public static final int RET_BAD=-3;
+ public static final int RET_SHORT=-4;
+ public static final int RET_LONG=-5;
+
+ /** Skip alignment and calculate insert from mapping info */
+ protected static boolean USE_MAPPING=false;
+ protected static final boolean ignoreMappingStrand=false;
+
+ private static boolean MATE_BY_OVERLAP=true;
+// private static boolean SKIP_MATED_READS=false;
+// private static boolean OUTPUT_FAILED=true;
+ /** If true, pairs that did not merge are routed to the same list as merged ones */
+ private static boolean MIX_BAD_AND_GOOD=false;
+ private static boolean NONZERO_ONLY=true;
+ private static boolean showHistStats=true;
+
+ private static boolean overwrite=true;
+ private static boolean append=false;
+ /** Compile-time switch for the diagnostic messages written to System.err */
+ private static final boolean verbose=false;
+
+ private static boolean iupacToN=false;
+
+ private static int THREADS=-1;
+ private static String version="8.9";
+
+}
diff --git a/current/jgi/BBMergeOverlapper.java b/current/jgi/BBMergeOverlapper.java
new file mode 100755
index 0000000..393ad8c
--- /dev/null
+++ b/current/jgi/BBMergeOverlapper.java
@@ -0,0 +1,845 @@
+package jgi;
+
+import java.util.Arrays;
+import java.io.File;
+
+import stream.Read;
+import ukmer.Kmer;
+import align2.Tools;
+import dna.AminoAcid;
+import align2.Shared;
+import assemble.Tadpole;
+import assemble.Tadpole1;
+import assemble.Tadpole2;
+
+/**
+ * @author Brian Bushnell
+ * @date Apr 15, 2014
+ *
+ */
+public final class BBMergeOverlapper {
+
+ static {
+ if(Shared.USE_JNI){
+ String name = "bbtoolsjni";
+ try {
+ System.loadLibrary(name);
+ } catch (UnsatisfiedLinkError e1) {
+ // System.loadLibrary() does not work with MPI.
+ // Need to use System.load() with an explicit full
+ // path to the native library file for the MPI case.
+ boolean success = false;
+ String libpath=System.getProperty("java.library.path");
+ libpath = libpath.replace("-Djava.library.path=","");
+ String[] libpathEntries = libpath.split(File.pathSeparator);
+ for(int i = 0; i < libpathEntries.length; i++) {
+ if(success) break;
+ String lib = libpathEntries[i]+"/"+System.mapLibraryName(name);
+ try {
+ System.load(lib);
+ success = true;
+ } catch (UnsatisfiedLinkError e2) {
+ success = false;
+ if((i+1) >= libpathEntries.length) {
+ throw new RuntimeException("\n\n***** Native library can not be found in java.library.path. *****\n");
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Native overlap search; mateByOverlap() calls this instead of
+ * mateByOverlapJava_unrolled() when Shared.USE_JNI is set, passing the raw
+ * base and quality arrays of both reads.
+ * NOTE(review): presumably implemented in the "bbtoolsjni" library loaded by the static initializer - confirm.
+ */
+ private static native final int mateByOverlapJNI(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality,
+ float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int margin,
+ int maxMismatches0, int maxMismatches, int minq);
+
+ /**
+ * Native ratio-mode overlap search using base qualities; mateByOverlapRatio() calls this
+ * instead of mateByOverlapRatioJava_WithQualities() when Shared.USE_JNI is set and both reads have qualities.
+ */
+ private static native final int mateByOverlapRatioJNI_WithQualities(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality,
+ float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio,
+ float margin, float offset);
+
+ /**
+ * Native ratio-mode overlap search without qualities; mateByOverlapRatio() calls this
+ * instead of mateByOverlapRatioJava() when Shared.USE_JNI is set and qualities are absent or not requested.
+ */
+ private static native final int mateByOverlapRatioJNI(byte[] a_bases, byte[] b_bases,
+ int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio,
+ float margin, float offset, float gIncr, float bIncr);
+
+ /**
+ * Mismatch-count overlap search for a read pair.
+ * Delegates to the native implementation when Shared.USE_JNI is set and to
+ * mateByOverlapJava_unrolled() otherwise.
+ * @param rvector Result vector filled by the implementation; a fresh int[5] is substituted when null
+ * @return The value returned by the selected implementation
+ */
+ protected static final int mateByOverlap(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
+ int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq) {
+ final int[] results=(rvector!=null ? rvector : new int[5]);
+ if(!Shared.USE_JNI){
+ return mateByOverlapJava_unrolled(a, b, aprob, bprob, results, minOverlap0, minOverlap, minInsert0, margin, maxMismatches0, maxMismatches, minq);
+ }
+ return mateByOverlapJNI(a.bases, b.bases, a.quality, b.quality, aprob, bprob, results, minOverlap0, minOverlap, minInsert0, margin, maxMismatches0, maxMismatches, minq);
+ }
+
+ /**
+ * Ratio-mode overlap search for a read pair.
+ * Picks one of four implementations: native or Java (by Shared.USE_JNI), and
+ * quality-aware or quality-free. The quality-aware variant is used only when
+ * useQuality is set and both reads carry quality arrays; gIncr and bIncr are
+ * passed only to the quality-free variants.
+ * @param rvector Result vector filled by the implementation; a fresh int[5] is substituted when null
+ * @return The value returned by the selected implementation
+ */
+ protected static final int mateByOverlapRatio(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
+ int minOverlap0, int minOverlap, int minInsert0, int minInsert, final float maxRatio, final float margin, final float offset,
+ final float gIncr, final float bIncr, boolean useQuality) {
+ final int[] results=(rvector!=null ? rvector : new int[5]);
+ final boolean withQualities=(useQuality && a.quality!=null && b.quality!=null);
+
+ if(Shared.USE_JNI){
+ if(withQualities){
+ return mateByOverlapRatioJNI_WithQualities(a.bases, b.bases, a.quality, b.quality, aprob, bprob, results, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset);
+ }
+ return mateByOverlapRatioJNI(a.bases, b.bases, results, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset, gIncr, bIncr);
+ }
+
+ if(withQualities){
+ return mateByOverlapRatioJava_WithQualities(a, b, aprob, bprob, results, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset);
+ }
+ return mateByOverlapRatioJava(a, b, results, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset, gIncr, bIncr);
+ }
+
+ protected static final int mateByOverlapRatioJava_WithQualities(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
+ int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float margin, final float offset) {
+ assert(rvector!=null);
+ assert(margin>=1);
+ minOverlap=Tools.max(4, minOverlap0, minOverlap);
+ minOverlap0=Tools.mid(4, minOverlap0, minOverlap);
+ if(rvector==null){rvector=new int[5];}
+
+ final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality;
+ final int alen=abases.length, blen=bbases.length;
+ final int minLength=Tools.min(alen, blen);
+
+ assert(aqual!=null && bqual!=null);
+ {
+ for(int i=0; i<aqual.length; i++){aprob[i]=probCorrect[aqual[i]];}
+ for(int i=0; i<bqual.length; i++){bprob[i]=probCorrect[bqual[i]];}
+ }
+
+ {
+ float x=findBestRatio_WithQualities(a, b, aprob, bprob, minOverlap0, minOverlap, minInsert, maxRatio, offset);
+ if(x>maxRatio){
+ rvector[2]=minLength;
+ rvector[4]=0;
+ return -1;
+ }
+ maxRatio=Tools.min(maxRatio, x);
+ }
+
+ final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1;
+ final float margin2=(margin+offset)/minLength;
+
+ int bestInsert=-1;
+ float bestBad=minLength;
+ float bestRatio=1;
+ boolean ambig=false;
+
+ final int largestInsertToTest=(alen+blen-minOverlap0);
+ final int smallestInsertToTest=minInsert0;
+ for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+ if(verbose){System.err.println("\nTesting read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));}
+
+ float good=0, bad=0;
+
+ final int istart=(insert<=blen ? 0 : insert-blen);
+ final int jstart=(insert>=blen ? 0 : blen-insert);
+
+ final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
+ final float badlimit=Tools.min(altBadlimit, Tools.min(bestRatio, maxRatio)*margin*overlapLength);
+
+ final int imax=istart+overlapLength;
+ for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
+ ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
+ ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
+ final byte ca=abases[i], cb=bbases[j];
+
+ final float x=aprob[i]*bprob[j];
+
+ if(ca==cb){good+=x;}
+ else{bad+=x;}
+ }
+
+// if(verbose || true){
+// System.err.println("istart="+istart+", jstart="+jstart+", overlapLength="+overlapLength+", overlap="+overlap+", bestOverlap="+bestOverlap);
+// System.err.println("overlap="+overlap+", bad="+bad+", good="+good);
+// System.err.println("bestGood="+bestGood+", bestBad="+bestBad);
+// System.err.println();
+// }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ rvector[2]=(int)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+
+ float ratio=(bad+offset)/overlapLength;
+// System.err.println("*** ratio="+ratio+", bestRatio="+bestRatio);
+
+ if(ratio<bestRatio*margin){
+
+ ambig=(ratio*margin>=bestRatio || good<minOverlap);
+ if(ratio<bestRatio){
+ bestInsert=insert;
+ bestBad=bad;
+ bestRatio=ratio;
+ }
+ if(ambig && bestRatio<margin2){
+ rvector[2]=(int)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+ }
+ }
+ }
+
+ if(!ambig && bestRatio>maxRatio){bestInsert=-1;}
+
+ rvector[2]=(int)bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+
+// System.err.println("***C : "+bestOverlap+", "+ambig+", "+bestBad+", "+(bestOverlap<0 ? -1 : alen+blen-bestOverlap)+", "+
+// (bestOverlap<0 ? -1 : (bestOverlap<alen && alen>=blen) ? bestOverlap+alen-blen : bestOverlap)+", "+alen+", "+blen);
+
+ return (bestInsert<0 ? -1 : bestInsert);
+ }
+
+ /**
+ * Pre-scan that finds the lowest quality-weighted mismatch ratio over candidate insert sizes.
+ * Insert sizes are scanned from (alen+blen-minOverlap) down to minInsert. Matches and mismatches
+ * are weighted by aprob[i]*bprob[j]; a candidate's ratio is (bad+offset)/overlapLength.
+ * @param aprob Per-base weights for read a (read only)
+ * @param bprob Per-base weights for read b (read only)
+ * @param maxRatio Starting threshold; the search begins from maxRatio+0.0001
+ * @return The best ratio found (maxRatio+0.0001 if nothing beat it), or 100 when a mismatch-free
+ * overlap has good weight strictly between minOverlap0 and minOverlap.
+ * Returns early once a candidate has good>=minOverlap and a ratio below maxRatio/2.
+ */
+ protected static final float findBestRatio_WithQualities(Read a, Read b, final float[] aprob, final float[] bprob,
+ final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset) {
+ final byte[] abases=a.bases, bbases=b.bases;
+ final int alen=abases.length, blen=bbases.length;
+
+ float bestRatio=maxRatio+0.0001f;
+ final float halfmax=maxRatio*0.5f;
+
+
+ final int largestInsertToTest=(alen+blen-minOverlap); //TODO: test speed with minOverlap0
+ final int smallestInsertToTest=minInsert;
+ for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+ //The loop variable is the insert size; labels were previously swapped in this trace.
+ if(verbose){System.err.println("\nTesting read "+a.numericID+", insert "+insert+", overlap "+(alen+blen-insert));}
+
+ final int istart=(insert<=blen ? 0 : insert-blen);
+ final int jstart=(insert>=blen ? 0 : blen-insert);
+ final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
+
+ //Anything worse than the current best ratio cannot improve the result.
+ final float badlimit=bestRatio*overlapLength;
+ float good=0, bad=0;
+
+ final int imax=istart+overlapLength;
+ for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
+ ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
+ ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
+ final byte ca=abases[i], cb=bbases[j];
+
+ final float x=aprob[i]*bprob[j];
+
+ if(ca==cb){good+=x;}
+ else{bad+=x;}
+ }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ return 100f;
+ }
+
+ float ratio=(bad+offset)/overlapLength;
+
+ if(ratio<bestRatio){
+ bestRatio=ratio;
+ if(good>=minOverlap && ratio<halfmax){return bestRatio;}
+ }
+ }
+ }
+
+ return bestRatio;
+ }
+
+ protected static final int mateByOverlapRatioJava(Read a, Read b, int[] rvector,
+ int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float margin, final float offset, final float gIncr, final float bIncr) {
+ assert(rvector!=null);
+ assert(margin>=1);
+ minOverlap=Tools.max(4, minOverlap0, minOverlap);
+ minOverlap0=Tools.mid(4, minOverlap0, minOverlap);
+ if(rvector==null){rvector=new int[5];}
+
+ final byte[] abases=a.bases, bbases=b.bases;
+ final int alen=abases.length, blen=bbases.length;
+ final int minLength=Tools.min(alen, blen);
+
+ {
+ float x=findBestRatio(a, b, minOverlap0, minOverlap, minInsert, maxRatio, offset, gIncr, bIncr);
+ if(x>maxRatio){
+ rvector[2]=minLength;
+ rvector[4]=0;
+ return -1;
+ }
+ maxRatio=Tools.min(maxRatio, x);
+ }
+
+ final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1;
+ final float margin2=(margin+offset)/minLength;
+ final byte N='N';
+
+ int bestInsert=-1;
+ float bestBad=minLength;
+ float bestRatio=1;
+ boolean ambig=false;
+
+ final int largestInsertToTest=(alen+blen-minOverlap0);
+ final int smallestInsertToTest=minInsert0;
+ for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+ if(verbose){System.err.println("\nTesting read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));}
+
+ final int istart=(insert<=blen ? 0 : insert-blen);
+ final int jstart=(insert>=blen ? 0 : blen-insert);
+ final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
+
+ final float badlimit=Tools.min(altBadlimit, Tools.min(bestRatio, maxRatio)*margin*overlapLength);
+ float good=0, bad=0;
+
+ final int imax=istart+overlapLength;
+ for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
+ ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
+ ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
+ final byte ca=abases[i], cb=bbases[j];
+
+ if(ca==cb){
+ if(ca!=N){good+=gIncr;}
+ }else{bad+=bIncr;}
+ }
+
+// if(verbose || true){
+// System.err.println("istart="+istart+", jstart="+jstart+", overlapLength="+overlapLength+", overlap="+overlap+", bestOverlap="+bestOverlap);
+// System.err.println("overlap="+overlap+", bad="+bad+", good="+good);
+// System.err.println("bestGood="+bestGood+", bestBad="+bestBad);
+// System.err.println();
+// }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ rvector[2]=(int)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+
+ float ratio=(bad+offset)/overlapLength;
+// System.err.println("*** ratio="+ratio+", bestRatio="+bestRatio);
+
+ if(ratio<bestRatio*margin){
+
+ ambig=(ratio*margin>=bestRatio || good<minOverlap);
+ if(ratio<bestRatio){
+ bestInsert=insert;
+ bestBad=bad;
+ bestRatio=ratio;
+ }
+ if(ambig && bestRatio<margin2){
+ rvector[2]=(int)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+ }
+ }
+ }
+
+ if(!ambig && bestRatio>maxRatio){bestInsert=-1;}
+
+ rvector[2]=(int)bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+
+// System.err.println("***C : "+bestOverlap+", "+ambig+", "+bestBad+", "+(bestOverlap<0 ? -1 : alen+blen-bestOverlap)+", "+
+// (bestOverlap<0 ? -1 : (bestOverlap<alen && alen>=blen) ? bestOverlap+alen-blen : bestOverlap)+", "+alen+", "+blen);
+
+ return (bestInsert<0 ? -1 : bestInsert);
+ }
+
+ /**
+ * Pre-scan that finds the lowest mismatch ratio over candidate insert sizes, without qualities.
+ * Insert sizes are scanned from (alen+blen-minOverlap) down to minInsert. Each equal non-N base
+ * pair adds gIncr to "good"; each unequal pair adds bIncr to "bad". A candidate's ratio is
+ * (bad+offset)/overlapLength.
+ * @param maxRatio Starting threshold; the search begins from maxRatio+0.0001
+ * @return The best ratio found (maxRatio+0.0001 if nothing beat it), or 100 when a mismatch-free
+ * overlap has a good score strictly between minOverlap0 and minOverlap.
+ * Returns early once a candidate has good>=minOverlap and a ratio below maxRatio/2.
+ */
+ protected static final float findBestRatio(Read a, Read b,
+ final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset, final float gIncr, final float bIncr) {
+ final byte[] abases=a.bases, bbases=b.bases;
+ final int alen=abases.length, blen=bbases.length;
+
+ float bestRatio=maxRatio+0.0001f;
+ final float halfmax=maxRatio*0.5f;
+ final byte N='N';
+
+
+ final int largestInsertToTest=(alen+blen-minOverlap); //TODO: test speed with minOverlap0
+ final int smallestInsertToTest=minInsert;
+ for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+ //The loop variable is the insert size; labels were previously swapped in this trace.
+ if(verbose){System.err.println("\nTesting read "+a.numericID+", insert "+insert+", overlap "+(alen+blen-insert));}
+
+ final int istart=(insert<=blen ? 0 : insert-blen);
+ final int jstart=(insert>=blen ? 0 : blen-insert);
+ final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
+
+ //Anything worse than the current best ratio cannot improve the result.
+ final float badlimit=bestRatio*overlapLength;
+ float good=0, bad=0;
+
+ final int imax=istart+overlapLength;
+ for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
+ ", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
+ ", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
+ final byte ca=abases[i], cb=bbases[j];
+
+ if(ca==cb){
+ //Matching N's count as neither good nor bad.
+ if(ca!=N){good+=gIncr;}
+ }else{bad+=bIncr;}
+ }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ return 100f;
+ }
+
+ float ratio=(bad+offset)/overlapLength;
+
+ if(ratio<bestRatio){
+ bestRatio=ratio;
+ if(good>=minOverlap && ratio<halfmax){return bestRatio;}
+ }
+ }
+ }
+
+ return bestRatio;
+ }
+
+ /**
+ * Mismatch-count overlap search (pure Java).
+ * Overlap lengths are scanned upward from max(minOverlap0, 0) to
+ * alen+blen-max(minOverlap, minInsert0), exclusive. For each overlap the tail of read a
+ * is compared with the head of read b; positions whose aprob*bprob is at or below the
+ * probability for minq are skipped, and the rest are counted as good (equal) or bad (unequal).
+ * @param aprob Scratch array, overwritten with probCorrect[quality] for read a, or 0.98 when qualities are missing
+ * @param bprob Scratch array, overwritten likewise for read b
+ * @param rvector Output: [2] receives the best bad count; [4] receives 1 if ambiguous, else 0
+ * @param minOverlap0 Smallest overlap tested; forced into the range [1, minOverlap]
+ * @param minOverlap A candidate must have strictly more than this many good positions
+ * @param minInsert0 Together with minOverlap, bounds the largest overlap tested
+ * @param margin Ambiguity margin; negative values are treated as 0
+ * @param maxMismatches0 Initial value of the best bad count
+ * @param maxMismatches Final limit: a non-ambiguous result with more than maxMismatches-margin bad positions is rejected
+ * @param minq Quality whose probCorrect entry (index clamped via Tools.mid(1, minq, 41)) is the skip threshold
+ * @return The insert size alen+blen-bestOverlap, or -1 if no overlap was accepted or the result was judged ambiguous
+ */
+ protected static final int mateByOverlapJava_unrolled(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
+ int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq) {
+ assert(rvector!=null);
+ minOverlap0=Tools.min(Tools.max(1, minOverlap0), minOverlap);
+ assert(maxMismatches<=maxMismatches0);
+ margin=Tools.max(margin, 0);
+ assert(maxMismatches>=margin);
+
+ final byte[] abases=a.bases, bbases=b.bases;
+ final byte[] aqual=a.quality, bqual=b.quality;
+ final int alen=abases.length, blen=bbases.length;
+
+ int bestOverlap=-1;
+ int bestGood=-1;
+ int bestBad=maxMismatches0;
+
+ boolean ambig=false;
+ final int maxOverlap=alen+blen-Tools.max(minOverlap, minInsert0);
+
+ if(aqual!=null && bqual!=null){
+ for(int i=0; i<aqual.length; i++){aprob[i]=probCorrect[aqual[i]];}
+ for(int i=0; i<bqual.length; i++){bprob[i]=probCorrect[bqual[i]];}
+ }else{
+ for(int i=0; i<alen; i++){aprob[i]=0.98f;}
+ for(int i=0; i<blen; i++){bprob[i]=0.98f;}
+ }
+
+ final float minprob=probCorrect[Tools.mid(1, minq, 41)];
+
+ for(int overlap=Tools.max(minOverlap0, 0); overlap<maxOverlap; overlap++){
+ if(verbose){System.err.println("\nTesting read "+a.numericID+", overlap "+overlap+", insert "+(alen+blen-overlap));}
+
+ int good=0, bad=0;
+
+ //i indexes read b, j indexes read a.
+ int istart=(overlap<=alen ? 0 : overlap-alen);
+ int jstart=(overlap<=alen ? alen-overlap : 0);
+
+ {
+ final int iters=Tools.min(overlap-istart, blen-istart, alen-jstart);
+ final int imax=istart+iters;
+ final int badlim=bestBad+margin;
+
+ for(int i=istart, j=jstart; i<imax && bad<=badlim; i++, j++){
+ //Strict upper bounds: j and i are used as array indices on the next line.
+ assert(j>=0 && j<alen && i>=0 && i<blen) : "\njstart="+jstart+", j="+j+
+ ", istart="+istart+", i="+i+" \n"+"overlap="+overlap+", a.length="+alen+
+ ", b.length="+blen+", bad="+bad+", badlim="+badlim+", good="+good;
+ final byte ca1=abases[j], cb1=bbases[i];
+ final float pc=aprob[j]*bprob[i];
+
+ if(pc<=minprob){//do nothing
+ }else if(ca1==cb1){good++;}
+ else{bad++;}
+ }
+
+ if(verbose){
+ final int overlapLen=(imax-istart);
+ System.err.println("overlapLen="+overlapLen+"; coordinates ("+jstart+"-"+(jstart+overlapLen)+"), ("+istart+"-"+imax+")");
+ System.err.println(new String(abases, jstart, overlapLen));
+ System.err.println(new String(bbases, istart, overlapLen));
+ }
+
+ if(verbose){System.err.println("overlap="+overlap+", bad="+bad+", good="+good+", badlim="+badlim+", bestOverlap="+
+ bestOverlap+", bestGood="+bestGood+", bestBad="+bestBad+", ambig="+ambig+", mino="+minOverlap+", mino0="+minOverlap0+
+ ", margin="+margin+", maxMismatches="+maxMismatches);}
+ }
+
+ if(bad*2<good){
+ if(verbose){System.err.print("a");}
+ if(good>minOverlap){//Candidate
+ if(verbose){System.err.print("b");}
+ if(bad<=bestBad){
+
+ if(verbose){System.err.print("c");}
+ if(bad<bestBad || (bad==bestBad && good>bestGood)){//Current winner
+ if(verbose){System.err.print("d");}
+ if(bestBad-bad<margin){ambig=true;}
+ bestOverlap=overlap;
+ bestBad=bad;
+ bestGood=good;
+ }else if(bad==bestBad){
+ if(verbose){System.err.print("e");}
+ ambig=true;
+ }
+
+ if(ambig && bestBad<margin){
+ if(verbose){System.err.print("f");}
+ rvector[2]=bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+ return -1;
+ }
+ }
+ }else if(bad<margin){
+ //Too few good positions to be a candidate, but so few mismatches that the pair is called ambiguous.
+ if(verbose){System.err.print("g");}
+ ambig=true;
+ rvector[2]=bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+ return -1;
+ }else{
+ if(verbose){System.err.print("h");}
+ }
+ }
+ }
+
+ if(!ambig && bestBad>maxMismatches-margin){bestOverlap=-1;}
+
+ rvector[2]=bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+
+ if(verbose){System.err.println("bestOverlap="+
+ bestOverlap+", bestGood="+bestGood+", bestBad="+bestBad+", ambig="+ambig+", mino="+minOverlap+", mino0="+minOverlap0+
+ ", margin="+margin+", maxMismatches="+maxMismatches);}
+
+ return (bestOverlap<0 ? -1 : alen+blen-bestOverlap);
+ }
+
+
+ /**
+ * Sums, over the compared positions, the probability that at least one of the two
+ * base calls is wrong: 1 - probCorrect2[qa]*probCorrect2[qb]. Positions where either
+ * base is 'N' are skipped.
+ * TODO Use this
+ * @param a First read
+ * @param b Second read
+ * @param overlap Value used to derive the start offsets and to bound the scan of read a
+ * (NOTE(review): the offsets match those of probability(a, b, insert) for equal-length reads - confirm intended meaning)
+ * @return Expected number of mismatches; overlap/16 as a fraction when either read lacks qualities
+ */
+ protected static final float expectedMismatches(Read a, Read b, int overlap) {
+
+ final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality;
+ final int alen=abases.length, blen=bbases.length;
+ final int istart=(overlap<=blen ? 0 : overlap-blen);
+ final int jstart=(overlap<=alen ? alen-overlap : 0);
+
+ float expected=0;
+
+ //Floating-point division: integer division would truncate the estimate (e.g. 20/16 -> 1 instead of 1.25).
+ if(aqual==null || bqual==null){return overlap/16f;}
+
+ for(int i=istart, j=jstart; i<overlap && i<alen && j<blen; i++, j++){
+ final byte ca=abases[i], cb=bbases[j];
+ final byte qa=aqual[i], qb=bqual[j];
+
+ if(ca=='N' || cb=='N'){
+ //do nothing
+ }else{
+ assert(AminoAcid.isFullyDefined(ca) && AminoAcid.isFullyDefined(cb)) :
+ "A non-ACGTN base was detected. Please rerun with the flag 'itn'.\n"+(char)ca+", "+(char)cb+"\n";
+ float probC=probCorrect2[qa]*probCorrect2[qb];
+ float probE=1-probC;
+ expected+=probE;
+ }
+ }
+
+ return expected;
+ }
+
+ /**
+ * Attempt at quantifying the probability of the observed match/mismatch pattern at a given insert size.
+ * For every compared position with no 'N', the chance of the bases matching is taken as
+ * probC+(1-probC)/4, where probC is probCorrect2[qa]*probCorrect2[qb]. The product of the
+ * probabilities of what was actually observed is divided by the product of the more likely
+ * outcome at each position.
+ * TODO: This returns an incorrect answer if reads are unequal lengths.
+ * @param a First read
+ * @param b Second read
+ * @param insert Insert size to evaluate
+ * @return Square root of the probability quotient; 1 when either read lacks qualities
+ */
+ protected static final float probability(Read a, Read b, int insert) {
+ final byte[] basesA=a.bases, basesB=b.bases, qualA=a.quality, qualB=b.quality;
+ final int lenA=basesA.length, lenB=basesB.length;
+
+ if(qualA==null || qualB==null){return 1;}
+
+ float observed=1;
+ float likeliest=1;
+
+ int i=(insert<=lenB ? 0 : insert-lenB);
+ int j=(insert>=lenB ? 0 : lenB-insert);
+ while(i<insert && i<lenA && j<lenB){
+ final byte ca=basesA[i], cb=basesB[j];
+ final byte qa=qualA[i], qb=qualB[j];
+
+ if(ca!='N' && cb!='N'){
+ assert(AminoAcid.isFullyDefined(ca) && AminoAcid.isFullyDefined(cb)) :
+ "A non-ACGTN base was detected. Please rerun with the flag 'itn'.\n"+(char)ca+", "+(char)cb+"\n";
+ final float probC=probCorrect2[qa]*probCorrect2[qb];
+ final float probM=probC+(1-probC)*0.25f; //probability of matching
+ final float probE=1-probM;
+
+ assert(probM>0) : qa+", "+qb+", "+probC+", "+probM+", "+probE;
+ assert(probE>0) : qa+", "+qb+", "+probC+", "+probM+", "+probE;
+
+ likeliest*=Tools.max(probM, probE);
+ if(ca==cb){
+ observed*=probM;
+ }else{
+ observed*=probE;
+ }
+ }
+ i++;
+ j++;
+ }
+
+ assert(observed<=likeliest);
+
+ //sqrt is just so people don't need to type so many zeros.
+ final double quotient=observed/likeliest;
+ return (float)Math.sqrt(quotient);
+ }
+
+ /**
+ * Dispatches to the Tadpole1 overload when k is below 32 and to the Tadpole2 overload otherwise,
+ * casting the supplied Tadpole accordingly.
+ * @return The result of the selected overload
+ */
+ protected static int minCoverage(final Read r, final Tadpole tadpole, final int k, int cutoff){
+ return (k<32) ? minCoverage(r, (Tadpole1)tadpole, k, cutoff) : minCoverage(r, (Tadpole2)tadpole, k, cutoff);
+ }
+
+ /**
+ * Walks the read with a rolling forward and reverse-complement k-mer (2 bits per base)
+ * and looks up each full-length k-mer's count in the Tadpole1 table.
+ * @param r Read to scan
+ * @param tadpole Source of k-mer counts
+ * @param k K-mer length
+ * @param cutoff Starting minimum; the scan stops at the first count below it
+ * @return The first count found below cutoff, otherwise cutoff (also when the read has no bases or is shorter than k)
+ */
+ protected static int minCoverage(final Read r, final Tadpole1 tadpole, final int k, int cutoff){
+ final byte[] seq=r.bases;
+ if(seq==null || seq.length<k){return cutoff;}
+
+ final int bits=2*k;
+ final int topShift=bits-2;
+ final long kmerMask=~((-1L)<<bits);
+ long fwd=0, rev=0;
+ int valid=0;
+ int lowest=cutoff;
+
+ for(final byte base : seq){
+ final long num=AminoAcid.baseToNumber[base];
+ final long cnum=AminoAcid.baseToComplementNumber[base];
+
+ //Roll both k-mers by one base
+ fwd=((fwd<<2)|num)&kmerMask;
+ rev=(rev>>>2)|(cnum<<topShift);
+
+ //A negative code restarts the k-mer
+ if(num>=0){
+ valid++;
+ }else{
+ valid=0;
+ fwd=0;
+ rev=0;
+ }
+
+ if(valid>=k){
+ final int coverage=tadpole.getCount(fwd, rev);
+ lowest=Tools.min(lowest, coverage);
+ if(lowest<cutoff){return lowest;}
+ }
+ }
+
+ return lowest;
+ }
+
+ /**
+ * Same scan as the Tadpole1 overload, but the rolling k-mer is kept in a Kmer object
+ * and counts are looked up in the Tadpole2 table.
+ * @param r Read to scan
+ * @param tadpole Source of k-mer counts
+ * @param k K-mer length
+ * @param cutoff Starting minimum; the scan stops at the first count below it
+ * @return The first count found below cutoff, otherwise cutoff (also when the read has no bases or is shorter than k)
+ */
+ protected static int minCoverage(final Read r, final Tadpole2 tadpole, final int k, int cutoff){
+ final byte[] seq=r.bases;
+ if(seq==null || seq.length<k){return cutoff;}
+
+ final Kmer rolling=new Kmer(k);
+ assert(rolling!=null);
+ int lowest=cutoff;
+
+ for(final byte base : seq){
+ //Extend the k-mer by one base on the right
+ rolling.addRight(base);
+
+ if(rolling.len>=k){
+ final int coverage=tadpole.getCount(rolling);
+ lowest=Tools.min(lowest, coverage);
+ if(lowest<cutoff){return lowest;}
+ }
+ }
+
+ return lowest;
+ }
+
+ protected static int calcMinOverlapByEntropy(byte[] bases, int k, short[] counts, int minscore){
+ return Tools.max(calcMinOverlapByEntropyTail(bases, k, counts, minscore), calcMinOverlapByEntropyHead(bases, k, counts, minscore));
+// return calcMinOverlapByEntropyTail(bases, k, counts, minscore);
+ }
+
+ /**
+ * Scans the sequence from its last base toward its first, counting k-mers, until the
+ * score ones*4+twos reaches minscore. "ones" is the number of k-mers seen for the first
+ * time and "twos" the number seen for the second time. An undefined base restarts the k-mer.
+ * @param bases Sequence to scan
+ * @param k K-mer length (2 bits per base in an int)
+ * @param counts Scratch array of at least 4^k entries, zeroed here; when null, a per-thread array is used
+ * @param minscore Score at which the scan stops
+ * @return The zero-based step index at which the score was reached, or bases.length+1 if it never was
+ */
+ protected static int calcMinOverlapByEntropyTail(byte[] bases, int k, short[] counts, int minscore){
+ final int bits=2*k;
+ final int mask=~((-1)<<(bits));
+ int kmer=0, len=0, ones=0, twos=0;
+
+ if(counts==null){
+ counts=localKmerCounts.get();
+ //The cached array may have been sized for a smaller k by an earlier call on this thread.
+ if(counts==null || counts.length<(1<<(bits))){
+ counts=new short[1<<(bits)];
+ localKmerCounts.set(counts);
+ }
+ }
+
+ Arrays.fill(counts, (short)0);
+
+ //i counts steps taken; j is the position being read, moving backward.
+ for(int i=0, j=bases.length-1; i<bases.length; i++, j--){
+ final byte b=bases[j];
+ if(!AminoAcid.isFullyDefined(b)){
+ len=0;
+ kmer=0;
+ }else{
+ len++;
+ final int n=Dedupe.baseToNumber[b];
+ kmer=((kmer<<2)|n)&mask;
+
+ if(len>=k){
+ counts[kmer]++;
+ if(counts[kmer]==1){ones++;}
+ else if(counts[kmer]==2){twos++;}
+ if(ones*4+twos>=minscore){return i;}
+ }
+ }
+ }
+ return bases.length+1;
+ }
+
+ /**
+ * Scans the sequence from its first base toward its last, counting k-mers, until the
+ * score ones*4+twos reaches minscore. "ones" is the number of k-mers seen for the first
+ * time and "twos" the number seen for the second time. An undefined base restarts the k-mer.
+ * @param bases Sequence to scan
+ * @param k K-mer length (2 bits per base in an int)
+ * @param counts Scratch array of at least 4^k entries, zeroed here; when null, a per-thread array is used
+ * @param minscore Score at which the scan stops
+ * @return The zero-based index at which the score was reached, or bases.length+1 if it never was
+ */
+ protected static int calcMinOverlapByEntropyHead(byte[] bases, int k, short[] counts, int minscore){
+ final int bits=2*k;
+ final int mask=~((-1)<<(bits));
+ int kmer=0, len=0, ones=0, twos=0;
+
+ if(counts==null){
+ counts=localKmerCounts.get();
+ //The cached array may have been sized for a smaller k by an earlier call on this thread.
+ if(counts==null || counts.length<(1<<(bits))){
+ counts=new short[1<<(bits)];
+ localKmerCounts.set(counts);
+ }
+ }
+
+ Arrays.fill(counts, (short)0);
+
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ if(!AminoAcid.isFullyDefined(b)){
+ len=0;
+ kmer=0;
+ }else{
+ len++;
+ final int n=Dedupe.baseToNumber[b];
+ kmer=((kmer<<2)|n)&mask;
+
+ if(len>=k){
+ counts[kmer]++;
+ if(counts[kmer]==1){ones++;}
+ else if(counts[kmer]==2){twos++;}
+ if(ones*4+twos>=minscore){return i;}
+ }
+ }
+ }
+ return bases.length+1;
+ }
+
+ /** Per-thread scratch array of k-mer counts, used by calcMinOverlapByEntropyTail/Head when the caller passes counts==null. */
+ private static ThreadLocal<short[]> localKmerCounts=new ThreadLocal<short[]>();
+
+ //The three multipliers below are not referenced anywhere else in this class.
+ private static final int BAD_MULT=6;
+ private static final int GOOD_MULT_1=8;
+ private static final int GOOD_MULT_2=400;
+
+ /** Compile-time switch for the System.err debug traces in the overlap methods. */
+ protected static final boolean verbose=false;
+
+ private static final float[] probCorrect=
+ {0.000f, 0.251f, 0.369f, 0.499f, 0.602f, 0.684f, 0.749f, 0.800f, 0.842f, 0.874f, 0.900f, 0.921f, 0.937f, 0.950f, 0.960f, 0.968f,
+ 0.975f, 0.980f, 0.984f, 0.987f, 0.990f, 0.992f, 0.994f, 0.995f, 0.996f, 0.997f, 0.997f, 0.998f, 0.998f, 0.999f, 0.999f, 0.999f,
+ 0.999f, 0.999f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+ private static final float[] probCorrect2=
+ {0.0000f, 0.2501f, 0.3690f, 0.4988f, 0.6019f, 0.6838f, 0.7488f, 0.8005f, 0.8415f, 0.8741f, 0.9000f, 0.9206f, 0.9369f, 0.9499f,
+ 0.9602f, 0.9684f, 0.9749f, 0.9800f, 0.9842f, 0.9874f, 0.9900f, 0.9921f, 0.9937f, 0.9950f, 0.9960f, 0.9968f, 0.9975f, 0.9980f,
+ 0.9984f, 0.9987f, 0.9990f, 0.9992f, 0.9994f, 0.9995f, 0.9996f, 0.9997f, 0.9997f, 0.9998f, 0.9998f, 0.9999f, 0.9999f, 0.9999f,
+ 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f,
+ 0.9999f, 0.9999f, 0.9999f, 0.9999f};
+
+ private static final float[] probCorrect5=
+ {0.20000f, 0.20567f, 0.36904f, 0.49881f, 0.60189f, 0.68377f, 0.74881f, 0.80047f, 0.84151f, 0.87411f, 0.90000f, 0.92057f, 0.93690f,
+ 0.94988f, 0.96019f, 0.96838f, 0.97488f, 0.98005f, 0.98415f, 0.98741f, 0.99000f, 0.99206f, 0.99369f, 0.99499f, 0.99602f, 0.99684f,
+ 0.99749f, 0.99800f, 0.99842f, 0.99874f, 0.99900f, 0.99921f, 0.99937f, 0.99950f, 0.99960f, 0.99968f, 0.99975f, 0.99980f, 0.99984f,
+ 0.99987f, 0.99990f, 0.99992f, 0.99994f, 0.99995f, 0.99996f, 0.99997f, 0.99997f, 0.99998f, 0.99998f, 0.99999f, 0.99999f, 0.99999f,
+ 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f};
+
+}
diff --git a/current/jgi/BBQC.java b/current/jgi/BBQC.java
new file mode 100755
index 0000000..5d020d4
--- /dev/null
+++ b/current/jgi/BBQC.java
@@ -0,0 +1,1091 @@
+package jgi;
+
+import java.io.File;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.TimeZone;
+
+import stream.FASTQ;
+
+import dna.Data;
+import dna.Parser;
+
+import align2.BBMap;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import fileIO.ByteFile1;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * Wrapper for BBDukF, BBMap, and BBNorm to perform quality-control and artifact removal.
+ * @author Brian Bushnell
+ * @date Jan 20, 2013
+ *
+ */
+public class BBQC {
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Command-line entry point: sets the ReadWrite pigz/unpigz flags, then builds a BBQC
+ * from the arguments and runs it.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+
+ //Set both static compression flags before the arguments are parsed
+ ReadWrite.USE_PIGZ=true;
+ ReadWrite.USE_UNPIGZ=true;
+
+ //Construct the pipeline from the arguments and execute it immediately
+ new BBQC(args).process();
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ BBQC(String[] args){
+
+ //Optional default parameters to match current pipeline
+// arglist.add("k=22");
+// arglist.add("maxbadkmers=2");
+
+ //Symbols to insert in output filename to denote operations performed; may be overriden from command line
+ String symbols_=null;//"filtered"
+ int passes_=-1;
+
+ //Parse argument list
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Expect key=value pairs
+ String a=split[0].toLowerCase(); //All keys are converted to lower case
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ if(a.equals("pigz")){
+ pigz=b;
+ }else if(a.equals("unpigz")){
+ unpigz=b;
+ }else if(a.equals("zl") || a.equals("ziplevel")){
+ zl=b;
+ }
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ primaryArgList.add(arg);
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+ in1=b;
+ }else if(a.equals("in2") || a.equals("input2")){
+ in2=b;
+ }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+ out1=b;
+ }else if(a.equals("out2") || a.equals("output2")){
+ out2=b;
+ }else if(a.equals("qfin") || a.equals("qfin1")){
+ qfin1=b;
+ }else if(a.equals("qfout") || a.equals("qfout1")){
+ qfout1=b;
+ }else if(a.equals("qfin2")){
+ qfin2=b;
+ }else if(a.equals("qfout2")){
+ qfout2=b;
+ }else if(a.equals("ref")){
+ if(b!=null){
+ if(!b.contains(",") || new File(b).exists()){
+ filterrefs.add(b);
+ }else{
+ String[] split2=b.split(",");
+ for(String s2 : split2){
+ filterrefs.add(s2);
+ }
+ }
+ }
+ }else if(a.equals("artifactdb")){
+ mainArtifactFile=b;
+ }else if(a.equals("rnadb")){
+ artifactFileRna=b;
+ }else if(a.equals("dnadb")){
+ artifactFileDna=b;
+ }else if(a.equals("phixref")){
+ phixRef=b;
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){
+ minLen=Integer.parseInt(b);
+ }else if(a.equals("mlf") || a.equals("minlenfrac") || a.equals("minlenfraction") || a.equals("minlengthfraction")){
+ minLenFraction=Float.parseFloat(b);
+ }else if(a.equals("path") || a.equals("outdir")){
+ outDir=b;
+ }else if(a.equals("symbols")){
+ symbols_=b;
+ }else if(a.equals("overallstats") || a.equals("stats")){
+ rqcStatsName=b;
+ }else if(a.equals("scafstats")){
+ scaffoldStatsName=b;
+ }else if(a.equals("kmerstats")){
+ kmerStatsName=b;
+ }else if(a.equals("log")){
+ logName=b;
+ }else if(a.equals("filelist")){
+ fileListName=b;
+ }else if(a.equals("compress")){
+ compress=Tools.parseBoolean(b);
+ }else if(a.equals("rna")){
+ rnaFlag=Tools.parseBoolean(b);
+ }else if(a.equals("phix")){
+ phixFlag=Tools.parseBoolean(b);
+ }else if(a.equals("pjet")){
+ pjetFlag=Tools.parseBoolean(b);
+ }else if(a.equals("ktrim")){
+ ktrim=b;
+ }else if(a.equals("mink")){
+ mink=Integer.parseInt(b);
+ }else if(a.equals("k")){
+ assert(false) : "To specify kmer length, use filterk, trimk, mapk, or normalizek instead of just 'k'";
+ filter_k=Integer.parseInt(b);
+ }else if(a.equals("filterk")){
+ filter_k=Integer.parseInt(b);
+ }else if(a.equals("trimk")){
+ trim_k=Integer.parseInt(b);
+ }else if(a.equals("mapk")){
+ map_k=Integer.parseInt(b);
+ }else if(a.equals("normalizek") || a.equals("normk") || a.equals("ecck")){
+ normalize_k=Integer.parseInt(b);
+ }else if(a.equals("filterhdist")){
+ hdist_filter=Integer.parseInt(b);
+ }else if(a.equals("trimhdist")){
+ hdist_trim=Integer.parseInt(b);
+ }else if(a.equals("trimhdist2")){
+ hdist2_trim=Integer.parseInt(b);
+ }else if(a.equals("maq")){
+ if(b.indexOf(',')>-1){
+ String[] x=b.split(",");
+ assert(x.length==2) : "maq should be length 1 or 2 (at most 1 comma).\nFormat: maq=quality,bases; e.g. maq=10 or maq=10,20";
+ minAvgQuality=Byte.parseByte(x[0]);
+ minAvgQualityBases=Integer.parseInt(x[1]);
+ }else{
+ minAvgQuality=Byte.parseByte(b);
+ }
+ }else if(a.equals("forcetrimmod") || a.equals("forcemrimmodulo") || a.equals("ftm")){
+ forceTrimModulo=Integer.parseInt(b);
+ }else if(a.equals("trimq")){
+ trimq=Byte.parseByte(b);
+ }else if(a.equals("human") || a.equals("removehuman")){
+ removehuman=Tools.parseBoolean(b);
+ }else if(a.equals("normalize") || a.equals("norm")){
+ normalize=Tools.parseBoolean(b);
+ }else if(a.equals("ecc")){
+ ecc=Tools.parseBoolean(b);
+ }else if(a.equals("aec") || a.equals("aecc")){
+ aecc=Tools.parseBoolean(b);
+ if(aecc){ecc=true;}
+ }else if(a.equals("cecc")){
+ cecc=Tools.parseBoolean(b);
+ if(cecc){ecc=true;}
+ }else if(a.equals("markerrorsonly") || a.equals("meo")){
+ meo=Tools.parseBoolean(b);
+ }else if(a.equals("tam")){
+ tam=Tools.parseBoolean(b);
+ }else if(a.equals("taf")){
+ trimAfterFiltering=Tools.parseBoolean(b);
+ }else if(a.equals("mue")){
+ mue=Tools.parseBoolean(b);
+ }else if(a.equals("mw1")){
+ mw1=Tools.parseBoolean(b);
+ }else if(a.equals("max") || a.equals("maxdepth")){
+ maxdepth=Integer.parseInt(b);
+ }else if(a.equals("min") || a.equals("mindepth")){
+ mindepth=Integer.parseInt(b);
+ }else if(a.equals("target") || a.equals("targetdepth")){
+ target=Integer.parseInt(b);
+ }else if(a.equals("prehashes")){
+ prehashes=Integer.parseInt(b);
+ }else if(a.equals("passes")){
+ passes_=Integer.parseInt(b);
+ }else if(a.equals("hashes")){
+ hashes=Integer.parseInt(b);
+ }else if(a.equals("bits")){
+ bits=Integer.parseInt(b);
+ }else if(a.equals("minratio")){
+ minratio=Float.parseFloat(b);
+ }else if(a.equals("maxindel")){
+ maxindel=Integer.parseInt(b);
+ }else if(a.equals("kfilter")){
+ kfilter=Integer.parseInt(b);
+ }else if(a.equals("hits") || a.equals("minhits")){
+ minhits=Integer.parseInt(b);
+ }else if(a.equals("fast")){
+ fast=Tools.parseBoolean(b);
+ }else if(a.equals("local")){
+ local=Tools.parseBoolean(b);
+ }else if(a.equals("mappath") || a.equals("indexpath")){
+ indexPath=b;
+ }else if(a.equals("mapref")){
+ mapRef=b;
+ }else if(a.equals("qtrim")){
+ if(b==null){qtrim="rl";}
+ else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrim="l";}
+ else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrim="r";}
+ else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrim="lr";}
+ else if(Character.isDigit(b.charAt(0))){
+ trimq=Byte.parseByte(b);
+ qtrim=(trimq>=0 ? "lr" : "f");
+ }else{qtrim=""+Tools.parseBoolean(b);}
+ }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){
+ if(b!=null && (b.charAt(0)=='.' || Character.isDigit(b.charAt(0)))){
+ TrimRead.optimalMode=true;
+ TrimRead.optimalBias=Float.parseFloat(b);
+ assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1);
+ }else{
+ TrimRead.optimalMode=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("maxns")){
+ maxNs=Integer.parseInt(b);
+ }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ in1=arg;
+ if(arg.indexOf('#')>-1 && !new File(arg).exists()){
+ in1=arg.replace("#", "1");
+ in2=arg.replace("#", "2");
+ }
+ }else{
+ //Uncaptured arguments are passed to BBDuk
+ primaryArgList.add(arg);
+ }
+ }
+
+ if(passes_>0){passes=passes_;}
+ else if(!normalize){passes=1;}
+
+ if(hdist2_trim<0){hdist2_trim=hdist_trim;}
+
+ //Set final field 'symbols'
+ symbols=(symbols_==null ? abbreviation() : symbols_);
+
+ //Pass overwrite flag to BBDuk
+ primaryArgList.add("ow="+overwrite);
+
+ if(outDir!=null){
+ outDir=outDir.trim().replace('\\', '/');
+ if(outDir.length()>0 && !outDir.endsWith("/")){outDir=outDir+"/";}
+ }else{outDir="";}
+
+ {//Prepend output directory to output files
+ if(logName!=null){logName=outDir+logName+".tmp";} //Add '.tmp' to log file
+ if(fileListName!=null){fileListName=outDir+fileListName;}
+ }
+
+ {//Create unique output file names for second pass
+ if(rqcStatsName!=null){
+ rqcStatsName_kt=outDir+"ktrim_"+rqcStatsName;
+ rqcStatsName=outDir+rqcStatsName;
+ }
+ if(kmerStatsName!=null){
+ kmerStatsName_kt=outDir+"ktrim_"+kmerStatsName;
+ kmerStatsName=outDir+kmerStatsName;
+ }
+ if(scaffoldStatsName!=null){
+ scaffoldStatsName_kt=outDir+"ktrim_"+scaffoldStatsName;
+ scaffoldStatsName=outDir+scaffoldStatsName;
+ }
+ }
+
+ //Create output filename from input filename if no output filename is specified
+ if(out1==null && in1!=null){
+ File f=new File(in1);
+ String name=f.getName();
+ String raw=ReadWrite.rawName(name);
+ int x=raw.lastIndexOf('.');
+ if(x>-1){
+ out1=raw.substring(0, x)+"."+symbols+raw.substring(x)+(compress ? ".gz" : "");
+ }else{
+ out1=raw+"."+symbols+".fastq"+(compress ? ".gz" : "");
+ }
+ }
+
+ tempSalt=KmerNormalize.getSalt(out1, 0);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Processing Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Primary method to fully execute the program.
+ * Creates the output directory, log, and file list; then runs kmer-trimming followed by
+ * filtering, optional human removal, and optional normalization/error correction,
+ * deleting each phase's temporary files once the next phase has finished with them.
+ * Finally writes the combined stats file and renames the log to drop its ".tmp" suffix.
+ */
+ public void process(){
+
+ //Create output directory
+ if(outDir!=null && outDir.length()>0){
+ File f=new File(outDir);
+ //mkdirs reports failure through its return value; re-check exists() in case it was created concurrently
+ if(!f.exists() && !f.mkdirs() && !f.exists()){
+ System.err.println("Warning: could not create output directory "+outDir);
+ }
+ }
+
+ //Create log file
+ if(logName!=null){
+ boolean b=Tools.canWrite(logName, overwrite);
+ assert(b) : "Can't write to "+logName;
+ log("start", false);
+ }
+
+ //Create file list file
+ if(fileListName!=null){
+ boolean b=Tools.canWrite(fileListName, overwrite);
+ assert(b) : "Can't write to "+fileListName;
+
+ StringBuilder sb=new StringBuilder();
+ if(out1!=null){sb.append("filtered_fastq="+out1).append('\n');}
+ if(qfout1!=null){sb.append("filtered_qual="+qfout1).append('\n');}
+ if(out2!=null){sb.append("filtered_fastq_2="+out2).append('\n');}
+ if(qfout2!=null){sb.append("filtered_qual_2="+qfout2).append('\n');}
+ if(ihistName!=null){sb.append("ihist="+ihistName).append('\n');}
+ if(scaffoldStatsName!=null){sb.append("scafstats="+scaffoldStatsName).append('\n');}
+
+ if(sb.length()>0){
+ ReadWrite.writeString(sb, fileListName, false);
+ }
+ }
+
+ //Prefixes that distinguish each phase's temporary files
+ final String trimPrefix="TEMP_TRIM_"+tempSalt+"_";
+ final String humanPrefix="TEMP_HUMAN_"+tempSalt+"_";
+ final String filterPrefix="TEMP_FILTER_"+tempSalt+"_";
+
+ //Intermediate phases run at zip level 2; the saved level is put back before the final phase
+ int oldZL=ReadWrite.ZIPLEVEL;
+ ReadWrite.ZIPLEVEL=2;
+ ReadWrite.ALLOW_ZIPLEVEL_CHANGE=false;
+
+ //Temporary files are named from the bare output filenames
+ final String out1s=stripDirs(out1), out2s=stripDirs(out2), qfout1s=stripDirs(qfout1), qfout2s=stripDirs(qfout2);
+
+ ktrim(in1, in2, out1s, out2s, qfin1, qfin2, qfout1s, qfout2s, trimPrefix);
+
+ //Paired input written to a single output file is read back as interleaved by later phases
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+
+
+ if(removehuman){
+ filter(out1s, out2s, out1s, out2s, qfout1s, qfout2s, qfout1s, qfout2s, trimPrefix, filterPrefix, true, true, false);
+ delete(trimPrefix, out1s, out2s, qfout1s, qfout2s);
+ if(normalize || ecc){
+ dehumanize(out1s, out2s, out1s, out2s, qfout1s, qfout2s, filterPrefix, humanPrefix, true, true, false);
+ delete(filterPrefix, out1s, out2s, qfout1s, qfout2s);
+ Data.unloadAll();
+ ReadWrite.ZIPLEVEL=oldZL;
+ ReadWrite.ALLOW_ZIPLEVEL_CHANGE=true;
+ normalize(out1s, out2s, out1, out2, qfout1s, qfout2s, qfout1, qfout2, humanPrefix, "", true, true, true);
+ delete(humanPrefix, out1s, out2s, qfout1s, qfout2s);
+ }else{
+ ReadWrite.ZIPLEVEL=oldZL;
+ ReadWrite.ALLOW_ZIPLEVEL_CHANGE=true;
+ dehumanize(out1s, out2s, out1, out2, qfout1s, qfout2s, filterPrefix, "", true, true, true);
+ delete(filterPrefix, out1s, out2s, qfout1s, qfout2s);
+ Data.unloadAll();
+ }
+ }else{
+ if(normalize || ecc){
+ filter(out1s, out2s, out1s, out2s, qfout1s, qfout2s, qfout1s, qfout2s, trimPrefix, filterPrefix, true, true, false);
+ delete(trimPrefix, out1s, out2s, qfout1s, qfout2s);
+ //Restore the zip settings before the final phase, as the human-removal branches do
+ ReadWrite.ZIPLEVEL=oldZL;
+ ReadWrite.ALLOW_ZIPLEVEL_CHANGE=true;
+ normalize(out1s, out2s, out1, out2, qfout1s, qfout2s, qfout1, qfout2, filterPrefix, "", true, true, true);
+ delete(filterPrefix, out1s, out2s, qfout1s, qfout2s);
+ }else{
+ //Restore the zip settings before the final phase, as the human-removal branches do
+ ReadWrite.ZIPLEVEL=oldZL;
+ ReadWrite.ALLOW_ZIPLEVEL_CHANGE=true;
+ filter(out1s, out2s, out1, out2, qfout1s, qfout2s, qfout1, qfout2, trimPrefix, "", true, true, true);
+ delete(trimPrefix, out1s, out2s, qfout1s, qfout2s);
+ }
+ }
+
+ //Write combined stats file (number of reads/bases present/removed in each stage)
+ if(rqcStatsName!=null){
+ final TextStreamWriter tsw=new TextStreamWriter(rqcStatsName, overwrite, false, false);
+ tsw.start();
+ tsw.println(BBDukF.rqcString());
+ tsw.poisonAndWait();
+ }
+
+ //Finish writing log file
+ if(logName!=null){
+ log("complete", true);
+ if(logName.endsWith(".tmp")){ //Remove .tmp extension
+ final String old=logName;
+ final String renamed=old.substring(0, old.length()-4);
+ boolean success=false;
+ try {
+ //renameTo signals most failures through its return value rather than an exception
+ success=new File(old).renameTo(new File(renamed));
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ if(success){
+ //Only point logName at the new file once it actually exists
+ logName=renamed;
+ }else{
+ System.err.println("Warning: could not rename "+old+" to "+renamed);
+ }
+ }
+ }
+
+ }
+
+
+ /**
+ * Runs BBDuk to perform:
+ * Kmer trimming, short read removal.
+ *
+ * @param in1 Primary input reads file (required)
+ * @param in2 Secondary input reads file
+ * @param out1 Primary output reads file (required)
+ * @param out2 Secondary output reads file
+ * @param qfin1 Primary input qual file
+ * @param qfin2 Secondary input qual file
+ * @param qfout1 Primary output qual file
+ * @param qfout2 Secondary output qual file
+ * @param outPrefix Append this prefix to output filenames
+ */
+ private void ktrim(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2, String outPrefix){
+
+ log("ktrim start", true);
+ System.err.println("\nAdapter Trimming Phase Start");
+
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {//Fill list with BBDuk arguments
+ argList.add("ktrim="+(ktrim==null ? "f" : ktrim));
+ if(minLen>0){argList.add("minlen="+minLen);}
+ if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);}
+ argList.add("mink="+mink);
+ if("r".equalsIgnoreCase(ktrim) || "right".equalsIgnoreCase(ktrim)){
+ argList.add("tbo");
+ argList.add("tpe");
+ }
+ argList.add("k="+trim_k);
+ argList.add("hdist="+hdist_trim);
+ if(hdist2_trim>=0){
+ argList.add("hdist2="+hdist2_trim);
+ }
+ if(forceTrimModulo>0){
+ argList.add("ftm="+forceTrimModulo);
+ }
+ if(pigz!=null){argList.add("pigz="+pigz);}
+ if(unpigz!=null){argList.add("unpigz="+unpigz);}
+ if(zl!=null){argList.add("zl="+zl);}
+
+ //Pass along uncaptured arguments
+ for(String s : primaryArgList){argList.add(s);}
+
+ //Set read I/O files
+ if(in1!=null){argList.add("in1="+in1);}
+ if(in2!=null){argList.add("in2="+in2);}
+ if(out1!=null){argList.add("out1="+(tmpDir==null ? outDir : tmpDir)+outPrefix+out1);}
+ if(out2!=null){argList.add("out2="+(tmpDir==null ? outDir : tmpDir)+outPrefix+out2);}
+ if(qfin1!=null){argList.add("qfin1="+qfin1);}
+ if(qfin2!=null){argList.add("qfin2="+qfin2);}
+ if(qfout1!=null){argList.add("qfout1="+(tmpDir==null ? outDir : tmpDir)+outPrefix+qfout1);}
+ if(qfout2!=null){argList.add("qfout2="+(tmpDir==null ? outDir : tmpDir)+outPrefix+qfout2);}
+
+// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName_kt);} //Old style for 2 log files
+ if(rqcStatsName!=null){argList.add("rqc=hashmap");}
+ if(kmerStatsName!=null){argList.add("outduk="+kmerStatsName_kt);}
+ if(scaffoldStatsName!=null){argList.add("stats="+scaffoldStatsName_kt);}
+ }
+
+ {//Add BBDuk references
+ trimrefs.add(fragAdapters);
+
+ StringBuilder refstring=new StringBuilder();
+ for(String ref : trimrefs){
+ if(ref!=null){
+ refstring.append(refstring.length()==0 ? "ref=" : ",");
+ refstring.append(ref);
+ }
+ }
+
+ if(refstring!=null && refstring.length()>0){
+ argList.add(refstring.toString());
+ }
+ }
+
+ String[] dukargs=argList.toArray(new String[0]);
+
+ {//run BBDuk
+ BBDukF duk=new BBDukF(dukargs);
+ try {
+ duk.process();
+ } catch (Exception e) {
+ e.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ //Optionally append files to file list here
+
+ log("ktrim finish", true);
+ }
+
+ /**
+ * Runs BBDuk to perform:
+ * Quality filtering, quality trimming, n removal, short read removal, artifact removal (via kmer filtering), phiX removal.
+ *
+ * @param in1 Primary input reads file (required)
+ * @param in2 Secondary input reads file
+ * @param out1 Primary output reads file (required)
+ * @param out2 Secondary output reads file
+ * @param qfin1 Primary input qual file
+ * @param qfin2 Secondary input qual file
+ * @param qfout1 Primary output qual file
+ * @param qfout2 Secondary output qual file
+ * @param inPrefix Append this prefix to input filenames
+ */
+ private void filter(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2,
+ String inPrefix, String outPrefix, boolean prependIndir, boolean prependOutdir, boolean lastPhase){
+
+ log("filter start", true);
+ System.err.println("\nArtifact Filter/Quality Trim Phase Start");
+
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {//Fill list with BBDuk arguments
+ if(minAvgQuality>-1){argList.add("maq="+minAvgQuality+","+minAvgQualityBases);}
+ if(maxNs>=0){argList.add("maxns="+maxNs);}
+ if(minLen>0){argList.add("minlen="+minLen);}
+ if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);}
+ argList.add("k="+filter_k);
+ argList.add("hdist="+hdist_filter);
+
+ if(qtrim!=null && trimAfterFiltering){
+ argList.add("trimq="+trimq);
+ argList.add("qtrim="+qtrim);
+ }
+ if(pigz!=null){argList.add("pigz="+pigz);}
+ if(unpigz!=null){argList.add("unpigz="+unpigz);}
+ if(zl!=null){argList.add("zl="+zl);}
+
+ //Pass along uncaptured arguments
+ for(String s : primaryArgList){argList.add(s);}
+
+ //Set read I/O files
+ if(in1!=null){argList.add("in1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in1);}
+ if(in2!=null){argList.add("in2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in2);}
+ if(out1!=null){argList.add("out1="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+out1);}
+ if(out2!=null){argList.add("out2="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+out2);}
+ if(qfin1!=null){argList.add("qfin1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin1);}
+ if(qfin2!=null){argList.add("qfin2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin2);}
+ if(qfout1!=null){argList.add("qfout1="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+qfout1);}
+ if(qfout2!=null){argList.add("qfout2="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+qfout2);}
+
+// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName);} //Old style for 2 log files
+ if(rqcStatsName!=null){argList.add("rqc=hashmap");}
+ if(kmerStatsName!=null){argList.add("outduk="+kmerStatsName);}
+ if(scaffoldStatsName!=null){argList.add("stats="+scaffoldStatsName);}
+ }
+
+ {//Add BBDuk references
+ filterrefs.add(mainArtifactFile);
+ filterrefs.add(rnaFlag ? artifactFileRna : artifactFileDna);
+
+ if(phixFlag){filterrefs.add(phixRef);}
+ if(pjetFlag){filterrefs.add(pjetRef);}
+
+
+ StringBuilder refstring=new StringBuilder();
+ for(String ref : filterrefs){
+ if(ref!=null){
+ refstring.append(refstring.length()==0 ? "ref=" : ",");
+ refstring.append(ref);
+ }
+ }
+
+ if(refstring!=null && refstring.length()>0){
+ argList.add(refstring.toString());
+ }
+ }
+
+ String[] dukargs=argList.toArray(new String[0]);
+
+ {//Run BBDuk
+ BBDukF duk=new BBDukF(dukargs);
+ try {
+ duk.process();
+ } catch (Exception e) {
+ e.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ //Optionally append files to file list here
+
+ log("filter finish", true);
+ }
+
+ /**
+ * Runs BBMap to perform:
+ * Removal of reads that map to human with high identity (~88%).
+ *
+ * @param in1 Primary input reads file (required)
+ * @param in2 Secondary input reads file
+ * @param out1 Primary output reads file (required)
+ * @param out2 Secondary output reads file
+ * @param qfin1 Primary input qual file
+ * @param qfin2 Secondary input qual file
+ * @param qfout1 Primary output qual file
+ * @param qfout2 Secondary output qual file
+ * @param inPrefix Append this prefix to input filenames
+ */
+ private void dehumanize(String in1, String in2, String out1, String out2, String qfin1, String qfin2,
+ String inPrefix, String outPrefix, boolean prependIndir, boolean prependOutdir, boolean lastPhase){
+
+ log("dehumanize start", true);
+ System.err.println("\nHuman Removal Phase Start");
+
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {
+
+ if(kfilter>map_k){argList.add("kfilter="+kfilter);}
+ if(local){argList.add("local");}
+ argList.add("minratio="+minratio);
+ argList.add("maxindel="+maxindel);
+ argList.add("fast="+fast);
+ argList.add("minhits="+minhits);
+ argList.add("tipsearch="+Tools.min(4, maxindel));
+ argList.add("bw=18");
+ argList.add("bwr=0.18");
+ argList.add("quickmatch=f");
+ argList.add("k="+map_k);
+// argList.add("cigar=f");
+ argList.add("idtag=t");
+ argList.add("sam=1.4");
+ argList.add("usemodulo");
+ argList.add("printunmappedcount");
+ argList.add("ow="+overwrite);
+
+ if(mapRef==null){
+ argList.add("path="+indexPath);
+ }else{
+ argList.add("ref="+mapRef);
+ argList.add("nodisk");
+ }
+ if(pigz!=null){argList.add("pigz="+pigz);}
+ if(unpigz!=null){argList.add("unpigz="+unpigz);}
+ if(zl!=null){argList.add("zl="+zl);}
+
+ //Pass along uncaptured arguments
+ for(String s : primaryArgList){argList.add(s);}
+
+ //Set read I/O files
+ if(in1!=null){argList.add("in1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in1);}
+ if(in2!=null){argList.add("in2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in2);}
+ if(out1!=null){argList.add("outu1="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+out1);}
+ if(out2!=null){argList.add("outu2="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+out2);}
+ if(qfin1!=null){argList.add("qfin1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin1);}
+ if(qfin2!=null){argList.add("qfin2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin2);}
+
+ }
+
+ String[] args=argList.toArray(new String[0]);
+
+ {//Run BBMap
+ try {
+ BBMap.main(args);
+ } catch (Exception e) {
+ e.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ //Optionally append files to file list here
+
+ log("dehumanize finish", true);
+ }
+
+ /**
+ * Runs BBNorm to perform:
+ * Error correction, error marking, quality trimming, normalization
+ *
+ * @param in1 Primary input reads file (required)
+ * @param in2 Secondary input reads file
+ * @param out1 Primary output reads file (required)
+ * @param out2 Secondary output reads file
+ * @param qfin1 Primary input qual file
+ * @param qfin2 Secondary input qual file
+ * @param qfout1 Primary output qual file
+ * @param qfout2 Secondary output qual file
+ * @param inPrefix Append this prefix to input filenames
+ */
+ private void normalize(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2,
+ String inPrefix, String outPrefix, boolean prependIndir, boolean prependOutdir, boolean lastPhase){
+
+ log("normalization start", true);
+ System.err.println("\nNormalization/Error Correction Phase Start");
+
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {//Fill list with BBDuk arguments
+ if(qtrim!=null && !trimAfterFiltering){
+ argList.add("trimq="+trimq);
+ argList.add("qtrim="+qtrim);
+ }
+ if(minLen>0){argList.add("minlen="+minLen);}
+ //if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);}
+
+ argList.add("ecc="+ecc);
+ if(aecc){argList.add("aec="+aecc);}
+ if(cecc){argList.add("cecc="+cecc);}
+ argList.add("meo="+meo);
+ argList.add("tam="+tam);
+ argList.add("mue="+mue);
+ argList.add("mw1="+mw1);
+ argList.add("prefilter=t");
+ argList.add("prehashes="+prehashes);
+ argList.add("hashes="+hashes);
+ argList.add("bits="+bits);
+ argList.add("k="+normalize_k);
+ argList.add("passes="+passes);
+ if(normalize){
+ if(target>0){
+ argList.add("target="+target);
+ if(mindepth<0){mindepth=Tools.min(10, target/8);}
+ if(maxdepth<0){maxdepth=Tools.max(target, (int)((target*17L)/16L));}
+ }
+ if(mindepth>=0){argList.add("min="+mindepth);}
+ if(maxdepth>0){argList.add("max="+maxdepth);}
+ }else{
+ argList.add("keepall");
+ }
+ if(pigz!=null){argList.add("pigz="+pigz);}
+ if(unpigz!=null){argList.add("unpigz="+unpigz);}
+ if(zl!=null){argList.add("zl="+zl);}
+
+ //Set read I/O files
+ if(in1!=null){argList.add("in1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in1);}
+ if(in2!=null){argList.add("in2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in2);}
+// if(out1!=null){argList.add("out="+outDir+out1);}
+ if(out1!=null){argList.add("out1="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+out1);}
+ if(out2!=null){argList.add("out2="+(prependOutdir ? (tmpDir==null || lastPhase ? outDir : tmpDir) : "")+outPrefix+out2);}
+// if(out2!=null){argList.add("out2="+outDir+out2);}
+ if(qfin1!=null){argList.add("qfin1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin1);}
+ if(qfin2!=null){argList.add("qfin2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin2);}
+// if(qfout1!=null){argList.add("qfout1="+outDir+qfout1);}
+// if(qfout2!=null){argList.add("qfout2="+outDir+qfout2);}\
+
+
+ if(kmerHistName!=null){argList.add("hist="+kmerHistName);}
+ }
+
+ String[] normargs=argList.toArray(new String[0]);
+
+ {//Run BBNorm
+ try {
+ KmerNormalize.main(normargs);
+ } catch (Exception e) {
+ e.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ //Optionally append files to file list here
+
+ log("normalization finish", true);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Log a message in the log file
+ * @param message Message to log
+ * @param append True to append, false to overwrite
+ */
+ private void log(String message, boolean append){
+ if(logName!=null){
+ ReadWrite.writeString(message+", "+timeString()+"\n", logName, append);
+ }
+ }
+
+
+ /**
+ * Delete all non-null filenames.
+ * @param prefix Append this prefix to filenames before attempting to delete them
+ * @param names Filenames to delete
+ */
+ private void delete(String prefix, String...names){
+ log("delete temp files start", true);
+ if(names!=null){
+ for(String s : names){
+ if(s!=null){
+ s=(tmpDir==null ? outDir : tmpDir)+prefix+s;
+ if(verbose){System.err.println("Trying to delete "+s);}
+ File f=new File(s);
+ if(f.exists()){
+ f.delete();
+ }
+ }
+ }
+ }
+ log("delete temp files finish", true);
+ }
+
+
+ /**
+ * Despite its name, this method does not move anything: it deletes the named temporary
+ * files exactly as {@link #delete(String, String...)} does, including the same log messages.
+ * It now delegates to delete() instead of carrying a duplicate copy of that code.
+ * NOTE(review): no caller is visible in this part of the file; confirm whether a real
+ * move was intended before relying on the name.
+ * @param prefix Prefix placed in front of each filename before attempting to delete it
+ * @param names Filenames to delete
+ */
+ private void move(String prefix, String...names){
+ delete(prefix, names);
+ }
+
+ /**
+ * @return String of symbols indicating which processes were applied to the input reads
+ */
+ private String abbreviation(){
+ StringBuilder sb=new StringBuilder();
+
+ if(mainArtifactFile!=null || (rnaFlag ? artifactFileRna!=null : artifactFileDna!=null)){sb.append("a");}
+
+ if(maxNs>=0){sb.append("n");}
+// if(qtrim!=null && !qtrim.equalsIgnoreCase("f") && !qtrim.equalsIgnoreCase("false")){sb.append("q");}
+ if(minAvgQuality>0){sb.append("q");}
+
+ if(rnaFlag){sb.append("r");}
+ else{sb.append("d");}
+
+ if(phixFlag){sb.append("p");}
+
+ return sb.toString();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Formats the current time as "yyyy-MM-dd HH:mm:ss" in the JVM's default time zone.
+ * TODO: Some machines are set to UTC rather than PST
+ * @return Timestamp in RQC's format
+ */
+ public static String timeString(){
+ final SimpleDateFormat formatter=new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ final TimeZone zone=TimeZone.getDefault();
+ formatter.setTimeZone(zone);
+ final Date now=new Date();
+ return formatter.format(now);
+ }
+
+ /**
+ * Strips the directories, leaving only a filename.
+ * Backslashes are treated as path separators as well as forward slashes.
+ * @param fname Path to strip; may be null
+ * @return Everything after the last separator, the whole input if it has none, or null for null input
+ */
+ public static String stripDirs(String fname){
+ if(fname==null){return null;}
+ final String unified=fname.replace('\\', '/');
+ final int lastSlash=unified.lastIndexOf('/');
+ return (lastSlash<0 ? unified : unified.substring(lastSlash+1));
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- BBNorm Parameters ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** True to run the human-removal (dehumanize) phase. Set by "human" or "removehuman". */
+ private boolean removehuman=true;
+ /** True to normalize depth; when false, the normalize phase passes "keepall" and, unless passes is given explicitly, uses a single pass. */
+ private boolean normalize=false;
+ /** Passed to the normalize phase as "ecc="; the normalize phase runs if this or normalize is true. */
+ private boolean ecc=false;
+ /** Passed to the normalize phase as "aec=" when true. Setting it true on the command line also sets ecc. */
+ private boolean aecc=false;
+ /** Passed to the normalize phase as "cecc=" when true. Setting it true on the command line also sets ecc. */
+ private boolean cecc=false;
+ /** Set by "markerrorsonly" or "meo"; passed to the normalize phase as "meo=". */
+ private boolean meo=false;
+ /** Passed to the normalize phase as "tam=". */
+ private boolean tam=false;
+ /** Set by "taf". True: quality-trim arguments go to the filter phase; false: they go to the normalize phase. */
+ private boolean trimAfterFiltering=true;
+ /** Passed to the normalize phase as "mue=". */
+ private boolean mue=false;
+ /** Passed to the normalize phase as "mw1=". */
+ private boolean mw1=false;
+ /** Passed as "max=" when positive; if negative and target is positive, it is derived from target. */
+ private int maxdepth=-1;
+ /** Passed as "min=" when non-negative; if negative and target is positive, it is derived from target. */
+ private int mindepth=6;
+ /** Passed as "target=" when positive (only when normalizing). */
+ private int target=50;
+ /** Passed to the normalize phase as "prehashes=". */
+ private int prehashes=3;
+ /** Passed to the normalize phase as "passes="; the constructor may override this default. */
+ private int passes=2;
+ /** Passed to the normalize phase as "hashes=". */
+ private int hashes=4;
+ /** Passed to the normalize phase as "bits=". */
+ private int bits=16;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Symbols to insert in output filename to denote operations performed */
+ private final String symbols;
+
+ /** True for rna artifacts, false for dna artifacts */
+ private boolean rnaFlag=false;
+ /** True if phix should be filtered out */
+ private boolean phixFlag=true;
+ /** True if pjet should be filtered out */
+ private boolean pjetFlag=true;
+
+ /** Unused */
+ private boolean tboFlag=false;
+ /** Unused */
+ private boolean tpeFlag=false;
+
+ /** Toss reads shorter than this */
+ private int minLen=40;
+ /** Toss reads shorter than this fraction of initial length, after trimming */
+ private float minLenFraction=0.6f;
+ /** Trim bases at this quality or below */
+ private byte trimq=12;
+ /** Throw away reads below this average quality before trimming. Default: 8 */
+ private byte minAvgQuality=8;
+ /** If positive, calculate the average quality from the first X bases. */
+ private int minAvgQualityBases=0;
+
+ /** Trim reads to be equal to 0 modulo this value. Mainly for 151, 251, and 301bp runs. */
+ private int forceTrimModulo=5;
+ /** Quality-trimming mode */
+ private String qtrim="rl";
+ /** Kmer-trimming mode */
+ private String ktrim="r";
+ /** Kmer length to use for filtering */
+ private int filter_k=27;
+ /** Kmer length to use for trimming */
+ private int trim_k=23;
+ /** Kmer length to use for normalization and error-correction */
+ private int normalize_k=31;
+ /** Kmer length to use for mapping */
+ private int map_k=13;
+ /** Shortest kmer to use for trimming */
+ private int mink=11;
+ /** Throw away reads containing more than this many Ns. Default: 1 */
+ private int maxNs=1;
+ /** Use this Hamming distance when kmer filtering */
+ private int hdist_filter=1;
+ /** Use this Hamming distance when kmer trimming */
+ private int hdist_trim=1;
+ /** Use this Hamming distance when kmer trimming with short kmers */
+ private int hdist2_trim=-1;
+
+ /** Captures the command line "pigz" flag */
+ private String pigz;
+ /** Captures the command line "unpigz" flag */
+ private String unpigz;
+ /** Captures the command line "zl" flag */
+ private String zl;
+
+ private float minratio=0.84f;
+ private int maxindel=6;
+ private int kfilter=0;
+ private int minhits=1;
+ private boolean fast=true;
+ private boolean local=true;
+
+ private boolean verbose=false;
+ private boolean overwrite=true;
+ private boolean append=false;
+ private boolean compress=true;
+
+ /** Arguments to pass to BBDuk */
+ private ArrayList<String> primaryArgList=new ArrayList<String>();
+ /** References to pass to BBDuk for kmer-trimming (presumably adapter sequences; confirm against usage) */
+ private ArrayList<String> trimrefs=new ArrayList<String>();
+ /** References to pass to BBDuk for artifact removal */
+ private ArrayList<String> filterrefs=new ArrayList<String>();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Read Data Files ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Directory in which to write all files */
+ private String outDir="";
+
+ /** Directory in which to write all temp files */
+ private String tmpDir=Shared.TMPDIR;
+
+ private final String tempSalt;
+
+ /** Primary input reads file (required) */
+ private String in1=null;
+ /** Secondary input reads file */
+ private String in2=null;
+ /** Primary output reads file (required) */
+ private String out1=null;
+ /** Secondary output reads file */
+ private String out2=null;
+ /** Primary input qual file */
+ private String qfin1=null;
+ /** Secondary input qual file */
+ private String qfin2=null;
+ /** Primary output qual file */
+ private String qfout1=null;
+ /** Secondary output qual file */
+ private String qfout2=null;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Log Files ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String logName="status.log";
+ private String fileListName="file-list.txt";
+
+ private String rqcStatsName="filterStats.txt";
+ private String kmerStatsName="kmerStats.txt";
+ private String scaffoldStatsName="scaffoldStats.txt";
+ private String kmerHistName="khist.txt";
+
+ private String ihistName=null;
+
+ /** ktrim phase rqc stats file */
+ private String rqcStatsName_kt;
+ /** ktrim phase stats file */
+ private String kmerStatsName_kt;
+ /** ktrim phase scaffold stats file */
+ private String scaffoldStatsName_kt;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Reference Files ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String mainArtifactFile = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/Illumina.artifacts.2013.12.no_DNA_RNA_spikeins.fa";
+ private String artifactFileRna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/RNA_spikeins.artifacts.2012.10.NoPolyA.fa";
+ private String artifactFileDna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/DNA_spikeins.artifacts.2012.10.fa";
+ private String phixRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/phix174_ill.ref.fa";
+ private String pjetRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/pJET1.2.fasta";
+
+ private String allArtifactsLatest = "/global/projectb/sandbox/rqc/qcdb/illumina.artifacts/Illumina.artifacts.fa";
+ private String fragAdapters = "/global/projectb/sandbox/gaag/bbtools/data/adapters.fa";
+ private String rnaAdapter = "/global/projectb/sandbox/gaag/bbtools/data/truseq_rna.fa.gz";
+ private String indexPath = "/global/projectb/sandbox/gaag/bbtools/hg19/";
+ private String mapRef = null;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+}
diff --git a/current/jgi/BBTool_ST.java b/current/jgi/BBTool_ST.java
new file mode 100755
index 0000000..7e4c378
--- /dev/null
+++ b/current/jgi/BBTool_ST.java
@@ -0,0 +1,486 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 12, 2015
+ *
+ */
+public abstract class BBTool_ST {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Placeholder command-line entry point; it always throws.
+  * Static methods cannot be overridden, so each concrete tool declares its own main().
+  * @param args Command line arguments (unused here)
+  */
+ public static void main(String[] args){
+     //Template for a concrete tool's main(), shown as a comment only:
+     //    Timer t=new Timer();
+     //    MyTool tool=new MyTool(args); //MyTool stands for any concrete subclass
+     //    tool.process(t);
+
+     final String message="This method must be overridden.";
+     throw new RuntimeException(message);
+ }
+
+ /**
+ * Constructor. Parses the argument list, copies the shared Parser's results into fields, and builds the FileFormat objects.
+ * Must be called by the subclass constructor. Side effect: overwrites static settings in Shared, ReadWrite, ReadStats, FASTQ and ByteFile.
+ * @param args Command line arguments, normally in key=value form
+ */
+ public BBTool_ST(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){ //When parseHelp() returns true, printOptions() is called and the JVM exits with status 0
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+ //Global I/O settings: the read buffer length becomes Tools.min(200, current) and the pigz/unpigz flags are set.
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ //Subclass hook: defaults are assigned here, before any argument is parsed.
+ setDefaults();
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Only split[0] and split[1] are used; text after a second '=' is dropped
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+ //The subclass gets the first chance at each argument, then the shared Parser.
+ if(parseArgument(arg, a, b)){
+ // do nothing
+ }else if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+ //A '#' in in1 (if no file of that name exists) or in out1 expands to two names, with '#' replaced by 1 and 2.
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+ //Unless interleaving was set explicitly, the FASTQ flags are derived from the number of input and output files.
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} //The literal word "null" is treated as no output file
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /** Subclass hook called by the constructor before any argument is parsed. Subclass field initializers run only after this constructor returns, so defaults that parseArgument() may change must be assigned here rather than in initializers. */
+ abstract void setDefaults();
+
+ protected void reparse(String[] args){
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parseArgument(arg, a, b)){
+ // do nothing
+ }
+ }
+ }
+
+ /**
+ * Must be implemented by the subclass; see parseArgument_EXAMPLE() for a sample body.
+ * Parses an argument from the command line.
+ * Assumed to be in "key=value" form, but this is not required.
+ * @param arg The full original argument.
+ * @param a Left hand side, to lower case, with leading hyphens removed.
+ * @param b Right hand side, unaltered; null if absent or if it was the word "null".
+ * @return true if a matched some keyword.
+ */
+ public abstract boolean parseArgument(String arg, String a, String b);
+
+// //Example
+// @Override
+// public boolean parseArgument(String arg, String a, String b){
+// if(a.equals("keepunmapped") | a.equals("ku")){
+// keepUnmapped=Tools.parseBoolean(b);
+// return true;
+// }else if(a.equals("ignorepairorder") | a.equals("ipo")){
+// usePairOrder=!Tools.parseBoolean(b);
+// return true;
+// }else if(a.equals("sorted")){
+// sorted=Tools.parseBoolean(b);
+// return true;
+// }
+// return false;
+// }
+
+ /**
+ * An example of how to implement parseArgument(); it is not called anywhere in this class.
+ */
+ private boolean parseArgument_EXAMPLE(String arg, String a, String b){
+ if(true){throw new RuntimeException("parseArgument() must be overridden.");} //if(true) keeps the code below reachable for the compiler
+
+ //These are dummy values for demonstration purposes.
+ //In real code they should be class fields.
+ int value1;
+ boolean value2;
+ String value3;
+
+ if(a.equals("key1")){
+ value1=(int)Tools.parseKMG(b);
+ //do anything else necessary here
+ return true;
+ }else if(a.equals("key2")){
+ value2=Tools.parseBoolean(b);
+ //do anything else necessary here
+ return true;
+ }else if(a.equals("key3")){
+ value3=b;
+ //do anything else necessary here
+ return true;
+ }
+
+ //There was no match to the argument
+ return false;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ void process(){process(new Timer());}
+
+ /** Create read streams and process all data */
+ void process(Timer t){
+
+ //Start the read streams
+ startup();
+
+ readsProcessed=0;
+ basesProcessed=0;
+
+ //Process the read streams
+ processInner(cris_primary, ros_primary);
+
+ //Close the read streams
+ shutdown(cris_primary, ros_primary);
+
+ showStats(t);
+ }
+
+ /**
+  * Create read streams: runs startupSubclass(), validates the file names, then opens the input
+  * stream and, when out1 is set, the output stream.
+  * @throws RuntimeException if Tools.testForDuplicateFiles() rejects the file names
+  */
+ void startup(){
+     startupSubclass();
+
+     //Checked with an exception rather than assert(false), so it also applies when assertions are off.
+     if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, out1, out2, qfout1, qfout2)){
+         throw new RuntimeException("Duplicate files.");
+     }
+
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+     cris.start();
+     if(verbose){outstream.println("Started cris");}
+
+     final boolean paired=cris.paired();
+     if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+     final ConcurrentReadOutputStream ros;
+     if(out1!=null){
+         final int buff=4;
+         if(paired && out2==null && (in1==null || !in1.contains(".sam"))){
+             outstream.println("Writing interleaved.");
+         }
+         ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+         ros.start();
+     }else{ros=null;}
+
+     readsProcessed=0;
+     basesProcessed=0;
+     cris_primary=cris;
+     ros_primary=ros;
+ }
+
+ /** Subclass hook invoked as the first statement of startup(), before any stream is opened.
+ * Must be implemented; the body may be empty. */
+ abstract void startupSubclass();
+
+ /** Runs shutdownSubclass(), then calls ReadStats.writeAll() and closes both streams.
+  * A true result from either call is OR-ed into errorState. */
+ final void shutdown(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+     shutdownSubclass();
+     if(verbose){outstream.println("Finished.");}
+
+     errorState|=ReadStats.writeAll();
+     errorState|=ReadWrite.closeStreams(cris, ros);
+ }
+
+ /** Subclass hook invoked as the first statement of shutdown(), before the streams are closed.
+ * Must be implemented; the body may be empty. */
+ abstract void shutdownSubclass();
+
+ void showStats(final Timer t){
+ t.stop();
+
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ showStatsSubclass(t, readsProcessed, basesProcessed);
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /** Called from showStats() after the standard lines are printed and before the errorState check.
+ * Must be implemented; receives the timer plus the totals of reads and bases processed. */
+ abstract void showStatsSubclass(final Timer t, long readsIn, long basesIn);
+
+ /**
+  * Main loop. Fetches read lists from cris until a null or empty list arrives, counts every
+  * read and base, lets processReadPair() decide which pairs survive, and passes each list to ros.
+  * This may optionally be overridden.
+  * @param cris Input stream to drain
+  * @param ros Output stream; when null, nothing is written
+  */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+     readsProcessed=0;
+     basesProcessed=0;
+
+     ListNum<Read> batch=cris.nextList();
+     ArrayList<Read> list=(batch==null ? null : batch.list);
+
+     //Sanity check on the first read: a mate is expected exactly when cris reports paired input
+     if(list!=null && !list.isEmpty()){
+         final Read first=list.get(0);
+         assert((ffin1==null || ffin1.samOrBam()) || (first.mate!=null)==cris.paired());
+     }
+
+     while(list!=null && list.size()>0){
+         if(verbose){outstream.println("Fetched "+list.size()+" reads.");}
+
+         for(int i=0; i<list.size(); i++){
+             final Read r1=list.get(i);
+             final Read r2=r1.mate;
+             final int len1=r1.length();
+             final int len2=r1.mateLength();
+             //Totals are updated before filtering, so they describe the input
+             if(r2==null){
+                 readsProcessed++;
+                 basesProcessed+=len1;
+             }else{
+                 readsProcessed+=2;
+                 basesProcessed+=len1;
+                 basesProcessed+=len2;
+             }
+             //A rejected pair is replaced by null in the list
+             if(!processReadPair(r1, r2)){list.set(i, null);}
+         }
+
+         if(ros!=null){ros.add(list, batch.id);}
+         cris.returnList(batch.id, batch.list.isEmpty());
+         if(verbose){outstream.println("Returned a list.");}
+
+         batch=cris.nextList();
+         list=(batch==null ? null : batch.list);
+     }
+     //Return the final (null-list or empty) batch, if one was received
+     if(batch!=null){
+         cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Process a single read pair. Must be overridden.
+ * @param r1 Read 1
+ * @param r2 Read 2 (may be null)
+ * @return True if the reads should be kept; false makes processInner() replace the pair with null in its list.
+ */
+ abstract boolean processReadPair(final Read r1, final Read r2);
+
+ /** Called by the constructor when Parser.parseHelp() returns true, when in1 is missing, or when out2 is given without out1.
+ * Should be overridden to print usage information; this default only throws. */
+ void printOptions(){
+ throw new RuntimeException("printOptions: TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected String in1=null; //Primary input file, copied from parser.in1; the constructor throws if it is null
+ protected String in2=null; //Secondary input file, or null
+
+ protected String qfin1=null; //Copied from parser.qfin1 and passed to the input stream factory
+ protected String qfin2=null; //Copied from parser.qfin2 and passed to the input stream factory
+
+ protected String out1=null; //Primary output file; when null, startup() creates no output stream
+ protected String out2=null; //Secondary output file, or null
+
+ protected String qfout1=null; //Copied from parser.qfout1 and passed to the output stream factory
+ protected String qfout2=null; //Copied from parser.qfout2 and passed to the output stream factory
+
+ protected String extin=null; //Passed to FileFormat.testInput() in the constructor
+ protected String extout=null; //Passed to FileFormat.testOutput() in the constructor
+
+ /*--------------------------------------------------------------*/
+
+ private ConcurrentReadInputStream cris_primary; //Assigned by startup()
+ private ConcurrentReadOutputStream ros_primary; //Assigned by startup(); null when out1 is null
+
+ protected long readsProcessed=0; //Incremented in processInner(), once per read (mates counted separately)
+ protected long basesProcessed=0; //Sum of read lengths seen in processInner()
+
+ private long maxReads=-1; //Copied from parser.maxReads and passed to the input stream factory
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected final FileFormat ffin1; //Built in the constructor from in1
+ protected final FileFormat ffin2; //Built in the constructor from in2
+
+ protected FileFormat ffout1; //Built in the constructor from out1; not final, despite the section heading
+ protected FileFormat ffout2; //Built in the constructor from out2; not final, despite the section heading
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected PrintStream outstream=System.err; //Destination for status messages
+ public static boolean verbose=false; //Static, so one value is shared by every instance and subclass
+ public boolean errorState=false; //OR-ed with the results of ReadStats.writeAll() and ReadWrite.closeStreams() in shutdown()
+ protected boolean overwrite=false; //Copied from parser.overwrite and mirrored into ReadStats.overwrite
+ protected boolean append=false; //Copied from parser.append and mirrored into ReadStats.append
+
+}
diff --git a/current/jgi/CalcTrueQuality.java b/current/jgi/CalcTrueQuality.java
new file mode 100755
index 0000000..92d8c96
--- /dev/null
+++ b/current/jgi/CalcTrueQuality.java
@@ -0,0 +1,1699 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import stream.SamLine;
+import align2.ListNum;
+import align2.QualityTools;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 13, 2014
+ *
+ */
+public class CalcTrueQuality {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static void main(String[] args){ //Command-line entry point
+     ReadStats.COLLECT_QUALITY_STATS=true; //Assigned before the instance is constructed
+     final CalcTrueQuality tool=new CalcTrueQuality(args); //The constructor parses the arguments
+     ReadStats.overwrite=overwrite; //Copies the flag as left by the constructor
+     tool.process();
+ }
+
+ /** Calls main() but restores original static variable values, even when main() throws. */
+ public static void main2(String[] args){
+     final boolean oldCOLLECT_QUALITY_STATS=ReadStats.COLLECT_QUALITY_STATS;
+     final boolean oldoverwrite=ReadStats.overwrite;
+     final int oldREAD_BUFFER_LENGTH=Shared.READ_BUFFER_LENGTH;
+     final boolean oldPIGZ=ReadWrite.USE_PIGZ;
+     final boolean oldUnPIGZ=ReadWrite.USE_UNPIGZ;
+     final int oldZL=ReadWrite.ZIPLEVEL;
+     final boolean oldBF1=ByteFile.FORCE_MODE_BF1;
+     final boolean oldBF2=ByteFile.FORCE_MODE_BF2;
+
+     try{main(args);}finally{ //The finally block runs on normal return and on exceptions alike
+         ReadStats.COLLECT_QUALITY_STATS=oldCOLLECT_QUALITY_STATS;
+         ReadStats.overwrite=oldoverwrite;
+         Shared.READ_BUFFER_LENGTH=oldREAD_BUFFER_LENGTH;
+         ReadWrite.USE_PIGZ=oldPIGZ;
+         ReadWrite.USE_UNPIGZ=oldUnPIGZ;
+         ReadWrite.ZIPLEVEL=oldZL;
+         ByteFile.FORCE_MODE_BF1=oldBF1;
+         ByteFile.FORCE_MODE_BF2=oldBF2;
+     }
+ }
+
+ public static void printOptions(){ //Placeholder: no usage text is printed
+ assert(false) : "No help available."; //Throws AssertionError only when assertions are enabled; otherwise does nothing
+ }
+
+ public CalcTrueQuality(String[] args){ //Parses the command line and applies static I/O settings
+ if(args==null || args.length==0){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ //Static I/O settings applied before parsing: pigz flag cleared, unpigz flag set, zip level 8.
+
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+// Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=false;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.ZIPLEVEL=8;
+// SamLine.CONVERT_CIGAR_TO_MATCH=true;
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+ //The shared static parsers are tried first, then the flags specific to this tool.
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQualityAdjust(arg, a, b)){
+ //do nothing
+ }else if(a.equals("showstats")){
+ showStats=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("reads") || a.equals("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("t") || a.equals("threads")){
+ Shared.setThreads(b);
+ }else if(a.equals("build") || a.equals("genome")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1") || a.equals("sam")){
+ in=b.split(","); //NOTE(review): b is null for a bare "in" flag, which would throw a NullPointerException here
+ }else if(a.equals("hist") || a.equals("qhist")){
+ qhist=b;
+ }else if(a.equals("path")){
+ Data.setPath(b);
+ }else if(a.equals("append") || a.equals("app")){
+// append=ReadStats.append=Tools.parseBoolean(b);
+ assert(false) : "This does not work in append mode.";
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("countindels") || a.equals("indels")){
+ COUNT_INDELS=Tools.parseBoolean(b);
+ }else if(a.equals("writematrices") || a.equals("write") || a.equals("wm")){
+ writeMatrices=Tools.parseBoolean(b);
+ }else if(a.equals("passes") || a.equals("recalpasses")){
+ passes=Integer.parseInt(b);
+ }else if(in==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ //A bare first argument naming stdin or an existing file is taken as the input
+ in=arg.split(",");
+ }else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //NOTE(review): with assertions disabled an unknown parameter is only reported on stderr, not rejected
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+// if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;}
+
+ if(in==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;}
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+// if(!Tools.testOutputFiles(overwrite, append, false, q102out, qbpout, q10out, q12out, qb012out, qb123out, qb234out, qpout, qout, pout)){
+// throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+q102out+"\n");
+// }
+ threads=Shared.threads();
+ if(qhist!=null){readstats=new ReadStats();}
+
+ assert(passes==1 || passes==2); //Only one or two passes are accepted (checked only when assertions are enabled)
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Runs all passes, optionally prints timing and throughput statistics, and throws a
+  * RuntimeException at the end if errorState is set. */
+ public void process(){
+     final Timer timer=new Timer();
+     int pass=0;
+     while(pass<passes){process(pass++);}
+     timer.stop();
+
+     if(showStats){
+         //The counters were summed over all passes; divide so the report shows per-pass values
+         readsProcessed/=passes;
+         basesProcessed/=passes;
+         readsUsed/=passes;
+         basesUsed/=passes;
+         final double nanos=(double)(timer.elapsed);
+         final double readsPerNano=readsProcessed/nanos;
+         final double basesPerNano=basesProcessed/nanos;
+         String readText, baseText;
+         if(readsProcessed<100000){readText=""+readsProcessed;}
+         else if(readsProcessed<100000000){readText=(readsProcessed/1000)+"k";}
+         else{readText=(readsProcessed/1000000)+"m";}
+         if(basesProcessed<100000){baseText=""+basesProcessed;}
+         else if(basesProcessed<100000000){baseText=(basesProcessed/1000)+"k";}
+         else{baseText=(basesProcessed/1000000)+"m";}
+         for(int pad=8-readText.length(); pad>0; pad--){readText=" "+readText;}
+         for(int pad=8-baseText.length(); pad>0; pad--){baseText=" "+baseText;}
+         outstream.println("Time: \t"+timer);
+         outstream.println("Reads Processed: "+readText+" \t"+String.format("%.2fk reads/sec", readsPerNano*1000000));
+         outstream.println("Bases Processed: "+baseText+" \t"+String.format("%.2fm bases/sec", basesPerNano*1000));
+         readText=(readsUsed<100000 ? ""+readsUsed : readsUsed<100000000 ? (readsUsed/1000)+"k" : (readsUsed/1000000)+"m");
+         baseText=(basesUsed<100000 ? ""+basesUsed : basesUsed<100000000 ? (basesUsed/1000)+"k" : (basesUsed/1000000)+"m");
+         for(int pad=8-readText.length(); pad>0; pad--){readText=" "+readText;}
+         for(int pad=8-baseText.length(); pad>0; pad--){baseText=" "+baseText;}
+         outstream.println("Reads Used: "+readText);
+         outstream.println("Bases Used: "+baseText);
+     }
+
+     if(errorState){
+         throw new RuntimeException(this.getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+  * Executes one pass over all input files and, if enabled, writes that pass's matrices.
+  * @param pass Zero-based pass number; for pass>0, initializeMatrices(pass-1) is called first
+  */
+ public void process(final int pass){
+     if(pass>0){initializeMatrices(pass-1);}
+
+     for(int i=0; i<in.length; i++){
+         process_MT(in[i], pass);
+     }
+
+     if(writeMatrices){
+         writeMatrices(pass);
+         gbmatrices.set(pass, null); //Drop the reference after write() has been called
+     }
+
+     System.err.println("Finished pass "+(pass+1)+"\n");
+
+     if(!errorState){return;}
+     throw new RuntimeException(this.getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+
+
+ /** Processes one input file for one pass with Tools.mid(1, threads, 20) worker threads, then
+  * merges every worker's matrix and counters into the totals.
+  * @param fname Input file; its format is tested with SAM as the default
+  * @param pass Zero-based pass number */
+ public void process_MT(String fname, int pass){
+     //gbmatrices holds one GBMatrixSet per pass: the first file of a pass creates it and later files
+     //of that pass (in=a,b) reuse it. The former size()==pass check failed on the second file.
+     assert(gbmatrices.size()==pass || gbmatrices.size()==pass+1);
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff=FileFormat.testInput(fname, FileFormat.SAM, null, true, false);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff, null);
+         if(verbose){System.err.println("Starting cris");}
+         cris.start(); //4567
+     }
+
+     /* Create Workers */
+     final int wthreads=Tools.mid(1, threads, 20);
+     ArrayList<Worker> alpt=new ArrayList<Worker>(wthreads);
+     for(int i=0; i<wthreads; i++){alpt.add(new Worker(cris, pass));}
+     for(Worker pt : alpt){pt.start();}
+     final GBMatrixSet gbmatrix;
+     if(gbmatrices.size()>pass){gbmatrix=gbmatrices.get(pass);}
+     else{gbmatrix=new GBMatrixSet(pass); gbmatrices.add(gbmatrix);}
+
+     /* Wait for threads to die, and gather statistics */
+     for(int i=0; i<alpt.size(); i++){
+         Worker pt=alpt.get(i);
+         while(pt.getState()!=Thread.State.TERMINATED){
+             try {
+                 pt.join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace(); //join() is retried by the enclosing loop
+             }
+         }
+         alpt.set(i, null);
+         gbmatrix.add(pt.matrixT);
+         readsProcessed+=pt.readsProcessedT;
+         basesProcessed+=pt.basesProcessedT;
+         readsUsed+=pt.readsUsedT;
+         basesUsed+=pt.basesUsedT;
+     }
+
+     /* Shut down I/O streams; capture error status */
+     errorState|=ReadWrite.closeStreams(cris);
+ }
+
+ static void add(long[] dest, long[] source){ //Element-wise dest[i]+=source[i]
+     assert(dest.length==source.length);
+     for(int idx=0, len=dest.length; idx<len; idx++){dest[idx]+=source[idx];}
+ }
+ //Each overload below hands its sub-arrays to the overload with one dimension fewer.
+ static void add(long[][] dest, long[][] source){
+     assert(dest.length==source.length);
+     for(int idx=0, len=dest.length; idx<len; idx++){add(dest[idx], source[idx]);}
+ }
+
+ static void add(long[][][] dest, long[][][] source){
+     assert(dest.length==source.length);
+     for(int idx=0, len=dest.length; idx<len; idx++){add(dest[idx], source[idx]);}
+ }
+
+ static void add(long[][][][] dest, long[][][][] source){
+     assert(dest.length==source.length);
+     for(int idx=0, len=dest.length; idx<len; idx++){add(dest[idx], source[idx]);}
+ }
+
+ static void add(long[][][][][] dest, long[][][][][] source){
+     assert(dest.length==source.length);
+     for(int idx=0, len=dest.length; idx<len; idx++){add(dest[idx], source[idx]);}
+ }
+
+ /** Writes the matrix set of the given pass, plus the quality histogram when qhist is set, at zip
+  * level 8; the previous ReadWrite.ZIPLEVEL is restored afterwards even if writing throws. */
+ public void writeMatrices(int pass){
+     final int oldZL=ReadWrite.ZIPLEVEL;
+     ReadWrite.ZIPLEVEL=8;
+     try{
+         gbmatrices.get(pass).write();
+         if(qhist!=null){readstats=ReadStats.mergeAll(); readstats.writeQualityToFile(qhist, false);}
+     }finally{ReadWrite.ZIPLEVEL=oldZL;}
+ }
+
+ /**
+  * Writes the nonzero cells of a 5-dimensional count matrix as tab-delimited text, one cell per
+  * line: the five indices, then good+bad, then bad.
+  * A leading "?" in fname is replaced by Data.ROOT_QUALITY and "_p#" by "_p" plus the pass number.
+  * @param goodMatrix Counts added to badMatrix to form the total column
+  * @param badMatrix Counts written as the last column; indexed exactly like goodMatrix
+  * @param pass Number substituted for '#' in "_p#"
+  */
+ public static void writeMatrix(String fname, long[][][][][] goodMatrix, long[][][][][] badMatrix, boolean overwrite, boolean append, int pass){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){fname=fname.replaceFirst("\\?", Data.ROOT_QUALITY);}
+     fname=fname.replace("_p#", "_p"+pass);
+
+     final FileFormat format=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     final TextStreamWriter writer=new TextStreamWriter(format);
+     writer.start();
+     final StringBuilder buffer=new StringBuilder();
+
+     //Dimension sizes are read from the first element at each level
+     final int d0=goodMatrix.length;
+     final int d1=goodMatrix[0].length;
+     final int d2=goodMatrix[0][0].length;
+     final int d3=goodMatrix[0][0][0].length;
+     final int d4=goodMatrix[0][0][0][0].length;
+
+     for(int i0=0; i0<d0; i0++){
+         for(int i1=0; i1<d1; i1++){
+             for(int i2=0; i2<d2; i2++){
+                 for(int i3=0; i3<d3; i3++){
+                     for(int i4=0; i4<d4; i4++){
+                         final long good=goodMatrix[i0][i1][i2][i3][i4];
+                         final long bad=badMatrix[i0][i1][i2][i3][i4];
+                         final long sum=good+bad;
+                         if(sum<=0){continue;} //Cells without counts are not written
+                         buffer.append(i0).append('\t').append(i1).append('\t').append(i2).append('\t');
+                         buffer.append(i3).append('\t').append(i4).append('\t');
+                         buffer.append(sum).append('\t').append(bad).append('\n');
+                     }
+                     //Hand the finished innermost row to the writer and reuse the buffer
+                     if(buffer.length()>0){
+                         writer.print(buffer.toString());
+                         buffer.setLength(0);
+                     }
+                 }
+             }
+         }
+     }
+
+     writer.poisonAndWait();
+     if(showStats){System.err.println("Wrote "+fname);}
+ }
+
+ /**
+  * Writes the nonzero cells of a 4-dimensional count matrix as tab-delimited text, one cell per
+  * line: the four indices, then good+bad, then bad.
+  * A leading "?" in fname is replaced by Data.ROOT_QUALITY and "_p#" by "_p" plus the pass number.
+  * @param fname Output file name; must not be null (checked by assertion)
+  * @param goodMatrix Counts added to badMatrix to form the total column
+  * @param badMatrix Counts written as the last column; indexed exactly like goodMatrix
+  * @param pass Number substituted for '#' in "_p#"
+  */
+ public static void writeMatrix(String fname, long[][][][] goodMatrix, long[][][][] badMatrix, boolean overwrite, boolean append, int pass){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){fname=fname.replaceFirst("\\?", Data.ROOT_QUALITY);}
+     fname=fname.replace("_p#", "_p"+pass);
+
+     final FileFormat format=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     final TextStreamWriter writer=new TextStreamWriter(format);
+     writer.start();
+     final StringBuilder buffer=new StringBuilder();
+
+     //Dimension sizes are read from the first element at each level
+     final int d0=goodMatrix.length;
+     final int d1=goodMatrix[0].length;
+     final int d2=goodMatrix[0][0].length;
+     final int d3=goodMatrix[0][0][0].length;
+
+     for(int i0=0; i0<d0; i0++){
+         for(int i1=0; i1<d1; i1++){
+             for(int i2=0; i2<d2; i2++){
+                 for(int i3=0; i3<d3; i3++){
+                     final long good=goodMatrix[i0][i1][i2][i3];
+                     final long bad=badMatrix[i0][i1][i2][i3];
+                     final long sum=good+bad;
+                     if(sum<=0){continue;} //Cells without counts are not written
+                     buffer.append(i0).append('\t').append(i1).append('\t').append(i2).append('\t').append(i3).append('\t');
+                     buffer.append(sum).append('\t').append(bad).append('\n');
+                 }
+                 //Hand the finished innermost row to the writer and reuse the buffer
+                 if(buffer.length()>0){
+                     writer.print(buffer.toString());
+                     buffer.setLength(0);
+                 }
+             }
+         }
+     }
+
+     writer.poisonAndWait();
+     if(showStats){System.err.println("Wrote "+fname);}
+ }
+
+ /**
+  * Writes a pair of 3-dimensional count matrices as tab-delimited text.
+  * Each emitted line is {@code a b c (good+bad) bad}; cells whose combined count is zero are skipped.
+  * @param fname Output path. A leading '?' is replaced by Data.ROOT_QUALITY and "_p#" by "_p"+pass.
+  * @param goodMatrix Counts added to badMatrix to form the total column
+  * @param badMatrix Counts written as the last column; read with goodMatrix's dimensions
+  * @param overwrite Forwarded to FileFormat.testOutput
+  * @param append Forwarded to FileFormat.testOutput
+  * @param pass Number substituted for the '#' in "_p#"
+  */
+ public static void writeMatrix(String fname, long[][][] goodMatrix, long[][][] badMatrix, boolean overwrite, boolean append, int pass){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){
+         //Plain concatenation; replaceFirst would treat '\' and '$' in the root path as regex replacement syntax.
+         fname=Data.ROOT_QUALITY+fname.substring(1);
+     }
+     fname=fname.replace("_p#", "_p"+pass);
+     FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     TextStreamWriter tsw=new TextStreamWriter(ff);
+     tsw.start();
+     StringBuilder sb=new StringBuilder();
+
+     final int d0=goodMatrix.length, d1=goodMatrix[0].length, d2=goodMatrix[0][0].length;
+     for(int a=0; a<d0; a++){
+         for(int b=0; b<d1; b++){
+             for(int c=0; c<d2; c++){
+                 final long bad=badMatrix[a][b][c];
+                 final long sum=goodMatrix[a][b][c]+bad;
+                 if(sum>0){
+                     sb.append(a).append('\t');
+                     sb.append(b).append('\t');
+                     sb.append(c).append('\t');
+                     sb.append(sum).append('\t');
+                     sb.append(bad).append('\n');
+                 }
+             }
+             //Flush once per innermost row so the buffer stays small.
+             if(sb.length()>0){
+                 tsw.print(sb.toString());
+                 sb.setLength(0);
+             }
+         }
+     }
+     tsw.poisonAndWait();
+     if(showStats){System.err.println("Wrote "+fname);}
+ }
+
+ /**
+  * Writes a pair of 2-dimensional count matrices as tab-delimited text.
+  * Each emitted line is {@code a b (good+bad) bad}; cells whose combined count is zero are skipped.
+  * @param fname Output path. A leading '?' is replaced by Data.ROOT_QUALITY and "_p#" by "_p"+pass.
+  * @param goodMatrix Counts added to badMatrix to form the total column
+  * @param badMatrix Counts written as the last column; read with goodMatrix's dimensions
+  * @param overwrite Forwarded to FileFormat.testOutput
+  * @param append Forwarded to FileFormat.testOutput
+  * @param pass Number substituted for the '#' in "_p#"
+  */
+ public static void writeMatrix(String fname, long[][] goodMatrix, long[][] badMatrix, boolean overwrite, boolean append, int pass){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){
+         //Plain concatenation; replaceFirst would treat '\' and '$' in the root path as regex replacement syntax.
+         fname=Data.ROOT_QUALITY+fname.substring(1);
+     }
+     fname=fname.replace("_p#", "_p"+pass);
+     FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     TextStreamWriter tsw=new TextStreamWriter(ff);
+     tsw.start();
+     StringBuilder sb=new StringBuilder();
+
+     final int d0=goodMatrix.length, d1=goodMatrix[0].length;
+     for(int a=0; a<d0; a++){
+         for(int b=0; b<d1; b++){
+             final long bad=badMatrix[a][b];
+             final long sum=goodMatrix[a][b]+bad;
+             if(sum>0){
+                 sb.append(a).append('\t');
+                 sb.append(b).append('\t');
+                 sb.append(sum).append('\t');
+                 sb.append(bad).append('\n');
+             }
+         }
+         //Flush once per row so the buffer stays small.
+         if(sb.length()>0){
+             tsw.print(sb.toString());
+             sb.setLength(0);
+         }
+     }
+     tsw.poisonAndWait();
+     if(showStats){System.err.println("Wrote "+fname);}
+ }
+
+ /**
+  * Writes a pair of 1-dimensional count arrays as tab-delimited text.
+  * Each emitted line is {@code a (good+bad) bad}; entries whose combined count is zero are skipped.
+  * @param fname Output path. A leading '?' is replaced by Data.ROOT_QUALITY and "_p#" by "_p"+pass.
+  * @param goodMatrix Counts added to badMatrix to form the total column
+  * @param badMatrix Counts written as the last column; read with goodMatrix's length
+  * @param overwrite Forwarded to FileFormat.testOutput
+  * @param append Forwarded to FileFormat.testOutput
+  * @param pass Number substituted for the '#' in "_p#"
+  */
+ public static void writeMatrix(String fname, long[] goodMatrix, long[] badMatrix, boolean overwrite, boolean append, int pass){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){
+         //Plain concatenation; replaceFirst would treat '\' and '$' in the root path as regex replacement syntax.
+         fname=Data.ROOT_QUALITY+fname.substring(1);
+     }
+     fname=fname.replace("_p#", "_p"+pass);
+     FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     TextStreamWriter tsw=new TextStreamWriter(ff);
+     tsw.start();
+     StringBuilder sb=new StringBuilder();
+
+     final int d0=goodMatrix.length;
+     for(int a=0; a<d0; a++){
+         final long bad=badMatrix[a];
+         final long sum=goodMatrix[a]+bad;
+         if(sum>0){
+             sb.append(a).append('\t');
+             sb.append(sum).append('\t');
+             sb.append(bad).append('\n');
+         }
+         //Each line is handed to the writer as soon as it is built.
+         if(sb.length()>0){
+             tsw.print(sb.toString());
+             sb.setLength(0);
+         }
+     }
+     tsw.poisonAndWait();
+     if(showStats){System.err.println("Wrote "+fname);}
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Recalibrates a read's quality scores in place.
+  * Pass 0 is always applied; pass 1 is applied only when more than one pass is configured.
+  * @param r Read whose quality array is overwritten
+  */
+ public static final void recalibrate(Read r){
+     final boolean secondPass=(passes>1);
+     recalibrate(r, true, secondPass);
+ }
+
+ /**
+  * Applies the selected recalibration passes to a read, overwriting r.quality element by element.
+  * The pair number comes from the attached SamLine when r.obj is one, otherwise from the read itself;
+  * it is forced to 0 when USE_PAIRNUM is false.
+  * @param r Read to recalibrate
+  * @param pass0 Apply the pass-0 matrices
+  * @param pass1 Apply the pass-1 matrices (run after pass 0, on its output)
+  */
+ private static final void recalibrate(Read r, boolean pass0, boolean pass1){
+     final int pairnum;
+     if(USE_PAIRNUM){
+         int x=r.pairnum();
+         final Object obj=r.obj;
+         if(obj!=null && obj.getClass()==SamLine.class){
+             x=((SamLine)obj).pairnum();
+         }
+         pairnum=x;
+     }else{
+         pairnum=0;
+     }
+
+     final boolean[] enabled={pass0, pass1};
+     for(int pass=0; pass<enabled.length; pass++){
+         if(!enabled[pass]){continue;}
+         final byte[] quals2=recalibrate(r.bases, r.quality, pairnum, pass);
+         //The matrix set returns null for reads without qualities; nothing to copy in that case.
+         if(quals2==null){continue;}
+         //Copy into the existing array rather than swapping the reference.  Allows calibrating sam output.
+         System.arraycopy(quals2, 0, r.quality, 0, quals2.length);
+     }
+ }
+
+ /**
+  * Computes recalibrated qualities using the matrix set of one pass.
+  * @param bases Read bases
+  * @param quals Current quality scores
+  * @param pairnum Pair number used as the first matrix index
+  * @param pass Index into cmatrices
+  * @return Whatever CountMatrixSet.recalibrate returns for these inputs
+  */
+ public static final byte[] recalibrate(final byte[] bases, final byte[] quals, final int pairnum, int pass){
+     final CountMatrixSet set=cmatrices[pass];
+     return set.recalibrate(bases, quals, pairnum);
+ }
+
+ public static final void initializeMatrices(){
+ for(int i=0; i<passes; i++){
+ initializeMatrices(i);
+ }
+ }
+
+ /**
+  * Creates and loads the CountMatrixSet for one pass, at most once.
+  * The initialized flag is examined only while holding the lock: CountMatrixSet.load() raises the flag
+  * before cmatrices[pass] has been assigned, so an unlocked check could return while the slot is still null.
+  * @param pass Pass whose matrices should be loaded
+  */
+ public static final void initializeMatrices(int pass){
+     synchronized(initialized){
+         if(initialized[pass]){return;}
+         assert(cmatrices[pass]==null);
+         cmatrices[pass]=new CountMatrixSet(pass);
+         cmatrices[pass].load();
+         initialized[pass]=true;
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private static double modify(final double sum, final double bad, final int phred, final long cutoff){
+ double expected=QualityTools.PROB_ERROR[phred];
+
+ double sum2=sum+cutoff;
+ double bad2=bad+expected*cutoff;
+ double measured=bad2/sum2;
+
+ return measured;
+
+// double modified=Math.pow(measured*measured*measured*expected, 0.25);
+//// double modified=Math.sqrt(measured*expected);
+//// double modified=(measured+expected)*.5;
+//
+// return modified;
+ }
+
+ /**
+  * Converts 5-dimensional counts into per-cell probabilities via modify().
+  * The second index of each cell is handed to modify() as the phred value.
+  * @param sumMatrix Per-cell totals; its dimensions define the result's dimensions
+  * @param badMatrix Per-cell error counts
+  * @param cutoff Forwarded to modify()
+  * @return Newly allocated matrix of probabilities
+  */
+ public static final float[][][][][] toProbs(long[][][][][] sumMatrix, long[][][][][] badMatrix, final long cutoff){
+     final int len0=sumMatrix.length;
+     final int len1=sumMatrix[0].length;
+     final int len2=sumMatrix[0][0].length;
+     final int len3=sumMatrix[0][0][0].length;
+     final int len4=sumMatrix[0][0][0][0].length;
+     final float[][][][][] result=new float[len0][len1][len2][len3][len4];
+     for(int i=0; i<len0; i++){
+         for(int phred=0; phred<len1; phred++){
+             for(int j=0; j<len2; j++){
+                 for(int k=0; k<len3; k++){
+                     for(int m=0; m<len4; m++){
+                         result[i][phred][j][k][m]=(float)modify(sumMatrix[i][phred][j][k][m], badMatrix[i][phred][j][k][m], phred, cutoff);
+                     }
+                 }
+             }
+         }
+     }
+     return result;
+ }
+
+ /**
+  * Converts 4-dimensional counts into per-cell probabilities via modify().
+  * The second index of each cell is handed to modify() as the phred value.
+  * @param sumMatrix Per-cell totals; its dimensions define the result's dimensions
+  * @param badMatrix Per-cell error counts
+  * @param cutoff Forwarded to modify()
+  * @return Newly allocated matrix of probabilities
+  */
+ public static final float[][][][] toProbs(long[][][][] sumMatrix, long[][][][] badMatrix, final long cutoff){
+     final int len0=sumMatrix.length;
+     final int len1=sumMatrix[0].length;
+     final int len2=sumMatrix[0][0].length;
+     final int len3=sumMatrix[0][0][0].length;
+     final float[][][][] result=new float[len0][len1][len2][len3];
+     for(int i=0; i<len0; i++){
+         for(int phred=0; phred<len1; phred++){
+             for(int j=0; j<len2; j++){
+                 for(int k=0; k<len3; k++){
+                     result[i][phred][j][k]=(float)modify(sumMatrix[i][phred][j][k], badMatrix[i][phred][j][k], phred, cutoff);
+                 }
+             }
+         }
+     }
+     return result;
+ }
+
+ /**
+  * Converts 3-dimensional counts into per-cell probabilities via modify().
+  * The second index of each cell is handed to modify() as the phred value.
+  * @param sumMatrix Per-cell totals; its dimensions define the result's dimensions
+  * @param badMatrix Per-cell error counts
+  * @param cutoff Forwarded to modify()
+  * @return Newly allocated matrix of probabilities
+  */
+ public static final float[][][] toProbs(long[][][] sumMatrix, long[][][] badMatrix, final long cutoff){
+     final int len0=sumMatrix.length;
+     final int len1=sumMatrix[0].length;
+     final int len2=sumMatrix[0][0].length;
+     final float[][][] result=new float[len0][len1][len2];
+     for(int i=0; i<len0; i++){
+         for(int phred=0; phred<len1; phred++){
+             for(int j=0; j<len2; j++){
+                 result[i][phred][j]=(float)modify(sumMatrix[i][phred][j], badMatrix[i][phred][j], phred, cutoff);
+             }
+         }
+     }
+     return result;
+ }
+
+ /**
+  * Converts 2-dimensional counts into per-cell probabilities via modify().
+  * The second index of each cell is handed to modify() as the phred value.
+  * @param sumMatrix Per-cell totals; its dimensions define the result's dimensions
+  * @param badMatrix Per-cell error counts
+  * @param cutoff Forwarded to modify()
+  * @return Newly allocated matrix of probabilities
+  */
+ public static final float[][] toProbs(long[][] sumMatrix, long[][] badMatrix, final long cutoff){
+     final int len0=sumMatrix.length;
+     final int len1=sumMatrix[0].length;
+     final float[][] result=new float[len0][len1];
+     for(int i=0; i<len0; i++){
+         for(int phred=0; phred<len1; phred++){
+             result[i][phred]=(float)modify(sumMatrix[i][phred], badMatrix[i][phred], phred, cutoff);
+         }
+     }
+     return result;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Expands a leading '?' in a matrix file name to Data.ROOT_QUALITY.
+  * @param fname File name, must not be null
+  * @return The expanded name, or fname unchanged when it does not start with '?'
+  */
+ private static String findPath(String fname){
+     assert(fname!=null);
+     if(fname.startsWith("?")){
+         //Plain concatenation; replaceFirst would treat '\' and '$' in the root path as regex replacement syntax.
+         fname=Data.ROOT_QUALITY+fname.substring(1);
+     }
+     return fname;
+ }
+
+ /**
+  * Loads a 1-index count table from tab-delimited text with columns {@code a, bases, errors}.
+  * @param fname File to read (a leading '?' is expanded by findPath); null yields null
+  * @param d0 Size of the index dimension
+  * @return matrix[0] holds the bases column and matrix[1] the errors column, each indexed by a
+  * @throws RuntimeException rethrown after printing a hint to regenerate the matrices
+  */
+ public static final long[][] loadMatrix(String fname, int d0){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     System.err.println("Loading "+fname+".");
+
+     TextFile tf=null;
+     try{
+         long[][] matrix=new long[2][d0];
+
+         tf=new TextFile(fname, false, false);
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==3) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             long bases=Long.parseLong(split[1]);
+             long errors=Long.parseLong(split[2]);
+             matrix[0][a]=bases;
+             matrix[1][a]=errors;
+         }
+         return matrix;
+     }catch(RuntimeException e){
+         System.err.println("Error - please regenerate calibration matrices.");
+         throw(e);
+     }finally{
+         //Release the file on both the normal and the failing path.
+         if(tf!=null){tf.close();}
+     }
+ }
+
+ /**
+  * Loads a 2-index count table from tab-delimited text with columns {@code a, b, bases, errors}.
+  * @param fname File to read (a leading '?' is expanded by findPath); null yields null
+  * @param d0 Size of the first index dimension
+  * @param d1 Size of the second index dimension
+  * @return matrix[0] holds the bases column and matrix[1] the errors column, each indexed by [a][b]
+  * @throws RuntimeException rethrown after printing a hint to regenerate the matrices
+  */
+ public static final long[][][] loadMatrix(String fname, int d0, int d1){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     System.err.println("Loading "+fname+".");
+
+     TextFile tf=null;
+     try{
+         long[][][] matrix=new long[2][d0][d1];
+
+         tf=new TextFile(fname, false, false);
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==4) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             int b=Integer.parseInt(split[1]);
+             long bases=Long.parseLong(split[2]);
+             long errors=Long.parseLong(split[3]);
+             matrix[0][a][b]=bases;
+             matrix[1][a][b]=errors;
+         }
+         return matrix;
+     }catch(RuntimeException e){
+         System.err.println("Error - please regenerate calibration matrices.");
+         throw(e);
+     }finally{
+         //Release the file on both the normal and the failing path.
+         if(tf!=null){tf.close();}
+     }
+ }
+
+ /**
+  * Loads a 3-index count table from tab-delimited text with columns {@code a, b, c, bases, errors}.
+  * @param fname File to read (a leading '?' is expanded by findPath); null yields null
+  * @param d0 Size of the first index dimension
+  * @param d1 Size of the second index dimension
+  * @param d2 Size of the third index dimension
+  * @return matrix[0] holds the bases column and matrix[1] the errors column, each indexed by [a][b][c]
+  * @throws RuntimeException rethrown after printing a hint to regenerate the matrices
+  */
+ public static final long[][][][] loadMatrix(String fname, int d0, int d1, int d2){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     System.err.println("Loading "+fname+".");
+
+     TextFile tf=null;
+     try{
+         long[][][][] matrix=new long[2][d0][d1][d2];
+
+         tf=new TextFile(fname, false, false);
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==5) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             int b=Integer.parseInt(split[1]);
+             int c=Integer.parseInt(split[2]);
+             long bases=Long.parseLong(split[3]);
+             long errors=Long.parseLong(split[4]);
+             matrix[0][a][b][c]=bases;
+             matrix[1][a][b][c]=errors;
+         }
+         return matrix;
+     }catch(RuntimeException e){
+         System.err.println("Error - please regenerate calibration matrices.");
+         throw(e);
+     }finally{
+         //Release the file on both the normal and the failing path.
+         if(tf!=null){tf.close();}
+     }
+ }
+
+ /**
+  * Loads a 4-index count table from tab-delimited text with columns {@code a, b, c, d, bases, errors}.
+  * @param fname File to read (a leading '?' is expanded by findPath); null yields null
+  * @param d0 Size of the first index dimension
+  * @param d1 Size of the second index dimension
+  * @param d2 Size of the third index dimension
+  * @param d3 Size of the fourth index dimension
+  * @return matrix[0] holds the bases column and matrix[1] the errors column, each indexed by [a][b][c][d]
+  * @throws RuntimeException rethrown after printing a hint to regenerate the matrices
+  */
+ public static final long[][][][][] loadMatrix(String fname, int d0, int d1, int d2, int d3){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     System.err.println("Loading "+fname+".");
+
+     TextFile tf=null;
+     try{
+         long[][][][][] matrix=new long[2][d0][d1][d2][d3];
+
+         tf=new TextFile(fname, false, false);
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==6) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             int b=Integer.parseInt(split[1]);
+             int c=Integer.parseInt(split[2]);
+             int d=Integer.parseInt(split[3]);
+             long bases=Long.parseLong(split[4]);
+             long errors=Long.parseLong(split[5]);
+             matrix[0][a][b][c][d]=bases;
+             matrix[1][a][b][c][d]=errors;
+         }
+         return matrix;
+     }catch(RuntimeException e){
+         System.err.println("Error - please regenerate calibration matrices.");
+         throw(e);
+     }finally{
+         //Release the file on both the normal and the failing path.
+         if(tf!=null){tf.close();}
+     }
+ }
+
+ /**
+  * Loads a 5-index count table from tab-delimited text with columns {@code a, b, c, d, e, bases, errors}.
+  * @param fname File to read (a leading '?' is expanded by findPath); null yields null
+  * @param d0 Size of the first index dimension
+  * @param d1 Size of the second index dimension
+  * @param d2 Size of the third index dimension
+  * @param d3 Size of the fourth index dimension
+  * @param d4 Size of the fifth index dimension
+  * @return matrix[0] holds the bases column and matrix[1] the errors column, each indexed by [a][b][c][d][e]
+  * @throws RuntimeException rethrown after printing a hint to regenerate the matrices
+  */
+ public static final long[][][][][][] loadMatrix(String fname, int d0, int d1, int d2, int d3, int d4){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     System.err.println("Loading "+fname+".");
+
+     TextFile tf=null;
+     try{
+         long[][][][][][] matrix=new long[2][d0][d1][d2][d3][d4];
+
+         tf=new TextFile(fname, false, false);
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==7) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             int b=Integer.parseInt(split[1]);
+             int c=Integer.parseInt(split[2]);
+             int d=Integer.parseInt(split[3]);
+             int e=Integer.parseInt(split[4]);
+             long bases=Long.parseLong(split[5]);
+             long errors=Long.parseLong(split[6]);
+             matrix[0][a][b][c][d][e]=bases;
+             matrix[1][a][b][c][d][e]=errors;
+         }
+         return matrix;
+     }catch(RuntimeException e){
+         System.err.println("Error - please regenerate calibration matrices.");
+         throw(e);
+     }finally{
+         //Release the file on both the normal and the failing path.
+         if(tf!=null){tf.close();}
+     }
+ }
+
+ private static byte[] fillBaseToNum(){
+ byte[] btn=new byte[128];
+ Arrays.fill(btn, (byte)5);
+ btn['A']=btn['a']=0;
+ btn['C']=btn['c']=1;
+ btn['G']=btn['g']=2;
+ btn['T']=btn['t']=3;
+ btn['U']=btn['u']=3;
+ btn['E']=4;
+ return btn;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nested Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Thread that drains read lists from a ConcurrentReadInputStream and tallies, base by base,
+  * good and bad observations into its own GBMatrixSet.
+  */
+ private class Worker extends Thread {
+
+     Worker(ConcurrentReadInputStream cris_, int pass_){
+         cris=cris_;
+         pass=pass_;
+         matrixT=new GBMatrixSet(pass);
+     }
+
+     /**
+      * Fetches lists until a null or empty one arrives. On passes after the first, each read and its
+      * mate are recalibrated with the pass-0 matrices before being tallied.
+      */
+     @Override
+     public void run(){
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         while(reads!=null && reads.size()>0){
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 Read r1=reads.get(idx);
+                 Read r2=r1.mate;
+                 if(pass>0){
+                     recalibrate(r1, true, false);
+                     if(r2!=null){recalibrate(r2, true, false);}
+                 }
+                 processLocal(r1);
+                 processLocal(r2);
+             }
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         //Hand back the terminating list as well.
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     /**
+      * Tallies one read. A null read is ignored; a read lacking qualities, bases or a match string
+      * is counted as processed but contributes nothing to the matrices.
+      * Match symbols 'N', 'D' and 'C', and bases that are not fully defined, are not counted.
+      * A counted position adds a weight of 2: all good for a clean 'm', all bad for 'S' (or 'I' when
+      * COUNT_INDELS is set), and split 1 good / 1 bad for an 'm' adjacent to a 'D' when COUNT_INDELS is set.
+      * @param r Read to tally, may be null
+      */
+     private void processLocal(Read r){
+
+         if(r==null){return;}
+         final int pairnum;
+         if(!USE_PAIRNUM){
+             pairnum=0;
+         }else if(r.obj!=null && r.obj.getClass()==SamLine.class){
+             pairnum=((SamLine)r.obj).pairnum();
+         }else{
+             pairnum=r.pairnum();
+         }
+         readsProcessedT++;
+         basesProcessedT+=r.length();
+
+         if(verbose){outstream.println(r+"\n");}
+
+         if(verbose){outstream.println("A");}
+         if(r.match!=null && r.shortmatch()){
+             r.match=Read.toLongMatchString(r.match);
+             r.setShortMatch(false);
+         }
+         final byte[] quals=r.quality, bases=r.bases, match=r.match;
+         if(quals==null || bases==null || match==null){return;}
+         if(verbose){outstream.println("B");}
+         if(r.strand()==Gene.MINUS){
+             Tools.reverseInPlace(match);
+         }
+         if(verbose){outstream.println("C");}
+
+         final byte e='E';
+
+         if(readstatsT!=null){
+             readstatsT.addToQualityHistogram(r);
+         }
+
+         readsUsedT++;
+         for(int qpos=0, mpos=0, last=quals.length-1; mpos<match.length; mpos++){
+
+             final byte m=match[mpos];
+             final byte mprev=match[Tools.max(mpos-1, 0)];
+             final byte mnext=match[Tools.min(mpos+1, match.length-1)];
+
+             if(verbose){outstream.print("D");}
+             //Neighbor qualities are clamped to [0, QMAX]; QEND stands in beyond either end of the read.
+             final int q0=(qpos>0 ? Tools.mid(QMAX, quals[qpos-1], 0) : QEND);
+             final int q1=quals[qpos];
+             final int q2=(qpos<last ? Tools.mid(QMAX, quals[qpos+1], 0) : QEND);
+
+             //'E' stands in for bases beyond either end of the read.
+             byte b0=qpos>1 ? bases[qpos-2] : e;
+             byte b1=qpos>0 ? bases[qpos-1] : e;
+             byte b2=bases[qpos];
+             byte b3=qpos<last ? bases[qpos+1] : e;
+             byte b4=qpos<last-1 ? bases[qpos+2] : e;
+             byte n0=baseToNum[b0];
+             byte n1=baseToNum[b1];
+             byte n2=baseToNum[b2];
+             byte n3=baseToNum[b3];
+             byte n4=baseToNum[b4];
+
+             if(m=='N' || !AminoAcid.isFullyDefined(b2)){
+                 if(verbose){outstream.print("E");}
+                 //do nothing
+             }else if(m=='D'){
+                 if(verbose){outstream.print("E");}
+                 //do nothing
+             }else if(m=='C'){
+                 if(verbose){outstream.print("E");}
+                 //do nothing
+             }else{
+                 final int pos=Tools.min(qpos, LENMAX-1);
+
+                 if(verbose){outstream.print("F");}
+                 basesUsedT++;
+                 if(m=='m' || (!COUNT_INDELS && m=='I')){
+                     final int incr;
+                     if(COUNT_INDELS && (mprev=='D' || mnext=='D')){
+                         incr=1;
+                         addBad(pairnum, q0, q1, q2, n0, n1, n2, n3, n4, pos, 1);
+                     }else{
+                         incr=2;
+                     }
+                     addGood(pairnum, q0, q1, q2, n0, n1, n2, n3, n4, pos, incr);
+                 }else if(m=='S' || m=='I'){
+                     addBad(pairnum, q0, q1, q2, n0, n1, n2, n3, n4, pos, 2);
+                 }else{
+                     throw new RuntimeException("Bad symbol m='"+((char)m)+"'\n"+new String(match)+"\n"+new String(bases)+"\n");
+                 }
+             }
+             //Every symbol except 'D' advances the position in the read.
+             if(m!='D'){qpos++;}
+         }
+
+     }
+
+     /** Adds amount to every Good matrix at the cell selected by the given context. */
+     private void addGood(int pairnum, int q0, int q1, int q2, int n0, int n1, int n2, int n3, int n4, int pos, int amount){
+         matrixT.q102GoodMatrix[pairnum][q1][q0][q2]+=amount;
+         matrixT.qbpGoodMatrix[pairnum][q1][n2][pos]+=amount;
+
+         matrixT.q10GoodMatrix[pairnum][q1][q0]+=amount;
+         //Indexed by q2, matching how q12ProbMatrix is read during recalibration.
+         matrixT.q12GoodMatrix[pairnum][q1][q2]+=amount;
+         matrixT.qb12GoodMatrix[pairnum][q1][n1][n2]+=amount;
+         matrixT.qb012GoodMatrix[pairnum][q1][n0][n1][n2]+=amount;
+         matrixT.qb123GoodMatrix[pairnum][q1][n1][n2][n3]+=amount;
+         matrixT.qb234GoodMatrix[pairnum][q1][n2][n3][n4]+=amount;
+         matrixT.q12b12GoodMatrix[pairnum][q1][q2][n1][n2]+=amount;
+         matrixT.qpGoodMatrix[pairnum][q1][pos]+=amount;
+         matrixT.qGoodMatrix[pairnum][q1]+=amount;
+         matrixT.pGoodMatrix[pairnum][pos]+=amount;
+     }
+
+     /** Adds amount to every Bad matrix at the cell selected by the given context. */
+     private void addBad(int pairnum, int q0, int q1, int q2, int n0, int n1, int n2, int n3, int n4, int pos, int amount){
+         matrixT.q102BadMatrix[pairnum][q1][q0][q2]+=amount;
+         matrixT.qbpBadMatrix[pairnum][q1][n2][pos]+=amount;
+
+         matrixT.q10BadMatrix[pairnum][q1][q0]+=amount;
+         //Indexed by q2, matching how q12ProbMatrix is read during recalibration.
+         matrixT.q12BadMatrix[pairnum][q1][q2]+=amount;
+         matrixT.qb12BadMatrix[pairnum][q1][n1][n2]+=amount;
+         matrixT.qb012BadMatrix[pairnum][q1][n0][n1][n2]+=amount;
+         matrixT.qb123BadMatrix[pairnum][q1][n1][n2][n3]+=amount;
+         matrixT.qb234BadMatrix[pairnum][q1][n2][n3][n4]+=amount;
+         matrixT.q12b12BadMatrix[pairnum][q1][q2][n1][n2]+=amount;
+         matrixT.qpBadMatrix[pairnum][q1][pos]+=amount;
+         matrixT.qBadMatrix[pairnum][q1]+=amount;
+         matrixT.pBadMatrix[pairnum][pos]+=amount;
+     }
+
+     /** Reads seen by this thread, including those skipped for missing data. */
+     long readsProcessedT=0;
+     /** Sum of r.length() over the reads counted in readsProcessedT. */
+     long basesProcessedT=0;
+     /** Per-thread quality histogram; null when qhist is null. */
+     final ReadStats readstatsT=(qhist==null ? null : new ReadStats());
+     /** Reads and bases that actually contributed to the matrices. */
+     long readsUsedT=0, basesUsedT;
+
+     private final ConcurrentReadInputStream cris;
+     private final int pass;
+     /** Counts accumulated by this thread. */
+     GBMatrixSet matrixT;
+
+ }
+
+ /**
+  * One complete set of Good/Bad count matrices for a single pass.
+  * Every matrix takes the pair number (0 or 1) as its first index.
+  */
+ static class GBMatrixSet{
+
+     GBMatrixSet(int pass_){
+         pass=pass_;
+         assert(pass==0 || (pass==1));
+     }
+
+     /** Adds every matrix of incr into the matching matrix of this set. */
+     final void add(GBMatrixSet incr){
+         CalcTrueQuality.add(q102GoodMatrix, incr.q102GoodMatrix);
+         CalcTrueQuality.add(q102BadMatrix, incr.q102BadMatrix);
+
+         CalcTrueQuality.add(qbpGoodMatrix, incr.qbpGoodMatrix);
+         CalcTrueQuality.add(qbpBadMatrix, incr.qbpBadMatrix);
+
+         CalcTrueQuality.add(q10GoodMatrix, incr.q10GoodMatrix);
+         CalcTrueQuality.add(q10BadMatrix, incr.q10BadMatrix);
+
+         CalcTrueQuality.add(q12GoodMatrix, incr.q12GoodMatrix);
+         CalcTrueQuality.add(q12BadMatrix, incr.q12BadMatrix);
+
+         CalcTrueQuality.add(qb12GoodMatrix, incr.qb12GoodMatrix);
+         CalcTrueQuality.add(qb12BadMatrix, incr.qb12BadMatrix);
+
+         CalcTrueQuality.add(qb012GoodMatrix, incr.qb012GoodMatrix);
+         CalcTrueQuality.add(qb012BadMatrix, incr.qb012BadMatrix);
+
+         CalcTrueQuality.add(qb123GoodMatrix, incr.qb123GoodMatrix);
+         CalcTrueQuality.add(qb123BadMatrix, incr.qb123BadMatrix);
+
+         CalcTrueQuality.add(qb234GoodMatrix, incr.qb234GoodMatrix);
+         CalcTrueQuality.add(qb234BadMatrix, incr.qb234BadMatrix);
+
+         CalcTrueQuality.add(q12b12GoodMatrix, incr.q12b12GoodMatrix);
+         CalcTrueQuality.add(q12b12BadMatrix, incr.q12b12BadMatrix);
+
+         CalcTrueQuality.add(qpGoodMatrix, incr.qpGoodMatrix);
+         CalcTrueQuality.add(qpBadMatrix, incr.qpBadMatrix);
+
+         CalcTrueQuality.add(qGoodMatrix, incr.qGoodMatrix);
+         CalcTrueQuality.add(qBadMatrix, incr.qBadMatrix);
+
+         CalcTrueQuality.add(pGoodMatrix, incr.pGoodMatrix);
+         CalcTrueQuality.add(pBadMatrix, incr.pBadMatrix);
+     }
+
+     /** Writes each matrix pair whose output file name is non-null, in a fixed order. */
+     public void write() {
+         if(q102matrix!=null){
+             writeMatrix(q102matrix, q102GoodMatrix, q102BadMatrix, overwrite, append, pass);
+         }
+         if(qbpmatrix!=null){
+             writeMatrix(qbpmatrix, qbpGoodMatrix, qbpBadMatrix, overwrite, append, pass);
+         }
+         if(q10matrix!=null){
+             writeMatrix(q10matrix, q10GoodMatrix, q10BadMatrix, overwrite, append, pass);
+         }
+         if(q12matrix!=null){
+             writeMatrix(q12matrix, q12GoodMatrix, q12BadMatrix, overwrite, append, pass);
+         }
+         if(qb12matrix!=null){
+             writeMatrix(qb12matrix, qb12GoodMatrix, qb12BadMatrix, overwrite, append, pass);
+         }
+         if(qb012matrix!=null){
+             writeMatrix(qb012matrix, qb012GoodMatrix, qb012BadMatrix, overwrite, append, pass);
+         }
+         if(qb123matrix!=null){
+             writeMatrix(qb123matrix, qb123GoodMatrix, qb123BadMatrix, overwrite, append, pass);
+         }
+         if(qb234matrix!=null){
+             writeMatrix(qb234matrix, qb234GoodMatrix, qb234BadMatrix, overwrite, append, pass);
+         }
+         if(q12b12matrix!=null){
+             writeMatrix(q12b12matrix, q12b12GoodMatrix, q12b12BadMatrix, overwrite, append, pass);
+         }
+         if(qpmatrix!=null){
+             writeMatrix(qpmatrix, qpGoodMatrix, qpBadMatrix, overwrite, append, pass);
+         }
+         if(qmatrix!=null){
+             writeMatrix(qmatrix, qGoodMatrix, qBadMatrix, overwrite, append, pass);
+         }
+         if(pmatrix!=null){
+             writeMatrix(pmatrix, pGoodMatrix, pBadMatrix, overwrite, append, pass);
+         }
+     }
+
+     /* Three-quality context */
+     final long[][][][] q102GoodMatrix=new long[2][QMAX2][QMAX2][QMAX2];
+     final long[][][][] q102BadMatrix=new long[2][QMAX2][QMAX2][QMAX2];
+
+     /* Quality, base, position */
+     final long[][][][] qbpGoodMatrix=new long[2][QMAX2][BMAX][LENMAX];
+     final long[][][][] qbpBadMatrix=new long[2][QMAX2][BMAX][LENMAX];
+
+     /* Two-quality contexts */
+     final long[][][] q10GoodMatrix=new long[2][QMAX2][QMAX2];
+     final long[][][] q10BadMatrix=new long[2][QMAX2][QMAX2];
+
+     final long[][][] q12GoodMatrix=new long[2][QMAX2][QMAX2];
+     final long[][][] q12BadMatrix=new long[2][QMAX2][QMAX2];
+
+     /* Quality plus base contexts */
+     final long[][][][] qb12GoodMatrix=new long[2][QMAX2][BMAX][BMAX];
+     final long[][][][] qb12BadMatrix=new long[2][QMAX2][BMAX][BMAX];
+
+     final long[][][][][] qb012GoodMatrix=new long[2][QMAX2][BMAX][BMAX][BMAX];
+     final long[][][][][] qb012BadMatrix=new long[2][QMAX2][BMAX][BMAX][BMAX];
+
+     final long[][][][][] qb123GoodMatrix=new long[2][QMAX2][BMAX][BMAX][BMAX];
+     final long[][][][][] qb123BadMatrix=new long[2][QMAX2][BMAX][BMAX][BMAX];
+
+     final long[][][][][] qb234GoodMatrix=new long[2][QMAX2][BMAX][BMAX][BMAX];
+     final long[][][][][] qb234BadMatrix=new long[2][QMAX2][BMAX][BMAX][BMAX];
+
+     /* Two qualities plus two bases */
+     final long[][][][][] q12b12GoodMatrix=new long[2][QMAX2][QMAX2][BMAX][BMAX];
+     final long[][][][][] q12b12BadMatrix=new long[2][QMAX2][QMAX2][BMAX][BMAX];
+
+     /* Quality and position */
+     final long[][][] qpGoodMatrix=new long[2][QMAX2][LENMAX];
+     final long[][][] qpBadMatrix=new long[2][QMAX2][LENMAX];
+
+     /* Quality alone */
+     final long[][] qGoodMatrix=new long[2][QMAX2];
+     final long[][] qBadMatrix=new long[2][QMAX2];
+
+     /* Position alone */
+     final long[][] pGoodMatrix=new long[2][LENMAX];
+     final long[][] pBadMatrix=new long[2][LENMAX];
+
+     /** Pass number handed to writeMatrix when the matrices are written. */
+     final int pass;
+
+ }
+
+ static class CountMatrixSet{
+
+ /**
+  * Creates the set for one pass and immediately attempts to load its matrices.
+  * @param pass_ Pass number, asserted to be 0 or 1
+  */
+ CountMatrixSet(int pass_){
+     this.pass=pass_;
+     assert(this.pass==0 || this.pass==1);
+     this.load();
+ }
+
+ /**
+  * Computes a new quality score for every base of a read.
+  * Bases that are not fully defined get quality 0; all others get the phred score of the estimated
+  * error probability, floored at 2. The estimator is chosen by USE_WEIGHTED_AVERAGE, then USE_AVERAGE,
+  * falling back to the maximum.
+  * @param bases Read bases
+  * @param quals Current quality scores
+  * @param pairnum Pair number used as the first matrix index
+  * @return A new array of quals.length scores, or null when quals is null and assertions are disabled
+  */
+ public byte[] recalibrate(byte[] bases, byte[] quals, int pairnum) {
+     //NOTE(review): 'round' is computed but never read; probErrorToPhred below is always passed true. Confirm intent.
+     final boolean round=(pass<passes-1);
+     if(quals==null){
+         assert(false) : "Can't recalibrate qualities for reads that don't have quality scores.";
+         //TODO
+         return null;
+     }
+
+     assert(quals.length<=LENMAX || !(use_qp[pass] || use_qbp[pass])) :
+         "\nThese reads are too long ("+quals.length+"bp) for recalibration using position. Please select different matrices.\n";
+     final byte[] quals2=new byte[quals.length];
+     for(int i=0; i<bases.length; i++){
+         if(!AminoAcid.isFullyDefined(bases[i])){
+             quals2[i]=0;
+             continue;
+         }
+         final float prob;
+         if(USE_WEIGHTED_AVERAGE){
+             prob=estimateErrorProb2(quals, bases, i, pairnum, OBSERVATION_CUTOFF[pass]);
+         }else if(USE_AVERAGE){
+             prob=estimateErrorProbAvg(quals, bases, i, pairnum);
+         }else{
+             prob=estimateErrorProbMax(quals, bases, i, pairnum);
+         }
+         quals2[i]=Tools.max((byte)2, QualityTools.probErrorToPhred(prob, true));
+     }
+     return quals2;
+ }
+
+ void load(){
+ synchronized(initialized){
+ if(initialized[pass]){return;}
+
+ if(use_q102[pass]){
+ q102CountMatrix=loadMatrix(q102matrix.replace("_p#", "_p"+pass), 2, QMAX2, QMAX2, QMAX2);
+ q102ProbMatrix=toProbs(q102CountMatrix[0], q102CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_qbp[pass]){
+ qbpCountMatrix=loadMatrix(qbpmatrix.replace("_p#", "_p"+pass), 2, QMAX2, 4, LENMAX);
+ qbpProbMatrix=toProbs(qbpCountMatrix[0], qbpCountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_q10[pass]){
+ q10CountMatrix=loadMatrix(q10matrix.replace("_p#", "_p"+pass), 2, QMAX2, QMAX2);
+ q10ProbMatrix=toProbs(q10CountMatrix[0], q10CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_q12[pass]){
+ q12CountMatrix=loadMatrix(q12matrix.replace("_p#", "_p"+pass), 2, QMAX2, QMAX2);
+ q12ProbMatrix=toProbs(q12CountMatrix[0], q12CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_qb12[pass]){
+ qb12CountMatrix=loadMatrix(qb12matrix.replace("_p#", "_p"+pass), 2, QMAX2, BMAX, 4);
+ qb12ProbMatrix=toProbs(qb12CountMatrix[0], qb12CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_qb012[pass]){
+ qb012CountMatrix=loadMatrix(qb012matrix.replace("_p#", "_p"+pass), 2, QMAX2, BMAX, BMAX, 4);
+ qb012ProbMatrix=toProbs(qb012CountMatrix[0], qb012CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_qb123[pass]){
+ qb123CountMatrix=loadMatrix(qb123matrix.replace("_p#", "_p"+pass), 2, QMAX2, BMAX, 4, BMAX);
+ qb123ProbMatrix=toProbs(qb123CountMatrix[0], qb123CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_qb234[pass]){
+ qb234CountMatrix=loadMatrix(qb234matrix.replace("_p#", "_p"+pass), 2, QMAX2, 4, BMAX, BMAX);
+ qb234ProbMatrix=toProbs(qb234CountMatrix[0], qb234CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_q12b12[pass]){
+ q12b12CountMatrix=loadMatrix(q12b12matrix.replace("_p#", "_p"+pass), 2, QMAX2, QMAX2, BMAX, BMAX);
+ q12b12ProbMatrix=toProbs(q12b12CountMatrix[0], q12b12CountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_qp[pass]){
+ qpCountMatrix=loadMatrix(qpmatrix.replace("_p#", "_p"+pass), 2, QMAX2, LENMAX);
+ qpProbMatrix=toProbs(qpCountMatrix[0], qpCountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+ if(use_q[pass]){
+ qCountMatrix=loadMatrix(qmatrix.replace("_p#", "_p"+pass), 2, QMAX2);
+ qProbMatrix=toProbs(qCountMatrix[0], qCountMatrix[1], OBSERVATION_CUTOFF[pass]);
+ }
+
+ initialized[pass]=true;
+ }
+ }
+
+
+
+ /**
+  * Estimates the error probability of one base as the arithmetic mean of the values found in
+  * every loaded probability matrix.
+  * @param quals Read quality scores
+  * @param bases Read bases
+  * @param pos Position of the base within the read
+  * @param pairnum Pair number used as the first matrix index
+  * @return The mean; PROB_ERROR[quals[pos]] if no matrix is loaded and assertions are disabled
+  */
+ public final float estimateErrorProbAvg(byte[] quals, byte[] bases, int pos, int pairnum){
+
+     final byte endSymbol='E';
+     final int last=quals.length-1;
+
+     //Neighbor qualities are clamped to [0, QMAX]; QEND stands in beyond either end of the read.
+     final int q0=(pos>0 ? Tools.mid(QMAX, quals[pos-1], 0) : QEND);
+     final int q1=quals[pos];
+     final int q2=(pos<last ? Tools.mid(QMAX, quals[pos+1], 0) : QEND);
+
+     //Numeric codes of the five-base window centered on pos.
+     final byte n0=baseToNum[pos>1 ? bases[pos-2] : endSymbol];
+     final byte n1=baseToNum[pos>0 ? bases[pos-1] : endSymbol];
+     final byte n2=baseToNum[bases[pos]];
+     final byte n3=baseToNum[pos<last ? bases[pos+1] : endSymbol];
+     final byte n4=baseToNum[pos<last-1 ? bases[pos+2] : endSymbol];
+
+     final float expected=PROB_ERROR[q1];
+     float total=0;
+     int used=0;
+
+     if(q102ProbMatrix!=null){total+=q102ProbMatrix[pairnum][q1][q0][q2]; used++;}
+     if(qbpProbMatrix!=null){total+=qbpProbMatrix[pairnum][q1][n2][pos]; used++;}
+     if(q10ProbMatrix!=null){total+=q10ProbMatrix[pairnum][q1][q0]; used++;}
+     if(q12ProbMatrix!=null){total+=q12ProbMatrix[pairnum][q1][q2]; used++;}
+     if(qb12ProbMatrix!=null){total+=qb12ProbMatrix[pairnum][q1][n1][n2]; used++;}
+     if(qb012ProbMatrix!=null){total+=qb012ProbMatrix[pairnum][q1][n0][n1][n2]; used++;}
+     if(qb123ProbMatrix!=null){total+=qb123ProbMatrix[pairnum][q1][n1][n2][n3]; used++;}
+     if(qb234ProbMatrix!=null){total+=qb234ProbMatrix[pairnum][q1][n2][n3][n4]; used++;}
+     if(q12b12ProbMatrix!=null){total+=q12b12ProbMatrix[pairnum][q1][q2][n1][n2]; used++;}
+     if(qpProbMatrix!=null){total+=qpProbMatrix[pairnum][q1][pos]; used++;}
+     if(qProbMatrix!=null){total+=qProbMatrix[pairnum][q1]; used++;}
+
+     if(used<1){
+         assert(false);
+         return expected;
+     }
+     return (total/(float)used);
+ }
+
+ /**
+  * Estimates the error probability of one base as the largest value found in any loaded
+  * probability matrix.
+  * @param quals Read quality scores
+  * @param bases Read bases
+  * @param pos Position of the base within the read
+  * @param pairnum Pair number used as the first matrix index
+  * @return The maximum; PROB_ERROR[quals[pos]] if the running maximum stays negative and assertions are disabled
+  */
+ public final float estimateErrorProbMax(byte[] quals, byte[] bases, int pos, int pairnum){
+
+     final byte endSymbol='E';
+     final int last=quals.length-1;
+
+     //Neighbor qualities are clamped to [0, QMAX]; QEND stands in beyond either end of the read.
+     final int q0=(pos>0 ? Tools.mid(QMAX, quals[pos-1], 0) : QEND);
+     final int q1=quals[pos];
+     final int q2=(pos<last ? Tools.mid(QMAX, quals[pos+1], 0) : QEND);
+
+     //Numeric codes of the five-base window centered on pos.
+     final byte n0=baseToNum[pos>1 ? bases[pos-2] : endSymbol];
+     final byte n1=baseToNum[pos>0 ? bases[pos-1] : endSymbol];
+     final byte n2=baseToNum[bases[pos]];
+     final byte n3=baseToNum[pos<last ? bases[pos+1] : endSymbol];
+     final byte n4=baseToNum[pos<last-1 ? bases[pos+2] : endSymbol];
+
+     final float expected=PROB_ERROR[q1];
+
+     //Starts below any probability so the first loaded matrix always replaces it.
+     float best=-1;
+
+     if(q102ProbMatrix!=null){best=Tools.max(best, q102ProbMatrix[pairnum][q1][q0][q2]);}
+     if(qbpProbMatrix!=null){best=Tools.max(best, qbpProbMatrix[pairnum][q1][n2][pos]);}
+     if(q10ProbMatrix!=null){best=Tools.max(best, q10ProbMatrix[pairnum][q1][q0]);}
+     if(q12ProbMatrix!=null){best=Tools.max(best, q12ProbMatrix[pairnum][q1][q2]);}
+     if(qb12ProbMatrix!=null){best=Tools.max(best, qb12ProbMatrix[pairnum][q1][n1][n2]);}
+     if(qb012ProbMatrix!=null){best=Tools.max(best, qb012ProbMatrix[pairnum][q1][n0][n1][n2]);}
+     if(qb123ProbMatrix!=null){best=Tools.max(best, qb123ProbMatrix[pairnum][q1][n1][n2][n3]);}
+     if(qb234ProbMatrix!=null){best=Tools.max(best, qb234ProbMatrix[pairnum][q1][n2][n3][n4]);}
+     if(q12b12ProbMatrix!=null){best=Tools.max(best, q12b12ProbMatrix[pairnum][q1][q2][n1][n2]);}
+     if(qpProbMatrix!=null){best=Tools.max(best, qpProbMatrix[pairnum][q1][pos]);}
+     if(qProbMatrix!=null){best=Tools.max(best, qProbMatrix[pairnum][q1]);}
+
+     if(best<0){
+         assert(false);
+         return expected;
+     }
+     return best;
+ }
+
+ /**
+  * Estimates the error probability of one base as the geometric mean of the values found in
+  * every loaded probability matrix.
+  * @param quals Read quality scores
+  * @param bases Read bases
+  * @param pos Position of the base within the read
+  * @param pairnum Pair number used as the first matrix index
+  * @return The geometric mean; PROB_ERROR[quals[pos]] if no matrix is loaded and assertions are disabled
+  */
+ public final float estimateErrorProbGeoAvg(byte[] quals, byte[] bases, int pos, int pairnum){
+
+     final byte endSymbol='E';
+     final int last=quals.length-1;
+
+     //Neighbor qualities are clamped to [0, QMAX]; QEND stands in beyond either end of the read.
+     final int q0=(pos>0 ? Tools.mid(QMAX, quals[pos-1], 0) : QEND);
+     final int q1=quals[pos];
+     final int q2=(pos<last ? Tools.mid(QMAX, quals[pos+1], 0) : QEND);
+
+     //Numeric codes of the five-base window centered on pos.
+     final byte n0=baseToNum[pos>1 ? bases[pos-2] : endSymbol];
+     final byte n1=baseToNum[pos>0 ? bases[pos-1] : endSymbol];
+     final byte n2=baseToNum[bases[pos]];
+     final byte n3=baseToNum[pos<last ? bases[pos+1] : endSymbol];
+     final byte n4=baseToNum[pos<last-1 ? bases[pos+2] : endSymbol];
+
+     final float expected=PROB_ERROR[q1];
+     double running=1;
+     int used=0;
+
+     if(q102ProbMatrix!=null){running*=q102ProbMatrix[pairnum][q1][q0][q2]; used++;}
+     if(qbpProbMatrix!=null){running*=qbpProbMatrix[pairnum][q1][n2][pos]; used++;}
+     if(q10ProbMatrix!=null){running*=q10ProbMatrix[pairnum][q1][q0]; used++;}
+     if(q12ProbMatrix!=null){running*=q12ProbMatrix[pairnum][q1][q2]; used++;}
+     if(qb12ProbMatrix!=null){running*=qb12ProbMatrix[pairnum][q1][n1][n2]; used++;}
+     if(qb012ProbMatrix!=null){running*=qb012ProbMatrix[pairnum][q1][n0][n1][n2]; used++;}
+     if(qb123ProbMatrix!=null){running*=qb123ProbMatrix[pairnum][q1][n1][n2][n3]; used++;}
+     if(qb234ProbMatrix!=null){running*=qb234ProbMatrix[pairnum][q1][n2][n3][n4]; used++;}
+     if(q12b12ProbMatrix!=null){running*=q12b12ProbMatrix[pairnum][q1][q2][n1][n2]; used++;}
+     if(qpProbMatrix!=null){running*=qpProbMatrix[pairnum][q1][pos]; used++;}
+     if(qProbMatrix!=null){running*=qProbMatrix[pairnum][q1]; used++;}
+
+     if(used<1){
+         assert(false);
+         return expected;
+     }
+     return (float)Math.pow(running, 1.0/used);
+ }
+
+ /**
+  * Estimates the error probability of the base at pos by pooling the raw counts of every
+  * non-null count matrix and blending them with a pseudo-count prior derived from the
+  * nominal error rate of the base's quality score.
+  * @param quals Quality scores of the read
+  * @param bases Bases of the read
+  * @param pos Index of the base being evaluated
+  * @param pairnum Second index into each count matrix
+  * @param obs_cutoff Number of pseudo-observations contributed by the prior
+  * @return (bad+pseudoBad)/(total+pseudoTotal)
+  */
+ public final float estimateErrorProb2(byte[] quals, byte[] bases, int pos, int pairnum, float obs_cutoff){
+     final int lastIdx=quals.length-1;
+     final byte endBase='E'; //Placeholder for positions beyond either end of the read
+
+     final int qPrev=(pos<=0 ? QEND : Tools.mid(QMAX, quals[pos-1], 0));
+     final int qCur=quals[pos];
+     final int qNext=(pos>=lastIdx ? QEND : Tools.mid(QMAX, quals[pos+1], 0));
+
+     final byte baseM2=(pos>1 ? bases[pos-2] : endBase);
+     final byte baseM1=(pos>0 ? bases[pos-1] : endBase);
+     final byte baseC=bases[pos];
+     final byte baseP1=(pos<lastIdx ? bases[pos+1] : endBase);
+     final byte baseP2=(pos<lastIdx-1 ? bases[pos+2] : endBase);
+     final byte n0=baseToNum[baseM2];
+     final byte n1=baseToNum[baseM1];
+     final byte n2=baseToNum[baseC];
+     final byte n3=baseToNum[baseP1];
+     final byte n4=baseToNum[baseP2];
+
+     //Slice [0] of each count matrix feeds the denominator, slice [1] the numerator.
+     long total=0, errors=0;
+     if(q102CountMatrix!=null){
+         total+=q102CountMatrix[0][pairnum][qCur][qPrev][qNext];
+         errors+=q102CountMatrix[1][pairnum][qCur][qPrev][qNext];
+     }
+     if(qbpCountMatrix!=null){
+         total+=qbpCountMatrix[0][pairnum][qCur][n2][pos];
+         errors+=qbpCountMatrix[1][pairnum][qCur][n2][pos];
+     }
+     if(q10CountMatrix!=null){
+         total+=q10CountMatrix[0][pairnum][qCur][qPrev];
+         errors+=q10CountMatrix[1][pairnum][qCur][qPrev];
+     }
+     if(q12CountMatrix!=null){
+         total+=q12CountMatrix[0][pairnum][qCur][qNext];
+         errors+=q12CountMatrix[1][pairnum][qCur][qNext];
+     }
+     if(qb12CountMatrix!=null){
+         total+=qb12CountMatrix[0][pairnum][qCur][n1][n2];
+         errors+=qb12CountMatrix[1][pairnum][qCur][n1][n2];
+     }
+     if(qb012CountMatrix!=null){
+         total+=qb012CountMatrix[0][pairnum][qCur][n0][n1][n2];
+         errors+=qb012CountMatrix[1][pairnum][qCur][n0][n1][n2];
+     }
+     if(qb123CountMatrix!=null){
+         total+=qb123CountMatrix[0][pairnum][qCur][n1][n2][n3];
+         errors+=qb123CountMatrix[1][pairnum][qCur][n1][n2][n3];
+     }
+     if(qb234CountMatrix!=null){
+         total+=qb234CountMatrix[0][pairnum][qCur][n2][n3][n4];
+         errors+=qb234CountMatrix[1][pairnum][qCur][n2][n3][n4];
+     }
+     if(q12b12CountMatrix!=null){
+         total+=q12b12CountMatrix[0][pairnum][qCur][qNext][n1][n2];
+         errors+=q12b12CountMatrix[1][pairnum][qCur][qNext][n1][n2];
+     }
+     if(qpCountMatrix!=null){
+         total+=qpCountMatrix[0][pairnum][qCur][pos];
+         errors+=qpCountMatrix[1][pairnum][qCur][pos];
+     }
+     if(qCountMatrix!=null){
+         total+=qCountMatrix[0][pairnum][qCur];
+         errors+=qCountMatrix[1][pairnum][qCur];
+     }
+
+     //Prior: obs_cutoff pseudo-observations at the nominal rate, but never fewer than
+     //BAD_CUTOFF pseudo-errors; in that case the pseudo-total is scaled up to keep the same rate.
+     final float nominalRate=PROB_ERROR[qCur];
+     float pseudoTotal=obs_cutoff;
+     float pseudoErrors=nominalRate*obs_cutoff;
+     if(pseudoErrors<BAD_CUTOFF){
+         pseudoErrors=BAD_CUTOFF;
+         pseudoTotal=BAD_CUTOFF*INV_PROB_ERROR[qCur];
+     }
+     return (float)((errors+pseudoErrors)/(total+pseudoTotal));
+ }
+
+ //Count matrices; each may be null, in which case the estimate methods skip it.
+ //As read by estimateErrorProb2: the first index selects the slice ([0] is added to the
+ //denominator "sum", [1] to the numerator "bad"), the second index is pairnum, and the
+ //remaining indices are the quality/base/position keys named in the field
+ //(e.g. q102 -> [q1][q0][q2], qbp -> [q1][n2][pos]).
+ public long[][][][][] q102CountMatrix;
+ public long[][][][][] qbpCountMatrix;
+
+ public long[][][][] q10CountMatrix;
+ public long[][][][] q12CountMatrix;
+ public long[][][][][] qb12CountMatrix;
+ public long[][][][][][] qb012CountMatrix;
+ public long[][][][][][] qb123CountMatrix;
+ public long[][][][][][] qb234CountMatrix;
+ public long[][][][][][] q12b12CountMatrix;
+ public long[][][][] qpCountMatrix;
+ public long[][][] qCountMatrix;
+
+ //Probability matrices; each may be null. Indexed [pairnum] followed by the same keys as
+ //the matching count matrix (without the leading slice index).
+ public float[][][][] q102ProbMatrix;
+ public float[][][][] qbpProbMatrix;
+
+ public float[][][] q10ProbMatrix;
+ public float[][][] q12ProbMatrix;
+ public float[][][][] qb12ProbMatrix;
+ public float[][][][][] qb012ProbMatrix;
+ public float[][][][][] qb123ProbMatrix;
+ public float[][][][][] qb234ProbMatrix;
+ public float[][][][][] q12b12ProbMatrix;
+ public float[][][] qpProbMatrix;
+ public float[][] qProbMatrix;
+
+ //Assigned outside this excerpt; presumably the recalibration pass this set belongs to - TODO confirm.
+ final int pass;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ //NOTE(review): the code that assigns and uses these fields lies outside this excerpt;
+ //the comments below describe only the declarations and are otherwise assumptions to confirm.
+ private ReadStats readstats;
+
+ //Defaults to true; presumably controls whether matrices are written after processing.
+ private boolean writeMatrices=true;
+
+ ArrayList<GBMatrixSet> gbmatrices=new ArrayList<GBMatrixSet>();
+
+ //Status output goes to stderr by default.
+ private PrintStream outstream=System.err;
+ //-1 by default; presumably means "no limit" - TODO confirm.
+ private long maxReads=-1;
+ private String[] in;
+
+ private String qhist=null;
+
+ //Running totals, all starting at zero.
+ private long readsProcessed=0;
+ private long basesProcessed=0;
+ private long readsUsed=0;
+ private long basesUsed=0;
+ private boolean errorState=false;
+
+ private final int threads;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean showStats=true;
+ private static boolean verbose=false;
+ private static boolean overwrite=true;
+ //Appending is fixed off.
+ private static final boolean append=false;
+ public static int passes=2;
+
+ //Matrix file name templates.
+ //NOTE(review): '?' is presumably replaced by a root quality directory and '#' by a pass
+ //number by code outside this excerpt - confirm before relying on it.
+ private static String q102matrix="?q102matrix_p#.txt.gz";
+ private static String qbpmatrix="?qbpmatrix_p#.txt.gz";
+ private static String q10matrix="?q10matrix_p#.txt.gz";
+ private static String q12matrix="?q12matrix_p#.txt.gz";
+ private static String qb12matrix="?qb12matrix_p#.txt.gz";
+ private static String qb012matrix="?qb012matrix_p#.txt.gz";
+ private static String qb123matrix="?qb123matrix_p#.txt.gz";
+ private static String qb234matrix="?qb234matrix_p#.txt.gz";
+ private static String q12b12matrix="?q12b12matrix_p#.txt.gz";
+ private static String qpmatrix="?qpmatrix_p#.txt.gz";
+ private static String qmatrix="?qmatrix_p#.txt.gz";
+ private static String pmatrix="?pmatrix_p#.txt.gz";
+
+ //Two slots, both initially false; presumably one per pass - TODO confirm.
+ private static final boolean[] initialized={false, false};
+
+ /**
+  * Sets the maximum quality score and the two values derived from it.
+  * QEND becomes x+1 and QMAX2 becomes x+2.
+  * @param x New maximum quality; asserted to lie strictly between 2 and 94
+  */
+ public static final synchronized void setQmax(int x){
+     assert(2<x && x<94);
+     QMAX=x;
+     QEND=x+1;
+     QMAX2=x+2;
+ }
+ //Quality limits; setQmax keeps QEND==QMAX+1 and QMAX2==QEND+1.
+ //The estimate methods use QEND as the neighbor quality when a base has no neighbor on that side.
+ private static int QMAX=42;
+ private static int QEND=QMAX+1;
+ private static int QMAX2=QEND+1;
+ //Equals the number of symbols in numToBase.
+ private static final int BMAX=6;
+ //NOTE(review): presumably the size of the position dimension of the position-keyed matrices - confirm.
+ private static final int LENMAX=401;
+ //Maps a base byte to a small code used as a matrix index; built by fillBaseToNum() (outside this excerpt).
+ private static final byte[] baseToNum=fillBaseToNum();
+ //'E' is the placeholder the estimate methods use for positions beyond the ends of a read.
+ private static final byte[] numToBase={'A', 'C', 'G', 'T', 'E', 'N'};
+ //Nominal error probability per quality score, and its element-wise inverse.
+ private static final float[] PROB_ERROR=QualityTools.PROB_ERROR;
+ private static final float[] INV_PROB_ERROR=Tools.inverse(PROB_ERROR);
+
+
+ private static final CountMatrixSet[] cmatrices=new CountMatrixSet[2];
+
+ //Per-matrix enable switches, two entries each.
+ //NOTE(review): presumably one entry per pass - confirm against the code that reads them.
+ public static boolean[] use_q102={false, false};
+ public static boolean[] use_qbp={true, true};
+ public static boolean[] use_q10={false, false};
+ public static boolean[] use_q12={false, false};
+ public static boolean[] use_qb12={false, false};
+ public static boolean[] use_qb012={false, false};
+ public static boolean[] use_qb123={true, false};
+ public static boolean[] use_qb234={false, false};
+ public static boolean[] use_q12b12={false, false};
+ public static boolean[] use_qp={false, false};
+ public static boolean[] use_q={false, false};
+
+ public static boolean USE_WEIGHTED_AVERAGE=true;
+ public static boolean USE_AVERAGE=true;
+ public static boolean USE_PAIRNUM=true;
+ public static boolean COUNT_INDELS=true;
+
+ public static long OBSERVATION_CUTOFF[]={100, 200}; //Soft threshold
+ //Minimum number of pseudo-errors used by estimateErrorProb2 when blending in the nominal rate.
+ public static float BAD_CUTOFF=0.5f; //Soft threshold
+
+
+
+}
diff --git a/current/jgi/CalcTrueQuality_single.java b/current/jgi/CalcTrueQuality_single.java
new file mode 100755
index 0000000..f33f4c5
--- /dev/null
+++ b/current/jgi/CalcTrueQuality_single.java
@@ -0,0 +1,1439 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.QualityTools;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 13, 2014
+ *
+ */
+public class CalcTrueQuality_single {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Program entry point: enables quality statistics, parses the arguments, processes all
+  * input files, and then writes the matrices unless that was disabled.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     ReadStats.COLLECT_QUALITY_STATS=true;
+     final CalcTrueQuality_single calc=new CalcTrueQuality_single(args);
+     ReadStats.overwrite=calc.overwrite;
+     calc.process();
+
+     if(!calc.writeMatrices){return;}
+     calc.writeMatrices();
+ }
+
+ /** Placeholder for usage output; it only trips an assertion when assertions are enabled. */
+ public static void printOptions(){
+     assert false : "No help available.";
+ }
+
+ /**
+  * Parses the command line, applies global I/O settings, and validates the output paths.
+  * With no arguments it calls printOptions() and exits.
+  * @param args Command-line arguments, mostly of the form key=value
+  * @throws RuntimeException if no input was given or an output file fails Tools.testOutputFiles
+  */
+ public CalcTrueQuality_single(String[] args){
+ if(args==null || args.length==0){
+ printOptions();
+ System.exit(0);
+ }
+
+ //NOTE(review): outstream is assigned System.err here; whether that differs from its default is not visible in this excerpt.
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+
+ //Global (static) settings; these affect every user of Shared and ReadWrite in the JVM.
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=false;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.ZIPLEVEL=2;
+// SamLine.CONVERT_CIGAR_TO_MATCH=true;
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //a is the lower-cased key with leading hyphens removed; b is the value, or null if there is no '='.
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ //Propagate the verbose flag to the I/O classes as well.
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("reads") || a.equals("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("t") || a.equals("threads")){
+ Shared.setThreads(b);
+ }else if(a.equals("build") || a.equals("genome")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1") || a.equals("sam")){
+ //Comma-delimited list of input files.
+ in=b.split(",");
+ }else if(a.equals("q102") || a.equals("q102out")){
+ q102out=b;
+ }else if(a.equals("qbp") || a.equals("qbpout")){
+ qbpout=b;
+ }else if(a.equals("hist") || a.equals("qhist")){
+ qhist=b;
+ }else if(a.equals("path")){
+ Data.setPath(b);
+ }else if(a.equals("append") || a.equals("app")){
+// append=ReadStats.append=Tools.parseBoolean(b);
+ assert(false) : "This does not work in append mode.";
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("writematrices") || a.equals("write") || a.equals("wm")){
+ writeMatrices=Tools.parseBoolean(b);
+ }else if(in==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument naming stdin or an existing file is taken as the input.
+ in=arg.split(",");
+ }else{
+ //Unknown parameters are reported; they are only fatal when assertions are enabled.
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+// if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;}
+
+ if(in==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ //With more than two threads and no forced ByteFile mode, force ByteFile2.
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;}
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //All output paths are tested together, but the error message names only q102out.
+ if(!Tools.testOutputFiles(overwrite, append, false, q102out, qbpout, q10out, q12out, qb012out, qb123out, qb234out, qpout, qout, pout)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+q102out+"\n");
+ }
+ threads=Shared.threads();
+ //A ReadStats instance is only created when a quality histogram was requested.
+ if(qhist!=null){readstats=new ReadStats();}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Processes every input file (multithreaded when threads>1), then prints timing and
+  * read/base totals to outstream.
+  * @throws RuntimeException if an error state was recorded during processing
+  */
+ public void process(){
+     final Timer timer=new Timer();
+     for(String fname : in){
+         if(threads<=1){
+             process_ST(fname);
+         }else{
+             process_MT(fname);
+         }
+     }
+     timer.stop();
+
+     final double readsPerNano=readsProcessed/(double)(timer.elapsed);
+     final double basesPerNano=basesProcessed/(double)(timer.elapsed);
+
+     //Counts are abbreviated with a k or m suffix once they grow large, then left-padded to 8 characters.
+     String readText;
+     if(readsProcessed<100000){readText=""+readsProcessed;}
+     else if(readsProcessed<100000000){readText=(readsProcessed/1000)+"k";}
+     else{readText=(readsProcessed/1000000)+"m";}
+     String baseText;
+     if(basesProcessed<100000){baseText=""+basesProcessed;}
+     else if(basesProcessed<100000000){baseText=(basesProcessed/1000)+"k";}
+     else{baseText=(basesProcessed/1000000)+"m";}
+
+     while(readText.length()<8){readText=" "+readText;}
+     while(baseText.length()<8){baseText=" "+baseText;}
+
+     outstream.println("Time: \t"+timer);
+     outstream.println("Reads Processed: "+readText+" \t"+String.format("%.2fk reads/sec", readsPerNano*1000000));
+     outstream.println("Bases Processed: "+baseText+" \t"+String.format("%.2fm bases/sec", basesPerNano*1000));
+
+     if(readsUsed<100000){readText=""+readsUsed;}
+     else if(readsUsed<100000000){readText=(readsUsed/1000)+"k";}
+     else{readText=(readsUsed/1000000)+"m";}
+     if(basesUsed<100000){baseText=""+basesUsed;}
+     else if(basesUsed<100000000){baseText=(basesUsed/1000)+"k";}
+     else{baseText=(basesUsed/1000000)+"m";}
+
+     while(readText.length()<8){readText=" "+readText;}
+     while(baseText.length()<8){baseText=" "+baseText;}
+
+     outstream.println("Reads Used: "+readText);
+     outstream.println("Bases Used: "+baseText);
+
+     if(errorState){
+         throw new RuntimeException(this.getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+  * Single-threaded processing of one input file: streams the reads and feeds each read
+  * and its mate to process(Read). Stream errors are folded into errorState.
+  * @param fname Input file name
+  */
+ public void process_ST(String fname){
+     final FileFormat ff=FileFormat.testInput(fname, FileFormat.SAM, null, true, false);
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff, null);
+     if(verbose){System.err.println("Starting cris");}
+     cris.start(); //4567
+
+     ListNum<Read> batch=cris.nextList();
+     while(true){
+         final ArrayList<Read> reads=(batch==null ? null : batch.list);
+         if(reads==null || reads.isEmpty()){break;}
+
+         for(Read r1 : reads){
+             final Read r2=r1.mate;
+             process(r1);
+             process(r2);
+         }
+         //Hand the finished batch back before fetching the next one.
+         cris.returnList(batch.id, batch.list.isEmpty());
+         batch=cris.nextList();
+     }
+     //The terminating batch, if any, still has to be returned.
+     if(batch!=null){
+         cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+     }
+
+     errorState|=ReadWrite.closeStreams(cris);
+ }
+
+ /**
+  * Multithreaded processing of one input file: starts one Worker per thread on a shared
+  * input stream, waits for each to terminate, and adds its per-thread matrices and
+  * counters to this object's totals. Stream errors are folded into errorState.
+  * @param fname Input file name
+  */
+ public void process_MT(String fname){
+     final FileFormat ff=FileFormat.testInput(fname, FileFormat.SAM, null, true, false);
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff, null);
+     if(verbose){System.err.println("Starting cris");}
+     cris.start(); //4567
+
+     //Construct every worker first, then start them all.
+     final ArrayList<Worker> workers=new ArrayList<Worker>(threads);
+     for(int n=0; n<threads; n++){
+         workers.add(new Worker(cris));
+     }
+     for(Worker w : workers){
+         w.start();
+     }
+
+     for(int n=0; n<workers.size(); n++){
+         final Worker w=workers.get(n);
+         //Keep joining until the thread has really terminated, even if interrupted.
+         while(w.getState()!=Thread.State.TERMINATED){
+             try{
+                 w.join();
+             }catch(InterruptedException ex){
+                 ex.printStackTrace();
+             }
+         }
+         workers.set(n, null); //Release the list's reference to the finished worker
+
+         add(q102GoodMatrix, w.q102GoodMatrixT);
+         add(q102BadMatrix, w.q102BadMatrixT);
+         add(qbpGoodMatrix, w.qbpGoodMatrixT);
+         add(qbpBadMatrix, w.qbpBadMatrixT);
+         add(q10GoodMatrix, w.q10GoodMatrixT);
+         add(q10BadMatrix, w.q10BadMatrixT);
+         add(q12GoodMatrix, w.q12GoodMatrixT);
+         add(q12BadMatrix, w.q12BadMatrixT);
+         add(qb012GoodMatrix, w.qb012GoodMatrixT);
+         add(qb012BadMatrix, w.qb012BadMatrixT);
+         add(qb123GoodMatrix, w.qb123GoodMatrixT);
+         add(qb123BadMatrix, w.qb123BadMatrixT);
+         add(qb234GoodMatrix, w.qb234GoodMatrixT);
+         add(qb234BadMatrix, w.qb234BadMatrixT);
+         add(qpGoodMatrix, w.qpGoodMatrixT);
+         add(qpBadMatrix, w.qpBadMatrixT);
+         add(qGoodMatrix, w.qGoodMatrixT);
+         add(qBadMatrix, w.qBadMatrixT);
+         add(pGoodMatrix, w.pGoodMatrixT);
+         add(pBadMatrix, w.pBadMatrixT);
+
+         readsProcessed+=w.readsProcessedT;
+         basesProcessed+=w.basesProcessedT;
+         readsUsed+=w.readsUsedT;
+         basesUsed+=w.basesUsedT;
+     }
+
+     //Close the input stream and remember whether anything went wrong.
+     errorState|=ReadWrite.closeStreams(cris);
+ }
+
+ private void add(long[] dest, long[] source){
+ assert(dest.length==source.length);
+ for(int i=0; i<dest.length; i++){dest[i]+=source[i];}
+ }
+
+ private void add(long[][] dest, long[][] source){
+ assert(dest.length==source.length);
+ for(int i=0; i<dest.length; i++){add(dest[i], source[i]);}
+ }
+
+ private void add(long[][][] dest, long[][][] source){
+ assert(dest.length==source.length);
+ for(int i=0; i<dest.length; i++){add(dest[i], source[i]);}
+ }
+
+ private void add(long[][][][] dest, long[][][][] source){
+ assert(dest.length==source.length);
+ for(int i=0; i<dest.length; i++){add(dest[i], source[i]);}
+ }
+
+ /**
+  * Writes every matrix that has an output path, plus the quality histogram if requested.
+  * The global ReadWrite.ZIPLEVEL is raised to 8 for the duration of the writes and is
+  * restored afterwards, even if a write throws.
+  */
+ public void writeMatrices(){
+     final int oldZL=ReadWrite.ZIPLEVEL;
+     ReadWrite.ZIPLEVEL=8;
+     try{
+         if(q102out!=null){writeMatrix(q102out, q102GoodMatrix, q102BadMatrix, overwrite, append);}
+         if(qbpout!=null){writeMatrix(qbpout, qbpGoodMatrix, qbpBadMatrix, overwrite, append);}
+         if(q10out!=null){writeMatrix(q10out, q10GoodMatrix, q10BadMatrix, overwrite, append);}
+         if(q12out!=null){writeMatrix(q12out, q12GoodMatrix, q12BadMatrix, overwrite, append);}
+         if(qb012out!=null){writeMatrix(qb012out, qb012GoodMatrix, qb012BadMatrix, overwrite, append);}
+         if(qb123out!=null){writeMatrix(qb123out, qb123GoodMatrix, qb123BadMatrix, overwrite, append);}
+         if(qb234out!=null){writeMatrix(qb234out, qb234GoodMatrix, qb234BadMatrix, overwrite, append);}
+         if(qpout!=null){writeMatrix(qpout, qpGoodMatrix, qpBadMatrix, overwrite, append);}
+         if(qout!=null){writeMatrix(qout, qGoodMatrix, qBadMatrix, overwrite, append);}
+         if(pout!=null){writeMatrix(pout, pGoodMatrix, pBadMatrix, overwrite, append);}
+         if(qhist!=null){
+             //The histogram is built from the merged statistics of all ReadStats instances.
+             readstats=ReadStats.mergeAll();
+             readstats.writeQualityToFile(qhist, false);
+         }
+     }finally{
+         //ZIPLEVEL is a JVM-wide setting; never leave it changed after a failure.
+         ReadWrite.ZIPLEVEL=oldZL;
+     }
+ }
+
+ /**
+  * Writes a pair of 4D count matrices as tab-delimited text. One line is written per cell
+  * with at least one observation: a, b, c, d, good+bad, bad.
+  * @param fname Output path; a leading '?' is replaced by Data.ROOT_QUALITY
+  * @param goodMatrix Counts of correct observations; its dimensions define the iteration range
+  * @param badMatrix Counts of erroneous observations, indexed like goodMatrix
+  * @param overwrite Passed to FileFormat.testOutput
+  * @param append Passed to FileFormat.testOutput
+  */
+ public static void writeMatrix(String fname, long[][][][] goodMatrix, long[][][][] badMatrix, boolean overwrite, boolean append){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){
+         //Quote the root path so that any '\' or '$' in it is inserted literally;
+         //replaceFirst otherwise treats them as escape and group-reference characters.
+         fname=fname.replaceFirst("\\?", java.util.regex.Matcher.quoteReplacement(Data.ROOT_QUALITY));
+     }
+     final FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     final TextStreamWriter tsw=new TextStreamWriter(ff);
+     System.err.println("Starting tsw for "+fname);
+     tsw.start();
+     System.err.println("Started tsw for "+fname);
+     final StringBuilder sb=new StringBuilder();
+
+     final int d0=goodMatrix.length, d1=goodMatrix[0].length, d2=goodMatrix[0][0].length, d3=goodMatrix[0][0][0].length;
+     for(int a=0; a<d0; a++){
+         for(int b=0; b<d1; b++){
+             for(int c=0; c<d2; c++){
+                 for(int d=0; d<d3; d++){
+                     final long bad=badMatrix[a][b][c][d];
+                     final long sum=goodMatrix[a][b][c][d]+bad;
+                     if(sum>0){
+                         sb.append(a).append('\t').append(b).append('\t').append(c).append('\t').append(d).append('\t');
+                         sb.append(sum).append('\t').append(bad).append('\n');
+                     }
+                 }
+                 //Flush once per innermost row to keep the buffer small.
+                 if(sb.length()>0){
+                     tsw.print(sb.toString());
+                     sb.setLength(0);
+                 }
+             }
+         }
+     }
+     System.err.println("Writing "+fname);
+     tsw.poisonAndWait();
+     System.err.println("Done.");
+ }
+
+ /**
+  * Writes a pair of 3D count matrices as tab-delimited text. One line is written per cell
+  * with at least one observation: a, b, c, good+bad, bad.
+  * @param fname Output path; a leading '?' is replaced by Data.ROOT_QUALITY
+  * @param goodMatrix Counts of correct observations; its dimensions define the iteration range
+  * @param badMatrix Counts of erroneous observations, indexed like goodMatrix
+  * @param overwrite Passed to FileFormat.testOutput
+  * @param append Passed to FileFormat.testOutput
+  */
+ public static void writeMatrix(String fname, long[][][] goodMatrix, long[][][] badMatrix, boolean overwrite, boolean append){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){
+         //Quote the root path so that any '\' or '$' in it is inserted literally;
+         //replaceFirst otherwise treats them as escape and group-reference characters.
+         fname=fname.replaceFirst("\\?", java.util.regex.Matcher.quoteReplacement(Data.ROOT_QUALITY));
+     }
+     final FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     final TextStreamWriter tsw=new TextStreamWriter(ff);
+     System.err.println("Starting tsw for "+fname);
+     tsw.start();
+     System.err.println("Started tsw for "+fname);
+     final StringBuilder sb=new StringBuilder();
+
+     final int d0=goodMatrix.length, d1=goodMatrix[0].length, d2=goodMatrix[0][0].length;
+     for(int a=0; a<d0; a++){
+         for(int b=0; b<d1; b++){
+             for(int c=0; c<d2; c++){
+                 final long bad=badMatrix[a][b][c];
+                 final long sum=goodMatrix[a][b][c]+bad;
+                 if(sum>0){
+                     sb.append(a).append('\t').append(b).append('\t').append(c).append('\t');
+                     sb.append(sum).append('\t').append(bad).append('\n');
+                 }
+             }
+             //Flush once per innermost row to keep the buffer small.
+             if(sb.length()>0){
+                 tsw.print(sb.toString());
+                 sb.setLength(0);
+             }
+         }
+     }
+     System.err.println("Writing "+fname);
+     tsw.poisonAndWait();
+     System.err.println("Done.");
+ }
+
+ /**
+  * Writes a pair of 2D count matrices as tab-delimited text. One line is written per cell
+  * with at least one observation: a, b, good+bad, bad.
+  * @param fname Output path; a leading '?' is replaced by Data.ROOT_QUALITY
+  * @param goodMatrix Counts of correct observations; its dimensions define the iteration range
+  * @param badMatrix Counts of erroneous observations, indexed like goodMatrix
+  * @param overwrite Passed to FileFormat.testOutput
+  * @param append Passed to FileFormat.testOutput
+  */
+ public static void writeMatrix(String fname, long[][] goodMatrix, long[][] badMatrix, boolean overwrite, boolean append){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){
+         //Quote the root path so that any '\' or '$' in it is inserted literally;
+         //replaceFirst otherwise treats them as escape and group-reference characters.
+         fname=fname.replaceFirst("\\?", java.util.regex.Matcher.quoteReplacement(Data.ROOT_QUALITY));
+     }
+     final FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     final TextStreamWriter tsw=new TextStreamWriter(ff);
+     System.err.println("Starting tsw for "+fname);
+     tsw.start();
+     System.err.println("Started tsw for "+fname);
+     final StringBuilder sb=new StringBuilder();
+
+     final int d0=goodMatrix.length, d1=goodMatrix[0].length;
+     for(int a=0; a<d0; a++){
+         for(int b=0; b<d1; b++){
+             final long bad=badMatrix[a][b];
+             final long sum=goodMatrix[a][b]+bad;
+             if(sum>0){
+                 sb.append(a).append('\t').append(b).append('\t');
+                 sb.append(sum).append('\t').append(bad).append('\n');
+             }
+         }
+         //Flush once per row to keep the buffer small.
+         if(sb.length()>0){
+             tsw.print(sb.toString());
+             sb.setLength(0);
+         }
+     }
+     System.err.println("Writing "+fname);
+     tsw.poisonAndWait();
+     System.err.println("Done.");
+ }
+
+ /**
+  * Writes a pair of 1D count arrays as tab-delimited text. One line is written per index
+  * with at least one observation: a, good+bad, bad.
+  * @param fname Output path; a leading '?' is replaced by Data.ROOT_QUALITY
+  * @param goodMatrix Counts of correct observations; its length defines the iteration range
+  * @param badMatrix Counts of erroneous observations, indexed like goodMatrix
+  * @param overwrite Passed to FileFormat.testOutput
+  * @param append Passed to FileFormat.testOutput
+  */
+ public static void writeMatrix(String fname, long[] goodMatrix, long[] badMatrix, boolean overwrite, boolean append){
+     assert(fname!=null) : "No file specified";
+     if(fname.startsWith("?")){
+         //Quote the root path so that any '\' or '$' in it is inserted literally;
+         //replaceFirst otherwise treats them as escape and group-reference characters.
+         fname=fname.replaceFirst("\\?", java.util.regex.Matcher.quoteReplacement(Data.ROOT_QUALITY));
+     }
+     final FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, append, false);
+     final TextStreamWriter tsw=new TextStreamWriter(ff);
+     System.err.println("Starting tsw for "+fname);
+     tsw.start();
+     System.err.println("Started tsw for "+fname);
+     final StringBuilder sb=new StringBuilder();
+
+     final int d0=goodMatrix.length;
+     for(int a=0; a<d0; a++){
+         final long bad=badMatrix[a];
+         final long sum=goodMatrix[a]+bad;
+         if(sum>0){
+             sb.append(a).append('\t').append(sum).append('\t').append(bad).append('\n');
+         }
+         //Each line is handed to the writer as soon as it is built.
+         if(sb.length()>0){
+             tsw.print(sb.toString());
+             sb.setLength(0);
+         }
+     }
+     System.err.println("Writing "+fname);
+     tsw.poisonAndWait();
+     System.err.println("Done.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Tallies one aligned read into the good/bad count matrices.
+  * Reads without quality, bases or a match string are counted as processed but not used,
+  * as are reads rejected by containsNonNMS() or containsConsecutiveS(4).
+  * For each remaining base, match symbol 'm' increments the "good" matrices and 'S' the
+  * "bad" matrices; positions with match symbol 'N' or a not fully defined base are skipped.
+  * @param r Read to tally; null is ignored
+  * @throws RuntimeException if a counted position has a match symbol other than 'm' or 'S'
+  */
+ private void process(Read r){
+     if(r==null){return;}
+     readsProcessed++;
+     basesProcessed+=r.length();
+
+     if(verbose){outstream.println(r+"\n");}
+
+     if(verbose){outstream.println("A");}
+     //Expand a short match string so that it can be indexed per base.
+     if(r.match!=null && r.shortmatch()){
+         r.match=Read.toLongMatchString(r.match);
+         r.setShortMatch(false);
+     }
+     final byte[] quals=r.quality, bases=r.bases, match=r.match;
+     if(quals==null || bases==null || match==null){return;}
+     if(verbose){outstream.println("B");}
+     if(r.containsNonNMS() || r.containsConsecutiveS(4)){
+         if(verbose){System.err.println("*************************************************** "+new String(match));}
+         return;
+     }
+     //Minus-strand reads get their match string reversed in place before indexing.
+     if(r.strand()==Gene.MINUS){
+         Tools.reverseInPlace(match);
+     }
+     if(verbose){outstream.println("C");}
+
+     final byte e='E'; //Placeholder base for positions beyond either end of the read
+
+     if(readstats!=null){
+         readstats.addToQualityHistogram(r);
+     }
+
+     readsUsed++;
+     for(int i=0, last=quals.length-1; i<quals.length; i++){
+         if(verbose){outstream.print("D");}
+         //q0/q2 are the neighboring qualities; QEND stands in for a missing neighbor.
+         final byte q0=(i>0 ? (byte)Tools.mid(QMAX, quals[i-1], 0) : QEND);
+         final byte q1=quals[i];
+         final byte q2=(i<last ? (byte)Tools.mid(QMAX, quals[i+1], 0) : QEND);
+
+         //Five-base window centered on i, translated to matrix indices.
+         byte b0=i>1 ? bases[i-2] : e;
+         byte b1=i>0 ? bases[i-1] : e;
+         byte b2=bases[i];
+         byte b3=i<last ? bases[i+1] : e;
+         byte b4=i<last-1 ? bases[i+2] : e;
+         byte n0=baseToNum[b0];
+         byte n1=baseToNum[b1];
+         byte n2=baseToNum[b2];
+         byte n3=baseToNum[b3];
+         byte n4=baseToNum[b4];
+         byte m=match[i];
+
+         if(m=='N' || !AminoAcid.isFullyDefined(b2)){
+             if(verbose){outstream.print("E");}
+             //do nothing
+         }else{
+
+             if(verbose){outstream.print("F");}
+             basesUsed++;
+             if(m=='m'){
+                 q102GoodMatrix[q1][q0][q2]++;
+                 qbpGoodMatrix[q1][n2][i]++;
+
+                 q10GoodMatrix[q1][q0]++;
+                 //q12 is keyed by the NEXT quality; the estimate methods read it as [q1][q2].
+                 q12GoodMatrix[q1][q2]++;
+                 qb012GoodMatrix[q1][n0][n1][n2]++;
+                 qb123GoodMatrix[q1][n1][n2][n3]++;
+                 qb234GoodMatrix[q1][n2][n3][n4]++;
+                 qpGoodMatrix[q1][i]++;
+                 qGoodMatrix[q1]++;
+                 pGoodMatrix[i]++;
+             }else if(m=='S'){
+                 q102BadMatrix[q1][q0][q2]++;
+                 qbpBadMatrix[q1][n2][i]++;
+
+                 q10BadMatrix[q1][q0]++;
+                 //q12 is keyed by the NEXT quality; the estimate methods read it as [q1][q2].
+                 q12BadMatrix[q1][q2]++;
+                 qb012BadMatrix[q1][n0][n1][n2]++;
+                 qb123BadMatrix[q1][n1][n2][n3]++;
+                 qb234BadMatrix[q1][n2][n3][n4]++;
+                 qpBadMatrix[q1][i]++;
+                 qBadMatrix[q1]++;
+                 pBadMatrix[i]++;
+             }else{
+                 throw new RuntimeException("Bad symbol m='"+((char)m)+"'\n"+new String(match)+"\n"+new String(bases)+"\n");
+             }
+         }
+     }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Loads the matrices selected by the class-level q102, qbp, q10, q12, qb012, qb123, qb234 and qp switches. */
+ public static final void initializeMatrices(){
+     initializeMatrices(
+             q102, qbp, q10, q12,
+             qb012, qb123, qb234, qp);
+ }
+
+ /**
+  * Replaces the read's quality array with a recalibrated one.
+  * @param r Read whose quality field is overwritten
+  */
+ public static final void recalibrate(Read r){
+     r.quality=recalibrate(r.bases, r.quality);
+ }
+
+ /**
+  * Computes recalibrated quality scores for a read.
+  * Bases that are not fully defined get quality 0. Every other base gets the Phred score of
+  * its estimated error probability (estimateErrorProb2 when USE_AVERAGE is set, otherwise
+  * estimateErrorProbMax), with a floor of 2.
+  * @param bases Bases of the read
+  * @param quals Original quality scores; must not be null (asserted)
+  * @return A new array of bases.length recalibrated qualities; all zero if quals is null and assertions are disabled
+  */
+ public static final byte[] recalibrate(byte[] bases, byte[] quals){
+     final byte[] result=new byte[bases.length];
+     if(quals==null){
+         assert(false) : "Can't recalibrate qualities for reads that don't have quality scores.";
+         //TODO
+         return result;
+     }
+     for(int idx=0; idx<bases.length; idx++){
+         if(AminoAcid.isFullyDefined(bases[idx])){
+             final float errProb=(USE_AVERAGE ? CalcTrueQuality_single.estimateErrorProb2(quals, bases, idx) :
+                 CalcTrueQuality_single.estimateErrorProbMax(quals, bases, idx));
+             result[idx]=Tools.max((byte)2, QualityTools.probErrorToPhred(errProb));
+         }else{
+             result[idx]=0;
+         }
+     }
+     return result;
+ }
+
+ /**
+  * Loads the selected count matrices from disk and converts each to a probability matrix.
+  * Runs at most once: later calls return immediately once initialized[0] is set,
+  * regardless of which flags they pass.
+  * Each flag selects the matrix of the same name.
+  */
+ public static final void initializeMatrices(boolean q102, boolean qbp, boolean q10, boolean q12, boolean qb012, boolean qb123, boolean qb234, boolean qp){
+ //Fast path without locking.
+ //NOTE(review): initialized is a plain boolean[]; this unsynchronized read has no
+ //happens-before edge with the write below, so a thread may see the flag without being
+ //guaranteed to see the loaded matrices - confirm all callers initialize before starting threads.
+ if(initialized[0]){return;}
+
+// assert(false) : q102+". "+qbp+". "+q10+". "+q12+". "+qb012+". "+qb234+". "+qp;
+
+ synchronized(initialized){
+ //Re-check under the lock in case another thread finished loading meanwhile.
+ if(initialized[0]){return;}
+
+ //For each selected matrix: load counts, then derive probabilities from slices [0] and [1].
+ if(q102){
+ q102CountMatrix=loadMatrix(q102matrix, QMAX2, QMAX2, QMAX2);
+ q102ProbMatrix=toProbs(q102CountMatrix[0], q102CountMatrix[1], OBSERVATION_CUTOFF);
+ }
+ if(qbp){
+ qbpCountMatrix=loadMatrix(qbpmatrix, QMAX2, 4, LENMAX);
+ qbpProbMatrix=toProbs(qbpCountMatrix[0], qbpCountMatrix[1], OBSERVATION_CUTOFF);
+ }
+ if(q10){
+ q10CountMatrix=loadMatrix(q10matrix, QMAX2, QMAX2);
+ q10ProbMatrix=toProbs(q10CountMatrix[0], q10CountMatrix[1], OBSERVATION_CUTOFF);
+ }
+ if(q12){
+ q12CountMatrix=loadMatrix(q12matrix, QMAX2, QMAX2);
+ q12ProbMatrix=toProbs(q12CountMatrix[0], q12CountMatrix[1], OBSERVATION_CUTOFF);
+ }
+ //In the base-context matrices the dimension of the called base is 4; its neighbors use BMAX.
+ if(qb012){
+ qb012CountMatrix=loadMatrix(qb012matrix, QMAX2, BMAX, BMAX, 4);
+ qb012ProbMatrix=toProbs(qb012CountMatrix[0], qb012CountMatrix[1], OBSERVATION_CUTOFF);
+ }
+ if(qb123){
+ qb123CountMatrix=loadMatrix(qb123matrix, QMAX2, BMAX, 4, BMAX);
+ qb123ProbMatrix=toProbs(qb123CountMatrix[0], qb123CountMatrix[1], OBSERVATION_CUTOFF);
+ }
+ if(qb234){
+ qb234CountMatrix=loadMatrix(qb234matrix, QMAX2, 4, BMAX, BMAX);
+ qb234ProbMatrix=toProbs(qb234CountMatrix[0], qb234CountMatrix[1], OBSERVATION_CUTOFF);
+ }
+ if(qp){
+ qpCountMatrix=loadMatrix(qpmatrix, QMAX2, LENMAX);
+ qpProbMatrix=toProbs(qpCountMatrix[0], qpCountMatrix[1], OBSERVATION_CUTOFF);
+ }
+
+ //Set the flag last, after every matrix has been assigned.
+ initialized[0]=true;
+ }
+
+// assert(false) : (q102ProbMatrix!=null)+", "+(qbpProbMatrix!=null)+", "+(q10ProbMatrix!=null)+", "+(q12ProbMatrix!=null)+", "+(qb012ProbMatrix!=null)+", "+(qb234ProbMatrix!=null)+", "+(qpProbMatrix!=null);
+ }
+
+ /**
+  * Estimates the error probability of the base at pos as the arithmetic mean of the
+  * values found in every loaded (non-null) probability matrix.
+  * @param quals Quality scores of the read
+  * @param bases Bases of the read
+  * @param pos Index of the base being evaluated
+  * @return Mean of the looked-up probabilities; PROB_ERROR[quals[pos]] if no matrix is loaded
+  */
+ public static final float estimateErrorProbAvg(byte[] quals, byte[] bases, int pos){
+     final int lastIdx=quals.length-1;
+     final byte endBase='E'; //Placeholder for positions beyond either end of the read
+
+     //QEND stands in for a missing neighbor quality.
+     final byte qPrev=(pos<=0 ? QEND : (byte)Tools.mid(QMAX, quals[pos-1], 0));
+     final byte qCur=quals[pos];
+     final byte qNext=(pos>=lastIdx ? QEND : (byte)Tools.mid(QMAX, quals[pos+1], 0));
+
+     //Five-base window centered on pos.
+     final byte baseM2=(pos>1 ? bases[pos-2] : endBase);
+     final byte baseM1=(pos>0 ? bases[pos-1] : endBase);
+     final byte baseC=bases[pos];
+     final byte baseP1=(pos<lastIdx ? bases[pos+1] : endBase);
+     final byte baseP2=(pos<lastIdx-1 ? bases[pos+2] : endBase);
+     final byte n0=baseToNum[baseM2];
+     final byte n1=baseToNum[baseM1];
+     final byte n2=baseToNum[baseC];
+     final byte n3=baseToNum[baseP1];
+     final byte n4=baseToNum[baseP2];
+
+     final float fallback=PROB_ERROR[qCur];
+     float total=0;
+     int used=0;
+
+     if(q102ProbMatrix!=null){total+=q102ProbMatrix[qCur][qPrev][qNext]; used++;}
+     if(qbpProbMatrix!=null){total+=qbpProbMatrix[qCur][n2][pos]; used++;}
+     if(q10ProbMatrix!=null){total+=q10ProbMatrix[qCur][qPrev]; used++;}
+     if(q12ProbMatrix!=null){total+=q12ProbMatrix[qCur][qNext]; used++;}
+     if(qb012ProbMatrix!=null){total+=qb012ProbMatrix[qCur][n0][n1][n2]; used++;}
+     if(qb123ProbMatrix!=null){total+=qb123ProbMatrix[qCur][n1][n2][n3]; used++;}
+     if(qb234ProbMatrix!=null){total+=qb234ProbMatrix[qCur][n2][n3][n4]; used++;}
+     if(qpProbMatrix!=null){total+=qpProbMatrix[qCur][pos]; used++;}
+
+     if(used>=1){
+         return (total/(float)used);
+     }
+     //No matrix was available; flag it in debug runs and fall back to the nominal rate.
+     assert(false);
+     return fallback;
+ }
+
+ /**
+  * Estimates the error probability of the base at pos as the largest value found in any
+  * loaded (non-null) probability matrix.
+  * @param quals Quality scores of the read
+  * @param bases Bases of the read
+  * @param pos Index of the base being evaluated
+  * @return Largest looked-up probability; PROB_ERROR[quals[pos]] if the result would be negative
+  */
+ public static final float estimateErrorProbMax(byte[] quals, byte[] bases, int pos){
+     final int lastIdx=quals.length-1;
+     final byte endBase='E'; //Placeholder for positions beyond either end of the read
+
+     //QEND stands in for a missing neighbor quality.
+     final byte qPrev=(pos<=0 ? QEND : (byte)Tools.mid(QMAX, quals[pos-1], 0));
+     final byte qCur=quals[pos];
+     final byte qNext=(pos>=lastIdx ? QEND : (byte)Tools.mid(QMAX, quals[pos+1], 0));
+
+     //Five-base window centered on pos.
+     final byte baseM2=(pos>1 ? bases[pos-2] : endBase);
+     final byte baseM1=(pos>0 ? bases[pos-1] : endBase);
+     final byte baseC=bases[pos];
+     final byte baseP1=(pos<lastIdx ? bases[pos+1] : endBase);
+     final byte baseP2=(pos<lastIdx-1 ? bases[pos+2] : endBase);
+     final byte n0=baseToNum[baseM2];
+     final byte n1=baseToNum[baseM1];
+     final byte n2=baseToNum[baseC];
+     final byte n3=baseToNum[baseP1];
+     final byte n4=baseToNum[baseP2];
+
+     final float fallback=PROB_ERROR[qCur];
+
+     //Starts below any valid probability so that "nothing loaded" is detectable afterwards.
+     float best=-1;
+
+     if(q102ProbMatrix!=null){best=Tools.max(best, q102ProbMatrix[qCur][qPrev][qNext]);}
+     if(qbpProbMatrix!=null){best=Tools.max(best, qbpProbMatrix[qCur][n2][pos]);}
+     if(q10ProbMatrix!=null){best=Tools.max(best, q10ProbMatrix[qCur][qPrev]);}
+     if(q12ProbMatrix!=null){best=Tools.max(best, q12ProbMatrix[qCur][qNext]);}
+     if(qb012ProbMatrix!=null){best=Tools.max(best, qb012ProbMatrix[qCur][n0][n1][n2]);}
+     if(qb123ProbMatrix!=null){best=Tools.max(best, qb123ProbMatrix[qCur][n1][n2][n3]);}
+     if(qb234ProbMatrix!=null){best=Tools.max(best, qb234ProbMatrix[qCur][n2][n3][n4]);}
+     if(qpProbMatrix!=null){best=Tools.max(best, qpProbMatrix[qCur][pos]);}
+
+     if(best<0){
+         assert(false);
+         return fallback;
+     }
+     return best;
+ }
+
+ /**
+  * Estimates the error probability of the base at {@code pos} as the geometric
+  * mean of the values found in the recalibration probability matrices that are loaded.
+  * @param quals Quality scores of the read
+  * @param bases Bases of the read
+  * @param pos Position to evaluate
+  * @return The geometric mean over all non-null matrices, or PROB_ERROR[quals[pos]] if none is loaded
+  */
+ public static final float estimateErrorProbGeoAvg(byte[] quals, byte[] bases, int pos){
+     final byte pad='E'; //Placeholder base used beyond either end of the read
+     final int lastIdx=quals.length-1;
+
+     //Neighbor qualities go through Tools.mid(QMAX, q, 0); QEND is used where there is no neighbor.
+     final byte q0=(pos>0 ? (byte)Tools.mid(QMAX, quals[pos-1], 0) : QEND);
+     final byte q1=quals[pos];
+     final byte q2=(pos<lastIdx ? (byte)Tools.mid(QMAX, quals[pos+1], 0) : QEND);
+
+     //Five-base window centered on pos
+     final byte baseL2=(pos>1 ? bases[pos-2] : pad);
+     final byte baseL1=(pos>0 ? bases[pos-1] : pad);
+     final byte baseC=bases[pos];
+     final byte baseR1=(pos<lastIdx ? bases[pos+1] : pad);
+     final byte baseR2=(pos<lastIdx-1 ? bases[pos+2] : pad);
+     final byte n0=baseToNum[baseL2];
+     final byte n1=baseToNum[baseL1];
+     final byte n2=baseToNum[baseC];
+     final byte n3=baseToNum[baseR1];
+     final byte n4=baseToNum[baseR2];
+
+     final float fallback=PROB_ERROR[q1];
+     double running=1; //Product of all contributing probabilities
+     int terms=0;      //Number of matrices that contributed
+
+     if(q102ProbMatrix!=null){
+         running*=q102ProbMatrix[q1][q0][q2];
+         terms++;
+     }
+     if(qbpProbMatrix!=null){
+         running*=qbpProbMatrix[q1][n2][pos];
+         terms++;
+     }
+     if(q10ProbMatrix!=null){
+         running*=q10ProbMatrix[q1][q0];
+         terms++;
+     }
+     if(q12ProbMatrix!=null){
+         running*=q12ProbMatrix[q1][q2];
+         terms++;
+     }
+     if(qb012ProbMatrix!=null){
+         running*=qb012ProbMatrix[q1][n0][n1][n2];
+         terms++;
+     }
+     if(qb123ProbMatrix!=null){
+         running*=qb123ProbMatrix[q1][n1][n2][n3];
+         terms++;
+     }
+     if(qb234ProbMatrix!=null){
+         running*=qb234ProbMatrix[q1][n2][n3][n4];
+         terms++;
+     }
+     if(qpProbMatrix!=null){
+         running*=qpProbMatrix[q1][pos];
+         terms++;
+     }
+
+     if(terms>=1){
+         return (float)Math.pow(running, 1.0/terms);
+     }
+     //No matrix loaded; fall back to the probability implied by the raw quality.
+     assert(false);
+     return fallback;
+ }
+
+ /**
+  * Estimates the error probability of the base at {@code pos} by pooling the raw
+  * observation counts of every loaded count matrix and blending them with the
+  * probability implied by the base's own quality score.
+  * The blend is (bad + expected*OBSERVATION_CUTOFF) / (sum + OBSERVATION_CUTOFF),
+  * the same formula used by modify(); with no observations it returns exactly
+  * the expected probability.
+  * @param quals Quality scores of the read
+  * @param bases Bases of the read
+  * @param pos Position to evaluate
+  * @return Blended error probability
+  */
+ public static final float estimateErrorProb2(byte[] quals, byte[] bases, int pos){
+     final byte e='E'; //Placeholder base used beyond either end of the read
+     final int last=quals.length-1;
+
+     //Neighbor qualities go through Tools.mid(QMAX, q, 0); QEND is used where there is no neighbor.
+     final byte q0=(pos>0 ? (byte)Tools.mid(QMAX, quals[pos-1], 0) : QEND);
+     final byte q1=quals[pos];
+     final byte q2=(pos<last ? (byte)Tools.mid(QMAX, quals[pos+1], 0) : QEND);
+
+     //Five-base window centered on pos
+     final byte b0=pos>1 ? bases[pos-2] : e;
+     final byte b1=pos>0 ? bases[pos-1] : e;
+     final byte b2=bases[pos];
+     final byte b3=pos<last ? bases[pos+1] : e;
+     final byte b4=pos<last-1 ? bases[pos+2] : e;
+     final byte n0=baseToNum[b0];
+     final byte n1=baseToNum[b1];
+     final byte n2=baseToNum[b2];
+     final byte n3=baseToNum[b3];
+     final byte n4=baseToNum[b4];
+
+     //Start from zero: the pseudo-count is added exactly once, in the return statement.
+     //(Previously sum started at OBSERVATION_CUTOFF AND the cutoff was added again below,
+     //so the denominator carried twice the pseudo-count of the numerator.)
+     long sum=0, bad=0;
+     if(q102CountMatrix!=null){
+         sum+=q102CountMatrix[0][q1][q0][q2];
+         bad+=q102CountMatrix[1][q1][q0][q2];
+     }
+     if(qbpCountMatrix!=null){
+         sum+=qbpCountMatrix[0][q1][n2][pos];
+         bad+=qbpCountMatrix[1][q1][n2][pos];
+     }
+     if(q10CountMatrix!=null){
+         sum+=q10CountMatrix[0][q1][q0];
+         bad+=q10CountMatrix[1][q1][q0];
+     }
+     if(q12CountMatrix!=null){
+         sum+=q12CountMatrix[0][q1][q2];
+         bad+=q12CountMatrix[1][q1][q2];
+     }
+     if(qb012CountMatrix!=null){
+         sum+=qb012CountMatrix[0][q1][n0][n1][n2];
+         bad+=qb012CountMatrix[1][q1][n0][n1][n2];
+     }
+     if(qb123CountMatrix!=null){
+         sum+=qb123CountMatrix[0][q1][n1][n2][n3];
+         bad+=qb123CountMatrix[1][q1][n1][n2][n3];
+     }
+     if(qb234CountMatrix!=null){
+         sum+=qb234CountMatrix[0][q1][n2][n3][n4];
+         bad+=qb234CountMatrix[1][q1][n2][n3][n4];
+     }
+     if(qpCountMatrix!=null){
+         sum+=qpCountMatrix[0][q1][pos];
+         bad+=qpCountMatrix[1][q1][pos];
+     }
+
+     final double expected=PROB_ERROR[q1];
+
+     return (float)((bad+(expected*OBSERVATION_CUTOFF))/(sum+OBSERVATION_CUTOFF));
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private static double modify(final double sum, final double bad, final int phred, final long cutoff){
+ double expected=QualityTools.PROB_ERROR[phred];
+
+ double sum2=sum+cutoff;
+ double bad2=bad+expected*cutoff;
+ double measured=bad2/sum2;
+
+ return measured;
+
+// double modified=Math.pow(measured*measured*measured*expected, 0.25);
+//// double modified=Math.sqrt(measured*expected);
+//// double modified=(measured+expected)*.5;
+//
+// return modified;
+ }
+
+ /**
+  * Converts 4-dimensional count matrices into error probabilities via modify().
+  * The first index is passed to modify() as the phred score.
+  * Dimensions are taken from the first element along each axis of sumMatrix.
+  * @param sumMatrix Observation counts
+  * @param badMatrix Error counts, same shape as sumMatrix
+  * @param cutoff Pseudo-count weight passed to modify()
+  * @return Probability matrix with the same shape as sumMatrix
+  */
+ public static final float[][][][] toProbs(long[][][][] sumMatrix, long[][][][] badMatrix, final long cutoff){
+     final int len0=sumMatrix.length;
+     final int len1=sumMatrix[0].length;
+     final int len2=sumMatrix[0][0].length;
+     final int len3=sumMatrix[0][0][0].length;
+     final float[][][][] result=new float[len0][len1][len2][len3];
+     for(int q=0; q<len0; q++){
+         for(int i=0; i<len1; i++){
+             for(int j=0; j<len2; j++){
+                 for(int k=0; k<len3; k++){
+                     result[q][i][j][k]=(float)modify(sumMatrix[q][i][j][k], badMatrix[q][i][j][k], q, cutoff);
+                 }
+             }
+         }
+     }
+     return result;
+ }
+
+ /**
+  * Converts 3-dimensional count matrices into error probabilities via modify().
+  * The first index is passed to modify() as the phred score.
+  * Dimensions are taken from the first element along each axis of sumMatrix.
+  * @param sumMatrix Observation counts
+  * @param badMatrix Error counts, same shape as sumMatrix
+  * @param cutoff Pseudo-count weight passed to modify()
+  * @return Probability matrix with the same shape as sumMatrix
+  */
+ public static final float[][][] toProbs(long[][][] sumMatrix, long[][][] badMatrix, final long cutoff){
+     final int len0=sumMatrix.length;
+     final int len1=sumMatrix[0].length;
+     final int len2=sumMatrix[0][0].length;
+     final float[][][] result=new float[len0][len1][len2];
+     for(int q=0; q<len0; q++){
+         for(int i=0; i<len1; i++){
+             for(int j=0; j<len2; j++){
+                 result[q][i][j]=(float)modify(sumMatrix[q][i][j], badMatrix[q][i][j], q, cutoff);
+             }
+         }
+     }
+     return result;
+ }
+
+ /**
+  * Converts 2-dimensional count matrices into error probabilities via modify().
+  * The first index is passed to modify() as the phred score.
+  * Dimensions are taken from the first element along each axis of sumMatrix.
+  * @param sumMatrix Observation counts
+  * @param badMatrix Error counts, same shape as sumMatrix
+  * @param cutoff Pseudo-count weight passed to modify()
+  * @return Probability matrix with the same shape as sumMatrix
+  */
+ public static final float[][] toProbs(long[][] sumMatrix, long[][] badMatrix, final long cutoff){
+     final int len0=sumMatrix.length;
+     final int len1=sumMatrix[0].length;
+     final float[][] result=new float[len0][len1];
+     for(int q=0; q<len0; q++){
+         for(int i=0; i<len1; i++){
+             result[q][i]=(float)modify(sumMatrix[q][i], badMatrix[q][i], q, cutoff);
+         }
+     }
+     return result;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Expands a leading '?' in a file name to Data.ROOT_QUALITY.
+  * Names that do not start with '?' are returned unchanged.
+  * @param fname File name; must not be null
+  * @return The resolved file name
+  */
+ private static String findPath(String fname){
+     assert(fname!=null);
+     if(fname.startsWith("?")){
+         //Plain concatenation instead of replaceFirst(): the replacement argument of
+         //replaceFirst() gives '\' and '$' special meaning, which would mangle (or throw on)
+         //a root directory containing those characters, e.g. a Windows path.
+         fname=Data.ROOT_QUALITY+fname.substring(1);
+     }
+     return fname;
+ }
+
+ /**
+  * Loads a 1-dimensional count matrix from a tab-delimited text file.
+  * Each line holds 3 fields: index, bases, errors.
+  * @param fname File name (a leading '?' is expanded by findPath); may be null
+  * @param d0 Size of the index dimension
+  * @return matrix[0][a]=bases and matrix[1][a]=errors, or null if fname is null
+  */
+ public static final long[][] loadMatrix(String fname, int d0){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     final long[][] matrix=new long[2][d0];
+
+     final TextFile tf=new TextFile(fname, false, false);
+     try{
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==3) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             long bases=Long.parseLong(split[1]);
+             long errors=Long.parseLong(split[2]);
+             matrix[0][a]=bases;
+             matrix[1][a]=errors;
+         }
+     }finally{
+         tf.close(); //Release the file handle even if a line fails to parse
+     }
+     return matrix;
+ }
+
+ /**
+  * Loads a 2-dimensional count matrix from a tab-delimited text file.
+  * Each line holds 4 fields: index a, index b, bases, errors.
+  * @param fname File name (a leading '?' is expanded by findPath); may be null
+  * @param d0 Size of the first index dimension
+  * @param d1 Size of the second index dimension
+  * @return matrix[0][a][b]=bases and matrix[1][a][b]=errors, or null if fname is null
+  */
+ public static final long[][][] loadMatrix(String fname, int d0, int d1){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     final long[][][] matrix=new long[2][d0][d1];
+
+     final TextFile tf=new TextFile(fname, false, false);
+     try{
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==4) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             int b=Integer.parseInt(split[1]);
+             long bases=Long.parseLong(split[2]);
+             long errors=Long.parseLong(split[3]);
+             matrix[0][a][b]=bases;
+             matrix[1][a][b]=errors;
+         }
+     }finally{
+         tf.close(); //Release the file handle even if a line fails to parse
+     }
+     return matrix;
+ }
+
+ /**
+  * Loads a 3-dimensional count matrix from a tab-delimited text file.
+  * Each line holds 5 fields: index a, index b, index c, bases, errors.
+  * @param fname File name (a leading '?' is expanded by findPath); may be null
+  * @param d0 Size of the first index dimension
+  * @param d1 Size of the second index dimension
+  * @param d2 Size of the third index dimension
+  * @return matrix[0][a][b][c]=bases and matrix[1][a][b][c]=errors, or null if fname is null
+  */
+ public static final long[][][][] loadMatrix(String fname, int d0, int d1, int d2){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     final long[][][][] matrix=new long[2][d0][d1][d2];
+
+     final TextFile tf=new TextFile(fname, false, false);
+     try{
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==5) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             int b=Integer.parseInt(split[1]);
+             int c=Integer.parseInt(split[2]);
+             long bases=Long.parseLong(split[3]);
+             long errors=Long.parseLong(split[4]);
+             matrix[0][a][b][c]=bases;
+             matrix[1][a][b][c]=errors;
+         }
+     }finally{
+         tf.close(); //Release the file handle even if a line fails to parse
+     }
+     return matrix;
+ }
+
+ /**
+  * Loads a 4-dimensional count matrix from a tab-delimited text file.
+  * Each line holds 6 fields: index a, index b, index c, index d, bases, errors.
+  * @param fname File name (a leading '?' is expanded by findPath); may be null
+  * @param d0 Size of the first index dimension
+  * @param d1 Size of the second index dimension
+  * @param d2 Size of the third index dimension
+  * @param d3 Size of the fourth index dimension
+  * @return matrix[0][a][b][c][d]=bases and matrix[1][a][b][c][d]=errors, or null if fname is null
+  */
+ public static final long[][][][][] loadMatrix(String fname, int d0, int d1, int d2, int d3){
+     if(fname==null){return null;}
+     fname=findPath(fname);
+     final long[][][][][] matrix=new long[2][d0][d1][d2][d3];
+
+     final TextFile tf=new TextFile(fname, false, false);
+     try{
+         for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+             String[] split=line.split("\t");
+             assert(split.length==6) : Arrays.toString(split);
+             int a=Integer.parseInt(split[0]);
+             int b=Integer.parseInt(split[1]);
+             int c=Integer.parseInt(split[2]);
+             int d=Integer.parseInt(split[3]);
+             long bases=Long.parseLong(split[4]);
+             long errors=Long.parseLong(split[5]);
+             matrix[0][a][b][c][d]=bases;
+             matrix[1][a][b][c][d]=errors;
+         }
+     }finally{
+         tf.close(); //Release the file handle even if a line fails to parse
+     }
+     return matrix;
+ }
+
+ private static byte[] fillBaseToNum(){
+ byte[] btn=new byte[128];
+ Arrays.fill(btn, (byte)5);
+ btn['A']=btn['a']=0;
+ btn['C']=btn['c']=1;
+ btn['G']=btn['g']=2;
+ btn['T']=btn['t']=3;
+ btn['U']=btn['u']=3;
+ btn['E']=4;
+ return btn;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nested Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Worker thread. Pulls lists of reads from an input stream and, for every usable
+  * aligned base, increments thread-local "good" (match symbol 'm') or "bad"
+  * (match symbol 'S') counters indexed by quality, neighboring qualities,
+  * neighboring bases and position.
+  */
+ private class Worker extends Thread {
+
+     /** @param cris_ Input stream this worker reads from */
+     Worker(ConcurrentReadInputStream cris_){
+         cris=cris_;
+     }
+
+     /** Processes read lists (both mates of each read) until the stream returns a null or empty list. */
+     @Override
+     public void run(){
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         while(reads!=null && reads.size()>0){
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 Read r1=reads.get(idx);
+                 Read r2=r1.mate;
+                 processLocal(r1);
+                 processLocal(r2);
+             }
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     /**
+      * Tallies one read into the thread-local matrices.
+      * Reads lacking quality, bases or a match string are counted as processed but
+      * otherwise skipped, as are reads for which containsNonNMS() or
+      * containsConsecutiveS(4) is true.
+      * @param r Read to process; null is ignored
+      */
+     private void processLocal(Read r){
+         if(r==null){return;}
+         readsProcessedT++;
+         basesProcessedT+=r.length();
+
+         if(verbose){outstream.println(r+"\n");}
+
+         if(verbose){outstream.println("A");}
+         if(r.match!=null && r.shortmatch()){
+             //Expand the match string to long form so it can be indexed per base below
+             r.match=Read.toLongMatchString(r.match);
+             r.setShortMatch(false);
+         }
+         final byte[] quals=r.quality, bases=r.bases, match=r.match;
+         if(quals==null || bases==null || match==null){return;}
+         if(verbose){outstream.println("B");}
+         if(r.containsNonNMS() || r.containsConsecutiveS(4)){
+             if(verbose){System.err.println("*************************************************** "+new String(match));}
+             return;
+         }
+         if(r.strand()==Gene.MINUS){
+             Tools.reverseInPlace(match);
+         }
+         if(verbose){outstream.println("C");}
+
+         final byte e='E'; //Placeholder base used beyond either end of the read
+
+         if(readstatsT!=null){
+             readstatsT.addToQualityHistogram(r);
+         }
+
+         readsUsedT++;
+         for(int i=0, last=quals.length-1; i<quals.length; i++){
+             if(verbose){outstream.print("D");}
+             //Neighbor qualities go through Tools.mid(QMAX, q, 0); QEND is used where there is no neighbor.
+             final byte q0=(i>0 ? (byte)Tools.mid(QMAX, quals[i-1], 0) : QEND);
+             final byte q1=quals[i];
+             final byte q2=(i<last ? (byte)Tools.mid(QMAX, quals[i+1], 0) : QEND);
+
+             //Five-base window centered on i
+             byte b0=i>1 ? bases[i-2] : e;
+             byte b1=i>0 ? bases[i-1] : e;
+             byte b2=bases[i];
+             byte b3=i<last ? bases[i+1] : e;
+             byte b4=i<last-1 ? bases[i+2] : e;
+             byte n0=baseToNum[b0];
+             byte n1=baseToNum[b1];
+             byte n2=baseToNum[b2];
+             byte n3=baseToNum[b3];
+             byte n4=baseToNum[b4];
+             byte m=match[i];
+
+             if(m=='N' || !AminoAcid.isFullyDefined(b2)){
+                 if(verbose){outstream.print("E");}
+                 //do nothing
+             }else{
+
+                 if(verbose){outstream.print("F");}
+                 basesUsedT++;
+                 if(m=='m'){
+                     q102GoodMatrixT[q1][q0][q2]++;
+                     qbpGoodMatrixT[q1][n2][i]++;
+
+                     q10GoodMatrixT[q1][q0]++;
+                     //q12 is keyed by the FOLLOWING quality (q2), matching how q12ProbMatrix is read;
+                     //it was previously indexed by q0, which made it a duplicate of q10.
+                     q12GoodMatrixT[q1][q2]++;
+                     qb012GoodMatrixT[q1][n0][n1][n2]++;
+                     qb123GoodMatrixT[q1][n1][n2][n3]++;
+                     qb234GoodMatrixT[q1][n2][n3][n4]++;
+                     qpGoodMatrixT[q1][i]++;
+                     qGoodMatrixT[q1]++;
+                     pGoodMatrixT[i]++;
+                 }else if(m=='S'){
+                     q102BadMatrixT[q1][q0][q2]++;
+                     qbpBadMatrixT[q1][n2][i]++;
+
+                     q10BadMatrixT[q1][q0]++;
+                     //Same fix as above: index by q2, not q0.
+                     q12BadMatrixT[q1][q2]++;
+                     qb012BadMatrixT[q1][n0][n1][n2]++;
+                     qb123BadMatrixT[q1][n1][n2][n3]++;
+                     qb234BadMatrixT[q1][n2][n3][n4]++;
+                     qpBadMatrixT[q1][i]++;
+                     qBadMatrixT[q1]++;
+                     pBadMatrixT[i]++;
+                 }else{
+                     throw new RuntimeException("Bad symbol m='"+((char)m)+"'\n"+new String(match)+"\n"+new String(bases)+"\n");
+                 }
+             }
+         }
+
+     }
+
+     /** Reads seen by this thread, including skipped ones */
+     long readsProcessedT=0;
+     /** Bases seen by this thread, including those of skipped reads */
+     long basesProcessedT=0;
+     /** Per-thread quality histogram; only allocated when qhist is set */
+     final ReadStats readstatsT=(qhist==null ? null : new ReadStats());
+     /** Reads and bases that actually contributed to the matrices */
+     long readsUsedT=0, basesUsedT=0;
+
+     private final ConcurrentReadInputStream cris;
+
+     /* Thread-local counters; "Good" counts 'm' symbols and "Bad" counts 'S' symbols. */
+
+     /** Indexed [q1][q0][q2] */
+     final long[][][] q102GoodMatrixT=new long[QMAX2][QMAX2][QMAX2];
+     final long[][][] q102BadMatrixT=new long[QMAX2][QMAX2][QMAX2];
+
+     /** Indexed [q1][n2][position] */
+     final long[][][] qbpGoodMatrixT=new long[QMAX2][BMAX][LENMAX];
+     final long[][][] qbpBadMatrixT=new long[QMAX2][BMAX][LENMAX];
+
+     /** Indexed [q1][q0] */
+     final long[][] q10GoodMatrixT=new long[QMAX2][QMAX2];
+     final long[][] q10BadMatrixT=new long[QMAX2][QMAX2];
+
+     /** Indexed [q1][q2] */
+     final long[][] q12GoodMatrixT=new long[QMAX2][QMAX2];
+     final long[][] q12BadMatrixT=new long[QMAX2][QMAX2];
+
+     /** Indexed [q1][n0][n1][n2] */
+     final long[][][][] qb012GoodMatrixT=new long[QMAX2][BMAX][BMAX][BMAX];
+     final long[][][][] qb012BadMatrixT=new long[QMAX2][BMAX][BMAX][BMAX];
+
+     /** Indexed [q1][n1][n2][n3] */
+     final long[][][][] qb123GoodMatrixT=new long[QMAX2][BMAX][BMAX][BMAX];
+     final long[][][][] qb123BadMatrixT=new long[QMAX2][BMAX][BMAX][BMAX];
+
+     /** Indexed [q1][n2][n3][n4] */
+     final long[][][][] qb234GoodMatrixT=new long[QMAX2][BMAX][BMAX][BMAX];
+     final long[][][][] qb234BadMatrixT=new long[QMAX2][BMAX][BMAX][BMAX];
+
+     /** Indexed [q1][position] */
+     final long[][] qpGoodMatrixT=new long[QMAX2][LENMAX];
+     final long[][] qpBadMatrixT=new long[QMAX2][LENMAX];
+
+     /** Indexed [q1] */
+     final long[] qGoodMatrixT=new long[QMAX2];
+     final long[] qBadMatrixT=new long[QMAX2];
+
+     /** Indexed [position] */
+     final long[] pGoodMatrixT=new long[LENMAX];
+     final long[] pBadMatrixT=new long[LENMAX];
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Quality-histogram statistics for the whole run (assigned outside this section) */
+ private ReadStats readstats;
+
+ /** Defaults to true; presumably controls whether the count matrices are written - confirm against the writer code */
+ private boolean writeMatrices=true;
+
+ /* Count matrices, allocated with the same shapes as the per-thread matrices in Worker.
+ * NOTE(review): presumably the Worker counters are summed into these - confirm against the merge code. */
+
+ private long[][][] q102GoodMatrix=new long[QMAX2][QMAX2][QMAX2];
+ private long[][][] q102BadMatrix=new long[QMAX2][QMAX2][QMAX2];
+
+ private long[][][] qbpGoodMatrix=new long[QMAX2][BMAX][LENMAX];
+ private long[][][] qbpBadMatrix=new long[QMAX2][BMAX][LENMAX];
+
+ private long[][] q10GoodMatrix=new long[QMAX2][QMAX2];
+ private long[][] q10BadMatrix=new long[QMAX2][QMAX2];
+
+ private long[][] q12GoodMatrix=new long[QMAX2][QMAX2];
+ private long[][] q12BadMatrix=new long[QMAX2][QMAX2];
+
+ private long[][][][] qb012GoodMatrix=new long[QMAX2][BMAX][BMAX][BMAX];
+ private long[][][][] qb012BadMatrix=new long[QMAX2][BMAX][BMAX][BMAX];
+
+ private long[][][][] qb123GoodMatrix=new long[QMAX2][BMAX][BMAX][BMAX];
+ private long[][][][] qb123BadMatrix=new long[QMAX2][BMAX][BMAX][BMAX];
+
+ private long[][][][] qb234GoodMatrix=new long[QMAX2][BMAX][BMAX][BMAX];
+ private long[][][][] qb234BadMatrix=new long[QMAX2][BMAX][BMAX][BMAX];
+
+ private long[][] qpGoodMatrix=new long[QMAX2][LENMAX];
+ private long[][] qpBadMatrix=new long[QMAX2][LENMAX];
+
+ private long[] qGoodMatrix=new long[QMAX2];
+ private long[] qBadMatrix=new long[QMAX2];
+
+ private long[] pGoodMatrix=new long[LENMAX];
+ private long[] pBadMatrix=new long[LENMAX];
+
+ /** Stream used for status and verbose messages; defaults to stderr */
+ private PrintStream outstream=System.err;
+ /** Enables the per-read and per-base trace output in Worker.processLocal */
+ private boolean verbose=false;
+ /** Defaults to -1 (presumably "no limit" - confirm where it is consumed) */
+ private long maxReads=-1;
+ /** Input file names */
+ private String[] in;
+
+ /* Default output names. NOTE(review): the leading '?' matches the prefix that
+ * findPath() expands to Data.ROOT_QUALITY - confirm these names are passed through it. */
+ private String q102out="?q102matrix.txt.gz";
+ private String qbpout="?qbpmatrix.txt.gz";
+ private String q10out="?q10matrix.txt.gz";
+ private String q12out="?q12matrix.txt.gz";
+ private String qb012out="?qb012matrix.txt.gz";
+ private String qb123out="?qb123matrix.txt.gz";
+ private String qb234out="?qb234matrix.txt.gz";
+ private String qpout="?qpmatrix.txt.gz";
+ private String qout="?qmatrix.txt.gz";
+ private String pout="?pmatrix.txt.gz";
+ /** Quality histogram output; when null, Worker does not allocate a per-thread ReadStats */
+ private String qhist=null;
+
+ private boolean overwrite=true;
+ private final boolean append=false;
+ /* Run totals. NOTE(review): presumably accumulated from the Worker *T counters - confirm. */
+ private long readsProcessed=0;
+ private long basesProcessed=0;
+ private long readsUsed=0;
+ private long basesUsed=0;
+ private boolean errorState=false;
+
+ private final int threads;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Single-element flag array, initially false (NOTE(review): presumably guards one-time matrix loading - confirm) */
+ private static final boolean[] initialized={false};
+
+ /** Upper bound handed to Tools.mid() for neighboring quality scores */
+ private static final int QMAX=41;
+ /** Pseudo-quality used for the nonexistent neighbor at either end of a read */
+ private static final int QEND=QMAX+1;
+ /** Length of every quality-indexed matrix dimension (covers 0..QEND) */
+ private static final int QMAX2=QEND+1;
+ /** Length of every base-indexed matrix dimension; fillBaseToNum() emits codes 0..5 */
+ private static final int BMAX=6;
+ /** Length of every position-indexed matrix dimension */
+ private static final int LENMAX=400;
+ /** ASCII symbol to base code lookup, built by fillBaseToNum() */
+ private static final byte[] baseToNum=fillBaseToNum();
+ /** Base code to symbol, the reverse of baseToNum */
+ private static final byte[] numToBase={'A', 'C', 'G', 'T', 'E', 'N'};
+ /** Alias of QualityTools.PROB_ERROR; indexed by quality score to get the expected error probability */
+ private static final float[] PROB_ERROR=QualityTools.PROB_ERROR;
+
+ /* Default matrix file names. NOTE(review): the leading '?' matches the prefix that
+ * findPath() expands to Data.ROOT_QUALITY - confirm these are what loadMatrix() receives. */
+ public static String q102matrix="?q102matrix.txt.gz";
+ public static String qbpmatrix="?qbpmatrix.txt.gz";
+ public static String q10matrix="?q10matrix.txt.gz";
+ public static String q12matrix="?q12matrix.txt.gz";
+ public static String qb012matrix="?qb012matrix.txt.gz";
+ public static String qb123matrix="?qb123matrix.txt.gz";
+ public static String qb234matrix="?qb234matrix.txt.gz";
+ public static String qpmatrix="?qpmatrix.txt.gz";
+
+ /* Raw count matrices. estimateErrorProb2() reads index [0] as observation counts
+ * and index [1] as error counts; a null matrix is skipped. */
+ public static long[][][][] q102CountMatrix;
+ public static long[][][][] qbpCountMatrix;
+
+ public static long[][][] q10CountMatrix;
+ public static long[][][] q12CountMatrix;
+ public static long[][][][][] qb012CountMatrix;
+ public static long[][][][][] qb123CountMatrix;
+ public static long[][][][][] qb234CountMatrix;
+ public static long[][][] qpCountMatrix;
+
+ /* Probability matrices read by the estimateErrorProb* methods; a null matrix is skipped. */
+ public static float[][][] q102ProbMatrix;
+ public static float[][][] qbpProbMatrix;
+
+ public static float[][] q10ProbMatrix;
+ public static float[][] q12ProbMatrix;
+ public static float[][][][] qb012ProbMatrix;
+ public static float[][][][] qb123ProbMatrix;
+ public static float[][][][] qb234ProbMatrix;
+ public static float[][] qpProbMatrix;
+
+ /* Per-matrix switches (NOTE(review): presumably select which matrices get loaded - confirm against the loader). */
+ public static boolean q102=false;
+ public static boolean qbp=false;
+ public static boolean q10=false;
+ public static boolean q12=false;
+ public static boolean qb012=true;
+ public static boolean qb123=false;
+ public static boolean qb234=false;
+ public static boolean qp=true;
+
+ /** NOTE(review): presumably selects the averaging estimator over the alternatives - confirm at the call site */
+ public static boolean USE_AVERAGE=true;
+
+ /** Pseudo-count weight given to the quality-implied error probability when blending with observed counts */
+ public static final long OBSERVATION_CUTOFF=1000; //Soft threshold
+
+
+
+}
diff --git a/current/jgi/CalcUniqueness.java b/current/jgi/CalcUniqueness.java
new file mode 100755
index 0000000..4091683
--- /dev/null
+++ b/current/jgi/CalcUniqueness.java
@@ -0,0 +1,610 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import kmer.AbstractKmerTable;
+import kmer.HashArray1D;
+import kmer.HashForest;
+import kmer.KmerTable;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 24, 2014
+ *
+ */
+public class CalcUniqueness {
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Program entry point: starts a timer, builds a CalcUniqueness from the
+  * arguments, and runs it.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     new CalcUniqueness(args).process(timer);
+ }
+
+ /**
+ * Parses the command line, adjusts several shared static settings, resolves the
+ * input and output files, and allocates the kmer tables.
+ * Exits the JVM after printing options when Parser.parseHelp() returns true.
+ * @param args Command-line arguments, each of the form key=value (leading hyphens on the key are ignored)
+ * @throws RuntimeException on an unknown parameter, a missing input file, or an unwritable output file
+ */
+ public CalcUniqueness(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+
+ //These are static settings, so they affect every user of Shared/ReadWrite in this JVM
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=8;
+
+ //Default kmer length; overridden by k=<int>
+ int k_=20;
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Only the text between the first and second '=' becomes the value b
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("printlastbin") || a.equals("plb")){
+ printLastBin=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("cumulative")){
+ cumulative=Tools.parseBoolean(b);
+ }else if(a.equals("percent") || a.equals("percents")){
+ showPercents=Tools.parseBoolean(b);
+ }else if(a.equals("count") || a.equals("counts")){
+ showCounts=Tools.parseBoolean(b);
+ }else if(a.equals("k")){
+ k_=Integer.parseInt(b);
+ }else if(a.equals("bin") || a.equals("interval")){
+ interval=Integer.parseInt(b);
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument naming stdin or an existing file is taken as the input
+ parser.in1=arg;
+ }else{
+// System.err.println("Unknown parameter "+args[i]);
+// assert(false) : "Unknown parameter "+args[i];
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ samplerate=parser.samplerate;
+ sampleseed=parser.sampleseed;
+
+ overwrite=parser.overwrite;
+ append=parser.append;
+ testsize=parser.testsize;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+
+ out=parser.out1;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+ //NOTE(review): always called with -1, regardless of the sampleseed parsed above - confirm this is intended
+ setSampleSeed(-1);
+
+ k=k_;
+ k2=k-1;
+ //Only enforced when assertions are enabled (-ea)
+ assert(k>0 && k<32) : "k="+k+"; valid range is 1-31";
+
+ //A '#' in in1 (when in1 is not itself an existing file) is expanded into two names, using '1' and '2'
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ //Select ByteFile mode BF2 when neither mode was forced and more than 2 threads are available
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //Output is never null after this point
+ if(out==null){
+ out="stdout.txt";
+ }
+
+ if(!setInterleaved){
+ assert(in1!=null && out!=null) : "\nin1="+in1+"\nin2="+in2+"\nout="+out+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n");
+ }
+
+ ffout=FileFormat.testOutput(out, FileFormat.TEXT, extout, false, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+ //One table per way; Counter.increment() picks a table with kmer%WAYS
+ keySets=new AbstractKmerTable[WAYS];
+
+ //Initialize tables
+ for(int j=0; j<WAYS; j++){
+ if(useForest){
+ keySets[j]=new HashForest(initialSize, true, false);
+ }else if(useTable){
+ keySets[j]=new KmerTable(initialSize, true);
+ }else if(useArray){
+ keySets[j]=new HashArray1D(initialSize, true);
+ }else{
+ throw new RuntimeException("Must use forest, table, or array data structure.");
+ }
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Class ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Tracks how often kmers were new ("misses") versus already seen ("hits") for one
+  * category. Each Counter owns one bit (its mask) of the value stored per kmer in
+  * the shared tables, so several Counters can share the same tables.
+  */
+ private class Counter{
+
+     /** @param mask_ Bit pattern this counter sets in the stored kmer value */
+     Counter(int mask_){
+         mask=mask_;
+     }
+
+     /**
+      * Records one occurrence of a kmer: a hit if this counter's bit is already set
+      * for it, otherwise a miss, in which case the bit is set.
+      * @param kmer Encoded kmer
+      */
+     void increment(final long kmer){
+         final AbstractKmerTable table=keySets[(int)(kmer%WAYS)];
+         final int stored=table.getValue(kmer);
+         final boolean alreadySeen=(stored>=1 && (stored&mask)!=0);
+         if(alreadySeen){
+             hits++;
+             chits++;
+             return;
+         }
+         //Absent kmers get just this mask; kmers known to other counters keep their bits.
+         table.set(kmer, (stored<1 ? mask : (stored|mask)));
+         misses++;
+         cmisses++;
+     }
+
+     /** Clears the per-interval counts; cumulative counts are kept. */
+     void reset(){
+         hits=0;
+         misses=0;
+     }
+
+     /** @return Misses as a percentage of all increments, per-interval or cumulative depending on {@code cumulative} */
+     double percent(){
+         final long missCount=misses();
+         final long total=hits()+missCount;
+         return missCount*100.0/total;
+     }
+
+     /** @return percent() formatted with 3 decimal places */
+     String percentS(){
+         return String.format("%.3f",percent());
+     }
+
+     /** @return Cumulative hits if {@code cumulative} is set, otherwise per-interval hits */
+     long hits(){
+         if(cumulative){return chits;}
+         return hits;
+     }
+
+     /** @return Cumulative misses if {@code cumulative} is set, otherwise per-interval misses */
+     long misses(){
+         if(cumulative){return cmisses;}
+         return misses;
+     }
+
+     /** Bit pattern owned by this counter */
+     final int mask;
+
+     /** Per-interval hash hits */
+     long hits=0;
+     /** Per-interval hash misses */
+     long misses=0;
+
+     /** Cumulative hash hits */
+     long chits=0;
+     /** Cumulative hash misses */
+     long cmisses=0;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Primary Method ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Streams all input reads, feeds the first kmer, a random kmer and (for pairs) a
+ * combined pair kmer of each read into the counters, and writes one row of
+ * statistics to the output every {@code interval} pairs. Finally prints timing
+ * statistics to outstream.
+ * @param t Timer started by the caller; stopped here once processing is complete
+ * @throws RuntimeException if an error state was recorded while closing the streams
+ */
+ void process(Timer t){
+
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+ cris.setSampleRate(samplerate, sampleseed);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+ final boolean paired=cris.paired();
+ if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));}
+
+ //NOTE(review): tsw is used unconditionally further down; this is safe only because the
+ //constructor replaces a null 'out' with "stdout.txt".
+ TextStreamWriter tsw=null;
+ if(out!=null){
+ tsw=new TextStreamWriter(ffout);
+ tsw.start();
+ //Header row; the columns mirror what printCountsToBuffer() appends
+ tsw.print("#count");
+ if(showPercents){
+ tsw.print("\tfirst\trand");
+ if(paired){tsw.print("\tr1_first\tr1_rand\tr2_first\tr2_rand\tpair");}
+ }
+ if(showCounts){
+ tsw.print("\tfirst_cnt\trand_cnt");
+ if(paired){tsw.print("\tr1_first_cnt\tr1_rand_cnt\tr2_first_cnt\tr2_rand_cnt\tpair_cnt");}
+ }
+ tsw.print("\n");
+ }
+
+ //Counters for overall data statistics
+ long pairsProcessed=0;
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ //Counter for display intervals
+ long remaining=interval;
+
+ final StringBuilder sb=new StringBuilder(1024);
+
+ {
+ //Fetch initial list
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ /* Process 1 list of reads per loop iteration */
+ while(reads!=null && reads.size()>0){
+
+ /* Process 1 read per loop iteration */
+ for(Read r1 : reads){
+ //NOTE(review): r1.mate is dereferenced here, so the r1==null checks below can never be true
+ final Read r2=r1.mate;
+ final byte[] bases1=(r1==null ? null : r1.bases);
+ final byte[] bases2=(r2==null ? null : r2.bases);
+ final int length1=(bases1==null ? 0 : bases1.length);
+ final int length2=(bases2==null ? 0 : bases2.length);
+
+ pairsProcessed++;
+
+ /* Process read 1 */
+ if(r1!=null){
+
+ readsProcessed++;
+ basesProcessed+=length1;
+
+ if(length1>=k){
+ {//First kmer
+ final long kmer=toKmer(bases1, 0, k);
+ r1CounterFirst.increment(kmer);
+ bothCounterFirst.increment(kmer);
+ }
+ {//Random kmer
+ //k2 is k-1, so the bound length1-k2 is the number of valid kmer start positions
+ final long kmer=toKmer(bases1, randy.nextInt(length1-k2), k);
+ r1CounterRand.increment(kmer);
+ bothCounterRand.increment(kmer);
+ }
+ }
+ }
+
+ /* Process read 2 */
+ if(r2!=null){
+
+ readsProcessed++;
+ basesProcessed+=length2;
+
+ if(length2>=k){
+ {//First kmer
+ final long kmer=toKmer(bases2, 0, k);
+ r2CounterFirst.increment(kmer);
+ bothCounterFirst.increment(kmer);
+ }
+ {//Random kmer
+ final long kmer=toKmer(bases2, randy.nextInt(length2-k2), k);
+ r2CounterRand.increment(kmer);
+ bothCounterRand.increment(kmer);
+ }
+ }
+ }
+
+ /* Process pair */
+ if(r1!=null && r2!=null){
+
+ if(length1>k+OFFSET && length2>k+OFFSET){
+ final long kmer1=toKmer(bases1, OFFSET, k);
+ final long kmer2=toKmer(bases2, OFFSET, k);
+ //NOTE(review): (-1L)>>2 is an arithmetic shift and stays -1, so ~((-1L)>>2) is 0 and
+ //the OR is a no-op; the key is just (kmer1<<(2*(31-k)))^kmer2. Possibly >>> was
+ //intended, but that would set the top bits and violate the assert below - confirm.
+ final long kmer=(~((-1L)>>2))|((kmer1<<(2*(31-k)))^(kmer2));
+ assert(kmer>=0) : k+", "+kmer1+", "+kmer2+", "+kmer;
+ {//Pair kmer
+ pairCounter.increment(kmer);
+ }
+ }
+ }
+
+ //Emit one output row each time 'interval' pairs have been consumed
+ remaining--;
+ if(remaining<=0){
+
+ printCountsToBuffer(sb, pairsProcessed, paired);
+
+ tsw.print(sb.toString());
+
+ //Reset things
+ sb.setLength(0);
+ remaining=interval;
+ }
+ }
+
+ //Fetch a new list
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){//Return final list
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ //Flush the final, partially-filled interval if requested
+ if(remaining<interval && printLastBin){
+
+ printCountsToBuffer(sb, pairsProcessed, paired);
+
+ tsw.print(sb.toString());
+
+ //Reset things
+ sb.setLength(0);
+ remaining=interval;
+ }
+
+
+ //Close things
+ errorState|=ReadWrite.closeStream(cris);
+ tsw.poisonAndWait();
+ errorState|=tsw.errorState;
+
+ t.stop();
+
+ //Calculate and display statistics
+ //The "nano" names and the scaling below suggest t.elapsed is in nanoseconds - TODO confirm against Timer
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ //Abbreviate large totals with a k or m suffix
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ //Left-pad to 8 characters so the columns line up
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("\nTime: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ if(testsize){
+ //Uses on-disk file sizes, so this reflects compressed bytes for compressed inputs
+ long bytesProcessed=(new File(in1).length()+(in2==null ? 0 : new File(in2).length()));
+ double xpnano=bytesProcessed/(double)(t.elapsed);
+ String xpstring=(bytesProcessed<100000 ? ""+bytesProcessed : bytesProcessed<100000000 ? (bytesProcessed/1000)+"k" : (bytesProcessed/1000000)+"m");
+ while(xpstring.length()<8){xpstring=" "+xpstring;}
+ outstream.println("Bytes Processed: "+xpstring+" \t"+String.format("%.2fm bytes/sec", xpnano*1000));
+ }
+
+ if(errorState){
+ throw new RuntimeException("CalcUniqueness terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+
+ /**
+  * Appends one tab-delimited row of statistics to the buffer, then resets the
+  * per-interval counts of every counter.
+  * @param sb Buffer to append to
+  * @param pairsProcessed Value for the first column
+  * @param paired Whether to include the per-read and pair columns
+  */
+ private void printCountsToBuffer(StringBuilder sb, long pairsProcessed, boolean paired){
+
+     //First column: number of pairs processed so far
+     sb.append(pairsProcessed);
+
+     if(showPercents){
+         sb.append('\t').append(bothCounterFirst.percentS());
+         sb.append('\t').append(bothCounterRand.percentS());
+         if(paired){
+             sb.append('\t').append(r1CounterFirst.percentS());
+             sb.append('\t').append(r1CounterRand.percentS());
+             sb.append('\t').append(r2CounterFirst.percentS());
+             sb.append('\t').append(r2CounterRand.percentS());
+             sb.append('\t').append(pairCounter.percentS());
+         }
+     }
+
+     if(showCounts){
+         sb.append('\t').append(bothCounterFirst.misses());
+         sb.append('\t').append(bothCounterRand.misses());
+         if(paired){
+             sb.append('\t').append(r1CounterFirst.misses());
+             sb.append('\t').append(r1CounterRand.misses());
+             sb.append('\t').append(r2CounterFirst.misses());
+             sb.append('\t').append(r2CounterRand.misses());
+             sb.append('\t').append(pairCounter.misses());
+         }
+     }
+
+     sb.append('\n');
+
+     //Start a fresh interval for every counter, including those not printed
+     bothCounterFirst.reset();
+     bothCounterRand.reset();
+     r1CounterFirst.reset();
+     r1CounterRand.reset();
+     r2CounterFirst.reset();
+     r2CounterRand.reset();
+     pairCounter.reset();
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Packs klen consecutive bases into a long, shifting the accumulator left by two bits per base
+ * so that the first base ends up in the highest-order position.
+ * Each base is translated through the Dedupe.baseToNumber lookup table before being OR-ed in.
+ * @param bases Sequence to read from
+ * @param start Index of the first base of the kmer
+ * @param klen kmer length
+ * @return kmer
+ */
+ private final long toKmer(final byte[] bases, final int start, final int klen){
+     assert(start+klen<=bases.length);
+     long packed=0;
+     int pos=start;
+     for(int remaining=klen; remaining>0; remaining--){
+         packed<<=2;
+         packed|=Dedupe.baseToNumber[bases[pos]];
+         pos++;
+     }
+     return packed;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Writes the usage summary to outstream, one println per entry. */
+ private void printOptions(){
+     final String[] usage={
+         "Syntax:\n",
+         "java -ea -Xmx512m -cp <path> jgi.CalcUniqueness in=<infile> in2=<infile2> out=<outfile>",
+         "\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n",
+         "Other parameters and their defaults:\n",
+         "overwrite=false \tOverwrites files that already exist",
+         "interleaved=auto \tDetermines whether input file is considered interleaved",
+         "bin=25000 \t(interval) Number of reads per data point",
+         "qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto",
+         "k=20 \tKmer length"
+     };
+     for(String line : usage){
+         outstream.println(line);
+     }
+ }
+
+
+ /**
+ * Points randy at the calling thread's ThreadLocalRandom.
+ * The seed is accepted for interface compatibility only: ThreadLocalRandom does not allow
+ * a seed to be set, so the former randy.setSeed(seed) and randy.setSeed(System.nanoTime())
+ * calls are disabled and the value is ignored.
+ * @param seed Requested seed (unused)
+ */
+ public void setSampleSeed(long seed){
+     randy=java.util.concurrent.ThreadLocalRandom.current();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ //Input paths. in2 may be null, in which case it is skipped when the input size is measured.
+ private String in1=null;
+ private String in2=null;
+
+ //Output path
+ private String out=null;
+
+ //NOTE(review): presumably extension overrides for input/output format detection - confirm against the constructor
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ //Counters for hashtable hits and misses of different kmers
+ //Each is constructed with a distinct power of two (presumably a bit mask - TODO confirm in Counter).
+ //All seven are reported and then reset together by printCountsToBuffer.
+ private final Counter r1CounterFirst=new Counter(1);
+ private final Counter r1CounterRand=new Counter(2);
+ private final Counter r2CounterFirst=new Counter(4);
+ private final Counter r2CounterRand=new Counter(8);
+ private final Counter pairCounter=new Counter(16);
+
+ private final Counter bothCounterFirst=new Counter(32);
+ private final Counter bothCounterRand=new Counter(64);
+
+ /*--------------------------------------------------------------*/
+
+
+ private long maxReads=-1;
+ private float samplerate=1f;
+ private long sampleseed=-1;
+
+ private long interval=25000;
+ private boolean cumulative=false;
+ //Emit the percent columns in each output row
+ private boolean showPercents=true;
+ //Emit the miss-count columns in each output row
+ private boolean showCounts=false;
+ private boolean printLastBin=false;
+
+ private final int k, k2;
+ private static final int WAYS=31;
+ private static final int OFFSET=10;
+
+ /** Initial size of data structures */
+ private int initialSize=512000;
+
+ /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
+ private final AbstractKmerTable[] keySets;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout;
+
+
+ /*--------------------------------------------------------------*/
+
+ //Destination for status and timing messages
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ //When true at the end of processing, a RuntimeException is thrown
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+ //When true, the on-disk size of the input files is also reported with the timing summary
+ private boolean testsize=false;
+
+ private static final boolean useForest=false, useTable=false, useArray=true;
+
+ //Assigned by setSampleSeed; a ThreadLocalRandom cannot be seeded
+ private java.util.concurrent.ThreadLocalRandom randy;
+
+}
diff --git a/current/jgi/CallPeaks.java b/current/jgi/CallPeaks.java
new file mode 100755
index 0000000..4cc6982
--- /dev/null
+++ b/current/jgi/CallPeaks.java
@@ -0,0 +1,879 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ByteBuilder;
+import align2.LongList;
+import align2.ReadStats;
+import align2.Tools;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 28, 2014
+ *
+ */
+public class CallPeaks {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Program entry point. The timer is created before argument parsing so that
+ * the reported time includes construction.
+ * @param args Command-line arguments
+ */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     new CallPeaks(args).process(timer);
+ }
+
+ /**
+ * Parses key=value arguments and prepares the input and output file formats.
+ * Leading hyphens on keys are ignored, keys are case-insensitive, and the literal value "null" is treated as absent.
+ * If help is requested, usage is printed and the JVM exits with status 0.
+ * @param args Arguments; the first one may instead be a bare input path (an existing file, or a name starting with "stdin") without a key
+ */
+ public CallPeaks(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+ if(printClass){outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");}
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Only the text between the first and second '=' is kept as the value
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ //The verbose flag is propagated to the file-reading and file-writing classes
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("in")){
+ in=b;
+ }else if(a.equals("out")){
+ out=b;
+ }else if(a.equals("minheight") || a.equals("h")){
+ minHeight=Long.parseLong(b);
+ }else if(a.equals("minvolume") || a.equals("v")){
+ minVolume=Long.parseLong(b);
+ }else if(a.equals("minwidth") || a.equals("w")){
+ minWidth=Integer.parseInt(b);
+ }else if(a.equals("minpeak") || a.equals("minp")){
+ minPeak=Integer.parseInt(b);
+ }else if(a.equals("maxpeak") || a.equals("maxp")){
+ maxPeak=Integer.parseInt(b);
+ }else if(a.equals("maxpeakcount") || a.equals("maxpc") || a.equals("maxpeaks")){
+ maxPeakCount=Integer.parseInt(b);
+ }else if(a.equals("smoothradius")){
+ smoothRadius=Integer.parseInt(b);
+ }else if(a.equals("smoothprogressive")){
+ smoothProgressiveFlag=Tools.parseBoolean(b);
+ }else if(a.equals("maxradius")){
+ //maxRadius, progressiveMult and countColumn are static, so these settings affect every instance
+ maxRadius=Integer.parseInt(b);
+ }else if(a.equals("progressivemult")){
+ progressiveMult=Float.parseFloat(b);
+ }else if(a.equals("ploidy")){
+ ploidyClaimed=Integer.parseInt(b);
+ }else if(a.equals("column") || a.equals("col") || a.equals("countcolumn")){
+ countColumn=Integer.parseInt(b);
+ }else if(a.equals("k")){
+ k=Integer.parseInt(b);
+ }else if(in==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument is accepted as the input file
+ in=arg;
+ }else{
+ //The assertion only aborts when assertions are enabled (-ea); otherwise the parameter is reported and skipped
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+
+ //Default output name when none was given
+ if(out==null){out="stdout.txt";}
+
+ ffout=FileFormat.testOutput(out, FileFormat.TEXT, null, true, overwrite, append, false);
+ ffin=FileFormat.testInput(in, FileFormat.TEXT, null, true, false);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Runs the whole pipeline: loads the histogram from ffin, calls peaks, writes them,
+ * and reports the peak count and elapsed time on stderr.
+ * @param t Timer started by the caller; stopped here once the peaks are written
+ * @throws RuntimeException if errorState is set after writing
+ */
+ public void process(Timer t){
+     final LongList histogram=loadHistogram(ffin);
+     final ArrayList<Peak> called=callPeaks(histogram);
+
+     //The total is taken over the histogram's whole backing array
+     printPeaks(called, k, Tools.sum(histogram.array));
+
+     t.stop();
+     System.err.println("\nFound "+called.size()+" peaks in "+t);
+
+     if(!errorState){return;}
+     throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+
+ /**
+ * Convenience entry point: calls peaks on an in-memory histogram and writes them to fname.
+ * The settings are converted to key=value arguments and handed to a new CallPeaks instance.
+ * They are appended to list, so a list supplied by the caller is modified.
+ * @param array Histogram; array[i] is the count at depth i, and the full length is used
+ * @param fname Output file name
+ * @param ow Overwrite flag
+ * @param k Kmer length; values below 1 are replaced by 31 in the argument list
+ * @param ploidy Claimed ploidy; only passed on when positive
+ * @param list Extra arguments to pass on, or null
+ * @return The errorState of the CallPeaks instance after writing
+ */
+ public static boolean printPeaks(long[] array, String fname, boolean ow, long minHeight, long minVolume, int minWidth, int minPeak, int maxPeak, int maxPeakCount,
+ int k, int ploidy, ArrayList<String> list){
+     final ArrayList<String> argList=(list==null ? new ArrayList<String>() : list);
+     final int kArg=(k<1 ? 31 : k);
+
+     argList.add("out="+fname);
+     argList.add("ow="+ow);
+     argList.add("minheight="+minHeight);
+     argList.add("minvolume="+minVolume);
+     argList.add("minwidth="+minWidth);
+     argList.add("minpeak="+minPeak);
+     argList.add("maxpeak="+maxPeak);
+     argList.add("maxpeaks="+maxPeakCount);
+     argList.add("k="+kArg);
+     if(ploidy>0){
+         argList.add("ploidy="+ploidy);
+     }
+
+     final CallPeaks caller=new CallPeaks(argList.toArray(new String[0]));
+     final ArrayList<Peak> found=caller.callPeaks(array, array.length);
+     //NOTE(review): the caller's raw k is forwarded here, not the defaulted value - confirm this is intended
+     caller.printPeaks(found, k, Tools.sum(array));
+     return caller.errorState;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Reads a histogram from a whitespace-delimited text file.
+ * Lines starting with '#' are ignored. A line with a single token is appended as the next count;
+ * a line with several tokens stores the value in column countColumn at the index given by the first token.
+ * Lines are trimmed first and blank lines are skipped, because String.split would otherwise
+ * produce an empty first token and the number parsing would throw NumberFormatException.
+ * @param ff Input file
+ * @return The counts, indexed by depth
+ */
+ public static LongList loadHistogram(FileFormat ff){
+     LongList list=new LongList(8000);
+     TextFile tf=new TextFile(ff);
+     for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+         final String trimmed=line.trim();
+         if(trimmed.isEmpty() || trimmed.startsWith("#")){
+             //ignore blank lines and comments
+             continue;
+         }
+         String[] split=trimmed.split("\\s+");
+         if(split.length==1){
+             list.add(Long.parseLong(split[0]));
+         }else{
+             list.set(Integer.parseInt(split[0]), Long.parseLong(split[countColumn]));
+         }
+     }
+     boolean errorState_=tf.close();
+     assert(!errorState_) : "Encountered an error when reading "+ff.name()+".\n" +
+         "To skip this error message, run with the '-da' flag.";
+
+     return list;
+ }
+
+ /**
+ * Reduces a list of peaks by keeping the tallest and largest ones and folding the rest into their nearest kept neighbor.
+ * A peak is kept if its height reaches the maxCount-th largest height or its volume reaches the
+ * ((maxCount+1)/2)-th largest volume, so the result may contain more than maxCount peaks.
+ * Heights and volumes are the raw-count versions when callByRawCount is set.
+ * Kept Peak objects are modified in place when they absorb a neighbor.
+ * @param in Peaks to condense; returned unchanged if null or empty
+ * @param maxCount Target number of peaks; clamped to the range 1..in.size()
+ * @return A new list holding the kept peaks, in their original order
+ */
+ private static ArrayList<Peak> condense(ArrayList<Peak> in, int maxCount){
+ if(in==null || in.isEmpty()){return in;}
+ maxCount=Tools.max(Tools.min(in.size(), maxCount), 1);
+ ArrayList<Peak> out=new ArrayList<Peak>(Tools.min(maxCount, in.size()));
+
+ final long hlimit, vlimit;
+
+ //Height cutoff: the maxCount-th largest height
+ {
+ long[] heights=new long[in.size()];
+ for(int i=0; i<in.size(); i++){
+ Peak p=in.get(i);
+ heights[i]=(callByRawCount ? p.centerHeight2() : p.centerHeight);
+ }
+ Arrays.sort(heights);
+ hlimit=heights[heights.length-maxCount];
+ }
+
+ //Volume cutoff: the mc2-th largest volume, where mc2 is half of maxCount rounded up
+ {
+ int mc2=(maxCount+1)/2;
+ long[] volumes=new long[in.size()];
+ for(int i=0; i<in.size(); i++){
+ Peak p=in.get(i);
+ volumes[i]=(callByRawCount ? p.volume2 : p.volume);
+ }
+ Arrays.sort(volumes);
+ vlimit=volumes[volumes.length-mc2];
+ }
+
+ //First pass: keep every peak that meets either cutoff
+ for(Peak p : in){
+ final long height=(callByRawCount ? p.centerHeight2() : p.centerHeight);
+ final long volume=(callByRawCount ? p.volume2 : p.volume);
+ if(volume>=vlimit || height>=hlimit){out.add(p);}
+ }
+
+ //Second pass: merge each rejected peak into the kept peak with the closest center, if compatible
+ for(Peak p : in){
+ final long height=(callByRawCount ? p.centerHeight2() : p.centerHeight);
+ final long volume=(callByRawCount ? p.volume2 : p.volume);
+ if(volume<vlimit && height<hlimit){
+ Peak p2=out.get(0);
+ for(Peak temp : out){
+ if(Tools.absdif(p.center, temp.center)<Tools.absdif(p.center, p2.center)){
+ p2=temp;
+ }
+ }
+ if(p2.compatibleWith(p)){
+ p2.absorb(p);
+ }//else discard
+ }
+ }
+
+ return out;
+ }
+
+ /**
+ * Narrows each peak so that it spans no further than center/maxWidthMult on the left and
+ * center*maxWidthMult on the right, then recomputes its heights and volumes from counts.
+ * @param peaks Peaks to adjust in place
+ * @param maxWidthMult Width multiplier (this parameter shadows the field of the same name)
+ * @param counts Histogram used to recalculate each peak
+ */
+ private void capWidth(ArrayList<Peak> peaks, float maxWidthMult, long[] counts){
+     final float shrink=1/maxWidthMult;
+     for(int i=0; i<peaks.size(); i++){
+         final Peak peak=peaks.get(i);
+         final float lowest=peak.center*shrink;
+         final float highest=peak.center*maxWidthMult;
+         peak.start=(int)Math.round(Tools.max(peak.start, lowest));
+         peak.stop=(int)Math.round(Tools.min(peak.stop, highest));
+         peak.recalculate(counts);
+     }
+ }
+
+ /**
+ * Writes the peak report to ffout: a block of '#'-prefixed summary statistics followed by one row per peak.
+ * Does nothing if ffout is null. Any exception raised while computing or writing the summary block is printed
+ * and the method continues with the per-peak table, so the summary may be missing or incomplete.
+ * The writer's error status is OR-ed into errorState.
+ * @param peaks Peaks to print
+ * @param k Kmer length; the #k line is printed only when positive
+ * @param uniqueKmers Value printed as #unique_kmers
+ */
+ private void printPeaks(ArrayList<Peak> peaks, int k, long uniqueKmers){
+ if(ffout==null){return;}
+ ByteStreamWriter bsw=new ByteStreamWriter(ffout);
+ bsw.start();
+
+ if(peaks.size()>0){
+ try {
+ final Peak p0=peaks.get(0);
+ final int center0=p0.center;
+ final int ploidyEstimate=calcPloidy(peaks);
+ //A claimed ploidy, when given, takes precedence over the estimate
+ final int ploidy=ploidyClaimed>0 ? ploidyClaimed : ploidyEstimate;
+ final long genomeSize=genomeSize(peaks);
+ final long repeatSize=repeatSize(peaks, ploidy);
+ final long haploidSize=genomeSize/ploidy;
+ final long hetLocs=calcHetLocations(peaks, ploidy, k);
+ final double hetRate=hetLocs/(double)haploidSize;
+ final double repeatRate=repeatSize*1.0/genomeSize;
+
+ //mainPeak: the peak with the greatest volume.
+ //ploidyPeak: the peak whose center is closest to target (first peak's center times ploidy).
+ Peak ploidyPeak=p0, mainPeak=p0;
+ int target=center0*ploidy, haploidCov;
+ for(Peak p : peaks){
+ if(p.volume>mainPeak.volume){
+ mainPeak=p;
+ }
+ if(Tools.absdif(p.center, target)<Tools.absdif(ploidyPeak.center, target)){
+ ploidyPeak=p;
+ }
+ }
+ //Use the observed peak center if it is within a factor of 1.3 of the target; otherwise use the target itself
+ if(Tools.max(target,ploidyPeak.center)/(float)Tools.min(target,ploidyPeak.center)<1.3f){
+ haploidCov=ploidyPeak.center;
+ }else{
+ haploidCov=target;
+ }
+
+// System.err.println("ploidyEstimate="+ploidyEstimate);
+// System.err.println("genomeSize="+genomeSize);
+// System.err.println("repeatSize="+repeatSize);
+// System.err.println("haploidSize="+haploidSize);
+// System.err.println("hetLocs="+hetLocs);
+// System.err.println("biggestPeak="+biggestPeak(peaks));
+// System.err.println("homozygousPeak("+ploidy+")="+homozygousPeak(peaks, ploidy));
+// System.err.println("homozygousPeak("+ploidyEstimate+")="+homozygousPeak(peaks, ploidyEstimate));
+
+ if(ploidy!=ploidyEstimate){
+ System.err.println("Warning - ploidy detected at "+ploidyEstimate+" differs from stated ploidy of "+ploidyClaimed);
+ }
+
+ //The '|| true' conditions below make the haploid lines print for every ploidy
+ if(k>0){bsw.println("#k\t"+k);}
+ bsw.println("#unique_kmers\t"+uniqueKmers);
+ bsw.println("#main_peak\t"+mainPeak.center);
+ bsw.println("#genome_size\t"+genomeSize);
+ if(ploidy>1 || true){bsw.println("#haploid_genome_size\t"+(genomeSize/ploidy));}
+ bsw.println("#fold_coverage\t"+center0);
+ if(ploidy>1 || true){bsw.println("#haploid_fold_coverage\t"+haploidCov);}
+ bsw.println("#ploidy\t"+ploidy);
+ if(ploidy!=ploidyEstimate){bsw.println("#ploidy_detected\t"+ploidyEstimate);}
+ if(ploidy>1){bsw.println("#het_rate\t"+String.format("%.5f", hetRate));}
+ bsw.println("#percent_repeat\t"+String.format("%.3f", (100*repeatRate)));
+ } catch (Exception e) {
+ //NOTE(review): the failure is printed but errorState is not set here - confirm this is intended
+ e.printStackTrace();
+ }
+ }
+
+ //Per-peak table; the buffer is reused for every row
+ bsw.println("#start\tcenter\tstop\tmax\tvolume");
+ ByteBuilder bb=new ByteBuilder(200);
+ for(Peak p : peaks){
+ p.toBytes(bb);
+ bsw.println(bb);
+ bb.setLength(0);
+ }
+ errorState|=bsw.poisonAndWait();
+ }
+
+ /**
+ * Estimates genome size as the sum over all peaks of volume times copy number, where a peak's
+ * copy number is its center divided by the first peak's center, rounded to the nearest integer.
+ * @param peaks Called peaks
+ * @return The estimate, or 0 for an empty list
+ */
+ private long genomeSize(ArrayList<Peak> peaks){
+     if(peaks.isEmpty()){return 0;}
+
+     //Guard against a zero center in the first peak
+     final double perCopy=1.0/(Tools.max(1, peaks.get(0).center));
+     long total=0;
+     for(int i=0, n=peaks.size(); i<n; i++){
+         final Peak peak=peaks.get(i);
+         final long copies=Math.round(peak.center*perCopy);
+         total+=peak.volume*copies;
+     }
+     return total;
+ }
+
+ /**
+ * Sums volume times copy number over every peak that comes after the homozygous peak.
+ * Copy number is the peak center divided by the first peak's center, rounded to the nearest integer.
+ * @param peaks Called peaks
+ * @param ploidy Ploidy used to locate the homozygous peak; must be positive
+ * @return The sum, or 0 when there are fewer than two peaks
+ */
+ private long repeatSize(ArrayList<Peak> peaks, int ploidy){
+     if(peaks.size()<2){return 0;}
+     assert(ploidy>0) : ploidy;
+
+     final int firstRepeat=homozygousPeak(peaks, ploidy)+1;
+     final double perCopy=1.0/(Tools.max(1, peaks.get(0).center));
+
+     long total=0;
+     for(Peak peak : peaks.subList(firstRepeat, peaks.size())){
+         final long copies=Math.round(peak.center*perCopy);
+         total+=peak.volume*copies;
+     }
+     return total;
+ }
+
+ /**
+ * Finds the peak with the greatest volume; the earliest one wins a tie.
+ * @param peaks Called peaks
+ * @return Its index; 0 for a single peak and -1 for an empty list
+ */
+ private int biggestPeak(ArrayList<Peak> peaks){
+     final int count=peaks.size();
+     if(count<2){return count-1;}
+
+     int best=0;
+     for(int i=1; i<count; i++){
+         if(peaks.get(i).volume>peaks.get(best).volume){
+             best=i;
+         }
+     }
+     return best;
+ }
+
+ /**
+ * Finds the peak with the second-greatest volume in a single pass, tracking the two largest seen so far.
+ * @param peaks Called peaks
+ * @return Its index; for fewer than two peaks, peaks.size()-1 (0 or -1)
+ */
+ private int secondBiggestPeak(ArrayList<Peak> peaks){
+ if(peaks.size()<2){return peaks.size()-1;}
+
+ //Seed with the first two peaks, ordered so that biggest has the larger volume
+ Peak biggest=peaks.get(0);
+ Peak second=peaks.get(1);
+ int bloc=0;
+ int sloc=1;
+ if(second.volume>biggest.volume){
+ Peak temp=second;
+ second=biggest;
+ biggest=temp;
+ bloc=1;
+ sloc=0;
+ }
+ for(int i=2; i<peaks.size(); i++){
+ Peak p=peaks.get(i);
+ if(p.volume>second.volume){
+ //p displaces the current runner-up...
+ sloc=i;
+ second=p;
+ if(second.volume>biggest.volume){
+ //...and if it also beats the leader, the old leader becomes the runner-up
+ Peak temp=second;
+ second=biggest;
+ biggest=temp;
+ sloc=bloc;
+ bloc=i;
+ }
+ }
+ }
+ return sloc;
+ }
+
+ /**
+ * Finds the peak whose center is closest to ploidy times the first peak's center.
+ * The earliest peak wins a tie.
+ * @param peaks Called peaks
+ * @param ploidy Ploidy multiplier; must be positive
+ * @return Its index; for fewer than two peaks, peaks.size()-1 (0 or -1)
+ */
+ private int homozygousPeak(ArrayList<Peak> peaks, final int ploidy){
+     if(peaks.size()<2){return peaks.size()-1;}
+     assert(ploidy>0) : ploidy;
+
+     final int target=peaks.get(0).center*ploidy;
+     int closest=0;
+     int smallestGap=Integer.MAX_VALUE;
+     int index=0;
+     for(Peak peak : peaks){
+         final int gap=Tools.absdif(target, peak.center);
+         if(gap<smallestGap){
+             smallestGap=gap;
+             closest=index;
+         }
+         index++;
+     }
+     return closest;
+ }
+
+ /**
+ * Estimates ploidy from the ratio of peak centers.
+ * If the largest-volume peak is not the first peak, the estimate is its center divided by the
+ * first peak's center. If the first peak is the largest, the second-largest peak is used instead,
+ * unless its volume is under a quarter of the largest, in which case the estimate is 1.
+ * The ratio is truncated toward zero.
+ * @param peaks Called peaks
+ * @return Estimated ploidy; 1 when there are fewer than two peaks
+ */
+ private int calcPloidy(ArrayList<Peak> peaks){
+     if(peaks.size()<2){return 1;}
+
+     final Peak first=peaks.get(0);
+     final Peak largest=peaks.get(biggestPeak(peaks));
+
+     final Peak numerator;
+     if(largest==first){
+         final Peak runnerUp=peaks.get(secondBiggestPeak(peaks));
+         if(runnerUp.volume*4<largest.volume){return 1;} //Probably the runner-up is a repeat peak.
+         numerator=runnerUp;
+     }else{
+         numerator=largest;
+     }
+     return (int)(numerator.center/(float)first.center);
+ }
+
+ /**
+ * Estimates the number of heterozygous locations by summing the volume of the peaks before the
+ * homozygous peak whose copy count is at most ploidy/2, then dividing by the kmer length.
+ * Summation stops at the first peak whose copy count exceeds that limit.
+ * A kmer length below 1 is replaced by 31, the default used elsewhere in this class. Without
+ * that, k=0 raised an ArithmeticException that the caller swallowed (losing the report header)
+ * and a negative k produced a negative count.
+ * @param peaks Called peaks
+ * @param ploidy Ploidy; must be positive
+ * @param k Kmer length
+ * @return Estimated heterozygous locations; 0 when there are fewer than two peaks
+ */
+ private long calcHetLocations(ArrayList<Peak> peaks, final int ploidy, final int k){
+     if(peaks.size()<2){return 0;}
+     assert(ploidy>0) : ploidy;
+     final int homozygousLoc=homozygousPeak(peaks, ploidy);
+     final Peak homoPeak=peaks.get(homozygousLoc);
+     long sum=0;
+     final int lim=ploidy/2;
+     for(int i=0; i<homozygousLoc; i++){
+         final Peak p=peaks.get(i);
+         final int copyCount=Math.round((p.center*ploidy)/(float)homoPeak.center);
+         if(copyCount>lim){break;}
+         sum=sum+p.volume;
+     }
+     //Never divide by a non-positive kmer length
+     final int divisor=(k>0 ? k : 31);
+     return sum/divisor;
+ }
+
+ /**
+ * Calls peaks on the used portion of a LongList.
+ * @param list Histogram; only the first list.size entries of its backing array are scanned
+ * @return The called peaks
+ */
+ public ArrayList<Peak> callPeaks(LongList list){
+     final long[] counts=list.array;
+     final int used=list.size;
+     return callPeaks(counts, used);
+ }
+
+ /**
+ * Finds peaks (local maxima bounded by local minima) in a histogram.
+ * The histogram is optionally smoothed, then scanned left to right with a two-state machine (rising/falling).
+ * Each rise-then-fall span that passes the minPeak/maxPeak/minHeight/minWidth/minVolume filters becomes a Peak.
+ * Afterwards peak widths are capped, the list is condensed if it exceeds maxPeakCount, leading peaks with
+ * negligible volume are dropped, and, if smoothing was applied, peak statistics are recomputed from the unsmoothed data.
+ * @param original Histogram counts indexed by depth
+ * @param length Number of leading entries of the array to scan
+ * @return The called peaks, possibly empty
+ */
+ public ArrayList<Peak> callPeaks(long[] original, int length){
+
+ final long[] array;
+ if(smoothRadius>0){
+ if(smoothProgressiveFlag){
+ array=smoothProgressive(original, smoothRadius);
+ }else{
+ array=smooth(original, smoothRadius);
+ }
+ }else{
+ array=original;
+ }
+
+ ArrayList<Peak> peaks=new ArrayList<Peak>();
+
+ //Find the first index after which the counts increase; scanning starts there.
+ int dip0=-1;
+ for(int i=1; i<length; i++){
+ if(array[i-1]<array[i]){
+ dip0=i-1;
+ break;
+ }
+ }
+ //The counts never increase, so there is no peak
+ if(dip0<0){return peaks;}
+// assert(false) : dip0+", "+array[dip0);
+
+ final int UP=0, DOWN=1;
+ int mode=UP;
+ int start=dip0, center=-1;
+ long prev=array[dip0];
+ //sum accumulates counts and sum2 accumulates count*index over the current span
+ long sum=prev;
+ long sum2=prev*dip0;
+ for(int i=dip0+1; i<length; i++){
+ final long x=array[i];
+
+// if(i<16){System.err.println("i="+i+", x="+x+", mode="+mode+", center="+center+", start="+start+", dip0="+dip0);}
+
+ if(mode==UP){
+ if(x<prev){
+ //First decrease after a rise: the previous index is the peak's maximum
+ mode=DOWN;
+ center=i-1;
+ }
+ }else{
+ if(x>prev){
+ //First increase after a fall: the previous index closes the current peak
+ mode=UP;
+ int stop=i-1;
+ long max=array[center];
+ if(center>=minPeak && center<=maxPeak && max>=minHeight && (stop-start)>=minWidth && sum>=minVolume){
+ for(int j=center-1; j>=0; j--){//find middle of mesas
+ if(array[j]!=max){
+ center=(center+j+2)/2;
+ break;
+ }
+ }
+ {
+ long valley=array[stop];
+ for(int j=stop; j>=0; j--){//find middle of valleys
+ if(array[j]!=valley){
+ //A zero-valued valley ends the peak at the first zero; otherwise stop moves toward the valley's middle
+ if(valley==0){stop=j+1;}
+ else{stop=(stop+j+2)/2;}
+ break;
+ }
+ }
+ }
+
+ Peak p=new Peak(center, start, stop, max, array[start], array[stop], sum, sum2);
+ peaks.add(p);
+ }else{
+// Peak p=new Peak(center, start, stop, max, sum);
+// System.err.println("*"+p);
+ }
+ //The next span begins where this one ended, with fresh accumulators
+ start=stop;
+ stop=-1;
+ sum=sum2=0;
+ center=-1;
+ if(i>maxPeak){break;}
+ while(i<array.length && array[i]==0){i++;}//Skip zero regions
+ }
+ }
+
+ sum+=x;
+ sum2+=(x*i);
+ prev=x;
+ }
+
+ //The scan ended while falling: close the last peak at the end of the data
+ if(mode==DOWN){
+ int stop=length;
+ long max=array[center];
+ for(int j=center-1; j>=0; j--){//find middle of mesas
+ if(array[j]!=max){
+ center=(center+j+2)/2;
+ break;
+ }
+ }
+ {
+ long valley=array[stop-1];
+ for(int j=stop-1; j>=0; j--){//find middle of valleys
+ if(array[j]!=valley){
+ if(valley==0){stop=j+1;}
+ else{stop=(stop+j+2)/2;}
+ break;
+ }
+ }
+ }
+ if(center>=minPeak && center<=maxPeak && max>=minHeight && (stop-start)>=minWidth && sum>=minVolume){
+ //stop may equal length here, so the index used for the stop height is clamped
+ Peak p=new Peak(center, start, stop, max, array[start], array[Tools.min(stop, length-1)], sum, sum2);
+ peaks.add(p);
+ }else{
+// Peak p=new Peak(center, start, stop, max, sum);
+// System.err.println("*"+p);
+ }
+ }
+
+ capWidth(peaks, maxWidthMult, array);
+
+ if(maxPeakCount<peaks.size()){
+ peaks=condense(peaks, maxPeakCount);
+ }
+
+ //Condensing can widen peaks through absorption, so the cap is applied again
+ capWidth(peaks, maxWidthMult, array);
+
+ //Drop leading peaks whose volume is under 0.01% of the biggest peak's volume
+ if(peaks.size()>1){
+ Peak biggest=peaks.get(biggestPeak(peaks));
+ while(peaks.size()>1 && peaks.get(0).volume<0.0001*biggest.volume){
+ peaks.remove(0);
+ }
+ }
+
+ //Report heights and volumes from the unsmoothed data
+ if(array!=original){
+ recalculate(peaks, original);
+ }
+
+ return peaks;
+ }
+
+ /**
+ * Recomputes the heights and volumes of every peak from the given histogram.
+ * @param peaks Peaks to update in place
+ * @param array Histogram to read from
+ */
+ private static void recalculate(ArrayList<Peak> peaks, long[] array){
+     for(int i=0; i<peaks.size(); i++){
+         peaks.get(i).recalculate(array);
+     }
+ }
+
+ /**
+ * Prints a pointer to the usage documentation on stderr.
+ */
+ private static void printOptions(){
+     final String message="Please consult the shellscript for usage information.";
+     System.err.println(message);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Smoothing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Smooths a histogram with a triangular window whose radius grows along the array.
+ * The radius starts at radius0 and increases by one each time the index passes a threshold;
+ * the threshold starts at 5 and becomes ceil(1+threshold*progressiveMult) after each increase.
+ * Once the radius exceeds maxRadius it stops growing.
+ * @param data Histogram to smooth; not modified
+ * @param radius0 Initial window radius
+ * @return A new array of the same length holding the rounded weighted averages
+ */
+ public static long[] smoothProgressive(final long[] data, int radius0){
+     int radius=radius0;
+     //The window weights sum to radius squared
+     double scale=1.0/(radius*radius);
+     final long[] result=new long[data.length];
+
+     int threshold=5;
+     int i=0;
+     while(i<data.length){
+         result[i]=Math.round(sumPoint(data, i, radius)*scale);
+         if(i>threshold){
+             threshold=(int)Math.ceil(1+threshold*progressiveMult);
+             radius++;
+             scale=1.0/(radius*radius);
+             if(radius>maxRadius){threshold=Integer.MAX_VALUE;}
+         }
+         i++;
+     }
+     return result;
+ }
+
+ /**
+ * Smooths a histogram with a fixed-radius triangular window.
+ * @param data Histogram to smooth; not modified
+ * @param radius Window radius
+ * @return A new array of the same length holding the rounded weighted averages
+ */
+ public static long[] smooth(final long[] data, int radius){
+     //The window weights sum to radius squared
+     final double scale=1.0/(radius*radius);
+     final long[] result=new long[data.length];
+     int i=0;
+     while(i<result.length){
+         result[i]=Math.round(sumPoint(data, i, radius)*scale);
+         i++;
+     }
+     return result;
+ }
+
+ /**
+ * Computes the triangular-weighted sum of the values around loc.
+ * The value at distance d from loc has weight radius-d, for d from 0 to radius-1 on both sides.
+ * Positions left of loc are clamped to index 0 and positions from loc rightward are clamped
+ * to the last index.
+ * @param data Values to sum
+ * @param loc Center of the window
+ * @param radius Window radius; a radius below 1 yields 0
+ * @return The weighted sum
+ */
+ private static long sumPoint(long[] data, int loc, int radius){
+     long total=0;
+     final int last=data.length-1;
+     for(int offset=1-radius; offset<radius; offset++){
+         final int pos=loc+offset;
+         final int idx=(offset<0 ? Tools.max(pos, 0) : Tools.min(pos, last));
+         final int weight=radius-Math.abs(offset);
+         total+=data[idx]*weight;
+     }
+     return total;
+ }
+
+ /**
+ * Smooths an int histogram with a triangular window whose radius grows along the array.
+ * The radius starts at radius0 and increases by one each time the index passes a threshold;
+ * the threshold starts at 5 and doubles after each increase. Once the radius exceeds 10 it stops growing.
+ * These growth constants are fixed here and do not follow the progressiveMult and maxRadius settings.
+ * @param data Histogram to smooth; not modified
+ * @param radius0 Initial window radius
+ * @return A new array of the same length holding the rounded weighted averages
+ */
+ public static int[] smoothProgressive(final int[] data, int radius0){
+     int radius=radius0;
+     //The window weights sum to radius squared
+     double scale=1.0/(radius*radius);
+     final int[] result=new int[data.length];
+
+     int threshold=5;
+     int i=0;
+     while(i<data.length){
+         result[i]=(int)Math.round(sumPoint(data, i, radius)*scale);
+         if(i>threshold){
+             threshold=(int)Math.ceil(threshold*2);
+             radius++;
+             scale=1.0/(radius*radius);
+             if(radius>10){threshold=Integer.MAX_VALUE;}
+         }
+         i++;
+     }
+     return result;
+ }
+
+ /**
+ * Smooths an int histogram with a fixed-radius triangular window.
+ * @param data Histogram to smooth; not modified
+ * @param radius Window radius
+ * @return A new array of the same length holding the rounded weighted averages
+ */
+ public static int[] smooth(final int[] data, int radius){
+     //The window weights sum to radius squared
+     final double scale=1.0/(radius*radius);
+     final int[] result=new int[data.length];
+     int i=0;
+     while(i<result.length){
+         result[i]=(int)Math.round(sumPoint(data, i, radius)*scale);
+         i++;
+     }
+     return result;
+ }
+
+ /**
+ * Computes the triangular-weighted sum of the values around loc.
+ * The value at distance d from loc has weight radius-d, for d from 0 to radius-1 on both sides.
+ * Positions left of loc are clamped to index 0 and positions from loc rightward are clamped
+ * to the last index.
+ * Each product is computed in 64-bit arithmetic; an int*int product would overflow for large
+ * counts before being added to the long total.
+ * @param data Values to sum
+ * @param loc Center of the window
+ * @param radius Window radius; a radius below 1 yields 0
+ * @return The weighted sum
+ */
+ private static long sumPoint(int[] data, int loc, int radius){
+     long sum=0;
+     final int start=loc-radius+1;
+     final int stop=loc+radius-1;
+     //Left side: weights rise from 1 to radius-1
+     for(int i=start, x=1; i<loc; i++, x++){
+         final int i2=Math.max(i, 0);
+         sum+=data[i2]*(long)x;
+     }
+     //Center and right side: weights fall from radius to 1
+     for(int i=loc, x=radius, max=data.length-1; i<=stop; i++, x--){
+         final int i2=Math.min(i, max);
+         sum+=data[i2]*(long)x;
+     }
+     return sum;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nested Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * One peak of the histogram: a span [start, stop) with a maximum at center,
+ * plus the heights at those three positions and the summed counts over the span.
+ */
+ private class Peak{
+
+     /**
+      * @param center_ Index of the maximum
+      * @param start_ First index of the span (inclusive)
+      * @param stop_ End of the span (exclusive)
+      * @param centerHeight_ Count at center
+      * @param startHeight_ Count at start
+      * @param stopHeight_ Count at stop
+      * @param volume_ Sum of counts over the span
+      * @param volume2_ Sum of count*index over the span
+      */
+     Peak(int center_, int start_, int stop_, long centerHeight_, long startHeight_, long stopHeight_, long volume_, long volume2_){
+
+         center=center_;
+         start=start_;
+         stop=stop_;
+
+         centerHeight=centerHeight_;
+         startHeight=startHeight_;
+         stopHeight=stopHeight_;
+         volume=volume_;
+         volume2=volume2_;
+
+         assert(center>=0) : this;
+         assert(start<center) : this;
+         assert(stop>center) : this;
+     }
+
+     /**
+      * Tests whether p is close enough to this peak to be merged into it.
+      * @param p Candidate peak
+      * @return true if min(center, p.stop)*maxWidthMult is at least max(stop, p.center)
+      */
+     public boolean compatibleWith(Peak p) {
+         int min=Tools.min(center, p.stop);
+         int max=Tools.max(stop, p.center);
+         return min*maxWidthMult>=max;
+     }
+
+     /**
+      * Recomputes the three heights and both volumes from the given histogram,
+      * keeping start, center and stop unchanged.
+      * Because stop is exclusive it can equal array.length for a peak that runs to the end of
+      * the data, so the index used for the stop height is clamped to the last element.
+      * @param array Histogram to read from
+      */
+     public void recalculate(long[] array) {
+         centerHeight=array[center];
+         startHeight=array[start];
+         stopHeight=array[Tools.min(stop, array.length-1)];
+         volume=0;
+         volume2=0;
+         for(int i=start; i<stop; i++){
+             long x=array[i];
+             volume+=x;
+             volume2+=(x*i);
+         }
+     }
+
+     /**
+      * Merges p into this peak: the span is extended toward p, the center moves to p's center
+      * if p is taller (by raw count when callByRawCount is set), and the volumes are added.
+      * @param p Peak to absorb
+      */
+     public void absorb(Peak p) {
+
+         if(center>p.center){
+             //p lies to the left, so only the start can move
+             assert(p.stop<stop) : "\n"+this+"\n"+p+"\n";
+             if(start>p.start){
+                 start=p.start;
+                 startHeight=p.startHeight;
+             }
+         }else{
+             //p lies to the right (or shares the center), so only the stop can move
+             assert(p.start>start) : "\n"+this+"\n"+p+"\n";
+             if(stop<p.stop){
+                 stop=p.stop;
+                 stopHeight=p.stopHeight;
+             }
+         }
+
+         long c1=callByRawCount ? centerHeight2() : centerHeight;
+         long c2=callByRawCount ? p.centerHeight2() : p.centerHeight;
+
+         if(c1<c2){
+             center=p.center;
+             centerHeight=p.centerHeight;
+         }
+
+         volume+=p.volume;
+         volume2+=p.volume2;
+
+     }
+
+     /** @return Length of the span, stop minus start */
+     int width(){return stop-start;}
+
+     /** @return Tab-delimited start, center, stop, centerHeight and volume */
+     @Override
+     public String toString(){
+         return start+"\t"+center+"\t"+stop+"\t"+centerHeight+"\t"+volume;
+     }
+
+     /**
+      * Appends start, center, stop, centerHeight and volume to bb, each followed by a tab
+      * (including the last field).
+      * @param bb Buffer to append to; a new one is created if null
+      * @return The buffer that was written
+      */
+     public ByteBuilder toBytes(ByteBuilder bb){
+         if(bb==null){bb=new ByteBuilder();}
+         bb.append(start);
+         bb.append('\t');
+         bb.append(center);
+         bb.append('\t');
+         bb.append(stop);
+         bb.append('\t');
+         bb.append(centerHeight);
+         bb.append('\t');
+         bb.append(volume);
+         bb.append('\t');
+         return bb;
+     }
+
+     /** Inclusive */
+     public int start;
+     public int center;
+     /** Exclusive */
+     public int stop;
+
+     //Unique counts
+     public long startHeight;
+     public long centerHeight;
+     public long stopHeight;
+     public long volume;
+
+     /** Sum of count*index over the span */
+     public long volume2;
+
+     //Raw counts
+     public long startHeight2(){return startHeight*start;}
+     public long centerHeight2(){return centerHeight*center;}
+     public long stopHeight2(){return stopHeight*stop;}
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ //Filters applied to each candidate peak in callPeaks
+ //Minimum count at the peak's maximum
+ private long minHeight=2;
+ //Minimum summed count over the peak's span
+ private long minVolume=2;
+ //Minimum span length (stop minus start)
+ private int minWidth=3;
+ //Lowest and highest index at which a peak's maximum may lie; scanning also stops once past maxPeak
+ private int minPeak=2;
+ private int maxPeak=Integer.MAX_VALUE;
+ //When more peaks than this are found, the list is condensed
+ private int maxPeakCount=10;
+ //Limits how far a peak may extend from its center (see capWidth) and which peaks may be merged
+ private float maxWidthMult=2.5f;
+ //Smoothing is applied only when this is positive
+ private int smoothRadius=0;
+ //Selects smoothProgressive instead of smooth
+ private boolean smoothProgressiveFlag=false;
+ //Kmer length passed to the report
+ private int k=31;
+
+ //User-stated ploidy; when positive it overrides the estimate
+ private int ploidyClaimed=-1;
+
+ //Input and output file names
+ private String in;
+ private String out;
+
+ private final FileFormat ffin;
+ private final FileFormat ffout;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ //Growth settings read by smoothProgressive(long[], int): the radius stops growing once it exceeds maxRadius
+ public static int maxRadius=10;
+ public static float progressiveMult=2;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ //Zero-based column of the count in multi-column histogram lines
+ private static int countColumn=1;
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ //Set when writing the output reported a failure
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+ //When true, the constructor prints the class name and arguments
+ public static boolean printClass=true;
+
+ //When true, peaks are compared by height*index and volume2 rather than by height and volume
+ public static boolean callByRawCount=true;
+
+}
diff --git a/current/jgi/CorrelateBarcodes.java b/current/jgi/CorrelateBarcodes.java
new file mode 100755
index 0000000..7d5a97e
--- /dev/null
+++ b/current/jgi/CorrelateBarcodes.java
@@ -0,0 +1,471 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class CorrelateBarcodes {
+
+ /**
+  * Program entry point. Starts a timer, builds a CorrelateBarcodes
+  * instance from the command line, and runs it.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+ 	//The timer is created first so that it also covers argument parsing.
+ 	final Timer timer=new Timer();
+ 	final CorrelateBarcodes correlator=new CorrelateBarcodes(args);
+ 	correlator.process(timer);
+ }
+
+ /**
+  * Parses the command line, applies global (static) stream and compression
+  * settings, validates the input/output combination, and creates the
+  * FileFormat objects used later by process().
+  * Note that many of the settings changed here are static fields of other
+  * classes (Shared, ReadWrite, FASTQ, ByteFile, ReadStats).
+  * @param args Command-line arguments, typically of the form key=value
+  * @throws RuntimeException If no input file is given, if out2 is given
+  * without out1, or if the read output files fail Tools.testOutputFiles
+  */
+ public CorrelateBarcodes(String[] args){
+
+ //Let the parser preprocess the arguments; if help was requested, print options and exit with status 0.
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //When read output goes to stdout, messages are sent to System.err (which is also outstream's initial value).
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+
+ //Global settings: cap the read buffer length at 200, cap buffers, and enable pigz/unpigz.
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Split into key (a) and value (b); only the text between the first and second '=' is kept as the value.
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ //A literal "null" value is treated the same as a missing value.
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ //The shared parser gets the first chance at each argument; flags specific to this class follow.
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ //Propagate verbosity to the static verbose flags of the I/O classes.
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("outcor") || a.equals("cor")){
+ outcor=b;
+ }else if(a.equals("bqhist")){
+ bqhist=b;
+ }else if(a.equals("baqhist")){//average quality
+ aqhist=b;
+ }else if(a.equals("bmqhist")){//minimum quality
+ mqhist=b;
+ }else if(a.equals("mmq")){//minimum min quality cutoff
+ minBarcodeMinQuality=Integer.parseInt(b);
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument that is "stdin..." or an existing file is taken as the primary input.
+ parser.in1=arg;
+ }else{
+ //Unknown arguments are reported; they only abort the program when assertions are enabled.
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ //The overwrite/append choices are also copied into ReadStats' static fields.
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+
+ //The parser's minimum average quality is used here as the cutoff for the barcode's average quality.
+ minBarcodeAverageQuality=parser.minAvgQuality;
+ }
+
+ //If in1 contains '#' and is not itself an existing file, expand it into a pair by substituting 1 and 2.
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ //The same '#' expansion for the output name (without the file-existence test).
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ //Two input files imply the input is not interleaved.
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ //Unless a ByteFile mode was already forced, force BF2 when more than 2 threads are available.
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+
+ //If interleaving was not set explicitly, derive it from the number of input and output files.
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ //One input but two outputs: the single input is forced to be treated as interleaved.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //An output literally named "null" means no output.
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ //Only the read outputs (out1, out2) are checked here; the histogram and correlation files are not.
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Read outputs default to FASTQ format.
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ //Statistics outputs default to text format. NOTE(review): extout is passed here as well - confirm that is intended for text files.
+ ffcor=FileFormat.testOutput(outcor, FileFormat.TEXT, extout, true, overwrite, append, false);
+ ffaq=FileFormat.testOutput(aqhist, FileFormat.TEXT, extout, true, overwrite, append, false);
+ ffmq=FileFormat.testOutput(mqhist, FileFormat.TEXT, extout, true, overwrite, append, false);
+
+ //Read inputs default to FASTQ format.
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /**
+  * Streams all reads from the input, extracts the barcode bases and barcode
+  * qualities from each read header, and accumulates:
+  * a histogram of average barcode quality, a histogram of minimum barcode
+  * quality, and the correlation between read average quality and barcode
+  * average quality (separately for read 1 and read 2).
+  * Reads whose barcode falls below either quality cutoff are marked as
+  * discarded and omitted from the read output.
+  * Headers are expected to look like {@code <barcode bases>_<barcode qualities>},
+  * where the qualities are ASCII characters with an offset of 33.
+  * Finally the requested statistics files are written and a summary is
+  * printed to outstream.
+  * @param t Timer started by the caller; it is stopped here to report throughput
+  * @throws RuntimeException If a read header contains no '_' separator, or if
+  * any stream or writer finished in an error state
+  */
+ void process(Timer t){
+
+ 	final ConcurrentReadInputStream cris;
+ 	{
+ 		cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ 		if(verbose){outstream.println("Started cris");}
+ 		cris.start(); //4567
+ 	}
+ 	final boolean paired=cris.paired();
+ 	if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ 	final ConcurrentReadOutputStream ros;
+ 	if(out1!=null){
+ 		final int buff=4;
+
+ 		if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+ 			outstream.println("Writing interleaved.");
+ 		}
+
+ 		//Each output name is compared against BOTH input names.
+ 		assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+ 		assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+ 		ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false);
+ 		ros.start();
+ 	}else{ros=null;}
+
+ 	long readsProcessed=0;
+ 	long basesProcessed=0;
+
+ 	long readsTossed=0;
+ 	long basesTossed=0;
+
+ 	//Filtering is active when either barcode quality cutoff was set.
+ 	final boolean filtering=(minBarcodeAverageQuality>0 || minBarcodeMinQuality>0);
+
+ 	//The barcode quality histogram is only collected when bqhist was specified.
+ 	ReadStats readstats=null;
+ 	ReadStats.COLLECT_QUALITY_STATS=(bqhist!=null);
+ 	if(ReadStats.COLLECT_QUALITY_STATS){
+ 		ReadStats.QUAL_HIST_FILE=bqhist;
+ 		readstats=new ReadStats();
+ 	}
+
+ 	{
+
+ 		ListNum<Read> ln=cris.nextList();
+ 		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ 		//Sanity check: the first read has a mate exactly when the stream reports paired input.
+ 		if(reads!=null && !reads.isEmpty()){
+ 			Read r=reads.get(0);
+ 			assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ 		}
+
+ 		while(reads!=null && reads.size()>0){
+
+ 			for(int idx=0; idx<reads.size(); idx++){
+ 				final Read r1=reads.get(idx);
+ 				final Read r2=r1.mate;
+
+ 				final int initialLength1=r1.length();
+ 				final int initialLength2=(r1.mateLength());
+
+ 				//Header format: <barcode bases>_<barcode qualities>
+ 				final byte[] barbases, barquals;
+ 				{
+ 					final String[] s=r1.id.split("_");
+ 					if(s.length<2){
+ 						throw new RuntimeException("Read header does not contain barcode bases and qualities separated by '_': "+r1.id);
+ 					}
+ 					barbases=s[0].getBytes();
+ 					barquals=s[1].getBytes();
+ 					//Convert ASCII quality characters to numeric scores (offset 33).
+ 					for(int i=0; i<barquals.length; i++){
+ 						barquals[i]-=33;
+ 					}
+ 				}
+
+ 				final int qbar=Read.avgQualityByProbability(barbases, barquals, true, 0);
+ 				final int minqbar=Tools.min(barquals);
+ 				aqhistArray[qbar]++;
+ 				mqhistArray[minqbar]++;
+
+ 				//Only r1 is flagged; a pair travels through the pipeline via r1.
+ 				if(qbar<minBarcodeAverageQuality || minqbar<minBarcodeMinQuality){
+ 					r1.setDiscarded(true);
+ 					readsTossed++;
+ 					basesTossed+=(initialLength1+initialLength2);
+ 					if(r2!=null){readsTossed++;}
+ 				}
+
+ 				{
+ 					readsProcessed++;
+ 					basesProcessed+=initialLength1;
+ 					final int q1=r1.avgQualityByProbability(true, 0);
+ 					qualCor1[q1][qbar]++;
+ 				}
+ 				if(r2!=null){
+ 					readsProcessed++;
+ 					basesProcessed+=initialLength2;
+ 					final int q2=r2.avgQualityByProbability(true, 0);
+ 					qualCor2[q2][qbar]++;
+ 				}
+
+ 				if(readstats!=null){
+ 					readstats.addToQualityHistogram(barquals, 0);
+ 				}
+
+ 			}
+
+ 			if(ros!=null){
+ 				ArrayList<Read> listOut=reads;
+ 				//Drop discarded reads whenever any cutoff is active, not only the average-quality one.
+ 				if(filtering){
+ 					listOut=new ArrayList<Read>(reads.size());
+ 					for(Read r : reads){
+ 						if(!r.discarded()){listOut.add(r);}
+ 					}
+ 				}
+ 				ros.add(listOut, ln.id);
+ 			}
+
+ 			cris.returnList(ln.id, ln.list.isEmpty());
+ 			ln=cris.nextList();
+ 			reads=(ln!=null ? ln.list : null);
+ 		}
+ 		//Return the final (empty) list, if there was one.
+ 		if(ln!=null){
+ 			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ 		}
+ 	}
+
+ 	if(ffcor!=null){
+ 		TextStreamWriter tsw=new TextStreamWriter(ffcor);
+ 		tsw.start();
+ 		tsw.print("#Read1_Q\tBar_Q\tstdev\tcount\tRead2_Q\tBar_Q\tstdev\tcount\n");
+ 		//One row per read quality value: mean and stdev of the barcode quality seen at that read quality.
+ 		for(int i=0; i<qualCor1.length; i++){
+ 			long[] array1=qualCor1[i], array2=qualCor2[i];
+ 			long sum1=Tools.sum(array1), sum2=Tools.sum(array2);
+ 			double avg1=Tools.averageHistogram(array1), avg2=Tools.averageHistogram(array2);
+ 			double dev1=Tools.standardDeviationHistogram(array1), dev2=Tools.standardDeviationHistogram(array2);
+ 			tsw.print(String.format("%d\t%.1f\t%.1f\t%d\t%d\t%.1f\t%.1f\t%d\n", i, avg1, dev1, sum1, i, avg2, dev2, sum2));
+ 		}
+ 		tsw.poisonAndWait();
+ 		errorState|=tsw.errorState;
+ 	}
+
+ 	if(aqhist!=null){writeQualityHistogram(ffaq, aqhistArray);}
+ 	if(mqhist!=null){writeQualityHistogram(ffmq, mqhistArray);}
+
+ 	if(readstats!=null){
+ 		errorState|=ReadStats.writeAll();
+ 	}
+
+ 	errorState|=ReadWrite.closeStreams(cris, ros);
+
+ 	t.stop();
+
+ 	double rpnano=readsProcessed/(double)(t.elapsed);
+ 	double bpnano=basesProcessed/(double)(t.elapsed);
+
+ 	String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ 	String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ 	//Left-pad the counts to 8 characters.
+ 	while(rpstring.length()<8){rpstring=" "+rpstring;}
+ 	while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ 	outstream.println("Time: \t"+t);
+ 	outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ 	outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ 	if(filtering){
+ 		outstream.println();
+ 		//Denominators are clamped to 1 so that empty input prints 0% rather than NaN.
+ 		outstream.println("Reads Discarded: "+readsTossed+" \t"+String.format("%.3f%%",readsTossed*100.0/Tools.max(1, readsProcessed)));
+ 		outstream.println("Bases Discarded: "+basesTossed+" \t"+String.format("%.3f%%",basesTossed*100.0/Tools.max(1, basesProcessed)));
+ 	}
+
+ 	if(errorState){
+ 		throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+ /**
+  * Writes a quality histogram as tab-delimited text with the columns
+  * quality, count, and fraction of the total. Output stops after the row at
+  * which all counts have been emitted, so trailing empty bins are not printed.
+  * Any writer error is recorded in errorState.
+  * @param ff Destination file
+  * @param counts Histogram indexed by quality value
+  */
+ private void writeQualityHistogram(FileFormat ff, long[] counts){
+ 	TextStreamWriter tsw=new TextStreamWriter(ff);
+ 	tsw.start();
+ 	tsw.print("#Quality\tcount\tfraction\n");
+ 	final long sum=Tools.sum(counts);
+ 	final double mult=1.0/Tools.max(1, sum);
+ 	long emitted=0;
+ 	for(int i=0; i<counts.length; i++){
+ 		final long x=counts[i];
+ 		tsw.print(String.format("%d\t%d\t%.5f\n", i, x, x*mult));
+ 		emitted+=x;
+ 		if(emitted==sum){break;}
+ 	}
+ 	tsw.poisonAndWait();
+ 	errorState|=tsw.errorState;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Prints a usage summary to outstream.
+  * This method is called when help is requested and immediately before the
+  * constructor throws for missing or inconsistent file arguments, so it must
+  * not fail itself; otherwise (with assertions enabled) it would hide the
+  * real error message.
+  */
+ private void printOptions(){
+ 	outstream.println("Syntax:\n");
+ 	outstream.println("java -ea -Xmx512m -cp <path> "+getClass().getName()+" in=<infile> out=<outfile>");
+ 	outstream.println("\nRead headers must have the form <barcode bases>_<barcode qualities>, with qualities in ASCII-33.\n");
+ 	outstream.println("Parameters specific to this program:\n");
+ 	outstream.println("outcor=<file>    \t(cor) Write the read quality versus barcode quality correlation here.");
+ 	outstream.println("bqhist=<file>    \tWrite the barcode quality histogram here.");
+ 	outstream.println("baqhist=<file>   \tWrite the histogram of average barcode quality here.");
+ 	outstream.println("bmqhist=<file>   \tWrite the histogram of minimum barcode quality here.");
+ 	outstream.println("mmq=0            \tDiscard reads whose minimum barcode quality is below this.");
+ 	outstream.println("verbose=false    \tPrint verbose messages.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ private String out1=null;
+ private String out2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ private String outcor=null;
+ private String bqhist=null;
+ private String aqhist=null;
+ private String mqhist=null;
+
+ private int minBarcodeAverageQuality=0;
+ private int minBarcodeMinQuality=0;
+
+ private long[][] qualCor1=new long[50][50];
+ private long[][] qualCor2=new long[50][50];
+
+ private long[] aqhistArray=new long[100];
+ private long[] mqhistArray=new long[100];
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffcor;
+ private final FileFormat ffaq;
+ private final FileFormat ffmq;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+}
diff --git a/current/jgi/CountBarcodes.java b/current/jgi/CountBarcodes.java
new file mode 100755
index 0000000..77bd93e
--- /dev/null
+++ b/current/jgi/CountBarcodes.java
@@ -0,0 +1,514 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+import align2.BandedAlignerConcrete;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class CountBarcodes {
+
+ /**
+  * Program entry point. Starts a timer, builds a CountBarcodes instance
+  * from the command line, and runs it.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+ 	//The timer is created first so that it also covers argument parsing.
+ 	final Timer timer=new Timer();
+ 	final CountBarcodes counter=new CountBarcodes(args);
+ 	counter.process(timer);
+ }
+
+ /**
+  * Parses the command line, applies global (static) stream and compression
+  * settings, collects the expected and valid barcode sets, validates the
+  * input/output combination, and creates the FileFormat objects used later
+  * by process().
+  * Note that many of the settings changed here are static fields of other
+  * classes (Shared, ReadWrite, FASTQ, ByteFile, ReadStats).
+  * @param args Command-line arguments, typically of the form key=value
+  * @throws RuntimeException If no input file is given, if out2 is given
+  * without out1, or if the read output files fail Tools.testOutputFiles
+  */
+ public CountBarcodes(String[] args){
+
+ //Let the parser preprocess the arguments; if help was requested, print options and exit with status 0.
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //When read output goes to stdout, messages are sent to System.err (which is also outstream's initial value).
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+
+ //Global settings: cap the read buffer length at 200, cap buffers, and enable pigz/unpigz.
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ //Each map stores a code as both key and value, so the maps act as sets.
+ expectedCodeMap=new HashMap<String,String>(200);
+ validCodeMap=new HashMap<String,String>(200);
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Split into key (a) and value (b); only the text between the first and second '=' is kept as the value.
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ //A literal "null" value is treated the same as a missing value.
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ //The shared parser gets the first chance at each argument; flags specific to this class follow.
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ //Propagate verbosity to the static verbose flags of the I/O classes.
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("countundefined")){
+ countUndefined=Tools.parseBoolean(b);
+ }else if(a.equals("printheader") || a.equals("header")){
+ printHeader=Tools.parseBoolean(b);
+ }else if(a.equals("printrows") || a.equals("rows") || a.equals("maxrows")){
+ maxRows=Integer.parseInt(b);
+ }else if(a.equals("expected")){
+ //Comma-delimited list; every expected code is also registered as a valid code.
+ if(b!=null){
+ for(String code : b.split(",")){
+ expectedCodeMap.put(code, code);
+ validCodeMap.put(code, code);
+ }
+ }
+ }else if(a.equals("valid")){
+ //Comma-delimited list of codes that are valid but not necessarily expected.
+ if(b!=null){
+ for(String code : b.split(",")){
+ validCodeMap.put(code, code);
+ }
+ }
+ }else if(a.equals("counts") || a.equals("outc")){
+ outCounts=b;
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument that is "stdin..." or an existing file is taken as the primary input.
+ parser.in1=arg;
+ }else if(parser.out1==null && i==1 && !arg.contains("=")){
+ //A bare second argument is taken as the primary output.
+ parser.out1=arg;
+ }else{
+ //Unknown arguments are reported; they only abort the program when assertions are enabled.
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ //The overwrite/append choices are also copied into ReadStats' static fields.
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ //If in1 contains '#' and is not itself an existing file, expand it into a pair by substituting 1 and 2.
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ //The same '#' expansion for the output name (without the file-existence test).
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ //Two input files imply the input is not interleaved.
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ //Unless a ByteFile mode was already forced, force BF2 when more than 2 threads are available.
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+// if(!parser.setOut){
+// out1="stdout";
+// }
+ }
+
+ //If interleaving was not set explicitly, derive it from the number of input and output files.
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ //One input but two outputs: the single input is forced to be treated as interleaved.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //An output literally named "null" means no output.
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ //Only the read outputs (out1, out2) are checked here; the counts file is not.
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Snapshot the code sets as lists; since keys equal values, the lists hold each distinct code once.
+ validCodes=new ArrayList<String>(validCodeMap.size());
+ validCodes.addAll(validCodeMap.values());
+ expectedCodes=new ArrayList<String>(expectedCodeMap.size());
+ expectedCodes.addAll(expectedCodeMap.values());
+
+ //Read outputs default to FASTQ format; the counts output defaults to text.
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffCounts=FileFormat.testOutput(outCounts, FileFormat.TEXT, null, false, overwrite, append, false);
+
+ //Read inputs default to FASTQ format.
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /**
+  * Streams all reads from the input and counts how often each barcode
+  * occurs. The barcode is the part of the read-1 header after the last ':'
+  * (the whole header if it contains no ':'). Barcodes containing undefined
+  * bases are only counted when countUndefined is set.
+  * Reads are passed unchanged to the read output, if one was specified.
+  * If a counts file was specified, the barcodes are written to it in
+  * descending order of count, together with the minimal Hamming and edit
+  * distance to the expected codes and a flag for valid codes; at most
+  * maxRows rows are written (a negative maxRows means unlimited).
+  * @param t Timer started by the caller; it is stopped here to report throughput
+  * @throws RuntimeException If any stream or writer finished in an error state
+  */
+ void process(Timer t){
+
+ 	final ConcurrentReadInputStream cris;
+ 	{
+ 		cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ 		if(verbose){outstream.println("Started cris");}
+ 		cris.start(); //4567
+ 	}
+ 	final boolean paired=cris.paired();
+ 	if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ 	final ConcurrentReadOutputStream ros;
+ 	if(out1!=null){
+ 		final int buff=4;
+
+ 		if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+ 			outstream.println("Writing interleaved.");
+ 		}
+
+ 		//Each output name is compared against BOTH input names.
+ 		assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+ 		assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+ 		ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+ 		ros.start();
+ 	}else{ros=null;}
+
+ 	long readsProcessed=0;
+ 	long basesProcessed=0;
+
+ 	//Barcode -> mutable counter
+ 	HashMap<String, StringNum> map=new HashMap<String, StringNum>();
+
+ 	{
+
+ 		ListNum<Read> ln=cris.nextList();
+ 		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ 		//Sanity check: the first read has a mate exactly when the stream reports paired input.
+ 		if(reads!=null && !reads.isEmpty()){
+ 			Read r=reads.get(0);
+ 			assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ 		}
+
+ 		while(reads!=null && reads.size()>0){
+
+ 			for(int idx=0; idx<reads.size(); idx++){
+ 				final Read r1=reads.get(idx);
+ 				final Read r2=r1.mate;
+
+ 				final int initialLength1=r1.length();
+ 				final int initialLength2=(r1.mateLength());
+
+ 				{
+ 					readsProcessed++;
+ 					basesProcessed+=initialLength1;
+ 				}
+ 				if(r2!=null){
+ 					readsProcessed++;
+ 					basesProcessed+=initialLength2;
+ 				}
+
+ 				//lastIndexOf returns -1 when there is no ':', in which case the whole header is used.
+ 				final String id=r1.id;
+ 				final int colon=id.lastIndexOf(':');
+ 				final String barcode=id.substring(colon+1);
+
+ 				if(countUndefined || AminoAcid.isFullyDefined(barcode)){
+ 					StringNum value=map.get(barcode);
+ 					if(value==null){
+ 						value=new StringNum(barcode, 0);
+ 						map.put(barcode, value);
+ 					}
+ 					value.increment();
+ 				}
+ 			}
+
+ 			if(ros!=null){ros.add(reads, ln.id);}
+
+ 			cris.returnList(ln.id, ln.list.isEmpty());
+ 			ln=cris.nextList();
+ 			reads=(ln!=null ? ln.list : null);
+ 		}
+ 		//Return the final (empty) list, if there was one.
+ 		if(ln!=null){
+ 			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ 		}
+ 	}
+
+ 	//The counts table is only produced when a counts file was requested.
+ 	if(ffCounts!=null){
+ 		ArrayList<StringNum> list=new ArrayList<StringNum>(map.size());
+ 		list.addAll(map.values());
+ 		//Ascending sort followed by reversal yields descending order of count.
+ 		Collections.sort(list);
+ 		Collections.reverse(list);
+
+ 		TextStreamWriter tsw=new TextStreamWriter(ffCounts);
+ 		tsw.start();
+ 		if(printHeader){
+ 			tsw.print("#code\tcount\tHamming_dist\tedit_dist\tvalid\n");
+ 		}
+ 		//Work on a copy so the configured row limit is not consumed.
+ 		int rowsLeft=maxRows;
+ 		for(StringNum sn : list){
+ 			if(rowsLeft==0){break;}
+ 			if(rowsLeft>0){rowsLeft--;}
+ 			final int hdist=calcHdist(sn.s, expectedCodes);
+ 			int edist=hdist;
+ 			//Alignment is only attempted when the Hamming distance exceeds 1.
+ 			if(hdist>1){
+ 				try {
+ 					edist=calcEdist(sn.s, expectedCodes);
+ 				} catch (Exception e) {
+ 					//Best effort: fall back to the Hamming distance if alignment fails.
+ 					edist=hdist;
+ 				}
+ 			}
+ 			final boolean valid=validCodeMap.containsKey(sn.s);
+ 			tsw.print(sn+"\t"+hdist+"\t"+edist+"\t"+(valid ? "valid" : "")+"\n");
+ 		}
+ 		tsw.poisonAndWait();
+ 		errorState|=tsw.errorState;
+ 	}
+
+ 	errorState|=ReadStats.writeAll();
+
+ 	errorState|=ReadWrite.closeStreams(cris, ros);
+
+ 	t.stop();
+
+ 	double rpnano=readsProcessed/(double)(t.elapsed);
+ 	double bpnano=basesProcessed/(double)(t.elapsed);
+
+ 	String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ 	String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ 	//Left-pad the counts to 8 characters.
+ 	while(rpstring.length()<8){rpstring=" "+rpstring;}
+ 	while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ 	outstream.println("Time: \t"+t);
+ 	outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ 	outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ 	if(errorState){
+ 		throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+ /**
+ * @param s
+ * @param expectedCodes
+ * @return
+ */
+ static int calcHdist(String s, ArrayList<String> expectedCodes) {
+ int min=s.length();
+ for(String code : expectedCodes){
+ min=Tools.min(min, hdist(s, code));
+ if(min<1){break;}
+ }
+ return min;
+ }
+
+ /**
+  * Counts the positions at which two codes differ.
+  * When the codes have different lengths, every position of the longer code
+  * that has no counterpart in the shorter one counts as a mismatch, so a
+  * code is never reported as identical to a mere prefix of itself.
+  * @param s Observed barcode
+  * @param code Code to compare against
+  * @return Number of mismatching positions, including the length difference
+  */
+ static int hdist(String s, String code) {
+ 	final int shared=Math.min(s.length(), code.length());
+ 	//Start with the overhang of the longer string.
+ 	int subs=Math.abs(s.length()-code.length());
+ 	for(int i=0; i<shared; i++){
+ 		if(s.charAt(i)!=code.charAt(i)){subs++;}
+ 	}
+ 	return subs;
+ }
+
+ /**
+ * @param s
+ * @param expectedCodes
+ * @return
+ */
+ int calcEdist(String s, ArrayList<String> expectedCodes) {
+ int min=s.length();
+ for(String code : expectedCodes){
+ min=Tools.min(min, edist(s, code));
+ if(min<1){break;}
+ }
+ return min;
+ }
+
+ int edist(String s, String code) {
+ int x=bandy.alignForward(s.getBytes(), code.getBytes(), 0, 0, s.length(), true);
+ return x;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * A string paired with a mutable counter.
+  * The natural ordering is by ascending count, with ties broken by the
+  * natural ordering of the string.
+  */
+ private static class StringNum implements Comparable<StringNum>{
+
+ 	/**
+ 	 * @param s_ The string
+ 	 * @param n_ Initial count
+ 	 */
+ 	public StringNum(String s_, long n_){
+ 		s=s_;
+ 		n=n_;
+ 	}
+
+ 	/**
+ 	 * Adds one to the count.
+ 	 * @return The count after incrementing
+ 	 */
+ 	public long increment(){
+ 		n++;
+ 		return n;
+ 	}
+
+ 	/**
+ 	 * Adds x to the count.
+ 	 * @param x Amount to add
+ 	 * @return The count after adding
+ 	 */
+ 	@SuppressWarnings("unused")
+ 	public long increment(long x){
+ 		n+=x;
+ 		return n;
+ 	}
+
+ 	/** Orders by count first, then by string. */
+ 	@Override
+ 	public int compareTo(StringNum o) {
+ 		if(n!=o.n){return n<o.n ? -1 : 1;}
+ 		return s.compareTo(o.s);
+ 	}
+
+ 	/** @return The string and the count, separated by a tab */
+ 	@Override
+ 	public String toString(){
+ 		return s+"\t"+n;
+ 	}
+
+ 	/**
+ 	 * Typed equality test; this is an overload and does not override
+ 	 * Object.equals. Two instances sharing the same String object are
+ 	 * considered equal regardless of count; otherwise both strings must be
+ 	 * non-null and compareTo must return 0.
+ 	 */
+ 	@SuppressWarnings("unused")
+ 	public boolean equals(StringNum other){
+ 		if(other==null){return false;}
+ 		if(s==other.s){return true;}
+ 		return s!=null && other.s!=null && compareTo(other)==0;
+ 	}
+
+ 	/** The string; never reassigned. */
+ 	final String s;
+ 	/** The current count. */
+ 	long n;
+
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){
+ assert(false) : "printOptions: TODO";
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ private String out1=null;
+ private String out2=null;
+ private String outCounts=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ private boolean reverseComplimentMate=false;
+ private boolean reverseCompliment=false;
+ private boolean countUndefined=true;
+ private boolean printHeader=true;
+ private int maxRows=-1;
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+ private final FileFormat ffCounts;
+
+ private final HashMap<String,String> expectedCodeMap;
+ private final HashMap<String,String> validCodeMap;
+ private final ArrayList<String> expectedCodes;
+ private final ArrayList<String> validCodes;
+
+ private final BandedAlignerConcrete bandy=new BandedAlignerConcrete(21);
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+}
diff --git a/current/jgi/CountGC.java b/current/jgi/CountGC.java
new file mode 100755
index 0000000..ca58c64
--- /dev/null
+++ b/current/jgi/CountGC.java
@@ -0,0 +1,389 @@
+package jgi;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import align2.Tools;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Dec 13, 2012
+ *
+ */
+public class CountGC {
+
+ /**
+  * Command-line entry point: counts base composition / GC content per sequence of a
+  * FASTA or FASTQ input and prints an overall summary plus timing to stderr.
+  *
+  * Recognized arguments (key=value): in, out, benchmark, format (1, 2 or 4), plus
+  * whatever Parser.parseCommonStatic / parseZip / parseQuality accept.  The first two
+  * arguments may also be given positionally (input, then output) when they contain no '='.
+  *
+  * @param args Command-line arguments; with none, usage is printed and the JVM exits.
+  * @throws RuntimeException if no input is given, the input file is missing, the format
+  * value is unsupported, or reading the input failed in counting mode.
+  */
+ public static void main(String[] args){
+
+     Timer t=new Timer();
+     if(args.length==0){
+         System.out.println("Usage: CountGC in=<infile> out=<outfile>");
+         System.out.println("Alternately, 'out=stdout' will print to standard out.");
+         System.out.println("Optional flag, format:");
+         //Column list corrected to match what toString2 emits for FORMAT==1.
+         System.out.println("format=1\tid length A C G T N GC");
+         System.out.println("format=2\tid gc");
+         System.out.println("format=4\tid length gc");
+         System.out.println("Output is always tab-delimited. AGCT are fractions of defined bases; N is fraction of total bases.");
+         System.exit(0);
+     }
+
+     boolean benchmark=false;
+     ReadWrite.USE_UNPIGZ=true;
+
+     String in=null, out=null;
+
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if("null".equalsIgnoreCase(b)){b=null;}
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(a.equals("in")){
+             in=b;
+         }else if(a.equals("out")){
+             out=b;
+             if(b==null || "summaryonly".equalsIgnoreCase(b) || "none".equalsIgnoreCase(b)){
+                 out=null;
+                 SUMMARY_ONLY=true;
+             }else if("benchmark".equalsIgnoreCase(b)){
+                 benchmark=true;
+                 out=null;
+                 SUMMARY_ONLY=true;
+             }
+         }else if(a.equals("benchmark")){
+             benchmark=Tools.parseBoolean(b);
+             if(benchmark){
+                 out=null;
+                 SUMMARY_ONLY=true;
+             }
+         }else if(a.equals("format")){
+             FORMAT=Integer.parseInt(b);
+             if(FORMAT!=1 && FORMAT!=2 && FORMAT!=4){
+                 throw new RuntimeException("\nUnknown format: "+FORMAT+"; valid values are 1, 2, and 4.\n");
+             }
+         }else if(in==null && i==0 && !args[i].contains("=")){
+             in=args[i]; //positional input
+         }else if(out==null && i==1 && !args[i].contains("=")){
+             out=args[i]; //positional output
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+     }
+
+     long[] counts=null;
+     long sum=0;
+
+     //A null 'out' means per-sequence lines go to stdout (unless SUMMARY_ONLY).
+     if(out==null || out.equalsIgnoreCase("stdout") || out.equalsIgnoreCase("standardout")){out=null;}
+
+     if(in==null){throw new RuntimeException("No input file.");}
+     InputStream is=null;
+     if(in.equalsIgnoreCase("stdin") || in.equalsIgnoreCase("standardin")){
+         is=System.in;
+     }else{
+         File f=new File(in);
+         if((!f.exists() || f.isDirectory()) && !in.toLowerCase().startsWith("stdin")){
+             throw new RuntimeException("Input file does not appear to be valid: "+in);
+         }
+     }
+     if(is==null){is=ReadWrite.getInputStream(in, false, true);}
+
+     IOException failure=null;
+     try {
+         if(benchmark){
+             sum=bench2(is);
+         }else{
+             FileFormat ff=FileFormat.testInput(in, FileFormat.FASTA, null, true, true);
+             //Anything that is not fastq is treated as fasta (the default).
+             if(ff.fastq()){counts=countFastq(is, out);}
+             else{counts=countFasta(is, out);}
+         }
+     } catch (IOException e) {
+         failure=e;
+         e.printStackTrace();
+     } finally {
+         //Close the stream even when counting throws, but never close System.in.
+         try {
+             if(is!=null && is!=System.in){is.close();}
+         } catch (IOException e) {
+             e.printStackTrace();
+         }
+     }
+
+     t.stop();
+
+     if(benchmark){
+         System.err.println("Time: \t"+t);
+         long bytes=new File(in).length();
+         //For stdin there is no file length; fall back to the number of bytes actually read.
+         if(bytes<1){bytes=sum;}
+         double mbps1=bytes*1000d/t.elapsed;
+         double mbps2=sum*1000d/t.elapsed;
+         System.err.println(String.format("Raw Speed:         \t%.2f MBytes/s",mbps1));
+         System.err.println(String.format("Uncompressed Speed:\t%.2f MBytes/s",mbps2));
+     }else{
+         if(counts==null){
+             //Previously this fell through to a NullPointerException in toString2.
+             throw new RuntimeException("Failed to read input: "+in, failure);
+         }
+         System.err.println(toString2(new StringBuilder("Overall"), counts));
+         System.err.println("Time: \t"+t);
+         long bytes=new File(in).length();
+         if(bytes<1){bytes=LIMSUM;}
+         double mbps=bytes*1000d/t.elapsed;
+         double mbpps=Tools.sum(counts)*1000d/t.elapsed;
+         System.err.println(String.format("Speed:\t%.2f MBytes/s",mbps));
+         System.err.println(String.format("      \t%.2f MBases/s",mbpps));
+     }
+
+ }
+
+ /**
+  * Reads the stream to exhaustion in 32 KiB chunks without inspecting the data.
+  * @param is Stream to drain; it is not closed here.
+  * @return Total number of bytes read.
+  * @throws IOException if the underlying read fails.
+  */
+ public static long bench2(InputStream is) throws IOException{
+     final byte[] chunk=new byte[32768];
+     long total=0;
+     int n=is.read(chunk);
+     while(n>0){
+         total+=n;
+         n=is.read(chunk);
+     }
+     return total;
+ }
+
+ /**
+  * Streams a FASTA file and tallies base composition per record.
+  * For each record a line built by toString2 is written to the output file, or to
+  * stdout when out is null and SUMMARY_ONLY is false.  LIMSUM is set to the number
+  * of bytes read.
+  *
+  * Bytes are treated as unsigned, so non-ASCII bytes in headers no longer terminate
+  * the header early and non-ASCII bytes in sequence no longer index charToNum with
+  * a negative value.
+  *
+  * NOTE(review): any byte value &lt;= '\r' (including tab) ends a header line; this is
+  * the original behavior and is kept as-is.
+  *
+  * @param is Source of FASTA text; not closed here.
+  * @param out Output file name, or null for stdout / summary-only mode.
+  * @return Counts summed over all records; indices 0-3 are A, C, G, T, index 4 is
+  * any other base, index 5 is line terminators and record markers.
+  * @throws IOException if reading fails.
+  */
+ public static long[] countFasta(InputStream is, String out) throws IOException{
+
+     long limsum=0;
+     final byte[] buf=new byte[32768];
+     final TextStreamWriter tsw=(out==null ? null : new TextStreamWriter(out, true, false, false));
+     if(tsw!=null){tsw.start();}
+     final int[] counts=new int[6];
+     final long[] overall=new long[6];
+     final StringBuilder hdr=new StringBuilder();
+     boolean hdmode=false;
+
+     int i=0;
+     int lim=is.read(buf);
+     if(lim>0){limsum+=lim;} //read() returns -1 at EOF; do not subtract it from the byte total
+
+     while(lim>0){
+         if(hdmode){
+             //Inside a header line: copy characters until the line terminator.
+             while(i<lim){
+                 final int c=buf[i]&0xFF;
+                 i++;
+                 if(c<=slashr){hdmode=false; break;}
+                 hdr.append((char)c);
+             }
+         }
+
+         if(!hdmode){
+             //Inside sequence: count symbols until the next '>' starts a new record.
+             while(i<lim){
+                 final int c=buf[i]&0xFF;
+                 i++;
+
+                 if(c==carrot){
+                     hdmode=true;
+                     if(hdr.length()>0 || Tools.sum(counts)>0){
+                         if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.print(toString2(hdr, counts));}
+                         hdr.setLength(0);
+                         for(int j=0; j<counts.length; j++){
+                             overall[j]+=counts[j];
+                             counts[j]=0;
+                         }
+                     }
+                     break;
+                 }
+                 counts[charToNum[c]]++;
+
+             }
+         }
+         if(i>=lim){
+             //Buffer exhausted; refill.
+             i=0;
+             lim=is.read(buf);
+             if(lim>0){limsum+=lim;}
+         }
+     }
+
+     //Flush the final record, which has no following '>' to trigger output.
+     if(hdr.length()>0 || Tools.sum(counts)>0){
+         if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.println(toString2(hdr, counts));}
+         hdr.setLength(0);
+         for(int j=0; j<counts.length; j++){
+             overall[j]+=counts[j];
+             counts[j]=0;
+         }
+     }
+
+     if(tsw!=null){
+         tsw.poison();
+         tsw.waitForFinish();
+     }
+     LIMSUM=limsum;
+     return overall;
+ }
+
+ public static long[] countFastq(InputStream is, String out) throws IOException{
+// assert(false) : "Fastq mode - TODO"; //TODO
+ long limsum=0;
+ final byte[] buf=new byte[32768];
+ final TextStreamWriter tsw=(out==null ? null : new TextStreamWriter(out, true, false, false));
+ if(tsw!=null){tsw.start();}
+ final int[] counts=new int[6];
+ final long[] overall=new long[6];
+ final StringBuilder hdr=new StringBuilder();
+
+ int mode=3;
+
+ int i=0;
+ int lim=is.read(buf);
+ limsum+=lim;
+
+ while(mode==3 && lim>0){
+ while(i<lim && buf[i]!=at){i++;}
+ if(i>=lim){
+ lim=is.read(buf);
+ limsum+=lim;
+ }else{
+ assert(buf[i]==at);
+ mode=0;
+ }
+ }
+
+ while(lim>0){
+ if(mode==0){
+ while(i<lim){
+ byte c=buf[i];
+ i++;
+ if(c<=slashr){mode++; break;}
+ if(c!=at){hdr.append((char)c);}
+ }
+ while(i<lim && buf[i]<=slashr){i++;} //In case of \n\r
+ }
+
+ if(mode==1){
+ while(i<lim){
+ byte c=buf[i];
+ i++;
+ if(c<=slashr){
+ mode++;
+ if(hdr.length()>0 || Tools.sum(counts)>0){
+ if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.print(toString2(hdr, counts));}
+ hdr.setLength(0);
+ for(int j=0; j<counts.length; j++){
+ overall[j]+=counts[j];
+ counts[j]=0;
+ }
+ }
+ break;
+ }
+ counts[charToNum[c]]++;
+ }
+ while(i<lim && buf[i]<=slashr){i++;} //In case of \n\r
+ }
+
+ if(mode==2){
+ while(i<lim){
+ byte c=buf[i];
+ i++;
+ if(c<=slashr){mode++; break;}
+ }
+ while(i<lim && buf[i]<=slashr){i++;} //In case of \n\r
+ }
+
+ if(mode==3){
+ while(i<lim){
+ byte c=buf[i];
+ i++;
+ if(c<=slashr){mode=0; break;}
+ }
+ while(i<lim && buf[i]<=slashr){i++;} //In case of \n\r
+ }
+
+ if(i>=lim){
+ i=0;
+ lim=is.read(buf);
+ limsum+=lim;
+ }
+ }
+
+ if(hdr.length()>0 || Tools.sum(counts)>0){
+ if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.println(toString2(hdr, counts));}
+ hdr.setLength(0);
+ for(int j=0; j<counts.length; j++){
+ overall[j]+=counts[j];
+ counts[j]=0;
+ }
+ }
+
+ if(tsw!=null){
+ tsw.poison();
+ tsw.waitForFinish();
+ }
+ LIMSUM=limsum;
+ return overall;
+ }
+
+ /**
+  * Appends one tab-delimited result line for a single sequence to sb and returns the
+  * builder's full contents.  The columns depend on FORMAT:
+  * 1 = length, A, C, G, T, N, GC;  2 = GC;  4 = length, GC.
+  * A, C, G, T and GC are fractions of defined bases; N is a fraction of all bases.
+  * @param sb Builder that already holds the sequence name.
+  * @param counts Per-symbol counts; indices 0-3 are A, C, G, T and index 4 is other bases.
+  * @return Contents of sb after the line is appended.
+  * @throws RuntimeException if FORMAT is not 1, 2 or 4.
+  */
+ private static String toString2(StringBuilder sb, int[] counts){
+     final long defined=(long)counts[0]+(long)counts[1]+(long)counts[2]+(long)counts[3];
+     final long total=defined+counts[4];
+     final float invDefined=1f/Tools.max(1, defined);
+     final float invTotal=1f/Tools.max(1, total);
+     final float gc=(counts[1]+counts[2])*invDefined;
+
+     final String line;
+     switch(FORMAT){
+         case 1:
+             line=String.format("\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n",
+                     total, counts[0]*invDefined, counts[1]*invDefined, counts[2]*invDefined, counts[3]*invDefined, counts[4]*invTotal, gc);
+             break;
+         case 2:
+             line=String.format("\t%.5f\n", gc);
+             break;
+         case 4:
+             line=String.format("\t%d\t%.5f\n", total, gc);
+             break;
+         default:
+             throw new RuntimeException("Unknown format.");
+     }
+     return sb.append(line).toString();
+ }
+
+ /**
+  * Same as the int[] overload, for totals that are accumulated as longs.
+  * Appends one tab-delimited result line to sb and returns the builder's full contents.
+  * Columns by FORMAT: 1 = length, A, C, G, T, N, GC;  2 = GC;  4 = length, GC.
+  * @param sb Builder that already holds the row label.
+  * @param counts Per-symbol counts; indices 0-3 are A, C, G, T and index 4 is other bases.
+  * @return Contents of sb after the line is appended.
+  * @throws RuntimeException if FORMAT is not 1, 2 or 4.
+  */
+ private static String toString2(StringBuilder sb, long[] counts){
+     final long defined=(long)counts[0]+(long)counts[1]+(long)counts[2]+(long)counts[3];
+     final long total=defined+counts[4];
+     final float invDefined=1f/Tools.max(1, defined);
+     final float invTotal=1f/Tools.max(1, total);
+     final float gc=(counts[1]+counts[2])*invDefined;
+
+     final String line;
+     switch(FORMAT){
+         case 1:
+             line=String.format("\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n",
+                     total, counts[0]*invDefined, counts[1]*invDefined, counts[2]*invDefined, counts[3]*invDefined, counts[4]*invTotal, gc);
+             break;
+         case 2:
+             line=String.format("\t%.5f\n", gc);
+             break;
+         case 4:
+             line=String.format("\t%d\t%.5f\n", total, gc);
+             break;
+         default:
+             throw new RuntimeException("Unknown format.");
+     }
+     return sb.append(line).toString();
+ }
+
+ private static final byte[] charToNum=makeCharToNum();
+ public static int FORMAT=1;
+ public static boolean SUMMARY_ONLY=false;
+ private static long LIMSUM=0;
+
+ final static byte slashr='\r', slashn='\n', carrot='>', at='@';
+
+ /**
+ * @return
+ */
+ private static byte[] makeCharToNum() {
+ byte[] r=new byte[256];
+ Arrays.fill(r, (byte)4);
+ r['a']=r['A']=0;
+ r['c']=r['C']=1;
+ r['g']=r['G']=2;
+ r['t']=r['T']=3;
+ r['\n']=r['\r']=r['>']=r['@']=r['+']=5;
+ return r;
+ }
+}
diff --git a/current/jgi/CountUniqueness.java b/current/jgi/CountUniqueness.java
new file mode 100755
index 0000000..0e87a8a
--- /dev/null
+++ b/current/jgi/CountUniqueness.java
@@ -0,0 +1,110 @@
+package jgi;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+import align2.ListNum;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * TODO
+ * @author Brian Bushnell
+ * @date Jan 14, 2014
+ *
+ */
+public class CountUniqueness {
+
+
+ /**
+  * Processes every input file in turn, then prints elapsed time and read/base
+  * throughput to outstream.
+  * @throws RuntimeException if any stream reported an error while closing.
+  */
+ public void process(){
+     final Timer timer=new Timer();
+     for(String fname : in){
+         process(fname);
+     }
+     timer.stop();
+
+     final double readsPerNano=readsProcessed/(double)(timer.elapsed);
+     final double basesPerNano=basesProcessed/(double)(timer.elapsed);
+
+     final String readsField=padLeft8(abbreviate(readsProcessed));
+     final String basesField=padLeft8(abbreviate(basesProcessed));
+
+     outstream.println("Time:                         \t"+timer);
+     outstream.println("Reads Processed:    "+readsField+" \t"+String.format("%.2fk reads/sec", readsPerNano*1000000));
+     outstream.println("Bases Processed:    "+basesField+" \t"+String.format("%.2fm bases/sec", basesPerNano*1000));
+
+     if(errorState){
+         throw new RuntimeException(this.getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Renders a count verbatim below 100000, in thousands ("k") below 100000000, otherwise in millions ("m"). */
+ private static String abbreviate(long value){
+     if(value<100000){return ""+value;}
+     if(value<100000000){return (value/1000)+"k";}
+     return (value/1000000)+"m";
+ }
+
+ /** Left-pads s with spaces to a width of at least 8 characters. */
+ private static String padLeft8(String s){
+     final StringBuilder sb=new StringBuilder();
+     for(int pad=8-s.length(); pad>0; pad--){sb.append(' ');}
+     return sb.append(s).toString();
+ }
+
+ /**
+  * Counts a read pair toward the processed totals.  Pairs with a missing mate are
+  * ignored.  The actual uniqueness logic is not implemented; with assertions enabled
+  * this method fails after updating the counters.
+  */
+ private void process(Read r1, Read r2){
+     if(r1==null || r2==null){return;}
+     readsProcessed+=1;
+     basesProcessed+=r1.length();
+     readsProcessed+=1;
+     basesProcessed+=r2.length();
+     assert(false) : "TODO";
+ }
+
+ /**
+  * Opens fname (SAM is the default format passed to FileFormat.testInput), feeds
+  * every read and its mate to process(Read, Read), and closes the stream.
+  * Any close failure is recorded in errorState.
+  * @param fname Input file name.
+  */
+ public void process(String fname){
+
+     final FileFormat ff=FileFormat.testInput(fname, FileFormat.SAM, null, true, false);
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff, null);
+     if(verbose){System.err.println("Starting cris");}
+     cris.start(); //4567
+
+     ListNum<Read> ln=cris.nextList();
+     while(ln!=null && ln.list!=null && ln.list.size()>0){
+         final ArrayList<Read> reads=ln.list;
+         for(Read r1 : reads){
+             final Read r2=r1.mate;
+             assert(false);
+             process(r1, r2);
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+     }
+     //Hand back the terminating (empty) list, if there was one.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+
+     errorState|=ReadWrite.closeStreams(cris);
+
+ }
+
+ private static final int MAX=41;
+ private static final int MAX2=MAX+1;
+ private long[][][] goodMatrix=new long[MAX2][MAX2][MAX2];
+ private long[][][] badMatrix=new long[MAX2][MAX2][MAX2];
+
+ /** Destination for the timing report. */
+ private PrintStream outstream=System.err;
+ private boolean verbose=false;
+ /** Passed to the read input stream as the read limit. */
+ private long maxReads=-1;
+ /** Input file names; nothing in this class assigns it. */
+ private String in[];
+ private String out;
+ private boolean overwrite=true;
+ private boolean append=false;
+ private long readsProcessed=0;
+ private long basesProcessed=0;
+ /** Set when closing an input stream reports a problem. */
+ private boolean errorState=false;
+
+
+}
diff --git a/current/jgi/CovStatsLine.java b/current/jgi/CovStatsLine.java
new file mode 100755
index 0000000..24a5ff6
--- /dev/null
+++ b/current/jgi/CovStatsLine.java
@@ -0,0 +1,92 @@
+package jgi;
+
+import java.util.Arrays;
+
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 10, 2014
+ *
+ */
+public class CovStatsLine {
+
+ public CovStatsLine(String s){
+ this(s.split("\t"));
+ }
+
+ /**
+ * ID Avg_fold Length Ref_GC Covered_percent Covered_bases Plus_reads Minus_reads (optional.... Read_GC)
+ * @param split
+ */
+ public CovStatsLine(String[] split) {
+ assert(split.length>=8) : Arrays.toString(split);
+ assert(!split[0].startsWith("#")) : Arrays.toString(split);
+ id=split[0];
+ avgFold=Double.parseDouble(split[1]);
+ length=Integer.parseInt(split[2]);
+ refGC=Double.parseDouble(split[3]);
+// coveredPercent=Double.parseDouble(split[4]);
+ coveredBases=Integer.parseInt(split[5]);
+ plusReads=Long.parseLong(split[6]);
+ minusReads=Long.parseLong(split[7]);
+ if(split.length==11){
+ median=Integer.parseInt(split[8]);
+ underMin=Integer.parseInt(split[9]);
+ readGC=Double.parseDouble(split[10]);
+ }else if(split.length==10){
+ median=Integer.parseInt(split[8]);
+ if(CoveragePileup.USE_BITSETS && CoveragePileup.USE_WINDOW){
+ underMin=Integer.parseInt(split[9]);
+ }else{
+ readGC=Double.parseDouble(split[9]);
+ }
+ }else if(split.length==9){
+ readGC=Double.parseDouble(split[8]);
+ }else if(split.length<9){
+ //do nothing
+ }
+ }
+
+ public final double coveredPercent(){
+ return (100.0*coveredBases)/Tools.max(1, length);
+ }
+
+ public final long reads(){return plusReads+minusReads;}
+
+ /**
+ * @param csl
+ */
+ public void add(CovStatsLine csl) {
+ double invlen2=1d/Tools.max(1, length+csl.length);
+ avgFold=((avgFold*length)+(csl.avgFold*csl.length))*invlen2;
+ refGC=((refGC*length)+(csl.refGC*csl.length))*invlen2;
+ readGC=((readGC*reads())+(csl.readGC*csl.reads()))*1.0/(Tools.max(1, reads()+csl.reads()));
+
+ length+=csl.length;
+ coveredBases+=csl.coveredBases;
+ plusReads+=csl.plusReads;
+ minusReads+=csl.minusReads;
+ median=median+csl.median;
+ underMin=underMin+csl.underMin;
+ }
+
+ public String toString(){
+ return String.format("%s\t%.4f\t%d \t%.4f\t%.4f\t%d \t%d\t%d\t%d\t%d\t%.4f", id, avgFold, length,
+ refGC, coveredPercent(), coveredBases, plusReads, minusReads, median, underMin, readGC);
+ }
+
+ public static final String header1="#ID\tAvg_fold\tLength\tRef_GC\tCovered_percent\tCovered_bases\tPlus_reads\tMinus_reads\tMedian_fold\tUnder_min\tRead_GC";
+ public static final String header2="#ID\tAvg_fold\tLength\tRef_GC\tCovered_percent\tCovered_bases\tPlus_reads\tMinus_reads\tRead_GC";
+
+ public String id;
+ public int length;
+ public int coveredBases;
+ public long plusReads;
+ public long minusReads;
+ public double avgFold;
+ public double refGC;
+ public int median;
+ public int underMin;
+ public double readGC;
+}
diff --git a/current/jgi/CoveragePileup.java b/current/jgi/CoveragePileup.java
new file mode 100755
index 0000000..e4968e0
--- /dev/null
+++ b/current/jgi/CoveragePileup.java
@@ -0,0 +1,1801 @@
+package jgi;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashMap;
+
+import stream.Read;
+import stream.SamLine;
+import stream.ScaffoldCoordinates;
+import stream.SiteScore;
+
+import align2.LongList;
+import align2.ReadStats;
+import align2.Tools;
+
+import dna.ChromosomeArray;
+import dna.CoverageArray;
+import dna.CoverageArray2;
+import dna.CoverageArray3;
+import dna.Data;
+import dna.Gene;
+import dna.Parser;
+import dna.Scaffold;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 4, 2013
+ *
+ */
+public class CoveragePileup {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Main ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Program entry point: parses the arguments, runs the pileup and reports the time
+  * spent in process() (argument parsing is not timed).
+  * @param args Command-line arguments.
+  */
+ public static void main(String[] args){
+     final CoveragePileup pileup=new CoveragePileup(args);
+     final Timer timer=new Timer();
+
+     pileup.process();
+     timer.stop();
+
+     Data.sysout.println();
+     Data.sysout.println("Time: \t"+timer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Parses command-line arguments into instance fields and static flags.
+  *
+  * Arguments are key=value pairs.  In addition, the first argument may be a bare input
+  * file name (it must start with "stdin" or name an existing file) and the second a
+  * bare output name for the coverage stats.  Previously such bare names were rejected
+  * as unknown parameters before the positional handling could see them, while
+  * recognized flags written without '=' could be mistaken for file names.
+  *
+  * If any argument contains "=stdout", Data.sysout is redirected to stderr.  When no
+  * sam output is requested, optional SamLine fields are not parsed.  The choice between
+  * coverage arrays and bitsets is derived from the requested outputs unless set explicitly.
+  *
+  * @param args Command-line arguments.
+  * @throws RuntimeException on an unknown parameter or when output files cannot be written.
+  */
+ public CoveragePileup(String[] args){
+     for(String s : args){
+         if(s.contains("=stdout")){Data.sysout=System.err;}
+     }
+     System.err.println("Executing "+(this.getClass().getName())+" "+Arrays.toString(args)+"\n");
+
+     int vectorMode=-1;
+     boolean outset=false;
+     String positionalIn=null;  //bare first argument accepted as input
+     String positionalOut=null; //bare second argument accepted as stats output
+     ReadWrite.USE_UNPIGZ=true;
+//   SamLine.RNAME_AS_BYTES=false;
+
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if("null".equalsIgnoreCase(b)){b=null;}
+//       System.err.println("Processing "+args[i]);
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+//       }else if(Parser.parseCommonStatic(arg, a, b)){//TODO: Enable
+//           //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(a.equals("monitor") || a.equals("killswitch")){
+             Parser.parseCommonStatic(arg, a, b);
+         }else if(a.equals("ref") || a.equals("reference") || a.equals("fasta")){
+             reference=b;
+         }else if(a.equals("in") || a.equals("in1")){
+             in1=b;
+         }else if(a.equals("in2")){
+             in2=b;
+         }else if(a.equals("out") || a.equals("coveragestats") || a.equals("covstats") || a.equals("stats")){
+             covstats=b;
+             outset=true;
+         }else if(a.equals("minscaf") || a.equals("covminscaf")){
+             minscaf=Integer.parseInt(b);
+         }else if(a.equals("minq") || a.equals("minmapq")){
+             minMapq=Integer.parseInt(b);
+         }else if(a.equals("outsam")){
+             outsam=b;
+         }else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("outrpkm")){
+             outrpkm=b;
+         }else if(a.equals("outorf")){
+             outorf=b;
+         }else if(a.equals("orffasta") || a.equals("fastaorf")){
+             orffasta=b;
+         }else if(a.equals("basecov") || a.equals("outcov")){
+             basecov=b;
+         }else if(a.equals("bincov") || a.equals("outbinned")){
+             bincov=b;
+         }else if(a.equals("normcov") || a.equals("outnormalized")){
+             normcov=b;
+         }else if(a.equals("normcovo") || a.equals("outnormalizedoverall")){
+             normcovOverall=b;
+         }else if(a.equals("delta")){
+             DELTA_ONLY=Tools.parseBoolean(b);
+         }else if(a.equals("physical") || a.equals("physicalcoverage") || a.equals("physcov")){
+             PHYSICAL_COVERAGE=Tools.parseBoolean(b);
+         }else if(a.equals("tlen")){
+             USE_TLEN=Tools.parseBoolean(b);
+         }else if(a.equals("hist") || a.equals("histogram") || a.equals("covhist")){
+             histogram=b;
+         }else if(a.equals("reads")){
+             maxReads=Tools.parseKMG(b);
+         }else if(a.equals("scafs") || a.equals("scaffolds")){
+             initialScaffolds=Tools.max(128, (int)(Tools.min(Long.parseLong(b),2000000000)));
+         }else if(a.equals("binsize")){
+             binsize=Integer.parseInt(b);
+         }else if(a.equals("32bit")){
+             bits32=Tools.parseBoolean(b);
+         }else if(a.equals("bitset") || a.equals("usebitset") || a.equals("bitsets") || a.equals("usebitsets")){
+//           if(Tools.parseBoolean(b)){arrayMode=BITSET_MODE;}
+             vectorMode=Tools.parseBoolean(b) ? BITSET_MODE : NOTHING_MODE;
+         }else if(a.equals("array") || a.equals("arrays") || a.equals("usearrays")){
+             vectorMode=Tools.parseBoolean(b) ? ARRAY_MODE : NOTHING_MODE;
+         }else if(a.equals("median") || a.equals("calcmedian")){
+             if(Tools.parseBoolean(b)){
+                 vectorMode=ARRAY_MODE;
+             }
+         }else if(a.startsWith("nonzero") || a.equals("nzo")){
+             NONZERO_ONLY=Tools.parseBoolean(b);
+             System.err.println("Set NONZERO_ONLY to "+NONZERO_ONLY);
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+             System.err.println("Set overwrite to "+overwrite);
+         }else if(a.equalsIgnoreCase("twocolumn")){
+             TWOCOLUMN=Tools.parseBoolean(b);
+             System.err.println("Set TWOCOLUMN to "+TWOCOLUMN);
+         }else if(a.equalsIgnoreCase("countgc")){
+             COUNT_GC=Tools.parseBoolean(b);
+             System.err.println("Set COUNT_GC to "+COUNT_GC);
+         }else if(a.equals("secondary") || a.equals("usesecondary")){
+             USE_SECONDARY=Tools.parseBoolean(b);
+             System.err.println("Set USE_SECONDARY_ALIGNMENTS to "+USE_SECONDARY);
+         }else if(a.equals("softclip") || a.equals("includesoftclip")){
+             INCLUDE_SOFT_CLIP=Tools.parseBoolean(b);
+             System.err.println("Set INCLUDE_SOFT_CLIP to "+INCLUDE_SOFT_CLIP);
+         }else if(a.equals("keepshortbins") || a.equals("ksb")){
+             KEEP_SHORT_BINS=Tools.parseBoolean(b);
+             System.err.println("Set KEEP_SHORT_BINS to "+KEEP_SHORT_BINS);
+         }else if(a.equals("strandedcoverage") || a.equals("strandedcov") || a.equals("covstranded") || a.equals("stranded")){
+             STRANDED=Tools.parseBoolean(b);
+         }else if(a.equals("startcov") || a.equals("covstart") || a.equals("startonly")){
+             START_ONLY=Tools.parseBoolean(b);
+         }else if(a.equals("concise")){
+             CONCISE=Tools.parseBoolean(b);
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+         }else if(a.equals("normc") || a.equals("normalizecoverage")){
+             NORMALIZE_COVERAGE=Tools.parseBoolean(b);
+         }else if(a.equals("header") || a.equals("hdr")){
+             printHeader=Tools.parseBoolean(b);
+         }else if(a.equals("headerpound") || a.equals("#")){
+             headerPound=Tools.parseBoolean(b);
+         }else if(a.equals("stdev")){
+             calcCovStdev=Tools.parseBoolean(b);
+         }else if(a.equals("delcov") || a.equals("dels") || a.equals("includedels") || a.equals("includedeletions") || a.equals("delcoverage")){
+             INCLUDE_DELETIONS=Tools.parseBoolean(b);
+         }else if(a.equals("normb") || a.equals("normalizebins")){
+             //Accepts either a bin count or a boolean (true means 100 bins, false disables).
+             try {
+                 NORMALIZE_LENGTH_BINS=Integer.parseInt(b);
+             } catch (NumberFormatException e) {
+                 boolean x=Tools.parseBoolean(b);
+                 NORMALIZE_LENGTH_BINS=x ? 100 : -1;
+             }
+         }else if(a.equals("covwindow")){
+             //Accepts either a boolean or a window size; a positive size enables the window.
+             if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
+                 USE_WINDOW=Tools.parseBoolean(b);
+             }else{
+                 LOW_COV_WINDOW=Integer.parseInt(b);
+                 USE_WINDOW=(LOW_COV_WINDOW>0);
+             }
+         }else if(a.equals("covwindowavg")){
+             LOW_COV_DEPTH=Double.parseDouble(b);
+         }else if(i==0 && arg.indexOf('=')<0 && (arg.startsWith("stdin") || new File(arg).exists())){
+             positionalIn=arg; //bare input name; applied below only if in= was not given
+         }else if(i==1 && arg.indexOf('=')<0){
+             positionalOut=arg; //bare output name; applied below only if out= was not given
+         }else{
+             throw new RuntimeException("Unknown parameter: "+args[i]);
+         }
+
+     }
+
+     if(outsam==null){
+         //Input lines are not echoed, so these sam fields need not be parsed.
+         SamLine.PARSE_0=false;
+         SamLine.PARSE_6=false;
+         SamLine.PARSE_7=false;
+         SamLine.PARSE_8=false;
+         SamLine.PARSE_10=false;
+         SamLine.PARSE_OPTIONAL=false;
+     }
+
+     if(vectorMode>-1){
+         USE_BITSETS=(vectorMode==BITSET_MODE);
+         USE_COVERAGE_ARRAYS=(vectorMode==ARRAY_MODE);
+     }else{
+         if(histogram==null && basecov==null && bincov==null && normcov==null && normcovOverall==null && outorf==null){//No need for coverage array!
+             USE_COVERAGE_ARRAYS=false;
+             if(TWOCOLUMN){//No need for bitset, either!
+                 USE_BITSETS=false;
+             }else{
+                 USE_BITSETS=true;
+             }
+         }
+     }
+
+     System.err.println("Set USE_COVERAGE_ARRAYS to "+USE_COVERAGE_ARRAYS);
+     System.err.println("Set USE_BITSETS to "+USE_BITSETS);
+
+     if(maxReads<0){maxReads=Long.MAX_VALUE;}
+     {
+         if(in1==null && positionalIn!=null){in1=positionalIn;}
+         if(covstats==null && positionalOut!=null){covstats=positionalOut;}
+         if(in1==null){in1="stdin";}
+         if(covstats==null && !outset){
+//           out="stdout";
+//           System.err.println("Warning: output destination not set; producing no output.  To print to standard out, set 'out=stdout'");
+             Data.sysout=System.err;
+         }
+     }
+     assert(in1!=null);
+//   assert(out!=null || outset) : "Output file was not set.";
+
+     if(STRANDED){
+         assert(basecov==null || basecov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
+         assert(bincov==null || bincov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
+         assert(normcov==null || normcov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
+         assert(normcovOverall==null || normcovOverall.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
+         assert(histogram==null || histogram.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
+         assert(covstats==null || covstats.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
+     }
+
+     if(!Tools.testOutputFiles(overwrite, append, false, basecov, bincov, normcov, normcovOverall, histogram, covstats, outrpkm)){
+         throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+
+                 basecov+", "+bincov+", "+normcov+", "+normcovOverall+", "+histogram+", "+covstats+", "+outrpkm+"\n");
+     }
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Data Structures ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Drops references and zeroes the counters.  The goal of this is to let unnecessary
+  * objects be garbage-collected, not really to make the object reusable.
+  */
+ public void clear(){
+     //Scaffold containers
+     list=null;
+     table=null;
+     pairTable=null;
+
+     //Values taken from the sam header
+     program=null;
+     version=null;
+
+     //Input and reference paths
+     in1=null;
+     reference=null;
+     orffasta=null;
+
+     //Output paths
+     covstats=null;
+     outsam=null;
+     outorf=null;
+     outrpkm=null;
+     histogram=null;
+     basecov=null;
+     bincov=null;
+     normcov=null;
+     normcovOverall=null;
+
+     //Status and statistics
+     error=false;
+     refBases=0;
+     mappedBases=0;
+     mappedReads=0;
+     readsProcessed=0;
+     totalCoveredBases1=0;
+     totalCoveredBases2=0;
+     scaffoldsWithCoverage1=0;
+     scaffoldsWithCoverage2=0;
+     totalScaffolds=0;
+ }
+
+ /**
+  * Resets the statistics and allocates fresh scaffold containers sized by
+  * initialScaffolds.  In physical-coverage mode it also allocates the pair table,
+  * turns off COUNT_GC and USE_SECONDARY (announcing each on stderr), and sets the
+  * SamLine parse flags that mode uses.
+  */
+ public void createDataStructures(){
+     error=false;
+     refBases=0;
+     mappedBases=0;
+     mappedReads=0;
+     readsProcessed=0;
+     totalCoveredBases1=0;
+     totalCoveredBases2=0;
+     scaffoldsWithCoverage1=0;
+     scaffoldsWithCoverage2=0;
+     totalScaffolds=0;
+
+     list=new ArrayList<Scaffold>(initialScaffolds);
+     table=new HashMap<String, Scaffold>(initialScaffolds);
+
+     if(!PHYSICAL_COVERAGE){return;}
+
+     pairTable=new HashMap<String, Object>();
+     if(COUNT_GC){
+         COUNT_GC=false;
+         System.err.println("COUNT_GC disabled for physical coverage mode.");
+     }
+     if(USE_SECONDARY){
+         USE_SECONDARY=false;
+         System.err.println("USE_SECONDARY disabled for physical coverage mode.");
+     }
+
+     //Fields 0, 6, 7 and 8 are parsed in this mode; field 10 and the optional tags are not.
+     SamLine.PARSE_0=true;
+     SamLine.PARSE_6=true;
+     SamLine.PARSE_7=true;
+     SamLine.PARSE_8=true;
+     SamLine.PARSE_10=false;
+     SamLine.PARSE_OPTIONAL=false;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Reads and processes all input data: the sam header, the optional reference, then
+  * every alignment line up to maxReads, followed by output generation and the optional
+  * ORF pass.  When outsam is set, every input line is echoed to it.
+  */
+ public void process(){
+     createDataStructures();
+
+     final ByteFile tf=ByteFile.makeByteFile(in1, ReadWrite.USE_UNPIGZ, false);
+
+     final ByteStreamWriter tsw=(outsam==null ? null : new ByteStreamWriter(outsam, overwrite, false, true));
+     if(outsam!=null){tsw.start();}
+
+     //processHeader returns the first non-header line, which is the first alignment.
+     byte[] line=processHeader(tf, tsw);
+
+     processReference();
+
+     if(maxReads<0){maxReads=Long.MAX_VALUE;}
+     while(line!=null && readsProcessed<maxReads){
+         if(tsw!=null){tsw.println(line);}
+         processSamLine(line);
+         line=tf.nextLine();
+     }
+
+     tf.close();
+     if(tsw!=null){tsw.poison();}
+
+     printOutput();
+
+     if(orffasta!=null){
+         processOrfsFasta(orffasta, outorf, table);
+     }
+
+     //The writer was poisoned above; wait for it only after the remaining work is done.
+     if(tsw!=null){tsw.waitForFinish();}
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Setup ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Consumes the sam header from tf.  Blank lines and lines starting with '@' count as
+  * header.  Each @SQ line registers a scaffold in both the table and the list and adds
+  * its length to refBases; the first PN: and VN: values found on @PG lines are stored
+  * as program and version.  All other header types are ignored.
+  * @param tf Source of sam lines.
+  * @param tsw If non-null, every header line is echoed here.
+  * @return The first non-header line, or null if the input ended.
+  */
+ public byte[] processHeader(ByteFile tf, ByteStreamWriter tsw){
+     byte[] line=tf.nextLine();
+     while(line!=null && (line.length==0 || line[0]=='@')){
+         if(tsw!=null){tsw.println(line);}
+
+         if(line.length>2){
+             final byte tag1=line[1], tag2=line[2];
+
+             if(tag1=='S' && tag2=='Q'){
+                 final Scaffold scaf=new Scaffold(line);
+                 if(COUNT_GC){scaf.basecount=new long[8];}
+                 assert(!table.containsKey(scaf.name)) : "\nDuplicate scaffold name!\n"+scaf+"\n\n"+table.get(scaf.name);
+                 table.put(scaf.name, scaf);
+                 list.add(scaf);
+                 refBases+=scaf.length;
+             }else if(tag1=='P' && tag2=='G'){
+                 for(String field : new String(line).split("\t")){
+                     if(field.startsWith("PN:")){
+                         if(program==null){program=Data.forceIntern(field.substring(3));}
+                     }else if(field.startsWith("VN:")){
+                         if(version==null){version=Data.forceIntern(field.substring(3));}
+                     }
+                 }
+             }
+             //@RG, @HD, @CO and anything else: nothing to record.
+         }
+         line=tf.nextLine();
+     }
+     return line;
+ }
+
+
+ /**
+  * Registers scaffolds from the loaded index (Data.scaffoldLengths, scaffoldLocs and
+  * scaffoldNames) for the given chromosome range.  Each scaffold goes into both the
+  * table and the list and adds its length to refBases; GC is computed whenever the
+  * chromosome array is available.
+  * @param minChrom First chromosome number, inclusive.
+  * @param maxChrom Last chromosome number, inclusive.
+  */
+ public void loadScaffoldsFromIndex(int minChrom, int maxChrom){
+
+     final int[][] lengths=Data.scaffoldLengths;
+     final int[][] locs=Data.scaffoldLocs;
+     final byte[][][] names=Data.scaffoldNames;
+     final int[] counts=new int[8]; //scratch space handed to calcGC
+
+     for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+         //Fetched before the null check, as in the original statement order.
+         final ChromosomeArray ca=Data.getChromosome(chrom);
+         if(lengths[chrom]==null){continue;}
+
+         final int[] clengths=lengths[chrom];
+         final int[] clocs=locs[chrom];
+         final byte[][] cnames=names[chrom];
+         for(int idx=0; idx<clengths.length; idx++){
+             final int length=clengths[idx];
+             final int loc=clocs[idx];
+             final Scaffold scaf=new Scaffold(new String(cnames[idx]), length);
+             if(ca!=null){
+                 scaf.gc=ca.calcGC(loc, length, counts);
+             }
+             if(COUNT_GC){scaf.basecount=new long[8];}
+             assert(!table.containsKey(scaf.name)) : "\nDuplicate scaffold name!\n"+scaf+"\n\n"+table.get(scaf.name);
+             table.put(scaf.name, scaf);
+             list.add(scaf);
+             refBases+=scaf.length;
+         }
+     }
+ }
+
+
+ /**
+  * Reads the reference fasta, if one was given, and fills in the length and GC
+  * fraction of every scaffold whose name (the header line without '>') is in the table.
+  * With ADD_FROM_REF set, sequences missing from the table are added to it with a warning.
+  *
+  * The running length and base counts are now reset at every header line.  Before, they
+  * were reset only after a known scaffold, so a reference sequence absent from the table
+  * leaked its length and composition into the next scaffold.  The reference file is also
+  * closed when done.
+  *
+  * NOTE(review): scaffolds added here go into the table only; unlike processHeader they
+  * are not added to the list and refBases is not updated - confirm that is intended.
+  */
+ public void processReference(){
+     if(reference==null){return;}
+
+     final ByteFile bf=ByteFile.makeByteFile(reference, false, false);
+     try{
+         Scaffold scaf=null;
+         int len=0;
+         final long[] acgtn=new long[8];
+         for(byte[] s=bf.nextLine(); s!=null; s=bf.nextLine()){
+             if(s.length>0 && s[0]=='>'){
+                 //Header line: finish the previous sequence, then look up the new one.
+                 if(scaf!=null){
+                     scaf.length=len;
+                     scaf.gc=(float)((acgtn[1]+acgtn[2])*1d/Data.max(1, acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3]));
+                 }
+                 //Reset unconditionally so a skipped sequence cannot contaminate the next one.
+                 len=0;
+                 Arrays.fill(acgtn, 0);
+
+                 String name=new String(s, 1, s.length-1);
+                 scaf=table.get(name);
+                 if(ADD_FROM_REF && scaf==null){
+                     scaf=new Scaffold(name, 0);
+                     System.err.println("Warning - SAM header did not include "+name);
+                     table.put(name, scaf);
+                 }
+             }else{
+                 len+=s.length;
+                 for(int i=0; i<s.length; i++){
+                     acgtn[charToNum[s[i]]]++;
+                 }
+             }
+         }
+         //The last sequence has no following header to trigger the update.
+         if(scaf!=null){
+             scaf.length=len;
+             scaf.gc=(float)((acgtn[1]+acgtn[2])*1d/Data.max(1, acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3]));
+         }
+     }finally{
+         bf.close();
+     }
+ }
+
+
+ /**
+  * Reads ORF headers from a fasta file and writes one tab-delimited coverage line per ORF.
+  * Header lines are expected to look like ">name # start # stop # strand ..." (' # ' is
+  * the field delimiter). If the ORF name ends in "_digits", that suffix is removed to
+  * obtain the scaffold name used for the lookup in 'map'.
+  *
+  * Fixes relative to the earlier version:
+  * - The bounds check now fires when EITHER end of the ORF lies outside the scaffold
+  *   (it used '&&', which can never detect a single out-of-range end).
+  * - The bounds check is skipped when the scaffold is unknown, instead of throwing a
+  *   NullPointerException when ABORT_ON_ERROR is false.
+  *
+  * @param fname_in Fasta file of ORFs
+  * @param fname_out Output file; must differ from fname_in
+  * @param map Scaffold name to Scaffold lookup
+  * @throws RuntimeException if ABORT_ON_ERROR is set and a scaffold is missing or an ORF is out of bounds
+  */
+ public void processOrfsFasta(String fname_in, String fname_out, HashMap<String, Scaffold> map){
+     TextFile tf=new TextFile(fname_in, false, false);
+     assert(!fname_in.equalsIgnoreCase(fname_out));
+     TextStreamWriter tsw=new TextStreamWriter(fname_out, overwrite, false, true);
+     tsw.start();
+
+     if(printHeader){
+         String pound=(headerPound ? "#" : "");
+         tsw.print(pound+"mappedBases="+mappedBases+"\n");
+         tsw.print(pound+"mappedReads="+mappedReads+"\n");
+         tsw.print(pound+"name\tlength\tdepthSum\tavgDepth\tavgDepth/mappedBases\tminDepth\tmaxDepth\tmedianDepth\tstdDevDepth\tfractionCovered\n");
+     }
+
+     String line;
+     final StringBuilder sb=new StringBuilder(500);
+
+     while((line=tf.nextLine())!=null){
+         if(line.length()>1 && line.charAt(0)=='>'){
+
+             String[] split=line.split(" # "); //' # ' used as delimiters
+
+             String orfname=split[0].substring(1).trim(); //In case there are spaces around the ' # ' delimiters
+             String scafname=orfname;
+             if(scafname.contains("_")){//PRODIGAL pads _1 to the name of the first orf of a scaffold, and etc
+                 int last=scafname.lastIndexOf('_');
+                 boolean numeric=false;
+                 for(int i=last+1; i<scafname.length(); i++){
+                     if(Character.isDigit(scafname.charAt(i))){numeric=true;}
+                     else{numeric=false; break;}
+                 }
+                 if(numeric){scafname=scafname.substring(0, last);}
+             }
+
+             int start=Integer.parseInt(split[1].trim());
+             int stop=Integer.parseInt(split[2].trim());
+             int strand=Integer.parseInt(split[3].trim());
+             if(strand==1){strand=Gene.PLUS;}else{strand=Gene.MINUS;}
+             Orf orf=new Orf(orfname, start, stop, (byte)strand);
+
+             Scaffold scaf=map.get(scafname);
+
+             if(scaf==null){
+                 System.err.println("Can't find scaffold for ("+orf+")\nfrom line\n"+line+"\nscafname='"+scafname+"'\norfname='"+orfname+"'");
+                 if(ABORT_ON_ERROR){
+                     tsw.poison();
+                     throw new RuntimeException("Aborting.");
+                 }
+             }else if(orf.start<0 || orf.stop>=scaf.length){
+                 //Only meaningful when the scaffold is known; either end out of range is an error.
+                 Data.sysout.println("orf goes out of scaffold bounds.\n"+orf+"\n"+scaf);
+                 if(ABORT_ON_ERROR){
+                     tsw.poison();
+                     throw new RuntimeException("Aborting.");
+                 }
+             }
+
+             if(scaf!=null){
+                 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); //TODO: Strand logic here depends on stranding protocol.
+                 orf.readCoverageArray(ca);
+             }
+
+             //For an unknown scaffold the line is still written, with whatever values the Orf holds by default.
+             sb.append(orf.name).append('\t');
+             sb.append(orf.length()).append('\t');
+             sb.append(orf.baseDepth).append('\t');
+             sb.append(String.format("%.4f", orf.avgCoverage())).append('\t');
+             sb.append(orf.avgCoverage()/mappedBases);
+
+             sb.append('\t');
+             sb.append(orf.minDepth).append('\t');
+             sb.append(orf.maxDepth).append('\t');
+             sb.append(orf.medianDepth).append('\t');
+             sb.append(String.format("%.4f",orf.stdevDepth)).append('\t');
+             sb.append(String.format("%.4f",orf.fractionCovered()));
+
+             sb.append('\n');
+             tsw.print(sb.toString());
+             sb.setLength(0);
+         }
+     }
+
+     tsw.poison();
+     tsw.waitForFinish();
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Name-based entry point: resolves scafName through 'table' and forwards every
+  * argument unchanged to the Scaffold-based overload.
+  * @return The overload's result, or false when the name is unknown (an assertion
+  * fires first if assertions are enabled)
+  */
+ public boolean addCoverage(final String scafName, final byte[] seq, byte[] match, final int start0, final int stop0, final int readlen, final int strand, int incrementFrags){
+     final Scaffold target=table.get(scafName);
+     if(target!=null){
+         return addCoverage(target, seq, match, start0, stop0, readlen, strand, incrementFrags);
+     }
+     assert(false) : "Can't find "+scafName;
+     return false;
+ }
+
+ /**
+ * Adds one alignment to the statistics of a scaffold.
+ * Updates the global mappedBases/mappedReads counters, the scaffold's read, fragment and
+ * minus-strand hit counters, optionally the per-scaffold read base counts, and then the
+ * coverage structure (coverage array or bitset, depending on the static mode flags).
+ * When deletions are excluded and START_ONLY is off, the coverage part is delegated to
+ * addCoverageIgnoringDeletions.
+ * @param scaf Target scaffold
+ * @param seq Read bases; may be null, in which case base counting is skipped
+ * @param match Match string; required when !INCLUDE_DELETIONS && !START_ONLY
+ * @param start0 Alignment start; values below 0 are clamped to 0
+ * @param stop0 Alignment stop (inclusive); clamped to scaf.length-1
+ * @param readlen Amount added to mappedBases
+ * @param strand Strand of the alignment; 1 is counted as minus
+ * @param incrementFrags Amount added to scaf.fraghits
+ * @return false if scaf is null (with assertions disabled), otherwise true or the delegate's result
+ */
+ public boolean addCoverage(final Scaffold scaf, final byte[] seq, byte match[], final int start0, final int stop0, final int readlen, final int strand, int incrementFrags){
+ if(scaf==null){
+ assert(false) : "Adding coverage to a null Scaffold.";
+ return false;
+ }
+ // Clamp the alignment to the scaffold's coordinate range.
+ final int start=Tools.max(start0, 0);
+ final int stop=Tools.min(stop0, scaf.length-1);
+
+ assert(start>=0 && stop>=0) : "\nAn error was encountered when processing a read. Output will not be valid.\n"+
+ "\nscafName="+scaf.name+"\nseq="+new String(seq)+"\nstart="+start+
+ "\nstop="+stop+"\nreadlen="+readlen+"\nstrand="+strand+"\nscaf.length="+scaf.length+"\nscaf="+scaf;
+
+ // Global counters use the caller-supplied read length, not the clamped span.
+ mappedBases+=readlen;
+ mappedReads++;
+
+ scaf.readhits++;
+ scaf.fraghits+=incrementFrags;
+ if(strand==1){scaf.readhitsMinus++;}
+
+ // basecount is only allocated when read-GC counting is on, so this is a no-op otherwise.
+ if(seq!=null && scaf.basecount!=null){
+ final long[] counts=scaf.basecount;
+ for(int i=0; i<seq.length; i++){
+ counts[charToNum[seq[i]]]++;
+ }
+ }
+
+ // Deletion-aware mode walks the match string instead of covering the whole span.
+ if(!INCLUDE_DELETIONS && !START_ONLY){
+ assert(match!=null) : "Coverage excluding deletions cannot be calculated without a match string.";
+ return addCoverageIgnoringDeletions(scaf, seq, match, start, stop, readlen, strand, incrementFrags);
+ }
+
+ // The whole clamped span counts as hit bases (also in START_ONLY mode).
+ final int basehits=stop-start+1;
+ scaf.basehits+=basehits;
+
+ if(USE_COVERAGE_ARRAYS){
+ // Coverage structures are allocated lazily on the first hit; obj2 holds the minus strand when STRANDED.
+ if(scaf.obj1==null){
+ scaf.obj1=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
+ if(STRANDED){
+ scaf.obj2=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
+ }
+ }
+ CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+ if(START_ONLY){
+ ca.increment(start);
+ }else{
+ ca.incrementRange(start, stop, 1);
+ }
+ }else if(USE_BITSETS){
+ if(scaf.obj1==null){
+ scaf.obj1=new BitSet(scaf.length+1);
+ if(STRANDED){
+ scaf.obj2=new BitSet(scaf.length+1);
+ }
+ }
+ BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+ if(START_ONLY){
+ bs.set(start);
+ }else{
+ // BitSet.set(from, to) excludes 'to', hence stop+1.
+ bs.set(start, stop+1);
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Coverage update that does not count deleted reference positions.
+ * Walks the match string alongside the reference position: 'm', 'S' and 'N' add coverage
+ * and advance the reference; 'D' advances the reference without adding coverage;
+ * 'X', 'Y', 'C' and 'I' do neither. The walk ends at the end of the match string or once
+ * the reference position passes 'stop'. The number of covered positions is added to scaf.basehits.
+ * The parameters seq, readlen and incrementFrags are not used in this method.
+ * @param match Match string; a short-form string is expanded first
+ * @param start First reference position (already clamped by the caller)
+ * @param stop Last reference position, inclusive
+ * @return Always true
+ */
+ private boolean addCoverageIgnoringDeletions(final Scaffold scaf, final byte[] seq, byte match[], final int start, final int stop, final int readlen, final int strand, int incrementFrags){
+ assert(!INCLUDE_DELETIONS && !START_ONLY);
+ assert(match!=null) : "Coverage excluding deletions cannot be calculated without a match string.";
+
+ if(Read.isShortMatchString(match)){
+ match=Read.toLongMatchString(match);
+ }
+
+ int basehits=0;
+ if(USE_COVERAGE_ARRAYS){
+ // Lazy allocation, same scheme as in addCoverage.
+ if(scaf.obj1==null){
+ scaf.obj1=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
+ if(STRANDED){
+ scaf.obj2=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
+ }
+ }
+ CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+ // rpos = reference position, mpos = index into the match string.
+ for(int rpos=start, mpos=0; mpos<match.length && rpos<=stop; mpos++){
+ byte m=match[mpos];
+ if(m=='m' || m=='S' || m=='N'){
+ ca.increment(rpos, 1);
+ basehits++;
+ rpos++;
+ }else if(m=='X' || m=='Y' || m=='C' || m=='I'){
+ //do nothing
+ }else if(m=='D'){
+ rpos++;
+ }else{
+ assert(false) : "Unhandled symbol "+m;
+ }
+ }
+ }else if(USE_BITSETS){
+ if(scaf.obj1==null){
+ scaf.obj1=new BitSet(scaf.length+1);
+ if(STRANDED){
+ scaf.obj2=new BitSet(scaf.length+1);
+ }
+ }
+ BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+ // Same walk as above, marking positions in the bitset instead of incrementing depth.
+ for(int rpos=start, mpos=0; mpos<match.length && rpos<=stop; mpos++){
+ byte m=match[mpos];
+ if(m=='m' || m=='S' || m=='N'){
+ bs.set(rpos);
+ basehits++;
+ rpos++;
+ }else if(m=='X' || m=='Y' || m=='C' || m=='I'){
+ //do nothing
+ }else if(m=='D'){
+ rpos++;
+ }else{
+ assert(false) : "Unhandled symbol "+m;
+ }
+ }
+ }
+ // Stays 0 when neither coverage arrays nor bitsets are in use.
+ scaf.basehits+=basehits;
+
+ return true;
+ }
+
+
+ /**
+  * Processes one raw line of a sam file.
+  * Header lines (starting with '@') trigger a one-time warning; an "@SQ" header
+  * additionally registers its scaffold if the name is not yet in 'table'. Any other line
+  * is parsed into a SamLine and handed to processSamLine(SamLine).
+  *
+  * The warning now prints the text of the line. It used to concatenate the byte array
+  * directly, which prints the array's identity (e.g. "[B@1b6d3586") instead of its content.
+  *
+  * @param line Raw line bytes; null or fewer than 3 bytes is ignored
+  * @return Result of processSamLine(SamLine) for alignment lines, false otherwise
+  */
+ public boolean processSamLine(byte[] line){
+     if(line==null || line.length<3){
+         return false;
+     }
+     if(line[0]!='@'){
+         SamLine sl=new SamLine(line);
+         return processSamLine(sl);
+     }
+
+     //Header line encountered in the alignment section; warn once only.
+     if(!error){
+         System.err.println("Unexpected header line: "+new String(line));
+         System.err.println("This should not cause problems, and is probably due to concatenated sam files.\n" +
+                 "Supressing future unexpected header warnings.");
+         error=true;
+     }
+
+     if(line[1]=='S' && line[2]=='Q'){
+         Scaffold scaf=new Scaffold(line);
+         if(!table.containsKey(scaf.name)){
+             if(COUNT_GC){scaf.basecount=new long[8];}
+             table.put(scaf.name, scaf);
+             list.add(scaf);
+             refBases+=scaf.length;
+         }
+     }
+     return false;
+ }
+
+
+ public boolean processSamLine(SamLine sl){
+ readsProcessed++;
+ final boolean properPair=(sl.hasMate() && sl.mapped() && sl.primary() && sl.properPair() && sl.pairedOnSameChrom());
+ if(PHYSICAL_COVERAGE && properPair){
+ SamLine mate=(SamLine)pairTable.remove(sl.qname);
+ if(mate==null){pairTable.put(sl.qname, sl);}
+ else{
+ final int start1=sl.start(INCLUDE_SOFT_CLIP, false);
+ final int stop1=sl.stop(start1, INCLUDE_SOFT_CLIP, false);
+ final int start2=mate.start(INCLUDE_SOFT_CLIP, false);
+ final int stop2=mate.stop(start2, INCLUDE_SOFT_CLIP, false);
+ final int strand=(sl.pairnum()==0 ? sl.strand() : mate.strand());
+ final int length=USE_TLEN ? sl.tlen : Tools.max(stop1, stop2)-Tools.min(start1, start2)+1;
+ addCoverage(sl.rnameS(), null, null, Tools.min(start1, start2), Tools.max(stop1, stop2), length, strand, 2);
+ }
+ }else if(sl.mapped() && (USE_SECONDARY || sl.primary()) && sl.mapq>=minMapq){
+ assert(sl.seq!=null || sl.cigar!=null) : "This program requires bases or a cigar string for every sam line. Problem line:\n"+sl+"\n";
+// assert(sl.seq!=null) : sl.toString();
+ final int length=sl.length();
+ final int start=sl.start(INCLUDE_SOFT_CLIP, false);
+ final int stop=sl.stop(start, INCLUDE_SOFT_CLIP, false);
+// assert(false && length==stop-start+1) : length+", "+start+", "+stop+", "+(stop-start+1);
+// assert(false) : "'"+new String(sl.rname())+"', '"+sl.rnameS()+"'";
+// assert(false) : "'"+sl.rnameS()+"'";
+ final byte[] match=(INCLUDE_DELETIONS ? null : SamLine.cigarToShortMatch(sl.cigar, true));
+ return addCoverage(sl.rnameS(), sl.seq, match, start, stop, length, sl.strand(), sl.hasMate() ? 1 : 2);
+ }
+ return false;
+ }
+
+
+ /**
+ * Adds the coverage of a Read object.
+ * Only reads that are mapped and have bases are considered. With USE_SECONDARY and a
+ * non-empty site list, every site is added through processRead(Read, SiteScore).
+ * Otherwise the read's own coordinates are used; with PHYSICAL_COVERAGE, a paired read
+ * with pairnum 0 whose mate lies on the same reference is added as one span covering both reads.
+ * @param r The read
+ * @return true if coverage was added through the per-site or single-read path; note that
+ * the physical-coverage (pair span) path discards addCoverage's result and returns false
+ */
+ public boolean processRead(Read r){
+ readsProcessed++;
+ if(r.mapped() && r.bases!=null){
+ if(USE_SECONDARY && r.sites!=null && r.sites.size()>0){
+ boolean b=false;
+ for(SiteScore ss : r.sites){
+ // processRead is on the left so every site is processed even after one has succeeded.
+ b=processRead(r, ss) || b;
+ }
+ return b;
+ }else{
+ final Read mate=r.mate;
+ // coords and coords2 are reusable holders filled in by set(); set() reports whether it succeeded.
+ final boolean set1=coords.set(r);
+ final boolean set2=(PHYSICAL_COVERAGE && r.paired() && r.pairnum()==0 && coords2.set(mate));
+ if(set1 && set2 && Tools.equals(coords.name, coords2.name)){
+ final int start1=coords.start;
+ final int stop1=coords.stop;
+ final int start2=coords2.start;
+ final int stop2=coords2.stop;
+ final int strand=r.strand();
+ final int length=Tools.max(stop1, stop2)-Tools.min(start1, start2)+1;
+ // Pair span: no bases and no match string are passed; the return value is not used.
+ addCoverage(new String(coords.name), null, null, Tools.min(start1, start2), Tools.max(stop1, stop2), length, strand, 2-r.mateCount());
+ }else{
+ if(set1){
+ return addCoverage(new String(coords.name), r.bases, r.match, coords.start, coords.stop, r.length(), coords.strand, 2-r.mateCount());
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+
+ /**
+  * Adds the coverage of a single alignment site of a read.
+  * @param r Read supplying bases and length
+  * @param ss Site supplying coordinates and match string
+  * @return Result of addCoverage; false if ss is null, the read has no bases, or the
+  * coordinates could not be set from the site
+  */
+ public boolean processRead(Read r, SiteScore ss){
+     if(ss==null || r.bases==null){return false;}
+     if(!coords.set(ss)){return false;}
+     final String refName=new String(coords.name);
+     return addCoverage(refName, r.bases, ss.match, coords.start, coords.stop, r.length(), coords.strand, 1);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Output Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Writes every configured output file and prints a short summary to Data.sysout.
+ * In stranded mode each output name has its first '#' replaced by "1" (strand 0) and
+ * by "2" (strand 1); in unstranded mode only the first set of files is written.
+ * The summary figures are based on the strand-0 totals accumulated by writeStats.
+ */
+ public void printOutput(){
+
+ totalScaffolds=list.size();
+
+ // Strand-0 (or unstranded) output names.
+ String basecov1=(basecov==null ? null : (STRANDED ? basecov.replaceFirst("#", "1") : basecov));
+ String bincov1=(bincov==null ? null : (STRANDED ? bincov.replaceFirst("#", "1") : bincov));
+ String normcov1=(normcov==null ? null : (STRANDED ? normcov.replaceFirst("#", "1") : normcov));
+ String normcovOverall1=(normcovOverall==null ? null : (STRANDED ? normcovOverall.replaceFirst("#", "1") : normcovOverall));
+ String histogram1=(histogram==null ? null : (STRANDED ? histogram.replaceFirst("#", "1") : histogram));
+ String stats1=(covstats==null ? null : (STRANDED ? covstats.replaceFirst("#", "1") : covstats));
+
+ // Strand-1 output names; null when not stranded.
+ String basecov2=(basecov==null || !STRANDED ? null : basecov.replaceFirst("#", "2"));
+ String bincov2=(bincov==null || !STRANDED ? null : bincov.replaceFirst("#", "2"));
+ String normcov2=(normcov==null || !STRANDED ? null : normcov.replaceFirst("#", "2"));
+ // NOTE(review): unlike its siblings this one is non-null when unstranded; presumably the callee's
+ // (!STRANDED && strand>0) early return covers that, but the callee's body is not fully visible here - confirm.
+ String normcovOverall2=(normcovOverall==null ? null : (STRANDED ? normcovOverall.replaceFirst("#", "2") : normcovOverall));
+ String histogram2=(histogram==null || !STRANDED ? null : histogram.replaceFirst("#", "2"));
+ String stats2=(covstats==null || !STRANDED ? null : covstats.replaceFirst("#", "2"));
+
+ // The per-base and binned writers return immediately when their file name is null.
+ if(CONCISE){
+ writeCoveragePerBaseConcise(basecov1, list, 0, minscaf);
+ writeCoveragePerBaseConcise(basecov2, list, 1, minscaf);
+ }else{
+ writeCoveragePerBase(basecov1, list, DELTA_ONLY, 0, minscaf);
+ writeCoveragePerBase(basecov2, list, DELTA_ONLY, 1, minscaf);
+ }
+ if(KEEP_SHORT_BINS){
+ writeCoveragePerBaseBinned2(bincov1, list, binsize, 0, minscaf);
+ writeCoveragePerBaseBinned2(bincov2, list, binsize, 1, minscaf);
+ }else{
+ writeCoveragePerBaseBinned(bincov1, list, binsize, 0, minscaf);
+ writeCoveragePerBaseBinned(bincov2, list, binsize, 1, minscaf);
+ }
+ if(normcov!=null){
+ writeCoveragePerBaseNormalized(normcov1, list, binsize, 0, minscaf);
+ writeCoveragePerBaseNormalized(normcov2, list, binsize, 1, minscaf);
+ }
+ if(normcovOverall!=null){
+ writeCoveragePerBaseNormalizedOverall(normcovOverall1, list, binsize, 0, minscaf);
+ writeCoveragePerBaseNormalizedOverall(normcovOverall2, list, binsize, 1, minscaf);
+ }
+ if(outrpkm!=null){
+ writeRPKM(outrpkm, in1, null, readsProcessed, NONZERO_ONLY,list);
+ }
+
+ // writeStats runs even when stats1 is null, because it also accumulates the covered-scaffold
+ // and covered-base totals used below. It runs after the positional writers above; it sorts
+ // the coverage arrays in place, so per-position order is no longer available afterwards.
+ {
+ long[] hist=writeStats(stats1, 0);
+ if(hist!=null){writeHist(histogram1, hist);}
+
+ if(STRANDED){
+ hist=writeStats(stats2, 1);
+ if(hist!=null){writeHist(histogram2, hist);}
+ }
+ }
+
+ // Floating-point division: with refBases or totalScaffolds equal to 0 these become Infinity/NaN rather than throwing.
+ final double mult=1.0/refBases;
+ double depthCovered=mappedBases*mult;
+ double pctScaffoldsWithCoverage=scaffoldsWithCoverage1*100.0/totalScaffolds;
+ double pctCovered=totalCoveredBases1*100*mult;
+
+ Data.sysout.println(String.format("\nAverage coverage: \t%.2f", depthCovered));
+ if(USE_COVERAGE_ARRAYS && calcCovStdev){
+ double[] stdev=standardDeviation(list, 0, minscaf);
+ Data.sysout.println(String.format("Standard deviation: \t%.2f", stdev[1]));
+ }
+ Data.sysout.println(String.format("Percent scaffolds with any coverage: \t%.2f", pctScaffoldsWithCoverage));
+ if(USE_COVERAGE_ARRAYS || USE_BITSETS){
+ Data.sysout.println(String.format("Percent of reference bases covered: \t%.2f", pctCovered));
+ }
+ }
+
+ /**
+  * Counts the bases lying in stretches where the sliding-window coverage sum is below
+  * ceil(window*avg). int[] counterpart of the char[] overload, and now identical to it
+  * in behavior.
+  *
+  * Two defects were corrected:
+  * - Closing a low-coverage stretch overwrote the running count ('=') instead of adding
+  *   to it ('+='), so only the last stretch was ever reported.
+  * - The trailing stretch used lastStop rather than lastStop+1 as its lower bound, so a
+  *   base already counted by the previous stretch could be counted a second time.
+  *
+  * As in the char[] overload, the window ending at the final array element is summed but
+  * never compared against the limit.
+  *
+  * @param array Per-base coverage
+  * @param avg Per-base coverage threshold
+  * @param window Window width in bases
+  * @return Number of bases in low-coverage stretches; 0 if the array is shorter than the window
+  */
+ public int basesUnderAverageCoverage(final int[] array, final double avg, final int window){
+     if(array.length<window){return 0;}
+     final long limit=(long)Math.ceil(window*avg);
+     long covSum=0;
+     int baseCount=0;
+     for(int i=0; i<window; i++){
+         covSum+=array[i];
+     }
+
+     boolean below=false;
+     int lastStop=-1, lastStart=0;
+     //On entry to each iteration covSum is the sum over [a, b-1].
+     for(int a=0, b=window; b<array.length; a++, b++){
+         if(covSum>=limit){
+             if(below){//end range
+                 //Never count bases at or before lastStop again.
+                 baseCount+=b-Math.max(lastStop+1, lastStart);
+                 lastStop=b-1;
+                 below=false;
+             }
+         }else{
+             if(!below){//start range
+                 lastStart=a;
+                 below=true;
+             }
+         }
+         covSum-=array[a];
+         assert(covSum>=0);
+         covSum+=array[b];
+     }
+
+     if(below){//end range
+         baseCount+=array.length-Math.max(lastStop+1, lastStart);
+     }
+
+     assert(baseCount>=0);
+     return baseCount;
+ }
+
+ /**
+  * Counts the bases lying in stretches where the sliding-window coverage sum is below
+  * ceil(window*avg). char[] variant.
+  * A stretch opens at the left edge of the first low window and closes at the right edge
+  * of the first window that reaches the limit again; bases already attributed to the
+  * previous stretch are not counted twice. The window ending at the final array element
+  * is summed but never compared against the limit.
+  * @param array Per-base coverage
+  * @param avg Per-base coverage threshold
+  * @param window Window width in bases
+  * @return Number of bases in low-coverage stretches; 0 if the array is shorter than the window
+  */
+ public int basesUnderAverageCoverage(final char[] array, final double avg, final int window){
+     final int n=array.length;
+     if(n<window){return 0;}
+     final long threshold=(long)Math.ceil(window*avg);
+
+     long windowSum=0;
+     for(int i=0; i<window; i++){windowSum+=array[i];}
+
+     int lowBases=0;
+     boolean inLowRun=false;
+     int prevEnd=-1;
+     int runStart=0;
+     int head=0;
+     for(int tail=window; tail<n; tail++, head++){
+         final boolean low=(windowSum<threshold);
+         if(low && !inLowRun){
+             //A low stretch begins at the left edge of this window.
+             runStart=head;
+             inLowRun=true;
+         }else if(!low && inLowRun){
+             //The stretch ends; skip anything up to and including the previous stretch's end.
+             lowBases+=tail-Tools.max(prevEnd+1, runStart);
+             prevEnd=tail-1;
+             inLowRun=false;
+         }
+         //Slide the window one base to the right.
+         windowSum-=array[head];
+         assert(windowSum>=0);
+         windowSum+=array[tail];
+     }
+
+     if(inLowRun){
+         //A stretch still open at the end runs to the end of the array.
+         lowBases+=n-Tools.max(prevEnd+1, runStart);
+     }
+
+     assert(lowBases>=0);
+     return lowBases;
+ }
+
+ /**
+ * Writes the per-scaffold statistics table for one strand and accumulates the
+ * covered-scaffold and covered-base totals for that strand.
+ * The totals and the depth histogram are computed even when fname is null; only the file
+ * output is skipped in that case.
+ * Side effect: with coverage arrays, each scaffold's backing array is sorted in place
+ * (and reversed) to obtain the median, so positional coverage is lost after this call.
+ * @param fname Output file, or null for no file
+ * @param strand 0 adds to the "1" totals, anything else to the "2" totals; 1 selects obj2 when STRANDED
+ * @return Backing array of the depth histogram (index = depth, capped at histmax);
+ * it is only filled in coverage-array mode
+ */
+ public long[] writeStats(String fname, int strand){
+ // System.err.println("Writing stats for "+fname+", "+strand);
+ final TextStreamWriter tsw=(fname==null ? null : new TextStreamWriter(fname, overwrite, false, true));
+
+ if(tsw!=null){
+ tsw.start();
+ if(printHeader){
+ String pound=(headerPound ? "#" : "");
+ if(TWOCOLUMN){
+ tsw.println(pound+"ID\tAvg_fold");
+ }else{
+ // Optional columns must match the format strings chosen in the loop below.
+ tsw.println(pound+"ID\tAvg_fold\tLength\tRef_GC\tCovered_percent\tCovered_bases\tPlus_reads\tMinus_reads"+
+ (USE_COVERAGE_ARRAYS ? ("\tMedian_fold"+(USE_WINDOW ? "\tUnder_"+String.format("%.0f",LOW_COV_DEPTH)+"/"+LOW_COV_WINDOW : "")) : "")+
+ (COUNT_GC ? "\tRead_GC" : "")+(USE_COVERAGE_ARRAYS ? "\tStd_Dev" : ""));
+ }
+ }
+ }
+
+ // Depths above histmax are pooled into the histmax bucket.
+ final int histmax=(bits32 ? 1000000 : Character.MAX_VALUE);
+ // NOTE(review): constructed with Character.MAX_VALUE although histmax can be 1000000; presumably LongList grows on increment - confirm.
+ final LongList hist=new LongList(Character.MAX_VALUE);
+
+ long coveredScafTemp=0;
+ long coveredBaseTemp=0;
+ for(Scaffold scaf : list){
+ final long sum=scaf.basehits;
+ int covered=0;
+ // -1 marks values that were not computed for this scaffold or mode.
+ int median=-1;
+ int underWindowAverage=-1;
+ final double stdev;
+ if(USE_COVERAGE_ARRAYS){
+ CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+ if(ca!=null){
+ for(int i=0; i<scaf.length; i++){
+ int x=ca.get(i);
+ hist.increment(Tools.min(x, histmax));
+ // sum+=x;
+ if(x>0){covered++;}
+ }
+ // stdev and the window statistic are taken before sorting; the sort reorders the array itself.
+ if(bits32){
+ int[] array=((CoverageArray3)ca).array;
+ stdev=Tools.standardDeviation(array);
+ underWindowAverage=basesUnderAverageCoverage(array, LOW_COV_DEPTH, LOW_COV_WINDOW);
+ Arrays.sort(array);
+ Tools.reverseInPlace(array);
+ median=ca.get(scaf.length/2);
+ }else{
+ char[] array=((CoverageArray2)ca).array;
+ stdev=Tools.standardDeviation(array);
+ underWindowAverage=basesUnderAverageCoverage(array, LOW_COV_DEPTH, LOW_COV_WINDOW);
+ Arrays.sort(array);
+ Tools.reverseInPlace(array);
+ median=ca.get(scaf.length/2);
+ }
+ }else{
+ // No coverage array was ever allocated for this scaffold/strand.
+ stdev=0;
+ }
+ }else if(USE_BITSETS){
+ // sum+=scaf.basehits;
+ BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+ covered=(bs==null ? 0 : bs.cardinality());
+ stdev=-1;
+ }else{
+ stdev=-1;
+ }
+
+ // scaf.basehits is not split by strand, so this check is the same for both strands.
+ if(sum>0){
+ coveredScafTemp++;
+ }
+ // pw.print(scaf.name);
+ if(tsw!=null && (sum>0 || !NONZERO_ONLY) && scaf.length>=minscaf){
+ if(TWOCOLUMN){
+ tsw.print(String.format("%s\t%.4f\n", scaf.name, sum/(double)scaf.length));
+ }else if(COUNT_GC){
+ // Read GC fraction; the indexing is the same as used for reference GC in processReference.
+ long[] bc=scaf.basecount;
+ double gc=(bc[1]+bc[2])*1d/Data.max(1, bc[0]+bc[1]+bc[2]+bc[3]);
+ if(USE_COVERAGE_ARRAYS){
+ if(USE_WINDOW){
+ tsw.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%d\t%.4f\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length,
+ scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, median, underWindowAverage, gc, stdev));
+ }else{
+ tsw.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%.4f\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length,
+ scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, median, gc, stdev));
+ }
+ }else{
+ tsw.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%.4f\n", scaf.name, sum/(double)scaf.length, scaf.length,
+ scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, gc/*, scaf.basehits*/));
+ }
+ }else{
+ if(USE_COVERAGE_ARRAYS){
+ if(USE_WINDOW){
+ tsw.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%d\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length,
+ scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, median, underWindowAverage, stdev));
+ }else{
+ tsw.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length,
+ scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, median, stdev));
+ }
+ }else{
+ tsw.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\n", scaf.name, sum/(double)scaf.length, scaf.length,
+ scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus));
+ }
+ }
+ }
+ // Counted for every scaffold, including those filtered out of the file by minscaf or NONZERO_ONLY.
+ coveredBaseTemp+=covered;
+ }
+
+ if(strand==0){
+ scaffoldsWithCoverage1+=coveredScafTemp;
+ totalCoveredBases1+=coveredBaseTemp;
+ }else{
+ scaffoldsWithCoverage2+=coveredScafTemp;
+ totalCoveredBases2+=coveredBaseTemp;
+ }
+
+ if(tsw!=null){tsw.poisonAndWait();}
+ // hist is final and always assigned, so the null branch is never taken.
+ return hist==null ? null : hist.array;
+ }
+
+ /**
+  * Writes a depth histogram as "depth TAB count" lines, one per depth from 0 up to the
+  * highest depth with a nonzero count. Depth 0 is always written for a non-empty array.
+  * Does nothing when fname is null.
+  * @param fname Output filename
+  * @param counts counts[X] stores the number of bases with coverage X
+  */
+ public static void writeHist(String fname, long[] counts){
+     if(fname==null){return;}
+     assert(counts!=null) : "Can't write a histogram with null counts.";
+
+     final ByteStreamWriter out=new ByteStreamWriter(fname, overwrite, false, false);
+     out.start();
+     if(printHeader){
+         if(headerPound){out.print('#');}
+         out.println("Coverage\tnumBases");
+     }
+
+     //Trim trailing empty buckets, but never below index 0.
+     int top=counts.length-1;
+     while(top>0 && counts[top]==0){
+         top--;
+     }
+
+     for(int depth=0; depth<=top; depth++){
+         final long bases=counts[depth];
+         out.print(depth);
+         out.print('\t');
+         out.println(bases);
+     }
+
+     out.poisonAndWait();
+ }
+
+ /**
+  * Writes per-base coverage, one line per position:
+  * scafname TAB position TAB coverage
+  * Positions are 0-based. Scaffolds shorter than minscaf are skipped, and scaffolds
+  * without a coverage array are written with coverage 0.
+  * Does nothing when fname is null, or for strand>0 in unstranded mode.
+  * @param fname Output filename
+  * @param list List of reference scaffolds
+  * @param deltaOnly Only write lines when coverage differs from the previously written value
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Minimum scaffold length to print
+  */
+ public static void writeCoveragePerBase(String fname, ArrayList<Scaffold> list, boolean deltaOnly, int strand, int minscaf){
+     if(fname==null || (!STRANDED && strand>0)){return;}
+
+     if(verbose){System.err.println("Starting tsw "+fname);}
+     final ByteStreamWriter out=new ByteStreamWriter(fname, overwrite, false, true);
+     if(verbose){System.err.println("Created tsw "+fname);}
+     out.start();
+     if(printHeader){
+         if(headerPound){out.print('#');}
+         out.println("RefName\tPos\tCoverage");
+     }
+
+     for(final Scaffold scaf : list){
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         if(scaf.length<minscaf){continue;}
+
+         int previous=-1; //No real depth is negative, so the first position is always written
+         final int len=scaf.length;
+         for(int pos=0; pos<len; pos++){
+             final int depth=(cov==null ? 0 : cov.get(pos));
+             if(deltaOnly && depth==previous){continue;}
+             out.print(scaf.name);
+             out.print('\t');
+             out.print(pos);
+             out.print('\t');
+             out.println(depth);
+             previous=depth;
+         }
+     }
+
+     if(verbose){System.err.println("Closing tsw "+fname);}
+     out.poisonAndWait();
+     if(verbose){System.err.println("Closed tsw "+fname);}
+ }
+
+ /**
+  * Writes per-base coverage in concise form, omitting zero-coverage positions:
+  * #scafname
+  * position TAB coverage
+  * position TAB coverage
+  * The "#scafname" line is written for every scaffold; the position lines only for
+  * scaffolds of at least minscaf bases. No column header is written.
+  * Does nothing when fname is null, or for strand>0 in unstranded mode.
+  * @param fname Output filename
+  * @param list List of reference scaffolds
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Minimum scaffold length for position lines
+  */
+ public static void writeCoveragePerBaseConcise(String fname, ArrayList<Scaffold> list, int strand, int minscaf){
+     if(fname==null || (!STRANDED && strand>0)){return;}
+
+     if(verbose){System.err.println("Starting tsw "+fname);}
+     final ByteStreamWriter out=new ByteStreamWriter(fname, overwrite, false, true);
+     out.start();
+     if(verbose){System.err.println("Started tsw "+fname);}
+
+     for(final Scaffold scaf : list){
+         out.print('#');
+         out.println(scaf.name);
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         if(scaf.length<minscaf){continue;}
+
+         for(int pos=0; pos<scaf.length; pos++){
+             final int depth=(cov==null ? 0 : cov.get(pos));
+             if(depth<=0){continue;}
+             out.print(pos);
+             out.print('\t');
+             out.println(depth);
+         }
+     }
+
+     if(verbose){System.err.println("Closing tsw "+fname);}
+     out.poisonAndWait();
+     if(verbose){System.err.println("Closed tsw "+fname);}
+ }
+
+ /**
+  * Writes average coverage per fixed-width bin.
+  * Only complete bins are written: trailing bases of each scaffold (length modulo
+  * binsize) are dropped, and scaffolds shorter than binsize or minscaf are skipped.
+  * For example, with binsize 1000, the last 500 bases of a 1500 base scaffold are ignored.
+  * Columns: name, mean coverage of the bin, 1-based end position of the bin, running
+  * offset of the bin start over all written bins.
+  * @param fname Output filename
+  * @param list List of reference scaffolds
+  * @param binsize Width of coverage bins in bp
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Minimum scaffold length to print
+  */
+ public static void writeCoveragePerBaseBinned(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf){
+     if(fname==null || (!STRANDED && strand>0)){return;}
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, false, false);
+     out.start();
+     if(printHeader){
+         final String prefix=(headerPound ? "#" : "");
+         if(calcCovStdev){
+             final double[] meanAndDev=standardDeviation(list, strand, minscaf);
+             if(meanAndDev!=null){
+                 out.print(prefix+"Mean\t"+String.format("%.3f", meanAndDev[0])+"\n");
+                 out.print(prefix+"STDev\t"+String.format("%.3f", meanAndDev[1])+"\n");
+             }
+         }
+         out.print(prefix+"RefName\tCov\tPos\tRunningPos\n");
+     }
+
+     long offset=0;
+     final float perBase=1f/binsize;
+     for(final Scaffold scaf : list){
+         if(scaf.length<binsize || scaf.length<minscaf){continue;}
+
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         int binEnd=binsize-1; //Index of the last base of the current bin
+         long binSum=0;
+         for(int pos=0; pos<scaf.length; pos++){
+             binSum+=(cov==null ? 0 : cov.get(pos));
+             if(pos<binEnd){continue;}
+             out.print(String.format("%s\t%.2f\t%d\t%d\n", scaf.name, binSum*perBase, (pos+1), offset));
+             offset+=binsize;
+             binEnd+=binsize;
+             binSum=0;
+         }
+     }
+
+     out.poisonAndWait();
+ }
+
+ /**
+  * Writes average coverage per bin, keeping the short final bin of each scaffold
+  * (nothing is truncated). Scaffolds shorter than minscaf are not printed, but their
+  * bases still advance the running position.
+  * Columns: name, mean coverage of the bin, 1-based end position of the bin, running
+  * offset of the bin start.
+  * @param fname Output filename
+  * @param list List of reference scaffolds
+  * @param binsize Width of coverage bins in bp
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Minimum scaffold length to print
+  */
+ public static void writeCoveragePerBaseBinned2(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf){
+     if(fname==null || (!STRANDED && strand>0)){return;}
+     final TextStreamWriter out=new TextStreamWriter(fname, overwrite, false, false);
+     out.start();
+     if(printHeader){
+         final String prefix=(headerPound ? "#" : "");
+         if(calcCovStdev){
+             final double[] meanAndDev=standardDeviationBinned(list, binsize, strand, minscaf);
+             if(meanAndDev!=null){
+                 out.print(prefix+"Mean\t"+String.format("%.3f", meanAndDev[0])+"\n");
+                 out.print(prefix+"STDev\t"+String.format("%.3f", meanAndDev[1])+"\n");
+             }
+         }
+         out.print(prefix+"RefName\tCov\tPos\tRunningPos\n");
+     }
+
+     long offset=0;
+     for(final Scaffold scaf : list){
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         final boolean printable=(scaf.length>=minscaf);
+         final int finalIndex=scaf.length-1;
+         int prevBinEnd=-1;
+         int binEnd=binsize-1;
+         long binSum=0;
+         for(int pos=0; pos<scaf.length; pos++){
+             binSum+=(cov==null ? 0 : cov.get(pos));
+             if(pos<binEnd && pos!=finalIndex){continue;}
+
+             final int width=pos-prevBinEnd; //Actual bin width; smaller than binsize for the last bin
+             if(printable){
+                 out.print(String.format("%s\t%.2f\t%d\t%d\n", scaf.name, binSum/(float)width, (pos+1), offset));
+             }
+             offset+=width;
+             binEnd+=binsize;
+             prevBinEnd=pos;
+             binSum=0;
+         }
+     }
+
+     out.poisonAndWait();
+ }
+
+ /**
+  * Computes summary statistics of binned coverage by storing every bin's depth in a list.
+  * Unsafe because it will fail if there are over 2 billion bins.
+  * Depths are stored as integers in tenths (10*depth, truncated) and scaled back by 0.1.
+  * @param fname Not used
+  * @param scaffolds List of reference scaffolds
+  * @param binsize Width of coverage bins in bp; a short final bin is kept
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Scaffolds shorter than this contribute no bins
+  * @return {mean, median, mode, stdev}
+  */
+ public static double[] standardDeviationBinnedUnsafe(String fname, ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf){
+     final LongList depths=new LongList();
+
+     for(final Scaffold scaf : scaffolds){
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         final boolean counted=(scaf.length>=minscaf);
+         final int finalIndex=scaf.length-1;
+         int prevBinEnd=-1;
+         int binEnd=binsize-1;
+         long binSum=0;
+         for(int pos=0; pos<scaf.length; pos++){
+             binSum+=(cov==null ? 0 : cov.get(pos));
+             if(pos<binEnd && pos!=finalIndex){continue;}
+
+             final int width=pos-prevBinEnd;
+             if(counted){
+                 depths.add((int)(10*(binSum/(double)width)));
+             }
+             binEnd+=binsize;
+             prevBinEnd=pos;
+             binSum=0;
+         }
+     }
+     depths.sort();
+
+     final double mean=0.1*depths.mean();
+     final double median=0.1*depths.median();
+     final double mode=0.1*depths.mode();
+     final double stdev=0.1*depths.stdev();
+     return new double[] {mean, median, mode, stdev};
+ }
+
+ /**
+  * Two-pass mean and (population) standard deviation of binned coverage.
+  * Bins are binsize wide with a short final bin per scaffold; every bin has equal weight
+  * regardless of width. Scaffolds shorter than minscaf are ignored.
+  * @param scaffolds List of reference scaffolds
+  * @param binsize Width of coverage bins in bp
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Minimum scaffold length to include
+  * @return {mean, stdev}; {0, 0} when there are no bins
+  */
+ public static double[] standardDeviationBinned(ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf){
+     //Pass 1: sum of bin depths and number of bins.
+     double depthTotal=0;
+     long binCount=0;
+     for(final Scaffold scaf : scaffolds){
+         if(scaf.length<minscaf){continue;}
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         final int finalIndex=scaf.length-1;
+         int prevBinEnd=-1;
+         int binEnd=binsize-1;
+         long binSum=0;
+         for(int pos=0; pos<scaf.length; pos++){
+             binSum+=(cov==null ? 0 : cov.get(pos));
+             if(pos<binEnd && pos!=finalIndex){continue;}
+             final int width=pos-prevBinEnd;
+             final double binDepth=(binSum/(double)width);
+             depthTotal+=binDepth;
+             binCount++;
+             binEnd+=binsize;
+             prevBinEnd=pos;
+             binSum=0;
+         }
+     }
+
+     if(binCount<1){return new double[] {0, 0};}
+     final double mean=depthTotal/(double)binCount;
+
+     //Pass 2: squared deviations from the mean, over the same bins.
+     double squaredDevTotal=0;
+     for(final Scaffold scaf : scaffolds){
+         if(scaf.length<minscaf){continue;}
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         final int finalIndex=scaf.length-1;
+         int prevBinEnd=-1;
+         int binEnd=binsize-1;
+         long binSum=0;
+         for(int pos=0; pos<scaf.length; pos++){
+             binSum+=(cov==null ? 0 : cov.get(pos));
+             if(pos<binEnd && pos!=finalIndex){continue;}
+             final int width=pos-prevBinEnd;
+             final double binDepth=(binSum/(double)width);
+             final double delta=mean-binDepth;
+             squaredDevTotal+=(delta*delta);
+             binEnd+=binsize;
+             prevBinEnd=pos;
+             binSum=0;
+         }
+     }
+
+     final double stdev=Math.sqrt(squaredDevTotal/binCount);
+     return new double[] {mean, stdev};
+ }
+
+ /**
+  * Two-pass mean and (population) standard deviation of per-base coverage over all
+  * scaffolds of at least minscaf bases. Scaffolds without a coverage array count as
+  * zero coverage at every position.
+  * @param scaffolds List of reference scaffolds
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Minimum scaffold length to include
+  * @return {mean, stdev}; {0, 0} when no bases qualify
+  */
+ public static double[] standardDeviation(ArrayList<Scaffold> scaffolds, int strand, int minscaf){
+     //Pass 1: total depth and number of positions.
+     long depthTotal=0;
+     long positions=0;
+     for(final Scaffold scaf : scaffolds){
+         if(scaf.length<minscaf){continue;}
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         positions+=scaf.length;
+         for(int pos=0; pos<scaf.length; pos++){
+             depthTotal+=(cov==null ? 0 : cov.get(pos));
+         }
+     }
+
+     if(positions<1){return new double[] {0, 0};}
+     final double mean=depthTotal/(double)positions;
+
+     //Pass 2: squared deviations, summed per scaffold first and then added to the total.
+     double squaredDevTotal=0;
+     for(final Scaffold scaf : scaffolds){
+         if(scaf.length<minscaf){continue;}
+         final CoverageArray cov=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+         double scafSquaredDev=0;
+         for(int pos=0; pos<scaf.length; pos++){
+             final int depth=(cov==null ? 0 : cov.get(pos));
+             final double delta=mean-depth;
+             scafSquaredDev+=(delta*delta);
+         }
+         squaredDevTotal+=scafSquaredDev;
+     }
+
+     final double stdev=Math.sqrt(squaredDevTotal/positions);
+     return new double[] {mean, stdev};
+ }
+
+
+ /**
+  * Writes binned coverage with optional length and/or height normalization.
+  * When NORMALIZE_LENGTH_BINS is positive, each scaffold is divided into that many
+  * equal bins (the binsize argument is then replaced per scaffold). When
+  * NORMALIZE_COVERAGE is set, each bin's coverage sum is divided by the largest bin sum
+  * of the same scaffold; otherwise it is divided by the bin width.
+  * Scaffolds shorter than the bin width or minscaf are skipped, and trailing bases that
+  * do not fill a bin are not written.
+  * Columns: name, 1-based bin number, coverage value, 1-based end position, running position.
+  *
+  * Fix: the pre-pass that finds the largest bin sum no longer advances the running
+  * position. It used to, so with NORMALIZE_COVERAGE the RunningPos column was advanced
+  * twice for every bin.
+  *
+  * @param fname Output filename
+  * @param list List of reference scaffolds
+  * @param binsize Width of coverage bins in bp
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Minimum scaffold length to print
+  */
+ public static void writeCoveragePerBaseNormalized(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf){
+     if(fname==null || (!STRANDED && strand>0)){return;}
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
+     tsw.start();
+     if(printHeader){
+         String pound=(headerPound ? "#" : "");
+         tsw.print(pound+"RefName\tBin\tCov\tPos\tRunningPos\n");
+     }
+
+     double running=0;
+     double invbin=1.0/binsize;
+     //Only used when NORMALIZE_LENGTH_BINS>0.
+     final double invbincount=1.0/NORMALIZE_LENGTH_BINS;
+     for(Scaffold scaf : list){
+         if(NORMALIZE_LENGTH_BINS>0){
+             binsize=scaf.length*invbincount;
+             invbin=1.0/binsize;
+         }
+
+         if(scaf.length>=binsize && scaf.length>=minscaf){
+             long max=-1;
+
+             final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+             double nextPos=binsize-1;
+             long sum=0;
+
+             if(NORMALIZE_COVERAGE){
+                 //Pre-pass: find the largest bin sum of this scaffold. Must not touch 'running'.
+                 for(int i=0; i<scaf.length; i++){
+                     int x=(ca==null ? 0 : ca.get(i));
+                     sum+=x;
+                     if(i>=nextPos){
+                         max=Tools.max(sum, max);
+                         nextPos+=binsize;
+                         sum=0;
+                     }
+                 }
+                 nextPos=binsize-1;
+                 sum=0;
+                 assert(max>-1) : max;
+             }
+             //Avoid dividing by zero for scaffolds without any coverage.
+             max=Tools.max(max, 1);
+             final double binmult=(NORMALIZE_COVERAGE ? 1d/max : invbin);
+
+             final String formatString=NORMALIZE_COVERAGE ? "%s\t%d\t%.5f\t%d\t%d\n" : "%s\t%d\t%.2f\t%d\t%d\n";
+             int bin=1;
+             for(int i=0; i<scaf.length; i++){
+                 int x=(ca==null ? 0 : ca.get(i));
+                 sum+=x;
+                 if(i>=nextPos){
+                     tsw.print(String.format(formatString, scaf.name, bin, sum*binmult, (i+1), (long)running));
+                     bin++;
+                     running+=binsize;
+                     nextPos+=binsize;
+                     sum=0;
+                 }
+             }
+         }
+     }
+
+     tsw.poisonAndWait();
+ }
+
+
+
+ /**
+  * Writes a single length-normalized coverage profile aggregated over all qualifying scaffolds.
+  * Each scaffold is divided into NORMALIZE_LENGTH_BINS equal-width bins.  For scaffolds with readhits>0,
+  * each bin contributes its mean depth to AbsCov and its sum divided by the scaffold's largest bin sum to NormCov.
+  * AbsCov is then averaged over all qualifying scaffolds (including those without hits), and NormCov is rescaled
+  * so that its largest value is 1.  If no scaffold had any coverage, NormCov is written as 0 rather than NaN.
+  * Does nothing if fname is null, or if strand>0 while STRANDED is false.
+  * @param fname Output filename
+  * @param list List of reference scaffolds
+  * @param binsize Width of coverage bins in bp; overridden per scaffold when NORMALIZE_LENGTH_BINS>0
+  * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
+  * @param minscaf Skip scaffolds shorter than this
+  */
+ public static void writeCoveragePerBaseNormalizedOverall(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf){
+     if(fname==null || (!STRANDED && strand>0)){return;}
+
+     assert(NORMALIZE_LENGTH_BINS>0) : "Must set 'normalizebins' flag to a positive integer.";
+     double invbin=1.0/binsize;
+     long usedScafs=0;
+     final double invbincount=1.0/NORMALIZE_LENGTH_BINS;
+
+     //Index 0 is unused; bins are numbered from 1
+     double[] normalized=new double[NORMALIZE_LENGTH_BINS+1];
+     double[] absolute=new double[NORMALIZE_LENGTH_BINS+1];
+
+     for(Scaffold scaf : list){
+         if(NORMALIZE_LENGTH_BINS>0){
+             binsize=scaf.length*invbincount;
+             invbin=1.0/binsize;
+         }
+
+         if(scaf.length>=binsize && scaf.length>=minscaf){
+             usedScafs++;
+
+             if(scaf.readhits>0){
+                 long max=-1;
+                 final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
+                 double nextPos=binsize-1;
+                 long sum=0;
+
+                 //First pass: find the largest bin sum for this scaffold
+                 for(int i=0; i<scaf.length; i++){
+                     int x=(ca==null ? 0 : ca.get(i));
+                     sum+=x;
+                     if(i>=nextPos){
+                         max=Tools.max(sum, max);
+                         nextPos+=binsize;
+                         sum=0;
+                     }
+                 }
+                 nextPos=binsize-1;
+                 sum=0;
+                 assert(max>-1) : max;
+
+                 max=Tools.max(max, 1); //Avoid dividing by zero (or by the -1 sentinel)
+                 final double binmult=1d/max;
+
+                 //Second pass: accumulate this scaffold's bins into the overall profile
+                 int bin=1;
+                 for(int i=0; i<scaf.length; i++){
+                     int x=(ca==null ? 0 : ca.get(i));
+                     sum+=x;
+                     if(i>=nextPos){
+                         normalized[bin]+=(sum*binmult);
+                         absolute[bin]+=(sum*invbin);
+                         bin++;
+                         nextPos+=binsize;
+                         sum=0;
+                     }
+                 }
+             }
+         }
+     }
+
+     TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
+     tsw.start();
+     if(printHeader){
+         String pound=(headerPound ? "#" : "");
+         tsw.print(pound+"RefName\tBin\tAbsCov\tNormCov\n");
+     }
+     double invScafs=1d/Tools.max(1, usedScafs);
+
+     final double maxNorm=Tools.max(normalized);
+     //With no coverage anywhere maxNorm is 0; 1/0 is Infinity and 0*Infinity is NaN, so use 0 as the multiplier instead
+     final double normMult=(maxNorm>0 ? 1/maxNorm : 0);
+
+     for(int bin=1; bin<normalized.length; bin++){
+         tsw.print(String.format("%s\t%d\t%.5f\t%.5f\n", "all", bin, absolute[bin]*invScafs, normalized[bin]*normMult));
+     }
+
+     tsw.poisonAndWait();
+ }
+
+
+
+ /**
+ * Write RPKM statistics.
+ */
+ public static void writeRPKM(String out, String in1, String in2, long readsIn, boolean printNonZeroOnly, ArrayList<Scaffold> list){
+ if(out==null){return;}
+ final TextStreamWriter tsw=new TextStreamWriter(out, overwrite, false, false);
+ tsw.start();
+
+ /* Count mapped reads */
+ long mappedReads=0;
+ long mappedFrags=0;
+ for(Scaffold scaf : list){
+ mappedReads+=scaf.readhits;
+ mappedFrags+=scaf.fraghits;
+ }
+ mappedFrags/=2;
+
+ /* Print header */
+ tsw.print("#File\t"+(in1==null ? "" : in1)+(in2==null ? "" : "\t"+in2)+"\n");
+ tsw.print(String.format("#Reads\t%d\n",readsIn));
+ tsw.print(String.format("#Mapped\t%d\n",mappedReads));
+ tsw.print(String.format("#RefSequences\t%d\n",list.size()));
+ tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\tFrags\tFPKM\n");
+
+ final float readMult=1000000000f/Tools.max(1, mappedReads);
+ final float fragMult=1000000000f/Tools.max(1, mappedFrags);
+
+ /* Print data */
+ for(final Scaffold scaf : list){
+ final long reads=scaf.readhits;
+ final long frags=scaf.fraghits/2;
+ final long bases=scaf.basehits;
+ final String s=scaf.name;
+ final int len=scaf.length;
+ final double invlen=1.0/Tools.max(1, len);
+ final double readMult2=readMult*invlen;
+ final double fragMult2=fragMult*invlen;
+ if(reads>0 || !printNonZeroOnly){
+ tsw.print(String.format("%s\t%d\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f\n",s,len,bases,bases*invlen,reads,reads*readMult2,frags,frags*fragMult2));
+ }
+ }
+ tsw.poisonAndWait();
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** The list of all scaffolds */
+ private ArrayList<Scaffold> list;
+ /** Maps names to scaffolds */
+ private HashMap<String, Scaffold> table;
+ /** Converts BBMap index coordinates to scaffold coordinates */
+ private final ScaffoldCoordinates coords=new ScaffoldCoordinates(), coords2=new ScaffoldCoordinates();
+
+ /** Mapping program name */
+ private String program=null;
+ /** Mapping program version */
+ private String version=null;
+
+ //Inputs
+ /** Primary input file (typically sam) */
+ public String in1=null;
+ /** Secondary input file (typically for coverage directly from BBMap) */
+ public String in2=null;
+ /** Optional, for calculating GC */
+ public String reference=null;
+ public String orffasta=null;
+
+ //Outputs
+ /** Stream unaltered sam input to this output */
+ public String outsam=null;
+ /** Coverage statistics, one line per scaffold */
+ public String covstats=null;
+ public String outorf=null;
+ /** Coverage histogram, one line per depth and one point per base */
+ public String histogram=null;
+ /** Coverage with one line per base */
+ public String basecov=null;
+ /** Coverage with one file per scaffold */
+ public String basecov_ps=null;
+ /** Coverage with one line per bin */
+ public String bincov=null;
+ /** Coverage with one line per bin, normalized by length and/or height */
+ public String normcov=null;
+ /** Coverage with one line per bin, normalized by length and/or height, for combined reference */
+ public String normcovOverall=null;
+ /** rpkm/fpkm output, similar to Seal */
+ public String outrpkm=null;
+
+ /** Typically indicates that a header line was encountered in an unexpected place, e.g. with concatenated sam files. */
+ private boolean error=false;
+
+ /** Total length of reference */
+ public long refBases=0;
+ public long mappedBases=0;
+ public long mappedReads=0;
+ public long readsProcessed=0;
+ public long totalCoveredBases1=0;
+ public long totalCoveredBases2=0;
+ public long scaffoldsWithCoverage1=0;
+ public long scaffoldsWithCoverage2=0;
+ public long totalScaffolds=0;
+
+ //Don't reset these variables when clearing.
+ public long maxReads=-1;
+ public int initialScaffolds=4096;
+ public int binsize=1000;
+ public boolean bits32=false;
+ public int minMapq=0;
+
+ /** Don't print coverage info for scaffolds shorter than this */
+ public int minscaf=0;
+
+ public HashMap<String, Object> pairTable=new HashMap<String, Object>();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Print verbose log messages */
+ public static boolean verbose=false;
+
+ /** Print headers in output files */
+ public static boolean printHeader=true;
+ /** Prepend '#' symbol to header lines */
+ public static boolean headerPound=true;
+ /** Calculate standard deviation of coverage */
+ public static boolean calcCovStdev=true;
+
+ /** Window size to use when calculating average coverage,
+ * for detecting contiguous low-coverage areas */
+ public static int LOW_COV_WINDOW=500;
+ /** Min average coverage to not be classified as low-depth */
+ public static double LOW_COV_DEPTH=5;
+ /** Print number of bases below a certain average coverage in a window */
+ public static boolean USE_WINDOW=false;
+
+ /** Track base composition of reads covering each scaffold */
+ public static boolean COUNT_GC=true;
+ /** Output in 2-column format ("#ID\tAvg_fold\n") */
+ public static boolean TWOCOLUMN=false;
+ /** Track coverage for strands independently */
+ public static boolean STRANDED=false;
+ /** Add scaffold information from the reference (in addition to sam header) */
+ public static boolean ADD_FROM_REF=false;
+ /** Only print scaffolds with nonzero coverage */
+ public static boolean NONZERO_ONLY=false;
+ /** Store coverage info in numeric arrays */
+ public static boolean USE_COVERAGE_ARRAYS=true;
+ /** Store coverage info in bitsets */
+ public static boolean USE_BITSETS=false;
+ /** Only print lines when coverage changes (for compatibility with Jigsaw) */
+ public static boolean DELTA_ONLY=false;
+ /** Process secondary alignments */
+ public static boolean USE_SECONDARY=true;
+ /** Include coverage of unsequenced middle portion of pairs */
+ public static boolean PHYSICAL_COVERAGE=false;
+ /** Use 'tlen' field when calculating physical coverage */
+ public static boolean USE_TLEN=true;
+ /** Abort on error; otherwise, errors may be ignored */
+ public static boolean ABORT_ON_ERROR=true;
+ /** Print coverage for the last bin of a scaffold, even if it is shorter than binsize */
+ public static boolean KEEP_SHORT_BINS=true;
+ /** Only track coverage for start location */
+ public static boolean START_ONLY=false;
+ /** NOTE(review): the previous comment here was a copy of START_ONLY's and did not describe this flag. Presumably it selects a more compact output format - confirm against the code that reads CONCISE. */
+ public static boolean CONCISE=false;
+ /** Normalize coverage by expression contig coverage as a fraction of its max coverage */
+ public static boolean NORMALIZE_COVERAGE=false;
+ /** Normalize contig length by binning into this many bins per contig */
+ public static int NORMALIZE_LENGTH_BINS=100;
+ /** Include soft-clipped bases in coverage */
+ public static boolean INCLUDE_SOFT_CLIP=false;
+ /** Include deletions/introns in coverage */
+ public static boolean INCLUDE_DELETIONS=true; //TODO: Not enabled; use BBMask increment method to implement.
+
+ /** Translation array for tracking base counts */
+ private static final byte[] charToNum=AssemblyStats2.makeCharToNum();
+
+ private static final int NOTHING_MODE=0, BITSET_MODE=1, ARRAY_MODE=2;
+
+}
diff --git a/current/jgi/CrossContaminate.java b/current/jgi/CrossContaminate.java
new file mode 100755
index 0000000..1a3e156
--- /dev/null
+++ b/current/jgi/CrossContaminate.java
@@ -0,0 +1,513 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Random;
+
+import jgi.Shuffle.ShuffleThread;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+
+/**
+ * Generates artificial cross-contaminated data by mixing reads.
+ * Takes input from multiple files, and writes output to the same number of files.
+ * @author Brian Bushnell
+ * @date Oct 27, 2014
+ *
+ */
+public class CrossContaminate {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Program entry point: starts a timer, builds a CrossContaminate from the arguments and runs it.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     new CrossContaminate(args).process(timer);
+ }
+
+ /**
+  * Parses command-line arguments, validates the input/output file lists, and opens one output Vessel per output name.
+  * Also adjusts several global static settings (read buffer length, pigz/unpigz usage, zip threads, ByteFile mode).
+  * @param args Command-line arguments in key=value form
+  * @throws RuntimeException if there are no inputs, the input and output counts differ, or files fail the input/output checks
+  */
+ public CrossContaminate(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+
+ //Cap the read buffer length at 200
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ //Names given via innamefile/outnamefile; expanded by parseStringsFromFiles below
+ ArrayList<String> inTemp=new ArrayList<String>();
+ ArrayList<String> outTemp=new ArrayList<String>();
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //NOTE(review): only split[0] and split[1] are used, so a value that itself contains '=' is truncated at the second '='
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCommon(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("in")){
+ //Comma-delimited list.  NOTE(review): b is null for "in", "in=" or "in=null", so b.split throws NullPointerException; same for the three list flags below
+ String[] split2=b.split(",");
+ for(String name : split2){
+ inNames.add(name);
+ }
+ }else if(a.equals("out")){
+ String[] split2=b.split(",");
+ for(String name : split2){
+ outNames.add(name);
+ }
+ }else if(a.equals("innamefile")){
+ String[] split2=b.split(",");
+ for(String name : split2){
+ inTemp.add(name);
+ }
+ }else if(a.equals("outnamefile")){
+ String[] split2=b.split(",");
+ for(String name : split2){
+ outTemp.add(name);
+ }
+ }else if(a.equals("shuffle")){
+ shuffle=Tools.parseBoolean(b);
+ }else if(a.equals("seed")){
+ seed=Long.parseLong(b);
+ }else if(a.equals("minsinks") || a.equals("ns")){
+ minSinks=Integer.parseInt(b);
+ }else if(a.equals("maxsinks") || a.equals("xs")){
+ maxSinks=Integer.parseInt(b);
+ }else if(a.equals("minprob") || a.equals("np")){
+ minProb=Double.parseDouble(b);
+ }else if(a.equals("maxprob") || a.equals("xp")){
+ maxProb=Double.parseDouble(b);
+ }else if(a.equals("showspeed")){
+ showspeed=Tools.parseBoolean(b);
+ }else if(a.equals("shufflethreads")){
+ shufflethreads=Integer.parseInt(b);
+ }else{
+ //Unknown flags abort only when assertions are enabled; otherwise they are reported and ignored
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=parser.overwrite;
+ append=parser.append;
+
+ //NOTE(review): setInterleaved is assigned here but never read afterwards in this constructor
+ setInterleaved=parser.setInterleaved;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Presumably replaces each listed file name with the names contained in that file, in place - confirm in DecontaminateByNormalization
+ DecontaminateByNormalization.parseStringsFromFiles(inTemp);
+ DecontaminateByNormalization.parseStringsFromFiles(outTemp);
+
+ inNames.addAll(inTemp);
+ outNames.addAll(outTemp);
+ inTemp=outTemp=null;
+
+ if(inNames.isEmpty() || inNames.size()!=outNames.size()){
+ assert(false) : inNames+"\n"+outNames;
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required, and # input files must equal # output files.");
+ }
+
+ //Clamp the sink count range to [0, number of other files]
+ //NOTE(review): with a single input file maxSinks becomes 0 while minSinks defaults to 1, so the second assertion fails when assertions are enabled
+ assert(minSinks<=maxSinks);
+ minSinks=Tools.max(0, minSinks);
+ maxSinks=Tools.min(inNames.size()-1, maxSinks);
+ assert(minSinks<=maxSinks) : minSinks+", "+maxSinks;
+
+ assert(minProb<=maxProb);
+ assert(minProb>=0 && maxProb<=1);
+
+ //Natural logs of the probability bounds, recomputed because minProb/maxProb may have been changed by flags
+ minProbPow=Math.log(minProb);
+ maxProbPow=Math.log(maxProb);
+
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(!Tools.testInputFiles(true, true, inNames.toArray(new String[0]))){
+ outstream.println(outNames);
+ throw new RuntimeException("Can't find some input files:\n"+inNames+"\n");
+ }
+
+ if(!Tools.testOutputFiles(overwrite, append, false, outNames.toArray(new String[0]))){
+ outstream.println(outNames);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files.\n");
+ }
+
+ //Only positive seeds are applied; seed=0 or a negative seed leaves the generator with its default seeding
+ if(seed>0){randy.setSeed(seed);}
+
+ //Opens and starts a writer for every output file
+ vessels=makeVessels(outNames);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ void process(Timer t){
+
+ outstream.println("Processing data.");
+ for(int i=0; i<inNames.size(); i++){
+ try{
+ processOneSource(i);
+ }catch(Throwable e){
+ System.err.println("Failed to open file "+inNames.get(i)+"\nException:"+e+"\n");
+ errorState=true;
+ }
+ }
+
+ for(Vessel v : vessels){
+ errorState|=v.close();
+ }
+
+ if(shuffle){
+ shuffle(shufflethreads);
+ }
+
+ t.stop();
+
+ if(showspeed){
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ }
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ void shuffle(final int threads){
+ outstream.println("Shuffling output in "+threads+" thread"+(threads==1 ? "." : "s."));
+ Shuffle.showSpeed=Shuffle.printClass=false;
+ Shuffle.setMaxThreads(threads);
+ for(Vessel v : vessels){
+ ShuffleThread st=new ShuffleThread(v.fname, null, v.fname, null, Shuffle.SHUFFLE, true);
+ st.start();
+ }
+ Shuffle.waitForFinish();
+ }
+
+ /**
+  * Reads one input file and sends each read to one of the vessels chosen for this source by assignSinks.
+  * Updates readsProcessed, basesProcessed and errorState.
+  * @param sourceNum Index of the input file in inNames
+  */
+ void processOneSource(int sourceNum){
+     final String fname=inNames.get(sourceNum);
+     final FileFormat ffin=FileFormat.testInput(fname, FileFormat.FASTQ, null, true, true);
+
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin, null);
+     if(verbose){outstream.println("Started cris");}
+     cris.start(); //4567
+
+     final boolean paired=cris.paired();
+     if(verbose && !ffin.samOrBam()){
+         outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));
+     }
+
+     final ArrayList<Vessel> sinks=assignSinks(vessels, sourceNum);
+
+     ListNum<Read> batch=cris.nextList();
+     ArrayList<Read> reads=(batch==null ? null : batch.list);
+
+     //Sanity check: the first read has a mate exactly when the stream reports paired input
+     if(reads!=null && !reads.isEmpty()){
+         assert((reads.get(0).mate!=null)==paired);
+     }
+
+     while(reads!=null && !reads.isEmpty()){
+
+         for(final Read r1 : reads){
+             final int len1=r1.length();
+             final int len2=r1.mateLength();
+
+             readsProcessed++;
+             basesProcessed+=len1;
+             if(r1.mate!=null){
+                 readsProcessed++;
+                 basesProcessed+=len2;
+             }
+
+             addRead(r1, sinks);
+         }
+
+         //Hand the finished batch back before fetching the next one
+         cris.returnList(batch.id, batch.list.isEmpty());
+         batch=cris.nextList();
+         reads=(batch==null ? null : batch.list);
+     }
+     if(batch!=null){
+         cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+     }
+
+     final boolean closeFailed=ReadWrite.closeStream(cris);
+     errorState=(errorState | closeFailed);
+ }
+
+ private void addRead(Read r, ArrayList<Vessel> list){
+ double p=randy.nextDouble();
+ for(Vessel v : list){
+ if(p>=v.prob){
+ v.bsw.println(r, true);
+ r=null;
+ break;
+ }
+ }
+ assert(r==null) : p+"\n"+list;
+ }
+
+ private ArrayList<Vessel> makeVessels(ArrayList<String> strings){
+ ArrayList<Vessel> list=new ArrayList<Vessel>(strings.size());
+ for(String s : strings){
+ Vessel v=new Vessel(s, true);
+ list.add(v);
+ }
+ return list;
+ }
+
+ /**
+  * Chooses which vessels will receive reads from one source, and gives each a probability threshold.
+  * A random number of sinks (between minSinks and maxSinks) is picked from the vessels other than the source's own.
+  * Each sink takes a random fraction of the probability mass still unassigned; the source's own vessel gets the rest.
+  * The returned list is ordered by descending threshold and ends with a threshold of 0.
+  * @param list All vessels; the vessel at index sourceNum is treated as the source's own
+  * @param sourceNum Index of the source in inNames and in list
+  * @return The source's vessel followed by the chosen sinks, with prob set to each vessel's threshold
+  */
+ private ArrayList<Vessel> assignSinks(ArrayList<Vessel> list, int sourceNum){
+ //Every vessel except the source's own could be a sink
+ int potential=list.size()-1;
+ assert(potential>=minSinks && maxSinks<=potential) : potential+", "+minSinks+", "+maxSinks;
+ int range=maxSinks-minSinks+1;
+
+ int sinks=minSinks+(range>0 ? randy.nextInt(range) : 0);
+ assert(sinks>=0);
+
+ //Clear thresholds left over from the previous source
+ for(Vessel v : list){v.prob=0;}
+ ArrayList<Vessel> sinklist=(ArrayList<Vessel>) list.clone();
+ list=null;
+ Vessel source=sinklist.remove(sourceNum);
+ if(verbose || true){
+ System.err.println("Source: \t"+inNames.get(sourceNum));
+ System.err.println("Sinks: \t"+sinks);
+ }
+
+ //Discard random candidates until only 'sinks' remain, overwriting the victim with the last element and dropping the tail
+ while(sinklist.size()>sinks){
+ int x=randy.nextInt(sinklist.size());
+ sinklist.set(x, sinklist.get(sinklist.size()-1));
+ sinklist.remove(sinklist.size()-1);
+ }
+// if(verbose){System.err.println("Sinklist:\n"+sinklist);}
+
+ {
+ double probRange=maxProbPow-minProbPow;
+
+ assert(probRange>=0) : minProb+", "+maxProb+", "+minProbPow+", "+maxProbPow+", "+probRange;
+
+ double remaining=1.0;
+ for(Vessel v : sinklist){
+ //e raised to a uniform draw between minProbPow and maxProbPow, scaled by the mass not yet handed out
+ double c=Math.pow(Math.E, minProbPow+randy.nextDouble()*probRange)*remaining;
+ remaining-=c;
+ v.prob=c;
+ }
+ //The source keeps whatever was not given to sinks
+ source.prob=remaining;
+ sinklist.add(source);
+ if(verbose || true){System.err.println("Sinklist:\t"+sinklist+"\n");}
+ //Convert individual probabilities to running totals
+ double d=0;
+ for(Vessel v : sinklist){
+ d+=v.prob;
+ v.prob=d;
+ }
+// if(verbose){System.err.println("Sinklist:\t"+sinklist);}
+ //Shift the running totals one place to the right, so each vessel holds the total of those before it (the first holds 0)
+ d=0;
+ for(Vessel v : sinklist){
+ double temp=v.prob;
+ v.prob=d;
+ d=temp;
+ }
+// if(verbose){System.err.println("Sinklist:\t"+sinklist);}
+ }
+ //Largest threshold first, so a scan that takes the first vessel with draw>=prob always ends at the 0 threshold
+ Collections.reverse(sinklist);
+ assert(sinklist.get(sinklist.size()-1).prob==0.0) : sinklist;
+
+// if(verbose){
+// System.err.println("Sinklist:\t"+sinklist);
+// System.err.println();
+// }
+ if(verbose || true){System.err.println();}
+
+ return sinklist;
+ }
+
+ /**
+  * Placeholder for usage output.  Fails an assertion when assertions are enabled and otherwise does nothing.
+  * The commented-out text below describes a different program (jgi.ReformatReads).
+  */
+ private void printOptions(){
+ assert(false) : "printOptions: TODO";
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** An output file paired with its writer and the probability threshold used when distributing reads. */
+ private class Vessel{
+
+     /** Output file name */
+     final String fname;
+     /** Format descriptor for the output file */
+     final FileFormat ff;
+     /** Writer for the output file; started by the constructor */
+     final ByteStreamWriter bsw;
+
+     /** Probability value assigned per source by assignSinks */
+     double prob;
+
+     /**
+      * Opens and starts a writer for the given file.
+      * @param fname_ Output file name
+      * @param allowSubprocess Passed through to FileFormat.testOutput
+      */
+     public Vessel(String fname_, boolean allowSubprocess){
+         this.fname=fname_;
+         this.ff=FileFormat.testOutput(this.fname, FileFormat.FASTQ, null, allowSubprocess, overwrite, append, false);
+         this.bsw=new ByteStreamWriter(this.ff);
+         this.bsw.start();
+     }
+
+     /**
+      * Finishes writing and waits for the writer to terminate.
+      * @return The value returned by the writer's poisonAndWait (OR-ed into errorState by the caller)
+      */
+     public boolean close(){
+         final boolean result=bsw.poisonAndWait();
+         return result;
+     }
+
+     /** @return The file name and the probability to six decimal places */
+     @Override
+     public String toString(){
+         final String formattedProb=String.format("%.6f", prob);
+         return fname+", "+formattedProb;
+     }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /*--------------------------------------------------------------*/
+
+ private ArrayList<String> inNames=new ArrayList<String>();
+ private ArrayList<String> outNames=new ArrayList<String>();
+
+ private ArrayList<Vessel> vessels;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+ private long seed=-1;
+
+ private int minSinks=1;
+ private int maxSinks=8;
+ private double minProb=0.000005;
+ private double maxProb=0.025;
+
+ private double minProbPow=Math.log(minProb);
+ private double maxProbPow=Math.log(maxProb);
+
+// private double root=3.0;
+//
+// private double minProbRoot=Math.pow(minProb, 1/root);
+// private double maxProbRoot=Math.pow(maxProb, 1/root);
+
+ private final Random randy=new Random();
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ private int shufflethreads=3;
+
+ private boolean shuffle=false;
+ private boolean showspeed=true;
+
+ /*--------------------------------------------------------------*/
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+}
diff --git a/current/jgi/CutPrimers.java b/current/jgi/CutPrimers.java
new file mode 100755
index 0000000..417857e
--- /dev/null
+++ b/current/jgi/CutPrimers.java
@@ -0,0 +1,241 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+
+import align2.ListNum;
+import align2.Tools;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SamLine;
+
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * Uses two sam files defining primer mapping locations to cut the primed sequence out of the reference.
+ * @author Brian Bushnell
+ * @date Nov 24, 2014
+ *
+ */
+public class CutPrimers {
+
+ /**
+  * Program entry point: starts a timer, builds a CutPrimers from the arguments and runs it.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     new CutPrimers(args).process(timer);
+ }
+
+ /**
+  * Parses command-line arguments and prepares the input and output file formats.
+  * Flags handled here: sam1, sam2, fake/addfake, and include/includeprimer/includeprimers; the rest are delegated to Parser.
+  * @param args Command-line arguments in key=value form
+  */
+ public CutPrimers(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ //printOptions throws, so System.exit below is not reached
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //NOTE(review): only split[0] and split[1] are used, so a value that itself contains '=' is truncated at the second '='
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else if(a.equals("sam1")){
+ sam1=b;
+ }else if(a.equals("sam2")){
+ sam2=b;
+ }else if(a.equals("fake") || a.equals("addfake")){
+ ADD_FAKE_READS=Tools.parseBoolean(b);
+ }else if(a.equals("include") || a.equals("includeprimer") || a.equals("includeprimers")){
+ INCLUDE_PRIMERS=Tools.parseBoolean(b);
+ }else{
+ //Unknown flags abort only when assertions are enabled; otherwise they are reported and ignored
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ in1=parser.in1;
+ out1=parser.out1;
+ }
+
+ //NOTE(review): the boolean arguments are literals here; parser.overwrite and parser.append are not consulted - confirm this is intended
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, true, false, false);
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ }
+
+ /**
+  * For each input sequence, looks up the alignments recorded for it in sam1 and sam2 and, when both exist
+  * and do not overlap, outputs the subsequence between them (or spanning them, if INCLUDE_PRIMERS is set).
+  * When nothing is produced for a sequence and ADD_FAKE_READS is set, a single-base 'N' read is output instead.
+  * @param t Timer started by the caller; stopped here and used for the speed report
+  */
+ void process(Timer t){
+
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+ if(verbose){outstream.println("Started cris");}
+ cris.start(); //4567
+ }
+ //NOTE(review): 'paired' is never read; cris.paired() is called again directly below
+ boolean paired=cris.paired();
+
+ final ConcurrentReadOutputStream ros;
+ if(out1!=null){
+ final int buff=4;
+
+ if(cris.paired() && (in1==null || !in1.contains(".sam"))){
+ outstream.println("Writing interleaved.");
+ }
+
+ //NOTE(review): both halves of this condition are identical
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
+ ros.start();
+ }else{ros=null;}
+
+ //Alignments from each sam file, keyed by RNAME (see toSamLines)
+ LinkedHashMap<String, SamLine> p1set=toSamLines(sam1);
+ LinkedHashMap<String, SamLine> p2set=toSamLines(sam2);
+ long readsProcessed=0, readsSuccess=0;
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+ ArrayList<Read> readsOut=new ArrayList<Read>(reads.size());
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(int idx=0; idx<reads.size(); idx++){
+ readsProcessed++;
+ final Read r=reads.get(idx);
+
+ //The sequence's id is matched against the RNAME keys of the two maps
+ final SamLine sl1=p1set.get(r.id);
+ final SamLine sl2=p2set.get(r.id);
+
+ //Used below to detect whether this sequence produced any output
+ int oldSize=readsOut.size();
+
+ final int len=r.length();
+ if(sl1!=null && sl2!=null){
+ //Start (a) and stop (b) of each alignment, passed through Tools.mid with bounds 0 and len
+ //NOTE(review): presumably Tools.mid clamps to [0, len]; if so a stop can equal len, and "b+1" below can exceed the sequence length - confirm
+ final int a1=Tools.mid(0, len, sl1.start(true, false));
+ final int a2=Tools.mid(0, len, sl2.start(true, false));
+ final int b1=Tools.mid(0, len, sl1.stop(a1, true, false));
+ final int b2=Tools.mid(0, len, sl2.stop(a2, true, false));
+ if(Tools.overlap(a1, b1, a2, b2)){
+ //Overlapping alignments: nothing is cut
+
+ }else{
+
+ //Half-open range [from, to) to copy out of the sequence
+ final int from, to;
+ if(INCLUDE_PRIMERS){
+ //From the start of the leftmost alignment through the end of the rightmost
+ if(a1<a2){
+ from=a1;
+ to=b2+1;
+ }else{
+ from=a2;
+ to=b1+1;
+ }
+ }else{
+ //Only the bases strictly between the two alignments
+ if(a1<a2){
+ from=b1+1;
+ to=a2;
+ }else{
+ from=b2+1;
+ to=a1;
+ }
+ }
+
+ assert(from>=0 && from<r.bases.length && to>=from) : from+", "+to+", "+r.bases.length+"\n"+
+ new String(r.bases)+"\n"+sl1+"\n"+sl2+"\n";
+ final byte[] bases=Arrays.copyOfRange(r.bases, from, to);
+ final byte[] quals=(r.quality==null ? null : Arrays.copyOfRange(r.quality, from, to));
+ readsOut.add(new Read(bases, -1, (byte)0, -1, -1, r.id, quals, r.numericID));
+ readsSuccess++;
+ }
+ }
+
+ //Nothing was produced for this sequence; optionally emit a one-base placeholder
+ if(oldSize==readsOut.size() && ADD_FAKE_READS){
+ readsOut.add(new Read(new byte[] {'N'}, -1, (byte)0, -1, -1, r.id, null, r.numericID));
+ }
+ }
+
+ if(ros!=null){ros.add(readsOut, ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+ //NOTE(review): the return value of closeStreams is ignored here
+ ReadWrite.closeStreams(cris, ros);
+ if(verbose){outstream.println("Finished.");}
+
+ t.stop();
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+readsProcessed+" \t"+String.format("%.2fk reads/sec", (readsProcessed/(double)(t.elapsed))*1000000));
+ outstream.println("Sequences Generated: "+readsSuccess);
+ }
+
+ /**
+  * Loads a sam file into a map keyed by reference name (RNAME).
+  * Header lines (those starting with '@') are skipped.  If several lines share an RNAME, the last one read is kept.
+  * The file is closed even if a line fails to parse.
+  * @param fname Path of the sam file
+  * @return Map from RNAME to SamLine, iterating in order of first appearance
+  */
+ public static LinkedHashMap<String, SamLine> toSamLines(String fname){
+     final LinkedHashMap<String, SamLine> list=new LinkedHashMap<String, SamLine>();
+     final TextFile tf=new TextFile(fname);
+     try{
+         for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             if(!s.startsWith("@")){
+                 SamLine sl=new SamLine(s);
+                 list.put(new String(sl.rname()), sl);
+             }
+         }
+     }finally{
+         //Release the file handle on both the normal and the exceptional path
+         tf.close();
+     }
+     return list;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Placeholder for usage output; not implemented.
+  * @throws RuntimeException always
+  */
+ private void printOptions(){
+ throw new RuntimeException("printOptions: TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String sam1=null;
+ private String sam2=null;
+ private String out1=null;
+
+ private boolean ADD_FAKE_READS=true;
+ private boolean INCLUDE_PRIMERS=false;
+
+ private final FileFormat ffin1;
+ private final FileFormat ffout1;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private java.io.PrintStream outstream=System.err;
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/DecontaminateByNormalization.java b/current/jgi/DecontaminateByNormalization.java
new file mode 100755
index 0000000..2084fbf
--- /dev/null
+++ b/current/jgi/DecontaminateByNormalization.java
@@ -0,0 +1,618 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.TimeZone;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+
+import align2.BBMap;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date October 9, 2014
+ *
+ */
+public class DecontaminateByNormalization {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Program entry point: starts a timer, builds an instance from the command line, and runs
+  * the pipeline.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final DecontaminateByNormalization instance=new DecontaminateByNormalization(args);
+     instance.process(timer);
+ }
+
+ public DecontaminateByNormalization(String[] args){
+ //Parses key=value command-line arguments into this object's fields and into static settings of helper classes.
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} //NOTE(review): outstream is already initialized to System.err, so this assignment has no visible effect
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set. Assigned from the parser below but not read again in this constructor.
+
+
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH); //Cap the shared read buffer length at 200
+// Shared.READ_BUFFER_NUM_BUFFERS=Shared.READ_BUFFER_NUM_BUFFERS;
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ CoveragePileup.USE_WINDOW=true;
+
+ final ArrayList<String> readNameFiles=new ArrayList<String>(); //Entries from readnamefile=/readnamelist=, expanded after parsing
+ final ArrayList<String> refNameFiles=new ArrayList<String>(); //Entries from refnamefile=/refnamelist=, expanded after parsing
+
+ Parser parser=new Parser();
+ parser.overwrite=true;
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //For "a=b=c" only "b" is used as the value
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;} //The literal word "null" is treated as no value
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(parser.parseMapping(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose; //The same verbosity is copied to the file and stream classes below
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("minc")){
+ minc=Float.parseFloat(b);
+ }else if(a.equals("minp")){
+ minp=Float.parseFloat(b);
+ }else if(a.equals("minr")){
+ minr=Integer.parseInt(b);
+ }else if(a.equals("minl")){
+ minl=Integer.parseInt(b);
+ }else if(a.equals("mind") || a.equals("mindepth")){
+ minDepth=Integer.parseInt(b);
+ }else if(a.equals("dp") || a.equals("depthpercentile")){
+ depthPercentile=Float.parseFloat(b);
+ assert(depthPercentile>=0 && depthPercentile<=1) : "depthPercentile must be between 0 and 1.";
+ }else if(a.equals("minprob")){
+ minprob=Float.parseFloat(b);
+ assert(minprob>=0 && minprob<=1) : "minprob must be between 0 and 1.";
+ }else if(a.equals("minratio") || a.equals("ratio")){
+ minRatio=Double.parseDouble(b);
+ }else if(a.equals("basesundermin")){
+ basesUnderMin=Integer.parseInt(b);
+ }else if(a.equals("window")){
+ CoveragePileup.LOW_COV_WINDOW=Integer.parseInt(b);
+ }else if(a.equals("windowcov")){
+ CoveragePileup.LOW_COV_DEPTH=Double.parseDouble(b);
+ }else if(a.equals("mapraw")){
+ mapRawReads=Tools.parseBoolean(b);
+ }else if(a.equals("target")){
+ normTarget=Integer.parseInt(b);
+ }else if(a.equals("hashes")){
+ normHashes=Integer.parseInt(b);
+ }else if(a.equals("passes")){
+ normPasses=Integer.parseInt(b);
+ }else if(a.equals("kfilter")){
+ kfilter=Integer.parseInt(b);
+ }else if(a.equals("ambig") || a.equals("ambiguous")){
+ ambigMode=b;
+ }else if(a.equals("ecc")){
+ ecc=Tools.parseBoolean(b);
+ }else if(a.equals("cecc")){
+ cecc=Tools.parseBoolean(b);
+ if(cecc){ecc=true;aecc=false;} //Enabling cecc also enables ecc and switches aecc off
+ }else if(a.equals("aecc")){
+ aecc=Tools.parseBoolean(b);
+ if(aecc){ecc=true;cecc=false;} //Enabling aecc also enables ecc and switches cecc off
+ }else if(a.equals("prefilter")){
+ prefilter=Tools.parseBoolean(b);
+ }else if(a.equals("filterbits") || a.equals("fbits")){
+ filterBits=Integer.parseInt(b);
+ }else if(a.equals("prefilterbits") || a.equals("prebits") || a.equals("pbits")){
+ prefilterBits=Integer.parseInt(b);
+ }else if(a.equals("logname") || a.equals("log")){
+ logName=b;
+ }else if(a.equals("resultsname") || a.equals("results") || a.equals("summary")){
+ resultsName=b;
+ }else if(a.equals("tempdir") || a.equals("tmpdir")){
+ tempdir=b;
+ }else if(a.equals("outdir") || a.equals("out")){ //"out" names a directory prefix here, not an output file
+ outdir=b;
+ }else if(a.equals("ref") || a.equals("refs")){
+ String[] split2=b.split(","); //Comma-separated list of reference paths
+ for(String name : split2){
+ refNames.add(name);
+ }
+ }else if(a.equals("read") || a.equals("reads") || a.equals("data")){
+ String[] split2=b.split(","); //Comma-separated list of read file paths
+ for(String name : split2){
+ readNames.add(name);
+ }
+ }else if(a.equals("refnamefile") || a.equals("refnamelist")){
+ String[] split2=b.split(",");
+ for(String name : split2){
+ refNameFiles.add(name);
+ }
+ }else if(a.equals("readnamefile") || a.equals("readnamelist")){
+ String[] split2=b.split(",");
+ for(String name : split2){
+ readNameFiles.add(name);
+ }
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Fatal only when assertions are enabled; otherwise the argument is reported and skipped
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+
+ setInterleaved=parser.setInterleaved;
+ }
+
+ parseStringsFromFiles(readNameFiles); //Replace each existing name-list file by the lines it contains
+ parseStringsFromFiles(refNameFiles);
+
+ readNames.addAll(readNameFiles); //Names from list files are appended after names given directly
+ refNames.addAll(refNameFiles);
+// System.err.println("\n************ 5\n"+readNames+"\n\n"+refNames+"\n\n"+readNameFiles+"\n\n"+refNameFiles);
+
+ if(outdir!=null && outdir.length()>0 && !outdir.endsWith("/")){outdir=outdir+"/";} //Make a non-empty directory prefix end with a slash
+ if(tempdir!=null && tempdir.length()>0 && !tempdir.endsWith("/")){tempdir=tempdir+"/";}
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ //No mode forced and more than 2 threads: select the BF2 mode
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ void process(Timer t){
+ log("Decontaminate start", false);
+ String mergePath=tempdir+"_merged.fq.gz";
+ String normPath=tempdir+"_normalized.fq.gz";
+
+ if(mapRawReads){
+ map(readNames, refNames, 0);
+ }
+ renameAndMerge(readNames, mergePath);
+ normalize(mergePath, normPath, minDepth, normTarget, normHashes, normPasses, ecc, prefilter, normalizeByLowerDepth);
+ demux(normPath, readNames);
+ map(readNames, refNames, 1);
+ filter(readNames, refNames);
+
+ t.stop();
+
+ outstream.println("Time: \t"+t);
+ log("Decontaminate finish", true);
+ }
+
+ /**
+  * Expands name-list files in a list, in place.  Every entry that names an existing regular
+  * file is replaced by the lines of that file; every other entry is kept as it is.  The
+  * relative order of the entries is preserved.
+  * @param list Names and/or name-list files; modified in place
+  */
+ public static void parseStringsFromFiles(ArrayList<String> list){
+     final String[] entries=list.toArray(new String[list.size()]);
+     list.clear();
+     for(String entry : entries){
+         final File f=new File(entry);
+         if(f.exists() && f.isFile()){
+             final TextFile tf=new TextFile(entry);
+             try{
+                 for(String line : tf.toStringLines()){
+                     list.add(line);
+                 }
+             }finally{
+                 //This method never closed the reader it opened for each list file.
+                 tf.close();
+             }
+         }else{
+             list.add(entry);
+         }
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private void renameAndMerge(ArrayList<String> readPaths, String fnameOut){ //Renames every read to "<core>_<numericID> /1" (mate: " /2") and writes all inputs to fnameOut
+ log("renameAndMerge start", true);
+ System.err.println("\nRename/Merge Phase Start");
+
+ FileFormat ffout=FileFormat.testOutput(fnameOut, FileFormat.FASTQ, null, true, true, false, false); //Format used for the first input only; it is replaced at the bottom of the loop
+
+ long readsProcessed=0; //Tallied below but not reported by this method
+ long basesProcessed=0; //Tallied below but not reported by this method
+
+ for(String in : readPaths){
+ //A separate input stream and output stream are opened and closed for each input file.
+ final FileFormat ffin=FileFormat.testInput(in, FileFormat.FASTQ, null, true, true);
+ final ConcurrentReadInputStream cris;
+ final String core=ReadWrite.stripToCore(in); //Becomes the prefix of every read name from this file
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin, null);
+ if(verbose){outstream.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+ if(!ffin.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+ final ConcurrentReadOutputStream ros;
+
+ final int buff=4;
+
+ if(cris.paired()){
+ outstream.println("Writing interleaved.");
+ }
+
+ assert(!in.equalsIgnoreCase(fnameOut)) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout, null, buff, null, false);
+ ros.start();
+
+ {
+ //Fetch the first list of reads; the loop below runs until an empty or null list arrives.
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((r.mate!=null)==cris.paired()); //The first read must have a mate exactly when the stream reports paired input
+ }
+
+ while(reads!=null && reads.size()>0){
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ final Read r2=r1.mate;
+
+ final int initialLength1=r1.length();
+ final int initialLength2=(r1.mateLength());
+
+ {
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ r1.id=core+"_"+r1.numericID+" /1";
+ }
+ if(r2!=null){
+ readsProcessed++;
+ basesProcessed+=initialLength2;
+ r2.id=core+"_"+r1.numericID+" /2"; //The mate takes r1's numericID, so both names of a pair share one number
+ }
+
+
+ }
+
+ final ArrayList<Read> listOut=reads; //The renamed reads are written out in the same list object
+
+ if(ros!=null){ros.add(listOut, ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty()); //Hand the list back to the input stream before requesting the next one
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); //Return the final list that ended the loop
+ }
+ }
+
+ errorState|=ReadWrite.closeStreams(cris, ros);
+ //NOTE(review): the boolean arguments below differ from the first testOutput call -- presumably overwrite off and append on, so later inputs are added to the same file; confirm against FileFormat.testOutput
+ ffout=FileFormat.testOutput(fnameOut, FileFormat.FASTQ, null, true, false, true, false);
+ }
+ log("renameAndMerge finish", true);
+ }
+
+ private void normalize(String fnameIn, String fnameOut, int min, int target, int hashes, int passes, boolean ecc, boolean prefilter, boolean uselowerdepth){
+ log("normalization start", true);
+ System.err.println("\nNormalization/Error Correction Phase Start");
+
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {
+ argList.add("ecc="+ecc);
+ if(ecc && aecc){argList.add("aecc="+aecc);}
+ if(ecc && cecc){argList.add("cecc="+cecc);}
+ argList.add("prefilter="+prefilter);
+ argList.add("bits="+filterBits);
+ argList.add("prebits="+prefilterBits);
+ argList.add("hashes="+hashes);
+ argList.add("passes="+passes);
+ argList.add("target="+target);
+ argList.add("mindepth="+min);
+ argList.add("maxdepth="+target);
+ argList.add("minprob="+minprob);
+ argList.add("dp="+depthPercentile);
+ argList.add("in="+fnameIn);
+ argList.add("out="+fnameOut);
+ argList.add("uld="+uselowerdepth);
+ }
+
+ String[] normargs=argList.toArray(new String[0]);
+
+ {//Run BBNorm
+ try {
+ KmerNormalize.main(normargs);
+ } catch (Exception e) {
+ e.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ log("normalization finish", true);
+ }
+
+ private void demux(String fnameIn, ArrayList<String> readPaths){
+ log("demux start", true);
+ System.err.println("\nDemux Phase Start");
+
+// String dir=ReadWrite.parseRoot(fnameIn);
+
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {
+
+ argList.add("in="+fnameIn);
+ argList.add("out="+tempdir+"%_demuxed.fq.gz");
+
+ StringBuilder sb=new StringBuilder("names=");
+ String comma="";
+ for(String s : readPaths){
+ sb.append(comma);
+ sb.append(ReadWrite.stripToCore(s));
+ comma=",";
+ }
+ argList.add(sb.toString());
+ }
+
+ String[] args=argList.toArray(new String[0]);
+
+ {//Run Demux
+ try {
+ DemuxByName.main(args);
+ } catch (Exception e) {
+ e.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ log("demux finish", true);
+ }
+
+ /**
+  * Maps each read file to the reference at the same list position with BBMap and writes a
+  * covstats file per sample.  A failure is logged and ends the program with exit code 1.
+  * @param readPaths Original read files
+  * @param refnames References, paired with readPaths by index
+  * @param pass 0 maps the original read file; any other value maps the sample's demultiplexed
+  *             file in tempdir.  The value is also embedded in the covstats file name.
+  */
+ private void map(ArrayList<String> readPaths, ArrayList<String> refnames, int pass){
+     log("map start", true);
+     System.err.println("\nMapping Phase Start");
+
+     final String outPrefix=(outdir==null ? "" : outdir);
+
+     for(int sample=0; sample<readPaths.size(); sample++){
+         final String readFile=readPaths.get(sample);
+         final String reference=refnames.get(sample);
+         final String core=ReadWrite.stripToCore(readFile);
+
+         final String infile;
+         if(pass==0){
+             infile=readFile;
+         }else{
+             infile=tempdir+core+"_demuxed.fq.gz";
+         }
+
+         final ArrayList<String> argList=new ArrayList<String>(Arrays.asList(
+                 "in="+infile,
+                 "ref="+reference,
+                 "covstats="+outPrefix+core+"_covstats"+pass+".txt",
+                 "arrays=t",
+                 "nodisk",
+                 "ambig="+ambigMode));
+         if(kfilter>1){argList.add("kfilter="+kfilter);}
+         argList.addAll(Arrays.asList("fast", "ow="+overwrite, "minscaf=0"));
+
+         final String[] args=argList.toArray(new String[0]);
+
+         //Run BBMap
+         try{
+             BBMap.main(args);
+         }catch(Exception e){
+             e.printStackTrace();
+             log("failed", true);
+             System.exit(1);
+         }
+         Data.unloadAll();
+     }
+
+     log("map finish", true);
+ }
+
+ private void filter(ArrayList<String> readPaths, ArrayList<String> refnames){
+ //filterbycoverage.sh -Xmx2g in=PUCA.fasta cov=PUCA_stats.txt out=PUCA_clean.fa minc=5 minp=40
+ log("filter start", true);
+ System.err.println("\nFiltering Phase Start");
+
+ for(int i=0; i<readPaths.size(); i++){
+ String path=readPaths.get(i);
+ String ref=refnames.get(i);
+ String core=ReadWrite.stripToCore(path);
+ String dir=(outdir==null ? /*ReadWrite.parseRoot(ref)*/ "" : outdir);
+ String stats0=mapRawReads ? (outdir==null ? "" : outdir)+core+"_covstats0.txt" : null;
+ String stats1=(outdir==null ? "" : outdir)+core+"_covstats1.txt";
+ String results=(resultsName==null ? "" : (resultsName.contains("/") || resultsName.contains("\\") ? resultsName : (outdir==null ? "" : outdir)+resultsName));
+
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {
+ argList.add("log="+results);
+ argList.add("appendlog="+(i>0));
+ argList.add("logheader="+(i==0));
+ if(stats0!=null){argList.add("cov0="+stats0);}
+ argList.add("cov1="+stats1);
+ argList.add("in="+ref);
+ argList.add("out="+dir+core+"_clean.fasta");
+ argList.add("outd="+dir+core+"_dirty.fasta");
+ argList.add("minc="+minc);
+ argList.add("minp="+minp);
+ argList.add("minr="+minr);
+ argList.add("minl="+minl);
+ argList.add("basesundermin="+basesUnderMin);
+ if(stats0!=null){argList.add("minratio="+minRatio);}
+ argList.add("ow="+overwrite);
+ }
+
+ String[] args=argList.toArray(new String[0]);
+
+ {//Run filtering
+ try {
+ FilterByCoverage.main(args);
+ } catch (Exception e) {
+ e.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+ }
+
+ log("filter finish", true);
+ }
+
+ /**
+ * Log a message in the log file
+ * @param message Message to log
+ * @param append True to append, false to overwrite
+ */
+ private void log(String message, boolean append){
+ if(logName!=null){
+ ReadWrite.writeString(message+", "+timeString()+"\n", logName, append);
+ }
+ }
+
+ /**
+  * Formats the current time as "yyyy-MM-dd HH:mm:ss" in the JVM's default time zone.
+  * TODO: Some machines are set to UTC rather than PST
+  * @return Timestamp in RQC's format
+  */
+ public static String timeString(){
+     final SimpleDateFormat formatter=new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+     final TimeZone zone=TimeZone.getDefault(); //Local zone rather than a fixed "PST"
+     formatter.setTimeZone(zone);
+     return formatter.format(new Date());
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Prints a usage hint to outstream.
+  * The previous body was only assert(false): a help request threw an AssertionError when
+  * assertions were enabled and printed nothing otherwise.  The commented-out text it carried
+  * described jgi.ReformatReads (in2/out2/interleaved), not this tool, so it was dropped.
+  */
+ private void printOptions(){
+     outstream.println("Please consult the shellscript for usage information.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final ArrayList<String> readNames=new ArrayList<String>(); //Read file paths from read=/reads=/data= plus expanded name-list files
+ private final ArrayList<String> refNames=new ArrayList<String>(); //Reference paths; map() and filter() pair entry i with readNames entry i
+
+ private String logName=null; //Status log file written by log(); null disables logging
+ private String resultsName="results.txt"; //Passed to FilterByCoverage as log=; placed under outdir unless it contains a path separator
+ private String tempdir=(Shared.TMPDIR == null ? "" : Shared.TMPDIR); //Prefix for the merged, normalized and demuxed intermediate files
+ private String outdir=null; //Prefix for covstats and clean/dirty fasta files; null means no prefix
+
+ /*--------------------------------------------------------------*/
+
+
+ private int kfilter=55; //Forwarded to BBMap as kfilter= only when greater than 1
+ private String ambigMode="random"; //Forwarded to BBMap as ambig=
+
+ private long maxReads=-1; //Passed to the read input stream in renameAndMerge; NOTE(review): -1 presumably means no limit -- confirm
+
+ private float minc=3.5f; //Forwarded to FilterByCoverage as minc=
+ private float minp=20; //Forwarded to FilterByCoverage as minp=
+ private int minr=20; //Forwarded to FilterByCoverage as minr=
+ private int minl=500; //Forwarded to FilterByCoverage as minl=
+
+ /** Scaffolds will be discarded if there are at least this many bases in windows below a coverage cutoff. */
+ private int basesUnderMin=-1;
+
+ private float depthPercentile=0.75f; //Forwarded to KmerNormalize as dp=; the parser asserts it lies in [0,1]
+ private float minprob=0.5f; //Forwarded to KmerNormalize as minprob=; the parser asserts it lies in [0,1]
+ private int minDepth=2; //Forwarded to KmerNormalize as mindepth=
+ private int normTarget=20; //Forwarded to KmerNormalize as both target= and maxdepth=
+ private int normHashes=4; //Forwarded to KmerNormalize as hashes=
+ private int normPasses=1; //Forwarded to KmerNormalize as passes=
+ private int filterBits=32; //Forwarded to KmerNormalize as bits=
+ private int prefilterBits=2; //Forwarded to KmerNormalize as prebits=
+ private boolean ecc=false; //Forwarded to KmerNormalize as ecc=; forced true when cecc or aecc is enabled
+ private boolean cecc=false; //Forwarded as cecc= only when ecc is also true; mutually exclusive with aecc in the parser
+ private boolean aecc=false; //Forwarded as aecc= only when ecc is also true; mutually exclusive with cecc in the parser
+ private boolean prefilter=true; //Forwarded to KmerNormalize as prefilter=
+ private boolean normalizeByLowerDepth=false; //Forwarded to KmerNormalize as uld=; no command-line flag in the constructor changes it
+
+ private double minRatio=1.2f; //Forwarded as minratio= when raw reads were mapped. NOTE(review): the float literal widens to a double slightly above 1.2; 1.2 was probably intended
+ private boolean mapRawReads=true; //When true, raw reads are mapped first (pass 0) and cov0=/minratio= are given to the filter
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination for status messages
+ public static boolean verbose=false; //Set by the verbose flag and copied to several file/stream classes
+ public boolean errorState=false; //Becomes true when closing the streams in renameAndMerge reports an error
+ private boolean overwrite=true; //Taken from the parser; forwarded to BBMap and FilterByCoverage as ow=
+
+}
diff --git a/current/jgi/Dedupe.java b/current/jgi/Dedupe.java
new file mode 100755
index 0000000..b5da975
--- /dev/null
+++ b/current/jgi/Dedupe.java
@@ -0,0 +1,5796 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.io.Serializable;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.PriorityQueue;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+import stream.ConcurrentCollectionReadInputStream;
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.FastqReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import align2.BandedAligner;
+import align2.ListNum;
+import align2.LongM;
+import align2.ReadLengthComparator;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 18, 2013
+ *
+ */
+public final class Dedupe {
+
+ /**
+  * Program entry point.  Prints the usage hint and exits when called without arguments or
+  * with a single help flag.  Otherwise the arguments are pre-scanned for nam/numaffixmaps:
+  * a value above 2 hands the whole run to Dedupe2, anything else runs this class.
+  * @param args Command-line arguments in key=value form
+  */
+ public static void main(String[] args){
+
+     if(args==null || args.length==0 || (args.length==1 &&
+             (args[0].equalsIgnoreCase("-h") || args[0].equals("-help") || args[0].equals("--help") || args[0].equals("-?") || args[0].equals("?")))){
+         printOptions();
+         System.exit(0);
+     }
+
+     //Preparse to see if Dedupe2 should be used instead
+     {
+         int nam=1;
+         for(int i=0; i<args.length; i++){
+             final String arg=args[i];
+             String[] split=arg.split("=");
+             //"=".split("=") yields an empty array, so guard the key lookup.
+             String a=(split.length>0 ? split[0] : "").toLowerCase();
+             String b=split.length>1 ? split[1] : null;
+             if("null".equalsIgnoreCase(b)){b=null;}
+             //The length check keeps "" and all-hyphen keys such as "-" from running charAt(0) on an empty string.
+             while(a.length()>0 && a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+             if(a.equals("nam") || a.equals("numaffixmaps")){
+                 if(b!=null){nam=Integer.parseInt(b);}
+             }
+         }
+         if(nam>2){
+             Dedupe2.main(args);
+             return;
+         }
+     }
+
+     Dedupe dd=new Dedupe(args);
+     dd.process();
+ }
+
+ private static void printOptions(){ //Prints a one-line pointer to the shellscript; the detailed help below is commented out
+ System.err.println("Please consult the shellscript for usage information.");
+// outstream.println("Syntax:\n");
+// outstream.println("\njava -ea -Xmx106g -cp <path> jgi.Dedupe <input file> <output file>");
+// outstream.println("\nOptional flags:");
+// outstream.println("in=<file> \tInput file. 'in=stdin' will pipe from standard in.");
+// outstream.println("out=<file> \tOutput file. 'out=stdout' will pipe to standard out.");
+// outstream.println("dot=<file> \tOutput a dot-format overlap graph to this file.");
+// outstream.println("pattern=<file> \tClusters will be written to individual files, where the '%' symbol in the pattern is replaced by cluster number.");
+// outstream.println("");
+// outstream.println("threads=auto \t(t) Set number of threads to use; default is number of logical processors.");
+// outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file.");
+// outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed.");
+// outstream.println("minscaf=0 \t(ms) Ignore contigs/scaffolds shorter than this.");
+// outstream.println("interleaved=auto \tIf true, forces fastq input to be paired and interleaved.");
+//
+// outstream.println("absorbrc=t \t(arc) Absorb reverse-complements as well as normal orientation.");
+// outstream.println("absorbmatch=t \t(am) Absorb exact matches of contigs.");
+// outstream.println("absorbcontainment=t\t(ac) Absorb full containments of contigs.");
+// outstream.println("absorboverlap=f \t(ao) Absorb (merge) non-contained overlaps of contigs.");
+//
+// outstream.println("numaffixmaps=1 \t(nam) Set to 2 to index two prefixes and suffixes per contig.");
+// outstream.println("ignoreaffix1=f \t(ia1) Ignore first affix (for testing).");
+// outstream.println("storesuffix=f \t(ss) Store suffix as well as prefix. Automatically set to true when doing inexact matches.");
+//
+// outstream.println("findoverlap=f \t(fo) Find overlaps between contigs (containments and non-containments).");
+// outstream.println("cluster=f \t(c) Group overlapping contigs into clusters.");
+// outstream.println("fixmultijoins=t \t(fmj) Remove redundant overlaps between the same two contigs.");
+// outstream.println("removecycles=t \t(rc) Remove all cycles so clusters form trees.");
+// outstream.println("renameclusters=f \t(rnc) Rename contigs to indicate which cluster they are in.");
+// outstream.println("minclustersize=1 \t(mcs) Don't output clusters smaller than this.");
+// outstream.println("pbr=f \t(pickbestrepresentative) Only output the single highest-quality read per cluster.");
+// outstream.println("cc=t \t(canonicizeclusters) Flip contigs so clusters have a single orientation.");
+// outstream.println("fcc=f \t(fixcanoncontradictions) Truncate graph at nodes with canonization disputes.");
+// outstream.println("foc=f \t(fixoffsetcontradictions) Truncate graph at nodes with offset disputes.");
+// outstream.println("pto=f \t(preventtransitiveoverlaps) Do not look for new edges between nodes in the same cluster.");
+//
+// outstream.println("storename=t \t(sn) Store contig names (set false to save memory).");
+// outstream.println("storequality=t \t(sq) Store quality values for fastq assemblies (set false to save memory).");
+// outstream.println("exact=t \t(ex) Only allow exact symbol matches. When false, an 'N' will match any symbol.");
+// outstream.println("touppercase=f \t(tuc) Change all input bases to upper case.");
+// outstream.println("uniquenames=t \t(un) Ensure all output contigs have unique names. Uses more memory.");
+// outstream.println("maxsubs=0 \t(s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits.");
+// outstream.println("maxedits=0 \t(e) Allow up to this many edits (subs or indels). Higher is slower, so below 20 is suggested.");
+// //outstream.println("bandwidth=9 \t(bw) Width of banded alignment, if maxedits>0. To ensure correctness, set bandwidth=2*maxedits+1. Higher is slower.");
+// outstream.println("minidentity=100 \t(mid) Allow inter-sequence identity as low as this (subs only, no indels).");
+// outstream.println("k=31 \tKmer length used for finding containments. Containments shorter than k will not be found.");
+// outstream.println("minlengthpercent=0 \t(mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed.");
+// outstream.println("minoverlappercent=0\t(mop) Overlap must be at least this percent of smaller contig's length to cluster and merge.");
+// outstream.println("minoverlap=200 \t(mo) Overlap must be at least this long to cluster and merge.");
+//
+// outstream.println("mopc=0 \t(minoverlappercentcluster) Overlap must be at least this percent of smaller contig's length to cluster.");
+// outstream.println("mopm=0 \t(minoverlappercentmerge) Overlap must be at least this percent of smaller contig's length to merge.");
+// outstream.println("moc=200 \t(minoverlapcluster) Overlap must be at least this long to cluster.");
+// outstream.println("mom=200 \t(minoverlapmerge) Overlap must be at least this long to merge.");
+// outstream.println("rt=f \t(rigoroustransitive) Ensure exact transitivity. Slow. For testing only.");
+//
+// outstream.println("ziplevel=2 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.");
+// outstream.println("sort=f \tsort output by contig length (otherwise it will be random).\n" +
+// " \t'a' for ascending, 'd' for descending, 'f' for false (no sorting).");
+// outstream.println("");
+// outstream.println("Note! When allowing inexact alignments, if maxsubs is less than maxedits, maxsubs is set to maxedits.");
+// outstream.println("If maxsubs and minidentity yield different numbers for some contig, the more liberal is used for substitutions.");
+// outstream.println("For indels, minidentity is ignored and maxedits is always used (due to time and memory constraints).");
+// outstream.println("Regardless of maxsubs, maxedits, or minidentity, no comparison will be made between two sequences unless ");
+// outstream.println("one contains the first or last k bases of the other, exactly, with no edits.");
+
+ }
+
+ public Dedupe(String[] args){
+ for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}}
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ ReadWrite.ZIPLEVEL=2;
+ //ReadWrite.USE_UNPIGZ=true;
+
+
+ Parser parser=new Parser();
+ boolean setOut=false, setMcsfs=false;
+ int bandwidth_=-1;
+ int k_=31;
+ int subset_=0, subsetCount_=1;
+
+ {
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+ ReadWrite.ZIP_THREAD_DIVISOR=1;
+ Read.TO_UPPER_CASE=true;
+
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ if(b.indexOf(',')>=0 && !new File(b).exists()){
+ in1=b.split(",");
+ }else{
+ in1=new String[] {b};
+ }
+ }else if(a.equals("in2")){
+ if(b.indexOf(',')>=0 && !new File(b).exists()){
+ in2=b.split(",");
+ }else{
+ in2=new String[] {b};
+ }
+ }else if(a.equals("out") || a.equals("out")){
+ out=b;
+ setOut=true;
+ }else if(a.equals("out2")){
+ throw new RuntimeException("Dedupe does not allow 'out2'; for paired reads, output is interleaved.");
+ }else if(a.equals("clusterfilepattern") || a.equals("pattern")){
+ clusterFilePattern=b;
+ assert(clusterFilePattern==null || clusterFilePattern.contains("%")) : "pattern must contain the % symbol.";
+ }else if(a.equals("outbest")){
+ outbest=b;
+ }else if(a.equals("outd") || a.equals("outduplicate")){
+ outdupe=b;
+ }else if(a.equals("csf") || a.equals("clusterstatsfile")){
+ outcsf=b;
+ }else if(a.equals("dot") || a.equals("graph") || a.equals("outdot") || a.equals("outgraph")){
+ outgraph=b;
+ }else if(a.equals("mcsfs") || a.equals("minclustersizeforstats")){
+ minClusterSizeForStats=Integer.parseInt(b);
+ }else if(a.equals("mcs") || a.equals("minclustersize")){
+ minClusterSize=Integer.parseInt(b);
+ if(!setMcsfs){
+ minClusterSizeForStats=minClusterSize;
+ }
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("sort")){
+ if(b==null){sort=true;}
+ else if(b.equalsIgnoreCase("a")){
+ sort=true;
+ ascending=true;
+ }else if(b.equalsIgnoreCase("d")){
+ sort=true;
+ ascending=false;
+ }else{
+ sort=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("arc") || a.equals("absorbrc") || a.equals("trc") || a.equals("testrc")){
+ ignoreReverseComplement=!Tools.parseBoolean(b);
+ }else if(a.equals("ac") || a.equals("absorbcontainment") || a.equals("absorbcontainments") || a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){
+ absorbContainment=Tools.parseBoolean(b);
+ }else if(a.equals("am") || a.equals("absorbmatch") || a.equals("absorbmatches") || a.equals("tm") || a.equals("testmatch")){
+ absorbMatch=Tools.parseBoolean(b);
+ }else if(a.equals("ao") || a.equals("absorboverlap") || a.equals("absorboverlaps") || a.equals("to") || a.equals("testoverlap")){
+ absorbOverlap=Tools.parseBoolean(b);
+ }else if(a.equals("fo") || a.equals("findoverlap") || a.equals("findoverlaps")){
+ findOverlaps=Tools.parseBoolean(b);
+ }else if(a.equals("c") || a.equals("cluster") || a.equals("clusters")){
+ makeClusters=Tools.parseBoolean(b);
+ }else if(a.equals("fmj") || a.equals("fixmultijoin") || a.equals("fixmultijoins")){
+ fixMultiJoins=Tools.parseBoolean(b);
+ }else if(a.equals("fcc") || a.equals("fixcanoncontradiction") || a.equals("fixcanoncontradictions")){
+ fixCanonContradictions=Tools.parseBoolean(b);
+ }else if(a.equals("foc") || a.equals("fixoffsetcontradiction") || a.equals("fixoffsetcontradictions")){
+ fixOffsetContradictions=Tools.parseBoolean(b);
+ }else if(a.equals("pto") || a.equals("preventtransitiveoverlap") || a.equals("preventtransitiveoverlaps")){
+ preventTransitiveOverlaps=Tools.parseBoolean(b);
+ }else if(a.equals("pbr") || a.equals("pickbestrepresentative")){
+ pickBestRepresentativePerCluster=Tools.parseBoolean(b);
+ }else if(a.equals("mst") || a.equals("maxspanningtree")){
+ maxSpanningTree=Tools.parseBoolean(b);
+ }else if(a.equals("cc") || a.equals("canonicizecluster") || a.equals("canonicizeclusters")){
+ canonicizeClusters=Tools.parseBoolean(b);
+ }else if(a.equals("pc") || a.equals("processcluster") || a.equals("processclusters")){
+ processClusters=Tools.parseBoolean(b);
+ }else if(a.equals("rnc") || a.equals("renamecluster") || a.equals("renameclusters")){
+ renameClusters=Tools.parseBoolean(b);
+ if(renameClusters){storeName=false;}
+ }else if(a.equals("rc") || a.equals("removecycles") || a.equals("removecycle")){
+ removeCycles=Tools.parseBoolean(b);
+ }else if(a.equals("uo") || a.equals("uniqueonly")){
+ UNIQUE_ONLY=Tools.parseBoolean(b);
+ }else if(a.equals("rmn") || a.equals("requirematchingnames")){
+ REQUIRE_MATCHING_NAMES=Tools.parseBoolean(b);
+ }else if(a.equals("ngn") || a.equals("numbergraphnodes")){
+ NUMBER_GRAPH_NODES=Tools.parseBoolean(b);
+ }else if(a.equals("addpairnum")){
+ ADD_PAIRNUM_TO_NAME=Tools.parseBoolean(b);
+ }else if(a.equals("pn") || a.equals("prefixname")){
+// PREFIX_NAME=Tools.parseBoolean(b);
+ }else if(a.equals("k")){
+ k_=Integer.parseInt(b);
+ assert(k_>0 && k_<32) : "k must be between 1 and 31; default is 31, and lower values are slower.";
+ }else if(a.equals("minscaf") || a.equals("ms")){
+ MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
+ }else if(a.equals("mlp") || a.equals("minlengthpercent")){
+ minLengthPercent=Float.parseFloat(b);
+ }else if(a.equals("mop") || a.equals("minoverlappercent")){
+ minOverlapPercentCluster=minOverlapPercentMerge=Float.parseFloat(b);
+ }else if(a.equals("mopc") || a.equals("minoverlappercentcluster")){
+ minOverlapPercentCluster=Float.parseFloat(b);
+ }else if(a.equals("mopm") || a.equals("minoverlappercentmerge")){
+ minOverlapPercentMerge=Float.parseFloat(b);
+ }else if(a.equals("mo") || a.equals("minoverlap")){
+ minOverlapCluster=minOverlapMerge=Integer.parseInt(b);
+ }else if(a.equals("moc") || a.equals("minoverlapcluster")){
+ minOverlapCluster=Integer.parseInt(b);
+ }else if(a.equals("mom") || a.equals("minoverlapmerge")){
+ minOverlapMerge=Integer.parseInt(b);
+ }else if(a.equals("rt") || a.equals("rigoroustransitive")){
+ rigorousTransitive=Tools.parseBoolean(b);
+ }else if(a.equals("e") || a.equals("maxedits") || a.equals("edits") || a.equals("edist")){
+ maxEdits=Integer.parseInt(b);
+ }else if(a.equals("s") || a.equals("maxsubs") || a.equals("maxsubstitutions") || a.equals("hdist")){
+ maxSubs=Integer.parseInt(b);
+ }else if(a.equals("bw") || a.equals("bandwidth")){
+ bandwidth_=Integer.parseInt(b);
+ }else if(a.equals("mid") || a.equals("minidentity")){
+ minIdentity=Float.parseFloat(b);
+ minIdentityMult=(minIdentity==100f ? 0 : (100f-minIdentity)/100f);
+ }else if(a.equals("threads") || a.equals("t")){
+ THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+// BandedAligner.verbose=verbose;
+ }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){
+ maxNs=Integer.parseInt(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("sn") || a.equals("storename") || a.equals("storenames") || a.equals("keepnames")){
+ storeName=Tools.parseBoolean(b);
+ }else if(a.equals("ssx") || a.equals("storesuffix") || a.equals("storesuffixes")){
+ storeSuffix=Tools.parseBoolean(b);
+ }else if(a.equals("numaffixmaps") || a.equals("nam")){
+ numAffixMaps=Integer.parseInt(b);
+ }else if(a.equals("mac") || a.equals("maxaffixcopies")){
+ maxAffixCopies=Integer.parseInt(b);
+ }else if(a.equals("me") || a.equals("maxedges")){
+ maxEdges=Integer.parseInt(b);
+ maxEdges2=maxEdges*2;
+ if(maxEdges2<1){maxEdges2=Integer.MAX_VALUE-1;}
+ }else if(a.equals("ignoreaffix1") || a.equals("ia1")){
+ ignoreAffix1=Tools.parseBoolean(b);
+ }else if(a.equals("parsedepth") || a.equals("pd")){
+ parseDepth=Tools.parseBoolean(b);
+ }else if(a.equals("printlengthinedges") || a.equals("ple")){
+ printLengthInEdges=Tools.parseBoolean(b);
+ }else if(a.equals("depthmult") || a.equals("depthratio") || a.equals("dr")){
+ depthRatio=Float.parseFloat(b);
+ if(depthRatio<=0){
+ parseDepth=false;
+ }else{
+ parseDepth=true;
+ assert(depthRatio>0);
+ if(depthRatio<1){depthRatio=1/depthRatio;}
+ }
+ }else if(a.equals("storequality") || a.equals("sq")){
+ storeQuality=Tools.parseBoolean(b);
+ }else if(a.equals("exact") || a.equals("ex")){
+ exact=Tools.parseBoolean(b);
+ }else if(a.equals("uniquenames") || a.equals("un")){
+ uniqueNames=Tools.parseBoolean(b);
+ }else if(a.equals("ftl") || a.equals("forcetrimleft")){
+ forceTrimLeft=Integer.parseInt(b);
+ }else if(a.equals("ftr") || a.equals("forcetrimright")){
+ forceTrimRight=Integer.parseInt(b);
+ }else if(a.equals("subset") || a.equals("sst")){
+ subset_=Integer.parseInt(b);
+ }else if(a.equals("subsets") || a.equals("subsetcount") || a.equals("sstc")){
+ subsetCount_=Integer.parseInt(b);
+ }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ String c=args[i];
+ if(c.indexOf(',')>=0 && !new File(c).exists()){
+ in1=c.split(",");
+ }else{
+ in1=new String[] {c};
+ }
+ }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ out=args[i];
+ setOut=true;
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ if(verbose){
+ ReadWrite.verbose=ConcurrentGenericReadInputStream.verbose=ConcurrentReadOutputStream.verbose=ByteFile1.verbose=ByteFile2.verbose=FastqReadInputStream.verbose=true;
+ }
+
+ k=k_;
+ k2=k-1;
+ subset=subset_;
+ subsetCount=subsetCount_;
+ subsetMode=subsetCount>1;
+ assert(subset>=0 && subset<subsetCount) : "subset="+subset+", subsetCount="+subsetCount;
+
+ BandedAligner.penalizeOffCenter=true;
+
+ if(maxSpanningTree){removeCycles=fixMultiJoins=false;}
+ if(absorbOverlap){processClusters=true;}
+ if(processClusters || renameClusters || maxSpanningTree){makeClusters=true;}
+ if(makeClusters){findOverlaps=true;}
+ if(renameClusters){uniqueNames=/*storeName=*/false;}
+
+ if(bandwidth_>-1){
+ bandwidth=Tools.min(bandwidth_, 2*maxEdits+1);
+ customBandwidth=(bandwidth<2*maxEdits+1);
+ }else{
+ bandwidth=2*maxEdits+1;
+ customBandwidth=false;
+ }
+ maxSubs=Tools.max(maxSubs, maxEdits);
+ if(maxSubs>0 || minIdentity<100 || findOverlaps){storeSuffix=true;}
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ for(int i=0; i<in1.length; i++){
+ if(in1[i].equalsIgnoreCase("stdin") && !new File(in1[i]).exists()){in1[i]="stdin.fa";}
+ }
+
+// assert(false) : Arrays.toString(in);
+
+// if(!setOut && clusterFilePattern==null){out="stdout.fa";}
+// else
+// if("stdout".equalsIgnoreCase(out) || "standarddout".equalsIgnoreCase(out)){
+// out="stdout.fa";
+// outstream=System.err;
+// }
+ if(!Tools.canWrite(out, overwrite)){throw new RuntimeException("Output file "+out+" already exists, and overwrite="+overwrite);}
+
+ for(int i=0; i<in1.length; i++){
+ assert(!in1[i].equalsIgnoreCase(out));
+ }
+// assert(false) : "\nabsorbContainment="+absorbContainment+", findOverlaps="+findOverlaps+", absorbOverlap="+absorbOverlap+"\n"+
+// "processClusters="+processClusters+", renameClusters="+renameClusters+", makeClusters="+makeClusters+", uniqueNames="+uniqueNames+"\n"+
+// "storeName="+storeName+", DISPLAY_PROGRESS="+DISPLAY_PROGRESS+", removeCycles="+removeCycles;
+ if(absorbContainment || findOverlaps){
+// assert(false);
+ affixMaps=new HashMap[numAffixMaps];
+ for(int i=0; i<numAffixMaps; i++){
+ affixMaps[i]=new HashMap<LongM, ArrayList<Unit>>(4000000);
+ }
+ if(affixMaps.length>0){affixMap1=affixMaps[0];}
+ if(affixMaps.length>1){affixMap2=affixMaps[1];}
+ }
+// assert(false) : absorbContainment+", "+(affixMap==null);
+
+ if(outdupe==null){
+ dupeWriter=null;
+ }else{
+ FileFormat ff=FileFormat.testOutput(outdupe, FileFormat.FASTA, null, true, overwrite, append, false);
+ dupeWriter=new ByteStreamWriter(ff);
+ }
+ }
+
+	/**
+	 * Runs the complete pipeline (via process2()) and then reports throughput.
+	 * <p>
+	 * The current values of FASTQ.DETECT_QUALITY, FASTQ.TEST_INTERLEAVED and
+	 * Shared.READ_BUFFER_LENGTH are remembered before the run, the read buffer
+	 * length is set to 16 while process2() executes, and the remembered values
+	 * are written back afterwards.  The write-back is done in a finally block so
+	 * that an exception escaping process2() cannot leave these static settings
+	 * altered for other code running in the same JVM.
+	 *
+	 * @throws RuntimeException if errorState is set once processing has finished
+	 */
+	public void process(){
+
+		Timer t=new Timer();
+
+		//Snapshot of the static settings that must survive this call unchanged.
+		final boolean dq0=FASTQ.DETECT_QUALITY;
+		final boolean ti0=FASTQ.TEST_INTERLEAVED;
+		final int rbl0=Shared.READ_BUFFER_LENGTH;
+//		FASTQ.DETECT_QUALITY=false;
+//		FASTQ.TEST_INTERLEAVED=false;
+		Shared.READ_BUFFER_LENGTH=16;
+
+		try{
+			process2();
+		}finally{
+			//Always restore, even when process2() fails part-way through.
+			FASTQ.DETECT_QUALITY=dq0;
+			FASTQ.TEST_INTERLEAVED=ti0;
+			Shared.READ_BUFFER_LENGTH=rbl0;
+		}
+
+		t.stop();
+
+		//Rates per unit of t.elapsed; scaled below for display.
+		double rpnano=readsProcessed/(double)(t.elapsed);
+		double bpnano=basesProcessed/(double)(t.elapsed);
+
+		//Abbreviate large counts with a k or m suffix.
+		String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+		String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+		//Left-pad both counts to a width of 8 so the columns line up.
+		while(rpstring.length()<8){rpstring=" "+rpstring;}
+		while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+		if(showSpeed){
+			outstream.println("Time: \t\t\t"+t);
+			outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+			outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+		}
+
+		if(errorState){
+			throw new RuntimeException("Dedupe terminated in an error state; the output may be corrupt.");
+		}
+	}
+
+	/**
+	 * Executes the processing phases in order and prints the summary statistics.
+	 * <p>
+	 * Order of work: start the duplicate writer (if any), processMatches(),
+	 * optional processContainments(), shut down the duplicate writer, then - when
+	 * findOverlaps is set - findOverlaps(), killAffixMaps(), and the optional
+	 * processMst() / processClusters() steps.  Afterwards the Input, Duplicates,
+	 * Containments, Overlaps and Result lines are printed and, if any output
+	 * destination is configured, writeOutput() is called.
+	 */
+	public void process2(){
+		if(dupeWriter!=null){dupeWriter.start();}
+		final Timer t=new Timer();
+
+		if(DISPLAY_PROGRESS){
+			outstream.println("Initial:");
+			Shared.printMemory();
+			outstream.println();
+		}
+
+		processMatches(t);
+
+		//Force-trimming is switched off once the first phase has run.
+		forceTrimLeft=forceTrimRight=-1;
+
+		if(absorbContainment){processContainments(t);}
+
+		if(dupeWriter!=null){dupeWriter.poisonAndWait();}
+
+		if(findOverlaps){
+			findOverlaps(t);
+			killAffixMaps();
+
+			//Drop the reference to codeMap when any cluster post-processing is requested.
+			final boolean clusterWork=(processClusters || renameClusters || maxSpanningTree);
+			if(clusterWork){codeMap=null;}
+
+			if(maxSpanningTree){processMst(t);}
+			if(processClusters){processClusters(t, absorbOverlap);}
+		}
+
+		outstream.println("Input: \t"+readsProcessed+" reads \t\t"+basesProcessed+" bases.");
+
+		if(absorbMatch){
+			final String readPct=String.format("%.2f",matches*100.0/readsProcessed);
+			final String basePct=String.format("%.2f",baseMatches*100.0/basesProcessed);
+			outstream.println("Duplicates: \t"+matches+" reads ("+readPct+"%) \t"+
+					baseMatches+" bases ("+basePct+"%) \t"+collisions+" collisions.");
+		}
+		if(absorbContainment){
+			final String readPct=String.format("%.2f",containments*100.0/readsProcessed);
+			final String basePct=String.format("%.2f",baseContainments*100.0/basesProcessed);
+			outstream.println("Containments: \t"+containments+" reads ("+readPct+"%) \t"+
+					baseContainments+" bases ("+basePct+"%) \t"+containmentCollisions+" collisions.");
+		}
+		if(findOverlaps){
+			final String readPct=String.format("%.2f",overlaps*100.0/readsProcessed);
+			final String basePct=String.format("%.2f",baseOverlaps*100.0/basesProcessed);
+			outstream.println("Overlaps: \t"+overlaps+" reads ("+readPct+"%) \t"+
+					baseOverlaps+" bases ("+basePct+"%) \t"+overlapCollisions+" collisions.");
+		}
+
+		//Result counts; UNIQUE_ONLY uses a different formula for the read total.
+		final long outReads=(UNIQUE_ONLY ? readsProcessed-matches-containments : addedToMain-containments);
+		final long outBases=(basesProcessed-baseMatches-baseContainments);
+		final String outReadPct=String.format("%.2f",outReads*100.0/readsProcessed);
+		final String outBasePct=String.format("%.2f",outBases*100.0/basesProcessed);
+		outstream.println("Result: \t"+outReads+" reads ("+outReadPct+"%) \t"+
+				outBases+" bases ("+outBasePct+"%)");
+
+		outstream.println("");
+
+		//Only write when at least one output destination has been configured.
+		final boolean nothingToWrite=(out==null && clusterFilePattern==null && outbest==null && outgraph==null && outcsf==null);
+		if(!nothingToWrite){
+			writeOutput(outcsf, t);
+		}
+	}
+
+	/**
+	 * Empties and releases every affix map, then clears the affixMap1,
+	 * affixMap2 and affixMaps references.  Does nothing when affixMaps is
+	 * already null, so it is safe to call repeatedly.
+	 * <p>
+	 * The loop is bounded by the array's own length rather than numAffixMaps,
+	 * so it cannot overrun (or leave entries behind) if that setting no longer
+	 * matches the size the array was allocated with.
+	 */
+	private void killAffixMaps(){
+		if(affixMaps==null){return;}
+		for(int i=0; i<affixMaps.length; i++){
+			if(affixMaps[i]!=null){affixMaps[i].clear();}
+			affixMaps[i]=null;
+		}
+		affixMap1=null;
+		affixMap2=null;
+		affixMaps=null;
+	}
+
+	/**
+	 * Builds and starts the input streams used by one processing phase.
+	 *
+	 * @param list when non-null, a single stream over this in-memory list is
+	 *             returned; when null, one stream is opened per entry of in1
+	 *             (paired with the matching in2 entry when one exists)
+	 * @return the started streams
+	 */
+	private ConcurrentReadInputStream[] makeCrisArray(ArrayList<Read> list){
+		if(list!=null){
+			final ConcurrentReadInputStream[] single=new ConcurrentReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)};
+			single[0].start(); //This deadlocks if ConcurrentReadInputStream extends Thread rather than spawning a new thread.
+			return single;
+		}
+
+		final ConcurrentReadInputStream[] streams=new ConcurrentReadInputStream[in1.length];
+		multipleInputFiles=streams.length>1;
+		for(int fnum=0; fnum<in1.length; fnum++){
+			if(verbose){System.err.println("Creating cris for "+in1[fnum]);}
+
+			final FileFormat ff1=FileFormat.testInput(in1[fnum], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true);
+			final FileFormat ff2;
+			if(in2!=null && in2.length>fnum){
+				ff2=FileFormat.testInput(in2[fnum], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true);
+			}else{
+				ff2=null; //No mate file for this input
+			}
+
+			final ConcurrentReadInputStream stream=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2);
+			stream.start();
+			if(stream.paired()){
+				THREADS=1;//Temp fix for losing reads when multithreaded and paired
+				if(absorbContainment){
+					System.err.println("Set absorbContainment to false because it is not currently supported for paired reads.");
+					absorbContainment=false;
+				}
+			}
+			streams[fnum]=stream;
+		}
+		return streams;
+	}
+
+ private void processMatches(Timer t){
+ crisa=makeCrisArray(null);
+
+ ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alht.add(new HashThread(true, (absorbContainment|findOverlaps), absorbMatch, false, false));}
+ for(HashThread ht : alht){ht.start();}
+ for(HashThread ht : alht){
+ while(ht.getState()!=Thread.State.TERMINATED){
+ try {
+ ht.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ matches+=ht.matchesT;
+ collisions+=ht.collisionsT;
+ containments+=ht.containmentsT;
+ containmentCollisions+=ht.containmentCollisionsT;
+ baseContainments+=ht.baseContainmentsT;
+ baseMatches+=ht.baseMatchesT;
+ addedToMain+=ht.addedToMainT;
+ readsProcessed+=ht.readsProcessedT;
+ basesProcessed+=ht.basesProcessedT;
+ }
+ alht.clear();
+
+ if(verbose){System.err.println("Attempting to close input streams (1).");}
+ for(ConcurrentReadInputStream cris : crisa){
+ errorState|=ReadWrite.closeStream(cris);
+ }
+ crisa=null;
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Found "+matches+" duplicates.");
+ outstream.println("Finished exact matches. Time: "+t);
+ Shared.printMemory();
+ if(verbose){outstream.println(affixMap1);}
+ outstream.println();
+ t.start();
+ }
+ }
+
+ private void processContainments(Timer t){
+ ArrayList<Read> list=new ArrayList<Read>((int)addedToMain);
+ for(ArrayList<Unit> alu : codeMap.values()){
+ for(Unit u : alu){
+ assert(u.r.mate==null) : "Containments are not currently supported with paired reads.";
+ if(u.valid() && u.r.pairnum()==0){list.add(u.r);}
+ }
+ }
+
+ // if(minLengthPercent>0){
+ // if(verbose){System.err.println("Sorting.");}
+ // Collections.sort(list, ReadLengthComparator.comparator);
+ // Collections.reverse(list);
+ // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) :
+ // list.get(0).length()+", "+list.get(list.size()-1).length();
+ // }
+
+ crisa=makeCrisArray(subsetMode ? null : list);
+
+ ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, true, false));}
+
+ for(HashThread ht : alht){ht.start();}
+ for(HashThread ht : alht){
+ while(ht.getState()!=Thread.State.TERMINATED){
+ try {
+ ht.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ assert(ht.matchesT==0);
+ assert(ht.collisionsT==0);
+ assert(ht.baseMatchesT==0);
+ assert(ht.addedToMainT==0);
+// assert(ht.readsProcessedT==0);
+// assert(ht.basesProcessedT==0);
+ // matches+=ht.matchesT;
+ // collisions+=ht.collisionsT;
+ containments+=ht.containmentsT;
+ containmentCollisions+=ht.containmentCollisionsT;
+ baseContainments+=ht.baseContainmentsT;
+ // baseMatches+=ht.baseMatchesT;
+ // addedToMain+=ht.addedToMainT;
+ // readsProcessed+=ht.readsProcessedT;
+ // basesProcessed+=ht.basesProcessedT;
+ }
+ alht.clear();
+ if(verbose){System.err.println("Attempting to close input streams (2).");}
+ for(ConcurrentReadInputStream cris : crisa){
+ errorState|=ReadWrite.closeStream(cris);
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Found "+containments+" contained sequences.");
+ outstream.println("Finished containment. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ crisa=null;
+ if(!findOverlaps){
+ killAffixMaps();
+ }
+
+ long x=removeInvalid(list);
+ list.clear();
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Removed "+x+" invalid entries.");
+ outstream.println("Finished invalid removal. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ private void findOverlaps(Timer t){
+
+ ArrayList<Read> list=new ArrayList<Read>((int)addedToMain);
+ for(ArrayList<Unit> alu : codeMap.values()){
+ for(Unit u : alu){
+ if(u.valid() && u.r.pairnum()==0){
+ u.unitID=list.size();
+ list.add(u.r);
+ if(u.r.mate!=null){
+ Unit u2=(Unit)u.r.mate.obj;
+ u2.unitID=u.unitID;
+ }
+ }else{
+ u.unitID=Integer.MAX_VALUE;
+ }
+ }
+ }
+
+ if(preventTransitiveOverlaps){
+ clusterNumbers=new AtomicIntegerArray(list.size());
+ for(int i=0; i<clusterNumbers.length(); i++){
+ clusterNumbers.set(i, i);
+ }
+ }
+
+ crisa=makeCrisArray(subsetMode ? null : list);
+
+ ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, false, true));}
+
+ for(HashThread ht : alht){ht.start();}
+ for(HashThread ht : alht){
+ while(ht.getState()!=Thread.State.TERMINATED){
+ try {
+ ht.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ assert(ht.matchesT==0);
+ assert(ht.collisionsT==0);
+ assert(ht.baseMatchesT==0);
+ assert(ht.addedToMainT==0);
+
+ overlaps+=ht.overlapsT;
+ baseOverlaps+=ht.baseOverlapsT;
+ overlapCollisions+=ht.overlapCollisionsT;
+ }
+ alht.clear();
+ if(verbose){System.err.println("Attempting to close input streams (3).");}
+ for(ConcurrentReadInputStream cris : crisa){
+ errorState|=ReadWrite.closeStream(cris);
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Found "+overlaps+" overlaps.");
+ outstream.println("Finished finding overlaps. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+
+ crisa=null;
+
+ if(makeClusters){
+ int intransitive=0, redundant=0;
+ assert((intransitive=countIntransitive(t, list, rigorousTransitive))==0);
+ assert((redundant=countRedundant(t, list))==0);
+ long overlaps=countOverlaps(t, list);
+ assert(intransitive==0);
+ assert(redundant==0);
+// makeTransitive(t, list, rigorousTransitive);
+ if(clusterQueue==null){
+ clusterQueue=new ArrayDeque<ArrayList<Unit>>(list.size()/4+1);
+ processedClusters=new ArrayList<ArrayList<Unit>>();
+ }else{
+ assert(clusterQueue.isEmpty());
+ }
+ makeClusters(t, list);
+ }
+
+ list.clear();
+ }
+
+ private long makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous){
+ assert(false) : "No longer needed.";
+ long added=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid()){
+
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ Unit u2=(o.u1==u ? o.u2 : o.u1);
+ assert(u2!=u);
+ if(u2.overlapList==null){
+ u2.overlapList=new ArrayList<Overlap>(2);
+ u2.overlapList.add(o);
+ }else{
+ boolean found=false;
+ if(rigorous){
+ found=u2.overlapList.contains(o);
+ }else{
+ for(Overlap o2 : u2.overlapList){
+ if(o2.u1==u || o2.u2==u){found=true; break;}
+ }
+ }
+ if(!found){
+ added++;
+ u2.overlapList.add(o);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ for(Read r : list){
+ Unit u=(Unit) r.obj;
+ if(u.valid()){
+ assert(u.isTransitive());
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Added overlaps: "+added);
+ outstream.println("Made overlaps transitive. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ return added;
+ }
+
+ private int countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous){
+ if(!countTransitive){return 0;}
+ int transitive=0, intransitive=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid()){
+ if(rigorous ? u.isPerfectlyTransitive() : u.isTransitive()){
+ transitive++;
+ }else{
+ intransitive++;
+ }
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Intransitive: "+intransitive+", \ttransitive: "+transitive);
+ outstream.println("Checked transitivity. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+
+ return intransitive;
+ }
+
+ private int countRedundant(Timer t, ArrayList<Read> list){
+ if(!countRedundant){return 0;}
+ int redundant=0, nonredundant=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid()){
+ if(u.isNonRedundant()){
+ nonredundant++;
+ }else{
+ redundant++;
+ }
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Redundant: "+redundant+", \tnonredundant: "+nonredundant);
+ outstream.println("Checked redundancy. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ return redundant;
+ }
+
+ private long countOverlaps(Timer t, ArrayList<Read> list){
+
+ long overlaps=0, length=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid() && u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ overlaps++;
+ length+=o.overlapLen;
+ }
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Overlaps: "+overlaps+", \tlength: "+length);
+ outstream.println("Counted overlaps. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ return overlaps;
+ }
+
+	/**
+	 * Builds a histogram of cluster sizes.
+	 * <p>
+	 * For each cluster, bin min(size, lastBin) is updated in three parallel rows:
+	 * row 0 counts clusters, row 1 sums reads and row 2 sums bases.
+	 * <p>
+	 * The last bin is derived from the row length, clusterSize[0].length.  The
+	 * previous code used clusterSize.length, which is the number of rows (3), so
+	 * every cluster with two or more members was lumped into bin 2.
+	 *
+	 * @param clusters    clusters to tally
+	 * @param clusterSize matrix with at least 3 rows; updated in place
+	 * @return size of the largest cluster seen, or 0 if there were none
+	 */
+	private long fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize){
+		final int lastBin=clusterSize[0].length-1;
+		int max=0;
+		for(ArrayList<Unit> cluster : clusters){
+			//Sizes beyond the histogram are clamped into the final bin.
+			final int cs=Tools.min(lastBin, cluster.size());
+			{
+				long reads=0, bases=0;
+				for(Unit u2 : cluster){
+					reads++;
+					bases+=u2.length();
+				}
+				clusterSize[0][cs]++;
+				clusterSize[1][cs]+=reads;
+				clusterSize[2][cs]+=bases;
+			}
+			max=Tools.max(max, cluster.size());
+		}
+		return max;
+	}
+
+	/**
+	 * Builds a cluster from every not-yet-clustered unit in the list.  A cluster
+	 * goes to clusterQueue when it has more than one member and cluster
+	 * processing or MST conversion is enabled; otherwise it goes straight to
+	 * processedClusters.  A size histogram is printed when DISPLAY_PROGRESS is
+	 * set, and removeInvalid() is run on the list afterwards.
+	 *
+	 * @param t    timer used for the progress reports
+	 * @param list reads whose attached units are clustered
+	 */
+	private void makeClusters(Timer t, ArrayList<Read> list){
+
+		final int bins=70000;
+		final long[][] histogram=new long[3][bins];
+		int largest=0;
+
+		for(Read read : list){
+			final Unit seed=(Unit) read.obj;
+			if(seed.clustered()){continue;}
+
+			final ArrayList<Unit> cluster=seed.makeCluster();
+			if(cluster.size()>2){cluster.trimToSize();}
+
+			final boolean needsWork=(cluster.size()!=1 && (processClusters || maxSpanningTree));
+			if(needsWork){clusterQueue.add(cluster);}
+			else{processedClusters.add(cluster);}
+
+			//Clamp oversized clusters into the final histogram bin.
+			final int bin=Tools.min(bins-1, cluster.size());
+			long members=0, bases=0;
+			for(Unit member : cluster){
+				members++;
+				bases+=member.length();
+			}
+			histogram[0][bin]++;
+			histogram[1][bin]+=members;
+			histogram[2][bin]+=bases;
+
+			largest=Tools.max(largest, cluster.size());
+		}
+
+		if(DISPLAY_PROGRESS){
+			t.stop();
+			outstream.println(toClusterSizeString(histogram));
+			outstream.println("\nLargest: "+largest);
+			outstream.println("Finished making clusters. Time: "+t);
+			Shared.printMemory();
+			outstream.println();
+			t.start();
+		}
+
+		final long removed=removeInvalid(list);
+
+		if(DISPLAY_PROGRESS){
+			t.stop();
+			outstream.println("Removed "+removed+" invalid entries.");
+			outstream.println("Finished invalid removal. Time: "+t);
+			Shared.printMemory();
+			outstream.println();
+			t.start();
+		}
+	}
+
+	/**
+	 * Renders a cluster-size histogram as a fixed-width text table.
+	 * <p>
+	 * Sizes 1 and 2 always get their own row.  Larger sizes are grouped into
+	 * doubling ranges (3-4, 5-8, ...) with a final open-ended "N+" row; grouped
+	 * rows are printed only when they contain at least one cluster.
+	 *
+	 * @param clusterSizeMatrix row 0 = clusters per size, row 1 = reads, row 2 = bases
+	 * @return the table, preceded by a "Clusters:" summary line
+	 */
+	private String toClusterSizeString(long[][] clusterSizeMatrix){
+
+		final long[] counts=clusterSizeMatrix[0];
+		final long[] reads=clusterSizeMatrix[1];
+		final long[] bases=clusterSizeMatrix[2];
+		final int last=counts.length-1;
+
+		//Column start positions within a row.
+		final int col1=18, col2=col1*2, col3=col1*3;
+
+		final long totalClusters=Tools.sum(counts);
+		long bigClusters=0;
+		for(int i=minClusterSize; i<counts.length; i++){bigClusters+=counts[i];}
+
+		final StringBuilder table=new StringBuilder(1000);
+		final StringBuilder row=new StringBuilder(100);
+
+		//Summary line
+		table.append("Clusters:");
+		while(table.length()<col1){table.append(' ');}
+		table.append(totalClusters);
+		if(minClusterSize>=2){
+			table.append(" ("+bigClusters+" of at least size "+minClusterSize+")");
+		}
+		table.append("\n");
+
+		//Header row
+		row.append("Size Range");
+		while(row.length()<col1){row.append(' ');}
+		row.append("Clusters");
+		while(row.length()<col2){row.append(' ');}
+		row.append("Reads");
+		while(row.length()<col3){row.append(' ');}
+		row.append("Bases");
+		table.append('\n').append(row);
+		row.setLength(0);
+
+		//Data rows: i steps 0, 1, 2, 4, 8, ... and each row covers sizes i+1 through i*2.
+		for(int i=0; i<last; i=Tools.max(i+1, i*2)){
+			final int lo=i+1, hi=i*2;
+			final String label;
+			final long x, y, z;
+			final boolean show;
+
+			if(i<2){
+				//Single-size row; printed even when empty.
+				label=Integer.toString(lo);
+				x=counts[lo];
+				y=reads[lo];
+				z=bases[lo];
+				show=true;
+			}else if(hi>=counts.length){
+				//Final, open-ended range.
+				label=lo+"+";
+				x=Tools.sum(counts, lo, last);
+				y=Tools.sum(reads, lo, last);
+				z=Tools.sum(bases, lo, last);
+				show=(x>0);
+			}else{
+				label=lo+"-"+hi;
+				x=Tools.sum(counts, lo, hi);
+				y=Tools.sum(reads, lo, hi);
+				z=Tools.sum(bases, lo, hi);
+				show=(x>0);
+			}
+			if(!show){continue;}
+
+			row.append(label);
+			while(row.length()<col1){row.append(' ');}
+			row.append(x);
+			while(row.length()<col2){row.append(' ');}
+			row.append(y);
+			while(row.length()<col3){row.append(' ');}
+			row.append(z);
+
+			table.append('\n').append(row);
+			row.setLength(0);
+		}
+		return table.toString();
+	}
+
+	/**
+	 * Retired: the leading assert(false) makes this fail immediately whenever
+	 * assertions are enabled, since renaming is now done at output time.
+	 * <p>
+	 * Gives every read in processedClusters an id of the form
+	 * "Cluster C,contig I" (plus ",pos P" when the unit's offset is valid),
+	 * where C is the cluster's index and I the unit's index within the cluster.
+	 * <p>
+	 * The cluster counter is advanced after each cluster; previously it was
+	 * never incremented, so every cluster would have been labelled "Cluster 0".
+	 *
+	 * @param t timer used for the progress report
+	 */
+	private void renameClusters(Timer t){
+		assert(false) : "This is now unused; renaming is done at output time.";
+		int cnum=0;
+		final StringBuilder sb=new StringBuilder(64);
+		for(ArrayList<Unit> alu : processedClusters){
+			for(int i=0; i<alu.size(); i++){
+				Unit u=alu.get(i);
+				Read r=u.r;
+				sb.append("Cluster ");
+				sb.append(cnum);
+				sb.append(",contig ");
+				sb.append(i);
+				if(u.offsetValid()){
+					sb.append(",pos ");
+					sb.append(u.offset());
+				}
+				r.id=sb.toString();
+				sb.setLength(0);
+			}
+			cnum++; //Move on to the next cluster number
+		}
+
+		if(DISPLAY_PROGRESS){
+			t.stop();
+			outstream.println("Finished cluster renaming. Time: "+t);
+			Shared.printMemory();
+			outstream.println();
+			t.start();
+		}
+	}
+
+ private void processMst(Timer t){
+
+ if(DISPLAY_PROGRESS){outstream.println("Converting to Maximum Spanning Tree.");}
+
+ ArrayList<MstThread> alct=new ArrayList<MstThread>(THREADS);
+ for(int i=0; i<THREADS; i++){
+ alct.add(new MstThread());
+ }
+
+ long overlapsRemoved=0;
+ long overlapBasesRemoved=0;
+ long overlapsRetained=0;
+ long overlapBasesRetained=0;
+
+ for(MstThread ct : alct){ct.start();}
+ for(MstThread ct : alct){
+ while(ct.getState()!=Thread.State.TERMINATED){
+ try {
+ ct.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+
+ overlapsRemoved+=ct.overlapsRemovedT;
+ overlapBasesRemoved+=ct.overlapBasesRemovedT;
+ overlapsRetained+=ct.overlapsRetainedT;
+ overlapBasesRetained+=ct.overlapBasesRetainedT;
+ }
+ assert(clusterQueue.isEmpty());
+ if(processClusters){
+ for(MstThread ct : alct){
+ clusterQueue.addAll(ct.processedT);
+ ct.processedT.clear();
+ ct.processedT=null;
+ }
+ }else{
+ for(MstThread ct : alct){
+ processedClusters.addAll(ct.processedT);
+ ct.processedT.clear();
+ ct.processedT=null;
+ }
+ clusterQueue=null;
+ }
+ alct.clear();
+
+ assert(affixMaps==null);
+ killAffixMaps();
+
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Removed "+(overlapsRemoved)+" edges ("+overlapBasesRemoved+" bases).");
+ outstream.println("Retained "+(overlapsRetained)+" edges ("+overlapBasesRetained+" bases).");
+
+// outstream.println("\nAfter conversion to Maximum Spanning Tree:");
+// final int[] clusterSize=new int[8200];
+// int max=0;
+// for(ArrayList<Unit> cluster : processedClusters){
+// clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++;
+// max=Tools.max(max, cluster.size());
+// }
+// outstream.println(toClusterSizeString(clusterSize));
+// outstream.println("Largest: "+max);
+
+ outstream.println("Finished MST conversion. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+	/**
+	 * Runs THREADS ClusterThread workers, sums their statistics, releases the
+	 * affix maps and clusterQueue, and - when DISPLAY_PROGRESS is set - prints a
+	 * report followed by a histogram of the resulting cluster sizes.
+	 *
+	 * @param t             timer used for the progress report
+	 * @param mergeClusters passed as the last three ClusterThread constructor arguments
+	 */
+	private void processClusters(Timer t, boolean mergeClusters){
+
+		final ArrayList<ClusterThread> workers=new ArrayList<ClusterThread>(THREADS);
+		for(int i=0; i<THREADS; i++){
+			final ClusterThread worker=new ClusterThread(fixMultiJoins, canonicizeClusters, removeCycles, fixCanonContradictions, fixOffsetContradictions,
+					mergeClusters, mergeClusters, mergeClusters);
+			workers.add(worker);
+		}
+
+		//Merge statistics
+		long leafMerges=0, leafBaseMerges=0;
+		long innerMerges=0, innerBaseMerges=0;
+
+		//Multijoin statistics
+		long multiJoinFailures=0, multiJoinsFound=0, multiJoinBasesFound=0;
+
+		//Orientation / offset / cycle statistics
+		long unitsFlipped=0, overlapsFlipped=0;
+		long canonContradictoryOverlaps=0, canonContradictoryClusters=0;
+		long offsetContradictoryOverlaps=0, offsetContradictoryClusters=0;
+		long cycleOverlaps=0, cycleClusters=0;
+
+		for(ClusterThread worker : workers){worker.start();}
+		for(ClusterThread worker : workers){
+			//Keep joining until the thread has really terminated; an interrupt only gets logged.
+			boolean finished=(worker.getState()==Thread.State.TERMINATED);
+			while(!finished){
+				try {
+					worker.join();
+				} catch (InterruptedException e) {
+					e.printStackTrace();
+				}
+				finished=(worker.getState()==Thread.State.TERMINATED);
+			}
+
+			leafMerges+=worker.leafMergesT;
+			innerMerges+=worker.innerMergesT;
+			leafBaseMerges+=worker.leafBaseMergesT;
+			innerBaseMerges+=worker.innerBaseMergesT;
+
+			multiJoinFailures+=worker.multiJoinFailuresT;
+			multiJoinsFound+=worker.multiJoinsFoundT;
+			multiJoinBasesFound+=worker.multiJoinBasesFoundT;
+			unitsFlipped+=worker.unitsFlippedT;
+			overlapsFlipped+=worker.overlapsFlippedT;
+			canonContradictoryOverlaps+=worker.canonContradictoryOverlapsT;
+			canonContradictoryClusters+=worker.canonContradictoryClustersT;
+			offsetContradictoryOverlaps+=worker.offsetContradictoryOverlapsT;
+			offsetContradictoryClusters+=worker.offsetContradictoryClustersT;
+			cycleOverlaps+=worker.cycleOverlapsT;
+			cycleClusters+=worker.cycleClustersT;
+		}
+		workers.clear();
+
+		assert(affixMaps==null || affixMap1==null);
+		killAffixMaps();
+
+		assert(clusterQueue.isEmpty());
+		clusterQueue=null;
+
+		if(!DISPLAY_PROGRESS){return;}
+
+		t.stop();
+		if(fixMultiJoins){
+			outstream.println("Found "+(multiJoinsFound)+" multijoins ("+multiJoinBasesFound+" bases).");
+			outstream.println("Experienced "+(multiJoinFailures)+" multijoin removal failures.");
+		}
+		if(canonicizeClusters){
+			outstream.println("Flipped "+(unitsFlipped)+" reads and "+overlapsFlipped+" overlaps.");
+			outstream.println("Found "+(canonContradictoryClusters)+" clusters ("+canonContradictoryOverlaps+" overlaps) with contradictory orientation cycles.");
+		}
+		if(fixOffsetContradictions){
+			outstream.println("Found "+(offsetContradictoryClusters)+" clusters ("+offsetContradictoryOverlaps+" overlaps) with contradictory offset cycles.");
+		}
+		outstream.println("Found "+(cycleClusters)+" clusters ("+cycleOverlaps+" overlaps) with remaining cycles.");
+		if(absorbOverlap){
+			outstream.println("Merged "+(leafMerges)+" leaves ("+leafBaseMerges+" bases).");
+			outstream.println("Merged "+(innerMerges)+" nonleaves ("+innerBaseMerges+" bases).");
+		}
+
+		outstream.println("\nAfter processing clusters:");
+		final long[][] histogram=new long[3][70000];
+		final long largest=fillClusterSizeMatrix(processedClusters, histogram);
+		outstream.println(toClusterSizeString(histogram));
+		outstream.println("\nLargest: "+largest);
+
+		outstream.println("Finished processing. Time: "+t);
+		Shared.printMemory();
+		outstream.println();
+		t.start();
+	}
+
+ /**
+ * Removes every read whose Unit is no longer valid from list, and purges null/invalid
+ * Units from the codeMap, affixMap1 and affixMap2 buckets that such a read keys into.
+ * A bucket left empty by the purge is removed from its map.
+ * @param list Reads to filter; invalid entries are nulled and then condensed out in place.
+ * @return Number of invalid reads removed from list.
+ */
+ private long removeInvalid(ArrayList<Read> list){
+ final LongM keym=new LongM(); //Reusable mutable key for the affix map lookups
+ long removedC=0, removedP=0, removedS=0, invalid=0;
+
+ for(int j=0, lim=list.size(); j<lim; j++){
+ final Read r=list.get(j);
+ final Unit u=(Unit)r.obj;
+ if(u.valid()){continue;}
+
+ invalid++;
+
+ if(codeMap!=null && !codeMap.isEmpty()){
+ Long key=u.code1;
+ ArrayList<Unit> alu=codeMap.get(key);
+ if(alu!=null){
+ removedC+=purgeInvalidUnits(alu);
+ if(alu.isEmpty()){codeMap.remove(key);}
+ }
+ }
+
+ if(affixMap1!=null && !affixMap1.isEmpty()){
+ {
+ keym.set(u.prefix1);
+ ArrayList<Unit> alu=affixMap1.get(keym);
+ if(alu!=null){
+ removedP+=purgeInvalidUnits(alu);
+ if(alu.isEmpty()){affixMap1.remove(keym);}
+ }
+ }
+ if(storeSuffix){
+ keym.set(u.suffix1);
+ ArrayList<Unit> alu=affixMap1.get(keym);
+ if(alu!=null){
+ removedS+=purgeInvalidUnits(alu);
+ if(alu.isEmpty()){affixMap1.remove(keym);}
+ }
+ }
+ }
+ if(affixMap2!=null && !affixMap2.isEmpty()){
+ if(u.prefix2!=-1){
+ keym.set(u.prefix2);
+ ArrayList<Unit> alu=affixMap2.get(keym);
+ if(alu!=null){
+ removedP+=purgeInvalidUnits(alu);
+ if(alu.isEmpty()){affixMap2.remove(keym);}
+ }
+ }
+ if(storeSuffix && u.suffix2!=-1){
+ keym.set(u.suffix2);
+ ArrayList<Unit> alu=affixMap2.get(keym);
+ if(alu!=null){
+ removedS+=purgeInvalidUnits(alu);
+ if(alu.isEmpty()){affixMap2.remove(keym);}
+ }
+ }
+ }
+
+ list.set(j, null); //Condensed out below
+ }
+
+ if(invalid>0){
+ Tools.condenseStrict(list);
+ }
+ if(verbose){
+ outstream.println("Removed invalids: "+removedC+", "+removedP+", "+removedS);
+ }
+ return invalid;
+ }
+
+ /**
+ * Deletes null and invalid Units from alu in place, scanning from the end so removals do not shift unscanned entries.
+ * After the call every remaining element is a valid Unit, so an empty list means the bucket had no valid entries.
+ * @param alu Bucket to clean.
+ * @return Number of elements deleted.
+ */
+ private static int purgeInvalidUnits(ArrayList<Unit> alu){
+ int removed=0;
+ for(int i=alu.size()-1; i>=0; i--){
+ final Unit u2=alu.get(i);
+ if(u2==null || !u2.valid()){
+ alu.remove(i);
+ removed++;
+ }
+ }
+ return removed;
+ }
+
+
+ private static ArrayList<Read> addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean ascending, boolean clear, long outNum){
+ assert(outNum<=Integer.MAX_VALUE);
+ if(verbose){System.err.println("Making list.");}
+ ArrayList<Read> list=new ArrayList<Read>((int)outNum);
+ if(verbose){System.err.println("Adding.");}
+ for(ArrayList<Unit> alu : codeMap.values()){
+ for(Unit u : alu){
+ if(u.valid() && u.r.pairnum()==0){list.add(u.r);}
+ }
+ if(clear){alu.clear();}
+ }
+ if(clear){codeMap.clear();}
+
+ if(sort){
+ if(verbose){System.err.println("Sorting.");}
+ Collections.sort(list, ReadLengthComparator.comparator);
+ if(ascending){
+ Collections.reverse(list);
+ assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) :
+ list.get(0).length()+", "+list.get(list.size()-1).length();
+ }else{
+ assert(list.isEmpty() || list.get(0).length()>=list.get(list.size()-1).length()) :
+ list.get(0).length()+", "+list.get(list.size()-1).length();
+ }
+ }
+ assert(list.size()==outNum || list.size()*2L==outNum || UNIQUE_ONLY) : list.size()+", "+outNum;
+ return list;
+ }
+
+ private void writeOutput(String clusterStatsFile, Timer t){
+// verbose=true;
+// assert(false) : (processedClusters==null)+", "+(processedClusters.isEmpty())+", "+outgraph+", "+out+", "+clusterFilePattern;
+ if(processedClusters==null || processedClusters.isEmpty()){
+
+ if(out!=null || clusterFilePattern!=null){
+
+ ArrayList<Read> list=addToArray(codeMap, sort, ascending, true, addedToMain-containments);
+ codeMap=null;
+
+ if(sort){
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Sorted output. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ writeOutput(list);
+ }
+ }else{
+ if(outgraph!=null){
+ writeGraph(outgraph, processedClusters);
+ }
+ if(out!=null || clusterFilePattern!=null || clusterStatsFile!=null || outbest!=null){
+ writeOutputClusters(clusterStatsFile, processedClusters);
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Printed output. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+
+
+ /**
+ * Writes the non-discarded reads of list (each followed by its mate, if any) to the main output, when one is configured.
+ * Entries of list are nulled as they are consumed and every written read is marked discarded.
+ * When uniqueNames and storeName are both set, a name that was already seen gets a "_dd" plus counter suffix.
+ * @param list Reads to write.
+ */
+ private void writeOutput(ArrayList<Read> list){
+
+ //out may be null here: the caller also enters this path when only clusterFilePattern is set.
+ //The writer is therefore optional and every use is guarded, as writeOutputClusters does for tswAll.
+ final ByteStreamWriter tsw=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true));
+
+ if(verbose){System.err.println("Writing from array.");}
+ if(tsw!=null){tsw.start();}
+
+ HashSet<String> names=((uniqueNames && storeName) ?
+ new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null);
+ long rid=0;
+ for(int x=0; x<list.size(); x++){
+ Read r=list.get(x);
+ list.set(x, null);
+
+ //Always start from the first read of a pair
+ if(r.mate!=null && r.pairnum()!=0){r=r.mate;}
+
+ assert(r.mate==null || r.mate.discarded()==r.discarded());
+
+ if(!r.discarded()){
+ rid++;
+
+ //At most two iterations: the read, then its mate
+ for(int i=0; r!=null && i<2; i++){
+ if(multipleInputFiles){r.numericID=rid;}
+ if(names!=null){
+ String name=(r.id==null ? ""+r.numericID : r.id);
+ if(names.contains(name)){
+ for(long j=0; j<Integer.MAX_VALUE; j++){
+ String name2=name+"_dd"+j;
+ if(r.mate!=null){name2+=(" /"+(i+1));}
+ if(!names.contains(name2)){
+ r.id=name2;
+ names.add(name2);
+ break;
+ }
+ }
+ }else{
+ names.add(name);
+ }
+ }
+ if(tsw!=null){tsw.println(r);}
+ r.setDiscarded(true);
+ r=r.mate;
+ }
+ }
+ }
+ if(tsw!=null){
+ if(verbose){System.err.println("Shutting down tsw "+tsw.fname);}
+ tsw.poisonAndWait();
+ }
+ }
+
+
+ private void writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist){ // Writes clusters to whichever of the combined, per-cluster, best-representative and stats outputs are configured.
+ // clist is sorted in place before anything is written, so cluster numbers below follow CLUSTER_LENGTH_COMPARATOR order.
+ Collections.sort(clist, CLUSTER_LENGTH_COMPARATOR);
+
+ if(verbose){System.err.println("Writing clusters.");}
+
+ final ByteStreamWriter tswAll=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); // Combined output; optional.
+ if(tswAll!=null){tswAll.start();}
+ ByteStreamWriter tswCluster=null; // Re-created for each written cluster when clusterFilePattern is set.
+ ByteStreamWriter tswBest=null; // One representative per written cluster when outbest is set.
+
+ if(outbest!=null){
+ tswBest=new ByteStreamWriter(outbest, overwrite, append, true);
+ tswBest.start();
+ }
+
+ TextStreamWriter csf=null; // Cluster statistics writer; optional.
+ if(clusterStatsFile!=null){
+ csf=new TextStreamWriter(clusterStatsFile, overwrite, false, false);
+ csf.start();
+ csf.print("#Name\tsize\t"+nmerLength+"-mer frequencies\n");
+ }
+
+ HashSet<String> names=((uniqueNames && storeName) ? // Names already emitted; only allocated when de-duplicating names.
+ new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null);
+ long rid=0; // Running count of written reads/pairs; becomes numericID.
+ final long[] nmerCounts=new long[maxNmer+1]; // Scratch buffer passed to makeNmerProfile.
+
+ final StringBuilder sb=new StringBuilder(64); // Shared scratch buffer; reset with setLength(0) after each use.
+
+ for(int cnum=0; cnum<clist.size(); cnum++){
+ final ArrayList<Unit> alu=clist.get(cnum);
+// clist.set(cnum, null); //This breaks subsequent output processing
+
+ if(alu.size()<minClusterSize){ // Too small to write, but may still get a stats line.
+ if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());}
+
+ if(csf!=null && alu.size()>=minClusterSizeForStats){
+ float[] profile=makeNmerProfile(alu, nmerCounts);
+ sb.append("Cluster_");
+ sb.append(cnum);
+ sb.append('\t');
+ sb.append(alu.size());
+ sb.append('\t');
+ for(float f : profile){
+ sb.append(String.format("%.5f ", f));
+ }
+ sb.setCharAt(sb.length()-1, '\n'); // Replace the last separator character with the line terminator.
+ csf.print(sb.toString());
+ sb.setLength(0);
+ }
+ }else{
+ if(verbose){System.err.println("Writing cluster "+cnum+", size "+alu.size());}
+
+ if(clusterFilePattern!=null){
+ if(tswCluster!=null){ // Finish the previous cluster's file before opening the next one.
+ if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);}
+ tswCluster.poisonAndWait();
+ tswCluster=null;
+ }
+ tswCluster=new ByteStreamWriter(clusterFilePattern.replaceFirst("%", ""+cnum), overwrite, append, true); // First '%' in the pattern becomes the cluster number.
+ if(verbose){System.err.println("Starting tswCluster "+tswCluster.fname);}
+ tswCluster.start();
+ }
+
+ if(csf!=null && alu.size()>=minClusterSizeForStats){ // Same stats line format as in the small-cluster branch above.
+ float[] profile=makeNmerProfile(alu, nmerCounts);
+ sb.append("Cluster_");
+ sb.append(cnum);
+ sb.append('\t');
+ sb.append(alu.size());
+ sb.append('\t');
+ for(float f : profile){
+ sb.append(String.format("%.5f ", f));
+ }
+ sb.setCharAt(sb.length()-1, '\n');
+ csf.print(sb.toString());
+ sb.setLength(0);
+ }
+
+ if(pickBestRepresentativePerCluster){
+ pickBestRepresenative(alu, true); // clearList=true: alu is reduced to the single chosen Unit.
+ }
+
+ if(outbest!=null){
+ Unit u=pickBestRepresenative((ArrayList<Unit>)alu.clone(), false); // Works on a copy so alu itself is left intact.
+ tswBest.println(u.r);
+ if(u.r.mate!=null){tswBest.println(u.r.mate);}
+ }
+
+ for(int contig=0; contig<alu.size(); contig++){
+ final Unit u0=alu.get(contig);
+ alu.set(contig, null); // The slot is nulled once its Unit has been fetched.
+ Read r=u0.r;
+ if(r.mate!=null && r.pairnum()!=0){r=r.mate;} // Always start from the first read of a pair.
+
+ if(!r.discarded()){
+ rid++;
+
+ for(int i=0; r!=null && i<2; i++){ // At most two iterations: the read, then its mate.
+ assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum());
+ Unit u=(Unit)r.obj;
+ if(verbose){System.err.println("Writing read "+r.id);}
+ r.numericID=rid;
+ if(renameClusters){ // Build "Cluster_N,contig_M[,pos_P][ /k]" and use it as, or append it to, the read id.
+ sb.append("Cluster_");
+ sb.append(cnum);
+ sb.append(",contig_");
+ sb.append(contig);
+ if(u.offsetValid()){
+ sb.append(",pos_");
+ sb.append(u.offset());
+ }
+ if(r.mate!=null){sb.append(" /"+(i+1));}
+ r.id=(r.id==null ? sb.toString() : r.id+"\t"+sb);
+ sb.setLength(0);
+ }else if(names!=null){ // Name de-duplication only applies when clusters are not renamed.
+ String name=(r.id==null ? ""+r.numericID : r.id);
+ if(names.contains(name)){
+ for(long j=0; j<Integer.MAX_VALUE; j++){ // Try "_dd0", "_dd1", ... until an unused name is found.
+ String name2=name+"_dd"+j;
+ if(!names.contains(name2)){
+ r.id=name2;
+ names.add(name2);
+ break;
+ }
+ }
+ }else{
+ names.add(name);
+ }
+ }
+ if(tswAll!=null){tswAll.println(r);}
+ if(tswCluster!=null){tswCluster.println(r);}
+ r.setDiscarded(true); // Marks the read as handled so the !r.discarded() check above skips it if reached again.
+ r=r.mate;
+ }
+ }
+ }
+ }
+ }
+ if(csf!=null){ // Shut down every writer that was opened.
+ if(verbose){System.err.println("Shutting down csf "+csf.fname);}
+ csf.poisonAndWait();
+ }
+ if(tswBest!=null){
+ if(verbose){System.err.println("Shutting down tswBest "+tswBest.fname);}
+ tswBest.poisonAndWait();
+ }
+ if(tswAll!=null){
+ if(verbose){System.err.println("Shutting down tswAll "+tswAll.fname);}
+ tswAll.poisonAndWait();
+ }
+ if(tswCluster!=null){
+ if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);}
+ tswCluster.poisonAndWait();
+ }
+ }
+
+
+ /**
+ * Writes the overlap graph of all sufficiently large clusters as a DOT digraph.
+ * Each read becomes a node line; a mate link and every overlap for which the read's Unit is o.u1 become edge lines.
+ * Every statement is emitted on its own tab-indented line.
+ * @param graphFile Destination file; when null the clusters are still sorted and traversed but nothing is written.
+ * @param clist Clusters to print; sorted in place with CLUSTER_LENGTH_COMPARATOR.
+ */
+ private void writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist){
+ Collections.sort(clist, CLUSTER_LENGTH_COMPARATOR);
+
+ if(verbose){System.err.println("Writing overlap graph.");}
+
+ final TextStreamWriter tsw=(graphFile==null ? null : new TextStreamWriter(graphFile, overwrite, append, true));
+ if(tsw!=null){
+ tsw.start();
+ tsw.print("digraph G {\n");
+ }
+
+ for(int cnum=0; cnum<clist.size(); cnum++){
+ final ArrayList<Unit> alu=clist.get(cnum);
+// clist.set(cnum, null); //This breaks subsequent output processing
+// Collections.sort(alu); //TODO: Remove
+
+ if(alu.size()<minClusterSize){
+ if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());}
+ }else{
+ if(verbose){System.err.println("Processing cluster "+cnum+", size "+alu.size());}
+
+ for(int contig=0; contig<alu.size(); contig++){
+ final Unit u0=alu.get(contig);
+// alu.set(contig, null); //This breaks subsequent output processing
+ Read r=u0.r;
+ //Always start from the first read of a pair
+ if(r.mate!=null && r.pairnum()!=0){r=r.mate;}
+
+ {
+ //At most two iterations: the read, then its mate
+ for(int i=0; r!=null && i<2; i++){
+ assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum());
+ Unit u=(Unit)r.obj;
+ if(verbose){System.err.println("Writing read "+r.id);}
+
+ if(tsw!=null){
+ tsw.print("\t"+toGraphName(r)+"\n");
+ if(r.mate!=null && r.pairnum()==0){
+ Read r2=r.mate;
+ //Formatted like the other statements (leading tab, trailing newline) so the
+ //mate edge no longer runs into the statement that follows it.
+ tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=mate]\n");
+ }
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ //Each overlap is printed once, from its u1 side
+ if(u==o.u1){
+ Read r2=o.u2.r;
+ tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=\""+o.toLabel()+"\"]\n");
+ }
+ }
+ }
+ }
+ r=r.mate;
+ }
+ }
+ }
+ }
+ }
+
+ if(tsw!=null){
+ tsw.print("}\n");
+ if(verbose){System.err.println("Shutting down tswAll "+tsw.fname);}
+ tsw.poisonAndWait();
+ }
+ }
+
+ private static String toGraphName(Read r){
+ if(NUMBER_GRAPH_NODES || r.id==null){
+ return r.numericID+((ADD_PAIRNUM_TO_NAME || r.mate!=null) ? "."+(r.pairnum()+1) : "");
+ }else{
+ return r.id.replace(' ','_').replace('\t','_');
+ }
+ }
+
+ private Unit pickBestRepresenative(ArrayList<Unit> alu, boolean clearList){
+ if(alu==null || alu.isEmpty()){return null;}
+ float[] quality=new float[alu.size()];
+ int[] lengths=new int[alu.size()];
+ for(int i=0; i<alu.size(); i++){
+ Unit u=alu.get(i);
+ int len=u.r.length();
+ quality[i]=u.r.expectedErrors(true, 0)/len;
+ lengths[i]=len;
+ }
+ Arrays.sort(quality);
+ Arrays.sort(lengths);
+ int medianLength=lengths[lengths.length/2];
+ float bestQuality=quality[0];
+
+ float currentBestQuality=9999999;
+ Unit best=null;
+ for(int i=0; i<alu.size(); i++){
+ Unit u=alu.get(i);
+ int len=u.r.length();
+ float deviation=Tools.absdif(len, medianLength)*1f/(medianLength+1);
+ if(deviation<0.05){
+ float qual=u.r.expectedErrors(true, 0)/len;
+ qual=(qual+.001f)*(1+10*deviation);
+ if(qual<currentBestQuality || best==null){
+ currentBestQuality=qual;
+ best=u;
+ }
+ }
+ }
+ if(clearList){
+ alu.clear();
+ alu.add(best);
+ }
+ return best;
+ }
+
+ /**
+ * Hashes a sequence in forward order.
+ * The running code starts at the sequence length; for each base it is XORed with the
+ * hashcodes entry selected by the base and the low 5 bits of the code, then rotated left by one.
+ */
+ public static long hash(byte[] bases){
+ long code=bases.length;
+ for(final byte base : bases){
+ final int mode=(int)(code&31);
+ assert(hashcodes[base]!=null) : "Invalid sequence character: '"+(char)base+"'";
+ code=Long.rotateLeft(code^hashcodes[base][mode], 1);
+ }
+ return code;
+ }
+
+
+ /**
+ * Hashes a sequence walking from the last base to the first, mapping each base through
+ * baseToComplementExtended before mixing it in with the same XOR-and-rotate step as hash().
+ */
+ public static long hashReversed(byte[] bases){
+ long code=bases.length;
+ int pos=bases.length-1;
+ while(pos>=0){
+ final byte raw=bases[pos];
+ assert(hashcodes[raw]!=null) : "Invalid sequence character: '"+(char)raw+"'";
+ final byte comp=baseToComplementExtended[raw];
+ final int mode=(int)(code&31);
+ code=Long.rotateLeft(code^hashcodes[comp][mode], 1);
+ pos--;
+ }
+ return code;
+ }
+
+
+ /**
+ * Decides whether a sequence is in canonical orientation by comparing it, base by base from the
+ * front, with the complement of the bases read from the back. The first differing position
+ * decides; a sequence with no differing position (a palindrome) counts as canonical.
+ * Always true when ignoreReverseComplement is set or the sequence is null or empty.
+ */
+ public static boolean isCanonical(byte[] bases){
+ if(ignoreReverseComplement){return true;}
+ if(bases==null || bases.length==0){return true;}
+ final int lim=(bases.length+1)/2;
+ int left=0, right=bases.length-1;
+ while(left<lim){
+ final byte fwd=bases[left];
+ final byte rev=baseToComplementExtended[bases[right]];
+ if(fwd!=rev){return fwd<rev;}
+ left++;
+ right--;
+ }
+ assert((bases.length&1)==0 || bases[lim-1]==baseToComplementExtended[bases[lim-1]]) :
+ bases.length+", "+lim+", "+bases[lim-1]+", "+(char)bases[lim-1]+(bases.length<1000 ? "\n'"+new String(bases)+"'\n" : ""); //palindrome absorb
+ return true; //palindrome
+ }
+
+
+ private static synchronized long[][] makeCodes(int symbols, int modes){
+ Random randy=new Random(1);
+ long[][] r=new long[symbols][modes];
+ for(int i=0; i<symbols; i++){
+ for(int j=0; j<modes; j++){
+ r[i][j]=randy.nextLong();
+ }
+ }
+ return r;
+ }
+
+// /** Handles IUPAC codes */
+// private static synchronized long[][] makeCodes2(int modes){
+// long[][] r0=makeCodes(26, modes);
+// long[][] r=new long[Tools.max('Z','z')+1][];
+// for(int i=0; i<26; i++){
+// char c=(char)('A'+i);
+// r[c]=r[Character.toLowerCase(c)]=r0[i];
+// }
+// return r;
+// }
+
+ /** Handles IUPAC codes and invalid symbols */
+ private static synchronized long[][] makeCodes2(int modes){
+ long[][] r=makeCodes(128, modes);
+
+ for(int i=0; i<26; i++){
+ char c=(char)('A'+i);
+ r[Character.toLowerCase(c)]=r[c];
+ }
+ return r;
+ }
+
+ private void addDupe(Read r){
+ if(dupeWriter==null){return;}
+ if(r.mate==null || r.pairnum()==0){
+ synchronized(dupeWriter){
+ dupeWriter.println(r);
+ if(r.mate!=null){
+ dupeWriter.println(r.mate);
+ }
+ }
+ }
+ }
+
+
+ private final class MstThread extends Thread{ // Worker that reduces each cluster's overlaps to a spanning tree: tree overlaps are flagged mst, the rest invalidated and dropped.
+
+ public MstThread(){}
+
+ public void run(){ // Processes clusters from nextCluster() until it returns null.
+
+ ArrayList<Unit> cluster=null;
+ while((cluster=nextCluster())!=null){
+ makeMst(cluster);
+ processedT.add(cluster); // Finished clusters accumulate in this thread's own list.
+ }
+
+ }
+
+ public void makeMst(ArrayList<Unit> cluster){ // Grows the tree outward from cluster.get(0), always taking the next overlap the heap yields.
+ assert(heap.isEmpty());
+ unvisit(cluster);
+ for(Unit u : cluster){
+ u.flags&=~Unit.VISIT_MASK; // Clears the visit bits directly, in addition to the unvisit() call above.
+ Collections.sort(u.overlapList);
+ }
+ {
+ Unit u=cluster.get(0); // Seed of the tree.
+ u.setVisited(true);
+ heap.addAll(u.overlapList);
+ }
+// assert(false) : cluster.size();
+ while(!heap.isEmpty()){
+ Overlap o=heap.poll(); // Order is whatever Overlap's natural ordering defines (not shown in this section).
+ assert(!o.mst());
+ if(!o.invalid()){
+// assert(o.u1.overlapList.contains(o)); //slow
+// assert(o.u2.overlapList.contains(o)); //slow
+ assert(o.u1.visited() || o.u2.visited());
+ final Unit u=(!o.u1.visited() ? o.u1 : !o.u2.visited()? o.u2 : null); // The endpoint not yet in the tree, or null if both already are.
+ if(u!=null){
+ o.setMst(true); // This overlap joins a new Unit to the tree.
+ u.setVisited(true);
+ overlapsRetainedT++;
+ overlapBasesRetainedT+=o.overlapLen;
+ for(Overlap o2 : u.overlapList){
+ if(o2.mst()){
+ //do nothing
+ }else if(!o2.u1.visited() || !o2.u2.visited()){ // Still leads to an unvisited Unit: candidate for the tree.
+ if(heap.size()>=Integer.MAX_VALUE){
+ removeInvalid(heap); // Compact the heap before it can exceed the int size limit.
+ }
+ heap.add(o2);
+ }else if(!o2.invalid()){ // Both ends already in the tree and not a tree edge: redundant, so invalidate it.
+ o2.setInvalid(true);
+ overlapsRemovedT++;
+ overlapBasesRemovedT+=o2.overlapLen;
+ }
+ }
+ }
+ }
+ }
+ for(Unit u : cluster){ // Final pass: physically remove invalidated overlaps from every Unit's list.
+ ArrayList<Overlap> alo=u.overlapList;
+ int removed=0;
+ for(int i=0; i<alo.size(); i++){
+ Overlap o=alo.get(i);
+ if(o.invalid()){
+ assert(!o.mst());
+ alo.set(i, null); // Nulled now, squeezed out by condenseStrict below.
+ removed++;
+ }else{
+ assert(o.mst()); // Everything that survives is expected to be a tree edge.
+ }
+ }
+ if(removed>0){
+ Tools.condenseStrict(alo);
+ alo.trimToSize();
+ }
+ }
+ }
+
+ private void removeInvalid(PriorityQueue<Overlap> heap){ // Rebuilds the heap keeping only overlaps that are not flagged invalid.
+ ArrayList<Overlap> valid=new ArrayList<Overlap>(heap.size());
+ for(Overlap o : heap){
+ if(!o.invalid()){
+ assert(!o.u1.visited() || !o.u2.visited());
+ valid.add(o);
+ }
+ }
+ heap.clear();
+ heap.addAll(valid);
+ }
+
+ // Per-thread counters updated by makeMst.
+ public long overlapsRemovedT=0;
+ public long overlapBasesRemovedT=0;
+ public long overlapsRetainedT=0;
+ public long overlapBasesRetainedT=0;
+
+ private final PriorityQueue<Overlap> heap=new PriorityQueue<Overlap>((1<<16)-1); // Candidate overlaps; expected to be empty between clusters (asserted in makeMst).
+ private ArrayList<ArrayList<Unit>> processedT=new ArrayList<ArrayList<Unit>>(); // Clusters this thread has finished.
+ }
+
+
+ /**
+ * Processes clustered sets of reads.
+ * @author Brian Bushnell
+ * @date Aug 9, 2013
+ *
+ */
+ private final class ClusterThread extends Thread{
+
+ public ClusterThread(boolean fixMultiJoins_, boolean canonicize_, boolean removeCycles_, // Copies the processing options into per-thread fields and creates the aligner.
+ boolean fixCanonContradictions_, boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_){
+ fixMultiJoinsT=fixMultiJoins_;
+ canonicizeT=canonicize_;
+ fixCanonContradictionsT=fixCanonContradictions_;
+ fixOffsetContradictionsT=fixOffsetContradictions_;
+ mergeClustersT=mergeClusters_;
+ mergeLeavesT=mergeLeaves_;
+ mergeInnerT=mergeInner_;
+ // NOTE(review): removeCycles_ is never stored here; run() passes removeCycles (no trailing T) to findCycles - presumably an enclosing-class field, confirm this is intended.
+// assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+mergeLeavesT+", "+mergeInnerT;
+ bandy=(maxEdits>0 ? BandedAligner.makeBandedAligner(bandwidth) : null); // An aligner is only created when edits are allowed.
+// assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+fixOffsetContradictionsT+", "+mergeClustersT+", "+removeCycles_;
+ }
+
+ public void run(){ // Per-cluster pipeline: multijoin scan, canonicization, cycle detection, offset generation, merging; results are handed to processedClusters in batches.
+
+ final ArrayList<Unit> temp=new ArrayList<Unit>(1000); // Scratch list reused by the breadth-first helpers; must be empty between uses.
+
+ ArrayList<Unit> cluster=null;
+ while((cluster=nextCluster())!=null){
+
+ if(EA){
+ for(Unit u : cluster){assert(u.r.mate==null) : "Cluster processing/merging is not supported for paired reads, only cluster generation.";}
+ }
+
+// for(Unit u : cluster){assert(!u.visited());}
+ unvisit(cluster);
+
+ reorderClusterBreadthFirst(cluster);
+ int multiJoinCount=findMultiJoinsInCluster(cluster, fixMultiJoinsT);
+
+ if(EA){
+ for(Unit u : cluster){assert(!u.visited());}
+ }
+
+ boolean ok=true; // Once false, the remaining stages are skipped for this cluster.
+ if(multiJoinCount!=0){
+ assert(multiJoinCount>0);
+ multiJoinsFoundT+=multiJoinCount;
+ if(!fixMultiJoinsT){ // Multijoins were found but not resolved.
+ multiJoinFailuresT++;
+ ok=false;
+ }
+ }
+
+ int canonContradictions=0;
+ if(ok && canonicizeT){
+ if(EA){ // Precondition checks: no leftover flags from earlier processing.
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.canonContradiction());
+ assert(!u.canonicized());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.canonContradiction()) :
+ o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+cluster.contains(o.u1)+", "+cluster.contains(o.u2);
+ }
+ }
+ }
+ }
+ canonContradictions=canonicizeClusterBreadthFirst(cluster, temp);
+// System.err.println("Canonicized cluster of size "+cluster.size()+"; contradictions = "+canonContradictions+"; canonicized = "+temp.size());
+ temp.clear();
+ for(Unit u : cluster){assert(!u.visited());}
+ if(canonContradictions>0){
+ canonContradictoryOverlapsT+=canonContradictions;
+ canonContradictoryClustersT++;
+ if(fixCanonContradictionsT){
+ if(verbose){System.err.println("Pruning cluster to remove canonization contradictions.");}
+ fullyPruneCluster(cluster, temp); // Shrinks cluster in place; split-off parts are queued or recorded by fullyPruneCluster.
+ if(verbose){System.err.println("Resulting size: "+cluster.size());}
+ if(EA){
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.canonContradiction());
+ assert(u.canonicized());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.canonContradiction());
+ assert(o.type==FORWARD) : "\n"+o+"\n"+
+ o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+
+ "\n"+cluster.contains(o.u1)+", "+cluster.contains(o.u2)+", "+cluster.size();
+ }
+ }
+ }
+ }
+ }else{
+ ok=false; // Contradictions left unresolved.
+ }
+ }
+ }
+
+ int cycleOverlaps=0;
+ if(ok){
+ cycleOverlaps=findCycles(cluster, removeCycles); // Uses removeCycles, not a per-thread copy.
+ for(Unit u : cluster){assert(!u.visited());}
+ if(cycleOverlaps>0){
+ cycleOverlapsT+=cycleOverlaps;
+ cycleClustersT++;
+ }
+ }
+
+ int offsetContradictions=0;
+ if(ok && fixOffsetContradictionsT){
+ if(EA){ // Precondition checks: canonicized Units with no offsets assigned yet.
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.offsetContradiction());
+ assert(!u.offsetValid());
+ assert(u.canonicized());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.offsetContradiction());
+ assert(o.type==FORWARD) : o;
+ }
+ }
+ }
+ }
+ offsetContradictions=generateOffsetsBreadthFirst(cluster, temp);
+// System.err.println("Made offsets for cluster of size "+cluster.size()+"; contradictions = "+offsetContradictions+"; set = "+temp.size());
+ temp.clear();
+ for(Unit u : cluster){assert(!u.visited());}
+ if(offsetContradictions>0){
+ offsetContradictoryOverlapsT+=offsetContradictions;
+ offsetContradictoryClustersT++;
+ if(fixOffsetContradictionsT){ // Always true here: the enclosing if already requires fixOffsetContradictionsT.
+ if(verbose){System.err.println("Pruning cluster to remove offset contradictions.");}
+ fullyPruneCluster(cluster, temp);
+ if(verbose){System.err.println("Resulting size: "+cluster.size());}
+ if(EA){
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.offsetContradiction());
+ assert(u.offsetValid());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.offsetContradiction());
+ assert(o.type==FORWARD) : o;
+ }
+ }
+ }
+ }
+ }else{
+ ok=false; // Unreachable as written, given the enclosing condition.
+ }
+ }
+ if(ok){Collections.sort(cluster, UNIT_OFFSET_COMPARATOR);}
+ }
+
+ if(ok && absorbOverlap){
+ mergeCluster(cluster);
+ }
+
+ processedClustersT.add(cluster); // The cluster is recorded whether or not every stage succeeded.
+ if(processedClustersT.size()>=threadMaxReadsToBuffer){ // Flush the local buffer to the shared list under its lock.
+ synchronized(processedClusters){
+ processedClusters.addAll(processedClustersT);
+ processedClustersT.clear();
+ }
+ }
+ }
+ synchronized(processedClusters){ // Final flush once no more clusters are available.
+ processedClusters.addAll(processedClustersT);
+ processedClustersT.clear();
+ }
+ }
+
+ private void fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp){ // Prunes cluster in place, then keeps splitting whatever was cut off until nothing further separates.
+ assert(cluster.size()>1) : cluster.size();
+ ArrayList<Unit> pruned=pruneCluster(cluster, true, true, temp); // First pass removes contradictory nodes and overlaps; the return value holds the Units cut off, or null.
+ assert(temp.isEmpty());
+ assert(pruned==null || pruned.size()>0);
+ while(pruned!=null){
+ ArrayList<Unit> subcluster=pruned;
+ for(Unit u : subcluster){ // Reset volatile flags on the cut-off Units and their overlaps before re-pruning.
+ u.clearVolatileFlags();
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ o.clearVolatileFlags();
+ }
+ }
+ }
+ assert(subcluster.size()>0);
+ pruned=pruneCluster(subcluster, false, false, temp); // No contradiction-based removal here: only separates Units unreachable from subcluster.get(0).
+ assert(temp.isEmpty());
+ assert(pruned==null || pruned.size()>0);
+ assert(subcluster.size()>0);
+ if(subcluster.size()==1){
+ processedClustersT.add(subcluster); // A singleton goes straight to the processed list.
+ }else{
+ assert(subcluster.size()>1);
+ synchronized(clusterQueue){
+ clusterQueue.add(subcluster); // Larger pieces go back on the shared queue as new clusters.
+ }
+ }
+ }
+ }
+
+ /**
+ * Runs the enabled merge passes on a cluster: leaves first, then inner nodes.
+ * A cluster of exactly one Unit is left untouched.
+ * @param cluster Cluster to merge
+ */
+ private void mergeCluster(ArrayList<Unit> cluster) {
+ if(cluster.size()!=1){
+ if(mergeLeavesT){mergeLeaves(cluster);}
+ if(mergeInnerT){mergeInner(cluster);}
+ }
+ }
+
+ /**
+ * Finds places in the cluster where two Units are joined by multiple different Overlaps.
+ * Returns number of multijoins found.
+ * Visited flags on neighbors are used as temporary markers and are cleared again before returning.
+ * @param cluster Units to scan; clusters with fewer than 2 Units are skipped.
+ * @param resolveProblems If true, the extra overlaps are removed from the scanning Unit's own overlap list.
+ */
+ private int findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems) {
+ if(cluster.size()<2){return 0;}
+ int totalMultiJoins=0;
+ for(Unit ua : cluster){
+ ArrayList<Overlap> list=ua.overlapList;
+ assert(list!=null);
+ if(list.size()>1){ // A multijoin needs at least two overlaps on the same Unit.
+ Collections.sort(list);
+
+ int multiJoins=0;
+ for(int i=0; i<list.size(); i++){
+ Overlap o=list.get(i);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1); // The Unit at the other end of this overlap.
+ assert(ua!=ub);
+ assert(ua==o.u1 || ua==o.u2);
+ if(ub.visited()){ // ub was already reached through an earlier overlap in this list: extra join.
+ multiJoins++;
+ multiJoinBasesFoundT+=o.overlapLen;
+ if(!o.multiJoin()){o.setMultiJoin(true);}
+ if(resolveProblems){list.set(i, null);} // Nulled now, squeezed out by condenseStrict below.
+ }else{
+ ub.setVisited(true);
+ }
+ }
+
+ if(multiJoins>0){
+ totalMultiJoins+=multiJoins;
+ if(resolveProblems){Tools.condenseStrict(list);}
+ }
+
+ for(int i=0; i<list.size(); i++){ // Undo the temporary visited marks on the neighbors.
+ Overlap o=list.get(i);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1);
+ assert(ua!=ub);
+ assert(ua==o.u1 || ua==o.u2);
+ assert(ub.visited());
+ ub.setVisited(false);
+ }
+ }
+
+ }
+
+ return totalMultiJoins;
+ }
+
+ private ArrayList<Unit> pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited){ // Keeps what is reachable from cluster.get(0) in cluster; returns the Units cut off, or null if none.
+ if(verbose){System.err.println("pruneCluster(size="+cluster.size()+", "+pruneContradictoryNodes+", "+pruneContradictoryOverlaps+")");}
+
+ //pruneContradictoryOverlaps is less strict than pruneContradictoryNodes
+ assert(pruneContradictoryOverlaps || !pruneContradictoryNodes);
+
+ for(Unit ua : cluster){
+ assert(!ua.visited());
+ assert(ua.isPerfectlyTransitive()) : ua;
+ if(ua.visited()){ua.setVisited(false);} // Defensive reset for runs with assertions disabled.
+ }
+
+ int prunedOverlaps=0;
+ int visits=1; // Counts the root.
+
+ {
+ final Unit root=cluster.get(0);
+ assert(!root.contradiction());
+ root.setVisited(true);
+ visited.add(root);
+ }
+
+ for(int i=0; i<visited.size(); i++){ // Breadth-first walk; visited grows while it is being iterated.
+ Unit ua=visited.get(i);
+
+ if(ua.visited() && (!ua.contradiction() || !pruneContradictoryNodes)){
+ ArrayList<Overlap> list=ua.overlapList;
+ if(list!=null){
+ int removed=0;
+ for(int j=0; j<list.size(); j++){
+ Overlap o=list.get(j);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1); // The Unit at the other end of this overlap.
+ assert(o.u1==ua || o.u2==ua);
+ assert(ua!=ub);
+ assert(ub.valid());
+
+ assert(!o.canonContradiction() || (ua.canonContradiction() || ub.canonContradiction())) :
+ "\n"+o.canonContradiction()+", "+ua.canonContradiction()+", "+ub.canonContradiction();
+
+ assert(!o.offsetContradiction() || (ua.offsetContradiction() || ub.offsetContradiction())) :
+ "\n"+o.offsetContradiction()+", "+ua.offsetContradiction()+", "+ub.offsetContradiction();
+
+// assert(o.contradiction()==(ua.contradiction() && ub.contradiction())) :
+// "\n"+o.canonContradiction()+", "+o.offsetContradiction()+
+// "\n"+ua.canonContradiction()+", "+ua.offsetContradiction()+
+// "\n"+ub.canonContradiction()+", "+ub.offsetContradiction();
+
+ final boolean remove=(pruneContradictoryNodes && ub.contradiction() || (pruneContradictoryOverlaps && o.contradiction())); // Cut the edge if the neighbor or the overlap itself is contradictory, per the enabled modes.
+ if(!remove && !ub.visited()){
+ ub.setVisited(true);
+ visited.add(ub);
+ visits++;
+ }
+
+ if(remove){
+ if(!o.invalid()){o.setInvalid(true);} // The invalid flag lets the far-side Unit drop this overlap in the second pass below.
+ list.set(j, null);
+ removed++;
+ prunedOverlaps++;
+ }else{
+ assert(!o.invalid());
+ }
+ }
+ if(removed>0){Tools.condenseStrict(list);}
+ }
+ }
+ }
+
+ if(verbose){System.err.println("cluster.size()="+cluster.size()+", visits="+visits+", visited.size()="+visited.size());}
+
+// if(visited.size()==11486){ //TODO: For testing. Remove.
+// for(int i=0; i<visited.size(); i++){
+// Unit u=visited.get(i);
+// assert(u.visited());
+// assert(!u.canonContradiction());
+// assert(u.canonicized());
+// for(Overlap o : u.overlapList){
+// assert(!o.canonContradiction());
+// assert(o.type==FORWARD) : "\n\no="+o+"\ni="+i+", u.overlapList.size="+u.overlapList.size()+"\n"+
+// o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+
+// "\n"+visited.contains(o.u1)+", "+visited.contains(o.u2)+", "+visited.size()+
+// "\n"+u.overlapList;
+// }
+// }
+// }
+
+ final int numUnvisited=cluster.size()-visits;
+ ArrayList<Unit> pruned=(numUnvisited==0 ? null : new ArrayList<Unit>(numUnvisited)); // NOTE(review): when Units were cut off, this list is replaced by the allocation a few lines below.
+ assert(visits==visited.size());
+ assert(visits>=1 && visits<=cluster.size());
+
+ if(visits<cluster.size()){ // Some Units were not reached: collect them and detach them from the kept part.
+ pruned=new ArrayList<Unit>(cluster.size()-visits);
+ for(Unit ua : cluster){
+ if(!ua.visited()){
+ pruned.add(ua);
+ ArrayList<Overlap> list=ua.overlapList;
+ if(list!=null){
+ int removed=0;
+ for(int j=0; j<list.size(); j++){
+ Overlap o=list.get(j);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1);
+ assert(o.u1==ua || o.u2==ua);
+ assert(ua!=ub);
+ assert(ub.valid());
+
+ if(ub.visited() || o.invalid()){ // Overlap crosses into the kept part (or was cut above): drop it on this side too.
+ assert(ub.visited()==o.invalid()) : "\n"+o+"\n"+ub;
+ list.set(j, null);
+ removed++;
+ }
+ }
+ if(removed>0){Tools.condenseStrict(list);}
+ }
+ }
+ }
+ assert(pruned.size()==numUnvisited);
+ }else{
+ assert(prunedOverlaps==0) : "If this fails then I may need to mark overlaps to remove.";
+ }
+ for(Unit u : cluster){ // Post-conditions, and clear the visited marks on every original member.
+ assert(u.isPerfectlyTransitive()) : u;
+ if(EA){
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){assert(!o.invalid());}
+ }
+ }
+ if(u.visited()){u.setVisited(false);}
+ }
+ cluster.clear(); // cluster is rewritten in place to hold only the reached Units, in visit order.
+ cluster.addAll(visited);
+ cluster.trimToSize();
+
+// for(Unit u : cluster){
+//// assert(u.canonicized());
+// for(Overlap o : u.overlapList){
+// assert(pruned==null || !pruned.contains(o.u1));
+// assert(pruned==null || !pruned.contains(o.u2));
+// assert(cluster.contains(o.u1));
+// assert(cluster.contains(o.u2));
+// }
+// }
+// if(pruned!=null){
+// for(Unit u : pruned){
+// for(Overlap o : u.overlapList){
+// assert(pruned.contains(o.u1));
+// assert(pruned.contains(o.u2));
+// assert(!cluster.contains(o.u1));
+// assert(!cluster.contains(o.u2));
+// }
+// }
+// }
+
+ visited.clear(); // Leave the scratch list empty for the caller (callers assert this).
+ return pruned;
+ }
+
+ /**
+ * Cluster should already be ordered breadth-first
+ * This may fail because removing cycles could change breadth-first traversal, but if it fails, an assertion will be thrown
+ * Marks as cyclic every overlap that leads to a Unit already reached through another overlap, and counts them.
+ * Visited flags on Units and overlaps are cleared again before returning.
+ * @param cluster Units to scan, starting from cluster.get(0)
+ * @param remove If true, cyclic overlaps are deleted from the overlap list of the Unit being scanned
+ * @return Number of overlaps newly marked cyclic
+ */
+ private int findCycles(ArrayList<Unit> cluster, boolean remove){
+
+ {
+ final Unit root=cluster.get(0);
+ assert(root.length()>=cluster.get(cluster.size()-1).length());
+ root.setVisited(true);
+ }
+ int cycles=0;
+
+ for(Unit ua : cluster){
+ assert(ua.visited()); // Holds only if the cluster order is breadth-first from the root.
+ ArrayList<Overlap> list=ua.overlapList;
+ if(list!=null){
+ int removed=0;
+ for(int i=0; i<list.size(); i++){
+ Overlap o=list.get(i);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1); // The Unit at the other end of this overlap.
+ assert(o.u1==ua || o.u2==ua);
+ assert(ua!=ub);
+ assert(ub.valid());
+
+ if(!o.visited()){ // Each overlap is classified once, from whichever end is reached first.
+ o.setVisited(true);
+ if(ub.visited()){ // Second path to an already-reached Unit: this overlap closes a cycle.
+ if(!o.cyclic()){
+ o.setCyclic(true);
+ cycles++;
+ }
+ }else{
+ ub.setVisited(true);
+ }
+ }
+ if(remove && o.cyclic()){
+ list.set(i, null); // Nulled now, squeezed out by condenseStrict below.
+ removed++;
+ }
+ }
+ if(removed>0){Tools.condenseStrict(list);}
+ }
+ }
+
+ for(Unit u : cluster){ // Clear the temporary visited flags.
+ if(u.visited()){u.setVisited(false);}
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ if(o.visited()){o.setVisited(false);}
+ }
+ }
+ }
+
+ return cycles;
+ }
+
+ /**
+ * Cluster should already be ordered breadth-first
+ * Assigns offset 0 to cluster.get(0), spreads offsets outward through setOffsetsNeighbors,
+ * then shifts all valid offsets so that the smallest non-contradictory one becomes 0.
+ * @param cluster Units to position
+ * @param temp Empty scratch list; on return it holds the Units that were reached (the caller clears it)
+ * @return Total number of contradictions reported by setOffsetsNeighbors
+ */
+ private int generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp){
+
+
+ assert(temp!=null);
+ assert(temp.isEmpty());
+ {
+ final Unit root=cluster.get(0);
+ assert(root.length()>=cluster.get(cluster.size()-1).length());
+ root.setOffset(0);
+ temp.add(root);
+ }
+
+ int contradictions=0;
+ for(int i=0; i<temp.size(); i++){ // temp doubles as the breadth-first queue; setOffsetsNeighbors appends to it.
+ Unit u=temp.get(i);
+ assert(!u.visited()) : i;
+ assert(u.offsetValid() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp);
+ if(u.offsetValid() && !u.offsetContradiction()){ // Only consistently placed Units propagate offsets.
+ contradictions+=setOffsetsNeighbors(u, temp);
+ assert(contradictions==0 || (i>0 && temp.size()>2));
+ }
+ }
+
+ int min=0; // Starts at 0, so the shift below never moves offsets upward.
+ for(Unit u : temp){ // Clear the temporary visited flags and find the smallest usable offset.
+ if(u.visited()){u.setVisited(false);}
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ if(o.visited()){o.setVisited(false);}
+ }
+ }
+ if(u.offsetValid() && !u.offsetContradiction()){
+ min=Tools.min(min, u.offset());
+ }
+ }
+
+ if(verbose){
+ System.err.println("min offset = "+min);
+ }
+
+ for(Unit u : temp){ // Shift every valid offset by min.
+ if(u.offsetValid()){
+ if(verbose){System.err.println("Set "+u.name()+" offset from "+u.offset+" to "+(u.offset-min));}
+ u.offset=u.offset-min;
+ }
+ }
+
+
+ return contradictions;
+ }
+
+ /**
+ * Marks root as visited, flags offset contradictions around it via countOffsetContradictions,
+ * and then sets the offset of each neighbor reached through an unvisited, non-contradictory overlap.
+ * @param root Unit with a valid, non-contradictory offset that has not been visited yet.
+ * @param temp Work list; neighbors that do not yet have a valid offset are appended to it.
+ * @return The value of countOffsetContradictions(root, false), or 0 if root has no overlap list.
+ */
+ private int setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp) {
+ if(verbose){System.err.println("\nsetOffsetsNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(!root.visited());
+ assert(root.offsetValid());
+ assert(!root.offsetContradiction());
+ root.setVisited(true);
+ if(root.overlapList==null){return 0;}
+ // Must run before the loop below: it sets the contradiction flags that the loop tests.
+ final int contradictions=countOffsetContradictions(root, false);
+ if(verbose){System.err.println("\ncontradictions="+contradictions);}
+ for(Overlap o : root.overlapList){
+ Unit u=(o.u1==root ? o.u2 : o.u1);
+ assert(o.u1==root || o.u2==root);
+ assert(root!=u);
+ assert(u.valid());
+
+ if(verbose){System.err.println("\nProcessing Overlap "+o);}
+ if(!o.visited() && !o.offsetContradiction()){
+ o.setVisited(true);
+ if(!u.offsetContradiction()){
+ if(verbose){System.err.println("Calling setOffset: "+o);}
+ // Queue the neighbor before setOffset makes its offset valid, so each unit is queued only once.
+ if(!u.offsetValid()){temp.add(u);}
+ boolean b=setOffset(root, u, o);
+ if(verbose){System.err.println("Finished setOffset: "+o);}
+
+// if(x>0){
+// if(verbose){System.err.println("\n*********************************************");}
+// if(verbose){System.err.println("Problem detected with contig "+u.name());}
+// if(verbose){System.err.println("*********************************************\n");}
+// verbose=true;
+// int y2=countOffsetContradictions(root, false);
+// assert(contradictions==y2);
+// }
+
+ assert(b) : "\n"+contradictions+", "+o.offsetContradiction()+", "+root.offsetContradiction()+", "+u.offsetContradiction()+"\n"
+ +root.offsetValid()+", "+u.offsetValid()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b
+ +fixMultiJoins; //This assertion can fail if a multijoin is present
+ assert(u.offsetValid());
+ }
+ }
+ }
+ return contradictions;
+ }
+
+ private int countOffsetContradictions(Unit root, boolean includeKnown){
+ if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(root.visited());
+ assert(root.offsetValid());
+// assert(!root.offsetContradiction());
+ if(root.overlapList==null){return 0;}
+ int contradictions=0;
+ for(Overlap o : root.overlapList){
+ Unit u=(o.u1==root ? o.u2 : o.u1);
+ assert(o.u1==root || o.u2==root);
+ assert(root!=u);
+ assert(u.valid());
+
+ if(verbose){System.err.println("\nOverlap "+o+"\nu="+u.name()+", offsetValid="+u.offsetValid());}
+
+ boolean contradictory=(u.offsetValid() && u.offset()!=calcOffset(root, u, o));
+ if(verbose){System.err.println("contradictory= \t"+contradictory);}
+ if(contradictory){
+ if(includeKnown || !u.offsetContradiction()){
+ contradictions++;
+ if(!root.offsetContradiction()){root.setOffsetContradiction(true);}
+ }
+ if(!o.offsetContradiction()){o.setOffsetContradiction(true);}
+ if(!u.offsetContradiction()){u.setOffsetContradiction(true);}
+ }
+ assert(contradictory==o.offsetContradiction()) : contradictory+", "+o.offsetContradiction();
+ if(verbose){
+ System.err.println("root.offsetContradiction()=\t"+root.offsetContradiction());
+ System.err.println("u.offsetContradiction()= \t"+u.offsetContradiction());
+ System.err.println("o.offsetContradiction()= \t"+o.offsetContradiction());
+ System.err.println("contradictions= \t"+contradictions);
+ }
+ }
+ if(verbose){System.err.println("Final contradictions="+contradictions+"\n");}
+ return contradictions;
+ }
+
+ /**
+ * Marks the first unit of the cluster as canonicized and then propagates that state outward
+ * by calling canonicizeNeighbors on every canonicized, non-contradictory unit that is reached.
+ * Cluster should already be ordered breadth-first.
+ * Clears the 'visited' flag of every unit in temp before returning.
+ * @param cluster Units to process; element 0 is used as the root.
+ * @param temp Non-null, empty work list; on return it holds the root plus every unit added by canonicizeNeighbors.
+ * @return Sum of the values returned by canonicizeNeighbors for the processed units.
+ */
+ private int canonicizeClusterBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp) {
+
+ assert(temp!=null);
+ assert(temp.isEmpty());
+ {
+ final Unit root=cluster.get(0);
+ assert(root.length()>=cluster.get(cluster.size()-1).length());
+ root.setCanonicized(true);
+ temp.add(root);
+ }
+
+ int contradictions=0;
+ // temp grows while it is being iterated: canonicizeNeighbors appends newly canonicized units.
+ for(int i=0; i<temp.size(); i++){
+ final Unit u=temp.get(i);
+ assert(!u.visited()) : i;
+ // NOTE: the assertion message calls toString(temp), which overwrites the read ids of the listed units.
+ assert(u.canonicized() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp);
+ if(u.canonicized() && !u.canonContradiction()){
+ contradictions+=canonicizeNeighbors(u, temp);
+ assert(contradictions==0 || (i>0 && temp.size()>2));
+
+ // Sanity check: after processing u, each of its overlaps is either FORWARD or involved in a flagged contradiction.
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) :
+ o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+
+ "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited();
+ }
+ }
+ }
+
+// if(u.r.numericID==59462 || u.r.numericID==56439){ //TODO: remove
+// System.err.println("\nid="+u.r.numericID+", canonicized="+u.canonicized()+", contradiction="+u.canonContradiction()+", visited="+u.visited());
+// for(Overlap o : u.overlapList){
+// Unit u2=(o.u1==u ? o.u2 : o.u1);
+// assert(o.u1==u || o.u2==u);
+// assert(u2!=u);
+// assert(u2.valid());
+// System.err.println("o = "+o);
+// System.err.println("o.contradiction="+o.canonContradiction());
+// System.err.println("u2.id="+u2.r.numericID+", canonicized="+u2.canonicized()+", contradiction="+u2.canonContradiction()+", visited="+u.visited());
+// }
+// }
+ }
+
+ // Reset unit traversal flags; overlaps are only checked (under EA) to confirm none were marked visited.
+ for(Unit u : temp){
+ if(u.visited()){u.setVisited(false);}
+ if(EA){
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){assert(!o.visited());}
+ }
+ }
+ }
+
+ return contradictions;
+ }
+
+ /**
+ * Marks root as visited, flags canonicization contradictions around it via countCanonContradictions,
+ * and then canonicizes each neighbor reached through a non-contradictory overlap.
+ * @param root Canonicized, non-contradictory unit that has not been visited yet.
+ * @param canonicized Work list; neighbors that become canonicized during this call are appended to it.
+ * @return The value of countCanonContradictions(root, false), or 0 if root has no overlap list.
+ */
+ private int canonicizeNeighbors(Unit root, ArrayList<Unit> canonicized) {
+ if(verbose){System.err.println("\ncanonicizeNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(!root.visited());
+ assert(root.canonicized());
+ assert(!root.canonContradiction());
+ root.setVisited(true);
+ if(root.overlapList==null){return 0;}
+ // Must run before the loop below: it sets the contradiction flags that the loop tests.
+ final int contradictions=countCanonContradictions(root, false);
+ if(verbose){System.err.println("\ncontradictions="+contradictions);}
+ for(Overlap o : root.overlapList){
+ Unit u=(o.u1==root ? o.u2 : o.u1);
+ assert(o.u1==root || o.u2==root);
+ assert(root!=u);
+ assert(u.valid());
+
+ if(verbose){System.err.println("\nProcessing Overlap "+o);}
+ if(!o.canonContradiction()){
+ if(!u.canonContradiction()){
+ // b and dir capture the state before canonicize() for use in the assertion message only.
+ boolean b=u.canonicized();
+ int dir=o.type;
+ if(verbose){System.err.println("Calling canonicize: "+o);}
+ int x=canonicize(root, u, o);
+ if(verbose){System.err.println("Finished canonicize: "+o);}
+
+// if(x>0){
+// if(verbose){System.err.println("\n*********************************************");}
+// if(verbose){System.err.println("Problem detected with contig "+u.name());}
+// if(verbose){System.err.println("*********************************************\n");}
+// verbose=true;
+// int y2=countCanonContradictions(root, false);
+// assert(contradictions==y2);
+// }
+
+ assert(x==0 || (u.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC)));
+ assert(x==0) : "\n"+x+", "+contradictions+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+u.canonContradiction()+"\n"
+ +root.canonicized()+", "+u.canonicized()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b+", "+dir
+ +fixMultiJoins; //This assertion can fail if a multijoin is present
+ if(!u.canonicized()){
+ u.setCanonicized(true);
+ canonicized.add(u);
+ }
+ assert(u.canonicized());
+ }
+ }
+ }
+ // Sanity check (only when EA is set): every overlap of root is now FORWARD or involved in a flagged contradiction.
+ if(EA){
+ for(Overlap o : root.overlapList){
+ assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) :
+ o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+root.canonContradiction()+
+ "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited();
+ }
+ }
+ return contradictions;
+ }
+
+ private int countCanonContradictions(Unit root, boolean includeKnown){
+ if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(root.visited());
+ assert(root.canonicized());
+// assert(!root.canonContradiction());
+ if(root.overlapList==null){return 0;}
+ int contradictions=0;
+ for(Overlap o : root.overlapList){
+ Unit ub=(o.u1==root ? o.u2 : o.u1);
+ assert(o.u1==root || o.u2==root);
+ assert(root!=ub);
+ assert(ub.valid());
+
+ if(verbose){System.err.println("\nOverlap "+o+"\nu="+ub.name()+", canonicized="+ub.canonicized());}
+
+ boolean contradictory=(ub.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC));
+ if(verbose){System.err.println("contradictory= \t"+contradictory);}
+ if(contradictory){
+ if(!o.canonContradiction()){o.setCanonContradiction(true);}
+ if(includeKnown || !ub.canonContradiction()){
+ contradictions++;
+ if(!root.canonContradiction()){root.setCanonContradiction(true);}
+ if(!ub.canonContradiction()){ub.setCanonContradiction(true);}
+ }
+ }
+
+ assert(!o.canonContradiction() || (root.canonContradiction() || ub.canonContradiction())) :
+ "\n"+contradictory+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+ub.canonContradiction();
+
+ assert(contradictory==o.canonContradiction()) : contradictory+", "+o.canonContradiction();
+ if(verbose){
+ System.err.println("root.canonContradiction()=\t"+root.canonContradiction());
+ System.err.println("u.canonContradiction()= \t"+ub.canonContradiction());
+ System.err.println("o.canonContradiction()= \t"+o.canonContradiction());
+ System.err.println("contradictions= \t"+contradictions);
+ }
+ }
+ if(verbose){System.err.println("Final contradictions="+contradictions+"\n");}
+ return contradictions;
+ }
+
+ private String toString(ArrayList<Unit> cluster){
+ for(int i=0; i<cluster.size(); i++){
+ Unit u=cluster.get(i);
+ u.r.id=""+i;
+ }
+ StringBuilder sb=new StringBuilder(1000);
+ for(Unit u : cluster){
+ sb.append(">"+u.name()+"\n");
+ sb.append(new String(u.bases()));
+ sb.append("\n");
+ }
+ sb.append("\n*****\n");
+ for(Unit u : cluster){
+ sb.append("\n"+u.name()+":");
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ Unit ub=(o.u1==u ? o.u2 : o.u1);
+ sb.append(" "+ub.name());
+ }
+ }
+ }
+ sb.append("\n");
+ return sb.toString();
+ }
+
+ private String toShortString(ArrayList<Unit> cluster){
+ for(int i=0; i<cluster.size(); i++){
+ Unit u=cluster.get(i);
+ u.r.id=""+i;
+ }
+ StringBuilder sb=new StringBuilder(1000);
+ for(Unit u : cluster){
+ sb.append("\n"+u.name()+":");
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ Unit ub=(o.u1==u ? o.u2 : o.u1);
+ sb.append(" "+ub.name());
+ }
+ }
+ }
+ sb.append("\n");
+ return sb.toString();
+ }
+
+
+ /**
+ * @param root
+ * @param u2
+ * @param o
+ * @return Number of contradictions
+ */
+ private int canonicize(final Unit root, final Unit u2, final Overlap o){
+ if(o.type==FORWARD){return 0;}
+ if(o.type==FORWARDRC || o.type==REVERSERC){
+ if(u2.canonicized()){return 1;}
+ u2.reverseComplement();
+ unitsFlippedT++;
+ for(Overlap o2 : u2.overlapList){
+ overlapsFlippedT++;
+ o2.flip(u2, bandy);
+ }
+ assert(o.type==FORWARD || o.type==REVERSE) : OVERLAP_TYPE_NAMES[o.type];
+ }
+ if(o.type==REVERSE){o.reverseDirection();}
+ assert(o.type==FORWARD);
+ assert(o.test(bandy, o.edits+maxEdits));
+ return 0;
+ }
+
+
+ /**
+ * @param root
+ * @param u2
+ * @param o
+ * @return true if no contradictions
+ */
+ private boolean setOffset(final Unit root, final Unit u2, final Overlap o){
+ assert(root.offsetValid());
+ assert(!root.offsetContradiction());
+ int offset=calcOffset(root, u2, o);
+
+ if(u2.offsetValid()){return u2.offset()==offset;}
+ u2.setOffset(offset);
+
+ if(verbose){
+ System.err.println("\nroot = "+(root.name()==null ? root.r.numericID+"" : root.name())+", u2 = "+(u2.name()==null ? u2.r.numericID+"" : u2.name())
+ +"\no = "+o
+ +"\nroot.offset = "+root.offset()
+ +"\nu2.offset = "+u2.offset());
+ }
+
+ return true;
+ }
+
+
+ private int calcOffset(final Unit root, final Unit ub, final Overlap o){
+ assert(root.offsetValid());
+ if(o.type==FORWARD){
+ if(root==o.u1){
+ int dif=o.start1-o.start2;
+ if(verbose){System.err.println("root==o.u1=="+root.name()+", start1="+o.start1+"; u2==o.u2=="+ub.name()+", start2="+o.start2+", dif="+dif);}
+ return root.offset+dif;
+ }else{
+ int dif=o.start2-o.start1;
+ if(verbose){System.err.println("root==o.u2=="+root.name()+", start2="+o.start2+"; u2==o.u1=="+ub.name()+", start1="+o.start1+", dif="+dif);}
+ return root.offset+dif;
+ }
+ }else{
+ assert(false) : o;
+ throw new RuntimeException("TODO");
+ }
+ }
+
+
+ /**
+ * Unimplemented placeholder (marked TODO).
+ * With assertions enabled, calling it fails immediately; otherwise it iterates over the cluster and does nothing.
+ * @param cluster Cluster that would be processed; currently unused apart from the empty loop.
+ */
+ private void mergeLeaves(ArrayList<Unit> cluster) {
+ assert(false) : "TODO";
+ for(Unit u : cluster){
+
+ }
+ }
+
+ /**
+ * Unimplemented placeholder (marked TODO).
+ * With assertions enabled, calling it fails immediately; otherwise it iterates over the cluster and does nothing.
+ * @param cluster Cluster that would be processed; currently unused apart from the empty loop.
+ */
+ private void mergeInner(ArrayList<Unit> cluster) {
+ assert(false) : "TODO";
+ for(Unit u : cluster){
+
+ }
+ }
+
+ // List of clusters, initially sized for threadMaxReadsToBuffer entries. Presumably holds clusters this thread has finished - confirm against the code that fills it.
+ private ArrayList<ArrayList<Unit>> processedClustersT=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
+
+ // Statistics counters. NOTE(review): the 'T' suffix looks like a per-thread convention - confirm where these are aggregated.
+ long leafMergesT=0;
+ long innerMergesT=0;
+ long leafBaseMergesT=0;
+ long innerBaseMergesT=0;
+
+ long multiJoinFailuresT=0;
+ long multiJoinsFoundT=0;
+ long multiJoinBasesFoundT=0;
+ // Incremented by canonicize(): once per reverse-complemented unit, and once per overlap flipped as a result.
+ long unitsFlippedT=0;
+ long overlapsFlippedT=0;
+ long canonContradictoryOverlapsT=0;
+ long canonContradictoryClustersT=0;
+ long offsetContradictoryOverlapsT=0;
+ long offsetContradictoryClustersT=0;
+ long cycleOverlapsT=0;
+ long cycleClustersT=0;
+
+ // Immutable settings; they are assigned outside this view (presumably in the constructor).
+ private final boolean fixMultiJoinsT;
+ private final boolean canonicizeT;
+ private final boolean fixCanonContradictionsT;
+ private final boolean fixOffsetContradictionsT;
+ private final boolean mergeClustersT;
+ private final boolean mergeLeavesT;
+ private final boolean mergeInnerT;
+ // Aligner handed to Overlap.flip() and Overlap.test() by canonicize().
+ private final BandedAligner bandy;
+ }
+
+
+ /**
+ * Clears the 'visited' flag of every unit in the cluster that currently has it set.
+ * Overlaps are not touched.
+ * @param cluster Units to reset.
+ */
+ private void unvisit(ArrayList<Unit> cluster) {
+     for(final Unit unit : cluster){
+         if(!unit.visited()){continue;}
+         unit.setVisited(false);
+     }
+ }
+
+ /**
+ * Reorders the cluster in place into breadth-first order.
+ * The cluster is first sorted (descending length, per Unit's natural ordering), and its first element becomes the root.
+ * Each unit's overlap list is sorted before its neighbors are enqueued, so neighbors are visited in overlap-list order.
+ * All 'visited' flags set during the traversal are cleared before returning.
+ * @param cluster Connected set of units; replaced with the same units in breadth-first order.
+ */
+ private void reorderClusterBreadthFirst(ArrayList<Unit> cluster) {
+     if(verbose){System.err.println("reorderClusterBreadthFirst");}
+
+     final int size=cluster.size();
+     Collections.sort(cluster); //Now it is in descending length
+     final Unit root=cluster.get(0);
+     assert(root.length()>=cluster.get(size-1).length()) : root.length()+", "+cluster.get(size-1).length()+", "+root.compareTo(cluster.get(size-1));
+
+     ArrayList<Unit> breadthFirst=new ArrayList<Unit>(cluster.size());
+     root.setVisited(true);
+     breadthFirst.add(root);
+     //breadthFirst doubles as the BFS queue: it grows while it is being scanned.
+     for(int i=0; i<breadthFirst.size(); i++){
+         Unit u=breadthFirst.get(i);
+         //Fix: the null check must guard the sort as well; Collections.sort(null) throws NullPointerException.
+         if(u.overlapList!=null){
+             Collections.sort(u.overlapList); //Sorted in descending overlap length
+             for(Overlap o : u.overlapList){
+                 if(!o.u1.visited()){
+                     o.u1.setVisited(true);
+                     breadthFirst.add(o.u1);
+                 }
+                 if(!o.u2.visited()){
+                     o.u2.setVisited(true);
+                     breadthFirst.add(o.u2);
+                 }
+             }
+         }
+     }
+     //Every unit must have been reached; reset the traversal flags.
+     for(Unit u : cluster){
+         assert(u.visited());
+         if(u.visited()){u.setVisited(false);}
+         if(EA){
+             if(u.overlapList!=null){
+                 for(Overlap o : u.overlapList){assert(!o.visited());}
+             }
+         }
+     }
+     assert(cluster.size()==breadthFirst.size());
+     cluster.clear();
+     cluster.addAll(breadthFirst);
+ }
+
+
+
+ /** Returns next cluster larger than 1 element.
+ * Singleton clusters are added directly to 'processed'. */
+ private ArrayList<Unit> nextCluster(){
+ synchronized(clusterQueue){
+ ArrayList<Unit> cluster=clusterQueue.poll();
+ assert(cluster==null || cluster.size()>1);
+// while(cluster!=null && cluster.size()<2){
+//// unmark(cluster);
+// processedClustersT.add(cluster);
+// cluster=clusterQueue.poll();
+// }
+ return cluster;
+ }
+ }
+
+
+ /**
+ * Creates Unit objects or uses ones already attached to reads.
+ * Places them in local storage and percolates them to shared storage (codeMap), removing exact duplicates.
+ * Also hashes tips and places these in shared affixMap.
+ * Looks for containments in the affix map.
+ * @author Brian Bushnell
+ * @date Jul 24, 2013
+ *
+ */
+ private final class HashThread extends Thread{
+
+ public HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_){
+ addToCodeMapT=addToCodeMap_;
+ addToAffixMapT=addToAffixMap_;
+ findContainmentsT=findContainments_;
+ findOverlapsT=findOverlaps_;
+ findMatchesT=findMatches_;
+ tid=getTid();
+ crisq=new ArrayDeque<ConcurrentReadInputStream>(crisa.length);
+ for(int i=0; i<crisa.length; i++){
+// if(verbose){System.err.println("Adding to crisq.");}
+ crisq.add(crisa[(i+tid)%crisa.length]);
+ }
+ bandy=(maxEdits>0 && (findOverlapsT || findContainmentsT) ? BandedAligner.makeBandedAligner(bandwidth) : null);
+
+// assert(addToCodeMapT) : "addToCodeMapT="+addToCodeMapT+", addToAffixMapT="+addToAffixMapT+", findContainmentsT="+findContainmentsT+
+// ", findOverlapsT="+findOverlapsT+", findMatchesT="+findMatchesT+", convertToUpperCaseT="+convertToUpperCaseT+", numAffixMaps="+numAffixMaps;
+ }
+
+ /**
+ * Drains every input stream in this thread's queue.
+ * Each read of each list is passed to processReadOuter; the thread-local code map is merged into the shared
+ * maps whenever it exceeds the read or base buffering limits, and once more when a stream is exhausted.
+ * Thread-local buffers are released at the end.
+ */
+ public void run(){
+
+     ConcurrentReadInputStream cris=crisq.poll();
+
+     while(cris!=null){
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+         while(reads!=null && reads.size()>0){
+
+             for(Read r : reads){
+                 processReadOuter(r);
+             }
+
+             if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){
+                 assert(addToCodeMapT);
+                 long added=mergeMaps();
+                 addedToMainT+=added;
+             }
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         //Fix: the loop above also exits when ln (or ln.list) is null, so the final list can only be
+         //returned when one actually exists; the unguarded call threw NullPointerException in that case.
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+         if(codeMapT!=null && !codeMapT.isEmpty()){
+             long added=mergeMaps();
+             addedToMainT+=added;
+         }
+         cris=crisq.poll();
+     }
+
+     //Release thread-local storage.
+     codeMapT=null;
+     localConflictList=null;
+     sharedConflictList=null;
+ }
+
+ /**
+ * Processes a read and, if present, its mate.
+ * When this thread is not adding to the code map and the read has no Unit attached yet, a Unit is created;
+ * if that Unit is in the current subset, the read is swapped for the equal read with the same numeric id
+ * that is already stored in the shared code map. If no such stored read exists, nothing is processed.
+ * @param r1 First read of the pair (pairnum 0).
+ * @return true if this read was a member of this subset; false if it is shorter than MINSCAF or had no stored counterpart.
+ */
+ private boolean processReadOuter(Read r1){
+     if(r1.length()<MINSCAF){return false;}
+     Read r2=r1.mate;
+
+     assert(r1.pairnum()==0);
+     assert(r2==null || r2.pairnum()==1);
+
+     if(!addToCodeMapT && r1.obj==null){
+         if(r1.bases!=null && r1.length()>=MINSCAF){
+             final Unit u=(r1.obj!=null ? (Unit)r1.obj : new Unit(r1));
+             assert(u.r==r1 && (r1.obj==u || r1.obj==null));
+             final long code=u.code1;
+             r1.obj=u;
+             assert(u.r==r1 && r1.obj==u);
+             if(r2!=null && r2.obj==null){r2.obj=new Unit(r2);}
+
+             //Check for subset membership
+             final boolean inSet=u.inSet();
+             if(inSet){
+                 final Long codeL=code;
+                 ArrayList<Unit> list=codeMap.get(codeL);
+                 //Fix: verify the lookup BEFORE iterating; the check used to follow the loop,
+                 //so a missing entry raised NullPointerException instead of being detected.
+                 assert(list!=null);
+                 boolean found=false;
+                 if(list!=null){
+                     for(Unit u0 : list){
+                         //Replace with existing read
+                         if(u0.equals(u) && u0.r.numericID==r1.numericID){
+                             r1=u0.r;
+                             r2=r1.mate;
+                             found=true;
+                             break;
+                         }
+                     }
+                 }
+                 if(!found){
+                     return false;
+                 }
+             }
+         }
+     }
+     boolean b=processRead(r1);
+     if(r2!=null){processRead(r2);}
+     return b;
+ }
+
+ /**
+ * Processes a single read: optionally strips name/quality, applies forced trimming, attaches a Unit,
+ * and then, depending on this thread's mode flags, stores the Unit in the thread-local code map
+ * (detecting matches against units already stored under the same code) and/or searches for
+ * containments and overlaps.
+ * @param r Read to process.
+ * @return true if this read was a member of this subset; false if the read is shorter than MINSCAF.
+ */
+ private boolean processRead(Read r){
+ if(r.length()<MINSCAF){return false;}
+
+ final boolean inSet;
+ if(!storeName){r.id=null;}
+ if(!storeQuality){r.quality=null;}
+
+ if(forceTrimLeft>0 || forceTrimRight>0){//Added at request of RQC team
+ if(r!=null && r.length()>0){
+ TrimRead.trimToPosition(r, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? forceTrimRight : r.length(), 1);
+ }
+ }
+
+ readsProcessedT++;
+ basesProcessedT+=r.length();
+
+ // Reuse the Unit already attached to the read, if any.
+ final Unit u=(r.obj!=null ? (Unit)r.obj : new Unit(r));
+ assert(u.r==r && (r.obj==u || r.obj==null));
+ final long code=u.code1;
+
+ //Check for subset membership
+ inSet=u.inSet();
+
+ r.obj=u;
+ assert(u.r==r && r.obj==u);
+ if(r.mate!=null && r.mate.obj==null){r.mate.obj=new Unit(r.mate);}
+
+ if(verbose){System.err.println("Generated "+code+" for sequence "+u.name()+"\t"+new String(r.bases, 0, Tools.min(40, r.length())));}
+
+ if(addToCodeMapT && inSet){
+ final Long codeL=code;
+ ArrayList<Unit> list=codeMapT.get(codeL);
+ if(list==null){
+ // First unit seen with this code in the thread-local map.
+ if(verbose){System.err.println("Unique.");}
+ list=new ArrayList<Unit>(1);
+ list.add(u);
+ basesStoredT+=r.length();
+ codeMapT.put(codeL, list);
+ }else{
+ // The code already exists: either a true match or a collision between different units.
+ if(verbose){System.err.println("Exists.");}
+ boolean match=false;
+ if(findMatchesT){
+ for(Unit u2 : list){
+ if(pairedEqualsRC(u, u2)){
+// if(u.r.mate!=null){
+// verbose=true;
+//
+// Unit um=(Unit)u.r.mate.obj;
+// Unit u2m=(Unit)u2.r.mate.obj;
+//
+// if(verbose){
+// System.err.println("********");
+// System.err.println(u.r.toFastq());
+// System.err.println(u.r.mate.toFastq());
+// System.err.println("********");
+// System.err.println(u2.r.toFastq());
+// System.err.println(u2.r.mate.toFastq());
+// System.err.println("********");
+// System.err.println(u);
+// System.err.println(u2);
+// System.err.println(um);
+// System.err.println(u2m);
+// System.err.println("********");
+// System.err.println(u.equals(u2));
+// System.err.println(u.compareTo(u2));
+// System.err.println("********");
+// System.err.println(um.equals(u2m));
+// System.err.println(um.compareTo(u2m));
+// System.err.println("********");
+// }
+//
+// verbose=false;
+// }
+ assert(u.r.mate==null || pairedEqualsRC((Unit)u.r.mate.obj, (Unit)u2.r.mate.obj)) :
+ u.r.toFastq()+"\n"+u2.r.toFastq()+"\n"+u.r.mate.toFastq()+"\n"+u2.r.mate.toFastq()+
+ "\n"+u+"\n"+u2+"\n"+u.r.mate.obj+"\n"+u2.r.mate.obj;
+// if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));}
+ match=true;
+ u2.absorbMatch(u);
+ // In UNIQUE_ONLY mode the stored unit is invalidated as well, at most once (re-checked under its lock).
+ if(UNIQUE_ONLY){
+ synchronized(u2){
+ if(u2.valid()){
+ matchesT++;
+ baseMatchesT+=u2.length();
+ u2.setValid(false);
+ addDupe(u2.r);
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+ if(match){
+ addDupe(r);
+ matchesT++;
+ baseMatchesT+=r.length();
+ // if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);}
+ }else{
+ collisionsT++;
+ if(verbose){System.err.println("False collision; count = "+collisionsT);}
+ list.add(u);
+ basesStoredT+=r.length();
+ }
+ }
+ }
+
+ // The return values of the two searches below are not used.
+ if(findContainmentsT){
+ int x=findContainments(u);
+ }
+
+ if(findOverlapsT){
+ int x=findOverlaps(u);
+ }
+
+ return inSet;
+ }
+
+ /**
+ * Scans every k-length window of u, looks the window's canonical key up in each affix map, and tests
+ * whether u contains any valid unit stored under that key. Contained units are invalidated and reported via addDupe;
+ * in UNIQUE_ONLY mode u itself is invalidated too.
+ * Updates containmentCollisionsT, containmentsT and baseContainmentsT.
+ * @param u Unit whose sequence is scanned.
+ * @return Number of candidate units examined (hits); -1 if u has no bases or fewer than k; 0 if the initial guard condition holds.
+ */
+ private int findContainments(final Unit u){
+ if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;}
+ final byte[] bases=u.bases();
+ final int minlen=k-1;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int hits=0;
+ int currentContainments=0;
+
+ if(bases==null || bases.length<k){return -1;}
+ // Reusable mutable key, to avoid allocating one per lookup.
+ final LongM key=new LongM();
+
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=baseToNumber[b];
+ long x2=baseToComplementNumber[b];
+ // Rolling update: kmer shifts the new base in at the low end; rkmer shifts its complement in at the high end.
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+// if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ // Only look up once a full window of k bases has been consumed.
+ if(i>=minlen){
+ key.set(Tools.max(kmer, rkmer)); //Canonical
+ for(int am=0; am<affixMaps.length; am++){
+ ArrayList<Unit> list=affixMaps[am].get(key);
+ if(list!=null){
+ for(Unit u2 : list){
+ if(u!=u2 && !u.equals(u2)){
+ if(u2.valid()){
+ hits++;
+ if(verbose){
+ System.err.println("\nFound potential containment at am="+am+", i="+i+", key="+key.value()+
+ ", pre1="+u2.prefix1+", pre2="+u2.prefix2+
+ ", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k)));
+ }
+ if(u.contains(u2, i, key, bandy, am)){
+ // Validity is re-checked under the lock so a unit is invalidated and counted only once.
+ synchronized(u2){
+ if(u2.valid()){
+ currentContainments++;
+ baseContainmentsT+=u2.length();
+ u2.setValid(false);
+ addDupe(u2.r);
+ }
+ }
+ if(UNIQUE_ONLY){
+ synchronized(u){
+ if(u.valid()){
+ currentContainments++;
+ baseContainmentsT+=u.length();
+ u.setValid(false);
+ addDupe(u.r);
+ }
+ }
+ }
+
+ if(verbose){System.err.println("Added containment "+u2);}
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+// assert(false) : hits+", "+currentContainments+", "+baseContainments+"\n"+containmentMapT+"\n";
+
+ // Hits that did not result in a counted containment are recorded as collisions.
+ containmentCollisionsT+=(hits-currentContainments);
+// outstream.println("hits="+hits+", currentContainments="+currentContainments);
+ containmentsT+=currentContainments;
+ return hits;
+ }
+
+ /**
+ * Scans every k-length window of u, looks the window's canonical key up in each affix map, and tries to build
+ * an Overlap between u and each valid candidate unit stored under that key. A new overlap is added to the
+ * overlap lists of both units; an overlap already present in the first-locked unit's list is skipped.
+ * Updates overlapCollisionsT, overlapsT and baseOverlapsT.
+ * @param u Unit whose sequence is scanned.
+ * @return Number of candidates examined, excluding those whose overlap already existed; -1 if u has no bases or fewer than k.
+ */
+ private int findOverlaps(final Unit u){
+// if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;}
+// if(u.overlapList!=null){u.overlapList.clear();}
+ final byte[] bases=u.bases();
+ final int minlen=k-1;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int hits=0;
+ int currentOverlaps=0;
+
+ if(bases==null || bases.length<k){return -1;}
+ // Reusable mutable key, to avoid allocating one per lookup.
+ final LongM key=new LongM();
+
+ // Set once u's overlap list has grown beyond maxEdges; stops the scan.
+ boolean quit=false;
+
+ for(int i=0; i<bases.length && !quit; i++){
+ byte b=bases[i];
+ long x=baseToNumber[b];
+ long x2=baseToComplementNumber[b];
+ // Rolling update: kmer shifts the new base in at the low end; rkmer shifts its complement in at the high end.
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+// if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(i>=minlen){//valid key
+ key.set(Tools.max(kmer, rkmer)); //Canonical key
+ for(int am=0; am<affixMaps.length; am++){
+ ArrayList<Unit> list=affixMaps[am].get(key);
+ if(list!=null){//found a key collision
+ for(Unit u2 : list){
+ if(quit){break;}//too many edges
+ // The defaults differ (-1 vs -2) so the cluster test below passes when preventTransitiveOverlaps is off.
+ int u1cluster=-1, u2cluster=-2;
+ if(preventTransitiveOverlaps && u!=u2){
+ u1cluster=u.determineCluster();
+ u2cluster=u2.determineCluster();
+ }
+ if(u1cluster!=u2cluster && u!=u2 && !u.equals(u2) && u2.r!=u.r.mate){//TODO: Not sure why identical things are banned... possibly relates to avoiding inter-pair edges?
+ if(u2.valid()){
+ hits++;
+
+// boolean flag=(u.code1==-3676200394282040623L && u2.code1==-7034423913727372751L) ||
+// (u2.code1==-3676200394282040623L && u.code1==-7034423913727372751L);
+ final boolean flag=false;
+ if(verbose || flag){
+ System.err.println("\nFound potential overlap at am="+am+", i="+i+", key="+key.value()+
+ ", pre1="+u2.prefix1+", pre2="+u2.prefix2+
+ ", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k)));
+ }
+
+ // Only attempt an overlap while both units are under their edge limits (or limits are effectively disabled / a list is still null).
+ final Overlap o;
+ if(maxEdges>1000000000 || u.overlapList==null || u2.overlapList==null ||
+ (u.overlapList.size()<maxEdges && u2.overlapList.size()<maxEdges2)){
+ o=u.makeOverlap(u2, i, key, bandy, am);
+
+ }else{
+ o=null;
+ if(u.overlapList.size()>maxEdges){quit=true;}
+ }
+ if(o!=null){
+
+ if(preventTransitiveOverlaps){
+ mergeClusterIds(u1cluster, u2cluster);
+ }
+
+ assert(o.test(bandy, o.edits+maxEdits)) : o;
+ if(verbose || flag){System.err.println("Created overlap "+o);}
+
+ // Pick a deterministic order (ua, ub) for the two units from a chain of tie-breakers;
+ // ua's list is the one checked for an existing copy of the overlap, and ua is locked first.
+ long comp=u.length()-u2.length();
+ if(comp==0){comp=u.code1-u2.code1;}
+ if(comp==0){comp=u.code2-u2.code2;}
+ if(comp==0){comp=u.prefix1-u2.prefix1;}
+ if(comp==0){comp=u.suffix1-u2.suffix1;}
+ if(comp==0){comp=(u.r.numericID-u2.r.numericID);}
+ assert(comp!=0) : u+", "+u2;
+ Unit ua=(comp<0 ? u : u2);
+ Unit ub=(comp<0 ? u2 : u);
+ assert(ua!=ub);
+ if(verbose || flag){
+ System.err.println("ua="+ua.code1);
+ System.err.println("ub="+ub.code1);
+ System.err.println("u ="+u.code1);
+ System.err.println("u2="+u2.code1);
+ System.err.println("u.r ="+u.r.numericID);
+ System.err.println("u2.r="+u2.r.numericID);
+ System.err.println("ua contains o? "+ua.alreadyHas(o));
+ System.err.println("ub contains o? "+ub.alreadyHas(o));
+ System.err.println("ua.list="+ua.overlapList);
+ System.err.println("ub.list="+ub.overlapList);
+ }
+
+// assert(ua.alreadyHas(o)==ub.alreadyHas(o));
+
+ final boolean uaContainedOverlap;
+
+ synchronized(ua){
+ if(ua.overlapList==null){ua.overlapList=new ArrayList<Overlap>(2);}
+ if(!ua.overlapList.contains(o)){
+ if(EA){
+ synchronized(ub){
+ assert(ub.overlapList==null || !ub.overlapList.contains(o)) :
+ ua.alreadyHas(o)+", "+ub.alreadyHas(o)+"\n"+o+"\n"+ub.overlapList.get(ub.overlapList.indexOf(o))+
+ "\nua.list="+ua.overlapList+"\nub.list="+ub.overlapList+"\nu.code1="+u.code1+"\nu2.code1="+u2.code1;
+ }
+ }
+ currentOverlaps++;
+ baseOverlapsT+=o.overlapLen;
+ ua.overlapList.add(o);
+ if(verbose || flag){System.err.println("Added overlap "+o);}
+ uaContainedOverlap=false;
+ }else{
+ if(verbose || flag){System.err.println("Already contained overlap "+o);}
+ // Undo the hit so an already-known overlap is not counted as a collision.
+ hits--;
+ uaContainedOverlap=true;
+ }
+ }
+
+ // ub is updated after ua's lock has been released.
+ if(!uaContainedOverlap){
+ synchronized(ub){
+ if(ub.overlapList==null){ub.overlapList=new ArrayList<Overlap>(2);}
+ assert(!ub.overlapList.contains(o));
+ ub.overlapList.add(o);
+ if(verbose || flag){System.err.println("Added overlap "+o);}
+ }
+ }else{
+ if(verbose || flag){System.err.println("Already contained overlap "+o);}
+ }
+
+
+// assert(ua.alreadyHas(o));
+// assert(ub.alreadyHas(o));
+// assert(ua.overlapList.contains(o));
+// assert(ub.overlapList.contains(o));
+ if(verbose || flag){
+ System.err.println("ua contains o? "+ua.alreadyHas(o));
+ System.err.println("ub contains o? "+ub.alreadyHas(o));
+ System.err.println("ua.list="+ua.overlapList);
+ System.err.println("ub.list="+ub.overlapList);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ // Under EA, an empty (but non-null) overlap list is treated as unexpected and replaced by null.
+ if(EA){
+ synchronized(u){
+ if(u.overlapList!=null && u.overlapList.isEmpty()){
+ assert(false) : "Why would this happen?";
+ u.overlapList=null;
+ }
+ }
+ }
+// assert(false) : hits+", "+currentOverlaps+", "+baseOverlaps+"\n"+overlapMapT+"\n";
+
+// assert(hits==currentOverlaps) : hits+", "+currentOverlaps;
+
+ // Hits that did not result in a newly added overlap are recorded as collisions.
+ overlapCollisionsT+=(hits-currentOverlaps);
+// outstream.println("hits="+hits+", currentOverlaps="+currentOverlaps);
+ overlapsT+=currentOverlaps;
+ return hits;
+ }
+
+ /**
+ * Inserts the units buffered by this thread (codeMapT) into the shared code map and,
+ * when addToAffixMapT is set, into the shared affix maps.
+ * Keys absent from codeMap are moved over with their whole list; keys already present are
+ * resolved unit by unit, absorbing exact matches (only when findMatchesT is set) and appending the rest.
+ * Before returning, codeMapT, both conflict lists and addedList are cleared and basesStoredT is reset to 0.
+ * NOTE(review): an earlier comment here mentioned subset mode; no subset filtering happens inside this method.
+ * @return Number of units newly stored in the shared code map (novel reads plus false collisions).
+ */
+ private long mergeMaps(){
+ if(verbose){System.err.println("Merging maps.");}
+ //novelKeys is tallied below but never read in this method.
+ long novelReads=0, novelKeys=0;
+ long collisionReads=0;
+ long mergedReads=0;
+
+ assert(localConflictList.isEmpty());
+ assert(sharedConflictList.isEmpty());
+
+ //Phase 1: while holding the codeMap lock, move novel keys straight into the shared map and
+ //merely record conflicting keys; those lists are reconciled below after the lock is released.
+ synchronized(codeMap){
+ for(Long key : codeMapT.keySet()){
+ if(codeMap.containsKey(key)){
+ localConflictList.add(codeMapT.get(key));
+ sharedConflictList.add(codeMap.get(key));
+ }else{
+ ArrayList<Unit> list=codeMapT.get(key);
+ codeMap.put(key, list);
+ addedList.addAll(list);
+ novelReads+=list.size();
+ novelKeys++;
+ }
+ }
+ }
+
+ if(verbose){System.err.println("Novel reads = "+novelReads+", conflicts = "+localConflictList.size());}
+
+ //Phase 2: reconcile each conflicting key, locking only the shared list for that key.
+ for(int i=0; i<localConflictList.size(); i++){
+ ArrayList<Unit> listT=localConflictList.get(i);
+ ArrayList<Unit> list=sharedConflictList.get(i);
+ synchronized(list){
+ for(Unit u : listT){
+ if(verbose){System.err.println("Processing novel unit "+u.name());}
+ boolean match=false;
+ if(findMatchesT){
+ for(Unit u2 : list){
+ if(pairedEqualsRC(u, u2)){
+ // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));}
+ u2.absorbMatch(u);
+ if(UNIQUE_ONLY){
+ //In UNIQUE_ONLY mode the already-stored unit is also invalidated and reported as a duplicate,
+ //but only the first time it is matched (guarded by valid()).
+ synchronized(u2){
+ if(u2.valid()){
+ mergedReads++;
+ baseMatchesT+=u2.length();
+ u2.setValid(false);
+ addDupe(u2.r);
+ }
+ }
+ }
+ match=true;
+ break;
+ }
+ }
+ }
+ if(match){
+ addDupe(u.r);
+ mergedReads++;
+ baseMatchesT+=u.length();
+ if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);}
+ }else{
+ //Same key but not an exact match: keep the unit alongside the existing ones.
+ collisionReads++;
+ if(verbose){System.err.println("False collision; count = "+collisionReads);}
+ list.add(u);
+ addedList.add(u);
+ }
+ }
+ }
+ }
+ matchesT+=mergedReads;
+ collisionsT+=collisionReads;
+ if(verbose){System.err.println("Done Merging.");}
+ if(verbose){System.err.println("mapT.size="+codeMapT.size()+", basesStoredT="+basesStoredT);}
+
+ codeMapT.clear();
+ localConflictList.clear();
+ sharedConflictList.clear();
+
+ //Phase 3: index every newly stored unit by its affix hashes.
+ if(!addedList.isEmpty()){
+ if(addToAffixMapT){
+ //p is a reusable lookup key; a copy made by iCopy() is what actually gets stored in the maps.
+ final LongM p=new LongM(-1, true);
+ assert(affixMap1!=null || affixMap2!=null);
+ if(affixMap1!=null && !ignoreAffix1){//Allows you to not use am1
+ synchronized(affixMap1){
+ for(Unit u : addedList){
+ if(verbose){System.err.println("Processing affixes for "+u.name());}
+ //NOTE(review): this condition is false only when prefix1 and suffix1 are both -1;
+ //'&&' may have been intended - confirm.
+ if(u.prefix1!=-1 || u.prefix1!=u.suffix1){
+ if(verbose){System.err.println("Using prefix "+u.prefix1);}
+ p.set(u.prefix1);
+ ArrayList<Unit> alu=affixMap1.get(p);
+ if(alu==null){
+ if(verbose){System.err.println("Made new alu for "+p);}
+ alu=new ArrayList<Unit>(2);
+ affixMap1.put(p.iCopy(), alu);
+ }
+ //Each affix list is capped at maxAffixCopies entries; further units are silently not indexed.
+ if(alu.size()<maxAffixCopies){
+ if(verbose){System.err.println("Added "+u.name());}
+ alu.add(u);
+ }
+ if(verbose){System.err.println(affixMap1.get(p));}
+ }
+ //The suffix is indexed only when it differs from the prefix, so a unit is not added twice under one key.
+ if(storeSuffix && u.prefix1!=u.suffix1){
+ if(verbose){System.err.println("Using suffix "+u.suffix1);}
+ p.set(u.suffix1);
+ ArrayList<Unit> alu=affixMap1.get(p);
+ if(alu==null){
+ if(verbose){System.err.println("Made new alu for "+p);}
+ alu=new ArrayList<Unit>(2);
+ affixMap1.put(p.iCopy(), alu);
+ }
+ if(alu.size()<maxAffixCopies){
+ if(verbose){System.err.println("Added "+u.name());}
+ alu.add(u);
+ }
+ if(verbose){System.err.println(affixMap1.get(p));}
+ }
+ }
+ }
+ }
+ //Same procedure for the second affix table, keyed by prefix2/suffix2, without the verbose tracing.
+ if(affixMap2!=null){
+ synchronized(affixMap2){
+ for(Unit u : addedList){
+ //NOTE(review): same '||' condition as for the first table - confirm intent.
+ if(u.prefix2!=-1 || u.prefix2!=u.suffix2){
+ p.set(u.prefix2);
+ ArrayList<Unit> alu=affixMap2.get(p);
+ if(alu==null){
+ alu=new ArrayList<Unit>(2);
+ affixMap2.put(p.iCopy(), alu);
+ }
+ if(alu.size()<maxAffixCopies){alu.add(u);}
+ }
+ if(storeSuffix && u.prefix2!=u.suffix2){
+ p.set(u.suffix2);
+ ArrayList<Unit> alu=affixMap2.get(p);
+ if(alu==null){
+ alu=new ArrayList<Unit>(2);
+ affixMap2.put(p.iCopy(), alu);
+ }
+ if(alu.size()<maxAffixCopies){alu.add(u);}
+ }
+ }
+ }
+ }
+ }
+ }
+
+ addedList.clear();
+ basesStoredT=0;
+ return collisionReads+novelReads;
+ }
+
+ /** Returns the current value of tcount and then increments it, serialized on the HashThread class lock. */
+ private int getTid(){
+ synchronized(HashThread.class){
+ final int assigned=tcount;
+ tcount=assigned+1;
+ return assigned;
+ }
+ }
+
+ //Per-thread buffer of units keyed by code; mergeMaps() drains it into the shared codeMap.
+ private LinkedHashMap<Long, ArrayList<Unit>> codeMapT=new LinkedHashMap<Long, ArrayList<Unit>>(threadMaxReadsToBuffer*8);
+ //Units that mergeMaps() stored in the shared map during the current merge; used to fill the affix maps, then cleared.
+ private ArrayList<Unit> addedList=new ArrayList<Unit>(threadMaxReadsToBuffer);
+ //Parallel lists filled by mergeMaps(): entry i of each holds the thread-local and the shared list for one conflicting key.
+ private ArrayList<ArrayList<Unit>> localConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
+ private ArrayList<ArrayList<Unit>> sharedConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
+
+ //Per-thread statistics (presumably summed by the owner after the thread finishes - confirm).
+ //mergeMaps() adds exact-match counts to matchesT/baseMatchesT and false collisions to collisionsT,
+ //and resets basesStoredT to 0.
+ long matchesT=0;
+ long baseMatchesT=0;
+ long baseContainmentsT=0;
+ long collisionsT=0;
+ long containmentsT=0;
+ long containmentCollisionsT=0;
+ long basesStoredT=0;
+ long addedToMainT=0;
+ long readsProcessedT=0;
+ long basesProcessedT=0;
+ long overlapsT=0;
+ long baseOverlapsT=0;
+ long overlapCollisionsT=0;
+
+ //Mode switches for this thread. mergeMaps() consults addToAffixMapT (whether to index affixes)
+ //and findMatchesT (whether to look for exact matches among conflicting units).
+ private final boolean addToCodeMapT;
+ private final boolean addToAffixMapT;
+ private final boolean findContainmentsT;
+ private final boolean findOverlapsT;
+ private final boolean findMatchesT;
+// private final boolean convertToUpperCaseT;
+ private final int tid;
+ private final ArrayDeque<ConcurrentReadInputStream> crisq;
+ private final BandedAligner bandy;
+ }
+
+ /**
+ * Tests whether two base arrays hold the same sequence, allowing for opposite orientation.
+ * When isCanonical() gives the same answer for both arrays they are compared element by element;
+ * otherwise a is compared against the reverse-complement of b.
+ * @param a First base array (may be null)
+ * @param b Second base array (may be null)
+ * @return true if a and b are the same reference, or have equal length and match as described above
+ */
+ public static boolean equalsRC(byte[] a, byte[] b){
+ if(a==b){return true;}
+ if(a==null || b==null || a.length!=b.length){return false;}
+
+ final int len=a.length;
+ if(isCanonical(a)!=isCanonical(b)){
+ //Opposite orientations: walk a forward and b backward, complementing b.
+ for(int i=0; i<len; i++){
+ if(a[i]!=baseToComplementExtended[b[len-1-i]]){return false;}
+ }
+ return true;
+ }
+
+ //Same orientation: plain positional comparison.
+ for(int i=0; i<len; i++){
+ if(a[i]!=b[i]){return false;}
+ }
+ return true;
+ }
+
+ /**
+ * Equality test for units that also takes mates into account.
+ * The units must first satisfy equalsRC(Unit, Unit). If both carry reads with mates, they must additionally
+ * agree in canonical(), in pairnum(), and have mates whose bases compare equal via Tools.compare.
+ * The short verbose tags ("ea".."eh") trace which stage was reached.
+ * @return true if the units are considered identical
+ */
+ public static boolean pairedEqualsRC(Unit ua, Unit ub){
+ if(verbose){System.err.println("pairedEqualsRC("+ua.name()+", "+ub.name()+")");}
+ if(verbose){System.err.println("ea");}
+ final boolean sameSequence=equalsRC(ua, ub);
+ if(verbose){System.err.println("eb");}
+ if(!sameSequence){return false;}
+ if(verbose){System.err.println("ec");}
+
+ final boolean bothHaveReads=(ua.r!=null && ub.r!=null);
+ if(bothHaveReads){
+ if(verbose){System.err.println("ed");}
+ assert((ua.r.mate==null)==(ub.r.mate==null));
+ if(verbose){System.err.println("ee");}
+ final boolean bothPaired=(ua.r.mate!=null && ub.r.mate!=null);
+ if(bothPaired){
+ if(verbose){System.err.println("ef");}
+ if(ua.canonical()!=ub.canonical()){return false;}
+ if(ua.r.pairnum()!=ub.r.pairnum()){return false;}
+ return Tools.compare(ua.r.mate.bases, ub.r.mate.bases)==0;
+ }
+ if(verbose){System.err.println("eg");}
+ }
+ if(verbose){System.err.println("eh");}
+ return true;
+ }
+
+ private static boolean equalsRC(Unit ua, Unit ub){
+ if(verbose){System.err.println("equalsRC("+ua.name()+", "+ub.name()+")");}
+ return ua.code1==ub.code1 && ua.code2==ub.code2 && (ua.canonical()==ub.canonical() ? (ua.prefix1==ub.prefix1 && ua.suffix1==ub.suffix1) :
+ (ua.prefix1==ub.suffix1 && ua.suffix1==ub.prefix1)) && compareRC(ua, ub)==0;
+ }
+
+ /**
+ * Comparator for units that breaks ties using mates.
+ * Orders by compareRC first; if that ties and both units have reads with mates,
+ * orders by pairnum() and then by compareRC of the mates' units.
+ * @return negative, zero or positive in the usual comparator sense
+ */
+ public static int comparePairedRC(Unit ua, Unit ub){
+ if(verbose){System.err.println("comparePairedRC("+ua.name()+", "+ub.name()+")");}
+ final int primary=compareRC(ua, ub);
+ if(primary!=0){return primary;}
+
+ final boolean bothPaired=(ua.r!=null && ub.r!=null && ua.r.mate!=null && ub.r.mate!=null);
+ if(!bothPaired){return 0;}
+
+ final int pairnumDif=ua.r.pairnum()-ub.r.pairnum();
+ if(pairnumDif!=0){return pairnumDif;}
+ return compareRC((Unit)ua.r.mate.obj, (Unit)ub.r.mate.obj);
+ }
+
+ //TODO
+ //This is really for sorting by length.
+ /**
+ * Orientation-aware comparator for two units.
+ * Order of criteria: length (longer units sort first), then names if REQUIRE_MATCHING_NAMES is set and both
+ * names are non-null, then sequence content, then pairnum().
+ * When either unit has no read attached, content is approximated by the affix hashes and codes;
+ * otherwise the bases are compared, with a non-canonical unit read as its reverse-complement.
+ * The single-letter verbose tags ("a".."j") trace which branch was taken.
+ * @return negative, zero or positive in the usual comparator sense
+ */
+ private static int compareRC(Unit ua, Unit ub){
+ if(verbose){System.err.println("compareRC("+ua.name()+", "+ub.name()+")");}
+ if(ua==ub){return 0;}
+ if(verbose){System.err.println("a");}
+ if(verbose){System.err.println("a1");}
+ //Descending by length: the longer unit compares as smaller.
+ if(ua.length()!=ub.length()){return ub.length()-ua.length();}
+ if(verbose){System.err.println("a2");}
+
+ if(REQUIRE_MATCHING_NAMES){
+ if(ua.name()!=null && ub.name()!=null){
+ int x=ua.name().compareTo(ub.name());
+ if(x!=0){return x;}
+ }
+ }
+ if(verbose){System.err.println("a3");}
+
+ //No bases available for at least one unit: fall back to hashes.
+ //A non-canonical unit has its prefix and suffix swapped before comparison.
+ if(ua.r==null || ub.r==null){
+ if(verbose){System.err.println("b");}
+ if(verbose){System.err.println("b1");}
+ if(ua.canonical()){
+ if(verbose){System.err.println("c");}
+ if(ub.canonical()){
+ if(ua.prefix1!=ub.prefix1){return ua.prefix1>ub.prefix1 ? 1 : -1;}
+ if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;}
+ }else{
+ if(ua.prefix1!=ub.suffix1){return ua.prefix1>ub.suffix1 ? 1 : -1;}
+ if(ua.suffix1!=ub.prefix1){return ua.suffix1>ub.prefix1 ? 1 : -1;}
+ }
+ }else{
+ if(verbose){System.err.println("d");}
+ if(ub.canonical()){
+ if(ua.suffix1!=ub.prefix1){return ua.suffix1>ub.prefix1 ? 1 : -1;}
+ if(ua.prefix1!=ub.suffix1){return ua.prefix1>ub.suffix1 ? 1 : -1;}
+ }else{
+ if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;}
+ if(ua.prefix1!=ub.prefix1){return ua.prefix1>ub.prefix1 ? 1 : -1;}
+ }
+ }
+ if(verbose){System.err.println("e");}
+ if(ua.code1!=ub.code1){return ua.code1>ub.code1 ? 1 : -1;}
+ if(ua.code2!=ub.code2){return ua.code2>ub.code2 ? 1 : -1;}
+
+ return ua.pairnum()-ub.pairnum();
+ }
+ if(verbose){System.err.println("f");}
+ final byte[] a=ua.r.bases, b=ub.r.bases;
+ if(a==b){return 0;}
+ if(a==null || b==null){return a==null ? -1 : 1;}
+ if(verbose){System.err.println("g");}
+
+ //Both reads present. The loops below index b by a's length; this relies on the unit lengths,
+ //already found equal above, matching the array lengths (assumed - confirm).
+ if(ua.canonical()==ub.canonical()){
+ if(verbose){System.err.println("h");}
+ if(ua.canonical() && ub.canonical()){
+ //Both canonical: compare as stored, left to right.
+ for(int i=0; i<a.length; i++){
+ final byte aa=a[i], bb=b[i];
+ if(aa!=bb){return aa-bb;}
+ }
+ }else{
+ //Both non-canonical: compare their reverse-complements (right to left, complemented).
+ for(int i=a.length-1; i>=0; i--){
+ final byte aa=baseToComplementExtended[a[i]], bb=baseToComplementExtended[b[i]];
+ if(aa!=bb){return aa-bb;}
+ }
+ }
+ }else{
+ if(verbose){System.err.println("i");}
+ if(ua.canonical()){
+ //a as stored versus the reverse-complement of b.
+ for(int i=0, j=b.length-1; i<a.length; i++, j--){
+ final byte aa=a[i], bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){return aa-bb;}
+ }
+ }else{
+ //Reverse-complement of a versus b as stored.
+ for(int i=a.length-1, j=0; i>=0; i--, j++){
+ final byte aa=baseToComplementExtended[a[i]], bb=b[j];
+ if(aa!=bb){return aa-bb;}
+ }
+ }
+ }
+
+ if(verbose){System.err.println("j");}
+ return ua.pairnum()-ub.pairnum();
+ }
+
+ /**
+ * Hashes the k-mer at one end of a sequence into an orientation-independent value.
+ * Each base contributes 2 bits; the forward k-mer and its reverse-complement are built in parallel
+ * and the larger of the two is returned.
+ * @param bases Sequence to hash (may be null)
+ * @param prefix true to take the window from the left end, false to take it from the right end
+ * @param k K-mer length
+ * @param skipInitialBases Number of bases to skip inward from the chosen end before the window starts
+ * @return Tools.max(kmer, rkmer), or -1 if bases is null or too short to hold the skipped bases plus the window
+ */
+ private static long hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases){
+ //The window needs k bases beyond the skipped ones. Checking bases.length<k alone would let
+ //start go negative (suffix) or stop run past the array (prefix) for short sequences.
+ if(bases==null || skipInitialBases<0 || bases.length<k+skipInitialBases){return -1;}
+
+ final int shift=2*k;
+ final int shift2=shift-2;
+ //Keeps the low 2*k bits. NOTE(review): Java masks long shift distances to 6 bits,
+ //so this assumes k<=31 - confirm against where k is validated.
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int len=0;
+
+ final int start=(prefix ? 0+skipInitialBases : bases.length-k-skipInitialBases);
+ final int stop=start+k;
+
+// if(verbose){
+// System.err.println("\n"+new String(bases));
+// System.err.println("prefix="+prefix+", start="+start+", stop="+stop);
+//// System.err.print(new String(bases));
+// }
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+// if(verbose){System.err.print((char)b);}
+ long x=baseToNumber[b];
+ long x2=baseToComplementNumber[b];
+ //Forward k-mer grows from the low end; the reverse-complement grows from the high end.
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ len++;
+ }
+ if(verbose){System.err.println(new String(bases, start, k)+" = "+Tools.max(kmer, rkmer));}
+ assert(len==k) : len+","+k;
+ return Tools.max(kmer, rkmer);
+ }
+
+ /**
+ * Computes the edit allowance for a sequence of the given length.
+ * @return maxEdits when minIdentityMult is 0; otherwise the larger of maxEdits and round(len*minIdentityMult)
+ */
+ private static final int calcMaxEdits(int maxEdits, float minIdentityMult, int len){
+ if(minIdentityMult==0){return maxEdits;}
+ final int scaled=(int)Math.round(len*minIdentityMult);
+ return Tools.max(maxEdits, scaled);
+ }
+
+
+ private class Overlap implements Comparable<Overlap>{
+
+ public Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy){
+ assert(u1_!=u2_);
+ if(verbose){System.err.println("\nCreating an overlap.");}
+ u1=u1_;
+ u2=u2_;
+ type=type_;
+ start1=start1_;
+ start2=start2_;
+ stop1=stop1_;
+ stop2=stop2_;
+ overlapLen=len_;
+ mismatches=mismatches_;
+ edits=edits_;
+
+ assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+
+ assert(start1>=0 && start1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+ assert(stop1>=0 && stop1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+ assert(start2>=0 && start2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+ assert(stop2>=0 && stop2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+
+ assert(type==FORWARD || type==FORWARDRC || type==REVERSE || type==REVERSERC);
+
+ if(verbose){System.err.println(this);}
+
+ assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start1, stop1))<=maxEdits);
+
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ if(verbose){System.err.println("Passed test 1.");}
+
+// bandy.verbose=true;
+// test(bandy);
+// assert(false);
+
+ assert(u1!=u2);
+ u1.firstInOverlap(u2);
+ u2.firstInOverlap(u1);
+ assert(u1.length()!=u2.length() || u1.code1!=u2.code1 || u1.code2!=u2.code2 || (u1.r!=null && u1.r.mate!=null)) : "Collision? \n"+this+"\n"+u1+"\n"+u2;
+ assert(u1.firstInOverlap(u2)!=u2.firstInOverlap(u1)) :
+ "\nu1.firstInOverlap(u2)="+u1.firstInOverlap(u2)+"\nu2.firstInOverlap(u1)="+u2.firstInOverlap(u1)+"\nu1="+u1+"\nu2="+u2;
+
+ if(!u1.firstInOverlap(u2)){
+ if(verbose){System.err.println("\nSwapping.");}
+ swap();
+ if(verbose){System.err.println(this);}
+
+ if(EA && !customBandwidth && !test(bandy, edits+maxEdits)){
+ System.err.println("\n"+this);
+ swap();
+ System.err.println("\n"+this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
+ System.err.println("Passed test 2a, "+bandy.lastEdits+" edits.\n");
+ swap();
+ System.err.println("\n"+this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ System.err.println("Passed test 2b, "+bandy.lastEdits+" edits.\n");
+ }
+
+ assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ if(verbose){System.err.println("Passed test 2.");}
+ }
+
+ if(type==REVERSE || type==REVERSERC){
+ if(verbose){System.err.println("\nReversing.");}
+ reverseDirection();
+ if(verbose){System.err.println(this);}
+
+ if(EA && !Shared.anomaly && !customBandwidth && bandy!=null && !test(bandy, edits+maxEdits)){
+ Shared.anomaly=true;
+ BandedAligner.verbose=true;
+ System.err.println("\n********** Failed test 3, "+bandy.lastEdits+" edits. **********\n");
+ reverseDirection();
+ System.err.println(this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
+ System.err.println("Passed test 3a, "+bandy.lastEdits+" edits.\n");
+ reverseDirection();
+ System.err.println(this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ System.err.println("Passed test 3b, "+bandy.lastEdits+" edits.\n");
+ BandedAligner.verbose=false;
+ assert(false);
+ }
+
+ assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
+ if(verbose){System.err.println("Passed test 3.");}
+ }
+ //Now all overlaps should be FORWARD or FORWARDRC and u1 should be at least as big as u2
+ assert(type==FORWARD || type==FORWARDRC);
+ assert(u1.length()>=u2.length());
+ assert(u1.firstInOverlap(u2));
+ assert(!u2.firstInOverlap(u1));
+ if(verbose){System.err.println("Finished overlap initialization.");}
+ }
+
+ public boolean test(BandedAligner bandy, int editLimit){
+ final int last1=u1.length()-1, last2=u2.length()-1;
+ if(verbose){System.err.println("Testing "+OVERLAP_TYPE_NAMES[type]+", "+start1+", "+start2);}
+ if(type==FORWARD){
+ assert(start1==0 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==0){
+ if(verbose){System.err.println("A");}
+ return u1.overlapsForward(u2, start1, start2, bandy, false, editLimit);}
+ else{
+ if(verbose){System.err.println("B");}
+ return u2.overlapsForward(u1, start2, start1, bandy, false, editLimit);}
+ }
+ if(type==FORWARDRC){
+ assert(start1==0 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==last2){return u1.overlapsForwardRC(u2, start1, start2, bandy, false, editLimit);}
+ else{return u2.overlapsReverseRC(u1, start2, start1, bandy, false, editLimit);}
+ }
+ if(type==REVERSE){
+ assert(start1==last1 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==last2){return u1.overlapsReverse(u2, start1, start2, bandy, false, editLimit);}
+ else{return u2.overlapsReverse(u1, start2, start1, bandy, false, editLimit);}
+ }
+ if(type==REVERSERC){
+ assert(start1==last1 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==0){return u1.overlapsReverseRC(u2, start1, start2, bandy, false, editLimit);}
+ else{return u2.overlapsForwardRC(u1, start2, start1, bandy, false, editLimit);}
+ }
+ throw new RuntimeException();
+ }
+
+ public boolean equals(Object o){
+ return equals((Overlap)o);
+ }
+
+ public boolean equals(Overlap o){
+ if(this==o){return true;}
+ assert(o!=null) : "*A*\n"+this+"\n"+o+"\n"+u1+"\n"+u2;
+ assert(u1!=null && u2!=null) : "*B*\n"+this+"\n"+o+"\n"+u1+"\n"+u2;
+ assert(u1!=o.u2 || u2!=o.u1) : "*C*\n"+this+"\n"+o+"\n"+u1.firstInOverlap(u2)+"\n"+o.u1.firstInOverlap(o.u2)+"\n"+u1+"\n"+u2;
+ return (u1==o.u1 && u2==o.u2 && type==o.type && start1==o.start1 && start2==o.start2 && stop1==o.stop1 && stop2==o.stop2)
+ ;//|| (u1==o.u2 && u2==o.u1 && type==reverseType(o.type) && start1==o.start2 && start2==o.start1);
+ }
+
+// public int compareTo(Overlap o){
+// int a=compareTo2(o);
+// int b=o.compareTo2(this);
+// assert(a==-b) : "\n"+this+"\n"+o+"\na="+a+", b="+b+", equals="+this.equals(o)
+// +"\nu1.compareTo(o.u1)="+u1.compareTo(o.u1)+"\no.u1.compareTo(u1)="+o.u1.compareTo(u1)
+// +"\nu2.compareTo(o.u2)="+u2.compareTo(o.u2)+"\no.u2.compareTo(u2)="+o.u2.compareTo(u2);
+// return a;
+// }
+
+ public int compareTo(Overlap o){
+ int score1=overlapLen-50*(mismatches+edits);
+ int score2=o.overlapLen-50*(o.mismatches+o.edits);
+ if(score1!=score2){return score2-score1;}
+ if(overlapLen!=o.overlapLen){return o.overlapLen-overlapLen;}
+ int x=u1.compareTo(o.u1);
+ if(x!=0){return -x;}
+ x=u2.compareTo(o.u2);
+ if(x!=0){return -x;}
+ if(type!=o.type){return type-o.type;}
+ if((u1!=o.u1 || u2!=o.u2) && absorbMatch && !subsetMode){
+ boolean oldv=verbose;
+ verbose=true;
+ System.err.println(this);
+ System.err.println(o);
+ System.err.println("********");
+ System.err.println(u1);
+ System.err.println(u2);
+ System.err.println(o.u1);
+ System.err.println(o.u2);
+ System.err.println("********");
+ System.err.println(u1.equals(o.u1));
+ System.err.println("********");
+ System.err.println(u2.equals(o.u2));
+ System.err.println("********");
+ System.err.println(u1.compareTo(o.u1));
+ System.err.println("********");
+ System.err.println(u2.compareTo(o.u2));
+ System.err.println("********");
+ verbose=oldv;
+ }
+ assert(!absorbMatch || (u1==o.u1 && u2==o.u2) || subsetMode) : "\n"+u1.r+"\n"+u2.r+"\n"+o.u1.r+"\n"+o.u2.r
+ +"\n\n"+u1.r.mate+"\n"+u2.r.mate+"\n"+o.u1.r.mate+"\n"+o.u2.r.mate;
+// assert(false) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n";
+ if(start1!=o.start1){return start1-o.start1;}
+ if(stop1!=o.stop1){return stop1-o.stop1;}
+ if(start2!=o.start2){return start2-o.start2;}
+ if(stop2!=o.stop2){return stop2-o.stop2;}
+ if(this.equals(o)){
+ return 0;
+ }else{
+ //TODO: ensure this assumption is valid.
+ assert(!absorbContainment || !absorbMatch || subsetMode) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n";
+
+ if(u1.unitID!=o.u1.unitID){return u1.unitID-o.u1.unitID;}
+ if(u2.unitID!=o.u2.unitID){return u2.unitID-o.u2.unitID;}
+ }
+ return 0;
+ }
+
+ public int hashCode(){
+ return u1.hashCode()^u2.hashCode()^overlapLen;
+ }
+
+ public void flip(Unit changed, BandedAligner bandy){
+
+ if(changed==u2){
+ if(type==FORWARD){type=FORWARDRC;}
+ else if(type==FORWARDRC){type=FORWARD;}
+ else if(type==REVERSE){type=REVERSERC;}
+ else if(type==REVERSERC){type=REVERSE;}
+ else{throw new RuntimeException("Unknown overlap type "+type);}
+ start2=u2.length()-start2-1;
+ stop2=u2.length()-stop2-1;
+ }else if(changed==u1){
+ if(type==FORWARD){type=REVERSERC;}
+ else if(type==FORWARDRC){type=REVERSE;}
+ else if(type==REVERSE){type=FORWARDRC;}
+ else if(type==REVERSERC){type=FORWARD;}
+ else{throw new RuntimeException("Unknown overlap type "+type);}
+ start1=u1.length()-start1-1;
+ stop1=u1.length()-stop1-1;
+ }else{throw new RuntimeException("'changed' was not in the Overlap.");}
+
+ assert(test(bandy, edits+maxEdits));
+ }
+
+ public void swap(){
+ Unit tempu=u1;
+ u1=u2;
+ u2=tempu;
+ int temp=start1;
+ start1=start2;
+ start2=temp;
+ temp=stop1;
+ stop1=stop2;
+ stop2=temp;
+ if(type==FORWARDRC){type=REVERSERC;}
+ else if(type==REVERSERC){type=FORWARDRC;}
+ }
+
+ public void reverseDirection(){
+ type=reverseType(type);
+ int temp=start1;
+ start1=stop1;
+ stop1=temp;
+ temp=start2;
+ start2=stop2;
+ stop2=temp;
+ }
+
+ public String toString(){
+ StringBuilder sb=new StringBuilder(80);
+ sb.append("type=");
+ sb.append(OVERLAP_TYPE_NAMES[type]);
+ sb.append(", len=");
+ sb.append(overlapLen);
+ sb.append(", subs=");
+ sb.append(mismatches);
+ sb.append(", edits=");
+ sb.append(edits);
+
+ sb.append(" (");
+ sb.append(u1.name()==null ? u1.r.numericID+"" : u1.name());
+ if(printLengthInEdges){
+ sb.append(", length=");
+ sb.append(u1.length());
+ }
+ sb.append(", start1=");
+ sb.append(start1);
+ sb.append(", stop1=");
+ sb.append(stop1);
+
+ sb.append(") (");
+ sb.append(u2.name()==null ? u2.r.numericID+"" : u2.name());
+ if(printLengthInEdges){
+ sb.append(", length=");
+ sb.append(u2.length());
+ }
+ sb.append(", start2=");
+ sb.append(start2);
+ sb.append(", stop2=");
+ sb.append(stop2);
+ sb.append(")");
+ return sb.toString();
+ }
+
+ public String toLabel(){
+ StringBuilder sb=new StringBuilder(80);
+ sb.append(OVERLAP_TYPE_ABBREVIATIONS[type]);
+ sb.append(',');
+ sb.append(overlapLen);
+ sb.append(',');
+ sb.append(mismatches);
+ sb.append(',');
+ sb.append(edits);
+
+ if(printLengthInEdges){
+ sb.append(',');
+ sb.append(u1.length());
+ }
+ sb.append(',');
+ sb.append(start1);
+ sb.append(',');
+ sb.append(stop1);
+
+ if(printLengthInEdges){
+ sb.append(',');
+ sb.append(u2.length());
+ }
+ sb.append(',');
+ sb.append(start2);
+ sb.append(',');
+ sb.append(stop2);
+
+ return sb.toString();
+ }
+
+
+ private void setCanonContradiction(boolean b){
+ assert(b!=canonContradiction()) : b+", "+canonContradiction();
+ if(b){flags|=CANON_CONTRADICTION_MASK;}
+ else{flags&=~CANON_CONTRADICTION_MASK;}
+ assert(b==canonContradiction()) : b+", "+canonContradiction();
+ }
+
+ private void setOffsetContradiction(boolean b){
+ assert(b!=offsetContradiction()) : b+", "+offsetContradiction();
+ if(b){flags|=OFFSET_CONTRADICTION_MASK;}
+ else{flags&=~OFFSET_CONTRADICTION_MASK;}
+ assert(b==offsetContradiction()) : b+", "+offsetContradiction();
+ }
+
+ private void setMultiJoin(boolean b){
+ assert(b!=multiJoin()) : b+", "+multiJoin();
+ if(b){flags|=MULTIJOIN_MASK;}
+ else{flags&=~MULTIJOIN_MASK;}
+ assert(b==multiJoin()) : b+", "+multiJoin();
+ }
+
+ private void setVisited(boolean b){
+ assert(b!=visited()) : b+", "+visited();
+ if(b){flags|=VISITED_MASK;}
+ else{flags&=~VISITED_MASK;}
+ assert(b==visited()) : b+", "+visited();
+ }
+
+ private void setCyclic(boolean b){
+ assert(b!=cyclic()) : b+", "+cyclic();
+ if(b){flags|=CYCLIC_MASK;}
+ else{flags&=~CYCLIC_MASK;}
+ assert(b==cyclic()) : b+", "+cyclic();
+ }
+
+ private void setInvalid(boolean b){
+ assert(b!=invalid()) : b+", "+invalid();
+ assert(b!=mst()) : b+", "+mst()+", "+invalid();
+ if(b){flags|=INVALID_MASK;}
+ else{flags&=~INVALID_MASK;}
+ assert(b==invalid()) : b+", "+invalid();
+ }
+
+ private void setMst(boolean b){
+ assert(b!=mst()) : b+", "+mst();
+ assert(b!=invalid()) : b+", "+mst()+", "+invalid();
+ if(b){flags|=MST_MASK;}
+ else{flags&=~MST_MASK;}
+ assert(b==mst()) : b+", "+mst();
+ }
+
+ public void clearVolatileFlags(){
+ flags=0;
+// flags=flags&~(MULTIJOIN_MASK|VISITED_MASK|CANON_CONTRADICTION_MASK|CYCLIC_MASK|OFFSET_CONTRADICTION_MASK|INVALID_MASK);
+// assert(!canonContradiction());
+// assert(!offsetContradiction());
+// assert(!multiJoin());
+// assert(!visited());
+// assert(!cyclic());
+// assert(!invalid());
+ }
+
+ public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;}
+ public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;}
+ public boolean multiJoin(){return (MULTIJOIN_MASK&flags)==MULTIJOIN_MASK;}
+ public boolean visited(){return (VISITED_MASK&flags)==VISITED_MASK;}
+ public boolean cyclic(){return (CYCLIC_MASK&flags)==CYCLIC_MASK;}
+ public boolean invalid(){return (INVALID_MASK&flags)==INVALID_MASK;}
+ public boolean mst(){return (MST_MASK&flags)==MST_MASK;}
+ public boolean contradiction(){return canonContradiction() || offsetContradiction();}
+
+ private static final long VISITED_MASK=(1L<<0);
+ private static final long MULTIJOIN_MASK=(1L<<1);
+ private static final long CYCLIC_MASK=(1L<<2);
+ private static final long CANON_CONTRADICTION_MASK=(1L<<3);
+ private static final long OFFSET_CONTRADICTION_MASK=(1L<<4);
+ private static final long INVALID_MASK=(1L<<5);
+ private static final long MST_MASK=(1L<<6);
+
+ Unit u1;
+ Unit u2;
+ int type;
+ int start1;
+ int start2;
+ int stop1;
+ int stop2;
+
+ long flags=0;
+
+ final int overlapLen;
+ final int mismatches;
+ final int edits;
+ }
+
+ /**
+ * Looks up the cluster number recorded for a unit id in clusterNumbers and, when that entry itself
+ * points to a lower number, tries to shortcut uid's entry to it with a compare-and-set
+ * (retrying if another thread changed the entry in the meantime).
+ * NOTE(review): after a successful compareAndSet the method returns 'cluster' (the value read
+ * before the update), not 'next' - confirm this is intended.
+ * @param uid Index into clusterNumbers
+ * @return The value read for uid: 0, uid itself, or the entry found as described above
+ */
+ private int determineCluster2(final int uid) {
+ assert(clusterNumbers!=null);
+ boolean stable=false;
+ int cluster=uid;
+ while(!stable){
+ cluster=clusterNumbers.get(uid);
+ //An entry of 0 or one pointing at itself ends the chain.
+ if(cluster==0 || cluster==uid){return cluster;}
+ assert(cluster<=uid);
+ final int next=determineCluster2(cluster);
+ //Nothing lower was found further along the chain, so there is nothing to update.
+ if(next>=cluster){return cluster;}
+ //Only update if the entry still holds the value read above; otherwise loop and re-read.
+ stable=clusterNumbers.compareAndSet(uid, cluster, next);
+ }
+ return cluster;
+ }
+
+
+ /**
+ * Merges two clusters by repeatedly redirecting the larger id's entry in clusterNumbers to the smaller id,
+ * using compare-and-set. A redirect succeeds only while the entry at index X still holds X itself;
+ * on failure the id is re-resolved with determineCluster2 and the loop retries.
+ * Loops until both ids are equal.
+ * @return The common cluster id both inputs ended up with
+ */
+ private int mergeClusterIds(int cluster1, int cluster2) {
+ assert(clusterNumbers!=null);
+
+// System.err.println("Merging clusters "+cluster1+" and "+cluster2);
+
+ while(cluster1!=cluster2){
+ int min=Tools.min(cluster1, cluster2);
+ if(cluster1!=min){
+ assert(cluster1>min);
+ //Expected value is cluster1 itself: the redirect applies only if the entry has not been repointed.
+ boolean b=clusterNumbers.compareAndSet(cluster1, cluster1, min);
+ if(!b){
+ cluster1=determineCluster2(cluster1);
+ min=Tools.min(cluster1, cluster2);
+ }
+ }
+ if(cluster2!=min){
+ assert(cluster2>min);
+ boolean b=clusterNumbers.compareAndSet(cluster2, cluster2, min);
+ if(!b){
+ cluster2=determineCluster2(cluster2);
+ //This recomputed min is not read again before the next loop iteration recomputes it.
+ min=Tools.min(cluster1, cluster2);
+ }
+ }
+ }
+// System.err.println("Returning "+cluster1);
+ return cluster1;
+ }
+
+ private class Unit implements Comparable<Unit>, Serializable {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 5232407003873807738L;
+
+ /** Creates a unit for a read, determining its orientation with isCanonical(r_.bases). */
+ public Unit(Read r_){
+ this(r_, isCanonical(r_.bases));
+ }
+
+ /** Creates a unit for a read with a known orientation; both hash(r_.bases) and hashReversed(r_.bases) are computed and passed on. */
+ public Unit(Read r_, boolean canonical_){
+// this(r_, canonical_, canonical_ ? hash(r_.bases) : hashReversed(r_.bases));
+ this(r_, canonical_, hash(r_.bases), hashReversed(r_.bases));
+ }
+
+ public Unit(Read r_, boolean canonical_, long codeF_, long codeR_){
+ r=r_;
+ code1=Tools.min(codeF_, codeR_);
+ code2=Tools.max(codeF_, codeR_);
+ long f=r.length();
+ prefix1=hashTip(r.bases, true, k, 0);
+ suffix1=hashTip(r.bases, false, k, 0);
+ if(r.length()>2*k){
+ prefix2=hashTip(r.bases, true, k, k);
+ suffix2=hashTip(r.bases, false, k, k);
+ }
+ if(canonical_){f|=CANON_MASK;}
+ if(r.pairnum()==1){f|=PAIRNUM_MASK;}
+ flags=f;
+ assert(canonical()==canonical_);
+ assert(length()==r.length());
+ assert(pairnum()==r.pairnum());
+ if(parseDepth){
+ int[] quad=KmerNormalize.parseDepth(r.id, null);
+ if(quad!=null){depth=quad[r.pairnum()];}
+ }
+ }
+
+ int determineCluster() {
+ return determineCluster2(unitID);
+ }
+
+ /**
+ * Folds an identical unit into this one. The other unit's read is marked discarded, the mates (if any and
+ * not yet discarded) are absorbed the same way, and then - only when both reads carry quality arrays -
+ * each position keeps the higher quality and an undefined base is replaced by a defined one from the other read.
+ * When the orientations differ, the other read is traversed as its reverse-complement.
+ * @param u The unit being absorbed; must have the same codes and length as this one
+ */
+ public void absorbMatch(Unit u){
+
+ assert(code1==u.code1 && code2==u.code2 && length()==u.length());
+ if(r==null || u.r==null){return;}
+ u.r.setDiscarded(true);
+ final byte[] keptBases=r.bases, goneBases=u.r.bases;
+ final byte[] keptQuals=r.quality, goneQuals=u.r.quality;
+
+ assert((r.mate==null) == (u.r.mate==null));
+
+ //u.r is already marked discarded, so the mates' own recursive call back to this pair is skipped.
+ if(r.mate!=null && !u.r.mate.discarded()){
+ ((Unit)r.mate.obj).absorbMatch((Unit)u.r.mate.obj);
+ }
+ if(keptQuals==null || goneQuals==null){return;}
+
+ final boolean sameOrientation=(canonical()==u.canonical());
+ final int lastIndex=goneBases.length-1;
+ for(int i=0; i<keptBases.length; i++){
+ //Position in the absorbed read that corresponds to position i of this read.
+ final int j=(sameOrientation ? i : lastIndex-i);
+ final byte b1=keptBases[i];
+ final byte b2=(sameOrientation ? goneBases[j] : baseToComplementExtended[goneBases[j]]);
+ if(!AminoAcid.isFullyDefined(b1) && AminoAcid.isFullyDefined(b2)){keptBases[i]=b2;}
+ else{assert(b1==b2);}
+ keptQuals[i]=Tools.max(keptQuals[i], goneQuals[j]);
+ }
+ }
+
+ /**
+ * Reports whether an equal overlap is already stored in this unit's overlap list.
+ * @return false if the list is null or holds no overlap equal to o
+ */
+ public boolean alreadyHas(Overlap o){
+ if(overlapList==null){return false;}
+ for(Overlap existing : overlapList){
+ if(!o.equals(existing)){continue;}
+ //Sanity checks: equality must be symmetric and consistent with List.contains.
+ assert(overlapList.contains(o));
+ assert(existing.equals(o));
+ return true;
+ }
+ assert(!overlapList.contains(o));
+ return false;
+ }
+
+ /**
+ * Collects the cluster this unit belongs to, starting from this unit.
+ * The list doubles as a work queue: every member is visited in turn, and each visit appends
+ * that member's not-yet-clustered mate and overlap partners to the end of the list.
+ * This unit must be valid, unvisited and unclustered.
+ * @return All units reachable from this one, with this unit first
+ */
+ public ArrayList<Unit> makeCluster() {
+ assert(!visited());
+ assert(!clustered());
+ assert(valid());
+ final int initialCapacity=(overlapList==null ? 1 : overlapList.size()+1);
+ final ArrayList<Unit> members=new ArrayList<Unit>(initialCapacity);
+ members.add(this);
+ setClustered(true);
+
+ int total=1;
+ int cursor=0;
+ //members grows while it is being walked, so the size is re-read on every pass.
+ while(cursor<members.size()){
+ total+=members.get(cursor).visit(members);
+ cursor++;
+ }
+
+ assert(total==members.size());
+ return members;
+ }
+
+ /**
+ * Marks this unit visited and appends its direct neighbors - its mate, if any, and the unit at the
+ * other end of each overlap - to the cluster, skipping those already flagged as clustered.
+ * This unit must be valid, clustered and not yet visited.
+ * @param cluster List being built; newly found neighbors are appended to it
+ * @return Number of units appended by this call
+ */
+ public int visit(ArrayList<Unit> cluster) {
+ assert(!visited());
+ assert(clustered());
+ assert(valid());
+ setVisited(true);
+ int appended=0;
+
+ if(r!=null && r.mate!=null){
+ final Unit mateUnit=(Unit)r.mate.obj;
+ assert(mateUnit!=this);
+ assert(mateUnit.valid());
+ if(!mateUnit.clustered()){
+ mateUnit.setClustered(true);
+ cluster.add(mateUnit);
+ appended++;
+ }
+ }
+
+ if(overlapList==null){return appended;}
+ for(Overlap o : overlapList){
+ assert(o.u1==this || o.u2==this);
+ final Unit neighbor=(o.u1==this ? o.u2 : o.u1);
+ assert(neighbor!=this);
+ assert(neighbor.valid());
+ if(neighbor.clustered()){continue;}
+ neighbor.setClustered(true);
+ cluster.add(neighbor);
+ appended++;
+ }
+ return appended;
+ }
+
+ /**
+ * Checks that every overlap of this unit is mirrored on the other side: each partner unit must have,
+ * in its own overlap list, at least one overlap that involves this unit.
+ * @return true if the overlap list is null or empty, or all partners link back
+ */
+ public boolean isTransitive(){
+ assert(valid());
+ if(overlapList==null || overlapList.size()==0){return true;}
+ for(Overlap o : overlapList){
+ assert(o.u1==this || o.u2==this);
+ final Unit partner=(o.u1==this ? o.u2 : o.u1);
+ assert(partner!=this);
+ if(partner.overlapList==null){return false;}
+
+ boolean linksBack=false;
+ for(Overlap back : partner.overlapList){
+ if(back.u1==this || back.u2==this){
+ linksBack=true;
+ break;
+ }
+ }
+ if(!linksBack){return false;}
+ }
+ return true;
+ }
+
+ /**
+ * Stricter variant of isTransitive: each partner unit's overlap list must contain
+ * the very same Overlap object (identity, not equals) that this unit holds.
+ * @return true if the overlap list is null or empty, or every overlap instance is shared with its partner
+ */
+ public boolean isPerfectlyTransitive(){
+ assert(valid());
+ if(overlapList==null || overlapList.size()==0){return true;}
+ for(Overlap o : overlapList){
+ assert(o.u1==this || o.u2==this);
+ final Unit partner=(o.u1==this ? o.u2 : o.u1);
+ assert(partner!=this);
+ if(partner.overlapList==null){return false;}
+
+ boolean shared=false;
+ for(Overlap candidate : partner.overlapList){
+ if(candidate==o){
+ shared=true;
+ break;
+ }
+ }
+ if(!shared){return false;}
+ }
+ return true;
+ }
+
+ /**
+ * Checks that the overlap list holds no duplicates: an overlap must equal the entry at its own
+ * index and no entry at any other index.
+ * @return true if the overlap list is null or empty, or free of equal pairs
+ */
+ public boolean isNonRedundant(){
+ assert(valid());
+ if(overlapList==null || overlapList.size()==0){return true;}
+ for(int i=0; i<overlapList.size(); i++){
+ final Overlap a=overlapList.get(i);
+ for(int j=0; j<overlapList.size(); j++){
+ final Overlap b=overlapList.get(j);
+ final boolean sameSlot=(i==j);
+ final boolean sameOverlap=a.equals(b);
+ if(sameSlot!=sameOverlap){return false;}
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether this unit contains u2, given that an affix k-mer of u2 was found in this unit.
+ * The key is matched against u2's prefix and suffix hash for the given table, and for each hit the
+ * corresponding same-strand and reverse-complement placements are tried.
+ * For the second table (tableNum!=0) the offsets are shifted by k.
+ * @param u2 Candidate contained unit
+ * @param loc Position in this unit associated with the key (presumably where the matching k-mer ends - confirm)
+ * @param key Affix hash that was looked up
+ * @param bandy Aligner passed through to the placement checks
+ * @param tableNum 0 for the first affix table, anything else for the second
+ * @return true if any placement check succeeds
+ */
+ public boolean contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) {
+ if(verbose){System.err.println("contains: Considering key "+key+", unit "+u2);}
+ if(minLengthPercent>0 && (u2.length()*100f/length())<minLengthPercent){return false;}
+ assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || //REQUIRE_MATCHING_NAMES ||
+ (canonical()==u2.canonical() ? (u2.prefix1!=prefix1 && u2.suffix1!=suffix1) : (u2.prefix1!=suffix1 && u2.suffix1!=prefix1))) :
+ "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
+
+ if(tableNum==0){
+ //First table: the placement checks run with earlyExit enabled.
+ if(key.value()==u2.prefix1){
+ if(verbose){System.err.println("Containment A1");}
+ if(containsForward(u2, loc-k2, bandy, true)){return true;}
+ if(containsReverseRC(u2, loc, bandy, true)){return true;}
+ }
+ if(key.value()==u2.suffix1){
+ if(verbose){System.err.println("Containment B1");}
+ if(containsReverse(u2, loc, bandy, true)){return true;}
+ if(containsForwardRC(u2, loc-k2, bandy, true)){return true;}
+ }
+ return false;
+ }
+
+ //Second table: same checks, offsets moved by k, earlyExit disabled.
+ if(key.value()==u2.prefix2){
+ if(verbose){System.err.println("Containment A2");}
+ if(containsForward(u2, loc-k2-k, bandy, false)){return true;}
+ if(containsReverseRC(u2, loc+k, bandy, false)){return true;}
+ }
+ if(key.value()==u2.suffix2){
+ if(verbose){System.err.println("Containment B2");}
+ if(containsReverse(u2, loc+k, bandy, false)){return true;}
+ if(containsForwardRC(u2, loc-k2-k, bandy, false)){return true;}
+ }
+ return false;
+ }
+
+ /**
+ * Tests whether u2, read left to right, matches this unit's bases beginning at offset start.
+ * A difference counts only when 'exact' is set or both bases are fully defined. Once the count
+ * exceeds the limit from calcMaxEdits, a banded alignment decides the result (if available).
+ * @param u2 Candidate contained unit
+ * @param start Offset in this unit where u2's first base is placed
+ * @param bandy Fallback aligner; null means exceeding the mismatch limit fails
+ * @param earlyExit If true, a counted mismatch within the first k2 bases of u2 fails immediately
+ * @return true if u2 fits at that placement within the mismatch or edit limits
+ */
+ private boolean containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ 	if(start+u2.length()>length()){return false;}
+ 	if(start<0 || start>=length()){return false;}
+ 	if(u2.r==null){
+ 		assert(false) : "TODO: Verify by hashing and checking both tips";
+ 		return false;
+ 	}
+ 	final byte[] host=bases(), guest=u2.bases();
+ 	final int limit=calcMaxEdits(maxSubs, minIdentityMult, guest.length);
+ 	int diffs=0;
+
+ 	for(int hostPos=start, guestPos=0; guestPos<guest.length; hostPos++, guestPos++){
+ 		final byte x=host[hostPos];
+ 		final byte y=guest[guestPos];
+ 		if(x==y){continue;}
+ 		if(!exact && !(AminoAcid.isFullyDefined(x) && AminoAcid.isFullyDefined(y))){continue;}
+ 		if(earlyExit && guestPos<k2){return false;}
+ 		diffs++;
+ 		if(diffs<=limit){continue;}
+ 		//Too many substitutions; let the banded aligner decide, if one is usable.
+ 		if(bandy==null || maxEdits<1){return false;}
+ 		final int edits=bandy.alignForward(guest, host, 0, start, maxEdits, exact);
+ 		assert(guest.length<k || host.length<k || bandy.lastRow>=k2 || edits>maxEdits) : guest.length+", "+k+", "+bandy.lastRow+", "+edits;
+ 		return edits<=maxEdits && bandy.score()>4*edits;
+ 	}
+ 	return true;
+ }
+
+ /**
+ * Tests whether the reverse complement of u2 matches this unit's bases beginning at offset start:
+ * this unit is walked upward while u2 is walked from its last base downward and complemented.
+ * Always false when ignoreReverseComplement is set.
+ * @param u2 Candidate contained unit
+ * @param start Offset in this unit where the comparison begins
+ * @param bandy Fallback aligner; null means exceeding the mismatch limit fails
+ * @param earlyExit If true, a counted mismatch at a position of this unit below start+k2 fails immediately
+ * @return true if the placement passes the mismatch or edit limits
+ */
+ private boolean containsForwardRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ 	if(ignoreReverseComplement){return false;}
+ 	if(start+u2.length()>length()){return false;}
+ 	if(start<0 || start>=length()){return false;}
+ 	if(u2.r==null){
+ 		assert(false) : "TODO: Verify by hashing and checking both tips";
+ 		return false;
+ 	}
+ 	final byte[] host=bases(), guest=u2.bases();
+ 	final int limit=calcMaxEdits(maxSubs, minIdentityMult, guest.length);
+ 	final int prefixEnd=start+k2;
+ 	int diffs=0;
+
+ 	for(int hostPos=start, guestPos=guest.length-1; guestPos>=0; hostPos++, guestPos--){
+ 		final byte x=host[hostPos];
+ 		final byte y=baseToComplementExtended[guest[guestPos]];
+ 		if(x==y){continue;}
+ 		if(!exact && !(AminoAcid.isFullyDefined(x) && AminoAcid.isFullyDefined(y))){continue;}
+ 		if(earlyExit && hostPos<prefixEnd){return false;}
+ 		diffs++;
+ 		if(diffs<=limit){continue;}
+ 		//Too many substitutions; let the banded aligner decide, if one is usable.
+ 		if(bandy==null || maxEdits<1){return false;}
+ 		final int edits=bandy.alignForwardRC(guest, host, guest.length-1, start, maxEdits, exact);
+ 		assert(guest.length<k || host.length<k || bandy.lastRow>=k2 || edits>maxEdits) : guest.length+", "+k+", "+bandy.lastRow+", "+edits;
+ 		return edits<=maxEdits && bandy.score()>4*edits;
+ 	}
+ 	return true;
+ }
+
+ /**
+ * Tests whether u2 matches this unit when both are walked right to left, with u2's last base
+ * placed at position start of this unit.
+ * @param u2 Candidate contained unit
+ * @param start Position in this unit aligned with u2's last base
+ * @param bandy Fallback aligner; null means exceeding the mismatch limit fails
+ * @param earlyExit If true, a counted mismatch at a position of this unit above start-k2 fails immediately
+ * @return true if the placement passes the mismatch or edit limits
+ */
+ private boolean containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ 	if(start+1<u2.length()){return false;}
+ 	if(start<0 || start>=length()){return false;}
+ 	if(u2.r==null){
+ 		assert(false) : "TODO: Verify by hashing and checking both tips";
+ 		return false;
+ 	}
+ 	final byte[] host=bases(), guest=u2.bases();
+ 	final int limit=calcMaxEdits(maxSubs, minIdentityMult, guest.length);
+ 	final int prefixEnd=start-k2;
+ 	int diffs=0;
+
+ 	for(int hostPos=start, guestPos=guest.length-1; guestPos>=0; hostPos--, guestPos--){
+ 		final byte x=host[hostPos];
+ 		final byte y=guest[guestPos];
+ 		if(x==y){continue;}
+ 		if(!exact && !(AminoAcid.isFullyDefined(x) && AminoAcid.isFullyDefined(y))){continue;}
+ 		if(earlyExit && hostPos>prefixEnd){return false;}
+ 		diffs++;
+ 		if(diffs<=limit){continue;}
+ 		//Too many substitutions; let the banded aligner decide, if one is usable.
+ 		if(bandy==null || maxEdits<1){return false;}
+ 		final int edits=bandy.alignReverse(guest, host, guest.length-1, start, maxEdits, exact);
+ 		assert(guest.length<k || host.length<k || bandy.lastRow>=k2 || edits>maxEdits) : guest.length+", "+k+", "+bandy.lastRow+", "+edits;
+ 		return edits<=maxEdits && bandy.score()>4*edits;
+ 	}
+ 	return true;
+ }
+
+ /**
+ * Tests whether the reverse complement of u2 matches this unit when this unit is walked
+ * downward from position start while u2 is walked upward from its first base and complemented.
+ * Always false when ignoreReverseComplement is set.
+ * @param u2 Candidate contained unit
+ * @param start Position in this unit aligned with u2's first base
+ * @param bandy Fallback aligner; null means exceeding the mismatch limit fails
+ * @param earlyExit If true, a counted mismatch within the first k2 bases of u2 fails immediately
+ * @return true if the placement passes the mismatch or edit limits
+ */
+ private boolean containsReverseRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ 	if(ignoreReverseComplement){return false;}
+ 	if(start+1<u2.length()){return false;}
+ 	if(start<0 || start>=length()){return false;}
+ 	if(u2.r==null){
+ 		assert(false) : "TODO: Verify by hashing and checking both tips";
+ 		return false;
+ 	}
+ 	final byte[] host=bases(), guest=u2.bases();
+ 	final int limit=calcMaxEdits(maxSubs, minIdentityMult, guest.length);
+ 	int diffs=0;
+
+ 	for(int hostPos=start, guestPos=0; guestPos<guest.length; hostPos--, guestPos++){
+ 		final byte x=host[hostPos];
+ 		final byte y=baseToComplementExtended[guest[guestPos]];
+ 		if(x==y){continue;}
+ 		if(!exact && !(AminoAcid.isFullyDefined(x) && AminoAcid.isFullyDefined(y))){continue;}
+ 		if(earlyExit && guestPos<k2){return false;}
+ 		diffs++;
+ 		if(diffs<=limit){continue;}
+ 		//Too many substitutions; let the banded aligner decide, if one is usable.
+ 		if(bandy==null || maxEdits<1){return false;}
+ 		final int edits=bandy.alignReverseRC(guest, host, 0, start, maxEdits, exact);
+ 		assert(guest.length<k || host.length<k || bandy.lastRow>=k2 || edits>maxEdits) : guest.length+", "+k+", "+bandy.lastRow+", "+edits;
+ 		return edits<=maxEdits && bandy.score()>4*edits;
+ 	}
+ 	return true;
+ }
+
+
+ /**
+ * Decides whether two depth values are close enough to be treated as compatible.
+ * Two values that are both below 5 always pass; otherwise the larger value must not exceed
+ * the smaller one (floored at 1) multiplied by depthRatio.
+ * @param aa First depth
+ * @param bb Second depth
+ * @return true if the depths are compatible under the rule above
+ */
+ public boolean depthCongruent(int aa, int bb){
+ 	final boolean bothShallow=(aa<5 && bb<5);
+ 	if(bothShallow){return true;}
+ 	final int lower=Tools.max(1, Tools.min(aa, bb));
+ 	final int upper=Tools.max(aa, bb);
+ 	return upper<=lower*depthRatio;
+ }
+
+
+ /**
+ * Decides whether u2 overlaps this unit, given a key found at position loc.
+ * Pairs failing the depth check (when parseDepth is set) or the minLengthPercent length ratio are rejected first.
+ * A key equal to u2's prefix triggers the forward and reverse-RC tests; a key equal to u2's suffix
+ * triggers the forward-RC and reverse tests, both started from u2's last base.
+ * @param u2 Candidate overlapping unit
+ * @param loc Position in this unit associated with the key
+ * @param key Key that produced the hit
+ * @param bandy Aligner handed to the overlapsX helpers (may be null)
+ * @param tableNum 0 selects prefix1/suffix1 and enables early exit; any other value selects prefix2/suffix2 with offsets shifted by k
+ * @param editLimit Edit limit forwarded to the overlapsX helpers
+ * @return true if any tested orientation reports an overlap
+ */
+ public boolean overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit) {
+ 	if(verbose){System.err.println("overlaps: Considering key "+key+", unit "+u2);}
+ 	if(parseDepth && !depthCongruent(depth, u2.depth)){return false;}
+ 	if(minLengthPercent>0){
+ 		final int mine=length(), theirs=u2.length();
+ 		if(Tools.min(mine, theirs)*100f/Tools.max(mine, theirs)<minLengthPercent){return false;}
+ 	}
+ 	assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() ||
+ 			(canonical()==u2.canonical() ? (u2.prefix1!=prefix1 && u2.suffix1!=suffix1) : (u2.prefix1!=suffix1 && u2.suffix1!=prefix1))) :
+ 				"Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
+
+ 	final boolean firstTable=(tableNum==0);
+ 	final int forwardStart=(firstTable ? loc-k2 : loc-k2-k);
+ 	final int reverseStart=(firstTable ? loc : loc+k);
+
+ 	if(key.value()==(firstTable ? u2.prefix1 : u2.prefix2)){
+ 		if(verbose){System.err.println(firstTable ? "Testing overlaps A1" : "Testing overlaps A2");}
+ 		if(overlapsForward(u2, forwardStart, 0, bandy, firstTable, editLimit)){
+ 			if(verbose){System.err.println(firstTable ? "Found Overlap A1F" : "Found Overlap A2F");}
+ 			return true;
+ 		}
+ 		if(overlapsReverseRC(u2, reverseStart, 0, bandy, firstTable, editLimit)){
+ 			if(verbose){System.err.println(firstTable ? "Found Overlap A1R" : "Found Overlap A2R");}
+ 			return true;
+ 		}
+ 		if(verbose){System.err.println("No Overlap.");}
+ 	}
+ 	if(key.value()==(firstTable ? u2.suffix1 : u2.suffix2)){
+ 		if(verbose){System.err.println(firstTable ? "Testing overlaps B1" : "Testing overlaps B2");}
+ 		//The per-orientation trace lines exist only for the first table.
+ 		if(verbose && firstTable){System.err.println("Testing overlaps B1F");}
+ 		if(overlapsForwardRC(u2, forwardStart, u2.length()-1, bandy, firstTable, editLimit)){
+ 			if(verbose){System.err.println(firstTable ? "Found Overlap B1F" : "Found Overlap B2F");}
+ 			return true;
+ 		}
+ 		if(verbose && firstTable){System.err.println("Testing overlaps B1R");}
+ 		if(overlapsReverse(u2, reverseStart, u2.length()-1, bandy, firstTable, editLimit)){
+ 			if(verbose){System.err.println(firstTable ? "Found Overlap B1R" : "Found Overlap B2R");}
+ 			return true;
+ 		}
+ 		if(verbose){System.err.println("No Overlap.");}
+ 	}
+
+ 	return false;
+ }
+
+ /**
+ * Builds an Overlap between this unit and u2, given a key found at position loc, or returns null.
+ * Pairs failing the depth check (when parseDepth is set) or the minLengthPercent length ratio are rejected first.
+ * A key equal to u2's prefix tries the forward then reverse-RC constructions; a key equal to
+ * u2's suffix tries the forward-RC then reverse constructions. The first non-null result wins.
+ * @param u2 Candidate overlapping unit
+ * @param loc Position in this unit associated with the key
+ * @param key Key that produced the hit
+ * @param bandy Aligner handed to the makeOverlapX helpers (may be null)
+ * @param tableNum 0 selects prefix1/suffix1 and enables early exit; any other value selects prefix2/suffix2 with offsets shifted by k
+ * @return The first Overlap produced, or null if no orientation yields one
+ */
+ protected Overlap makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) {
+ 	if(verbose){System.err.println("makeOverlap: Considering key "+key+", unit "+u2);}
+ 	if(parseDepth && !depthCongruent(depth, u2.depth)){return null;}
+ 	if(minLengthPercent>0){
+ 		final int mine=length(), theirs=u2.length();
+ 		if(Tools.min(mine, theirs)*100f/Tools.max(mine, theirs)<minLengthPercent){return null;}
+ 	}
+ 	assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) ||
+ 			(canonical()==u2.canonical() ? (u2.prefix1!=prefix1 && u2.suffix1!=suffix1) : (u2.prefix1!=suffix1 && u2.suffix1!=prefix1))) :
+ 				"Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
+
+ 	final boolean firstTable=(tableNum==0);
+ 	final int forwardStart=(firstTable ? loc-k2 : loc-k2-k);
+ 	final int reverseStart=(firstTable ? loc : loc+k);
+ 	Overlap found;
+
+ 	if(key.value()==(firstTable ? u2.prefix1 : u2.prefix2)){
+ 		if(verbose){System.err.println(firstTable ? "\nTesting makeOverlap A1F" : "\nTesting makeOverlap A2F");}
+ 		found=makeOverlapForward(u2, forwardStart, bandy, firstTable);
+ 		if(found!=null){
+ 			if(verbose){System.err.println(firstTable ? "Made Overlap A1F" : "Made Overlap A2F");}
+ 			return found;
+ 		}
+ 		if(verbose){System.err.println(firstTable ? "\nTesting makeOverlap A1R" : "\nTesting makeOverlap A2R");}
+ 		found=makeOverlapReverseRC(u2, reverseStart, bandy, firstTable);
+ 		if(found!=null){
+ 			if(verbose){System.err.println(firstTable ? "Made Overlap A1R" : "Made Overlap A2R");}
+ 			return found;
+ 		}
+ 		if(verbose){System.err.println("No Overlap.");}
+ 	}
+ 	if(key.value()==(firstTable ? u2.suffix1 : u2.suffix2)){
+ 		if(verbose){System.err.println(firstTable ? "\nTesting makeOverlap B1F" : "\nTesting makeOverlap B2F");}
+ 		found=makeOverlapForwardRC(u2, forwardStart, bandy, firstTable);
+ 		if(found!=null){
+ 			if(verbose){System.err.println(firstTable ? "Made Overlap B1F" : "Made Overlap B2F");}
+ 			return found;
+ 		}
+ 		if(verbose){System.err.println(firstTable ? "\nTesting makeOverlap B1R" : "\nTesting makeOverlap B2R");}
+ 		found=makeOverlapReverse(u2, reverseStart, bandy, firstTable);
+ 		if(found!=null){
+ 			if(verbose){System.err.println(firstTable ? "Made Overlap B1R" : "Made Overlap B2R");}
+ 			return found;
+ 		}
+ 		if(verbose){System.err.println("No Overlap.");}
+ 	}
+ 	//Every attempted construction came back null.
+ 	return null;
+ }
+
+ /**
+ * Tests whether u2 overlaps this unit in forward orientation by walking both base arrays upward,
+ * beginning at start1 in this unit and start2 in u2.
+ * A difference counts only when 'exact' is set or both bases are fully defined; once the count
+ * exceeds maxMismatches (from calcMaxEdits over overlapLength) a banded alignment is attempted.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a negative value is clipped to 0 and start2 is advanced by the same amount
+ * @param start2 Start position in u2
+ * @param bandy Fallback aligner; if null, exceeding maxMismatches returns false
+ * @param earlyExit If true, a counted mismatch at a u2 position below k2 returns false immediately
+ * @param maxEdits Edit limit for the banded alignment; a value below 1 disables the fallback.
+ * Within this method the parameter takes the place of the enclosing-scope maxEdits used by the containsX methods.
+ * @return true if the compared region stays within the mismatch limit, or the banded alignment is accepted
+ */
+ private boolean overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsForward(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ final int len1=length(), len2=u2.length();
+ // Clip a negative start1 to 0 and move start2 forward by the clipped distance.
+ if(start1<0){
+ start2-=start1;
+ start1=0;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the shorter of the two remaining spans (the compared region); overlapLength2 is the longer one.
+ int overlapLength=Tools.min(len1-start1, len2-start2);
+ int overlapLength2=Tools.max(len1-start1, len2-start2);
+ int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // NOTE(review): overlapLength appears unable to exceed min(len1, len2) for non-negative starts, so this
+ // guard seems to depend only on allowAllContainedOverlaps - confirm the intended comparison.
+ // NOTE(review): the filters below test overlapLength2, whereas makeOverlapForward tests overlapLength - confirm.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(verbose){
+ System.err.println("Side block. allowAllContainedOverlaps="+allowAllContainedOverlaps+", minOverlapCluster="+minOverlapCluster);
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxEdits="+maxEdits);
+ }
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // Base-by-base comparison over the overlapping region.
+ for(int i=start1, j=start2; j<=stop2; i++, j++){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the aligner is started at query position 0 rather than start2 (makeOverlapForward passes start2) - confirm.
+ int edits=bandy.alignForward(b, a, 0, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop1/stop2 are updated from the aligner but are not read again before the return.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether the reverse complement of u2 overlaps this unit: this unit is walked upward from start1
+ * while u2 is walked downward from start2, each u2 base being complemented before comparison.
+ * Always false when ignoreReverseComplement is set.
+ * A difference counts only when 'exact' is set or both bases are fully defined; once the count
+ * exceeds maxMismatches a banded alignment is attempted.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a negative value is clipped to 0 and start2 is reduced by the same amount
+ * @param start2 Start position in u2 (walked toward 0)
+ * @param bandy Fallback aligner; if null, exceeding maxMismatches returns false
+ * @param earlyExit If true, a counted mismatch at a position of this unit below start1+k2 returns false immediately
+ * @param maxEdits Edit limit for the banded alignment; a value below 1 disables the fallback
+ * @return true if the compared region stays within the mismatch limit, or the banded alignment is accepted
+ */
+ private boolean overlapsForwardRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsForwardRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ if(ignoreReverseComplement){return false;}
+ final int len1=length(), len2=u2.length();
+ // Clip a negative start1 to 0; start1 is negative here, so start2 moves toward 0 by the clipped distance.
+ if(start1<0){
+ start2+=start1;
+ start1=0;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the shorter of the two remaining spans (the compared region); overlapLength2 is the longer one.
+ final int overlapLength=Tools.min(len1-start1, start2+1);
+ final int overlapLength2=Tools.max(len1-start1, start2+1);
+ int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // NOTE(review): the filters test overlapLength2, whereas makeOverlapForwardRC tests overlapLength - confirm.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ // NOTE(review): the limit is derived from b.length here, while overlapsForward uses overlapLength - confirm which is intended.
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // i ascends through this unit while j descends through u2; iprefix marks the end of the early-exit zone.
+ for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i<iprefix){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the aligner is started at b.length-1 rather than start2 (makeOverlapForwardRC passes start2) - confirm.
+ int edits=bandy.alignForwardRC(b, a, b.length-1, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop1/stop2 are updated from the aligner but are not read again before the return.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether u2 overlaps this unit when both base arrays are walked downward,
+ * beginning at start1 in this unit and start2 in u2.
+ * A difference counts only when 'exact' is set or both bases are fully defined; once the count
+ * exceeds maxMismatches a banded alignment is attempted.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a value at or beyond this unit's length is clipped to the last index and start2 is reduced by the same amount
+ * @param start2 Start position in u2 (walked toward 0)
+ * @param bandy Fallback aligner; if null, exceeding maxMismatches returns false
+ * @param earlyExit If true, a counted mismatch at a position of this unit above start1-k2 returns false immediately
+ * @param maxEdits Edit limit for the banded alignment; a value below 1 disables the fallback
+ * @return true if the compared region stays within the mismatch limit, or the banded alignment is accepted
+ */
+ private boolean overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsReverse(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ final int len1=length(), len2=u2.length();
+ // Clip start1 to the last valid index and pull start2 back by the clipped distance.
+ if(start1>=len1){
+ start2-=(start1-len1+1);
+ start1=len1-1;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the shorter of the two remaining spans (the compared region); overlapLength2 is the longer one.
+ final int overlapLength=Tools.min(start1+1, start2+1);
+ final int overlapLength2=Tools.max(start1+1, start2+1);
+ int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // NOTE(review): the filters test overlapLength2, whereas makeOverlapReverse tests overlapLength - confirm.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ // NOTE(review): the limit is derived from b.length here, while overlapsForward uses overlapLength - confirm which is intended.
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // Both indices descend; iprefix marks the lower edge of the early-exit zone in this unit.
+ for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i>iprefix){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the aligner is started at b.length-1 rather than start2 (makeOverlapReverse passes start2) - confirm.
+ int edits=bandy.alignReverse(b, a, b.length-1, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop1/stop2 are updated from the aligner but are not read again before the return.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether the reverse complement of u2 overlaps this unit: this unit is walked downward from start1
+ * while u2 is walked upward from start2, each u2 base being complemented before comparison.
+ * Always false when ignoreReverseComplement is set.
+ * A difference counts only when 'exact' is set or both bases are fully defined; once the count
+ * exceeds maxMismatches a banded alignment is attempted.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a value at or beyond this unit's length is clipped to the last index and start2 is advanced by the same amount
+ * @param start2 Start position in u2 (walked upward)
+ * @param bandy Fallback aligner; if null, exceeding maxMismatches returns false
+ * @param earlyExit If true, a counted mismatch at a u2 position below k2 returns false immediately
+ * @param maxEdits Edit limit for the banded alignment; a value below 1 disables the fallback
+ * @return true if the compared region stays within the mismatch limit, or the banded alignment is accepted
+ */
+ private boolean overlapsReverseRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsReverseRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ if(ignoreReverseComplement){return false;}
+ final int len1=length(), len2=u2.length();
+ // Clip start1 to the last valid index and advance start2 by the clipped distance.
+ if(start1>=len1){
+ start2+=(start1-len1+1);
+ start1=len1-1;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the shorter of the two remaining spans (the compared region); overlapLength2 is the longer one.
+ final int overlapLength=Tools.min(start1+1, len2-start2);
+ final int overlapLength2=Tools.max(start1+1, len2-start2);
+ int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // NOTE(review): the filters test overlapLength2, whereas makeOverlapReverseRC tests overlapLength - confirm.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ // NOTE(review): the limit is derived from b.length here, while overlapsForward uses overlapLength - confirm which is intended.
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // i descends through this unit while j ascends through u2.
+ for(int i=start1, j=start2; j<=stop2; i--, j++){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the aligner is started at query position 0 rather than start2 (makeOverlapReverseRC passes start2) - confirm.
+ int edits=bandy.alignReverseRC(b, a, 0, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop1/stop2 are updated from the aligner but are not read again before the return.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+
+
+ /**
+ * Builds a FORWARD Overlap between this unit and u2, or returns null if none qualifies.
+ * This unit is walked upward from start1 and u2 upward from its first base (or later, if start1 was clipped).
+ * A difference counts only when 'exact' is set or both bases are fully defined. When bandy is non-null,
+ * the second counted mismatch hands the decision to a banded alignment; otherwise the overlap is rejected
+ * once the count exceeds maxMismatches.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a negative value is clipped to 0 and u2's start is advanced by the same amount
+ * @param bandy Aligner used for the fallback (may be null); also handed to the Overlap constructor
+ * @param earlyExit If true, a counted mismatch at a u2 position below k2 returns null immediately
+ * @return A new Overlap, or null if a length filter, the mismatch limit, or the alignment (edits above maxEdits, or score not above 4*edits) rejects the pair
+ */
+ private Overlap makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapForward(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+ final int len1=length(), len2=u2.length();
+ int start2=0;
+ // Clip a negative start1 to 0 and move start2 forward by the clipped distance.
+ if(start1<0){
+ start2-=start1;
+ start1=0;
+ }
+ final int overlapLength=Tools.min(len1-start1, len2-start2);
+ // NOTE(review): overlapLength2 is not read anywhere else in this method.
+ final int overlapLength2=Tools.max(len1-start1, len2-start2);
+ int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // Minimum-length and minimum-percent filters on the compared region.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ for(int i=start1, j=start2; j<=stop2; i++, j++){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ // With an aligner available, a second mismatch switches to banded alignment regardless of maxMismatches.
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignForward(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the alignment's end points as the overlap's stop positions.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1+1<=len1){stop1++;}
+// else{stop2--;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2+1<=len2){stop2++;}
+// else{stop1--;}
+// }
+// }
+ return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed: the mismatch count is passed along with 0 in the following argument.
+ return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+ * Builds a FORWARDRC Overlap between this unit and the reverse complement of u2, or returns null.
+ * This unit is walked upward from start1 while u2 is walked downward from its last base (or earlier,
+ * if start1 was clipped), each u2 base being complemented before comparison.
+ * Always null when ignoreReverseComplement is set.
+ * When bandy is non-null, the second counted mismatch hands the decision to a banded alignment;
+ * otherwise the overlap is rejected once the count exceeds maxMismatches.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a negative value is clipped to 0 and u2's start is reduced by the same amount
+ * @param bandy Aligner used for the fallback (may be null); also handed to the Overlap constructor
+ * @param earlyExit If true, a counted mismatch at a position of this unit below start1+k2 returns null immediately
+ * @return A new Overlap, or null if a length filter, the mismatch limit, or the alignment (edits above maxEdits, or score not above 4*edits) rejects the pair
+ */
+ private Overlap makeOverlapForwardRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapForwardRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+ if(ignoreReverseComplement){return null;}
+ final int len1=length(), len2=u2.length();
+ int start2=len2-1;
+ // Clip a negative start1 to 0; start1 is negative here, so start2 moves toward 0 by the clipped distance.
+ if(start1<0){
+ start2+=start1;
+ start1=0;
+ }
+ final int overlapLength=Tools.min(len1-start1, start2+1);
+ // NOTE(review): overlapLength2 is not read anywhere else in this method.
+ final int overlapLength2=Tools.max(len1-start1, start2+1);
+ int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // Minimum-length and minimum-percent filters on the compared region.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ // NOTE(review): the limit is derived from b.length here, while makeOverlapForward uses overlapLength - confirm which is intended.
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ // i ascends through this unit while j descends through u2; iprefix marks the end of the early-exit zone.
+ for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i<iprefix){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ // With an aligner available, a second mismatch switches to banded alignment regardless of maxMismatches.
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignForwardRC(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the alignment's end points as the overlap's stop positions.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1+1<=len1){stop1++;}
+// else{stop2++;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2>0){stop2--;}
+// else{stop1--;}
+// }
+// }
+ return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed: the mismatch count is passed along with 0 in the following argument.
+ return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+ * Builds a REVERSE Overlap between this unit and u2, or returns null if none qualifies.
+ * Both base arrays are walked downward: this unit from start1 and u2 from its last base
+ * (or earlier, if start1 was clipped).
+ * When bandy is non-null, the second counted mismatch hands the decision to a banded alignment;
+ * otherwise the overlap is rejected once the count exceeds maxMismatches.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a value at or beyond this unit's length is clipped to the last index and u2's start is reduced by the same amount
+ * @param bandy Aligner used for the fallback (may be null); also handed to the Overlap constructor
+ * @param earlyExit If true, a counted mismatch at a position of this unit above start1-k2 returns null immediately
+ * @return A new Overlap, or null if a length filter, the mismatch limit, or the alignment (edits above maxEdits, or score not above 4*edits) rejects the pair
+ */
+ private Overlap makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapReverse(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+
+ final int len1=length(), len2=u2.length();
+ int start2=len2-1;
+ // Clip start1 to the last valid index and pull start2 back by the clipped distance.
+ if(start1>=len1){
+ start2-=(start1-len1+1);
+ start1=len1-1;
+ }
+ final int overlapLength=Tools.min(start1+1, start2+1);
+ // NOTE(review): overlapLength2 is not read anywhere else in this method.
+ final int overlapLength2=Tools.max(start1+1, start2+1);
+ int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // Minimum-length and minimum-percent filters on the compared region.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ // NOTE(review): the limit is derived from b.length here, while makeOverlapForward uses overlapLength - confirm which is intended.
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // Both indices descend; iprefix marks the lower edge of the early-exit zone in this unit.
+ for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i>iprefix){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ // With an aligner available, a second mismatch switches to banded alignment regardless of maxMismatches.
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignReverse(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the alignment's end points as the overlap's stop positions.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1>0){stop1--;}
+// else{stop2++;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2>0){stop2--;}
+// else{stop1++;}
+// }
+// }
+ return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed: the mismatch count is passed along with 0 in the following argument.
+ return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+ * Builds a REVERSERC Overlap between this unit and the reverse complement of u2, or returns null.
+ * This unit is walked downward from start1 while u2 is walked upward from its first base (or later,
+ * if start1 was clipped), each u2 base being complemented before comparison.
+ * Always null when ignoreReverseComplement is set.
+ * When bandy is non-null, the second counted mismatch hands the decision to a banded alignment;
+ * otherwise the overlap is rejected once the count exceeds maxMismatches.
+ * @param u2 Candidate overlapping unit
+ * @param start1 Start position in this unit; a value at or beyond this unit's length is clipped to the last index and u2's start is advanced by the same amount
+ * @param bandy Aligner used for the fallback (may be null); also handed to the Overlap constructor
+ * @param earlyExit If true, a counted mismatch at a u2 position below k2 returns null immediately
+ * @return A new Overlap, or null if a length filter, the mismatch limit, or the alignment (edits above maxEdits, or score not above 4*edits) rejects the pair
+ */
+ private Overlap makeOverlapReverseRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapReverseRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+ if(ignoreReverseComplement){return null;}
+ final int len1=length(), len2=u2.length();
+ int start2=0;
+ // Clip start1 to the last valid index and advance start2 by the clipped distance.
+ if(start1>=len1){
+ start2+=(start1-len1+1);
+ start1=len1-1;
+ }
+ final int overlapLength=Tools.min(start1+1, len2-start2);
+ // NOTE(review): overlapLength2 is not read anywhere else in this method.
+ final int overlapLength2=Tools.max(start1+1, len2-start2);
+ int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // Minimum-length and minimum-percent filters on the compared region.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ // NOTE(review): the limit is derived from b.length here, while makeOverlapForward uses overlapLength - confirm which is intended.
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // i descends through this unit while j ascends through u2.
+ for(int i=start1, j=start2; j<=stop2; i--, j++){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ // With an aligner available, a second mismatch switches to banded alignment regardless of maxMismatches.
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignReverseRC(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the alignment's end points as the overlap's stop positions.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1>0){stop1--;}
+// else{stop2--;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2+1<=len2){stop2++;}
+// else{stop1++;}
+// }
+// }
+ return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed: the mismatch count is passed along with 0 in the following argument.
+ return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+  * Natural ordering of Units; the whole decision is delegated to comparePairedRC.
+  * @param b Unit to compare with this one
+  * @return whatever comparePairedRC(this, b) returns
+  */
+ @Override
+ public int compareTo(Unit b) {
+ 	final int result=comparePairedRC(this, b);
+ 	//Disabled symmetry/consistency diagnostics, retained for debugging.
+//	int reversed=comparePairedRC(b, this);
+//	boolean eq1=this.equals(b);
+//	boolean eq2=b.equals(this);
+//
+//	assert((result==reversed)==(result==0)) : result+", "+reversed+"\n"+this+"\n"+b;
+//	assert((result>0 == reversed<0) || (result==0 && reversed==0)) : result+", "+reversed+"\n"+this+"\n"+b;
+//
+//	assert(eq1==eq2): result+", "+reversed+"\n"+this+"\n"+b;
+//	assert(eq1==(result==0)): result+", "+reversed+"\n"+this+"\n"+b;
+//
+//	assert(eq1 || this!=b);
+//
+//	if(verbose){ //TODO: Remove
+//		System.err.println(this+"\n"+b+"\n"+this.r.toFastq()+"\n"+this.r.mate.toFastq()+"\n"+b.r.toFastq()+"\n"+b.r.mate.toFastq()+"\n");
+//		System.err.println("\n"+result+", "+reversed+", "+eq1+", "+eq2);
+//		verbose=false;
+//	}
+
+ 	return result;
+ }
+
+ /**
+  * Equality against an arbitrary object, as required by the Object.equals contract.
+  * A null argument or an object that is not a Unit yields false; previously the
+  * unconditional cast threw ClassCastException for foreign types.
+  * @param b object to compare with this Unit
+  * @return true only if b is a Unit that equals(Unit) accepts
+  */
+ @Override
+ public boolean equals(Object b){return (b instanceof Unit) && equals((Unit)b);}
+ /**
+  * Typed equality; the decision is delegated to pairedEqualsRC(this, b).
+  * @param b Unit to compare with this one
+  * @return the result of pairedEqualsRC(this, b)
+  */
+ public boolean equals(Unit b){
+ 	boolean x=pairedEqualsRC(this, b);
+ 	//Disabled consistency diagnostics, retained for debugging.
+//	assert(x==pairedEqualsRC(b, this));
+//	assert(x==(comparePairedRC(this, b)==0));
+//	assert(x==(comparePairedRC(b, this)==0));
+//	assert(x || this!=b);
+//	System.err.println("\n****EQUALS?****:\n"+this+"\n"+b+"\n**** ****"); //TODO: Remove
+ 	return x;
+ }
+
+ /**
+  * Hash derived solely from code1: its upper and lower 32-bit halves are XORed
+  * and the low 32 bits of the result are returned.
+  */
+ @Override
+ public int hashCode(){
+ 	final long folded=code1^(code1>>>32);
+ 	return (int)folded; //Narrowing to int keeps exactly the low 32 bits
+ }
+
+ /**
+  * Marks this unit valid or invalid. Validity is stored inverted: the INVALID_MASK bit is set
+  * when the unit is invalid. Asserts that the call actually changes the state.
+  * @param b new validity
+  */
+ private synchronized void setValid(boolean b){
+ 	assert(b!=valid());
+//	if(!b){System.err.println("Setting invalid "+name());}
+ 	flags=(b ? (flags&~INVALID_MASK) : (flags|INVALID_MASK));
+ 	assert(b==valid());
+ }
+
+ /**
+  * Sets or clears the CLUSTER_MASK bit. Asserts that the call actually changes the state.
+  * @param b new value of the clustered flag
+  */
+ private synchronized void setClustered(boolean b){
+ 	assert(b!=clustered());
+ 	flags=(b ? (flags|CLUSTER_MASK) : (flags&~CLUSTER_MASK));
+ 	assert(b==clustered());
+ }
+
+ /**
+  * Sets or clears the VISIT_MASK bit. Asserts that the call actually changes the state.
+  * Unlike several sibling setters, this method is not declared synchronized.
+  * @param b new value of the visited flag
+  */
+ private void setVisited(boolean b){
+ 	assert(b!=visited());
+ 	flags=(b ? (flags|VISIT_MASK) : (flags&~VISIT_MASK));
+ 	assert(b==visited());
+ }
+
+ /**
+  * Sets or clears the CANON_MASK bit. Asserts that the call changes the state and, when a read
+  * is attached, that the new value agrees with isCanonical(r.bases).
+  * @param b new value of the canonical flag
+  */
+ private synchronized void setCanonical(boolean b){
+ 	assert(b!=canonical());
+ 	flags=(b ? (flags|CANON_MASK) : (flags&~CANON_MASK));
+ 	assert(b==canonical());
+ 	assert(r==null || b==isCanonical(r.bases));
+ }
+
+ /**
+  * Sets or clears the CANONICIZED_MASK bit. Asserts that the call actually changes the state.
+  * This method is not declared synchronized.
+  * @param b new value of the canonicized flag
+  */
+ private void setCanonicized(boolean b){
+ 	assert(b!=canonicized());
+ 	flags=(b ? (flags|CANONICIZED_MASK) : (flags&~CANONICIZED_MASK));
+ 	assert(b==canonicized());
+ }
+
+ /**
+  * Sets or clears the CANON_CONTRADICTION_MASK bit. Setting the flag to its current value
+  * is permitted (the change-of-state precondition is disabled).
+  * @param b new value of the canon-contradiction flag
+  */
+ private synchronized void setCanonContradiction(boolean b){
+//	assert(b!=canonContradiction());
+ 	flags=(b ? (flags|CANON_CONTRADICTION_MASK) : (flags&~CANON_CONTRADICTION_MASK));
+ 	assert(b==canonContradiction());
+ }
+
+ /**
+  * Stores the offset and then flags it as valid via setOffsetValid(true).
+  * @param x offset value to store
+  */
+ private synchronized void setOffset(int x){
+ 	this.offset=x;
+ 	this.setOffsetValid(true);
+ }
+
+ /**
+  * Sets or clears the OFFSET_VALID_MASK bit.
+  * NOTE(review): the precondition asserts that the flag is currently clear regardless of b,
+  * so calling this with b=false on a valid offset trips the assertion - confirm that is intended.
+  * @param b new value of the offset-valid flag
+  */
+ private synchronized void setOffsetValid(boolean b){
+ 	assert(!offsetValid());
+ 	flags=(b ? (flags|OFFSET_VALID_MASK) : (flags&~OFFSET_VALID_MASK));
+ 	assert(b==offsetValid());
+ }
+
+ /**
+  * Sets or clears the OFFSET_CONTRADICTION_MASK bit. Requires (by assertion) that the offset
+  * is already valid; setting the flag to its current value is permitted.
+  * @param b new value of the offset-contradiction flag
+  */
+ private synchronized void setOffsetContradiction(boolean b){
+//	assert(b!=offsetContradiction());
+ 	assert(offsetValid());
+ 	flags=(b ? (flags|OFFSET_CONTRADICTION_MASK) : (flags&~OFFSET_CONTRADICTION_MASK));
+ 	assert(b==offsetContradiction());
+ }
+
+ private void reverseComplement(){
+ assert(r!=null);
+ r.reverseComplement();
+ long temp=prefix1;
+ prefix1=suffix1;
+ suffix1=temp;
+ temp=prefix2;
+ prefix2=suffix2;
+ suffix2=temp;
+ setCanonical(!canonical());
+ }
+
+ /**
+  * Decides whether 'this' should be the first Unit in an overlap object shared with u2.
+  * Tie-break order: greater length, then greater code1, then greater code2, then a positive
+  * compareTo result, and finally the read's numericID (ties go to 'this').
+  * @param u2 the other Unit; must not be the same object as this
+  * @return true if this Unit goes first
+  */
+ public boolean firstInOverlap(Unit u2){
+ 	assert(this!=u2) : "\n"+this.r+"\n"+u2.r;
+ 	final int otherLen=u2.length(), myLen=length();
+ 	if(myLen!=otherLen){return myLen>otherLen;}
+ 	if(code1!=u2.code1){return code1>u2.code1;}
+ 	if(code2!=u2.code2){return code2>u2.code2;}
+ 	final int cmp=compareTo(u2);
+ 	assert(cmp!=0 || (r!=null && r.mate!=null));
+ 	if(cmp!=0){return cmp>0;}
+ 	return r.numericID>=u2.r.numericID;
+ }
+
+ /**
+  * Tells whether this unit belongs to the subset currently being processed.
+  * With fewer than two subsets everything is included. A unit whose read has pairnum()>0
+  * defers to the Unit stored in its mate's obj field; otherwise membership is
+  * (code1 with the sign bit cleared) modulo subsetCount compared against subset.
+  * @return true if this unit is in the active subset
+  */
+ public final boolean inSet(){
+ 	if(subsetCount<2){return true;}
+ 	if(r.pairnum()>0){
+ 		final Unit mateUnit=(Unit)r.mate.obj;
+ 		return mateUnit.inSet();
+ 	}
+ 	final long nonNegativeCode=code1&Long.MAX_VALUE;
+ 	return (nonNegativeCode%subsetCount)==subset;
+ }
+
+ public byte[] bases(){return r==null ? null : r.bases;}
+
+ public String name(){return r!=null ? r.id : null /*code+""*/;}
+ public String toString(){return "("+name()+","+code1+","+code2+","+length()+","+prefix1+","+suffix1+","+(canonical()?"c":"nc")+",d="+depth+")";}
+
+
+ public final Read r; // Attached read; bases() and name() return null when this is null.
+ public final long code1; // Sole input to hashCode(); first code tie-break in firstInOverlap(); drives subset membership in inSet().
+ public final long code2; // Second code tie-break in firstInOverlap().
+ public long prefix1=-1; // prefix1/suffix1 are exchanged by reverseComplement().
+ public long suffix1=-1;
+ public long prefix2=-1; // prefix2/suffix2 are exchanged by reverseComplement().
+ public long suffix2=-1;
+ /** Distance of leftmost side of this read relative to leftmost side of root.
+ * Assumes everything is in 'forward' orientation. */
+ public int offset=-999999999; // Placeholder until setOffset() stores a value and marks it valid.
+ public int depth=1; // Printed as "d=" by toString().
+// private boolean valid=true;
+
+ public int unitID;
+
+ public ArrayList<Overlap> overlapList;
+
+ private long flags; // Bit field: the bits under LEN_MASK hold the length, single bits above hold the boolean state read by the accessors below.
+ /** Whether the CANON_MASK bit is set, i.e. the original read orientation was canonical. */
+ public final boolean canonical(){return (flags&CANON_MASK)!=0;}
+ /** Whether this contig should be output; true while the INVALID_MASK bit is clear. */
+ public final boolean valid(){return (flags&INVALID_MASK)==0;}
+ /** Contig length, stored in the bits of flags covered by LEN_MASK. */
+ public final int length(){return (int)(flags&LEN_MASK);}
+ /** Position of this contig relative to root; asserts that the offset has been marked valid. */
+ public final int offset(){
+ 	assert(offsetValid());
+ 	return offset;
+ }
+ /** 1 when the PAIRNUM_MASK bit is set, otherwise 0. */
+ public int pairnum(){return (flags&PAIRNUM_MASK)!=0 ? 1 : 0;}
+
+ /**
+  * Clears the canonicized, visited, canon-contradiction, offset-valid and
+  * offset-contradiction bits in one step; all other bits of flags are untouched.
+  */
+ public void clearVolatileFlags(){
+ 	final long volatileBits=CANONICIZED_MASK|VISIT_MASK|CANON_CONTRADICTION_MASK|OFFSET_VALID_MASK|OFFSET_CONTRADICTION_MASK;
+ 	flags&=~volatileBits;
+ 	assert(!visited());
+ 	assert(!canonicized());
+ 	assert(!canonContradiction());
+ 	assert(!offsetValid());
+ 	assert(!offsetContradiction());
+ }
+
+ /** Whether the VISIT_MASK bit is set. */
+ public boolean visited(){return (flags&VISIT_MASK)!=0;}
+ /** Whether the CLUSTER_MASK bit is set. */
+ public boolean clustered(){return (flags&CLUSTER_MASK)!=0;}
+ /** Whether the CANONICIZED_MASK bit is set. */
+ public boolean canonicized(){return (flags&CANONICIZED_MASK)!=0;}
+ /** Whether the CANON_CONTRADICTION_MASK bit is set. */
+ public boolean canonContradiction(){return (flags&CANON_CONTRADICTION_MASK)!=0;}
+ /** Whether the OFFSET_VALID_MASK bit is set. */
+ public boolean offsetValid(){return (flags&OFFSET_VALID_MASK)!=0;}
+ /** Whether the OFFSET_CONTRADICTION_MASK bit is set. */
+ public boolean offsetContradiction(){return (flags&OFFSET_CONTRADICTION_MASK)!=0;}
+ /** True if either kind of contradiction has been recorded. */
+ public boolean contradiction(){return canonContradiction() || offsetContradiction();}
+
+ private static final long LEN_MASK=0x7FFFFFFFL; // Bits 0-30: length, read by length().
+ private static final long CANON_MASK=(1L<<33); // Bit 33, read by canonical(). No mask in this list uses bits 31-32.
+ private static final long INVALID_MASK=(1L<<34); // Bit 34; set means NOT valid (see valid()).
+ private static final long VISIT_MASK=(1L<<35); // Bit 35, read by visited(); cleared by clearVolatileFlags().
+ private static final long CLUSTER_MASK=(1L<<36); // Bit 36, read by clustered().
+ private static final long CANONICIZED_MASK=(1L<<37); // Bit 37, read by canonicized(); cleared by clearVolatileFlags().
+ private static final long CANON_CONTRADICTION_MASK=(1L<<38); // Bit 38, read by canonContradiction(); cleared by clearVolatileFlags().
+ private static final long OFFSET_VALID_MASK=(1L<<39); // Bit 39, read by offsetValid(); cleared by clearVolatileFlags().
+ private static final long OFFSET_CONTRADICTION_MASK=(1L<<40); // Bit 40, read by offsetContradiction(); cleared by clearVolatileFlags().
+ private static final long PAIRNUM_MASK=(1L<<41); // Bit 41; pairnum() returns 1 when set, 0 otherwise.
+ }
+
+ /**
+  * Orders Units by offset. Units whose offset is valid sort before Units whose offset is not;
+  * two valid offsets are compared numerically; every remaining tie (equal offsets, or neither
+  * offset valid) falls back to Unit.compareTo.
+  */
+ private static final class UnitOffsetComparator implements Comparator<Unit> {
+
+ 	UnitOffsetComparator(){}
+
+ 	/**
+ 	 * @param a first Unit
+ 	 * @param b second Unit
+ 	 * @return negative, zero or positive as a sorts before, with, or after b
+ 	 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+ 	 */
+ 	@Override
+ 	public int compare(Unit a, Unit b) {
+ 		final boolean aValid=a.offsetValid(), bValid=b.offsetValid();
+ 		if(aValid && bValid){
+ 			final int ao=a.offset(), bo=b.offset();
+ 			//Compare explicitly: ao-bo can overflow for offsets of opposite sign and flip the result.
+ 			if(ao!=bo){return ao<bo ? -1 : 1;}
+ 		}else if(aValid){
+ 			return -1;
+ 		}else if(bValid){
+ 			return 1;
+ 		}
+ 		return a.compareTo(b);
+ 	}
+
+ }
+
+ /**
+  * Orders clusters by descending element count; clusters of equal, non-zero size are ordered
+  * by comparing their first Units. Two empty clusters compare as equal.
+  */
+ private static final class ClusterLengthComparator implements Comparator<ArrayList<Unit>> {
+
+ 	ClusterLengthComparator(){}
+
+ 	/**
+ 	 * @param a first cluster
+ 	 * @param b second cluster
+ 	 * @return negative if a sorts first, positive if b sorts first, 0 if tied
+ 	 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+ 	 */
+ 	@Override
+ 	public int compare(ArrayList<Unit> a, ArrayList<Unit> b) {
+ 		final int sizeA=a.size(), sizeB=b.size();
+ 		if(sizeA!=sizeB){return sizeB-sizeA;}
+ 		if(sizeA==0){return 0;} //Sizes are equal here, so both lists are empty
+ 		final Unit firstA=a.get(0), firstB=b.get(0);
+ 		return firstA.compareTo(firstB);
+ 	}
+
+ }
+
+ /**
+  * Builds a lookup table over all 2-bit-encoded n-mers in which an n-mer and the value returned
+  * for it by AminoAcid.reverseComplementBinaryFast share one id. Ids are handed out in
+  * increasing order, starting at 0, each time the scan reaches an n-mer that Tools.min selects
+  * over its counterpart.
+  * @param n n-mer length in bases
+  * @return array of size 1<<(2*n) mapping n-mer code to id
+  */
+ public static final int[] makeNmerIndex(final int n){
+ 	final int limit=1<<(2*n);
+ 	final int[] index=new int[limit];
+
+ 	int nextId=0;
+ 	for(int kmer=0; kmer<limit; kmer++){
+ 		final int rc=AminoAcid.reverseComplementBinaryFast(kmer, n);
+ 		if(Tools.min(kmer, rc)!=kmer){continue;} //The pair is numbered when its smaller member is visited
+ 		index[rc]=nextId;
+ 		index[kmer]=nextId;
+ 		nextId++;
+ 	}
+ 	return index;
+ }
+
+ /** Makes a nmer (e.g., tetramer) profile of a cluster */
+ private static final float[] makeNmerProfile(ArrayList<Unit> alu, long[] array_){
+ final int nbits=2*nmerLength;
+ final long[] array=(array_==null ? new long[maxNmer+1] : array_);
+ final int mask=~((-1)<<(nbits));
+
+ long keysCounted=0;
+
+ for(Unit u : alu){
+ byte[] bases=u.r.bases;
+ int len=0;
+ int kmer=0;
+ for(byte b : bases){
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+ if(len>=nmerLength){
+ int rkmer=AminoAcid.reverseComplementBinaryFast(kmer, nmerLength);
+ keysCounted++;
+ array[nmerIndex[Tools.min(kmer, rkmer)]]++;
+ }
+ }
+ }
+ }
+
+ if(keysCounted==0){keysCounted=1;}
+ final float mult=1f/keysCounted;
+
+ float[] r=new float[array.length];
+ for(int i=0; i<array.length; i++){
+ r[i]=array[i]*mult;
+ array[i]=0;
+ }
+ return r;
+ }
+
+ private ConcurrentReadInputStream crisa[];
+
+ private final ByteStreamWriter dupeWriter;
+
+ // Input/output settings. NOTE(review): presumably populated by the argument parser - confirm in the constructor.
+ private String[] in1=null;
+ private String[] in2=null;
+ private String out=null;
+ private String clusterFilePattern=null;
+ private String outbest=null;
+ private String outdupe=null;
+ private String outcsf=null;
+ private String outgraph=null;
+ private int maxNs=-1;
+ private long maxReads=-1;
+ public boolean errorState=false;
+ boolean sort=false;
+ boolean ascending=true;
+ boolean absorbContainment=true;
+ boolean absorbMatch=true;
+ boolean findOverlaps=false;
+ boolean makeClusters=false;
+ boolean processClusters=false;
+ boolean renameClusters=false;
+ boolean absorbOverlap=false;
+ boolean storeSuffix=false;
+ boolean storeName=true;
+ boolean storeQuality=true;
+ boolean exact=true; // In makeOverlapReverseRC: true counts every differing symbol; false counts only differences between fully defined symbols.
+ boolean uniqueNames=true;
+ boolean maxSpanningTree=false;
+
+ boolean canonicizeClusters=true;
+ boolean removeCycles=true;
+ boolean fixMultiJoins=true;
+ boolean fixCanonContradictions=false;
+ boolean fixOffsetContradictions=false;
+ boolean countTransitive=false;
+ boolean countRedundant=false;
+
+ private boolean multipleInputFiles=false;
+ private boolean rigorousTransitive=false;
+ private int numAffixMaps=1;
+ private int maxAffixCopies=2000000000;
+ private int maxEdges=2000000000;
+ private int maxEdges2=2000000000;
+ private boolean allowAllContainedOverlaps=false; // Consulted by makeOverlapReverseRC when deciding whether the minimum-overlap filters apply.
+// private boolean toUpperCase=false;
+
+ /** Trim left bases of the read to this position (exclusive, 0-based) */
+ private int forceTrimLeft=-1;
+ /** Trim right bases of the read after this position (exclusive, 0-based) */
+ private int forceTrimRight=-1;
+
+ int maxEdits=0; // Edit budget passed to the BandedAligner in makeOverlapReverseRC; below 1 that method rejects instead of aligning.
+ int maxSubs=0; // Passed to calcMaxEdits(...) to derive the mismatch budget.
+ int bandwidth=9;
+ final boolean customBandwidth;
+ float minIdentity=100;
+ float minIdentityMult=0; // Passed to calcMaxEdits(...) together with maxSubs.
+ float minLengthPercent=0;
+ int minOverlapCluster=100; // makeOverlapReverseRC rejects overlaps shorter than this when its length filters apply.
+ int minOverlapMerge=1;
+ float minOverlapPercentCluster=0; // If >0: minimum overlap as a percentage of Tools.min(len1, len2) in makeOverlapReverseRC.
+ float minOverlapPercentMerge=0;
+
+ private int minClusterSize=1;
+ private int minClusterSizeForStats=1;
+ private boolean pickBestRepresentativePerCluster=false;
+ // Counters. NOTE(review): presumably run statistics - confirm where they are incremented.
+ long readsProcessed=0;
+ long basesProcessed=0;
+ long collisions=0;
+ long containments=0;
+ long baseContainments=0;
+ long containmentCollisions=0;
+ long matches=0;
+ long baseMatches=0;
+ long overlaps=0;
+ long baseOverlaps=0;
+ long overlapCollisions=0;
+ long addedToMain=0;
+
+ private final int subset; // Unit.inSet() keeps a unit when (code1&Long.MAX_VALUE)%subsetCount==subset.
+ private final int subsetCount; // Values below 2 make Unit.inSet() accept every unit.
+ private final boolean subsetMode;
+
+ private final int k;
+ private final int k2; // Early-exit boundary (j<k2) in makeOverlapReverseRC. NOTE(review): presumably k-1 - confirm in the constructor.
+ private final boolean EA;
+
+ private static int tcount=0;
+
+ private LinkedHashMap<Long, ArrayList<Unit>> codeMap=new LinkedHashMap<Long, ArrayList<Unit>>(4000000);
+ private HashMap<LongM, ArrayList<Unit>> affixMap1=null;
+ private HashMap<LongM, ArrayList<Unit>> affixMap2=null;
+ private HashMap<LongM, ArrayList<Unit>>[] affixMaps=null;
+ private ArrayDeque<ArrayList<Unit>> clusterQueue=null;
+ private ArrayList<ArrayList<Unit>> processedClusters=null;
+ private AtomicIntegerArray clusterNumbers=null;
+
+ private static final UnitOffsetComparator UNIT_OFFSET_COMPARATOR=new UnitOffsetComparator();
+ private static final ClusterLengthComparator CLUSTER_LENGTH_COMPARATOR=new ClusterLengthComparator();
+ private static final long[][] hashcodes=makeCodes2(32);
+ public static final byte[] baseToNumber=new byte[128]; // Filled by the static initializer: A=0, C=1, G=2, T/U=3; every other slot stays 0.
+ public static final byte[] baseToComplementNumber=new byte[128]; // Filled by the static initializer: A=3, C=2, G=1, T/U=0; every other slot stays 0.
+ public static final byte[] baseToComplementExtended=new byte[128]; // Copy of AminoAcid.baseToComplementExtended in which negative entries are replaced by the index itself.
+ public static final int nmerLength=4; // Length of the n-mers counted by makeNmerProfile.
+ public static final int[] nmerIndex=makeNmerIndex(nmerLength); // Maps each n-mer code to an id (see makeNmerIndex).
+ public static final int maxNmer=Tools.max(nmerIndex); // makeNmerProfile sizes its default count array as maxNmer+1.
+ private static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ public static boolean showSpeed=true;
+ public static boolean verbose=false; // Enables System.err tracing in makeOverlapReverseRC.
+ public static boolean ignoreReverseComplement=false; // When true, makeOverlapReverseRC returns null immediately.
+ public static boolean preventTransitiveOverlaps=false;
+ public static boolean ignoreAffix1=false;
+ public static boolean parseDepth=false;
+ public static boolean printLengthInEdges=false;
+ public static float depthRatio=2;
+ public static int MINSCAF=0;
+ public static int THREADS=Shared.threads();
+ public static int threadMaxReadsToBuffer=4000;
+ public static int threadMaxBasesToBuffer=32000000;
+ public static boolean DISPLAY_PROGRESS=true;
+ public static boolean UNIQUE_ONLY=false;
+ public static boolean REQUIRE_MATCHING_NAMES=false;
+ public static boolean NUMBER_GRAPH_NODES=true;
+ public static boolean ADD_PAIRNUM_TO_NAME=true;
+ // Overlap type codes; reverseType maps FORWARD<->REVERSE and FORWARDRC<->REVERSERC by adding 2 modulo 4.
+ private static int reverseType(int type){return (type+2)%4;}
+ public static final int FORWARD=0;
+ public static final int FORWARDRC=1;
+ public static final int REVERSE=2;
+ public static final int REVERSERC=3;
+ public static final String[] OVERLAP_TYPE_NAMES=new String[] {"FORWARD", "FORWARDRC", "REVERSE", "REVERSERC"}; // Indexed by the type codes above.
+ public static final String[] OVERLAP_TYPE_ABBREVIATIONS=new String[] {"F", "FRC", "R", "RRC"}; // Indexed by the type codes above.
+
+ static{//All others are 0
+ baseToNumber['A']=baseToNumber['a']=0;
+ baseToNumber['C']=baseToNumber['c']=1;
+ baseToNumber['G']=baseToNumber['g']=2;
+ baseToNumber['T']=baseToNumber['t']=3;
+ baseToNumber['U']=baseToNumber['u']=3;
+
+ baseToComplementNumber['A']=baseToComplementNumber['a']=3;
+ baseToComplementNumber['C']=baseToComplementNumber['c']=2;
+ baseToComplementNumber['G']=baseToComplementNumber['g']=1;
+ baseToComplementNumber['T']=baseToComplementNumber['t']=0;
+ baseToComplementNumber['U']=baseToComplementNumber['u']=0;
+
+ for(int i=0; i<AminoAcid.baseToComplementExtended.length; i++){
+ byte b=AminoAcid.baseToComplementExtended[i];
+ baseToComplementExtended[i]=(b<0 ? (byte)i : b);
+ }
+ }
+
+}
diff --git a/current/jgi/Dedupe2.java b/current/jgi/Dedupe2.java
new file mode 100755
index 0000000..4c47768
--- /dev/null
+++ b/current/jgi/Dedupe2.java
@@ -0,0 +1,5663 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.io.Serializable;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.PriorityQueue;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+import stream.ConcurrentCollectionReadInputStream;
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.FastqReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import align2.BandedAligner;
+import align2.ListNum;
+import align2.LongM;
+import align2.ReadLengthComparator;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 18, 2013
+ *
+ */
+public final class Dedupe2 {
+
+ public static void main(String[] args){
+
+ if(args==null || args.length==0 || (args.length==1 &&
+ (args[0].equalsIgnoreCase("-h") || args[0].equals("-help") || args[0].equals("--help") || args[0].equals("-?") || args[0].equals("?")))){
+ printOptions();
+ System.exit(0);
+ }
+ Dedupe2 dd=new Dedupe2(args);
+ dd.process();
+ }
+
+ private static void printOptions(){ // Prints a one-line pointer to the shellscript on System.err; the detailed help below is disabled.
+ System.err.println("Please consult the shellscript for usage information.");
+// outstream.println("Syntax:\n");
+// outstream.println("\njava -ea -Xmx106g -cp <path> jgi.Dedupe2 <input file> <output file>");
+// outstream.println("\nOptional flags:");
+// outstream.println("in=<file> \tInput file. 'in=stdin' will pipe from standard in.");
+// outstream.println("out=<file> \tOutput file. 'out=stdout' will pipe to standard out.");
+// outstream.println("dot=<file> \tOutput a dot-format overlap graph to this file.");
+// outstream.println("pattern=<file> \tClusters will be written to individual files, where the '%' symbol in the pattern is replaced by cluster number.");
+// outstream.println("");
+// outstream.println("threads=auto \t(t) Set number of threads to use; default is number of logical processors.");
+// outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file.");
+// outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed.");
+// outstream.println("minscaf=0 \t(ms) Ignore contigs/scaffolds shorter than this.");
+// outstream.println("interleaved=auto \tIf true, forces fastq input to be paired and interleaved.");
+//
+// outstream.println("absorbrc=t \t(arc) Absorb reverse-complements as well as normal orientation.");
+// outstream.println("absorbmatch=t \t(am) Absorb exact matches of contigs.");
+// outstream.println("absorbcontainment=t\t(ac) Absorb full containments of contigs.");
+// outstream.println("absorboverlap=f \t(ao) Absorb (merge) non-contained overlaps of contigs.");
+//
+// outstream.println("numaffixmaps=1 \t(nam) Set to 2 to index two prefixes and suffixes per contig.");
+// outstream.println("ignoreaffix1=f \t(ia1) Ignore first affix (for testing).");
+// outstream.println("storesuffix=f \t(ssx) Store suffix as well as prefix. Automatically set to true when doing inexact matches.");
+//
+// outstream.println("findoverlap=f \t(fo) Find overlaps between contigs (containments and non-containments).");
+// outstream.println("cluster=f \t(c) Group overlapping contigs into clusters.");
+// outstream.println("fixmultijoins=t \t(fmj) Remove redundant overlaps between the same two contigs.");
+// outstream.println("removecycles=t \t(rc) Remove all cycles so clusters form trees.");
+// outstream.println("renameclusters=f \t(rnc) Rename contigs to indicate which cluster they are in.");
+// outstream.println("minclustersize=1 \t(mcs) Don't output clusters smaller than this.");
+// outstream.println("pbr=f \t(pickbestrepresentative) Only output the single highest-quality read per cluster.");
+// outstream.println("cc=t \t(canonicizeclusters) Flip contigs so clusters have a single orientation.");
+// outstream.println("fcc=f \t(fixcanoncontradictions) Truncate graph at nodes with canonization disputes.");
+// outstream.println("foc=f \t(fixoffsetcontradictions) Truncate graph at nodes with offset disputes.");
+// outstream.println("pto=f \t(preventtransitiveoverlaps) To not look for new edges between nodes in the same cluster.");
+//
+// outstream.println("storename=t \t(sn) Store contig names (set false to save memory).");
+// outstream.println("storequality=t \t(sq) Store quality values for fastq assemblies (set false to save memory).");
+// outstream.println("exact=t \t(ex) Only allow exact symbol matches. When false, an 'N' will match any symbol.");
+// outstream.println("touppercase=f \t(tuc) Change all input bases to upper case.");
+// outstream.println("uniquenames=t \t(un) Ensure all output contigs have unique names. Uses more memory.");
+// outstream.println("maxsubs=0 \t(s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits.");
+// outstream.println("maxedits=0 \t(e) Allow up to this many edits (subs or indels). Higher is slower, so below 20 is suggested.");
+// //outstream.println("bandwidth=9 \t(bw) Width of banded alignment, if maxedits>0. To ensure correctness, set bandwidth=2*maxedits+1. Higher is slower.");
+// outstream.println("minidentity=100 \t(mid) Allow inter-sequence identity as low as this (subs only, no indels).");
+// outstream.println("k=31 \tKmer length used for finding containments. Containments shorter than k will not be found.");
+// outstream.println("minlengthpercent=0 \t(mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed.");
+// outstream.println("minoverlappercent=0\t(mop) Overlap must be at least this percent of smaller contig's length to cluster and merge.");
+// outstream.println("minoverlap=200 \t(mo) Overlap must be at least this long to cluster and merge.");
+//
+// outstream.println("mopc=0 \t(minoverlappercentcluster) Overlap must be at least this percent of smaller contig's length to cluster.");
+// outstream.println("mopm=0 \t(minoverlappercentmerge) Overlap must be at least this percent of smaller contig's length to merge.");
+// outstream.println("moc=200 \t(minoverlapcluster) Overlap must be at least this long to cluster.");
+// outstream.println("mom=200 \t(minoverlapmerge) Overlap must be at least this long to merge.");
+// outstream.println("rt=f \t(rigoroustransitive) Ensure exact transitivity. Slow. For testing only.");
+//
+// outstream.println("ziplevel=2 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.");
+// outstream.println("sort=f \tsort output by contig length (otherwise it will be random).\n" +
+// " \t'a' for ascending, 'd' for descending, 'f' for false (no sorting).");
+// outstream.println("");
+// outstream.println("Note! When allowing inexact alignments, if maxsubs is less than maxedits, maxsubs is set to maxedits.");
+// outstream.println("If maxsubs and minidentity yield different numbers for some contig, the more liberal is used for substitutions.");
+// outstream.println("For indels, minidentity is ignored and maxedits is always used (due to time and memory constraints).");
+// outstream.println("Regardless of maxsubs, maxedits, or minidentity, no comparison will be made between two sequences unless ");
+// outstream.println("one contains the first or last k bases of the other, exactly, with no edits.");
+ // NOTE(review): the disabled help text above is reference material only; none of it is printed, and defaults quoted in it are unverified.
+ }
+
+ public Dedupe2(String[] args){
+ for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}}
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ ReadWrite.ZIPLEVEL=2;
+ //ReadWrite.USE_UNPIGZ=true;
+
+
+ Parser parser=new Parser();
+ boolean setOut=false, setMcsfs=false;
+ int bandwidth_=-1;
+ int k_=31;
+ int subset_=0, subsetCount_=1;
+
+ {
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+ ReadWrite.ZIP_THREAD_DIVISOR=1;
+ Read.TO_UPPER_CASE=true;
+
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ if(b.indexOf(',')>=0 && !new File(b).exists()){
+ in1=b.split(",");
+ }else{
+ in1=new String[] {b};
+ }
+ }else if(a.equals("in2")){
+ if(b.indexOf(',')>=0 && !new File(b).exists()){
+ in2=b.split(",");
+ }else{
+ in2=new String[] {b};
+ }
+ }else if(a.equals("out")){
+ out=b;
+ setOut=true;
+ }else if(a.equals("clusterfilepattern") || a.equals("pattern")){
+ clusterFilePattern=b;
+ assert(clusterFilePattern==null || clusterFilePattern.contains("%")) : "pattern must contain the % symbol.";
+ }else if(a.equals("outbest")){
+ outbest=b;
+ }else if(a.equals("outd") || a.equals("outduplicate")){
+ outdupe=b;
+ }else if(a.equals("csf") || a.equals("clusterstatsfile")){
+ outcsf=b;
+ }else if(a.equals("dot") || a.equals("graph") || a.equals("outdot") || a.equals("outgraph")){
+ outgraph=b;
+ }else if(a.equals("mcsfs") || a.equals("minclustersizeforstats")){
+ minClusterSizeForStats=Integer.parseInt(b);
+ }else if(a.equals("mcs") || a.equals("minclustersize")){
+ minClusterSize=Integer.parseInt(b);
+ if(!setMcsfs){
+ minClusterSizeForStats=minClusterSize;
+ }
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("sort")){
+ if(b==null){sort=true;}
+ else if(b.equalsIgnoreCase("a")){
+ sort=true;
+ ascending=true;
+ }else if(b.equalsIgnoreCase("d")){
+ sort=true;
+ ascending=false;
+ }else{
+ sort=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("arc") || a.equals("absorbrc") || a.equals("trc") || a.equals("testrc")){
+ ignoreReverseComplement=!Tools.parseBoolean(b);
+ }else if(a.equals("ac") || a.equals("absorbcontainment") || a.equals("absorbcontainments") || a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){
+ absorbContainment=Tools.parseBoolean(b);
+ }else if(a.equals("am") || a.equals("absorbmatch") || a.equals("absorbmatches") || a.equals("tm") || a.equals("testmatch")){
+ absorbMatch=Tools.parseBoolean(b);
+ }else if(a.equals("ao") || a.equals("absorboverlap") || a.equals("absorboverlaps") || a.equals("to") || a.equals("testoverlap")){
+ absorbOverlap=Tools.parseBoolean(b);
+ }else if(a.equals("fo") || a.equals("findoverlap") || a.equals("findoverlaps")){
+ findOverlaps=Tools.parseBoolean(b);
+ }else if(a.equals("c") || a.equals("cluster") || a.equals("clusters")){
+ makeClusters=Tools.parseBoolean(b);
+ }else if(a.equals("fmj") || a.equals("fixmultijoin") || a.equals("fixmultijoins")){
+ fixMultiJoins=Tools.parseBoolean(b);
+ }else if(a.equals("fcc") || a.equals("fixcanoncontradiction") || a.equals("fixcanoncontradictions")){
+ fixCanonContradictions=Tools.parseBoolean(b);
+ }else if(a.equals("foc") || a.equals("fixoffsetcontradiction") || a.equals("fixoffsetcontradictions")){
+ fixOffsetContradictions=Tools.parseBoolean(b);
+ }else if(a.equals("pto") || a.equals("preventtransitiveoverlap") || a.equals("preventtransitiveoverlaps")){
+ preventTransitiveOverlaps=Tools.parseBoolean(b);
+ }else if(a.equals("pbr") || a.equals("pickbestrepresentative")){
+ pickBestRepresentativePerCluster=Tools.parseBoolean(b);
+ }else if(a.equals("mst") || a.equals("maxspanningtree")){
+ maxSpanningTree=Tools.parseBoolean(b);
+ }else if(a.equals("cc") || a.equals("canonicizecluster") || a.equals("canonicizeclusters")){
+ canonicizeClusters=Tools.parseBoolean(b);
+ }else if(a.equals("pc") || a.equals("processcluster") || a.equals("processclusters")){
+ processClusters=Tools.parseBoolean(b);
+ }else if(a.equals("rnc") || a.equals("renamecluster") || a.equals("renameclusters")){
+ renameClusters=Tools.parseBoolean(b);
+ if(renameClusters){storeName=false;}
+ }else if(a.equals("rc") || a.equals("removecycles") || a.equals("removecycle")){
+ removeCycles=Tools.parseBoolean(b);
+ }else if(a.equals("uo") || a.equals("uniqueonly")){
+ UNIQUE_ONLY=Tools.parseBoolean(b);
+ }else if(a.equals("rmn") || a.equals("requirematchingnames")){
+ REQUIRE_MATCHING_NAMES=Tools.parseBoolean(b);
+ }else if(a.equals("ngn") || a.equals("numbergraphnodes")){
+ NUMBER_GRAPH_NODES=Tools.parseBoolean(b);
+ }else if(a.equals("addpairnum")){
+ ADD_PAIRNUM_TO_NAME=Tools.parseBoolean(b);
+ }else if(a.equals("pn") || a.equals("prefixname")){
+// PREFIX_NAME=Tools.parseBoolean(b);
+ }else if(a.equals("k")){
+ k_=Integer.parseInt(b);
+ assert(k_>0 && k_<32) : "k must be between 1 and 31; default is 31, and lower values are slower.";
+ }else if(a.equals("minscaf") || a.equals("ms")){
+ MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
+ }else if(a.equals("mlp") || a.equals("minlengthpercent")){
+ minLengthPercent=Float.parseFloat(b);
+ }else if(a.equals("mop") || a.equals("minoverlappercent")){
+ minOverlapPercentCluster=minOverlapPercentMerge=Float.parseFloat(b);
+ }else if(a.equals("mopc") || a.equals("minoverlappercentcluster")){
+ minOverlapPercentCluster=Float.parseFloat(b);
+ }else if(a.equals("mopm") || a.equals("minoverlappercentmerge")){
+ minOverlapPercentMerge=Float.parseFloat(b);
+ }else if(a.equals("mo") || a.equals("minoverlap")){
+ minOverlapCluster=minOverlapMerge=Integer.parseInt(b);
+ }else if(a.equals("moc") || a.equals("minoverlapcluster")){
+ minOverlapCluster=Integer.parseInt(b);
+ }else if(a.equals("mom") || a.equals("minoverlapmerge")){
+ minOverlapMerge=Integer.parseInt(b);
+ }else if(a.equals("rt") || a.equals("rigoroustransitive")){
+ rigorousTransitive=Tools.parseBoolean(b);
+ }else if(a.equals("e") || a.equals("maxedits") || a.equals("edits") || a.equals("edist")){
+ maxEdits=Integer.parseInt(b);
+ }else if(a.equals("s") || a.equals("maxsubs") || a.equals("maxsubstitutions") || a.equals("hdist")){
+ maxSubs=Integer.parseInt(b);
+ }else if(a.equals("bw") || a.equals("bandwidth")){
+ bandwidth_=Integer.parseInt(b);
+ }else if(a.equals("mid") || a.equals("minidentity")){
+ minIdentity=Float.parseFloat(b);
+ minIdentityMult=(minIdentity==100f ? 0 : (100f-minIdentity)/100f);
+ }else if(a.equals("threads") || a.equals("t")){
+ THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+// BandedAligner.verbose=verbose;
+ }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){
+ maxNs=Integer.parseInt(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("sn") || a.equals("storename") || a.equals("storenames") || a.equals("keepnames")){
+ storeName=Tools.parseBoolean(b);
+ }else if(a.equals("ssx") || a.equals("storesuffix") || a.equals("storesuffixes")){
+ storeSuffix=Tools.parseBoolean(b);
+ }else if(a.equals("numaffixmaps") || a.equals("nam")){
+ numAffixMaps=Integer.parseInt(b);
+ }else if(a.equals("mac") || a.equals("maxaffixcopies")){
+ maxAffixCopies=Integer.parseInt(b);
+ }else if(a.equals("me") || a.equals("maxedges")){
+ maxEdges=Integer.parseInt(b);
+ maxEdges2=maxEdges*2;
+ if(maxEdges2<1){maxEdges2=Integer.MAX_VALUE-1;}
+ }else if(a.equals("ignoreaffix1") || a.equals("ia1")){
+ ignoreAffix1=Tools.parseBoolean(b);
+ }else if(a.equals("parsedepth") || a.equals("pd")){
+ parseDepth=Tools.parseBoolean(b);
+ }else if(a.equals("printlengthinedges") || a.equals("ple")){
+ printLengthInEdges=Tools.parseBoolean(b);
+ }else if(a.equals("depthmult") || a.equals("depthratio") || a.equals("dr")){
+ depthRatio=Float.parseFloat(b);
+ if(depthRatio<=0){
+ parseDepth=false;
+ }else{
+ parseDepth=true;
+ assert(depthRatio>0);
+ if(depthRatio<1){depthRatio=1/depthRatio;}
+ }
+ }else if(a.equals("storequality") || a.equals("sq")){
+ storeQuality=Tools.parseBoolean(b);
+ }else if(a.equals("exact") || a.equals("ex")){
+ exact=Tools.parseBoolean(b);
+ }else if(a.equals("uniquenames") || a.equals("un")){
+ uniqueNames=Tools.parseBoolean(b);
+ }else if(a.equals("ftl") || a.equals("forcetrimleft")){
+ forceTrimLeft=Integer.parseInt(b);
+ }else if(a.equals("ftr") || a.equals("forcetrimright")){
+ forceTrimRight=Integer.parseInt(b);
+ }else if(a.equals("subset") || a.equals("sst")){
+ subset_=Integer.parseInt(b);
+ }else if(a.equals("subsets") || a.equals("subsetcount") || a.equals("sstc")){
+ subsetCount_=Integer.parseInt(b);
+ }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ String c=args[i];
+ if(c.indexOf(',')>=0 && !new File(c).exists()){
+ in1=c.split(",");
+ }else{
+ in1=new String[] {c};
+ }
+ }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ out=args[i];
+ setOut=true;
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ if(verbose){
+ ReadWrite.verbose=ConcurrentGenericReadInputStream.verbose=ConcurrentReadOutputStream.verbose=ByteFile1.verbose=ByteFile2.verbose=FastqReadInputStream.verbose=true;
+ }
+// verbose=false;
+
+ k=k_;
+ k2=k-1;
+ subset=subset_;
+ subsetCount=subsetCount_;
+ subsetMode=subsetCount>1;
+ assert(subset>=0 && subset<subsetCount) : "subset="+subset+", subsetCount="+subsetCount;
+
+ BandedAligner.penalizeOffCenter=true;
+
+ if(maxSpanningTree){removeCycles=fixMultiJoins=false;}
+ if(absorbOverlap){processClusters=true;}
+ if(processClusters || renameClusters || maxSpanningTree){makeClusters=true;}
+ if(makeClusters){findOverlaps=true;}
+ if(renameClusters){uniqueNames=/*storeName=*/false;}
+
+ if(bandwidth_>-1){
+ bandwidth=Tools.min(bandwidth_, 2*maxEdits+1);
+ customBandwidth=(bandwidth<2*maxEdits+1);
+ }else{
+ bandwidth=2*maxEdits+1;
+ customBandwidth=false;
+ }
+ maxSubs=Tools.max(maxSubs, maxEdits);
+ if(maxSubs>0 || minIdentity<100 || findOverlaps){storeSuffix=true;}
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ for(int i=0; i<in1.length; i++){
+ if(in1[i].equalsIgnoreCase("stdin") && !new File(in1[i]).exists()){in1[i]="stdin.fa";}
+ }
+
+// assert(false) : Arrays.toString(in);
+
+// if(!setOut && clusterFilePattern==null){out="stdout.fa";}
+// else
+// if("stdout".equalsIgnoreCase(out) || "standarddout".equalsIgnoreCase(out)){
+// out="stdout.fa";
+// outstream=System.err;
+// }
+ if(!Tools.canWrite(out, overwrite)){throw new RuntimeException("Output file "+out+" already exists, and overwrite="+overwrite);}
+
+ for(int i=0; i<in1.length; i++){
+ assert(!in1[i].equalsIgnoreCase(out));
+ }
+// assert(false) : "\nabsorbContainment="+absorbContainment+", findOverlaps="+findOverlaps+", absorbOverlap="+absorbOverlap+"\n"+
+// "processClusters="+processClusters+", renameClusters="+renameClusters+", makeClusters="+makeClusters+", uniqueNames="+uniqueNames+", storeName="+storeName;
+ if(absorbContainment || findOverlaps){
+// assert(false);
+ affixMaps=new HashMap[numAffixMaps];
+ for(int i=0; i<numAffixMaps; i++){
+ affixMaps[i]=new HashMap<LongM, ArrayList<Unit>>(4000000);
+ }
+ }
+// assert(false) : absorbContainment+", "+(affixMap==null);
+
+ if(outdupe==null){
+ dupeWriter=null;
+ }else{
+ FileFormat ff=FileFormat.testOutput(outdupe, FileFormat.FASTA, null, true, overwrite, append, false);
+ dupeWriter=new ByteStreamWriter(ff);
+ }
+ }
+
+ /**
+  * Top-level entry point: runs {@link #process2()} and reports throughput.
+  *
+  * The static settings FASTQ.DETECT_QUALITY, FASTQ.TEST_INTERLEAVED and
+  * Shared.READ_BUFFER_LENGTH are saved before the run and restored afterwards.
+  * Shared.READ_BUFFER_LENGTH is set to 16 for the duration of the run.
+  * The restore happens in a finally block so that an exception thrown by
+  * process2() cannot leave the shared static state modified.
+  *
+  * @throws RuntimeException if errorState was set during processing
+  */
+ public void process(){
+
+ 	Timer t=new Timer();
+
+ 	//Snapshot of static settings that must be restored when this run ends.
+ 	final boolean dq0=FASTQ.DETECT_QUALITY;
+ 	final boolean ti0=FASTQ.TEST_INTERLEAVED;
+ 	final int rbl0=Shared.READ_BUFFER_LENGTH;
+//	FASTQ.DETECT_QUALITY=false;
+//	FASTQ.TEST_INTERLEAVED=false;
+ 	Shared.READ_BUFFER_LENGTH=16;
+
+ 	try{
+ 		process2();
+ 	}finally{
+ 		//Always restore, even when process2() throws.
+ 		FASTQ.DETECT_QUALITY=dq0;
+ 		FASTQ.TEST_INTERLEAVED=ti0;
+ 		Shared.READ_BUFFER_LENGTH=rbl0;
+ 	}
+
+ 	t.stop();
+
+ 	//Rates per unit of t.elapsed; presumably nanoseconds, judging by the variable names - confirm against Timer.
+ 	double rpnano=readsProcessed/(double)(t.elapsed);
+ 	double bpnano=basesProcessed/(double)(t.elapsed);
+
+ 	//Abbreviate large counts with a k or m suffix.
+ 	String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ 	String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ 	//Left-pad to 8 characters so the two columns line up.
+ 	while(rpstring.length()<8){rpstring=" "+rpstring;}
+ 	while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ 	if(showSpeed){
+ 		outstream.println("Time: \t\t\t"+t);
+ 		outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ 		outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ 	}
+
+ 	if(errorState){
+ 		throw new RuntimeException("Dedupe2 terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+ /**
+  * Runs the deduplication phases in order and prints a summary to outstream.
+  * Phases: exact matches (always); containments (if absorbContainment);
+  * overlaps (if findOverlaps), optionally followed by maximum-spanning-tree
+  * conversion and cluster processing; finally output writing when at least
+  * one output destination is set.
+  * The percentage calculations below are not guarded against readsProcessed
+  * or basesProcessed being zero.
+  */
+ public void process2(){
+ if(dupeWriter!=null){dupeWriter.start();}
+// assert(false) : out;
+ Timer t=new Timer();
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Initial:");
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ processMatches(t);
+
+ // NOTE(review): force-trim is switched off after the first pass, presumably so later passes do not trim again - confirm.
+ forceTrimLeft=forceTrimRight=-1;
+
+ if(absorbContainment){
+ processContainments(t);
+ }
+
+ if(dupeWriter!=null){dupeWriter.poisonAndWait();}
+
+ if(findOverlaps){
+ findOverlaps(t);
+
+ killAffixMaps();
+
+ // codeMap is dropped once clusters are the working representation.
+ if(processClusters || renameClusters || maxSpanningTree){codeMap=null;}
+
+ if(maxSpanningTree){
+ processMst(t);
+ }
+
+ if(processClusters){
+ processClusters(t, absorbOverlap);
+ }
+// if(renameClusters){
+// renameClusters(t);
+// }
+// assert(false) : (codeMap==null)+", "+(affixMap1==null)+", "+processedClusters;
+ }
+
+ outstream.println("Input: \t"+readsProcessed+" reads \t\t"+basesProcessed+" bases.");
+
+ if(absorbMatch){
+ outstream.println("Duplicates: \t"+matches+" reads ("+String.format("%.2f",matches*100.0/readsProcessed)+"%) \t"+
+ baseMatches+" bases ("+String.format("%.2f",baseMatches*100.0/basesProcessed)+"%) \t"+collisions+" collisions.");
+ }
+ if(absorbContainment){
+ outstream.println("Containments: \t"+containments+" reads ("+String.format("%.2f",containments*100.0/readsProcessed)+"%) \t"+
+ baseContainments+" bases ("+String.format("%.2f",baseContainments*100.0/basesProcessed)+"%) \t"+containmentCollisions+" collisions.");
+ }
+ if(findOverlaps){
+ outstream.println("Overlaps: \t"+overlaps+" reads ("+String.format("%.2f",overlaps*100.0/readsProcessed)+"%) \t"+
+ baseOverlaps+" bases ("+String.format("%.2f",baseOverlaps*100.0/basesProcessed)+"%) \t"+overlapCollisions+" collisions.");
+ }
+// outstream.println("Result: \t"+(addedToMain-containments)+" reads \t\t"+(basesProcessed-baseMatches-baseContainments)+" bases.");
+
+ // Surviving reads/bases: everything added to the main map minus containments
+ // (or, in UNIQUE_ONLY mode, input minus matches and containments).
+ long outReads=(addedToMain-containments);
+ if(UNIQUE_ONLY){outReads=readsProcessed-matches-containments;}
+ long outBases=(basesProcessed-baseMatches-baseContainments);
+ outstream.println("Result: \t"+outReads+" reads ("+String.format("%.2f",outReads*100.0/readsProcessed)+"%) \t"+
+ outBases+" bases ("+String.format("%.2f",outBases*100.0/basesProcessed)+"%)");
+
+ outstream.println("");
+
+ if(out!=null || clusterFilePattern!=null || outbest!=null || outgraph!=null || outcsf!=null){
+ writeOutput(outcsf, t);
+ }
+
+ }
+
+ /**
+  * Empties and releases every affix map, then drops the array itself.
+  * Does nothing when the array has already been released.
+  */
+ private void killAffixMaps(){
+ 	if(affixMaps!=null){
+ 		int idx=0;
+ 		while(idx<numAffixMaps){
+ 			if(affixMaps[idx]!=null){
+ 				affixMaps[idx].clear();
+ 				affixMaps[idx]=null;
+ 			}
+ 			idx++;
+ 		}
+ 		affixMaps=null;
+ 	}
+ }
+
+ /**
+  * Creates and starts the read input streams for one processing pass.
+  * If list is non-null, a single stream over that in-memory list is returned.
+  * Otherwise one stream is opened per entry of in1 (with the matching in2
+  * entry as the second file when present).
+  * Side effects in the file-based branch: sets multipleInputFiles, and if any
+  * stream is paired, forces THREADS to 1 and disables absorbContainment.
+  * @param list Reads to stream from memory, or null to read from the input files
+  * @return Array of started input streams
+  */
+ private ConcurrentReadInputStream[] makeCrisArray(ArrayList<Read> list){
+ final ConcurrentReadInputStream[] array;
+
+ if(list!=null){
+ array=new ConcurrentReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)};
+ array[0].start(); //This deadlocks if ConcurrentReadInputStream extends Thread rather than spawning a new thread.
+ }else{
+ array=new ConcurrentReadInputStream[in1.length];
+ multipleInputFiles=array.length>1;
+ for(int i=0; i<in1.length; i++){
+ if(verbose){System.err.println("Creating cris for "+in1[i]);}
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(in1[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true);
+ // in2 may be absent or shorter than in1; missing entries mean no second file for this input.
+ FileFormat ff2=(in2==null || in2.length<=i ? null : FileFormat.testInput(in2[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true));
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2);
+ cris.start();
+ if(cris.paired()){
+ THREADS=1;//Temp fix for losing reads when multithreaded and paired
+ if(absorbContainment){
+ System.err.println("Set absorbContainment to false because it is not currently supported for paired reads.");
+ absorbContainment=false;
+ }
+ }
+ }
+ array[i]=cris;
+ }
+ }
+ return array;
+ }
+
+ /**
+  * First pass: streams the input files through THREADS HashThreads and
+  * accumulates their per-thread counters into this object's totals.
+  * Afterwards the input streams are closed (any failure sets errorState)
+  * and, if DISPLAY_PROGRESS is set, a progress report is printed.
+  * @param t Timer used for the progress report; stopped and restarted here
+  */
+ private void processMatches(Timer t){
+ 	crisa=makeCrisArray(null);
+
+ 	ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
+ 	for(int i=0; i<THREADS; i++){alht.add(new HashThread(true, (absorbContainment|findOverlaps), absorbMatch, false, false));}
+ 	for(HashThread ht : alht){ht.start();}
+ 	for(HashThread ht : alht){
+ 		//Keep joining until the thread has really terminated, even if interrupted.
+ 		while(ht.getState()!=Thread.State.TERMINATED){
+ 			try {
+ 				ht.join();
+ 			} catch (InterruptedException e) {
+ 				e.printStackTrace();
+ 			}
+ 		}
+ 		matches+=ht.matchesT;
+ 		collisions+=ht.collisionsT;
+ 		containments+=ht.containmentsT;
+ 		containmentCollisions+=ht.containmentCollisionsT;
+ 		baseContainments+=ht.baseContainmentsT;
+ 		baseMatches+=ht.baseMatchesT;
+ 		addedToMain+=ht.addedToMainT;
+ 		readsProcessed+=ht.readsProcessedT;
+ 		basesProcessed+=ht.basesProcessedT;
+ 	}
+ 	alht.clear();
+
+ 	if(verbose){System.err.println("Attempting to close input streams (1).");}
+ 	for(ConcurrentReadInputStream cris : crisa){
+ 		errorState|=ReadWrite.closeStream(cris);
+ 	}
+ 	crisa=null;
+
+ 	if(DISPLAY_PROGRESS){
+ 		t.stop();
+ 		outstream.println("Found "+matches+" duplicates.");
+ 		outstream.println("Finished exact matches. Time: "+t);
+ 		Shared.printMemory();
+ 		//affixMaps is only allocated when containment or overlap detection is enabled,
+ 		//so it must be null-checked before being dereferenced here.
+ 		if(verbose && affixMaps!=null){outstream.println(affixMaps[0]);}
+ 		outstream.println();
+ 		t.start();
+ 	}
+ }
+
+ /**
+  * Containment pass: collects the valid first-of-pair reads from codeMap,
+  * runs them (or, in subset mode, the input files again) through THREADS
+  * HashThreads, and accumulates the containment counters.
+  * Afterwards the affix maps are released unless overlaps will be searched,
+  * and invalidated entries are pruned via removeInvalid.
+  * @param t Timer used for progress reports; stopped and restarted here
+  */
+ private void processContainments(Timer t){
+ ArrayList<Read> list=new ArrayList<Read>((int)addedToMain);
+ for(ArrayList<Unit> alu : codeMap.values()){
+ for(Unit u : alu){
+ assert(u.r.mate==null) : "Containments are not currently supported with paired reads.";
+ if(u.valid() && u.r.pairnum()==0){list.add(u.r);}
+ }
+ }
+
+ // if(minLengthPercent>0){
+ // if(verbose){System.err.println("Sorting.");}
+ // Collections.sort(list, ReadLengthComparator.comparator);
+ // Collections.reverse(list);
+ // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) :
+ // list.get(0).length()+", "+list.get(list.size()-1).length();
+ // }
+
+ // In subset mode the streams are created from the input files rather than the in-memory list.
+ crisa=makeCrisArray(subsetMode ? null : list);
+
+ ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, true, false));}
+
+ for(HashThread ht : alht){ht.start();}
+ for(HashThread ht : alht){
+ // Keep joining until the thread has really terminated, even if interrupted.
+ while(ht.getState()!=Thread.State.TERMINATED){
+ try {
+ ht.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ // This pass must not produce any exact-match statistics.
+ assert(ht.matchesT==0);
+ assert(ht.collisionsT==0);
+ assert(ht.baseMatchesT==0);
+ assert(ht.addedToMainT==0);
+// assert(ht.readsProcessedT==0);
+// assert(ht.basesProcessedT==0);
+ // matches+=ht.matchesT;
+ // collisions+=ht.collisionsT;
+ containments+=ht.containmentsT;
+ containmentCollisions+=ht.containmentCollisionsT;
+ baseContainments+=ht.baseContainmentsT;
+ // baseMatches+=ht.baseMatchesT;
+ // addedToMain+=ht.addedToMainT;
+ // readsProcessed+=ht.readsProcessedT;
+ // basesProcessed+=ht.basesProcessedT;
+ }
+ alht.clear();
+ if(verbose){System.err.println("Attempting to close input streams (2).");}
+ for(ConcurrentReadInputStream cris : crisa){
+ errorState|=ReadWrite.closeStream(cris);
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Found "+containments+" contained sequences.");
+ outstream.println("Finished containment. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ crisa=null;
+ // The affix maps are still needed if the overlap pass will run.
+ if(!findOverlaps){
+ killAffixMaps();
+ }
+
+ long x=removeInvalid(list);
+ list.clear();
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Removed "+x+" invalid entries.");
+ outstream.println("Finished invalid removal. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ /**
+  * Overlap pass: assigns a unitID to every unit in codeMap, runs THREADS
+  * HashThreads to detect overlaps, accumulates the overlap counters, and,
+  * if makeClusters is set, builds clusters from the resulting overlap graph.
+  * @param t Timer used for progress reports; stopped and restarted here
+  */
+ private void findOverlaps(Timer t){
+
+ ArrayList<Read> list=new ArrayList<Read>((int)addedToMain);
+ for(ArrayList<Unit> alu : codeMap.values()){
+ for(Unit u : alu){
+ if(u.valid() && u.r.pairnum()==0){
+ // unitID is the read's index in list; a mate shares its partner's ID.
+ u.unitID=list.size();
+ list.add(u.r);
+ if(u.r.mate!=null){
+ Unit u2=(Unit)u.r.mate.obj;
+ u2.unitID=u.unitID;
+ }
+ }else{
+ // Units that are not added to list get a sentinel ID.
+ u.unitID=Integer.MAX_VALUE;
+ }
+ }
+ }
+
+ if(preventTransitiveOverlaps){
+ // Initially every unit is in its own cluster, numbered by its unitID.
+ clusterNumbers=new AtomicIntegerArray(list.size());
+ for(int i=0; i<clusterNumbers.length(); i++){
+ clusterNumbers.set(i, i);
+ }
+ }
+
+ crisa=makeCrisArray(subsetMode ? null : list);
+
+ ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, false, true));}
+
+ for(HashThread ht : alht){ht.start();}
+ for(HashThread ht : alht){
+ // Keep joining until the thread has really terminated, even if interrupted.
+ while(ht.getState()!=Thread.State.TERMINATED){
+ try {
+ ht.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ // This pass must not produce any exact-match statistics.
+ assert(ht.matchesT==0);
+ assert(ht.collisionsT==0);
+ assert(ht.baseMatchesT==0);
+ assert(ht.addedToMainT==0);
+
+ overlaps+=ht.overlapsT;
+ baseOverlaps+=ht.baseOverlapsT;
+ overlapCollisions+=ht.overlapCollisionsT;
+ }
+ alht.clear();
+ if(verbose){System.err.println("Attempting to close input streams (3).");}
+ for(ConcurrentReadInputStream cris : crisa){
+ errorState|=ReadWrite.closeStream(cris);
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Found "+overlaps+" overlaps.");
+ outstream.println("Finished finding overlaps. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+
+ crisa=null;
+
+ if(makeClusters){
+ int intransitive=0, redundant=0;
+ // The two consistency checks are invoked inside assert statements, so they only run when assertions are enabled.
+ assert((intransitive=countIntransitive(t, list, rigorousTransitive))==0);
+ assert((redundant=countRedundant(t, list))==0);
+ // Local variable shadows the overlaps field; the value is not used afterwards, the call is made for its progress output.
+ long overlaps=countOverlaps(t, list);
+ assert(intransitive==0);
+ assert(redundant==0);
+// makeTransitive(t, list, rigorousTransitive);
+ if(clusterQueue==null){
+ clusterQueue=new ArrayDeque<ArrayList<Unit>>(list.size()/4+1);
+ processedClusters=new ArrayList<ArrayList<Unit>>();
+ }else{
+ assert(clusterQueue.isEmpty());
+ }
+ makeClusters(t, list);
+ }
+
+ list.clear();
+ }
+
+ /**
+  * Retired: fails an assertion immediately when assertions are enabled.
+  * For every overlap stored on a unit, ensures the unit on the other end of
+  * the overlap also lists it, adding it where missing.
+  * @param t Timer used for the progress report
+  * @param list Reads whose attached Units are examined
+  * @param rigorous If true, presence is tested with overlapList.contains(o);
+  * otherwise any overlap in the other unit's list that references this unit counts
+  * @return Number of overlaps added
+  */
+ private long makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous){
+ assert(false) : "No longer needed.";
+ long added=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid()){
+
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ // u2 is the unit at the other end of this overlap.
+ Unit u2=(o.u1==u ? o.u2 : o.u1);
+ assert(u2!=u);
+ if(u2.overlapList==null){
+ u2.overlapList=new ArrayList<Overlap>(2);
+ u2.overlapList.add(o);
+ }else{
+ boolean found=false;
+ if(rigorous){
+ found=u2.overlapList.contains(o);
+ }else{
+ for(Overlap o2 : u2.overlapList){
+ if(o2.u1==u || o2.u2==u){found=true; break;}
+ }
+ }
+ if(!found){
+ added++;
+ u2.overlapList.add(o);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Verification pass; only active when assertions are enabled.
+ for(Read r : list){
+ Unit u=(Unit) r.obj;
+ if(u.valid()){
+ assert(u.isTransitive());
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Added overlaps: "+added);
+ outstream.println("Made overlaps transitive. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ return added;
+ }
+
+ private int countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous){
+ if(!countTransitive){return 0;}
+ int transitive=0, intransitive=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid()){
+ if(rigorous ? u.isPerfectlyTransitive() : u.isTransitive()){
+ transitive++;
+ }else{
+ intransitive++;
+ }
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Intransitive: "+intransitive+", \ttransitive: "+transitive);
+ outstream.println("Checked transitivity. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+
+ return intransitive;
+ }
+
+ private int countRedundant(Timer t, ArrayList<Read> list){
+ if(!countRedundant){return 0;}
+ int redundant=0, nonredundant=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid()){
+ if(u.isNonRedundant()){
+ nonredundant++;
+ }else{
+ redundant++;
+ }
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Redundant: "+redundant+", \tnonredundant: "+nonredundant);
+ outstream.println("Checked redundancy. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ return redundant;
+ }
+
+ private long countOverlaps(Timer t, ArrayList<Read> list){
+
+ long overlaps=0, length=0;
+ for(Read r : list){
+ assert(r!=null);
+ Unit u=(Unit) r.obj;
+ assert(u!=null);
+ assert(u.valid());
+// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
+ if(u.valid() && u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ overlaps++;
+ length+=o.overlapLen;
+ }
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Overlaps: "+overlaps+", \tlength: "+length);
+ outstream.println("Counted overlaps. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ return overlaps;
+ }
+
+ /**
+  * Builds a histogram of cluster sizes.
+  * Row 0 counts clusters, row 1 counts reads and row 2 counts bases, each
+  * indexed by cluster size. Clusters larger than the last bin are accumulated
+  * in the last bin.
+  *
+  * The bin limit is taken from the length of a row (clusterSize[0].length),
+  * not from the number of rows; using the row count would clamp every
+  * cluster of size 2 or more into bin 2.
+  *
+  * @param clusters Clusters to tally
+  * @param clusterSize Matrix with at least 3 rows of equal length; incremented in place
+  * @return Size of the largest cluster seen (0 if there are none)
+  */
+ private long fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize){
+ 	final int lastBin=clusterSize[0].length-1;
+ 	int max=0;
+ 	for(ArrayList<Unit> cluster : clusters){
+ 		final int size=cluster.size();
+ 		final int cs=Tools.min(lastBin, size);
+ 		long bases=0;
+ 		for(Unit u2 : cluster){
+ 			bases+=u2.length();
+ 		}
+ 		clusterSize[0][cs]++;
+ 		clusterSize[1][cs]+=size; //One read per unit in the cluster
+ 		clusterSize[2][cs]+=bases;
+ 		max=Tools.max(max, size);
+ 	}
+ 	return max;
+ }
+
+ /**
+  * Builds a cluster from every not-yet-clustered unit in list and routes it:
+  * clusters of size 1, and all clusters when neither processClusters nor
+  * maxSpanningTree is set, go straight to processedClusters; the rest are
+  * queued in clusterQueue for later processing.
+  * A size histogram is printed if DISPLAY_PROGRESS is set, and invalid
+  * entries are pruned via removeInvalid at the end.
+  * @param t Timer used for progress reports; stopped and restarted here
+  * @param list Reads whose attached Units are clustered
+  */
+ private void makeClusters(Timer t, ArrayList<Read> list){
+
+ // Histogram bins; clusters of size >= clusterlen-1 share the last bin.
+ final int clusterlen=70000;
+ long[][] clusterSize=new long[3][clusterlen];
+ int max=0;
+ for(Read r : list){
+ Unit u=(Unit) r.obj;
+
+ if(!u.clustered()){
+ ArrayList<Unit> cluster=u.makeCluster();
+ if(cluster.size()>2){cluster.trimToSize();}
+ if(cluster.size()==1 || (!processClusters && !maxSpanningTree)){processedClusters.add(cluster);}
+ else{clusterQueue.add(cluster);}
+ final int cs=Tools.min(clusterlen-1, cluster.size());
+ {
+ // Row 0: cluster count, row 1: reads, row 2: bases.
+ long reads=0, bases=0;
+ for(Unit u2 : cluster){
+ reads++;
+ bases+=u2.length();
+ }
+ clusterSize[0][cs]++;
+ clusterSize[1][cs]+=reads;
+ clusterSize[2][cs]+=bases;
+ }
+ max=Tools.max(max, cluster.size());
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println(toClusterSizeString(clusterSize));
+ outstream.println("\nLargest: "+max);
+ outstream.println("Finished making clusters. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+
+
+ long x=removeInvalid(list);
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Removed "+x+" invalid entries.");
+ outstream.println("Finished invalid removal. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ /**
+  * Formats a cluster-size histogram as an aligned text table.
+  * The first line gives the total cluster count (and, when minClusterSize is
+  * at least 2, the number of clusters of at least that size). The table has
+  * one row each for sizes 1 and 2, then rows for doubling size ranges
+  * (3-4, 5-8, ...), ending with an open-ended "N+" row; range rows with no
+  * clusters are omitted.
+  * @param clusterSizeMatrix Row 0: clusters per size, row 1: reads, row 2: bases
+  * @return The formatted table, without a trailing newline
+  */
+ private String toClusterSizeString(long[][] clusterSizeMatrix){
+
+ long[] clusterSize=clusterSizeMatrix[0];
+ long[] clusterReads=clusterSizeMatrix[1];
+ long[] clusterBases=clusterSizeMatrix[2];
+
+ long totalClusters=Tools.sum(clusterSize);
+
+ long bigClusters=0;
+ for(int i=minClusterSize; i<clusterSize.length; i++){
+ bigClusters+=clusterSize[i];
+ }
+
+ // Column start positions, in characters.
+ final int spaces=18;
+ final int spaces2=spaces*2, spaces3=spaces*3;
+
+ // sb holds the row being built; sb2 accumulates the whole table.
+ final StringBuilder sb=new StringBuilder(100), sb2=new StringBuilder(1000);
+ sb2.append("Clusters:");
+ while(sb2.length()<spaces){sb2.append(' ');}
+ sb2.append(totalClusters+(minClusterSize<2 ? "" : " ("+bigClusters+" of at least size "+minClusterSize+")")+"\n");
+
+ sb.append("Size Range");
+ while(sb.length()<spaces){sb.append(' ');}
+ sb.append("Clusters");
+ while(sb.length()<spaces2){sb.append(' ');}
+ sb.append("Reads");
+ while(sb.length()<spaces3){sb.append(' ');}
+ sb.append("Bases");
+
+ sb2.append('\n');
+ sb2.append(sb);
+ sb.setLength(0);
+
+ // i takes the values 0, 1, 2, 4, 8, ...; each step covers sizes i+1 through 2*i.
+ for(int i=0; i<clusterSize.length-1; i=Tools.max(i+1, i*2)){
+ int a=i+1, b=i*2;
+ if(i<2){
+ // Sizes 1 and 2 each get their own row and are always printed.
+ sb.append(a);
+ while(sb.length()<spaces){sb.append(' ');}
+ sb.append(clusterSize[a]);
+ while(sb.length()<spaces2){sb.append(' ');}
+ sb.append(clusterReads[a]);
+ while(sb.length()<spaces3){sb.append(' ');}
+ sb.append(clusterBases[a]);
+ }else if(b>=clusterSize.length){
+ // Final, open-ended range. NOTE(review): assumes Tools.sum(array, from, to) uses inclusive bounds - confirm.
+ long x=Tools.sum(clusterSize, a, clusterSize.length-1);
+ long y=Tools.sum(clusterReads, a, clusterSize.length-1);
+ long z=Tools.sum(clusterBases, a, clusterSize.length-1);
+ if(x>0){
+ sb.append(a+"+");
+ while(sb.length()<spaces){sb.append(' ');}
+ sb.append(x);
+ while(sb.length()<spaces2){sb.append(' ');}
+ sb.append(y);
+ while(sb.length()<spaces3){sb.append(' ');}
+ sb.append(z);
+ }
+ }else{
+ long x=Tools.sum(clusterSize, a, b);
+ long y=Tools.sum(clusterReads, a, b);
+ long z=Tools.sum(clusterBases, a, b);
+ if(x>0){
+ sb.append(a+"-"+b);
+ while(sb.length()<spaces){sb.append(' ');}
+ sb.append(x);
+ while(sb.length()<spaces2){sb.append(' ');}
+ sb.append(y);
+ while(sb.length()<spaces3){sb.append(' ');}
+ sb.append(z);
+ }
+ }
+ if(sb.length()>0){
+ sb2.append('\n');
+ sb2.append(sb);
+ sb.setLength(0);
+ }
+ }
+ return sb2.toString();
+ }
+
+ /**
+  * Retired: fails an assertion immediately when assertions are enabled,
+  * because renaming is now done at output time.
+  * Renames every read in processedClusters to
+  * "Cluster C,contig I[,pos P]", where C is the index of the cluster,
+  * I the index of the unit within it, and P the unit's offset when valid.
+  * The cluster index is advanced after each cluster so that different
+  * clusters receive different names.
+  * @param t Timer used for the progress report; stopped and restarted here
+  */
+ private void renameClusters(Timer t){
+ 	assert(false) : "This is now unused; renaming is done at output time.";
+ 	int cnum=0;
+ 	final StringBuilder sb=new StringBuilder(64);
+ 	for(ArrayList<Unit> alu : processedClusters){
+ 		for(int i=0; i<alu.size(); i++){
+ 			Unit u=alu.get(i);
+ 			Read r=u.r;
+ 			sb.append("Cluster ");
+ 			sb.append(cnum);
+ 			sb.append(",contig ");
+ 			sb.append(i);
+ 			if(u.offsetValid()){
+ 				sb.append(",pos ");
+ 				sb.append(u.offset());
+ 			}
+ 			r.id=sb.toString();
+ 			sb.setLength(0);
+ 		}
+ 		cnum++; //Advance to the next cluster number
+ 	}
+
+ 	if(DISPLAY_PROGRESS){
+ 		t.stop();
+ 		outstream.println("Finished cluster renaming. Time: "+t);
+ 		Shared.printMemory();
+ 		outstream.println();
+ 		t.start();
+ 	}
+ }
+
+ /**
+  * Runs THREADS MstThreads, sums their removed/retained edge counters, and
+  * moves the clusters they produced either back into clusterQueue (when
+  * processClusters is set, so a later pass consumes them) or into
+  * processedClusters (in which case clusterQueue is released).
+  * @param t Timer used for the progress report; stopped and restarted here
+  */
+ private void processMst(Timer t){
+
+ if(DISPLAY_PROGRESS){outstream.println("Converting to Maximum Spanning Tree.");}
+
+ ArrayList<MstThread> alct=new ArrayList<MstThread>(THREADS);
+ for(int i=0; i<THREADS; i++){
+ alct.add(new MstThread());
+ }
+
+ long overlapsRemoved=0;
+ long overlapBasesRemoved=0;
+ long overlapsRetained=0;
+ long overlapBasesRetained=0;
+
+ for(MstThread ct : alct){ct.start();}
+ for(MstThread ct : alct){
+ // Keep joining until the thread has really terminated, even if interrupted.
+ while(ct.getState()!=Thread.State.TERMINATED){
+ try {
+ ct.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+
+ overlapsRemoved+=ct.overlapsRemovedT;
+ overlapBasesRemoved+=ct.overlapBasesRemovedT;
+ overlapsRetained+=ct.overlapsRetainedT;
+ overlapBasesRetained+=ct.overlapBasesRetainedT;
+ }
+ // The queue is expected to be empty once all threads have finished.
+ assert(clusterQueue.isEmpty());
+ if(processClusters){
+ for(MstThread ct : alct){
+ clusterQueue.addAll(ct.processedT);
+ ct.processedT.clear();
+ ct.processedT=null;
+ }
+ }else{
+ for(MstThread ct : alct){
+ processedClusters.addAll(ct.processedT);
+ ct.processedT.clear();
+ ct.processedT=null;
+ }
+ clusterQueue=null;
+ }
+ alct.clear();
+
+ // The affix maps are expected to be released already; killAffixMaps is a no-op in that case.
+ assert(affixMaps==null);
+ killAffixMaps();
+
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Removed "+(overlapsRemoved)+" edges ("+overlapBasesRemoved+" bases).");
+ outstream.println("Retained "+(overlapsRetained)+" edges ("+overlapBasesRetained+" bases).");
+
+// outstream.println("\nAfter conversion to Maximum Spanning Tree:");
+// final int[] clusterSize=new int[8200];
+// int max=0;
+// for(ArrayList<Unit> cluster : processedClusters){
+// clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++;
+// max=Tools.max(max, cluster.size());
+// }
+// outstream.println(toClusterSizeString(clusterSize));
+// outstream.println("Largest: "+max);
+
+ outstream.println("Finished MST conversion. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ /**
+  * Runs THREADS ClusterThreads, sums their per-thread counters, releases
+  * clusterQueue, and prints a report (including a cluster-size histogram of
+  * processedClusters) if DISPLAY_PROGRESS is set.
+  * @param t Timer used for the progress report; stopped and restarted here
+  * @param mergeClusters Passed as the last three ClusterThread constructor arguments
+  */
+ private void processClusters(Timer t, boolean mergeClusters){
+
+ ArrayList<ClusterThread> alct=new ArrayList<ClusterThread>(THREADS);
+ for(int i=0; i<THREADS; i++){
+ alct.add(new ClusterThread(fixMultiJoins, canonicizeClusters, removeCycles, fixCanonContradictions, fixOffsetContradictions,
+ mergeClusters, mergeClusters, mergeClusters));
+ }
+
+ long leafMerges=0;
+ long innerMerges=0;
+ long leafBaseMerges=0;
+ long innerBaseMerges=0;
+
+ long multiJoinFailures=0;
+ long multiJoinsFound=0;
+ long multiJoinBasesFound=0;
+ long unitsFlipped=0;
+ long overlapsFlipped=0;
+ long canonContradictoryOverlaps=0;
+ long canonContradictoryClusters=0;
+ long offsetContradictoryOverlaps=0;
+ long offsetContradictoryClusters=0;
+ long cycleOverlaps=0;
+ long cycleClusters=0;
+
+ for(ClusterThread ct : alct){ct.start();}
+ for(ClusterThread ct : alct){
+ // Keep joining until the thread has really terminated, even if interrupted.
+ while(ct.getState()!=Thread.State.TERMINATED){
+ try {
+ ct.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+
+ leafMerges+=ct.leafMergesT;
+ innerMerges+=ct.innerMergesT;
+ leafBaseMerges+=ct.leafBaseMergesT;
+ innerBaseMerges+=ct.innerBaseMergesT;
+
+ multiJoinFailures+=ct.multiJoinFailuresT;
+ multiJoinsFound+=ct.multiJoinsFoundT;
+ multiJoinBasesFound+=ct.multiJoinBasesFoundT;
+ unitsFlipped+=ct.unitsFlippedT;
+ overlapsFlipped+=ct.overlapsFlippedT;
+ canonContradictoryOverlaps+=ct.canonContradictoryOverlapsT;
+ canonContradictoryClusters+=ct.canonContradictoryClustersT;
+ offsetContradictoryOverlaps+=ct.offsetContradictoryOverlapsT;
+ offsetContradictoryClusters+=ct.offsetContradictoryClustersT;
+ cycleOverlaps+=ct.cycleOverlapsT;
+ cycleClusters+=ct.cycleClustersT;
+ }
+ alct.clear();
+
+ // The affix maps are expected to be released already; killAffixMaps is a no-op in that case.
+ assert(affixMaps==null);
+ killAffixMaps();
+
+ // The queue is expected to be empty once all threads have finished.
+ assert(clusterQueue.isEmpty());
+ clusterQueue=null;
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ if(fixMultiJoins){
+ outstream.println("Found "+(multiJoinsFound)+" multijoins ("+multiJoinBasesFound+" bases).");
+ outstream.println("Experienced "+(multiJoinFailures)+" multijoin removal failures.");
+ }
+ if(canonicizeClusters){
+ outstream.println("Flipped "+(unitsFlipped)+" reads and "+overlapsFlipped+" overlaps.");
+ outstream.println("Found "+(canonContradictoryClusters)+" clusters ("+canonContradictoryOverlaps+" overlaps) with contradictory orientation cycles.");
+ }
+ if(fixOffsetContradictions){
+ outstream.println("Found "+(offsetContradictoryClusters)+" clusters ("+offsetContradictoryOverlaps+" overlaps) with contradictory offset cycles.");
+ }
+ outstream.println("Found "+(cycleClusters)+" clusters ("+cycleOverlaps+" overlaps) with remaining cycles.");
+ // Note: this tests the absorbOverlap field, not the mergeClusters parameter.
+ if(absorbOverlap){
+ outstream.println("Merged "+(leafMerges)+" leaves ("+leafBaseMerges+" bases).");
+ outstream.println("Merged "+(innerMerges)+" nonleaves ("+innerBaseMerges+" bases).");
+ }
+
+ outstream.println("\nAfter processing clusters:");
+ final long[][] clusterSize=new long[3][70000];
+ final long max=fillClusterSizeMatrix(processedClusters, clusterSize);
+ outstream.println(toClusterSizeString(clusterSize));
+ outstream.println("\nLargest: "+max);
+
+ outstream.println("Finished processing. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ /**
+  * Removes reads whose Unit is no longer valid from list, and prunes invalid
+  * (or null) units from the codeMap bucket and the affix-map buckets that the
+  * removed unit's keys point to. Buckets left without valid units are
+  * removed from their map.
+  * @param list Reads to filter; invalid entries are nulled and then condensed out
+  * @return Number of invalid reads found in list
+  */
+ private long removeInvalid(ArrayList<Read> list){
+ // Reusable mutable key for affix-map lookups.
+ final LongM keym=new LongM();
+ // Removal counters for codeMap, prefix buckets and suffix buckets respectively.
+ long removedC=0, removedP=0, removedS=0, invalid=0;
+
+ for(int j=0, lim=list.size(); j<lim; j++){
+ final Read r=list.get(j);
+ final Unit u=(Unit)r.obj;
+
+ if(!u.valid()){
+
+ invalid++;
+
+ if(codeMap!=null && !codeMap.isEmpty()){
+ Long key=u.code1;
+ ArrayList<Unit> alu=codeMap.get(key);
+ if(alu!=null){
+ int valid=0;
+ // Iterate backwards so removals do not shift unvisited elements.
+ for(int i=alu.size()-1; i>=0; i--){
+ Unit u2=alu.get(i);
+ if(u2==null || !u2.valid()){
+ alu.remove(i);
+ removedC++;
+ }
+ else{valid++;}
+ }
+ if(valid==0){codeMap.remove(key);}
+ }
+ }
+
+ for(int num=0; num<numAffixMaps && affixMaps!=null; num++){
+ HashMap<LongM, ArrayList<Unit>> map=affixMaps[num];
+ if(map!=null && !map.isEmpty()){
+ // A prefix or suffix value of -1 is skipped.
+ if(u.prefixes[num]!=-1){
+ keym.set(u.prefixes[num]);
+ ArrayList<Unit> alu=map.get(keym);
+ if(alu!=null){
+ int valid=0;
+ for(int i=alu.size()-1; i>=0; i--){
+ Unit u2=alu.get(i);
+ if(u2==null || !u2.valid()){
+ alu.remove(i);
+ removedP++;
+ }
+ else{valid++;}
+ }
+ if(valid==0){map.remove(keym);}
+ }
+ }
+ if(storeSuffix && u.suffixes[num]!=-1){
+ keym.set(u.suffixes[num]);
+ ArrayList<Unit> alu=map.get(keym);
+ if(alu!=null){
+ int valid=0;
+ for(int i=alu.size()-1; i>=0; i--){
+ Unit u2=alu.get(i);
+ if(u2==null || !u2.valid()){
+ alu.remove(i);
+ removedS++;
+ }
+ else{valid++;}
+ }
+ if(valid==0){map.remove(keym);}
+ }
+ }
+ }
+ }
+ // Mark the slot for removal; the list is compacted after the loop.
+ list.set(j, null);
+ }
+ }
+
+ if(invalid>0){
+ Tools.condenseStrict(list);
+ }
+ if(verbose){
+ outstream.println("Removed invalids: "+removedC+", "+removedP+", "+removedS);
+ }
+ return invalid;
+ }
+
+
+ private static ArrayList<Read> addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean ascending, boolean clear, long outNum){
+ assert(outNum<=Integer.MAX_VALUE);
+ if(verbose){System.err.println("Making list.");}
+ ArrayList<Read> list=new ArrayList<Read>((int)outNum);
+ if(verbose){System.err.println("Adding.");}
+ for(ArrayList<Unit> alu : codeMap.values()){
+ for(Unit u : alu){
+ if(u.valid() && u.r.pairnum()==0){list.add(u.r);}
+ }
+ if(clear){alu.clear();}
+ }
+ if(clear){codeMap.clear();}
+
+ if(sort){
+ if(verbose){System.err.println("Sorting.");}
+ Collections.sort(list, ReadLengthComparator.comparator);
+ if(ascending){
+ Collections.reverse(list);
+ assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) :
+ list.get(0).length()+", "+list.get(list.size()-1).length();
+ }else{
+ assert(list.isEmpty() || list.get(0).length()>=list.get(list.size()-1).length()) :
+ list.get(0).length()+", "+list.get(list.size()-1).length();
+ }
+ }
+ assert(list.size()==outNum || list.size()*2L==outNum || UNIQUE_ONLY) : list.size()+", "+outNum;
+ return list;
+ }
+
+ /**
+  * Dispatches output writing.
+  * When there are no processed clusters, the surviving reads are collected
+  * from codeMap (which is then released) and written as a flat list, provided
+  * out or clusterFilePattern is set. Otherwise the cluster graph and/or the
+  * clusters themselves are written, depending on which outputs are set.
+  * @param clusterStatsFile Cluster statistics file name, or null
+  * @param t Timer used for progress reports; stopped and restarted here
+  */
+ private void writeOutput(String clusterStatsFile, Timer t){
+// verbose=true;
+// assert(false) : (processedClusters==null)+", "+(processedClusters.isEmpty())+", "+outgraph+", "+out+", "+clusterFilePattern;
+ if(processedClusters==null || processedClusters.isEmpty()){
+
+ if(out!=null || clusterFilePattern!=null){
+
+ // clear=true: addToArray empties codeMap while collecting.
+ ArrayList<Read> list=addToArray(codeMap, sort, ascending, true, addedToMain-containments);
+ codeMap=null;
+
+ if(sort){
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Sorted output. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ writeOutput(list);
+ }
+ }else{
+ if(outgraph!=null){
+ writeGraph(outgraph, processedClusters);
+ }
+ if(out!=null || clusterFilePattern!=null || clusterStatsFile!=null || outbest!=null){
+ writeOutputClusters(clusterStatsFile, processedClusters);
+ }
+ }
+
+ if(DISPLAY_PROGRESS){
+ t.stop();
+ outstream.println("Printed output. Time: "+t);
+ Shared.printMemory();
+ outstream.println();
+ t.start();
+ }
+ }
+
+ /**
+ * Writes every non-discarded read in list (and its mate) to the file named by out, marking each one
+ * discarded once handled. When uniqueNames and storeName are set, repeated names get a "_dd" suffix.
+ * If out is null no writer exists and nothing is written, but reads are still renamed and marked.
+ */
+ private void writeOutput(ArrayList<Read> list){
+ final ByteStreamWriter tsw=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true));
+ if(verbose){System.err.println("Writing from array.");}
+ if(tsw!=null){tsw.start();} //the caller also gets here when only clusterFilePattern is set, so out may be null
+ HashSet<String> names=((uniqueNames && storeName) ?
+ new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null);
+ long rid=0;
+ for(int x=0; x<list.size(); x++){
+ Read r=list.get(x);
+ list.set(x, null); //drop the list's reference as soon as the read has been fetched
+
+ if(r.mate!=null && r.pairnum()!=0){r=r.mate;} //always start from read 1 of a pair
+
+ if(!r.discarded()){
+ rid++;
+ //Handle r and then its mate (at most two passes).
+ for(int i=0; r!=null && i<2; i++){
+ if(multipleInputFiles){r.numericID=rid;}
+ if(names!=null){
+ String name=(r.id==null ? ""+r.numericID : r.id);
+ if(names.contains(name)){
+ for(long j=0; j<Integer.MAX_VALUE; j++){
+ String name2=name+"_dd"+j;
+ if(r.mate!=null){name2+=(" /"+(i+1));}
+ if(!names.contains(name2)){
+ r.id=name2;
+ names.add(name2);
+ break;
+ }
+ }
+ }else{
+ names.add(name);
+ }
+ }
+ if(tsw!=null){tsw.println(r);}
+ r.setDiscarded(true);
+ r=r.mate;
+ }
+ }
+ }
+ if(tsw!=null && verbose){System.err.println("Shutting down tsw "+tsw.fname);}
+ if(tsw!=null){tsw.poisonAndWait();}
+ }
+ /** Writes clusters (sorted with CLUSTER_LENGTH_COMPARATOR) to the combined file, per-cluster files, the best-representative file
+ * and the cluster statistics file, whichever of these are configured. Reads are marked discarded as they are written. */
+ private void writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist){
+
+ Collections.sort(clist, CLUSTER_LENGTH_COMPARATOR);
+
+ if(verbose){System.err.println("Writing clusters.");}
+
+ final ByteStreamWriter tswAll=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); //receives every written read
+ if(tswAll!=null){tswAll.start();}
+ ByteStreamWriter tswCluster=null; //writer for the current cluster's own file; reopened per cluster
+ ByteStreamWriter tswBest=null; //receives one representative per written cluster
+
+ if(outbest!=null){
+ tswBest=new ByteStreamWriter(outbest, overwrite, append, true);
+ tswBest.start();
+ }
+
+ TextStreamWriter csf=null;
+ if(clusterStatsFile!=null){
+ csf=new TextStreamWriter(clusterStatsFile, overwrite, false, false);
+ csf.start();
+ csf.print("#Name\tsize\t"+nmerLength+"-mer frequencies\n");
+ }
+
+ HashSet<String> names=((uniqueNames && storeName) ?
+ new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null);
+ long rid=0;
+ final long[] nmerCounts=new long[maxNmer+1];
+
+ final StringBuilder sb=new StringBuilder(64); //scratch buffer shared by the stats rows and the renaming code
+
+ for(int cnum=0; cnum<clist.size(); cnum++){
+ final ArrayList<Unit> alu=clist.get(cnum);
+// clist.set(cnum, null); //This breaks subsequent output processing
+
+ if(alu.size()<minClusterSize){
+ if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());}
+ //Too small to be written as reads, but it may still get a row in the stats file.
+ if(csf!=null && alu.size()>=minClusterSizeForStats){
+ float[] profile=makeNmerProfile(alu, nmerCounts);
+ sb.append("Cluster_");
+ sb.append(cnum);
+ sb.append('\t');
+ sb.append(alu.size());
+ sb.append('\t');
+ for(float f : profile){
+ sb.append(String.format("%.5f ", f));
+ }
+ sb.setCharAt(sb.length()-1, '\n'); //overwrite the last separator with the line terminator
+ csf.print(sb.toString());
+ sb.setLength(0);
+ }
+ }else{
+ if(verbose){System.err.println("Writing cluster "+cnum+", size "+alu.size());}
+
+ if(clusterFilePattern!=null){
+ if(tswCluster!=null){
+ if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);}
+ tswCluster.poisonAndWait();
+ tswCluster=null;
+ }
+ tswCluster=new ByteStreamWriter(clusterFilePattern.replaceFirst("%", ""+cnum), overwrite, append, true); //first '%' becomes the cluster number
+ if(verbose){System.err.println("Starting tswCluster "+tswCluster.fname);}
+ tswCluster.start();
+ }
+ //Same stats row as in the small-cluster branch above.
+ if(csf!=null && alu.size()>=minClusterSizeForStats){
+ float[] profile=makeNmerProfile(alu, nmerCounts);
+ sb.append("Cluster_");
+ sb.append(cnum);
+ sb.append('\t');
+ sb.append(alu.size());
+ sb.append('\t');
+ for(float f : profile){
+ sb.append(String.format("%.5f ", f));
+ }
+ sb.setCharAt(sb.length()-1, '\n');
+ csf.print(sb.toString());
+ sb.setLength(0);
+ }
+
+ if(pickBestRepresentativePerCluster){
+ pickBestRepresenative(alu, true); //clearList=true: alu is reduced to the chosen Unit
+ }
+
+ if(outbest!=null){
+ Unit u=pickBestRepresenative((ArrayList<Unit>)alu.clone(), false); //works on a copy and leaves alu intact
+ tswBest.println(u.r);
+ if(u.r.mate!=null){tswBest.println(u.r.mate);}
+ }
+
+ for(int contig=0; contig<alu.size(); contig++){
+ final Unit u0=alu.get(contig);
+ alu.set(contig, null);
+ Read r=u0.r;
+ if(r.mate!=null && r.pairnum()!=0){r=r.mate;} //always start from read 1 of a pair
+
+ if(!r.discarded()){
+ rid++;
+ //Handle r and then its mate (at most two passes).
+ for(int i=0; r!=null && i<2; i++){
+ assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum());
+ Unit u=(Unit)r.obj;
+ if(verbose){System.err.println("Writing read "+r.id);}
+ r.numericID=rid;
+ if(renameClusters){
+ sb.append("Cluster_");
+ sb.append(cnum);
+ sb.append(",contig_");
+ sb.append(contig);
+ if(u.offsetValid()){
+ sb.append(",pos_");
+ sb.append(u.offset());
+ }
+ if(r.mate!=null){sb.append(" /"+(i+1));}
+ r.id=(r.id==null ? sb.toString() : r.id+"\t"+sb); //an existing id is kept and the cluster tag appended after a tab
+ sb.setLength(0);
+ }else if(names!=null){
+ String name=(r.id==null ? ""+r.numericID : r.id);
+ if(names.contains(name)){
+ for(long j=0; j<Integer.MAX_VALUE; j++){
+ String name2=name+"_dd"+j;
+ if(!names.contains(name2)){
+ r.id=name2;
+ names.add(name2);
+ break;
+ }
+ }
+ }else{
+ names.add(name);
+ }
+ }
+ if(tswAll!=null){tswAll.println(r);}
+ if(tswCluster!=null){tswCluster.println(r);}
+ r.setDiscarded(true);
+ r=r.mate;
+ }
+ }
+ }
+ }
+ }
+ if(csf!=null){
+ if(verbose){System.err.println("Shutting down csf "+csf.fname);}
+ csf.poisonAndWait();
+ }
+ if(tswBest!=null){
+ if(verbose){System.err.println("Shutting down tswBest "+tswBest.fname);}
+ tswBest.poisonAndWait();
+ }
+ if(tswAll!=null){
+ if(verbose){System.err.println("Shutting down tswAll "+tswAll.fname);}
+ tswAll.poisonAndWait();
+ }
+ if(tswCluster!=null){
+ if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);}
+ tswCluster.poisonAndWait();
+ }
+ }
+ /** Writes the overlap graph of all clusters with at least minClusterSize Units to graphFile as a "digraph G" block:
+ * one node line per read, one "mate" edge per pair, and one labeled edge per Overlap in which the read's Unit is u1. */
+ private void writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist){
+ Collections.sort(clist, CLUSTER_LENGTH_COMPARATOR);
+
+ if(verbose){System.err.println("Writing overlap graph.");}
+
+ final TextStreamWriter tsw=(graphFile==null ? null : new TextStreamWriter(graphFile, overwrite, append, true));
+ if(tsw!=null){
+ tsw.start();
+ tsw.print("digraph G {\n");
+ }
+
+ for(int cnum=0; cnum<clist.size(); cnum++){
+ final ArrayList<Unit> alu=clist.get(cnum);
+// clist.set(cnum, null); //This breaks subsequent output processing
+// Collections.sort(alu); //TODO: Remove
+
+ if(alu.size()<minClusterSize){
+ if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());}
+ }else{
+ if(verbose){System.err.println("Processing cluster "+cnum+", size "+alu.size());}
+
+ for(int contig=0; contig<alu.size(); contig++){
+ final Unit u0=alu.get(contig);
+// alu.set(contig, null); //This breaks subsequent output processing
+ Read r=u0.r;
+ if(r.mate!=null && r.pairnum()!=0){r=r.mate;} //always start from read 1 of a pair
+
+ {
+ for(int i=0; r!=null && i<2; i++){
+ assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum());
+ Unit u=(Unit)r.obj;
+ if(verbose){System.err.println("Writing read "+r.id);}
+
+ if(tsw!=null){
+ tsw.print("\t"+toGraphName(r)+"\n");
+ if(r.mate!=null && r.pairnum()==0){
+ Read r2=r.mate;
+ tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=mate]\n"); //tab and newline added to match the other statements
+ }
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ if(u==o.u1){ //print each overlap only from its u1 side
+ Read r2=o.u2.r;
+ tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=\""+o.toLabel()+"\"]\n");
+ }
+ }
+ }
+ }
+ r=r.mate;
+ }
+ }
+ }
+ }
+ }
+
+ if(tsw!=null){
+ tsw.print("}\n");
+ if(verbose){System.err.println("Shutting down tswAll "+tsw.fname);}
+ tsw.poisonAndWait();
+ }
+ }
+
+ private static String toGraphName(Read r){
+ if(NUMBER_GRAPH_NODES || r.id==null){
+ return r.numericID+((ADD_PAIRNUM_TO_NAME || r.mate!=null) ? "."+(r.pairnum()+1) : "");
+ }else{
+ return r.id.replace(' ','_').replace('\t','_');
+ }
+ }
+ /** Returns the Unit with the lowest penalized error rate among those whose length is within 5% of the median (relative to medianLength+1); null for a null or empty list. */
+ private Unit pickBestRepresenative(ArrayList<Unit> alu, boolean clearList){
+ if(alu==null || alu.isEmpty()){return null;}
+ float[] quality=new float[alu.size()];
+ int[] lengths=new int[alu.size()];
+ for(int i=0; i<alu.size(); i++){
+ Unit u=alu.get(i);
+ int len=u.r.length();
+ quality[i]=u.r.expectedErrors(true, 0)/len;
+ lengths[i]=len;
+ }
+ Arrays.sort(quality);
+ Arrays.sort(lengths);
+ int medianLength=lengths[lengths.length/2]; //upper median for even sizes
+ float bestQuality=quality[0]; //NOTE(review): never read afterwards; the quality array has no other use in this method
+ //Second pass: score the candidates near the median length; the lowest score wins.
+ float currentBestQuality=9999999;
+ Unit best=null;
+ for(int i=0; i<alu.size(); i++){
+ Unit u=alu.get(i);
+ int len=u.r.length();
+ float deviation=Tools.absdif(len, medianLength)*1f/(medianLength+1);
+ if(deviation<0.05){
+ float qual=u.r.expectedErrors(true, 0)/len;
+ qual=(qual+.001f)*(1+10*deviation); //penalize distance from the median length
+ if(qual<currentBestQuality || best==null){
+ currentBestQuality=qual;
+ best=u;
+ }
+ }
+ }
+ if(clearList){ //replace the list contents with the single chosen Unit
+ alu.clear();
+ alu.add(best);
+ }
+ return best;
+ }
+ /** Hashes bases from left to right. The running code starts as the array length; for each base the entry
+ * hashcodes[base][low 5 bits of the running code] is XORed in and the result is rotated left by one bit. */
+ public static long hash(byte[] bases){
+ long running=bases.length;
+ for(final byte base : bases){
+ final long[] row=hashcodes[base];
+ assert(row!=null) : "Invalid sequence character: '"+(char)base+"'";
+ final int slot=(int)(running&31);
+ running=Long.rotateLeft(running^row[slot], 1);
+ }
+ return running;
+ }
+ /**
+ * Hashes bases from right to left, translating each base through baseToComplementExtended before
+ * mixing it in with the same XOR-and-rotate step that hash() uses.
+ */
+ public static long hashReversed(byte[] bases){
+ long running=bases.length;
+ for(int pos=bases.length-1; pos>=0; pos--){
+ final byte raw=bases[pos];
+ assert(hashcodes[raw]!=null) : "Invalid sequence character: '"+(char)raw+"'";
+ final byte comp=baseToComplementExtended[raw];
+ running=Long.rotateLeft(running^hashcodes[comp][(int)(running&31)], 1);
+ }
+ return running;
+ }
+ /** Compares bases with its reverse complement from the outside in; true if bases is the smaller or they tie.
+ * Always true when ignoreReverseComplement is set or bases is null or empty. */
+ public static boolean isCanonical(byte[] bases){
+ if(ignoreReverseComplement){return true;}
+ if(bases==null || bases.length==0){return true;}
+ final int half=(bases.length+1)/2;
+ int left=0, right=bases.length-1;
+ while(left<half){
+ final byte fwd=bases[left++], rev=baseToComplementExtended[bases[right--]];
+ if(fwd!=rev){return fwd<rev;}
+ }
+ assert((bases.length&1)==0 || bases[half-1]==baseToComplementExtended[bases[half-1]]) :
+ bases.length+", "+half+", "+bases[half-1]+", "+(char)bases[half-1]+(bases.length<1000 ? "\n'"+new String(bases)+"'\n" : ""); //palindrome absorb
+ return true; //palindrome
+ }
+
+
+ private static synchronized long[][] makeCodes(int symbols, int modes){
+ Random randy=new Random(1);
+ long[][] r=new long[symbols][modes];
+ for(int i=0; i<symbols; i++){
+ for(int j=0; j<modes; j++){
+ r[i][j]=randy.nextLong();
+ }
+ }
+ return r;
+ }
+
+// /** Handles IUPAC codes */
+// private static synchronized long[][] makeCodes2(int modes){
+// long[][] r0=makeCodes(26, modes);
+// long[][] r=new long[Tools.max('Z','z')+1][];
+// for(int i=0; i<26; i++){
+// char c=(char)('A'+i);
+// r[c]=r[Character.toLowerCase(c)]=r0[i];
+// }
+// return r;
+// }
+
+ /** Handles IUPAC codes and invalid symbols */
+ private static synchronized long[][] makeCodes2(int modes){
+ long[][] r=makeCodes(128, modes);
+
+ for(int i=0; i<26; i++){
+ char c=(char)('A'+i);
+ r[Character.toLowerCase(c)]=r[c];
+ }
+ return r;
+ }
+ /**
+ * Writes r, followed by its mate if it has one, to dupeWriter.
+ * Does nothing when dupeWriter is null or when r is the second read of a pair.
+ */
+ private void addDupe(Read r){
+ if(dupeWriter==null){return;}
+ if(r.mate!=null && r.pairnum()!=0){return;}
+ synchronized(dupeWriter){ //both reads of the pair are written while holding the writer's lock
+ dupeWriter.println(r);
+ if(r.mate!=null){dupeWriter.println(r.mate);}
+ }
+ }
+ /** Worker that takes clusters from nextCluster() and reduces each one's overlaps to a spanning tree,
+ * keeping per-thread counts of the overlaps retained and removed. */
+ private final class MstThread extends Thread{
+
+ public MstThread(){}
+ /** Processes clusters until nextCluster() returns null, collecting them in processedT. */
+ public void run(){
+
+ ArrayList<Unit> cluster=null;
+ while((cluster=nextCluster())!=null){
+ makeMst(cluster);
+ processedT.add(cluster);
+ }
+
+ }
+ /** Grows a tree from the cluster's first Unit using the overlap heap, then deletes every Overlap marked invalid from the Units' lists. */
+ public void makeMst(ArrayList<Unit> cluster){
+ assert(heap.isEmpty());
+ unvisit(cluster);
+ for(Unit u : cluster){
+ u.flags&=~Unit.VISIT_MASK;
+ Collections.sort(u.overlapList);
+ }
+ {
+ Unit u=cluster.get(0); //root of the tree
+ u.setVisited(true);
+ heap.addAll(u.overlapList);
+ }
+// assert(false) : cluster.size();
+ while(!heap.isEmpty()){
+ Overlap o=heap.poll();
+ assert(!o.mst());
+ if(!o.invalid()){
+// assert(o.u1.overlapList.contains(o)); //slow
+// assert(o.u2.overlapList.contains(o)); //slow
+ assert(o.u1.visited() || o.u2.visited());
+ final Unit u=(!o.u1.visited() ? o.u1 : !o.u2.visited()? o.u2 : null); //the endpoint not yet in the tree, if any
+ if(u!=null){
+ o.setMst(true);
+ u.setVisited(true);
+ overlapsRetainedT++;
+ overlapBasesRetainedT+=o.overlapLen;
+ for(Overlap o2 : u.overlapList){
+ if(o2.mst()){
+ //do nothing
+ }else if(!o2.u1.visited() || !o2.u2.visited()){ //still leads to an unvisited Unit: candidate edge
+ if(heap.size()>=Integer.MAX_VALUE){
+ removeInvalid(heap);
+ }
+ heap.add(o2);
+ }else if(!o2.invalid()){ //both ends already visited and not a tree edge: redundant
+ o2.setInvalid(true);
+ overlapsRemovedT++;
+ overlapBasesRemovedT+=o2.overlapLen;
+ }
+ }
+ }
+ }
+ }
+ for(Unit u : cluster){
+ ArrayList<Overlap> alo=u.overlapList;
+ int removed=0;
+ for(int i=0; i<alo.size(); i++){
+ Overlap o=alo.get(i);
+ if(o.invalid()){
+ assert(!o.mst());
+ alo.set(i, null);
+ removed++;
+ }else{
+ assert(o.mst());
+ }
+ }
+ if(removed>0){
+ Tools.condenseStrict(alo);
+ alo.trimToSize();
+ }
+ }
+ }
+ /** Rebuilds the heap so that it holds only the Overlaps not marked invalid. */
+ private void removeInvalid(PriorityQueue<Overlap> heap){
+ ArrayList<Overlap> valid=new ArrayList<Overlap>(heap.size());
+ for(Overlap o : heap){
+ if(!o.invalid()){
+ assert(!o.u1.visited() || !o.u2.visited());
+ valid.add(o);
+ }
+ }
+ heap.clear();
+ heap.addAll(valid);
+ }
+
+ //Per-thread statistics, updated only by makeMst.
+ public long overlapsRemovedT=0;
+ public long overlapBasesRemovedT=0;
+ public long overlapsRetainedT=0;
+ public long overlapBasesRetainedT=0;
+ //Candidate overlaps, polled in the order defined by Overlap's natural ordering.
+ private final PriorityQueue<Overlap> heap=new PriorityQueue<Overlap>((1<<16)-1);
+ private ArrayList<ArrayList<Unit>> processedT=new ArrayList<ArrayList<Unit>>();
+ }
+
+
+ /**
+ * Processes clustered sets of reads.
+ * @author Brian Bushnell
+ * @date Aug 9, 2013
+ *
+ */
+ private final class ClusterThread extends Thread{
+
+ public ClusterThread(boolean fixMultiJoins_, boolean canonicize_, boolean removeCycles_, //NOTE(review): removeCycles_ is not stored by this constructor
+ boolean fixCanonContradictions_, boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_){
+ fixMultiJoinsT=fixMultiJoins_;
+ canonicizeT=canonicize_;
+ fixCanonContradictionsT=fixCanonContradictions_;
+ fixOffsetContradictionsT=fixOffsetContradictions_;
+ mergeClustersT=mergeClusters_;
+ mergeLeavesT=mergeLeaves_;
+ mergeInnerT=mergeInner_;
+ //An aligner is created only when edits are allowed.
+// assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+mergeLeavesT+", "+mergeInnerT;
+ bandy=(maxEdits>0 ? BandedAligner.makeBandedAligner(bandwidth) : null);
+// assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+fixOffsetContradictionsT+", "+mergeClustersT+", "+removeCycles_;
+ }
+ /** Processes clusters from nextCluster() until it returns null: multijoin detection, canonicization, cycle detection, offset generation and merging; results are flushed to processedClusters. */
+ public void run(){
+
+ final ArrayList<Unit> temp=new ArrayList<Unit>(1000); //scratch list reused by the breadth-first passes
+
+ ArrayList<Unit> cluster=null;
+ while((cluster=nextCluster())!=null){
+
+ if(EA){
+ for(Unit u : cluster){assert(u.r.mate==null) : "Cluster processing/merging is not supported for paired reads, only cluster generation.";}
+ }
+
+// for(Unit u : cluster){assert(!u.visited());}
+ unvisit(cluster);
+
+ reorderClusterBreadthFirst(cluster);
+ int multiJoinCount=findMultiJoinsInCluster(cluster, fixMultiJoinsT);
+
+ if(EA){
+ for(Unit u : cluster){assert(!u.visited());}
+ }
+
+ boolean ok=true; //turns false once a problem is found that this thread is not configured to fix
+ if(multiJoinCount!=0){
+ assert(multiJoinCount>0);
+ multiJoinsFoundT+=multiJoinCount;
+ if(!fixMultiJoinsT){
+ multiJoinFailuresT++;
+ ok=false;
+ }
+ }
+
+ int canonContradictions=0;
+ if(ok && canonicizeT){
+ if(EA){
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.canonContradiction());
+ assert(!u.canonicized());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.canonContradiction()) :
+ o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+cluster.contains(o.u1)+", "+cluster.contains(o.u2);
+ }
+ }
+ }
+ }
+ canonContradictions=canonicizeClusterBreadthFirst(cluster, temp);
+// System.err.println("Canonicized cluster of size "+cluster.size()+"; contradictions = "+canonContradictions+"; canonicized = "+temp.size());
+ temp.clear();
+ for(Unit u : cluster){assert(!u.visited());}
+ if(canonContradictions>0){
+ canonContradictoryOverlapsT+=canonContradictions;
+ canonContradictoryClustersT++;
+ if(fixCanonContradictionsT){
+ if(verbose){System.err.println("Pruning cluster to remove canonization contradictions.");}
+ fullyPruneCluster(cluster, temp);
+ if(verbose){System.err.println("Resulting size: "+cluster.size());}
+ if(EA){
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.canonContradiction());
+ assert(u.canonicized());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.canonContradiction());
+ assert(o.type==FORWARD) : "\n"+o+"\n"+
+ o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+
+ "\n"+cluster.contains(o.u1)+", "+cluster.contains(o.u2)+", "+cluster.size();
+ }
+ }
+ }
+ }
+ }else{
+ ok=false;
+ }
+ }
+ }
+
+ int cycleOverlaps=0;
+ if(ok){
+ cycleOverlaps=findCycles(cluster, removeCycles); //NOTE(review): reads removeCycles, not the removeCycles_ constructor argument
+ for(Unit u : cluster){assert(!u.visited());}
+ if(cycleOverlaps>0){
+ cycleOverlapsT+=cycleOverlaps;
+ cycleClustersT++;
+ }
+ }
+
+ int offsetContradictions=0;
+ if(ok && fixOffsetContradictionsT){
+ if(EA){
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.offsetContradiction());
+ assert(!u.offsetValid());
+ assert(u.canonicized());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.offsetContradiction());
+ assert(o.type==FORWARD) : o;
+ }
+ }
+ }
+ }
+ offsetContradictions=generateOffsetsBreadthFirst(cluster, temp);
+// System.err.println("Made offsets for cluster of size "+cluster.size()+"; contradictions = "+offsetContradictions+"; set = "+temp.size());
+ temp.clear();
+ for(Unit u : cluster){assert(!u.visited());}
+ if(offsetContradictions>0){
+ offsetContradictoryOverlapsT+=offsetContradictions;
+ offsetContradictoryClustersT++;
+ if(fixOffsetContradictionsT){ //NOTE(review): always true here because the enclosing if already requires it, so the else below cannot run
+ if(verbose){System.err.println("Pruning cluster to remove offset contradictions.");}
+ fullyPruneCluster(cluster, temp);
+ if(verbose){System.err.println("Resulting size: "+cluster.size());}
+ if(EA){
+ for(Unit u : cluster){
+ assert(!u.visited());
+ assert(!u.offsetContradiction());
+ assert(u.offsetValid());
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(!o.invalid());
+ assert(!o.offsetContradiction());
+ assert(o.type==FORWARD) : o;
+ }
+ }
+ }
+ }
+ }else{
+ ok=false;
+ }
+ }
+ if(ok){Collections.sort(cluster, UNIT_OFFSET_COMPARATOR);}
+ }
+
+ if(ok && absorbOverlap){ //NOTE(review): gated by absorbOverlap; mergeClustersT is not consulted in this method
+ mergeCluster(cluster);
+ }
+
+ processedClustersT.add(cluster);
+ if(processedClustersT.size()>=threadMaxReadsToBuffer){ //flush the local buffer to the shared list under its lock
+ synchronized(processedClusters){
+ processedClusters.addAll(processedClustersT);
+ processedClustersT.clear();
+ }
+ }
+ }
+ synchronized(processedClusters){ //final flush of whatever is still buffered
+ processedClusters.addAll(processedClustersT);
+ processedClustersT.clear();
+ }
+ }
+ /** Prunes cluster with both pruning options on, then keeps splitting the removed Units into subclusters; single Units go to processedClustersT, larger subclusters to clusterQueue. */
+ private void fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp){
+ assert(cluster.size()>1) : cluster.size();
+ ArrayList<Unit> pruned=pruneCluster(cluster, true, true, temp);
+ assert(temp.isEmpty());
+ assert(pruned==null || pruned.size()>0);
+ while(pruned!=null){
+ ArrayList<Unit> subcluster=pruned;
+ for(Unit u : subcluster){
+ u.clearVolatileFlags();
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ o.clearVolatileFlags();
+ }
+ }
+ }
+ assert(subcluster.size()>0);
+ pruned=pruneCluster(subcluster, false, false, temp); //no contradiction pruning: only splits off what subcluster's first Unit cannot reach
+ assert(temp.isEmpty());
+ assert(pruned==null || pruned.size()>0);
+ assert(subcluster.size()>0);
+ if(subcluster.size()==1){
+ processedClustersT.add(subcluster);
+ }else{
+ assert(subcluster.size()>1);
+ synchronized(clusterQueue){
+ clusterQueue.add(subcluster);
+ }
+ }
+ }
+ }
+
+ /**
+ * @param cluster
+ */
+ private void mergeCluster(ArrayList<Unit> cluster) {
+ if(cluster.size()==1){return;}
+ if(mergeLeavesT){
+ mergeLeaves(cluster);
+ }
+ if(mergeInnerT){
+ mergeInner(cluster);
+ }
+ }
+ /**
+ * Finds places in the cluster where two Units are joined by multiple different Overlaps.
+ * @param cluster Units to scan; every overlapList with more than one entry gets sorted.
+ * @param resolveProblems If true, redundant Overlaps are removed from the list being scanned.
+ * @return Number of multijoins found.
+ */
+ private int findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems) {
+ if(cluster.size()<2){return 0;}
+ int totalMultiJoins=0;
+ for(Unit ua : cluster){
+ ArrayList<Overlap> list=ua.overlapList;
+ assert(list!=null);
+ if(list.size()>1){
+ Collections.sort(list);
+
+ int multiJoins=0;
+ for(int i=0; i<list.size(); i++){
+ Overlap o=list.get(i);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1);
+ assert(ua!=ub);
+ assert(ua==o.u1 || ua==o.u2);
+ if(ub.visited()){ //ub was already reached through an earlier Overlap in this list
+ multiJoins++;
+ multiJoinBasesFoundT+=o.overlapLen;
+ if(!o.multiJoin()){o.setMultiJoin(true);}
+ if(resolveProblems){list.set(i, null);}
+ }else{
+ ub.setVisited(true);
+ }
+ }
+
+ if(multiJoins>0){
+ totalMultiJoins+=multiJoins;
+ if(resolveProblems){Tools.condenseStrict(list);}
+ }
+ //Reset the visited marks. Without resolveProblems a neighbor can appear again via an Overlap flagged multiJoin.
+ for(int i=0; i<list.size(); i++){
+ Overlap o=list.get(i);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1);
+ assert(ua!=ub);
+ assert(ua==o.u1 || ua==o.u2);
+ assert(ub.visited() || o.multiJoin());
+ if(ub.visited()){ub.setVisited(false);}
+ }
+ }
+
+ }
+
+ return totalMultiJoins;
+ }
+ /** Walks outward from cluster.get(0), removing Overlaps selected by the two prune flags. On return cluster holds the Units reached; the unreached Units are returned (null if none). The visited list must be empty on entry and is cleared on exit. */
+ private ArrayList<Unit> pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited){
+ if(verbose){System.err.println("pruneCluster(size="+cluster.size()+", "+pruneContradictoryNodes+", "+pruneContradictoryOverlaps+")");}
+
+ //pruneContradictoryOverlaps is less strict than pruneContradictoryNodes
+ assert(pruneContradictoryOverlaps || !pruneContradictoryNodes);
+
+ for(Unit ua : cluster){
+ assert(!ua.visited());
+ assert(ua.isPerfectlyTransitive()) : ua;
+ if(ua.visited()){ua.setVisited(false);}
+ }
+
+ int prunedOverlaps=0;
+ int visits=1;
+
+ {
+ final Unit root=cluster.get(0);
+ assert(!root.contradiction());
+ root.setVisited(true);
+ visited.add(root);
+ }
+ //visited doubles as the traversal queue: it grows while being iterated.
+ for(int i=0; i<visited.size(); i++){
+ Unit ua=visited.get(i);
+
+ if(ua.visited() && (!ua.contradiction() || !pruneContradictoryNodes)){
+ ArrayList<Overlap> list=ua.overlapList;
+ if(list!=null){
+ int removed=0;
+ for(int j=0; j<list.size(); j++){
+ Overlap o=list.get(j);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1);
+ assert(o.u1==ua || o.u2==ua);
+ assert(ua!=ub);
+ assert(ub.valid());
+
+ assert(!o.canonContradiction() || (ua.canonContradiction() || ub.canonContradiction())) :
+ "\n"+o.canonContradiction()+", "+ua.canonContradiction()+", "+ub.canonContradiction();
+
+ assert(!o.offsetContradiction() || (ua.offsetContradiction() || ub.offsetContradiction())) :
+ "\n"+o.offsetContradiction()+", "+ua.offsetContradiction()+", "+ub.offsetContradiction();
+
+// assert(o.contradiction()==(ua.contradiction() && ub.contradiction())) :
+// "\n"+o.canonContradiction()+", "+o.offsetContradiction()+
+// "\n"+ua.canonContradiction()+", "+ua.offsetContradiction()+
+// "\n"+ub.canonContradiction()+", "+ub.offsetContradiction();
+
+ final boolean remove=(pruneContradictoryNodes && ub.contradiction() || (pruneContradictoryOverlaps && o.contradiction()));
+ if(!remove && !ub.visited()){
+ ub.setVisited(true);
+ visited.add(ub);
+ visits++;
+ }
+
+ if(remove){
+ if(!o.invalid()){o.setInvalid(true);} //flag it so the other endpoint's list can drop it too
+ list.set(j, null);
+ removed++;
+ prunedOverlaps++;
+ }else{
+ assert(!o.invalid());
+ }
+ }
+ if(removed>0){Tools.condenseStrict(list);}
+ }
+ }
+ }
+
+ if(verbose){System.err.println("cluster.size()="+cluster.size()+", visits="+visits+", visited.size()="+visited.size());}
+
+// if(visited.size()==11486){ //TODO: For testing. Remove.
+// for(int i=0; i<visited.size(); i++){
+// Unit u=visited.get(i);
+// assert(u.visited());
+// assert(!u.canonContradiction());
+// assert(u.canonicized());
+// for(Overlap o : u.overlapList){
+// assert(!o.canonContradiction());
+// assert(o.type==FORWARD) : "\n\no="+o+"\ni="+i+", u.overlapList.size="+u.overlapList.size()+"\n"+
+// o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+
+// "\n"+visited.contains(o.u1)+", "+visited.contains(o.u2)+", "+visited.size()+
+// "\n"+u.overlapList;
+// }
+// }
+// }
+
+ final int numUnvisited=cluster.size()-visits;
+ ArrayList<Unit> pruned=(numUnvisited==0 ? null : new ArrayList<Unit>(numUnvisited));
+ assert(visits==visited.size());
+ assert(visits>=1 && visits<=cluster.size());
+
+ if(visits<cluster.size()){
+ pruned=new ArrayList<Unit>(cluster.size()-visits); //NOTE(review): replaces the list allocated a few lines above with an equal-sized one
+ for(Unit ua : cluster){
+ if(!ua.visited()){
+ pruned.add(ua);
+ ArrayList<Overlap> list=ua.overlapList;
+ if(list!=null){
+ int removed=0;
+ for(int j=0; j<list.size(); j++){
+ Overlap o=list.get(j);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1);
+ assert(o.u1==ua || o.u2==ua);
+ assert(ua!=ub);
+ assert(ub.valid());
+
+ if(ub.visited() || o.invalid()){ //detach the unreached Unit from the retained part
+ assert(ub.visited()==o.invalid()) : "\n"+o+"\n"+ub;
+ list.set(j, null);
+ removed++;
+ }
+ }
+ if(removed>0){Tools.condenseStrict(list);}
+ }
+ }
+ }
+ assert(pruned.size()==numUnvisited);
+ }else{
+ assert(prunedOverlaps==0) : "If this fails then I may need to mark overlaps to remove.";
+ }
+ for(Unit u : cluster){
+ assert(u.isPerfectlyTransitive()) : u;
+ if(EA){
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){assert(!o.invalid());}
+ }
+ }
+ if(u.visited()){u.setVisited(false);}
+ }
+ cluster.clear();
+ cluster.addAll(visited); //cluster now holds exactly the Units reached from the root, in visit order
+ cluster.trimToSize();
+
+// for(Unit u : cluster){
+//// assert(u.canonicized());
+// for(Overlap o : u.overlapList){
+// assert(pruned==null || !pruned.contains(o.u1));
+// assert(pruned==null || !pruned.contains(o.u2));
+// assert(cluster.contains(o.u1));
+// assert(cluster.contains(o.u2));
+// }
+// }
+// if(pruned!=null){
+// for(Unit u : pruned){
+// for(Overlap o : u.overlapList){
+// assert(pruned.contains(o.u1));
+// assert(pruned.contains(o.u2));
+// assert(!cluster.contains(o.u1));
+// assert(!cluster.contains(o.u2));
+// }
+// }
+// }
+
+ visited.clear();
+ return pruned;
+ }
+
+ /**
+ * Cluster should already be ordered breadth-first
+ * This may fail because removing cycles could change breadth-first traversal, but if it fails, an assertion will be thrown
+ * @param cluster Units to scan. @param remove If true, cyclic Overlaps are deleted from the list being scanned.
+ * @return Number of Overlaps newly flagged cyclic by this call.
+ */
+ private int findCycles(ArrayList<Unit> cluster, boolean remove){
+
+ {
+ final Unit root=cluster.get(0);
+ assert(root.length()>=cluster.get(cluster.size()-1).length());
+ root.setVisited(true);
+ }
+ int cycles=0;
+
+ for(Unit ua : cluster){
+ assert(ua.visited()); //relies on breadth-first order: an earlier Unit must already have reached this one
+ ArrayList<Overlap> list=ua.overlapList;
+ if(list!=null){
+ int removed=0;
+ for(int i=0; i<list.size(); i++){
+ Overlap o=list.get(i);
+ Unit ub=(o.u1==ua ? o.u2 : o.u1);
+ assert(o.u1==ua || o.u2==ua);
+ assert(ua!=ub);
+ assert(ub.valid());
+
+ if(!o.visited()){
+ o.setVisited(true);
+ if(ub.visited()){ //an untraversed Overlap that leads to an already-reached Unit
+ if(!o.cyclic()){
+ o.setCyclic(true);
+ cycles++;
+ }
+ }else{
+ ub.setVisited(true);
+ }
+ }
+ if(remove && o.cyclic()){
+ list.set(i, null);
+ removed++;
+ }
+ }
+ if(removed>0){Tools.condenseStrict(list);}
+ }
+ }
+ //Clear the visited marks on Units and on the Overlaps still in their lists.
+ for(Unit u : cluster){
+ if(u.visited()){u.setVisited(false);}
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ if(o.visited()){o.setVisited(false);}
+ }
+ }
+ }
+
+ return cycles;
+ }
+
+ /**
+ * Cluster should already be ordered breadth-first
+ * @param cluster Units to assign offsets to, starting from cluster.get(0) at offset 0. @param temp Empty scratch list; on return it holds the Units reached. @return Number of contradictions reported by setOffsetsNeighbors.
+ */
+ private int generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp){
+
+
+ assert(temp!=null);
+ assert(temp.isEmpty());
+ {
+ final Unit root=cluster.get(0);
+ assert(root.length()>=cluster.get(cluster.size()-1).length());
+ root.setOffset(0);
+ temp.add(root);
+ }
+
+ int contradictions=0;
+ for(int i=0; i<temp.size(); i++){ //temp grows while it is iterated; it serves as the traversal queue
+ Unit u=temp.get(i);
+ assert(!u.visited()) : i;
+ assert(u.offsetValid() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp);
+ if(u.offsetValid() && !u.offsetContradiction()){
+ contradictions+=setOffsetsNeighbors(u, temp);
+ assert(contradictions==0 || (i>0 && temp.size()>2));
+ }
+ }
+
+ int min=0;
+ for(Unit u : temp){
+ if(u.visited()){u.setVisited(false);}
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ if(o.visited()){o.setVisited(false);}
+ }
+ }
+ if(u.offsetValid() && !u.offsetContradiction()){
+ min=Tools.min(min, u.offset());
+ }
+ }
+
+ if(verbose){
+ System.err.println("min offset = "+min);
+ }
+ //Shift every valid offset by min, the smallest offset among non-contradictory Units (never above 0).
+ for(Unit u : temp){
+ if(u.offsetValid()){
+ if(verbose){System.err.println("Set "+u.name()+" offset from "+u.offset+" to "+(u.offset-min));}
+ u.offset=u.offset-min;
+ }
+ }
+
+
+ return contradictions;
+ }
+
+ /**
+ * Marks root visited, counts its offset contradictions, then calls setOffset for each unvisited, non-contradictory Overlap whose other Unit is not contradictory.
+ * @param root Unit with a valid, non-contradictory offset. @param temp Receives each neighbor that had no valid offset yet. @return Contradiction count from countOffsetContradictions.
+ */
+ private int setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp) {
+ if(verbose){System.err.println("\nsetOffsetsNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(!root.visited());
+ assert(root.offsetValid());
+ assert(!root.offsetContradiction());
+ root.setVisited(true);
+ if(root.overlapList==null){return 0;}
+ final int contradictions=countOffsetContradictions(root, false); //also sets contradiction flags on root, its Overlaps and neighbors
+ if(verbose){System.err.println("\ncontradictions="+contradictions);}
+ for(Overlap o : root.overlapList){
+ Unit u=(o.u1==root ? o.u2 : o.u1);
+ assert(o.u1==root || o.u2==root);
+ assert(root!=u);
+ assert(u.valid());
+
+ if(verbose){System.err.println("\nProcessing Overlap "+o);}
+ if(!o.visited() && !o.offsetContradiction()){
+ o.setVisited(true);
+ if(!u.offsetContradiction()){
+ if(verbose){System.err.println("Calling setOffset: "+o);}
+ if(!u.offsetValid()){temp.add(u);} //first time this neighbor is reached: queue it
+ boolean b=setOffset(root, u, o);
+ if(verbose){System.err.println("Finished setOffset: "+o);}
+
+// if(x>0){
+// if(verbose){System.err.println("\n*********************************************");}
+// if(verbose){System.err.println("Problem detected with contig "+u.name());}
+// if(verbose){System.err.println("*********************************************\n");}
+// verbose=true;
+// int y2=countOffsetContradictions(root, false);
+// assert(contradictions==y2);
+// }
+
+ assert(b) : "\n"+contradictions+", "+o.offsetContradiction()+", "+root.offsetContradiction()+", "+u.offsetContradiction()+"\n"
+ +root.offsetValid()+", "+u.offsetValid()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b
+ +fixMultiJoins; //This assertion can fail if a multijoin is present
+ assert(u.offsetValid());
+ }
+ }
+ }
+ return contradictions;
+ }
+ /** Counts root's Overlaps whose neighbor already has an offset different from calcOffset's result, flagging the Overlap and both Units; neighbors already flagged are counted only when includeKnown is true. */
+ private int countOffsetContradictions(Unit root, boolean includeKnown){
+ if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(root.visited());
+ assert(root.offsetValid());
+// assert(!root.offsetContradiction());
+ if(root.overlapList==null){return 0;}
+ int contradictions=0;
+ for(Overlap o : root.overlapList){
+ Unit u=(o.u1==root ? o.u2 : o.u1);
+ assert(o.u1==root || o.u2==root);
+ assert(root!=u);
+ assert(u.valid());
+
+ if(verbose){System.err.println("\nOverlap "+o+"\nu="+u.name()+", offsetValid="+u.offsetValid());}
+
+ boolean contradictory=(u.offsetValid() && u.offset()!=calcOffset(root, u, o)); //neighbors without an offset cannot contradict
+ if(verbose){System.err.println("contradictory= \t"+contradictory);}
+ if(contradictory){
+ if(includeKnown || !u.offsetContradiction()){
+ contradictions++;
+ if(!root.offsetContradiction()){root.setOffsetContradiction(true);}
+ }
+ if(!o.offsetContradiction()){o.setOffsetContradiction(true);}
+ if(!u.offsetContradiction()){u.setOffsetContradiction(true);}
+ }
+ assert(contradictory==o.offsetContradiction()) : contradictory+", "+o.offsetContradiction();
+ if(verbose){
+ System.err.println("root.offsetContradiction()=\t"+root.offsetContradiction());
+ System.err.println("u.offsetContradiction()= \t"+u.offsetContradiction());
+ System.err.println("o.offsetContradiction()= \t"+o.offsetContradiction());
+ System.err.println("contradictions= \t"+contradictions);
+ }
+ }
+ if(verbose){System.err.println("Final contradictions="+contradictions+"\n");}
+ return contradictions;
+ }
+
+ /**
+ * Cluster should already be ordered breadth-first
+ * @param cluster Units to canonicize, starting from cluster.get(0). @param temp Empty scratch list used as the traversal queue. @return Number of contradictions reported by canonicizeNeighbors.
+ */
+ private int canonicizeClusterBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp) {
+
+ assert(temp!=null);
+ assert(temp.isEmpty());
+ {
+ final Unit root=cluster.get(0);
+ assert(root.length()>=cluster.get(cluster.size()-1).length());
+ root.setCanonicized(true); //the first Unit is marked canonicized before the traversal starts
+ temp.add(root);
+ }
+
+ int contradictions=0;
+ for(int i=0; i<temp.size(); i++){ //temp is passed to canonicizeNeighbors on each step
+ final Unit u=temp.get(i);
+ assert(!u.visited()) : i;
+ assert(u.canonicized() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp);
+ if(u.canonicized() && !u.canonContradiction()){
+ contradictions+=canonicizeNeighbors(u, temp);
+ assert(contradictions==0 || (i>0 && temp.size()>2));
+
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) :
+ o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+
+ "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited();
+ }
+ }
+ }
+
+// if(u.r.numericID==59462 || u.r.numericID==56439){ //TODO: remove
+// System.err.println("\nid="+u.r.numericID+", canonicized="+u.canonicized()+", contradiction="+u.canonContradiction()+", visited="+u.visited());
+// for(Overlap o : u.overlapList){
+// Unit u2=(o.u1==u ? o.u2 : o.u1);
+// assert(o.u1==u || o.u2==u);
+// assert(u2!=u);
+// assert(u2.valid());
+// System.err.println("o = "+o);
+// System.err.println("o.contradiction="+o.canonContradiction());
+// System.err.println("u2.id="+u2.r.numericID+", canonicized="+u2.canonicized()+", contradiction="+u2.canonContradiction()+", visited="+u.visited());
+// }
+// }
+ }
+ //Clear the visited marks of every Unit that was queued.
+ for(Unit u : temp){
+ if(u.visited()){u.setVisited(false);}
+ if(EA){
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){assert(!o.visited());}
+ }
+ }
+ }
+
+ return contradictions;
+ }
+
+ /**
+ * Canonicizes the direct neighbors of an already-canonicized Unit.
+ * Marks root as visited, flags contradictions via countCanonContradictions(root, false), and then calls
+ * canonicize for every overlap where neither the overlap nor the neighbor carries a canon contradiction.
+ * Neighbors that were not yet canonicized are flagged as such and appended to the supplied list.
+ * @param root Valid, canonicized, unvisited Unit without a canon contradiction (all asserted)
+ * @param canonicized List that receives each neighbor newly flagged as canonicized
+ * @return Value returned by countCanonContradictions for root, or 0 if root has no overlap list
+ */
+ private int canonicizeNeighbors(Unit root, ArrayList<Unit> canonicized) {
+ if(verbose){System.err.println("\ncanonicizeNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(!root.visited());
+ assert(root.canonicized());
+ assert(!root.canonContradiction());
+ root.setVisited(true);
+ if(root.overlapList==null){return 0;}
+ final int contradictions=countCanonContradictions(root, false); //May set contradiction flags on root, its overlaps and its neighbors
+ if(verbose){System.err.println("\ncontradictions="+contradictions);}
+ for(Overlap o : root.overlapList){
+ Unit u=(o.u1==root ? o.u2 : o.u1); //The Unit at the other end of the overlap
+ assert(o.u1==root || o.u2==root);
+ assert(root!=u);
+ assert(u.valid());
+
+ if(verbose){System.err.println("\nProcessing Overlap "+o);}
+ if(!o.canonContradiction()){
+ if(!u.canonContradiction()){
+ boolean b=u.canonicized(); //State before canonicize(); only used in the assertion message below
+ int dir=o.type; //Overlap type before canonicize(); only used in the assertion message below
+ if(verbose){System.err.println("Calling canonicize: "+o);}
+ int x=canonicize(root, u, o);
+ if(verbose){System.err.println("Finished canonicize: "+o);}
+
+// if(x>0){
+// if(verbose){System.err.println("\n*********************************************");}
+// if(verbose){System.err.println("Problem detected with contig "+u.name());}
+// if(verbose){System.err.println("*********************************************\n");}
+// verbose=true;
+// int y2=countCanonContradictions(root, false);
+// assert(contradictions==y2);
+// }
+
+ assert(x==0 || (u.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC)));
+ assert(x==0) : "\n"+x+", "+contradictions+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+u.canonContradiction()+"\n"
+ +root.canonicized()+", "+u.canonicized()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b+", "+dir
+ +fixMultiJoins; //This assertion can fail if a multijoin is present
+ if(!u.canonicized()){
+ u.setCanonicized(true);
+ canonicized.add(u);
+ }
+ assert(u.canonicized());
+ }
+ }
+ }
+ if(EA){ //Extra verification, only when EA is set
+ for(Overlap o : root.overlapList){
+ assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) :
+ o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+root.canonContradiction()+
+ "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited();
+ }
+ }
+ return contradictions;
+ }
+
+ private int countCanonContradictions(Unit root, boolean includeKnown){ /*
+ * Flags canon contradictions among root's overlaps and counts them.
+ * An overlap is contradictory when the neighbor is already canonicized and the overlap type is
+ * FORWARDRC or REVERSERC. Such overlaps are always flagged; root and the neighbor are flagged, and the
+ * count incremented, only when includeKnown is true or the neighbor was not already flagged.
+ * Returns the count, or 0 when root has no overlap list.
+ */
+ if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");}
+ assert(root.valid());
+ assert(root.visited());
+ assert(root.canonicized());
+// assert(!root.canonContradiction());
+ if(root.overlapList==null){return 0;}
+ int contradictions=0;
+ for(Overlap o : root.overlapList){
+ Unit ub=(o.u1==root ? o.u2 : o.u1); //The Unit at the other end of the overlap
+ assert(o.u1==root || o.u2==root);
+ assert(root!=ub);
+ assert(ub.valid());
+
+ if(verbose){System.err.println("\nOverlap "+o+"\nu="+ub.name()+", canonicized="+ub.canonicized());}
+
+ boolean contradictory=(ub.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC));
+ if(verbose){System.err.println("contradictory= \t"+contradictory);}
+ if(contradictory){
+ if(!o.canonContradiction()){o.setCanonContradiction(true);}
+ if(includeKnown || !ub.canonContradiction()){ //Skip neighbors already flagged unless the caller asked for them
+ contradictions++;
+ if(!root.canonContradiction()){root.setCanonContradiction(true);}
+ if(!ub.canonContradiction()){ub.setCanonContradiction(true);}
+ }
+ }
+
+ assert(!o.canonContradiction() || (root.canonContradiction() || ub.canonContradiction())) :
+ "\n"+contradictory+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+ub.canonContradiction();
+
+ assert(contradictory==o.canonContradiction()) : contradictory+", "+o.canonContradiction();
+ if(verbose){
+ System.err.println("root.canonContradiction()=\t"+root.canonContradiction());
+ System.err.println("u.canonContradiction()= \t"+ub.canonContradiction());
+ System.err.println("o.canonContradiction()= \t"+o.canonContradiction());
+ System.err.println("contradictions= \t"+contradictions);
+ }
+ }
+ if(verbose){System.err.println("Final contradictions="+contradictions+"\n");}
+ return contradictions;
+ }
+
+ private String toString(ArrayList<Unit> cluster){
+ for(int i=0; i<cluster.size(); i++){
+ Unit u=cluster.get(i);
+ u.r.id=""+i;
+ }
+ StringBuilder sb=new StringBuilder(1000);
+ for(Unit u : cluster){
+ sb.append(">"+u.name()+"\n");
+ sb.append(new String(u.bases()));
+ sb.append("\n");
+ }
+ sb.append("\n*****\n");
+ for(Unit u : cluster){
+ sb.append("\n"+u.name()+":");
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ Unit ub=(o.u1==u ? o.u2 : o.u1);
+ sb.append(" "+ub.name());
+ }
+ }
+ }
+ sb.append("\n");
+ return sb.toString();
+ }
+
+ private String toShortString(ArrayList<Unit> cluster){
+ for(int i=0; i<cluster.size(); i++){
+ Unit u=cluster.get(i);
+ u.r.id=""+i;
+ }
+ StringBuilder sb=new StringBuilder(1000);
+ for(Unit u : cluster){
+ sb.append("\n"+u.name()+":");
+ if(u.overlapList!=null){
+ for(Overlap o : u.overlapList){
+ Unit ub=(o.u1==u ? o.u2 : o.u1);
+ sb.append(" "+ub.name());
+ }
+ }
+ }
+ sb.append("\n");
+ return sb.toString();
+ }
+
+
+ /**
+ * Brings overlap o into FORWARD type, reverse-complementing u2 if necessary.
+ * FORWARD overlaps are left alone. For FORWARDRC/REVERSERC overlaps, u2 is reverse-complemented and every
+ * overlap in u2's list is flipped, unless u2 is already canonicized, in which case nothing is changed and
+ * a contradiction is reported. A REVERSE overlap then has its direction reversed.
+ * @param root Unit on the already-canonicized side of the overlap (not read by this method)
+ * @param u2 Unit on the other side; may be reverse-complemented
+ * @param o Overlap joining the two Units
+ * @return Number of contradictions (0 or 1)
+ */
+ private int canonicize(final Unit root, final Unit u2, final Overlap o){
+ if(o.type==FORWARD){return 0;}
+ if(o.type==FORWARDRC || o.type==REVERSERC){
+ if(u2.canonicized()){return 1;} //u2's orientation is already fixed, so it cannot be flipped
+ u2.reverseComplement();
+ unitsFlippedT++;
+ for(Overlap o2 : u2.overlapList){ //The assertion below relies on this loop also changing o's type
+ overlapsFlippedT++;
+ o2.flip(u2, bandy);
+ }
+ assert(o.type==FORWARD || o.type==REVERSE) : OVERLAP_TYPE_NAMES[o.type];
+ }
+ if(o.type==REVERSE){o.reverseDirection();}
+ assert(o.type==FORWARD);
+ assert(o.test(bandy, o.edits+maxEdits));
+ return 0;
+ }
+
+
+ /**
+ * @param root
+ * @param u2
+ * @param o
+ * @return true if no contradictions
+ */
+ private boolean setOffset(final Unit root, final Unit u2, final Overlap o){
+ assert(root.offsetValid());
+ assert(!root.offsetContradiction());
+ int offset=calcOffset(root, u2, o);
+
+ if(u2.offsetValid()){return u2.offset()==offset;}
+ u2.setOffset(offset);
+
+ if(verbose){
+ System.err.println("\nroot = "+(root.name()==null ? root.r.numericID+"" : root.name())+", u2 = "+(u2.name()==null ? u2.r.numericID+"" : u2.name())
+ +"\no = "+o
+ +"\nroot.offset = "+root.offset()
+ +"\nu2.offset = "+u2.offset());
+ }
+
+ return true;
+ }
+
+
+ private int calcOffset(final Unit root, final Unit ub, final Overlap o){
+ assert(root.offsetValid());
+ if(o.type==FORWARD){
+ if(root==o.u1){
+ int dif=o.start1-o.start2;
+ if(verbose){System.err.println("root==o.u1=="+root.name()+", start1="+o.start1+"; u2==o.u2=="+ub.name()+", start2="+o.start2+", dif="+dif);}
+ return root.offset+dif;
+ }else{
+ int dif=o.start2-o.start1;
+ if(verbose){System.err.println("root==o.u2=="+root.name()+", start2="+o.start2+"; u2==o.u1=="+ub.name()+", start1="+o.start1+", dif="+dif);}
+ return root.offset+dif;
+ }
+ }else{
+ assert(false) : o;
+ throw new RuntimeException("TODO");
+ }
+ }
+
+
+ /**
+ * Unimplemented placeholder (the name suggests merging of leaf Units - TODO confirm intent).
+ * Fails an assertion when assertions are enabled; otherwise it iterates the cluster without doing anything.
+ * @param cluster Units of one cluster
+ */
+ private void mergeLeaves(ArrayList<Unit> cluster) {
+ assert(false) : "TODO";
+ for(Unit u : cluster){
+
+ }
+ }
+
+ /**
+ * Unimplemented placeholder (the name suggests merging of inner Units - TODO confirm intent).
+ * Fails an assertion when assertions are enabled; otherwise it iterates the cluster without doing anything.
+ * @param cluster Units of one cluster
+ */
+ private void mergeInner(ArrayList<Unit> cluster) {
+ assert(false) : "TODO";
+ for(Unit u : cluster){
+
+ }
+ }
+
+ private ArrayList<ArrayList<Unit>> processedClustersT=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
+
+ long leafMergesT=0;
+ long innerMergesT=0;
+ long leafBaseMergesT=0;
+ long innerBaseMergesT=0;
+
+ long multiJoinFailuresT=0;
+ long multiJoinsFoundT=0;
+ long multiJoinBasesFoundT=0;
+ long unitsFlippedT=0;
+ long overlapsFlippedT=0;
+ long canonContradictoryOverlapsT=0;
+ long canonContradictoryClustersT=0;
+ long offsetContradictoryOverlapsT=0;
+ long offsetContradictoryClustersT=0;
+ long cycleOverlapsT=0;
+ long cycleClustersT=0;
+
+ private final boolean fixMultiJoinsT;
+ private final boolean canonicizeT;
+ private final boolean fixCanonContradictionsT;
+ private final boolean fixOffsetContradictionsT;
+ private final boolean mergeClustersT;
+ private final boolean mergeLeavesT;
+ private final boolean mergeInnerT;
+ private final BandedAligner bandy;
+ }
+
+
+ /**
+ * @param cluster
+ */
+ private void unvisit(ArrayList<Unit> cluster) {
+ for(Unit u : cluster){
+ if(u.visited()){u.setVisited(false);}
+ }
+ }
+
+ /**
+  * Reorders a cluster in place into breadth-first order.
+  * The cluster is sorted first and its first element becomes the root. Units are then appended in the
+  * order in which they are reached by following overlaps, with each Unit's overlap list sorted before
+  * it is expanded. Visited flags used during the walk are cleared again before returning.
+  * Units without an overlap list are skipped during expansion instead of causing a NullPointerException.
+  * @param cluster Units of one cluster; its contents are replaced by the breadth-first ordering
+  */
+ private void reorderClusterBreadthFirst(ArrayList<Unit> cluster) {
+     if(verbose){System.err.println("reorderClusterBreadthFirst");}
+
+     final int size=cluster.size();
+     Collections.sort(cluster); //Now it is in descending length
+     final Unit root=cluster.get(0);
+     assert(root.length()>=cluster.get(size-1).length()) : root.length()+", "+cluster.get(size-1).length()+", "+root.compareTo(cluster.get(size-1));
+
+     ArrayList<Unit> breadthFirst=new ArrayList<Unit>(cluster.size());
+     root.setVisited(true);
+ //    System.err.println("root="+root.name());
+     breadthFirst.add(root);
+     for(int i=0; i<breadthFirst.size(); i++){ //breadthFirst grows while it is being walked
+         Unit u=breadthFirst.get(i);
+         if(u.overlapList!=null){
+             //Sort only after the null check; sorting first would throw on a Unit without overlaps.
+             Collections.sort(u.overlapList); //Sorted in descending overlap length
+             for(Overlap o : u.overlapList){
+                 if(!o.u1.visited()){
+                     // System.err.println("Visiting "+o.u1.name());
+                     o.u1.setVisited(true);
+                     breadthFirst.add(o.u1);
+                 }
+                 if(!o.u2.visited()){
+                     // System.err.println("Visiting "+o.u2.name());
+                     o.u2.setVisited(true);
+                     breadthFirst.add(o.u2);
+                 }
+                 // System.err.println("***");
+                 // System.err.println(toShortString(breadthFirst));
+             }
+         }
+     }
+     for(Unit u : cluster){
+         assert(u.visited()); //Every Unit of the cluster must have been reached from the root
+         if(u.visited()){u.setVisited(false);}
+         if(EA){
+             if(u.overlapList!=null){
+                 for(Overlap o : u.overlapList){assert(!o.visited());}
+             }
+         }
+     }
+ //    System.err.println("***");
+ //    System.err.println("Final:");
+ //    System.err.println(toShortString(breadthFirst));
+     assert(cluster.size()==breadthFirst.size());
+     cluster.clear();
+     cluster.addAll(breadthFirst);
+ }
+
+
+
+ /** Returns next cluster larger than 1 element.
+ * Singleton clusters are added directly to 'processed'. */
+ private ArrayList<Unit> nextCluster(){
+ synchronized(clusterQueue){
+ ArrayList<Unit> cluster=clusterQueue.poll();
+ assert(cluster==null || cluster.size()>1);
+// while(cluster!=null && cluster.size()<2){
+//// unmark(cluster);
+// processedClustersT.add(cluster);
+// cluster=clusterQueue.poll();
+// }
+ return cluster;
+ }
+ }
+
+
+ /**
+ * Creates Unit objects or uses ones already attached to reads.
+ * Places them in local storage and percolates them to shared storage (codeMap), removing exact duplicates.
+ * Also hashes tips and places these in shared affixMap.
+ * Looks for containments in the affix map.
+ * @author Brian Bushnell
+ * @date Jul 24, 2013
+ *
+ */
+ private final class HashThread extends Thread{
+
+ public HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_){
+ addToCodeMapT=addToCodeMap_;
+ addToAffixMapT=addToAffixMap_;
+ findContainmentsT=findContainments_;
+ findOverlapsT=findOverlaps_;
+ findMatchesT=findMatches_;
+ tid=getTid();
+ crisq=new ArrayDeque<ConcurrentReadInputStream>(crisa.length);
+ for(int i=0; i<crisa.length; i++){
+// if(verbose){System.err.println("Adding to crisq.");}
+ crisq.add(crisa[(i+tid)%crisa.length]);
+ }
+ bandy=(maxEdits>0 && (findOverlapsT || findContainmentsT) ? BandedAligner.makeBandedAligner(bandwidth) : null);
+
+// assert(addToCodeMapT) : "addToCodeMapT="+addToCodeMapT+", addToAffixMapT="+addToAffixMapT+", findContainmentsT="+findContainmentsT+
+// ", findOverlapsT="+findOverlapsT+", findMatchesT="+findMatchesT+", convertToUpperCaseT="+convertToUpperCaseT+", numAffixMaps="+numAffixMaps;
+ }
+
+ /**
+  * Thread body. Takes input streams from crisq one at a time; for each stream it fetches read lists
+  * until a null or empty list arrives, passing every read to processReadOuter. The thread-local code
+  * map is merged into shared storage whenever it exceeds the buffering limits and once more after each
+  * stream. The thread-local buffers are released at the end.
+  */
+ public void run(){
+
+     ConcurrentReadInputStream cris=crisq.poll();
+
+     while(cris!=null){
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+         // long xx=0;
+         while(reads!=null && reads.size()>0){
+
+             for(Read r : reads){
+                 processReadOuter(r);
+             }
+
+             if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){
+                 assert(addToCodeMapT);
+                 long added=mergeMaps();
+                 addedToMainT+=added;
+             }
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         //The loop also ends when nextList() gave null or a ListNum without a list, so guard both
+         //before handing the final list back; a missing list is reported as empty.
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+         if(codeMapT!=null && !codeMapT.isEmpty()){
+             long added=mergeMaps();
+             addedToMainT+=added;
+         }
+         cris=crisq.poll();
+     }
+
+     //Drop references to the thread-local buffers
+     codeMapT=null;
+     localConflictList=null;
+     sharedConflictList=null;
+ }
+
+ /** Processes a read and, if present, its mate.
+  * When this thread is not adding to the code map and the read has no Unit attached yet, a Unit is
+  * created and, if it belongs to the current subset, the read is swapped for the equal read with the same
+  * numericID that is already stored in the shared code map; if no such read is stored, the read is skipped.
+  * @param r1 First read of a pair (pairnum 0), or an unpaired read; reads shorter than MINSCAF are skipped
+  * @return true if this read was a member of this subset. */
+ private boolean processReadOuter(Read r1){
+     if(r1.length()<MINSCAF){return false;}
+     Read r2=r1.mate;
+
+     assert(r1.pairnum()==0);
+     assert(r2==null || r2.pairnum()==1);
+
+     if(!addToCodeMapT && r1.obj==null){
+         if(r1.bases!=null && r1.length()>=MINSCAF){
+             final Unit u=(r1.obj!=null ? (Unit)r1.obj : new Unit(r1));
+             assert(u.r==r1 && (r1.obj==u || r1.obj==null));
+             final long code=u.code1;
+             r1.obj=u;
+             assert(u.r==r1 && r1.obj==u);
+             if(r2!=null && r2.obj==null){r2.obj=new Unit(r2);}
+
+             //Check for subset membership
+             final boolean inSet=u.inSet();
+             if(inSet){
+                 final Long codeL=code;
+                 ArrayList<Unit> list=codeMap.get(codeL);
+                 //Check before iterating; previously a missing entry threw a NullPointerException in the loop.
+                 assert(list!=null);
+                 boolean found=false;
+                 if(list!=null){
+                     for(Unit u0 : list){
+                         //Replace with existing read
+                         if(u0.equals(u) && u0.r.numericID==r1.numericID){
+                             r1=u0.r;
+                             r2=r1.mate;
+                             found=true;
+                             break;
+                         }
+                     }
+                 }
+                 if(!found){
+                     return false;
+                 }
+             }
+         }
+     }
+     boolean b=processRead(r1);
+     if(r2!=null){processRead(r2);}
+     return b;
+ }
+
+ /** Processes a single read: optionally strips its name and quality and force-trims it, attaches a Unit
+ * to it (and to its mate), optionally adds the Unit to the thread-local code map while detecting exact
+ * duplicates, and optionally searches for containments and overlaps.
+ * @param r Read to process; reads shorter than MINSCAF are ignored
+ * @return true if this read was a member of this subset (the value of Unit.inSet()); false for too-short reads */
+ private boolean processRead(Read r){
+ if(r.length()<MINSCAF){return false;}
+
+ final boolean inSet;
+ if(!storeName){r.id=null;}
+ if(!storeQuality){r.quality=null;}
+
+ if(forceTrimLeft>0 || forceTrimRight>0){//Added at request of RQC team
+ if(r!=null && r.length()>0){ //r was already dereferenced above, so the null test cannot fail here
+ TrimRead.trimToPosition(r, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? forceTrimRight : r.length(), 1);
+ }
+ }
+
+ readsProcessedT++;
+ basesProcessedT+=r.length();
+
+ final Unit u=(r.obj!=null ? (Unit)r.obj : new Unit(r)); //Reuse a Unit that is already attached to the read
+ assert(u.r==r && (r.obj==u || r.obj==null));
+ final long code=u.code1;
+
+ //Check for subset membership
+ inSet=u.inSet();
+
+ r.obj=u;
+ assert(u.r==r && r.obj==u);
+ if(r.mate!=null && r.mate.obj==null){r.mate.obj=new Unit(r.mate);}
+
+ if(verbose){System.err.println("Generated "+code+" for sequence "+u.name()+"\t"+new String(r.bases, 0, Tools.min(40, r.length())));}
+
+ if(addToCodeMapT && inSet){
+ final Long codeL=code;
+ ArrayList<Unit> list=codeMapT.get(codeL);
+ if(list==null){ //First Unit seen by this thread with this code
+ if(verbose){System.err.println("Unique.");}
+ list=new ArrayList<Unit>(1);
+ list.add(u);
+ basesStoredT+=r.length();
+ codeMapT.put(codeL, list);
+ }else{ //Same code seen before: either a true duplicate or a code collision
+ if(verbose){System.err.println("Exists.");}
+ boolean match=false;
+ if(findMatchesT){
+ for(Unit u2 : list){
+ if(pairedEqualsRC(u, u2)){
+// if(u.r.mate!=null){
+// verbose=true;
+//
+// Unit um=(Unit)u.r.mate.obj;
+// Unit u2m=(Unit)u2.r.mate.obj;
+//
+// if(verbose){
+// System.err.println("********");
+// System.err.println(u.r.toFastq());
+// System.err.println(u.r.mate.toFastq());
+// System.err.println("********");
+// System.err.println(u2.r.toFastq());
+// System.err.println(u2.r.mate.toFastq());
+// System.err.println("********");
+// System.err.println(u);
+// System.err.println(u2);
+// System.err.println(um);
+// System.err.println(u2m);
+// System.err.println("********");
+// System.err.println(u.equals(u2));
+// System.err.println(u.compareTo(u2));
+// System.err.println("********");
+// System.err.println(um.equals(u2m));
+// System.err.println(um.compareTo(u2m));
+// System.err.println("********");
+// }
+//
+// verbose=false;
+// }
+ assert(u.r.mate==null || pairedEqualsRC((Unit)u.r.mate.obj, (Unit)u2.r.mate.obj)) :
+ u.r.toFastq()+"\n"+u2.r.toFastq()+"\n"+u.r.mate.toFastq()+"\n"+u2.r.mate.toFastq()+
+ "\n"+u+"\n"+u2+"\n"+u.r.mate.obj+"\n"+u2.r.mate.obj;
+// if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));}
+ match=true;
+ u2.absorbMatch(u);
+ if(UNIQUE_ONLY){ //In this mode the previously stored copy is invalidated and counted as well
+ synchronized(u2){
+ if(u2.valid()){
+ matchesT++;
+ baseMatchesT+=u2.length();
+ u2.setValid(false);
+ addDupe(u2.r);
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+ if(match){
+ addDupe(r);
+ matchesT++;
+ baseMatchesT+=r.length();
+ // if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);}
+ }else{
+ collisionsT++;
+ if(verbose){System.err.println("False collision; count = "+collisionsT);}
+ list.add(u);
+ basesStoredT+=r.length();
+ }
+ }
+ }
+
+ if(findContainmentsT){
+ int x=findContainments(u); //Return value is not used
+ }
+
+ if(findOverlapsT){
+ int x=findOverlaps(u); //Return value is not used
+ }
+
+ return inSet;
+ }
+
+ private int findContainments(final Unit u){ /*
+ * Scans u with a rolling k-mer (2 bits per base, forward and reverse-complement), looks the larger of
+ * the two values up in every affix map, and tests each valid candidate Unit with u.contains().
+ * Contained Units are marked invalid and recorded via addDupe; with UNIQUE_ONLY set, u is treated the same way.
+ * Updates containmentsT, baseContainmentsT and containmentCollisionsT.
+ * Returns the number of valid candidates examined, 0 on the early exit below, or -1 if u has fewer than k bases.
+ */
+ if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;}
+ final byte[] bases=u.bases();
+ final int minlen=k-1;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int hits=0;
+ int currentContainments=0;
+
+ if(bases==null || bases.length<k){return -1;}
+ final LongM key=new LongM();
+
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=baseToNumber[b];
+ long x2=baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask; //Append the base to the forward k-mer
+ rkmer=(rkmer>>>2)|(x2<<shift2); //Prepend the complement to the reverse k-mer
+// if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(i>=minlen){ //At least k bases have been consumed
+ key.set(Tools.max(kmer, rkmer)); //Canonical
+ for(int am=0; am<affixMaps.length; am++){
+ ArrayList<Unit> list=affixMaps[am].get(key);
+ if(list!=null){
+ for(Unit u2 : list){
+ if(u!=u2 && !u.equals(u2)){
+ if(u2.valid()){
+ hits++;
+ if(verbose){
+ System.err.println("\nFound potential containment at am="+am+", i="+i+", key="+key.value()+
+ ", pre="+Arrays.toString(u2.prefixes)+", suf="+Arrays.toString(u2.suffixes)+
+ ", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k)));
+ }
+ if(u.contains(u2, i, key, bandy, am)){
+ synchronized(u2){ //Validity is re-checked while holding u2's monitor
+ if(u2.valid()){
+ currentContainments++;
+ baseContainmentsT+=u2.length();
+ u2.setValid(false);
+ addDupe(u2.r);
+ }
+ }
+ if(UNIQUE_ONLY){ //In this mode the containing Unit is invalidated and counted as well
+ synchronized(u){
+ if(u.valid()){
+ currentContainments++;
+ baseContainmentsT+=u.length();
+ u.setValid(false);
+ addDupe(u.r);
+ }
+ }
+ }
+
+ if(verbose){System.err.println("Added containment "+u2);}
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+// assert(false) : hits+", "+currentContainments+", "+baseContainments+"\n"+containmentMapT+"\n";
+
+ containmentCollisionsT+=(hits-currentContainments);
+// outstream.println("hits="+hits+", currentContainments="+currentContainments);
+ containmentsT+=currentContainments;
+ return hits;
+ }
+
+ private int findOverlaps(final Unit u){ /*
+ * Scans u with a rolling k-mer (2 bits per base, forward and reverse-complement), looks the larger of
+ * the two values up in every affix map, and tries to build an Overlap with each valid candidate Unit.
+ * A new Overlap is added to the overlap lists of both Units; the list of the Unit that orders first
+ * (by the comp value below) is checked and updated first. Scanning stops early once quit is set.
+ * Updates overlapsT, baseOverlapsT and overlapCollisionsT.
+ * Returns the number of candidates examined minus those whose overlap already existed,
+ * or -1 if u has fewer than k bases.
+ */
+// if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;}
+// if(u.overlapList!=null){u.overlapList.clear();}
+ final byte[] bases=u.bases();
+ final int minlen=k-1;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int hits=0;
+ int currentOverlaps=0;
+
+ if(bases==null || bases.length<k){return -1;}
+ final LongM key=new LongM();
+
+ boolean quit=false;
+
+ for(int i=0; i<bases.length && !quit; i++){
+ byte b=bases[i];
+ long x=baseToNumber[b];
+ long x2=baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask; //Append the base to the forward k-mer
+ rkmer=(rkmer>>>2)|(x2<<shift2); //Prepend the complement to the reverse k-mer
+// if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(i>=minlen){//valid key
+ key.set(Tools.max(kmer, rkmer)); //Canonical key
+ for(int am=0; am<affixMaps.length; am++){
+ ArrayList<Unit> list=affixMaps[am].get(key);
+ if(list!=null){//found a key collision
+ for(Unit u2 : list){
+ if(quit){break;}//too many edges
+ int u1cluster=-1, u2cluster=-2; //Distinct defaults so the cluster test below passes when clusters are not consulted
+ if(preventTransitiveOverlaps && u!=u2){
+ u1cluster=u.determineCluster();
+ u2cluster=u2.determineCluster();
+ }
+ if(u1cluster!=u2cluster && u!=u2 && !u.equals(u2) && u2.r!=u.r.mate){//TODO: Not sure why identical things are banned... possibly relates to avoiding inter-pair edges?
+ if(u2.valid()){
+ hits++;
+
+// boolean flag=(u.code1==-3676200394282040623L && u2.code1==-7034423913727372751L) ||
+// (u2.code1==-3676200394282040623L && u.code1==-7034423913727372751L);
+ final boolean flag=false; //Debug switch; always off
+ if(verbose || flag){
+ System.err.println("\nFound potential overlap at am="+am+", i="+i+", key="+key.value()+
+ ", pre="+Arrays.toString(u2.prefixes)+", suf="+Arrays.toString(u2.suffixes)+
+ ", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k)));
+ }
+
+ final Overlap o;
+ if(maxEdges>1000000000 || u.overlapList==null || u2.overlapList==null ||
+ (u.overlapList.size()<maxEdges && u2.overlapList.size()<maxEdges2)){
+ o=u.makeOverlap(u2, i, key, bandy, am);
+
+ }else{
+ o=null;
+ if(u.overlapList.size()>maxEdges){quit=true;} //NOTE(review): edges are only created while size()<maxEdges, yet this quits on size()>maxEdges - confirm '>' rather than '>=' is intended
+ }
+ if(o!=null){
+
+ if(preventTransitiveOverlaps){
+ mergeClusterIds(u1cluster, u2cluster);
+ }
+
+ assert(o.test(bandy, o.edits+maxEdits)) : o;
+ if(verbose || flag){System.err.println("Created overlap "+o);}
+
+ long comp=u.length()-u2.length(); //Tie-break chain that decides which Unit is handled first
+ if(comp==0){comp=u.code1-u2.code1;} //NOTE(review): long subtraction can overflow for widely separated codes - confirm the sign is still reliable
+ if(comp==0){comp=u.code2-u2.code2;}
+ if(comp==0){comp=u.prefixes[0]-u2.prefixes[0];}
+ if(comp==0){comp=u.suffixes[0]-u2.suffixes[0];}
+ if(comp==0){comp=(u.r.numericID-u2.r.numericID);}
+ assert(comp!=0) : u+", "+u2;
+ Unit ua=(comp<0 ? u : u2);
+ Unit ub=(comp<0 ? u2 : u);
+ assert(ua!=ub);
+ if(verbose || flag){
+ System.err.println("ua="+ua.code1);
+ System.err.println("ub="+ub.code1);
+ System.err.println("u ="+u.code1);
+ System.err.println("u2="+u2.code1);
+ System.err.println("u.r ="+u.r.numericID);
+ System.err.println("u2.r="+u2.r.numericID);
+ System.err.println("ua contains o? "+ua.alreadyHas(o));
+ System.err.println("ub contains o? "+ub.alreadyHas(o));
+ System.err.println("ua.list="+ua.overlapList);
+ System.err.println("ub.list="+ub.overlapList);
+ }
+
+// assert(ua.alreadyHas(o)==ub.alreadyHas(o));
+
+ final boolean uaContainedOverlap;
+
+ synchronized(ua){ //ua's list decides whether the overlap is new
+ if(ua.overlapList==null){ua.overlapList=new ArrayList<Overlap>(2);}
+ if(!ua.overlapList.contains(o)){
+ if(EA){
+ synchronized(ub){
+ assert(ub.overlapList==null || !ub.overlapList.contains(o)) :
+ ua.alreadyHas(o)+", "+ub.alreadyHas(o)+"\n"+o+"\n"+ub.overlapList.get(ub.overlapList.indexOf(o))+
+ "\nua.list="+ua.overlapList+"\nub.list="+ub.overlapList+"\nu.code1="+u.code1+"\nu2.code1="+u2.code1;
+ }
+ }
+ currentOverlaps++;
+ baseOverlapsT+=o.overlapLen;
+ ua.overlapList.add(o);
+ if(verbose || flag){System.err.println("Added overlap "+o);}
+ uaContainedOverlap=false;
+ }else{
+ if(verbose || flag){System.err.println("Already contained overlap "+o);}
+ hits--; //An overlap that already exists is not counted as a hit
+ uaContainedOverlap=true;
+ }
+ }
+
+ if(!uaContainedOverlap){
+ synchronized(ub){
+ if(ub.overlapList==null){ub.overlapList=new ArrayList<Overlap>(2);}
+ assert(!ub.overlapList.contains(o));
+ ub.overlapList.add(o);
+ if(verbose || flag){System.err.println("Added overlap "+o);}
+ }
+ }else{
+ if(verbose || flag){System.err.println("Already contained overlap "+o);}
+ }
+
+
+// assert(ua.alreadyHas(o));
+// assert(ub.alreadyHas(o));
+// assert(ua.overlapList.contains(o));
+// assert(ub.overlapList.contains(o));
+ if(verbose || flag){
+ System.err.println("ua contains o? "+ua.alreadyHas(o));
+ System.err.println("ub contains o? "+ub.alreadyHas(o));
+ System.err.println("ua.list="+ua.overlapList);
+ System.err.println("ub.list="+ub.overlapList);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ if(EA){
+ synchronized(u){
+ if(u.overlapList!=null && u.overlapList.isEmpty()){
+ assert(false) : "Why would this happen?";
+ u.overlapList=null;
+ }
+ }
+ }
+// assert(false) : hits+", "+currentOverlaps+", "+baseOverlaps+"\n"+overlapMapT+"\n";
+
+// assert(hits==currentOverlaps) : hits+", "+currentOverlaps;
+
+ overlapCollisionsT+=(hits-currentOverlaps);
+// outstream.println("hits="+hits+", currentOverlaps="+currentOverlaps);
+ overlapsT+=currentOverlaps;
+ return hits;
+ }
+
+ /** Insert reads processed by a thread into the shared code and affix maps.
+ * If operating in subset mode, only store reads with code equal to subset mod subsetCount.
+ * Step 1 (holding codeMap's monitor): keys absent from codeMap are moved over wholesale; keys already
+ * present are queued as conflicts. Step 2 (holding each shared list's monitor): conflicting Units are
+ * either absorbed into an equal stored Unit (when findMatchesT is set) or appended to the shared list.
+ * Step 3: if addToAffixMapT is set, every newly stored Unit is indexed by prefix and, when storeSuffix is
+ * set, by suffix in each affix map. The thread-local buffers are cleared afterwards.
+ * @return Number of Units newly stored in the shared code map (novel plus false collisions) */
+ private long mergeMaps(){
+ if(verbose){System.err.println("Merging maps.");}
+ long novelReads=0, novelKeys=0;
+ long collisionReads=0;
+ long mergedReads=0;
+
+ assert(localConflictList.isEmpty());
+ assert(sharedConflictList.isEmpty());
+
+ synchronized(codeMap){
+ for(Long key : codeMapT.keySet()){
+ if(codeMap.containsKey(key)){ //Key already shared: resolve later, outside this monitor
+ localConflictList.add(codeMapT.get(key));
+ sharedConflictList.add(codeMap.get(key));
+ }else{
+ ArrayList<Unit> list=codeMapT.get(key);
+ codeMap.put(key, list);
+ addedList.addAll(list);
+ novelReads+=list.size();
+ novelKeys++;
+ }
+ }
+ }
+
+ if(verbose){System.err.println("Novel reads = "+novelReads+", conflicts = "+localConflictList.size());}
+
+ for(int i=0; i<localConflictList.size(); i++){ //The two conflict lists are parallel: same index, same key
+ ArrayList<Unit> listT=localConflictList.get(i);
+ ArrayList<Unit> list=sharedConflictList.get(i);
+ synchronized(list){
+ for(Unit u : listT){
+ if(verbose){System.err.println("Processing novel unit "+u.name());}
+ boolean match=false;
+ if(findMatchesT){
+ for(Unit u2 : list){
+ if(pairedEqualsRC(u, u2)){
+ // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));}
+ u2.absorbMatch(u);
+ if(UNIQUE_ONLY){ //In this mode the stored copy is invalidated and counted as well
+ synchronized(u2){
+ if(u2.valid()){
+ mergedReads++;
+ baseMatchesT+=u2.length();
+ u2.setValid(false);
+ addDupe(u2.r);
+ }
+ }
+ }
+ match=true;
+ break;
+ }
+ }
+ }
+ if(match){
+ addDupe(u.r);
+ mergedReads++;
+ baseMatchesT+=u.length();
+ if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);}
+ }else{
+ collisionReads++;
+ if(verbose){System.err.println("False collision; count = "+collisionReads);}
+ list.add(u);
+ addedList.add(u);
+ }
+ }
+ }
+ }
+ matchesT+=mergedReads;
+ collisionsT+=collisionReads;
+ if(verbose){System.err.println("Done Merging.");}
+ if(verbose){System.err.println("mapT.size="+codeMapT.size()+", basesStoredT="+basesStoredT);}
+
+ codeMapT.clear();
+ localConflictList.clear();
+ sharedConflictList.clear();
+
+ if(!addedList.isEmpty()){
+ if(addToAffixMapT){
+ final LongM p=new LongM(-1, true); //Reusable lookup key; copies are stored via iCopy()
+ assert(affixMaps!=null);
+ assert(affixMaps[0]!=null || (affixMaps.length>1 && affixMaps[1]!=null));
+
+ for(int i=0; i<numAffixMaps; i++){
+ HashMap<LongM, ArrayList<Unit>> map=affixMaps[i];
+ if(map!=null && (i>0 || !ignoreAffix1)){
+ synchronized(map){
+ for(Unit u : addedList){
+ final long prefix=u.prefixes[i], suffix=u.suffixes[i];
+ if(verbose){System.err.println("Processing affixes for "+u.name());}
+ if(prefix!=-1 || prefix!=suffix){ //NOTE(review): this is false only when prefix and suffix are both -1 - confirm that is the intent
+ if(verbose){System.err.println("Using prefix "+prefix);}
+ p.set(prefix);
+ ArrayList<Unit> alu=map.get(p);
+ if(alu==null){
+ if(verbose){System.err.println("Made new alu for "+p);}
+ alu=new ArrayList<Unit>(2);
+ map.put(p.iCopy(), alu);
+ }
+ if(alu.size()<maxAffixCopies){ //Cap on the number of Units stored per affix key
+ if(verbose){System.err.println("Added "+u.name());}
+ alu.add(u);
+ }
+ if(verbose){System.err.println(map.get(p));}
+ }
+ if(storeSuffix && prefix!=suffix){
+ if(verbose){System.err.println("Using suffix "+suffix);}
+ p.set(suffix);
+ ArrayList<Unit> alu=map.get(p);
+ if(alu==null){
+ if(verbose){System.err.println("Made new alu for "+p);}
+ alu=new ArrayList<Unit>(2);
+ map.put(p.iCopy(), alu);
+ }
+ if(alu.size()<maxAffixCopies){
+ if(verbose){System.err.println("Added "+u.name());}
+ alu.add(u);
+ }
+ if(verbose){System.err.println(map.get(p));}
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ addedList.clear();
+ basesStoredT=0;
+ return collisionReads+novelReads;
+ }
+
+ /**
+  * Hands out the next thread id by post-incrementing tcount while holding the HashThread class monitor.
+  * @return Value of tcount before the increment
+  */
+ private int getTid(){
+     synchronized(HashThread.class){
+         return tcount++;
+     }
+ }
+
+ private LinkedHashMap<Long, ArrayList<Unit>> codeMapT=new LinkedHashMap<Long, ArrayList<Unit>>(threadMaxReadsToBuffer*8);
+ private ArrayList<Unit> addedList=new ArrayList<Unit>(threadMaxReadsToBuffer);
+ private ArrayList<ArrayList<Unit>> localConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
+ private ArrayList<ArrayList<Unit>> sharedConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
+
+ long matchesT=0;
+ long baseMatchesT=0;
+ long baseContainmentsT=0;
+ long collisionsT=0;
+ long containmentsT=0;
+ long containmentCollisionsT=0;
+ long basesStoredT=0;
+ long addedToMainT=0;
+ long readsProcessedT=0;
+ long basesProcessedT=0;
+ long overlapsT=0;
+ long baseOverlapsT=0;
+ long overlapCollisionsT=0;
+
+ private final boolean addToCodeMapT;
+ private final boolean addToAffixMapT;
+ private final boolean findContainmentsT;
+ private final boolean findOverlapsT;
+ private final boolean findMatchesT;
+// private final boolean convertToUpperCaseT;
+ private final int tid;
+ private final ArrayDeque<ConcurrentReadInputStream> crisq;
+ private final BandedAligner bandy;
+ }
+
+ /**
+  * Tests whether two base arrays are equal, comparing b in reverse-complement orientation when
+  * isCanonical gives different answers for the two arrays.
+  * @param a First base array (may be null)
+  * @param b Second base array (may be null)
+  * @return true if both are the same array, or have equal length and match base for base in the chosen
+  * orientation; false if exactly one is null or the lengths differ
+  */
+ public static boolean equalsRC(byte[] a, byte[] b){
+     if(a==b){return true;}
+     if(a==null || b==null || a.length!=b.length){return false;}
+
+     final int len=a.length;
+     final boolean canonA=isCanonical(a);
+     final boolean canonB=isCanonical(b);
+
+     if(canonA==canonB){
+         //Same orientation: plain positional comparison
+         for(int pos=0; pos<len; pos++){
+             if(a[pos]!=b[pos]){return false;}
+         }
+         return true;
+     }
+
+     //Opposite orientation: walk b backwards and complement each base
+     for(int pos=0; pos<len; pos++){
+         final byte complemented=baseToComplementExtended[b[len-1-pos]];
+         if(a[pos]!=complemented){return false;}
+     }
+     return true;
+ }
+
+ public static boolean pairedEqualsRC(Unit ua, Unit ub){ /*
+ * Tests two Units for equality, taking mates into account.
+ * The Units themselves must satisfy equalsRC(ua, ub). If both have reads with mates, they must also agree
+ * in canonical() and pairnum(), and the mates' bases must compare equal via Tools.compare.
+ * The two-letter strings printed in verbose mode are trace markers.
+ */
+ if(verbose){System.err.println("pairedEqualsRC("+ua.name()+", "+ub.name()+")");}
+ if(verbose){System.err.println("ea");}
+ boolean b=equalsRC(ua, ub);
+ if(verbose){System.err.println("eb");}
+ if(!b){return false;}
+ if(verbose){System.err.println("ec");}
+
+ if(ua.r!=null && ub.r!=null){
+ if(verbose){System.err.println("ed");}
+ assert((ua.r.mate==null)==(ub.r.mate==null)); //Either both reads are paired or neither is
+ if(verbose){System.err.println("ee");}
+ if(ua.r.mate!=null && ub.r.mate!=null){
+ if(verbose){System.err.println("ef");}
+ return ua.canonical()==ub.canonical() && ua.r.pairnum()==ub.r.pairnum() && Tools.compare(ua.r.mate.bases, ub.r.mate.bases)==0;
+ }
+ if(verbose){System.err.println("eg");}
+ }
+ if(verbose){System.err.println("eh");}
+ return true; //Unpaired (or read-less) Units that passed equalsRC
+ }
+
+ private static boolean equalsRC(Unit ua, Unit ub){
+ if(verbose){System.err.println("equalsRC("+ua.name()+", "+ub.name()+")");}
+ return ua.code1==ub.code1 && ua.code2==ub.code2 && (ua.canonical()==ub.canonical() ? (ua.prefixes[0]==ub.prefixes[0] && ua.suffixes[0]==ub.suffixes[0]) :
+ (ua.prefixes[0]==ub.suffixes[0] && ua.suffixes[0]==ub.prefixes[0])) && compareRC(ua, ub)==0;
+ }
+
+ /**
+  * Compares two Units with compareRC and, when that ties and both Units have reads with mates,
+  * breaks the tie first by pair number and then by compareRC on the Units attached to the mates.
+  * @param ua First Unit
+  * @param ub Second Unit
+  * @return Negative, zero or positive comparison result
+  */
+ public static int comparePairedRC(Unit ua, Unit ub){
+     if(verbose){System.err.println("comparePairedRC("+ua.name()+", "+ub.name()+")");}
+     final int primary=compareRC(ua, ub);
+     if(primary!=0){return primary;}
+
+     final boolean bothPaired=(ua.r!=null && ub.r!=null && ua.r.mate!=null && ub.r.mate!=null);
+     if(!bothPaired){return 0;}
+
+     final int pairDelta=ua.r.pairnum()-ub.r.pairnum();
+     if(pairDelta!=0){return pairDelta;}
+     return compareRC((Unit)ua.r.mate.obj, (Unit)ub.r.mate.obj);
+ }
+
+ //TODO
+ //This is really for sorting by length.
+ /**
+ * Orientation-aware ordering of two units.
+ * Order of comparison: identity; length (longer first); names, only when REQUIRE_MATCHING_NAMES
+ * is set and both names are non-null; then either tip hashes and codes (when at least one unit
+ * has no Read) or the bases themselves, read in the orientation implied by canonical();
+ * finally pairnum().
+ * The lettered verbose messages are trace points for debugging.
+ * @param ua First unit
+ * @param ub Second unit
+ * @return Negative, zero, or positive
+ */
+ private static int compareRC(Unit ua, Unit ub){
+ if(verbose){System.err.println("compareRC("+ua.name()+", "+ub.name()+")");}
+ if(ua==ub){return 0;}
+ if(verbose){System.err.println("a");}
+ if(verbose){System.err.println("a1");}
+ //Descending by length: the longer unit sorts first.
+ if(ua.length()!=ub.length()){return ub.length()-ua.length();}
+ if(verbose){System.err.println("a2");}
+
+ if(REQUIRE_MATCHING_NAMES){
+ if(ua.name()!=null && ub.name()!=null){
+ int x=ua.name().compareTo(ub.name());
+ if(x!=0){return x;}
+ }
+ }
+ if(verbose){System.err.println("a3");}
+
+ //No bases available for at least one unit: fall back to tip hashes and whole-sequence codes.
+ if(ua.r==null || ub.r==null){
+ if(verbose){System.err.println("b");}
+ if(verbose){System.err.println("b1");}
+ //A non-canonical unit contributes its suffix where a canonical one contributes its prefix, and vice versa.
+ if(ua.canonical()){
+ if(verbose){System.err.println("c");}
+ if(ub.canonical()){
+ if(ua.prefixes[0]!=ub.prefixes[0]){return ua.prefixes[0]>ub.prefixes[0] ? 1 : -1;}
+ if(ua.suffixes[0]!=ub.suffixes[0]){return ua.suffixes[0]>ub.suffixes[0] ? 1 : -1;}
+ }else{
+ if(ua.prefixes[0]!=ub.suffixes[0]){return ua.prefixes[0]>ub.suffixes[0] ? 1 : -1;}
+ if(ua.suffixes[0]!=ub.prefixes[0]){return ua.suffixes[0]>ub.prefixes[0] ? 1 : -1;}
+ }
+ }else{
+ if(verbose){System.err.println("d");}
+ if(ub.canonical()){
+ if(ua.suffixes[0]!=ub.prefixes[0]){return ua.suffixes[0]>ub.prefixes[0] ? 1 : -1;}
+ if(ua.prefixes[0]!=ub.suffixes[0]){return ua.prefixes[0]>ub.suffixes[0] ? 1 : -1;}
+ }else{
+ if(ua.suffixes[0]!=ub.suffixes[0]){return ua.suffixes[0]>ub.suffixes[0] ? 1 : -1;}
+ if(ua.prefixes[0]!=ub.prefixes[0]){return ua.prefixes[0]>ub.prefixes[0] ? 1 : -1;}
+ }
+ }
+ if(verbose){System.err.println("e");}
+ if(ua.code1!=ub.code1){return ua.code1>ub.code1 ? 1 : -1;}
+ if(ua.code2!=ub.code2){return ua.code2>ub.code2 ? 1 : -1;}
+
+ return ua.pairnum()-ub.pairnum();
+ }
+ if(verbose){System.err.println("f");}
+ final byte[] a=ua.r.bases, b=ub.r.bases;
+ if(a==b){return 0;}
+ //A null base array sorts before a non-null one.
+ if(a==null || b==null){return a==null ? -1 : 1;}
+ if(verbose){System.err.println("g");}
+
+ //Lengths are equal here (checked above), so a.length bounds both arrays.
+ if(ua.canonical()==ub.canonical()){
+ if(verbose){System.err.println("h");}
+ if(ua.canonical() && ub.canonical()){
+ //Both canonical: compare as stored.
+ for(int i=0; i<a.length; i++){
+ final byte aa=a[i], bb=b[i];
+ if(aa!=bb){return aa-bb;}
+ }
+ }else{
+ //Both non-canonical: compare the reverse-complements, i.e. complemented bases from the end.
+ for(int i=a.length-1; i>=0; i--){
+ final byte aa=baseToComplementExtended[a[i]], bb=baseToComplementExtended[b[i]];
+ if(aa!=bb){return aa-bb;}
+ }
+ }
+ }else{
+ if(verbose){System.err.println("i");}
+ if(ua.canonical()){
+ //a as stored versus reverse-complement of b.
+ for(int i=0, j=b.length-1; i<a.length; i++, j--){
+ final byte aa=a[i], bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){return aa-bb;}
+ }
+ }else{
+ //Reverse-complement of a versus b as stored.
+ for(int i=a.length-1, j=0; i>=0; i--, j++){
+ final byte aa=baseToComplementExtended[a[i]], bb=b[j];
+ if(aa!=bb){return aa-bb;}
+ }
+ }
+ }
+
+ if(verbose){System.err.println("j");}
+ return ua.pairnum()-ub.pairnum();
+ }
+
+ /**
+ * Hashes a k-length window at one end of a sequence into a 2-bit-per-base code.
+ * Both the forward kmer and its reverse-complement are built, and the larger is returned.
+ * @param bases Sequence to hash (may be null)
+ * @param prefix true to hash from the left end, false to hash from the right end
+ * @param k Window length in bases
+ * @param skipInitialBases Number of bases to skip inward from the chosen end before the window starts
+ * @return max(kmer, rkmer), or -1 if bases is null, skipInitialBases is negative,
+ * or the sequence is too short to hold the skipped bases plus a full window
+ */
+ private static long hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases){
+ //The window must fit after the skip; checking only bases.length<k let a non-zero skip
+ //index outside the array for short sequences.
+ if(bases==null || skipInitialBases<0 || bases.length-skipInitialBases<k){return -1;}
+
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int len=0;
+
+ final int start=(prefix ? 0+skipInitialBases : bases.length-k-skipInitialBases);
+ final int stop=start+k;
+
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=baseToNumber[b];
+ long x2=baseToComplementNumber[b];
+ //Forward kmer grows from the low bits; the reverse-complement is filled from the high bits.
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ len++;
+ }
+ final long code=Tools.max(kmer, rkmer);
+ if(verbose){System.err.println(new String(bases, start, k)+" = "+code);}
+ assert(len==k) : len+","+k;
+ return code;
+ }
+
+ /**
+ * Picks the edit allowance for a sequence of the given length.
+ * @param maxEdits Fixed allowance
+ * @param minIdentityMult Per-base multiplier; 0 disables length scaling
+ * @param len Sequence length
+ * @return maxEdits when minIdentityMult is 0, otherwise the larger of maxEdits and round(len*minIdentityMult)
+ */
+ private static final int calcMaxEdits(int maxEdits, float minIdentityMult, int len){
+ if(minIdentityMult==0){return maxEdits;}
+ final int scaled=(int)Math.round(len*minIdentityMult);
+ return Tools.max(maxEdits, scaled);
+ }
+
+
+ /**
+ * An overlap between two Units: its type, the start/stop coordinates on each unit,
+ * the overlap length, mismatch and edit counts, and a bitfield of graph-processing flags.
+ * After construction the type is FORWARD or FORWARDRC and u1.firstInOverlap(u2) holds
+ * (both are asserted at the end of the constructor).
+ */
+ private class Overlap implements Comparable<Overlap>{
+
+ /**
+ * Creates an overlap and normalizes it: the units are swapped if u1 is not first in the overlap,
+ * and REVERSE/REVERSERC overlaps are converted with reverseDirection().
+ * With assertions enabled, the overlap is re-verified with test() after each transformation.
+ * @param bandy Aligner passed through to test()
+ */
+ public Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy){
+ assert(u1_!=u2_);
+ if(verbose){System.err.println("\nCreating an overlap.");}
+ u1=u1_;
+ u2=u2_;
+ type=type_;
+ start1=start1_;
+ start2=start2_;
+ stop1=stop1_;
+ stop2=stop2_;
+ overlapLen=len_;
+ mismatches=mismatches_;
+ edits=edits_;
+
+ assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+
+ assert(start1>=0 && start1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+ assert(stop1>=0 && stop1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+ assert(start2>=0 && start2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+ assert(stop2>=0 && stop2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
+
+ assert(type==FORWARD || type==FORWARDRC || type==REVERSE || type==REVERSERC);
+
+ if(verbose){System.err.println(this);}
+
+ //Span on unit 1 versus span on unit 2 (previously compared the unit-1 span with itself, which is always 0).
+ assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits);
+
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ if(verbose){System.err.println("Passed test 1.");}
+
+// bandy.verbose=true;
+// test(bandy);
+// assert(false);
+
+ assert(u1!=u2);
+ u1.firstInOverlap(u2);
+ u2.firstInOverlap(u1);
+ assert(u1.length()!=u2.length() || u1.code1!=u2.code1 || u1.code2!=u2.code2 || (u1.r!=null && u1.r.mate!=null)) : "Collision? \n"+this+"\n"+u1+"\n"+u2;
+ assert(u1.firstInOverlap(u2)!=u2.firstInOverlap(u1)) :
+ "\nu1.firstInOverlap(u2)="+u1.firstInOverlap(u2)+"\nu2.firstInOverlap(u1)="+u2.firstInOverlap(u1)+"\nu1="+u1+"\nu2="+u2;
+
+ //Normalize so that u1 is the unit that comes first in the overlap.
+ if(!u1.firstInOverlap(u2)){
+ if(verbose){System.err.println("\nSwapping.");}
+ swap();
+ if(verbose){System.err.println(this);}
+
+ //Diagnostic path: if the swapped overlap fails, swap back and forth, printing state each time.
+ if(EA && !customBandwidth && !test(bandy, edits+maxEdits)){
+ System.err.println("\n"+this);
+ swap();
+ System.err.println("\n"+this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
+ System.err.println("Passed test 2a, "+bandy.lastEdits+" edits.\n");
+ swap();
+ System.err.println("\n"+this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ System.err.println("Passed test 2b, "+bandy.lastEdits+" edits.\n");
+ }
+
+ assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ if(verbose){System.err.println("Passed test 2.");}
+ }
+
+ //Normalize direction: REVERSE and REVERSERC are rewritten via reverseDirection().
+ if(type==REVERSE || type==REVERSERC){
+ if(verbose){System.err.println("\nReversing.");}
+ reverseDirection();
+ if(verbose){System.err.println(this);}
+
+ //Diagnostic path, run at most once per process (guarded by Shared.anomaly).
+ if(EA && !Shared.anomaly && !customBandwidth && bandy!=null && !test(bandy, edits+maxEdits)){
+ Shared.anomaly=true;
+ BandedAligner.verbose=true;
+ System.err.println("\n********** Failed test 3, "+bandy.lastEdits+" edits. ***************\n");
+ reverseDirection();
+ System.err.println(this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
+ System.err.println("Passed test 3a, "+bandy.lastEdits+" edits.\n");
+ reverseDirection();
+ System.err.println(this);
+ assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
+ +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
+ +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
+ System.err.println("Passed test 3b, "+bandy.lastEdits+" edits.\n");
+ BandedAligner.verbose=false;
+ }
+
+ assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
+ if(verbose){System.err.println("Passed test 3.");}
+ }
+ //Now all overlaps should be FORWARD or FORWARDRC and u1 should be at least as big as u2
+ assert(type==FORWARD || type==FORWARDRC);
+ assert(u1.length()>=u2.length());
+ assert(u1.firstInOverlap(u2));
+ assert(!u2.firstInOverlap(u1));
+ if(verbose){System.err.println("Finished overlap initialization.");}
+ }
+
+ /**
+ * Re-checks this overlap by calling the Unit overlapsXxx method that matches the overlap type
+ * and which unit's start sits at its end.
+ * @param bandy Aligner handed to the Unit method
+ * @param editLimit Edit limit handed to the Unit method
+ * @return The result of the selected Unit method
+ * @throws RuntimeException if the type is not one of the four known overlap types
+ */
+ public boolean test(BandedAligner bandy, int editLimit){
+ final int last1=u1.length()-1, last2=u2.length()-1;
+ if(verbose){System.err.println("Testing "+OVERLAP_TYPE_NAMES[type]+", "+start1+", "+start2);}
+ if(type==FORWARD){
+ assert(start1==0 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==0){
+ if(verbose){System.err.println("A");}
+ return u1.overlapsForward(u2, start1, start2, bandy, false, editLimit);}
+ else{
+ if(verbose){System.err.println("B");}
+ return u2.overlapsForward(u1, start2, start1, bandy, false, editLimit);}
+ }
+ if(type==FORWARDRC){
+ assert(start1==0 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==last2){return u1.overlapsForwardRC(u2, start1, start2, bandy, false, editLimit);}
+ else{return u2.overlapsReverseRC(u1, start2, start1, bandy, false, editLimit);}
+ }
+ if(type==REVERSE){
+ assert(start1==last1 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==last2){return u1.overlapsReverse(u2, start1, start2, bandy, false, editLimit);}
+ else{return u2.overlapsReverse(u1, start2, start1, bandy, false, editLimit);}
+ }
+ if(type==REVERSERC){
+ assert(start1==last1 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
+ if(start2==0){return u1.overlapsReverseRC(u2, start1, start2, bandy, false, editLimit);}
+ else{return u2.overlapsForwardRC(u1, start2, start1, bandy, false, editLimit);}
+ }
+ throw new RuntimeException();
+ }
+
+ /**
+ * Object-level equality. Returns false for null or for any object that is not an Overlap,
+ * as the Object.equals contract requires, rather than failing on the cast.
+ */
+ public boolean equals(Object o){
+ return (o instanceof Overlap) && equals((Overlap)o);
+ }
+
+ /**
+ * Two overlaps are equal when they reference the same two Unit objects in the same order
+ * and agree on type and on all four coordinates.
+ * @param o Overlap to compare with; expected to be non-null (asserted)
+ */
+ public boolean equals(Overlap o){
+ if(this==o){return true;}
+ assert(o!=null) : "*A*\n"+this+"\n"+o+"\n"+u1+"\n"+u2;
+ assert(u1!=null && u2!=null) : "*B*\n"+this+"\n"+o+"\n"+u1+"\n"+u2;
+ assert(u1!=o.u2 || u2!=o.u1) : "*C*\n"+this+"\n"+o+"\n"+u1.firstInOverlap(u2)+"\n"+o.u1.firstInOverlap(o.u2)+"\n"+u1+"\n"+u2;
+ return (u1==o.u1 && u2==o.u2 && type==o.type && start1==o.start1 && start2==o.start2 && stop1==o.stop1 && stop2==o.stop2)
+ ;//|| (u1==o.u2 && u2==o.u1 && type==reverseType(o.type) && start1==o.start2 && start2==o.start1);
+ }
+
+// public int compareTo(Overlap o){
+// int a=compareTo2(o);
+// int b=o.compareTo2(this);
+// assert(a==-b) : "\n"+this+"\n"+o+"\na="+a+", b="+b+", equals="+this.equals(o)
+// +"\nu1.compareTo(o.u1)="+u1.compareTo(o.u1)+"\no.u1.compareTo(u1)="+o.u1.compareTo(u1)
+// +"\nu2.compareTo(o.u2)="+u2.compareTo(o.u2)+"\no.u2.compareTo(u2)="+o.u2.compareTo(u2);
+// return a;
+// }
+
+ /**
+ * Sort order: higher score first, where score = overlapLen-50*(mismatches+edits);
+ * then longer overlap first; then by u1 and u2 (inverted Unit order); then type;
+ * then coordinates; finally unitID when the overlaps are still not equal.
+ */
+ public int compareTo(Overlap o){
+ int score1=overlapLen-50*(mismatches+edits);
+ int score2=o.overlapLen-50*(o.mismatches+o.edits);
+ if(score1!=score2){return score2-score1;}
+ if(overlapLen!=o.overlapLen){return o.overlapLen-overlapLen;}
+ int x=u1.compareTo(o.u1);
+ if(x!=0){return -x;}
+ x=u2.compareTo(o.u2);
+ if(x!=0){return -x;}
+ if(type!=o.type){return type-o.type;}
+ //Units compared equal but are different objects: dump diagnostics before the assertion below.
+ if((u1!=o.u1 || u2!=o.u2) && absorbMatch && !subsetMode){
+ boolean oldv=verbose;
+ verbose=true;
+ System.err.println(this);
+ System.err.println(o);
+ System.err.println("********");
+ System.err.println(u1);
+ System.err.println(u2);
+ System.err.println(o.u1);
+ System.err.println(o.u2);
+ System.err.println("********");
+ System.err.println(u1.equals(o.u1));
+ System.err.println("********");
+ System.err.println(u2.equals(o.u2));
+ System.err.println("********");
+ System.err.println(u1.compareTo(o.u1));
+ System.err.println("********");
+ System.err.println(u2.compareTo(o.u2));
+ System.err.println("********");
+ verbose=oldv;
+ }
+ assert(!absorbMatch || (u1==o.u1 && u2==o.u2) || subsetMode) : "\n"+u1.r+"\n"+u2.r+"\n"+o.u1.r+"\n"+o.u2.r
+ +"\n\n"+u1.r.mate+"\n"+u2.r.mate+"\n"+o.u1.r.mate+"\n"+o.u2.r.mate;
+// assert(false) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n";
+ if(start1!=o.start1){return start1-o.start1;}
+ if(stop1!=o.stop1){return stop1-o.stop1;}
+ if(start2!=o.start2){return start2-o.start2;}
+ if(stop2!=o.stop2){return stop2-o.stop2;}
+ if(this.equals(o)){
+ return 0;
+ }else{
+ //TODO: ensure this assumption is valid.
+ assert(!absorbContainment || !absorbMatch || subsetMode) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n";
+
+ if(u1.unitID!=o.u1.unitID){return u1.unitID-o.u1.unitID;}
+ if(u2.unitID!=o.u2.unitID){return u2.unitID-o.u2.unitID;}
+ }
+ return 0;
+ }
+
+ /**
+ * Hash of both units combined with the overlap length.
+ * NOTE(review): overlapLen is hashed but is not compared by equals(Overlap); confirm that equal
+ * overlaps always share the same overlapLen.
+ */
+ public int hashCode(){
+ return u1.hashCode()^u2.hashCode()^overlapLen;
+ }
+
+ /**
+ * Updates type and coordinates after one of the two units has been reverse-complemented.
+ * @param changed The unit that was flipped; must be u1 or u2
+ * @param bandy Aligner used by the closing assertion
+ * @throws RuntimeException if the type is unknown or 'changed' is neither unit
+ */
+ public void flip(Unit changed, BandedAligner bandy){
+
+ if(changed==u2){
+ if(type==FORWARD){type=FORWARDRC;}
+ else if(type==FORWARDRC){type=FORWARD;}
+ else if(type==REVERSE){type=REVERSERC;}
+ else if(type==REVERSERC){type=REVERSE;}
+ else{throw new RuntimeException("Unknown overlap type "+type);}
+ //Mirror unit-2 coordinates around the unit's length.
+ start2=u2.length()-start2-1;
+ stop2=u2.length()-stop2-1;
+ }else if(changed==u1){
+ if(type==FORWARD){type=REVERSERC;}
+ else if(type==FORWARDRC){type=REVERSE;}
+ else if(type==REVERSE){type=FORWARDRC;}
+ else if(type==REVERSERC){type=FORWARD;}
+ else{throw new RuntimeException("Unknown overlap type "+type);}
+ //Mirror unit-1 coordinates around the unit's length.
+ start1=u1.length()-start1-1;
+ stop1=u1.length()-stop1-1;
+ }else{throw new RuntimeException("'changed' was not in the Overlap.");}
+
+ assert(test(bandy, edits+maxEdits));
+ }
+
+ /** Exchanges the two units and their coordinates; FORWARDRC and REVERSERC trade places, other types are unchanged. */
+ public void swap(){
+ Unit tempu=u1;
+ u1=u2;
+ u2=tempu;
+ int temp=start1;
+ start1=start2;
+ start2=temp;
+ temp=stop1;
+ stop1=stop2;
+ stop2=temp;
+ if(type==FORWARDRC){type=REVERSERC;}
+ else if(type==REVERSERC){type=FORWARDRC;}
+ }
+
+ /** Replaces the type with reverseType(type) and exchanges start with stop on both units. */
+ public void reverseDirection(){
+ type=reverseType(type);
+ int temp=start1;
+ start1=stop1;
+ stop1=temp;
+ temp=start2;
+ start2=stop2;
+ stop2=temp;
+ }
+
+ /** Human-readable description; a unit with a null name is shown by its read's numericID. */
+ public String toString(){
+ StringBuilder sb=new StringBuilder(80);
+ sb.append("type=");
+ sb.append(OVERLAP_TYPE_NAMES[type]);
+ sb.append(", len=");
+ sb.append(overlapLen);
+ sb.append(", subs=");
+ sb.append(mismatches);
+ sb.append(", edits=");
+ sb.append(edits);
+
+ sb.append(" (");
+ sb.append(u1.name()==null ? u1.r.numericID+"" : u1.name());
+ if(printLengthInEdges){
+ sb.append(", length=");
+ sb.append(u1.length());
+ }
+ sb.append(", start1=");
+ sb.append(start1);
+ sb.append(", stop1=");
+ sb.append(stop1);
+
+ sb.append(") (");
+ sb.append(u2.name()==null ? u2.r.numericID+"" : u2.name());
+ if(printLengthInEdges){
+ sb.append(", length=");
+ sb.append(u2.length());
+ }
+ sb.append(", start2=");
+ sb.append(start2);
+ sb.append(", stop2=");
+ sb.append(stop2);
+ sb.append(")");
+ return sb.toString();
+ }
+
+ /**
+ * Compact comma-separated form: type abbreviation, overlapLen, mismatches, edits,
+ * then [length,]start,stop for each unit (lengths only when printLengthInEdges is set).
+ */
+ public String toLabel(){
+ StringBuilder sb=new StringBuilder(80);
+ sb.append(OVERLAP_TYPE_ABBREVIATIONS[type]);
+ sb.append(',');
+ sb.append(overlapLen);
+ sb.append(',');
+ sb.append(mismatches);
+ sb.append(',');
+ sb.append(edits);
+
+ if(printLengthInEdges){
+ sb.append(',');
+ sb.append(u1.length());
+ }
+ sb.append(',');
+ sb.append(start1);
+ sb.append(',');
+ sb.append(stop1);
+
+ if(printLengthInEdges){
+ sb.append(',');
+ sb.append(u2.length());
+ }
+ sb.append(',');
+ sb.append(start2);
+ sb.append(',');
+ sb.append(stop2);
+
+ return sb.toString();
+ }
+
+
+ //Flag setters. Each asserts that the call actually changes the flag, so redundant sets are caught.
+
+ private void setCanonContradiction(boolean b){
+ assert(b!=canonContradiction()) : b+", "+canonContradiction();
+ if(b){flags|=CANON_CONTRADICTION_MASK;}
+ else{flags&=~CANON_CONTRADICTION_MASK;}
+ assert(b==canonContradiction()) : b+", "+canonContradiction();
+ }
+
+ private void setOffsetContradiction(boolean b){
+ assert(b!=offsetContradiction()) : b+", "+offsetContradiction();
+ if(b){flags|=OFFSET_CONTRADICTION_MASK;}
+ else{flags&=~OFFSET_CONTRADICTION_MASK;}
+ assert(b==offsetContradiction()) : b+", "+offsetContradiction();
+ }
+
+ private void setMultiJoin(boolean b){
+ assert(b!=multiJoin()) : b+", "+multiJoin();
+ if(b){flags|=MULTIJOIN_MASK;}
+ else{flags&=~MULTIJOIN_MASK;}
+ assert(b==multiJoin()) : b+", "+multiJoin();
+ }
+
+ private void setVisited(boolean b){
+ assert(b!=visited()) : b+", "+visited();
+ if(b){flags|=VISITED_MASK;}
+ else{flags&=~VISITED_MASK;}
+ assert(b==visited()) : b+", "+visited();
+ }
+
+ private void setCyclic(boolean b){
+ assert(b!=cyclic()) : b+", "+cyclic();
+ if(b){flags|=CYCLIC_MASK;}
+ else{flags&=~CYCLIC_MASK;}
+ assert(b==cyclic()) : b+", "+cyclic();
+ }
+
+ //The second assertion in setInvalid and setMst rejects setting one of these flags while the other is set.
+ private void setInvalid(boolean b){
+ assert(b!=invalid()) : b+", "+invalid();
+ assert(b!=mst()) : b+", "+mst()+", "+invalid();
+ if(b){flags|=INVALID_MASK;}
+ else{flags&=~INVALID_MASK;}
+ assert(b==invalid()) : b+", "+invalid();
+ }
+
+ private void setMst(boolean b){
+ assert(b!=mst()) : b+", "+mst();
+ assert(b!=invalid()) : b+", "+mst()+", "+invalid();
+ if(b){flags|=MST_MASK;}
+ else{flags&=~MST_MASK;}
+ assert(b==mst()) : b+", "+mst();
+ }
+
+ /** Resets every flag bit to 0 (including MST, which the commented-out mask below would have kept). */
+ public void clearVolatileFlags(){
+ flags=0;
+// flags=flags&~(MULTIJOIN_MASK|VISITED_MASK|CANON_CONTRADICTION_MASK|CYCLIC_MASK|OFFSET_CONTRADICTION_MASK|INVALID_MASK);
+// assert(!canonContradiction());
+// assert(!offsetContradiction());
+// assert(!multiJoin());
+// assert(!visited());
+// assert(!cyclic());
+// assert(!invalid());
+ }
+
+ public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;}
+ public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;}
+ public boolean multiJoin(){return (MULTIJOIN_MASK&flags)==MULTIJOIN_MASK;}
+ public boolean visited(){return (VISITED_MASK&flags)==VISITED_MASK;}
+ public boolean cyclic(){return (CYCLIC_MASK&flags)==CYCLIC_MASK;}
+ public boolean invalid(){return (INVALID_MASK&flags)==INVALID_MASK;}
+ public boolean mst(){return (MST_MASK&flags)==MST_MASK;}
+ public boolean contradiction(){return canonContradiction() || offsetContradiction();}
+
+ //One bit per flag within 'flags'.
+ private static final long VISITED_MASK=(1L<<0);
+ private static final long MULTIJOIN_MASK=(1L<<1);
+ private static final long CYCLIC_MASK=(1L<<2);
+ private static final long CANON_CONTRADICTION_MASK=(1L<<3);
+ private static final long OFFSET_CONTRADICTION_MASK=(1L<<4);
+ private static final long INVALID_MASK=(1L<<5);
+ private static final long MST_MASK=(1L<<6);
+
+ //Mutable: swap(), reverseDirection() and flip() rewrite these.
+ Unit u1;
+ Unit u2;
+ int type;
+ int start1;
+ int start2;
+ int stop1;
+ int stop2;
+
+ long flags=0;
+
+ final int overlapLen;
+ final int mismatches;
+ final int edits;
+ }
+
+ /**
+ * Looks up the cluster number stored for a unit id in clusterNumbers, recursively resolving the
+ * stored value and writing a smaller resolved value back with compareAndSet.
+ * The loop repeats only when that compareAndSet fails.
+ * @param uid Unit id to look up
+ * @return The value read from clusterNumbers for uid on the final pass (0 or uid are returned as-is)
+ */
+ private int determineCluster2(final int uid) {
+ assert(clusterNumbers!=null);
+ for(;;){
+ final int current=clusterNumbers.get(uid);
+ if(current==0 || current==uid){return current;}
+ assert(current<=uid);
+
+ final int resolved=determineCluster2(current);
+ if(resolved>=current){return current;}
+
+ //Shorten the chain; on success the value read above is what gets returned.
+ if(clusterNumbers.compareAndSet(uid, current, resolved)){return current;}
+ }
+ }
+
+
+ private int mergeClusterIds(int cluster1, int cluster2) {
+ assert(clusterNumbers!=null);
+
+// System.err.println("Merging clusters "+cluster1+" and "+cluster2);
+
+ while(cluster1!=cluster2){
+ int min=Tools.min(cluster1, cluster2);
+ if(cluster1!=min){
+ assert(cluster1>min);
+ boolean b=clusterNumbers.compareAndSet(cluster1, cluster1, min);
+ if(!b){
+ cluster1=determineCluster2(cluster1);
+ min=Tools.min(cluster1, cluster2);
+ }
+ }
+ if(cluster2!=min){
+ assert(cluster2>min);
+ boolean b=clusterNumbers.compareAndSet(cluster2, cluster2, min);
+ if(!b){
+ cluster2=determineCluster2(cluster2);
+ min=Tools.min(cluster1, cluster2);
+ }
+ }
+ }
+// System.err.println("Returning "+cluster1);
+ return cluster1;
+ }
+
+ private class Unit implements Comparable<Unit>, Serializable {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -992343322822460643L;
+
+ /**
+ * Creates a Unit for a read, taking the canonical flag from isCanonical(r_.bases).
+ * @param r_ Read to wrap; must be non-null because r_.bases is dereferenced
+ */
+ public Unit(Read r_){
+ this(r_, isCanonical(r_.bases));
+ }
+
+ /**
+ * Creates a Unit with an explicit canonical flag, passing hash(r_.bases) and hashReversed(r_.bases)
+ * on as the two sequence codes.
+ * @param r_ Read to wrap
+ * @param canonical_ Value for the canonical flag
+ */
+ public Unit(Read r_, boolean canonical_){
+// this(r_, canonical_, canonical_ ? hash(r_.bases) : hashReversed(r_.bases));
+ this(r_, canonical_, hash(r_.bases), hashReversed(r_.bases));
+ }
+
+ /**
+ * Full constructor. Stores the read, orders the two codes so that code1 is the smaller,
+ * fills the prefix/suffix tip hashes, packs length, canonical flag and pair number into flags,
+ * and, when parseDepth is set, reads a depth value from the read id.
+ * @param r_ Read to wrap
+ * @param canonical_ Value for the canonical flag
+ * @param codeF_ One sequence code
+ * @param codeR_ The other sequence code
+ */
+ public Unit(Read r_, boolean canonical_, long codeF_, long codeR_){
+ r=r_;
+ code1=Tools.min(codeF_, codeR_);
+ code2=Tools.max(codeF_, codeR_);
+
+ final int readLen=r.length();
+ for(int tier=0; tier<prefixes.length; tier++){
+ final int skip=tier*k;
+ //A tier is only hashed when the read is strictly longer than (tier+1)*k; otherwise it is marked -1.
+ final boolean longEnough=(readLen>skip+k);
+ prefixes[tier]=(longEnough ? hashTip(r.bases, true, k, skip) : -1);
+ suffixes[tier]=(longEnough ? hashTip(r.bases, false, k, skip) : -1);
+ }
+
+ //The length occupies the low bits of flags; the masks add the canonical and pair-number bits.
+ long bits=r.length();
+ if(canonical_){bits|=CANON_MASK;}
+ if(r.pairnum()==1){bits|=PAIRNUM_MASK;}
+ flags=bits;
+ assert(canonical()==canonical_);
+ assert(length()==r.length());
+ assert(pairnum()==r.pairnum());
+
+ if(parseDepth){
+ final int[] quad=KmerNormalize.parseDepth(r.id, null);
+ if(quad!=null){depth=quad[r.pairnum()];}
+ }
+ }
+
+ /** @return The result of determineCluster2 for this unit's unitID */
+ int determineCluster() {
+ return determineCluster2(unitID);
+ }
+
+ /**
+ * Absorbs a unit with the same codes and length into this one.
+ * The other unit's read is marked discarded, the mates are absorbed recursively if the other mate
+ * has not been discarded yet, and - only when both reads have quality arrays - undefined bases
+ * here are filled in from the other read and each quality is raised to the larger of the two.
+ * Nothing happens if either unit has no read.
+ * @param u Unit to absorb
+ */
+ public void absorbMatch(Unit u){
+
+ assert(code1==u.code1 && code2==u.code2 && length()==u.length());
+ if(r==null || u.r==null){return;}
+ u.r.setDiscarded(true);
+ final byte[] bases1=r.bases, bases2=u.r.bases;
+ final byte[] quals1=r.quality, quals2=u.r.quality;
+
+ assert((r.mate==null) == (u.r.mate==null));
+
+ //u.r is already flagged as discarded, so the recursive call on the mates does not come back here.
+ if(r.mate!=null && !u.r.mate.discarded()){
+ final Unit myMate=(Unit)r.mate.obj;
+ myMate.absorbMatch((Unit)u.r.mate.obj);
+ }
+ if(quals1==null || quals2==null){return;}
+
+ final boolean sameStrand=(canonical()==u.canonical());
+ final int last=bases2.length-1;
+ for(int i=0; i<bases1.length; i++){
+ //Opposite strand: read the other sequence from the end and complement it.
+ final int j=(sameStrand ? i : last-i);
+ final byte mine=bases1[i];
+ final byte theirs=(sameStrand ? bases2[j] : baseToComplementExtended[bases2[j]]);
+ if(!AminoAcid.isFullyDefined(mine) && AminoAcid.isFullyDefined(theirs)){bases1[i]=theirs;}
+ else{assert(mine==theirs);}
+ quals1[i]=Tools.max(quals1[i], quals2[j]);
+ }
+ }
+
+ public boolean alreadyHas(Overlap o){
+ if(overlapList==null){return false;}
+ for(int i=0; i<overlapList.size(); i++){
+ Overlap o2=overlapList.get(i);
+ if(o.equals(o2)){
+ assert(overlapList.contains(o));
+ assert(o2.equals(o));
+ return true;
+ }
+ }
+ assert(!overlapList.contains(o));
+ return false;
+ }
+
+ /**
+ * Builds the cluster that contains this unit.
+ * The list starts with this unit and grows while it is being traversed: each member's visit()
+ * appends the not-yet-clustered units it is connected to.
+ * This unit must be valid and neither visited nor clustered (asserted).
+ * @return The list of all units collected into the cluster
+ */
+ public ArrayList<Unit> makeCluster() {
+ assert(!visited());
+ assert(!clustered());
+ assert(valid());
+
+ final int capacity=(overlapList==null ? 1 : overlapList.size()+1);
+ final ArrayList<Unit> members=new ArrayList<Unit>(capacity);
+ members.add(this);
+ setClustered(true);
+
+ int total=1;
+ int cursor=0;
+ //members.size() is re-read every pass because visit() appends to the list.
+ while(cursor<members.size()){
+ total+=members.get(cursor).visit(members);
+ cursor++;
+ }
+
+ assert(total==members.size());
+ return members;
+ }
+
+ /**
+ * Marks this unit as visited and appends its unclustered neighbors to the cluster:
+ * first the mate's unit (if this unit has a read with a mate), then the other unit of each overlap.
+ * Every appended unit is flagged as clustered.
+ * @param cluster List being built by makeCluster()
+ * @return Number of units appended by this call
+ */
+ public int visit(ArrayList<Unit> cluster) {
+ assert(!visited());
+ assert(clustered());
+ assert(valid());
+ setVisited(true);
+ int newlyAdded=0;
+
+ if(r!=null && r.mate!=null){
+ final Unit mateUnit=(Unit)r.mate.obj;
+ assert(mateUnit!=this);
+ assert(mateUnit.valid());
+ if(!mateUnit.clustered()){
+ mateUnit.setClustered(true);
+ cluster.add(mateUnit);
+ newlyAdded++;
+ }
+ }
+
+ if(overlapList!=null){
+ for(Overlap edge : overlapList){
+ final Unit neighbor=(edge.u1==this ? edge.u2 : edge.u1);
+ assert(edge.u1==this || edge.u2==this);
+ assert(neighbor!=this);
+ assert(neighbor.valid());
+ if(neighbor.clustered()){continue;}
+ neighbor.setClustered(true);
+ cluster.add(neighbor);
+ newlyAdded++;
+ }
+ }
+ return newlyAdded;
+ }
+
+ /**
+ * Checks that every unit this unit overlaps with also lists some overlap involving this unit.
+ * @return true if the overlap list is null or empty, or every neighbor has such a back-reference
+ */
+ public boolean isTransitive(){
+ assert(valid());
+ if(overlapList==null || overlapList.isEmpty()){return true;}
+ for(Overlap edge : overlapList){
+ assert(edge.u1==this || edge.u2==this);
+ final Unit other=(edge.u1==this ? edge.u2 : edge.u1);
+ assert(other!=this);
+ if(other.overlapList==null){return false;}
+
+ boolean reciprocal=false;
+ for(Overlap back : other.overlapList){
+ if(back.u1==this || back.u2==this){
+ reciprocal=true;
+ break;
+ }
+ }
+ if(!reciprocal){return false;}
+ }
+ return true;
+ }
+
+ /**
+ * Stricter form of isTransitive(): every overlap in this unit's list must appear, as the very same
+ * object, in the other unit's overlap list.
+ * @return true if the overlap list is null or empty, or every overlap is shared by reference
+ */
+ public boolean isPerfectlyTransitive(){
+ assert(valid());
+ if(overlapList==null || overlapList.isEmpty()){return true;}
+ for(Overlap edge : overlapList){
+ assert(edge.u1==this || edge.u2==this);
+ final Unit other=(edge.u1==this ? edge.u2 : edge.u1);
+ assert(other!=this);
+ if(other.overlapList==null){return false;}
+
+ boolean shared=false;
+ for(Overlap candidate : other.overlapList){
+ //Identity, not equals(): the same Overlap instance must be present.
+ if(candidate==edge){
+ shared=true;
+ break;
+ }
+ }
+ if(!shared){return false;}
+ }
+ return true;
+ }
+
+ /**
+ * Checks that the overlap list has no duplicates: an element must equal itself and
+ * must not equal the element at any other index.
+ * @return true if the list is null, empty, or free of duplicates
+ */
+ public boolean isNonRedundant(){
+ assert(valid());
+ if(overlapList==null || overlapList.isEmpty()){return true;}
+ final int count=overlapList.size();
+ for(int i=0; i<count; i++){
+ final Overlap left=overlapList.get(i);
+ for(int j=0; j<count; j++){
+ final Overlap right=overlapList.get(j);
+ final boolean samePosition=(i==j);
+ if(samePosition!=left.equals(right)){return false;}
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether this unit contains u2, given a kmer hit at position loc.
+ * If the key matches u2's prefix hash for this table, forward and reverse-complement-reverse
+ * placements are tried; if it matches u2's suffix hash, reverse and forward-reverse-complement
+ * placements are tried.
+ * @param u2 Candidate contained unit
+ * @param loc Position of the kmer hit in this unit
+ * @param key Kmer hash that produced the hit
+ * @param bandy Aligner for the containsXxx helpers (they handle null)
+ * @param tableNum Index of the prefix/suffix table the key came from
+ * @return true if any placement succeeds
+ */
+ public boolean contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) {
+ if(verbose){System.err.println("contains: Considering key "+key+", unit "+u2);}
+ if(minLengthPercent>0 && (u2.length()*100f/length())<minLengthPercent){return false;}
+ assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || //REQUIRE_MATCHING_NAMES ||
+ (canonical()==u2.canonical() ? (u2.prefixes[0]!=prefixes[0] && u2.suffixes[0]!=suffixes[0]) : (u2.prefixes[0]!=suffixes[0] && u2.suffixes[0]!=prefixes[0]))) :
+ "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
+
+ //Early exit on a mismatch inside the seed is only enabled for table 0.
+ final boolean earlyExit=(tableNum==0);
+ final int tableLabel=(tableNum+1);
+ final int tableOffset=k*tableNum;
+ final int forwardStart=loc-k2-tableOffset;
+ final int reverseStart=loc+tableOffset;
+
+ if(key.value()==u2.prefixes[tableNum]){
+ if(verbose){System.err.println("Containment A"+tableLabel);}
+ if(containsForward(u2, forwardStart, bandy, earlyExit)){return true;}
+ if(containsReverseRC(u2, reverseStart, bandy, earlyExit)){return true;}
+ }
+ if(key.value()==u2.suffixes[tableNum]){
+ if(verbose){System.err.println("Containment B"+tableLabel);}
+ if(containsReverse(u2, reverseStart, bandy, earlyExit)){return true;}
+ if(containsForwardRC(u2, forwardStart, bandy, earlyExit)){return true;}
+ }
+ return false;
+ }
+
+ /**
+ * Tests whether u2, as stored, matches this unit starting at 'start' and reading left to right.
+ * Mismatches are counted only where both bases are fully defined (or always, in exact mode).
+ * Once the count exceeds the allowance, the decision is delegated to a banded alignment.
+ * @param u2 Candidate contained unit
+ * @param start Position in this unit aligned to the first base of u2
+ * @param bandy Aligner used once the mismatch allowance is exceeded; null disables that fallback
+ * @param earlyExit If true, a counted mismatch within the first k2 bases of u2 fails immediately
+ * @return true if u2 is contained at this placement
+ */
+ private boolean containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ if(start+u2.length()>length() || start<0 || start>=length()){return false;}
+ if(u2.r==null){
+ assert(false) : "TODO: Verify by hashing and checking both tips";
+ return false;
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ final int maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+ int mismatches=0;
+
+ for(int i=start, j=0; j<b.length; i++, j++){
+ final byte refBase=a[i];
+ final byte queryBase=b[j];
+ if(refBase==queryBase){continue;}
+ //Outside exact mode, a difference involving an undefined base is not counted.
+ if(!exact && !(AminoAcid.isFullyDefined(refBase) && AminoAcid.isFullyDefined(queryBase))){continue;}
+ if(earlyExit && j<k2){return false;}
+ mismatches++;
+ if(mismatches<=maxMismatches){continue;}
+
+ //Too many substitutions: fall back to a banded alignment, if one is available.
+ if(bandy==null || maxEdits<1){return false;}
+ final int edits=bandy.alignForward(b, a, 0, start, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ return edits<=maxEdits && bandy.score()>4*edits;
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether the reverse-complement of u2 matches this unit starting at 'start' and reading
+ * left to right (u2 is read from its last base backward and complemented).
+ * Always false when ignoreReverseComplement is set.
+ * @param u2 Candidate contained unit
+ * @param start Position in this unit aligned to the last base of u2
+ * @param bandy Aligner used once the mismatch allowance is exceeded; null disables that fallback
+ * @param earlyExit If true, a counted mismatch before position start+k2 of this unit fails immediately
+ * @return true if u2 is contained at this placement
+ */
+ private boolean containsForwardRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ if(ignoreReverseComplement){return false;}
+ if(start+u2.length()>length() || start<0 || start>=length()){return false;}
+ if(u2.r==null){
+ assert(false) : "TODO: Verify by hashing and checking both tips";
+ return false;
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ final int maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+ final int seedEnd=start+k2;
+ int mismatches=0;
+
+ for(int i=start, j=b.length-1; j>=0; i++, j--){
+ final byte refBase=a[i];
+ final byte queryBase=baseToComplementExtended[b[j]];
+ if(refBase==queryBase){continue;}
+ //Outside exact mode, a difference involving an undefined base is not counted.
+ if(!exact && !(AminoAcid.isFullyDefined(refBase) && AminoAcid.isFullyDefined(queryBase))){continue;}
+ if(earlyExit && i<seedEnd){return false;}
+ mismatches++;
+ if(mismatches<=maxMismatches){continue;}
+
+ //Too many substitutions: fall back to a banded alignment, if one is available.
+ if(bandy==null || maxEdits<1){return false;}
+ final int edits=bandy.alignForwardRC(b, a, b.length-1, start, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ return edits<=maxEdits && bandy.score()>4*edits;
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether u2, as stored, matches this unit when both are read right to left,
+ * with the last base of u2 placed at 'start'.
+ * @param u2 Candidate contained unit
+ * @param start Position in this unit aligned to the last base of u2
+ * @param bandy Aligner used once the mismatch allowance is exceeded; null disables that fallback
+ * @param earlyExit If true, a counted mismatch at a position above start-k2 of this unit fails immediately
+ * @return true if u2 is contained at this placement
+ */
+ private boolean containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ if(start+1<u2.length() || start<0 || start>=length()){return false;}
+ if(u2.r==null){
+ assert(false) : "TODO: Verify by hashing and checking both tips";
+ return false;
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ final int maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+ final int seedEnd=start-k2;
+ int mismatches=0;
+
+ for(int i=start, j=b.length-1; j>=0; i--, j--){
+ final byte refBase=a[i];
+ final byte queryBase=b[j];
+ if(refBase==queryBase){continue;}
+ //Outside exact mode, a difference involving an undefined base is not counted.
+ if(!exact && !(AminoAcid.isFullyDefined(refBase) && AminoAcid.isFullyDefined(queryBase))){continue;}
+ if(earlyExit && i>seedEnd){return false;}
+ mismatches++;
+ if(mismatches<=maxMismatches){continue;}
+
+ //Too many substitutions: fall back to a banded alignment, if one is available.
+ if(bandy==null || maxEdits<1){return false;}
+ final int edits=bandy.alignReverse(b, a, b.length-1, start, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ return edits<=maxEdits && bandy.score()>4*edits;
+ }
+ return true;
+ }
+
+ /**
+ * Tests whether the reverse-complement of u2 matches this unit when this unit is read right to left
+ * from 'start' while u2 is read from its first base forward and complemented.
+ * Always false when ignoreReverseComplement is set.
+ * @param u2 Candidate contained unit
+ * @param start Position in this unit aligned to the first base of u2
+ * @param bandy Aligner used once the mismatch allowance is exceeded; null disables that fallback
+ * @param earlyExit If true, a counted mismatch within the first k2 bases of u2 fails immediately
+ * @return true if u2 is contained at this placement
+ */
+ private boolean containsReverseRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
+ if(ignoreReverseComplement){return false;}
+ if(start+1<u2.length() || start<0 || start>=length()){return false;}
+ if(u2.r==null){
+ assert(false) : "TODO: Verify by hashing and checking both tips";
+ return false;
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ final int maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+ int mismatches=0;
+
+ for(int i=start, j=0; j<b.length; i--, j++){
+ final byte refBase=a[i];
+ final byte queryBase=baseToComplementExtended[b[j]];
+ if(refBase==queryBase){continue;}
+ //Outside exact mode, a difference involving an undefined base is not counted.
+ if(!exact && !(AminoAcid.isFullyDefined(refBase) && AminoAcid.isFullyDefined(queryBase))){continue;}
+ if(earlyExit && j<k2){return false;}
+ mismatches++;
+ if(mismatches<=maxMismatches){continue;}
+
+ //Too many substitutions: fall back to a banded alignment, if one is available.
+ if(bandy==null || maxEdits<1){return false;}
+ final int edits=bandy.alignReverseRC(b, a, 0, start, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ return edits<=maxEdits && bandy.score()>4*edits;
+ }
+ return true;
+ }
+
+
+ /**
+  * Decides whether two depth values are similar enough to be treated as compatible.
+  * Two depths that are both below 5 always pass; otherwise the larger depth must not
+  * exceed the smaller depth (floored at 1) multiplied by depthRatio.
+  * @param aa First depth
+  * @param bb Second depth
+  * @return true if the depths are considered congruent
+  */
+ public boolean depthCongruent(int aa, int bb){
+     if(aa<5 && bb<5){return true;}
+     final int lower=Tools.max(1, Tools.min(aa, bb));
+     final int higher=Tools.max(aa, bb);
+     return higher<=lower*depthRatio;
+ }
+
+
+ /**
+ * Tests whether this unit overlaps u2, given that 'key' was found at position 'loc' of this unit.
+ * After the depth and length-ratio filters, the key is compared with u2's prefix and suffix for the
+ * given table; each match triggers two orientation-specific checks, and the first success returns true.
+ * @param u2 The other unit
+ * @param loc Position in this unit from which the start coordinates (loc-k2-k*tableNum and loc+k*tableNum) are derived
+ * @param key Key whose value is compared with u2.prefixes[tableNum] and u2.suffixes[tableNum]
+ * @param bandy Banded aligner forwarded to the orientation-specific checks; may be null
+ * @param tableNum Index of the affix table; early exit is enabled only for table 0
+ * @param editLimit Edit limit forwarded as maxEdits to the orientation-specific checks
+ * @return true if any tested orientation reports an overlap
+ */
+ public boolean overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit) {
+// return makeOverlap(u2, loc, key, bandy, earlyExit)!=null;
+
+// assert(false) : "TODO";
+ if(verbose){System.err.println("overlaps: Considering key "+key+", unit "+u2);}
+ // Optional filter: depths must be compatible.
+ if(parseDepth && !depthCongruent(depth, u2.depth)){return false;}
+ // Optional filter: the shorter unit must be at least minLengthPercent of the longer one.
+ if(minLengthPercent>0){
+ final int len1=length(), len2=u2.length();
+ if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return false;}
+ }
+ // Sanity check: units with equal codes, length and matching first affixes are reported as a possible collision.
+ assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() ||
+ (canonical()==u2.canonical() ? (u2.prefixes[0]!=prefixes[0] && u2.suffixes[0]!=suffixes[0]) : (u2.prefixes[0]!=suffixes[0] && u2.suffixes[0]!=prefixes[0]))) :
+ "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
+
+ final boolean earlyExit=(tableNum==0);
+ final int x=(tableNum+1); //1-based table number, used only in verbose messages
+ final int ktn=k*tableNum;
+
+ // Case A: the key equals u2's prefix for this table.
+ if(key.value()==u2.prefixes[tableNum]){
+ if(verbose){System.err.println("Testing overlaps A"+x);}
+ if(overlapsForward(u2, loc-k2-ktn, 0, bandy, earlyExit, editLimit)){
+ if(verbose){System.err.println("Found Overlap A"+x+"F");}
+ return true;
+ }
+ if(overlapsReverseRC(u2, loc+ktn, 0, bandy, earlyExit, editLimit)){
+ if(verbose){System.err.println("Found Overlap A"+x+"R");}
+ return true;
+ }
+ if(verbose){System.err.println("No Overlap.");}
+ }
+
+ // Case B: the key equals u2's suffix for this table.
+ if(key.value()==u2.suffixes[tableNum]){
+ if(verbose){System.err.println("Testing overlaps B"+x);}
+ if(overlapsForwardRC(u2, loc-k2-ktn, u2.length()-1, bandy, earlyExit, editLimit)){
+ if(verbose){System.err.println("Found Overlap B"+x+"F");}
+ return true;
+ }
+ if(overlapsReverse(u2, loc+ktn, u2.length()-1, bandy, earlyExit, editLimit)){
+ if(verbose){System.err.println("Found Overlap B"+x+"R");}
+ return true;
+ }
+ if(verbose){System.err.println("No Overlap.");}
+ }
+
+ return false;
+ }
+
+ /**
+ * Builds an Overlap between this unit and u2, given that 'key' was found at position 'loc' of this unit.
+ * Applies the same depth and length-ratio filters as a plain overlap test, then tries the orientations
+ * implied by whether the key equals u2's prefix or suffix for the given table. The first orientation
+ * that yields an Overlap wins.
+ * @param u2 The other unit
+ * @param loc Position in this unit from which the start coordinates (loc-k2-k*tableNum and loc+k*tableNum) are derived
+ * @param key Key whose value is compared with u2.prefixes[tableNum] and u2.suffixes[tableNum]
+ * @param bandy Banded aligner forwarded to the orientation-specific builders; may be null
+ * @param tableNum Index of the affix table; early exit is enabled only for table 0
+ * @return The Overlap found, or null if no orientation produced one
+ */
+ protected Overlap makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) {
+ if(verbose){System.err.println("makeOverlap: Considering key "+key+", unit "+u2);}
+ // Optional filter: depths must be compatible.
+ if(parseDepth && !depthCongruent(depth, u2.depth)){return null;}
+ // Optional filter: the shorter unit must be at least minLengthPercent of the longer one.
+ if(minLengthPercent>0){
+ final int len1=length(), len2=u2.length();
+ if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return null;}
+ }
+ // Sanity check for possible collisions; units whose read has a mate are exempt.
+ assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) ||
+ (canonical()==u2.canonical() ? (u2.prefixes[0]!=prefixes[0] && u2.suffixes[0]!=suffixes[0]) : (u2.prefixes[0]!=suffixes[0] && u2.suffixes[0]!=prefixes[0]))) :
+ "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
+
+ final boolean earlyExit=(tableNum==0);
+ final int x=(tableNum+1); //1-based table number, used only in verbose messages
+ final int ktn=k*tableNum;
+
+ Overlap o=null;
+ // Case A: the key equals u2's prefix for this table.
+ if(key.value()==u2.prefixes[tableNum]){
+ if(verbose){System.err.println("\nTesting makeOverlap A"+x+"F");}
+ if((o=makeOverlapForward(u2, loc-k2-ktn, bandy, earlyExit))!=null){
+ if(verbose){System.err.println("Made Overlap A"+x+"F");}
+ return o;
+ }
+ if(verbose){System.err.println("\nTesting makeOverlap A"+x+"R");}
+ if((o=makeOverlapReverseRC(u2, loc+ktn, bandy, earlyExit))!=null){
+ if(verbose){System.err.println("Made Overlap A"+x+"R");}
+ return o;
+ }
+ if(verbose){System.err.println("No Overlap.");}
+ }
+ // Case B: the key equals u2's suffix for this table.
+ if(key.value()==u2.suffixes[tableNum]){
+ if(verbose){System.err.println("\nTesting makeOverlap B"+x+"F");}
+ if((o=makeOverlapForwardRC(u2, loc-k2-ktn, bandy, earlyExit))!=null){
+ if(verbose){System.err.println("Made Overlap B"+x+"F");}
+ return o;
+ }
+ if(verbose){System.err.println("\nTesting makeOverlap B"+x+"R");}
+ if((o=makeOverlapReverse(u2, loc+ktn, bandy, earlyExit))!=null){
+ if(verbose){System.err.println("Made Overlap B"+x+"R");}
+ return o;
+ }
+ if(verbose){System.err.println("No Overlap.");}
+ }
+ // Every successful path returned above, so o is null here.
+ return o;
+ }
+
+ /**
+ * Tests for an overlap in which both units are read left to right in their stored orientation:
+ * this unit from start1 upward and u2 from start2 upward.
+ * A negative start1 is clamped to 0 and start2 is advanced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. Once the count exceeds
+ * calcMaxEdits(maxSubs, minIdentityMult, overlapLength) the method falls back to banded alignment.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may be negative on entry
+ * @param start2 Start index in u2's bases
+ * @param bandy Banded aligner used for the fallback; may be null
+ * @param earlyExit If true, a counted mismatch at j<k2 rejects immediately
+ * @param maxEdits Edit limit for the banded fallback
+ * @return true if the overlap is accepted
+ */
+ private boolean overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsForward(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ final int len1=length(), len2=u2.length();
+ if(start1<0){
+ start2-=start1;
+ start1=0;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the span actually compared; overlapLength2 is the longer of the two remaining spans.
+ int overlapLength=Tools.min(len1-start1, len2-start2);
+ int overlapLength2=Tools.max(len1-start1, len2-start2);
+ int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ // NOTE(review): overlapLength looks bounded by min(len1, len2), so the second test may never be true - confirm.
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ if(verbose){
+ System.err.println("Side block. allowAllContainedOverlaps="+allowAllContainedOverlaps+", minOverlapCluster="+minOverlapCluster);
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxEdits="+maxEdits);
+ }
+ // The minimum-overlap filters here use overlapLength2 (the longer span), not overlapLength.
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ for(int i=start1, j=start2; j<=stop2; i++, j++){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the query start passed here is 0 rather than start2 - confirm this is intended when start2 was shifted above.
+ int edits=bandy.alignForward(b, a, 0, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop2/stop1 are updated but not read again before returning.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests for an overlap between this unit read left to right from start1 and the reverse complement of u2,
+ * i.e. u2 read right to left from start2 with each base complemented.
+ * Always returns false when ignoreReverseComplement is set. A negative start1 is clamped to 0 and
+ * start2 is reduced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. Once the count exceeds
+ * calcMaxEdits(maxSubs, minIdentityMult, b.length) the method falls back to banded alignment.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may be negative on entry
+ * @param start2 Start index in u2's bases (walks toward 0)
+ * @param bandy Banded aligner used for the fallback; may be null
+ * @param earlyExit If true, a counted mismatch within the first k2 positions of this unit's window rejects immediately
+ * @param maxEdits Edit limit for the banded fallback
+ * @return true if the overlap is accepted
+ */
+ private boolean overlapsForwardRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsForwardRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ if(ignoreReverseComplement){return false;}
+ final int len1=length(), len2=u2.length();
+ if(start1<0){
+ start2+=start1;
+ start1=0;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the span actually compared; overlapLength2 is the longer of the two remaining spans.
+ final int overlapLength=Tools.min(len1-start1, start2+1);
+ final int overlapLength2=Tools.max(len1-start1, start2+1);
+ int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ // The minimum-overlap filters here use overlapLength2 (the longer span), not overlapLength.
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // iprefix marks the end of the region in which earlyExit rejects on the first counted mismatch.
+ for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i<iprefix){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the query start passed here is b.length-1 rather than start2 - confirm this is intended when start2 was shifted above.
+ int edits=bandy.alignForwardRC(b, a, b.length-1, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop2/stop1 are updated but not read again before returning.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests for an overlap in which both units are read right to left in their stored orientation:
+ * this unit from start1 downward and u2 from start2 downward.
+ * A start1 at or beyond this unit's length is clamped to the last index and start2 is reduced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. Once the count exceeds
+ * calcMaxEdits(maxSubs, minIdentityMult, b.length) the method falls back to banded alignment.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may exceed the last index on entry
+ * @param start2 Start index in u2's bases (walks toward 0)
+ * @param bandy Banded aligner used for the fallback; may be null
+ * @param earlyExit If true, a counted mismatch within the first k2 positions of this unit's window rejects immediately
+ * @param maxEdits Edit limit for the banded fallback
+ * @return true if the overlap is accepted
+ */
+ private boolean overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsReverse(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ final int len1=length(), len2=u2.length();
+ if(start1>=len1){
+ start2-=(start1-len1+1);
+ start1=len1-1;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the span actually compared; overlapLength2 is the longer of the two remaining spans.
+ final int overlapLength=Tools.min(start1+1, start2+1);
+ final int overlapLength2=Tools.max(start1+1, start2+1);
+ int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ // The minimum-overlap filters here use overlapLength2 (the longer span), not overlapLength.
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // iprefix marks the end of the region in which earlyExit rejects on the first counted mismatch.
+ for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i>iprefix){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the query start passed here is b.length-1 rather than start2 - confirm this is intended when start2 was shifted above.
+ int edits=bandy.alignReverse(b, a, b.length-1, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop2/stop1 are updated but not read again before returning.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Tests for an overlap between this unit read right to left from start1 and the reverse complement of u2,
+ * i.e. u2 read left to right from start2 with each base complemented.
+ * Always returns false when ignoreReverseComplement is set. A start1 at or beyond this unit's length is
+ * clamped to the last index and start2 is advanced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. Once the count exceeds
+ * calcMaxEdits(maxSubs, minIdentityMult, b.length) the method falls back to banded alignment.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may exceed the last index on entry
+ * @param start2 Start index in u2's bases (walks upward)
+ * @param bandy Banded aligner used for the fallback; may be null
+ * @param earlyExit If true, a counted mismatch at j<k2 rejects immediately
+ * @param maxEdits Edit limit for the banded fallback
+ * @return true if the overlap is accepted
+ */
+ private boolean overlapsReverseRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
+ if(verbose){System.err.println("overlapsReverseRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
+
+ if(ignoreReverseComplement){return false;}
+ final int len1=length(), len2=u2.length();
+ if(start1>=len1){
+ start2+=(start1-len1+1);
+ start1=len1-1;
+ if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
+ }
+ // overlapLength is the span actually compared; overlapLength2 is the longer of the two remaining spans.
+ final int overlapLength=Tools.min(start1+1, len2-start2);
+ final int overlapLength2=Tools.max(start1+1, len2-start2);
+ int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ // The minimum-overlap filters here use overlapLength2 (the longer span), not overlapLength.
+ if(overlapLength2<minOverlapCluster){return false;}
+ if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ for(int i=start1, j=start2; j<=stop2; i--, j++){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return false;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>maxMismatches){
+ if(bandy==null || maxEdits<1){return false;}
+ if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
+ // NOTE(review): the query start passed here is 0 rather than start2 - confirm this is intended when start2 was shifted above.
+ int edits=bandy.alignReverseRC(b, a, 0, start1, maxEdits, exact);
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // stop2/stop1 are updated but not read again before returning.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+ return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+
+
+ /**
+ * Builds a FORWARD Overlap: this unit read left to right from start1, u2 read left to right from index 0.
+ * A negative start1 is clamped to 0 and u2's start is advanced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. From the second counted
+ * mismatch on, a non-null bandy takes over with banded alignment; without bandy the overlap is rejected once
+ * mismatches exceed calcMaxEdits(maxSubs, minIdentityMult, overlapLength).
+ * Note: maxEdits is not a parameter here; it resolves to a name declared outside this method.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may be negative on entry
+ * @param bandy Banded aligner; may be null
+ * @param earlyExit If true, a counted mismatch at j<k2 rejects immediately
+ * @return The Overlap, or null if rejected
+ */
+ private Overlap makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapForward(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+ final int len1=length(), len2=u2.length();
+ int start2=0;
+ if(start1<0){
+ start2-=start1;
+ start1=0;
+ }
+ final int overlapLength=Tools.min(len1-start1, len2-start2);
+ final int overlapLength2=Tools.max(len1-start1, len2-start2); //Not read in this method
+ int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ // The minimum-overlap filters here use overlapLength (the compared span).
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ for(int i=start1, j=start2; j<=stop2; i++, j++){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignForward(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the end coordinates reported by the aligner.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1+1<=len1){stop1++;}
+// else{stop2--;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2+1<=len2){stop2++;}
+// else{stop1--;}
+// }
+// }
+ return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed; the overlap is reported with the substitution count and 0 in the following slot.
+ return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+ * Builds a FORWARDRC Overlap: this unit read left to right from start1 against the reverse complement of u2,
+ * i.e. u2 read right to left from its last index with each base complemented.
+ * Returns null when ignoreReverseComplement is set. A negative start1 is clamped to 0 and u2's start is
+ * reduced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. From the second counted
+ * mismatch on, a non-null bandy takes over with banded alignment; without bandy the overlap is rejected once
+ * mismatches exceed calcMaxEdits(maxSubs, minIdentityMult, b.length).
+ * Note: maxEdits is not a parameter here; it resolves to a name declared outside this method.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may be negative on entry
+ * @param bandy Banded aligner; may be null
+ * @param earlyExit If true, a counted mismatch within the first k2 positions of this unit's window rejects immediately
+ * @return The Overlap, or null if rejected
+ */
+ private Overlap makeOverlapForwardRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapForwardRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+ if(ignoreReverseComplement){return null;}
+ final int len1=length(), len2=u2.length();
+ int start2=len2-1;
+ if(start1<0){
+ start2+=start1;
+ start1=0;
+ }
+ final int overlapLength=Tools.min(len1-start1, start2+1);
+ final int overlapLength2=Tools.max(len1-start1, start2+1); //Not read in this method
+ int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ // The minimum-overlap filters here use overlapLength (the compared span).
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ // iprefix marks the end of the region in which earlyExit rejects on the first counted mismatch.
+ for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i<iprefix){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignForwardRC(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the end coordinates reported by the aligner.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1+1<=len1){stop1++;}
+// else{stop2++;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2>0){stop2--;}
+// else{stop1--;}
+// }
+// }
+ return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed; the overlap is reported with the substitution count and 0 in the following slot.
+ return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+ * Builds a REVERSE Overlap: this unit read right to left from start1, u2 read right to left from its last index.
+ * A start1 at or beyond this unit's length is clamped to the last index and u2's start is reduced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. From the second counted
+ * mismatch on, a non-null bandy takes over with banded alignment; without bandy the overlap is rejected once
+ * mismatches exceed calcMaxEdits(maxSubs, minIdentityMult, b.length).
+ * Note: maxEdits is not a parameter here; it resolves to a name declared outside this method.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may exceed the last index on entry
+ * @param bandy Banded aligner; may be null
+ * @param earlyExit If true, a counted mismatch within the first k2 positions of this unit's window rejects immediately
+ * @return The Overlap, or null if rejected
+ */
+ private Overlap makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapReverse(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+
+ final int len1=length(), len2=u2.length();
+ int start2=len2-1;
+ if(start1>=len1){
+ start2-=(start1-len1+1);
+ start1=len1-1;
+ }
+ final int overlapLength=Tools.min(start1+1, start2+1);
+ final int overlapLength2=Tools.max(start1+1, start2+1); //Not read in this method
+ int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ // The minimum-overlap filters here use overlapLength (the compared span).
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ // iprefix marks the end of the region in which earlyExit rejects on the first counted mismatch.
+ for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){
+ byte aa=a[i];
+ byte bb=b[j];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && i>iprefix){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignReverse(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the end coordinates reported by the aligner.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1>0){stop1--;}
+// else{stop2++;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2>0){stop2--;}
+// else{stop1++;}
+// }
+// }
+ return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed; the overlap is reported with the substitution count and 0 in the following slot.
+ return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+ * Builds a REVERSERC Overlap: this unit read right to left from start1 against the reverse complement of u2,
+ * i.e. u2 read left to right from index 0 with each base complemented.
+ * Returns null when ignoreReverseComplement is set. A start1 at or beyond this unit's length is clamped to the
+ * last index and u2's start is advanced by the same amount.
+ * Differences are counted only when 'exact' is set or both symbols are fully defined. From the second counted
+ * mismatch on, a non-null bandy takes over with banded alignment; without bandy the overlap is rejected once
+ * mismatches exceed calcMaxEdits(maxSubs, minIdentityMult, b.length).
+ * Note: maxEdits is not a parameter here; it resolves to a name declared outside this method.
+ * @param u2 The other unit
+ * @param start1 Start index in this unit's bases; may exceed the last index on entry
+ * @param bandy Banded aligner; may be null
+ * @param earlyExit If true, a counted mismatch at j<k2 rejects immediately
+ * @return The Overlap, or null if rejected
+ */
+ private Overlap makeOverlapReverseRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
+ if(verbose){System.err.println("makeOverlapReverseRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
+ if(ignoreReverseComplement){return null;}
+ final int len1=length(), len2=u2.length();
+ int start2=0;
+ if(start1>=len1){
+ start2+=(start1-len1+1);
+ start1=len1-1;
+ }
+ final int overlapLength=Tools.min(start1+1, len2-start2);
+ final int overlapLength2=Tools.max(start1+1, len2-start2); //Not read in this method
+ int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1;
+ if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
+
+ if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
+ // The minimum-overlap filters here use overlapLength (the compared span).
+ if(overlapLength<minOverlapCluster){return null;}
+ if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
+ }
+
+ final byte[] a=bases(), b=u2.bases();
+ assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
+ int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
+
+ if(verbose){
+ System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
+ ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
+ }
+
+ for(int i=start1, j=start2; j<=stop2; i--, j++){
+ byte aa=a[i];
+ byte bb=baseToComplementExtended[b[j]];
+ if(aa!=bb){
+ if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){
+ if(earlyExit && j<k2){return null;}
+ mismatches++;
+ if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
+ if(mismatches>1 && bandy!=null){
+ if(maxEdits<1){return null;}
+ if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
+ int edits=bandy.alignReverseRC(b, a, start2, start1, maxEdits, exact);
+ if(edits>maxEdits || bandy.score()<=4*edits){
+ if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
+ return null;
+ }
+ assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
+ // Take the end coordinates reported by the aligner.
+ stop2=bandy.lastQueryLoc;
+ stop1=bandy.lastRefLoc;
+// if(bandy.lastOffset>0){//Ref longer than query
+// for(int k=0; k<bandy.lastOffset; k++){
+// if(stop1>0){stop1--;}
+// else{stop2--;}//I don't think this can happen
+// }
+// }else if(bandy.lastOffset<0){//Query longer than ref
+// for(int k=0; k>bandy.lastOffset; k--){
+// if(stop2+1<=len2){stop2++;}
+// else{stop1++;}
+// }
+// }
+ return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
+ }else if(mismatches>maxMismatches){return null;}
+ }
+ }
+ }
+ // No banded alignment was needed; the overlap is reported with the substitution count and 0 in the following slot.
+ return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
+ }
+
+ /**
+  * Orders this unit relative to another by delegating to comparePairedRC(this, b).
+  * Earlier debugging assertions that cross-checked the result against equals() were disabled
+  * and are not evaluated.
+  * @param b Unit to compare against
+  * @return The value returned by comparePairedRC
+  */
+ @Override
+ public int compareTo(Unit b) {
+     return comparePairedRC(this, b);
+ }
+
+ public boolean equals(Object b){return equals((Unit)b);}
+ /**
+  * Typed equality test; delegates to pairedEqualsRC(this, b).
+  * Earlier debugging assertions that cross-checked the result against comparePairedRC were disabled
+  * and are not evaluated.
+  * @param b Unit to compare with
+  * @return The value returned by pairedEqualsRC
+  */
+ public boolean equals(Unit b){
+     return pairedEqualsRC(this, b);
+ }
+
+ /** Hash derived from code1 only: its upper 32 bits are XOR-folded into the lower 32 bits. */
+ @Override
+ public int hashCode(){
+     final long folded=code1^(code1>>>32);
+     // The narrowing cast keeps exactly the low 32 bits.
+     return (int)folded;
+ }
+
+ /**
+  * Marks this unit valid (b=true clears INVALID_MASK) or invalid (b=false sets it).
+  * Asserts that the call actually changes the current state.
+  */
+ private synchronized void setValid(boolean b){
+     assert(b!=valid());
+     flags=(b ? (flags&~INVALID_MASK) : (flags|INVALID_MASK));
+     assert(b==valid());
+ }
+
+ /** Sets or clears CLUSTER_MASK in flags; asserts that the call actually changes the current state. */
+ private synchronized void setClustered(boolean b){
+     assert(b!=clustered());
+     flags=(b ? (flags|CLUSTER_MASK) : (flags&~CLUSTER_MASK));
+     assert(b==clustered());
+ }
+
+ /** Sets or clears VISIT_MASK in flags; asserts that the call actually changes the current state. Not synchronized. */
+ private void setVisited(boolean b){
+     assert(b!=visited());
+     flags=(b ? (flags|VISIT_MASK) : (flags&~VISIT_MASK));
+     assert(b==visited());
+ }
+
+ /**
+  * Sets or clears CANON_MASK in flags; asserts that the call actually changes the current state.
+  * When a read is attached, also asserts that the new value agrees with isCanonical(r.bases).
+  */
+ private synchronized void setCanonical(boolean b){
+     assert(b!=canonical());
+     flags=(b ? (flags|CANON_MASK) : (flags&~CANON_MASK));
+     assert(b==canonical());
+     assert(r==null || b==isCanonical(r.bases));
+ }
+
+ /** Sets or clears CANONICIZED_MASK in flags; asserts that the call actually changes the current state. Not synchronized. */
+ private void setCanonicized(boolean b){
+     assert(b!=canonicized());
+     flags=(b ? (flags|CANONICIZED_MASK) : (flags&~CANONICIZED_MASK));
+     assert(b==canonicized());
+ }
+
+ /**
+  * Sets or clears CANON_CONTRADICTION_MASK in flags.
+  * Unlike most sibling setters, no assertion is made about the previous state, so repeated calls
+  * with the same value are permitted.
+  */
+ private synchronized void setCanonContradiction(boolean b){
+     flags=(b ? (flags|CANON_CONTRADICTION_MASK) : (flags&~CANON_CONTRADICTION_MASK));
+     assert(b==canonContradiction());
+ }
+
+ /**
+  * Stores a new offset and then marks the offset as valid via setOffsetValid(true).
+  * @param x Offset value to store
+  */
+ private synchronized void setOffset(int x){
+     this.offset=x;
+     this.setOffsetValid(true);
+ }
+
+ /**
+  * Sets or clears OFFSET_VALID_MASK in flags.
+  * The precondition asserts that the offset is not already marked valid, regardless of the value of b.
+  */
+ private synchronized void setOffsetValid(boolean b){
+     assert(!offsetValid());
+     flags=(b ? (flags|OFFSET_VALID_MASK) : (flags&~OFFSET_VALID_MASK));
+     assert(b==offsetValid());
+ }
+
+ /**
+  * Sets or clears OFFSET_CONTRADICTION_MASK in flags.
+  * Requires (by assertion) that the offset is currently valid; the previous contradiction state is not checked.
+  */
+ private synchronized void setOffsetContradiction(boolean b){
+     assert(offsetValid());
+     flags=(b ? (flags|OFFSET_CONTRADICTION_MASK) : (flags&~OFFSET_CONTRADICTION_MASK));
+     assert(b==offsetContradiction());
+ }
+
+ private void reverseComplement(){
+ assert(r!=null);
+ r.reverseComplement();
+
+ if(prefixes!=null){
+ assert(suffixes!=null) : "Can't rcomp with null suffix array.";
+ for(int i=0; i<prefixes.length; i++){
+ long temp=prefixes[i];
+ prefixes[i]=suffixes[i];
+ suffixes[i]=temp;
+ }
+ }
+ setCanonical(!canonical());
+ }
+
+ /**
+  * Decides whether 'this' should be the first Unit in an Overlap with u2.
+  * Tie-breaking order: greater length, then greater code1, then greater code2, then a positive
+  * compareTo result, and finally the read's numericID (greater or equal wins).
+  * @param u2 The other unit; must not be the same object as this
+  * @return true if this unit goes first
+  */
+ public boolean firstInOverlap(Unit u2){
+     assert(this!=u2) : "\n"+this.r+"\n"+u2.r;
+     final int otherLen=u2.length(), myLen=length();
+     if(myLen!=otherLen){return myLen>otherLen;}
+     if(code1!=u2.code1){return code1>u2.code1;}
+     if(code2!=u2.code2){return code2>u2.code2;}
+     final int order=compareTo(u2);
+     assert(order!=0 || (r!=null && r.mate!=null));
+     if(order!=0){return order>0;}
+     return r.numericID>=u2.r.numericID;
+ }
+
+ /**
+  * Reports whether this unit belongs to the currently selected subset.
+  * With fewer than 2 subsets every unit is in the set. A unit whose read has pairnum()>0 takes the
+  * answer of the Unit attached to its mate; otherwise membership is decided by the sign-cleared
+  * code1 modulo subsetCount.
+  */
+ public final boolean inSet(){
+     if(subsetCount>=2){
+         if(r.pairnum()>0){
+             final Unit mateUnit=(Unit)r.mate.obj;
+             return mateUnit.inSet();
+         }
+         final long nonNegativeCode=code1&Long.MAX_VALUE;
+         return (nonNegativeCode%subsetCount)==subset;
+     }
+     return true;
+ }
+
+ public byte[] bases(){return r==null ? null : r.bases;}
+
+ public String name(){return r!=null ? r.id : null /*code+""*/;}
+ public String toString(){return "("+name()+","+code1+","+code2+","+length()+","+prefixes[0]+","+suffixes[0]+","+(canonical()?"c":"nc")+",d="+depth+")";}
+
+
+ /** The read this unit wraps; may be null (see bases() and name()). */
+ public final Read r;
+ /** Primary code of this unit; used by hashCode(), inSet() and as the first code tie-breaker in firstInOverlap(). */
+ public final long code1;
+ /** Secondary code of this unit; used as the second code tie-breaker in firstInOverlap(). */
+ public final long code2;
+ /** One prefix key per affix map; null when numAffixMaps is 0. */
+ public final long[] prefixes=(numAffixMaps>0 ? new long[numAffixMaps] : null);
+ /** One suffix key per affix map; null when storeSuffix is false or numAffixMaps is 0. */
+ public final long[] suffixes=(storeSuffix && numAffixMaps>0 ? new long[numAffixMaps] : null);
+
+ /** Distance of leftmost side of this read relative to leftmost side of root.
+ * Assumes everything is in 'forward' orientation.
+ * Starts at a sentinel value; only meaningful once the offset-valid flag is set. */
+ public int offset=-999999999;
+ /** Depth of this unit, compared via depthCongruent when parseDepth is enabled; defaults to 1. */
+ public int depth=1;
+// private boolean valid=true;
+
+ /** Identifier of this unit; it is assigned outside this section. */
+ public int unitID;
+
+ /** Overlaps involving this unit; not initialized here, so it may be null. */
+ public ArrayList<Overlap> overlapList;
+
+ /** Packed state: the low 31 bits hold the length (LEN_MASK) and higher bits hold the boolean status flags. */
+ private long flags;
+ /** True if the original read orientation was canonical */
+ public final boolean canonical(){return (CANON_MASK&flags)!=0;}
+ /** True if this contig should be output, false if not */
+ public final boolean valid(){return (INVALID_MASK&flags)==0;}
+ /** Length of this contig */
+ public final int length(){return (int)(LEN_MASK&flags);}
+ /** Returns the stored offset relative to the root; asserts that the offset has been marked valid. */
+ public final int offset(){
+     assert(offsetValid());
+     return this.offset;
+ }
+ public int pairnum(){return (PAIRNUM_MASK&flags)==PAIRNUM_MASK ? 1 : 0;}
+
+ /**
+  * Clears the per-pass status bits (canonicized, visited, canon contradiction, offset valid,
+  * offset contradiction) and leaves all other bits of flags untouched.
+  */
+ public void clearVolatileFlags(){
+     final long volatileBits=CANONICIZED_MASK|VISIT_MASK|CANON_CONTRADICTION_MASK|OFFSET_VALID_MASK|OFFSET_CONTRADICTION_MASK;
+     flags&=~volatileBits;
+     assert(!visited());
+     assert(!canonicized());
+     assert(!canonContradiction());
+     assert(!offsetValid());
+     assert(!offsetContradiction());
+ }
+
+ public boolean visited(){return (VISIT_MASK&flags)==VISIT_MASK;}
+ public boolean clustered(){return (CLUSTER_MASK&flags)==CLUSTER_MASK;}
+ public boolean canonicized(){return (CANONICIZED_MASK&flags)==CANONICIZED_MASK;}
+ public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;}
+ public boolean offsetValid(){return (OFFSET_VALID_MASK&flags)==OFFSET_VALID_MASK;}
+ public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;}
+ public boolean contradiction(){return offsetContradiction() || canonContradiction();}
+
+ private static final long LEN_MASK=0x7FFFFFFFL;
+ private static final long CANON_MASK=(1L<<33);
+ private static final long INVALID_MASK=(1L<<34);
+ private static final long VISIT_MASK=(1L<<35);
+ private static final long CLUSTER_MASK=(1L<<36);
+ private static final long CANONICIZED_MASK=(1L<<37);
+ private static final long CANON_CONTRADICTION_MASK=(1L<<38);
+ private static final long OFFSET_VALID_MASK=(1L<<39);
+ private static final long OFFSET_CONTRADICTION_MASK=(1L<<40);
+ private static final long PAIRNUM_MASK=(1L<<41);
+ }
+
+ private static final class UnitOffsetComparator implements Comparator<Unit> {
+
+ UnitOffsetComparator(){}
+
+ /** Units with a valid offset sort first, by ascending offset; equal offsets and
+ * pairs where neither offset is valid are resolved by Unit.compareTo. */
+ @Override
+ public int compare(Unit a, Unit b) {
+ final boolean aValid=a.offsetValid(), bValid=b.offsetValid();
+ if(aValid && bValid){
+ final int delta=a.offset()-b.offset();
+ if(delta!=0){return delta;}
+ }else if(aValid){
+ return -1;
+ }else if(bValid){
+ return 1;
+ }
+ return a.compareTo(b);
+ }
+ }
+
+ private static final class ClusterLengthComparator implements Comparator<ArrayList<Unit>> {
+
+ ClusterLengthComparator(){}
+
+ /** Sorts clusters by descending size; equal-sized, non-empty clusters are
+ * ordered by comparing their first units. */
+ @Override
+ public int compare(ArrayList<Unit> a, ArrayList<Unit> b) {
+ final int sizeA=a.size(), sizeB=b.size();
+ if(sizeA!=sizeB){return sizeB-sizeA;}
+ if(sizeA==0){return 0;}
+ return a.get(0).compareTo(b.get(0));
+ }
+
+ }
+
+ private static final int[] makeNmerIndex(int n){
+ //Gives every 2-bit-encoded n-mer a dense index that it shares with its reverse complement.
+ final int limit=1<<(2*n);
+ final int[] index=new int[limit];
+ int next=0;
+ for(int fwd=0; fwd<limit; fwd++){
+ final int rev=AminoAcid.reverseComplementBinaryFast(fwd, n);
+ if(fwd<=rev){
+ index[fwd]=next;
+ index[rev]=next;
+ next++;
+ }
+ }
+ return index;
+ }
+
+ /** Makes a nmer (e.g., tetramer) profile of a cluster */
+ private static final float[] makeNmerProfile(ArrayList<Unit> alu, long[] array_){
+ final int nbits=2*nmerLength;
+ final long[] array=(array_==null ? new long[maxNmer+1] : array_);
+ final int mask=~((-1)<<(nbits));
+
+ long keysCounted=0;
+
+ for(Unit u : alu){
+ byte[] bases=u.r.bases;
+ int len=0;
+ int kmer=0;
+ for(byte b : bases){
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+ if(len>=nmerLength){
+ int rkmer=AminoAcid.reverseComplementBinaryFast(kmer, nmerLength);
+ keysCounted++;
+ array[nmerIndex[Tools.min(kmer, rkmer)]]++;
+ }
+ }
+ }
+ }
+
+ if(keysCounted==0){keysCounted=1;}
+ final float mult=1f/keysCounted;
+
+ float[] r=new float[array.length];
+ for(int i=0; i<array.length; i++){
+ r[i]=array[i]*mult;
+ array[i]=0;
+ }
+ return r;
+ }
+
+ private ConcurrentReadInputStream crisa[];
+
+ private final ByteStreamWriter dupeWriter;
+
+
+ private String[] in1=null;
+ private String[] in2=null;
+ private String out=null;
+ private String clusterFilePattern=null;
+ private String outbest=null;
+ private String outdupe=null;
+ private String outcsf=null;
+ private String outgraph=null;
+ private int maxNs=-1;
+ private long maxReads=-1;
+ public boolean errorState=false;
+ boolean sort=false;
+ boolean ascending=true;
+ boolean absorbContainment=true;
+ boolean absorbMatch=true;
+ boolean findOverlaps=false;
+ boolean makeClusters=false;
+ boolean processClusters=false;
+ boolean renameClusters=false;
+ boolean absorbOverlap=false;
+ boolean storeSuffix=true;
+ boolean storeName=true;
+ boolean storeQuality=true;
+ boolean exact=true;
+ boolean uniqueNames=true;
+ boolean maxSpanningTree=false;
+
+ boolean canonicizeClusters=true;
+ boolean removeCycles=true;
+ boolean fixMultiJoins=true;
+ boolean fixCanonContradictions=false;
+ boolean fixOffsetContradictions=false;
+ boolean countTransitive=false;
+ boolean countRedundant=false;
+
+ private boolean multipleInputFiles=false;
+ private boolean rigorousTransitive=false;
+ private int numAffixMaps=1;
+ private int maxAffixCopies=2000000000;
+ private int maxEdges=2000000000;
+ private int maxEdges2=2000000000;
+ private boolean allowAllContainedOverlaps=false;
+// private boolean toUpperCase=false;
+
+ /** Trim left bases of the read to this position (exclusive, 0-based) */
+ private int forceTrimLeft=-1;
+ /** Trim right bases of the read after this position (exclusive, 0-based) */
+ private int forceTrimRight=-1;
+
+ int maxEdits=0;
+ int maxSubs=0;
+ int bandwidth=9;
+ final boolean customBandwidth;
+ float minIdentity=100;
+ float minIdentityMult=0;
+ float minLengthPercent=0;
+ int minOverlapCluster=100;
+ int minOverlapMerge=1;
+ float minOverlapPercentCluster=0;
+ float minOverlapPercentMerge=0;
+
+ private int minClusterSize=1;
+ private int minClusterSizeForStats=1;
+ private boolean pickBestRepresentativePerCluster=false;
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+ long collisions=0;
+ long containments=0;
+ long baseContainments=0;
+ long containmentCollisions=0;
+ long matches=0;
+ long baseMatches=0;
+ long overlaps=0;
+ long baseOverlaps=0;
+ long overlapCollisions=0;
+ long addedToMain=0;
+
+ private final int subset;
+ private final int subsetCount;
+ private final boolean subsetMode;
+
+ private final int k;
+ private final int k2;
+ private final boolean EA;
+
+ private static int tcount=0;
+
+ private LinkedHashMap<Long, ArrayList<Unit>> codeMap=new LinkedHashMap<Long, ArrayList<Unit>>(4000000);
+// private HashMap<LongM, ArrayList<Unit>> affixMap1=null;
+// private HashMap<LongM, ArrayList<Unit>> affixMap2=null;
+ private HashMap<LongM, ArrayList<Unit>>[] affixMaps=null;
+ private ArrayDeque<ArrayList<Unit>> clusterQueue=null;
+ private ArrayList<ArrayList<Unit>> processedClusters=null;
+ private AtomicIntegerArray clusterNumbers=null;
+
+ private static final UnitOffsetComparator UNIT_OFFSET_COMPARATOR=new UnitOffsetComparator();
+ private static final ClusterLengthComparator CLUSTER_LENGTH_COMPARATOR=new ClusterLengthComparator();
+ private static final long[][] hashcodes=makeCodes2(32);
+ public static final byte[] baseToNumber=new byte[128];
+ public static final byte[] baseToComplementNumber=new byte[128];
+ public static final byte[] baseToComplementExtended=new byte[128];
+ public static final int nmerLength=4;
+ public static final int[] nmerIndex=makeNmerIndex(nmerLength);
+ public static final int maxNmer=Tools.max(nmerIndex);
+ private static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ public static boolean showSpeed=true;
+ public static boolean verbose=false;
+ public static boolean ignoreReverseComplement=false;
+ public static boolean preventTransitiveOverlaps=false;
+ public static boolean ignoreAffix1=false;
+ public static boolean parseDepth=false;
+ public static boolean printLengthInEdges=false;
+ public static float depthRatio=2;
+ public static int MINSCAF=0;
+ public static int THREADS=Shared.threads();
+ public static int threadMaxReadsToBuffer=4000;
+ public static int threadMaxBasesToBuffer=32000000;
+ public static boolean DISPLAY_PROGRESS=true;
+ public static boolean UNIQUE_ONLY=false;
+ public static boolean REQUIRE_MATCHING_NAMES=false;
+ public static boolean NUMBER_GRAPH_NODES=true;
+ public static boolean ADD_PAIRNUM_TO_NAME=true;
+
+ private static int reverseType(int type){return (type+2)%4;}
+ public static final int FORWARD=0;
+ public static final int FORWARDRC=1;
+ public static final int REVERSE=2;
+ public static final int REVERSERC=3;
+ public static final String[] OVERLAP_TYPE_NAMES=new String[] {"FORWARD", "FORWARDRC", "REVERSE", "REVERSERC"};
+ public static final String[] OVERLAP_TYPE_ABBREVIATIONS=new String[] {"F", "FRC", "R", "RRC"};
+
+ static{//Fill the local lookup tables; entries not assigned below keep the default value 0
+ baseToNumber['A']=baseToNumber['a']=0;
+ baseToNumber['C']=baseToNumber['c']=1;
+ baseToNumber['G']=baseToNumber['g']=2;
+ baseToNumber['T']=baseToNumber['t']=3;
+ baseToNumber['U']=baseToNumber['u']=3;
+ //Complement codes in the same 0-3 numbering: A<->T/U and C<->G
+ baseToComplementNumber['A']=baseToComplementNumber['a']=3;
+ baseToComplementNumber['C']=baseToComplementNumber['c']=2;
+ baseToComplementNumber['G']=baseToComplementNumber['g']=1;
+ baseToComplementNumber['T']=baseToComplementNumber['t']=0;
+ baseToComplementNumber['U']=baseToComplementNumber['u']=0;
+ //Copy AminoAcid.baseToComplementExtended, replacing negative entries with the index itself
+ for(int i=0; i<AminoAcid.baseToComplementExtended.length; i++){ //NOTE(review): assumes the source table has at most 128 entries - confirm
+ byte b=AminoAcid.baseToComplementExtended[i];
+ baseToComplementExtended[i]=(b<0 ? (byte)i : b);
+ }
+ }
+
+}
diff --git a/current/jgi/DedupeByMapping.java b/current/jgi/DedupeByMapping.java
new file mode 100755
index 0000000..3281f23
--- /dev/null
+++ b/current/jgi/DedupeByMapping.java
@@ -0,0 +1,452 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.PriorityQueue;
+
+import dna.Timer;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.ReadStreamWriter;
+import stream.SamLine;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 30, 2015
+ *
+ */
+public class DedupeByMapping extends BBTool_ST{
+
+ public static void main(String[] args){
+ final Timer timer=new Timer(); //Created before the tool is constructed
+ final DedupeByMapping tool=new DedupeByMapping(args);
+ tool.process(timer);
+ }
+
+ /** Passes the arguments to the superclass and to reparse, then sets SamLine.SET_FROM_OK and ReadStreamWriter.USE_ATTACHED_SAMLINE.
+ * @param args Command-line arguments
+ */
+ public DedupeByMapping(String[] args) {
+ super(args);
+ reparse(args);
+ SamLine.SET_FROM_OK=true;
+ ReadStreamWriter.USE_ATTACHED_SAMLINE=true;
+ if(sorted){queue=new PriorityQueue<Quad>(initialSize);} //The queue is only allocated in sorted mode
+ }
+
+ /** Defaults: keep unmapped reads, treat the input as unsorted,
+ * and respect pair order when building position keys.
+ */
+ @Override
+ void setDefaults() {
+ usePairOrder=true;
+ sorted=false;
+ keepUnmapped=true;
+ }
+
+ @Override
+ public boolean parseArgument(String arg, String a, String b) {
+ //Returns true only when the key a was recognized and consumed here.
+ if(a.equals("keepunmapped") || a.equals("ku")){
+ keepUnmapped=Tools.parseBoolean(b);
+ }else if(a.equals("ignorepairorder") || a.equals("ipo")){
+ usePairOrder=!Tools.parseBoolean(b); //The flag is the inverse of the internal setting
+ }else if(a.equals("sorted")){
+ sorted=Tools.parseBoolean(b);
+ }else{
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ boolean processReadPair(Read r1, Read r2) { //Dispatches a single read to the sorted or unsorted handler
+ assert(r2==null); //Reads are expected one at a time; the handlers link mates by name
+ return (sorted ? processReadPair_sorted(r1) : processReadPair_unsorted(r1));
+ }
+
+ boolean processReadPair_unsorted(Read r1) { //Assigns mapping coordinates from the attached SamLine, then links the read to an earlier read with the same name
+ SamLine sl=(SamLine) r1.obj;
+ if(!sl.primary()){return false;} //Non-primary alignments are rejected
+ if(sl.mapped()){
+ String rname=new String(sl.rname());
+ Integer x=contigToNumber.get(rname); //Contigs are numbered in order of first appearance
+ if(x==null){
+ x=contigToNumber.size();
+ contigToNumber.put(rname, x);
+ }
+ r1.chrom=x;
+ r1.start=sl.start(true, false);
+ r1.stop=sl.stop(r1.start, true, false);
+ r1.setStrand(sl.strand());
+ }else{//Unmapped reads get sentinel coordinates
+ r1.chrom=-1;
+ r1.start=-1;
+ }
+ //Link mates through the name map
+ Read old=nameToRead.get(r1.id);
+ if(old==null){
+ nameToRead.put(r1.id, r1);
+ }else{
+ assert(old.mate==null); //At most two records per name are expected
+ old.mate=r1;
+ r1.mate=old;
+ SamLine sl2=(SamLine) old.obj;
+ if(sl2.pairnum()==1){ //The stored read reports pairnum 1, so index the pair under the new read instead
+ nameToRead.put(r1.id, r1);
+ }
+ }
+ return true;
+ }
+
+
+ boolean processReadPair_sorted(Read r1) { //Currently the same steps as processReadPair_unsorted, behind a TODO assertion
+ assert(false) : "TODO"; //Sorted mode is unfinished; this fails immediately when assertions are enabled
+ SamLine sl=(SamLine) r1.obj;
+ if(!sl.primary()){return false;} //Non-primary alignments are rejected
+ if(sl.mapped()){
+ String rname=new String(sl.rname());
+ Integer x=contigToNumber.get(rname); //Contigs are numbered in order of first appearance
+ if(x==null){
+ x=contigToNumber.size();
+ contigToNumber.put(rname, x);
+ }
+ r1.chrom=x;
+ r1.start=sl.start(true, false);
+ r1.stop=sl.stop(r1.start, true, false);
+ r1.setStrand(sl.strand());
+ }else{//Unmapped reads get sentinel coordinates
+ r1.chrom=-1;
+ r1.start=-1;
+ }
+ //Link mates through the name map
+ Read old=nameToRead.get(r1.id);
+ if(old==null){
+ nameToRead.put(r1.id, r1);
+ }else{
+ assert(old.mate==null); //At most two records per name are expected
+ old.mate=r1;
+ r1.mate=old;
+ SamLine sl2=(SamLine) old.obj;
+ if(sl2.pairnum()==1){ //The stored read reports pairnum 1, so index the pair under the new read instead
+ nameToRead.put(r1.id, r1);
+ }
+ }
+ return true;
+ }
+
+ @Override
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+ if(!sorted){processInner_unsorted(cris, ros);} //Default path
+ else{processInner_sorted(cris, ros);}
+ }
+
+
+ void processInner_unsorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+ //Phases: (1) load all reads and link mates by name, (2) keep one pair per mapping position, (3) write the survivors.
+ readsProcessed=0;
+ basesProcessed=0;
+
+ {
+ //Phase 1: stream the input; processReadPair files each read in nameToRead
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ assert(r1.mate==null);
+ assert(r1.obj!=null);
+
+ final int initialLength1=r1.length();
+
+ {
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ }
+
+ processReadPair(r1, null);
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+ //Phase 2: group pairs by mapping position; on a collision keep the pair with fewer expected errors per base
+ {
+ contigToNumber=null;
+ ArrayList<Read> list=new ArrayList<Read>(nameToRead.size());
+ for(Read r : nameToRead.values()){
+ list.add(r);
+ }
+ nameToRead=null;
+ for(int i=0; i<list.size(); i++){
+ Read r1=list.set(i, null);
+
+ Read r2=r1.mate;
+ if(!r1.mapped() && !r1.mateMapped()){
+ unmappedReads+=1+r1.mateCount();
+ unmappedBases+=r1.length()+r1.mateLength();
+ if(keepUnmapped){unmapped.add(r1);}
+ }else{
+ Quad q=new Quad((r1.strand()==0 ? r1.start : r1.stop), r1.chrom, r2==null ? -2 : (r2.strand()==0 ? r2.start : r2.stop), r2==null ? -2 : r2.chrom);
+ Read old1=quadToRead.get(q);
+ if(old1==null){quadToRead.put(q, r1);}
+ else{
+ Read old2=old1.mate;
+ //a and b are expected errors divided by total length; both sums must be parenthesized before the division
+ float a=(r1.expectedErrors(true, 0)+(r2==null ? 0 : r2.expectedErrors(true, 0)))/(r1.length()+r1.mateLength());
+ float b=(old1.expectedErrors(true, 0)+(old2==null ? 0 : old2.expectedErrors(true, 0)))/(old1.length()+old1.mateLength());
+ if(a<b){
+ quadToRead.put(q, r1);
+ duplicateReads+=1+old1.mateCount();
+ duplicateBases+=old1.length()+old1.mateLength();
+ }else{
+ duplicateReads+=1+r1.mateCount();
+ duplicateBases+=r1.length()+r1.mateLength();
+ }
+ }
+ }
+ }
+ list=null;
+ }
+ //Phase 3: write the retained reads in batches of Shared.READ_BUFFER_LENGTH, then the unmapped list
+ {
+ ArrayList<Read> list=new ArrayList<Read>(Shared.READ_BUFFER_LENGTH);
+ int num=0;
+ for(Quad q : quadToRead.keySet()){
+ Read r=quadToRead.get(q);
+ if(keepUnmapped || r.mapped() || (r.mate!=null && r.mate.mapped())){
+ list.add(r);
+ if(list.size()>=Shared.READ_BUFFER_LENGTH){
+ if(ros!=null){
+ ros.add(list, num);
+ num++;
+ }
+ list=new ArrayList<Read>(Shared.READ_BUFFER_LENGTH);
+ }
+ }
+ }
+ if(list.size()>0){
+ if(ros!=null){
+ ros.add(list, num);
+ num++;
+ }
+ list=null;
+ }
+ if(ros!=null && unmapped.size()>0){
+ ros.add(unmapped, num);
+ num++;
+ }
+ }
+ outstream.println("Duplicate reads: "+duplicateReads+" \t("+duplicateBases+" bases)");
+ outstream.println("Unmapped reads: "+unmappedReads+" \t("+unmappedBases+" bases)");
+ }
+
+
+
+ void processInner_sorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+ assert(false) : "TODO";
+ readsProcessed=0;
+ basesProcessed=0;
+ //Sorted mode is unfinished; apart from the assertion above it currently repeats the unsorted algorithm.
+ {
+ //Phase 1: stream the input; processReadPair files each read in nameToRead
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ assert(r1.mate==null);
+ assert(r1.obj!=null);
+
+ final int initialLength1=r1.length();
+
+ {
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ }
+
+ processReadPair(r1, null);
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+ //Phase 2: group pairs by mapping position; on a collision keep the pair with fewer expected errors per base
+ {
+ contigToNumber=null;
+ ArrayList<Read> list=new ArrayList<Read>(nameToRead.size());
+ for(Read r : nameToRead.values()){
+ list.add(r);
+ }
+ nameToRead=null;
+ for(int i=0; i<list.size(); i++){
+ Read r1=list.set(i, null);
+
+ Read r2=r1.mate;
+ if(!r1.mapped() && !r1.mateMapped()){
+ unmappedReads+=1+r1.mateCount();
+ unmappedBases+=r1.length()+r1.mateLength();
+ if(keepUnmapped){unmapped.add(r1);}
+ }else{
+ Quad q=new Quad((r1.strand()==0 ? r1.start : r1.stop), r1.chrom, r2==null ? -2 : (r2.strand()==0 ? r2.start : r2.stop), r2==null ? -2 : r2.chrom);
+ Read old1=quadToRead.get(q);
+ if(old1==null){quadToRead.put(q, r1);}
+ else{
+ Read old2=old1.mate;
+ //a and b are expected errors divided by total length; both sums must be parenthesized before the division
+ float a=(r1.expectedErrors(true, 0)+(r2==null ? 0 : r2.expectedErrors(true, 0)))/(r1.length()+r1.mateLength());
+ float b=(old1.expectedErrors(true, 0)+(old2==null ? 0 : old2.expectedErrors(true, 0)))/(old1.length()+old1.mateLength());
+ if(a<b){
+ quadToRead.put(q, r1);
+ duplicateReads+=1+old1.mateCount();
+ duplicateBases+=old1.length()+old1.mateLength();
+ }else{
+ duplicateReads+=1+r1.mateCount();
+ duplicateBases+=r1.length()+r1.mateLength();
+ }
+ }
+ }
+ }
+ list=null;
+ }
+ //Phase 3: write the retained reads in batches of Shared.READ_BUFFER_LENGTH, then the unmapped list
+ {
+ ArrayList<Read> list=new ArrayList<Read>(Shared.READ_BUFFER_LENGTH);
+ int num=0;
+ for(Quad q : quadToRead.keySet()){
+ Read r=quadToRead.get(q);
+ if(keepUnmapped || r.mapped() || (r.mate!=null && r.mate.mapped())){
+ list.add(r);
+ if(list.size()>=Shared.READ_BUFFER_LENGTH){
+ if(ros!=null){
+ ros.add(list, num);
+ num++;
+ }
+ list=new ArrayList<Read>(Shared.READ_BUFFER_LENGTH);
+ }
+ }
+ }
+ if(list.size()>0){
+ if(ros!=null){
+ ros.add(list, num);
+ num++;
+ }
+ list=null;
+ }
+ if(ros!=null && unmapped.size()>0){
+ ros.add(unmapped, num);
+ num++;
+ }
+ }
+ outstream.println("Duplicate reads: "+duplicateReads+" \t("+duplicateBases+" bases)");
+ outstream.println("Unmapped reads: "+unmappedReads+" \t("+unmappedBases+" bases)");
+ }
+
+ @Override
+ void startupSubclass() {} //Intentionally empty: nothing extra is done at startup
+
+ @Override
+ void shutdownSubclass() {} //Intentionally empty: nothing extra is done at shutdown
+
+ @Override
+ void showStatsSubclass(Timer t, long readsIn, long basesIn) {} //Intentionally empty: no statistics are printed here
+
+ private static class Quad implements Comparable<Quad>{
+
+ /** Key describing where a pair maps: the start position and contig number of each mate
+ * (callers pass -2 for both values of a missing mate).
+ * Arguments are taken in the order the callers supply them: start, contig, start, contig.
+ * When usePairOrder is false the two (contig, start) tuples are stored larger-first, so that
+ * swapping the mates yields an equal key while each start stays with its own contig. */
+ Quad(int start1_, int chr1_, int start2_, int chr2_){
+ final boolean swap=!usePairOrder && (chr1_<chr2_ || (chr1_==chr2_ && start1_<start2_));
+ start1=(swap ? start2_ : start1_);
+ start2=(swap ? start1_ : start2_);
+ chr1=(swap ? chr2_ : chr1_);
+ chr2=(swap ? chr1_ : chr2_);
+ }
+
+ @Override
+ public int hashCode(){
+ return start1^(Integer.rotateLeft(start2, 8))^(Integer.rotateLeft(chr1, 16))^(Integer.rotateLeft(chr2, 24));
+ }
+
+ @Override
+ public boolean equals(Object o){
+ return (o instanceof Quad) && equals((Quad)o); //null and foreign types are unequal instead of throwing
+ }
+
+ public boolean equals(Quad o){
+ return start1==o.start1 && start2==o.start2 && chr1==o.chr1 && chr2==o.chr2;
+ }
+
+ /** Orders by first contig, first start, second contig, then second start. */
+ @Override
+ public int compareTo(Quad b) {
+ int x;
+ x=chr1-b.chr1;
+ if(x!=0){return x;}
+ x=start1-b.start1;
+ if(x!=0){return x;}
+ x=chr2-b.chr2;
+ if(x!=0){return x;}
+ x=start2-b.start2;
+ return x;
+ }
+
+ final int start1, start2, chr1, chr2;
+ }
+
+ private boolean keepUnmapped;
+ private boolean sorted;
+ private static boolean usePairOrder;
+
+ private long duplicateReads=0;
+ private long duplicateBases=0;
+ private long unmappedReads=0;
+ private long unmappedBases=0;
+
+ private int initialSize=(int)Tools.min(2000000, Tools.max(80000, Shared.getAvailableMemory()/4000));
+
+ private HashMap<String, Integer> contigToNumber=new HashMap<String, Integer>(initialSize);
+ private LinkedHashMap<Quad, Read> quadToRead=new LinkedHashMap<Quad, Read>(initialSize);
+ private LinkedHashMap<String, Read> nameToRead=new LinkedHashMap<String, Read>(initialSize);
+ private ArrayList<Read> unmapped=new ArrayList<Read>(initialSize/2);
+
+ private PriorityQueue<Quad> queue;
+
+}
diff --git a/current/jgi/DemuxByName.java b/current/jgi/DemuxByName.java
new file mode 100755
index 0000000..e8d00fd
--- /dev/null
+++ b/current/jgi/DemuxByName.java
@@ -0,0 +1,498 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import stream.ArrayListSet;
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.MultiCros;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 9, 2014
+ *
+ */
+public class DemuxByName {
+
+ public static void main(String[] args){
+ final Timer timer=new Timer(); //Created before the arguments are parsed
+ final DemuxByName demuxer=new DemuxByName(args);
+ demuxer.process(timer);
+ }
+
+ public DemuxByName(String[] args){
+ //Parses the command line, collects the affix names, then validates the input/output settings.
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+ //Static I/O settings applied before the arguments are parsed
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ //Each argument is split at '=' into a lowercased key (a) and an optional value (b)
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("names") || a.equals("name") || a.equals("affixes")){
+ if(b!=null){
+ String[] x=b.split(",");
+ for(String s : x){
+ names.add(s);
+ }
+ }
+ }else if(a.equalsIgnoreCase("length") || a.equalsIgnoreCase("len") || a.equalsIgnoreCase("affixlength") || a.equalsIgnoreCase("affixlen")){
+ fixedAffixLength=Integer.parseInt(b);
+ }else if(a.equals("prefixmode") || a.equals("prefix") || a.equals("pm")){
+ prefixMode=Tools.parseBoolean(b);
+ }else if(a.equals("suffixmode") || a.equals("suffix") || a.equals("sm")){
+ prefixMode=!Tools.parseBoolean(b);
+ }else if(a.equals("outu") || a.equals("outu1")){
+ outu1=b;
+ }else if(a.equals("outu2")){
+ outu2=b;
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ parser.in1=arg;
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+ //Expand the names: an entry that is an existing file is replaced by the lines of that file
+ {
+ String[] x=names.toArray(new String[names.size()]);
+ names.clear();
+ for(String s : x){
+ File f=new File(s);
+ if(f.exists() && f.isFile()){
+ TextFile tf=new TextFile(s);
+ String[] lines=tf.toStringLines();
+ for(String s2 : lines){
+ names.add(s2);
+ }
+ }else{
+ names.add(s);
+ }
+ }
+
+// for(String s : names){
+// assert(affixLen<0 || affixLen==s.length()) : "All names must have the same length.";
+// affixLen=s.length();
+// }
+// assert(affixLen>0) : "Must include at least one non-zero-length affix (name).";
+ //Collect the distinct name lengths (plus fixedAffixLength, if positive) in ascending order
+ {
+ BitSet bs=new BitSet();
+ if(fixedAffixLength>0){
+ bs.set(fixedAffixLength);
+ }
+ for(String s : names){
+ bs.set(s.length());
+ }
+ affixLengths=new int[bs.cardinality()];
+ for(int i=0, bit=-1; i<affixLengths.length; i++){
+ bit=bs.nextSetBit(bit+1);
+ affixLengths[i]=bit;
+ }
+ }
+
+ assert(affixLengths.length>0 && affixLengths[0]>0) : "Must include at least one non-zero-length affix (name).";
+ ReadWrite.MAX_ZIP_THREADS=Tools.max(1, Tools.min(ReadWrite.MAX_ZIP_THREADS, (Shared.threads()*2-1)/Tools.max(1, names.size()))); //Fewer zip threads as the number of names grows
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ assert(out1==null || out1.contains("%")) : "Output filename must contain '%' symbol, which will be replaced by affix.";
+ assert(out2==null || out2.contains("%")) : "Output filename must contain '%' symbol, which will be replaced by affix.";
+ assert(qfout1==null || qfout1.contains("%")) : "Output filename must contain '%' symbol, which will be replaced by affix.";
+ assert(qfout2==null || qfout2.contains("%")) : "Output filename must contain '%' symbol, which will be replaced by affix.";
+ //A '#' in a filename is expanded into a pair of names using 1 and 2
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ if(outu1!=null && outu2==null && outu1.indexOf('#')>-1){
+ outu2=outu1.replace("#", "2");
+ outu1=outu1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+// if(!parser.setOut){
+// out1="stdout";
+// }
+ }
+ //Unless interleaving was set explicitly, derive it from the number of input and output files
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outu1, outu2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2+", "+outu1+", "+outu2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+", "+outu1+", "+outu2+"\n");
+ }
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ void process(Timer t){
+ //Routes every read (or pair) to the output selected by its name affix, then reports throughput.
+
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ if(verbose){outstream.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+ if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ final MultiCros mcros;
+ if(out1!=null){
+ final int buff=4;
+
+ mcros=(fixedAffixLength>0 ? new MultiCros(out1, out2, false, overwrite, append, true, false, FileFormat.FASTQ, buff) : null);
+
+ if(paired && out2==null && (in1==null || !in1.contains(".sam"))){
+ outstream.println("Writing interleaved.");
+ }
+
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+ assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+ for(String s : names){
+
+ String qf1=null, qf2=null;
+ if(qfout1!=null){qf1=qfout1.replace("%", s);}
+ if(qfout2!=null){qf2=qfout2.replace("%", s);}
+
+ FileFormat ffout1=FileFormat.testOutput(out1.replace("%", s), FileFormat.FASTQ, extout, true, overwrite, append, false);
+ FileFormat ffout2=(out2==null ? null : FileFormat.testOutput(out2.replace("%", s), FileFormat.FASTQ, extout, true, overwrite, append, false));
+ ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qf1, qf2, buff, null, false);
+ ros.start();
+ nameToStream.put(s, ros);
+ }
+ }else{
+ mcros=null;
+ }
+
+ final ConcurrentReadOutputStream rosu;
+ if(outu1!=null){
+ final int buff=4;
+
+ FileFormat ffout1=FileFormat.testOutput(outu1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ FileFormat ffout2=(outu2==null ? null : FileFormat.testOutput(outu2, FileFormat.FASTQ, extout, true, overwrite, append, false));
+ rosu=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false);
+ rosu.start();
+ }else{
+ rosu=null;
+ }
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ long readsOut=0;
+ long basesOut=0;
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+// outstream.println("Fetched "+reads);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ for(String s : names){
+ nameToArray.put(s, new ArrayList<Read>());
+ }
+ final ArrayListSet als=(fixedAffixLength<1 ? null : new ArrayListSet(false));
+
+
+ while(reads!=null && reads.size()>0){
+
+ ArrayList<Read> unmatched=new ArrayList<Read>();
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ final Read r2=r1.mate;
+
+ final int initialLength1=r1.length();
+ final int initialLength2=(r1.mateLength());
+
+ final String id=r1.id;
+ final int idlen=id.length();
+
+ ArrayList<Read> al2=null;
+ //Try each distinct affix length until the affix matches one of the requested names
+ if(names.size()>0){
+ for(int affixLen : affixLengths){
+ final String sub=idlen>=affixLen ? prefixMode ? id.substring(0, affixLen) : id.substring(idlen-affixLen) : id;
+ al2=nameToArray.get(sub);
+ if(al2!=null){break;}
+ }
+ }
+
+ //Count each routed read exactly once, whichever kind of output receives it
+ if(al2!=null || als!=null){
+ readsOut++;
+ basesOut+=initialLength1;
+ if(r2!=null){
+ readsOut++;
+ basesOut+=initialLength2;
+ }
+
+ if(al2!=null){
+ al2.add(r1);
+ }else if(als!=null){
+ String sub=r1.id;
+ sub=(sub.length()<=fixedAffixLength ? sub : prefixMode ? id.substring(0, fixedAffixLength) : id.substring(idlen-fixedAffixLength));
+ als.add(r1, sub);
+ }
+
+ }else{
+ unmatched.add(r1);
+ }
+ {
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ }
+ if(r2!=null){
+ readsProcessed++;
+ basesProcessed+=initialLength2;
+ }
+ }
+
+ //Hand the per-name batches for this input list to their streams and start fresh lists
+ for(String s : names){
+ ArrayList<Read> listOut=nameToArray.put(s, new ArrayList<Read>());
+ ConcurrentReadOutputStream ros=nameToStream.get(s);
+ if(ros!=null){ros.add(listOut, ln.id);}
+ }
+ if(mcros!=null){mcros.add(als, ln.id);}
+ if(rosu!=null){rosu.add(unmatched, ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ errorState|=ReadStats.writeAll();
+
+ errorState|=ReadWrite.closeStream(cris);
+
+ for(String s : names){
+ ConcurrentReadOutputStream ros=nameToStream.get(s);
+ errorState|=ReadWrite.closeStream(ros);
+ }
+
+ if(mcros!=null){
+ errorState|=ReadWrite.closeStreams(mcros);
+ }
+
+ //Close the unmatched-read stream as well, so that its error state is included
+ if(rosu!=null){
+ errorState|=ReadWrite.closeStream(rosu);
+ }
+
+ t.stop();
+
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ outstream.println("Time: "+t);
+ outstream.println("Reads Processed: "+readsProcessed+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+basesProcessed+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ outstream.println("Reads Out: "+readsOut);
+ outstream.println("Bases Out: "+basesOut);
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){ //Placeholder: no usage text is printed
+ assert(false) : "printOptions: TODO"; //Fails when assertions are enabled; otherwise the method does nothing
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ private String out1=null;
+ private String out2=null;
+
+ private String outu1=null;
+ private String outu2=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+// private boolean exclude=true;
+
+ private boolean prefixMode=true;
+// private int affixLen=-1;
+
+ private int fixedAffixLength=-1;
+
+ private int[] affixLengths;
+
+ private HashSet<String> names=new HashSet<String>();
+ private HashMap<String, ArrayList<Read>> nameToArray=new HashMap<String, ArrayList<Read>>();
+ private HashMap<String, ConcurrentReadOutputStream> nameToStream=new HashMap<String, ConcurrentReadOutputStream>();
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=true;
+ private boolean append=false;
+
+}
diff --git a/current/jgi/Difference.java b/current/jgi/Difference.java
new file mode 100755
index 0000000..81941da
--- /dev/null
+++ b/current/jgi/Difference.java
@@ -0,0 +1,39 @@
+package jgi;
+
+import fileIO.TextFile;
+
+/**
+ * Command-line utility that compares two text files line by line.
+ * Every pair of differing lines is printed to stderr with its 1-based line number.
+ * Usage: Difference &lt;file1&gt; &lt;file2&gt;
+ *
+ * @author Brian Bushnell
+ * @date Oct 9, 2013
+ *
+ */
+public class Difference {
+
+ /**
+  * Compares args[0] against args[1].
+  * When assertions are enabled, the run aborts at the fifth differing line and
+  * also when one file has more lines than the other.
+  * @param args Paths of the two files to compare
+  * @throws RuntimeException if fewer than two paths are supplied
+  */
+ public static void main(String[] args){
+
+     if(args==null || args.length<2){
+         throw new RuntimeException("Usage: Difference <file1> <file2>");
+     }
+
+     TextFile tf1=new TextFile(args[0], false, false);
+     TextFile tf2=new TextFile(args[1], false, false);
+
+     try{
+         String s1=tf1.readLine(false);
+         String s2=tf2.readLine(false);
+
+         int difs=0;
+         int i=1;
+         while(s1!=null && s2!=null){
+             if(!s1.equals(s2)){
+                 difs++;
+                 System.err.println("Line "+i+":\n"+s1+"\n"+s2+"\n");
+                 assert(difs<5);
+             }
+             i++;
+             s1=tf1.readLine(false);
+             s2=tf2.readLine(false);
+         }
+
+         if(s1!=null || s2!=null){
+             //One file is longer than the other. Report it explicitly so the
+             //mismatch is visible even when assertions are disabled.
+             System.err.println("Line "+i+":\n"+s1+"\n"+s2+"\n");
+         }
+         assert(s1==null && s2==null) : "Line "+i+":\n"+s1+"\n"+s2+"\n";
+     }finally{
+         //Release both files even if an assertion fired above.
+         tf1.close();
+         tf2.close();
+     }
+ }
+
+}
diff --git a/current/jgi/ErrorCorrect.java b/current/jgi/ErrorCorrect.java
new file mode 100755
index 0000000..d8a8050
--- /dev/null
+++ b/current/jgi/ErrorCorrect.java
@@ -0,0 +1,849 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import bloom.KCountArray;
+import bloom.KmerCount6;
+
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 20, 2012
+ *
+ */
+public class ErrorCorrect extends Thread{
+
+ /**
+  * Program entry point: builds a kmer count table from the reads, then detects and corrects errors.
+  * args[0] is the first read file, args[1] (if present) is the second read file,
+  * and key=value parameters are parsed starting at args[2].
+  * @param args Command-line arguments
+  * @throws RuntimeException if no read file is given or a parameter is not recognized
+  */
+ public static void main(String[] args){
+
+     if(args==null || args.length<1){
+         //Fail with a readable message instead of an ArrayIndexOutOfBoundsException on args[0].
+         throw new RuntimeException("Usage: ErrorCorrect <reads1> [<reads2> [key=value ...]]");
+     }
+
+     String reads1=args[0];
+     String reads2=(args.length>1 ? args[1] : null);
+
+     int k=23;
+     int cbits=4;
+     int gap=0;
+     int hashes=1;
+     int thresh1=1;
+     int thresh2=2;
+     int matrixbits=34;
+     long maxReads=-1;
+     int buildpasses=1;
+     long tablereads=-1; //How many reads to process when building the hashtable
+     int buildStepsize=4;
+     String output=null;
+     boolean ordered=true;
+     boolean overwrite=false;
+     boolean append=false;
+
+     //Note: many keys are matched with startsWith, so the order of the branches below matters.
+     for(int i=2; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=(split.length>1 ? split[1] : "true");
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(a.equals("k") || a.equals("kmer")){
+             k=Integer.parseInt(b);
+         }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
+             cbits=Integer.parseInt(b);
+         }else if(a.equals("initialthresh") || a.equals("thresh1")){
+             thresh1=Integer.parseInt(b);
+         }else if(a.equals("thresh") || a.equals("thresh2")){
+             thresh2=Integer.parseInt(b);
+         }else if(a.startsWith("gap")){
+             gap=Integer.parseInt(b);
+         }else if(a.startsWith("matrixbits")){
+             matrixbits=Integer.parseInt(b);
+         }else if(a.startsWith("hashes") || a.startsWith("multihash")){
+             hashes=Integer.parseInt(b);
+         }else if(a.startsWith("maxerrors")){
+             ERROR_CORRECTION_LIMIT=Integer.parseInt(b);
+         }else if(a.startsWith("passes")){
+             buildpasses=Integer.parseInt(b);
+         }else if(a.startsWith("stepsize") || a.startsWith("buildstepsize")){
+             buildStepsize=Integer.parseInt(b);
+         }else if(a.equals("threads") || a.equals("t")){
+             System.err.println("Can't change threadcount for this class."); //THREADS=Integer.parseInt(b);
+         }else if(a.startsWith("reads") || a.startsWith("maxreads")){
+             maxReads=Tools.parseKMG(b);
+         }else if(a.startsWith("tablereads")){
+             tablereads=Tools.parseKMG(b);
+         }else if(a.startsWith("build") || a.startsWith("genome")){
+             Data.setGenome(Integer.parseInt(b));
+             Data.sysout.println("Set genome to "+Data.GENOME_BUILD);
+         }else if(a.equals("outputinfo") || a.startsWith("info")){
+             OUTPUT_INFO=Tools.parseBoolean(b);
+         }else if(a.startsWith("out")){
+             output=b;
+         }else if(a.startsWith("verbose")){
+             KCountArray.verbose=Tools.parseBoolean(b);
+//             verbose=KCountArray.verbose=Tools.parseBoolean(b);
+         }else if(a.equals("ordered") || a.equals("ord")){
+             ordered=Tools.parseBoolean(b);
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+     }
+
+     KCountArray kca=makeTable(reads1, reads2, k, cbits, gap, hashes, buildpasses, matrixbits, tablereads, buildStepsize, thresh1, thresh2);
+
+     detect(reads1, reads2, kca, k, thresh2, maxReads, output, ordered, append, overwrite);
+
+ }
+
+ /**
+  * Builds the kmer count table, timing the work and printing a summary to Data.sysout.
+  * With buildpasses==1 the reads are counted once. With more passes, each earlier pass
+  * produces a "trusted" table that is handed to the following pass, and a fresh table is
+  * allocated for the next round; the last pass uses thresh2 and the unmodified stepsize.
+  * @return The table filled by the final counting pass, after shutdown() has been called on it
+  */
+ public static KCountArray makeTable(String reads1, String reads2, int k, int cbits, int gap, int hashes, int buildpasses, int matrixbits,
+         long maxreads, int stepsize, int thresh1, int thresh2){
+
+     final Timer hashTimer=new Timer();
+
+     KmerCount6.maxReads=maxreads;
+     final int kbits=2*k;
+     final int tableBits=Tools.min(kbits, matrixbits); //The table never needs more bits than the kmer space
+     final long keySpace=1L<<kbits;
+     final long cells=1L<<tableBits;
+
+     hashTimer.start();
+     KCountArray table=KCountArray.makeNew(keySpace, cells, cbits, gap, hashes);
+
+     assert(gap==0) : "TODO";
+     if(buildpasses!=1){
+         assert(buildpasses>1);
+         KCountArray trusted=null;
+         for(int pass=1; pass<buildpasses; pass++){
+             final boolean conservative=(pass>2);
+             int step;
+             if(stepsize==1){
+                 step=1;
+             }else{
+                 step=stepsize+pass%2;
+             }
+             if(!conservative){
+                 step=Tools.min(3, (step+3)/4);
+             }
+
+             KmerCount6.count(reads1, reads2, k, cbits, true, table, trusted, maxreads, thresh1, step, conservative);
+
+             table.shutdown();
+             Data.sysout.println("Trusted: \t"+table.toShortString());
+             //This pass's table becomes the trusted filter for the next one
+             trusted=table;
+             table=KCountArray.makeNew(keySpace, cells, cbits, gap, hashes);
+         }
+
+         KmerCount6.count(reads1, reads2, k, cbits, true, table, trusted, maxreads, thresh2, stepsize, true);
+     }else{
+         KmerCount6.count(reads1, reads2, k, cbits, gap, true, table);
+     }
+     table.shutdown();
+
+     hashTimer.stop();
+     Data.sysout.println("Table : \t"+table.toShortString());
+     Data.sysout.println("Hash time: \t"+hashTimer);
+     return table;
+ }
+
+ /**
+  * Opens the read input stream (and, if output is non-null, an output stream),
+  * runs detection/correction over the reads, then closes both streams.
+  * A '#' in output is replaced by "1" for the first file and, for paired input, by "2" for the second.
+  * Paired input without a '#' in output prints "Writing interleaved." and uses a single file.
+  */
+ public static void detect(String reads1, String reads2, KCountArray kca, int k, int thresh, long maxReads, String output, boolean ordered, boolean append, boolean overwrite) {
+
+     final FileFormat inFormat1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+     final FileFormat inFormat2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, inFormat1, inFormat2);
+     assert(cris!=null) : reads1;
+
+     cris.start();
+     if(verbose){System.err.println("Started cris");}
+     final boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     ConcurrentReadOutputStream ros=null;
+     if(output!=null){
+         final String out1=output.replaceFirst("#", "1");
+         String out2=null;
+
+         if(cris.paired()){
+             if(!output.contains("#")){
+                 System.err.println("Writing interleaved.");
+             }else{
+                 out2=output.replaceFirst("#", "2");
+             }
+         }
+
+         final int buff;
+         if(ordered){
+             buff=Tools.max(16, 2*THREADS);
+         }else{
+             buff=8;
+         }
+
+         final String extension=(OUTPUT_INFO ? ".info" : null);
+         final FileFormat outFormat1=FileFormat.testOutput(out1, FileFormat.FASTQ, extension, true, overwrite, append, ordered);
+         final FileFormat outFormat2=FileFormat.testOutput(out2, FileFormat.FASTQ, extension, true, overwrite, append, ordered);
+         ros=ConcurrentReadOutputStream.getStream(outFormat1, outFormat2, buff, null, true);
+
+         assert(!outFormat1.sam()) : "Sam files need reference info for the header.";
+     }
+
+     if(ros!=null){
+         ros.start();
+         Data.sysout.println("Started output threads.");
+     }
+
+     detect(cris, kca, k, thresh, maxReads, ros);
+
+     ReadWrite.closeStreams(cris, ros);
+     if(verbose){System.err.println("Closed stream");}
+ }
+
+ public static void detect(ConcurrentReadInputStream cris, KCountArray kca, int k, int thresh, long maxReads, ConcurrentReadOutputStream ros) {
+ Timer tdetect=new Timer();
+ tdetect.start();
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ long covered=0;
+ long uncovered=0;
+
+ long coveredFinal=0;
+ long uncoveredFinal=0;
+
+ long fullyCorrected=0;
+ long failed=0;
+
+ long totalBases=0;
+ long totalReads=0;
+
+
+ while(reads!=null && reads.size()>0){
+ for(Read r : reads){
+ Read r2=r.mate;
+ {
+
+// if(r.numericID==23){verbose=true;}
+
+ totalReads++;
+ if(verbose){System.err.println();}
+ totalBases+=r.length();
+// BitSet bs=detectErrors(r, kca, k, thresh);
+ BitSet bs=detectErrorsBulk(r, kca, k, thresh, 1);
+ if(verbose){System.err.println(toString(bs, r.length()));}
+// Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.length()));
+ if(verbose){System.err.println(toString(detectErrors(r, kca, k, thresh), r.length()-k+1));}
+ if(bs==null){//No errors, or can't detect errors
+ assert(false);
+ }else{
+ int x=bs.cardinality();
+ covered+=x;
+ uncovered+=(r.length()-x);
+ if(x<r.length()){
+ bs=correctErrors(r, kca, k, thresh, bs, ERROR_CORRECTION_LIMIT, MAX_ERROR_BURST);
+ }
+ int y=bs.cardinality();
+ coveredFinal+=y;
+ uncoveredFinal+=(r.length()-y);
+ if(x<r.length()){
+ if(y==r.length()){
+ fullyCorrected++;
+ }else{
+ failed++;
+ }
+ }
+ }
+ }
+ if(r2!=null){
+ totalReads++;
+ totalBases+=r2.length();
+// BitSet bs=detectErrors(r2, kca, k, thresh);
+ BitSet bs=detectErrorsBulk(r2, kca, k, thresh, 1);
+ if(verbose){System.err.println(toString(bs, r2.length()));}
+// Data.sysout.println(toString(detectErrorsTips(r2, kca, k, thresh), r2.length()));
+ if(verbose){System.err.println(toString(detectErrors(r2, kca, k, thresh), r2.length()-k+1));}
+ if(bs==null){//No errors, or can't detect errors
+ }else{
+ int x=bs.cardinality();
+ covered+=x;
+ uncovered+=(r2.length()-x);
+ if(x<r2.length()){
+ bs=correctErrors(r2, kca, k, thresh, bs, ERROR_CORRECTION_LIMIT, MAX_ERROR_BURST);
+ }
+ int y=bs.cardinality();
+ coveredFinal+=y;
+ uncoveredFinal+=(r2.length()-y);
+ if(x<r2.length()){
+ if(y==r2.length()){
+ fullyCorrected++;
+ }else{
+ failed++;
+ }
+ }
+ }
+ }
+ }
+
+ if(ros!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight.
+ if(DONT_OUTPUT_BAD_READS){removeBad(reads);}
+ for(Read r : reads){
+ if(r!=null){
+ r.obj=null;
+ assert(r.bases!=null);
+ if(r.sites!=null && r.sites.isEmpty()){r.sites=null;}
+ }
+ }
+// System.err.println("Adding list of length "+readlist.size());
+ ros.add(reads, ln.id);
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(verbose){System.err.println("Finished reading");}
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){System.err.println("Returned list");}
+
+ tdetect.stop();
+ Data.sysout.println("Detect time: \t"+tdetect);
+ Data.sysout.println("Total reads: \t"+totalReads);
+ Data.sysout.println("Total bases: \t"+totalBases);
+ Data.sysout.println("Reads Corrected:\t"+fullyCorrected);
+ Data.sysout.println("Reads Failed: \t"+failed);
+
+ Data.sysout.println("\n - before correction - ");
+ Data.sysout.println("Covered: \t"+covered);
+ Data.sysout.println("Uncovered: \t"+uncovered);
+
+ Data.sysout.println("\n - after correction - ");
+ Data.sysout.println("Covered: \t"+coveredFinal);
+ Data.sysout.println("Uncovered: \t"+uncoveredFinal);
+ }
+
+ /**
+  * Marks the start position of every kmer whose count is at least thresh.
+  * Any base that maps to a negative number restarts the kmer.
+  * @return Bitset indexed by kmer start position, or null if the read has no room for a single kmer
+  */
+ public static BitSet detectErrors(final Read r, final KCountArray kca, final int k, final int thresh){
+
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+     final int gap=kca.gap;
+     final byte[] bases=r.bases;
+     assert(kca.gap==0);
+
+     final int numStarts=r.length()-k-gap+1;
+     if(numStarts<1){
+         return null; //Read is too short to detect errors
+     }
+     final BitSet starts=new BitSet(numStarts);
+
+     int run=0; //Number of consecutive valid bases ending at the current position
+     long kmer=0;
+     for(int pos=0; pos<bases.length; pos++){
+         final int code=AminoAcid.baseToNumber[bases[pos]];
+         if(code<0){
+             run=0;
+             kmer=0;
+             continue;
+         }
+         kmer=((kmer<<2)|code)&mask;
+         run++;
+
+         if(run>=k && kca.read(kmer)>=thresh){
+             starts.set(pos+1-k);
+         }
+     }
+
+     return starts;
+ }
+
+ /**
+  * Sets a 1 bit for every base covered by a kmer with count at least thresh,
+  * and stores the number of uncovered bases in r.errors.
+  * @param r Read to examine
+  * @param kca Kmer count table
+  * @param k Kmer length
+  * @param thresh Minimum count for a kmer to be considered correct
+  * @param stepsize A kmer is looked up every stepsize positions within a run of valid bases;
+  * a full-length kmer ending on the last base is always looked up
+  * @return Coverage bitset of length r.length(), or null if the read has no bases or is shorter than a kmer
+  */
+ public static BitSet detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize){
+
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+     final int gap=kca.gap;
+     final byte[] bases=r.bases;
+     assert(gap==0);
+
+     if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors
+     BitSet bs=new BitSet(r.length());
+     final int setlen=k+gap;
+
+     int len=0;
+     long kmer=0;
+     for(int i=0; i<bases.length; i++){
+         byte b=bases[i];
+         int x=AminoAcid.baseToNumber[b];
+         if(x<0){
+             //Undefined base: restart the kmer
+             len=0;
+             kmer=0;
+         }else{
+             kmer=((kmer<<2)|x)&mask;
+             len++;
+
+             if(len>=k && ((len-k)%stepsize==0 || i==bases.length-1)){
+                 int count=kca.read(kmer);
+                 if(count>=thresh){
+                     bs.set(i+1-setlen, i+1);
+                 }
+             }
+         }
+     }
+     //Uncovered bases, using the same sign convention as correctErrors (length minus covered).
+     //The original expression was reversed and produced a non-positive value.
+     r.errors=r.length()-bs.cardinality();
+
+     return bs;
+ }
+
+ /**
+  * Starts with every base marked, then clears all bases covered by a sampled kmer
+  * whose count is below thresh. Kmers are sampled where the end position is a multiple
+  * of detectStepsize, and at the last base of the read.
+  * @return Bitset of trusted bases, or null if the read has no bases or is shorter than a kmer
+  */
+ public static BitSet detectTrusted(final Read r, final KCountArray kca, final int k, final int thresh, final int detectStepsize){
+
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+     final int gap=kca.gap;
+     final byte[] bases=r.bases;
+     assert(gap==0);
+
+     if(r.bases==null || r.length()<k+gap){
+         return null; //Read is too short to detect errors
+     }
+     final BitSet trusted=new BitSet(r.length());
+     trusted.set(0, r.length());
+     final int span=k+gap;
+     final int lastPos=bases.length-1;
+
+     int run=0;
+     long kmer=0;
+     for(int pos=0; pos<bases.length; pos++){
+         final int code=AminoAcid.baseToNumber[bases[pos]];
+         if(code<0){
+             run=0;
+             kmer=0;
+             continue;
+         }
+         kmer=((kmer<<2)|code)&mask;
+         run++;
+
+         if(run<k){continue;} //No complete kmer yet
+         if(pos%detectStepsize!=0 && pos!=lastPos){continue;} //Not a sampled position
+
+         if(kca.read(kmer)<thresh){
+             trusted.clear(pos+1-span, pos+1);
+         }
+     }
+     return trusted;
+ }
+
+ /**
+  * Marks the first and last base of every kmer whose count is at least thresh.
+  * @return Bitset of length r.length(), or null if the read has no bases or is shorter than a kmer
+  */
+ public static BitSet detectErrorsTips(final Read r, final KCountArray kca, final int k, final int thresh){
+
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+     final int gap=kca.gap;
+     final byte[] bases=r.bases;
+     assert(gap==0);
+
+     if(r.bases==null || r.length()<k+gap){
+         return null; //Read is too short to detect errors
+     }
+     final BitSet tips=new BitSet(r.length());
+     final int span=k+gap;
+
+     int run=0;
+     long kmer=0;
+     for(int pos=0; pos<bases.length; pos++){
+         final int code=AminoAcid.baseToNumber[bases[pos]];
+         if(code<0){
+             run=0;
+             kmer=0;
+             continue;
+         }
+         kmer=((kmer<<2)|code)&mask;
+         run++;
+
+         if(run>=k && kca.read(kmer)>=thresh){
+             tips.set(pos+1-span); //Left tip
+             tips.set(pos); //Right tip
+         }
+     }
+     return tips;
+ }
+
+
+ /**
+  * Attempts to correct the uncovered bases of a read, one base at a time.
+  * Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers.
+  * While the read starts with a '0', the base just left of the first '1' is corrected from the right;
+  * afterwards the first '0' is corrected from the left. After every successful correction
+  * coverage is recomputed with detectErrorsBulk. If a correction fails, the offending base is
+  * replaced by 'N' (quality 0 when qualities are present) and the method returns.
+  * On return r.errors holds the number of uncovered bases.
+  * @param bs Coverage bitset from detectErrorsBulk
+  * @param maxCorrections Stop after this many successful corrections
+  * @param maxBurst Give up when more than this many consecutive corrections are within BURST_THRESH of each other
+  * @return The coverage bitset reflecting the last recomputation
+  */
+ public static BitSet correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst){
+     if(kca.gap>0){assert(false) : "TODO";}
+
+     assert(!OUTPUT_INFO) : "TODO: Outputting correction data is not yet supported.";
+
+     int corrections=0; //Alternately, corrections=r.errorsCorrected
+     r.errors=0;
+
+     if(bs.cardinality()==0){//Cannot be corrected
+         r.errors=r.length();
+         return bs;
+     }
+
+     if(verbose){
+         Data.sysout.println();
+         Data.sysout.println(toString(bs, r.length()));
+         Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.length()));
+         Data.sysout.println(toString(detectErrors(r, kca, k, thresh), r.length()-k+1));
+     }
+
+
+     int lastloc=-99;
+     int burst=1;
+     while(!bs.get(0) && corrections<maxCorrections){//While the read starts with a '0', correct from the right.
+         int errorLoc=bs.nextSetBit(0)-1;//Location to left of first '1'
+         if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;}
+         else{burst=1;}
+         lastloc=errorLoc;
+         //Honor the maxBurst argument rather than the static default
+         boolean success=(burst<=maxBurst) && correctFromRight(r, kca, k, thresh, bs, errorLoc);
+         if(success){
+             corrections++;
+             bs=detectErrorsBulk(r, kca, k, thresh, 1);
+             if(verbose){System.err.println(">\n"+toString(bs, r.length()));}
+         }else{
+             r.errors=r.length()-bs.cardinality();
+             if(verbose){System.err.println("Could not correct.");}
+             r.bases[errorLoc]='N';
+             if(r.quality!=null){r.quality[errorLoc]=0;} //Reads may have no quality array
+             return bs;
+         }
+     }
+
+     burst=1;
+     while(bs.cardinality()<r.length() && corrections<maxCorrections){
+         if(bs.get(0)){//First bit is a "1", can correct from the left
+             int errorLoc=bs.nextClearBit(0);//Location of first '0'
+             if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;}
+             else{burst=1;}
+             lastloc=errorLoc;
+             boolean success=(burst<=maxBurst) && correctFromLeft(r, kca, k, thresh, bs, errorLoc);
+             if(success){
+                 corrections++;
+                 bs=detectErrorsBulk(r, kca, k, thresh, 1);
+                 if(verbose){System.err.println(">\n"+toString(bs, r.length()));}
+             }else{
+                 r.errors=r.length()-bs.cardinality();
+                 r.bases[errorLoc]='N';
+                 if(r.quality!=null){r.quality[errorLoc]=0;} //Reads may have no quality array
+                 if(verbose){System.err.println("Could not correct.");}
+                 return bs;
+             }
+         }else{
+             //Nothing in this loop can make progress without a leading '1'; avoid spinning forever.
+             break;
+         }
+     }
+
+     r.errors=r.length()-bs.cardinality();
+     assert(corrections<=maxCorrections);
+     return bs;
+ }
+
+
+ /**
+ * @param r
+ * @param kca
+ * @param k
+ * @param thresh
+ * @param bs
+ * @param errorLoc
+ * @return
+ */
+ private static boolean correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) {
+ final int kbits=2*k;
+ final long mask=~((-1L)<<(kbits));
+ final int gap=kca.gap;
+ final int setlen=k+gap;
+ final int startLoc=error-(setlen)+1;
+ final byte oldBase=r.bases[error];
+ final byte[] bases=r.bases;
+
+ final int minAdvance=Tools.min(MIN_ADVANCE, bases.length-error);
+
+ long kmer=0;
+ int len=0;
+ for(int i=startLoc; i<error; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){
+ len=0;
+ kmer=0;
+ throw new RuntimeException("Can't correct from left!\nerror="+error+"\n"+toString(bs, bases.length)+"\n"+new String(bases)+"\nreadID: "+r.numericID);
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+ }
+ }
+ assert(len==setlen-1) : setlen+", "+len+", "+error+", "+startLoc;
+
+ int[] counts=new int[4];
+ int[] dists=new int[4];
+ int maxLoc=Tools.min(bases.length-1, error+setlen-1);
+ if(!bs.get(error+1)){maxLoc=Tools.min(maxLoc, error+9);}
+ else{
+ for(int i=error+2; i<=maxLoc; i++){
+ if(!bs.get(i)){
+ maxLoc=i-1;
+ break;
+ }
+ }
+ }
+
+ if(verbose){System.err.println("correctFromLeft. Error = "+error+", maxloc="+maxLoc);}
+ for(int bnum=0; bnum<4; bnum++){
+ byte c=AminoAcid.numberToBase[bnum];
+ bases[error]=c;
+ if(verbose){System.err.println("Considering "+(char)c);}
+ long key=kmer;
+ for(int loc=error; loc<=maxLoc; loc++){
+ c=bases[loc];
+ int x=AminoAcid.baseToNumber[c];
+ if(x<0){
+ if(verbose){System.err.println("break: N");}
+ break;
+ }
+ key=((key<<2)|x)&mask;
+ int count=kca.read(key);
+ if(count<thresh){
+ if(verbose){System.err.println("break: count="+count);}
+ break;
+ }
+ dists[bnum]++;
+ counts[bnum]+=count;
+ }
+ }
+ bases[error]=oldBase;
+
+ //Note: I could require both to be the same, to decrease false-positives
+
+ final int muid=maxUniqueIndex(dists);
+ Arrays.sort(dists);
+ final int advance=dists[3];
+ final int delta=dists[3]-dists[2];
+// if(advance<minAdvance){return false;}
+ if(delta<minAdvance){return false;}
+
+ int best=(muid<0 ? maxUniqueIndex(counts) : muid);
+
+ if(verbose){System.err.println("Best="+best+": "+Arrays.toString(dists)+" \t"+Arrays.toString(counts));}
+ if(best<0){return false;}
+ byte bestC=AminoAcid.numberToBase[best];
+ if(bestC==oldBase){return false;}
+ bases[error]=bestC;
+
+ r.quality[error]=(byte)Tools.min(10, 3+delta);
+
+ return true;
+ }
+
+
+
+ /**
+  * Tries to replace the base at position error, using the trusted bases to its right.
+  * Each of the four candidate bases is substituted in turn and kmers are extended leftward
+  * while their count stays at or above thresh. The candidate that extends furthest wins,
+  * provided it beats the runner-up by at least minAdvance; ties on distance fall back to
+  * the summed counts.
+  * @param r Read to modify in place
+  * @param kca Kmer count table
+  * @param k Kmer length
+  * @param thresh Minimum count for a kmer to be considered correct
+  * @param bs Coverage bitset from detectErrorsBulk
+  * @param error Position of the base to correct
+  * @return true if the base was replaced with a different one
+  * @throws RuntimeException if an undefined base lies in the window right of error
+  */
+ private static boolean correctFromRight(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) {
+     final int kbits=2*k;
+     final int shift=kbits-2;
+     final long mask=~((-1L)<<(kbits));
+     final int gap=kca.gap;
+     final int setlen=k+gap;
+     final int stopLoc=error+(setlen)-1;
+     final byte oldBase=r.bases[error];
+     final byte[] bases=r.bases;
+
+     final int minAdvance=Tools.min(MIN_ADVANCE, error+1);
+
+     //Build the (k-1)-base suffix that starts just after the error
+     long kmer=0;
+     int len=0;
+
+     for(int i=error+1; i<=stopLoc; i++){
+         byte b=bases[i];
+         int x=AminoAcid.baseToNumber[b];
+         if(x<0){
+             throw new RuntimeException("Can't correct from right!\nerror="+error+"\n"+toString(bs, bases.length)+"\n"+new String(bases));
+         }else{
+             kmer=((kmer<<2)|x)&mask;
+             len++;
+         }
+     }
+     //Pre-shift so the first rightward shift in the loop below restores the alignment
+     kmer<<=2;
+
+     if(verbose){
+         Data.sysout.println();
+         String s=Long.toBinaryString(kmer);
+         while(s.length()<kbits){s="0"+s;}
+         Data.sysout.println("kmer = \t"+s);
+     }
+     assert(len==setlen-1) : setlen+", "+len+", "+error+", "+stopLoc;
+
+     int[] counts=new int[4];
+     int[] dists=new int[4];
+     //Limit how far left to extend: shorter if the previous base is also uncovered,
+     //otherwise back to the start of the covered run preceding the error
+     int minLoc=Tools.max(0, error-setlen+1);
+     if(error==0 || !bs.get(error-1)){minLoc=Tools.max(minLoc, error-9);}
+     else{
+         for(int i=error-2; i>=minLoc; i--){
+             if(!bs.get(i)){
+                 minLoc=i+1;
+                 break;
+             }
+         }
+     }
+
+     if(verbose){
+         Data.sysout.println("correctFromRight. Error = "+error+", minloc="+minLoc);
+         Data.sysout.println(new String(r.bases));
+     }
+     for(int bnum=0; bnum<4; bnum++){
+         byte c=AminoAcid.numberToBase[bnum];
+         bases[error]=c;
+         if(verbose){System.err.println("Considering "+(char)c);}
+         long key=kmer;
+         for(int loc=error; loc>=minLoc; loc--){
+             c=bases[loc];
+             int x=AminoAcid.baseToNumber[c];
+             if(x<0){
+                 if(verbose){System.err.println("break: N");}
+                 break;
+             }
+             //Drop the rightmost base and insert the new one at the left end
+             key=((key>>2)|(((long)x)<<shift))&mask;
+             int count=kca.read(key);
+             if(count<thresh){
+                 if(verbose){System.err.println("break: count="+count);}
+                 break;
+             }
+             dists[bnum]++;
+             counts[bnum]+=count;
+         }
+     }
+     bases[error]=oldBase;
+
+     //Note: I could require both to be the same, to decrease false-positives
+
+     final int muid=maxUniqueIndex(dists); //Must be computed before dists is sorted
+     Arrays.sort(dists);
+     final int delta=dists[3]-dists[2];
+     if(delta<minAdvance){return false;}
+
+     int best=(muid<0 ? maxUniqueIndex(counts) : muid);
+
+     if(verbose){System.err.println("Best="+best+": "+Arrays.toString(dists)+" \t"+Arrays.toString(counts));}
+     if(best<0){return false;}
+     byte bestC=AminoAcid.numberToBase[best];
+     if(bestC==oldBase){return false;}
+     bases[error]=bestC;
+
+     //Reads may have no quality array
+     if(r.quality!=null){r.quality[error]=(byte)Tools.min(10, 3+delta);}
+
+     return true;
+ }
+
+
+ /** returns index of highest value, if unique; else a negative number */
+ private static int maxUniqueIndex(int[] array){
+ int max=array[0];
+ int maxIndex=0;
+ for(int i=1; i<array.length; i++){
+ if(array[i]>max){
+ max=array[i];
+ maxIndex=i;
+ }else if(max==array[i]){
+ maxIndex=-1;
+ }
+ }
+ return maxIndex;
+ }
+
+ public static final String toString(BitSet bs, int len){
+// assert(verbose);
+ StringBuilder sb=new StringBuilder(len);
+ for(int i=0; i<len; i++){sb.append(bs.get(i) ? '1' : '0');}
+ return sb.toString();
+ }
+
+ private static void removeBad(ArrayList<Read> list){
+
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r.errors>0){
+ if(r.mate==null || r.mate.errors>0){
+ list.set(i, null);
+ }
+ }
+ }
+
+ }
+
+ /** When true, the detection and correction methods print diagnostic output. */
+ public static boolean verbose=false;
+ /** Bails out if a read still has errors after correcting this many. */
+ public static int ERROR_CORRECTION_LIMIT=6;
+ /** Max allowed number of nearby corrections.
+ * A long error burst indicates the read simply has low coverage, and is not being corrected correctly. */
+ public static int MAX_ERROR_BURST=3;
+ /** Bursts have at most this distance between errors. E.G. '1' means errors are adjacent. */
+ public static int BURST_THRESH=2;
+ /** Withhold uncorrectable reads from output. */
+ public static boolean DONT_OUTPUT_BAD_READS=false;
+ /** Do not correct an error if it is at most this far from the next error. Instead, bail out. */
+ public static int MIN_ADVANCE=1;
+
+ /** Number of threads used for error correction. Does not control number of threads for creating the hash table.
+ * Additionally, up to 2 threads are used for reading and up to 2 for writing. For this (singlethreaded) class, the number does nothing. */
+ public static final int THREADS=1;
+
+ /** Output correction data instead of the corrected read */
+ public static boolean OUTPUT_INFO=false;
+
+
+}
diff --git a/current/jgi/FakeReads.java b/current/jgi/FakeReads.java
new file mode 100755
index 0000000..6028b34
--- /dev/null
+++ b/current/jgi/FakeReads.java
@@ -0,0 +1,367 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+/**
+ * @author Brian Bushnell
+ * @date Sep 11, 2012
+ *
+ */
+public class FakeReads {
+
+ /** Program entry point: creates a timer, parses the arguments, and runs the conversion. */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final FakeReads instance=new FakeReads(args);
+     instance.process(timer);
+ }
+
+ /**
+  * Parses command-line arguments, applies shared I/O settings, and validates the input and output files.
+  * Prints the options and exits if no arguments are given.
+  * @param args Command-line arguments, mostly in key=value form; the first argument may be a bare input path
+  * @throws RuntimeException if no input is specified, out2 is given without out1,
+  * or the output files cannot be written
+  */
+ public FakeReads(String[] args){
+     if(args==null || args.length==0){
+         printOptions();
+         System.exit(0);
+     }
+
+     for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+     outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+     FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+
+     Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+     Shared.capBuffers(4);
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+     ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+     ReadWrite.ZIP_THREAD_DIVISOR=1;
+
+     for(int i=0; i<args.length; i++){
+         String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseFasta(arg, a, b)){
+             //do nothing
+         }else if(a.equals("passes")){
+             assert(false) : "'passes' is disabled.";
+//             passes=Integer.parseInt(b);
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+             ByteFile1.verbose=verbose;
+             ByteFile2.verbose=verbose;
+             stream.FastaReadInputStream.verbose=verbose;
+             ConcurrentGenericReadInputStream.verbose=verbose;
+//             align2.FastaReadInputStream2.verbose=verbose;
+             stream.FastqReadInputStream.verbose=verbose;
+             ReadWrite.verbose=verbose;
+         }else if(a.equals("addspacer") || a.equals("addspace") || a.equals("usespacer")){
+             addSpacer=Tools.parseBoolean(b);
+         }else if(a.equals("reads") || a.equals("maxreads")){
+             maxReads=Tools.parseKMG(b);
+         }else if(a.equals("t") || a.equals("threads")){
+             Shared.setThreads(b);
+         }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+             in1=b;
+         }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+             out1=b;
+         }else if(a.equals("out2") || a.equals("output2")){
+             out2=b;
+         }else if(a.equals("identifier") || a.equals("id")){
+             identifier=b;
+         }else if(a.equals("qfin") || a.equals("qfin1")){
+             qfin1=b;
+         }else if(a.equals("qfout") || a.equals("qfout1")){
+             qfout1=b;
+         }else if(a.equals("qfout2")){
+             qfout2=b;
+         }else if(a.equals("extin")){
+             extin=b;
+         }else if(a.equals("extout")){
+             extout=b;
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+         }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){
+             stream.FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
+         }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){
+             minReadLength=Integer.parseInt(b);
+         }else if(a.equals("length") || a.equals("maxlen")){
+             desiredLength=Integer.parseInt(b);
+         }else if(a.equals("split")){
+             SPLITMODE=Tools.parseBoolean(b);
+         }else if(a.equals("overlap")){
+             SPLITMODE=true;
+             overlap=Integer.parseInt(b);
+         }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+             //Bare first argument naming the input
+             in1=arg;
+             if(arg.indexOf('#')>-1 && !new File(arg).exists()){
+                 //b is always null on this branch (arg has no '='), so substitute in arg itself
+                 in1=arg.replace("#", "1");
+             }
+         }else{
+             System.err.println("Unknown parameter "+args[i]);
+             assert(false) : "Unknown parameter "+args[i];
+             // throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+     }
+
+     //A non-null identifier becomes a prefix joined with an underscore
+     if(identifier==null){identifier="";}
+     else{identifier=identifier+"_";}
+
+     if(!addSpacer){spacer="";}
+
+     //Expand a '#' placeholder in out1 into separate out1/out2 names
+     if(out1!=null && out2==null && out1.indexOf('#')>-1){
+         out2=out1.replace("#", "2");
+         out1=out1.replace("#", "1");
+     }
+
+     assert(FastaReadInputStream.settingsOK());
+
+     if(in1==null){
+         printOptions();
+         throw new RuntimeException("Error - at least one input file is required.");
+     }
+     if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+         ByteFile.FORCE_MODE_BF2=true;
+     }
+
+     if(out1==null){
+         if(out2!=null){
+             printOptions();
+             throw new RuntimeException("Error - cannot define out2 without defining out1.");
+         }
+         System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+     }
+
+     //The literal string "null" disables an output
+     if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+     if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+     if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+         throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+     }
+
+     ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+     ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+     ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ }
+
+ void process(Timer t){
+ //Reads every input read, builds a synthetic mate pair from its two ends, writes the pairs (if out1 is set),
+ //then stops t and prints totals. Throws RuntimeException at the end if errorState is set.
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null, qfin1, null);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+ if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));}
+ //The output stream is optional; without out1 the reads are still counted but nothing is written.
+ ConcurrentReadOutputStream ros=null;
+ if(out1!=null){
+ final int buff=4;
+
+ if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+ outstream.println("Writing interleaved.");
+ }
+ //NOTE(review): the first assertion below repeats the same in1 comparison twice; this class has no in2.
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+ assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(out1))) : "out1 and out2 have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+ ros.start();
+ }
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ {
+ //Fetch the first batch; ln carries the batch id that must be handed back through returnList.
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+ //One iteration per batch; a null or empty batch ends the loop.
+ while(reads!=null && reads.size()>0){
+ ArrayList<Read> fake=new ArrayList<Read>(reads.size());
+
+ for(int idx=0; idx<reads.size(); idx++){
+ Read r=reads.get(idx);
+ {
+ readsProcessed++;
+ basesProcessed+=r.length();
+ }
+ assert(r.mate==null); //Input reads are expected to be unpaired
+ //Skip reads that are too short; the second condition does not depend on the read at all.
+ boolean remove=r.length()<minReadLength || (minReadLength+overlap)<2;
+
+ if(remove){
+ //Do nothing
+ }else{
+ int len=Tools.min(r.length(), desiredLength);
+ if(SPLITMODE){len=Tools.min(r.length(), (r.length()+overlap+1)/2);}
+ //Mate 1 is the first len bases; mate 2 is the last len bases, reverse-complemented.
+ byte[] bases1=Arrays.copyOfRange(r.bases, 0, len);
+ byte[] bases2=Arrays.copyOfRange(r.bases, r.length()-len, r.length());
+ AminoAcid.reverseComplementBasesInPlace(bases2);
+
+ byte[] qual1=null;
+ byte[] qual2=null;
+ if(r.quality!=null){
+ qual1=Arrays.copyOfRange(r.quality, 0, len);
+ qual2=Arrays.copyOfRange(r.quality, r.quality.length-len, r.quality.length);
+ Tools.reverseInPlace(qual2); //Qualities are reversed to stay aligned with the reverse-complemented bases
+ }
+
+// public Read(byte[] s_, int chrom_, int start_, int stop_, String id_, byte[] quality_, long numericID_, int flags_){
+ Read a=new Read(bases1, -1, -1, -1, identifier+r.numericID+spacer+"/1", qual1, r.numericID, 0);
+ Read b=new Read(bases2, -1, -1, -1, identifier+r.numericID+spacer+"/2", qual2, r.numericID, 0);
+ a.mate=b;
+ b.mate=a;
+ fake.add(a); //Only the first mate is listed; the second is reachable through a.mate
+ }
+ }
+
+ if(ros!=null){ros.add(fake, ln.id);}
+ //Hand the finished batch back before asking for the next one.
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ errorState|=ReadWrite.closeStreams(cris, ros);
+
+ t.stop();
+ //Assumes t.elapsed is in nanoseconds (per the variable names); the multipliers below give k reads/sec and m bases/sec.
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+ //Counts below 100000 print exactly; larger counts are abbreviated to thousands (k) or millions (m).
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+ //Left-pad both strings to 8 characters so the two rows line up.
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ if(errorState){
+ throw new RuntimeException("FakeReads terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){ //Prints the usage text to outstream, one println call per entry
+ 	for(String line : new String[]{"Syntax:\n",
+ 		"java -ea -Xmx512m -cp <path> jgi.FakeReads in=<infile> out=<outfile> out2=<outfile2>",
+ 		"\nout2 is optional. \nIf output is paired and there is only one output file, it will be written interleaved.\n",
+ 		"Other parameters and their defaults:\n",
+ 		"overwrite=false \tOverwrites files that already exist",
+ 		"ziplevel=5 \tSet compression level, 1 (low) to 9 (max)",
+ 		"interleaved=false\tDetermines whether input file is considered interleaved",
+ 		"fastawrap=70 \tLength of lines in fasta output",
+ 		"qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto",
+ 		"qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)"}){outstream.println(line);}
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ public boolean errorState=false; //Set by process() when closing the streams reports a problem
+
+ public String identifier=null; //Prefix of every generated read name (see process)
+
+ private String in1=null; //Input file name; the constructor requires it to be non-null
+
+ private boolean addSpacer=true; //NOTE(review): no use is visible in this part of the class - confirm it is still needed
+ private String spacer=" "; //Placed between the numeric ID and the "/1" or "/2" suffix of generated names
+
+ private String qfin1=null; //Passed to the input stream alongside ffin1; presumably a separate quality file - confirm
+
+ private String out1=null; //Primary output file; the literal "null" is converted to no output
+ private String out2=null; //Optional second output file; not allowed without out1
+
+ private String qfout1=null; //Passed to the output stream alongside ffout1
+ private String qfout2=null; //Passed to the output stream alongside ffout2
+
+ private String extin=null; //Passed to FileFormat.testInput; presumably an extension override - confirm
+ private String extout=null; //Passed to FileFormat.testOutput; presumably an extension override - confirm
+
+ private boolean overwrite=false; //Passed to the output-file checks in the constructor
+ private boolean append=false; //Passed to the output-file checks in the constructor
+
+ private long maxReads=-1; //First argument of getReadInputStream in process()
+ private int minReadLength=1; //Reads shorter than this produce no output pair
+ private int desiredLength=250; //Upper bound on the length of each generated mate when SPLITMODE is off
+ private int overlap=50; //In SPLITMODE each mate is (read length+overlap+1)/2 bases long, capped at the read length
+ private boolean SPLITMODE=false; //Selects the overlap-based mate length instead of desiredLength
+
+ private final FileFormat ffin1; //Built from in1 in the constructor
+
+ private final FileFormat ffout1; //Built from out1 in the constructor
+ private final FileFormat ffout2; //Built from out2 in the constructor
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination for usage text and the final statistics
+ public static boolean verbose=false; //Enables extra progress messages on System.err in process()
+
+}
+
diff --git a/current/jgi/FilterByCoverage.java b/current/jgi/FilterByCoverage.java
new file mode 100755
index 0000000..2221d98
--- /dev/null
+++ b/current/jgi/FilterByCoverage.java
@@ -0,0 +1,474 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+
+/**
+ * @author Brian Bushnell
+ * @date October 8, 2014
+ *
+ */
+public class FilterByCoverage {
+
+ public static void main(String[] args){
+ 	//The timer is created before the constructor runs, so argument parsing is part of the reported time.
+ 	final Timer timer=new Timer();
+ 	new FilterByCoverage(args).process(timer);
+ }
+
+ public FilterByCoverage(String[] args){
+ //Parses the command line, validates the file names, and builds the FileFormat objects used by process().
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+ //NOTE(review): outstream already defaults to System.err, so the next line currently changes nothing.
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ //process() asserts that the input is unpaired, so interleaving detection is switched off globally.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(20, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+ //The shared Parser gets the first look; anything it rejects falls through to the tool-specific flags.
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("coverage") || a.equals("cov") || a.equals("covstats") || a.equals("coverage1") || a.equals("cov1") || a.equals("covstats1")){
+ covStatsAfter=b;
+ }else if(a.equals("coverage0") || a.equals("cov0") || a.equals("covstats0")){
+ covStatsBefore=b;
+ }else if(a.equals("minc") || a.equals("mincov") || a.equals("mincoverage")){
+ minCoverage=Double.parseDouble(b);
+ }else if(a.equals("minp") || a.equals("minpercent")){
+ minCoveredPercent=Double.parseDouble(b);
+ }else if(a.equals("minr") || a.equals("minreads")){
+ minReads=Long.parseLong(b);
+ }else if(a.equals("minratio") || a.equals("ratio")){
+ minRatio=Double.parseDouble(b);
+ }else if(a.equals("basesundermin")){
+ basesUnderMin=Integer.parseInt(b);
+ }else if(a.equals("minl") || a.equals("minlen") || a.equals("minlength")){
+ minLength=Integer.parseInt(b);
+ }else if(a.equals("trim") || a.equals("trimends")){
+ if(b==null || Character.isLetter(b.charAt(0))){
+ trimEnds=Tools.parseBoolean(b) ? 100 : 0; //A boolean value selects a trim of 100 or 0
+ }else{
+ trimEnds=Integer.parseInt(b);
+ }
+ trimEnds=Tools.max(trimEnds, 0); //Negative amounts are clamped to zero
+ }else if(a.equals("appendresults") || a.equals("logappend") || a.equals("appendlog") || a.equals("appendtolog")){
+ logappend=Tools.parseBoolean(b);
+ }else if(a.equals("log") || a.equals("results")){
+ logfile=b;
+ }else if(a.equals("logheader")){
+ logheader=Tools.parseBoolean(b);
+ }else if(a.equals("outd") || a.equals("outdirty")){
+ outdirty=b;
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ parser.in1=arg; //A bare first argument naming stdin or an existing file is taken as the input
+ if(arg.indexOf('#')>-1 && !new File(arg).exists()){
+ parser.in1=arg.replace("#", "1");
+ parser.in2=arg.replace("#", "2");
+ }
+ }else if(parser.out1==null && i==1 && !arg.contains("=")){
+ parser.out1=arg; //A bare second argument is taken as the clean output
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=parser.overwrite;
+ append=parser.append;
+
+ if(parser.minReadLength>0){minLength=parser.minReadLength;}
+
+ in1=parser.in1;
+ qfin1=parser.qfin1;
+
+ outclean=parser.out1;
+ qfoutclean=parser.qfout1;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+ minLength=Tools.max(1, minLength); //The effective minimum length is never below 1
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ name=ReadWrite.stripToCore(in1); //Written as the first column of every log line in process()
+ //Unless BF2 mode was already forced, force BF1 mode for file reading.
+ if(!ByteFile.FORCE_MODE_BF2){
+ ByteFile.FORCE_MODE_BF2=false;
+ ByteFile.FORCE_MODE_BF1=true;
+ }
+
+ if(outclean!=null && outclean.equalsIgnoreCase("null")){outclean=null;}
+ if(outdirty!=null && outdirty.equalsIgnoreCase("null")){outdirty=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, outclean, outdirty)){
+ outstream.println((outclean==null)+", "+outclean+", "+(outdirty==null)+", "+outdirty);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+outclean+", "+outdirty+"\n");
+ }
+
+ ffoutclean=FileFormat.testOutput(outclean, FileFormat.FASTA, extout, true, overwrite, append, false);
+ ffoutdirty=FileFormat.testOutput(outdirty, FileFormat.FASTA, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTA, extin, true, true);
+ ffCov0=FileFormat.testInput(covStatsBefore, FileFormat.TEXT, ".txt", true, false);
+ ffCov1=FileFormat.testInput(covStatsAfter, FileFormat.TEXT, ".txt", true, false);
+ //NOTE(review): only an assertion guards a missing coverage file; with assertions off, process() presumably finds no records and marks every sequence dirty.
+ assert(covStatsAfter!=null) : "No coverage file specified.";
+ }
+
+ void process(Timer t){
+ 	//Sorts every input sequence into the clean or the dirty output based on its coverage statistics,
+ 	//optionally writing one log line per sequence. t is stopped after the streams close and is printed
+ 	//with the totals. Throws RuntimeException if a stream or the log writer reported an error.
+ 	//Coverage records are keyed by sequence ID; map 0 comes from the optional cov0 file, map 1 from the cov file.
+ 	final HashMap<String, CovStatsLine> cslMap0=new HashMap<String, CovStatsLine>(1024);
+ 	final HashMap<String, CovStatsLine> cslMap1=new HashMap<String, CovStatsLine>(1024);
+ 	if(ffCov0!=null){
+ 		TextFile tf=new TextFile(ffCov0);
+ 		int i=0;
+ 		for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ 			if(!s.startsWith("#") && i>0){//The first line and any line starting with # are skipped
+ 				CovStatsLine csl=new CovStatsLine(s);
+ 				CovStatsLine old=cslMap0.put(csl.id, csl);
+ 				assert(old==null);
+ 			}
+ 			i++;
+ 		}
+ 		tf.close();
+ 	}
+ 	if(ffCov1!=null){
+ 		TextFile tf=new TextFile(ffCov1);
+ 		int i=0;
+ 		for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+ 			if(!s.startsWith("#") && i>0){//The first line and any line starting with # are skipped
+ 				CovStatsLine csl=new CovStatsLine(s);
+ 				CovStatsLine old=cslMap1.put(csl.id, csl);
+ 				assert(old==null);
+ 			}
+ 			i++;
+ 		}
+ 		tf.close();
+ 	}
+
+ 	final ConcurrentReadInputStream cris;
+ 	{
+ 		cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null, qfin1, null);
+ 		if(verbose){outstream.println("Started cris");}
+ 		cris.start(); //4567
+ 	}
+ 	assert(!cris.paired());
+ 	//Both output streams are optional; a null stream means that class of sequence is simply not written.
+ 	final ConcurrentReadOutputStream rosClean;
+ 	if(outclean!=null){
+ 		final int buff=4;
+
+ 		assert(!outclean.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ 		rosClean=ConcurrentReadOutputStream.getStream(ffoutclean, null, qfoutclean, null, buff, null, false);
+ 		rosClean.start();
+ 	}else{rosClean=null;}
+
+ 	final ConcurrentReadOutputStream rosDirty;
+ 	if(outdirty!=null){
+ 		final int buff=4;
+
+ 		assert(!outdirty.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ 		rosDirty=ConcurrentReadOutputStream.getStream(ffoutdirty, null, qfoutdirty, null, buff, null, false);
+ 		rosDirty.start();
+ 	}else{rosDirty=null;}
+
+ 	long readsProcessed=0;
+ 	long basesProcessed=0;
+ 	long basesTrimmed=0;
+ 	long readsOut=0;
+ 	long basesOut=0;
+ 	long readsFiltered=0;
+ 	long basesFiltered=0;
+
+ 	final TextStreamWriter tsw=(logfile==null ? null : new TextStreamWriter(logfile, (overwrite && !logappend), logappend, true));
+// System.err.println("***** overwrite="+overwrite+", logappend="+logappend+", combined="+(overwrite && !logappend));
+ 	if(tsw!=null){
+ 		tsw.start();
+ 		if(logheader){tsw.print("#assembly\tcontig\tcontam\tlength\tavgFold\treads\tpercentCovered"+(ffCov0==null ? "" : "\tavgFold0\treads0\tnormRatio")+"\n");}
+ 	}
+
+ 	{
+ 		//Fetch the first batch; ln carries the batch id that must be handed back through returnList.
+ 		ListNum<Read> ln=cris.nextList();
+ 		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+// outstream.println("Fetched "+reads);
+
+ 		if(reads!=null && !reads.isEmpty()){
+ 			Read r=reads.get(0);
+ 			assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ 		}
+ 		//One iteration per batch; a null or empty batch ends the loop.
+ 		while(reads!=null && reads.size()>0){
+
+ 			final ArrayList<Read> cleanList=new ArrayList<Read>(reads.size());
+ 			final ArrayList<Read> dirtyList=new ArrayList<Read>(reads.size());
+ 			for(int idx=0; idx<reads.size(); idx++){
+ 				final Read r1=reads.get(idx);
+ 				assert(r1.mate==null);
+
+ 				final int initialLength1=r1.length();
+
+ 				readsProcessed++;
+ 				basesProcessed+=initialLength1;
+ 				//Trim both ends, or empty the sequence when trimming would leave less than minLength.
+ 				if(trimEnds>0){
+ 					if(initialLength1-trimEnds*2<minLength){
+ 						r1.bases=r1.quality=null;
+ 					}else{
+ 						TrimRead.trimByAmount(r1, trimEnds, trimEnds, 0);
+ 					}
+ 				}
+ 				final int length=r1.length();
+ 				basesTrimmed+=(initialLength1-length);
+
+ 				final double covRatio;
+ 				final boolean contam;
+ 				//A sequence without a record in the main coverage file is always treated as contaminated.
+ 				final CovStatsLine csl0=cslMap0.get(r1.id);
+ 				final CovStatsLine csl1=cslMap1.get(r1.id);
+ 				if(csl1!=null){
+
+ 					if(csl0!=null){
+ 						covRatio=csl0.avgFold/Tools.max(0.01, csl1.avgFold); //The 0.01 floor avoids division by zero
+ 						int underMin=csl0.underMin-csl1.underMin;
+
+ 						if(csl1.reads()<minReads || length<minLength || csl1.coveredPercent()<minCoveredPercent){
+ 							contam=true;
+ 						}else if((csl1.avgFold<minCoverage && covRatio>minRatio) || csl1.avgFold<0.5){
+ 							contam=true;
+ 						}else if(basesUnderMin>0 && underMin>basesUnderMin){
+ 							contam=true;
+ 						}else{
+ 							contam=false;
+ 						}
+ 					}else{
+ 						covRatio=0;
+ 						int underMin=csl1.underMin;
+
+ 						if(csl1.reads()<minReads || length<minLength || csl1.coveredPercent()<minCoveredPercent || csl1.avgFold<minCoverage){
+ 							contam=true;
+ 						}else if(basesUnderMin>0 && underMin>basesUnderMin){
+ 							contam=true;
+ 						}else{
+ 							contam=false;
+ 						}
+ 					}
+
+ 				}else{
+ 					contam=true;
+ 					covRatio=0;
+ 				}
+
+ 				if(!contam){
+ 					cleanList.add(r1);
+ 					readsOut++;
+ 					basesOut+=length;
+ 				}else{
+ 					dirtyList.add(r1);
+ 					readsFiltered++;
+ 					basesFiltered+=length;
+ 				}
+ 				if(tsw!=null && (length>=minLength || PRINT_SHORT_CONTIG_RESULTS)){
+ 					if(csl1==null){//No coverage record: log zeros; %.2f needs a double, an int 0 throws IllegalFormatConversionException
+ 						if(ffCov0==null){
+ 							tsw.print(String.format("%s\t%s\t%s\t%d\t%.2f\t%d\t%.2f\n", name, r1.id, contam ? "1" : "0", length, 0.0, 0, 0.0));
+ 						}else{
+ 							tsw.print(String.format("%s\t%s\t%s\t%d\t%.2f\t%d\t%.2f\t%.2f\t%d\t%.2f\n",
+ 									name, r1.id, contam ? "1" : "0", length, 0.0, 0, 0.0, 0.0, 0, 0.0));
+ 						}
+
+ 					}else if(csl0==null){
+ 						tsw.print(String.format("%s\t%s\t%s\t%d\t%.2f\t%d\t%.2f\n", name, csl1.id, contam ? "1" : "0", length,
+ 								csl1.avgFold, csl1.plusReads+csl1.minusReads, csl1.coveredPercent()));
+ 					}else{
+ 						tsw.print(String.format("%s\t%s\t%s\t%d\t%.2f\t%d\t%.2f\t%.2f\t%d\t%.2f\n", name, csl1.id, contam ? "1" : "0", length,
+ 								csl1.avgFold, csl1.plusReads+csl1.minusReads, csl1.coveredPercent(), csl0.avgFold, csl0.plusReads+csl0.minusReads, covRatio));
+ 					}
+ 				}
+ 			}
+
+ 			if(rosClean!=null){rosClean.add(cleanList, ln.id);}
+ 			if(rosDirty!=null){rosDirty.add(dirtyList, ln.id);}
+ 			//Hand the finished batch back before asking for the next one.
+ 			cris.returnList(ln.id, ln.list.isEmpty());
+ 			ln=cris.nextList();
+ 			reads=(ln!=null ? ln.list : null);
+ 		}
+ 		if(ln!=null){
+ 			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ 		}
+ 	}
+
+ 	errorState|=ReadWrite.closeStreams(cris, rosClean, rosDirty);
+ 	if(tsw!=null){errorState|=tsw.poisonAndWait();}
+
+ 	t.stop();
+ 	//Assumes t.elapsed is in nanoseconds (per the variable names); the multipliers below give k reads/sec and m bases/sec.
+ 	double rpnano=readsProcessed/(double)(t.elapsed);
+ 	double bpnano=basesProcessed/(double)(t.elapsed);
+
+ 	outstream.println("Time: "+t);
+ 	outstream.println("Reads In: "+readsProcessed+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ 	outstream.println("Bases In: "+basesProcessed+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ 	outstream.println("Reads Out: "+readsOut);
+ 	outstream.println("Bases Out: "+basesOut);
+ 	outstream.println("Reads Filtered: "+readsFiltered);
+ 	outstream.println("Bases Filtered: "+basesFiltered);
+ 	if(trimEnds>0){
+ 		outstream.println("Bases Trimmed: "+basesTrimmed);
+ 	}
+
+ 	if(errorState){
+ 		throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){
+ 	//Usage text; flag names and defaults mirror the constructor's argument parser and the field initializers.
+ 	outstream.println("Syntax:\n");
+ 	outstream.println("java -ea -cp <path> jgi.FilterByCoverage in=<assembly> cov=<covstats> out=<clean> outdirty=<dirty>\n");
+ 	outstream.println("Parameters and their defaults:\n");
+ 	outstream.println("cov0=<file> \tOptional earlier coverage stats; enables the minratio test");
+ 	outstream.println("mincov=5 \tDiscard sequences with lower average coverage");
+ 	outstream.println("minp=40 \tDiscard sequences with a lower percent of covered bases");
+ 	outstream.println("minr=0 \tDiscard sequences with fewer mapped reads");
+ 	outstream.println("minratio=0 \tWith cov0, low coverage only discards when cov0/cov coverage exceeds this");
+ 	outstream.println("basesundermin=-1\tIf positive, discard sequences with more under-minimum bases than this");
+ 	outstream.println("minlen=1 trim=0 \tMinimum sequence length, and bases to trim from each end");
+ 	outstream.println("log=<file> \tPer-sequence results; see also appendlog=false and logheader=true");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null; //Input sequence file; the constructor requires it to be non-null
+ private String covStatsBefore=null; //Optional coverage file set by cov0/coverage0/covstats0
+ private String covStatsAfter=null; //Main coverage file set by cov/coverage/covstats; asserted non-null
+ private String name=null; //ReadWrite.stripToCore(in1); first column of every log line
+
+ private String qfin1=null; //Copied from parser.qfin1 and passed to the input stream
+
+ private String outclean=null; //Output for sequences that pass; copied from parser.out1
+ private String outdirty=null; //Output for sequences that fail; set by outd/outdirty
+
+ private String qfoutclean=null; //Copied from parser.qfout1 and passed to the clean output stream
+ private String qfoutdirty=null; //Never assigned in this class, so always null when passed to the dirty output stream
+
+ private String extin=null; //Copied from parser.extin and passed to FileFormat.testInput
+ private String extout=null; //Copied from parser.extout and passed to FileFormat.testOutput
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1; //Copied from parser.maxReads; first argument of getReadInputStream
+
+ /** Scaffolds shorter than this (after end trimming) will be discarded. The constructor raises it to at least 1. */
+ private int minLength=0;
+ /** Scaffolds with fewer mapped reads will be discarded. */
+ private long minReads=0;
+ /** Scaffolds with lower average coverage will be discarded (subject to minRatio when a cov0 record exists). */
+ private double minCoverage=5;
+ /** Scaffolds with a lower percent of covered bases will be discarded. */
+ private double minCoveredPercent=40;
+ /** When a cov0 record exists, low coverage only discards a scaffold if cov0 coverage divided by cov coverage exceeds this. */
+ private double minRatio=0;
+ /** If positive, scaffolds are discarded when their underMin count exceeds this; zero or negative disables the test. */
+ private int basesUnderMin=-1;
+
+ /** Trim this much from each sequence end; sequences that would drop below minLength are emptied instead. */
+ private int trimEnds=0;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1; //Built from in1 in the constructor
+ private final FileFormat ffCov0; //Built from covStatsBefore in the constructor
+ private final FileFormat ffCov1; //Built from covStatsAfter in the constructor
+
+ private final FileFormat ffoutclean; //Built from outclean in the constructor
+ private final FileFormat ffoutdirty; //Built from outdirty in the constructor
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination for status messages and the final statistics
+ public static boolean verbose=false; //Set by the verbose flag, which also sets the verbose flags of the file and stream classes
+ public boolean errorState=false; //Set when a stream or the log writer reports a problem
+ private boolean overwrite=false; //Copied from parser.overwrite
+ private boolean append=false; //Copied from parser.append
+ private boolean logappend=false; //Passed to the log writer; also cancels overwrite for the log file
+ private String logfile=null; //Per-sequence results file set by log/results; null disables logging
+ private boolean logheader=true; //Whether the log starts with a header line
+ private static boolean PRINT_SHORT_CONTIG_RESULTS=false; //When false, sequences shorter than minLength get no log line
+
+}
diff --git a/current/jgi/FilterBySequence.java b/current/jgi/FilterBySequence.java
new file mode 100755
index 0000000..68ed5ac
--- /dev/null
+++ b/current/jgi/FilterBySequence.java
@@ -0,0 +1,844 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Filters by exact sequence matches.
+ * Similar to Dedupe.
+ *
+ * @author Brian Bushnell
+ * @date December 18, 2015
+ *
+ */
+public class FilterBySequence {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Code entrance from the command line.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ 	//Begin timing before anything else happens.
+ 	final Timer timer=new Timer();
+
+ 	//Build the filter from the command line; the constructor parses every argument.
+ 	final FilterBySequence filter=new FilterBySequence(args);
+
+ 	//Do the work; the timer is handed over so process() can stop and report it.
+ 	filter.process(timer);
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public FilterBySequence(String[] args){
+ //Parses the command line, validates all file names, and builds the FileFormat objects and the empty reference set.
+ //Process any config files
+ args=Parser.parseConfig(args);
+
+ //Detect whether the user needs help
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Print the program name and arguments
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether interleaved was explicitly set.
+
+ //Set some shared static variables regarding PIGZ
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+// FASTQ.FORCE_INTERLEAVED=false;
+// FASTQ.TEST_INTERLEAVED=false;
+
+ //Create a parser object
+ Parser parser=new Parser();
+
+ //Parse each argument
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+
+ //Break arguments into their constituent parts, in the form of "a=b"
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+
+ if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("ordered")){
+ ordered=Tools.parseBoolean(b);
+ }else if(a.equals("storebases") || a.equals("keepbases") || a.equals("sb")){
+ storeBases=Tools.parseBoolean(b);
+ }else if(a.equals("include")){
+ include=Tools.parseBoolean(b);
+ }else if(a.equals("exclude")){
+ include=!Tools.parseBoolean(b); //exclude is stored as the negation of include
+ }else if(a.equals("rcomp")){
+ rcomp=Tools.parseBoolean(b);
+ }else if(a.equals("casesensitive") || a.equals("case")){
+ toUpperCase=!Tools.parseBoolean(b); //Case sensitivity is stored as the negation of toUpperCase
+ }else if(a.equals("parse_flag_goes_here")){
+ long fake_variable=Tools.parseKMG(b); //NOTE(review): template placeholder; the parsed value is discarded
+ //Set a variable here
+ }else if(a.equals("ref")){
+ if(b==null){ref=null;}
+ else{ref=b.split(",");} //Comma-separated list of reference files
+ }else if(a.equals("literal")){
+ if(b==null){literal=null;}
+ else{literal=b.split(",");} //Comma-separated list of literal sequences
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ //Do input file # replacement
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+
+ //Do output file # replacement
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+
+ //Adjust interleaved detection based on the number of input files
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Ensure there is an input file
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ //Adjust the number of threads for input file reading
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //Ensure out2 is not set without out1
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+
+ //Adjust interleaved settings based on number of output files
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //Ensure output files can be written
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Ensure input files can be read
+ if(!Tools.testInputFiles(false, true, in1, in2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+
+ //Ensure ref files can be read
+ if(!Tools.testInputFiles(true, true, ref)){ //NOTE(review): ref is null when only literal= is given; confirm testInputFiles tolerates that
+ throw new RuntimeException("\nCan't read to some reference files.\n");
+ }
+
+ //Ensure that no file was specified multiple times
+ if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ //Create output FileFormat objects
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+
+ //Create input FileFormat objects
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ //At least one source of reference sequences is required, but only an assertion enforces it.
+ assert(ref!=null || literal!=null) : "No reference sequences.";
+
+ if(ref!=null){
+ ffref=new FileFormat[ref.length];
+ for(int i=0; i<ref.length; i++){
+ ffref[i]=FileFormat.testInput(ref[i], FileFormat.FASTQ, null, true, true);
+ }
+ }else{
+ ffref=null;
+ }
+ //The set starts empty; process() fills it from the reference files and the literals.
+ refSet=new HashSet<Code>();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Create read streams and process all data */
+ void process(Timer t){
+ 	//Loads the reference set, streams the input reads through the worker threads,
+ 	//then closes the streams and prints timing and read/base counts.
+ 	//Throws RuntimeException at the end if errorState was set along the way.
+
+ 	//Validate reads in the constructor only when fewer than 4 threads are available; the old setting is restored below
+ 	final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+ 	Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4;
+
+ 	{
+ 		if(ffref!=null){
+ 			for(FileFormat ff : ffref){
+ 				processReference(ff);
+ 			}
+ 		}
+ 		if(literal!=null){
+ 			for(String s : literal){
+ 				refSet.add(new Code(s.getBytes()));
+ 			}
+ 		}
+ 		//Report through outstream like every other message of this class, not directly to System.err
+ 		outstream.println("Loaded "+refSet.size()+" unique reference sequence"+(refSet.size()==1 ? "." : "s."));
+ 	}
+
+ 	//Create a read input stream
+ 	final ConcurrentReadInputStream cris;
+ 	{
+ 		cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ 		cris.start(); //Start the stream
+ 		if(verbose){outstream.println("Started cris");}
+ 	}
+ 	boolean paired=cris.paired();
+ 	if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ 	//Optionally create a read output stream
+ 	final ConcurrentReadOutputStream ros;
+ 	if(ffout1!=null){
+ 		//Select output buffer size based on whether it needs to be ordered
+ 		final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8);
+
+ 		//Notify user of output mode
+ 		if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){
+ 			outstream.println("Writing interleaved.");
+ 		}
+
+ 		ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+ 		ros.start(); //Start the stream
+ 	}else{ros=null;}
+
+ 	//Reset counters
+ 	readsProcessed=0;
+ 	basesProcessed=0;
+
+ 	//Process the reads in separate threads
+ 	spawnProcessThreads(cris, ros);
+
+ 	if(verbose){outstream.println("Finished; closing streams.");}
+
+ 	//Write anything that was accumulated by ReadStats
+ 	errorState|=ReadStats.writeAll();
+ 	//Close the read streams
+ 	errorState|=ReadWrite.closeStreams(cris, ros);
+
+ 	//Reset read validation
+ 	Read.VALIDATE_IN_CONSTRUCTOR=vic;
+
+ 	//Report timing and results
+ 	{
+ 		t.stop();
+
+ 		//Calculate units per nanosecond
+ 		double rpnano=readsProcessed/(double)(t.elapsed);
+ 		double bpnano=basesProcessed/(double)(t.elapsed);
+ 		{
+ 			String rpstring=(""+readsProcessed);
+ 			String bpstring=(""+basesProcessed);
+ 			//Left-pad the strings to 12 characters so that they are right-justified
+ 			while(rpstring.length()<12){rpstring=" "+rpstring;}
+ 			while(bpstring.length()<12){bpstring=" "+bpstring;}
+ 			outstream.println();
+ 			outstream.println("Time: \t"+t);
+ 			outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ 			outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ 		}
+ 		{
+ 			String rpstring=(""+readsOut);
+ 			String bpstring=(""+basesOut);
+ 			//Left-pad the strings to 12 characters so that they are right-justified
+ 			outstream.println();
+ 			while(rpstring.length()<12){rpstring=" "+rpstring;}
+ 			while(bpstring.length()<12){bpstring=" "+bpstring;}
+ 			outstream.println("Reads Out: "+rpstring);
+ 			outstream.println("Bases Out: "+bpstring);
+ 		}
+ 	}
+
+ 	//Throw an exception if there was an error in a thread or while closing a stream
+ 	if(errorState){
+ 		throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+ /** Load ref sequences */
+ private void processReference(final FileFormat ff){
+ 	//Open ff on its own: no second input file and no quality files are supplied.
+ 	final ConcurrentReadInputStream refStream=
+ 			ConcurrentReadInputStream.getReadInputStream(-1, false, ff, null, null, null);
+ 	refStream.start(); //Start the stream
+
+ 	spawnLoadThreads(refStream); //Returns only after every loader thread has terminated
+ 	ReadWrite.closeStream(refStream);
+ }
+
+
+ /** Spawn load threads, wait for all of them to finish, and gather their statistics */
+ private void spawnLoadThreads(final ConcurrentReadInputStream cris){
+
+ 	//Nothing needs to be prepared before the threads are created
+
+ 	//One loader per available thread; all of them are given the same stream
+ 	final int threads=Shared.threads();
+
+ 	final ArrayList<LoadThread> loaders=new ArrayList<LoadThread>(threads);
+ 	for(int tid=0; tid<threads; tid++){
+ 		final LoadThread lt=new LoadThread(cris, tid);
+ 		loaders.add(lt);
+ 	}
+
+ 	//Launch only after the whole list has been built
+ 	for(int k=0; k<loaders.size(); k++){
+ 		loaders.get(k).start();
+ 	}
+
+ 	//Becomes true as soon as any loader reports failure
+ 	boolean failed=false;
+ 	for(final LoadThread lt : loaders){
+
+ 		//An interrupted join() falls into the catch block, so keep checking until the thread has terminated
+ 		while(lt.getState()!=Thread.State.TERMINATED){
+ 			try {
+ 				lt.join();
+ 			} catch (InterruptedException e) {
+ 				//The interruption is printed but does not end the wait
+ 				e.printStackTrace();
+ 			}
+ 		}
+
+ 		//Fold this thread's counters into the totals of the enclosing object
+ 		readsLoaded+=lt.readsProcessedT;
+ 		basesLoaded+=lt.basesProcessedT;
+ 		if(!lt.success){failed=true;}
+ 	}
+
+ 	//Record the failure; process() checks errorState before it returns
+ 	if(failed){
+ 		errorState=true;
+ 	}
+
+ 	//Nothing needs to be cleaned up afterwards
+ }
+
+ /**
+ * Starts one ProcessThread per available thread, waits for all of them to
+ * terminate, and adds their counters to the read/base totals for input and output.
+ * Sets errorState if any thread did not report success.
+ * @param cris Input stream shared by all ProcessThreads
+ * @param ros Output stream shared by all ProcessThreads
+ */
+ private void spawnProcessThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+
+ //One worker per thread reported by Shared
+ final int workerCount=Shared.threads();
+
+ //Create the workers
+ final ArrayList<ProcessThread> workers=new ArrayList<ProcessThread>(workerCount);
+ for(int tid=0; tid<workerCount; tid++){
+ workers.add(new ProcessThread(cris, ros, tid));
+ }
+
+ //Launch them all before waiting on any
+ for(int i=0; i<workers.size(); i++){
+ workers.get(i).start();
+ }
+
+ //Join each worker in turn and gather its results
+ boolean allSucceeded=true;
+ for(int i=0; i<workers.size(); i++){
+ final ProcessThread worker=workers.get(i);
+
+ //join() may be interrupted, so retry until the thread has really terminated
+ while(worker.getState()!=Thread.State.TERMINATED){
+ try {
+ worker.join();
+ } catch (InterruptedException e) {
+ //Not expected; report it and keep waiting
+ e.printStackTrace();
+ }
+ }
+
+ //Fold the per-thread counters into the totals
+ readsProcessed+=worker.readsProcessedT;
+ basesProcessed+=worker.basesProcessedT;
+ readsOut+=worker.readsOutT;
+ basesOut+=worker.basesOutT;
+ allSucceeded=(allSucceeded && worker.success);
+ }
+
+ //Record that at least one worker failed
+ if(!allSucceeded){errorState=true;}
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** This is called if the program runs with no parameters */
+ private void printOptions(){
+ throw new RuntimeException("TODO"); //TODO
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Worker thread that pulls read lists from the shared input stream, tests each
+ * read (pair) against refSet, nulls out the rejected entries, and passes the
+ * lists on to the output stream.
+ */
+ private class ProcessThread extends Thread {
+
+ /**
+ * @param cris_ Shared input stream
+ * @param ros_ Shared output stream; may be null
+ * @param tid_ Thread ID
+ */
+ ProcessThread(final ConcurrentReadInputStream cris_, final ConcurrentReadOutputStream ros_, final int tid_){
+ cris=cris_;
+ ros=ros_;
+ tid=tid_;
+ }
+
+ /** Invoked by start(); success is only set when processInner returns normally. */
+ public void run(){
+ processInner();
+ success=true;
+ }
+
+ /** Consume read lists until the input stream returns an empty or null list */
+ void processInner(){
+
+ //Fetch the first batch and unwrap its read list
+ ListNum<Read> batch=cris.nextList();
+ ArrayList<Read> list=(batch==null ? null : batch.list);
+
+ //Placeholder for the pairing sanity check
+ if(list!=null && !list.isEmpty()){
+ Read r=list.get(0);
+// assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access
+ }
+
+ //Keep going while batches contain reads
+ while(list!=null && !list.isEmpty()){
+// if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
+
+ for(int i=0; i<list.size(); i++){
+ final Read r1=list.get(i);
+ final Read r2=r1.mate;
+
+ //Validation is deferred to the worker threads
+ if(!r1.validated()){r1.validate(true);}
+ if(r2!=null && !r2.validated()){r2.validate(true);}
+
+ //Size of this read (pair), used for both the input and output counters
+ final int len1=r1.length();
+ final int len2=r1.mateLength();
+ final int pairBases=len1+len2;
+
+ readsProcessedT+=1+r1.mateCount();
+ basesProcessedT+=pairBases;
+
+ //Rejected reads are replaced by null in the list; accepted ones are counted
+ if(processReadPair(r1, r2)){
+ readsOutT+=1+r1.mateCount();
+ basesOutT+=pairBases;
+ }else{
+ list.set(i, null);
+ }
+ }
+
+ //Hand the (partially nulled) list to the output stream
+ if(ros!=null){ros.add(list, batch.id);}
+
+ //Tell the input stream this batch is finished
+ cris.returnList(batch.id, batch.list.isEmpty());
+// if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
+
+ //Move on to the next batch
+ batch=cris.nextList();
+ list=(batch==null ? null : batch.list);
+ }
+
+ //Return the terminal batch, if one was received
+ if(batch!=null){
+ cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+ }
+ }
+
+ /**
+ * Decide whether a read or read pair passes the filter.
+ * Each read's Code must be present in refSet exactly when 'include' is true.
+ * @param r1 Read 1
+ * @param r2 Read 2 (may be null)
+ * @return True if the reads should be kept, false if they should be discarded.
+ */
+ boolean processReadPair(final Read r1, final Read r2){
+ final Code code1=new Code(r1.bases);
+ final Code code2=(r2!=null ? new Code(r2.bases) : null);
+
+ final boolean pass1=(refSet.contains(code1)==include);
+ if(!pass1){return false;}
+
+ //An absent mate never causes rejection
+ return code2==null || refSet.contains(code2)==include;
+ }
+
+ /** Reads seen by this thread */
+ protected long readsProcessedT=0;
+ /** Bases seen by this thread */
+ protected long basesProcessedT=0;
+
+ /** Reads kept by this thread */
+ protected long readsOutT=0;
+ /** Bases kept by this thread */
+ protected long basesOutT=0;
+
+ /** Set to true at the very end of run() */
+ boolean success=false;
+
+ /** Input stream shared with the other workers */
+ private final ConcurrentReadInputStream cris;
+ /** Output stream shared with the other workers */
+ private final ConcurrentReadOutputStream ros;
+ /** Thread ID */
+ final int tid;
+ }
+
+ /**
+ * Worker thread that reads sequences from the shared input stream and adds a
+ * Code for every read and mate to refSet, batching the insertions locally.
+ */
+ private class LoadThread extends Thread {
+
+ /**
+ * @param cris_ Shared input stream
+ * @param tid_ Thread ID
+ */
+ LoadThread(final ConcurrentReadInputStream cris_, final int tid_){
+ cris=cris_;
+ tid=tid_;
+ }
+
+ /** Invoked by start(); success is only set when processInner returns normally. */
+ public void run(){
+ processInner();
+ success=true;
+ }
+
+ /** Consume read lists until the input stream returns an empty or null list */
+ void processInner(){
+
+ //Fetch the first batch and unwrap its read list
+ ListNum<Read> batch=cris.nextList();
+ ArrayList<Read> list=(batch==null ? null : batch.list);
+
+ //Placeholder for the pairing sanity check
+ if(list!=null && !list.isEmpty()){
+ Read r=list.get(0);
+// assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access
+ }
+
+ //Thread-local buffer, so refSet is locked once per batch of codes rather than once per read
+ final LinkedHashSet<Code> pending=new LinkedHashSet<Code>(4000);
+
+ //Keep going while batches contain reads
+ while(list!=null && !list.isEmpty()){
+// if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
+
+ for(int i=0; i<list.size(); i++){
+ final Read r1=list.get(i);
+ final Read r2=r1.mate;
+
+ //Validation is deferred to the worker threads
+ if(!r1.validated()){r1.validate(true);}
+ if(r2!=null && !r2.validated()){r2.validate(true);}
+
+ //Update the counters
+ final int len1=r1.length();
+ final int len2=r1.mateLength();
+ readsProcessedT+=1+r1.mateCount();
+ basesProcessedT+=len1+len2;
+
+ //r1 has already been dereferenced above, so only the mate needs a null check
+ pending.add(new Code(r1.bases));
+ if(r2!=null){pending.add(new Code(r2.bases));}
+ }
+
+ //Flush once the buffer has grown past 2000 codes
+ if(pending.size()>2000){flush(pending);}
+
+ //Tell the input stream this batch is finished
+ cris.returnList(batch.id, batch.list.isEmpty());
+// if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
+
+ //Move on to the next batch
+ batch=cris.nextList();
+ list=(batch==null ? null : batch.list);
+ }
+
+ //Flush whatever is left over
+ if(pending.size()>0){flush(pending);}
+
+ //Return the terminal batch, if one was received
+ if(batch!=null){
+ cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+ }
+ }
+
+ /** Moves every buffered code into refSet while holding the refSet lock, then empties the buffer. */
+ private void flush(final LinkedHashSet<Code> pending){
+ synchronized(refSet){
+ refSet.addAll(pending);
+ pending.clear();
+ }
+ }
+
+ /** Reads seen by this thread */
+ protected long readsProcessedT=0;
+ /** Bases seen by this thread */
+ protected long basesProcessedT=0;
+
+ /** Set to true at the very end of run() */
+ boolean success=false;
+
+ /** Input stream shared with the other workers */
+ private final ConcurrentReadInputStream cris;
+ /** Thread ID */
+ final int tid;
+ }
+
+ /**
+ * Set key representing one sequence.
+ * Holds two hash values (a, b) and, when storeBases is true, the bases themselves
+ * so that equal hashes can be confirmed by a direct comparison.
+ */
+ private class Code {
+
+ /**
+ * @param bases_ Sequence to encode; it is cloned before being modified, so the caller's array is never changed
+ */
+ Code(byte[] bases_){
+ long fwd=Dedupe.hash(bases_);
+ long rev=Dedupe.hashReversed(bases_);
+ //With rcomp, (a, b) is the ordered pair (max, min) so both orientations produce the same key
+ a=(rcomp ? Tools.max(fwd, rev) : fwd);
+ b=(rcomp ? Tools.min(fwd, rev) : rev);
+
+ if(storeBases){
+ if(!rcomp && !toUpperCase){
+ //Nothing needs to be modified, so the caller's array is shared
+ bases=bases_;
+ }else{
+ bases=bases_.clone();
+ //a!=fwd means the reverse hash was selected, so store the reverse-complemented bases
+ if(a!=fwd){AminoAcid.reverseComplementBasesInPlace(bases);}
+ //NOTE(review): this loop runs whenever the array was cloned, i.e. also when rcomp is true and toUpperCase is false - confirm that is intended
+ for(int i=0; i<bases.length; i++){
+ bases[i]=(byte) Character.toUpperCase(bases[i]);
+ }
+ }
+ }else{
+ bases=null;
+ }
+ }
+
+ /**
+ * Type-safe equality: anything that is not a Code (including null) is unequal,
+ * instead of causing a ClassCastException or NullPointerException.
+ */
+ @Override
+ public boolean equals(Object o){
+ if(this==o){return true;}
+ if(!(o instanceof Code)){return false;}
+ return equals((Code)o);
+ }
+
+ /**
+ * @param c Code to compare with; null is treated as unequal
+ * @return True if both hashes match and, when both objects stored bases, the bases match as well
+ */
+ public boolean equals(Code c){
+ if(c==null){return false;}
+ if(a!=c.a || b!=c.b){return false;}
+ //If either side did not store bases, the hashes alone decide
+ if(bases==null || c.bases==null){return true;}
+ return Tools.equals(bases, c.bases);
+ }
+
+ /** Derived from 'a' only, with the sign bit masked off before narrowing to int */
+ @Override
+ public int hashCode(){
+ return (int)(a&0x7FFFFFFF);
+ }
+
+ /** Hash values; see the constructor for how they are chosen */
+ final long a, b;
+ /** Stored bases, or null when storeBases is false */
+ final byte[] bases;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file path */
+ private String in1=null;
+ /** Secondary input file path */
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ /** Primary output file path */
+ private String out1=null;
+ /** Secondary output file path */
+ private String out2=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ /** Override input file extension */
+ private String extin=null;
+ /** Override output file extension */
+ private String extout=null;
+
+ /** Ref input file path */
+ private String[] ref=null;
+
+ /** Literals */
+ private String[] literal=null;
+
+ private HashSet<Code> refSet;
+
+ private boolean storeBases=true;
+
+ private boolean include=true;
+
+ private boolean rcomp=true;
+
+ private boolean toUpperCase=true;
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of reads processed */
+ protected long readsProcessed=0;
+ /** Number of bases processed */
+ protected long basesProcessed=0;
+
+ protected long readsLoaded=0;
+ protected long basesLoaded=0;
+
+ /** Number of reads output*/
+ protected long readsOut=0;
+ /** Number of bases output*/
+ protected long basesOut=0;
+
+ /** Quit after processing this many input reads; -1 means no limit */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file */
+ private final FileFormat ffin1;
+ /** Secondary input file */
+ private final FileFormat ffin2;
+
+ /** Primary output file */
+ private final FileFormat ffout1;
+ /** Secondary output file */
+ private final FileFormat ffout2;
+
+ /** Reference Files */
+ private final FileFormat[] ffref;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+ /** Overwrite existing output files */
+ private boolean overwrite=false;
+ /** Append to existing output files */
+ private boolean append=false;
+ /** Reads are output in input order */
+ private boolean ordered=false;
+
+}
diff --git a/current/jgi/FilterReadsWithSubs.java b/current/jgi/FilterReadsWithSubs.java
new file mode 100755
index 0000000..459608e
--- /dev/null
+++ b/current/jgi/FilterReadsWithSubs.java
@@ -0,0 +1,129 @@
+package jgi;
+
+import align2.Tools;
+import stream.Read;
+import stream.SamLine;
+import dna.AminoAcid;
+import dna.Timer;
+
+/**
+ * Filters to select only reads with substitution errors
+ * for bases with quality scores in a certain interval.
+ * Used for manually examining specific reads that may have
+ * incorrectly calibrated quality scores, for example.
+ * @author Brian Bushnell
+ * @date May 5, 2015
+ *
+ */
+public class FilterReadsWithSubs extends BBTool_ST {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Code entrance from the command line.
+ * Must be overridden; the commented body is an example.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ Timer t=new Timer();
+ FilterReadsWithSubs bbt=new FilterReadsWithSubs(args);
+ bbt.process(t);
+ }
+
+ public FilterReadsWithSubs(String[] args){super(args);}
+
+ @Override
+ public boolean parseArgument(String arg, String a, String b) {
+// System.err.println("Calling parseArgument("+arg+","+a+","+b+")");
+ if(a.equals("minq")){
+ minq=(int)Tools.parseKMG(b);
+ return true;
+ }else if(a.equals("maxq")){
+ maxq=(int)Tools.parseKMG(b);
+ return true;
+ }else if(a.equals("keepperfect")){
+ keepPerfect=Tools.parseBoolean(b);
+ return true;
+ }else if(a.equals("countindels")){
+ countIndels=Tools.parseBoolean(b);
+ return true;
+ }
+
+ //There was no match to the argument
+ return false;
+ }
+
+ void setDefaults(){
+ SamLine.SET_FROM_OK=true;
+ minq=0;
+ maxq=99;
+ minsubs=1;
+ countIndels=true;
+ keepPerfect=false;
+ }
+
+ @Override
+ void startupSubclass() {}
+
+ @Override
+ void shutdownSubclass() {}
+
+ @Override
+ void showStatsSubclass(Timer t, long readsIn, long basesIn) {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ boolean processReadPair(Read r1, Read r2) {
+ assert(r2==null);
+ final byte[] quals=r1.quality, bases=r1.bases;
+ final byte[] match=(r1.match==null ? null : !r1.shortmatch() ? r1.match : Read.toLongMatchString(r1.match));
+ if(match==null || quals==null || bases==null){return false;}
+
+ int subs=0;
+ int indels=0;
+ for(int qpos=0, mpos=0, last=quals.length-1; mpos<match.length; mpos++){
+
+ final byte m=match[mpos];
+ final byte mprev=match[Tools.max(mpos-1, 0)];
+ final byte mnext=match[Tools.min(mpos+1, match.length-1)];
+
+ final byte q1=quals[qpos];
+ final byte b2=bases[qpos];
+
+ int sub=0, indel=0;
+ if(m=='S'){
+ sub=1;
+ }else if(m=='I'){
+ indel=1;
+ }else if(m=='m'){
+ if(mprev=='D' || mnext=='D'){
+ indel=1;
+ }
+ }else if(m=='D'){
+ //do nothing
+ }else if(m=='C'){
+ //do nothing
+ }else{
+ throw new RuntimeException("Bad symbol m='"+((char)m)+"'\n"+new String(match)+"\n"+new String(bases)+"\n");
+ }
+ subs+=sub;
+ indels+=indel;
+ if(q1>=minq && q1<=maxq){
+ if(sub>0 || (indel>0 && countIndels)){return true;}
+ }
+
+ if(m!='D'){qpos++;}
+ }
+ return keepPerfect && subs==0 && indels==0;
+ }
+
+ public int minq, maxq, minsubs;
+ public boolean countIndels;
+ public boolean keepPerfect;
+
+
+}
diff --git a/current/jgi/FindPrimers.java b/current/jgi/FindPrimers.java
new file mode 100755
index 0000000..593e8fb
--- /dev/null
+++ b/current/jgi/FindPrimers.java
@@ -0,0 +1,297 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.ListNum;
+import align2.MSA;
+import align2.Tools;
+
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+import stream.SamLine;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 6, 2014
+ *
+ */
+public class FindPrimers {
+
+ /**
+ * Code entrance from the command line.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ Timer t=new Timer();
+ FindPrimers as=new FindPrimers(args);
+ as.process(t);
+ }
+
+ /**
+ * Parses the arguments, builds the forward and reverse-complement query
+ * arrays from the comma-delimited primer list, and creates the file formats.
+ * @param args Command line arguments
+ */
+ public FindPrimers(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ String query_=null;
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("primer") || a.equals("query") || a.equals("literal")){
+ query_=b;
+ }else if(a.equals("msa")){
+ msaType=b;
+ }else if(a.equals("columns")){
+ columns=Integer.parseInt(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ in1=parser.in1;
+ out1=parser.out1;
+ }
+
+
+ if(query_==null){
+ querys=rquerys=null;
+ maxqlen=0;
+ }else{
+ //One forward and one reverse-complemented copy per comma-delimited query
+ int max=0;
+ String[] s2=query_.split(",");
+ querys=new byte[s2.length][];
+ rquerys=new byte[s2.length][];
+ for(int i=0; i<s2.length; i++){
+ max=Tools.max(max, s2[i].length());
+ querys[i]=s2[i].getBytes();
+ rquerys[i]=AminoAcid.reverseComplementBases(querys[i]);
+ }
+ maxqlen=max;
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, true, false, false);
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ }
+
+ /**
+ * Aligns every query, in both orientations, to every input read and writes
+ * one line per read for the best-scoring orientation (forward wins ties).
+ * @param t Timer that was started before construction; stopped here for reporting
+ * @throws RuntimeException if no query was specified
+ */
+ void process(Timer t){
+
+ //Without queries there is nothing to align; fail with a message instead of a NullPointerException
+ if(querys==null || rquerys==null){
+ throw new RuntimeException("No query sequences were specified; use primer=, query=, or literal=.");
+ }
+
+ final ConcurrentReadInputStream cris;
+ {
+ //Distributed version
+// ConcurrentReadInputStream cris0=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+// cris=new ConcurrentReadInputStreamD(cris0, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+ if(verbose){outstream.println("Started cris");}
+ cris.start(); //4567
+ }
+
+ //The writer is optional; every later use is guarded against null
+ final ByteStreamWriter bsw;
+ if(out1!=null){
+
+ assert(!out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ bsw=new ByteStreamWriter(ffout1);
+ bsw.start();
+ }else{bsw=null;}
+
+
+ MSA msa=MSA.makeMSA(maxqlen+3, columns, msaType);
+
+ long readsProcessed=0;
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r=reads.get(idx);
+
+ //Grow the aligner when a read exceeds its column limit, keeping the configured aligner type
+ if(r.length()+2>msa.maxColumns){
+ msa=MSA.makeMSA(maxqlen+3, r.length()+2+r.length()/2, msaType);
+ }
+
+ //score1 tracks the best forward hit, score2 the best reverse hit
+ int score1=-999999, score2=-999999;
+
+ final int a=0, b=r.length()-1;
+ int[] max;
+
+ SiteScore ssf=null;
+ for(int qnum=0; qnum<querys.length; qnum++){
+ final byte[] query=querys[qnum];
+ max=msa.fillLimited(query, r.bases, a, b, -9999, null);
+ if(max!=null){
+ int[] score=msa.score(query, r.bases, a, b, max[0], max[1], max[2], false);
+ SiteScore ss=new SiteScore(1, (byte)0, score[1], score[2], 1, score[0]);
+ if(ssf==null || ss.quickScore>ssf.quickScore){
+ ss.setSlowScore(ss.quickScore);
+ score1=ss.score=ss.quickScore;
+ ss.match=msa.traceback(query, r.bases, a, b, score[3], score[4], score[5], false);
+ ss.hits=qnum; //Index of the query, read back in toBytes
+ ssf=ss;
+ }
+ }
+ }
+
+ SiteScore ssr=null;
+ for(int qnum=0; qnum<rquerys.length; qnum++){
+ final byte[] rquery=rquerys[qnum];
+ max=msa.fillLimited(rquery, r.bases,a, b, -9999, null);
+ if(max!=null){
+ int[] score=msa.score(rquery, r.bases, a, b, max[0], max[1], max[2], false);
+ SiteScore ss=new SiteScore(1, (byte)1, score[1], score[2], 1, score[0]);
+ if(ssr==null || ss.quickScore>ssr.quickScore){
+ ss.setSlowScore(ss.quickScore);
+ //This must be score2: writing score1 here made the reverse branch below unreachable
+ score2=ss.score=ss.quickScore;
+ ss.match=msa.traceback(rquery, r.bases, a, b, score[3], score[4], score[5], false);
+ ss.hits=qnum; //Index of the query, read back in toBytes
+ ssr=ss;
+ }
+ }
+ }
+
+ //Write the better orientation; forward wins ties
+ if(bsw!=null){
+ if(score1>=score2 && ssf!=null){
+ bsw.println(toBytes(null, r, ssf));
+ }
+ if(score2>score1 && ssr!=null){
+ bsw.println(toBytes(null, r, ssr));
+ }
+ }
+
+ readsProcessed++;
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ if(bsw!=null){bsw.poisonAndWait();}
+ ReadWrite.closeStreams(cris);
+ if(verbose){outstream.println("Finished.");}
+
+ t.stop();
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+readsProcessed+" \t"+String.format("%.2fk reads/sec", (readsProcessed/(double)(t.elapsed))*1000000));
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Formats one hit as a tab-delimited line: the literal "query", the flag,
+ * the read id, the 1-based start, a score term, the cigar string, fixed
+ * placeholder columns, the query in the orientation of the hit, and a YI identity tag.
+ * @param bb Buffer to append to; a new one is created if null
+ * @param r Read the query was aligned to
+ * @param ss Alignment of the query; ss.hits holds the query index
+ * @return The buffer that was appended to
+ */
+ private ByteBuilder toBytes(ByteBuilder bb, Read r, SiteScore ss){
+
+ final byte[] query=querys[ss.hits], rquery=rquerys[ss.hits];
+
+ if(bb==null){bb=new ByteBuilder(80);}
+ bb.append("query").append('\t');
+ bb.append(makeFlag(ss)).append('\t');
+ bb.append(r.id.replace('\t', '_')).append('\t');
+ bb.append(ss.start+1).append('\t');
+ bb.append(Tools.max(ss.score/query.length, 4)).append('\t');
+ String cigar=SamLine.toCigar14(ss.match, ss.start, ss.stop, r.length(), query);
+ if(cigar==null){bb.append('*').append('\t');}else{bb.append(cigar).append('\t');}
+ bb.append('0').append('\t');
+ bb.append('*').append('\t');
+ bb.append('0').append('\t');
+
+ bb.append(ss.strand()==Gene.MINUS ? rquery : query).append('\t');
+ bb.append('*').append('\t');
+
+ float f=Read.identity(ss.match);
+ bb.append(String.format("YI:f:%.2f", (100*f)));
+
+ return bb;
+ }
+
+ /**
+ * @param ss Alignment to describe
+ * @return 0x10 if the alignment is on the minus strand, otherwise 0
+ */
+ public static int makeFlag(SiteScore ss){
+ int flag=0;
+ if(ss.strand()==Gene.MINUS){flag|=0x10;}
+// if(r.secondary()){flag|=0x100;}
+// if(r.discarded()){flag|=0x200;}
+ return flag;
+ }
+
+
+
+ /*--------------------------------------------------------------*/
+
+ /** Not implemented yet; always throws. */
+ private void printOptions(){
+ throw new RuntimeException("printOptions: TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Input file path */
+ private String in1=null;
+ /** Output file path; output is skipped when null */
+ private String out1=null;
+
+ private final FileFormat ffin1;
+ private final FileFormat ffout1;
+ /** Queries and their reverse complements, index-aligned; both null when no query was given */
+ private final byte[][] querys, rquerys;
+ /** Length of the longest query */
+ private final int maxqlen;
+ /** Initial column count passed to MSA.makeMSA */
+ private int columns=2000;
+ /** Aligner type passed to MSA.makeMSA */
+ private String msaType="MultiStateAligner11ts";
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private java.io.PrintStream outstream=System.err;
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/FindString.java b/current/jgi/FindString.java
new file mode 100755
index 0000000..7f34f47
--- /dev/null
+++ b/current/jgi/FindString.java
@@ -0,0 +1,25 @@
+package jgi;
+
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Jun 18, 2013
+ *
+ */
+public class FindString {
+
+ /**
+ * Prints every line of a text file that contains at least one of the given strings.
+ * @param args args[0] is the file to scan; every further argument is a substring to search for
+ */
+ public static void main(String[] args){
+ //Without a file name there is nothing to scan; explain the usage instead of failing on args[0]
+ if(args==null || args.length<1){
+ System.err.println("Usage: FindString <file> <string1> [string2 ...]");
+ System.exit(1);
+ }
+ String fname=args[0];
+ TextFile tf=new TextFile(fname, true, false);
+ try{
+ for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+ boolean b=false;
+ for(int i=1; i<args.length; i++){
+ if(line.contains(args[i])){b=true;break;}
+ }
+ if(b){System.out.println(line);}
+ }
+ }finally{
+ //Close the file even if scanning fails part-way
+ tf.close();
+ }
+ }
+
+}
diff --git a/current/jgi/FungalRelease.java b/current/jgi/FungalRelease.java
new file mode 100755
index 0000000..abe4460
--- /dev/null
+++ b/current/jgi/FungalRelease.java
@@ -0,0 +1,468 @@
+package jgi;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+import align2.ListNum;
+import align2.ReadLengthComparator;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Reformats a fungal assembly for release.
+ * Also creates contig and agp files.
+ *
+ * @author Brian Bushnell
+ * @date December 9, 2015
+ *
+ */
+public class FungalRelease {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Command-line entry point.
+ * Starts a timer, constructs a FungalRelease from the arguments, and runs it.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ final Timer timer=new Timer();
+ final FungalRelease release=new FungalRelease(args);
+ release.process(timer);
+ }
+
+ /**
+ * Constructor.
+ * Sets the shared static defaults used by this tool, parses the arguments,
+ * checks that the input and output files are usable, and creates the FileFormat objects.
+ * @param args Command line arguments
+ */
+ public FungalRelease(String[] args){
+
+ //Process any config files
+ args=Parser.parseConfig(args);
+
+ //Detect whether the user needs help
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Print the program name and arguments
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ Shared.FASTA_WRAP=60;
+
+ //Set some shared static variables regarding PIGZ
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ Read.TO_UPPER_CASE=true;
+
+ //Create a parser object
+ Parser parser=new Parser();
+
+ //Parse each argument
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+
+ //Break arguments into their constituent parts, in the form of "a=b"
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+
+ if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("mingapin")){
+ minGapIn=(int)Tools.parseKMG(b);
+ }else if(a.equals("mingap") || a.equals("mingapout")){
+ minGapOut=(int)Tools.parseKMG(b);
+ }else if(a.equals("minlen") || a.equals("minlength") || a.equals("minscaf")){
+ minScaf=(int)Tools.parseKMG(b);
+ }else if(a.equals("mincontig")){
+ minContig=(int)Tools.parseKMG(b);
+ }else if(a.equals("outc") || a.equals("contigs")){
+ outC=b;
+ }else if(a.equals("qfoutc")){
+ qfoutC=b;
+ }else if(a.equals("sortcontigs")){
+ sortContigs=Tools.parseBoolean(b);
+ }else if(a.equals("sortscaffolds") || a.equals("sortcscaffolds")){
+ //"sortcscaffolds" was the original (misspelled) flag; it is still accepted for compatibility
+ sortScaffolds=Tools.parseBoolean(b);
+ }else if(a.equals("baniupac")){
+ banIupac=Tools.parseBoolean(b);
+ }else if(a.equals("agp")){
+ agpFile=b;
+ }else if(a.equals("legend")){
+ legendFile=b;
+ }else if(a.equals("scafnum")){
+ scafNum=Tools.parseKMG(b);
+ }else if(a.equals("renamescaffolds") || a.equals("rename")){
+ renameScaffolds=Tools.parseBoolean(b);
+ }else if(a.equals("contignum")){
+ //Previously tested "scafnum" a second time, so this branch could never be reached
+ contigNum=Tools.parseKMG(b);
+ }else if(a.equals("renamecontigs")){
+ renameContigs=Tools.parseBoolean(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ in1=parser.in1;
+ qfin1=parser.qfin1;
+
+ out1=parser.out1;
+ qfout1=parser.qfout1;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Ensure there is an input file
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ //Adjust the number of threads for input file reading
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //Ensure output files can be written
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, outC)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+
+ //Ensure input files can be read
+ if(!Tools.testInputFiles(false, true, in1)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+
+ //Ensure that no file was specified multiple times
+ if(!Tools.testForDuplicateFiles(true, in1, out1, outC)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ //Create the scaffold output FileFormat object
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTA, extout, true, overwrite, append, ordered);
+
+ //Create the contig output FileFormat object
+ ffoutC=FileFormat.testOutput(outC, FileFormat.FASTA, extout, true, overwrite, append, ordered);
+
+ //Create input FileFormat objects
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTA, extin, true, true);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Opens the input stream and the optional scaffold and contig output streams,
+ * runs processInner over them, closes everything, and prints timing statistics.
+ * @param t Timer to stop and report; it is not started here
+ * @throws RuntimeException if errorState is set once the streams have been closed
+ */
+ void process(Timer t){
+
+ //Open and start the input stream
+ final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null, qfin1, null);
+ cris.start();
+ if(verbose){outstream.println("Started cris");}
+
+ //Open whichever output streams were requested
+ final int buff=4;
+ final ConcurrentReadOutputStream ros, rosc;
+ if(ffout1==null){
+ ros=null;
+ }else{
+ ros=ConcurrentReadOutputStream.getStream(ffout1, null, qfout1, null, buff, null, false);
+ ros.start();
+ }
+ if(ffoutC==null){
+ rosc=null;
+ }else{
+ rosc=ConcurrentReadOutputStream.getStream(ffoutC, null, qfoutC, null, 4, null, false);
+ rosc.start();
+ }
+
+ //Start counting from zero
+ readsProcessed=0;
+ basesProcessed=0;
+
+ //Do the actual work
+ processInner(cris, ros, rosc);
+
+ if(verbose){outstream.println("Finished; closing streams.");}
+
+ //Flush ReadStats first, then close the streams; either may flag an error
+ errorState|=ReadStats.writeAll();
+ errorState|=ReadWrite.closeStreams(cris, ros, rosc);
+
+ //Timing report
+ t.stop();
+
+ //Units per nanosecond
+ final double elapsed=(double)(t.elapsed);
+ final double rpnano=readsProcessed/elapsed;
+ final double bpnano=basesProcessed/elapsed;
+
+ //Abbreviate large counts with "k" or "m"
+ String rpstring;
+ if(readsProcessed<100000){
+ rpstring=""+readsProcessed;
+ }else if(readsProcessed<100000000){
+ rpstring=(readsProcessed/1000)+"k";
+ }else{
+ rpstring=(readsProcessed/1000000)+"m";
+ }
+ String bpstring;
+ if(basesProcessed<100000){
+ bpstring=""+basesProcessed;
+ }else if(basesProcessed<100000000){
+ bpstring=(basesProcessed/1000)+"k";
+ }else{
+ bpstring=(basesProcessed/1000000)+"m";
+ }
+
+ //Left-pad to 8 characters so the numbers are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ //Fail loudly if anything went wrong along the way
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /**
+ * Loads every kept scaffold into memory, then writes the requested outputs.
+ * Scaffolds are optionally sorted with ReadLengthComparator and renamed "scaffold_N" (old and new
+ * names go to the legend file); this only happens when a scaffold output stream exists.
+ * Contigs come from Read.breakAtGaps on each scaffold and are optionally sorted and renamed "contig_N";
+ * AGP text is only produced when a contig output stream exists.
+ * @param cris Started input stream
+ * @param ros Scaffold output stream, or null
+ * @param rosc Contig output stream, or null
+ */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosc){
+
+ final ArrayList<Read> scaffolds=getReads(cris);
+
+ //The legend file is opened whenever it was requested, even if nothing ends up written to it
+ final TextStreamWriter legendWriter;
+ if(legendFile!=null){
+ legendWriter=new TextStreamWriter(legendFile, overwrite, append, false);
+ legendWriter.start();
+ }else{
+ legendWriter=null;
+ }
+
+ if(ros!=null){
+ if(sortScaffolds){Collections.sort(scaffolds, ReadLengthComparator.comparator);}
+ if(renameScaffolds){
+ for(int i=0; i<scaffolds.size(); i++){
+ final Read scaf=scaffolds.get(i);
+ final String oldName=scaf.id;
+ scaf.id="scaffold_"+scafNum;
+ scafNum++;
+ if(legendWriter!=null){legendWriter.println(oldName+"\t"+scaf.id);}
+ }
+ }
+ ros.add(scaffolds, 0);
+ }
+ if(legendWriter!=null){legendWriter.poisonAndWait();}
+
+ final TextStreamWriter agpWriter;
+ final boolean makeAgp;
+ if(agpFile!=null){
+ makeAgp=true;
+ agpWriter=new TextStreamWriter(agpFile, overwrite, append, false);
+ agpWriter.start();
+ }else{
+ makeAgp=false;
+ agpWriter=null;
+ }
+
+ if(rosc!=null){
+ final ArrayList<Read> contigs=new ArrayList<Read>();
+ for(int i=0; i<scaffolds.size(); i++){
+ final Read scaf=scaffolds.get(i);
+ final ArrayList<Read> pieces=scaf.breakAtGaps(makeAgp, minContig);
+ if(makeAgp){
+ //breakAtGaps leaves a String in scaf.obj when asked to; print it, then release it
+ agpWriter.print((String)scaf.obj);
+ scaf.obj=null;
+ }
+ contigs.addAll(pieces);
+ }
+ if(sortContigs){Collections.sort(contigs, ReadLengthComparator.comparator);}
+ if(renameContigs){
+ for(int i=0; i<contigs.size(); i++){
+ contigs.get(i).id="contig_"+contigNum;
+ contigNum++;
+ }
+ }
+ rosc.add(contigs, 0);
+ }
+
+ if(makeAgp){agpWriter.poisonAndWait();}
+
+ }
+
+ /**
+ * Drains the input stream, counting every read and its bases, and collects the reads
+ * for which processRead returns true.
+ * @param cris Started input stream; reads are expected to be unpaired (asserted).
+ * @return List of all kept reads, in input order.
+ */
+ private ArrayList<Read> getReads(final ConcurrentReadInputStream cris){
+
+ final ArrayList<Read> kept=new ArrayList<Read>(10000);
+
+ //Fetch the first batch and unwrap its read list
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln==null ? null : ln.list);
+
+ //Check that pairing is as expected
+ if(reads!=null && !reads.isEmpty()){
+ final Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ //Keep going while batches are nonempty
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(final Read r1 : reads){
+ assert(r1.mate==null);
+
+ //Length is captured before processRead, which may change the read
+ final int initialLength1=r1.length();
+ readsProcessed+=1;
+ basesProcessed+=initialLength1;
+
+ if(processRead(r1)){kept.add(r1);}
+ }
+
+ //Hand the batch back to the input stream
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+
+ //Fetch the next batch
+ ln=cris.nextList();
+ reads=(ln==null ? null : ln.list);
+ }
+
+ //Hand back the final (empty) batch, if there was one
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+
+ return kept;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Process a single read pair.
+ * @param r1 Read 1
+ * @return True if the reads should be kept, false if they should be discarded.
+ */
+ boolean processRead(final Read r1){
+ assert(!banIupac || !r1.containsNonACGTN()) : "Non-ACGTN base found in scaffold "+r1.id;
+ r1.inflateGaps(minGapIn, minGapOut);
+ return r1.length()>=minScaf;
+ }
+
+ /** Meant to print usage help when the program runs with no parameters; not yet implemented, so it always throws a RuntimeException("TODO"). */
+ private void printOptions(){
+ throw new RuntimeException("TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int minGapIn=1;
+ private int minGapOut=10;
+ private int minScaf=1;
+ private int minContig=1;
+ private long scafNum=1;
+ private long contigNum=1;
+
+ private boolean sortScaffolds=true;
+ private boolean sortContigs=false;
+ private boolean banIupac=true;
+ private boolean renameScaffolds=true;
+ private boolean renameContigs=false;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- I/O Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file path */
+ private String in1=null;
+
+ private String qfin1=null;
+
+ /** Primary output file path */
+ private String out1=null;
+ private String outC=null;
+
+ private String qfout1=null;
+ private String qfoutC=null;
+
+ private String agpFile=null;
+ private String legendFile=null;
+
+ /** Override input file extension */
+ private String extin=null;
+ /** Override output file extension */
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of reads processed */
+ protected long readsProcessed=0;
+ /** Number of bases processed */
+ protected long basesProcessed=0;
+
+ /** Quit after processing this many input reads; -1 means no limit */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file */
+ private final FileFormat ffin1;
+
+ /** Primary output file */
+ private final FileFormat ffout1;
+ private final FileFormat ffoutC;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+ /** Overwrite existing output files */
+ private boolean overwrite=false;
+ /** Append to existing output files */
+ private boolean append=false;
+ /** This flag has no effect on singlethreaded programs */
+ private final boolean ordered=false;
+
+}
diff --git a/current/jgi/FuseSequence.java b/current/jgi/FuseSequence.java
new file mode 100755
index 0000000..4f3611b
--- /dev/null
+++ b/current/jgi/FuseSequence.java
@@ -0,0 +1,187 @@
+package jgi;
+
+import java.util.ArrayList;
+
+import align2.ListNum;
+import align2.Tools;
+import dna.Timer;
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+/**
+ * Fuses sequences together, with N-padding in between.
+ * By default every input read (and mate) is appended to one growing sequence that is written once at the end.
+ * With fusepairs set, each read is instead joined to the reverse complement of its own mate.
+ * @author Brian Bushnell
+ * @date Jan 20, 2015
+ *
+ */
+public final class FuseSequence extends BBTool_ST {
+
+ /** Program entry point. */
+ public static void main(String[] args){
+ final Timer timer=new Timer();
+ final FuseSequence fuser=new FuseSequence(args);
+ fuser.process(timer);
+ }
+
+ /**
+ * @param args Command-line arguments; they are passed to the superclass and then to reparse.
+ */
+ public FuseSequence(String[] args){
+ super(args);
+ reparse(args);
+ }
+
+ /** Assigns the default pad length (300), default quality (30) and fusepairs mode (off). */
+ void setDefaults(){
+ fusePairs=false;
+ defaultQuality=30;
+ npad=300;
+ }
+
+ /**
+ * Handles the flags specific to this tool.
+ * @param arg The full argument (unused here)
+ * @param a Flag name
+ * @param b Flag value
+ * @return True if the flag was recognized and consumed
+ */
+ @Override
+ public boolean parseArgument(String arg, String a, String b) {
+ final boolean isPad=(a.equals("pad") || a.equals("npad") || a.equals("ns"));
+ if(isPad){
+ npad=Integer.parseInt(b);
+ return true;
+ }
+ if(a.equals("q") || a.equals("quality")){
+ defaultQuality=Byte.parseByte(b);
+ return true;
+ }
+ if(a.equals("fp") || a.equals("fusepairs")){
+ fusePairs=Tools.parseBoolean(b);
+ return true;
+ }
+ if(a.equals("rename") || a.equals("name")){
+ name=b;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Reads every batch from the input stream and passes each read with its mate to processReadPair.
+ * In fusepairs mode each batch is written as soon as it has been processed;
+ * otherwise a single fused read is written after all input has been consumed.
+ * @param cris Input stream
+ * @param ros Output stream, or null for no output
+ */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+
+ basesProcessed=0;
+ readsProcessed=0;
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln==null ? null : ln.list);
+
+ //Check that pairing is as expected
+ if(reads!=null && !reads.isEmpty()){
+ final Read first=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (first.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+ if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+ for(final Read r1 : reads){
+ final Read r2=r1.mate;
+
+ //Lengths are captured before processing, because fusing replaces r1's bases
+ final int initialLength1=r1.length();
+ final int initialLength2=r1.mateLength();
+
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ if(r2!=null){
+ readsProcessed++;
+ basesProcessed+=initialLength2;
+ }
+
+ processReadPair(r1, r2);
+ }
+
+ if(fusePairs && ros!=null){ros.add(reads, ln.id);}
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ reads=(ln==null ? null : ln.list);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+
+ //Emit the single fused sequence accumulated by processRead
+ if(!fusePairs && ros!=null){
+ final Read fused=new Read(bases.toBytes(), quals.toBytes(), 0);
+ fused.id=(name==null ? "0" : name);
+ final ArrayList<Read> out=new ArrayList<Read>(1);
+ out.add(fused);
+ ros.add(out, 0);
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see jgi.BBTool_ST#processReadPair(stream.Read, stream.Read)
+ */
+ @Override
+ boolean processReadPair(Read r1, Read r2) {
+ if(!fusePairs){
+ //Accumulate nonempty reads; nothing is kept per pair in this mode
+ if(r1!=null && r1.length()>0){processRead(r1);}
+ if(r2!=null && r2.length()>0){processRead(r2);}
+ return false;
+ }
+ fusePair(r1, r2);
+ return true;
+ }
+
+ /**
+ * Replaces r1's bases with r1, then npad N's, then the reverse complement of r2, and unlinks the mates.
+ * Qualities are kept only when both reads have them; the pad positions keep the array default of 0.
+ * Does nothing when r2 is null.
+ */
+ private void fusePair(Read r1, Read r2) {
+ if(r2==null){return;}
+
+ r2.reverseComplement();
+ final int len1=r1.length();
+ final int len2=r2.length();
+ final int total=len1+len2+npad;
+ final byte[] fusedBases=new byte[total];
+ final boolean keepQuals=(r1.quality!=null && r2.quality!=null);
+ final byte[] fusedQuals=(keepQuals ? new byte[total] : null);
+
+ //Copy r1
+ for(int i=0; i<len1; i++){
+ fusedBases[i]=r1.bases[i];
+ if(keepQuals){fusedQuals[i]=r1.quality[i];}
+ }
+ //Write the pad
+ for(int i=0; i<npad; i++){
+ fusedBases[len1+i]='N';
+ }
+ //Copy the reverse-complemented r2 after the pad
+ final int mateStart=len1+npad;
+ for(int i=0; i<len2; i++){
+ fusedBases[mateStart+i]=r2.bases[i];
+ if(keepQuals){fusedQuals[mateStart+i]=r2.quality[i];}
+ }
+
+ r1.mate=r2.mate=null;
+ r1.bases=fusedBases;
+ r1.quality=fusedQuals;
+ }
+
+ /**
+ * Appends one read to the growing fused sequence, preceded by npad N's (quality 0) unless it is the first.
+ * Reads without qualities get defaultQuality at every position.
+ * The first read's id becomes the output name if no name was given.
+ */
+ private void processRead(Read r) {
+ if(name==null){name=r.id;}
+ final boolean needsPad=(bases.length>0);
+ if(needsPad){
+ for(int i=0; i<npad; i++){
+ bases.append('N');
+ quals.append((byte)0);
+ }
+ }
+ bases.append(r.bases);
+ if(r.quality==null){
+ final int len=r.length();
+ for(int i=0; i<len; i++){
+ quals.append(defaultQuality);
+ }
+ }else{
+ quals.append(r.quality);
+ }
+ }
+
+ @Override
+ void startupSubclass() {}
+
+ @Override
+ void shutdownSubclass() {}
+
+ @Override
+ void showStatsSubclass(Timer t, long readsIn, long basesIn) {}
+
+ /** Number of N bases inserted between fused sequences */
+ int npad;
+ /** Quality assigned to bases of reads that have no quality array */
+ byte defaultQuality;
+ /** Fuse each read to its mate instead of fusing all reads together */
+ boolean fusePairs;
+ /** Bases of the fused sequence built so far */
+ ByteBuilder bases=new ByteBuilder();
+ /** Qualities of the fused sequence built so far */
+ ByteBuilder quals=new ByteBuilder();
+ /** Name of the fused output sequence; defaults to the first read's id */
+ String name;
+
+}
diff --git a/current/jgi/GetReads.java b/current/jgi/GetReads.java
new file mode 100755
index 0000000..05ba328
--- /dev/null
+++ b/current/jgi/GetReads.java
@@ -0,0 +1,329 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Tools;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * Grab reads with specified numbers from a file.
+ * The wanted numeric IDs are given with id= (or number=) as a comma-delimited list of single values and x-y ranges.
+ * Matching reads (and their mates) are printed to a single text output; all work happens in the constructor.
+ * TODO Note that much of this is ripped directly from ReformatReads, but is incorrect, because this class does not support dual output files.
+ * @author Brian Bushnell
+ * @date Jul 10, 2013
+ *
+ */
+public class GetReads {
+
+ /** Program entry point; constructing the object runs the whole job. */
+ public static void main(String[] args){
+ new GetReads(args);
+ }
+
+ /**
+ * Parses the arguments, scans the input for the requested read IDs, writes the matches, and prints timing.
+ * @param args Command-line arguments in key=value form
+ * @throws RuntimeException if there are no arguments, no input file, unwritable outputs, or an error state at the end
+ */
+ public GetReads(String[] args){
+ if(args==null || args.length==0){
+ throw new RuntimeException("No arguments.");
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ Timer t=new Timer();
+
+ Parser parser=new Parser();
+ String in1=null;
+ String in2=null;
+
+ //The quality-file names are accepted on the command line but not used below
+ String qfin1=null;
+ String qfin2=null;
+
+ String out1=null;
+ String out2=null;
+
+ String qfout1=null;
+ String qfout2=null;
+
+ boolean parsecustom=false;
+ boolean errorState=false;
+ long maxReads=-1;
+ int passes=1;
+ boolean testsize=false;
+ boolean overwrite=false, append=false;
+ float samplerate=1f;
+ long sampleseed=1;
+
+
+
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+ //Numeric IDs still to be found; entries are removed as they are printed
+ HashSet<Long> table=new HashSet<Long>();
+
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=(split.length>1 ? split[1] : "true");
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("id") || a.equals("number")){
+ //Comma-delimited list of single IDs and inclusive x-y ranges
+ String[] b2=b.split(",");
+ for(String c : b2){
+ final long x, y;
+ if(c.indexOf('-')>=0){
+ String[] c2=c.split("-");
+ assert(c2.length==2) : c;
+ x=Long.parseLong(c2[0]);
+ y=Long.parseLong(c2[1]);
+ }else{
+ x=y=Long.parseLong(c);
+ }
+ for(long z=x; z<=y; z++){
+ table.add(z);
+ }
+ }
+ }else if(a.equals("passes")){
+ passes=Integer.parseInt(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+// align2.FastqReadInputStream.verbose=verbose;
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("build") || a.equals("genome")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+ in1=b;
+ //A '#' in a name that is not an existing file stands for 1 and 2
+ if(b.indexOf('#')>-1 && !new File(b).exists()){
+ in1=b.replace("#", "1");
+ in2=b.replace("#", "2");
+ }
+ }else if(a.equals("in2") || a.equals("input2")){
+ in2=b;
+ }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+ out1=b;
+ if(b.indexOf('#')>-1){
+ out1=b.replace("#", "1");
+ out2=b.replace("#", "2");
+ }
+ }else if(a.equals("out2") || a.equals("output2")){
+ out2=b;
+ }else if(a.equals("qfin") || a.equals("qfin1")){
+ qfin1=b;
+ }else if(a.equals("qfout") || a.equals("qfout1")){
+ qfout1=b;
+ }else if(a.equals("qfin2")){
+ qfin2=b;
+ }else if(a.equals("qfout2")){
+ qfout2=b;
+ }else if(a.equals("testsize")){
+ testsize=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("samplerate")){
+ samplerate=Float.parseFloat(b);
+ assert(samplerate<=1f && samplerate>=0f) : "samplerate="+samplerate+"; should be between 0 and 1";
+ }else if(a.equals("sampleseed")){
+ sampleseed=Long.parseLong(b);
+ }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){
+ stream.FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
+ }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument is treated as the input file
+ in1=arg;
+ if(arg.indexOf('#')>-1 && !new File(arg).exists()){
+ in1=arg.replace("#", "1");
+ in2=arg.replace("#", "2");
+ }
+ }else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+// if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;}
+
+ if(in1==null){
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ out1="stdout";
+ }
+
+ if(!parser.setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //The literal name "null" disables an output
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ FASTQ.PARSE_CUSTOM=parsecustom;
+
+
+ FileFormat ffin=FileFormat.testInput(in1, 0, null, true, true);
+ FileFormat ffout=FileFormat.testOutput(out1, 0, null, true, overwrite, append, false);
+
+
+ final boolean useSharedHeader=(ffin!=null && ffout!=null && ffin.samOrBam() && ffout.samOrBam());
+
+ if(ffin!=null && ffout!=null && ffin.samOrBam() && (ffout.samOrBam() || ffout.bread())){
+ throw new RuntimeException("\nDirect conversion of sam to sam or bread are not currently supported.\nAll other conversions are possible.");
+ }
+
+
+ ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, useSharedHeader, ff1, ff2);
+ }
+
+ cris.setSampleRate(samplerate, sampleseed);
+ outstream.println("Input is "+(cris.paired() ? "paired" : "unpaired"));
+ cris.start(); //4567
+
+ //No writer is created when the output was disabled with out=null.
+ //The append flag is forwarded so that it matches what testOutputFiles was told above.
+ final TextStreamWriter tsw=(out1==null ? null : new TextStreamWriter(out1, overwrite, append, false));
+ if(tsw!=null){tsw.start();}
+
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ for(int pass=1; pass<=passes; pass++){
+// outstream.println("pass="+pass);
+ if(pass>1){
+ //The first stream was opened above; later passes reopen the input
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, useSharedHeader, ff1, ff2);
+ cris.setSampleRate(samplerate, sampleseed);
+ cris.start();
+ }
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert(ffin.samOrBam() || (r.mate!=null)==cris.paired());
+ }
+
+ //Stop early once every requested ID has been found
+ while(reads!=null && reads.size()>0 && !table.isEmpty()){
+
+ for(Read r1 : reads){
+ {
+ readsProcessed++;
+ basesProcessed+=r1.length();
+ }
+ Read r2=r1.mate;
+ if(r2!=null){
+ readsProcessed++;
+ basesProcessed+=r2.length();
+ }
+
+ if(table.remove(r1.numericID)){
+ if(tsw!=null){
+ tsw.println(r1);
+ if(r2!=null){tsw.println(r2);}
+ }
+ if(table.isEmpty()){break;}
+ }
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //Return the last fetched list; guard against the stream having handed back null
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ errorState|=ReadWrite.closeStream(cris);
+ }
+
+ if(tsw!=null){
+ tsw.poisonAndWait();
+ }
+
+ errorState|=(cris.errorState());
+
+ t.stop();
+
+ //Units per nanosecond
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ //Add "k" and "m" for large numbers
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ //Left-pad so the counts are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ if(testsize){
+ //Input file sizes on disk, multiplied by the number of passes
+ long bytesProcessed=(new File(in1).length()+(in2==null ? 0 : new File(in2).length()))*passes;
+ double xpnano=bytesProcessed/(double)(t.elapsed);
+ String xpstring=(bytesProcessed<100000 ? ""+bytesProcessed : bytesProcessed<100000000 ? (bytesProcessed/1000)+"k" : (bytesProcessed/1000000)+"m");
+ while(xpstring.length()<8){xpstring=" "+xpstring;}
+ outstream.println("Bytes Processed: "+xpstring+" \t"+String.format("%.2fm bytes/sec", xpnano*1000));
+ }
+
+ if(errorState){
+ throw new RuntimeException("GetReads terminated in an error state; the output may be corrupt.");
+ }
+
+ }
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/GradeMergedReads.java b/current/jgi/GradeMergedReads.java
new file mode 100755
index 0000000..f28d0bb
--- /dev/null
+++ b/current/jgi/GradeMergedReads.java
@@ -0,0 +1,301 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * Grades merged reads by comparing each read's length with the insert size parsed from its name.
+ * Optionally scans the raw pairs first to report how many had an insert shorter than their combined length.
+ * @author Brian Bushnell
+ * @date May 20, 2014
+ *
+ */
+public class GradeMergedReads {
+
+
+ /** Program entry point. */
+ public static void main(String[] args){
+ Timer t=new Timer();
+ GradeMergedReads gmr=new GradeMergedReads(args);
+ gmr.process(t);
+ }
+
+ /**
+ * Parses arguments and prepares the input FileFormat.
+ * @param args Command-line arguments in key=value form
+ * @throws RuntimeException if no input file was specified
+ */
+ public GradeMergedReads(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ FASTQ.DETECT_QUALITY=false;
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("raw") || a.equals("raw1")){
+ raw1=b;
+ //b is null for "raw" with no value or "raw=null"; only expand '#' on a real name
+ if(b!=null && b.indexOf("#")>=0){
+ raw1=b.replace('#', '1');
+ raw2=b.replace('#', '2');
+ }
+ }else if(a.equals("raw2")){
+ raw2=b;
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument is treated as the input file
+ parser.in1=arg;
+ }else{
+ System.err.println("Unknown parameter "+i+": "+args[i]);
+ assert(false) : "Unknown parameter "+i+": "+args[i];
+// +"\n"+arg+", "+parser.in1+", "+arg.contains("=")+", "+(arg.toLowerCase().startsWith("stdin")+", "+new File(arg).exists()+", "+new File(arg).getAbsolutePath());
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ in=parser.in1;
+ extin=parser.extin;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ FASTQ.PARSE_CUSTOM=false;
+
+ ffin=FileFormat.testInput(in, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /**
+ * Reads the optional raw pairs and then the merged reads, and prints the grading summary.
+ * A merged read is counted as correct when the insert size parsed from its name equals its length.
+ * NOTE(review): a read is counted as "too long" when the parsed insert exceeds the read length,
+ * and "too short" otherwise; confirm that this labelling is the intended one.
+ * @param t Timer supplied by the caller; it is stopped here before reporting.
+ * @throws RuntimeException if a stream reported an error
+ */
+ void process(Timer t){
+
+ long mergeable=0, total=0;
+ if(raw1!=null){
+ FileFormat ffraw1=FileFormat.testInput(raw1, FileFormat.FASTQ, extin, true, true);
+ FileFormat ffraw2=FileFormat.testInput(raw2, FileFormat.FASTQ, extin, true, true);
+ ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffraw1, ffraw2, null, null);
+ cris.start();
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ while(reads!=null && reads.size()>0){
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ total++;
+ final int insert=parseInsert(r1.id);
+ //A pair counts as overlapping when its insert is shorter than both reads combined
+ if(insert>0 && insert<r1.length()+r1.mateLength()){
+ mergeable++;
+ }
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //Return the final list too, as the main pass below does
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ errorState|=ReadWrite.closeStream(cris);
+ }
+ }
+
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin, null, null, null);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ long correct=0;
+ long tooLong=0;
+ long tooShort=0;
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+// System.err.println("Fetched "+reads);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin==null || ffin.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+
+ final int initialLength1=r1.length();
+ final int insert=parseInsert(r1.id);
+
+ //Difference between the insert size in the name and the actual read length
+ int delta=insert-initialLength1;
+ if(delta==0){
+ correct++;
+ }else if(delta>0){
+ tooLong++;
+ }else{
+ tooShort++;
+ }
+
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ errorState|=ReadWrite.closeStream(cris);
+
+ t.stop();
+
+ //Units per nanosecond
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ //Add "k" and "m" for large numbers
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ //Left-pad so the counts are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ long incorrect=tooShort+tooLong;
+ //Ratio of all reads to incorrect reads in decibels; the 0.0001 terms avoid division by zero
+ double snr=10*Math.log10((correct+incorrect+0.0001)/(incorrect+0.0001));
+
+ if(total>0){
+ outstream.println("Input Total: \t"+total+" pairs");
+ outstream.println("Input Overlapping: \t"+String.format("%.5f",mergeable*100.0/total)+"%\t"+mergeable+" reads");
+ }
+ outstream.println("Correct: \t"+String.format("%.5f",correct*100.0/readsProcessed)+"%\t"+correct+" reads");
+ outstream.println("Incorrect: \t"+String.format("%.5f",incorrect*100.0/readsProcessed)+"%\t"+incorrect+" reads");
+ outstream.println("Too Short: \t"+String.format("%.5f",tooShort*100.0/readsProcessed)+"%\t"+tooShort+" reads");
+ outstream.println("Too Long: \t"+String.format("%.5f",tooLong*100.0/readsProcessed)+"%\t"+tooLong+" reads");
+ outstream.println("SNR: \t"+String.format("%.3f",snr));
+
+ outstream.println();
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ if(errorState){
+ throw new RuntimeException("GradeMergedReads terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /**
+ * Extracts the integer that follows the first '=' in a read name, stopping at the first non-digit.
+ * When the name has no '=', the leading digits of the name are parsed instead.
+ * @param s Read name
+ * @return The parsed insert size
+ * @throws NumberFormatException if no digits are found at that position
+ */
+ static int parseInsert(String s){
+// int space=s.indexOf(' ');
+// if(space<0){space=s.length();}
+ int space=s.length();
+ int equals=s.indexOf('=');
+ for(int i=equals+1;i<s.length();i++){//For programs that rename my reads!
+ if(!Character.isDigit(s.charAt(i))){
+ space=i;
+ break;
+ }
+ }
+ s=s.substring(equals+1, space);
+ int insert=Integer.parseInt(s);
+ return insert;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Prints usage information; with assertions enabled it fails first, because the help text is still marked TODO. */
+ private void printOptions(){
+ assert(false) : "Help: TODO";
+ outstream.println("Syntax:\n");
+ outstream.println("java -ea -Xmx200m -cp <path> jgi.GradeMergedReads in=<file>");
+ outstream.println();
+ outstream.println("Other parameters and their defaults:\n");
+ outstream.println("overwrite=false \tOverwrites files that already exist");
+ outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+ outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+ outstream.println("fastawrap=70 \tLength of lines in fasta output");
+ outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ /** Merged-reads input file path */
+ private String in=null;
+
+ /** Override input file extension */
+ private String extin=null;
+
+ /** Optional raw (unmerged) read files */
+ private String raw1=null;
+ private String raw2=null;
+
+ /*--------------------------------------------------------------*/
+
+ /** Quit after processing this many input reads; taken from the parser */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ /** Merged-reads input file */
+ private final FileFormat ffin;
+
+
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+
+
+}
diff --git a/current/jgi/GreedyBarCodeFinder.java b/current/jgi/GreedyBarCodeFinder.java
new file mode 100755
index 0000000..d081750
--- /dev/null
+++ b/current/jgi/GreedyBarCodeFinder.java
@@ -0,0 +1,101 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Random;
+
+import align2.Tools;
+
+import dna.AminoAcid;
+import dna.Timer;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 10, 2014
+ *
+ */
+public class GreedyBarCodeFinder {
+
+ public static void main(String[] args){
+ Timer t=new Timer();
+
+ GreedyBarCodeFinder finder=new GreedyBarCodeFinder(args);
+ int best=finder.find(finder.rounds);
+
+ t.stop();
+ System.err.println("There are at least "+best+" codes of length "+finder.k+" with mutual hamming distance at least "+finder.hdist);
+ System.err.println("Time: \t"+t);
+ }
+
+ public GreedyBarCodeFinder(String[] args){
+ k=Integer.parseInt(args[0]);
+ hdist=Integer.parseInt(args[1]);
+ rounds=(args.length>2 ? Integer.parseInt(args[2]) : 20);
+ }
+
+ public int find(int rounds){
+ ArrayList<String> list=new ArrayList<String>(1024);
+ final int space=1<<(2*k);
+
+ int[] set=new int[(int)space];
+ if(set!=null){
+ set=new int[(int)space];
+ for(int i=0; i<set.length; i++){set[i]=i;}
+ }
+
+ int best=mainOld(k, hdist, list);
+ for(int i=0; i<rounds; i++){
+ best=Tools.max(best, test(k, hdist, set, list));
+ }
+ return best;
+ }
+
+ static int mainOld(int k, int hdist, ArrayList<String> list){
+
+ final long space=1L<<(2*k);
+ if(list==null){list=new ArrayList<String>(1024);}
+ else{list.clear();}
+
+ for(long kmer=0; kmer<space; kmer++){
+ String s=AminoAcid.kmerToString(kmer, k);
+ int dist=CountBarcodes.calcHdist(s, list);
+ if(dist>=hdist){list.add(s);}
+ }
+
+ return list.size();
+
+ }
+
+ static int test(int k, int hdist, int[] set, ArrayList<String> list){
+
+ final int space=1<<(2*k);
+ if(set!=null){
+ set=new int[(int)space];
+ for(int i=0; i<set.length; i++){set[i]=i;}
+ }
+ Random randy=new Random();
+ for(int i=0; i<set.length; i++){
+ int x=i+randy.nextInt(set.length-i);
+ int temp=set[i];
+ set[i]=set[x];
+ set[x]=temp;
+ }
+
+ if(list==null){list=new ArrayList<String>(1024);}
+ else{list.clear();}
+
+ for(long kmer : set){
+ String s=AminoAcid.kmerToString(kmer, k);
+ int dist=CountBarcodes.calcHdist(s, list);
+ if(dist>=hdist){list.add(s);}
+ }
+
+ return list.size();
+ }
+
+ private final int k;
+ private final int hdist;
+ private int rounds;
+
+ static int MAX_HOMOPOLYMER_LENGTH=99;
+
+}
diff --git a/current/jgi/IdentityMatrix.java b/current/jgi/IdentityMatrix.java
new file mode 100755
index 0000000..3770049
--- /dev/null
+++ b/current/jgi/IdentityMatrix.java
@@ -0,0 +1,319 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.BandedAligner;
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+import stream.ConcurrentCollectionReadInputStream;
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * Calculates an all-to-all identity matrix.
+ * All reads are loaded into memory, each read is aligned against every read
+ * with an equal or lower numeric ID, and the results are mirrored into a full
+ * matrix that is written as tab-delimited text with one row per read.
+ * @author Brian Bushnell
+ * @date Nov 23, 2014
+ *
+ */
+public class IdentityMatrix {
+
+ /**
+  * Program entry point.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     Timer t=new Timer();
+     IdentityMatrix as=new IdentityMatrix(args);
+     as.process(t);
+ }
+
+ /**
+  * Parses the command line and prepares the input and output file formats.
+  * Flags handled here: edits/maxedits, width/maxwidth, percent; everything else
+  * is offered to the shared Parser first.
+  * @param args Command line arguments as key=value pairs
+  */
+ public IdentityMatrix(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+     FileFormat.PRINT_WARNING=false;
+     int maxEdits_=-1;
+     int maxWidth_=-1;
+
+     Parser parser=new Parser();
+     for(int i=0; i<args.length; i++){
+         String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if(b==null || b.equalsIgnoreCase("null")){b=null;}
+         while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+         if(parser.parse(arg, a, b)){
+             //do nothing
+         }else if(a.equals("parse_flag_goes_here")){
+             //Set a variable here
+         }else if(a.equals("edits") || a.equals("maxedits")){
+             maxEdits_=Integer.parseInt(b);
+         }else if(a.equals("width") || a.equals("maxwidth")){
+             maxWidth_=Integer.parseInt(b);
+         }else if(a.equals("percent")){
+             percent=Tools.parseBoolean(b);
+         }else{
+             outstream.println("Unknown parameter "+args[i]);
+             assert(false) : "Unknown parameter "+args[i];
+             // throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+
+         maxReads=parser.maxReads;
+         in1=parser.in1;
+         out1=parser.out1;
+     }
+     FASTQ.FORCE_INTERLEAVED=false;
+     FASTQ.TEST_INTERLEAVED=false;
+
+     //-1 means "not set": fall back to the aligner's maximum, and derive the band width from the edit limit.
+     maxEdits=maxEdits_==-1 ? BandedAligner.big : maxEdits_;
+     maxWidth=maxWidth_==-1 ? (int)(Tools.min(maxEdits, BandedAligner.big)*2L+1) : maxWidth_;
+
+     ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, true, false, false);
+     ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ }
+
+ /**
+  * Loads the reads, aligns all pairs on worker threads, completes the
+  * symmetric matrix, writes it if an output was given, and prints a summary.
+  * @param t Timer started by the caller; stopped here before the summary is printed
+  */
+ void process(Timer t){
+
+     allReads=load();
+     Shared.READ_BUFFER_LENGTH=4;
+     ConcurrentCollectionReadInputStream cris=new ConcurrentCollectionReadInputStream(allReads, null, -1);
+     cris.start(); //4567
+
+
+     ArrayList<ProcessThread> threads=new ArrayList<ProcessThread>();
+     final int tmax=Tools.max(Shared.threads(), 1);
+     for(int i=0; i<tmax; i++){
+         threads.add(new ProcessThread(cris));
+     }
+     for(ProcessThread pt : threads){pt.start();}
+     for(ProcessThread pt : threads){
+         while(pt.getState()!=Thread.State.TERMINATED){
+             try {
+                 pt.join();
+             } catch (InterruptedException e) {
+                 // TODO Auto-generated catch block
+                 e.printStackTrace();
+             }
+         }
+     }
+     ReadWrite.closeStreams(cris);
+
+     //Merge per-thread statistics.  The workers used to write minID/maxID/avgID directly,
+     //which raced, and avgID ended up as the partial average of whichever thread finished last.
+     //Reading the worker fields is safe here because every worker has been joined.
+     double sum=0;
+     long compares=0;
+     for(ProcessThread pt : threads){
+         sum+=pt.sumT;
+         compares+=pt.comparesT;
+         minID=Math.min(minID, pt.minIDT);
+         maxID=Math.max(maxID, pt.maxIDT);
+     }
+     avgID=(compares>0 ? sum/compares : 0);
+
+     //Workers fill only columns j<=i of row i; copy them into the mirrored positions.
+     final int numReads=allReads.size();
+     for(int i=1; i<numReads; i++){
+         Read r1=allReads.get(i);
+         assert(r1.numericID==i);
+         for(int j=0; j<i; j++){
+             Read r2=allReads.get(j);
+             assert(r2.numericID==j);
+             ((float[])r2.obj)[i]=((float[])r1.obj)[j];
+         }
+     }
+
+     if(ffout1!=null){
+         TextStreamWriter tsw=new TextStreamWriter(ffout1);
+         tsw.start();
+         for(Read r : allReads){
+             float[] obj=(float[])r.obj;
+             tsw.print(r.id);
+             if(percent){
+                 for(float f : obj){
+                     tsw.print(String.format("\t%.2f", f));
+                 }
+             }else{
+                 for(float f : obj){
+                     tsw.print(String.format("\t%.4f", f));
+                 }
+             }
+             tsw.print("\n");
+             r.obj=null;
+         }
+         tsw.poisonAndWait();
+     }
+
+     t.stop();
+     outstream.println("Total Time: \t"+t);
+     outstream.println("Reads Processed: "+allReads.size()+" \t"+String.format("%.2fk alignments/sec", (allReads.size()*(long)(allReads.size())/(double)(t.elapsed))*1000000));
+     outstream.println("Min Similarity: "+String.format("%.5f", minID));
+     outstream.println("Max Similarity: "+String.format("%.5f", maxID));
+     outstream.println("Avg Similarity: "+String.format("%.5f", avgID));
+ }
+
+ /**
+  * Reads the whole input into memory and records the longest sequence length.
+  * @return All reads, in the order they were delivered by the input stream
+  */
+ private ArrayList<Read> load(){
+     Timer t=new Timer();
+     final ConcurrentReadInputStream cris;
+     {
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+         if(verbose){outstream.println("Started cris");}
+         cris.start(); //4567
+     }
+     boolean paired=cris.paired();
+     assert(!paired) : "This program is not designed for paired reads.";
+
+     long readsProcessed=0;
+     int maxLen=0;
+     ArrayList<Read> bigList=new ArrayList<Read>();
+     {
+
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+         }
+
+         while(reads!=null && reads.size()>0){
+             if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 final Read r1=reads.get(idx);
+
+                 bigList.add(r1);
+                 maxLen=Tools.max(maxLen, r1.length());
+
+                 readsProcessed++;
+             }
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             if(verbose){outstream.println("Returned a list.");}
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+     ReadWrite.closeStreams(cris);
+     if(verbose){outstream.println("Finished loading "+readsProcessed+" sequences.");}
+
+     longestSequence=maxLen;
+
+     t.stop();
+     outstream.println("Load Time: \t"+t);
+
+     return bigList;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Worker that takes reads from the shared stream and aligns each one against
+  * every read in allReads up to and including itself.  Statistics are kept in
+  * fields of the worker and merged by process() after the worker is joined.
+  */
+ private class ProcessThread extends Thread {
+
+     /**
+      * @param cris_ Stream that hands out the reads to be processed
+      */
+     ProcessThread(ConcurrentReadInputStream cris_){
+         cris=cris_;
+         maxEdits2=Tools.min(maxEdits, longestSequence);
+         int width=Tools.min(maxEdits2*2+1, maxWidth);
+         bandy=BandedAligner.makeBandedAligner(width);
+     }
+
+     @Override
+     public void run(){
+         final int numReads=allReads.size();
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+         }
+
+         while(reads!=null && reads.size()>0){
+             if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 final Read r1=reads.get(idx);
+                 float[] obj=new float[numReads];
+                 r1.obj=obj;
+                 for(Read r2 : allReads){
+                     //Only the lower triangle (including the diagonal) is computed here.
+                     if(r2.numericID>r1.numericID){break;}
+//                   int edits=bandy.alignQuadruple(r1.bases, r2.bases, maxEdits2, false);
+                     int edits=bandy.alignQuadrupleProgressive(r1.bases, r2.bases, 10, maxEdits2, false);
+//                   System.err.println(r1.id+"->"+r2.id+": Edits="+edits);
+                     float editRate=edits/(float)Tools.max(r1.length(), r2.length());
+                     float similarity=1-editRate;
+                     if(r1!=r2){//Self-comparisons are excluded from the statistics
+                         comparesT++;
+                         sumT+=similarity;
+                         minIDT=Math.min(minIDT, similarity);
+                         maxIDT=Math.max(maxIDT, similarity);
+                     }
+                     if(percent){
+                         float id=100*similarity;
+                         obj[(int)r2.numericID]=id;
+                     }else{
+                         obj[(int)r2.numericID]=similarity;
+                     }
+                 }
+             }
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             if(verbose){outstream.println("Returned a list.");}
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     private final ConcurrentReadInputStream cris;
+     private final BandedAligner bandy;
+     private final int maxEdits2;
+
+     /** Sum of similarities seen by this thread, excluding self-comparisons */
+     private double sumT=0;
+     /** Number of comparisons counted in sumT */
+     private long comparesT=0;
+     /** Lowest and highest similarity seen by this thread; same starting values as the outer fields */
+     private double minIDT=1, maxIDT=0;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Help text is not implemented; this always throws, so requesting help aborts the program. */
+ private void printOptions(){
+     throw new RuntimeException("printOptions: TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String out1=null;
+
+ private final FileFormat ffin1;
+ private final FileFormat ffout1;
+ /** Report identities multiplied by 100 */
+ private boolean percent=false;
+
+ /** Every input read, held in memory */
+ private ArrayList<Read> allReads;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+ private final int maxEdits;
+ private final int maxWidth;
+ /** Length of the longest loaded read; set by load() */
+ private int longestSequence;
+
+ /** Summary statistics over all non-self comparisons; written by process() after the workers finish */
+ private double minID=1, maxID=0, avgID=0;
+
+ /*--------------------------------------------------------------*/
+
+ private java.io.PrintStream outstream=System.err;
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/Info.java b/current/jgi/Info.java
new file mode 100755
index 0000000..d296594
--- /dev/null
+++ b/current/jgi/Info.java
@@ -0,0 +1,195 @@
+package jgi;
+
+import java.util.Arrays;
+
+/**
+ * Measures the information content of a nucleotide sequence as the number of
+ * distinct orderings of its bases, n!/(a!*c!*g!*t!).  Only A, C, G, T and U
+ * (either case) are counted; every other symbol is ignored.
+ * Values that do not fit in a long are reported as Long.MAX_VALUE.
+ * @author Brian Bushnell
+ * @date Sep 19, 2012
+ *
+ */
+public final class Info {
+
+ /**
+  * With two arguments where the second starts with a digit: prints the prefix of
+  * args[0] needed to reach args[1] bits.  With any other arguments: prints the
+  * information content of each.  With no arguments: prints a set of examples.
+  * @param args Sequences, or a sequence followed by a bit count
+  */
+ public static void main(String[] args){
+
+     if(args.length>0){
+         //The isEmpty check prevents charAt(0) from throwing on an empty second argument.
+         if(args.length==2 && !args[1].isEmpty() && Character.isDigit(args[1].charAt(0))){
+             byte[] s=args[0].getBytes();
+             int b=Integer.parseInt(args[1]);
+             int len=prefixForInfoBits(s, b);
+             if(len<0){
+                 System.out.println("Input string only contains "+String.format("%.2f",infoInBitsDouble(s, 0, s.length))+" bits.");
+             }else{
+                 System.out.println("Prefix needed for "+b+" bits is length "+len+": "+args[0].substring(0, len));
+             }
+         }else{
+             for(String s : args){
+                 printInfo(s);
+                 System.out.println();
+             }
+         }
+         System.exit(0);
+     }
+
+     final String[] examples={"", "A", "AG", "AGT", "AANAA", "GGGGGGGCGGG", "CGGGGGGGGGG",
+         "AGTCAGTCCTAGNGTACGT", "AGTCAGTCAGTCAGTC", "GCGCGCGCGCGCGCGC"};
+     for(String example : examples){
+         System.out.println();
+         printInfo(example);
+     }
+     System.out.println();
+
+     //Grow a repeating AGCT pattern one base at a time.
+     String[] s=new String[] {"A", "G", "C", "T", ""};
+     for(int i=0; i<40; i++){
+         System.out.println();
+         s[4]=s[4]+s[i%4];
+         printInfo(s[4]);
+     }
+
+     printPrefixTable("AAAATATATGAAATGCATGCAATATGTTATGAAA", 60);
+     printPrefixTable("GCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGC", 60);
+     printPrefixTable("ACGTACGTACGTACGTACGTACGTACGTACGTAC", 63);
+ }
+
+ /** Prints prefixForInfoBits of seq for every even bit count below limit. */
+ private static void printPrefixTable(String seq, int limit){
+     System.out.println("PrefixForBits for "+seq);
+     for(int i=0; i<limit; i+=2){
+         System.out.println(i+"\t"+prefixForInfoBits(seq.getBytes(), i));
+     }
+ }
+
+ /** Prints the sequence, its length, and its information content in bits and as a raw count. */
+ public static void printInfo(String s){
+     long r=info(s);
+     double bits=Math.log(r)/Math.log(2);
+     System.out.println(s+"\nlen="+s.length()+" \tinfo = "+String.format("%.2f", bits)+" bits. \t("+r+")");
+ }
+
+ /** Information content of a whole string; see {@link #info(byte[], int, int)}. */
+ public static long info(String s){
+     return info(s.getBytes(), 0, s.length());
+ }
+
+ /** Information content in whole bits, rounded down. */
+ public static int infoInBits(final byte[] array, final int from, final int len){return 63-Long.numberOfLeadingZeros(info(array, from, len));}
+ /** Information content in bits, as a floating point value. */
+ public static double infoInBitsDouble(final byte[] array, final int from, final int len){return Math.log(info(array, from, len))*invlog2;}
+ /** Information content of a whole array; see {@link #info(byte[], int, int)}. */
+ public static long info(final byte[] array){return info(array, 0, array.length);}
+
+ /**
+  * Counts the distinct orderings of the bases in a window of the array.
+  * @param array Sequence bytes
+  * @param from First index of the window
+  * @param len Window length; the window is clipped to the end of the array
+  * @return n!/(a!*c!*g!*t!) for the counted bases, or Long.MAX_VALUE if that does not fit in a long
+  */
+ public static long info(final byte[] array, final int from, final int len){
+     //int counters: short counters wrapped negative once a base occurred 32768 times.
+     final int[] counts=new int[4];
+     long r=1;
+     int used=0;
+     for(int i=from, lim=min(from+len, array.length); i<lim; i++){
+         final int num=baseNumber(array[i]);
+         if(num>=0){
+             counts[num]++;
+             used++;
+             r=advance(r, used, counts[num]);
+             if(r<0){return MAX;}//overflow
+         }
+     }
+     return r;
+ }
+
+ /** Like {@link #prefixForInfo(byte[], long, int)} with a target of 2^bits, starting at index 0. */
+ public static int prefixForInfoBits(final byte[] array, final int bits){assert(bits>=0 && bits<63);return prefixForInfo(array, 1L<<bits, 0);}
+ /** Like {@link #prefixForInfo(byte[], long, int)} with a target of 2^bits. */
+ public static int prefixForInfoBits(final byte[] array, final int bits, final int from){assert(bits>=0 && bits<63);return prefixForInfo(array, 1L<<bits, from);}
+ /** Like {@link #prefixForInfo(byte[], long, int)}, starting at index 0. */
+ public static int prefixForInfo(final byte[] array, final long info, final int from){
+     assert(info>=0);
+     final int[] counts=new int[4];
+     long r=1;
+     int used=0;
+     int i=from;
+     for(; i<array.length && r<info; i++){
+         final int num=baseNumber(array[i]);
+         if(num>=0){
+             counts[num]++;
+             used++;
+             r=advance(r, used, counts[num]);
+             if(r<0){
+                 //The true value exceeds Long.MAX_VALUE, which is at least info,
+                 //so the base at index i is the one that completes the prefix.
+                 return i+1;
+             }
+         }
+     }
+     return r<info ? -1 : i;
+ }
+
+ /**
+  * Finds the shortest run of the array, starting at index 0, whose information content reaches info.
+  * @param array Sequence bytes
+  * @param info Target information content, as a raw count
+  * @return See {@link #prefixForInfo(byte[], long, int)}
+  */
+ public static int prefixForInfo(final byte[] array, final long info){return prefixForInfo(array, info, 0);}
+
+ /**
+  * Maps a sequence byte to 0-3, or -1 when it is not a base.
+  * Java bytes are signed, so non-ASCII bytes are negative and must not be used as an index.
+  */
+ private static int baseNumber(final byte b){
+     return b<0 ? -1 : baseToNumber[b];
+ }
+
+ /**
+  * Extends the ordering count by one base: returns r*used/count.
+  * r*used is always divisible by count because both r and the result are multinomial
+  * coefficients, so the division is split to avoid overflowing the intermediate product.
+  * @param r Ordering count before this base (at least 1)
+  * @param used Number of bases counted so far, including this one
+  * @param count Occurrences of this base so far, including this one
+  * @return The new ordering count, or -1 if it exceeds Long.MAX_VALUE
+  */
+ private static long advance(final long r, final int used, final int count){
+     final long quotient=r/count;
+     final long tail=((r%count)*used)/count;
+     if(quotient>(MAX-tail)/used){return -1;}
+     return quotient*used+tail;
+ }
+
+ private static final byte[] numberToBase={
+     'A','C','G','T','N'
+ };
+
+ /** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', -1 otherwise */
+ public static final byte[] baseToNumber=new byte[128];
+
+ static{
+     Arrays.fill(baseToNumber, (byte)-1);
+     for(int i=0; i<numberToBase.length; i++){
+         char x=(char)numberToBase[i];
+         if(x=='A' || x=='C' || x=='G' || x=='T'){
+             baseToNumber[x]=(byte)i;
+             baseToNumber[Character.toLowerCase(x)]=(byte)i;
+         }
+     }
+     //U is treated as T
+     baseToNumber['U']=3;
+     baseToNumber['u']=3;
+ }
+
+ private static final int min(int x, int y){return x<y ? x : y;}
+ private static final int max(int x, int y){return x>y ? x : y;}
+
+ /** Saturation value returned when the ordering count does not fit in a long */
+ private static final long MAX=Long.MAX_VALUE;
+ private static final double invlog2=1.0/Math.log(2);
+}
diff --git a/current/jgi/KmerCountExact.java b/current/jgi/KmerCountExact.java
new file mode 100755
index 0000000..9a973f6
--- /dev/null
+++ b/current/jgi/KmerCountExact.java
@@ -0,0 +1,458 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import kmer.AbstractKmerTableSet;
+import kmer.DumpThread;
+import kmer.KmerTableSet;
+import stream.FastaReadInputStream;
+import ukmer.KmerTableSetU;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import assemble.Shaver;
+import assemble.Tadpole;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 22, 2013
+ *
+ */
+public class KmerCountExact {
+
+ /**
+  * Command line entry point: handles config and help flags, builds the counter
+  * while timing its initialization, and then runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+
+     args=Parser.parseConfig(args);
+     final boolean helpRequested=Parser.parseHelp(args, true);
+     if(helpRequested){
+         printOptions();
+         System.exit(0);
+     }
+
+     //The first timer covers the whole run; the second covers construction only.
+     final Timer totalTimer=new Timer();
+     final Timer initTimer=new Timer();
+     totalTimer.start();
+     initTimer.start();
+
+     final KmerCountExact counter=new KmerCountExact(args);
+     initTimer.stop();
+     outstream.println("Initialization Time: \t"+initTimer);
+
+     counter.process(totalTimer);
+ }
+
+ /**
+ * Display usage information.
+ * Only a pointer to the shellscript is printed, to System.err;
+ * the detailed help text below is commented out and never shown.
+ */
+ private static void printOptions(){
+ System.err.println("Please consult the shellscript for usage information.");
+// outstream.println("Syntax:\n");
+// outstream.println("\njava -ea -Xmx20g -cp <path> jgi.KmerCountExact in=<input file>");
+// outstream.println("\nOptional flags:");
+// outstream.println("in=<file> \tThe 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in.");
+// outstream.println("in2=<file> \tUse this if 2nd read of pairs are in a different file.");
+// outstream.println("out=<file> \tDump kmers and counts to this file.");
+// outstream.println("");
+// outstream.println("threads=auto \t(t) Set number of threads to use; default is number of logical processors.");
+// outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file.");
+// outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed.");
+// outstream.println("interleaved=auto \t(int) If true, forces fastq input to be paired and interleaved.");
+// outstream.println("k=28 \tKmer length used for finding contaminants. Contaminants shorter than k will not be found.");
+// outstream.println("minavgquality=0 \t(maq) Reads with average quality (before trimming) below this will be discarded.");
+// outstream.println("touppercase=f \t(tuc) Change all letters in reads and reference to upper-case.");
+// outstream.println("qtrim=f \tTrim read ends to remove bases with quality below minq. Performed AFTER looking for kmers. ");
+// outstream.println(" \tValues: t (trim both ends), f (neither end), r (right end only), l (left end only).");
+// outstream.println("minq=4 \tTrim quality threshold.");
+// outstream.println("minlength=2 \t(ml) Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter.");
+// outstream.println("ziplevel=2 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("rcomp=t \tLook for reverse-complements of kmers also.");
+// outstream.println("forest=t \tUse HashForest data structure");
+// outstream.println("table=f \tUse KmerTable data structure");
+// outstream.println("array=f \tUse HashArray data structure");
+ }
+
+
+ /**
+ * Constructor.
+ * Sets global I/O defaults, parses the arguments into instance and static
+ * fields, checks that the kmer output can be written, and creates the kmer
+ * table set (KmerTableSet for k up to 31, KmerTableSetU above that).
+ * @param args Command line arguments
+ * @throws RuntimeException On an unknown parameter, or when the kmer output
+ * file cannot be written with the current overwrite setting
+ */
+ public KmerCountExact(String[] args){
+ //NOTE(review): outstream already defaults to System.err, so this reassignment has no visible effect here.
+ for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}}
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ /* Set global defaults */
+ ReadWrite.ZIPLEVEL=2;
+ ReadWrite.USE_UNPIGZ=true;
+
+ ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+
+ /* Initialize local variables with defaults */
+ Parser parser=new Parser();
+ //setOut and the use*_ flags are assigned during parsing but not read again in this constructor.
+ boolean setOut=false;
+ boolean useForest_=false, useTable_=false, useArray_=true;
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ //Strip leading hyphens, except from one of the first two arguments when it contains a '.' and names an existing file.
+ //NOTE(review): charAt(0) throws if the key is empty, e.g. for an argument of "" or "-".
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("out") || a.equals("out1") || a.equals("outkmers") || a.equals("outk") || a.equals("dump")){
+ outKmers=b;
+ setOut=true;
+ }else if(a.equals("mincounttodump") || a.equals("mindump") || a.equals("mincount")){
+ minToDump=Integer.parseInt(b);
+ }else if(a.equals("dumpthreads")){
+ DumpThread.NUM_THREADS=Integer.parseInt(b);
+ }else if(a.equals("hist") || a.equals("khist")){
+ outHist=b;
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("forest")){
+ //forest, table and array are deprecated: with assertions enabled these flags abort the run.
+ useForest_=Tools.parseBoolean(b);
+ if(useForest_){useTable_=useArray_=false;}
+ assert(false) : a+" is deprecated.";
+ }else if(a.equals("table")){
+ useTable_=Tools.parseBoolean(b);
+ if(useTable_){useForest_=useArray_=false;}
+ assert(false) : a+" is deprecated.";
+ }else if(a.equals("array")){
+ useArray_=Tools.parseBoolean(b);
+ if(useArray_){useTable_=useForest_=false;}
+ assert(false) : a+" is deprecated.";
+ }else if(a.equals("threads") || a.equals("t")){
+ THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
+ }else if(a.equals("verbose")){
+ assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+// verbose=Tools.parseBoolean(b);
+ }else if(a.equals("shave")){
+ shave=Tools.parseBoolean(b);
+ }else if(a.equals("rinse")){
+ rinse=Tools.parseBoolean(b);
+ }else if(a.equals("shavedepth")){
+ shaveDepth=Integer.parseInt(b);
+ }else if(a.equals("histcolumns")){
+ histColumns=Integer.parseInt(b);
+ }else if(a.equals("histmax")){
+ histMax=Integer.parseInt(b);
+ }else if(a.equals("histheader")){
+ histHeader=Tools.parseBoolean(b);
+ }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+ //The flag is "nonzero only", so the stored value is its negation.
+ histZeros=!Tools.parseBoolean(b);
+ }else if(a.equals("minheight")){
+ minHeight=Long.parseLong(b);
+ }else if(a.equals("minvolume")){
+ minVolume=Long.parseLong(b);
+ }else if(a.equals("minwidth")){
+ minWidth=Integer.parseInt(b);
+ }else if(a.equals("minpeak")){
+ minPeak=Integer.parseInt(b);
+ }else if(a.equals("maxpeak")){
+ maxPeak=Integer.parseInt(b);
+ }else if(a.equals("maxpeakcount") || a.equals("maxpc") || a.equals("maxpeaks")){
+ maxPeakCount=Integer.parseInt(b);
+ }else if(a.equals("ploidy")){
+ ploidy=Integer.parseInt(b);
+ }else if(a.equals("peaks") || a.equals("peaksout")){
+ outPeaks=b;
+ }else if(a.equals("smooth") || a.equals("smoothe")){
+ smooth=Tools.parseBoolean(b);
+ }else if(a.equals("smoothradius") || a.equals("smootheradius")){
+ smoothRadius=Integer.parseInt(b);
+ }else if(a.equals("maxradius")){
+ CallPeaks.maxRadius=Integer.parseInt(b);
+ }else if(a.equals("progressivemult")){
+ CallPeaks.progressiveMult=Float.parseFloat(b);
+ }else if(KmerTableSet.isValidArgument(a)){
+ //Do nothing
+ //The table set is constructed from the full args array below - presumably it parses these flags itself; confirm in KmerTableSet.
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(outKmers!=null && !Tools.canWrite(outKmers, overwrite)){throw new RuntimeException("Output file "+outKmers+" already exists, and overwrite="+overwrite);}
+
+ assert(THREADS>0);
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Initial:");
+ Shared.printMemory();
+ outstream.println();
+ }
+
+// final int tableType=(useForest ? AbstractKmerTable.FOREST1D : useTable ? AbstractKmerTable.TABLE : useArray ? AbstractKmerTable.ARRAY1D : 0);
+ k=Tadpole.preparseK(args);
+
+ //Kmer lengths up to 31 use KmerTableSet; longer ones use KmerTableSetU.
+ if(k<=31){
+ tables=new KmerTableSet(args, 12);
+ }else{
+ tables=new KmerTableSetU(args, 12);
+ }
+ if(tables.prefilter){tables.minProbMain=false;}
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Counts kmers and writes the requested outputs.
+  * With more than one thread and both a kmer dump and a histogram or peaks
+  * output requested, the two writers run concurrently; otherwise they run
+  * one after the other on the calling thread.
+  * @param t Timer started by the caller; stopped here once all output is written
+  * @throws RuntimeException If an error state was recorded during processing
+  */
+ public void process(Timer t){
+
+     /* Check for output file collisions */
+     Tools.testOutputFiles(overwrite, append, false, outKmers, outHist, outPeaks);
+
+     /* Count kmers */
+     process2();
+
+     final boolean wantHist=(outHist!=null || outPeaks!=null);
+     final boolean wantKmers=(outKmers!=null);
+
+     if(THREADS>1 && wantHist && wantKmers){
+         final Timer writeTimer=new Timer();
+         writeTimer.start();
+         final Thread dumper=new DumpKmersThread();
+         final Thread histMaker=new MakeKhistThread();
+         dumper.start();
+         histMaker.start();
+         waitForTermination(dumper);
+         waitForTermination(histMaker);
+         writeTimer.stop();
+         outstream.println("Write Time: \t"+writeTimer);
+     }else{
+         if(wantHist){
+             makeKhist(outHist, outPeaks, histColumns, histMax, histHeader, histZeros, true, smooth);
+         }
+         if(wantKmers){
+             tables.dumpKmersAsBytes_MT(outKmers, minToDump, true);
+         }
+     }
+
+     /* Stop timer and calculate speed statistics */
+     t.stop();
+
+     /* Throw an exception if errors were detected */
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Blocks until the thread has terminated, printing and then ignoring interruptions. */
+ private static void waitForTermination(Thread thread){
+     while(thread.getState()!=Thread.State.TERMINATED){
+         try {
+             thread.join();
+         } catch (InterruptedException e) {
+             e.printStackTrace();
+         }
+     }
+ }
+
+
+ /**
+  * Fills the kmer tables from the input, optionally shaves and/or rinses them,
+  * and prints input, trimming, discard and kmer statistics to outstream.
+  */
+ public void process2(){
+
+     /* Start phase timer */
+     final Timer loadTimer=new Timer();
+
+     AbstractKmerTableSet.DISPLAY_STATS=false;
+
+     /* Fill tables with kmers */
+     tables.process(loadTimer);
+     errorState|=tables.errorState;
+
+     loadTimer.stop();
+     outstream.println("Input: \t"+tables.readsIn+" reads \t\t"+tables.basesIn+" bases.");
+
+     if(tables.qtrimLeft() || tables.qtrimRight()){
+         final String readPercent=String.format("%.2f",tables.readsTrimmed*100.0/tables.readsIn);
+         final String basePercent=String.format("%.2f",tables.basesTrimmed*100.0/tables.basesIn);
+         outstream.println("QTrimmed: \t"+tables.readsTrimmed+" reads ("+readPercent+"%) \t"+
+             tables.basesTrimmed+" bases ("+basePercent+"%)");
+     }
+     if(tables.minAvgQuality()>0){
+         final String readPercent=String.format("%.2f",tables.lowqReads*100.0/tables.readsIn);
+         final String basePercent=String.format("%.2f",tables.lowqBases*100.0/tables.basesIn);
+         outstream.println("Low quality discards: \t"+tables.lowqReads+" reads ("+readPercent+"%) \t"+
+             tables.lowqBases+" bases ("+basePercent+"%)");
+     }
+
+     final boolean shaving=(shave || rinse);
+     if(shaving){
+         kmersRemoved=shave(shave, rinse, shaveDepth);
+     }
+
+     outstream.println("\nFor K="+tables.kbig());
+     outstream.println("Unique Kmers: \t"+tables.kmersLoaded);
+     if(shaving){
+         outstream.println("After Shaving: \t"+(tables.kmersLoaded-kmersRemoved));
+     }
+     outstream.println("Load Time: \t"+loadTimer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Runs one Shaver pass for each depth from 1 to maxShaveDepth.
+  * @param shave Passed through to Shaver.shave
+  * @param rinse Passed through to Shaver.shave
+  * @param maxShaveDepth Number of passes
+  * @return Sum of the values returned by the passes
+  */
+ long shave(boolean shave, boolean rinse, int maxShaveDepth){
+     final Shaver shaver=Shaver.makeShaver(tables, THREADS);
+     long removed=0;
+
+     int pass=1;
+     while(pass<=maxShaveDepth){
+         outstream.println("\nShave("+pass+", "+maxShaveDepth+", "+pass+")");
+         removed+=shaver.shave(pass, maxShaveDepth, pass, 100, 100, shave, rinse);
+         pass++;
+     }
+
+     System.err.println();
+     return removed;
+ }
+
+ private void makeKhist(String fname, String peaks, int cols, int max, boolean printHeader, boolean printZeros, boolean printTime, boolean smooth){
+ if(fname==null && peaks==null){return;}
+
+ long[] array=tables.makeKhist(fname, cols, max, printHeader, printZeros, printTime, smooth, smoothRadius);
+
+ if(peaks!=null){
+ CallPeaks.printClass=false;
+ ArrayList<String> args=new ArrayList<String>();
+ if(!smooth && smoothRadius>0){
+ args.add("smoothradius="+smoothRadius);
+ args.add("smoothprogressive=t");
+ }
+ CallPeaks.printPeaks(array, peaks, overwrite, minHeight, minVolume, minWidth, Tools.max(tables.filterMax()+2, minPeak), maxPeak, maxPeakCount, k, ploidy, args);
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Writes the kmer dump on its own thread so that it can run alongside MakeKhistThread. */
+ private class DumpKmersThread extends Thread {
+
+     DumpKmersThread(){}
+
+     @Override
+     public void run(){
+         tables.dumpKmersAsBytes_MT(outKmers, minToDump, false);
+     }
+
+ }
+
+ /** Writes the histogram and peaks on their own thread so that they can run alongside DumpKmersThread. */
+ private class MakeKhistThread extends Thread {
+
+     MakeKhistThread(){}
+
+     @Override
+     public void run(){
+         makeKhist(outHist, outPeaks, histColumns, histMax, histHeader, histZeros, false, smooth);
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Hold kmers. */
+ private final AbstractKmerTableSet tables;
+
+ private boolean shave=false;
+ private boolean rinse=false;
+ private int shaveDepth=1;
+
+ private long kmersRemoved=0;
+
+ /** Kmer count output file */
+ private String outKmers=null;
+ /** Histogram output file */
+ private String outHist=null;
+ /** Histogram peak output file */
+ private String outPeaks=null;
+
+ private int smoothRadius=1;
+ private boolean smooth=false;
+
+ private boolean errorState=false;
+
+ /** Histogram columns */
+ private int histColumns=2;
+ /** Histogram rows */
+ private int histMax=100000;
+ /** Print a histogram header */
+ private boolean histHeader=true;
+ /** Histogram show rows with 0 count */
+ private boolean histZeros=false;
+
+ private long minHeight=2;
+ private long minVolume=2;
+ private int minWidth=2;
+ private int minPeak=2;
+ private int maxPeak=Integer.MAX_VALUE;
+ private int maxPeakCount=12;
+
+ private int ploidy=-1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** min kmer count to dump to text */
+ private int minToDump=1;
+
+ final int k;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print messages to this stream */
+ private static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Verbose messages */
+ public static final boolean verbose=false;
+ /** Number of ProcessThreads */
+ public static int THREADS=Shared.threads();
+
+
+}
diff --git a/current/jgi/KmerCoverage.java b/current/jgi/KmerCoverage.java
new file mode 100755
index 0000000..5b531bb
--- /dev/null
+++ b/current/jgi/KmerCoverage.java
@@ -0,0 +1,1219 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicLong;
+
+import bloom.KCountArray;
+import bloom.KmerCount7MTA;
+import bloom.KmerCountAbstract;
+
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 11, 2012
+ *
+ */
+public class KmerCoverage {
+
+ public static void main(String[] args){ //Entry point: parses flags, sizes and builds the kmer count table, then streams the reads through count(...) and prints statistics
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ //An empty argument list is rejected outright
+ if(args.length<1){throw new RuntimeException("No parameters.");}
+ //A first argument with no '=' past position 0 is taken as the positional input file
+ String in1=(args[0].indexOf("=")>0 ? null : args[0]);
+ String in2=(in1!=null && args.length>1 ? args[1] : null); //NOTE(review): args[1] is taken as in2 here and is also visited by the flag loop below, which starts at index 1 - confirm this is intended
+ if(in2!=null && "null".equalsIgnoreCase(in2)){in2=null;}
+ //Fail fast on positional inputs that are not regular files; names containing a comma are not checked here
+ {
+ if(in1!=null && !in1.contains(",")){
+ File f=new File(in1);
+ if(!f.exists() || !f.isFile()){throw new RuntimeException(in1+" does not exist.");}
+ }
+ if(in2!=null && !in2.contains(",")){
+ File f=new File(in2);
+ if(!f.exists() || !f.isFile()){throw new RuntimeException(in2+" does not exist.");}
+ if(in1.equalsIgnoreCase(in2)){
+ throw new RuntimeException("Both input files are the same.");
+ }
+ }
+ }
+ //Defaults; each can be overridden by a flag in the loop below
+ Parser parser=new Parser();
+ KmerCountAbstract.minQuality=4;
+ KmerCountAbstract.minProb=0.1f;
+
+ int k=31;
+ int cbits=16;
+ int gap=0;
+ int hashes=4;
+// int matrixbits=-1;
+ long cells=-1;
+ long maxReads=-1;
+ int buildpasses=1;
+ long tablereads=-1; //How many reads to process when building the hashtable
+ int buildStepsize=4;
+ String output=null;
+ int prehashes=-1;
+ long precells=-1;
+ String histFile=null;
+ int threads=-1;
+
+ int minq=KmerCountAbstract.minQuality;
+ KmerCountAbstract.CANONICAL=true;
+
+ boolean auto=true;
+
+ FastaReadInputStream.TARGET_READ_LEN=Integer.MAX_VALUE;
+
+ List<String> extra=null;
+
+ long memory=Runtime.getRuntime().maxMemory();
+ long tmemory=Runtime.getRuntime().totalMemory();
+// assert(false) : memory+", "+tmemory;
+ //Parse key=value flags; index 0 is skipped when it was consumed as the positional in1
+ for(int i=(in1==null ? 0 : 1); i<args.length; i++){
+ if(args[i]==null){args[i]="null";}
+ final String arg=args[i];
+ final String[] split=arg.split("="); //Only split[0] and split[1] are used, so a value that itself contains '=' is truncated
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ //Branch order matters: many keys are matched with startsWith, so longer keys must be tested before their prefixes
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("k") || a.equals("kmer")){
+ k=Integer.parseInt(b);
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.startsWith("bits") ||a.startsWith("cbits") || a.startsWith("cellbits")){
+ cbits=Integer.parseInt(b);
+ }else if(a.startsWith("histlen") ||a.startsWith("histogramlen")){
+ HIST_LEN_PRINT=Tools.min(Integer.MAX_VALUE, Long.parseLong(b)+1);
+ }else if(a.startsWith("gap")){
+ gap=Integer.parseInt(b);
+ }else if(a.startsWith("matrixbits")){
+ int matrixbits=Integer.parseInt(b);
+ assert(matrixbits<63);
+ cells=1L<<matrixbits;
+ }else if(a.startsWith("cells")){
+ cells=Tools.parseKMG(b);
+ }else if(a.startsWith("precells") || a.startsWith("prefiltercells")){
+ precells=Tools.parseKMG(b);
+ prefilter=prefilter || precells!=0;
+ }else if(a.startsWith("minq")){
+ minq=Byte.parseByte(b); //Only the local minq changes; the settings printout below reads KmerCountAbstract.minQuality
+ }else if(a.equals("zerobin")){
+ ZERO_BIN=Tools.parseBoolean(b);
+ }else if(a.startsWith("minmedian")){
+ MIN_MEDIAN=Integer.parseInt(b);
+ }else if(a.startsWith("minaverage")){
+ MIN_AVERAGE=Integer.parseInt(b);
+ }else if(a.startsWith("minprob")){
+ KmerCountAbstract.minProb=Float.parseFloat(b);
+ }else if(a.startsWith("hashes")){
+ hashes=Integer.parseInt(b);
+ }else if(a.startsWith("prehashes") || a.startsWith("prefilterhashes")){
+ prehashes=Integer.parseInt(b);
+ prefilter=prefilter || prehashes!=0;
+ }else if(a.equals("prefilter")){
+ prefilter=Tools.parseBoolean(b);
+ }else if(a.startsWith("stepsize") || a.startsWith("buildstepsize")){
+ buildStepsize=Integer.parseInt(b);
+ }else if(a.startsWith("passes") || a.startsWith("buildpasses")){
+ buildpasses=Integer.parseInt(b);
+ }else if(a.equals("printcoverage")){
+ OUTPUT_ATTACHMENT=Tools.parseBoolean(b);
+ }else if(a.equals("threads") || a.equals("t")){
+ threads=Integer.parseInt(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.startsWith("tablereads") || a.startsWith("buildreads")){
+ tablereads=Tools.parseKMG(b);
+ }else if(a.startsWith("out")){ //Any remaining key that begins with "out" sets the read output
+ output=b;
+ }else if(a.startsWith("hist")){ //Reached only by keys not already matched by "histlen"/"histogramlen" above
+ histFile=b;
+ }else if(a.startsWith("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.startsWith("datainheader") || a.startsWith("datainname") || a.startsWith("useheader")){
+ USE_HEADER=Tools.parseBoolean(b);
+ }else if(a.equals("ordered") || a.equals("ord")){
+ ordered=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("auto") || a.equals("automatic")){
+ auto=Tools.parseBoolean(b);
+ }else if(a.equals("samplewhenreadingtable") || a.equals("sampleoutput")){
+ DONT_SAMPLE_OUTPUT=!Tools.parseBoolean(b);
+ }else if(a.equals("dontsamplewhenreadingtable") || a.equals("dontsampleoutput")){
+ DONT_SAMPLE_OUTPUT=Tools.parseBoolean(b);
+ }else if(a.startsWith("kmersample")){
+ kmersamplerate=Integer.parseInt(b);
+// KmerCountAbstract.kmersamplerate=kmersamplerate;
+ }else if(a.startsWith("sample") || a.startsWith("readsample")){
+ readsamplerate=Integer.parseInt(b);
+// KmerCountAbstract.readsamplerate=readsamplerate;
+ }else if(a.startsWith("canonical")){
+ CANONICAL=KmerCountAbstract.CANONICAL=Tools.parseBoolean(b);
+ }else if(a.startsWith("fixspikes")){
+ FIX_SPIKES=Tools.parseBoolean(b);
+ }else if(a.equals("printzerocoverage") || a.equals("pzc")){
+ PRINT_ZERO_COVERAGE=Tools.parseBoolean(b);
+ }else if(a.equals("removeduplicatekmers") || a.equals("rdk")){
+ KmerCountAbstract.KEEP_DUPLICATE_KMERS=!Tools.parseBoolean(b);
+ }else if(a.startsWith("extra")){
+ if(b!=null && !b.equalsIgnoreCase("null")){
+ if(new File(b).exists()){ //An existing path is used whole, even if it contains commas
+ extra=new ArrayList<String>();
+ extra.add(b);
+ }else{
+ extra=Arrays.asList(b.split(","));
+ }
+ }
+ }else{
+ throw new RuntimeException("Unknown parameter "+arg);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+ if(k>31){CANONICAL=KmerCountAbstract.CANONICAL=false;} //Canonical mode is switched off for k>31
+ assert(CANONICAL==KmerCountAbstract.CANONICAL);
+
+// if(output!=null && reads1.contains(",")){
+// throw new RuntimeException("\nLists of input files can only be used with histogram output, not full output.\n" +
+// "Please set output=null or move the extra input files to 'extra=file1,file2,...fileN'");
+// }
+ //Histogram sizing: lengths are clamped to what a cbits-wide counter can represent
+ {
+ if(histFile==null){
+ //No histogram file was requested; USE_HISTOGRAM keeps its current value
+
+ }else{
+ USE_HISTOGRAM=true;
+ }
+
+ final int maxCount=(int)(cbits>16 ? Integer.MAX_VALUE : (1L<<cbits)-1); //2^cbits-1 for cbits<=16, otherwise Integer.MAX_VALUE
+ assert(maxCount>0);
+ HIST_LEN_PRINT=Tools.max(1, Tools.min(HIST_LEN_PRINT, maxCount));
+ assert(HIST_LEN_PRINT<=Integer.MAX_VALUE) : HIST_LEN_PRINT+", "+Integer.MAX_VALUE;
+ HIST_LEN=(int)Tools.min(maxCount, Tools.max(HIST_LEN_PRINT, HIST_LEN));
+
+ histogram_total=new long[HIST_LEN];
+ }
+ //Every extra file must exist and must differ from in1/in2 (the latter only enforced when assertions are enabled)
+ if(extra!=null){
+ for(String s : extra){
+ File f=new File(s);
+ if(!f.exists() || !f.isFile()){throw new RuntimeException(s+" does not exist.");}
+ assert(!s.equalsIgnoreCase(in1) && (in2==null || !s.equalsIgnoreCase(in2))) : "\nInput file "+s+" should not be included as an extra file.\n";
+ }
+ }
+
+// outstream.println("ForceInterleaved = "+FASTQ.FORCE_INTERLEAVED);
+
+// assert(false) : reads1+", "+reads2+", "+output;
+// if(FASTQ.FORCE_INTERLEAVED && in2==null){
+// outstream.println()
+// }
+ //Thread count: an explicit positive value wins; otherwise all logical processors in auto mode, else 8
+ if(threads<=0){
+ if(auto){THREADS=Data.LOGICAL_PROCESSORS;}
+ else{THREADS=8;}
+ }else{
+ THREADS=threads;
+ }
+// KmerCountAbstract.THREADS=Tools.min(THREADS,6);
+ KmerCountAbstract.THREADS=THREADS;
+
+// System.err.println("THREADS="+THREADS+", KmerCountAbstract.THREADS="+KmerCountAbstract.THREADS);
+ //Auto sizing: use the larger of 73% of (max heap - 96MB) and 45% of max heap, minus the per-thread histogram space
+ if(auto && cells==-1){
+ final long usable=(long)Tools.max(((memory-96000000)*.73), memory*0.45);
+ long mem=usable-(USE_HISTOGRAM ? (HIST_LEN*8*(THREADS+1)) : 0); //NOTE(review): if HIST_LEN is an int this product is computed in int arithmetic before widening - confirm it cannot overflow for a large histlen
+ if(buildpasses>1){mem/=2;}
+ cells=(mem*8)/cbits;
+//
+// long tablebytes=((1L<<matrixbits)*cbits)/8;
+// if(tablebytes*3<usable){matrixbits++;}
+// outstream.println(tablebytes/1000000+", "+usable/1000000+", "+(tablebytes*3)/1000000);
+
+ }else if(cells==-1){
+ cells=1L<<34; //Fixed default when auto sizing is off and no size was given
+ }
+ //With a prefilter and no explicit prefilter size, 35% of the table bits are moved to the prefilter (2 bits per prefilter cell)
+ if(prefilter){
+ if(precells<1){
+ long totalbits=cells*cbits;
+ long prebits=(long)(totalbits*0.35);
+ precells=prebits/2;
+ cells=(totalbits-prebits+cbits-1)/cbits; //Steal memory from cell allocation
+ }
+ if(prehashes<1){
+ prehashes=(hashes+1)/2;
+ }
+ }
+ //Echo the effective settings
+ {
+ outstream.println("\nSettings:");
+ outstream.println("threads: \t"+THREADS);
+ outstream.println("k: \t"+k);
+ outstream.println("passes: \t"+buildpasses);
+ outstream.println("bits per cell: \t"+cbits);
+// outstream.println("matrixbits: \t"+matrixbits);
+ outstream.println("cells: \t"+Tools.toKMG(cells));
+ outstream.println("hashes: \t"+hashes);
+ if(prefilter){
+ outstream.println("prefilter bits: \t"+2);
+// outstream.println("matrixbits: \t"+matrixbits);
+ outstream.println("prefilter cells: \t"+(precells>0 && prehashes>0 ? Tools.toKMG(precells) : "?"));
+ outstream.println("prefilter hashes: \t"+(precells>0 && prehashes>0 ? ""+prehashes : "?"));
+ }
+ outstream.println("base min quality: \t"+KmerCountAbstract.minQuality);
+ outstream.println("kmer min prob: \t"+KmerCountAbstract.minProb);
+
+ outstream.println();
+ outstream.println("remove duplicates:\t"+!KmerCountAbstract.KEEP_DUPLICATE_KMERS);
+ outstream.println("fix spikes: \t"+FIX_SPIKES);
+ if(USE_HISTOGRAM && HIST_LEN>0){
+ outstream.println("histogram length: \t"+(USE_HISTOGRAM ? HIST_LEN : 0));
+ }
+ if(histFile!=null){
+ outstream.println("print zero cov: \t"+PRINT_ZERO_COVERAGE);
+ }
+
+ outstream.println();
+ }
+ //Without a prefilter, never allocate more cells than 4^k; this happens after the printout, so the printed cell count may be larger
+ if(!prefilter && k<32 && cells>(1L<<(2*k))){cells=(1L<<(2*k));}
+ assert(cells>0);
+
+// KmerCountAbstract.THREADS=Tools.max(THREADS/2, KmerCountAbstract.THREADS); //Seems like 4 is actually optimal...
+
+ FastaReadInputStream.MIN_READ_LEN=k;
+ //t times the whole run, ht only the table construction
+ Timer t=new Timer();
+ Timer ht=new Timer();
+ t.start();
+ ht.start();
+ KCountArray kca;
+ KCountArray prefilterArray=null;
+ outstream.println();
+ if(prefilter){
+ prefilterArray=KmerCount7MTA.makeKca(in1, in2, extra, k, 2, gap, precells, prehashes, minq, true, false, tablereads, 1, buildStepsize, 1, 1, null, 0);
+ outstream.println("Made prefilter: \t"+prefilterArray.toShortString(prehashes));
+ }
+ kca=KmerCount7MTA.makeKca(in1, in2, extra, k, cbits, gap, cells, hashes, minq, true, false, tablereads, buildpasses, buildStepsize, 2, 2,
+ prefilterArray, (prefilterArray==null ? 0 : prefilterArray.maxValue));
+ ht.stop();
+
+ outstream.println("Made hash table: \t"+kca.toShortString(hashes));
+ //Estimate the number of unique kmers from the table(s)
+ long estUnique;
+ outstream.println();
+ if(prefilterArray!=null){
+ int lim1=prefilterArray.maxValue, lim2=prefilterArray.maxValue+1;
+ double a=prefilterArray.estimateUniqueKmers(prehashes);
+ double b=kca.estimateUniqueKmers(hashes, lim2);
+ a=a-b; //a becomes the value printed below as depth 1..lim1
+ if(CANONICAL){
+// a=(a*KCountArray.canonMask)/(KCountArray.canonMask+1);
+// b=(b*KCountArray.canonMask)/(KCountArray.canonMask+1);
+ }else{
+ a/=2; //Non-canonical mode halves both estimates
+ b/=2;
+ }
+ estUnique=((long)((a+b)));
+ outstream.println("Estimated kmers of depth 1-"+lim1+": \t"+(long)a);
+ outstream.println("Estimated kmers of depth "+lim2+"+ : \t"+(long)b);
+ }else{
+// double est=kca.cells*(1-Math.pow(1-Math.sqrt(kca.usedFraction()), 1.0/hashes));
+// double est=kca.cells*(1-Math.pow(1-kca.usedFraction(), 1.0/hashes));
+ double est=kca.estimateUniqueKmers(hashes);
+// System.out.println("Used cells: "+kca.cellsUsed(1));
+ if(CANONICAL){
+// est=(est*KCountArray.canonMask)/(KCountArray.canonMask+1);
+ }else{
+ est/=2;
+ }
+ estUnique=((long)((est)));
+
+ }
+ outstream.println("Estimated unique kmers: \t"+estUnique);//+", or "+estUnique+" counting forward kmers only.");
+// outstream.println("(Includes forward and reverse kmers)");
+ outstream.println();
+ outstream.println("Table creation time:\t\t"+ht);//+" \t"+String.format("%.2f", totalBases*1000000.0/(ht.elapsed))+" kb/sec");
+
+ long bases=0;
+ //A comma in in1 that is not part of an existing file name selects the multi-file overload of count
+ if(in1!=null && in1.contains(",") && !new File(in1).exists()){
+ String[] list1=in1.split(",");
+ String[] list2=(in2==null ? null : in2.split(","));
+ bases=count(list1, list2, kca, k, maxReads, output, ordered, overwrite, histFile, estUnique);
+ }else{
+ bases=count(in1, in2, kca, k, maxReads, output, ordered, overwrite, histFile, estUnique);
+ }
+ printTopology();
+
+ t.stop();
+ outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec");
+
+ }
+
+
+ /**
+  * Prints the depth-topology summary (spikes, peaks, valleys, slopes, flats) to System.err,
+  * one line per category with its percentage of the total and its raw count.
+  * Each counter is read exactly once, so the percentage and the count on a line always agree.
+  * When nothing has been classified yet (total of 0) every percentage is reported as 0
+  * instead of NaN.
+  * The denominator is the sum of all five counters, exactly as before; analyzeSpikes counts
+  * every spike as a peak too, so spikes contribute twice to that total.
+  */
+ public static void printTopology(){
+     final long sp=spikes.get();
+     final long pe=peaks.get();
+     final long va=valleys.get();
+     final long sl=slopes.get();
+     final long fl=flats.get();
+
+     final long total=pe+sp+fl+va+sl;
+     //Avoid 100.0/0 -> Infinity, which would turn every percentage into NaN
+     final double mult=(total>0 ? 100.0/total : 0.0);
+
+     final double dsp=mult*sp;
+     final double dpe=mult*pe;
+     final double dva=mult*va;
+     final double dsl=mult*sl;
+     final double dfl=mult*fl;
+
+     //Values below 10 get one leading space so the columns line up
+     System.err.println("\nDepth Topology\t");
+     System.err.println("Spikes: \t\t\t"+(dsp<10 ? " " : "")+String.format("%.3f%% \t%d",dsp,sp));
+     System.err.println("Peaks: \t\t\t"+(dpe<10 ? " " : "")+String.format("%.3f%% \t%d",dpe,pe));
+     System.err.println("Valleys: \t\t\t"+(dva<10 ? " " : "")+String.format("%.3f%% \t%d",dva,va));
+     System.err.println("Slopes: \t\t\t"+(dsl<10 ? " " : "")+String.format("%.3f%% \t%d",dsl,sl));
+     System.err.println("Flats: \t\t\t"+(dfl<10 ? " " : "")+String.format("%.3f%% \t%d",dfl,fl));
+ }
+
+
+ /**
+  * Processes a single input file (or file pair): opens the read input stream, optionally opens
+  * a read output stream, runs calcCoverage over the reads and closes both streams.
+  * @param reads1 Primary input file.
+  * @param reads2 Mate input file, or null.
+  * @param kca Kmer count table handed to calcCoverage.
+  * @param k Kmer length.
+  * @param maxReads Read limit handed to the input stream and to calcCoverage.
+  * @param output Output file name, or null for no read output. A '#' is replaced by 1 for the
+  * first file and, when the input is paired, by 2 for the second; paired input without '#'
+  * is written interleaved.
+  * @param ordered Selects the output buffer length and is passed on to the output format.
+  * @param overwrite Passed on to the output format and to calcCoverage.
+  * @param histFile Histogram file name handed to calcCoverage.
+  * @param estUnique Unique-kmer estimate handed to calcCoverage.
+  * @return The value returned by calcCoverage (total bases).
+  */
+ public static long count(String reads1, String reads2, KCountArray kca, int k, long maxReads,
+         String output, boolean ordered, boolean overwrite, String histFile, long estUnique) {
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+         cris.start(); //4567
+     }
+
+     assert(cris!=null) : reads1;
+
+     if(fileIO.FileFormat.hasFastaExtension(reads1)){
+         ADD_CARROT=false;
+     }
+
+     if(verbose){System.err.println("Started cris");}
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     final ConcurrentReadOutputStream ros;
+     if(output!=null){
+         final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS));
+
+         String out1=output.replaceFirst("#", "1");
+         String out2=null;
+
+         if(cris.paired()){
+             if(output.contains("#")){
+                 out2=output.replaceFirst("#", "2");
+             }else{
+                 outstream.println("Writing interleaved.");
+             }
+         }
+
+         //Neither output may be one of the inputs. The first check used to compare out1 with
+         //reads1 twice and never with reads2; equalsIgnoreCase(null) is false, so a missing
+         //mate file is handled without an extra null test.
+         assert(!out1.equalsIgnoreCase(reads1) && !out1.equalsIgnoreCase(reads2));
+         assert(out2==null || (!out2.equalsIgnoreCase(reads1) && !out2.equalsIgnoreCase(reads2)));
+
+         FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, OUTPUT_ATTACHMENT ? "attachment" : null, true, overwrite, append, ordered);
+         FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, OUTPUT_ATTACHMENT ? "attachment" : null, true, overwrite, append, ordered);
+         ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
+
+         ros.start();
+         outstream.println("Started output threads.");
+     }else{
+         ros=null;
+     }
+
+     long bases=calcCoverage(cris, kca, k, maxReads, ros, histFile, overwrite, estUnique);
+
+     ReadWrite.closeStreams(cris, ros);
+     if(verbose){System.err.println("Closed stream");}
+     return bases;
+ }
+
+
+ /**
+  * Multi-file variant of count: runs calcCoverage over every entry of list1 (paired with the
+  * entry of list2 at the same index, when there is one) and sums the results.
+  * The output name is used whole if it names an existing file, otherwise it is split on commas.
+  * With a single output name one output stream is opened for the first input and reused for
+  * the rest; with several names a new stream is opened per input.
+  * @param list1 Primary input files.
+  * @param list2 Mate input files, or null.
+  * @param kca Kmer count table handed to calcCoverage.
+  * @param k Kmer length.
+  * @param maxReads Read limit for each input stream.
+  * @param output Output name(s), or null for no read output; '#' expands to 1 and 2.
+  * @param ordered Selects the output buffer length and is passed on to the output format.
+  * @param overwrite Passed on to the output format and to calcCoverage.
+  * @param histFile Histogram file name handed to calcCoverage.
+  * @param estUnique Unique-kmer estimate handed to calcCoverage.
+  * @return Sum of the calcCoverage results over all inputs.
+  */
+ public static long count(String[] list1, String[] list2, KCountArray kca, int k, long maxReads,
+         String output, boolean ordered, boolean overwrite, String histFile, long estUnique) {
+
+     final int bufferLen=(ordered ? Tools.max(16, 2*THREADS) : 8);
+
+     String[] out1=null, out2=null;
+     if(output!=null){
+         out1=(new File(output).exists() ? new String[] {output} : output.split(","));
+         out2=new String[out1.length];
+         for(int i=0; i<out1.length; i++){
+             final String pattern=out1[i];
+             if(pattern.contains("#")){
+                 out2[i]=pattern.replaceFirst("#", "2");
+                 out1[i]=pattern.replaceFirst("#", "1");
+             }
+         }
+     }
+
+     ConcurrentReadOutputStream ros=null;
+     long totalBases=0;
+
+     for(int x=0; x<list1.length; x++){
+
+         if(out1!=null){
+             final boolean reuseStream=(x>0 && out1.length<=1);
+             if(reuseStream){
+                 ros.resetNextListID();
+             }else{
+                 if(ros!=null){
+                     ReadWrite.closeStream(ros);
+                 }
+
+                 final String attachment=(OUTPUT_ATTACHMENT ? "attachment" : null);
+                 FileFormat ffout1=FileFormat.testOutput(out1[x], FileFormat.FASTQ, attachment, true, overwrite, append, ordered);
+                 FileFormat ffout2=FileFormat.testOutput(out2[x], FileFormat.FASTQ, attachment, true, overwrite, append, ordered);
+                 ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, bufferLen, null, true);
+
+                 ros.start();
+                 outstream.println("Started output threads.");
+             }
+         }
+
+         final String reads1=list1[x];
+         final String reads2=(list2!=null && x<list2.length ? list2[x] : null);
+
+         final FileFormat ffin1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
+         final FileFormat ffin2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
+         final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2);
+         if(verbose){System.err.println("Started cris");}
+         cris.start(); //4567
+         if(ffin1.fasta()){ADD_CARROT=false;}
+
+         final boolean paired=cris.paired();
+         if(verbose){System.err.println("Paired: "+paired);}
+
+         totalBases+=calcCoverage(cris, kca, k, maxReads, ros, histFile, overwrite, estUnique);
+
+         ReadWrite.closeStream(cris);
+         if(verbose){System.err.println("Closed stream");}
+     }
+
+     //Closing the output stream also waits for its threads
+     ReadWrite.closeStream(ros);
+
+     return totalBases;
+ }
+
+
+
+ public static long calcCoverage(ConcurrentReadInputStream cris, KCountArray kca, int k, long maxReads, ConcurrentReadOutputStream ros,
+ String histFile, boolean overwrite, long estUnique) { //Runs THREADS ProcessThreads over cris, merges their histograms, prints depth statistics and returns the total base count; maxReads is not referenced here
+ Timer tdetect=new Timer();
+ tdetect.start();
+ //Totals accumulated from the worker threads
+ long totalBases=0;
+ long totalReads=0;
+ //Start one worker per configured thread; all share the same input stream, table and output stream
+// assert(false) : THREADS;
+ ProcessThread[] pta=new ProcessThread[THREADS];
+ for(int i=0; i<pta.length; i++){
+ pta[i]=new ProcessThread(cris, kca, k, ros);
+ pta[i].start();
+ }
+ //Wait for each worker to terminate, then fold its counters into the totals
+ for(int i=0; i<pta.length; i++){
+ ProcessThread ct=pta[i];
+ synchronized(ct){
+ while(ct.getState()!=State.TERMINATED){
+ try {
+ ct.join(1000);
+ } catch (InterruptedException e) {
+ //An interrupt is only logged; the loop keeps waiting until the thread has terminated
+ e.printStackTrace();
+ }
+ }
+ totalBases+=ct.totalBases;
+ totalReads+=ct.totalReads;
+ //NOTE(review): histogram_total and ct.hist are dereferenced here without a null check, although both are null-checked elsewhere - confirm they are always allocated
+ for(int j=0; j<histogram_total.length; j++){
+ histogram_total[j]+=ct.hist[j];
+ }
+ }
+ }
+ //Unless ZERO_BIN is set, the depth-0 count is folded into bin 1
+ if(!ZERO_BIN && histogram_total!=null && histogram_total.length>1){
+ histogram_total[1]+=histogram_total[0];
+ histogram_total[0]=0;
+ }
+
+// outstream.println();
+ tdetect.stop();
+ outstream.println("Table read time: \t\t"+tdetect+" \t"+String.format("%.2f", totalBases*1000000.0/(tdetect.elapsed))+" kb/sec");
+ outstream.println("Total reads: \t\t"+totalReads);
+ outstream.println("Total bases: \t\t"+totalBases);
+// outstream.println();
+ if(histogram_total!=null){
+ TextStreamWriter tswh=null;
+ StringBuilder sb=new StringBuilder(100);
+ if(USE_HISTOGRAM){ //The histogram file is only written when USE_HISTOGRAM is set; the summary below is printed either way
+ tswh=new TextStreamWriter(histFile, overwrite, false, false);
+ tswh.start();
+ tswh.print("#Depth\tRaw_Count\tUnique_Kmers\n");
+ }
+ int lim=(int)(HIST_LEN_PRINT-1); //Bins below lim get their own row; bins lim and above are merged into one final row
+ long remaining=Tools.sum(histogram_total);
+ long sumRaw1=0;
+ long sumRaw2=0;
+ long sum1=0;
+ long sum2=0;
+ long sumsquare=0;
+ for(int i=0; i<lim; i++){
+ long x=histogram_total[i];
+ long y=((x+i/2)/(i<1 ? 1 : i)); //x+i/2 rounds to compensate for colliding kmers being put in an overly high bin
+// long y=((x)/(i<1 ? 1 : i));
+ sumRaw1+=x;
+ sum1+=y;
+ sumsquare+=(x*Tools.max(1, i));
+ if(tswh!=null){
+ if(PRINT_ZERO_COVERAGE || x>0 || y>0){
+ sb.append(i).append('\t');
+ sb.append(x).append('\t');
+ sb.append(y).append('\n');
+ }
+ tswh.print(sb.toString());
+ sb.setLength(0);
+ }
+ if(sumRaw1>=remaining){break;} //Stop once there is no more coverage, even if PRINT_ZERO_COVERAGE is not set.
+ }
+ for(int i=lim; i<histogram_total.length; i++){ //Tail bins: only summed, never printed individually
+ long x=histogram_total[i];
+ sumRaw2+=x;
+ long y=((x+i/2)/(i<1 ? 1 : i)); //x+i/2 rounds to compensate for colliding kmers being put in an overly high bin
+// long y=((x)/(i<1 ? 1 : i));
+ sum2+=y;
+ }
+ if(tswh!=null){
+ if(PRINT_ZERO_COVERAGE || sumRaw2>0 || sum2>0){
+ sb.append(lim).append('\t');
+ sb.append(sumRaw2).append('\t');
+ sb.append(sum2).append('\n');
+ }
+ tswh.print(sb.toString());
+ tswh.poison();
+ tswh.waitForFinish();
+ outstream.println("Wrote histogram to "+histFile);
+ }
+ //Medians: median_all weights each bin by its raw count, median_unique by count divided by depth
+ long histCount=Tools.sum(histogram_total); //Total number of kmers counted
+ long halfCount=(histCount+1)/2;
+ double histCountU=0; //Unique kmers counted
+ long temp1=0;
+ double temp2=0;
+ int median_all=-1;
+ int median_unique=-1;
+ for(int i=0; i<histogram_total.length; i++){
+ long x=histogram_total[i];
+ temp1+=x;
+ if(temp1>=halfCount && median_all<0){median_all=i;}
+// histSum+=(x*(double)i);
+ histCountU+=(x/(double)Tools.max(1, i));
+ }
+ double halfCount2=(histCountU)/2;
+ for(int i=0; i<histogram_total.length; i++){
+ long x=histogram_total[i];
+ temp2+=(x/Tools.max(i, 1.0));
+ if(temp2>=halfCount2 && median_unique<0){
+ median_unique=i;
+ break;
+ }
+ }
+ if(median_all<0){median_all=0;}
+ double avg_all=sumsquare/(double)histCount; //NOTE(review): sumsquare only covers bins below lim while histCount covers every bin - confirm this is intended
+ double avg_unique=histCount/histCountU;
+ double stdev_unique=Tools.standardDeviationHistogramKmer(histogram_total);
+ double stdev_all=Tools.standardDeviationHistogram(histogram_total);
+ outstream.println("Total kmers counted: \t"+(sumRaw1+sumRaw2));
+ //Percent unique is reported from the larger of the counted and the estimated unique totals
+ double uniqueC=((sum1+sum2)*100.0/(sumRaw1+sumRaw2));
+ double uniqueE=((estUnique)*100.0/(sumRaw1+sumRaw2));
+ double uniqueM=Tools.max(uniqueC, uniqueE);
+ outstream.println("Total unique kmer count: \t"+(sum1+sum2));
+ if(CANONICAL){outstream.println("Includes forward kmers only.");}
+ outstream.println("The unique kmer estimate can be more accurate than the unique count, if the tables are very full.");
+ outstream.println("The most accurate value is the greater of the two.");
+ outstream.println();
+
+ outstream.println("Percent unique: \t"+(uniqueM<10 ? " " : "")+String.format("%.2f%%", uniqueM));
+
+ outstream.println("Depth average: \t"+String.format("%.2f\t(unique kmers)", avg_unique));
+ outstream.println("Depth median: \t"+String.format("%d\t(unique kmers)", median_unique));
+ outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(unique kmers)", stdev_unique));
+
+ outstream.println("\nDepth average: \t"+String.format("%.2f\t(all kmers)", avg_all));
+ outstream.println("Depth median: \t"+String.format("%d\t(all kmers)", median_all));
+ outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(all kmers)", stdev_all));
+ }
+
+ return totalBases;
+ }
+
+
+
+ /**
+ * Locates and fixes spikes in a coverage profile (potentially) caused by false positives in a bloom filter.
+ * Theory: If a high-count kmer is adjacent on both sides to low-count kmers, it may be a false positive.
+ * It could either be reduced to the max of the two flanking points or examined in more detail.
+ * @param array An array of kmer counts for adjacent kmers in a read.
+ */
+ private static void fixSpikes(int[] array){
+
+ for(int i=1; i<array.length-1; i++){
+ long a=Tools.max(1, array[i-1]);
+ int b=array[i];
+ long c=Tools.max(1, array[i+1]);
+ if(b>1 && b>a && b>c){
+ //peak
+ if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ //spike
+ array[i]=(int)Tools.max(a, c);
+ }
+ }
+ }
+ }
+ private static void fixSpikes(int[] array, long[] kmers, KCountArray kca, int k){
+ if(array.length<3){return;}
+ if(array[1]-array[0]>1){
+ array[0]=kca.readPrecise(kmers[0], k, CANONICAL);
+ }
+ if(array[array.length-1]-array[array.length-2]>1){
+ array[array.length-1]=kca.readPrecise(kmers[array.length-1], k, CANONICAL);
+ }
+
+ for(int i=1; i<array.length-1; i++){
+ int b=array[i];
+ if(b>1){
+ long a=Tools.max(1, array[i-1]);
+ long c=Tools.max(1, array[i+1]);
+ long key=kmers[i];
+
+ if(b>a && b>c){
+ //peak
+ if(b<6 || b>a+1 || b>c+1){
+ array[i]=kca.readPreciseMin(key, k, CANONICAL);
+ }
+ // if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ // //spike
+ // int b1=(int)((a+c)/2);
+ // int b2=kca.readLeft(key, k, CANONICAL);
+ // int b3=kca.readRight(key, k, CANONICAL);
+ // array[i]=Tools.min(b, b1, b2, b3);
+ // }
+ // else
+ // {
+ //// array[i]=kca.readPreciseMin(key, k, CANONICAL);
+ // }
+ }
+ // else
+ // if(Tools.max(ada, adc)>=Tools.max(2, Tools.min((int)a, b, (int)c)/4)){
+ // array[i]=kca.readPrecise(key, k, CANONICAL);
+ // }
+ // else
+ // if(b>a+1 || b>c+1){
+ // //steep
+ // array[i]=kca.readPrecise(key, k, CANONICAL);
+ // }
+ }
+ }
+ }
+
+
+ private static void analyzeSpikes(int[] array, int width){
+ if(array.length<3){return;}
+ int peakcount=0, valleycount=0, spikecount=0, flatcount=0, slopecount=0;
+ for(int i=1; i<array.length-1; i++){
+ long a=array[i-1];
+ int b=array[i];
+ long c=array[i+1];
+ if(b>a && b>c){
+ peakcount++;
+ if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ spikecount++;
+ }
+ }else if(b<a && b<c){
+ valleycount++;
+ }else if(b==a && b==c){
+ flatcount++;
+ }else{
+ slopecount++;
+ }
+ }
+ if(peakcount>0){peaks.addAndGet(peakcount);}
+ if(valleycount>0){valleys.addAndGet(valleycount);}
+ if(spikecount>0){spikes.addAndGet(spikecount);}
+ if(flatcount>0){flats.addAndGet(flatcount);}
+ if(slopecount>0){slopes.addAndGet(slopecount);}
+ }
+
+
+ /** Looks up the stored count of every kmer of a read; for k>31 the work is delegated to generateCoverageLong.
+ * @param r Read to scan; a null read, null bases, or a read shorter than k yields a one-element array holding 0.
+ * @param kca Count table to query; a table with gap>0 is rejected with a RuntimeException.
+ * @return One count per kmer start position (or per sampled position when sampling applies); slots with no valid kmer keep the fill value -1.
+ */
+ public static int[] generateCoverage(Read r, KCountArray kca, int k) { //k is the kmer length
+ if(k>31){return generateCoverageLong(r, kca, k);}
+ if(kca.gap>0){throw new RuntimeException();}
+ if(r==null || r.bases==null || r.length()<k){return new int[] {0};}
+
+ final int kbits=2*k;
+ final long mask=~((-1L)<<(kbits)); //Keeps the low 2*k bits
+ final int gap=kca.gap;
+
+ if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors. NOTE(review): gap>0 already threw above, so for gap==0 this repeats the previous length check
+
+ int len=0;
+ long kmer=0;
+ final byte[] bases=r.bases;
+ final int[] out;
+ final long[] kmers=(FIX_SPIKES ? new long[r.length()-k+1] : null); //Keys are only retained when spike fixing will need to re-read them
+ //Unsampled path: one output slot per kmer start position
+ if(kmersamplerate<2 || DONT_SAMPLE_OUTPUT){
+ out=new int[r.length()-k+1];
+ Arrays.fill(out, -1);
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){ //A base whose table code is negative restarts the kmer
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask; //Shift in the new base, 2 bits per base
+ len++;
+
+ if(len>=k){
+// int count=kca.readPrecise(kmer, k, CANONICAL);
+ int count=kca.read(kmer, k, CANONICAL);
+ out[i-k+1]=count;
+ if(kmers!=null){kmers[i-k+1]=kmer;}
+ }
+ }
+ }
+ }else{ //Sampled path: only kmers ending at an index divisible by kmersamplerate are looked up
+ out=new int[(r.length()-k+1+(kmersamplerate-1))/kmersamplerate];
+ Arrays.fill(out, -1);
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+
+ if(len>=k && i%kmersamplerate==0){
+// int count=kca.readPrecise(kmer, k, CANONICAL);
+ int count=kca.read(kmer, k, CANONICAL);
+ out[(i-k+1)/kmersamplerate]=count;
+ if(kmers!=null){kmers[(i-k+1)/kmersamplerate]=kmer;}
+ }
+ }
+ }
+ }
+ if(FIX_SPIKES){fixSpikes(out, kmers, kca, k);}
+// fixSpikes(out, 1);
+ //The topology counters are updated for every profile, whether or not spikes were fixed
+ analyzeSpikes(out, 1);
+ return out;
+ }
+
+
+
+ /** Variant of generateCoverage reached for k>31: the key is a rotate-and-XOR value kept in one long and looked up with kca.read(kmer).
+ * @param r Read to scan; a null read, null bases, or a read shorter than k yields a one-element array holding 0.
+ * @param kca Count table to query; a table with gap>0 is rejected with a RuntimeException.
+ * @return One count per kmer start position (or per sampled position when sampling applies); slots with no valid kmer keep the fill value -1.
+ */
+ public static int[] generateCoverageLong(Read r, KCountArray kca, int k) { //k is the kmer length
+// assert(false) : "todo";
+// assert(k>31);
+ if(kca.gap>0){throw new RuntimeException();}
+ if(r==null || r.bases==null || r.length()<k){return new int[] {0};}
+
+ final int gap=kca.gap;
+
+ if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors. NOTE(review): gap>0 already threw above, so for gap==0 this repeats the previous length check
+
+ int len=0;
+ long kmer=0;
+ final byte[] bases=r.bases;
+ final int[] out;
+ //Bit offset at which the base leaving the window is XORed back out of the rotated key
+ int tailshift=k%32;
+ int tailshiftbits=tailshift*2;
+ //Unsampled path: one output slot per kmer start position
+ if(kmersamplerate<2 || DONT_SAMPLE_OUTPUT){
+ out=new int[r.length()-k+1];
+ Arrays.fill(out, -1);
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){ //A base whose table code is negative restarts the kmer
+ len=0;
+ kmer=0;
+ }else{
+ kmer=Long.rotateLeft(kmer, 2);
+ kmer=kmer^x; //XOR the new base into the low bits
+ len++;
+ if(len>k){ //Window is full: XOR out the base that is now k positions behind
+ long x2=AminoAcid.baseToNumber[bases[i-k]];
+ kmer=kmer^(x2<<tailshiftbits);
+ }
+
+ if(len>=k){
+ int count=kca.read(kmer);
+ out[i-k+1]=count;
+ }
+ }
+ }
+ }else{ //Sampled path: only kmers ending at an index divisible by kmersamplerate are looked up
+ out=new int[(r.length()-k+1+(kmersamplerate-1))/kmersamplerate];
+ Arrays.fill(out, -1);
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=Long.rotateLeft(kmer, 2);
+ kmer=kmer^x;
+ len++;
+ if(len>k){
+ long x2=AminoAcid.baseToNumber[bases[i-k]];
+ kmer=kmer^(x2<<tailshiftbits);
+ }
+
+ if(len>=k && i%kmersamplerate==0){
+ int count=kca.read(kmer);
+ out[(i-k+1)/kmersamplerate]=count;
+ }
+ }
+ }
+ }
+ fixSpikes(out); //The neighbor-based spike fix is applied here regardless of FIX_SPIKES
+
+ analyzeSpikes(out, 1);
+ return out;
+ }
+
+
+ private static class ProcessThread extends Thread{
+
+ /**
+  * Stores the shared resources this worker operates on.
+  * @param cris_ Input stream the worker pulls read lists from.
+  * @param kca_ Kmer count table to query.
+  * @param k_ Kmer length.
+  * @param ros_ Output stream for processed lists; null when no read output is wanted.
+  */
+ ProcessThread(ConcurrentReadInputStream cris_, KCountArray kca_, int k_, ConcurrentReadOutputStream ros_){
+     this.cris=cris_;
+     this.ros=ros_;
+     this.kca=kca_;
+     this.k=k_;
+ }
+
+ /** Thread body: processes read lists until the input stream is exhausted. */
+ @Override
+ public void run(){
+     countInThread();
+ }
+
+ void countInThread() {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+
+ while(reads!=null && reads.size()>0){
+ for(int rnum=0; rnum<reads.size(); rnum++){
+ Read r=reads.get(rnum);
+ Read r2=r.mate;
+
+ if(DONT_SAMPLE_OUTPUT || r.numericID%readsamplerate==0){
+ boolean toss1=false;
+ boolean toss2=false;
+ {
+ totalReads++;
+ if(verbose){outstream.println();}
+ if(OUTPUT_ATTACHMENT && ros!=null){
+// assert(false) : ros.FASTA+", "+ros.FASTQ+", "+ros.ATTACHMENT;
+ r.obj=(ros.ff1.fastq() ? toFastqString(r) : toFastaString(r));
+ toss1=r.discarded();
+ }else{
+ int[] cov=getCoverageAndIncrementHistogram(r);
+ if(cov==null){toss1=true;}
+ else{
+ Arrays.sort(cov);
+ toss1=(cov[cov.length/2]<MIN_MEDIAN && Tools.average(cov)<MIN_AVERAGE);
+ }
+ }
+ }
+ if(r2!=null){
+ totalReads++;
+ if(verbose){outstream.println();}
+ if(OUTPUT_ATTACHMENT && ros!=null){
+ r2.obj=(ros.ff1.fastq() ? toFastqString(r2) : toFastaString(r2));
+ toss2=r.discarded();
+ }else{
+ int[] cov=getCoverageAndIncrementHistogram(r2);
+ if(cov==null){toss2=true;}
+ else{
+ Arrays.sort(cov);
+ toss2=(cov[cov.length/2]<MIN_MEDIAN && Tools.average(cov)<MIN_AVERAGE);
+ }
+ }
+ }
+ if(toss1 && (toss2 || r2==null)){reads.set(rnum, null);}
+ }
+ }
+
+ if(ros!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight.
+// System.err.println("Adding list "+ln.id+" of length "+reads.size());
+ ros.add(reads, ln.id);
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(verbose){System.err.println("Finished reading");}
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){System.err.println("Returned list");}
+ }
+
+ private int[] getCoverageAndIncrementHistogram(Read r){
+ if(r.bases==null || r.length()<k){
+ return null;
+ }else{
+ totalBases+=r.length();
+
+ int[] cov=generateCoverage(r, kca, k);
+
+ if(hist!=null){
+ for(int i=0; i<cov.length; i++){
+ int x=Tools.min(cov[i], HIST_LEN-1);
+ if(x>=0){hist[x]++;}
+ }
+ }
+ return cov;
+ }
+ }
+
+ private String toFastaString(Read r){
+ if(r.bases==null || r.length()<k){
+ if(MIN_MEDIAN>0 || MIN_AVERAGE>0){r.setDiscarded(true);}
+ if(USE_HEADER){
+ return (ADD_CARROT ? ">" : "")+r.id+";0;0 0 0 0 0\n"+r.bases==null ? "" : new String(r.bases);
+ }else{
+ return (ADD_CARROT ? ">" : "")+r.id+"\n"+(r.bases==null ? "" : new String(r.bases))+"\n0\n0 0 0 0 0";
+ }
+ }else{
+ totalBases+=r.length();
+
+ int[] cov=generateCoverage(r, kca, k);
+
+ if(hist!=null){
+ for(int i=0; i<cov.length; i++){
+ int x=Tools.max(0, Tools.min(cov[i], HIST_LEN-1));
+ hist[x]++;
+ }
+ }
+
+ StringBuilder sb=new StringBuilder(cov.length*4+r.length()+(r.id==null ? 4 : r.id.length())+10);
+
+ if(USE_HEADER){
+ if(ADD_CARROT || r.id.charAt(0)!='>'){sb.append('>');}
+ sb.append(r.id).append(';');
+
+ int min=cov[0], max=cov[0], sum=0;
+ for(int i=0; i<cov.length; i++){
+ sb.append(cov[i]+" ");
+ min=Tools.min(min, cov[i]);
+ max=Tools.max(max, cov[i]);
+ sum+=cov[i];
+ }
+
+ sb.append(';');
+ Arrays.sort(cov);
+ int median=cov[cov.length/2];
+ sb.append(median).append(' ');
+ sb.append(String.format("%.3f ", sum/(float)cov.length));
+ sb.append(String.format("%.3f ", Tools.standardDeviation(cov)));
+ sb.append(min).append(' ');
+ sb.append(max).append('\n');
+
+ sb.append(new String(r.bases));
+
+ if(median<MIN_MEDIAN || sum/cov.length<MIN_AVERAGE){r.setDiscarded(true);}
+ }else{
+
+ if(ADD_CARROT || r.id.charAt(0)!='>'){sb.append('>');}
+ sb.append(r.id).append('\n');
+ sb.append(new String(r.bases)).append('\n');
+
+ int min=cov[0], max=cov[0], sum=0;
+ for(int i=0; i<cov.length; i++){
+ sb.append(cov[i]+" ");
+ min=Tools.min(min, cov[i]);
+ max=Tools.max(max, cov[i]);
+ sum+=cov[i];
+ }
+
+ sb.append('\n');
+ Arrays.sort(cov);
+ int median=cov[cov.length/2];
+ sb.append(median).append(' ');
+ sb.append(String.format("%.3f ", sum/(float)cov.length));
+ sb.append(String.format("%.3f ", Tools.standardDeviation(cov)));
+ sb.append(min).append(' ');
+ sb.append(max);
+
+ if(median<MIN_MEDIAN || sum/cov.length<MIN_AVERAGE){r.setDiscarded(true);}
+ }
+ return sb.toString();
+ }
+ }
+
+ private StringBuilder toFastqString(Read r){
+ StringBuilder sb=r.toFastq();
+ if(r.bases==null || r.length()<k){
+ if(MIN_MEDIAN>0 || MIN_AVERAGE>0){r.setDiscarded(true);}
+ sb.append("\n0\n0 0 0 0 0");
+ return sb;
+ }else{
+ totalBases+=r.length();
+
+ int[] cov=generateCoverage(r, kca, k);
+
+ if(hist!=null){
+ for(int i=0; i<cov.length; i++){
+ int x=Tools.max(0, Tools.min(cov[i], HIST_LEN-1));
+ assert(x>=0) : i+", "+cov[i]+", "+HIST_LEN;
+ hist[x]++;
+ }
+ }
+ sb.append('\n');
+
+ int min=cov[0], max=cov[0], sum=0;
+ for(int i=0; i<cov.length; i++){
+ sb.append(cov[i]+" ");
+ min=Tools.min(min, cov[i]);
+ max=Tools.max(max, cov[i]);
+ sum+=cov[i];
+ }
+
+ sb.append('\n');
+ Arrays.sort(cov);
+ int median=cov[cov.length/2];
+ sb.append(median).append(' ');
+ sb.append(String.format("%.3f ", sum/(float)cov.length));
+ sb.append(String.format("%.3f ", Tools.standardDeviation(cov)));
+ sb.append(min).append(' ');
+ sb.append(max);
+
+ if(median<MIN_MEDIAN || sum/cov.length<MIN_AVERAGE){r.setDiscarded(true);}
+ return sb;
+ }
+ }
+
+ private final ConcurrentReadInputStream cris; //Source of read lists (nextList/returnList)
+ private final KCountArray kca; //Kmer count structure handed to generateCoverage
+ private final int k; //Kmer length; reads with fewer than k bases get no coverage
+ private final ConcurrentReadOutputStream ros; //Receives every processed list; may be null
+ public final long[] hist=new long[HIST_LEN];//Always allocated; indexed by coverage clamped to HIST_LEN-1.  Formerly: (USE_HISTOGRAM ? new long[HIST_LEN] : null);
+
+ private long totalBases=0; //Sum of the lengths of reads that had at least k bases
+ private long totalReads=0; //Reads examined; a mate counts as its own read
+
+ }
+
+ public static PrintStream outstream=System.err; //Destination for this class's messages; stderr by default
+
+
+ public static int HIST_LEN=1<<14; //Number of histogram bins; coverage above HIST_LEN-1 is counted in the last bin
+ public static long HIST_LEN_PRINT=HIST_LEN; //NOTE(review): presumably the number of bins written out - confirm against the printing code
+ public static boolean USE_HISTOGRAM=false;
+ public static boolean PRINT_ZERO_COVERAGE=false;
+ public static long[] histogram_total;
+
+ private static int THREADS=8;
+ private static boolean verbose=false; //Emit extra progress output
+ private static boolean USE_HEADER=false; //toFastaString: put coverage and statistics in the header line instead of after the bases
+
+ private static boolean ADD_CARROT=false; //toFastaString: always prefix the read ID with '>'
+ private static boolean OUTPUT_ATTACHMENT=true; //Store the annotated fasta/fastq text in r.obj when an output stream exists
+ private static int MIN_MEDIAN=0; //Threshold on a read's median kmer coverage, used when deciding to discard it
+ private static int MIN_AVERAGE=0; //Threshold on a read's average kmer coverage, used when deciding to discard it
+
+ public static int kmersamplerate=1;
+ public static int readsamplerate=1; //Only reads with numericID%readsamplerate==0 are processed, unless DONT_SAMPLE_OUTPUT
+ public static boolean DONT_SAMPLE_OUTPUT=false; //If true, every read is processed regardless of readsamplerate
+ public static boolean CANONICAL=true;
+ public static boolean ZERO_BIN=false;
+ public static boolean FIX_SPIKES=true;
+ public static boolean ordered=true;
+ public static boolean overwrite=true;
+ public static boolean append=false;
+ public static boolean prefilter=false;
+
+ public static AtomicLong peaks=new AtomicLong();
+ public static AtomicLong spikes=new AtomicLong();
+ public static AtomicLong flats=new AtomicLong();
+ public static AtomicLong valleys=new AtomicLong();
+ public static AtomicLong slopes=new AtomicLong();
+}
diff --git a/current/jgi/KmerNormalize.java b/current/jgi/KmerNormalize.java
new file mode 100755
index 0000000..c5cbe78
--- /dev/null
+++ b/current/jgi/KmerNormalize.java
@@ -0,0 +1,3619 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import bloom.KCountArray;
+import bloom.KmerCount7MTA;
+import bloom.KmerCountAbstract;
+
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import ukmer.Kmer;
+
+import align2.ListNum;
+import align2.ReadErrorComparator;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+
+
+/**
+ * AKA BBNorm.
+ * Normalize depth by downsampling reads with high coverage.
+ * Uses atomic arrays for bloom filter and an atomic histogram.
+ * Succeeds KmerDownsampleAH.
+ * Includes fast error correction and keep-count-based (rather than random) normalization.
+ *
+ * @author Brian Bushnell
+ * @date May 30, 2013
+ *
+ */
+public class KmerNormalize {
+
+ public static void main(String[] args){
+ for(String s : args){if(s.contains("=standardout") || s.contains("=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+ if(args.length<1){throw new RuntimeException("No parameters.");}
+
+ String in1=(args[0].indexOf("=")>0 ? null : args[0]);
+ String in2=(in1!=null && args.length>1 ? args[1] : null);
+ if(in2!=null && "null".equalsIgnoreCase(in2)){in2=null;}
+
+ {
+ if(in1!=null && !in1.contains(",")){
+ File f=new File(in1);
+ if(!f.exists() || !f.isFile()){
+ in1=null;
+// throw new RuntimeException(reads1+" does not exist.");
+ }
+ }
+ if(in2!=null && !in2.contains(",")){
+ File f=new File(in2);
+ if(!f.exists() || !f.isFile()){
+ in2=null;
+// throw new RuntimeException(reads2+" does not exist.");
+ }else if(in1.equalsIgnoreCase(in2)){
+ throw new RuntimeException("Both input files are the same.");
+ }
+ }
+ }
+
+ KmerCountAbstract.minQuality=5;
+ KmerCountAbstract.minProb=0.5f;
+
+ Parser parser=new Parser();
+ parser.trimq=TRIM_QUALITY;
+ parser.minReadLength=MIN_LENGTH;
+
+ int k=31;
+ int cbits=32;
+ int precbits=2;
+ int cbits1=-1;
+ int gap=0;
+ int hashes=3;
+// int matrixbits=-1;
+ long cells=-1;
+ long maxReads=-1;
+ int buildpasses=1;
+ long tablereads=-1; //How many reads to process when building the hashtable
+ int buildStepsize=4;
+
+ String outKeep1=null;
+ String outToss1=null;
+ String outLow1=null, outMid1=null, outHigh1=null, outUnc1=null;
+
+ String outKeep2=null;
+ String outToss2=null;
+ String outLow2=null, outMid2=null, outHigh2=null, outUnc2=null;
+
+ int prehashes=-1;
+ long precells=-1;
+ String khistFile=null;
+ String rhistFile=null;
+ String peakFile=null;
+ String khistFileOut=null;
+ String rhistFileOut=null;
+ String peakFileOut=null;
+ int threads=-1;
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+ int minq=KmerCountAbstract.minQuality;
+ KmerCountAbstract.CANONICAL=true;
+
+
+ int targetDepthF=TARGET_DEPTH_F;
+ int targetDepth1=TARGET_DEPTH_1;
+ int maxDepth=MAX_DEPTH;
+ int minDepth=MIN_DEPTH;
+ int minKmersOverMinDepth=MIN_KMERS_OVER_MIN_DEPTH;
+ float depthPercentile=DEPTH_PERCENTILE;
+
+ int passes=2;
+ boolean tossErrorReadsF=TOSS_ERROR_READS_F;
+ boolean tossErrorReads1=TOSS_ERROR_READS_1;
+ boolean discardBadOnlyF=DISCARD_BAD_ONLY_F;
+ boolean discardBadOnly1=DISCARD_BAD_ONLY_1;
+ boolean fixSpikes=FIX_SPIKES;
+ float highPercentile=HIGH_PERCENTILE;
+ float lowPercentile=LOW_PERCENTILE;
+ int errorDetectRatio=ERROR_DETECT_RATIO;
+ int hthresh=HTHRESH;
+ int lthresh=LTHRESH;
+ boolean countup=COUNTUP;
+ boolean rbb=REQUIRE_BOTH_BAD;
+ boolean setOverlap=false;
+
+
+ boolean auto=true;
+
+ FastaReadInputStream.TARGET_READ_LEN=Integer.MAX_VALUE;
+
+ List<String> extra=null;
+
+ long memory=Runtime.getRuntime().maxMemory();
+ long tmemory=Runtime.getRuntime().totalMemory();
+// assert(false) : memory+", "+tmemory;
+
+ for(int i=(in1==null ? 0 : 1); i<args.length; i++){
+ if(args[i]==null){args[i]="null";}
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ assert(split.length<3) : "To many '=' signs: "+args[i];
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ if(a.equals("tbr")){//Handle conflated case
+ tossErrorReads1=tossErrorReadsF=Tools.parseBoolean(b);
+ }
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQualityAdjust(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("keepall")){
+ KEEP_ALL=Tools.parseBoolean(b);
+ }else if(a.equals("k") || a.equals("kmer")){
+ k=Integer.parseInt(b);
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("bits") ||a.equals("cbits") || a.equals("cellbits")){
+ cbits=Integer.parseInt(b);
+ }else if(a.equals("bits1") ||a.equals("cbits1") || a.equals("cellbits1")){
+ cbits1=Integer.parseInt(b);
+ }else if(a.equals("prefilterbits") ||a.equals("prebits")){
+ precbits=Integer.parseInt(b);
+ }else if(a.equals("histlen") ||a.equals("histogramlen")){
+ HIST_LEN_PRINT=Tools.min(Integer.MAX_VALUE, Long.parseLong(b)+1);
+ }else if(a.equals("gap")){
+ gap=Integer.parseInt(b);
+ }else if(a.equals("matrixbits")){
+ int matrixbits=Integer.parseInt(b);
+ assert(matrixbits<63);
+ cells=1L<<matrixbits;
+ }else if(a.equals("cells")){
+ cells=Tools.parseKMG(b);
+ }else if(a.equals("precells") || a.equals("prefiltercells")){
+ precells=Tools.parseKMG(b);
+ prefilter=prefilter || precells!=0;
+ }else if(a.equals("prefiltersize") || a.equals("prefilterfraction")){
+ prefilterFraction=Double.parseDouble(b);
+ prefilter=prefilterFraction>0;
+ }else if(a.equals("minq") || a.equals("minqual")){
+ minq=Byte.parseByte(b);
+ }else if(a.equals("zerobin")){
+ ZERO_BIN=Tools.parseBoolean(b);
+ }else if(a.equals("deterministic") || a.equals("dr") || a.equals("det")){
+ boolean x=Tools.parseBoolean(b);
+ DETERMINISTIC=x;
+ }else if(a.equals("minprob")){
+ KmerCountAbstract.minProb=Float.parseFloat(b);
+ assert(KmerCountAbstract.minProb<1) : "minprob must be less than 1. At 1, even kmers with 100% probablity of being error-free will be discarded.";
+ }else if(a.equals("hashes")){
+ hashes=Integer.parseInt(b);
+ }else if(a.equals("prehashes") || a.equals("prefilterhashes")){
+ prehashes=Integer.parseInt(b);
+ prefilter=prefilter || prehashes!=0;
+ }else if(a.equals("prefilter")){
+ prefilter=Tools.parseBoolean(b);
+ }else if(a.equals("countup")){
+ countup=Tools.parseBoolean(b);
+ }else if(a.equals("stepsize") || a.equals("buildstepsize")){
+ buildStepsize=Integer.parseInt(b);
+ }else if(a.equals("passes") || a.equals("p")){
+ passes=Integer.parseInt(b);
+ assert(passes>=1 && passes<=4) : "Passes should be in range 1~4.";
+ }else if(a.equals("1pass") || a.equals("1p")){
+ passes=1;
+ }else if(a.equals("2pass") || a.equals("2p")){
+ passes=2;
+ }else if(a.equals("buildpasses")){
+ buildpasses=Integer.parseInt(b);
+ }else if(a.equals("printcoverage")){
+ assert(false) : "This is not the program you are looking for. Try KmerCoverage. Move along.";
+ }else if(a.equals("threads") || a.equals("t")){
+ threads=Integer.parseInt(b);
+ }else if(a.equals("rn") || a.equals("rename") || a.equals("renamereads")){
+ renameReads=Tools.parseBoolean(b);
+ }else if(a.equals("reads") || a.equals("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("tablereads") || a.equals("buildreads")){
+ tablereads=Tools.parseKMG(b);
+ }else if(a.equals("out") || a.equals("out1") || a.equals("outk") || a.equals("outkeep") || a.equals("outgood")){
+ outKeep1=b;
+ }else if(a.equals("outt") || a.equals("outt1") || a.equals("outtoss") || a.equals("outoss") || a.equals("outbad")){
+ outToss1=b;
+ }else if(a.equals("outl") || a.equals("outl1") || a.equals("outlow") || a.equals("outlow1")){
+ outLow1=b;
+ }else if(a.equals("outm") || a.equals("outm1") || a.equals("outmid") || a.equals("outmid1") || a.equals("outmiddle")){
+ outMid1=b;
+ }else if(a.equals("outh") || a.equals("outh1") || a.equals("outhigh") || a.equals("outhigh1")){
+ outHigh1=b;
+ }else if(a.equals("outu") || a.equals("outu1") || a.equals("outuncorrected")){
+ outUnc1=b;
+ }else if(a.equals("out2") || a.equals("outk2") || a.equals("outkeep2") || a.equals("outgood2")){
+ outKeep2=b;
+ }else if(a.equals("outt2") || a.equals("outtoss2") || a.equals("outoss2") || a.equals("outbad2")){
+ outToss2=b;
+ }else if(a.equals("outl2") || a.equals("outlow2")){
+ outLow2=b;
+ }else if(a.equals("outm2") || a.equals("outmid2") || a.equals("outmiddle2")){
+ outMid2=b;
+ }else if(a.equals("outh2") || a.equals("outhigh2")){
+ outHigh2=b;
+ }else if(a.equals("outu2") || a.equals("outuncorrected2")){
+ outUnc2=b;
+ }else if(a.equals("lbd") || a.equals("lowbindepth") || a.equals("lowerlimit")){
+ LOW_BIN_DEPTH=Integer.parseInt(b);
+ }else if(a.equals("hbd") || a.equals("highbindepth") || a.equals("upperlimit")){
+ HIGH_BIN_DEPTH=Integer.parseInt(b);
+ }else if(a.equals("hist") || a.equals("histin") || a.equals("inhist") || a.equals("khist")){
+ khistFile=b;
+ }else if(a.equals("rhist")){
+ rhistFile=b;
+ }else if(a.equals("histout") || a.equals("outhist") || a.equals("hist2") || a.equals("khistout")){
+ khistFileOut=b;
+ }else if(a.equals("rhistout")){
+ rhistFileOut=b;
+ }else if(a.equals("peaks")){
+ peakFile=b;
+ }else if(a.equals("peaksout")){
+ peakFileOut=b;
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("ordered") || a.equals("ord")){
+ ordered=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("auto") || a.equals("automatic")){
+ auto=Tools.parseBoolean(b);
+ }else if(a.equals("canonical")){
+ CANONICAL=KmerCountAbstract.CANONICAL=Tools.parseBoolean(b);
+ }else if(a.equals("fixspikes") || a.equals("fs")){
+ fixSpikes=Tools.parseBoolean(b);
+ }else if(a.equals("printzerocoverage") || a.equals("pzc")){
+ PRINT_ZERO_COVERAGE=Tools.parseBoolean(b);
+ }else if(a.equals("removeduplicatekmers") || a.equals("rdk")){
+ KmerCountAbstract.KEEP_DUPLICATE_KMERS=!Tools.parseBoolean(b);
+ }else if(a.equals("target") || a.equals("targetdepth") || a.equals("tgt")){
+ targetDepthF=Integer.parseInt(b);
+ }else if(a.equals("target1") || a.equals("targetdepth1") || a.equals("tgt1")){
+ targetDepth1=Integer.parseInt(b);
+ }else if(a.equals("max") || a.equals("maxdepth")){
+ maxDepth=Integer.parseInt(b);
+ }else if(a.equals("min") || a.equals("mindepth")){
+ minDepth=Integer.parseInt(b);
+ }else if(a.equals("minkmers") || a.equals("minkmersovermindepth") || a.equals("mingoodkmersperread") || a.equals("mgkpr")){
+ minKmersOverMinDepth=Tools.max(1, Integer.parseInt(b));
+ }else if(a.equals("percentile") || a.equals("depthpercentile") || a.equals("dp")){
+ depthPercentile=Float.parseFloat(b);
+ if(depthPercentile>1 && depthPercentile<=100){depthPercentile/=100;}
+ assert(depthPercentile>=0 && depthPercentile<=1) : "Depth percentile must be between 0 and 100.";
+ }else if(a.equals("highdepthpercentile") || a.equals("highpercentile") || a.equals("hdp")){
+ highPercentile=Float.parseFloat(b);
+ if(highPercentile>1 && highPercentile<=100){highPercentile/=100;}
+ assert(highPercentile>=0 && highPercentile<=1) : "Depth percentile must be between 0 and 100.";
+ }else if(a.equals("lowdepthpercentile") || a.equals("lowpercentile") || a.equals("ldp")){
+ lowPercentile=Float.parseFloat(b);
+ if(lowPercentile>1 && lowPercentile<=100){lowPercentile/=100;}
+ assert(lowPercentile>=0 && highPercentile<=1) : "Depth percentile must be between 0 and 100.";
+ }else if(a.equals("targetbadpercentilelow") || a.equals("tbpl")){
+ double d=Double.parseDouble(b);
+ if(d>1 && d<=100){d/=100;}
+ assert(d>=0) : "TARGET_BAD_PERCENT_LOW must be at least 0.";
+ TARGET_BAD_PERCENT_LOW=d;
+ TARGET_BAD_PERCENT_HIGH=Tools.max(TARGET_BAD_PERCENT_HIGH, TARGET_BAD_PERCENT_LOW);
+ }else if(a.equals("targetbadpercentilehigh") || a.equals("tbph")){
+ double d=Double.parseDouble(b);
+ if(d>1 && d<=100){d/=100;}
+ assert(d>=0 && lowPercentile<=1) : "TARGET_BAD_PERCENT_HIGH must be at least 0.";
+ TARGET_BAD_PERCENT_HIGH=d;
+ TARGET_BAD_PERCENT_LOW=Tools.min(TARGET_BAD_PERCENT_HIGH, TARGET_BAD_PERCENT_LOW);
+ }else if(a.equals("errordetectratio") || a.equals("edr")){
+ errorDetectRatio=Integer.parseInt(b);
+ }else if(a.equals("errorcorrectratio") || a.equals("ecr")){
+ ERROR_CORRECT_RATIO=Integer.parseInt(b);
+ }else if(a.equals("highthresh") || a.equals("hthresh") || a.equals("ht")){
+ hthresh=Integer.parseInt(b);
+ }else if(a.equals("lowthresh") || a.equals("lthresh") || a.equals("lt")){
+ lthresh=Integer.parseInt(b);
+ }else if(a.equals("echighthresh") || a.equals("echthresh") || a.equals("echt")){
+ EC_HTHRESH=Integer.parseInt(b);
+ }else if(a.equals("eclowthresh") || a.equals("eclthresh") || a.equals("eclt")){
+ EC_LTHRESH=Integer.parseInt(b);
+ }else if(a.equals("markerrors") || a.equals("markonly") || a.equals("meo")){
+ MARK_ERRORS_ONLY=Tools.parseBoolean(b);
+ }else if(a.equals("markuncorrectableerrors") || a.equals("markuncorrectable") || a.equals("mue")){
+ MARK_UNCORRECTABLE_ERRORS=Tools.parseBoolean(b);
+ }else if(a.equals("tam") || a.equals("trimaftermarking")){
+ TRIM_AFTER_MARKING=Tools.parseBoolean(b);
+ }else if(a.equals("markwith1") || a.equals("markwithone") || a.equals("mw1")){
+ MARK_WITH_1=Tools.parseBoolean(b);
+// TrimRead.PROB1=10;
+ }else if(a.equals("aec") || a.equals("aecc") || a.equals("aggressiveerrorcorrection")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){
+ USE_ECC1=USE_ECCF=true;
+ EC_HTHRESH=Tools.min(EC_HTHRESH, 16);
+ EC_LTHRESH=Tools.max(EC_LTHRESH, 3);
+ ERROR_CORRECT_RATIO=Tools.min(ERROR_CORRECT_RATIO, 100);
+ MAX_ERRORS_TO_CORRECT=Tools.max(MAX_ERRORS_TO_CORRECT, 7);
+ SUFFIX_LEN=Tools.min(SUFFIX_LEN, 3);
+ PREFIX_LEN=Tools.min(PREFIX_LEN, 2);
+ }
+ }else if(a.equals("cec") || a.equals("cecc") || a.equals("conservativeerrorcorrection")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){
+ USE_ECC1=USE_ECCF=true;
+ EC_HTHRESH=Tools.max(EC_HTHRESH, 30);
+ EC_LTHRESH=Tools.min(EC_LTHRESH, 1);
+ ERROR_CORRECT_RATIO=Tools.max(ERROR_CORRECT_RATIO, 170);
+ MAX_ERRORS_TO_CORRECT=Tools.min(MAX_ERRORS_TO_CORRECT, 2);
+ MAX_QUAL_TO_CORRECT=Tools.min(MAX_QUAL_TO_CORRECT, 25);
+ SUFFIX_LEN=Tools.max(SUFFIX_LEN, 4);
+ PREFIX_LEN=Tools.max(PREFIX_LEN, 4);
+ }
+ }else if(a.equals("tossbadreads") || a.equals("tosserrorreads") || a.equals("tbr") || a.equals("ter")){
+ tossErrorReads1=tossErrorReadsF=Tools.parseBoolean(b);
+ }else if(a.equals("tossbadreads2") || a.equals("tosserrorreads2") || a.equals("tbr2") || a.equals("ter2") ||
+ a.equals("tossbadreadsf") || a.equals("tosserrorreadsf") || a.equals("tbrf") || a.equals("terf")){
+ tossErrorReadsF=Tools.parseBoolean(b);
+ }else if(a.equals("tossbadreads1") || a.equals("tosserrorreads1") || a.equals("tbr1") || a.equals("ter1")){
+ tossErrorReads1=Tools.parseBoolean(b);
+ }else if(a.equals("abrc") || a.equals("addbadreadscountup")){
+ ADD_BAD_READS_COUNTUP=Tools.parseBoolean(b);
+ }else if(a.equals("discardbadonly") || a.equals("dbo")){
+ discardBadOnly1=discardBadOnlyF=Tools.parseBoolean(b);
+ }else if(a.equals("discardbadonly1") || a.equals("dbo1")){
+ discardBadOnly1=Tools.parseBoolean(b);
+ }else if(a.equals("discardbadonlyf") || a.equals("dbof") || a.equals("discardbadonly2") || a.equals("dbo2")){
+ discardBadOnlyF=Tools.parseBoolean(b);
+ }else if(a.equals("requirebothbad") || a.equals("rbb")){
+ rbb=Tools.parseBoolean(b);//Already caught by parser
+ }else if(a.equals("saverarereads") || a.equals("srr")){
+ SAVE_RARE_READS=Tools.parseBoolean(b);
+ }else if(a.equals("eccbyoverlap") || a.equals("ecco") || a.equals("overlap")){
+ if("auto".equalsIgnoreCase(b)){eccByOverlapAuto=true;}
+ else{
+ eccByOverlap=Tools.parseBoolean(b);
+ eccByOverlapAuto=false;
+ }
+ setOverlap=true;
+ }else if(a.equals("ecc")){
+ USE_ECC1=USE_ECCF=Tools.parseBoolean(b);
+ }else if(a.equals("ecc1")){
+ USE_ECC1=Tools.parseBoolean(b);
+ }else if(a.equals("ecc2") || a.equals("eccf")){
+ USE_ECCF=Tools.parseBoolean(b);
+ }else if(a.equals("ecclimit")){
+ MAX_ERRORS_TO_CORRECT=Integer.parseInt(b);
+ }else if(a.equals("eccmaxqual")){
+ MAX_QUAL_TO_CORRECT=Integer.parseInt(b);
+ }else if(a.equals("cfl")){
+ CORRECT_FROM_LEFT=Tools.parseBoolean(b);
+ }else if(a.equals("cfr")){
+ CORRECT_FROM_RIGHT=Tools.parseBoolean(b);
+ }else if(a.equals("sl") || a.equals("suflen") || a.equals("suffixlen")){
+ SUFFIX_LEN=Integer.parseInt(b);
+ }else if(a.equals("pl") || a.equals("prelen") || a.equals("prefixlen")){
+ PREFIX_LEN=Integer.parseInt(b);
+ }else if(a.equals("histcol") || a.equals("histcolumns") || a.equals("histogramcolumns")){
+ HIST_COLUMNS=Integer.parseInt(b);
+ }else if(a.equals("minheight")){
+ minHeight=Long.parseLong(b);
+ }else if(a.equals("minvolume")){
+ minVolume=Long.parseLong(b);
+ }else if(a.equals("minwidth")){
+ minWidth=Integer.parseInt(b);
+ }else if(a.equals("minpeak")){
+ minPeak=Integer.parseInt(b);
+ }else if(a.equals("maxpeak")){
+ maxPeak=Integer.parseInt(b);
+ }else if(a.equals("ploidy")){
+ ploidy=Integer.parseInt(b);
+ }else if(a.equals("maxpeakcount") || a.equals("maxpc") || a.equals("maxpeaks")){
+ maxPeakCount=Integer.parseInt(b);
+ }else if(a.equals("usetmpdir")){
+ USE_TMPDIR=Tools.parseBoolean(b);
+ }else if(a.equals("uselowerdepth") || a.equals("uld")){
+ USE_LOWER_DEPTH=Tools.parseBoolean(b);
+ }else if(a.equals("tmpdir")){
+ TMPDIR=b;
+ if(b!=null){
+ b=b.trim();
+ if(b.length()==0){b=null;}
+ else{b=(b.replace('\\', '/')+"/").replaceAll("//", "/");}
+ }
+ }else if(a.equals("extra")){
+ if(b!=null && !b.equalsIgnoreCase("null")){
+ if(new File(b).exists()){
+ extra=new ArrayList<String>();
+ extra.add(b);
+ }else{
+ extra=Arrays.asList(b.split(","));
+ }
+ }
+ }else{
+ throw new RuntimeException("Unknown parameter "+arg);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality(); //TODO: This may cause problems with multiple passes! Best to not change the quality.
+
+ TRIM_LEFT=parser.qtrimLeft;
+ TRIM_RIGHT=parser.qtrimRight;
+ TRIM_QUALITY=parser.trimq;
+ MIN_LENGTH=parser.minReadLength;
+ rbb=parser.requireBothBad;
+ }
+
+ assert(passes<2 || (outLow1==null && outMid1==null && outHigh1==null && outUnc1==null)) :
+ "\noutLow, outMid, outHigh, and outUnc don't work with multiple passes. Set passes=1 or eliminate those output streams.";
+
+ assert(in1!=null && !in1.equalsIgnoreCase("stdin") && !in1.toLowerCase().startsWith("stdin.")) :
+ "\nThis program does not allow input from standard in,\nbecause it needs to read the input multiple times.\nOnly files are permitted.";
+
+ if(MARK_ERRORS_ONLY){
+ MAX_ERRORS_TO_CORRECT=Tools.max(MAX_ERRORS_TO_CORRECT, 9999);
+ if(!USE_ECC1 && !USE_ECCF){USE_ECC1=true;}
+ }
+
+ if(!setOverlap && (USE_ECC1 || USE_ECCF)){
+ eccByOverlapAuto=true;
+ }
+
+ if(in1!=null && in1.contains("#") && !new File(in1).exists()){
+ int pound=in1.lastIndexOf('#');
+ String a=in1.substring(0, pound);
+ String b=in1.substring(pound+1);
+ in1=a+1+b;
+ in2=a+2+b;
+ }
+ if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
+ final boolean paired=(FASTQ.FORCE_INTERLEAVED || in2!=null);
+ if(in2!=null){
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(DETERMINISTIC){ordered=true;}
+
+ boolean ok=Tools.testOutputFiles(overwrite, append, false, outKeep1, outToss1, outKeep2, outToss2, khistFile, khistFileOut, rhistFile, rhistFileOut, peakFile, peakFileOut);
+
+ if(cbits>16 && passes>1){cbits=16;}
+
+ maxDepth=Tools.max(maxDepth, targetDepthF);
+ assert(targetDepthF>0);
+
+ assert(FastaReadInputStream.settingsOK());
+ if(k>31){CANONICAL=KmerCountAbstract.CANONICAL=false;}
+ assert(CANONICAL==KmerCountAbstract.CANONICAL);
+
+// if(output!=null && reads1.contains(",")){
+// throw new RuntimeException("\nLists of input files can only be used with histogram output, not full output.\n" +
+// "Please set output=null or move the extra input files to 'extra=file1,file2,...fileN'");
+// }
+ if(extra!=null){
+ for(String s : extra){
+ File f=new File(s);
+ if(!f.exists() || !f.isFile()){throw new RuntimeException(s+" does not exist.");}
+ assert(!s.equalsIgnoreCase(in1) && (in2==null || !s.equalsIgnoreCase(in2))) : "\nInput file "+s+" should not be included as an extra file.\n";
+ }
+ }
+
+// outstream.println("ForceInterleaved = "+FASTQ.FORCE_INTERLEAVED);
+
+// assert(false) : reads1+", "+reads2+", "+output;
+// if(FASTQ.FORCE_INTERLEAVED && in2==null){
+// outstream.println()
+// }
+
+ if(threads<=0){
+ THREADS=Shared.threads();
+ }else{
+ THREADS=threads;
+ }
+// KmerCountAbstract.THREADS=Tools.min(THREADS,6);
+ KmerCountAbstract.THREADS=THREADS;
+
+// assert(false) : THREADS;
+
+// System.err.println("THREADS="+THREADS+", KmerCountAbstract.THREADS="+KmerCountAbstract.THREADS);
+
+ long bases=0;
+ qhist_total=new long[128];
+ Timer t=new Timer();
+
+ if(passes>1){
+ String lastTemp1=null;
+ String lastTemp2=null;
+
+ if(passes>2){
+// System.out.println(">>>A");
+
+ ERROR_DETECT_RATIO+=50;
+ EC_HTHRESH=EC_HTHRESH*2+20;
+
+ for(int pass=1; pass<passes-1; pass++){
+ final String tempOutPrefix1=getTempPrefix(in1, outKeep1, pass, 1);
+ final String tempOut1=getTempOut(outKeep1, tempOutPrefix1);
+ final String tempOutToss1=(outToss1==null ? null : getTempOut(outToss1, tempOutPrefix1));
+
+ final String tempOutPrefix2=(outKeep2==null ? null : getTempPrefix(in1, outKeep2, pass, 2));
+ final String tempOut2=(outKeep2==null ? null : getTempOut(outKeep2, tempOutPrefix2));
+ final String tempOutToss2=(outToss2==null ? null : getTempOut(outToss2, tempOutPrefix2));
+
+ outstream.println("\n *********** Pass "+pass+" ********** \n");
+
+ int tgt=(targetDepth1<1 ? targetDepthF*4 : targetDepth1*2);
+ int max=(tgt+tgt/4);
+
+ int tgtBadLow=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_LOW*1.5));
+ int tgtBadHigh=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_HIGH*1.5));
+
+ CORRECT_ERRORS_THIS_PASS=USE_ECC1;
+ TRIM_LEFT_THIS_PASS=(pass==1 && TRIM_LEFT);
+ TRIM_RIGHT_THIS_PASS=(pass==1 && TRIM_RIGHT);
+ bases+=runPass(auto, memory, (cbits1<1 ? cbits : cbits1), cells, precbits, precells, buildpasses, hashes, prehashes, k,
+ maxReads, tablereads, minq, buildStepsize,
+ (pass==1 ? in1 : lastTemp1), (pass==1 ? in2 : lastTemp2),
+ tempOut1, tempOutToss1, null, null, null, null,
+ tempOut2, tempOutToss2, null, null, null, null,
+ (pass==1 ? khistFile : null), (pass==1 ? rhistFile : null), (pass==1 ? peakFile : null), (pass==1 ? extra : null),
+ tgt, tgtBadLow, tgtBadHigh, max, Tools.min(minDepth, 2), Tools.min(minKmersOverMinDepth, 5),
+ Tools.min(0.8f, Tools.max(0.4f, depthPercentile)*1.2f), false, rbb, true,
+ highPercentile, 0, (errorDetectRatio>100 ? 100+(errorDetectRatio-100)/2 : errorDetectRatio), hthresh, lthresh, false, false, false);
+ lastTemp1=tempOut1;
+ lastTemp2=tempOut2;
+ FASTQ.TEST_INTERLEAVED=true;
+ FASTQ.FORCE_INTERLEAVED=(paired && outKeep2==null);
+ }
+ FASTQ.TEST_INTERLEAVED=true;
+ FASTQ.FORCE_INTERLEAVED=(paired && outKeep2==null);
+// System.out.println(">>>C");
+
+ ERROR_DETECT_RATIO-=50;
+ EC_HTHRESH=(EC_HTHRESH-20)/2;
+
+ for(int pass=passes-1; pass<passes; pass++){
+ final String tempOutPrefix1=getTempPrefix(in1, outKeep1, pass, 1);
+ final String tempOut1=getTempOut(outKeep1, tempOutPrefix1);
+ final String tempOutToss1=(outToss1==null ? null : getTempOut(outToss1, tempOutPrefix1));
+
+ final String tempOutPrefix2=(outKeep2==null ? null : getTempPrefix(in1, outKeep2, pass, 2));
+ final String tempOut2=(outKeep2==null ? null : getTempOut(outKeep2, tempOutPrefix2));
+ final String tempOutToss2=(outToss2==null ? null : getTempOut(outToss2, tempOutPrefix2));
+
+ outstream.println("\n *********** Pass "+pass+" ********** \n");
+
+ int tgt=(targetDepth1<1 ? targetDepthF*2 : targetDepth1);
+ int max=(tgt+tgt/4);
+
+ int tgtBadLow=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_LOW));
+ int tgtBadHigh=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_HIGH));
+
+ CORRECT_ERRORS_THIS_PASS=USE_ECC1;
+ TRIM_LEFT_THIS_PASS=(pass==1 && TRIM_LEFT);
+ TRIM_RIGHT_THIS_PASS=(pass==1 && TRIM_RIGHT);
+ bases+=runPass(auto, memory, (cbits1<1 ? cbits : cbits1), cells, precbits, precells, buildpasses, hashes, prehashes, k,
+ maxReads, tablereads, minq, buildStepsize,
+ (pass==1 ? in1 : lastTemp1), (pass==1 ? in2 : lastTemp2),
+ tempOut1, tempOutToss1, null, null, null, null,
+ tempOut2, tempOutToss2, null, null, null, null,
+ (pass==1 ? khistFile : null), (pass==1 ? rhistFile : null), (pass==1 ? peakFile : null), (pass==1 ? extra : null),
+ tgt, tgtBadLow, tgtBadHigh, max, Tools.min(minDepth, 3), minKmersOverMinDepth,
+ Tools.min(0.8f, Tools.max(0.4f, depthPercentile)*1.2f), tossErrorReads1, rbb, discardBadOnly1,
+ highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, false, false, false);
+ lastTemp1=tempOut1;
+ lastTemp2=tempOut2;
+ }
+ }else{
+// System.out.println(">>>E");
+ for(int pass=1; pass<passes; pass++){
+ final String tempOutPrefix1=getTempPrefix(in1, outKeep1, pass, 1);
+ final String tempOut1=getTempOut(outKeep1, tempOutPrefix1);
+ final String tempOutToss1=(outToss1==null ? null : getTempOut(outToss1, tempOutPrefix1));
+
+ final String tempOutPrefix2=(outKeep2==null ? null : getTempPrefix(in1, outKeep2, pass, 2));
+ final String tempOut2=(outKeep2==null ? null : getTempOut(outKeep2, tempOutPrefix2));
+ final String tempOutToss2=(outToss2==null ? null : getTempOut(outToss2, tempOutPrefix2));
+
+ outstream.println("\n *********** Pass "+pass+" ********** \n");
+
+ int tgt=(targetDepth1<1 ? targetDepthF*4 : targetDepth1);
+ int max=(tgt+tgt/4);
+
+ int tgtBadLow=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_LOW));
+ int tgtBadHigh=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_HIGH));
+
+ CORRECT_ERRORS_THIS_PASS=USE_ECC1;
+ TRIM_LEFT_THIS_PASS=(pass==1 && TRIM_LEFT);
+ TRIM_RIGHT_THIS_PASS=(pass==1 && TRIM_RIGHT);
+ bases+=runPass(auto, memory, (cbits1<1 ? cbits : cbits1), cells, precbits, precells, buildpasses, hashes, prehashes, k,
+ maxReads, tablereads, minq, buildStepsize,
+ (pass==1 ? in1 : lastTemp1), (pass==1 ? in2 : lastTemp2),
+ tempOut1, tempOutToss1, null, null, null, null,
+ tempOut2, tempOutToss2, null, null, null, null,
+ (pass==1 ? khistFile : null), (pass==1 ? rhistFile : null), (pass==1 ? peakFile : null), (pass==1 ? extra : null),
+ tgt, tgtBadLow, tgtBadHigh, max, Tools.min(minDepth, 3), minKmersOverMinDepth,
+ Tools.min(0.8f, Tools.max(0.4f, depthPercentile)*1.2f), tossErrorReads1, rbb, discardBadOnly1,
+ highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, false, false, false);
+ lastTemp1=tempOut1;
+ lastTemp2=tempOut2;
+ FASTQ.TEST_INTERLEAVED=true;
+ FASTQ.FORCE_INTERLEAVED=(paired && outKeep2==null);
+ }
+// System.out.println(">>>G");
+ }
+
+ outstream.println("\n *********** Pass "+(passes)+" ********** \n");
+
+ CORRECT_ERRORS_THIS_PASS=USE_ECCF;
+ TRIM_LEFT_THIS_PASS=false;
+ TRIM_RIGHT_THIS_PASS=false;
+ bases+=runPass(auto, memory, cbits, cells, precbits, precells, buildpasses, hashes, prehashes, k,
+ maxReads, tablereads, minq, buildStepsize,
+ lastTemp1, lastTemp2,
+ outKeep1, outToss1, outLow1, outMid1, outHigh1, outUnc1,
+ outKeep2, outToss2, outLow2, outMid2, outHigh2, outUnc2,
+ null, null, null, null,
+ targetDepthF, targetDepthF, targetDepthF, maxDepth, minDepth, minKmersOverMinDepth, depthPercentile, tossErrorReadsF, rbb, discardBadOnlyF,
+ highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, fixSpikes, countup, renameReads);
+ }else{
+ CORRECT_ERRORS_THIS_PASS=(USE_ECC1 || USE_ECCF);
+ TRIM_LEFT_THIS_PASS=(TRIM_LEFT);
+ TRIM_RIGHT_THIS_PASS=(TRIM_RIGHT);
+ bases+=runPass(auto, memory, cbits, cells, precbits, precells, buildpasses, hashes, prehashes, k,
+ maxReads, tablereads, minq, buildStepsize,
+ in1, in2,
+ outKeep1, outToss1, outLow1, outMid1, outHigh1, outUnc1,
+ outKeep2, outToss2, outLow2, outMid2, outHigh2, outUnc2,
+ khistFile, rhistFile, peakFile, extra,
+ targetDepthF, targetDepthF, targetDepthF, maxDepth, minDepth, minKmersOverMinDepth, depthPercentile, tossErrorReadsF, rbb, discardBadOnlyF,
+ highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, fixSpikes, countup, renameReads);
+ }
+
+ if(outKeep1!=null && (khistFileOut!=null || rhistFileOut!=null || peakFileOut!=null)){
+ outstream.println("\n *********** Output Histogram Generation ********** \n");
+ FASTQ.TEST_INTERLEAVED=true;
+ FASTQ.FORCE_INTERLEAVED=(paired && outKeep2==null);
+ CORRECT_ERRORS_THIS_PASS=false;
+ TRIM_LEFT_THIS_PASS=false;
+ TRIM_RIGHT_THIS_PASS=false;
+ bases+=runPass(auto, memory, cbits, cells, precbits, precells, buildpasses, hashes, prehashes, k,
+ maxReads, tablereads, minq, buildStepsize,
+ outKeep1, outKeep2,
+ null, null, null, null, null, null,
+ null, null, null, null, null, null,
+ khistFileOut, rhistFileOut, peakFileOut, extra,
+ 99999999, 99999999, 99999999, 99999999, 0, 0, .5f, false, rbb, false,
+ 1, 0, 100, 10, 3, fixSpikes, false, false);
+ }
+
+ if(REMOVE_TEMP_FILES && temp_file_set!=null){
+ outstream.println("\nRemoving temp files.");
+ for(String s : temp_file_set){
+ File f=new File(s);
+ if(f.exists()){
+// System.out.println("Deleting "+s);
+ boolean success=false;
+ for(int i=0; i<100 && f.exists() && !success; i++){
+ success=f.delete();
+ f=new File(s);
+ }
+ if(f.exists() && !success){
+// System.err.println(f.canExecute());
+// System.err.println(f.canRead());
+// System.err.println(f.canWrite());
+// System.err.println(f.lastModified());
+// try {
+// java.nio.file.Files.delete(f.toPath());
+// } catch (IOException e) {
+// // TODO Auto-generated catch block
+// e.printStackTrace();
+// }
+ System.err.println("Some temp files (prefixed TEMPFILE_BBNORM) could not be removed may need to be deleted manually.");
+ f.deleteOnExit();
+ }
+ }
+ }
+ }
+
+ t.stop();
+
+
+ outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec");
+
+ if(errorState){throw new RuntimeException("KmerNormalize terminated in an error state; the output may be corrupt.");}
+ }
+
+ private static String getTempPrefix(String inFname, String outFname, int pass, int pairnum){
+ String tempOut=null, tempOutPrefix=null;
+ for(int i=0; i<2000 && tempOut==null; i++){
+ tempOutPrefix=(useTmpdir() ? Shared.TMPDIR : "")+"TEMPFILE_BBNORM_P"+pass+"_R"+pairnum+"_"+getSalt(inFname, i)+"_";
+ tempOut=getTempOut(outFname, tempOutPrefix);
+ if(new File(tempOut).exists()){
+ tempOut=null;
+ tempOutPrefix=null;
+ }
+ }
+ if(tempOutPrefix==null){
+ throw new RuntimeException("Can't generate a random temp file name. Try deleting old temp files.");
+ }
+ return tempOutPrefix;
+ }
+
+ private static String getTempOut(String outFname, final String tempOutPrefix){
+ assert(tempOutPrefix!=null);
+ String tempOut=null;
+ if(outFname==null || useTmpdir()){
+ tempOut=tempOutPrefix+".fq.gz";
+ }else{
+ outFname=outFname.replace('\\', '/');
+ int idx=outFname.lastIndexOf('/');
+ if(idx<0){
+ tempOut=tempOutPrefix+outFname;
+ }else{
+ tempOut=outFname.substring(0, idx+1)+tempOutPrefix+outFname.substring(idx+1);
+ }
+ }
+ if(temp_file_set==null){temp_file_set=new HashSet<String>();}
+ if(temp_file_set.contains(tempOut) || new File(tempOut).exists()){
+ return getTempOut(outFname, tempOutPrefix+"_"+(100000*Math.random()));
+ }
+ temp_file_set.add(tempOut);
+ return tempOut;
+ }
+
+ /**
+  * Produces a hexadecimal salt string from the current clocks, the attempt number,
+  * and the hash code of the file name.
+  * @param fname File name whose hashCode is mixed in; must not be null
+  * @param attempt Attempt counter added to System.nanoTime()
+  * @return Two hex-encoded longs concatenated
+  */
+ public static String getSalt(String fname, int attempt){
+     final long clockPart=System.nanoTime()+attempt;
+     final long namePart=Long.rotateLeft(fname.hashCode(), 31)^System.currentTimeMillis();
+     return Long.toHexString(clockPart)+Long.toHexString(namePart);
+ }
+
+ private static boolean inMemorySort(ArrayList<Read> reads, String sorted, boolean reverse){
+ try{
+ Collections.sort(reads, ReadErrorComparator.comparator);
+ if(reverse){Collections.reverse(reads);}
+ TextStreamWriter tsw=new TextStreamWriter(sorted, overwrite, false, true);
+ tsw.start();
+// assert(false) : "\nreads: "+reads.size()+"\n"+tsw+"\n";
+ for(Read r : reads){
+ tsw.println(r);
+ if(r.mate!=null){tsw.println(r.mate);}
+ }
+ tsw.poison();
+ tsw.waitForFinish();
+ }catch(Throwable t){
+ System.err.println("ERROR: "+t);
+ return false;
+ }
+ return true;
+ }
+
+ private static long runPass(boolean auto, long memory, int cbits, long cells, int pcbits, long precells, int buildpasses, int hashes, int prehashes, int k,
+ long maxReads, long tablereads, int minq, int buildStepsize,
+ String in1, String in2,
+ String outKeep1, String outToss1, String outLow1, String outMid1, String outHigh1, String outUnc1,
+ String outKeep2, String outToss2, String outLow2, String outMid2, String outHigh2, String outUnc2,
+ String khistFile, String rhistFile, String peakFile, List<String> extra,
+ int targetDepth, int targetDepthBadLow, int targetDepthBadHigh, int maxDepth, int minDepth,
+ int minKmersOverMinDepth, float depthPercentile, boolean tossErrorReads, boolean rbb, boolean discardBadOnly,
+ float highPercentile, float lowPercentile, int errorDetectRatio, int hthresh, int lthresh, boolean fixSpikes, boolean countup,
+ boolean rename){
+ assert(in1!=null);
+ TARGET_DEPTH=targetDepth;
+ TARGET_DEPTH_BAD_LOW=targetDepthBadLow;
+ TARGET_DEPTH_BAD_HIGH=targetDepthBadHigh;
+ MAX_DEPTH=maxDepth;
+ MIN_DEPTH=minDepth;
+ MIN_KMERS_OVER_MIN_DEPTH=minKmersOverMinDepth;
+ DEPTH_PERCENTILE=depthPercentile;
+ RENAME_THIS_PASS=rename;
+
+ COUNTUP=countup;
+ if(COUNTUP){
+ TARGET_DEPTH=(int)Math.round(TARGET_DEPTH*0.95);
+ }
+ TOSS_ERROR_READS=tossErrorReads;
+// REQUIRE_BOTH_BAD=(rbb);
+ REQUIRE_BOTH_BAD=(rbb || COUNTUP);
+ DISCARD_BAD_ONLY=discardBadOnly;
+ HIGH_PERCENTILE=highPercentile;
+ LOW_PERCENTILE=(COUNTUP ? LOW_PERCENTILE_COUNTUP : lowPercentile);
+// assert(!COUNTUP) : COUNTUP+", "+LOW_PERCENTILE_COUNTUP+", "+lowPercentile+", "+LOW_PERCENTILE;
+ ERROR_DETECT_RATIO=errorDetectRatio;
+ HTHRESH=hthresh;
+ LTHRESH=lthresh;
+ FIX_SPIKES=fixSpikes;
+
+ {
+ if(khistFile!=null || peakFile!=null){USE_KHISTOGRAM=true;}
+ if(rhistFile!=null){USE_RHISTOGRAM=true;}
+
+ final int maxCount=(int)(cbits>16 ? Integer.MAX_VALUE : (1L<<cbits)-1);
+ assert(maxCount>0);
+ HIST_LEN_PRINT=Tools.max(1, Tools.min(HIST_LEN_PRINT, maxCount));
+ assert(HIST_LEN_PRINT<=Integer.MAX_VALUE) : HIST_LEN_PRINT+", "+Integer.MAX_VALUE;
+ HIST_LEN=(int)Tools.min(maxCount, Tools.max(HIST_LEN_PRINT, HIST_LEN));
+ THREAD_HIST_LEN=Tools.min(THREAD_HIST_LEN, HIST_LEN);
+
+ khistogram=new AtomicLongArray(HIST_LEN);
+ if(USE_RHISTOGRAM && rhistFile!=null){
+ rhistogram=new AtomicLongArray(HIST_LEN);
+ bhistogram=new AtomicLongArray(HIST_LEN);
+ }else{
+ rhistogram=null;
+ bhistogram=null;
+ }
+ }
+
+ if(auto && cells==-1){
+ final long usable=(long)Tools.max(((memory-96000000)*.73), memory*0.45);
+ long mem=usable-(khistogram!=null ? (HIST_LEN*16L*(1)) : 0)-(rhistogram!=null ? (HIST_LEN*8L*(1)) : 0)-(bhistogram!=null ? (HIST_LEN*8L*(1)) : 0);
+ if(buildpasses>1){mem/=2;}
+
+ FILTERBYTES=(COUNTUP ? mem/2 : mem);
+ cells=(FILTERBYTES*8)/cbits;
+//
+// long tablebytes=((1L<<matrixbits)*cbits)/8;
+// if(tablebytes*3<usable){matrixbits++;}
+// outstream.println(tablebytes/1000000+", "+usable/1000000+", "+(tablebytes*3)/1000000);
+
+ }else if(cells==-1){
+ cells=1L<<34;
+ }
+
+ if(prefilter){
+ if(precells<1){
+ long totalbits=cells*cbits;
+ long prebits=(long)(totalbits*prefilterFraction);
+ precells=prebits/pcbits;
+ cells=(totalbits-prebits+cbits-1)/cbits; //Steal memory from cell allocation
+ }
+ if(prehashes<1){
+ prehashes=(hashes+1)/2;
+ }
+ }
+
+ {
+ outstream.println("\nSettings:");
+ outstream.println("threads: \t"+THREADS);
+ outstream.println("k: \t"+k);
+ outstream.println("deterministic: \t"+DETERMINISTIC);
+ outstream.println("toss error reads: \t"+TOSS_ERROR_READS);
+ outstream.println("passes: \t"+buildpasses);
+ outstream.println("bits per cell: \t"+cbits);
+// outstream.println("matrixbits: \t"+matrixbits);
+ outstream.println("cells: \t"+Tools.toKMG(cells));
+ outstream.println("hashes: \t"+hashes);
+ if(prefilter){
+ outstream.println("prefilter bits: \t"+pcbits);
+// outstream.println("matrixbits: \t"+matrixbits);
+ outstream.println("prefilter cells: \t"+(precells>0 && prehashes>0 ? Tools.toKMG(precells) : "?"));
+ outstream.println("prefilter hashes: \t"+(precells>0 && prehashes>0 ? ""+prehashes : "?"));
+ }
+// outstream.println("base min quality: \t"+KmerCountAbstract.minQuality);
+ outstream.println("base min quality: \t"+minq);
+ outstream.println("kmer min prob: \t"+KmerCountAbstract.minProb);
+
+ outstream.println();
+ outstream.println("target depth: \t"+TARGET_DEPTH);
+ outstream.println("min depth: \t"+MIN_DEPTH);
+ outstream.println("max depth: \t"+MAX_DEPTH);
+ outstream.println("min good kmers: \t"+MIN_KMERS_OVER_MIN_DEPTH);
+ outstream.println("depth percentile: \t"+String.format("%.1f", 100*DEPTH_PERCENTILE));
+ outstream.println("ignore dupe kmers:\t"+!KmerCountAbstract.KEEP_DUPLICATE_KMERS);
+ outstream.println("fix spikes: \t"+FIX_SPIKES);
+ if((USE_KHISTOGRAM || USE_RHISTOGRAM) && HIST_LEN>0){
+ outstream.println("histogram length: \t"+HIST_LEN);
+ }
+ if(khistFile!=null || rhistFile!=null){
+ outstream.println("print zero cov: \t"+PRINT_ZERO_COVERAGE);
+ }
+
+ outstream.println();
+ }
+
+ if(!prefilter && k<32 && cells>(1L<<(2*k))){cells=(1L<<(2*k));}
+ assert(cells>0);
+
+// KmerCountAbstract.THREADS=Tools.max(THREADS/2, KmerCountAbstract.THREADS); //Seems like 4 is actually optimal...
+
+ FastaReadInputStream.MIN_READ_LEN=k;
+
+ if(eccByOverlapAuto){
+ eccByOverlapAuto=false;
+ float overlapRatio=BBMerge.mergeableFraction(in1, in2, 1000000, 0.01f);
+ eccByOverlap=(overlapRatio>0.25f);
+ if(eccByOverlap){
+ System.err.println("Enabled overlap correction ("+String.format("%.1f%% percent overlap)", 100*overlapRatio));
+ }
+ }
+
+ Timer t=new Timer();
+ Timer ht=new Timer();
+ t.start();
+ ht.start();
+ KCountArray kca;
+ KCountArray prefilterArray=null;
+// outstream.println();
+ if(prefilter){
+ prefilterArray=KmerCount7MTA.makeKca(in1, in2, extra, k, pcbits, 0, precells, prehashes, minq, true, eccByOverlap, tablereads, 1, buildStepsize, 1, 1, null, 0);
+ outstream.println("Made prefilter: \t"+prefilterArray.toShortString(prehashes));
+ double uf=prefilterArray.usedFraction();
+ if(uf>0.6){
+ outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" :
+ uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy for kmers of depth under 3. Ideal load is under 60% used." +
+ "\nFor better accuracy, run on a node with more memory; quality-trim or error-correct reads; " +
+ "or increase the values of the minprob flag to reduce spurious kmers.");
+ }
+ }
+ kca=KmerCount7MTA.makeKca(in1, in2, extra, k, cbits, 0, cells, hashes, minq, true, eccByOverlap, tablereads, buildpasses, buildStepsize, 2, 2, prefilterArray, (prefilterArray==null ? 0 : prefilterArray.maxValue));
+ ht.stop();
+
+ outstream.println("Made hash table: \t"+kca.toShortString(hashes));
+ double uf=kca.usedFraction();
+ if(uf>0.6){
+ outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" :
+ uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy. Ideal load is under 60% used." +
+ "\nFor better accuracy, use the 'prefilter' flag; run on a node with more memory; quality-trim or error-correct reads; " +
+ "or increase the values of the minprob flag to reduce spurious kmers. In practice you should still get good normalization results " +
+ "even with loads over 90%, but the histogram and statistics will be off.");
+ }
+
+ long estUnique;
+ outstream.println();
+ if(prefilterArray!=null){
+ int lim1=prefilterArray.maxValue, lim2=prefilterArray.maxValue+1;
+ double a=prefilterArray.estimateUniqueKmers(prehashes);
+ double b=kca.estimateUniqueKmers(hashes, lim2);
+ a=a-b;
+ if(CANONICAL){
+// a=(a*KCountArray.canonMask)/(KCountArray.canonMask+1);
+// b=(b*KCountArray.canonMask)/(KCountArray.canonMask+1);
+ }else{
+ a/=2;
+ b/=2;
+ }
+ estUnique=((long)((a+b)));
+ outstream.println("Estimated kmers of depth 1-"+lim1+": \t"+(long)a);
+ outstream.println("Estimated kmers of depth "+lim2+"+ : \t"+(long)b);
+ }else{
+// double est=kca.cells*(1-Math.pow(1-Math.sqrt(kca.usedFraction()), 1.0/hashes));
+// double est=kca.cells*(1-Math.pow(1-kca.usedFraction(), 1.0/hashes));
+ double est=kca.estimateUniqueKmers(hashes);
+// outstream.println("Used cells: "+kca.cellsUsed(1));
+ if(CANONICAL){
+// est=(est*KCountArray.canonMask)/(KCountArray.canonMask+1);
+ }else{
+ est/=2;
+ }
+ estUnique=((long)((est)));
+
+ }
+ outstream.println("Estimated unique kmers: \t"+estUnique);//+", or "+estUnique+" counting forward kmers only.");
+// outstream.println("(Includes forward and reverse kmers)");
+ outstream.println();
+ outstream.println("Table creation time:\t\t"+ht);//+" \t"+String.format("%.2f", totalBases*1000000.0/(ht.elapsed))+" kb/sec");
+
+ ListNum.setDeterministicRandom(DETERMINISTIC);
+
+ long bases=0;
+ if(COUNTUP){
+ COUNTUP=false;
+
+ int td0=TARGET_DEPTH, md0=MIN_DEPTH, mxd0=MAX_DEPTH, mkomd0=MIN_KMERS_OVER_MIN_DEPTH;
+ TARGET_DEPTH=TARGET_DEPTH*4;
+ MIN_DEPTH=MIN_DEPTH/2;
+ MAX_DEPTH=MAX_DEPTH*4;
+ MIN_KMERS_OVER_MIN_DEPTH=MIN_KMERS_OVER_MIN_DEPTH/2;
+
+ int rnd=(int)(100+Math.random()*1000000);
+ final String tempOutPrefix1=getTempPrefix(in1, outKeep1, rnd, 1);
+ final String tempOut1=getTempOut(outKeep1, tempOutPrefix1);
+ final String tempOutPrefix2=(outKeep2==null ? null : getTempPrefix(in1, outKeep2, rnd, 2));
+ final String tempOut2=(outKeep2==null ? null : getTempOut(outKeep2, tempOutPrefix2));
+ ArrayList<Read> storage=new ArrayList<Read>();
+
+ if(in1!=null && in1.contains(",") && !new File(in1).exists()){
+ String[] list1=in1.split(",");
+ String[] list2=(in2==null ? null : in2.split(","));
+ bases+=count(list1, list2, kca, k, maxReads, null, null, null, null, null, null,
+ null, null, null, null, null, null, false, overwrite, null, null, null, estUnique, storage);
+ }else{
+ bases+=count(in1, in2, kca, k, maxReads, null, null, null, null, null, null,
+ null, null, null, null, null, null, false, overwrite, null, null, null, estUnique, storage);
+ }
+ inMemorySort(storage, tempOut1, false);
+ storage=null;
+ in1=tempOut1;
+ in2=null;
+
+ TARGET_DEPTH=td0;
+ MIN_DEPTH=md0;
+ MAX_DEPTH=mxd0;
+ MIN_KMERS_OVER_MIN_DEPTH=mkomd0;
+
+ COUNTUP=true;
+
+
+ if(in1!=null && in1.contains(",") && !new File(in1).exists()){
+ String[] list1=in1.split(",");
+ String[] list2=(in2==null ? null : in2.split(","));
+ bases+=count(list1, list2, kca, k, maxReads, outKeep1, outToss1, outLow1, outMid1, outHigh1, outUnc1,
+ outKeep2, outToss2, outLow2, outMid2, outHigh2, outUnc2, ordered, overwrite, khistFile, rhistFile, peakFile, estUnique, null);
+ }else{
+ bases+=count(in1, in2, kca, k, maxReads, outKeep1, outToss1, outLow1, outMid1, outHigh1, outUnc1,
+ outKeep2, outToss2, outLow2, outMid2, outHigh2, outUnc2, ordered, overwrite, khistFile, rhistFile, peakFile, estUnique, null);
+ }
+
+ }else{
+
+
+ if(in1!=null && in1.contains(",") && !new File(in1).exists()){
+ String[] list1=in1.split(",");
+ String[] list2=(in2==null ? null : in2.split(","));
+ bases+=count(list1, list2, kca, k, maxReads, outKeep1, outToss1, outLow1, outMid1, outHigh1, outUnc1,
+ outKeep2, outToss2, outLow2, outMid2, outHigh2, outUnc2, ordered, overwrite, khistFile, rhistFile, peakFile, estUnique, null);
+ }else{
+ bases+=count(in1, in2, kca, k, maxReads, outKeep1, outToss1, outLow1, outMid1, outHigh1, outUnc1,
+ outKeep2, outToss2, outLow2, outMid2, outHigh2, outUnc2, ordered, overwrite, khistFile, rhistFile, peakFile, estUnique, null);
+ }
+ }
+
+ if(ANALYZE_TOPOLOGY){printTopology();}
+
+ t.stop();
+// outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec");
+ return bases;
+ }
+
+
+ /**
+  * Prints to stderr the number and percentage of spikes, peaks, valleys, slopes
+  * and flats accumulated in the corresponding static counters.
+  * Each counter is read once, so the percentages and the printed counts come from
+  * the same values. When every counter is zero the percentages print as 0.000
+  * instead of NaN.
+  */
+ public static void printTopology(){
+     final long sp=spikes.get();
+     final long pe=peaks.get();
+     final long va=valleys.get();
+     final long sl=slopes.get();
+     final long fl=flats.get();
+
+     final long total=pe+sp+fl+va+sl;
+     //Avoid 100.0/0 (Infinity), which would turn every percentage into NaN
+     final double mult=(total>0 ? 100.0/total : 0.0);
+
+     final double dsp=mult*sp;
+     final double dpe=mult*pe;
+     final double dva=mult*va;
+     final double dsl=mult*sl;
+     final double dfl=mult*fl;
+
+     System.err.println("\nDepth Topology:\t");
+     System.err.println("Spikes: \t\t\t"+(dsp<10 ? " " : "")+String.format("%.3f%% \t%d",dsp,sp));
+     System.err.println("Peaks: \t\t\t"+(dpe<10 ? " " : "")+String.format("%.3f%% \t%d",dpe,pe));
+     System.err.println("Valleys: \t\t\t"+(dva<10 ? " " : "")+String.format("%.3f%% \t%d",dva,va));
+     System.err.println("Slopes: \t\t\t"+(dsl<10 ? " " : "")+String.format("%.3f%% \t%d",dsl,sl));
+     System.err.println("Flats: \t\t\t"+(dfl<10 ? " " : "")+String.format("%.3f%% \t%d",dfl,fl));
+ }
+
+
+ /**
+  * Processes one input (single file, or a pair of files) against an existing kmer
+  * count table: opens the input stream, opens one output stream per non-null
+  * output category, delegates the work to downsample(), and closes all streams.
+  * A failure reported while closing is OR-ed into errorState.
+  *
+  * @param in1 Primary input file
+  * @param in2 Secondary input file (may be null)
+  * @param kca Kmer count table passed to downsample()
+  * @param k Kmer length
+  * @param maxReads Passed to the input stream factory and to downsample()
+  * @param outKeep1 Output names for read 1, one per category (keep/toss/low/mid/high/unc);
+  *        a null name disables that category; the first "#" in a name is replaced by 1
+  * @param outKeep2 Output names for read 2; when null and the input is paired, the read-2
+  *        name is derived from a "#" in the read-1 name, otherwise output is interleaved
+  * @param ordered Passed to the output formats; also selects the output buffer count
+  * @param overwrite Passed to the output formats and to downsample()
+  * @param khistFile Histogram and peak file names (khistFile, rhistFile, peakFile) passed to downsample()
+  * @param estUnique Estimated unique kmer count passed to downsample()
+  * @param storage Optional read list passed to downsample() (may be null)
+  * @return The value returned by downsample()
+  */
+ public static long count(String in1, String in2, KCountArray kca, int k, long maxReads,
+ String outKeep1, String outToss1, String outLow1, String outMid1, String outHigh1, String outUnc1,
+ String outKeep2, String outToss2, String outLow2, String outMid2, String outHigh2, String outUnc2,
+ boolean ordered, boolean overwrite, String khistFile, String rhistFile, String peakFile, long estUnique, ArrayList<Read> storage) {
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+         if(verbose){System.err.println("Started cris");}
+         cris.start(); //4567
+     }
+     final boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     final ConcurrentReadOutputStream rosKeep=openCountOutput(outKeep1, outKeep2, in1, in2, paired, ordered, overwrite);
+     final ConcurrentReadOutputStream rosToss=openCountOutput(outToss1, outToss2, in1, in2, paired, ordered, overwrite);
+     final ConcurrentReadOutputStream rosLow=openCountOutput(outLow1, outLow2, in1, in2, paired, ordered, overwrite);
+     final ConcurrentReadOutputStream rosMid=openCountOutput(outMid1, outMid2, in1, in2, paired, ordered, overwrite);
+     final ConcurrentReadOutputStream rosHigh=openCountOutput(outHigh1, outHigh2, in1, in2, paired, ordered, overwrite);
+     final ConcurrentReadOutputStream rosUnc=openCountOutput(outUnc1, outUnc2, in1, in2, paired, ordered, overwrite);
+
+     long bases=downsample(cris, kca, k, maxReads, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc, khistFile, rhistFile, peakFile, overwrite, estUnique, storage);
+
+     errorState|=ReadWrite.closeStreams(cris, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc);
+     if(verbose){System.err.println("Closed streams");}
+
+     return bases;
+ }
+
+ /**
+  * Opens and starts the output stream for one output category of count().
+  * @param outA Read-1 output name, or null to disable this category
+  * @param outB Read-2 output name (may be null)
+  * @param in1 Primary input name; outputs are asserted not to equal it
+  * @param in2 Secondary input name (may be null); outputs are asserted not to equal it
+  * @param paired Whether the input stream is paired
+  * @param ordered Passed to the output formats; also selects the buffer count
+  * @param overwrite Passed to the output formats
+  * @return The started stream, or null when outA is null
+  */
+ private static ConcurrentReadOutputStream openCountOutput(String outA, String outB, String in1, String in2,
+         boolean paired, boolean ordered, boolean overwrite){
+     if(outA==null){return null;}
+     final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS));
+
+     String out1=outA.replaceFirst("#", "1");
+     String out2=outB;
+
+     if(paired && out2==null){
+         if(outA.contains("#")){
+             out2=outA.replaceFirst("#", "2");
+         }else{
+             outstream.println("Writing interleaved.");
+         }
+     }
+
+     //Guard against writing over either input; equalsIgnoreCase(null) is false, so a null in2 is fine
+     assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2));
+     assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2)));
+
+     FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, ordered);
+     FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, ordered);
+     final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
+     ros.start();
+     outstream.println("Started output threads.");
+     return ros;
+ }
+
+
+ /**
+  * Processes a list of inputs against an existing kmer count table.
+  * For every index x, the input list1[x] (paired with list2[x] when present) is run
+  * through downsample(). Each output category is either a single file shared by
+  * all inputs or a comma-delimited list indexed by x; with a list, the previous
+  * stream is closed and a new one opened for each input, and with a single file
+  * the stream opened for the first input is reused after resetNextListID().
+  *
+  * @param list1 Primary input files
+  * @param list2 Secondary input files (may be null or shorter than list1)
+  * @param kca Kmer count table passed to downsample()
+  * @param k Kmer length
+  * @param maxReads Passed to the input stream factory and to downsample()
+  * @param outKeep1 Read-1 output names per category (keep/toss/low/mid/high/unc); null disables
+  *        the category; a name is split on commas unless a file with that exact name exists
+  * @param outKeep2 Read-2 output names per category; when null, read-2 names are derived from
+  *        "#" in the read-1 names
+  * @param ordered Passed to the output formats; also selects the output buffer count
+  * @param overwrite Passed to the output formats and to downsample()
+  * @param khistFile Histogram and peak file names (khistFile, rhistFile, peakFile) passed to downsample()
+  * @param estUnique Estimated unique kmer count passed to downsample()
+  * @param storage Optional read list passed to downsample() (may be null)
+  * @return Sum of the values returned by downsample() over all inputs
+  */
+ public static long count(String[] list1, String[] list2, KCountArray kca, int k, long maxReads,
+ String outKeep1, String outToss1, String outLow1, String outMid1, String outHigh1, String outUnc1,
+ String outKeep2, String outToss2, String outLow2, String outMid2, String outHigh2, String outUnc2,
+ boolean ordered, boolean overwrite, String khistFile, String rhistFile, String peakFile, long estUnique, ArrayList<Read> storage) {
+
+     ConcurrentReadOutputStream rosKeep=null, rosToss=null, rosLow=null, rosMid=null, rosHigh=null, rosUnc=null;
+
+     final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS));
+
+     //Index 0 holds the read-1 names, index 1 the read-2 names; both null when the category is off
+     final String[][] keepNames=splitListOutputNames(outKeep1, outKeep2);
+     final String[][] tossNames=splitListOutputNames(outToss1, outToss2);
+     final String[][] lowNames=splitListOutputNames(outLow1, outLow2);
+     final String[][] midNames=splitListOutputNames(outMid1, outMid2);
+     final String[][] highNames=splitListOutputNames(outHigh1, outHigh2);
+     final String[][] uncNames=splitListOutputNames(outUnc1, outUnc2);
+
+     long bases=0;
+
+     for(int x=0; x<list1.length; x++){
+
+         rosKeep=advanceListOutput(rosKeep, keepNames[0], keepNames[1], x, buff, ordered, overwrite);
+         rosToss=advanceListOutput(rosToss, tossNames[0], tossNames[1], x, buff, ordered, overwrite);
+         rosLow=advanceListOutput(rosLow, lowNames[0], lowNames[1], x, buff, ordered, overwrite);
+         rosMid=advanceListOutput(rosMid, midNames[0], midNames[1], x, buff, ordered, overwrite);
+         rosHigh=advanceListOutput(rosHigh, highNames[0], highNames[1], x, buff, ordered, overwrite);
+         rosUnc=advanceListOutput(rosUnc, uncNames[0], uncNames[1], x, buff, ordered, overwrite);
+
+         String in1=list1[x];
+         String in2=(list2==null || list2.length<=x ? null : list2[x]);
+
+         final ConcurrentReadInputStream cris;
+         {
+             FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+             FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+             cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+             if(verbose){System.err.println("Started cris");}
+             cris.start(); //4567
+         }
+         boolean paired=cris.paired();
+         if(verbose){System.err.println("Paired: "+paired);}
+
+         bases+=downsample(cris, kca, k, maxReads, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc, khistFile, rhistFile, peakFile, overwrite, estUnique, storage);
+
+         errorState|=ReadWrite.closeStream(cris);
+         if(verbose){System.err.println("Closed stream");}
+
+     }
+
+     errorState|=ReadWrite.closeStreams(null, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc);
+
+     return bases;
+ }
+
+ /**
+  * Expands the two output names of one category into per-input name arrays.
+  * A name is split on commas unless a file with that exact name exists. When no
+  * read-2 name is given, each read-1 entry containing "#" yields a read-2 entry
+  * with "#" replaced by 2 and is itself rewritten with "#" replaced by 1; entries
+  * without "#" leave the read-2 slot null.
+  * @param out1 Read-1 output name(s), or null when the category is disabled
+  * @param out2 Read-2 output name(s) (may be null); ignored when out1 is null
+  * @return {read-1 names, read-2 names}; both elements are null when out1 is null
+  */
+ private static String[][] splitListOutputNames(String out1, String out2){
+     if(out1==null){return new String[][] {null, null};}
+     final String[] names1=(!new File(out1).exists() ? out1.split(",") : new String[] {out1});
+     final String[] names2;
+     if(out2!=null){
+         names2=(!new File(out2).exists() ? out2.split(",") : new String[] {out2});
+     }else{
+         names2=new String[names1.length];
+         for(int i=0; i<names1.length; i++){
+             if(names1[i].contains("#")){
+                 names2[i]=names1[i].replaceFirst("#", "2");
+                 names1[i]=names1[i].replaceFirst("#", "1");
+             }
+         }
+     }
+     return new String[][] {names1, names2};
+ }
+
+ /**
+  * Prepares the output stream of one category for input number x.
+  * For the first input, or whenever the category has more than one output name,
+  * any current stream is closed and joined and a new stream for names1[x]/names2[x]
+  * is opened and started. Otherwise the current stream is kept and
+  * resetNextListID() is called on it.
+  * @param ros Current stream of the category (null before the first input)
+  * @param names1 Read-1 output names, or null when the category is disabled
+  * @param names2 Read-2 output names (entries may be null)
+  * @param x Index of the input about to be processed
+  * @param buff Buffer count passed to the stream factory
+  * @param ordered Passed to the output formats
+  * @param overwrite Passed to the output formats
+  * @return The stream to use for input x; ros unchanged when the category is disabled
+  */
+ private static ConcurrentReadOutputStream advanceListOutput(ConcurrentReadOutputStream ros, String[] names1, String[] names2,
+         int x, int buff, boolean ordered, boolean overwrite){
+     if(names1==null){return ros;}
+     if(x==0 || names1.length>1){
+         if(ros!=null){
+             ros.close();
+             ros.join();
+         }
+
+         FileFormat ff1=FileFormat.testOutput(names1[x], FileFormat.FASTQ, null, true, overwrite, append, ordered);
+         FileFormat ff2=FileFormat.testOutput(names2[x], FileFormat.FASTQ, null, true, overwrite, append, ordered);
+         ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
+
+         ros.start();
+         outstream.println("Started output threads.");
+     }else{
+         ros.resetNextListID();
+     }
+     return ros;
+ }
+
+
+
+ /**
+  * Runs THREADS ProcessThread workers over the reads from cris, sums their per-thread counters,
+  * prints a summary to outstream and, when enabled, writes the kmer-depth histogram, the peak
+  * file and the read-depth histogram.
+  * @param cris Input stream handed to every ProcessThread; also queried for paired status when reporting
+  * @param kca Kmer count array handed to every ProcessThread
+  * @param k Kmer length
+  * @param maxReads Not used inside this method
+  * @param rosKeep Output stream handed to every ProcessThread
+  * @param rosToss Output stream handed to every ProcessThread
+  * @param rosLow Output stream handed to every ProcessThread; low-bin statistics are printed only when non-null
+  * @param rosMid Output stream handed to every ProcessThread; mid-bin statistics are printed only when non-null
+  * @param rosHigh Output stream handed to every ProcessThread; high-bin statistics are printed only when non-null
+  * @param rosUnc Output stream handed to every ProcessThread
+  * @param khistFile Destination of the kmer-depth histogram (written only if USE_KHISTOGRAM and non-null)
+  * @param rhistFile Destination of the read-depth histogram (written only if USE_RHISTOGRAM and non-null)
+  * @param peakFile Destination passed to CallPeaks.printPeaks (skipped when null)
+  * @param overwrite Passed through to the histogram and peak writers
+  * @param estUnique Externally estimated unique kmer count, compared against the counted value
+  * @param storage Passed through to every ProcessThread
+  * @return Sum of totalBases over all worker threads
+  */
+ public static long downsample(ConcurrentReadInputStream cris, KCountArray kca, int k, long maxReads,
+         ConcurrentReadOutputStream rosKeep, ConcurrentReadOutputStream rosToss, ConcurrentReadOutputStream rosLow, ConcurrentReadOutputStream rosMid, ConcurrentReadOutputStream rosHigh, ConcurrentReadOutputStream rosUnc,
+         String khistFile, String rhistFile, String peakFile, boolean overwrite, long estUnique, ArrayList<Read> storage) {
+     Timer tdetect=new Timer();
+     tdetect.start();
+
+     long totalBases=0;
+     long totalReads=0;
+
+     long readsKept=0;
+     long readsTossed=0;
+     long readsLowBin=0;
+     long readsMidBin=0;
+     long readsHighBin=0;
+     long readsUncorrected=0;
+     long basesKept=0;
+     long basesTossed=0;
+     long basesLowBin=0;
+     long basesMidBin=0;
+     long basesHighBin=0;
+     long basesUncorrected=0;
+
+     long errorReads=0;
+     long errorPairs=0;
+     long errorType1=0;
+     long errorType2=0;
+     long errorType3=0;
+
+     long errorsDetected=0;
+     long errorsMarked=0;
+     long errorsCorrected=0;
+     long basesTrimmed=0;
+
+     {
+         //Optional second count array, sized from TARGET_DEPTH and FILTERBYTES, used only in COUNTUP mode.
+         KCountArray kcaup=null;
+         if(COUNTUP){
+             final int bits;
+             if(TARGET_DEPTH<=15){
+                 bits=4;
+             }else if(TARGET_DEPTH<=255){
+                 bits=8;
+             }else{
+                 bits=16;
+             }
+
+             long cells=(FILTERBYTES*8)/bits;
+             int kbits=2*k;
+             kcaup=KCountArray.makeNew(1L<<kbits, cells, bits, 0, 3, null, 0);
+         }
+
+         ProcessThread[] pta=new ProcessThread[THREADS];
+         for(int i=0; i<pta.length; i++){
+             pta[i]=new ProcessThread(cris, kca, kcaup, k, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc, storage);
+             pta[i].start();
+         }
+
+         //Drop the local references; the worker threads hold their own.
+         kca=kcaup=null;
+
+         //Wait for each worker in turn and fold its counters into the totals.
+         for(int i=0; i<pta.length; i++){
+             ProcessThread ct=pta[i];
+             synchronized(ct){
+                 while(ct.getState()!=State.TERMINATED){
+                     try {
+                         ct.join(1000);
+                     } catch (InterruptedException e) {
+                         //Best effort: report the interruption and keep waiting for the worker.
+                         e.printStackTrace();
+                     }
+                 }
+                 totalBases+=ct.totalBases;
+                 totalReads+=ct.totalReads;
+                 errorReads+=ct.errorReads;
+                 errorPairs+=ct.errorPairs;
+                 errorType1+=ct.errorType1;
+                 errorType2+=ct.errorType2;
+                 errorType3+=ct.errorType3;
+
+                 readsKept+=ct.readsKept;
+                 readsTossed+=ct.readsTossed;
+                 readsLowBin+=ct.readsLowBin;
+                 readsMidBin+=ct.readsMidBin;
+                 readsHighBin+=ct.readsHighBin;
+                 readsUncorrected+=ct.readsUncorrected;
+
+                 basesKept+=ct.basesKept;
+                 basesTossed+=ct.basesTossed;
+                 basesLowBin+=ct.basesLowBin;
+                 basesMidBin+=ct.basesMidBin;
+                 basesHighBin+=ct.basesHighBin;
+                 basesUncorrected+=ct.basesUncorrected;
+
+                 errorsDetected+=ct.errorsDetected;
+                 errorsMarked+=ct.errorsMarked;
+                 errorsCorrected+=ct.errorsCorrected;
+                 basesTrimmed+=ct.basesTrimmed;
+
+                 for(int j=0; j<ct.hist.length; j++){
+                     khistogram.addAndGet(j, ct.hist[j]);
+                 }
+                 for(int j=0; j<ct.qhist.length; j++){
+                     qhist_total[j]+=ct.qhist[j];
+                 }
+             }
+         }
+     }
+
+     //Unless a zero bin was requested, fold bin 0 into bin 1.
+     if(!ZERO_BIN && khistogram!=null && khistogram.length()>1){
+         khistogram.addAndGet(1, khistogram.get(0));
+         khistogram.set(0, 0);
+     }
+
+     tdetect.stop();
+     outstream.println("Table read time: \t\t"+tdetect+" \t"+String.format("%.2f", totalBases*1000000.0/(tdetect.elapsed))+" kb/sec");
+
+     {
+         //Each count is right-padded to 9 characters; pad is rebuilt for every line so that a
+         //short number following a long one is still padded correctly.
+         String pad="";
+         String s=""+totalReads;
+         while(pad.length()+s.length()<9){pad+=" ";}
+         outstream.println("Total reads in: \t\t"+totalReads+pad+String.format("\t%.3f%% Kept", (readsKept*100.0/totalReads)));
+         pad=""; s=""+totalBases;
+         while(pad.length()+s.length()<9){pad+=" ";}
+         outstream.println("Total bases in: \t\t"+totalBases+pad+String.format("\t%.3f%% Kept", (basesKept*100.0/totalBases)));
+
+         if(rosLow!=null){
+             pad=""; s=""+readsLowBin;
+             while(pad.length()+s.length()<9){pad+=" ";}
+             outstream.println("Low bin reads: \t\t"+readsLowBin+pad+String.format("\t%.3f%%", (readsLowBin*100.0/totalReads)));
+             pad=""; s=""+basesLowBin;
+             while(pad.length()+s.length()<9){pad+=" ";}
+             outstream.println("Low bin bases: \t\t"+basesLowBin+pad+String.format("\t%.3f%%", (basesLowBin*100.0/totalBases)));
+         }
+         if(rosMid!=null){
+             pad=""; s=""+readsMidBin;
+             while(pad.length()+s.length()<9){pad+=" ";}
+             outstream.println("Mid bin reads: \t\t"+readsMidBin+pad+String.format("\t%.3f%%", (readsMidBin*100.0/totalReads)));
+             pad=""; s=""+basesMidBin;
+             while(pad.length()+s.length()<9){pad+=" ";}
+             outstream.println("Mid bin bases: \t\t"+basesMidBin+pad+String.format("\t%.3f%%", (basesMidBin*100.0/totalBases)));
+         }
+         if(rosHigh!=null){
+             pad=""; s=""+readsHighBin;
+             while(pad.length()+s.length()<9){pad+=" ";}
+             outstream.println("High bin reads: \t\t"+readsHighBin+pad+String.format("\t%.3f%%", (readsHighBin*100.0/totalReads)));
+             pad=""; s=""+basesHighBin;
+             while(pad.length()+s.length()<9){pad+=" ";}
+             outstream.println("High bin bases: \t\t"+basesHighBin+pad+String.format("\t%.3f%%", (basesHighBin*100.0/totalBases)));
+         }
+
+         pad=""; s=""+errorReads;
+         while(pad.length()+s.length()<9){pad+=" ";}
+         outstream.println("Error reads in: \t\t"+errorReads+pad+String.format("\t%.3f%%", (errorReads*100.0/totalReads)));
+         if(cris.paired()){
+             pad=""; s=""+errorPairs;
+             while(pad.length()+s.length()<9){pad+=" ";}
+             //200.0 rather than 100.0 because a pair accounts for two of the counted reads.
+             outstream.println("Error pairs in: \t\t"+errorPairs+pad+String.format("\t%.3f%%", (errorPairs*200.0/totalReads)));
+         }
+         pad=""; s=""+errorType1;
+         while(pad.length()+s.length()<9){pad+=" ";}
+         outstream.println("Error type 1: \t\t"+errorType1+pad+String.format("\t%.3f%%", (errorType1*100.0/totalReads)));
+         pad=""; s=""+errorType2;
+         while(pad.length()+s.length()<9){pad+=" ";}
+         outstream.println("Error type 2: \t\t"+errorType2+pad+String.format("\t%.3f%%", (errorType2*100.0/totalReads)));
+         pad=""; s=""+errorType3;
+         while(pad.length()+s.length()<9){pad+=" ";}
+         outstream.println("Error type 3: \t\t"+errorType3+pad+String.format("\t%.3f%%", (errorType3*100.0/totalReads)));
+
+         if(TRIM_LEFT_THIS_PASS || TRIM_RIGHT_THIS_PASS){
+             outstream.println("\nDuring Trimming:");
+             outstream.println("Bases Trimmed: \t\t"+(basesTrimmed));
+         }
+
+         if(CORRECT_ERRORS_THIS_PASS){
+             outstream.println("\nDuring Error Correction:");
+             outstream.println("Errors Suspected:\t\t"+(errorsDetected+errorsCorrected+errorsMarked));
+             outstream.println("Errors Corrected:\t\t"+errorsCorrected);
+             outstream.println("Errors Marked: \t\t"+errorsMarked+"\n");
+         }
+     }
+
+     if(khistogram!=null){
+
+         if(peakFile!=null){
+             CallPeaks.printClass=false;
+             long[] array=Tools.toArray(khistogram);
+             for(int i=0; i<array.length; i++){
+                 long x=array[i];
+                 long y=((x+i/2)/(i<1 ? 1 : i)); //x+i/2 rounds to compensate for colliding kmers being put in an overly high bin
+                 array[i]=y;
+             }
+
+             ArrayList<String> args=new ArrayList<String>();
+             args.add("smoothradius=1");
+             args.add("smoothprogressive=t");
+             CallPeaks.printPeaks(array, peakFile, overwrite, minHeight, minVolume, minWidth, minPeak, maxPeak, maxPeakCount, k, ploidy, args);
+         }
+
+         ByteStreamWriter bsw=null;
+         if(USE_KHISTOGRAM && khistFile!=null){
+             bsw=new ByteStreamWriter(khistFile, overwrite, false, false);
+             bsw.start();
+
+             if(HIST_COLUMNS==1){
+                 //NOTE(review): "#tUnique_Kmers" looks like a typo for "#Unique_Kmers"; left unchanged - confirm.
+                 bsw.print("#tUnique_Kmers\n");
+             }else if(HIST_COLUMNS==2){
+                 bsw.print("#Depth\tUnique_Kmers\n");
+             }else if(HIST_COLUMNS==3){
+                 bsw.print("#Depth\tRaw_Count\tUnique_Kmers\n");
+             }
+
+         }
+         //Bins below lim are printed individually; bins from lim upward are summed into one final row.
+         int lim=(int)(HIST_LEN_PRINT-1);
+         long remaining=Tools.sum(khistogram);
+         long sumRaw1=0;
+         long sumRaw2=0;
+         long sum1=0;
+         long sum2=0;
+         long sumsquare=0;
+         for(int i=0; i<lim; i++){
+             long x=khistogram.get(i);
+             long y=((x+i/2)/(i<1 ? 1 : i)); //x+i/2 rounds to compensate for colliding kmers being put in an overly high bin
+             sumRaw1+=x;
+             sum1+=y;
+             sumsquare+=(x*Tools.max(1, i));
+             if(bsw!=null){
+                 if(PRINT_ZERO_COVERAGE /*|| x>0*/ || y>0 || HIST_COLUMNS==1){
+                     if(HIST_COLUMNS>1){
+                         bsw.print(i);
+                         bsw.print('\t');
+                     }
+                     if(HIST_COLUMNS==3){
+                         bsw.print(x);
+                         bsw.print('\t');
+                     }
+                     bsw.print(y);
+                     bsw.print('\n');
+                 }
+             }
+             if(sumRaw1>=remaining){break;} //Stop once there is no more coverage, even if PRINT_ZERO_COVERAGE is not set.
+         }
+         for(int i=lim; i<khistogram.length(); i++){
+             long x=khistogram.get(i);
+             sumRaw2+=x;
+             long y=((x+i/2)/(i<1 ? 1 : i)); //x+i/2 rounds to compensate for colliding kmers being put in an overly high bin
+             sum2+=y;
+         }
+         if(bsw!=null){
+             if(sumRaw2>0 || sum2>0){
+                 if(HIST_COLUMNS>1){
+                     bsw.print(lim);
+                     bsw.print('\t');
+                 }
+                 if(HIST_COLUMNS==3){
+                     bsw.print(sumRaw2);
+                     bsw.print('\t');
+                 }
+                 bsw.print(sum2);
+                 bsw.print('\n');
+             }
+             bsw.poisonAndWait();
+             outstream.println("\nWrote histogram to "+khistFile);
+         }
+
+         long histCount=Tools.sum(khistogram); //Total number of kmers counted
+         long halfCount=(histCount+1)/2;
+         double histCountU=0; //Unique kmers counted
+         long temp1=0;
+         double temp2=0;
+         int median_all=-1;
+         int median_unique=-1;
+         for(int i=0; i<khistogram.length(); i++){
+             long x=khistogram.get(i);
+             temp1+=x;
+             if(temp1>=halfCount && median_all<0){median_all=i;}
+             histCountU+=(x/(double)Tools.max(1, i));
+         }
+         double halfCount2=(histCountU)/2;
+         for(int i=0; i<khistogram.length(); i++){
+             long x=khistogram.get(i);
+             temp2+=(x/Tools.max(i, 1.0));
+             if(temp2>=halfCount2 && median_unique<0){
+                 median_unique=i;
+                 break;
+             }
+         }
+         if(median_all<0){median_all=0;}
+         double avg_all=sumsquare/(double)histCount;
+         double avg_unique=histCount/histCountU;
+         double stdev_unique=Tools.standardDeviationHistogramKmer(khistogram);
+         double stdev_all=Tools.standardDeviationHistogram(khistogram);
+         outstream.println("Total kmers counted: \t"+(sumRaw1+sumRaw2));
+
+         double uniqueC=((sum1+sum2)*100.0/(sumRaw1+sumRaw2));
+         double uniqueE=((estUnique)*100.0/(sumRaw1+sumRaw2));
+         double uniqueM=Tools.max(uniqueC, uniqueE);
+         outstream.println("Total unique kmer count: \t"+(sum1+sum2));
+         if(CANONICAL){outstream.println("Includes forward kmers only.");}
+         outstream.println("The unique kmer estimate can be more accurate than the unique count, if the tables are very full.");
+         outstream.println("The most accurate value is the greater of the two.");
+         outstream.println();
+
+         outstream.println("Percent unique: \t"+(uniqueM<10 ? " " : "")+String.format("%.2f%%", uniqueM));
+
+         outstream.println("Depth average: \t"+String.format("%.2f\t(unique kmers)", avg_unique));
+         outstream.println("Depth median: \t"+String.format("%d\t(unique kmers)", median_unique));
+         outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(unique kmers)", stdev_unique));
+
+         outstream.println("\nDepth average: \t"+String.format("%.2f\t(all kmers)", avg_all));
+         outstream.println("Depth median: \t"+String.format("%d\t(all kmers)", median_all));
+         outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(all kmers)", stdev_all));
+
+         //Scale median kmer depth by readLength/(readLength-k+1) to approximate read depth.
+         double avgReadLen=totalBases*1.0/totalReads;
+         double readDepth=median_all*(avgReadLen/(avgReadLen-k+1));
+
+         outstream.println("\nApprox. read depth median: \t"+String.format("%.2f", (readDepth)));
+     }
+
+
+     if(rhistogram!=null){
+         TextStreamWriter tswh=null;
+         StringBuilder sb=new StringBuilder(100);
+         if(USE_RHISTOGRAM && rhistFile!=null){
+             tswh=new TextStreamWriter(rhistFile, overwrite, false, false);
+             tswh.start();
+             tswh.print("#Depth\tReads\tBases\n");
+         }
+         int lim=(int)(HIST_LEN_PRINT-1);
+         long remaining=Tools.sum(rhistogram);
+         long sumReads1=0;
+         long sumReads2=0;
+         long sumBases1=0;
+         long sumBases2=0;
+         long sumSquareReads=0;
+         long sumSquareBases=0;
+         for(int i=0; i<lim; i++){
+             final long x=rhistogram.get(i);
+             final long y=bhistogram.get(i);
+             sumReads1+=x;
+             sumBases1+=y;
+             sumSquareReads+=(x*Tools.max(1, i));
+             sumSquareBases+=(y*Tools.max(1, i));
+             if(tswh!=null){
+                 if(PRINT_ZERO_COVERAGE /*|| x>0*/ || y>0){
+                     sb.append(i).append('\t');
+                     sb.append(x).append('\t');
+                     sb.append(y).append('\n');
+                 }
+                 tswh.print(sb.toString());
+                 sb.setLength(0);
+             }
+             if(sumReads1>=remaining){break;} //Stop once there is no more coverage, even if PRINT_ZERO_COVERAGE is not set.
+         }
+         for(int i=lim; i<rhistogram.length(); i++){
+             final long x=rhistogram.get(i);
+             final long y=bhistogram.get(i);
+             sumReads2+=x;
+             sumBases2+=y;
+         }
+         if(tswh!=null){
+             if(sumReads2>0 || sumBases2>0){
+                 sb.append(lim).append('\t');
+                 sb.append(sumReads2).append('\t');
+                 sb.append(sumBases2).append('\n');
+             }
+             tswh.print(sb.toString());
+             tswh.poison();
+             tswh.waitForFinish();
+             outstream.println("\nWrote histogram to "+rhistFile);
+         }
+
+         long rhistCount=Tools.sum(rhistogram); //Total number of reads counted
+         long bhistCount=Tools.sum(bhistogram); //Total number of bases counted
+         int median_reads=-1;
+         int median_bases=-1;
+         {
+             long halfCount=(rhistCount+1)/2;
+             long temp=0;
+             for(int i=0; i<rhistogram.length(); i++){
+                 long x=rhistogram.get(i);
+                 temp+=x;
+                 if(temp>=halfCount && median_reads<0){median_reads=i;}
+             }
+             if(median_reads<0){median_reads=0;}
+         }
+         {
+             long halfCount=(bhistCount+1)/2;
+             long temp=0;
+             for(int i=0; i<bhistogram.length(); i++){
+                 long x=bhistogram.get(i);
+                 temp+=x;
+                 if(temp>=halfCount && median_bases<0){median_bases=i;}
+             }
+             if(median_bases<0){median_bases=0;}
+         }
+         double avg_reads=sumSquareReads/(double)rhistCount;
+         double avg_bases=sumSquareBases/(double)bhistCount;
+         double read_stdev_all=Tools.standardDeviationHistogram(rhistogram);
+         double base_stdev_all=Tools.standardDeviationHistogram(bhistogram);
+         outstream.println("Total reads counted: \t"+(sumReads1+sumReads2));
+
+         outstream.println("Total bases counted: \t"+(sumBases1+sumBases2));
+
+         outstream.println("Read depth average: \t"+String.format("%.2f", avg_reads));
+         outstream.println("Read depth median: \t"+String.format("%d", median_reads));
+         outstream.println("Read depth standard deviation:\t"+String.format("%.2f", read_stdev_all));
+
+         //Format string previously ended in a stray ')' that was printed after the number.
+         outstream.println("\nBase depth average: \t"+String.format("%.2f", avg_bases));
+         outstream.println("Base depth median: \t"+String.format("%d", median_bases));
+         outstream.println("Base depth standard deviation:\t"+String.format("%.2f", base_stdev_all));
+     }
+
+     return totalBases;
+ }
+
+
+
+ /**
+ * Locates and fixes spikes in a coverage profile (potentially) caused by false positives in a bloom filter.
+ * Theory: If a high-count kmer is adjacent on both sides to low-count kmers, it may be a false positive.
+ * It could either be reduced to the max of the two flanking points or examined in more detail.
+ * @param cov An array of kmer counts for adjacent kmers in a read.
+ */
+ private static void fixSpikes(int[] cov){
+
+ for(int i=1; i<cov.length-1; i++){
+ long a=Tools.max(1, cov[i-1]);
+ int b=cov[i];
+ long c=Tools.max(1, cov[i+1]);
+ if(b>1 && b>a && b>c){
+ //peak
+ if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ //spike
+ cov[i]=(int)Tools.max(a, c);
+ }
+ }
+ }
+ }
+
+ private static void fixSpikes(int[] cov, long[] kmers, KCountArray kca, final int k){
+ assert(k<32) : "this function not tested with k>31";
+ if(cov.length<3){return;}
+ if(cov[1]-cov[0]>1){
+ cov[0]=kca.readPrecise(kmers[0], k, true);
+ }
+ if(cov[cov.length-1]-cov[cov.length-2]>1){
+ cov[cov.length-1]=kca.readPrecise(kmers[cov.length-1], k, true);
+ }
+
+ for(int i=1; i<cov.length-1; i++){
+ int b=cov[i];
+ if(b>1){
+ long a=Tools.max(1, cov[i-1]);
+ long c=Tools.max(1, cov[i+1]);
+ long key=kmers[i];
+
+ if(b>a && b>c){
+ //peak
+ if(b<6 || b>a+1 || b>c+1){
+ cov[i]=kca.readPreciseMin(key, k, true);
+ }
+ // if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ // //spike
+ // int b1=(int)((a+c)/2);
+ // int b2=kca.readLeft(key, k, CANONICAL);
+ // int b3=kca.readRight(key, k, CANONICAL);
+ // array[i]=Tools.min(b, b1, b2, b3);
+ // }
+ // else
+ // {
+ //// array[i]=kca.readPreciseMin(key, k, CANONICAL);
+ // }
+ }
+ // else
+ // if(Tools.max(ada, adc)>=Tools.max(2, Tools.min((int)a, b, (int)c)/4)){
+ // array[i]=kca.readPrecise(key, k, CANONICAL);
+ // }
+ // else
+ // if(b>a+1 || b>c+1){
+ // //steep
+ // array[i]=kca.readPrecise(key, k, CANONICAL);
+ // }
+ }
+ }
+ }
+
+
+ private static int correctErrors(Read r, int[] cov, long[] kmers, KCountArray kca, final int k,
+ final int low, final int high, final int mult, int maxToCorrect, int maxQual, boolean kmersAlreadyValid, boolean coverageAlreadyValid, long[] qhist,
+ final boolean markOnly, Kmer longkmer){
+ assert(k<32) : "this function not tested with k>31";
+ assert(maxToCorrect>0) : "Don't do error correction with a maximum of 0 errors; it's a waste of time.";
+ if(maxToCorrect<1){return 0;}
+
+ if(!kmersAlreadyValid){kmers=r.toKmers(k, 0, kmers, false, longkmer);}
+ if(kmers==null || kmers.length<3){return -99;}
+
+ if(!coverageAlreadyValid){cov=generateCoverage(kca, k, cov, kmers, true);}
+
+ int disc=countDiscontinuities(cov, low, high, mult);
+
+ if(disc==0){return 0;}
+
+ byte[] copy=r.bases.clone();
+
+ byte[] suffix=new byte[SUFFIX_LEN];
+
+ int cfl=0, cfr=0;
+
+ if(CORRECT_FROM_LEFT){
+ cfl=correctErrorsFromLeft(r, cov, kmers, kca, k, low, high, mult, maxToCorrect, maxQual, suffix, qhist, markOnly, longkmer);
+ if(cfl<0){
+ //Failed correction.
+ r.bases=copy;
+ return cfl;
+ }
+ maxToCorrect-=cfl;
+ }
+
+ if(CORRECT_FROM_RIGHT && maxToCorrect>0){
+ {//Optional block - allows saving of errors corrected from left even if correctErrorsFromRight fails.
+ if(cfl>0){
+ for(int i=0; i<r.length(); i++){
+ copy[i]=r.bases[i];
+ }
+ }
+ }
+ cfr=correctErrorsFromRight(r, cov, kmers, kca, k, low, high, mult, maxToCorrect, maxQual, suffix, qhist, markOnly, longkmer);
+ if(cfr<0){
+ //Failed correction.
+ r.bases=copy;
+ return cfr;
+ }
+ }
+
+ return cfl+cfr;
+ }
+
+ /**
+  * Marks suspected errors in a read without changing called bases to other nucleotides.
+  * The directional passes flag a position by negating its quality score (or, for reads without
+  * quality, by replacing the base with N). This method then converts every negated score into
+  * its final reduced value: 1 if MARK_WITH_1, otherwise max(1, -(q/2+3)).
+  * @param r Read to mark
+  * @param cov Coverage array; regenerated unless coverageAlreadyValid
+  * @param kmers Kmer array; regenerated unless kmersAlreadyValid
+  * @param kca Kmer count array
+  * @param k Kmer length
+  * @param low Depth at or below which a position is considered an error
+  * @param high Minimum depth required of the flanking positions
+  * @param mult Depth ratio between flank and position that also signals an error
+  * @param maxToCorrect Must be positive; otherwise nothing is done
+  * @param kmersAlreadyValid True if kmers already matches the read
+  * @param coverageAlreadyValid True if cov already matches kmers
+  * @param qhist Optional histogram of qualities at marked positions; may be null
+  * @param longkmer Scratch object passed to Read.toKmers
+  * @return Number of positions marked
+  */
+ private static int markErrors(Read r, int[] cov, long[] kmers, KCountArray kca, final int k,
+         final int low, final int high, final int mult, int maxToCorrect, boolean kmersAlreadyValid, boolean coverageAlreadyValid, long[] qhist, Kmer longkmer){
+     assert(k<32) : "this function not tested with k>31";
+     assert(maxToCorrect>0) : "Don't do error correction with a maximum of 0 errors; it's a waste of time.";
+     if(maxToCorrect<1){return 0;}
+
+     if(!kmersAlreadyValid){kmers=r.toKmers(k, 0, kmers, false, longkmer);}
+     if(kmers==null || kmers.length<3){return 0;}
+
+     if(!coverageAlreadyValid){cov=generateCoverage(kca, k, cov, kmers, true);}
+
+     int disc=countDiscontinuities(cov, low, high, mult);
+
+     if(disc==0){return 0;}
+
+     int cfl=0, cfr=0;
+
+     if(CORRECT_FROM_LEFT){
+         cfl=markErrorsFromLeft(r, cov, k, low, high, mult, maxToCorrect, qhist);
+         maxToCorrect-=cfl;
+     }
+
+     if(CORRECT_FROM_RIGHT){
+         cfr=markErrorsFromRight(r, cov, k, low, high, mult, maxToCorrect, qhist);
+     }
+
+     int marked=cfl+cfr;
+     final byte[] quals=r.quality;
+     if(marked>0){
+         if(quals!=null){
+             int found=0;
+             for(int i=0; i<quals.length; i++){
+                 byte q=quals[i];
+                 if(q<0){
+                     byte q2=(byte)(MARK_WITH_1 ? 1 : Tools.max(1, -(q/2+3)));
+                     quals[i]=q2;
+                     found++;
+                 }
+             }
+             //Only meaningful when qualities exist: without them errors are marked as N bases,
+             //so there are no negated scores to count and found would stay 0.
+             assert(found==marked) : found+", "+marked;
+         }
+     }else{
+         if(quals!=null){
+             for(int i=0; i<quals.length; i++){
+                 assert(quals[i]>=0);
+             }
+         }
+         assert(marked==0) : marked;
+     }
+
+     return marked;
+ }
+
+ /** Returns number of discontinuities detected. This is not the same as the number of errors,
+ * but the presence of discontinuities indicates the presence of errors.
+ * @param cov
+ * @param low
+ * @param high
+ * @param mult
+ * @return
+ */
+ private static int countDiscontinuities(final int[] cov, final int low, final int high, final int mult){
+
+ int found=0;
+
+ for(int i=2; i<cov.length; i++){
+ int a=Tools.min(cov[i-2], cov[i-1]);
+ int b=cov[i];
+ if(a>=high && (b<=low || a>=b*mult)){//error
+ found++;
+ }
+ }
+
+ for(int i=cov.length-3; i>=0; i--){
+ int a=Tools.min(cov[i+2], cov[i+1]);
+ int b=cov[i];
+ if(a>=high && (b<=low || a>=b*mult)){//error
+ found++;
+ }
+ }
+
+ return found;
+ }
+
+
+ /**
+  * Brings kmers and cov back in sync with the read's current bases.
+  * For k>31 both arrays are rebuilt via Read.toKmers and generateCoverage. Otherwise the read is
+  * scanned with a rolling 2-bit encoding and only slots whose kmer differs from the stored one
+  * are rewritten and re-queried. Windows containing an undefined base are left untouched.
+  * @param r Read whose bases are the source of truth
+  * @param kmers Kmer array to update; expected length is bases.length-k+1
+  * @param cov Coverage array to update; same expected length as kmers
+  * @param kca Kmer count array to query; must have gap 0
+  * @param k Kmer length
+  * @param makeCanonical Store and look up the canonical form of each kmer
+  * @param longkmer Scratch object passed to Read.toKmers on the k>31 path
+  */
+ private static void regenerateKmersAndCoverage(final Read r, final long[] kmers, final int[] cov, final KCountArray kca, final int k, boolean makeCanonical,
+         Kmer longkmer){
+     assert(r!=null && kmers!=null && cov!=null && kca!=null && kca.gap==0);
+     final byte[] seq=r.bases;
+     if(seq==null || seq.length<k+kca.gap){return;}
+
+     if(k>31){
+         r.toKmers(k, 0, kmers, false, longkmer);
+         generateCoverage(kca, k, cov, kmers, true);
+         return;
+     }
+
+     final long mask=~((-1L)<<(2*k));
+     final int windows=seq.length-k+1;
+     assert(kmers.length==windows && cov.length==windows);
+
+     long rolling=0; //2-bit encoding of the most recent bases
+     int run=0;      //number of consecutive defined bases ending at the current position
+     for(int end=0; end<seq.length; end++){
+         final int code=AminoAcid.baseToNumber[seq[end]];
+         if(code<0){
+             //Undefined base: restart the window.
+             run=0;
+             rolling=0;
+             continue;
+         }
+         rolling=((rolling<<2)|code)&mask;
+         run++;
+         if(run<k){continue;}
+
+         final int slot=end-k+1;
+         final long key=(makeCanonical ? KCountArray.makeCanonical2(rolling, k) : rolling);
+         if(kmers[slot]!=key){
+             kmers[slot]=key;
+             cov[slot]=kca.read(key, k, !makeCanonical);
+         }
+     }
+ }
+
+
+ /**
+  * Scans coverage left to right and handles each position where depth drops sharply relative
+  * to the PREFIX_LEN positions before it. The suspect base is the last base of the kmer at
+  * that position. With markOnly the base is flagged (quality reduced to max(1, q/2-3), or the
+  * base set to N when there are no qualities); otherwise a correction is attempted and, on
+  * success, kmers and cov are refreshed.
+  * @return Number of errors marked or corrected, or the negated number of errors detected so
+  * far when the limit or quality cap is exceeded or a correction fails
+  */
+ private static int correctErrorsFromLeft(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k,
+         final int low, final int high, final int mult, final int maxToCorrect, int maxQual, final byte[] suffix, final long[] qhist, boolean markOnly,
+         Kmer longkmer){
+
+     final byte[] quals=r.quality;
+     int detected=0;
+     int fixed=0;
+
+     for(int pos=PREFIX_LEN; pos<cov.length; pos++){
+         final int flank=Tools.min(cov, pos-PREFIX_LEN, pos-1);
+         final int here=cov[pos];
+         final boolean isError=(flank>=high && (here<=low || flank>=here*mult));
+         if(!isError){continue;}
+
+         detected++;
+         final int baseIdx=pos+k-1;
+         final byte q=(quals==null ? 10 : quals[baseIdx]);
+         if(qhist!=null){qhist[q]++;}
+
+         if(markOnly){
+             fixed++;
+             if(quals==null){
+                 r.bases[baseIdx]='N';
+             }else if(q>0){
+                 quals[baseIdx]=(byte)Tools.max(1, q/2-3);
+             }
+             continue;
+         }
+
+         //Too many errors, or the base is too confident to be overridden.
+         if(detected>maxToCorrect || q>maxQual){return -detected;}
+
+         final boolean ok=correctErrorFromLeft(r, cov, kmers, kca, k, low, Tools.max(high, flank/2), 2*flank, mult, pos, suffix);
+         if(!ok){return -detected;}
+
+         fixed++;
+         regenerateKmersAndCoverage(r, kmers, cov, kca, k, false, longkmer);
+     }
+
+     return fixed;
+ }
+
+
+ /**
+  * Scans coverage right to left and handles each position where depth drops sharply relative
+  * to the PREFIX_LEN positions after it. The suspect base is the first base of the kmer at
+  * that position. With markOnly the scan starts no further right than k-1 and the base is
+  * flagged (quality reduced to max(1, q/2-3), or the base set to N when there are no
+  * qualities); otherwise a correction is attempted and, on success, kmers and cov are refreshed.
+  * @return Number of errors marked or corrected, or the negated number of errors detected so
+  * far when the limit or quality cap is exceeded or a correction fails
+  */
+ private static int correctErrorsFromRight(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k,
+         final int low, final int high, final int mult, final int maxToCorrect, int maxQual, final byte[] suffix, final long[] qhist, final boolean markOnly,
+         Kmer longkmer){
+
+     final byte[] quals=r.quality;
+     int detected=0;
+     int fixed=0;
+
+     final int start;
+     if(markOnly){
+         start=Tools.min(cov.length-PREFIX_LEN-1, k-1);
+     }else{
+         start=cov.length-PREFIX_LEN-1;
+     }
+
+     for(int pos=start; pos>=0; pos--){
+         final int flank=Tools.min(cov, pos+1, pos+PREFIX_LEN);
+         final int here=cov[pos];
+         //Skip unless the flank is well supported and this position is low or far below it.
+         if(flank<high || (here>low && flank<here*mult)){continue;}
+
+         detected++;
+         final byte q=(quals==null ? 10 : quals[pos]);
+         if(qhist!=null){qhist[q]++;}
+
+         if(markOnly){
+             fixed++;
+             if(quals==null){
+                 r.bases[pos]='N';
+             }else if(q>0){
+                 quals[pos]=(byte)Tools.max(1, q/2-3);
+             }
+             continue;
+         }
+
+         //Too many errors, or the base is too confident to be overridden.
+         if(detected>maxToCorrect || q>maxQual){return -detected;}
+
+         final boolean ok=correctErrorFromRight(r, cov, kmers, kca, k, low, Tools.max(high, flank/2), 2*flank, mult, pos, suffix);
+         if(!ok){return -detected;}
+
+         fixed++;
+         regenerateKmersAndCoverage(r, kmers, cov, kca, k, false, longkmer);
+     }
+
+     return fixed;
+ }
+
+
+ /**
+  * Flags suspect bases found by a left-to-right scan of the coverage array.
+  * A position is suspect when the minimum of the PREFIX_LEN preceding depths is at least
+  * {@code high} and its own depth is at most {@code low} or at most 1/mult of that minimum.
+  * The flagged base is the last base of the kmer at that position. Flagging negates the
+  * quality score, or sets the base to N when the read has no qualities. Bases whose quality
+  * is not positive are skipped and not counted.
+  * @param maxToCorrect Not used by this method
+  * @return Number of bases flagged
+  */
+ private static int markErrorsFromLeft(final Read r, final int[] cov, final int k,
+         final int low, final int high, final int mult, final int maxToCorrect, final long[] qhist){
+
+     final byte[] quals=r.quality;
+     final byte[] bases=r.bases;
+     int flagged=0;
+
+     for(int pos=PREFIX_LEN; pos<cov.length; pos++){
+         final int flank=Tools.min(cov, pos-PREFIX_LEN, pos-1);
+         final int here=cov[pos];
+         final boolean isError=(flank>=high && (here<=low || flank>=here*mult));
+         if(!isError){continue;}
+
+         final int baseIdx=pos+k-1;
+         final byte q=(quals==null ? 10 : quals[baseIdx]);
+         if(q<=0){continue;}
+
+         flagged++;
+         if(qhist!=null){qhist[q]++;}
+         if(quals==null){
+             bases[baseIdx]='N';
+         }else{
+             quals[baseIdx]=(byte)-q;
+         }
+     }
+     return flagged;
+ }
+
+
+ /**
+  * Flags suspect bases found by a right-to-left scan of the coverage array.
+  * A position is suspect when the minimum of the PREFIX_LEN following depths is at least
+  * {@code high} and its own depth is at most {@code low} or at most 1/mult of that minimum.
+  * The flagged base is the first base of the kmer at that position. Flagging negates the
+  * quality score, or sets the base to N when the read has no qualities. Bases whose quality
+  * is not positive are skipped and not counted.
+  * @param maxToCorrect Not used by this method
+  * @return Number of bases flagged
+  */
+ private static int markErrorsFromRight(final Read r, final int[] cov, final int k,
+         final int low, final int high, final int mult, final int maxToCorrect, final long[] qhist){
+
+     final byte[] quals=r.quality;
+     final byte[] bases=r.bases;
+     int flagged=0;
+
+     for(int pos=cov.length-PREFIX_LEN-1; pos>=0; pos--){
+         final int flank=Tools.min(cov, pos+1, pos+PREFIX_LEN);
+         final int here=cov[pos];
+         final boolean isError=(flank>=high && (here<=low || flank>=here*mult));
+         if(!isError){continue;}
+
+         final byte q=(quals==null ? 10 : quals[pos]);
+         if(q<=0){continue;}
+
+         flagged++;
+         if(qhist!=null){qhist[q]++;}
+         if(quals==null){
+             bases[pos]='N';
+         }else{
+             quals[pos]=(byte)-q;
+         }
+     }
+
+     return flagged;
+ }
+
+ /**
+  * Tries to replace the last base of the kmer at {@code loc} with the nucleotide best
+  * supported by the count array.
+  * The suspect base and the bases after it are copied into {@code suffix} (padded with N past
+  * the end of the read). Each of A, C, G, T is substituted at suffix[0] and scored with
+  * testRightSuffix. The best score must lie within [targetLowerBound, targetUpperBound], and
+  * the best of the other three must be at most {@code low} or at most 1/mult of the winner.
+  * When the original base was undefined and the read has qualities, the corrected base
+  * receives FIXED_N_QUAL.
+  * @param suffix Scratch buffer; overwritten, and suffix[0] is left as 'T'
+  * @return true if the base was replaced
+  */
+ private static boolean correctErrorFromLeft(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k,
+         final int low, final int targetLowerBound, final int targetUpperBound, final int mult, final int loc, final byte[] suffix){
+
+     final int baseIdx=loc+k-1;
+     for(int s=0; s<suffix.length; s++){
+         final int src=baseIdx+s;
+         if(src<r.length()){
+             suffix[s]=r.bases[src];
+         }else{
+             suffix[s]='N';
+         }
+     }
+
+     long kmer=kmers[loc];
+     final boolean defined=(AminoAcid.isFullyDefined(suffix[0]));
+
+     //Allows correction of no-calls: derive the kmer from its left neighbour.
+     if(!defined && loc>0){
+         assert(kmer==-1L) : new String(suffix)+"\t"+kmer;
+         if(kmers[loc-1]!=-1L){
+             final long mask=~((-1L)<<(2*k));
+             kmer=((kmers[loc-1]<<2)&mask);
+         }
+     }
+
+     //Score every candidate nucleotide, in the order A, C, G, T.
+     final byte[] candidates={'A', 'C', 'G', 'T'};
+     final int[] support=new int[4];
+     for(int c=0; c<4; c++){
+         suffix[0]=candidates[c];
+         support[c]=testRightSuffix(kca, k, kmer, suffix);
+     }
+
+     final int top=Tools.max(support[0], support[1], support[2], support[3]);
+     if(top<targetLowerBound || top>targetUpperBound){return false;}
+
+     //First candidate that attains the top score wins.
+     int winner=-1;
+     for(int c=0; c<4 && winner<0; c++){
+         if(support[c]==top){winner=c;}
+     }
+
+     final byte best;
+     final int runnerUp;
+     if(winner>=0){
+         best=candidates[winner];
+         runnerUp=Tools.max(support[(winner+1)&3], support[(winner+2)&3], support[(winner+3)&3]);
+     }else{
+         best='N';
+         runnerUp=top;
+         assert(false);
+     }
+
+     if(runnerUp<=low || runnerUp*mult<=top){
+         r.bases[baseIdx]=best;
+         if(!defined && r.quality!=null){
+             assert(r.quality[baseIdx]==0) : r;
+             r.quality[baseIdx]=FIXED_N_QUAL;
+         }
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Tries to replace the first base of the kmer at {@code loc} with the nucleotide best
+  * supported by the count array.
+  * The suspect base and the bases before it, walking leftward, are copied into {@code suffix}
+  * (padded with N past the start of the read). Each of A, C, G, T is substituted at suffix[0]
+  * and scored with testLeftSuffix. The best score must lie within
+  * [targetLowerBound, targetUpperBound], and the best of the other three must be at most
+  * {@code low} or at most 1/mult of the winner. When the original base was undefined and the
+  * read has qualities, the corrected base receives FIXED_N_QUAL.
+  * @param suffix Scratch buffer; overwritten, and suffix[0] is left as 'T'
+  * @return true if the base was replaced
+  */
+ private static boolean correctErrorFromRight(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k,
+         final int low, final int targetLowerBound, final int targetUpperBound, final int mult, final int loc, final byte[] suffix){
+
+     for(int s=0; s<suffix.length; s++){
+         final int src=loc-s;
+         if(src>=0){
+             suffix[s]=r.bases[src];
+         }else{
+             suffix[s]='N';
+         }
+     }
+
+     long kmer=kmers[loc];
+     final boolean defined=(AminoAcid.isFullyDefined(suffix[0]));
+
+     //Allows correction of no-calls: derive the kmer from its right neighbour.
+     if(!defined && loc+1<kmers.length){
+         assert(kmer==-1L) : new String(suffix)+"\t"+kmer;
+         if(kmers[loc+1]!=-1L){
+             final long mask=~((-1L)<<(2*k));
+             kmer=((kmers[loc+1]>>2)&mask);
+         }
+     }
+
+     //Score every candidate nucleotide, in the order A, C, G, T.
+     final byte[] candidates={'A', 'C', 'G', 'T'};
+     final int[] support=new int[4];
+     for(int c=0; c<4; c++){
+         suffix[0]=candidates[c];
+         support[c]=testLeftSuffix(kca, k, kmer, suffix);
+     }
+
+     final int top=Tools.max(support[0], support[1], support[2], support[3]);
+     if(top<targetLowerBound || top>targetUpperBound){return false;}
+
+     //First candidate that attains the top score wins.
+     int winner=-1;
+     for(int c=0; c<4 && winner<0; c++){
+         if(support[c]==top){winner=c;}
+     }
+
+     final byte best;
+     final int runnerUp;
+     if(winner>=0){
+         best=candidates[winner];
+         runnerUp=Tools.max(support[(winner+1)&3], support[(winner+2)&3], support[(winner+3)&3]);
+     }else{
+         best='N';
+         runnerUp=top;
+         assert(false);
+     }
+
+     if(runnerUp<=low || runnerUp*mult<=top){
+         r.bases[loc]=best;
+         if(!defined && r.quality!=null){
+             assert(r.quality[loc]==0) : r;
+             r.quality[loc]=FIXED_N_QUAL;
+         }
+         return true;
+     }
+     return false;
+ }
+
+ /**
+  * Scores a candidate extension to the right of a kmer.
+  * The last base of {@code kmer0} is dropped, then the bases of {@code suffix} are appended one
+  * at a time; after each append the resulting kmer is looked up in the count array. Scanning
+  * stops early once the minimum reaches 0.
+  * @param kca Kmer count array to query
+  * @param k Kmer length; at most 31
+  * @param kmer0 Starting kmer in 2-bit encoding
+  * @param suffix Bases to append; suffix[0] replaces the last base of kmer0
+  * @return Minimum count over the kmers visited, or 0 if an undefined base is encountered
+  */
+ private static int testRightSuffix(final KCountArray kca, final int k, final long kmer0, final byte[] suffix){
+     assert(k<=31);
+     final int kbits=2*k;
+     final long mask=~((-1L)<<(kbits));
+
+     long kmer=kmer0>>2;
+     int min=Integer.MAX_VALUE;
+
+     for(int i=0; i<suffix.length && min>0; i++){
+         final byte b=suffix[i];
+         if(b=='N'){
+             //TODO: Find best next letter
+             return 0;
+         }
+         final int x=AminoAcid.baseToNumber[b];
+         if(x<0){
+             //Any other undefined symbol: a negative code would set every bit of the kmer,
+             //so treat it like a no-call instead of looking up a corrupted kmer.
+             return 0;
+         }
+
+         kmer=((kmer<<2)|x)&mask;
+         int cov=kca.read(kmer, k, true);
+         min=Tools.min(min, cov);
+     }
+
+     assert(min<Integer.MAX_VALUE);
+     return min;
+ }
+
+ /**
+  * Rolls the bases of {@code suffix} into the left (high-order) end of a kmer, one at a time,
+  * and reports the smallest count seen among the resulting kmers.
+  * Starts from {@code (kmer0<<2)} masked to 2*k bits; each step shifts right by one base and
+  * places the next base code in the top two bits. Lookups use {@code kca.read(kmer, k, true)}.
+  * @param kca Count table to query
+  * @param k Kmer length; must be at most 31
+  * @param kmer0 Starting kmer
+  * @param suffix Bases to prepend, in order
+  * @return Minimum count over the kmers visited; 0 as soon as an 'N' is met.
+  * Scanning stops early once the minimum reaches 0.
+  */
+ private static int testLeftSuffix(final KCountArray kca, final int k, final long kmer0, final byte[] suffix){
+     assert(k<=31);
+     final int kbits=2*k;
+     final int topShift=kbits-2;
+     final long mask=~((-1L)<<(kbits));
+
+     long rolling=(kmer0<<2)&mask;
+     int lowest=Integer.MAX_VALUE;
+
+     int pos=0;
+     while(pos<suffix.length && lowest>0){
+         final byte base=suffix[pos];
+         if(base=='N'){
+             //TODO: Find best next letter
+             return 0;
+         }
+         //Widened to long so the shift into the top bits cannot overflow an int.
+         final long code=AminoAcid.baseToNumber[base];
+         assert(code>=0);
+
+         rolling=((rolling>>2)|(code<<topShift));
+         lowest=Tools.min(lowest, kca.read(rolling, k, true));
+         pos++;
+     }
+
+     //Trips only when the suffix was empty and nothing was looked up.
+     assert(lowest<Integer.MAX_VALUE);
+     return lowest;
+ }
+
+ private static void analyzeSpikes(int[] array, int width){
+ if(array.length<3){return;}
+ int peakcount=0, valleycount=0, spikecount=0, flatcount=0, slopecount=0;
+ for(int i=1; i<array.length-1; i++){
+ long a=array[i-1];
+ int b=array[i];
+ long c=array[i+1];
+ if(b>a && b>c){
+ peakcount++;
+ if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ spikecount++;
+ }
+ }else if(b<a && b<c){
+ valleycount++;
+ }else if(b==a && b==c){
+ flatcount++;
+ }else{
+ slopecount++;
+ }
+ }
+ if(peakcount>0){peaks.addAndGet(peakcount);}
+ if(valleycount>0){valleys.addAndGet(valleycount);}
+ if(spikecount>0){spikes.addAndGet(spikecount);}
+ if(flatcount>0){flats.addAndGet(flatcount);}
+ if(slopecount>0){slopes.addAndGet(slopecount);}
+ }
+
+
+ /**
+  * Builds the per-kmer coverage array for a read and, when ANALYZE_TOPOLOGY is set,
+  * feeds it to analyzeSpikes. The kmer array must be valid at this point.
+  * @param r Read the kmers came from (not consulted here)
+  * @param kca Count table to query; gapped tables are rejected
+  * @param k Kmer length; lookups are made canonical only when k is at most 31
+  * @param out Optional buffer to reuse
+  * @param kmers Kmers of the read
+  * @return Coverage array, or null if kmers is null
+  */
+ public static int[] generateCoverage(Read r, KCountArray kca, final int k, int[] out, long[] kmers){
+     if(kca.gap>0){throw new RuntimeException("Gapped reads: TODO");}
+
+     assert(kmers!=null);
+     if(kmers==null){return null;} //Read is too short
+
+     final boolean canonical=(k<32);
+     final int[] coverage=generateCoverage(kca, k, out, kmers, canonical);
+
+     if(ANALYZE_TOPOLOGY){analyzeSpikes(coverage, 1);}
+     return coverage;
+ }
+
+ /**
+  * Looks up the count of every kmer and stores it at the same index of the output array.
+  * Positions whose kmer is -1 are left at -1. When FIX_SPIKES is set, fixSpikes is applied afterwards.
+  * The kmer array must be valid at this point.
+  * @param kca Count table to query; gapped tables are rejected
+  * @param k Kmer length
+  * @param out Optional buffer; reused only if its length equals kmers.length
+  * @param kmers Kmers to look up
+  * @param makeCanonical Passed through to kca.read
+  * @return Coverage array, or null if kmers is null
+  */
+ public static int[] generateCoverage(KCountArray kca, int k, int[] out, long[] kmers, boolean makeCanonical){
+     if(kca.gap>0){throw new RuntimeException("Gapped reads: TODO");}
+     if(kmers==null){return null;}
+
+     final int len=kmers.length;
+     if(out==null || out.length!=len){out=new int[len];}
+     Arrays.fill(out, -1);
+
+     for(int idx=0; idx<len; idx++){
+         final long kmer=kmers[idx];
+         if(kmer==-1){continue;}
+         out[idx]=kca.read(kmer, k, makeCanonical);
+     }
+
+     if(FIX_SPIKES){fixSpikes(out, kmers, kca, k);}
+     return out;
+ }
+
+ /**
+  * Parses the depth and error annotations from a read name of the form
+  * "id=7,d1=12,d2=15,e1=0,e2=2 /1" (fields are separated by commas or spaces).
+  * @param s Read name; must start with "id="
+  * @param array Optional output buffer; a new int[4] is allocated if it is null or shorter than 4
+  * @return {depth1, depth2, errors1, errors2}, with -1 for any field that is absent;
+  * null if s is null, does not start with "id=", or holds a non-numeric value
+  */
+ public static int[] parseDepth(String s, int[] array){
+     if(s==null || !s.startsWith("id=")){return null;}
+     if(array==null || array.length<4){array=new int[4];}
+     Arrays.fill(array, -1);
+     final String[] split=s.split("[, ]");
+     try {
+         //Element 0 is the "id=" field itself, so scanning starts at 1.
+         for(int i=1; i<split.length; i++){
+             final String ss=split[i];
+             //Slots must match the documented order {d1, d2, e1, e2} = indices {0, 1, 2, 3}.
+             if(ss.startsWith("d1=")){array[0]=Integer.parseInt(ss.substring(3));}
+             else if(ss.startsWith("d2=")){array[1]=Integer.parseInt(ss.substring(3));}
+             else if(ss.startsWith("e1=")){array[2]=Integer.parseInt(ss.substring(3));}
+             else if(ss.startsWith("e2=")){array[3]=Integer.parseInt(ss.substring(3));}
+         }
+         return array;
+     } catch (NumberFormatException e) {
+         return null;
+     }
+ }
+
+
+ private static class ProcessThread extends Thread{
+
+ /**
+  * Stores the input stream, count tables, kmer length, output streams and optional
+  * in-memory storage list that this thread will work with. No processing happens here.
+  * @param cris_ Source of read lists
+  * @param kca_ Count table of input kmers
+  * @param kcaup_ Count table of output kmers
+  * @param k_ Kmer length
+  * @param rosk_ Stream for kept reads
+  * @param rost_ Stream for tossed reads
+  * @param rosl_ Stream for low-count reads
+  * @param rosm_ Stream for mid-count reads
+  * @param rosh_ Stream for high-count reads
+  * @param rosu_ Stream for reads with uncorrectable errors
+  * @param storage_ List that collects kept reads, or null
+  */
+ ProcessThread(ConcurrentReadInputStream cris_, KCountArray kca_, KCountArray kcaup_, int k_,
+         ConcurrentReadOutputStream rosk_, ConcurrentReadOutputStream rost_, ConcurrentReadOutputStream rosl_, ConcurrentReadOutputStream rosm_, ConcurrentReadOutputStream rosh_, ConcurrentReadOutputStream rosu_,
+         ArrayList<Read> storage_){
+     //Input and count tables
+     this.cris=cris_;
+     this.kca=kca_;
+     this.kcaup=kcaup_;
+     this.k=k_;
+     //Output streams
+     this.rosk=rosk_;
+     this.rost=rost_;
+     this.rosl=rosl_;
+     this.rosm=rosm_;
+     this.rosh=rosh_;
+     this.rosu=rosu_;
+     //Optional in-memory sink
+     this.storage=storage_;
+ }
+
+ /** Binds this thread's random source, then runs the countup or the standard normalization loop depending on COUNTUP. */
+ public void run(){
+     randy=ThreadLocalRandom.current();
+     if(!COUNTUP){
+         normalizeInThread();
+         return;
+     }
+     normalizeInThreadByCountup();
+ }
+
+ /**
+  * Standard (non-countup) normalization loop for this thread.
+  * Repeatedly fetches a list of reads from cris until a null or empty list arrives. For each read (and its mate):
+  * optionally error-corrects by overlap and quality-trims; computes per-kmer coverage and, from a sorted copy,
+  * the depth statistics and error flags; optionally renames the read with its depths; decides whether to toss
+  * or keep it (random downsampling against the target depth, plus the error/low-depth rules); on kept reads
+  * optionally corrects or marks errors; bins the read as low/mid/high depth; and updates this thread's counters.
+  * After each list, the per-category lists are handed to their output streams under the input list's id.
+  */
+ void normalizeInThread() {
+
+     Kmer longkmer=(k<32 ? null : new Kmer(k));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     //Each category list exists only when its output stream does.
+     final ArrayList<Read> keepList=(rosk==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> tossList=(rost==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> lowList=(rosl==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> midList=(rosm==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> highList=(rosh==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> uncList=(rosu==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+
+     //Buffers reused across reads: positional coverage and kmers for read 1 and read 2.
+     int[] cov1=null, cov2=null;
+     long[] kmers1=null, kmers2=null;
+
+     while(reads!=null && reads.size()>0){
+         for(int rnum=0; rnum<reads.size(); rnum++){
+             Read r1=reads.get(rnum);
+             Read r2=r1.mate;
+             assert(r1!=r2);
+
+             if(eccByOverlap && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+
+             if(!TRIM_AFTER_MARKING && (TRIM_LEFT_THIS_PASS || TRIM_RIGHT_THIS_PASS)){
+                 if(r1!=null){basesTrimmed+=TrimRead.trimFast(r1, TRIM_LEFT_THIS_PASS, TRIM_RIGHT_THIS_PASS, TRIM_QUALITY, 1);}
+                 if(r2!=null){basesTrimmed+=TrimRead.trimFast(r2, TRIM_LEFT_THIS_PASS, TRIM_RIGHT_THIS_PASS, TRIM_QUALITY, 1);}
+             }
+
+             //-1 means "not computed" (read absent or shorter than k).
+             int depthAL1=-1, depthAL2=-1;
+             int truedepth1=-1, truedepth2=-1;
+             int mintruedepth1=-1, mintruedepth2=-1;
+
+             int readcount=0;
+             int basecount=0;
+
+             int lowcount=0, totalcount=0, ec1=0, ec2=0;
+             boolean error1=false, error2=false, uncorrectable1=false, uncorrectable2=false, marked1=false, marked2=false;
+
+             if(r1!=null && r1.bases!=null){
+                 readcount++;
+                 basecount+=r1.length();
+                 if(r1.length()>=k){
+                     if(verbose){outstream.println();}
+                     kmers1=r1.toKmers(k, kca.gap, kmers1, true, longkmer);
+                     cov1=generateCoverage(kca, k, cov1, kmers1, k<32);
+                     //Statistics use a sorted copy so that cov1 stays index-aligned with kmers1.
+                     int[] cov=cov1.clone();
+                     sortCoverageAndIncrementHistogram(cov);
+
+                     if(cov!=null){
+                         final int covlast=cov.length-1;
+                         final int high=cov[(int)((covlast)*(1-HIGH_PERCENTILE))];
+                         final int low=cov[(int)((covlast)*(1-LOW_PERCENTILE))];
+                         mintruedepth1=low;
+                         int aboveLimit=covlast;
+                         int lc=0;
+                         final int mindepth=Tools.max(MIN_DEPTH, high/ERROR_DETECT_RATIO);
+                         truedepth1=cov[(int)((covlast)*(1-DEPTH_PERCENTILE))];
+                         //Find the last index whose coverage is at least mindepth.
+                         while(aboveLimit>=0 && cov[aboveLimit]<mindepth){aboveLimit--;}
+                         if(aboveLimit+1>=MIN_KMERS_OVER_MIN_DEPTH || (aboveLimit>=0 && MIN_KMERS_OVER_MIN_DEPTH>cov.length)){
+                             depthAL1=cov[(int)(aboveLimit*(1-DEPTH_PERCENTILE))];
+                         }
+                         if(high<=LTHRESH || (high>=HTHRESH && low<=LTHRESH) || high>=low*ERROR_DETECT_RATIO){
+                             error1=true;
+                             if(high<=LTHRESH){errorType1++;}
+                             if(high>=HTHRESH && low<=LTHRESH){errorType2++;}
+                             if(high>=low*ERROR_DETECT_RATIO){errorType3++;}
+                         }
+
+                         totalcount+=cov.length;
+                         if(cov[0]<=LTHRESH){
+                             //cov[0] is tested as the maximum, so every kmer counts as low.
+                             lc+=cov.length;
+                         }else if(high>=HTHRESH){
+                             int lim=Tools.min(LTHRESH, high/ERROR_DETECT_RATIO);
+                             for(int i=covlast; i>=0 && cov[i]<=lim; i--){lc++;}
+                         }
+                         lowcount+=lc;
+
+                         if(rhistogram!=null){
+                             int d=depthAL1>=0 ? depthAL1 : truedepth1>=0 ? truedepth1 : 0;
+                             d=Tools.min(d, HIST_LEN-1);
+                             rhistogram.incrementAndGet(d);
+                             bhistogram.addAndGet(d, r1.length());
+                         }
+                     }
+                 }
+             }
+             if(r2!=null && r2.bases!=null){
+                 readcount++;
+                 basecount+=r2.length();
+                 if(r2.length()>=k){
+                     if(verbose){outstream.println();}
+                     kmers2=r2.toKmers(k, kca.gap, kmers2, true, longkmer);
+                     cov2=generateCoverage(kca, k, cov2, kmers2, k<32);
+                     //Statistics use a sorted copy so that cov2 stays index-aligned with kmers2.
+                     int[] cov=cov2.clone();
+                     sortCoverageAndIncrementHistogram(cov);
+
+                     if(cov!=null){
+                         final int covlast=cov.length-1;
+                         final int high=cov[(int)((covlast)*(1-HIGH_PERCENTILE))];
+                         final int low=cov[(int)((covlast)*(1-LOW_PERCENTILE))];
+                         mintruedepth2=low;
+                         int aboveLimit=covlast;
+                         int lc=0;
+                         final int mindepth=Tools.max(MIN_DEPTH, high/ERROR_DETECT_RATIO);
+                         truedepth2=cov[(int)((covlast)*(1-DEPTH_PERCENTILE))];
+                         //Find the last index whose coverage is at least mindepth.
+                         while(aboveLimit>=0 && cov[aboveLimit]<mindepth){aboveLimit--;}
+                         if(aboveLimit+1>=MIN_KMERS_OVER_MIN_DEPTH || (aboveLimit>=0 && MIN_KMERS_OVER_MIN_DEPTH>cov.length)){
+                             depthAL2=cov[(int)(aboveLimit*(1-DEPTH_PERCENTILE))];
+                         }
+                         if(high<=LTHRESH || (high>=HTHRESH && low<=LTHRESH) || high>=low*ERROR_DETECT_RATIO){
+                             error2=true;
+                             if(high<=LTHRESH){errorType1++;}
+                             if(high>=HTHRESH && low<=LTHRESH){errorType2++;}
+                             if(high>=low*ERROR_DETECT_RATIO){errorType3++;}
+                         }
+
+                         totalcount+=cov.length;
+                         if(cov[0]<=LTHRESH){
+                             //cov[0] is tested as the maximum, so every kmer counts as low.
+                             lc+=cov.length;
+                         }else if(high>=HTHRESH){
+                             int lim=Tools.min(LTHRESH, high/ERROR_DETECT_RATIO);
+                             for(int i=covlast; i>=0 && cov[i]<=lim; i--){lc++;}
+                         }
+                         lowcount+=lc;
+                         //cov2 must NOT be replaced by the sorted copy here: it is later passed together
+                         //with kmers2 to markErrors/correctErrors, exactly as cov1 is for read 1.
+
+                         if(rhistogram!=null){
+                             int d=depthAL2>=0 ? depthAL2 : truedepth2>=0 ? truedepth2 : 0;
+                             d=Tools.min(d, HIST_LEN-1);
+                             rhistogram.incrementAndGet(d);
+                             bhistogram.addAndGet(d, r2.length());
+                         }
+                     }
+                 }
+             }
+
+             if(RENAME_THIS_PASS){
+                 //The new name embeds the depths (and error counts, when correcting) in the format parseDepth reads back.
+                 if(r2==null){
+                     final String s="id="+r1.numericID+",d1="+depthAL1+(CORRECT_ERRORS_THIS_PASS ? ",e1="+(ec1<0 ? -ec1 : 0) : "");
+                     r1.id=s;
+                     if(EA){
+                         int[] quad=parseDepth(r1.id, null);
+                         assert(quad[0]==depthAL1);
+                         assert(quad[1]==-1);
+                         assert(quad[2]==(CORRECT_ERRORS_THIS_PASS ? (ec1<0 ? -ec1 : 0) : -1));
+                         assert(quad[3]==-1);
+                     }
+                 }else{
+                     final String s="id="+r1.numericID+",d1="+depthAL1+",d2="+depthAL2+(CORRECT_ERRORS_THIS_PASS ? ",e1="+(ec1<0 ? -ec1 : 0)+",e2="+(ec2<0 ? -ec2 : 0) : "");
+                     r1.id=s+" /1";
+                     r2.id=s+" /2";
+                     if(EA){
+                         int[] quad=parseDepth(r1.id, null);
+                         assert(quad[0]==depthAL1);
+                         assert(quad[1]==depthAL2);
+                         assert(quad[2]==(CORRECT_ERRORS_THIS_PASS ? (ec1<0 ? -ec1 : 0) : -1));
+                         assert(quad[3]==(CORRECT_ERRORS_THIS_PASS ? (ec2<0 ? -ec2 : 0) : -1));
+                     }
+                 }
+             }
+
+             r1.errors=lowcount;
+
+             int maxDepth=MAX_DEPTH;
+             int targetDepth=TARGET_DEPTH;
+
+             if(lowcount>0){
+                 //Reads with low-count kmers get a target between the BAD_LOW and BAD_HIGH depths,
+                 //scaled by the square of the fraction of kmers that are not low.
+                 double fractionGood=(totalcount-lowcount)/(float)totalcount;
+                 targetDepth=(int)(TARGET_DEPTH_BAD_LOW+(TARGET_DEPTH_BAD_HIGH-TARGET_DEPTH_BAD_LOW)*(fractionGood*fractionGood));
+                 assert(TARGET_DEPTH_BAD_LOW<=TARGET_DEPTH_BAD_HIGH);
+                 assert(TARGET_DEPTH>=99999999 || (targetDepth>0 && targetDepth<=TARGET_DEPTH)) :
+                     targetDepth+", "+TARGET_DEPTH+", "+TARGET_DEPTH_BAD_LOW+", "+TARGET_DEPTH_BAD_HIGH+", "+lowcount+", "+totalcount;
+                 assert(TARGET_DEPTH>=99999999 || (targetDepth>=TARGET_DEPTH_BAD_LOW && targetDepth<=TARGET_DEPTH_BAD_HIGH)) :
+                     targetDepth+", "+TARGET_DEPTH+", "+TARGET_DEPTH_BAD_LOW+", "+TARGET_DEPTH_BAD_HIGH+", "+lowcount+", "+totalcount;
+                 maxDepth=targetDepth;
+             }
+
+             final int minAL=(depthAL1>=0 ? (depthAL2>=0 ? Tools.min(depthAL1, depthAL2) : depthAL1) : depthAL2);
+             final int maxAL=Tools.max(depthAL1, depthAL2);
+             final int minTrueDepth=(r2==null ? truedepth1 : Tools.min(truedepth1, truedepth2));
+             final int maxTrueDepth=Tools.max(truedepth1, truedepth2);
+             final int depthproxyAL=USE_LOWER_DEPTH ? minAL : maxAL;
+             final int truedepthproxy=USE_LOWER_DEPTH ? minTrueDepth : maxTrueDepth;
+             //coin stays 0 (never exceeds targetDepth) unless the read is deep enough to be downsampled.
+             long coin=0;
+             if(depthproxyAL>maxDepth && (error1 || error2 || !DISCARD_BAD_ONLY)){
+                 if(r1.rand<0){
+                     coin=randy.nextLong(depthproxyAL)+1;
+                 }else{
+                     //A non-negative r1.rand is used in place of a fresh random draw.
+                     coin=((long)(r1.rand*depthproxyAL))+1;
+                 }
+             }
+
+             totalReads+=readcount;
+             totalBases+=basecount;
+
+             boolean toss=(depthproxyAL<0 || coin>targetDepth || (r1!=null && r1.length()<MIN_LENGTH) || (r2!=null && r2.length()<MIN_LENGTH));
+             if(TOSS_ERROR_READS && (error1 || error2)){
+                 if(SAVE_RARE_READS && depthproxyAL<=targetDepth && depthproxyAL>=HTHRESH){
+                     //do nothing
+                 }else if(!REQUIRE_BOTH_BAD || r2==null || (error1 && error2)){
+                     toss=true;
+                 }
+             }
+
+             if(TOSS_BY_LOW_TRUEDEPTH && !SAVE_RARE_READS && maxTrueDepth<MIN_DEPTH && (!REQUIRE_BOTH_BAD || (mintruedepth1<MIN_DEPTH && mintruedepth2<MIN_DEPTH))){
+                 toss=true;
+             }
+
+             //KEEP_ALL overrides every toss rule above.
+             if(KEEP_ALL){toss=false;}
+
+             if(error1){errorReads++;}
+             if(error2){errorReads++;}
+             if(error1 || error2){errorPairs++;}
+
+             if(toss){
+                 if(tossList!=null){tossList.add(r1);}
+                 readsTossed+=readcount;
+                 basesTossed+=basecount;
+             }else{
+                 if(CORRECT_ERRORS_THIS_PASS){
+                     if(r1!=null && r1.length()>=k){
+                         if(MARK_ERRORS_ONLY){
+                             ec1=markErrors(r1, cov1, kmers1, kca, k, EC_LTHRESH, EC_HTHRESH, ERROR_CORRECT_RATIO, MAX_ERRORS_TO_CORRECT, true, true, qhist, longkmer);
+                             errorsMarked+=ec1;
+                             if(ec1>0){marked1=true;}
+                         }else{
+                             ec1=correctErrors(r1, cov1, kmers1, kca, k, EC_LTHRESH, EC_HTHRESH, ERROR_CORRECT_RATIO, MAX_ERRORS_TO_CORRECT, MAX_QUAL_TO_CORRECT, true, true, qhist, MARK_ERRORS_ONLY, longkmer);
+                             if(ec1>=0){
+                                 errorsCorrected+=ec1;
+                             }else{
+                                 //A negative result is treated as "uncorrectable".
+                                 uncorrectable1=true;
+                                 if(MAX_ERRORS_TO_CORRECT>0){regenerateKmersAndCoverage(r1, kmers1, cov1, kca, k, false, longkmer);}
+                                 if(MARK_UNCORRECTABLE_ERRORS){
+                                     ec1=markErrors(r1, cov1, kmers1, kca, k, EC_LTHRESH, EC_HTHRESH, ERROR_CORRECT_RATIO, MAX_ERRORS_TO_CORRECT, true, true, qhist, longkmer);
+                                     errorsMarked+=ec1;
+                                     if(ec1>0){marked1=true;}
+                                 }else{
+                                     //ec1 is negative here, so this adds its magnitude.
+                                     errorsDetected-=ec1;
+                                 }
+                             }
+                         }
+                         if(TRIM_AFTER_MARKING){
+                             if(marked1 || TRIM_EVEN_IF_NO_ERRORS_DETECTED){
+                                 basesTrimmed+=TrimRead.trimFast(r1, TRIM_LEFT_THIS_PASS, TRIM_RIGHT_THIS_PASS, TRIM_QUALITY, 1);
+                             }
+                         }
+                     }
+
+                     if(r2!=null && r2.length()>=k){
+                         if(MARK_ERRORS_ONLY){
+                             ec2=markErrors(r2, cov2, kmers2, kca, k, EC_LTHRESH, EC_HTHRESH, ERROR_CORRECT_RATIO, MAX_ERRORS_TO_CORRECT, true, true, qhist, longkmer);
+                             errorsMarked+=ec2;
+                             if(ec2>0){marked2=true;}
+                         }else{
+                             ec2=correctErrors(r2, cov2, kmers2, kca, k, EC_LTHRESH, EC_HTHRESH, ERROR_CORRECT_RATIO, MAX_ERRORS_TO_CORRECT, MAX_QUAL_TO_CORRECT, true, true, qhist, MARK_ERRORS_ONLY, longkmer);
+                             if(ec2>=0){
+                                 errorsCorrected+=ec2;
+                             }else{
+                                 //A negative result is treated as "uncorrectable".
+                                 uncorrectable2=true;
+                                 if(MAX_ERRORS_TO_CORRECT>0){regenerateKmersAndCoverage(r2, kmers2, cov2, kca, k, false, longkmer);}
+                                 if(MARK_UNCORRECTABLE_ERRORS){
+                                     ec2=markErrors(r2, cov2, kmers2, kca, k, EC_LTHRESH, EC_HTHRESH, ERROR_CORRECT_RATIO, MAX_ERRORS_TO_CORRECT, true, true, qhist, longkmer);
+                                     errorsMarked+=ec2;
+                                     if(ec2>0){marked2=true;}
+                                 }else{
+                                     //ec2 is negative here, so this adds its magnitude.
+                                     errorsDetected-=ec2;
+                                 }
+                             }
+                         }
+                         if(TRIM_AFTER_MARKING){
+                             if(marked2 || TRIM_EVEN_IF_NO_ERRORS_DETECTED){
+                                 basesTrimmed+=TrimRead.trimFast(r2, TRIM_LEFT_THIS_PASS, TRIM_RIGHT_THIS_PASS, TRIM_QUALITY, 1);
+                             }
+                         }
+                     }
+                 }
+
+                 if(keepList!=null){keepList.add(r1);}
+                 readsKept+=readcount;
+                 basesKept+=basecount;
+             }
+
+             //Depth binning is independent of the keep/toss decision.
+             if(depthAL1<LOW_BIN_DEPTH && depthAL2<LOW_BIN_DEPTH){
+                 readsLowBin+=readcount;
+                 basesLowBin+=basecount;
+                 if(lowList!=null){lowList.add(r1);}
+             }else if((depthAL1<LOW_BIN_DEPTH || depthAL1>HIGH_BIN_DEPTH) && (depthAL2<LOW_BIN_DEPTH || depthAL2>=HIGH_BIN_DEPTH)){
+                 readsHighBin+=readcount;
+                 basesHighBin+=basecount;
+                 if(highList!=null){highList.add(r1);}
+             }else{
+                 assert((depthAL1>=LOW_BIN_DEPTH && depthAL1<=HIGH_BIN_DEPTH) || (depthAL2>=LOW_BIN_DEPTH && depthAL2<=HIGH_BIN_DEPTH)) :
+                     depthAL1+", "+depthAL2+", "+LOW_BIN_DEPTH+", "+HIGH_BIN_DEPTH;
+                 readsMidBin+=readcount;
+                 basesMidBin+=basecount;
+                 if(midList!=null){midList.add(r1);}
+             }
+
+             if(uncorrectable1 || uncorrectable2){
+                 readsUncorrected+=readcount;
+                 basesUncorrected+=basecount;
+                 if(uncList!=null){uncList.add(r1);}
+             }
+         }
+
+
+         //NOTE(review): keepList/tossList are null when rosk/rost are null; confirm callers never combine that with storage.
+         if(storage!=null){
+             synchronized(storage){
+                 storage.addAll(keepList);
+                 if(ADD_BAD_READS_COUNTUP){storage.addAll(tossList);}
+             }
+         }
+
+
+         //Important to send all lists to output, even empty ones, to keep list IDs straight.
+         if(rosk!=null){
+             rosk.add(keepList, ln.id);
+             keepList.clear();
+         }
+         if(rost!=null){
+             rost.add(tossList, ln.id);
+             tossList.clear();
+         }
+
+         if(rosl!=null){
+             rosl.add(lowList, ln.id);
+             lowList.clear();
+         }
+         if(rosm!=null){
+             rosm.add(midList, ln.id);
+             midList.clear();
+         }
+         if(rosh!=null){
+             rosh.add(highList, ln.id);
+             highList.clear();
+         }
+         if(rosu!=null){
+             rosu.add(uncList, ln.id);
+             uncList.clear();
+         }
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     if(verbose){System.err.println("Finished reading");}
+     //The loop also ends when nextList() returned null (or a list-less ListNum), so guard before returning it.
+     if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+     if(verbose){System.err.println("Returned list");}
+ }
+
+
+ /**
+  * Countup normalization loop for this thread.
+  * Repeatedly fetches a list of reads from cris until a null or empty list arrives. For each read (and its mate)
+  * the kmers of both are merged and sorted, their counts are read from the input table (kca) and the
+  * output table (kcaup), and the read is kept only if enough of its kmers are still below TARGET_DEPTH in kcaup.
+  * Kept reads increment kcaup for every kmer whose input count is at least MIN_DEPTH.
+  * Only the keep and toss streams are supported in this mode.
+  */
+ void normalizeInThreadByCountup() {
+
+     Kmer longkmer=(k<32 ? null : new Kmer(k));
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     final ArrayList<Read> keepList=(rosk==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> tossList=(rost==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+
+     //Buffers reused across reads; kmers3 holds the merged, sorted kmers of both mates.
+     int[] cov=null, covSorted=null, covup=null;
+     long[] kmers1=null, kmers2=null, kmers3=null;
+
+     while(reads!=null && reads.size()>0){
+         for(int rnum=0; rnum<reads.size(); rnum++){
+             Read r=reads.get(rnum);
+             Read r2=r.mate;
+             assert(r!=r2);
+
+             int readcount=0;
+             int basecount=0;
+             int errors=0, nonerrors=0;
+
+             //kmers1/kmers2 may hold stale data from an earlier read; these flags say whether they are current.
+             boolean k1valid=false, k2valid=false;
+
+             if(r!=null && r.bases!=null){
+                 readcount++;
+                 basecount+=r.length();
+                 if(r.length()>=k){
+                     if(verbose){outstream.println();}
+                     kmers1=r.toKmers(k, kca.gap, kmers1, true, longkmer);
+                     k1valid=true;
+                 }
+             }
+             if(r2!=null && r2.bases!=null){
+                 readcount++;
+                 basecount+=r2.length();
+                 if(r2.length()>=k){
+                     if(verbose){outstream.println();}
+                     kmers2=r2.toKmers(k, kca.gap, kmers2, true, longkmer);
+                     k2valid=true;
+                 }
+             }
+
+             final int mergelen=(k1valid ? kmers1.length : 0)+(k2valid ? kmers2.length : 0);
+             int valid=0, unique=0, desired=0, needed=0, badlyneeded=0;
+             if(mergelen>0){
+                 if(kmers3==null || kmers3.length!=mergelen){kmers3=new long[mergelen];}
+                 int j=0;
+                 if(k1valid){
+                     for(int i=0; i<kmers1.length; i++, j++){kmers3[j]=kmers1[i];}
+                 }
+                 if(k2valid){
+                     for(int i=0; i<kmers2.length; i++, j++){kmers3[j]=kmers2[i];}
+                 }
+                 //Sorting makes duplicate kmers adjacent so they can be detected below.
+                 Arrays.sort(kmers3);
+
+                 if(cov==null || cov.length!=mergelen){cov=new int[mergelen];}
+                 if(covup==null || covup.length!=mergelen){covup=new int[mergelen];}
+                 for(int i=0; i<mergelen; i++){
+                     long kmer=kmers3[i];
+                     if(kmer==-1){
+                         cov[i]=-1;
+                         covup[i]=-1;
+                     }else if(IGNORE_DUPLICATE_KMERS_COUNTUP && i>0 && kmer==kmers3[i-1]){
+                         //Repeat of the previous kmer: valid, but not looked up or counted again.
+                         cov[i]=-1;
+                         covup[i]=-1;
+                         valid++;
+                     }else{
+                         cov[i]=kca.read(kmer);
+                         covup[i]=kcaup.read(kmer);
+                         valid++;
+                         unique++;
+                         if(cov[i]>=MIN_DEPTH){
+                             desired++;
+                             if(covup[i]<TARGET_DEPTH){
+                                 needed++;
+                                 //"Badly needed": output count is under 3/4 of min(TARGET_DEPTH, input count).
+                                 if(covup[i]<(Tools.min(TARGET_DEPTH, cov[i])*3)/4){badlyneeded++;}
+                             }
+                         }
+                     }
+                 }
+                 final int invalid=cov.length-valid;
+
+                 if(covSorted==null || covSorted.length!=mergelen){covSorted=new int[mergelen];}
+                 for(int i=0; i<cov.length; i++){covSorted[i]=cov[i];}
+                 Arrays.sort(covSorted);
+
+                 //Walk the ascending counts looking for the first jump that matches the error-detection rule.
+                 int prev=-1;
+                 for(int i=0; i<covSorted.length; i++){
+                     int x=covSorted[i];
+                     if(prev>-1){
+                         if((x>=HTHRESH && prev<=LTHRESH) || x>=prev*ERROR_DETECT_RATIO){
+                             errors=covSorted.length-i;
+                             break;
+                         }else{nonerrors++;}
+                     }
+                     prev=x;
+                 }
+             }
+
+             int t1=Tools.max(8, (unique+5)/6);
+             int t2=Tools.max(2, (unique+23)/24);
+
+             boolean toss=!((needed>=t1 || badlyneeded>=t2) && (desired>=MIN_KMERS_OVER_MIN_DEPTH || unique<MIN_KMERS_OVER_MIN_DEPTH));
+             if(TOSS_ERROR_READS && errors>8 && (needed<2*t1 && badlyneeded<2*t2)){toss=true;}
+             if(TOSS_ERROR_READS && errors>unique/2 && (needed<3*t1 && badlyneeded<4*t2)){toss=true;}
+             //KEEP_ALL overrides every toss rule above.
+             if(KEEP_ALL){toss=false;}
+
+             totalReads+=readcount;
+             totalBases+=basecount;
+
+             if(toss){
+                 if(tossList!=null){tossList.add(r);}
+                 readsTossed+=readcount;
+                 basesTossed+=basecount;
+             }else{
+                 //Record the kept read's kmers in the output table.
+                 for(int i=0; i<mergelen; i++){
+                     if(cov[i]>=MIN_DEPTH){
+                         long kmer=kmers3[i];
+                         kcaup.increment(kmer);
+                     }
+                 }
+                 if(keepList!=null){keepList.add(r);}
+                 readsKept+=readcount;
+                 basesKept+=basecount;
+             }
+
+             if(mergelen>0){
+                 incrementHistogramSorted(covSorted);
+             }
+         }
+
+         //NOTE(review): keepList is null when rosk is null; confirm callers never combine that with storage.
+         if(storage!=null){
+             synchronized(storage){
+                 storage.addAll(keepList);
+             }
+         }
+
+         //Important to send all lists to output, even empty ones, to keep list IDs straight.
+         if(rosk!=null){
+             rosk.add(keepList, ln.id);
+             keepList.clear();
+         }
+         if(rost!=null){
+             rost.add(tossList, ln.id);
+             tossList.clear();
+         }
+
+         assert(rosl==null) : "Low fraction out not supported by countup.";
+         assert(rosm==null) : "Mid fraction out not supported by countup.";
+         assert(rosh==null) : "High fraction out not supported by countup.";
+         assert(rosu==null) : "TODO - Uncorrectable fraction out not supported by countup.";
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     if(verbose){System.err.println("Finished reading");}
+     //The loop also ends when nextList() returned null (or a list-less ListNum), so guard before returning it.
+     if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+     if(verbose){System.err.println("Returned list");}
+ }
+
+ /**
+  * Produces the coverage array for a read (generating kmers and coverage first, unless the caller
+  * says they are already valid), then sorts it and adds it to the histogram.
+  * @param r Read; must be non-null with at least k bases
+  * @param cov Coverage buffer, or existing coverage when coverageAlreadyValid is true
+  * @param kmers Kmer buffer, or existing kmers when kmersAlreadyValid is true
+  * @param kmersAlreadyValid Skip kmer generation
+  * @param kmersAlreadyCanonical Skip canonicalization during lookup
+  * @param coverageAlreadyValid Skip both kmer and coverage generation
+  * @param longkmer Passed through to Read.toKmers
+  * @return The sorted coverage array
+  */
+ private final int[] getSortedCoverageAndIncrementHistogram(Read r, int[] cov, long[] kmers,
+         boolean kmersAlreadyValid, boolean kmersAlreadyCanonical, boolean coverageAlreadyValid, Kmer longkmer){
+     assert(r!=null && r.bases!=null && r.length()>=k) : r;
+
+     int[] coverage=cov;
+     if(!coverageAlreadyValid){
+         final long[] source=(kmersAlreadyValid ? kmers : r.toKmers(k, kca.gap, kmers, false, longkmer));
+         final boolean canonicalize=(!kmersAlreadyCanonical && k<32);
+         coverage=generateCoverage(kca, k, coverage, source, canonicalize);
+     }
+
+     sortCoverageAndIncrementHistogram(coverage);
+     return coverage;
+ }
+
+ private void sortCoverageAndIncrementHistogram(int[] cov){
+ if(cov==null || cov.length==0){return;}
+ Arrays.sort(cov);
+ Tools.reverseInPlace(cov);
+ incrementHistogramSorted(cov);
+ }
+
+ /**
+  * Adds a sorted coverage array to the depth histogram, one run of equal values at a time.
+  * Handles coverage sorted in either direction: leading negative entries are skipped and
+  * counting stops at the first negative entry after that. Values are clamped to HIST_LEN-1.
+  * Runs whose value is below hist.length go to this thread's {@code hist};
+  * larger values go to the shared {@code khistogram}.
+  * @param cov Sorted coverage; null or empty arrays are ignored
+  */
+ private final void incrementHistogramSorted(int[] cov){
+     if(hist==null || cov==null || cov.length==0){return;}
+
+     //Skip the negative prefix of an ascending array.
+     int i=0;
+     while(i<cov.length && cov[i]<0){i++;}
+     if(i>=cov.length){return;}
+
+     //Seed the run from the first counted value, clamped like every other value, so that
+     //the first run is not undercounted when cov[0] is negative or at least HIST_LEN.
+     int last=Tools.min(cov[i], HIST_LEN-1);
+     long sum=0;
+     for(; i<cov.length; i++){
+         final int x=cov[i];
+         if(x<0){break;}
+         final int y=Tools.min(x, HIST_LEN-1);
+         if(y==last){
+             sum++;
+         }else{
+             //Value changed: flush the finished run and start a new one.
+             if(last<hist.length){hist[last]+=sum;}
+             else{khistogram.addAndGet(last, sum);}
+             sum=1;
+             last=y;
+         }
+     }
+     //Flush the final run.
+     if(sum>0){
+         if(last<hist.length){hist[last]+=sum;}
+         else{khistogram.addAndGet(last, sum);}
+     }
+ }
+
+ /** Source of read lists for this thread */
+ private final ConcurrentReadInputStream cris;
+ /** Premade table holding counts of input kmers */
+ private final KCountArray kca;
+ /** Dynamic table holding counts of output kmers */
+ private final KCountArray kcaup;
+ /** kmer length */
+ private final int k;
+ /** Stream for kept reads */
+ private final ConcurrentReadOutputStream rosk;
+ /** Stream for tossed reads */
+ private final ConcurrentReadOutputStream rost;
+ /** Stream for low-count reads */
+ private final ConcurrentReadOutputStream rosl;
+ /** Stream for mid-count reads */
+ private final ConcurrentReadOutputStream rosm;
+ /** Stream for high-count reads */
+ private final ConcurrentReadOutputStream rosh;
+ /** Stream for reads with uncorrectable errors */
+ private final ConcurrentReadOutputStream rosu;
+
+ /** This thread's kmer-depth histogram; depths at or above its length are added to the shared khistogram instead */
+ public final long[] hist=new long[THREAD_HIST_LEN];//(USE_HISTOGRAM ? new long[HIST_LEN] : null);
+ /** Histogram handed to markErrors and correctErrors */
+ public final long[] qhist=new long[128];
+
+ /** Optional list that receives the kept reads of each batch (under synchronization); may be null */
+ private final ArrayList<Read> storage;
+
+ /** Bases seen by this thread */
+ private long totalBases=0;
+ /** Reads seen by this thread */
+ private long totalReads=0;
+// private final java.util.Random randy=new java.util.Random();
+ /** Random source for the downsampling coin flip; assigned in run() */
+ private ThreadLocalRandom randy;
+
+ //Per-category read and base tallies
+ public long readsKept=0;
+ public long readsTossed=0;
+ public long readsLowBin=0;
+ public long readsMidBin=0;
+ public long readsHighBin=0;
+ public long readsUncorrected=0;
+ public long basesKept=0;
+ public long basesTossed=0;
+ public long basesLowBin=0;
+ public long basesMidBin=0;
+ public long basesHighBin=0;
+ public long basesUncorrected=0;
+
+ /** Reads flagged as containing errors */
+ public long errorReads=0;
+ /** Reads or pairs in which at least one mate was flagged */
+ public long errorPairs=0;
+ /** Flagged because high<=LTHRESH */
+ public long errorType1=0;
+ /** Flagged because high>=HTHRESH and low<=LTHRESH */
+ public long errorType2=0;
+ /** Flagged because high>=low*ERROR_DETECT_RATIO */
+ public long errorType3=0;
+
+ /** Sum of the magnitudes of negative correctErrors results that were not subsequently marked */
+ public long errorsDetected=0;
+ /** Sum of non-negative correctErrors results */
+ public long errorsCorrected=0;
+ /** Sum of markErrors results */
+ public long errorsMarked=0;
+ /** Sum of TrimRead.trimFast results */
+ public long basesTrimmed=0;
+ }
+
+ public static PrintStream outstream=Data.sysout;
+
+ private static long minHeight=2;
+ private static long minVolume=2;
+ private static int minWidth=2;
+ private static int minPeak=2;
+ private static int maxPeak=Integer.MAX_VALUE;
+ private static int maxPeakCount=10;
+ private static int ploidy=-1;
+
+ public static int THREAD_HIST_LEN=1<<12;
+ public static int HIST_LEN=1<<20;
+ public static long HIST_LEN_PRINT=HIST_LEN;
+ public static long HIST_COLUMNS=3;
+ public static boolean USE_KHISTOGRAM=false;
+ public static boolean USE_RHISTOGRAM=false;
+ public static boolean PRINT_ZERO_COVERAGE=false;
+ public static AtomicLongArray khistogram;
+ public static AtomicLongArray rhistogram;
+ public static AtomicLongArray bhistogram;
+ public static long[] qhist_total;
+
+ private static int THREADS=Shared.threads();
+ private static boolean verbose=false;
+ private static boolean errorState=false;
+
+ private static boolean EA=false;
+ static{assert(EA=true);}
+
+ private static boolean eccByOverlap=false;
+ private static boolean eccByOverlapAuto=false;
+
+ /** High-depth reads will be downsampled to this level in the current pass */
+ private static int TARGET_DEPTH=100;
+ /** Error-containing reads will be downsampled to at least this level in the current pass */
+ private static int TARGET_DEPTH_BAD_LOW=100;
+ /** Error-containing reads will be downsampled to at most this level in the current pass */
+ private static int TARGET_DEPTH_BAD_HIGH=100;
+ /** High-depth reads will be downsampled to this level in the final pass */
+ private static int TARGET_DEPTH_F=100;
+ /** High-depth reads will be downsampled to this level in the first pass */
+ private static int TARGET_DEPTH_1=-1;
+ /** Reads under this depth will not be downsampled */
+ private static int MAX_DEPTH=-1;
+ /** Reads under this depth will be discarded, and kmers under this depth will be ignored */
+ private static int MIN_DEPTH=5;
+ /** Reads without this many kmers of at least min depth will be discarded */
+ private static int MIN_KMERS_OVER_MIN_DEPTH=15;
+ /** Position in sorted kmer depths array to use as proxy for overall read depth */
+ private static float DEPTH_PERCENTILE=0.54f;
+
+ /** Normalize based on depth of read with lower depth, instead of read with higher depth */
+ public static boolean USE_LOWER_DEPTH=true;
+ /** Throw out reads with depth at absolute depth percentile below mindepth */
+ public static boolean TOSS_BY_LOW_TRUEDEPTH=true;
+ /** Throw out reads containing errors in the current pass */
+ public static boolean TOSS_ERROR_READS=false;
+ /** Throw out reads containing errors in the final pass */
+ public static boolean TOSS_ERROR_READS_F=false;
+ /** Throw out reads containing errors in the first pass */
+ public static boolean TOSS_ERROR_READS_1=false;
+ /** Only downsample error reads on current pass (keep all error-free reads) */
+ public static boolean DISCARD_BAD_ONLY=false;
+ /** Only downsample error reads on final pass (keep all error-free reads) */
+ public static boolean DISCARD_BAD_ONLY_F=false;
+ /** Only downsample error reads on first pass (keep all error-free reads) */
+ public static boolean DISCARD_BAD_ONLY_1=false;
+ /** Require both reads in a pair to be bad before tossing the read */
+ public static boolean REQUIRE_BOTH_BAD=false;
+ /** Don't toss error reads with depth below max */
+ public static boolean SAVE_RARE_READS=false;
+ /** Position in sorted kmer depths array to use as proxy for high depth kmer */
+ public static float HIGH_PERCENTILE=0.90f;
+ /** Position in sorted kmer depths array to use as proxy for low depth kmer */
+ public static float LOW_PERCENTILE=0.25f;
+ /** Position in sorted kmer depths array to use as proxy for low depth kmer, during countup presort pass */
+ public static float LOW_PERCENTILE_COUNTUP=0.20f;
+ /** Set to true to keep error reads during countup presort pass */
+ public static boolean ADD_BAD_READS_COUNTUP=false;
+
+ /** Reads with a high/low ratio of at least this are considered error reads. */
+ public static int ERROR_DETECT_RATIO=125;
+ /** Threshold for high kmer in detection. A high kmer at this or above is considered possibly non-error. */
+ public static int HTHRESH=12;
+ /** Threshold for low kmer in detection. Kmers at this and below are always considered errors. */
+ public static int LTHRESH=3;
+
+ /** Reads with a high/low ratio of at least this are considered error reads. */
+ public static int ERROR_CORRECT_RATIO=140;
+ /** Threshold for high kmer in correction. A high kmer at this or above considered possibly non-error. */
+ public static int EC_HTHRESH=22;
+ /** Threshold for low kmer in correction. Kmers at this and below are considered errors if an adjacent kmer is at or above the high thresh. */
+ public static int EC_LTHRESH=2;
+
+ public static double TARGET_BAD_PERCENT_LOW=0.85;
+ public static double TARGET_BAD_PERCENT_HIGH=1.5;
+
+ private static long FILTERBYTES=-1;
+
+ private static int SUFFIX_LEN=3;
+ private static int PREFIX_LEN=3;
+
+ private static boolean TRIM_LEFT_THIS_PASS=false;
+ private static boolean TRIM_RIGHT_THIS_PASS=false;
+ private static boolean RENAME_THIS_PASS=false;
+
+ private static boolean CORRECT_ERRORS_THIS_PASS=false;
+ private static boolean MARK_ERRORS_ONLY=false;
+ private static boolean TRIM_AFTER_MARKING=false;
+ private static boolean TRIM_EVEN_IF_NO_ERRORS_DETECTED=true;
+ private static boolean MARK_WITH_1=false;
+ private static boolean MARK_UNCORRECTABLE_ERRORS=false;
+ private static boolean USE_ECC1=false;
+ private static boolean USE_ECCF=false;
+ private static boolean CORRECT_FROM_LEFT=true;
+ private static boolean CORRECT_FROM_RIGHT=true;
+
+ private static double prefilterFraction=0.35;
+
+ private static int LOW_BIN_DEPTH=10;
+ private static int HIGH_BIN_DEPTH=80;
+
+ /** ECC_LIMIT */
+ private static int MAX_ERRORS_TO_CORRECT=3;
+ private static int MAX_QUAL_TO_CORRECT=127;
+
+
+ public static boolean IGNORE_DUPLICATE_KMERS_COUNTUP=true;
+
+ public static boolean CANONICAL=true;
+ public static boolean ZERO_BIN=false;
+ public static boolean FIX_SPIKES=false;
+ public static boolean KEEP_ALL=false;
+ public static boolean ordered=false;
+ public static boolean overwrite=true;
+ public static boolean append=false;
+ public static boolean prefilter=false;
+ public static boolean renameReads=false;
+ public static boolean DETERMINISTIC=true;
+ public static boolean COUNTUP=false;
+ public static boolean ANALYZE_TOPOLOGY=false;
+ /** Quality-trim left side of reads before further processing. */
+ public static boolean TRIM_LEFT=false;
+ /** Quality-trim right side of reads before further processing. */
+ public static boolean TRIM_RIGHT=false;
+ public static int MIN_LENGTH=1;
+ /** Trim until 2 consecutive bases are encountered with at least this quality. */
+ public static byte TRIM_QUALITY=5;
+
+ public static boolean REMOVE_TEMP_FILES=true;
+ public static boolean USE_TMPDIR=true;
+ public static String TMPDIR=Shared.TMPDIR;
+ /** Returns true only when USE_TMPDIR is set and TMPDIR is non-null. */
+ public static boolean useTmpdir(){return USE_TMPDIR && TMPDIR!=null;}
+
+ private static HashSet<String> temp_file_set=null;
+
+ public static AtomicLong peaks=new AtomicLong();
+ public static AtomicLong spikes=new AtomicLong();
+ public static AtomicLong flats=new AtomicLong();
+ public static AtomicLong valleys=new AtomicLong();
+ public static AtomicLong slopes=new AtomicLong();
+
+ public static final byte FIXED_N_QUAL=20;
+
+}
diff --git a/current/jgi/KmerSample.java b/current/jgi/KmerSample.java
new file mode 100755
index 0000000..427e73f
--- /dev/null
+++ b/current/jgi/KmerSample.java
@@ -0,0 +1,123 @@
+package jgi;
+
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 10, 2012
+ *
+ */
+public class KmerSample {
+
+
+ public static int[] makeKmerSet(int K, String filename){
+
+ //Number of bits in a kmer
+ int kbits=2*K;
+
+ //Number of possible kmers
+ long kmerSpace=(1L<<kbits);
+
+ //Make an array of the correct size, remembering that int is 32 bits
+ int[] array=new int[(int)(kmerSpace/32)];
+
+ //Current kmer
+ long kmer=0;
+
+ //Length of current kmer
+ int len=0;
+
+ //This will create a bitmask of 00000...0000111111...11111, where the number if 1's is equal to kbits.
+ long mask=~((-1L)<<kbits);
+
+ //Initialize an input stream for the fasta file
+ TextFile tf=new TextFile(filename, false, false);
+
+ //Grab the first line of the fasta file
+ String line=tf.nextLine();
+
+ while(line!=null){
+
+ if(line.length()<1){
+ //The line is empty, so ignore it (should never happen in a proper fasta file)
+ }else if(line.charAt(0)=='>'){
+ //The line is name of a new contig/scaffold, so reset the kmer
+ kmer=0;
+ len=0;
+ }else{
+ //Otherwise, generate kmers
+
+ for(int i=0; i<line.length(); i++){
+
+ //The base at location "i" in the string
+ char letter=line.charAt(i);
+
+ //The 2-bit numeric code for the base
+ int code;
+
+ if(letter=='A'){code=0;}
+ else if(letter=='C'){code=1;}
+ else if(letter=='G'){code=2;}
+ else if(letter=='T'){code=3;}
+ else{code=-1;}
+
+ if(code<0){
+ //The base was an N or degenerate letter, so reset the kmer
+ kmer=0;
+ len=0;
+ }else{
+ //insert the code into the current kmer
+ kmer=(kmer<<2); //left shift by 2
+ kmer=(kmer|code); //or with the code
+ kmer=(kmer&mask); //and with the mask to prevent going past the intended kmer length
+ len++; //Increment the length of the kmer
+
+ if(len>=K){
+ //If the kmer is long enough, then add it to the array
+
+ //The index in the array is the upper bits of the kmer. Each location in the array is 32 bits.
+ int index=(int)(kmer/32);
+
+ //The bit within the word of the array is the lower 5 bits of the kmer
+ int bit=(int)(kmer%32);
+
+ //A bitmask to set the correct bit in the array to 1.
+ int x=(1<<bit);
+
+ //OR the array location with the new mask.
+ array[index]=(array[index] | x);
+ }
+ }
+ }
+ }
+
+ //Grab the next line
+ line=tf.nextLine();
+ }
+
+ //Close your input stream
+ tf.close();
+
+ return array;
+ }
+
+ /**
+  * Tests whether the bit for a kmer is set in a packed bitset.
+  * @param kmer Numeric kmer; its value divided by 32 selects the word, its remainder selects the bit
+  * @param array Bitset holding 32 kmers per int
+  * @return true if the kmer's bit is 1
+  */
+ public static boolean containsKmer(long kmer, int[] array){
+     //Each location in the array is 32 bits, so the upper bits of the kmer select the word...
+     final int word=array[(int)(kmer/32)];
+     //...and the lower 5 bits select the bit inside that word.
+     final int flag=1<<(int)(kmer%32);
+     return (word&flag)!=0;
+ }
+
+}
diff --git a/current/jgi/LogLog.java b/current/jgi/LogLog.java
new file mode 100755
index 0000000..ee2f028
--- /dev/null
+++ b/current/jgi/LogLog.java
@@ -0,0 +1,415 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+import kmer.Primes;
+
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import ukmer.Kmer;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 30, 2015
+ *
+ */
+public class LogLog {
+
+ /** Program entry point: wraps the command-line arguments in a LogLogWrapper and runs it. */
+ public static void main(String[] args){
+     new LogLogWrapper(args).process();
+ }
+
+ /**
+  * Estimates the number of distinct keys from the arithmetic mean of the per-bucket maxima.
+  * Also stores the result in the static lastCardinality field.
+  * @return The cardinality estimate
+  */
+ public final long cardinality(){
+     final int slots=maxArray.length();
+     long total=0;
+     int idx=0;
+     while(idx<slots){
+         total+=maxArray.get(idx);
+         idx++;
+     }
+     final double mean=total/(double)buckets;
+     //Scale by SKIPMOD because hash(long) only admits keys divisible by SKIPMOD; 1.262 is used as-is from the original formula.
+     final double raw=(Math.pow(2, mean)-1)*buckets*SKIPMOD;
+     final long estimate=(long)(raw/1.262);
+     lastCardinality=estimate;
+     return estimate;
+ }
+
+ /**
+  * Estimates the number of distinct keys from the harmonic mean of the per-bucket maxima.
+  * Buckets holding less than 1 are counted as 1 so the reciprocal is defined.
+  * @return The cardinality estimate
+  */
+ public final long cardinalityH(){
+     final int slots=maxArray.length();
+     double reciprocals=0;
+     int idx=0;
+     while(idx<slots){
+         final int clamped=Tools.max(1, maxArray.get(idx));
+         reciprocals+=1.0/clamped;
+         idx++;
+     }
+     final double harmonicMean=buckets/reciprocals;
+     final double estimate=Math.pow(2, harmonicMean)*buckets*SKIPMOD;
+     return (long)estimate;
+ }
+
+ /** Builds a LogLog from the loglogbuckets, loglogbits, loglogk and loglogseed fields of a Parser. */
+ public LogLog(Parser p){
+ this(p.loglogbuckets, p.loglogbits, p.loglogk, p.loglogseed);
+ }
+
+ /**
+ * Creates an estimator and its random code tables.
+ * @param buckets_ Number of slots in the per-bucket maximum array
+ * @param bits_ Number of key bits consumed per hashing step; each code table row has 2^bits_ entries
+ * @param k_ Requested kmer length, passed through Kmer.getKbig
+ * @param seed Seed for the code tables; a negative value makes makeCodes use an unseeded Random
+ */
+ public LogLog(int buckets_, int bits_, int k_, long seed){
+ // hashes=hashes_;
+ buckets=buckets_;
+ bits=bits_;
+ k=Kmer.getKbig(k_);
+ //NOTE(review): with atomic=false, maxArray is null, yet hash(long) and cardinality() only ever touch maxArray - confirm that mode is unsupported.
+ maxArray=(atomic ? new AtomicIntegerArray(buckets) : null);
+ maxArray2=(atomic ? null : new long[buckets]);
+ //Enough steps of 'bits' bits each to cover all 64 bits of a key.
+ steps=(63+bits)/bits;
+ tables=new long[numTables][][];
+ for(int i=0; i<numTables; i++){
+ //Each table gets its own seed (seed+i); a negative seed is passed on as -1.
+ tables[i]=makeCodes(steps, bits, (seed<0 ? -1 : seed+i));
+ }
+ 
+ // assert(false) : "steps="+steps+", "+tables.length+", "+tables[0].length+", "+tables[0][0].length;
+ }
+
+ /**
+  * Scrambles a 64-bit value using a table of random codes.
+  * The value is consumed 'bits' bits at a time; each chunk selects a code from the matching
+  * table row, which is XORed into the running result before a 3-bit left rotation.
+  * @param value0 Value to hash
+  * @param table Code table with at least 'steps' rows of 2^bits entries
+  * @return The hashed value
+  */
+ public long hash(final long value0, final long[][] table){
+     final long chunkMask=~((-1L)<<bits);
+     long remaining=value0;
+     long code=value0;
+
+     int step=0;
+     while(step<steps){
+         final int chunk=(int)(remaining&chunkMask);
+         code=Long.rotateLeft(code^table[step][chunk], 3);
+         remaining>>=bits;
+         step++;
+     }
+
+     //Final rotation depends on the low 5 bits of the input.
+     final int finalRotation=(int)(value0&31);
+     return Long.rotateLeft(code, finalRotation);
+ }
+
+ /** Adds one numeric key (hashSmall and hashBig pass encoded kmers) by forwarding it to hash(long). */
+ public void add(long number){
+ hash(number);
+ }
+
+ /**
+  * Adds the kmers of a read, and of its mate if present, to the estimator.
+  * Sequences shorter than k are skipped. A null read is ignored.
+  * @param r Read to process; may be null
+  */
+ public void hash(Read r){
+     //Guard once up front: the mate check below also dereferences r.
+     if(r==null){return;}
+     if(r.length()>=k){hash(r.bases);}
+     if(r.mateLength()>=k){hash(r.mate.bases);}
+ }
+
+ public void hash(byte[] bases){
+ if(k<32){hashSmall(bases);}
+ else{hashBig(bases);}
+ }
+
+ /**
+  * Adds every kmer of a sequence for k below 32, keeping a forward and a reverse-complement
+  * kmer in parallel and adding the larger of the two.
+  * The valid-base counter restarts at each 'N', so no kmer spanning an 'N' is added.
+  * @param bases Sequence bytes, used as indices into the Dedupe lookup tables
+  */
+ public void hashSmall(byte[] bases){
+     final int kmerBits=2*k;
+     final int topShift=kmerBits-2;
+     final long mask=~((-1L)<<kmerBits);
+
+     long fwd=0, rev=0;
+     int valid=0;
+     for(final byte b : bases){
+         final long code=Dedupe.baseToNumber[b];
+         final long comp=Dedupe.baseToComplementNumber[b];
+         //Forward kmer grows at the low end; reverse kmer grows at the high end.
+         fwd=((fwd<<2)|code)&mask;
+         rev=(rev>>>2)|(comp<<topShift);
+         valid=(b=='N' ? 0 : valid+1);
+         if(valid<k){continue;}
+         add(Tools.max(fwd, rev));
+     }
+ }
+
+ /**
+  * Adds every kmer of a sequence for k of 32 or more, using a per-thread Kmer object
+  * and adding the value of its xor() once k bases have been seen since the last 'N'.
+  * @param bases Sequence bytes, used as indices into the Dedupe lookup table
+  */
+ public void hashBig(byte[] bases){
+
+     //Lazily create this thread's Kmer on first use.
+     Kmer kmer=localKmer.get();
+     if(kmer==null){
+         kmer=new Kmer(k);
+         localKmer.set(kmer);
+     }
+
+     int valid=0;
+     for(final byte b : bases){
+         final long code=Dedupe.baseToNumber[b];
+         kmer.addRightNumeric(code);
+         valid=(b=='N' ? 0 : valid+1);
+         if(valid<k){continue;}
+         add(kmer.xor());
+     }
+ }
+
+
+ /**
+  * Hashes one numeric key and records the number of leading zeros of the hash in the key's bucket,
+  * if it exceeds the value already stored there.
+  * Keys not divisible by SKIPMOD are ignored, as are hashes with fewer than 3 leading zeros.
+  * @param number Key to add; may be negative
+  */
+ public void hash(final long number){
+     if(number%SKIPMOD!=0){return;}
+
+     //Java's % keeps the sign of the dividend, so move negative remainders into 0..4
+     //before using them to select a table; otherwise a negative key indexes out of bounds.
+     int i=(int)(number%5);
+     if(i<0){i+=5;}
+
+     long key=Long.rotateRight(number, 1);
+     key=hash(key, tables[i%numTables]);
+     final int leading=Long.numberOfLeadingZeros(key);
+
+     if(leading<3){return;}
+     //Masking with Integer.MAX_VALUE keeps the bucket index non-negative.
+     final int bucket=(int)((number&Integer.MAX_VALUE)%buckets);
+
+     //Lock-free max update: retry until the stored value is at least 'leading'.
+     int x=maxArray.get(bucket);
+     while(leading>x){
+         if(maxArray.compareAndSet(bucket, x, leading)){x=leading;}
+         else{x=maxArray.get(bucket);}
+     }
+ }
+
+ /**
+  * Generates a table of random 64-bit codes, each adjusted to have between 31 and 33 set bits.
+  * @param length Number of rows
+  * @param bits Each row has 2^bits entries
+  * @param seed Seed for the generator; a negative value uses an unseeded Random
+  * @return The code table
+  */
+ private static long[][] makeCodes(int length, int bits, long seed){
+     final Random rng=(seed>=0 ? new Random(seed) : new Random());
+     final int width=1<<bits;
+     final long[][] codes=new long[length][width];
+     for(final long[] row : codes){
+         for(int col=0; col<width; col++){
+             long word=rng.nextLong();
+             //Clear random bits while there are too many ones...
+             while(Long.bitCount(word)>33){
+                 word&=(~(1L<<rng.nextInt(64)));
+             }
+             //...and set random bits while there are too few.
+             while(Long.bitCount(word)<31){
+                 word|=(1L<<rng.nextInt(64));
+             }
+             row[col]=word;
+         }
+     }
+     return codes;
+ }
+
+ public final int k;
+ public final int numTables=4;
+ public final int bits;
+// public final int hashes;
+ public final int steps;
+ private final long[][][] tables;
+ public final AtomicIntegerArray maxArray;
+ public final long[] maxArray2;
+// public final long[] counts=new long[64];
+ public int buckets;
+ private final ThreadLocal<Kmer> localKmer=new ThreadLocal<Kmer>();
+
+
+ private static class LogLogWrapper{
+
+ /**
+ * Parses command-line arguments and resolves the input files.
+ * Flags handled here, in addition to those accepted by Parser: buckets/loglogbuckets,
+ * bits/loglogbits, k/loglogk, seed/loglogseed and verbose.
+ * @param args Command-line arguments of the form key=value
+ */
+ public LogLogWrapper(String[] args){
+ 
+ //Global I/O settings, changed for the whole JVM.
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+ 
+ 
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Only the text between the first and second '=' becomes the value.
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+ 
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("buckets") || a.equals("loglogbuckets")){
+ long x=Tools.parseKMG(b);
+ //Request is capped at 1000000 before Primes.primeAtLeast is applied (presumably rounding up to a prime - confirm).
+ buckets=(int)Primes.primeAtLeast(Tools.min(1000000, x));
+ }else if(a.equals("bits") || a.equals("loglogbits")){
+ bits=Integer.parseInt(b);
+ }else if(a.equals("k") || a.equals("loglogk")){
+ k=Integer.parseInt(b);
+ }else if(a.equals("seed") || a.equals("loglogseed")){
+ seed=Long.parseLong(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("parse_flag_goes_here")){
+ //Set a variable here
+ }else{
+ //Unknown flags only abort when assertions are enabled; otherwise they are just reported.
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+ 
+ {//Process parser fields
+ Parser.processQuality();
+ 
+ maxReads=parser.maxReads;
+ 
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ 
+ //in1 and in2 may each be a comma-delimited list of files.
+ in1=(parser.in1==null ? null : parser.in1.split(","));
+ in2=(parser.in2==null ? null : parser.in2.split(","));
+ out=parser.out1;
+ }
+ 
+ //NOTE(review): this is only checked with assertions enabled; without -ea a missing input fails below with a NullPointerException.
+ assert(in1!=null && in1.length>0) : "No primary input file specified.";
+ {
+ ffin1=new FileFormat[in1.length];
+ ffin2=new FileFormat[in1.length];
+ 
+ for(int i=0; i<in1.length; i++){
+ String a=in1[i];
+ String b=(in2!=null && in2.length>i ? in2[i] : null);
+ assert(a!=null) : "Null input filename.";
+ //A '#' in a name that is not an existing file is expanded into a 1/2 pair of filenames.
+ if(b==null && a.indexOf('#')>-1 && !new File(a).exists()){
+ b=a.replace("#", "2");
+ a=a.replace("#", "1");
+ }
+ 
+ ffin1[i]=FileFormat.testInput(a, FileFormat.FASTQ, null, true, true);
+ ffin2[i]=FileFormat.testInput(b, FileFormat.FASTQ, null, true, true);
+ }
+ }
+ 
+ assert(FastaReadInputStream.settingsOK());
+ }
+
+
+ void process(){
+ Timer t=new Timer();
+ LogLog log=new LogLog(buckets, bits, k, seed);
+
+
+ for(int ffnum=0; ffnum<ffin1.length; ffnum++){
+ ConcurrentReadInputStream cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, ffin1[ffnum], ffin2[ffnum]);
+ cris.start();
+
+ LogLogThread[] threads=new LogLogThread[Shared.threads()];
+ for(int i=0; i<threads.length; i++){
+ threads[i]=new LogLogThread(log, cris);
+ }
+ for(LogLogThread llt : threads){
+ llt.start();
+ }
+ for(LogLogThread llt : threads){
+ while(llt.getState()!=Thread.State.TERMINATED){
+ try {
+ llt.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+
+ errorState|=ReadWrite.closeStreams(cris);
+ }
+
+ int[] copy=new int[log.maxArray.length()];
+ for(int i=0; i<log.maxArray.length(); i++){
+// System.err.println(log.maxArray.get(i));
+ copy[i]=log.maxArray.get(i);
+ }
+
+ t.stop();
+
+
+ long cardinality=log.cardinality();
+
+ if(out!=null){
+ ReadWrite.writeString(cardinality+"\n", out);
+ }
+
+// Arrays.sort(copy);
+// System.err.println("Median: "+copy[Tools.median(copy)]);
+
+// System.err.println("Mean: "+Tools.mean(copy));
+// System.err.println("Harmonic Mean: "+Tools.harmonicMean(copy));
+ System.err.println("Cardinality: "+log.cardinality());
+// System.err.println("CardinalityH: "+log.cardinalityH());
+
+// for(long i : log.counts){System.err.println(i);}
+
+ System.err.println("Time: \t"+t);
+ }
+
+ /** Worker thread that pulls lists of reads from a shared input stream and hashes them into a shared LogLog. */
+ private class LogLogThread extends Thread{
+
+     /**
+      * @param log_ Estimator that receives the reads
+      * @param cris_ Input stream to pull read lists from
+      */
+     LogLogThread(LogLog log_, ConcurrentReadInputStream cris_){
+         log=log_;
+         cris=cris_;
+     }
+
+     @Override
+     public void run(){
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+         while(reads!=null && reads.size()>0){
+
+             for(Read r : reads){
+                 log.hash(r);
+             }
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         //The loop also ends when nextList() returned null or a list-less ListNum, so guard before returning it.
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     private final LogLog log;
+     private final ConcurrentReadInputStream cris;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int buckets=1999;
+ private int bits=8;
+ private int k=31;
+ private long seed=-1;
+
+
+ private String[] in1=null;
+ private String[] in2=null;
+ private String out=null;
+
+ /*--------------------------------------------------------------*/
+
+ protected long readsProcessed=0;
+ protected long basesProcessed=0;
+
+ private long maxReads=-1;
+
+ boolean overwrite=false;
+ boolean append=false;
+ boolean errorState=false;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat[] ffin1;
+ private final FileFormat[] ffin2;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+ }
+
+ private static PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public static boolean atomic=true;
+ private static final long SKIPMOD=3;
+ public static long lastCardinality=-1;
+
+}
diff --git a/current/jgi/MakeChimeras.java b/current/jgi/MakeChimeras.java
new file mode 100755
index 0000000..c31f862
--- /dev/null
+++ b/current/jgi/MakeChimeras.java
@@ -0,0 +1,412 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 7, 2014
+ *
+ */
+public class MakeChimeras {
+
+ /** Program entry point: starts a timer, parses the arguments and runs the chimera generator. */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     new MakeChimeras(args).process(timer);
+ }
+
+ /**
+  * Parses command-line arguments and prepares the input and output file formats.
+  * Flags handled here, in addition to those accepted by Parser: verbose, forcelength,
+  * and readsout (alias chimeras). A first argument without '=' that starts with "stdin"
+  * or names an existing file is taken as the input file.
+  * @param args Command-line arguments
+  * @throws RuntimeException if no input file is given or the output file cannot be written
+  */
+ public MakeChimeras(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+     outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+     //Global I/O settings, changed for the whole JVM.
+     Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+     Shared.capBuffers(4);
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+     ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+     FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+
+     Parser parser=new Parser();
+     for(int i=0; i<args.length; i++){
+         String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if(b==null || b.equalsIgnoreCase("null")){b=null;}
+         while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+         if(parser.parse(arg, a, b)){
+             //do nothing
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+             ByteFile1.verbose=verbose;
+             ByteFile2.verbose=verbose;
+             stream.FastaReadInputStream.verbose=verbose;
+             ConcurrentGenericReadInputStream.verbose=verbose;
+             stream.FastqReadInputStream.verbose=verbose;
+             ReadWrite.verbose=verbose;
+         }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+             parser.in1=arg;
+         }else if(a.equals("forcelength")){
+             forceLength=Integer.parseInt(b);
+         }else if(a.equals("readsout") || a.equals("chimeras")){
+             readsOut=Tools.parseKMG(b);
+         }else{
+             //Unknown flags only abort when assertions are enabled; otherwise they are just reported.
+             outstream.println("Unknown parameter "+args[i]);
+             assert(false) : "Unknown parameter "+args[i];
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+
+         readsIn=parser.maxReads;
+
+         //Without these, the overwrite and append flags were parsed but never took effect.
+         overwrite=parser.overwrite;
+         append=parser.append;
+
+         in1=parser.in1;
+         qfin1=parser.qfin1;
+
+         out1=parser.out1;
+
+         extin=parser.extin;
+         extout=parser.extout;
+     }
+
+     assert(FastaReadInputStream.settingsOK());
+
+     if(in1==null){
+         printOptions();
+         throw new RuntimeException("Error - at least one input file is required.");
+     }
+     if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+         ByteFile.FORCE_MODE_BF2=true;
+     }
+
+     if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+     if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+         outstream.println((out1==null)+", "+out1);
+         throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
+     }
+
+     ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+     ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /**
+  * Loads all non-empty input reads into memory, then writes 'readsOut' chimeras, each made
+  * from two randomly chosen source reads.
+  * @param t Timer, already running, used for both the read phase and the write phase
+  * @throws RuntimeException if no chimera can be made from the input, or if any stream reported an error
+  */
+ void process(Timer t){
+     assert(readsOut>0) : "Please set the 'readsout' flag to a positive integer.";
+
+     final ArrayList<Read> source=loadSource(t);
+
+     if(readsOut>=0){
+         t.start();
+
+         final int mod=source.size();
+         if(readsOut>0){
+             //Random.nextInt(0) would throw an uninformative exception, so fail with a clear message instead.
+             if(mod<1){
+                 throw new RuntimeException("No non-empty reads were found in the input; cannot make chimeras.");
+             }
+             //With forcelength set, the second piece needs a read longer than forcelength;
+             //if none exists every attempt fails and the retry loop below would never end.
+             if(forceLength>0){
+                 boolean usable=false;
+                 for(Read r : source){
+                     if(r.length()>forceLength){usable=true; break;}
+                 }
+                 if(!usable){
+                     throw new RuntimeException("forcelength="+forceLength+" requires at least one input read longer than that; cannot make chimeras.");
+                 }
+             }
+         }
+
+         final TextStreamWriter tsw;
+         if(ffout1==null){
+             tsw=null;
+         }else{
+             tsw=new TextStreamWriter(ffout1);
+             tsw.start();
+         }
+
+         final Random randy=new Random();
+
+         long readsProcessed=0;
+         long basesProcessed=0;
+         //'made' counts successful chimeras and doubles as the numeric ID of the next one.
+         long made=0;
+         while(made<readsOut){
+             Read a=source.get(randy.nextInt(mod));
+             Read b=source.get(randy.nextInt(mod));
+             Read c=makeChimera(a, b, randy, made);
+             if(c==null){continue;} //Failed attempt; retry with a new pair
+             made++;
+             if(tsw!=null){
+                 tsw.println(c);
+                 readsProcessed++;
+                 basesProcessed+=c.length();
+             }
+         }
+
+         if(tsw!=null){errorState|=tsw.poisonAndWait();}
+
+         t.stop();
+
+         printStats("Write Time: \t", "Reads Out: ", "Bases Out: ", t, readsProcessed, basesProcessed);
+     }
+
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /** Reads the whole input, keeps every read with at least one base, stops the timer and prints input statistics. */
+ private ArrayList<Read> loadSource(Timer t){
+     final ArrayList<Read> source=new ArrayList<Read>();
+
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(readsIn, false, ffin1, null, qfin1, null);
+     if(verbose){outstream.println("Started cris");}
+     cris.start(); //4567
+     assert(!cris.paired());
+
+     long readsProcessed=0;
+     long basesProcessed=0;
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     if(reads!=null && !reads.isEmpty()){
+         Read r=reads.get(0);
+         assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+     }
+
+     while(reads!=null && reads.size()>0){
+
+         for(int idx=0; idx<reads.size(); idx++){
+             final Read r1=reads.get(idx);
+             assert(r1.mate==null);
+
+             final int initialLength1=r1.length();
+
+             if(initialLength1>0){
+                 source.add(r1);
+             }
+
+             readsProcessed++;
+             basesProcessed+=initialLength1;
+         }
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+
+     errorState|=ReadWrite.closeStream(cris);
+
+     t.stop();
+
+     printStats("Read Time: \t", "Reads In: ", "Bases In: ", t, readsProcessed, basesProcessed);
+     return source;
+ }
+
+ /** Prints the elapsed time plus read and base counts with their throughput, each line starting with the given label. */
+ private void printStats(String timeLabel, String readsLabel, String basesLabel, Timer t, long reads, long bases){
+     final double rpnano=reads/(double)(t.elapsed);
+     final double bpnano=bases/(double)(t.elapsed);
+
+     outstream.println(timeLabel+t);
+     outstream.println(readsLabel+formatCount(reads)+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println(basesLabel+formatCount(bases)+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ }
+
+ /** Formats a count as plain digits, thousands ("k") or millions ("m"), left-padded with spaces to 8 characters. */
+ private static String formatCount(long x){
+     String s=(x<100000 ? ""+x : x<100000000 ? (x/1000)+"k" : (x/1000000)+"m");
+     while(s.length()<8){s=" "+s;}
+     return s;
+ }
+
+ /**
+  * Joins a piece of read a to a piece of read b, and reverse-complements the result
+  * when the next random boolean is true.
+  * With forceLength set, the piece of a has up to forceLength bases and the piece of b has
+  * b.length()-forceLength bases; otherwise both pieces have random lengths.
+  * @param a Read supplying the first piece
+  * @param b Read supplying the second piece
+  * @param randy Random number source
+  * @param numericID Numeric ID for the new read
+  * @return The chimeric read, or null if either forced-length piece could not be made
+  */
+ private Read makeChimera(Read a, Read b, Random randy, long numericID) {
+     final String id=a.id+" ~ "+b.id;
+
+     final Read left, right;
+     if(forceLength>0){
+         left=getPiece(a, randy, forceLength);
+         right=getPiece(b, randy, b.length()-forceLength);
+         if(left==null || right==null){return null;}
+     }else{
+         left=getPiece(a, randy);
+         right=getPiece(b, randy);
+     }
+
+     final int alen=left.length(), blen=right.length();
+     final int len=alen+blen;
+
+     //Concatenate the bases of both pieces.
+     final byte[] bases=new byte[len];
+     System.arraycopy(left.bases, 0, bases, 0, alen);
+     System.arraycopy(right.bases, 0, bases, alen, blen);
+
+     //Qualities are kept only when both pieces have them.
+     byte[] quals=null;
+     if(left.quality!=null && right.quality!=null){
+         quals=new byte[len];
+         System.arraycopy(left.quality, 0, quals, 0, alen);
+         System.arraycopy(right.quality, 0, quals, alen, blen);
+     }
+
+     final Read chimera=new Read(bases, -1, -1, -1, id, quals, numericID, 0);
+     if(randy.nextBoolean()){chimera.reverseComplement();}
+     return chimera;
+ }
+
+ /**
+  * Cuts a piece of random length, from 1 to a.length() bases, out of a read.
+  * @param a Source read; must contain at least one base
+  * @param randy Random number source
+  * @return A new read holding the piece, reverse-complemented when the final random boolean is true
+  */
+ private Read getPiece(Read a, Random randy) {
+     //Only the length choice is specific to this overload; the cutting logic is shared with the fixed-length one.
+     final int len=randy.nextInt(a.length())+1;
+     return getPiece(a, randy, len);
+ }
+
+ /**
+  * Cuts a piece of the requested length out of a read.
+  * Half of the time the piece is anchored at one end of the read (start or end, equally likely);
+  * otherwise its start is chosen uniformly among all valid positions.
+  * @param a Source read
+  * @param randy Random number source
+  * @param len Desired piece length; reduced to a.length() if larger
+  * @return A new read holding the piece, reverse-complemented when the final random boolean is true,
+  * or null if the resulting length is below 1
+  */
+ private Read getPiece(Read a, Random randy, int len) {
+     final int total=a.length();
+     len=Tools.min(len, total);
+     if(len<1){return null;}
+
+     //Number of bases left over, which is also the largest valid start position.
+     final int slack=total-len;
+
+     final int start;
+     if(!randy.nextBoolean()){
+         start=randy.nextInt(slack+1);
+     }else if(randy.nextBoolean()){
+         start=0;
+     }else{
+         start=slack;
+     }
+     final int stop=start+len;
+
+     final byte[] bases=Arrays.copyOfRange(a.bases, start, stop);
+     final byte[] quals;
+     if(a.quality==null){
+         quals=null;
+     }else{
+         quals=Arrays.copyOfRange(a.quality, start, stop);
+     }
+
+     final Read piece=new Read(bases, -1, -1, -1, a.id, quals, a.numericID, 0);
+     if(randy.nextBoolean()){piece.reverseComplement();}
+     return piece;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Prints a short usage message listing the flags this class parses itself.
+  * Flags handled by the shared Parser are not listed.
+  */
+ private void printOptions(){
+     outstream.println("Syntax:\n");
+     outstream.println("java -ea -Xmx512m -cp <path> jgi.MakeChimeras in=<infile> out=<outfile> readsout=<number>");
+     outstream.println("\nParameters:\n");
+     outstream.println("in=<file>        \tInput reads (required).");
+     outstream.println("out=<file>       \tOutput file for the chimeric reads.");
+     outstream.println("readsout=<number>\t(chimeras) Number of chimeric reads to generate; must be positive.");
+     outstream.println("forcelength=0    \tIf positive, the first piece of each chimera has this length and the second piece is shortened by it.");
+     outstream.println("verbose=false    \tPrint verbose messages.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+
+ private String qfin1=null;
+
+ private String out1=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ private int forceLength=0;
+
+ /*--------------------------------------------------------------*/
+
+ private long readsIn=-1;
+ private long readsOut=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+
+ private final FileFormat ffout1;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+}
diff --git a/current/jgi/MakeCoverageHistogram.java b/current/jgi/MakeCoverageHistogram.java
new file mode 100755
index 0000000..1290d19
--- /dev/null
+++ b/current/jgi/MakeCoverageHistogram.java
@@ -0,0 +1,280 @@
+package jgi;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.CoverageArray;
+import dna.CoverageArray2;
+import dna.Data;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+import align2.ListNum;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 16, 2012
+ *
+ */
+public class MakeCoverageHistogram {
+
+ /**
+  * Program entry point.
+  * The first two arguments are the read files (the second may be the literal "null");
+  * later arguments are key=value flags: genome (alias build) and maxdepth.
+  * @param args Command-line arguments
+  * @throws IllegalArgumentException if fewer than two arguments are given
+  */
+ public static void main(String[] args){
+     System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+     //args[0] and args[1] are used unconditionally below, so reject short argument lists with a clear message.
+     if(args==null || args.length<2){
+         throw new IllegalArgumentException("Usage: MakeCoverageHistogram <reads file 1> <reads file 2 or null> genome=<build> [maxdepth=<int>]");
+     }
+
+     Timer t=new Timer();
+
+     Data.GENOME_BUILD=-1;
+     for(int i=2; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(a.equals("genome") || a.equals("build")){
+             Data.setGenome(Integer.parseInt(b));
+         }else if(a.equals("maxdepth")){
+             MAX_DEPTH=Integer.parseInt(b);
+         }
+     }
+     assert(Data.GENOME_BUILD>-1);
+
+     calc(args[0], args[1]);
+     t.stop();
+     System.out.println("Time: \t"+t);
+ }
+
+ public static void calc(String fname1, String fname2){
+ RTextInputStream rtis=new RTextInputStream(fname1, (fname2==null || fname2.equals("null") ? null : fname2), -1);
+ ConcurrentLegacyReadInputStream cris=new ConcurrentLegacyReadInputStream(rtis, -1);
+
+ cris.start();
+ System.err.println("Started cris");
+ boolean paired=cris.paired();
+ System.err.println("Paired: "+paired);
+
+ ArrayList<CoverageArray> pcov=new ArrayList<CoverageArray>(8);
+ pcov.add(new CoverageArray2(0,1000));
+ ArrayList<CoverageArray> cov=new ArrayList<CoverageArray>(8);
+ cov.add(new CoverageArray2(0,1000));
+
+ {
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert(paired==(r.mate!=null));
+ }
+
+ while(reads!=null && reads.size()>0){
+ //System.err.println("reads.size()="+reads.size());
+ for(Read r : reads){
+ readsProcessed++;
+
+// System.out.println("Processing read "+r.numericID);
+
+ if(r!=null){
+ if(r.sites!=null){
+
+ for(int x=0; x<r.sites.size(); x++){
+ SiteScore ss=r.sites.get(x);
+
+ if(PROCESS_ALL_SITES || x==0 || ss.semiperfect){
+ sitesProcessed++;
+
+ boolean b=false;
+ if(ss.perfect || ss.semiperfect){
+ b=true;
+ }else{//Check for no-refs
+ int len=ss.stop-ss.start+1;
+ if(len==r.length()){
+ b=checkPerfection(ss.start, ss.stop, r.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f);
+ }
+ }
+ if(b){
+ while(pcov.size()<=ss.chrom){
+ pcov.add(new CoverageArray2(pcov.size(), 500));
+ }
+ CoverageArray ca=pcov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+
+ while(cov.size()<=ss.chrom){
+ cov.add(new CoverageArray2(cov.size(), 500));
+ }
+ CoverageArray ca=cov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+ }
+// System.out.println(sitesProcessed);
+ }
+ }
+
+ if(r.mate!=null){
+ Read r2=r.mate;
+ if(r2.sites!=null){
+
+ for(int x=0; x<r2.sites.size(); x++){
+ SiteScore ss=r2.sites.get(x);
+
+ if(PROCESS_ALL_SITES || x==0 || ss.semiperfect){
+ sitesProcessed++;
+
+ boolean b=false;
+ if(ss.perfect || ss.semiperfect){
+ b=true;
+ }else{//Check for no-refs
+ int len=ss.stop-ss.start+1;
+ if(len==r2.length()){
+ b=checkPerfection(ss.start, ss.stop, r2.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f);
+ }
+ }
+ if(b){
+ while(pcov.size()<=ss.chrom){
+ pcov.add(new CoverageArray2(pcov.size(), 500));
+ }
+ CoverageArray ca=pcov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+
+ while(cov.size()<=ss.chrom){
+ cov.add(new CoverageArray2(cov.size(), 500));
+ }
+ CoverageArray ca=cov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+ }
+ }
+ }
+
+// System.out.println(r.toString());
+// assert(r.list!=null);
+// assert(r.list.size()>0);
+
+ }
+ //System.err.println("returning list");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ System.err.println("Finished reading");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ System.err.println("Returned list");
+ ReadWrite.closeStream(cris);
+ System.err.println("Closed stream");
+ System.err.println("Processed "+readsProcessed+" reads.");
+ System.err.println("Processed "+sitesProcessed+" sites.");
+ }
+
+ int max=MAX_DEPTH;
+ long[] hist=new long[max+1];
+ long[] phist=new long[max+1];
+ double[] histF=new double[max+1];
+ double[] phistF=new double[max+1];
+ long[] histC=new long[max+1];
+ long[] phistC=new long[max+1];
+ double[] histCF=new double[max+1];
+ double[] phistCF=new double[max+1];
+
+ for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ if(pcov.size()>chrom){
+ CoverageArray ca=pcov.get(chrom);
+ for(int i=0; i<=cha.maxIndex; i++){
+ int x=ca.get(i);
+ byte b=cha.get(i);
+ if(b!='N'){
+ phist[Tools.min(max, x)]++;
+ }
+ }
+ }
+ }
+
+ for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ if(cov.size()>chrom){
+ CoverageArray ca=cov.get(chrom);
+ for(int i=0; i<=cha.maxIndex; i++){
+ int x=ca.get(i);
+ byte b=cha.get(i);
+ if(b!='N'){
+ hist[Tools.min(max, x)]++;
+ }
+ }
+ }
+ }
+
+ phistC[max]=phist[max];
+ histC[max]=hist[max];
+ for(int i=max; i>0; i--){
+ phistC[i-1]=phistC[i]+phist[i-1];
+ histC[i-1]=histC[i]+hist[i-1];
+ }
+ for(int i=0; i<=max; i++){
+ phistCF[i]=phistC[i]*100d/phistC[0];
+ phistF[i]=phist[i]*100d/phistC[0];
+ histCF[i]=histC[i]*100d/histC[0];
+ histF[i]=hist[i]*100d/histC[0];
+ }
+
+ System.out.println("\nTotal Coverage:");
+ for(int i=0; i<=max; i++){
+ System.out.println(i+"\t"+hist[i]+String.format("\t%.3f%%", histF[i])+"\t"+histC[i]+String.format("\t%.3f%%", histCF[i]));
+ }
+
+
+ System.out.println("\nPerfect Coverage:");
+ for(int i=0; i<=max; i++){
+ System.out.println(i+"\t"+phist[i]+String.format("\t%.3f%%", phistF[i])+"\t"+phistC[i]+String.format("\t%.3f%%", phistCF[i]));
+ }
+
+ }
+
+ private static boolean checkPerfection(int start, int stop, byte[] bases, ChromosomeArray cha, boolean rcomp, float f) {
+
+ int noref=0;
+ if(rcomp){
+ for(int i=0; i<bases.length; i++){
+ byte a=AminoAcid.baseToComplementExtended[bases[bases.length-i-1]];
+ byte b=cha.get(start+i);
+ if(b=='N'){noref++;}
+ else if(a!=b){return false;}
+ }
+ }else{
+ for(int i=0; i<bases.length; i++){
+ byte a=bases[i];
+ byte b=cha.get(start+i);
+ if(b=='N'){noref++;}
+ else if(a!=b){return false;}
+ }
+ }
+ return bases.length-noref>=f*bases.length;
+ }
+
+ public static long readsProcessed=0;
+ public static long sitesProcessed=0;
+ public static boolean PROCESS_ALL_SITES=false;
+ public static int MAX_DEPTH=100;
+
+}
diff --git a/current/jgi/MakeLengthHistogram.java b/current/jgi/MakeLengthHistogram.java
new file mode 100755
index 0000000..de5b8d4
--- /dev/null
+++ b/current/jgi/MakeLengthHistogram.java
@@ -0,0 +1,231 @@
+package jgi;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 16, 2012
+ *
+ */
+public class MakeLengthHistogram {
+
+    /**
+     * Command-line entry point. Parses key=value arguments (plus up to three
+     * positional file names), stores the settings in the static fields of this
+     * class, runs {@link #calc(String, String, String)} and prints the elapsed time.
+     *
+     * @param args Command-line arguments
+     * @throws RuntimeException on an unrecognized argument or a bin size below 1
+     */
+    public static void main(String[] args){
+        Timer t=new Timer();
+
+        String in1=null, in2=null;
+        String out=null;
+
+        Data.GENOME_BUILD=-1;
+        ReadWrite.USE_UNPIGZ=true;
+        Shared.READ_BUFFER_LENGTH=Tools.mid(1, Shared.READ_BUFFER_LENGTH, 20);
+
+        /* Parse arguments */
+        for(int i=0; i<args.length; i++){
+
+            final String arg=args[i];
+            String[] split=arg.split("=");
+            //"=".split("=") yields an empty array, so guard the key lookup.
+            String a=(split.length>0 ? split[0] : "").toLowerCase();
+            String b=split.length>1 ? split[1] : null;
+            if("null".equalsIgnoreCase(b)){b=null;}
+            //Strip leading dashes; the length test keeps "-" or an empty key from throwing.
+            while(a.length()>0 && a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+            if(Parser.isJavaFlag(arg)){
+                //jvm argument; do nothing
+            }else if(Parser.parseZip(arg, a, b)){
+                //do nothing
+            }else if(Parser.parseQuality(arg, a, b)){
+                //do nothing
+            }else if(a.equals("reads") || a.equals("maxreads")){
+                maxReads=Tools.parseKMG(b);
+            }else if(a.equals("append") || a.equals("app")){
+                append=ReadStats.append=Tools.parseBoolean(b);
+            }else if(a.equals("overwrite") || a.equals("ow")){
+                overwrite=Tools.parseBoolean(b);
+            }else if(a.equals("in") || a.equals("in1")){
+                in1=b;
+            }else if(a.equals("in2")){
+                in2=b;
+            }else if(a.equals("out") || a.equals("hist") || a.equals("lhist")){
+                out=b;
+            }else if(a.equals("max") || a.equals("maxlength")){
+                MAX_LENGTH=Integer.parseInt(b);
+            }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+                NON_ZERO_ONLY=Tools.parseBoolean(b);
+            }else if(a.startsWith("mult") || a.startsWith("div") || a.startsWith("bin")){
+                MULT=Integer.parseInt(b);
+            }else if(a.equals("round")){
+                ROUND_BINS=Tools.parseBoolean(b);
+            }else if(i==0 && !arg.contains("=")){
+                in1=arg;
+            }else if(i==1 && !arg.contains("=")){
+                in2=arg;
+            }else if((i==2 || i==3) && !arg.contains("=")){
+                //Index 3 is what was originally accepted; index 2 is accepted as well so that
+                //"in1 in2 out" works instead of being rejected as an unknown argument.
+                out=arg;
+            }else{
+                throw new RuntimeException("Unknown argument: "+arg);
+            }
+        }
+
+        {//Process parser fields
+            Parser.processQuality();
+        }
+
+        //A bin size below 1 would otherwise surface as a bare divide-by-zero or a negative array size.
+        if(MULT<1){throw new RuntimeException("Bin size must be at least 1: "+MULT);}
+
+        //From here on MAX_LENGTH is expressed in bins rather than bases.
+        MAX_LENGTH/=MULT;
+
+        calc(in1, in2, out);
+        t.stop();
+        System.err.println("Time: \t"+t);
+    }
+
+    /**
+     * Streams all reads (and their mates) from the input, bins them by length and
+     * writes a summary header followed by the per-bin histogram.
+     * The bin of a read is length/MULT (length+MULT/2 first if ROUND_BINS), capped at MAX_LENGTH.
+     *
+     * @param in1 Primary input file
+     * @param in2 Secondary input file; may be null
+     * @param out Output file; null writes to "stdout"
+     */
+    public static void calc(String in1, String in2, String out){
+
+        FastaReadInputStream.MIN_READ_LEN=1;
+
+        final ConcurrentReadInputStream cris;
+        {
+            FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+            FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+            cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+            cris.start(); //4567
+        }
+        boolean paired=cris.paired();
+
+        final int max=MAX_LENGTH;
+        long[] readHist=new long[max+1];
+        long[] baseHist=new long[max+1];
+
+        int maxFound=0;
+        int minFound=Integer.MAX_VALUE;
+
+        {
+            ListNum<Read> ln=cris.nextList();
+            ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+            if(reads!=null && !reads.isEmpty()){
+                Read r=reads.get(0);
+                assert(paired==(r.mate!=null));
+            }
+
+            while(reads!=null && reads.size()>0){
+                for(Read r1 : reads){
+                    //Check for null before touching r1.mate.
+                    if(r1==null){continue;}
+
+                    //Tally the read itself (m==0) and then its mate (m==1).
+                    for(int m=0; m<2; m++){
+                        final Read r=(m==0 ? r1 : r1.mate);
+                        if(r==null || r.bases==null){continue;}
+                        readsProcessed++;
+                        final int x=r.length();
+                        final int y=Tools.min(max, ((ROUND_BINS ? x+MULT/2 : x))/MULT);
+                        readHist[y]++;
+                        baseHist[y]+=x;
+                        maxFound=Tools.max(maxFound, x);
+                        minFound=Tools.min(minFound, x);
+                    }
+                }
+                cris.returnList(ln.id, ln.list.isEmpty());
+                ln=cris.nextList();
+                reads=(ln!=null ? ln.list : null);
+            }
+            if(verbose){System.err.println("Finished reading");}
+            //The loop also ends when nextList() returned null or a list-less ListNum,
+            //so the final return must not dereference blindly.
+            if(ln!=null){
+                cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+            }
+            if(verbose){System.err.println("Returned list");}
+            ReadWrite.closeStream(cris);
+            if(verbose){System.err.println("Closed stream");}
+            System.err.println("Processed "+readsProcessed+" reads.");
+        }
+
+        if(readsProcessed<1){minFound=0;}
+        //Statistics are computed on bin indices and scaled back to bases.
+        double stdev=Tools.standardDeviationHistogram(readHist)*MULT;
+        int median=Tools.percentile(readHist, 0.5)*MULT;
+        int mode=Tools.calcMode(readHist)*MULT;
+
+        double[] readHistF=new double[max+1];
+        long[] readHistC=new long[max+1];
+        double[] readHistCF=new double[max+1];
+
+        double[] baseHistF=new double[max+1];
+        long[] baseHistC=new long[max+1];
+        double[] baseHistCF=new double[max+1];
+
+        //Cumulative counts run from the longest bin downward: C[i] = sum of hist[i..max].
+        readHistC[max]=readHist[max];
+        baseHistC[max]=baseHist[max];
+        for(int i=max; i>0; i--){
+            readHistC[i-1]=readHistC[i]+readHist[i-1];
+            baseHistC[i-1]=baseHistC[i]+baseHist[i-1];
+        }
+        //Percentages relative to the grand totals held in C[0]; with no reads these are NaN.
+        for(int i=0; i<=max; i++){
+            readHistCF[i]=readHistC[i]*100d/readHistC[0];
+            readHistF[i]=readHist[i]*100d/readHistC[0];
+            baseHistCF[i]=baseHistC[i]*100d/baseHistC[0];
+            baseHistF[i]=baseHist[i]*100d/baseHistC[0];
+        }
+
+        TextStreamWriter tsw=new TextStreamWriter(out==null ? "stdout" : out, overwrite, append, false);
+        tsw.start();
+        tsw.println("#Reads:\t"+readsProcessed);
+        tsw.println("#Bases:\t"+baseHistC[0]);
+        tsw.println("#Max:\t"+maxFound);
+        tsw.println("#Min:\t"+minFound);
+        tsw.println("#Avg:\t"+String.format("%.1f",(baseHistC[0]*1d/readsProcessed)));
+        tsw.println("#Median:\t"+median);
+        tsw.println("#Mode:\t"+mode);
+        tsw.println("#Std_Dev:\t"+String.format("%.1f",stdev));
+        tsw.println("#Read Length Histogram:");
+        tsw.println("#Length\treads\tpct_reads\tcum_reads\tcum_pct_reads\tbases\tpct_bases\tcum_bases\tcum_pct_bases");
+        for(int i=0; i<=max; i++){
+            if(readHist[i]>0 || !NON_ZERO_ONLY){
+                tsw.println((i*MULT)+"\t"+readHist[i]+String.format("\t%.3f%%", readHistF[i])+"\t"+readHistC[i]+String.format("\t%.3f%%", readHistCF[i])+
+                        "\t"+baseHist[i]+String.format("\t%.3f%%", baseHistF[i])+"\t"+baseHistC[i]+String.format("\t%.3f%%", baseHistCF[i]));
+            }
+            //Nothing longer than maxFound was seen, so later rows would all be empty.
+            if(i*MULT>=maxFound){break;}
+        }
+        tsw.poisonAndWait();
+    }
+
+    /** Passed to the input stream as its read limit. */
+    public static long maxReads=-1;
+    /** Running count of reads (mates counted individually) tallied by calc. */
+    public static long readsProcessed=0;
+    /** Highest histogram index; main divides it by MULT, after which it is measured in bins. */
+    public static int MAX_LENGTH=80000;
+    /** Width of one histogram bin, in bases. */
+    public static int MULT=10;
+    /** If true, MULT/2 is added to a length before it is divided by MULT. */
+    public static boolean ROUND_BINS=false;
+    /** If true, rows whose read count is zero are not printed. */
+    public static boolean NON_ZERO_ONLY=false;
+
+    /** Passed to the output writer as its append flag. */
+    public static boolean append=false;
+    /** Passed to the output writer as its overwrite flag. */
+    public static boolean overwrite=true;
+    /** Enables progress messages on stderr. */
+    public static boolean verbose=false;
+
+}
diff --git a/current/jgi/MateReadsMT.java b/current/jgi/MateReadsMT.java
new file mode 100755
index 0000000..2734de2
--- /dev/null
+++ b/current/jgi/MateReadsMT.java
@@ -0,0 +1,1600 @@
+package jgi;
+
+import java.io.File;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import bloom.KCountArray;
+import bloom.KmerCount7MT;
+import bloom.KmerCountAbstract;
+
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.ReadStreamWriter;
+
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 14, 2012
+ *
+ */
+public class MateReadsMT {
+
+
+ /**
+  * Program entry point: builds a MateReadsMT from the command line and runs it.
+  * @param args Command-line arguments, handed unchanged to the constructor
+  */
+ public static void main(String[] args){
+     final MateReadsMT merger=new MateReadsMT(args);
+     merger.process();
+ }
+
+ /**
+  * Parses the command line and stores the resulting settings in the fields of this class.
+  * The first one or two arguments may be bare file names (no '='); every argument is
+  * then also run through the key=value parser below, where a missing value defaults to "true".
+  *
+  * @param args Command-line arguments
+  * @throws RuntimeException if both positional inputs name the same file, or if an
+  * argument is not recognized
+  */
+ public MateReadsMT(String[] args){
+     System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+     System.err.println("BBMerge version "+version);
+
+     Timer ttotal=new Timer();
+     ttotal.start();
+
+     //Positional inputs: only considered when the argument does not look like key=value.
+     //The length check keeps an empty command line from throwing here.
+     in1_primary=(args.length<1 || args[0].indexOf('=')>0 ? null : args[0]);
+     in2_primary=(in1_primary!=null && args.length>1 && args[1].indexOf('=')<0 ? args[1] : null);
+     if(in2_primary!=null && "null".equalsIgnoreCase(in2_primary)){in2_primary=null;}
+
+     {
+         if(in1_primary!=null && !in1_primary.contains(",") && !in1_primary.startsWith("stdin.") && !in1_primary.equals("stdin")){
+             File f=new File(in1_primary);
+             if(!f.exists() || !f.isFile()){
+                 in1_primary=null;
+             }
+         }
+         if(in2_primary!=null && !in2_primary.contains(",")){
+             File f=new File(in2_primary);
+             if(!f.exists() || !f.isFile()){
+                 in2_primary=null;
+             }else if(in1_primary!=null && in1_primary.equalsIgnoreCase(in2_primary)){
+                 //in1_primary may have been cleared just above, hence the null test.
+                 throw new RuntimeException("Both input files are the same.");
+             }
+         }
+     }
+
+     Parser parser=new Parser();
+     ReadWrite.MAX_ZIP_THREADS=Shared.threads()-1;
+
+
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=(split.length>1 ? split[1] : "true");
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQualityAdjust(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(parser.parseInterleaved(arg, a, b)){
+             //do nothing
+         }else if(parser.parseTrim(arg, a, b)){
+             //do nothing
+         }else if(a.equals("in") || a.equals("in1")){
+             in1_primary=b;
+         }else if(a.equals("in2")){
+             in2_primary=b;
+         }else if(a.equals("k") || a.equals("kmer")){
+             k_G=Integer.parseInt(b);
+         }else if(a.equals("minoverlappingbases") || a.equals("minoverlapbases")){
+             MIN_OVERLAPPING_BASES=Integer.parseInt(b);
+         }else if(a.equals("minoverlap") || a.equals("minoverlappingkmers") || a.equals("minoverlapkmers")){
+             MIN_OVERLAPPING_KMERS=Integer.parseInt(b);
+         }else if(a.equals("minoverlappingbases0") || a.equals("minoverlapbases0")){
+             MIN_OVERLAPPING_BASES_0=Integer.parseInt(b);
+         }else if(a.equals("minoverlap0") || a.equals("minoverlappingkmers0") || a.equals("minoverlapkmers0")){
+             MIN_OVERLAPPING_KMERS_0=Integer.parseInt(b);
+         }else if(a.equals("minoverlapinsert") || a.equals("minoi")){
+             MIN_OVERLAP_INSERT=Integer.parseInt(b);
+         }else if(a.equals("badlimit")){
+             DEFAULT_BADLIMIT=Integer.parseInt(b);
+         }else if(a.startsWith("matrixbits")){
+             int matrixbits=Integer.parseInt(b);
+             assert(matrixbits<63);
+             totalcells_G=1L<<matrixbits;
+         }else if(a.startsWith("cells")){
+             totalcells_G=Tools.parseKMG(b);
+         }else if(a.equals("passes")){
+             passes_G=Integer.parseInt(b);
+         }else if(a.equals("bin")){
+             bin=Integer.parseInt(b);
+         }else if(a.equals("threads") || a.equals("t")){
+             THREADS=Integer.parseInt(b);
+         }else if(a.startsWith("minqo")){
+             //Must be tested before the shorter "minq" prefix, which would otherwise capture it.
+             MIN_QUALITY_FOR_OVERLAP=(byte)Integer.parseInt(b);
+         }else if(a.startsWith("minq")){
+             MIN_QUALITY=(byte)Integer.parseInt(b);
+         }else if(a.equals("maxq")){
+             Read.MAX_MERGE_QUALITY=(byte)Integer.parseInt(b);
+         }else if(a.startsWith("minprob")){
+             KmerCountAbstract.minProb=Float.parseFloat(b);
+         }else if(a.startsWith("hashes") || a.startsWith("multihash")){
+             hashes=Integer.parseInt(b);
+             assert(hashes>0 && hashes<25);
+         }else if(a.startsWith("cbits") || a.startsWith("cellbits")){
+             cbits_G=Integer.parseInt(b);
+             //Largest value a cell of cbits_G bits can hold; clamp the hit thresholds to it.
+             int cmax=(1<<cbits_G)-1;
+             MAX_HITS_FOR_BAD=Tools.min(MAX_HITS_FOR_BAD, cmax-1);
+             MIN_HITS_FOR_GOOD=Tools.min(MIN_HITS_FOR_GOOD, cmax);
+         }else if(a.startsWith("minvotes")){
+             MIN_VOTES=Integer.parseInt(b);
+         }else if(a.endsWith("hitsforbad")){
+             MAX_HITS_FOR_BAD=Integer.parseInt(b);
+         }else if(a.endsWith("hitsforgood")){
+             MIN_HITS_FOR_GOOD=Integer.parseInt(b);
+         }else if(a.equals("maxbadbases")){
+             DEFAULT_BADLIMIT_FOR_BASE_MATCHING=Integer.parseInt(b);
+         }else if(a.equals("reads") || a.startsWith("maxreads")){
+             maxReads_G=Tools.parseKMG(b);
+         }else if(a.startsWith("tablereads")){
+             tableReads_G=Tools.parseKMG(b);
+         }else if(a.startsWith("gap")){
+             if(b.equalsIgnoreCase("true") || b.equalsIgnoreCase("null")){
+                 System.err.println("Note - no hash tables will be used as there are no gaps.");
+                 gap_G=new int[1][0];
+             }else{
+                 //Phases are separated by ';' or ':', gap lengths within a phase by ','.
+                 String[] h=b.split("[;:]");
+                 gap_G=new int[h.length][];
+                 for(int m=0; m<h.length; m++){
+                     String[] g=h[m].split(",");
+                     if(g.length==0 || (g.length==1 && (g[0].length()==0 || g[0].equalsIgnoreCase("null")))){
+                         gap_G[m]=new int[0];
+                     }else{
+                         gap_G[m]=new int[g.length];
+                         for(int j=0; j<g.length; j++){
+                             gap_G[m][j]=Integer.parseInt(g[j]);
+                         }
+                     }
+                 }
+             }
+         }else if(a.startsWith("extra")){
+             extra_G=Arrays.asList(b.split(","));
+         }else if(a.equals("outgood") || a.startsWith("outpair") || a.equals("outmerged") || a.equals("out")){
+             outgood_G=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("temp") || a.equals("tempfile")){
+             assert(b.contains("#") && b.contains(".txt"));
+             tempfile=b;
+         }else if(a.equals("outb") || a.equals("outu") || a.equals("outunmerged") || a.equals("outbad") || a.startsWith("outunpair") || a.startsWith("outsingle")){
+             outbad_G=(b==null || b.equals("null") ? null : b);
+         }else if(a.startsWith("outinsert") || a.startsWith("outlength")){
+             outinsert_G=(b==null || b.equals("null") ? null : b);
+         }else if(a.startsWith("outhist3") || a.equals("hist3")){
+             //hist3 and hist2 are tested before the generic "outhist"/"hist" prefixes below.
+             outhist3=(b==null || b.equals("null") ? null : b);
+         }else if(a.startsWith("outhist2") || a.equals("hist2")){
+             outhist2=(b==null || b.equals("null") ? null : b);
+         }else if(a.startsWith("outhist") || a.startsWith("hist") || a.equals("ihist")){
+             outhist=(b==null || b.equals("null") ? null : b);
+         }else if(a.equals("outputfailed")){
+             OUTPUT_FAILED=Tools.parseBoolean(b);
+         }else if(a.equals("mix")){
+             MIX_BAD_AND_GOOD=Tools.parseBoolean(b);
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+         }else if(a.equals("join")){
+             join_G=Tools.parseBoolean(b);
+         }else if(a.equals("usemapping")){
+             USE_MAPPING=Tools.parseBoolean(b);
+         }else if(a.equals("useoverlap") || a.equals("usebases") || a.equals("matebyoverlap") || a.equals("matebybases")){
+             MATE_BY_OVERLAP=Tools.parseBoolean(b);
+         }else if(a.startsWith("skipmated")){
+             SKIP_MATED_READS=Tools.parseBoolean(b);
+         }else if(a.startsWith("writeintermediatejoined")){
+             WRITE_INTERMEDIATE_JOINED=Tools.parseBoolean(b);
+         }else if(a.startsWith("fillmiddleinter")){
+             FILL_MIDDLE_INTERMEDIATE=Tools.parseBoolean(b);
+         }else if(a.startsWith("fillmiddlefinal")){
+             FILL_MIDDLE_FINAL=Tools.parseBoolean(b);
+         }else if(a.equals("fillmiddle")){
+             FILL_MIDDLE_INTERMEDIATE=FILL_MIDDLE_FINAL=Tools.parseBoolean(b);
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+         }else if(a.equals("trimonfailure") || a.equals("tof")){
+             //Accepts either a number or a boolean (true maps to 1, false to 0).
+             if(b!=null && Character.isDigit(b.charAt(0))){
+                 TRIM_ON_OVERLAP_FAILURE=Integer.parseInt(b);
+             }else{
+                 TRIM_ON_OVERLAP_FAILURE=(Tools.parseBoolean(b) ? 1 : 0);
+             }
+         }else if(a.equals("mi") || a.equals("minins") || a.equals("mininsert")){
+             minInsert=Integer.parseInt(b);
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+
+         qtrimLeft=parser.qtrimLeft;
+         qtrimRight=parser.qtrimRight;
+         trimq=parser.trimq;
+         qtrim=((qtrimLeft||qtrimRight)&&trimq>=0);
+         minReadLength=parser.minReadLength;
+         untrim=parser.untrim;
+     }
+
+     //A '#' in the input name that is not an existing file expands to a 1/2 file pair.
+     if(in2_primary==null && in1_primary!=null && in1_primary.contains("#") && !new File(in1_primary).exists()){
+         in2_primary=in1_primary.replaceFirst("#", "2");
+         in1_primary=in1_primary.replaceFirst("#", "1");
+     }
+
+     if(in2_primary!=null){
+         //in1_primary is null when only in2 was supplied; do not dereference it in that case.
+         assert(in1_primary==null || !in1_primary.equalsIgnoreCase(in2_primary));
+         FASTQ.TEST_INTERLEAVED=false;
+         FASTQ.FORCE_INTERLEAVED=false;
+     }else{
+         FASTQ.TEST_INTERLEAVED=true;
+         FASTQ.FORCE_INTERLEAVED=true;
+     }
+
+     if(FILL_MIDDLE_INTERMEDIATE){
+         if(!WRITE_INTERMEDIATE_JOINED){System.err.println("WRITE_INTERMEDIATE_JOINED forced to true.");}
+         WRITE_INTERMEDIATE_JOINED=true;
+     }
+     if(WRITE_INTERMEDIATE_JOINED){
+         if(!join_G){System.err.println("Final output forced to be joined reads.");}
+         join_G=true;
+         //Ultimately I could re-read the initial files, so this is not truly needed.
+     }
+ }
+
+ /**
+  * Runs the whole merge: sizes the k-mer tables, optionally builds the middle table,
+  * executes every phase through runPhase (all but the last write to the temp file
+  * pattern and feed the next phase), writes the requested insert-size histograms and
+  * prints the summary statistics to stderr.
+  *
+  * @throws RuntimeException if no gap sizes were given and neither USE_MAPPING nor
+  * MATE_BY_OVERLAP is set
+  */
+ void process(){
+     Timer ttotal=new Timer();
+     ttotal.start();
+
+     final int hwthr=Shared.threads();
+     if(THREADS<1){THREADS=hwthr;}
+     System.err.println("Detected "+Runtime.getRuntime().availableProcessors()+" hardware threads; using "+THREADS+" for main process.");
+     long memory=(Runtime.getRuntime().maxMemory());
+     System.err.println("Detected "+(memory/(1L<<20))+" MB available memory.");
+
+     if(gap_G!=null){
+         for(int[] g : gap_G){
+             maxtables=Tools.max(maxtables, g.length);
+             for(int g2 : g){assert(g2>0) : "TODO: Ungapped kmers do not currently work. Please use gap lengths of >0.";}
+         }
+     }
+     if(maxtables<1 && !USE_MAPPING && !MATE_BY_OVERLAP){
+         throw new RuntimeException("No gap sizes have been specified, so there is no work to do.");
+     }
+
+     //NOTE(review): if totalcells_G still holds a negative "unset" value this doubles it,
+     //and the ==-1 test below then no longer matches - confirm the intended default handling.
+     if(passes_G>1){totalcells_G*=2;}
+
+     if(auto && maxtables>0 && totalcells_G<0){
+         //Use the larger of 70% of (max heap minus 256000000 bytes) and 40% of max heap.
+         final long usable=(long)Tools.max(((memory-(256000000))*.7), memory*0.4);
+         long mem=usable;
+         totalcells_G=(mem*8)/cbits_G;
+
+         System.err.println("\nAuto settings:");
+         System.err.println("k: \t"+k_G);
+         System.err.println("cbits: \t"+cbits_G);
+         System.err.println("cells: \t"+Tools.toKMG(totalcells_G));
+         System.err.println("hashes: \t"+hashes);
+         System.err.println();
+     }else if(totalcells_G==-1){
+         totalcells_G=1L<<34;
+     }
+
+     String in1=in1_primary, in2=in2_primary;
+
+     KCountArray middleTable=null;
+     if(FILL_MIDDLE_INTERMEDIATE || FILL_MIDDLE_FINAL){
+         maxtables++;
+         long cells=totalcells_G/maxtables;
+         //There are only 4^k distinct k-mers, so more cells than that are not requested.
+         if(k_G<32 && cells>(1L<<(2*k_G))){cells=(1L<<(2*k_G));}
+         middleTable=KmerCount7MT.makeKca(in1, in2, extra_G, MIDDLE_TABLE_K, cbits_G, 0, cells, hashes+1, MIN_QUALITY, true, tableReads_G, 1, 4, 2, 2, null, 0);
+         middleTable.shutdown();
+         System.err.println("MiddleTable: \tgap = "+middleTable.gap+" \tmem = "+middleTable.mem()+" \tused = "+String.format("%.3f%%",middleTable.usedFraction()*100));
+     }
+
+     final int cmax=(1<<cbits_G)-1;
+     assert(MIN_HITS_FOR_GOOD>MAX_HITS_FOR_BAD && MIN_HITS_FOR_GOOD<=cmax && MAX_HITS_FOR_BAD>0);
+
+     final int phases=(gap_G==null ? 1 : gap_G.length);
+
+     KmerCountAbstract.PREJOIN=false;
+
+     String a1=in1, a2=in2;
+
+     //Intermediate phases: output goes to the temp pattern and becomes the next phase's input.
+     int oldzip=ReadWrite.ZIPLEVEL;
+     for(int phase=0; phase<phases-1; phase++){
+         ReadWrite.ZIPLEVEL=Tools.min(oldzip, 4);
+         String temp=tempfile.replaceFirst("#", "#_P"+phase);
+         runPhase(gap_G[phase], a1, a2, extra_G, null, temp, null, cbits_G, k_G, totalcells_G, hashes, passes_G,
+                 WRITE_INTERMEDIATE_JOINED, Tools.max(maxReads_G, tableReads_G), tableReads_G, true, (FILL_MIDDLE_INTERMEDIATE ? middleTable : null));
+         a1=temp.replaceFirst("#", "1");
+         a2=temp.replaceFirst("#", "2");
+         System.err.println("\nPhase "+(phase+1)+" statistics.");
+         System.err.println("Insert range: \t"+insertMinTotal+" - "+insertMaxTotal);
+         System.err.println("90th percentile: \t"+Tools.percentile(histTotal, .9));
+         System.err.println("50th percentile: \t"+Tools.percentile(histTotal, .5));
+         System.err.println("10th percentile: \t"+Tools.percentile(histTotal, .1));
+         KmerCountAbstract.PREJOIN=true;
+     }
+     ReadWrite.ZIPLEVEL=oldzip;
+     if(!FILL_MIDDLE_FINAL){middleTable=null;}
+     //Final phase: writes the user-facing outputs.
+     runPhase((gap_G==null ? null : gap_G[gap_G.length-1]), a1, a2, extra_G, outinsert_G, outgood_G, outbad_G, cbits_G, k_G, totalcells_G, hashes, passes_G,
+             join_G, maxReads_G, tableReads_G, false, middleTable);
+
+     //Histogram with two columns: bin start and the rounded-up per-position average of the bin.
+     if(outhist!=null){
+         StringBuilder sb=new StringBuilder();
+         for(int i=0; i<histTotal.length && i<=insertMaxTotal; i+=bin){
+             int x=0;
+             int y=0;
+             for(int j=i; j<i+bin && j<histTotal.length; j++){
+                 x+=histTotal[j];
+                 y++;
+             }
+             x=(x+bin-1)/y;
+             sb.append(i+"\t"+x+"\n");
+         }
+         ReadWrite.writeStringInThread(sb, outhist);
+     }
+
+     //Same values as above, but without the bin-start column.
+     if(outhist2!=null){
+         StringBuilder sb=new StringBuilder();
+         for(int i=0; i<histTotal.length && i<=insertMaxTotal; i+=bin){
+             int x=0;
+             int y=0;
+             for(int j=i; j<i+bin && j<histTotal.length; j++){
+                 x+=histTotal[j];
+                 y++;
+             }
+             x=(x+bin-1)/y;
+             sb.append(x+"\n");
+         }
+         ReadWrite.writeStringInThread(sb, outhist2);
+     }
+
+     //Appends one column to an existing file; the file is seeded with the bin starts if absent.
+     if(outhist3!=null){
+
+         if(!new File(outhist3).exists()){
+             StringBuilder sb=new StringBuilder();
+             for(int i=0; i<histTotal.length; i+=bin){
+                 sb.append(i+"\n");
+             }
+             ReadWrite.writeString(sb, outhist3);
+         }
+
+         StringBuilder sb=new StringBuilder();
+         TextFile tf=new TextFile(outhist3, false, false);
+         for(int i=0; i<histTotal.length; i+=bin){
+             int x=0;
+             int y=0;
+             for(int j=i; j<i+bin && j<histTotal.length && i<=insertMaxTotal; j++){
+                 x+=histTotal[j];
+                 y++;
+             }
+             //Bins beyond insertMaxTotal contribute no positions (y==0); report 0 for them
+             //instead of dividing by zero.
+             x=(y>0 ? (x+bin-1)/y : 0);
+             sb.append(tf.readLine()+"\t"+x+"\n");
+         }
+         tf.close();
+         ReadWrite.writeStringInThread(sb, outhist3);
+     }
+
+
+     ttotal.stop();
+     System.err.println("Total time: "+ttotal+"\n");
+
+     long sum=correctCountTotal+incorrectCountTotal;
+
+     //Converts a count into a percentage of all processed pairs.
+     double div=100d/readsProcessedTotal;
+     System.err.println("Pairs: \t"+readsProcessedTotal);
+     System.err.println("Joined: \t"+sum+String.format((sum<10000 ? " " : " ")+"\t%.3f%%", sum*div));
+     if(FASTQ.PARSE_CUSTOM){
+         System.err.println("Correct: \t"+correctCountTotal+String.format((correctCountTotal<10000 ? " " : " ")+"\t%.3f%%", correctCountTotal*div));
+         System.err.println("Incorrect: \t"+incorrectCountTotal+String.format((incorrectCountTotal<10000 ? " " : " ")+"\t%.3f%%", incorrectCountTotal*div));
+     }
+     System.err.println("Ambiguous: \t"+ambiguousCountTotal+String.format((ambiguousCountTotal<10000 ? " " : " ")+"\t%.3f%%", ambiguousCountTotal*div));
+     System.err.println("No Solution: \t"+noSolutionCountTotal+String.format((noSolutionCountTotal<10000 ? " " : " ")+"\t%.3f%%", noSolutionCountTotal*div));
+     if(minInsert>0){System.err.println("Too Short: \t"+tooShortCountTotal+String.format((tooShortCountTotal<10000 ? " " : " ")+"\t%.3f%%", tooShortCountTotal*div));}
+     System.err.println("Avg Insert: \t\t"+String.format("%.1f", (insertSumCorrectTotal+insertSumIncorrectTotal)*1d/(correctCountTotal+incorrectCountTotal)));
+     if(FASTQ.PARSE_CUSTOM){
+         System.err.println("Avg Insert Correct: \t\t"+String.format("%.1f", (insertSumCorrectTotal)*1d/(correctCountTotal)));
+         System.err.println("Avg Insert Incorrect:\t\t"+String.format("%.1f", (insertSumIncorrectTotal)*1d/(incorrectCountTotal)));
+     }
+
+     System.err.println("\nPhase "+(phases)+" statistics.");
+     System.err.println("Insert range: \t"+insertMinTotal+" - "+insertMaxTotal);
+     System.err.println("90th percentile: \t"+Tools.percentile(histTotal, .9));
+     System.err.println("50th percentile: \t"+Tools.percentile(histTotal, .5));
+     System.err.println("10th percentile: \t"+Tools.percentile(histTotal, .1));
+ }
+
+ public static void runPhase(int[] gap, String in1, String in2, List<String> extra, String outinsert, String outgood, String outbad,
+ int cbits, int k, long totalcells, int multihash, int passes, boolean join, long maxReads, long tableReads, boolean perfectonly, KCountArray middleTable){
+
+ assert(((USE_MAPPING || MATE_BY_OVERLAP) && MIN_VOTES<2) ||
+ (MIN_VOTES>0 && MIN_VOTES<=gap.length)) : "minVotes is set too high. Should be at most the number of (overlapping) gaps.";
+
+ Timer thash=new Timer(), talign=new Timer();
+
+ assert(totalcells>1);
+ if(middleTable!=null){totalcells=totalcells-middleTable.cells;}
+ long cells=totalcells/(gap==null || gap.length==0 ? 1 : gap.length);
+ if(k<32 && cells>1L<<(2*k)){cells=1L<<(2*k);}
+
+ ConcurrentReadOutputStream rosgood=null;
+ ConcurrentReadOutputStream rosbad=null;
+ ConcurrentReadOutputStream rosinsert=null;
+
+ if(outgood!=null){
+ final String out1, out2;
+
+// assert(outgood.contains("#") || sam || fq) : outgood;
+ if(outgood.contains("#")){
+ out1=outgood.replaceFirst("#", "1");
+ out2=outgood.replaceFirst("#", "2");
+ }else{
+ out1=outgood;
+ out2=null;
+ if(!join){System.err.println("Writing joinable reads interleaved.");}
+ else{System.err.println("Writing joinable reads joined.");}
+ }
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1));
+ assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2)));
+
+ final FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, true);
+ final FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, true);
+ assert(!ff1.samOrBam()) : "Sam files need reference info for the header.";
+
+ final int buff=Tools.max(16, 2*THREADS);
+ rosgood=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, false);
+ rosgood.start();
+ }
+
+ if(outbad!=null){
+ final String out1, out2;
+
+// assert(outbad.contains("#") || sam || fq) : outbad;
+ if(outbad.contains("#")){
+ out1=outbad.replaceFirst("#", "1");
+ out2=outbad.replaceFirst("#", "2");
+ }else{
+ out1=outbad;
+ out2=null;
+ System.err.println("Writing unjoinable reads interleaved.");
+ }
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1));
+ assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2)));
+
+ final FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, true);
+ final FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, true);
+ assert(!ff1.samOrBam()) : "Sam files need reference info for the header.";
+
+ final int buff=Tools.max(16, 2*THREADS);
+ rosbad=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, false);
+ rosbad.start();
+ }
+
+ if(outinsert!=null){
+ final int buff=Tools.max(16, 2*THREADS);
+
+ String out1=outinsert.replaceFirst("#", "1");
+
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1));
+
+ ReadStreamWriter.HEADER=header();
+ final FileFormat ff=FileFormat.testOutput(out1, FileFormat.ATTACHMENT, ".info", true, overwrite, append, true);
+ rosinsert=ConcurrentReadOutputStream.getStream(ff, null, null, null, buff, null, false);
+ rosinsert.start();
+ }
+
+
+ if(rosgood!=null || rosbad!=null || rosinsert!=null){
+ System.err.println("Started output threads.");
+ }
+
+ thash.start();
+
+ KCountArray[] kca=new KCountArray[gap==null ? 0 : gap.length];
+ for(int i=0; i<kca.length; i++){
+ kca[i]=KmerCount7MT.makeKca(in1, in2, extra, k, cbits, gap[i], cells, multihash, MIN_QUALITY, true, tableReads, passes, 4, 2, 2, null, 0);
+ }
+
+ for(int i=0; i<kca.length; i++){
+ kca[i].shutdown();
+ System.err.println("Table "+i+":\tgap = "+kca[i].gap+" \tmem = "+kca[i].mem()+" \tused = "+String.format("%.3f%%",kca[i].usedFraction()*100));
+// printStatistics(kca[i]);
+ }
+
+ if(kca!=null && kca.length>0){
+ thash.stop();
+ System.err.println("Hash time: "+thash);
+ }
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+// assert(paired);//Fails on empty files.
+ if(verbose){System.err.println("Paired: "+paired);}
+
+ talign.start();
+
+
+ MateThread[] pta=new MateThread[THREADS];
+ for(int i=0; i<pta.length; i++){
+ pta[i]=new MateThread(cris, rosgood, rosbad, rosinsert, k, kca, join, perfectonly, middleTable);
+ pta[i].start();
+ }
+
+ insertMinTotal=999999999;
+ insertMaxTotal=0;
+
+ readsProcessedTotal=0;
+ matedCountTotal=0;
+ correctCountTotal=0;
+ ambiguousCountTotal=0;
+ tooShortCountTotal=0;
+ incorrectCountTotal=0;
+ noSolutionCountTotal=0;
+ insertSumCorrectTotal=0;
+ insertSumIncorrectTotal=0;
+
+ Arrays.fill(histTotal, 0);
+
+ for(int i=0; i<pta.length; i++){
+ MateThread ct=pta[i];
+ synchronized(ct){
+ while(ct.isAlive()){
+ try {
+ ct.join(1000);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ readsProcessedTotal+=ct.readsProcessed;
+ matedCountTotal+=ct.matedCount;
+ correctCountTotal+=ct.correctCount;
+ ambiguousCountTotal+=ct.ambiguousCount;
+ tooShortCountTotal+=ct.tooShortCount;
+ incorrectCountTotal+=ct.incorrectCount;
+ noSolutionCountTotal+=ct.noSolutionCount;
+ insertSumCorrectTotal+=ct.insertSumCorrect;
+ insertSumIncorrectTotal+=ct.insertSumIncorrect;
+
+ basesTrimmedTotal+=ct.basesTrimmedT;
+ readsTrimmedTotal+=ct.readsTrimmedT;
+
+ insertMinTotal=Tools.min(ct.insertMin, insertMinTotal);
+ insertMaxTotal=Tools.max(ct.insertMax, insertMaxTotal);
+
+// System.err.println(ct.insertMin+", "+ct.insertMax);
+
+ if(ct.hist!=null){
+ for(int h=0; h<ct.hist.length; h++){
+ histTotal[h]+=ct.hist[h];
+ }
+ }
+ }
+ }
+
+ System.err.println("Finished reading");
+ errorState|=ReadWrite.closeStreams(cris, rosgood, rosbad, rosinsert);
+
+ talign.stop();
+// System.err.println("Align time: "+talign);
+ }
+
+
+
+ /**
+  * Supplies the header line for the insert-size output.
+  * @return The nine tab-separated column names, prefixed with '#' and ending in a newline
+  */
+ public static String header(){
+     final String columns="#id\tnumericID\tinsert\tstatus\thashHits\thashMisses\tscore\tsum\tvotes";
+     return columns+"\n";
+ }
+
+
+ /**
+ * Single-table variant of mate pairing. With assertions enabled this method always
+ * fails on its first line ("Redo this based off of kca[] version."), so it only runs
+ * to completion when assertions are disabled.
+ * Tries every insert size in the testable range, scores each with scoreIP, and keeps
+ * the best candidate.
+ *
+ * @param a First read; its hash array is cached in a.obj
+ * @param b Second read; its hash array is cached in b.obj
+ * @param k1 Passed to hash() for read a; also part of the kmer width (presumably a kmer length - TODO confirm)
+ * @param k2 Passed (doubled) as the last hash() argument for read a and as the length for read b
+ * @param mask1 Passed to hash() for read a
+ * @param mask2 Passed to hash() for read b
+ * @param kca Count table; only its gap field is read here, the table itself goes to scoreIP
+ * @param rvector Result vector; allocated with 6 slots if null. On return:
+ * [0]=best score, [1]=best good count, [2]=best bad count, [3]=best sum, [4]=1 if ambiguous else 0.
+ * Slot [5] is not written by this method.
+ * @return The best insert size, or -1 if the range cannot be tested or nothing qualified
+ */
+ public static int mateRead(Read a, Read b, int k1, int k2, long mask1, long mask2, KCountArray kca, int[] rvector) {
+ assert(false) : "Redo this based off of kca[] version.";
+ if(rvector==null){rvector=new int[6];}
+ // Span covered by the two halves plus the gap between them.
+ final int width=kca.gap+k1+k2;
+
+ // Range of insert sizes that leaves at least MIN_OVERLAPPING_KMERS to compare.
+ int maxInsert=kca.gap+a.length()+b.length()-MIN_OVERLAPPING_KMERS+1;
+ int minInsert=width+MIN_OVERLAPPING_KMERS-1;
+
+ if(maxInsert<minInsert){
+ return -1; //Can't be tested
+ }
+
+ // Hash each read once and cache the result on the read object.
+ if(a.obj==null){a.obj=hash(a, k1, mask1, 2*k2);}
+ if(b.obj==null){b.obj=hash(b, k2, mask2, 0);}
+ long[] half1=(long[])a.obj;
+ long[] half2=(long[])b.obj;
+
+ int bestInsert=-1;
+ int bestScore=-1; //This serves as the threshold for the minimum score to report.
+ int bestGood=-1;
+ int bestBad=DEFAULT_BADLIMIT;
+ int bestSum=0;
+
+ int bestMismatches=DEFAULT_MISMATCHLIMIT;
+ // NOTE(review): bestMatches is never read or updated below.
+ int bestMatches=0;
+
+ final int pivot=k1+kca.gap+b.length()+(k1-k2);
+ boolean ambig=false;
+
+ for(int insert=minInsert; insert<=maxInsert; insert++){
+ // rvector[1..3] are read back below as good/bad/sum; presumably scoreIP fills them - verify.
+ int score=scoreIP(half1, half2, insert, pivot, kca, rvector, bestBad);
+ if(score>0){
+
+ final int overlap=Tools.min(insert, a.length()+b.length()-insert);
+ int matches=0;
+ int mismatches=0;
+ boolean ok=true;
+ // Base-level sanity check, only applied when at least 4 bases overlap.
+ if(overlap>=4){
+ mismatches=countMismatches(a, b, insert, bestMismatches+10);
+ matches=overlap-mismatches;
+ // Accept fewer than 3 mismatches, or mismatches below half the matches.
+ ok=(mismatches<3 || mismatches*2<matches);
+ bestMismatches=Tools.min(mismatches, bestMismatches);
+ }
+
+ int good=rvector[1], bad=rvector[2], sum=rvector[3];
+ if(ok && good>=MIN_OVERLAPPING_KMERS && bad<=bestBad){
+ // Strictly better: fewer bad, more good, or equal good with a higher sum.
+ if(bad<bestBad || good>bestGood || (good==bestGood && sum>bestSum)){
+ // A previous best that already had zero bad makes this new best ambiguous.
+ ambig=(bestBad==0);
+ bestScore=score;
+ bestInsert=insert;
+ bestGood=good;
+ bestBad=bad;
+ bestSum=sum;
+ if(ambig){break;}
+ }else if(good==bestGood && sum==bestSum){
+ // Exact tie with the current best.
+ assert(bad==bestBad && sum==bestSum) : bad+"~"+bestBad+", "+good+"~"+bestGood+", "+sum+"~"+bestSum;
+ ambig=true;
+ }
+ }
+ }
+ }
+
+ // Publish the best candidate's statistics to the caller.
+ rvector[0]=bestScore;
+ rvector[1]=bestGood;
+ rvector[2]=bestBad;
+ rvector[3]=bestSum;
+ rvector[4]=(ambig ? 1 : 0);
+
+ return bestInsert;
+ }
+
+
+ /**
+ * Estimates the insert size of a read pair by testing every candidate insert against one or more gapped kmer count tables.
+ * For each insert, every table whose insert range covers it is scored with scoreIP; the per-table results are combined
+ * (max score, max good, min bad, one vote per qualifying table) and then checked at the base level with countMismatches.
+ * When USE_MAPPING is set, no scoring is done: fixed values are written to rvector and the mapped insert size is returned.
+ * @param a First read; a.obj is used as a cache for its hashed kmer halves
+ * @param b Second read; b.obj is used as a cache for its hashed kmer halves
+ * @param k1 Kmer length passed to hash() for read a
+ * @param k2 Kmer length passed to hash() for read b
+ * @param mask1 Bit mask passed to hash() for read a
+ * @param mask2 Bit mask passed to hash() for read b
+ * @param kca Kmer count tables, one per gap size; must contain at least one element
+ * @param rvector Result vector of length at least 6, or null (a scratch array is then used and results are not visible to the caller).
+ * On normal return: [0]=best score, [1]=best good count, [2]=best bad count, [4]=1 if ambiguous else 0, [5]=best votes.
+ * [3] is not overwritten at the end and holds whatever the last scoreIP call left there.
+ * On the early -1 returns (unhashable read, empty insert range) rvector is not updated.
+ * @return Best insert size, or -1 if none was found
+ */
+ public static int mateRead(Read a, Read b, int k1, int k2, long mask1, long mask2, KCountArray kca[], int[] rvector) {
+
+// verbose=a.numericID>145;
+
+ //Allocate the fallback BEFORE any write to rvector; previously the USE_MAPPING branch dereferenced a null rvector.
+ if(rvector==null){rvector=new int[6];}
+
+ if(USE_MAPPING){
+ rvector[0]=100;
+ rvector[1]=20;
+ rvector[2]=0;
+ rvector[3]=20; //What is this?
+ rvector[4]=0;
+ rvector[5]=Tools.max(1, MIN_VOTES);
+ return a.insertSizeMapped(ignoreMappingStrand);
+ }
+
+ //Hash each read once and cache the result on the read
+ if(a.obj==null){a.obj=hash(a, k1, mask1, 2*k2);}
+ if(b.obj==null){b.obj=hash(b, k2, mask2, 0);}
+ long[] half1=(long[])a.obj;
+ long[] half2=(long[])b.obj;
+ if(half1==null || half2==null){return -1;}
+
+ int bestInsert=-1;
+ int bestScore=-1; //This serves as the threshold for the minimum score to report.
+ int bestGood=-1;
+ int bestBad=DEFAULT_BADLIMIT;
+
+ int minGap=kca[0].gap;
+ int maxGap=kca[0].gap;
+
+ final int[] pivot=new int[kca.length];
+ final int[] minInsert=new int[kca.length];
+ final int[] maxInsert=new int[kca.length];
+
+ //Per-table pivot and insert bounds. Note that the bounds use the running min/max gap seen so far, not the table's own gap.
+ for(int i=0; i<kca.length; i++){
+ int gap=kca[i].gap;
+ minGap=Tools.min(minGap, gap);
+ maxGap=Tools.max(maxGap, gap);
+ pivot[i]=k1+gap+b.length()+(k1-k2);
+ minInsert[i]=minGap+k1+k2+MIN_OVERLAPPING_KMERS-1;
+ maxInsert[i]=maxGap+a.length()+b.length()-MIN_OVERLAPPING_KMERS+1;
+ }
+
+
+ int overallMaxInsert=maxGap+a.length()+b.length()-MIN_OVERLAPPING_KMERS+1;
+ int overallMinInsert=minGap+k1+k2+MIN_OVERLAPPING_KMERS-1;
+
+ if(overallMaxInsert<overallMinInsert){
+ return -1; //Can't be tested
+ }
+
+ int bestMismatches=DEFAULT_MISMATCHLIMIT;
+ int bestMatches=0;
+ int bestVotes=0;
+// int bestSum=0;
+ int bestMatchScore=0;
+
+ boolean ambig=false;
+ final int minOverlap=MIN_OVERLAPPING_KMERS_0;
+
+// System.err.println("len = "+kca.length);
+// int guess=(kca.length==1 ? 0 : mateRead(a, b, k1, k2, mask1, mask2, new KCountArray[] {kca[1]}, null));
+// boolean vb=(guess==331 || guess==332);
+// assert(vb) : a.insertSize() +", "+ guess+", "+kca.length;
+
+// assert(false) : overallMinInsert+", "+overallMaxInsert+", "+minInsert[0]+", "+maxInsert[0];
+
+ for(int insert=overallMinInsert; insert<=overallMaxInsert; insert++){
+// verbose=(insert==174);
+ if(verbose){System.err.println("\nTesting read "+a.numericID+", insert "+insert);}
+
+ //Combined statistics over all tables for this insert
+ int good=-1;
+ int bad=999999;
+ int score=-1;
+ int votes=0;
+// int sum=0;
+ int matchScore=0;
+ for(int g=0; g<kca.length; g++){
+ if(insert>=minInsert[g] && insert<=maxInsert[g]){
+ if(verbose){System.err.println("Testing gap "+kca[g].gap);}
+ int x=scoreIP(half1, half2, insert, pivot[g], kca[g], rvector, bestBad);
+ final int good0=rvector[1], bad0=rvector[2];
+ if(verbose){System.err.println("score="+score+", rvector="+Arrays.toString(rvector));}
+ if((good0>MIN_OVERLAPPING_KMERS) && (bad0>bestBad || good0+bad0>=ACCEL_FACTOR)){
+// score=votes==0 ? x : Tools.min(score, x);
+// if(verbose){System.err.println("new score="+score);}
+// good=votes==0 ? good0 : Tools.min(good0, good);
+// bad=votes==0 ? bad0 : Tools.max(bad0, bad);
+
+ score=votes==0 ? x : Tools.max(score, x);
+ if(verbose){System.err.println("new score="+score);}
+ good=votes==0 ? good0 : Tools.max(good0, good);
+ bad=votes==0 ? bad0 : Tools.min(bad0, bad);
+ // sum=votes==0 ? rvector[3] : Tools.max(rvector[3], sum);
+ votes++;
+ if(bad>bestBad || score<=0){break;}
+ }
+ }
+
+ }
+ if(score>0/* && votes>=MIN_VOTES*/){
+
+ //Base-level check of the candidate insert
+ final int overlap=Tools.min(insert, a.length()+b.length()-insert);
+ int matches=0;
+ int mismatches=0;
+ boolean ok=true;
+ if(overlap>=minOverlap){
+ mismatches=countMismatches(a, b, insert, bestMismatches+10);
+ matches=overlap-mismatches;
+ matchScore=matches-mismatches*4;
+ ok=(mismatches<3 || mismatches*2<matches);
+ }
+
+ if(ok && good>=MIN_OVERLAPPING_KMERS && bad<=bestBad){
+
+
+ //Any additional qualifying candidate makes the result ambiguous unless it has strictly fewer bad kmers
+ ambig=true;
+ boolean setBest=false;
+ boolean quit=(bad==0 && bestBad==0);
+
+ if(bad<bestBad){
+ setBest=true;
+ ambig=false;
+ }else if(votes>bestVotes){
+ setBest=true;
+ }else if(votes==bestVotes && matches>=bestMatches && mismatches<=bestMismatches && (matches>bestMatches || mismatches<bestMismatches)){
+ // if(mismatches>=bestMismatches){ambig=true;}
+ setBest=true;
+ }else if(matchScore>=bestMatchScore && (good>bestGood /*|| (good==bestGood && sum>bestSum)*/)){
+ setBest=true;
+ }
+
+ if(setBest){
+ bestScore=score;
+ bestInsert=insert;
+ bestGood=good;
+ bestBad=bad;
+ bestVotes=votes;
+ if(overlap>=minOverlap){
+ bestMismatches=mismatches;
+ bestMatches=matches;
+ bestMatchScore=matchScore;
+ }
+ if(votes<MIN_VOTES){ambig=true;}
+ }
+ if(quit){break;}
+ }
+ }
+ }
+
+// if(vb){
+// if(guess==331){
+// if(bestInsert==331){
+// good331++;
+// System.err.println(guess+", "+bestInsert+", id="+a.numericID+", "+Arrays.toString(rvector));
+// }else{
+// bad331++;
+// System.err.println(guess+", "+bestInsert+", id="+a.numericID+", "+Arrays.toString(rvector));
+// }
+// }else if(guess==332){
+// if(bestInsert==332){
+// good332++;
+// System.err.println(guess+", "+bestInsert+", id="+a.numericID+", "+Arrays.toString(rvector));
+// }else{
+// bad332++;
+// System.err.println(guess+", "+bestInsert+", id="+a.numericID+", "+Arrays.toString(rvector));
+// }
+// }
+// if(good331>0 && bad331>0 && good332>0 && bad332>0){assert(false);}
+// }
+
+ //Publish the winning statistics to the caller
+ rvector[0]=bestScore;
+ rvector[1]=bestGood;
+ rvector[2]=bestBad;
+// rvector[3]=bestSum;
+ rvector[4]=(ambig ? 1 : 0);
+ rvector[5]=bestVotes;
+
+ return bestInsert;
+ }
+
+
+ /**
+ * Estimates the insert size of a read pair by directly comparing bases of the two reads at every candidate overlap length.
+ * Positions where either base is 'N', or where a quality array is present and the quality is below MIN_QUALITY_FOR_OVERLAP, are skipped.
+ * When USE_MAPPING is set, no comparison is done: fixed values are written to rvector and the mapped insert size is returned.
+ * @param a First read
+ * @param b Second read (the caller in this file passes it already reverse-complemented)
+ * @param rvector Result vector of length at least 6, or null (a scratch array is then used and results are not visible to the caller).
+ * On return: [0]=score computed from best good/bad counts, [1]=best good count, [2]=best bad count, [4]=1 if ambiguous else 0, [5]=0.
+ * [3] is not written in the non-mapping path.
+ * @param minOverlap0 Smallest overlap length to test (negative values are treated as 0)
+ * @param minOverlap A candidate must have more than this many matching bases; also bounds the largest overlap tested
+ * @return Estimated insert size (a.length + b.length - best overlap), or -1 if no overlap qualified or the result was ambiguous
+ */
+ public static int mateByOverlap(Read a, Read b, int[] rvector, final int minOverlap0, final int minOverlap) {
+ //Allocate the fallback BEFORE any write to rvector; previously the USE_MAPPING branch dereferenced a null rvector.
+ if(rvector==null){rvector=new int[6];}
+ if(USE_MAPPING){
+ rvector[0]=100;
+ rvector[1]=20;
+ rvector[2]=0;
+ rvector[3]=20; //What is this?
+ rvector[4]=0;
+ rvector[5]=Tools.max(1, MIN_VOTES);
+ return a.insertSizeMapped(ignoreMappingStrand);
+ }
+ final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality;
+
+ int bestOverlap=-1;
+// int bestScore=-1; //This serves as the threshold for the minimum score to report.
+ int bestGood=-1;
+ int bestBad=DEFAULT_BADLIMIT_FOR_BASE_MATCHING;
+ final int margin=2;
+
+ boolean ambig=false;
+ final int maxOverlap=abases.length+bbases.length-Tools.max(minOverlap, MIN_OVERLAP_INSERT);
+// assert(false) : minOverlap+", "+maxOverlap;
+// System.err.print("\nm");
+
+ for(int overlap=Tools.max(minOverlap0, 0); overlap<maxOverlap; overlap++){
+// System.err.print("\nn");
+// verbose=(insert==174);
+ if(verbose){System.err.println("\nTesting read "+a.numericID+", overlap "+overlap+", insert "+(abases.length+bbases.length-overlap));}
+
+
+ int tested=0;
+ int good=0, bad=0;
+
+ //Starting offsets in b (i) and a (j) for this overlap length
+ int istart=(overlap<=abases.length ? 0 : overlap-abases.length);
+ int jstart=(overlap<=abases.length ? abases.length-overlap : 0);
+// System.err.print("o");
+
+ //Compare aligned bases; stop early once mismatches reach bestBad+margin
+ for(int i=istart, j=jstart, badlim=bestBad+margin; i<overlap && i<bbases.length && j<abases.length && bad<badlim; i++, j++){
+ assert(j>=0 && j<=abases.length && i>=0 && i<=bbases.length) : "\njstart="+jstart+", j="+j+", istart="+istart+", i="+i+" \n"+
+ "overlap="+overlap+", a.length="+a.length()+", b.length="+b.length()+", bad="+bad+", badlim="+badlim+", good="+good+", tested="+tested;
+ byte ca=abases[j], cb=bbases[i];
+ if(ca=='N' || cb=='N' || (aqual!=null && aqual[j]<MIN_QUALITY_FOR_OVERLAP) || (bqual!=null && bqual[i]<MIN_QUALITY_FOR_OVERLAP)){
+ //do nothing
+ }else{
+ assert(AminoAcid.isFullyDefined(ca) && AminoAcid.isFullyDefined(cb)) : (char)ca+", "+(char)cb;
+ tested++;
+ if(ca==cb){good++;}
+ else{bad++;}
+ }
+ }
+// System.err.print("p");
+
+// System.err.println(overlap+", "+bestOverlap+", "+bestGood+", "+bestBad+", "+ambig);
+
+// System.err.print("a");
+ if(good>minOverlap){//Candidate
+ if(bad<=bestBad){
+
+// System.err.print("b");
+ if(bad<bestBad || (bad==bestBad && good>bestGood)){//Current winner
+ //A winner that is not better than the previous best by at least 'margin' mismatches is ambiguous
+ if(bad>bestBad-margin){ambig=true;}
+ bestOverlap=overlap;
+ bestBad=bad;
+ bestGood=good;
+// assert(abases.length+bbases.length-bestOverlap<299) :
+// ((abases.length+bbases.length-bestOverlap)+", "+ambig+", "+overlap+", "+good+", "+bad+", "+tested+", "+bestGood+", "+bestBad+", "+a.insertSize());
+ }else if(bad==bestBad){
+ ambig=true;
+ }
+
+// System.err.print("c");
+ //Ambiguous and already near-perfect: no later overlap can resolve it, so give up now
+ if(ambig && bestBad<margin){
+// System.err.print("d");
+ rvector[0]=((bestBad==0 ? 8 : 4)*bestGood-6*bestBad);
+ rvector[1]=bestGood;
+ rvector[2]=bestBad;
+// rvector[3]=bestSum;
+ rvector[4]=(ambig ? 1 : 0);
+ rvector[5]=0;
+// System.err.print("e");
+ return -1;
+ }
+
+// System.err.print("f");
+ }
+ }else if(bad<margin){
+ //Too few matching bases to be a candidate, yet almost no mismatches: treated as ambiguous
+// System.err.print("g");
+ ambig=true;
+ rvector[0]=((bestBad==0 ? 8 : 4)*bestGood-6*bestBad);
+ rvector[1]=bestGood;
+ rvector[2]=bestBad;
+// rvector[3]=bestSum;
+ rvector[4]=(ambig ? 1 : 0);
+ rvector[5]=0;
+ return -1;
+ }
+// System.err.print("h");
+
+// if(abases.length+bbases.length-bestOverlap>299){
+// System.err.println((abases.length+bbases.length-bestOverlap)+", "+ambig+", "+rvector[0]+", "+bestGood+", "+bestBad+", "+a.insertSize());
+// }
+
+ }
+// System.err.println("i");
+
+ //Publish the winning statistics to the caller
+ rvector[0]=((bestBad==0 ? 8 : 4)*bestGood-6*bestBad);
+ rvector[1]=bestGood;
+ rvector[2]=bestBad;
+// rvector[3]=bestSum;
+ rvector[4]=(ambig ? 1 : 0);
+ rvector[5]=0;
+
+// if(abases.length+bbases.length-bestOverlap>299){
+// System.err.println((abases.length+bbases.length-bestOverlap)+", "+ambig+", "+rvector[0]+", "+bestGood+", "+bestBad+", "+a.insertSize());
+// }
+
+// assert(bestOverlap>-1);
+ return (bestOverlap<0 ? -1 : abases.length+bbases.length-bestOverlap);
+ }
+
+
+ /**
+ * Counts base mismatches between two reads when they are aligned as implied by the given insert size.
+ * A differing position is ignored if either base is 'N' or if either read has a quality array and the quality there is below 7.
+ * Reads without quality arrays are supported: their differing bases are always counted
+ * (previously a null quality array caused a NullPointerException at the first differing base).
+ * @param a First read
+ * @param b Second read
+ * @param insert Candidate insert size
+ * @param maxMismatches Counting stops as soon as the count exceeds this value
+ * @return Number of mismatches found (at most maxMismatches+1), or 0 if insert is at least the sum of the read lengths
+ */
+ public static int countMismatches(Read a, Read b, int insert, int maxMismatches){
+     final int lengthSum=a.length()+b.length();
+     if(insert>=lengthSum){return 0;}
+     final int overlap=Tools.min(insert, lengthSum-insert);
+     final byte[] aqual=a.quality, bqual=b.quality;
+
+     int mismatches=0;
+
+     int start1=(insert>a.length() ? a.length()-overlap : 0);
+     int start2=(insert>=b.length() ? 0 : b.length()-overlap);
+
+     //Shift both offsets together until neither is negative
+     while(start1<0 || start2<0){start1++; start2++;}
+     for(int i=start1, j=start2; i<a.length() && j<b.length(); i++, j++){
+         final byte ca=a.bases[i], cb=b.bases[j];
+         if(ca!=cb){
+             //A missing quality array means no position is considered low quality
+             final boolean lowQuality=(aqual!=null && aqual[i]<7) || (bqual!=null && bqual[j]<7);
+             if(ca=='N' || cb=='N' || lowQuality){
+                 //do nothing
+             }else{
+                 mismatches++;
+                 if(mismatches>maxMismatches){break;}
+             }
+         }
+     }
+     return mismatches;
+ }
+
+ /**
+ * Scores a candidate insert size by translating (insert, pivot) into starting offsets in the two hashed reads and delegating to score().
+ * @param half1 Hashed kmer halves of the first read
+ * @param half2 Hashed kmer halves of the second read
+ * @param insert Candidate insert size
+ * @param pivot Insert size at which both reads start at offset 0
+ * @param kca Kmer count table to query
+ * @param rvector Receives the results written by score()
+ * @param badlimit Scoring stops once the bad count exceeds this
+ * @return The score returned by score()
+ */
+ public static int scoreIP(long[] half1, long[] half2, int insert, int pivot, KCountArray kca, int[] rvector, final int badlimit){
+     //Short mode (insert<=pivot) starts from base 0 of read a; long mode starts from base 0 of read b
+     final boolean shortMode=(insert<=pivot);
+     final int offset1=(shortMode ? 0 : insert-pivot);
+     final int offset2=(shortMode ? pivot-insert : 0);
+
+     if(verbose){
+         System.err.println("ScoreIP. Insert: "+insert+", gap="+kca.gap+", badlimit="+badlimit);
+     }
+     return score(half1, half2, offset1, offset2, kca, rvector, badlimit);
+ }
+
+ public static int score(long[] half1, long[] half2, int start1, int start2, KCountArray kca, int[] rvector, final int badlimit){
+ int good=0;
+ int bad=0;
+ int sum=0;
+ final int len=Tools.min(half1.length-start1, half2.length-start2);
+// final int incr=Tools.min(len/8, 8); //Accelerates scoring by a factor of 8 for a preview
+ final int incr=Tools.min(len/ACCEL_FACTOR, ACCEL_FACTOR);
+
+ if(incr>1){
+ for(int i=start1, j=start2; i<half1.length && j<half2.length; i+=incr, j+=incr){
+ if(half1[i]!=-1 && half2[j]!=-1){
+ long key=half1[i]|half2[j];
+ int x=kca.read(key);
+ sum+=x;
+ if(x>=MIN_HITS_FOR_GOOD){good++;}
+ else if(x<=MAX_HITS_FOR_BAD){
+ bad++;
+ if(bad>badlimit){break;}
+ }
+// if(verbose){System.err.print("("+Long.toHexString(half1[i])+","+Long.toHexString(half2[j])+","+Long.toHexString(key)+","+x+")");}
+ }
+ }
+ if(verbose){
+ System.err.println("\n(incr="+incr+") Good: "+good+" \tBad: "+bad);
+ }
+ if(bad>good || bad>badlimit){
+ rvector[0]=incr*((bad==0 ? 8 : 4)*good-6*bad);
+ rvector[1]=incr*good;
+ rvector[2]=incr*bad;
+ rvector[3]=incr*sum;
+ return rvector[0];
+ }else{
+ good=0;
+ bad=0;
+ sum=0;
+ }
+ }
+
+ for(int i=start1, j=start2; i<half1.length && j<half2.length; i++, j++){
+ if(half1[i]!=-1 && half2[j]!=-1){
+ long key=half1[i]|half2[j];
+ int x=kca.read(key);
+ sum+=x;
+ if(x>=MIN_HITS_FOR_GOOD){good++;}
+ else if(x<=MAX_HITS_FOR_BAD){
+ bad++;
+ if(bad>badlimit){break;}
+ }
+// if(verbose){System.err.print("("+Long.toHexString(half1[i])+","+Long.toHexString(half2[j])+","+Long.toHexString(key)+","+x+")");}
+ }
+ }
+ if(verbose){
+ System.err.println("\nGood: "+good+" \tBad: "+bad+" \tSum: "+sum);
+ }
+ rvector[0]=((bad==0 ? 8 : 4)*good-6*bad);
+ rvector[1]=good;
+ rvector[2]=bad;
+ rvector[3]=sum;
+ return rvector[0];
+ }
+
+ /**
+ * Prints every element of the array to standard output in hexadecimal, each followed by ", ", then a line break.
+ * @param array Values to print
+ */
+ public static void toHex(long[] array){
+     for(final long value : array){
+         System.out.print(Long.toHexString(value)+", ");
+     }
+     System.out.println();
+ }
+
+ public static void toHex(long[] array1, long[] array2){
+ for(int i=0; i<array1.length; i++){
+ System.out.print(Long.toHexString(array1[i]|array2[i])+", ");
+ }
+ System.out.println();
+ }
+
+ /**
+ * Produces one 2-bit-per-base encoded kmer per starting position of the read, shifted left by offset.
+ * A position holds -1 when no valid kmer starts there, i.e. the window contains a base that AminoAcid.baseToNumber
+ * maps to a negative value, or (when qualities are present) a base with quality below MIN_QUALITY.
+ * @param r Read to hash
+ * @param k Kmer length
+ * @param mask Bit mask applied to the rolling kmer after each base is appended
+ * @param offset Left shift applied to each stored kmer
+ * @return Array of length bases.length-k+1, or null if the read is null, has no bases, or is shorter than k
+ */
+ public static long[] hash(Read r, int k, long mask, int offset){
+     if(r==null || r.bases==null || r.length()<k){return null;}
+     final byte[] seq=r.bases;
+     final byte[] qual=r.quality;
+     final long[] result=new long[seq.length-k+1];
+     Arrays.fill(result, -1);
+
+     int run=0; //Number of consecutive usable bases ending at the current position
+     long kmer=0;
+     for(int pos=0; pos<seq.length; pos++){
+         final int code=AminoAcid.baseToNumber[seq[pos]];
+         final boolean usable=(code>=0 && (qual==null || qual[pos]>=MIN_QUALITY));
+         if(!usable){
+             //Restart the rolling kmer after an undefined or low-quality base
+             run=0;
+             kmer=0;
+             continue;
+         }
+         kmer=((kmer<<2)|code)&mask;
+         run++;
+         if(run>=k){
+             result[pos-k+1]=(kmer<<offset);
+         }
+     }
+     return result;
+ }
+
+
+ /**
+ * Worker thread that pulls lists of reads from a ConcurrentReadInputStream, estimates an insert size for each pair
+ * (from mapping information, an existing insert value, base overlap via mateByOverlap, and/or kmer tables via mateRead),
+ * and routes each read to the good, bad, and insert-report output streams.
+ * Per-thread statistics are accumulated in the non-private fields declared at the bottom of the class.
+ */
+ private static class MateThread extends Thread{
+
+
+ /**
+ * @param cris_ Input stream of reads
+ * @param rosgood_ Output for reads with an accepted insert size (may be null)
+ * @param rosbad_ Output for ambiguous, failed, or too-short reads (may be null)
+ * @param rosinsert_ Output for per-read insert reports (may be null)
+ * @param k_ Total kmer length; split into k1=(k+1)/2 and k2=k/2
+ * @param kca_ Kmer count tables passed to mateRead (may be null or empty, in which case mateRead is skipped)
+ * @param joinReads_ If true, accepted pairs are joined into a single read before output
+ * @param joinperfectonly_ If true, any result with bad>0 is treated as ambiguous
+ * @param middleTable_ Stored but unused in this class (its only use is in commented-out code)
+ */
+ public MateThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream rosgood_, ConcurrentReadOutputStream rosbad_, ConcurrentReadOutputStream rosinsert_,
+ int k_, KCountArray[] kca_, boolean joinReads_, boolean joinperfectonly_, KCountArray middleTable_) {
+ cris=cris_;
+ rosgood=rosgood_;
+ rosbad=rosbad_;
+ rosinsert=rosinsert_;
+ k=k_;
+ kca=kca_;
+ joinReads=joinReads_;
+ joinperfectonly=joinperfectonly_;
+ middleTable=middleTable_;
+ }
+
+
+ @Override
+ public void run(){
+ processMate();
+ }
+
+ /**
+ * Main loop: fetches read lists until the input is exhausted, processes every read, and hands the
+ * resulting lists to the output streams under the same list id.
+ */
+ private void processMate() {
+
+ final int k1=((k+1)/2);
+ final int k2=k/2;
+ // assert(k1+k2>=1 && k1+k2<20) : k1+", "+k2+", "+(k1+k2);
+ assert(USE_MAPPING || MATE_BY_OVERLAP || kca[0].gap>=0);
+ final int kbits1=2*k1;
+ final int kbits2=2*k2;
+ final long mask1=~((-1L)<<(kbits1));
+ final long mask2=~((-1L)<<(kbits2));
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert(r.mate!=null || WRITE_INTERMEDIATE_JOINED);
+ }
+
+
+ while(reads!=null && reads.size()>0){
+
+ ArrayList<Read> listg=(rosgood==null ? null : new ArrayList<Read>());
+ ArrayList<Read> listb=(rosbad==null ? null : new ArrayList<Read>());
+ ArrayList<Read> listi=(rosinsert==null ? null : new ArrayList<Read>());
+
+ for(Read r1 : reads){
+ final Read r2=r1.mate;
+
+ TrimRead tr1=null, tr2=null;
+
+ //Optional quality trimming; with untrim the TrimRead objects are kept so trimming can be reverted later
+ boolean remove=false;
+ if(qtrim){
+ if(untrim){
+ if(r1!=null){
+ tr1=TrimRead.trim(r1, qtrimLeft, qtrimRight, trimq, 1);
+ int x=(tr1==null ? 0 : tr1.leftTrimmed+tr1.rightTrimmed);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r2!=null){
+ tr2=TrimRead.trim(r2, qtrimLeft, qtrimRight, trimq, 1);
+ int x=(tr2==null ? 0 : tr2.leftTrimmed+tr2.rightTrimmed);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ }else{
+ if(r1!=null){
+ int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r2!=null){
+ int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ }
+ }
+
+ //Discard the pair only if both reads are shorter than minReadLength
+ if(minReadLength>0 && !remove){
+ int rlen=(r1==null ? 0 : r1.length());
+ int rlen2=(r1.mateLength());
+ if(rlen<minReadLength && rlen2<minReadLength){
+ basesTrimmedT+=(rlen+rlen2);
+ remove=true;
+ }
+ }
+
+ if(!remove){
+
+ // verbose=(r.numericID==727 || r.numericID==1364);
+
+ //r2 is reverse-complemented for the duration of processing and restored at the end of this block
+ if(r2!=null){r2.reverseComplement();}
+ readsProcessed++;
+
+
+ // System.err.println("True Insert: "+r.insertSize());
+
+ final int[] rvector=new int[6];
+ int trueSize=r1.insertSizeMapped(ignoreMappingStrand);
+
+ //b* variables hold the mateByOverlap result, h* variables hold the mateRead (kmer table) result
+ int bInsert=-1, hInsert=-1;
+ int bestInsert;
+ int bestScore=-1;
+ int bestGood=-1;
+ int bestBad=999999, bBad=999999, hBad=999999;
+ boolean ambig, tooShort=false;
+ boolean bAmbig=true, hAmbig=true;
+ int bestVotes=-1;
+
+ boolean didb=false, didh=false;
+
+ // assert(false) : r+"\n"+(USE_MAPPING)+", "+(r.chrom==r.mate.chrom)+", "+()+", "+()+", "+()+", "+()+", ";
+
+ if(r2==null){
+ //Unpaired (pre-joined) read: its own length is the insert size
+ bestScore=100;
+ bestGood=30;
+ bestBad=0;
+ bestInsert=r1.length();
+ assert(r1.length()==r1.insert()) : r1.length()+" != "+r1.insert()+"; actual = "+trueSize;
+ // if(bestInsert!=trueSize){
+ // System.err.println("Bad insert size for pre-joined read "+r.numericID+": len="+r.length()+", insert="+r.insert()+", actual="+trueSize);
+ // }
+ bestVotes=1;
+ ambig=false;
+ }else if(USE_MAPPING && r1.chrom==r2.chrom && r1.start<r1.stop && ((r1.mapped() || r1.synthetic()) && (r2.mapped() || r2.synthetic()))){
+ //Trust the insert size derived from mapping coordinates
+ bestScore=100;
+ bestGood=30;
+ bestBad=0;
+ bestInsert=trueSize;
+ bestVotes=1;
+ ambig=false;
+ }else if(SKIP_MATED_READS && r1.insertvalid() && r1.insert()>0){
+ //Reuse an insert size already stored on the read
+ bestScore=100;
+ bestGood=30;
+ bestBad=0;
+ bestInsert=r1.insert();
+ bestVotes=1;
+ ambig=false;
+ }else{
+ if(MATE_BY_OVERLAP){
+ didb=true;
+ bInsert=mateByOverlap(r1, r2, rvector, MIN_OVERLAPPING_BASES_0, MIN_OVERLAPPING_BASES);
+ bestScore=rvector[0];
+ bestGood=rvector[1];
+ bBad=rvector[2];
+ bAmbig=(rvector[4]==1);
+ bestVotes=rvector[5];
+ final int len1=r1.length(), len2=r2.length();
+ //On failure, retry after trimming the reads at increasing quality thresholds (only when qtrim is off)
+ for(int trims=0, q=trimq; trims<TRIM_ON_OVERLAP_FAILURE && !qtrim && bInsert<0 /*&& !bAmbig*/; trims++, q+=8){
+// System.err.println(trims+", "+q);
+ //Preserve the cached obj values across trimming
+ Serializable old1=r1.obj;
+ Serializable old2=r2.obj;
+ tr1=TrimRead.trim(r1, false, true, q, 1+len1*4/10); //r1.length());
+ tr2=TrimRead.trim(r2, true, false, q, 1+len2*4/10); //r2.length());
+ r1.obj=old1;
+ r2.obj=old2;
+ if(tr1!=null || tr2!=null){
+// System.err.println(r1.length()+", "+r2.length());
+ int x=mateByOverlap(r1, r2, rvector, MIN_OVERLAPPING_BASES_0-1, MIN_OVERLAPPING_BASES);
+ if(x>-1){
+// System.err.println(trims);
+ bInsert=x;
+ bestScore=rvector[0];
+ bestGood=rvector[1];
+ bBad=rvector[2];
+ bAmbig=(rvector[4]==1);
+ bestVotes=rvector[5];
+ trims=TRIM_ON_OVERLAP_FAILURE;
+ }else{
+ if(tr1!=null){tr1.untrim();}
+ if(tr2!=null){tr2.untrim();}
+ }
+ }
+ }
+ }
+ //NOTE(review): bestBad is still its initial 999999 on this path (only bBad was assigned above),
+ //so 'bestBad>0' is always true here; bBad may have been intended. Left unchanged because it affects results - confirm.
+ if(kca!=null && kca.length>0 && (bAmbig || bInsert<0 || bestBad>0 || bestGood<30)){
+ didh=true;
+ hInsert=mateRead(r1, r2, k1, k2, mask1, mask2, kca, rvector);
+ bestScore=rvector[0];
+ bestGood=rvector[1];
+ hBad=rvector[2];
+ hAmbig=(rvector[4]==1);
+ bestVotes=rvector[5];
+ }
+
+ //Reconcile the two estimates: agreement wins; otherwise prefer the kmer result unless it is missing or ambiguous
+ if(hInsert==bInsert){
+ bestInsert=hInsert;
+ bestBad=Tools.min(bBad, hBad);
+ ambig=bAmbig && hAmbig;
+ }
+ // else if(!didb || bAmbig || bInsert<0){
+ // bestInsert=hInsert;
+ // ambig=hAmbig;
+ // }
+ else if(!didh || hAmbig || hInsert<0){
+ bestInsert=bInsert;
+ bestBad=bBad;
+ ambig=bAmbig;
+ }else{
+ bestInsert=hInsert;
+ bestBad=hBad;
+ ambig=hAmbig;
+ }
+ }
+
+ tooShort=(!ambig && bestInsert>0 && bestInsert<minInsert);
+
+ if(joinperfectonly && bestBad>0){ambig=true;}
+
+ //Classify the outcome for statistics
+ if(ambig){ambiguousCount++;}
+ else if(tooShort){tooShortCount++;}
+ else if(bestInsert==-1){noSolutionCount++;}
+ else if(bestInsert==trueSize){correctCount++;insertSumCorrect+=bestInsert;}
+ else{incorrectCount++;insertSumIncorrect+=bestInsert;}
+
+
+ if(bestInsert>-1 && !ambig){
+ insertMin=Tools.min(bestInsert, insertMin);
+ insertMax=Tools.max(bestInsert, insertMax);
+ //Inserts beyond the histogram range are counted in the last bin
+ hist[Tools.min(bestInsert, hist.length-1)]++;
+ }
+ r1.setInsert(ambig ? -1 : bestInsert);
+
+ {//Clear memory.
+ if(r1.obj!=null){assert(r1.obj.getClass()==long[].class) : r1.obj.getClass();}
+ r1.obj=null;
+ if(r2!=null){r2.obj=null;}
+ }
+
+ if(OUTPUT_FAILED || bestInsert>-1){
+
+ //Restore trimmed bases when the pair will not be joined
+ if(untrim && (ambig || bestInsert<0 || !joinReads)){
+ if(tr1!=null){tr1.untrim();}
+ if(tr2!=null){tr2.untrim();}
+ }
+
+ if((ambig || bestInsert<0 || tooShort) && (rosbad!=null || !MIX_BAD_AND_GOOD)){
+ if(rosbad!=null){
+ if(listb!=null){listb.add(r1);}
+ }
+ }else{
+ if(listg!=null){
+ Read x=r1;
+ if(joinReads && r2!=null){
+ x=r1.joinRead(bestInsert);
+ //Disabled because ErrorCorrectMT was retired.
+// if(middleTable!=null && x.containsNocalls()){
+// BitSet bs=ErrorCorrectMT.detectNBulk(x);
+// ErrorCorrectMT.correctErrorsBothSides(x, middleTable, MIDDLE_TABLE_K, MIN_HITS_FOR_GOOD, MAX_HITS_FOR_BAD, bs, 9999999);
+// }
+ }
+ listg.add(x);
+ }
+ }
+
+ if(rosinsert!=null){
+ //Tab-delimited report: id, numericID, insert, status code, then good/bad/score/votes when insert>0
+ StringBuilder sb=new StringBuilder(40);
+ sb.append(r1.id==null ? r1.numericID+"" : r1.id).append('\t');
+ sb.append(r1.numericID).append('\t');
+
+ sb.append(bestInsert);
+ sb.append('\t');
+
+ if(bestInsert<0){sb.append('F');}//Failed
+ else if(ambig){sb.append('A');} //Ambiguous
+ else if(tooShort){sb.append('S');} //Short
+ else if(bestInsert>0 && bestBad<1){sb.append('P');} //Perfect
+ else{sb.append('I');}//Imperfect
+
+ if(bestInsert>0){
+ sb.append("\t"+bestGood+"\t"+bestBad+"\t"+bestScore+"\t"+bestVotes);
+ }
+ //The report text travels to the insert stream attached to the read
+ r1.obj=sb;
+ listi.add(r1);
+ }
+ }
+
+ // if(bestInsert!=trueSize && bestInsert>0 && !ambig){
+ // System.err.println("\nIncorrect answer for read "+r.numericID+"; true insert = "+trueSize+", called at "+bestInsert);
+ //// verbose=true;
+ // for(int i=0; i<300; i++){
+ // int x=testRead(r, r.mate, k1, k2, mask1, mask2, kca, rvector, i);
+ // if((x>0 && rvector[2]<=bestBad) || i==trueSize || i==bestInsert){
+ // verbose=true;
+ // testRead(r, r.mate, k1, k2, mask1, mask2, kca, rvector, i);
+ // verbose=false;
+ // }
+ // }
+ //// verbose=false;
+ // }
+
+ // assert(r.numericID<200);
+ // assert(false);
+ if(r2!=null){r2.reverseComplement();}
+ }
+ }
+
+ if(rosgood!=null){rosgood.add(listg, ln.id);}
+ if(rosbad!=null){rosbad.add(listb, ln.id);}
+ if(rosinsert!=null){rosinsert.add(listi, ln.id);}
+
+ // System.err.println("returning list");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ // System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ // System.err.println("reads: "+(reads==null ? "null" : reads.size()));
+ }
+ //Return the terminal list. The loop also exits when nextList() gave null or a ListNum without a list,
+ //so guard both cases instead of dereferencing unconditionally.
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ /** Histogram of accepted insert sizes; the last bin also collects all larger values */
+ int[] hist=new int[1000];
+
+ //Per-thread statistics
+ long readsProcessed=0;
+ long matedCount=0;
+ long correctCount=0;
+ long ambiguousCount=0;
+ long tooShortCount=0;
+ long incorrectCount=0;
+ long noSolutionCount=0;
+ long insertSumCorrect=0;
+ long insertSumIncorrect=0;
+ int insertMax=0;
+ int insertMin=999999999;
+
+ long basesTrimmedT=0;
+ long readsTrimmedT=0;
+
+ private final ConcurrentReadInputStream cris;
+ private final ConcurrentReadOutputStream rosgood;
+ private final ConcurrentReadOutputStream rosbad;
+ private final ConcurrentReadOutputStream rosinsert;
+ private final int k;
+ private final KCountArray[] kca;
+ private final boolean joinReads;
+ private final boolean joinperfectonly;
+ private final KCountArray middleTable;
+ }
+
+ /*--------------------------------------------------------------*/
+ /* Instance configuration (input/output paths and table settings) */
+ /*--------------------------------------------------------------*/
+
+ private String in1_primary;
+ private String in2_primary;
+
+ private String outgood_G=null;
+ private String outbad_G=null;
+ private String outinsert_G=null;
+ private String outhist=null;
+ private String outhist2=null;
+ private String outhist3=null;
+
+ private long maxReads_G=-1;
+ private long tableReads_G=-1;
+ private int k_G=K_DEFAULT;
+ private int[][] gap_G=null;
+ private int cbits_G=2;
+ private long totalcells_G=-1;
+ private int hashes=2;
+ private int passes_G=1;
+ private String tempfile="mateReadsTemp#.txt.gz";
+ private boolean join_G=true;
+ private int maxtables=0;
+ private boolean auto=true;
+
+ private List<String> extra_G=null;
+
+ static boolean errorState=false;
+
+ //Quality-trimming settings read by MateThread.processMate
+ static boolean qtrimRight=false;
+ static boolean qtrimLeft=false;
+ /** If true, trimming done before merging is reverted on reads that are not joined */
+ static boolean untrim=false;
+ static byte trimq=6;
+ /** Pairs are discarded when both reads are shorter than this (0 disables the check) */
+ static int minReadLength=0;
+ /** Non-ambiguous inserts below this are classified as too short */
+ static int minInsert=0;
+ static boolean qtrim=false;
+ /** Maximum number of trim-and-retry rounds after mateByOverlap fails */
+ static int TRIM_ON_OVERLAP_FAILURE=1;
+
+
+ //Global totals; presumably aggregated from the per-thread MateThread counters - aggregation code is not in this section
+ static int[] histTotal=new int[1000];
+ static int bin=1;
+
+ static long readsProcessedTotal=0;
+ static long matedCountTotal=0;
+ static long correctCountTotal=0;
+ static long ambiguousCountTotal=0;
+ static long tooShortCountTotal=0;
+ static long incorrectCountTotal=0;
+ static long noSolutionCountTotal=0;
+ static long insertSumCorrectTotal=0;
+ static long insertSumIncorrectTotal=0;
+ static long basesTrimmedTotal=0;
+ static long readsTrimmedTotal=0;
+ static int insertMinTotal=999999999;
+ static int insertMaxTotal=0;
+
+ /** Minimum good kmer count for a candidate insert in mateRead; also used in its insert-range bounds */
+ public static int MIN_OVERLAPPING_KMERS=10;
+ /** Minimum overlap at which mateRead (KCountArray[] version) runs countMismatches */
+ public static int MIN_OVERLAPPING_KMERS_0=4;
+ /** Passed by processMate as minOverlap to mateByOverlap */
+ public static int MIN_OVERLAPPING_BASES=12;
+ /** Passed by processMate as minOverlap0 (first overlap length tested) to mateByOverlap */
+ public static int MIN_OVERLAPPING_BASES_0=8;
+ /** Lower bound used with minOverlap when computing the largest overlap tested by mateByOverlap */
+ public static int MIN_OVERLAP_INSERT=16;
+ public static int ACCEL_DIV=10; //Acceleration is actually proportional to inverse of this number.
+ public static int ACCEL_FACTOR=ACCEL_DIV; //Max distance between samples
+ /** Initial bad-kmer limit in mateRead */
+ public static int DEFAULT_BADLIMIT=25;
+ /** Initial mismatch limit in mateByOverlap */
+ public static int DEFAULT_BADLIMIT_FOR_BASE_MATCHING=3;
+ /** Initial value of bestMismatches in mateRead */
+ public static int DEFAULT_MISMATCHLIMIT=6;
+ /** In score(), a kmer count at or above this is "good" */
+ public static int MIN_HITS_FOR_GOOD=3;
+ /** In score(), a kmer count at or below this is "bad" */
+ public static int MAX_HITS_FOR_BAD=1;
+ /** In mateRead, a best candidate with fewer votes than this is flagged ambiguous */
+ public static int MIN_VOTES=1;
+ public static int K_DEFAULT=29;
+ public static int MIDDLE_TABLE_K=31;
+ /** In hash(), bases with quality below this break the current kmer */
+ public static byte MIN_QUALITY=8;
+ /** In mateByOverlap, positions with quality below this are skipped */
+ public static byte MIN_QUALITY_FOR_OVERLAP=7;
+ /** Skip alignment and calculate insert from mapping info */
+ public static boolean USE_MAPPING=false;
+ /** If true, processMate tries mateByOverlap before the kmer tables */
+ public static boolean MATE_BY_OVERLAP=true;
+ /** If true, reads that already carry a valid positive insert size are not re-evaluated */
+ public static boolean SKIP_MATED_READS=false;
+ /** If true, reads with no insert solution are still sent to the output lists */
+ public static boolean OUTPUT_FAILED=true;
+ public static boolean MIX_BAD_AND_GOOD=false;
+ public static boolean WRITE_INTERMEDIATE_JOINED=false;
+ public static boolean FILL_MIDDLE_INTERMEDIATE=false;
+ public static boolean FILL_MIDDLE_FINAL=false;
+ public static boolean overwrite=true;
+ public static boolean append=false;
+ /** Enables diagnostic printing to System.err in the scoring methods */
+ public static boolean verbose=false;
+ /** Passed to Read.insertSizeMapped when computing mapped insert sizes */
+ public static boolean ignoreMappingStrand=false;
+
+ public static int THREADS=-1;
+ public static float version=2.0f;
+
+}
diff --git a/current/jgi/MergeBarcodes.java b/current/jgi/MergeBarcodes.java
new file mode 100755
index 0000000..d0bb2bc
--- /dev/null
+++ b/current/jgi/MergeBarcodes.java
@@ -0,0 +1,487 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 11, 2012
+ *
+ */
+public class MergeBarcodes {
+
+ /**
+  * Program entry point: builds a MergeBarcodes from the arguments, loads the
+  * barcode reads into a map keyed by read id, then merges them into the input reads.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final MergeBarcodes merger=new MergeBarcodes(args);
+     final HashMap<String, Read> barcodesById=merger.loadBarcodes();
+     merger.mergeWithMap(timer, barcodesById);
+ }
+
+ /**
+ * Parses the command line and prepares the FileFormat objects used by
+ * loadBarcodes() and mergeWithMap().
+ * <p>
+ * Arguments are key=value pairs. Keys are lower-cased and leading hyphens are
+ * stripped; a value of "null" is treated as absent. Anything accepted by
+ * Parser.parse is handled there; this constructor additionally recognizes
+ * verbose, barcode/bar/index, addslash, rcompmate/rcm and rcomp/rc. A bare first
+ * argument that starts with "stdin" or names an existing file is taken as in1.
+ * <p>
+ * Side effects visible here: several static settings are changed
+ * (Shared.READ_BUFFER_LENGTH, ReadWrite pigz/zip-thread settings,
+ * FASTQ.FORCE_INTERLEAVED/TEST_INTERLEAVED, ByteFile.FORCE_MODE_BF2,
+ * ReadStats.overwrite/append and the verbose flags of several reader classes).
+ * <p>
+ * NOTE(review): unknown parameters and a missing barcode file are only caught by
+ * assert statements, so with assertions disabled they are not rejected here.
+ * @param args Command line arguments
+ * @throws RuntimeException if no input file is given, if out2 is given without out1,
+ * or if Tools.testOutputFiles rejects the output files
+ */
+ public MergeBarcodes(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //When reads go to stdout, status messages are kept on stderr.
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+
+ //Cap the read buffer length at 200 and enable pigz/unpigz with up to Shared.threads() zip threads.
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ //Only the text between the first and second '=' is used as the value.
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ //Propagate the verbose flag to the reader classes as well.
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("barcode") || a.equals("bar") || a.equals("index")){
+ inbar=b;
+ }else if(a.equals("addslash")){
+ addslash=Tools.parseBoolean(b);
+ }else if(a.equals("rcompmate") || a.equals("rcm")){
+ reverseComplimentMate=Tools.parseBoolean(b);
+ outstream.println("Set RCOMPMATE to "+reverseComplimentMate);
+ }else if(a.equals("rcomp") || a.equals("rc")){
+ reverseCompliment=Tools.parseBoolean(b);
+ outstream.println("Set RCOMP to "+reverseCompliment);
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ //A bare first argument is accepted as the primary input file.
+ parser.in1=arg;
+ }else{
+ //Only fatal when assertions are enabled; otherwise the parameter is reported and ignored.
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ //A '#' in in1 is expanded to a "1"/"2" file pair, unless a file with the literal name exists.
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ //Same '#' expansion for the output name (no existence check).
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ //Two input files mean the input cannot also be interleaved.
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ //Unless a ByteFile mode was forced, select BF2 when more than 2 threads are available.
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ if(!parser.setOut){
+ System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+// out1="stdout";
+ }
+ }
+
+ //If interleaving was not set explicitly, derive it from the number of input and output files.
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Only checked when assertions are enabled.
+ assert(inbar!=null) : "Must specify a barcode file.";
+ //FASTQ is passed as the default format for the barcode, input and output files.
+ ffbar=FileFormat.testInput(inbar, FileFormat.FASTQ, extin, true, true);
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /**
+  * Loads the barcode file configured for this instance.
+  * @return Map from read id (text before the first space) to barcode read
+  * @throws RuntimeException if closing the input stream reports an error
+  */
+ public HashMap<String, Read> loadBarcodes(){
+     return loadBarcodes(outstream, ffbar, maxReads);
+ }
+
+ /**
+  * Reads every barcode read from ffbar into a map keyed by read id.
+  * <p>
+  * The key is the read id truncated at the first space (the read's id field is
+  * truncated in place). If several reads share a key, the last one read wins.
+  * Only the first read of each pair is stored; a mate is counted in the statistics only.
+  * <p>
+  * FASTQ.FORCE_INTERLEAVED and FASTQ.TEST_INTERLEAVED are set to false while the
+  * file is read and are always restored afterwards, including when this method throws.
+  * @param outstream Destination for progress and statistics messages
+  * @param ffbar Barcode file format
+  * @param maxReads Passed to the input stream as the read limit
+  * @return Map from truncated read id to barcode read
+  * @throws RuntimeException if closing the input stream reports an error
+  */
+ public static HashMap<String, Read> loadBarcodes(PrintStream outstream, FileFormat ffbar, long maxReads){
+
+     Timer t=new Timer();
+
+     //Remember the global interleaving flags so they can be restored on every exit path.
+     final boolean oldForceInterleaved=FASTQ.FORCE_INTERLEAVED;
+     final boolean oldTestInterleaved=FASTQ.TEST_INTERLEAVED;
+
+     try{
+         FASTQ.FORCE_INTERLEAVED=false;
+         FASTQ.TEST_INTERLEAVED=false;
+
+         HashMap<String, Read> map=new HashMap<String, Read>(0x10000-1);
+
+         final ConcurrentReadInputStream cris;
+         {
+             cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffbar, null, null, null);
+             if(verbose){outstream.println("Started cris for barcodes");}
+             cris.start();
+         }
+
+         long readsProcessed=0;
+         long basesProcessed=0;
+
+         {
+
+             ListNum<Read> ln=cris.nextList();
+             ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+             if(reads!=null && !reads.isEmpty()){
+                 Read r=reads.get(0);
+                 assert((r.mate!=null)==cris.paired());
+             }
+
+             while(reads!=null && reads.size()>0){
+
+                 for(int idx=0; idx<reads.size(); idx++){
+                     final Read r1=reads.get(idx);
+                     final Read r2=r1.mate;
+
+                     //Key on the id up to the first space, matching the lookup in mergeWithMap.
+                     if(r1.id.indexOf(' ')>=0){r1.id=r1.id.split(" ")[0];}
+
+                     final int initialLength1=r1.length();
+                     final int initialLength2=(r1.mateLength());
+
+                     {
+                         readsProcessed++;
+                         basesProcessed+=initialLength1;
+                     }
+                     if(r2!=null){
+                         readsProcessed++;
+                         basesProcessed+=initialLength2;
+                     }
+
+                     map.put(r1.id, r1);
+                 }
+
+                 cris.returnList(ln.id, ln.list.isEmpty());
+                 ln=cris.nextList();
+                 reads=(ln!=null ? ln.list : null);
+             }
+             if(ln!=null){
+                 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+             }
+         }
+
+         boolean errorState=false;
+         errorState|=ReadWrite.closeStream(cris);
+
+         t.stop();
+
+         //Rates are per unit of t.elapsed (presumably nanoseconds, given the scaling below - TODO confirm).
+         double rpnano=readsProcessed/(double)(t.elapsed);
+         double bpnano=basesProcessed/(double)(t.elapsed);
+
+         String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+         String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+         //Left-pad the counts to 8 characters so the columns line up.
+         while(rpstring.length()<8){rpstring=" "+rpstring;}
+         while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+         outstream.println("Loaded barcodes.");
+         outstream.println("Time: \t"+t);
+         outstream.println("Barcodes Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+         outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+         outstream.println();
+
+         if(errorState){
+             throw new RuntimeException("MergeBarcodes encountered an error; the output may be corrupt.");
+         }
+
+         return map;
+     }finally{
+         //Previously skipped when the error above was thrown, leaving the global flags changed.
+         FASTQ.FORCE_INTERLEAVED=oldForceInterleaved;
+         FASTQ.TEST_INTERLEAVED=oldTestInterleaved;
+     }
+ }
+
+ /**
+  * Streams the input reads, prepends the matching barcode to each read id, and
+  * writes the reads to the output stream when one is configured.
+  * <p>
+  * The lookup key is the read id up to the first space. When a barcode is found
+  * the id becomes {@code <barcode bases>_<barcode qualities + 33 as chars>_<original id>},
+  * and the same prefix is applied to the mate. A barcode is removed from
+  * {@code map} when it is used, so this call consumes the map.
+  * Reads are reverse-complemented first if rcomp / rcompmate were requested.
+  * @param t Timer that is stopped here and used for the throughput report
+  * @param map Barcode reads keyed by truncated read id, as returned by loadBarcodes()
+  * @throws RuntimeException if writing statistics or closing a stream reported an error
+  */
+ void mergeWithMap(Timer t, HashMap<String, Read> map){
+
+     final ConcurrentReadInputStream cris;
+     {
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+         if(verbose){outstream.println("Started cris");}
+         cris.start();
+     }
+     boolean paired=cris.paired();
+     if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+     final ConcurrentReadOutputStream ros;
+     if(out1!=null){
+         final int buff=4;
+
+         if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+             outstream.println("Writing interleaved.");
+         }
+
+         //Guard against overwriting an input; the original compared out1 with in1 twice and never with in2.
+         assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+         assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+         ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+         ros.start();
+     }else{ros=null;}
+
+     long readsProcessed=0;
+     long basesProcessed=0;
+     long barcodesFound=0;
+     long barcodesNotFound=0;
+     //Reused for every read; cleared after each use.
+     final StringBuilder prefix=new StringBuilder();
+
+     {
+
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         if(reads!=null && !reads.isEmpty()){
+             Read r=reads.get(0);
+             assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+         }
+
+         while(reads!=null && reads.size()>0){
+
+             for(int idx=0; idx<reads.size(); idx++){
+                 final Read r1=reads.get(idx);
+                 final Read r2=r1.mate;
+
+                 final int initialLength1=r1.length();
+                 final int initialLength2=(r1.mateLength());
+
+                 {
+                     readsProcessed++;
+                     basesProcessed+=initialLength1;
+                     if(reverseCompliment){r1.reverseComplement();}
+                 }
+                 if(r2!=null){
+                     readsProcessed++;
+                     basesProcessed+=initialLength2;
+                     if(reverseCompliment || reverseComplimentMate){r2.reverseComplement();}
+                 }
+
+                 //Look up by the id up to the first space, the same key loadBarcodes stores.
+                 String key=r1.id;
+                 if(key.indexOf(' ')>=0){key=key.split(" ")[0];}
+                 Read bar=map.remove(key);
+                 if(bar!=null){
+                     for(byte b : bar.bases){prefix.append((char)b);}
+                     prefix.append('_');
+                     for(byte b : bar.quality){prefix.append((char)(b+33));}
+                     prefix.append('_');
+                     r1.id=prefix+r1.id;
+                     barcodesFound++;
+                     if(r2!=null){
+                         r2.id=prefix+r2.id;
+                         barcodesFound++;
+                     }
+                     prefix.setLength(0);
+                 }else{
+                     barcodesNotFound++;
+                     if(r2!=null){barcodesNotFound++;}
+                 }
+             }
+
+             //Reads are modified in place, so the input list is also the output list.
+             if(ros!=null){ros.add(reads, ln.id);}
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         if(ln!=null){
+             cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         }
+     }
+
+     errorState|=ReadStats.writeAll();
+
+     errorState|=ReadWrite.closeStreams(cris, ros);
+
+     t.stop();
+
+     double rpnano=readsProcessed/(double)(t.elapsed);
+     double bpnano=basesProcessed/(double)(t.elapsed);
+
+     String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+     String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+     //Left-pad the counts to 8 characters so the columns line up.
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     {
+         outstream.println("Barcodes Found: \t"+barcodesFound+" reads ("+String.format("%.2f",barcodesFound*100.0/readsProcessed)+"%)");
+         outstream.println("Barcodes Not Found: \t"+barcodesNotFound+" reads ("+String.format("%.2f",barcodesNotFound*100.0/readsProcessed)+"%)");
+     }
+
+     outstream.println("Time: \t"+t);
+     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         //The message previously named ReformatReads, the class this code was copied from.
+         throw new RuntimeException("MergeBarcodes terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Placeholder for the usage message; not implemented yet.
+ * With assertions enabled this throws an AssertionError; otherwise it does nothing.
+ * The commented-out text below describes jgi.ReformatReads, not this class.
+ */
+ private void printOptions(){
+ assert(false) : "printOptions: TODO";
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ /** Barcode read file, set by barcode=, bar= or index= */
+ private String inbar=null;
+
+ /** Primary input file, copied from the parser (may be derived from a '#' pattern) */
+ private String in1=null;
+ /** Secondary input file, copied from the parser or derived from a '#' in in1 */
+ private String in2=null;
+
+ /** Copied from parser.qfin1; passed to the read input stream */
+ private String qfin1=null;
+ /** Copied from parser.qfin2; passed to the read input stream */
+ private String qfin2=null;
+
+ /** Primary output file; null means no reads are written */
+ private String out1=null;
+ /** Secondary output file, copied from the parser or derived from a '#' in out1 */
+ private String out2=null;
+
+ /** Copied from parser.qfout1; passed to the read output stream */
+ private String qfout1=null;
+ /** Copied from parser.qfout2; passed to the read output stream */
+ private String qfout2=null;
+
+ /** Copied from parser.extin; passed to FileFormat.testInput */
+ private String extin=null;
+ /** Copied from parser.extout; passed to FileFormat.testOutput */
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ /** Reverse-complement the mate (read 2) before merging; set by rcompmate= or rcm= */
+ private boolean reverseComplimentMate=false;
+ /** Reverse-complement both reads before merging; set by rcomp= or rc= */
+ private boolean reverseCompliment=false;
+ /** Add /1 and /2 to paired reads */
+ //NOTE(review): this flag is parsed in the constructor but never read in this class.
+ private boolean addslash=false;
+
+ /** Read limit passed to the input streams, copied from parser.maxReads */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ /** Format of the barcode file */
+ private final FileFormat ffbar;
+
+ /** Format of the primary input file */
+ private final FileFormat ffin1;
+ /** Format of the secondary input file */
+ private final FileFormat ffin2;
+
+ /** Format of the primary output file */
+ private final FileFormat ffout1;
+ /** Format of the secondary output file */
+ private final FileFormat ffout2;
+
+
+ /*--------------------------------------------------------------*/
+
+ /** Destination for status messages */
+ private PrintStream outstream=System.err;
+ /** Enables extra status messages; also copied to several reader classes by the constructor */
+ public static boolean verbose=false;
+ /** Set in mergeWithMap when writing statistics or closing streams reports an error */
+ public boolean errorState=false;
+ /** Copied from parser.overwrite; passed to the output-file checks */
+ private boolean overwrite=false;
+ /** Copied from parser.append; passed to the output-file checks */
+ private boolean append=false;
+
+}
diff --git a/current/jgi/NormAndCorrectWrapper.java b/current/jgi/NormAndCorrectWrapper.java
new file mode 100755
index 0000000..ebfd055
--- /dev/null
+++ b/current/jgi/NormAndCorrectWrapper.java
@@ -0,0 +1,77 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.Random;
+
+import align2.Shared;
+import assemble.Tadpole;
+import dna.Parser;
+
+/**
+ * Wraps BBNorm and Tadpole to normalize and correct reads.
+ * @author Brian Bushnell
+ * @date Oct 15, 2015
+ *
+ */
+public class NormAndCorrectWrapper {
+
+    /**
+     * Program entry point.
+     * @param args Command line arguments (in=, out=, ow=/overwrite=)
+     */
+    public static void main(String[] args){
+        NormAndCorrectWrapper rr=new NormAndCorrectWrapper(args);
+        rr.process();
+    }
+
+    /**
+     * Parses the command line. Arguments are key=value pairs; keys are
+     * lower-cased and leading hyphens are stripped, and a value of "null" is
+     * treated as absent. Only in, out and ow/overwrite are accepted.
+     * @param args Command line arguments
+     * @throws RuntimeException on any other argument
+     */
+    public NormAndCorrectWrapper(String[] args){
+
+        args=Parser.parseConfig(args);
+        if(Parser.parseHelp(args, true)){
+            System.exit(0);
+        }
+
+        outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+        for(int i=0; i<args.length; i++){
+            String arg=args[i];
+            String[] split=arg.split("=");
+            String a=split[0].toLowerCase();
+            String b=split.length>1 ? split[1] : null;
+            if(b==null || b.equalsIgnoreCase("null")){b=null;}
+            while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+            if(a.equals("in")){
+                in=b;
+            }else if(a.equals("out")){
+                out=b;
+            }else if(a.equals("ow") || a.equals("overwrite")){
+                //Previously this assigned the value to 'out', clobbering the output path
+                //and leaving 'overwrite' unchanged. Parsed the same way as elsewhere in this package.
+                overwrite=align2.Tools.parseBoolean(b);
+            }else{
+                throw new RuntimeException("Unsupported argument");
+            }
+        }
+
+    }
+
+    /**
+     * Runs KmerNormalize on the input, writing to a temporary file under
+     * Shared.TMPDIR, then runs Tadpole in correct mode on that file to produce
+     * the final output. The temporary file is deleted afterwards, also when one
+     * of the two stages throws.
+     */
+    public void process(){
+
+        //Random suffix mixed with the input name's hash so temp names differ between runs.
+        Random randy=new Random();
+        String temp=Shared.TMPDIR+"normalized_"+((randy.nextLong()&Long.MAX_VALUE)^in.hashCode())+".fq.gz";
+        final File f=new File(temp);
+
+        try{
+            String[] normArgs=new String[] {"in="+in, "out="+temp, "bits=32", "min=2", "target=100", "pigz", "unpigz", "ow="+overwrite};
+            KmerNormalize.main(normArgs);
+            String[] tadArgs=new String[] {"in="+temp, "out="+out, "mode=correct", "pigz", "unpigz", "ow="+overwrite};
+            Tadpole.main(tadArgs);
+        }finally{
+            if(f.exists()){
+                f.delete();
+            }
+        }
+    }
+
+    /** Destination for status messages */
+    public PrintStream outstream=System.err;
+    /** Input and output read files, with their defaults */
+    public String in="reads.fq.gz", out="corrected.fq.gz";
+    /** Passed to both stages as ow=; set by ow= or overwrite= */
+    public boolean overwrite=true;
+
+}
diff --git a/current/jgi/Orf.java b/current/jgi/Orf.java
new file mode 100755
index 0000000..cdc44be
--- /dev/null
+++ b/current/jgi/Orf.java
@@ -0,0 +1,111 @@
+package jgi;
+
+import java.util.Arrays;
+
+import align2.Tools;
+import dna.CoverageArray;
+
+/**
+ * This class is designed to help calculate coverage of ORFs
+ * @author Brian Bushnell
+ * @date May 13, 2013
+ *
+ */
+public class Orf implements Comparable<Orf>{
+
+    /**
+     * @param name_ Name of the ORF
+     * @param start_ First position, inclusive
+     * @param stop_ Last position, inclusive
+     * @param strand_ Strand code
+     */
+    public Orf(String name_, int start_, int stop_, byte strand_){
+        name=name_;
+        start=start_;
+        stop=stop_;
+        strand=strand_;
+        assert(stop>start || (start==0 && stop==0));
+    }
+
+    /** @return Tab-delimited name, start, stop and strand */
+    @Override
+    public String toString(){
+        return name+"\t"+start+"\t"+stop+"\t"+strand;
+    }
+
+    /** @return Number of positions from start to stop, inclusive */
+    public int length(){return stop-start+1;}
+
+    /** @return baseDepth divided by length, or 0 if the length is not positive */
+    public double avgCoverage(){
+        int len=length();
+        return len<=0 ? 0 : baseDepth/(double)len;
+    }
+
+    /** @return baseCoverage divided by length, or 0 if the length is not positive */
+    public double fractionCovered(){
+        int len=length();
+        return len<=0 ? 0 : baseCoverage/(double)len;
+    }
+
+    /**
+     * Reads the depth at every position of this ORF from ca and recomputes
+     * baseCoverage, baseDepth, minDepth, maxDepth, medianDepth and stdevDepth.
+     * minDepth and maxDepth consider covered positions only; both are 0 when
+     * nothing is covered.
+     * <p>
+     * NOTE: when at least one position is covered the returned array has been
+     * sorted (to find the median) and is no longer in positional order.
+     * @param ca Coverage source, indexed by position
+     * @return Per-position depths, or null if ca is null or the length is below 1
+     */
+    public int[] readCoverageArray(CoverageArray ca){
+
+        final int len=length();
+        if(len<1 || ca==null){return null;}
+        final int[] array=new int[len];
+
+        baseCoverage=0;
+        baseDepth=0;
+        minDepth=Integer.MAX_VALUE;
+        maxDepth=0;
+        medianDepth=0;
+        stdevDepth=0;
+
+        for(int i=start, j=0; i<=stop; i++, j++){
+            int cov=ca.get(i);
+            array[j]=cov;
+            //Count every position with nonzero depth, as documented on baseCoverage.
+            //The previous test (cov>1) silently ignored positions covered exactly once.
+            if(cov>0){
+                baseCoverage++;
+                baseDepth+=cov;
+                minDepth=Tools.min(minDepth, cov);
+                maxDepth=Tools.max(maxDepth, cov);
+            }
+        }
+        //Do not leak the Integer.MAX_VALUE sentinel when no position was covered.
+        if(baseCoverage<1){minDepth=0;}
+        if(baseDepth>0){
+            Arrays.sort(array);
+            medianDepth=array[array.length/2];
+            stdevDepth=Tools.standardDeviation(array);
+        }
+        return array;
+    }
+
+    /**
+     * Orders by name ascending, then by start, stop and strand descending
+     * (the same ordering as the original subtraction-based version).
+     */
+    @Override
+    public int compareTo(Orf o) {
+        int x=name.compareTo(o.name);
+        if(x!=0){return x;}
+        x=compareInts(o.start, start);
+        if(x!=0){return x;}
+        x=compareInts(o.stop, stop);
+        if(x!=0){return x;}
+        return o.strand-strand; //byte operands; cannot overflow
+    }
+
+    /** Three-way int comparison that cannot overflow, unlike a-b. */
+    private static int compareInts(int a, int b){
+        return a<b ? -1 : a>b ? 1 : 0;
+    }
+
+    /** @return true only for an Orf with the same name, start, stop and strand */
+    @Override
+    public boolean equals(Object o){return (o instanceof Orf) && equals((Orf)o);}
+    /** @return true if o is non-null and compares equal to this */
+    public boolean equals(Orf o){return o!=null && compareTo(o)==0;}
+
+    @Override
+    public int hashCode(){return Integer.rotateLeft(name.hashCode(),16)^(start<<8)^(stop)^strand;}
+
+    /** Name of ORF (not necessarily the name of its scaffold) */
+    public String name;
+    /** First position, inclusive */
+    public int start;
+    /** Last position, inclusive */
+    public int stop;
+    /** Strand code */
+    public byte strand;
+
+    /** Number of bases with nonzero coverage */
+    public long baseCoverage;
+    /** Number of reads mapped to this orf */
+    public long readDepth=0;
+    /** Number of bases mapped to this orf */
+    public long baseDepth=0;
+    /** Lowest base depth */
+    public long minDepth=0;
+    /** Highest base depth */
+    public long maxDepth=0;
+    /** Median base depth */
+    public long medianDepth=0;
+    /** Standard deviation of depth */
+    public double stdevDepth=0;
+
+
+}
diff --git a/current/jgi/PhylipToFasta.java b/current/jgi/PhylipToFasta.java
new file mode 100755
index 0000000..e9a813f
--- /dev/null
+++ b/current/jgi/PhylipToFasta.java
@@ -0,0 +1,219 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Shared;
+import align2.Tools;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 3, 2014
+ *
+ */
+public class PhylipToFasta {
+
+
+
+ /**
+  * Program entry point: builds a converter from the arguments and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final PhylipToFasta converter=new PhylipToFasta(args);
+     converter.process(timer);
+ }
+
+ /**
+  * Parses the command line and prepares the input and output file formats.
+  * <p>
+  * Arguments are key=value pairs. Keys are lower-cased and leading hyphens are
+  * stripped; a value of "null" is treated as absent. Anything accepted by
+  * Parser.parse is handled there; this constructor additionally recognizes verbose.
+  * A bare first argument that starts with "stdin" or names an existing file is
+  * taken as the input, and a bare second argument as the output.
+  * <p>
+  * NOTE(review): unknown parameters are only rejected by an assert, so with
+  * assertions disabled they are printed and ignored.
+  * @param args Command line arguments
+  * @throws RuntimeException if no input file is given or the output file cannot be written
+  */
+ public PhylipToFasta(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     //When data goes to stdout, status messages are kept on stderr.
+     for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+     outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+     ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+     Parser parser=new Parser();
+     for(int i=0; i<args.length; i++){
+         String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if(b==null || b.equalsIgnoreCase("null")){b=null;}
+         while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+         if(parser.parse(arg, a, b)){
+             //do nothing
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+             ReadWrite.verbose=verbose;
+         }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+             parser.in1=arg;
+         }else if(parser.out1==null && i==1 && !arg.contains("=")){
+             parser.out1=arg;
+         }else{
+             outstream.println("Unknown parameter "+args[i]);
+             assert(false) : "Unknown parameter "+args[i];
+             // throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+
+         //Previously never copied, so the overwrite/append flags given on the
+         //command line had no effect and an existing output could not be replaced.
+         overwrite=parser.overwrite;
+         append=parser.append;
+
+         in1=parser.in1;
+
+         out1=parser.out1;
+     }
+
+
+     if(in1==null){
+         printOptions();
+         throw new RuntimeException("Error - at least one input file is required.");
+     }
+
+
+     if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+     if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+         outstream.println((out1==null)+", "+out1);
+         throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+     }
+
+     //FASTA is passed as the default output format and PHYLIP as the default input format.
+     ffout1=FileFormat.testOutput(out1, FileFormat.FASTA, null, true, overwrite, append, false);
+
+     ffin1=FileFormat.testInput(in1, FileFormat.PHYLIP, ".phylip", true, true);
+ }
+
+ /**
+  * Reads the PHYLIP input fully into memory and writes it as FASTA.
+  * <p>
+  * The first line is skipped. Each following line that does not start with a
+  * space begins a new record: the text up to the first whitespace becomes the
+  * FASTA header and the letters in the rest of the line start its sequence.
+  * From the first line that starts with a space onward, lines are treated as
+  * continuation lines and are appended to the records in round-robin order.
+  * Only characters for which Character.isLetter is true are kept, so gap
+  * symbols such as '-' are dropped.
+  * @param t Timer that is stopped here and used for the throughput report
+  * @throws RuntimeException if a continuation line appears before any record,
+  * or if reading or writing reported an error
+  */
+ void process(Timer t){
+
+     ArrayList<StringBuilder> data=new ArrayList<StringBuilder>();
+     long bases=0;
+
+     {
+         final TextFile tf=new TextFile(ffin1);
+         String s=tf.nextLine(); //first line is some numbers
+
+         //First block: name, whitespace, then the start of the sequence.
+         for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             if(s.startsWith(" ")){break;}
+             StringBuilder sb=new StringBuilder();
+             data.add(sb);
+             sb.append('>');
+             int pos=0;
+             for(pos=0; pos<s.length(); pos++){
+                 char c=s.charAt(pos);
+                 if(Character.isWhitespace(c)){break;}
+                 sb.append(c);
+             }
+             sb.append('\n');
+             bases+=appendLetters(sb, s, pos);
+         }
+
+         final int mod=data.size();
+         if(s!=null && mod<1){
+             //Previously this case failed with an ArithmeticException from i%mod.
+             throw new RuntimeException("No sequence names were found before the first continuation line in "+in1);
+         }
+         //Continuation lines: line i belongs to record i%mod.
+         //The old loop advanced pos twice per iteration and dropped every other base.
+         for(int i=0; s!=null; i++){
+             bases+=appendLetters(data.get(i%mod), s, 0);
+             s=tf.nextLine();
+         }
+         errorState|=tf.errorState;
+     }
+     final long reads=data.size();
+
+     if(ffout1!=null){
+         TextStreamWriter tsw=new TextStreamWriter(ffout1);
+         tsw.start();
+         for(int i=0; i<data.size(); i++){
+             //Clear the slot so each record can be collected once it is handed to the writer.
+             StringBuilder sb=data.set(i, null);
+             sb.append('\n');
+             tsw.print(sb);
+         }
+         tsw.poisonAndWait();
+         errorState|=tsw.errorState;
+     }
+
+     t.stop();
+
+     double rpnano=reads/(double)(t.elapsed);
+     double bpnano=bases/(double)(t.elapsed);
+
+     String rpstring=(reads<100000 ? ""+reads : reads<100000000 ? (reads/1000)+"k" : (reads/1000000)+"m");
+     String bpstring=(bases<100000 ? ""+bases : bases<100000000 ? (bases/1000)+"k" : (bases/1000000)+"m");
+
+     //Left-pad the counts to 8 characters so the columns line up.
+     while(rpstring.length()<8){rpstring=" "+rpstring;}
+     while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+     outstream.println("Time: \t"+t);
+     outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+  * Appends every letter of s, starting at index from, to sb.
+  * @return Number of characters appended
+  */
+ private static long appendLetters(StringBuilder sb, String s, int from){
+     long added=0;
+     for(int pos=from; pos<s.length(); pos++){
+         final char c=s.charAt(pos);
+         if(Character.isLetter(c)){
+             sb.append(c);
+             added++;
+         }
+     }
+     return added;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Placeholder for the usage message; not implemented yet.
+ * With assertions enabled this throws an AssertionError; otherwise it does nothing.
+ * The commented-out text below describes jgi.ReformatReads, not this class.
+ */
+ private void printOptions(){
+ assert(false) : "printOptions: TODO";
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ /** Input file, copied from parser.in1 */
+ private String in1=null;
+
+ /** Output file, copied from parser.out1; null means nothing is written */
+ private String out1=null;
+
+ /*--------------------------------------------------------------*/
+
+ /** Format of the input file (PHYLIP is passed as the default) */
+ private final FileFormat ffin1;
+ /** Format of the output file (FASTA is passed as the default) */
+ private final FileFormat ffout1;
+
+ /*--------------------------------------------------------------*/
+
+ /** Destination for status messages */
+ private PrintStream outstream=System.err;
+ /** Enables extra status messages; also copied to ReadWrite.verbose by the constructor */
+ public static boolean verbose=false;
+ /** Set in process() when the reader or writer reports an error */
+ public boolean errorState=false;
+ /** Passed to the output-file checks */
+ private boolean overwrite=false;
+ /** Passed to the output-file checks */
+ private boolean append=false;
+
+}
diff --git a/current/jgi/RQCFilter.java b/current/jgi/RQCFilter.java
new file mode 100755
index 0000000..a18dbdb
--- /dev/null
+++ b/current/jgi/RQCFilter.java
@@ -0,0 +1,2185 @@
+package jgi;
+
+import java.io.File;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.TimeZone;
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+
+import stream.FASTQ;
+import tax.FilterByTaxa;
+import tax.GiToNcbi;
+import tax.TaxTree;
+
+import align2.BBMap;
+import align2.BBSplitter;
+import align2.RefToIndex;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import fileIO.ByteFile1;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * Wrapper for several other programs to implement Rolling QC's filter stage.
+ * Calls BBDuk, BBMap, BBMerge, and SplitNexteraLMP.
+ * Trims adapters, removes contaminants, and does quality-trimming.
+ * @author Brian Bushnell
+ * @date Nov 26, 2013
+ *
+ */
+public class RQCFilter {
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Program entrance from command line.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ //The clock covers argument parsing as well as the pipeline itself
+ final Timer wallClock=new Timer();
+
+ //Parse the arguments and configure every stage
+ final RQCFilter rqc=new RQCFilter(args);
+
+ //Run all enabled stages
+ rqc.process();
+
+ //Stop the clock, then print the elapsed time to stderr
+ wallClock.stop();
+ System.err.println("\nOverall Time: \t"+wallClock);
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ RQCFilter(String[] args){
+ //Each key=value argument is either captured into a field below or forwarded unchanged to BBDuk via primaryArgList.
+ //Parses some shared arguments
+ Parser parser=new Parser();
+
+ //Symbols to insert in output filename to denote operations performed; may be overridden from command line
+ String symbols_=null;
+
+ boolean doNextera_=false;
+
+ //Parse argument list
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Expect key=value pairs
+ String a=split[0].toLowerCase(); //All keys are converted to lower case
+ String b=split.length>1 ? split[1] : null;
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ if(a.equals("pigz")){
+ pigz=b;
+ }else if(a.equals("unpigz")){
+ unpigz=b;
+ }else if(a.equals("zl") || a.equals("ziplevel")){
+ zl=b;
+ }
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ primaryArgList.add(arg);
+ }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+ in1=b;
+ }else if(a.equals("in2") || a.equals("input2")){
+ in2=b;
+ }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+ out1=b;
+ }else if(a.equals("out2") || a.equals("output2")){
+ out2=b;
+ }else if(a.equals("ref")){
+ if(b!=null){
+ if(!b.contains(",") || new File(b).exists()){
+ bbdukFilterRefs.add(b);
+ }else{
+ String[] split2=b.split(",");
+ for(String s2 : split2){
+ bbdukFilterRefs.add(s2);
+ }
+ }
+ }
+ }else if(a.equals("artifactdb")){
+ mainArtifactFile=b;
+ }else if(a.equals("rnadb")){
+ artifactFileRna=b;
+ }else if(a.equals("dnadb")){
+ artifactFileDna=b;
+ }else if(a.equals("ribodb")){
+ riboKmers=b;
+ }else if(a.equals("phixref")){
+ phixRef=b;
+ }else if(a.equals("fragadapter")){
+ fragAdapter=b;
+ }else if(a.equals("rnaadapter")){
+ rnaAdapter=b;
+ }else if(a.equals("lfpelinker")){
+ lfpeLinker=b;
+ }else if(a.equals("cliplinker") || a.equals("jointseq")){
+ clipLinker=b;
+ }else if(a.equals("clrslinker")){
+ clrsLinker=b;
+ }else if(a.equals("trimfragadapter") || a.equals("trimfragadapters")){
+ fragAdapterFlag=Tools.parseBoolean(b);
+ }else if(a.equals("trimrnaadapter") || a.equals("trimrnaadapters")){
+ rnaAdapterFlag=Tools.parseBoolean(b);
+ }else if(a.equals("removehuman") || a.equals("human")){
+ humanFlag=Tools.parseBoolean(b);
+ }else if(a.equals("removedog") || a.equals("dog")){
+ dogFlag=Tools.parseBoolean(b);
+ }else if(a.equals("removecat") || a.equals("cat")){
+ catFlag=Tools.parseBoolean(b);
+ }else if(a.equals("removemouse") || a.equals("mouse")){
+ mouseFlag=Tools.parseBoolean(b);
+ }else if(a.equals("catdoghuman")){
+ catDogHumanFlag=Tools.parseBoolean(b);
+ }else if(a.equals("catdoghumanmouse") || a.equals("mousecatdoghuman")){
+ mouseCatDogHumanFlag=Tools.parseBoolean(b);
+ }else if(a.equals("aggressive") || a.equals("aggressivehuman")){
+ aggressiveMappingFlag=Tools.parseBoolean(b);
+ }else if(a.equals("removemicrobes") || a.equals("removecommonmicrobes") || a.equals("microbes")){
+ commonMicrobeFlag=Tools.parseBoolean(b);
+ }else if(a.equals("removeribo") || a.equals("ribo")){
+ riboFlag=Tools.parseBoolean(b);
+ }else if(a.equals("riboout") || a.equals("outribo")){
+ riboOutFile=b;
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){
+ minLen=Integer.parseInt(b);
+ }else if(a.equals("mlf") || a.equals("minlenfrac") || a.equals("minlenfraction") || a.equals("minlengthfraction")){
+ minLenFraction=Float.parseFloat(b);
+ }else if(a.equals("libtype") || a.equals("library")){
+ libType=toLibType(b);
+ }else if(a.equals("path") || a.equals("outdir")){
+ outDir=b;
+ }else if(a.equals("symbols")){
+ symbols_=b;
+ }else if(a.equals("overallstats") || a.equals("stats")){
+ rqcStatsName=b;
+ }else if(a.equals("scafstats")){
+ scaffoldStatsName=b;
+ }else if(a.equals("scafstatskt") || a.equals("scafstatstrim")){
+ scaffoldStatsName_kt=b;
+ }else if(a.equals("refstats")){
+ refStatsName=b;
+ }else if(a.equals("kmerstats")){
+ kmerStatsName=b;
+ }else if(a.equals("log")){
+ logName=b;
+ }else if(a.equals("ihist")){
+ ihistName=b;
+ }else if(a.equals("khist") || a.equals("dokhist")){
+ doKhist=Tools.parseBoolean(b);
+ }else if(a.equals("filelist")){
+ fileListName=b;
+ }else if(a.equals("compress")){
+ compress=Tools.parseBoolean(b);
+ }else if(a.equals("dna")){
+ dnaArtifactFlag=Tools.parseBoolean(b);
+ }else if(a.equals("rna")){
+ rnaArtifactFlag=Tools.parseBoolean(b);
+ dnaArtifactFlag=!rnaArtifactFlag; //This line requested by Bryce.
+ }else if(a.equals("phix")){
+ phixFlag=Tools.parseBoolean(b);
+ }else if(a.equals("pjet")){
+ pjetFlag=Tools.parseBoolean(b);
+ }else if(a.equals("jointseq")){ //NOTE(review): unreachable; "jointseq" is already consumed by the cliplinker branch above
+ jointSeq=b;
+ }else if(a.equals("nextera") || a.equals("nexteralmp")){
+ doNextera_=Tools.parseBoolean(b);
+ }else if(a.equals("ktrim")){
+ ktrim=b;
+ }else if(a.equals("mink")){
+ mink=Integer.parseInt(b);
+ }else if(a.equals("k")){
+ assert(false) : "To specify kmer length, use filterk, trimk, mapk, or normalizek instead of just 'k'";
+ filter_k=Integer.parseInt(b);
+ }else if(a.equals("filterk")){
+ filter_k=Integer.parseInt(b);
+ }else if(a.equals("trimk")){
+ trim_k=Integer.parseInt(b);
+ }else if(a.equals("mapk")){
+ map_k=Integer.parseInt(b);
+ }else if(a.equals("normalizek") || a.equals("normk") || a.equals("ecck")){
+ normalize_k=Integer.parseInt(b);
+ }else if(a.equals("filterhdist")){
+ hdist_filter=Integer.parseInt(b);
+ }else if(a.equals("filterqhdist")){
+ qhdist_filter=Integer.parseInt(b);
+ }else if(a.equals("trimhdist")){
+ hdist_trim=Integer.parseInt(b);
+ }else if(a.equals("trimhdist2")){
+ hdist2_trim=Integer.parseInt(b);
+ }else if(a.equals("ribohdist")){
+ hdist_ribo=Integer.parseInt(b);
+ }else if(a.equals("riboedist") || a.equals("riboedits")){
+ edist_ribo=Integer.parseInt(b);
+ }else if(a.equals("maq")){
+ if(b.indexOf(',')>-1){
+ String[] x=b.split(",");
+ assert(x.length==2) : "maq should be length 1 or 2 (at most 1 comma).\nFormat: maq=quality,bases; e.g. maq=10 or maq=10,20";
+ minAvgQuality=Byte.parseByte(x[0]);
+ minAvgQualityBases=Integer.parseInt(x[1]);
+ }else{
+ minAvgQuality=Byte.parseByte(b);
+ }
+ }else if(a.equals("forcetrimmod") || a.equals("forcemrimmodulo") || a.equals("ftm")){ //NOTE(review): "forcemrimmodulo" looks like a typo of "forcetrimmodulo"; key left as-is pending confirmation
+ forceTrimModulo=Integer.parseInt(b);
+ }else if(a.equals("trimq")){
+ trimq=Byte.parseByte(b);
+ }else if(a.equals("qtrim")){
+ if(b==null){qtrim="rl";}
+ else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrim="l";qtrimFlag=true;}
+ else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrim="r";qtrimFlag=true;}
+ else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrim="lr";qtrimFlag=true;}
+ else if(Character.isDigit(b.charAt(0))){
+ trimq=Byte.parseByte(b);
+ qtrimFlag=(trimq>=0);
+ qtrim=(qtrimFlag ? "lr" : "f");
+ }else{
+ qtrimFlag=Tools.parseBoolean(b);
+ qtrim=""+qtrimFlag;
+ }
+ }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){
+ if(b!=null && (b.charAt(0)=='.' || Character.isDigit(b.charAt(0)))){
+ TrimRead.optimalMode=true;
+ TrimRead.optimalBias=Float.parseFloat(b);
+ assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1);
+ }else{
+ TrimRead.optimalMode=Tools.parseBoolean(b);
+ }
+ }else if(a.equals("maxns")){
+ maxNs=Integer.parseInt(b);
+ }else if(a.equals("usetmpdir")){
+ writeTempToTmpdir=Tools.parseBoolean(b);
+ }else if(a.equals("tmpdir")){
+ tmpDir=b;
+ writeTempToTmpdir=(b!=null);
+ }else if(a.equals("humanpath")){
+ humanPath=b;
+ }else if(a.equals("catpath")){
+ catPath=b;
+ }else if(a.equals("dogpath")){
+ dogPath=b;
+ }else if(a.equals("mousepath")){
+ mousePath=b; //Read below when building the "path="+mousePath mapping reference
+ }else if(a.equals("mapref") || a.equals("maprefs")){
+ if(b==null){mappingRefs.clear();}
+ else{
+ for(String s : b.split(",")){
+ mappingRefs.add(s);
+ }
+ }
+ }else if(a.equals("chastityfilter") || a.equals("cf")){
+ chastityfilter=b;
+ }else if(a.equals("failnobarcode")){
+ failnobarcode=b;
+ }else if(a.equals("badbarcodes") || a.equals("barcodefilter")){
+ barcodefilter=b;
+ }else if(a.equals("barcodes") || a.equals("barcode")){
+ barcodes=b;
+ }else if(a.equals("extend")){
+ extendFlag=Tools.parseBoolean(b);
+ }else if(a.equals("taxlist") || a.equals("tax") || a.equals("taxa")){
+ taxList=b;
+ }else if(a.equals("taxtree") || a.equals("tree")){
+ taxTree=b;
+ }else if(a.equals("loadgitable")){
+ loadGiTable=Tools.parseBoolean(b);
+ }else if(a.equals("gitable")){
+ giTable=b;
+ loadGiTable=(b!=null);
+ }else if(a.equals("taxlevel") || a.equals("level")){
+ taxLevel=b;
+ }else if(a.equals("microberef")){
+ commonMicrobesRef=b;
+ }else if(a.equals("microbepath")){
+ commonMicrobesPath=b;
+ }else{
+ //Uncaptured arguments are passed to BBDuk
+ primaryArgList.add(arg);
+ }
+ }
+
+ doNextera=doNextera_;
+
+// assert(false) : rnaArtifactFlag+"\n"+primaryArgList+"\n"+libType+"\n"+outDir;
+
+ if(writeTempToTmpdir){
+ if(tmpDir==null){tmpDir=Shared.TMPDIR;}
+ if(tmpDir!=null){
+ tmpDir=tmpDir.replace('\\', '/');
+ if(tmpDir.length()>0 && !tmpDir.endsWith("/")){tmpDir+="/";}
+ }
+ }else{tmpDir=null;}
+
+ if(hdist2_trim<0){hdist2_trim=hdist_trim;}
+
+ //Pass overwrite flag to BBDuk
+ primaryArgList.add("ow="+overwrite);
+
+ if(outDir!=null){
+ outDir=outDir.trim().replace('\\', '/');
+ if(outDir.length()>0 && !outDir.endsWith("/")){outDir=outDir+"/";}
+ }else{outDir="";}
+
+ {//Prepend output directory to output files
+ if(logName!=null){logName=outDir+logName/*+".tmp"*/;} //The '.tmp' suffix is currently disabled
+ if(reproduceName!=null){reproduceName=outDir+reproduceName;}
+ if(fileListName!=null){fileListName=outDir+fileListName;}
+ if(ihistName!=null){ihistName=outDir+ihistName;}
+ if(khistName!=null){khistName=outDir+khistName;}
+ if(peaksName!=null){peaksName=outDir+peaksName;}
+ if(riboOutFile!=null){riboOutFile=outDir+riboOutFile;}
+ if(humanOutFile!=null){humanOutFile=outDir+humanOutFile;}
+ if(synthOutFile!=null){synthOutFile=outDir+synthOutFile;}
+ if(microbeOutFile!=null){microbeOutFile=outDir+microbeOutFile;}
+ if(microbeStatsFile!=null){microbeStatsFile=outDir+microbeStatsFile;}
+
+ if(cardinalityName!=null){cardinalityName=outDir+cardinalityName;}
+ }
+
+ {//Create unique output file names for second pass
+ if(rqcStatsName!=null){
+ rqcStatsName_kt=outDir+"ktrim_"+rqcStatsName;
+ rqcStatsName=outDir+rqcStatsName;
+ }
+ if(kmerStatsName!=null){
+ kmerStatsName_kt=outDir+"ktrim_"+kmerStatsName;
+ kmerStatsName=outDir+kmerStatsName;
+ }
+ if(scaffoldStatsName!=null){
+ scaffoldStatsName_kt=outDir+"ktrim_"+scaffoldStatsName;
+ scaffoldStatsName=outDir+scaffoldStatsName;
+ }
+ if(refStatsName!=null){
+ refStatsName=outDir+refStatsName;
+ }
+ }
+
+ //Determine execution path
+ if(libType==FRAG || ((libType==LFPE && lfpeLinker==null) || (libType==CLIP && clipLinker==null) || (libType==CLRS && clrsLinker==null))){
+ doTrim=(fragAdapterFlag || rnaAdapterFlag);
+ doFilter=true;
+ }else if(libType==LFPE){
+ doTrim=true;
+ doFilter=true;
+ }else if(libType==CLIP){
+ doTrim=true;
+ doFilter=true;
+ }else if(libType==CLRS){
+ doTrim=true;
+ doFilter=true;
+ }else{
+ throw new RuntimeException("Unknown library type.");
+ }
+
+ if(catFlag && dogFlag && humanFlag){
+ if(mouseFlag){
+ mouseCatDogHumanFlag=true;
+ }else{
+ catDogHumanFlag=true;
+ }
+ }
+
+ if(catDogHumanFlag || mouseCatDogHumanFlag){
+ mouseFlag=false;
+ catFlag=false;
+ dogFlag=false;
+ humanFlag=false;
+ }
+
+ if(dogFlag){mappingRefs.add("path="+dogPath);}
+ if(catFlag){mappingRefs.add("path="+catPath);}
+ if(mouseFlag){mappingRefs.add("path="+mousePath);}
+ doMerge=(ihistName!=null);
+
+ //Set final field 'symbols'
+ symbols=(symbols_==null ? abbreviation() : symbols_);
+
+ assert(in1!=null) : "No input file specified.";
+
+ //Create output filename from input filename if no output filename is specified
+ if(out1==null){
+
+ File f=new File(in1);
+ String name=f.getName();
+ rawName=ReadWrite.rawName(name);
+ int dot=rawName.lastIndexOf('.');
+ if(dot>-1){
+ out1=rawName.substring(0, dot)+"."+symbols+rawName.substring(dot)+(compress ? ".gz" : "");
+ }else{
+ out1=rawName+"."+symbols+".fastq"+(compress ? ".gz" : "");
+ }
+ }else{
+ File f=new File(out1);
+ String name=f.getName();
+ rawName=ReadWrite.rawName(name);
+ }
+
+ tempSalt=KmerNormalize.getSalt(out1, 1);
+ trimPrefix="TEMP_TRIM_"+tempSalt+"_";
+ humanPrefix="TEMP_HUMAN_"+tempSalt+"_";
+ filterPrefix="TEMP_FILTER_"+tempSalt+"_";
+ taxaPrefix="TEMP_TAXA_"+tempSalt+"_";
+ microbePrefix="TEMP_MICROBE_"+tempSalt+"_";
+ riboPrefix="TEMP_RIBO_"+tempSalt+"_";
+
+ if(mappingRefs.size()>0){
+ mappingPrefix=new String[mappingRefs.size()];
+ for(int i=0; i<mappingRefs.size(); i++){
+ mappingPrefix[i]="TEMP_MAP_"+tempSalt+"_"+i+"_";
+ }
+ }else{
+ mappingPrefix=null;
+ }
+
+ if(reproduceName!=null){
+ writeReproduceHeader(reproduceName, args, overwrite);
+ }
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Processing Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Primary method to fully execute the program.
+ */
+ public void process(){
+ //Stage order is fixed: ktrim, filter, ribo, microbes, human/cat/dog/mouse, mapping refs, then Nextera or merge/khist.
+ //Create output directory if it does not exist (the result of mkdirs() is not checked)
+ if(outDir!=null && outDir.length()>0){
+ File f=new File(outDir);
+ if(!f.exists()){
+ f.mkdirs();
+ }
+ }
+
+ //Check the log via Tools.canWrite (result is only asserted, so ignored without -ea), then record the start
+ if(logName!=null){
+ boolean b=Tools.canWrite(logName, overwrite);
+ assert(b) : "Can't write to "+logName;
+ log("start", false);
+ }
+
+ //Write the file list: key=value lines naming the outputs this run is configured to produce
+ if(fileListName!=null){
+ boolean b=Tools.canWrite(fileListName, overwrite);
+ assert(b) : "Can't write to "+fileListName;
+
+ StringBuilder sb=new StringBuilder();
+ if(!doNextera){
+ if(out1!=null){sb.append("filtered_fastq="+out1).append('\n');}
+ if(out2!=null){sb.append("filtered_fastq_2="+out2).append('\n');}
+ }
+
+ String x=(outDir==null ? "" : outDir);
+ int xlen=x.length();
+
+ //Names that start with the output directory are listed relative to it (prefix stripped); others are listed as-is
+ if(ihistName!=null){sb.append("ihist="+(ihistName.startsWith(x) ? ihistName.substring(xlen) : ihistName)).append('\n');}
+ if(doKhist){
+ if(khistName!=null){sb.append("khist="+(khistName.startsWith(x) ? khistName.substring(xlen) : khistName)).append('\n');}
+ if(peaksName!=null){sb.append("peaks="+(peaksName.startsWith(x) ? peaksName.substring(xlen) : peaksName)).append('\n');}
+ }
+ if(scaffoldStatsName!=null){sb.append("scafstats="+(scaffoldStatsName.startsWith(x) ? scaffoldStatsName.substring(xlen) : scaffoldStatsName)).append('\n');}
+ if(refStatsName!=null){sb.append("refstats="+(refStatsName.startsWith(x) ? refStatsName.substring(xlen) : refStatsName)).append('\n');}
+ if(riboFlag && riboOutFile!=null){sb.append("ribo="+(riboOutFile.startsWith(x) ? riboOutFile.substring(xlen) : riboOutFile)).append('\n');}
+ if(commonMicrobeFlag && microbeOutFile!=null){
+ sb.append("microbeReads="+(microbeOutFile.startsWith(x) ? microbeOutFile.substring(xlen) : microbeOutFile)).append('\n');
+ sb.append("microbeStats="+(microbeStatsFile.startsWith(x) ? microbeStatsFile.substring(xlen) : microbeStatsFile)).append('\n');
+ }
+ if(doFilter && synthOutFile!=null){sb.append("synthReads="+(synthOutFile.startsWith(x) ? synthOutFile.substring(xlen) : synthOutFile)).append('\n');}
+ if((humanFlag || catDogHumanFlag || mouseCatDogHumanFlag) && humanOutFile!=null){sb.append("humanReads="+(humanOutFile.startsWith(x) ? humanOutFile.substring(xlen) : humanOutFile)).append('\n');}
+
+ if(sb.length()>0){
+ ReadWrite.writeString(sb, fileListName, false);
+ }
+ }
+
+ {
+ //step counts the stages run so far; inPrefix/outPrefix carry the temp-file prefix from one stage to the next.
+ //Calculate number of total steps, to determine when to write to the output directory versus localdisk.
+ int step=0;
+ final int numSteps=(doFilter ? 1 : 0)+(doTrim ? 1 : 0)+(doNextera ? 1 : 0)+(riboFlag ? 1 : 0)+(commonMicrobeFlag ? 1 : 0)+
+ ((humanFlag || catDogHumanFlag || mouseCatDogHumanFlag) ? 1 : 0)+mappingRefs.size();
+ String inPrefix=null, outPrefix=null;
+
+ //Adapter trimming
+ if(doTrim){
+ step++;
+ inPrefix=outPrefix;
+ outPrefix=(step<numSteps ? trimPrefix : null);
+// System.err.println("Trim. step="+step+", in="+in1+", out="+out1+", inPrefix="+inPrefix+", outPrefix="+outPrefix);
+ //Step 1 reads in1/in2 as given; later steps read the out1/out2 names passed through stripDirs (defined elsewhere; presumably drops directory components). Only the last step writes out1/out2 unmodified.
+ final String in1z, in2z, out1z, out2z;
+ if(step==1){
+ in1z=in1; in2z=in2;
+ }else{
+ in1z=stripDirs(out1); in2z=stripDirs(out2);
+ }
+ if(step>=numSteps){
+ out1z=out1; out2z=out2;
+ }else{
+ out1z=stripDirs(out1); out2z=stripDirs(out2);
+ }
+
+ ktrim(in1z, in2z, out1z, out2z, inPrefix, outPrefix, step);
+ //Two input files but one output: force FASTQ interleaved handling from here on (effect on later stages presumed; FASTQ is defined elsewhere)
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+ //Trimming is always the first stage when enabled, so inPrefix is still null here and this delete is never reached.
+ if(inPrefix!=null){
+ delete(inPrefix, out1z, out2z);
+ }
+ }
+
+ //Synthetic contaminant filtering
+ if(doFilter){
+ step++;
+ inPrefix=outPrefix;
+ outPrefix=(step<numSteps ? filterPrefix : null);
+// System.err.println("Filter. step="+step+", in="+in1+", out="+out1+", inPrefix="+inPrefix+", outPrefix="+outPrefix);
+
+ final String in1z, in2z, out1z, out2z;
+ if(step==1){
+ in1z=in1; in2z=in2;
+ }else{
+ in1z=stripDirs(out1); in2z=stripDirs(out2);
+ }
+ if(step>=numSteps){
+ out1z=out1; out2z=out2;
+ }else{
+ out1z=stripDirs(out1); out2z=stripDirs(out2);
+ }
+
+ filter(in1z, in2z, out1z, out2z, synthOutFile, inPrefix, outPrefix, step);
+
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(step>1){
+ delete(inPrefix, out1z, out2z);
+ }
+ }
+
+ //Ribosomal RNA removal
+ if(riboFlag){
+ step++;
+ inPrefix=outPrefix;
+ outPrefix=(step<numSteps ? riboPrefix : null);
+// System.err.println("Filter. step="+step+", in="+in1+", out="+out1+", inPrefix="+inPrefix+", outPrefix="+outPrefix);
+
+ final String in1z, in2z, out1z, out2z;
+ if(step==1){
+ in1z=in1; in2z=in2;
+ }else{
+ in1z=stripDirs(out1); in2z=stripDirs(out2);
+ }
+ if(step>=numSteps){
+ out1z=out1; out2z=out2;
+ }else{
+ out1z=stripDirs(out1); out2z=stripDirs(out2);
+ }
+
+ filterRibo(in1z, in2z, out1z, out2z, riboOutFile, inPrefix, outPrefix, step);
+
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(step>1){
+ delete(inPrefix, out1z, out2z);
+ }
+ }
+
+ //Microbial contaminant removal
+ if(commonMicrobeFlag){
+ step++;
+ inPrefix=outPrefix;
+ outPrefix=(step<numSteps ? microbePrefix : null);
+// System.err.println("Filter. step="+step+", in="+in1+", out="+out1+", inPrefix="+inPrefix+", outPrefix="+outPrefix);
+
+ final String in1z, in2z, out1z, out2z;
+ if(step==1){
+ in1z=in1; in2z=in2;
+ }else{
+ in1z=stripDirs(out1); in2z=stripDirs(out2);
+ }
+ if(step>=numSteps){
+ out1z=out1; out2z=out2;
+ }else{
+ out1z=stripDirs(out1); out2z=stripDirs(out2);
+ }
+
+ String ref=taxFilter(commonMicrobesRef);
+// System.err.println("in1z="+in1z+"\nout1z="+out1z+"\ninPrefix="+inPrefix+"\noutPrefix="+outPrefix);
+ removeCommonMicrobes(in1z, in2z, out1z, out2z, microbeOutFile, microbeStatsFile, inPrefix, outPrefix, ref, step, aggressiveMappingFlag);
+
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(step>1){
+ delete(inPrefix, out1z, out2z);
+ }
+ }
+
+ //Human, cat, dog, and mouse removal
+ if(humanFlag || catDogHumanFlag || mouseCatDogHumanFlag){
+ step++;
+ inPrefix=outPrefix;
+ outPrefix=(step<numSteps ? humanPrefix : null);
+// System.err.println("Human. step="+step+", in="+in1+", out="+out1+", inPrefix="+inPrefix+", outPrefix="+outPrefix);
+
+ final String in1z, in2z, out1z, out2z;
+ if(step==1){
+ in1z=in1; in2z=in2;
+ }else{
+ in1z=stripDirs(out1); in2z=stripDirs(out2);
+ }
+ if(step>=numSteps){
+ out1z=out1; out2z=out2;
+ }else{
+ out1z=stripDirs(out1); out2z=stripDirs(out2);
+ }
+
+ dehumanize(in1z, in2z, out1z, out2z, humanOutFile, inPrefix, outPrefix, step, catDogHumanFlag, mouseCatDogHumanFlag, aggressiveMappingFlag);
+
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ Data.unloadAll(); //Presumably releases the reference data loaded for mapping - confirm
+ if(step>1){
+ delete(inPrefix, out1z, out2z);
+ }
+ }
+
+ //Removal of other assorted reference sequences by mapping
+ if(mappingRefs.size()>0){
+ for(int i=0; i<mappingRefs.size(); i++){
+ step++;
+ inPrefix=outPrefix;
+ outPrefix=(step<numSteps ? mappingPrefix[i] : null);
+ // System.err.println("Human. step="+step+", in="+in1+", out="+out1+", inPrefix="+inPrefix+", outPrefix="+outPrefix);
+
+ final String in1z, in2z, out1z, out2z;
+ if(step==1){
+ in1z=in1; in2z=in2;
+ }else{
+ in1z=stripDirs(out1); in2z=stripDirs(out2);
+ }
+ if(step>=numSteps){
+ out1z=out1; out2z=out2;
+ }else{
+ out1z=stripDirs(out1); out2z=stripDirs(out2);
+ }
+
+ decontamByMapping(in1z, in2z, out1z, out2z, null, null, inPrefix, outPrefix, mappingRefs.get(i), step);
+
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ Data.unloadAll();
+ if(step>1){
+ delete(inPrefix, out1z, out2z);
+ }
+ }
+ }
+
+ //Nextera LMP library processing
+ if(doNextera){
+ step++;
+ inPrefix=outPrefix;
+ outPrefix=null; //Nextera is counted in numSteps and runs last, so it never writes to a temp prefix
+// System.err.println("Nextera. step="+step+", in="+in1+", out="+out1+", inPrefix="+inPrefix+", outPrefix="+outPrefix);
+
+ final String in1z, in2z, out1z, out2z;
+ if(step==1){
+ in1z=in1; in2z=in2;
+ }else{
+ in1z=stripDirs(out1); in2z=stripDirs(out2);
+ }
+
+ if(step>=numSteps){
+ out1z=out1; out2z=out2;
+ }else{
+ out1z=stripDirs(out1); out2z=stripDirs(out2);
+ }
+
+ //Insert size calculation
+ if(doMerge){merge(in1z, in2z, inPrefix);}
+
+ splitNextera(in1z, in2z, inPrefix, outPrefix, step);
+
+ if(in2!=null && out2==null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ Data.unloadAll();
+ if(step>1){
+ delete(inPrefix, out1z, out2z);
+ }
+ }else{
+ if(doMerge){//Insert size calculation
+ if(step==0){
+ merge(in1, in2, null);
+ }else{
+ merge(out1, out2, null);
+ }
+ }
+ if(doKhist){
+ if(step==0){
+ khist(in1, in2, null);
+ }else{
+ khist(out1, out2, null);
+ }
+ }
+ }
+ }
+
+ //Write combined stats file (number of reads/bases present/removed in each stage)
+ if(rqcStatsName!=null){
+ final TextStreamWriter tsw=new TextStreamWriter(rqcStatsName, overwrite, false, false);
+ tsw.start();
+ tsw.println(BBDukF.rqcString());
+ tsw.poisonAndWait();
+ }
+
+// {//Set files to permission 777
+// setPermissions((out1==null ? null : outDir+out1),(out2==null ? null : outDir+out2));
+// setPermissions((qfout1==null ? null : outDir+qfout1),(qfout2==null ? null : outDir+qfout2));
+// setPermissions(reproduceName,fileListName);
+// setPermissions(rqcStatsName,kmerStatsName,scaffoldStatsName);
+// setPermissions(rqcStatsName_kt,kmerStatsName_kt,scaffoldStatsName_kt);
+// setPermissions(outDir);
+// }
+
+ //Finish writing log file
+ if(logName!=null){
+ log("complete", true);
+ if(logName.endsWith(".tmp")){ //Remove .tmp extension when the name carries one; renameTo's result is not checked
+ String old=logName;
+ logName=logName.substring(0, logName.length()-4);
+ new File(old).renameTo(new File(logName));
+ }
+ }
+
+// //Set log file permission
+// setPermissions(logName);
+
+ }
+
+
+ /**
+ * Runs BBDuk to perform:
+ * Kmer trimming, short read removal.
+ *
+ * @param in1 Primary input reads file (required)
+ * @param in2 Secondary input reads file
+ * @param out1 Primary output reads file (required)
+ * @param out2 Secondary output reads file
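+ * @param inPrefix Temp-file prefix prepended (after the temp/output directory) to input filenames; null to use them as given
+ * @param outPrefix Temp-file prefix prepended (after the temp/output directory) to output filenames; null to write into the output directory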
+ */
+ private void ktrim(String in1, String in2, String out1, String out2, String inPrefix, String outPrefix, int stepNum){
+ //stepNum is part of the shared stage signature; this method never reads it.
+ log("ktrim start", true);
+ ktrimFlag=true;
+
+ final ArrayList<String> dukArgList=new ArrayList<String>();
+ final String tempRoot=(tmpDir!=null ? tmpDir : outDir); //Temp files go to tmpDir when configured, else outDir
+ final String inPath=(inPrefix!=null ? tempRoot+inPrefix : "");
+ final String outPath=(outPrefix!=null ? tempRoot+outPrefix : outDir);
+
+ {//Core BBDuk trimming arguments
+ dukArgList.add("ktrim="+(ktrim!=null ? ktrim : "f"));
+ if(minLen>0){dukArgList.add("minlen="+minLen);}
+ if(minLenFraction>0){dukArgList.add("minlenfraction="+minLenFraction);}
+ if(libType!=CLIP){
+ dukArgList.add("mink="+mink);
+ //tbo/tpe are only requested for right-end trimming of fragment libraries
+ final boolean rightTrim=("r".equalsIgnoreCase(ktrim) || "right".equalsIgnoreCase(ktrim));
+ if(libType==FRAG && rightTrim){
+ if(tboFlag){dukArgList.add("tbo");}
+ if(tpeFlag){dukArgList.add("tpe");}
+ }
+ dukArgList.add("overwrite="+overwrite);
+ dukArgList.add("k="+trim_k);
+ dukArgList.add("hdist="+hdist_trim);
+ if(hdist2_trim>=0){dukArgList.add("hdist2="+hdist2_trim);}
+ if(forceTrimModulo>0){
+ dukArgList.add("ftm="+forceTrimModulo);
+ }
+ }
+ if(pigz!=null){dukArgList.add("pigz="+pigz);}
+ if(unpigz!=null){dukArgList.add("unpigz="+unpigz);}
+ if(zl!=null){dukArgList.add("zl="+zl);}
+
+ //Forward every argument the constructor did not capture
+ dukArgList.addAll(primaryArgList);
+
+ //Read inputs and outputs, each with its directory/prefix applied
+ if(in1!=null){dukArgList.add("in1="+inPath+in1);}
+ if(in2!=null){dukArgList.add("in2="+inPath+in2);}
+ if(out1!=null){dukArgList.add("out1="+outPath+out1);}
+ if(out2!=null){dukArgList.add("out2="+outPath+out2);}
+
+// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName_kt);} //Old style for 2 log files
+ if(rqcStatsName!=null){dukArgList.add("rqc=hashmap");}
+ if(kmerStatsName_kt!=null){dukArgList.add("outduk="+kmerStatsName_kt);}
+ if(scaffoldStatsName_kt!=null){dukArgList.add("stats="+scaffoldStatsName_kt);}
+
+ dukArgList.add("loglog"); //Cardinality
+ }
+
+ {//Choose adapter/linker references according to library type
+ final ArrayList<String> adapterRefs=new ArrayList<String>();
+
+ if(libType==FRAG){
+ if(fragAdapterFlag){adapterRefs.add(fragAdapter);}
+ if(rnaAdapterFlag){adapterRefs.add(rnaAdapter);}
+ }else if(libType==LFPE){
+ adapterRefs.add(lfpeLinker);
+ }else if(libType==CLIP){
+// refs.add(clipLinker);
+ if(clipLinker==null){
+ throw new RuntimeException("Null clip linker.");
+ }
+ //The clip linker is handed to BBDuk as literal sequence rather than as a reference file
+ dukArgList.add("literal="+clipLinker);
+ //Special processing for literal strings of approx 4bp: k becomes the length of the shortest literal.
+ //CLIP skipped the k/mink/hdist block above, so these are its only kmer settings besides forwarded arguments.
+ final String[] literals=clipLinker.split(",");
+ int shortest=literals[0].length();
+ for(String literal : literals){shortest=Tools.min(shortest, literal.length());}
+ dukArgList.add("k="+shortest);
+ dukArgList.add("mink=-1");
+ dukArgList.add("mm=f");
+ dukArgList.add("hdist=0");
+ dukArgList.add("edist=0");
+ dukArgList.add("ktrimexclusive=t");
+ }else if(libType==CLRS){
+ adapterRefs.add(clrsLinker);
+ }else{
+ throw new RuntimeException("Unknown library type.");
+ }
+
+ //Join the non-null references into a single "ref=a,b,..." argument.
+ //Null entries (unset adapter or linker paths) are skipped.
+ final StringBuilder refArg=new StringBuilder();
+ for(String adapterRef : adapterRefs){
+ if(adapterRef==null){continue;}
+ refArg.append(refArg.length()>0 ? "," : "ref=").append(adapterRef);
+ }
+
+ if(refArg.length()>0){
+ dukArgList.add(refArg.toString());
+ }
+ }
+
+ final String[] bbdukArgs=dukArgList.toArray(new String[dukArgList.size()]);
+
+ if(reproduceName!=null){
+ writeReproduceFile(reproduceName, "bbduk.sh", bbdukArgs);
+ }
+
+ {//Run BBDuk; any exception is logged and terminates the JVM
+ final BBDukF bbduk=new BBDukF(bbdukArgs);
+ try {
+ bbduk.process();
+ } catch (Exception failure) {
+ failure.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ //Optionally append files to file list here
+
+ log("ktrim finish", true);
+ }
+
+ /**
+ * Runs BBDuk to perform:
+ * Quality filtering, quality trimming, n removal, short read removal, artifact removal (via kmer filtering), phiX removal.
+ *
+ * @param in1 Primary input reads file (required)
+ * @param in2 Secondary input reads file
+ * @param out1 Primary output reads file (required)
+ * @param out2 Secondary output reads file
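+ * @param inPrefix Temp-file prefix prepended (after the temp/output directory) to input filenames; null to use them as given
+ * @param outPrefix Temp-file prefix prepended (after the temp/output directory) to output filenames; null to write into the output directory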
+ */
+ private void filter(String in1, String in2, String out1, String out2, String outbad, String inPrefix, String outPrefix,
+ int stepNum){
+ //stepNum is part of the shared stage signature; this method never reads it.
+ log("filter start", true);
+
+ final ArrayList<String> dukArgList=new ArrayList<String>();
+ final String tempRoot=(tmpDir!=null ? tmpDir : outDir); //Temp files go to tmpDir when configured, else outDir
+ final String inPath=(inPrefix!=null ? tempRoot+inPrefix : "");
+ final String outPath=(outPrefix!=null ? tempRoot+outPrefix : outDir);
+
+// System.err.println("inPre="+inPre+", outPre="+outPre+", outDir="+outDir+", tmpDir="+tmpDir); //123
+
+ {//Quality, length and kmer settings for BBDuk
+ if(minAvgQuality>-1){dukArgList.add("maq="+minAvgQuality+","+minAvgQualityBases);}
+ if(qtrim!=null){
+ dukArgList.add("trimq="+trimq);
+ dukArgList.add("qtrim="+qtrim);
+ }
+ dukArgList.add("overwrite="+overwrite);
+ if(maxNs>=0){dukArgList.add("maxns="+maxNs);}
+ if(minLen>0){dukArgList.add("minlen="+minLen);}
+ if(minLenFraction>0){dukArgList.add("minlenfraction="+minLenFraction);}
+ dukArgList.add("k="+filter_k);
+ dukArgList.add("hdist="+hdist_filter);
+ if(qhdist_filter>0){dukArgList.add("qhdist="+qhdist_filter);}
+ if(pigz!=null){dukArgList.add("pigz="+pigz);}
+ if(unpigz!=null){dukArgList.add("unpigz="+unpigz);}
+ if(zl!=null){dukArgList.add("zl="+zl);}
+
+ if(chastityfilter!=null){dukArgList.add("cf="+chastityfilter);}
+ if(failnobarcode!=null){dukArgList.add("failnobarcode="+failnobarcode);}
+ if(barcodefilter!=null){dukArgList.add("barcodefilter="+barcodefilter);}
+ if(barcodes!=null){dukArgList.add("barcodes="+barcodes);}
+
+ //Forward every argument the constructor did not capture
+ dukArgList.addAll(primaryArgList);
+
+ //Read inputs and outputs; outbad is passed through exactly as given
+ if(in1!=null){dukArgList.add("in1="+inPath+in1);}
+ if(in2!=null){dukArgList.add("in2="+inPath+in2);}
+ if(out1!=null){dukArgList.add("out1="+outPath+out1);}
+ if(out2!=null){dukArgList.add("out2="+outPath+out2);}
+ if(outbad!=null){dukArgList.add("outm="+outbad);}
+
+// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName);} //Old style for 2 log files
+ if(rqcStatsName!=null){dukArgList.add("rqc=hashmap");}
+ if(kmerStatsName!=null){dukArgList.add("outduk="+kmerStatsName);}
+ if(scaffoldStatsName!=null){dukArgList.add("stats="+scaffoldStatsName);}
+
+ dukArgList.add("loglog"); //Cardinality
+ }
+
+ {//Extend the shared bbdukFilterRefs field with the artifact references, then emit it as one argument
+ bbdukFilterRefs.add(doNextera ? mainArtifactFile_noNextera : mainArtifactFile);
+ if(dnaArtifactFlag){
+ bbdukFilterRefs.add(doNextera ? artifactFileDna_noNextera : artifactFileDna);
+ }
+ if(rnaArtifactFlag){
+ bbdukFilterRefs.add(artifactFileRna);
+ }
+
+ if(phixFlag){bbdukFilterRefs.add(phixRef);}
+ if(pjetFlag){bbdukFilterRefs.add(pjetRef);}
+
+ if(libType==FRAG){
+ //no extra references for this library type
+ }else if(libType==LFPE){
+ //no extra references for this library type
+ }else if(libType==CLIP){
+ //no extra references for this library type
+ }else if(libType==CLRS){
+ //no extra references for this library type
+ }else{
+ throw new RuntimeException("Unknown library type.");
+ }
+
+ //Join the non-null references into a single "ref=a,b,..." argument.
+ //Null entries are skipped.
+ final StringBuilder refArg=new StringBuilder();
+ for(String filterRef : bbdukFilterRefs){
+ if(filterRef==null){continue;}
+ refArg.append(refArg.length()>0 ? "," : "ref=").append(filterRef);
+ }
+
+ if(refArg.length()>0){
+ dukArgList.add(refArg.toString());
+ }
+ }
+
+ final String[] bbdukArgs=dukArgList.toArray(new String[dukArgList.size()]);
+
+ if(reproduceName!=null){
+ writeReproduceFile(reproduceName, "bbduk.sh", bbdukArgs);
+ }
+
+ {//Run BBDuk; any exception is logged and terminates the JVM
+ final BBDukF bbduk=new BBDukF(bbdukArgs);
+ try {
+ bbduk.process();
+ } catch (Exception failure) {
+ failure.printStackTrace();
+ log("failed", true);
+ System.exit(1);
+ }
+ }
+
+ //Optionally append files to file list here
+
+ log("filter finish", true);
+ }
+
+ /**
+  * Runs BBDuk to perform:
+  * Ribosomal read removal.
+  * Uses k=31 against the riboKmers reference, forwards every entry of primaryArgList,
+  * and records the command in the reproduce file before running it.
+  * If BBDuk throws, "failed" is logged and the JVM exits with status 1.
+  *
+  * @param in1 Primary input reads file (required)
+  * @param in2 Secondary input reads file
+  * @param out1 Primary output reads file (required)
+  * @param out2 Secondary output reads file
+  * @param outRibo Output for ribosomal reads; passed to BBDuk as "outm" with no prefix added
+  * @param inPrefix Prefix placed before input filenames (after the temp/output directory); null for none
+  * @param outPrefix Prefix placed before output filenames (after the temp/output directory); null writes to outDir
+  * @param stepNum Not used by this method
+  */
+ private void filterRibo(String in1, String in2, String out1, String out2, String outRibo, String inPrefix, String outPrefix,
+         int stepNum){
+
+     log("filter ribo start", true);
+
+     //Intermediate files live in tmpDir when it is set, otherwise in outDir
+     final String workDir=(tmpDir!=null ? tmpDir : outDir);
+     final String inPre;
+     if(inPrefix==null){inPre="";}
+     else{inPre=workDir+inPrefix;}
+     final String outPre;
+     if(outPrefix==null){outPre=outDir;}
+     else{outPre=workDir+outPrefix;}
+
+     final ArrayList<String> dukList=new ArrayList<String>();
+
+     //Kmer-matching parameters
+     dukList.add("k=31");
+     dukList.add("ref="+riboKmers);
+     if(hdist_ribo>0){dukList.add("hdist="+hdist_ribo);}
+     if(edist_ribo>0){dukList.add("edist="+edist_ribo);}
+
+     //Pass along uncaptured arguments
+     dukList.addAll(primaryArgList);
+
+     //Read I/O files
+     if(in1!=null){dukList.add("in1="+inPre+in1);}
+     if(in2!=null){dukList.add("in2="+inPre+in2);}
+     if(out1!=null){dukList.add("out1="+outPre+out1);}
+     if(out2!=null){dukList.add("out2="+outPre+out2);}
+     if(outRibo!=null){dukList.add("outm="+outRibo);}
+
+     //Statistics outputs
+     if(rqcStatsName!=null){dukList.add("rqc=hashmap");}
+     if(kmerStatsName!=null){dukList.add("outduk="+kmerStatsName);}
+     if(scaffoldStatsName!=null){dukList.add("stats="+scaffoldStatsName);}
+
+     final String[] dukargs=dukList.toArray(new String[0]);
+
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, "bbduk.sh", dukargs);
+     }
+
+     //Run BBDuk
+     final BBDukF duk=new BBDukF(dukargs);
+     try {
+         duk.process();
+     } catch (Exception e) {
+         e.printStackTrace();
+         log("failed", true);
+         System.exit(1);
+     }
+
+     //Optionally append files to file list here
+
+     log("filter ribo finish", true);
+ }
+
+
+ /**
+  * Runs SplitNexteraLMP.
+  * Output names are derived from rawName: the processing symbols and a category
+  * (lmp, frag, unknown, singleton) are inserted before the extension, or ".fastq"
+  * is used when rawName has no extension; ".gz" is appended when compress is set.
+  * Sets splitNexteraFlag, records the command in the reproduce file, and appends
+  * the produced filenames to the file list.
+  * If the split throws, "failed" is logged and the JVM exits with status 1.
+  *
+  * @param in1 Primary input reads file (required)
+  * @param in2 Secondary input reads file
+  * @param inPrefix Prefix placed before input filenames (after the temp/output directory); null for none
+  * @param outPrefix Prefix placed before output filenames (after the temp/output directory); null writes to outDir
+  * @param stepNum Not used by this method
+  */
+ private void splitNextera(String in1, String in2, String inPrefix, String outPrefix, int stepNum){
+
+     log("splitNextera start", true);
+     splitNexteraFlag=true;
+
+     final String workDir=(tmpDir!=null ? tmpDir : outDir);
+     final String inPre=(inPrefix!=null ? workDir+inPrefix : "");
+     final String outPre=(outPrefix!=null ? workDir+outPrefix : outDir);
+
+     final String statsName=outPre+nexteraStats;
+
+     //Split rawName into stem and extension; default the extension to .fastq
+     final int dot=rawName.lastIndexOf('.');
+     final String stem, ext;
+     if(dot>-1){
+         stem=rawName.substring(0, dot);
+         ext=rawName.substring(dot);
+     }else{
+         stem=rawName;
+         ext=".fastq";
+     }
+     final String head=outPre+stem+"."+symbols;
+     final String tail=ext+(compress ? ".gz" : "");
+
+     final String lmpName=head+".lmp"+tail;
+     final String fragName=head+".frag"+tail;
+     final String unknownName=head+".unknown"+tail;
+     final String singletonName=head+".singleton"+tail;
+
+     final ArrayList<String> splitList=new ArrayList<String>();
+
+     //Fill list with Nextera arguments
+     splitList.add("mask");
+     splitList.add("ow="+overwrite);
+     if(minLen>0){splitList.add("minlen="+minLen);}
+     if(pigz!=null){splitList.add("pigz="+pigz);}
+     if(unpigz!=null){splitList.add("unpigz="+unpigz);}
+     if(zl!=null){splitList.add("zl="+zl);}
+
+     //Read I/O files
+     if(in1!=null){splitList.add("in1="+inPre+in1);}
+     if(in2!=null){splitList.add("in2="+inPre+in2);}
+
+     splitList.add("out="+lmpName);
+     splitList.add("outu="+unknownName);
+     splitList.add("outf="+fragName);
+     splitList.add("outs="+singletonName);
+     splitList.add("stats="+statsName);
+
+     final String[] splitargs=splitList.toArray(new String[0]);
+
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, "splitnextera.sh", splitargs);
+     }
+
+     //Run SplitNexteraLMP
+     final SplitNexteraLMP split=new SplitNexteraLMP(splitargs);
+     try {
+         split.process();
+     } catch (Exception e) {
+         e.printStackTrace();
+         log("failed", true);
+         System.exit(1);
+     }
+
+     //Record the produced files
+     if(fileListName!=null){
+         final StringBuilder sb=new StringBuilder();
+         sb.append("lmp=").append(lmpName).append('\n');
+         sb.append("frag=").append(fragName).append('\n');
+         sb.append("unknown=").append(unknownName).append('\n');
+         sb.append("singleton=").append(singletonName).append('\n');
+         sb.append("nexterastats=").append(statsName).append('\n');
+         ReadWrite.writeString(sb, fileListName, true);
+     }
+
+     log("splitNextera finish", true);
+ }
+
+ /**
+  * Runs BBMap to perform:
+  * Human contaminant removal.
+  * When catDogHuman or mouseCatDogHuman is set, BBSplitter is run against the
+  * corresponding combined index instead of BBMap.  Unmapped reads are written to
+  * out1/out2 (as outu1/outu2).  Afterwards the loaded index is released,
+  * RefToIndex.NODISK is reset to false, and the command is recorded in the reproduce file.
+  * If mapping throws, "failed" is logged and the JVM exits with status 1.
+  *
+  * @param in1 Primary input reads file (required)
+  * @param in2 Secondary input reads file
+  * @param out1 Primary output reads file (required)
+  * @param out2 Secondary output reads file
+  * @param outbad Output for mapped reads; passed as "outm" with no prefix added
+  * @param inPrefix Prefix placed before input filenames (after the temp/output directory); null for none
+  * @param outPrefix Prefix placed before output filenames (after the temp/output directory); null writes to outDir
+  * @param stepNum Not used by this method
+  * @param catDogHuman Map against the cat/dog/human index with BBSplitter
+  * @param mouseCatDogHuman Map against the mouse/cat/dog/human index with BBSplitter; takes precedence over catDogHuman
+  * @param aggressive Use the looser alignment settings (minratio=.75, maxindel=8, ...)
+  */
+ private void dehumanize(String in1, String in2, String out1, String out2, String outbad, String inPrefix, String outPrefix,
+         int stepNum, boolean catDogHuman, boolean mouseCatDogHuman, boolean aggressive){
+
+     log("dehumanize start", true);
+
+     final String workDir=(tmpDir!=null ? tmpDir : outDir);
+     final String inPre=(inPrefix!=null ? workDir+inPrefix : "");
+     final String outPre=(outPrefix!=null ? workDir+outPrefix : outDir);
+
+     final boolean useSplitter=(catDogHuman || mouseCatDogHuman);
+
+     final ArrayList<String> mapList=new ArrayList<String>();
+
+     //Settings shared by both sensitivity modes
+     mapList.add("k="+map_k);
+     mapList.add("idtag=t");
+     mapList.add("usemodulo");
+     mapList.add("printunmappedcount");
+     mapList.add("ow="+overwrite);
+     final String[] common={"qtrim=r", "trimq=10", "untrim", "kfilter=25", "maxsites=1", "tipsearch=0"};
+     for(String c : common){mapList.add(c);}
+
+     //Sensitivity-dependent settings
+     final String[] tuned;
+     if(aggressive){
+         tuned=new String[] {"minratio=.75", "maxindel=8", "minhits=1", "bw=26", "bwr=0.22", "build=2"};
+     }else{
+         tuned=new String[] {"minratio=.9", "maxindel=3", "minhits=2", "bw=12", "bwr=0.16", "fast=true", "maxsites2=10"};
+     }
+     for(String t : tuned){mapList.add(t);}
+
+     if(outbad!=null){mapList.add("outm="+outbad);}
+
+     //Reference selection
+     if(useSplitter){
+         if(mouseCatDogHuman){mapList.add("path="+mouseCatDogHumanPath);}
+         else{mapList.add("path="+catDogHumanPath);}
+         if(refStatsName!=null){mapList.add("refstats="+refStatsName);}
+     }else if(humanRef!=null){
+         //An explicit reference is indexed in memory only
+         RefToIndex.NODISK=true;
+         mapList.add("ref="+humanRef);
+     }else{
+         mapList.add("path="+humanPath);
+     }
+
+     if(pigz!=null){mapList.add("pigz="+pigz);}
+     if(unpigz!=null){mapList.add("unpigz="+unpigz);}
+     if(zl!=null){mapList.add("zl="+zl);}
+
+     //Read I/O files; clean reads are the unmapped ones
+     if(in1!=null){mapList.add("in1="+inPre+in1);}
+     if(in2!=null){mapList.add("in2="+inPre+in2);}
+     if(out1!=null){mapList.add("outu1="+outPre+out1);}
+     if(out2!=null){mapList.add("outu2="+outPre+out2);}
+
+     final String[] args=mapList.toArray(new String[0]);
+
+     //Run BBSplitter or BBMap
+     try {
+         if(useSplitter){
+             BBSplitter.main(args);
+         }else{
+             BBMap.main(args);
+         }
+     } catch (Exception e) {
+         e.printStackTrace();
+         log("failed", true);
+         System.exit(1);
+     }
+
+     //Clear the index
+     Data.unloadAll();
+
+     //Unset NODISK
+     RefToIndex.NODISK=false;
+
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, "bbmap.sh", args);
+     }
+
+     //Optionally append files to file list here
+
+     log("dehumanize finish", true);
+ }
+
+ /**
+ * Runs FilterByTaxa to remove sequences from a reference.
+ */
+ private String taxFilter(String in){
+ if(taxList==null){
+// System.err.println("*Returning "+in);
+ return in;
+ }
+ log("taxFilter start", true);
+
+ String temp=(tmpDir==null ? outDir : tmpDir)+taxaPrefix+"taxa.fa.gz";
+ ArrayList<String> argList=new ArrayList<String>();
+
+ {
+ argList.add("names="+taxList);
+ argList.add("include=f");
+ argList.add("tree="+taxTree);
+ argList.add("level="+taxLevel);
+ argList.add("in="+in);
+ argList.add("out="+temp);
+ argList.add("ow="+overwrite);
+ }
+
+ if(loadGiTable){
+ GiToNcbi.initialize(giTable);
+ }
+
+ String[] args=argList.toArray(new String[0]);
+ FilterByTaxa fbt=new FilterByTaxa(args);
+ fbt.process(new Timer());
+ GiToNcbi.unload();
+
+ if(reproduceName!=null){
+ writeReproduceFile(reproduceName, "filterbytaxa.sh", args);
+ }
+
+ log("taxFilter finish", true);
+
+// System.err.println("*Returning "+(fbt.basesOut>0 ? temp : null));
+
+ return fbt.basesOut>0 ? temp : null;
+ }
+
+ /**
+  * Runs BBMap to perform:
+  * Microbial contaminant removal.
+  * When ref is null (the tax filter removed every reference sequence) mapping is
+  * skipped and the input files are simply renamed to the output names.
+  * Otherwise reads are mapped against ref; unmapped reads go to out1/out2 and
+  * mapped reads to outbad.  Afterwards the index is released, RefToIndex.NODISK is
+  * reset, the command is recorded, and ref is deleted unless it is commonMicrobesRef.
+  * Any failure (including a failed rename) logs "failed" and exits with status 1.
+  *
+  * @param in1 Primary input reads file (required)
+  * @param in2 Secondary input reads file
+  * @param out1 Primary output reads file (required)
+  * @param out2 Secondary output reads file
+  * @param outbad Output for mapped reads; passed as "outm" with no prefix added
+  * @param scafstats Per-scaffold statistics output; passed as "scafstats"
+  * @param inPrefix Prefix placed before input filenames (after the temp/output directory); null for none
+  * @param outPrefix Prefix placed before output filenames (after the temp/output directory); null writes to outDir
+  * @param ref Reference to map against, or null to skip mapping
+  * @param stepNum Not used by this method
+  * @param aggressive Use the looser alignment settings (minratio=.75, maxindel=8, ...)
+  */
+ private void removeCommonMicrobes(String in1, String in2, String out1, String out2, String outbad, String scafstats, String inPrefix, String outPrefix,
+         final String ref, int stepNum, boolean aggressive){
+
+     log("removeCommonMicrobes start", true);
+
+     final String inPre=(inPrefix==null ? "" : (tmpDir==null ? outDir : tmpDir)+inPrefix);
+     final String outPre=(outPrefix==null ? outDir : (tmpDir==null ? outDir : tmpDir)+outPrefix);
+
+     if(ref==null){
+         String skipped="Tax filter removed all ref sequences; skipping microbe removal.";
+         System.err.println(skipped);
+         log(skipped, true);
+
+         try {
+             //Both names must be present; otherwise the target would end in "null"
+             if(in1!=null && out1!=null){renameForSkippedStep(inPre+in1, outPre+out1);}
+             if(in2!=null && out2!=null){renameForSkippedStep(inPre+in2, outPre+out2);}
+         } catch (Throwable e) {
+             System.err.println(e.getMessage());
+             e.printStackTrace();
+             log("failed", true);
+             System.exit(1);
+         }
+         log("removeCommonMicrobes finish", true);
+         return;
+     }
+
+     ArrayList<String> argList=new ArrayList<String>();
+     {
+         argList.add("quickmatch");
+         argList.add("k="+map_k);
+         argList.add("idtag=t");
+         argList.add("usemodulo");
+         argList.add("printunmappedcount");
+         argList.add("ow="+overwrite);
+         argList.add("qtrim=rl");
+         argList.add("trimq=10");
+         argList.add("untrim");
+         //ref is non-null here, so call equals on it; commonMicrobesRef may be null
+         if(commonMicrobesPath!=null && ref.equals(commonMicrobesRef) && ref.startsWith(commonMicrobesPath)){
+             //The default reference has a prebuilt index on disk
+             RefToIndex.NODISK=false;
+             argList.add("path="+commonMicrobesPath);
+         }else{
+             RefToIndex.NODISK=true;
+             argList.add("ref="+ref);
+         }
+         if(pigz!=null){argList.add("pigz="+pigz);}
+         if(unpigz!=null){argList.add("unpigz="+unpigz);}
+         if(zl!=null){argList.add("zl="+zl);}
+
+         if(aggressive){
+             argList.add("minratio=.75");
+             argList.add("maxindel=8");
+             argList.add("minhits="+1);
+             argList.add("bw=26");
+             argList.add("bwr=0.22");
+             argList.add("build=2");
+             argList.add("tipsearch="+2);
+         }else{
+             argList.add("minratio=.9");
+             argList.add("maxindel=3");
+             argList.add("minhits="+2);
+             argList.add("bw=12");
+             argList.add("bwr=0.16");
+             argList.add("fast="+true);
+             argList.add("maxsites2=10");
+             argList.add("tipsearch="+0);
+         }
+
+         //Set read I/O files
+         if(in1!=null){argList.add("in1="+inPre+in1);}
+         if(in2!=null){argList.add("in2="+inPre+in2);}
+         if(out1!=null){argList.add("outu1="+outPre+out1);}
+         if(out2!=null){argList.add("outu2="+outPre+out2);}
+         if(outbad!=null){argList.add("outm="+outbad);}
+         if(scafstats!=null){argList.add("scafstats="+scafstats);}
+     }
+
+     String[] args=argList.toArray(new String[0]);
+
+     {//Run BBMap
+         try {
+             BBMap.main(args);
+         } catch (Exception e) {
+             e.printStackTrace();
+             log("failed", true);
+             System.exit(1);
+         }
+     }
+
+     //Clear the index
+     Data.unloadAll();
+
+     //Unset NODISK
+     RefToIndex.NODISK=false;
+
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, "bbmap.sh", args);
+     }
+
+     //A ref other than the default is removed after use
+     if(!ref.equals(commonMicrobesRef)){delete(null, ref);}
+
+     log("removeCommonMicrobes finish", true);
+ }
+
+ /**
+  * Renames one file when microbe removal is skipped, and records the move in the reproduce file.
+  * @param from Existing file path
+  * @param to Destination path
+  * @throws RuntimeException if the rename does not succeed
+  */
+ private void renameForSkippedStep(String from, String to){
+     File a=new File(from);
+     File b=new File(to);
+     System.err.println("Renaming "+a+" to "+b);
+     assert(a.exists()) : a;
+     assert(!b.exists() || overwrite) : b;
+     //File.renameTo reports failure only through its return value
+     if(!a.renameTo(b)){
+         throw new RuntimeException("Failed to rename "+a+" to "+b);
+     }
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, "mv", new String[] {a.toString(), b.toString()});
+     }
+ }
+
+ /**
+  * Runs BBMap to perform:
+  * Arbitrary contaminant removal.
+  * A ref beginning with "path=" is passed through as-is; any other ref is passed as
+  * "ref=" with RefToIndex.NODISK enabled.  Unmapped reads go to out1/out2.
+  * Afterwards the index is released, NODISK is reset, and the command is recorded.
+  * If mapping throws, "failed" is logged and the JVM exits with status 1.
+  *
+  * @param in1 Primary input reads file (required)
+  * @param in2 Secondary input reads file
+  * @param out1 Primary output reads file (required)
+  * @param out2 Secondary output reads file
+  * @param outbad Output for mapped reads; passed as "outm" with no prefix added
+  * @param scafstats Per-scaffold statistics output; passed as "scafstats"
+  * @param inPrefix Prefix placed before input filenames (after the temp/output directory); null for none
+  * @param outPrefix Prefix placed before output filenames (after the temp/output directory); null writes to outDir
+  * @param ref Reference to map against (required)
+  * @param stepNum Not used by this method
+  */
+ private void decontamByMapping(String in1, String in2, String out1, String out2, String outbad, String scafstats, String inPrefix, String outPrefix,
+         String ref, int stepNum){
+
+     log("decontamByMapping_"+ref+" start", true);
+     assert(ref!=null) : "Reference was null.";
+
+     final String workDir=(tmpDir!=null ? tmpDir : outDir);
+     final String inPre=(inPrefix!=null ? workDir+inPrefix : "");
+     final String outPre=(outPrefix!=null ? workDir+outPrefix : outDir);
+
+     final ArrayList<String> mapList=new ArrayList<String>();
+
+     //Fixed alignment settings
+     final String[] leading={"minratio=.9", "maxindel=3", "fast=true", "minhits=2", "tipsearch=4", "bw=12", "bwr=0.16", "quickmatch"};
+     for(String a : leading){mapList.add(a);}
+     mapList.add("k="+map_k);
+     mapList.add("idtag=t");
+     mapList.add("printunmappedcount");
+     mapList.add("ow="+overwrite);
+     mapList.add("qtrim=rl");
+     mapList.add("trimq=10");
+     mapList.add("untrim");
+
+     //Reference: either a prebuilt index path or a file indexed in memory
+     final boolean prebuilt=ref.startsWith("path=");
+     if(!prebuilt){
+         RefToIndex.NODISK=true;
+         mapList.add("ref="+ref);
+     }else{
+         mapList.add(ref);
+     }
+
+     if(pigz!=null){mapList.add("pigz="+pigz);}
+     if(unpigz!=null){mapList.add("unpigz="+unpigz);}
+     if(zl!=null){mapList.add("zl="+zl);}
+
+     //Read I/O files; clean reads are the unmapped ones
+     if(in1!=null){mapList.add("in1="+inPre+in1);}
+     if(in2!=null){mapList.add("in2="+inPre+in2);}
+     if(out1!=null){mapList.add("outu1="+outPre+out1);}
+     if(out2!=null){mapList.add("outu2="+outPre+out2);}
+     if(outbad!=null){mapList.add("outm="+outbad);}
+     if(scafstats!=null){mapList.add("scafstats="+scafstats);}
+
+     final String[] args=mapList.toArray(new String[0]);
+
+     //Run BBMap
+     try {
+         BBMap.main(args);
+     } catch (Exception e) {
+         e.printStackTrace();
+         log("failed", true);
+         System.exit(1);
+     }
+
+     //Clear the index
+     Data.unloadAll();
+
+     //Unset NODISK
+     RefToIndex.NODISK=false;
+
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, "bbmap.sh", args);
+     }
+
+     //Optionally append files to file list here
+
+     log("decontamByMapping_"+ref+" finish", true);
+ }
+
+
+ /**
+  * Runs BBMerge to generate an insert size histogram.
+  * No merged reads are requested; only the histogram (ihist) and cardinality (outc)
+  * outputs are set.  With extendFlag, error-correction and extension options are added
+  * and a garbage collection is requested first.
+  * If BBMerge throws, "failed" is logged and the JVM exits with status 1.
+  *
+  * @param in1 Primary input reads file (required)
+  * @param in2 Secondary input reads file
+  * @param prefix Prefix placed before input filenames (after the temp/output directory); null reads from outDir
+  */
+ private void merge(String in1, String in2, String prefix){
+
+     log("merge start", true);
+
+     final String inPre;
+     if(prefix==null){
+         inPre=outDir;
+     }else{
+         inPre=(tmpDir!=null ? tmpDir : outDir)+prefix;
+     }
+
+     final ArrayList<String> mergeList=new ArrayList<String>();
+
+     //Strictness keyword is passed bare, e.g. "loose"
+     if(mergeStrictness!=null){mergeList.add(mergeStrictness);}
+     mergeList.add("overwrite="+overwrite);
+
+     //Read input files
+     if(in1!=null){mergeList.add("in1="+inPre+in1);}
+     if(in2!=null){mergeList.add("in2="+inPre+in2);}
+
+     if(ihistName!=null){mergeList.add("ihist="+ihistName);}
+     if(cardinalityName!=null){mergeList.add("outc="+cardinalityName);}
+     if(pigz!=null){mergeList.add("pigz="+pigz);}
+     if(unpigz!=null){mergeList.add("unpigz="+unpigz);}
+     if(zl!=null){mergeList.add("zl="+zl);}
+
+     if(extendFlag){
+         final String[] extendOpts={"ecct", "extend2=20", "iterations=10", "prefilter", "prealloc"};
+         for(String opt : extendOpts){mergeList.add(opt);}
+         System.gc();
+     }
+
+     final String[] mergeargs=mergeList.toArray(new String[0]);
+
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, "bbmerge.sh", mergeargs);
+     }
+
+     //Run BBMerge
+     final BBMerge merger=new BBMerge(mergeargs);
+     try {
+         merger.process();
+     } catch (Exception e) {
+         e.printStackTrace();
+         log("failed", true);
+         System.exit(1);
+     }
+
+     //Optionally append files to file list here
+
+     log("merge finish", true);
+ }
+
+
+ /**
+  * Runs BBNorm or KmerCountExact to generate a kmer frequency histogram.
+  * The choice depends on LogLog.lastCardinality versus the estimated kmer capacity
+  * of memory: KmerNormalize is used when the cardinality is unknown (&lt;1) or exceeds
+  * two thirds of capacity, KmerCountExact otherwise.
+  * If the tool throws, "failed" is logged and the JVM exits with status 1.
+  *
+  * @param in1 Primary input reads file (required)
+  * @param in2 Secondary input reads file
+  * @param prefix Prefix placed before input filenames (after the temp/output directory); null reads from outDir
+  */
+ private void khist(String in1, String in2, String prefix){
+
+     log("khist start", true);
+
+     final String inPre;
+     if(prefix==null){
+         inPre=outDir;
+     }else{
+         inPre=(tmpDir!=null ? tmpDir : outDir)+prefix;
+     }
+
+     final long cardinality=LogLog.lastCardinality;
+     final long capacity=kmerCapacity(12, true);
+     System.err.println("cardinality="+cardinality+", capacity="+capacity);
+
+     //Too many kmers for exact counts; use BBNorm
+     final boolean approximate=(cardinality<1 || cardinality*1.5>capacity);
+
+     final ArrayList<String> histList=new ArrayList<String>();
+
+     //Arguments common to both tools
+     histList.add("overwrite="+overwrite);
+     if(in1!=null){histList.add("in1="+inPre+in1);}
+     if(in2!=null){histList.add("in2="+inPre+in2);}
+     if(khistName!=null){histList.add("khist="+khistName);}
+     if(peaksName!=null){histList.add("peaks="+peaksName);}
+     if(unpigz!=null){histList.add("unpigz="+unpigz);}
+
+     final String script;
+     if(approximate){
+         //BBNorm-specific arguments
+         final String[] normOpts={"keepall", "prefilter", "passes=1", "bits=16", "minprob=0", "minqual=0", "histcolumns=2"};
+         for(String opt : normOpts){histList.add(opt);}
+         script="khist.sh";
+     }else{
+         //KmerCountExact-specific arguments
+         if(cardinality*4>capacity){
+             histList.add("prealloc");
+         }
+         script="kmercountexact.sh";
+     }
+
+     final String[] khistargs=histList.toArray(new String[0]);
+
+     if(reproduceName!=null){
+         writeReproduceFile(reproduceName, script, khistargs);
+     }
+
+     //Run KmerNormalize or KmerCountExact
+     try {
+         if(approximate){
+             KmerNormalize.main(khistargs);
+         }else{
+             KmerCountExact.main(khistargs);
+         }
+     } catch (Exception e) {
+         e.printStackTrace();
+         log("failed", true);
+         System.exit(1);
+     }
+
+     //Optionally append files to file list here
+
+     log("khist finish", true);
+ }
+
+ private long kmerCapacity(int bytesPerKmer, boolean prealloc){
+ System.gc();
+ long memory=Runtime.getRuntime().maxMemory();
+ double xmsRatio=Shared.xmsRatio();
+ long usableMemory=(long)Tools.max(((memory-96000000)*(xmsRatio>0.97 ? 0.82 : 0.75)), memory*0.45);
+ long tableMemory=(long)(usableMemory*.95);
+ long estimatedKmerCapacity=(long)((tableMemory*1.0/bytesPerKmer)*(prealloc ? 0.9 : 0.6));
+ return estimatedKmerCapacity;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Log a message in the log file
+ * @param message Message to log
+ * @param append True to append, false to overwrite
+ */
+ private void log(String message, boolean append){
+ if(logName!=null){
+ ReadWrite.writeString(message+", "+timeString()+"\n", logName, append);
+ }
+ }
+
+
+ /**
+  * Delete all non-null filenames.
+  * Each existing file is deleted and an "rm" line is recorded in the reproduce file
+  * (when one is configured).  A deletion that fails is reported on stderr rather than
+  * passing silently.  Start and finish are written to the status log.
+  * @param prefix Prefix placed before filenames (after the temp/output directory); null to use the names as given
+  * @param names Filenames to delete
+  */
+ private void delete(String prefix, String...names){
+     log("delete temp files start", true);
+     if(names!=null){
+         final String pre=(prefix==null ? "" : (tmpDir==null ? outDir : tmpDir)+prefix);
+         for(String name : names){
+             if(name==null){continue;}
+             final String path=pre+name;
+             if(verbose){System.err.println("Trying to delete "+path);}
+             final File f=new File(path);
+             if(f.exists()){
+                 //File.delete reports failure only through its return value
+                 if(!f.delete()){
+                     System.err.println("Warning: failed to delete "+path);
+                 }
+                 //Same guard as every other reproduce-file write in this class
+                 if(reproduceName!=null){
+                     writeReproduceFile(reproduceName, "rm", new String[] {path});
+                 }
+             }
+         }
+     }
+     log("delete temp files finish", true);
+ }
+
+ /**
+ * @return String of symbols indicating which processes were applied to the input reads
+ */
+ private String abbreviation(){
+ StringBuilder sb=new StringBuilder();
+
+ if(mainArtifactFile!=null || (rnaArtifactFlag && artifactFileRna!=null) || (dnaArtifactFlag && artifactFileDna!=null)){sb.append("a");}
+
+ if(maxNs>=0){sb.append("n");}
+// if(qtrim!=null && !qtrim.equalsIgnoreCase("f") && !qtrim.equalsIgnoreCase("false")){sb.append("q");}
+ if(minAvgQuality>0){sb.append("q");}
+
+ if(rnaArtifactFlag){sb.append("r");}
+ if(dnaArtifactFlag){sb.append("d");}
+
+ if(libType==CLIP){sb.append("c");}
+ else if(libType==LFPE){sb.append("l");}
+ else if(libType==CLRS){sb.append("s");}
+
+ if(phixFlag){sb.append("p");}
+ if(humanFlag || catDogHumanFlag || mouseCatDogHumanFlag){sb.append("h");}
+
+// if(ktrimFlag){sb.append("k");}
+
+// if(doTrim){sb.append("k");}
+// if(qtrimFlag){sb.append("t");}
+
+ if(doTrim || qtrimFlag){sb.append("t");}
+
+ return sb.toString();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Formats the current time as "yyyy-MM-dd HH:mm:ss" in the JVM's default time zone.
+  * TODO: Some machines are set to UTC rather than PST
+  * @return Timestamp in RQC's format
+  */
+ public static String timeString(){
+     final SimpleDateFormat formatter=new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+     final TimeZone zone=TimeZone.getDefault();
+     formatter.setTimeZone(zone);
+     final Date now=new Date();
+     return formatter.format(now);
+ }
+
+ /**
+  * Strips the directories, leaving only a filename.
+  * Both '/' and '\' are treated as directory separators.
+  * @param fname Path to strip; may be null
+  * @return Everything after the last separator, the input itself when it contains
+  * no separator, or null for null input
+  */
+ public static String stripDirs(String fname){
+     if(fname==null){return null;}
+     //The last separator of either kind marks the start of the filename
+     final int lastSlash=fname.lastIndexOf('/');
+     final int lastBackslash=fname.lastIndexOf('\\');
+     final int cut=(lastSlash>lastBackslash ? lastSlash : lastBackslash);
+     if(cut<0){return fname;}
+     return fname.substring(cut+1);
+ }
+
+ /**
+ * Set permissions on these files to 777
+ * @param names List of filenames
+ */
+ private static void setPermissions(String...names){
+ if(names==null){return;}
+ for(String name : names){
+ if(name!=null && name.trim().length()>0 && new File(name).exists()){
+ ReadWrite.setPermissions(name, true, true, true, false);
+ }
+ }
+ }
+
+ /**
+  * Writes a single command to the reproduce file.
+  * The line is the command followed by its arguments, space-separated, and is
+  * appended to the file.  Nothing is written when fname is null, so callers that
+  * do not check for a disabled reproduce file are still safe.
+  * @param fname Filename to write, including path; null disables writing
+  * @param command Command to add to file
+  * @param args Arguments to the command; may be null
+  */
+ private static void writeReproduceFile(String fname, String command, String[] args){
+     //Not every call site checks reproduceName before calling
+     if(fname==null){return;}
+     StringBuilder sb=new StringBuilder();
+     sb.append(command);
+     if(args!=null){
+         for(String s : args){
+             sb.append(' ').append(s);
+         }
+     }
+     sb.append('\n');
+     ReadWrite.writeString(sb, fname, true);
+ }
+
+ /**
+ * Writes the header for the reproduce file
+ * @param fname Filename to write, including path
+ * @param command Command to add to file
+ * @param args Arguments to the command
+ * @param overwrite Permission to overwrite
+ */
+ private static void writeReproduceHeader(String fname, String[] args, boolean overwrite){
+ StringBuilder sb=new StringBuilder();
+ boolean b=Tools.canWrite(fname, overwrite);
+ assert(b) : "Can't write to "+fname;
+ sb.append("#!/bin/bash\n");
+ sb.append("#BBTools version "+Shared.BBMAP_VERSION_STRING+"\n");
+ sb.append("#The steps below recapitulate the output of RQCFilter when run like this:\n");
+ sb.append("#rqcfilter.sh");
+ if(args!=null){
+ for(String s : args){
+ sb.append(' ').append(s);
+ }
+ }
+ sb.append('\n');
+ sb.append('\n');
+ ReadWrite.writeString(sb, fname, false);
+ }
+
+ /**
+  * Converts a library type name to its numeric code.
+  * Matching ignores case and surrounding whitespace.  Case folding uses the root
+  * locale so that names such as "CLIP" are recognized under every default locale.
+  * @param s String representation of library type; null means FRAG
+  * @return Numeric code for library type
+  * @throws RuntimeException if the name is not recognized
+  */
+ private static int toLibType(String s){
+     if(s==null){return FRAG;}
+     //Locale-independent lowercasing; the default-locale form maps 'I' to dotless i in Turkish
+     s=s.trim().toLowerCase(java.util.Locale.ROOT);
+     if(s.equals("lfpe")){return LFPE;}
+     if(s.equals("clip")){return CLIP;}
+     if(s.equals("clrs")){return CLRS;}
+     if(s.equals("frag") || s.equals("fragment")){return FRAG;}
+     throw new RuntimeException("Unknown library type "+s);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Synthetic contaminant filtering */
+ private final boolean doFilter;
+ /** Adapter-trimming */
+ private final boolean doTrim;
+ /** Run BBMerge for insert size calculation */
+ private final boolean doMerge;
+ /** Run KmerNormalize for kmer histogram generation */
+ private boolean doKhist=false;
+ /** Do NexteraLMP splitting */
+ private final boolean doNextera;
+
+ /** Symbols to insert in output filename to denote operations performed */
+ private final String symbols;
+
+ /** Name of raw input file, minus directory and file extension */
+ private final String rawName;
+
+ /** Type of library; controls processing methods and references to use */
+ private int libType=FRAG;
+ /** True to filter rna artifacts */
+ private boolean rnaArtifactFlag=false;
+ /** True to filter dna artifacts */
+ private boolean dnaArtifactFlag=true;
+ /** True if phix should be filtered out */
+ private boolean phixFlag=true;
+ /** True if pjet should be filtered out */
+ private boolean pjetFlag=true;
+
+ /** Enables tbo during adapter trimming */
+ private boolean tboFlag=true;
+ /** Enables tpe during adapter trimming */
+ private boolean tpeFlag=true;
+
+ /** Unused */
+ private String jointSeq=null;
+ /** Toss reads shorter than this */
+ private int minLen=25;
+ /** Toss reads shorter than this fraction of initial length, after trimming */
+ private float minLenFraction=0.333f;
+ /** Trim bases at this quality or below */
+ private byte trimq=10;
+ /** Throw away reads below this average quality before trimming. Default: 5 */
+ private byte minAvgQuality=5;
+ /** If positive, calculate the average quality from the first X bases. */
+ private int minAvgQualityBases=0;
+ /** Trim reads to be equal to 0 modulo this value. Mainly for 151, 251, and 301bp runs. */
+ private int forceTrimModulo=5;
+ /** Quality-trimming mode */
+ private String qtrim="f";//"rl";
+ /** Kmer-trimming mode */
+ private String ktrim="r";
+ /** Kmer length to use for filtering */
+ private int filter_k=27;
+ /** Kmer length to use for trimming */
+ private int trim_k=23;
+ /** Kmer length to use for normalization and error-correction */
+ private int normalize_k=31;
+ /** Kmer length to use for mapping */
+ private int map_k=14;
+ /** Shortest kmer to use for trimming */
+ private int mink=11;
+ /** Throw away reads containing more than this many Ns. Default: 0 (toss reads with any Ns) */
+ private int maxNs=0;
+ /** Use this Hamming distance when kmer filtering */
+ private int hdist_filter=1;
+ /** Use this query Hamming distance when kmer filtering */
+ private int qhdist_filter=0;
+ /** Use this Hamming distance when kmer trimming */
+ private int hdist_trim=1;
+ /** Use this Hamming distance when kmer trimming with short kmers */
+ private int hdist2_trim=-1;
+ /** Use this Hamming distance when kmer filtering ribosomal reads; passed to BBDuk as "hdist" only when positive */
+ private int hdist_ribo=0;
+ /** Passed to BBDuk as "edist" (presumably edit distance) when filtering ribosomal reads, only when positive */
+ private int edist_ribo=0;
+
+ /** Merge strictness: strict, normal, loose, vloose */
+ private String mergeStrictness="loose";
+
+ /** Trim Truseq and Nextera adapters from right side of reads */
+ private boolean fragAdapterFlag=false;
+ /** Trim Truseq-RNA adapters from right side of reads */
+ private boolean rnaAdapterFlag=false;
+
+ /** Performed quality-trimming on reads */
+ private boolean qtrimFlag=false;
+ /** Performed kmer-trimming on reads */
+ private boolean ktrimFlag=false;
+ /** Performed nextera splitting on reads */
+ private boolean splitNexteraFlag=false;
+ /** Remove reads mapping to human with high identity */
+ private boolean humanFlag=false;
+ /** Remove reads mapping to dog with high identity */
+ private boolean dogFlag=false;
+ /** Remove reads mapping to cat with high identity */
+ private boolean catFlag=false;
+ /** Remove reads mapping to mouse with high identity */
+ private boolean mouseFlag=false;
+ /** Remove cat, dog, and human reads at the same time with BBSplit. */
+ private boolean catDogHumanFlag=false;
+ /** Remove mouse, cat, dog, and human reads at the same time with BBSplit. */
+ private boolean mouseCatDogHumanFlag=false;
+ /** Perform cat, dog, mouse, human, and microbe removal aggressively, using unmasked genomes. */
+ private boolean aggressiveMappingFlag=false;
+ /** Remove ribosomal reads */
+ private boolean riboFlag=false;
+ /** Remove reads from common microbial contaminants with BBMap */
+ private boolean commonMicrobeFlag=false;
+ /** Extend reads to merge longer inserts */
+ private boolean extendFlag=false;
+ /** Estimate kmer cardinality */
+ private boolean doCardinality=true;
+
+ /** Print extra messages to stderr, such as each file deletion attempt */
+ private boolean verbose=false;
+ /** Passed to each tool as its overwrite permission ("ow=" / "overwrite=") */
+ private boolean overwrite=true;
+ /** Append ".gz" to the names of read files produced by Nextera splitting */
+ private boolean compress=true;
+
+ /** Write temp files to $TMPDIR (localdisk) */
+ private boolean writeTempToTmpdir=true;
+
+ /** Captures the command line "pigz" flag */
+ private String pigz="t";
+ /** Captures the command line "unpigz" flag */
+ private String unpigz="t";
+ /** Captures the command line "zl" flag */
+ private String zl;
+
+ /** Mode for processing chastity flag in Illumina read names */
+ private String chastityfilter="t";
+ /** Consider the absence of a barcode to mean failure */
+ private String failnobarcode=null;
+ /** May be set to true, false, or crash to determine how to handle reads with no barcode */
+ private String barcodefilter="crash";
+ /** An optional list of literal barcodes that are allowed */
+ private String barcodes=null;
+
+ /** Arguments to pass to BBDuk */
+ private ArrayList<String> primaryArgList=new ArrayList<String>();
+ /** References to pass to BBDuk for artifact removal */
+ private ArrayList<String> bbdukFilterRefs=new ArrayList<String>();
+ /** References to pass to BBMap for contaminant removal */
+ private ArrayList<String> mappingRefs=new ArrayList<String>();
+
+ /** List of taxa to NOT map against */
+ private String taxList=null;
+ /** Taxonomic level for filtering */
+ private String taxLevel="order";
+ /** Only needed if there are gi numbers in the references */
+ private boolean loadGiTable=false;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Read Data Files ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** NOTE(review): presumably a per-run token mixed into temp filenames to keep them unique -- confirm in the constructor */
+ private final String tempSalt;
+
+ //Filename prefixes for intermediate files; each is placed after the temp (or output) directory.
+ //NOTE(review): only taxaPrefix is used in the visible methods; the stage names of the others are inferred -- confirm.
+ private final String trimPrefix;
+ private final String humanPrefix;
+ private final String filterPrefix;
+ /** Prefix of the temporary tax-filtered reference ("taxa.fa.gz") */
+ private final String taxaPrefix;
+ private final String microbePrefix;
+ private final String riboPrefix;
+ private final String[] mappingPrefix;
+
+ /** Directory in which to write all files */
+ private String outDir="";
+
+ /** Directory in which to write all temp files */
+ private String tmpDir=Shared.TMPDIR;
+
+ /** Primary input reads file (required) */
+ private String in1=null;
+ /** Secondary input reads file */
+ private String in2=null;
+ /** Primary output reads file (required) */
+ private String out1=null;
+ /** Secondary output reads file */
+ private String out2=null;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Separated Reads ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String riboOutFile="ribo.fq.gz";
+ private String humanOutFile="human.fq.gz";
+ private String synthOutFile="synth.fq.gz";
+ private String microbeOutFile="microbes.fq.gz";
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Log Files ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String logName="status.log";
+ private String reproduceName="reproduce.sh";
+ private String fileListName="file-list.txt";
+
+ private String rqcStatsName="filterStats.txt";
+ private String kmerStatsName="kmerStats.txt";
+ private String scaffoldStatsName="scaffoldStats.txt";
+ private String refStatsName="refStats.txt";
+ private String microbeStatsFile="commonMicrobes.txt";
+ private String nexteraStats="nexteraStats.txt";
+ private String ihistName="ihist_merge.txt";
+ private String khistName="khist.txt";
+ private String peaksName="peaks.txt";
+
+ private String cardinalityName="cardinality.txt";
+
+ /** ktrim phase rqc stats file */
+ private String rqcStatsName_kt;
+ /** ktrim phase stats file */
+ private String kmerStatsName_kt;
+ /** ktrim phase scaffold stats file */
+ private String scaffoldStatsName_kt;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Reference Files ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String mainArtifactFile_noNextera = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/Illumina.artifacts.2013.12.no_DNA_RNA_spikeins_no_Nextera_junction.fa.gz";
+ private String mainArtifactFile = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/Illumina.artifacts.2013.12.no_DNA_RNA_spikeins.fa";
+ private String artifactFileRna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/RNA_spikeins.artifacts.2012.10.NoPolyA.fa";
+ private String artifactFileDna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/DNA_spikeins.artifacts.2012.10.fa";
+ private String artifactFileDna_noNextera = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/DNA_spikeins.artifacts_no_Nextera_junction.2012.10.fa.gz";
+ private String phixRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/phix174_ill.ref.fa";
+ private String lfpeLinker = "/global/dna/shared/rqc/ref_databases/qaqc/databases/lfpe.linker.fa";
+ private String clrsLinker = "/global/dna/shared/rqc/ref_databases/qaqc/databases/crelox.fa";
+ private String clipLinker = clipLinkerDefault; //A literal string; "CATG" is supposed to be the normal linker.
+
+ private String pjetRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/pJET1.2.fasta";
+ private String riboKmers = "/global/projectb/sandbox/gaag/bbtools/ribo/merged_ribokmers20.fa.gz";
+ private String allArtifactsLatest = "/global/projectb/sandbox/rqc/qcdb/illumina.artifacts/Illumina.artifacts.fa";
+ private String fragAdapter = "/global/projectb/sandbox/gaag/bbtools/data/adapters.fa";
+ private String rnaAdapter = "/global/projectb/sandbox/gaag/bbtools/data/truseq_rna.fa.gz";
+
+ private String humanPath = "/global/projectb/sandbox/gaag/bbtools/hg19/";
+ private String catPath = "/global/projectb/sandbox/gaag/bbtools/cat_genome/";
+ private String dogPath = "/global/projectb/sandbox/gaag/bbtools/dog_genome/";
+ private String mousePath = "/global/projectb/sandbox/gaag/bbtools/mouse_genome/";
+ private String humanRef = null;
+
+ private String catDogHumanPath = "/global/projectb/sandbox/gaag/bbtools/catdoghuman/";
+ private String mouseCatDogHumanPath = "/global/projectb/sandbox/gaag/bbtools/mousecatdoghuman/";
+
+ private String commonMicrobesPath = "/global/projectb/sandbox/gaag/bbtools/commonMicrobes/";
+ private String commonMicrobesRef = "/global/projectb/sandbox/gaag/bbtools/commonMicrobes/commonMicrobes.fa.gz";
+ private String taxTree=TaxTree.DefaultTreeFile;
+ private String giTable=TaxTree.DefaultTableFile;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Constants ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Library type codes */
+ private static final int FRAG=0, LFPE=1, CLIP=2, CLRS=3;
+ private static final String clipLinkerDefault = "CATG";
+
+}
diff --git a/current/jgi/RandomGenome.java b/current/jgi/RandomGenome.java
new file mode 100755
index 0000000..26a7166
--- /dev/null
+++ b/current/jgi/RandomGenome.java
@@ -0,0 +1,54 @@
+package jgi;
+
+import java.util.Random;
+
+import dna.AminoAcid;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 3, 2013
+ *
+ */
+public class RandomGenome {
+ /** Number of bases written per FASTA sequence line. */
+ private static final int LINE_WIDTH=100;
+ /** Writes a random FASTA genome: args[0]=number of sequences, args[1]=bases per sequence, args[2]=output file.
+ * Sequences are named 1..N; within each, the fourth of every four 10000-base blocks is all N, the rest random. */
+ public static void main(String[] args){
+ if(args.length<3){throw new RuntimeException("Usage: RandomGenome <chroms> <length> <outfile>");}
+ ReadWrite.ZIPLEVEL=2;
+ Random randy=new Random();
+ int chroms=Integer.parseInt(args[0]);
+ int len=Integer.parseInt(args[1]);
+ String fname=args[2];
+ TextStreamWriter tsw=new TextStreamWriter(fname, false, false, true);
+ tsw.start();
+ for(int chrom=1; chrom<=chroms; chrom++){
+ tsw.println(">"+chrom);
+ StringBuilder sb=new StringBuilder(LINE_WIDTH+1);
+ for(int i=0; i<len; i++){
+ char c;
+ if((i/10000)%4==3){
+ c='N';
+ }else{
+ c=(char)AminoAcid.numberToBase[randy.nextInt(4)];
+ }
+ sb.append(c);
+ //Flush on the buffer's own length; the former separate counter made the first line 101 bases and the rest 100.
+ if(sb.length()>=LINE_WIDTH){
+ sb.append('\n');
+ tsw.print(sb);
+ sb=new StringBuilder(LINE_WIDTH+1);
+ }
+ }
+ if(sb.length()>0){//Final partial line
+ sb.append('\n');
+ tsw.print(sb);
+ }
+ }
+ tsw.poison();
+ tsw.waitForFinish();
+ }
+}
diff --git a/current/jgi/ReadKmerDepthDistribution.java b/current/jgi/ReadKmerDepthDistribution.java
new file mode 100755
index 0000000..5697e9f
--- /dev/null
+++ b/current/jgi/ReadKmerDepthDistribution.java
@@ -0,0 +1,1078 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import bloom.KCountArray;
+import bloom.KmerCount7MTA;
+import bloom.KmerCountAbstract;
+
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+
+
+/**
+ * This class is designed to visualize the distribution of kmer depths across individual reads.
+ * @author Brian Bushnell
+ * @date May 15, 2013
+ *
+ */
+public class ReadKmerDepthDistribution {
+
+ public static void main(String[] args){ //Entry point: parses arguments, builds the kmer count table, processes the reads, prints statistics.
+ for(String s : args){if(s.contains("=standardout") || s.contains("=stdout")){outstream=System.err;}}
+ outstream.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+ if(args.length<1){throw new RuntimeException("No parameters.");}
+ //Leading arguments without '=' are positional read files. Flag parsing starts after them; it used to restart at args[1], so "reads.fq k=31" was rejected because "k=31" is not a file.
+ String reads1=(args[0].indexOf("=")>0 ? null : args[0]);
+ String reads2=(reads1!=null && args.length>1 && args[1]!=null && args[1].indexOf("=")<=0 ? args[1] : null);
+ final int firstFlag=(reads1==null ? 0 : reads2==null ? 1 : 2); //Index of the first key=value argument
+ if(reads2!=null && "null".equalsIgnoreCase(reads2)){reads2=null;}
+ {
+ if(reads1!=null && !reads1.contains(",")){
+ File f=new File(reads1);
+ if(!f.exists() || !f.isFile()){throw new RuntimeException(reads1+" does not exist.");}
+ }
+ if(reads2!=null && !reads2.contains(",")){
+ File f=new File(reads2);
+ if(!f.exists() || !f.isFile()){throw new RuntimeException(reads2+" does not exist.");}
+ if(reads1.equalsIgnoreCase(reads2)){
+ throw new RuntimeException("Both input files are the same.");
+ }
+ }
+ }
+ //Defaults; the key=value arguments parsed below override most of them.
+ KmerCountAbstract.minQuality=4;
+ KmerCountAbstract.minProb=0.4f;
+
+ int k=31;
+ int cbits=32;
+ int gap=0;
+ int hashes=3;
+// int matrixbits=-1;
+ long cells=-1;
+ long maxReads=-1;
+ int buildpasses=1;
+ long tablereads=-1; //How many reads to process when building the hashtable
+ int buildStepsize=4;
+ String outKeep=null;
+ int prehashes=-1;
+ long precells=-1;
+ String histFile=null;
+ int threads=-1;
+ ReadWrite.ZIPLEVEL=2;
+
+ int minq=KmerCountAbstract.minQuality;
+ KmerCountAbstract.CANONICAL=true;
+
+ boolean auto=true;
+ boolean deterministic=true;
+
+ FastaReadInputStream.TARGET_READ_LEN=Integer.MAX_VALUE;
+
+ List<String> extra=null;
+
+ long memory=Runtime.getRuntime().maxMemory();
+ long tmemory=Runtime.getRuntime().totalMemory();
+// assert(false) : memory+", "+tmemory;
+
+ Parser parser=new Parser();
+ for(int i=firstFlag; i<args.length; i++){
+ if(args[i]==null){args[i]="null";}
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ assert(split.length<3) : "Too many '=' signs: "+args[i];
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(a.equals("k") || a.equals("kmer")){
+ k=Integer.parseInt(b);
+ }else if(a.equals("in") || a.equals("in1")){
+ reads1=b;
+ }else if(a.equals("in2")){
+ reads2=b;
+ }else if(a.startsWith("bits") ||a.startsWith("cbits") || a.startsWith("cellbits")){
+ cbits=Integer.parseInt(b);
+ }else if(a.startsWith("histlen") ||a.startsWith("histogramlen")){
+ HIST_LEN_PRINT=Tools.min(Integer.MAX_VALUE, Long.parseLong(b)+1);
+ }else if(a.startsWith("gap")){
+ gap=Integer.parseInt(b);
+ }else if(a.startsWith("matrixbits")){
+ int matrixbits=Integer.parseInt(b);
+ assert(matrixbits<63);
+ cells=1L<<matrixbits;
+ }else if(a.startsWith("cells")){
+ cells=Tools.parseKMG(b);
+ }else if(a.startsWith("precells") || a.startsWith("prefiltercells")){
+ precells=Tools.parseKMG(b);
+ prefilter=prefilter || precells!=0;
+ }else if(a.startsWith("minq")){
+ minq=Byte.parseByte(b);
+ }else if(a.equals("zerobin")){
+ ZERO_BIN=Tools.parseBoolean(b);
+ }else if(a.equals("deterministic") || a.equals("dr")){
+ boolean x=Tools.parseBoolean(b);
+ deterministic=x;
+ }else if(a.startsWith("minprob")){
+ KmerCountAbstract.minProb=Float.parseFloat(b);
+ }else if(a.startsWith("hashes")){
+ hashes=Integer.parseInt(b);
+ }else if(a.startsWith("prehashes") || a.startsWith("prefilterhashes")){
+ prehashes=Integer.parseInt(b);
+ prefilter=prefilter || prehashes!=0;
+ }else if(a.equals("prefilter")){
+ prefilter=Tools.parseBoolean(b);
+ }else if(a.startsWith("stepsize") || a.startsWith("buildstepsize")){
+ buildStepsize=Integer.parseInt(b);
+ }else if(a.startsWith("passes") || a.startsWith("buildpasses")){
+ buildpasses=Integer.parseInt(b);
+ }else if(a.equals("printcoverage")){
+ assert(false) : "This is not the program you are looking for. Try KmerCoverage.";
+ }else if(a.equals("threads") || a.equals("t")){
+ threads=Integer.parseInt(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.startsWith("tablereads") || a.startsWith("buildreads")){
+ tablereads=Tools.parseKMG(b);
+ }else if(a.equals("out") || a.equals("outk") || a.equals("outkeep") || a.equals("outgood")){
+ outKeep=b;
+ }else if(a.startsWith("hist")){
+ histFile=b;
+ }else if(a.startsWith("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("ordered") || a.equals("ord")){
+ ordered=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("auto") || a.equals("automatic")){
+ auto=Tools.parseBoolean(b);
+ }else if(a.equals("canonical")){
+ CANONICAL=KmerCountAbstract.CANONICAL=Tools.parseBoolean(b);
+ }else if(a.equals("fixspikes")){
+ FIX_SPIKES=Tools.parseBoolean(b);
+ }else if(a.equals("printzerocoverage") || a.equals("pzc")){
+ PRINT_ZERO_COVERAGE=Tools.parseBoolean(b);
+ }else if(a.equals("removeduplicatekmers") || a.equals("rdk")){
+ KmerCountAbstract.KEEP_DUPLICATE_KMERS=!Tools.parseBoolean(b);
+ }else if(a.equals("target") || a.equals("targetdepth")){
+ TARGET_DEPTH=Integer.parseInt(b);
+ }else if(a.equals("max") || a.equals("maxdepth")){
+ MAX_DEPTH=Integer.parseInt(b);
+ }else if(a.equals("min") || a.equals("mindepth")){
+ MIN_DEPTH=Integer.parseInt(b);
+ }else if(a.equals("minkmers") || a.equals("minkmersovermindepth") || a.equals("mingoodkmersperread") || a.equals("mgkpr")){
+ MIN_KMERS_OVER_MIN_DEPTH=Tools.max(1, Integer.parseInt(b));
+ }else if(a.equals("percentile") || a.equals("depthpercentile") || a.equals("dp")){
+ DEPTH_PERCENTILE=Float.parseFloat(b);
+ if(DEPTH_PERCENTILE>1 && DEPTH_PERCENTILE<=100){DEPTH_PERCENTILE/=100;}
+ assert(DEPTH_PERCENTILE>=0 && DEPTH_PERCENTILE<=1) : "Depth percentile must be between 0 and 100.";
+ }else if(a.equals("extra")){
+ if(b!=null && !b.equalsIgnoreCase("null")){
+ if(new File(b).exists()){
+ extra=new ArrayList<String>();
+ extra.add(b);
+ }else{
+ extra=Arrays.asList(b.split(","));
+ }
+ }
+ }else{
+ throw new RuntimeException("Unknown parameter "+arg);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+ //The depth ceiling may not be lower than the target depth.
+ MAX_DEPTH=Tools.max(MAX_DEPTH, TARGET_DEPTH);
+ assert(TARGET_DEPTH>0);
+
+ assert(FastaReadInputStream.settingsOK());
+ if(k>31){CANONICAL=KmerCountAbstract.CANONICAL=false;}
+ assert(CANONICAL==KmerCountAbstract.CANONICAL);
+
+// if(output!=null && reads1.contains(",")){
+// throw new RuntimeException("\nLists of input files can only be used with histogram output, not full output.\n" +
+// "Please set output=null or move the extra input files to 'extra=file1,file2,...fileN'");
+// }
+ //Size the histograms, capped by maxCount (2^cbits-1, or Integer.MAX_VALUE when cbits>16).
+ {
+ if(histFile==null){
+// HIST_LEN=Tools.min(20000, HIST_LEN);
+// HIST_LEN_PRINT=Tools.min(20000, HIST_LEN_PRINT);
+ }else{
+ USE_HISTOGRAM=true;
+ }
+
+ final int maxCount=(int)(cbits>16 ? Integer.MAX_VALUE : (1L<<cbits)-1);
+ assert(maxCount>0);
+ HIST_LEN_PRINT=Tools.max(1, Tools.min(HIST_LEN_PRINT, maxCount));
+ assert(HIST_LEN_PRINT<=Integer.MAX_VALUE) : HIST_LEN_PRINT+", "+Integer.MAX_VALUE;
+ HIST_LEN=(int)Tools.min(maxCount, Tools.max(HIST_LEN_PRINT, HIST_LEN));
+ THREAD_HIST_LEN=Tools.min(THREAD_HIST_LEN, HIST_LEN);
+
+ histogram_total=new AtomicLongArray(HIST_LEN);
+ }
+ //Every extra file must exist and must not repeat a primary input.
+ if(extra!=null){
+ for(String s : extra){
+ File f=new File(s);
+ if(!f.exists() || !f.isFile()){throw new RuntimeException(s+" does not exist.");}
+ assert(!s.equalsIgnoreCase(reads1) && (reads2==null || !s.equalsIgnoreCase(reads2))) : "\nInput file "+s+" should not be included as an extra file.\n";
+ }
+ }
+
+// outstream.println("ForceInterleaved = "+FASTQ.FORCE_INTERLEAVED);
+
+// assert(false) : reads1+", "+reads2+", "+output;
+// if(FASTQ.FORCE_INTERLEAVED && in2==null){
+// outstream.println()
+// }
+ //Thread count: the explicit value if given, otherwise all logical processors (auto) or 8.
+ if(threads<=0){
+ if(auto){THREADS=Data.LOGICAL_PROCESSORS;}
+ else{THREADS=8;}
+ }else{
+ THREADS=threads;
+ }
+// KmerCountAbstract.THREADS=Tools.min(THREADS,6);
+ KmerCountAbstract.THREADS=THREADS;
+
+// System.err.println("THREADS="+THREADS+", KmerCountAbstract.THREADS="+KmerCountAbstract.THREADS);
+ //Table size when not given: derived from Runtime.maxMemory() with auto, otherwise 2^34 cells.
+ if(auto && cells==-1){
+ final long usable=(long)Tools.max(((memory-96000000)*.73), memory*0.45);
+ long mem=usable-(USE_HISTOGRAM ? (HIST_LEN*8*(1)) : 0);
+ if(buildpasses>1){mem/=2;}
+ cells=(mem*8)/cbits;
+//
+// long tablebytes=((1L<<matrixbits)*cbits)/8;
+// if(tablebytes*3<usable){matrixbits++;}
+// outstream.println(tablebytes/1000000+", "+usable/1000000+", "+(tablebytes*3)/1000000);
+
+ }else if(cells==-1){
+ cells=1L<<34;
+ }
+ //With a prefilter and no explicit precells, 35% of the table's bits go to the prefilter at 2 bits per cell.
+ if(prefilter){
+ if(precells<1){
+ long totalbits=cells*cbits;
+ long prebits=(long)(totalbits*0.35);
+ precells=prebits/2;
+ cells=(totalbits-prebits+cbits-1)/cbits; //Steal memory from cell allocation
+ }
+ if(prehashes<1){
+ prehashes=(hashes+1)/2;
+ }
+ }
+
+ {
+ outstream.println("\nSettings:");
+ outstream.println("threads: \t"+THREADS);
+ outstream.println("k: \t"+k);
+ outstream.println("deterministic: \t"+deterministic);
+ outstream.println("passes: \t"+buildpasses);
+ outstream.println("bits per cell: \t"+cbits);
+// outstream.println("matrixbits: \t"+matrixbits);
+ outstream.println("cells: \t"+Tools.toKMG(cells));
+ outstream.println("hashes: \t"+hashes);
+ if(prefilter){
+ outstream.println("prefilter bits: \t"+2);
+// outstream.println("matrixbits: \t"+matrixbits);
+ outstream.println("prefilter cells: \t"+(precells>0 && prehashes>0 ? Tools.toKMG(precells) : "?"));
+ outstream.println("prefilter hashes: \t"+(precells>0 && prehashes>0 ? ""+prehashes : "?"));
+ }
+ outstream.println("base min quality: \t"+KmerCountAbstract.minQuality);
+ outstream.println("kmer min prob: \t"+KmerCountAbstract.minProb);
+
+ outstream.println();
+ outstream.println("target depth: \t"+TARGET_DEPTH);
+ outstream.println("min depth: \t"+MIN_DEPTH);
+ outstream.println("max depth: \t"+MAX_DEPTH);
+ outstream.println("min good kmers: \t"+MIN_KMERS_OVER_MIN_DEPTH);
+ outstream.println("depth percentile: \t"+String.format("%.1f", 100*DEPTH_PERCENTILE));
+ outstream.println("remove duplicates:\t"+!KmerCountAbstract.KEEP_DUPLICATE_KMERS);
+ outstream.println("fix spikes: \t"+FIX_SPIKES);
+ if(USE_HISTOGRAM && HIST_LEN>0){
+ outstream.println("histogram length: \t"+(USE_HISTOGRAM ? HIST_LEN : 0));
+ }
+ if(histFile!=null){
+ outstream.println("print zero cov: \t"+PRINT_ZERO_COVERAGE);
+ }
+
+ outstream.println();
+ }
+ //Without a prefilter, the table never needs more cells than there are possible kmers (4^k).
+ if(!prefilter && k<32 && cells>(1L<<(2*k))){cells=(1L<<(2*k));}
+ assert(cells>0);
+
+// KmerCountAbstract.THREADS=Tools.max(THREADS/2, KmerCountAbstract.THREADS); //Seems like 4 is actually optimal...
+
+ FastaReadInputStream.MIN_READ_LEN=k;
+
+ Timer t=new Timer();
+ Timer ht=new Timer();
+ t.start();
+ ht.start();
+ KCountArray kca;
+ KCountArray prefilterArray=null;
+// outstream.println();
+ if(prefilter){
+ prefilterArray=KmerCount7MTA.makeKca(reads1, reads2, extra, k, 2, gap, precells, prehashes, minq, true, false, tablereads, 1, buildStepsize, 1, 1, null, 0);
+ outstream.println("Made prefilter: \t"+prefilterArray.toShortString(prehashes));
+ double uf=prefilterArray.usedFraction();
+ if(uf>0.6){
+ outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" :
+ uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy for kmers of depth under 3. Ideal load is under 60% used." +
+ "\nFor better accuracy, run on a node with more memory; quality-trim or error-correct reads; " +
+ "or increase the values of the minprob flag to reduce spurious kmers.");
+ }
+ }
+ kca=KmerCount7MTA.makeKca(reads1, reads2, extra, k, cbits, gap, cells, hashes, minq, true, false, tablereads, buildpasses, buildStepsize, 2, 2, prefilterArray, (prefilterArray==null ? 0 : prefilterArray.maxValue));
+ ht.stop();
+
+ outstream.println("Made hash table: \t"+kca.toShortString(hashes));
+ double uf=kca.usedFraction();
+ if(uf>0.6){
+ outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" :
+ uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy. Ideal load is under 60% used." +
+ "\nFor better accuracy, use the 'prefilter' flag; run on a node with more memory; quality-trim or error-correct reads; " +
+ "or increase the values of the minprob flag to reduce spurious kmers. In practice you should still get good normalization results " +
+ "even with loads over 90%, but the histogram and statistics will be off.");
+ }
+
+ long estUnique;
+ outstream.println();
+ if(prefilterArray!=null){
+ int lim1=prefilterArray.maxValue, lim2=prefilterArray.maxValue+1;
+ double a=prefilterArray.estimateUniqueKmers(prehashes);
+ double b=kca.estimateUniqueKmers(hashes, lim2);
+ a=a-b;
+ if(CANONICAL){
+// a=(a*KCountArray.canonMask)/(KCountArray.canonMask+1);
+// b=(b*KCountArray.canonMask)/(KCountArray.canonMask+1);
+ }else{
+ a/=2;
+ b/=2;
+ }
+ estUnique=((long)((a+b)));
+ outstream.println("Estimated kmers of depth 1-"+lim1+": \t"+(long)a);
+ outstream.println("Estimated kmers of depth "+lim2+"+ : \t"+(long)b);
+ }else{
+// double est=kca.cells*(1-Math.pow(1-Math.sqrt(kca.usedFraction()), 1.0/hashes));
+// double est=kca.cells*(1-Math.pow(1-kca.usedFraction(), 1.0/hashes));
+ double est=kca.estimateUniqueKmers(hashes);
+// outstream.println("Used cells: "+kca.cellsUsed(1));
+ if(CANONICAL){
+// est=(est*KCountArray.canonMask)/(KCountArray.canonMask+1);
+ }else{
+ est/=2;
+ }
+ estUnique=((long)((est)));
+
+ }
+ outstream.println("Estimated unique kmers: \t"+estUnique);//+", or "+estUnique+" counting forward kmers only.");
+// outstream.println("(Includes forward and reverse kmers)");
+ outstream.println();
+ outstream.println("Table creation time:\t\t"+ht);//+" \t"+String.format("%.2f", totalBases*1000000.0/(ht.elapsed))+" kb/sec");
+
+ long bases=0;
+
+ ListNum.setDeterministicRandom(deterministic);
+
+ if(reads1!=null && reads1.contains(",") && !new File(reads1).exists()){
+ throw new RuntimeException("This class is not designed to deal with lists of input files.");
+ }else{
+ bases=count(reads1, reads2, kca, k, maxReads, outKeep, overwrite, histFile, estUnique);
+ }
+ printTopology();
+
+ t.stop();
+ outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec");
+
+ }
+
+
+ public static void printTopology(){
+ //Prints each topology counter to System.err as a percentage of their sum and as a raw count; each counter is read once.
+ final long sp=spikes.get();
+ final long pe=peaks.get();
+ final long va=valleys.get();
+ final long sl=slopes.get();
+ final long fl=flats.get();
+ final long total=pe+sp+fl+va+sl;
+ //With nothing tallied, 100.0/0 is Infinity and Infinity*0 is NaN, which printed "NaN%"; report 0% instead.
+ final double mult=(total>0 ? 100.0/total : 0);
+ double dsp=mult*sp;
+ double dpe=mult*pe;
+ double dva=mult*va;
+ double dsl=mult*sl;
+ double dfl=mult*fl;
+ System.err.println("\nDepth Topology:\t");
+ System.err.println("Spikes: \t\t\t"+(dsp<10 ? " " : "")+String.format("%.3f%% \t%d",dsp,sp));
+ System.err.println("Peaks: \t\t\t"+(dpe<10 ? " " : "")+String.format("%.3f%% \t%d",dpe,pe));
+ System.err.println("Valleys: \t\t\t"+(dva<10 ? " " : "")+String.format("%.3f%% \t%d",dva,va));
+ System.err.println("Slopes: \t\t\t"+(dsl<10 ? " " : "")+String.format("%.3f%% \t%d",dsl,sl));
+ System.err.println("Flats: \t\t\t"+(dfl<10 ? " " : "")+String.format("%.3f%% \t%d",dfl,fl));
+ }
+
+
+ public static long count(String in1, String in2, KCountArray kca, int k, long maxReads,
+ String outKeep, boolean overwrite, String histFile, long estUnique) {
+ //Opens the input reads (and the optional kept-reads output), hands them to downsample(), then closes the streams.
+ //in1/in2: input read files; maxReads is passed to the input stream. outKeep: output file name, or null for no output;
+ //its first '#' is replaced by 1, and by 2 for a second file when the input is paired. kca, k, histFile, overwrite and
+ //estUnique are forwarded to downsample(). Returns the value returned by downsample().
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+ if(verbose){System.err.println("Paired: "+paired);}
+ ConcurrentReadOutputStream rosKeep=null;
+ if(outKeep!=null){
+ final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS));
+ String out1=outKeep.replaceFirst("#", "1");
+ String out2=null;
+ if(cris.paired()){
+ if(outKeep.contains("#")){
+ out2=outKeep.replaceFirst("#", "2");
+ }else{
+ outstream.println("Writing interleaved.");
+ }
+ }
+ //Refuse to write over either input. The first check used to test in1 twice, so out1 equal to in2 went undetected.
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2));
+ assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2)));
+
+// assert(false) : out1+", "+out2;
+
+ FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, "attachment", true, overwrite, append, ordered);
+ FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, "attachment", true, overwrite, append, ordered);
+ rosKeep=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
+ }
+
+ if(rosKeep!=null){
+ rosKeep.start();
+ outstream.println("Started output threads.");
+ }
+
+ long bases=downsample(cris, kca, k, maxReads, rosKeep, histFile, overwrite, estUnique);
+ ReadWrite.closeStreams(cris, rosKeep);
+ if(verbose){System.err.println("Closed streams");}
+
+ return bases;
+ }
+
+
+
+ public static long downsample(ConcurrentReadInputStream cris, KCountArray kca, int k, long maxReads, ConcurrentReadOutputStream rosKeep,
+ String histFile, boolean overwrite, long estUnique) {
+ //Starts THREADS ProcessThreads on cris, waits for each to terminate, sums their read/base counters and depth histograms,
+ //prints a summary to outstream and, when USE_HISTOGRAM is set, writes the histogram to histFile. estUnique only feeds
+ //the "Percent unique" figure; maxReads is not used here. Returns the total number of bases reported by the threads.
+ Timer tdetect=new Timer();
+ tdetect.start();
+ long totalBases=0;
+ long totalReads=0;
+ long basesKept=0;
+ long readsKept=0;
+ long basesTossed=0;
+ long readsTossed=0;
+// assert(false) : THREADS;
+ ProcessThread[] pta=new ProcessThread[THREADS];
+ for(int i=0; i<pta.length; i++){
+ pta[i]=new ProcessThread(cris, kca, k, rosKeep);
+ pta[i].start();
+ }
+ for(int i=0; i<pta.length; i++){
+ ProcessThread ct=pta[i];
+ synchronized(ct){
+ while(ct.getState()!=State.TERMINATED){
+ try {
+ ct.join(1000);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ totalBases+=ct.totalBases;
+ totalReads+=ct.totalReads;
+ basesKept+=ct.basesKept;
+ readsKept+=ct.readsKept;
+ basesTossed+=ct.basesTossed;
+ readsTossed+=ct.readsTossed;
+ //Fold this thread's histogram into the shared one.
+ for(int j=0; j<ct.hist.length; j++){
+ histogram_total.addAndGet(j, ct.hist[j]);
+ }
+ }
+ }
+ //With ZERO_BIN off, the depth-0 bin is folded into the depth-1 bin.
+ if(!ZERO_BIN && histogram_total!=null && histogram_total.length()>1){
+ histogram_total.addAndGet(1, histogram_total.get(0));
+ histogram_total.set(0, 0);
+ }
+// outstream.println();
+ tdetect.stop();
+ outstream.println("Table read time: \t\t"+tdetect+" \t"+String.format("%.2f", totalBases*1000000.0/(tdetect.elapsed))+" kb/sec");
+
+ {
+ String pad="";
+ String s=""+totalReads;
+ while(pad.length()+s.length()<9){pad+=" ";}
+ outstream.println("Total reads in: \t\t"+totalReads+pad+String.format("\t(%.3f%% Kept)", (readsKept*100.0/totalReads)));
+ pad=""; //Start over: the padding computed for totalReads used to be carried over, over-padding the bases line.
+ s=""+totalBases;
+ while(pad.length()+s.length()<9){pad+=" ";}
+ outstream.println("Total bases in: \t\t"+totalBases+pad+String.format("\t(%.3f%% Kept)", (basesKept*100.0/totalBases)));
+ }
+// outstream.println();
+ if(histogram_total!=null){
+ TextStreamWriter tswh=null;
+ StringBuilder sb=new StringBuilder(100);
+ if(USE_HISTOGRAM){
+ tswh=new TextStreamWriter(histFile, overwrite, false, false);
+ tswh.start();
+ tswh.print("#Depth\tRaw_Count\tUnique_Kmers\n");
+ }
+ int lim=(int)(HIST_LEN_PRINT-1); //Bins below lim get their own row; bins from lim upward are summed into one final row.
+ long remaining=Tools.sum(histogram_total);
+ long sumRaw1=0;
+ long sumRaw2=0;
+ long sum1=0;
+ long sum2=0;
+ long sumsquare=0;
+ for(int i=0; i<lim; i++){
+ long x=histogram_total.get(i);
+ long y=((x+i/2)/(i<1 ? 1 : i)); //x+i/2 rounds to compensate for colliding kmers being put in an overly high bin
+// long y=((x)/(i<1 ? 1 : i));
+ sumRaw1+=x;
+ sum1+=y;
+ sumsquare+=(x*Tools.max(1, i));
+ if(tswh!=null){
+ if(PRINT_ZERO_COVERAGE /*|| x>0*/ || y>0){
+ sb.append(i).append('\t');
+ sb.append(x).append('\t');
+ sb.append(y).append('\n');
+ }
+ tswh.print(sb.toString());
+ sb.setLength(0);
+ }
+ if(sumRaw1>=remaining){break;} //Stop once there is no more coverage, even if PRINT_ZERO_COVERAGE is not set.
+ }
+ for(int i=lim; i<histogram_total.length(); i++){
+ long x=histogram_total.get(i);
+ sumRaw2+=x;
+ long y=((x+i/2)/(i<1 ? 1 : i)); //x+i/2 rounds to compensate for colliding kmers being put in an overly high bin
+// long y=((x)/(i<1 ? 1 : i));
+ sum2+=y;
+ }
+ if(tswh!=null){
+ if(sumRaw2>0 || sum2>0){
+ sb.append(lim).append('\t');
+ sb.append(sumRaw2).append('\t');
+ sb.append(sum2).append('\n');
+ }
+ tswh.print(sb.toString());
+ tswh.poison();
+ tswh.waitForFinish();
+ outstream.println("Wrote histogram to "+histFile);
+ }
+ //Medians, averages and standard deviations over the whole histogram.
+ long histCount=Tools.sum(histogram_total); //Total number of kmers counted
+ long halfCount=(histCount+1)/2;
+ double histCountU=0; //Unique kmers counted
+ long temp1=0;
+ double temp2=0;
+ int median_all=-1;
+ int median_unique=-1;
+ for(int i=0; i<histogram_total.length(); i++){
+ long x=histogram_total.get(i);
+ temp1+=x;
+ if(temp1>=halfCount && median_all<0){median_all=i;}
+// histSum+=(x*(double)i);
+ histCountU+=(x/(double)Tools.max(1, i));
+ }
+ double halfCount2=(histCountU)/2;
+ for(int i=0; i<histogram_total.length(); i++){
+ long x=histogram_total.get(i);
+ temp2+=(x/Tools.max(i, 1.0));
+ if(temp2>=halfCount2 && median_unique<0){
+ median_unique=i;
+ break;
+ }
+ }
+ if(median_all<0){median_all=0;}
+ double avg_all=sumsquare/(double)histCount;
+ double avg_unique=histCount/histCountU;
+ double stdev_unique=Tools.standardDeviationHistogramKmer(histogram_total);
+ double stdev_all=Tools.standardDeviationHistogram(histogram_total);
+ outstream.println("Total kmers counted: \t"+(sumRaw1+sumRaw2));
+
+ double uniqueC=((sum1+sum2)*100.0/(sumRaw1+sumRaw2));
+ double uniqueE=((estUnique)*100.0/(sumRaw1+sumRaw2));
+ double uniqueM=Tools.max(uniqueC, uniqueE);
+ outstream.println("Total unique kmer count: \t"+(sum1+sum2));
+ if(CANONICAL){outstream.println("Includes forward kmers only.");}
+ outstream.println("The unique kmer estimate can be more accurate than the unique count, if the tables are very full.");
+ outstream.println("The most accurate value is the greater of the two.");
+ outstream.println();
+
+ outstream.println("Percent unique: \t"+(uniqueM<10 ? " " : "")+String.format("%.2f%%", uniqueM));
+
+ outstream.println("Depth average: \t"+String.format("%.2f\t(unique kmers)", avg_unique));
+ outstream.println("Depth median: \t"+String.format("%d\t(unique kmers)", median_unique));
+ outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(unique kmers)", stdev_unique));
+
+ outstream.println("\nDepth average: \t"+String.format("%.2f\t(all kmers)", avg_all));
+ outstream.println("Depth median: \t"+String.format("%d\t(all kmers)", median_all));
+ outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(all kmers)", stdev_all));
+ }
+
+ return totalBases;
+ }
+
+
+
+ /**
+ * Locates and fixes spikes in a coverage profile (potentially) caused by false positives in a bloom filter.
+ * Theory: If a high-count kmer is adjacent on both sides to low-count kmers, it may be a false positive.
+ * It could either be reduced to the max of the two flanking points or examined in more detail.
+ * @param array An array of kmer counts for adjacent kmers in a read.
+ */
+ private static void fixSpikes(int[] array){
+ //Flattens isolated spikes in place; each interior value is compared with its (possibly already flattened) left neighbor.
+ final int last=array.length-1;
+ for(int pos=1; pos<last; pos++){
+ final int mid=array[pos];
+ final long left=Tools.max(1, array[pos-1]);
+ final long right=Tools.max(1, array[pos+1]);
+ final boolean peak=(mid>1 && mid>left && mid>right);
+ if(!peak){continue;}
+ //A spike is a peak that is at least double, or more than 2 above, each neighbor.
+ final boolean steepLeft=(mid>=2*left || mid>left+2);
+ final boolean steepRight=(mid>=2*right || mid>right+2);
+ if(steepLeft && steepRight){array[pos]=(int)Tools.max(left, right);}
+ }
+ }
+ private static void fixSpikes(int[] array, long[] kmers, KCountArray kca, int k){
+ //Re-reads suspicious counts from kca in place: the two end positions with readPrecise, interior peaks with readPreciseMin.
+ final int last=array.length-1;
+ if(last<2){return;}
+ //NOTE(review): array[0] is re-read when it is more than 1 BELOW array[1], the last value when it is more than 1 ABOVE its neighbor - confirm the asymmetry is intended.
+ if(array[1]-array[0]>1){
+ array[0]=kca.readPrecise(kmers[0], k, CANONICAL);
+ }
+ if(array[last]-array[last-1]>1){
+ array[last]=kca.readPrecise(kmers[last], k, CANONICAL);
+ }
+ for(int i=1; i<last; i++){
+ final int b=array[i];
+ if(b<=1){continue;}
+ final long a=Tools.max(1, array[i-1]);
+ final long c=Tools.max(1, array[i+1]);
+ final long key=kmers[i];
+ final boolean peak=(b>a && b>c);
+ //Only peaks that are below 6, or more than 1 above a neighbor, are re-read.
+ if(peak && (b<6 || b>a+1 || b>c+1)){
+ array[i]=kca.readPreciseMin(key, k, CANONICAL);
+ }
+ //Disabled alternatives, kept for reference:
+ // if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ // //spike
+ // int b1=(int)((a+c)/2);
+ // int b2=kca.readLeft(key, k, CANONICAL);
+ // int b3=kca.readRight(key, k, CANONICAL);
+ // array[i]=Tools.min(b, b1, b2, b3);
+ // }
+ // else
+ // {
+ //// array[i]=kca.readPreciseMin(key, k, CANONICAL);
+ // }
+ // else
+ // if(Tools.max(ada, adc)>=Tools.max(2, Tools.min((int)a, b, (int)c)/4)){
+ // array[i]=kca.readPrecise(key, k, CANONICAL);
+ // }
+ // else
+ // if(b>a+1 || b>c+1){
+ // //steep
+ // array[i]=kca.readPrecise(key, k, CANONICAL);
+ // }
+ }
+ }
+
+
+ private static void analyzeSpikes(int[] array, int width){
+ if(array.length<3){return;}
+ int peakcount=0, valleycount=0, spikecount=0, flatcount=0, slopecount=0;
+ for(int i=1; i<array.length-1; i++){
+ long a=array[i-1];
+ int b=array[i];
+ long c=array[i+1];
+ if(b>a && b>c){
+ peakcount++;
+ if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){
+ spikecount++;
+ }
+ }else if(b<a && b<c){
+ valleycount++;
+ }else if(b==a && b==c){
+ flatcount++;
+ }else{
+ slopecount++;
+ }
+ }
+ if(peakcount>0){peaks.addAndGet(peakcount);}
+ if(valleycount>0){valleys.addAndGet(valleycount);}
+ if(spikecount>0){spikes.addAndGet(spikecount);}
+ if(flatcount>0){flats.addAndGet(flatcount);}
+ if(slopecount>0){slopes.addAndGet(slopecount);}
+ }
+
+
+ /**
+ * @param r
+ * @param kca
+ * @return
+ */
+ public static int[] generateCoverage(Read r, KCountArray kca, int k, int[] out, long[] kmers) {
+ if(k>31){return generateCoverageLong(r, kca, k, out);}
+ if(kca.gap>0){throw new RuntimeException("Gapped reads: TODO");}
+ if(r==null || r.bases==null || r.length()<k){return new int[] {0};}
+
+ final int kbits=2*k;
+ final long mask=~((-1L)<<(kbits));
+ final int gap=kca.gap;
+
+ if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors
+
+ int len=0;
+ long kmer=0;
+ final byte[] bases=r.bases;
+ final int arraylen=r.length()-k+1;
+ if(out==null || out.length!=arraylen){out=new int[arraylen];}
+ Arrays.fill(out, -1);
+ if(FIX_SPIKES){
+ if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
+ Arrays.fill(kmers, -1);
+ }
+
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=((kmer<<2)|x)&mask;
+ len++;
+
+ if(len>=k){
+ // int count=kca.readPrecise(kmer, k, CANONICAL);
+ int count=kca.read(kmer, k, CANONICAL);
+ out[i-k+1]=count;
+ if(kmers!=null){kmers[i-k+1]=kmer;}
+ }
+ }
+ }
+
+ if(FIX_SPIKES){fixSpikes(out, kmers, kca, k);}
+// fixSpikes(out, 1);
+
+ analyzeSpikes(out, 1);
+ return out;
+ }
+
+
+
+ /**
+ * @param r
+ * @param kca
+ * @return
+ */
+ public static int[] generateCoverageLong(Read r, KCountArray kca, int k, int[] out) {
+ assert(k>31);
+ if(kca.gap>0){throw new RuntimeException();}
+ if(r==null || r.bases==null || r.length()<k){return new int[] {0};}
+
+ final int gap=kca.gap;
+
+ if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors
+
+ int len=0;
+ long kmer=0;
+ final byte[] bases=r.bases;
+
+ final int arraylen=r.length()-k+1;
+ if(out==null || out.length!=arraylen){out=new int[arraylen];}
+ Arrays.fill(out, -1);
+
+ int tailshift=k%32;
+ int tailshiftbits=tailshift*2;
+
+
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ int x=AminoAcid.baseToNumber[b];
+ if(x<0){
+ len=0;
+ kmer=0;
+ }else{
+ kmer=Long.rotateLeft(kmer, 2);
+ kmer=kmer^x;
+ len++;
+ if(len>k){
+ long x2=AminoAcid.baseToNumber[bases[i-k]];
+ kmer=kmer^(x2<<tailshiftbits);
+ }
+
+ if(len>=k){
+ int count=kca.read(kmer);
+ out[i-k+1]=count;
+ }
+ }
+ }
+
+ fixSpikes(out);
+
+ analyzeSpikes(out, 1);
+ return out;
+ }
+
+
+	/**
+	 * Worker thread: takes read lists from cris, builds a sorted kmer depth profile for each
+	 * read, adds it to the depth histogram, and forwards to rosk the reads whose near-maximum
+	 * depth exceeds both TARGET_DEPTH and twice their minimum depth.  r.mate is not examined.
+	 */
+	private static class ProcessThread extends Thread{
+
+		ProcessThread(ConcurrentReadInputStream cris_, KCountArray kca_, int k_, ConcurrentReadOutputStream rosk_){
+			cris=cris_;
+			kca=kca_;
+			k=k_;
+			rosk=rosk_;
+		}
+
+		@Override
+		public void run(){countInThread();}
+
+		/** Processes lists from cris until an empty or null list arrives. */
+		void countInThread() {
+			ListNum<Read> ln=cris.nextList();
+			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+			int[] cov1=null; //Depth buffer handed back to generateCoverage for reuse
+			long[] kmers1=null; //Kmer buffer, allocated only when FIX_SPIKES && k<32
+
+			while(reads!=null && reads.size()>0){
+				//One list per batch.  A single list used to be cleared right after rosk.add, which
+				//empties the batch if rosk keeps the reference - presumably it does; TODO confirm.
+				final ArrayList<Read> keep=new ArrayList<Read>(Shared.READ_BUFFER_LENGTH);
+
+				for(int rnum=0; rnum<reads.size(); rnum++){
+					final Read r=reads.get(rnum);
+					assert(r==null || r!=r.mate);
+					int readcount=0, basecount=0;
+					int min=0, max=0;
+					int[] cov=null;
+
+					if(r!=null && r.bases!=null){
+						readcount++;
+						basecount+=r.length();
+						if(r.length()>=k){
+							if(verbose){outstream.println();}
+							if(FIX_SPIKES && k<32){
+								final int arraylen=r.length()-k+1;
+								if(kmers1==null || kmers1.length!=arraylen){kmers1=new long[arraylen];}
+							}
+							cov=getSortedCoverageAndIncrementHistogram(r, cov1, kmers1);
+							if(cov!=null){
+								//An unused depth estimate (MIN_DEPTH, DEPTH_PERCENTILE) was dropped here.
+								cov1=cov;
+								//cov is sorted high to low: the last entry is the minimum, the entry 5% in stands in for the maximum.
+								min=cov[cov.length-1];
+								max=cov[(int)(cov.length*0.05f)];
+							}
+						}
+					}
+
+					totalReads+=readcount;
+					totalBases+=basecount;
+					if(max>TARGET_DEPTH && max>2*min){
+						readsKept+=readcount;
+						basesKept+=basecount;
+						//The tab-separated sorted depths travel with the read in r.obj.
+						StringBuilder sb=new StringBuilder();
+						sb.append(cov[0]);
+						for(int i=1; i<cov.length; i++){
+							sb.append('\t');
+							sb.append(cov[i]);
+						}
+						r.obj=sb.toString();
+						keep.add(r);
+					}else{
+						readsTossed+=readcount;
+						basesTossed+=basecount;
+					}
+				}
+
+				if(rosk!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight.
+					rosk.add(keep, ln.id);
+				}
+
+				cris.returnList(ln.id, ln.list.isEmpty());
+				ln=cris.nextList();
+				reads=(ln!=null ? ln.list : null);
+			}
+			if(verbose){System.err.println("Finished reading");}
+			//ln is null when nextList() returned null; the old unconditional call threw NullPointerException then.
+			if(ln!=null){
+				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+				if(verbose){System.err.println("Returned list");}
+			}
+		}
+
+		/**
+		 * Generates the depth profile of r, sorts it in descending order and histograms it.
+		 * @param cov Buffer offered to generateCoverage for reuse.
+		 * @param kmers Buffer offered to generateCoverage for reuse.
+		 * @return The sorted profile.
+		 */
+		private final int[] getSortedCoverageAndIncrementHistogram(Read r, int[] cov, long[] kmers){
+			assert(r!=null && r.bases!=null && r.length()>=k) : r;
+			cov=generateCoverage(r, kca, k, cov, kmers);
+			if(cov!=null){
+				Arrays.sort(cov);
+				Tools.reverseInPlace(cov);
+				incrementHistogramSorted(cov);
+			}
+			return cov;
+		}
+
+		/**
+		 * Adds a descending-sorted depth profile to the histogram one run of equal depths at a
+		 * time.  Depths are clamped to HIST_LEN-1 and the first negative entry ends the scan.
+		 */
+		private final void incrementHistogramSorted(int[] cov){
+			if(hist==null || cov==null || cov.length==0){return;}
+
+			//Seed with the clamped first depth.  The raw cov[0] was used before, so a first depth
+			//of HIST_LEN or more never equalled its own clamped value and went uncounted.
+			int last=Tools.min(cov[0], HIST_LEN-1);
+			long sum=0;
+			for(int x : cov){
+				if(x<0){break;}
+				final int y=Tools.min(x, HIST_LEN-1);
+				if(y==last){
+					sum++;
+				}else{
+					addToHistogram(last, sum);
+					sum=1;
+				}
+				last=y;
+			}
+			addToHistogram(last, sum);
+		}
+
+		/** Adds count to the bin for depth: hist for depths below its length, histogram_total otherwise. */
+		private void addToHistogram(int depth, long count){
+			if(count<1){return;}
+			if(depth<hist.length){hist[depth]+=count;}
+			else{histogram_total.addAndGet(depth, count);}
+		}
+
+		private final ConcurrentReadInputStream cris;
+		private final KCountArray kca;
+		private final int k;
+		/** Stream for kept reads */
+		private final ConcurrentReadOutputStream rosk;
+		public final long[] hist=new long[THREAD_HIST_LEN];//(USE_HISTOGRAM ? new long[HIST_LEN] : null);
+
+		private long totalBases=0;
+		private long totalReads=0;
+
+		public long readsKept=0;
+		public long readsTossed=0;
+		public long basesKept=0;
+		public long basesTossed=0;
+	}
+
+ public static PrintStream outstream=Data.sysout; //Destination for messages printed by this class
+
+ public static int THREAD_HIST_LEN=1<<12; //Length of each ProcessThread's private hist array
+ public static int HIST_LEN=1<<20; //Depths are clamped to HIST_LEN-1 before being histogrammed
+ public static long HIST_LEN_PRINT=HIST_LEN; //NOTE(review): use not visible in this section; presumably the number of bins printed
+ public static boolean USE_HISTOGRAM=false; //NOTE(review): use not visible in this section
+ public static boolean PRINT_ZERO_COVERAGE=false; //NOTE(review): use not visible in this section
+ public static AtomicLongArray histogram_total; //Shared histogram; ProcessThread adds depths at or above its hist length here
+
+ private static int THREADS=8; //NOTE(review): use not visible in this section; presumably the worker thread count
+ private static boolean verbose=false; //Enables extra progress output
+
+
+ private static int TARGET_DEPTH=50; //ProcessThread keeps a read only if its near-maximum depth exceeds this and twice its minimum
+ private static int MAX_DEPTH=-1; //NOTE(review): use not visible in this section
+ private static int MIN_DEPTH=3; //Presumably the lowest depth at which a kmer counts as covered - verify against its users
+ private static int MIN_KMERS_OVER_MIN_DEPTH=10; //Presumably the number of kmers at MIN_DEPTH needed for a depth estimate - verify
+ private static float DEPTH_PERCENTILE=0.5f; //Presumably the percentile used for a read's representative depth - verify
+
+
+ public static boolean CANONICAL=true; //Passed through to the KCountArray read methods
+ public static boolean ZERO_BIN=false; //NOTE(review): use not visible in this section
+ public static boolean FIX_SPIKES=true; //When set, generateCoverage re-checks peaks via fixSpikes(array, kmers, kca, k)
+ public static boolean ordered=false; //NOTE(review): use not visible in this section
+ public static boolean overwrite=true; //NOTE(review): use not visible in this section; presumably allows replacing output files
+ public static boolean append=false; //NOTE(review): use not visible in this section
+ public static boolean prefilter=false; //NOTE(review): use not visible in this section
+
+ public static AtomicLong peaks=new AtomicLong(); //Positions above both neighbors, tallied by analyzeSpikes
+ public static AtomicLong spikes=new AtomicLong(); //Peaks steeper than 2x or +2 on both sides, tallied by analyzeSpikes
+ public static AtomicLong flats=new AtomicLong(); //Positions equal to both neighbors, tallied by analyzeSpikes
+ public static AtomicLong valleys=new AtomicLong(); //Positions below both neighbors, tallied by analyzeSpikes
+ public static AtomicLong slopes=new AtomicLong(); //All remaining interior positions, tallied by analyzeSpikes
+}
diff --git a/current/jgi/RedirectTest.java b/current/jgi/RedirectTest.java
new file mode 100755
index 0000000..69f98ec
--- /dev/null
+++ b/current/jgi/RedirectTest.java
@@ -0,0 +1,78 @@
+package jgi;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import dna.Data;
+
+import fileIO.PipeThread;
+import fileIO.ReadWrite;
+
+/**
+ * Manual test: prints the configured external tool commands, then copies the input file to
+ * stdout - read directly on Windows, otherwise through an external gunzip process.
+ * @author Brian Bushnell
+ * @date Jan 22, 2013
+ */
+public class RedirectTest {
+
+	/**
+	 * @param args args[0] is the input file.
+	 * @throws IOException If starting gunzip, reading or writing fails.
+	 */
+	public static void main(String[] args) throws IOException{
+		String fin=args[0];
+		System.out.println("fin="+fin);
+
+		InputStream in=null;
+		final OutputStream os=System.out;
+		InputStream es=null;
+
+		System.out.println("Samtools="+Data.SAMTOOLS());
+		System.out.println("Gzip="+Data.GZIP());
+		System.out.println("Pigz="+Data.PIGZ());
+		System.out.println("Gunzip="+Data.GUNZIP());
+
+		try{
+			if(Data.WINDOWS){
+				System.out.println("WINDOWS");
+				in=ReadWrite.getInputStream(fin, false, false);
+			}else{
+				System.out.println("LINUX");
+				//Argument array: exec(String) splits on whitespace, so a path with spaces reached gunzip as several arguments.
+				final Process p=Runtime.getRuntime().exec(new String[] {"gunzip", "-c", "-d", fin});
+				in=p.getInputStream();
+				es=p.getErrorStream();
+				assert(es!=null);
+				PipeThread et=new PipeThread(es, System.err);
+				et.start();
+				System.out.println(p);
+			}
+			copy(in, os);
+		}finally{ //Previously the streams stayed open if the copy threw
+			if(in!=null){in.close();}
+			if(es!=null){es.close();}
+		}
+		ReadWrite.close(os);
+	}
+
+	/** Simpler variant reading args[0] through ReadWrite; the unused args[1] is no longer required and the input is now closed. */
+	public static void main_0(String[] args) throws IOException{
+		final InputStream in=ReadWrite.getInputStream(args[0], false, false);
+		try{
+			copy(in, System.out);
+		}finally{
+			in.close();
+		}
+	}
+
+	/** Copies in to os in chunks of up to 4096 bytes until read() returns 0 or less. */
+	private static void copy(InputStream in, OutputStream os) throws IOException{
+		final byte[] buf=new byte[4096];
+		for(int len=in.read(buf); len>0; len=in.read(buf)){
+			os.write(buf, 0, len);
+		}
+	}
+
+}
diff --git a/current/jgi/ReformatReads.java b/current/jgi/ReformatReads.java
new file mode 100755
index 0000000..7671452
--- /dev/null
+++ b/current/jgi/ReformatReads.java
@@ -0,0 +1,1332 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Random;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.Read;
+import stream.SamLine;
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 11, 2012
+ *
+ */
+public class ReformatReads {
+
+	/** Command-line entry point: creates the Timer, builds the tool from args, then runs it. */
+	public static void main(String[] args){
+		final Timer stopwatch=new Timer();
+		new ReformatReads(args).process(stopwatch);
+	}
+
+ public ReformatReads(String[] args){ //Parses args, copies the Parser results into fields, resolves input/output names and validates them
+
+ args=Parser.parseConfig(args); //May replace the argument array (presumably expanding a config file - verify in Parser)
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} //Keep messages off stdout when reads are written there
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH); //Caps the shared read buffer length at 200
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+ ReadWrite.ZIP_THREAD_DIVISOR=1;
+
+ SamLine.SET_FROM_OK=true;
+// SamLine.CONVERT_CIGAR_TO_MATCH=true;
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //NOTE(review): only split[0] and split[1] are used, so a value containing '=' is truncated
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;} //The literal value "null" is treated as no value
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){ //The shared Parser gets the first chance at every argument
+ //do nothing
+ }else if(a.equals("passes")){
+ assert(false) : "'passes' is disabled."; //Only fails when assertions are enabled; otherwise the flag is ignored
+// passes=Integer.parseInt(b);
+ }else if(a.equals("path")){
+ Data.setPath(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b); //The same value is pushed to the static verbose flags of the I/O classes below
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("sample") || a.equals("samplereads") || a.equals("samplereadstarget") || a.equals("srt")){
+ sampleReadsTarget=Tools.parseKMG(b);
+ sampleReadsExact=(sampleReadsTarget>0); //A positive target switches on exact read sampling
+ }else if(a.equals("samplebases") || a.equals("samplebasestarget") || a.equals("sbt")){
+ sampleBasesTarget=Tools.parseKMG(b);
+ sampleBasesExact=(sampleBasesTarget>0); //A positive target switches on exact base sampling
+ }else if(a.equals("addslash")){
+ addslash=Tools.parseBoolean(b);
+ }else if(a.equals("slashspace") || a.equals("spaceslash")){
+ boolean x=Tools.parseBoolean(b); //Chooses between " /1"," /2" and "/1","/2" as the pair suffixes
+ if(x){
+ slash1=" /1";
+ slash2=" /2";
+ }else{
+ slash1="/1";
+ slash2="/2";
+ }
+ }else if(a.equals("addunderscore") || a.equals("underscore")){
+ addunderscore=Tools.parseBoolean(b);
+ }else if(a.equals("uniquenames")){
+ uniqueNames=Tools.parseBoolean(b);
+ }else if(a.equals("verifyinterleaved") || a.equals("verifyinterleaving") || a.equals("vint")){
+ verifyinterleaving=Tools.parseBoolean(b);
+ }else if(a.equals("verifypaired") || a.equals("verifypairing") || a.equals("vpair")){
+ verifypairing=Tools.parseBoolean(b);
+ }else if(a.equals("allowidenticalnames") || a.equals("ain")){
+ allowIdenticalPairNames=Tools.parseBoolean(b);
+ }else if(a.equals("rcompmate") || a.equals("rcm")){
+ reverseComplimentMate=Tools.parseBoolean(b);
+ outstream.println("Set RCOMPMATE to "+reverseComplimentMate);
+ }else if(a.equals("rcomp") || a.equals("rc")){
+ reverseCompliment=Tools.parseBoolean(b);
+ outstream.println("Set RCOMP to "+reverseCompliment);
+ }else if(a.equals("deleteempty") || a.equals("deletempty") || a.equals("delempty") || a.equals("def")){
+ deleteEmptyFiles=Tools.parseBoolean(b);
+ }else if(a.equals("mappedonly")){
+ mappedOnly=Tools.parseBoolean(b);
+ }else if(a.equals("unmappedonly")){
+ unmappedOnly=Tools.parseBoolean(b);
+ }else if(a.equals("requiredbits") || a.equals("rbits")){
+ requiredBits=Tools.parseIntHexDecOctBin(b);
+ }else if(a.equals("filterbits") || a.equals("fbits")){
+ filterBits=Tools.parseIntHexDecOctBin(b);
+ }else if(a.equals("primaryonly")){
+ primaryOnly=Tools.parseBoolean(b);
+ }else if(a.equals("remap1")){
+ remap1=Tools.parseRemap(b);
+ }else if(a.equals("remap2")){
+ remap2=Tools.parseRemap(b);
+ }else if(a.equals("remap")){
+ remap1=remap2=Tools.parseRemap(b); //Both fields share the one array returned here
+ }else if(a.equals("skipreads")){
+ skipreads=Tools.parseKMG(b);
+ }else if(a.equals("undefinedton") || a.equals("iupacton") || a.equals("itn")){
+ iupacToN=Tools.parseBoolean(b);
+ }else if(a.equals("quantize")){
+ if(b==null || b.length()<1){b="t";} //A bare "quantize" means true
+ if(Character.isLetter(b.charAt(0))){ //A value starting with a letter is a boolean; anything else is a comma-separated list
+ quantizeQuality=Tools.parseBoolean(b);
+ }else{
+ quantizeQuality=true;
+ quantizeArray=Tools.parseByteArray(b, ",");
+ }
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ //A bare first argument naming stdin or an existing file is the input
+ parser.in1=arg;
+ }else{
+ System.err.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //With assertions disabled an unknown parameter is only reported on stderr
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ samplerate=parser.samplerate;
+ sampleseed=parser.sampleseed;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ testsize=parser.testsize;
+ trimBadSequence=parser.trimBadSequence;
+ breakLength=parser.breakLength;
+ stoptag=SamLine.MAKE_STOP_TAG; //Taken from the SamLine static rather than from parser
+
+ forceTrimModulo=parser.forceTrimModulo;
+ forceTrimLeft=parser.forceTrimLeft;
+ forceTrimRight=parser.forceTrimRight;
+ forceTrimRight2=parser.forceTrimRight2;
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ trimq=parser.trimq;
+ minAvgQuality=parser.minAvgQuality;
+ minAvgQualityBases=parser.minAvgQualityBases;
+ chastityFilter=parser.chastityFilter;
+ failBadBarcodes=parser.failBadBarcodes;
+ removeBadBarcodes=parser.removeBadBarcodes;
+ failIfNoBarcode=parser.failIfNoBarcode;
+ barcodes=parser.barcodes;
+ maxNs=parser.maxNs;
+ minConsecutiveBases=parser.minConsecutiveBases;
+ minReadLength=parser.minReadLength;
+ maxReadLength=parser.maxReadLength;
+ minLenFraction=parser.minLenFraction;
+ requireBothBad=parser.requireBothBad;
+ minGC=parser.minGC;
+ maxGC=parser.maxGC;
+ filterGC=(minGC>0 || maxGC<1); //GC filtering is on only if a bound is tighter than the range 0..1
+ tossJunk=parser.tossJunk;
+ recalibrateQuality=parser.recalibrateQuality;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ outsingle=parser.outsingle;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+
+ loglog=(parser.loglog ? new LogLog(parser) : null);
+ }
+
+ if(recalibrateQuality){CalcTrueQuality.initializeMatrices();}
+
+ if(SamLine.setxs && !SamLine.setintron){SamLine.INTRON_LIMIT=10;} //Default intron limit of 10 when xs was set but intron was not
+ qtrim=qtrimLeft||qtrimRight;
+
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ //'#' expands to 1 and 2 unless a file with that literal name exists
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){ //Same expansion for the output name, without the existence check
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+
+ if(out1!=null && in1!=null && out1.indexOf('%')>-1){ //'%' in an output name is replaced with ReadWrite.stripExtension of the input name
+ out1=out1.replace("%", ReadWrite.stripExtension(in1));
+ }
+ if(out2!=null && out2.indexOf('%')>-1){
+ if(in2!=null){
+ out2=out2.replace("%", ReadWrite.stripExtension(in2));
+ }else if(in1!=null){ //Falls back to the first input name when there is no second input
+ out2=out2.replace("%", ReadWrite.stripExtension(in1));
+ }
+ }
+
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(verifyinterleaving || (verifypairing && in2==null)){ //Verifying interleaving, or pairing with a single input, forces interleaved parsing
+ verifypairing=true;
+ setInterleaved=true;
+// if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ //No mode forced and more than 2 threads: select BF2
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ if(!parser.setOut){ //Only a warning; construction continues without an output stream
+ System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+// out1="stdout";
+ }
+ }
+
+ if(!setInterleaved){ //Interleaving was not set explicitly: derive it from the number of inputs and outputs
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){ //One input but two outputs: the input is treated as interleaved
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} //The literal name "null" means no output
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+ if(outsingle!=null && outsingle.equalsIgnoreCase("null")){outsingle=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outsingle)){
+ System.err.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+ if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2, outsingle) || !ReadStats.testFiles(false)){
+ throw new RuntimeException("Duplicate filenames are not allowed.");
+ }
+
+ FASTQ.PARSE_CUSTOM=parsecustom;
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false); //FASTQ is the default format passed for every input and output
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffoutsingle=FileFormat.testOutput(outsingle, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+ assert(ReadStats.testFiles(true)) : "Existing output files specified, but overwrite==false";
+ assert(ReadStats.testFiles(false)) : "Duplicate or output files specified";
+
+// System.err.println("\n"+ReadWrite.USE_PIGZ+", "+ReadWrite.USE_UNPIGZ+", "+Data.PIGZ()+", "+Data.UNPIGZ()+", "+ffin1+"\n");
+// assert(false) : ReadWrite.USE_PIGZ+", "+ReadWrite.USE_UNPIGZ+", "+Data.PIGZ()+", "+Data.UNPIGZ()+", "+ffin1;
+
+ nameMap1=(uniqueNames ? new HashMap<String, Integer>() : null); //Name maps exist only when uniqueNames is set
+ nameMap2=(uniqueNames ? new HashMap<String, Integer>() : null);
+
+ qualityRemapArray=makeQualityRemapArray(quantizeArray);
+ }
+
+ void process(Timer t){
+
+ long readsRemaining=0;
+ long basesRemaining=0;
+
+ if(sampleReadsExact || sampleBasesExact){
+ long[] counts=countReads(maxReads);
+ readsRemaining=counts[0];
+ basesRemaining=counts[2];
+ setSampleSeed(sampleseed);
+ }
+
+
+ final ConcurrentReadInputStream cris;
+ {
+ useSharedHeader=(ffin1.samOrBam() && ffout1!=null && ffout1.samOrBam());
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, useSharedHeader, ffin1, ffin2, qfin1, qfin2);
+ cris.setSampleRate(samplerate, sampleseed);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+ boolean paired=cris.paired();
+// if(verbose){
+ System.err.println("Input is being processed as "+(paired ? "paired" : "unpaired"));
+// }
+
+ assert(!paired || breakLength<1) : "Paired input cannot be broken with 'breaklength'";
+
+ final ConcurrentReadOutputStream ros;
+ if(ffout1!=null){
+ final int buff=4;
+
+ if(cris.paired() && ffout2==null && ffout1!=null && !ffout1.samOrBam()){
+ outstream.println("Writing interleaved.");
+ }
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, useSharedHeader);
+ ros.start();
+ }else{ros=null;}
+
+ final ConcurrentReadOutputStream rosb;
+ if(ffoutsingle!=null){
+ final int buff=4;
+
+ rosb=ConcurrentReadOutputStream.getStream(ffoutsingle, null, buff, null, useSharedHeader);
+ rosb.start();
+ }else{rosb=null;}
+ final boolean discardTogether=(!paired || (ffoutsingle==null && !requireBothBad));
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ //Only used with deleteEmptyFiles flag
+ long readsOut1=0;
+ long readsOut2=0;
+ long readsOutSingle=0;
+
+ long basesOut1=0;
+ long basesOut2=0;
+ long basesOutSingle=0;
+
+ long basesFTrimmedT=0;
+ long readsFTrimmedT=0;
+
+ long basesQTrimmedT=0;
+ long readsQTrimmedT=0;
+
+ long lowqBasesT=0;
+ long lowqReadsT=0;
+
+ long badGcBasesT=0;
+ long badGcReadsT=0;
+
+ long readShortDiscardsT=0;
+ long baseShortDiscardsT=0;
+
+ long unmappedReadsT=0;
+ long unmappedBasesT=0;
+
+ long basesSwappedT=0;
+ long readsSwappedT=0;
+
+ final boolean MAKE_QHIST=ReadStats.COLLECT_QUALITY_STATS;
+ final boolean MAKE_QAHIST=ReadStats.COLLECT_QUALITY_ACCURACY;
+ final boolean MAKE_MHIST=ReadStats.COLLECT_MATCH_STATS;
+ final boolean MAKE_BHIST=ReadStats.COLLECT_BASE_STATS;
+
+ final boolean MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS;
+ final boolean MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS;
+ final boolean MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS;
+ final boolean MAKE_GCHIST=ReadStats.COLLECT_GC_STATS;
+ final boolean MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS;
+
+ final ReadStats readstats=(MAKE_QHIST || MAKE_MHIST || MAKE_BHIST || MAKE_QAHIST || MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST) ?
+ new ReadStats() : null;
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+// System.err.println("Fetched "+reads);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+
+ if(skipreads>0){
+ int removed=0;
+ for(int i=0; i<reads.size(); i++){
+ Read r=reads.get(i);
+ if(r.numericID<skipreads){
+ reads.set(i, null);
+ removed++;
+ }else{
+ skipreads=-1;
+ break;
+ }
+ }
+ if(removed>0){
+ Tools.condenseStrict(reads);
+ }
+ }
+
+ ArrayList<Read> singles=(rosb==null ? null : new ArrayList<Read>(32));
+
+ if(breakLength>0){
+ breakReads(reads, breakLength, minReadLength);
+ }
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+ final Read r2=r1.mate;
+
+ final int initialLength1=r1.length();
+ final int initialLength2=(r1.mateLength());
+
+ final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength);
+ final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength);
+
+ if(readstats!=null){
+ if(MAKE_QHIST){readstats.addToQualityHistogram(r1);}
+ if(MAKE_BHIST){readstats.addToBaseHistogram(r1);}
+ if(MAKE_MHIST){readstats.addToMatchHistogram(r1);}
+ if(MAKE_QAHIST){readstats.addToQualityAccuracy(r1);}
+
+ if(MAKE_EHIST){readstats.addToErrorHistogram(r1);}
+ if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);}
+ if(MAKE_LHIST){readstats.addToLengthHistogram(r1);}
+ if(MAKE_GCHIST){readstats.addToGCHistogram(r1);}
+ if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);}
+ }
+
+ if(loglog!=null){loglog.hash(r1);}
+
+ {
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ if(reverseCompliment){r1.reverseComplement();}
+ }
+ if(r2!=null){
+ readsProcessed++;
+ basesProcessed+=initialLength2;
+ if(reverseCompliment || reverseComplimentMate){r2.reverseComplement();}
+ }
+
+ if(verifypairing){
+ String s1=r1==null ? null : r1.id;
+ String s2=r2==null ? null : r2.id;
+ boolean b=FASTQ.testPairNames(s1, s2, allowIdenticalPairNames);
+ if(!b){
+ outstream.println("Names do not appear to be correctly paired.\n"+s1+"\n"+s2+"\n");
+ ReadWrite.closeStreams(cris, ros);
+ System.exit(1);
+ }
+ }
+
+ if(tossJunk){
+ if(r1!=null && r1.junk()){
+ lowqBasesT+=r1.length();
+ lowqReadsT++;
+ r1.setDiscarded(true);
+ }
+ if(r2!=null && r2.junk()){
+ lowqBasesT+=r2.length();
+ lowqReadsT++;
+ r2.setDiscarded(true);
+ }
+ }
+
+ if(iupacToN){
+ if(r1!=null){r1.convertUndefinedTo((byte)'N');}
+ if(r2!=null){r2.convertUndefinedTo((byte)'N');}
+ }
+
+ if(remap1!=null && r1!=null){
+ int swaps=r1.remapAndCount(remap1);
+ if(swaps>0){
+ basesSwappedT+=swaps;
+ readsSwappedT++;
+ }
+ }
+ if(remap2!=null && r2!=null){
+ int swaps=r2.remapAndCount(remap2);
+ if(swaps>0){
+ basesSwappedT+=swaps;
+ readsSwappedT++;
+ }
+ }
+
+ if(trimBadSequence){//Experimental
+ if(r1!=null){
+ int x=TrimRead.trimBadSequence(r1);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r2!=null){
+ int x=TrimRead.trimBadSequence(r2);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ }
+
+ if(chastityFilter){
+ if(r1!=null && r1.failsChastity()){
+ lowqBasesT+=r1.length()+r1.mateLength();
+ lowqReadsT+=1+r1.mateCount();
+ r1.setDiscarded(true);
+ if(r2!=null){r2.setDiscarded(true);}
+ }
+ }
+
+ if(removeBadBarcodes){
+ if(r1!=null && !r1.discarded() && r1.failsBarcode(barcodes, failIfNoBarcode)){
+ if(failBadBarcodes){KillSwitch.kill("Invalid barcode detected: "+r1.id+"\nThis can be disabled with the flag barcodefilter=f");}
+ lowqBasesT+=r1.length()+r1.mateLength();
+ lowqReadsT+=1+r1.mateCount();
+ r1.setDiscarded(true);
+ if(r2!=null){r2.setDiscarded(true);}
+ }
+ }
+
+ if(filterBits!=0 || requiredBits!=0){
+ if(r1!=null && !r1.discarded()){
+ assert(r1.obj!=null && r1.obj.getClass().equals(SamLine.class)) : "filterbits and requiredbits only work on sam/bam input.";
+ SamLine sl=(SamLine)r1.obj;
+ if(((sl.flag&filterBits)!=0) || ((sl.flag&requiredBits)!=requiredBits)){
+ r1.setDiscarded(true);
+ unmappedBasesT+=initialLength1;
+ unmappedReadsT++;
+ }
+ }
+ if(r2!=null && !r2.discarded()){
+ assert(r2.obj!=null && r2.obj.getClass().equals(SamLine.class)) : "filterbits and requiredbits only work on sam/bam input.";
+ SamLine sl=(SamLine)r2.obj;
+ if(((sl.flag&filterBits)!=0) || ((sl.flag&requiredBits)!=requiredBits)){
+ r2.setDiscarded(true);
+ unmappedBasesT+=initialLength1;
+ unmappedReadsT++;
+ }
+ }
+ }
+
+ if(SamLine.VERSION==1.3f){
+ if(r1!=null && !r1.discarded()){
+ assert(r1.obj!=null && r1.obj.getClass().equals(SamLine.class)) : "filterbits and requiredbits only work on sam/bam input.";
+ SamLine sl=(SamLine)r1.obj;
+ sl.cigar=SamLine.toCigar13(sl.cigar);
+ }
+ if(r2!=null && !r2.discarded()){
+ assert(r2.obj!=null && r2.obj.getClass().equals(SamLine.class)) : "filterbits and requiredbits only work on sam/bam input.";
+ SamLine sl=(SamLine)r2.obj;
+ sl.cigar=SamLine.toCigar13(sl.cigar);
+ }
+ }
+
+ if(stoptag){
+ if(r1!=null && !r1.discarded()){
+ assert(r1.obj!=null && r1.obj.getClass().equals(SamLine.class)) : "stoptag only works on sam/bam input.";
+ SamLine sl=(SamLine)r1.obj;
+ if(sl.mapped() && sl.cigar!=null){
+ if(sl.optional==null){sl.optional=new ArrayList<String>(2);}
+ sl.optional.add(SamLine.makeStopTag(sl.pos, sl.calcCigarLength(false, false), sl.cigar, r1.perfect()));
+ }
+ }
+ if(r2!=null && !r2.discarded()){
+ assert(r2.obj!=null && r2.obj.getClass().equals(SamLine.class)) : "stoptag only works on sam/bam input.";
+ SamLine sl=(SamLine)r2.obj;
+ if(sl.mapped() && sl.cigar!=null){
+ if(sl.optional==null){sl.optional=new ArrayList<String>(2);}
+ sl.optional.add(SamLine.makeStopTag(sl.pos, sl.calcCigarLength(false, false), sl.cigar, r2.perfect()));
+ }
+ }
+ }
+
+ if(mappedOnly){
+ if(r1!=null && !r1.discarded() && (!r1.mapped() || r1.bases==null || r1.secondary())){
+ r1.setDiscarded(true);
+ unmappedBasesT+=initialLength1;
+ unmappedReadsT++;
+ }
+ if(r2!=null && !r2.discarded() && (!r2.mapped() || r2.bases==null || r2.secondary())){
+ r2.setDiscarded(true);
+ unmappedBasesT+=initialLength2;
+ unmappedReadsT++;
+ }
+ }else if(unmappedOnly){
+ if(r1!=null && (r1.mapped() || r1.bases==null || r1.secondary())){
+ r1.setDiscarded(true);
+ unmappedBasesT+=initialLength1;
+ unmappedReadsT++;
+ }
+ if(r2!=null && (r2.mapped() || r2.bases==null || r2.secondary())){
+ r2.setDiscarded(true);
+ unmappedBasesT+=initialLength2;
+ unmappedReadsT++;
+ }
+ }
+
+ if(primaryOnly){
+ if(r1!=null && (r1.bases==null || r1.secondary())){
+ r1.setDiscarded(true);
+ unmappedBasesT+=initialLength1;
+ unmappedReadsT++;
+ }
+ if(r2!=null && (r2.bases==null || r2.secondary())){
+ r2.setDiscarded(true);
+ unmappedBasesT+=initialLength2;
+ unmappedReadsT++;
+ }
+ }
+
+ if(filterGC && (initialLength1>0 || initialLength2>0)){
+ final float gc;
+ if(r2==null){
+ gc=r1.gc();
+ }else{
+ gc=(r1.gc()*initialLength1+r2.gc()*initialLength2)/(initialLength1+initialLength2);
+ }
+ if(gc<minGC || gc>maxGC){
+ if(r1!=null && !r1.discarded()){
+ r1.setDiscarded(true);
+ badGcBasesT+=initialLength1;
+ badGcReadsT++;
+ }
+ if(r2!=null && !r2.discarded()){
+ r2.setDiscarded(true);
+ badGcBasesT+=initialLength2;
+ badGcReadsT++;
+ }
+ }
+ }
+
+ if(recalibrateQuality){
+ if(r1!=null && !r1.discarded()){
+ CalcTrueQuality.recalibrate(r1);
+ }
+ if(r2!=null && !r2.discarded()){
+ CalcTrueQuality.recalibrate(r2);
+ }
+ }
+
+ if(quantizeQuality){
+ final byte[] quals1=r1.quality, quals2=(r2==null ? null : r2.quality);
+ if(quals1!=null){
+ for(int i=0; i<quals1.length; i++){
+ quals1[i]=qualityRemapArray[quals1[i]];
+ }
+ }
+ if(quals2!=null){
+ for(int i=0; i<quals2.length; i++){
+ quals2[i]=qualityRemapArray[quals2[i]];
+ }
+ }
+ }
+
+ if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimModulo>0 || forceTrimRight2>0){
+ if(r1!=null && !r1.discarded()){
+ final int len=r1.length();
+ final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+ final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+ final int b1=forceTrimRight>0 ? forceTrimRight : len;
+ final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+ final int b=Tools.min(b0, b1, b2);
+ final int x=TrimRead.trimToPosition(r1, a, b, 1);
+ basesFTrimmedT+=x;
+ readsFTrimmedT+=(x>0 ? 1 : 0);
+ if(r1.length()<minlen1){r1.setDiscarded(true);}
+ }
+ if(r2!=null && !r2.discarded()){
+ final int len=r2.length();
+ final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+ final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+ final int b1=forceTrimRight>0 ? forceTrimRight : len;
+ final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+ final int b=Tools.min(b0, b1, b2);
+ final int x=TrimRead.trimToPosition(r2, a, b, 1);
+ basesFTrimmedT+=x;
+ readsFTrimmedT+=(x>0 ? 1 : 0);
+ if(r2.length()<minlen2){r2.setDiscarded(true);}
+ }
+ }
+
+ if(qtrim){
+ if(r1!=null && !r1.discarded()){
+ int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r2!=null && !r2.discarded()){
+ int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+ basesQTrimmedT+=x;
+ readsQTrimmedT+=(x>0 ? 1 : 0);
+ }
+ }
+
+ if(minAvgQuality>0){
+ if(r1!=null && !r1.discarded() && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){
+ lowqBasesT+=r1.length();
+ lowqReadsT++;
+ r1.setDiscarded(true);
+ }
+ if(r2!=null && !r2.discarded() && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){
+ lowqBasesT+=r2.length();
+ lowqReadsT++;
+ r2.setDiscarded(true);
+ }
+ }
+
+ if(maxNs>=0){
+ if(r1!=null && !r1.discarded() && r1.countUndefined()>maxNs){
+ lowqBasesT+=r1.length();
+ lowqReadsT++;
+ r1.setDiscarded(true);
+ }
+ if(r2!=null && !r2.discarded() && r2.countUndefined()>maxNs){
+ lowqBasesT+=r2.length();
+ lowqReadsT++;
+ r2.setDiscarded(true);
+ }
+ }
+
+ if(minConsecutiveBases>0){
+ if(r1!=null && !r1.discarded() && !r1.hasMinConsecutiveBases(minConsecutiveBases)){
+ lowqBasesT+=r1.length();
+ lowqReadsT++;
+ r1.setDiscarded(true);
+ }
+ if(r2!=null && !r2.discarded() && !r2.hasMinConsecutiveBases(minConsecutiveBases)){
+ lowqBasesT+=r2.length();
+ lowqReadsT++;
+ r2.setDiscarded(true);
+ }
+ }
+
+ if(minlen1>0 || minlen2>0 || maxReadLength>0){
+// assert(false) : minlen1+", "+minlen2+", "+maxReadLength+", "+r1.length();
+ if(r1!=null && !r1.discarded()){
+ int rlen=r1.length();
+ if(rlen<minlen1 || (maxReadLength>0 && rlen>maxReadLength)){
+ r1.setDiscarded(true);
+ readShortDiscardsT++;
+ baseShortDiscardsT+=rlen;
+ }
+ }
+ if(r2!=null && !r2.discarded()){
+ int rlen=r2.length();
+ if(rlen<minlen1 || (maxReadLength>0 && rlen>maxReadLength)){
+ r2.setDiscarded(true);
+ readShortDiscardsT++;
+ baseShortDiscardsT+=rlen;
+ }
+ }
+ }
+
+ boolean remove=false;
+ if(r2==null){
+ remove=r1.discarded();
+ }else{
+ remove=requireBothBad ? (r1.discarded() && r2.discarded()) : (r1.discarded() || r2.discarded());
+ }
+
+ if(remove){reads.set(idx, null);}
+ else if(uniqueNames || addunderscore || addslash){
+
+ if(r1.id==null){r1.id=""+r1.numericID;}
+ if(r2!=null && r2.id==null){r2.id=r1.id;}
+
+ if(uniqueNames){
+
+ {
+ Integer v=nameMap1.get(r1.id);
+ if(v==null){
+ nameMap1.put(r1.id, 1);
+ }else{
+ v++;
+ nameMap1.put(r1.id, v);
+ r1.id=r1.id+"_"+v;
+ }
+ }
+ if(r2!=null){
+ Integer v=nameMap2.get(r2.id);
+ if(v==null){
+ nameMap2.put(r2.id, 1);
+ }else{
+ v++;
+ nameMap2.put(r2.id, v);
+ r2.id=r2.id+"_"+v;
+ }
+ }
+ }
+ if(addunderscore){
+ r1.id=Tools.whitespace.matcher(r1.id).replaceAll("_");
+ if(r2!=null){r2.id=Tools.whitespace.matcher(r2.id).replaceAll("_");}
+ }
+ if(addslash){
+ if(!r1.id.contains(slash1)){r1.id+=slash1;}
+ if(r2!=null){
+ if(!r2.id.contains(slash2)){r2.id+=slash2;}
+ }
+ }
+ }
+
+ if(singles!=null){
+ if(r1.discarded() || (r2!=null && r2.discarded())){
+ if(!r1.discarded()){
+ Read r=r1.clone();
+ r.mate=null;
+ r.setPairnum(0);
+ singles.add(r);
+ }else if(r2!=null && !r2.discarded()){
+ Read r=r2.clone();
+ r.mate=null;
+ r.setPairnum(0);
+ singles.add(r);
+ }
+ }
+ }
+ }
+
+ final ArrayList<Read> listOut;
+
+// assert(false) : sampleReadsExact+", "+sampleBasesExact;
+ if(sampleReadsExact || sampleBasesExact){
+ listOut=new ArrayList<Read>();
+ if(sampleReadsExact){
+ for(Read r : reads){
+ if(r!=null){
+ assert(readsRemaining>0) : readsRemaining;
+ double prob=sampleReadsTarget/(double)(readsRemaining);
+// System.err.println("sampleReadsTarget="+sampleReadsTarget+", readsRemaining="+readsRemaining+", prob="+prob);
+ if(randy.nextDouble()<prob){
+ listOut.add(r);
+ sampleReadsTarget--;
+ }
+ }
+ readsRemaining--;
+ }
+ }else if(sampleBasesExact){
+ for(Read r : reads){
+ if(r!=null){
+ assert(basesRemaining>0) : basesRemaining;
+ int bases=r.length()+(r.mate==null ? 0 : r.mateLength());
+ double prob=sampleBasesTarget/(double)(basesRemaining);
+ if(randy.nextDouble()<prob){
+ listOut.add(r);
+ sampleBasesTarget-=bases;
+ }
+ basesRemaining-=bases;
+ }
+ }
+ }
+ }else{
+ listOut=reads;
+ }
+// if(deleteEmptyFiles){
+ for(Read r : listOut){
+ if(r!=null){
+ readsOut1++;
+ basesOut1+=r.length();
+ if(r.mate!=null){
+ readsOut2++;
+ basesOut2+=r.mateLength();
+ }
+ }
+ }
+ if(singles!=null){
+ for(Read r : singles){
+ if(r!=null){
+ readsOutSingle++;
+ basesOutSingle+=r.length();
+ }
+ }
+ }
+// }
+ if(ros!=null){ros.add(listOut, ln.id);}
+ if(rosb!=null){rosb.add(singles, ln.id);}
+
+ cris.returnList(ln.id, false);
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+// cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ assert(ln.list.isEmpty());
+ cris.returnList(ln.id, true);
+ }
+ }
+
+ errorState|=ReadStats.writeAll();
+
+ errorState|=ReadWrite.closeStreams(cris, ros, rosb);
+
+ if(deleteEmptyFiles){
+ deleteEmpty(readsOut1, readsOut2, readsOutSingle);
+ }
+
+// System.err.println(cris.errorState()+", "+(ros==null ? "null" : (ros.errorState()+", "+ros.finishedSuccessfully())));
+// if(ros!=null){
+// ReadStreamWriter rs1=ros.getRS1();
+// ReadStreamWriter rs2=ros.getRS2();
+// System.err.println(rs1==null ? "null" : rs1.finishedSuccessfully());
+// System.err.println(rs2==null ? "null" : rs2.finishedSuccessfully());
+// }
+
+ t.stop();
+
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ final long rawReadsIn=cris.readsIn(), rawBasesIn=cris.basesIn();
+ final double rmult=100.0/rawReadsIn, bmult=100.0/rawBasesIn;
+ final double rpmult=100.0/readsProcessed, bpmult=100.0/basesProcessed;
+
+ outstream.println("Input: \t"+cris.readsIn()+" reads \t"+
+ cris.basesIn()+" bases");
+ if(samplerate!=1f){
+ outstream.println("Processed: \t"+readsProcessed+" reads \t"+
+ basesProcessed+" bases");
+ }
+
+ if(remap1!=null || remap2!=null){
+ outstream.println("Base Transforms: \t"+readsSwappedT+" reads ("+String.format("%.2f",readsSwappedT*rpmult)+"%) \t"+
+ basesSwappedT+" bases ("+String.format("%.2f",basesSwappedT*bpmult)+"%)");
+ }
+ if(qtrim || trimBadSequence){
+ outstream.println("QTrimmed: \t"+readsQTrimmedT+" reads ("+String.format("%.2f",readsQTrimmedT*rpmult)+"%) \t"+
+ basesQTrimmedT+" bases ("+String.format("%.2f",basesQTrimmedT*bpmult)+"%)");
+ }
+ if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
+ outstream.println("FTrimmed: \t"+readsFTrimmedT+" reads ("+String.format("%.2f",readsFTrimmedT*rpmult)+"%) \t"+
+ basesFTrimmedT+" bases ("+String.format("%.2f",basesFTrimmedT*bpmult)+"%)");
+ }
+ if(minReadLength>0 || maxReadLength>0){
+ outstream.println("Short Read Discards: \t"+readShortDiscardsT+" reads ("+String.format("%.2f",readShortDiscardsT*rpmult)+"%) \t"+
+ baseShortDiscardsT+" bases ("+String.format("%.2f",baseShortDiscardsT*bpmult)+"%)");
+ }
+ if(minAvgQuality>0 || maxNs>=0 || chastityFilter || tossJunk || removeBadBarcodes){
+ outstream.println("Low quality discards: \t"+lowqReadsT+" reads ("+String.format("%.2f",lowqReadsT*rpmult)+"%) \t"+
+ lowqBasesT+" bases ("+String.format("%.2f",lowqBasesT*bpmult)+"%)");
+ }
+ if(filterGC){
+ outstream.println("GC content discards: \t"+badGcReadsT+" reads ("+String.format("%.2f",badGcReadsT*rpmult)+"%) \t"+
+ badGcBasesT+" bases ("+String.format("%.2f",badGcBasesT*bpmult)+"%)");
+ }
+ final long ro=readsOut1+readsOut2+readsOutSingle, bo=basesOut1+basesOut2+basesOutSingle;
+ outstream.println("Output: \t"+ro+" reads ("+String.format("%.2f",ro*rmult)+"%) \t"+
+ bo+" bases ("+String.format("%.2f",bo*bmult)+"%)");
+
+ if(loglog!=null){
+ outstream.println("Unique "+loglog.k+"-mers: \t"+loglog.cardinality());
+ }
+
+ outstream.println("\nTime: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ if(testsize){
+ long bytesProcessed=(new File(in1).length()+(in2==null ? 0 : new File(in2).length())+
+ (qfin1==null ? 0 : new File(qfin1).length())+(qfin2==null ? 0 : new File(qfin2).length()));//*passes
+ double xpnano=bytesProcessed/(double)(t.elapsed);
+ String xpstring=(bytesProcessed<100000 ? ""+bytesProcessed : bytesProcessed<100000000 ? (bytesProcessed/1000)+"k" : (bytesProcessed/1000000)+"m");
+ while(xpstring.length()<8){xpstring=" "+xpstring;}
+ outstream.println("Bytes Processed: "+xpstring+" \t"+String.format("%.2fm bytes/sec", xpnano*1000));
+ }
+
+ if(verifypairing){
+ outstream.println("Names appear to be correctly paired.");
+ }
+
+ if(errorState){
+ throw new RuntimeException("ReformatReads terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void deleteEmpty(long readsOut1, long readsOut2, long readsOutSingle){
+ deleteEmpty(readsOut1, ffout1, qfout1);
+ deleteEmpty(readsOut2, ffout2, qfout2);
+ deleteEmpty(readsOutSingle, ffoutsingle, null);
+ }
+
+ private void deleteEmpty(long count, FileFormat ff, String qf){
+ try {
+ if(ff!=null && count<1){
+ String s=ff.name();
+ if(s!=null && !ff.stdio() && !ff.devnull()){
+ File f=new File(ff.name());
+ if(f.exists()){
+ f.delete();
+ }
+ }
+ if(qf!=null){
+ File f=new File(qf);
+ if(f.exists()){
+ f.delete();
+ }
+ }
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+
+	/**
+	 * Splits reads longer than max into consecutive pieces of at most max bases, and
+	 * drops reads and pieces shorter than min.  The list is rewritten in place.
+	 * Pieces are named "&lt;id&gt;_&lt;n&gt;" with n counting from 1, are constructed with the
+	 * parent's numericID and flags, and are then marked unmapped.
+	 * Returns without touching the list when no read falls outside the allowed range.
+	 * @param list Reads to process; null entries are removed when the list is rewritten
+	 * @param max Longest allowed read; values below 1 disable splitting
+	 * @param min Shortest allowed read or piece; negative values are treated as 0
+	 */
+	public static void breakReads(ArrayList<Read> list, final int max, int min){
+		if(!containsReadsOutsideSizeRange(list, min, max)){return;}
+		assert(max>0 || min>0) : "min or max read length must be positive.";
+		assert(max<1 || max>=min) : "max read length must be at least min read length: "+max+"<"+min;
+		min=Tools.max(0, min);
+
+		ArrayList<Read> temp=new ArrayList<Read>(list.size()*2);
+		for(Read r : list){
+			if(r==null){continue;}
+			if(r.bases==null){
+				temp.add(r);
+			}else if(r.length()<min){
+				//Too short; dropped.
+			}else if(max<1 || r.length()<=max){
+				temp.add(r);
+			}else{
+				final byte[] bases=r.bases;
+				final byte[] quals=r.quality;
+				final String name=r.id;
+				//Emit a piece while at least one base, and at least min bases, remain.
+				//The earlier bound (start<bases.length-min) discarded a final piece of exactly
+				//min bases, even though a whole read of exactly min bases is kept.
+				for(int num=1, start=0, stop=max; start<bases.length && bases.length-start>=min; num++, start+=max, stop+=max){
+					if(verbose){
+						System.err.println(bases.length+", "+start+", "+stop);
+						if(quals!=null){System.err.println(quals.length+", "+start+", "+stop);}
+					}
+					stop=Tools.min(stop, bases.length);
+					byte[] b2=Arrays.copyOfRange(bases, start, stop);
+					byte[] q2=(quals==null ? null : Arrays.copyOfRange(quals, start, stop));
+					String n2=name+"_"+num;
+					Read r2=new Read(b2, -1, -1, -1, n2, q2, r.numericID, r.flags);
+					r2.setMapped(false);
+					temp.add(r2);
+				}
+			}
+		}
+		list.clear();
+		list.addAll(temp);
+	}
+
+	/**
+	 * Reports whether any read with bases is longer than size.
+	 * With assertions enabled, the first such read must be unpaired.
+	 * @param list Reads to scan; null entries and reads without bases are skipped
+	 * @param size Length limit
+	 * @return true as soon as a read longer than size is found, otherwise false
+	 */
+	private static boolean containsReadsAboveSize(ArrayList<Read> list, int size){
+		for(Read r : list){
+			if(r==null || r.bases==null || r.length()<=size){continue;}
+			assert(r.mate==null) : "Read of length "+r.length()+">"+size+". Paired input is incompatible with 'breaklength'";
+			return true;
+		}
+		return false;
+	}
+
+ private static boolean containsReadsOutsideSizeRange(ArrayList<Read> list, int min, int max){
+ for(Read r : list){
+ if(r!=null && r.bases!=null){
+ if((max>0 && r.length()>max) || r.length()<min){
+ assert(r.mate==null) : "Read of length "+r.length()+" outside of range "+min+"-"+max+". Paired input is incompatible with 'breaklength'";
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+
+	/**
+	 * Reads the input files once to count their contents before the real pass.
+	 * Any error reported while closing the stream is folded into errorState.
+	 * @param maxReads Limit handed to the input stream
+	 * @return {list entries, individual reads including mates, total bases including mates}
+	 * @throws RuntimeException if the primary input is standard in, which cannot be read twice
+	 */
+	private long[] countReads(long maxReads){
+		if(ffin1.stdio()){
+			throw new RuntimeException("Can't precount reads from standard in, only from a file.");
+		}
+
+		final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+		if(verbose){System.err.println("Counting Reads");}
+		cris.start();
+
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+		long count=0, count2=0, bases=0;
+
+		while(reads!=null && reads.size()>0){
+			count+=reads.size();
+			for(Read r : reads){
+				bases+=r.length();
+				count2++;
+				if(r.mate!=null){
+					bases+=r.mateLength();
+					count2++;
+				}
+			}
+			cris.returnList(ln.id, ln.list.isEmpty());
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+		//nextList() is treated as nullable everywhere above, so the final list must be
+		//guarded too; previously a null here caused a NullPointerException.
+		if(ln!=null){
+			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+		}
+		errorState|=ReadWrite.closeStream(cris);
+		return new long[] {count, count2, bases};
+	}
+
+	/**
+	 * Builds a 128-entry lookup table that sends each quality value to the nearest level
+	 * in quantizeArray, as measured by Tools.absdif.
+	 * The search for each entry starts from 0, so 0 is always a candidate level,
+	 * and on a tie the level appearing later in quantizeArray wins.
+	 * @param quantizeArray Allowed quality levels
+	 * @return Table indexed by the original quality value
+	 */
+	public static final byte[] makeQualityRemapArray(byte[] quantizeArray) {
+		final byte[] remap=new byte[128];
+		int quality=0;
+		while(quality<remap.length){
+			byte nearest=0;
+			for(int j=0; j<quantizeArray.length; j++){
+				final byte level=quantizeArray[j];
+				if(Tools.absdif(level, quality)<=Tools.absdif(nearest, quality)){
+					nearest=level;
+				}
+			}
+			remap[quality]=nearest;
+			quality++;
+		}
+		return remap;
+	}
+
+ private void printOptions(){
+ outstream.println("Syntax:\n");
+ outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+ outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+ outstream.println("Other parameters and their defaults:\n");
+ outstream.println("overwrite=false \tOverwrites files that already exist");
+ outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+ outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+ outstream.println("fastawrap=70 \tLength of lines in fasta output");
+ outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+ outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+ outstream.println("outsingle=<file> \t(outs) Write singleton reads here, when conditionally discarding reads from pairs.");
+ }
+
+
+	/**
+	 * Replaces the random number generator used for sampling.
+	 * @param seed Seed for the new generator; any value below 0 leaves it with Random's default seeding
+	 */
+	public void setSampleSeed(long seed){
+		//new Random(seed) is specified to be equivalent to new Random() followed by setSeed(seed).
+		randy=(seed>-1 ? new Random(seed) : new Random());
+	}
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ private String out1=null;
+ private String out2=null;
+ private String outsingle=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ /** For calculating kmer cardinality */
+ private final LogLog loglog;
+
+ /** Tracks names to ensure no duplicate names. */
+ private final HashMap<String,Integer> nameMap1, nameMap2;
+ private boolean uniqueNames=false;
+
+ private boolean reverseComplimentMate=false;
+ private boolean reverseCompliment=false;
+ private boolean verifyinterleaving=false;
+ private boolean verifypairing=false;
+ private boolean allowIdenticalPairNames=true;
+ private boolean trimBadSequence=false;
+ private boolean chastityFilter=false;
+ /** Crash if a barcode is encountered that contains Ns or is not in the table */
+ private final boolean failBadBarcodes;
+ /** Remove reads with Ns in barcodes or that are not in the table */
+ private final boolean removeBadBarcodes;
+ /** Fail reads missing a barcode */
+ private final boolean failIfNoBarcode;
+ /** A set of valid barcodes; null if unused */
+ private final HashSet<String> barcodes;
+ private boolean deleteEmptyFiles=false;
+ private boolean mappedOnly=false;
+ private boolean unmappedOnly=false;
+ private boolean primaryOnly=false;
+ /** For sam file filtering: These bits must be set. */
+ private int requiredBits=0;
+ /** For sam file filtering: These bits must be unset */
+ private int filterBits=0;
+ /** Add /1 and /2 to paired reads */
+ private boolean addslash=false;
+ /** Change read name whitespace to underscores */
+ private boolean addunderscore=false;
+ private String slash1=" /1";
+ private String slash2=" /2";
+ private boolean stoptag=false;
+ private boolean iupacToN=false;
+
+
+ private long maxReads=-1;
+ private long skipreads=-1;
+ private float samplerate=1f;
+ private long sampleseed=-1;
+ private boolean sampleReadsExact=false;
+ private boolean sampleBasesExact=false;
+ private long sampleReadsTarget=0;
+ private long sampleBasesTarget=0;
+
+ /** Recalibrate quality scores using matrices */
+ private boolean recalibrateQuality=false;
+ private boolean qtrimRight=false;
+ private boolean qtrimLeft=false;
+ private final int forceTrimLeft;
+ private final int forceTrimRight;
+ private final int forceTrimRight2;
+ /** Trim right bases of the read modulo this value.
+ * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */
+ private final int forceTrimModulo;
+ private byte trimq=6;
+ private byte minAvgQuality=0;
+ private int minAvgQualityBases=0;
+ private int maxNs=-1;
+ private int minConsecutiveBases=0;
+ private int breakLength=0;
+ private int maxReadLength=0;
+ private int minReadLength=0;
+ private float minLenFraction=0;
+ private float minGC=0;
+ private float maxGC=1;
+ private boolean filterGC=false;
+ private boolean tossJunk=false;
+ /** Toss pair only if both reads are shorter than limit */
+ private boolean requireBothBad=false;
+
+ private boolean useSharedHeader;
+
+ private byte[] remap1=null, remap2=null;
+
+ private boolean quantizeQuality=false;
+
+ private byte[] quantizeArray={0, 8, 13, 22, 27, 32, 37};
+ private final byte[] qualityRemapArray;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+ private final FileFormat ffoutsingle;
+
+ private final boolean qtrim;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+ private boolean parsecustom=false;
+ private boolean testsize=false;
+
+ private Random randy;
+
+}
diff --git a/current/jgi/RemapQuality.java b/current/jgi/RemapQuality.java
new file mode 100755
index 0000000..15a2b9a
--- /dev/null
+++ b/current/jgi/RemapQuality.java
@@ -0,0 +1,115 @@
+package jgi;
+
+import stream.Read;
+import stream.SamLine;
+import dna.Timer;
+
+/**
+ * Changes quality scores to other quality scores.
+ * Without a map argument, qualities 2 through 41 are mirrored (q becomes 43-q).
+ * With map=a,b;c,d;... quality a is replaced by b, c by d, and so on;
+ * every quality not mentioned is left unchanged.
+ * @author Brian Bushnell
+ * @date Apr 27, 2015
+ *
+ */
+public class RemapQuality extends BBTool_ST {
+
+	/**
+	 * Code entrance from the command line.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		Timer t=new Timer();
+		RemapQuality bbt=new RemapQuality(args);
+		bbt.process(t);
+	}
+
+	void setDefaults(){}
+
+	/**
+	 * Builds the remapping table from the parsed arguments.
+	 * @param args Command line arguments
+	 * @throws IllegalArgumentException if the map specification is malformed
+	 */
+	public RemapQuality(String[] args) {
+		super(args);
+		SamLine.SET_FROM_OK=true;
+
+		//Start from the identity mapping for every possible byte value.
+		map=new byte[256];
+		for(int i=0; i<map.length; i++){
+			map[i]=(byte)i;
+		}
+
+		if(mapString==null){//reverse quality
+			for(int i=2; i<=41; i++){
+				map[i]=(byte)(43-i);
+			}
+		}else{
+			for(String pair : mapString.split(";")){
+				final String[] split=pair.split(",");
+				if(split.length<2){
+					throw new IllegalArgumentException("Bad map entry '"+pair+"'; expected <from>,<to> in map="+mapString);
+				}
+				final int a, b;
+				try {
+					a=Integer.parseInt(split[0]);
+					b=Integer.parseInt(split[1]);
+				} catch (NumberFormatException e) {
+					throw new IllegalArgumentException("Bad map entry '"+pair+"'; qualities must be integers in map="+mapString, e);
+				}
+				if(a<0 || a>=map.length){
+					throw new IllegalArgumentException("Bad map entry '"+pair+"'; source quality must be in 0-"+(map.length-1));
+				}
+				map[a]=(byte)b;
+			}
+		}
+	}
+
+	/**
+	 * Handles the arguments specific to this tool.
+	 * @param arg The full argument
+	 * @param a The argument's key
+	 * @param b The argument's value
+	 * @return true if the argument was recognized
+	 * @see jgi.BBTool_ST#parseArgument(java.lang.String, java.lang.String, java.lang.String)
+	 */
+	@Override
+	public boolean parseArgument(String arg, String a, String b){
+		if(a.equals("map")){
+			mapString=b;
+			return true;
+		}
+		return false;
+	}
+
+	/* (non-Javadoc)
+	 * @see jgi.BBTool_ST#startupSubclass()
+	 */
+	@Override
+	void startupSubclass() {
+		//Nothing to do.
+	}
+
+	/* (non-Javadoc)
+	 * @see jgi.BBTool_ST#shutdownSubclass()
+	 */
+	@Override
+	void shutdownSubclass() {
+		//Nothing to do.
+	}
+
+	/* (non-Javadoc)
+	 * @see jgi.BBTool_ST#showStatsSubclass(dna.Timer, long, long)
+	 */
+	@Override
+	void showStatsSubclass(Timer t, long readsIn, long basesIn) {
+		//No statistics of its own.
+	}
+
+	/**
+	 * Remaps the qualities of both reads in place.
+	 * @param r1 First read; may be null
+	 * @param r2 Second read; may be null
+	 * @return Always true
+	 * @see jgi.BBTool_ST#processReadPair(stream.Read, stream.Read)
+	 */
+	@Override
+	boolean processReadPair(Read r1, Read r2) {
+		if(r1!=null){applyMap(r1.quality);}
+		if(r2!=null){applyMap(r2.quality);}
+		return true;
+	}
+
+	/** Replaces each quality in the array with its mapped value; a null array is ignored. */
+	private void applyMap(final byte[] qual){
+		if(qual==null){return;}
+		for(int i=0; i<qual.length; i++){
+			//The table has 256 entries, so index it with the unsigned value of the byte;
+			//a raw negative byte would be an out-of-bounds index.
+			qual[i]=map[qual[i]&0xFF];
+		}
+	}
+
+	/**
+	 * Raw value of the map argument, or null when it was not given.
+	 * Deliberately has no initializer: the constructor reads it right after super(args),
+	 * which implies it is assigned via parseArgument during superclass construction,
+	 * and a field initializer would run after super(args) and overwrite that value.
+	 */
+	public String mapString;
+	/** Lookup table from original quality to replacement quality. */
+	public final byte[] map;
+
+}
diff --git a/current/jgi/RemoveBadBarcodes.java b/current/jgi/RemoveBadBarcodes.java
new file mode 100755
index 0000000..734fa9b
--- /dev/null
+++ b/current/jgi/RemoveBadBarcodes.java
@@ -0,0 +1,82 @@
+package jgi;
+
+import dna.AminoAcid;
+import dna.Timer;
+import stream.Read;
+
+/**
+ * Keeps only the read pairs whose name ends in a barcode, after the last colon,
+ * made up entirely of '+' and characters accepted by AminoAcid.isFullyDefined.
+ * @author Brian Bushnell
+ * @date Mar 16, 2015
+ *
+ */
+public class RemoveBadBarcodes extends BBTool_ST {
+
+	/*--------------------------------------------------------------*/
+	/*----------------        Initialization        ----------------*/
+	/*--------------------------------------------------------------*/
+
+	/**
+	 * Code entrance from the command line.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		final Timer timer=new Timer();
+		final RemoveBadBarcodes tool=new RemoveBadBarcodes(args);
+		tool.process(timer);
+	}
+
+	/**
+	 * @param args Command line arguments
+	 */
+	public RemoveBadBarcodes(String[] args) {
+		super(args);
+	}
+
+	void setDefaults(){}
+
+	/** This tool has no arguments of its own, so nothing is ever recognized here. */
+	@Override
+	public boolean parseArgument(String arg, String a, String b) {
+		return false;
+	}
+
+	/**
+	 * Classifies a pair by the barcode at the end of the first read's name.
+	 * @param r1 First read; its id is inspected
+	 * @param r2 Second read; not inspected
+	 * @return true if the barcode is present and fully defined
+	 */
+	@Override
+	boolean processReadPair(Read r1, Read r2) {
+		final String name=r1.id;
+		final int colon=(name==null ? -1 : name.lastIndexOf(':'));
+
+		//No colon, or nothing after the last colon
+		if(colon<0 || colon>=name.length()-1){
+			noBarcode++;
+			return false;
+		}
+
+		int pos=colon+1;
+		while(pos<name.length()){
+			final char c=name.charAt(pos);
+			if(c!='+' && !AminoAcid.isFullyDefined(c)){
+				bad++;
+				return false;
+			}
+			pos++;
+		}
+
+		good++;
+		return true;
+	}
+
+	@Override
+	void startupSubclass() {}
+
+	@Override
+	void shutdownSubclass() {}
+
+	/** Prints the three counters, preceded by a blank line. */
+	@Override
+	void showStatsSubclass(Timer t, long readsIn, long basesIn) {
+		outstream.println();
+		outstream.println("Good: "+good);
+		outstream.println("Bad: "+bad);
+		outstream.println("No Barcode: "+noBarcode);
+	}
+
+	/** Counts of pairs with a valid barcode, with an invalid barcode, and with no barcode. */
+	long good=0, bad=0, noBarcode=0;
+
+}
diff --git a/current/jgi/RenameReads.java b/current/jgi/RenameReads.java
new file mode 100755
index 0000000..3d49c51
--- /dev/null
+++ b/current/jgi/RenameReads.java
@@ -0,0 +1,364 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Renames reads: numbers them after a prefix, uses the prefix alone, prepends the prefix
+ * to the existing name, or names each pair after its insert size.
+ * @author Brian Bushnell
+ * @date Aug 23, 2013
+ *
+ */
+public class RenameReads {
+
+	/**
+	 * Code entrance from the command line.
+	 * @param args Command line arguments
+	 */
+	public static void main(String[] args){
+		Timer t=new Timer();
+		RenameReads rr=new RenameReads(args);
+		rr.process(t);
+	}
+
+	/** Points the user to the shellscript for usage, then exits the JVM with status 1. */
+	private void printOptions(){
+		System.err.println("See shellscript for usage information.");
+		System.exit(1);
+	}
+
+	/**
+	 * Parses the arguments and resolves the input and output files.
+	 * @param args Command line arguments, in key=value form
+	 * @throws RuntimeException if the output files cannot be written
+	 */
+	public RenameReads(String[] args){
+		if(args==null || args.length==0){
+			printOptions(); //Exits the JVM itself
+			System.exit(0);
+		}
+
+		for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+		outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+		Parser parser=new Parser();
+
+		Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+		Shared.capBuffers(4);
+		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+		ReadWrite.ZIP_THREAD_DIVISOR=1;
+
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+			if(Parser.isJavaFlag(arg)){
+				//jvm argument; do nothing
+			}else if(Parser.parseCommonStatic(arg, a, b)){
+				//do nothing
+			}else if(Parser.parseZip(arg, a, b)){
+				//do nothing
+			}else if(Parser.parseQuality(arg, a, b)){
+				//do nothing
+			}else if(Parser.parseFasta(arg, a, b)){
+				//do nothing
+			}else if(parser.parseInterleaved(arg, a, b)){
+				//do nothing
+			}else if(a.equals("passes")){
+				assert(false) : "'passes' is disabled.";
+			}else if(a.equals("verbose")){
+				verbose=Tools.parseBoolean(b);
+				ByteFile1.verbose=verbose;
+				ByteFile2.verbose=verbose;
+				stream.FastaReadInputStream.verbose=verbose;
+				ConcurrentGenericReadInputStream.verbose=verbose;
+				stream.FastqReadInputStream.verbose=verbose;
+				ReadWrite.verbose=verbose;
+			}else if(a.equals("reads") || a.equals("maxreads")){
+				maxReads=Tools.parseKMG(b);
+			}else if(a.equals("build") || a.equals("genome")){
+				Data.setGenome(Integer.parseInt(b));
+			}else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+				in1=b;
+			}else if(a.equals("prefix") || a.equals("p")){
+				//The trailing underscore is added after parsing, once prefixOnly is known.
+				prefix=b;
+			}else if(a.equals("in2") || a.equals("input2")){
+				in2=b;
+			}else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){
+				out1=b;
+			}else if(a.equals("out2") || a.equals("output2")){
+				out2=b;
+			}else if(a.equals("qfin") || a.equals("qfin1")){
+				qfin1=b;
+			}else if(a.equals("qfout") || a.equals("qfout1")){
+				qfout1=b;
+			}else if(a.equals("qfin2")){
+				qfin2=b;
+			}else if(a.equals("qfout2")){
+				qfout2=b;
+			}else if(a.equals("extin")){
+				extin=b;
+			}else if(a.equals("extout")){
+				extout=b;
+			}else if(a.equals("append") || a.equals("app")){
+				append=ReadStats.append=Tools.parseBoolean(b);
+			}else if(a.equals("overwrite") || a.equals("ow")){
+				overwrite=Tools.parseBoolean(b);
+			}else if(a.equals("renamebyinsert")){
+				renameByInsert=Tools.parseBoolean(b);
+			}else if(a.equals("renamebytrim")){
+				renameByTrim=Tools.parseBoolean(b);
+			}else if(a.equals("addprefix")){
+				addPrefix=Tools.parseBoolean(b);
+			}else if(a.equals("prefixonly")){
+				prefixOnly=Tools.parseBoolean(b);
+			}else if(a.startsWith("minscaf") || a.startsWith("mincontig")){
+				stream.FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
+			}else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+				in1=arg;
+			}else{
+				System.err.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+			}
+		}
+
+		//Read once, after every argument has been parsed.
+		renameByMapping=FASTQ.TAG_CUSTOM;
+
+		{//Process parser fields
+			Parser.processQuality();
+		}
+
+		if(prefix==null || prefix.length()<1){prefix="";}
+		else if(!prefix.endsWith("_") && !prefixOnly){
+			prefix=prefix+"_";
+		}
+
+		if(renameByInsert){
+			prefix="insert=";
+			FASTQ.PARSE_CUSTOM=true;
+		}else if(renameByTrim){
+			prefix="";
+			FASTQ.PARSE_CUSTOM=true;
+		}
+
+		//A '#' in a file name that does not exist stands for 1 and 2.
+		if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+			in2=in1.replace("#", "2");
+			in1=in1.replace("#", "1");
+		}
+		if(out1!=null && out2==null && out1.indexOf('#')>-1){
+			out2=out1.replace("#", "2");
+			out1=out1.replace("#", "1");
+		}
+		if(in2!=null){
+			if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+		}
+
+		assert(FastaReadInputStream.settingsOK());
+
+		if(in1==null){
+			printOptions(); //Exits the JVM itself
+			throw new RuntimeException("Error - at least one input file is required.");
+		}
+		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+			ByteFile.FORCE_MODE_BF2=true;
+		}
+
+		if(out1==null){
+			if(out2!=null){
+				printOptions(); //Exits the JVM itself
+				throw new RuntimeException("Error - cannot define out2 without defining out1.");
+			}
+			System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+		}
+
+		if(!parser.setInterleaved){
+			assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+			if(in2!=null){ //If there are 2 input streams.
+				FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+				outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+			}else{ //There is one input stream.
+				if(out2!=null){
+					FASTQ.FORCE_INTERLEAVED=true;
+					FASTQ.TEST_INTERLEAVED=false;
+					outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+				}
+			}
+		}
+
+		if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+		if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+		if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+		}
+
+		ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+		ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+		ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+		if(renameByMapping){
+			assert(ffout1==null || ffout1.fastq()) : "Currently renameByMapping requires fastq output.";
+		}
+	}
+
+	/**
+	 * Streams the reads through, assigns the new names and writes the result.
+	 * Any error reported while closing the streams is folded into errorState.
+	 * @param t Timer to stop and report once processing has finished
+	 */
+	void process(Timer t){
+
+		final ConcurrentReadInputStream cris;
+		{
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+			cris.start();
+		}
+
+		ConcurrentReadOutputStream ros=null;
+		if(out1!=null){
+			final int buff=4;
+
+			if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+				outstream.println("Writing interleaved.");
+			}
+
+			//Each output is compared with both inputs; the first check used to test in1 twice.
+			assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+			assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+			ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+			ros.start();
+		}
+
+		ListNum<Read> ln=cris.nextList();
+		ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+		long x=0; //Running number used in generated names
+		while(reads!=null && reads.size()>0){
+
+			for(Read r1 : reads){
+				final Read r2=r1.mate;
+
+				if(renameByMapping){
+					//Should be handled automatically, if output is fastq.
+				}else if(r2!=null && (renameByInsert || renameByTrim)){
+
+					r1.setMapped(true);
+					r2.setMapped(true);
+					//Kept apart from x so that the running number is not overwritten.
+					final long insert=Read.insertSizeMapped(r1, r2, false);
+					if(verbose){System.err.println("True Insert: "+insert);}
+					if(renameByTrim){
+						r1.id=r1.numericID+"_"+r1.length()+"_"+Tools.min(insert, r1.length())+" /1";
+						r2.id=r2.numericID+"_"+r2.length()+"_"+Tools.min(insert, r2.length())+" /2";
+					}else{
+						r1.id=prefix+insert+" /1:"+r1.numericID;
+						r2.id=prefix+insert+" /2:"+r1.numericID;
+					}
+
+				}else if(prefixOnly){
+					r1.id=prefix;
+					if(r2!=null){
+						r2.id=prefix;
+					}
+					x++;
+				}else if(addPrefix){
+					r1.id=prefix+r1.id;
+					if(r2!=null){
+						r2.id=prefix+r2.id;
+					}
+					x++;
+				}else{
+					r1.id=prefix+x;
+					if(r2!=null){
+						r1.id=r1.id+" /1";
+						r2.id=prefix+x+" /2";
+					}
+					x++;
+				}
+			}
+			if(ros!=null){ros.add(reads, ln.id);}
+			cris.returnList(ln.id, ln.list.isEmpty());
+			ln=cris.nextList();
+			reads=(ln!=null ? ln.list : null);
+		}
+		//nextList() is treated as nullable above, so guard the final list as well.
+		if(ln!=null){
+			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+		}
+		errorState|=ReadWrite.closeStreams(cris, ros);
+
+		t.stop();
+		System.err.println("Time: "+t);
+	}
+
+	private PrintStream outstream=System.err;
+
+	private String in1=null;
+	private String in2=null;
+
+	private String qfin1=null;
+	private String qfin2=null;
+
+	private String out1=null;
+	private String out2=null;
+
+	private String qfout1=null;
+	private String qfout2=null;
+
+	private String extin=null;
+	private String extout=null;
+
+	/** Text placed at the start of each generated name */
+	private String prefix=null;
+
+	private final FileFormat ffin1;
+	private final FileFormat ffin2;
+
+	private final FileFormat ffout1;
+	private final FileFormat ffout2;
+
+	private boolean overwrite=true;
+	private boolean append=false;
+	private boolean verbose=false;
+	private long maxReads=-1;
+	public boolean errorState=false;
+
+	/** Copied from FASTQ.TAG_CUSTOM; when set, no names are assigned in process() */
+	public boolean renameByMapping=false;
+	/** Name each pair "insert=" followed by its insert size */
+	public boolean renameByInsert=false;
+	/** Name each paired read from its numeric ID, length, and the smaller of insert size and length */
+	public boolean renameByTrim=false;
+	/** Prepend the prefix to the existing name instead of replacing it */
+	public boolean addPrefix=false;
+	/** Use the prefix alone as the name */
+	public boolean prefixOnly=false;
+
+}
diff --git a/current/jgi/SamToEst.java b/current/jgi/SamToEst.java
new file mode 100755
index 0000000..753f1d7
--- /dev/null
+++ b/current/jgi/SamToEst.java
@@ -0,0 +1,487 @@
+package jgi;
+
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import stream.Read;
+import stream.SamLine;
+
+import dna.Data;
+import dna.Parser;
+import dna.Scaffold;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+import align2.LongList;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ *
+ * Processes a sam file of mapped ESTs.
+ * These ESTs may have been broken into smaller pieces for mapping,
+ * and if so, are reassembled.
+ *
+ * Produces a mapping statistics file.
+ *
+ * @author Brian Bushnell
+ * @date Sep 27, 2013
+ *
+ */
+public class SamToEst {
+
+ /**
+ * Command-line entry point: parses the arguments, builds a SamToEst and runs it.
+ * Recognized keys (after stripping leading hyphens and lower-casing):
+ * in/input/in1/sam, out/output/stats, ref, est, fraction, append/app, overwrite/ow.
+ * Arguments accepted by the shared Parser helpers are consumed before these keys are examined.
+ * An argument at index 0 without '=' that starts with "stdin" or names an existing file becomes the sam input;
+ * an otherwise unrecognized argument at index 1 without '=' becomes the report path.
+ * The report path defaults to "stdout".  Unknown arguments are reported on stderr and trip an assertion when
+ * assertions are enabled; otherwise they are ignored.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+
+ ByteFile.FORCE_MODE_BF2=(Shared.threads()>2);
+ ReadWrite.USE_UNPIGZ=true;
+
+ String samPath=null, statsPath=null, refPath=null, estPath=null;
+ float allCapturedFraction=0.98f;
+
+ for(int argNum=0; argNum<args.length; argNum++){
+ final String arg=args[argNum];
+ final String[] kv=arg.split("=");
+ final String value=(kv.length>1 ? kv[1] : null);
+ final boolean positional=(arg.indexOf('=')<0);
+ String key=kv[0].toLowerCase();
+ while(key.startsWith("-")){key=key.substring(1);} //Tolerate leading hyphens
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; nothing to do
+ }else if(Parser.parseCommonStatic(arg, key, value)){
+ //consumed by the shared parser
+ }else if(Parser.parseZip(arg, key, value)){
+ //consumed by the shared parser
+ }else if(Parser.parseQuality(arg, key, value)){
+ //consumed by the shared parser
+ }else if(key.equals("in") || key.equals("input") || key.equals("in1") || key.equals("sam")){
+ samPath=value;
+ }else if(key.equals("out") || key.equals("output") || key.equals("stats")){
+ statsPath=value;
+ }else if(key.equals("ref")){
+ refPath=value;
+ }else if(key.equals("est")){
+ estPath=value;
+ }else if(key.equals("fraction")){
+ allCapturedFraction=Float.parseFloat(value);
+ }else if(key.equals("append") || key.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(value);
+ }else if(key.equals("overwrite") || key.equals("ow")){
+ overwrite=Tools.parseBoolean(value);
+ }else if(samPath==null && argNum==0 && positional && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ samPath=arg;
+ }else if(statsPath==null && argNum==1 && positional){
+ statsPath=arg;
+ }else{
+ System.err.println("Unknown parameter "+args[argNum]);
+ assert(false) : "Unknown parameter "+args[argNum];
+ }
+ }
+
+ //Finalize quality settings gathered by the shared parser
+ Parser.processQuality();
+
+ if(statsPath==null){statsPath="stdout";}
+ final SamToEst converter=new SamToEst(samPath, statsPath, refPath, estPath, allCapturedFraction);
+ converter.process();
+ }
+
+ /**
+ * Stores the configuration; no files are opened until {@link #process()} is called.
+ * @param in_ Sam file to read
+ * @param stats_ Destination of the statistics report (process() skips the report when this is null)
+ * @param ref_ Reference path; only echoed in the report
+ * @param est_ EST file path; only echoed in the report
+ * @param fractionForAll_ Fraction of an EST's bases that must match for it to be counted as "all"
+ */
+ public SamToEst(String in_, String stats_, String ref_, String est_, float fractionForAll_){
+ this.fractionForAll=fractionForAll_;
+ this.in=in_;
+ this.stats=stats_;
+ this.ref=ref_;
+ this.estFile=est_;
+ }
+
+ /**
+ * Reads the sam file named by {@code in}, groups its alignment lines into ESTs and, when {@code stats}
+ * is non-null, writes a statistics report.
+ * <p>Leading header lines are scanned first: every {@code @SQ} line adds to {@code refCount} and
+ * {@code refBases}, and {@code @PG} lines are inspected for the program name and version.</p>
+ * <p>For each alignment line that is primary (or for every line when {@code USE_SECONDARY} is set), the lengths of
+ * 'D' and 'N' cigar runs are added to the {@code introns} histogram and the line is added to the current EST.
+ * A query name ending in {@code _part_<1-5 digits>} is treated as a fragment of the EST named by the text
+ * before that suffix.  Only consecutive lines with the same EST name are merged, so the fragments of one EST
+ * must be adjacent in the file.</p>
+ * <p>Intron statistics (count, min, max, median, average) only consider lengths of at least {@code minIntron}.</p>
+ */
+ public void process(){
+ HashMap<String, EST> table=new HashMap<String, EST>(initialSize);
+ TextFile tf=new TextFile(in, true, false);
+ String line=null;
+
+ String program=null;
+ String version=null;
+
+ boolean bbmap=false;
+ float bbversion=-1;
+
+ //Phase 1: header lines at the top of the file
+ for(line=tf.nextLine(); line!=null && line.startsWith("@"); line=tf.nextLine()){
+ final String[] split=line.split("\t");
+ final String a=split[0];
+
+ if(a.equals("@SQ")){
+ Scaffold sc=new Scaffold(split);
+// assert(!table.containsKey(sc.name)) : "\nDuplicate scaffold name!\n"+sc+"\n\n"+table.get(sc.name);
+// table.put(sc.name, sc);
+ refBases+=sc.length;
+ refCount++;
+ }else if(a.equals("@PG")){
+ for(String s : split){
+ if(s.startsWith("PN:")){
+ String s2=s.substring(3);
+ if(s2.equalsIgnoreCase("bbmap") || s2.startsWith("BBMap")){bbmap=true;}
+ if(program==null){program=Data.forceIntern(s.substring(3));}
+ }else if(s.startsWith("VN:")){
+ if(bbmap && bbversion<0){
+ try{
+ bbversion=Float.parseFloat(s.substring(3));
+ }catch(NumberFormatException e){
+ //Version strings are not guaranteed to be plain decimals; the value is informational only,
+ //so leave it unset rather than aborting the whole run.
+ }
+ }
+ if(version==null){version=Data.forceIntern(s.substring(3));}
+ }
+ }
+ }else if(a.equals("@RG")){
+ //Do nothing
+ }else if(a.equals("@HD")){
+ //Do nothing
+ }else if(a.equals("@CO")){
+ //Do nothing
+ }else{
+// assert(false) : line;
+ }
+ }
+
+ //Phase 2: alignment lines (and any stray header lines from concatenated files)
+ EST current=null;
+ boolean err=false;
+ for(; line!=null; line=tf.nextLine()){
+
+ if(line.length()==0){
+
+ }else if(line.charAt(0)=='@'){
+ if(!err){
+ System.err.println("Unexpected header line: "+line);
+ System.err.println("This should not cause problems, and is probably due to concatenated sam files.\n" +
+ "Supressing future unexpected header warnings.");
+ err=true;
+ }
+
+ if(line.startsWith("@SQ")){
+ String[] split=line.split("\t");
+ Scaffold sc=new Scaffold(split);
+// if(!table.containsKey(sc.name)){
+// table.put(sc.name, sc);
+// refBases+=sc.length;
+// refCount++;
+// }
+ }
+ }else{
+
+ SamLine sl=new SamLine(line);
+ if(USE_SECONDARY || sl.primary()){
+
+ if(sl.mapped() && sl.cigar!=null){
+ String cigar=sl.cigar;
+ if(cigar.contains("D") || cigar.contains("N")){
+ //Walk the cigar, recording the length of every D or N run in the intron histogram
+ int len=0;
+ for(int i=0; i<cigar.length(); i++){
+ char c=cigar.charAt(i);
+ if(Character.isDigit(c)){
+ len=(len*10)+(c-'0');
+ }else{
+ if(c=='D' || c=='N'){
+ introns.increment(len, 1);
+ }
+ len=0;
+ }
+ }
+ }
+ }
+
+// final Scaffold scaf=table.get(new String(sl.rname()));
+// assert(scaf!=null) : "Can't find "+new String(sl.rname());
+// final int a=Tools.max(sl.start(), 0);
+// final int b=Tools.min(sl.stop2(), scaf.length-1);
+// scaf.basehits+=(b-a+1);
+ String name=sl.qname;
+ int x=name.lastIndexOf('_');
+ int part=1;
+// if(x>0){
+ //Look for a "_part_<digits>" suffix, where x is the index of the final underscore
+ if(x>5 && name.charAt(x-5)=='_' && name.charAt(x-4)=='p' && name.charAt(x-3)=='a' && name.charAt(x-2)=='r' && name.charAt(x-1)=='t'){
+ int partlen=name.length()-x-1;
+ if(partlen>0 && partlen<6){
+ int p2=0;
+ for(int i=x+1; i<name.length(); i++){
+ char c=name.charAt(i);
+ int c2=c-'0';
+ if(c2<0 || c2>9){
+ p2=-1;
+ break;
+ }
+ p2=(p2*10)+c2; //Accumulate the part number; at most 5 digits, so this cannot overflow
+ }
+ if(p2>-1){
+ part=p2;
+ name=name.substring(0, x-5);
+// name=name.substring(0, x);
+// if(current!=null && !current.name.equals(name)){
+// //Special case test for sequences that already end with underscore number
+// if(name.length()>current.name.length()+1 && name.startsWith(current.name) && name.charAt(current.name.length())=='_'){
+// boolean specialCase=true;
+// for(int i=x+1; i<name.length(); i++){
+// char c=name.charAt(i);
+// int c2=c-'0';
+// if(c2<0 || c2>9){
+// specialCase=false;
+// break;
+// }
+// }
+// if(specialCase){name=current.name;}
+// }
+// }
+ }else{
+// assert(false) : x+"\t"+p2+"\t"+name;
+ }
+ }else{
+// assert(false) : x+"\t"+name;
+ }
+ }else{
+// assert(false) : x+"\t"+name;
+ }
+ //A change of name closes the current EST and opens a new one
+ if(current==null || !current.name.equals(name)){
+// assert(part==1) : "Sam file must be in input order. Run BBMap with the 'ordered' flag.\n"+part+"\n"+sl.qname;
+ if(current!=null){addEst(current);}
+ current=new EST(name);
+ }
+ current.add(sl);
+ }
+ }
+ }
+ if(current!=null){addEst(current);}
+ tf.close();
+
+ //Phase 3: report
+ if(stats!=null){
+ //Pass the append flag through; it is parsed in main but was previously ignored here.
+ final TextStreamWriter tsw=new TextStreamWriter(stats, overwrite, append, false);
+ tsw.start();
+
+// numRef: 786
+// numEst: 30985
+// EST-good: 30312 ( 97.83%)
+// EST-best: 30312 ( 97.83%)
+// EST-miss: 379 ( 1.22%)
+// EST-zero: 294 ( 0.95%)
+
+// tsw.println("EST-good:\t"+good+"\t"++"");
+// tsw.println("EST-best:\t"+best+"\t"++"");
+// tsw.println("EST-miss:\t"+miss+"\t"++"");
+// tsw.println("EST-zero:\t"+zero+"\t"++"");
+
+ boolean oldStyle=false;
+
+ if(oldStyle){
+ tsw.println("ref:\t"+ref);
+ tsw.println("est:\t"+estFile);
+ tsw.println("sam:\t"+in);
+
+ tsw.println("numRef:\t"+refCount+"\t"+refBases);
+ tsw.println("numEst:\t"+estCount+"\t"+estBases);
+ tsw.println("type\t#ests\t%ests\t#bases\t%bases");
+ }else{
+
+ tsw.println("ref_file="+ref);
+ tsw.println("est_file="+estFile);
+ tsw.println("sam_file="+in);
+
+ tsw.println("n_ref_scaffolds="+refCount);
+ tsw.println("n_ref_bases="+refBases);
+ tsw.println("n_est="+estCount);
+ tsw.println("n_est_bases="+estBases);
+ tsw.println("type\tn_est\tpct_est\tn_bases\tpct_bases");
+ }
+
+ //Guard the denominators so an empty input reports 0% instead of NaN%
+ double multE=100.0/Tools.max(estCount,1);
+ double multB=100.0/Tools.max(estBases,1);
+
+ double allBasesPct=multE*allBasesMapped;
+ double mostBasesPct=multE*mostBasesMapped;
+ double someBasesPct=multE*someBasesMapped;
+ double noBasesPct=multE*noBasesMapped;
+ double multiScaffoldPct=multE*multiScaffold;
+
+ double allBasesPctB=multB*allBasesMappedB;
+ double mostBasesPctB=multB*mostBasesMappedB;
+ double someBasesPctB=multB*someBasesMappedB;
+ double noBasesPctB=multB*noBasesMappedB;
+ double multiScaffoldPctB=multB*multiScaffoldB;
+
+ //Intron statistics over histogram entries of length >= minIntron
+ int min=0, max=0, median=0;
+ long sum=0, count=0;
+ for(int i=minIntron; i<introns.size; i++){
+ long x=introns.get(i);
+ if(x>0){
+ if(min==0){min=i;}
+ max=i;
+ sum+=(i*x);
+ count+=x;
+ }
+ }
+ if(count>0){ //If there are any introns
+ long half=(count+1)/2; //50th percentile of number of introns
+ assert(half<=count);
+ long count2=0; //Number of introns seen so far
+ //Start at minIntron so the median is taken over the same population as count, min, max and sum;
+ //starting at 0 let shorter indels pull the median below the reported minimum.
+ for(int i=minIntron; i<introns.size && count2<half; i++){
+ long x=introns.get(i);
+ if(x>0){
+ count2+=x;
+ median=i;
+ }
+ }
+ }
+
+ tsw.println("all:\t"+allBasesMapped+"\t"+String.format("%.4f%%",allBasesPct)+"\t"+allBasesMappedB+"\t"+String.format("%.4f%%",allBasesPctB));
+ tsw.println("most:\t"+mostBasesMapped+"\t"+String.format("%.4f%%",mostBasesPct)+"\t"+mostBasesMappedB+"\t"+String.format("%.4f%%",mostBasesPctB));
+ tsw.println("some:\t"+someBasesMapped+"\t"+String.format("%.4f%%",someBasesPct)+"\t"+someBasesMappedB+"\t"+String.format("%.4f%%",someBasesPctB));
+ tsw.println("zero:\t"+noBasesMapped+"\t"+String.format("%.4f%%",noBasesPct)+"\t"+noBasesMappedB+"\t"+String.format("%.4f%%",noBasesPctB));
+ tsw.println("multi:\t"+multiScaffold+"\t"+String.format("%.4f%%",multiScaffoldPct)+"\t"+multiScaffoldB+"\t"+String.format("%.4f%%",multiScaffoldPctB));
+// tsw.println("numIntrons:\t"+count);
+// tsw.println("minIntron:\t"+min);
+// tsw.println("maxIntron:\t"+max);
+// tsw.println("medIntron:\t"+median);
+// tsw.println("avgIntron:\t"+(long)(sum/(double)(Tools.max(count,1))));
+ tsw.println("introns\tmin\tmax\tmedian\taverage");
+ tsw.println(count+"\t"+min+"\t"+max+"\t"+median+"\t"+String.format("%.1f", (sum/(double)(Tools.max(count,1)))));
+
+ tsw.poisonAndWait();
+ }
+ }
+
+ private void addEst(EST est){
+// Data.sysout.println("\n"+est);
+ estCount++;
+ partCount+=est.parts;
+ estBases+=est.length;
+ estBasesMapped+=est.mappedLength;
+ partCountMapped+=est.mappedParts;
+
+ for(int i=0; i<est.msdicn.length; i++){
+ msdicnOverall[i]+=est.msdicn[i];
+ }
+
+ if(est.scafnames.size()>1){
+ multiScaffold++;
+ multiScaffoldB+=est.length;
+ }
+
+ if(est.mappedParts==est.parts){
+// Data.sysout.print("A");
+ allPartsMapped++;
+ }else if(est.mappedParts>=Tools.max(1, est.parts/2)){
+// Data.sysout.print("B");
+ mostPartsMapped++;
+ }else if(est.mappedParts>0){
+// Data.sysout.print("C");
+ somePartsMapped++;
+ }else{
+// Data.sysout.print("D");
+ noPartsMapped++;
+ }
+
+ int match=est.match();
+ if(match>=(est.length*fractionForAll)){
+// Data.sysout.print("E");
+ allBasesMapped++;
+ allBasesMappedB+=est.length;
+ }else if(match>=est.length/2){
+// Data.sysout.print("F");
+ mostBasesMapped++;
+ mostBasesMappedB+=est.length;
+ }else if(match>0){
+// Data.sysout.print("G");
+ someBasesMapped++;
+ someBasesMappedB+=est.length;
+ }else{
+// Data.sysout.print("H");
+ noBasesMapped++;
+ noBasesMappedB+=est.length;
+ }
+ }
+
+ // Fraction of an EST's length that its match count must reach for addEst to classify it as "all".
+ public final float fractionForAll;
+ // in: sam file read by process(); stats: report destination; ref and estFile: only echoed in the report.
+ public final String in, stats, ref, estFile;
+
+ // Sum of the lengths from @SQ header lines.
+ public long refBases=0;
+ // Sum of EST lengths / mapped lengths, accumulated in addEst.
+ public long estBases=0;
+ public long estBasesMapped=0;
+
+ // Number of @SQ header lines seen at the top of the file.
+ public long refCount=0;
+ // Number of ESTs, and of their parts (total and mapped), accumulated in addEst.
+ public long estCount=0;
+ public long partCount=0;
+ public long partCountMapped=0;
+
+ // Never updated in this class; referenced only by commented-out report lines.
+ public long good=0, best=0, miss=0, zero=0;
+ // ESTs (and their bases) whose alignments name more than one scaffold.
+ public long multiScaffold=0, multiScaffoldB=0;
+ // EST counts classified by how many of their parts mapped.
+ public long allPartsMapped=0, mostPartsMapped=0, somePartsMapped=0, noPartsMapped=0;
+ // EST counts classified by matched bases; the B variants hold the summed EST lengths per class.
+ public long allBasesMapped=0, mostBasesMapped=0, someBasesMapped=0, noBasesMapped=0;
+ public long allBasesMappedB=0, mostBasesMappedB=0, someBasesMappedB=0, noBasesMappedB=0;
+ // Element-wise sum of every EST's msdicn array.
+ public long[] msdicnOverall=new long[6];
+ // Histogram indexed by the length of each 'D' or 'N' cigar run.
+ public LongList introns=new LongList(1);
+
+ // Initial capacity of the HashMap allocated in process().
+ public int initialSize=4096;
+ // Not referenced anywhere in this class.
+ public boolean ADD_FROM_REF=true;
+ // When false, process() ignores alignment lines that are not primary.
+ public boolean USE_SECONDARY=false;
+ // Lower bound (inclusive) of the histogram range scanned for intron count/min/max/sum.
+ public static int minIntron=10;
+ // Set from the command line in main; overwrite is handed to the report writer.
+ public static boolean overwrite=true;
+ // Set from the command line in main (which also copies it to ReadStats.append).
+ public static boolean append=false;
+// public HashMap<String, EST> //Only needed if sam file is unordered.
+
+ /**
+ * Accumulates the alignment lines that belong to one EST: number of parts, total and mapped length,
+ * per-category match-string counts and the set of scaffold names hit.
+ */
+ public static class EST{
+
+ /**
+ * Creates an empty EST and logs its name to stderr.
+ * @param name_ Name of the EST
+ */
+ public EST(String name_){
+ this.name=name_;
+ System.err.println("New EST: "+name);
+ }
+
+ /**
+ * Adds one alignment line as a part of this EST.
+ * Every line counts toward parts and length; a mapped line also counts toward the mapped totals,
+ * adds its match counts (from the match tag when present, otherwise from the cigar) and records its
+ * reference name.
+ * @param sl Alignment line to add
+ */
+ public void add(SamLine sl){
+ System.err.println("Adding samline "+sl.qname+" to EST "+name);
+ parts++;
+ final int bases=sl.seq.length;
+ length+=bases;
+ if(!sl.mapped()){return;}
+
+ mappedLength+=bases;
+ mappedParts++;
+ if(sl.cigar!=null){
+ final String tag=sl.matchTag();
+ final int[] counts=(tag==null ? SamLine.cigarToMsdic(sl.cigar) : Read.matchToMsdicn(tag.getBytes()));
+ for(int slot=0; slot<counts.length; slot++){
+ msdicn[slot]+=counts[slot];
+ }
+ }
+ if(sl.rname()!=null){
+ scafnames.add(new String(sl.rname()));
+ }
+ }
+
+ /** @return The first element of the accumulated counts, which callers use as the number of matching bases */
+ public int match(){return msdicn[0];}
+
+ /** @return Tab-delimited summary: name, length, mappedLength, parts, mappedParts, counts, scaffold names */
+ @Override
+ public String toString(){
+ final StringBuilder text=new StringBuilder();
+ text.append(name).append('\t').append(length).append('\t').append(mappedLength).append('\t');
+ text.append(parts).append('\t').append(mappedParts).append('\t');
+ text.append(Arrays.toString(msdicn)).append('\t').append(scafnames).append('\t');
+ return text.toString();
+ }
+
+ final String name;
+ int length=0, mappedLength=0;
+ int parts=0, mappedParts=0;
+ HashSet<String> scafnames=new HashSet<String>(4);
+
+ int[] msdicn=new int[6];
+
+ }
+
+}
diff --git a/current/jgi/Seal.java b/current/jgi/Seal.java
new file mode 100755
index 0000000..5a2dcdb
--- /dev/null
+++ b/current/jgi/Seal.java
@@ -0,0 +1,3117 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import kmer.AbstractKmerTable;
+
+import stream.ArrayListSet;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.MultiCros;
+import stream.Read;
+import stream.SamLine;
+import tax.GiToNcbi;
+import tax.TaxNode;
+import tax.TaxTree;
+
+import align2.IntList;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * SEAL: Sequence Expression AnaLyzer
+ * Derived from BBDuk.
+ * Allows multiple values stored per kmer.
+ * Intended for RNA-seq, coverage, and other reads-per-sequence quantification.
+ * Also performs binning.
+ * @author Brian Bushnell
+ * @date November 10, 2014
+ *
+ */
+public class Seal {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Command-line entry point.
+ * Expands the arguments through Parser.parseConfig, prints the usage pointer and exits with status 0
+ * when Parser.parseHelp reports a help request, and otherwise constructs a Seal and runs it.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+
+ final String[] expanded=Parser.parseConfig(args);
+ if(Parser.parseHelp(expanded, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Build the instance from the expanded arguments, then run it
+ final Seal instance=new Seal(expanded);
+ instance.process();
+ }
+
+ /**
+ * Display usage information.
+ */
+ private static void printOptions(){
+ outstream.println("Please consult the shellscript (seal.sh) for usage information.");
+// outstream.println("Syntax:\n");
+// outstream.println("\njava -ea -Xmx20g -cp <path> jgi.Seal in=<input file> out=<output file> ref=<contaminant files>");
+// outstream.println("\nOptional flags:");
+// outstream.println("in=<file> \tThe 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in.");
+// outstream.println("in2=<file> \tUse this if 2nd read of pairs are in a different file.");
+// outstream.println("out=<file> \t(outmatch) 'out=stdout' will pipe to standard out.");
+// outstream.println("out2=<file> \t(outmatch2) Use this to write 2nd read of pairs to a different file.");
+// outstream.println("outu=<file> \t(outunmatched) Write unmatched reads here.");
+// outstream.println("outu2=<file> \t(outunmatched2) Use this to write 2nd read of pairs to a different file.");
+// outstream.println("stats=<file> \tWrite statistics about which contaminants were detected.");
+// outstream.println("rpkm=<file> \tWrite coverage and RPKM/FPKM info.");
+// outstream.println("");
+// outstream.println("threads=auto \t(t) Set number of threads to use; default is number of logical processors.");
+// outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file.");
+// outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed.");
+// outstream.println("interleaved=auto \t(int) If true, forces fastq input to be paired and interleaved.");
+// outstream.println("k=31 \tKmer length used for finding contaminants. Contaminants shorter than k will not be found.");
+// outstream.println("maskmiddle=t \t(mm) Treat the middle base of a kmer as a wildcard.");
+// outstream.println("minkmerhits=0 \t(mh) Reads with more than this many contaminant kmers will be discarded.");
+// outstream.println("minavgquality=0 \t(maq) Reads with average quality (before trimming) below this will be discarded.");
+// outstream.println("touppercase=f \t(tuc) Change all letters in reads and reference to upper-case.");
+// outstream.println(" \tValues: f (don't trim), r (trim right end), l (trim left end), n (convert to N instead of trimming).");
+// outstream.println("qtrim=f \tTrim read ends to remove bases with quality below minq. Performed AFTER looking for kmers. ");
+// outstream.println(" \tValues: t (trim both ends), f (neither end), r (right end only), l (left end only).");
+// outstream.println("trimq=6 \tTrim quality threshold.");
+// outstream.println("minlength=2 \t(ml) Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter.");
+// outstream.println("ziplevel=2 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+// outstream.println("rcomp=t \tLook for reverse-complements of kmers also.");
+ }
+
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public Seal(String[] args){
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+ System.err.println("Seal version "+Shared.BBMAP_VERSION_STRING);
+
+ /* Set global defaults */
+ ReadWrite.ZIPLEVEL=2;
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=8;
+
+
+ ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+ SamLine.SET_FROM_OK=true;
+
+ /* Initialize local variables with defaults */
+ boolean setOut=false, setOutb=false, qtrimRight_=false, qtrimLeft_=false;
+ boolean rcomp_=true;
+ boolean forbidNs_=false;
+ boolean prealloc_=false;
+ boolean useCountvector_=false;
+ int tableType_=AbstractKmerTable.ARRAYH;
+ int k_=31;
+ int ways_=-1; //Currently disabled
+ int minKmerHits_=1;
+ float minKmerFraction_=0;
+ long skipreads_=0;
+
+ Parser parser=new Parser();
+ parser.trimq=6;
+ parser.minAvgQuality=0;
+ parser.minReadLength=10;
+ parser.maxReadLength=Integer.MAX_VALUE;
+ parser.minLenFraction=0f;
+ parser.requireBothBad=false;
+ parser.maxNs=-1;
+ boolean ordered_=false;
+ int restrictLeft_=0, restrictRight_=0, speed_=0, qSkip_=1;
+ int ambigMode_=AMBIG_RANDOM;
+ int matchMode_=MATCH_ALL;
+ boolean keepPairsTogether_=true;
+ boolean printNonZeroOnly_=true;
+ boolean rename_=false, useRefNames_=false;
+ boolean ecc_=false;
+ int clearzone_=0;
+
+ scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero
+ scaffoldLengths.add(0);
+ scaffoldKmers.add(0);
+ scaffolds.add(null);
+
+ {
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseHist(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQualityAdjust(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(parser.parseCommon(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("qfin") || a.equals("qfin1")){
+ qfin1=b;
+ }else if(a.equals("qfin2")){
+ qfin2=b;
+ }else if(a.equals("out") || a.equals("out1") || a.equals("outm") || a.equals("outm1") || a.equals("outmatched") || a.equals("outmatched1")){
+ outm1=b;
+ setOut=true;
+ }else if(a.equals("out2") || a.equals("outm") || a.equals("outm2") || a.equals("outmatched") || a.equals("outmatched2")){
+ outm2=b;
+ }else if(a.equals("outu") || a.equals("outu1") || a.equals("outunmatched") || a.equals("outunmatched1")){
+ outu1=b;
+ }else if(a.equals("outu2") || a.equals("outunmatched") || a.equals("outunmatched2")){
+ outu2=b;
+ }else if(a.equals("outpattern") || a.equals("pattern") || a.equals("basename")){
+ outpattern=b;
+ }else if(a.equals("stats") || a.equals("scafstats")){
+ outstats=b;
+ }else if(a.equals("refstats")){
+ outrefstats=b;
+ }else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("cov") || a.equals("coverage") || a.equals("covstats")){
+ outrpkm=b;
+ }else if(a.equals("tax") || a.equals("taxa") || a.equals("outtax")){
+ outtax=b;
+ }else if(a.equals("ref")){
+ ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(","));
+ }else if(a.equals("literal")){
+ literal=(b==null) ? null : b.split(",");
+// assert(false) : b+", "+Arrays.toString(literal);
+ }else if(a.equals("forest")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){tableType_=AbstractKmerTable.FOREST2D;}
+ }else if(a.equals("array") || a.equals("array2")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){tableType_=AbstractKmerTable.ARRAY2D;}
+ }else if(a.equals("array1")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){tableType_=AbstractKmerTable.ARRAY1D;}
+ }else if(a.equals("arrayh") || a.equals("hybrid")){
+ boolean x=Tools.parseBoolean(b);
+ if(x){tableType_=AbstractKmerTable.ARRAYH;}
+ }else if(a.equals("ways")){
+ ways_=Integer.parseInt(b);
+ }else if(a.equals("ordered") || a.equals("ord")){
+ ordered_=Tools.parseBoolean(b);
+ System.err.println("Set ORDERED to "+ordered_);
+ }else if(a.equals("k")){
+ assert(b!=null) : "\nThe k key needs an integer value greater than 0, such as k=27\n";
+ k_=Integer.parseInt(b);
+ assert(k_>0 && k_<32) : "k must be at least 1; default is 31.";
+ }else if(a.equals("hdist") || a.equals("hammingdistance")){
+ hammingDistance=Integer.parseInt(b);
+ assert(hammingDistance>=0 && hammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("qhdist") || a.equals("queryhammingdistance")){
+ qHammingDistance=Integer.parseInt(b);
+ assert(qHammingDistance>=0 && qHammingDistance<4) : "hamming distance must be between 0 and 3; default is 0.";
+ }else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){
+ editDistance=Integer.parseInt(b);
+ assert(editDistance>=0 && editDistance<3) : "edit distance must be between 0 and 2; default is 0.";
+ }else if(a.equals("skip") || a.equals("refskip") || a.equals("rskip")){
+ refSkip=Integer.parseInt(b);
+ }else if(a.equals("qskip")){
+ qSkip_=Integer.parseInt(b);
+ }else if(a.equals("speed")){
+ speed_=Integer.parseInt(b);
+ assert(speed_>=0 && speed_<=15) : "Speed range is 0 to 15. Value: "+speed_;
+ }else if(a.equals("skipreads")){
+ skipreads_=Tools.parseKMG(b);
+ }else if(a.equals("minkmerhits") || a.equals("minhits") || a.equals("mh") || a.equals("mkh")){
+ minKmerHits_=Integer.parseInt(b);
+ }else if(a.equals("minkmerfraction") || a.equals("minfraction") || a.equals("mkf")){
+ minKmerFraction_=Float.parseFloat(b);
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ assert(WAYS>1) : "WAYS=1 is for debug mode.";
+// verbose=Tools.parseBoolean(b); //123
+ if(verbose){outstream=System.err;} //For some reason System.out does not print in verbose mode.
+ }else if(a.equals("mm") || a.equals("maskmiddle")){
+ maskMiddle=Tools.parseBoolean(b);
+ }else if(a.equals("rcomp")){
+ rcomp_=Tools.parseBoolean(b);
+ }else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){
+ forbidNs_=Tools.parseBoolean(b);
+ }else if(a.equals("prealloc") || a.equals("preallocate")){
+ if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
+ prealloc_=Tools.parseBoolean(b);
+ }else{
+ preallocFraction=Tools.max(0, Double.parseDouble(b));
+ prealloc_=(preallocFraction>0);
+ }
+ }else if(a.equals("restrictleft")){
+ restrictLeft_=Integer.parseInt(b);
+ }else if(a.equals("restrictright")){
+ restrictRight_=Integer.parseInt(b);
+ }else if(a.equals("statscolumns") || a.equals("columns") || a.equals("cols")){
+ STATS_COLUMNS=Integer.parseInt(b);
+ assert(STATS_COLUMNS==3 || STATS_COLUMNS==5) : "statscolumns bust be either 3 or 5. Invalid value: "+STATS_COLUMNS;
+ }else if(a.equals("ambiguous") || a.equals("ambig")){
+ if(b==null){
+ throw new RuntimeException(arg);
+ }else if(b.equalsIgnoreCase("keep") || b.equalsIgnoreCase("best") || b.equalsIgnoreCase("first")){
+ ambigMode_=AMBIG_FIRST;
+ }else if(b.equalsIgnoreCase("all")){
+ ambigMode_=AMBIG_ALL;
+ }else if(b.equalsIgnoreCase("random") || b.equalsIgnoreCase("rand")){
+ ambigMode_=AMBIG_RANDOM;
+ }else if(b.equalsIgnoreCase("toss") || b.equalsIgnoreCase("discard") || b.equalsIgnoreCase("remove")){
+ ambigMode_=AMBIG_TOSS;
+ }else{
+ throw new RuntimeException(arg);
+ }
+ }else if(a.equals("match") || a.equals("mode")){
+ if(b==null){
+ throw new RuntimeException(arg);
+ }else if(b.equalsIgnoreCase("all") || b.equalsIgnoreCase("best")){
+ matchMode_=MATCH_ALL;
+ }else if(b.equalsIgnoreCase("first")){
+ matchMode_=MATCH_FIRST;
+ }else if(b.equalsIgnoreCase("unique") || b.equalsIgnoreCase("firstunique")){
+ matchMode_=MATCH_UNIQUE;
+ }else{
+ throw new RuntimeException(arg);
+ }
+ }else if(a.equals("findbestmatch") || a.equals("fbm")){
+ matchMode_=(Tools.parseBoolean(b) ? MATCH_ALL : MATCH_FIRST);
+ }else if(a.equals("firstuniquematch") || a.equals("fum")){
+ if(Tools.parseBoolean(b)){matchMode_=MATCH_UNIQUE;}
+ }else if(a.equals("keeppairstogether") || a.equals("kpt")){
+ keepPairsTogether_=Tools.parseBoolean(b);
+ }else if(a.equals("nzo") || a.equals("nonzeroonly")){
+ printNonZeroOnly_=Tools.parseBoolean(b);
+ }else if(a.equals("rename")){
+ rename_=Tools.parseBoolean(b);
+ }else if(a.equals("refnames") || a.equals("userefnames")){
+ useRefNames_=Tools.parseBoolean(b);
+ }else if(a.equals("initialsize")){
+ initialSize=(int)Tools.parseKMG(b);
+ }else if(a.equals("dump")){
+ dump=b;
+ }else if(a.equals("countvector")){
+ useCountvector_=Tools.parseBoolean(b);
+ }else if(a.equals("ecco") || a.equals("ecc")){
+ ecc_=Tools.parseBoolean(b);
+ }else if(a.equals("copyundefined") || a.equals("cu")){
+ REPLICATE_AMBIGUOUS=Tools.parseBoolean(b);
+ }else if(a.equals("bbsplit")){
+ BBSPLIT_STYLE=Tools.parseBoolean(b);
+ }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
+ giTableFile=b;
+ if("auto".equalsIgnoreCase(b)){giTableFile=TaxTree.DefaultTableFile;}
+ }else if(a.equals("taxnames") || a.equals("taxname")){
+ taxNameFile=b;
+ }else if(a.equals("taxnodes") || a.equals("taxnode")){
+ taxNodeFile=b;
+ }else if(a.equals("taxtree") || a.equals("tree")){
+ taxTreeFile=b;
+ if("auto".equalsIgnoreCase(b)){taxTreeFile=TaxTree.DefaultTreeFile;}
+ }else if(a.equals("mincount")){
+ taxNodeCountLimit=Long.parseLong(b);
+ }else if(a.equals("maxnodes")){
+ taxNodeNumberLimit=Integer.parseInt(b);
+ }else if(a.equals("minlevel")){
+ taxNodeMinLevel=TaxTree.stringToLevel(b.toLowerCase());
+ }else if(a.equals("maxlevel")){
+ taxNodeMaxLevel=TaxTree.stringToLevel(b.toLowerCase());
+ }else if(a.equals("clearzone") || a.equals("cz")){
+ clearzone_=Integer.parseInt(b);
+ }
+
+
+ else if(a.equals("processcontainedref")){
+ processContainedRef=Tools.parseBoolean(b);
+ }else if(a.equals("storerefbases")){
+ storeRefBases=Tools.parseBoolean(b);
+ }
+
+ else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ in1=args[i];
+ }else if(i==1 && outu1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ outu1=args[i];
+ setOut=true;
+ }else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(","));
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+ samplerate=parser.samplerate;
+ sampleseed=parser.sampleseed;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ forceTrimModulo=parser.forceTrimModulo;
+ forceTrimLeft=parser.forceTrimLeft;
+ forceTrimRight=parser.forceTrimRight;
+ forceTrimRight2=parser.forceTrimRight2;
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ trimq=parser.trimq;
+ minLenFraction=parser.minLenFraction;
+ minAvgQuality=parser.minAvgQuality;
+ minAvgQualityBases=parser.minAvgQualityBases;
+ chastityFilter=parser.chastityFilter;
+ minReadLength=parser.minReadLength;
+ maxReadLength=parser.maxReadLength;
+ maxNs=parser.maxNs;
+ minConsecutiveBases=parser.minConsecutiveBases;
+// minGC=parser.minGC;
+// maxGC=parser.maxGC;
+// filterGC=parser.filterGC;
+// minTrimLength=(parser.minTrimLength>=0 ? parser.minTrimLength : minTrimLength);
+// requireBothBad=parser.requireBothBad;
+ removePairsIfEitherBad=!parser.requireBothBad;
+
+ THREADS=Shared.threads();
+ }
+
+ refNames.add(null);
+ if(ref!=null){
+ ArrayList<String> temp=new ArrayList<String>();
+ for(String s : ref){
+ Tools.getFileOrFiles(s, temp, true, false, false, false);
+ }
+ ref=temp.toArray(new String[0]);
+ if(ref.length<1){ref=null;}
+ refNames.addAll(temp);
+ }
+ if(literal!=null){refNames.add("literal");}
+ refScafCounts=new int[refNames.size()];
+
+ if(prealloc_){
+ System.err.println("Note - if this program runs out of memory, please disable the prealloc flag.");
+ }
+
+ /* Set final variables; post-process and validate argument combinations */
+
+ tableType=tableType_;
+ hammingDistance=Tools.max(editDistance, hammingDistance);
+ refSkip=Tools.max(0, refSkip);
+ rcomp=rcomp_;
+ forbidNs=(forbidNs_ || hammingDistance<1);
+ skipreads=skipreads_;
+ ORDERED=ordered_;
+ restrictLeft=Tools.max(restrictLeft_, 0);
+ restrictRight=Tools.max(restrictRight_, 0);
+ ambigMode=ambigMode_;
+ matchMode=matchMode_;
+ keepPairsTogether=keepPairsTogether_;
+ printNonZeroOnly=printNonZeroOnly_;
+ rename=rename_;
+ useRefNames=useRefNames_;
+ speed=speed_;
+ qSkip=qSkip_;
+ noAccel=(speed<1 && qSkip<2);
+ clearzone=clearzone_;
+ parsecustom=FASTQ.PARSE_CUSTOM;
+ ecc=ecc_;
+
+ USE_TAXTREE=(taxNameFile!=null || taxNodeFile!=null || outtax!=null || taxTreeFile!=null);
+ USE_COUNTVECTOR=useCountvector_;
+ MAKE_QUALITY_HISTOGRAM=ReadStats.COLLECT_QUALITY_STATS;
+ MAKE_QUALITY_ACCURACY=ReadStats.COLLECT_QUALITY_ACCURACY;
+ MAKE_MATCH_HISTOGRAM=ReadStats.COLLECT_MATCH_STATS;
+ MAKE_BASE_HISTOGRAM=ReadStats.COLLECT_BASE_STATS;
+ MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS;
+ MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS;
+ MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS;
+ MAKE_GCHIST=ReadStats.COLLECT_GC_STATS;
+ MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS;
+
+ if((speed>0 && qSkip>1) || (qSkip>1 && refSkip>1) || (speed>0 && refSkip>1)){
+ System.err.println("WARNING: It is not recommended to use more than one of qskip, speed, and rskip together.");
+ System.err.println("qskip="+qSkip+", speed="+speed+", rskip="+refSkip);
+ }
+
+ {
+ long usableMemory;
+ long tableMemory;
+
+ {
+ long memory=Runtime.getRuntime().maxMemory();
+ double xmsRatio=Shared.xmsRatio();
+ usableMemory=(long)Tools.max(((memory-96000000-(20*400000 /* for atomic arrays */))*(xmsRatio>0.97 ? 0.82 : 0.75)), memory*0.45);
+ tableMemory=(long)(usableMemory*.95);
+ }
+
+ if(initialSize<1){
+ final int factor=(tableType==AbstractKmerTable.ARRAY1D ? 12 : tableType==AbstractKmerTable.ARRAYH ? 22 : 27);
+ final long memOverWays=tableMemory/(factor*WAYS);
+ final double mem2=(prealloc_ ? preallocFraction : 1)*tableMemory;
+ initialSize=(prealloc_ || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(factor*WAYS))) : initialSizeDefault);
+ if(initialSize!=initialSizeDefault){
+ System.err.println("Initial size set to "+initialSize);
+ }
+ }
+ }
+
+ k=k_;
+ k2=k-1;
+ minKmerHits=minKmerHits_;
+ minKmerFraction=Tools.max(minKmerFraction_, 0);
+ assert(minKmerHits>=1) : "minKmerHits must be at least 1; value="+minKmerHits;
+ assert(minKmerFraction<=1) : "minKmerFraction must range from 0 to 1; value="+minKmerFraction;
+
+ kfilter=(ref!=null || literal!=null);
+ assert(kfilter==false || (k>0 && k<32)) : "K must range from 1 to 31.";
+
+ middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L;
+
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ if(in1!=null && in1.contains("#") && !new File(in1).exists()){
+ int pound=in1.lastIndexOf('#');
+ String a=in1.substring(0, pound);
+ String b=in1.substring(pound+1);
+ in1=a+1+b;
+ in2=a+2+b;
+ }
+ if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ if(qfin1!=null && qfin1.contains("#") && in2!=null && !new File(qfin1).exists()){
+ int pound=qfin1.lastIndexOf('#');
+ String a=qfin1.substring(0, pound);
+ String b=qfin1.substring(pound+1);
+ qfin1=a+1+b;
+ qfin2=a+2+b;
+ }
+
+ if(outu1!=null && outu1.contains("#")){
+ int pound=outu1.lastIndexOf('#');
+ String a=outu1.substring(0, pound);
+ String b=outu1.substring(pound+1);
+ outu1=a+1+b;
+ outu2=a+2+b;
+ }
+
+ if(outm1!=null && outm1.contains("#")){
+ int pound=outm1.lastIndexOf('#');
+ String a=outm1.substring(0, pound);
+ String b=outm1.substring(pound+1);
+ outm1=a+1+b;
+ outm2=a+2+b;
+ }
+
+ if((outu2!=null || outm2!=null) && (in1!=null && in2==null)){
+ if(!FASTQ.FORCE_INTERLEAVED){System.err.println("Forcing interleaved input because paired output was specified for a single input file.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true;
+ }
+
+ if(!setOut){
+ System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+// out1="stdout";
+// outstream=System.err;
+// out2=null;
+ outu1=outu2=null;
+ }else if("stdout".equalsIgnoreCase(outu1) || "standarddout".equalsIgnoreCase(outu1)){
+ outu1="stdout.fq";
+ outstream=System.err;
+ outu2=null;
+ }
+
+ if(!Tools.testOutputFiles(overwrite, append, false, outu1, outu2, outm1, outm2, outpattern, outstats, outrpkm, outrefstats)){
+ throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n");
+ }
+ if(!Tools.testInputFiles(false, true, in1, in2, qfin1, qfin2, taxNameFile, taxNodeFile, giTableFile, taxTreeFile)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+ if(!Tools.testInputFiles(true, true, ref)){
+ throw new RuntimeException("\nCan't read to some reference files.\n");
+ }
+ if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, outu1, outu2, outm1, outm2, outpattern, outstats, outrpkm, outrefstats)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ assert(THREADS>0) : "THREADS must be greater than 0.";
+
+ assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1;
+ assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2;
+
+ if(ref==null && literal==null){
+ System.err.println("ERROR: No reference sequences specified. Use the -da flag to run anyway.");
+ assert(false) : "Please specify a reference.";
+ }
+
+ if(ref!=null){
+ for(String s0 : ref){
+ assert(s0!=null) : "Specified a null reference.";
+ String s=s0.toLowerCase();
+ assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0;
+ }
+ }
+
+ //Initialize tables
+ keySets=AbstractKmerTable.preallocate(WAYS, tableType, initialSize, (!prealloc_ || preallocFraction<1));
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Top-level entry point: re-checks the output files, runs process2 under an
+  * overall timer, optionally prints throughput, and fails loudly if any stage
+  * set errorState.
+  * @throws RuntimeException if an output file cannot be written, or if errorState is set.
+  */
+ public void process(){
+
+ 	/* Refuse to run when an output file is a duplicate or is not writable */
+ 	final boolean outputsOK=Tools.testOutputFiles(overwrite, append, false, outu1, outu2, outm1, outm2, outstats, outrpkm, outrefstats);
+ 	if(!outputsOK){
+ 		throw new RuntimeException("One or more output files were duplicate or could not be written to. Check the names or set the 'overwrite=true' flag.");
+ 	}
+
+ 	/* Time the whole run */
+ 	final Timer timer=new Timer();
+ 	process2(timer.time1);
+ 	timer.stop();
+
+ 	if(showSpeed){
+ 		final double nanos=(double)(timer.elapsed);
+ 		final double readsPerNano=readsIn/nanos;
+ 		final double basesPerNano=basesIn/nanos;
+
+ 		/* Abbreviate large counts with a k or m suffix */
+ 		String readCount;
+ 		if(readsIn<100000){
+ 			readCount=""+readsIn;
+ 		}else if(readsIn<100000000){
+ 			readCount=(readsIn/1000)+"k";
+ 		}else{
+ 			readCount=(readsIn/1000000)+"m";
+ 		}
+ 		String baseCount;
+ 		if(basesIn<100000){
+ 			baseCount=""+basesIn;
+ 		}else if(basesIn<100000000){
+ 			baseCount=(basesIn/1000)+"k";
+ 		}else{
+ 			baseCount=(basesIn/1000000)+"m";
+ 		}
+
+ 		/* Left-pad with spaces to a minimum width of 8 */
+ 		readCount=String.format("%8s", readCount);
+ 		baseCount=String.format("%8s", baseCount);
+
+ 		outstream.println("\nTime: \t\t\t"+timer);
+ 		outstream.println("Reads Processed: "+readCount+" \t"+String.format("%.2fk reads/sec", readsPerNano*1000000));
+ 		outstream.println("Bases Processed: "+baseCount+" \t"+String.format("%.2fm bases/sec", basesPerNano*1000));
+ 	}
+
+ 	/* Surface any error recorded while processing */
+ 	if(errorState){
+ 		throw new RuntimeException("Seal terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+
+ /**
+  * Runs the pipeline phases in order: load reference kmers, optionally dump them,
+  * match the input reads, optionally build taxonomy counts, write the statistics
+  * files, release memory, and print a summary to outstream.
+  * Several static flags are saved, overridden and restored around the two
+  * thread-spawning calls, so the statement order here matters.
+  * @param startTime Not referenced anywhere in this method body.
+  */
+ public void process2(long startTime){
+
+ /* Start phase timer */
+ Timer t=new Timer();
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Initial:");
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ /* Fill tables with reference kmers */
+ {
+ // Save the static reader settings so they can be restored after the references are loaded.
+ final boolean oldTI=FASTQ.TEST_INTERLEAVED; //TODO: This needs to be changed to a non-static field, or somehow 'read mode' and 'ref mode' need to be distinguished.
+ final boolean oldFI=FASTQ.FORCE_INTERLEAVED;
+ final boolean oldSplit=FastaReadInputStream.SPLIT_READS;
+ final int oldML=FastaReadInputStream.MIN_READ_LEN;
+
+ // While loading references: no interleaving, no read splitting, minimum read length of 1.
+ FASTQ.TEST_INTERLEAVED=false;
+ FASTQ.FORCE_INTERLEAVED=false;
+ FastaReadInputStream.SPLIT_READS=false;
+ FastaReadInputStream.MIN_READ_LEN=1;
+
+ storedKmers=spawnLoadThreads();
+
+ // NOTE(review): the restore is not in a finally block, so an exception from spawnLoadThreads leaves the overrides in place.
+ FASTQ.TEST_INTERLEAVED=oldTI;
+ FASTQ.FORCE_INTERLEAVED=oldFI;
+ FastaReadInputStream.SPLIT_READS=oldSplit;
+ FastaReadInputStream.MIN_READ_LEN=oldML;
+
+// if(useRefNames){toRefNames();}
+ t.stop();
+ }
+
+ /* Check memory */
+ {
+ long ram=freeMemory();
+ // Local arrays are allowed only if THREADS*3*8*(number of scaffold names) is below 5x the free memory figure.
+ ALLOW_LOCAL_ARRAYS=(scaffoldNames!=null && Tools.max(THREADS, 1)*3*8*scaffoldNames.size()<ram*5);
+ }
+
+ /* Dump kmers to text */
+ if(dump!=null){
+ ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true);
+ bsw.start();
+ for(AbstractKmerTable set : keySets){
+ set.dumpKmersAsBytes(bsw, k, 0);
+ }
+ bsw.poisonAndWait();
+ }
+
+ // Override Read.VALIDATE_IN_CONSTRUCTOR (true only when THREADS<4) for the matching phase, then restore it.
+ final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+ Read.VALIDATE_IN_CONSTRUCTOR=THREADS<4;
+
+ /* Do kmer matching of input reads */
+ spawnProcessThreads(t);
+
+ Read.VALIDATE_IN_CONSTRUCTOR=vic;
+
+ /* Unload kmers to save memory */
+ if(RELEASE_TABLES){
+ unloadKmers();
+ }
+
+ if(USE_TAXTREE){
+ if(giTableFile!=null){loadGiToNcbi();}
+ // NOTE(review): this inner USE_TAXTREE test repeats the enclosing condition.
+ if(USE_TAXTREE){tree=loadTaxTree();}
+ addToTree();
+ }
+
+ /* Write statistics to files */
+ writeStats();
+ writeRPKM();
+ if(!BBSPLIT_STYLE){
+ writeRefStats();
+ }else{
+ writeRefStats_BBSplitStyle(readsIn);
+ }
+ writeTaxonomy();
+
+ /* Unload sequence data to save memory */
+ if(RELEASE_TABLES){
+ unloadScaffolds();
+ tree=null;
+ GiToNcbi.unload();
+ }
+
+ // Summary report. Each percentage is count*100.0/readsIn or count*100.0/basesIn,
+ // a floating-point division with no guard against a zero denominator.
+ outstream.println("\nInput: \t"+readsIn+" reads \t\t"+basesIn+" bases.");
+
+ if(ref!=null || literal!=null){
+ outstream.println("Matched reads: \t"+readsMatched+" reads ("+String.format("%.2f",readsMatched*100.0/readsIn)+"%) \t"+
+ basesMatched+" bases ("+String.format("%.2f",basesMatched*100.0/basesIn)+"%)");
+ outstream.println("Unmatched reads: \t"+readsUnmatched+" reads ("+String.format("%.2f",readsUnmatched*100.0/readsIn)+"%) \t"+
+ basesUnmatched+" bases ("+String.format("%.2f",basesUnmatched*100.0/basesIn)+"%)");
+ outstream.flush();
+ }
+ if(qtrimLeft || qtrimRight){
+ outstream.println("QTrimmed: \t"+readsQTrimmed+" reads ("+String.format("%.2f",readsQTrimmed*100.0/readsIn)+"%) \t"+
+ basesQTrimmed+" bases ("+String.format("%.2f",basesQTrimmed*100.0/basesIn)+"%)");
+ }
+ if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
+ outstream.println("FTrimmed: \t"+readsFTrimmed+" reads ("+String.format("%.2f",readsFTrimmed*100.0/readsIn)+"%) \t"+
+ basesFTrimmed+" bases ("+String.format("%.2f",basesFTrimmed*100.0/basesIn)+"%)");
+ }
+ if(minAvgQuality>0 || maxNs>=0){
+ outstream.println("Low quality discards: \t"+readsQFiltered+" reads ("+String.format("%.2f",readsQFiltered*100.0/readsIn)+"%) \t"+
+ basesQFiltered+" bases ("+String.format("%.2f",basesQFiltered*100.0/basesIn)+"%)");
+ }
+ // Accuracy lines are printed only when parsecustom is set.
+ if(parsecustom){
+ outstream.println();
+ outstream.println("Correctly mapped: \t"+correctReads+" reads ("+String.format("%.2f",correctReads*100.0/readsIn)+"%)");
+ outstream.println("Incorrectly mapped: \t"+incorrectReads+" reads ("+String.format("%.2f",incorrectReads*100.0/readsIn)+"%)");
+ }
+// outstream.println("Result: \t"+readsMatched+" reads ("+String.format("%.2f",readsMatched*100.0/readsIn)+"%) \t"+
+// basesMatched+" bases ("+String.format("%.2f",basesMatched*100.0/basesIn)+"%)");
+ }
+
+ /**
+ * Clear stored kmers.
+ */
+ public void unloadKmers(){
+ 	if(keySets==null){return;}
+ 	// Null every slot so the tables can be garbage-collected; the array itself is kept.
+ 	Arrays.fill(keySets, null);
+ }
+
+ /**
+ * Clear stored sequence data.
+ */
+ public void unloadScaffolds(){
+ 	// Empty the name list in place and release its backing storage.
+ 	final boolean hasNames=(scaffoldNames!=null && !scaffoldNames.isEmpty());
+ 	if(hasNames){
+ 		scaffoldNames.clear();
+ 		scaffoldNames.trimToSize();
+ 	}
+
+ 	// Drop the references to all per-scaffold data.
+ 	scaffolds=null;
+ 	scaffoldKmers=null;
+ 	scaffoldLengths=null;
+ 	scaffoldBaseCounts=null;
+ 	scaffoldFragCounts=null;
+ 	scaffoldReadCounts=null;
+ }
+
+ /**
+ * Write statistics about how many reads matched each reference scaffold.
+ */
+ private void writeStats(){
+ 	// Nothing to do unless a stats output file was requested.
+ 	if(outstats==null){return;}
+ 	final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false);
+ 	tsw.start();
+
+ 	/* Create StringNum list of scaffold names and hitcounts. */
+ 	/* The scan starts at index 1; slot 0 appears to be a placeholder - TODO confirm. */
+ 	final ArrayList<StringNum> list=new ArrayList<StringNum>();
+ 	for(int i=1; i<scaffoldNames.size(); i++){
+ 		final long num1=scaffoldReadCounts.get(i), num2=scaffoldBaseCounts.get(i);
+ 		if(num1>0 || !printNonZeroOnly){
+ 			final String s=scaffoldNames.get(i);
+ 			final int len=scaffoldLengths.get(i);
+ 			list.add(new StringNum(s, len, num1, num2));
+ 		}
+ 	}
+ 	Collections.sort(list);
+
+ 	/* Percent multipliers; the denominator is clamped to 1 to avoid dividing by zero */
+ 	final double rmult=100.0/(readsIn>0 ? readsIn : 1);
+ 	final double bmult=100.0/(basesIn>0 ? basesIn : 1);
+
+ 	tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+ 	if(STATS_COLUMNS==3){
+ 		tsw.print(String.format("#Total\t%d\n",readsIn));
+ 		//The global readsMatched is used instead of the per-scaffold sum, which exceeds 100% with ambig=all.
+ 		tsw.print(String.format("#Matched\t%d\t%.5f%%\n",readsMatched,rmult*readsMatched));
+ 		tsw.print("#Name\tReads\tReadsPct\n");
+ 		for(StringNum sn : list){
+ 			tsw.print(String.format("%s\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult)));
+ 		}
+ 	}else{
+ 		tsw.print(String.format("#Total\t%d\t%d\n",readsIn,basesIn));
+ 		//Fixed: the format string previously had two specifiers for four arguments,
+ 		//so the matched-base count and percentage were silently discarded.
+ 		tsw.print(String.format("#Matched\t%d\t%.5f%%\t%d\t%.5f%%\n",readsMatched,rmult*readsMatched,basesMatched,basesMatched*bmult));
+ 		tsw.print("#Name\tReads\tReadsPct\tBases\tBasesPct\n");
+ 		for(StringNum sn : list){
+ 			tsw.print(String.format("%s\t%d\t%.5f%%\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult),sn.bases,(sn.bases*bmult)));
+ 		}
+ 	}
+ 	tsw.poisonAndWait();
+ }
+
+ /**
+  * Convenience overload: writes RPKM statistics to outrpkm using this object's
+  * own input names, read total and per-scaffold counters.
+  */
+ private void writeRPKM(){
+ 	writeRPKM(
+ 			outrpkm,
+ 			in1,
+ 			in2,
+ 			readsIn,
+ 			printNonZeroOnly,
+ 			scaffoldNames,
+ 			scaffoldLengths,
+ 			scaffoldReadCounts,
+ 			scaffoldFragCounts,
+ 			scaffoldBaseCounts);
+ }
+
+ /**
+ * Write RPKM statistics.
+ */
+ public void writeRPKM(String out, String in1, String in2, long readsIn, boolean printNonZeroOnly,
+ 		ArrayList<String> scaffoldNames, IntList scaffoldLengths,
+ 		AtomicLongArray scaffoldReadCounts, AtomicLongArray scaffoldFragCounts, AtomicLongArray scaffoldBaseCounts){
+ 	// No destination, no report.
+ 	if(out==null){return;}
+ 	final TextStreamWriter tsw=new TextStreamWriter(out, overwrite, false, false);
+ 	tsw.start();
+
+ 	/* Total the per-scaffold read and fragment counts; these normalize RPKM and FPKM */
+ 	long mappedReads=0, mappedFrags=0;
+ 	final int counters=scaffoldReadCounts.length();
+ 	for(int idx=0; idx<counters; idx++){
+ 		mappedReads+=scaffoldReadCounts.get(idx);
+ 		mappedFrags+=scaffoldFragCounts.get(idx);
+ 	}
+
+ 	/* Header block. The #Mapped line reports the readsMatched field, not the sum above. */
+ 	tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+ 	tsw.print(String.format("#Reads\t%d\n",readsIn));
+ 	tsw.print(String.format("#Mapped\t%d\n",readsMatched));
+ 	tsw.print(String.format("#RefSequences\t%d\n",Tools.max(0, scaffoldNames.size()-1)));
+ 	tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\tFrags\tFPKM\n");
+
+ 	/* 1e9 over the mapped total, with the total clamped to at least 1 */
+ 	final float readScale=1000000000f/Tools.max(1, mappedReads);
+ 	final float fragScale=1000000000f/Tools.max(1, mappedFrags);
+
+ 	/* One row per scaffold, beginning at index 1 */
+ 	final int scafCount=scaffoldNames.size();
+ 	for(int idx=1; idx<scafCount; idx++){
+ 		final long reads=scaffoldReadCounts.get(idx);
+ 		final long frags=scaffoldFragCounts.get(idx);
+ 		final long bases=scaffoldBaseCounts.get(idx);
+ 		final String name=scaffoldNames.get(idx);
+ 		final int len=scaffoldLengths.get(idx);
+ 		final double invlen=1.0/Tools.max(1, len);
+
+ 		// Skip rows with no reads when only non-zero rows were requested.
+ 		if(reads<=0 && printNonZeroOnly){continue;}
+
+ 		final double coverage=bases*invlen;
+ 		final double rpkm=reads*(readScale*invlen);
+ 		final double fpkm=frags*(fragScale*invlen);
+ 		tsw.print(String.format("%s\t%d\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f\n",name,len,bases,coverage,reads,rpkm,frags,fpkm));
+ 	}
+ 	tsw.poisonAndWait();
+ }
+
+ /**
+ * Write statistics on a per-reference basis.
+ */
+ private void writeRefStats(){
+ 	if(outrefstats==null){return;}
+ 	final TextStreamWriter tsw=new TextStreamWriter(outrefstats, overwrite, false, false);
+ 	tsw.start();
+
+ 	/* Sum the read counts over every scaffold slot */
+ 	long mapped=0;
+ 	final int counters=scaffoldReadCounts.length();
+ 	for(int idx=0; idx<counters; idx++){
+ 		mapped+=scaffoldReadCounts.get(idx);
+ 	}
+
+ 	final int numRefs=refNames.size();
+ 	final long[] refReadCounts=new long[numRefs];
+ 	final long[] refFragCounts=new long[numRefs];
+ 	final long[] refBaseCounts=new long[numRefs];
+ 	final long[] refLengths=new long[numRefs];
+
+ 	/* Fold consecutive scaffolds into their reference. Each reference owns the next */
+ 	/* refScafCounts[r] scaffolds, or exactly one when useRefNames is set. */
+ 	int scaf=1;
+ 	for(int r=1; r<numRefs; r++){
+ 		final int span=(useRefNames ? 1 : refScafCounts[r]);
+ 		for(final int end=scaf+span; scaf<end; scaf++){
+ 			refReadCounts[r]+=scaffoldReadCounts.get(scaf);
+ 			refFragCounts[r]+=scaffoldFragCounts.get(scaf);
+ 			refBaseCounts[r]+=scaffoldBaseCounts.get(scaf);
+ 			refLengths[r]+=scaffoldLengths.get(scaf);
+ 		}
+ 	}
+
+ 	/* Header block */
+ 	tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+ 	tsw.print(String.format("#Reads\t%d\n",readsIn));
+ 	tsw.print(String.format("#Mapped\t%d\n",mapped));
+ 	tsw.print(String.format("#References\t%d\n",numRefs-1));
+ 	tsw.print("#Name\tLength\tScaffolds\tBases\tCoverage\tReads\tRPKM\tFrags\tFPKM\n");
+
+ 	/* The same scale factor is applied to both the read and the fragment column */
+ 	final float mult=1000000000f/Tools.max(1, mapped);
+
+ 	/* One row per reference, beginning at index 1 */
+ 	for(int r=1; r<numRefs; r++){
+ 		final long reads=refReadCounts[r];
+ 		final long frags=refFragCounts[r];
+ 		final long bases=refBaseCounts[r];
+ 		final long len=refLengths[r];
+ 		final int scafs=refScafCounts[r];
+ 		final String name=ReadWrite.stripToCore(refNames.get(r));
+ 		final double invlen=1.0/Tools.max(1, len);
+ 		final double scale=mult*invlen;
+
+ 		if(reads<=0 && printNonZeroOnly){continue;}
+ 		tsw.print(String.format("%s\t%d\t%d\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f\n",name,len,scafs,bases,bases*invlen,reads,reads*scale,frags,frags*scale));
+ 	}
+ 	tsw.poisonAndWait();
+ }
+
+ /**
+ * Write statistics on a per-reference basis.
+ */
+ private void writeRefStats_BBSplitStyle(long totalReads){
+ 	if(outrefstats==null){return;}
+ 	final TextStreamWriter tsw=new TextStreamWriter(outrefstats, overwrite, false, false);
+ 	tsw.start();
+
+ 	final int numRefs=refNames.size();
+ 	final long[] refReadCounts=new long[numRefs];
+ 	final long[] refBaseCounts=new long[numRefs];
+
+ 	/* Fold consecutive scaffolds into their reference. Each reference owns the next */
+ 	/* refScafCounts[r] scaffolds, or exactly one when useRefNames is set. */
+ 	int scaf=1;
+ 	for(int r=1; r<numRefs; r++){
+ 		final int span=(useRefNames ? 1 : refScafCounts[r]);
+ 		for(final int end=scaf+span; scaf<end; scaf++){
+ 			refReadCounts[r]+=scaffoldReadCounts.get(scaf);
+ 			refBaseCounts[r]+=scaffoldBaseCounts.get(scaf);
+ 		}
+ 	}
+
+ 	/* Header line */
+ 	tsw.print("#name\t%unambiguousReads\tunambiguousMB\t%ambiguousReads\tambiguousMB\tunambiguousReads\tambiguousReads\n");
+
+ 	/* Percent-of-total multiplier; totalReads is clamped to at least 1 */
+ 	final float rmult=100f/Tools.max(1, totalReads);
+
+ 	/* Ambiguous counts are not tracked yet, so those columns are always zero */
+ 	final long ambigReads=0; //TODO but not urgent
+ 	final long ambigBases=0; //TODO but not urgent
+ 	final float ambigMB=ambigBases*0.000001f;
+ 	final double ambigReadP=rmult*ambigReads;
+
+ 	/* One row per reference, beginning at index 1 */
+ 	for(int r=1; r<numRefs; r++){
+ 		final long reads=refReadCounts[r];
+ 		final long bases=refBaseCounts[r];
+ 		final float unambigMB=bases*0.000001f;
+ 		final String name=ReadWrite.stripToCore(refNames.get(r));
+ 		final double unambigReadP=rmult*reads;
+
+ 		if(reads<=0 && printNonZeroOnly){continue;}
+ 		tsw.print(String.format("%s\t%.5f\t%.5f\t%.5f\t%.5f\t%d\t%d\n",name,unambigReadP,unambigMB,ambigReadP,ambigMB,reads,ambigReads));
+ 	}
+ 	tsw.poisonAndWait();
+ }
+
+ /**
+ * Write taxonomic information.
+ */
+ private void writeTaxonomy(){
+ 	// Requires both taxonomy mode and a destination file.
+ 	if(!USE_TAXTREE || outtax==null){return;}
+
+ 	/* Total the fragment counts over every scaffold slot */
+ 	long mappedFrags=0;
+ 	final int counters=scaffoldReadCounts.length();
+ 	for(int idx=0; idx<counters; idx++){
+ 		mappedFrags+=scaffoldFragCounts.get(idx);
+ 	}
+
+ 	/* Percent of input fragments; the denominator is clamped to at least 1 */
+ 	final double fragMult=100.0/Tools.max(1, fragsIn);
+
+ 	final TextStreamWriter tsw=new TextStreamWriter(outtax, overwrite, false, false);
+ 	tsw.start();
+
+ 	/* Header block */
+ 	tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n");
+ 	tsw.print(String.format("#Reads\t%d\n",fragsIn));
+ 	tsw.print(String.format("#Mapped\t%d\n",mappedFrags));
+ 	tsw.print(String.format("#Limits\t%d\t%d\t%d\t%d\n", taxNodeCountLimit, taxNodeNumberLimit, taxNodeMinLevel, taxNodeMaxLevel));
+ 	tsw.print("#ID\tCount\tPercent\tLevel\tName\n");
+
+ 	final ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(taxNodeCountLimit, taxNodeMinLevel, taxNodeMaxLevel);
+
+ 	/* Print at most taxNodeNumberLimit rows; a non-positive limit means no cap */
+ 	final int cap=Tools.min(nodes.size(), (taxNodeNumberLimit>0 ? taxNodeNumberLimit : Integer.MAX_VALUE));
+ 	for(TaxNode n : nodes.subList(0, cap)){
+ 		tsw.print(String.format("%d\t%d\t%.4f\t%s\t%s\n", n.id, n.countSum, n.countSum*fragMult, n.levelString(), n.name));
+ 	}
+
+ 	tsw.poisonAndWait();
+ }
+
+// /**
+// * Fills the scaffold names array with reference names.
+// */
+// private void toRefNames(){
+// final int numRefs=refNames.size();
+// for(int r=0, s=1; r<numRefs; r++){
+// final int scafs=refScafCounts[r];
+// final int lim=s+scafs;
+// final String name=ReadWrite.stripToCore(refNames.get(r));
+//// System.err.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name);
+// while(s<lim){
+//// System.err.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name);
+// scaffoldNames.set(s, name);
+// s++;
+// }
+// }
+// }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private static int numKmers(Read r1, Read r2, int k){
+ int x=0;
+ if(r1!=null){
+ x+=Tools.max(r1.length()-k+1, 0);
+ }
+ if(r2!=null){
+ x+=Tools.max(r2.length()-k+1, 0);
+ }
+ return x;
+ }
+
+ private void loadGiToNcbi(){
+ Timer t=new Timer();
+ outstream.println("Loading gi to taxa translation table.");
+ GiToNcbi.initialize(giTableFile);
+ t.stop();
+ if(DISPLAY_PROGRESS){
+ outstream.println("Time: \t"+t);
+ Shared.printMemory();
+ outstream.println();
+ }
+ }
+
+ /**
+  * Obtains the taxonomy tree, either by reading taxTreeFile or, when that is
+  * null, by constructing one from taxNameFile and taxNodeFile.
+  * @return The loaded tree.
+  */
+ private TaxTree loadTaxTree(){
+ 	assert(taxTreeFile!=null || (taxNameFile!=null && taxNodeFile!=null)) : "Must specify both taxname and taxnode files.";
+ 	final Timer timer=new Timer();
+ 	outstream.print("\nLoading tax tree; ");
+
+ 	// A tree file takes precedence over the name/node file pair.
+ 	final boolean fromTreeFile=(taxTreeFile!=null);
+ 	final TaxTree loaded;
+ 	if(!fromTreeFile){
+ 		loaded=new TaxTree(taxNameFile, taxNodeFile);
+ 	}else{
+ 		loaded=ReadWrite.read(TaxTree.class, taxTreeFile, true);
+ 	}
+ 	timer.stop();
+
+ 	if(DISPLAY_PROGRESS){
+ 		outstream.println("time: \t"+timer);
+ 		Shared.printMemory();
+ 		outstream.println();
+ 	}
+ 	return loaded;
+ }
+
+ private void addToTree(){
+ for(int i=0; i<scaffoldFragCounts.length(); i++){
+ long count=scaffoldFragCounts.get(i);
+ if(count>0){
+ String name=scaffoldNames.get(i);
+ assert(name.startsWith("ncbi|") || (name.startsWith("gi|") && GiToNcbi.isInitialized())) :
+ "\nFor taxonomy, all ref names must start with 'gi|' or 'ncbi|'.\n" +
+ "If the names start with 'gi', the gi= flag must be set.\n";
+ int id=GiToNcbi.getID(name);
+ if(id>-1){
+ tree.incrementRaw(id, count);
+ }
+ }
+ }
+ tree.percolateUp();
+ }
+
+ /**
+ * Fills tables with kmers from references, using multiple LoadThread.
+ * @return Number of kmers stored.
+ */
+ private long spawnLoadThreads(){
+ 	Timer t=new Timer();
+ 	// With neither reference files nor literals there is nothing to load.
+ 	if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;}
+ 	long added=0;
+
+ 	/* Create load threads */
+ 	LoadThread[] loaders=new LoadThread[WAYS];
+ 	for(int i=0; i<loaders.length; i++){
+ 		loaders[i]=new LoadThread(i);
+ 		loaders[i].start();
+ 	}
+
+ 	/* For each reference file... */
+
+ 	int refNum=1;
+ 	if(ref!=null){
+
+ 		/* Maps a scaffold name to its ID so that a repeated name reuses the same ID */
+ 		HashMap<String, Integer> nameMap=new HashMap<String, Integer>();
+
+ 		for(String refname : ref){
+
+ 			/* Start an input stream */
+ 			FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, true, true);
+ 			ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true);
+ 			cris.start(); //4567
+ 			ListNum<Read> ln=cris.nextList();
+ 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ 			/* In useRefNames mode the whole file is one scaffold named after the file */
+ 			final String core=ReadWrite.stripToCore(refname);
+ 			if(useRefNames){
+ 				assert(refNum==scaffoldNames.size());
+ 				assert(!nameMap.containsKey(core));
+ 				Integer id=scaffoldNames.size();
+ 				scaffoldNames.add(core);
+ 				nameMap.put(core, id);
+ 			}
+
+ 			/* Iterate through read lists from the input stream */
+ 			while(reads!=null && reads.size()>0){
+ 				{
+ 					/* Assign a unique ID number to each scaffold */
+ 					ArrayList<Read> reads2=new ArrayList<Read>(reads);
+ 					for(Read r1 : reads2){
+ 						final Read r2=r1.mate;
+ 						if(useRefNames){
+ 							r1.id=core;
+ 							if(r2!=null){r2.id=core;}
+ 						}else if(r1.id==null){
+ 							/* Unnamed sequences are named after their prospective index */
+ 							r1.id=Integer.toString(scaffoldNames.size());
+ 						}
+ 						final Integer id;
+ 						{
+ 							Integer x=nameMap.get(r1.id);
+ 							if(x!=null){
+ 								id=x;
+ 							}else{
+ 								id=scaffoldNames.size();
+ 								scaffoldNames.add(r1.id);
+ 								nameMap.put(r1.id, id);
+ 							}
+ 						}
+ 						if(useRefNames){assert(refNum==id);}
+
+ 						refScafCounts[refNum]++;
+ 						int len=r1.length()+r1.mateLength();
+ 						r1.obj=id;
+ 						if(r2!=null){r2.obj=id;}
+
+ 						scaffoldLengths.increment(id, len);
+ 					}
+
+ 					if(REPLICATE_AMBIGUOUS){
+ 						reads2=Tools.replicateAmbiguous(reads2, k);
+ 					}
+
+ 					/* Send a pointer to the read list to each LoadThread */
+ 					for(LoadThread lt : loaders){
+ 						try {
+ 							lt.queue.put(reads2);
+ 						} catch (InterruptedException e) {
+ 							//TODO: This will hang due to still-running threads.
+ 							throw new RuntimeException(e);
+ 						}
+ 					}
+ 				}
+
+ 				/* Dispose of the old list and fetch a new one */
+ 				cris.returnList(ln.id, ln.list.isEmpty());
+ 				ln=cris.nextList();
+ 				reads=(ln!=null ? ln.list : null);
+ 			}
+ 			/* Cleanup. The loop above exits when ln or ln.list is null, so both are checked here. */
+ 			if(ln!=null){
+ 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ 			}
+ 			errorState|=ReadWrite.closeStream(cris);
+ 			refNum++;
+ 		}
+ 	}
+
+ 	/* If there are literal sequences to use as references */
+ 	if(literal!=null){
+ 		ArrayList<Read> list=new ArrayList<Read>(literal.length);
+ 		if(verbose){System.err.println("Adding literals "+Arrays.toString(literal));}
+
+ 		/* Assign a unique ID number to each literal sequence */
+ 		if(useRefNames){
+ 			/* All literals share a single scaffold named "literal" */
+ 			final Integer id=scaffoldNames.size();
+ 			scaffoldNames.add("literal");
+ 			for(int i=0; i<literal.length; i++){
+ 				final Read r=new Read(literal[i].getBytes(), null, id);
+ 				refScafCounts[refNum]++;
+ 				scaffoldLengths.increment(id, r.length());
+ 				r.obj=id;
+ 				list.add(r);
+ 			}
+ 		}else{
+ 			/* Each literal is its own scaffold, named by its index */
+ 			for(int i=0; i<literal.length; i++){
+ 				final int id=scaffoldNames.size();
+ 				final Read r=new Read(literal[i].getBytes(), null, id);
+ 				refScafCounts[refNum]++;
+ 				scaffoldNames.add(""+id);
+ 				scaffoldLengths.set(id, r.length());
+ 				r.obj=id;
+ 				list.add(r);
+ 			}
+ 		}
+
+ 		if(REPLICATE_AMBIGUOUS){
+ 			list=Tools.replicateAmbiguous(list, k);
+ 		}
+
+ 		/* Send a pointer to the read list to each LoadThread */
+ 		for(LoadThread lt : loaders){
+ 			try {
+ 				lt.queue.put(list);
+ 			} catch (InterruptedException e) {
+ 				//TODO: This will hang due to still-running threads.
+ 				throw new RuntimeException(e);
+ 			}
+ 		}
+ 	}
+
+ 	/* Signal loaders to terminate */
+ 	for(LoadThread lt : loaders){
+ 		try {
+ 			lt.queue.put(POISON);
+ 		} catch (InterruptedException e) {
+ 			//TODO: This will hang due to still-running threads.
+ 			throw new RuntimeException(e);
+ 		}
+ 	}
+
+ 	/* Wait for loaders to die, and gather statistics */
+ 	boolean success=true;
+ 	for(LoadThread lt : loaders){
+ 		while(lt.getState()!=Thread.State.TERMINATED){
+ 			try {
+ 				lt.join();
+ 			} catch (InterruptedException e) {
+ 				/* Deliberately retried: the loop keeps waiting until the thread has terminated */
+ 				e.printStackTrace();
+ 			}
+ 		}
+ 		added+=lt.addedT;
+ 		refKmers+=lt.refKmersT;
+ 		refBases+=lt.refBasesT;
+ 		refReads+=lt.refReadsT;
+ 		success&=lt.success;
+ 	}
+ 	if(!success){KillSwitch.kill("Failed loading ref kmers; aborting.");}
+
+ 	//Correct statistics for number of threads, since each thread processes all reference data
+ 	refKmers/=WAYS;
+ 	refBases/=WAYS;
+ 	refReads/=WAYS;
+
+ 	/* Per-scaffold counters, sized to the final number of scaffold names */
+ 	scaffoldReadCounts=new AtomicLongArray(scaffoldNames.size());
+ 	scaffoldFragCounts=new AtomicLongArray(scaffoldNames.size());
+ 	scaffoldBaseCounts=new AtomicLongArray(scaffoldNames.size());
+
+ 	t.stop();
+ 	if(DISPLAY_PROGRESS){
+ 		outstream.println("Added "+added+" kmers; time: \t"+t);
+ 		Shared.printMemory();
+ 		outstream.println();
+ 	}
+
+ 	/* In verbose mode, dump every table to stdout as text */
+ 	if(verbose){
+ 		TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT);
+ 		tsw.start();
+ 		for(AbstractKmerTable table : keySets){
+ 			table.dumpKmersAsText(tsw, k, 1);
+ 		}
+ 		tsw.poisonAndWait();
+ 	}
+
+ 	return added;
+ }
+
+ /**
+  * Match reads against reference kmers, using multiple ProcessThreads.
+  * Opens the read input stream and whichever output streams are configured, optionally
+  * skips leading reads, runs THREADS workers to completion, folds their per-thread
+  * counters into this object's totals, and finally closes all streams.
+  * @param t Timer used for progress reporting; it is started on entry and stopped before returning
+  */
+ private void spawnProcessThreads(Timer t){
+     t.start();
+
+     /* Create read input stream */
+     final ConcurrentReadInputStream cris;
+     final boolean paired;
+     {
+         FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2, qfin1, qfin2);
+         cris.setSampleRate(samplerate, sampleseed);
+         cris.start(); //4567
+         paired=cris.paired();
+         if(!ff1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+     }
+
+     /* Create read output streams */
+     final ConcurrentReadOutputStream rosm, rosu;
+     final MultiCros mcros;
+     if(outu1!=null){
+         final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+         FileFormat ff1=FileFormat.testOutput(outu1, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+         FileFormat ff2=FileFormat.testOutput(outu2, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+         rosu=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
+         rosu.start();
+     }else{rosu=null;}
+     if(outm1!=null){
+         final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+         FileFormat ff1=FileFormat.testOutput(outm1, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+         FileFormat ff2=FileFormat.testOutput(outm2, FileFormat.FASTQ, null, true, overwrite, append, ORDERED);
+         rosm=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true);
+         rosm.start();
+     }else{rosm=null;}
+     if(outpattern!=null){
+         final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.threads()));
+         mcros=new MultiCros(outpattern, null, ORDERED, overwrite, append, true, false, FileFormat.FASTQ, buff);
+     }else{mcros=null;}
+
+     if(rosu!=null || rosm!=null || mcros!=null){
+         t.stop();
+         outstream.println("Started output streams:\t"+t);
+         t.start();
+     }
+
+     /* Optionally skip the first reads, since initial reads may have lower quality */
+     if(skipreads>0){
+         long skipped=0;
+
+         ListNum<Read> ln=cris.nextList();
+         ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+         while(skipped<skipreads && reads!=null && reads.size()>0){
+             skipped+=reads.size();
+
+             //Feed an empty list for this id so the output streams still see every list number
+             if(rosm!=null){rosm.add(new ArrayList<Read>(1), ln.id);}
+             if(rosu!=null){rosu.add(new ArrayList<Read>(1), ln.id);}
+
+             cris.returnList(ln.id, ln.list.isEmpty());
+             ln=cris.nextList();
+             reads=(ln!=null ? ln.list : null);
+         }
+         //The loop tolerates a null list, so the hand-back must too (previously an unconditional dereference).
+         //NOTE(review): the list fetched last is returned here without being processed or forwarded
+         //to the output streams - confirm that dropping it is intended.
+         if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+         if(reads==null || reads.isEmpty()){
+             ReadWrite.closeStreams(cris, rosu, rosm);
+             ReadWrite.closeStreams(mcros);
+             System.err.println("Skipped all of the reads.");
+             System.exit(0);
+         }
+     }
+
+     /* Create ProcessThreads */
+     ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(THREADS);
+     for(int i=0; i<THREADS; i++){alpt.add(new ProcessThread(cris, rosm, rosu, mcros, ALLOW_LOCAL_ARRAYS));}
+     for(ProcessThread pt : alpt){pt.start();}
+
+     /* Wait for threads to die, and gather statistics */
+     for(ProcessThread pt : alpt){
+         //join() may return early on interrupt, so keep joining until the thread has really terminated
+         while(pt.getState()!=Thread.State.TERMINATED){
+             try {
+                 pt.join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         readsIn+=pt.readsInT;
+         fragsIn+=pt.fragsInT;
+         basesIn+=pt.basesInT;
+         readsMatched+=pt.readsMatchedT;
+         basesMatched+=pt.basesMatchedT;
+         readsUnmatched+=pt.readsUnmatchedT;
+         basesUnmatched+=pt.basesUnmatchedT;
+         readsQTrimmed+=pt.readsQTrimmedT;
+         basesQTrimmed+=pt.basesQTrimmedT;
+         readsFTrimmed+=pt.readsFTrimmedT;
+         basesFTrimmed+=pt.basesFTrimmedT;
+         readsQFiltered+=pt.readsQFilteredT;
+         basesQFiltered+=pt.basesQFilteredT;
+
+         correctReads+=pt.correctT;
+         incorrectReads+=pt.incorrectT;
+
+         //Merge thread-local scaffold counters (when the thread kept any) into the shared atomic arrays
+         if(pt.scaffoldReadCountsT!=null && scaffoldReadCounts!=null){
+             for(int i=0; i<pt.scaffoldReadCountsT.length; i++){scaffoldReadCounts.addAndGet(i, pt.scaffoldReadCountsT[i]);}
+             pt.scaffoldReadCountsT=null;
+         }
+         if(pt.scaffoldBaseCountsT!=null && scaffoldBaseCounts!=null){
+             for(int i=0; i<pt.scaffoldBaseCountsT.length; i++){scaffoldBaseCounts.addAndGet(i, pt.scaffoldBaseCountsT[i]);}
+             pt.scaffoldBaseCountsT=null;
+         }
+         if(pt.scaffoldFragCountsT!=null && scaffoldFragCounts!=null){
+             for(int i=0; i<pt.scaffoldFragCountsT.length; i++){scaffoldFragCounts.addAndGet(i, pt.scaffoldFragCountsT[i]);}
+             pt.scaffoldFragCountsT=null;
+         }
+     }
+
+     /* Shut down I/O streams; capture error status */
+     errorState|=ReadWrite.closeStreams(cris, rosu, rosm);
+     errorState|=ReadWrite.closeStreams(mcros);
+     errorState|=ReadStats.writeAll();
+
+     t.stop();
+     if(showSpeed){
+         outstream.println("Processing time: \t\t"+t);
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum.
+ * Read lists arrive through this thread's own {@link #queue}; the thread runs until it
+ * takes the POISON list, then optionally rebalances its table and sets {@link #success}.
+ */
+ private class LoadThread extends Thread{
+ 
+ /**
+ * @param tnum_ Thread number; also the index into keySets of the table this thread fills
+ */
+ public LoadThread(final int tnum_){
+ tnum=tnum_;
+ map=keySets[tnum];
+ }
+ 
+ /**
+ * Get the next list of reads (or scaffolds) from the queue.
+ * Blocks until a list is available; an InterruptedException is printed and the take is retried.
+ * @return List of reads
+ */
+ private ArrayList<Read> fetch(){
+ ArrayList<Read> list=null;
+ while(list==null){
+ try {
+ list=queue.take();
+ } catch (InterruptedException e) {
+ // Interruption is reported but does not stop the loader; the take is retried.
+ e.printStackTrace();
+ }
+ }
+ return list;
+ }
+ 
+ /**
+ * Consumes read lists until the POISON sentinel is seen (compared by reference),
+ * adding the kmers of each read and of its mate, if any, to this thread's table.
+ */
+ @Override
+ public void run(){
+ ArrayList<Read> reads=fetch();
+ while(reads!=POISON){
+ for(Read r1 : reads){
+ assert(r1.pairnum()==0);
+ final Read r2=r1.mate;
+ 
+ addedT+=addToMap(r1, refSkip);
+ if(r2!=null){
+ addedT+=addToMap(r2, refSkip);
+ }
+ }
+ reads=fetch();
+ }
+ 
+// if(AbstractKmerTable.TESTMODE){
+// for(int i=0; i<ll.size; i++){
+// assert(map.contains(ll.get(i), il.get(i)));
+// assert(!map.contains(ll.get(i), Integer.MAX_VALUE));
+// }
+// ll=null;
+// il=null;
+// }
+ 
+ // Rebalance only when the table supports it and holds more than twice its array length in entries
+ if(map.canRebalance() && map.size()>2L*map.arrayLength()){
+ map.rebalance();
+ }
+ success=true;
+ }
+ 
+ /**
+ * Walks the bases of one reference sequence, maintaining a rolling forward and reverse kmer,
+ * and offers each full-length kmer (or every skip-th one when skip>1) to the table.
+ * Thread 0 additionally records the sequence in scaffolds (when storeRefBases is set) and its kmer count in scaffoldKmers.
+ * @param r The current read to process
+ * @param skip Number of bases to skip between kmers
+ * @return Number of kmers stored
+ */
+ private long addToMap(final Read r, final int skip){
+ final byte[] bases=r.bases;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ // Keeps the low 2*k bits, i.e. 2 bits per base for k bases
+ final long mask=~((-1L)<<shift);
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ long added=0;
+ int len=0;
+ int totalKmers=0;
+ 
+ // Per-scaffold bookkeeping is done by thread 0 only
+ if(tnum==0){
+ if(storeRefBases){
+ assert(r.mate==null);
+ assert(scaffolds.size()==(Integer)r.obj) : scaffolds.size()+", "+(Integer)r.obj/*+"\n"+r.toFasta()*/;
+ scaffolds.add(bases);
+ }
+ // Sequences too short to contain a kmer return early below, so their zero count is recorded here
+ if(bases==null || bases.length<k){scaffoldKmers.add(0);}
+ }
+ 
+ if(bases!=null){
+ refReadsT++;
+ refBasesT+=bases.length;
+ }
+ if(bases==null || bases.length<k){return 0;}
+ 
+ // r.obj carries the scaffold number as an Integer (see the assertion against scaffolds.size() above)
+ final int id=(Integer)r.obj;
+ 
+ if(skip>1){ //Process while skipping some kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ // Forward kmer shifts left and appends the base; reverse kmer shifts right and inserts the complement at the top
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ // An 'N' restarts the run of valid bases
+ if(b=='N'){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ totalKmers++;
+ if(len%skip==0){
+ // Base following this kmer, or -1 at the end of the sequence
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
+ added+=addToMap(kmer, rkmer, k, extraBase, id, kmask);
+ }
+ }
+ }
+ }else{ //Process all kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ totalKmers++;
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
+ final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask);
+ added+=atm;
+ }
+ }
+ }
+ refKmersT+=totalKmers;
+ if(tnum==0){scaffoldKmers.add(totalKmers);}
+ 
+ return added;
+ }
+ 
+ 
+ /**
+ * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance.
+ * With hammingDistance==0 only the exact key is stored, and only if it belongs to this thread (key%WAYS==tnum).
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param len Kmer length; must satisfy kmask0==lengthMasks[len]
+ * @param extraBase Base added to end in case of deletions
+ * @param id Scaffold number
+ * @param kmask0 Length mask for len
+ * @return Number of kmers stored
+ */
+ private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0){
+ 
+ assert(kmask0==lengthMasks[len]) : kmask0+", "+len+", "+lengthMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(lengthMasks[len]);
+ 
+ if(verbose){System.err.println("addToMap_A; len="+len+"; kMasks[len]="+lengthMasks[len]);}
+ assert((kmer&kmask0)==0);
+ final long added;
+ if(hammingDistance==0){
+ final long key=toValue(kmer, rkmer, kmask0);
+ // When speed>0, keys whose (key/WAYS)&15 is below speed are not stored
+ if(speed>0 && ((key/WAYS)&15)<speed){return 0;}
+ // Keys owned by another thread are ignored here
+ if(key%WAYS!=tnum){return 0;}
+ if(verbose){System.err.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
+// int[] old=map.getValues(key, new int[1]);
+ 
+// int[] old=map.getValues(key, new int[1]); //123
+ 
+ added=map.set(key, id);
+// assert(old==null || map.contains(key, old)); //123
+// assert(map.contains(key, id)); //123
+// ll.add(key);
+// il.add(id); assert(AbstractKmerTable.TESTMODE);
+ 
+// if(AbstractKmerTable.TESTMODE){
+// for(int i=0; i<ll.size; i++){
+// assert(map.contains(ll.get(i), il.get(i)));
+// assert(!map.contains(ll.get(i), Integer.MAX_VALUE));
+// }
+// }
+ 
+ }else if(editDistance>0){
+ // NOTE(review): this branch is only reached when hammingDistance!=0 - presumably the option parser keeps hammingDistance>=editDistance; confirm.
+// long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
+ added=mutate(kmer, rkmer, len, id, editDistance, extraBase);
+ }else{
+ added=mutate(kmer, rkmer, len, id, hammingDistance, -1);
+ }
+ if(verbose){System.err.println("addToMap added "+added+" keys.");}
+ return added;
+ }
+ 
+// private LongList ll=new LongList();
+// private IntList il=new IntList();
+ 
+ /**
+ * Mutate and store this kmer through 'dist' recursions.
+ * The kmer itself is stored when this thread owns its key; then, while dist>0, every
+ * single-base substitution is recursed into, and when editDistance>0 deletions and insertions too.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param len Kmer length
+ * @param id Scaffold number
+ * @param dist Number of mutations
+ * @param extraBase Base added to end in case of deletions
+ * @return Number of kmers stored
+ */
+ private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){
+ long added=0;
+ 
+ final long key=toValue(kmer, rkmer, lengthMasks[len]);
+ 
+ if(verbose){System.err.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+lengthMasks[len]);}
+ if(key%WAYS==tnum){
+ if(verbose){System.err.println("mutate_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);}
+ int x=map.set(key, id);
+ if(verbose){System.err.println("mutate_B added "+x+" keys.");}
+ added+=x;
+ assert(map.contains(key));
+ }
+ 
+ if(dist>0){
+ final int dist2=dist-1;
+ 
+ //Sub
+ // For each base value j and position i, overwrite position i with j; unchanged results are skipped
+ for(int j=0; j<4; j++){
+ for(int i=0; i<len; i++){
+ final long temp=(kmer&clearMasks[i])|setMasks[j][i];
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, extraBase);
+ }
+ }
+ }
+ 
+ if(editDistance>0){
+ //Del
+ // Only possible when a real following base (0-3) is available to refill the kmer
+ if(extraBase>=0 && extraBase<=3){
+ for(int i=1; i<len; i++){
+ final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase;
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ // The extra base has been consumed, so deeper recursions get -1
+ added+=mutate(temp, rtemp, len, id, dist2, -1);
+ }
+ }
+ }
+ 
+ //Ins
+ // The lowest base is shifted out by an insertion; it is passed on as the extra base for deeper recursions
+ final long eb2=kmer&3;
+ for(int i=1; i<len; i++){
+ final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2);
+ for(int j=0; j<4; j++){
+ final long temp=temp0|setMasks[j][i-1];
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, eb2);
+ }
+ }
+ }
+ }
+ 
+ }
+ 
+ return added;
+ }
+ 
+ /*--------------------------------------------------------------*/
+ 
+ /** Number of kmers stored by this thread */
+ public long addedT=0;
+ /** Number of items encountered by this thread */
+ public long refKmersT=0, refReadsT=0, refBasesT=0;
+ /** Thread number; used to determine which kmers to store */
+ public final int tnum;
+ /** Buffer of input read lists */
+ public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32);
+ 
+ /** Destination for storing kmers */
+ private final AbstractKmerTable map;
+ 
+ /** Completed successfully */
+ boolean success=false;
+ 
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output.
+ */
+ private class ProcessThread extends Thread{
+
+ /**
+  * Creates a worker bound to the shared input stream and the optional output sinks,
+  * and allocates its per-thread statistics and hit-counting buffers.
+  * @param cris_ Read input stream
+  * @param rosm_ Matched read output stream (optional)
+  * @param rosu_ Unmatched read output stream (optional)
+  * @param mcros_ Pattern-based multi-file output (optional)
+  * @param localArrays Permit thread-local scaffold counters; they are used only when there are between 1 and 9999 scaffolds
+  */
+ public ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream rosm_, ConcurrentReadOutputStream rosu_,
+         MultiCros mcros_, boolean localArrays){
+     cris=cris_;
+     rosm=rosm_;
+     rosu=rosu_;
+     mcros=mcros_;
+
+     //A ReadStats object is only needed when at least one histogram is requested
+     final boolean wantStats=(MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY ||
+             MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST);
+     readstats=(wantStats ? new ReadStats() : null);
+
+     final int alen=(scaffoldNames==null ? 0 : scaffoldNames.size());
+     final boolean useLocalCounters=(localArrays && alen>0 && alen<10000);
+     scaffoldReadCountsT=(useLocalCounters ? new long[alen] : null);
+     scaffoldBaseCountsT=(useLocalCounters ? new long[alen] : null);
+     scaffoldFragCountsT=(useLocalCounters ? new long[alen] : null);
+
+     //Exactly one of the two hit-counting structures is allocated
+     countVector=(USE_COUNTVECTOR ? new IntList(1000) : null);
+     countArray=(USE_COUNTVECTOR ? null : new int[alen]);
+ }
+
+ /**
+  * Worker loop: repeatedly takes a list of reads from the input stream and, for each read (or pair),
+  * gathers histogram statistics, applies chastity filtering, forced trimming, quality trimming and
+  * quality/N/length filtering, then matches surviving reads against the reference kmer tables and
+  * routes them to the matched, unmatched, and/or pattern outputs.
+  */
+ @Override
+ public void run(){
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     final ArrayList<Read> mlist=(rosm==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> ulist=(rosu==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayListSet als=(outpattern==null ? null : new ArrayListSet(ORDERED));
+
+     //While there are more reads lists...
+     while(reads!=null && reads.size()>0){
+
+         //For each read (or pair) in the list...
+         for(int i=0; i<reads.size(); i++){
+             final Read r1=reads.get(i);
+             final Read r2=r1.mate;
+
+             if(!r1.validated()){r1.validate(true);}
+             if(r2!=null && !r2.validated()){r2.validate(true);}
+
+             if(readstats!=null){
+                 if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r1);}
+                 if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r1);}
+                 if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r1);}
+                 if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r1);}
+
+                 if(MAKE_EHIST){readstats.addToErrorHistogram(r1);}
+                 if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);}
+                 if(MAKE_LHIST){readstats.addToLengthHistogram(r1);}
+                 if(MAKE_GCHIST){readstats.addToGCHistogram(r1);}
+                 if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);}
+             }
+
+             final int initialLength1=r1.length();
+             final int initialLength2=(r1.mateLength());
+
+             //Minimum surviving length: the larger of the absolute minimum and a fraction of the original length
+             final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength);
+             final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength);
+
+             if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+             fragsInT++;
+             readsInT+=(1+r1.mateCount());
+             basesInT+=(r1.length()+r1.mateLength());
+
+             boolean remove=false;
+
+             if(chastityFilter){
+                 if(r1!=null && r1.failsChastity()){
+                     basesQFilteredT+=r1.length();
+                     readsQFilteredT++;
+                     r1.setDiscarded(true);
+                 }
+                 if(r2!=null && r2.failsChastity()){
+                     basesQFilteredT+=r2.length();
+                     readsQFilteredT++;
+                     r2.setDiscarded(true);
+                 }
+             }
+
+             if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){
+                 if(r1!=null && !r1.discarded()){
+                     final int len=r1.length();
+                     final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+                     final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+                     final int b1=forceTrimRight>0 ? forceTrimRight : len;
+                     final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+                     final int b=Tools.min(b0, b1, b2);
+                     final int x=TrimRead.trimToPosition(r1, a, b, 1);
+                     basesFTrimmedT+=x;
+                     readsFTrimmedT+=(x>0 ? 1 : 0);
+                     if(r1.length()<minlen1){r1.setDiscarded(true);}
+                 }
+                 if(r2!=null && !r2.discarded()){
+                     final int len=r2.length();
+                     final int a=forceTrimLeft>0 ? forceTrimLeft : 0;
+                     final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len;
+                     final int b1=forceTrimRight>0 ? forceTrimRight : len;
+                     final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len;
+                     final int b=Tools.min(b0, b1, b2);
+                     final int x=TrimRead.trimToPosition(r2, a, b, 1);
+                     basesFTrimmedT+=x;
+                     readsFTrimmedT+=(x>0 ? 1 : 0);
+                     if(r2.length()<minlen2){r2.setDiscarded(true);}
+                 }
+             }
+
+             if(removePairsIfEitherBad){remove=r1.discarded() || (r2!=null && r2.discarded());}
+             else{remove=r1.discarded() && (r2==null || r2.discarded());}
+
+             if(!remove){
+                 //Do quality trimming
+
+                 int rlen1=0, rlen2=0;
+                 if(r1!=null){
+                     if(qtrimLeft || qtrimRight){
+                         int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+                         basesQTrimmedT+=x;
+                         readsQTrimmedT+=(x>0 ? 1 : 0);
+                     }
+                     rlen1=r1.length();
+                     if(rlen1<minlen1 || rlen1>maxReadLength){
+                         r1.setDiscarded(true);
+                         if(verbose){System.err.println(r1.id+" discarded due to length.");}
+                     }
+                 }
+                 if(r2!=null){
+                     if(qtrimLeft || qtrimRight){
+                         int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+                         basesQTrimmedT+=x;
+                         readsQTrimmedT+=(x>0 ? 1 : 0);
+                     }
+                     rlen2=r2.length();
+                     if(rlen2<minlen2 || rlen2>maxReadLength){
+                         r2.setDiscarded(true);
+                         if(verbose){System.err.println(r2.id+" discarded due to length.");}
+                     }
+                 }
+
+                 //Discard reads if too short
+                 //NOTE(review): bases go to the QFiltered counter but reads to the QTrimmed counter here,
+                 //and removed reads are counted again in the 'remove' branch below - confirm the intended statistics.
+                 if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+                         (r1.discarded() && (r2==null || r2.discarded()))){
+                     basesQFilteredT+=(r1.length()+r1.mateLength());
+                     readsQTrimmedT+=1+r1.mateCount();
+                     remove=true;
+                 }
+             }
+
+             if(!remove){
+                 //Do quality filtering
+
+                 //Determine whether to discard the reads based on average quality
+                 if(minAvgQuality>0){
+                     if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){
+                         r1.setDiscarded(true);
+                         if(verbose){System.err.println(r1.id+" discarded due to low quality.");}
+                     }
+                     if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){
+                         r2.setDiscarded(true);
+                         if(verbose){System.err.println(r2.id+" discarded due to low quality.");}
+                     }
+                 }
+                 //Determine whether to discard the reads based on the presence of Ns
+                 if(maxNs>=0){
+                     if(r1!=null && r1.countUndefined()>maxNs){
+                         r1.setDiscarded(true);
+                         if(verbose){System.err.println(r1.id+" discarded due to Ns.");}
+                     }
+                     if(r2!=null && r2.countUndefined()>maxNs){
+                         r2.setDiscarded(true);
+                         if(verbose){System.err.println(r2.id+" discarded due to Ns.");}
+                     }
+                 }
+
+                 //Determine whether to discard the reads based on a lack of useful kmers
+                 if(minConsecutiveBases>0){
+                     if(r1!=null && !r1.discarded() && !r1.hasMinConsecutiveBases(minConsecutiveBases)){r1.setDiscarded(true);}
+                     if(r2!=null && !r2.discarded() && !r2.hasMinConsecutiveBases(minConsecutiveBases)){r2.setDiscarded(true);}
+                 }
+
+                 //Discard reads that failed any of the filters above
+                 if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) ||
+                         (r1.discarded() && (r2==null || r2.discarded()))){
+                     basesQFilteredT+=(r1.length()+r1.mateLength());
+                     readsQFilteredT+=1+r1.mateCount();
+                     remove=true;
+                 }
+             }
+
+             final int sites, assigned;
+             if(remove){
+                 //NOTE(review): reads removed by an earlier stage were already added to these counters there.
+                 if(r1!=null){
+                     basesQFilteredT+=r1.length();
+                     readsQFilteredT++;
+                 }
+                 if(r2!=null){
+                     basesQFilteredT+=r2.length();
+                     readsQFilteredT++;
+                 }
+                 sites=assigned=0;
+             }else{
+
+                 if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+
+                 //Do kmer matching
+                 if(keepPairsTogether){
+
+                     final int a, b, max;
+
+                     //Hits from both mates are pooled into one id/count list
+                     if(countArray==null){
+                         countVector.size=0;
+                         a=findBestMatch(r1, keySets, countVector);
+                         b=findBestMatch(r2, keySets, countVector);
+                         if(verbose){System.err.println("countVector: "+countVector);}
+                         max=condenseLoose(countVector, idList1, countList1);
+                     }else{
+                         idList1.size=0;
+                         a=findBestMatch(r1, keySets, countArray, idList1);
+                         b=findBestMatch(r2, keySets, countArray, idList1);
+
+                         max=condenseLoose(countArray, idList1, countList1);
+                     }
+
+                     if(verbose){
+                         System.err.println("idList1: "+idList1);
+                         System.err.println("countList1: "+countList1);
+                     }
+                     if(rename){
+                         rename(r1, idList1, countList1);
+                         rename(r2, idList1, countList1);
+                     }
+                     filterTopScaffolds(r1, r2, idList1, countList1, finalList1, max, clearzone);
+                     if(verbose){
+                         System.err.println("idList1: "+idList1);
+                         System.err.println("countList1: "+countList1);
+                         System.err.println("finalList1: "+finalList1);
+                     }
+                     sites=finalList1.size;
+
+                     final int minhits=Tools.max(minKmerHits, (int)(minKmerFraction*numKmers(r1, r2, k)));
+                     if(max>=minhits){
+                         assigned=assignTogether(r1, r2, als);
+                     }else{
+                         readsUnmatchedT+=1+r1.mateCount();
+                         basesUnmatchedT+=r1.length()+r1.mateLength();
+                         assigned=0;
+                     }
+
+                 }else{
+                     final int max1, max2, a, b;
+                     {
+                         if(countArray==null){
+                             countVector.size=0;
+                             a=findBestMatch(r1, keySets, countVector);
+                             max1=condenseLoose(countVector, idList1, countList1);
+                         }else{
+                             idList1.size=0;
+                             a=findBestMatch(r1, keySets, countArray, idList1);
+                             max1=condenseLoose(countArray, idList1, countList1);
+                         }
+                         if(rename){rename(r1, idList1, countList1);}
+                         filterTopScaffolds(r1, null, idList1, countList1, finalList1, max1, clearzone);
+                     }
+                     if(r2!=null){
+                         if(countArray==null){
+                             countVector.size=0;
+                             b=findBestMatch(r2, keySets, countVector);
+                             //Read 2 counts must land in countList2, which is what the filter and rename below consume
+                             //(previously written to countList1, leaving countList2 stale)
+                             max2=condenseLoose(countVector, idList2, countList2);
+                         }else{
+                             idList2.size=0;
+                             b=findBestMatch(r2, keySets, countArray, idList2);
+                             max2=condenseLoose(countArray, idList2, countList2);
+                         }
+                         filterTopScaffolds(r2, null, idList2, countList2, finalList2, max2, clearzone);
+                         if(rename){rename(r2, idList2, countList2);}
+                     }else{
+                         max2=0;
+                         //Do not let the previous pair's read 2 results leak into this unpaired read
+                         finalList2.size=0;
+                     }
+
+                     sites=finalList1.size+finalList2.size;
+
+                     assigned=assignIndependently(r1, r2, max1, max2, als);
+                 }
+             }
+
+             //Only r1 is added; the mate travels with it via r1.mate
+             if(remove || assigned<1){
+                 if(ulist!=null){ulist.add(r1);}
+             }else{
+                 if(mlist!=null){mlist.add(r1);}
+             }
+
+         }
+
+         //Send unmatched list to unmatched output stream
+         if(rosu!=null){
+             rosu.add(ulist, ln.id);
+             ulist.clear();
+         }
+
+         //Send matched list to matched output stream
+         if(rosm!=null){
+             rosm.add(mlist, ln.id);
+             mlist.clear();
+         }
+
+         if(mcros!=null){
+             mcros.add(als, ln.id);
+         }
+
+         //Fetch a new read list
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     //Hand back the terminal list, if there is one; the loop above already tolerates a null list
+     if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * @param r
+ * @param idList
+ * @param countList
+ */
+ private void rename(Read r, IntList idList, IntList countList) {
+ if(r==null || idList.size<1){return;}
+ StringBuilder sb=new StringBuilder();
+ if(r.id==null){sb.append(r.numericID);}
+ else{sb.append(r.id);}
+ for(int i=0; i<idList.size; i++){
+ int id=idList.get(i);
+ int count=countList.get(i);
+ sb.append('\t');
+ sb.append(scaffoldNames.get(id));
+ sb.append('=');
+ sb.append(count);
+ }
+ r.id=sb.toString();
+ }
+
+ /**
+  * Assigns a read and its mate, as a unit, to the scaffolds currently held in finalList1,
+  * resolving multiple candidates according to ambigMode, and updates scaffold and match counters.
+  * @param r1 Read 1
+  * @param r2 Read 2
+  * @param als Pattern-output accumulator (may be null)
+  * @return Number of sites assigned
+  */
+ private int assignTogether(Read r1, Read r2, ArrayListSet als){
+     final int sites=finalList1.size;
+     final int lenSum=r1.length()+(r1.mateLength());
+     final int readSum=(r2==null ? 1 : 2);
+
+     //Select the half-open range [first, limit) of finalList1 that receives the pair
+     int first=0;
+     final int limit;
+     if(sites<2 || ambigMode==AMBIG_ALL){
+         limit=sites;
+     }else if(ambigMode==AMBIG_TOSS){
+         limit=0;
+     }else if(ambigMode==AMBIG_FIRST){
+         finalList1.sort();
+         limit=1;
+     }else if(ambigMode==AMBIG_RANDOM){
+         first=(int)(r1.numericID%sites);
+         limit=first+1;
+     }else{
+         throw new RuntimeException("Unknown mode "+ambigMode);
+     }
+
+     for(int j=first; j<limit; j++){
+         final int id=finalList1.get(j);
+
+         if(als!=null){
+             als.add(r1, scaffoldNames.get(id));
+         }
+
+         //Correctness is judged against the first assigned site only
+         if(parsecustom && j==first){
+             final String scafName=scaffoldNames.get(id);
+             final String rname=r1.parseCustomRname();
+             final boolean correct=scafName.equals(rname);
+             if(correct){
+                 correctT+=(1+r1.mateCount());
+             }else{
+                 incorrectT+=(1+r1.mateCount());
+             }
+         }
+
+         if(scaffoldReadCountsT==null){
+             scaffoldReadCounts.addAndGet(id, readSum);
+             scaffoldBaseCounts.addAndGet(id, lenSum);
+             scaffoldFragCounts.addAndGet(id, 1);
+         }else{
+             scaffoldReadCountsT[id]+=readSum;
+             scaffoldBaseCountsT[id]+=lenSum;
+             scaffoldFragCountsT[id]++;
+         }
+     }
+
+     //An empty range means the pair was tossed or had no candidate sites
+     if(first>=limit){
+         readsUnmatchedT+=1+r1.mateCount();
+         basesUnmatchedT+=r1.length()+r1.mateLength();
+     }else{
+         readsMatchedT+=1+r1.mateCount();
+         basesMatchedT+=r1.length()+r1.mateLength();
+     }
+
+     return limit-first;
+ }
+
+ /**
+  * Assigns each mate separately: read 1 to the scaffolds in finalList1 and read 2 to those in finalList2,
+  * each only if its best hit count reaches the minimum-hits threshold. Multiple candidates are resolved
+  * according to ambigMode. The fragment count is credited through read 1 when max1>=max2, otherwise through read 2.
+  * NOTE(review): a read whose best hit count is below the threshold is counted neither as matched
+  * nor as unmatched here - confirm that this is intended.
+  * @param r1 Read 1 (must not be null)
+  * @param r2 Read 2 (may be null)
+  * @param max1 Highest match count for read 1
+  * @param max2 Highest match count for read 2
+  * @param als Pattern-output accumulator; only supported for unpaired reads
+  * @return Number of sites assigned
+  */
+ private int assignIndependently(Read r1, Read r2, int max1, int max2, ArrayListSet als){
+     assert(als==null || r2==null) : "Pattern output does not work with keepPairsTogether=false and paired reads\n"+als;
+     int assigned=0;
+     if(max1>=Tools.max(minKmerHits, (int)(minKmerFraction*numKmers(r1, null, k)))){
+         final int sites=finalList1.size;
+         final int lenSum=r1.length();
+         final int start, stop;
+
+         if(sites<2 || ambigMode==AMBIG_ALL){
+             start=0;
+             stop=sites;
+         }else if(ambigMode==AMBIG_TOSS){
+             start=stop=0;
+         }else if(ambigMode==AMBIG_FIRST){
+             finalList1.sort();
+             start=0;
+             stop=1;
+         }else if(ambigMode==AMBIG_RANDOM){
+             start=(int)(r1.numericID%sites);
+             stop=start+1;
+         }else{
+             throw new RuntimeException("Unknown mode "+ambigMode);
+         }
+
+         for(int j=start; j<stop; j++){
+             int id=finalList1.get(j);
+
+             if(als!=null){
+                 als.add(r1, scaffoldNames.get(id));
+             }
+
+             if(parsecustom && j==start){
+                 String scafName=scaffoldNames.get(id);
+                 String rname=r1.parseCustomRname();
+                 if(scafName.equals(rname)){
+                     correctT++;
+                 }else{
+                     incorrectT++;
+                 }
+             }
+
+             if(scaffoldReadCountsT!=null){
+                 scaffoldReadCountsT[id]++;
+                 scaffoldBaseCountsT[id]+=lenSum;
+                 if(max1>=max2){
+                     scaffoldFragCountsT[id]++;
+                 }
+             }else{
+                 scaffoldReadCounts.addAndGet(id, 1);
+                 scaffoldBaseCounts.addAndGet(id, lenSum);
+                 if(max1>=max2){
+                     scaffoldFragCounts.addAndGet(id, 1);
+                 }
+             }
+         }
+         if(start<stop){
+             readsMatchedT++;
+             basesMatchedT+=r1.length();
+             assigned+=(stop-start);
+         }else{
+             readsUnmatchedT++;
+             basesUnmatchedT+=r1.length();
+         }
+     }
+
+     //Guard on r2: with a threshold <=0 the comparison passes for max2==0 and r2.length() would throw
+     if(r2!=null && max2>=Tools.max(minKmerHits, (int)(minKmerFraction*numKmers(r2, null, k)))){
+         final int sites=finalList2.size;
+         final int lenSum=r2.length();
+         final int start, stop;
+
+         if(sites<2 || ambigMode==AMBIG_ALL){
+             start=0;
+             stop=sites;
+         }else if(ambigMode==AMBIG_TOSS){
+             start=stop=0;
+         }else if(ambigMode==AMBIG_FIRST){
+             finalList2.sort();
+             start=0;
+             stop=1;
+         }else if(ambigMode==AMBIG_RANDOM){
+             start=(int)(r2.numericID%sites);
+             stop=start+1;
+         }else{
+             throw new RuntimeException("Unknown mode "+ambigMode);
+         }
+
+         for(int j=start; j<stop; j++){
+             int id=finalList2.get(j);
+
+             if(als!=null){
+                 als.add(r2, scaffoldNames.get(id));
+                 throw new RuntimeException("Pattern output does not currently work with keepPairsTogether=false");
+             }
+
+             if(parsecustom && j==start){
+                 String scafName=scaffoldNames.get(id);
+                 String rname=r2.parseCustomRname();
+                 if(scafName.equals(rname)){
+                     correctT++;
+                 }else{
+                     incorrectT++;
+                 }
+             }
+
+             if(scaffoldReadCountsT!=null){
+                 scaffoldReadCountsT[id]++;
+                 scaffoldBaseCountsT[id]+=lenSum;
+                 //Strict comparison so a tie credits the fragment only once, via read 1
+                 if(max2>max1){
+                     scaffoldFragCountsT[id]++;
+                 }
+             }else{
+                 scaffoldReadCounts.addAndGet(id, 1);
+                 scaffoldBaseCounts.addAndGet(id, lenSum);
+                 if(max2>max1){
+                     scaffoldFragCounts.addAndGet(id, 1);
+                 }
+             }
+         }
+         if(start<stop){
+             readsMatchedT++;
+             basesMatchedT+=r2.length();
+             assigned+=(stop-start);
+         }else{
+             readsUnmatchedT++;
+             basesUnmatchedT+=r2.length();
+         }
+     }
+
+     return assigned;
+ }
+
+ /**
+ * Pack a list of nonunique values into a list of unique values and a list of their counts.
+ * @param loose Nonunique values
+ * @param packed Unique values
+ * @param counts Counts of values
+ * @return
+ */
+ private int condenseLoose(IntList loose, IntList packed, IntList counts){
+ packed.size=0;
+ counts.size=0;
+ if(loose.size<1){return 0;}
+ loose.sort();
+ int prev=-1;
+ int max=0;
+ int count=0;
+ for(int i=0; i<loose.size; i++){
+ int id=loose.get(i);
+// System.err.println("i="+i+", id="+id+", count="+count+", prev="+prev);
+ if(id==prev){
+ count++;
+ }else{
+ if(count>0){
+ packed.add(prev);
+ counts.add(count);
+ max=Tools.max(count, max);
+// assert(false) : "i="+i+", "+id+", "+count+", "+packed+", "+counts;
+ }
+ prev=id;
+ count=1;
+ }
+ }
+ if(count>0){
+ packed.add(prev);
+ counts.add(count);
+ max=Tools.max(count, max);
+ }
+ return max;
+ }
+
+ /**
+  * Copies the counter of every id listed in packed out of the counter array into counts,
+  * zeroing each counter as it is read so the array can be reused.
+  * @param loose Counter array indexed by id
+  * @param packed Ids whose counters should be collected
+  * @param counts Receives the counter values, parallel to packed
+  * @return The largest counter value seen, or 0 when packed is empty
+  */
+ private int condenseLoose(int[] loose, IntList packed, IntList counts){
+     counts.size=0;
+
+     int best=0;
+     for(int idx=0; idx<packed.size; idx++){
+         final int slot=packed.get(idx);
+         final int hits=loose[slot];
+         loose[slot]=0;
+         counts.add(hits);
+         best=Tools.max(best, hits);
+     }
+     return best;
+ }
+
+ /**
+  * Selects which candidate scaffolds to keep, writing their ids to out (which is cleared first).
+  * Uses the contained-reference test when processContainedRef is set, otherwise the clearzone threshold.
+  * @param r1 Read 1
+  * @param r2 Read 2 (may be null)
+  * @param packed Candidate scaffold ids
+  * @param counts Hit counts, parallel to packed
+  * @param out Receives the retained scaffold ids
+  * @param max Highest hit count among the candidates
+  * @param cz Clearzone: how far below max a count may fall and still be kept
+  */
+ private void filterTopScaffolds(Read r1, Read r2, IntList packed, IntList counts, IntList out, int max, int cz){
+     out.size=0;
+     if(packed.size>0){
+         if(!processContainedRef){
+             filterTopScaffolds_withClearzone(packed, counts, out, max, cz);
+         }else{
+             filterTopScaffolds_withContainedRef(r1, r2, packed, counts, out);
+         }
+     }
+ }
+
+ /**
+  * Keeps each candidate scaffold that passes the containment test. With stored reference bases this
+  * checks either read against the scaffold via Tools.containsForward (and Tools.containsReverse when
+  * rcomp is set); otherwise the hit count must reach the scaffold's kmer count.
+  * @param r1 Read 1
+  * @param r2 Read 2 (may be null)
+  * @param packed Candidate scaffold ids
+  * @param counts Hit counts, parallel to packed
+  * @param out Receives the retained scaffold ids
+  */
+ private void filterTopScaffolds_withContainedRef(Read r1, Read r2, IntList packed, IntList counts, IntList out){
+     for(int idx=0; idx<packed.size; idx++){
+         final int scaf=packed.get(idx);
+         final int hits=counts.get(idx);
+         final boolean keep;
+         if(storeRefBases){
+             final byte[] ref=scaffolds.get(scaf);
+             //Forward orientation is tried first; reverse only if that fails and rcomp is enabled
+             keep=(Tools.containsForward(r1.bases, ref, hammingDistance)>=0 ||
+                     (r2!=null && Tools.containsForward(r2.bases, ref, hammingDistance)>=0))
+                 || (rcomp && (Tools.containsReverse(r1.bases, ref, hammingDistance)>=0 ||
+                     (r2!=null && Tools.containsReverse(r2.bases, ref, hammingDistance)>=0)));
+         }else{
+             keep=(hits>=scaffoldKmers.get(scaf));
+         }
+         if(keep){out.add(scaf);}
+     }
+ }
+
+ /**
+  * Keeps every candidate whose hit count is within cz of the best count (and at least 1).
+  * @param packed Candidate scaffold ids
+  * @param counts Hit counts, parallel to packed
+  * @param out Receives the retained scaffold ids
+  * @param max Highest hit count among the candidates
+  * @param cz Clearzone width
+  */
+ private void filterTopScaffolds_withClearzone(IntList packed, IntList counts, IntList out, int max, int cz){
+     final int floor=Tools.max(1, max-cz);
+     for(int idx=0; idx<packed.size; idx++){
+         final int hits=counts.get(idx);
+         assert((hits>0) && hits<=max) : hits+"\n"+packed+"\n"+counts;
+         if(hits<floor){continue;}
+         out.add(packed.get(idx));
+     }
+ }
+
+ /**
+  * Looks up the scaffold ids stored for a kmer. With qHDist>0 the lookup is delegated to
+  * getValuesQHD, which also searches substitution variants; otherwise only the exact kmer is queried.
+  * (The former in-line substitution loop was unreachable, since every call with qHDist>0 returns
+  * at the first statement, and has been removed.)
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param lengthMask Bitmask with single '1' set to left of kmer
+  * @param qPos Position of kmer in query
+  * @param len kmer length
+  * @param qHDist Hamming distance
+  * @param sets Kmer hash tables
+  * @return Ids found for the kmer, or null when the lookup is skipped or yields no array; callers stop reading at the first -1
+  */
+ private final int[] getValues(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets){
+     if(qHDist>0){return getValuesQHD(kmer, rkmer, lengthMask, qPos, len, qHDist, sets, qhList);}
+     return getValuesInner(kmer, rkmer, lengthMask, qPos, sets);
+ }
+
+ /**
+  * Collects into list the ids found for a kmer and its substitution variants within qHDist.
+  * The list is deduplicated by sort when it grows beyond qhdistSizeLimit.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param lengthMask Bitmask with single '1' set to left of kmer
+  * @param qPos Position of kmer in query
+  * @param len kmer length
+  * @param qHDist Hamming distance; must be positive
+  * @param sets Kmer hash tables
+  * @param list Scratch list that is cleared and refilled; its backing array is what gets returned
+  * @return The list's backing array terminated by -1, or null if nothing was found
+  */
+ private final int[] getValuesQHD(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets, IntList list){
+     assert(qHDist>0);
+     list.clear();
+     getValuesQHD_inner(kmer, rkmer, lengthMask, qPos, len, qHDist, sets, list);
+
+     final boolean oversized=(list.size>qhdistSizeLimit);
+     if(oversized){
+         list.sort();
+         list.shrinkToUnique();
+     }
+
+     if(list.size<1){return null;}
+     list.add(-1);//indicates end
+     return list.array;
+ }
+
+ /**
+  * Recursive worker for getValuesQHD: appends to list the ids stored for this kmer, then, while
+  * qHDist>0, recurses into every single-base substitution of it.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param lengthMask Bitmask with single '1' set to left of kmer
+  * @param qPos Position of kmer in query
+  * @param len kmer length
+  * @param qHDist Remaining Hamming distance
+  * @param sets Kmer hash tables
+  * @param list Accumulator for the ids found
+  */
+ private final void getValuesQHD_inner(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets, IntList list){
+     //While the list is short, duplicates are rejected by linear search; beyond this size ids are simply appended
+     final int sizeLimit=10;
+     final int[] ids=getValuesInner(kmer, rkmer, lengthMask, qPos, sets);
+     //getValuesInner returns null when the position or key is skipped, so guard before iterating
+     if(ids!=null){
+         for(int x : ids){
+             if(x<0){break;}
+             if(list.size>sizeLimit || !list.contains(x)){list.add(x);}
+         }
+     }
+     if(qHDist>0){
+         final int qHDist2=qHDist-1;
+
+         //Sub
+         for(int j=0; j<4; j++){
+             for(int i=0; i<len; i++){
+                 final long temp=(kmer&clearMasks[i])|setMasks[j][i];
+                 if(temp!=kmer){
+                     long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+                     getValuesQHD_inner(temp, rtemp, lengthMask, qPos, len, qHDist2, sets, list);
+                 }
+             }
+         }
+     }
+ }
+
+ /**
+  * Converts a kmer to its table key and looks it up in the table selected by key%WAYS.
+  * The key is the larger of kmer and rkmer when rcomp is set (otherwise kmer), masked with
+  * middleMask and combined with lengthMask.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param lengthMask Bitmask with single '1' set to left of kmer
+  * @param qPos Position of kmer in query
+  * @param sets Kmer hash tables
+  * @return Value stored in table, or null when the position is skipped by qSkip or the key is skipped by the speed setting
+  */
+ private final int[] getValuesInner(final long kmer, final long rkmer, final long lengthMask, final int qPos, final AbstractKmerTable[] sets){
+     assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+     if(qSkip>1 && (qPos%qSkip!=0)){return null;}
+
+     final long canonical=(rcomp ? Tools.max(kmer, rkmer) : kmer);
+     final long key=(canonical&middleMask)|lengthMask;
+
+     //Unless acceleration is disabled, keys whose (key/WAYS)&15 is below speed are not queried
+     final boolean skipKey=(!noAccel && ((key/WAYS)&15)<speed);
+     if(skipKey){return null;}
+
+     if(verbose){System.err.println("Testing key "+key);}
+     final AbstractKmerTable table=sets[(int)(key%WAYS)];
+     if(verbose){System.err.println("Found set "+table.arrayLength()+", "+table.size());}
+     final int[] found=table.getValues(key, singleton);
+     if(verbose){System.err.println("Found array "+(found==null ? "null" : Arrays.toString(found)));}
+     return found;
+ }
+
+
+ /**
+ * Scans the kmers of a read and appends the IDs found for each matching kmer to hits.
+ * A kmer counts as a match when getValues returns a non-null array whose first element is greater than -1.
+ * Scanning stops after the first match when matchMode is MATCH_FIRST, and after a match
+ * carrying a single ID when matchMode is MATCH_UNIQUE.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @param hits Receives every ID of every matching kmer; an ID shared by several kmers is appended once per kmer
+ * @return Number of matching kmers; 0 if the read or its bases are null or no kmers are stored; -1 if the read is shorter than k
+ */
+ private final int findBestMatch(final Read r, final AbstractKmerTable sets[], IntList hits){
+ if(r==null || r.bases==null || storedKmers<1){return 0;}
+ final byte[] bases=r.bases;
+ final int minlen=k-1; //Index of the last base of the first complete kmer
+ final int minlen2=(maskMiddle ? k/2 : k); //Minimum value of len before a lookup is attempted
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift); //Keeps the low 2*k bits, i.e. k bases at 2 bits each
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+
+ if(bases==null || bases.length<k){return -1;}
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;} //With forbidNs, len counts the bases since the last N
+ if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int[] ids=getValues(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(ids!=null && ids[0]>-1){
+ for(int id : ids){
+ if(id==-1){break;} //-1 terminates the valid portion of ids
+ hits.add(id);
+ }
+ if(verbose){System.err.println("Found = "+(found+1)+"/"+minKmerHits);}
+ found++;
+ // assert(false) : (matchMode==MATCH_FIRST)+", "+(matchMode==MATCH_UNIQUE)+", "+ (ids.length==1 || ids[1]==-1);
+ if(matchMode==MATCH_FIRST || (matchMode==MATCH_UNIQUE && (ids.length==1 || ids[1]==-1))){break;}
+ }
+ }
+ }
+ return found;
+ }
+
+ /**
+ * Scans the kmers of a read and counts matches per ID.
+ * For every ID of every matching kmer, hits[id] is incremented; when that makes hits[id] equal to 1
+ * the ID is also appended to idList.
+ * A kmer counts as a match when getValues returns a non-null array whose first element is greater than -1.
+ * Scanning stops after the first match when matchMode is MATCH_FIRST, and after a match
+ * carrying a single ID when matchMode is MATCH_UNIQUE.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @param hits Per-ID counters indexed by ID; presumably zeroed by the caller beforehand - TODO confirm
+ * @param idList Receives each ID whose counter reaches 1 during this call
+ * @return Number of matching kmers; 0 if the read or its bases are null or no kmers are stored; -1 if the read is shorter than k
+ */
+ private final int findBestMatch(final Read r, final AbstractKmerTable sets[], int[] hits, IntList idList){
+ if(r==null || r.bases==null || storedKmers<1){return 0;}
+ final byte[] bases=r.bases;
+ final int minlen=k-1; //Index of the last base of the first complete kmer
+ final int minlen2=(maskMiddle ? k/2 : k); //Minimum value of len before a lookup is attempted
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift); //Keeps the low 2*k bits, i.e. k bases at 2 bits each
+ final long kmask=lengthMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+
+ if(bases==null || bases.length<k){return -1;}
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;} //With forbidNs, len counts the bases since the last N
+ if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int[] ids=getValues(kmer, rkmer, kmask, i, k, qHammingDistance, sets);
+ if(ids!=null && ids[0]>-1){
+ for(int id : ids){
+ if(id==-1){break;} //-1 terminates the valid portion of ids
+ hits[id]++;
+ if(hits[id]==1){idList.add(id);} //First sighting of this ID
+ }
+ if(verbose){System.err.println("Found = "+(found+1)+"/"+minKmerHits);}
+ found++;
+// assert(false) : (matchMode==MATCH_FIRST)+", "+(matchMode==MATCH_UNIQUE)+", "+ (ids.length==1 || ids[1]==-1);
+ if(matchMode==MATCH_FIRST || (matchMode==MATCH_UNIQUE && (ids.length==1 || ids[1]==-1))){break;}
+ }
+ }
+ }
+ return found;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Input read stream */
+ private final ConcurrentReadInputStream cris;
+ /** Output read streams */
+ private final ConcurrentReadOutputStream rosm, rosu;
+ /** Output pattern read streams */
+ private final MultiCros mcros;
+
+ private final ReadStats readstats;
+ private final IntList countVector;
+ private final int[] countArray;
+
+ private final IntList idList1=new IntList(), idList2=new IntList();
+ private final IntList countList1=new IntList(), countList2=new IntList();
+ private final IntList finalList1=new IntList(), finalList2=new IntList();
+ private final IntList qhList=new IntList();
+
+ long[] scaffoldReadCountsT;
+ long[] scaffoldBaseCountsT;
+ long[] scaffoldFragCountsT;
+ final int[] singleton=new int[1];
+
+ private long readsInT=0;
+ private long fragsInT=0;
+ private long basesInT=0;
+ private long readsMatchedT=0;
+ private long basesMatchedT=0;
+ private long readsUnmatchedT=0;
+ private long basesUnmatchedT=0;
+
+ private long readsQTrimmedT=0;
+ private long basesQTrimmedT=0;
+ private long readsFTrimmedT=0;
+ private long basesFTrimmedT=0;
+ private long readsQFilteredT=0;
+ private long basesQFilteredT=0;
+
+ private long correctT=0;
+ private long incorrectT=0;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Object holding a String and numbers, for tracking the number of read and base hits per scaffold.
+ */
+ private static class StringNum implements Comparable<StringNum>{
+
+ public StringNum(String name_, int len_, long reads_, long bases_){
+ name=name_;
+ length=len_;
+ reads=reads_;
+ bases=bases_;
+ }
+ public final int compareTo(StringNum o){
+ if(bases!=o.bases){return o.bases>bases ? 1 : -1;}
+ if(reads!=o.reads){return o.reads>reads ? 1 : -1;}
+ return name.compareTo(o.name);
+ }
+ public final boolean equals(StringNum o){
+ return compareTo(o)==0;
+ }
+ public final String toString(){
+ return name+"\t"+length+"\t"+reads+"\t"+bases;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ public final String name;
+ public final int length;
+ public final long reads, bases;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Free memory as reported by Runtime.freeMemory() for the current JVM */
+ private static final long freeMemory(){
+     return Runtime.getRuntime().freeMemory();
+ }
+
+ /**
+ * Converts a kmer to the canonical key form used by the tables. Expected to be inlined.
+ * When rcomp is set the larger of kmer and rkmer is used, otherwise kmer itself;
+ * the result is ANDed with middleMask and ORed with lengthMask.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param lengthMask Bitmask with single '1' set to left of kmer
+ * @return Canonical value
+ */
+ private final long toValue(long kmer, long rkmer, long lengthMask){
+     assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+     final long chosen;
+     if(rcomp){
+         chosen=Tools.max(kmer, rkmer);
+     }else{
+         chosen=kmer;
+     }
+     final long masked=chosen&middleMask;
+     return masked|lengthMask;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Has this class encountered errors while processing? */
+ public boolean errorState=false;
+
+ /** Fraction of available memory preallocated to arrays */
+ private double preallocFraction=0.5;
+ /** Initial size of data structures */
+ private int initialSize=-1;
+ /** Default initial size of data structures */
+ private static final int initialSizeDefault=128000; //123
+
+ /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */
+ private final AbstractKmerTable[] keySets;
+ /** A scaffold's name is stored at scaffoldNames.get(id).
+ * scaffoldNames[0] is reserved, so the first id is 1. */
+ private final ArrayList<String> scaffoldNames=new ArrayList<String>();
+ /** Names of reference files (refNames[0] is valid). */
+ private final ArrayList<String> refNames=new ArrayList<String>();
+ /** Number of scaffolds per reference. */
+ private final int[] refScafCounts;
+ /** scaffoldReadCounts[id] stores the number of reads with kmer matches to that scaffold */
+ private AtomicLongArray scaffoldReadCounts;
+ /** scaffoldFragCounts[id] stores the number of fragments (reads or pairs) with kmer matches to that scaffold */
+ private AtomicLongArray scaffoldFragCounts;
+ /** scaffoldBaseCounts[id] stores the number of bases with kmer matches to that scaffold */
+ private AtomicLongArray scaffoldBaseCounts;
+ /** Set to false to force threads to share atomic counter arrays. */
+ private boolean ALLOW_LOCAL_ARRAYS=true;
+ /** scaffoldLengths[id] stores the length of that scaffold */
+ private IntList scaffoldLengths=new IntList();
+ /** scaffoldKmers[id] stores the number of kmers in that scaffold (excluding mutants) */
+ private IntList scaffoldKmers=new IntList();
+ /** scaffolds.get(id) stores the bases of that scaffold (presumably only populated when storeRefBases is true) */
+ private ArrayList<byte[]> scaffolds=new ArrayList<byte[]>();
+ /** Array of reference files from which to load kmers */
+ private String[] ref=null;
+ /** Array of literal strings from which to load kmers */
+ private String[] literal=null;
+ /** Taxonomic tree */
+ private TaxTree tree;
+
+ /** Input reads */
+ private String in1=null, in2=null;
+ /** Input qual files */
+ private String qfin1=null, qfin2=null;
+ /** Output reads (matched and at least minlen) */
+ private String outm1=null, outm2=null;
+ /** Output reads (unmatched or shorter than minlen) */
+ private String outu1=null, outu2=null;
+ /** Per-sequence or per-reference output pattern */
+ private String outpattern=null;
+ /** Statistics output files */
+ private String outstats=null, outrpkm=null, outrefstats=null, outtax=null;
+ /** NCBI file mapping gi numbers to taxa IDs (gi_taxid_nucl.dmp) */
+ private String giTableFile=null;
+ /** NCBI file of taxonomy names (names.dmp) */
+ private String taxNameFile=null;
+ /** NCBI file of taxonomic tree (nodes.dmp) */
+ private String taxNodeFile=null;
+ /** File containing a serialized TaxTree */
+ private String taxTreeFile;
+
+ /** Store reference sequences */
+ private boolean storeRefBases=false;
+ /** Only look for fully-contained reference sequences */
+ private boolean processContainedRef=false;
+
+ /** Dump kmers here. */
+ private String dump=null;
+
+ /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */
+ private long maxReads=-1;
+ /** Process this fraction of input reads. */
+ private float samplerate=1f;
+ /** Set samplerate seed to this value. */
+ private long sampleseed=-1;
+
+ /** Output reads in input order. May reduce speed. */
+ private final boolean ORDERED;
+ /** Make the middle base in a kmer a wildcard to improve sensitivity */
+ private boolean maskMiddle=true;
+
+ /** Store reference kmers with up to this many substitutions */
+ private int hammingDistance=0;
+ /** Search for query kmers with up to this many substitutions */
+ private int qHammingDistance=0;
+ /** Store reference kmers with up to this many edits (including indels) */
+ private int editDistance=0;
+ /** Always skip this many kmers between used kmers when hashing reference. */
+ private int refSkip=0;
+
+ private long taxNodeCountLimit=1;
+ private int taxNodeNumberLimit=-1;
+ private int taxNodeMinLevel=0;
+ private int taxNodeMaxLevel=TaxTree.stringToLevel("domain");
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Statistics ----------------*/
+ /*--------------------------------------------------------------*/
+
+ long readsIn=0;
+ long fragsIn=0;
+ long basesIn=0;
+ long readsMatched=0;
+ long basesMatched=0;
+ long readsUnmatched=0;
+ long basesUnmatched=0;
+
+ long readsQTrimmed=0;
+ long basesQTrimmed=0;
+ long readsFTrimmed=0;
+ long basesFTrimmed=0;
+ long readsQFiltered=0;
+ long basesQFiltered=0;
+
+ long refReads=0;
+ long refBases=0;
+ long refKmers=0;
+
+ long correctReads=0;
+ long incorrectReads=0;
+
+ long storedKmers=0;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Correct errors via read overlap */
+ private final boolean ecc;
+
+ /** Look for reverse-complements as well as forward kmers. Default: true */
+ private final boolean rcomp;
+ /** Don't allow a read 'N' to match a reference 'A'.
+ * Reduces sensitivity when hdist>0 or edist>0. Default: false. */
+ private final boolean forbidNs;
+ /** AND bitmask with 0's at the middle base */
+ private final long middleMask;
+ /** Data structure to use. Default: ARRAYH */
+ private final int tableType;
+
+ /** Normal kmer length */
+ private final int k;
+ /** k-1; used in some expressions */
+ private final int k2;
+ /** A read must share at least this many kmers to be considered a match. Default: 1 */
+ private final int minKmerHits;
+ /** A read must share at least this fraction of its kmers to be considered a match. Default: 0 */
+ private final float minKmerFraction;
+ /** Determines how to handle ambiguously-mapping reads */
+ private final int ambigMode;
+ /** Determines when to early-exit kmer matching */
+ private final int matchMode;
+ /** First and second must differ by more than this to be unambiguous. */
+ private final int clearzone;
+ /** Calculate accuracy rate by parsing headers of synthetic reads */
+ private final boolean parsecustom;
+
+ /** Quality-trim the left side */
+ private final boolean qtrimLeft;
+ /** Quality-trim the right side */
+ private final boolean qtrimRight;
+ /** Trim bases at this quality or below. Default: 4 */
+ private final byte trimq;
+ /** Throw away reads below this average quality before trimming. Default: 0 */
+ private final byte minAvgQuality;
+ /** If positive, calculate average quality from the first X bases only. Default: 0 */
+ private final int minAvgQualityBases;
+ /** Throw away reads failing chastity filter (:Y: in read header) */
+ private final boolean chastityFilter;
+ /** Throw away reads containing more than this many Ns. Default: -1 (disabled) */
+ private final int maxNs;
+ /** Throw away reads without at least this many consecutive called bases. */
+ private int minConsecutiveBases=0;
+ /** Throw away reads shorter than this after trimming. Default: 10 */
+ private final int minReadLength;
+ /** Throw away reads longer than this after trimming. Default: Integer.MAX_VALUE */
+ private final int maxReadLength;
+ /** Toss reads shorter than this fraction of initial length, after trimming */
+ private final float minLenFraction;
+ /** Filter reads by whether or not they have matching kmers */
+ private final boolean kfilter;
+ /** Trim left bases of the read to this position (exclusive, 0-based) */
+ private final int forceTrimLeft;
+ /** Trim right bases of the read after this position (exclusive, 0-based) */
+ private final int forceTrimRight;
+ /** Trim this many rightmost bases of the read */
+ private final int forceTrimRight2;
+ /** Trim right bases of the read modulo this value.
+ * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */
+ private final int forceTrimModulo;
+
+ /** If positive, only look for kmer matches in the leftmost X bases */
+ private int restrictLeft;
+ /** If positive, only look for kmer matches in the rightmost X bases */
+ private int restrictRight;
+
+ /** True iff java was launched with the '-ea' flag */
+ private final boolean EA;
+ /** Skip this many initial input reads */
+ private final long skipreads;
+
+ /** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad.
+ * Default: true. */
+ private final boolean removePairsIfEitherBad;
+
+ /** Print only statistics for scaffolds that matched at least one read
+ * Default: true. */
+ private final boolean printNonZeroOnly;
+
+ /** Rename reads to indicate what they matched.
+ * Default: false. */
+ private final boolean rename;
+ /** Use names of reference files instead of scaffolds.
+ * Default: false. */
+ private final boolean useRefNames;
+
+ /** Fraction of kmers to skip, 0 to 15 out of 16 */
+ private final int speed;
+
+ /** Skip this many kmers when examining the read. Default 1.
+ * 1 means every kmer is used, 2 means every other, etc. */
+ private final int qSkip;
+
+ /** True if speed and qSkip are disabled. */
+ private final boolean noAccel;
+
+ /** Pick a single scaffold per read pair, rather than per read */
+ private final boolean keepPairsTogether;
+
+ /** Store match IDs in an IntList rather than int array */
+ private final boolean USE_COUNTVECTOR;
+
+ /** Gather taxonomic information */
+ private final boolean USE_TAXTREE;
+
+ private final boolean MAKE_QUALITY_ACCURACY;
+ private final boolean MAKE_QUALITY_HISTOGRAM;
+ private final boolean MAKE_MATCH_HISTOGRAM;
+ private final boolean MAKE_BASE_HISTOGRAM;
+
+ private final boolean MAKE_EHIST;
+ private final boolean MAKE_INDELHIST;
+ private final boolean MAKE_LHIST;
+ private final boolean MAKE_GCHIST;
+ private final boolean MAKE_IDHIST;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Number of tables (and threads, during loading) */
+ private static final int WAYS=9; //123
+ /** Verbose messages */
+ public static final boolean verbose=false; //123
+
+ /** Print messages to this stream */
+ private static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=true;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Print speed statistics upon completion */
+ public static boolean showSpeed=true;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Number of ProcessThreads */
+ public static int THREADS=Shared.threads();
+ /** Indicates end of input stream */
+ private static final ArrayList<Read> POISON=new ArrayList<Read>(0);
+ /** Number of columns for statistics output, 3 or 5 */
+ public static int STATS_COLUMNS=5;
+ /** Release memory used by kmer storage after processing reads */
+ public static boolean RELEASE_TABLES=true;
+ /** Max value of hitCount array */
+ public static final int HITCOUNT_LEN=1000;
+ /** Make unambiguous copies of ref sequences with ambiguous bases */
+ public static boolean REPLICATE_AMBIGUOUS=false;
+ /** Write refstats in similar style to BBSplit */
+ public static boolean BBSPLIT_STYLE=false;
+
+ /** x&clearMasks[i] will clear base i */
+ private static final long[] clearMasks;
+ /** x|setMasks[j][i] will set base i to j, provided base i was first cleared with clearMasks[i] */
+ private static final long[][] setMasks;
+ /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
+ private static final long[] leftMasks;
+ /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
+ private static final long[] rightMasks;
+ /** x|lengthMasks[i] will set the bit to the left of the leftmost base of a kmer of length i */
+ private static final long[] lengthMasks;
+
+ private static final int qhdistSizeLimit=10;
+
+ public static HashMap<String,String> RQC_MAP=null;
+
+ public static final int AMBIG_ALL=1, AMBIG_FIRST=2, AMBIG_TOSS=3, AMBIG_RANDOM=4;
+ public static final int MATCH_ALL=1, MATCH_FIRST=2, MATCH_UNIQUE=3;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Initializers ----------------*/
+ /*--------------------------------------------------------------*/
+
+ static{
+ clearMasks=new long[32];
+ leftMasks=new long[32];
+ rightMasks=new long[32];
+ lengthMasks=new long[32];
+ setMasks=new long[4][32];
+ for(int i=0; i<32; i++){
+ clearMasks[i]=~(3L<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ leftMasks[i]=((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ rightMasks[i]=~((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ lengthMasks[i]=((1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ for(long j=0; j<4; j++){
+ setMasks[(int)j][i]=(j<<(2*i));
+ }
+ }
+ }
+
+}
diff --git a/current/jgi/Shred.java b/current/jgi/Shred.java
new file mode 100755
index 0000000..780f52e
--- /dev/null
+++ b/current/jgi/Shred.java
@@ -0,0 +1,391 @@
+package jgi;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.KillSwitch;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date June 20, 2014
+ *
+ */
+public class Shred {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Command-line entry point: starts a timer, builds a Shred from the arguments, and runs it.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+     final Timer timer=new Timer();
+     final Shred shredder=new Shred(args);
+     shredder.process(timer);
+ }
+
+ /**
+ * Constructor. Parses the command line, adjusts shared static settings,
+ * validates the shred geometry, and prepares the input and output file formats.
+ * @param args Command line arguments
+ * @throws RuntimeException if the shred length is not positive, the overlap is not smaller than
+ * the shred length, no input file was given, or the output file cannot be written
+ */
+ public Shred(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         printOptions();
+         System.exit(0);
+     }
+
+     outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+     FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+
+     Shared.READ_BUFFER_LENGTH=Tools.min(100, Shared.READ_BUFFER_LENGTH);
+     Shared.capBuffers(4);
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+     ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+     Parser parser=new Parser();
+     boolean even=false;
+     for(int i=0; i<args.length; i++){
+         String arg=args[i];
+         String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+         if(b==null || b.equalsIgnoreCase("null")){b=null;}
+         while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+         if(parser.parse(arg, a, b)){
+             //do nothing
+         }else if(a.equals("length") || a.equals("len") || a.equals("shredlen") || a.equals("shredlength")){
+             shredLength=(int)Tools.parseKMG(b);
+         }else if(a.equals("overlap")){
+             overlap=(int)Tools.parseKMG(b);
+         }else if(a.equals("verbose")){
+             verbose=Tools.parseBoolean(b);
+         }else if(a.equals("even") || a.equals("equal")){
+             even=Tools.parseBoolean(b);
+         }else if(a.equals("parse_flag_goes_here")){
+             //Set a variable here
+         }else{
+             outstream.println("Unknown parameter "+args[i]);
+             assert(false) : "Unknown parameter "+args[i];
+             // throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+     evenLengths=even;
+
+     {//Process parser fields
+         Parser.processQuality();
+
+         maxReads=parser.maxReads;
+
+         overwrite=ReadStats.overwrite=parser.overwrite;
+         append=ReadStats.append=parser.append;
+
+         in1=parser.in1;
+
+         out1=parser.out1;
+
+         extin=parser.extin;
+         extout=parser.extout;
+
+         minLength=parser.minReadLength;
+     }
+
+     //Validate explicitly: asserts only run with -ea, and a non-positive increment would make
+     //processUnevenly either never advance or index before the start of the read.
+     if(shredLength<1){
+         throw new RuntimeException("Error - shred length must be positive; length="+shredLength);
+     }
+     if(shredLength<=overlap){
+         throw new RuntimeException("Error - overlap must be less than shred length; length="+shredLength+", overlap="+overlap);
+     }
+
+     minLength=Tools.mid(1, minLength, shredLength);
+     assert(shredLength>0);
+     assert(shredLength>overlap);
+     increment=shredLength-overlap;
+     incMult=1.0/increment;
+     assert(increment>0);
+
+     assert(FastaReadInputStream.settingsOK());
+
+     if(in1==null){
+         printOptions();
+         throw new RuntimeException("Error - at least one input file is required.");
+     }
+     if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+         ByteFile.FORCE_MODE_BF2=true;
+     }
+
+     if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+     if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+         outstream.println((out1==null)+", "+out1);
+         throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
+     }
+
+     ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+     ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /**
+ * Handles the flags that set the read limit.
+ * @param arg Full argument (unused here)
+ * @param a Flag name
+ * @param b Flag value, parsed with Tools.parseKMG
+ * @return true if the flag was recognized and maxReads was set
+ */
+ public boolean parseArgument(String arg, String a, String b){
+     final boolean recognized=(a.equals("reads") || a.equals("maxreads") || a.equals("some_argument"));
+     if(recognized){
+         maxReads=Tools.parseKMG(b);
+     }
+     return recognized;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Opens the input and (if requested) output streams, shreds every read, closes the streams,
+ * and prints timing and throughput statistics to outstream.
+ * @param t Timer started by the caller; stopped here once processing completes
+ * @throws RuntimeException if an error state was recorded while writing or closing streams
+ */
+ void process(Timer t){
+
+     final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
+     cris.start();
+     if(verbose){outstream.println("Started cris");}
+
+     final boolean paired=cris.paired();
+     if(!ffin1.samOrBam()){
+         final String mode=(paired ? "paired" : "unpaired");
+         outstream.println("Input is being processed as "+mode);
+     }
+
+     final ConcurrentReadOutputStream ros;
+     if(out1==null){
+         ros=null;
+     }else{
+         final int buff=2;
+
+         if(cris.paired()){KillSwitch.kill("This program does not support paired reads.");}
+
+         ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
+         ros.start();
+     }
+
+     readsProcessed=0;
+     basesProcessed=0;
+
+     //Process the read stream
+     processInner(cris, ros);
+
+     ReadWrite.closeStreams(cris, ros);
+     if(verbose){outstream.println("Finished.");}
+
+     errorState|=ReadStats.writeAll();
+     errorState|=ReadWrite.closeStreams(cris, ros);
+
+     t.stop();
+
+     //Rates are per nanosecond of elapsed time and rescaled when printed
+     final double elapsed=(double)(t.elapsed);
+     final double rpnano=readsProcessed/elapsed;
+     final double bpnano=basesProcessed/elapsed;
+
+     outstream.println("Time: \t"+t);
+
+     outstream.println("Reads Processed: "+padCount(readsProcessed)+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+     outstream.println("Bases Processed: "+padCount(basesProcessed)+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+     outstream.println("Reads Out: "+padCount(readsOut));
+     outstream.println("Bases Out: "+padCount(basesOut));
+
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+ * Formats a count for the summary lines: the plain number below 100000, thousands with a 'k' suffix
+ * below 100000000, otherwise millions with an 'm' suffix; left-padded with spaces to 8 characters.
+ */
+ private static String padCount(final long count){
+     final String text;
+     if(count<100000){
+         text=""+count;
+     }else if(count<100000000){
+         text=(count/1000)+"k";
+     }else{
+         text=(count/1000000)+"m";
+     }
+     final StringBuilder sb=new StringBuilder();
+     for(int width=text.length(); width<8; width++){sb.append(' ');}
+     return sb.append(text).toString();
+ }
+
+ /**
+ * Iterates through the reads, shreds each one, and sends the shreds to the output stream.
+ * Resets the processed/output counters before starting.
+ * Each read goes through processRead, so reads shorter than minLength are discarded and reads
+ * no longer than shredLength are passed through unmodified instead of reaching the chunking code.
+ * @param cris Input read stream
+ * @param ros Output read stream; may be null, in which case shreds are only counted
+ */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+
+     readsProcessed=0;
+     basesProcessed=0;
+
+     readsOut=0;
+     basesOut=0;
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     if(reads!=null && !reads.isEmpty()){
+         Read r=reads.get(0);
+         assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+     }
+
+     while(reads!=null && reads.size()>0){
+         if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+         ArrayList<Read> listOut=new ArrayList<Read>();
+         for(int idx=0; idx<reads.size(); idx++){
+             final Read r1=reads.get(idx);
+
+             final int initialLength1=r1.length();
+
+             readsProcessed++;
+             basesProcessed+=initialLength1;
+
+             //Dispatch through processRead rather than calling the chunkers directly;
+             //processEvenly asserts chunks>0, which fails for reads no longer than the overlap.
+             processRead(r1, listOut);
+         }
+
+         if(ros!=null){ros.add(listOut, ln.id);}
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         if(verbose){outstream.println("Returned a list.");}
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ void processRead(final Read r1, final ArrayList<Read> list){
+ if(r1.length()<minLength){return;}
+ if(r1.length()<=shredLength){
+ r1.numericID=readsOut;
+ list.add(r1);
+ readsOut++;
+ basesOut+=r1.length();
+ return;
+ }
+ if(evenLengths){
+ processEvenly(r1, list);
+ }else{
+ processUnevenly(r1, list);
+ }
+ }
+
+ /**
+ * Splits a read into chunks of roughly equal length, each at most shredLength bases,
+ * naming each shred name_start-end with inclusive 0-based coordinates.
+ * Stops at the first chunk shorter than minLength.
+ * @param r1 Read to shred; a read with null bases produces nothing
+ * @param list Receives the shreds
+ */
+ void processEvenly(final Read r1, final ArrayList<Read> list){
+     final byte[] bases=r1.bases;
+     final byte[] quals=r1.quality;
+     final String name=r1.id;
+     if(bases==null){return;}
+
+     //Use at least one chunk so a read no longer than the overlap is emitted whole
+     //instead of tripping an assertion or silently vanishing.
+     final int chunks=Math.max(1, (int)Math.ceil((bases.length-overlap)*incMult));
+     //NOTE(review): the start spacing uses the full length rather than (length-overlap);
+     //with overlap>0 this looks like it yields redundant trailing chunks - confirm the intended tiling.
+     double inc2=bases.length/(double)chunks;
+
+     for(int chunk=0; chunk<chunks; chunk++){
+         int a=(int)Math.floor(inc2*chunk);
+         int b=(chunk==chunks-1 ? bases.length : overlap+(int)Math.floor(inc2*(chunk+1)));
+         b=Tools.min(b, a+shredLength);
+         //Clamp to the read end; Arrays.copyOfRange zero-pads when 'to' exceeds the array length
+         b=Math.min(b, bases.length);
+         final int length=b-a;
+         if(length<minLength){return;}
+         final byte[] bases2=Arrays.copyOfRange(bases, a, b);
+         final byte[] quals2=(quals==null ? null : Arrays.copyOfRange(quals, a, b));
+         Read shred=new Read(bases2, quals2, readsOut, name+"_"+a+"-"+(b-1));
+         readsOut++;
+         basesOut+=shred.length();
+         list.add(shred);
+     }
+ }
+
+ /**
+ * Splits a read into shreds of shredLength bases whose start positions advance by increment;
+ * the final shred may be shorter. Each shred is named name_start-end with inclusive 0-based coordinates.
+ * Stops at the first shred shorter than minLength or once the end of the read has been reached.
+ * @param r1 Read to shred
+ * @param list Receives the shreds
+ */
+ void processUnevenly(final Read r1, final ArrayList<Read> list){
+     final byte[] bases=r1.bases;
+     final byte[] quals=r1.quality;
+     final String name=r1.id;
+     final int total=bases.length;
+
+     int start=0;
+     while(start<total){
+         final int stop=Tools.min(start+shredLength, total);
+         if(stop-start<minLength){return;}
+
+         final byte[] bases2=Arrays.copyOfRange(bases, start, stop);
+         final byte[] quals2=(quals==null ? null : Arrays.copyOfRange(quals, start, stop));
+         final Read shred=new Read(bases2, quals2, readsOut, name+"_"+start+"-"+(stop-1));
+         readsOut++;
+         basesOut+=shred.length();
+         list.add(shred);
+
+         if(stop==total){return;}
+         assert(stop<total);
+         start+=increment;
+     }
+ }
+
+ /**
+ * Prints a short usage summary to outstream.
+ * Called by the constructor when help is requested and when no input file was given;
+ * it must return normally so the caller can exit or raise its own, more specific error.
+ */
+ private void printOptions(){
+     outstream.println("Usage: in=<file> out=<file> length=<number> overlap=<number>");
+     outstream.println();
+     outstream.println("length=<number>   Shred length (also len, shredlen, shredlength). Default: "+shredLength);
+     outstream.println("overlap=<number>  Bases shared by consecutive shreds; must be less than length. Default: "+overlap);
+     outstream.println("even=<t/f>        Produce shreds of roughly equal length (also equal). Default: f");
+     outstream.println("verbose=<t/f>     Print progress messages. Default: "+verbose);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String out1=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ protected long readsProcessed=0;
+ protected long basesProcessed=0;
+ protected long readsOut=0;
+ protected long basesOut=0;
+
+ private long maxReads=-1;
+
+ private int shredLength=500;
+ private int minLength=1;
+ private int overlap=0;
+ private final int increment;
+ private final double incMult;
+
+ private final boolean evenLengths;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffout1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+}
diff --git a/current/jgi/Shuffle.java b/current/jgi/Shuffle.java
new file mode 100755
index 0000000..8dc479e
--- /dev/null
+++ b/current/jgi/Shuffle.java
@@ -0,0 +1,494 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ByteStreamWriter;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadComparatorID;
+import align2.ReadComparatorMapping;
+import align2.ReadComparatorName;
+import align2.ReadComparatorTopological;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+
+/**
+ * Randomizes the order of reads.
+ * @author Brian Bushnell
+ * @date Oct 27, 2014
+ *
+ */
+public class Shuffle {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Command-line entry point: creates a Timer, builds a Shuffle from the arguments, and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+ 	final Timer timer=new Timer();
+ 	final Shuffle shuffler=new Shuffle(args);
+ 	shuffler.process(timer);
+ }
+ /**
+  * Builds a Shuffle instance from command-line arguments.
+  * Every argument is first offered to a Parser instance; the keys handled here are
+  * verbose, shuffle, name, coordinate, sequence, id, mode=..., and showspeed/ss.
+  * A first argument without '=' that starts with "stdin" or names an existing file is used as in1.
+  * After parsing, a '#' in in1/out1 is expanded into a 1/2 file pair, the FASTQ interleaving
+  * flags are adjusted, the output paths are tested, and the FileFormat objects are created.
+  * This constructor also changes static settings in Shared, ReadWrite, ByteFile, FASTQ and ReadStats.
+  * @param args Command line arguments
+  * @throws RuntimeException If mode= has a missing or unrecognized value, no input file is given,
+  * out2 is given without out1, or the output files fail Tools.testOutputFiles.
+  */
+ public Shuffle(String[] args){
+ //Pre-process the arguments with Parser.parseConfig; if Parser.parseHelp returns true, call printOptions() and exit.
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} //NOTE(review): outstream is initialized to System.err, so this assignment currently changes nothing
+ if(printClass){outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");}
+
+ boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+ //Static, process-wide settings are adjusted here, before any argument is parsed.
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ int mode_=SHUFFLE; //Used when no mode is given on the command line
+
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("="); //Only split[0] and split[1] are used, so text after a second '=' is dropped
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(a.equals("shuffle")){ //Bare mode flags; the value b is ignored for these five keys
+ mode_=SHUFFLE;
+ }else if(a.equals("name")){
+ mode_=SORT_NAME;
+ }else if(a.equals("coordinate")){
+ mode_=SORT_COORD;
+ }else if(a.equals("sequence")){
+ mode_=SORT_SEQ;
+ }else if(a.equals("id")){
+ mode_=SORT_ID;
+ }else if(a.equals("mode")){ //mode=<name>; the value is compared case-sensitively
+ if(b==null){
+ throw new RuntimeException("mode must be shuffle, name, coordinate, sequence, or id.");
+ }else if(b.equals("shuffle")){
+ mode_=SHUFFLE;
+ }else if(b.equals("name")){
+ mode_=SORT_NAME;
+ }else if(b.equals("coordinate")){
+ mode_=SORT_COORD;
+ }else if(b.equals("sequence")){
+ mode_=SORT_SEQ;
+ }else if(b.equals("id")){
+ mode_=SORT_ID;
+ }else{
+ throw new RuntimeException("mode must be shuffle, name, coordinate, sequence, or id.");
+ }
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ //A bare first argument is accepted as the input file
+ parser.in1=arg;
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Fatal only when assertions are enabled; otherwise the parameter is just reported
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+ //Publish the parsed mode to the final field; the range check below is an assertion only.
+ mode=mode_;
+ assert(mode>=1 && mode<=5) : "mode must be shuffle, name, coordinate, sequence, or id.";
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+ //A '#' in in1 is expanded into a 1/2 file pair unless a file with that literal name exists; out1 is expanded unconditionally.
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ //With more than 2 threads and no mode forced, select BF2
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+ //If interleaving was not set explicitly, derive it from the number of input and output files.
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+ //An output path equal to the literal string "null" (any case) means no output.
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+ if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+ //Create the FileFormat objects; FASTQ is passed as the format argument (presumably the default format - TODO confirm).
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Loads every read into memory, reorders the reads according to the selected mode, and writes them out.
+  * Output is written only when a primary output format exists; a speed report is printed when showSpeed is set.
+  * @param t Timer that is stopped once output has finished and is used for the speed report
+  * @throws RuntimeException If an error state was recorded while closing the input or writing output
+  */
+ void process(Timer t){
+
+ 	final ArrayList<Read> bigList=new ArrayList<Read>(65530);
+ 	final ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ 	if(verbose){outstream.println("Started cris");}
+ 	cris.start(); //4567
+
+ 	final boolean paired=cris.paired();
+ 	if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ 	long readsProcessed=0;
+ 	long basesProcessed=0;
+
+ 	{
+ 		//Fetch list after list from the input stream and keep every read in bigList.
+ 		ListNum<Read> ln=cris.nextList();
+ 		ArrayList<Read> reads=(ln==null ? null : ln.list);
+
+ 		if(reads!=null && !reads.isEmpty()){
+ 			final Read first=reads.get(0);
+ 			assert((ffin1==null || ffin1.samOrBam()) || (first.mate!=null)==cris.paired());
+ 		}
+
+ 		while(reads!=null && reads.size()>0){
+
+ 			for(final Read r1 : reads){
+ 				final int length1=r1.length();
+ 				final int length2=r1.mateLength();
+
+ 				readsProcessed++;
+ 				basesProcessed+=length1;
+ 				if(r1.mate!=null){
+ 					readsProcessed++;
+ 					basesProcessed+=length2;
+ 				}
+ 				bigList.add(r1); //Only read 1 is stored; its mate stays reachable through r1.mate
+ 			}
+
+ 			cris.returnList(ln.id, ln.list.isEmpty());
+ 			ln=cris.nextList();
+ 			reads=(ln==null ? null : ln.list);
+ 		}
+ 		if(ln!=null){
+ 			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ 		}
+ 	}
+
+ 	errorState|=ReadWrite.closeStream(cris);
+ 	errorState|=ReadStats.writeAll();
+
+ 	//Reorder the in-memory list.
+ 	switch(mode){
+ 		case SHUFFLE:
+ 			Collections.shuffle(bigList);
+ 			break;
+ 		case SORT_NAME:
+ 			Collections.sort(bigList, ReadComparatorName.comparator);
+ 			break;
+ 		case SORT_SEQ:
+ 			Collections.sort(bigList, new ReadComparatorTopological());
+ 			break;
+ 		case SORT_COORD:
+ 			Collections.sort(bigList, new ReadComparatorMapping());
+ 			break;
+ 		case SORT_ID:
+ 			Collections.sort(bigList, ReadComparatorID.comparator);
+ 			break;
+ 		default:
+ 			assert(false) : "No mode set.";
+ 	}
+
+ 	if(ffout1!=null){
+ 		final ByteStreamWriter bsw1=new ByteStreamWriter(ffout1);
+ 		bsw1.start();
+ 		final ByteStreamWriter bsw2;
+ 		if(ffout2==null){
+ 			bsw2=null;
+ 		}else{
+ 			bsw2=new ByteStreamWriter(ffout2);
+ 			bsw2.start();
+ 		}
+ 		final boolean singleWriter=(bsw2==null);
+ 		final int total=bigList.size();
+ 		for(int i=0; i<total; i++){
+ 			final Read r1=bigList.set(i, null); //The slot is nulled while fetching (presumably to release memory as reads are written)
+ 			final Read r2=r1.mate;
+ 			bsw1.println(r1, singleWriter);
+ 			if(!singleWriter && r2!=null){bsw2.println(r2);}
+ 		}
+ 		errorState|=bsw1.poisonAndWait();
+ 		if(bsw2!=null){errorState|=bsw2.poisonAndWait();}
+ 	}
+
+ 	t.stop();
+
+ 	if(showSpeed){
+ 		final double elapsed=(double)(t.elapsed);
+ 		final double rpnano=readsProcessed/elapsed;
+ 		final double bpnano=basesProcessed/elapsed;
+
+ 		String rpstring;
+ 		if(readsProcessed<100000){
+ 			rpstring=""+readsProcessed;
+ 		}else if(readsProcessed<100000000){
+ 			rpstring=(readsProcessed/1000)+"k";
+ 		}else{
+ 			rpstring=(readsProcessed/1000000)+"m";
+ 		}
+
+ 		String bpstring;
+ 		if(basesProcessed<100000){
+ 			bpstring=""+basesProcessed;
+ 		}else if(basesProcessed<100000000){
+ 			bpstring=(basesProcessed/1000)+"k";
+ 		}else{
+ 			bpstring=(basesProcessed/1000000)+"m";
+ 		}
+
+ 		//Left-pad both numbers to 8 characters.
+ 		while(rpstring.length()<8){rpstring=" "+rpstring;}
+ 		while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ 		outstream.println("Time: \t"+t);
+ 		outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ 		outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ 	}
+
+ 	if(errorState){
+ 		throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ 	}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Intended to print usage information; not implemented yet.
+  * With assertions enabled this fails with "printOptions: TODO"; with assertions disabled it does nothing.
+  * The commented-out statements are a draft of the help text.
+  */
+ private void printOptions(){
+ 	assert false : "printOptions: TODO";
+// outstream.println("Syntax:\n");
+// outstream.println("java -ea -Xmx512m -cp <path> jgi.ReformatReads in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+// outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+// outstream.println("Other parameters and their defaults:\n");
+// outstream.println("overwrite=false \tOverwrites files that already exist");
+// outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+// outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+// outstream.println("fastawrap=70 \tLength of lines in fasta output");
+// outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+// outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Runs one Shuffle job on its own thread by assembling an argument list and calling Shuffle.main.
+  * The number of concurrently running ShuffleThreads is limited through addThread().
+  */
+ static class ShuffleThread extends Thread{
+
+ 	/**
+ 	 * @param in1_ Primary input file, or null
+ 	 * @param in2_ Secondary input file, or null
+ 	 * @param out1_ Primary output file, or null
+ 	 * @param out2_ Secondary output file, or null
+ 	 * @param mode_ One of SHUFFLE, SORT_NAME, SORT_SEQ, SORT_COORD, SORT_ID (values 1 to 5)
+ 	 * @param ow_ Value passed on as the ow= argument
+ 	 */
+ 	ShuffleThread(String in1_, String in2_, String out1_, String out2_, int mode_, boolean ow_){
+ 		in1=in1_;
+ 		in2=in2_;
+ 		out1=out1_;
+ 		out2=out2_;
+ 		mode=mode_;
+ 		ow=ow_;
+ 	}
+
+ 	/** Reserves a slot with addThread(1), which blocks while the thread limit is reached, then starts the thread. */
+ 	@Override
+ 	public void start(){
+ 		addThread(1);
+ 		super.start();
+ 	}
+
+ 	/**
+ 	 * Builds the argument list and runs Shuffle.main with it.
+ 	 * Any Throwable is reported on System.err instead of propagating, and the slot
+ 	 * reserved in start() is always released.
+ 	 */
+ 	@Override
+ 	public void run(){
+ 		try{
+ 			final ArrayList<String> list=new ArrayList<String>();
+ 			if(in1!=null){list.add("in1="+in1);}
+ 			if(in2!=null){list.add("in2="+in2);} //The second input must use the in2 key; it used to be emitted under in1
+ 			if(out1!=null){list.add("out1="+out1);}
+ 			if(out2!=null){list.add("out2="+out2);}
+ 			list.add("mode="+MODES[mode-1]); //Mode constants start at 1 while MODES is indexed from 0
+ 			list.add("ow="+ow);
+ 			Shuffle.main(list.toArray(new String[0]));
+ 		}catch(Throwable e){
+ 			System.err.println("Failed to shuffle "+in1+"\nException:"+e+"\n");
+ 		}finally{
+ 			addThread(-1); //Release the slot even if argument assembly or the job itself failed
+ 		}
+ 	}
+
+ 	final String in1, in2;
+ 	final String out1, out2;
+ 	final int mode;
+ 	final boolean ow;
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ private String out1=null;
+ private String out2=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ private String extin=null;
+ private String extout=null;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ private final int mode;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffin2;
+
+ private final FileFormat ffout1;
+ private final FileFormat ffout2;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private static int maxShuffleThreads=1;
+ private static int currentShuffleThreads=0;
+
+ /**
+  * Sets the maximum number of ShuffleThreads that may run at the same time.
+  * @param x New limit; must be positive (enforced by assertion only)
+  */
+ public static void setMaxThreads(final int x){
+ 	assert(x>0);
+ 	synchronized(SHUFFLE_LOCK){maxShuffleThreads=x;}
+ }
+
+ /**
+  * Adjusts the count of active ShuffleThreads by x while holding SHUFFLE_LOCK.
+  * For a positive x the caller first waits, in 2000 ms intervals, until the count is below the limit.
+  * One waiter is notified if the count is below the limit after the adjustment.
+  * @param x Amount to add to the active thread count; may be negative
+  * @return The active thread count after the adjustment
+  */
+ public static int addThread(final int x){
+ 	synchronized(SHUFFLE_LOCK){
+ 		if(x>0){
+ 			while(currentShuffleThreads>=maxShuffleThreads){
+ 				try{
+ 					SHUFFLE_LOCK.wait(2000);
+ 				}catch(InterruptedException e){
+ 					e.printStackTrace();
+ 				}
+ 			}
+ 		}
+ 		currentShuffleThreads+=x;
+ 		final boolean slotFree=(currentShuffleThreads<maxShuffleThreads);
+ 		if(slotFree){SHUFFLE_LOCK.notify();}
+ 		return currentShuffleThreads;
+ 	}
+ }
+
+ /**
+  * Blocks, in 2000 ms intervals, while the active ShuffleThread count is at or above the limit.
+  * NOTE(review): this returns as soon as the count drops below the limit, not when it reaches zero;
+  * confirm that callers do not expect all threads to have finished.
+  */
+ public static void waitForFinish(){
+ 	synchronized(SHUFFLE_LOCK){
+ 		for(;;){
+ 			if(currentShuffleThreads<maxShuffleThreads){return;}
+ 			try{
+ 				SHUFFLE_LOCK.wait(2000);
+ 			}catch(InterruptedException e){
+ 				e.printStackTrace();
+ 			}
+ 		}
+ 	}
+ }
+
+ private static String SHUFFLE_LOCK=new String("SHUFFLE_LOCK");
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+ public static boolean showSpeed=true;
+ public static boolean printClass=true;
+
+ public static final int SHUFFLE=1, SORT_NAME=2, SORT_SEQ=3, SORT_COORD=4, SORT_ID=5;
+ public static final String[] MODES={"shuffle", "name", "sequence", "coordinate", "id"};
+
+
+}
diff --git a/current/jgi/SmallKmerFrequency.java b/current/jgi/SmallKmerFrequency.java
new file mode 100755
index 0000000..2a8d67b
--- /dev/null
+++ b/current/jgi/SmallKmerFrequency.java
@@ -0,0 +1,216 @@
+package jgi;
+
+import java.util.Arrays;
+import java.util.Comparator;
+
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Timer;
+import fileIO.FileFormat;
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Feb 19, 2015
+ *
+ */
+public class SmallKmerFrequency extends BBTool_ST {
+
+ /**
+  * Code entrance from the command line.
+  * Sets FileFormat.PRINT_WARNING to false, then constructs a SmallKmerFrequency and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+ 	final Timer timer=new Timer();
+ 	FileFormat.PRINT_WARNING=false;
+ 	final SmallKmerFrequency tool=new SmallKmerFrequency(args);
+ 	tool.process(timer);
+ }
+
+ /** Assigns the default values of the tool-specific parameters: k=2, display=3, addNumbers=false. */
+ void setDefaults(){
+ 	addNumbers=false;
+ 	display=3;
+ 	k=2;
+ }
+
+ /**
+ * @param args
+ */
+ public SmallKmerFrequency(String[] args) {
+ super(args);
+ reparse(args);
+
+ kmerIndex=makeKmerIndex(k);
+ maxKmer=Tools.max(kmerIndex);
+ counts=new int[maxKmer+1];
+ display=Tools.min(display, counts.length);
+ if(out1!=null){
+ ffout1=FileFormat.testOutput(out1, FileFormat.ATTACHMENT, ".info", true, overwrite, append, true);
+ }
+ kmers=new Kmer[counts.length];
+ for(int i=0; i<kmerIndex.length; i++){
+ int index=kmerIndex[i];
+ if(kmers[index]==null){
+ kmers[index]=new Kmer();
+ kmers[index].s=AminoAcid.kmerToString(i, k);
+ kmers[index].num=i;
+ }
+ }
+// System.err.println(Arrays.toString(kmers));
+ }
+
+ /**
+  * Handles the arguments that are specific to this tool.
+  * Recognized keys: k, display, and addnumbers (aliases number, count, numbers, counts).
+  * @param arg Full argument; not used here
+  * @param a Key (presumably already lowercased by the caller - TODO confirm)
+  * @param b Value
+  * @return true if the key was recognized
+  * @see jgi.BBTool_ST#parseArgument(java.lang.String, java.lang.String, java.lang.String)
+  */
+ @Override
+ public boolean parseArgument(String arg, String a, String b) {
+ 	if(a.equals("k")){
+ 		k=Integer.parseInt(b);
+ 		return true;
+ 	}
+ 	if(a.equals("display")){
+ 		display=Integer.parseInt(b);
+ 		return true;
+ 	}
+ 	final boolean numberKey=a.equals("addnumbers") || a.equals("number") || a.equals("count") || a.equals("numbers") || a.equals("counts");
+ 	if(numberKey){
+ 		addNumbers=Tools.parseBoolean(b);
+ 	}
+ 	return numberKey;
+ }
+
+ /**
+  * Annotates each read of a pair with its most frequent kmers.
+  * For every non-null read, a line of the form id, tab, kmer[=count], ... is stored in the read's obj field.
+  * At most 'display' kmers are listed, most frequent first, and kmers that do not occur are omitted
+  * for both reads (read 2 previously listed zero-count kmers, unlike read 1).
+  * @param r1 Read 1; may be null
+  * @param r2 Read 2; may be null
+  * @return Always true
+  */
+ @Override
+ boolean processReadPair(Read r1, Read r2) {
+ 	if(r1!=null){storeTopKmers(r1);}
+ 	if(r2!=null){storeTopKmers(r2);}
+ 	return true;
+ }
+
+ /** Builds the kmer summary line of a single read and stores it in r.obj. */
+ private void storeTopKmers(final Read r){
+ 	makeKmerProfile(r.bases, counts, true);
+ 	sb.setLength(0);
+ 	sb.append(r.id);
+ 	//Sorting by num restores the order in which kmers[i] corresponds to counts[i].
+ 	Arrays.sort(kmers, numComparator);
+ 	for(int i=0; i<counts.length; i++){
+ 		kmers[i].count=counts[i];
+ 	}
+ 	Arrays.sort(kmers, countComparator);
+ 	//Stop at the first zero count so that absent kmers are never reported.
+ 	for(int i=0; i<display && kmers[i].count>0; i++){
+ 		sb.append('\t');
+ 		sb.append(kmers[i].s);
+ 		if(addNumbers){sb.append('=').append(kmers[i].count);}
+ 	}
+// sb.append('\n');
+ 	r.obj=sb.toString();
+ 	sb.setLength(0);
+ }
+
+ /**
+  * Makes a kmer (e.g., tetramer) profile of a sequence.
+  * A kmer and its reverse complement are counted in the same slot, looked up through kmerIndex.
+  * A base for which AminoAcid.baseToNumber is negative resets the kmer being built.
+  * @param bases Sequence to profile; null is treated as an empty sequence
+  * @param array_ Array to fill, or null to allocate a new one of length maxKmer+1
+  * @param clear Whether to zero the array before counting
+  * @return The array holding the counts
+  */
+ private final int[] makeKmerProfile(byte[] bases, int[] array_, boolean clear){
+ 	final int[] array=(array_==null ? new int[maxKmer+1] : array_);
+ 	if(clear){Arrays.fill(array, 0);} //TODO: Can be cleared faster using an IntList.
+ 	if(bases==null){return array;}
+
+ 	final int mask=~((-1)<<(2*k)); //Keeps the lowest 2*k bits, i.e. the last k bases
+
+ 	int len=0;
+ 	int kmer=0;
+ 	for(final byte b : bases){
+ 		final int x=AminoAcid.baseToNumber[b];
+ 		if(x<0){
+ 			len=0;
+ 			kmer=0;
+ 			continue;
+ 		}
+ 		kmer=((kmer<<2)|x)&mask;
+ 		len++;
+ 		if(len>=k){
+ 			final int rkmer=AminoAcid.reverseComplementBinaryFast(kmer, k);
+ 			array[kmerIndex[Tools.min(kmer, rkmer)]]++;
+ 		}
+ 	}
+ 	return array;
+ }
+
+ /** Startup hook; this tool does nothing here. */
+ @Override
+ void startupSubclass(){
+ 	//Intentionally empty
+ }
+
+ /** Shutdown hook; this tool does nothing here. */
+ @Override
+ void shutdownSubclass(){
+ 	//Intentionally empty
+ }
+
+ /** Statistics hook; this tool prints no additional statistics. */
+ @Override
+ void showStatsSubclass(Timer t, long readsIn, long basesIn){
+ 	//Intentionally empty
+ }
+
+ private class Kmer{
+
+ String s;
+ int count=0;
+ int num;
+
+ public String toString(){return "("+s+","+num+","+count+")";}
+
+ }
+
+ /** Orders Kmers by ascending numeric code (num). */
+ private static class NumComparator implements Comparator<Kmer>{
+
+ 	@Override
+ 	public int compare(Kmer a, Kmer b) {
+ 		final int difference=a.num-b.num;
+ 		return difference;
+ 	}
+
+ }
+
+ /** Orders Kmers by descending count. */
+ private static class CountComparator implements Comparator<Kmer>{
+
+ 	@Override
+ 	public int compare(Kmer a, Kmer b) {
+ 		final int difference=b.count-a.count;
+ 		return difference;
+ 	}
+
+ }
+
+ /**
+  * Builds a table that maps every kmer code of length n to a slot number.
+  * A kmer and its reverse complement receive the same slot; slots are numbered from 0
+  * in increasing order of the kmer codes that Tools.min selects over their reverse complement.
+  * @param n Kmer length
+  * @return Array with one entry per kmer code, holding the slot numbers
+  */
+ public static final int[] makeKmerIndex(final int n){
+ 	final int size=1<<(2*n);
+ 	final int[] table=new int[size];
+
+ 	int nextSlot=0;
+ 	for(int kmer=0; kmer<size; kmer++){
+ 		final int rkmer=AminoAcid.reverseComplementBinaryFast(kmer, n);
+ 		if(Tools.min(kmer, rkmer)==kmer){
+ 			table[rkmer]=nextSlot;
+ 			table[kmer]=nextSlot;
+ 			nextSlot++;
+ 		}
+ 	}
+// assert(false) : Arrays.toString(array);
+ 	return table;
+ }
+
+ private static final NumComparator numComparator=new NumComparator();
+ private static final CountComparator countComparator=new CountComparator();
+
+ private int k;
+ private int display;
+ private boolean addNumbers;
+ private final int maxKmer;
+ private final int[] kmerIndex;
+ private final int[] counts;
+ private final StringBuilder sb=new StringBuilder();
+
+ private final Kmer[] kmers;
+
+}
diff --git a/current/jgi/SplitNexteraLMP.java b/current/jgi/SplitNexteraLMP.java
new file mode 100755
index 0000000..4ffe279
--- /dev/null
+++ b/current/jgi/SplitNexteraLMP.java
@@ -0,0 +1,670 @@
+package jgi;
+
+import java.util.ArrayList;
+
+import kmer.AbstractKmerTable;
+import kmer.TableLoaderLockFree;
+import kmer.TableReader;
+
+import align2.ListNum;
+import align2.Tools;
+import dna.Timer;
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 2, 2015
+ *
+ */
+public class SplitNexteraLMP extends BBTool_ST {
+
+ /**
+  * Command-line entry point: creates a Timer, builds a SplitNexteraLMP from the arguments, and runs it.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+ 	final Timer timer=new Timer();
+ 	final SplitNexteraLMP tool=new SplitNexteraLMP(args);
+ 	tool.process(timer);
+ }
+
+ /** Assigns the default values of the tool-specific settings; the assignments are independent of each other. */
+ void setDefaults(){
+ 	//Input and output
+ 	pairedInput=true;
+ 	outStats="stderr";
+ 	RENAME=true;
+
+ 	//Junction handling
+ 	symbol='J';
+ 	minReadLength=40;
+ 	useInnerLMP=false;
+
+ 	//Masking and merging are off by default
+ 	mask=false;
+ 	merge=false;
+ 	testmerge=0;
+ }
+
+ /**
+  * Parses the arguments and finishes configuration.
+  * Allocates the kmer tables when masking is enabled, expands a '#' in the outfrag/outunknown
+  * paths into a 1/2 file pair, and, when testmerge is positive, samples the input to decide
+  * whether merging stays enabled.
+  * @param args Command line arguments
+  */
+ public SplitNexteraLMP(String[] args) {
+ 	super(args);
+ 	reparse(args);
+
+ 	tables=(mask ? TableLoaderLockFree.makeTables(AbstractKmerTable.ARRAY1D, 400, true) : null);
+
+ 	if(outFrag1!=null && outFrag2==null && outFrag1.indexOf('#')>-1){
+ 		outFrag2=outFrag1.replace("#", "2");
+ 		outFrag1=outFrag1.replace("#", "1");
+ 	}
+
+ 	if(outUnk1!=null && outUnk2==null && outUnk1.indexOf('#')>-1){
+ 		outUnk2=outUnk1.replace("#", "2");
+ 		outUnk1=outUnk1.replace("#", "1");
+ 	}
+
+ 	if(testmerge>0){
+ 		System.err.println("Testing merge rate.");
+ 		//Standard input is not sampled; it is given a rate of 1.
+ 		//NOTE(review): rate is assumed to be a fraction in [0,1], as the method name suggests - confirm against BBMerge.
+ 		float rate=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true).stdio() ? 1 : BBMerge.mergeableFraction(in1, in2, 1000000, 0.2f);
+ 		//testmerge is normalized to a fraction by parseArgument and is the threshold; it was previously ignored in favor of a fixed 0.1.
+ 		merge=rate>testmerge;
+ 		System.err.println("Merge rate: "+String.format("%.2f%%", rate*100)); //Scale the fraction so the printed percent sign is correct
+ 		if(!merge){
+ 			System.err.println("Merging was disabled due to a low merge rate of "+String.format("%.3f", rate));
+ 		}
+ 	}
+ }
+
+ /**
+  * Handles the arguments that are specific to this tool.
+  * Each recognized key stores its value in the matching field and returns immediately.
+  * @param arg Full argument; not used here
+  * @param a Key (presumably already lowercased by the caller - TODO confirm)
+  * @param b Value; may be null
+  * @return true if the key was recognized, false otherwise
+  */
+ @Override
+ public boolean parseArgument(String arg, String a, String b) {
+
+ 	//Junction symbol and output destinations
+ 	if(a.equals("symbol") || a.equals("junction")){
+ 		assert(b!=null && b.length()==1) : "Junction symbol must be a single character.";
+ 		symbol=(byte)b.charAt(0);
+ 		return true;
+ 	}
+ 	if(a.equals("outfrag") || a.equals("outfrag1") || a.equals("outf") || a.equals("outf1")){
+ 		outFrag1=b;
+ 		return true;
+ 	}
+ 	if(a.equals("outfrag2") || a.equals("outf2")){
+ 		outFrag2=b;
+ 		return true;
+ 	}
+ 	if(a.equals("outunknown") || a.equals("outunknown1") || a.equals("outu") || a.equals("outu1")){
+ 		outUnk1=b;
+ 		return true;
+ 	}
+ 	if(a.equals("outunknown2") || a.equals("outu2")){
+ 		outUnk2=b;
+ 		return true;
+ 	}
+ 	if(a.equals("outsingle") || a.equals("outs")){
+ 		outSingle=b;
+ 		return true;
+ 	}
+
+ 	//Read handling
+ 	if(a.equals("minlen") || a.equals("minlength") || a.equals("ml")){
+ 		minReadLength=Integer.parseInt(b);
+ 		return true;
+ 	}
+ 	if(a.equals("useinnerlmp") || a.equals("innerlmp")){
+ 		useInnerLMP=Tools.parseBoolean(b);
+ 		return true;
+ 	}
+ 	if(a.equals("rename")){
+ 		RENAME=Tools.parseBoolean(b);
+ 		return true;
+ 	}
+
+ 	//Masking and merging
+ 	if(a.equals("literal")){
+ 		literals=(b==null ? null : b.split(","));
+ 		return true;
+ 	}
+ 	if(a.equals("mask")){
+ 		mask=Tools.parseBoolean(b);
+ 		return true;
+ 	}
+ 	if(a.equals("merge")){
+ 		merge=Tools.parseBoolean(b);
+ 		return true;
+ 	}
+ 	if(a.equals("testmerge")){
+ 		testmerge=Double.parseDouble(b);
+ 		if(testmerge>1){testmerge/=100;} //Values above 1 are divided by 100
+ 		return true;
+ 	}
+ 	if(a.equals("rcomp")){
+ 		rcomp=Tools.parseBoolean(b);
+ 		return true;
+ 	}
+ 	if(a.equals("maskmiddle") || a.equals("mm")){
+ 		maskMiddle=Tools.parseBoolean(b);
+ 		return true;
+ 	}
+
+ 	//Kmer matching parameters
+ 	if(a.equals("k")){
+ 		k=Integer.parseInt(b);
+ 		return true;
+ 	}
+ 	if(a.equals("mink")){
+ 		mink=Integer.parseInt(b);
+ 		return true;
+ 	}
+ 	if(a.equals("hdist") || a.equals("hammingdistance")){
+ 		hdist=Integer.parseInt(b);
+ 		return true;
+ 	}
+ 	if(a.equals("hdist2") || a.equals("hammingdistance2")){
+ 		hdist2=Integer.parseInt(b);
+ 		return true;
+ 	}
+ 	if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){
+ 		edist=Integer.parseInt(b);
+ 		return true;
+ 	}
+
+ 	//Auxiliary outputs
+ 	if(a.equals("dump")){
+ 		dump=b;
+ 		return true;
+ 	}
+ 	if(a.equals("stats")){
+ 		outStats=b;
+ 		return true;
+ 	}
+
+ 	return false;
+ }
+
+ /**
+  * Validates the output paths, creates the FileFormat objects for the fragment, unknown and
+  * singleton outputs, and starts an output stream for each of those that was requested.
+  * @throws RuntimeException If Tools.testOutputFiles rejects the output files
+  */
+ @Override
+ void startupSubclass(){
+
+ 	final boolean writable=Tools.testOutputFiles(overwrite, append, false, out1, out2, outFrag1, outFrag2, outUnk1, outUnk2);
+ 	if(!writable){
+ 		throw new RuntimeException("\noverwrite="+overwrite+", append="+append+"\n" +
+ 				"Can't write to output files "+out1+", "+out2+", "+outFrag1+", "+outFrag2+", "+outUnk1+", "+outUnk2+"\n");
+ 	}
+
+ 	//Duplicates are only fatal when assertions are enabled.
+ 	final boolean distinct=Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, out1, out2, qfout1, qfout2, outFrag1, outFrag2, outUnk1, outUnk2);
+ 	assert(distinct) : "Duplicate files.";
+
+ 	ffoutFrag1=FileFormat.testOutput(outFrag1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ 	ffoutFrag2=FileFormat.testOutput(outFrag2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ 	ffoutUnk1=FileFormat.testOutput(outUnk1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+ 	ffoutUnk2=FileFormat.testOutput(outUnk2, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ 	ffoutSingle=FileFormat.testOutput(outSingle, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+ 	final int buff=4; //Same buffer argument for all three streams
+
+ 	if(outFrag1==null){
+ 		rosFrag=null;
+ 	}else{
+ 		rosFrag=ConcurrentReadOutputStream.getStream(ffoutFrag1, ffoutFrag2, null, null, buff, null, false);
+ 		rosFrag.start();
+ 	}
+
+ 	if(outUnk1==null){
+ 		rosUnk=null;
+ 	}else{
+ 		rosUnk=ConcurrentReadOutputStream.getStream(ffoutUnk1, ffoutUnk2, null, null, buff, null, false);
+ 		rosUnk.start();
+ 	}
+
+ 	if(outSingle==null){
+ 		rosSingle=null;
+ 	}else{
+ 		rosSingle=ConcurrentReadOutputStream.getStream(ffoutSingle, null, null, null, buff, null, false);
+ 		rosSingle.start();
+ 	}
+
+ }
+
+
+ @Override
+ /** Iterate through the reads.
+ * When masking is enabled, the kmers of the literal sequences are first stored in the tables
+ * (and optionally dumped to a file), and a TableReader is created for masking.
+ * Every read pair is then passed to the six-argument processReadPair, which sorts it into the
+ * LMP, fragment, unknown, or singleton list; each list is tallied and handed to its output stream.
+ * @param cris Input stream supplying the reads
+ * @param rosLmp Output stream for long-mate pairs; may be null
+ * This may optionally be overridden. */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream rosLmp){
+ //Masking setup; skipped entirely when mask is false.
+ if(mask){
+ final TableLoaderLockFree loader=new TableLoaderLockFree(tables, k, mink, 0, hdist, edist, rcomp, maskMiddle);
+ loader.setRefSkip(0);
+ loader.hammingDistance2=hdist2;
+ loader.storeMode(TableLoaderLockFree.SET_IF_NOT_PRESENT);
+ long kmers=loader.processData(null, literals, false, false, false);
+ outstream.println("Added "+kmers+" kmers.");
+ if(dump!=null){
+ ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true);
+ bsw.start();
+ for(AbstractKmerTable set : tables){
+ set.dumpKmersAsBytes(bsw, k, 0);
+ }
+ bsw.poisonAndWait();
+ }
+
+ reader=new TableReader(k, mink, 0, 0, 0, rcomp, maskMiddle);
+ reader.trimSymbol=symbol; //Masked bases are written as the junction symbol (presumably - TODO confirm in TableReader)
+ assert(kmers>0) : "There were no stored kmers; please check your settings.";
+
+// assert(false) : hdist+", "+hdist2+", "+maskMiddle+", "+rcomp+", "+k+", "+mink;
+ }else{
+ reader=null;
+ }
+ //Reset counters. NOTE(review): readsSingle and basesSingle, which are updated below, are not reset here.
+ readsProcessed=0;
+ basesProcessed=0;
+
+ readsLmp=0;
+ basesLmp=0;
+ readsFrag=0;
+ basesFrag=0;
+ readsUnk=0;
+ basesUnk=0;
+
+ pairedInput=cris.paired();
+
+ {
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> listIn=(ln!=null ? ln.list : null);
+
+ if(listIn!=null && !listIn.isEmpty()){
+ Read r=listIn.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(listIn!=null && listIn.size()>0){
+ if(verbose){outstream.println("Fetched "+listIn.size()+" reads.");}
+ //One output list per category, refilled for every input list.
+ ArrayList<Read> outLmp=new ArrayList<Read>(listIn.size());
+ ArrayList<Read> outFrag=new ArrayList<Read>(listIn.size());
+ ArrayList<Read> outUnk=new ArrayList<Read>(listIn.size());
+ ArrayList<Read> outSingle=new ArrayList<Read>(listIn.size());
+
+ for(int idx=0; idx<listIn.size(); idx++){
+ final Read r1=listIn.get(idx);
+ final Read r2=r1.mate;
+
+ final int initialLength1=r1.length();
+ final int initialLength2=(r1.mateLength());
+
+ readsProcessed+=1+r1.mateCount();
+ basesProcessed+=initialLength1+initialLength2;
+
+ boolean keep=processReadPair(r1, r2, outLmp, outFrag, outUnk, outSingle); //The return value is not used
+
+ }
+ //Tally each category and pass its list to the matching stream, if that stream exists.
+ for(Read r1 : outLmp){
+ readsLmp+=1+r1.mateCount();
+ basesLmp+=r1.length()+r1.mateLength();
+ assert(r1.mate!=null);
+ assert(r1.pairnum()==0);
+ assert(r1.mate.pairnum()==1) : r1.mate.id+"\n\n"+r1.length()+"\n"+r1+"\n\n"+r1.mateLength()+"\n"+r1.mate+"\n\n";
+ assert(r1.mate.mate==r1);
+ }
+ if(rosLmp!=null){rosLmp.add(outLmp, ln.id);}
+
+ for(Read r1 : outFrag){
+ readsFrag+=1+r1.mateCount();
+ basesFrag+=r1.length()+r1.mateLength();
+ assert(r1.mate!=null);
+ assert(r1.pairnum()==0);
+ assert(r1.mate.pairnum()==1);
+ assert(r1.mate.mate==r1);
+ }
+ if(rosFrag!=null){rosFrag.add(outFrag, ln.id);}
+
+ for(Read r1 : outUnk){
+ readsUnk+=1+r1.mateCount();
+ basesUnk+=r1.length()+r1.mateLength();
+ assert(r1.mate!=null);
+ assert(r1.pairnum()==0);
+ assert(r1.mate.pairnum()==1);
+ assert(r1.mate.mate==r1);
+ }
+ if(rosUnk!=null){rosUnk.add(outUnk, ln.id);}
+
+ for(Read r1 : outSingle){ //Singletons are asserted to be unpaired
+ readsSingle+=1+r1.mateCount();
+ basesSingle+=r1.length()+r1.mateLength();
+ assert(r1.pairnum()==0);
+ assert(r1.mate==null);
+ }
+ if(rosSingle!=null){rosSingle.add(outSingle, ln.id);}
+ //Hand the processed list back to the input stream and fetch the next one.
+ cris.returnList(ln.id, ln.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+ ln=cris.nextList();
+ listIn=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+ }
+
+ /**
+  * Writes the category statistics to the stats destination (outStats, or "stderr" when that is null).
+  * Percentages are relative to readsIn and basesIn; merge statistics are added only when merging is enabled.
+  * @param t Timer; not used here
+  * @param readsIn Number of input reads used as the denominator for read percentages
+  * @param basesIn Number of input bases used as the denominator for base percentages
+  */
+ @Override
+ void showStatsSubclass(final Timer t, long readsIn, long basesIn){
+
+ 	final TextStreamWriter tsw=new TextStreamWriter(outStats==null ? "stderr" : outStats, overwrite, append, false);
+ 	tsw.start();
+
+ 	outstream.println("");
+
+ 	final double rmult=(pairedInput ? 100.0 : 50.0)/readsIn;
+ 	final double bmult=100.0/basesIn;
+
+ 	//Note that this can go over 100%
+ 	final String lmpLine="Long Mate Pairs: \t"+readsLmp+" reads ("+String.format("%.2f",readsLmp*rmult)+"%) \t"+
+ 			basesLmp+" bases ("+String.format("%.2f",basesLmp*bmult)+"%)";
+ 	tsw.println(lmpLine);
+
+ 	final String fragLine="Fragment Pairs: \t"+readsFrag+" reads ("+String.format("%.2f",readsFrag*rmult)+"%) \t"+
+ 			basesFrag+" bases ("+String.format("%.2f",basesFrag*bmult)+"%)";
+ 	tsw.println(fragLine);
+
+ 	final String unkLine="Unknown Pairs: \t"+readsUnk+" reads ("+String.format("%.2f",readsUnk*rmult)+"%) \t"+
+ 			basesUnk+" bases ("+String.format("%.2f",basesUnk*bmult)+"%)";
+ 	tsw.println(unkLine);
+
+ 	final String singleLine="Singletons: \t"+readsSingle+" reads ("+String.format("%.2f",readsSingle*100.0/readsIn)+"%) \t"+
+ 			basesSingle+" bases ("+String.format("%.2f",basesSingle*bmult)+"%)";
+ 	tsw.println(singleLine);
+
+ 	tsw.println("\n(Note: Read totals may exceed 100%, though base totals should not.)");
+ 	tsw.println("");
+
+ 	final String adapterLine="Adapters Detected: \t"+junctionsDetected+" ("+String.format("%.2f%%)",junctionsDetected*100.0/junctionsSought);
+ 	tsw.println(adapterLine);
+
+ 	final String recoveredLine="Bases Recovered: \t"+(basesLmp+basesFrag+basesUnk+basesSingle)+
+ 			" ("+String.format("%.2f%%)",(basesLmp+basesFrag+basesUnk+basesSingle)*bmult);
+ 	tsw.println(recoveredLine);
+
+ 	if(merge){
+ 		tsw.println("");
+ 		final String mergedPairs="Merged Pairs: \t"+mergedReadCount+" ("+String.format("%.2f%%)",mergedReadCount*200.0/readsProcessed);
+ 		tsw.println(mergedPairs);
+ 		final String mergedBases="Merged Bases: \t"+mergedBaseCount+" ("+String.format("%.2f%%)",mergedBaseCount*100.0/basesProcessed);
+ 		tsw.println(mergedBases);
+ 	}
+
+ 	errorState|=tsw.poisonAndWait();
+ }
+
+ @Override
+ boolean processReadPair(Read r1, Read r2) {
+ throw new RuntimeException("Do not use.");
+ }
+
+ /**
+  * Splits a read pair (or a single read when r2 is null) at the junction symbol and
+  * distributes the resulting pieces among the output lists.
+  * When merging is enabled and the mates overlap, the merged read is passed to
+  * processMergedRead instead and its result is returned.
+  * @param r1 First read; must not be null.
+  * @param r2 Mate of r1, or null for unpaired input.
+  * @param outLmp Receives pieces paired across the junction (outer pairs, plus inner pairs when useInnerLMP is set).
+  * @param outFrag Receives pieces paired from the same side of the junction in both reads.
+  * @param outUnk Receives pairs in which neither read contains the symbol.
+  * @param outSingle Receives pieces left without a partner.
+  * @return true on every path.
+  */
+ boolean processReadPair(Read r1, Read r2, ArrayList<Read> outLmp, ArrayList<Read> outFrag, ArrayList<Read> outUnk, ArrayList<Read> outSingle) {
+     //Merging needs both mates; unpaired reads go straight to junction detection.
+     if(merge && r2!=null){
+         int insert=BBMerge.findOverlapStrict(r1, r2, false);
+         if(insert>0){
+             //r2 is flipped only for the duration of the join, then restored.
+             r2.reverseComplement();
+             Read merged=r1.joinRead(insert);
+             r2.reverseComplement();
+
+             if(mask){reader.kMask(merged, tables);}
+
+             //A merged read is always processed as such, whatever masking found.
+             mergedReadCount++;
+             mergedBaseCount+=r1.length()+r2.length()-merged.length();
+             return processMergedRead(merged, r1, r2, outLmp, outFrag, outUnk, outSingle);
+         }
+     }
+
+     if(mask){
+         reader.kMask(r1, tables);
+         if(r2!=null){reader.kMask(r2, tables);}
+     }
+
+     junctionsSought++;
+     r1.start=Tools.indexOf(r1.bases, symbol);
+     r1.stop=Tools.lastIndexOf(r1.bases, symbol);
+
+     assert(r1==null || r1.pairnum()==0);
+     assert(r2==null || r2.pairnum()==1);
+
+     if(r2!=null){
+         r2.start=Tools.indexOf(r2.bases, symbol);
+         r2.stop=Tools.lastIndexOf(r2.bases, symbol);
+
+         if(r1.start<0 && r2.start<0){
+             if(verbose){System.err.println("Added unknown pair "+r1.id);}
+             outUnk.add(r1);
+             return true;
+         }
+         r1.mate=r2.mate=null;
+     }else if(r1.start<0){
+         if(verbose){System.err.println("Added singleton "+r1.id);}
+         outSingle.add(r1);
+         return true;
+     }
+
+     junctionsDetected++;
+
+     //Pieces on either side of the symbol run in each read; null means absent or shorter than minReadLength.
+     Read r1left=null, r1right=null, r2left=null, r2right=null;
+
+     if(r2==null){
+         if(r1.start>=0){
+             int left=r1.start;
+             int right=r1.length()-r1.stop-1;
+
+             r1left=(left>=minReadLength ? r1.subRead(0, r1.start) : null);
+             r1right=null;
+             r2left=null;
+             //The part after the junction of an unpaired read plays the role of read 2.
+             r2right=(right>=minReadLength ? r1.subRead(r1.stop+1, r1.length()) : null);
+             if(r2right!=null){
+                 r2right.setPairnum(1);
+                 if(RENAME){
+                     r2right.id=r2right.id.replaceFirst(" /1", " /2");
+                     r2right.id=r2right.id.replaceFirst(" 1:", " 2:");
+                 }
+             }
+         }
+     }else if(r1.start>=0 && r2.start>=0){//confusing
+
+         {
+             int left=r1.start;
+             int right=r1.length()-r1.stop-1;
+
+             r1left=(left>=minReadLength ? r1.subRead(0, r1.start) : null);
+             r1right=(right>=minReadLength ? r1.subRead(r1.stop+1, r1.length()) : null);
+         }
+         {
+             int left=r2.start;
+             int right=r2.length()-r2.stop-1;
+
+             //Note these are reversed
+             r2left=(right>=minReadLength ? r2.subRead(r2.stop+1, r2.length()) : null);
+             r2right=(left>=minReadLength ? r2.subRead(0, r2.start) : null);
+         }
+     }else if(r1.start>=0){
+         int left=r1.start;
+         int right=r1.length()-r1.stop-1;
+
+         r1left=(left>=minReadLength ? r1.subRead(0, r1.start) : null);
+         r1right=(right>=minReadLength ? r1.subRead(r1.stop+1, r1.length()) : null);
+         r2left=null;
+         r2right=r2;
+     }else if(r2.start>=0){
+         int left=r2.start;
+         int right=r2.length()-r2.stop-1;
+
+         //Note these are reversed
+         r2left=(right>=minReadLength ? r2.subRead(r2.stop+1, r2.length()) : null);
+         r2right=(left>=minReadLength ? r2.subRead(0, r2.start) : null);
+         r1left=r1;
+         r1right=null;
+     }else{
+         assert(false) : r1.start+", "+r1.stop+(r2==null ? "null" : ", "+r2.start+", "+r2.stop);
+     }
+
+     //Pair up pieces in priority order; each piece that gets paired is nulled so it is not reused.
+     if(r1left!=null && r2right!=null){//outer lmp
+         if(verbose){System.err.println("Added outer LMP "+r1.id);}
+         r1left.mate=r2right;
+         r2right.mate=r1left;
+         outLmp.add(r1left);
+         r1left=r2right=null;
+     }
+
+     if(r1right!=null && r2left!=null){//inner lmp
+         if(verbose){System.err.println("Added inner LMP "+r1.id);}
+         if(useInnerLMP){
+             r1right.mate=r2left;
+             r2left.mate=r1right;
+             outLmp.add(r1right);
+             r1right=r2left=null;
+         }
+     }
+
+     if(r1left!=null && r2left!=null){//left frag
+         if(verbose){System.err.println("Added left frag "+r1.id);}
+         r1left.mate=r2left;
+         r2left.mate=r1left;
+         outFrag.add(r1left);
+         r1left=r2left=null;
+     }
+
+     if(r1right!=null && r2right!=null){//right frag
+         if(verbose){System.err.println("Added right frag "+r1.id);}
+         r1right.mate=r2right;
+         r2right.mate=r1right;
+         outFrag.add(r1right);
+         r1right=r2right=null;
+     }
+
+     //Singletons
+     if(r1left!=null){
+         if(verbose){System.err.println("Added singleton r1left "+r1left.id);}
+         outSingle.add(r1left);
+     }
+     if(r1right!=null){
+         if(verbose){System.err.println("Added singleton r1right "+r1right.id);}
+         outSingle.add(r1right);
+     }
+     if(r2left!=null){
+         if(verbose){System.err.println("Added singleton r2left "+r2left.id);}
+         r2left.setPairnum(0);
+         outSingle.add(r2left);
+     }
+     if(r2right!=null){
+         if(verbose){System.err.println("Added singleton r2right "+r2right.id);}
+         r2right.setPairnum(0);
+         outSingle.add(r2right);
+     }
+
+     return true;
+ }
+
+ boolean processMergedRead(Read merged, Read r1, Read r2, ArrayList<Read> outLmp, ArrayList<Read> outFrag, ArrayList<Read> outUnk, ArrayList<Read> outSingle) {
+
+// int a=0, b, c;
+// if(mask){
+// a=reader.kMask(merged, tables);
+// }
+
+ junctionsSought++;
+ merged.start=Tools.indexOf(merged.bases, symbol);
+ merged.stop=Tools.lastIndexOf(merged.bases, symbol);
+
+ assert(merged!=null && merged.pairnum()==0);
+
+ if(merged.start<0){
+ if(verbose){System.err.println("Added frag "+r1.id);}
+ outFrag.add(r1);
+ return true;
+ }
+
+ junctionsDetected++;
+
+ Read r1left=null, r1right=null;
+
+ int left=merged.start;
+ int right=merged.length()-merged.stop-1;
+
+ r1left=(left>=minReadLength ? merged.subRead(0, merged.start) : null);
+ r1right=(right>=minReadLength ? merged.subRead(merged.stop+1, merged.length()-1) : null);
+ if(r1right!=null && r1left!=null){
+ r1right.setPairnum(1);
+ if(RENAME){
+ r1right.id=r1right.id.replaceFirst(" /1", " /2");
+ r1right.id=r1right.id.replaceFirst(" 1:", " 2:");
+ }
+ if(verbose){System.err.println("Added outer LMP "+merged.id);}
+ r1left.mate=r1right;
+ r1right.mate=r1left;
+ outLmp.add(r1left);
+ return true;
+ }
+
+ //Singletons
+ if(r1left!=null){
+ if(verbose){System.err.println("Added singleton r1left "+r1left.id);}
+ outSingle.add(r1left);
+ }
+ if(r1right!=null){
+ if(verbose){System.err.println("Added singleton r1right "+r1right.id);}
+ outSingle.add(r1right);
+ }
+
+ return true;
+ }
+
+ @Override
+ void shutdownSubclass(){
+ errorState|=ReadWrite.closeStreams(null, rosFrag, rosUnk, rosSingle);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Masking Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String[] refs=null;
+ private String[] literals=new String[] {"CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG"};
+
+ private final AbstractKmerTable[] tables;
+
+ private int k=19;
+ private int mink=11;
+ private int hdist=1;
+ private int hdist2=0;
+ private int edist=0;
+ private boolean rcomp=true;
+ private boolean maskMiddle=false;
+
+ private String dump=null;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private TableReader reader;
+
+ protected String outStats="stderr";
+
+ protected String outFrag1;
+ protected String outFrag2;
+
+ protected String outUnk1;
+ protected String outUnk2;
+
+ protected String outSingle;
+
+ protected FileFormat ffoutFrag1;
+ protected FileFormat ffoutFrag2;
+
+ protected FileFormat ffoutUnk1;
+ protected FileFormat ffoutUnk2;
+
+ protected FileFormat ffoutSingle;
+
+ protected ConcurrentReadOutputStream rosFrag;
+ protected ConcurrentReadOutputStream rosUnk;
+ protected ConcurrentReadOutputStream rosSingle;
+
+ private int minReadLength;
+
+ private boolean mask;
+ private boolean merge;
+ private double testmerge;
+
+ private long readsLmp=0;
+ private long basesLmp=0;
+ private long readsFrag=0;
+ private long basesFrag=0;
+ private long readsUnk=0;
+ private long basesUnk=0;
+ private long readsSingle=0;
+ private long basesSingle=0;
+ private long mergedReadCount=0;
+ private long mergedBaseCount=0;
+
+ private long junctionsSought=0, junctionsDetected=0;
+
+ private boolean pairedInput;
+
+ private byte symbol;
+ private boolean useInnerLMP;
+
+ private boolean RENAME;
+
+}
diff --git a/current/jgi/SplitPairsAndSingles.java b/current/jgi/SplitPairsAndSingles.java
new file mode 100755
index 0000000..ef83291
--- /dev/null
+++ b/current/jgi/SplitPairsAndSingles.java
@@ -0,0 +1,809 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.DualCris;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 4, 2013
+ *
+ */
+public final class SplitPairsAndSingles {
+
+
+
+ public static void main(String[] args){
+
+ if(args==null || args.length==0 || (args.length==1 &&
+ (args[0].equalsIgnoreCase("-h") || args[0].equals("-help") || args[0].equals("--help") || args[0].equals("-?") || args[0].equals("?")))){
+ printOptions();
+ System.exit(0);
+ }
+ SplitPairsAndSingles dd=new SplitPairsAndSingles(args);
+ dd.process();
+ }
+
+ private static void printOptions(){
+ System.err.println("Please consult the shellscript for usage information.");
+
+ }
+
+ public SplitPairsAndSingles(String[] args){
+ for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}}
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ ReadWrite.ZIPLEVEL=2;
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+
+ ByteFile.FORCE_MODE_BF2=Shared.threads()>2;
+
+ Parser parser=new Parser();
+ parser.trimq=trimq;
+ parser.minReadLength=minReadLength;
+ boolean setOut=false, setOuts=false;
+ boolean fixInterleaving_=false, repair_=false, allowIdenticalPairNames_=false;
+
+ {
+ boolean b=false;
+ assert(b=true);
+ EA=b;
+ }
+
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1=b;
+ }else if(a.equals("in2")){
+ in2=b;
+ }else if(a.equals("out") || a.equals("out1") || a.equals("outp") || a.equals("outp1") || a.equals("outpair") || a.equals("outpair1")){
+ out1=b;
+ setOut=true;
+ }else if(a.equals("out2") || a.equals("outp2") || a.equals("outpair2")){
+ out2=b;
+ }else if(a.equals("outs") || a.equals("outsingle") || a.equals("outb") || a.equals("outbad")){
+ outsingle=b;
+ setOut=true;
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("fixinterleaving") || a.equals("fi") || a.equals("fint") || a.equals("fixint")){
+ fixInterleaving_=Tools.parseBoolean(b);
+ if(fixInterleaving_){repair_=false;}
+ }else if(a.equals("allowidenticalnames") || a.equals("ain")){
+ allowIdenticalPairNames_=Tools.parseBoolean(b);
+ }else if(a.equals("repair") || a.equals("rp")){
+ repair_=Tools.parseBoolean(b);
+ if(repair_){fixInterleaving_=false;}
+ }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ in1=args[i];
+ }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ out1=args[i];
+ setOut=true;
+ }else if(i==2 && outsingle==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
+ outsingle=args[i];
+ setOuts=true;
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ trimq=parser.trimq;
+ minReadLength=parser.minReadLength;
+ }
+
+ allowIdenticalPairNames=allowIdenticalPairNames_;
+ fixInterleaving=fixInterleaving_;
+ repair=repair_;
+ assert(!repair || ! fixInterleaving) : "ERROR: Choose 'fixInterleaving' or 'repair', but not both.";
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ if(in1!=null && in1.contains("#") && !new File(in1).exists()){
+ int pound=in1.lastIndexOf('#');
+ String a=in1.substring(0, pound);
+ String b=in1.substring(pound+1);
+ in1=a+1+b;
+ in2=a+2+b;
+ }
+ if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;}
+
+ if(fixInterleaving){
+ if(in2!=null){
+ System.err.println("ERROR: 'FixInterleaving' mode only works with a single interleaved input file, not paired input files.");
+ System.err.println("Aborting.");
+ System.exit(1);
+ }
+ parser.setInterleaved=true;
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Paired input disabled; running in FixInterleaving mode");
+ }
+
+ if(repair){
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{
+ if(!parser.setInterleaved && in2==null){
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+ }
+
+ if(out1!=null && out1.contains("#")){
+ int pound=out1.lastIndexOf('#');
+ String a=out1.substring(0, pound);
+ String b=out1.substring(pound+1);
+ out1=a+1+b;
+ out2=a+2+b;
+ }
+
+ if(!setOut){
+ System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar.");
+// out1="stdout.fq";
+ outstream=System.err;
+ out2=null;
+ }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){
+ out1="stdout.fq";
+ outstream=System.err;
+ out2=null;
+ }
+ if(out1!=null && !Tools.canWrite(out1, overwrite)){throw new RuntimeException("Output file "+out1+" already exists, and overwrite="+overwrite);}
+
+ assert(!in1.equalsIgnoreCase(out1));
+ assert(!in1.equalsIgnoreCase(outsingle));
+ assert(!in1.equalsIgnoreCase(in2));
+ assert(out1==null || !out1.equalsIgnoreCase(out2));
+ assert(out1==null || !out1.equalsIgnoreCase(outsingle));
+
+ pairMap=(repair ? new LinkedHashMap<String, Read>() : null);
+ }
+
+ /**
+  * Runs the whole job via process2() while timing it, then prints read/base totals and,
+  * when showSpeed is set, throughput figures to outstream.
+  * @throws RuntimeException if errorState is set once processing has finished.
+  */
+ public void process(){
+
+     final Timer timer=new Timer();
+     process2();
+     timer.stop();
+
+     outstream.println("\nInput: \t"+readsIn+" reads \t\t"+basesIn+" bases.");
+
+     final boolean trimming=(qtrimLeft || qtrimRight);
+     if(trimming){
+         final String readPct=String.format("%.2f",readsTrimmed*100.0/readsIn);
+         final String basePct=String.format("%.2f",basesTrimmed*100.0/basesIn);
+         outstream.println("Trimmed: \t"+readsTrimmed+" reads ("+readPct+"%) \t"+basesTrimmed+" bases ("+basePct+"%)");
+     }
+     {
+         final String readPct=String.format("%.2f",readsOut*100.0/readsIn);
+         final String basePct=String.format("%.2f",basesOut*100.0/basesIn);
+         outstream.println("Result: \t"+readsOut+" reads ("+readPct+"%) \t"+basesOut+" bases ("+basePct+"%)");
+     }
+     {
+         final String readPct=String.format("%.2f",pairsOut*100.0/readsIn);
+         final String basePct=String.format("%.2f",pairBasesOut*100.0/basesIn);
+         outstream.println("Pairs: \t"+pairsOut+" reads ("+readPct+"%) \t"+pairBasesOut+" bases ("+basePct+"%)");
+     }
+     {
+         final String readPct=String.format("%.2f",singlesOut*100.0/readsIn);
+         final String basePct=String.format("%.2f",singleBasesOut*100.0/basesIn);
+         outstream.println("Singletons: \t"+singlesOut+" reads ("+readPct+"%) \t"+singleBasesOut+" bases ("+basePct+"%)");
+     }
+
+     if(showSpeed){
+         //Rates are per unit of timer.elapsed; the multipliers below scale them for display.
+         final double readsPerTick=readsIn/(double)(timer.elapsed);
+         final double basesPerTick=basesIn/(double)(timer.elapsed);
+
+         //Abbreviate large counts with a k or m suffix, then left-pad to 8 characters.
+         String readCount=(readsIn<100000 ? ""+readsIn : readsIn<100000000 ? (readsIn/1000)+"k" : (readsIn/1000000)+"m");
+         String baseCount=(basesIn<100000 ? ""+basesIn : basesIn<100000000 ? (basesIn/1000)+"k" : (basesIn/1000000)+"m");
+         while(readCount.length()<8){readCount=" "+readCount;}
+         while(baseCount.length()<8){baseCount=" "+baseCount;}
+
+         outstream.println("\nTime: \t\t\t"+timer);
+         outstream.println("Reads Processed: "+readCount+" \t"+String.format("%.2fk reads/sec", readsPerTick*1000000));
+         outstream.println("Bases Processed: "+baseCount+" \t"+String.format("%.2fm bases/sec", basesPerTick*1000));
+     }
+
+     if(errorState){
+         throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+     }
+ }
+
+ /**
+  * Opens the input stream and the optional pair/singleton output streams, dispatches to
+  * the processing loop for the selected mode, then closes all streams.
+  * The result of closing the streams is ORed into errorState so that process() can report it.
+  */
+ private void process2(){
+     final ConcurrentReadInputStream cris;
+     if(in2!=null && repair){
+         //Repair mode with two input files uses a DualCris.
+         FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+         cris=DualCris.getReadInputStream(maxReads, true, ff1, ff2, null, null);
+     }else{
+         FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, null, null, null);
+     }
+     if(verbose){System.err.println("Started cris");}
+     cris.start();
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     final ConcurrentReadOutputStream ros, rosb;
+     final int buff=4;
+     if(out1!=null){
+         FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, false);
+         FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, false);
+         ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
+         ros.start();
+     }else{ros=null;}
+     if(outsingle!=null){
+         FileFormat ff1=FileFormat.testOutput(outsingle, FileFormat.FASTQ, null, true, overwrite, append, false);
+         rosb=ConcurrentReadOutputStream.getStream(ff1, null, buff, null, true);
+         rosb.start();
+     }else{rosb=null;}
+     if(ros!=null || rosb!=null){
+         outstream.println("Started output stream.");
+     }
+
+     if(fixInterleaving){
+         process3_fixInterleaving(cris, ros, rosb);
+     }else if(repair){
+         if(cris.getClass()==DualCris.class){
+             process3_repair((DualCris)cris, ros, rosb);
+         }else{
+             process3_repair(cris, ros, rosb);
+         }
+     }else{
+         process3(cris, ros, rosb);
+     }
+
+     //Keep the close result; discarding it left errorState permanently false.
+     errorState|=ReadWrite.closeStreams(cris, ros, rosb);
+ }
+//
+// private void process3_old(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb){
+//
+// ListNum<Read> ln=cris.nextList();
+// ArrayList<Read> reads0=(ln!=null ? ln.list : null);
+// ArrayList<Read> single=(rosb==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+//
+// while(reads0!=null && reads0.size()>0){
+// ArrayList<Read> reads=(ArrayList<Read>) reads0.clone();
+// int removed=0;
+// for(int i=0; i<reads.size(); i++){
+// Read r1=reads.get(i);
+// Read r2=r1.mate;
+//
+// readsIn++;
+// basesIn+=r1.length();
+// if(r2!=null){
+// readsIn++;
+// basesIn+=r2.length();
+// }
+//
+// {
+// if(trimLeft || trimRight){
+// if(r1!=null){
+// int x=TrimRead.trimFast(r1, trimLeft, trimRight, trimq, 1);
+// basesTrimmed+=x;
+// readsTrimmed+=(x>0 ? 1 : 0);
+// }
+// if(r2!=null){
+// int x=TrimRead.trimFast(r2, trimLeft, trimRight, trimq, 1);
+// basesTrimmed+=x;
+// readsTrimmed+=(x>0 ? 1 : 0);
+// }
+// }
+//
+// final int rlen1=(r1==null ? -1 : r1.length());
+// final int rlen2=(r2==null ? -1 : r2.length());
+//
+// if(verbose){System.err.println("rlen1="+rlen1+", rlen2="+rlen2);}
+//
+// if(rlen1<minReadLength || rlen2<minReadLength){
+// reads.set(i, null);
+// removed++;
+// r1.mate=null;
+// if(r2!=null){
+// r2.mate=null;
+// }
+//
+// if(rlen1>=minReadLength){
+// single.add(r1);
+// singlesOut++;
+// singleBasesOut+=rlen1;
+// }
+// if(rlen2>=minReadLength){
+// single.add(r2);
+// singlesOut++;
+// singleBasesOut+=rlen2;
+// }
+// }else{
+// if(r1!=null){
+// pairsOut++;
+// pairBasesOut+=rlen2;
+// }
+// if(r2!=null){
+// pairsOut++;
+// pairBasesOut+=rlen2;
+// }
+// }
+// }
+// }
+//
+// if(rosb!=null){
+// if(verbose){System.err.println("Adding "+single.size()+" to single out.");}
+// rosb.add(new ArrayList<Read>(single), ln.id);
+// single.clear();
+// }
+//
+// if(ros!=null){
+// if(removed>0){Tools.condenseStrict(reads);}
+// ArrayList<Read> x=new ArrayList<Read>(reads.size());
+// x.addAll(reads);
+// if(verbose){System.err.println("Adding "+x.size()+" to pair out.");}
+// ros.add(x, ln.id);
+// }
+//
+// cris.returnList(ln.id, ln.list.isEmpty());
+// ln=cris.nextList();
+// reads0=(ln!=null ? ln.list : null);
+// }
+// cris.returnList(ln.id, ln.list.isEmpty());
+//
+// readsOut+=singlesOut+pairsOut;
+// basesOut+=singleBasesOut+pairBasesOut;
+// }
+
+ /**
+  * Default processing loop: passes every read and its mate to processPair and forwards
+  * the resulting pairs and singletons to the output streams, one batch per input list.
+  * @param cris Input stream.
+  * @param ros Output stream for pairs, or null.
+  * @param rosb Output stream for singletons, or null.
+  */
+ private void process3(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb){
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     final ArrayList<Read> pairs=(ros==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> singles=(rosb==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+
+     while(reads!=null && reads.size()>0){
+         for(Read r1 : reads){
+             processPair(r1, r1.mate, pairs, singles);
+         }
+
+         //Emit output while ln still refers to the list just processed, so the batch carries
+         //that list's id and ln cannot be null here.
+         if(rosb!=null){
+             if(verbose){System.err.println("Adding "+singles.size()+" to single out.");}
+             rosb.add(new ArrayList<Read>(singles), ln.id);
+             singles.clear();
+         }
+
+         if(ros!=null){
+             if(verbose){System.err.println("Adding "+pairs.size()+" to pair out.");}
+             ros.add(new ArrayList<Read>(pairs), ln.id);
+             pairs.clear();
+         }
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     //Return the terminating list, if one was delivered.
+     if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+
+     readsOut+=singlesOut+pairsOut;
+     basesOut+=singleBasesOut+pairBasesOut;
+ }
+
+ /**
+  * Processing loop for fixInterleaving mode: walks the reads in input order and pairs each
+  * read with its successor when FASTQ.testPairNames accepts the two names; a read whose
+  * successor does not match is processed as a singleton.
+  * @param cris Input stream.
+  * @param ros Output stream for pairs, or null.
+  * @param rosb Output stream for singletons, or null.
+  */
+ private void process3_fixInterleaving(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb){
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     final ArrayList<Read> pairs=(ros==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+     final ArrayList<Read> singles=(rosb==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+
+     //Read still waiting for a possible mate; carried across list boundaries.
+     Read prev=null;
+
+     while(reads!=null && reads.size()>0){
+         for(Read current : reads){
+             if(prev!=null){
+                 if(FASTQ.testPairNames(prev, current, allowIdenticalPairNames)){
+                     if(verbose){System.err.println("A");}
+                     processPair(prev, current, pairs, singles);
+                     prev=null;
+                     continue;//current was consumed as the mate
+                 }
+                 if(verbose){System.err.println("B");}
+                 processPair(prev, null, null, singles);
+             }
+             prev=current;
+         }
+
+         //Remember the list just processed; its id labels the output after ln advances.
+         final ListNum<Read> finished=ln;
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+
+         if((reads==null || reads.isEmpty()) && prev!=null){ //Process last read
+             //No further reads will arrive, so the pending read cannot have a mate.
+             if(verbose){System.err.println("D");}
+             processPair(prev, null, null, singles);
+             prev=null;
+         }
+
+         if(rosb!=null){
+             if(verbose){System.err.println("Adding "+singles.size()+" to single out.");}
+             rosb.add(new ArrayList<Read>(singles), finished.id);
+             singles.clear();
+         }
+
+         if(ros!=null){
+             if(verbose){System.err.println("Adding "+pairs.size()+" to pair out.");}
+             ros.add(new ArrayList<Read>(pairs), finished.id);
+             pairs.clear();
+         }
+     }
+     //Return the terminating list, if one was delivered.
+     if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+
+     readsOut+=singlesOut+pairsOut;
+     basesOut+=singleBasesOut+pairBasesOut;
+ }
+
+ /**
+  * Repair loop for a DualCris input: feeds every read and its mate to repair(), sends
+  * each completed pair to ros, and finally sends all reads still waiting in pairMap to rosb.
+  * Tracks which pair numbers were seen in each list and reports them to cris.returnList.
+  * @param cris Dual input stream.
+  * @param ros Output stream for pairs, or null.
+  * @param rosb Output stream for singletons, or null.
+  */
+ private void process3_repair(final DualCris cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb){
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     final ArrayList<Read> pairs=(ros==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+
+     boolean foundR1=false, foundR2=false;
+     while(reads!=null && reads.size()>0){
+         for(Read r1 : reads){
+             Read r2=r1.mate;//captured first because repair(r1) clears r1.mate
+
+             if(r1.pairnum()==0){foundR1=true;}
+             else{foundR2=true;}
+             if(r2!=null){
+                 if(r2.pairnum()==0){foundR1=true;}
+                 else{foundR2=true;}
+             }
+
+             {
+                 Read pair=repair(r1);
+                 if(pair!=null && pairs!=null){pairs.add(pair);}
+             }
+             {
+                 Read pair=repair(r2);
+                 if(pair!=null && pairs!=null){pairs.add(pair);}
+             }
+         }
+
+         //Emit output while ln still refers to the list just processed.
+         if(ros!=null){
+             if(verbose){System.err.println("Adding "+pairs.size()+" to pair out.");}
+             ros.add(new ArrayList<Read>(pairs), ln.id);
+             pairs.clear();
+         }
+
+         cris.returnList(ln.id, foundR1, foundR2);
+         foundR1=foundR2=false;
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     //Return the terminating list, if one was delivered.
+     if(ln!=null){cris.returnList(ln.id, foundR1, foundR2);}
+
+     //Whatever is left in the map never found a mate.
+     if(!pairMap.isEmpty()){
+         final ArrayList<Read> singles=new ArrayList<Read>(pairMap.size());
+         for(Read r : pairMap.values()){
+             singles.add(r);
+             singlesOut++;
+             singleBasesOut+=r.length();
+         }
+         pairMap.clear();
+         if(verbose){System.err.println("Adding "+singles.size()+" to single out.");}
+         if(rosb!=null){rosb.add(singles, 0);}
+     }
+
+     readsOut+=singlesOut+pairsOut;
+     basesOut+=singleBasesOut+pairBasesOut;
+ }
+
+ /**
+  * Repair loop for a single input stream: feeds every read and its mate to repair(), sends
+  * each completed pair to ros, and finally sends all reads still waiting in pairMap to rosb.
+  * @param cris Input stream.
+  * @param ros Output stream for pairs, or null.
+  * @param rosb Output stream for singletons, or null.
+  */
+ private void process3_repair(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros, final ConcurrentReadOutputStream rosb){
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     final ArrayList<Read> pairs=(ros==null ? null : new ArrayList<Read>(Shared.READ_BUFFER_LENGTH));
+
+     while(reads!=null && reads.size()>0){
+         for(Read r1 : reads){
+             Read r2=r1.mate;//captured first because repair(r1) clears r1.mate
+
+             {
+                 Read pair=repair(r1);
+                 if(pair!=null && pairs!=null){pairs.add(pair);}
+             }
+             {
+                 Read pair=repair(r2);
+                 if(pair!=null && pairs!=null){pairs.add(pair);}
+             }
+         }
+
+         //Emit output while ln still refers to the list just processed.
+         if(ros!=null){
+             if(verbose){System.err.println("Adding "+pairs.size()+" to pair out.");}
+             ros.add(new ArrayList<Read>(pairs), ln.id);
+             pairs.clear();
+         }
+
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     //Return the terminating list, if one was delivered.
+     if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+
+     //Whatever is left in the map never found a mate.
+     if(!pairMap.isEmpty()){
+         final ArrayList<Read> singles=new ArrayList<Read>(pairMap.size());
+         for(Read r : pairMap.values()){
+             singles.add(r);
+             singlesOut++;
+             singleBasesOut+=r.length();
+         }
+         pairMap.clear();
+         if(verbose){System.err.println("Adding "+singles.size()+" to single out.");}
+         if(rosb!=null){rosb.add(singles, 0);}
+     }
+
+     readsOut+=singlesOut+pairsOut;
+     basesOut+=singleBasesOut+pairBasesOut;
+ }
+
+
+ /**
+  * Counts, optionally quality-trims, and classifies one read pair.
+  * If both reads are at least minReadLength long they are linked as mates and r1 is added
+  * to pairs; if only one qualifies it is added to singles; otherwise both are dropped.
+  * Input and output counters are updated on the way.
+  * @param r1 First read, or null.
+  * @param r2 Second read, or null.
+  * @param pairs List receiving r1 of each surviving pair, or null to only count.
+  * @param singles List receiving surviving singletons, or null to only count.
+  * @return Number of non-null reads that were discarded.
+  */
+ private int processPair(Read r1, Read r2, ArrayList<Read> pairs, ArrayList<Read> singles){
+     int removed=0;
+     //r1 is guarded like r2: every later step in this method already tolerates a null r1.
+     if(r1!=null){
+         readsIn++;
+         basesIn+=r1.length();
+     }
+     if(r2!=null){
+         readsIn++;
+         basesIn+=r2.length();
+     }
+
+     if(qtrimLeft || qtrimRight){
+         if(r1!=null){
+             int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+             basesTrimmed+=x;
+             readsTrimmed+=(x>0 ? 1 : 0);
+         }
+         if(r2!=null){
+             int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+             basesTrimmed+=x;
+             readsTrimmed+=(x>0 ? 1 : 0);
+         }
+     }
+     //A missing read is given length -1 for the comparisons below.
+     final int rlen1=(r1==null ? -1 : r1.length());
+     final int rlen2=(r2==null ? -1 : r2.length());
+     if(verbose){System.err.println("rlen="+rlen1+", rlen2="+rlen2);}
+
+     if(rlen1>=minReadLength && rlen2>=minReadLength){
+         if(verbose){System.err.println("Sending to pair out:\t"+r1.id+"\t"+r2.id);}
+         r1.mate=r2;
+         r2.mate=r1;
+         r1.setPairnum(0);
+         r2.setPairnum(1);
+         if(pairs!=null){pairs.add(r1);}
+         pairsOut+=2;
+         pairBasesOut+=(rlen1+rlen2);
+     }else if(rlen1>=minReadLength){
+         if(verbose){System.err.println("Sending r1 to single out:\t"+r1.id+"\t"+(r2==null ? "*" : r2.id));}
+         r1.mate=null;
+         r1.setPairnum(0);
+         if(singles!=null){singles.add(r1);}
+         singlesOut++;
+         singleBasesOut+=rlen1;
+         if(r2!=null){removed++;}
+     }else if(rlen2>=minReadLength){
+         if(verbose){System.err.println("Sending r2 to single out:\t"+(r1==null ? "*" : r1.id)+"\t"+r2.id);}
+         r2.mate=null;
+         r2.setPairnum(0);
+         if(singles!=null){singles.add(r2);}
+         singlesOut++;
+         singleBasesOut+=rlen2;
+         if(r1!=null){removed++;}
+     }else{
+         if(verbose){System.err.println("Removed both reads:\t"+(r1==null ? "*" : r1.id)+"\t"+(r2==null ? "*" : r2.id));}
+         if(r1!=null){removed++;}
+         if(r2!=null){removed++;}
+     }
+     return removed;
+ }
+
+
+ /**
+  * Tries to pair a read with a previously seen read that shares its name prefix.
+  * The prefix is the first whitespace-delimited token of the id (or the text before the
+  * first '/' when the id contains no whitespace). Reads waiting for a mate are kept in pairMap.
+  * @param r Read to process; may be null.
+  * @return The read numbered 0 of a newly completed pair (with mates linked), or null if
+  * r is null, has no id, or has been stored in pairMap to wait for its mate.
+  */
+ private Read repair(Read r){
+ if(r==null){return null;}
+ r.mate=null;
+
+ readsIn++;
+ basesIn+=r.length();
+ final String id=r.id;
+
+ assert(id!=null) : "Read number "+r.numericID+" has no name and thus cannot be re-paired. To ignore this, run with the -da flag.";
+ //With assertions disabled, a nameless read is silently dropped (it has already been counted as input).
+ if(id==null){return null;}
+ final int slash=id.indexOf('/');
+ String[] split=id.split("\\s+");
+
+ //No whitespace in the name: fall back to splitting at the first slash; the second part keeps the slash.
+ if(split.length==1 && slash>0){
+ split=new String[] {id.substring(0, slash), id.substring(slash)};
+ }
+
+ assert(split.length>0);
+ String prefix=split[0];
+ String suffix=(split.length==1 ? null : split[split.length-1]);
+ //Derive a pair number from the last token when it starts with "/1", "1:", "/2" or "2:".
+ if(suffix!=null){
+ if(suffix.startsWith("/1") || suffix.startsWith("1:")){
+ r.setPairnum(0);
+ }else if(suffix.startsWith("/2") || suffix.startsWith("2:")){
+ r.setPairnum(1);
+ }else if(id.contains("/1") || id.contains("/2")){
+ //The marker is elsewhere in the id: re-split on '/', which also replaces the prefix used as map key.
+ split=id.split("/");
+ prefix=split[0];
+ suffix=(split.length==1 ? null : split[split.length-1]);
+
+ if(suffix!=null){
+ if(suffix.startsWith("1")){
+ r.setPairnum(0);
+ }else if(suffix.startsWith("2")){
+ r.setPairnum(1);
+ }
+ }else{
+ //pairnum cannot be determined
+ }
+ }else{
+ //pairnum cannot be determined
+ }
+ }else{
+ //pairnum cannot be determined
+ }
+
+ //Take out any read already waiting under this prefix.
+ Read old=pairMap.remove(prefix);
+
+// System.out.println("Processing:\n"+r+"\n"+old+"\n"+readsIn+", "+readsOut+", "+pairsOut);
+
+ if(old==null){
+ //First read seen with this prefix: park it until a mate shows up.
+ pairMap.put(prefix, r);
+ return null;
+ }else{
+ r.mate=old;
+ old.mate=r;
+
+ int len=r.length()+old.length();
+ pairsOut+=2;
+ pairBasesOut+=len;
+
+ //The earlier read keeps number 1 only if it was already numbered 1; otherwise it becomes number 0.
+ if(old.pairnum()==1){
+ r.setPairnum(0);
+ return r;
+ }else{
+ old.setPairnum(0);
+ r.setPairnum(1);
+ return old;
+ }
+ }
+ }
+
+
+ private String in1=null, in2=null;
+ private String out1=null, out2=null;
+ private String outsingle=null;
+ private long maxReads=-1;
+ public boolean errorState=false;
+
+ long readsIn=0;
+ long basesIn=0;
+ long readsOut=0;
+ long basesOut=0;
+ long pairsOut=0;
+ long pairBasesOut=0;
+ long singlesOut=0;
+ long singleBasesOut=0;
+ long readsTrimmed=0;
+ long basesTrimmed=0;
+
+ private final LinkedHashMap<String, Read> pairMap;
+
+ private byte trimq=6;
+ private int minReadLength=20;
+ private final boolean qtrimLeft, qtrimRight;
+
+ private final boolean EA;
+ private final boolean fixInterleaving;
+ private final boolean allowIdenticalPairNames;
+ private final boolean repair;
+
+ private static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ public static boolean showSpeed=true;
+ public static boolean verbose=false;
+
+}
diff --git a/current/jgi/SplitSam4Way.java b/current/jgi/SplitSam4Way.java
new file mode 100755
index 0000000..9d53806
--- /dev/null
+++ b/current/jgi/SplitSam4Way.java
@@ -0,0 +1,122 @@
+package jgi;
+
+import java.io.PrintStream;
+import java.util.Arrays;
+
+import stream.SamLine;
+
+import dna.Gene;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 23, 2013
+ *
+ */
+public class SplitSam4Way {
+
+	/**
+	 * Program entry point; all of the work happens in the constructor.
+	 * @param args Exactly five paths: input, plus, minus, chimeric, and unmapped output.
+	 */
+	public static void main(String[] args){
+		new SplitSam4Way(args);
+	}
+
+	/** Prints the command-line syntax to outstream. */
+	private void printOptions(){
+		outstream.println("Syntax:\n");
+		outstream.println("java -ea -Xmx128m -cp <path> jgi.SplitSam4Way <input> <out plus> <out minus> <out chimeric> <out unmapped>");
+		outstream.println("If you do not want one of the output files, use the word 'null'.\n");
+	}
+
+	/**
+	 * Splits the sam file named by args[0] into up to four sam files and prints statistics.
+	 * Header lines (starting with '@') are copied to every open output.
+	 * Each alignment line goes to exactly one category, tested in this order:
+	 * unmapped (not mapped, mate not mapped, no mate, or not primary),
+	 * chimeric (pair on different chromosomes, or both reads on the same strand),
+	 * plus, then minus (strand of the first fragment of the pair).
+	 * An output given as the word 'null' is counted but not written.
+	 * @param args input, out plus, out minus, out chimeric, out unmapped
+	 */
+	public SplitSam4Way(String[] args){
+		if(args==null || args.length!=5){
+			printOptions();
+			System.exit(0);
+		}
+
+		for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+		outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+		Timer t=new Timer();
+		long reads=0, bases=0;
+		long preads=0, mreads=0, creads=0, ureads=0;
+
+		String fin=args[0];
+		String fplus=args[1];
+		String fminus=args[2];
+		String fchimeric=args[3];
+		String funmapped=args[4];
+
+		TextFile tf=new TextFile(fin, true, false);
+		TextStreamWriter plus=("null".equalsIgnoreCase(fplus) ? null : new TextStreamWriter(fplus, true, false, true, FileFormat.SAM));
+		TextStreamWriter minus=("null".equalsIgnoreCase(fminus) ? null : new TextStreamWriter(fminus, true, false, true, FileFormat.SAM));
+		TextStreamWriter chimeric=("null".equalsIgnoreCase(fchimeric) ? null : new TextStreamWriter(fchimeric, true, false, true, FileFormat.SAM));
+		TextStreamWriter unmapped=("null".equalsIgnoreCase(funmapped) ? null : new TextStreamWriter(funmapped, true, false, true, FileFormat.SAM));
+
+		if(plus!=null){plus.start();}
+		if(minus!=null){minus.start();}
+		if(chimeric!=null){chimeric.start();}
+		if(unmapped!=null){unmapped.start();}
+
+		for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
+			//A blank line is neither header nor alignment; charAt(0) would throw on it.
+			if(line.isEmpty()){continue;}
+			if(line.charAt(0)=='@'){
+				if(plus!=null){plus.println(line);}
+				if(minus!=null){minus.println(line);}
+				if(chimeric!=null){chimeric.println(line);}
+				if(unmapped!=null){unmapped.println(line);}
+			}else{
+				SamLine sl=new SamLine(line);
+				reads++;
+				bases+=sl.seq.length;
+
+				if(!sl.mapped() || !sl.nextMapped() || !sl.hasMate() || !sl.primary()){
+					if(unmapped!=null){unmapped.println(line);}
+					ureads++;
+				}else if(!sl.pairedOnSameChrom() || sl.strand()==sl.nextStrand()){
+					if(chimeric!=null){chimeric.println(line);}
+					creads++;
+				}else if((sl.firstFragment() ? sl.strand() : sl.nextStrand())==Gene.PLUS){
+					//The pair is classified by the strand of its first fragment.
+					if(plus!=null){plus.println(line);}
+					preads++;
+				}else if((sl.firstFragment() ? sl.strand() : sl.nextStrand())==Gene.MINUS){
+					if(minus!=null){minus.println(line);}
+					mreads++;
+				}else{
+					throw new RuntimeException("Unhandled case: "+sl.firstFragment()+", "+sl.lastFragment()+", "+sl.strand()+", "+sl.nextStrand()+"\n"+sl+"\n");
+				}
+			}
+		}
+
+		//Release the input handle; it was previously left open.
+		tf.close();
+
+		if(plus!=null){plus.poisonAndWait();}
+		if(minus!=null){minus.poisonAndWait();}
+		if(chimeric!=null){chimeric.poisonAndWait();}
+		if(unmapped!=null){unmapped.poisonAndWait();}
+		t.stop();
+
+
+		double rpnano=reads/(double)(t.elapsed);
+		double bpnano=bases/(double)(t.elapsed);
+
+		String rpstring=(reads<100000 ? ""+reads : reads<100000000 ? (reads/1000)+"k" : (reads/1000000)+"m");
+		String bpstring=(bases<100000 ? ""+bases : bases<100000000 ? (bases/1000)+"k" : (bases/1000000)+"m");
+
+		//Left-pad both counts to a fixed width of 8 so the columns line up.
+		while(rpstring.length()<8){rpstring=" "+rpstring;}
+		while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+		outstream.println("Time: \t"+t);
+		outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+		outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+		outstream.println("Plus Reads: "+preads);
+		outstream.println("Minus Reads: "+mreads);
+		outstream.println("Chimeric Reads: "+creads);
+		outstream.println("Unmapped Reads: "+ureads);
+
+
+	}
+
+	/** Destination for usage text and statistics. */
+	private PrintStream outstream=System.err;
+
+}
diff --git a/current/jgi/SplitSamFile.java b/current/jgi/SplitSamFile.java
new file mode 100755
index 0000000..6b38304
--- /dev/null
+++ b/current/jgi/SplitSamFile.java
@@ -0,0 +1,87 @@
+package jgi;
+
+import align2.Tools;
+import stream.SamLine;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteStreamWriter;
+
+public class SplitSamFile {
+
+
+	/**
+	 * Splits a sam file into forward-strand, reverse-strand, and unmapped outputs,
+	 * using only the flag field of each alignment line, then prints the counts.
+	 * Header lines (starting with '@') are copied to every open output only when
+	 * the fifth argument is the word "header".
+	 * An output that is not supplied is counted but not written.
+	 * @param args input [out forward] [out reverse] [out unmapped] [header]
+	 */
+	public static void main(String[] args){
+
+		if(args==null || args.length<1){
+			//Without this check args[0] below fails with ArrayIndexOutOfBoundsException.
+			System.err.println("Syntax:\n");
+			System.err.println("java -cp <path> jgi.SplitSamFile <input> [out forward] [out reverse] [out unmapped] [header]");
+			System.exit(1);
+		}
+
+		Timer t=new Timer();
+
+		String in=args[0];
+		String outF=args.length>1 ? args[1] : null;
+		String outR=args.length>2 ? args[2] : null;
+		String outU=args.length>3 ? args[3] : null;
+		//Assign both ways so a previous call in the same JVM cannot leave the static flag set.
+		includeHeader=(args.length>4 && args[4].equalsIgnoreCase("header"));
+
+		ByteFile tf=ByteFile.makeByteFile(in, true, false);
+
+		Tools.testForDuplicateFiles(true, in, outF, outR, outU);
+		Tools.testOutputFiles(true, false, false, outF, outR, outU);
+
+		final ByteStreamWriter fStream, rStream, uStream;
+
+		fStream=(outF==null ? null : new ByteStreamWriter(outF, true, false, true));
+		rStream=(outR==null ? null : new ByteStreamWriter(outR, true, false, true));
+		uStream=(outU==null ? null : new ByteStreamWriter(outU, true, false, true));
+
+		if(fStream!=null){fStream.start();}
+		if(rStream!=null){rStream.start();}
+		if(uStream!=null){uStream.start();}
+
+		long plus=0;
+		long minus=0;
+		long other=0;
+
+		byte[] s=null;
+		for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+			if(s.length>0){
+				byte c=s[0];
+				if(c=='@'){
+					if(includeHeader){
+						if(fStream!=null){fStream.println(s);}
+						if(rStream!=null){rStream.println(s);}
+						if(uStream!=null){uStream.println(s);}
+					}
+				}else{
+					int flag=SamLine.parseFlagOnly(s);
+					if(SamLine.mapped(flag)){
+						//Strand 0 goes to the forward output, anything else to reverse.
+						if(SamLine.strand(flag)==0){
+							if(fStream!=null){fStream.println(s);}
+							plus++;
+						}else{
+							if(rStream!=null){rStream.println(s);}
+							minus++;
+						}
+					}else{
+						if(uStream!=null){uStream.println(s);}
+						other++;
+					}
+				}
+			}
+		}
+		tf.close();
+		if(fStream!=null){fStream.poisonAndWait();}
+		if(rStream!=null){rStream.poisonAndWait();}
+		if(uStream!=null){uStream.poisonAndWait();}
+
+		System.err.println("Total reads: \t"+(plus+minus+other));
+		System.err.println("Plus reads: \t"+(plus));
+		System.err.println("Minus reads: \t"+(minus));
+		System.err.println("Unmapped reads:\t"+(other));
+
+		t.stop();
+
+		System.err.println("Time: \t"+t);
+
+	}
+
+	/** True when header lines should be copied to the outputs. */
+	private static boolean includeHeader=false;
+}
diff --git a/current/jgi/SynthMDA.java b/current/jgi/SynthMDA.java
new file mode 100755
index 0000000..9484dfd
--- /dev/null
+++ b/current/jgi/SynthMDA.java
@@ -0,0 +1,447 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+import stream.ByteBuilder;
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.KillSwitch;
+import stream.Read;
+import align2.ListNum;
+import align2.RandomReads3;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 17, 2014
+ *
+ */
+public class SynthMDA {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Program entry point: starts a timer, parses the arguments, and runs the job.
+	 * @param args Command-line arguments
+	 */
+	public static void main(String[] args){
+		final Timer timer=new Timer();
+		final SynthMDA job=new SynthMDA(args);
+		job.process(timer);
+	}
+
+	/**
+	 * Parses the command line and prepares the input/output file formats.
+	 * Arguments are key=value pairs; leading hyphens on the key are ignored and
+	 * a value of "null" is treated as absent. Keys not handled here are offered
+	 * to the shared Parser. The first bare argument may name the input and the
+	 * second the output.
+	 * @param args Command-line arguments
+	 * @throws RuntimeException If no reference is given or the output cannot be written
+	 */
+	public SynthMDA(String[] args){
+
+		args=Parser.parseConfig(args);
+		if(Parser.parseHelp(args, true)){
+			printOptions();
+			System.exit(0);
+		}
+
+		for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+		outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+
+		Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+		Shared.capBuffers(4);
+		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+		FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
+
+		Parser parser=new Parser();
+		parser.build=7;
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			if(b==null || b.equalsIgnoreCase("null")){b=null;}
+			while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+			if(a.equals("verbose")){
+				verbose=Tools.parseBoolean(b);
+				ByteFile1.verbose=verbose;
+				ByteFile2.verbose=verbose;
+				stream.FastaReadInputStream.verbose=verbose;
+				ConcurrentGenericReadInputStream.verbose=verbose;
+//				align2.FastaReadInputStream2.verbose=verbose;
+				stream.FastqReadInputStream.verbose=verbose;
+				ReadWrite.verbose=verbose;
+			}else if(a.equals("minlen") || a.equals("ml")){
+				minlen=Integer.parseInt(b);
+			}else if(a.equals("maxlen") || a.equals("mxl")){
+				maxlen=Integer.parseInt(b);
+			}else if(a.equals("cycles")){
+				cycles=Integer.parseInt(b);
+			}else if(a.equals("initialratio")){
+				initialRatio=Float.parseFloat(b);
+			}else if(a.equals("ratio")){
+				ratio=Float.parseFloat(b);
+			}else if(a.equals("refout")){
+				out1=b;
+			}else if(a.equals("perfect")){
+				perfectrate=Float.parseFloat(b);
+			}else if(a.equals("length")){
+				readlength=Integer.parseInt(b);
+			}else if(a.equals("paired")){
+				paired=Tools.parseBoolean(b);
+			}else if(a.equals("amp")){
+				amp=Integer.parseInt(b);
+			}else if(a.equals("build")){
+				//This branch is reached before parser.parse(), and the build field is
+				//overwritten from parser.build below. Store the value on the parser so
+				//it survives, instead of asserting false (which made build= unusable).
+				parser.build=Integer.parseInt(b);
+			}else if(a.equals("ref")){
+				ref=b;
+			}else if(a.equals("prefix")){
+				prefix=b;
+			}else if(parser.parse(arg, a, b)){
+				//do nothing
+			}else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+				parser.in1=arg;
+			}else if(parser.out1==null && i==1 && !arg.contains("=")){
+				parser.out1=arg;
+			}else{
+				outstream.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+				//				throw new RuntimeException("Unknown parameter "+args[i]);
+			}
+		}
+		//The discard threshold may never exceed the minimum fragment length.
+		minlen2=Tools.min(minlen2, minlen);
+
+		{//Process parser fields
+			Parser.processQuality();
+
+			if(parser.maxReads>0){reads=parser.maxReads;}
+
+			overwrite=ReadStats.overwrite=parser.overwrite;
+			append=ReadStats.append=parser.append;
+
+			if(ref==null){ref=parser.in1;}
+
+			readsOut=parser.out1;
+
+			extref=parser.extin;
+			extout=parser.extout;
+			build=parser.build;
+		}
+
+		assert(FastaReadInputStream.settingsOK());
+
+		if(ref==null){
+			printOptions();
+			throw new RuntimeException("Error - input reference must be specified.");
+		}
+
+		if(out1==null){
+			//No refout given: write the amplified reference to a randomly named fasta file.
+			out1=ReadWrite.stripToCore(ref)+"_"+Long.toHexString(new Random().nextLong()&Long.MAX_VALUE)+".fa";
+		}
+
+		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){
+			ByteFile.FORCE_MODE_BF2=false;
+			ByteFile.FORCE_MODE_BF1=true;
+		}
+
+		if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+			outstream.println((out1==null)+", "+out1);
+			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+		}
+
+		ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
+
+		ffref=FileFormat.testInput(ref, FileFormat.FASTQ, extref, true, true);
+	}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Runs the whole job:
+	 * loads every reference sequence into one buffer delimited by '$',
+	 * amplifies it once with initialRatio and then 'cycles' more times with ratio,
+	 * writes the resulting fragments as numbered fasta records to out1,
+	 * and, when a reads output was requested, calls RandomReads3 on that file
+	 * and then deletes it.
+	 * @param t Timer started by the caller; stopped here before statistics are printed
+	 * @throws RuntimeException If any stage set errorState
+	 */
+	void process(Timer t){
+
+		ByteBuilder bb=new ByteBuilder();
+		bb.append('$');
+
+		final ConcurrentReadInputStream cris;
+		{
+			cris=ConcurrentReadInputStream.getReadInputStream(-1, false, ffref, null);
+			if(verbose){outstream.println("Started cris");}
+			cris.start(); //4567
+		}
+		assert(!cris.paired());
+
+		long readsProcessed=0;
+		long basesProcessed=0;
+
+		{
+
+			ListNum<Read> ln=cris.nextList();
+			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+			if(reads!=null && !reads.isEmpty()){
+				Read r=reads.get(0);
+				assert((ffref==null || ffref.samOrBam()) || (r.mate!=null)==cris.paired());
+			}
+
+			while(reads!=null && reads.size()>0){
+
+				for(int idx=0; idx<reads.size(); idx++){
+					final Read r1=reads.get(idx);
+
+					final int initialLength1=r1.length();
+
+					//Append the sequence followed by the '$' delimiter.
+					bb.append(r1.bases);
+					bb.append('$');
+
+					readsProcessed++;
+					basesProcessed+=initialLength1;
+				}
+
+				cris.returnList(ln.id, ln.list.isEmpty());
+				ln=cris.nextList();
+				reads=(ln!=null ? ln.list : null);
+			}
+			if(ln!=null){
+				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+			}
+		}
+		errorState|=ReadStats.writeAll();
+		errorState|=ReadWrite.closeStream(cris);
+
+		//First pass never extends the source in place; only the first later cycle may.
+		ByteBuilder dest=amplify(bb, false, minlen, maxlen, initialRatio);
+		bb=null;
+		for(int i=0; i<cycles; i++){
+			dest=amplify(dest, i<1, minlen, maxlen, ratio);
+		}
+
+		//The writer is null when there is no output format; guard every use of it.
+		final TextStreamWriter tsw=(ffout1==null ? null : new TextStreamWriter(ffout1));
+		if(tsw!=null){
+			tsw.start();
+
+			bb=new ByteBuilder();
+			for(int i=0, id=1; i<dest.length(); i++){
+				byte b=dest.get(i);
+				if(b=='$'){
+					//A delimiter ends the current fragment; empty fragments are not written.
+					if(bb.length()>0){
+						tsw.print(">"+id+"\n");
+						tsw.println(bb.toString());
+						id++;
+					}
+					bb.setLength(0);
+				}else{
+					bb.append(b);
+				}
+			}
+			errorState|=tsw.poisonAndWait();
+		}
+		dest=null;
+
+		t.stop();
+
+		double rpnano=readsProcessed/(double)(t.elapsed);
+		double bpnano=basesProcessed/(double)(t.elapsed);
+
+		String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+		String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+		while(rpstring.length()<8){rpstring=" "+rpstring;}
+		while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+		outstream.println("Time: \t"+t);
+		outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+		outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+		if(readsOut!=null){
+			FileFormat ff=FileFormat.testOutput(readsOut, FileFormat.FASTQ, null, true, overwrite, false, false);
+			assert(ff!=null);
+			//Build the argument list for RandomReads3; 'reads' here is the field, not the list above.
+			ArrayList<String> list=new ArrayList<String>();
+			list.add("reads="+reads);
+			list.add("length="+readlength);
+			list.add("amp="+amp);
+			if(paired){
+				list.add("paired="+paired);
+				list.add("interleaved="+paired);
+			}
+			list.add("build="+build);
+			list.add("out="+readsOut);
+			list.add("ow="+overwrite);
+			list.add("minq="+16);
+			list.add("midq="+25);
+			list.add("maxq="+38);
+			list.add("adderrors");
+			list.add("snprate="+0.02);
+			list.add("delrate="+0.005);
+			list.add("insrate="+0.005);
+			list.add("nrate="+0.005);
+			list.add("maxinslen="+3);
+			list.add("maxdellen="+3);
+			list.add("maxnlen="+3);
+			list.add("maxinss="+2);
+			list.add("maxdels="+2);
+			list.add("maxns="+2);
+			list.add("maxsnps="+2);
+			list.add("seed=-1");
+			list.add("ref="+out1);
+			if(prefix!=null){list.add("prefix="+prefix);}
+			if(perfectrate>0){
+				list.add("perfect="+perfectrate);
+			}
+			RandomReads3.main(list.toArray(new String[list.size()]));
+		}
+
+		//The amplified reference is only an intermediate when reads were generated from it.
+		boolean deleteRef=(readsOut!=null);
+		if(deleteRef && out1!=null){
+			if(verbose){System.err.println("Trying to delete "+out1);}
+			try {
+				File f=new File(out1);
+				if(f.exists()){f.delete();}
+			} catch (Exception e) {
+				//Deletion is best-effort; report and continue.
+				e.printStackTrace();
+			}
+		}
+
+		if(errorState){
+			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+		}
+	}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Copies random fragments of source into a destination buffer until it reaches
+	 * its target length, which is source length times ratio, capped at 600000000.
+	 * Each fragment starts at a random position, has a random length between minlen
+	 * and maxlen, and is read either forward or, base-complemented, backward.
+	 * A fragment stops early at a '$' delimiter and is always followed by one.
+	 * Fragments that add fewer than min(minlen2, minlen) bytes, delimiter included,
+	 * are discarded.
+	 * @param source Sequence data delimited by '$'
+	 * @param retain Append to source itself instead of a new buffer (ignored for sources of 500000000 or more)
+	 * @param minlen Minimum fragment length
+	 * @param maxlen Maximum fragment length
+	 * @param ratio Target length as a multiple of the source length
+	 * @return The destination buffer
+	 */
+	private ByteBuilder amplify(ByteBuilder source, boolean retain, int minlen, int maxlen, float ratio){
+		assert(minlen<=maxlen && minlen>0 && maxlen>0);
+		final int sourceLen=source.length();
+		final int span=maxlen-minlen+1;
+		if(sourceLen<minlen2*1.1f){
+			KillSwitch.kill("Input ("+sourceLen+") must be at least 10% longer than minlen ("+minlen2+").");
+		}
+
+		final boolean inPlace=(retain && sourceLen<500000000);
+		final ByteBuilder dest;
+		if(inPlace){
+			dest=source;
+		}else{
+			dest=new ByteBuilder();
+		}
+
+		final int target=(int)Tools.min((long)(sourceLen*ratio), 600000000);
+		final int shortestKept=Tools.min(minlen2, minlen);
+
+		while(dest.length()<target){
+			final long before=dest.length();
+
+			//Position, length, then direction: the draw order is part of the behavior.
+			final int origin=randy.nextInt(sourceLen);
+			final int fragLen=minlen+randy.nextInt(span);
+			final boolean forward=randy.nextBoolean();
+			if(before+(long)fragLen>1500000000){break;}
+
+			if(!forward){
+				//Walk backward, emitting the complement of each base.
+				final int limit=Tools.max(0, origin-fragLen);
+				int pos=origin;
+				while(pos>=limit){
+					final byte base=source.get(pos);
+					if(base=='$'){break;}
+					dest.append(AminoAcid.baseToComplementExtended[base]);
+					pos--;
+				}
+			}else{
+				final int limit=Tools.min(source.length(), origin+fragLen);
+				int pos=origin;
+				while(pos<limit){
+					final byte base=source.get(pos);
+					if(base=='$'){break;}
+					dest.append(base);
+					pos++;
+				}
+			}
+			dest.append('$');
+
+			//Roll back fragments that came out too short.
+			final long gained=dest.length()-before;
+			if(gained<shortestKept){dest.setLength((int)before);}
+		}
+		return dest;
+	}
+
+ /*--------------------------------------------------------------*/
+
+	/**
+	 * Prints command-line usage to outstream.
+	 * Lists only the options parsed by this class's constructor, with the defaults
+	 * taken from the corresponding fields. This used to be an unconditional
+	 * assert(false), which turned a help request or a missing reference into an
+	 * AssertionError whenever assertions were enabled.
+	 */
+	private void printOptions(){
+		outstream.println("Syntax:\n");
+		outstream.println("java -ea -cp <path> jgi.SynthMDA ref=<reference> out=<reads file>");
+		outstream.println("\nOther parameters and their defaults:\n");
+		outstream.println("refout=<file>    \tWrite the amplified reference here; default is a random name");
+		outstream.println("minlen="+minlen+"     \t(ml) Minimum fragment length");
+		outstream.println("maxlen="+maxlen+"    \t(mxl) Maximum fragment length");
+		outstream.println("cycles="+cycles+"         \tAmplification cycles after the initial one");
+		outstream.println("initialratio="+initialRatio+" \tGrowth ratio of the initial cycle");
+		outstream.println("ratio="+ratio+"        \tGrowth ratio of each later cycle");
+		outstream.println("length="+readlength+"       \tLength of generated reads");
+		outstream.println("paired="+paired+"      \tGenerate paired reads");
+		outstream.println("amp="+amp+"          \tPassed to the read generator as amp");
+		outstream.println("perfect="+perfectrate+"      \tPassed to the read generator as perfect, when above 0");
+		outstream.println("prefix=<string>  \tPassed to the read generator as prefix");
+		outstream.println("build="+build+"          \tPassed to the read generator as build");
+		outstream.println("verbose="+verbose+"    \tPrint verbose messages");
+	}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private String ref=null; //Input reference path (ref=); falls back to the parser's in1
+ private String out1=null; //Amplified reference fasta path (refout=); a random name is generated when unset
+
+ private String extref=null; //Extension override for the reference, from the parser's extin
+ private String extout=null; //Extension override for the output, from the parser's extout
+
+ private final FileFormat ffref; //Format descriptor for ref
+ private final FileFormat ffout1; //Format descriptor for out1
+
+ /*--------------------------------------------------------------*/
+
+ private int minlen=10000; //Minimum fragment length drawn by amplify
+ private int minlen2=4000; //Fragments shorter than this are discarded by amplify; capped at minlen
+ private int maxlen=150000; //Maximum fragment length drawn by amplify
+ private int cycles=9; //Number of amplify passes after the initial one
+ private float initialRatio=1.3f; //Growth ratio of the initial amplify pass
+ private float ratio=2; //Growth ratio of each later amplify pass
+
+ private String prefix=null; //Forwarded to RandomReads3 as prefix= when set
+
+ /*--------------------------------------------------------------*/
+
+ private long reads=12000000; //Forwarded to RandomReads3 as reads=; replaced by the parser's maxReads when positive
+ private int readlength=150; //Forwarded to RandomReads3 as length=
+ private int amp=200; //Forwarded to RandomReads3 as amp=
+ private boolean paired=true; //When true, paired= and interleaved= are forwarded to RandomReads3
+ private int build=7; //Forwarded to RandomReads3 as build=
+ private String readsOut=null; //Reads output path from the parser's out1; null skips read generation
+ private float perfectrate=0; //Forwarded to RandomReads3 as perfect= when above 0
+
+ private final Random randy=new Random(); //Random source for fragment position, length, and direction
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination for messages and statistics
+ public static boolean verbose=false; //Enables verbose messages here and in the file readers
+ public boolean errorState=false; //Set when any stage reports a failure; checked at the end of process
+ private boolean overwrite=false; //Permission to overwrite existing files
+ private boolean append=false; //Permission to append to existing files
+
+}
diff --git a/current/jgi/TranslateSixFrames.java b/current/jgi/TranslateSixFrames.java
new file mode 100755
index 0000000..29f2011
--- /dev/null
+++ b/current/jgi/TranslateSixFrames.java
@@ -0,0 +1,455 @@
+package jgi;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+import stream.SamLine;
+
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 11, 2012
+ *
+ */
+public class TranslateSixFrames {
+
+	/**
+	 * Program entry point: starts a timer, parses the arguments, and runs the job.
+	 * @param args Command-line arguments
+	 */
+	public static void main(String[] args){
+		final Timer timer=new Timer();
+		final TranslateSixFrames translator=new TranslateSixFrames(args);
+		translator.process(timer);
+	}
+
+	/**
+	 * Parses the command line and prepares the input/output file formats.
+	 * Arguments are key=value pairs; leading hyphens on the key are ignored.
+	 * Every argument is first offered to the shared Parser, then matched against
+	 * the options of this class. The first bare argument may name the input and
+	 * the second the output.
+	 * @param args Command-line arguments
+	 * @throws IllegalArgumentException If frames is outside 0 to 6
+	 * @throws RuntimeException If no input is given, out2 is given without out1,
+	 * or the outputs cannot be written
+	 */
+	public TranslateSixFrames(String[] args){
+
+		args=Parser.parseConfig(args);
+		if(Parser.parseHelp(args, true)){
+			printOptions();
+			System.exit(0);
+		}
+
+		for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}}
+		outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+		boolean setInterleaved=false; //Whether it was explicitly set.
+
+
+
+		Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+		Shared.capBuffers(4);
+		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+		ReadWrite.ZIP_THREAD_DIVISOR=1;
+		int frames=6;
+
+		Parser parser=new Parser();
+		for(int i=0; i<args.length; i++){
+			String arg=args[i];
+			String[] split=arg.split("=");
+			String a=split[0].toLowerCase();
+			String b=split.length>1 ? split[1] : null;
+			while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+			if(parser.parse(arg, a, b)){
+				//do nothing
+			}else if(a.equals("verbose")){
+				verbose=Tools.parseBoolean(b);
+				ByteFile1.verbose=verbose;
+				ByteFile2.verbose=verbose;
+				stream.FastaReadInputStream.verbose=verbose;
+				ConcurrentGenericReadInputStream.verbose=verbose;
+//				align2.FastaReadInputStream2.verbose=verbose;
+				stream.FastqReadInputStream.verbose=verbose;
+				ReadWrite.verbose=verbose;
+			}else if(a.equals("tag")){
+				addTag=Tools.parseBoolean(b);
+			}else if(a.equals("skipquality")){
+				skipquality=Tools.parseBoolean(b);
+			}else if(a.equals("translatequality")){
+				skipquality=!Tools.parseBoolean(b);
+			}else if(a.equals("frames")){
+				frames=Integer.parseInt(b);
+				assert(frames>=0 && frames<=6) : "Frames must be in the range of 0 to 6";
+				//Also enforced when assertions are off: frames indexes six-element arrays
+				//and sizes a list, so an out-of-range value would fail obscurely later.
+				if(frames<0 || frames>6){
+					throw new IllegalArgumentException("Frames must be in the range of 0 to 6");
+				}
+			}else if(a.equals("aain")){
+				NT_IN=!Tools.parseBoolean(b);
+			}else if(a.equals("ntin")){
+				NT_IN=Tools.parseBoolean(b);
+			}else if(a.equals("aaout")){
+				NT_OUT=!Tools.parseBoolean(b);
+			}else if(a.equals("ntout")){
+				NT_OUT=Tools.parseBoolean(b);
+			}else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+				parser.in1=arg;
+			}else if(parser.out1==null && i==1 && !arg.contains("=")){
+				parser.out1=arg;
+			}else{
+				System.err.println("Unknown parameter "+args[i]);
+				assert(false) : "Unknown parameter "+args[i];
+				//				throw new RuntimeException("Unknown parameter "+args[i]);
+			}
+		}
+		FRAMES=frames;
+		Shared.AMINO_IN=!NT_IN;
+
+		{//Process parser fields
+			Parser.processQuality();
+
+			maxReads=parser.maxReads;
+			samplerate=parser.samplerate;
+			sampleseed=parser.sampleseed;
+
+			overwrite=parser.overwrite;
+			append=parser.append;
+
+			setInterleaved=parser.setInterleaved;
+
+			in1=parser.in1;
+			in2=parser.in2;
+			qfin1=parser.qfin1;
+			qfin2=parser.qfin2;
+
+			out1=parser.out1;
+			out2=parser.out2;
+			qfout1=parser.qfout1;
+			qfout2=parser.qfout2;
+
+			extin=parser.extin;
+			extout=parser.extout;
+		}
+
+		//A '#' in a name that is not an existing file expands to a 1/2 pair of files.
+		if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+			in2=in1.replace("#", "2");
+			in1=in1.replace("#", "1");
+		}
+		if(out1!=null && out2==null && out1.indexOf('#')>-1){
+			out2=out1.replace("#", "2");
+			out1=out1.replace("#", "1");
+		}
+		if(in2!=null){
+			if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");}
+			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+		}
+
+		assert(FastaReadInputStream.settingsOK());
+//		if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;}
+
+		if(in1==null){
+			printOptions();
+			throw new RuntimeException("Error - at least one input file is required.");
+		}
+		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+//			if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;}
+			ByteFile.FORCE_MODE_BF2=true;
+		}
+
+		if(out1==null){
+			if(out2!=null){
+				printOptions();
+				throw new RuntimeException("Error - cannot define out2 without defining out1.");
+			}
+			if(!parser.setOut){
+				out1="stdout";
+			}
+		}
+
+		if(!setInterleaved){
+			assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+			if(in2!=null){ //If there are 2 input streams.
+				FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+				outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+			}else{ //There is one input stream.
+				if(out2!=null){
+					FASTQ.FORCE_INTERLEAVED=true;
+					FASTQ.TEST_INTERLEAVED=false;
+					outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+				}
+			}
+		}
+
+		if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+		if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;}
+
+		if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+		}
+
+		ffout1=FileFormat.testOutput(out1, FileFormat.FASTA, extout, true, overwrite, append, false);
+		ffout2=FileFormat.testOutput(out2, FileFormat.FASTA, extout, true, overwrite, append, false);
+
+		ffin1=FileFormat.testInput(in1, FileFormat.FASTA, extin, true, true);
+		ffin2=FileFormat.testInput(in2, FileFormat.FASTA, extin, true, true);
+
+		//Quality is skipped whenever either end of the pipeline is fasta.
+		if((ffout1!=null && ffout1.fasta()) || (ffin1!=null && ffin1.fasta())){skipquality=true;}
+	}
+
+	/**
+	 * Streams the input reads, translates each one into up to FRAMES amino acid
+	 * reads when the input is nucleotide, optionally converts the results back to
+	 * nucleotides, writes them, and prints statistics.
+	 * @param t Timer started by the caller; stopped here before statistics are printed
+	 * @throws RuntimeException If a stream reported an error
+	 */
+	void process(Timer t){
+
+		final ConcurrentReadInputStream cris;
+		{
+			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, useSharedHeader, ffin1, ffin2, qfin1, qfin2);
+			cris.setSampleRate(samplerate, sampleseed);
+			if(verbose){System.err.println("Started cris");}
+			cris.start(); //4567
+		}
+		boolean paired=cris.paired();
+		if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));}
+
+		ConcurrentReadOutputStream ros=null;
+		if(out1!=null){
+			final int buff=4;
+
+			if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){
+				outstream.println("Writing interleaved.");
+			}
+
+			//Neither output may share a name with either input.
+			//The first check used to compare out1 with in1 twice and never with in2.
+			assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in2)) : "Input file and output file have same name.";
+			assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "Input file and output file have same name.";
+
+//			System.err.println("Calling ConcurrentReadOutputStream with out1="+out1+", out2="+out2+", qfout1="+qfout1+", qfout2="+qfout2);
+			ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, useSharedHeader);
+			ros.start();
+		}
+
+		long readsProcessed=0;
+		long basesProcessed=0;
+
+		long readsOut1=0;
+		long readsOut2=0;
+
+		long basesOut1=0;
+		long basesOut2=0;
+
+		{
+
+			ListNum<Read> ln=cris.nextList();
+			ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+//			System.err.println("Fetched "+reads);
+
+			if(reads!=null && !reads.isEmpty()){
+				Read r=reads.get(0);
+				assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+			}
+
+			while(reads!=null && reads.size()>0){
+
+				//Nucleotide input yields up to FRAMES output reads per input read.
+				final ArrayList<Read> listOut=new ArrayList<Read>(reads.size()*(NT_IN ? FRAMES : 1));
+
+				for(int idx=0; idx<reads.size(); idx++){
+					final Read r1=reads.get(idx);
+					final Read r2=r1.mate;
+
+					final int initialLength1=r1.length();
+					final int initialLength2=(r1.mateLength());
+
+					if(NT_IN){//Translate to amino acids
+						toFrames(r1, skipquality, addTag, FRAMES, listOut);
+					}else{
+						listOut.add(r1);
+					}
+
+
+					{
+						readsProcessed++;
+						basesProcessed+=initialLength1;
+					}
+					if(r2!=null){
+						readsProcessed++;
+						basesProcessed+=initialLength2;
+					}
+
+					if(addslash){
+						if(r1.id==null){r1.id=""+r1.numericID;}
+						if(!r1.id.contains(" /1")){r1.id+=" /1";}
+						if(r2!=null){
+							if(r2.id==null){r2.id=""+r2.numericID;}
+							if(!r2.id.contains(" /2")){r2.id+=" /2";}
+						}
+					}
+				}
+
+				if(NT_OUT){//Translate to nucleotides
+					for(int i=0; i<listOut.size(); i++){
+						final Read aa1=listOut.get(i);
+						final Read aa2=aa1.mate;
+						final Read nt1=aa1.aminoToNucleic();
+						if(aa2!=null){
+							final Read nt2=aa2.aminoToNucleic();
+							nt1.mate=nt2;
+							nt2.mate=nt1;
+						}
+						listOut.set(i, nt1);
+					}
+				}
+
+				for(Read r1 : listOut){
+					Read r2=r1.mate;
+					readsOut1++;
+					basesOut1+=r1.length();
+
+					if(r2!=null){
+						readsOut2++;
+						basesOut2+=r2.length();
+					}
+				}
+
+				if(ros!=null){ros.add(listOut, ln.id);}
+
+				cris.returnList(ln.id, ln.list.isEmpty());
+				ln=cris.nextList();
+				reads=(ln!=null ? ln.list : null);
+			}
+			if(ln!=null){
+				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+			}
+		}
+
+		errorState|=ReadWrite.closeStreams(cris, ros);
+
+		t.stop();
+
+		double rpnano=readsProcessed/(double)(t.elapsed);
+		double bpnano=basesProcessed/(double)(t.elapsed);
+
+		long readsOut=readsOut1+readsOut2;
+		long basesOut=basesOut1+basesOut2;
+
+		String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+		String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+		String rostring=(readsOut<100000 ? ""+readsOut : readsOut<100000000 ? (readsOut/1000)+"k" : (readsOut/1000000)+"m");
+		String aastring=(basesOut<100000 ? ""+basesOut : basesOut<100000000 ? (basesOut/1000)+"k" : (basesOut/1000000)+"m");
+
+		//Left-pad every count to a fixed width of 8 so the columns line up.
+		while(rpstring.length()<8){rpstring=" "+rpstring;}
+		while(bpstring.length()<8){bpstring=" "+bpstring;}
+		while(rostring.length()<8){rostring=" "+rostring;}
+		while(aastring.length()<8){aastring=" "+aastring;}
+
+		outstream.println("Time: \t"+t);
+		outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+		outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+		outstream.println("Reads Out: "+rostring);
+		outstream.println((NT_OUT ? "Bases Out: " : "Amino Acids Out: ")+aastring);
+
+		if(errorState){
+			throw new RuntimeException("TranslateSixFrames terminated in an error state; the output may be corrupt.");
+		}
+	}
+
+	/**
+	 * Translates a read into a newly allocated list; see the five-argument overload.
+	 * @param r1 Read to translate (its mate, if any, is translated too)
+	 * @param skipquality Do not translate quality
+	 * @param addTag Append a frame tag to each id
+	 * @param frames Number of frames to produce
+	 * @return The new list of translated reads
+	 */
+	public static final ArrayList<Read> toFrames(Read r1, boolean skipquality, boolean addTag, int frames){
+		final ArrayList<Read> translated=new ArrayList<Read>(frames);
+		return toFrames(r1, skipquality, addTag, frames, translated);
+	}
+
+ public static final ArrayList<Read> toFrames(Read r1, boolean skipquality, boolean addTag, int frames, ArrayList<Read> listOut){
+ final Read r2=r1.mate;
+ final byte[][] bm1=AminoAcid.toAAsSixFrames(r1.bases);
+ final byte[][] qm1=(skipquality ? QNULL : AminoAcid.toQualitySixFrames(r1.quality, 0));
+ final byte[][] bm2=(r2==null ? null : AminoAcid.toAAsSixFrames(r2.bases));
+ final byte[][] qm2=(r2==null ? null : (skipquality ? QNULL : AminoAcid.toQualitySixFrames(r2.quality, 0)));
+
+ for(int i=0; i<frames; i++){
+ Read aa1=new Read(bm1[i], r1.chrom, r1.start, r1.stop, (addTag ? r1.id+frametag[i] : r1.id), qm1[i], r1.numericID, r1.flags|Read.AAMASK);
+ Read aa2=null;
+ if(r2!=null){
+ aa2=new Read(bm2[i], r2.chrom, r2.start, r2.stop, (addTag ? r2.id+frametag[i] : r2.id), qm2[i], r2.numericID, r2.flags|Read.AAMASK);
+ aa1.mate=aa2;
+ aa2.mate=aa1;
+ }
+ if(aa1.bases!=null || (aa2!=null && aa2.bases!=null)){listOut.add(aa1);}
+ }
+ return listOut;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){
+ outstream.println("Syntax:\n");
+ outstream.println("java -ea -Xmx512m -cp <path> jgi.TranslateSixFrames in=<infile> in2=<infile2> out=<outfile> out2=<outfile2>");
+ outstream.println("\nin2 and out2 are optional. \nIf input is paired and there is only one output file, it will be written interleaved.\n");
+ outstream.println("Other parameters and their defaults:\n");
+ outstream.println("overwrite=false \tOverwrites files that already exist");
+ outstream.println("ziplevel=4 \tSet compression level, 1 (low) to 9 (max)");
+ outstream.println("interleaved=false\tDetermines whether input file is considered interleaved");
+ outstream.println("fastawrap=70 \tLength of lines in fasta output");
+ outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto");
+ outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)");
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null; //Primary input path, from the parser
+ private String in2=null; //Second input path, from the parser or expanded from '#' in in1
+
+ private String qfin1=null; //Quality file for in1, from the parser
+ private String qfin2=null; //Quality file for in2, from the parser
+
+ private String out1=null; //Primary output path; "stdout" when no output was set
+ private String out2=null; //Second output path, from the parser or expanded from '#' in out1
+
+ private String qfout1=null; //Quality file for out1, from the parser
+ private String qfout2=null; //Quality file for out2, from the parser
+
+ private String extin=null; //Input extension override, from the parser
+ private String extout=null; //Output extension override, from the parser
+
+ /*--------------------------------------------------------------*/
+
+ /** Add /1 and /2 to paired reads */
+ private boolean addslash=false; //NOTE(review): never assigned in this class as shown - confirm it is meant to be settable
+
+ private boolean skipquality=false; //When true, translated reads carry no quality; forced true for fasta input or output
+
+ private boolean NT_IN=true; //Input is nucleotide (ntin=/aain=); only then are reads translated to frames
+ private boolean NT_OUT=false; //Convert output reads back to nucleotides (ntout=/aaout=)
+
+ private long maxReads=-1; //Passed to the input stream; from the parser
+ private float samplerate=1f; //Passed to the input stream's setSampleRate; from the parser
+ private long sampleseed=-1; //Passed to the input stream's setSampleRate; from the parser
+
+ private final int FRAMES; //Number of frames produced per read (frames=, default 6)
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1; //Format descriptor for in1
+ private final FileFormat ffin2; //Format descriptor for in2
+
+ private final FileFormat ffout1; //Format descriptor for out1
+ private final FileFormat ffout2; //Format descriptor for out2
+
+ private static final String[] frametag=new String[] {" fr1", " fr2", " fr3", " fr4", " fr5", " fr6"}; //Id suffix per frame, used when addTag is set
+ private static final byte[][] QNULL=new byte[6][]; //Six null quality arrays, used when quality is skipped
+ private boolean addTag=true; //Append the frame tag to read ids (tag=)
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err; //Destination for messages and statistics
+ public static boolean verbose=false; //Enables verbose messages here and in the file readers
+ public boolean errorState=false; //Set when closing the streams reports a failure
+ private boolean overwrite=false; //Permission to overwrite existing files
+ private boolean append=false; //Permission to append to existing files
+ private boolean useSharedHeader; //Passed to the input and output streams; never assigned in this class as shown, so false
+
+}
diff --git a/current/kmer/AbstractKmerTable.java b/current/kmer/AbstractKmerTable.java
new file mode 100755
index 0000000..b2e3356
--- /dev/null
+++ b/current/kmer/AbstractKmerTable.java
@@ -0,0 +1,494 @@
+package kmer;
+
+import java.util.concurrent.atomic.AtomicIntegerArray;
+import java.util.concurrent.locks.Lock;
+
+import stream.ByteBuilder;
+import stream.KillSwitch;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 23, 2013
+ *
+ */
+public abstract class AbstractKmerTable {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Abstract Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns count */
+ public abstract int increment(long kmer);
+
+ /** Returns number of entries created */
+ public abstract int incrementAndReturnNumCreated(final long kmer);
+
+ public abstract int set(long kmer, int value);
+
+ public abstract int set(long kmer, int[] vals);
+
+ /** Returns number of kmers added */
+ public abstract int setIfNotPresent(long kmer, int value);
+
+ /**
+ * Fetch the value associated with a kmer.
+ * @param kmer
+ * @return A value. -1 means the kmer was not present.
+ */
+ public abstract int getValue(long kmer);
+
+ /**
+ * Fetch the values associated with a kmer.
+ * @param kmer
+ * @param singleton A blank array of length 1.
+ * @return An array filled with values. Values of -1 are invalid.
+ */
+ public abstract int[] getValues(long kmer, int[] singleton);
+
+ public abstract boolean contains(long kmer);
+
+ public final boolean contains(long kmer, int v){ //Test-mode helper; true if v is among the values stored for kmer
+ assert(TESTMODE);
+ final int[] stored=getValues(kmer, new int[] {-1});
+ if(stored!=null){
+ for(int i=0; i<stored.length && stored[i]!=-1; i++){ //A -1 ends the valid values
+ if(stored[i]==v){return true;}
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Test-mode check that every value in vals, up to the first -1, is stored for this kmer.
+ */
+ public final boolean contains(long kmer, int[] vals){
+ assert(TESTMODE);
+ final int[] stored=getValues(kmer, new int[] {-1});
+ if(stored==null){return false;}
+ for(int i=0; i<vals.length; i++){
+ final int wanted=vals[i];
+ if(wanted==-1){return true;} //Terminator; everything before it was found
+ boolean found=false;
+ for(int j=0; j<stored.length && !found; j++){
+ found=(stored[j]==wanted);
+ }
+ if(!found){return false;}
+ }
+ return true;
+ }
+
+ public abstract void rebalance();
+
+ public abstract long size();
+ public abstract int arrayLength();
+ public abstract boolean canRebalance();
+
+ public abstract boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount);
+ public abstract boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount);
+ public abstract boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount);
+
+ public abstract void fillHistogram(long[] ca, int max);
+
+ abstract Object get(long kmer);
+ abstract void resize();
+ abstract boolean canResize();
+
+
+
+ /**
+ * Removes entries with a value of zero or less.
+ * Rehashes the remainder.
+ * @return Number removed.
+ */
+ abstract long regenerate();
+
+ final void lock(){getLock().lock();} //Delegates to the Lock returned by getLock()
+ final void unlock(){getLock().unlock();}
+ final boolean tryLock(){return getLock().tryLock();} //Returns the result of Lock.tryLock()
+ Lock getLock(){ //This default always throws; presumably lockable subclasses override it
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*--------------- Allocation Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Allocates an AtomicIntegerArray of length len; on OutOfMemoryError prints killMessage and calls KillSwitch.killSilent(). */
+ final AtomicIntegerArray allocAtomicInt(int len){
+ try {
+ return new AtomicIntegerArray(len);
+ } catch (OutOfMemoryError oom) {
+ synchronized(killMessage){ //Serializes failure reporting across threads
+ oom.printStackTrace();
+ System.err.println(killMessage);
+ KillSwitch.killSilent();
+ }
+ }
+ return null; //Reached only if killSilent() returns
+ }
+
+ /** Allocates a long[len]; on OutOfMemoryError prints killMessage and calls KillSwitch.killSilent(). */
+ final long[] allocLong1D(int len){
+ try {
+ return new long[len];
+ } catch (OutOfMemoryError oom) {
+ synchronized(killMessage){ //Serializes failure reporting across threads
+ oom.printStackTrace();
+ System.err.println(killMessage);
+ KillSwitch.killSilent();
+ }
+ }
+ return null; //Reached only if killSilent() returns
+ }
+
+ /** Allocates an int[len]; on OutOfMemoryError prints killMessage and calls KillSwitch.killSilent(). */
+ final int[] allocInt1D(int len){
+ try {
+ return new int[len];
+ } catch (OutOfMemoryError oom) {
+ synchronized(killMessage){ //Serializes failure reporting across threads
+ oom.printStackTrace();
+ System.err.println(killMessage);
+ KillSwitch.killSilent();
+ }
+ }
+ return null; //Reached only if killSilent() returns
+ }
+
+ /** Allocates an int[len][] with null rows; on OutOfMemoryError prints killMessage and calls KillSwitch.killSilent(). */
+ final int[][] allocInt2D(int len){
+ try {
+ return new int[len][];
+ } catch (OutOfMemoryError oom) {
+ synchronized(killMessage){ //Serializes failure reporting across threads
+ oom.printStackTrace();
+ System.err.println(killMessage);
+ KillSwitch.killSilent();
+ }
+ }
+ return null; //Reached only if killSilent() returns
+ }
+
+ /** Allocates a KmerNode[len]; on OutOfMemoryError prints killMessage and calls KillSwitch.killSilent(). */
+ final KmerNode[] allocKmerNodeArray(int len){
+ try {
+ return new KmerNode[len];
+ } catch (OutOfMemoryError oom) {
+ synchronized(killMessage){ //Serializes failure reporting across threads
+ oom.printStackTrace();
+ System.err.println(killMessage);
+ KillSwitch.killSilent();
+ }
+ }
+ return null; //Reached only if killSilent() returns
+ }
+
+ /*--------------------------------------------------------------*/
+ /*--------------- Ownership Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Set the thread owning this kmer. Return the new owner.
+ * Will only change the owner if newOwner is greater than current owner. */
+ public abstract int setOwner(long kmer, int newOwner);
+
+ /** Reset owner to -1 if this is the current owner. */
+ public abstract boolean clearOwner(long kmer, int owner);
+
+ /** Return the thread ID owning this kmer, or -1. */
+ public abstract int getOwner(long kmer);
+
+ /** Create data structures needed for ownership representation */
+ public abstract void initializeOwnership();
+
+ /** Eliminate ownership data structures or set them to -1. */
+ public abstract void clearOwnership();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static final StringBuilder toText(long kmer, int k){ //Decodes k 2-bit symbols, highest-order first
+ final StringBuilder text=new StringBuilder(k);
+ for(int shift=2*(k-1); shift>=0; shift-=2){
+ final int code=(int)((kmer>>shift)&3);
+ text.append((char)AminoAcid.numberToBase[code]);
+ }
+ return text;
+ }
+
+ /** Text form of a kmer and its count, written into a new StringBuilder. */
+ static final StringBuilder toText(long kmer, int count, int k){
+ return toText(kmer, count, k, new StringBuilder(k+10));
+ }
+
+ /** Byte form of a kmer and its count, written into a new ByteBuilder. */
+ static final ByteBuilder toBytes(long kmer, int count, int k){
+ return toBytes(kmer, count, k, new ByteBuilder(k+10));
+ }
+
+ /** Text form of a kmer and its value list, written into a new StringBuilder. */
+ static final StringBuilder toText(long kmer, int[] values, int k){
+ return toText(kmer, values, k, new StringBuilder(k+10));
+ }
+
+ /** Byte form of a kmer and its value list, written into a new ByteBuilder. */
+ static final ByteBuilder toBytes(long kmer, int[] values, int k){
+ return toBytes(kmer, values, k, new ByteBuilder(k+10));
+ }
+
+ /**
+ * Appends a kmer and its count to sb.
+ * With FASTA_DUMP the output is ">count", a newline, then the bases;
+ * otherwise it is the bases, a tab, then the count.
+ * @return sb
+ */
+ static final StringBuilder toText(long kmer, int count, int k, StringBuilder sb){
+ final boolean fasta=FASTA_DUMP;
+ if(fasta){
+ sb.append('>').append(count).append('\n');
+ }
+ for(int shift=2*(k-1); shift>=0; shift-=2){ //Highest-order symbol first
+ sb.append((char)AminoAcid.numberToBase[(int)((kmer>>shift)&3)]);
+ }
+ if(!fasta){
+ sb.append('\t').append(count);
+ }
+ return sb;
+ }
+
+ /**
+ * Appends a kmer and its value list (comma-delimited, ending at the first -1) to sb.
+ * With FASTA_DUMP the values form a ">" header line above the bases; otherwise they follow the bases after a tab.
+ */
+ static final StringBuilder toText(long kmer, int[] values, int k, StringBuilder sb){
+ final boolean fasta=FASTA_DUMP;
+ if(fasta){
+ sb.append('>');
+ for(int i=0; i<values.length && values[i]!=-1; i++){
+ if(i>0){sb.append(',');}
+ sb.append(values[i]);
+ }
+ sb.append('\n');
+ }
+
+ for(int shift=2*(k-1); shift>=0; shift-=2){ //Highest-order symbol first
+ sb.append((char)AminoAcid.numberToBase[(int)((kmer>>shift)&3)]);
+ }
+
+ if(!fasta){
+ sb.append('\t');
+ for(int i=0; i<values.length && values[i]!=-1; i++){
+ if(i>0){sb.append(',');}
+ sb.append(values[i]);
+ }
+ }
+
+ return sb;
+ }
+
+ /** Appends a kmer and its count to bb as FASTA (">count" line, then bases), as hex kmer/tab/count when NUMERIC_DUMP, or as bases/tab/count. */
+ public static final ByteBuilder toBytes(long kmer, int count, int k, ByteBuilder bb){
+ final boolean fasta=FASTA_DUMP;
+ final boolean hex=(!fasta && NUMERIC_DUMP); //FASTA_DUMP takes precedence over NUMERIC_DUMP
+ if(fasta){
+ bb.append('>');
+ bb.append(count);
+ bb.append('\n');
+ }
+ if(hex){
+ bb.append(Long.toHexString(kmer));
+ }else{
+ for(int shift=2*(k-1); shift>=0; shift-=2){ //Highest-order symbol first
+ bb.append(AminoAcid.numberToBase[(int)((kmer>>shift)&3)]);
+ }
+ }
+ if(!fasta){
+ bb.append('\t');
+ bb.append(count);
+ }
+
+ return bb;
+ }
+
+ /**
+ * Appends a kmer and its value list (comma-delimited, ending at the first -1) to bb.
+ * FASTA_DUMP: a ">" header line holding the values, then the bases.
+ * NUMERIC_DUMP (only when FASTA_DUMP is off): the kmer in hex, a tab, then the values.
+ * Otherwise: the bases, a tab, then the values.
+ */
+ public static final ByteBuilder toBytes(long kmer, int[] values, int k, ByteBuilder bb){
+ final boolean fasta=FASTA_DUMP;
+ final boolean hex=(!fasta && NUMERIC_DUMP);
+
+ if(fasta){
+ bb.append('>');
+ for(int i=0; i<values.length; i++){
+ if(values[i]==-1){break;}
+ if(i>0){bb.append(',');}
+ bb.append(values[i]);
+ }
+ bb.append('\n');
+ }
+
+ if(hex){
+ bb.append(Long.toHexString(kmer));
+ }else{
+ for(int shift=2*(k-1); shift>=0; shift-=2){ //Highest-order symbol first
+ bb.append(AminoAcid.numberToBase[(int)((kmer>>shift)&3)]);
+ }
+ }
+
+ if(!fasta){
+ bb.append('\t');
+ for(int i=0; i<values.length; i++){
+ if(values[i]==-1){break;}
+ if(i>0){bb.append(',');}
+ bb.append(values[i]);
+ }
+ }
+ return bb;
+ }
+
+// static void appendKmerText(long kmer, int count, int k, StringBuilder sb){
+// sb.setLength(0);
+// toText(kmer, count, k, sb);
+// sb.append('\n');
+// }
+
+ static void appendKmerText(long kmer, int count, int k, ByteBuilder bb){ //Despite the name, this overwrites bb rather than appending to prior content
+ bb.setLength(0); //Discard whatever bb held
+ toBytes(kmer, count, k, bb);
+ bb.append('\n');
+ }
+
+
+ /** For buffered tables. This default implementation always throws a RuntimeException. */
+ long flush(){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ /**
+ * Allocates an array of tables using helper threads. Unfortunately, multithreading does not lead to any speedup, at least for ARRAY type.
+ * @param ways Number of tables to allocate (length of the returned array)
+ * @param tableType One of the table type constants, e.g. ARRAY1D, FOREST1D, or TABLE
+ * @param initialSize Initial size passed to each table constructor
+ * @param growable Passed to each table constructor
+ * @return The filled table array; a RuntimeException is thrown if any slot is still null
+ */
+ public static final AbstractKmerTable[] preallocate(int ways, int tableType, int initialSize, boolean growable){
+
+ final AbstractKmerTable[] tables=new AbstractKmerTable[ways];
+
+ {
+ final int t=Tools.max(1, Tools.min(Shared.threads(), 2, ways)); //Presumably caps the allocator thread count at 2
+ final AllocThread[] allocators=new AllocThread[t];
+ for(int i=0; i<t; i++){
+ allocators[i]=new AllocThread(tableType, initialSize, i, t, growable, tables); //Thread i fills slots i, i+t, i+2t, ...
+ }
+ for(AllocThread at : allocators){at.start();}
+ for(AllocThread at : allocators){
+ while(at.getState()!=Thread.State.TERMINATED){ //Retries join() after an interrupt
+ try {
+ at.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ synchronized(tables){ //Same monitor the AllocThreads hold while storing into tables
+ for(int i=0; i<tables.length; i++){
+ final AbstractKmerTable akt=tables[i];
+ if(akt==null){
+ throw new RuntimeException("KmerTable allocation failed, probably due to lack of RAM: "+i+", "+tables.length);
+ }
+ }
+ }
+
+ return tables;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nested Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private static class AllocThread extends Thread{
+
+ AllocThread(int type_, int initialSize_, int mod_, int div_, boolean growable_, AbstractKmerTable[] tables_){
+ type=type_;
+ size=initialSize_;
+ mod=mod_;
+ div=div_;
+ growable=growable_;
+ tables=tables_;
+ }
+
+ @Override
+ public void run(){
+ for(int i=mod; i<tables.length; i+=div){
+// System.err.println("T"+i+" allocating "+i);
+ final AbstractKmerTable akt;
+ if(type==FOREST1D){
+ akt=new HashForest(size, growable, false);
+ }else if(type==TABLE){
+ akt=new KmerTable(size, growable);
+ }else if(type==ARRAY1D){
+ akt=new HashArray1D(size, growable);
+ }else if(type==NODE1D){
+ throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
+// akt=new KmerNode2(-1, 0);
+ }else if(type==FOREST2D){
+ akt=new HashForest(size, growable, true);
+ }else if(type==TABLE2D){
+ throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
+ }else if(type==ARRAY2D){
+ akt=new HashArray2D(size, growable);
+ }else if(type==NODE2D){
+ throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
+// akt=new KmerNode(-1, 0);
+ }else if(type==ARRAYH){
+ akt=new HashArrayHybrid(size, growable);
+ }else{
+ throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
+ }
+ synchronized(tables){
+ tables[i]=akt;
+ }
+// System.err.println("T"+i+" allocated "+i);
+ }
+ }
+
+ private final int type;
+ private final int size;
+ private final int mod;
+ private final int div;
+ private final boolean growable;
+ final AbstractKmerTable[] tables;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean FASTA_DUMP=true;
+ public static boolean NUMERIC_DUMP=false;
+
+ public static final boolean verbose=false;
+ public static final boolean TESTMODE=false; //123 SLOW!
+
+ public static final int UNKNOWN=0, ARRAY1D=1, FOREST1D=2, TABLE=3, NODE1D=4, ARRAY2D=5, FOREST2D=6, TABLE2D=7, NODE2D=8, ARRAYH=9;
+
+ public static final int NOT_PRESENT=-1, HASH_COLLISION=-2;
+ public static final int NO_OWNER=-1;
+
+ private final static String killMessage=new String("\nThis program ran out of memory. Try increasing the -Xmx flag and setting prealloc.");
+
+}
diff --git a/current/kmer/AbstractKmerTableSet.java b/current/kmer/AbstractKmerTableSet.java
new file mode 100755
index 0000000..d91b880
--- /dev/null
+++ b/current/kmer/AbstractKmerTableSet.java
@@ -0,0 +1,432 @@
+package kmer;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.Read;
+
+import jgi.CallPeaks;
+import align2.Shared;
+import align2.Tools;
+import bloom.KCountArray;
+import bloom.KmerCount7MTA;
+import bloom.KmerCountAbstract;
+import dna.Timer;
+import fileIO.ByteStreamWriter;
+
+
+/**
+ * Loads and holds kmers for Tadpole
+ * @author Brian Bushnell
+ * @date Jun 22, 2015
+ *
+ */
+public abstract class AbstractKmerTableSet {
+
+ /**
+ * Display usage information.
+ */
+ public static final void printOptions(){
+ outstream.println("Syntax:\nTODO");
+ }
+
+ public static final boolean isValidArgument(String a){ //True for recognized parameter names and anything starting with "maxreads"
+ final String[] known={
+ "in", "in1", "in2",
+ "append", "app",
+ "overwrite", "ow",
+ "initialsize",
+ "showstats", "stats",
+ "ways",
+ "buflen", "bufflen", "bufferlength",
+ "k",
+ "threads", "t",
+ "showspeed", "ss",
+ "ecco",
+ "merge",
+ "verbose", "verbose2",
+ "minprob",
+ "reads",
+ "prealloc", "preallocate",
+ "prefilter",
+ "prefiltersize", "prefilterfraction", "pff",
+ "minprobprefilter", "mpp",
+ "minprobmain", "mpm",
+ "prefilterpasses", "prepasses",
+ "prehashes", "hashes",
+ "onepass",
+ "passes",
+ "rcomp"
+ };
+ for(String name : known){
+ if(a.equals(name)){return true;}
+ }
+ return a.startsWith("maxreads");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ public final void process(Timer t){ //Loads kmers, stops the caller's timer, prints stats, then fails if errorState was set
+
+ /* Count kmers */
+ long added=processInput();
+
+ /* Stop timer and calculate speed statistics */
+ t.stop();
+
+ showStats(t, added); //showStats() calls t.stop() again when DISPLAY_STATS is set
+
+ /* Throw an exception if errors were detected */
+ if(errorState){
+ throw new RuntimeException(getClass().getSimpleName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+
+ public abstract void clear();
+
+
+ /** Builds the optional prefilter, loads all input kmers into the tables, and returns the value from loadKmers(). */
+ public final long processInput(){
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Before loading:");
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ //Report through outstream like the other messages in this class, so redirecting it captures this line too
+ outstream.println("Estimated kmer capacity: \t"+estimatedKmerCapacity());
+
+ prefilterArray=makePrefilter(new KCountArray[1], null);
+ if(prefilterArray!=null){
+ prefilterArray.purgeFilter();
+ filterMax2=Tools.min(filterMax, prefilterArray.maxValue-1);
+ }
+// assert(false) : prefilterArray.cellBits+", "+prefilterArray.maxValue+", "+filterMax+", "+filterMax2;
+
+ /* Fill tables with kmers */
+ long added=loadKmers();
+
+ /* Clear prefilter; no longer needed */
+ prefilterArray=null;
+
+ return added;
+ }
+
+
+ public final KCountArray makePrefilter(final KCountArray[] filter, Timer ht){ //Builds the prefilter into filter[0]; may recurse for additional passes; returns null if prefiltering is off or infeasible
+// assert(false) : lastFilter+", "+prefilter+", "+filterMax()+", "+currentPass+", "+filterMemory(currentPass);
+ if(!prefilter){return null;}
+
+ if(filter[0]!=null){
+ filter[0].purgeFilter();
+ assert(filter[0].prefilter()==null);
+ }
+
+ KmerCountAbstract.CANONICAL=true; //Note: sets a static flag on another class
+
+ long precells=-1;
+ int cbits=1;
+ if(onePass){
+ while(filterMax>=(1<<cbits)){cbits*=2;} //Doubles cell width until 2^cbits exceeds filterMax
+ }else{
+ while(filterMax+1>=(1<<cbits)){cbits*=2;} //Same, but leaves room for filterMax+1
+ }
+ if(prepasses>2 && currentPass==prepasses-1){cbits=1;}
+
+ byte minq=0;
+ if(precells<1){ //Always true here, since precells was just set to -1
+ long prebits=(filterMemory(currentPass)-10)*8; //Presumably converts bytes to bits - TODO confirm units of filterMemory()
+
+// System.err.println("prebits="+prebits+", currentPass="+currentPass);
+
+ precells=prebits/cbits;
+ if(precells<100000){ //Not enough memory - no point.
+ prefilter=false; //Disables prefiltering for subsequent calls too
+ return null;
+ }
+ }
+ if(prehashes<1){prehashes=2;}
+
+ if(onePass){
+ assert(filter==null || filter.length==1) : "Multiple filtering passes are not allowed in onepass mode.\n"+filter.length+","+prepasses+", "+onePass+", "+prefilter;
+ filter[0]=KmerCount7MTA.makeKca(null, null, null, kbig(), cbits, 0, precells, prehashes, minq, true, ecco(), maxReads, 1, 1, 1, 1, null, 0); //No input files are passed in onepass mode
+ }else{
+ if(ht==null){ht=new Timer();}
+ ht.start();
+
+ ArrayList<String> extra=null;
+ filter[0]=KmerCount7MTA.makeKca_als(in1, in2, extra, kbig(), cbits, 0, precells, prehashes, minq, true, ecco(), maxReads, 1, 1, 1, 1, filter[0], filterMax); //The previous filter, if any, is passed in
+ assert(filterMax<filter[0].maxValue || (currentPass>0 && currentPass==prepasses-1));
+ outstream.println("Made prefilter: \t"+filter[0].toShortString(prehashes));
+ double uf=filter[0].usedFraction();
+// System.err.println("cellsUsed: "+filter[0].cellsUsed(1)+" //123"); //123
+ if(uf>0.5){
+ outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" :
+ uf>0.7 ? "rather" : uf>0.6 ? "fairly" : "somewhat")+" full. Ideal load is under 50% used." +
+ "\nFor better accuracy, run on a node with more memory; quality-trim or error-correct reads; or increase prefiltersize.");
+ }
+ ht.stop();
+ currentPass++;
+
+ final double kmers=filter[0].estimateUniqueKmers(prehashes, Tools.min(filterMax+1, filter[0].maxValue));
+ outstream.println("Estimated valid kmers: \t\t"+(long)kmers);
+
+// outstream.println("Estimated valid kmers 1+: "+(long)filter[0].estimateUniqueKmers(prehashes, 1));
+// outstream.println("Estimated valid kmers 2+: "+(long)filter[0].estimateUniqueKmers(prehashes, 2));
+// outstream.println("Estimated valid kmers 3+: "+(long)filter[0].estimateUniqueKmers(prehashes, 3));
+// outstream.println("Estimated valid kmers 4+: "+(long)filter[0].estimateUniqueKmers(prehashes, 4));
+
+ if(prepasses<0){//auto
+ if((currentPass&1)==0){ //An even number of passes completed so far: run another
+ return makePrefilter(filter, ht);
+ }else if(currentPass<5){
+ if(kmers>estimatedKmerCapacity()){ //Estimate still exceeds capacity: run another pass
+ return makePrefilter(filter, ht);
+ }
+ }
+ }else if(currentPass<prepasses){ //Fixed pass count not yet reached
+ return makePrefilter(filter, ht);
+ }
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Prefilter time:\t"+ht);
+ outstream.println("After prefilter:");
+ Shared.printMemory();
+ outstream.println();
+ }
+ }
+
+ return filter[0];
+ }
+
+
+ public final void showStats(Timer t, long added){ //Prints input totals, unique kmer count, load time, and optionally speed
+
+ if(!DISPLAY_STATS){return;}
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("After loading:");
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ t.stop(); //Stops the timer here, whether or not the caller already did
+ outstream.println("Input: \t"+readsIn+" reads \t\t"+basesIn+" bases.");
+ outstream.println("Unique Kmers: \t"+added);
+ outstream.println("Load Time: \t"+t);
+
+ if(showSpeed){
+ double rpnano=readsIn/(double)(t.elapsed); //Assumes t.elapsed is in nanoseconds, per the variable name - TODO confirm
+ double bpnano=basesIn/(double)(t.elapsed);
+
+ //Format with k or m suffixes
+ String rpstring=(readsIn<100000 ? ""+readsIn : readsIn<100000000 ? (readsIn/1000)+"k" : (readsIn/1000000)+"m");
+ String bpstring=(basesIn<100000 ? ""+basesIn : basesIn<100000000 ? (basesIn/1000)+"k" : (basesIn/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;} //Left-pad to 8 characters
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("\nReads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ }
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public final long loadKmers(){ //Allocates tables, then loads every in1/in2 file pair; returns the total from loadKmers(a, b)
+ allocateTables();
+ kmersLoaded=0;
+ final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
+ Read.VALIDATE_IN_CONSTRUCTOR=false; //Static flag; turned off only for the duration of loading
+ try{
+ for(int i=0; i<in1.size(); i++){
+ String a=in1.get(i);
+ String b=in2.size()>i ? in2.get(i) : null;
+ if(a.indexOf('#')>=0 && b==null){ //No explicit second file: the first '#' expands to 1 and 2
+ b=a.replaceFirst("#", "2");
+ a=a.replaceFirst("#", "1");
+ }
+ kmersLoaded+=loadKmers(a, b);
+ }
+ }finally{Read.VALIDATE_IN_CONSTRUCTOR=vic;} //Restore the previous setting even if loading throws
+ return kmersLoaded;
+ }
+
+ /**
+ * Load reads into tables, using multiple LoadThread.
+ */
+ public abstract long loadKmers(String fname1, String fname2);
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public abstract long regenerate();
+
+ public abstract Object getTable(int tnum);
+
+ public abstract long[] fillHistogram(int histMax);
+
+ public abstract void initializeOwnership();
+
+ public abstract void clearOwnership();
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Printing Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public abstract boolean dumpKmersAsBytes(String fname, int minToDump, boolean printTime);
+ public abstract boolean dumpKmersAsBytes_MT(String fname, int minToDump, boolean printTime);
+
+ public final long[] makeKhist(String fname, int cols, int max, boolean printHeader, boolean printZeros, boolean printTime, boolean smooth, int smoothRadius){ //Builds the kmer depth histogram, optionally smooths it, and writes it to fname when fname is not null
+ Timer t=new Timer();
+
+ long[] ca=fillHistogram(max);
+ if(smooth){
+ ca=CallPeaks.smoothProgressive(ca, smoothRadius);
+ }
+ if(fname==null){return ca;} //No output file requested; the histogram is only returned
+
+ ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, true);
+ bsw.start();
+ if(printHeader){
+ bsw.print("#Depth\t"+(cols==3 ? "RawCount\t" : "")+"Count\n");
+ }
+
+ for(int i=1; i<ca.length; i++){ //Index 0 is never printed
+ long count=ca[i];
+ if(printZeros || count>0){
+ bsw.print(i);
+ bsw.print('\t');
+ if(cols==3){
+ bsw.print(i*count); //RawCount column: depth times count, computed as a long
+ bsw.print('\t');
+ }
+ bsw.print(count);
+ bsw.print('\n');
+ }
+ }
+ bsw.poisonAndWait();
+ t.stop();
+ if(printTime){outstream.println("Histogram Write Time: \t"+t);}
+ return ca;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public boolean showStats=true;
+
+ /** Has this class encountered errors while processing? */
+ public boolean errorState=false;
+
+ /** Use a count-min prefilter for low-depth kmers */
+ public boolean prefilter=false;
+ /** Fill the prefilter at the same time as the main table */
+ public boolean onePass=false;
+ /** Number of hashes used by prefilter */
+ public int prehashes=2;
+ /** Fraction of memory used by prefilter */
+ public double prefilterFraction=0.2;
+
+ /** Initial size of data structures */
+ public int initialSize=-1;
+ /** Fraction of available memory preallocated to arrays */
+ public double preallocFraction=1.0;
+
+ public KCountArray prefilterArray=null;
+
+ public boolean minProbPrefilter=true;
+ public boolean minProbMain=true;
+
+ /** Input reads for kmers */
+ public ArrayList<String> in1=new ArrayList<String>(), in2=new ArrayList<String>();
+
+ /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */
+ public long maxReads=-1;
+
+ public int buflen=1000;
+
+ /** Filter kmers up to this level; don't store them in primary data structure */
+ protected int filterMax=0;
+ protected int filterMax2=0;
+
+ public long readsIn=0;
+ public long basesIn=0;
+ public long lowqReads=0;
+ public long lowqBases=0;
+ public long readsTrimmed=0;
+ public long basesTrimmed=0;
+
+ public long kmersLoaded=0;
+
+ private int currentPass=0;
+ protected int prepasses=1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public abstract int kbig();
+ public abstract long filterMemory(int pass);
+ public abstract long tableMemory();
+ public abstract long estimatedKmerCapacity();
+ public abstract boolean ecco();
+ public abstract boolean qtrimLeft();
+ public abstract boolean qtrimRight();
+ public abstract byte minAvgQuality();
+ public final int filterMax(){return filterMax;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected abstract void allocateTables();
+
+ /** Print messages to this stream */
+ public static PrintStream outstream=System.err;
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ /** Print speed statistics upon completion */
+ public static boolean showSpeed=true;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Display kmer loading information */
+ public static boolean DISPLAY_STATS=true;
+ /** Verbose messages */
+ public static boolean verbose=false;
+ /** Debugging verbose messages */
+ public static boolean verbose2=false;
+ /** Number of ProcessThreads */
+ public static int THREADS=Shared.threads();
+
+ /** Increment owner by this much to indicate claim is final. */
+ public static final int CLAIM_OFFSET=100000;
+
+ /** Default initial table size */
+ public static final int initialSizeDefault=128000;
+
+ public static final float[] PROB_CORRECT=Arrays.copyOf(align2.QualityTools.PROB_CORRECT, 127);
+ public static final float[] PROB_CORRECT_INVERSE=Arrays.copyOf(align2.QualityTools.PROB_CORRECT_INVERSE, 127);
+
+ public static boolean IGNORE_UNKNOWN_ARGS=true;
+
+ public static final int NOT_PRESENT=AbstractKmerTable.NOT_PRESENT, HASH_COLLISION=AbstractKmerTable.HASH_COLLISION;
+ public static final int NO_OWNER=AbstractKmerTable.NO_OWNER;
+
+ public static double defaultMinprob=0;
+
+}
diff --git a/current/kmer/AtomicShortArray.java b/current/kmer/AtomicShortArray.java
new file mode 100755
index 0000000..f656d09
--- /dev/null
+++ b/current/kmer/AtomicShortArray.java
@@ -0,0 +1,27 @@
+package kmer;
+
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+/**
+ * Unfinished placeholder for a short array backed by an AtomicIntegerArray; it has no accessors yet.
+ * @author Brian Bushnell
+ * @date May 14, 2015
+ */
+public class AtomicShortArray {
+
+ public AtomicShortArray(int length_){
+ assert(length_>=0);
+ length=length_;
+ intArray=new AtomicIntegerArray((length+1)/2); //One int per two shorts, rounded up
+ assert(false) : "TODO"; //Always fails when assertions are enabled; marks the class as unfinished
+ }
+
+// public short set(int position, short value){
+// in
+// intArray
+// }
+
+ private AtomicIntegerArray intArray;
+ private final int length;
+
+}
diff --git a/current/kmer/DumpThread.java b/current/kmer/DumpThread.java
new file mode 100755
index 0000000..4f5d650
--- /dev/null
+++ b/current/kmer/DumpThread.java
@@ -0,0 +1,73 @@
+package kmer;
+
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import align2.Shared;
+import align2.Tools;
+
+import stream.ByteBuilder;
+
+import fileIO.ByteStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 16, 2015
+ *
+ */
+public class DumpThread extends Thread{
+
+ public static boolean dump(final int k, final int mincount, final AbstractKmerTable[] tables, final ByteStreamWriter bsw){ //Dumps all tables to bsw using several DumpThreads; returns true only if every thread set its success flag
+ final int threads=NUM_THREADS>0 ? NUM_THREADS : Tools.min(tables.length, (Tools.mid(1, Shared.threads()-1, 6))); //NUM_THREADS, when positive, overrides the automatic choice
+ final AtomicInteger lock=new AtomicInteger(0); //Despite the name, this is the shared next-table counter
+ final ArrayList<DumpThread> list=new ArrayList<DumpThread>(threads);
+ for(int i=0; i<threads; i++){
+ list.add(new DumpThread(k, mincount, lock, tables, bsw));
+ }
+ for(DumpThread t : list){t.start();}
+ boolean success=true;
+ for(DumpThread t : list){
+ while(t.getState()!=Thread.State.TERMINATED){ //Retries join() after an interrupt
+ try {
+ t.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ success&=t.success;
+ }
+ return success;
+ }
+
+ public DumpThread(final int k_, final int mincount_, final AtomicInteger nextTable_, final AbstractKmerTable[] tables_, final ByteStreamWriter bsw_){
+ k=k_;
+ mincount=mincount_;
+ nextTable=nextTable_;
+ tables=tables_;
+ bsw=bsw_;
+ }
+
+ @Override
+ public void run(){
+ //Claim table indices from the shared counter until none remain, dumping each claimed table
+ final ByteBuilder buffer=new ByteBuilder(16300);
+ for(int claimed; (claimed=nextTable.getAndIncrement())<tables.length; ){
+ tables[claimed].dumpKmersAsBytes_MT(bsw, buffer, k, mincount);
+ }
+ if(buffer.length()>0){
+ synchronized(bsw){bsw.addJob(buffer);}
+ }
+ success=true;
+ }
+
+ final int k;
+ final int mincount;
+ final AtomicInteger nextTable;
+ final AbstractKmerTable[] tables;
+ final ByteStreamWriter bsw;
+ boolean success=false;
+
+ public static int NUM_THREADS=-1;
+
+}
diff --git a/current/kmer/HashArray.java b/current/kmer/HashArray.java
new file mode 100755
index 0000000..568dddc
--- /dev/null
+++ b/current/kmer/HashArray.java
@@ -0,0 +1,485 @@
+package kmer;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import stream.ByteBuilder;
+
+
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and values in an int[][], with a victim cache.
+ * @author Brian Bushnell
+ * @date Nov 7, 2014
+ *
+ */
+public abstract class HashArray extends AbstractKmerTable {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Allocates the primary array and the victim cache.
+  * @param initialSize Requested cell count. Values above 1 are replaced by Primes.primeAtLeast(initialSize),
+  * capped at maxPrime; anything else becomes 1.
+  * @param autoResize_ Whether insertions may call resize() once size+victims.size exceeds sizeLimit
+  * @param twod Stored in TWOD and forwarded to the victim HashForest
+  */
+ HashArray(int initialSize, boolean autoResize_, boolean twod){
+     if(initialSize>1){
+         initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize));
+     }else{
+         initialSize=1;
+     }
+     prime=initialSize;
+     //Single assignment; the original nested a second, redundant assignment of the same value.
+     sizeLimit=(long)(maxLoadFactor*prime);
+     //The array holds 'extra' cells beyond prime so a probe window starting near the end stays in bounds.
+     array=allocLong1D(prime+extra);
+     victims=new HashForest(Tools.max(10, initialSize/8), autoResize_, twod);
+     Arrays.fill(array, NOT_PRESENT);
+     autoResize=autoResize_;
+     TWOD=twod;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+// public final int set_Test(final long kmer, final int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// int[] old=getValues(kmer, new int[1]);
+// assert(old==null || contains(kmer, old));
+// if(verbose){System.err.println("Fetched "+Arrays.toString(old));}
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+v+", kmer="+kmer+
+// ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v==old || !contains(kmer, old));
+// }
+// return x;
+// }
+//
+// public final int set_Test(final long kmer, final int v[]){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// final int[] singleton=new int[1];
+// int[] old=getValues(kmer, singleton);
+// assert(old==null || contains(kmer, old));
+// if(verbose){System.err.println("Before: old="+Arrays.toString(old)+", v="+Arrays.toString(v));}
+// x=set0(kmer, v);
+// if(verbose){System.err.println("After: old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", get()="+Arrays.toString(getValues(kmer, singleton)));}
+// assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
+// ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+// assert(contains(kmer, v)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
+// ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v[0]==old || !contains(kmer, old));
+// }
+// return x;
+// }
+//
+// public final int setIfNotPresent_Test(long kmer, int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+//// int[] vals=getValues(kmer, null);
+//// assert(vals==null || contains(kmer, vals));
+//// x=setIfNotPresent(kmer, v);
+//// assert(contains(kmer, vals));
+//// assert(contains(kmer, v));
+// x=0;
+// assert(false);
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=setIfNotPresent0(kmer, v);
+// assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
+// }
+// return x;
+// }
+
+ /**
+  * Stores a set of values for a kmer.
+  * Probes at most 'extra' consecutive cells starting at kmer%prime. A cell already holding the kmer,
+  * or the first empty cell, receives the values through insertValue; if neither is found inside the
+  * probe window the kmer is delegated to the victim cache.
+  * @param kmer Key to store
+  * @param v Values to associate with the key
+  * @return 0 if the kmer was already in the primary array, 1 if it was newly placed there,
+  * otherwise whatever victims.set returns
+  */
+ @Override
+ public final int set(final long kmer, final int[] v){
+     int cell=(int)(kmer%prime);
+
+     for(final int max=cell+extra; cell<max; cell++){
+         long n=array[cell];
+         if(n==kmer){
+             if(verbose){System.err.println("A2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
+             insertValue(kmer, v, cell);
+             if(verbose){System.err.println("A2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+             return 0;
+         }else if(n==NOT_PRESENT){
+             if(verbose){System.err.println("B2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
+             array[cell]=kmer;
+             insertValue(kmer, v, cell);
+             if(verbose){System.err.println("B2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+             size++;
+             if(autoResize && size+victims.size>sizeLimit){resize();}
+             return 1;
+         }
+     }
+     //v is an array, so render it with Arrays.toString like the A2/B2 messages instead of printing its identity hash.
+     if(verbose){System.err.println("C2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
+     final int x=victims.set(kmer, v);
+     if(autoResize && size+victims.size>sizeLimit){resize();}
+     if(verbose){System.err.println("C2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+     return x;
+ }
+
+ /**
+  * Stores a single value for a kmer.
+  * Walks the probe window of 'extra' cells that begins at kmer%prime; the kmer's own cell or the
+  * first empty cell receives the value via insertValue, and a full window sends the kmer to the victim cache.
+  * @param kmer Key to store
+  * @param v Value to associate with the key
+  * @return 0 if the kmer was already in the primary array, 1 if it was newly placed there,
+  * otherwise the result of victims.set
+  */
+ @Override
+ public final int set(final long kmer, final int v){
+     int cell=(int)(kmer%prime);
+     final int limit=cell+extra;
+
+//     assert(TESTMODE);
+//     ll.add(kmer);
+//     il.add(v);
+
+     while(cell<limit){
+         final long occupant=array[cell];
+         if(occupant==kmer){
+             if(verbose){System.err.println("A1: Adding "+kmer+", "+v+", "+cell);}
+             insertValue(kmer, v, cell);
+             if(verbose){System.err.println("A1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+             return 0;
+         }
+         if(occupant==NOT_PRESENT){
+             if(verbose){System.err.println("B1: Adding "+kmer+", "+v+", "+cell);}
+             array[cell]=kmer;
+             insertValue(kmer, v, cell);
+             if(verbose){System.err.println("B1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+             size++;
+             if(autoResize && size+victims.size>sizeLimit){resize();}
+             return 1;
+         }
+         cell++;
+     }
+
+     //Probe window exhausted without a match or a free cell.
+     if(verbose){
+         System.err.println("C1: Adding "+kmer+", "+v+", "+cell+
+             "; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));
+     }
+     final int result=victims.set(kmer, v);
+     if(autoResize && size+victims.size>sizeLimit){resize();}
+     if(verbose){
+         System.err.println("C1: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1]))+
+             "; victims.get(kmer)="+Arrays.toString(victims.getValues(kmer, new int[1])));
+     }
+     return result;
+ }
+
+
+// protected LongList ll=new LongList(); //123
+// protected IntList il=new IntList();
+
+ /**
+  * Stores a value for a kmer only when the kmer is not already stored.
+  * @param kmer Key to store
+  * @param value Value to associate with a newly added key
+  * @return 0 if the kmer was found in the primary array (nothing changes), 1 if it was added there,
+  * otherwise the result of victims.setIfNotPresent
+  */
+ @Override
+ public final int setIfNotPresent(long kmer, int value){
+     int cell=(int)(kmer%prime);
+     final int limit=cell+extra;
+
+     while(cell<limit){
+         final long occupant=array[cell];
+         if(occupant==kmer){return 0;}
+         if(occupant==NOT_PRESENT){
+             array[cell]=kmer;
+             insertValue(kmer, value, cell);
+             size++;
+             if(autoResize && size+victims.size>sizeLimit){resize();}
+             return 1;
+         }
+         cell++;
+     }
+
+     //No room in the probe window; let the victim cache decide.
+     final int added=victims.setIfNotPresent(kmer, value);
+     if(autoResize && size+victims.size>sizeLimit){resize();}
+     return added;
+ }
+
+ /**
+  * Looks up the single value stored for a kmer.
+  * @param kmer Key to look up
+  * @return NOT_PRESENT if findKmer reports the key as absent, the victim cache's answer if the
+  * probe window was exhausted, otherwise the value read from the key's cell
+  */
+ @Override
+ public final int getValue(long kmer){
+     final int cell=findKmer(kmer);
+     if(cell==NOT_PRESENT){
+         return NOT_PRESENT;
+     }
+     return cell==HASH_COLLISION ? victims.getValue(kmer) : readCellValue(cell);
+ }
+
+ /**
+  * Looks up all values stored for a kmer.
+  * @param kmer Key to look up
+  * @param singleton Caller-supplied one-element scratch array; it is written and returned when the key is absent
+  * @return singleton holding NOT_PRESENT if the key is absent, the victim cache's answer if the
+  * probe window was exhausted, otherwise the result of readCellValues for the key's cell
+  */
+ @Override
+ public final int[] getValues(long kmer, int[] singleton){
+     final int cell=findKmer(kmer);
+     if(cell==NOT_PRESENT){
+         singleton[0]=NOT_PRESENT;
+         return singleton;
+     }
+     return cell==HASH_COLLISION ? victims.getValues(kmer, singleton) : readCellValues(cell, singleton);
+ }
+
+ /**
+  * Tests whether a kmer is stored in the primary array or in the victim cache.
+  * @param kmer Key to look up
+  * @return true if the key is present
+  */
+ @Override
+ public final boolean contains(long kmer){
+     final int cell=findKmer(kmer);
+     if(cell==NOT_PRESENT){
+         return false;
+     }
+     //A real cell index means the key is in the primary array; only a collision needs the victim cache.
+     return cell!=HASH_COLLISION || victims.contains(kmer);
+ }
+
+ /**
+  * @param cell Index into the primary array
+  * @return The raw content of that cell
+  */
+ public final long getKmer(int cell){
+     final long stored=array[cell];
+     return stored;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final void initializeOwnership(){
+ assert(owners==null);
+ owners=allocAtomicInt(array.length);
+ for(int i=0; i<array.length; i++){
+ owners.set(i, NO_OWNER);
+ }
+ victims.initializeOwnership();
+ }
+
+ /** Discards the owner array and clears ownership in the victim cache. */
+ @Override
+ public final void clearOwnership(){
+     owners=null;
+     victims.clearOwnership();
+ }
+
+ /**
+  * Attempts to set the owner of a kmer, which is asserted to be present.
+  * @param kmer Key whose owner is being set
+  * @param newOwner Proposed owner
+  * @return The result of the victim cache's setOwner if the probe window was exhausted,
+  * otherwise the result of the cell-based setOwner
+  */
+ @Override
+ public final int setOwner(final long kmer, final int newOwner){
+     final int cell=findKmer(kmer);
+     assert(cell!=NOT_PRESENT);
+     return cell==HASH_COLLISION ? victims.setOwner(kmer, newOwner) : setOwner(kmer, newOwner, cell);
+ }
+
+ public final int setOwner(final long kmer, final int newOwner, final int cell){
+ assert(array[cell]==kmer);
+ final int original=owners.get(cell);
+ int current=original;
+ while(current<newOwner){
+ boolean success=owners.compareAndSet(cell, current, newOwner);
+ if(!success){current=owners.get(cell);}
+ else{current=newOwner;}
+ }
+ assert(current>=original) : "original="+original+", current="+current+", newOwner="+newOwner+", re-read="+owners.get(cell);
+ return current;
+ }
+
+ /**
+  * Releases ownership of a kmer, which is asserted to be present.
+  * @param kmer Key whose owner is being cleared
+  * @param owner Owner expected to hold the key
+  * @return The result of the victim cache's clearOwner if the probe window was exhausted,
+  * otherwise the result of the cell-based clearOwner
+  */
+ @Override
+ public final boolean clearOwner(final long kmer, final int owner){
+     final int cell=findKmer(kmer);
+     assert(cell!=NOT_PRESENT);
+     return cell==HASH_COLLISION ? victims.clearOwner(kmer, owner) : clearOwner(kmer, owner, cell);
+ }
+
+ /**
+  * Resets a cell's owner to NO_OWNER, but only if the stored owner equals the given one.
+  * @param kmer Key expected in the cell (asserted)
+  * @param owner Owner expected to hold the cell
+  * @param cell Index of the key's cell
+  * @return true if the owner was reset
+  */
+ public final boolean clearOwner(final long kmer, final int owner, final int cell){
+     assert(array[cell]==kmer);
+     return owners.compareAndSet(cell, owner, NO_OWNER);
+ }
+
+ /**
+  * Reads the owner of a kmer, which is asserted to be present.
+  * @param kmer Key to look up
+  * @return The owner recorded in the victim cache if the probe window was exhausted,
+  * otherwise the owner recorded for the key's cell
+  */
+ @Override
+ public final int getOwner(final long kmer){
+     final int cell=findKmer(kmer);
+     assert(cell!=NOT_PRESENT);
+     return cell==HASH_COLLISION ? victims.getOwner(kmer) : getCellOwner(cell);
+ }
+
+ /**
+  * @param cell Index into the owner array
+  * @return The owner currently recorded for that cell
+  */
+ public final int getCellOwner(final int cell){
+     final int owner=owners.get(cell);
+     return owner;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Records a single value for the kmer held in the given cell; called by set and setIfNotPresent once the cell is chosen. */
+ protected abstract void insertValue(final long kmer, final int v, final int cell);
+ 
+ /** Records a set of values for the kmer held in the given cell; called by set(long, int[]) once the cell is chosen. */
+ protected abstract void insertValue(final long kmer, final int[] vals, final int cell);
+ 
+ /** Reads the single value stored for a primary-array cell. */
+ protected abstract int readCellValue(int cell);
+ /** Reads the values stored for a primary-array cell; implementations may use or ignore the supplied scratch array. */
+ protected abstract int[] readCellValues(int cell, int[] singleton);
+
+ /**
+  * Not supported by this table type.
+  * @throws RuntimeException always
+  */
+ @Override
+ final Object get(long kmer){
+     throw new RuntimeException("Unimplemented.");
+ }
+
+ final int findKmer(long kmer){
+ int cell=(int)(kmer%prime);
+ for(final int max=cell+extra; cell<max; cell++){
+ final long n=array[cell];
+ if(n==kmer){return cell;}
+ else if(n==NOT_PRESENT){return NOT_PRESENT;}
+ }
+ return HASH_COLLISION;
+ }
+
+ /**
+  * Locates the cell holding a kmer, or the first empty cell in its probe window.
+  * @param kmer Key to find
+  * @return The index of the first cell that holds the kmer or is empty;
+  * HASH_COLLISION if the whole probe window is occupied by other keys
+  */
+ final int findKmerOrEmpty(long kmer){
+     int cell=(int)(kmer%prime);
+     final int limit=cell+extra;
+     while(cell<limit){
+         final long occupant=array[cell];
+         if(occupant==kmer || occupant==NOT_PRESENT){
+             return cell;
+         }
+         cell++;
+     }
+     return HASH_COLLISION;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ final boolean canResize() {return true;}
+
+ @Override
+ final public long size() {return size;}
+
+ @Override
+ final public int arrayLength() {return array.length;}
+
+ /** Grows the table; called by the insertion methods when autoResize is set and size+victims.size exceeds sizeLimit. */
+ @Override
+ protected abstract void resize();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Prints every stored kmer and its value(s) as text, one per line, followed by the victim cache's contents.
+  * In the one-dimensional branch, entries whose value is below mincount are skipped when mincount is at least 2;
+  * the TWOD branch does not filter by mincount.
+  * @param tsw Destination writer
+  * @param k Kmer length passed to toText
+  * @param mincount Minimum value for an entry to be printed (one-dimensional branch only)
+  * @return Always true
+  */
+ @Override
+ public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount){
+     if(TWOD){
+         final int[] singleton=new int[1];
+         for(int cell=0; cell<array.length; cell++){
+             final long kmer=array[cell];
+             if(kmer==NOT_PRESENT){continue;}
+             tsw.print(toText(kmer, readCellValues(cell, singleton), k).append('\n'));
+         }
+     }else{
+         for(int cell=0; cell<array.length; cell++){
+             final long kmer=array[cell];
+             if(kmer==NOT_PRESENT){continue;}
+             if(mincount>=2 && readCellValue(cell)<mincount){continue;}
+             tsw.print(toText(kmer, readCellValue(cell), k).append('\n'));
+         }
+     }
+     if(victims!=null){
+         victims.dumpKmersAsText(tsw, k, mincount);
+     }
+     return true;
+ }
+
+ /**
+  * Writes every stored kmer and its value(s) through the byte writer, followed by the victim cache's contents.
+  * In the one-dimensional branch, entries whose value is below mincount are skipped when mincount is at least 2;
+  * the TWOD branch does not filter by mincount.
+  * @param bsw Destination writer
+  * @param k Kmer length passed to printlnKmer
+  * @param mincount Minimum value for an entry to be written (one-dimensional branch only)
+  * @return Always true
+  */
+ @Override
+ public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+     if(TWOD){
+         final int[] singleton=new int[1];
+         for(int cell=0; cell<array.length; cell++){
+             final long kmer=array[cell];
+             if(kmer==NOT_PRESENT){continue;}
+             bsw.printlnKmer(kmer, readCellValues(cell, singleton), k);
+         }
+     }else{
+         for(int cell=0; cell<array.length; cell++){
+             final long kmer=array[cell];
+             if(kmer==NOT_PRESENT){continue;}
+             if(mincount>=2 && readCellValue(cell)<mincount){continue;}
+             bsw.printlnKmer(kmer, readCellValue(cell), k);
+         }
+     }
+     if(victims!=null){
+         victims.dumpKmersAsBytes(bsw, k, mincount);
+     }
+     return true;
+ }
+
+ /**
+  * Variant of dumpKmersAsBytes for use by several threads sharing one writer.
+  * Entries are appended to the caller's ByteBuilder; whenever it reaches 16000 bytes a copy is
+  * submitted to the writer while holding the writer's monitor, and the builder is cleared.
+  * Bytes still in the builder when this method returns are left for the caller to submit.
+  * @param bsw Shared destination writer
+  * @param bb Caller-owned buffer, reused across calls
+  * @param k Kmer length passed to toBytes
+  * @param mincount Minimum value for an entry to be written (one-dimensional branch only, when at least 2)
+  * @return Always true
+  */
+ @Override
+ public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+     if(TWOD){
+         final int[] singleton=new int[1];
+         for(int cell=0; cell<array.length; cell++){
+             final long kmer=array[cell];
+             if(kmer==NOT_PRESENT){continue;}
+             toBytes(kmer, readCellValues(cell, singleton), k, bb);
+             bb.append('\n');
+             if(bb.length()<16000){continue;}
+             final ByteBuilder full=new ByteBuilder(bb);
+             synchronized(bsw){bsw.addJob(full);}
+             bb.clear();
+         }
+     }else{
+         for(int cell=0; cell<array.length; cell++){
+             final long kmer=array[cell];
+             if(kmer==NOT_PRESENT){continue;}
+             if(mincount>=2 && readCellValue(cell)<mincount){continue;}
+             toBytes(kmer, readCellValue(cell), k, bb);
+             bb.append('\n');
+             if(bb.length()<16000){continue;}
+             final ByteBuilder full=new ByteBuilder(bb);
+             synchronized(bsw){bsw.addJob(full);}
+             bb.clear();
+         }
+     }
+     if(victims!=null){
+         victims.dumpKmersAsBytes_MT(bsw, bb, k, mincount);
+     }
+     return true;
+ }
+
+ /**
+  * Adds this table's values to a histogram: for every occupied cell the bucket at
+  * min(value, max) is incremented, then the victim cache adds its own entries.
+  * @param ca Histogram counts, indexed by value
+  * @param max Highest bucket index; larger values are counted there
+  */
+ @Override
+ public final void fillHistogram(long[] ca, int max){
+     for(int cell=0; cell<array.length; cell++){
+         if(array[cell]==NOT_PRESENT){continue;}
+         final int bucket=Tools.min(readCellValue(cell), max);
+         ca[bucket]++;
+     }
+     if(victims!=null){
+         victims.fillHistogram(ca, max);
+     }
+ }
+
+ /** @return The victim cache that receives kmers whose probe window in the primary array is full */
+ public HashForest victims(){
+     final HashForest cache=victims;
+     return cache;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Per-cell owner ids, parallel to array; null until initializeOwnership() and again after clearOwnership() */
+ AtomicIntegerArray owners;
+ /** Primary kmer storage of prime+extra cells; empty cells hold NOT_PRESENT */
+ long[] array;
+ /** Modulus applied to a kmer to find the first cell of its probe window */
+ int prime;
+ /** Number of kmers placed in the primary array; the victim cache keeps its own count in victims.size */
+ long size=0;
+ /** Insertions call resize() when autoResize is set and size+victims.size exceeds this */
+ long sizeLimit;
+ /** Overflow store for kmers whose probe window in the primary array is full */
+ final HashForest victims;
+ /** Whether insertions may trigger resize() */
+ final boolean autoResize;
+ /** Constructor flag; selects the multi-value branch in the dump methods */
+ public final boolean TWOD;
+ /** Lock handed out by getLock() */
+ private final Lock lock=new ReentrantLock();
+ 
+ /** @return The owner array, or null when ownership is not initialized */
+ public AtomicIntegerArray owners() {return owners;}
+ /** @return The lock associated with this table */
+ @Override
+ final Lock getLock(){return lock;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Length of the linear probe window; also the number of cells allocated beyond prime */
+ final static int extra=21;
+ /** Upper bound on prime; presumably chosen so that prime+extra still fits in an int - confirm against Primes.primeAtMost */
+ final static int maxPrime=(int)Primes.primeAtMost(Integer.MAX_VALUE-extra);
+ final static float resizeMult=2f; //Resize by a minimum of this much
+ final static float minLoadFactor=0.58f; //Resize by enough to get the load above this factor
+ final static float maxLoadFactor=0.905f; //Reaching this load triggers resizing
+ /** Reciprocal of minLoadFactor; resize() multiplies the element count by it to get the largest allowed array size */
+ final static float minLoadMult=1/minLoadFactor;
+ /** Reciprocal of maxLoadFactor; resize() multiplies the element count by it to get the smallest allowed array size */
+ final static float maxLoadMult=1/maxLoadFactor;
+
+}
diff --git a/current/kmer/HashArray1D.java b/current/kmer/HashArray1D.java
new file mode 100755
index 0000000..4709fd1
--- /dev/null
+++ b/current/kmer/HashArray1D.java
@@ -0,0 +1,219 @@
+package kmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and counts in an int[], with a victim cache.
+ * @author Brian Bushnell
+ * @date Oct 25, 2013
+ *
+ */
+public final class HashArray1D extends HashArray {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashArray1D(int initialSize, boolean autoResize_){
+ super(initialSize, autoResize_, false);
+ values=allocInt1D(prime+extra);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Increments the count of a kmer, adding it with a count of 1 if absent.
+  * A count that would overflow is pinned at Integer.MAX_VALUE.
+  * @param kmer Key to increment
+  * @return The count after incrementing; for kmers stored in the victim cache, the result of victims.increment
+  */
+ @Override
+ public final int increment(final long kmer){
+     int cell=(int)(kmer%prime);
+     final int limit=cell+extra;
+
+     while(cell<limit){
+         final long occupant=array[cell];
+         if(occupant==kmer){
+             int count=values[cell]+1;
+             if(count<0){count=Integer.MAX_VALUE;}
+             values[cell]=count;
+             return count;
+         }
+         if(occupant==NOT_PRESENT){
+             array[cell]=kmer;
+             size++;
+             values[cell]=1;
+             if(autoResize && size+victims.size>sizeLimit){resize();}
+             return 1;
+         }
+         cell++;
+     }
+
+     final int count=victims.increment(kmer);
+     if(autoResize && size+victims.size>sizeLimit){resize();}
+     return count;
+ }
+
+ /**
+  * Increments the count of a kmer, adding it with a count of 1 if absent.
+  * A count that would overflow is pinned at Integer.MAX_VALUE.
+  * @param kmer Key to increment
+  * @return 1 if the kmer was newly added to the primary array, 0 if it already existed there;
+  * for kmers routed to the victim cache, the result of victims.incrementAndReturnNumCreated
+  */
+ @Override
+ public final int incrementAndReturnNumCreated(final long kmer){
+     int cell=(int)(kmer%prime);
+
+     for(final int max=cell+extra; cell<max; cell++){
+         long n=array[cell];
+         if(n==kmer){
+             values[cell]++;
+             if(values[cell]<0){values[cell]=Integer.MAX_VALUE;}
+             return 0;
+         }else if(n==NOT_PRESENT){
+             array[cell]=kmer;
+             size++;
+             values[cell]=1;
+             if(autoResize && size+victims.size>sizeLimit){resize();}
+             return 1;
+         }
+     }
+     final int x=victims.incrementAndReturnNumCreated(kmer);
+     //The victim cache may have grown, so apply the same resize check that increment() uses on this path.
+     if(autoResize && size+victims.size>sizeLimit){resize();}
+     return x;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * @param cell Index into the value array
+  * @return The count stored for that cell
+  */
+ @Override
+ public final int readCellValue(int cell){
+     final int count=values[cell];
+     return count;
+ }
+
+ /**
+  * Copies the cell's count into the supplied scratch array.
+  * @param cell Index into the value array
+  * @param singleton Array whose first element is overwritten
+  * @return The same singleton array
+  */
+ @Override
+ protected final int[] readCellValues(int cell, int[] singleton){
+     final int count=values[cell];
+     singleton[0]=count;
+     return singleton;
+ }
+
+ /**
+  * Overwrites the count of the cell that holds the kmer.
+  * @param kmer Key expected in the cell (asserted)
+  * @param v New count
+  * @param cell Index of the key's cell
+  */
+ @Override
+ protected final void insertValue(long kmer, int v, int cell){
+     assert(array[cell]==kmer);
+     values[cell]=v;
+ }
+
+ /**
+  * Overwrites the count of the cell that holds the kmer, using the only element of vals.
+  * @param kmer Key expected in the cell (asserted)
+  * @param vals One-element array (asserted) holding the new count
+  * @param cell Index of the key's cell
+  */
+ @Override
+ protected final void insertValue(long kmer, int[] vals, int cell){
+     assert(array[cell]==kmer);
+     assert(vals.length==1);
+     final int count=vals[0];
+     values[cell]=count;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ /**
+  * Grows the primary array to a larger prime size and re-inserts every kmer, including those in the victim cache.
+  * Returns without growing when the table is already at maxPrime, when the current array is already larger
+  * than the minimum load factor allows for the stored element count, or when no larger prime is selected;
+  * in each of those cases only sizeLimit is updated.
+  */
+ @Override
+ protected synchronized void resize(){
+// assert(false);
+// System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ //Already at the largest size: raise the limit so the resize check in the insertion methods stops firing.
+ if(prime>=maxPrime){
+ sizeLimit=0xFFFFFFFFFFFFL;
+ return;
+ }
+ 
+ final long oldSize=size, oldVSize=victims.size;
+ final long totalSize=oldSize+oldVSize;
+ 
+ //Bounds on the new array size derived from the element count and the min/max load factors.
+ final long maxAllowedByLoadFactor=(long)(totalSize*minLoadMult);
+ final long minAllowedByLoadFactor=(long)(totalSize*maxLoadMult);
+
+// sizeLimit=Tools.min((long)(maxLoadFactor*prime), maxPrime);
+ 
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ //The current array is already bigger than the largest size the load factor permits; just refresh the limit.
+ if(maxAllowedByLoadFactor<prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ return;
+ }
+ 
+ //Target: at least resizeMult times the current size, clamped into the load-factor bounds.
+ long x=10+(long)(prime*resizeMult);
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+ 
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+ 
+ if(prime2<=prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ assert(prime2==prime) : "Resizing to smaller array? "+totalSize+", "+prime+", "+x;
+ return;
+ }
+ 
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ //Keep references to the old storage, then install empty replacements.
+ long[] oldk=array;
+ int[] oldc=values;
+ KmerNode[] oldv=victims.array;
+ array=allocLong1D(prime2+extra);
+ Arrays.fill(array, NOT_PRESENT);
+ values=allocInt1D(prime2+extra);
+ //Snapshot the victim nodes before the victim array is wiped.
+ ArrayList<KmerNode> list=victims.toList();
+ Arrays.fill(oldv, null);
+ victims.size=0;
+ size=0;
+ //With the limit at its maximum, the set() calls below cannot trigger a nested resize.
+ sizeLimit=Long.MAX_VALUE;
+ 
+ for(int i=0; i<oldk.length; i++){
+ if(oldk[i]>NOT_PRESENT){set(oldk[i], oldc[i]);}
+ }
+ 
+ for(KmerNode n : list){
+ if(n.pivot>NOT_PRESENT){set(n.pivot, n.value());}
+ }
+ 
+ assert(oldSize+oldVSize==size+victims.size) : oldSize+", "+oldVSize+" -> "+size+", "+victims.size;
+ 
+ sizeLimit=(long)(maxLoadFactor*prime);
+ }
+
+ /**
+  * Not implemented for this table type.
+  * @throws RuntimeException always
+  */
+ @Deprecated
+ @Override
+ public void rebalance(){
+     throw new RuntimeException("Unimplemented.");
+ }
+
+ /**
+  * Rebuilds the table in place, dropping every kmer whose value is less than 1.
+  * Each occupied cell is emptied and, if its value is positive, re-inserted with set();
+  * the victim cache is then emptied and its positive-valued nodes are re-inserted the same way.
+  * Ownership must be cleared first (asserted).
+  * @return The number of kmers dropped
+  */
+ @Override
+ public long regenerate(){
+ long sum=0;
+ assert(owners==null) : "Clear ownership before regeneration.";
+ for(int pos=0; pos<values.length; pos++){
+ final long key=array[pos];
+ if(key>=0){
+ final int value=values[pos];
+ //Empty the cell before re-inserting so that set() sees it as free.
+ values[pos]=NOT_PRESENT;
+ array[pos]=NOT_PRESENT;
+ size--;
+ if(value>0){
+ set(key, value);
+ }else{
+ sum++;
+ }
+ }
+ }
+ 
+ //Snapshot the victim nodes, then clear the cache so set() can place them afresh.
+ ArrayList<KmerNode> nodes=victims.toList();
+ victims.clear();
+ for(KmerNode node : nodes){
+ int value=node.value();
+ if(value<1){
+ sum++;
+ }else{
+ set(node.pivot, node.value());
+ }
+ }
+ 
+ return sum;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Count for each primary-array cell, parallel to array; replaced on resize */
+ private int[] values;
+ 
+ /** @return The live count array itself, not a copy */
+ public int[] values(){return values;}
+
+
+
+}
diff --git a/current/kmer/HashArray2D.java b/current/kmer/HashArray2D.java
new file mode 100755
index 0000000..df0b152
--- /dev/null
+++ b/current/kmer/HashArray2D.java
@@ -0,0 +1,222 @@
+package kmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and values in an int[][], with a victim cache.
+ * @author Brian Bushnell
+ * @date Nov 7, 2014
+ *
+ */
+public final class HashArray2D extends HashArray {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashArray2D(int initialSize, boolean autoResize_){
+ super(initialSize, autoResize_, true);
+ values=allocInt2D(prime+extra);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Counting is not supported by the two-dimensional table.
+  * @throws RuntimeException always
+  */
+ @Deprecated
+ @Override
+ public int increment(final long kmer){
+     throw new RuntimeException("Unsupported.");
+ }
+
+ /**
+  * Counting is not supported by the two-dimensional table.
+  * @throws RuntimeException always
+  */
+ @Deprecated
+ @Override
+ public int incrementAndReturnNumCreated(final long kmer){
+     throw new RuntimeException("Unsupported.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * @param cell Index into the value array
+  * @return The first value stored for the cell, or 0 if the cell has no value array
+  */
+ @Override
+ protected final int readCellValue(int cell){
+     final int[] set=values[cell];
+     if(set==null){
+         return 0;
+     }
+     return set[0];
+ }
+
+ /**
+  * @param cell Index into the value array
+  * @param singleton Ignored by this implementation
+  * @return The cell's own value array (not a copy); null if nothing has been stored for the cell
+  */
+ @Override
+ protected final int[] readCellValues(int cell, int[] singleton){
+     final int[] set=values[cell];
+     return set;
+ }
+
+ /**
+  * Adds a value to the set stored for the kmer in the given cell; nothing is returned.
+  * A cell without a value array gets a new two-element array. Otherwise the value is skipped if it is
+  * already present, written into the first negative (unused) slot, or, when the array is full,
+  * appended after doubling the array and padding the new slots with NOT_PRESENT.
+  * @param kmer Key expected in the cell (asserted)
+  * @param v Value to add
+  * @param cell Index of the key's cell
+  */
+ @Override
+ protected final void insertValue(final long kmer, final int v, final int cell){
+     assert(array[cell]==kmer);
+     if(values[cell]==null){
+         values[cell]=new int[] {v, NOT_PRESENT};
+         return;
+     }
+     int[] set=values[cell];
+     assert(set!=null);
+
+     for(int i=0; i<set.length; i++){
+         if(set[i]==v){return;}
+         else if(set[i]<0){set[i]=v;return;}
+     }
+     //No free slot: double the array, with the length capped at Integer.MAX_VALUE.
+     final int oldSize=set.length;
+     final int newSize=(int)Tools.min(Integer.MAX_VALUE, oldSize*2L);
+     assert(newSize>set.length) : "Overflow.";
+     set=Arrays.copyOf(set, newSize);
+     set[oldSize]=v;
+     Arrays.fill(set, oldSize+1, newSize, NOT_PRESENT);
+     values[cell]=set;
+ }
+
+ /**
+  * Adds a set of values for the kmer in the given cell; nothing is returned.
+  * If the cell has no value array yet, vals itself is stored by reference (not copied).
+  * Otherwise the elements of vals are added one at a time, stopping at the first negative element.
+  * @param kmer Key expected in the cell (asserted)
+  * @param vals Values to add
+  * @param cell Index of the key's cell
+  */
+ @Override
+ protected final void insertValue(final long kmer, final int[] vals, final int cell){
+     assert(array[cell]==kmer);
+     if(values[cell]==null){
+         values[cell]=vals;
+     }else{
+         for(int v : vals){
+             if(v<0){break;}
+             insertValue(kmer, v, cell);
+         }
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ /**
+  * Grows the primary array to a larger prime size and re-inserts every kmer with its value array,
+  * including those in the victim cache.
+  * Returns without growing when the table is already at maxPrime, when the current array is already larger
+  * than the minimum load factor allows for the stored element count, or when no larger prime is selected;
+  * in each of those cases only sizeLimit is updated.
+  */
+ @Override
+ protected synchronized void resize(){
+// assert(false);
+// System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ //Already at the largest size: raise the limit so the resize check in the insertion methods stops firing.
+ if(prime>=maxPrime){
+ sizeLimit=0xFFFFFFFFFFFFL;
+ return;
+ }
+ 
+ final long oldSize=size, oldVSize=victims.size;
+ final long totalSize=oldSize+oldVSize;
+ 
+ //Bounds on the new array size derived from the element count and the min/max load factors.
+ final long maxAllowedByLoadFactor=(long)(totalSize*minLoadMult);
+ final long minAllowedByLoadFactor=(long)(totalSize*maxLoadMult);
+
+// sizeLimit=Tools.min((long)(maxLoadFactor*prime), maxPrime);
+ 
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ //The current array is already bigger than the largest size the load factor permits; just refresh the limit.
+ if(maxAllowedByLoadFactor<prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ return;
+ }
+ 
+ //Target: at least resizeMult times the current size, clamped into the load-factor bounds.
+ long x=10+(long)(prime*resizeMult);
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+ 
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+ 
+ if(prime2<=prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ assert(prime2==prime) : "Resizing to smaller array? "+totalSize+", "+prime+", "+x;
+ return;
+ }
+// System.err.println("Resizing from "+prime+" to "+prime2+"; size="+size);
+ 
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ //Keep references to the old storage, then install empty replacements.
+ long[] oldk=array;
+ int[][] oldc=values;
+ KmerNode[] oldv=victims.array;
+ array=allocLong1D(prime2+extra);
+ Arrays.fill(array, NOT_PRESENT);
+ values=allocInt2D(prime2+extra);
+ //Collect the victim nodes before the victim array is wiped.
+ ArrayList<KmerNode> list=new ArrayList<KmerNode>((int)(victims.size)); //Can fail if more than Integer.MAX_VALUE
+ for(int i=0; i<oldv.length; i++){
+ if(oldv[i]!=null){oldv[i].traverseInfix(list);}
+ }
+ Arrays.fill(oldv, null);
+ victims.size=0;
+ size=0;
+ //With the limit at its maximum, the set() calls below cannot trigger a nested resize.
+ sizeLimit=Long.MAX_VALUE;
+ 
+ final int[] singleton=new int[] {NOT_PRESENT};
+ 
+ for(int i=0; i<oldk.length; i++){
+ if(oldk[i]>NOT_PRESENT){
+// assert(!contains(oldk[i]));
+ set(oldk[i], oldc[i]);
+// assert(contains(oldk[i]));
+// assert(Tools.equals(getValues(oldk[i], singleton), oldc[i]));
+ }
+ }
+ 
+ //NOTE(review): if n.values(singleton) can return the shared singleton array, insertValue would store that
+ //shared array by reference in an empty cell - confirm against the KmerNode implementations.
+ for(KmerNode n : list){
+ if(n.pivot>NOT_PRESENT){
+// assert(!contains(n.pivot));
+ set(n.pivot, n.values(singleton));
+// assert(contains(n.pivot));
+// assert(Tools.equals(getValues(n.pivot, singleton), n.values(singleton)));
+ }
+ }
+ 
+ assert(oldSize+oldVSize==size+victims.size) : oldSize+", "+oldVSize+" -> "+size+", "+victims.size;
+ 
+ sizeLimit=(long)(maxLoadFactor*prime);
+ }
+
+ /**
+  * Not implemented for this table type.
+  * @throws RuntimeException always
+  */
+ @Deprecated
+ @Override
+ public void rebalance(){
+     throw new RuntimeException("Unimplemented.");
+ }
+
+ /**
+  * Rebuilds the table in place, dropping kmers that have no value array.
+  * Marked untested by its own leading assertion, which fails whenever assertions are enabled.
+  * Ownership must be cleared first (asserted).
+  * @return The number of primary-array kmers dropped because their value array was null
+  */
+ @Deprecated
+ @Override
+ public long regenerate(){
+ assert(false) : "This is not tested or intended for use.";
+ long sum=0;
+ assert(owners==null) : "Clear ownership before regeneration.";
+ for(int pos=0; pos<values.length; pos++){
+ final long key=array[pos];
+ if(key>=0){
+ final int[] value=values[pos];
+ //Empty the cell before re-inserting so that set() sees it as free.
+ values[pos]=null;
+ array[pos]=NOT_PRESENT;
+ size--;
+ if(value!=null){
+ assert(value[0]>0);
+ set(key, value);
+ }else{
+ sum++;
+ }
+ }
+ }
+ 
+ //Snapshot the victim nodes, then clear the cache; every node is re-inserted without filtering.
+ ArrayList<KmerNode> nodes=victims.toList();
+ victims.clear();
+ for(KmerNode node : nodes){
+ set(node.pivot, node.values(null));//TODO: Probably unsafe or unwise. Should test for singletons, etc.
+ }
+ 
+ return sum;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Value array for each primary-array cell, parallel to array; an entry is null until a value is inserted for its cell */
+ private int[][] values;
+
+
+
+}
diff --git a/current/kmer/HashArrayHybrid.java b/current/kmer/HashArrayHybrid.java
new file mode 100755
index 0000000..5e5e213
--- /dev/null
+++ b/current/kmer/HashArrayHybrid.java
@@ -0,0 +1,356 @@
+package kmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.IntList2;
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and counts in an int[], with a victim cache.
+ * @author Brian Bushnell
+ * @date Oct 25, 2013
+ *
+ */
+public final class HashArrayHybrid extends HashArray {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashArrayHybrid(int initialSize, boolean autoResize_){
+ super(initialSize, autoResize_, true);
+ values=allocInt1D(prime+extra);
+ setList=new IntList2();
+ setList.add(null);
+ setList.add(null);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Increments the count of a kmer, adding it with a count of 1 if absent.
+  * A count that would overflow is pinned at Integer.MAX_VALUE. Asserts that the matching cell
+  * holds a non-negative value, i.e. a plain count rather than a reference into the side list.
+  * @param kmer Key to increment
+  * @return The count after incrementing; for kmers stored in the victim cache, the result of victims.increment
+  */
+ @Override
+ public final int increment(final long kmer){
+     int cell=(int)(kmer%prime);
+     final int limit=cell+extra;
+
+     while(cell<limit){
+         final long occupant=array[cell];
+         assert(occupant>-2);
+         if(occupant==kmer){
+             assert(values[cell]>=0);
+             int count=values[cell]+1;
+             if(count<0){count=Integer.MAX_VALUE;}
+             values[cell]=count;
+             return count;
+         }
+         if(occupant==NOT_PRESENT){
+             array[cell]=kmer;
+             size++;
+             values[cell]=1;
+             if(autoResize && size+victims.size>sizeLimit){resize();}
+             return 1;
+         }
+         cell++;
+     }
+
+     final int count=victims.increment(kmer);
+     if(autoResize && size+victims.size>sizeLimit){resize();}
+     return count;
+ }
+
+ /**
+  * Increments the count of a kmer, adding it with a count of 1 if absent.
+  * A count that would overflow is pinned at Integer.MAX_VALUE.
+  * @param kmer Key to increment
+  * @return 1 if the kmer was newly added to the primary array, 0 if it already existed there;
+  * for kmers routed to the victim cache, the result of victims.incrementAndReturnNumCreated
+  */
+ @Override
+ public final int incrementAndReturnNumCreated(final long kmer){
+     int cell=(int)(kmer%prime);
+     final int limit=cell+extra;
+
+     while(cell<limit){
+         final long occupant=array[cell];
+         assert(occupant>-2);
+         if(occupant==kmer){
+             assert(values[cell]>=0);
+             final int count=values[cell]+1;
+             values[cell]=(count<0 ? Integer.MAX_VALUE : count);
+             return 0;
+         }
+         if(occupant==NOT_PRESENT){
+             array[cell]=kmer;
+             size++;
+             values[cell]=1;
+             if(autoResize && size+victims.size>sizeLimit){resize();}
+             return 1;
+         }
+         cell++;
+     }
+
+     final int created=victims.incrementAndReturnNumCreated(kmer);
+     if(autoResize && size+victims.size>sizeLimit){resize();}
+     return created;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Reads the (first) value of a cell.
+  * A stored int greater than -2 is the value itself; anything lower is the negated index of the
+  * cell's value array in the side list, whose first element is returned.
+  * @param cell Index into the value array
+  * @return The cell's value, or the first element of its value array
+  */
+ @Override
+ protected final int readCellValue(int cell){
+     final int stored=values[cell];
+     return stored>-2 ? stored : setList.get(-stored)[0];
+ }
+
+ /**
+  * Reads all values of a cell.
+  * @param cell Index into the value array
+  * @param singleton Scratch array, written and returned when the cell stores its value directly
+  * @return The cell's value array from the side list when the stored int is below -1,
+  * otherwise singleton holding the stored int
+  */
+ @Override
+ protected final int[] readCellValues(int cell, int[] singleton){
+     final int stored=values[cell];
+     if(stored<=-2){
+         return setList.get(-stored);
+     }
+     singleton[0]=stored;
+     return singleton;
+ }
+
+ /**
+  * Adds a set of values for the kmer in the given cell.
+  * A one-element array is forwarded to the single-value insertValue. Otherwise: nothing happens if the cell
+  * already stores vals[0] directly and vals has no second value; if the cell already refers to a value array
+  * in the side list, the elements of vals up to the first -1 are merged into it; in all other cases a value
+  * array is appended to the side list and the cell stores its negated index.
+  * @param kmer Key expected in the cell (asserted)
+  * @param vals Values to add; stored by reference when the cell held no positive value
+  * @param cell Index of the key's cell
+  */
+ @Override
+ protected final void insertValue(long kmer, int[] vals, int cell) {
+     if(verbose){System.err.println("insertValue("+kmer+", "+Arrays.toString(vals)+", "+cell+"); old="+values[cell]);}
+     assert(array[cell]==kmer);
+     if(vals.length==1){
+         if(verbose){System.err.println("A: length=1");}
+         insertValue(kmer, vals[0], cell);
+         return;
+     }
+     final int old=values[cell];
+     if(old==vals[0] && vals[1]==NOT_PRESENT){
+         if(verbose){System.err.println("B: old==vals[0] && vals[1]==-1");}
+         return; //Nothing to do
+     }else if(old<-1){//An array already exists
+         if(verbose){System.err.println("C: old<-1");}
+         for(int i : vals){
+             if(i==-1){break;}
+             insertIntoList(i, -old);
+         }
+     }else{//Add the list
+         final int[] temp;
+         if(old>0){//Move the old value to a new array.  Note that this will probably never be used.
+             if(verbose){System.err.println("D: old>0");}
+             temp=allocInt1D(vals.length+1);
+             temp[0]=old;
+             for(int i=0; i<vals.length; i++){temp[i+1]=vals[i];}
+         }else{
+             //This branch runs only when old is not positive; the message previously repeated branch D's text.
+             if(verbose){System.err.println("E: old<=0");}
+             temp=vals;
+         }
+         //The new array's index in the side list is the list's current size; store it negated.
+         values[cell]=-setList.size;
+         setList.add(temp);
+     }
+ }
+
+ /**
+  * Adds a single positive value for the kmer in the given cell.
+  * Nothing happens if the cell already stores exactly v. If the cell refers to a value array in the side list,
+  * v is merged into it. If the cell stores a different positive value, both are moved into a new four-element
+  * array in the side list. Otherwise v is stored directly in the cell.
+  * @param kmer Key expected in the cell (asserted)
+  * @param v Value to add; asserted to be positive
+  * @param cell Index of the key's cell
+  */
+ @Override
+ protected final void insertValue(long kmer, int v, int cell){
+     assert(array[cell]==kmer);
+     assert(v>0);
+     final int stored=values[cell];
+     if(stored==v){
+         return;
+     }
+     if(stored<-1){
+         insertIntoList(v, -stored);
+         return;
+     }
+     if(stored>0){
+         values[cell]=-setList.size;
+         setList.add(new int[] {stored, v, -1, -1});
+         return;
+     }
+     values[cell]=v;
+ }
+
+ /**
+  * Adds a value to the value array at the given index of the side list.
+  * The list is extended by one null entry if loc equals its size, and a missing array is created with two
+  * unused slots. The value is skipped if already present, written into the first negative slot, or,
+  * when the array is full, appended after doubling the array and padding the new slots with -1.
+  * @param v Value to add
+  * @param loc Index into the side list
+  * @return 1 if the value was added, 0 if it was already present
+  */
+ private final int insertIntoList(final int v, final int loc){
+
+     if(loc>=setList.size){
+         assert(loc==setList.size);
+         setList.add(null);
+     }
+
+     int[] slots=setList.get(loc);
+     if(slots==null){
+         slots=new int[] {-1, -1};
+         setList.set(loc, slots);
+     }
+
+     for(int i=0; i<slots.length; i++){
+         final int current=slots[i];
+         if(current==v){return 0;}
+         if(current<0){
+             slots[i]=v;
+             return 1;
+         }
+     }
+
+     //Array is full: double it, with the length capped at Integer.MAX_VALUE.
+     final int oldSize=slots.length;
+     final int newSize=(int)Tools.min(Integer.MAX_VALUE, oldSize*2L);
+     assert(newSize>slots.length) : "Overflow.";
+     final int[] grown=Arrays.copyOf(slots, newSize);
+     grown[oldSize]=v;
+     Arrays.fill(grown, oldSize+1, newSize, -1);
+     setList.set(loc, grown);
+     return 1;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ /**
+  * Moves the table into a larger prime-sized array and reinserts every entry,
+  * both from the main array and from the victim trees.
+  * Returns early, only adjusting sizeLimit, when the table is already at maxPrime
+  * or when the load-factor bounds do not call for a larger array.
+  */
+ @Override
+ protected synchronized void resize(){
+
+     if(verbose){
+         System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+     }
+
+     //Already at the largest capacity: raise the limit instead of growing
+     if(prime>=maxPrime){
+         sizeLimit=0xFFFFFFFFFFFFL;
+         return;
+     }
+
+     final long mainCount=size, victimCount=victims.size;
+     final long total=mainCount+victimCount;
+
+     //Bounds on the new capacity derived from the load-factor multipliers
+     final long upperBound=(long)(total*minLoadMult);
+     final long lowerBound=(long)(total*maxLoadMult);
+
+     assert(upperBound>=lowerBound);
+     if(upperBound<prime){
+         sizeLimit=(long)(maxLoadFactor*prime);
+         return;
+     }
+
+     long target=10+(long)(prime*resizeMult);
+     target=Tools.max(target, lowerBound);
+     target=Tools.min(target, upperBound);
+
+     final int nextPrime=(int)Tools.min(maxPrime, Primes.primeAtLeast(target));
+
+     if(nextPrime<=prime){
+         sizeLimit=(long)(maxLoadFactor*prime);
+         assert(nextPrime==prime) : "Resizing to smaller array? "+total+", "+prime+", "+target;
+         return;
+     }
+
+     prime=nextPrime;
+
+     //Detach the old storage and allocate fresh arrays
+     final long[] priorKmers=array;
+     final int[] priorValues=values;
+     final IntList2 priorLists=setList;
+     setList=new IntList2();
+     //Two placeholder entries, so every real list index is at least 2 and its negation is below -1
+     setList.add(null);
+     setList.add(null);
+     final KmerNode[] victimRoots=victims.array;
+     array=allocLong1D(nextPrime+extra);
+     Arrays.fill(array, -1);
+     values=allocInt1D(nextPrime+extra);
+
+     //Collect all victim nodes before the victim table is emptied
+     final ArrayList<KmerNode> victimNodes=new ArrayList<KmerNode>((int)(victims.size)); //Can fail if more than Integer.MAX_VALUE
+     for(KmerNode root : victimRoots){
+         if(root!=null){root.traverseInfix(victimNodes);}
+     }
+     Arrays.fill(victimRoots, null);
+     victims.size=0;
+     size=0;
+     //NOTE(review): presumably keeps the reinsertions below from triggering a nested resize - confirm against set()
+     sizeLimit=Long.MAX_VALUE;
+
+     //Reinsert entries from the old main array; -1 marks an empty cell
+     for(int i=0; i<priorKmers.length; i++){
+         final long kmer=priorKmers[i];
+         if(kmer==-1){continue;}
+         final int v=priorValues[i];
+
+         assert(v<-1 || v>0);
+         if(v<0){
+             //Negated list index: carry over the whole value list
+             set(kmer, priorLists.get(-v));
+         }else{
+             set(kmer, v);
+         }
+     }
+
+     //Reinsert entries that lived in the victim trees
+     final int[] singleton=new int[1];
+     for(KmerNode node : victimNodes){
+         if(node.pivot<=-1){continue;}
+         if(node.numValues()>1){
+             set(node.pivot, node.values(singleton));
+         }else{
+             set(node.pivot, node.value());
+         }
+     }
+
+     assert(mainCount+victimCount==size+victims.size) : mainCount+" + "+victimCount+" = "+(mainCount+victimCount)+" -> "+size+" + "+victims.size+" = "+(size+victims.size);
+
+     if(verbose){System.err.println("Resized to "+prime+". "+mainCount+" + "+victimCount+" = "+(mainCount+victimCount)+" -> "+size+" + "+victims.size+" = "+(size+victims.size));}
+
+     sizeLimit=(long)(maxLoadFactor*prime);
+ }
+
+ /**
+  * Not available for this table.
+  * @throws RuntimeException always
+  */
+ @Deprecated
+ @Override
+ public void rebalance(){
+     final String message="Unimplemented.";
+     throw new RuntimeException(message);
+ }
+
+ /**
+  * Not available for this table.
+  * @return never returns normally
+  * @throws RuntimeException always
+  */
+ @Deprecated
+ @Override
+ public long regenerate(){
+     final String message="Not supported.";
+     throw new RuntimeException(message);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int[] values;
+ private IntList2 setList;
+
+
+
+}
diff --git a/current/kmer/HashBuffer.java b/current/kmer/HashBuffer.java
new file mode 100755
index 0000000..61969d7
--- /dev/null
+++ b/current/kmer/HashBuffer.java
@@ -0,0 +1,269 @@
+package kmer;
+
+import stream.ByteBuilder;
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 22, 2013
+ *
+ */
+public class HashBuffer extends AbstractKmerTable {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashBuffer(AbstractKmerTable[] tables_, int buflen_, int k_, boolean initValues){
+ tables=tables_;
+ buflen=buflen_;
+ halflen=(int)Math.ceil(buflen*0.5);
+ ways=tables.length;
+ buffers=new KmerBuffer[ways];
+ for(int i=0; i<ways; i++){
+ buffers[i]=new KmerBuffer(buflen, k_, initValues);
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public int incrementAndReturnNumCreated(long kmer) {
+ final int way=(int)(kmer%ways);
+ KmerBuffer buffer=buffers[way];
+ final int size=buffer.add(kmer);
+ if(size>=halflen && (size>=buflen || (size&SIZEMASK)==0)){
+ return dumpBuffer(way, size>=buflen);
+ }
+ return 0;
+ }
+
+ @Override
+ public final long flush(){
+ long added=0;
+ for(int i=0; i<ways; i++){added+=dumpBuffer(i, true);}
+ return added;
+ }
+
+ @Override
+ public int set(long kmer, int value) {
+ throw new RuntimeException("Unimplemented method; this class lacks value buffers");
+ }
+
+ @Override
+ public int set(long kmer, int[] vals) {
+ throw new RuntimeException("Unimplemented method; this class lacks value buffers");
+ }
+
+ @Override
+ public int setIfNotPresent(long kmer, int value) {
+ throw new RuntimeException("Unimplemented method; this class lacks value buffers");
+ }
+
+ @Override
+ public int getValue(long kmer) {
+ final int way=(int)(kmer%ways);
+ return tables[way].getValue(kmer);
+ }
+
+ @Override
+ public int[] getValues(long kmer, int[] singleton){
+ final int way=(int)(kmer%ways);
+ return tables[way].getValues(kmer, singleton);
+ }
+
+ @Override
+ public boolean contains(long kmer) {
+ final int way=(int)(kmer%ways);
+ return tables[way].contains(kmer);
+ }
+
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final void initializeOwnership(){
+ for(AbstractKmerTable t : tables){t.initializeOwnership();}
+ }
+
+ @Override
+ public final void clearOwnership(){
+ for(AbstractKmerTable t : tables){t.clearOwnership();}
+ }
+
+ @Override
+ public final int setOwner(final long kmer, final int newOwner){
+ final int way=(int)(kmer%ways);
+ return tables[way].setOwner(kmer, newOwner);
+ }
+
+ @Override
+ public final boolean clearOwner(final long kmer, final int owner){
+ final int way=(int)(kmer%ways);
+ return tables[way].clearOwner(kmer, owner);
+ }
+
+ @Override
+ public final int getOwner(final long kmer){
+ final int way=(int)(kmer%ways);
+ return tables[way].getOwner(kmer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ Object get(long kmer) {
+ final int way=(int)(kmer%ways);
+ return tables[way].get(kmer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int dumpBuffer(final int way, boolean force){
+ final KmerBuffer buffer=buffers[way];
+ final AbstractKmerTable table=tables[way];
+ final int lim=buffer.size();
+ if(lim<0){return 0;}
+ if(force){table.lock();}
+ else if(!table.tryLock()){return 0;}
+ final int x=dumpBuffer_inner(way);
+ table.unlock();
+ return x;
+ }
+
+ private int dumpBuffer_inner(final int way){
+ if(verbose){System.err.println("Dumping buffer for way "+way+" of "+ways);}
+ final KmerBuffer buffer=buffers[way];
+ final int lim=buffer.size();
+ if(lim<1){return 0;}
+ final long[] kmers=buffer.kmers.array;
+ final int[] values=(buffer.values==null ? null : buffer.values.array);
+ if(lim<1){return 0;}
+ int added=0;
+ final AbstractKmerTable table=tables[way];
+// synchronized(table){
+ if(values==null){
+// Arrays.sort(kmers, 0, lim); //Makes it slower
+ for(int i=0; i<lim; i++){
+ final long kmer=kmers[i];
+ added+=table.incrementAndReturnNumCreated(kmer);
+ }
+ }else{
+ for(int i=0; i<lim; i++){
+ final long kmer=kmers[i];
+ final int value=values[i];
+ added+=table.setIfNotPresent(kmer, value);
+ }
+ }
+// }
+ buffer.clear();
+ return added;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ final boolean canResize() {return false;}
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ @Deprecated
+ @Override
+ public long size() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ public int arrayLength() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ void resize() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ public long regenerate(){
+ long sum=0;
+ for(AbstractKmerTable table : tables){
+ sum+=table.regenerate();
+ }
+ return sum;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount){
+ for(AbstractKmerTable table : tables){
+ table.dumpKmersAsText(tsw, k, mincount);
+ }
+ return true;
+ }
+
+ @Override
+ public boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ for(AbstractKmerTable table : tables){
+ table.dumpKmersAsBytes(bsw, k, mincount);
+ }
+ return true;
+ }
+
+ @Override
+ @Deprecated
+ public boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Override
+ public void fillHistogram(long[] ca, int max){
+ for(AbstractKmerTable table : tables){
+ table.fillHistogram(ca, max);
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public int increment(long kmer) {
+ throw new RuntimeException("Unsupported");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final AbstractKmerTable[] tables;
+ private final int buflen;
+ private final int halflen;
+ private final int ways;
+ private final KmerBuffer[] buffers;
+
+ private final static int SIZEMASK=15;
+
+}
diff --git a/current/kmer/HashForest.java b/current/kmer/HashForest.java
new file mode 100755
index 0000000..b1f902c
--- /dev/null
+++ b/current/kmer/HashForest.java
@@ -0,0 +1,476 @@
+package kmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import stream.ByteBuilder;
+
+
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 23, 2013
+ *
+ */
+public final class HashForest extends AbstractKmerTable implements Iterable<KmerNode> {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashForest(int initialSize, boolean autoResize_){
+ this(initialSize, autoResize_, false);
+ }
+
+ public HashForest(int initialSize, boolean autoResize_, boolean twod_){
+ if(initialSize>1){
+ initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize));
+ }else{
+ initialSize=1;
+ }
+ prime=initialSize;
+ sizeLimit=(long) (initialSize*resizeMult);
+ array=allocKmerNodeArray(prime);
+ autoResize=autoResize_;
+ TWOD=twod_;
+ }
+
+ private KmerNode makeNode(long kmer, int val){
+ return (TWOD ? new KmerNode2D(kmer, val) : new KmerNode1D(kmer, val));
+ }
+
+ private KmerNode makeNode(long kmer, int[] vals){
+ assert(TWOD);
+ return new KmerNode2D(kmer, vals);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public int increment(long kmer){
+ final int cell=(int)(kmer%prime);
+ KmerNode n=array[cell], prev=null;
+ while(n!=null && n.pivot!=kmer){
+ prev=n;
+ n=(kmer<n.pivot ? n.left : n.right);
+ }
+ if(n==null){
+ n=makeNode(kmer, 1);
+ size++;
+ if(prev==null){
+ array[cell]=n;
+ }else{
+ if(kmer<prev.pivot){
+ prev.left=n;
+ }else{
+ prev.right=n;
+ }
+ }
+ if(autoResize && size>sizeLimit){resize();}
+ }else{
+ n.increment(kmer);
+ }
+ return n.value();
+ }
+
+ @Override
+ public int incrementAndReturnNumCreated(long kmer){
+ final int cell=(int)(kmer%prime);
+ KmerNode n=array[cell], prev=null;
+ while(n!=null && n.pivot!=kmer){
+ prev=n;
+ n=(kmer<n.pivot ? n.left : n.right);
+ }
+ if(n==null){
+ n=makeNode(kmer, 1);
+ size++;
+ if(prev==null){
+ array[cell]=n;
+ }else{
+ if(kmer<prev.pivot){
+ prev.left=n;
+ }else{
+ prev.right=n;
+ }
+ }
+ if(autoResize && size>sizeLimit){resize();}
+ return 1;
+ }else{
+ n.increment(kmer);
+ return 0;
+ }
+ }
+
+// public final int set_Test(final long kmer, final int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// int[] old=getValues(kmer, null);
+// assert(old==null || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v==old || !contains(kmer, old));
+// }
+// return x;
+// }
+//
+// public final int setIfNotPresent_Test(long kmer, int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+//// int[] vals=getValues(kmer, null);
+//// assert(vals==null || contains(kmer, vals));
+//// x=setIfNotPresent(kmer, v);
+//// assert(contains(kmer, vals));
+//// assert(contains(kmer, v));
+// x=0;
+// assert(false);
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=setIfNotPresent0(kmer, v);
+// assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
+// }
+// return x;
+// }
+//
+// public final int set_Test(final long kmer, final int v[]){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// int[] old=getValues(kmer, null);
+// assert(old==null || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v[0]==old || !contains(kmer, old));
+// }
+// return x;
+// }
+
+
+ @Override
+ public int set(long kmer, int value){
+ int x=1, cell=(int)(kmer%prime);
+ final KmerNode n=array[cell];
+ if(n==null){
+ array[cell]=makeNode(kmer, value);
+ }else{
+ x=n.set(kmer, value);
+ }
+ size+=x;
+ if(autoResize && size>sizeLimit){resize();}
+ return x;
+ }
+
+ @Override
+ public int set(long kmer, int[] vals) {
+ int x=1, cell=(int)(kmer%prime);
+ final KmerNode n=array[cell];
+ if(n==null){
+ array[cell]=makeNode(kmer, vals);
+ }else{
+ x=n.set(kmer, vals);
+ }
+ size+=x;
+ if(autoResize && size>sizeLimit){resize();}
+ return x;
+ }
+
+ @Override
+ public int setIfNotPresent(long kmer, int value){
+ int x=1, cell=(int)(kmer%prime);
+ final KmerNode n=array[cell];
+ if(n==null){
+ array[cell]=makeNode(kmer, value);
+ }else{
+ x=n.setIfNotPresent(kmer, value);
+ }
+ size+=x;
+ if(autoResize && size>sizeLimit){resize();}
+ return x;
+ }
+
+ @Override
+ public final int getValue(long kmer){
+ int cell=(int)(kmer%prime);
+ KmerNode n=array[cell];
+ return n==null ? -1 : n.getValue(kmer);
+ }
+
+ @Override
+ public int[] getValues(long kmer, int[] singleton){
+ int cell=(int)(kmer%prime);
+ KmerNode n=array[cell];
+ return n==null ? null : n.getValues(kmer, singleton);
+ }
+
+ @Override
+ public boolean contains(long kmer){
+ return get(kmer)!=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final void initializeOwnership(){
+ for(KmerNode n : array){
+ if(n!=null){n.initializeOwnership();}
+ }
+ }
+
+ @Override
+ public final void clearOwnership(){initializeOwnership();}
+
+ @Override
+ public final int setOwner(final long kmer, final int newOwner){
+ final int cell=(int)(kmer%prime);
+ KmerNode n=array[cell];
+ assert(n!=null);
+ return n.setOwner(kmer, newOwner);
+ }
+
+ @Override
+ public final boolean clearOwner(final long kmer, final int owner){
+ final int cell=(int)(kmer%prime);
+ KmerNode n=array[cell];
+ assert(n!=null);
+ return n.clearOwner(kmer, owner);
+ }
+
+ @Override
+ public final int getOwner(final long kmer){
+ final int cell=(int)(kmer%prime);
+ KmerNode n=array[cell];
+ assert(n!=null);
+ return n.getOwner(kmer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ final KmerNode get(long kmer){
+ int cell=(int)(kmer%prime);
+ KmerNode n=array[cell];
+ while(n!=null && n.pivot!=kmer){
+ n=(kmer<n.pivot ? n.left : n.right);
+ }
+ return n;
+ }
+
+ public final KmerNode getNode(int cell){
+ KmerNode n=array[cell];
+ return n;
+ }
+
+ boolean insert(KmerNode n){
+ n.left=null;
+ n.right=null;
+ int cell=(int)(n.pivot%prime);
+ if(array[cell]==null){
+ array[cell]=n;
+ return true;
+ }
+ return array[cell].insert(n);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {return true;}
+
+ @Override
+ public boolean canRebalance() {return true;}
+
+ @Override
+ public long size() {return size;}
+
+ @Override
+ public int arrayLength() {return array.length;}
+
+ @Override
+ synchronized void resize(){
+// assert(false);
+// System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ sizeLimit=Tools.max((long)(size*1.4), (long)(maxLoadFactor*prime));
+
+ final long maxAllowedByLoadFactor=(long)(size*minLoadMult);
+ final long minAllowedByLoadFactor=(long)(size*maxLoadMult);
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ if(maxAllowedByLoadFactor<prime){return;}
+
+ long x=10+(long)(prime*resizeMult);
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+
+ if(prime2<=prime){return;}
+
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ KmerNode[] old=array;
+ array=allocKmerNodeArray(prime2);
+ ArrayList<KmerNode> list=new ArrayList<KmerNode>(1000);
+ for(int i=0; i<old.length; i++){
+ if(old[i]!=null){
+ old[i].traverseInfix(list);
+ for(KmerNode n : list){insert(n);}
+ list.clear();
+ }
+ }
+ sizeLimit=Tools.max((long)(size*1.4), (long)(maxLoadFactor*prime));
+ }
+
+ @Override
+ public void rebalance(){
+ ArrayList<KmerNode> list=new ArrayList<KmerNode>(1000);
+ for(int i=0; i<array.length; i++){
+ if(array[i]!=null){array[i]=array[i].rebalance(list);}
+ }
+ }
+
+ public void clear() {
+ size=0;
+ Arrays.fill(array, null);
+ }
+
+ @Override
+ long regenerate() {
+ throw new RuntimeException("Not implemented.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount){
+// tsw.print("HashForest:\n");
+ for(int i=0; i<array.length; i++){
+ KmerNode node=array[i];
+ if(node!=null && node.value()>=mincount){
+// StringBuilder sb=new StringBuilder();
+// tsw.print(node.dumpKmersAsText(sb, k, mincount));
+ node.dumpKmersAsText(tsw, k, mincount);
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ for(int i=0; i<array.length; i++){
+ KmerNode node=array[i];
+ if(node!=null && node.value()>=mincount){
+ node.dumpKmersAsBytes(bsw, k, mincount);
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ for(int i=0; i<array.length; i++){
+ KmerNode node=array[i];
+ if(node!=null && node.value()>=mincount){
+ node.dumpKmersAsBytes_MT(bsw, bb, k, mincount);
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public void fillHistogram(long[] ca, int max){
+ for(int i=0; i<array.length; i++){
+ KmerNode node=array[i];
+ if(node!=null){
+ node.fillHistogram(ca, max);
+ }
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Iteration ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public Iterator<KmerNode> iterator() {
+ return toList().iterator();
+ }
+
+ public ArrayList<KmerNode> toList(){
+ assert(size<Integer.MAX_VALUE);
+ ArrayList<KmerNode> list=new ArrayList<KmerNode>((int)size);
+ for(int i=0; i<array.length; i++){
+ if(array[i]!=null){array[i].traverseInfix(list);}
+ }
+ assert(list.size()==size);
+ return list;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNode[] array() {return array;}
+
+ KmerNode[] array;
+ int prime;
+ long size=0;
+ long sizeLimit;
+ final boolean autoResize;
+ final boolean TWOD;
+ private final Lock lock=new ReentrantLock();
+
+ @Override
+ final Lock getLock(){return lock;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ final static int maxPrime=(int)Primes.primeAtMost(Integer.MAX_VALUE);
+ final static float resizeMult=2.5f; //Resize by a minimum of this much
+ final static float minLoadFactor=0.75f; //Resize by enough to get the load above this factor
+ final static float maxLoadFactor=2.5f; //Resize by enough to get the load under this factor
+ final static float minLoadMult=1/minLoadFactor;
+ final static float maxLoadMult=1/maxLoadFactor;
+
+
+
+}
diff --git a/current/kmer/KmerBuffer.java b/current/kmer/KmerBuffer.java
new file mode 100755
index 0000000..bde6bf2
--- /dev/null
+++ b/current/kmer/KmerBuffer.java
@@ -0,0 +1,53 @@
+package kmer;
+
+import stream.ByteBuilder;
+import align2.IntList;
+import align2.LongList;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 30, 2015
+ *
+ */
+public class KmerBuffer {
+
+ public KmerBuffer(int buflen, int k_, boolean initValues){
+ k=k_;
+ kmers=new LongList(buflen);
+ values=(initValues ? new IntList(buflen) : null);
+ }
+
+ public int add(long kmer){
+ assert(values==null);
+ kmers.add(kmer);
+ return kmers.size;
+ }
+
+ public void add(long kmer, int value){
+ kmers.add(kmer);
+ values.add(value);
+ assert(values.size==kmers.size);
+ }
+
+ public void clear(){
+ kmers.clear();
+ if(values!=null){values.clear();}
+ }
+
+ //Returns raw size of kmers array, rather than actual number of kmers
+ final int size(){return kmers.size;}
+
+ public String toString(){
+ ByteBuilder bb=new ByteBuilder();
+ for(int i=0; i<kmers.size; i++){
+ if(i>0){bb.append(',');}
+ bb.appendKmer(kmers.get(i), k);
+ }
+ return bb.toString();
+ }
+
+ private final int k;
+ final LongList kmers;
+ final IntList values;
+
+}
diff --git a/current/kmer/KmerLink.java b/current/kmer/KmerLink.java
new file mode 100755
index 0000000..56f9e2c
--- /dev/null
+++ b/current/kmer/KmerLink.java
@@ -0,0 +1,288 @@
+package kmer;
+
+import java.util.ArrayList;
+
+import stream.ByteBuilder;
+
+import align2.Tools;
+
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 22, 2013
+ *
+ */
+public class KmerLink extends AbstractKmerTable {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerLink(long pivot_){
+ pivot=pivot_;
+ }
+
+ public KmerLink(long pivot_, int value_){
+ pivot=pivot_;
+ value=value_;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final int incrementAndReturnNumCreated(long kmer) {
+ int x=increment(kmer);
+ return x==1 ? 1 : 0;
+ }
+
+ @Override
+ public int increment(long kmer){
+ if(pivot<0){pivot=kmer; return (value=1);} //Allows initializing empty nodes to -1
+ if(kmer==pivot){
+ if(value<Integer.MAX_VALUE){value++;}
+ return value;
+ }
+ if(next==null){next=new KmerLink(kmer, 1); return 1;}
+ return next.increment(kmer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns number of nodes added */
+ public int set(long kmer, int value_){
+ if(pivot<0){pivot=kmer; value=value_; return 1;} //Allows initializing empty nodes to -1
+ if(kmer==pivot){value=value_; return 0;}
+ if(next==null){next=new KmerLink(kmer, value_); return 1;}
+ return next.set(kmer, value_);
+ }
+
+ /** Returns number of nodes added */
+ public int setIfNotPresent(long kmer, int value_){
+ if(pivot<0){pivot=kmer; value=value_; return 1;} //Allows initializing empty nodes to -1
+ if(kmer==pivot){return 0;}
+ if(next==null){next=new KmerLink(kmer, value_); return 1;}
+ return next.setIfNotPresent(kmer, value_);
+ }
+
+ KmerLink get(long kmer){
+ if(kmer==pivot){return this;}
+ return next==null ? null : next.get(kmer);
+ }
+
+ boolean insert(KmerLink n){
+ assert(pivot!=-1);
+ if(pivot==n.pivot){return false;}
+ if(next==null){next=n; return true;}
+ return next.insert(n);
+ }
+
+ public boolean contains(long kmer){
+ KmerLink node=get(kmer);
+ return node!=null;
+ }
+
+ void traversePrefix(ArrayList<KmerLink> list){
+ if(next!=null){next.traversePrefix(list);}
+ list.add(this);
+ }
+
+ void traverseInfix(ArrayList<KmerLink> list){
+ list.add(this);
+ if(next!=null){next.traverseInfix(list);}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {
+ return false;
+ }
+
+ @Override
+ public boolean canRebalance() {
+ return true;
+ }
+
+ @Deprecated
+ @Override
+ public int arrayLength() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ void resize() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance() {
+ throw new RuntimeException("Please call rebalance(ArrayList<KmerNode>) instead, with an empty list.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final void initializeOwnership(){
+ owner=-1;
+ if(next!=null){next.initializeOwnership();}
+ }
+
+ @Override
+ public final void clearOwnership(){initializeOwnership();}
+
+ @Override
+ public final int setOwner(final long kmer, final int newOwner){
+ KmerLink n=get(kmer);
+ assert(n!=null);
+ if(n.owner<=newOwner){
+ synchronized(n){
+ if(n.owner<newOwner){
+ n.owner=newOwner;
+ }
+ }
+ }
+ return n.owner;
+ }
+
+ @Override
+ public final boolean clearOwner(final long kmer, final int owner){
+ KmerLink n=get(kmer);
+ assert(n!=null);
+ synchronized(n){
+ if(n.owner==owner){
+ n.owner=-1;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public final int getOwner(final long kmer){
+ KmerLink n=get(kmer);
+ assert(n!=null);
+ return n.owner;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public int set(long kmer, int[] vals) {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Override
+ public final int getValue(long kmer){
+ KmerLink n=get(kmer);
+ return n==null ? -1 : n.value;
+ }
+
+ @Override
+ public final int[] getValues(long kmer, int[] singleton){
+ KmerLink n=get(kmer);
+ if(n==null){return null;}
+ singleton[0]=n.value;
+ return singleton;
+ }
+
+ @Override
+ public final long size() {
+ if(value<1){return 0;}
+ long size=1;
+ if(next!=null){size+=next.size();}
+ return size;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ if(value<1){return true;}
+ if(value>=mincount){bsw.printlnKmer(pivot, value, k);}
+ if(next!=null){next.dumpKmersAsBytes(bsw, k, mincount);}
+ return true;
+ }
+
+ @Override
+ public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ if(value<1){return true;}
+ if(value>=mincount){
+ toBytes(pivot, value, k, bb);
+ bb.append('\n');
+ if(bb.length()>=16000){
+ ByteBuilder bb2=new ByteBuilder(bb);
+ synchronized(bsw){bsw.addJob(bb2);}
+ bb.clear();
+ }
+ }
+ if(next!=null){next.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ return true;
+ }
+
+ @Override
+ public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount) {
+ tsw.print(dumpKmersAsText(new StringBuilder(32), k, mincount));
+ return true;
+ }
+
+ private final StringBuilder dumpKmersAsText(StringBuilder sb, int k, int mincount){
+ if(value<1){return sb;}
+ if(sb==null){sb=new StringBuilder(32);}
+ if(value>=mincount){sb.append(AbstractKmerTable.toText(pivot, value, k)).append('\n');}
+ if(next!=null){next.dumpKmersAsText(sb, k, mincount);}
+ return sb;
+ }
+
+ @Override
+ public final void fillHistogram(long[] ca, int max){
+ if(value<1){return;}
+ ca[Tools.min(value, max)]++;
+ if(next!=null){next.fillHistogram(ca, max);}
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ KmerLink rebalance(ArrayList<KmerLink> list){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ private static KmerLink rebalance(ArrayList<KmerLink> list, int a, int b){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ public long regenerate(){
+ throw new RuntimeException("TODO - remove zero-value links.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ long pivot;
+ int value;
+ int owner=-1;
+ KmerLink next;
+}
diff --git a/current/kmer/KmerNode.java b/current/kmer/KmerNode.java
new file mode 100755
index 0000000..732dfcd
--- /dev/null
+++ b/current/kmer/KmerNode.java
@@ -0,0 +1,361 @@
+package kmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ByteBuilder;
+
+import align2.Tools;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 22, 2013
+ *
+ */
+public abstract class KmerNode extends AbstractKmerTable {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected KmerNode(long pivot_){ //pivot_ is the kmer stored at this node; a negative pivot is treated elsewhere in this class as "no kmer assigned yet"
+ pivot=pivot_;
+ }
+
+ public abstract KmerNode makeNode(long pivot_, int value_); //Factory used when a child has to be created for a single value
+ public abstract KmerNode makeNode(long pivot_, int[] values_); //Factory taking a value array
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Adds 1 to kmer's value, creating a node for kmer when it is absent; returns the node's value afterwards
+ public final int increment(long kmer){
+ if(pivot<0){pivot=kmer; return set(1);} //Allows initializing empty nodes to -1
+ if(kmer<pivot){ //Smaller kmers live in the left subtree
+ if(left==null){left=makeNode(kmer, 1); return 1;}
+ return left.increment(kmer);
+ }else if(kmer>pivot){ //Larger kmers live in the right subtree
+ if(right==null){right=makeNode(kmer, 1); return 1;}
+ return right.increment(kmer);
+ }else{
+ if(value()<Integer.MAX_VALUE){set(value()+1);} //Stops at Integer.MAX_VALUE instead of overflowing
+ return value();
+ }
+ }
+
+ @Override //Increments kmer and reports 1 exactly when the value returned by increment is 1, otherwise 0
+ public final int incrementAndReturnNumCreated(long kmer) {
+ final boolean countIsOne=(increment(kmer)==1);
+ return countIsOne ? 1 : 0;
+ }
+
+// public final int set_Test(final long kmer, final int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD()){
+// int[] old=getValues(kmer, null);
+// assert(old==null || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v==old || !contains(kmer, old));
+// }
+// return x;
+// }
+//
+// public final int setIfNotPresent_Test(long kmer, int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD()){
+//// int[] vals=getValues(kmer, null);
+//// assert(vals==null || contains(kmer, vals));
+//// x=setIfNotPresent(kmer, v);
+//// assert(contains(kmer, vals));
+//// assert(contains(kmer, v));
+// x=0;
+// assert(false);
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=setIfNotPresent0(kmer, v);
+// assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
+// }
+// return x;
+// }
+
+
+ /** Stores value for kmer in this subtree, creating a node when kmer is absent. Returns number of nodes added */
+ @Override
+ public final int set(long kmer, int value){
+ if(verbose){System.err.println("Set0: kmer="+kmer+", v="+value+", old="+Arrays.toString(values(new int[1])));}
+ if(pivot<0){pivot=kmer; set(value); return 1;} //Allows initializing empty nodes to -1
+ if(verbose){System.err.println("A");} //The single letters A-G are verbose-only trace markers for the branch taken
+ if(kmer<pivot){
+ if(verbose){System.err.println("B");}
+ if(left==null){left=makeNode(kmer, value); return 1;}
+ if(verbose){System.err.println("C");}
+ return left.set(kmer, value);
+ }else if(kmer>pivot){
+ if(verbose){System.err.println("D");}
+ if(right==null){right=makeNode(kmer, value); return 1;}
+ if(verbose){System.err.println("E");}
+ return right.set(kmer, value);
+ }else{
+ if(verbose){System.err.println("F");}
+ set(value); //kmer==pivot: update this node through the subclass's set(int)
+ }
+ if(verbose){System.err.println("G");}
+ return 0; //An existing node was updated, nothing was added
+ }
+
+
+ /** Stores value for kmer only when kmer is absent from this subtree; an existing node is left untouched. Returns number of nodes added */
+ @Override
+ public final int setIfNotPresent(long kmer, int value){
+ if(verbose){System.err.println("setIfNotPresent0: kmer="+kmer+", v="+value+", old="+Arrays.toString(values(new int[1])));} //Buffer needs length 1: KmerNode1D.values writes singleton[0]
+ if(pivot<0){pivot=kmer; set(value); return 1;} //Allows initializing empty nodes to -1
+ if(kmer<pivot){
+ if(left==null){left=makeNode(kmer, value); return 1;} //Free slot: attach a new leaf
+ return left.setIfNotPresent(kmer, value);
+ }else if(kmer>pivot){
+ if(right==null){right=makeNode(kmer, value); return 1;}
+ return right.setIfNotPresent(kmer, value);
+ }
+ return 0; //kmer==pivot: already present, the old value is kept
+ }
+
+ @Override //Returns the value stored for kmer, or -1 when no node in this subtree has that pivot
+ public final int getValue(long kmer){
+ final KmerNode hit=get(kmer);
+ return hit!=null ? hit.value() : -1;
+ }
+
+ @Override //Returns the result of values(singleton) on kmer's node, or null when kmer is absent
+ public final int[] getValues(long kmer, int[] singleton){
+ KmerNode n=get(kmer);
+ return n==null ? null : n.values(singleton); //What is done with singleton is up to the subclass
+ }
+
+ @Override
+ public final boolean contains(long kmer){
+ //Present exactly when the lookup lands on a node; the node's value is not inspected
+ return get(kmer)!=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNode left(){return left;} //Child for smaller pivots, or null
+ public KmerNode right(){return right;} //Child for larger pivots, or null
+ public long pivot(){return pivot;}
+ public int owner(){return owner;} //-1 while unowned
+
+ public int count(){return value();} //Alias for value()
+ protected abstract int value();
+ protected abstract int[] values(int[] singleton);
+ /** Returns new value */
+ public abstract int set(int value_);
+ protected abstract int set(int[] values_);
+
+ @Override //Finds the node whose pivot equals kmer in the subtree rooted here; returns null when there is none
+ final KmerNode get(long kmer){
+// if(kmer<pivot){
+// return left==null ? null : left.get(kmer);
+// }else if(kmer>pivot){
+// return right==null ? null : right.get(kmer);
+// }else{
+// return this;
+// }
+ KmerNode n=this; //Loop form of the recursive lookup kept in the comment above
+ while(n!=null && n.pivot!=kmer){
+ n=(kmer<n.pivot ? n.left : n.right); //Descend left for smaller kmers, right for larger
+ }
+ return n;
+ }
+
+ final KmerNode getNodeOrParent(long kmer){ //Returns the node holding kmer, or else the last node reached on the way to where kmer would be
+ if(pivot==kmer || pivot<0){return this;} //A node with a negative pivot is returned as well
+ if(kmer<pivot){return left==null ? this : left.getNodeOrParent(kmer);}
+ return right==null ? this : right.getNodeOrParent(kmer);
+ }
+
+ final boolean insert(KmerNode n){ //Links the existing node n into this subtree by pivot; n's own left/right are not touched. False when the pivot is already present
+ assert(pivot!=-1); //This node must already hold a kmer
+ if(n.pivot<pivot){
+ if(left==null){left=n; return true;}
+ return left.insert(n);
+ }else if(n.pivot>pivot){
+ if(right==null){right=n; return true;}
+ return right.insert(n);
+ }else{
+ return false; //Same pivot: n is not attached
+ }
+ }
+
+ final void traversePrefix(ArrayList<KmerNode> list){ //NOTE(review): despite the name this visits left, self, right (in-order), so nodes are appended in ascending pivot order
+ if(left!=null){left.traversePrefix(list);}
+ list.add(this);
+ if(right!=null){right.traversePrefix(list);}
+ }
+
+ final void traverseInfix(ArrayList<KmerNode> list){ //NOTE(review): despite the name this visits self, left, right (pre-order); the result is not sorted by pivot
+ list.add(this);
+ if(left!=null){left.traverseInfix(list);}
+ if(right!=null){right.traverseInfix(list);}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Counts the nodes of this subtree by recursion
+ public final long size() {
+ if(value()<1){return 0;} //NOTE(review): a node with value<1 reports 0 without visiting its children
+ long size=1;
+ if(left!=null){size+=left.size();}
+ if(right!=null){size+=right.size();}
+ return size;
+ }
+
+ final KmerNode rebalance(ArrayList<KmerNode> list){ //Relinks this subtree into balanced shape and returns its new root; list is scratch space and must be empty
+ assert(list.isEmpty());
+ traversePrefix(list); //Collects the nodes in ascending pivot order (traversePrefix visits left, self, right)
+ KmerNode n=this;
+ if(list.size()>2){ //With one or two nodes the existing links are kept as they are
+ n=rebalance(list, 0, list.size()-1);
+ }
+ list.clear(); //Leaves the scratch list empty again
+ return n;
+ }
+
+ private static final KmerNode rebalance(ArrayList<KmerNode> list, int a, int b){ //Relinks list[a..b] (inclusive; expected in ascending pivot order) into a subtree and returns its root
+ final int size=b-a+1;
+ final int middle=a+size/2; //Root is the middle element; for an even size that is the upper of the two middles
+ final KmerNode n=list.get(middle);
+ if(size<4){ //Ranges of 1-3 nodes are wired up directly, without recursion
+ if(size==1){
+ n.left=n.right=null;
+ }else if(size==2){
+ KmerNode n1=list.get(a); //middle==a+1 here, so the other node precedes the root
+ n.left=n1;
+ n.right=null;
+ n1.left=n1.right=null;
+ }else{
+ assert(size==3);
+ KmerNode n1=list.get(a), n2=list.get(b);
+ n.left=n1;
+ n.right=n2;
+ n1.left=n1.right=null;
+ n2.left=n2.right=null;
+ }
+ }else{
+ n.left=rebalance(list, a, middle-1);
+ n.right=rebalance(list, middle+1, b);
+ }
+ return n;
+ }
+
+ @Override
+ public long regenerate(){
+ throw new RuntimeException("Not supported.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Renders the subtree to text through the subclass's StringBuilder overload and prints it to tsw in one call
+ public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount) {
+ tsw.print(dumpKmersAsText(new StringBuilder(32), k, mincount));
+ return true; //Always true
+ }
+
+ protected abstract StringBuilder dumpKmersAsText(StringBuilder sb, int k, int mincount); //Subclass hook: append this subtree's text to sb and return the builder
+
+ protected abstract ByteBuilder dumpKmersAsText(ByteBuilder bb, int k, int mincount); //Same hook for a ByteBuilder target
+
+ @Override //Adds the values of this subtree's nodes to the histogram ca
+ public final void fillHistogram(long[] ca, int max){
+ final int value=value();
+ if(value<1){return;} //NOTE(review): a node with value<1 also leaves its children uncounted
+ ca[Tools.min(value, max)]++; //Index is capped at max, so larger values share that last bucket
+ if(left!=null){left.fillHistogram(ca, max);}
+ if(right!=null){right.fillHistogram(ca, max);}
+ }
+
+ abstract boolean TWOD();
+ abstract int numValues();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Marks every node in this subtree as unowned (owner=-1)
+ public final void initializeOwnership(){
+ owner=-1;
+ if(left!=null){left.initializeOwnership();}
+ if(right!=null){right.initializeOwnership();}
+ }
+
+ @Override //Clearing is the same operation as initializing
+ public final void clearOwnership(){initializeOwnership();}
+
+ @Override //Raises kmer's owner to newOwner when newOwner is larger; returns the owner read after the attempt
+ public final int setOwner(final long kmer, final int newOwner){
+ KmerNode n=get(kmer);
+ assert(n!=null); //kmer must already be in the tree
+ if(n.owner<=newOwner){ //Unlocked pre-check: the lock is skipped when the current owner is already larger
+ synchronized(n){
+ if(n.owner<newOwner){ //Checked again while holding n's monitor
+ n.owner=newOwner;
+ }
+ }
+ }
+ return n.owner; //Read outside the synchronized block
+ }
+
+ @Override //Resets kmer's owner to -1, but only when it currently equals owner; returns whether it was reset
+ public final boolean clearOwner(final long kmer, final int owner){
+ KmerNode n=get(kmer);
+ assert(n!=null); //kmer must already be in the tree
+ synchronized(n){
+ if(n.owner==owner){ //The parameter owner shadows the field of the same name, hence the explicit n.owner
+ n.owner=-1;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override //Returns the owner recorded on kmer's node (-1 when unowned)
+ public final int getOwner(final long kmer){
+ KmerNode n=get(kmer);
+ assert(n!=null); //With assertions disabled an absent kmer fails on the next line instead
+ return n.owner; //Plain read, no lock taken
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ long pivot;
+ int owner=-1;
+ KmerNode left, right;
+
+}
diff --git a/current/kmer/KmerNode1D.java b/current/kmer/KmerNode1D.java
new file mode 100755
index 0000000..1287665
--- /dev/null
+++ b/current/kmer/KmerNode1D.java
@@ -0,0 +1,161 @@
+package kmer;
+
+import stream.ByteBuilder;
+import fileIO.ByteStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 22, 2013
+ *
+ */
+public class KmerNode1D extends KmerNode {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNode1D(long pivot_){ //Node for pivot_; value keeps the int default of 0
+ super(pivot_);
+ }
+
+ public KmerNode1D(long pivot_, int value_){
+ super(pivot_);
+ value=value_;
+ }
+
+ public final KmerNode makeNode(long pivot_, int value_){ //Children of a 1D node are 1D nodes
+ return new KmerNode1D(pivot_, value_);
+ }
+
+ public final KmerNode makeNode(long pivot_, int[] values_){ //Value arrays are not supported here; always throws
+ throw new RuntimeException("Unimplemented");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final int set(long kmer, int[] vals) {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected int value(){return value;}
+
+ protected int[] values(int[] singleton){ //Writes the single value into the caller's length-1 buffer and returns that buffer
+ assert(singleton.length==1);
+ singleton[0]=value;
+ return singleton;
+ }
+
+ public int set(int value_){return value=value_;} //Overwrites the value and returns it
+
+ protected int set(int[] values_){ //Value arrays are not supported by the 1D node; always throws
+ throw new RuntimeException("Unimplemented");
+ }
+
+ int numValues(){return value<1 ? 0 : 1;} //A value below 1 counts as no value
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {
+ return false;
+ }
+
+ @Override
+ public boolean canRebalance() {
+ return true;
+ }
+
+ @Deprecated
+ @Override
+ public int arrayLength() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ void resize() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance() {
+ throw new RuntimeException("Please call rebalance(ArrayList<KmerNode>) instead, with an empty list.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Writes this subtree's kmers with value>=mincount to bsw, visiting self, then left, then right
+ public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ if(value<1){return true;} //NOTE(review): a node with value<1 also skips its children
+ if(value>=mincount){bsw.printlnKmer(pivot, value, k);}
+ if(left!=null){left.dumpKmersAsBytes(bsw, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes(bsw, k, mincount);}
+ return true; //Always true
+ }
+
+ @Override //Buffered dump: appends lines to bb and passes a new ByteBuilder built from bb to bsw whenever bb.length() reaches 16000
+ public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ if(value<1){return true;} //NOTE(review): a node with value<1 also skips its children
+ if(value>=mincount){
+ toBytes(pivot, value, k, bb);
+ bb.append('\n');
+ if(bb.length()>=16000){
+ ByteBuilder bb2=new ByteBuilder(bb); //Presumably a copy, so that bb can be cleared and reused below
+ synchronized(bsw){bsw.addJob(bb2);} //bsw is locked only for the hand-off
+ bb.clear();
+ }
+ }
+ if(left!=null){left.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ return true; //Content still in bb below the threshold is not submitted here
+ }
+
+ @Override //Appends one text line per node with value>=mincount (self, then left, then right) and returns the builder
+ protected final StringBuilder dumpKmersAsText(StringBuilder sb, int k, int mincount){
+ if(value<1){return sb;} //NOTE(review): a node with value<1 also skips its children
+ if(sb==null){sb=new StringBuilder(32);} //Tolerates a null buffer from the caller
+ if(value>=mincount){sb.append(AbstractKmerTable.toText(pivot, value, k)).append('\n');}
+ if(left!=null){left.dumpKmersAsText(sb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(sb, k, mincount);}
+ return sb;
+ }
+
+ @Override //ByteBuilder counterpart of the StringBuilder overload: same traversal and the same mincount filter
+ protected final ByteBuilder dumpKmersAsText(ByteBuilder bb, int k, int mincount){
+ if(value<1){return bb;} //NOTE(review): a node with value<1 also skips its children
+ if(bb==null){bb=new ByteBuilder(32);} //Tolerates a null buffer from the caller
+ if(value>=mincount){bb.append(AbstractKmerTable.toBytes(pivot, value, k)).append('\n');}
+ if(left!=null){left.dumpKmersAsText(bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(bb, k, mincount);}
+ return bb;
+ }
+
+ final boolean TWOD(){return false;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ int value;
+
+}
diff --git a/current/kmer/KmerNode2D.java b/current/kmer/KmerNode2D.java
new file mode 100755
index 0000000..fc36711
--- /dev/null
+++ b/current/kmer/KmerNode2D.java
@@ -0,0 +1,237 @@
+package kmer;
+
+import java.util.Arrays;
+
+import stream.ByteBuilder;
+import fileIO.ByteStreamWriter;
+
+import align2.Tools;
+
+/**
+ * Allows multiple values per kmer.
+ * @author Brian Bushnell
+ * @date Nov 7, 2014
+ *
+ */
+public class KmerNode2D extends KmerNode {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNode2D(long pivot_){ //Leaves values null, i.e. a node that holds no values yet
+ super(pivot_);
+ }
+
+ public KmerNode2D(long pivot_, int value_){
+ super(pivot_);
+ assert(value_>=0 || value_==-1);
+ values=new int[] {value_, -1}; //-1 marks an unused slot
+ }
+
+ public KmerNode2D(long pivot_, int[] vals_){
+ super(pivot_);
+ values=vals_; //The caller's array is stored as-is, not copied
+ }
+
+ public final KmerNode makeNode(long pivot_, int value_){ //Children of a 2D node are 2D nodes
+ return new KmerNode2D(pivot_, value_);
+ }
+
+ public final KmerNode makeNode(long pivot_, int[] values_){
+ return new KmerNode2D(pivot_, values_);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+// public final int set_Test(final long kmer, final int v[]){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD()){
+// int[] old=getValues(kmer, null);
+// assert(old==null || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v[0]==old || !contains(kmer, old));
+// }
+// return x;
+// }
+
+ /** Adds vals to kmer's value list, creating a node when kmer is absent. Returns number of nodes added */
+ @Override
+ public int set(long kmer, int vals[]){
+ if(pivot<0){pivot=kmer; insertValue(vals); return 1;} //Allows initializing empty nodes to -1
+ if(kmer<pivot){
+ if(left==null){left=new KmerNode2D(kmer, vals); return 1;} //The new node keeps a reference to vals itself
+ return left.set(kmer, vals);
+ }else if(kmer>pivot){
+ if(right==null){right=new KmerNode2D(kmer, vals); return 1;}
+ return right.set(kmer, vals);
+ }else{
+ insertValue(vals); //Existing node: merge in the values it does not hold yet
+ }
+ return 0;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected int value(){return values==null ? 0 : values[0];} //First stored value, or 0 for a node without a value array
+
+ protected int[] values(int[] singleton){ //Returns the internal array itself; singleton is ignored
+ return values;
+ }
+
+ public int set(int value_){ //Adds value_ to the list (existing values are kept) and returns value_
+ insertValue(value_);
+ return value_;
+ }
+
+ protected int set(int[] values_){ //Adds values_; returns 1 when the node had no value array before, else 0
+ int ret=(values==null ? 1 : 0);
+ insertValue(values_);
+ return ret;
+ }
+
+ int numValues(){ //Number of slots in use, i.e. the index of the first -1
+ if(values==null){return 0;}
+ for(int i=0; i<values.length; i++){
+ if(values[i]==-1){return i;}
+ }
+ return values.length;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Adds v to this node's value list unless it is already there; -1 marks a free slot. Returns number of values added */
+ private int insertValue(int v){
+ if(values==null || values.length==0){values=new int[] {v, -1}; return 1;} //A node built by KmerNode2D(long) has no array yet: start one instead of throwing
+ for(int i=0; i<values.length; i++){
+ if(values[i]==v){return 0;} //Already stored
+ if(values[i]==-1){values[i]=v;return 1;} //First free slot
+ }
+ final int oldSize=values.length, newSize=(int)Tools.min(Integer.MAX_VALUE, oldSize*2L); //Array is full: double it, capped at Integer.MAX_VALUE
+ assert(newSize>values.length) : "Overflow.";
+ values=Arrays.copyOf(values, newSize);
+ values[oldSize]=v;
+ Arrays.fill(values, oldSize+1, newSize, -1); //Mark the new tail as free
+ return 1;
+ }
+
+ /** Merges vals into this node's value list. Returns 1 only when vals was adopted as the list, otherwise 0 */
+ private int insertValue(int[] vals){
+ if(values==null){
+ values=vals; //Adopts the caller's array without copying
+ return 1;
+ }
+ for(int v : vals){
+ if(v<0){break;} //The first negative entry ends the used part of vals
+ insertValue(v);
+ }
+ return 0; //NOTE(review): 0 is returned here even when insertValue(v) added values
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {
+ return false;
+ }
+
+ @Override
+ public boolean canRebalance() {
+ return true;
+ }
+
+ @Deprecated
+ @Override
+ public int arrayLength() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ void resize() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance() {
+ throw new RuntimeException("Please call rebalance(ArrayList<KmerNode>) instead, with an empty list.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Writes this subtree's kmers with their value arrays to bsw, visiting self, then left, then right
+ public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ if(values==null){return true;} //NOTE(review): a node without values also skips its children
+ bsw.printlnKmer(pivot, values, k); //mincount is not applied in the 2D node
+ if(left!=null){left.dumpKmersAsBytes(bsw, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes(bsw, k, mincount);}
+ return true; //Always true
+ }
+
+ @Override //Buffered dump: appends lines to bb and passes a new ByteBuilder built from bb to bsw whenever bb.length() reaches 16000
+ public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ if(values==null){return true;} //NOTE(review): a node without values also skips its children
+ toBytes(pivot, values, k, bb); //mincount is only passed on to the children; it is not applied here
+ bb.append('\n');
+ if(bb.length()>=16000){
+ ByteBuilder bb2=new ByteBuilder(bb); //Presumably a copy, so that bb can be cleared and reused below
+ synchronized(bsw){bsw.addJob(bb2);} //bsw is locked only for the hand-off
+ bb.clear();
+ }
+ if(left!=null){left.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ return true; //Content still in bb below the threshold is not submitted here
+ }
+
+ @Override //Appends one text line per node that has a value array (self, then left, then right); mincount is not applied here
+ protected final StringBuilder dumpKmersAsText(StringBuilder sb, int k, int mincount){
+ if(values==null){return sb;} //NOTE(review): a node without values also skips its children
+ if(sb==null){sb=new StringBuilder(32);} //Tolerates a null buffer from the caller
+ sb.append(AbstractKmerTable.toText(pivot, values, k)).append('\n');
+ if(left!=null){left.dumpKmersAsText(sb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(sb, k, mincount);}
+ return sb;
+ }
+
+ @Override //ByteBuilder counterpart of the StringBuilder overload: same traversal, mincount likewise not applied here
+ protected final ByteBuilder dumpKmersAsText(ByteBuilder bb, int k, int mincount){
+ if(values==null){return bb;} //NOTE(review): a node without values also skips its children
+ if(bb==null){bb=new ByteBuilder(32);} //Tolerates a null buffer from the caller
+ bb.append(AbstractKmerTable.toBytes(pivot, values, k)).append('\n');
+ if(left!=null){left.dumpKmersAsText(bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(bb, k, mincount);}
+ return bb;
+ }
+
+ final boolean TWOD(){return true;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ int[] values;
+
+}
diff --git a/current/kmer/KmerTable.java b/current/kmer/KmerTable.java
new file mode 100755
index 0000000..f8e08da
--- /dev/null
+++ b/current/kmer/KmerTable.java
@@ -0,0 +1,339 @@
+package kmer;
+
+import java.util.ArrayList;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import stream.ByteBuilder;
+
+
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 23, 2013
+ *
+ */
+public final class KmerTable extends AbstractKmerTable {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerTable(int initialSize, boolean autoResize_){ //Builds an empty table; autoResize_ lets inserts call resize() once size exceeds sizeLimit
+ if(initialSize>1){
+ initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize)); //Cell count comes from Primes.primeAtLeast, capped at maxPrime
+ }else{
+ initialSize=1; //Anything below 2 becomes a single cell
+ }
+ prime=initialSize;
+ sizeLimit=(long) (initialSize*resizeMult);
+ array=new KmerLink[prime]; //One chain head per cell
+ autoResize=autoResize_;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Adds 1 to kmer's value, appending a new link to its chain when kmer is absent; returns the resulting value
+ public int increment(long kmer){
+ final int cell=(int)(kmer%prime); //NOTE(review): this is negative for a negative kmer - presumably kmers are never negative; confirm
+ KmerLink n=array[cell], prev=null;
+ while(n!=null && n.pivot!=kmer){ //Walk the chain, remembering the last link seen
+ prev=n;
+ n=n.next;
+ }
+ if(n==null){
+ n=new KmerLink(kmer, 1);
+ size++;
+ if(prev==null){
+ array[cell]=n; //Cell was empty
+ }else{
+ prev.next=n; //Append at the tail
+ }
+ if(autoResize && size>sizeLimit){resize();}
+ }else{
+ n.value++;
+ if(n.value<0){n.value=Integer.MAX_VALUE;} //Wrapped past the maximum: clamp
+ }
+ return n.value;
+ }
+
+ @Override //Same update as increment(kmer), but returns 1 when a new link was created and 0 when an existing one was bumped
+ public int incrementAndReturnNumCreated(long kmer){
+ final int cell=(int)(kmer%prime);
+ KmerLink n=array[cell], prev=null;
+ while(n!=null && n.pivot!=kmer){ //Walk the chain, remembering the last link seen
+ prev=n;
+ n=n.next;
+ }
+ if(n==null){
+ n=new KmerLink(kmer, 1);
+ size++;
+ if(prev==null){
+ array[cell]=n; //Cell was empty
+ }else{
+ prev.next=n; //Append at the tail
+ }
+ if(autoResize && size>sizeLimit){resize();}
+ return 1;
+ }else{
+ n.value++;
+ if(n.value<0){n.value=Integer.MAX_VALUE;} //Wrapped past the maximum: clamp
+ return 0;
+ }
+ }
+
+ @Override //Stores value for kmer; returns the number that is added to size for this call
+ public int set(long kmer, int value){
+ int x=1, cell=(int)(kmer%prime); //x stays 1 in the empty-cell case
+ final KmerLink n=array[cell];
+ if(n==null){
+ array[cell]=new KmerLink(kmer, value);
+ }else{
+ x=n.set(kmer, value); //Non-empty cell: the chain head handles it and reports what to add
+ }
+ size+=x;
+ if(autoResize && size>sizeLimit){resize();}
+ return x;
+ }
+
+ @Override
+ public int set(long kmer, int[] vals) {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Override //Like set(long,int), but delegates to the chain's setIfNotPresent; returns the number that is added to size
+ public int setIfNotPresent(long kmer, int value){
+ int x=1, cell=(int)(kmer%prime); //x stays 1 in the empty-cell case
+ final KmerLink n=array[cell];
+ if(n==null){
+ array[cell]=new KmerLink(kmer, value);
+ }else{
+ x=n.setIfNotPresent(kmer, value);
+ }
+ size+=x;
+ if(autoResize && size>sizeLimit){resize();}
+ return x;
+ }
+
+ @Override //Returns the value stored for kmer, or 0 when the table has no link for it
+ public int getValue(long kmer){
+ //get(kmer) performs the same walk over the cell's chain
+ final KmerLink hit=get(kmer);
+ if(hit==null){return 0;}
+ return hit.value;
+ }
+
+ @Override //Single-value table: writes getValue(kmer) into singleton[0] and returns that same array
+ public int[] getValues(long kmer, int[] singleton){
+ assert(singleton.length==1); //Was array.length==0, which cannot hold because the constructor always allocates at least one cell
+ singleton[0]=getValue(kmer);
+ return singleton;
+ }
+
+ @Override
+ public boolean contains(long kmer){
+ //True exactly when a link for kmer exists in its cell's chain; the value is not inspected
+ return get(kmer)!=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Forwards initializeOwnership to the head link of every non-empty cell
+ public final void initializeOwnership(){
+ for(KmerLink n : array){
+ if(n!=null){n.initializeOwnership();}
+ }
+ }
+
+ @Override //Forwards clearOwnership to the head link of every non-empty cell
+ public final void clearOwnership(){
+ for(KmerLink n : array){
+ if(n!=null){n.clearOwnership();}
+ }
+ }
+
+ @Override //Delegates to the head link of kmer's cell and returns what that link returns
+ public final int setOwner(final long kmer, final int newOwner){
+ final int cell=(int)(kmer%prime);
+ KmerLink n=array[cell];
+ assert(n!=null); //The cell must not be empty, i.e. kmer is expected to be in the table
+ return n.setOwner(kmer, newOwner);
+ }
+
+ @Override //Delegates to the head link of kmer's cell and returns what that link returns
+ public final boolean clearOwner(final long kmer, final int owner){
+ final int cell=(int)(kmer%prime);
+ KmerLink n=array[cell];
+ assert(n!=null); //The cell must not be empty
+ return n.clearOwner(kmer, owner);
+ }
+
+ @Override //Delegates to the head link of kmer's cell and returns what that link returns
+ public final int getOwner(final long kmer){
+ final int cell=(int)(kmer%prime);
+ KmerLink n=array[cell];
+ assert(n!=null); //The cell must not be empty
+ return n.getOwner(kmer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override //Returns the link whose pivot equals kmer, or null when its cell's chain has none
+ KmerLink get(long kmer){
+ int cell=(int)(kmer%prime);
+ KmerLink n=array[cell];
+ while(n!=null && n.pivot!=kmer){n=n.next;} //Linear walk along the chain
+ return n;
+ }
+
+ boolean insert(KmerLink n){ //Places an existing link into the current array (resize() uses this to rehash); size is not changed here
+ n.next=null; //Detach n from whatever chain it was part of
+ int cell=(int)(n.pivot%prime);
+ if(array[cell]==null){
+ array[cell]=n;
+ return true;
+ }
+ return array[cell].insert(n); //Non-empty cell: the chain head decides and reports the result
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {return true;}
+
+ @Override
+ public boolean canRebalance() {return false;}
+
+ @Override
+ public long size() {return size;}
+
+ @Override
+ public int arrayLength() {return array.length;}
+
+ @Override //Moves every link into a new, larger array whose length is a prime; may return early without growing
+ synchronized void resize(){
+// assert(false);
+// System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ sizeLimit=Tools.max((long)(size*1.4), (long)(maxLoadFactor*prime)); //Raised up front, so the early returns below also leave a higher limit behind
+
+ final long maxAllowedByLoadFactor=(long)(size*minLoadMult); //size/minLoadFactor: cell count at which load equals minLoadFactor
+ final long minAllowedByLoadFactor=(long)(size*maxLoadMult); //size/maxLoadFactor: cell count at which load equals maxLoadFactor
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ if(maxAllowedByLoadFactor<prime){return;} //Load is already below minLoadFactor
+
+ long x=10+(long)(prime*resizeMult); //Preferred new cell count, then clamped into the allowed range
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+
+ if(prime2<=prime){return;} //No larger size available
+
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ KmerLink[] old=array;
+ array=new KmerLink[prime2];
+ ArrayList<KmerLink> list=new ArrayList<KmerLink>(1000); //Scratch list reused for every old cell
+ for(int i=0; i<old.length; i++){
+ if(old[i]!=null){
+ old[i].traverseInfix(list); //Presumably gathers the chain's links into list before insert() resets their next pointers
+ for(KmerLink n : list){insert(n);}
+ list.clear();
+ }
+ }
+ sizeLimit=Tools.max((long)(size*1.4), (long)(maxLoadFactor*prime)); //Recomputed for the new cell count
+ }
+
+ @Override //NOTE(review): canRebalance() returns false for this class and KmerLink.rebalance(ArrayList) throws "Unsupported.", so this throws at the first non-empty cell
+ public void rebalance(){
+ ArrayList<KmerLink> list=new ArrayList<KmerLink>(1000); //Scratch list handed to each chain
+ for(int i=0; i<array.length; i++){
+ if(array[i]!=null){array[i]=array[i].rebalance(list);}
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Deprecated
+ @Override
+ public boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount){
+ throw new RuntimeException("TODO");
+ }
+
+ @Override //Dumps every bucket chain into bb/bsw by delegating to the head link of each non-empty cell; always returns true
+ public boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ for(int i=0; i<array.length; i++){
+ KmerLink node=array[i];
+ if(node!=null){ //Head's value is deliberately not tested against mincount: that skipped the whole chain, including later links that qualify
+ node.dumpKmersAsBytes_MT(bsw, bb, k, mincount); //The link-level dump receives mincount and walks on through next
+ }
+ }
+ return true;
+ }
+
+ @Deprecated
+ @Override
+ public boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ throw new RuntimeException("TODO");
+ }
+
+ @Deprecated
+ @Override
+ public void fillHistogram(long[] ca, int max){
+ throw new RuntimeException("TODO");
+ }
+
+ @Deprecated
+ @Override
+ public long regenerate(){
+ throw new RuntimeException("TODO - remove zero-value links.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ KmerLink[] array;
+ int prime;
+ long size=0;
+ long sizeLimit;
+ final boolean autoResize;
+ private final Lock lock=new ReentrantLock();
+
+ @Override
+ final Lock getLock(){return lock;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ final static int maxPrime=(int)Primes.primeAtMost(Integer.MAX_VALUE);
+ final static float resizeMult=2f; //Resize by a minimum of this much
+ final static float minLoadFactor=0.5f; //Resize by enough to get the load above this factor
+ final static float maxLoadFactor=0.98f; //Resize by enough to get the load under this factor
+ final static float minLoadMult=1/minLoadFactor;
+ final static float maxLoadMult=1/maxLoadFactor;
+
+}
diff --git a/current/kmer/KmerTableSet.java b/current/kmer/KmerTableSet.java
new file mode 100755
index 0000000..d2942b2
--- /dev/null
+++ b/current/kmer/KmerTableSet.java
@@ -0,0 +1,1213 @@
+package kmer;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import jgi.BBMerge;
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import align2.IntList;
+import align2.ListNum;
+import align2.LongList;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import bloom.KmerCountAbstract;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+
+/**
+ * Loads and holds kmers for Tadpole
+ * @author Brian Bushnell
+ * @date Jun 22, 2015
+ *
+ */
+public class KmerTableSet extends AbstractKmerTableSet {
+
+ /**
+ * Command-line entry point: builds a KmerTableSet from the arguments and runs it.
+ * Prints the usage text and exits if help was requested.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+
+ args=Parser.parseConfig(args);
+
+ //Help requested: show usage and quit
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //One timer spans the whole run, the other only the construction
+ final Timer totalTimer=new Timer();
+ final Timer initTimer=new Timer();
+ totalTimer.start();
+ initTimer.start();
+ final KmerTableSet kts=new KmerTableSet(args);
+ initTimer.stop();
+ outstream.println("Initialization Time: \t"+initTimer);
+
+ //Hand the still-running outer timer to the processing step
+ kts.process(totalTimer);
+ }
+
+
+ /**
+ * Constructor used by main; delegates to the two-argument constructor with 12 bytes per kmer.
+ * @param args Command line arguments
+ */
+ private KmerTableSet(String[] args){
+ this(args, 12);//+5 if using ownership and building contigs
+ }
+
+
+ /**
+ * Constructor. Parses the command line, derives the memory budget for the prefilter and
+ * the kmer tables, picks the number of ways and the initial table size, and validates the input files.
+ * @param args Command line arguments
+ * @param bytesPerKmer_ Assumed memory cost of one stored kmer, used to size the tables
+ * @throws RuntimeException On an unknown parameter (unless IGNORE_UNKNOWN_ARGS), an invalid k,
+ * a missing input file argument, or unreadable input files
+ */
+ public KmerTableSet(String[] args, final int bytesPerKmer_){
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ /* Initialize local variables with defaults */
+ Parser parser=new Parser();
+ boolean prealloc_=false;
+ int k_=31;
+ int ways_=-1;
+ int filterMax_=2;
+ boolean ecco_=false, merge_=false;
+ boolean rcomp_=true;
+ double minProb_=defaultMinprob;
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ //Strip leading dashes; the length check keeps an empty or all-dash key from throwing in charAt
+ while(a.length()>0 && a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in1.add(ss);
+ }
+ }
+ }else if(a.equals("in2")){
+ in2.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in2.add(ss);
+ }
+ }
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("initialsize")){
+ initialSize=(int)Tools.parseKMG(b);
+ }else if(a.equals("showstats") || a.equals("stats")){
+ showStats=Tools.parseBoolean(b);
+ }else if(a.equals("ways")){
+ ways_=(int)Tools.parseKMG(b);
+ }else if(a.equals("buflen") || a.equals("bufflen") || a.equals("bufferlength")){
+ buflen=(int)Tools.parseKMG(b);
+ }else if(a.equals("k")){
+ assert(b!=null) : "\nk needs an integer value from 1 to 31, such as k=27. Default is 31.\n";
+ k_=(int)Tools.parseKMG(b);
+ }else if(a.equals("threads") || a.equals("t")){
+ THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("ecco")){
+ ecco_=Tools.parseBoolean(b);
+ }else if(a.equals("merge")){
+ merge_=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("verbose2")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose2=Tools.parseBoolean(b);
+ }else if(a.equals("minprob")){
+ minProb_=Double.parseDouble(b);
+ }else if(a.equals("minprobprefilter") || a.equals("mpp")){
+ minProbPrefilter=Tools.parseBoolean(b);
+ }else if(a.equals("minprobmain") || a.equals("mpm")){
+ minProbMain=Tools.parseBoolean(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("prealloc") || a.equals("preallocate")){
+ if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
+ prealloc_=Tools.parseBoolean(b);
+ }else{
+ preallocFraction=Tools.max(0, Double.parseDouble(b));
+ prealloc_=(preallocFraction>0);
+ }
+ }else if(a.equals("prefilter")){
+ if(b==null || b.length()<1 || !Character.isDigit(b.charAt(0))){
+ prefilter=Tools.parseBoolean(b);
+ }else{
+ filterMax_=(int)Tools.parseKMG(b);
+ prefilter=filterMax_>0;
+ }
+ }else if(a.equals("prefiltersize") || a.equals("prefilterfraction") || a.equals("pff")){
+ prefilterFraction=Tools.max(0, Double.parseDouble(b));
+ assert(prefilterFraction<=1) : "prefiltersize must be 0-1, a fraction of total memory.";
+ prefilter=prefilterFraction>0;
+ }else if(a.equals("prehashes") || a.equals("hashes")){
+ prehashes=(int)Tools.parseKMG(b);
+ }else if(a.equals("prefilterpasses") || a.equals("prepasses")){
+ //A missing value is treated like "auto" (same convention as the threads flag) instead of throwing
+ if(b==null || b.equalsIgnoreCase("auto")){
+ prepasses=-1;
+ }else{
+ prepasses=(int)Tools.parseKMG(b);
+ }
+ }else if(a.equals("onepass")){
+ onePass=Tools.parseBoolean(b);
+ }else if(a.equals("passes")){
+ int passes=(int)Tools.parseKMG(b);
+ onePass=(passes<2);
+ }else if(a.equals("rcomp")){
+ rcomp_=Tools.parseBoolean(b);
+ }else if(IGNORE_UNKNOWN_ARGS){
+ //Do nothing
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ trimq=parser.trimq;
+
+ minAvgQuality=parser.minAvgQuality;
+ minAvgQualityBases=parser.minAvgQualityBases;
+ }
+
+ //Zero passes and a disabled prefilter are the same state; normalize both flags
+ if(prepasses==0 || !prefilter){
+ prepasses=0;
+ prefilter=false;
+ }
+
+
+// assert(false) : prepasses+", "+onePass+", "+prefilter;
+
+ {//Split usable memory between the two prefilter slots and the kmer tables
+ long memory=Runtime.getRuntime().maxMemory();
+ double xmsRatio=Shared.xmsRatio();
+// long tmemory=Runtime.getRuntime().totalMemory();
+ usableMemory=(long)Tools.max(((memory-96000000)*(xmsRatio>0.97 ? 0.82 : 0.75)), memory*0.45);
+ if(prepasses==0 || !prefilter){
+ filterMemory0=filterMemory1=0;
+ }else{
+ double low=Tools.min(prefilterFraction, 1-prefilterFraction);
+ double high=1-low;
+ if(prepasses<0 || (prepasses&1)==1){//odd passes
+ filterMemory0=(long)(usableMemory*low);
+ filterMemory1=(long)(usableMemory*high);
+ }else{//even passes
+ filterMemory0=(long)(usableMemory*high);
+ filterMemory1=(long)(usableMemory*low);
+ }
+ }
+ tableMemory=(long)(usableMemory*.95-filterMemory0);
+ }
+
+ prealloc=prealloc_;
+ bytesPerKmer=bytesPerKmer_;
+ if(ways_<1){//Not specified: at least 31, scaled with threads and memory, rounded up to a prime
+ long maxKmers=(2*tableMemory)/bytesPerKmer;
+ long minWays=Tools.min(10000, maxKmers/Integer.MAX_VALUE);
+ ways_=(int)Tools.max(31, (int)(THREADS*2.5), minWays);
+ ways_=(int)Primes.primeAtLeast(ways_);
+ assert(ways_>0);
+// System.err.println("ways="+ways_);
+ }
+
+ /* Set final variables; post-process and validate argument combinations */
+
+ onePass=onePass&prefilter;
+ ways=ways_;
+ filterMax=Tools.min(filterMax_, 0x7FFFFFFF);
+ ecco=ecco_;
+ merge=merge_;
+ minProb=(float)minProb_;
+ rcomp=rcomp_;
+ estimatedKmerCapacity=(long)((tableMemory*1.0/bytesPerKmer)*((prealloc && preallocFraction==1) ? 0.9 : 0.6));
+// System.err.println("tableMemory="+tableMemory+", bytesPerKmer="+bytesPerKmer+", estimatedKmerCapacity="+estimatedKmerCapacity);
+ KmerCountAbstract.minProb=(minProbPrefilter ? minProb : 0);
+ k=k_;
+ k2=k-1;
+
+ if(k<1 || k>31){throw new RuntimeException("\nk needs an integer value from 1 to 31, such as k=27. Default is 31.\n");}
+
+ if(initialSize<1){
+ final long memOverWays=tableMemory/(bytesPerKmer*ways);
+ final double mem2=(prealloc ? preallocFraction : 1)*tableMemory;
+ initialSize=(prealloc || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(bytesPerKmer*ways))) : initialSizeDefault);
+ if(initialSize!=initialSizeDefault){
+ System.err.println("Initial size set to "+initialSize);
+ }
+ }
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1.isEmpty()){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ //Expand a '#' in a non-existent file name into a read-1 name and a read-2 name
+ for(int i=0; i<in1.size(); i++){
+ String s=in1.get(i);
+ if(s!=null && s.contains("#") && !new File(s).exists()){
+ int pound=s.lastIndexOf('#');
+ String a=s.substring(0, pound);
+ String b=s.substring(pound+1);
+ in1.set(i, a+1+b);
+ in2.add(a+2+b);
+ }
+ }
+
+ {
+ boolean allowDuplicates=true;
+ if(!Tools.testInputFiles(allowDuplicates, true, in1, in2)){
+ throw new RuntimeException("\nCan't read some input files.\n");
+ }
+ }
+ assert(THREADS>0);
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Initial:");
+ outstream.println("Ways="+ways+", initialSize="+initialSize+", prefilter="+(prefilter ? "t" : "f")+", prealloc="+(prealloc ? (""+preallocFraction) : "f"));
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Drops the reference to the table array by setting it to null. */
+ @Override
+ public void clear(){
+ tables=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Creates the table array via AbstractKmerTable.preallocate, using the ARRAY1D table type,
+ * {@code ways} tables and {@code initialSize} per table. Expects no tables to exist yet.
+ */
+ @Override
+ protected void allocateTables(){
+ assert(tables==null);
+ tables=null;
+ //Last argument is true unless a full (fraction 1) preallocation was requested
+ final boolean notFullyPreallocated=(!prealloc || preallocFraction<1);
+ tables=AbstractKmerTable.preallocate(ways, AbstractKmerTable.ARRAY1D, initialSize, notFullyPreallocated);
+ }
+
+ /**
+ * Load reads into tables, using multiple LoadThread.
+ * Opens a read stream over the given file(s), starts THREADS LoadThreads on it, waits for
+ * all of them, accumulates their per-thread statistics into this object's counters, and
+ * closes the stream (folding its status into errorState).
+ * @param fname1 Primary input file
+ * @param fname2 Second input file, passed to FileFormat.testInput as-is
+ * @return Sum of the {@code added} counters of all LoadThreads
+ */
+ public long loadKmers(String fname1, String fname2){
+
+ /* Create read input stream */
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, false, ff1, ff2);
+ cris.start(); //4567
+ }
+
+// /* Optionally skip the first reads, since initial reads may have lower quality */
+// if(skipreads>0){
+// long skipped=0;
+//
+// ListNum<Read> ln=cris.nextList();
+// ArrayList<Read> reads=(ln!=null ? ln.list : null);
+//
+// while(skipped<skipreads && reads!=null && reads.size()>0){
+// skipped+=reads.size();
+//
+// cris.returnList(ln.id, ln.list.isEmpty());
+// ln=cris.nextList();
+// reads=(ln!=null ? ln.list : null);
+// }
+// cris.returnList(ln.id, ln.list.isEmpty());
+// if(reads==null || reads.isEmpty()){
+// ReadWrite.closeStreams(cris);
+// System.err.println("Skipped all of the reads.");
+// System.exit(0);
+// }
+// }
+
+ /* Create ProcessThreads */
+ ArrayList<LoadThread> alpt=new ArrayList<LoadThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alpt.add(new LoadThread(cris));}
+ for(LoadThread pt : alpt){pt.start();}
+
+ long added=0;
+
+ /* Wait for threads to die, and gather statistics */
+ for(LoadThread pt : alpt){
+ //Keep joining until the thread has really terminated; an interrupt only prints a stack trace and retries
+ while(pt.getState()!=Thread.State.TERMINATED){
+ try {
+ pt.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ added+=pt.added;
+
+ readsIn+=pt.readsInT;
+ basesIn+=pt.basesInT;
+ lowqReads+=pt.lowqReadsT;
+ lowqBases+=pt.lowqBasesT;
+ readsTrimmed+=pt.readsTrimmedT;
+ basesTrimmed+=pt.basesTrimmedT;
+ }
+
+ /* Shut down I/O streams; capture error status */
+ errorState|=ReadWrite.closeStreams(cris);
+ return added;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Loads kmers.
+ */
+ private class LoadThread extends Thread{
+
+ /**
+ * Constructor. Stores the shared input stream and creates this thread's own
+ * HashBuffer over the enclosing set's tables, using buflen and k.
+ * @param cris_ Read input stream
+ */
+ public LoadThread(ConcurrentReadInputStream cris_){
+ cris=cris_;
+ table=new HashBuffer(tables, buflen, k, false);
+ }
+
+ /**
+ * Pulls read lists from the input stream until it is exhausted. Each read (and its mate) is
+ * validated, counted, optionally discarded by average quality, optionally quality-trimmed,
+ * discarded if shorter than k, optionally merged or overlap-corrected with its mate, and
+ * finally has its kmers added to the buffered table. The buffer is flushed at the end.
+ */
+ @Override
+ public void run(){
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //While there are more reads lists...
+ while(reads!=null && reads.size()>0){
+
+ //For each read (or pair) in the list...
+ for(int i=0; i<reads.size(); i++){
+ Read r1=reads.get(i);
+ Read r2=r1.mate;
+
+ if(!r1.validated()){r1.validate(true);}
+ if(r2!=null && !r2.validated()){r2.validate(true);}
+
+ if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+ readsInT++;
+ basesInT+=r1.length();
+ if(r2!=null){
+ readsInT++;
+ basesInT+=r2.length();
+ }
+
+ //Determine whether to discard the reads based on average quality
+ if(minAvgQuality>0){
+ if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);}
+ if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);}
+ }
+
+ if(r1!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r1.length()<k){r1.setDiscarded(true);}
+ }
+ if(r2!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r2.length()<k){r2.setDiscarded(true);}
+ }
+
+ if((ecco || merge) && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){
+ if(merge){
+ final int insert=BBMerge.findOverlapStrict(r1, r2, false);
+ if(insert>0){
+ //Overlap found: replace the pair with a single joined read
+ r2.reverseComplement();
+ r1=r1.joinRead(insert);
+ r2=null;
+ }
+ }else if(ecco){
+ BBMerge.findOverlapStrict(r1, r2, true);
+ }
+ }
+
+ if(r1!=null){
+ if(r1.discarded()){
+ lowqBasesT+=r1.length();
+ lowqReadsT++;
+ }else{
+ long temp=addKmersToTable(r1);
+ added+=temp;
+ if(verbose){System.err.println("A: Added "+temp);}
+ }
+ }
+ if(r2!=null){
+ if(r2.discarded()){
+ lowqBasesT+=r2.length();
+ lowqReadsT++;
+ }else{
+ long temp=addKmersToTable(r2);
+ added+=temp;
+ if(verbose){System.err.println("B: Added "+temp);}
+ }
+ }
+ }
+
+ //Fetch a new read list
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //The loop also exits when ln or ln.list is null, so guard before returning the final list
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ long temp=table.flush();
+ if(verbose){System.err.println("Flush: Added "+temp);}
+ added+=temp;
+ }
+
+
+ /**
+ * Adds every acceptable kmer of a read to the buffered table.
+ * Delegates to addKmersToTable_onePass when onePass is set. Otherwise a kmer is added when it
+ * spans k consecutive defined bases, its product of per-base correctness probabilities is at
+ * least minProb (only enforced when minProbMain is set and qualities exist), and, if the
+ * prefilter is active, its prefilter count exceeds filterMax2.
+ * @param r Read to process; may be null
+ * @return Number of kmers newly created in the table; 0 for a null, baseless or too-short read
+ */
+ private final int addKmersToTable(final Read r){
+ if(onePass){return addKmersToTable_onePass(r);}
+ if(r==null || r.bases==null){return 0;}
+ final float minProb2=(minProbMain ? minProb : 0);
+ final byte[] bases=r.bases;
+ final byte[] quals=r.quality;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int created=0;
+ int len=0;
+
+ //The caller sums return values into a counter, so a short read contributes 0 rather than -1
+ if(bases.length<k){return 0;}
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ float prob=1;
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+
+ //Update kmers
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+
+ if(minProb2>0 && quals!=null){//Update probability
+ prob=prob*PROB_CORRECT[quals[i]];
+ //len has not been incremented for base i yet, so len>=k means the window already
+ //held k bases and the one at i-k is leaving it
+ if(len>=k){
+ byte oldq=quals[i-k];
+ prob=prob*PROB_CORRECT_INVERSE[oldq];
+ }
+ }
+
+ //Handle Ns
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ prob=1;
+ }else{len++;}
+
+ if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k && prob>=minProb2){
+ final long key=toValue(kmer, rkmer);
+ if(!prefilter || prefilterArray.read(key)>filterMax2){
+ int temp=table.incrementAndReturnNumCreated(key);
+ created+=temp;
+ if(verbose){System.err.println("C: Added "+temp);}
+ }
+ }
+ }
+
+ return created;
+ }
+
+
+ /**
+ * One-pass variant of addKmersToTable: every valid kmer increments the prefilter, and is
+ * added to the buffered table only once its previous prefilter count was at least filterMax2.
+ * NOTE(review): prob is maintained here but never compared against minProb; confirm that is intended.
+ * @param r Read to process; may be null
+ * @return Number of kmers newly created in the table; 0 for a null, baseless or too-short read
+ */
+ private final int addKmersToTable_onePass(final Read r){
+ assert(prefilter);
+ if(r==null || r.bases==null){return 0;}
+ final byte[] bases=r.bases;
+ final byte[] quals=r.quality;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int created=0;
+ int len=0;
+
+ //The caller sums return values into a counter, so a short read contributes 0 rather than -1
+ if(bases.length<k){return 0;}
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ float prob=1;
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+
+ //Update kmers
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+
+ if(minProb>0 && quals!=null){//Update probability
+ prob=prob*PROB_CORRECT[quals[i]];
+ if(len>k){
+ byte oldq=quals[i-k];
+ prob=prob*PROB_CORRECT_INVERSE[oldq];
+ }
+ }
+
+ //Handle Ns
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ prob=1;
+ }else{len++;}
+
+ if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ final long key=toValue(kmer, rkmer);
+ //count is the prefilter value before this increment
+ int count=prefilterArray.incrementAndReturnUnincremented(key, 1);
+ if(count>=filterMax2){
+ int temp=table.incrementAndReturnNumCreated(key);
+ created+=temp;
+ if(verbose){System.err.println("D: Added "+temp);}
+ }
+ }
+ }
+ return created;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Input read stream */
+ private final ConcurrentReadInputStream cris;
+
+ /** This thread's buffer in front of the shared tables; flushed at the end of run() */
+ private final HashBuffer table;
+
+ /** Running total of the values returned by addKmersToTable plus the final flush */
+ public long added=0;
+
+ /* Per-thread statistics, summed by loadKmers after the thread terminates */
+ private long readsInT=0;
+ private long basesInT=0;
+ private long lowqReadsT=0;
+ private long lowqBasesT=0;
+ private long readsTrimmedT=0;
+ private long basesTrimmedT=0;
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Convenience ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Recomputes the kmers and counts at list positions a+1 up to (but excluding)
+ * min(counts.size, a+k+1), starting from the kmer already stored at position a and
+ * extending it with bases[a+k], bases[a+k+1], ...
+ * Positions whose window contains an undefined base get kmer -1 and count 0.
+ * NOTE(review): presumably called after the base at index a+k-1 was changed and kmers[a] was updated - confirm against callers.
+ * @param bases Sequence the kmer list was built from
+ * @param kmers Kmer list to update in place
+ * @param counts Count list to update in place
+ * @param a Index of the kmer to start from
+ */
+ public void regenerateKmers(byte[] bases, LongList kmers, IntList counts, final int a){
+ final int loc=a+k;
+ final int lim=Tools.min(counts.size, a+k+1);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=kmers.get(a);
+ long rkmer=rcomp(kmer);
+ int len=k;
+
+// assert(false) : a+", "+loc+", "+lim;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=loc, j=a+1; j<lim; i++, j++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+
+ if(len>=k){
+ //Each regenerated kmer is expected to differ from the stored one
+ assert(kmers.get(j)!=kmer);
+ kmers.set(j, kmer);
+ int count=getCount(kmer, rkmer);
+ counts.set(j, count);
+ }else{
+ kmers.set(j, -1);
+ counts.set(j, 0);
+ }
+ }
+ }
+
+ /**
+ * Rebuilds {@code kmers} with one entry per kmer position of {@code bases}: the forward
+ * kmer where all k bases are defined, otherwise -1. If the sequence is shorter than k
+ * the list is left untouched.
+ * @param bases Sequence to scan
+ * @param kmers Output list; cleared before filling
+ * @return Number of valid (non -1) kmers added
+ */
+ public int fillKmers(byte[] bases, LongList kmers){
+ final int n=bases.length;
+ if(n<k){return 0;}
+ final long mask=~((-1L)<<(2*k));
+ final int firstFullWindow=k-1;
+ long fwd=0;
+ int run=0;
+ int good=0;
+
+ kmers.clear();
+
+ /* Roll a forward kmer across the sequence */
+ for(int pos=0; pos<n; pos++){
+ final long code=AminoAcid.baseToNumber[bases[pos]];
+ fwd=((fwd<<2)|code)&mask;
+ if(code>=0){
+ run++;
+ }else{
+ //Undefined base: restart the window
+ run=0;
+ fwd=0;
+ }
+ //No entry is emitted before the first full-length window
+ if(pos<firstFullWindow){continue;}
+ if(run<k){
+ kmers.add(-1);
+ }else{
+ kmers.add(fwd);
+ good++;
+ }
+ }
+ return good;
+ }
+
+ /**
+ * Rebuilds {@code counts} so that entry i holds the table count of kmers[i],
+ * or 0 where kmers[i] is negative.
+ * @param kmers Forward kmers; negative entries mark invalid positions
+ * @param counts Output list; cleared before filling
+ */
+ public void fillCounts(LongList kmers, IntList counts){
+ counts.clear();
+ int idx=0;
+ while(idx<kmers.size){
+ final long fwd=kmers.get(idx);
+ if(fwd<0){
+ counts.add(0);
+ }else{
+ counts.add(getCount(fwd, rcomp(fwd)));
+ }
+ idx++;
+ }
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Calls regenerate() on every table.
+ * @return Sum of the values returned by the tables
+ */
+ public long regenerate(){
+ long total=0;
+ for(int t=0; t<tables.length; t++){
+ total+=tables[t].regenerate();
+ }
+ return total;
+ }
+
+ /** Returns the table at index key%ways, cast to HashArray1D. */
+ public HashArray1D getTableForKey(long key){
+ return (HashArray1D) tables[(int)(key%ways)];
+ }
+
+ /** Returns the table at index tnum, cast to HashArray1D. */
+ public HashArray1D getTable(int tnum){
+ return (HashArray1D) tables[tnum];
+ }
+
+ /**
+ * Lets every table fill a shared histogram array.
+ * @param histMax Largest histogram index; the array has histMax+1 entries
+ * @return The filled array
+ */
+ public long[] fillHistogram(int histMax) {
+ final long[] histogram=new long[histMax+1];
+ for(int t=0; t<tables.length; t++){
+ tables[t].fillHistogram(histogram, histMax);
+ }
+ return histogram;
+ }
+
+ /** Calls initializeOwnership() on every table. */
+ public void initializeOwnership(){
+ for(int t=0; t<tables.length; t++){
+ tables[t].initializeOwnership();
+ }
+ }
+
+ /** Calls clearOwnership() on every table. */
+ public void clearOwnership(){
+ for(int t=0; t<tables.length; t++){
+ tables[t].clearOwnership();
+ }
+ }
+
+ /** Convenience overload: applies rightmostKmer to the builder's array and length. */
+ public long rightmostKmer(final ByteBuilder bb){
+ return rightmostKmer(bb.array, bb.length());
+ }
+
+ /**
+ * Encodes the last k bases of a sequence as a forward kmer.
+ * @param bases Sequence
+ * @param blen Number of bases of the array to consider
+ * @return The forward kmer, or -1 if blen is less than k or one of the last k bases has a negative code
+ */
+ public long rightmostKmer(final byte[] bases, final int blen){
+ if(blen<k){return -1;}
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int len=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
+ {
+ for(int i=blen-k; i<blen; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+ //rkmer is only reported in this verbose line; the method returns the forward kmer
+ if(verbose){outstream.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ }
+ }
+
+ if(len<k){return -1;}
+ else{assert(len==k);}
+ return kmer;
+ }
+
+ /** Convenience overload: applies leftmostKmer to the builder's array and length. */
+ public long leftmostKmer(final ByteBuilder bb){
+ return leftmostKmer(bb.array, bb.length());
+ }
+
+ /**
+ * Encodes the first k bases of a sequence as a forward kmer.
+ * @param bases Sequence
+ * @param blen Number of bases of the array to consider
+ * @return The forward kmer, or -1 if blen is less than k or one of the first k bases has a negative code
+ */
+ public long leftmostKmer(final byte[] bases, final int blen){
+ if(blen<k){return -1;}
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int len=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the leftmost kmer */
+ {
+ for(int i=0; i<k; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+ //rkmer is only reported in this verbose line; the method returns the forward kmer
+ if(verbose){outstream.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ }
+ }
+
+ if(len<k){return -1;}
+ else{assert(len==k);}
+ return kmer;
+ }
+
+ /** Convenience overload: applies doubleClaim to the builder's array and length. */
+ public boolean doubleClaim(final ByteBuilder bb, final int id){
+ return doubleClaim(bb.array, bb.length(), id);
+ }
+
+ /**
+ * Ensures there can be only one owner.
+ * Claims the sequence with {@code id} and, only if that succeeds, claims it a second
+ * time with {@code id+CLAIM_OFFSET}. Both claims stop at the first failure.
+ * @param bases Sequence whose kmers are claimed
+ * @param blength Number of bases of the array to consider
+ * @param id Claimant id
+ * @return true only if both claims succeeded
+ */
+ public boolean doubleClaim(final byte[] bases, final int blength, final int id){
+ boolean success=claim(bases, blength, id, true);
+ if(verbose){outstream.println("success1="+success+", id="+id+", blength="+blength);}
+ if(!success){return false;}
+ success=claim(bases, blength, id+CLAIM_OFFSET, true);
+ if(verbose){outstream.println("success2="+success+", id="+id+", blength="+blength);}
+ return success;
+ }
+
+ /** Convenience overload: applies claim to the builder's array and length. */
+ public boolean claim(final ByteBuilder bb, final int id, final boolean exitEarly){
+ return claim(bb.array, bb.length(), id, exitEarly);
+ }
+
+ /**
+ * Calculates the average table count over all kmers of a sequence that consist of k
+ * defined bases. Kmers absent from the tables (negative lookup result) contribute 0,
+ * matching the clamping used by fillRightCounts and fillLeftCounts.
+ * @param bases Sequence
+ * @param blength Number of bases of the array to consider
+ * @return Average count per kmer, or 0 if the counts sum to 0 (including when there are no kmers)
+ */
+ public float calcCoverage(final byte[] bases, final int blength){
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0, rkmer=0;
+ int len=0;
+ long sum=0;
+ int kmers=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<blength; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+ if(len>=k){
+ //Clamp so a not-present result cannot subtract from the total
+ final int count=Tools.max(getCount(kmer, rkmer), 0);
+ sum+=count;
+ kmers++;
+ }
+ }
+ return sum==0 ? 0 : sum/(float)kmers;
+ }
+
+ /**
+ * Attempts to claim every kmer of a sequence for {@code id}.
+ * @param bases Sequence whose kmers are claimed
+ * @param blength Number of bases of the array to consider
+ * @param id Claimant id
+ * @param exitEarly If true, stop and report failure at the first kmer that cannot be claimed;
+ * if false, individual failures are ignored
+ * @return false only if exitEarly is set and some kmer could not be claimed
+ */
+ public boolean claim(final byte[] bases, final int blength, final int id, boolean exitEarly){
+ if(verbose){outstream.println("Thread "+id+" claim start.");}
+ final int bits=2*k;
+ final int topShift=bits-2;
+ final long mask=~((-1L)<<bits);
+ long fwd=0;
+ long rev=0;
+ int run=0;
+ /* Roll a forward and a reverse kmer across the sequence */
+ for(int i=0; i<blength; i++){
+ final byte b=bases[i];
+ final long code=AminoAcid.baseToNumber[b];
+ final long ccode=AminoAcid.baseToComplementNumber[b];
+ fwd=((fwd<<2)|code)&mask;
+ rev=(rev>>>2)|(ccode<<topShift);
+ if(code>=0){
+ run++;
+ }else{
+ run=0;
+ fwd=rev=0;
+ }
+ if(verbose){System.err.println("Scanning i="+i+", len="+run+", kmer="+fwd+", rkmer="+rev+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(run<k){continue;}
+ //The claim is always attempted; its failure only matters when exiting early
+ final boolean claimed=claim(fwd, rev, id/*, rid, i*/);
+ if(!claimed && exitEarly){return false;}
+ }
+ return true;
+ }
+
+ /**
+ * Attempts to set {@code id} as owner of a single kmer.
+ * A kmer whose table value is negative is treated as successfully claimed without
+ * touching any owner.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse-complement kmer
+ * @param id Claimant id
+ * @return true if the kmer has a negative table value, or if setOwner reports {@code id} as the owner
+ */
+ public boolean claim(final long kmer, final long rkmer, final int id/*, final long rid, final int pos*/){
+ //TODO: rid and pos are just for debugging.
+ final long key=toValue(kmer, rkmer);
+ final int way=(int)(key%ways);
+ final AbstractKmerTable table=tables[way];
+ final int count=table.getValue(key);
+ assert(count==-1 || count>0) : count;
+// if(verbose /*|| true*/){outstream.println("Count="+count+".");}
+ if(count<0){return true;}
+ assert(count>0) : count;
+ final int owner=table.setOwner(key, id);
+ if(verbose){outstream.println("owner="+owner+".");}
+// assert(owner==id) : id+", "+owner+", "+rid+", "+pos;
+ return owner==id;
+ }
+
+ /** Convenience overload: applies release to the builder's array and length. */
+ public void release(ByteBuilder bb, final int id){
+ release(bb.array, bb.length(), id);
+ }
+
+ /**
+ * Calls release(kmer, rkmer, id) for every kmer of a sequence that consists of k defined bases.
+ * @param bases Sequence whose kmers are released
+ * @param blength Number of bases of the array to consider
+ * @param id Id passed to each per-kmer release
+ */
+ public void release(final byte[] bases, final int blength, final int id){
+ if(verbose /*|| true*/){outstream.println("*Thread "+id+" release start.");}
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0, rkmer=0;
+ int len=0;
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<blength; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+ if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ release(kmer, rkmer, id);
+ }
+ }
+ }
+
+ /** Releases a single kmer, identified by its forward and reverse forms, via its canonical key. */
+ public boolean release(final long kmer, final long rkmer, final int id){
+ return release(toValue(kmer, rkmer), id);
+ }
+
+ /**
+ * Clears ownership of a single key in its table.
+ * @param key Canonical kmer value, as produced by toValue
+ * @param id Id passed to clearOwner
+ * @return true if the key's table value is below 1 (nothing to clear); otherwise the result of clearOwner
+ */
+ public boolean release(final long key, final int id){
+ final int way=(int)(key%ways);
+ final AbstractKmerTable table=tables[way];
+ final int count=table.getValue(key);
+// if(verbose /*|| true*/){outstream.println("Count="+count+".");}
+ if(count<1){return true;}
+ return table.clearOwner(key, id);
+ }
+
+ /** Convenience overload: applies findOwner to the builder's array and length. */
+ public int findOwner(ByteBuilder bb, final int id){
+ return findOwner(bb.array, bb.length(), id);
+ }
+
+ /**
+ * Finds the highest owner id among the kmers of a sequence, stopping as soon as an
+ * owner greater than {@code id} is seen.
+ * @param bases Sequence to scan
+ * @param blength Number of bases of the array to consider
+ * @param id Threshold id for the early exit
+ * @return The highest owner found before stopping, or -1 if no kmer reported an owner of 0 or more
+ */
+ public int findOwner(final byte[] bases, final int blength, final int id){
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0, rkmer=0;
+ int len=0;
+ int maxOwner=-1;
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<blength; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ final long x2=AminoAcid.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(x<0){
+ len=0;
+ kmer=rkmer=0;
+ }else{len++;}
+ if(verbose){System.err.println("Scanning i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+"\t"+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ int owner=findOwner(kmer, rkmer);
+ maxOwner=Tools.max(owner, maxOwner);
+ if(maxOwner>id){break;}
+ }
+ }
+ return maxOwner;
+ }
+
+ /** Looks up the owner of a forward kmer, computing its reverse complement first. */
+ public int findOwner(final long kmer){
+ return findOwner(kmer, rcomp(kmer));
+ }
+
+ /**
+ * Looks up the owner of a single kmer.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse-complement kmer
+ * @return -1 if the kmer's table value is negative; otherwise whatever getOwner returns
+ */
+ public int findOwner(final long kmer, final long rkmer){
+ final long canonical=toValue(kmer, rkmer);
+ final AbstractKmerTable home=tables[(int)(canonical%ways)];
+ //The owner is only consulted for keys with a non-negative value
+ if(home.getValue(canonical)<0){return -1;}
+ return home.getOwner(canonical);
+ }
+
+ /** Returns the value stored for the canonical form of (kmer, rkmer) in the table at index key%ways. */
+ public int getCount(long kmer, long rkmer){
+ long key=toValue(kmer, rkmer);
+ int way=(int)(key%ways);
+ return tables[way].getValue(key);
+ }
+
+ /** Returns the value stored for a key in the table at index key%ways; the key is used as given. */
+ public int getCount(long key){
+ int way=(int)(key%ways);
+ return tables[way].getValue(key);
+ }
+
+ /**
+ * Looks up the counts of the four kmers obtained by dropping the leftmost base of
+ * {@code kmer} and appending base code 0..3 on the right.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse complement of kmer
+ * @param counts Output; counts[i] receives the count for appended base code i, with negative lookups stored as 0
+ * @param mask Bitmask applied to the shifted forward kmer
+ * @param shift2 Bit position at which the complement base is inserted into the reverse kmer
+ * @return Index of the highest count; the lowest index wins ties
+ */
+ public int fillRightCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){
+ assert(kmer==rcomp(rkmer));
+ if(verbose){outstream.println("fillRightCounts: "+toText(kmer)+", "+toText(rkmer));}
+ kmer=(kmer<<2)&mask;
+ rkmer=(rkmer>>>2);
+ int max=-1, maxPos=0;
+
+ for(int i=0; i<=3; i++){
+ long kmer2=kmer|((long)i);
+ long rkmer2=rkmer|(((long)AminoAcid.numberToComplement[i])<<shift2);
+ if(verbose){outstream.println("kmer: "+toText(kmer2)+", "+toText(rkmer2));}
+ assert(kmer2==(kmer2&mask));
+ assert(rkmer2==(rkmer2&mask));
+ assert(kmer2==rcomp(rkmer2));
+ long key=toValue(kmer2, rkmer2);
+ int way=(int)(key%ways);
+ int count=tables[way].getValue(key);
+ assert(count==NOT_PRESENT || count>=0);
+ count=Tools.max(count, 0);
+ counts[i]=count;
+ if(count>max){
+ max=count;
+ maxPos=i;
+ }
+ }
+ return maxPos;
+ }
+
+ /**
+ * Looks up the counts of the four kmers obtained by dropping the rightmost base of
+ * {@code kmer} and prepending base code 0..3 on the left.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse complement of kmer
+ * @param counts Output; counts[i] receives the count for prepended base code i, with negative lookups stored as 0
+ * @param mask Bitmask applied to the shifted reverse kmer
+ * @param shift2 Bit position at which the new base is inserted into the forward kmer
+ * @return Index of the highest count; the lowest index wins ties
+ */
+ public int fillLeftCounts(long kmer, long rkmer, int[] counts, long mask, int shift2){
+ assert(kmer==rcomp(rkmer));
+ if(verbose){outstream.println("fillLeftCounts: "+toText(kmer)+", "+toText(rkmer));}
+ rkmer=(rkmer<<2)&mask;
+ kmer=(kmer>>>2);
+ int max=-1, maxPos=0;
+// assert(false) : shift2+", "+k;
+ for(int i=0; i<=3; i++){
+ long rkmer2=rkmer|((long)AminoAcid.numberToComplement[i]);
+ long kmer2=kmer|(((long)i)<<shift2);
+ if(verbose){outstream.println("kmer: "+toText(kmer2)+", "+toText(rkmer2));}
+ assert(kmer2==(kmer2&mask));
+ assert(rkmer2==(rkmer2&mask));
+ assert(kmer2==rcomp(rkmer2)) : "\n"+"kmer: \t"+toText(rcomp(rkmer2))+", "+toText(rcomp(kmer2));
+ //Forward kmer goes first: toValue returns its first argument when rcomp is disabled
+ long key=toValue(kmer2, rkmer2);
+ int way=(int)(key%ways);
+ int count=tables[way].getValue(key);
+ assert(count==NOT_PRESENT || count>=0);
+ count=Tools.max(count, 0);
+ counts[i]=count;
+ if(count>max){
+ max=count;
+ maxPos=i;
+ }
+ }
+ return maxPos;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Printing Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Writes the kmers of all tables to a file, one table after another on the calling thread.
+ * @param fname Output file name; if null nothing is written
+ * @param minToDump Passed to each table as the minimum count
+ * @param printTime If true, print the elapsed time to outstream
+ * @return false if fname is null; otherwise the writer's errorState after it has finished
+ */
+ public boolean dumpKmersAsBytes(String fname, int minToDump, boolean printTime){
+ if(fname==null){return false;}
+ Timer t=new Timer();
+
+ ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, true);
+ bsw.start();
+ for(AbstractKmerTable set : tables){
+ set.dumpKmersAsBytes(bsw, k, minToDump);
+ }
+ bsw.poisonAndWait();
+
+ t.stop();
+ if(printTime){outstream.println("Kmer Dump Time: \t"+t);}
+ return bsw.errorState;
+ }
+
+ /**
+ * Writes the kmers of all tables to a file via DumpThread.dump. Falls back to
+ * dumpKmersAsBytes when fewer than 3 threads are usable or DumpThread.NUM_THREADS is 1.
+ * @param fname Output file name; if null nothing is written
+ * @param minToDump Passed on as the minimum count
+ * @param printTime If true, print the elapsed time to outstream
+ * @return false if fname is null; otherwise the writer's errorState after it has finished
+ */
+ public boolean dumpKmersAsBytes_MT(String fname, int minToDump, boolean printTime){
+ final int threads=Tools.min(Shared.threads(), tables.length);
+ if(threads<3 || DumpThread.NUM_THREADS==1){return dumpKmersAsBytes(fname, minToDump, printTime);}
+
+ if(fname==null){return false;}
+ Timer t=new Timer();
+
+ ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, false, true);
+ bsw.start();
+ DumpThread.dump(k, minToDump, tables, bsw);
+ bsw.poisonAndWait();
+
+ t.stop();
+ if(printTime){outstream.println("Kmer Dump Time: \t"+t);}
+ return bsw.errorState;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Reverse-complements a kmer of this set's length k via AminoAcid.reverseComplementBinaryFast. */
+ private final long rcomp(long kmer){return AminoAcid.reverseComplementBinaryFast(kmer, k);}
+ /** Renders a kmer of this set's length k as text via AbstractKmerTable.toText. */
+ private final StringBuilder toText(long kmer){return AbstractKmerTable.toText(kmer, k);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Transforms a kmer into a canonical value stored in the table. Expected to be inlined.
+ * With rcomp enabled this is the larger of the two arguments; otherwise it is the first
+ * argument, so argument order matters.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @return Canonical value
+ */
+ public final long toValue(long kmer, long rkmer){
+ long value=(rcomp ? Tools.max(kmer, rkmer) : kmer);
+ return value;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns k; this set has a single kmer length. */
+ public int kbig(){return k;}
+ /** Returns filterMemory0 for even pass numbers and filterMemory1 for odd ones. */
+ public long filterMemory(int pass){return ((pass&1)==0) ? filterMemory0 : filterMemory1;}
+ /** Returns the ecco flag. */
+ public boolean ecco(){return ecco;}
+ /** Returns the qtrimLeft flag. */
+ public boolean qtrimLeft(){return qtrimLeft;}
+ /** Returns the qtrimRight flag. */
+ public boolean qtrimRight(){return qtrimRight;}
+ /** Returns the minimum average quality threshold. */
+ public byte minAvgQuality(){return minAvgQuality;}
+ /** Returns the memory budget computed for the kmer tables. */
+ public long tableMemory(){return tableMemory;}
+ /** Returns the kmer capacity estimated in the constructor. */
+ public long estimatedKmerCapacity(){return estimatedKmerCapacity;}
+
+ /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in tables[Y] */
+ private AbstractKmerTable[] tables;
+
+ /** Assumed memory cost of one stored kmer; constructor argument used for sizing */
+ private final int bytesPerKmer;
+
+ /** Memory budget derived from the JVM maximum memory in the constructor */
+ private final long usableMemory;
+ /** Prefilter memory returned by filterMemory(pass) for even pass numbers */
+ private final long filterMemory0;
+ /** Prefilter memory returned by filterMemory(pass) for odd pass numbers */
+ private final long filterMemory1;
+ /** Memory budget for the kmer tables: 95% of usableMemory minus filterMemory0 */
+ private final long tableMemory;
+ /** Estimated number of kmers the tables can hold, computed in the constructor */
+ private final long estimatedKmerCapacity;
+
+ /** True if preallocation was requested via the prealloc/preallocate argument */
+ private final boolean prealloc;
+
+ /** Number of tables (and threads, during loading) */
+ public final int ways;
+
+ /** Normal kmer length */
+ public final int k;
+ /** k-1; used in some expressions */
+ public final int k2;
+
+ /** Look for reverse-complements as well as forward kmers. Default: true */
+ private final boolean rcomp;
+
+ /** Quality-trim the left side */
+ public final boolean qtrimLeft;
+ /** Quality-trim the right side */
+ public final boolean qtrimRight;
+ /** Trim bases at this quality or below. Default: 4 */
+ public final byte trimq;
+
+ /** Throw away reads below this average quality before trimming. Default: 0 */
+ public final byte minAvgQuality;
+ /** If positive, calculate average quality from the first X bases only. Default: 0 */
+ public final int minAvgQualityBases;
+
+ /** Ignore kmers with probability of correctness less than this */
+ public final float minProb;
+
+ /** Correct via overlap */
+ private final boolean ecco;
+
+ /** Attempt to merge via overlap prior to counting kmers */
+ private final boolean merge;
+
+}
diff --git a/current/kmer/Primes.java b/current/kmer/Primes.java
new file mode 100755
index 0000000..ad7f777
--- /dev/null
+++ b/current/kmer/Primes.java
@@ -0,0 +1,163 @@
+package kmer;
+
+import java.io.File;
+import java.util.Arrays;
+
+import dna.Data;
+
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 9, 2012
+ *
+ */
+public class Primes {
+
+ public static void main(String[] args){
+
+ if(args.length==3){makePrimes(args);}
+ else{
+
+
+ System.out.println(primeAtLeast(100));
+ System.out.println(primeAtLeast(1000));
+ System.out.println(primeAtLeast(10000));
+ System.out.println(primeAtLeast(100000));
+ System.out.println(primeAtLeast(1000000));
+ System.out.println(primeAtLeast(10000000));
+ System.out.println(primeAtLeast(100000000));
+ System.out.println(primeAtLeast(1000000000));
+ System.out.println(primeAtLeast(10000000000L));
+ System.out.println(primeAtLeast(100000000000L));
+ System.out.println(primeAtLeast(1000000000000L));
+ System.out.println(primeAtLeast(10000000000000L));
+ System.out.println(primeAtLeast(100000000000000L));
+ System.out.println(primeAtLeast(1000000000000000L));
+
+
+ System.out.println(primeAtMost(100));
+ System.out.println(primeAtMost(1000));
+ System.out.println(primeAtMost(10000));
+ System.out.println(primeAtMost(100000));
+ System.out.println(primeAtMost(1000000));
+ System.out.println(primeAtMost(10000000));
+ System.out.println(primeAtMost(100000000));
+ System.out.println(primeAtMost(1000000000));
+ System.out.println(primeAtMost(10000000000L));
+ System.out.println(primeAtMost(100000000000L));
+ System.out.println(primeAtMost(1000000000000L));
+ System.out.println(primeAtMost(10000000000000L));
+ System.out.println(primeAtMost(100000000000000L));
+ System.out.println(primeAtMost(1000000000000000L));
+
+ }
+
+ }
+
+
+ /**
+  * Thins a list of numbers (one per line) so that each value written is at least
+  * 'mult' times the previous value written.
+  * @param args args[0]: input file; args[1]: output file; args[2]: minimum ratio between
+  * consecutive values written (asserted to be at least 1)
+  * @throws RuntimeException If the input file does not exist
+  * @throws NumberFormatException If args[2] or a non-blank input line cannot be parsed
+  */
+ public static void makePrimes(String[] args){
+
+     final String in=args[0];
+     final String out=args[1];
+     final double mult=Double.parseDouble(args[2]);
+     assert(mult>=1);
+
+     if(!new File(in).exists()){throw new RuntimeException("File not found: "+in);}
+     final TextFile tf=new TextFile(in, true, false);
+     final TextStreamWriter tsw=new TextStreamWriter(out, true, false, false);
+     tsw.start();
+
+     //Smallest input value that will be written next
+     long next=1;
+
+     try{
+         for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             final String trimmed=s.trim();
+             if(trimmed.isEmpty()){continue;} //Tolerate blank lines, e.g. at end of file
+             final long x=Long.parseLong(trimmed);
+
+             if(x>=next){
+                 tsw.print(x+"\n");
+                 next=(long)(x*mult);
+             }
+         }
+     }finally{
+         //Always stop the writer and release the reader, even if a line fails to parse
+         tsw.poison();
+         tf.close();
+     }
+
+ }
+
+
+ /**
+  * Finds the smallest entry of the 'primes' table that is at least x.
+  * If x is larger than every table entry, returns instead the product of two
+  * table-derived values a*b such that a*b>=x; that result is not itself prime.
+  * @param x Lower bound
+  * @return A value greater than or equal to x
+  */
+ public static long primeAtLeast(long x){
+     int loc=Arrays.binarySearch(primes, x);
+     if(loc<0){
+         //Not found; convert the encoded result to the insertion point
+         loc=-(loc+1);
+         assert(loc>=primes.length || primes[loc]>x) : x;
+     }
+     if(loc>=primes.length){//Out of bounds
+         long d=(long)Math.pow(x, 0.4);
+         long a=primeAtLeast(x/d);
+         //Round the quotient up; with floor division a*b could fall below x
+         //whenever x/a is itself in the table and a does not divide x.
+         long b=primeAtLeast(x/a+(x%a==0 ? 0 : 1));
+         long c=a*b;
+         assert(c>=x && c<(x*9)/8) : x+", "+a+", "+b+", "+c+", "+d;
+         return c;
+     }
+     while(primes[loc]<x){loc++;}
+     return primes[loc];
+ }
+
+
+ /**
+  * Finds the largest entry of the 'primes' table that is at most x.
+  * If x is larger than every table entry, returns instead the product of two
+  * table-derived values a*b such that a*b<=x; that result is not itself prime.
+  * @param x Upper bound
+  * @return A value less than or equal to x
+  * @throws RuntimeException If the table is empty or x is smaller than its first entry
+  */
+ public static long primeAtMost(long x){
+     //Previously this case failed an assertion, or ran off the front of the array when assertions were disabled
+     if(primes.length<1 || x<primes[0]){
+         throw new RuntimeException("No primes small enough for "+x);
+     }
+     int loc=Arrays.binarySearch(primes, x);
+     if(loc>=0){return primes[loc];} //Exact hit
+
+     //Not found; convert the encoded result to the insertion point
+     loc=-(loc+1);
+     assert(loc>=primes.length || primes[loc]>x) : x;
+
+     if(loc>=primes.length){//Out of bounds
+         long d=(long)Math.pow(x, 0.4);
+         long a=primeAtMost(x/d);
+         long b=primeAtMost(x/a);
+         long c=a*b;
+         assert(c<=x && c>(x*7)/8) : x+", "+a+", "+b+", "+c+", "+d;
+         return c;
+     }
+
+     //primes[loc] is the first entry above x, so walk down to the first entry that is not
+     while(primes[loc]>x){loc--;}
+     return primes[loc];
+ }
+
+
+ /**
+ * @return
+ */
+ private static long[] fetchPrimes() {
+ String fname=Data.findPath("?primes.txt.gz");
+
+ TextFile tf=new TextFile(fname, false, false);
+ String[] lines=tf.toStringLines();
+ long[] array=new long[lines.length];
+ for(int i=0; i<lines.length; i++){
+ array[i]=Long.parseLong(lines[i].trim());
+ }
+ return array;
+ }
+
+ public static final long[] primes=fetchPrimes();
+
+}
diff --git a/current/kmer/TableLoaderLockFree.java b/current/kmer/TableLoaderLockFree.java
new file mode 100755
index 0000000..76074df
--- /dev/null
+++ b/current/kmer/TableLoaderLockFree.java
@@ -0,0 +1,823 @@
+package kmer;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import jgi.BBMerge;
+import jgi.Dedupe;
+
+import stream.ConcurrentReadInputStream;
+import stream.Read;
+import align2.IntList;
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 4, 2015
+ *
+ */
+public class TableLoaderLockFree {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Command-line entry point. Loads every argument as a reference file using fixed
+  * settings (k=31, no short kmers, no mutations) and prints load statistics to stderr.
+  * @param args Command line arguments; each is passed to processData as a reference
+  */
+ public static void main(String[] args){
+
+     args=Parser.parseConfig(args);
+     if(Parser.parseHelp(args, true)){
+         assert(false) : "TODO";
+         System.exit(0);
+     }
+
+     final Timer timer=new Timer();
+
+     final AbstractKmerTable[] kmerTables=makeTables(AbstractKmerTable.ARRAY1D, initialSizeDefault, true);
+
+     //Arguments: tables, k, mink, speed, hdist, edist, rcomp, maskMiddle
+     final TableLoaderLockFree loader=new TableLoaderLockFree(kmerTables, 31, 0, 0, 0, 0, true, false);
+     loader.setRefSkip(0);
+     loader.hammingDistance2=0;
+     loader.editDistance2=0;
+     loader.storeMode(SET_IF_NOT_PRESENT);
+
+     //Arguments: refs, literals, keepNames, useRefNames, ecc
+     final long stored=loader.processData(args, null, false, false, false);
+     timer.stop();
+
+     System.err.println("Time: \t"+timer);
+     System.err.println("Return: \t"+stored);
+     System.err.println("refKmers: \t"+loader.refKmers);
+     System.err.println("refBases: \t"+loader.refBases);
+     System.err.println("refReads: \t"+loader.refReads);
+ }
+
+ /**
+  * Creates a loader with default settings: mink, speed, hdist and edist of 0,
+  * reverse-complement handling on, and no middle masking.
+  * @param tables_ Destination tables
+  * @param k_ Kmer length
+  */
+ public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_){
+     this(tables_, k_, 0, 0, 0, 0, true, false);
+ }
+
+ /**
+  * @param tables_ Destination tables
+  * @param k_ Kmer length
+  * @param mink_ Shortest kmer length; short kmers are enabled only when 0<mink_<k_
+  * @param speed_ Stored in 'speed' (documented on the field as a fraction of kmers to skip)
+  * @param hdist_ Stored in 'hammingDistance'
+  * @param edist_ Stored in 'editDistance'
+  * @param rcomp_ Whether to canonicalize a kmer with its reverse-complement
+  * @param maskMiddle_ Whether to mask out the middle base of each kmer
+  */
+ public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_, int mink_, int speed_, int hdist_, int edist_, boolean rcomp_, boolean maskMiddle_){
+     tables=tables_;
+     k=k_;
+     k2=k-1;
+     mink=mink_;
+     rcomp=rcomp_;
+     useShortKmers=(mink>0 && mink<k);
+     speed=speed_;
+     hammingDistance=hdist_;
+     editDistance=edist_;
+     //Use the constructor argument; the 'maskMiddle' field is a constant false,
+     //so reading it here made the mask always -1 and the argument had no effect.
+     middleMask=maskMiddle_ ? ~(3L<<(2*(k/2))) : -1L;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /** Delegates to AbstractKmerTable.preallocate, passing WAYS as the first argument. */
+ public static AbstractKmerTable[] makeTables(int tableType, int initialSize, boolean growable){
+     final AbstractKmerTable[] made=AbstractKmerTable.preallocate(WAYS, tableType, initialSize, growable);
+     return made;
+ }
+
+
+ /**
+  * Resets the name-tracking state, loads all kmers from the given references and
+  * literals, and optionally renames scaffolds after their reference file.
+  * @param ref Reference file names; may be null
+  * @param literal Literal sequences; may be null
+  * @param keepNames If true, record scaffold names, lengths, and per-reference scaffold counts
+  * @param useRefNames If true (and keepNames is true), replace scaffold names with reference names after loading
+  * @param ecc_ Stored in 'ecc'
+  * @return Number of kmers stored, as reported by spawnLoadThreads
+  */
+ public long processData(String[] ref, String[] literal, boolean keepNames, boolean useRefNames, boolean ecc_){
+
+     scaffoldNames=null;
+     refNames=null;
+     refScafCounts=null;
+     scaffoldLengths=null;
+     ecc=ecc_;
+
+     if(keepNames){
+         scaffoldNames=new ArrayList<String>();
+         refNames=new ArrayList<String>();
+         scaffoldLengths=new IntList();
+
+         //Index 0 is reserved, so that the first real scaffold gets an id of 1
+         //(as documented on scaffoldNames and assumed by toRefNames).
+         scaffoldNames.add("");
+         scaffoldLengths.add(0);
+
+         if(ref!=null){
+             for(String s : ref){refNames.add(s);}
+         }
+         if(literal!=null){refNames.add("literal");}
+         refScafCounts=new int[refNames.size()];
+     }
+
+     final long added=spawnLoadThreads(ref, literal);
+
+     //Renaming must happen after loading; before that, refScafCounts is all
+     //zeros and scaffoldNames has no scaffolds, so toRefNames would do nothing.
+     if(keepNames && useRefNames){toRefNames();}
+
+     return added;
+ }
+
+ /** Sets both the minimum and maximum reference skip to x. */
+ public void setRefSkip(int x){
+     setRefSkip(x, x);
+ }
+
+ /**
+  * Sets the reference skip range. A max below min is raised to min.
+  * variableRefSkip becomes true only when the two bounds differ.
+  */
+ public void setRefSkip(int min, int max){
+     minRefSkip=min;
+     maxRefSkip=Tools.max(min, max);
+     variableRefSkip=(minRefSkip!=maxRefSkip);
+ }
+
+ /** Sets the store mode; x is asserted to be SET_IF_NOT_PRESENT, SET_ALWAYS, or INCREMENT. */
+ public void storeMode(final int x){
+     final boolean known=(x==SET_IF_NOT_PRESENT || x==SET_ALWAYS || x==INCREMENT);
+     assert(known);
+     storeMode=x;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Fills tables with kmers from references, using multiple LoadThread.
+ * @return Number of kmers stored.
+ */
+ private long spawnLoadThreads(String[] ref, String[] literal){
+ Timer t=new Timer();
+ if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;}
+ long added=0;
+
+ /* Create load threads */
+ LoadThread[] loaders=new LoadThread[WAYS];
+ for(int i=0; i<loaders.length; i++){
+ loaders[i]=new LoadThread(i);
+ loaders[i].start();
+ }
+
+ /* For each reference file... */
+ int refNum=0;
+ if(ref!=null){
+ for(String refname : ref){
+
+ /* Start an input stream */
+ FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true);
+ ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true);
+ cris.start(); //4567
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ /* Iterate through read lists from the input stream */
+ while(reads!=null && reads.size()>0){
+ {
+ /* Assign a unique ID number to each scaffold */
+ ArrayList<Read> reads2=new ArrayList<Read>(reads);
+ if(scaffoldNames!=null){
+ for(Read r1 : reads2){
+ final Read r2=r1.mate;
+ final Integer id=scaffoldNames.size();
+ if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);}
+ refScafCounts[refNum]++;
+ scaffoldNames.add(r1.id==null ? id.toString() : r1.id);
+ int len=r1.length();
+ r1.obj=id;
+ if(r2!=null){
+ r2.obj=id;
+ len+=r2.length();
+ }
+ scaffoldLengths.add(len);
+ }
+ }
+
+ if(REPLICATE_AMBIGUOUS){
+ reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink));
+ }
+
+ /* Send a pointer to the read list to each LoadThread */
+ for(LoadThread lt : loaders){
+ boolean b=true;
+ while(b){
+ try {
+ lt.queue.put(reads2);
+ b=false;
+ } catch (InterruptedException e) {
+ //TODO: This will hang due to still-running threads.
+ throw new RuntimeException(e);
+ }
+ }
+ }
+ }
+
+ /* Dispose of the old list and fetch a new one */
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ /* Cleanup */
+ cris.returnList(ln.id, ln.list.isEmpty());
+ errorState|=ReadWrite.closeStream(cris);
+ refNum++;
+ }
+ }
+
+ /* If there are literal sequences to use as references */
+ if(literal!=null){
+ ArrayList<Read> list=new ArrayList<Read>(literal.length);
+ if(verbose){System.err.println("Adding literals "+Arrays.toString(literal));}
+
+ /* Assign a unique ID number to each literal sequence */
+ for(int i=0; i<literal.length; i++){
+ if(scaffoldNames!=null){
+ final Integer id=scaffoldNames.size();
+ final Read r=new Read(literal[i].getBytes(), null, id);
+ refScafCounts[refNum]++;
+ scaffoldNames.add(id.toString());
+ scaffoldLengths.add(r.length());
+ r.obj=id;
+ }else{
+ final Read r=new Read(literal[i].getBytes(), null, i);
+ list.add(r);
+ }
+ }
+
+ if(REPLICATE_AMBIGUOUS){
+ list=Tools.replicateAmbiguous(list, Tools.min(k, mink));
+ }
+
+ /* Send a pointer to the read list to each LoadThread */
+ for(LoadThread lt : loaders){
+ boolean b=true;
+ while(b){
+ try {
+ lt.queue.put(list);
+ b=false;
+ } catch (InterruptedException e) {
+ //TODO: This will hang due to still-running threads.
+ throw new RuntimeException(e);
+ }
+ }
+ }
+ }
+
+ /* Signal loaders to terminate */
+ for(LoadThread lt : loaders){
+ boolean b=true;
+ while(b){
+ try {
+ lt.queue.put(POISON);
+ b=false;
+ } catch (InterruptedException e) {
+ //TODO: This will hang due to still-running threads.
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ /* Wait for loaders to die, and gather statistics */
+ for(LoadThread lt : loaders){
+ while(lt.getState()!=Thread.State.TERMINATED){
+ try {
+ lt.join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ added+=lt.addedT;
+ refKmers+=lt.refKmersT;
+ refBases+=lt.refBasesT;
+ refReads+=lt.refReadsT;
+ }
+ //Correct statistics for number of threads, since each thread processes all reference data
+ refKmers/=WAYS;
+ refBases/=WAYS;
+ refReads/=WAYS;
+
+ t.stop();
+ if(DISPLAY_PROGRESS){
+ outstream.println("Added "+added+" kmers; time: \t"+t);
+ Shared.printMemory();
+ outstream.println();
+ }
+
+ if(verbose){
+ TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT);
+ tsw.start();
+ for(AbstractKmerTable table : tables){
+ table.dumpKmersAsText(tsw, k, 1);
+ }
+ tsw.poisonAndWait();
+ }
+
+ return added;
+ }
+
+
+
+ /**
+ * Fills the scaffold names array with reference names.
+ */
+ public void toRefNames(){
+ final int numRefs=refNames.size();
+ for(int r=0, s=1; r<numRefs; r++){
+ final int scafs=refScafCounts[r];
+ final int lim=s+scafs;
+ final String name=ReadWrite.stripToCore(refNames.get(r));
+// System.err.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name);
+ while(s<lim){
+// System.err.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name);
+ scaffoldNames.set(s, name);
+ s++;
+ }
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum.
+ */
+ private class LoadThread extends Thread{
+
+ /** Binds this thread to table number tnum_ of the enclosing loader's tables. */
+ public LoadThread(final int tnum_){
+     this.tnum=tnum_;
+     this.map=tables[tnum_];
+ }
+
+ /**
+  * Blocks until a list of reads (or scaffolds) is available on the queue.
+  * An interrupt while waiting is logged and the wait is retried.
+  * @return List of reads
+  */
+ private ArrayList<Read> fetch(){
+     ArrayList<Read> taken=null;
+     do{
+         try {
+             taken=queue.take();
+         } catch (InterruptedException e) {
+             // TODO Auto-generated catch block
+             e.printStackTrace();
+         }
+     }while(taken==null);
+     return taken;
+ }
+
+ /**
+  * Consumes read lists from the queue until POISON arrives, adding kmers from each
+  * read and its mate, then rebalances the table if it permits that and holds more
+  * than twice its array length.
+  */
+ @Override
+ public void run(){
+     for(ArrayList<Read> batch=fetch(); batch!=POISON; batch=fetch()){
+         for(Read r1 : batch){
+             assert(r1.pairnum()==0);
+             final Read r2=r1.mate;
+
+             addedT+=addToMap(r1, minRefSkip);
+             if(r2!=null){
+                 addedT+=addToMap(r2, minRefSkip);
+             }
+         }
+     }
+
+     final boolean shouldRebalance=(map.canRebalance() && map.size()>2L*map.arrayLength());
+     if(shouldRebalance){
+         map.rebalance();
+     }
+ }
+
+ /**
+ * Scans a read with a rolling forward kmer and reverse-complement kmer, and offers
+ * each full-length kmer (and, when short kmers are enabled, the shortened kmers at
+ * both read ends) to the table via the long-based addToMap overload.
+ * Also updates this thread's refReadsT, refBasesT and refKmersT counters.
+ * @param r The current read to process
+ * @param skip Number of bases to skip between kmers; ignored and recomputed from the
+ * read length when variableRefSkip is set
+ * @return Number of kmers stored
+ */
+ private long addToMap(Read r, int skip){
+ if(variableRefSkip){
+ // Longer reads get a larger skip; the result is then passed through Tools.mid
+ // with minRefSkip and maxRefSkip (presumably clamping it to that range; confirm in Tools).
+ int rblen=r.length();
+ skip=(rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0);
+ skip=Tools.mid(minRefSkip, maxRefSkip, skip);
+ }
+ final byte[] bases=r.bases;
+ final int shift=2*k;
+ final int shift2=shift-2;
+ // Keeps the low 2*k bits, i.e. the most recent k bases at 2 bits each
+ final long mask=~((-1L)<<shift);
+ final long kmask=kMasks[k];
+ long kmer=0;
+ long rkmer=0;
+ long added=0;
+ // Number of consecutive bases seen since the last 'N'
+ int len=0;
+
+ // Reads shorter than k are still counted in the read and base totals
+ if(bases!=null){
+ refReadsT++;
+ refBasesT+=bases.length;
+ }
+ if(bases==null || bases.length<k){return 0;}
+
+ // Reads without an attached id are stored under id 1
+ final int id=(r.obj==null ? 1 : ((Integer)r.obj).intValue());
+
+ if(skip>1){ //Process while skipping some kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ // New base enters the forward kmer at the low end and the reverse kmer at the high end
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ refKmersT++;
+ // Only kmers whose run length is a multiple of skip are stored; the
+ // short-kmer additions below are therefore also subject to this test.
+ if(len%skip==0){
+ // Base following this kmer, or -1 at the end of the read
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
+ added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
+ if(useShortKmers){
+ // i==k2 is the first full kmer of the read; i==bases.length-1 is the last
+ if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
+ if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
+ }
+ }
+ }
+ }
+ }else{ //Process all kmers
+ for(int i=0; i<bases.length; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N'){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=k){
+ refKmersT++;
+ // Base following this kmer, or -1 at the end of the read
+ final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]);
+ final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance);
+ added+=atm;
+// assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask));
+ if(useShortKmers){
+ // i==k2 is the first full kmer of the read; i==bases.length-1 is the last
+ if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);}
+ if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);}
+ }
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Adds progressively shorter kmers, of lengths k-1 down to mink, obtained by
+ * dropping the leftmost (highest-order) base of the forward kmer at each step.
+ * addToMap(Read, int) calls this for the last kmer of a read, so these are the
+ * short kmers at the read's trailing end.
+ * Uses hammingDistance2 and editDistance2 rather than the full-length settings.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param extraBase Base added to end in case of deletions
+ * @param id Scaffold number
+ * @return Number of kmers stored
+ */
+ private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){
+ if(verbose){System.err.println("addToMapLeftShift");}
+ long added=0;
+ for(int i=k-1; i>=mink; i--){
+ // Keep only the low 2*i bits of the forward kmer; shift the reverse kmer to match
+ kmer=kmer&rightMasks[i];
+ rkmer=rkmer>>>2;
+ long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2);
+ added+=x;
+ if(verbose){
+ if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){
+ System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
+ System.err.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i));
+ final long value=toValue(kmer, rkmer, kMasks[i]);
+ if(map.contains(value)){System.err.println("Found "+value);}
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+ * Adds progressively shorter kmers, of lengths k-1 down to mink, obtained by
+ * dropping the rightmost (lowest-order) base of the forward kmer at each step;
+ * the dropped base is passed along as the extra base.
+ * addToMap(Read, int) calls this for the first kmer of a read, so these are the
+ * short kmers at the read's leading end.
+ * Uses hammingDistance2 and editDistance2 rather than the full-length settings.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param id Scaffold number
+ * @return Number of kmers stored
+ */
+ private long addToMapRightShift(long kmer, long rkmer, final int id){
+ if(verbose){System.err.println("addToMapRightShift");}
+ long added=0;
+ for(int i=k-1; i>=mink; i--){
+ // The base about to be shifted out of the forward kmer
+ long extraBase=kmer&3L;
+ kmer=kmer>>>2;
+ // Keep only the low 2*i bits of the reverse kmer to match
+ rkmer=rkmer&rightMasks[i];
+// assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i];
+// assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i];
+ long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2);
+ added+=x;
+ if(verbose){
+ if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){
+ System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]);
+ System.err.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i));
+ final long value=toValue(kmer, rkmer, kMasks[i]);
+ if(map.contains(value)){System.err.println("Found "+value);}
+ }
+ }
+ }
+ return added;
+ }
+
+
+ /**
+  * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance.
+  * With hdist of 0 the kmer is stored directly (edist is not consulted), subject to the
+  * 'speed' subsampling test and only if it belongs to this thread's table.
+  * Otherwise the work is delegated to mutate, using edist when positive and hdist when not.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param len Kmer length
+  * @param extraBase Base added to end in case of deletions
+  * @param id Scaffold number
+  * @param kmask0 Length marker for this kmer length; asserted equal to kMasks[len]
+  * @param hdist Number of substitutions to allow
+  * @param edist Number of edits to allow
+  * @return Number of kmers stored
+  */
+ private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){
+
+     assert(kmask0==kMasks[len]) : kmask0+", "+len+", "+kMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(kMasks[len]);
+
+     if(verbose){System.err.println("addToMap_A; len="+len+"; kMasks[len]="+kMasks[len]);}
+     assert((kmer&kmask0)==0);
+     final long added;
+     if(hdist==0){
+         final long key=toValue(kmer, rkmer, kmask0);
+         if(speed>0 && ((key/WAYS)&15)<speed){return 0;}
+         if(key%WAYS!=tnum){return 0;}
+         if(verbose){System.err.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);}
+         if(storeMode==SET_IF_NOT_PRESENT){
+             added=map.setIfNotPresent(key, id);
+         }else if(storeMode==SET_ALWAYS){
+             added=map.set(key, id);
+         }else{
+             assert(storeMode==INCREMENT);
+             //Count a key only when its incremented value is 1, matching how mutate
+             //treats the result; the raw increment result is not a number of keys stored.
+             added=(map.increment(key)==1 ? 1 : 0);
+         }
+     }else if(edist>0){
+         added=mutate(kmer, rkmer, len, id, edist, extraBase);
+     }else{
+         added=mutate(kmer, rkmer, len, id, hdist, -1);
+     }
+     if(verbose){System.err.println("addToMap added "+added+" keys.");}
+     return added;
+ }
+
+ /**
+ * Mutate and store this kmer through 'dist' recursions.
+ * The kmer itself is stored first (if it belongs to this thread's table); then, while
+ * dist is positive, every single-base substitution is generated and recursed on, and,
+ * when the loader's editDistance field is positive, every single-base deletion and insertion too.
+ * Unlike the direct path in addToMap, no 'speed' subsampling test is applied here.
+ * @param kmer Forward kmer
+ * @param rkmer Reverse kmer
+ * @param len Kmer length
+ * @param id Scaffold number
+ * @param dist Number of mutations
+ * @param extraBase Base added to end in case of deletions
+ * @return Number of kmers stored
+ */
+ private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){
+ long added=0;
+
+ final long key=toValue(kmer, rkmer, kMasks[len]);
+
+ if(verbose){System.err.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+kMasks[len]);}
+ // Each thread stores only the keys assigned to its own table
+ if(key%WAYS==tnum){
+ if(verbose){System.err.println("mutate_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);}
+ int x;
+ if(storeMode==SET_IF_NOT_PRESENT){
+ x=map.setIfNotPresent(key, id);
+ }else if(storeMode==SET_ALWAYS){
+ x=map.set(key, id);
+ }else{
+ assert(storeMode==INCREMENT);
+ x=map.increment(key);
+ // Count the key only when the incremented value is 1
+ x=(x==1 ? 1 : 0);
+ }
+ if(verbose){System.err.println("mutate_B added "+x+" keys.");}
+ added+=x;
+ assert(map.contains(key));
+ }
+
+ if(dist>0){
+ final int dist2=dist-1;
+
+ //Sub
+ // For each base value j and position i, replace the base at position i with j
+ for(int j=0; j<4; j++){
+ for(int i=0; i<len; i++){
+ final long temp=(kmer&clearMasks[i])|setMasks[j][i];
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, extraBase);
+ }
+ }
+ }
+
+ // NOTE(review): this tests the loader-wide editDistance field, not a value passed in by the
+ // caller, so it also applies to short kmers added with editDistance2 - confirm this is intended.
+ if(editDistance>0){
+ //Del
+ // Only possible when a valid following base (0-3) is available to fill the freed slot
+ if(extraBase>=0 && extraBase<=3){
+ for(int i=1; i<len; i++){
+ // Keep the bases at positions i and above, shift the lower bases up by one, append extraBase
+ final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase;
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, -1);
+ }
+ }
+ }
+
+ //Ins
+ // The lowest base is pushed out by an insertion; it becomes the extra base for deeper recursion
+ final long eb2=kmer&3;
+ for(int i=1; i<len; i++){
+ // Keep the bases at positions i and above, shift the lower bases down by one, leaving position i-1 empty
+ final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2);
+ for(int j=0; j<4; j++){
+ final long temp=temp0|setMasks[j][i-1];
+ if(temp!=kmer){
+ long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len);
+ added+=mutate(temp, rtemp, len, id, dist2, eb2);
+ }
+ }
+ }
+ }
+
+ }
+
+ return added;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of kmers stored by this thread */
+ public long addedT=0;
+ /** Number of items encountered by this thread */
+ public long refKmersT=0, refReadsT=0, refBasesT=0;
+ /** Thread number; used to determine which kmers to store */
+ public final int tnum;
+ /** Buffer of input read lists */
+ public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32);
+
+ /** Destination for storing kmers */
+ private final AbstractKmerTable map;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Converts a kmer into the value used as its table key: the larger of the forward and
+  * reverse kmers when rcomp is set (otherwise the forward kmer), ANDed with middleMask
+  * and ORed with the length marker. Expected to be inlined.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param lengthMask Bitmask with single '1' set to left of kmer
+  * @return Canonical value
+  */
+ private final long toValue(long kmer, long rkmer, long lengthMask){
+     assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+     final long canonical;
+     if(rcomp){
+         canonical=Tools.max(kmer, rkmer);
+     }else{
+         canonical=kmer;
+     }
+     return lengthMask|(canonical&middleMask);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Has this class encountered errors while processing? */
+ public boolean errorState=false;
+
+ /** How to associate values with kmers */
+ private int storeMode=SET_IF_NOT_PRESENT;
+
+ /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in tables[Y] */
+ public AbstractKmerTable[] tables;
+ /** A scaffold's name is stored at scaffoldNames.get(id).
+ * scaffoldNames[0] is reserved, so the first id is 1. */
+ public ArrayList<String> scaffoldNames;
+ /** Names of reference files (refNames[0] is valid). */
+ public ArrayList<String> refNames;
+ /** Number of scaffolds per reference. */
+ public int[] refScafCounts;
+ /** scaffoldLengths[id] stores the length of that scaffold */
+ public IntList scaffoldLengths=new IntList();
+
+ /** Make the middle base in a kmer a wildcard to improve sensitivity */
+ public final boolean maskMiddle=false;
+ /** Correct errors via read overlap */
+ public boolean ecc=false;
+
+ /** Store reference kmers with up to this many substitutions */
+ public final int hammingDistance;
+ /** Store reference kmers with up to this many edits (including indels) */
+ public final int editDistance;
+ /** Store short reference kmers with up to this many substitutions */
+ public int hammingDistance2=-1;
+ /** Store short reference kmers with up to this many edits (including indels) */
+ public int editDistance2=-1;
+ /** Always skip at least this many consecutive kmers when hashing reference.
+ * 1 means every kmer is used, 2 means every other, etc. */
+ private int minRefSkip=0;
+ /** Never skip more than this many consecutive kmers when hashing reference. */
+ private int maxRefSkip=0;
+
+ private boolean variableRefSkip=false;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Statistics ----------------*/
+ /*--------------------------------------------------------------*/
+
+ long refReads=0;
+ long refBases=0;
+ long refKmers=0;
+
+ long storedKmers=0;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Look for reverse-complements as well as forward kmers. Default: true */
+ private final boolean rcomp;
+ /** AND bitmask with 0's at the middle base */
+ private final long middleMask;
+
+ /** Normal kmer length */
+ private final int k;
+ /** k-1; used in some expressions */
+ private final int k2;
+ /** Shortest kmer to use for trimming */
+ private final int mink;
+ /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */
+ private final boolean useShortKmers;
+
+ /** Fraction of kmers to skip, 0 to 15 out of 16 */
+ private final int speed;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Default initial size of data structures */
+ private static final int initialSizeDefault=128000;
+
+ /** Number of tables (and threads, during loading) */
+ private static final int WAYS=7; //123
+ /** Verbose messages */
+ public static final boolean verbose=false; //123
+
+ /** Print messages to this stream */
+ private static PrintStream outstream=System.err;
+ /** Display progress messages such as memory usage */
+ public static boolean DISPLAY_PROGRESS=true;
+ /** Indicates end of input stream */
+ private static final ArrayList<Read> POISON=new ArrayList<Read>(0);
+ /** Make unambiguous copies of ref sequences with ambiguous bases */
+ public static boolean REPLICATE_AMBIGUOUS=false;
+
+ /** x&clearMasks[i] will clear base i */
+ private static final long[] clearMasks;
+ /** x|setMasks[j][i] will set base i to j, provided base i has been cleared first */
+ private static final long[][] setMasks;
+ /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
+ private static final long[] leftMasks;
+ /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
+ private static final long[] rightMasks;
+ /** x|kMasks[i] will set the bit to the left of the leftmost base */
+ private static final long[] kMasks;
+
+ public static final int SET_IF_NOT_PRESENT=1, SET_ALWAYS=2, INCREMENT=3;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Initializers ----------------*/
+ /*--------------------------------------------------------------*/
+
+ static{
+ clearMasks=new long[32];
+ leftMasks=new long[32];
+ rightMasks=new long[32];
+ kMasks=new long[32];
+ setMasks=new long[4][32];
+ for(int i=0; i<32; i++){
+ clearMasks[i]=~(3L<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ leftMasks[i]=((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ rightMasks[i]=~((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ kMasks[i]=((1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ for(long j=0; j<4; j++){
+ setMasks[(int)j][i]=(j<<(2*i));
+ }
+ }
+ }
+
+}
diff --git a/current/kmer/TableReader.java b/current/kmer/TableReader.java
new file mode 100755
index 0000000..aeda14a
--- /dev/null
+++ b/current/kmer/TableReader.java
@@ -0,0 +1,645 @@
+package kmer;
+
+import java.io.PrintStream;
+import java.util.BitSet;
+
+import jgi.Dedupe;
+import stream.Read;
+import align2.IntList;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 5, 2015
+ *
+ */
+public class TableReader {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Code entrance from the command line.
+  * Test harness: loads the kmers of the given references into a fresh set of tables,
+  * prints load statistics to stderr, then constructs a TableReader.
+  * @param args Command line arguments; after config parsing they are passed to the loader as the reference list
+  */
+ public static void main(String[] args){
+
+ 	args=Parser.parseConfig(args);
+ 	if(Parser.parseHelp(args, true)){
+ 		assert(false) : "TODO"; //No usage text has been written yet
+ 		System.exit(0);
+ 	}
+
+ 	final Timer timer=new Timer();
+
+ 	final AbstractKmerTable[] tables=TableLoaderLockFree.makeTables(AbstractKmerTable.ARRAY1D, 128000, true);
+
+ 	//Fixed settings for this harness
+ 	final int k=31, mink=0, speed=0;
+ 	final int hdist=0, edist=0;
+ 	final boolean rcomp=true, maskMiddle=false;
+
+ 	//Create and configure the loader
+ 	final TableLoaderLockFree loader=new TableLoaderLockFree(tables, k, mink, speed, hdist, edist, rcomp, maskMiddle);
+ 	loader.setRefSkip(0);
+ 	loader.hammingDistance2=0;
+ 	loader.editDistance2=0;
+ 	loader.storeMode(TableLoaderLockFree.SET_IF_NOT_PRESENT);
+
+ 	//Run the load
+ 	final String[] refs=args;
+ 	final String[] literals=null;
+ 	final boolean keepNames=false, useRefNames=false;
+ 	final long kmers=loader.processData(refs, literals, keepNames, useRefNames, false);
+ 	timer.stop();
+
+ 	//Report
+ 	System.err.println("Load Time:\t"+timer);
+ 	System.err.println("Return: \t"+kmers);
+ 	System.err.println("refKmers: \t"+loader.refKmers);
+ 	System.err.println("refBases: \t"+loader.refBases);
+ 	System.err.println("refReads: \t"+loader.refReads);
+
+ 	final int qskip=0, qhdist=0;
+ 	TableReader tr=new TableReader(k, mink, speed, qskip, qhdist, rcomp, maskMiddle);
+
+ 	//TODO: Stuff...
+ }
+
+ /**
+ * Convenience constructor: kmer length k_ with mink, speed, qskip and qhdist all 0,
+ * rcomp enabled and maskMiddle disabled.
+ * @param k_ Normal kmer length
+ */
+ public TableReader(int k_){
+ this(k_, 0, 0, 0, 0, true, false);
+ }
+
+ /**
+  * Full constructor.
+  * @param k_ Normal kmer length
+  * @param mink_ Shortest kmer length; short kmers are enabled only when 0&lt;mink_&lt;k_
+  * @param speed_ Acceleration level; values below 1 leave it disabled
+  * @param qskip_ Query skip; values below 2 leave it disabled
+  * @param qhdist_ Hamming distance allowed for full-length query kmers
+  * @param rcomp_ Also consider the reverse-complement of each kmer
+  * @param maskMiddle_ NOTE(review): never read. middleMask is derived from the maskMiddle field,
+  * which is declared final and false, so this argument currently has no effect - confirm intent
+  */
+ public TableReader(int k_, int mink_, int speed_, int qskip_, int qhdist_, boolean rcomp_, boolean maskMiddle_){
+ 	//Kmer geometry
+ 	k=k_;
+ 	k2=k_-1;
+ 	mink=mink_;
+ 	useShortKmers=(mink_>0 && mink_<k_);
+
+ 	//Query behavior
+ 	rcomp=rcomp_;
+ 	speed=speed_;
+ 	qSkip=qskip_;
+ 	qHammingDistance=qhdist_;
+
+ 	//Driven by the maskMiddle field, not by the maskMiddle_ argument
+ 	if(maskMiddle){
+ 		middleMask=~(3L<<(2*(k_/2)));
+ 	}else{
+ 		middleMask=-1L;
+ 	}
+
+ 	accel=(speed_>=1 || qskip_>=2);
+ 	noAccel=!accel;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+ * Mask a read to cover matching kmers.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of bases masked
+ */
+ public final int kMask(final Read r, final AbstractKmerTable[] sets){
+ if(r==null){return 0;}
+ if(verbose){System.err.println("KMasking read "+r.id);}
+
+ BitSet bs=markBits(r, sets);
+ if(verbose){System.err.println("Null bitset.");}
+ if(bs==null){return 0;}
+
+ final byte[] bases=r.bases, quals=r.quality;
+ final int cardinality=bs.cardinality();
+ assert(cardinality>0);
+
+ //Replace kmer hit zone with the trim symbol
+ for(int i=0; i<bases.length; i++){
+ if(bs.get(i)){
+ if(kmaskLowercase){
+ bases[i]=(byte)Character.toLowerCase(bases[i]);
+ }else{
+ bases[i]=trimSymbol;
+ if(quals!=null && trimSymbol=='N'){quals[i]=0;}
+ }
+ }
+ }
+ return cardinality;
+ }
+
+
+ /**
+ * Counts the number of kmer hits for a read.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of hits
+ */
+ public final int countKmerHits(final Read r, final AbstractKmerTable[] sets){
+ if(r==null || r.length()<k){return 0;}
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;}
+ final byte[] bases=r.bases;
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, k, qHammingDistance, i, sets);
+ if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);}
+ if(id>0){
+ if(verbose){System.err.println("Found = "+(found+1)+"/"+minHits);}
+ if(found>=minHits){
+ return (found=found+1); //Early exit
+ }
+ found++;
+ }
+ }
+ }
+
+ return found;
+ }
+
+ /**
+  * Returns the id of the sequence with the most kmer matches to this read, or -1 if the read
+  * is null, too short, skipped, or has fewer than minHits kmer hits in total.
+  * Ties are resolved in favor of the id that was encountered first.
+  * countArray is not allocated by this class and is indexed by id.
+  * NOTE(review): presumably the caller sizes it to cover every id in the tables - confirm.
+  * @param r Read to process
+  * @param sets Kmer tables
+  * @return id of best match
+  */
+ public final int findBestMatch(final Read r, final AbstractKmerTable[] sets){
+ 	idList.size=0;
+ 	if(r==null || r.length()<k){return -1;}
+ 	if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return -1;}
+ 	final byte[] bases=r.bases;
+ 	final int minlen=k-1;
+ 	final int minlen2=(maskMiddle ? k/2 : k);
+ 	final int shift=2*k;
+ 	final int shift2=shift-2;
+ 	final long mask=~((-1L)<<shift);
+ 	long kmer=0;
+ 	long rkmer=0;
+ 	int len=0;
+ 	int found=0;
+
+ 	final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ 	final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ 	/* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ 	for(int i=start; i<stop; i++){
+ 		byte b=bases[i];
+ 		long x=Dedupe.baseToNumber[b];
+ 		long x2=Dedupe.baseToComplementNumber[b];
+ 		kmer=((kmer<<2)|x)&mask;
+ 		rkmer=(rkmer>>>2)|(x2<<shift2);
+ 		if(b=='N' && forbidNs){len=0;}else{len++;}
+ 		if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ 		if(len>=minlen2 && i>=minlen){
+ 			final int id=getValue(kmer, rkmer, k, qHammingDistance, i, sets);
+ 			if(id>0){
+ 				countArray[id]++;
+ 				if(countArray[id]==1){idList.add(id);} //First sighting of this id in this read
+ 				found++;
+ 				if(verbose){System.err.println("Found = "+found+"/"+minHits);}
+ 			}
+ 		}
+ 	}
+
+ 	//Always condense, even when the read fails minHits: condenseLoose zeroes every countArray
+ 	//slot touched above. Skipping it would leave stale counts behind, and on later reads those
+ 	//ids would never reach a count of exactly 1, so they would never be added to idList.
+ 	final int max=condenseLoose(countArray, idList, countList);
+ 	if(found<minHits){return -1;}
+
+ 	//First id whose count equals the maximum
+ 	for(int i=0; i<countList.size; i++){
+ 		if(countList.get(i)==max){return idList.get(i);}
+ 	}
+ 	return -1;
+ }
+
+
+ /**
+ * Mask a read to cover matching kmers.
+ * @param r Read to process
+ * @param sets Kmer tables
+ * @return Number of bases masked
+ */
+ public final BitSet markBits(final Read r, final AbstractKmerTable[] sets){
+ if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k))){
+ if(verbose){System.err.println("Read too short.");}
+ return null;
+ }
+ if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){
+ if(verbose){System.err.println("Skipping read.");}
+ return null;
+ }
+ if(verbose){System.err.println("Marking bitset for read "+r.id);}
+ final byte[] bases=r.bases;
+ final int minlen=k-1;
+ final int minlen2=(maskMiddle ? k/2 : k);
+ final int shift=2*k;
+ final int shift2=shift-2;
+ final long mask=~((-1L)<<shift);
+ long kmer=0;
+ long rkmer=0;
+ int found=0;
+ int len=0;
+ int id0=-1; //ID of first kmer found.
+
+ BitSet bs=new BitSet(bases.length+trimPad+1);
+
+ final int minus=k-1-trimPad;
+ final int plus=trimPad+1;
+
+ final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight));
+ final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft));
+
+ //Scan for normal kmers
+ for(int i=start; i<stop; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=(rkmer>>>2)|(x2<<shift2);
+ if(b=='N' && forbidNs){len=0;}else{len++;}
+ if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=minlen2 && i>=minlen){
+ final int id=getValue(kmer, rkmer, k, qHammingDistance, i, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("a: Found "+kmer);
+ System.err.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus));
+ System.err.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k);
+ }
+ bs.set(Tools.max(0, i-minus), i+plus);
+ found++;
+ }
+ }
+ }
+
+ //If nothing was found, scan for short kmers.
+ if(useShortKmers){
+ assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink;
+
+ //Look for short kmers on left side
+ {
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.min(k, stop);
+ for(int i=start; i<lim; i++){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=((kmer<<2)|x)&mask;
+ rkmer=rkmer|(x2<<(2*len));
+ len++;
+ if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+
+ if(verbose){
+ System.err.println("Looking for left kmer "+AminoAcid.kmerToString(kmer, len));
+ System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len));
+ }
+ final int id=getValue(kmer, rkmer, len, qHammingDistance2, i, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("b: Found "+kmer);
+ System.err.println("Setting "+0+", "+(i+plus));
+ }
+ bs.set(0, i+plus);
+ found++;
+ }
+ }
+ }
+ }
+
+ //Look for short kmers on right side
+ {
+ kmer=0;
+ rkmer=0;
+ len=0;
+ final int lim=Tools.max(-1, stop-k);
+ for(int i=stop-1; i>lim; i--){
+ byte b=bases[i];
+ long x=Dedupe.baseToNumber[b];
+ long x2=Dedupe.baseToComplementNumber[b];
+ kmer=kmer|(x<<(2*len));
+ rkmer=((rkmer<<2)|x2)&mask;
+ len++;
+ if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
+ if(len>=mink){
+ if(verbose){
+ System.err.println("Looking for right kmer "+
+ AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]);
+ }
+ final int id=getValue(kmer, rkmer, len, qHammingDistance2, i, sets);
+ if(id>0){
+ if(id0<0){id0=id;}
+ if(verbose){
+ System.err.println("c: Found "+kmer);
+ System.err.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length);
+ }
+ bs.set(Tools.max(0, i-trimPad), bases.length);
+ found++;
+ }
+ }
+ }
+ }
+ }
+
+
+ if(verbose){System.err.println("found="+found+", bitset="+bs);}
+
+ if(found==0){return null;}
+ assert(found>0) : "Overflow in 'found' variable.";
+
+ int cardinality=bs.cardinality();
+ assert(cardinality>0);
+
+ return bs;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+ /**
+  * Position-aware lookup: applies the query skip, then delegates to the exact lookup when
+  * qHDist is below 1, or to the Hamming-distance lookup otherwise.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param len kmer length
+  * @param qHDist Hamming distance
+  * @param qPos Position of kmer in query
+  * @param sets Kmer hash tables
+  * @return Value stored in table, or -1
+  */
+ public final int getValue(final long kmer, final long rkmer, final int len, final int qHDist, final int qPos, final AbstractKmerTable[] sets){
+ 	//With qSkip>1, only query positions divisible by qSkip are examined
+ 	final boolean skipped=(qSkip>1 && (qPos%qSkip!=0));
+ 	if(skipped){return -1;}
+
+ 	if(qHDist<1){
+ 		return getValue(kmer, rkmer, len, sets);
+ 	}
+ 	return getValue(kmer, rkmer, len, qHDist, sets);
+ }
+
+ /**
+  * Looks up a kmer, and if it is absent, recursively tries every single-base substitution
+  * until the allowed Hamming distance is used up.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param len kmer length
+  * @param qHDist Hamming distance
+  * @param sets Kmer hash tables
+  * @return Value stored in table for the first match found, otherwise the result of the last lookup attempted
+  */
+ public final int getValue(final long kmer, final long rkmer, final int len, final int qHDist, final AbstractKmerTable[] sets){
+ 	int id=getValue(kmer, rkmer, len, sets);
+ 	if(id>=1 || qHDist<=0){return id;}
+
+ 	final int remaining=qHDist-1;
+
+ 	//Sub: for each replacement base, try it at every position
+ 	for(int base=0; base<4; base++){
+ 		for(int pos=0; pos<len; pos++){
+ 			final long mutant=(kmer&clearMasks[pos])|setMasks[base][pos];
+ 			if(mutant==kmer){continue;} //Same base as the original; nothing new to test
+ 			final long rmutant=AminoAcid.reverseComplementBinaryFast(mutant, len);
+ 			id=getValue(mutant, rmutant, len, remaining, sets);
+ 			if(id>=1){return id;}
+ 		}
+ 	}
+ 	return id;
+ }
+
+ /**
+  * Exact lookup of a kmer of the given length, using the length marker lengthMasks[len].
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param len kmer length
+  * @param sets Kmer hash tables
+  * @return Value stored in table
+  */
+ public final int getValue(final long kmer, final long rkmer, final int len, final AbstractKmerTable[] sets){
+ 	final long lengthMask=lengthMasks[len];
+ 	return getValueWithMask(kmer, rkmer, lengthMask, sets);
+ }
+
+ /**
+  * Builds the canonical key for a kmer and looks it up in the table selected by key%WAYS.
+  * When acceleration is on, keys with ((key/WAYS)&amp;15) below speed are skipped.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param lengthMask Bitmask with single '1' set to left of kmer
+  * @param sets Kmer hash tables
+  * @return Value stored in table, or -1 if the key was skipped
+  */
+ public final int getValueWithMask(final long kmer, final long rkmer, final long lengthMask, final AbstractKmerTable[] sets){
+ 	assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+
+ 	//Canonical form: the larger of forward and reverse when rcomp is on
+ 	final long canonical;
+ 	if(rcomp){
+ 		canonical=Tools.max(kmer, rkmer);
+ 	}else{
+ 		canonical=kmer;
+ 	}
+ 	final long key=(canonical&middleMask)|lengthMask;
+
+ 	if(accel && ((key/WAYS)&15)<speed){return -1;}
+
+ 	if(verbose){System.err.println("Testing key "+key);}
+ 	final AbstractKmerTable table=sets[(int)(key%WAYS)];
+ 	return table.getValue(key);
+ }
+
+
+ /**
+  * Computes the canonical table key for a kmer without performing a lookup.
+  * @param kmer Forward kmer
+  * @param rkmer Reverse kmer
+  * @param lengthMask Bitmask with single '1' set to left of kmer
+  * @return Canonical value
+  */
+ private final long toValue(long kmer, long rkmer, long lengthMask){
+ 	assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer;
+ 	final long canonical;
+ 	if(rcomp){
+ 		canonical=Tools.max(kmer, rkmer);
+ 	}else{
+ 		canonical=kmer;
+ 	}
+ 	return (canonical&middleMask)|lengthMask;
+ }
+
+ /**
+  * Pack a list of counts from an array to an IntList, zeroing each array slot that is read.
+  * @param loose Counter array, indexed by the values in packed
+  * @param packed Unique values
+  * @param counts Counts of values; cleared, then filled in the same order as packed
+  * @return The largest count seen, or 0 if packed is empty
+  */
+ public static int condenseLoose(int[] loose, IntList packed, IntList counts){
+ 	counts.size=0;
+
+ 	int best=0;
+ 	final int n=packed.size;
+ 	for(int idx=0; idx<n; idx++){
+ 		final int value=packed.get(idx);
+ 		final int count=loose[value];
+ 		loose[value]=0; //Reset the slot for the next use
+ 		counts.add(count);
+ 		if(count>best){best=count;}
+ 	}
+ 	return best;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Has this class encountered errors while processing?
+ * No method of this class reads or writes it. */
+ public boolean errorState=false;
+
+ /** Make the middle base in a kmer a wildcard to improve sensitivity.
+ * Declared final and false, so middle-masking is always off in this class. */
+ public final boolean maskMiddle=false;
+
+ /** Search for query kmers with up to this many substitutions */
+ private final int qHammingDistance;
+ /** Search for short query kmers with up to this many substitutions.
+ * Values below 1 (including the default -1) mean exact matching only. */
+ public int qHammingDistance2=-1;
+
+ /** Trim this much extra around matched kmers */
+ public int trimPad=0;
+
+ /** If positive, only look for kmer matches in the leftmost X bases */
+ public int restrictLeft=0;
+ /** If positive, only look for kmer matches in the rightmost X bases */
+ public int restrictRight=0;
+
+ /** Don't allow a read 'N' to match a reference 'A'.
+ * Reduces sensitivity when hdist>0 or edist>0. Default: false. */
+ public boolean forbidNs=false;
+
+ /** Replace bases covered by matched kmers with this symbol (unused when kmaskLowercase is set) */
+ public byte trimSymbol='N';
+
+ /** Convert masked bases to lowercase instead of replacing them with trimSymbol */
+ public boolean kmaskLowercase=false;
+
+ /** Don't look for kmers in read 1 (pairnum 0) */
+ public boolean skipR1=false;
+ /** Don't look for kmers in read 2 (pairnum 1) */
+ public boolean skipR2=false;
+
+ /** A read must contain at least this many kmer hits before being considered a match. Default: 1 */
+ public int minHits=1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Statistics ----------------*/
+ /*--------------------------------------------------------------*/
+
+// public long storedKmers=0;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Per-Thread Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Per-id hit counters used by findBestMatch, indexed by id.
+ * Never allocated in this class; it must be assigned before findBestMatch is called. */
+ public int[] countArray;
+
+ /** Distinct ids hit by the read currently being processed in findBestMatch */
+ private final IntList idList=new IntList();
+ /** Hit counts parallel to idList; filled by condenseLoose */
+ private final IntList countList=new IntList();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Look for reverse-complements as well as forward kmers. Default: true */
+ private final boolean rcomp;
+ /** AND bitmask with 0's at the middle base; -1 (all ones) when middle-masking is off */
+ private final long middleMask;
+
+ /** Normal kmer length */
+ private final int k;
+ /** k-1; used in some expressions */
+ private final int k2;
+ /** Shortest kmer to use for trimming */
+ private final int mink;
+ /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming.
+ * True only when 0&lt;mink&lt;k. */
+ private final boolean useShortKmers;
+
+ /** Fraction of kmers to skip, 0 to 15 out of 16.
+ * A key is skipped when ((key/WAYS)&amp;15) is below this value. */
+ private final int speed;
+
+ /** Skip this many kmers when examining the read. Default 1.
+ * 1 means every kmer is used, 2 means every other, etc.
+ * Values below 2 disable skipping; otherwise only query positions divisible by qSkip are looked up. */
+ private final int qSkip;
+
+ /** noAccel is true if speed and qSkip are disabled (speed&lt;1 and qSkip&lt;2), accel is the opposite. */
+ private final boolean noAccel, accel;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Number of tables (and threads, during loading) */
+ private static final int WAYS=7; //123
+ /** Verbose messages */
+ public static final boolean verbose=false; //123
+
+ /** Print messages to this stream (not referenced by any method in this class) */
+ private static PrintStream outstream=System.err;
+
+ /** x&clearMasks[i] will clear base i */
+ private static final long[] clearMasks;
+ /** x|setMasks[j][i] will set base i to j, provided base i was cleared first */
+ private static final long[][] setMasks;
+ /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
+ private static final long[] leftMasks;
+ /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
+ private static final long[] rightMasks;
+ /** x|lengthMasks[i] will set the bit to the left of the leftmost base of a kmer of length i */
+ private static final long[] lengthMasks;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Initializers ----------------*/
+ /*--------------------------------------------------------------*/
+
+ static{
+ clearMasks=new long[32];
+ leftMasks=new long[32];
+ rightMasks=new long[32];
+ lengthMasks=new long[32];
+ setMasks=new long[4][32];
+ for(int i=0; i<32; i++){
+ clearMasks[i]=~(3L<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ leftMasks[i]=((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ rightMasks[i]=~((-1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ lengthMasks[i]=((1L)<<(2*i));
+ }
+ for(int i=0; i<32; i++){
+ for(long j=0; j<4; j++){
+ setMasks[(int)j][i]=(j<<(2*i));
+ }
+ }
+ }
+
+}
diff --git a/current/pacbio/CalcCoverageFromSites.java b/current/pacbio/CalcCoverageFromSites.java
new file mode 100755
index 0000000..6fa8980
--- /dev/null
+++ b/current/pacbio/CalcCoverageFromSites.java
@@ -0,0 +1,526 @@
+package pacbio;
+
+import java.util.Arrays;
+import java.util.BitSet;
+
+import stream.SiteScoreR;
+import var.GenerateVarlets;
+
+
+import align2.Tools;
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.CoverageArray;
+import dna.CoverageArray2;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 19, 2012
+ *
+ */
+public class CalcCoverageFromSites {
+
+ /**
+  * Command-line entry point.
+  * Positional arguments: args[0] input file, args[1] output pattern, args[2] genome build.
+  * An output of "null" (any case) disables writing; otherwise the pattern must contain '#'
+  * (checked by assertion). Remaining arguments are key=value options; only "mincoverage" is recognized.
+  * @param args Command line arguments
+  */
+ public static void main(String[] args){
+ 	System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ 	Timer t=new Timer();
+ 	String infile=args[0];
+ 	String outfile=args[1];
+ 	if(outfile.equalsIgnoreCase("null")){outfile=null;}
+ 	assert(outfile==null || outfile.contains("#"));
+ 	int genome=Integer.parseInt(args[2]);
+ 	int mincoverage=1;
+ 	for(int i=3; i<args.length; i++){
+ 		final String arg=args[i];
+ 		final String[] split=arg.split("=");
+ 		String a=split[0].toLowerCase();
+ 		//Options without '=' (such as JVM flags like -Xmx1g) have no value part;
+ 		//indexing split[1] unconditionally would throw before the flag check below runs.
+ 		String b=(split.length>1 ? split[1] : null);
+
+ 		if(Parser.isJavaFlag(arg)){
+ 			//jvm argument; do nothing
+ 		}else if(a.equals("mincoverage")){
+ 			mincoverage=Integer.parseInt(b);
+ 		}
+ 	}
+ 	if(outfile==null){
+ 		process(infile, genome, mincoverage);
+ 	}else{
+ 		processAndWrite(infile, genome, mincoverage, outfile);
+ 	}
+ 	t.stop();
+ 	System.out.println("Time: \t"+t);
+ }
+
+
+ /**
+ * Reads site records from a text file, accumulates total and correct coverage per position,
+ * writes one coverage array per chromosome, and prints summary statistics to stdout.
+ * @param fname Input file; each line is converted to sites by toSites
+ * @param genome Genome build passed to Data.setGenome
+ * @param mincoverage Positions whose total coverage is below this are counted as uncovered
+ * @param outpattern Output filename pattern; the first '#' is replaced by the chromosome number
+ */
+ public static void processAndWrite(final String fname, final int genome, final int mincoverage, final String outpattern){
+ Data.setGenome(genome);
+
+ //Tracks which numeric read IDs have been seen
+ BitSet bs=new BitSet();
+
+ //Arrays are indexed by chromosome number starting at 1; slot 0 is left null
+ CoverageArray[] coverage=new CoverageArray[Data.numChroms+1];
+ byte[][] correct=new byte[Data.numChroms+1][];
+
+ for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ coverage[chrom]=new CoverageArray2(chrom, Data.chromLengths[chrom]);
+ correct[chrom]=new byte[Data.chromLengths[chrom]];
+ }
+
+ TextFile tf=new TextFile(fname, true, false);
+ String s=tf.nextLine();
+
+ long totalSites=0;
+ long correctSites=0;
+ long totalSiteLen=0;
+ long correctSiteLen=0;
+
+ //Pass 1: accumulate coverage from every site on every line
+ while(s!=null){
+ SiteScoreR[] sites=toSites(s);
+ for(SiteScoreR ssr : sites){
+
+ //Only IDs that fit in an int are recorded
+ if(bs!=null && ssr.numericID<=Integer.MAX_VALUE){
+ bs.set((int)ssr.numericID);
+ }
+
+ int len=ssr.stop-ssr.start+1;
+ totalSites++;
+ totalSiteLen+=len;
+ if(ssr.correct){
+ correctSites++;
+ correctSiteLen+=len;
+ }
+
+
+ //Adjust both ends by MIN_END_DIST and clamp to [0, chromosome length-1]
+ int chrom=ssr.chrom;
+ int min=Tools.max(ssr.start+MIN_END_DIST, 0);
+ int max=Tools.min(ssr.stop-MIN_END_DIST, Data.chromLengths[chrom]-1);
+
+ CoverageArray ca=coverage[chrom];
+ for(int i=min; i<=max; i++){
+ ca.increment(i);
+ }
+
+ if(ssr.correct){
+ byte[] array=correct[chrom];
+ for(int i=min; i<=max; i++){
+ //Saturate at Byte.MAX_VALUE instead of overflowing
+ if(array[i]<Byte.MAX_VALUE){array[i]++;}
+ }
+ }
+ }
+
+ s=tf.nextLine();
+ }
+ tf.close();
+
+ //Write each chromosome's coverage; arrays whose length exceeds maxIndex by more than 2000 are first resized to maxIndex+1
+ for(int i=1; i<coverage.length; i++){
+ if(coverage[i].arrayLength()-coverage[i].maxIndex>2000){coverage[i].resize(coverage[i].maxIndex+1);}
+ ReadWrite.writeObjectInThread(coverage[i], outpattern.replaceFirst("#", ""+i), false);
+ }
+
+ long totalCoverage=0;
+ long totalCoverageBase=0;
+ long totalCoverageN=0;
+ long correctCoverage=0;
+ long correctCoverageBase=0;
+ long correctCoverageN=0;
+
+ long onlyCorrectBase=0;
+ long onlyIncorrectBase=0;
+ long onlyCorrectN=0;
+ long onlyIncorrectN=0;
+ long mostlyCorrectBase=0;
+ long mostlyIncorrectBase=0;
+ long mostlyCorrectN=0;
+ long mostlyIncorrectN=0;
+ long anyCorrectBase=0;
+ long anyCorrectN=0;
+ long noCorrectBase=0;
+ long noCoverageBase=0;
+ long noCoverageN=0;
+
+ long baseCount=0;
+ long nCount=0;
+ long nCountCovered=0;
+
+ //Pass 2: classify every position, separately for defined bases and for N
+ for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ CoverageArray cov=coverage[chrom];
+ byte[] cor=correct[chrom];
+ //NOTE(review): the bound is exclusive, so position cov.maxIndex itself is never tallied,
+ //and positions beyond it are not counted as uncovered - confirm this is intended.
+ for(int i=0; i<cov.maxIndex; i++){
+ char b=Character.toUpperCase((char)cha.get(i));
+ //Anything that is not a fully defined base is treated as N
+ if(!AminoAcid.isFullyDefined(b)){b='N';}
+
+ int total=cov.get(i);
+ int good=cor[i];
+ int bad=total-good;
+
+ totalCoverage+=total;
+ correctCoverage+=good;
+
+ if(b=='N'){
+ nCount++;
+ if(total>=mincoverage){
+ totalCoverageN+=total;
+ correctCoverageN+=good;
+ nCountCovered++;
+ if(total==good){
+ onlyCorrectN++;
+ mostlyCorrectN++;
+ }else if(good>bad){
+ mostlyCorrectN++;
+ }else if(good==0){
+ onlyIncorrectN++;
+ mostlyIncorrectN++;
+ }else if(bad>good){
+ mostlyIncorrectN++;
+ }
+ if(good>0){anyCorrectN++;}
+ }else{
+ noCoverageN++;
+ }
+ }else{
+ baseCount++;
+ if(total>=mincoverage){
+ totalCoverageBase+=total;
+ correctCoverageBase+=good;
+ if(total==good){
+ onlyCorrectBase++;
+ mostlyCorrectBase++;
+ }else if(good>bad){
+ mostlyCorrectBase++;
+ }else if(good==0){
+ onlyIncorrectBase++;
+ mostlyIncorrectBase++;
+ noCorrectBase++;
+ }else if(bad>good){
+ mostlyIncorrectBase++;
+ }
+ if(good>0){anyCorrectBase++;}
+ }else{
+ noCoverageBase++;
+ noCorrectBase++;
+ }
+ }
+ }
+ //Drop references so the chromosome and its arrays can be reclaimed
+ Data.unload(chrom, true);
+ coverage[chrom]=null;
+ correct[chrom]=null;
+ }
+
+ //Reciprocals of the denominators. A zero count yields Infinity here, so the derived
+ //statistics below can print as Infinity or NaN rather than throwing.
+ long length=nCount+baseCount;
+ double invlen=1.0/length;
+ double invbase=1.0/baseCount;
+ double invn=1.0/nCount;
+ double invnc=1.0/nCountCovered; //covered N's
+
+ double totalCoverageB=totalCoverage*invlen;
+ double totalCoverageBaseB=totalCoverageBase*invbase;
+ double totalCoverageNB=totalCoverageN*invn;
+ double correctCoverageB=correctCoverage*invlen;
+ double correctCoverageBaseB=correctCoverageBase*invbase;
+ double correctCoverageNB=correctCoverageN*invn;
+
+ //Percentages: defined-base stats are over all defined bases, N stats over covered N's only
+ double onlyCorrectBaseB=onlyCorrectBase*invbase*100;
+ double onlyIncorrectBaseB=onlyIncorrectBase*invbase*100;
+ double onlyCorrectNB=onlyCorrectN*invnc*100;
+ double onlyIncorrectNB=onlyIncorrectN*invnc*100;
+ double mostlyCorrectBaseB=mostlyCorrectBase*invbase*100;
+ double mostlyIncorrectBaseB=mostlyIncorrectBase*invbase*100;
+ double mostlyCorrectNB=mostlyCorrectN*invnc*100;
+ double mostlyIncorrectNB=mostlyIncorrectN*invnc*100;
+ double anyCorrectBaseB=anyCorrectBase*invbase*100;
+ double anyCorrectNB=anyCorrectN*invnc*100;
+ double noCorrectBaseB=noCorrectBase*invbase*100;
+ double noCoverageBaseB=noCoverageBase*invbase*100;
+ double noCoverageNB=noCoverageN*invn*100;
+
+
+
+ double correctSitesB=correctSites*100d/totalSites;
+ double correctSiteLenB=correctSiteLen*100d/totalSiteLen;
+
+ //Report; values below 10 get an extra leading space
+ System.out.println("\nOverall Statistics");
+
+ if(bs!=null){
+ System.out.println("Reads Represented: \t"+bs.cardinality());
+ }
+ System.out.println(String.format("Total Correct Sites: \t"+(correctSitesB<10?" ":"")+"%.3f%% ", correctSitesB)+" \t"+correctSites);
+ System.out.println(String.format("Total Correct Site Length:\t"+(correctSiteLenB<10?" ":"")+"%.3f%% ", correctSiteLenB)+" \t"+correctSiteLen);
+
+ System.out.println("\nCoverage Statistics");
+
+ System.out.println(String.format("Avg Coverage: \t"+(totalCoverageB<10?" ":"")+"%.3f", totalCoverageB)+" \t"+totalCoverage);
+
+ System.out.println(String.format("Avg Coverage Base: \t"+(totalCoverageBaseB<10?" ":"")+"%.3f", totalCoverageBaseB)+" \t"+totalCoverageBase);
+
+ System.out.println(String.format("Avg Coverage N: \t"+(totalCoverageNB<10?" ":"")+"%.3f", totalCoverageNB)+" \t"+totalCoverageN);
+
+ System.out.println(String.format("Correct Coverage: \t"+(correctCoverageB<10?" ":"")+"%.3f", correctCoverageB)+" \t"+correctCoverage);
+
+ System.out.println(String.format("Correct Coverage Base: \t"+(correctCoverageBaseB<10?" ":"")+"%.3f", correctCoverageBaseB)+" \t"+correctCoverageBase);
+
+ System.out.println(String.format("Correct Coverage N: \t"+(correctCoverageNB<10?" ":"")+"%.3f", correctCoverageNB)+" \t"+correctCoverageN);
+
+ System.out.println("\nStatistics over Defined Bases");
+
+ System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectBaseB<10?" ":"")+"%.3f", onlyCorrectBaseB)+"%");
+ System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectBaseB<10?" ":"")+"%.3f", mostlyCorrectBaseB)+"%");
+ System.out.println(String.format("anyCorrect: \t"+(anyCorrectBaseB<10?" ":"")+"%.3f", anyCorrectBaseB)+"%");
+ System.out.println(String.format("noCorrect: \t"+(noCorrectBaseB<10?" ":"")+"%.3f", noCorrectBaseB)+"%");
+ System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectBaseB<10?" ":"")+"%.3f", mostlyIncorrectBaseB)+"%");
+ System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectBaseB<10?" ":"")+"%.3f", onlyIncorrectBaseB)+"%");
+ System.out.println(String.format("noCoverage: \t"+(noCoverageBaseB<10?" ":"")+"%.3f", noCoverageBaseB)+"%");
+
+ System.out.println("\nStatistics over N (for covered locations)");
+
+ System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectNB<10?" ":"")+"%.3f", onlyCorrectNB)+"%");
+ System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectNB<10?" ":"")+"%.3f", mostlyCorrectNB)+"%");
+ System.out.println(String.format("anyCorrect: \t"+(anyCorrectNB<10?" ":"")+"%.3f", anyCorrectNB)+"%");
+ System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectNB<10?" ":"")+"%.3f", mostlyIncorrectNB)+"%");
+ System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectNB<10?" ":"")+"%.3f", onlyIncorrectNB)+"%");
+ System.out.println(String.format("noCoverage (over all N): \t"+(noCoverageNB<10?" ":"")+"%.3f", noCoverageNB)+"%");
+
+
+ }
+
+
+ public static void process(final String fname, final int genome, final int mincoverage){
+ Data.setGenome(genome);
+
+ BitSet bs=new BitSet();
+
+ byte[][] coverage=new byte[Data.numChroms+1][];
+ byte[][] correct=new byte[Data.numChroms+1][];
+
+ for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ coverage[chrom]=new byte[Data.chromLengths[chrom]];
+ correct[chrom]=new byte[Data.chromLengths[chrom]];
+ }
+
+ TextFile tf=new TextFile(fname, true, false);
+ String s=tf.nextLine();
+
+ long totalSites=0;
+ long correctSites=0;
+ long totalSiteLen=0;
+ long correctSiteLen=0;
+
+ while(s!=null){
+ SiteScoreR[] sites=toSites(s);
+ for(SiteScoreR ssr : sites){
+
+ if(bs!=null){
+ bs.set((int)ssr.numericID);
+ }
+
+ int len=ssr.stop-ssr.start+1;
+ totalSites++;
+ totalSiteLen+=len;
+ if(ssr.correct){
+ correctSites++;
+ correctSiteLen+=len;
+ }
+
+
+ int chrom=ssr.chrom;
+ int min=Tools.max(ssr.start, 0);
+ int max=Tools.min(ssr.stop, Data.chromLengths[chrom]-1);
+ byte[] array=coverage[chrom];
+ for(int i=min; i<=max; i++){
+ if(array[i]<Byte.MAX_VALUE){array[i]++;}
+ }
+ if(ssr.correct){
+ array=correct[chrom];
+ for(int i=min; i<=max; i++){
+ if(array[i]<Byte.MAX_VALUE){array[i]++;}
+ }
+ }
+ }
+
+ s=tf.nextLine();
+ }
+ tf.close();
+
+ long totalCoverage=0;
+ long totalCoverageBase=0;
+ long totalCoverageN=0;
+ long correctCoverage=0;
+ long correctCoverageBase=0;
+ long correctCoverageN=0;
+
+ long onlyCorrectBase=0;
+ long onlyIncorrectBase=0;
+ long onlyCorrectN=0;
+ long onlyIncorrectN=0;
+ long mostlyCorrectBase=0;
+ long mostlyIncorrectBase=0;
+ long mostlyCorrectN=0;
+ long mostlyIncorrectN=0;
+ long anyCorrectBase=0;
+ long anyCorrectN=0;
+ long noCorrectBase=0;
+ long noCoverageBase=0;
+ long noCoverageN=0;
+
+ long baseCount=0;
+ long nCount=0;
+ long nCountCovered=0;
+
+ for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ byte[] cov=coverage[chrom];
+ byte[] cor=correct[chrom];
+ for(int i=0; i<cov.length; i++){
+ char b=Character.toUpperCase((char)cha.get(i));
+ if(!AminoAcid.isFullyDefined(b)){b='N';}
+
+ int total=cov[i];
+ int good=cor[i];
+ int bad=total-good;
+
+ totalCoverage+=total;
+ correctCoverage+=good;
+
+ if(b=='N'){
+ nCount++;
+ if(total>=mincoverage){
+ totalCoverageN+=total;
+ correctCoverageN+=good;
+ nCountCovered++;
+ if(total==good){
+ onlyCorrectN++;
+ mostlyCorrectN++;
+ }else if(good>bad){
+ mostlyCorrectN++;
+ }else if(good==0){
+ onlyIncorrectN++;
+ mostlyIncorrectN++;
+ }else if(bad>good){
+ mostlyIncorrectN++;
+ }
+ if(good>0){anyCorrectN++;}
+ }else{
+ noCoverageN++;
+ }
+ }else{
+ baseCount++;
+ if(total>=mincoverage){
+ totalCoverageBase+=total;
+ correctCoverageBase+=good;
+ if(total==good){
+ onlyCorrectBase++;
+ mostlyCorrectBase++;
+ }else if(good>bad){
+ mostlyCorrectBase++;
+ }else if(good==0){
+ onlyIncorrectBase++;
+ mostlyIncorrectBase++;
+ noCorrectBase++;
+ }else if(bad>good){
+ mostlyIncorrectBase++;
+ }
+ if(good>0){anyCorrectBase++;}
+ }else{
+ noCoverageBase++;
+ noCorrectBase++;
+ }
+ }
+ }
+ Data.unload(chrom, true);
+ coverage[chrom]=null;
+ correct[chrom]=null;
+ }
+
+ long length=nCount+baseCount;
+ double invlen=1.0/length;
+ double invbase=1.0/baseCount;
+ double invn=1.0/nCount;
+ double invnc=1.0/nCountCovered; //covered N's
+
+ double totalCoverageB=totalCoverage*invlen;
+ double totalCoverageBaseB=totalCoverageBase*invbase;
+ double totalCoverageNB=totalCoverageN*invn;
+ double correctCoverageB=correctCoverage*invlen;
+ double correctCoverageBaseB=correctCoverageBase*invbase;
+ double correctCoverageNB=correctCoverageN*invn;
+
+ double onlyCorrectBaseB=onlyCorrectBase*invbase*100;
+ double onlyIncorrectBaseB=onlyIncorrectBase*invbase*100;
+ double onlyCorrectNB=onlyCorrectN*invnc*100;
+ double onlyIncorrectNB=onlyIncorrectN*invnc*100;
+ double mostlyCorrectBaseB=mostlyCorrectBase*invbase*100;
+ double mostlyIncorrectBaseB=mostlyIncorrectBase*invbase*100;
+ double mostlyCorrectNB=mostlyCorrectN*invnc*100;
+ double mostlyIncorrectNB=mostlyIncorrectN*invnc*100;
+ double anyCorrectBaseB=anyCorrectBase*invbase*100;
+ double anyCorrectNB=anyCorrectN*invnc*100;
+ double noCorrectBaseB=noCorrectBase*invbase*100;
+ double noCoverageBaseB=noCoverageBase*invbase*100;
+ double noCoverageNB=noCoverageN*invn*100;
+
+
+
+ double correctSitesB=correctSites*100d/totalSites;
+ double correctSiteLenB=correctSiteLen*100d/totalSiteLen;
+
+ System.out.println("\nOverall Statistics");
+
+ if(bs!=null){
+ System.out.println("Reads Represented: \t"+bs.cardinality());
+ }
+ System.out.println(String.format("Total Correct Sites: \t"+(correctSitesB<10?" ":"")+"%.3f%% ", correctSitesB)+" \t"+correctSites);
+ System.out.println(String.format("Total Correct Site Length:\t"+(correctSiteLenB<10?" ":"")+"%.3f%% ", correctSiteLenB)+" \t"+correctSiteLen);
+
+ System.out.println("\nCoverage Statistics");
+
+ System.out.println(String.format("Avg Coverage: \t"+(totalCoverageB<10?" ":"")+"%.3f", totalCoverageB)+" \t"+totalCoverage);
+
+ System.out.println(String.format("Avg Coverage Base: \t"+(totalCoverageBaseB<10?" ":"")+"%.3f", totalCoverageBaseB)+" \t"+totalCoverageBase);
+
+ System.out.println(String.format("Avg Coverage N: \t"+(totalCoverageNB<10?" ":"")+"%.3f", totalCoverageNB)+" \t"+totalCoverageN);
+
+ System.out.println(String.format("Correct Coverage: \t"+(correctCoverageB<10?" ":"")+"%.3f", correctCoverageB)+" \t"+correctCoverage);
+
+ System.out.println(String.format("Correct Coverage Base: \t"+(correctCoverageBaseB<10?" ":"")+"%.3f", correctCoverageBaseB)+" \t"+correctCoverageBase);
+
+ System.out.println(String.format("Correct Coverage N: \t"+(correctCoverageNB<10?" ":"")+"%.3f", correctCoverageNB)+" \t"+correctCoverageN);
+
+ System.out.println("\nStatistics over Defined Bases");
+
+ System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectBaseB<10?" ":"")+"%.3f", onlyCorrectBaseB)+"%");
+ System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectBaseB<10?" ":"")+"%.3f", mostlyCorrectBaseB)+"%");
+ System.out.println(String.format("anyCorrect: \t"+(anyCorrectBaseB<10?" ":"")+"%.3f", anyCorrectBaseB)+"%");
+ System.out.println(String.format("noCorrect: \t"+(noCorrectBaseB<10?" ":"")+"%.3f", noCorrectBaseB)+"%");
+ System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectBaseB<10?" ":"")+"%.3f", mostlyIncorrectBaseB)+"%");
+ System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectBaseB<10?" ":"")+"%.3f", onlyIncorrectBaseB)+"%");
+ System.out.println(String.format("noCoverage: \t"+(noCoverageBaseB<10?" ":"")+"%.3f", noCoverageBaseB)+"%");
+
+ System.out.println("\nStatistics over N (for covered locations)");
+
+ System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectNB<10?" ":"")+"%.3f", onlyCorrectNB)+"%");
+ System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectNB<10?" ":"")+"%.3f", mostlyCorrectNB)+"%");
+ System.out.println(String.format("anyCorrect: \t"+(anyCorrectNB<10?" ":"")+"%.3f", anyCorrectNB)+"%");
+ System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectNB<10?" ":"")+"%.3f", mostlyIncorrectNB)+"%");
+ System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectNB<10?" ":"")+"%.3f", onlyIncorrectNB)+"%");
+ System.out.println(String.format("noCoverage (over all N): \t"+(noCoverageNB<10?" ":"")+"%.3f", noCoverageNB)+"%");
+
+
+ }
+
+
+
+ /** Parses tab-delimited text into one SiteScoreR per field, in field order (via SiteScoreR.fromText).
+ * @return Array with one parsed entry per tab-delimited field of s */
+ public static SiteScoreR[] toSites(String s){
+ final String[] fields=s.split("\t");
+ final SiteScoreR[] scores=new SiteScoreR[fields.length];
+ for(int i=0; i<fields.length; i++){scores[i]=SiteScoreR.fromText(fields[i]);}
+ return scores;
+ }
+
+ public static int MIN_END_DIST=GenerateVarlets.MIN_END_DIST; //Initialized from GenerateVarlets.MIN_END_DIST; the two values must stay the same.
+
+}
diff --git a/current/pacbio/GenerateMultiChrom.java b/current/pacbio/GenerateMultiChrom.java
new file mode 100755
index 0000000..5fed33d
--- /dev/null
+++ b/current/pacbio/GenerateMultiChrom.java
@@ -0,0 +1,169 @@
+package pacbio;
+
+import java.io.File;
+import java.util.Random;
+
+
+import align2.Tools;
+
+import dna.AminoAcid;
+import dna.ChromArrayMaker;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import dna.Gene;
+import fileIO.ReadWrite;
+
+/**
+ * Builds a synthetic multi-chromosome genome in which every chromosome is a mutated copy of one source chromosome.
+ * @author Brian Bushnell
+ * @date Jul 16, 2012
+ */
+public class GenerateMultiChrom {
+
+ /**
+ * Writes mutated copies of one source chromosome as chr1..chrN of a new genome build.
+ * @param args [0] source genome build number (its chromosome 1 is used), or otherwise a name handed to ChromosomeArray.read;
+ * [1] number of copies to write; [2] output build number;
+ * optionally [3] mincontig, [4] maxcontig and [5] buffer, which must be supplied together.
+ * @throws IllegalArgumentException if only some of the three optional arguments are supplied
+ */
+ public static void main(String[] args){
+ ChromosomeArray cha=null;
+ try {
+ int genomeIn=Integer.parseInt(args[0]);
+ Data.setGenome(genomeIn);
+ cha=Data.getChromosome(1);
+ Data.unload(1, true);
+ } catch (NumberFormatException e) {
+ //args[0] is not a build number; it is passed to ChromosomeArray.read below instead
+ }
+ if(cha==null){
+ String inname=args[0];
+ cha=ChromosomeArray.read(inname, 1);
+ }
+ assert(cha!=null);
+
+ int copies=Integer.parseInt(args[1]);
+ int build=Integer.parseInt(args[2]);
+ int mincontig=-1;
+ int maxcontig=-1;
+ int buffer=-1;
+ if(args.length>3){
+ //The three optional values are read together; 4 or 5 arguments used to end in an ArrayIndexOutOfBoundsException.
+ if(args.length<6){throw new IllegalArgumentException("mincontig, maxcontig, and buffer must all be specified together.");}
+ mincontig=Integer.parseInt(args[3]);
+ maxcontig=Integer.parseInt(args[4]);
+ buffer=Integer.parseInt(args[5]);
+ System.out.println("Multichrom will be overlayed with blocks of "+buffer+" 'N'");
+ }
+
+// String pattern=ROOT_GENOME+GENOME_BUILD+"/chr"+chrom+".chromC";
+ File f=new File(Data.ROOT_GENOME+build);
+ if(!f.exists()){f.mkdirs();}
+ for(int i=1; i<=copies; i++){
+ ChromosomeArray chb=makeSynthetic(cha, i);
+ if(buffer>0){
+ addN(chb, mincontig, maxcontig, buffer);
+ }
+ ReadWrite.write(chb, Data.ROOT_GENOME+build+"/chr"+i+Data.chromExtension(), false);
+ }
+ FastaToChromArrays2.writeInfo(build, copies, Data.name, "multiple_"+Data.GENOME_BUILD, false, false);
+ }
+
+ /**
+ * Overwrites blocks of 'buffer' positions with 'N' each time a run of non-N bases reaches a randomly chosen length.
+ * Only positions below cha.maxIndex-max(maxContig, minContig+buffer) are scanned; the generator is seeded with cha.chromosome.
+ * @param cha Chromosome to modify in place
+ * @param minContig Smallest run length that can trigger an N block
+ * @param maxContig Largest run length that can trigger an N block
+ * @param buffer Number of consecutive positions set to 'N' per block
+ */
+ private static void addN(ChromosomeArray cha, int minContig, int maxContig, int buffer){
+ final int spread=maxContig-minContig+1;
+ final Random randy=new Random(cha.chromosome);
+ final int lim=cha.maxIndex-Tools.max(maxContig, minContig+buffer);
+ int contig=0; //Length of the current run of non-N bases
+ int nextContig=minContig+randy.nextInt(spread);
+ for(int i=0; i<lim; i++){
+ byte b=cha.get(i);
+ if(b=='N'){contig=0;}
+ else{
+ contig++;
+ if(contig>=nextContig){
+ contig=0;
+ int lim2=i+buffer;
+ while(i<lim2){
+ cha.set(i, 'N');
+ i++;
+ }
+ nextContig=minContig+(randy.nextInt(spread)+randy.nextInt(spread))/2;
+ }
+ }
+ }
+ }
+
+ /**
+ * Creates a mutated copy of a chromosome containing random substitutions, insertions and deletions.
+ * A position holding 'N' never starts a mutation. The random generator is seeded with chrom.
+ * @param cha Source chromosome; only read
+ * @param chrom Number given to the new chromosome, also used as the random seed
+ * @return The new chromosome
+ */
+ private static ChromosomeArray makeSynthetic(ChromosomeArray cha, int chrom) {
+// assert(false) : cha.array.length+", "+cha.maxIndex;
+ ChromosomeArray chb=new ChromosomeArray(chrom, Gene.PLUS, cha.minIndex, cha.array.length+40);
+ chb.maxIndex=-1;
+ int dif=0; //Inserted minus deleted bases so far; the indel choice below keeps it within [MIN_DIF, MAX_DIF]
+ final int MIN_DIF=-12;
+ final int MAX_DIF=12;
+ final int INDEL_PERCENT=10;
+ final int SUB_PERCENT=1;
+ final int ERROR_PERCENT=INDEL_PERCENT+SUB_PERCENT;
+ final int ERROR_LENGTH=3;
+ Random randy=new Random(chrom);
+ int a=cha.minIndex; //Read position in cha
+ int b=chb.minIndex; //Write position in chb
+
+ //NOTE(review): the loop also reads index cha.array.length, one past the array's last index - confirm cha.get tolerates that.
+ while(a<=cha.array.length){
+ byte c=cha.get(a);
+ int x=(c=='N' ? 100 : randy.nextInt(100)); //100 always selects the no-error branch for N
+ if(x>=ERROR_PERCENT){ //No error
+ chb.set(b, c);
+ a++;
+ b++;
+ }else if(x>=INDEL_PERCENT){//sub
+ byte e=c;
+ while(e==c){
+ e=AminoAcid.numberToBase[randy.nextInt(4)];
+ }
+ chb.set(b, e);
+ a++;
+ b++;
+ }else{//indel
+ boolean ins=randy.nextBoolean();
+ int len=Tools.min(randy.nextInt(ERROR_LENGTH), randy.nextInt(ERROR_LENGTH), randy.nextInt(ERROR_LENGTH+1))+1;
+ if(ins && dif+len>MAX_DIF){
+ ins=false;
+ }else if(!ins && dif-len<MIN_DIF){
+ ins=true;
+ }
+ if(ins){
+ for(int i=0; i<len; i++){
+ boolean same=randy.nextFloat()<0.6f; //Additional 60% chance that inserted base will be a duplicate of an existing base
+ int n=randy.nextInt(4);
+ byte e=(same ? c : AminoAcid.numberToBase[n]);
+ chb.set(b, e);
+ b++;
+ dif++;
+ }
+ }else{
+ a+=len;
+ dif-=len;
+ }
+ }
+ }
+ return chb;
+ }
+}
diff --git a/current/pacbio/MakePacBioScript.java b/current/pacbio/MakePacBioScript.java
new file mode 100755
index 0000000..e1d313d
--- /dev/null
+++ b/current/pacbio/MakePacBioScript.java
@@ -0,0 +1,443 @@
+package pacbio;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import align2.Tools;
+
+import dna.Data;
+import dna.Parser;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * Generates a shell script for error-correcting PacBio reads with Illumina reads by filling in a template script.
+ * @author Brian Bushnell
+ * @date Oct 2, 2012
+ */
+public class MakePacBioScript {
+
+ /**
+ Builds the script: reads key=value arguments, substitutes them for the placeholders of a template file,
+ and writes the result to the output file, or prints it when out=null.
+ Be sure to replace:
+ @BUILDNUM with a number
+ @DIRTY_INPUT with the PacBio file
+ @CLEAN_INPUT_1 with the Illumina file
+ @ORGANISM with the name of the organism (or whatever)
+ @NUMSLOTS with the number of slots requested
+ @TARGET_SIZE with an estimate of the genome size, in bases. Examples: 160000000 or 160m or 0.16g are equivalent.
+ @RAM with e.g. -Xmx31g
+ @MAXRAM with e.g. -Xmx220g
+ @SCRIPT with the output file, e.g. run.sh
+ @MERGEREF with a list of reference files, e.g. chrom1.fa,chrom2.fa,chrom3.fa
+ @MERGEDIRTY with a list of dirty files, e.g. subreads1.fa,subreads2.fa,subreads3.fa
+ @MERGECLEAN with a list of clean files, e.g. illumina1.fq,illumina2.fq,illumina3.fq
+ @EXTRA with extra files for Illumina error correction. e.g. extra=a.fq,b.fq,c.fq
+
+ Optional:
+ @MAXREADS with the max number of clean reads to use in phase 1 (the slowest phase)
+ @REFERENCE with a reference file (optional)
+ @REFBUILD with a number
+ @param args key=value pairs; with no arguments the usage text is printed and the program exits
+ */
+ public static void main(String[] args){
+ if(args==null || args.length<1){
+ System.out.println("\nThis program generates a script for error-correcting PacBio reads using Illumina reads.\nSample command line:\n");
+// System.out.println("java -ea -Xmx64m"+(Data.WINDOWS ? "" : " -cp "+Data.ROOT)+" jgi.MakePacBioScript " +
+// "dirty=subreads.fa clean=illumina.fq ref=ecoliRef.fa name=ecoli " +
+// "out=run.sh template="+(Data.WINDOWS ? "" : "/house/homedirs/b/bushnell/template/")+"cleanPacbioTemplate.sh " +
+// "targetsize=5.4m threads=24 ram=31 maxram=100 noderam=256 build=-1 refbuild=-1 maxreads=-1");
+// System.out.println("java -ea -Xmx64m"+(Data.WINDOWS ? "" : " -cp "+Data.ROOT)+" jgi.MakePacBioScript " +
+// "dirty=subreads.fa clean=illumina.fq ref=ecoliRef.fa name=ecoli " +
+// "out=run.sh template="+(Data.WINDOWS ? "" : "/house/homedirs/b/bushnell/template/")+"cleanPacbioTemplate.sh " +
+// "targetsize=5.4m threads=24 noderam=256");
+// System.out.println("\n\nOr to be concise:");
+ System.out.println("java -ea -Xmx64m"+(Data.WINDOWS ? "" : " -cp "+Data.ROOT())+" jgi.MakePacBioScript " +
+ "d=subreads.fa c=illumina.fq tpl=template.sh ts=5.4m t=24 nm=256");
+ System.out.println("\n\nInput files can optionally be comma-separated lists of files, and absolute pathing can be used.");
+ System.out.println("All input files may be raw, gzipped, or bzipped as long as they have the correct file extension.");
+ System.out.println();
+ System.out.println("\n***** Required Parameters *****\n");
+ System.out.println("d=, dirty= \tPath to dirty (PacBio) reads. May be comma-delimited for multiple files.");
+ System.out.println("c=, clean= \tPath to clean (Illumina) reads. May be comma-delimited for multiple files.");
+ System.out.println("t=, threads= \tNumber of threads. Should equal the number of qsub slots or cores on the target machine.");
+ System.out.println("nm=, nodemem= \tPhysical memory (RAM) of target machine, in gigabytes.");
+ System.out.println("ts=, targetsize= \tEstimated size of target genome, in bases (k, m, or g may be used). Optional ONLY if a reference is supplied.");
+ System.out.println("\n***** Optional Parameters *****\n");
+ System.out.println("tpl=, template= \tPath to template for this script. Default is /house/homedirs/b/bushnell/template/cleanPacbioTemplate_ecc.sh");
+ System.out.println("mode= \tCan be specified instead of 'template='. Values are 'pacbio', 'assembly', or 'ccs'");
+ System.out.println("sort= \tTrue or false. Determines whether clean reads are sorted (alphabetically) and duplicates are removed.");
+ System.out.println("r=, ref= \tPath to reference fasta. May be comma-delimited for multiple files.");
+ System.out.println("o=, out= \tName of output script. Default is 'run.sh'.");
+ System.out.println("name= \tName of organism. Default is 'organism'.");
+ System.out.println("h=, hours= \tTime limit (in hours) for autogenerated qsub command line.");
+ System.out.println("m=,mem= \tAmount of heap memory for Java to use. Default is 31g; must be at least 10x number input PacBio bases." +
+ "\n \tNote! Two steps, Illumina error correction and site stacking, will ignore this and use all physical memory.");
+ System.out.println("b=,build= \tPrefix for index build number. Default is 2, yielding successively improved builds 2, 200, 201, 202, ... 208");
+ System.out.println("rb=,refbuild= \tReference build number. Default is 1.");
+ System.out.println("cp=,classpath= \tClasspath to the program. If unspecified, will be autodetected as "+
+ (Data.WINDOWS ? "/house/homedirs/b/bushnell/beta19/" : Data.ROOT()));
+// r=ref.fa o=run.sh
+ System.exit(0);
+ }
+
+ String dirty=null;
+ String clean=null;
+ String name="organism";
+ String targetsize=null;
+ String ref=null;
+ String template=null;
+ String output="run.sh";
+ String extra="";
+ String classpath=(Data.WINDOWS ? "/house/homedirs/b/bushnell/beta19/" : Data.ROOT());
+ String sort_in="";
+ String sorted="sorted_topo#.txt.gz";
+ String sorted_out="sorted_topo1.txt.gz";
+
+ String mergeref=null;
+ String mergedirty=null;
+ String mergeclean=null;
+
+ String qsub=null;
+
+ String cleanecc="@ORGANISM_ecc_1.txt.gz";
+ String cleanbadecc="@ORGANISM_ecc_1_BAD.txt.gz";
+ String cleanallecc="@ORGANISM_ecc_1_ALL.txt.gz";
+
+ String mode="pacbio";
+
+ int build=2;
+ int threads=24;
+ int ram=31;
+ int maxram=-1;
+ int refbuild=-1;
+ int noderam=-1;
+ long maxReads=-1;
+ int runtime=499;
+
+ boolean ecc=false;
+ boolean sort=true;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ if(split.length!=2){
+ System.out.println("Wrong number of arguments for variable "+split[0]);
+ System.exit(0);
+ }
+ String a=split[0].toLowerCase();
+ String b=split[1];
+ if(b.equalsIgnoreCase("null")){b=null;} //The literal "null" means "not specified", so b may be null in the file branches below
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("threads") || a.startsWith("slots") || a.equals("t")){
+ threads=Integer.parseInt(b);
+ }else if(a.equals("mode")){
+ mode=b;
+ }else if(a.startsWith("reads") || a.startsWith("maxreads") || a.equals("rd")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.startsWith("build") || a.startsWith("genome") || a.equals("b")){
+ build=Integer.parseInt(b);
+ String s=Data.chromFname(1, build);
+ if((new File(s)).exists()){System.out.println("Warning! Genome build "+b+" already exists at "+s);}
+ }else if(a.startsWith("refbuild") || a.startsWith("refgenome") || a.equals("rb")){
+ refbuild=Integer.parseInt(b);
+ String s=Data.chromFname(1, refbuild);
+ if((new File(s)).exists()){System.out.println("Warning! Genome build "+b+" already exists at "+s);}
+ }else if(a.startsWith("ram") || a.startsWith("mem") || a.equals("m")){
+ ram=Integer.parseInt(b.toLowerCase().replaceAll("g", ""));
+ }else if(a.startsWith("maxram") || a.startsWith("maxmem") || a.equals("mm")){
+ maxram=Integer.parseInt(b.toLowerCase().replaceAll("g", ""));
+ }else if(a.startsWith("noderam") || a.startsWith("nodemem") || a.equals("nm")){
+ if(b!=null){noderam=Integer.parseInt(b.toLowerCase().replaceAll("g", ""));}
+ }else if(a.equals("runtime") || a.equals("hours") || a.equals("time") || a.equals("h")){
+ runtime=Integer.parseInt(b.toLowerCase().replaceAll("h", ""));
+ }else if(a.startsWith("dirty") || a.startsWith("pacbio") || a.equals("d")){
+ dirty=b;
+ if(dirty!=null && dirty.contains(",")){
+ mergedirty=dirty;
+ dirty="concatenatedDirtyFiles.fa.gz";
+ if((new File(dirty)).exists()){System.out.println("Warning! file already exists: "+dirty);}
+ }else if(b!=null){
+ if(!(new File(b)).exists()){System.out.println("Warning! No such file: "+b);}
+ }
+ }else if(a.startsWith("clean") || a.startsWith("illumina") || a.equals("c")){
+ clean=b;
+
+ if(clean!=null && clean.contains(",")){
+ String ext="fq";
+ if(clean.contains(".fasta,") || clean.contains(".fa,") || clean.contains(".fasta.gz,") || clean.contains(".fa.gz,")){ext="fa";}
+ else if(clean.contains(".txt,") || clean.contains(".txt.gz,")){ext="txt";}
+
+ mergeclean=clean;
+ clean="concatenatedCleanFiles."+ext+".gz";
+ if((new File(clean)).exists()){System.out.println("Warning! file already exists: "+clean);}
+ }else if(b!=null){
+ if(!(new File(b)).exists()){System.out.println("Warning! No such file: "+b);}
+ }
+ }else if(a.startsWith("name") || a.startsWith("organism")){
+ name=b;
+ if(name==null){name="organism";}
+ }else if(a.startsWith("size") || a.startsWith("targetsize") || a.equals("ts")){
+ targetsize=b;
+ }else if(a.equals("path") || a.equals("classpath") || a.equals("cp")){
+ classpath=b;
+ }else if(a.startsWith("template") || a.equals("tpl")){
+ if(b!=null){
+ template=b;
+ if(!(new File(b)).exists()){System.out.println("Warning! No such file: "+b);}
+ }
+ }else if(a.startsWith("extra") || a.equals("ex")){
+ extra=(b==null ? "" : "extra="+b);
+ if(b!=null && !b.contains(",") && !(new File(b)).exists()){System.out.println("Warning! No such file: "+b);}
+ }else if(a.startsWith("ref") || a.equals("r")){
+ ref=b;
+ if(ref!=null && ref.contains(",")){
+ mergeref=ref;
+ ref="concatenatedReferenceFiles.fa.gz";
+ if((new File(ref)).exists()){System.out.println("Warning! file already exists: "+ref);}
+ }else if(b!=null){
+ if(!(new File(b)).exists()){System.out.println("Warning! No such file: "+b);}
+ }
+ }else if(a.startsWith("out") || a.equals("o")){
+ output=b;
+ if(b!=null && (new File(b)).exists()){System.out.println("Warning! Outfile already exists: "+b);}
+ }else if(a.startsWith("ecc")){
+ ecc=Tools.parseBoolean(b);
+ }else if(a.equals("sort")){
+ sort=Tools.parseBoolean(b);
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+ //No explicit template was given: choose one from the mode
+ if(template==null){
+ if(mode==null){mode="pacbio";}
+ mode=mode.toLowerCase();
+ if(mode.equals("pacbio") || mode.equals("pacbio_illumina")){
+ template=(Data.WINDOWS ? "C:/workspace/prune/cleanPacbioTemplate_ecc.sh" : "/house/homedirs/b/bushnell/template/cleanPacbioTemplate_ecc_maxram.sh");
+ }else if(mode.equals("assembly") || mode.equals("assembly_illumina")
+ || mode.equals("reference") || mode.equals("reference_illumina")){
+ template=(Data.WINDOWS ? "C:/workspace/prune/correctReference.sh" : "/house/homedirs/b/bushnell/template/correctReference_maxram.sh");
+ }else if(mode.equals("ccs") || mode.startsWith("ccs_")){
+ throw new RuntimeException("TODO: Mode "+mode+" is unfinished.");
+ }else if(mode.equals("pacbio_ccs") || mode.endsWith("_ccs")){
+ throw new RuntimeException("TODO: Mode "+mode+" is unfinished.");
+ }
+ }
+// assert(false) : mode+", "+template;
+ //Choose the file names substituted for the sort and ecc placeholders
+ if(ecc){
+ if(sort){
+ sort_in=cleanecc;
+ sorted_out=sorted.replaceFirst("#", "1");
+ }else{
+ sorted_out=sorted=clean;
+ }
+ }else{
+ if(sort){
+ sort_in=clean;
+ sorted_out=sorted.replaceFirst("#", "1");
+ cleanecc=sorted_out;
+ cleanbadecc=sorted_out;
+ cleanallecc=sorted_out;
+ }else{
+ sorted_out=sorted=clean;
+ cleanecc=clean;
+ cleanbadecc=clean;
+ cleanallecc=clean;
+ }
+ }
+
+ assert(threads>0);
+ //Without an explicit noderam, pick one from the thread count
+ if(noderam<1){
+ if(threads<9){noderam=144;}
+ else if(threads<25){noderam=252;}//Changed due to crash at 217 GB on 24-core nodes.
+ else if(threads<33){noderam=512;}
+ else if(threads<41){noderam=1024;}
+ else{noderam=2048;}
+ System.out.println("Set noderam at "+noderam+"g");
+ }
+
+ String slotram;
+ if(noderam%threads==0){slotram=(noderam/threads)+"G";}
+ else{slotram=((noderam*990)/threads)+"M";}
+
+ if(noderam>0){
+ if(maxram<1){
+ maxram=(int)(noderam*(noderam>256 ? 0.83 : 0.85f));
+ System.out.println("Set maxram at "+maxram+"g");
+ }
+ }
+
+ if(ram>maxram){
+ ram=maxram;
+ System.out.println("Set ram at "+maxram+"g");
+ }
+ //Take targetsize from the reference file when it is "auto", or unset while a reference is given
+ if("auto".equalsIgnoreCase(targetsize) || (targetsize==null && ref!=null)){
+ if(ref==null){throw new RuntimeException("Ref file must be specified for auto targetsize.");}
+ File f=new File(ref);
+ if(!f.exists()){throw new RuntimeException("Ref file must exist for auto targetsize.");}
+ if(f.exists()){
+ targetsize=""+new File(ref).length();
+ if(ref.endsWith(".gz") || ref.endsWith(".gzip") || ref.endsWith(".zip") || ref.endsWith(".bz2")){
+ TextFile tf=new TextFile(ref, false, false);
+ long x=1;
+ for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){x+=s.length();}
+ tf.close();
+ targetsize=""+x;
+ }
+ }
+ }
+
+ if(ref!=null && refbuild<1){
+ if(build==1){refbuild=2;}
+ else{refbuild=1;}
+ }
+
+ if(dirty==null){throw new RuntimeException("No dirty file specified.");}
+ if(clean==null){throw new RuntimeException("No clean file specified.");}
+ if(targetsize==null){throw new RuntimeException("No targetsize specified.");}
+ if(template==null){throw new RuntimeException("No template file specified.");}
+ if(!new File(template).exists()){throw new RuntimeException("Template file "+template+" does not exist; please specify a different template.");}
+ if(build==refbuild){throw new RuntimeException("Build id and ref build id must differ.");}
+ if(build<1){throw new RuntimeException("No build id.");}
+ if(ref!=null && refbuild<1){throw new RuntimeException("No ref build id.");}
+ if(ref==null && refbuild>0 && !(new File(Data.chromFname(1, refbuild))).exists()){throw new RuntimeException("Ref build id specified, but no reference file.");}
+
+ String[] lines;
+ {
+ TextFile tf=new TextFile(template, false, false);
+ lines=tf.toStringLines();
+ tf.close(); //Release the template file, as is done for the reference file above
+ }
+
+ StringBuilder sb=new StringBuilder();
+ for(int i=0; i<lines.length; i++){
+ String s=lines[i];
+
+ boolean eccline=s.contains("?ecc?");
+ boolean sortline=s.contains("?sort?");
+ boolean refline=s.contains("?ref?");
+ //NOTE(review): mergeline tests the same "?ref?" marker as refline, so it never changes 'optional' - confirm whether a merge marker was intended.
+ boolean mergeline=s.contains("?ref?");
+ boolean optional=(!eccline && !sortline && !refline && !mergeline && s.startsWith("#?")); //Optional for some other reason
+
+ if(eccline){
+ s=s.replaceAll("\\?ecc\\?", "");
+ while(ecc && s.startsWith("#")){s=s.substring(1);}
+ }
+ if(sortline){
+ s=s.replaceAll("\\?sort\\?", "");
+ while(sort && s.startsWith("#")){s=s.substring(1);}
+ }
+ if(refline){
+ s=s.replaceAll("\\?ref\\?", "");
+ while(refbuild>0 && s.startsWith("#")){s=s.substring(1);}
+ }
+
+ if(!s.startsWith("#")){
+ if((eccline && !ecc) || (sortline && !sort) || (refline && refbuild<1)){s="#"+s;}
+ }
+
+ if(optional){
+ s=s.substring(2);
+ }
+
+ if((s.contains("@MAXRAM") && maxram>31) || (s.contains("@RAM") && ram>31)){
+ s=s.replace("-XX:+UseCompressedOops ", "");
+ }
+
+ s=s.replace("@CLEAN_ECC_1", cleanecc);
+ s=s.replace("@CLEAN_BAD_ECC_1", cleanbadecc);
+ s=s.replace("@CLEAN_ALL_ECC_1", cleanallecc);
+ s=s.replace("@SORT_IN", sort_in);
+ s=s.replace("@SORTED_OUT", sorted_out);
+ s=s.replace("@SORTED", sorted);
+
+ s=s.replace("@SLOTRAM", slotram);
+ s=s.replace("@BUILDNUM", ""+build);
+ s=s.replace("@DIRTY_INPUT", dirty);
+ s=s.replace("@CLEAN_INPUT_1", clean);
+ s=s.replace("@ORGANISM", name);
+ s=s.replace("@NUMSLOTS", ""+threads);
+ s=s.replace("@TARGET_SIZE", targetsize);
+ s=s.replace("@RAM", "-Xmx"+ram+"g");
+ s=s.replace("@MAXRAM", "-Xmx"+maxram+"g");
+ s=s.replace("@MAXREADS", ""+maxReads);
+ s=s.replace("@SCRIPT", (output==null ? "run.sh" : output));
+ s=s.replace("@EXTRA", extra);
+ s=s.replace("@RUNTIME", ""+runtime);
+ s=s.replace("@CLASSPATH", classpath);
+
+ if(s.contains("@REFBUILD")){
+ if(refbuild<1){
+ s="#"+s;
+ }else{
+ s=s.replace("@REFBUILD", ""+refbuild);
+ }
+ }
+
+ if(s.contains("@REFERENCE")){
+ if(ref==null){
+ s="#"+s;
+ }else{
+ s=s.replace("@REFERENCE", ref);
+ }
+ }
+
+ if(s.contains("@MERGECLEAN")){
+ if(mergeclean==null){
+ s="#"+s;
+ }else{
+ s=s.replace("@MERGECLEAN", mergeclean);
+ }
+ }
+
+ if(s.contains("@MERGEDIRTY")){
+ if(mergedirty==null){
+ s="#"+s;
+ }else{
+ s=s.replace("@MERGEDIRTY", mergedirty);
+ }
+ }
+
+ if(s.contains("@MERGEREF")){
+ if(mergeref==null){
+ s="#"+s;
+ }else{
+ s=s.replace("@MERGEREF", mergeref);
+ }
+ }
+
+ while(s.startsWith("##")){s=s.substring(1);}
+
+ assert(s==null || s.length()<1 || s.startsWith("#") || !s.contains("@")) : s;
+
+ if(s!=null && !s.startsWith("#//")){sb.append(s).append('\n');}
+
+ if(qsub==null && s.contains("export task") && s.contains("qsub")){
+ qsub=s;
+ }
+ }
+
+ if(output==null){
+ System.out.println(sb);
+ }else{
+ ReadWrite.writeString(sb, output, false);
+ System.out.println("Wrote "+output);
+ if(qsub!=null){
+ while(qsub.startsWith("#")){qsub=qsub.substring(1);}
+ System.out.println("The script can be executed on Genepool with the following command:\n\n"+qsub.trim());
+ }
+ }
+
+
+ }
+
+
+}
diff --git a/current/pacbio/MergeFastaContigs.java b/current/pacbio/MergeFastaContigs.java
new file mode 100755
index 0000000..47977e6
--- /dev/null
+++ b/current/pacbio/MergeFastaContigs.java
@@ -0,0 +1,532 @@
+package pacbio;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.Read;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public class MergeFastaContigs {
+
+
+ public static void main(String[] args){
+ System.out.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName()+" "+Arrays.toString(args)));
+
+ Timer t=new Timer();
+ String infile=null;
+ String outfile=null;
+ String outindex=null;
+ int npl=-1;
+ int npl2=-1;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=(split.length>1 ? split[1] : "true");
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") && split.length>0){
+ infile=b;
+ }else if(a.equals("out") && split.length>0){
+ outfile=b;
+ }else if(a.equals("index") && split.length>0){
+ outindex=b;
+ }else if(a.equals("npad")){
+ npl=N_PAD_LENGTH=Integer.parseInt(b);
+ }else if(a.equals("npad2")){
+ npl2=N_PAD_LENGTH2=Integer.parseInt(b);
+ }else if(a.equals("maxdataout")){
+ maxDataOut=Integer.parseInt(b);
+ }else if(a.equals("mincontig")){
+ MIN_CONTIG_TO_ADD=Integer.parseInt(b);
+ }else if(a.equals("maxlen")){
+ MAX_OUTPUT_LEN=Integer.parseInt(b);
+ }else if(a.equals("maxchroms")){
+ maxChromsOut=Integer.parseInt(b);
+ }else if(a.equals("maxdata")){
+ maxDataOut=Tools.parseKMG(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("padfront") || a.equals("padstart")){
+ PAD_START=Tools.parseBoolean(b);
+ }else{
+ throw new RuntimeException("Unknown argument "+arg);
+ }
+ }
+
+ if(infile==null){infile=args[0];}
+ if(outfile==null){outfile=args[1];}
+ if(outindex==null){outindex=args[2];}
+
+ try {
+ if(npl<0 && args.length>3){N_PAD_LENGTH=Integer.parseInt(args[3]);}
+ if(npl2<0 && args.length>4){N_PAD_LENGTH2=Integer.parseInt(args[4]);}
+ } catch (NumberFormatException e) {
+ //ignore
+ }
+
+ if(infile.contains(".fq.") || infile.endsWith(".fq") || infile.contains(".fastq.") || infile.endsWith(".fastq")){
+ mergeFastq(infile, outfile, outindex);
+ }else{
+ if(new File(infile).exists()){
+// System.err.println("Warning: This will run correctly, but I suggest against putting commas in your filenames.");
+// assert false : infile+", "+outfile+", "+outindex;
+ mergeFasta(new String[] {infile}, outfile, outindex);
+ }else{
+ String[] files=infile.split(",");
+ for(String s : files){
+ if(!new File(s).exists()){throw new RuntimeException("Cannot find file "+s);}
+ }
+ mergeFasta(files, outfile, outindex);
+ }
+ }
+ t.stop();
+
+ System.out.println("MergeFastaContigs output for "+Arrays.toString(args));
+ System.out.println("definedBasesIn: \t"+definedBasesIn);
+ System.out.println("contigsIn: \t"+contigsIn);
+ System.out.println("definedBasesOut: \t"+definedBasesOut);
+ System.out.println("basesOut: \t"+dataOut);
+ System.out.println("contigsOut: \t"+contigsOut);
+ System.out.println("chromsOut: \t"+chromsOut);
+
+ System.out.println("Time:\t"+t);
+
+ }
+
+
+
+ /**
+ * Concatenates the contigs of a fasta file into N-padded chromosomes. Headers (">chrN") are printed to System.out,
+ * sequence is emitted through the two-argument printAsLines, and one "chrom, location, label" line per written contig
+ * goes to System.err. Contigs shorter than MIN_CONTIG_TO_ADD are skipped. Reading stops once chrom reaches maxChromsOut
+ * or dataOut reaches maxDataOut; a contig still being buffered at that point is not written.
+ * @param infile Fasta file to read
+ * @param outfile Not used by this method
+ * @param outindex Not used by this method
+ */
+ public static void merge(String infile, String outfile, String outindex) {
+ StringBuilder temp=new StringBuilder(MIN_CONTIG_TO_ADD);
+ TextFile tf=new TextFile(infile, false, false);
+
+// OutputStream cos=ReadWrite.getOutputStream(outfile, false);
+// PrintWriter cpw=new PrintWriter(cos);
+
+ long loc=N_PAD_LENGTH;
+ int chrom=1;
+ System.out.println(">chr"+chrom);
+ npad=npad(N_PAD_LENGTH);
+ printAsLines(npad, 0);
+
+ String s=null;
+ String label=null;
+ for(s=tf.nextLine(); chrom<maxChromsOut && dataOut<maxDataOut; s=tf.nextLine()){
+ //An empty line is sequence data of length 0, not a header; charAt(0) on it would throw StringIndexOutOfBoundsException.
+ if(s==null || (s.length()>0 && s.charAt(0)=='>')){
+ if(s!=null){contigsIn++;}
+
+ //evict current contig
+ if(temp.length()>=MIN_CONTIG_TO_ADD){
+ long newloc=loc+temp.length()+N_PAD_LENGTH;
+ if(newloc>=MAX_OUTPUT_LEN){
+ //Evict old chrom
+ //Make new chrom
+ chrom++;
+ loc=N_PAD_LENGTH;
+ newloc=loc+temp.length()+N_PAD_LENGTH;
+ System.out.println("\n>chr"+chrom);
+ printAsLines(npad, 0);
+ }
+
+ printAsLines(temp, (int)(loc%lineBreak));
+ definedBasesOut+=temp.length();
+ contigsOut++;
+ printAsLines(npad, (int)((loc+temp.length())%lineBreak));
+ System.err.println(chrom+"\t"+loc+"\t"+label);
+ loc=newloc;
+ }else{
+// System.err.println("Ignored "+temp);
+ }
+
+ if(s==null){break;}
+ temp.setLength(0);
+ label=s.substring(1);
+ }else{
+ //append line to current contig
+ temp.append(s);
+ definedBasesIn+=s.length();
+ }
+ }
+ tf.close();
+
+ chromsOut=chrom;
+ System.out.println();
+ }
+
+
+
+ /** Merges the contigs of several fasta files into N-padded ">chr" records.
+ * @param infiles Input fasta files, processed in array order
+ * @param outfile Destination for the merged sequence
+ * @param outindex Destination for the index: one "chrom\tloc\tlabel" line per labelled contig written
+ */
+ public static void mergeFasta(String infiles[], String outfile, String outindex) {
+
+ if(new File(outfile).exists()){
+ for(String s : infiles){assert(!s.equalsIgnoreCase(outfile));}
+ }
+
+ //if(verbose){System.err.println("A");}
+
+ StringBuilder temp=new StringBuilder(MIN_CONTIG_TO_ADD); //Accumulates the sequence lines of the current contig
+ TextFile tf;
+
+ TextStreamWriter cout=new TextStreamWriter(outfile, overwrite, false, false);
+ TextStreamWriter iout=new TextStreamWriter(outindex, overwrite, false, false);
+
+ cout.start();
+ iout.start();
+ //if(verbose){System.err.println("B");}
+
+ long loc=(PAD_START ? N_PAD_LENGTH2 : 0); //Running offset inside the current chrom; also selects the output column via loc%lineBreak
+ int chrom=1;
+ cout.print(">chr"+chrom+"\n");
+ npad=npad(N_PAD_LENGTH);
+ npad2=npad2(N_PAD_LENGTH2);
+ assert(npad.length()<=npad2.length());
+ if(PAD_START){printAsLines(npad2, 0, cout);}
+ boolean np2=true; //Set when a chrom was just started; cleared by every appended sequence line and after every written contig
+// cout.poison();
+// assert(false) : "\n"+npad+"\n\n\n"+npad2+"\n";
+ //if(verbose){System.err.println("C");}
+
+// assert(false) : PAD_START+", "+np2;
+
+ for(String fname : infiles){
+ tf=new TextFile(fname, false, false);
+ String s=null;
+ String label=null;
+ if(verbose){System.err.println("Processing file "+fname);}
+ for(s=tf.nextLine(); chrom<maxChromsOut && dataOut<maxDataOut; s=tf.nextLine()){ //s==null (end of file) is handled inside so the last contig is flushed
+ //if(verbose){System.err.print("");}
+ if(verbose){System.err.println("Processing line "+s);}
+ if(s==null || s.charAt(0)=='>'){
+ if(verbose){System.err.println("Contig break");}
+// System.err.println("chrom="+chrom+", maxChromsOut="+maxChromsOut);
+
+ if(s!=null){contigsIn++;}
+ if(verbose){System.err.println("Contigs="+contigsIn);}
+
+ //evict current contig
+ if(temp.length()>=MIN_CONTIG_TO_ADD){
+ if(verbose){System.err.println("Big enough to add");}
+
+ long newloc=loc+temp.length()+N_PAD_LENGTH; //Offset after this contig and its trailing pad
+ if(newloc>=MAX_OUTPUT_LEN){
+ if(verbose){System.err.println("newloc>=MAX_OUTPUT_LEN");}
+ //Evict old chrom
+ printAsLines(npad2, (int)(loc%lineBreak), cout);
+
+ //Make new chrom
+ chrom++;
+ loc=N_PAD_LENGTH2;
+ newloc=loc+temp.length()+N_PAD_LENGTH;
+ cout.print("\n>chr"+chrom+"\n");
+ if(PAD_START){printAsLines(npad2, 0, cout);}
+ np2=true;
+ }
+ if(verbose){System.err.println("G");}
+
+ printAsLines(temp, (int)(loc%lineBreak), cout);
+
+ definedBasesOut+=temp.length();
+ contigsOut++;
+
+ if(np2){ //NOTE(review): appears reachable only right after the rollover above; this branch writes no npad after the contig - confirm intended
+ if(verbose){System.err.println("np2");}
+ if(PAD_START){
+ if(verbose){System.err.println("PAD_START");}
+ loc=N_PAD_LENGTH2;
+ newloc=N_PAD_LENGTH2+temp.length();
+ }else{
+ if(verbose){System.err.println("~PAD_START");}
+ loc=0;
+ newloc=temp.length();
+ }
+ }else{
+ if(verbose){System.err.println("PAD_START");} //NOTE(review): this is the !np2 branch; the message text looks copy-pasted
+ printAsLines(npad, (int)((loc+temp.length())%lineBreak), cout);
+ }
+ if(verbose){System.err.println("H");}
+ if(label!=null){iout.print(chrom+"\t"+loc+"\t"+label+"\n");}
+ loc=newloc;
+ np2=false;
+ }else{
+ // System.err.println("Ignored "+temp);
+ }
+ if(verbose){System.err.println("Done with contig");}
+
+ temp.setLength(0);
+ if(s==null){break;}
+ label=s.substring(1); //Header text without the leading '>'
+ }else{
+ np2=false;
+ //if(verbose){System.err.print("J");}
+ //append line to current contig
+ temp.append(s);
+ definedBasesIn+=s.length();
+ if(verbose){System.err.println("Normal line. definedBasesIn="+definedBasesIn);}
+ }
+ //if(verbose){System.err.print("K");}
+ }
+ tf.close();
+ //if(verbose){System.err.print("L");}
+ }
+ //if(verbose){System.err.println("M");}
+
+ chromsOut=chrom;
+
+ assert(temp.length()==0) : temp.length(); //NOTE(review): can fail if the read loop stopped on the chrom/data limit with a contig still buffered
+ printAsLines(npad2, (int)(loc%lineBreak), cout);
+ //if(verbose){System.err.println("N");}
+
+
+ cout.print("\n");
+ cout.poisonAndWait();
+ iout.poisonAndWait();
+ //if(verbose){System.err.println("O");}
+ }
+
+
+
+ /** Writes every read of at least MIN_CONTIG_TO_ADD bases as an N-padded contig of merged ">chr" records.
+ * @param in1 Input reads file
+ * @param outfile Destination for the merged sequence
+ * @param outindex Destination for the index: one "chrom\tloc\tlabel" line per read written
+ */
+ public static void mergeFastq(String in1, String outfile, String outindex) {
+ StringBuilder temp=new StringBuilder(MIN_CONTIG_TO_ADD);
+
+ FASTQ.TEST_INTERLEAVED=false;
+ FASTQ.DETECT_QUALITY=false;
+ long maxReads=-1;
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, null);
+// if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+
+
+ TextStreamWriter cout=new TextStreamWriter(outfile, overwrite, false, false);
+ TextStreamWriter iout=new TextStreamWriter(outindex, overwrite, false, false);
+
+ cout.start();
+ iout.start();
+
+ long loc=N_PAD_LENGTH2; //Running offset inside the current chrom; the leading pad is always written here
+ int chrom=1;
+ cout.print(">chr"+chrom+"\n");
+ npad=npad(N_PAD_LENGTH);
+ npad2=npad2(N_PAD_LENGTH2);
+ assert(npad.length()<=npad2.length());
+ printAsLines(npad2, 0, cout);
+
+
+ String s=null;
+ String label=null;
+
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+
+
+
+ while(reads!=null && reads.size()>0){
+ //System.err.println("reads.size()="+reads.size());
+ for(Read r : reads){
+
+ s=(r.bases==null ? "" : new String(r.bases)); //A read without bases is treated as empty and dropped by the length test below
+ label=r.id;
+
+ temp.append(s);
+
+ if(temp.length()>=MIN_CONTIG_TO_ADD){
+
+ long newloc=loc+temp.length()+N_PAD_LENGTH;
+ if(newloc>=MAX_OUTPUT_LEN){
+ //Evict old chrom
+ printAsLines(npad2, (int)(loc%lineBreak), cout);
+
+ //Make new chrom
+ chrom++;
+ loc=N_PAD_LENGTH2;
+ newloc=loc+temp.length()+N_PAD_LENGTH;
+ cout.print("\n>chr"+chrom+"\n");
+ printAsLines(npad2, 0, cout);
+ }
+
+ printAsLines(temp, (int)(loc%lineBreak), cout);
+ printAsLines(npad, (int)((loc+temp.length())%lineBreak), cout);
+ iout.println(chrom+"\t"+loc+"\t"+label);
+ loc=newloc;
+ }else{
+ // System.err.println("Ignored "+temp);
+ }
+
+ temp.setLength(0);
+ //The label always comes from r.id above, so the fasta-style header parsing
+ //(label=s.substring(1)) was dead code that threw on zero-length reads; removed.
+
+
+ }
+ //System.err.println("returning list");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+
+
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} //nextList() may have returned null
+
+ assert(temp.length()==0) : temp.length();
+ printAsLines(npad2, (int)(loc%lineBreak), cout);
+
+
+ ReadWrite.closeStream(cris);
+
+ cout.print("\n");
+ cout.poisonAndWait(); //Same shutdown as mergeFasta: do not return before the writers have finished
+ iout.poisonAndWait();
+ }
+
+ private static void printAsLines(CharSequence sb, int mod){ //Prints sb to stdout wrapped at lineBreak columns; mod = characters already on the current line
+ dataOut+=sb.length();
+ assert(mod<lineBreak);
+ if(mod>0){
+ CharSequence s=sb.subSequence(0, min(lineBreak-mod, sb.length())); //Finish the partially filled line first
+ if(s.length()+mod==lineBreak){
+ System.out.println(s);
+ }else{
+ System.out.print(s);
+ }
+ }
+
+ int loc=(mod==0 ? 0 : lineBreak-mod); //On a fresh line start at 0; previously the first lineBreak characters were skipped when mod==0
+ for(; loc<sb.length(); loc+=lineBreak){
+ CharSequence s=sb.subSequence(loc, min(loc+lineBreak, sb.length()));
+ if(s.length()==lineBreak){ //Only a complete line gets a newline
+ System.out.println(s);
+ }else{
+ System.out.print(s);
+ }
+ }
+ }
+ private static void printAsLines(CharSequence sb, int mod, TextStreamWriter cout){
+ dataOut+=sb.length();
+ assert(mod<lineBreak);
+ if(mod>0){
+
+ CharSequence s=sb.subSequence(0, min(lineBreak-mod, sb.length()));
+
+// System.out.println(mod+", "+s.length()+", "+(s.length()+mod)+", "+lineBreak);
+
+ if(s.length()+mod==lineBreak){
+ cout.println(s);
+ }else{
+ cout.print(s);
+ }
+ }
+
+ int loc=(mod==0 ? 0 : lineBreak-mod);
+ for(; loc<sb.length(); loc+=lineBreak){
+ CharSequence s=sb.subSequence(loc, min(loc+lineBreak, sb.length()));
+
+// System.out.println(mod+", "+s.length()+", "+(s.length()+mod)+", "+lineBreak+", loc="+loc);
+
+ if(s.length()==lineBreak){
+ cout.println(s);
+ }else{
+ cout.print(s);
+ }
+ }
+ }
+
+ public static String npad(int N_PAD_LENGTH){ //Returns a cached run of N_PAD_LENGTH 'N' characters
+ final String cached=npad;
+ if(cached!=null && cached.length()==N_PAD_LENGTH){return cached;}
+ //Cache miss or length change: rebuild and remember.
+ final char[] fill=new char[N_PAD_LENGTH];
+ for(int k=0; k<fill.length; k++){fill[k]='N';}
+ final String rebuilt=new String(fill);
+ npad=rebuilt;
+ return rebuilt;
+ }
+
+ public static String npad2(int N_PAD_LENGTH){ //Same as npad(int) but backed by the separate npad2 cache
+ final String cached=npad2;
+ if(cached!=null && cached.length()==N_PAD_LENGTH){return cached;}
+ //Cache miss or length change: rebuild and remember.
+ final char[] fill=new char[N_PAD_LENGTH];
+ for(int k=0; k<fill.length; k++){fill[k]='N';}
+ final String rebuilt=new String(fill);
+ npad2=rebuilt;
+ return rebuilt;
+ }
+
+ private static final int min(int x, int y){return x<y ? x : y;} //Smaller of two ints
+ private static final int max(int x, int y){return x>y ? x : y;} //Larger of two ints
+
+ static long definedBasesIn=0; //Characters appended from input sequence lines
+ static long contigsIn=0; //Header lines seen in the input
+ static long definedBasesOut=0; //Characters of contigs actually written
+ static long contigsOut=0; //Contigs actually written
+ static long chromsOut=0; //Set to the last chrom number when a fasta merge finishes
+
+ public static int lineBreak=80; //Output line width used by printAsLines
+ public static int modulo=lineBreak+1; //Not referenced in the visible part of this file
+ public static int N_PAD_LENGTH=300; //Length of the N run written after a contig
+ public static int N_PAD_LENGTH2=2000; //for ends
+ public static int MIN_CONTIG_TO_ADD=150; //Contigs shorter than this are dropped
+ public static int MAX_OUTPUT_LEN=220000000; //200M allows expansion to 262M
+ public static int maxChromsOut=60000; //mergeFasta stops reading once chrom reaches this
+ public static long maxDataOut=Long.MAX_VALUE; //mergeFasta stops reading once dataOut reaches this
+ private static long dataOut=0; //Total characters passed to printAsLines (sequence and padding)
+ public static String npad, npad2; //Caches filled by npad(int) and npad2(int)
+ public static boolean overwrite=true; //Passed to the TextStreamWriter constructors
+ public static boolean append=false; //Not referenced in the visible part of this file
+ public static boolean PAD_START=true; //Set to true to add padding to beginning.
+ public static boolean verbose=false; //Enables progress messages on stderr
+
+}
diff --git a/current/pacbio/MergeReadsAndGenome.java b/current/pacbio/MergeReadsAndGenome.java
new file mode 100755
index 0000000..1d434bc
--- /dev/null
+++ b/current/pacbio/MergeReadsAndGenome.java
@@ -0,0 +1,189 @@
+package pacbio;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentLegacyReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+import stream.SequentialReadInputStream;
+
+
+import dna.Data;
+import dna.Parser;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Dec 7, 2012
+ *
+ */
+public class MergeReadsAndGenome {
+
+
+ public static void main(String[] args){ //Writes reads generated from a genome build and/or read from input files to one output, renumbered consecutively
+ int genome=-1;
+ String in[]=null;
+ String out=null;
+ long reads=-1;
+ int readlen=300;
+ boolean overwrite=false;
+ boolean append=false;
+ int sequentialOverlap=5;
+ boolean sequentialStrandAlt=true;
+ ReadWrite.ZIPLEVEL=2;
+
+ FastaReadInputStream.TARGET_READ_LEN=250;
+ FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0);
+
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=(split.length>1 ? split[1] : "true");
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in")){
+ if("null".equalsIgnoreCase(b)){
+ //do nothing
+ }else{
+ in=b.split(",");
+ }
+ }else if(a.equals("out")){
+ out=b;
+ }else if(a.equals("build") || a.equals("genome")){
+ genome=Integer.parseInt(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ System.out.println("Set overwrite to "+overwrite);
+ }else if(a.equals("reads")){
+ reads=Tools.parseKMG(b);
+ }else if(a.equals("readlen") || a.equals("length") || a.equals("len")){
+ readlen=Integer.parseInt(b);
+ }else if(a.equals("sequentialoverlap")){
+ sequentialOverlap=Integer.parseInt(b);
+ }else if(a.equals("sequentialstrandalt")){
+ sequentialStrandAlt=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else{
+ System.err.println("Unknown parameter "+args[i]); //Was split[i], which indexed the key/value pair with the argument index
+ assert(false);
+ }
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+ if(out==null){throw new RuntimeException("No output file specified; use out=<file>.");} //Checked before out is used below
+ if(in!=null){
+ File a=new File(out);
+ for(String s : in){
+ File b=new File(s);
+ if(a.equals(b)){throw new RuntimeException("Input file may not equal output file: "+a.toString());}
+ }
+ }
+
+ TextStreamWriter tsw=new TextStreamWriter(out, overwrite, false, false);
+ tsw.start();
+
+ long id=0; //Next numeric id to assign; carried across all sources
+
+ if(genome>=0){
+ Data.setGenome(genome);
+ SequentialReadInputStream.UNLOAD=true;
+// SequentialReadInputStream.verbose=true;
+ SequentialReadInputStream ris=new SequentialReadInputStream(reads, readlen, Tools.max(50, readlen/2), sequentialOverlap, sequentialStrandAlt);
+ ConcurrentLegacyReadInputStream cris=new ConcurrentLegacyReadInputStream(ris, reads);
+ cris.start();
+ id=appendReads(cris, tsw, id);
+ ReadWrite.closeStream(cris);
+ }
+
+ if(in!=null){
+ for(String s : in){
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(s, FileFormat.FASTQ, null, true, false);
+ cris=ConcurrentReadInputStream.getReadInputStream(-1, true, ff1, null);
+ if(verbose){System.err.println("Started cris");}
+ cris.start(); //4567
+ }
+ id=appendReads(cris, tsw, id);
+ ReadWrite.closeStream(cris);
+ }
+ }
+
+ tsw.poison();
+ tsw.waitForFinish();
+ }
+
+ public static long appendReads(ConcurrentReadInputStream cris, TextStreamWriter tsw, long id){ //Prints every read and mate from cris to tsw; returns the next unused id
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+ while(reads!=null && reads.size()>0){
+
+ for(Read r : reads){
+ Read b=r.mate; //Fetched before r is modified by correctRead
+ Read a=correctRead(r, id);
+ if(a!=null){
+ tsw.println(a);
+ id++;
+ }
+ b=correctRead(b, id);
+ if(b!=null){
+ tsw.println(b);
+ id++;
+ }
+ }
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(verbose){System.err.println("Finished reading");}
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} //nextList() may have returned null, as the loop above already allows for
+ if(verbose){System.err.println("Returned list");}
+ return id;
+ }
+
+ public static Read correctRead(Read r, long id){ //Renumbers r and, when r.chrom>=1, trims terminal N's; returns null if fewer than 50 bases would remain after trimming
+ if(r==null){return null;}
+ r.numericID=id;
+ r.id=""+id;
+ if(r.chrom<1){return r;}
+
+ int startN=0;
+ int stopN=r.length()-1;
+ while(startN<r.length() && r.bases[startN]=='N'){startN++;} //startN ends as the index of the first non-N base
+ while(stopN>0 && r.bases[stopN]=='N'){stopN--;} //stopN ends as the index of the last non-N base
+ if(startN>0 || stopN<r.length()-1){
+ if(stopN-startN+1<50){return null;} //Kept span is [startN, stopN]; the old length-startN-stopN treated the index stopN as a count
+ r.bases=Arrays.copyOfRange(r.bases, startN, stopN+1);
+ if(r.quality!=null){r.quality=Arrays.copyOfRange(r.quality, startN, stopN+1);}
+ }
+ return r;
+ }
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/pacbio/PartitionFastaFile.java b/current/pacbio/PartitionFastaFile.java
new file mode 100755
index 0000000..38b00f5
--- /dev/null
+++ b/current/pacbio/PartitionFastaFile.java
@@ -0,0 +1,90 @@
+package pacbio;
+
+import dna.Timer;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 10, 2012
+ *
+ */
+public class PartitionFastaFile {
+
+
+ public static void main(String[] args){ //Usage: infile outfile(with '#') partitionSize; a 5th argument, if present, sets maxDataOut
+
+ Timer t=new Timer();
+ String infile=args[0];
+ String outfile=args[1];
+ assert(!infile.equalsIgnoreCase(outfile));
+ assert(outfile.contains("#"));
+ long partition=Long.parseLong(args[2]); //Parsed as long to match the variable; Integer.parseInt rejected sizes above 2^31-1
+ if(args.length>4){maxDataOut=Long.parseLong(args[4]);} //NOTE(review): args[3] is never read - confirm the index is intended
+
+ if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;}
+
+ TextFile tf=new TextFile(infile, false, false);
+ split(tf, outfile, partition);
+ tf.close(); //split may stop early on maxDataOut, so the input is closed here
+ t.stop();
+ System.out.println("Time:\t"+t);
+
+ }
+
+
+
+ /** Copies a fasta file into numbered partitions, switching files at the first header seen once the current one holds at least partition bases.
+ * @param tf Open input file, read line by line
+ * @param outfile Output name pattern; "#" is replaced by the partition number, starting at 1
+ * @param partition Sequence characters to accumulate before the next header line opens a new file
+ */
+ public static void split(TextFile tf, String outfile, long partition) {
+ long currentBases=0; //Sequence characters written to the current partition
+ int pnum=1;
+
+ TextStreamWriter tsw=new TextStreamWriter(outfile.replace("#", ""+pnum), true, false, false);
+ tsw.start();
+
+ String s;
+ for(s=tf.nextLine(); s!=null && dataOut<maxDataOut; s=tf.nextLine()){
+ if(s.charAt(0)=='>'){
+ if(currentBases>=partition){ //Only split on a header so a record is never cut in two
+ System.out.println("Ended partition "+pnum+" at "+currentBases);
+ currentBases=0;
+ pnum++;
+ tsw.poison();
+ tsw=new TextStreamWriter(outfile.replace("#", ""+pnum), true, false, false);
+ tsw.start();
+ }
+ }else{
+ int x=s.length();
+ currentBases+=x;
+ dataOut+=x;
+ }
+ tsw.println(s); //Every line, header or sequence, goes to the current partition
+ }
+ System.out.println("Ended partition "+pnum+" at "+currentBases);
+ System.out.println("Total: "+dataOut);
+ System.out.println("Avg: "+(dataOut)/pnum);
+// System.out.println("\n"+s+"\n"+dataOut+"\n"+maxDataOut);
+
+ try {
+ synchronized(tsw){
+ tsw.wait(100); //NOTE(review): fixed 100 ms pause before poisoning; purpose is not evident from this file
+ }
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ tsw.poison();
+ }
+
+ public static int MIN_CONTIG_TO_ADD=150; //Not currently used
+ public static long MAX_OUTPUT_LEN=200000000000L;
+ public static long maxDataOut=Long.MAX_VALUE;
+ private static long dataOut=0;
+
+}
diff --git a/current/pacbio/PartitionReads.java b/current/pacbio/PartitionReads.java
new file mode 100755
index 0000000..cdfb1ee
--- /dev/null
+++ b/current/pacbio/PartitionReads.java
@@ -0,0 +1,225 @@
+package pacbio;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Tools;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 15, 2012
+ *
+ */
+public class PartitionReads {
+
+ public static void main(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+
+
+ Timer t=new Timer();
+
+ boolean verbose=false;
+ int ziplevel=-1;
+
+ String in1=null;
+ String in2=null;
+ long maxReads=-1;
+
+ String outname1=null;
+ String outname2=null;
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : "true";
+ if("null".equalsIgnoreCase(b)){b=null;}
+// System.err.println("Processing "+args[i]);
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){
+ Data.setPath(b);
+ }else if(a.equals("fasta") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+ in1=b;
+ if(b.indexOf('#')>-1){
+ in1=b.replace("#", "1");
+ in2=b.replace("#", "2");
+ }
+ }else if(a.equals("in2") || a.equals("input2")){
+ in2=b;
+ }else if(a.startsWith("partition")){
+ partitions=Integer.parseInt(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ System.out.println("Set overwrite to "+overwrite);
+ }else if(a.equals("reads") || a.equals("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("out") || a.equals("out1")){
+ if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){
+ System.out.println("No output file.");
+ outname1=null;
+ }else{
+ outname1=b;
+ assert(!outname1.equalsIgnoreCase(outname2));
+ }
+ }else if(a.equals("out2")){
+ if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){
+ outname2=null;
+ }else{
+ outname2=b;
+ assert(!outname2.equalsIgnoreCase(outname1));
+ }
+ }else if(a.startsWith("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else{
+ throw new RuntimeException("Unknown parameter: "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+ assert(outname1==null || outname1.indexOf('#')>=0 || partitions<2);
+ assert(outname2==null || outname2.indexOf('#')>=0 || partitions<2);
+ assert(outname1==null || !outname1.equalsIgnoreCase(outname2));
+
+ if(in1==null){throw new RuntimeException("Please specify input file.");}
+
+
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+ if(verbose){System.err.println("Started cris");}
+// cris.start(); //4567
+// th.start();
+ }
+
+
+ TextStreamWriter[] tsw1=new TextStreamWriter[partitions];
+ TextStreamWriter[] tsw2=new TextStreamWriter[partitions];
+
+ FileFormat ff=FileFormat.testOutput(outname1, FileFormat.FASTQ, null, true, overwrite, append, false);
+ fastq=ff.fastq();
+ fasta=ff.fasta();
+ bread=ff.bread();
+
+ for(int i=0; i<partitions; i++){
+ tsw1[i]=new TextStreamWriter(outname1.replaceFirst("#", ""+i), overwrite, false, true);
+ if(outname2!=null){
+ tsw2[i]=new TextStreamWriter(outname2.replaceFirst("#", ""+i), overwrite, false, true);
+ }
+ }
+
+ long reads=process(tsw1, tsw2, cris);
+ t.stop();
+ System.out.println("Reads: \t"+reads);
+ System.out.println("Time: \t"+t);
+ }
+
+ public static long process(TextStreamWriter[] tsw1, TextStreamWriter[] tsw2, ConcurrentReadInputStream cris){ //Deals reads round-robin over the writers; returns the number of reads (pairs count once)
+ for(TextStreamWriter tsw : tsw1){if(tsw!=null){tsw.start();}}
+ for(TextStreamWriter tsw : tsw2){if(tsw!=null){tsw.start();}}
+ cris.start(); //4567
+
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> readlist=(ln!=null ? ln.list : null);
+
+ final boolean paired=cris.paired();
+
+ long x=0; //Reads handled so far; x%div selects the partition
+ final int div=tsw1.length;
+ while(readlist!=null && !readlist.isEmpty()){
+
+ //System.err.println("Got a list of size "+readlist.size());
+ for(int i=0; i<readlist.size(); i++){
+ Read r=readlist.get(i);
+ if(r!=null){
+ final Read r2=r.mate;
+ final int mod=(int)(x%div);
+
+ StringBuilder a=null, b=null;
+
+ if(fastq){
+ a=r.toFastq();
+ if(paired){b=r2.toFastq();}
+ }else if(fasta){
+ a=r.toFasta();
+ if(paired){b=r2.toFasta();}
+ }else if(bread){
+ a=r.toText(true);
+ if(paired){b=(r2==null ? new StringBuilder(".") : r2.toText(true));}
+ }else{
+ throw new RuntimeException("Unsupported output format.");
+ }
+
+ a.append('\n');
+ tsw1[mod].print(a);
+ if(paired){
+ b.append('\n');
+ if(tsw2[mod]!=null){tsw2[mod].print(b);} //The mate goes to the same partition as read 1; this was indexed by the list position i
+ else{tsw1[mod].print(b);} //No second output: interleave into the read-1 file
+ }
+
+ x++;
+ }
+ }
+
+ cris.returnList(ln.id, readlist.isEmpty());
+
+ //System.err.println("Waiting on a list...");
+ ln=cris.nextList();
+ readlist=(ln!=null ? ln.list : null);
+ }
+
+ //System.err.println("Returning a list... (final)");
+ assert(readlist==null || readlist.isEmpty());
+ if(ln!=null){cris.returnList(ln.id, readlist==null || readlist.isEmpty());}
+
+
+ for(TextStreamWriter tsw : tsw1){if(tsw!=null){tsw.poison();}}
+ for(TextStreamWriter tsw : tsw2){if(tsw!=null){tsw.poison();}}
+ ReadWrite.closeStream(cris);
+ return x;
+ }
+
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+ public static int partitions=2; //Number of output files per mate; set by any argument starting with "partition"
+ public static boolean fastq=false; //Output format flags, set in main from FileFormat.testOutput on out1
+ public static boolean fasta=false;
+ public static boolean bread=false;
+
+}
diff --git a/current/pacbio/ProcessStackedSitesNormalized.java b/current/pacbio/ProcessStackedSitesNormalized.java
new file mode 100755
index 0000000..ce3a52f
--- /dev/null
+++ b/current/pacbio/ProcessStackedSitesNormalized.java
@@ -0,0 +1,499 @@
+package pacbio;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import stream.SiteScoreR;
+
+
+import dna.Gene;
+import dna.Timer;
+
+import align2.Tools;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 18, 2012
+ *
+ */
+public class ProcessStackedSitesNormalized {
+
+ public static void main(String[] args){ //args[0]=input, args[1]=output, remaining args are key=value overrides of the static settings
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ Timer t=new Timer();
+
+ String infile=args[0];
+ String outfile=args[1];
+
+ for(int i=2; i<args.length; i++){
+ String[] split=args[i].toLowerCase().split("=");
+ String a=split[0];
+ String b=(split.length>1 ? split[1] : null); //null when the argument has no '='
+
+ if(a.equals("scorethresh")){
+ SCORE_THRESH=Float.parseFloat(b);
+ }else if(a.equals("interval")){
+ INTERVAL=Integer.parseInt(b);
+ }else if(a.equals("minsitestodiscard")){
+ MIN_SITES_TO_DISCARD=Integer.parseInt(b);
+ }else if(a.equals("minlength")){
+ MIN_LENGTH_TO_RETAIN=Integer.parseInt(b);
+ }else if(a.equals("retainall")){
+ RETAIN_ALL=Tools.parseBoolean(b);
+ if(RETAIN_ALL){MIN_VOTES_TO_RETAIN=0;} //With no vote threshold every site passes the filter in Buffer.write
+ }else if(a.equals("fractiontoretain1")){
+ FRACTION_TO_RETAIN1=Float.parseFloat(b);
+ }else if(a.equals("fractiontoretain2")){
+ FRACTION_TO_RETAIN2=Float.parseFloat(b);
+ }else if(a.equals("centerweight")){
+ CENTER_WEIGHT=Float.parseFloat(b);
+ }else if(a.equals("sitestoretain1")){
+ SITES_TO_RETAIN1=Integer.parseInt(b);
+ }else if(a.equals("sitestoretain2")){
+ SITES_TO_RETAIN2=Integer.parseInt(b);
+ }else if(a.equals("minvotestoretain")){
+ MIN_VOTES_TO_RETAIN=Integer.parseInt(b);
+ }else if(a.equals("mindistfromreadends")){
+// MIN_DIST_FROM_READ_ENDS=Integer.parseInt(b);
+// throw new RuntimeException("Deprecated - use minfractionfromreadends instead.");
+ int x=Integer.parseInt(b);
+ float f=x/((150-INTERVAL)*.5f); //NOTE(review): 150 looks like an assumed read length - confirm
+ System.err.println("Warning - mindistfromreadends is deprecated. Setting minfractionfromreadends = "+String.format("%.3f",f));
+ MIN_FRACTION_FROM_READ_ENDS=f;
+ }else if(a.equals("minfractionfromreadends")){
+ MIN_FRACTION_FROM_READ_ENDS=Float.parseFloat(b);
+ }else{
+ assert(false) : "Unknown parameter "+a; //Only reported when assertions are enabled
+ }
+ }
+
+ process(infile, outfile);
+
+ System.out.println("Sites In:\t"+sitesIn+" \t"+String.format("%.3f%% correct",correctIn*100d/sitesIn));
+ System.out.println("Sites Out:\t"+sitesOut+" \t"+String.format("%.3f%% correct",correctOut*100d/sitesOut));
+ t.stop();
+ System.out.println("Time: \t"+t);
+ }
+
+ /** Walks the input in steps of INTERVAL, voting on the sites that span each interval; the Buffer writes sites out as they are evicted.
+ * @param infile Input read by the Buffer, one tab-delimited line of sites at a time
+ * @param outfile Output for the sites that reach MIN_VOTES_TO_RETAIN
+ */
+ public static void process(String infile, String outfile) {
+
+ Buffer buffer=new Buffer(3, infile, outfile);
+
+ int chrom=buffer.chrom;
+ int start=buffer.min;
+ int stop=buffer.min+INTERVAL-1;
+
+ assert(buffer.array[0]!=null);
+ while(buffer.array[0]!=null){ //array[0] stays null once a refill after purge finds no more input
+
+ processInterval(buffer, chrom, start, stop);
+
+ start+=INTERVAL;
+ stop+=INTERVAL;
+ boolean success=buffer.advanceToInterval(start, stop, chrom);
+ if(!success){ //Buffer no longer overlaps the requested interval: restart from wherever it is now
+ chrom=buffer.chrom;
+ start=buffer.min;
+ stop=start+INTERVAL-1;
+ }
+ }
+ buffer.close();
+ }
+
+ private static void processInterval(Buffer buffer, int chrom, int start, int stop){ //Scores the buffered sites that pass the interval test and votes per strand; chrom is not used in the body
+
+ ArrayList<SiteScoreR> plus=new ArrayList<SiteScoreR>();
+ ArrayList<SiteScoreR> minus=new ArrayList<SiteScoreR>();
+
+ for(Ssra ssra : buffer.array){ //NOTE(review): entries can be null when fewer lines than the buffer size are loaded - confirm ssra.min is safe here
+// if(Tools.isWithin(start-MIN_DIST_FROM_READ_ENDS, stop+MIN_DIST_FROM_READ_ENDS, ssra.min, ssra.max)){
+ if(Tools.isWithin(start, stop, ssra.min, ssra.max)){
+ for(SiteScoreR ssr : ssra.array){
+
+ int x=(int)((((ssr.stop-ssr.start+1)-INTERVAL)/2)*MIN_FRACTION_FROM_READ_ENDS); //Margin trimmed from both ends of the site before the interval test
+ if(x<0){x=0;}
+
+ if(ssr.readlen>=MIN_LENGTH_TO_RETAIN){
+ if(Tools.isWithin(start, stop, ssr.start+x, ssr.stop-x)){
+ ssr.normalizedScore=normalizedScore(ssr, Tools.min(start-ssr.start, ssr.stop-stop)); //endDist = smaller gap between interval and site ends
+ if(ssr.strand==Gene.PLUS){
+ plus.add(ssr);
+ }else{
+ minus.add(ssr);
+ }
+ }
+ }
+
+ }
+ }
+ }
+ markRetain(plus); //Strands are ranked independently
+ markRetain(minus);
+
+ }
+
+// private static final int markRetain_old(ArrayList<SiteScoreR> list){
+//// Collections.sort(list, SiteScoreR.NCOMP);
+// assert(list.size()<2 || list.get(0).normalizedScore>=list.get(1).normalizedScore) : list.get(0)+"\t"+list.get(1);
+//
+// int sites=list.size()-MIN_SITES_TO_DISCARD; //Always ignore worst site(s).
+//
+// int retain=(int)(sites*FRACTION_TO_RETAIN1);
+// if(retain>SITES_TO_RETAIN1){
+// int temp=(int)((retain-SITES_TO_RETAIN1)*FRACTION_TO_RETAIN2);
+//// System.out.println("sites="+sites+", retain="+retain+", temp="+temp);
+// retain=SITES_TO_RETAIN1+temp;
+// }
+// retain=Tools.min(retain, SITES_TO_RETAIN2);
+//// System.out.println("retain2="+retain);
+//
+//// for(int i=0; i<retain; i++){
+//// list.get(i).retainVotes++;
+//// }
+// Collections.sort(list);
+//
+// final SiteScoreR best=(list!=null && list.size()>0 ? list.get(0) : null);
+// for(int i=0; i<retain; i++){
+// final SiteScoreR b=list.get(i);
+// if(i>0){
+//// SiteScoreR a=list.get(i-1);
+//// if(a.score-b.score>a.score*0.03f){break;}
+// if(best.score-b.score>best.score*0.034f){break;}
+// }
+//
+// if(i==0){
+// b.retainVotes+=5;
+// }else if(i<3){
+// b.retainVotes+=3;
+// }else if(i<6){
+// b.retainVotes+=2;
+// }else{
+// b.retainVotes++;
+// }
+// }
+//
+// return retain;
+// }
+
+ private static final int markRetain(ArrayList<SiteScoreR> list){ //Sorts list with SiteScoreR.NCOMP and adds retainVotes to its leading entries; returns the retain limit computed below
+// Collections.sort(list, SiteScoreR.NCOMP);
+// assert(list.size()<2 || list.get(0).normalizedScore>=list.get(1).normalizedScore) : list.get(0)+"\t"+list.get(1);
+
+ int sites=list.size()-MIN_SITES_TO_DISCARD; //Always ignore worst site(s).
+
+ int retain=(int)(sites*FRACTION_TO_RETAIN1);
+ if(retain>SITES_TO_RETAIN1){ //Beyond SITES_TO_RETAIN1 only a FRACTION_TO_RETAIN2 share of the excess is kept
+ int temp=(int)((retain-SITES_TO_RETAIN1)*FRACTION_TO_RETAIN2);
+// System.out.println("sites="+sites+", retain="+retain+", temp="+temp);
+ retain=SITES_TO_RETAIN1+temp;
+ }
+ retain=Tools.min(retain, SITES_TO_RETAIN2); //Hard cap
+
+ if(RETAIN_ALL){retain=sites;}
+
+// System.out.println("retain2="+retain);
+
+// for(int i=0; i<retain; i++){
+// list.get(i).retainVotes++;
+// }
+ Collections.sort(list, SiteScoreR.NCOMP);
+// assert(false) : SCORE_THRESH;
+ final SiteScoreR best=(list!=null && list.size()>0 ? list.get(0) : null);
+ for(int i=0; i<retain; i++){
+ final SiteScoreR b=list.get(i);
+ if(i>0){
+// SiteScoreR a=list.get(i-1);
+// if(a.score-b.score>a.score*0.03f){break;}
+ if(!RETAIN_ALL && best.score-b.score>best.score*SCORE_THRESH){break;} //Stop at the first site whose raw score falls too far below the top entry
+ }
+
+ if(i==0){ //Votes shrink with rank: 5, then 3 (ranks 1-3), 2 (ranks 4-7), 1
+ b.retainVotes+=5;
+ }else if(i<4){
+ b.retainVotes+=3;
+ }else if(i<8){
+ b.retainVotes+=2;
+ }else{
+ b.retainVotes++;
+ }
+ }
+
+ return retain;
+ }
+
+ public static Ssra toSrar(String s){ //Parses one tab-delimited line of SiteScoreR text into an Ssra carrying the line's chrom, location range and score range
+ String[] split=s.split("\t");
+ SiteScoreR[] scores=new SiteScoreR[split.length];
+ int min=Integer.MAX_VALUE;
+ int max=Integer.MIN_VALUE;
+ int worst=Integer.MAX_VALUE;
+ int best=Integer.MIN_VALUE;
+ int chrom=-1;
+
+ for(int i=0; i<split.length; i++){
+ SiteScoreR ssr=scores[i]=SiteScoreR.fromText(split[i]);
+
+// int dif=ssr.readlen-ssr.reflen(); //Positive for insertions, negative for deletions
+// float modifier=dif/(float)(ssr.readlen*4);
+// if(modifier<lim2){modifier=lim2;}
+// if(modifier>lim1){modifier=lim1;}
+// ssr.normalizedScore=(int)ssr.score*(1+modifier);
+
+
+ min=Tools.min(min, ssr.start);
+ max=Tools.max(max, ssr.stop);
+ worst=Tools.min(worst, ssr.score);
+ best=Tools.max(best, ssr.score);
+ assert(chrom==-1 || chrom==ssr.chrom); //All sites on a line must share one chrom
+ chrom=ssr.chrom;
+ }
+ Ssra ssra=new Ssra(scores, chrom, min, max, best, worst);
+ return ssra;
+ }
+
+ public static float normalizedScore(SiteScoreR ssr, int endDist){ //Returns ssr.score scaled by a bounded indel modifier plus a centering bonus proportional to endDist
+ final float lim1=0.008f;
+ final float lim2=-lim1;
+
+
+ int dif=ssr.readlen-ssr.reflen(); //Positive for insertions, negative for deletions
+ float modifier=dif/(float)(ssr.readlen*4); //Prioritize reads with insertions over deletions, to correct for scoring bias
+ if(modifier<lim2){modifier=lim2;}
+ if(modifier>lim1){modifier=lim1;}
+
+ int maxEndDist=(ssr.reflen()-INTERVAL)/2;
+// float modifier2=(0.03f*endDist)/maxEndDist;
+ float modifier2=(maxEndDist==0 ? 0f : CENTER_WEIGHT*endDist/(float)maxEndDist); //Prioritize reads centered on this interval; no bonus instead of NaN/Infinity when (reflen-INTERVAL)/2 is 0
+
+ float f=ssr.score*(1+modifier+modifier2);
+ return f;
+ }
+
+ /** Finds highest score of ssr's fully covering this site */
+ public static int maxScore(Ssra ssra, final int min, final int max){
+ assert(Tools.overlap(min, max, ssra.min, ssra.max));
+ assert(Tools.isWithin(min, max, ssra.min, ssra.max));
+
+ int best=-1; //Returned unchanged when no site qualifies
+ for(SiteScoreR ssr : ssra.array){
+ if(ssr.start>min){break;} //Relies on the array being sorted by start, ascending
+ if(max>=ssr.stop){ //NOTE(review): covering [min,max] would need ssr.stop>=max; this comparison looks inverted - confirm
+ best=Tools.max(best, ssr.score);
+ if(best>=ssra.best){break;} //Cannot beat the line's top score
+ }
+ }
+ return best;
+ }
+
+ public static class Ssra{ //One parsed input line: an array of SiteScoreR plus the summary values toSrar computes for it
+
+ public Ssra(){}
+
+ public Ssra(SiteScoreR[] array_, int chrom_, int min_, int max_, int best_, int worst_){
+ array=array_;
+ chrom=chrom_;
+ min=min_;
+ max=max_;
+ best=best_;
+ worst=worst_;
+ }
+
+ /** SiteScoreR array sorted by start loc, ascending */
+ SiteScoreR[] array;
+ /** All contents must have same chromosome / contig */
+ int chrom;
+ /** Minimum location in array */
+ int min;
+ /** Maximum location in array */
+ int max;
+ /** Top score in array */
+ int best;
+ /** Bottom score in array */
+ int worst;
+
+ }
+
+ public static class Buffer{ //Fixed-size window of consecutive input lines from one chrom; lines are filtered and written out when they leave the window
+
+ public Buffer(int size, String infname_, String outfname_){
+ assert(!infname_.equalsIgnoreCase(outfname_)) : infname_+" == "+outfname_; //Not a complete test
+ array=new Ssra[size];
+ infname=infname_;
+ outfname=outfname_;
+ tf=new TextFile(infname, true, false);
+ tsw=new TextStreamWriter(outfname, true, false, true);
+ tsw.start();
+ nextSsra=read(); //Always one line of lookahead
+ fill();
+
+ }
+
+ public Ssra read(){ //Parses the next input line, or closes the file and returns null at end of input
+ String s=tf.nextLine();
+ if(s==null){
+ tf.close();
+ return null;
+ }
+ Ssra ssra=toSrar(s);
+ sitesIn+=ssra.array.length;
+ return ssra;
+ }
+
+ private boolean advance(){ //Slides the window by one line, writing whatever falls out; false when no lookahead line is left
+ if(nextSsra==null){return false;}
+
+ Ssra old=add(nextSsra);
+ nextSsra=read();
+ if(old!=null){write(old);}
+ return true;
+ }
+
+ /** Starting with an empty array, fill with next chrom */
+ private boolean fill(){
+ assert(array[0]==null);
+ if(nextSsra==null){return false;}
+ int c=nextSsra.chrom;
+ for(int i=0; i<array.length && nextSsra!=null && c==nextSsra.chrom; i++){ //Stops early at a chrom change, leaving trailing slots null
+ array[i]=nextSsra;
+ nextSsra=read();
+ }
+ setLimits();
+ return true;
+ }
+
+ public boolean advanceToInterval(final int a, final int b, final int c){ //Moves the window toward interval [a,b] on chrom c; true if the window ends on chrom c overlapping [a,b]
+
+ while(chrom<c || (chrom==c && max<a)){ //Whole window lies before the target: flush it and reload
+ purge();
+ boolean success=fill();
+ if(!success){return false;}
+ }
+
+ assert(array[0]!=null && chrom>=c);
+// if(chrom>c || min>b){return false;} //Went past target
+
+ while(array[0].max<a && nextSsra!=null && nextSsra.chrom==c){ //Oldest line ends before the target: slide one line at a time
+ advance();
+ }
+
+ return chrom==c && Tools.overlap(a, b, min, max);
+ }
+
+ private void purge() { //Writes and clears every buffered line
+ for(int i=0; i<array.length; i++){
+ Ssra ssra=array[i];
+ if(ssra!=null){write(ssra);}
+ array[i]=null;
+ }
+ }
+
+ private void write(Ssra ssra) { //Emits the sites with enough retainVotes as one tab-delimited line and updates the in/out counters
+ String tab="";
+ StringBuilder sb=new StringBuilder(ssra.array.length*48);
+
+ final long sitesOut_0=sitesOut;
+ for(SiteScoreR ssr : ssra.array){
+
+// if(ssr.weight>0){
+// ssr.normalizedScore/=ssr.weight;
+// }
+
+ if(ssr.correct){correctIn++;}
+ if(ssr.retainVotes>=MIN_VOTES_TO_RETAIN){
+ sitesOut++;
+ if(ssr.correct){correctOut++;}
+ sb.append(tab);
+ sb.append(ssr.toText());
+ tab="\t";
+ }
+ }
+ if(sitesOut_0==sitesOut){return;} //Nothing retained: no output line at all
+ sb.append('\n');
+ tsw.print(sb);
+ }
+
+ public Ssra add(Ssra s){ //Appends s to the window; returns the evicted oldest line when the window was full, else null
+
+ assert(array[0]==null || array[0].chrom==s.chrom);
+
+ Ssra r=null;
+ if(array[array.length-1]==null){
+ //insert in first null loc
+ for(int i=0; i<array.length; i++){
+ if(array[i]==null){
+ array[i]=s;
+ break;
+ }
+ }
+ }else{
+ r=array[0];
+ for(int i=1; i<array.length; i++){
+ array[i-1]=array[i];
+ }
+ array[array.length-1]=s;
+ }
+
+ setLimits();
+
+ return r;
+ }
+
+ private void setLimits(){ //Recomputes chrom, min and max from the buffered lines; requires array[0] to be non-null
+ max=Integer.MIN_VALUE;
+ min=Integer.MAX_VALUE;
+ chrom=array[0].chrom;
+ for(int i=0; i<array.length; i++){
+ if(array[i]!=null){
+ min=Tools.min(min, array[i].min);
+ max=Tools.max(max, array[i].max);
+ }
+ }
+ }
+
+ public void close(){ //Flushes the window and all remaining input to the output, then shuts down reader and writer
+ purge();
+ while(fill()){purge();}
+ tf.close();
+ tsw.poison();
+ }
+
+ public int max=-1; //Largest location among buffered lines
+ public int min=-1; //Smallest location among buffered lines
+ public int chrom=-1; //Chrom of array[0]
+
+ public final Ssra[] array;
+ private Ssra nextSsra; //Lookahead line, not yet in the window; null at end of input
+ public final String infname;
+ public final String outfname;
+ private TextFile tf;
+ private TextStreamWriter tsw;
+
+ }
+
+ public static int MIN_LENGTH_TO_RETAIN=0; //Sites with readlen below this never receive votes
+ public static boolean RETAIN_ALL=false; //Disables the retain caps and the score cutoff in markRetain
+
+ public static long sitesIn=0; //Sites parsed from the input
+ public static long correctIn=0; //Sites flagged correct among those passed to Buffer.write
+ public static long sitesOut=0; //Sites written to the output
+ public static long correctOut=0; //Sites flagged correct among those written
+ public static float FRACTION_TO_RETAIN1=0.75f; //Share of an interval's sites eligible for votes
+ public static float FRACTION_TO_RETAIN2=0.3f; //Share of the excess over SITES_TO_RETAIN1 that stays eligible
+ public static int MIN_SITES_TO_DISCARD=0; //Subtracted from the site count before the shares are applied
+ public static int SITES_TO_RETAIN1=8; //Eligible count above which FRACTION_TO_RETAIN2 applies
+ public static int SITES_TO_RETAIN2=16; //Hard cap on eligible sites per interval and strand
+ public static int MIN_VOTES_TO_RETAIN=5; //Votes a site needs to be written
+// public static int MIN_DIST_FROM_READ_ENDS=25;
+ public static float MIN_FRACTION_FROM_READ_ENDS=0.35f; //Scales the end margin computed in processInterval
+ public static float SCORE_THRESH=0.034f; //Allowed relative score drop below the top-ranked site
+ public static float CENTER_WEIGHT=0.015f; //Weight of the centering bonus in normalizedScore
+ public static int INTERVAL=12; //Step and width of the intervals walked by process
+
+}
diff --git a/current/pacbio/RemoveAdapters2.java b/current/pacbio/RemoveAdapters2.java
new file mode 100755
index 0000000..9d96de9
--- /dev/null
+++ b/current/pacbio/RemoveAdapters2.java
@@ -0,0 +1,665 @@
+package pacbio;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import align2.ListNum;
+import align2.MultiStateAligner9PacBioAdapter;
+import align2.MultiStateAligner9PacBioAdapter2;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+/**
+ * Increased sensitivity to nearby adapters.
+ * @author Brian Bushnell
+ * @date Nov 5, 2012
+ *
+ */
+public class RemoveAdapters2 {
+
+ public static void main(String[] args){
+     System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+     //Entry point. Parses key=value arguments, opens the read input stream (and an output
+     //stream when an output name was given), then hands both to process().
+     //A value of "null" (any case) is converted to a Java null before the key is examined.
+     Timer t=new Timer();
+
+     boolean verbose=false;
+
+     String in1=null;
+     String in2=null;
+     long maxReads=-1;
+
+     String outname1=null;
+     String outname2=null;
+
+     String query=pacbioAdapter;
+     Shared.READ_BUFFER_LENGTH=Tools.min(Shared.READ_BUFFER_LENGTH, 20);
+
+     boolean splitReads=false;
+
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : "true";
+         if("null".equalsIgnoreCase(b)){b=null;}
+//       System.err.println("Processing "+args[i]);
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseFasta(arg, a, b)){
+             //do nothing
+         }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){
+             Data.setPath(b);
+         }else if(a.equals("fasta") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+             in1=b;
+             if(b!=null && b.indexOf('#')>-1){ //b is null for "in=null"; in1 then stays null and is rejected below
+                 in1=b.replace("#", "1");
+                 in2=b.replace("#", "2");
+             }
+         }else if(a.equals("in2") || a.equals("input2")){
+             in2=b;
+         }else if(a.equals("query") || a.equals("adapter")){
+             query=b;
+         }else if(a.equals("split")){
+             splitReads=Tools.parseBoolean(b);
+         }else if(a.equals("plusonly")){
+             boolean x=Tools.parseBoolean(b);
+             if(x){TRY_PLUS=true; TRY_MINUS=false;}
+         }else if(a.equals("minusonly")){
+             boolean x=Tools.parseBoolean(b);
+             if(x){TRY_PLUS=false; TRY_MINUS=true;}
+         }else if(a.startsWith("mincontig")){
+             minContig=Integer.parseInt(b);
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+             System.out.println("Set overwrite to "+overwrite);
+         }else if(a.equals("threads") || a.equals("t")){
+             if("auto".equalsIgnoreCase(b)){THREADS=Data.LOGICAL_PROCESSORS;} //Constant first: no NPE when b is null
+             else{THREADS=Integer.parseInt(b);}
+             System.out.println("Set threads to "+THREADS);
+         }else if(a.equals("reads") || a.equals("maxreads")){
+             maxReads=Tools.parseKMG(b);
+         }else if(a.startsWith("outname") || a.startsWith("outfile") || a.equals("out")){
+             if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){
+                 System.out.println("No output file.");
+                 outname1=null;
+                 OUTPUT_READS=false;
+             }else{
+                 OUTPUT_READS=true;
+                 if(b.indexOf('#')>-1){
+                     outname1=b.replace('#', '1');
+                     outname2=b.replace('#', '2');
+                 }else{
+                     outname1=b;
+                 }
+             }
+         }else if(a.equals("minratio")){
+             MINIMUM_ALIGNMENT_SCORE_RATIO=Float.parseFloat(b);
+             System.out.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+MINIMUM_ALIGNMENT_SCORE_RATIO);
+         }else if(a.equals("suspectratio")){
+             SUSPECT_RATIO=Float.parseFloat(b);
+         }else if(a.startsWith("verbose")){
+             verbose=Tools.parseBoolean(b);
+         }else{
+             throw new RuntimeException("Unknown parameter: "+args[i]);
+         }
+     }
+
+     {//Process parser fields
+         Parser.processQuality();
+     }
+
+     assert(FastaReadInputStream.settingsOK());
+     if(in1==null){throw new RuntimeException("Please specify input file.");}
+     if(query==null){throw new RuntimeException("Please specify an adapter sequence.");} //e.g. "query=null"
+
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+//       if(verbose){System.err.println("Started cris");}
+//       cris.start(); //4567
+//       th.start();
+     }
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     ConcurrentReadOutputStream ros=null;
+     if(OUTPUT_READS){
+         final int buff=(!OUTPUT_ORDERED_READS ? THREADS : Tools.max(24, 2*THREADS));
+
+         FileFormat ff1=FileFormat.testOutput(outname1, FileFormat.FASTQ, null, true, overwrite, append, OUTPUT_ORDERED_READS);
+         FileFormat ff2=FileFormat.testOutput(outname2, FileFormat.FASTQ, null, true, overwrite, append, OUTPUT_ORDERED_READS);
+         ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
+     }
+     process(cris, ros, query, splitReads);
+ }
+
+ public static void process(ConcurrentReadInputStream cris, ConcurrentReadOutputStream ros, String query, boolean split){
+     //Runs the whole job: starts the streams, fans the work out to THREADS
+     //ProcessThreads, waits for each of them, closes the streams, prints a report.
+     final Timer clock=new Timer();
+
+     cris.start(); //4567
+     System.out.println("Started read stream.");
+
+     if(ros!=null){
+         ros.start();
+         System.out.println("Started output threads.");
+     }
+
+     //Each worker builds its own aligners; all of them share the two streams.
+     final ProcessThread[] workers=new ProcessThread[THREADS];
+     for(int w=0; w<workers.length; w++){
+         final ProcessThread worker=new ProcessThread(cris, ros, MINIMUM_ALIGNMENT_SCORE_RATIO, query, split);
+         workers[w]=worker;
+         worker.start();
+     }
+     final String plural=(workers.length==1 ? "" : "s");
+     System.out.println("Started "+workers.length+" processing thread"+plural+".");
+
+     //Wait for the workers in index order, reporting each index once it has terminated.
+     for(int w=0; w<workers.length; w++){
+         final ProcessThread worker=workers[w];
+         synchronized(worker){
+             while(worker.getState()!=Thread.State.TERMINATED){
+                 try{
+                     worker.join();
+                 }catch(InterruptedException e){
+                     //An interrupt is only logged; the loop then resumes waiting.
+                     e.printStackTrace();
+                 }
+             }
+         }
+         System.out.print(w==0 ? "Detecting finished threads: 0" : ", "+w);
+     }
+     System.out.println('\n');
+     ReadWrite.closeStreams(cris, ros);
+
+     printStatistics(workers);
+
+     clock.stop();
+     System.out.println("Time: \t"+clock);
+
+ }
+
+ public static void printStatistics(ProcessThread[] pts){
+     //Adds up the counters of all worker threads and prints the run summary to stdout.
+     long plusAdaptersFound=0;
+     long minusAdaptersFound=0;
+     long goodReadsFound=0;
+     long badReadsFound=0;
+
+     long truepositive=0;
+     long truenegative=0;
+     long falsepositive=0;
+     long falsenegative=0;
+     long expected=0;
+     long unexpected=0;
+     long basesIn=0;
+     long basesOut=0;
+     long readsOut=0;
+
+     for(ProcessThread pt : pts){
+         plusAdaptersFound+=pt.plusAdaptersFound;
+         minusAdaptersFound+=pt.minusAdaptersFound;
+         goodReadsFound+=pt.goodReadsFound;
+         badReadsFound+=pt.badReadsFound;
+         basesIn+=pt.basesIn;
+         basesOut+=pt.basesOut;
+         readsOut+=pt.readsOut;
+
+         truepositive+=pt.truepositive;
+         truenegative+=pt.truenegative;
+         falsepositive+=pt.falsepositive;
+         falsenegative+=pt.falsenegative;
+         expected+=pt.expected;
+         unexpected+=pt.unexpected;
+     }
+
+     final long totalReads=goodReadsFound+badReadsFound;
+     final long totalAdapters=plusAdaptersFound+minusAdaptersFound;
+     final long expectedDenom=Math.max(1, expected); //Denominators are clamped to 1; the counts
+     final long unexpectedDenom=Math.max(1, unexpected); //themselves are reported unmodified.
+     final long avgLenIn=(totalReads>0 ? basesIn/totalReads : 0); //Long division would throw when no read was counted.
+     System.out.println("Reads In: \t"+totalReads+" \t("+basesIn+" bases, avg length "+avgLenIn+")");
+     System.out.println("Good reads: \t"+goodReadsFound);
+     System.out.println("Bad reads: \t"+badReadsFound+" \t("+totalAdapters+" adapters)");
+     System.out.println("Plus adapters: \t"+plusAdaptersFound);
+     System.out.println("Minus adapters: \t"+minusAdaptersFound);
+     System.out.println("Adapters per megabase: \t"+String.format("%.3f", basesIn>0 ? totalAdapters*1000000f/basesIn : 0f));
+     if(readsOut>0){System.out.println("Reads Out: \t"+readsOut+" \t("+basesOut+" bases, avg length "+(basesOut/readsOut)+")");}
+     System.out.println();
+     if(truepositive>0 || truenegative>0 || falsepositive>0 || falsenegative>0){
+         System.out.println("Adapters Expected: \t"+expected);
+         System.out.println("True Positive: \t"+truepositive+" \t"+String.format("%.3f%%", truepositive*100f/expectedDenom));
+         System.out.println("True Negative: \t"+truenegative+" \t"+String.format("%.3f%%", truenegative*100f/unexpectedDenom));
+         System.out.println("False Positive: \t"+falsepositive+" \t"+String.format("%.3f%%", falsepositive*100f/unexpectedDenom));
+         System.out.println("False Negative: \t"+falsenegative+" \t"+String.format("%.3f%%", falsenegative*100f/expectedDenom));
+     }
+
+ }
+
+ private static class ProcessThread extends Thread{
+
+ /**
+  * Creates a worker with its own aligners and the score thresholds for one adapter.
+  * @param cris_ stream the worker pulls read lists from
+  * @param ros_ stream processed lists are sent to; may be null (no output)
+  * @param minRatio_ fraction of the maximum alignment score required to confirm an adapter
+  * @param query_ adapter sequence; its reverse complement is searched as well
+  * @param split_ whether run() replaces adapter-containing reads by their sections
+  */
+ public ProcessThread(ConcurrentReadInputStream cris_,
+         ConcurrentReadOutputStream ros_, float minRatio_, String query_, boolean split_) {
+     cris=cris_;
+     ros=ros_;
+     minRatio=minRatio_;
+     query1=query_.getBytes();
+     query2=AminoAcid.reverseComplementBases(query1);
+     ALIGN_ROWS=query1.length+1;
+     ALIGN_COLUMNS=ALIGN_ROWS*3+20;
+     SPLIT=split_;
+     stride=(int)(query1.length*0.95f); //Step between scan windows in processRead()
+     window=(int)(query1.length*2.5f+10); //Width of each scan window in processRead()
+     assert(window<ALIGN_COLUMNS);
+     msa=new MultiStateAligner9PacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS);
+     msa2=USE_ALT_MSA ? new MultiStateAligner9PacBioAdapter2(ALIGN_ROWS, ALIGN_COLUMNS) : null;
+     maxSwScore=msa.maxQuality(query1.length);
+     //Thresholds honour the ratio passed in; it used to be ignored in favour of the static field.
+     minSwScore=(int)(maxSwScore*minRatio_);
+     minSwScoreSuspect=(int)(maxSwScore*Tools.min(minRatio_*SUSPECT_RATIO, minRatio_-((1-SUSPECT_RATIO)*.2f)));
+     maxImperfectSwScore=msa.maxImperfectScore(query1.length);
+     suspectMidpoint=(minSwScoreSuspect+minSwScore)/2;
+ }
+
+ @Override
+ public void run(){ //Worker loop: fetch a list, process its reads, forward them to ros, return the list to cris.
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> readlist=ln.list;
+
+ while(!readlist.isEmpty()){ //An empty list ends the loop
+
+ //System.err.println("Got a list of size "+readlist.size());
+ for(int i=0; i<readlist.size(); i++){
+ Read r=readlist.get(i);
+
+ if(r.length()<minContig && (r.mate==null || r.mateLength()<minContig)){
+ readlist.set(i, null); //Read (and mate, if any) shorter than minContig: dropped by nulling the slot
+ }else{
+
+ //System.out.println("Got read: "+r.toText());
+ //System.out.println("Synthetic: "+r.synthetic());
+
+ if(r.synthetic()){syntheticReads++;}
+
+ processRead(r);
+ if(r.mate!=null){processRead(r.mate);}
+ }
+
+ }
+
+// System.err.println("outputStream = "+outputStream==null ? "null" : "real");
+ if(ros!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight.
+ if(DONT_OUTPUT_BROKEN_READS){removeDiscarded(readlist);} //NOTE(review): readlist may already hold nulls from the minContig filter above
+ for(Read r : readlist){
+ if(r!=null){
+ r.obj=null;
+ assert(r.bases!=null);
+ if(r.sites!=null && r.sites.isEmpty()){r.sites=null;}
+ }
+ }
+// System.err.println("Adding list of length "+readlist.size());
+
+ ArrayList<Read> out=SPLIT ? split(readlist) : readlist;
+ for(Read r : out){ //Tally output reads and bases, counting each read's mate as well
+ if(r!=null){
+ Read r2=r.mate;
+ basesOut+=r.length();
+ readsOut++;
+ if(r2!=null){ //NOTE(review): split(ArrayList) also adds mates as separate elements, so pairs may be counted twice when SPLIT is set - confirm
+ basesOut+=r2.length();
+ readsOut++;
+ }
+ }
+ }
+ ros.add(out, ln.id);
+ }
+
+ cris.returnList(ln.id, readlist.isEmpty());
+
+ //System.err.println("Waiting on a list...");
+ ln=cris.nextList();
+ readlist=ln.list;
+ }
+
+ //System.err.println("Returning a list... (final)");
+ assert(readlist.isEmpty());
+ cris.returnList(ln.id, readlist.isEmpty()); //Hand back the final, empty list as well
+ }
+
+ /**
+ * @param readlist
+ * @return
+ */
+ private ArrayList<Read> split(ArrayList<Read> in) {
+ ArrayList<Read> out=new ArrayList<Read>(in.size());
+ for(Read r : in){
+ if(r!=null){
+// assert(r.mate==null);
+ if(!r.hasadapter()){out.add(r);}
+ else{out.addAll(split(r));}
+ Read r2=r.mate;
+ if(r2!=null){
+ if(!r2.hasadapter()){out.add(r2);}
+ else{out.addAll(split(r2));}
+ }
+ }
+ }
+ return out;
+ }
+
+ /**
+ * @param r
+ * @return
+ */
+ private ArrayList<Read> split(Read r) {
+ ArrayList<Read> sections=new ArrayList<Read>();
+
+ int lastX=-1;
+ for(int i=0; i<r.length(); i++){
+ if(r.bases[i]=='X'){
+ if(i-lastX>minContig){
+ byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i);
+ byte[] q=r.quality==null ? null : Arrays.copyOfRange(r.quality, lastX+1, i);
+ Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0);
+ sections.add(r2);
+ }
+ lastX=i;
+ }
+ }
+ int i=r.length();
+ if(i-lastX>minContig){
+ byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i);
+ byte[] q=r.quality==null ? null : Arrays.copyOfRange(r.quality, lastX+1, i);
+ Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0);
+ sections.add(r2);
+ }
+ return sections;
+ }
+
+ /** Scans r for minus- and plus-strand adapter hits and overwrites every accepted hit in r.bases with 'X'.
+ * @param r read to scan; setHasAdapter is updated, and setDiscarded(true) is called when DONT_OUTPUT_BROKEN_READS
+ * @return number of accepted hits (plus + minus); 0 for an empty read or one made only of 'N' */
+ private int processRead(Read r) {
+
+ int begin=0;
+ while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N'
+ if(begin>=r.length()){return 0;} //Such reads are not counted in any statistic
+
+ basesIn+=r.length();
+
+ final byte[] array=npad(r.bases, npad); //Working copy framed by npad 'N' bases on each side
+
+ int lim=array.length-npad-stride;
+
+ int plusFound=0;
+ int minusFound=0;
+
+ int lastSuspect=-1; //Largest stop of any hit scoring at least minSwScoreSuspect (only tracked when USE_LOCALITY)
+ int lastConfirmed=-1; //Largest stop of any masked hit scoring at least minSwScore (only tracked when USE_LOCALITY)
+
+ for(int i=begin; i<lim; i+=stride){ //NOTE(review): begin indexes r.bases while i indexes the padded array - confirm the offset is intended
+ int j=Tools.min(i+window, array.length-1);
+ if(j-i<window){
+ lim=0; //Last loop cycle
+// i=Tools.max(0, array.length-2*query1.length);
+ }
+
+ if(TRY_MINUS){ //Search for the reverse-complemented adapter (query2)
+ int[] rvec=msa.fillAndScoreLimited(query2, array, i, j, minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=minSwScoreSuspect){
+ int score=rvec[0];
+ int start=rvec[1];
+ int stop=rvec[2];
+ assert(score>=minSwScoreSuspect);
+ if((i==0 || start>i) && (j==array.length-1 || stop<j)){ //Ignore hits touching a window edge, unless that edge is the array boundary
+ boolean kill=(score>=minSwScore || //Accept: confirmed score, or mid-strength hit close after a suspect, or any hit close after a confirmed one
+ (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) ||
+ (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance));
+
+ if(!kill && USE_LOCALITY && array.length-stop>window){//Look ahead
+ rvec=msa.fillAndScoreLimited(query2, array, stop, stop+window, minSwScoreSuspect);
+ if(rvec!=null){
+ if(score>=suspectMidpoint && rvec[0]>=minSwScoreSuspect && rvec[1]-stop<suspectDistance){kill=true;}
+ else if(score>=minSwScoreSuspect && rvec[0]>=minSwScore && rvec[1]-stop<suspectDistance){kill=true;}
+ }
+ }
+
+ if(!kill && USE_ALT_MSA){//Try alternate msa
+ rvec=msa2.fillAndScoreLimited(query2, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=(minSwScore)){kill=true;}
+ }
+
+ if(kill){
+// System.out.println("-:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop);
+ minusFound++;
+ for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} //Mask the hit in the working copy
+ if(USE_LOCALITY && score>=minSwScore){lastConfirmed=Tools.max(lastConfirmed, stop);}
+ }
+ }
+// System.out.println("Set lastSuspect="+stop+" on score "+score);
+ if(USE_LOCALITY){lastSuspect=Tools.max(lastSuspect, stop);}
+ }
+ }
+
+ if(TRY_PLUS){ //Same procedure for the adapter as given (query1)
+ int[] rvec=msa.fillAndScoreLimited(query1, array, i, j, minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=minSwScoreSuspect){
+ int score=rvec[0];
+ int start=rvec[1];
+ int stop=rvec[2];
+ if((i==0 || start>i) && (j==array.length-1 || stop<j)){
+ boolean kill=(score>=minSwScore ||
+ (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance) ||
+ (lastConfirmed>0 && start>=lastConfirmed && start-lastConfirmed<suspectDistance));
+
+ if(!kill && USE_LOCALITY && array.length-stop>window){//Look ahead
+ rvec=msa.fillAndScoreLimited(query1, array, stop, stop+window, minSwScoreSuspect);
+ if(rvec!=null){
+ if(score>=suspectMidpoint && rvec[0]>=minSwScoreSuspect && rvec[1]-stop<suspectDistance){kill=true;}
+ else if(score>=minSwScoreSuspect && rvec[0]>=minSwScore && rvec[1]-stop<suspectDistance){kill=true;}
+ }
+ }
+
+ if(!kill && USE_ALT_MSA){//Try alternate msa
+ rvec=msa2.fillAndScoreLimited(query1, array, Tools.max(0, start-4), Tools.min(stop+4, array.length-1), minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=(minSwScore)){kill=true;}
+ }
+
+ if(kill){
+// System.out.println("+:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop);
+ plusFound++;
+ for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';}
+ if(USE_LOCALITY && score>=minSwScore){lastConfirmed=Tools.max(lastConfirmed, stop);}
+ }
+ }
+// System.out.println("Set lastSuspect="+stop+" on score "+score);
+ if(USE_LOCALITY){lastSuspect=Tools.max(lastSuspect, stop);}
+ }
+ }
+ }
+
+ int found=plusFound+minusFound;
+
+// if(r.synthetic()){
+// if(/*r.hasadapter() && */(r.numericID&3)==0){
+// if(plusFound>0){truepositive++;}else{falsenegative++;}
+// if(plusFound>1){falsepositive+=(plusFound-1);}
+// falsepositive+=minusFound;
+// expected++;
+// }else if(/*r.hasadapter() && */(r.numericID&3)==1){
+// if(minusFound>0){truepositive++;}else{falsenegative++;}
+// if(minusFound>1){falsepositive+=(minusFound-1);}
+// falsepositive+=plusFound;
+// expected++;
+// }else{
+// falsepositive=falsepositive+plusFound+minusFound;
+// if(plusFound+minusFound==0){truenegative++;}
+// unexpected++;
+// }
+// }
+
+ if(r.synthetic()){ //Accuracy bookkeeping: (numericID&3) of 0 or 1 counts as 'adapter expected', anything else as 'not expected'
+ if(/*r.hasadapter() && */(r.numericID&3)==0){
+ if(found>0){truepositive++;}else{falsenegative++;}
+ if(found>1){falsepositive+=(found-1);}
+ expected++;
+ }else if(/*r.hasadapter() && */(r.numericID&3)==1){
+ if(found>0){truepositive++;}else{falsenegative++;}
+ if(found>1){falsepositive+=(found-1);}
+ expected++;
+ }else{
+ falsepositive+=found;
+ if(found==0){truenegative++;}
+ unexpected++;
+ }
+ }
+
+ plusAdaptersFound+=plusFound;
+ minusAdaptersFound+=minusFound;
+ if(found>0){
+ for(int i=npad, j=0; j<r.length(); i++, j++){r.bases[j]=array[i];} //Copy the masked bases (without padding) back into the read
+ if(DONT_OUTPUT_BROKEN_READS){r.setDiscarded(true);}
+ badReadsFound++;
+ }else{
+ goodReadsFound++;
+ }
+
+ r.setHasAdapter(found>0);
+
+ return found;
+
+ }
+
+ private byte[] npad(final byte[] array, final int pad){
+     //Returns a fresh copy of 'array' framed by 'pad' bytes of 'N' on each side.
+     //A new buffer is allocated on every call and the padbuffer field is not used:
+     //the previous code cleared that field after each call, so it never held a buffer between calls.
+     final byte[] padded=new byte[array.length+2*pad];
+     Arrays.fill(padded, 0, pad, (byte)'N');
+     System.arraycopy(array, 0, padded, pad, array.length);
+     Arrays.fill(padded, pad+array.length, padded.length, (byte)'N');
+     return padded;
+ }
+
+ private byte[] padbuffer=null;
+ private final byte[] query1, query2;
+ private final ConcurrentReadInputStream cris;
+ private final ConcurrentReadOutputStream ros;
+ private final float minRatio;
+ private final MultiStateAligner9PacBioAdapter msa;
+ private final MultiStateAligner9PacBioAdapter2 msa2;
+ private final int ALIGN_ROWS;
+ private final int ALIGN_COLUMNS;
+ private final int stride;
+ private final int window;
+ private final boolean SPLIT;
+
+ long plusAdaptersFound=0;
+ long minusAdaptersFound=0;
+ long goodReadsFound=0;
+ long badReadsFound=0;
+ long truepositive=0;
+ long truenegative=0;
+ long falsepositive=0;
+ long falsenegative=0;
+ long expected=0;
+ long unexpected=0;
+ long basesIn=0;
+ long basesOut=0;
+ long readsOut=0;
+
+ private final int maxSwScore;
+ private final int minSwScore;
+ private final int minSwScoreSuspect;
+ private final int suspectMidpoint;
+ private final int maxImperfectSwScore;
+
+ long syntheticReads=0;
+
+ }
+
+ private static int removeDiscarded(ArrayList<Read> list){
+ int removed=0;
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r.discarded()){
+ if(r.mate==null || r.mate.discarded()){
+ list.set(i, null);
+ removed++;
+ }
+ }
+ }
+ return removed;
+ }
+
+ public static boolean DONT_OUTPUT_BROKEN_READS;
+ /** Permission to overwrite existing files */
+ private static boolean overwrite=false;
+ /** Permission to append to existing files */
+ private static boolean append=false;
+ private static int THREADS=Data.LOGICAL_PROCESSORS;
+ private static boolean OUTPUT_READS=false;
+ private static boolean OUTPUT_ORDERED_READS=false;
+ private static boolean PERFECTMODE=false;
+ private static float MINIMUM_ALIGNMENT_SCORE_RATIO=0.31f; //0.31f: At 250bp reads, approx 0.01% false-positive and 94% true-positive.
+ private static float SUSPECT_RATIO=0.85F;
+ public static boolean USE_LOCALITY=true;
+ public static boolean USE_ALT_MSA=true;
+ public static boolean TRY_PLUS=true;
+ public static boolean TRY_MINUS=true;
+ private static int npad=35;
+ public static int minContig=50;
+ public static int suspectDistance=100;
+
+ public static final String pacbioAdapter="ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT";
+ public static final String pacbioStandard_v1="TCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAGAAGGCTGGGCAGGCTATGCACCCTGGTCCAGGTCAAA" +
+ "AGCTGCGGAACCCGCTAGCGGCCATCTTGGCCACTAGGGGTCCCGCAGATTCATATTGTCGTCTAGCATGCACAATGCTGCAAACCCAGCTTGCAATGCCCACAGCA" +
+ "AGCGGCCAATCTTTACGCCACGTTGAATTGTTTATTACCTGTGACTGGCTATGGCTTGCAACGCCACTCGTAAAACTAGTACTTTGCGGTTAGGGGAAGTAGACAAA" +
+ "CCCATTACTCCACTTCCCGGAAGTTCAACTCATTCCAACACGAAATAAAAGTAAACTCAACACCCCAAGCAGGCTATGTGGGGGGGTGATAGGGGTGGATTCTATTT" +
+ "CCTATCCCATCCCCTAGGATCTCAATTAAGTTACTAGCGAGTTAAATGTCTGTAGCGATCCCGTCAGTCCTATCGCGCGCATCAAGACCTGGTTGGTTGAGCGTGCA" +
+ "GTAGATCATCGATAAGCTGCGAGTTAGGTCATCCCAGACCGCATCTGGCGCCTAAACGTTCAGTGGTAGCTAAGGCGTCACCTTCGACTGTCTAAAGGCAATATGTC" +
+ "GTCCTTAGCTCCAAGTCCCTAGCAAGCGTGTCGGGTCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGACCCGACACGCTTGCTAGGGACTTGGAGCT" +
+ "AAGGACGACATATTGCCTTTAGACAGTCGAAGGTGACGCCTTAGCTACCACTGAACGTTTAGGCGCCAGATGCGGTCTGGGATGACCTAACTCGCAGCTTATCGATG" +
+ "ATCTACTGCACGCTCAACCAACCAGGTCTTGATGCGCGCGATAGGACTGACGGGATCGCTACAGACATTTAACTCGCTAGTAACTTAATTGAGATCCTAGGGGATGG" +
+ "GATAGGAAATAGAATCCACCCCTATCACCCCCCCACATAGCCTGCTTGGGGTGTTGAGTTTACTTTTATTTCGTGTTGGAATGAGTTGAACTTCCGGGAAGTGGAGT" +
+ "AATGGGTTTGTCTACTTCCCCTAACCGCAAAGTACTAGTTTTACGAGTGGCGTTGCAAGCCATAGCCAGTCACAGGTAATAAACAATTCAACGTGGCGTAAAGATTG" +
+ "GCCGCTTGCTGTGGGCATTGCAAGCTGGGTTTGCAGCATTGTGCATGCTAGACGACAATATGAATCTGCGGGACCCCTAGTGGCCAAGATGGCCGCTAGCGGGTTCC" +
+ "GCAGCTTTTGACCTGGACCAGGGTGCATAGCCTGCCCAGCCTTCTCTCTCTCTTT";
+
+
+}
diff --git a/current/pacbio/RemoveAdapters3.java b/current/pacbio/RemoveAdapters3.java
new file mode 100755
index 0000000..e35f45c
--- /dev/null
+++ b/current/pacbio/RemoveAdapters3.java
@@ -0,0 +1,623 @@
+package pacbio;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import align2.ListNum;
+import align2.MultiStateAligner9PacBio;
+import align2.MultiStateAligner9PacBioAdapter;
+import align2.ReadStats;
+import align2.Tools;
+import dna.AminoAcid;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+/**
+ * Increased sensitivity to nearby adapters.
+ * Does reverse-complementation on low-scoring suspects.
+ * @author Brian Bushnell
+ * @date Nov 5, 2012
+ *
+ */
+public class RemoveAdapters3 {
+
+ public static void main(String[] args){
+     //Entry point. Parses key=value arguments, opens the read input stream (and an output
+     //stream when an output name was given), then hands both to process().
+     //A value of "null" (any case) is converted to a Java null before the key is examined.
+     Timer t=new Timer();
+
+     boolean verbose=false;
+     int ziplevel=-1;
+
+     String in1=null;
+     String in2=null;
+     long maxReads=-1;
+
+     String outname1=null;
+     String outname2=null;
+
+     String query=pacbioAdapter;
+
+     boolean splitReads=false;
+
+     for(int i=0; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : "true";
+         if("null".equalsIgnoreCase(b)){b=null;}
+//       System.err.println("Processing "+args[i]);
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseCommonStatic(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseFasta(arg, a, b)){
+             //do nothing
+         }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){
+             Data.setPath(b);
+         }else if(a.equals("fasta") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){
+             in1=b;
+             if(b!=null && b.indexOf('#')>-1){ //b is null for "in=null"; in1 then stays null and is rejected below
+                 in1=b.replace("#", "1");
+                 in2=b.replace("#", "2");
+             }
+         }else if(a.equals("in2") || a.equals("input2")){
+             in2=b;
+         }else if(a.equals("query") || a.equals("adapter")){
+             query=b;
+         }else if(a.equals("split")){
+             splitReads=Tools.parseBoolean(b);
+         }else if(a.equals("append") || a.equals("app")){
+             append=ReadStats.append=Tools.parseBoolean(b);
+         }else if(a.equals("overwrite") || a.equals("ow")){
+             overwrite=Tools.parseBoolean(b);
+             System.out.println("Set overwrite to "+overwrite);
+         }else if(a.equals("threads") || a.equals("t")){
+             if("auto".equalsIgnoreCase(b)){THREADS=Data.LOGICAL_PROCESSORS;} //Constant first: no NPE when b is null
+             else{THREADS=Integer.parseInt(b);}
+             System.out.println("Set threads to "+THREADS);
+         }else if(a.equals("reads") || a.equals("maxreads")){
+             maxReads=Tools.parseKMG(b);
+         }else if(a.startsWith("outname") || a.startsWith("outfile") || a.equals("out")){
+             if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){
+                 System.out.println("No output file.");
+                 outname1=null;
+                 OUTPUT_READS=false;
+             }else{
+                 OUTPUT_READS=true;
+                 if(b.indexOf('#')>-1){
+                     outname1=b.replace('#', '1');
+                     outname2=b.replace('#', '2');
+                 }else{
+                     outname1=b;
+                 }
+             }
+         }else if(a.equals("perfectmode")){
+             PERFECTMODE=Tools.parseBoolean(b);
+             if(ziplevel==-1){ziplevel=2;} //NOTE(review): ziplevel is a local that is never read afterwards
+         }else if(a.equals("minratio")){
+             MINIMUM_ALIGNMENT_SCORE_RATIO=Float.parseFloat(b);
+             System.out.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+MINIMUM_ALIGNMENT_SCORE_RATIO);
+         }else if(a.startsWith("verbose")){
+             verbose=Tools.parseBoolean(b);
+         }else{
+             throw new RuntimeException("Unknown parameter: "+args[i]);
+         }
+     }
+
+     Parser.processQuality();
+
+     assert(FastaReadInputStream.settingsOK());
+     if(in1==null){throw new RuntimeException("Please specify input file.");}
+     if(query==null){throw new RuntimeException("Please specify an adapter sequence.");} //e.g. "query=null"
+
+     final ConcurrentReadInputStream cris;
+     {
+         FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
+         FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true);
+         cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
+//       if(verbose){System.err.println("Started cris");}
+//       cris.start(); //4567
+//       th.start();
+     }
+     boolean paired=cris.paired();
+     if(verbose){System.err.println("Paired: "+paired);}
+
+     ConcurrentReadOutputStream ros=null;
+     if(OUTPUT_READS){
+         final int buff=(!OUTPUT_ORDERED_READS ? THREADS : Tools.max(24, 2*THREADS));
+
+         FileFormat ff1=FileFormat.testOutput(outname1, FileFormat.FASTQ, null, true, overwrite, append, OUTPUT_ORDERED_READS);
+         FileFormat ff2=FileFormat.testOutput(outname2, FileFormat.FASTQ, null, true, overwrite, append, OUTPUT_ORDERED_READS);
+         ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
+     }
+     process(cris, ros, query, splitReads);
+ }
+
+ public static void process(ConcurrentReadInputStream cris, ConcurrentReadOutputStream ros, String query, boolean split){
+     //Runs the whole job: starts the streams, fans the work out to THREADS
+     //ProcessThreads, waits for each of them, closes the streams, prints a report.
+     final Timer clock=new Timer();
+
+     cris.start(); //4567
+     System.out.println("Started read stream.");
+
+     if(ros!=null){
+         ros.start();
+         System.out.println("Started output threads.");
+     }
+
+     //Each worker builds its own aligners; all of them share the two streams.
+     final ProcessThread[] workers=new ProcessThread[THREADS];
+     for(int w=0; w<workers.length; w++){
+         final ProcessThread worker=new ProcessThread(cris, ros, MINIMUM_ALIGNMENT_SCORE_RATIO, query, split);
+         workers[w]=worker;
+         worker.start();
+     }
+     final String plural=(workers.length==1 ? "" : "s");
+     System.out.println("Started "+workers.length+" processing thread"+plural+".");
+
+     //Wait for the workers in index order, reporting each index once it has terminated.
+     for(int w=0; w<workers.length; w++){
+         final ProcessThread worker=workers[w];
+         synchronized(worker){
+             while(worker.getState()!=Thread.State.TERMINATED){
+                 try{
+                     worker.join();
+                 }catch(InterruptedException e){
+                     //An interrupt is only logged; the loop then resumes waiting.
+                     e.printStackTrace();
+                 }
+             }
+         }
+         System.out.print(w==0 ? "Detecting finished threads: 0" : ", "+w);
+     }
+     System.out.println('\n');
+     ReadWrite.closeStreams(cris, ros);
+
+     printStatistics(workers);
+
+     clock.stop();
+     System.out.println("Time: \t"+clock);
+
+ }
+
+ public static void printStatistics(ProcessThread[] pts){
+     //Adds up the counters of all worker threads and prints the run summary to stdout.
+     long plusAdaptersFound=0;
+     long minusAdaptersFound=0;
+     long goodReadsFound=0;
+     long badReadsFound=0;
+
+     long truepositive=0;
+     long truenegative=0;
+     long falsepositive=0;
+     long falsenegative=0;
+     long expected=0;
+     long unexpected=0;
+
+     for(ProcessThread pt : pts){
+         plusAdaptersFound+=pt.plusAdaptersFound;
+         minusAdaptersFound+=pt.minusAdaptersFound;
+         goodReadsFound+=pt.goodReadsFound;
+         badReadsFound+=pt.badReadsFound;
+
+         truepositive+=pt.truepositive;
+         truenegative+=pt.truenegative;
+         falsepositive+=pt.falsepositive;
+         falsenegative+=pt.falsenegative;
+         expected+=pt.expected;
+         unexpected+=pt.unexpected;
+     }
+
+     //Denominators are clamped to 1 so the percentages stay finite; the counts are reported unmodified.
+     final long expectedDenom=Math.max(1, expected);
+     final long unexpectedDenom=Math.max(1, unexpected);
+
+     System.out.println("Good reads: \t"+goodReadsFound);
+     System.out.println("Bad reads: \t"+badReadsFound);
+     System.out.println("Plus adapters: \t"+plusAdaptersFound);
+     System.out.println("Minus adapters: \t"+minusAdaptersFound);
+     System.out.println();
+     if(truepositive>0 || truenegative>0 || falsepositive>0 || falsenegative>0){
+         System.out.println("Adapters Expected: \t"+expected);
+         System.out.println("True Positive: \t"+truepositive+" \t"+String.format("%.3f%%", truepositive*100f/expectedDenom));
+         System.out.println("True Negative: \t"+truenegative+" \t"+String.format("%.3f%%", truenegative*100f/unexpectedDenom));
+         System.out.println("False Positive: \t"+falsepositive+" \t"+String.format("%.3f%%", falsepositive*100f/unexpectedDenom));
+         System.out.println("False Negative: \t"+falsenegative+" \t"+String.format("%.3f%%", falsenegative*100f/expectedDenom));
+     }
+
+ }
+
+ private static class ProcessThread extends Thread{
+
+ /**
+  * Creates a worker with its own aligners and the score thresholds for one adapter.
+  * @param cris_ stream the worker pulls read lists from
+  * @param ros_ stream processed lists are sent to; may be null (no output)
+  * @param minRatio_ fraction of the maximum alignment score required to confirm an adapter
+  * @param query_ adapter sequence; its reverse complement is searched as well
+  * @param split_ whether run() sends split(readlist) instead of the list itself to the output
+  */
+ public ProcessThread(ConcurrentReadInputStream cris_,
+         ConcurrentReadOutputStream ros_, float minRatio_, String query_, boolean split_) {
+     cris=cris_;
+     ros=ros_;
+     minRatio=minRatio_;
+     query1=query_.getBytes();
+     query2=AminoAcid.reverseComplementBases(query1);
+     stride=(int)(query1.length*0.95f); //Step between scan windows in processRead()
+     window=(int)(query1.length*2.5f+10); //Width of each scan window in processRead()
+     ALIGN_ROWS=Tools.max(query1.length, rcompDistance)+1;
+     ALIGN_COLUMNS=Tools.max(window, rcompDistance)+5;
+     SPLIT=split_;
+     assert(window<ALIGN_COLUMNS);
+
+     msa=new MultiStateAligner9PacBioAdapter(ALIGN_ROWS, ALIGN_COLUMNS);
+     msaR=new MultiStateAligner9PacBio(ALIGN_ROWS, ALIGN_COLUMNS); //TODO: Why is this not 'adapter' version?
+     maxSwScore=msa.maxQuality(query1.length);
+     //Thresholds honour the ratio passed in; it used to be ignored in favour of the static field.
+     minSwScore=(int)(maxSwScore*minRatio_);
+     minSwScoreRcomp=(int)(msa.maxQuality(rcompDistance)*minRatio_);
+     minSwScoreSuspect=(int)(maxSwScore*Tools.min(minRatio_*0.7f, minRatio_-0.05f));
+     maxImperfectSwScore=msa.maxImperfectScore(query1.length);
+
+     rcomp1=new byte[rcompDistance];
+     rcomp2=new byte[rcompDistance];
+ }
+
+ @Override
+ public void run(){ //Worker loop: fetch a list, process its reads, forward them to ros, return the list to cris.
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> readlist=ln.list;
+
+ while(!readlist.isEmpty()){ //An empty list ends the loop
+
+ //System.err.println("Got a list of size "+readlist.size());
+ for(int i=0; i<readlist.size(); i++){
+ Read r=readlist.get(i);
+
+// System.out.println("Got read: "+r.toText());
+// System.out.println("Synthetic: "+r.synthetic());
+
+ if(r.synthetic()){syntheticReads++;}
+
+ processRead(r);
+ if(r.mate!=null){processRead(r.mate);}
+
+ }
+
+// System.err.println("outputStream = "+outputStream==null ? "null" : "real");
+ if(ros!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight.
+ if(DONT_OUTPUT_BROKEN_READS){removeDiscarded(readlist);}
+ for(Read r : readlist){ //Clear per-read scratch state before the reads are handed to the output stream
+ if(r!=null){
+ r.obj=null;
+ assert(r.bases!=null);
+ if(r.sites!=null && r.sites.isEmpty()){r.sites=null;}
+ }
+ }
+// System.err.println("Adding list of length "+readlist.size());
+ if(SPLIT){
+ ros.add(split(readlist), ln.id); //split(ArrayList) asserts that reads are unpaired
+ }else{
+ ros.add(readlist, ln.id);
+ }
+ }
+
+ cris.returnList(ln.id, readlist.isEmpty());
+
+ //System.err.println("Waiting on a list...");
+ ln=cris.nextList();
+ readlist=ln.list;
+ }
+
+ //System.err.println("Returning a list... (final)");
+ assert(readlist.isEmpty());
+ cris.returnList(ln.id, readlist.isEmpty()); //Hand back the final, empty list as well
+ }
+
+ /**
+ * @param readlist
+ * @return
+ */
+ private ArrayList<Read> split(ArrayList<Read> in) {
+ ArrayList<Read> out=new ArrayList<Read>(in.size());
+ for(Read r : in){
+ if(r!=null){
+ assert(r.mate==null);
+ if(!r.hasadapter()){out.add(r);}
+ else{out.addAll(split(r));}
+ }
+ }
+ return out;
+ }
+
+ /**
+ * @param r
+ * @return
+ */
+ private ArrayList<Read> split(Read r) {
+ ArrayList<Read> sections=new ArrayList<Read>();
+
+ int lastX=-1;
+ for(int i=0; i<r.length(); i++){
+ if(r.bases[i]=='X'){
+ if(i-lastX>minContig){
+ byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i);
+ byte[] q=r.quality==null ? null : Arrays.copyOfRange(r.quality, lastX+1, i);
+ Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0);
+ sections.add(r2);
+ }
+ lastX=i;
+ }
+ }
+ int i=r.length();
+ if(i-lastX>minContig){
+ byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i);
+ byte[] q=r.quality==null ? null : Arrays.copyOfRange(r.quality, lastX+1, i);
+ Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0);
+ sections.add(r2);
+ }
+ return sections;
+ }
+
+ /**
+ * Scans one read for adapter sequence and masks every confirmed hit with 'X'.
+ * The read is copied into an N-padded array and examined in windows of length 'window',
+ * advancing by 'stride'. In each window query2 and then query1 are aligned; a hit scoring at least
+ * minSwScoreSuspect is a "suspect", and a suspect is confirmed ("killed") when it scores at least
+ * minSwScore, lies close to the previous suspect, is followed closely by another suspect,
+ * or (if RCOMP) testRcomp() accepts its flanks.
+ * Also updates the per-thread counters (adapters found, good/bad reads, and the
+ * true/false positive statistics for synthetic reads).
+ * @param r Read to scan. When at least one adapter is found, r.bases is overwritten with the
+ * masked bases, and the read is flagged as discarded if DONT_OUTPUT_BROKEN_READS is set.
+ * @return Number of confirmed adapter hits (plus and minus) in this read; 0 if the read is all 'N'.
+ */
+ private int processRead(Read r) {
+
+ int begin=0;
+ while(begin<r.length() && r.bases[begin]=='N'){begin++;} //Skip reads made of 'N'
+ if(begin>=r.length()){return 0;}
+
+ //Work on a padded copy so the original bases stay untouched until a hit is confirmed.
+ final byte[] array=npad(r.bases, npad);
+
+ int lim=array.length-npad-stride;
+
+ //query1 hits are tallied as "plus", query2 hits as "minus".
+ int plusFound=0;
+ int minusFound=0;
+
+ //Stop position of the most recent suspect hit from either query; -1 means none yet.
+ int lastSuspect=-1;
+
+ for(int i=begin; i<lim; i+=stride){
+ int j=Tools.min(i+window, array.length-1);
+ if(j-i<window){
+ //The window was truncated by the end of the array; zeroing lim makes the loop condition fail next time.
+ lim=0; //Last loop cycle
+// i=Tools.max(0, array.length-2*query1.length);
+ }
+
+ //Pass 1: align query2 against the window [i, j].
+ {
+ int[] rvec=msa.fillAndScoreLimited(query2, array, i, j, minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=minSwScoreSuspect){
+ //rvec layout as used here: [0]=score, [1]=start, [2]=stop.
+ int score=rvec[0];
+ int start=rvec[1];
+ int stop=rvec[2];
+ assert(score>=minSwScoreSuspect);
+ //Only consider hits strictly inside the window, unless the window edge is the array edge.
+ if((i==0 || start>i) && (j==array.length-1 || stop<j)){
+ //Confirm on a strong score, or when this suspect starts within suspectDistance after the previous one.
+ boolean kill=(score>=minSwScore || (lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance));
+
+ if(!kill && array.length-stop>window){//Look ahead
+ rvec=msa.fillAndScoreLimited(query2, array, stop, stop+window, minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=minSwScoreSuspect && rvec[1]-stop<suspectDistance){kill=true;}
+ }
+
+ //Last resort: test the flanks, which requires rcompDistance bases on both sides of the hit.
+ if(RCOMP && !kill && start>rcompDistance && array.length-stop>rcompDistance+1){
+ kill=testRcomp(array, start, stop);
+// System.out.print(kill ? "#" : ".");
+ }
+
+ if(kill){
+// System.out.println("-:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop);
+ minusFound++;
+ //Mask the hit in the padded copy.
+ for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';}
+ }
+ }
+// System.out.println("Set lastSuspect="+stop+" on score "+score);
+ //Recorded for every suspect, including ones rejected by the window-edge test above.
+ lastSuspect=stop;
+ }
+ }
+
+ //Pass 2: the same procedure for query1; lastSuspect is shared with pass 1.
+ {
+ int[] rvec=msa.fillAndScoreLimited(query1, array, i, j, minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=minSwScoreSuspect){
+ int score=rvec[0];
+ int start=rvec[1];
+ int stop=rvec[2];
+ if((i==0 || start>i) && (j==array.length-1 || stop<j)){
+ boolean kill=(score>=minSwScore || (lastSuspect>0 && start>=lastSuspect && start-lastSuspect<suspectDistance));
+
+ if(!kill && array.length-stop>window){//Look ahead
+ rvec=msa.fillAndScoreLimited(query1, array, stop, stop+window, minSwScoreSuspect);
+ if(rvec!=null && rvec[0]>=minSwScoreSuspect && rvec[1]-stop<suspectDistance){kill=true;}
+ }
+
+ if(RCOMP && !kill && start>rcompDistance && array.length-stop>rcompDistance+1){
+ kill=testRcomp(array, start, stop);
+// System.out.print(kill ? "#" : ".");
+ }
+
+ if(kill){
+// System.out.println("+:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop);
+ plusFound++;
+ for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';}
+ }
+ }
+// System.out.println("Set lastSuspect="+stop+" on score "+score);
+ lastSuspect=stop;
+ }
+ }
+ }
+
+ //Accuracy bookkeeping for synthetic reads. The low two bits of numericID select the expectation:
+ //0 = one plus adapter expected, 1 = one minus adapter expected, anything else = no adapter expected.
+ if(r.synthetic()){
+ if(/*r.hasadapter() && */(r.numericID&3)==0){
+ if(plusFound>0){truepositive++;}else{falsenegative++;}
+ if(plusFound>1){falsepositive+=(plusFound-1);}
+ falsepositive+=minusFound;
+ expected++;
+ }else if(/*r.hasadapter() && */(r.numericID&3)==1){
+ if(minusFound>0){truepositive++;}else{falsenegative++;}
+ if(minusFound>1){falsepositive+=(minusFound-1);}
+ falsepositive+=plusFound;
+ expected++;
+ }else{
+ falsepositive=falsepositive+plusFound+minusFound;
+ if(plusFound+minusFound==0){truenegative++;}
+ unexpected++;
+ }
+ }
+
+ plusAdaptersFound+=plusFound;
+ minusAdaptersFound+=minusFound;
+ int found=plusFound+minusFound;
+ if(found>0){
+ //Copy the masked bases back into the read, skipping the leading pad.
+ for(int i=npad, j=0; j<r.length(); i++, j++){r.bases[j]=array[i];}
+ if(DONT_OUTPUT_BROKEN_READS){r.setDiscarded(true);}
+ badReadsFound++;
+ }else{
+ goodReadsFound++;
+ }
+
+ r.setHasAdapter(found>0);
+
+ return found;
+
+ }
+
+ /**
+ * @param array
+ * @param start
+ * @param stop
+ * @return
+ */
+ private boolean testRcomp(byte[] array, int start, int stop) {
+
+ for(int i=0, j=start-rcompDistance, k=stop+1; i<rcompDistance; i++, j++, k++){
+ rcomp1[i]=array[j];
+ if(rcomp1[i]=='X'){rcomp1[i]='N';}
+ rcomp2[i]=array[k];
+ if(rcomp2[i]=='X'){rcomp2[i]='N';}
+ }
+ AminoAcid.reverseComplementBasesInPlace(rcomp2);
+
+// System.out.println(new String(array).substring(start-rcompDistance, stop+rcompDistance));
+// System.out.println(new String(rcomp1));
+// System.out.println(new String(rcomp2));
+
+// int[] rvec=msa.fillAndScoreLimited(rcomp1, rcomp2, 0, rcompDistance-1, minSwScoreSuspect);
+ int[] rvec=msaR.fillAndScoreLimited(rcomp1, rcomp2, 0, rcompDistance-1, 0, null);
+ int score=rvec==null ? -99999 : rvec[0];
+// System.out.println(minSwScoreSuspect+", "+score+"\n");
+ return score>=1000;
+// return score>=minSwScoreRcomp;
+ }
+
+ /**
+  * Returns a copy of 'array' with 'pad' bases of 'N' added at each end.
+  * @param array Bases to copy; not modified.
+  * @param pad Number of 'N' bases to add on each side.
+  * @return An array of length array.length+2*pad.
+  */
+ private byte[] npad(final byte[] array, final int pad){
+ 	final int total=array.length+2*pad;
+ 	if(padbuffer==null || padbuffer.length!=total){padbuffer=new byte[total];}
+ 	final byte[] padded=padbuffer;
+
+ 	//Leading pad.
+ 	int pos=0;
+ 	while(pos<pad){
+ 		padded[pos]='N';
+ 		pos++;
+ 	}
+
+ 	//Payload.
+ 	System.arraycopy(array, 0, padded, pad, array.length);
+
+ 	//Trailing pad.
+ 	final int tailEnd=Tools.min(padded.length, total+50);
+ 	for(pos=array.length+pad; pos<tailEnd; pos++){
+ 		padded[pos]='N';
+ 	}
+
+ 	padbuffer=null; //Kills the buffer. Causes more memory allocation, but better cache/NUMA locality if threads move around.
+ 	return padded;
+ }
+
+ /** Buffer slot used by npad(); npad() resets it to null at the end of every call. */
+ private byte[] padbuffer=null;
+ /** Scratch buffers of length rcompDistance, filled with the two flanks compared in testRcomp(). */
+ private final byte[] rcomp1, rcomp2;
+ /** Queries aligned against each window in processRead(); query1 hits count as "plus", query2 hits as "minus". */
+ private final byte[] query1, query2;
+ /** Source of read lists consumed by run(). */
+ private final ConcurrentReadInputStream cris;
+ /** Destination for processed read lists; run() skips output when this is null. */
+ private final ConcurrentReadOutputStream ros;
+ //NOTE(review): minRatio, ALIGN_ROWS and ALIGN_COLUMNS are not referenced in the visible part of this class - presumably set and used by the constructor; confirm.
+ private final float minRatio;
+ /** Aligner used for the query-versus-window alignments in processRead(). */
+ private final MultiStateAligner9PacBioAdapter msa;
+ /** Aligner used for the flank-versus-flank alignment in testRcomp(). */
+ private final MultiStateAligner9PacBio msaR;
+ private final int ALIGN_ROWS;
+ private final int ALIGN_COLUMNS;
+ /** Step between successive window start positions in processRead(). */
+ private final int stride;
+ /** Length of the window aligned against the queries in processRead(). */
+ private final int window;
+ /** When true, run() outputs split(readlist) instead of the read list itself. */
+ private final boolean SPLIT;
+
+ //Per-thread counters, updated by processRead().
+ long plusAdaptersFound=0;
+ long minusAdaptersFound=0;
+ long goodReadsFound=0;
+ long badReadsFound=0;
+ //Accuracy statistics; only updated for reads for which synthetic() is true.
+ long truepositive=0;
+ long truenegative=0;
+ long falsepositive=0;
+ long falsenegative=0;
+ long expected=0;
+ long unexpected=0;
+
+ /** msa.maxQuality(query1.length), as assigned by the constructor. */
+ private final int maxSwScore;
+ /** Score at or above which a hit is confirmed outright: maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO. */
+ private final int minSwScore;
+ /** Lower score at which a hit becomes a "suspect" that needs supporting evidence. */
+ private final int minSwScoreSuspect;
+ /** Computed by the constructor from msa.maxQuality(rcompDistance); only referenced in commented-out code in testRcomp(). */
+ private final int minSwScoreRcomp;
+ /** msa.maxImperfectScore(query1.length), as assigned by the constructor. */
+ private final int maxImperfectSwScore;
+
+ /** Number of reads seen by run() for which synthetic() was true. */
+ long syntheticReads=0;
+
+ }
+
+ private static int removeDiscarded(ArrayList<Read> list){
+ int removed=0;
+ for(int i=0; i<list.size(); i++){
+ Read r=list.get(i);
+ if(r.discarded()){
+ if(r.mate==null || r.mate.discarded()){
+ list.set(i, null);
+ removed++;
+ }
+ }
+ }
+ return removed;
+ }
+
+ /** When true, reads in which an adapter was found are flagged as discarded and nulled out of the output lists. */
+ public static boolean DONT_OUTPUT_BROKEN_READS;
+ /** Permission to overwrite existing files */
+ private static boolean overwrite=true;
+ /** Permission to append to existing files */
+ private static boolean append=false;
+ //NOTE(review): THREADS, OUTPUT_READS, OUTPUT_ORDERED_READS and PERFECTMODE are not used in the visible part of this file; presumably consumed by main - confirm.
+ private static int THREADS=Data.LOGICAL_PROCESSORS;
+ private static boolean OUTPUT_READS=false;
+ private static boolean OUTPUT_ORDERED_READS=false;
+ private static boolean PERFECTMODE=false;
+ /** Fraction of the maximum alignment score that a hit must reach to be confirmed without further evidence. */
+ private static float MINIMUM_ALIGNMENT_SCORE_RATIO=0.31f; //0.31f: At 250bp reads, approx 0.01% false-positive and 94% true-positive.
+ /** Enables the flank reverse-complement test (testRcomp) for suspects that were not otherwise confirmed. */
+ public static boolean RCOMP=true;
+ /** Number of 'N' bases added to each end of a read before scanning. */
+ private static int npad=35;
+ /** In split mode, a section is kept only if the gap between its bounding 'X' positions exceeds this value. */
+ public static int minContig=20;
+ /** Maximum distance between two suspect hits for them to confirm one another. */
+ public static int suspectDistance=100;
+ /** Number of flanking bases on each side of a suspect that testRcomp() compares. */
+ public static int rcompDistance=80;
+
+ public static final String pacbioAdapter="ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT";
+ public static final String pacbioStandard_v1="TCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAGAAGGCTGGGCAGGCTATGCACCCTGGTCCAGGTCAAA" +
+ "AGCTGCGGAACCCGCTAGCGGCCATCTTGGCCACTAGGGGTCCCGCAGATTCATATTGTCGTCTAGCATGCACAATGCTGCAAACCCAGCTTGCAATGCCCACAGCA" +
+ "AGCGGCCAATCTTTACGCCACGTTGAATTGTTTATTACCTGTGACTGGCTATGGCTTGCAACGCCACTCGTAAAACTAGTACTTTGCGGTTAGGGGAAGTAGACAAA" +
+ "CCCATTACTCCACTTCCCGGAAGTTCAACTCATTCCAACACGAAATAAAAGTAAACTCAACACCCCAAGCAGGCTATGTGGGGGGGTGATAGGGGTGGATTCTATTT" +
+ "CCTATCCCATCCCCTAGGATCTCAATTAAGTTACTAGCGAGTTAAATGTCTGTAGCGATCCCGTCAGTCCTATCGCGCGCATCAAGACCTGGTTGGTTGAGCGTGCA" +
+ "GTAGATCATCGATAAGCTGCGAGTTAGGTCATCCCAGACCGCATCTGGCGCCTAAACGTTCAGTGGTAGCTAAGGCGTCACCTTCGACTGTCTAAAGGCAATATGTC" +
+ "GTCCTTAGCTCCAAGTCCCTAGCAAGCGTGTCGGGTCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGACCCGACACGCTTGCTAGGGACTTGGAGCT" +
+ "AAGGACGACATATTGCCTTTAGACAGTCGAAGGTGACGCCTTAGCTACCACTGAACGTTTAGGCGCCAGATGCGGTCTGGGATGACCTAACTCGCAGCTTATCGATG" +
+ "ATCTACTGCACGCTCAACCAACCAGGTCTTGATGCGCGCGATAGGACTGACGGGATCGCTACAGACATTTAACTCGCTAGTAACTTAATTGAGATCCTAGGGGATGG" +
+ "GATAGGAAATAGAATCCACCCCTATCACCCCCCCACATAGCCTGCTTGGGGTGTTGAGTTTACTTTTATTTCGTGTTGGAATGAGTTGAACTTCCGGGAAGTGGAGT" +
+ "AATGGGTTTGTCTACTTCCCCTAACCGCAAAGTACTAGTTTTACGAGTGGCGTTGCAAGCCATAGCCAGTCACAGGTAATAAACAATTCAACGTGGCGTAAAGATTG" +
+ "GCCGCTTGCTGTGGGCATTGCAAGCTGGGTTTGCAGCATTGTGCATGCTAGACGACAATATGAATCTGCGGGACCCCTAGTGGCCAAGATGGCCGCTAGCGGGTTCC" +
+ "GCAGCTTTTGACCTGGACCAGGGTGCATAGCCTGCCCAGCCTTCTCTCTCTCTTT";
+
+
+}
diff --git a/current/pacbio/RemoveNFromChromosome.java b/current/pacbio/RemoveNFromChromosome.java
new file mode 100755
index 0000000..77e615e
--- /dev/null
+++ b/current/pacbio/RemoveNFromChromosome.java
@@ -0,0 +1,55 @@
+package pacbio;
+
+import java.io.File;
+
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import dna.Gene;
+import fileIO.ReadWrite;
+
+/**
+ * Writes a copy of a genome build in which every 'N' base has been removed from each chromosome,
+ * with a fixed number of 'N' bases added at both ends.
+ * Arguments: input genome build number, output genome build number, padding length.
+ * @author Brian Bushnell
+ * @date Jul 19, 2012
+ *
+ */
+public class RemoveNFromChromosome {
+
+ /**
+  * Program entry point.
+  * @param args args[0]=input build, args[1]=output build, args[2]=number of 'N' bases to add at each end.
+  */
+ public static void main(String[] args){
+ 	int ingenome=Integer.parseInt(args[0]);
+ 	int outgenome=Integer.parseInt(args[1]);
+ 	int padding=Integer.parseInt(args[2]);
+
+ 	String outRoot=Data.ROOT_GENOME+outgenome+"/";
+ 	File f=new File(outRoot);
+ 	if(!f.exists()){
+ 		f.mkdirs();
+ 	}
+
+ 	Data.setGenome(ingenome);
+ 	for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ 		ChromosomeArray cha=Data.getChromosome(chrom);
+ 		Data.unload(chrom, true);
+ 		ChromosomeArray chb=new ChromosomeArray(chrom, Gene.PLUS, 0, cha.countDefinedBases()+2*padding+1);
+ 		//maxIndex is the index of the last base written; -1 marks the array as empty so that
+ 		//set(maxIndex+1, ...) appends.
+ 		chb.maxIndex=-1;
+ 		for(int i=0; i<padding; i++){
+ 			chb.set(i, 'N');
+ 		}
+ 		//maxIndex is inclusive, so the loop must run through it; stopping at i<maxIndex
+ 		//silently dropped the final base of every chromosome.
+ 		for(int i=0; i<=cha.maxIndex; i++){
+ 			byte b=cha.get(i);
+ 			if(b!='N'){
+ 				chb.set(chb.maxIndex+1, b);
+ 			}
+ 		}
+ 		for(int i=0; i<padding; i++){
+ 			chb.set(chb.maxIndex+1, 'N');
+ 		}
+ 		ReadWrite.write(chb, outRoot+"chr"+chrom+Data.chromExtension(), false);
+
+ 	}
+
+ 	FastaToChromArrays2.writeInfo(outgenome, Data.numChroms, Data.name, Data.genomeSource, false, false);
+
+ }
+
+}
diff --git a/current/pacbio/SiteR.java b/current/pacbio/SiteR.java
new file mode 100755
index 0000000..002e235
--- /dev/null
+++ b/current/pacbio/SiteR.java
@@ -0,0 +1,95 @@
+package pacbio;
+
+import stream.SiteScore;
+import stream.SiteScoreR;
+import dna.Gene;
+
+/**
+ * Compact, linkable record of a mapping site.
+ * Chromosome and strand are packed into one signed int (negative means non-plus strand) and
+ * read ID and pair number into one signed long (negative means odd pair number).
+ * Instances can be chained through {@link #next} to form a singly linked list.
+ * @author Brian Bushnell
+ * @date Jul 24, 2012
+ *
+ */
+public class SiteR {
+
+ /** Creates a SiteR from the coordinates and identity stored in a SiteScoreR. */
+ public SiteR(SiteScoreR ssr){
+ 	this(ssr.start, ssr.stop, ssr.chrom, ssr.strand, ssr.numericID, ssr.pairnum);
+ }
+
+ /**
+  * @param start_ Start coordinate.
+  * @param stop_ Stop coordinate.
+  * @param chrom Chromosome number; stored negated when strand is not Gene.PLUS.
+  * @param strand Strand of the site.
+  * @param numericID Read ID; stored negated when the low bit of pairnum is set.
+  * @param pairnum Pair number of the read.
+  */
+ public SiteR(int start_, int stop_, int chrom, byte strand, long numericID, int pairnum){
+ 	start=start_;
+ 	stop=stop_;
+ 	idPairnum=((pairnum&1)==0 ? numericID : -numericID);
+ 	chromStrand=(strand==Gene.PLUS ? chrom : -chrom);
+
+ 	//Verify that the packed fields decode back to the arguments.
+ 	assert(chrom==chrom());
+ 	assert(strand==strand());
+ 	assert(numericID==numericID());
+ 	assert(pairnum==pairNum());
+ }
+
+ /** @return true if 'other' has the same start, stop, chromosome and strand. */
+ public boolean equals(SiteScore other){
+ 	return other.start==start
+ 			&& other.stop==stop
+ 			&& other.chrom==chrom()
+ 			&& other.strand==strand();
+ }
+
+ /** @return true if 'other' has the same start, stop, chromosome and strand. */
+ public boolean equals(SiteScoreR other){
+ 	return other.start==start
+ 			&& other.stop==stop
+ 			&& other.chrom==chrom()
+ 			&& other.strand==strand();
+ }
+
+ /**
+  * Appends this site and every site linked after it, each as "(text)", separated by single spaces.
+  * @param sb Buffer to append to, or null to start a new one.
+  * @return The buffer that was written.
+  */
+ public StringBuilder toTextRecursive(StringBuilder sb){
+ 	if(sb==null){
+ 		sb=new StringBuilder();
+ 	}else{
+ 		sb.append(" ");
+ 	}
+ 	sb.append('(').append(toText()).append(')');
+ 	if(next!=null){
+ 		next.toTextRecursive(sb);
+ 	}
+ 	return sb;
+ }
+
+ /** @return "start,stop,chrom,strand,numericID,pairNum" in a new buffer. */
+ public StringBuilder toText(){
+ 	final StringBuilder text=new StringBuilder();
+ 	text.append(start).append(',')
+ 		.append(stop).append(',')
+ 		.append(chrom()).append(',')
+ 		.append(strand()).append(',')
+ 		.append(numericID()).append(',')
+ 		.append(pairNum());
+ 	return text;
+ }
+
+ public String toString(){
+ 	return toText().toString();
+ }
+
+ public final int start;
+ public final int stop;
+ /** Chromosome number, negated for the non-plus strand. */
+ public final int chromStrand;
+ /** Read ID, negated when the pair number is odd. */
+ public final long idPairnum;
+ /** Next site in the chain, or null. */
+ public SiteR next;
+
+ /** @return The read ID, i.e. the magnitude of idPairnum. */
+ public long numericID(){return Math.abs(idPairnum);}
+ /** @return 0 when idPairnum is non-negative, otherwise 1. */
+ public int pairNum(){return idPairnum<0 ? 1 : 0;}
+ /** @return The chromosome number, i.e. the magnitude of chromStrand. */
+ public int chrom(){return Math.abs(chromStrand);}
+ /** @return 0 when chromStrand is non-negative, otherwise 1. */
+ public byte strand(){return (byte)(chromStrand<0 ? 1 : 0);}
+ /** @return Number of sites in the chain starting at this one (at least 1). */
+ public int listLength(){
+ 	int count=1;
+ 	for(SiteR node=next; node!=null; node=node.next){
+ 		count++;
+ 	}
+ 	return count;
+ }
+
+}
diff --git a/current/pacbio/SortSites.java b/current/pacbio/SortSites.java
new file mode 100755
index 0000000..fe6cc5b
--- /dev/null
+++ b/current/pacbio/SortSites.java
@@ -0,0 +1,298 @@
+package pacbio;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+
+import stream.SiteScoreR;
+
+
+import align2.Tools;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Aug 2, 2012
+ *
+ */
+public class SortSites {
+
+
+ /**
+  * Program entry point.
+  * args[0] is the input sites file and args[1] the output file; all further arguments are
+  * key=value options (genome/build, tempname, delete/deletefiles/deletetemp*, mode, blocksize, ignoreperfect).
+  * @param args Command-line arguments.
+  */
+ public static void main(String[] args){
+ 	System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ 	final Timer timer=new Timer();
+
+ 	//Local value handed to stack(); this is not the static field of the same purpose.
+ 	String tempPattern=null;
+
+ 	for(int idx=2; idx<args.length; idx++){
+ 		final String arg=args[idx];
+ 		final String[] parts=arg.split("=");
+ 		final String key=parts[0].toLowerCase();
+ 		final String value=(parts.length>1 ? parts[1] : null);
+
+ 		if(Parser.isJavaFlag(arg)){
+ 			continue; //jvm argument; do nothing
+ 		}
+
+ 		if(key.equals("genome") || key.equals("build")){
+ 			Data.setGenome(Integer.parseInt(value)); //Not needed
+ 		}else if(key.equals("tempname")){
+ 			tempPattern=value;
+ 		}else if(key.equals("deletefiles") || key.startsWith("deletetemp") || key.equals("delete")){
+ 			DELETE_TEMP=Tools.parseBoolean(value);
+ 		}else if(key.equals("mode")){
+ 			POSITIONMODE=(value.contains("position") || value.contains("location"));
+ 		}else if(key.equals("blocksize")){
+ 			BLOCKSIZE=Integer.parseInt(value);
+ 		}else if(key.equals("ignoreperfect")){
+ 			IGNORE_PERFECT_SITES=Tools.parseBoolean(value);
+ 		}else{
+ 			throw new RuntimeException("Unknown parameter "+args[idx]);
+ 		}
+ 	}
+
+ 	System.out.println(POSITIONMODE ? "Sorting by position." : "Sorting by ID.");
+
+ 	stack(args[0], args[1], tempPattern);
+ 	assert(sitesRead==sitesWritten || (sitesRead>=sitesWritten && IGNORE_PERFECT_SITES));
+ 	timer.stop();
+ 	System.out.println("Time: \t"+timer);
+ }
+
+ /**
+  * Sorts a sites file: every site is first appended to a temporary bucket file chosen by key(),
+  * then finish() sorts each bucket and concatenates them into the output.
+  * @param fname1 Input sites file; each line is parsed with SiteScoreR.fromTextArray.
+  * @param outname Output file.
+  * @param tempname Pattern for temporary bucket files (must contain '#'), or null to keep the
+  * current/default pattern.
+  */
+ public static void stack(String fname1, String outname, String tempname){
+
+ 	//The bucket file names are derived from the static field, not from this parameter.
+ 	//Without this assignment a caller-supplied pattern was silently ignored.
+ 	if(tempname!=null){SortSites.tempname=tempname;}
+
+ 	TextFile tf=new TextFile(fname1, false, false);
+
+ 	for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+
+ 		SiteScoreR[] array=SiteScoreR.fromTextArray(s);
+ 		sitesRead+=array.length;
+
+ 		for(SiteScoreR ssr : array){
+ 			//Perfect sites are skipped only when IGNORE_PERFECT_SITES is set.
+ 			if(!ssr.perfect || !IGNORE_PERFECT_SITES){
+ 				write(ssr);
+ 			}
+ 		}
+ 	}
+ 	tf.close();
+
+ 	System.out.println("Finished reading");
+ 	System.out.println("Read "+sitesRead+" sites.");
+
+ 	finish(outname);
+ 	System.out.println("Wrote "+sitesWritten+" sites.");
+ 	System.out.println("Wrote "+perfectWritten+" perfect sites.");
+ 	System.out.println("Wrote "+semiperfectWritten+" semiperfect sites.");
+ 	wmap.clear();
+ }
+
+ private static void write(SiteScoreR ssr){
+ long key=key(ssr);
+ TextStreamWriter tsw=wmap.get(key);
+ if(tsw==null){
+ String fname=fname(key, tempname);
+ tsw=new TextStreamWriter(fname, true, false, false);
+ tsw.start();
+ wmap.put(key, tsw);
+ }
+ tsw.print(ssr.toText().append('\n'));
+ }
+
+ /** @return The bucket key for a site: position-based when POSITIONMODE is set, otherwise ID-based. */
+ protected static final long key(SiteScoreR ssr){
+ 	if(POSITIONMODE){
+ 		return poskey(ssr.chrom, ssr.start);
+ 	}
+ 	return idkey(ssr.numericID);
+ }
+
+ /**
+  * Position bucket key: chromosome in the upper 32 bits plus the block index of the start coordinate.
+  * Negative start coordinates are treated as 0.
+  */
+ protected static final long poskey(int chrom, int start){
+ 	final long chromPart=((long)chrom)<<32;
+ 	final int block=Tools.max(start, 0)/BLOCKSIZE;
+ 	return chromPart+block;
+ }
+
+ /** ID bucket key: the read ID divided by BLOCKSIZE (integer division). */
+ protected static final long idkey(long id){
+ 	return id/BLOCKSIZE;
+ }
+
+ /**
+  * Expands a temp-file pattern for one bucket by replacing '#' with "b"+genome build+"_"+key.
+  * @param key Bucket key.
+  * @param outname Pattern containing '#', or null to use DEFAULT_TEMP_PATTERN.
+  */
+ protected static final String fname(long key, String outname){
+ 	final String pattern=(outname==null ? DEFAULT_TEMP_PATTERN : outname);
+ 	assert(pattern.contains("#")) : pattern;
+ 	final String tag="b"+Data.GENOME_BUILD+"_"+key;
+ 	return pattern.replace("#", tag);
+ }
+
+ /**
+  * Closes all bucket writers and merges the buckets, in ascending key order, into the output file.
+  * @param outname Output file name.
+  */
+ private static final void finish(String outname){
+ 	final TextStreamWriter merged=new TextStreamWriter(outname, true, false, false);
+ 	merged.start();
+
+ 	final ArrayList<Long> sortedKeys=new ArrayList<Long>(wmap.keySet());
+ 	Collections.sort(sortedKeys);
+
+ 	//Ask every bucket writer to terminate; the merge step waits for each one before reading its file.
+ 	for(final Long bucket : sortedKeys){
+ 		wmap.get(bucket).poison();
+ 	}
+
+ 	if(POSITIONMODE){
+ 		finishByPosition(merged, sortedKeys);
+ 	}else{
+ 		finishByID(merged, sortedKeys);
+ 	}
+
+ 	merged.poisonAndWait();
+ }
+
+ /**
+  * Waits for a bucket writer thread to terminate, polling up to 50 times with a 20 second join.
+  * Prints a progress message after each unsuccessful wait and fails an assertion if the thread never stops.
+  */
+ private static void awaitWriter(TextStreamWriter tsw, String fname){
+ 	for(int i=0; i<50 && tsw.isAlive(); i++){
+ 		try {
+ 			tsw.join(20000);
+ 		} catch (InterruptedException e) {
+ 			e.printStackTrace();
+ 		}
+ 		if(tsw.isAlive()){
+ 			System.err.println("Waiting for tsw "+tsw.fname+" to finish...");
+ 		}
+ 	}
+ 	if(tsw.isAlive()){
+ 		System.err.println(tsw.getClass().getName()+" for "+fname+" refused to die after a long time.");
+ 		assert(false);
+ 	}
+ }
+
+ /** Reads every site from a bucket file, then deletes the file if DELETE_TEMP is set. */
+ private static ArrayList<SiteScoreR> loadBucket(String fname){
+ 	TextFile tf=new TextFile(fname, false, false);
+ 	ArrayList<SiteScoreR> list=new ArrayList<SiteScoreR>(1000);
+ 	for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){list.add(SiteScoreR.fromText(s));}
+ 	tf.close();
+ 	if(DELETE_TEMP){
+ 		new File(fname).delete();
+ 	}
+ 	return list;
+ }
+
+ /** Writes the pending output line, if any, followed by a newline, and empties the buffer. */
+ private static void purge(TextStreamWriter out, StringBuilder sb){
+ 	if(sb.length()>0){//Purge to disk
+ 		sb.append('\n');
+ 		out.print(sb.toString());
+ 		sb.setLength(0);
+ 	}
+ }
+
+ /**
+  * Merges the buckets in position order. Sites are sorted with SiteScoreR.PCOMP inside each bucket
+  * and written tab-separated, starting a new output line whenever the chromosome increases or the
+  * start coordinate reaches the next multiple of INTERVAL.
+  * @param out Destination writer.
+  * @param keys Bucket keys in ascending order.
+  */
+ private static final void finishByPosition(TextStreamWriter out, ArrayList<Long> keys){
+
+ 	int chrom=0;
+ 	int loc=INTERVAL; //Exclusive upper bound of the position interval of the current output line.
+ 	String tab="";
+ 	StringBuilder sb=new StringBuilder(4000);
+
+ 	for(Long k : keys){
+ 		TextStreamWriter tsw=wmap.get(k);
+ 		String fname=fname(k, tempname);
+ 		awaitWriter(tsw, fname);
+
+ 		ArrayList<SiteScoreR> list=loadBucket(fname);
+ 		Collections.sort(list, SiteScoreR.PCOMP);
+
+ 		final int lim=list.size();
+ 		for(int i=0; i<lim; i++){
+ 			SiteScoreR ssr=list.get(i);
+ 			sitesWritten++;
+ 			if(ssr.semiperfect){semiperfectWritten++;}
+ 			if(ssr.perfect){perfectWritten++;}
+ 			list.set(i, null); //Release the reference as soon as the site has been consumed.
+ 			if(ssr.chrom>chrom || ssr.start>=loc){
+ 				purge(out, sb);
+ 				chrom=ssr.chrom;
+ 				loc=ssr.start;
+ 				loc=(loc-(loc%INTERVAL))+INTERVAL; //Round up to the next interval boundary.
+ 				assert(loc>ssr.start);
+ 				assert(loc-ssr.start<=INTERVAL);
+ 				assert(loc%INTERVAL==0);
+ 				tab="";
+ 			}
+ 			sb.append(tab);
+ 			sb.append(ssr.toText());
+ 			tab="\t";
+ 		}
+
+ 	}
+
+ 	//Final line; as before, a newline is written even when the buffer is empty.
+ 	sb.append('\n');
+ 	out.print(sb.toString());
+ }
+
+ /**
+  * Merges the buckets in read-ID order. Sites are sorted with SiteScoreR.IDCOMP inside each bucket
+  * and written tab-separated, one output line per (numericID, pairnum) combination.
+  * @param out Destination writer.
+  * @param keys Bucket keys in ascending order.
+  */
+ private static final void finishByID(TextStreamWriter out, ArrayList<Long> keys){
+
+ 	long id=0;
+ 	int pairnum=0;
+ 	String tab="";
+ 	StringBuilder sb=new StringBuilder(4000);
+
+ 	for(Long k : keys){
+ 		TextStreamWriter tsw=wmap.get(k);
+ 		String fname=fname(k, tempname);
+ 		awaitWriter(tsw, fname);
+
+ 		ArrayList<SiteScoreR> list=loadBucket(fname);
+ 		Collections.sort(list, SiteScoreR.IDCOMP);
+
+ 		final int lim=list.size();
+ 		for(int i=0; i<lim; i++){
+ 			SiteScoreR ssr=list.get(i);
+ 			sitesWritten++;
+ 			if(ssr.semiperfect){semiperfectWritten++;}
+ 			if(ssr.perfect){perfectWritten++;}
+ 			list.set(i, null); //Release the reference as soon as the site has been consumed.
+ 			if(ssr.numericID>id || ssr.pairnum>pairnum){
+ 				purge(out, sb);
+ 				id=ssr.numericID;
+ 				pairnum=ssr.pairnum;
+ 				tab="";
+ 			}else{
+ 				assert(ssr.numericID==id && ssr.pairnum==pairnum);
+ 			}
+ 			sb.append(tab);
+ 			sb.append(ssr.toText());
+ 			tab="\t";
+ 		}
+
+ 	}
+
+ 	//Final line; as before, a newline is written even when the buffer is empty.
+ 	sb.append('\n');
+ 	out.print(sb.toString());
+ }
+
+ /** Open writers for the temporary bucket files, indexed by bucket key; cleared at the end of stack(). */
+ private static final HashMap<Long, TextStreamWriter> wmap=new HashMap<Long, TextStreamWriter>();
+
+ /** In position mode, width of the position interval whose sites share one output line. */
+ public static int INTERVAL=200;
+ /** Divisor that maps a start position or read ID to a bucket key (see poskey and idkey). */
+ public static int BLOCKSIZE=8000000;
+ /** Number of sites parsed from the input by stack(). */
+ public static long sitesRead=0;
+ /** Number of sites written to the output by finishByPosition/finishByID. */
+ public static long sitesWritten=0;
+ /** Number of written sites flagged perfect. */
+ public static long perfectWritten=0;
+ /** Number of written sites flagged semiperfect. */
+ public static long semiperfectWritten=0;
+ /** When true, each temporary bucket file is deleted after it has been read back. */
+ public static boolean DELETE_TEMP=true;
+ /** Bucket file name pattern used when no pattern is set; '#' is replaced per bucket by fname(). */
+ public static final String DEFAULT_TEMP_PATTERN="SortSitesByIDTempFile_#.txt.gz";
+ /** Bucket file name pattern read by write() and the finish methods; null selects DEFAULT_TEMP_PATTERN. */
+ public static String tempname=null;
+ public static boolean POSITIONMODE=false; //False means sort by ID
+ public static boolean IGNORE_PERFECT_SITES=false; //Don't process perfect mappings, since they can't yield varlets.
+
+}
diff --git a/current/pacbio/SplitOffPerfectContigs.java b/current/pacbio/SplitOffPerfectContigs.java
new file mode 100755
index 0000000..270ef31
--- /dev/null
+++ b/current/pacbio/SplitOffPerfectContigs.java
@@ -0,0 +1,392 @@
+package pacbio;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.SiteScore;
+
+
+import align2.Tools;
+import dna.ChromArrayMaker;
+import dna.ChromosomeArray;
+import dna.CoverageArray;
+import dna.CoverageArray2;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import dna.Parser;
+import dna.Range;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 26, 2012
+ *
+ */
+public class SplitOffPerfectContigs {
+
+ /**
+ * Program entry point. Parses key=value arguments, obtains per-chromosome coverage (either
+ * computed from sites files or loaded from coverage files), calls writeContigs for every chromosome
+ * of the selected genome build, rewrites the genome info, and prints summary statistics.
+ * A genome build must be selected with genome= or build=.
+ * @param args Command-line arguments in key=value form; a bare key is treated as key=true.
+ */
+ public static void main(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ Timer t=new Timer();
+
+// ChromosomeArray c=new ChromosomeArray(1, (byte)1, "ANNNAAAANAAANNA");
+// System.out.println(c.toContigRanges(3));
+// System.out.println(c.toContigRanges(2));
+// System.out.println(c.toContigRanges(1));
+// assert(false);
+
+
+ //Reset so that the assertion after parsing can detect a missing genome/build argument.
+ Data.GENOME_BUILD=-1;
+ String dest=null;
+ String covfile=null;
+ String sitesfile=null;
+ String contigfile=null;
+ int trigger=50;
+ int blocklen=100;
+ int mincoverage=2;
+ int padding=4;
+ int buildout=-1;
+ String name=null;
+ String source=null;
+
+
+ for(int i=0; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : "true";
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("genome") || a.equals("build")){
+ Data.setGenome(Integer.parseInt(b));
+ name=Data.name;
+ source=Data.genomeSource;
+ System.out.println("Set Data.GENOME_BUILD to "+Data.GENOME_BUILD);
+ }else if(a.equals("outgenome") || a.equals("outbuild") || a.equals("genomeout") || a.equals("buildout")){
+ buildout=Integer.parseInt(b);
+ }else if(a.equals("out") || a.equals("outfile")){
+ dest=b;
+ }else if(a.startsWith("cov") || a.startsWith("pcov") || a.startsWith("perfectcov")){
+ covfile=b;
+ }else if(a.startsWith("sites") || a.startsWith("psites") || a.startsWith("perfectsites")){
+ sitesfile=b;
+ }else if(a.equals("padding")){
+ padding=Integer.parseInt(b);
+ }else if(a.equals("trigger")){
+ trigger=Integer.parseInt(b);
+ }else if(a.startsWith("mincov")){
+ mincoverage=Integer.parseInt(b);
+ }else if(a.equals("blocklen")){
+ blocklen=Integer.parseInt(b);
+ }else if(a.equals("contigfile")){
+ contigfile=b;
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.startsWith("breakbad") || a.startsWith("splitbad") || a.startsWith("splitchim")){
+ BREAK_BAD_CONTIGS=Tools.parseBoolean(b);
+ }else{
+ throw new RuntimeException("Unknown parameter: "+args[i]);
+ }
+ }
+
+ assert(Data.GENOME_BUILD>-1);
+ //Without an explicit output build, the input build is overwritten in place.
+ if(buildout<=0){buildout=Data.GENOME_BUILD;}
+// assert(buildout!=Data.GENOME_BUILD); //For testing
+
+ TextStreamWriter tsw=new TextStreamWriter(dest, false, true, false);
+ tsw.start();
+
+ //Break into contigs
+ long contig=1;
+
+ //Resume contig numbering from the number stored in contigfile, if that file exists.
+ if(contigfile!=null){
+ if(new File(contigfile).exists()){
+ TextFile tf=new TextFile(contigfile, false, false);
+ String s=tf.nextLine();
+ if(s!=null){contig=Long.parseLong(s);}
+ tf.close();
+ }
+ }
+
+ ArrayList<CoverageArray> calist=null;
+ if(sitesfile!=null){
+ calist=toCoverage(sitesfile, padding);
+ System.out.println("Made coverage; list size is "+calist.size());
+ }
+
+ //When writing over the input build, remove index files (names containing ".int2d")
+ //from the matching index directory, since they no longer describe the rewritten genome.
+ if(buildout==Data.GENOME_BUILD){
+ String fname=Data.chromFname(1, buildout);
+ fname=fname.replaceFirst("/genome/", "/index/");
+ fname=fname.substring(0, fname.lastIndexOf('/'));
+ File dir=new File(fname);
+ if(dir.exists()){
+ System.out.println("Deleting old index.");
+ for(File f2 : dir.listFiles()){
+ if(f2.isFile() && !f2.isDirectory() && f2.getName().contains(".int2d")){f2.delete();}
+ }
+ }
+ }
+
+ for(int chrom=1; chrom<=Data.numChroms; chrom++){
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ Data.unload(chrom, true);
+ CoverageArray ca=null;
+ if(calist!=null){
+ //Coverage computed from sites files; the list is indexed by chromosome number.
+ if(calist.size()>chrom){
+ ca=calist.get(chrom);
+ calist.set(chrom, null);
+ }
+ }else{
+ //Otherwise load coverage from disk; '#' in covfile is replaced by the chromosome number.
+ assert(covfile!=null && covfile.contains("#"));
+ ca=ReadWrite.read(CoverageArray.class, covfile.replaceFirst("#", ""+chrom), true);
+ if(ca==null){System.out.println("Can't find coverage for chrom "+chrom+" in file "+covfile.replaceFirst("#", ""+chrom));}
+ }
+ if(ca!=null){
+ contig=writeContigs(cha, ca, contig, trigger, mincoverage, blocklen, tsw, buildout, align2.Tools.max(1, padding));
+ }else{
+ System.out.println("Can't find coverage for chrom "+chrom);
+ }
+ }
+
+
+ tsw.poison();
+
+ //Persist the next unused contig number.
+ if(contigfile!=null){
+ ReadWrite.writeString(""+contig, contigfile, false);
+ }
+
+ FastaToChromArrays2.writeInfo(buildout, Data.numChroms, name, source, false, false);
+
+ t.stop();
+
+ System.out.println(" \tWrote \tKept \tDropped \tSplit");
+ System.out.println("Bases \t"+basesWritten+" \t"+basesKept+" \t"+basesDropped+" \t"+basesX);
+ System.out.println("Contigs \t"+contigsWritten+" \t"+contigsKept+" \t"+contigsDropped+" \t"+contigsX);
+ //Tools.max(...,1) guards the averages against division by zero.
+ System.out.println("Avg Len \t"+(basesWritten/Tools.max(contigsWritten,1))+" \t"+(basesKept/Tools.max(contigsKept,1))
+ +" \t"+(basesDropped/Tools.max(contigsDropped, 1))+" \t"+(basesX/Tools.max(contigsX, 1)));
+
+ System.out.println("Time:\t"+t);
+ }
+
+ /**
+ * Splits one chromosome into contig ranges and sorts them into three groups:
+ * ranges shorter than MIN_CONTIG_TO_ADD are dropped; ranges whose coverage (ignoring tipbuffer
+ * positions at each end) never falls below minAcceptableCoverage are written to tsw as FASTA
+ * records; all remaining ranges are packed, separated by runs of 'N', into a new chromosome array
+ * that is written to the output build.
+ * If BREAK_BAD_CONTIGS is set, under-covered positions that follow a sufficiently long covered
+ * stretch are first replaced by 'X' in cha and the ranges are recomputed.
+ * @param cha Chromosome to process; modified in place when BREAK_BAD_CONTIGS is set.
+ * @param ca Coverage for the chromosome.
+ * @param contig Number to assign to the next contig written to tsw.
+ * @param trigger Passed to cha.toContigRanges - presumably the length of an undefined run that separates contigs; confirm.
+ * @param minAcceptableCoverage Minimum coverage for a position to count as covered.
+ * @param fastaBlocklen Line length used when writing contig sequence.
+ * @param tsw Destination for the contigs that are split off.
+ * @param buildout Genome build number under which the remaining sequence is written.
+ * @param tipbuffer Number of positions at each end of a range that are exempt from the coverage test.
+ * @return The next unused contig number.
+ */
+ public static long writeContigs(ChromosomeArray cha, CoverageArray ca, long contig, int trigger, int minAcceptableCoverage, int fastaBlocklen,
+ TextStreamWriter tsw, int buildout, int tipbuffer){
+
+ ArrayList<Range> list=cha.toContigRanges(trigger);
+
+ int minContig=MIN_CONTIG_TO_ADD;
+
+ if(BREAK_BAD_CONTIGS){
+ for(Range r : list){
+ if(r.length>=minContig){
+// int uncovered=0;
+// for(int i=r.a; i<=r.b; i++){
+// int cov=ca.get(i);
+// if(cov<minAcceptableCoverage){uncovered++;}
+// }
+
+
+ //Forward pass
+ //Marks the first under-covered base after each run of at least minContig covered bases.
+ //contigsX is incremented only when the new mark is more than 10 positions from the previous one.
+ int lastx=-1000;
+ int contiglen=0;
+ for(int i=r.a; i<=r.b; i++){
+ int cov=ca.get(i);
+ if(cov<minAcceptableCoverage){
+ if(contiglen>=minContig){
+ byte c=cha.get(i);
+ if(c!='N' && c!='X'){basesX++;}
+ if(i-lastx>10){
+ contigsX++;
+ }
+ cha.set(i, 'X');
+ lastx=i;
+ }
+ contiglen=0;
+ }else{
+ contiglen++;
+ }
+ }
+
+ //Reverse pass
+ //Same marking, scanning from the end of the range toward the start.
+ lastx=Integer.MAX_VALUE;
+ contiglen=0;
+ for(int i=r.b; i>=r.a; i--){
+ int cov=ca.get(i);
+ if(cov<minAcceptableCoverage){
+ if(contiglen>=minContig){
+ byte c=cha.get(i);
+ if(c!='N' && c!='X'){basesX++;}
+ if(lastx-i>10){
+ contigsX++;
+ }
+ cha.set(i, 'X');
+ lastx=i;
+ }
+ contiglen=0;
+ }else{
+ contiglen++;
+ }
+ }
+ }
+ }
+ //Recompute the ranges now that 'X' marks have been inserted.
+ list=cha.toContigRanges(trigger);
+ }
+
+
+ ArrayList<Range> good=new ArrayList<Range>();
+ ArrayList<Range> bad=new ArrayList<Range>();
+ //Running size estimate for the array that will hold the "bad" ranges plus their N separators.
+ int badlen=0;
+
+ for(Range r : list){
+ if(r.length>=minContig){
+ //Minimum coverage over the range interior. If the range is too short for the loop to run,
+ //minCov stays at Integer.MAX_VALUE and the range is classified as good.
+ int minCov=Integer.MAX_VALUE;
+ for(int i=r.a+tipbuffer; i<=r.b-tipbuffer; i++){
+ minCov=Tools.min(minCov, ca.get(i));
+ }
+ if(minCov>=minAcceptableCoverage){
+ good.add(r);
+ if(verbose){
+ StringBuilder sb0=new StringBuilder(), sb1=new StringBuilder(), sb2=new StringBuilder();
+ for(int i=r.a; i<=r.b; i++){
+ int cov=ca.get(i);
+ char b=(char) cha.get(i);
+ sb0.append(b);
+ sb1.append(b+"\t");
+ sb2.append(cov+"\t");
+ }
+ System.out.println(sb0+"\n"+sb1+"\n"+sb2+"\n");
+ }
+ }else{
+ bad.add(r);
+ badlen+=r.length+N_PAD_LENGTH;
+ if(verbose){
+ StringBuilder sb0=new StringBuilder(), sb1=new StringBuilder(), sb2=new StringBuilder();
+ for(int i=r.a; i<=r.b; i++){
+ int cov=ca.get(i);
+ char b=(char) cha.get(i);
+ sb0.append(b);
+ sb1.append(b+"\t");
+ sb2.append(cov+"\t");
+ }
+ System.err.println(sb0+"\n"+sb1+"\n"+sb2+"\n");
+ }
+ }
+ }else{
+ contigsDropped++;
+ basesDropped+=r.length;
+ }
+ }
+
+ //Write each good range as a FASTA record named by its contig number.
+ for(Range r : good){
+ contigsWritten++;
+ basesWritten+=r.length;
+ String s=cha.getString(r.a, r.b);
+ tsw.print(">"+contig+"\n");
+ contig++;
+ writeContig(s, tsw, fastaBlocklen);
+// for(int i=r.a; i<=r.b; i++){cha.set(i, 'N');} //Delete "good" contigs from reference.
+ }
+
+ //Build the replacement chromosome: N_PAD_LENGTH2 leading N's, then each bad range followed by
+ //N_PAD_LENGTH N's, then enough extra N's to bring the trailing pad up to N_PAD_LENGTH2.
+ badlen=badlen+2*N_PAD_LENGTH2-N_PAD_LENGTH+10;
+ ChromosomeArray cha2=new ChromosomeArray(cha.chromosome, cha.strand, 0, badlen);
+ //maxIndex=-1 marks the array as empty so that set(maxIndex+1, ...) appends.
+ cha2.maxIndex=-1;
+ cha2.minIndex=0;
+ for(int i=0; i<N_PAD_LENGTH2; i++){
+ cha2.set(i, 'N');
+ }
+ for(Range r : bad){
+ contigsKept++;
+ basesKept+=r.length;
+
+ String s=cha.getString(r.a, r.b);
+ for(int i=0; i<s.length(); i++){
+ cha2.set(cha2.maxIndex+1, s.charAt(i));
+ }
+ for(int i=0; i<N_PAD_LENGTH; i++){
+ cha2.set(cha2.maxIndex+1, 'N');
+ }
+ }
+ for(int i=N_PAD_LENGTH; i<N_PAD_LENGTH2; i++){
+ cha2.set(cha2.maxIndex+1, 'N');
+ }
+
+// ReadWrite.writeObjectInThread(cha2, Data.chromFname(cha2.chromosome, Data.GENOME_BUILD));
+ String fname=Data.chromFname(cha2.chromosome, buildout);
+ //Make sure the output directory exists before writing.
+ {
+ File f=new File(fname.substring(0, fname.lastIndexOf('/')));
+ if(!f.exists()){
+ f.mkdirs();
+ }
+ }
+
+ ReadWrite.write(cha2, fname, false);
+
+ return contig;
+ }
+
+ /**
+  * Writes a sequence to tsw as consecutive lines of at most blocklen characters.
+  * @param sb Sequence to write.
+  * @param tsw Destination writer.
+  * @param blocklen Maximum number of characters per line.
+  */
+ public static void writeContig(CharSequence sb, TextStreamWriter tsw, int blocklen){
+ 	int from=0;
+ 	while(from<sb.length()){
+ 		final int to=Tools.min(from+blocklen, sb.length());
+ 		tsw.println(sb.subSequence(from, to));
+ 		from+=blocklen;
+ 	}
+ }
+
+
+ /**
+  * Builds per-chromosome coverage from one or more sites files.
+  * Each line holds tab-separated sites; every perfect or semiperfect site increments the coverage
+  * of the positions from start+padding through stop-padding on its chromosome.
+  * @param sitesfile Comma-separated list of sites files.
+  * @param padding Number of positions trimmed from each end of a site before it is counted.
+  * @return Coverage arrays indexed by chromosome number (index 0 is a placeholder).
+  */
+ public static ArrayList<CoverageArray> toCoverage(String sitesfile, int padding){
+ 	final ArrayList<CoverageArray> coverage=new ArrayList<CoverageArray>(8);
+ 	coverage.add(new CoverageArray2(0,1000));
+
+ 	long perfectCount=0;
+ 	long semiperfectCount=0;
+ 	long siteCount=0;
+
+ 	final String[] fnames=sitesfile.split(",");
+ 	for(final String fname : fnames){
+ 		final TextFile tf=new TextFile(fname, false, false);
+ 		String line;
+ 		while((line=tf.nextLine())!=null){
+ 			for(final String token : line.split("\t")){
+ 				final SiteScore ss=SiteScore.fromText(token);
+
+ 				//Grow the list until it has a slot for this chromosome.
+ 				while(coverage.size()<=ss.chrom){
+ 					coverage.add(new CoverageArray2(coverage.size(), 500));
+ 				}
+
+ 				if(ss.perfect || ss.semiperfect){
+ 					final CoverageArray ca=coverage.get(ss.chrom);
+ 					for(int pos=ss.start+padding; pos<=ss.stop-padding; pos++){
+ 						ca.increment(pos);
+ 					}
+ 				}
+
+ 				if(ss.perfect){perfectCount++;}
+ 				if(ss.semiperfect){semiperfectCount++;}
+ 				siteCount++;
+ 				assert(!ss.perfect || ss.semiperfect) : ss.perfect+", "+ss.semiperfect+"\n"+ss.header()+"\n"+ss.toText()+"\n"+token+"\n";
+ 			}
+ 		}
+ 		tf.close();
+ 	}
+ 	System.out.println("Read "+fnames.length+" sites file"+(fnames.length==1 ? "." : "s."));
+ 	System.out.println("sites="+siteCount+" \tsemiperfect="+semiperfectCount+" \tperfect="+perfectCount);
+ 	return coverage;
+ }
+
+
+ //Statistics accumulated by writeContigs and reported by main.
+ /** Bases in contigs written to the output stream (the "good" ranges). */
+ public static long basesWritten=0;
+ /** Bases in ranges retained in the rewritten chromosome (the "bad" ranges). */
+ public static long basesKept=0;
+ /** Bases in ranges shorter than MIN_CONTIG_TO_ADD, which are discarded. */
+ public static long basesDropped=0;
+ /** Defined bases overwritten with 'X' when BREAK_BAD_CONTIGS is set. */
+ public static long basesX=0;
+ public static long contigsWritten=0;
+ public static long contigsKept=0;
+ public static long contigsDropped=0;
+ /** Number of 'X' break points counted; marks within 10 positions of the previous mark are not counted again. */
+ public static long contigsX=0;
+
+ /** Number of 'N' bases appended after each retained range in the rewritten chromosome. */
+ public static int N_PAD_LENGTH=MergeFastaContigs.N_PAD_LENGTH;
+ public static int N_PAD_LENGTH2=MergeFastaContigs.N_PAD_LENGTH2; //for ends
+ /** Ranges shorter than this are dropped by writeContigs. */
+ public static int MIN_CONTIG_TO_ADD=50;
+ /** When true, writeContigs inserts 'X' at under-covered positions before classifying ranges. */
+ public static boolean BREAK_BAD_CONTIGS=false;
+
+ /** When true, writeContigs prints the bases and coverage of every classified range. */
+ public static boolean verbose=false;
+
+}
diff --git a/current/pacbio/StackSites.java b/current/pacbio/StackSites.java
new file mode 100755
index 0000000..612a0e6
--- /dev/null
+++ b/current/pacbio/StackSites.java
@@ -0,0 +1,312 @@
+package pacbio;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+import stream.SiteScoreR;
+
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.CoverageArray;
+import dna.CoverageArray2;
+import dna.Data;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.TextStreamWriter;
+
+import align2.ListNum;
+import align2.MultiStateAligner9PacBio;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 16, 2012
+ *
+ */
+public class StackSites {
+
+ /**
+  * Command-line entry point.
+  * Positional arguments: args[0]=primary reads file, args[1]=mate file (or "null"),
+  * args[2]=sites output file, args[3]=perfect-coverage output name containing '#'.
+  * Remaining arguments are parsed as key=value: "genome" or "build" sets the genome build,
+  * JVM flags are skipped, and anything else is ignored.
+  * @param args Command-line arguments
+  * @throws RuntimeException If fewer than four positional arguments are supplied
+  */
+ public static void main(String[] args){
+     System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+     //Fail with a readable message instead of an ArrayIndexOutOfBoundsException at the stack(...) call.
+     if(args.length<4){
+         throw new RuntimeException("Expected at least 4 arguments: <reads> <mates|null> <sites out> <pcov out containing '#'>; got "+args.length);
+     }
+
+     Timer t=new Timer();
+
+     for(int i=4; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=split.length>1 ? split[1] : null;
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(a.equals("genome") || a.equals("build")){
+             Data.setGenome(Integer.parseInt(b));
+         }
+     }
+
+     stack(args[0], args[1], args[2], args[3]);
+     t.stop();
+     System.out.println("Time: \t"+t);
+ }
+
+ public static void stack(String fname1, String fname2, String outname, String pcovoutname){
+ assert(pcovoutname.contains("#"));
+ RTextInputStream rtis=new RTextInputStream(fname1, (fname2==null || fname2.equals("null") ? null : fname2), -1);
+ ConcurrentLegacyReadInputStream cris=new ConcurrentLegacyReadInputStream(rtis, -1);
+
+ cris.start();
+ System.err.println("Started cris");
+ boolean paired=cris.paired();
+ System.err.println("Paired: "+paired);
+
+ ArrayList<CoverageArray> pcov=new ArrayList<CoverageArray>(8);
+ pcov.add(new CoverageArray2(0,1000));
+
+ Glob g=new Glob();
+
+ {
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert(paired==(r.mate!=null));
+ }
+
+ while(reads!=null && reads.size()>0){
+ //System.err.println("reads.size()="+reads.size());
+ for(Read r : reads){
+ readsProcessed++;
+
+// System.out.println("Processing read "+r.numericID);
+
+ if(r!=null){
+ if(r.sites!=null){
+// System.out.println("Adding "+r.list.size()+" sites.");
+ SiteScore original=r.originalSite;
+ for(SiteScore ss : r.sites){
+ sitesProcessed++;
+
+ //TODO: Process perfect coverage
+ {
+ boolean b=false;
+ if(ss.perfect || ss.semiperfect){
+ b=true;
+ }else{//Check for no-refs
+ int len=ss.stop-ss.start+1;
+ if(len==r.length() && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){
+ b=checkPerfection(ss.start, ss.stop, r.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f);
+ }
+ }
+ if(b){
+ while(pcov.size()<=ss.chrom){
+ pcov.add(new CoverageArray2(pcov.size(), 500));
+ }
+ CoverageArray ca=pcov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+ }
+
+ SiteScoreR ssr=new SiteScoreR(ss, r.length(), r.numericID, (byte)r.pairnum());
+
+ if(original!=null){
+ ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false);
+ }
+
+ g.add(ssr);
+ }
+// System.out.println(sitesProcessed);
+ }
+ }
+
+ if(r.mate!=null){
+ Read r2=r.mate;
+ if(r2.sites!=null){
+
+ SiteScore original=r2.originalSite;
+ for(SiteScore ss : r2.sites){
+ sitesProcessed++;
+
+ {
+ boolean b=false;
+ if(ss.perfect || ss.semiperfect){
+ b=true;
+ }else{//Check for no-refs
+ int len=ss.stop-ss.start+1;
+ if(len==r2.length() && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){
+ b=checkPerfection(ss.start, ss.stop, r2.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f);
+ }
+ }
+ if(b){
+ while(pcov.size()<=ss.chrom){
+ pcov.add(new CoverageArray2(pcov.size(), 500));
+ }
+ CoverageArray ca=pcov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+ }
+
+ SiteScoreR ssr=new SiteScoreR(ss, r2.length(), r2.numericID, (byte)r2.pairnum());
+
+ if(original!=null){
+ ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false);
+ }
+
+ g.add(ssr);
+ }
+ }
+ }
+
+// System.out.println(r.toString());
+// assert(r.list!=null);
+// assert(r.list.size()>0);
+
+ }
+ //System.err.println("returning list");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ System.err.println("Finished reading");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ System.err.println("Returned list");
+ ReadWrite.closeStream(cris);
+ System.err.println("Closed stream");
+ System.err.println("Processed "+readsProcessed+" reads.");
+ System.err.println("Processed "+sitesProcessed+" sites.");
+ }
+
+
+ for(int i=1; i<pcov.size(); i++){
+ CoverageArray ca=pcov.get(i);
+ pcov.set(i, null);
+ ca.resize(ca.maxIndex+1);
+ ReadWrite.writeObjectInThread(ca, pcovoutname.replaceFirst("#", ""+i), false);
+ }
+
+
+ TextStreamWriter out=new TextStreamWriter(outname, true, false, false);
+ out.start();
+
+ //This is split by chrom to avoid getting more than 2^31 sites in a single list.
+ //Ultimately, it may be better to split output files by chrom and output unsorted, to avoid memory usage.
+ for(int i=0; i<g.array.length; i++){
+ Collections.sort(g.array[i], SiteScoreR.PCOMP);
+ write(g.array[i], out);
+ g.array[i]=null;
+ }
+
+ out.poison();
+
+ }
+
+ private static boolean checkPerfection(int start, int stop, byte[] bases, ChromosomeArray cha, boolean rcomp, float f) {
+
+ int noref=0;
+ if(rcomp){
+ for(int i=0; i<bases.length; i++){
+ byte a=AminoAcid.baseToComplementExtended[bases[bases.length-i-1]];
+ byte b=cha.get(start+i);
+ if(b=='N'){noref++;}
+ else if(a!=b){return false;}
+ }
+ }else{
+ for(int i=0; i<bases.length; i++){
+ byte a=bases[i];
+ byte b=cha.get(start+i);
+ if(b=='N'){noref++;}
+ else if(a!=b){return false;}
+ }
+ }
+ return bases.length-noref>=f*bases.length;
+ }
+
+ /**
+  * Prints the sites of one list as text, one line per INTERVAL-wide bin of start positions,
+  * with the sites on a line separated by tabs. List elements are nulled as they are consumed.
+  * @param alsr Sites to print, in output order; null or empty lists are ignored
+  * @param out Destination writer
+  */
+ private static void write(ArrayList<SiteScoreR> alsr, TextStreamWriter out){
+     if(alsr==null || alsr.isEmpty()){return;}
+
+     final StringBuilder line=new StringBuilder();
+     int curChrom=0;
+     int binEnd=INTERVAL;
+     boolean needTab=false;
+
+     final int count=alsr.size();
+     for(int idx=0; idx<count; idx++){
+         final SiteScoreR ssr=alsr.get(idx);
+         alsr.set(idx, null);
+
+         final boolean startsNewBin=(ssr.chrom>curChrom || ssr.start>=binEnd);
+         if(startsNewBin){
+             if(line.length()>0){//Purge to disk
+                 line.append('\n');
+                 out.print(line.toString());
+                 line.setLength(0);
+             }
+             curChrom=ssr.chrom;
+             //Exclusive upper bound of the bin containing ssr.start
+             binEnd=(ssr.start-(ssr.start%INTERVAL))+INTERVAL;
+             assert(binEnd>ssr.start);
+             assert(binEnd-ssr.start<=INTERVAL);
+             assert(binEnd%INTERVAL==0);
+             needTab=false;
+         }
+
+         if(needTab){line.append("\t");}
+         line.append(ssr.toText());
+         needTab=true;
+     }
+
+     if(line.length()>0){//Purge to disk
+         line.append('\n');
+         out.print(line.toString());
+         line.setLength(0);
+     }
+ }
+
+ /**
+  * Loose correctness test: the strand must match (and the chromosome, if useChrom is set),
+  * and either the start or the stop must lie within thresh of the true coordinate.
+  * @param ss Site to test
+  * @param trueChrom True chromosome; only consulted when useChrom is true
+  * @param trueStrand True strand
+  * @param trueStart True start position
+  * @param trueStop True stop position
+  * @param thresh Maximum allowed distance for the start or the stop
+  * @param useChrom Whether to require a chromosome match
+  * @return true if the site is considered correct
+  */
+ public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, boolean useChrom){
+     if(useChrom && ss.chrom!=trueChrom){return false;}
+     if(ss.strand!=trueStrand){return false;}
+
+     assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     if(Tools.absdif(ss.start, trueStart)<=thresh){return true;}
+     return Tools.absdif(ss.stop, trueStop)<=thresh;
+ }
+
+ /** Holds sites in one list per chromosome; the array of lists grows on demand. */
+ private static class Glob{
+
+     /** Starts with eight empty per-chromosome lists. */
+     @SuppressWarnings("unchecked")
+     public Glob(){
+         final ArrayList<SiteScoreR>[] fresh=new ArrayList[8];
+         for(int c=0; c<fresh.length; c++){
+             fresh[c]=new ArrayList<SiteScoreR>();
+         }
+         array=fresh;
+     }
+
+     /**
+      * Stores a site in the list for its chromosome, enlarging the array first if needed.
+      * @param ssr Site to store
+      */
+     public void add(SiteScoreR ssr){
+         if(ssr.chrom>=array.length){
+             grow(((int)ssr.chrom*2));
+         }
+         array[ssr.chrom].add(ssr);
+     }
+
+     /** Replaces the array with a longer copy whose new slots hold empty lists. */
+     private void grow(final int newlen){
+         assert(newlen>array.length);
+         final ArrayList<SiteScoreR>[] bigger=Arrays.copyOf(array, newlen);
+         for(int c=array.length; c<bigger.length; c++){
+             bigger[c]=new ArrayList<SiteScoreR>();
+         }
+         array=bigger;
+     }
+
+     /** Lists of sites, indexed by chromosome number. */
+     public ArrayList<SiteScoreR>[] array;
+
+ }
+
+ public static final int INTERVAL=200; //Width of the start-position bins used by write(): sites whose starts share a bin are printed on one line
+ public static long readsProcessed=0; //Incremented in stack() once per element of each read list (mates are not counted separately)
+ public static long sitesProcessed=0; //Incremented in stack() once per SiteScore visited, for reads and mates alike
+
+}
diff --git a/current/pacbio/StackSites2.java b/current/pacbio/StackSites2.java
new file mode 100755
index 0000000..43024b5
--- /dev/null
+++ b/current/pacbio/StackSites2.java
@@ -0,0 +1,501 @@
+package pacbio;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+import stream.SiteScoreR;
+
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.CoverageArray;
+import dna.CoverageArray2;
+import dna.Data;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+import align2.ListNum;
+import align2.MultiStateAligner9PacBio;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 16, 2012
+ *
+ */
+public class StackSites2 {
+
+ public static void main(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+ Timer t=new Timer();
+
+ String tempname=null;
+ Data.GENOME_BUILD=-1;
+
+ for(int i=4; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(a.equals("genome") || a.equals("build")){
+ Data.setGenome(Integer.parseInt(b));
+ }else if(a.equals("tempname")){
+ tempname=b;
+ }else if(a.equals("deletefiles") || a.startsWith("deletetemp") || a.equals("delete")){
+ DELETE_TEMP=(Tools.parseBoolean(b));
+ }else if(a.equals("blocksize")){
+ BLOCKSIZE=(Integer.parseInt(b));
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ if(Data.GENOME_BUILD<0){throw new RuntimeException("Please specify genome build.");}
+
+ stack(args[0], args[1], args[2], args[3], tempname);
+ t.stop();
+ System.out.println("Time: \t"+t);
+ }
+
+ public static void stack(String fname1, String fname2, String outname, String pcovoutname, String tempname){
+ assert(pcovoutname.contains("#"));
+ final RTextInputStream rtis=new RTextInputStream(fname1, (fname2==null || fname2.equals("null") ? null : fname2), -1);
+ final ConcurrentLegacyReadInputStream cris=new ConcurrentLegacyReadInputStream(rtis, -1);
+
+ cris.start();
+ System.err.println("Started cris");
+ final boolean paired=cris.paired();
+ System.err.println("Paired: "+paired);
+
+ final ArrayList<CoverageArray> pcov;
+ final ArrayList<CoverageArray> truePcov;
+ final ArrayList<CoverageArray> cov;
+
+ {
+ int len=(Data.GENOME_BUILD<0 ? 8 : Data.numChroms+1);
+
+ pcov=new ArrayList<CoverageArray>(len);
+ truePcov=new ArrayList<CoverageArray>(len);
+ cov=new ArrayList<CoverageArray>(len);
+
+ System.out.println("len="+len+"; Data.numChroms="+Data.numChroms);
+
+ pcov.add(null);
+ truePcov.add(null);
+ cov.add(null);
+
+ for(int i=1; i<len; i++){
+ if(Data.GENOME_BUILD<0){
+ pcov.add(new CoverageArray2(-1, 500));
+ truePcov.add(new CoverageArray2(-1, 500));
+ cov.add(new CoverageArray2(-1, 500));
+ }else{
+ pcov.add(new CoverageArray2(-1, Data.chromLengths[i]+1));
+ truePcov.add(new CoverageArray2(-1, Data.chromLengths[i]+1));
+ cov.add(new CoverageArray2(-1, Data.chromLengths[i]+1));
+ }
+ }
+ }
+
+
+ final Glob g=new Glob(tempname);
+
+ {
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert(paired==(r.mate!=null));
+ }
+
+ while(reads!=null && reads.size()>0){
+ //System.err.println("reads.size()="+reads.size());
+ for(Read r : reads){
+ readsProcessed++;
+
+// System.out.println("Processing read "+r.numericID);
+
+ if(r!=null){
+ if(r.sites!=null){
+// System.out.println("Adding "+r.list.size()+" sites.");
+ SiteScore original=r.originalSite;
+ for(SiteScore ss : r.sites){
+ sitesProcessed++;
+
+ //TODO: Process perfect coverage
+ {
+ boolean b=false;
+ if(ss.semiperfect){
+ b=true;
+ }else{//Check for no-refs
+ int len=ss.stop-ss.start+1;
+ if(len==r.length() && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){
+ b=checkPerfection(ss.start, ss.stop, r.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f);
+ }
+ }
+ if(b){
+ while(pcov.size()<=ss.chrom){
+ pcov.add(new CoverageArray2(pcov.size(), Data.chromLengths[pcov.size()]));
+ truePcov.add(new CoverageArray2(truePcov.size(), Data.chromLengths[truePcov.size()]));
+ }
+ CoverageArray ca=pcov.get(ss.chrom);
+ CoverageArray tca=truePcov.get(ss.chrom);
+ for(int i=ss.start+PCOV_TIP_DIST; i<=ss.stop-PCOV_TIP_DIST; i++){
+ ca.increment(i);
+ }
+ if(ss.perfect){
+ for(int i=ss.start; i<=ss.stop; i++){
+ tca.increment(i);
+ }
+ }
+ }
+ {
+ while(cov.size()<=ss.chrom){
+ cov.add(new CoverageArray2(cov.size(), Data.chromLengths[cov.size()]));
+ }
+ CoverageArray ca=cov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+ }
+
+ SiteScoreR ssr=new SiteScoreR(ss, r.length(), r.numericID, (byte)r.pairnum());
+
+ if(original!=null){
+ ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false);
+ }
+
+ g.write(ssr);
+ }
+// System.out.println(sitesProcessed);
+ }
+ }
+
+ if(r.mate!=null){
+ Read r2=r.mate;
+ if(r2.sites!=null){
+
+ SiteScore original=r2.originalSite;
+ for(SiteScore ss : r2.sites){
+ sitesProcessed++;
+
+ {
+ boolean b=false;
+ if(ss.semiperfect){
+ b=true;
+ }else{//Check for no-refs
+ int len=ss.stop-ss.start+1;
+ if(len==r2.length() && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){
+ b=checkPerfection(ss.start, ss.stop, r2.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f);
+ }
+ }
+ if(b){
+ while(pcov.size()<=ss.chrom){
+ pcov.add(new CoverageArray2(pcov.size(), Data.chromLengths[pcov.size()]));
+ truePcov.add(new CoverageArray2(truePcov.size(), Data.chromLengths[truePcov.size()]));
+ }
+ CoverageArray ca=pcov.get(ss.chrom);
+ CoverageArray tca=truePcov.get(ss.chrom);
+ for(int i=ss.start+PCOV_TIP_DIST; i<=ss.stop-PCOV_TIP_DIST; i++){
+ ca.increment(i);
+ }
+ if(ss.perfect){
+ for(int i=ss.start; i<=ss.stop; i++){
+ tca.increment(i);
+ }
+ }
+ }
+ {
+ while(cov.size()<=ss.chrom){
+ cov.add(new CoverageArray2(cov.size(), Data.chromLengths[cov.size()]));
+ }
+ CoverageArray ca=cov.get(ss.chrom);
+ for(int i=ss.start; i<=ss.stop; i++){
+ ca.increment(i);
+ }
+ }
+ }
+
+ SiteScoreR ssr=new SiteScoreR(ss, r2.length(), r2.numericID, (byte)r2.pairnum());
+
+ if(original!=null){
+ ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false);
+ }
+
+ g.write(ssr);
+ }
+ }
+ }
+
+// System.out.println(r.toString());
+// assert(r.list!=null);
+// assert(r.list.size()>0);
+
+ }
+ //System.err.println("returning list");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ //System.err.println("fetching list");
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ System.out.println("Finished reading");
+ cris.returnList(ln.id, ln.list.isEmpty());
+ System.out.println("Returned list");
+ ReadWrite.closeStream(cris);
+ System.out.println("Closed stream");
+ System.out.println("Processed "+readsProcessed+" reads.");
+ System.out.println("Processed "+sitesProcessed+" sites.");
+ }
+
+
+ for(int i=1; i<pcov.size(); i++){
+ CoverageArray ca=pcov.get(i);
+// pcov.set(i, null);
+ if(ca.maxIndex<.995*ca.arrayLength()){
+ ca.resize(ca.maxIndex+1);
+ }
+ ReadWrite.writeObjectInThread(ca, pcovoutname.replaceFirst("#", ""+i), false);
+ }
+
+ finish(g, outname, pcov, truePcov, cov);
+ System.out.println("Retained "+sitesOut+" sites.");
+
+ }
+
+ /**
+  * Closes all temp-file writers, then reads the temp files back in key order, keeps the sites
+  * accepted by retainSite(), sorts each file's sites with SiteScoreR.PCOMP, and prints them to
+  * outname with one line per INTERVAL-wide bin of start positions (tab-separated within a line).
+  * Temp files are deleted afterwards when DELETE_TEMP is set.
+  * TODO - thread this by chrom
+  * @param g Holder of the temp-file writers
+  * @param outname Output file for retained sites
+  * @param pcov Semiperfect coverage per chromosome
+  * @param truePcov Perfect coverage per chromosome
+  * @param cov Total coverage per chromosome
+  */
+ private static final void finish(Glob g, String outname, ArrayList<CoverageArray> pcov, ArrayList<CoverageArray> truePcov, ArrayList<CoverageArray> cov){
+
+     final TextStreamWriter out=new TextStreamWriter(outname, true, false, false);
+     out.start();
+     ArrayList<Long> keys=new ArrayList<Long>(g.wmap.size());
+     keys.addAll(g.wmap.keySet());
+     Collections.sort(keys);
+
+     //Ask every temp writer to finish before waiting on any of them
+     for(Long k : keys){
+         TextStreamWriter tsw=g.wmap.get(k);
+         tsw.poison();
+     }
+
+     int chrom=0;
+     int loc=INTERVAL;
+     String tab="";
+     StringBuilder sb=new StringBuilder(4000);
+
+     for(Long k : keys){
+         TextStreamWriter tsw=g.wmap.get(k);
+         String fname=Glob.fname(k, g.tempname);
+
+         //Wait up to 50 x 20 seconds for the writer thread to terminate
+         for(int i=0; i<50 && tsw.isAlive(); i++){
+             try {
+                 tsw.join(20000);
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+             if(tsw.isAlive()){
+                 System.err.println("Waiting for tsw "+tsw.fname+" to finish...");
+             }
+         }
+         if(tsw.isAlive()){
+             System.err.println(tsw.getClass().getName()+" for "+fname+" refused to die after a long time.");
+             assert(false);
+         }
+
+         TextFile tf=new TextFile(fname, false, false);
+         ArrayList<SiteScoreR> list=new ArrayList<SiteScoreR>(1000);
+         for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             SiteScoreR ssr=SiteScoreR.fromText(s);
+
+             assert(pcov.size()>=ssr.chrom) : ssr.chrom+", "+pcov.size()+", "+truePcov.size()+", "+cov.size();
+             final int c=ssr.chrom;
+             //Chromosomes without pcov/truePcov arrays are represented by the FAKE placeholder
+             boolean retain=retainSite(ssr, (pcov.size()>c ? pcov.get(c) : FAKE), (truePcov.size()>c ? truePcov.get(c) : FAKE), (cov.size()>c ? cov.get(c) : null));
+             if(retain){
+                 list.add(ssr);
+                 sitesOut++;
+             }
+         }
+         tf.close();
+         if(DELETE_TEMP){
+             new File(fname).delete();
+         }
+         Collections.sort(list, SiteScoreR.PCOMP);
+
+         final int lim=list.size();
+         for(int i=0; i<lim; i++){
+             SiteScoreR ssr=list.get(i);
+             list.set(i, null);
+             if(ssr.chrom>chrom || ssr.start>=loc){
+                 if(sb.length()>0){//Purge to disk
+                     sb.append('\n');
+                     out.print(sb.toString());
+                     sb.setLength(0);
+                 }
+                 chrom=ssr.chrom;
+                 loc=ssr.start;
+                 loc=(loc-(loc%INTERVAL))+INTERVAL;
+                 assert(loc>ssr.start);
+                 assert(loc-ssr.start<=INTERVAL);
+                 assert(loc%INTERVAL==0);
+                 tab="";
+             }
+             sb.append(tab);
+             sb.append(ssr.toText());
+             tab="\t";
+         }
+
+     }
+
+     //Flush the last buffered line; skipped when nothing was retained so the output gets no blank line.
+     if(sb.length()>0){
+         sb.append('\n');
+         out.print(sb.toString());
+     }
+     out.poisonAndWait();
+ }
+
+ private static boolean retainSite(SiteScoreR ssr, CoverageArray pcov, CoverageArray tpcov, CoverageArray cov){ //Returns true to keep ssr; whenever a non-semiperfect site is rejected, its span is also subtracted from cov
+ if(ssr.semiperfect && !ssr.perfect){return true;} //For tip extension
+ assert(cov!=null && cov!=FAKE) : (cov==FAKE)+", "+ssr.chrom;
+
+ if(!ssr.semiperfect){ //Typical flawed read
+ assert(!ssr.perfect);
+ boolean toss=true;
+ if(pcov==null || tpcov==null){ //NOTE(review): a null tpcov skips the toss here but would still reach tpcov.get(j) below; finish() passes FAKE rather than null
+ toss=false;
+ }else{
+ for(int j=ssr.start-PCOV_TIP_DIST; toss && j<=ssr.stop+PCOV_TIP_DIST; j++){ //Toss only if every position of the padded span reaches MIN_PCOV_TO_TOSS in both pcov and tpcov
+ toss=(pcov.get(j)>=MIN_PCOV_TO_TOSS && tpcov.get(j)>=MIN_PCOV_TO_TOSS);
+ }
+ }
+ if(toss){
+ for(int j=ssr.start; j<=ssr.stop; j++){cov.increment(j, -1);} //Remove this site's contribution from total coverage
+ return false;
+ }
+ }
+
+ boolean alwaysLowCov=true; //Stays true while cov<MIN_COV_TO_RETAIN at every position scanned
+ boolean alwaysTooPerfect=true; //Stays true while c-tp<tp at every position scanned
+ boolean onlyPerfect=true; //Stays true while tp>0 at every position scanned
+
+ for(int j=ssr.start; (alwaysLowCov || alwaysTooPerfect || onlyPerfect) && j<=ssr.stop; j++){ //Stops early once all three flags are false
+ int c=cov.get(j);
+ int tp=tpcov.get(j);
+
+ alwaysLowCov=alwaysLowCov && c<MIN_COV_TO_RETAIN;
+ alwaysTooPerfect=alwaysTooPerfect && c-tp<tp;
+ onlyPerfect=onlyPerfect && tp>0;
+ }
+
+ if(alwaysLowCov || (alwaysTooPerfect && !ssr.semiperfect) || onlyPerfect){ //Reject; the too-perfect test applies only to non-semiperfect sites
+ if(!ssr.semiperfect){
+ for(int j=ssr.start; j<=ssr.stop; j++){cov.increment(j, -1);}
+ }
+ return false;
+ }
+
+ return true;
+ }
+
+ private static boolean checkPerfection(int start, int stop, byte[] bases, ChromosomeArray cha, boolean rcomp, float f) {
+
+ int noref=0;
+ if(rcomp){
+ for(int i=0; i<bases.length; i++){
+ byte a=AminoAcid.baseToComplementExtended[bases[bases.length-i-1]];
+ byte b=cha.get(start+i);
+ if(b=='N'){noref++;}
+ else if(a!=b){return false;}
+ }
+ }else{
+ for(int i=0; i<bases.length; i++){
+ byte a=bases[i];
+ byte b=cha.get(start+i);
+ if(b=='N'){noref++;}
+ else if(a!=b){return false;}
+ }
+ }
+ return bases.length-noref>=f*bases.length;
+ }
+
+ /**
+  * Loose correctness test: the strand must match (and the chromosome, if useChrom is set),
+  * and either the start or the stop must lie within thresh of the true coordinate.
+  * @param ss Site to test
+  * @param trueChrom True chromosome; only consulted when useChrom is true
+  * @param trueStrand True strand
+  * @param trueStart True start position
+  * @param trueStop True stop position
+  * @param thresh Maximum allowed distance for the start or the stop
+  * @param useChrom Whether to require a chromosome match
+  * @return true if the site is considered correct
+  */
+ public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, boolean useChrom){
+     if(useChrom && ss.chrom!=trueChrom){return false;}
+     if(ss.strand!=trueStrand){return false;}
+
+     assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop;
+     assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop;
+
+     if(Tools.absdif(ss.start, trueStart)<=thresh){return true;}
+     return Tools.absdif(ss.stop, trueStop)<=thresh;
+ }
+
+ /** Routes sites to temp files, one file (and one writer thread) per chromosome/position block. */
+ private static class Glob{
+
+     /** @param tempPattern_ Temp-file name pattern; null selects DEFAULT_TEMP_PATTERN */
+     public Glob(String tempPattern_){
+         if(tempPattern_ == null){
+             tempname=DEFAULT_TEMP_PATTERN;
+         }else{
+             tempname=tempPattern_;
+         }
+     }
+
+     /**
+      * Prints the site as one text line to the writer for its block,
+      * creating and starting that writer on first use.
+      * @param ssr Site to spool
+      */
+     public void write(SiteScoreR ssr){
+         final long blockKey=key(ssr.chrom, ssr.start);
+         TextStreamWriter writer=wmap.get(blockKey);
+         if(writer==null){
+             writer=new TextStreamWriter(fname(blockKey, tempname), true, false, false);
+             writer.start();
+             wmap.put(blockKey, writer);
+         }
+         writer.print(ssr.toText().append('\n'));
+     }
+
+     /** Chromosome in the upper 32 bits, plus the BLOCKSIZE-wide block index of start (negative starts count as 0). */
+     protected static final long key(int chrom, int start){
+         return ((long)chrom<<32)+(Tools.max(start, 0))/BLOCKSIZE;
+     }
+
+     /**
+      * Temp-file name for a key: every '#' in the pattern becomes "b" + genome build + "_" + key.
+      * @param key Block key from key(int, int)
+      * @param outname Name pattern containing '#'; null selects DEFAULT_TEMP_PATTERN
+      */
+     protected static final String fname(long key, String outname){
+         final String pattern=(outname==null ? DEFAULT_TEMP_PATTERN : outname);
+         assert(pattern.contains("#")) : pattern;
+         return pattern.replace("#", "b"+Data.GENOME_BUILD+"_"+key);
+     }
+
+     /** Open writers, indexed by block key. */
+     final HashMap<Long, TextStreamWriter> wmap=new HashMap<Long, TextStreamWriter>();
+     /** Temp-file name pattern in use. */
+     final String tempname;
+
+ }
+
+ /** Sites will be written to files, each containing an index range of this size.
+ * Larger means fewer files, but more memory used when reading the files (at a later stage).
+ */
+ public static int BLOCKSIZE=8000000;
+
+ /** Sites are grouped into intervals (by start location) and treated as an array of arrays.
+ * All sites in an interval are printed as one line of text. */
+ public static final int INTERVAL=200;
+ public static long readsProcessed=0; //Incremented in stack() once per element of each read list
+ public static long sitesProcessed=0; //Incremented in stack() once per SiteScore visited
+ public static long sitesOut=0; //Incremented in finish() for every site accepted by retainSite()
+ public static boolean DELETE_TEMP=true; //When true, finish() deletes each temp file after reading it
+ public static final String DEFAULT_TEMP_PATTERN="StackSites2TempFile_#.txt.gz";
+ /** Start incrementing coverage this far in from the site tips. */
+ public static int PCOV_TIP_DIST=6;
+
+ /** Toss sites from areas with less than this coverage, since they can't be used to call vars */
+ public static int MIN_COV_TO_RETAIN=2;
+ /** Toss a flawed (non-semiperfect) site when pcov and truePcov are both at least this value across its whole span, extended by PCOV_TIP_DIST on each side */
+ public static int MIN_PCOV_TO_TOSS=3;
+
+ private static final CoverageArray FAKE=new CoverageArray2(-1, 500); //Placeholder that finish() passes to retainSite() for chromosomes lacking a pcov/truePcov array
+}
diff --git a/current/stream/ArrayListSet.java b/current/stream/ArrayListSet.java
new file mode 100755
index 0000000..e246d11
--- /dev/null
+++ b/current/stream/ArrayListSet.java
@@ -0,0 +1,198 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+
+import tax.TaxNode;
+import tax.TaxTree;
+
+/**
+ * Associates reads with named lists.
+ * Designed for dynamically demultiplexing reads into output streams with MultiCros.
+ * This class is not thread-safe; one should be instantiated per thread.
+ * @author Brian Bushnell
+ * @date Apr 2, 2015
+ *
+ */
+public class ArrayListSet {
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Initialization        ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * Create an ArrayListSet without a TaxTree, using the "phylum" level.
+     * @param ordered_ Whether input order should be maintained. Unimplemented.
+     */
+    public ArrayListSet(boolean ordered_){
+        this(ordered_, null, TaxTree.stringToLevel("phylum"));
+    }
+
+    /**
+     * Create an ArrayListSet with an optional TaxTree and level.
+     * The tree is used to assign reads to a list based on the taxonomy of the name,
+     * rather than the name itself.
+     * @param ordered_ Whether input order should be maintained. Unimplemented.
+     * @param tree_ A taxonomic tree.
+     * @param taxLevel_ The minimum level in the tree to stop.
+     */
+    public ArrayListSet(boolean ordered_, TaxTree tree_, int taxLevel_){
+        this.ordered=ordered_;
+        this.tree=tree_;
+        this.taxLevel=taxLevel_;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Outer Methods         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Adds the read to the list of every supplied name. */
+    public void add(Read r, Iterable<String> names){
+        for(String listName : names){
+            add(r, listName);
+        }
+    }
+
+    /** Adds the read to the named list, creating the list if necessary. */
+    public void add(Read r, String name){
+        getPack(name, true).add(r);
+    }
+
+    /** Adds the read to the list with this numeric id, creating the list if necessary. */
+    public void add(Read r, int id){
+        getPack(id, true).add(r);
+    }
+
+    /** @return The reads currently held for this name (the list is detached), or null if none. */
+    public ArrayList<Read> getAndClear(String name){
+        final Pack found=getPack(name, false);
+        if(found==null){return null;}
+        return found.getAndClear();
+    }
+
+    /** @return The reads currently held for this id (the list is detached), or null if none. */
+    public ArrayList<Read> getAndClear(int id){
+        final Pack found=getPack(id, false);
+        if(found==null){return null;}
+        return found.getAndClear();
+    }
+
+    /** @return The names of all lists created by name, in creation order. */
+    public Collection<String> getNames(){
+        return nameList;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        TaxId Methods         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Look up the sequence name, which should start with a gi or ncbi number, and
+     * associate the read with the ancestor node at some taxonomic level.
+     * @param r Read to add
+     * @param name Sequence name to look up in the tree
+     */
+    public void addByTaxid(Read r, String name){
+        final int taxid=nameToTaxid(name);
+        addByTaxid(r, taxid);
+    }
+
+    /** Adds the read to the list whose name is the decimal form of taxid. */
+    public void addByTaxid(Read r, int taxid){
+        getPack(Integer.toString(taxid), true).add(r);
+    }
+
+    /** Dispatches on list size: nothing for empty, the single-name path for one, the set-based path otherwise. */
+    public void addByTaxid(Read r, ArrayList<String> names){
+        final int count=names.size();
+        if(count==0){
+            return;
+        }
+        if(count==1){
+            addByTaxid(r, names.get(0));
+        }else{
+            addByTaxid(r, (Iterable<String>)names);
+        }
+    }
+
+    /** Resolves every name to a taxid and adds the read once per distinct taxid. */
+    public void addByTaxid(Read r, Iterable<String> names){
+        HashSet<Integer> distinctIds=tls.get();
+        if(distinctIds==null){
+            distinctIds=new HashSet<Integer>();
+            tls.set(distinctIds);
+        }
+        assert(distinctIds.isEmpty());
+        for(String seqName : names){
+            distinctIds.add(nameToTaxid(seqName));
+        }
+        for(Integer taxid : distinctIds){
+            addByTaxid(r, taxid.intValue());
+        }
+        distinctIds.clear();
+    }
+
+    /** @return The id of the node found by tree.getNode(name, taxLevel), or -1 if there is none. */
+    private int nameToTaxid(String name){
+        final TaxNode node=tree.getNode(name, taxLevel);
+        if(node==null){return -1;}
+        return node.id;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Inner Methods         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Fetches the Pack registered under this name; if absent and 'add' is set, creates one. */
+    private Pack getPack(String name, boolean add){
+        final Pack existing=stringMap.get(name);
+        if(existing!=null || !add){return existing;}
+        return new Pack(name);
+    }
+
+    /** Fetches the Pack stored at this index; if absent and 'add' is set, creates one. */
+    private Pack getPack(int id, boolean add){
+        final Pack existing=(packList.size()>id ? packList.get(id) : null);
+        if(existing!=null || !add){return existing;}
+        return new Pack(id);
+    }
+
+    public String toString(){
+        return nameList.toString();
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Nested Classes        ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** A lazily-allocated list of reads that registers itself with the enclosing set on creation. */
+    private class Pack {
+
+        /** Creates a named pack at the next free index and records its name. */
+        Pack(String s){
+            assert(s==null || !stringMap.containsKey(s));
+            name=s;
+            id=packList.size();
+            nameList.add(s);
+            packList.add(this);
+            if(s!=null){
+                stringMap.put(s, this);
+            }
+        }
+
+        /** Creates an unnamed pack at index x, padding packList with nulls up to that index. */
+        Pack(int x){
+            name=null;
+            id=x;
+            while(packList.size()<=x){
+                packList.add(null);
+            }
+            assert(packList.get(x)==null);
+            packList.set(x, this);
+        }
+
+        public void add(Read r){
+            ArrayList<Read> target=list;
+            if(target==null){
+                target=new ArrayList<Read>();
+                list=target;
+            }
+            target.add(r);
+        }
+
+        /** Hands back the current list (possibly null) and forgets it. */
+        public ArrayList<Read> getAndClear(){
+            final ArrayList<Read> detached=list;
+            list=null;
+            return detached;
+        }
+
+        public String toString(){
+            return "Pack "+name;
+        }
+
+        final String name;
+        @SuppressWarnings("unused")
+        final int id;
+        private ArrayList<Read> list;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Fields            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    private final boolean ordered;
+    private final ArrayList<String> nameList=new ArrayList<String>();
+    private final ArrayList<Pack> packList=new ArrayList<Pack>();
+    private final LinkedHashMap<String, Pack> stringMap=new LinkedHashMap<String, Pack>();
+    private final int taxLevel;//=TaxTree.stringToLevel("phylum");
+    private final TaxTree tree;
+    private final ThreadLocal<HashSet<Integer>> tls=new ThreadLocal<HashSet<Integer>>();
+
+
+}
diff --git a/current/stream/ByteBuilder.java b/current/stream/ByteBuilder.java
new file mode 100755
index 0000000..02ac8af
--- /dev/null
+++ b/current/stream/ByteBuilder.java
@@ -0,0 +1,405 @@
+package stream;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
+import ukmer.Kmer;
+
+import dna.AminoAcid;
+
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 8, 2013
+ *
+ */
+public final class ByteBuilder implements Serializable {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -4786450129730831665L;
+
+ /** Entry point with no behavior; kept so existing launch configurations continue to resolve. */
+ public static void main(String[] args){
+     //Nothing to do: the former body only allocated a StringBuilder that was never used.
+ }
+
+ /** Creates an empty builder with a 32-byte backing array. */
+ public ByteBuilder(){
+     this.array=new byte[32];
+ }
+
+ /**
+  * Creates an empty builder with the requested capacity.
+  * @param initial Capacity in bytes; asserted to be at least 1
+  */
+ public ByteBuilder(int initial){
+     assert(initial>=1);
+     this.array=new byte[initial];
+ }
+
+ /**
+  * Creates a builder holding the text form of an object.
+  * @param o Object whose toString() is copied in; must not be null
+  */
+ public ByteBuilder(Object o){
+     final String text=o.toString();
+     this.array=new byte[text.length()+1];
+     append(text);
+ }
+
+ /**
+  * Copy constructor; the new builder holds the same bytes in its own array.
+  * The copy always has capacity for at least one byte: a zero-length backing
+  * array cannot be grown by doubling, so a copy of an empty builder would
+  * otherwise be unusable for appends.
+  * @param bb Builder to copy
+  */
+ public ByteBuilder(ByteBuilder bb){
+     length=bb.length();
+     array=Arrays.copyOf(bb.array, Math.max(length, 1));
+ }
+
+
+ /**
+  * Appends x in fixed-point notation.
+  * Formatting uses Locale.ROOT so the decimal separator is always '.', matching append(float).
+  * @param x Value to append
+  * @param places Number of digits after the decimal point
+  * @return This builder
+  */
+ public ByteBuilder append(float x, int places){return append(String.format(java.util.Locale.ROOT, "%."+places+"f", x));}
+ /**
+  * Appends x in fixed-point notation.
+  * Formatting uses Locale.ROOT so the decimal separator is always '.', matching append(double).
+  * @param x Value to append
+  * @param places Number of digits after the decimal point
+  * @return This builder
+  */
+ public ByteBuilder append(double x, int places){return append(String.format(java.util.Locale.ROOT, "%."+places+"f", x));}
+
+ /** Appends Float.toString(x). */
+ public ByteBuilder append(float x){return append(Float.toString(x));}
+ /** Appends Double.toString(x). */
+ public ByteBuilder append(double x){return append(Double.toString(x));}
+ /** Appends the bytes of "true" or "false". */
+ public ByteBuilder append(boolean x){return append(x ? tbool : fbool);}
+
+
+ /**
+  * Appends one character, narrowed to a byte (only the low 8 bits are kept).
+  * @return This builder
+  */
+ public ByteBuilder append(char x){
+     if(array.length<=length){expand();}
+     array[length++]=(byte)x;
+     return this;
+ }
+ /**
+  * Appends one byte.
+  * @return This builder
+  */
+ public ByteBuilder append(byte x){
+     if(array.length<=length){expand();}
+     array[length++]=x;
+     return this;
+ }
+
+ /** Appends the bases of a Kmer object, using its key() words and its k. */
+ public ByteBuilder appendKmer(Kmer kmer) {
+     final long[] words=kmer.key();
+     return appendKmer(words, kmer.k);
+ }
+
+ /**
+  * Appends every word of a multi-word kmer in array order; each word is decoded with the same k.
+  * @return This builder
+  */
+ public ByteBuilder appendKmer(long[] kmer, int k) {
+     for(int i=0; i<kmer.length; i++){
+         appendKmer(kmer[i], k);
+     }
+     return this;
+ }
+
+ /**
+  * Appends k base letters decoded from a kmer packed at 2 bits per base.
+  * @param kmer Packed kmer bits
+  * @param k Number of bases to emit
+  * @return This builder
+  */
+ public ByteBuilder appendKmer(long kmer, int k) {
+     //The helper's result is read two bits at a time from the low-order end.
+     //NOTE(review): presumably reverse-complementing the bitwise complement just reverses base order - confirm against AminoAcid.
+     long bits=AminoAcid.reverseComplementBinaryFast(~kmer, k);
+     for(int remaining=k; remaining>0; remaining--){
+         final int code=(int)(bits&3);
+         append((char)AminoAcid.numberToBase[code]);
+         bits>>=2;
+     }
+     return this;
+ }
+
+ /**
+  * Appends the decimal text of an int without creating a String (except for Integer.MIN_VALUE).
+  * Digits are produced least-significant first into numbuffer, two at a time via the
+  * ones100/tens100 tables, then copied out in reverse.
+  * @param x Value to append
+  * @return This builder
+  */
+ public ByteBuilder append(int x){
+     expand(11); //A sign plus at most 10 digits
+     if(x==0){
+         array[length]='0';
+         length++;
+         return this;
+     }
+     if(x<0){
+         if(x==Integer.MIN_VALUE){
+             //Negating MIN_VALUE overflows, so take the String route
+             return append(Integer.toString(Integer.MIN_VALUE));
+         }
+         array[length]='-';
+         length++;
+         x=-x;
+     }
+
+     int used=0;
+     for(; x>9; x/=100){
+         final int pair=x%100;
+         numbuffer[used++]=ones100[pair];
+         numbuffer[used++]=tens100[pair];
+     }
+     if(x>0){
+         //At most one digit remains here, and ones100[d] is the character for digit d
+         numbuffer[used++]=ones100[x];
+     }
+
+     while(used>0){
+         array[length++]=numbuffer[--used];
+     }
+     return this;
+ }
+
+ /**
+  * Appends the decimal text of a long.
+  * Values in int range are delegated to append(int). Wider values are written
+  * least-significant digit first into numbuffer and copied out in reverse.
+  * @param x Value to append
+  * @return This builder
+  */
+ public ByteBuilder append(long x){
+     //Inclusive lower bound: Integer.MIN_VALUE is handled correctly by append(int).
+     //Excluding it here left it to a branch that called append(long) again without end.
+     if(x>=Integer.MIN_VALUE && x<=Integer.MAX_VALUE){return append((int)x);}
+     //Long.MIN_VALUE has no positive counterpart; -x would still be negative and no digits would be written.
+     if(x==Long.MIN_VALUE){return append(Long.toString(x));}
+     expand(20); //A sign plus at most 19 digits
+     if(x<0){
+         array[length]='-';
+         length++;
+         x=-x;
+     }
+
+     int pos=0;
+     while(x>9){
+         int y=(int)(x%100);
+         x=x/100;
+         numbuffer[pos]=ones100[y];
+         pos++;
+         numbuffer[pos]=tens100[y];
+         pos++;
+     }
+     while(x>0){
+         int y=(int)(x%10);
+         x=x/10;
+         numbuffer[pos]=ones100[y];
+         pos++;
+     }
+
+     while(pos>0){
+         pos--;
+         array[length]=numbuffer[pos];
+         length++;
+     }
+
+     return this;
+ }
+
+ /**
+  * Appends each character of a String narrowed to a byte; a null reference appends "null".
+  * @return This builder
+  */
+ public ByteBuilder append(String x){
+     if(x==null){return append(nullBytes);}
+     final int n=x.length();
+     expand(n);
+     for(int i=0; i<n; i++){
+         array[length++]=(byte)x.charAt(i);
+     }
+     return this;
+ }
+
+ /**
+  * Appends each character of a StringBuilder narrowed to a byte; a null reference appends "null".
+  * @return This builder
+  */
+ public ByteBuilder append(StringBuilder x){
+     if(x==null){return append(nullBytes);}
+     final int n=x.length();
+     expand(n);
+     for(int i=0; i<n; i++){
+         array[length++]=(byte)x.charAt(i);
+     }
+     return this;
+ }
+
+ /**
+  * Appends each character of a CharSequence narrowed to a byte; a null reference appends "null".
+  * @return This builder
+  */
+ public ByteBuilder append(CharSequence x){
+     if(x==null){return append(nullBytes);}
+     final int n=x.length();
+     expand(n);
+     for(int i=0; i<n; i++){
+         array[length++]=(byte)x.charAt(i);
+     }
+     return this;
+ }
+
+ /**
+  * Appends the toString() text of an object, or "null" for a null reference.
+  * @return This builder
+  */
+ public ByteBuilder append(Object x){
+     return (x==null ? append(nullBytes) : append(x.toString()));
+ }
+
+ /**
+  * Appends a whole byte array; a null reference appends "null".
+  * @return This builder
+  */
+ public ByteBuilder append(byte[] x){
+     final byte[] src=(x==null ? nullBytes : x);
+     expand(src.length);
+     System.arraycopy(src, 0, array, length, src.length);
+     length+=src.length;
+     return this;
+ }
+
+ /**
+  * Appends len bytes of x beginning at index start.
+  * Unlike the other overloads, a null array is not replaced by "null" here.
+  * @param x Source bytes
+  * @param start Index of the first byte to copy
+  * @param len Number of bytes to copy
+  * @return This builder
+  */
+ public ByteBuilder append(byte[] x, int start, int len){
+     expand(len);
+     final int stop=start+len;
+     int src=start;
+     while(src<stop){
+         array[length]=x[src];
+         length++;
+         src++;
+     }
+     return this;
+ }
+
+ /**
+  * Appends each char narrowed to a byte; a null reference appends "null".
+  * @return This builder
+  */
+ public ByteBuilder append(char[] x){
+     if(x==null){return append(nullBytes);}
+     expand(x.length);
+     for(final char c : x){
+         array[length++]=(byte)c;
+     }
+     return this;
+ }
+
+ /** @return The byte at position i; i is asserted to be below the current length */
+ public byte get(int i){
+     assert(i<length);
+     final byte value=array[i];
+     return value;
+ }
+
+ /** @return The byte at position i widened to a char; i is asserted to be below the current length */
+ public char charAt(int i){
+     assert(i<length);
+     final byte value=array[i];
+     return (char)value;
+ }
+
+ /**
+  * Removes bytes from both ends of the content, shifting the remainder to index 0.
+  * If the two amounts together cover the whole content, the builder becomes empty.
+  * @param left Number of leading bytes to drop; asserted non-negative
+  * @param right Number of trailing bytes to drop; asserted non-negative
+  */
+ public void trimByAmount(int left, int right) {
+     assert(left>=0 && right>=0);
+     final int remaining=length-left-right;
+     if(remaining==length){return;} //Nothing to trim
+     length=Tools.max(remaining, 0);
+     if(length<1){return;}
+     //Source and destination overlap; arraycopy handles that like the forward loop it replaces
+     System.arraycopy(array, left, array, 0, remaining);
+ }
+
+ /** @return The current content decoded with the platform default charset */
+ @Override
+ public final String toString(){
+     final String text=new String(array, 0, length);
+     return text;
+ }
+
+ /** @return A new array holding exactly the current content */
+ public final byte[] toBytes(){
+     final byte[] copy=Arrays.copyOf(array, length);
+     return copy;
+ }
+
+ /** @return True if at least x more bytes fit without growing the array */
+ private final boolean isRoom(int x){
+     final int free=array.length-length;
+     return free>=x;
+ }
+
+ /**
+  * Doubles the backing array, capped at Integer.MAX_VALUE bytes.
+  * A zero-length array (reachable through the public array field or an empty copy)
+  * grows to capacity 1 instead of being reported as an overflow.
+  * @throws RuntimeException if the array cannot grow any further
+  */
+ private final void expand(){
+     long x=Math.min(Integer.MAX_VALUE, Math.max(1L, array.length*2L));
+     if(x<=array.length){
+         throw new RuntimeException("Array overflow: "+x+"<="+array.length);
+     }
+     assert(((int)x)>array.length) : "Array overflow: "+x+"<="+array.length;
+     array=Arrays.copyOf(array, (int)x);
+ }
+
+ /**
+  * Grows the backing array by repeated doubling until at least extra more bytes fit,
+  * capped at Integer.MAX_VALUE bytes. Does nothing when the space is already available.
+  * Doubling starts from at least 1, because a zero-length array would never grow (0<<1==0)
+  * and the loop would not terminate.
+  * @param extra Number of additional bytes that must fit after the current content
+  */
+ private final void expand(int extra){
+     if(array.length-length>=extra){return;}
+     long x=Math.max(1, array.length);
+     while(x-length<extra){x<<=1;}
+     x=Math.min(Integer.MAX_VALUE, x);
+     if(x<array.length){
+         throw new RuntimeException("Array overflow: "+x+"<"+array.length+", "+extra);
+     }
+     assert(((int)x)>=array.length) : "Array overflow: "+x+"<"+array.length;
+     if(x!=array.length){
+         array=Arrays.copyOf(array, (int)x);
+     }
+ }
+
+ /** Passes the backing array and current length to AminoAcid.reverseComplementBasesInPlace. */
+ public void reverseComplementInPlace() {
+     final int len=length;
+     AminoAcid.reverseComplementBasesInPlace(array, len);
+ }
+
+ /** Grows the backing array if fewer than extra bytes are free. */
+ public final void ensureExtra(int extra){
+     final int free=array.length-length;
+     if(free<extra){
+         expand(extra);
+     }
+ }
+
+ /** @return Number of bytes currently held */
+ public int length(){
+     return length;
+ }
+ /** Empties the builder; the backing array is kept. */
+ public void clear(){
+     setLength(0);
+ }
+ /**
+  * Sets the content length directly without touching the bytes.
+  * @param x New length; asserted to lie within 0..array.length
+  */
+ public void setLength(int x){
+     assert(x>=0 && x<=array.length);
+     this.length=x;
+ }
+
+ public byte[] array;
+ public int length=0;
+ private final byte[] numbuffer=new byte[19];
+
+ public static final byte[] numbers=new byte[] {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'};
+ public static final byte[] nullBytes="null".getBytes();
+ public static final byte[] fbool="false".getBytes();
+ public static final byte[] tbool="true".getBytes();
+
+ public static final byte[] ones100, tens100;
+
+ //Builds the two-digit lookup tables: for i in 0..99, ones100[i] and tens100[i] are
+ //the ASCII characters of the units and tens digits of i.
+ static{
+     final byte[] ones=new byte[100];
+     final byte[] tens=new byte[100];
+     for(int t=0; t<10; t++){
+         for(int o=0; o<10; o++){
+             final int idx=t*10+o;
+             ones[idx]=(byte)('0'+o);
+             tens[idx]=(byte)('0'+t);
+         }
+     }
+     ones100=ones;
+     tens100=tens;
+ }
+
+}
diff --git a/current/stream/ConcurrentCollectionReadInputStream.java b/current/stream/ConcurrentCollectionReadInputStream.java
new file mode 100755
index 0000000..a478e3f
--- /dev/null
+++ b/current/stream/ConcurrentCollectionReadInputStream.java
@@ -0,0 +1,310 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import align2.ListNum;
+
+import dna.Data;
+
+public class ConcurrentCollectionReadInputStream extends ConcurrentReadInputStream {
+
+ /**
+  * Creates a stream that serves reads from in-memory lists.
+  * @param source1 Reads to serve
+  * @param source2 Optional second list read at the same indices as source1; may be null
+  * @param maxReadsToGenerate Read limit; a negative value means no limit
+  */
+ public ConcurrentCollectionReadInputStream(List<Read> source1, List<Read> source2, long maxReadsToGenerate){
+     assert(source1!=source2);
+     producer1=source1;
+     producer2=source2;
+     depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+     if(maxReadsToGenerate<0){
+         maxReads=Long.MAX_VALUE;
+     }else{
+         maxReads=maxReadsToGenerate;
+     }
+     if(maxReads==0){
+         System.err.println("Warning - created a read stream for 0 reads.");
+         assert(false);
+     }
+ }
+
+ /**
+  * Blocks until a filled list is available on depot.full and returns it wrapped with a running list number.
+  * An interrupt while waiting is printed and the wait resumes.
+  * @return The next list, or null if shutdown was flagged before one was obtained
+  */
+ public synchronized ListNum<Read> nextList() {
+     if(verbose){System.err.println("**************** nextList() was called; shutdown="+shutdown+", depot.full="+depot.full.size());}
+     ArrayList<Read> list=null;
+     do{
+         if(shutdown){
+             if(verbose){System.err.println("**************** nextList() returning null; shutdown="+shutdown+", depot.full="+depot.full.size());}
+             return null;
+         }
+         try {
+             list=depot.full.take();
+             assert(list!=null);
+         } catch (InterruptedException e) {
+             e.printStackTrace();
+         }
+     }while(list==null);
+
+     if(verbose){System.err.println("**************** nextList() returning list of size "+list.size()+"; shutdown="+shutdown+", depot.full="+depot.full.size());}
+     final ListNum<Read> wrapped=new ListNum<Read>(list, listnum);
+     listnum++;
+     return wrapped;
+ }
+
+ /**
+  * Hands a fresh list back to the depot after a consumer is done with one.
+  * @param listNumber Not used by this implementation
+  * @param poison True to put an empty list on depot.full; false to put an empty buffer on depot.empty
+  */
+ public void returnList(long listNumber, boolean poison){
+     if(!poison){
+         if(verbose){System.err.println("crisC: A: Adding empty list to empty.");}
+         depot.empty.add(new ArrayList<Read>(BUF_LEN));
+         return;
+     }
+     if(verbose){System.err.println("crisC: A: Adding empty list to full.");}
+     depot.full.add(new ArrayList<Read>(0));
+ }
+
+ /**
+  * Producer body: records the current thread, fills lists via readSingles(),
+  * queues end-of-stream lists via addPoison(), then moves any buffers still on
+  * depot.empty over to depot.full unless shutdown has been flagged.
+  */
+ @Override
+ public void run() {
+// producer.start();
+ //The thread executing run() is recorded so close() can watch it
+ threads=new Thread[] {Thread.currentThread()};
+ if(verbose){System.err.println("crisC started, thread="+threads[0]);}
+
+// readLists();
+ readSingles();
+
+ addPoison();
+
+ //End thread
+
+ //NOTE(review): poll() returns null if another thread empties depot.empty between the isEmpty() check and the poll - confirm that cannot happen here
+ while(!depot.empty.isEmpty() && !shutdown){
+// System.out.println("Ending");
+ if(verbose){System.err.println("B: Adding empty lists to full.");}
+ depot.full.add(depot.empty.poll());
+ }
+// System.err.println("cris thread terminated. Final depot size: "+depot.full.size()+", "+depot.empty.size());
+ }
+
+ /**
+  * Queues the end-of-stream lists: one new empty list goes straight onto depot.full,
+  * then up to bufferCount-1 lists are moved from depot.empty to depot.full as they become available.
+  * The shutdown flag is consulted only when the wait is interrupted; a plain timeout just retries.
+  */
+ private final void addPoison(){
+ //System.err.println("Adding poison.");
+ //Add poison pills
+ if(verbose){System.err.println("C: Adding poison to full.");}
+ depot.full.add(new ArrayList<Read>());
+ for(int i=1; i<depot.bufferCount; i++){
+ ArrayList<Read> list=null;
+ while(list==null){
+ try {
+ list=depot.empty.poll(1000, TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+// System.err.println("Do not be alarmed by the following error message:");
+// e.printStackTrace();
+ if(shutdown){
+ //Force the outer for-loop to end as well
+ i=depot.bufferCount;
+ break;
+ }
+ }
+ }
+ if(list!=null){
+ if(verbose){System.err.println("D: Adding list("+list.size()+") to full "+depot.full.size()+"/"+depot.bufferCount);}
+ depot.full.add(list);
+ }
+ }
+ //System.err.println("Added poison.");
+ }
+
+ /**
+  * Copies reads from producer1 (and their mates from producer2, if present) into
+  * buffers taken from depot.empty and queues each filled buffer on depot.full.
+  * Stops on shutdown, at maxReads, or once every element of producer1 has been consumed.
+  * The loop is bounded by the number of reads consumed, not the number of lists emitted,
+  * so no further buffers are taken or queued after the source is exhausted.
+  */
+ private final void readSingles(){
+
+     final long lim=producer1.size();
+     while(!shutdown && generated<lim && generated<maxReads){
+         ArrayList<Read> list=null;
+         while(list==null){
+             try {
+                 list=depot.empty.take();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+                 if(shutdown){break;}
+             }
+         }
+         if(shutdown || list==null){break;}
+
+         final long before=generated;
+         long bases=0;
+         while(list.size()<depot.bufferSize && generated<maxReads && bases<MAX_DATA && generated<lim){
+             Read a=producer1.get((int)generated);
+             Read b=(producer2==null ? null : producer2.get((int)generated));
+             if(a==null){break;}
+             readsIn++;
+             basesIn+=a.length();
+             if(b!=null){
+                 readsIn++;
+                 basesIn+=b.length();
+             }
+             //With sampling enabled, a read counts as generated even when it is not kept
+             if(randy==null || randy.nextFloat()<samplerate){
+                 list.add(a);
+                 if(b!=null){
+                     assert(a.numericID==b.numericID) : "\n"+a.numericID+", "+b.numericID+"\n"+a.toText(false)+"\n"+b.toText(false)+"\n";
+                     a.mate=b;
+                     b.mate=a;
+
+                     assert(a.pairnum()==0);
+                     b.setPairnum(1);
+                     bases+=(b.bases==null ? 0 : b.length());
+                 }
+                 bases+=(a.bases==null ? 0 : a.length());
+             }
+             incrementGenerated(1);
+         }
+
+         if(verbose){System.err.println("E: Adding list("+list.size()+") to full "+depot.full.size()+"/"+depot.bufferCount);}
+         depot.full.add(list);
+
+         //No read was consumed (a null element was hit), so further passes cannot make progress
+         if(generated==before){break;}
+     }
+ }
+
+ private boolean shutdown=false;
+
+ /**
+  * Flags the stream as shut down; the producer loops and nextList() check this flag.
+  * No thread is interrupted here. The former interrupt loop sat behind if(!shutdown)
+  * directly after shutdown=true, so it could never run and has been removed.
+  * close() still waits for the producer by draining depot.full.
+  */
+ @Override
+ public void shutdown(){
+     if(verbose){System.out.println("Called shutdown.");}
+     shutdown=true;
+     if(verbose){System.out.println("shutdown 6.");}
+ }
+
+ /**
+  * Resets the stream so the source lists can be served again from the start:
+  * clears the shutdown flag, replaces the depot and zeroes the counters.
+  * listnum is not reset here.
+  */
+ @Override
+ public synchronized void restart(){
+     shutdown=false;
+     depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+     nextProgress=PROGRESS_INCR;
+     readsIn=0;
+     basesIn=0;
+     generated=0;
+ }
+
+ /**
+  * Shuts the stream down and waits for its threads to finish.
+  * While threads[0] is alive, filled lists are pulled off depot.full, cleared and
+  * put back on depot.empty, which keeps a producer that is waiting for an empty buffer moving.
+  * Any further threads (index 1 and up) are then joined.
+  */
+ @Override
+ public synchronized void close(){
+ if(verbose){System.out.println("Thread "+Thread.currentThread().getId()+" called close.");}
+ shutdown();
+// producer1.close();
+// if(producer2!=null){producer2.close();}
+// System.out.println("A");
+ if(threads!=null && threads[0]!=null && threads[0].isAlive()){
+ if(verbose){System.out.println("close 1.");}
+
+ while(threads[0].isAlive()){
+ if(verbose){System.out.println("close 2: Thread "+Thread.currentThread().getId()+" closing thread "+threads[0].getId()+" "+threads[0].getState());}
+// System.out.println("B");
+ ArrayList<Read> list=null;
+ //Runs at most once per pass of the enclosing while-loop (i<1)
+ for(int i=0; i<1 && list==null && threads[0].isAlive(); i++){
+ if(verbose){System.out.println("close 3.");}
+ try {
+ if(verbose){System.out.println("close 4.");}
+ list=depot.full.poll(100, TimeUnit.MILLISECONDS);
+ //The first number printed below is depot.full.size(), despite the "list.size()" label
+ if(verbose){System.out.println("close 5; list.size()="+depot.full.size()+", list="+(list==null ? "null" : list.size()+""));}
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ System.err.println("Do not be alarmed by the following error message:");
+ e.printStackTrace();
+ break;
+ }
+ }
+
+ //Recycle the drained list so the producer can obtain an empty buffer
+ if(list!=null){
+ list.clear();
+ depot.empty.add(list);
+ }
+ if(verbose){System.out.println("close 6.");}
+
+// System.out.println("isAlive? "+threads[0].isAlive());
+ }
+ if(verbose){System.out.println("close 7.");}
+
+ }
+ if(verbose){System.out.println("close 8.");}
+
+ if(threads!=null){
+ if(verbose){System.out.println("close 9.");}
+ //Index 0 was handled above; run() in this class only ever stores one thread
+ for(int i=1; i<threads.length; i++){
+ if(verbose){System.out.println("close 10.");}
+ while(threads[i]!=null && threads[i].isAlive()){
+ if(verbose){System.out.println("close 11.");}
+ try {
+ if(verbose){System.out.println("close 12.");}
+ threads[i].join();
+ if(verbose){System.out.println("close 13.");}
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ if(verbose){System.out.println("close 14.");}
+
+ }
+
+ /**
+  * @return True if a second source list exists, or if the first read of producer1 already has a mate;
+  * false when producer1 is null or empty
+  */
+ @Override
+ public boolean paired() {
+     if(producer2!=null){return true;}
+     if(producer1==null || producer1.isEmpty()){return false;}
+     return producer1.get(0).mate!=null;
+ }
+
+ /** @return The class-wide verbose flag */
+ @Override
+ public boolean verbose(){
+     return verbose;
+ }
+
+ /** Advances the generated-read counter and, when SHOW_PROGRESS is set, prints a dot each time nextProgress is reached. */
+ private void incrementGenerated(long amt){
+     generated+=amt;
+     if(!SHOW_PROGRESS || generated<nextProgress){return;}
+     Data.sysout.print('.');
+     nextProgress+=PROGRESS_INCR;
+ }
+
+ /**
+  * Sets the fraction of reads to keep.
+  * @param rate Keep probability; 1 or more disables sampling
+  * @param seed Seed for the random source; a negative value selects an unseeded generator
+  */
+ @Override
+ public void setSampleRate(float rate, long seed){
+     samplerate=rate;
+     if(rate>=1f){
+         randy=null;
+         return;
+     }
+     randy=(seed>-1 ? new java.util.Random(seed) : new java.util.Random());
+ }
+
+ /** @return Total bases counted from the source lists so far, mates included */
+ public long basesIn(){
+     return basesIn;
+ }
+ /** @return Total reads counted from the source lists so far, mates included */
+ public long readsIn(){
+     return readsIn;
+ }
+
+ /** @return The error flag of this stream */
+ @Override
+ public boolean errorState(){
+     return errorState;
+ }
+ /** TODO */
+ private boolean errorState=false;
+
+ private float samplerate=1f;
+ private java.util.Random randy=null;
+
+ private Thread[] threads;
+
+ /** @return A two-element array holding producer1 and producer2 (the latter may be null) */
+ public Object[] producers(){
+     final Object[] sources=new Object[] {producer1, producer2};
+     return sources;
+ }
+
+ public final List<Read> producer1;
+ public final List<Read> producer2;
+ private ConcurrentDepot<Read> depot;
+
+ private long basesIn=0;
+ private long readsIn=0;
+
+ private long maxReads;
+ private long generated=0;
+ private long listnum=0;
+ private long nextProgress=PROGRESS_INCR;
+
+ public static boolean verbose=false;
+
+ private static final ArrayList<Read> poison=new ArrayList<Read>(0);
+
+ public static boolean SHOW_PROGRESS=false;
+ public static long PROGRESS_INCR=1000000;
+
+}
diff --git a/current/stream/ConcurrentDepot.java b/current/stream/ConcurrentDepot.java
new file mode 100755
index 0000000..5b9a65b
--- /dev/null
+++ b/current/stream/ConcurrentDepot.java
@@ -0,0 +1,35 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.concurrent.ArrayBlockingQueue;
+
+ /**
+  * A pair of bounded queues used to circulate list buffers between threads:
+  * "empty" starts out holding every buffer and "full" starts out with none.
+  */
+ public class ConcurrentDepot<K> {
+
+     /**
+      * @param bufSize Initial capacity of each list buffer
+      * @param numBufs Number of buffers created; each queue can hold one more than this
+      */
+     @SuppressWarnings("unchecked")
+     public ConcurrentDepot(int bufSize, int numBufs){
+         bufferSize=bufSize;
+         bufferCount=numBufs;
+
+         lists=new ArrayList[numBufs];
+         final int capacity=numBufs+1;
+         empty=new ArrayBlockingQueue<ArrayList<K>>(capacity, fair);
+         full=new ArrayBlockingQueue<ArrayList<K>>(capacity, fair);
+
+         int slot=0;
+         while(slot<lists.length){
+             final ArrayList<K> buffer=new ArrayList<K>(bufSize);
+             lists[slot]=buffer;
+             empty.add(buffer);
+             slot++;
+         }
+
+     }
+
+
+     /** Buffers available for filling */
+     public final ArrayBlockingQueue<ArrayList<K>> empty;
+     /** Buffers ready for consumption */
+     public final ArrayBlockingQueue<ArrayList<K>> full;
+
+     /** Value of bufSize given at construction */
+     public final int bufferSize;
+     /** Value of numBufs given at construction */
+     public final int bufferCount;
+
+     /** Fairness policy passed to both queues at construction */
+     public static boolean fair=false;
+
+     /** The buffers created by the constructor */
+     private final ArrayList<K>[] lists;
+
+ }
diff --git a/current/stream/ConcurrentGenericReadInputStream.java b/current/stream/ConcurrentGenericReadInputStream.java
new file mode 100755
index 0000000..530b0d4
--- /dev/null
+++ b/current/stream/ConcurrentGenericReadInputStream.java
@@ -0,0 +1,806 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.TimeUnit;
+
+import align2.ListNum;
+import align2.Tools;
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+
+public class ConcurrentGenericReadInputStream extends ConcurrentReadInputStream {
+
+ /**
+  * Test driver: streams all reads from one or two input files and prints
+  * the read count, base count, average length and elapsed time.
+  * Arguments: in1 [in2 | null] [key=value ...]. The second argument is taken as a
+  * file name (or the literal "null") whenever it contains no '='.
+  * @param args Command-line arguments; args[0] is required
+  */
+ public static void main(String[] args){
+     String in1=args[0];
+     String in2=(args.length<2 || args[1].equalsIgnoreCase("null") || args[1].contains("=") ? null : args[1]);
+     if(in2!=null){
+         assert(!in1.equalsIgnoreCase(in2));
+         FASTQ.TEST_INTERLEAVED=false;
+     }else{
+         FASTQ.TEST_INTERLEAVED=true;
+         FASTQ.FORCE_INTERLEAVED=true;
+     }
+
+     //An args[1] without '=' was already consumed above as the second file (or as "null").
+     //Option parsing must start after it, otherwise it falls through to "Unknown parameter".
+     final int firstOption=(args.length>1 && !args[1].contains("=") ? 2 : 1);
+
+     long maxReads=-1;
+     for(int i=firstOption; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=(split.length>1 ? split[1] : "true");
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseQuality(arg, a, b)){
+             //do nothing
+         }else if(Parser.parseFasta(arg, a, b)){
+             //do nothing
+         }else if(a.equals("reads") || a.startsWith("maxreads")){
+             maxReads=Tools.parseKMG(b);
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     Parser.processQuality();
+
+     assert(FastaReadInputStream.settingsOK());
+     Timer t=new Timer();
+
+     ConcurrentReadInputStream cris=getReadInputStream(maxReads, false, true, in1, in2);
+     System.out.println("Fetched "+cris.getClass().getName());
+     {
+         Object[] p=cris.producers();
+         System.out.print("Producers: ");
+         String comma="";
+         for(Object o : p){
+             System.out.print(comma+(o==null ? "null" : o.getClass().getName()));
+             comma=", ";
+         }
+         System.out.println();
+     }
+     boolean paired=cris.paired();
+     System.out.println("paired="+paired);
+     cris.start(); //4567
+
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     if(reads!=null && !reads.isEmpty()){
+         Read r=reads.get(0);
+         assert((r.mate!=null)==paired);
+     }
+
+     long readCount=0;
+     long baseCount=0;
+
+     while(reads!=null && reads.size()>0){
+
+         for(Read r : reads){
+             //Check r before touching its mate
+             Read r2=(r==null ? null : r.mate);
+             if(r!=null){
+                 readCount++;
+                 if(r.bases!=null){
+                     baseCount+=r.length();
+                 }
+             }
+             if(r2!=null){
+                 readCount++;
+                 if(r2.bases!=null){
+                     baseCount+=r2.length();
+                 }
+             }
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     System.err.println("Finished reading");
+     //nextList() returns null once the stream is shut down, so there may be no final list to return
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list.isEmpty());
+     }
+
+     cris.close();
+     t.stop();
+
+     System.out.println("Reads: \t"+readCount);
+     System.out.println("Bases: \t"+baseCount);
+     System.out.println("Avg Length: \t"+String.format("%.2f",baseCount*1.0/readCount));
+     System.out.println("Time: \t"+t);
+ }
+
+ /**
+  * Creates a stream over one or two read sources.
+  * @param source1 Primary source
+  * @param source2 Optional source of mates; may be null
+  * @param maxReadsToGenerate Read limit; a negative value means no limit
+  */
+ public ConcurrentGenericReadInputStream(ReadInputStream source1, ReadInputStream source2, long maxReadsToGenerate){
+     assert(source1!=source2);
+     producer1=source1;
+     producer2=source2;
+     depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+     assert(source2==null || !FASTQ.FORCE_INTERLEAVED) : "Please do not set 'interleaved=true' with dual input files.";
+     if(maxReadsToGenerate<0){
+         maxReads=Long.MAX_VALUE;
+     }else{
+         maxReads=maxReadsToGenerate;
+     }
+     if(maxReads==0){
+         System.err.println("crisG: Warning - created a read stream for 0 reads.");
+         assert(false);
+     }
+
+     //One hand-off queue per source that is present
+     if(producer1!=null){
+         p1q=new ArrayBlockingQueue<ArrayList<Read>>(4);
+     }
+     if(producer2!=null){
+         p2q=new ArrayBlockingQueue<ArrayList<Read>>(4);
+     }
+ }
+
+ /**
+  * Blocks until a filled list is available on depot.full and returns it wrapped with a running list number.
+  * An interrupt while waiting is printed and the wait resumes.
+  * @return The next list, or null if shutdown was flagged before one was obtained
+  */
+ public synchronized ListNum<Read> nextList() {
+     if(verbose){System.err.println("crisG: **************** nextList() was called; shutdown="+shutdown+", depot.full="+depot.full.size());}
+     ArrayList<Read> list=null;
+     do{
+         if(shutdown){
+             if(verbose){System.err.println("crisG: **************** nextList() returning null; shutdown="+shutdown+", depot.full="+depot.full.size());}
+             return null;
+         }
+         try {
+             list=depot.full.take();
+             assert(list!=null);
+         } catch (InterruptedException e) {
+             e.printStackTrace();
+         }
+     }while(list==null);
+
+     if(verbose){System.err.println("crisG: **************** nextList() returning list of size "+list.size()+"; shutdown="+shutdown+", depot.full="+depot.full.size());}
+     final ListNum<Read> wrapped=new ListNum<Read>(list, listnum);
+     listnum++;
+     return wrapped;
+ }
+
+ /**
+  * Hands a fresh list back to the depot after a consumer is done with one.
+  * @param listNumber Not used by this implementation
+  * @param poison True to put an empty list on depot.full; false to put an empty buffer on depot.empty
+  */
+ public void returnList(long listNumber, boolean poison){
+     if(!poison){
+         if(verbose){System.err.println("crisG: A: Adding empty list to empty.");}
+         depot.empty.add(new ArrayList<Read>(BUF_LEN));
+         return;
+     }
+     if(verbose){System.err.println("crisG: A: Adding empty list to full.");}
+     depot.full.add(new ArrayList<Read>(0));
+ }
+
+ /**
+  * Producer body: marks the stream as running, starts one ReadThread per source,
+  * assembles output lists via readLists(), queues end-of-stream lists via addPoison(),
+  * moves any buffers left on depot.empty to depot.full, then clears the running flag.
+  */
+ @Override
+ public void run() {
+// producer.start();
+ //The running flag guards against the same stream being started twice
+ synchronized(running){
+ assert(!running[0]) : "This cris was started by multiple threads.";
+ running[0]=true;
+ }
+
+ ReadThread rt1=null;
+ ReadThread rt2=null;
+ rt1=new ReadThread(producer1, p1q);
+ rt2=(producer2==null ? null : new ReadThread(producer2, p2q));
+ rt1.start();
+ if(rt2!=null){rt2.start();}
+
+ //rt1 was just constructed, so the first alternative (rt1==null) cannot be selected here
+ threads=(rt1==null ? new Thread[] {Thread.currentThread()} :
+ rt2==null ? new Thread[] {Thread.currentThread(), rt1} :
+ new Thread[] {Thread.currentThread(), rt1, rt2});
+
+ readLists();
+// readSingles();
+
+ addPoison();
+
+ //End thread
+
+ if(verbose){System.err.println("crisG: cris finished addPoison.");}
+ while(!depot.empty.isEmpty() && !shutdown){
+// System.out.println("crisG: Ending");
+ if(verbose){System.err.println("crisG: B: Adding empty lists to full.");}
+ depot.full.add(depot.empty.poll());
+ }
+ if(verbose){System.err.println("crisG: cris thread syncing before shutdown.");}
+
+ synchronized(running){//TODO Note: for some reason syncing on 'this' instead of 'running' causes a hang. Something else must be syncing improperly on this.
+ assert(running[0]);
+ running[0]=false;
+ }
+ if(verbose){System.err.println("crisG: cris thread terminated. Final depot size: "+depot.full.size()+", "+depot.empty.size());}
+ }
+
+ /**
+  * Queues the end-of-stream lists: one new empty list goes straight onto depot.full,
+  * then up to bufferCount-1 lists are moved from depot.empty to depot.full as they become available.
+  * The shutdown flag is consulted only when the wait is interrupted; a plain timeout just retries.
+  */
+ private final void addPoison(){
+ //System.err.println("crisG: Adding poison.");
+ //Add poison pills
+ if(verbose){System.err.println("crisG: C: Adding poison to full.");}
+ depot.full.add(new ArrayList<Read>());
+ for(int i=1; i<depot.bufferCount; i++){
+ ArrayList<Read> list=null;
+ while(list==null){
+ try {
+ list=depot.empty.poll(1000, TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+// System.err.println("crisG: Do not be alarmed by the following error message:");
+// e.printStackTrace();
+ if(shutdown){
+ //Force the outer for-loop to end as well
+ i=depot.bufferCount;
+ break;
+ }
+ }
+ }
+ if(list!=null){
+ if(verbose){System.err.println("crisG: D: Adding list("+list.size()+") to full.");}
+ depot.full.add(list);
+ }
+ }
+ if(verbose){System.err.println("crisG: Added poison.");}
+ }
+
+ /**
+  * Read-at-a-time variant of the producer loop: pulls reads directly from producer1
+  * (and mates from producer2) via next(), pairs them, and queues filled buffers on depot.full.
+  * The call to this method in run() is commented out; run() uses readLists() instead.
+  */
+ private final void readSingles(){
+
+ while(!shutdown && producer1.hasMore() && generated<maxReads){
+ ArrayList<Read> list=null;
+ while(list==null){
+ try {
+ list=depot.empty.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ if(shutdown){break;}
+ }
+ }
+ if(shutdown || list==null){break;}
+
+ //Fill the buffer until it reaches bufferSize reads, MAX_DATA bases, or the read limit
+ long bases=0;
+ while(list.size()<depot.bufferSize && generated<maxReads && bases<MAX_DATA){
+ Read a=producer1.next();
+ Read b=(producer2==null ? null : producer2.next());
+ if(a==null){break;}
+ readsIn++;
+ basesIn+=a.length();
+ if(b!=null){
+ readsIn++;
+ basesIn+=b.length();
+ }
+ //With sampling enabled, a read counts as generated even when it is not kept
+ if(randy==null || randy.nextFloat()<samplerate){
+ list.add(a);
+ if(b!=null){
+ assert(a.numericID==b.numericID) : "\n"+a.numericID+", "+b.numericID+"\n"+a.toText(false)+"\n"+b.toText(false)+"\n";
+ assert(a.mate==null) : "Please set interleaved=false when using dual input files.\n"+a.id+"\n"+a.mate.id+"\n"+producer1+"\n"+producer2;
+ assert(b.mate==null) : "Please set interleaved=false when using dual input files.";
+ a.mate=b;
+ b.mate=a;
+
+ assert(a.pairnum()==0);
+ b.setPairnum(1);
+ bases+=(b.bases==null ? 0 : b.length());
+ }
+ bases+=(a.bases==null ? 0 : a.length());
+ }
+ incrementGenerated(1);
+ }
+
+ if(verbose){System.err.println("crisG: E: Adding list("+list.size()+") to full.");}
+ depot.full.add(list);
+ }
+ }
+
+ /**
+  * Main producer loop. Takes batches of reads from p1q (and p2q when a second source
+  * exists), pairs them, and repackages them into depot buffers limited by
+  * depot.bufferSize reads and MAX_DATA bases; each packed buffer is queued on depot.full.
+  * The loop ends when the poison batch arrives on p1q, on shutdown, or when maxReads is reached.
+  * Both hand-off queues are cleared on normal exit.
+  */
+ private final void readLists(){
+ //buffer1/buffer2: current input batches; list: output buffer being filled; next: cursor into buffer1
+ ArrayList<Read> buffer1=null;
+ ArrayList<Read> buffer2=null;
+ ArrayList<Read> list=null;
+ int next=0;
+
+// System.out.println("crisG: a");
+ if(verbose){System.err.println(getClass().getName()+" entering read lists loop.");}
+ while(buffer1!=poison && (buffer1!=null || (!shutdown && generated<maxReads))){
+// System.out.println("crisG: b");
+ if(verbose){System.err.println("crisG: looping: buffer1==null "+(buffer1==null)+", buffer1==poison "+(buffer1==poison)
+ +", shutdown="+shutdown+", generated<maxReads="+(generated<maxReads));}
+ while(list==null){
+ if(verbose){System.err.println("crisG: Fetching an empty list: generated="+generated+"/"+maxReads);}
+ try {
+ list=depot.empty.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ if(shutdown){break;}
+ }
+ if(verbose){System.err.println("crisG: Fetched "+(list==null ? "null" : ""+list.size()));}
+ }
+// System.out.println("crisG: c");
+ if(verbose){System.err.println("crisG: Left empty fetch loop.");}
+ if(shutdown || list==null){
+ //System.err.println("crisG: Shutdown triggered; breaking.");
+ break;
+ }
+// System.out.println("crisG: d");
+
+ if(verbose){System.err.println("crisG: Entering full fetch loop.");}
+ long bases=0;
+ while(list.size()<depot.bufferSize && generated<maxReads && bases<MAX_DATA){
+ if(verbose){System.err.println("crisG: list.size()="+list.size()+", depot.bufferSize="+depot.bufferSize+", generated="+generated);}
+ //Current input batch is used up (or absent): fetch the next one from each source
+ if(buffer1==null || next>=buffer1.size()){
+ buffer1=null;
+ while(!shutdown && buffer1==null){
+ try {
+ buffer1=p1q.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+// System.out.println("crisG: e");
+
+ if(buffer1!=null && p2q!=null){
+ buffer2=null;
+ while(!shutdown && buffer2==null){
+ try {
+ buffer2=p2q.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ if(buffer2!=null){pair(buffer1, buffer2);}
+ //NOTE(review): buffer2 can still be null here after a shutdown - confirm removeDiscarded tolerates that
+ if(REMOVE_DISCARDED_READS){removeDiscarded(buffer1, buffer2);}
+ }
+// System.out.println("crisG: f");
+ next=0;
+ }
+// System.out.println("crisG: g");
+ if(buffer1==null || buffer1==poison || shutdown){
+// if(list!=null && list.size()>0){
+// if(verbose){System.err.println("crisG: G: Adding list("+list.size()+") to full.");}
+// depot.full.add(list);
+// list=null;
+// }
+ if(verbose){System.err.println("crisG: Breaking because buffer1==null: "+(buffer1==null)+" || buffer1==poison: "+(buffer1==poison)+" || shutdown: "+shutdown);}
+ break;
+ }
+ assert(buffer1.size()<=BUF_LEN); //Although this is not really necessary.
+
+// assert(!set2.contains(buffer1)) : buffer1.hashCode();
+// set2.add(buffer1);
+// System.out.println(buffer1.hashCode());
+
+ if(buffer2!=null){
+// System.out.println("crisG: h");
+
+ if(buffer2!=null && (buffer1==null || buffer2.size()!=buffer1.size()) && !ALLOW_UNEQUAL_LENGTHS){
+ System.err.println("crisG: Error: Misaligned read streams.");
+ errorState=true;
+ //NOTE(review): this return skips queueing the partly filled list and the p1q/p2q cleanup at the end of the method
+ return;
+ }
+ assert(ALLOW_UNEQUAL_LENGTHS || buffer2==null || buffer2.size()==buffer1.size());
+ }
+
+ //Code disabled because it does not actually seem to make anything faster.
+// if(buffer1.size()<=(BUF_LEN-list.size()) && (buffer1.size()+generated)<maxReads && randy==null){
+// //System.out.println("crisG: j");
+// //Then do a quicker bulk operation
+//
+// for(Read a : buffer1){
+// list.add(a);
+// Read b=a.mate;
+// readsIn++;
+// basesIn+=a.length();
+// bases+=(a.bases==null ? 0 : a.length());
+//// bases+=(b==null || b.bases==null ? 0 : b.length());
+// if(b!=null){
+// readsIn++;
+// basesIn+=b.length();
+// bases+=b.length();
+// assert(a.pairnum()==0 && b.pairnum()==1);
+// }
+//// System.out.println(generated+", "+readsIn+", "+(b==null));
+// }
+//// list.addAll(buffer1); //This is actually slower due to an array clone operation.
+// incrementGenerated(buffer1.size());
+//
+// next=0;
+// buffer1=null;
+// buffer2=null;
+// }else
+ {
+
+ //Move reads one at a time from the input batch into the output buffer
+ while(next<buffer1.size() && list.size()<depot.bufferSize && generated<maxReads && bases<MAX_DATA){
+ Read a=buffer1.get(next);
+ Read b=a.mate;
+ readsIn++;
+ basesIn+=a.length();
+ if(b!=null){
+ readsIn++;
+ basesIn+=b.length();
+// assert(a.numericID==b.numericID) : "\n"+a.numericID+", "+b.numericID+"\n"+a.toText(false)+"\n"+b.toText(false)+"\n";
+// a.mate=b;
+// b.mate=a;
+//
+// assert(a.pairnum()==0);
+// b.setPairnum(1);
+ assert(a.pairnum()==0 && b.pairnum()==1 && a.mate==b && b.mate==a && a.numericID==b.numericID) :
+ "There is something wrong with the read pairing.\n"+
+ a.pairnum()+", "+(b.pairnum())+", "+(a.mate==b)+", "+(b.mate==a)+", "+(a.numericID)+", "+(b.numericID);
+ }
+ //With sampling enabled, a read counts as generated even when it is not kept
+ if(randy==null || randy.nextFloat()<samplerate){
+ list.add(a);
+ bases+=a.length();
+ if(a.mate!=null){
+ bases+=a.mateLength();
+// assert(a.pairnum()==0 && a.mate.pairnum()==1);
+ }
+ }
+ incrementGenerated(1);
+ next++;
+ }
+// System.out.println("crisG: l");
+
+
+ //Input batch fully consumed: drop it so the next pass fetches a new one
+ if(next>=buffer1.size()){
+ buffer1=null;
+ buffer2=null;
+ next=0;
+// System.out.println("crisG: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
+ }else{
+// System.out.println("crisG: ------------------------------------------------");
+ }
+// System.out.println("crisG: m");
+ }
+ if(verbose){System.err.println("crisG: Loop end: list.size()="+(list.size()+", depot.bufferSize="+depot.bufferSize+", generated="+generated));}
+// System.out.println("crisG: n");
+ if(verbose){System.err.println(Thread.currentThread().getName());}
+ }
+
+// System.out.println("crisG: p");
+// System.err.println("crisG: Adding list to full depot. Shutdown="+shutdown);
+ if(verbose){System.err.println("crisG: F: Adding list("+list.size()+") to full.");}
+ depot.full.add(list);
+// System.err.println("crisG: Added.");
+
+// System.out.println("crisG: o");
+ if(buffer1==poison){
+ if(verbose){System.err.println("crisG: Detected poison from buffer1.");}
+ break;
+ }
+ list=null;
+ if(verbose){System.err.println("crisG: Finished loop iteration.\n");}
+ if(verbose){System.err.println("crisG: loop end: buffer1==null "+(buffer1==null)+", buffer1==poison "+(buffer1==poison)
+ +", shutdown="+shutdown+", generated<maxReads="+(generated<maxReads));}
+// System.out.println("crisG: q");
+ }
+// System.out.println("crisG: r");
+
+
+ //Discard any batches still waiting in the hand-off queues
+ p1q.clear();
+ if(p2q!=null){p2q.clear();}
+ }
+
+ private final void pair(ArrayList<Read> buffer1, ArrayList<Read> buffer2){
+ final int len1=buffer1.size(), len2=buffer2.size();
+ assert(ALLOW_UNEQUAL_LENGTHS || len1==len2) : "\nThere appear to be different numbers of reads in the paired input files." +
+ "\nThe pairing may have been corrupted by an upstream process. It may be fixable by running repair.sh.";
+ final int lim=Tools.min(len1, len2);
+
+ for(int i=0; i<lim; i++){
+ Read a=buffer1.get(i);
+ Read b=buffer2.get(i);
+
+ assert(a.numericID==b.numericID) : "\n"+a.numericID+", "+b.numericID+"\n"+a.toText(false)+"\n"+b.toText(false)+"\n";
+ assert(a.mate==null) : "Please set interleaved=false when using dual input files.\n"+a.id+"\n"+a.mate.id+"\n"+b.id+"\n"+producer1+"\n"+producer2;
+ assert(b.mate==null) : "Please set interleaved=false when using dual input files.";
+ a.mate=b;
+ b.mate=a;
+
+ assert(a.pairnum()==0);
+ b.setPairnum(1);
+ // assert(a.pairnum()!=b.pairnum());
+ }
+
+ if(len1>len2){
+ //do nothing;
+ }else if(len2>len1){
+ for(int i=lim; i<len2; i++){
+ Read b=buffer2.get(i);
+ b.setPairnum(0);
+ buffer1.add(b);
+ }
+ }
+ }
+
+ /**
+  * Removes reads flagged as discarded from the buffers.
+  * With a single buffer, each read is judged on its own discarded() flag.
+  * With two buffers, the entries at an index are dropped from both lists when either one is discarded.
+  * buffer1 may be longer than buffer2 (pair() permits this when ALLOW_UNEQUAL_LENGTHS is set);
+  * entries of buffer1 beyond the end of buffer2 are judged on their own flag instead of
+  * indexing past the end of buffer2.
+  * @param buffer1 Primary read list; must not be null.
+  * @param buffer2 Optional second list, or null.
+  * @return Number of positions that were removed.
+  */
+ private final int removeDiscarded(ArrayList<Read> buffer1, ArrayList<Read> buffer2){
+     int removed=0;
+     if(buffer2==null){
+         for(int i=0; i<buffer1.size(); i++){
+             Read a=buffer1.get(i);
+             if(a.discarded()){
+                 buffer1.set(i, null);
+                 removed++;
+             }
+         }
+     }else{
+         final int len2=buffer2.size();
+         for(int i=0; i<buffer1.size(); i++){
+             Read a=buffer1.get(i);
+             final boolean hasPartner=(i<len2); //false only for buffer1's unpaired tail
+             Read b=(hasPartner ? buffer2.get(i) : null);
+             if(a.discarded() || (hasPartner && b.discarded())){
+                 buffer1.set(i, null);
+                 if(hasPartner){buffer2.set(i, null);}
+                 removed++;
+             }
+         }
+     }
+     if(removed>0){
+         //Squeeze out the nulls written above.
+         Tools.condenseStrict(buffer1);
+         if(buffer2!=null){Tools.condenseStrict(buffer2);}
+     }
+     return removed;
+ }
+
+ /** Stop flag: set by shutdown() and by ReadThread when a read fails; cleared by restart(). */
+ private boolean shutdown=false;
+
+ /**
+  * Marks this stream as shut down by setting the shutdown flag.
+  * NOTE(review): the flag is assigned true immediately before the if(!shutdown) test,
+  * so the thread-interrupt loop below never executes as written. It is unclear from this
+  * file whether interrupting was disabled on purpose - confirm before changing.
+  * NOTE(review): the flag is a plain (non-volatile) field that other threads read - confirm
+  * that cross-thread visibility is acceptable here.
+  */
+ @Override
+ public void shutdown(){
+// System.err.println("crisG: Called shutdown.");
+ shutdown=true;
+ if(!shutdown){
+ for(Thread t : threads){
+ if(t!=null && t.isAlive()){
+ t.interrupt();
+ }
+ }
+ }
+ }
+
+ /**
+  * Resets this stream so it can be used again: clears the shutdown flag and the
+  * producer queues, restarts the producers, allocates a new depot and zeroes the counters.
+  */
+ @Override
+ public synchronized void restart(){
+     //Re-arm and drop anything still queued.
+     shutdown=false;
+     p1q.clear();
+     if(p2q!=null){
+         p2q.clear();
+     }
+
+     //Restart the underlying producers.
+     producer1.restart();
+     if(producer2!=null){
+         producer2.restart();
+     }
+
+     //New buffer depot, counters and progress tracking back to their initial values.
+     depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+     generated=0;
+     basesIn=0;
+     readsIn=0;
+     listnum=0; //Added Oct 9, 2014
+     nextProgress=PROGRESS_INCR;
+     lastTime=System.nanoTime();
+ }
+
+ @Override
+ public synchronized void close(){
+ if(verbose){System.err.println("crisG: Called shutdown for "+producer1+"; "+threads[0].getState());}
+// if(verbose){System.err.println(((FastqReadInputStream)producer1).tf.isOpen());}
+ shutdown();
+ errorState|=producer1.close();
+ if(producer2!=null){errorState|=producer2.close();}
+ if(threads!=null && threads[0]!=null && threads[0].isAlive()){
+
+ while(threads[0].isAlive()){
+// System.out.println("crisG: B");
+ ArrayList<Read> list=null;
+ for(int i=0; i<1000 && list==null && threads[0].isAlive(); i++){
+ try {
+ list=depot.full.poll(200, TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ System.err.println("crisG: Do not be alarmed by the following error message:");
+ e.printStackTrace();
+ break;
+ }
+ }
+
+ if(list!=null){
+ list.clear();
+ depot.empty.add(list);
+ }
+ }
+
+ }
+
+ if(threads!=null){
+ for(int i=1; i<threads.length; i++){
+ while(threads[i]!=null && threads[i].getState()!=Thread.State.TERMINATED){
+ try {
+ threads[i].join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ assert(threads==null || threads.length<2 || threads[1]==null || !threads[1].isAlive()) : ((ReadThread)threads[1]).generatedLocal;
+// threads=null;
+// System.out.println("crisG: C");
+
+ if(verbose){System.err.println("crisG: shutdown exited; errorState="+errorState);}
+ }
+
+ /** @return true if producer1 reports paired input or a second producer exists. */
+ @Override
+ public boolean paired() {
+     final boolean firstIsPaired=producer1.paired();
+     return firstIsPaired || producer2!=null;
+ }
+
+ /** @return The class-wide verbose flag. */
+ @Override
+ public boolean verbose(){
+     return verbose;
+ }
+
+ /**
+  * Worker thread that fetches read lists from one ReadInputStream and puts them on a
+  * queue, then puts the shared poison list on that queue when it finishes.
+  */
+ private class ReadThread extends Thread{
+ /**
+  * @param producer_ Source of read lists.
+  * @param pq_ Queue that receives the fetched lists.
+  */
+ ReadThread(ReadInputStream producer_, ArrayBlockingQueue<ArrayList<Read>> pq_){
+ producer=producer_;
+ pq=pq_;
+ }
+
+ @Override
+ public void run(){
+ readLists();
+ }
+
+ /**
+  * Main loop: fetches lists until the producer is exhausted, maxReads is reached,
+  * shutdown is set, or a fetch fails; then poisons the queue.
+  */
+ private final void readLists(){
+
+ ArrayList<Read> list=null;
+
+ if(verbose){System.err.println(getClass().getName()+" entering read lists loop.");}
+ while(list!=null || (!shutdown && producer.hasMore() && generatedLocal<maxReads)){
+
+ if(verbose){System.err.println(getClass().getName()+" looping: buffer1==null "+(list==null)+", shutdown="+shutdown+
+ ", producer.hasMore()="+producer.hasMore()+", generated<maxReads="+(generatedLocal<maxReads));}
+
+
+
+ if(verbose){System.err.println(getClass().getName()+" Entering full fetch loop.");}
+ while(generatedLocal<maxReads){
+// System.out.println("crisG: E");
+ if(verbose){System.err.println(getClass().getName()+" depot.bufferSize="+depot.bufferSize+", generated="+generatedLocal);}
+// System.out.println("crisG: F");
+ try {
+ list=producer.nextList();
+ } catch (Throwable e1) {
+ // TODO
+// System.err.print('*');
+ e1.printStackTrace();
+ //Any failure: flag shutdown and the error state, and put an empty list on the queue.
+ list=null;
+ shutdown=true;
+ try {
+ pq.put(new ArrayList<Read>(1));
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ errorState=true;
+ }
+ if(verbose){System.err.println(getClass().getName()+" grabbed a list of size "+(list==null ? "null" : list.size()+""));}
+// System.out.println("crisG: G");
+ if(list==null){
+// System.out.println("crisG: H");
+ if(verbose){System.err.println(getClass().getName()+" broke loop on null list.");}
+ break;
+ }
+ assert(list.size()>0);
+ assert(list.size()<=BUF_LEN); //Although this is not really necessary.
+// System.out.println("crisG: I");
+ //Trim from the tail so the running total never exceeds maxReads.
+ if(list.size()+generatedLocal>maxReads){
+// System.out.println("crisG: J");
+ if(verbose){System.err.println("crisG: Removing extra reads.");}
+ while(list.size()+generatedLocal>maxReads){list.remove(list.size()-1);}
+// System.out.println("crisG: K");
+ }
+// System.out.println("crisG: A");
+ //Retry the put until it succeeds or shutdown is set; an interrupt only prints a trace.
+ //NOTE(review): if shutdown is set here the list is left unqueued and the enclosing
+ //loop fetches again, overwriting it - confirm this is intended.
+ while(list!=null && !shutdown){
+// System.out.println("crisG: B");
+ try {
+ if(verbose){System.err.println("crisG: Trying to add list");}
+ pq.put(list);
+ generatedLocal+=list.size();
+ list=null;
+ if(verbose){
+ System.out.println("crisG: Added list; pq.size() = "+pq.size());
+ }
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+// System.out.println("crisG: C");
+ }
+// System.out.println("crisG: D");
+ if(verbose){System.err.println("crisG: looping");}
+ }
+
+ if(verbose){System.err.println(getClass().getName()+" Finished inner loop iteration.\n");}
+ }
+
+
+ if(verbose){System.err.println(getClass().getName()+" attempting to poison output queue.");}
+ //Keep trying until the poison list has been queued.
+ boolean b=true;
+ while(b){
+ //TODO Note that this could cause a deadlock if there was a premature shutdown, so the consumer died while the queue was full.
+ try {
+// pq.offer(poison, 10000, TimeUnit.SECONDS);
+ pq.put(poison);
+ b=false;
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+
+ if(verbose){System.err.println(getClass().getName()+" exited read lists loop: "+(list==null)+", "+shutdown+", "+producer.hasMore()+", "+generatedLocal+", "+maxReads);}
+
+ }
+
+ /** Queue that receives fetched lists and, at the end, the poison list. */
+ private final ArrayBlockingQueue<ArrayList<Read>> pq;
+ /** Source of read lists for this thread. */
+ private final ReadInputStream producer;
+ /** Number of reads this thread has successfully queued. */
+ private long generatedLocal=0;
+ }
+
+ private void incrementGenerated(long amt){
+ generated+=amt;
+ if(SHOW_PROGRESS && generated>=nextProgress){
+ if(SHOW_PROGRESS2){
+ nextProgress+=PROGRESS_INCR;
+ long x=System.nanoTime();
+ long duration=x-lastTime;
+ lastTime=x;
+ Data.sysout.println(String.format("%.1f", duration*0.000000001));
+// Data.sysout.println((long)(0.5+duration*0.000000001)+" ");
+ }else{
+ nextProgress+=PROGRESS_INCR;
+ Data.sysout.print('.');
+ }
+ }
+// System.err.println("crisG: generated="+generated+"\treadsIn="+readsIn);
+ }
+
+ /**
+  * Configures random subsampling.
+  * @param rate Fraction of reads to keep; a value of 1 or more disables sampling.
+  * @param seed Seed for the random generator; any value below 0 selects an unseeded generator.
+  */
+ @Override
+ public void setSampleRate(float rate, long seed){
+     samplerate=rate;
+     if(rate>=1f){
+         //No generator means no sampling.
+         randy=null;
+         return;
+     }
+     randy=(seed>-1 ? new java.util.Random(seed) : new java.util.Random());
+ }
+
+ /** @return Value of the basesIn counter, which restart() resets to 0. */
+ public long basesIn(){return basesIn;}
+ /** @return Value of the readsIn counter, which restart() resets to 0. */
+ public long readsIn(){return readsIn;}
+
+ @Override
+ public boolean errorState(){
+ return errorState ||
+ (producer1==null ? false : producer1.errorState()) || (producer2==null ? false : producer2.errorState());}
+ /** TODO */
+ private boolean errorState=false;
+
+ private boolean[] running=new boolean[] {false};
+
+ private float samplerate=1f;
+ private java.util.Random randy=null;
+
+ private ArrayBlockingQueue<ArrayList<Read>> p1q;
+ private ArrayBlockingQueue<ArrayList<Read>> p2q;
+
+
+ /** @return A new array holding producer1, followed by producer2 when it is not null. */
+ public Object[] producers(){return producer2==null ? new Object[] {producer1} : new Object[] {producer1, producer2};}
+
+ private Thread[] threads;
+
+ public final ReadInputStream producer1;
+ public final ReadInputStream producer2;
+ private ConcurrentDepot<Read> depot;
+
+ private long basesIn=0;
+ private long readsIn=0;
+
+ private long maxReads;
+ private long generated=0;
+ private long listnum=0;
+ private long nextProgress=PROGRESS_INCR;
+ private long lastTime=System.nanoTime();
+
+ public static boolean verbose=false;
+
+ private static final ArrayList<Read> poison=new ArrayList<Read>(0);
+
+}
diff --git a/current/stream/ConcurrentGenericReadOutputStream.java b/current/stream/ConcurrentGenericReadOutputStream.java
new file mode 100755
index 0000000..8e6b13d
--- /dev/null
+++ b/current/stream/ConcurrentGenericReadOutputStream.java
@@ -0,0 +1,240 @@
+package stream;
+
+import java.io.File;
+import java.lang.Thread.State;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 26, 2015
+ *
+ */
+public final class ConcurrentGenericReadOutputStream extends ConcurrentReadOutputStream {
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Builds the writer(s) for one or two output files.
+  * A second writer is only created when ff2 is given and ff1 is not stdio; when there is
+  * no second writer, the first writer is switched to interleaved output.
+  * @param ff1_ Format of the first output; must not be null.
+  * @param ff2_ Format of the second output, or null.
+  * @param qf1 Passed through to the first writer (presumably a quality file name - confirm).
+  * @param qf2 Passed through to the second writer.
+  * @param maxSize Passed through to the writers.
+  * @param header Passed through to the writers.
+  * @param useSharedHeader Passed through to the writers.
+  */
+ ConcurrentGenericReadOutputStream(FileFormat ff1_, FileFormat ff2_, String qf1, String qf2, int maxSize, CharSequence header, boolean useSharedHeader){
+ super(ff1_, ff2_);
+
+ if(verbose){
+ System.err.println("ConcurrentGenericReadOutputStream("+ff1+", "+ff2+", "+qf1+", "+qf2+", "+maxSize+", "+useSharedHeader+")");
+ }
+
+ assert(ff1!=null);
+ assert(!ff1.text() && !ff1.unknownFormat()) : "Unknown format for "+ff1;
+
+ //NOTE(review): these existence checks only run when ff1.devnull() is true; a negated
+ //condition (!ff1.devnull()) looks more plausible - confirm against FileFormat semantics.
+ if(ff1.hasName() && ff1.devnull()){
+ File f=new File(ff1.name());
+ assert(ff1.overwrite() || !f.exists()) : f.getAbsolutePath()+" already exists; please delete it.";
+ if(ff2!=null){assert(!ff1.name().equals(ff2.name())) : ff1.name()+"=="+ff2.name();}
+ }
+
+ //Same construction in both branches; only the writer implementation differs.
+ if(BYTE_WRITER){
+ readstream1=new ReadStreamByteWriter(ff1, qf1, true, maxSize, header, useSharedHeader);
+ readstream2=ff1.stdio() || ff2==null ? null : new ReadStreamByteWriter(ff2, qf2, false, maxSize, header, useSharedHeader);
+ }else{
+ readstream1=new ReadStreamStringWriter(ff1, qf1, true, maxSize, header, useSharedHeader);
+ readstream2=ff1.stdio() || ff2==null ? null : new ReadStreamStringWriter(ff2, qf2, false, maxSize, header, useSharedHeader);
+ }
+
+ //A single writer handles both reads of a pair.
+ if(readstream2==null && readstream1!=null){
+// System.out.println("ConcurrentReadOutputStream detected interleaved output.");
+ readstream1.OUTPUT_INTERLEAVED=true;
+ }
+
+ //The reorder table exists only for ordered output; otherwise it stays null.
+ table=(ORDERED ? new HashMap<Long, ArrayList<Read>>(MAX_CAPACITY) : null);
+
+ assert(readstream1==null || readstream1.read1==true);
+ assert(readstream2==null || (readstream2.read1==false));
+ }
+
+ /**
+  * Starts the writer thread(s) on the first call.
+  * A second call prints a message, resets the next list ID and throws a RuntimeException.
+  */
+ @Override
+ public synchronized void start(){
+     if(!started){
+         started=true;
+         if(readstream1!=null){
+             readstream1.start();
+         }
+         if(readstream2!=null){
+             readstream2.start();
+         }
+         return;
+     }
+
+     //Already started.
+     System.err.println("Resetting output stream.");
+     nextListID=0;
+     throw new RuntimeException();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Submits a list of reads for output.
+  * In ordered mode a list that is ahead of nextListID blocks here once the table holds
+  * ADD_LIMIT entries, and resumes when the table drops below HALF_LIMIT or the list
+  * becomes the next one due. In unordered mode the list is written immediately.
+  * @param list Reads to write.
+  * @param listnum Sequence number of the list; used for ordering.
+  */
+ @Override
+ public synchronized void add(ArrayList<Read> list, long listnum){
+
+ if(ORDERED){
+ int size=table.size();
+// System.err.print(size+", ");
+ //Remember whether the table was at least half full before this call.
+ final boolean flag=(size>=HALF_LIMIT);
+ if(listnum>nextListID && size>=ADD_LIMIT){
+ System.err.println("Output buffer became full; key "+listnum+" waiting on "+nextListID+".");
+ //Wait in 20-second slices, rechecking the table size after each wake-up.
+ while(listnum>nextListID && size>=HALF_LIMIT){
+ try {
+ this.wait(20000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ size=table.size();
+ }
+ System.err.println("Output buffer became clear for key "+listnum+"; next="+nextListID+", size="+size);
+ }
+ addOrdered(list, listnum);
+ assert(listnum!=nextListID);
+ //This list has been written and the table had been half full: wake any waiting adders.
+ if(flag && listnum<nextListID){this.notifyAll();}
+ }else{
+ addDisordered(list, listnum);
+ }
+ }
+
+ /**
+  * Signals the writer(s) to finish by poisoning them.
+  * Asserts that no ordered lists are still buffered.
+  */
+ @Override
+ public synchronized void close(){
+     assert(table==null || table.isEmpty()); //Seems like a race condition.  Probably, I should wait at this point until the condition is true before proceeding.
+
+     readstream1.poison();
+     if(readstream2!=null){
+         readstream2.poison();
+     }
+ }
+
+ /**
+  * Blocks until both writer threads have terminated, first readstream1 and then readstream2,
+  * retrying whenever a join is interrupted. Then marks this stream as finished successfully.
+  */
+ @Override
+ public void join(){
+     final ReadStreamWriter[] writers=new ReadStreamWriter[] {readstream1, readstream2};
+     for(final ReadStreamWriter writer : writers){
+         while(writer!=null && writer.getState()!=Thread.State.TERMINATED){
+             try {
+                 writer.join();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+     }
+     assert(table==null || table.isEmpty());
+     finishedSuccessfully=true;
+ }
+
+ /**
+  * Waits for all buffered ordered lists to be written, then resets the next expected
+  * list ID to 0. A warning is printed if the table has still not emptied after 2000
+  * waits of up to 2 seconds each; the method then keeps waiting until it does.
+  * In unordered mode there is no table (it is null), so the ID is reset immediately.
+  */
+ @Override
+ public synchronized void resetNextListID(){
+     if(table!=null){ //null when output is unordered
+         for(int i=0; i<2000 && !table.isEmpty(); i++){
+             try {this.wait(2000);}
+             catch (InterruptedException e) {e.printStackTrace();}
+         }
+         if(!table.isEmpty()){
+             System.err.println("WARNING! resetNextListID() waited a long time and the table never cleared.  Process may have stalled.");
+         }
+         while(!table.isEmpty()){
+             try {this.wait(2000);}
+             catch (InterruptedException e) {e.printStackTrace();}
+         }
+     }
+     nextListID=0;
+ }
+
+ /** @return The file name reported by the first writer. */
+ @Override
+ public final String fname(){
+     final String name=readstream1.fname();
+     return name;
+ }
+
+ /** @return true if this stream or either writer has recorded an error. */
+ @Override
+ public boolean errorState(){
+     if(errorState){return true;}
+     if(readstream1!=null && readstream1.errorState()){return true;}
+     return readstream2!=null && readstream2.errorState();
+ }
+
+ /** @return true only if join() completed and every existing writer reports success. */
+ @Override
+ public boolean finishedSuccessfully(){
+     if(!finishedSuccessfully){return false;}
+     if(readstream1!=null && !readstream1.finishedSuccessfully()){return false;}
+     return readstream2==null || readstream2.finishedSuccessfully();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ private synchronized void addOrdered(ArrayList<Read> list, long listnum){
+// System.err.println("RTOS got "+listnum+" of size "+(list==null ? "null" : list.size())+
+// " with first read id "+(list==null || list.isEmpty() || list.get(0)==null ? "null" : ""+list.get(0).numericID));
+ assert(list!=null) : listnum;
+ assert(listnum>=nextListID) : listnum+", "+nextListID;
+// assert(list.isEmpty() || list.get(0)==null || list.get(0).numericID>=nextReadID) : list.get(0).numericID+", "+nextReadID;
+ assert(!table.containsKey(listnum));
+
+ table.put(listnum, new ArrayList<Read>(list));
+
+ while(table.containsKey(nextListID)){
+// System.err.println("Writing list "+first.get(0).numericID);
+ ArrayList<Read> value=table.remove(nextListID);
+ write(value);
+ nextListID++;
+ }
+ if(table.isEmpty()){notifyAll();}
+ }
+
+ private synchronized void addDisordered(ArrayList<Read> list, long listnum){
+ assert(list!=null);
+ assert(table==null);
+ write(new ArrayList<Read>(list));
+ }
+
+ /**
+  * Hands the list to each existing writer thread.
+  * Each writer's own state is checked first, so that a list is never queued on a
+  * writer that has already terminated.
+  * @param list Reads to write.
+  * @throws RuntimeException if a writer thread has already terminated.
+  */
+ private synchronized void write(ArrayList<Read> list){
+     if(readstream1!=null){
+         if(readstream1.getState()==State.TERMINATED){throw new RuntimeException("Writing to a terminated thread.");}
+         readstream1.addList(list);
+     }
+     if(readstream2!=null){
+         //Check the second writer itself (previously this re-checked readstream1).
+         if(readstream2.getState()==State.TERMINATED){throw new RuntimeException("Writing to a terminated thread.");}
+         readstream2.addList(list);
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Getters ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** @return The first writer. */
+ @Override
+ public final ReadStreamWriter getRS1(){return readstream1;}
+ /** @return The second writer, or null when only one writer was created. */
+ @Override
+ public final ReadStreamWriter getRS2(){return readstream2;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final ReadStreamWriter readstream1;
+ private final ReadStreamWriter readstream2;
+ private long nextListID=0;
+
+ /** Number of lists held before the stream blocks */
+ private final int MAX_CAPACITY=256;
+ private final int ADD_LIMIT=MAX_CAPACITY-2;
+ private final int HALF_LIMIT=ADD_LIMIT/2;
+
+ /** For ordered output */
+ private final HashMap<Long, ArrayList<Read>> table;
+
+ {if(HALF_LIMIT<1){throw new RuntimeException("Capacity too low.");}}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/stream/ConcurrentLegacyReadInputStream.java b/current/stream/ConcurrentLegacyReadInputStream.java
new file mode 100755
index 0000000..13d443a
--- /dev/null
+++ b/current/stream/ConcurrentLegacyReadInputStream.java
@@ -0,0 +1,279 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.concurrent.TimeUnit;
+
+import align2.ListNum;
+
+public class ConcurrentLegacyReadInputStream extends ConcurrentReadInputStream {
+
+ /**
+  * @param source Producer that supplies the reads.
+  * @param maxReadsToGenerate Upper limit on reads; a negative value means unlimited.
+  */
+ public ConcurrentLegacyReadInputStream(ReadInputStream source, long maxReadsToGenerate){
+     producer=source;
+     depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+     if(maxReadsToGenerate>=0){
+         maxReads=maxReadsToGenerate;
+     }else{
+         maxReads=Long.MAX_VALUE;
+     }
+     if(maxReads==0){
+         //A zero-read stream is almost certainly a caller mistake.
+         System.err.println("Warning - created a read stream for 0 reads.");
+         assert(false);
+     }
+ }
+
+ /**
+  * Blocks until a full list is available and returns it tagged with the next list number.
+  * @return The next list, or null if the wait was interrupted after shutdown.
+  */
+ public synchronized ListNum<Read> nextList() {
+     ArrayList<Read> fetched=null;
+     do{
+         try {
+             fetched=depot.full.take();
+         } catch (InterruptedException e) {
+             e.printStackTrace();
+             if(shutdown){
+                 return null;
+             }
+         }
+     }while(fetched==null);
+     return new ListNum<Read>(fetched, listnum++);
+ }
+
+ /**
+  * Gives a buffer back to the depot after a list has been consumed.
+  * @param listNumber Unused by this implementation.
+  * @param poison If true, a new zero-capacity list is added to the full queue;
+  * otherwise a new list of capacity BUF_LEN is added to the empty queue.
+  */
+ public void returnList(long listNumber, boolean poison){
+     if(!poison){
+         if(verbose){System.err.println("cris_:    A: Adding empty list to empty.");}
+         depot.empty.add(new ArrayList<Read>(BUF_LEN));
+         return;
+     }
+     if(verbose){System.err.println("cris_:    A: Adding empty list to full.");}
+     depot.full.add(new ArrayList<Read>(0));
+ }
+
+ /**
+  * Body of the reader thread: records the current thread, reads from the producer
+  * (by lists or by single reads, as the producer prefers), adds poison to the full
+  * queue, and finally moves any lists left in the empty queue to the full queue.
+  */
+ @Override
+ public void run() {
+// producer.start();
+ //Remember the thread running this method so shutdown() can interrupt it.
+ threads=new Thread[] {Thread.currentThread()};
+
+ if(producer.preferLists()){
+ readLists();
+ }else{
+ readSingles();
+ }
+
+ addPoison();
+
+ //End thread
+
+ //Move whatever is still in the empty queue over to the full queue.
+ while(!depot.empty.isEmpty()){
+ depot.full.add(depot.empty.poll());
+ }
+// System.err.println(depot.full.size()+", "+depot.empty.size());
+ }
+
+ /**
+  * Signals end of input: adds one new empty list to the full queue, then, for each of
+  * the remaining depot.bufferCount-1 buffers, takes a list from the empty queue and
+  * moves it to the full queue. An interrupt while shutdown is set aborts the remainder.
+  */
+ private final void addPoison(){
+ //System.err.println("Adding poison.");
+ //Add poison pills
+ depot.full.add(new ArrayList<Read>());
+ for(int i=1; i<depot.bufferCount; i++){
+ ArrayList<Read> list=null;
+ //Poll in 1-second slices until a list is obtained.
+ while(list==null){
+ try {
+ list=depot.empty.poll(1000, TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+// System.err.println("Do not be alarmed by the following error message:");
+// e.printStackTrace();
+ if(shutdown){
+ //Force the outer for-loop to end as well.
+ i=depot.bufferCount;
+ break;
+ }
+ }
+ }
+ if(list!=null){depot.full.add(list);}
+ }
+ //System.err.println("Added poison.");
+ }
+
+ /**
+  * Fills buffers one read at a time from a producer that does not prefer lists.
+  * Stops on shutdown, when the producer is exhausted, when maxReads reads have been
+  * generated, or when MAX_DATA bases have been buffered.
+  * Updates readsIn and basesIn for every read and mate consumed, matching readLists(),
+  * so that readsIn() and basesIn() report the input regardless of which path was used.
+  */
+ private final void readSingles(){
+
+     long bases=0;
+     while(!shutdown && producer.hasMore() && generated<maxReads && bases<MAX_DATA){
+         ArrayList<Read> list=null;
+         while(list==null){
+             try {
+                 list=depot.empty.take();
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+                 if(shutdown){break;}
+             }
+         }
+         if(shutdown || list==null){break;}
+
+         for(int i=0; i<depot.bufferSize && generated<maxReads && bases<MAX_DATA; i++){
+             Read r=producer.next();
+             if(r==null){break;}
+             list.add(r);
+
+             //Input statistics, counted the same way as in readLists().
+             readsIn++;
+             basesIn+=r.length();
+             if(r.mate!=null){
+                 readsIn++;
+                 basesIn+=r.mateLength();
+             }
+
+             bases+=r.length();
+             bases+=(r.mate==null || r.mate.bases==null ? 0 : r.mateLength());
+             generated++;
+         }
+         depot.full.add(list);
+     }
+ }
+
+ /**
+  * Fills depot buffers from a list-producing source.
+  * Producer lists are either copied wholesale into the current output list (when they fit,
+  * the read limit is not at risk and no sampling is active) or copied read by read, applying
+  * the sample rate. A producer list that is only partly consumed is carried over to the next
+  * output list via 'buffer' and 'next'.
+  */
+ private final void readLists(){
+
+ ArrayList<Read> buffer=null;
+ ArrayList<Read> list=null;
+ int next=0;
+ while(buffer!=null || (!shutdown && producer.hasMore() && generated<maxReads)){
+ //Obtain an empty output list, retrying on interrupt unless shut down.
+ while(list==null){
+ //System.err.println("Fetching a list: generated="+generated+"/"+maxReads);
+ try {
+ list=depot.empty.take();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ if(shutdown){break;}
+ }
+ //System.err.println("Fetched");
+ }
+ if(shutdown || list==null){
+ //System.err.println("Shutdown triggered; breaking.");
+ break;
+ }
+
+ //Bases placed in the current output list; bounded by MAX_DATA.
+ long bases=0;
+ while(list.size()<depot.bufferSize && generated<maxReads && bases<MAX_DATA){
+ if(buffer==null || next>=buffer.size()){
+ buffer=producer.nextList();
+ next=0;
+ }
+ if(buffer==null){break;}
+ assert(buffer.size()<=BUF_LEN); //Although this is not really necessary.
+
+ if(buffer.size()<=(BUF_LEN-list.size()) && (buffer.size()+generated)<maxReads && randy==null){
+ //Then do a quicker bulk operation
+ list.addAll(buffer);
+ for(Read a : buffer){
+ readsIn++;
+ basesIn+=a.length();
+ bases+=a.length();
+ if(a.mate!=null){
+ readsIn++;
+ basesIn+=a.mateLength();
+ bases+=a.mateLength();
+ }
+ }
+ generated+=buffer.size();
+ next=0;
+ buffer=null;
+ }else{
+ //Slow path: copy one read at a time, resuming at index 'next'.
+ while(next<buffer.size() && list.size()<depot.bufferSize && generated<maxReads && bases<MAX_DATA){
+ Read r=buffer.get(next);
+ //Input counters include reads that sampling later drops.
+ readsIn++;
+ basesIn+=r.length();
+ if(r.mate!=null){
+ readsIn++;
+ basesIn+=r.mateLength();
+ }
+ if(randy==null || randy.nextFloat()<samplerate){
+ list.add(r);
+ bases+=r.length();
+ bases+=(r.mate==null || r.mate.bases==null ? 0 : r.mateLength());
+ }
+ generated++;
+// if(generated>1 && (generated%1000000)==0){System.err.println("Generated read #"+generated);}
+ next++;
+ }
+
+ //Producer list fully consumed; fetch a new one on the next pass.
+ if(next>=buffer.size()){
+ buffer=null;
+ next=0;
+ }
+ }
+ }
+ //System.err.println("Adding list to full depot.");
+ depot.full.add(list);
+ //System.err.println("Added.");
+ list=null;
+ }
+
+ }
+
+ private boolean shutdown=false;
+
+ /**
+  * Sets the shutdown flag and interrupts the reader thread if it is running.
+  * Safe to call before run() has been entered, when no thread has been recorded yet.
+  */
+ @Override
+ public void shutdown(){
+     shutdown=true;
+     final Thread[] workers=threads; //Only assigned once run() starts.
+     if(workers!=null && workers[0]!=null && workers[0].isAlive()){
+         workers[0].interrupt();
+     }
+ }
+
+ /**
+  * Resets this stream so it can be used again: clears the shutdown flag, restarts the
+  * producer, allocates a new depot and zeroes all counters, including the list number
+  * handed out by nextList().
+  */
+ @Override
+ public synchronized void restart(){
+     shutdown=false;
+     producer.restart();
+     depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+     generated=0;
+     basesIn=0;
+     readsIn=0;
+     listnum=0; //List numbering starts over, as in ConcurrentGenericReadInputStream.restart().
+ }
+
+ /**
+  * Closes the producer and records any error it reports, so that errorState()
+  * reflects a failed close.
+  */
+ @Override
+ public synchronized void close(){
+     errorState|=producer.close();
+ }
+
+ /** @return Whatever the producer reports for paired(). */
+ @Override
+ public boolean paired() {
+ return producer.paired();
+ }
+
+ /** @return The class-wide verbose flag. */
+ @Override
+ public boolean verbose(){return verbose;}
+
+ /**
+  * Configures random subsampling.
+  * @param rate Fraction of reads to keep; a value of 1 or more disables sampling.
+  * @param seed Seed for the random generator; any value below 0 selects an unseeded generator.
+  */
+ @Override
+ public void setSampleRate(float rate, long seed){
+     samplerate=rate;
+     if(rate>=1f){
+         //No generator means no sampling.
+         randy=null;
+         return;
+     }
+     randy=(seed>-1 ? new java.util.Random(seed) : new java.util.Random());
+ }
+
+ /** @return Value of the basesIn counter, which restart() resets to 0. */
+ public long basesIn(){return basesIn;}
+ /** @return Value of the readsIn counter, which restart() resets to 0. */
+ public long readsIn(){return readsIn;}
+
+ /** @return true if this stream's error flag is set or the producer reports an error. */
+ @Override
+ public boolean errorState(){return errorState || (producer!=null && producer.errorState());}
+ /** TODO */
+ private boolean errorState=false;
+
+ private float samplerate=1f;
+ private java.util.Random randy=null;
+
+ public Object[] producers(){return new Object[] {producer};}
+
+ private Thread[] threads;
+
+ public final ReadInputStream producer;
+ private ConcurrentDepot<Read> depot;
+
+ public static boolean verbose=false;
+
+ private long basesIn=0;
+ private long readsIn=0;
+
+ private long maxReads;
+ private long generated=0;
+ private long listnum=0;
+
+
+}
diff --git a/current/stream/ConcurrentReadInputStream.java b/current/stream/ConcurrentReadInputStream.java
new file mode 100755
index 0000000..41d02f4
--- /dev/null
+++ b/current/stream/ConcurrentReadInputStream.java
@@ -0,0 +1,252 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.mpi.ConcurrentReadInputStreamMPI;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import align2.ListNum;
+import align2.Shared;
+
+/**
+ * Abstract superclass of all ConcurrentReadStreamInterface implementations.
+ * @author Brian Bushnell
+ * @date Nov 26, 2014
+ *
+ */
+public abstract class ConcurrentReadInputStream implements ConcurrentReadStreamInterface {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected ConcurrentReadInputStream(){}
+
+ /**
+  * Creates a stream from file names given as strings.
+  * @param maxReads Passed through to the FileFormat-based factory.
+  * @param keepSamHeader Passed through to the FileFormat-based factory.
+  * @param allowSubprocess Passed to FileFormat.testInput for both inputs.
+  * @param args Up to four entries: in1, in2, qf1, qf2. Entries equal to "null"
+  * (case-insensitive) are replaced by null in the caller's array.
+  * @return The stream built by the FileFormat-based factory.
+  */
+ protected static ConcurrentReadInputStream getReadInputStream(long maxReads, boolean keepSamHeader, boolean allowSubprocess, String...args){
+     assert(args.length>0) : Arrays.toString(args);
+
+     //Turn the literal word "null" into a real null, in place.
+     int idx=0;
+     while(idx<args.length){
+         if("null".equalsIgnoreCase(args[idx])){
+             args[idx]=null;
+         }
+         idx++;
+     }
+     assert(args[0]!=null) : Arrays.toString(args);
+     assert(args.length<2 || !args[0].equalsIgnoreCase(args[1]));
+
+     //Positional arguments; missing ones stay null.
+     final String in1=args[0];
+     final String in2=(args.length>1 ? args[1] : null);
+     final String qf1=(args.length>2 ? args[2] : null);
+     final String qf2=(args.length>3 ? args[3] : null);
+
+     final FileFormat ff1=FileFormat.testInput(in1, null, allowSubprocess);
+     final FileFormat ff2=FileFormat.testInput(in2, null, allowSubprocess);
+
+     return getReadInputStream(maxReads, keepSamHeader, ff1, ff2, qf1, qf2);
+ }
+
+ /** Convenience overload: no quality files; MPI settings taken from Shared.USE_MPI and Shared.MPI_KEEP_ALL. */
+ public static ConcurrentReadInputStream getReadInputStream(long maxReads, boolean keepSamHeader, FileFormat ff1, FileFormat ff2){
+ return getReadInputStream(maxReads, keepSamHeader, ff1, ff2, (String)null, (String)null, Shared.USE_MPI, Shared.MPI_KEEP_ALL);
+ }
+
+ /** Convenience overload: no quality files; MPI settings given explicitly. */
+ public static ConcurrentReadInputStream getReadInputStream(long maxReads, boolean keepSamHeader, FileFormat ff1, FileFormat ff2,
+ final boolean mpi, final boolean keepAll){
+ return getReadInputStream(maxReads, keepSamHeader, ff1, ff2, (String)null, (String)null, mpi, keepAll);
+ }
+
+ /** Convenience overload: quality files given; MPI settings taken from Shared.USE_MPI and Shared.MPI_KEEP_ALL. */
+ public static ConcurrentReadInputStream getReadInputStream(long maxReads, boolean keepSamHeader,
+ FileFormat ff1, FileFormat ff2, String qf1, String qf2){
+ return getReadInputStream(maxReads, keepSamHeader, ff1, ff2, qf1, qf2, Shared.USE_MPI, Shared.MPI_KEEP_ALL);
+ }
+
+ /**
+  * Builds a stream for the given inputs and returns the result of its getReads() call.
+  * NOTE(review): the stream is not started here before getReads() is called - confirm
+  * that nextList() cannot block on an unstarted stream.
+  */
+ public static ArrayList<Read> getReads(long maxReads, boolean keepSamHeader,
+ FileFormat ff1, FileFormat ff2, String qf1, String qf2){
+ ConcurrentReadInputStream cris=getReadInputStream(maxReads, keepSamHeader, ff1, ff2, qf1, qf2, Shared.USE_MPI, Shared.MPI_KEEP_ALL);
+ return cris.getReads();
+ }
+
+ /**
+  * Collects every read from this stream into one list, then closes the stream.
+  * The stream is started first if start() has not been called yet, because the
+  * buffers that nextList() hands out are only filled once the stream is running.
+  * A warning is printed if closing the stream reports an error.
+  * @return All reads, in the order the lists were delivered.
+  */
+ public ArrayList<Read> getReads(){
+
+     if(!started){start();}
+
+     ListNum<Read> ln=nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     ArrayList<Read> out=new ArrayList<Read>();
+
+     while(reads!=null && reads.size()>0){
+         out.addAll(reads);
+         returnList(ln.id, ln.list.isEmpty());
+         ln=nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     //Hand back the final (empty or null) list as poison.
+     if(ln!=null){
+         returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+     boolean error=ReadWrite.closeStream(this);
+     if(error){
+         System.err.println("Warning - an error was encountered during read input.");
+     }
+     return out;
+ }
+
+ /**
+  * Main factory: chooses the stream implementation from the format of ff1.
+  * With mpi set, rank 0 builds and starts a local stream and every rank receives a
+  * distributed wrapper around it (the wrapped stream is null on other ranks).
+  * @param maxReads Passed to the stream constructors as the read limit.
+  * @param keepSamHeader Passed to the SAM reader for the first input.
+  * @param ff1 Format of the first input; must not be null unless mpi is set.
+  * @param ff2 Format of the second input, or null.
+  * @param qf1 Quality file for the first input (fasta input only), or null.
+  * @param qf2 Quality file for the second input (fasta input only), or null.
+  * @param mpi Whether to build a distributed stream.
+  * @param keepAll Passed to the distributed wrapper.
+  * @return A new, unstarted stream (the MPI wrapper's inner stream is already started on rank 0).
+  * @throws RuntimeException for csfasta input or an unrecognized format.
+  */
+ public static ConcurrentReadInputStream getReadInputStream(long maxReads, boolean keepSamHeader,
+         FileFormat ff1, FileFormat ff2, String qf1, String qf2, final boolean mpi, final boolean keepAll){
+
+     if(mpi){
+         final int rank=Shared.MPI_RANK;
+         final ConcurrentReadInputStream cris0;
+         if(rank==0){
+             cris0=getReadInputStream(maxReads, keepSamHeader, ff1, ff2, qf1, qf2, false, true);
+             cris0.start();
+         }else{
+             cris0=null;
+         }
+         final ConcurrentReadInputStream crisD;
+         if(Shared.USE_CRISMPI){
+             crisD=new ConcurrentReadInputStreamMPI(cris0, rank==0, keepAll);
+         }else{
+             crisD=new ConcurrentReadInputStreamD(cris0, rank==0, keepAll);
+         }
+         return crisD;
+     }
+
+     assert(ff1!=null);
+     assert(ff2==null || ff1.name()==null || !ff1.name().equalsIgnoreCase(ff2.name())) : ff1.name()+", "+ff2.name();
+     //A quality file must not be the sequence file itself.
+     assert(qf1==null || ff1.name()==null || !ff1.name().equalsIgnoreCase(qf1));
+     //The two quality files must be distinct, just like the two sequence files.
+     assert(qf1==null || qf2==null || !qf1.equalsIgnoreCase(qf2));
+
+     final ConcurrentReadInputStream cris;
+
+     if(ff1.fastq()){
+
+         ReadInputStream ris1=new FastqReadInputStream(ff1);
+         ReadInputStream ris2=(ff2==null ? null : new FastqReadInputStream(ff2));
+         cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads);
+
+     }else if(ff1.fasta()){
+
+         ReadInputStream ris1=(qf1==null ? new FastaReadInputStream(ff1, (FASTQ.FORCE_INTERLEAVED && ff2==null), Shared.AMINO_IN, ff2==null ? Shared.READ_BUFFER_MAX_DATA : -1)
+                 : new FastaQualReadInputStream(ff1, qf1));
+         ReadInputStream ris2=(ff2==null ? null : qf2==null ? new FastaReadInputStream(ff2, false, Shared.AMINO_IN, -1) : new FastaQualReadInputStream(ff2, qf2));
+         cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads);
+
+     }else if(ff1.scarf()){
+
+         ReadInputStream ris1=new ScarfReadInputStream(ff1);
+         ReadInputStream ris2=(ff2==null ? null : new ScarfReadInputStream(ff2));
+         cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads);
+
+     }else if(ff1.samOrBam()){
+
+         ReadInputStream ris1=new SamReadInputStream(ff1, keepSamHeader, FASTQ.FORCE_INTERLEAVED);
+         ReadInputStream ris2=(ff2==null ? null : new SamReadInputStream(ff2, false, false));
+         cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads);
+
+     }else if(ff1.bread()){
+
+         RTextInputStream rtis=new RTextInputStream(ff1, ff2, maxReads);
+         cris=new ConcurrentLegacyReadInputStream(rtis, maxReads); //TODO:  Change to generic
+
+     }else if(ff1.sequential()){
+
+         SequentialReadInputStream ris=new SequentialReadInputStream(maxReads, 200, 50, 0, false);
+         cris=new ConcurrentLegacyReadInputStream(ris, maxReads);
+
+     }else if(ff1.csfasta()){
+         throw new RuntimeException("csfasta is no longer supported.");
+     }else if(ff1.random()){
+
+         RandomReadInputStream3 ris=new RandomReadInputStream3(maxReads, FASTQ.FORCE_INTERLEAVED);
+         cris=new ConcurrentGenericReadInputStream(ris, null, maxReads);
+
+     }else{
+         cris=null;
+         throw new RuntimeException(""+ff1);
+     }
+
+     return cris;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /** Runs this stream's run() method on a new thread and then marks the stream as started. */
+ @Override
+ public void start(){
+// System.err.println("Starting "+this);
+ new Thread(this).start(); //Prevents a strange deadlock in ConcurrentCollectionReadInputStream
+ started=true;
+ }
+
+ /** @return true once start() has been called on this stream. */
+ public final boolean started(){return started;}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Abstract Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public abstract ListNum<Read> nextList();
+
+ @Override
+ public abstract void returnList(long listNum, boolean poison);
+
+ @Override
+ public abstract void run();
+
+ @Override
+ public abstract void shutdown();
+
+ @Override
+ public abstract void restart();
+
+ @Override
+ public abstract void close();
+
+ @Override
+ public abstract boolean paired();
+
+ @Override
+ public abstract Object[] producers();
+
+ @Override
+ public abstract boolean errorState();
+
+ @Override
+ public abstract void setSampleRate(float rate, long seed);
+
+ @Override
+ public abstract long basesIn();
+
+ @Override
+ public abstract long readsIn();
+
+ @Override
+ public abstract boolean verbose();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+ final int NUM_BUFFS=Shared.numBuffers();
+ final long MAX_DATA=Shared.READ_BUFFER_MAX_DATA;
+ public boolean ALLOW_UNEQUAL_LENGTHS=false;
+ boolean started=false;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean SHOW_PROGRESS=false;
+ public static boolean SHOW_PROGRESS2=false; //Indicate time in seconds between dots.
+ public static long PROGRESS_INCR=1000000;
+ public static boolean REMOVE_DISCARDED_READS=false;
+
+}
diff --git a/current/stream/ConcurrentReadInputStreamD.java b/current/stream/ConcurrentReadInputStreamD.java
new file mode 100755
index 0000000..3c72838
--- /dev/null
+++ b/current/stream/ConcurrentReadInputStreamD.java
@@ -0,0 +1,500 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.concurrent.TimeUnit;
+
+import align2.ListNum;
+import align2.Shared;
+
+/**
+ * This class is designed for distributed environments.
+ * The 'master' reads from the filesystem, creates reads, and broadcasts them.
+ * The 'slaves' listen for broadcasts.
+ * @author Brian Bushnell
+ * @date Oct 7, 2014
+ *
+ */
+public class ConcurrentReadInputStreamD extends ConcurrentReadInputStream {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public ConcurrentReadInputStreamD(ConcurrentReadInputStream cris_, boolean master_, boolean keepAll_){
+ source=cris_;
+ master=master_;
+ rank=Shared.MPI_RANK;
+ ranks=Shared.MPI_NUM_RANKS;
+ depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+ assert(master==(cris_!=null));
+
+ if(master){
+ paired=source.paired();
+ broadcastPaired(paired);
+ keepAll=keepAll_;
+ broadcastKeepall(keepAll);
+ }else{
+ paired=listenPaired();
+ keepAll=listenKeepall();
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Blocks until a list is available in the 'full' queue and returns it, numbered sequentially.
+  * @return The next list, or null if shutdown was flagged before a list could be taken.
+  */
+ @Override
+ public synchronized ListNum<Read> nextList() {
+     if(verbose){System.err.println("crisD: **************** nextList() was called; shutdown="+shutdown+", depot.full="+depot.full.size());}
+     ArrayList<Read> fetched=null;
+     do{
+         if(shutdown){
+             if(verbose){System.err.println("crisD: **************** nextList() returning null; shutdown="+shutdown+", depot.full="+depot.full.size());}
+             return null;
+         }
+         try {
+             fetched=depot.full.take();
+             assert(fetched!=null);
+         } catch (InterruptedException e) {
+             //An interrupt only causes the shutdown flag to be re-checked.
+             e.printStackTrace();
+         }
+     }while(fetched==null);
+
+     if(verbose){System.err.println("crisD: **************** nextList() returning list of size "+fetched.size()+"; shutdown="+shutdown+", depot.full="+depot.full.size());}
+     final ListNum<Read> numbered=new ListNum<Read>(fetched, listnum);
+     listnum++;
+     return numbered;
+ }
+
+ /**
+  * Hands a buffer slot back to the depot.
+  * @param listNumber Unused by this implementation.
+  * @param poison If true, an empty list is added to 'full'; otherwise a fresh list is added to 'empty'.
+  */
+ @Override
+ public void returnList(long listNumber, boolean poison){
+     if(!poison){
+         if(verbose){System.err.println("crisD: A: Adding empty list to empty.");}
+         depot.empty.add(new ArrayList<Read>(BUF_LEN)); //Technically this could be a length-0 list since it is never used.
+         return;
+     }
+     if(verbose){System.err.println("crisD: A: Adding empty list to full.");}
+     depot.full.add(new ArrayList<Read>(0));
+ }
+
+ /**
+  * Producer body: marks the stream as running, fills the depot via the master or slave loop,
+  * adds poison lists, moves any remaining 'empty' lists to 'full', then clears the running flag.
+  * Asserts that it is not entered while already running.
+  */
+ @Override
+ public void run() {
+ //The running flag lives in a one-element array that also serves as the lock object.
+ synchronized(running){
+ assert(!running[0]) : "This cris was started by multiple threads.";
+ running[0]=true;
+ }
+ if(verbose){System.err.println("crisD: cris started.");}
+ //Record the executing thread; shutdown() and close() consult this array.
+ threads=new Thread[] {Thread.currentThread()};
+
+ if(master){
+ readLists_master();
+ }else{
+ readLists_slave();
+ }
+
+ addPoison();
+
+ //End thread
+
+ //Move whatever is left in 'empty' over to 'full', unless shutdown was flagged.
+ while(!depot.empty.isEmpty() && !shutdown){
+// System.out.println("crisD: Ending");
+ if(verbose){System.err.println("crisD: B: Adding empty lists to full.");}
+ depot.full.add(depot.empty.poll());
+ }
+ if(verbose){System.err.println("crisD: cris thread syncing before shutdown.");}
+
+ synchronized(running){//TODO Note: for some reason syncing on 'this' instead of 'running' causes a hang. Something else must be syncing improperly on this.
+ assert(running[0]);
+ running[0]=false;
+ }
+ if(verbose){System.err.println("crisD: cris thread terminated. Final depot size: "+depot.full.size()+", "+depot.empty.size());}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Adds one new empty list to 'full', then for each remaining buffer (bufferCount-1 of them)
+  * waits for a list to appear in 'empty' and moves it to 'full'.
+  * An interrupt while shutdown is set abandons the remaining transfers.
+  */
+ private final void addPoison(){
+ //System.err.println("crisD: Adding poison.");
+ //Add poison pills
+ if(verbose){System.err.println("crisD: C: Adding poison to full.");}
+ depot.full.add(new ArrayList<Read>());
+ for(int i=1; i<depot.bufferCount; i++){
+ ArrayList<Read> list=null;
+ //Poll in 1-second slices; a timeout leaves list null and the loop retries.
+ while(list==null){
+ try {
+ list=depot.empty.poll(1000, TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+// System.err.println("crisD: Do not be alarmed by the following error message:");
+// e.printStackTrace();
+ if(shutdown){
+ //Force the outer for-loop to terminate after this iteration.
+ i=depot.bufferCount;
+ break;
+ }
+ }
+ }
+ //list is null only when the wait was abandoned because of shutdown.
+ if(list!=null){
+ if(verbose){System.err.println("crisD: D: Adding list("+list.size()+") to full.");}
+ depot.full.add(list);
+ }
+ }
+ if(verbose){System.err.println("crisD: Added poison.");}
+ }
+
+ /**
+  * Master loop: pulls lists from the wrapped source, keeps the ones this rank should process,
+  * forwards every list via broadcast(), and returns each list to the source.
+  * Stops on shutdown, on a null ListNum or null list, or after handling an empty list.
+  * When not in keepAll mode, the last list seen is afterwards unicast to every other rank.
+  */
+ private final void readLists_master(){
+
+     if(verbose){System.err.println("crisD: Entered readLists_master().");}
+     ListNum<Read> lnForUnicastShutdown=null;
+     //A null ListNum (e.g. from a source that was shut down) ends the loop instead of causing a NullPointerException.
+     for(ListNum<Read> ln=source.nextList(); !shutdown && ln!=null && ln.list!=null; ln=source.nextList()){
+         final ArrayList<Read> reads=ln.list;
+         final int count=(reads==null ? 0 : reads.size());
+
+         if(verbose){System.err.println("crisD: Master fetched "+count+" reads.");}
+
+         if(keepAll || count==0 || (ln.id%ranks)==rank){//Decide whether to process this list
+
+             //Take (and discard) one list from 'empty' before publishing to 'full'.
+             ArrayList<Read> dummy=null;
+             while(dummy==null && !shutdown){
+                 try {
+                     dummy=depot.empty.take();
+                 } catch (InterruptedException e) {
+                     e.printStackTrace();
+                     if(shutdown){break;}
+                 }
+             }
+
+             try {
+                 depot.full.put(reads);
+                 if(verbose){System.err.println("crisD: Master added reads to depot.");}
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         broadcast(ln);
+         lnForUnicastShutdown=ln;
+         if(verbose){System.err.println("crisD: Master broadcasted.");}
+         source.returnList(ln.id, count<1);
+         if(verbose){System.err.println("crisD: Master returned a list.");}
+         if(count<1){break;}
+     }
+     if(!keepAll){//Shutdown all slaves if unicasting
+         //NOTE(review): lnForUnicastShutdown is still null here if the loop body never ran - confirm unicast() tolerates that.
+         for(int i=1; i<ranks; i++){
+             unicast(lnForUnicastShutdown, i);
+         }
+     }
+     if(verbose){System.err.println("crisD: Finished readLists_master().");}
+ }
+
+ /**
+  * Slave loop: receives lists via listen() and stores the ones this rank should process.
+  * Stops on shutdown, on a null ListNum, or after handling an empty list.
+  */
+ private final void readLists_slave(){
+
+     if(verbose){System.err.println("crisD: Entered readLists_slave().");}
+     while(true){
+         final ListNum<Read> ln=listen();
+         if(shutdown || ln==null){break;}
+
+         final ArrayList<Read> reads=ln.list;
+         final int count=(reads==null ? 0 : reads.size());
+
+         if(verbose){System.err.println("crisD: Slave fetched "+count+" reads.");}
+
+         final boolean mine=(keepAll || count==0 || (ln.id%ranks)==rank);//Decide whether to process this list
+         if(mine){
+             //Take (and discard) one list from 'empty' before publishing to 'full'.
+             ArrayList<Read> dummy=null;
+             while(dummy==null && !shutdown){
+                 try {
+                     dummy=depot.empty.take();
+                 } catch (InterruptedException e) {
+                     e.printStackTrace();
+                     if(shutdown){break;}
+                 }
+             }
+
+             try {
+                 depot.full.put(reads);
+                 if(verbose){System.err.println("crisD: Slave added reads to depot.");}
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         if(count<1){break;}
+     }
+     if(verbose){System.err.println("crisD: Finished readLists_slave().");}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Concurrency Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Sends a list to the other ranks.
+  * Outside keepAll mode a non-empty list is unicast to rank (id % ranks) instead.
+  * The broadcast path itself is an unimplemented placeholder and always throws.
+  * @param ln List to send.
+  */
+ protected void broadcast(ListNum<Read> ln){
+     final boolean sendToOneRank=(!keepAll && ln.size()>0);//Decide how to send this list
+     if(sendToOneRank){
+         unicast(ln, (int)(ln.id%ranks));
+         return;
+     }
+
+     if(verbose){System.err.println("crisD "+(master?"master":"slave ")+": Broadcasting reads.");}
+
+     boolean done=false;
+     while(!done){
+         if(shutdown){break;}
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+     throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Sends a list to a single rank. A no-op when the target is this rank;
+  * otherwise an unimplemented placeholder that always throws.
+  * @param ln List to send.
+  * @param toRank Destination rank.
+  */
+ protected void unicast(ListNum<Read> ln, final int toRank){
+     if(toRank==rank){return;}
+     if(verbose){System.err.println("crisD "+(master?"master":"slave ")+": Unicasting reads to "+toRank+".");}
+
+     boolean done=false;
+     while(!done){
+         if(shutdown){break;}
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+     throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Placeholder for announcing the pairing status; currently performs no communication.
+  * @param b Pairing status to announce.
+  */
+ protected void broadcastPaired(boolean b){
+     if(verbose){System.err.println("crisD "+(master?"master":"slave ")+": Broadcasting pairing status.");}
+     boolean done=false;
+     while(!done){
+         if(shutdown){break;}
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+// throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Placeholder for announcing the keepAll status; currently performs no communication.
+  * @param b keepAll status to announce.
+  */
+ protected void broadcastKeepall(boolean b){
+     if(verbose){System.err.println("crisD "+(master?"master":"slave ")+": Broadcasting keepAll status.");}
+     boolean done=false;
+     while(!done){
+         if(shutdown){break;}
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+// throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Placeholder for receiving a list of reads; always throws.
+  * @return Never returns normally in this implementation.
+  */
+ protected ListNum<Read> listen(){
+     if(verbose){System.err.println("crisD "+(master?"master":"slave ")+": Listening to "+0+" for reads.");}
+     boolean done=false;
+     while(!done){
+         if(shutdown){break;}
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+     throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Placeholder for receiving the pairing status; always throws.
+  * @return Never returns normally in this implementation.
+  */
+ protected boolean listenPaired(){
+     if(verbose){System.err.println("crisD "+(master?"master":"slave ")+": Listening to "+0+" for pairing status.");}
+     boolean done=false;
+     while(!done){
+         if(shutdown){break;}
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+     throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Placeholder for receiving the keepAll status; always throws.
+  * @return Never returns normally in this implementation.
+  */
+ protected boolean listenKeepall(){
+     if(verbose){System.err.println("crisD "+(master?"master":"slave ")+": Listening to "+0+" for keepAll status.");}
+     boolean done=false;
+     while(!done){
+         if(shutdown){break;}
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+     throw new RuntimeException("TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Termination ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Flags this stream as shut down, shuts down the wrapped source (master only),
+  * and interrupts any live producer thread so that blocking queue operations wake up.
+  * Repeated calls after the first are no-ops until restart() clears the flag.
+  * Previously the flag was set before being tested, so the shutdown work never ran.
+  */
+ @Override
+ public void shutdown(){
+     if(verbose){System.out.println("crisD: Called shutdown.");}
+
+     final boolean alreadyShutdown=shutdown;
+     shutdown=true;
+     if(alreadyShutdown){return;}
+
+     if(master){
+         source.shutdown();
+     }
+     //threads is only assigned once run() has started, so it may still be null.
+     final Thread[] producers=threads;
+     if(producers==null){return;}
+     for(Thread t : producers){
+         if(t!=null && t.isAlive()){
+             t.interrupt();
+         }
+     }
+ }
+
+ /**
+  * Clears the shutdown flag, replaces the depot with a fresh one, zeroes the counters and
+  * list numbering, and restarts the wrapped source when this is the master.
+  */
+ @Override
+ public synchronized void restart(){
+     shutdown=false;
+     depot=new ConcurrentDepot<Read>(BUF_LEN, NUM_BUFFS);
+     listnum=0; //Added Oct 9, 2014
+     readsIn=0;
+     basesIn=0;
+     if(!master){return;}
+     source.restart();
+ }
+
+ /**
+  * Calls shutdown(), closes the wrapped source (master only), then keeps draining the 'full'
+  * queue back into 'empty' for as long as the producer thread (threads[0]) is alive.
+  * Finally joins any additional threads stored after index 0.
+  */
+ @Override
+ public synchronized void close(){
+ shutdown();
+
+ if(master){
+ source.close();
+ }else{
+
+ }
+
+ if(threads!=null && threads[0]!=null && threads[0].isAlive()){
+
+ while(threads[0].isAlive()){
+// System.out.println("crisD: B");
+ ArrayList<Read> list=null;
+ //Poll up to 1000 times, 200 ms each, stopping early once a list arrives or the thread dies.
+ for(int i=0; i<1000 && list==null && threads[0].isAlive(); i++){
+ try {
+ list=depot.full.poll(200, TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ System.err.println("crisD: Do not be alarmed by the following error message:");
+ e.printStackTrace();
+ break;
+ }
+ }
+
+ //Recycle the drained list so a producer waiting on 'empty' can make progress.
+ if(list!=null){
+ list.clear();
+ depot.empty.add(list);
+ }
+
+// System.out.println("crisD: isAlive? "+threads[0].isAlive());
+ }
+
+ }
+
+ if(threads!=null){
+ //Index 0 was handled above; run() only ever stores a single thread, so this loop is normally empty.
+ for(int i=1; i<threads.length; i++){
+ while(threads[i]!=null && threads[i].isAlive()){
+ try {
+ threads[i].join();
+ } catch (InterruptedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Getters ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public boolean paired() {return paired;}
+
+ @Override
+ public boolean verbose(){return verbose;}
+
+ /**
+  * Forwards the sampling settings to the wrapped source; does nothing on a slave.
+  * @param rate Sampling rate passed through unchanged.
+  * @param seed Seed passed through unchanged.
+  */
+ @Override
+ public void setSampleRate(float rate, long seed){
+     if(!master){return;}
+     source.setSampleRate(rate, seed);
+ }
+
+ /** @return Value of the basesIn counter (only ever reset to 0 by restart() in this class). */
+ @Override
+ public long basesIn(){return basesIn;}
+ /** @return Value of the readsIn counter (only ever reset to 0 by restart() in this class). */
+ @Override
+ public long readsIn(){return readsIn;}
+
+ /**
+  * @return This stream's error flag; on the master it is combined with the wrapped source's error state.
+  */
+ @Override
+ public boolean errorState(){
+     if(!master){
+         return errorState;
+     }
+     //Non-short-circuit OR: the source is always queried.
+     return errorState|source.errorState();
+ }
+
+ /**
+  * @return The wrapped source's producers on the master; null on a slave.
+  */
+ @Override
+ public Object[] producers(){
+     return master ? source.producers() : null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Wrapped source of reads. Null for slaves. */
+ private ConcurrentReadInputStream source;
+ /** True if this process wraps a local source and sends reads to the others. */
+ private final boolean master;
+ /** When true, every list is stored locally; otherwise only lists with (id % ranks)==rank, plus empty lists. */
+ protected final boolean keepAll;
+ /** This process's rank and the total number of ranks, copied from Shared at construction. */
+ protected final int rank, ranks;
+
+ /** Local error flag; never set to true within this class. */
+ private boolean errorState=false;
+
+ /** One-element flag, also used as the lock object in run(). */
+ private boolean[] running=new boolean[] {false};
+
+ /** Set by shutdown(), cleared by restart(); polled by the producer loops. */
+ private boolean shutdown=false;
+
+ /** Holds the 'empty' and 'full' list queues; replaced on restart(). */
+ private ConcurrentDepot<Read> depot;
+
+ /** Assigned in run() to the thread executing it; null before run() is entered. */
+ private Thread[] threads;
+
+ /** Counters returned by basesIn()/readsIn(); only reset to 0 in this class. */
+ private long basesIn=0;
+ private long readsIn=0;
+
+ /** Number assigned to the next list returned by nextList(). */
+ private long listnum=0;
+
+ /** This should be set in the first broadcast */
+ private final boolean paired;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/stream/ConcurrentReadListDepot.java b/current/stream/ConcurrentReadListDepot.java
new file mode 100755
index 0000000..af84b8a
--- /dev/null
+++ b/current/stream/ConcurrentReadListDepot.java
@@ -0,0 +1,35 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.concurrent.ArrayBlockingQueue;
+
+public class ConcurrentReadListDepot<K> {
+
+
+
+ public ConcurrentReadListDepot(int bufSize, int numBufs){
+ bufferSize=bufSize;
+ bufferCount=numBufs;
+
+ lists=new ArrayList[numBufs];
+ empty=new ArrayBlockingQueue<ArrayList<K>>(numBufs+1);
+ full=new ArrayBlockingQueue<ArrayList<K>>(numBufs+1);
+
+ for(int i=0; i<lists.length; i++){
+ lists[i]=new ArrayList<K>(bufSize);
+ empty.add(lists[i]);
+ }
+
+ }
+
+
+ public final ArrayBlockingQueue<ArrayList<K>> empty;
+ public final ArrayBlockingQueue<ArrayList<K>> full;
+
+ public final int bufferSize;
+ public final int bufferCount;
+
+
+ private final ArrayList<K>[] lists;
+
+}
diff --git a/current/stream/ConcurrentReadOutputStream.java b/current/stream/ConcurrentReadOutputStream.java
new file mode 100755
index 0000000..bfff821
--- /dev/null
+++ b/current/stream/ConcurrentReadOutputStream.java
@@ -0,0 +1,121 @@
+package stream;
+
+import java.util.ArrayList;
+
+import stream.mpi.ConcurrentReadOutputStreamMPI;
+
+import align2.Shared;
+
+import fileIO.FileFormat;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 26, 2015
+ *
+ */
+public abstract class ConcurrentReadOutputStream {
+
+    /*--------------------------------------------------------------*/
+    /*----------------           Factory            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Single-file factory; MPI settings are taken from Shared. */
+    public static ConcurrentReadOutputStream getStream(FileFormat ff1, int maxSize, CharSequence header, boolean useSharedHeader){
+        final FileFormat noSecondFile=null;
+        final String noQual=null;
+        return getStream(ff1, noSecondFile, noQual, noQual, maxSize, header, useSharedHeader, Shared.USE_MPI, Shared.MPI_KEEP_ALL);
+    }
+
+    /** Two-file factory without quality files; MPI settings are taken from Shared. */
+    public static ConcurrentReadOutputStream getStream(FileFormat ff1, FileFormat ff2, int maxSize, CharSequence header, boolean useSharedHeader){
+        final String noQual=null;
+        return getStream(ff1, ff2, noQual, noQual, maxSize, header, useSharedHeader, Shared.USE_MPI, Shared.MPI_KEEP_ALL);
+    }
+
+    /** Two-file factory with quality files; MPI settings are taken from Shared. */
+    public static ConcurrentReadOutputStream getStream(FileFormat ff1, FileFormat ff2, String qf1, String qf2,
+            int maxSize, CharSequence header, boolean useSharedHeader){
+        final boolean mpi=Shared.USE_MPI;
+        final boolean keepAll=Shared.MPI_KEEP_ALL;
+        return getStream(ff1, ff2, qf1, qf2, maxSize, header, useSharedHeader, mpi, keepAll);
+    }
+
+    /**
+     * Full factory.
+     * Without MPI a generic stream is returned directly.
+     * With MPI only rank 0 builds a generic stream, which is then wrapped in either an MPI or a
+     * 'D' stream depending on Shared.USE_CRISMPI; other ranks wrap null.
+     * The keepAll argument is not used by this method.
+     */
+    public static ConcurrentReadOutputStream getStream(FileFormat ff1, FileFormat ff2, String qf1, String qf2,
+            int maxSize, CharSequence header, boolean useSharedHeader, final boolean mpi, final boolean keepAll){
+        if(!mpi){
+            return new ConcurrentGenericReadOutputStream(ff1, ff2, qf1, qf2, maxSize, header, useSharedHeader);
+        }
+
+        final int rank=Shared.MPI_RANK;
+        final boolean isMaster=(rank==0);
+        final ConcurrentReadOutputStream local=(isMaster ?
+                new ConcurrentGenericReadOutputStream(ff1, ff2, qf1, qf2, maxSize, header, useSharedHeader) : null);
+
+        if(Shared.USE_CRISMPI){
+            return new ConcurrentReadOutputStreamMPI(local, isMaster);
+        }
+        return new ConcurrentReadOutputStreamD(local, isMaster);
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Initialization        ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Stores the formats; ORDERED is true when ff1 is null, otherwise it follows ff1.ordered(). */
+    ConcurrentReadOutputStream(FileFormat ff1_, FileFormat ff2_){
+        this.ff1=ff1_;
+        this.ff2=ff2_;
+        this.ORDERED=(ff1_==null || ff1_.ordered());
+    }
+
+    public abstract void start();
+
+    /** @return Value of the started flag. */
+    public final boolean started(){
+        return started;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Outer Methods         ----------------*/
+    /*--------------------------------------------------------------*/
+
+
+    public abstract void add(ArrayList<Read> list, long listnum);
+
+    public abstract void close();
+
+    public abstract void join();
+
+    public abstract void resetNextListID();
+
+    public abstract String fname();
+
+    /** Return true if this stream has detected an error */
+    public abstract boolean errorState();
+
+    public abstract boolean finishedSuccessfully();
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Inner Methods         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /*--------------------------------------------------------------*/
+    /*----------------           Getters            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    public abstract ReadStreamWriter getRS1();
+    public abstract ReadStreamWriter getRS2();
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Fields            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Output formats supplied to the constructor; either may be null. */
+    public final FileFormat ff1, ff2;
+    /** Fixed at construction from ff1. */
+    public final boolean ORDERED;
+
+    boolean errorState=false;
+    boolean finishedSuccessfully=false;
+    boolean started=false;
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Static Fields         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    public static boolean BYTE_WRITER=true;
+    public static boolean verbose=false;
+
+}
diff --git a/current/stream/ConcurrentReadOutputStreamD.java b/current/stream/ConcurrentReadOutputStreamD.java
new file mode 100755
index 0000000..ff2c6d5
--- /dev/null
+++ b/current/stream/ConcurrentReadOutputStreamD.java
@@ -0,0 +1,323 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import align2.ListNum;
+import align2.Shared;
+
+/**
+ * @author Brian Bushnell
+ * @date Jan 26, 2015
+ *
+ */
+public class ConcurrentReadOutputStreamD extends ConcurrentReadOutputStream{
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * @param cros_ Local destination stream; expected to be non-null exactly when master_ is true.
+  * @param master_ True if this process is the master.
+  */
+ public ConcurrentReadOutputStreamD(ConcurrentReadOutputStream cros_, boolean master_){
+     super(cros_!=null ? cros_.ff1 : null, cros_!=null ? cros_.ff2 : null);
+     this.dest=cros_;
+     this.master=master_;
+     this.rank=Shared.MPI_RANK;
+     this.ranks=Shared.MPI_NUM_RANKS;
+     assert(master_==(cros_!=null));
+ }
+
+ /**
+  * Marks the stream as started. On the master this also resets the terminated counter,
+  * starts the destination stream, and launches the listener threads.
+  * @throws RuntimeException if the stream was already started.
+  */
+ @Override
+ public synchronized void start(){
+     if(started){
+         System.err.println("Resetting output stream.");
+         throw new RuntimeException();
+     }
+
+     started=true;
+     if(!master){return;}
+
+     terminatedCount.set(0);
+     dest.start();
+     startThreads();
+ }
+
+ private void startThreads(){
+ assert(master);
+ for(int i=0; i<ranks; i++){
+ if(i!=rank){
+ ListenThread lt=new ListenThread(i);
+ lt.start();
+ }
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Adds a list to the output: directly to the destination on the master,
+  * or by unicasting it to rank 0 on a slave.
+  */
+ @Override
+ public synchronized void add(ArrayList<Read> list, long listnum){
+     if(!master){
+         unicast(list, listnum, 0);
+         return;
+     }
+     dest.add(list, listnum);
+ }
+
+ /**
+  * On a slave, unicasts a terminator list (id -1) to rank 0.
+  * On the master, counts itself as terminated, waits until the terminated count reaches
+  * the number of ranks, then closes the destination stream.
+  */
+ @Override
+ public void close(){
+     if(!master){
+         unicast(new ListNum<Read>(new ArrayList<Read>(1), -1), 0);
+         return;
+     }
+
+     int finished=terminatedCount.incrementAndGet();
+     while(finished<ranks){
+         synchronized(terminatedCount){
+             finished=terminatedCount.intValue();
+             if(finished<ranks){
+                 try {
+                     //Timed wait, so the count is re-read at least once per second.
+                     terminatedCount.wait(1000);
+                 } catch (InterruptedException e) {
+                     e.printStackTrace();
+                 }
+             }
+         }
+     }
+     dest.close();
+ }
+
+ /**
+  * Master: joins the destination stream, then announces the join.
+  * Slave: waits for that announcement and asserts it was true.
+  */
+ @Override
+ public void join(){
+     if(!master){
+         final boolean joined=listenForJoin();
+         assert(joined);
+         return;
+     }
+     dest.join();
+     broadcastJoin(true);
+ }
+
+ /**
+  * Master only: resets the destination's next list ID, the terminated counter, and the
+  * finishedSuccessfully flag. Does nothing on a slave.
+  */
+ @Override
+ public synchronized void resetNextListID(){
+     if(!master){return;}
+     dest.resetNextListID();
+     terminatedCount.set(0);
+     finishedSuccessfully=false;
+ }
+
+ /**
+  * @return Name of the primary output format, or null when there is none.
+  * ff1 is null on slaves, because the constructor passes null formats to the superclass
+  * when no destination stream is supplied, so it must not be dereferenced unconditionally.
+  */
+ @Override
+ public String fname(){
+     return (ff1==null ? null : ff1.name());
+ }
+
+ /**
+  * @return True if this stream's error flag is set, or (master only) the destination reports an error.
+  */
+ @Override
+ public boolean errorState(){
+     return errorState || (master && dest.errorState());
+ }
+
+ /**
+  * Returns the cached success flag when it is already true. Otherwise, under this object's lock,
+  * the master queries the destination and announces the result, while a slave listens for it.
+  * @return True if the stream finished successfully.
+  */
+ @Override
+ public boolean finishedSuccessfully(){
+     if(finishedSuccessfully){return true;}
+
+     synchronized(this){
+         if(!finishedSuccessfully){
+             if(master){
+                 finishedSuccessfully=dest.finishedSuccessfully();
+                 broadcastFinishedSuccessfully(finishedSuccessfully);
+             }else{
+                 finishedSuccessfully=listenFinishedSuccessfully();
+             }
+             return finishedSuccessfully;
+         }
+         return true;
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private void unicast(ArrayList<Read> list, long listnum, int i) {
+ unicast(new ListNum<Read>(list, listnum), i);
+ }
+
+ /**
+  * Sends a numbered list to a single rank. Asserts that the caller is not the master.
+  * Unimplemented placeholder: the send loop does no communication and the method always throws.
+  * @param ln Numbered list to send.
+  * @param i Destination rank.
+  */
+ protected void unicast(ListNum<Read> ln, int i) {
+ if(verbose){System.err.println("crosD "+(master?"master":"slave ")+": Unicasting reads to "+i+".");}
+ assert(!master);
+
+ boolean success=false;
+ while(!success){
+ try {
+ //Do some MPI stuff
+ success=true;
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ //Reached unconditionally once the loop exits.
+ throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Placeholder for receiving a list from rank i (master only); always throws.
+  * @param i Source rank.
+  * @return Never returns normally in this implementation.
+  */
+ protected ListNum<Read> listen(int i){
+     if(verbose){System.err.println("crosD "+(master?"master":"slave ")+": Listening for reads from "+i+".");}
+     assert(master);
+
+     boolean done=false;
+     do{
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }while(!done);
+     throw new RuntimeException("TODO");
+ }
+
+
+ /**
+  * Slave side of the finishedSuccessfully exchange with the master.
+  * Placeholder; always throws.
+  * @return Never returns normally in this implementation.
+  */
+ protected boolean listenFinishedSuccessfully() {
+     if(verbose){System.err.println("crosD "+(master?"master":"slave ")+": listenFinishedSuccessfully.");}
+     assert(!master);
+
+     boolean done=false;
+     do{
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }while(!done);
+     throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Master side of the finishedSuccessfully exchange with the slaves.
+  * Placeholder; always throws.
+  * @param b Status to report.
+  */
+ protected void broadcastFinishedSuccessfully(boolean b) {
+     if(verbose){System.err.println("crosD "+(master?"master":"slave ")+": broadcastFinishedSuccessfully.");}
+     assert(master);
+
+     boolean done=false;
+     do{
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }while(!done);
+     throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Master announces to the slaves that 'join' completed.
+  * Placeholder; always throws.
+  * @param b Join status to report.
+  */
+ protected void broadcastJoin(boolean b) {
+     if(verbose){System.err.println("crosD "+(master?"master":"slave ")+": broadcastJoin.");}
+     assert(master);
+
+     boolean done=false;
+     do{
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }while(!done);
+     throw new RuntimeException("TODO");
+ }
+
+ /**
+  * Slave waits for the master's 'join' announcement.
+  * Placeholder; always throws.
+  * @return Never returns normally in this implementation.
+  */
+ protected boolean listenForJoin() {
+     if(verbose){System.err.println("crosD "+(master?"master":"slave ")+": listenForJoin.");}
+     assert(!master);
+
+     boolean done=false;
+     do{
+         try {
+             //Do some MPI stuff
+             done=true;
+         } catch (Exception e) {
+             e.printStackTrace();
+         }
+     }while(!done);
+     throw new RuntimeException("TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Master-side thread that forwards every list received from one source rank to the
+  * destination stream until a null list or a negative list id arrives.
+  */
+ private class ListenThread extends Thread{
+
+     /** @param sourceNum_ Rank to listen to; must differ from this rank and lie in [0, ranks). */
+     ListenThread(int sourceNum_){
+         sourceNum=sourceNum_;
+         assert(sourceNum_!=rank);
+         assert(sourceNum>=0 && sourceNum<ranks);
+     }
+
+     public void run(){
+         assert(master);
+         for(ListNum<Read> ln=listen(sourceNum); ln!=null && ln.id>=0; ln=listen(sourceNum)){
+             dest.add(ln.list, ln.id);
+         }
+         //Count this source as finished; the thread that brings the count up to 'ranks' wakes a waiter in close().
+         final int finished=terminatedCount.incrementAndGet();
+         if(finished<ranks){return;}
+         synchronized(terminatedCount){
+             terminatedCount.notify();
+         }
+     }
+
+     final int sourceNum;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Getters ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public ReadStreamWriter getRS1(){return master ? dest.getRS1() : null;}
+ public ReadStreamWriter getRS2(){return master ? dest.getRS2() : null;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected final AtomicInteger terminatedCount=new AtomicInteger(0);
+ protected final ConcurrentReadOutputStreamD thisPointer=this;
+
+ /** Wrapped destination of reads. Null for slaves. */
+ protected ConcurrentReadOutputStream dest;
+ protected final boolean master;
+ protected final int rank, ranks;
+
+ protected boolean finishedSuccessfully=false;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/stream/ConcurrentReadStreamInterface.java b/current/stream/ConcurrentReadStreamInterface.java
new file mode 100755
index 0000000..407d645
--- /dev/null
+++ b/current/stream/ConcurrentReadStreamInterface.java
@@ -0,0 +1,65 @@
+package stream;
+
+import align2.ListNum;
+
+/**
+ * Contract for a concurrent producer of read lists.
+ * Interface methods are implicitly public, so the modifier is omitted.
+ */
+public interface ConcurrentReadStreamInterface extends Runnable{
+
+    /** Start this in a new thread. */
+    void start();
+
+    /**
+     * Fetch the next list of reads.
+     * @return The next list; an empty list signals that end of input was reached.
+     */
+    ListNum<Read> nextList();
+
+    /**
+     * Every list obtained from nextList() MUST be handed back through this method once processed.
+     * Normally 'poison' is false. A consumer that receives an empty list should return it with
+     * 'poison' set to true and then terminate; this puts a list back on the 'full' queue so that
+     * another thread can pull the empty list and terminate as well.
+     * @param listNumber Number of the list being returned.
+     * @param poison True only when returning the terminating empty list.
+     */
+    void returnList(long listNumber, boolean poison);
+
+    /** This must be called (indirectly, via Thread.start()) before reads will be generated. */
+    void run();
+
+    /** Indicate to producer threads that no more reads are desired, and interrupts them. */
+    void shutdown();
+
+    /**
+     * Reset state to allow production of reads from the beginning of the input files.
+     * Does not work with stdin (may cause strange behavior).
+     */
+    void restart();
+
+    /** Calls shutdown, then shuts down all threads and closes all associated files. */
+    void close();
+
+    /** @return True for a paired-end stream, false for a single-end stream. */
+    boolean paired();
+
+    /**
+     * Optional method for things such as error messages.
+     * @return The underlying read object producer(s), such as ReadInputStreams.
+     */
+    Object[] producers();
+
+    /** @return True if this stream or its producers have detected an error. */
+    boolean errorState();
+
+    /**
+     * Set the read sampling rate. Optional method.
+     * @param rate Fraction of reads to use, 0-1.
+     * @param seed Random number generator seed when positive. If negative, a random seed will be used.
+     */
+    void setSampleRate(float rate, long seed);
+
+    /** @return Number of bases read by this stream. */
+    long basesIn();
+
+    /** @return Number of reads read by this stream. */
+    long readsIn();
+
+    /** @return Value of verbose field. */
+    boolean verbose();
+
+//    public String classname(); //e.g. getClass().getName()
+
+}
diff --git a/current/stream/CrisWrapper.java b/current/stream/CrisWrapper.java
new file mode 100755
index 0000000..a3692e7
--- /dev/null
+++ b/current/stream/CrisWrapper.java
@@ -0,0 +1,96 @@
+package stream;
+
+import java.util.ArrayList;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+import align2.ListNum;
+
+/**
+ * Wraps a cris to allow single-read next() capability, and the ability to go back.
+ * @author Brian Bushnell
+ * @date Jul 18, 2014
+ *
+ */
+public class CrisWrapper {
+
+    /** Convenience constructor without quality files. */
+    public CrisWrapper(long maxReads, boolean keepSamHeader, FileFormat ff1, FileFormat ff2){
+        this(maxReads, keepSamHeader, ff1, ff2, (String)null, (String)null);
+    }
+
+    /**
+     * Creates and starts a stream over the given files.
+     * NOTE(review): keepSamHeader is not forwarded; ff1.samOrBam() is passed to the factory instead - confirm this is intended.
+     */
+    public CrisWrapper(long maxReads, boolean keepSamHeader, FileFormat ff1, FileFormat ff2, String qf1, String qf2){
+        this(ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2, qf1, qf2), true);
+    }
+
+
+    /**
+     * @param cris_ Stream to wrap.
+     * @param start If true, the stream is started before the first list is fetched.
+     */
+    public CrisWrapper(ConcurrentReadInputStream cris_, boolean start){
+        initialize(cris_, start);
+    }
+
+    /**
+     * Attaches to a stream and fetches its first list.
+     * If the stream yields nothing (null ListNum, null list, or empty list) it is closed immediately.
+     */
+    public void initialize(ConcurrentReadInputStream cris_, boolean start){
+        cris=cris_;
+        if(start){cris.start();}
+        ln=cris.nextList();
+        reads=(ln==null ? null : ln.list);
+        if(reads==null || reads.size()==0){
+            reads=null;
+            finish();
+        }
+        index=0;
+    }
+
+    /**
+     * @return The next read, or null once the input is exhausted.
+     */
+    public Read next(){
+        if(reads==null){return null;}
+
+        if(index>=reads.size()){
+            index=0;
+            if(reads.size()==0){
+                //An empty list marks end of input.
+                reads=null;
+                finish();
+                return null;
+            }
+            cris.returnList(ln.id, false);
+            ln=cris.nextList();
+            reads=(ln!=null ? ln.list : null);
+            if(reads==null){
+                finish();
+                return null;
+            }
+        }
+
+        if(index<reads.size()){
+            final Read r=reads.get(index);
+            index++;
+            return r;
+        }
+        //The freshly fetched list was empty; recurse so the end-of-input branch handles it.
+        return next();
+    }
+
+    /** Steps back one read within the current list, so the next call to next() returns it again. */
+    public void goBack(){
+        assert(index>0);
+        index--;
+    }
+
+    /**
+     * Returns the current list as poison (when there is one) and closes the stream.
+     * The null check matters: nextList() may yield null, and ln.id must not be read in that case.
+     */
+    private void finish(){
+        if(ln!=null){cris.returnList(ln.id, true);}
+        errorState|=ReadWrite.closeStream(cris);
+    }
+
+    private ListNum<Read> ln;
+    private ArrayList<Read> reads;
+    private int index;
+    public ConcurrentReadInputStream cris;
+    public boolean errorState=false;
+
+}
diff --git a/current/stream/DualCris.java b/current/stream/DualCris.java
new file mode 100755
index 0000000..6e36236
--- /dev/null
+++ b/current/stream/DualCris.java
@@ -0,0 +1,223 @@
+package stream;
+
+import java.util.ArrayList;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Wraps two read streams, one per input file, and serves them as a single stream.
+ * Reads from the second stream get pair number 1 and are mated, by list position, with reads from the first.
+ * Lists are handed back with returnList(long, boolean, boolean); the inherited two-argument form throws.
+ * @author Brian Bushnell
+ * @date Apr 3, 2015
+ *
+ */
+public class DualCris extends ConcurrentReadInputStream {
+
+    /**
+     * Test driver: streams one or two files and prints the id of every list to stderr.
+     * @param args args[0] is the first file; args[1], if present, is the second file.
+     */
+    public static void main(String[] args){
+        String a=args[0];
+        String b=args.length>1 ? args[1] : null;
+        FileFormat ff1=FileFormat.testInput(a, null, false);
+        FileFormat ff2=(b==null ? null : FileFormat.testInput(b, null, false));
+        DualCris cris=getReadInputStream(-1, false, ff1, ff2, null, null);
+        cris.start();
+
+        ListNum<Read> ln=cris.nextList();
+        //nextList() returns null when neither stream produced a list.
+        ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+        boolean foundR1=false, foundR2=false;
+        while(reads!=null && reads.size()>0){
+            for(Read r1 : reads){
+                Read r2=r1.mate;
+                if(r1.pairnum()==0){foundR1=true;}
+                else{foundR2=true;}
+                if(r2!=null){
+                    if(r2.pairnum()==0){foundR1=true;}
+                    else{foundR2=true;}
+                }
+            }
+
+            System.err.print(ln.id);
+
+            cris.returnList(ln.id, foundR1, foundR2);
+            foundR1=foundR2=false;
+            ln=cris.nextList();
+            reads=(ln!=null ? ln.list : null);
+            System.err.print(",");
+        }
+        System.err.print("Finished.");
+        //The loop also ends when ln itself is null; there is then no list to hand back.
+        if(ln!=null){cris.returnList(ln.id, foundR1, foundR2);}
+        ReadWrite.closeStreams(cris);
+    }
+
+    /**
+     * Creates one single-file stream per non-null format and wraps both.
+     * @param maxReads Passed to each underlying stream.
+     * @param keepSamHeader Passed to each underlying stream.
+     * @param ff1 Format of the first file, or null.
+     * @param ff2 Format of the second file, or null.
+     * @param qf1 Quality file name for the first stream, or null.
+     * @param qf2 Quality file name for the second stream, or null.
+     * @return A DualCris over the two streams; it has not been started.
+     */
+    public static DualCris getReadInputStream(long maxReads, boolean keepSamHeader,
+            FileFormat ff1, FileFormat ff2, String qf1, String qf2){
+        ConcurrentReadInputStream cris1=(ff1==null ? null : ConcurrentReadInputStream.getReadInputStream(maxReads, keepSamHeader, ff1, null, qf1, null));
+        ConcurrentReadInputStream cris2=(ff2==null ? null : ConcurrentReadInputStream.getReadInputStream(maxReads, keepSamHeader, ff2, null, qf2, null));
+        return new DualCris(cris1, cris2);
+    }
+
+    /** Wraps the two given streams; either may be null. */
+    public DualCris(ConcurrentReadInputStream cris1_, ConcurrentReadInputStream cris2_){
+        cris1=cris1_;
+        cris2=cris2_;
+    }
+
+    /** Stream supplying the first reads; may be null. */
+    private final ConcurrentReadInputStream cris1;
+    /** Stream supplying the second reads; may be null. */
+    private final ConcurrentReadInputStream cris2;
+    /** True while the corresponding stream is started and has not yet returned null or been poisoned. */
+    private boolean cris1Active, cris2Active;
+    private boolean errorState=false;
+    private boolean verbose=false;
+
+    /**
+     * Fetches one list from each active stream and merges them.
+     * Reads at the same position become mates; surplus reads from the second list are appended to the first.
+     * @return The merged list, the only list obtained, or null if neither stream produced one.
+     */
+    @Override
+    public ListNum<Read> nextList() {
+
+        ListNum<Read> ln1=null, ln2=null;
+        if(cris1Active && cris1!=null){
+            ln1=cris1.nextList();
+            if(ln1==null){
+                synchronized(this){
+                    cris1Active=false;
+                    System.err.println("\nSet cris1Active="+cris1Active);
+                }
+            }
+        }
+        if(cris2Active && cris2!=null){
+            ln2=cris2.nextList();
+            if(ln2!=null){
+                //Everything from the second stream is read 2 of a pair.
+                for(Read r : ln2.list){r.setPairnum(1);}
+            }else{
+                synchronized(this){
+                    cris2Active=false;
+                    System.err.println("\nSet cris2Active="+cris2Active);
+                }
+            }
+        }
+
+        if(ln1!=null && ln2!=null){
+            final int size1=ln1.size(), size2=ln2.size();
+            final int min=Tools.min(size1, size2);
+            for(int i=0; i<min; i++){
+                Read r1=ln1.get(i);
+                Read r2=ln2.get(i);
+                r1.mate=r2;
+                r2.mate=r1;
+            }
+            if(size2>size1){
+                //Unmatched reads from the longer second list are kept as singletons.
+                for(int i=size1; i<size2; i++){
+                    ln1.add(ln2.get(i));
+                }
+            }
+        }else if(ln2!=null){
+            ln1=ln2;
+        }
+
+        return ln1;
+    }
+
+    /** Unsupported; use returnList(long, boolean, boolean). */
+    @Override
+    public void returnList(long listNum, boolean poison) {
+        throw new RuntimeException("Unsupported.");
+    }
+
+    /**
+     * Hands a list back to each active stream.
+     * A stream for which no read was found is sent the poison flag and marked inactive.
+     * @param listNum Id of the list being returned.
+     * @param foundR1 True if the list held at least one read with pair number 0.
+     * @param foundR2 True if the list held at least one read with a nonzero pair number.
+     */
+    public void returnList(long listNum, boolean foundR1, boolean foundR2) {
+        if(cris1!=null && cris1Active){
+            cris1.returnList(listNum, !foundR1);
+            if(!foundR1){cris1Active=false;}
+        }
+        if(cris2!=null && cris2Active){
+            cris2.returnList(listNum, !foundR2);
+            if(!foundR2){cris2Active=false;}
+        }
+    }
+
+    /** Starts both underlying streams and marks them active. */
+    @Override
+    public void start() {
+        started=true;
+        if(cris1!=null){
+            cris1.start();
+            cris1Active=true;
+        }
+        if(cris2!=null){
+            cris2.start();
+            cris2Active=true;
+        }
+    }
+
+    /** Not meant to be run as a thread; the underlying streams do the work. */
+    @Override
+    public void run() {assert(false);}
+
+    /** Shuts down both underlying streams and marks them inactive. */
+    @Override
+    public void shutdown() {
+        if(cris1!=null){cris1.shutdown();}
+        if(cris2!=null){cris2.shutdown();}
+        cris1Active=cris2Active=false;
+    }
+
+    /** Restarts both underlying streams and marks them active. */
+    @Override
+    public void restart() {
+        if(cris1!=null){
+            cris1.restart();
+            cris1Active=true;
+        }
+        if(cris2!=null){
+            cris2.restart();
+            cris2Active=true;
+        }
+    }
+
+    /** Closes both underlying streams and marks them inactive. */
+    @Override
+    public void close() {
+        if(cris1!=null){cris1.close();}
+        if(cris2!=null){cris2.close();}
+        cris1Active=cris2Active=false;
+    }
+
+    /** @return True if a second stream exists; otherwise whatever the first stream reports. */
+    @Override
+    public boolean paired() {
+        assert(cris1!=null || cris2!=null);
+        if(cris2!=null){return true;}
+        if(cris1!=null){return cris1.paired();}
+        return false;
+    }
+
+    /** @return The producers of both underlying streams, first stream first. */
+    @Override
+    public Object[] producers() {
+        ArrayList<Object> list=new ArrayList<Object>();
+        if(cris1!=null){
+            for(Object o : cris1.producers()){list.add(o);}
+        }
+        if(cris2!=null){
+            for(Object o : cris2.producers()){list.add(o);}
+        }
+        return list.toArray();
+    }
+
+    /** @return True if this wrapper or either underlying stream has reported an error; the result is latched. */
+    @Override
+    public boolean errorState() {
+        if(cris1!=null){errorState|=cris1.errorState();}
+        if(cris2!=null){errorState|=cris2.errorState();}
+        return errorState;
+    }
+
+    /** Always throws; not available on this class. */
+    @Override
+    public void setSampleRate(float rate, long seed) {
+        throw new RuntimeException("Invalid.");
+    }
+
+    /** @return Sum of basesIn() over both underlying streams. */
+    @Override
+    public long basesIn() {
+        return (cris1==null ? 0 : cris1.basesIn())+(cris2==null ? 0 : cris2.basesIn());
+    }
+
+    /** @return Sum of readsIn() over both underlying streams. */
+    @Override
+    public long readsIn() {
+        return (cris1==null ? 0 : cris1.readsIn())+(cris2==null ? 0 : cris2.readsIn());
+    }
+
+    @Override
+    public boolean verbose() {
+        return verbose;
+    }
+
+}
diff --git a/current/stream/FASTQ.java b/current/stream/FASTQ.java
new file mode 100755
index 0000000..211b68f
--- /dev/null
+++ b/current/stream/FASTQ.java
@@ -0,0 +1,888 @@
+package stream;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Shared;
+import align2.Tools;
+
+import dna.Data;
+import dna.Gene;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+
+public class FASTQ {
+
+ /**
+  * Renders every read as four FASTQ lines and writes the combined text to a file.
+  * @param reads Reads to write.
+  * @param fname Destination file name.
+  */
+ public static void writeFASTQ(Read[] reads, String fname){
+     final StringBuilder text=new StringBuilder();
+     for(final Read read : reads){
+         for(final String line : toFASTQ(read)){
+             text.append(line).append('\n');
+         }
+     }
+     ReadWrite.writeString(text, fname);
+ }
+
+// public static boolean isInterleaved(String fname){
+// if(!TEST_INTERLEAVED && !FORCE_INTERLEAVED){return false;}
+// assert(tf.is!=System.in && !tf.name.equals("stdin") && !tf.name.startsWith("stdin."));
+// if(tf.is!=System.in && !tf.name.equals("stdin") && !tf.name.startsWith("stdin.")){return FORCE_INTERLEAVED;}
+// String s=null;
+//
+// String[] oct=new String[8];
+// }
+
+// public static boolean isInterleaved_old(String fname){
+//// assert(false) : TEST_INTERLEAVED+", "+FORCE_INTERLEAVED;
+// if(!TEST_INTERLEAVED && !FORCE_INTERLEAVED){
+// testQuality(fname);
+// return false;
+// }
+// assert(!fname.equals("stdin") && !fname.startsWith("stdin."));
+// if(fname.equals("stdin") || fname.startsWith("stdin.")){return FORCE_INTERLEAVED;}
+//// TextFile tf=new TextFile(fname);
+//// assert(tf.is!=System.in);
+//// if(tf.is==System.in){return FORCE_INTERLEAVED;}
+//
+// String[] oct=new String[8];
+// int cntr=0;
+//
+// {
+// InputStream is=ReadWrite.getInputStream(fname, false, false);
+// BufferedReader br=new BufferedReader(new InputStreamReader(is));
+// try {
+// for(String s=br.readLine(); s!=null && cntr<8; s=br.readLine()){
+// oct[cntr]=s;
+// cntr++;
+// }
+// } catch (IOException e) {
+// // TODO Auto-generated catch block
+// e.printStackTrace();
+// }
+// }
+//
+// if(oct[0]==null){return false;}
+//
+// testQuality(oct);
+//
+// if(cntr<8){return false;}
+//// assert(false);
+// assert(oct[0]==null || oct[0].startsWith("@")) : "Does not appear to be a valid FASTQ file:\n"+new String(oct[0]);
+// assert(oct[2]==null || oct[2].startsWith("+")) : "Does not appear to be a valid FASTQ file:\n"+new String(oct[2]);
+// assert(oct[4]==null || oct[4].startsWith("@")) : "Does not appear to be a valid FASTQ file:\n"+new String(oct[4]);
+// assert(oct[6]==null || oct[6].startsWith("+")) : "Does not appear to be a valid FASTQ file:\n"+new String(oct[6]);
+//
+// if(FORCE_INTERLEAVED){return true;}
+// if(PARSE_CUSTOM && fname.contains("_interleaved.f")){
+// return true;
+// }
+//
+// return testPairNames(oct[0], oct[4]);
+// }
+
+ private static String[] getFirstTwoFastaHeaders(String fname){
+ if(fname==null){return null;}
+ if(fname.equalsIgnoreCase("stdin") || fname.toLowerCase().startsWith("stdin.")){return null;}
+
+ String[] headers=new String[2];
+ int cntr=0;
+
+ {
+ InputStream is=ReadWrite.getInputStream(fname, false, false);
+ BufferedReader br=new BufferedReader(new InputStreamReader(is));
+ try {
+ for(String s=br.readLine(); s!=null && cntr<2; s=br.readLine()){
+ if(s.startsWith(">")){
+ headers[cntr]=s;
+ cntr++;
+ }
+ }
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ return headers;
+ }
+
+ /**
+  * Runs quality-offset detection on the first lines of a file.
+  * @param fname File to examine.
+  * @return The current ASCII_OFFSET if fname is null, names stdin, or detection is off;
+  * otherwise the result of testQuality on the file's first octet.
+  */
+ public static byte testQuality(String fname){
+     final boolean skip=(fname==null || !DETECT_QUALITY
+             || fname.equalsIgnoreCase("stdin") || fname.toLowerCase().startsWith("stdin."));
+     if(skip){return ASCII_OFFSET;}
+     return testQuality(fileIO.FileFormat.getFirstOctet(fname));
+ }
+
+ /**
+  * Decides whether a FASTQ file is interleaved, running quality detection on the way if enabled.
+  * @param fname File to examine.
+  * @param allowIdentical Passed through to the name comparison.
+  * @return FORCE_INTERLEAVED when no test is performed, otherwise the result of testInterleaved.
+  */
+ public static boolean isInterleaved(final String fname, final boolean allowIdentical){
+     if(DETECT_QUALITY || TEST_INTERLEAVED){
+         final ArrayList<String> octet=fileIO.FileFormat.getFirstOctet(fname);
+         if(octet!=null){
+             if(DETECT_QUALITY){testQuality(octet);}
+             if(TEST_INTERLEAVED){return testInterleaved(octet, fname, allowIdentical);}
+         }
+     }
+     return FORCE_INTERLEAVED;
+ }
+
+ /**
+  * Tests whether the first two records of a FASTQ file look like a pair.
+  * @param oct First eight lines of the file.
+  * @param fname File name, used in messages and for the "_interleaved." check.
+  * @param allowIdentical Passed through to testPairNames.
+  * @return False if fewer than eight lines are present or any line is null; true if interleaving is
+  * forced or implied by the file name; otherwise the result of comparing the two headers.
+  */
+ public static boolean testInterleaved(ArrayList<String> oct, String fname, boolean allowIdentical){
+     if(oct==null || oct.size()<8 || oct.contains(null)){return false;}
+
+     //Lines 0 and 4 are headers, lines 2 and 6 are separators.
+     final String[] marks={"@", "+", "@", "+"};
+     for(int i=0; i<marks.length; i++){
+         final String line=oct.get(2*i);
+         assert(line.startsWith(marks[i])) : "File "+fname+"\ndoes not appear to be a valid FASTQ file:\n"+line;
+     }
+
+     if(FORCE_INTERLEAVED || (PARSE_CUSTOM && fname.contains("_interleaved."))){return true;}
+
+     return testPairNames(oct.get(0), oct.get(4), allowIdentical);
+ }
+
+ /**
+  * Tests whether the first two FASTA headers of a file look like a pair.
+  * @param fname File to examine.
+  * @param allowIdentical Passed through to the name comparison.
+  */
+ public static boolean testInterleavedFasta(String fname, boolean allowIdentical){
+     return testInterleavedFasta(getFirstTwoFastaHeaders(fname), fname, allowIdentical);
+ }
+
+ /**
+  * Tests whether two FASTA headers look like a pair.
+  * @param headers The first two header lines of the file.
+  * @param fname File name, used in messages and for the "_interleaved." check.
+  * @param allowIdentical Passed through to testPairNames.
+  * @return False if fewer than two headers are present or any is null; true if interleaving is
+  * forced or implied by the file name; otherwise the result of comparing the two headers.
+  */
+ private static boolean testInterleavedFasta(String[] headers, String fname, boolean allowIdentical){
+     if(headers==null || headers.length<2){return false;}
+     for(String header : headers){
+         if(header==null){return false;}
+     }
+
+     assert(headers[0].startsWith(">")) : "File "+fname+"\ndoes not appear to be a valid FASTA file:\n"+headers[0];
+     //Report the header that actually failed (the original printed headers[0] here).
+     assert(headers[1].startsWith(">")) : "File "+fname+"\ndoes not appear to be a valid FASTA file:\n"+headers[1];
+
+     if(FORCE_INTERLEAVED){return true;}
+     if(PARSE_CUSTOM && fname.contains("_interleaved.")){return true;}
+
+     return testPairNames(headers[0], headers[1], allowIdentical);
+ }
+
+ /**
+  * Examines the quality lines of up to two FASTQ records and adjusts ASCII_OFFSET if they do not
+  * fit the current offset. May also change ASCII_OFFSET_OUT and DETECT_QUALITY.
+  * @param oct First lines of a FASTQ file: header, bases, separator, qualities, repeated.
+  * @return The value of ASCII_OFFSET after inspection.
+  */
+ public static byte testQuality(ArrayList<String> oct){
+     if(oct==null || oct.size()<4 || oct.get(0)==null){return ASCII_OFFSET;}
+     if(verbose){System.err.println("testQuality()");}
+     int qflips=0;
+     for(int k=0; k<2; k++){
+         final int a=1+4*k, b=3+4*k;
+         //b is an index, so the list must hold more than b entries.
+         if(oct.size()<=b || oct.get(a)==null || oct.get(b)==null){break;}
+         byte[] bases=oct.get(a).getBytes();
+         byte[] quals=oct.get(b).getBytes();
+         if(verbose){System.err.println(Arrays.toString(quals));}
+
+         //A sufficiently long read settles the question in favor of offset 33.
+         if(DETECT_QUALITY && bases.length>=MIN_LENGTH_TO_FORCE_ASCII_33){
+             if(ASCII_OFFSET==33){
+                 //do nothing
+             }else{
+                 if(warnQualityChange){System.err.println("Changed from ASCII-64 to ASCII-33 due to read of length "+bases.length+" while prescanning.");}
+                 ASCII_OFFSET=33;
+             }
+             DETECT_QUALITY=false;
+         }
+
+         for(int i=0; i<quals.length; i++){
+             quals[i]-=ASCII_OFFSET; //Convert from ASCII33 to native.
+             if(verbose){System.err.println(quals[i]);}
+             if(DETECT_QUALITY){
+                 //The quality line may be longer than the base line in a malformed record; guard the lookup.
+                 //NOTE(review): quals[i] has already had the offset subtracted, and 'B' (66) exceeds
+                 //QUAL_THRESH (54), so the second clause never adds anything - confirm the intent.
+                 if(ASCII_OFFSET==33 && (quals[i]>QUAL_THRESH || (i<bases.length && bases[i]=='N' && quals[i]=='B'))){
+                     if(warnQualityChange && qflips<4){System.err.println("Changed from ASCII-33 to ASCII-64 on input quality "+(quals[i]+ASCII_OFFSET)+" while prescanning.");}
+                     qflips++;
+                     ASCII_OFFSET=64;
+                     if(DETECT_QUALITY_OUT){ASCII_OFFSET_OUT=64;}
+                     //Re-base everything decoded so far in this line.
+                     for(int j=0; j<=i; j++){
+                         quals[j]=(byte)(quals[j]-31);
+                     }
+                 }else if(ASCII_OFFSET==64 && (quals[i]<-5)){
+                     if(warnQualityChange && qflips<4){System.err.println("Changed from ASCII-64 to ASCII-33 on input quality "+(quals[i]+ASCII_OFFSET)+" while prescanning.");}
+                     ASCII_OFFSET=33;
+                     if(DETECT_QUALITY_OUT){ASCII_OFFSET_OUT=33;}
+                     qflips++;
+                     for(int j=0; j<=i; j++){
+                         quals[j]=(byte)(quals[j]+31);
+                     }
+                 }
+             }
+             //Show the header and quality line of the record being examined (indices a-1 and b).
+             assert(quals[i]>=-5 || IGNORE_BAD_QUALITY) : "ASCII encoding for quality (currently ASCII-"+ASCII_OFFSET+") appears to be wrong.\n"
+                 +oct.get(a-1)+"\n"+oct.get(b)+"\n"+Arrays.toString(oct.get(b).getBytes());
+             assert(qflips<2 || IGNORE_BAD_QUALITY) : "Failed to auto-detect quality coding; quitting.  Please manually set qin=33 or qin=64.";
+         }
+     }
+
+     return ASCII_OFFSET;
+ }
+
+ /** Compares the ids of two reads with testPairNames; false if either read is null. */
+ public static boolean testPairNames(Read r1, Read r2, boolean allowIdentical){
+     return (r1!=null && r2!=null) && testPairNames(r1.id, r2.id, allowIdentical);
+ }
+
+ public static boolean testPairNames(String id1, String id2, boolean allowIdentical){
+
+ if(id1==null || id2==null){return false;}
+
+ final int idxSlash1=id1.lastIndexOf('/');
+ final int idxSlash2=id2.lastIndexOf('/');
+ final int idxSpace1=id1.indexOf(' ');
+ final int idxSpace2=id2.indexOf(' ');
+
+ if(allowIdentical && idxSlash1<0 && idxSpace1<0){
+ return id1.equals(id2);
+ }
+
+ // System.out.println("idxSlash1="+idxSlash1+", idxSlash2="+idxSlash2+", idxSpace1="+idxSpace1+", idxSpace2="+idxSpace2);
+ if(idxSlash1==idxSlash2 && idxSlash1>1){
+ // System.out.println("A");
+ String[] split1=id1.split("/");
+ String[] split2=id2.split("/");
+ // System.out.println(Arrays.toString(split1));
+ // System.out.println(Arrays.toString(split2));
+
+ if(split1.length>1 && split2.length>1 && split1[0].equals(split2[0])){
+ // System.out.println("B");
+ if(split1[split1.length-1].contains(" ")){
+ split1[split1.length-1]=split1[split1.length-1].split(" ")[0];
+ // System.out.println("B1: "+Arrays.toString(split1));
+ }
+ if(split2[split2.length-1].contains(" ")){
+ split2[split2.length-1]=split2[split2.length-1].split(" ")[0];
+ // System.out.println("B2: "+Arrays.toString(split2));
+ }
+ if(split1[split1.length-1].equals("1") && split2[split2.length-1].equals("2")){
+ // System.out.println("B3");
+ return true;
+ }
+ }
+ }
+
+ if(idxSpace1==idxSpace2 && idxSpace1>=0){
+ // System.out.println("C");
+ if(idxSpace1==idxSpace2 && idxSpace1>1){
+ // System.out.println("D");
+ String[] split1=id1.split(" ");
+ String[] split2=id2.split(" ");
+ // System.out.println(Arrays.toString(split1));
+ // System.out.println(Arrays.toString(split2));
+
+ if(split1.length>1 && split2.length>1 && split1[0].equals(split2[0])){
+ // System.out.println("E");
+ if(split1[1].startsWith("1:") && split2[1].startsWith("2:")){return true;}
+ }
+ }
+ }
+ return false;
+ }
+
+ /** Renders a read as four FASTQ lines, named by customID(r) or, if that is null, by its numeric id. */
+ public static String[] toFASTQ(Read r){
+     final String custom=customID(r);
+     final String name=(custom!=null ? custom : ""+r.numericID);
+     return toFASTQ(r.bases, r.quality, name);
+ }
+
+ /**
+  * Builds the name to write for a read.
+  * With TAG_CUSTOM off this is simply r.id. Otherwise the name is the id (or numeric id if the id is null),
+  * followed by location fields when the read has chrom and stop above -1, followed by " /1" or " /2"
+  * when ADD_PAIRNUM_TO_CUSTOM_ID is set.
+  * Side effect: with DELETE_OLD_NAME set (and assertions disabled) r.id is cleared.
+  * @param r Read to name.
+  * @return The name; may be null only when TAG_CUSTOM is off and r.id is null.
+  */
+ public static String customID(Read r){
+ if(!TAG_CUSTOM){return r.id;}
+
+ if(DELETE_OLD_NAME){
+ assert(false) : "Seems odd so I added this assertion. I don't see anywhere it was being used. Use -da flag to override.";
+ r.id=null;
+ }
+
+ ByteBuilder sb=new ByteBuilder();
+ if(r.id==null /*|| DELETE_OLD_NAME*/){
+ sb.append(r.numericID);
+ }else{
+ sb.append(r.id);
+ }
+ if(r.chrom>-1 && r.stop>-1){
+ // Simple form: "_+" or "_-". Full form: "_chr<chrom>_<strand>_<start>_<stop>".
+ if(TAG_CUSTOM_SIMPLE){
+ sb.append('_');
+ sb.append(r.strand()==0 ? '+' : '-');
+ }else{
+ sb.append("_chr");
+ sb.append(r.chrom);
+ sb.append('_');
+ sb.append((int)r.strand());
+ sb.append('_');
+ sb.append(r.start);
+ sb.append('_');
+ sb.append(r.stop);
+ }
+
+ // When a genome build is set, also append the scaffold-relative start (plus the relative
+ // stop in the simple form) and the name of the scaffold found at the read's midpoint.
+ if(Data.GENOME_BUILD>=0){
+ final int chrom1=r.chrom;
+ final int start1=r.start;
+ final int stop1=r.stop;
+ final int idx1=Data.scaffoldIndex(chrom1, (start1+stop1)/2);
+ final byte[] name1=Data.scaffoldNames[chrom1][idx1];
+ final int a1=Data.scaffoldRelativeLoc(chrom1, start1, idx1);
+ final int b1=a1-start1+stop1;
+ sb.append('_');
+ sb.append(a1);
+ if(TAG_CUSTOM_SIMPLE){
+ sb.append('_');
+ sb.append(b1);
+ }
+ sb.append('_');
+ sb.append(new String(name1));
+ }
+ }
+
+ if(ADD_PAIRNUM_TO_CUSTOM_ID){
+ sb.append(' ');
+ sb.append('/');
+ sb.append(r.pairnum()+1);
+ }
+ return sb.toString();
+ }
+
+ private static int fastqLength(Read r){
+ int len=6; //newlines, @, +
+ len+=(r.id==null ? Tools.stringLength(r.numericID) : r.id.length());
+ len+=r.length();
+ len+=(r.quality==null ? 0 : r.quality.length);
+ return len;
+ }
+
+ /**
+  * Appends a read in FASTQ form (header, bases, '+', qualities; no trailing newline) to a ByteBuilder.
+  * Missing qualities are written as Shared.FAKE_QUAL for every base.
+  * @param r Read to write.
+  * @param bb Destination, or null to allocate a new one.
+  * @return The builder that was written to.
+  */
+ public static ByteBuilder toFASTQ(Read r, ByteBuilder bb){
+     int needed=fastqLength(r);
+     final byte[] bases=r.bases, quals=r.quality;
+     final boolean tagged=(TAG_CUSTOM && r.chrom>-1 && r.stop>-1);
+     final String id=(tagged ? customID(r) : r.id);
+     if(tagged && id!=null){needed+=id.length();}
+
+     if(bb!=null){bb.ensureExtra(needed);}
+     else{bb=new ByteBuilder(needed);}
+
+     bb.append('@');
+     if(id!=null){bb.append(id);}
+     else{bb.append(r.numericID);}
+     bb.append('\n');
+
+     if(bases!=null){bb.append(bases);}
+     bb.append('\n').append('+').append('\n');
+
+     if(bases==null){
+         if(verbose){System.err.println("A:\n"+bb);}
+     }else{
+         if(verbose){System.err.println("B:\n"+bb);}
+         if(quals!=null){
+             final int qlen=quals.length;
+             bb.ensureExtra(qlen);
+             final int base=bb.length;
+             for(int i=0; i<qlen; i++){
+                 bb.array[base+i]=(byte)(quals[i]+ASCII_OFFSET_OUT);
+             }
+             bb.length+=qlen;
+             if(verbose){System.err.println("D:\n"+bb);}
+         }else{
+             //No qualities: emit one fake value per base.
+             final byte fake=(byte)(Shared.FAKE_QUAL+ASCII_OFFSET_OUT);
+             final int blen=bases.length;
+             bb.ensureExtra(blen);
+             Arrays.fill(bb.array, bb.length, bb.length+blen, fake);
+             bb.length+=blen;
+             if(verbose){System.err.println("C:\n"+bb);}
+         }
+     }
+     if(verbose){System.err.println("E:\n"+bb);}
+
+     return bb;
+ }
+
+ /**
+  * Appends a read in FASTQ form (header, bases, '+', qualities; no trailing newline) to a StringBuilder.
+  * Missing qualities are written as Shared.FAKE_QUAL for every base, matching the ByteBuilder overload.
+  * @param r Read to write.
+  * @param sb Destination, or null to allocate a new one.
+  * @return The builder that was written to.
+  */
+ public static StringBuilder toFASTQ(Read r, StringBuilder sb){
+     int len=fastqLength(r);
+     final String id;
+     final byte[] bases=r.bases, quals=r.quality;
+     if(TAG_CUSTOM && (r.chrom>-1 && r.stop>-1)){
+         id=customID(r);
+         if(id!=null){len+=id.length();}
+     }else{
+         id=r.id;
+     }
+     if(sb==null){sb=new StringBuilder(len);}
+     else{sb.ensureCapacity(len);}
+
+     sb.append('@');
+     if(id==null){sb.append(r.numericID);}
+     else{sb.append(id);}
+     sb.append('\n');
+
+     if(bases==null){
+         sb.append('\n').append('+').append('\n');
+     }else{
+         final int blen=bases.length;
+         final int qlen=(quals==null ? blen : quals.length);
+         //One scratch buffer serves both lines, so it must fit the longer of the two.
+         final char[] buffer=Shared.getTLCB(Math.max(blen, qlen));
+         for(int i=0; i<blen; i++){buffer[i]=(char)bases[i];}
+         sb.append(buffer, 0, blen);
+         sb.append('\n').append('+').append('\n');
+         if(quals==null){
+             final char q=(char)(Shared.FAKE_QUAL+ASCII_OFFSET_OUT);
+             for(int i=0; i<blen; i++){buffer[i]=q;}
+             sb.append(buffer, 0, blen);
+         }else{
+             for(int i=0; i<qlen; i++){buffer[i]=(char)(quals[i]+ASCII_OFFSET_OUT);}
+             sb.append(buffer, 0, qlen);
+         }
+     }
+
+     return sb;
+ }
+
+ /**
+  * Renders bases and qualities as the four lines of a FASTQ record.
+  * @param bases Base calls.
+  * @param quality Qualities without offset, or null to use Shared.FAKE_QUAL for every base
+  * (the same fallback the ByteBuilder overload uses).
+  * @param identifier Read name, or null to use a generated number.
+  * @return Header, bases, "+", and qualities encoded with ASCII_OFFSET_OUT.
+  */
+ public static String[] toFASTQ(byte[] bases, byte[] quality, String identifier){
+     String[] out=new String[4];
+
+     identifier=(identifier==null ? ""+incr() : identifier);
+     if(quality==null){
+         quality=new byte[bases.length];
+         Arrays.fill(quality, (byte)Shared.FAKE_QUAL);
+     }
+
+     byte[] q2=new byte[quality.length];
+     for(int i=0; i<q2.length; i++){q2[i]=(byte)(quality[i]+ASCII_OFFSET_OUT);}
+
+     assert(quality.length==bases.length);
+
+     out[0]="@"+identifier;
+     out[1]=new String(bases);
+     out[2]="+"/*+identifier*/;
+     out[3]=new String(q2);
+
+     return out;
+ }
+
+
+ /** Array form of toReadList(TextFile, int, long, boolean). */
+ public static Read[] toReads(TextFile tf, int maxReadsToReturn, long numericID, boolean interleaved){
+     final ArrayList<Read> reads=toReadList(tf, maxReadsToReturn, numericID, interleaved);
+     final int count=reads.size();
+     assert(count<=maxReadsToReturn);
+     return reads.toArray(new Read[count]);
+ }
+
+ /**
+  * Derives a read id from a header line.
+  * A leading '@' or '>' is removed; with Shared.TRIM_READ_COMMENTS set, the id ends at the first whitespace.
+  * @param s Header line.
+  * @return The id, or null if the line is null or nothing remains.
+  */
+ public static final String makeId(String s){
+     if(s==null || s.length()<1){return null;}
+     final char first=s.charAt(0);
+     final int start=((first=='@' || first=='>') ? 1 : 0);
+     int stop=s.length();
+     if(Shared.TRIM_READ_COMMENTS){
+         int i=start;
+         while(i<stop && !Character.isWhitespace(s.charAt(i))){i++;}
+         stop=i;
+     }
+     if(stop<=start){return null;}
+     //Nothing was stripped: hand back the same string.
+     if(start==0 && stop==s.length()){return s;}
+     return s.substring(start, stop);
+ }
+
+ /**
+  * Derives a read id from a header line given as bytes.
+  * A leading '@' or '>' is removed; with Shared.TRIM_READ_COMMENTS set, the id ends at the first whitespace.
+  * @param s Header line.
+  * @return The id, or null if the line is null or nothing remains.
+  */
+ public static final String makeId(byte[] s){
+     if(s==null || s.length<1){return null;}
+     final byte first=s[0];
+     final int start=((first=='@' || first=='>') ? 1 : 0);
+     int stop=s.length;
+     if(Shared.TRIM_READ_COMMENTS){
+         int i=start;
+         while(i<stop && !Character.isWhitespace(s[i])){i++;}
+         stop=i;
+     }
+     if(stop<=start){return null;}
+     return new String(s, start, stop-start);
+ }
+
+ /**
+  * Parses FASTQ records from a text file into reads.
+  * Qualities are decoded with ASCII_OFFSET; while DETECT_QUALITY is on, the offset may be switched
+  * based on the values seen, with a warning on stderr.
+  * @param tf Source file.
+  * @param maxReadsToReturn Maximum number of list entries (reads, or pairs when interleaved).
+  * @param numericID Numeric id for the first read; incremented per list entry.
+  * @param interleaved If true, consecutive records are mated and only the first of each pair is listed.
+  * @return The parsed reads.
+  */
+ public static ArrayList<Read> toReadList(TextFile tf, int maxReadsToReturn, long numericID, boolean interleaved){
+     String s=null;
+     ArrayList<Read> list=new ArrayList<Read>(Data.min(16384, maxReadsToReturn));
+
+     String[] quad=new String[4];
+
+     int cntr=0;
+     int added=0;
+
+     Read prev=null;
+
+     for(s=tf.nextLine(); s!=null && added<maxReadsToReturn; s=tf.nextLine()){
+         quad[cntr]=s;
+         cntr++;
+         if(cntr==4){
+             assert(quad[0].startsWith("@")) : "\nError in "+tf.name+", line "+tf.lineNum+"\n"+quad[0]+"\n"+quad[1]+"\n"+quad[2]+"\n"+quad[3]+"\n";
+             assert(quad[2].startsWith("+")) : "\nError in "+tf.name+", line "+tf.lineNum+"\n"+quad[0]+"\n"+quad[1]+"\n"+quad[2]+"\n"+quad[3]+"\n";
+
+             final String id=makeId(quad[0]);
+
+             Read r=null;
+
+             byte[] bases=quad[1].getBytes();
+             byte[] quals=quad[3].getBytes();
+
+             //A sufficiently long read settles the question in favor of offset 33.
+             if(DETECT_QUALITY && bases.length>=MIN_LENGTH_TO_FORCE_ASCII_33){
+                 if(ASCII_OFFSET==33){
+                     //do nothing
+                 }else{
+
+                     if(warnQualityChange){
+                         //The switch made here is from 64 to 33; the messages now say so.
+                         if(numericID<1){
+                             System.err.println("Changed from ASCII-64 to ASCII-33 due to read of length "+bases.length+".");
+                         }else{
+                             warnQualityChange=false;
+                             System.err.println("Warning! Changed from ASCII-64 to ASCII-33 due to read of length "+bases.length+".");
+                             System.err.println("Up to "+numericID+" prior reads may have been generated with incorrect qualities.");
+                             System.err.println("If this is a problem you may wish to re-run with the flag 'qin=33' or 'qin=64'.");
+                         }
+                     }
+                     ASCII_OFFSET=33;
+                 }
+                 DETECT_QUALITY=false;
+             }
+
+             for(int i=0; i<quals.length; i++){
+                 quals[i]-=ASCII_OFFSET; //Convert from ASCII33 to native.
+                 if(DETECT_QUALITY && ASCII_OFFSET==33 && (quals[i]>QUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){
+                     if(warnQualityChange){
+                         if(numericID<1){
+                             System.err.println("Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31));
+                         }else{
+                             warnQualityChange=false;
+                             System.err.println("Warning! Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31));
+                             System.err.println("Up to "+numericID+" prior reads may have been generated with incorrect qualities.");
+                             System.err.println("If this is a problem you may wish to re-run with the flag 'qin=33' or 'qin=64'.");
+                         }
+                     }
+                     ASCII_OFFSET=64;
+                     //Re-base everything decoded so far in this line.
+                     for(int j=0; j<=i; j++){
+                         quals[j]=(byte)(quals[j]-31);
+                     }
+                 }
+                 assert(quals[i]>=-5) : "\n"+quad[0]+"\n"+quad[3];
+             }
+
+             //Names of the form x_chrom_strand_start_stop carry the true location of a synthetic read.
+             if(PARSE_CUSTOM && quad[0]!=null && quad[0].indexOf('_')>0){
+                 String[] answer=quad[0].split("_");
+                 if(answer.length>=5){
+                     try {
+                         int trueChrom=Gene.toChromosome(answer[1]);
+                         byte trueStrand=Byte.parseByte(answer[2]);
+                         int trueLoc=Integer.parseInt(answer[3]);
+                         int trueStop=Integer.parseInt(answer[4]);
+                         r=new Read(bases, trueChrom, trueStrand, trueLoc, trueStop, id, quals, numericID);
+                         r.setSynthetic(true);
+                     } catch (NumberFormatException e) {
+                         //Deliberately ignored: the name is not in custom format, so r stays null
+                         //and the read is built without location information below.
+                     }
+                 }
+             }
+             if(r==null){
+                 r=new Read(bases, 0, (byte)0, 0, 0, id, quals, numericID);
+             }
+
+             cntr=0;
+
+             if(interleaved){
+                 if(prev==null){prev=r;}
+                 else{
+                     prev.mate=r;
+                     r.mate=prev;
+                     r.setPairnum(1);
+                     list.add(prev);
+                     added++;
+                     numericID++;
+                     prev=null;
+                 }
+             }else{
+                 list.add(r);
+                 added++;
+                 numericID++;
+             }
+
+             //Stop before the loop header reads (and discards) another line.
+             if(added>=maxReadsToReturn){break;}
+         }
+     }
+     assert(list.size()<=maxReadsToReturn);
+     return list;
+ }
+
+ public static ArrayList<Read> toReadList(ByteFile tf, int maxReadsToReturn, long numericID, boolean interleaved){
+ byte[] s=null;
+ ArrayList<Read> list=new ArrayList<Read>(Data.min(8192, maxReadsToReturn));
+// long numericID=numericID0;
+ byte[][] quad=new byte[4][];
+
+ int cntr=0;
+ int added=0;
+
+// int longest=0;
+
+ Read prev=null;
+
+ for(s=tf.nextLine(); s!=null && added<maxReadsToReturn; s=tf.nextLine()){
+ quad[cntr]=s;
+ cntr++;
+ if(cntr==4){
+
+ Read r=quadToRead(quad, true, false, tf, numericID);
+ cntr=0;
+
+// longest=Tools.max(longest, r.length());
+
+ if(interleaved){
+ if(prev==null){prev=r;}
+ else{
+ prev.mate=r;
+ r.mate=prev;
+ r.setPairnum(1);
+ list.add(prev);
+ added++;
+ numericID++;
+ prev=null;
+ }
+ }else{
+ list.add(r);
+ added++;
+ numericID++;
+ }
+
+
+ if(added>=maxReadsToReturn){break;}
+
+// System.out.println(r.chrom+", "+r.strand+", "+r.loc);
+// assert(false);
+ }
+ }
+ assert(list.size()<=maxReadsToReturn);
+ return list;
+ }
+
+ public static byte[][] scarfToQuad(final byte[] scarf, byte[][] quad){
+
+ int a=-1, b=-1;
+ final byte colon=':';
+ for(int i=scarf.length-1; i>=0; i--){
+ if(scarf[i]==colon){
+ if(b<0){b=i;}
+ else{
+ assert(a<0);
+ a=i;
+ break;
+ }
+ }
+ }
+ if(a<0 || b<0){
+ throw new RuntimeException("Misformatted scarf line: "+new String(scarf));
+ }
+ if(quad==null){quad=new byte[4][];}
+ quad[0]=Arrays.copyOfRange(scarf, 0, a);
+ quad[1]=Arrays.copyOfRange(scarf, a+1, b);
+ quad[3]=Arrays.copyOfRange(scarf, b+1, scarf.length);
+ return quad;
+ }
+
+ /**
+  * Converts one record (name, bases, separator, qualities) into a Read.
+  * The quality array quad[3] is decoded in place by subtracting ASCII_OFFSET. While DETECT_QUALITY is on,
+  * a value above QUAL_THRESH under offset 33 switches ASCII_OFFSET to 64.
+  * @param quad The four lines of the record; quad[2] is only read by the verbose output and assertions.
+  * @param fastq Not read by this method.
+  * @param scarf True for scarf input, which skips the '@' and '+' assertions.
+  * @param tf Source file, used only in assertion messages.
+  * @param numericID Numeric id for the read.
+  * @return The read, or null if a quality below -5 was found and IGNORE_BAD_QUALITY is off
+  * (errorState is then set).
+  */
+ public static Read quadToRead(final byte[][] quad, boolean fastq, boolean scarf, ByteFile tf, long numericID){
+
+ if(verbose){
+ System.err.println("\nASCII offset is "+ASCII_OFFSET);
+ System.err.println("quad:");
+ System.err.println(new String(quad[0]));
+ System.err.println(new String(quad[1]));
+ System.err.println(new String(quad[2]));
+ System.err.println(new String(quad[3]));
+ }
+
+ assert(scarf || (quad[0].length>0 && quad[0][0]==(byte)'@')) : "\nError in "+tf.name()+", line "+tf.lineNum()+", with these 4 lines:\n"+
+ new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3])+"\n";
+ assert(scarf || (quad[0].length>0 && quad[2].length>0 && quad[2][0]==(byte)'+')) : "\nError in "+tf.name()+", line "+tf.lineNum()+", with these 4 lines:\n"+
+ new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3])+"\n";
+
+ // if(quad[0].startsWith("@HW") || quad[0].startsWith("@FC")){ascii_offset=66;} //TODO: clumsy
+
+ final String id=makeId(quad[0]);
+
+ Read r=null;
+
+ // bases and quals alias quad[1] and quad[3]; the decoding below modifies quad[3] itself.
+ byte[] bases=quad[1];
+ byte[] quals=quad[3];
+ // assert(false) : Arrays.toString(quals);
+ for(int i=0; i<quals.length; i++){
+ quals[i]-=ASCII_OFFSET; //Convert from ASCII33 to native.
+ if(DETECT_QUALITY && ASCII_OFFSET==33 && (quals[i]>QUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){
+ if(warnQualityChange){
+ if(numericID<1){
+ System.err.println("Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31));
+ }else{
+ // Reads were already produced with the old offset: warn once and flag an error.
+ warnQualityChange=false;
+ System.err.println("Warning! Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31));
+ System.err.println("Up to "+numericID+" prior reads may have been generated with incorrect qualities.");
+ System.err.println("If this is a problem you may wish to re-run with the flag 'qin=33' or 'qin=64'.");
+ errorState=true;
+ }
+ }
+ ASCII_OFFSET=64;
+ // Re-base everything decoded so far in this line.
+ for(int j=0; j<=i; j++){
+ quals[j]=(byte)(quals[j]-31);
+ }
+ }
+ if(quals[i]<-5){
+
+ if(IGNORE_BAD_QUALITY){
+ quals[i]=0;
+ }else{
+ // The message is printed only the first time; every occurrence sets errorState and returns null.
+ if(!negativeFive){
+ // NOTE(review): raises every value to at least 33 before quad[3] is printed,
+ // presumably so the line shows as visible characters - confirm.
+ for(int j=0; j<quals.length; j++){quals[j]=Tools.max(quals[j], (byte)33);}
+ System.err.println("\nThe ASCII quality encoding offset ("+ASCII_OFFSET+") is not set correctly, or the reads are corrupt; quality value below -5.\n" +
+ "Please re-run with the flag 'qin=33' or 'ignorebadquality'.\nProblematic read number "+numericID+":\n" +
+
+ "\n"+new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3])+"\n");
+ }
+ errorState=true;
+ negativeFive=true;
+ return null;
+ }
+
+ }
+ assert(quals[i]>=-5);
+ // assert(quals[i]>=-5) : "The ASCII quality encoding level is not set correctly. Quality value below -5:" +
+ // "\n"+new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3]);
+ }
+ // assert(false) : Arrays.toString(quals);
+ // assert(false) : PARSE_CUSTOM+"\n"+new String(quad[0]);
+ // Custom names carry chrom, strand, start and stop in underscore-delimited fields 1-4.
+ // Any name that does not fit turns PARSE_CUSTOM off for all later reads.
+ if(PARSE_CUSTOM){
+ if(quad[0]!=null && Tools.indexOf(quad[0], (byte)'_')>0){
+ String temp=new String(quad[0]);
+ if(temp.endsWith(" /1") || temp.endsWith(" /2")){temp=temp.substring(0, temp.length()-3);}
+ String[] answer=temp.split("_");
+
+ if(answer.length>=5){
+ try {
+ int trueChrom=Gene.toChromosome(answer[1]);
+ byte trueStrand=Byte.parseByte(answer[2]);
+ int trueLoc=Integer.parseInt(answer[3]);
+ int trueStop=Integer.parseInt(answer[4]);
+ r=new Read(bases, trueChrom, trueStrand, trueLoc, trueStop, id, quals, numericID);
+ r.setSynthetic(true);
+ } catch (NumberFormatException e) {
+ PARSE_CUSTOM=false;
+ System.err.println("Turned off PARSE_CUSTOM because could not parse "+new String(quad[0]));
+ }
+ }else{
+ PARSE_CUSTOM=false;
+ System.err.println("Turned off PARSE_CUSTOM because answer="+Arrays.toString(answer));
+ }
+ }else{
+ PARSE_CUSTOM=false;
+ System.err.println("Turned off PARSE_CUSTOM because quad[0]="+new String(quad[0])+", index="+Tools.indexOf(quad[0], (byte)'_'));
+ }
+ }
+ // Fallback: a read with no location information.
+ if(r==null){
+ r=new Read(bases, 0, (byte)0, 0, 0, id, quals, numericID);
+ }
+ return r;
+ }
+
+ public static ArrayList<Read> toScarfReadList(ByteFile tf, int maxReadsToReturn, long numericID, boolean interleaved){
+ byte[] s=null;
+ ArrayList<Read> list=new ArrayList<Read>(Data.min(16384, maxReadsToReturn));
+
+ byte[][] quad=new byte[4][];
+
+ int added=0;
+
+ Read prev=null;
+
+ for(s=tf.nextLine(); s!=null && added<maxReadsToReturn; s=tf.nextLine()){
+ scarfToQuad(s, quad);
+ Read r=quadToRead(quad, false, true, tf, numericID);
+
+ if(interleaved){
+ if(prev==null){prev=r;}
+ else{
+ prev.mate=r;
+ r.mate=prev;
+ r.setPairnum(1);
+ list.add(prev);
+ added++;
+ numericID++;
+ prev=null;
+ }
+ }else{
+ list.add(r);
+ added++;
+ numericID++;
+ }
+
+
+ if(added>=maxReadsToReturn){break;}
+ }
+ assert(list.size()<=maxReadsToReturn);
+ return list;
+ }
+
+ /** Encodes qualities as text by adding ASCII_OFFSET (the input offset) to every value. */
+ public static String qualToString(byte[] quals){
+     final byte[] encoded=new byte[quals.length];
+     int i=0;
+     for(final byte q : quals){
+         encoded[i++]=(byte)(q+ASCII_OFFSET);
+     }
+     return new String(encoded);
+ }
+
+ /** Return true if this has detected an error */
+ public static boolean errorState(){return errorState;}
+ /** Set by quadToRead when the quality offset changes after reads were already produced, or when a quality below -5 is found. */
+ private static boolean errorState=false;
+ /** True once quadToRead has printed its below -5 quality message; suppresses repeats. */
+ private static boolean negativeFive=false;
+
+ /** Supplies identifiers for toFASTQ(byte[], byte[], String) when none is given. */
+ private static synchronized long incr(){return incr++;}
+ private static long incr=10000000000L;
+
+ /** Parse chrom, strand, start and stop from underscore-delimited read names; quadToRead turns this off on the first name that does not fit. */
+ public static boolean PARSE_CUSTOM=false;
+ /** Make customID append location information to read names. */
+ public static boolean TAG_CUSTOM=false;
+ /** Use the shorter location suffix in customID. */
+ public static boolean TAG_CUSTOM_SIMPLE=false;
+ /** Make customID discard the existing read id (guarded by an assertion). */
+ public static boolean DELETE_OLD_NAME=false;
+ /** Offset subtracted from quality characters on input. */
+ public static byte ASCII_OFFSET=33;
+ /** Offset added to qualities on output. */
+ public static byte ASCII_OFFSET_OUT=33;
+ /** Let isInterleaved examine the first two record names. */
+ public static boolean TEST_INTERLEAVED=true;
+ /** Result of isInterleaved when no test is run; also makes testInterleaved return true for any complete octet. */
+ public static boolean FORCE_INTERLEAVED=false;
+ /** Allow ASCII_OFFSET to be changed based on the quality values seen. */
+ public static boolean DETECT_QUALITY=true;
+ /** Let testQuality change ASCII_OFFSET_OUT together with ASCII_OFFSET. */
+ public static boolean DETECT_QUALITY_OUT=true;
+ /** Make customID end names with " /1" or " /2". */
+ public static boolean ADD_PAIRNUM_TO_CUSTOM_ID=true;
+
+ /** A read at least this long sets ASCII_OFFSET to 33 and ends detection. */
+ public static final int MIN_LENGTH_TO_FORCE_ASCII_33=200;
+ /** Under offset 33, a decoded quality above this value triggers the switch to offset 64. */
+ public static final int QUAL_THRESH=54;
+ /** In quadToRead, replace qualities below -5 with 0 rather than rejecting the read; also disables the related assertions in testQuality. */
+ public static boolean IGNORE_BAD_QUALITY=false;
+ public static boolean verbose=false;
+
+ /** Print a message when the quality offset is changed. */
+ public static boolean warnQualityChange=true;
+
+// public static int minLength=0;
+// public static int maxLength=0;
+
+}
diff --git a/current/stream/FastaQualReadInputStream.java b/current/stream/FastaQualReadInputStream.java
new file mode 100755
index 0000000..a6d66cb
--- /dev/null
+++ b/current/stream/FastaQualReadInputStream.java
@@ -0,0 +1,340 @@
+package stream;
+
+import java.util.ArrayList;
+
+import align2.Shared;
+import align2.Tools;
+
+import dna.Data;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+
+public class FastaQualReadInputStream extends ReadInputStream {
+
+ /**
+  * Test driver: prints the first few reads of a fasta/qual file pair.
+  * @param args args[0] is the fasta file, args[1] the quality file.
+  */
+ public static void main(String[] args){
+     final FastaQualReadInputStream stream=new FastaQualReadInputStream(args[0], args[1], true);
+
+     int printed=0;
+     Read current=stream.next();
+     while(current!=null){
+         System.out.println(current.toText(false));
+         current=stream.next();
+         final boolean limitReached=(printed>3);
+         printed++;
+         if(limitReached){break;}
+     }
+ }
+
+ /**
+ * Convenience constructor: builds a FileFormat for fname via FileFormat.testInput (FASTA default)
+ * and delegates to the (FileFormat, String) constructor.
+ * @param fname Name of the fasta file holding the bases.
+ * @param qfname Name of the quality file.
+ * @param allowSubprocess_ Passed through to FileFormat.testInput.
+ */
+ public FastaQualReadInputStream(String fname, String qfname, boolean allowSubprocess_){
+ this(FileFormat.testInput(fname, FileFormat.FASTA, null, allowSubprocess_, false), qfname);
+ }
+
+ /**
+ * Opens the bases file and the quality file as ByteFiles.
+ * Prints a warning to stderr when ff is neither fasta nor stdio.
+ * This stream is never interleaved.
+ * @param ff Format descriptor of the fasta (bases) input.
+ * @param qfname Name of the quality file; opened with the same allowSubprocess setting as ff.
+ */
+ public FastaQualReadInputStream(FileFormat ff, String qfname){
+
+ if(!ff.fasta() && !ff.stdio()){
+ System.err.println("Warning: Did not find expected fasta file extension for filename "+ff.name());
+ }
+
+ btf=ByteFile.makeByteFile(ff, false);
+ qtf=ByteFile.makeByteFile(FileFormat.testInput(qfname, FileFormat.QUAL, null, ff.allowSubprocess(), false), false);
+ interleaved=false;
+
+ }
+
+ /** No-op; this stream does nothing on start. */
+ @Override
+ public void start() {
+// if(cris!=null){cris.start();}
+ }
+
+
+ /**
+  * Refills the buffer if it is exhausted and the bases file is still open.
+  * @return true if at least one unread Read remains in the buffer.
+  */
+ @Override
+ public boolean hasMore() {
+     final boolean exhausted=(buffer==null || next>=buffer.size());
+     if(exhausted){
+         if(!btf.isOpen()){
+             assert(generated>0) : "Was the file empty?";
+         }else{
+             fillBuffer();
+         }
+     }
+     if(buffer==null){return false;}
+     return next<buffer.size();
+ }
+
+ /**
+  * Returns the next buffered Read and clears its slot in the buffer.
+  * @return The next Read, or null when hasMore() is false.
+  */
+ @Override
+ public Read next() {
+     if(hasMore()){
+         final Read current=buffer.set(next, null);
+         next++;
+         consumed++;
+         return current;
+     }
+     if(verbose){System.err.println("hasMore() returned false; buffer="+(buffer==null ? null : buffer.size())+", next="+next+", consumed="+consumed);}
+     return null;
+ }
+
+ /**
+  * Blockwise access: hands the whole current buffer to the caller.
+  * @return The buffered list of Reads, or null if it is missing or empty.
+  * @throws RuntimeException if next() was already used on the current buffer.
+  */
+ @Override
+ public synchronized ArrayList<Read> nextList() {
+     if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+     final boolean needFill=(buffer==null || next>=buffer.size());
+     if(needFill){fillBuffer();}
+     final ArrayList<Read> batch=buffer;
+     buffer=null;
+     if(batch==null || batch.isEmpty()){return null;}
+     consumed+=batch.size();
+     return batch;
+ }
+
+ /**
+  * Replaces the buffer with up to BUF_LEN new Reads, advances nextReadID and generated,
+  * and closes the stream when fewer than BUF_LEN Reads were produced.
+  */
+ private synchronized void fillBuffer(){
+     if(builder==null){
+         builder=new ByteBuilder(2000);
+     }
+     if(verbose){System.err.println("Filling buffer.  buffer="+(buffer==null ? null : buffer.size()));}
+     assert(buffer==null || next>=buffer.size());
+
+     next=0;
+     buffer=null;
+     buffer=toReads(BUF_LEN, nextReadID, interleaved);
+
+     final int produced;
+     if(buffer==null){produced=0;}
+     else{produced=buffer.size();}
+
+     if(verbose){System.err.println("Filled buffer.  size="+produced);}
+
+     nextReadID+=produced;
+     if(produced<BUF_LEN){
+         // A short batch means the input is exhausted
+         if(verbose){System.err.println("Closing tf");}
+         errorState|=close();
+     }
+
+     generated+=produced;
+     if(verbose){System.err.println("generated="+generated);}
+ }
+
+ private ArrayList<Read> toReads(int maxReadsToReturn, long numericID, boolean interleaved){
+ ArrayList<Read> list=toReadList(maxReadsToReturn, numericID, interleaved);
+ if(list==null){assert(finished);}
+ else{assert(list.size()<=maxReadsToReturn);}
+ return list;
+ }
+
+ /**
+ * Builds a list of at most maxReadsToReturn Reads from the bases and quality files.
+ * @param maxReadsToReturn Upper bound on the number of list entries.
+ * @param numericID Numeric ID given to the first Read; incremented per list entry.
+ * @param interleaved If true, consecutive Reads are linked as mates and only the first of each pair is listed.
+ * @return The list (possibly empty), or null if the stream is finished or no header was found.
+ */
+ private ArrayList<Read> toReadList(int maxReadsToReturn, long numericID, boolean interleaved){
+ if(finished){return null;}
+ if(verbose){System.err.println("FastaQualRIS fetching a list.");}
+
+ // First call: advance both files to their first header line; the returned data is ignored.
+ if(currentHeader==null && numericID==0){
+// assert(numericID==0) : numericID;
+ nextBases(btf, builder);
+ nextQualities(qtf, builder);
+ if(nextHeaderB==null){
+ finish();
+ return null;
+ }
+ assert(Tools.equals(nextHeaderB, nextHeaderQ)) : "Quality and Base headers differ for read "+numericID;
+ currentHeader=nextHeaderB;
+ nextHeaderB=nextHeaderQ=null;
+ if(currentHeader==null){
+ finish();
+ return null;
+ }
+ }
+
+ ArrayList<Read> list=new ArrayList<Read>(Data.min(1000, maxReadsToReturn));
+
+ int added=0;
+
+ Read prev=null;
+
+ while(added<maxReadsToReturn){
+ Read r=makeRead(numericID);
+ if(verbose){System.err.println("Made "+r);}
+ if(r==null){
+ finish();
+ if(verbose){System.err.println("makeRead returned null.");}
+ break;
+ }
+ if(interleaved){
+ // Hold the first read of a pair until its mate arrives; an unpaired trailing read is not added.
+ if(prev==null){prev=r;}
+ else{
+ prev.mate=r;
+ r.mate=prev;
+ list.add(prev);
+ added++;
+ numericID++;
+ prev=null;
+ }
+ }else{
+ list.add(r);
+ added++;
+ numericID++;
+ }
+ }
+
+ assert(list.size()<=maxReadsToReturn);
+ if(verbose){System.err.println("FastaQualRIS returning a list. Size="+list.size());}
+ return list;
+ }
+
+ /**
+  * Accumulates lines from btf into bb until a header line (starting with '>') or end of file.
+  * A header line, if found, is stored in nextHeaderB.
+  * @param btf Source of base lines.
+  * @param bb Scratch builder; must be empty on entry and is emptied before returning.
+  * @return The concatenated bytes of the lines read before the header.
+  */
+ private final byte[] nextBases(ByteFile btf, ByteBuilder bb){
+     assert(bb.length()==0);
+
+     byte[] line;
+     for(line=btf.nextLine(); line!=null; line=btf.nextLine()){
+         if(line.length>0 && line[0]==carrot){break;}
+         bb.append(line);
+     }
+
+     if(line!=null){
+         // Stopped on a header line rather than end of file
+         assert(line.length>0);
+         assert(line[0]==carrot);
+         nextHeaderB=line;
+     }
+
+     final byte[] collected=bb.toBytes();
+     bb.setLength(0);
+     return collected;
+ }
+
+ /**
+  * Accumulates quality scores from qtf into bb until a header line (starting with '>') or end of file.
+  * A header line, if found, is stored in nextHeaderQ.
+  * When NUMERIC_QUAL is set, each line is parsed as space-separated decimal numbers;
+  * otherwise each byte has FASTQ.ASCII_OFFSET subtracted.
+  * Leading, trailing and repeated spaces are ignored so they do not yield spurious 0 scores
+  * that would break the bases/qualities length match.
+  * @param qtf Source of quality lines.
+  * @param bb Scratch builder; must be empty on entry and is emptied before returning.
+  * @return The quality values read before the header.
+  */
+ private final byte[] nextQualities(ByteFile qtf, ByteBuilder bb){
+     assert(bb.length()==0);
+     byte[] line=qtf.nextLine();
+     while(line!=null && (line.length==0 || line[0]!=carrot)){
+         if(NUMERIC_QUAL && line.length>0){
+             int x=0;
+             boolean pending=false; //True while a number has been started but not yet appended
+             for(int i=0; i<line.length; i++){
+                 final byte b=line[i];
+                 if(b==space){
+                     if(pending){
+                         bb.append((byte)x);
+                         x=0;
+                         pending=false;
+                     }
+                 }else{
+                     x=10*x+(b-zero);
+                     pending=true;
+                 }
+             }
+             if(pending){bb.append((byte)x);}
+         }else{
+             for(byte b : line){bb.append((byte)(b-FASTQ.ASCII_OFFSET));}
+         }
+         line=qtf.nextLine();
+     }
+
+     if(line==null){//end of file
+         //do nothing
+     }else{
+         assert(line.length>0);
+         assert(line[0]==carrot);
+         nextHeaderQ=line;
+     }
+     final byte[] r=bb.toBytes();
+     bb.setLength(0);
+
+     return r;
+ }
+
+ /**
+  * Builds one Read from the record whose header is currentHeader, consuming the bases and
+  * qualities up to the next header in each file, then advances currentHeader.
+  * Bases are converted to upper case.
+  * @param numericID Numeric ID assigned to the Read.
+  * @return The new Read, or null if the stream is finished or there is no current header.
+  */
+ private Read makeRead(long numericID){
+     if(finished){
+         if(verbose){System.err.println("Returning null because finished.");}
+         return null;
+     }
+     if(currentHeader==null){return null;}
+     assert(nextHeaderB==null);
+     assert(nextHeaderQ==null);
+
+     final byte[] bases=nextBases(btf, builder);
+     final byte[] quals=nextQualities(qtf, builder);
+     final byte[] header=currentHeader;
+
+     //The header found while reading this record belongs to the next record
+     currentHeader=nextHeaderB;
+     nextHeaderB=nextHeaderQ=null;
+
+     if(bases==null){
+         if(verbose){System.err.println("Returning null because tf.nextLine()==null: A");}
+         return null;
+     }
+
+     assert(bases.length==quals.length) :
+         "\nFor sequence "+numericID+", name "+new String(header)+":\n" +
+         "The bases and quality scores are different lengths, "+bases.length+" and "+quals.length;
+
+     for(int i=0; i<bases.length; i++){
+         bases[i]=(byte)Character.toUpperCase(bases[i]);
+     }
+
+     //Guard the index: a header with no sequence yields a zero-length array,
+     //and the check must not itself throw ArrayIndexOutOfBoundsException.
+     assert(bases.length==0 || bases[0]!=carrot) : new String(bases)+"\n"+numericID+"\n"+header[0];
+     String hd=new String(header, 1, header.length-1); //Strip the leading '>'
+     Read r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, numericID);
+     return r;
+ }
+
+ /**
+  * Closes both underlying files once; later calls return errorState without closing again.
+  * @return On the first call, the OR of the two ByteFile.close() results; afterwards errorState.
+  */
+ public synchronized boolean close(){
+     if(closed){
+         return errorState;
+     }
+     if(verbose){System.err.println("FastaQualRIS closing.");}
+     builder=null;
+     finish();
+     final boolean basesResult=btf.close();
+     final boolean qualsResult=qtf.close();
+     closed=true;
+     return basesResult|qualsResult;
+ }
+
+ /** Resets all counters, flags and cached headers, discards the buffer, and resets both input files. */
+ @Override
+ public synchronized void restart() {
+ if(verbose){System.err.println("FastaQualRIS restarting.");}
+ generated=0;
+ consumed=0;
+ next=0;
+ nextReadID=0;
+
+ finished=false;
+ closed=false;
+
+ buffer=null;
+ nextHeaderB=null;
+ nextHeaderQ=null;
+ currentHeader=null;
+ builder=null;
+
+ btf.reset();
+ qtf.reset();
+ }
+
+ /** @return The interleaved flag (set to false by the constructor). */
+ @Override
+ public boolean paired() {
+ return interleaved;
+ }
+
+ /** Marks the stream as finished; makeRead and toReadList return null once this flag is set. */
+ private synchronized void finish(){
+ if(verbose){System.err.println("FastaQualRIS setting finished "+finished+" -> "+true);}
+ finished=true;
+ }
+
+ private ArrayList<Read> buffer=null;
+ private int next=0;
+
+ private final ByteFile btf;
+ private final ByteFile qtf;
+ private final boolean interleaved;
+
+ private final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+
+ public long generated=0;
+ public long consumed=0;
+ private long nextReadID=0;
+
+ public static boolean NUMERIC_QUAL=true;
+
+ public static boolean verbose=false;
+
+ private byte[] nextHeaderB=null;
+ private byte[] nextHeaderQ=null;
+
+ private byte[] currentHeader=null;
+
+ private ByteBuilder builder=null;
+
+ private boolean finished=false, closed=false;
+ private final byte carrot='>', space=' ', zero='0';
+
+}
diff --git a/current/stream/FastaReadInputStream.java b/current/stream/FastaReadInputStream.java
new file mode 100755
index 0000000..98149e4
--- /dev/null
+++ b/current/stream/FastaReadInputStream.java
@@ -0,0 +1,572 @@
+package stream;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+import dna.Gene;
+import dna.Timer;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Feb 13, 2013
+ *
+ */
+public class FastaReadInputStream extends ReadInputStream {
+
+ /**
+  * Test driver: prints up to args[1] reads (default 20), then keeps reading until the
+  * read count passes args[2] or the input ends, and reports the elapsed time.
+  * args[3] and args[4] optionally override MIN_READ_LEN and TARGET_READ_LEN.
+  * @param args args[0] is the fasta file name.
+  */
+ public static void main(String[] args){
+
+     int printLimit=20;
+     int readLimit=Integer.MAX_VALUE;
+     if(args.length>1){printLimit=Integer.parseInt(args[1]);}
+     if(args.length>2){readLimit=Integer.parseInt(args[2]);}
+     if(args.length>3){MIN_READ_LEN=Integer.parseInt(args[3]);}
+     if(args.length>4){TARGET_READ_LEN=Integer.parseInt(args[4]);}
+     if(TARGET_READ_LEN<1){
+         TARGET_READ_LEN=Integer.MAX_VALUE;
+         SPLIT_READS=false;
+     }
+
+     final Timer timer=new Timer();
+
+     final FastaReadInputStream stream=new FastaReadInputStream(args[0], false, false, false, Shared.READ_BUFFER_MAX_DATA);
+     Read current=stream.next();
+     int seen=0;
+
+     // Phase 1: print reads
+     while(current!=null){
+         if(seen<printLimit){System.out.println(current.toText(false));}
+         current=stream.next();
+         seen++;
+         if(seen>=printLimit){break;}
+     }
+     // Phase 2: read without printing
+     while(current!=null){
+         final boolean belowLimit=(seen<readLimit);
+         seen++;
+         if(!belowLimit){break;}
+         current=stream.next();
+     }
+     timer.stop();
+     System.out.println("Time: \t"+timer);
+ }
+
+ /**
+  * Loads every read of a fasta file into memory through a ConcurrentReadInputStream.
+  * @param fname Name of the fasta file; may be null.
+  * @return All reads in the file (possibly an empty list), or null if fname is null.
+  */
+ public static ArrayList<Read> toReads(String fname){
+     if(fname==null){return null;}
+     ArrayList<Read> list=new ArrayList<Read>();
+
+     /* Start an input stream */
+     FileFormat ff=FileFormat.testInput(fname, FileFormat.FASTA, null, false, true);
+     ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null);
+     cris.start(); //4567
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+     /* Iterate through read lists from the input stream */
+     while(reads!=null && reads.size()>0){
+         list.addAll(reads);
+
+         /* Dispose of the old list and fetch a new one */
+         cris.returnList(ln.id, ln.list.isEmpty());
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     /* Cleanup.  The loop also exits when nextList() returned null or a ListNum with a null list,
+      * so both must be checked before dereferencing. */
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+     }
+//     errorState|=ReadWrite.closeStream(cris);
+
+     return list;
+ }
+
+ /**
+ * Convenience constructor: builds a FileFormat for fname via FileFormat.testInput and delegates
+ * to the (FileFormat, boolean, boolean, long) constructor.
+ * @param fname Name of the fasta input.
+ * @param interleaved_ Whether consecutive reads are treated as mates.
+ * @param amino_ Whether reads are flagged with Read.AAMASK.
+ * @param allowSubprocess_ Passed through to FileFormat.testInput.
+ * @param maxdata Per-list data limit; values below 1 select Shared.READ_BUFFER_MAX_DATA.
+ */
+ public FastaReadInputStream(String fname, boolean interleaved_, boolean amino_, boolean allowSubprocess_, long maxdata){
+ this(FileFormat.testInput(fname, FileFormat.FASTA, FileFormat.FASTA, 0, allowSubprocess_, false, false), interleaved_, amino_, maxdata);
+ }
+
+ /**
+ * Captures the current static settings (MIN_READ_LEN, SPLIT_READS, TARGET_READ_LEN) into
+ * per-instance limits and opens the input stream.
+ * Prints a warning to stderr when the name lacks a fasta extension and does not start with "stdin".
+ * @param ff Format descriptor of the input.
+ * @param interleaved_ Whether consecutive reads are treated as mates.
+ * @param amino_ If true, flag is set to Read.AAMASK; otherwise 0.
+ * @param maxdata Per-list data limit; values below 1 select Shared.READ_BUFFER_MAX_DATA.
+ */
+ public FastaReadInputStream(FileFormat ff, boolean interleaved_, boolean amino_, long maxdata){
+ name=ff.name();
+ amino=amino_;
+ flag=(amino ? Read.AAMASK : 0);
+
+ if(!fileIO.FileFormat.hasFastaExtension(name) && !name.startsWith("stdin")){
+ System.err.println("Warning: Did not find expected fasta file extension for filename "+name);
+ }
+
+ interleaved=interleaved_;
+ allowSubprocess=ff.allowSubprocess();
+ minLen=MIN_READ_LEN;
+ // Without SPLIT_READS (or with a non-positive target) reads are never split.
+ maxLen=(SPLIT_READS ? (TARGET_READ_LEN>0 ? TARGET_READ_LEN : Integer.MAX_VALUE) : Integer.MAX_VALUE);
+ MAX_DATA=maxdata>0 ? maxdata : Shared.READ_BUFFER_MAX_DATA;
+
+ ins=open();
+
+ // settingsOK() throws on invalid settings, so this only validates when assertions are enabled.
+ assert(settingsOK());
+ }
+
+ /**
+  * Returns the next Read of the current list and clears its slot.
+  * @return The next Read, or null when hasMore() is false.
+  */
+ @Override
+ public Read next() {
+     if(hasMore()){
+         final Read current=currentList.set(nextReadIndex, null);
+         nextReadIndex++;
+         consumed++;
+         return current;
+     }
+     if(verbose){System.err.println("hasMore() returned false; currentList="+
+             (currentList==null ? null : currentList.size())+", nextReadIndex="+nextReadIndex+", consumed="+consumed);}
+     return null;
+ }
+
+ /**
+  * Blockwise access: hands the whole current list to the caller, filling it first if needed.
+  * @return The list of Reads, or null if none could be produced.
+  * @throws RuntimeException if next() was already used on the current list.
+  */
+ @Override
+ public ArrayList<Read> nextList() {
+     if(nextReadIndex!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+     final boolean needFill=(currentList==null || nextReadIndex>=currentList.size());
+     if(needFill){
+         fillList();
+     }
+     final ArrayList<Read> batch=currentList;
+     currentList=null;
+     if(batch==null || batch.isEmpty()){return null;}
+     consumed+=batch.size();
+     return batch;
+ }
+
+ /**
+  * Refills the current list if it is exhausted and the stream is still open.
+  * @return true if at least one unread Read remains in the current list.
+  */
+ @Override
+ public boolean hasMore() {
+     final boolean exhausted=(currentList==null || nextReadIndex>=currentList.size());
+     if(exhausted && open){
+         fillList();
+     }
+     if(currentList==null){return false;}
+     return nextReadIndex<currentList.size();
+ }
+
+ /** Closes the stream if it is open, resets read counters and buffer positions, and reopens the input. */
+ @Override
+ public void restart() {
+ if(ins!=null){close();}
+ assert(ins==null);
+// generated=0;
+ consumed=0;
+ nextReadIndex=0;
+ nextReadID=0;
+ currentList=null;
+ header=null;
+ bstart=0;
+ bstop=0;
+
+ currentSection=0;
+ if(ins==null){
+ ins=open();
+ }else{
+ assert(false) : "is should be null";
+ }
+ }
+
+ /**
+ * Closes the input stream if it is open. System.in is not passed to ReadWrite.finishReading.
+ * Failures are recorded in errorState rather than in the return value.
+ * @return Always false.
+ */
+ @Override
+ public final boolean close(){
+ if(!open){return false;}
+ open=false;
+ assert(ins!=null);
+
+ try {
+ if(ins!=System.in){
+ errorState|=ReadWrite.finishReading(ins, name, allowSubprocess);
+ }
+ } catch (Exception e) {
+ System.err.println("Some error occured: "+e);
+ errorState=true;
+ }
+
+ ins=null;
+ return false;
+ }
+
+ /** @return The interleaved flag given to the constructor. */
+ @Override
+ public boolean paired() {return interleaved;}
+
+ /** No-op; the input is already opened by the constructor. */
+ @Override
+ public void start() {}
+
+ /**
+ * Replaces currentList with a new batch of Reads, stopping at BUF_LEN reads or once the
+ * accumulated read length reaches MAX_DATA.
+ * @return true if the new list holds at least one Read; false if the stream is closed or no header was found.
+ */
+ private final boolean fillList(){
+// assert(open);
+ if(!open){
+ currentList=null;
+ return false;
+ }
+ assert(currentList==null || nextReadIndex>=currentList.size());
+ nextReadIndex=0;
+ currentList=new ArrayList<Read>(BUF_LEN);
+
+ // Prime the first header if none is pending.
+ if(header==null){
+ header=nextHeader();
+ if(header==null){
+ currentList=null;
+ return false;
+ }
+ }
+ long len=0;
+ for(int i=0; i<BUF_LEN && len<MAX_DATA; i++){
+ Read r=generateRead(0);
+ if(r==null){break;}
+ currentList.add(r);
+ len+=r.length();
+ if(interleaved){
+ // The mate is linked to r, not added to the list; if it is missing, r stays in the list unpaired.
+ Read r2=generateRead(1);
+ if(r2==null){break;}
+ len+=r2.length();
+ r.mate=r2;
+ r2.mate=r;
+ }
+ nextReadID++;
+ if(verbose){System.err.println("Generated a read; i="+i+", BUF_LEN="+BUF_LEN);}
+// if(i==1){assert(false) : r.numericID+", "+r.mate.numericID;}
+ }
+
+ return currentList.size()>0;
+ }
+
+ /**
+ * Creates the next Read from the input, moving on to following records whenever nextBases()
+ * returns null for the current one.
+ * @param pairnum Pair number stored in the Read via setPairnum.
+ * @return The new Read, or null (after closing the stream) when no further header exists.
+ */
+ private final Read generateRead(int pairnum){
+ if(verbose){System.err.println("Called generateRead(); bstart="+bstart+", bstop="+bstop+", currentSection="+currentSection+", header="+header);}
+ assert(header!=null) : "Null header for fasta read - input file may be corrupt: "+name;
+ // The buffer is positioned on a '>' so a new record begins: read its header and restart section numbering.
+ if(bstart<bstop && buffer[bstart]==carrot){
+ header=nextHeader();
+ currentSection=0;
+ }
+ assert(header!=null);
+ byte[] bases=nextBases();
+
+ currentSection++;
+ // Skip records until one yields bases; stop at end of input.
+ while(bases==null){
+// if(!open){return null;} //Should not be needed...
+ header=nextHeader();
+ if(header==null){
+ close();
+ return null;
+ }
+ bases=nextBases();
+ currentSection=1;
+ }
+ assert(bases!=null);
+ assert(bases.length>0);
+
+ byte[] quals=null;
+ if(FAKE_QUALITY){
+ quals=new byte[bases.length];
+ Arrays.fill(quals, (byte)(Shared.FAKE_QUAL));
+ }
+// String hd=((currentSection==1 && !hitmax) ? header : header+"_"+currentSection);
+ // The plain header is used only for section 1 when FORCE_SECTION_NAME is off; otherwise "_part_N" is appended.
+ String hd=((!FORCE_SECTION_NAME && currentSection==1 && bases.length<=maxLen) ? header : header+"_part_"+currentSection);
+// assert(false) : FORCE_SECTION_NAME+", "+(currentSection==1)+", "+(bases.length<=maxLen)+", "+bases.length+", "+maxLen;
+ assert(currentSection==1 || bases.length>0) : "id="+hd+", section="+currentSection+", len="+bases.length+"\n"+new String(bases);
+ Read r=null;
+ // Optional parsing of '_'-delimited header fields 1-4 as chromosome, strand, location and stop.
+ // Any failure turns FASTQ.PARSE_CUSTOM off (a static setting) and falls through to the generic Read below.
+ if(FASTQ.PARSE_CUSTOM){
+ if(header!=null && header.indexOf('_')>0){
+ String temp=header;
+ if(temp.endsWith(" /1") || temp.endsWith(" /2")){temp=temp.substring(0, temp.length()-3);}
+ String[] answer=temp.split("_");
+
+ if(answer.length>=5){
+ try {
+ int trueChrom=Gene.toChromosome(answer[1]);
+ byte trueStrand=Byte.parseByte(answer[2]);
+ int trueLoc=Integer.parseInt(answer[3]);
+ int trueStop=Integer.parseInt(answer[4]);
+// r=new Read(bases, trueChrom, trueStrand, trueLoc, trueStop, hd, quals, nextReadID);
+ r=new Read(bases, trueChrom, trueLoc, trueStop, hd, quals, nextReadID, (flag|trueStrand));
+ r.setSynthetic(true);
+ } catch (NumberFormatException e) {
+ FASTQ.PARSE_CUSTOM=false;
+ System.err.println("Turned off PARSE_CUSTOM because could not parse "+new String(header));
+ }
+ }else{
+ FASTQ.PARSE_CUSTOM=false;
+ System.err.println("Turned off PARSE_CUSTOM because answer="+Arrays.toString(answer));
+ }
+ }else{
+ FASTQ.PARSE_CUSTOM=false;
+ System.err.println("Turned off PARSE_CUSTOM because header="+header+", index="+header.indexOf('_'));
+ }
+ }
+ if(r==null){
+// r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, nextReadID);
+ r=new Read(bases, -1, -1, -1, hd, quals, nextReadID, flag);
+ }
+ r.setPairnum(pairnum);
+ if(verbose){System.err.println("Made read:\t"+(r.length()>1000 ? r.id : r.toString()));}
+ return r;
+ }
+
+ /**
+  * Reads the next header line from the buffer, refilling the buffer when the line is incomplete.
+  * Bytes before the next '>' are skipped. SOH (0x1) bytes inside the header are replaced by '>'.
+  * When Shared.TRIM_READ_COMMENTS is set, the header is cut at the first whitespace.
+  * On return bstart points just past the header's terminating byte.
+  * @return The header text without its leading '>', or null if no more data could be read.
+  */
+ private String nextHeader(){
+     if(verbose){System.err.println("Called nextHeader(); bstart="+bstart+"; bstop="+bstop);}
+     assert(bstart>=bstop || buffer[bstart]=='>' || buffer[bstart]<=slashr) : bstart+", "+bstop+", '"+(char)buffer[bstart]+"'";
+     while(bstart<bstop && buffer[bstart]!='>'){bstart++;}
+     int x=bstart;
+     assert(bstart>=bstop || buffer[x]=='>') : bstart+", "+bstop+", '"+(char)buffer[x]+"'";
+     while(x<bstop && buffer[x]>slashr){x++;}
+     if(verbose){System.err.println("A: x="+x);}
+     if(x<bstop && (buffer[x]<0x2 || buffer[x]==tab)){ //Handle deprecated 'SOH' symbol and tab
+         while(x<bstop && (buffer[x]>slashr || buffer[x]<0x2 || buffer[x]==tab)){
+             if(buffer[x]==0x1){buffer[x]=carrot;}
+             x++;
+         }
+     }
+     if(verbose){System.err.println("B: x="+x);}
+     if(x>=bstop){
+         int fb=fillBuffer();
+         if(verbose){System.err.println("B: fb="+fb);}
+         if(fb<1){
+             if(verbose){System.err.println("Returning null from nextHeader()");}
+             return null;
+         }
+         //fillBuffer() normally leaves bstart at 0, but advances it past leading ';' comment lines.
+         //Rescan from bstart rather than 0 so the header is not taken from inside a comment.
+         x=bstart;
+         assert(bstart<bstop && buffer[x]=='>') : "Improperly formatted fasta file; expecting '>' symbol.\n"+
+             (buffer[x]=='@' ? "If this is a fastq file, please rename it with a '.fastq' extension.\n" : "")+
+             bstart+", "+bstop+", "+(int)buffer[x]+", "+(char)buffer[x]; //Note: This assertion will fire if a fasta file starts with a newline.
+         while(x<bstop && buffer[x]>slashr){x++;}
+         if(x<bstop && (buffer[x]<0x2 || buffer[x]==tab)){ //Handle deprecated 'SOH' symbol and tab
+             while(x<bstop && (buffer[x]>slashr || buffer[x]<0x2 || buffer[x]==tab)){
+                 if(buffer[x]==0x1){buffer[x]=carrot;}
+                 x++;
+             }
+         }
+     }
+     if(verbose){System.err.println("C: x="+x);}
+     assert(x>=bstop || buffer[x]<=slashr);
+
+     int start=bstart+1, stop=x; //start skips the leading '>'
+     if(Shared.TRIM_READ_COMMENTS){
+         for(int i=start; i<stop; i++){
+             if(Character.isWhitespace(buffer[i])){
+                 stop=i;
+                 break;
+             }
+         }
+     }
+
+     String s=stop>start ? new String(buffer, start, stop-start) : "";
+     if(verbose){System.err.println("Fetched header: '"+s+"'");}
+     bstart=x+1;
+
+     return s;
+ }
+
+ /**
+ * Extracts up to maxLen bases from the buffer starting at bstart, stopping at the next '>'.
+ * Only bytes greater than '\r' count as bases. bstart is advanced past the consumed bytes.
+ * @return The bases, or null if fewer than minLen were found or the buffer could not be filled.
+ */
+ private byte[] nextBases(){
+ if(verbose){System.err.println("Called nextBases()");}
+ assert(open) : "Attempting to read from a closed file. Current header: "+header;
+ if(bstart>=bstop){
+ int bytes=fillBuffer();
+ if(bytes<1 || !open){return null;}
+ }
+ int x=bstart;
+ int bases=0;
+
+ // The buffer is positioned directly on a '>': the current header has no sequence.
+ if(!(x>=bstop || buffer[x]!='>')){
+ if(WARN_IF_NO_SEQUENCE){
+ synchronized(getClass()){
+ System.err.println("Warning: A fasta header with no sequence was encountered.");
+ WARN_IF_NO_SEQUENCE=false;
+ }
+ }
+ assert(!ABORT_IF_NO_SEQUENCE) : "\n<START>"+new String(buffer, 0, Tools.min(x+1, buffer.length))+"<STOP>\n";
+ }
+
+// assert(x>=bstop || buffer[x]!='>') :
+// "A fasta header with no sequence was encountered. To discard such headers, please re-run with the -da flag.";
+ //"\n<START>"+new String(buffer, 0, Tools.min(x+1, buffer.length))+"<STOP>\n";
+
+ // Count bases until '>', maxLen, or the end of buffered data.
+ while(x<bstop && bases<maxLen && buffer[x]!='>'){
+ while(x<bstop && bases<maxLen && buffer[x]!='>'){
+ if(buffer[x]>slashr){bases++;}
+ x++;
+ }
+ assert(x==bstop || buffer[x]=='>' || bases==maxLen);
+ if(x==bstop && bases<maxLen){
+ int fb=fillBuffer();
+ if(fb<1){
+ x=bstop;
+ if(verbose){System.err.println("Broke loop when fb="+fb+"; bstart="+bstart+", bstop="+bstop);}
+ break;
+ }
+ // The count restarts from bstart after a successful refill.
+ x=bstart;
+ bases=0;
+ }
+ }
+
+ if(bases<minLen){
+ assert(open) : "Attempting to read from a closed file. Current header: "+header;
+ bstart=x;
+ if(verbose){System.err.println("Fetched "+bases+" bases; returning null. bstart="+bstart+", bstop="+bstop/*+"\n"+new String(buffer)*/);}
+ return null;
+ }
+
+ byte[] r=new byte[bases];
+
+// if(Read.TO_UPPER_CASE){
+// for(int i=bstart, j=0; j<bases; i++){
+// assert(i<x);
+// byte b=buffer[i];
+// // if(verbose){System.err.println("grabbed base "+(char)b+" = "+b);}
+// if(b>slashr){
+// r[j]=(b<91 ? b : (byte)(b-32));//Convert to upper case
+// // if(verbose){System.err.println("set to base "+(char)r[j]+" = "+r[j]);}
+// j++;
+// }
+// }
+// }else{
+ // Copy the counted bases, skipping bytes that are not greater than '\r'.
+ for(int i=bstart, j=0; j<bases; i++){
+ assert(i<x);
+ byte b=buffer[i];
+ if(b>slashr){
+ r[j]=b;
+ j++;
+ }
+ }
+// }
+
+ if(verbose){System.err.println("Fetched "+bases+" bases, open="+open+":\n'"+(r.length>1000 ? "*LONG*" : new String(r))+"'");}
+
+ bstart=x;
+ return r;
+ }
+
+ /**
+  * Fills buffer. Ensures that result will extend to the next caret or EOF.
+  * Unconsumed bytes (bstart..bstop) are first shifted to the start of the buffer, and the
+  * buffer is doubled whenever it is full. If no header has been read yet, leading
+  * ';'-delimited comment lines are skipped by advancing bstart.
+  * An IOException is treated as end of input and recorded in errorState.
+  * @return Number of bytes read from the stream by this call.
+  */
+ private final int fillBuffer(){
+     assert(open);
+     if(!open){return 0;}
+     if(verbose){System.err.println("fillBuffer() : bstart="+bstart+", bstop="+bstop);}
+     if(bstart<bstop){ //Shift end bytes to beginning
+         if(bstart>0){
+             int extra=bstop-bstart;
+             for(int i=0; i<extra; i++, bstart++){
+                 buffer[i]=buffer[bstart];
+             }
+             bstop=extra;
+         }
+     }else{
+         bstop=0;
+     }
+     if(verbose){System.err.println("After shift : bstart="+bstart+", bstop="+bstop);}
+
+     bstart=0;
+
+     int len=bstop;
+     int r=-1;
+     int sum=0;
+     while(len==bstop){//hit end of input without encountering a caret
+         if(bstop==buffer.length){
+             buffer=Arrays.copyOf(buffer, buffer.length*2);
+             if(verbose){System.err.println("Resized to "+buffer.length);}
+         }
+         r=-1; //Never reuse the byte count of a previous iteration if this read fails
+         try {
+             r=ins.read(buffer, bstop, buffer.length-bstop);
+         } catch (IOException e) {
+             e.printStackTrace();
+             errorState=true;
+         }
+         if(r>0){
+             sum+=r;
+             bstop=bstop+r;
+             if(bstop>0 && len==0){len=1;} //Skip the caret at position 0; it starts the current record
+             while(len<bstop && buffer[len]!=carrot){len++;}
+         }else{
+             len=bstop;
+             break;
+         }
+     }
+
+     //Skip ';'-delimited comments
+     if(header==null && bstop>bstart && buffer[bstart]==';'){
+         if(sum==0){return sum;}
+         int lastsemi=bstart;
+         assert(nextReadID==0);
+         assert(bstart==0);
+         while(bstop>bstart && buffer[bstart]==';'){
+             while(bstop>bstart && (buffer[bstart]>slashr || buffer[bstart]<0x2 || buffer[bstart]==tab)){bstart++;}
+             while(bstop>bstart && buffer[bstart]<=slashr){bstart++;}
+         }
+         if(bstart>=bstop){ //Overflowed buffer with comments; recur
+             bstart=lastsemi;
+             return fillBuffer();
+         }
+     }
+
+     assert(r==-1 || buffer[len]=='>');
+     if(verbose){System.err.println("After filling: bstart="+bstart+", bstop="+bstop+", len="+len+", r="+r+", sum="+sum);}
+     return sum;
+ }
+
+ /**
+ * Opens the input stream for 'name' and resets the buffer positions.
+ * @return The newly opened stream (also stored in ins).
+ * @throws RuntimeException if the stream is already open.
+ */
+ private final InputStream open(){
+ if(open){
+ throw new RuntimeException("Attempt to open already-opened fasta file "+name);
+ }
+ open=true;
+ ins=ReadWrite.getInputStream(name, true, allowSubprocess);
+ bstart=0;
+ bstop=0;
+// lasteol=-1;
+ return ins;
+ }
+
+ /** @return true while the input stream is open (set by open(), cleared by close()). */
+ public boolean isOpen(){return open;}
+
+ /**
+  * Validates the static read-length settings.
+  * @return true when the settings are consistent.
+  * @throws RuntimeException describing the first inconsistent setting found.
+  */
+ public static final boolean settingsOK(){
+     if(MIN_READ_LEN>=Integer.MAX_VALUE-1){
+         throw new RuntimeException("Minimum FASTA read length is too long: "+MIN_READ_LEN);
+     }else if(MIN_READ_LEN<1){
+         throw new RuntimeException("Minimum FASTA read length is too short: "+MIN_READ_LEN);
+     }
+
+     if(!SPLIT_READS){return true;}
+
+     if(TARGET_READ_LEN<1){
+         throw new RuntimeException("Target FASTA read length is too short: "+TARGET_READ_LEN);
+     }else if(MIN_READ_LEN>TARGET_READ_LEN){
+         throw new RuntimeException("Minimum FASTA read length is longer than maximum read length: "+MIN_READ_LEN+">"+TARGET_READ_LEN);
+     }
+     // Every invalid combination has thrown above, so the settings are valid here.
+     return true;
+ }
+
+ public final String name;
+
+ private ArrayList<Read> currentList=null;
+ private String header=null;
+
+ private boolean open=false;
+// private boolean hitmax=false; //Indicates that the current 'read' has more pieces to come
+ private byte[] buffer=new byte[16384];
+ private int bstart=0, bstop=0;
+ public InputStream ins;
+
+ private long consumed=0;
+ private long nextReadID=0;
+ private int nextReadIndex=0;
+ private int currentSection=0;
+
+ public final boolean allowSubprocess;
+ public final boolean interleaved;
+ public final boolean amino;
+ public final int flag;
+ private final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+ private final long MAX_DATA;
+ private final int maxLen, minLen;
+
+
+ public static boolean verbose=false;
+ private final static byte slashr='\r', slashn='\n', carrot='>', space=' ', tab='\t';
+
+ public static boolean SPLIT_READS=false;
+ public static int TARGET_READ_LEN=500;
+ public static int MIN_READ_LEN=1;//40;
+ public static boolean FAKE_QUALITY=false;
+ public static boolean FORCE_SECTION_NAME=false;
+ public static boolean WARN_IF_NO_SEQUENCE=true;
+ public static boolean ABORT_IF_NO_SEQUENCE=false;
+
+}
diff --git a/current/stream/FastaReadInputStream2.java b/current/stream/FastaReadInputStream2.java
new file mode 100755
index 0000000..4c88a3d
--- /dev/null
+++ b/current/stream/FastaReadInputStream2.java
@@ -0,0 +1,302 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Shared;
+
+import dna.Data;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+
+public class FastaReadInputStream2 extends ReadInputStream {
+
+ /**
+  * Test driver: prints the first few reads of a fasta file.
+  * @param args args[0] is the fasta file name.
+  */
+ public static void main(String[] args){
+     final FastaReadInputStream2 stream=new FastaReadInputStream2(args[0], true);
+
+     int printed=0;
+     Read current=stream.next();
+     while(current!=null){
+         System.out.println(current.toText(false));
+         current=stream.next();
+         final boolean limitReached=(printed>3);
+         printed++;
+         if(limitReached){break;}
+     }
+ }
+
+ /**
+ * Opens fname as a ByteFile. Prints a warning to stderr when the file is neither
+ * recognized as fasta nor stdio. This stream is never interleaved.
+ * @param fname Name of the fasta input.
+ * @param allowSubprocess_ Passed through to FileFormat.testInput.
+ */
+ public FastaReadInputStream2(String fname, boolean allowSubprocess_){
+
+ FileFormat ff=FileFormat.testInput(fname, FileFormat.FASTA, null, allowSubprocess_, false);
+
+ if(!ff.fasta() && !ff.stdio()){
+ System.err.println("Warning: Did not find expected fasta file extension for filename "+fname);
+ }
+
+ tf=ByteFile.makeByteFile(ff, false);
+ interleaved=false;
+
+ }
+
+ /** No-op; this stream does nothing on start. */
+ @Override
+ public void start() {
+// if(cris!=null){cris.start();}
+ }
+
+
+ /**
+  * Refills the buffer if it is exhausted and the file is still open.
+  * @return true if at least one unread Read remains in the buffer.
+  */
+ @Override
+ public boolean hasMore() {
+     final boolean exhausted=(buffer==null || next>=buffer.size());
+     if(exhausted){
+         if(!tf.isOpen()){
+             assert(generated>0) : "Was the file empty?";
+         }else{
+             fillBuffer();
+         }
+     }
+     if(buffer==null){return false;}
+     return next<buffer.size();
+ }
+
+ /**
+  * Returns the next buffered Read and clears its slot in the buffer.
+  * @return The next Read, or null when hasMore() is false.
+  */
+ @Override
+ public Read next() {
+     if(hasMore()){
+         // set() returns the element previously stored at that index
+         final Read current=buffer.set(next, null);
+         next++;
+         consumed++;
+         return current;
+     }
+     if(verbose){System.err.println("hasMore() returned false; buffer="+(buffer==null ? null : buffer.size())+", next="+next+", consumed="+consumed);}
+     return null;
+ }
+
+ /**
+  * Blockwise access: hands the whole current buffer to the caller.
+  * @return The buffered list of Reads, or null if it is missing or empty.
+  * @throws RuntimeException if next() was already used on the current buffer.
+  */
+ @Override
+ public synchronized ArrayList<Read> nextList() {
+     if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+     final boolean needFill=(buffer==null || next>=buffer.size());
+     if(needFill){fillBuffer();}
+     final ArrayList<Read> batch=buffer;
+     buffer=null;
+     if(batch==null || batch.isEmpty()){return null;}
+     consumed+=batch.size();
+     return batch;
+ }
+
+ /**
+ * Replaces the buffer with up to BUF_LEN new Reads, advances nextReadID and generated,
+ * and closes the file when fewer than BUF_LEN Reads were produced.
+ */
+ private synchronized void fillBuffer(){
+ if(verbose){System.err.println("Filling buffer. buffer="+(buffer==null ? null : buffer.size()));}
+ assert(buffer==null || next>=buffer.size());
+
+ buffer=null;
+ next=0;
+
+ buffer=toReadList(tf, BUF_LEN, nextReadID, interleaved, headerA);
+
+ if(verbose){System.err.println("Filled buffer. buffer="+(buffer==null ? null : buffer.size()));}
+
+ // NOTE(review): toReadList returns null when 'finished' is set, which would throw here;
+ // no code in this class appears to set finished=true - confirm.
+ nextReadID+=buffer.size();
+ if(buffer.size()<BUF_LEN){
+ if(verbose){System.err.println("Closing tf");}
+ tf.close();
+ }
+
+ generated+=buffer.size();
+ if(verbose){System.err.println("generated="+generated);}
+ }
+
+ private ArrayList<Read> toReadList(ByteFile tf, int maxReadsToReturn, long numericID, boolean interleaved, String[] headerA){
+ if(finished){return null;}
+ if(verbose){System.err.println("FastaRIS fetching a list.");}
+
+ ArrayList<Read> list=new ArrayList<Read>(Data.min(16384, maxReadsToReturn));
+
+ int added=0;
+
+ Read prev=null;
+ if(verbose){System.err.println("added="+added+", max="+maxReadsToReturn);}
+ while(added<maxReadsToReturn){
+ Read r=makeNextRead(tf, maxReadsToReturn, numericID, headerA);
+ if(verbose){System.err.println("Made "+r);}
+ if(r==null){
+ if(verbose){System.err.println("makeNextRead returned null.");}
+ break;
+ }
+ if(interleaved){
+ if(prev==null){prev=r;}
+ else{
+ prev.mate=r;
+ r.mate=prev;
+ list.add(prev);
+ added++;
+ numericID++;
+ prev=null;
+ }
+ }else{
+ list.add(r);
+ added++;
+ numericID++;
+ }
+ }
+
+ assert(list.size()<=maxReadsToReturn);
+ if(verbose){System.err.println("FastaRIS returning a list. Size="+list.size());}
+ return list;
+ }
+
+ /**
+ * Produces the next Read from tf. Sequence lines are accumulated until TARGET_READ_LEN
+ * characters are collected, or until the next header or end of file is reached.
+ * Sequences shorter than MIN_READ_LEN at a record boundary are discarded.
+ * NOTE(review): currentLine[0] is read without a length check, so a zero-length line
+ * would presumably throw ArrayIndexOutOfBoundsException - confirm against ByteFile.nextLine().
+ * @param tf Source of lines.
+ * @param maxReadsToReturn Not used in this method.
+ * @param numericID Numeric ID given to the Read.
+ * @param headerA Single-element holder for the current header line (including '>').
+ * @return The next Read, or null at end of input.
+ */
+ private Read makeNextRead(ByteFile tf, int maxReadsToReturn, long numericID, String[] headerA){
+ if(finished){
+ if(verbose){System.err.println("Returning null because finished.");}
+ return null;
+ }
+ assert(currentLine==null || currentLine[0]!=carrot);
+
+ // Advance to the first sequence line, recording any header lines passed on the way.
+ while(currentLine==null){
+ currentLine=tf.nextLine();
+ if(currentLine==null){
+ if(verbose){System.err.println("Returning null because tf.nextLine()==null: A");}
+ return null;
+ }
+ if(currentLine[0]==carrot){
+ headerA[0]=new String(currentLine);
+ currentLoc=0;
+ currentSection=0;
+ currentLine=null;
+ }
+ }
+ if(verbose){System.err.println("currentLine="+new String(currentLine));}
+
+ assert(currentLine==null || currentLine[0]!=carrot);
+
+ StringBuilder sb=new StringBuilder();
+ Read r=null;
+ while(r==null){
+ if(verbose){System.err.println("r==null, looping; current="+new String(currentLine)+"\nsb="+sb);}
+ // Take the whole line when it fits (or when not splitting); otherwise take characters up to TARGET_READ_LEN.
+ if(!SPLIT_READS || (currentLoc==0 && (currentLine.length<=(TARGET_READ_LEN-sb.length())))){
+// sb.append(currentLine);
+ for(byte b : currentLine){sb.append((char)b);}
+ currentLoc=currentLine.length;
+ }else{
+ while(sb.length()<TARGET_READ_LEN && currentLoc<currentLine.length){
+ sb.append((char)currentLine[currentLoc]);
+ currentLoc++;
+ }
+ }
+ assert(currentLine==null || currentLine[0]!=carrot);
+ assert(sb.length()<=TARGET_READ_LEN) : sb.length()+"\n"+sb;
+
+ if(sb.length()==TARGET_READ_LEN){
+ // A full-length section: emit it and bump the section counter.
+ assert(currentLine==null || currentLine[0]!=carrot);
+ r=makeRead(sb, numericID);
+ currentSection++;
+ return r;
+ }else{
+ // The current line is used up; fetch the next one.
+ assert(currentLine==null || currentLine[0]!=carrot);
+ assert(currentLoc>=currentLine.length) : currentLoc+", "+currentLine.length+", "+
+ TARGET_READ_LEN+", "+sb.length()+"\n"+currentLine+"\n"+sb;
+ currentLine=null;
+ currentLoc=0;
+ while(currentLine==null){
+ currentLine=tf.nextLine();
+ // Record boundary (new header or end of file): emit what was gathered if it is long enough.
+ if(currentLine==null || currentLine[0]==carrot){
+ if(sb.length()>=MIN_READ_LEN){
+ if(verbose){System.err.println("Made read of length "+sb.length());}
+ r=makeRead(sb, numericID);
+ }else{
+ if(verbose){System.err.println("Read was too short at length "+sb.length()+"\n"+sb);}
+ sb.setLength(0);
+ }
+ if(verbose){System.err.println("headerA was "+headerA[0]);}
+ headerA[0]=(currentLine==null ? null : new String(currentLine));
+ currentLoc=0;
+ currentSection=0;
+// assert(false) : "'"+new String(currentLine)+"', "+headerA[0];
+ currentLine=null;
+ if(r!=null){
+ if(verbose){System.err.println("Returning read "+r);}
+ return r;
+ }
+ if(headerA[0]==null){
+ if(verbose){System.err.println("Returning null because tf.nextLine()==null: B");}
+ return null;
+ }
+ }
+ assert(currentLine==null || currentLine[0]!=carrot);
+ }
+ assert(currentLine==null || currentLine[0]!=carrot);
+ }
+ assert(currentLine==null || currentLine[0]!=carrot);
+ }
+ assert(currentLine==null || currentLine[0]!=carrot);
+ if(verbose){System.err.println("Returning null because loop exited (should be unreachable).");}
+ return null;
+ }
+
+ private Read makeRead(StringBuilder sb, long numericID){
+ byte[] quals=null;
+ byte[] bases=new byte[sb.length()];
+ if(FAKE_QUALITY){
+ quals=new byte[sb.length()];
+ Arrays.fill(quals, (byte)(30));
+ }
+ for(int i=0; i<bases.length; i++){
+ bases[i]=(byte)Character.toUpperCase(sb.charAt(i));
+ }
+ assert(bases[0]!=carrot) : new String(bases)+"\n"+numericID+"\n"+headerA[0];
+ String hd=(currentSection>0 ? headerA[0].substring(1)+"_"+currentSection : new String(headerA[0].substring(1)));
+// assert(currentSection==0);
+ Read r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, numericID);
+ return r;
+ }
+
+ /** Closes the underlying ByteFile and returns the result of its close() call. */
+ public boolean close(){
+ return tf.close();
+ }
+
+ /** Resets all counters and parsing state, discards the buffer, and resets the input file. */
+ @Override
+ public synchronized void restart() {
+ generated=0;
+ consumed=0;
+ next=0;
+ nextReadID=0;
+ buffer=null;
+
+ currentLine=null;
+ currentLoc=0;
+ currentSection=0;
+ finished=false;
+
+ tf.reset();
+ }
+
+ /** @return The interleaved flag (set to false by the constructor). */
+ @Override
+ public boolean paired() {
+ return interleaved;
+ }
+
+ private ArrayList<Read> buffer=null;
+ private int next=0;
+
+ private final ByteFile tf;
+ private final boolean interleaved;
+
+ private final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+
+ public long generated=0;
+ public long consumed=0;
+ private long nextReadID=0;
+
+ private final String[] headerA=new String[1];
+
+ public static boolean SPLIT_READS=true;
+ public static int TARGET_READ_LEN=500;
+ public static int MIN_READ_LEN=40;
+ public static int DEFAULT_WRAP=100;
+
+ public static boolean verbose=false;
+ public static boolean FAKE_QUALITY=false;
+
+ private byte[] currentLine=null;
+ private int currentLoc=0;
+ private int currentSection=0;
+ private boolean finished=false;
+ private final byte carrot='>';
+
+}
diff --git a/current/stream/FastqReadInputStream.java b/current/stream/FastqReadInputStream.java
new file mode 100755
index 0000000..55f000d
--- /dev/null
+++ b/current/stream/FastqReadInputStream.java
@@ -0,0 +1,164 @@
+package stream;
+
+import java.util.ArrayList;
+
+import align2.Shared;
+
+import dna.Data;
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+
+public class FastqReadInputStream extends ReadInputStream {
+
+ /** Prints the first read of the fastq file named by args[0]; prints nothing when no read is available. */
+ public static void main(String[] args){
+     if(args.length<1){throw new IllegalArgumentException("Usage: FastqReadInputStream <file.fastq>");}
+     FastqReadInputStream fris=new FastqReadInputStream(args[0], true);
+     Read r=fris.next();
+     if(r!=null){System.out.println(r.toText(false));} //next() returns null when hasMore() is false
+     fris.close();
+ }
+
+ public FastqReadInputStream(String fname, boolean allowSubprocess_){
+ this(FileFormat.testInput(fname, FileFormat.FASTQ, null, allowSubprocess_, false)); //Wraps the name in a FileFormat (FASTQ given as the format argument) and delegates.
+ }
+
+
+ public FastqReadInputStream(FileFormat ff){
+ if(verbose){System.err.println("FastqReadInputStream("+ff+")");}
+ //Remember whether input comes from standard input, and warn (without failing) when ff.fastq() is false.
+ stdin=ff.stdio();
+ if(!ff.fastq()){
+ System.err.println("Warning: Did not find expected fastq file extension for filename "+ff.name());
+ }
+ //With FASTQ.PARSE_CUSTOM set, the name is expected to split on '_' into 8 or 9 fields; otherwise a note is printed, and only when Data.WINDOWS is set.
+ if(FASTQ.PARSE_CUSTOM){
+ try {
+ String s[]=ff.name().split("_");
+// maxSnps=toNumber(s[3]);
+// maxInss=toNumber(s[4]);
+// maxDels=toNumber(s[5]);
+// maxSubs=toNumber(s[6]);
+
+// s=s[8].split("\\.");
+//
+// s=s[0].split("-");
+
+ if(s.length!=8 && s.length!=9){
+ if(Data.WINDOWS){System.err.println("Note: Filename indicates non-synthetic data, but FASTQ.PARSE_CUSTOM="+FASTQ.PARSE_CUSTOM);}
+ }
+
+// minChrom=Gene.toChromosome(s[0]);
+// maxChrom=Gene.toChromosome(s[1]);
+
+ } catch (Exception e) {
+ // Any exception while inspecting the name is deliberately swallowed; at most the note below is printed.
+ // e.printStackTrace();
+ if(Data.WINDOWS){System.err.println("Note: Filename indicates non-synthetic data, but FASTQ.PARSE_CUSTOM="+FASTQ.PARSE_CUSTOM);}
+ }
+ }
+// interleaved=false;
+ interleaved=(ff.stdio()) ? FASTQ.FORCE_INTERLEAVED : FASTQ.isInterleaved(ff.name(), false); //stdin takes the forced setting; otherwise FASTQ.isInterleaved decides
+ tf=ByteFile.makeByteFile(ff, false);
+// assert(false) : interleaved;
+ }
+
+ @Override
+ public void start() { //Intentionally empty: this stream has nothing to start.
+// if(cris!=null){cris.start();}
+ }
+
+
+ @Override
+ public boolean hasMore() {
+     //A refill is only attempted once the current buffer is absent or fully consumed.
+     final boolean drained=(buffer==null || next>=buffer.size());
+     if(drained && tf.isOpen()){
+         fillBuffer();
+     }else if(drained){
+         assert(generated>0) : "Was the file empty?";
+     }
+     return (buffer!=null && next<buffer.size());
+ }
+
+ @Override
+ public Read next() {
+     //hasMore() refills the buffer on demand; null signals that nothing is left.
+     if(!hasMore()){return null;}
+     final Read current=buffer.set(next, null); //take the read and clear its slot
+     next++; consumed++;
+     return current;
+ }
+
+ @Override
+ public synchronized ArrayList<Read> nextList() {
+     if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+     if(buffer==null || next>=buffer.size()){fillBuffer();}
+     //Hand the whole buffer to the caller; an empty buffer is reported as null.
+     final ArrayList<Read> batch=(buffer!=null && buffer.size()==0) ? null : buffer;
+     buffer=null;
+     if(batch!=null){consumed+=batch.size();}
+     return batch;
+ }
+
+ private synchronized void fillBuffer(){
+ //Replaces the buffer with the next batch of up to BUF_LEN reads; must only be called once the old buffer is used up.
+ assert(buffer==null || next>=buffer.size());
+
+ buffer=null;
+ next=0;
+
+ buffer=FASTQ.toReadList(tf, BUF_LEN, nextReadID, interleaved);
+ int bsize=(buffer==null ? 0 : buffer.size());
+ nextReadID+=bsize;
+ if(bsize<BUF_LEN){tf.close();} //a short batch is taken to mean the input is exhausted, so the file is closed
+
+ generated+=bsize;
+ if(buffer==null){
+ if(!errorState){ //report a null batch only once
+ errorState=true;
+ System.err.println("Null buffer in FastqReadInputStream.");
+ }
+ }
+ }
+
+ public boolean close(){ //Closes the file and returns the accumulated error flag.
+ if(verbose){System.err.println("Closing "+this.getClass().getName()+" for "+tf.name()+"; errorState="+errorState);}
+ errorState|=tf.close(); //a true result from tf.close() is folded into errorState
+ if(verbose){System.err.println("Closed "+this.getClass().getName()+" for "+tf.name()+"; errorState="+errorState);}
+ return errorState;
+ }
+
+ @Override
+ public synchronized void restart() { //Zeroes the counters, drops the buffer and resets the file; errorState is left as it is.
+ generated=0;
+ consumed=0;
+ next=0;
+ nextReadID=0;
+ buffer=null;
+ tf.reset();
+ }
+
+ @Override
+ public boolean paired() {return interleaved;} //paired exactly when the constructor decided the input is interleaved
+
+ /** Return true if this stream has detected an error, or if the static FASTQ.errorState() reports one */
+ public boolean errorState(){return errorState || FASTQ.errorState();}
+
+ private ArrayList<Read> buffer=null;
+ private int next=0; //index of the next unread element of buffer
+ //Underlying file, and the flag that paired() reports; both are set once in the constructor.
+ private final ByteFile tf;
+ private final boolean interleaved;
+ //BUF_LEN is the batch size requested by fillBuffer(); MAX_DATA is not referenced anywhere in this class.
+ private final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+ private final long MAX_DATA=Shared.READ_BUFFER_MAX_DATA; //TODO - lot of work for unlikely case of super-long fastq reads. Must be disabled for paired-ends.
+ //Counters; all three are zeroed by restart().
+ public long generated=0;
+ public long consumed=0;
+ private long nextReadID=0;
+
+ public final boolean stdin; //copied from ff.stdio() in the constructor
+ public static boolean verbose=false;
+
+}
diff --git a/current/stream/KillSwitch.java b/current/stream/KillSwitch.java
new file mode 100755
index 0000000..ad8185e
--- /dev/null
+++ b/current/stream/KillSwitch.java
@@ -0,0 +1,204 @@
+package stream;
+
+import java.lang.management.ManagementFactory;
+import java.lang.management.OperatingSystemMXBean;
+//import com.sun.management.OperatingSystemMXBean;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+
+/**
+ * Monitors CPU utilization to determine if the program has crashed.
+ * @author Brian Bushnell
+ * @date Feb 25, 2015
+ *
+ */
+public final class KillSwitch extends Thread {
+
+ /** Command-line entry point: args[0] is the time limit in seconds, args[1] the minimum load. */
+ public static void main(String[] args){
+     if(args.length<2){
+         throw new IllegalArgumentException("Usage: KillSwitch <seconds> <load>");
+     }
+     //Arguments beyond the first two were never interpreted (the old branch for them was empty) and are still ignored.
+     launch(Double.parseDouble(args[0]), Double.parseDouble(args[1]));
+ }
+
+ /**
+ * @param seconds Stored as maxSeconds: monitor() gives up once this long passes without the load exceeding minLoad
+ * @param load Stored as minLoad: a load average above this value pushes the deadline forward
+ */
+ private KillSwitch(double seconds, double load) {
+ maxSeconds=seconds;
+ minLoad=load;
+ }
+
+ public static boolean launch(){
+ return launch(600); //default time limit: 600 seconds
+ }
+
+ public static boolean launch(double seconds){
+ return launch(seconds, 0.002); //default minimum load: 0.002
+ }
+
+ public static synchronized boolean launch(double seconds, double load){
+     if(count>0){return false;} //only one monitor thread is ever started; later calls do nothing and return false
+     count++; //record the launch - count was never incremented before, so the guard above could not fire
+     (ks=new KillSwitch(seconds, load)).start();
+     return true;
+ }
+
+ @Override
+ public void run(){
+ //Thread body: watch the load, then halt the JVM if monitoring failed or the kill flag was set.
+ boolean success=monitor();
+// System.err.println("success: "+success);
+ if(!success || killFlag.get()){
+ if(!suppressMessages){
+ System.err.println("Process has decided it has crashed, and will abort.\n" +
+ "If this decision was incorrect, please re-run with the flag 'monitor=f'");
+ }
+ kill0();
+ }
+ }
+
+ private boolean monitor(){
+ //Polls the system load average every 500 ms. Returns false once the deadline passes without sufficient load, true on shutdown or when unsupported.
+ final OperatingSystemMXBean bean=ManagementFactory.getOperatingSystemMXBean();
+ if(bean.getSystemLoadAverage()<0){ //a negative load average is treated as "not available on this OS"
+ System.err.println("This OS does not support monitor, so monitoring was disabled.");
+ return true;
+ }
+ //buffer is the allowed quiet period in milliseconds; stop is the current deadline.
+ final long start=System.currentTimeMillis();
+ final long buffer=(long)(1+maxSeconds*1000);
+ long stop=start+buffer;
+// System.err.println("start="+start+", stop="+stop+", buffer="+buffer);
+// System.err.println("shutdownFlag.get()="+shutdownFlag.get());
+ while(!shutdownFlag.get()){
+ try {
+ sleep(500);
+ } catch (InterruptedException e) {
+ // An interrupt is only logged; the loop then carries on polling.
+ e.printStackTrace();
+ }
+ final double load=bean.getSystemLoadAverage();
+ final long time=System.currentTimeMillis();
+ if(load>minLoad){stop=time+buffer;} //enough load: push the deadline forward
+ if(time>stop){return false;} //deadline passed with the load at or below minLoad throughout
+// System.err.println("stop-time="+(stop-time)+", load="+load);
+ }
+// System.err.println("shutdownFlag.get()="+shutdownFlag.get());
+ return true;
+ }
+
+ public static void kill(String s){ //Prints a stack trace carrying the given message, then halts the JVM.
+ Exception e=new Exception(s);
+ e.printStackTrace();
+ kill0();
+ }
+
+ public static void kill(){ //Prints a stack trace with the message "Aborting.", then halts the JVM.
+ Exception e=new Exception("Aborting.");
+ e.printStackTrace();
+ kill0();
+ }
+
+ public static void killSilent(){ //Halts the JVM without printing anything.
+ kill0();
+ }
+
+ private static void kill0(){
+ Runtime.getRuntime().halt(1); //halt() ends the JVM immediately with status 1
+ }
+
+ public static void shutdown(){
+ shutdownFlag.set(true); //makes the polling loop in monitor() exit and return true
+ }
+
+ public static void setKillFlag(){
+ killFlag.set(true); //run() checks this flag after monitor() returns and halts the JVM if it is set
+ }
+
+ private final double maxSeconds; //allowed quiet period, converted to milliseconds in monitor()
+ private final double minLoad; //the load average must exceed this to count as activity
+ //Static state shared by the whole process.
+ private static AtomicBoolean shutdownFlag=new AtomicBoolean(false);
+ private static AtomicBoolean killFlag=new AtomicBoolean(false);
+ private static int count=0; //consulted by launch(double, double) to refuse a further launch
+ private static KillSwitch ks;
+ private static boolean suppressMessages=false; //when true, run() skips its abort message
+
+
+
+
+ /** Allocates an AtomicIntegerArray of the given length; an OutOfMemoryError is routed to memKill. */
+ public static final AtomicIntegerArray allocAtomicInt(int len){
+     try {
+         return new AtomicIntegerArray(len);
+     } catch (OutOfMemoryError oom) {
+         memKill(oom);
+     }
+     return null; //only reached if memKill returns
+ }
+
+ /** Allocates a long array of the given length; an OutOfMemoryError is routed to memKill. */
+ public static final long[] allocLong1D(int len){
+     try {
+         return new long[len];
+     } catch (OutOfMemoryError oom) {
+         memKill(oom);
+     }
+     return null; //only reached if memKill returns
+ }
+
+ /** Allocates an int array of the given length; an OutOfMemoryError is routed to memKill. */
+ public static final int[] allocInt1D(int len){
+     try {
+         return new int[len];
+     } catch (OutOfMemoryError oom) {
+         memKill(oom);
+     }
+     return null; //only reached if memKill returns
+ }
+
+ /** Allocates a byte array of the given length; an OutOfMemoryError is routed to memKill. */
+ public static final byte[] allocByte1D(int len){
+     try {
+         return new byte[len];
+     } catch (OutOfMemoryError oom) {
+         memKill(oom);
+     }
+     return null; //only reached if memKill returns
+ }
+
+ /** Allocates a char array of the given length; an OutOfMemoryError is routed to memKill. */
+ public static final char[] allocChar1D(int len){
+     try {
+         return new char[len];
+     } catch (OutOfMemoryError oom) {
+         memKill(oom);
+     }
+     return null; //only reached if memKill returns
+ }
+
+ /** Allocates only the outer array of an int[][] (rows stay null); an OutOfMemoryError is routed to memKill. */
+ public static final int[][] allocInt2D(int len){
+     try {
+         return new int[len][];
+     } catch (OutOfMemoryError oom) {
+         memKill(oom);
+     }
+     return null; //only reached if memKill returns
+ }
+
+ public static final void memKill(OutOfMemoryError e){ //Prints the error plus a hint about -Xmx, then halts the JVM.
+ synchronized(MemKillMessage){ //the lock keeps reports from several threads from interleaving
+ e.printStackTrace();
+ System.err.println(MemKillMessage);
+ killSilent();
+ }
+ }
+
+ private final static String MemKillMessage=new String("\nThis program ran out of memory. Try increasing the -Xmx flag and setting prealloc."); //doubles as the lock object in memKill; new String(...) yields an instance distinct from the interned literal
+
+}
diff --git a/current/stream/MultiCros.java b/current/stream/MultiCros.java
new file mode 100755
index 0000000..e331616
--- /dev/null
+++ b/current/stream/MultiCros.java
@@ -0,0 +1,190 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * Allows output of reads to multiple different output streams.
+ * @author Brian Bushnell
+ * @date Apr 12, 2015
+ *
+ */
+public class MultiCros {
+
+ /** Test driver: args[0]=input file, args[1]=output pattern containing '%', remaining args=names under which every read is added. */
+ public static void main(String[] args){
+     String in=args[0];
+     String pattern=args[1];
+     ArrayList<String> names=new ArrayList<String>();
+     for(int i=2; i<args.length; i++){
+         names.add(args[i]);
+     }
+     final int buff=Tools.max(16, 2*Shared.threads());
+     MultiCros mcros=new MultiCros(pattern, null, false, false, false, false, false, FileFormat.FASTQ, buff);
+     ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1, true, false, in);
+     cris.start();
+     ListNum<Read> ln=cris.nextList();
+     ArrayList<Read> reads=(ln!=null ? ln.list : null);
+     ArrayListSet als=new ArrayListSet(false);
+     while(reads!=null && reads.size()>0){
+         for(Read r1 : reads){
+             als.add(r1, names);
+         }
+         cris.returnList(ln.id, ln.list.isEmpty());
+         if(mcros!=null){mcros.add(als, ln.id);}
+         ln=cris.nextList();
+         reads=(ln!=null ? ln.list : null);
+     }
+     //The loop also ends when nextList() yields null (or a null list), so the last batch is only handled if it exists.
+     if(ln!=null){
+         cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+         mcros.add(als, ln.id);
+     }
+     ReadWrite.closeStreams(cris);
+     ReadWrite.closeStreams(mcros);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Stores the settings. Each pattern must contain '%'; with no second pattern, a '#' in the first expands to "1" and "2". */
+ public MultiCros(String pattern1_, String pattern2_,
+ boolean ordered_, boolean overwrite_, boolean append_, boolean allowSubprocess_, boolean useSharedHeader_, int defaultFormat_, int maxSize_){
+     assert(pattern1_!=null && pattern1_.indexOf('%')>=0);
+     assert(pattern2_==null || pattern2_.indexOf('%')>=0); //previously re-tested pattern1_, so pattern2_ was never validated
+     if(pattern2_==null && pattern1_.indexOf('#')>=0){
+         pattern1=pattern1_.replaceFirst("#", "1");
+         pattern2=pattern1_.replaceFirst("#", "2");
+     }else{
+         pattern1=pattern1_;
+         pattern2=pattern2_;
+     }
+     ORDERED=ordered_;
+     overwrite= overwrite_;
+     append=append_;
+     allowSubprocess=allowSubprocess_;
+     useSharedHeader=useSharedHeader_;
+
+     defaultFormat=defaultFormat_;
+     maxSize=maxSize_;
+
+     streamList=new ArrayList<ConcurrentReadOutputStream>();
+     streamMap=new LinkedHashMap<String, ConcurrentReadOutputStream>();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /** For every name in the set, takes that name's list via getAndClear and forwards it to the stream of the same name. */
+ public void add(ArrayListSet set, long listnum){
+     for(String name : set.getNames()){
+         final ArrayList<Read> reads=set.getAndClear(name);
+         if(reads==null){continue;} //nothing stored under this name
+         add(reads, listnum, name);
+     }
+ }
+
+ public void add(ArrayList<Read> list, long listnum, String name){ //Sends one list to the stream registered under name.
+ ConcurrentReadOutputStream ros=getStream(name); //the stream is created and started on first use
+ ros.add(list, listnum);
+ }
+
+ public void close(){
+ for(ConcurrentReadOutputStream cros : streamList){cros.close();} //closes every stream created so far
+ }
+
+ public void join(){
+ for(ConcurrentReadOutputStream cros : streamList){cros.join();} //joins every stream created so far, in creation order
+ }
+
+ public void resetNextListID(){
+ for(ConcurrentReadOutputStream cros : streamList){cros.resetNextListID();} //forwards the call to every stream created so far
+ }
+
+ public String fname(){return pattern1;} //returns the first pattern itself, not a concrete file name
+
+ /** Return true if this stream has detected an error */
+ public boolean errorState(){
+ boolean b=errorState;
+ for(ConcurrentReadOutputStream cros : streamList){
+ b=b&&cros.errorState();
+ }
+ return b;
+ }
+
+ public boolean finishedSuccessfully(){ //True only if every stream created so far reports success (trivially true when none exist).
+ boolean b=true;
+ for(ConcurrentReadOutputStream cros : streamList){
+ b=b&&cros.finishedSuccessfully(); //short-circuits: streams after the first failure are not queried
+ }
+ return b;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Builds the output stream for one name by substituting the name for the first '%' in each pattern. */
+ private ConcurrentReadOutputStream makeStream(String name){
+     final String quoted=java.util.regex.Matcher.quoteReplacement(name); //replaceFirst gives '$' and '\' in its replacement a special meaning
+     final String s1=pattern1.replaceFirst("%", quoted);
+     final String s2=(pattern2==null ? null : pattern2.replaceFirst("%", quoted));
+     final FileFormat ff1=FileFormat.testOutput(s1, defaultFormat, null, allowSubprocess, overwrite, append, ORDERED);
+     final FileFormat ff2=FileFormat.testOutput(s2, defaultFormat, null, allowSubprocess, overwrite, append, ORDERED);
+     return ConcurrentReadOutputStream.getStream(ff1, ff2, maxSize, null, useSharedHeader);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Getters ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Returns the stream registered under name, creating and starting it on first use. The whole lookup runs under the streamMap lock, because LinkedHashMap does not support an unlocked get racing a put.
+  */
+ public ConcurrentReadOutputStream getStream(String name){
+     synchronized(streamMap){
+         ConcurrentReadOutputStream ros=streamMap.get(name);
+         if(ros==null){
+             ros=makeStream(name);
+             ros.start();
+             streamList.add(ros);
+             streamMap.put(name, ros);
+         }
+         return ros;
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public final String pattern1, pattern2; //file-name patterns; makeStream() replaces the first '%' with the stream name
+ public final ArrayList<ConcurrentReadOutputStream> streamList; //every stream created so far, in creation order
+ public final LinkedHashMap<String, ConcurrentReadOutputStream> streamMap; //the same streams keyed by name; also the lock used in getStream()
+ public final boolean ORDERED;
+ //errorState is never assigned after initialisation in this class; started is neither read nor written by any method here.
+ boolean errorState=false;
+ boolean started=false;
+ final boolean overwrite;
+ final boolean append;
+ final boolean allowSubprocess;
+ final int defaultFormat;
+ final int maxSize;
+ final boolean useSharedHeader;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean verbose=false; //not read by any method of this class
+
+}
diff --git a/current/stream/RTextInputStream.java b/current/stream/RTextInputStream.java
new file mode 100755
index 0000000..a94c2df
--- /dev/null
+++ b/current/stream/RTextInputStream.java
@@ -0,0 +1,301 @@
+package stream;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * This class allows multiple files as input.
+ * These files are synchronized, so a read will be created by merging the sitescores from the same line of each file.
+ * @author Brian Bushnell
+ * @date Jul 16, 2013
+ *
+ */
+public class RTextInputStream extends ReadInputStream {
+
+ /** Prints every read from the given files to stdout. */
+ public static void main(String[] args){
+     //A negative limit means "no limit"; the former 0 asked the constructor for a stream of zero reads.
+     RTextInputStream rtis=new RTextInputStream(args, -1);
+     ArrayList<Read> list=rtis.nextList();
+     while(list!=null){
+         for(Read r : list){System.out.println(r.toText(true));}
+         list=rtis.nextList();
+     }
+ }
+
+ public RTextInputStream(FileFormat ff1, FileFormat ff2, long crisReadLimit){
+ this(ff1.name(), (ff2==null ? null : ff2.name()), crisReadLimit); //only the file names are taken from the FileFormat objects
+ }
+
+ public RTextInputStream(String fname1, String fname2, long crisReadLimit){
+ this(new String[] {fname1}, (fname2==null || "null".equalsIgnoreCase(fname2)) ? null : new String[] {fname2}, crisReadLimit); //a null fname2 or the literal text "null" means there is no mate file
+ assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; //runs only after the delegated constructor has finished
+ }
+ public RTextInputStream(String[] fnames_, long crisReadLimit){this(fnames_, null, crisReadLimit);} //no mate files
+
+ public RTextInputStream(String[] fnames_, String[] mate_fnames_, long crisReadLimit){
+ fnames=fnames_;
+ textfiles=new TextFile[fnames.length]; //one TextFile per input name
+ for(int i=0; i<textfiles.length; i++){
+ textfiles[i]=new TextFile(fnames[i], true, false);
+ }
+ //A negative limit means unlimited; a limit of exactly 0 prints a warning and trips an assertion.
+ readLimit=(crisReadLimit<0 ? Long.MAX_VALUE : crisReadLimit);
+ if(readLimit==0){
+ System.err.println("Warning - created a read stream for 0 reads.");
+ assert(false);
+ }
+ interleaved=(mate_fnames_!=null ? false :
+ (!FASTQ.TEST_INTERLEAVED || textfiles[0].is==System.in) ? FASTQ.FORCE_INTERLEAVED : isInterleaved(fnames[0])); //never interleaved when mate files are given
+
+// assert(false) : (mate_fnames_!=null)+", "+(textfiles[0].is==System.in)+", "+interleaved+", "+FASTQ.FORCE_INTERLEAVED+", "+isInterleaved(fnames[0]);
+ //Mates come from a nested stream; with USE_CRIS it is wrapped in a ConcurrentLegacyReadInputStream that is started right away.
+ mateStream=(mate_fnames_==null ? null : new RTextInputStream(mate_fnames_, null, crisReadLimit));
+ cris=((!USE_CRIS || mateStream==null) ? null : new ConcurrentLegacyReadInputStream(mateStream, crisReadLimit));
+ if(cris!=null){cris.start();}
+ }
+
+ public static boolean isInterleaved(String fname){ //True when the first line of the file is exactly "#INTERLEAVED".
+ File f=new File(fname);
+ assert(f.exists() && f.isFile());
+ TextFile tf=new TextFile(fname, false, false);
+ String s=tf.nextLine(); //only the first line is examined
+ tf.close();
+ return "#INTERLEAVED".equals(s);
+ }
+
+ @Override
+ public void start() {
+ assert(false); //Not fully implemented everywhere... (with assertions enabled, calling start() fails here)
+ if(cris!=null){cris.start();}
+ }
+
+// @Override
+// public synchronized Read[] nextBlock(){
+// ArrayList<Read> list=readList();
+// if(list==null || list.size()==0){return null;}
+// return list.toArray(new Read[list.size()]);
+// }
+
+ @Override
+ public synchronized ArrayList<Read> nextList(){ //Returns the next batch from readList(), or null once the stream is finished.
+// System.out.println((mateStream==null ? "F5: " : "F3: ")+"Grabbing a list: finished="+finished);
+ if(finished){return null;}
+ return readList();
+ }
+
+ /** Reads one batch: merges the sites of same-position reads from all files, attaches mates if present, and shuts down on a short batch. */
+ private synchronized ArrayList<Read> readList(){
+     assert(buffer==null);
+// System.out.println((mateStream==null ? "F5: " : "F3: ")+" Entering readList");
+     if(finished){return null;}
+
+     ArrayList<Read> merged=getListFromFile(textfiles[0]);
+
+     if(textfiles.length>1){
+         ArrayList<Read>[] temp=new ArrayList[textfiles.length];
+         temp[0]=merged;
+         //Start at 1: file 0 was already read into merged; starting at 0 read it twice per batch and put it out of step.
+         for(int i=1; i<temp.length; i++){
+             temp[i]=getListFromFile(textfiles[i]);
+         }
+         for(int i=0; i<merged.size(); i++){
+             Read r=merged.get(i);
+             for(int j=1; j<temp.length; j++){
+                 Read r2=temp[j].get(i);
+                 assert(r2.numericID==r.numericID);
+                 assert(r2.id.equals(r.id));
+                 if(r.sites==null){r.sites=r2.sites;}
+                 else if(r2.sites!=null){r.sites.addAll(r2.sites);}
+             }
+         }
+     }
+
+// System.out.println((mateStream==null ? "F5: " : "F3: ")+"Merged: "+merged==null ? "null" : ""+merged.size());
+
+     if(cris!=null){
+         // System.out.println((mateStream==null ? "F5: " : "F3: ")+"Grabbing a mate list: finished="+mateStream.finished);
+         ListNum<Read> mates0=cris.nextList();
+         ArrayList<Read> mates=mates0.list;
+         assert((mates==null || mates.size()==0) == (merged==null || merged.size()==0)) : (merged==null)+", "+(mates==null);
+         if(merged!=null && mates!=null){
+             assert(mates.size()==merged.size()) : "\n"+mates.size()+", "+merged.size()+", "+paired()+"\n"+
+             merged.get(0).toText(false)+"\n"+mates.get(0).toText(false)+"\n\n"+
+             merged.get(merged.size()-1).toText(false)+"\n"+mates.get(mates.size()-1).toText(false)+"\n\n"+
+             merged.get(Tools.min(merged.size(), mates.size())-1).toText(false)+"\n"+
+             mates.get(Tools.min(merged.size(), mates.size())-1).toText(false)+"\n\n";
+
+             for(int i=0; i<merged.size(); i++){
+                 Read r1=merged.get(i);
+                 Read r2=mates.get(i);
+                 r1.mate=r2;
+                 assert(r1.pairnum()==0);
+
+                 if(r2!=null){
+                     r2.mate=r1;
+                     r2.setPairnum(1);
+                     assert(r2.numericID==r1.numericID) : "\n\n"+r1.toText(false)+"\n\n"+r2.toText(false)+"\n";
+//                     assert(r2.id.equals(r1.id)) : "\n\n"+r1.toText(false)+"\n\n"+r2.toText(false)+"\n";
+                 }
+
+             }
+         }
+         cris.returnList(mates0.id, mates==null || mates.isEmpty()); //a null mate list is allowed by the assert above, so it must not be dereferenced
+     }else if(mateStream!=null){
+         // System.out.println((mateStream==null ? "F5: " : "F3: ")+"Grabbing a mate list: finished="+mateStream.finished);
+         ArrayList<Read> mates=mateStream.readList();
+         assert((mates==null || mates.size()==0) == (merged==null || merged.size()==0)) : (merged==null)+", "+(mates==null);
+         if(merged!=null && mates!=null){
+             assert(mates.size()==merged.size()) : mates.size()+", "+merged.size();
+
+             for(int i=0; i<merged.size(); i++){
+                 Read r1=merged.get(i);
+                 Read r2=mates.get(i);
+                 r1.mate=r2;
+                 r2.mate=r1;
+
+                 assert(r1.pairnum()==0);
+                 r2.setPairnum(1);
+
+                 assert(r2.numericID==r1.numericID) : "\n\n"+r1.toText(false)+"\n\n"+r2.toText(false)+"\n";
+                 assert(r2.id.equals(r1.id)) : "\n\n"+r1.toText(false)+"\n\n"+r2.toText(false)+"\n";
+             }
+         }
+     }
+
+     if(merged.size()<READS_PER_LIST){
+         if(merged.size()==0){merged=null;}
+         shutdown();
+     }
+
+     return merged;
+ }
+
+ /** Reads up to READS_PER_LIST reads (fewer when readLimit is near) from one file, skipping '#' and empty lines; closes the file on a short batch. */
+ private ArrayList<Read> getListFromFile(TextFile tf){
+     int len=READS_PER_LIST;
+     if(readLimit-readCount<len){len=(int)(readLimit-readCount);}
+     //Empty lines are skipped together with '#' lines; charAt(0) on an empty line used to throw.
+     ArrayList<Read> list=new ArrayList<Read>(len);
+
+     for(int i=0; i<len; i++){
+         String s=tf.nextLine();
+         while(s!=null && (s.length()==0 || s.charAt(0)=='#')){s=tf.nextLine();}
+         if(s==null){break;}
+         Read r=Read.fromText(s);
+//         assert(r.toString().equals(s)) : "\n\n"+s+"\n!=\n"+r.toString()+"\n\n";
+//         assert(r.chrom>0 == r.mapScore>0) : r.toText(false);
+         if(interleaved){
+             s=tf.nextLine(); //in interleaved mode the very next line holds the mate
+             assert(s!=null) : "Odd number of reads in interleaved file "+tf.name;
+             if(s!=null){
+                 Read r2=Read.fromText(s);
+                 assert(r2.numericID==r.numericID) : "Different numeric IDs for paired reads in interleaved file "+tf.name;
+                 r2.numericID=r.numericID;
+                 r2.mate=r;
+                 r.mate=r2;
+             }
+         }
+         list.add(r);
+     }
+     readCount+=list.size();
+
+     if(list.size()<len){
+         assert(tf.nextLine()==null);
+         tf.close();
+     }
+     return list;
+ }
+
+ @Override
+ public boolean paired() {
+ return mateStream!=null || interleaved; //paired when mates come from a second stream or from the same interleaved file
+ }
+
+ public final void shutdown(){ //Marks this stream finished and passes the shutdown on to the mate stream and its wrapper.
+ finished=true;
+ if(mateStream!=null){mateStream.shutdown();}
+ if(cris!=null){cris.shutdown();}
+ }
+
+ public boolean finished=false; //set by shutdown(), cleared by restart()
+ public String[] fnames;
+ public TextFile[] textfiles; //one per entry of fnames
+ //Buffer and cursor used only by the read-at-a-time next().
+ private ArrayList<Read> buffer=null;
+ private int next=0;
+ //readCount is increased by every getListFromFile() call and compared against readLimit there.
+ private long readCount;
+ private final long readLimit;
+ private final boolean interleaved;
+
+ public static final int READS_PER_LIST=Shared.READ_BUFFER_LENGTH;
+ //Source of mates: mateStream directly, or through cris when USE_CRIS was set at construction time.
+ private final RTextInputStream mateStream;
+ private final ConcurrentLegacyReadInputStream cris;
+ public static boolean USE_CRIS=true; //Doubles read speed for zipped paired files
+
+ @Override
+ /** This is optimistic and may return "true" incorrectly: with an exhausted buffer it simply reports whether shutdown() has happened yet. */
+ public boolean hasMore() {
+ if(buffer!=null && next<buffer.size()){return true;}
+ return !finished;
+ }
+
+
+ @Override
+ /** ONLY CALL FROM A SINGLE THREAD! Returns the next read, or null when none are left. */
+ public Read next() {
+ if(buffer==null || next>=buffer.size()){
+ buffer=null; //must be null before nextList() is called: readList() asserts buffer==null
+ next=0;
+ if(!finished){
+ buffer=nextList();
+ }
+ }
+ //Still nothing after the refill attempt means the stream is exhausted.
+ if(buffer==null || next>=buffer.size()){
+ assert(finished);
+ return null;
+ }
+ Read r=buffer.get(next);
+ buffer.set(next, null); //clear the slot so the buffer no longer references the read
+ next++;
+ return r;
+ }
+
+
+ @Override
+ public synchronized void restart() {
+     finished=false;
+     readCount=0; //was left untouched, so a stream that had reached readLimit produced no reads after a restart
+     next=0; buffer=null;
+     for(TextFile tf : textfiles){tf.reset();}
+     if(cris!=null){
+         cris.restart();
+         cris.start();
+     }else if(mateStream!=null){mateStream.restart();}
+ }
+
+ @Override
+ public synchronized boolean close() {
+ boolean error=false;
+ for(TextFile tf : textfiles){error|=tf.close();}
+ if(cris!=null){
+ error|=ReadWrite.closeStream(cris);;
+ }else if(mateStream!=null){
+ mateStream.close();
+ error|=mateStream.errorState();
+ }
+ return error;
+ }
+
+}
diff --git a/current/stream/RandomReadInputStream3.java b/current/stream/RandomReadInputStream3.java
new file mode 100755
index 0000000..aedbe04
--- /dev/null
+++ b/current/stream/RandomReadInputStream3.java
@@ -0,0 +1,188 @@
+package stream;
+
+import java.util.ArrayList;
+
+import dna.Data;
+
+import align2.RandomReads3;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Sep 10, 2014
+ *
+ */
+public class RandomReadInputStream3 extends ReadInputStream {
+
+ public RandomReadInputStream3(long number_, boolean paired_){ //Short form: every setting not assigned here keeps its field initialiser.
+ Data.setGenome(Data.GENOME_BUILD);
+ number=number_;
+ paired=paired_;
+ maxChrom=Data.numChroms;
+ minQual=6;
+ midQual=18;
+ maxQual=30;
+ restart(); //creates the RandomReads3 generator and zeroes the counters
+ }
+
+ public RandomReadInputStream3(long number_, int minreadlen_, int maxreadlen_,
+ int maxSnps_, int maxInss_, int maxDels_, int maxSubs_,
+ float snpRate_, float insRate_, float delRate_, float subRate_,
+ int maxInsertionLen_, int maxDeletionLen_, int maxSubLen_,
+ int minChrom_, int maxChrom_, boolean paired_,
+ int minQual_, int midQual_, int maxQual_){
+ Data.setGenome(Data.GENOME_BUILD);
+ number=number_;
+ minreadlen=minreadlen_;
+ maxreadlen=maxreadlen_;
+ //Maximum event lengths come from the caller.
+ maxInsertionLen=maxInsertionLen_;
+ maxSubLen=maxSubLen_;
+ maxDeletionLen=maxDeletionLen_;
+
+ //Minimum event lengths are fixed at 1 here.
+ minInsertionLen=1;
+ minSubLen=1;
+ minDeletionLen=1;
+ minNLen=1;
+
+ minChrom=minChrom_;
+ maxChrom=maxChrom_;
+
+ maxSnps=maxSnps_;
+ maxInss=maxInss_;
+ maxDels=maxDels_;
+ maxSubs=maxSubs_;
+
+ snpRate=snpRate_;
+ insRate=insRate_;
+ delRate=delRate_;
+ subRate=subRate_;
+
+ paired=paired_;
+ //The int quality arguments are narrowed to byte.
+ minQual=(byte) minQual_;
+ midQual=(byte) midQual_;
+ maxQual=(byte) maxQual_;
+ //maxNs, NRate and maxNLen have no parameter and keep their field initialisers.
+ restart();
+ }
+
+ @Override
+ public void start() {} //nothing to start
+
+
+ @Override
+ public boolean hasMore() {
+ return number>consumed; //more reads remain until the requested number has been handed out
+ }
+
+ @Override
+ public Read next() {
+     if(consumed>=number){return null;} //the requested number of reads has already been handed out
+     final boolean drained=(buffer==null || next>=buffer.size());
+     if(drained){fillBuffer();}
+     final Read current=buffer.get(next);
+     buffer.set(next, null); //clear the slot that was just handed out
+     next++; consumed++;
+     return current;
+ }
+
+ @Override
+ public synchronized ArrayList<Read> nextList() {
+     if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+     if(consumed>=number){return null;}
+     if(buffer==null || next>=buffer.size()){fillBuffer();}
+     //Give the whole buffer away; an empty one is reported as null.
+     final ArrayList<Read> batch=(buffer!=null && buffer.size()==0) ? null : buffer;
+     buffer=null;
+     if(batch!=null){consumed+=batch.size();}
+// assert(false) : r.size();
+     return batch;
+ }
+
+ private synchronized void fillBuffer(){ //Generates the next batch of at most BUF_LEN reads; leaves buffer null when nothing remains to be made.
+ buffer=null;
+ next=0;
+
+ long toMake=number-generated;
+ if(toMake<1){return;}
+ toMake=Tools.min(toMake, BUF_LEN);
+ //All generation settings are handed to the RandomReads3 instance created in restart().
+ ArrayList<Read> reads=rr.makeRandomReadsX((int)toMake, minreadlen, maxreadlen,
+ maxSnps, maxInss, maxDels, maxSubs, maxNs,
+ snpRate, insRate, delRate, subRate, NRate,
+ minInsertionLen, minDeletionLen, minSubLen, minNLen,
+ maxInsertionLen, maxDeletionLen, maxSubLen, maxNLen,
+ minChrom, maxChrom,
+ minQual, midQual, maxQual);
+
+ generated+=reads.size();
+ assert(generated<=number);
+ buffer=reads;
+// assert(false) : reads.size()+", "+toMake;
+ }
+
+ public synchronized void restart(){ //Zeroes the counters and replaces the generator; also called by both constructors.
+ next=0;
+ buffer=null;
+ consumed=0;
+ generated=0;
+ rr=new RandomReads3(1, paired); //a fresh generator, always built with the same first argument (1)
+ }
+
+ @Override
+ public boolean close() {return false;} //nothing to close; always returns false
+
+ @Override
+ public boolean paired() {
+ return paired; //fixed at construction time
+ }
+
+ private ArrayList<Read> buffer=null;
+ private int next=0;
+
+ private final int BUF_LEN=Shared.READ_BUFFER_LENGTH; //upper bound on the batch size requested in fillBuffer()
+ //Counters, zeroed by restart(): reads produced by the generator and reads handed to callers.
+ public long generated=0;
+ public long consumed=0;
+ //Everything from here down to NRate is passed to makeRandomReadsX() in fillBuffer(), except number, which only limits how many reads are made.
+ public long number=100000;
+ public int minreadlen=100;
+ public int maxreadlen=100;
+
+ public int maxInsertionLen=6;
+ public int maxSubLen=6;
+ public int maxDeletionLen=100;
+ public int maxNLen=6;
+
+ public int minInsertionLen=1;
+ public int minSubLen=1;
+ public int minDeletionLen=1;
+ public int minNLen=1;
+
+ public int minChrom=1;
+ public int maxChrom=22;
+
+ public int maxSnps=4;
+ public int maxInss=2;
+ public int maxDels=2;
+ public int maxSubs=2;
+ public int maxNs=2;
+
+ public float snpRate=0.5f;
+ public float insRate=0.25f;
+ public float delRate=0.25f;
+ public float subRate=0.10f;
+ public float NRate=0.10f;
+
+ public final boolean paired;
+ //Quality bounds, set once per constructor and also passed to makeRandomReadsX().
+ public final byte minQual;
+ public final byte midQual;
+ public final byte maxQual;
+
+ private RandomReads3 rr; //replaced on every restart()
+
+}
diff --git a/current/stream/Read.java b/current/stream/Read.java
new file mode 100755
index 0000000..4e48cf6
--- /dev/null
+++ b/current/stream/Read.java
@@ -0,0 +1,3395 @@
+package stream;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import ukmer.Kmer;
+
+import jgi.Dedupe;
+
+import align2.GapTools;
+import align2.QualityTools;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+
+public final class Read implements Comparable<Read>, Cloneable, Serializable{
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -1026645233407290096L;
+
+ /**
+  * Ad-hoc driver for the match-string converters.
+  * Prints args[0], its short form, the long form of that, and then the result
+  * of applying toLongMatchString a second time to the already-long string.
+  * @param args args[0] is the match string to convert
+  */
+ public static void main(String[] args){
+     final byte[] original=args[0].getBytes();
+     System.out.println(new String(original));
+
+     final byte[] shortForm=toShortMatchString(original);
+     System.out.println(new String(shortForm));
+
+     final byte[] longForm=toLongMatchString(shortForm);
+     System.out.println(new String(longForm));
+
+     final byte[] longAgain=toLongMatchString(longForm);
+     System.out.println(new String(longAgain));
+ }
+
+ /**
+  * Creates a read whose name is the decimal form of its numeric id.
+  * @param bases_ Base calls
+  * @param quals_ Quality scores
+  * @param id_ Numeric id, also used (as a string) for the name
+  */
+ public Read(byte[] bases_, byte[] quals_, long id_){
+ this(bases_, quals_, id_, Long.toString(id_));
+ }
+
+ /**
+  * Creates a read with no location: chrom -1, strand 0, start 0, stop 0.
+  * @param bases_ Base calls
+  * @param quals_ Quality scores
+  * @param id_ Numeric id
+  * @param name_ Read name
+  */
+ public Read(byte[] bases_, byte[] quals_, long id_, String name_){
+ this(bases_, -1, (byte)0, 0, 0, name_, quals_, id_);
+ }
+
+ /**
+  * Creates a located read whose name is the decimal form of its numeric id.
+  * @param s_ Base calls
+  * @param chrom_ Chromosome number
+  * @param strand_ Strand; the delegate constructor asserts it is 0 or 1
+  * @param start_ Start coordinate
+  * @param stop_ Stop coordinate; the delegate constructor asserts start_<=stop_
+  * @param id_ Numeric id, also used (as a string) for the name
+  * @param quality_ Quality scores
+  */
+ public Read(byte[] s_, int chrom_, byte strand_, int start_, int stop_, long id_, byte[] quality_){
+ this(s_, chrom_, strand_, start_, stop_, Long.toString(id_), quality_, id_);
+ }
+
+// public Read(byte[][] fasta_, byte[][] qual_, long numericID_){
+// this(fasta_[1], 0, (byte)0, 0, 0, new String(fasta_[0]), qual_[1], numericID_);
+// }
+
+ /**
+  * Creates a located read. The strand value is handed to the flags-based
+  * constructor as the initial flags word.
+  * Note the sanity assertions run after construction (and after any validation
+  * triggered by the delegate constructor).
+  * @param s_ Base calls
+  * @param chrom_ Chromosome number
+  * @param strand_ Strand; asserted to be 0 or 1
+  * @param start_ Start coordinate
+  * @param stop_ Stop coordinate; asserted to be at least start_
+  * @param id_ Read name
+  * @param quality_ Quality scores
+  * @param numericID_ Numeric id
+  */
+ public Read(byte[] s_, int chrom_, byte strand_, int start_, int stop_, String id_, byte[] quality_, long numericID_){
+ this(s_, chrom_, start_, stop_, id_, quality_, numericID_, strand_);
+ assert(strand_==0 || strand_==1);
+ assert(start_<=stop_) : chrom_+", "+start_+", "+stop_+", "+numericID_;
+ }
+
+ /**
+  * Primary constructor; all other constructors delegate here.
+  * Stores the arrays by reference (no copy) and optionally validates.
+  * @param bases_ Base calls
+  * @param chrom_ Chromosome number
+  * @param start_ Start coordinate
+  * @param stop_ Stop coordinate
+  * @param id_ Read name
+  * @param quals_ Quality scores
+  * @param numericID_ Numeric id
+  * @param flags_ Initial flags word; the VALIDATEDMASK bit is always cleared
+  */
+ public Read(byte[] bases_, int chrom_, int start_, int stop_, String id_, byte[] quals_, long numericID_, int flags_){
+ //A new read is never considered validated, whatever the caller passed in.
+ flags=flags_&~VALIDATEDMASK;
+ bases=bases_;
+ quality=quals_;
+
+ chrom=chrom_;
+ start=start_;
+ stop=stop_;
+
+ id=id_;
+ numericID=numericID_;
+
+ if(VALIDATE_IN_CONSTRUCTOR){validate(true);}
+ }
+
+ /**
+  * Validates and normalizes this read in place, then marks it as validated.
+  * <p>Steps, in order:
+  * <ol>
+  * <li>If NULLIFY_BROKEN_QUALITY is set and the quality length differs from the
+  * base length, quality is dropped and the read is flagged as discarded.</li>
+  * <li>A remaining base/quality length mismatch fails validation.</li>
+  * <li>Symbols with a negative lookup code are replaced (FIX_JUNK) or cause the
+  * read to be flagged as junk; U is converted via AminoAcid.uToT when U_TO_T is set.</li>
+  * <li>Case, no-call symbols and quality bounds are normalized according to
+  * CHANGE_QUALITY, TO_UPPER_CASE, LOWER_CASE_TO_N and OTHER_SYMBOLS_TO_N.</li>
+  * <li>The header is fixed when FIX_HEADER is set.</li>
+  * </ol>
+  * Two branches of the earlier implementation were unreachable (an
+  * <code>if(false)</code> quality sanity check that caused problems with
+  * error-corrected PacBio reads, and the else-branch of a condition containing
+  * <code>true ||</code>); they have been removed without changing behavior.
+  *
+  * @param processAssertions If true, length mismatches trip an assertion
+  * (when assertions are enabled) instead of silently returning false.
+  * @return true if the read passed validation, false on a length mismatch.
+  */
+ public boolean validate(final boolean processAssertions){
+     assert(!validated());
+
+     final boolean aa=aminoacid();
+
+     if(NULLIFY_BROKEN_QUALITY && quality!=null && quality.length!=bases.length){
+         quality=null;
+         setDiscarded(true);
+     }
+
+     assert(!processAssertions || quality==null || quality.length==bases.length) :
+         "\nMismatch between length of bases and qualities for read "+numericID+" (id="+id+").\n"+
+         "# qualities="+quality.length+", # bases="+bases.length+"\n\n"+
+         FASTQ.qualToString(quality)+"\n"+new String(bases)+"\n";
+     if(quality!=null && (bases==null || bases.length!=quality.length)){
+         assert(!processAssertions) :
+             "\nMismatch between length of bases and qualities for read "+numericID+" (id="+id+").\n"+
+             "# qualities="+quality.length+", # bases="+bases.length+"\n\n"+
+             FASTQ.qualToString(quality)+"\n"+new String(bases)+"\n";
+         return false;
+     }
+
+     //Junk detection and repair.
+     if(bases!=null){
+         if(aa){
+             for(int i=0; i<bases.length; i++){
+                 byte b=bases[i];
+                 int num=AminoAcid.aminoToCode[b];
+                 if(num<0){
+                     if(FIX_JUNK){
+                         bases[i]='X';
+                         if(quality!=null){quality[i]=0;}
+                     }else{
+                         setJunk(true);
+                         break;
+                     }
+                 }
+             }
+         }else{
+             if(U_TO_T){
+                 for(int i=0; i<bases.length; i++){
+                     if(Character.toUpperCase(bases[i])=='U'){
+                         bases[i]=AminoAcid.uToT[bases[i]];
+                     }
+                 }
+             }
+             for(int i=0; i<bases.length; i++){
+                 byte b=bases[i];
+                 int num=AminoAcid.baseToNumberExtended[b];
+                 if(num<0){
+                     if(FIX_JUNK){bases[i]='N';}
+                     else{
+                         setJunk(true);
+                         break;
+                     }
+                 }
+             }
+         }
+     }
+
+     //Case, no-call and quality normalization.
+     if(bases!=null){
+         final byte nocall=(aa ? (byte)'.' : (byte)'N');
+         if(quality!=null){
+             if(!aa){
+
+                 if(CHANGE_QUALITY){
+                     for(int i=0; i<quality.length; i++){
+                         byte b=bases[i];
+                         byte q=quality[i];
+                         if(AminoAcid.isFullyDefined(b)){
+                             if(q<MIN_CALLED_QUALITY){
+                                 quality[i]=MIN_CALLED_QUALITY;
+                             }else if(q>MAX_CALLED_QUALITY){
+                                 quality[i]=MAX_CALLED_QUALITY;
+                             }
+                         }else{
+                             quality[i]=0;
+                             if(b=='-' || b=='.' || b=='X' || b=='n'){bases[i]=nocall;}
+                         }
+                         if(TO_UPPER_CASE && b>90){
+                             //Test the current value, not the original: an 'n' that was just
+                             //replaced by nocall ('N') must not be shifted again, which
+                             //would turn it into '.' ('N'-32).
+                             if(bases[i]>90){bases[i]-=32;}
+                         }else if(LOWER_CASE_TO_N && b>90){bases[i]=nocall;}
+                     }
+                 }else{
+                     for(int i=0; i<bases.length; i++){
+                         byte b=bases[i];
+                         if(!AminoAcid.isFullyDefined(b)){
+                             if(b=='-' || b=='.' || b=='X' || b=='n'){bases[i]=nocall;}
+                         }
+                         if(TO_UPPER_CASE && b>90){
+                             //Same guard as above: do not shift an already-replaced no-call.
+                             if(bases[i]>90){bases[i]-=32;}
+                         }else if(LOWER_CASE_TO_N && b>90){bases[i]=nocall;}
+                     }
+                 }
+             }
+         }else if(TO_UPPER_CASE){
+             for(int i=0; i<bases.length; i++){
+                 byte b=bases[i];
+                 if(b>90){
+                     bases[i]-=32;
+                 }
+                 if(OTHER_SYMBOLS_TO_N && (b=='-' || b=='.' || b=='X') && !aa){bases[i]=nocall;}
+             }
+         }else if(LOWER_CASE_TO_N){
+             for(int i=0; i<bases.length; i++){
+                 byte b=bases[i];
+                 if(b>90){bases[i]=nocall;}
+                 else if(b=='-' || b=='.' || b=='X'){bases[i]=nocall;}
+             }
+         }else if(OTHER_SYMBOLS_TO_N && !aminoacid()){
+             for(int i=0; i<bases.length; i++){
+                 byte b=bases[i];
+                 if(b=='-' || b=='.' || b=='X'){bases[i]=nocall;}
+             }
+         }
+     }
+
+     if(FIX_HEADER){
+         fixHeader();
+     }
+
+     setValidated(true);
+
+     return true;
+ }
+
+ /** @return The larger argument minus the smaller one. */
+ private static final int absdif(int a, int b){
+     if(a>b){
+         return a-b;
+     }
+     return b-a;
+ }
+
+ /** Replaces this read's id with the result of Tools.fixHeader applied to it. */
+ public final void fixHeader(){
+     final String fixed=Tools.fixHeader(id);
+     id=fixed;
+ }
+
+ /**
+  * Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q.
+  * Reads of different length are never considered duplicates by this overload.
+  * @param r Read to compare against
+  * @param nmax Maximum number of positions where either read has an 'N'
+  * @param mmax Maximum number of mismatching positions
+  * @param qmax A mismatch where both qualities exceed this value rejects the pair
+  * @param banSameQualityMismatch If true, a mismatch with equal qualities rejects the pair.
+  * Previously this argument was silently replaced by false.
+  * @return true if the reads qualify as duplicates
+  */
+ public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch){
+     return isDuplicateByBases(r, nmax, mmax, qmax, banSameQualityMismatch, false);
+ }
+
+
+
+ /**
+  * Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q.
+  * Only the first min(length) positions are compared.
+  * @param r Read to compare against
+  * @param nmax Maximum number of positions where either read has an 'N'
+  * @param mmax Maximum number of mismatching positions
+  * @param qmax A mismatch where both qualities exceed this value rejects the pair
+  * @param banSameQualityMismatch If true, a mismatch with equal qualities rejects the pair
+  * @param allowDifferentLength If false, reads of different length are rejected
+  * (note an assertion also requires equal lengths when assertions are enabled)
+  * @return true if the reads qualify as duplicates
+  */
+ public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch, boolean allowDifferentLength){
+     assert(r.length()==bases.length) : "Merging different-length reads is supported but seems to be not useful.";
+     if(!allowDifferentLength && r.length()!=bases.length){return false;}
+
+     final int limit=Tools.min(bases.length, r.length());
+     int noCalls=0;
+     int mismatches=0;
+     int pos=0;
+     while(pos<limit){
+         final byte mine=bases[pos];
+         final byte theirs=r.bases[pos];
+         if(mine=='N' || theirs=='N'){
+             noCalls++;
+             if(noCalls>nmax){return false;}
+         }else if(mine!=theirs){
+             mismatches++;
+             if(mismatches>mmax){return false;}
+             if(quality[pos]>qmax && r.quality[pos]>qmax){return false;}
+             if(banSameQualityMismatch && quality[pos]==r.quality[pos]){return false;}
+         }
+         pos++;
+     }
+     return true;
+ }
+
+ /**
+  * Tests whether this read and r map to the same place.
+  * Reads of different length are handed to isDuplicateByMappingDifferentLength.
+  * @param r Read to compare against
+  * @param bothEnds If true, both start and stop must match; otherwise only the
+  * start (plus strand) or the stop (other strand) must match
+  * @param checkAlignment If true, and both reads have match strings, the match strings
+  * must agree on deletions and on insertion-type symbols (I, X, Y)
+  * @return true if the reads qualify as duplicates by mapping
+  */
+ public boolean isDuplicateByMapping(Read r, boolean bothEnds, boolean checkAlignment){
+     if(bases.length!=r.length()){
+         return isDuplicateByMappingDifferentLength(r, bothEnds, checkAlignment);
+     }
+     assert(this!=r && mate!=r);
+     assert(!bothEnds || bases.length==r.length());
+     if(!(mapped() && r.mapped())){return false;}
+     if(chrom<1 && start<1){return false;}
+     if(chrom!=r.chrom || strand()!=r.strand()){return false;}
+
+     final boolean sameStart=(start==r.start);
+     final boolean sameStop=(stop==r.stop);
+     final boolean anchored;
+     if(bothEnds){
+         anchored=(sameStart && sameStop);
+     }else{
+         anchored=(strand()==Gene.PLUS ? sameStart : sameStop);
+     }
+     if(!anchored){return false;}
+
+     if(!checkAlignment){return true;}
+     if(perfect() && r.perfect()){return true;}
+     if(match==null || r.match==null){return true;}
+     if(match.length!=r.match.length){return false;}
+
+     for(int i=0; i<match.length; i++){
+         final byte a=match[i];
+         final byte b=r.match[i];
+         if(a==b){continue;}
+         if((a=='D') != (b=='D')){return false;}
+         final boolean aInserted=(a=='I' || a=='X' || a=='Y');
+         final boolean bInserted=(b=='I' || b=='X' || b=='Y');
+         if(aInserted!=bInserted){return false;}
+     }
+     return true;
+ }
+
+ /**
+  * Variant of isDuplicateByMapping for reads of unequal length.
+  * Never reports a duplicate when bothEnds is requested. Otherwise only the
+  * anchored end is compared: start for Gene.PLUS, stop for the other strand.
+  * @param r Read to compare against; asserted to differ in length from this read
+  * @param bothEnds If true, the result is always false
+  * @param checkAlignment If true, and both reads have match strings, the shared
+  * prefix of the match strings must agree on deletions and insertion-type symbols
+  * @return true if the reads qualify as duplicates by mapping
+  */
+ public boolean isDuplicateByMappingDifferentLength(Read r, boolean bothEnds, boolean checkAlignment){
+ assert(this!=r && mate!=r);
+ assert(bases.length!=r.length());
+ if(bothEnds){return false;}
+// assert(!bothEnds || bases.length==r.length());
+ if(!mapped() || !r.mapped()){return false;}
+// if(chrom==-1 && start==-1){return false;}
+ if(chrom<1 && start<1){return false;}
+
+// if(chrom!=r.chrom || strand()!=r.strand() || start!=r.start){return false;}
+//// if(mate==null && stop!=r.stop){return false;} //For unpaired reads, require both ends match
+// if(stop!=r.stop){return false;} //For unpaired reads, require both ends match
+// return true;
+
+ if(chrom!=r.chrom || strand()!=r.strand()){return false;}
+
+ if(strand()==Gene.PLUS){
+ if(start!=r.start){return false;}
+ }else{
+ if(stop!=r.stop){return false;}
+ }
+
+ if(checkAlignment){
+ if(perfect() && r.perfect()){return true;}
+ if(match!=null && r.match!=null){
+ //Only the overlapping prefix of the two match strings is compared.
+ int minLen=Tools.min(match.length, r.match.length);
+ for(int i=0; i<minLen; i++){
+ byte a=match[i];
+ byte b=r.match[i];
+ if(a!=b){
+ if((a=='D') != (b=='D')){return false;}
+ if((a=='I' || a=='X' || a=='Y') != (b=='I' || b=='X' || b=='Y')){return false;}
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ public void merge(Read r, boolean mergeVectors, boolean mergeN){mergePrivate(r, mergeVectors, mergeN, true);}
+
+ /**
+  * Absorbs read r into this read: copy counts are summed, and depending on the
+  * flags, no-calls are filled in and base/quality vectors are combined.
+  * <p>With mergeVectors, positions where both reads agree get a boosted quality
+  * (capped at 48); where they disagree the higher-quality base wins with a
+  * reduced quality, and ties become 'N' with quality 0. Positions where both
+  * qualities are below 1 are left untouched.
+  * <p>The match string is kept only if r has one of the same length that agrees
+  * everywhere except where one side has 'N' and the other 'm' or 'S'.
+  * @param r Read to absorb; must not be this read or either mate
+  * @param mergeVectors Combine bases and qualities; forces mergeN to true
+  * @param mergeN Fill no-calls of this read from r
+  * @param mergeMate If true and this read has a mate, the mates are merged too
+  */
+ private void mergePrivate(Read r, boolean mergeVectors, boolean mergeN, boolean mergeMate){
+ assert(r!=this);
+ assert(r!=this.mate);
+ assert(r!=r.mate);
+ assert(this!=this.mate);
+ assert(r.mate==null || r.mate.mate==r);
+ assert(this.mate==null || this.mate.mate==this);
+ assert(r.mate==null || r.numericID==r.mate.numericID);
+ assert(mate==null || numericID==mate.numericID);
+ mergeN=(mergeN||mergeVectors);
+
+ assert(r.length()==bases.length) : "Merging different-length reads is supported but seems to be not useful.";
+
+ //Only reachable with assertions disabled: pad this read with N/0 up to the length of r.
+ //NOTE(review): dereferences quality and r.quality without a null check - confirm callers always supply qualities here.
+ if((mergeN || mergeVectors) && bases.length<r.length()){
+ int oldLenB=bases.length;
+ start=Tools.min(start, r.start);
+ stop=Tools.max(stop, r.stop);
+ mapScore=Tools.max(mapScore, r.mapScore);
+
+ bases=Arrays.copyOfRange(bases, 0, r.length());
+ quality=Arrays.copyOfRange(quality, 0, r.quality.length);
+ for(int i=oldLenB; i<bases.length; i++){
+ bases[i]='N';
+ quality[i]=0;
+ }
+ match=null;
+ r.match=null;
+ }
+
+ copies+=r.copies;
+
+
+// if(numericID==11063941 || r.numericID==11063941 || numericID==8715632){
+// System.err.println("***************");
+// System.err.println(this.toText()+"\n");
+// System.err.println(r.toText()+"\n");
+// System.err.println(mergeVectors+", "+mergeN+", "+mergeMate+"\n");
+// }
+
+ //NOTE(review): pflag1 and pflag2 are computed but never read in this method.
+ boolean pflag1=perfect();
+ boolean pflag2=r.perfect();
+
+ final int minLenB=Tools.min(bases.length, r.length());
+
+ if(mergeN){
+ if(quality==null){
+ //Without qualities, only no-calls can be filled in.
+ for(int i=0; i<minLenB; i++){
+ byte b=r.bases[i];
+ if(bases[i]=='N' && b!='N'){bases[i]=b;}
+ }
+ }else{
+ //NOTE(review): r.quality is read unconditionally below - assumes r has qualities whenever this read does.
+ for(int i=0; i<minLenB; i++){
+ final byte b1=bases[i];
+ final byte b2=r.bases[i];
+ final byte q1=Tools.max((byte)0, quality[i]);
+ final byte q2=Tools.max((byte)0, r.quality[i]);
+ if(b1==b2){
+ if(b1=='N'){
+ //do nothing
+ }else if(mergeVectors){
+ //merge qualities
+ // quality[i]=(byte) Tools.min(40, q1+q2);
+ if(q1>=q2){
+ quality[i]=(byte) Tools.min(48, q1+1+q2/4);
+ }else{
+ quality[i]=(byte) Tools.min(48, q2+1+q1/4);
+ }
+ }
+ }else if(b1=='N'){
+ bases[i]=b2;
+ quality[i]=q2;
+ }else if(b2=='N'){
+ //do nothing
+ }else if(mergeVectors){
+ if(q1<1 && q2<1){
+ //Special case - e.g. Illumina calls bases at 0 quality.
+ //Possibly best to keep the matching allele if one matches the ref.
+ //But for now, do nothing.
+ //This was causing problems changing perfect match strings into imperfect matches.
+ }else if(q1==q2){
+ assert(b1!=b2);
+ bases[i]='N';
+ quality[i]=0;
+ }else if(q1>q2){
+ bases[i]=b1;
+ quality[i]=(byte)(q1-q2/2);
+ }else{
+ bases[i]=b2;
+ quality[i]=(byte)(q2-q1/2);
+ }
+ assert(quality[i]>=0 && quality[i]<=48);
+ }
+ }
+ }
+ }
+
+ //TODO:
+ //Note that the read may need to be realigned after merging, so the match string may be rendered incorrect.
+
+ if(mergeN && match!=null){
+ if(r.match==null){match=null;}
+ else{
+ if(match.length!=r.match.length){match=null;}
+ else{
+ boolean ok=true;
+ for(int i=0; i<match.length && ok; i++){
+ byte a=match[i], b=r.match[i];
+ if(a!=b){
+ if((a=='m' || a=='S') && b=='N'){
+ //do nothing;
+ }else if(a=='N' && (b=='m' || b=='S')){
+ match[i]=b;
+ }else{
+ ok=false;
+ }
+ }
+ }
+ if(!ok){match=null;}
+ }
+ }
+ }
+
+ //NOTE(review): r.mate is passed on without a null check; a null r.mate fails inside the recursive call.
+ if(mergeMate && mate!=null){
+ mate.mergePrivate(r.mate, mergeVectors, mergeN, false);
+ assert(copies==mate.copies);
+ }
+ assert(copies>1);
+
+ assert(r!=this);
+ assert(r!=this.mate);
+ assert(r!=r.mate);
+ assert(this!=this.mate);
+ assert(r.mate==null || r.mate.mate==r);
+ assert(this.mate==null || this.mate.mate==this);
+ assert(r.mate==null || r.numericID==r.mate.numericID);
+ assert(mate==null || numericID==mate.numericID);
+ }
+
+ public String toString(){return toText(false).toString();}
+
+ /**
+  * Renders the site list as tab-delimited text, skipping null entries.
+  * @return The rendered sites, or "." when there is nothing to render
+  */
+ public StringBuilder toSites(){
+     if(numSites()==0){
+         return new StringBuilder(2).append('.');
+     }
+     final StringBuilder out=new StringBuilder(sites.size()*20);
+     int written=0;
+     for(final SiteScore site : sites){
+         if(written>0){out.append('\t');}
+         if(site==null){continue;}
+         out.append(site.toText());
+         written++;
+     }
+     if(written==0){out.append('.');}
+     return out;
+ }
+
+ /**
+  * Appends the site list as tab-delimited text, skipping null entries;
+  * appends "." when there is nothing to render.
+  * @param sb Destination; a new ByteBuilder is allocated when null
+  * @return The destination builder
+  */
+ public ByteBuilder toSitesB(ByteBuilder sb){
+     final boolean empty=(numSites()==0);
+     if(sb==null){
+         sb=new ByteBuilder(empty ? 2 : sites.size()*20);
+     }
+     if(empty){
+         sb.append('.');
+         return sb;
+     }
+     int written=0;
+     for(final SiteScore site : sites){
+         if(written>0){sb.append('\t');}
+         if(site==null){continue;}
+         site.toBytes(sb);
+         written++;
+     }
+     if(written==0){sb.append('.');}
+     return sb;
+ }
+
+ /**
+  * @return An empty builder when obj is null, obj itself when it is exactly a
+  * StringBuilder, otherwise a new builder holding obj.toString()
+  */
+ public StringBuilder toInfo(){
+     final StringBuilder info;
+     if(obj==null){
+         info=new StringBuilder();
+     }else if(obj.getClass()==StringBuilder.class){
+         info=(StringBuilder)obj;
+     }else{
+         info=new StringBuilder(obj.toString());
+     }
+     return info;
+ }
+
+ /**
+  * @return An empty builder when obj is null, obj itself when it is exactly a
+  * ByteBuilder, otherwise a new builder holding obj.toString()
+  */
+ public ByteBuilder toInfoB(){
+     final ByteBuilder info;
+     if(obj==null){
+         info=new ByteBuilder();
+     }else if(obj.getClass()==ByteBuilder.class){
+         info=(ByteBuilder)obj;
+     }else{
+         info=new ByteBuilder(obj.toString());
+     }
+     return info;
+ }
+
+ /**
+  * Appends obj to bb, if obj is set.
+  * @param bb Destination builder
+  * @return bb unchanged when obj is null, otherwise the result of the append
+  */
+ public ByteBuilder toInfoB(ByteBuilder bb){
+     if(obj==null){
+         return bb;
+     }
+     final boolean isByteBuilder=(obj.getClass()==ByteBuilder.class);
+     return isByteBuilder ? bb.append((ByteBuilder)obj) : bb.append(obj);
+ }
+
+ /** @return This read rendered by FASTQ.toFASTQ into a StringBuilder. */
+ public StringBuilder toFastq(){
+     final StringBuilder none=null;
+     return FASTQ.toFASTQ(this, none);
+ }
+
+ /**
+  * Renders this read via FASTQ.toFASTQ.
+  * @param bb Builder handed to FASTQ.toFASTQ
+  * @return Whatever FASTQ.toFASTQ returns
+  */
+ public ByteBuilder toFastq(ByteBuilder bb){
+     final ByteBuilder result=FASTQ.toFASTQ(this, bb);
+     return result;
+ }
+
+ public StringBuilder toFasta(){return toFasta(Shared.FASTA_WRAP);}
+ public ByteBuilder toFasta(ByteBuilder bb){return toFasta(Shared.FASTA_WRAP, bb);}
+
+ /**
+  * Renders this read as FASTA text: a '>' header line followed by the bases,
+  * broken into lines of at most wrap characters. No trailing newline is
+  * written after the last line of bases.
+  * @param wrap Line width; values below 1 disable wrapping
+  * @return A new builder holding the FASTA record
+  */
+ public StringBuilder toFasta(int wrap){
+     final int width=(wrap<1 ? Integer.MAX_VALUE : wrap);
+     final int nameLen=(id==null ? Tools.stringLength(numericID) : id.length());
+     final int seqLen=(bases==null ? 0 : bases.length+bases.length/width);
+     final StringBuilder out=new StringBuilder(nameLen+seqLen+5);
+
+     out.append('>');
+     if(id==null || FASTQ.TAG_CUSTOM){
+         out.append(FASTQ.TAG_CUSTOM ? FASTQ.customID(this) : ""+numericID);
+     }else{
+         out.append(id);
+     }
+     out.append('\n');
+
+     if(bases==null){return out;}
+
+     final char[] line=Shared.getTLCB(Tools.min(width, bases.length));
+     int filled=0;
+     for(final byte b : bases){
+         if(filled==width){
+             //Flush a full line before storing the next base.
+             out.append(line, 0, filled);
+             out.append('\n');
+             filled=0;
+         }
+         line[filled]=(char)b;
+         filled++;
+     }
+     if(filled>0){out.append(line, 0, filled);}
+     return out;
+ }
+
+ public ByteBuilder toFasta(int wrap, ByteBuilder sb){
+ if(wrap<1){wrap=Integer.MAX_VALUE;}
+ int len=(id==null ? Tools.stringLength(numericID) : id.length())+(bases==null ? 0 : bases.length+bases.length/wrap)+5;
+ if(sb==null){sb=new ByteBuilder(len+1);}
+ sb.append('>');
+ if(id==null){sb.append(numericID);}
+ else{sb.append(id);}
+ if(bases!=null){
+ int pos=0;
+ while(pos<bases.length-wrap){
+ sb.append('\n');
+ sb.append(bases, pos, wrap);
+ pos+=wrap;
+ }
+ if(pos<bases.length){
+ sb.append('\n');
+ sb.append(bases, pos, bases.length-pos);
+ }
+ }
+ return sb;
+ }
+
+ /** @return The text of a SamLine built from this read and its pair number. */
+ public StringBuilder toSam(){
+     final SamLine line=new SamLine(this, pairnum());
+     return line.toText();
+ }
+
+ /**
+  * Writes a SamLine built from this read into bb, followed by a newline.
+  * @param bb Builder handed to SamLine.toBytes
+  * @return The builder returned by SamLine.toBytes after the newline is appended
+  */
+ public ByteBuilder toSam(ByteBuilder bb){
+     final ByteBuilder written=new SamLine(this, pairnum()).toBytes(bb);
+     return written.append('\n');
+ }
+
+ /**
+  * Builds the tab-delimited column header for the native text format.
+  * The columns mirror what toText emits: the previous header listed a "length"
+  * column that toText never writes and omitted the "gaps" column that it
+  * writes right after "match", so every label from "bases" onward was
+  * shifted by one.
+  * @return The header line, without a trailing newline
+  */
+ public static CharSequence header(){
+     final StringBuilder sb=new StringBuilder();
+     sb.append("id").append('\t');
+     sb.append("numericID").append('\t');
+     sb.append("chrom").append('\t');
+     sb.append("strand").append('\t');
+     sb.append("start").append('\t');
+     sb.append("stop").append('\t');
+
+     sb.append("flags").append('\t');
+     sb.append("copies").append('\t');
+
+     sb.append("errors,fixed").append('\t');
+     sb.append("mapScore").append('\t');
+
+     sb.append("bases").append('\t');
+     sb.append("quality").append('\t');
+
+     sb.append("insert").append('\t');
+     //Not really necessary, but toText always writes this column.
+     sb.append("avgQual").append('\t');
+
+     sb.append("match").append('\t');
+     sb.append("gaps").append('\t');
+     sb.append("SiteScores: "+SiteScore.header());
+     return sb;
+ }
+
+ /**
+  * Renders this read in the tab-delimited native text format.
+  * Columns: id, numericID, chrom, strand, start, stop, flags (one digit per
+  * mask, highest first), copies, errors, mapScore, bases, quality, insert,
+  * average quality, match, gaps ('~'-delimited), then one column per site and
+  * finally the original site prefixed with '*'. Missing values are written as '.'.
+  * <p>The match string may be temporarily compressed for output; the original
+  * match and short-match flag are restored before returning.
+  * <p>A non-null but empty quality array now yields '.' for the average
+  * quality instead of failing with a division by zero.
+  * @param okToCompressMatch Allow writing the short form of the match string
+  * (only applies when COMPRESS_MATCH_BEFORE_WRITING is set)
+  * @return A new builder holding the rendered line, without a trailing newline
+  */
+ public StringBuilder toText(boolean okToCompressMatch){
+
+     final byte[] oldmatch=match;
+     final boolean oldshortmatch=this.shortmatch();
+     if(COMPRESS_MATCH_BEFORE_WRITING && !shortmatch() && okToCompressMatch){
+         match=toShortMatchString(match);
+         setShortMatch(true);
+     }
+
+     StringBuilder sb=new StringBuilder();
+     sb.append(id);
+     sb.append('\t');
+     sb.append(numericID);
+     sb.append('\t');
+     sb.append(chrom);
+     sb.append('\t');
+     sb.append(Gene.strandCodes[strand()]);
+     sb.append('\t');
+     sb.append(start);
+     sb.append('\t');
+     sb.append(stop);
+     sb.append('\t');
+
+     for(int i=maskArray.length-1; i>=0; i--){
+         sb.append(flagToNumber(maskArray[i]));
+     }
+     sb.append('\t');
+
+     sb.append(copies);
+     sb.append('\t');
+
+     sb.append(errors);
+     sb.append('\t');
+     sb.append(mapScore);
+     sb.append('\t');
+
+     if(bases==null){sb.append('.');}
+     else{
+         for(int i=0; i<bases.length; i++){
+             byte b=bases[i];
+             if(b<4){
+                 //Numeric codes 0-3 are written as the digits '0'-'3'.
+                 assert(b>=0);
+                 b=(byte) (b+'0');
+             }
+             sb.append((char)b);
+         }
+     }
+     sb.append('\t');
+
+     int qualSum=0;
+
+     if(quality==null){
+         sb.append('.');
+     }else{
+         for(int i=0; i<quality.length; i++){
+             byte q=quality[i];
+             qualSum+=q;
+             q=(byte) (q+ASCII_OFFSET);
+             sb.append((char)q);
+         }
+     }
+     sb.append('\t');
+
+     if(insert<1){sb.append('.');}else{sb.append(insert);}
+     sb.append('\t');
+
+     //Average quality; not really necessary. Guard against an empty array.
+     if(quality==null || quality.length<1){
+         sb.append('.');
+     }else{
+         sb.append(qualSum/quality.length);
+     }
+     sb.append('\t');
+
+     if(match==null){sb.append('.');}
+     else{for(byte b : match){sb.append((char)b);}}
+     sb.append('\t');
+
+     if(gaps==null){
+         sb.append('.');
+     }else{
+         for(int i=0; i<gaps.length; i++){
+             if(i>0){sb.append('~');}
+             sb.append(gaps[i]);
+         }
+     }
+
+     if(sites!=null && sites.size()>0){
+
+         assert(absdif(start, stop)<3000 || (gaps==null) == (sites.get(0).gaps==null)) :
+             "\n"+this.numericID+"\n"+Arrays.toString(gaps)+"\n"+sites.toString()+"\n";
+
+         for(SiteScore ss : sites){
+             sb.append('\t');
+             sb.append(ss==null ? "null" : ss.toText());
+         }
+     }
+
+     if(originalSite!=null){
+         sb.append('\t');
+         sb.append('*');
+         sb.append(originalSite.toText());
+     }
+
+     match=oldmatch;
+     setShortMatch(oldshortmatch);
+
+     return sb;
+ }
+
+ /**
+  * Appends this read in the tab-delimited native text format to a ByteBuilder.
+  * The column layout matches toText(boolean), except that the average-quality
+  * column is always written as '.'.
+  * <p>Each site is now written exactly once. Previously every site was
+  * appended twice in the same column (once through toBytes and again through
+  * toText), and a null site produced "null" twice.
+  * <p>The match string may be temporarily compressed for output; the original
+  * match and short-match flag are restored before returning.
+  * @param okToCompressMatch Allow writing the short form of the match string
+  * (only applies when COMPRESS_MATCH_BEFORE_WRITING is set)
+  * @param bb Destination; a new ByteBuilder is allocated when null
+  * @return The destination builder
+  */
+ public ByteBuilder toText(boolean okToCompressMatch, ByteBuilder bb){
+
+     final byte[] oldmatch=match;
+     final boolean oldshortmatch=this.shortmatch();
+     if(COMPRESS_MATCH_BEFORE_WRITING && !shortmatch() && okToCompressMatch){
+         match=toShortMatchString(match);
+         setShortMatch(true);
+     }
+
+     if(bb==null){bb=new ByteBuilder();}
+     bb.append(id);
+     bb.append('\t');
+     bb.append(numericID);
+     bb.append('\t');
+     bb.append(chrom);
+     bb.append('\t');
+     bb.append(Gene.strandCodes2[strand()]);
+     bb.append('\t');
+     bb.append(start);
+     bb.append('\t');
+     bb.append(stop);
+     bb.append('\t');
+
+     for(int i=maskArray.length-1; i>=0; i--){
+         bb.append(flagToNumber(maskArray[i]));
+     }
+     bb.append('\t');
+
+     bb.append(copies);
+     bb.append('\t');
+
+     bb.append(errors);
+     bb.append('\t');
+     bb.append(mapScore);
+     bb.append('\t');
+
+     if(bases==null){bb.append('.');}
+     else{bb.append(bases);}
+     bb.append('\t');
+
+     if(quality==null){
+         bb.append('.');
+     }else{
+         //Write the offset qualities straight into the backing array.
+         bb.ensureExtra(quality.length);
+         for(int i=0, j=bb.length; i<quality.length; i++, j++){
+             byte q=quality[i];
+             bb.array[j]=(byte)(q+ASCII_OFFSET);
+         }
+         bb.length+=quality.length;
+     }
+     bb.append('\t');
+
+     if(insert<1){bb.append('.');}else{bb.append(insert);}
+     bb.append('\t');
+
+     //Average quality is never computed by this variant.
+     bb.append('.');
+     bb.append('\t');
+
+     if(match==null){bb.append('.');}
+     else{bb.append(match);}
+     bb.append('\t');
+
+     if(gaps==null){
+         bb.append('.');
+     }else{
+         for(int i=0; i<gaps.length; i++){
+             if(i>0){bb.append('~');}
+             bb.append(gaps[i]);
+         }
+     }
+
+     if(sites!=null && sites.size()>0){
+
+         assert(absdif(start, stop)<3000 || (gaps==null) == (sites.get(0).gaps==null)) :
+             "\n"+this.numericID+"\n"+Arrays.toString(gaps)+"\n"+sites.toString()+"\n";
+
+         for(SiteScore ss : sites){
+             bb.append('\t');
+             if(ss==null){
+                 bb.append("null");
+             }else{
+                 ss.toBytes(bb);
+             }
+         }
+     }
+
+     if(originalSite!=null){
+         bb.append('\t');
+         bb.append('*');
+         originalSite.toBytes(bb);
+     }
+
+     match=oldmatch;
+     setShortMatch(oldshortmatch);
+
+     return bb;
+ }
+
+ /**
+  * Parses a read from one line of the tab-delimited native text format.
+  * Column indices used: 0 id, 1 numericID, 2 chrom, 4 start, 5 stop,
+  * 6 flags (binary), 7 copies, 8 errors (optionally "errors,fixed"),
+  * 9 mapScore, 10 bases, 11 quality, 12 insert, 14 match, 15 gaps.
+  * <p>Fixes relative to the earlier implementation:
+  * <ul>
+  * <li>Gaps are parsed from the same column (15) that is tested for '.';
+  * previously column 16 was parsed, which is not where toText writes gaps.</li>
+  * <li>chrom is parsed as an int rather than a byte, so values above 127 load.</li>
+  * </ul>
+  * NOTE(review): the minimum column count (17) and the first site column
+  * (17 or 18) look one higher than what toText writes (gaps in column 15,
+  * sites from column 16). They are left unchanged here - confirm against the
+  * writer before adjusting.
+  * @param line One line of native-format text
+  * @return The parsed read, or null when the line is exactly "."
+  * @throws RuntimeException if the line has fewer than 17 tab-delimited columns
+  */
+ public static Read fromText(String line){
+     if(line.length()==1 && line.charAt(0)=='.'){return null;}
+
+     String[] split=line.split("\t");
+
+     if(split.length<17){
+         throw new RuntimeException("Error parsing read from text.\n\n" +
+                 "This may be caused be attempting to parse the wrong format.\n" +
+                 "Please ensure that the file extension is correct:\n" +
+                 "\tFASTQ should end in .fastq or .fq\n" +
+                 "\tFASTA should end in .fasta or .fa, .fas, .fna, .ffn, .frn, .seq, .fsa\n" +
+                 "\tSAM should end in .sam\n" +
+                 "\tNative format should end in .txt or .bread\n" +
+                 "If a file is compressed, there must be a compression extension after the format extension:\n" +
+                 "\tgzipped files should end in .gz or .gzip\n" +
+                 "\tzipped files should end in .zip and have only 1 file per archive\n" +
+                 "\tbz2 files should end in .bz2\n");
+     }
+
+     final String id=new String(split[0]);
+     long numericID=Long.parseLong(split[1]);
+     int chrom=Integer.parseInt(split[2]);
+     //split[3] is the strand; it is recovered from the flags instead.
+     int start=Integer.parseInt(split[4]);
+     int stop=Integer.parseInt(split[5]);
+
+     int flags=Integer.parseInt(split[6], 2);
+     boolean cs=((flags&COLORMASK)!=0);
+
+     int copies=Integer.parseInt(split[7]);
+
+     int errors;
+     int errorsCorrected;//Parsed so malformed input is still rejected; not stored.
+     if(split[8].indexOf(',')>=0){
+         String[] estring=split[8].split(",");
+         errors=Integer.parseInt(estring[0]);
+         errorsCorrected=Integer.parseInt(estring[1]);
+     }else{
+         errors=Integer.parseInt(split[8]);
+         errorsCorrected=0;
+     }
+
+     int mapScore=Integer.parseInt(split[9]);
+
+     byte[] basesOriginal=split[10].getBytes();
+     byte[] qualityOriginal=(split[11].equals(".") ? null : split[11].getBytes());
+
+     if(cs){
+         //Digits '0'-'3' are stored as the numeric codes 0-3.
+         for(int i=0; i<basesOriginal.length; i++){
+             byte b=basesOriginal[i];
+             if(b>='0' && b<='3'){
+                 b=(byte) (b-'0');
+             }
+             basesOriginal[i]=b;
+         }
+     }
+
+     if(qualityOriginal!=null){
+         for(int i=0; i<qualityOriginal.length; i++){
+             byte b=qualityOriginal[i];
+             b=(byte) (b-ASCII_OFFSET);
+             assert(b>=-1) : b;
+             qualityOriginal[i]=b;
+         }
+     }
+
+     int insert=-1;
+     if(!split[12].equals(".")){insert=Integer.parseInt(split[12]);}
+
+     byte[] match=null;
+     if(!split[14].equals(".")){match=split[14].getBytes();}
+     int[] gaps=null;
+     if(!split[15].equals(".")){
+         //Test and parse the same column.
+         String[] gstring=split[15].split("~");
+         gaps=new int[gstring.length];
+         for(int i=0; i<gstring.length; i++){
+             gaps[i]=Integer.parseInt(gstring[i]);
+         }
+     }
+
+     Read r=new Read(basesOriginal, chrom, start, stop, id, qualityOriginal, numericID, flags);
+     r.match=match;
+     r.errors=errors;
+     r.mapScore=mapScore;
+     r.copies=copies;
+     r.gaps=gaps;
+     r.insert=insert;
+
+     int firstScore=(ADD_BEST_SITE_TO_LIST_FROM_TEXT) ? 17 : 18;
+
+     //Count list sites; entries starting with '*' denote the original site.
+     int mSites=0;
+     for(int i=firstScore; i<split.length; i++){
+         if(split[i].charAt(0)!='*'){mSites++;}
+     }
+
+     //This can be disabled to handle very old text format.
+     if(mSites>0){r.sites=new ArrayList<SiteScore>(mSites);}
+     for(int i=firstScore; i<split.length; i++){
+         SiteScore ss=SiteScore.fromText(split[i]);
+         if(split[i].charAt(0)=='*'){r.originalSite=ss;}
+         else{r.sites.add(ss);}
+     }
+
+     if(DECOMPRESS_MATCH_ON_LOAD && r.shortmatch()){
+         r.match=toLongMatchString(match);
+         r.setShortMatch(false);
+     }
+
+     assert(r.numSites()==0 || absdif(r.start, r.stop)<3000 || (r.gaps==null) == (r.topSite().gaps==null)) :
+         "\n"+r.numericID+", "+r.chrom+", "+r.strand()+", "+r.start+", "+r.stop+", "+Arrays.toString(r.gaps)+"\n"+r.sites+"\n"+line+"\n";
+
+     return r;
+ }
+
+ /**
+  * Inflates gaps between contigs in a scaffold.
+  * Every run of 'N' whose length is at least minGapIn but below minGapOut is
+  * padded with extra 'N' bases until it is minGapOut long. Padding positions
+  * receive quality 0. Shorter and longer runs are left alone, and the read is
+  * only replaced when something was actually added.
+  * <p>Padding qualities are appended as a raw zero byte. The previous code
+  * passed an int literal, which selects the numeric-text append overload
+  * (the one toText relies on to print chrom/start/stop) and therefore stored
+  * the character '0' (48) as the quality.
+  * @param minGapIn Shortest run of 'N' that counts as a gap; must be positive
+  * @param minGapOut Length to which qualifying gaps are extended
+  */
+ public void inflateGaps(int minGapIn, int minGapOut) {
+     assert(minGapIn>0);
+     if(!containsNocalls()){return;}
+     final ByteBuilder bbb=new ByteBuilder();
+     final ByteBuilder bbq=(quality==null ? null : new ByteBuilder());
+     final byte zeroQual=0;
+
+     int gap=0;
+     for(int i=0; i<bases.length; i++){
+         byte b=bases[i];
+         byte q=(quality==null ? 0 : quality[i]);
+         if(b=='N'){
+             gap++;
+         }else{
+             //The gap just ended; pad it before emitting the next base.
+             while(gap>=minGapIn && gap<minGapOut){
+                 gap++;
+                 bbb.append('N');
+                 if(bbq!=null){bbq.append(zeroQual);}
+             }
+             gap=0;
+         }
+         bbb.append(b);
+         if(bbq!=null){bbq.append(q);}
+     }
+
+     while(gap>=minGapIn && gap<minGapOut){//Handle trailing bases
+         gap++;
+         bbb.append('N');
+         if(bbq!=null){bbq.append(zeroQual);}
+     }
+
+     assert(bbb.length()>=bases.length);
+     if(bbb.length()>bases.length){
+         bases=bbb.toBytes();
+         if(bbq!=null){quality=bbq.toBytes();}
+     }
+ }
+
+ /**
+  * Splits this read into contigs at every run of 'N'.
+  * Each contig becomes a new Read named id+"_c"+index, sharing this read's
+  * numericID; contigs shorter than minContig are dropped from the result but
+  * still consume an index.
+  * <p>When agp is true, a tab-delimited description of contigs ('W' lines) and
+  * gaps ('N' lines) with 1-based coordinates is built and stored in obj as a
+  * String. Leading no-calls produce no gap line, because a gap line requires
+  * a preceding base.
+  * @param agp If true, also generate the layout text and store it in obj
+  * @param minContig Minimum contig length to include in the returned list
+  * @return The contigs that are at least minContig long, in order
+  */
+ public ArrayList<Read> breakAtGaps(final boolean agp, final int minContig){
+ ArrayList<Read> list=new ArrayList<Read>();
+ //Starting as 'N' keeps leading no-calls from producing an empty contig.
+ byte prev='N';
+ int lastN=-1, lastBase=-1;
+ int contignum=1;
+ long feature=1;
+ StringBuilder sb=(agp ? new StringBuilder() : null);
+ assert(obj==null);
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ if(b=='N'){
+ if(prev!='N'){
+ //A contig just ended at i-1; it spans [lastN+1, i).
+ final int start=lastN+1, stop=i;
+ byte[] b2=Arrays.copyOfRange(bases, start, stop);
+ byte[] q2=(quality==null ? null : Arrays.copyOfRange(quality, start, stop));
+ Read r=new Read(b2, q2, numericID, id+"_c"+contignum);
+ if(r.length()>=minContig){list.add(r);}
+ contignum++;
+
+ if(sb!=null){
+ sb.append(id).append('\t');
+ sb.append(start+1).append('\t');
+ sb.append(stop).append('\t');
+ sb.append(feature).append('\t');
+ feature++;
+ sb.append('W').append('\t');
+ sb.append(r.id).append('\t');
+ sb.append(1).append('\t');
+ sb.append(r.length()).append('\t');
+ sb.append("+").append('\n');
+ }
+ }
+ lastN=i;
+ }else{
+ //First base after a gap: describe the gap between lastBase and i.
+ if(sb!=null && prev=='N' && lastBase>=0){
+ sb.append(id).append('\t');
+ sb.append(lastBase+2).append('\t');
+ sb.append(i).append('\t');
+ sb.append(feature).append('\t');
+ feature++;
+ sb.append('N').append('\t');
+ sb.append((i-lastBase-1)).append('\t');
+ sb.append("scaffold").append('\t');
+ sb.append("yes").append('\t');
+ sb.append("paired-ends").append('\n');
+ }
+ lastBase=i;
+ }
+ prev=b;
+ }
+ //Flush whatever is still open at the end: a final contig or a trailing gap.
+ if(prev!='N'){
+ final int start=lastN+1, stop=bases.length;
+ byte[] b2=Arrays.copyOfRange(bases, start, stop);
+ byte[] q2=(quality==null ? null : Arrays.copyOfRange(quality, start, stop));
+ Read r=new Read(b2, q2, numericID, id+"_c"+contignum);
+ if(r.length()>=minContig){list.add(r);}
+ contignum++;
+
+ if(sb!=null){
+ sb.append(id).append('\t');
+ sb.append(start+1).append('\t');
+ sb.append(stop).append('\t');
+ sb.append(feature).append('\t');
+ feature++;
+ sb.append('W').append('\t');
+ sb.append(r.id).append('\t');
+ sb.append(1).append('\t');
+ sb.append(r.length()).append('\t');
+ sb.append("+").append('\n');
+ }
+ }else{
+ if(sb!=null && prev=='N' && lastBase>=0){
+ sb.append(id).append('\t');
+ sb.append(lastBase+2).append('\t');
+ sb.append(bases.length).append('\t');
+ sb.append(feature).append('\t');
+ feature++;
+ sb.append('N').append('\t');
+ sb.append((bases.length-lastBase-1)).append('\t');
+ sb.append("scaffold").append('\t');
+ sb.append("yes").append('\t');
+ sb.append("paired-ends").append('\n');
+ }
+ lastBase=bases.length;
+ }
+ if(sb!=null){obj=sb.toString();}
+ return list;
+ }
+
+// /** Reverses the read. Mainly for testing.
+// * Seems to be unused. */
+// @Deprecated
+// protected void reverse() {
+// Tools.reverseInPlace(bases);
+// Tools.reverseInPlace(quality);
+// Tools.reverseInPlace(match);
+// }
+
+ /**
+  * Reverse-complements the read: bases are reverse-complemented in place,
+  * qualities are reversed in place, and the strand bit is flipped.
+  */
+ public void reverseComplement() {
+     final int flippedStrand=strand()^1;
+     AminoAcid.reverseComplementBasesInPlace(bases);
+     Tools.reverseInPlace(quality);
+     setStrand(flippedStrand);
+ }
+
+ /**
+  * Orders reads by chrom, then start, then stop, then strand.
+  * Fields are compared directly rather than subtracted, so coordinates of
+  * opposite sign and large magnitude cannot overflow and invert the order.
+  * @param o Read to compare against
+  * @return A negative value, zero, or a positive value as this read sorts
+  * before, equal to, or after o
+  */
+ @Override
+ public int compareTo(Read o) {
+     if(chrom!=o.chrom){return chrom<o.chrom ? -1 : 1;}
+     if(start!=o.start){return start<o.start ? -1 : 1;}
+     if(stop!=o.stop){return stop<o.stop ? -1 : 1;}
+     //Strand values are bytes, so their difference cannot overflow.
+     if(strand()!=o.strand()){return strand()-o.strand();}
+     return 0;
+ }
+
+ /**
+  * Builds a SiteScore from this read's current location, score, gaps and
+  * match string, and also stores it as originalSite.
+  * @return The newly created site
+  */
+ public SiteScore toSite(){
+     assert(start<=stop) : this.toText(false);
+     final SiteScore site=new SiteScore(chrom, strand(), start, stop, 0, 0, rescued(), perfect());
+
+     //Paired reads carry the map score as the paired score.
+     final boolean isPaired=paired();
+     final int slow=(isPaired ? mapScore-1 : mapScore);
+     final int pairedScore=(isPaired ? mapScore : 0);
+     site.setSlowPairedScore(slow, pairedScore);
+
+     site.setScore(mapScore);
+     site.gaps=gaps;
+     site.match=match;
+     originalSite=site;
+     return site;
+ }
+
+ /** @return The first entry of the site list, or null when there are no sites. */
+ public SiteScore topSite(){
+     final boolean noSites=(sites==null || sites.isEmpty());
+     final SiteScore top=(noSites ? null : sites.get(0));
+     assert(noSites || top!=null) : "Top site is null for read "+this;
+     return top;
+ }
+
+ /** @return The number of entries in the site list; 0 when the list is null. */
+ public int numSites(){
+     if(sites==null){
+         return 0;
+     }
+     return sites.size();
+ }
+
+ /**
+  * Creates a site from the current mapping via toSite and records it as originalSite.
+  * @return The site now stored in originalSite
+  */
+ public SiteScore makeOriginalSite(){
+     final SiteScore site=toSite();
+     originalSite=site;
+     return originalSite;
+ }
+
+ /**
+  * Copies location, strand, score, flags, gaps and match string from a site
+  * into this read. When the site has gaps, they are passed through
+  * GapTools.fixGaps and the result is stored in both the read and the site.
+  * @param ss Site to copy from; must not be null
+  */
+ public void setFromSite(SiteScore ss){
+     assert(ss!=null);
+
+     //Location
+     chrom=ss.chrom;
+     setStrand(ss.strand);
+     start=ss.start;
+     stop=ss.stop;
+
+     //Score and status
+     mapScore=ss.slowScore;
+     setRescued(ss.rescued);
+     gaps=ss.gaps;
+     setPerfect(ss.perfect);
+
+     match=ss.match;
+
+     if(gaps==null){return;}
+     final int[] fixed=GapTools.fixGaps(start, stop, gaps, Shared.MINGAP);
+     ss.gaps=fixed;
+     gaps=fixed;
+ }
+
+// public static int[] fixGaps(int a, int b, int[] gaps, int minGap){
+//// System.err.println("fixGaps input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap);
+// int[] r=GapTools.fixGaps(a, b, gaps, minGap);
+//// System.err.println("fixGaps output: "+Arrays.toString(r));
+// return r;
+// }
+
+ /** Applies originalSite to this read via setFromSite. */
+ public void setFromOriginalSite(){
+     final SiteScore site=originalSite;
+     setFromSite(site);
+ }
+ /**
+  * Applies the top site to this read and marks it mapped; when there is no
+  * top site, the location is cleared and the read is marked unmapped.
+  */
+ public void setFromTopSite(){
+     final SiteScore top=topSite();
+     if(top!=null){
+         setMapped(true);
+         setFromSite(top);
+     }else{
+         clearSite();
+         setMapped(false);
+     }
+ }
+
+ /**
+  * Applies a top site to this read, optionally choosing among equally good
+  * sites when the read is ambiguous.
+  * <p>With no sites, the location is cleared and the read is marked unmapped.
+  * With a single site, or when randomIfAmbiguous is false, or when the read is
+  * not ambiguous, the first site is used. Otherwise, in the primary/unpaired
+  * case, one of the leading sites that tie with the first is selected by
+  * numericID modulo the tie count and swapped into position 0.
+  * @param randomIfAmbiguous Pick among tied sites instead of always using the first
+  * @param primary Take the tie-breaking path even when a mapped, paired mate exists
+  * @param maxPairDist Only passed along to the recursive call; the code that
+  * used it is commented out
+  */
+ public void setFromTopSite(boolean randomIfAmbiguous, boolean primary, int maxPairDist){
+ final SiteScore ss0=topSite();
+ if(ss0==null){
+ clearSite();
+ setMapped(false);
+ return;
+ }
+ setMapped(true);
+
+ if(sites.size()==1 || !randomIfAmbiguous || !ambiguous()){
+ setFromSite(ss0);
+ return;
+ }
+
+ if(primary || mate==null || !mate.mapped() || !mate.paired()){
+ //Count the leading sites that are as good as the first one.
+ int count=1;
+ for(int i=1; i<sites.size(); i++){
+ SiteScore ss=sites.get(i);
+ if(ss.score<ss0.score || (ss0.perfect && !ss.perfect) || (ss0.semiperfect && !ss.semiperfect)){break;}
+ count++;
+ }
+
+ //Deterministic choice: the same read always selects the same site.
+ int x=(int)(numericID%count);
+ if(x>0){
+ SiteScore ss=sites.get(x);
+ sites.set(0, ss);
+ sites.set(x, ss0);
+ }
+ setFromSite(sites.get(0));
+ return;
+ }
+
+// assert(false) : "TODO: Proper strand orientation, and more.";
+ //TODO: Also, this code appears to sometimes duplicate sitescores(?)
+// for(int i=0; i<list.size(); i++){
+// SiteScore ss=list.get(i);
+// if(ss.chrom==mate.chrom && Tools.min(Tools.absdifUnsigned(ss.start, mate.stop), Tools.absdifUnsigned(ss.stop, mate.start))<=maxPairDist){
+// list.set(0, ss);
+// list.set(i, ss0);
+// setFromSite(ss);
+// return;
+// }
+// }
+
+ //If unsuccessful, recur unpaired.
+
+ //Both reads lose their paired flag before retrying on the primary path.
+ this.setPaired(false);
+ mate.setPaired(false);
+ setFromTopSite(randomIfAmbiguous, true, maxPairDist);
+ }
+
+ /** Clears the mapping of this read and, if present, of its mate. */
+ public void clearPairMapping(){
+     clearMapping();
+     if(mate==null){return;}
+     mate.clearMapping();
+ }
+
+ /**
+  * Drops the site fields, match string and site list, and clears the mapped and paired flags.
+  * The mate, if any, also loses its paired flag.
+  */
+ public void clearMapping(){
+     clearSite();
+     sites=null;
+     match=null;
+     setMapped(false);
+     setPaired(false);
+     if(mate==null){return;}
+     mate.setPaired(false);
+ }
+
+ /** Resets chrom, start and stop to -1, strand and mapScore to 0, and gaps to null. */
+ public void clearSite(){
+     chrom=-1;
+     start=-1;
+     stop=-1;
+     setStrand(0);
+ //    errors=0;
+     mapScore=0;
+     gaps=null;
+ }
+
+
+ /**
+  * Clears the site fields, match string and site list, and resets every flag except the
+  * synthetic, color, pair-number and swap bits.
+  * @param clearMate If true, the mate (when present) is cleared the same way
+  */
+ public void clearAnswers(boolean clearMate) {
+     final int keep=(SYNTHMASK|COLORMASK|PAIRNUMMASK|SWAPMASK);
+ //    assert(mate==null || (pairnum()==0 && mate.pairnum()==1)) : pairnum()+", "+mate.pairnum();
+     clearSite();
+     match=null;
+     sites=null;
+     flags=(flags&keep);
+     if(!clearMate || mate==null){return;}
+     mate.clearSite();
+     mate.match=null;
+     mate.sites=null;
+     mate.flags=(mate.flags&keep);
+ }
+
+
+ /**
+ * Tests whether this read and its mate are both mapped but not flagged as a proper pair,
+ * and are placed in a way that disqualifies them as a pair.
+ * @param requireCorrectStrands If true, a strand relationship differing from sameStrandPairs is bad
+ * @param sameStrandPairs Whether proper pairs are expected on the same strand
+ * @param maxdist Largest allowed distance between the inner ends of the two reads
+ * @return false if there is no mate, the pair is already flagged paired, or either read is unmapped
+ */
+ public boolean isBadPair(boolean requireCorrectStrands, boolean sameStrandPairs, int maxdist){
+ if(mate==null || paired()){return false;}
+ if(!mapped() || !mate.mapped()){return false;}
+ if(chrom!=mate.chrom){return true;}
+
+ {
+ //Distance from the stop of the left read to the start of the right read.
+ int inner;
+ if(start<=mate.start){inner=mate.start-stop;}
+ else{inner=start-mate.stop;}
+ if(inner>maxdist){return true;}
+ }
+// if(absdif(start, mate.start)>maxdist){return true;}
+ if(requireCorrectStrands){
+ if((strand()==mate.strand())!=sameStrandPairs){return true;}
+ }
+ if(!sameStrandPairs){
+ //For opposite-strand pairs, the plus read must not start at or beyond the minus read's stop.
+ if(strand()==Gene.PLUS && mate.strand()==Gene.MINUS){
+ if(start>=mate.stop){return true;}
+ }else if(strand()==Gene.MINUS && mate.strand()==Gene.PLUS){
+ if(mate.start>=stop){return true;}
+ }
+ }
+ return false;
+ }
+
+ /** Counts the 'S' symbols in this read's match string. */
+ public int countMismatches(){
+     assert(match!=null);
+     int subs=0;
+     for(int i=0; i<match.length; i++){
+         if(match[i]=='S'){subs++;}
+     }
+     return subs;
+ }
+
+ /**
+  * Counts the positions that end a run of at least k consecutive bases for which
+  * AminoAcid.baseToNumber is non-negative.
+  * @param k Kmer length
+  * @return Number of such positions; 0 if bases is null
+  */
+ public int numValidKmers(int k) {
+     if(bases==null){return 0;}
+     int run=0, valid=0;
+     for(final byte b : bases){
+         final long code=AminoAcid.baseToNumber[b];
+         run=(code<0 ? 0 : run+1);
+         if(run>=k){valid++;}
+     }
+     return valid;
+ }
+
+
+
+ /**
+  * Tallies the symbols of a match string given in long form ("mmmSD") or short form ("m3SD").
+  * @param match Match string
+  * @return Counts in the order {match, sub, del, ins, clip (C/X/Y), nocall (N/R)},
+  * or null for a null or empty input
+  */
+ public static final int[] matchToMsdicn(byte[] match) {
+     if(match==null || match.length<1){return null;}
+     final int[] msdicn=new int[6];
+
+     byte mode='0', c='0';
+     int current=0;
+     for(int i=0; i<match.length; i++){
+         c=match[i];
+         if(Character.isDigit(c)){
+             current=(current*10)+(c-'0');
+         }else if(mode==c){
+             //Repeated letter: the run is at least 2 long.
+             current=Math.max(current+1, 2);
+         }else{
+             //Symbol changed: flush the finished run. A letter without a count stands for 1.
+             addToMsdicn(msdicn, mode, Math.max(current, 1));
+             mode=c;
+             current=0;
+         }
+     }
+     //Flush the final run.
+     if(current>0 || !Character.isDigit(c)){
+         addToMsdicn(msdicn, mode, Math.max(current, 1));
+     }
+     return msdicn;
+ }
+
+ /** Adds a run length to the slot for the given symbol; other symbols, including the initial '0', are ignored. */
+ private static void addToMsdicn(int[] msdicn, byte mode, int amount){
+     if(mode=='m'){
+         msdicn[0]+=amount;
+     }else if(mode=='S'){
+         msdicn[1]+=amount;
+     }else if(mode=='D'){
+         msdicn[2]+=amount;
+     }else if(mode=='I'){
+         msdicn[3]+=amount;
+     }else if(mode=='C' || mode=='X' || mode=='Y'){
+         msdicn[4]+=amount;
+     }else if(mode=='N' || mode=='R'){
+         msdicn[5]+=amount;
+     }
+ }
+
+
+ /**
+  * Computes the reference length covered by a match string in long or short form.
+  * Every symbol except insertions ('I') adds to the length.
+  * @param match Match string
+  * @return Ref length of match string; 0 for a null or empty input
+  */
+ public static final int calcMatchLength(byte[] match) {
+     if(match==null || match.length<1){return 0;}
+
+     byte mode='0', c='0';
+     int current=0;
+     int len=0;
+     for(int i=0; i<match.length; i++){
+         c=match[i];
+         if(Character.isDigit(c)){
+             current=(current*10)+(c-'0');
+         }else if(mode==c){
+             //Repeated letter: the run is at least 2 long.
+             current=Math.max(current+1, 2);
+         }else{
+             //Symbol changed: flush the finished run. A letter without a count stands for 1.
+             len+=refLengthOfRun(mode, Math.max(current, 1));
+             mode=c;
+             current=0;
+         }
+     }
+     //Flush the final run.
+     if(current>0 || !Character.isDigit(c)){
+         len+=refLengthOfRun(mode, Math.max(current, 1));
+     }
+     return len;
+ }
+
+ /** Reference bases covered by a run: its length for m, S, D, C, X, Y, N and R; 0 for 'I' and anything else. */
+ private static int refLengthOfRun(byte mode, int amount){
+     switch(mode){
+         case 'm': case 'S': case 'D':
+         case 'C': case 'X': case 'Y':
+         case 'N': case 'R':
+             return amount;
+         default:
+             return 0;
+     }
+ }
+
+ /** Identity of this read's own match string. */
+ public final float identity() {
+     return identity(match);
+ }
+
+ /** Uses identityFlat when FLAT_IDENTITY is set, otherwise identitySkewed. */
+ public static final float identity(byte[] match) {
+     return FLAT_IDENTITY ? identityFlat(match) : identitySkewed(match);
+ }
+
+ /**
+  * Handles short or long mode.
+  * Matches count as good; S, I, X, Y and deletions shorter than SamLine.INTRON_LIMIT count as bad
+  * at full length; clipped ('C') symbols are ignored.
+  * @param match string
+  * @return Identity based on number of match, sub, del, ins, or N symbols; 0 for a null or empty input
+  */
+ public static final float identityFlat(byte[] match) {
+     if(match==null || match.length<1){return 0;}
+
+     final int[] tally=new int[3]; //{good, bad, n}
+
+     byte mode='0', c='0';
+     int current=0;
+     for(int i=0; i<match.length; i++){
+         c=match[i];
+         if(Character.isDigit(c)){
+             current=(current*10)+(c-'0');
+         }else if(mode==c){
+             current=Tools.max(current+1, 2);
+         }else{
+             tallyFlat(tally, mode, Tools.max(current, 1));
+             mode=c;
+             current=0;
+         }
+     }
+     if(current>0 || !Character.isDigit(c)){
+         tallyFlat(tally, mode, Tools.max(current, 1));
+     }
+
+     //No-calls are grouped in fours (rounded up); each group adds 1 good and 3 bad.
+     final int n=(tally[2]+3)/4;
+     final int good=tally[0]+n;
+     final int bad=tally[1]+3*n;
+     return good/(float)Tools.max(good+bad, 1);
+ }
+
+ /**
+  * Adds one finished run to the {good, bad, n} tally for identityFlat.
+  * Used for both mid-string and trailing runs, so a trailing deletion passes the same
+  * assertion as any other deletion.
+  */
+ private static void tallyFlat(int[] tally, byte mode, int current){
+     if(mode=='m'){
+         tally[0]+=current;
+     }else if(mode=='R' || mode=='N'){
+         tally[2]+=current;
+     }else if(mode=='C'){
+         //Do nothing
+         //I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity
+     }else if(mode!='0'){
+         assert(mode=='S' || mode=='D' || mode=='I' || mode=='X' || mode=='Y') : (char)mode;
+         if(mode!='D' || current<SamLine.INTRON_LIMIT){
+             tally[1]+=current;
+         }
+     }
+ }
+
+ /**
+  * Handles short or long mode.
+  * Like identityFlat, except that a deletion shorter than SamLine.INTRON_LIMIT is charged
+  * ceil(sqrt(length)) instead of its full length, and each group of four no-calls adds
+  * 1 good and 1 bad.
+  * @param match string
+  * @return Identity based on number of match, sub, del, ins, or N symbols; 0 for a null or empty input
+  */
+ public static final float identitySkewed(byte[] match) {
+     if(match==null || match.length<1){return 0;}
+
+     final int[] tally=new int[3]; //{good, bad, n}
+
+     byte mode='0', c='0';
+     int current=0;
+     for(int i=0; i<match.length; i++){
+         c=match[i];
+         if(Character.isDigit(c)){
+             current=(current*10)+(c-'0');
+         }else if(mode==c){
+             current=Tools.max(current+1, 2);
+         }else{
+             tallySkewed(tally, mode, Tools.max(current, 1));
+             mode=c;
+             current=0;
+         }
+     }
+     if(current>0 || !Character.isDigit(c)){
+         tallySkewed(tally, mode, Tools.max(current, 1));
+     }
+
+     final int n=(tally[2]+3)/4;
+     final int good=tally[0]+n;
+     final int bad=tally[1]+n;
+     return good/(float)Tools.max(good+bad, 1);
+ }
+
+ /**
+  * Adds one finished run to the {good, bad, n} tally for identitySkewed.
+  * Used for both mid-string and trailing runs, so a deletion at the end of the match string
+  * gets the same square-root penalty as one in the middle.
+  */
+ private static void tallySkewed(int[] tally, byte mode, int current){
+     if(mode=='m'){
+         tally[0]+=current;
+     }else if(mode=='D'){
+         if(current<SamLine.INTRON_LIMIT){
+             final int x=(int)Math.ceil(Math.sqrt(current));
+             tally[1]+=Tools.min(x, current);
+         }
+     }else if(mode=='R' || mode=='N'){
+         tally[2]+=current;
+     }else if(mode=='C'){
+         //Do nothing
+         //I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity
+     }else if(mode!='0'){
+         assert(mode=='S' || mode=='I' || mode=='X' || mode=='Y') : (char)mode;
+         tally[1]+=current;
+     }
+ }
+
+ /**
+ * Inspects the four characters after the first space in the read id for a filter flag.
+ * Two layouts are accepted: "/N:F" (N in 1-4) and "N:F:" (N in 1-4), where F is the flag character.
+ * Any other layout calls KillSwitch.kill; with assertions enabled, the second layout asserts first.
+ * @return true if the flag character is 'Y'; false if id is null, has no space,
+ * or has fewer than four characters after the space
+ */
+ public boolean failsChastity(){
+ if(id==null){return false;}
+ int space=id.indexOf(' ');
+ if(space<0 || space+5>id.length()){return false;}
+ char a=id.charAt(space+1);
+ char b=id.charAt(space+2);
+ char c=id.charAt(space+3);
+ char d=id.charAt(space+4);
+
+ if(a=='/'){
+ //Layout "/N:F": b is the read number and d is the flag.
+ if(b<'1' || b>'4' || c!=':'){
+ KillSwitch.kill("Strangely formatted read. Please disable chastityfilter. id:"+id);
+ }
+ return d=='Y';
+ }else{
+ //Layout "N:F:": a is the read number and c is the flag.
+ assert(a=='1' || a=='2' || a=='3' || a=='4') : id;
+ assert(b==':') : id;
+ assert(d==':');
+ if(a<'1' || a>'4' || b!=':' || d!=':'){
+ KillSwitch.kill("Strangely formatted read. Please disable chastityfilter. id:"+id);
+ }
+ return c=='Y';
+ }
+ }
+
+ /**
+  * Checks the text after the last ':' of the read id as a barcode.
+  * @param set Allowed barcodes; if null, the barcode only has to consist of '+' and
+  * characters accepted by AminoAcid.isFullyDefined
+  * @param failIfNoBarcode Result to return when the id has no ':' or nothing after it
+  * @return true if the barcode is rejected; false if id is null
+  */
+ public boolean failsBarcode(HashSet<String> set, boolean failIfNoBarcode){
+     if(id==null){return false;}
+
+     final int colon=id.lastIndexOf(':');
+     final int idLength=id.length();
+     if(colon<0 || colon>=idLength-1){return failIfNoBarcode;}
+
+     if(set!=null){
+         final String code=id.substring(colon+1);
+         return !set.contains(code);
+     }
+
+     for(int i=colon+1; i<idLength; i++){
+         final char c=id.charAt(i);
+         if(c!='+' && !AminoAcid.isFullyDefined(c)){return true;}
+     }
+     return false;
+ }
+
+ /**
+ * Average quality, computed by error probability when AVERAGE_QUALITY_BY_PROBABILITY is set
+ * and by summing quality scores otherwise. countUndefined is only used by the probability path.
+ */
+ public int avgQuality(boolean countUndefined, int maxBases){
+ return AVERAGE_QUALITY_BY_PROBABILITY ? avgQualityByProbability(countUndefined, maxBases) : avgQualityByScore(maxBases);
+ }
+
+ /** Average based on summing error probabilities. Returns 0 for a read with no bases. */
+ public int avgQualityByProbability(boolean countUndefined, int maxBases){
+ if(bases==null || bases.length==0){return 0;}
+ return avgQualityByProbability(bases, quality, countUndefined, maxBases);
+ }
+
+ /**
+ * Average based on summing error probabilities.
+ * @param maxBases Number of leading bases to consider; values below 1 mean all of them
+ * @return 40 if quality is null, 0 if it is empty, otherwise the phred score of the mean error probability
+ */
+ public static int avgQualityByProbability(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
+ if(quality==null){return 40;}
+ if(quality.length==0){return 0;}
+ float e=expectedErrors(bases, quality, countUndefined, maxBases);
+ final int div=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
+ float p=e/div;
+ return QualityTools.probCorrectToPhred(1-p);
+ }
+
+ /**
+  * Average based on summing quality scores; negative scores count as 0.
+  * @param maxBases Number of leading bases to consider; values below 1 mean all of them
+  * @return 0 if there are no bases or no quality values, 40 if quality is null, otherwise the integer mean
+  */
+ public int avgQualityByScore(int maxBases){
+     if(bases==null || bases.length==0){return 0;}
+     if(quality==null){return 40;}
+     final int limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
+     //An empty quality array would otherwise divide by zero; the static avgQualityByProbability also returns 0 for it.
+     if(limit<1){return 0;}
+     int sum=0;
+     for(int i=0; i<limit; i++){
+         final byte q=quality[i];
+         sum+=(q<0 ? 0 : q);
+     }
+     return sum/limit;
+ }
+
+ /**
+  * Used by BBMap tipsearch. Integer mean of the first n quality scores; negative scores count as 0.
+  * @return 0 if there are no bases or n exceeds the quality length; 40 if quality is null or n is below 1
+  */
+ public int avgQualityFirstNBases(int n){
+     if(bases==null || bases.length==0){return 0;}
+     if(quality==null || n<1){return 40;}
+     if(n>quality.length){return 0;}
+     int sum=0;
+     for(int i=0; i<n; i++){
+         sum+=Math.max(quality[i], 0);
+     }
+     return sum/n;
+ }
+
+ /**
+  * Used by BBMap tipsearch. Integer mean of the last n quality scores; negative scores count as 0.
+  * @return 0 if there are no bases or n exceeds the quality length; 40 if quality is null or n is below 1
+  */
+ public int avgQualityLastNBases(int n){
+     if(bases==null || bases.length==0){return 0;}
+     if(quality==null || n<1){return 40;}
+     if(n>quality.length){return 0;}
+     //Index relative to the quality array, which is the array the bounds check above guards.
+     final int from=quality.length-n;
+     int sum=0;
+     for(int i=from; i<quality.length; i++){
+         final byte q=quality[i];
+         sum+=(q<0 ? 0 : q);
+     }
+     return sum/n;
+ }
+
+ /**
+  * Used by BBMap tipsearch. Lowest of the first n quality scores.
+  * @return 0 if there are no bases or n exceeds the quality length; 40 if quality is null or n is below 1
+  */
+ public byte minQualityFirstNBases(int n){
+     if(bases==null || bases.length==0){return 0;}
+     if(quality==null || n<1){return 40;}
+     if(n>quality.length){return 0;}
+     byte lowest=quality[0];
+     for(int i=1; i<n; i++){
+         lowest=(byte)Math.min(lowest, quality[i]);
+     }
+     return lowest;
+ }
+
+ /**
+  * Used by BBMap tipsearch. Lowest of the last n quality scores.
+  * @return 0 if there are no bases or n exceeds the quality length; 40 if quality is null or n is below 1
+  */
+ public byte minQualityLastNBases(int n){
+     if(bases==null || bases.length==0){return 0;}
+     if(quality==null || n<1){return 40;}
+     if(n>quality.length){return 0;}
+     //Index relative to the quality array, which is the array the bounds check above guards.
+     final int from=quality.length-n;
+     byte lowest=quality[from];
+     for(int i=from+1; i<quality.length; i++){
+         final byte q=quality[i];
+         if(q<lowest){lowest=q;}
+     }
+     return lowest;
+ }
+
+ /** True if the match string has a byte above '9' other than 'm'. */
+ public boolean containsNonM(){
+     assert(match!=null && valid());
+     for(final byte sym : match){
+         assert(sym!='M');
+         if(sym>'9' && sym!='m'){return true;}
+     }
+     return false;
+ }
+
+ /** True if the match string has a byte above '9' other than 'm' or 'N'. */
+ public boolean containsNonNM(){
+     assert(match!=null && valid());
+     for(final byte sym : match){
+         assert(sym!='M');
+         if(sym>'9' && sym!='m' && sym!='N'){return true;}
+     }
+     return false;
+ }
+
+ /** True if the match string has a byte above '9' other than 'm', 'N', 'X' or 'Y'. */
+ public boolean containsNonNMXY(){
+     assert(match!=null && valid());
+     for(final byte sym : match){
+         assert(sym!='M');
+         if(sym>'9' && sym!='m' && sym!='N' && sym!='X' && sym!='Y'){return true;}
+     }
+     return false;
+ }
+
+ /** True if the match string has an 'S', 's', 'D' or 'I'. */
+ public boolean containsSDI(){
+     assert(match!=null && valid());
+     for(final byte sym : match){
+         assert(sym!='M');
+         if(sym=='S' || sym=='s' || sym=='D' || sym=='I'){return true;}
+     }
+     return false;
+ }
+
+ /** True if the match string has a byte above '9' other than 'm', 's', 'N' or 'S'. */
+ public boolean containsNonNMS(){
+     assert(match!=null && valid());
+     for(final byte sym : match){
+         assert(sym!='M');
+         if(sym>'9' && sym!='m' && sym!='s' && sym!='N' && sym!='S'){return true;}
+     }
+     return false;
+ }
+
+ /**
+  * Looks for a run of 'S' symbols in a long-form match string.
+  * @param num Run length to look for
+  * @return true once a run of 'S' reaches num symbols
+  */
+ public boolean containsConsecutiveS(int num){
+     assert(match!=null && valid() && !shortmatch());
+     int run=0;
+     for(final byte sym : match){
+         assert(sym!='M');
+         if(sym!='S'){
+             run=0;
+             continue;
+         }
+         run++;
+         if(run>=num){return true;}
+     }
+     return false;
+ }
+
+ /** True if the match string has an 'I', 'D', 'X' or 'Y'. */
+ public boolean containsIndels(){
+     assert(match!=null && valid());
+     for(final byte sym : match){
+         if(sym=='I' || sym=='D' || sym=='X' || sym=='Y'){return true;}
+     }
+     return false;
+ }
+
+ /** Counts 'S' symbols in this read's long-form match string. */
+ public int countSubs(){
+     assert(match!=null && valid() && !shortmatch());
+     final int subs=countSubs(match);
+     return subs;
+ }
+
+ /** True if the match string contains the given symbol. */
+ public boolean containsInMatch(char c){
+     assert(match!=null && valid());
+     for(final byte sym : match){
+         if(sym==c){return true;}
+     }
+     return false;
+ }
+
+ /** True if any base is 'N'. Returns false when bases is null, like the other base scans in this class. */
+ public boolean containsNocalls(){
+     if(bases==null){return false;}
+     for(final byte b : bases){
+         if(b=='N'){return true;}
+     }
+     return false;
+ }
+
+ /** Number of 'N' bases; 0 when bases is null. */
+ public int countNocalls(){
+     return (bases==null ? 0 : countNocalls(bases));
+ }
+
+ /**
+  * Counts 'S' symbols in a long-form match string.
+  * With assertions enabled, a digit in the input fails an assertion.
+  */
+ public static int countSubs(byte[] match){
+     assert(match!=null);
+     int subs=0;
+     for(final byte sym : match){
+         if(sym=='S'){subs++;}
+         assert(!Character.isDigit(sym));
+     }
+     return subs;
+ }
+
+ /**
+  * Counts 'N' bases.
+  * @param bases Bases to scan; may be null
+  * @return Number of 'N' bytes; 0 for a null array
+  */
+ public static int countNocalls(byte[] bases){
+     if(bases==null){return 0;}
+     int n=0;
+     for(final byte b : bases){
+         if(b=='N'){n++;}
+     }
+     return n;
+ }
+
+ /** Counts 'I' bytes in the given array. */
+ public static int countInsertions(byte[] bases){
+     int total=0;
+     for(final byte sym : bases){
+         if(sym=='I'){total++;}
+     }
+     return total;
+ }
+
+ /** Counts 'D' bytes in the given array. */
+ public static int countDeletions(byte[] bases){
+     int total=0;
+     for(final byte sym : bases){
+         if(sym=='D'){total++;}
+     }
+     return total;
+ }
+
+ /** Counts runs of consecutive 'I' bytes; each run counts once. */
+ public static int countInsertionEvents(byte[] bases){
+     int events=0;
+     byte previous='N';
+     for(final byte sym : bases){
+         if(sym=='I' && sym!=previous){events++;}
+         previous=sym;
+     }
+     return events;
+ }
+
+ /** Counts runs of consecutive 'D' bytes; each run counts once. */
+ public static int countDeletionEvents(byte[] bases){
+     int events=0;
+     byte previous='N';
+     for(final byte sym : bases){
+         if(sym=='D' && sym!=previous){events++;}
+         previous=sym;
+     }
+     return events;
+ }
+
+ public boolean containsNonACGTN(){
+ if(bases==null){return false;}
+ for(byte b : bases){
+ if(AminoAcid.baseToNumberACGTN[b]<0){return true;}
+ }
+ return false;
+ }
+
+ public boolean containsUndefined(){
+ if(bases==null){return false;}
+ for(byte b : bases){
+ if(AminoAcid.baseToNumber[b]<0){return true;}
+ }
+ return false;
+ }
+
+ public int countUndefined(){
+ if(bases==null){return 0;}
+ int n=0;
+ for(byte b : bases){
+ if(AminoAcid.baseToNumber[b]<0){n++;}
+ }
+ return n;
+ }
+
+ public boolean hasMinConsecutiveBases(final int min){
+ if(bases==null){return min<=0;}
+ int len=0;
+ for(byte b : bases){
+ if(AminoAcid.baseToNumber[b]<0){len=0;}
+ else{
+ len++;
+ if(len>=min){return true;}
+ }
+ }
+ return false;
+ }
+
+
+ /**
+ * @return The number of occurrences of the rarest base.
+ */
+ public int minBaseCount(){
+ if(bases==null){return 0;}
+ int a=0, c=0, g=0, t=0;
+ for(byte b : bases){
+ if(b=='A'){a++;}
+ else if(b=='C'){c++;}
+ else if(b=='G'){g++;}
+ else if(b=='T'){t++;}
+ }
+ return Tools.min(a, c, g, t);
+ }
+
+ /** True if this read's match string has an 'X' or 'Y'. */
+ public boolean containsXY(){
+     assert(match!=null && valid());
+     final boolean found=containsXY(match);
+     return found;
+ }
+
+ /** True if the match string has an 'X' or 'Y'; false for a null array. */
+ public static boolean containsXY(byte[] match){
+     if(match==null){return false;}
+     for(final byte sym : match){
+         if(sym=='X' || sym=='Y'){return true;}
+     }
+     return false;
+ }
+
+ /** Checks only the ends: true if the match string starts with 'X' or ends with 'Y'. */
+ public boolean containsXY2(){
+     if(match==null || match.length<1){return false;}
+     final byte first=match[0];
+     final byte last=match[match.length-1];
+     final boolean edgeXY=(first=='X' || last=='Y');
+     assert(!valid() || edgeXY==containsXY());
+     return edgeXY;
+ }
+
+ /** Like containsXY2, and also true if the match string starts or ends with 'C'. */
+ public boolean containsXYC(){
+     if(match==null || match.length<1){return false;}
+     final byte first=match[0];
+     final byte last=match[match.length-1];
+     final boolean edgeXY=(first=='X' || last=='Y');
+     assert(!valid() || edgeXY==containsXY());
+     if(edgeXY){return true;}
+     return first=='C' || last=='C';
+ }
+
+ /**
+ * Replaces 'B' in match string with 'S', 'm', or 'N'.
+ * A short-form match string is expanded for the scan and converted back before returning.
+ * The reference is consulted only when Data.GENOME_BUILD is non-negative.
+ * @return false if a 'B' could not be resolved because no reference is available
+ * (all remaining 'B' symbols are then set to 'N'); true otherwise
+ */
+ public boolean fixMatchB(){
+ assert(match!=null);
+ final ChromosomeArray ca;
+ if(Data.GENOME_BUILD>=0){
+ ca=Data.getChromosome(chrom);
+ }else{
+ ca=null;
+ }
+ boolean originallyShort=shortmatch();
+ if(originallyShort){match=toLongMatchString(match);}
+ //mloc indexes the match string, cloc the read bases, rloc the reference.
+ int mloc=0, cloc=0, rloc=start;
+ for(; mloc<match.length; mloc++){
+ byte m=match[mloc];
+
+ if(m=='B'){
+ //Without a reference, r is the placeholder '?'.
+ byte r=(ca==null ? (byte)'?' : ca.get(rloc));
+ byte c=bases[cloc];
+ if(r=='N' || c=='N'){
+ match[mloc]='N';
+ }else if(r==c || Character.toUpperCase(r)==Character.toUpperCase(c)){
+ match[mloc]='m';
+ }else{
+ if(ca==null){
+ //No reference to compare against: give up and turn every remaining 'B' into 'N'.
+ if(originallyShort){
+ match=toShortMatchString(match);
+ }
+ for(int i=0; i<match.length; i++){
+ if(match[i]=='B'){match[i]='N';}
+ }
+ return false;
+ }
+ match[mloc]='S';
+ }
+ cloc++;
+ rloc++;
+ }else if(m=='m' || m=='S' || m=='N' || m=='s' || m=='C'){
+ //Consumes one read base and one reference base.
+ cloc++;
+ rloc++;
+ }else if(m=='D'){
+ //Consumes a reference base only.
+ rloc++;
+ }else if(m=='I' || m=='X' || m=='Y'){
+ //Consumes a read base only.
+ cloc++;
+ }
+ }
+ if(originallyShort){match=toShortMatchString(match);}
+ return true;
+ }
+
+ /** Expected errors over the last maxBases bases of this read; delegates to the static overload. */
+ public float expectedTipErrors(boolean countUndefined, int maxBases){
+ return expectedTipErrors(bases, quality, countUndefined, maxBases);
+ }
+
+ /** Expected errors over the first maxBases bases of this read; delegates to the static overload. */
+ public float expectedErrors(boolean countUndefined, int maxBases){
+ return expectedErrors(bases, quality, countUndefined, maxBases);
+ }
+
+ /**
+  * Sums QualityTools.PROB_ERROR[q] over the leading bases.
+  * @param countUndefined If true, each base rejected by AminoAcid.isFullyDefined adds 0.75
+  * @param maxBases Number of leading bases to consider; values below 1 mean all of them
+  * @return The sum; 0 if quality is null
+  */
+ public static float expectedErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
+     if(quality==null){return 0;}
+     final int end=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
+     final float[] probError=QualityTools.PROB_ERROR;
+     assert(probError[0]>0 && probError[0]<1);
+     float total=0;
+     for(int i=0; i<end; i++){
+         final byte base=bases[i];
+         final byte q=quality[i];
+         if(!AminoAcid.isFullyDefined(base)){
+             assert(q==0);
+             if(countUndefined){total+=0.75f;}
+         }else{
+             total+=probError[q];
+         }
+     }
+     return total;
+ }
+
+ /**
+  * Runs backwards instead of forwards: sums QualityTools.PROB_ERROR[q] over the trailing bases.
+  * @param countUndefined If true, each base rejected by AminoAcid.isFullyDefined adds 0.75
+  * @param maxBases Number of trailing bases to consider; values below 1 mean all of them
+  * @return The sum; 0 if quality is null
+  */
+ public static float expectedTipErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
+     if(quality==null){return 0;}
+     final int span=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
+     final int lowest=quality.length-span;
+     final float[] probError=QualityTools.PROB_ERROR;
+     assert(probError[0]>0 && probError[0]<1);
+     float total=0;
+     int i=quality.length-1;
+     while(i>=lowest){
+         final byte base=bases[i];
+         final byte q=quality[i];
+         if(!AminoAcid.isFullyDefined(base)){
+             assert(q==0);
+             if(countUndefined){total+=0.75f;}
+         }else{
+             total+=probError[q];
+         }
+         i--;
+     }
+     return total;
+ }
+
+ /**
+ * Walks the match string alongside the quality array and counts likely errors:
+ * every 'X' or 'Y', plus every 'S' whose quality is below 19.
+ * @return The count; 0 if quality is null
+ */
+ public int estimateErrors() {
+ if(quality==null){return 0;}
+ assert(match!=null) : this.toText(false);
+
+ int count=0;
+ //ci indexes the read (bases/quality); mi indexes the match string.
+ for(int ci=0, mi=0; ci<bases.length && mi<match.length; mi++){
+
+// byte b=bases[ci];
+ byte q=quality[ci];
+ byte m=match[mi];
+ if(m=='m' || m=='s' || m=='N'){
+ ci++;
+ }else if(m=='X' || m=='Y'){
+ ci++;
+ count++;
+ }else if(m=='I'){
+ ci++;
+ }else if(m=='D'){
+ //Deletions consume no read base.
+
+ }else if(m=='S'){
+ ci++;
+ if(q<19){
+ count++;
+ }
+ }
+ //NOTE(review): any other symbol (e.g. 'C') does not advance ci; confirm that is intended.
+
+ }
+ return count;
+ }
+
+ /**
+ * {M, S, D, I, N, splice}
+ * Counts symbols of a long-form match string. 'X', 'Y' and 'I' are all counted as I;
+ * 'N' and 'C' are counted as N. A splice is counted each time a run of 'D' reaches
+ * exactly minSplice symbols. A short-form match string is expanded in place
+ * (with a warning on stderr) and the count is restarted.
+ * @param minSplice Deletion run length that counts as a splice; values below 1 are raised to 1
+ * @throws RuntimeException on an unknown symbol
+ */
+ public int[] countErrors(int minSplice) {
+ assert(match!=null) : this.toText(false);
+ int m=0;
+ int s=0;
+ int d=0;
+ int i=0;
+ int n=0;
+ int splice=0;
+
+ byte prev=' ';
+ int streak=0;
+ minSplice=Tools.max(minSplice, 1);
+
+ for(int pos=0; pos<match.length; pos++){
+ final byte b=match[pos];
+
+ //streak is the length of the current run of identical symbols.
+ if(b==prev){streak++;}else{streak=1;}
+
+ if(b=='m'){
+ m++;
+ }else if(b=='N' || b=='C'){
+ n++;
+ }else if(b=='X' || b=='Y'){
+ i++;
+ }else if(b=='I'){
+ i++;
+ }else if(b=='D'){
+ d++;
+ //Equality, not >=, so each long deletion run is counted as a splice once.
+ if(streak==minSplice){splice++;}
+ }else if(b=='S'){
+ s++;
+ }else{
+ if(Character.isDigit(b) && shortmatch()){
+ System.err.println("Warning! Found read in shortmatch form during countErrors():\n"+this); //Usually caused by verbose output.
+ if(mate!=null){System.err.println("mate:\n"+mate.id+"\t"+new String(mate.bases));}
+ System.err.println("Stack trace: ");
+ new Exception().printStackTrace();
+ match=toLongMatchString(match);
+ setShortMatch(false);
+ return countErrors(minSplice);
+ }else{
+ throw new RuntimeException("\nUnknown symbol "+(char)b+":\n"+new String(match)+"\n"+this+"\nshortmatch="+this.shortmatch());
+ }
+ }
+
+ prev=b;
+ }
+
+// assert(i==0) : i+"\n"+this+"\n"+new String(match)+"\n"+Arrays.toString(new int[] {m, s, d, i, n, splice});
+
+ return new int[] {m, s, d, i, n, splice};
+ }
+
+ /**
+  * Tests whether a match string is in short (run-length) form, i.e. contains a digit.
+  * A string without digits is reported as long form. toShortMatchString never emits more
+  * than two identical letters in a row, so a long run of one letter cannot be short form;
+  * the previous version reported such runs as short.
+  * @param match Match string; must not be null
+  * @return true if any byte is a digit
+  */
+ public static boolean isShortMatchString(byte[] match){
+     for(final byte b : match){
+         if(Character.isDigit(b)){return true;}
+     }
+     return false;
+ }
+
+ public static byte[] toShortMatchString(byte[] match){
+ if(match==null){return null;}
+ assert(match.length>0);
+ StringBuilder sb=new StringBuilder(10);
+
+ byte prev=match[0];
+ int count=1;
+ for(int i=1; i<match.length; i++){
+ byte m=match[i];
+ assert(Character.isLetter(m) || m==0) : new String(match);
+ if(m==0){System.err.println("Warning! Converting empty match string to short form.");}
+ if(m==prev){count++;}
+ else{
+ sb.append((char)prev);
+ if(count>2){sb.append(count);}
+ else if(count==2){sb.append((char)prev);}
+ prev=m;
+ count=1;
+ }
+ }
+ sb.append((char)prev);
+ if(count>2){sb.append(count);}
+ else if(count==2){sb.append((char)prev);}
+
+ byte[] r=new byte[sb.length()];
+ for(int i=0; i<sb.length(); i++){r[i]=(byte)sb.charAt(i);}
+ return r;
+ }
+
+ /**
+  * Expands a short-form match string, e.g. "m4Smm" becomes "mmmmSmm".
+  * A number following a letter gives the total length of that letter's run.
+  * @param shortmatch Short-form match string
+  * @return The long form; null for a null input
+  */
+ public static byte[] toLongMatchString(byte[] shortmatch){
+     if(shortmatch==null){return null;}
+     assert(shortmatch.length>0);
+
+     //Pass 1: work out the expanded length.
+     int total=0;
+     int repeat=0;
+     for(final byte sym : shortmatch){
+         if(Character.isLetter(sym)){
+             total+=(repeat>0 ? repeat : 1);
+             repeat=0;
+         }else{
+             assert(Character.isDigit(sym));
+             repeat=(repeat*10)+(sym-'0');
+         }
+     }
+     if(repeat>0){total+=repeat-1;}
+
+     //Pass 2: fill the output. A pending number is flushed when the next letter, or the end, is reached.
+     final byte[] expanded=new byte[total];
+     byte letter='?';
+     int out=0;
+     repeat=0;
+     for(final byte sym : shortmatch){
+         if(Character.isLetter(sym)){
+             for(; repeat>1; repeat--){expanded[out++]=letter;}
+             repeat=0;
+             expanded[out++]=sym;
+             letter=sym;
+         }else{
+             assert(Character.isDigit(sym));
+             repeat=(repeat*10)+(sym-'0');
+         }
+     }
+     for(; repeat>1; repeat--){expanded[out++]=letter;}
+
+     assert(expanded[expanded.length-1]>0);
+     return expanded;
+ }
+
+ public String parseCustomRname(){
+ for(int i=0, under=0; i<id.length(); i++){
+ if(id.charAt(i)=='_'){
+ under++;
+ if(under==6){
+ if(id.length()>i+1){
+ return id.substring(i+1);
+ }
+ return null;
+ }
+ }
+ }
+ return null;
+ }
+
+ /** Bases of the read. */
+ public byte[] bases;
+
+ /** Quality of the read. */
+ public byte[] quality;
+
+ /** Alignment string. E.G. mmmmDDDmmm would have 4 matching bases, then a 3-base deletion, then 3 matching bases. */
+ public byte[] match;
+
+ /** Gap array copied from the chosen site; null when absent. Reset to null by clearSite(). */
+ public int[] gaps;
+
+ /** Read name/header. */
+ public String id;
+ /** Numeric identifier; also used to pick deterministically among tied sites. */
+ public long numericID;
+ /** Mapped chromosome; clearSite() sets it to -1. */
+ public int chrom;
+ /** Mapped start position; clearSite() sets it to -1. */
+ public int start;
+ /** Mapped stop position; clearSite() sets it to -1. */
+ public int stop;
+
+ public int copies=1;
+
+ /** Errors detected (remaining) */
+ public int errors=0;
+
+ /** Alignment score from BBMap. Assumed to max at approx 100*bases.length */
+ public int mapScore=0;
+
+ /** Candidate mapping sites; index 0 is treated as the top site. */
+ public ArrayList<SiteScore> sites;
+ public SiteScore originalSite; //Origin site for synthetic reads
+ public Serializable obj=null; //For testing only
+ /** The other read of the pair, or null. */
+ public Read mate;
+
+ /** Bit field holding strand (bit 0) and the boolean properties read through the *MASK constants. */
+ public int flags;
+
+ /** -1 if invalid. TODO: Currently not retained through most processes. */
+ private int insert=-1;
+
+ /** A random number for deterministic usage.
+ * May decrease speed in multithreaded applications.
+ */
+ public double rand=-1;
+
+ /** Returns obj as a long; obj is asserted to be a non-null Long. */
+ public long time(){
+ assert(obj!=null && obj.getClass()==Long.class) : obj;
+ return ((Long)obj).longValue();
+ }
+ /** Number of bases; 0 when bases is null. */
+ public int length(){return bases==null ? 0 : bases.length;}
+ /** Length of the mate; 0 when there is no mate. */
+ public int mateLength(){return mate==null ? 0 : mate.length();}
+ /** 1 if this read has a mate, otherwise 0. */
+ public int mateCount(){return mate==null ? 0 : 1;}
+ /** True if there is a mate and it is flagged mapped. */
+ public boolean mateMapped(){return mate==null ? false : mate.mapped();}
+
+ /**
+  * If obj holds a TrimRead, calls its untrim() and clears obj.
+  * @return true if an untrim was performed
+  */
+ public boolean untrim(){
+     final boolean hasTrim=(obj!=null && obj.getClass()==TrimRead.class);
+     if(hasTrim){
+         ((TrimRead)obj).untrim();
+         obj=null;
+     }
+     return hasTrim;
+ }
+
+ public int trailingLowerCase(){
+ for(int i=bases.length-1; i>=0;){
+ if(Character.isLowerCase(bases[i])){
+ i--;
+ }else{
+ return bases.length-i-1;
+ }
+ }
+ return bases.length;
+ }
+ public int leadingLowerCase(){
+ for(int i=0; i<bases.length; i++){
+ if(!Character.isLowerCase(bases[i])){return i;}
+ }
+ return bases.length;
+ }
+
+ //Flag getters. Each boolean getter is true only when every bit of its mask is set in flags.
+
+ /** Strand, stored in bit 0 of flags. */
+ public byte strand(){return (byte)(flags&1);}
+ public boolean mapped(){return (flags&MAPPEDMASK)==MAPPEDMASK;}
+ public boolean paired(){return (flags&PAIREDMASK)==PAIREDMASK;}
+ public boolean synthetic(){return (flags&SYNTHMASK)==SYNTHMASK;}
+// public boolean colorspace(){return (flags&COLORMASK)==COLORMASK;}
+ public boolean ambiguous(){return (flags&AMBIMASK)==AMBIMASK;}
+ public boolean perfect(){return (flags&PERFECTMASK)==PERFECTMASK;}
+// public boolean semiperfect(){return perfect() ? true : list!=null && list.size()>0 ? list.get(0).semiperfect : false;} //TODO: This is a hack. Add a semiperfect flag.
+ public boolean rescued(){return (flags&RESCUEDMASK)==RESCUEDMASK;}
+ public boolean discarded(){return (flags&DISCARDMASK)==DISCARDMASK;}
+ public boolean invalid(){return (flags&INVALIDMASK)==INVALIDMASK;}
+ public boolean swapped(){return (flags&SWAPMASK)==SWAPMASK;}
+ public boolean shortmatch(){return (flags&SHORTMATCHMASK)==SHORTMATCHMASK;}
+ public boolean insertvalid(){return (flags&INSERTMASK)==INSERTMASK;}
+ public boolean hasadapter(){return (flags&ADAPTERMASK)==ADAPTERMASK;}
+ public boolean secondary(){return (flags&SECONDARYMASK)==SECONDARYMASK;}
+ public boolean aminoacid(){return (flags&AAMASK)==AAMASK;}
+ public boolean junk(){return (flags&JUNKMASK)==JUNKMASK;}
+ public boolean validated(){return (flags&VALIDATEDMASK)==VALIDATEDMASK;}
+
+ /** For paired ends: 0 for read1, 1 for read2 */
+ public int pairnum(){return (flags&PAIRNUMMASK)>>PAIRNUMSHIFT;}
+ /** Negation of invalid(). */
+ public boolean valid(){return !invalid();}
+
+ /** True if every bit of mask is set in flags. */
+ public boolean getFlag(int mask){return (flags&mask)==mask;}
+ /** 1 if every bit of mask is set in flags, otherwise 0. */
+ public int flagToNumber(int mask){return (flags&mask)==mask ? 1 : 0;}
+
+ /** Sets every bit of mask in flags when b is true, and clears them when b is false. */
+ public void setFlag(int mask, boolean b){
+     flags=(b ? (flags|mask) : (flags&~mask));
+ }
+
+ /** Stores the strand (0 or 1) in bit 0 of flags. */
+ public void setStrand(int b){
+     assert(b==1 || b==0);
+     final int others=(flags&(~1));
+     flags=others|b;
+ }
+
+ /** For paired ends: 0 for read1, 1 for read2 */
+ public void setPairnum(int b){
+ //    System.err.println("Setting pairnum to "+b+" for "+id);
+ //    assert(!id.equals("2_chr1_0_1853883_1853982_1845883_ecoli_K12") || b==1);
+     assert(b==1 || b==0);
+     final int others=(flags&(~PAIRNUMMASK));
+     flags=others|(b<<PAIRNUMSHIFT);
+ //    assert(pairnum()==b);
+ }
+
+ /** Sets or clears the paired flag. */
+ public void setPaired(boolean b){
+     setFlag(PAIREDMASK, b);
+ }
+
+ /** Sets or clears the synthetic flag. */
+ public void setSynthetic(boolean b){
+     setFlag(SYNTHMASK, b);
+ }
+
+ /** Sets or clears the ambiguous flag. */
+ public void setAmbiguous(boolean b){
+     setFlag(AMBIMASK, b);
+ }
+
+ /**
+ * Sets the perfect flag from the top site. The read is perfect if the top site's slowScore
+ * equals maxScore or the site is itself flagged perfect; otherwise the match string and bases
+ * are examined. With no sites, the flag is cleared.
+ * @param maxScore Highest possible score; the top site's slowScore is asserted not to exceed it
+ * @return The resulting value of perfect()
+ */
+ public boolean setPerfectFlag(int maxScore){
+ final SiteScore ss=topSite();
+ if(ss==null){
+ setPerfect(false);
+ }else{
+ assert(ss.slowScore<=maxScore) : maxScore+", "+ss.slowScore+", "+ss.toText();
+
+ if(ss.slowScore==maxScore || ss.perfect){
+ assert(testMatchPerfection(true)) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n";
+ setPerfect(true);
+ }else{
+ boolean flag=testMatchPerfection(false);
+ setPerfect(flag);
+ assert(flag || !ss.perfect) : "flag="+flag+", ss.perfect="+ss.perfect+"\nmatch="+new String(match)+"\n"+this.toText(false);
+ assert(!flag || ss.slowScore>=maxScore) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n";
+ }
+ }
+ return perfect();
+ }
+
+ /**
+ * Tests whether the match string consists only of 'm' (plus digits in short form) and the
+ * read contains no 'N' base.
+ * @param returnIfNoMatch Value to return when match is null
+ */
+ private boolean testMatchPerfection(boolean returnIfNoMatch){
+ if(match==null){return returnIfNoMatch;}
+ //Long form must have exactly one symbol per base; the short-form branch replaces this check.
+ boolean flag=(match.length==bases.length);
+ if(shortmatch()){
+ flag=(match.length==0 || match[0]=='m');
+ for(int i=0; i<match.length && flag; i++){flag=(match[i]=='m' || Character.isDigit(match[i]));}
+ }else{
+ for(int i=0; i<match.length && flag; i++){flag=(match[i]=='m');}
+ }
+ for(int i=0; i<bases.length && flag; i++){flag=(bases[i]!='N');}
+ return flag;
+ }
+
+ /**
+ * @return
+ */
+ public float gc() {
+ if(bases==null || bases.length<1){return 0;}
+ int at=0, gc=0;
+ for(byte b : bases){
+ int x=AminoAcid.baseToNumber[b];
+ if(x>-1){
+ if(x==0 || x==3){at++;}
+ else{gc++;}
+ }
+ }
+ if(gc<1){return 0;}
+ return gc*1f/(at+gc);
+ }
+
+ /**
+  * Replaces every occurrence of one base with another, in place.
+  * @param swapFrom Base to replace
+  * @param swapTo Replacement base
+  * @return number of swaps; 0 when bases is null
+  */
+ public int swapBase(byte swapFrom, byte swapTo) {
+     if(bases==null){return 0;}
+     int replaced=0;
+     for(int i=0; i<bases.length; i++){
+         if(bases[i]!=swapFrom){continue;}
+         bases[i]=swapTo;
+         replaced++;
+     }
+     return replaced;
+ }
+
+ /**
+ * @param remap Table of new values
+ */
+ public void remap(byte[] remap) {
+ if(bases==null){return;}
+ for(int i=0; i<bases.length; i++){
+ bases[i]=remap[bases[i]];
+ }
+ }
+
+ /**
+  * Like remap, but only writes positions whose table entry differs from the current base.
+  * @param remap Table of new values
+  * @return Number of positions that were changed (0 if bases is null)
+  */
+ public int remapAndCount(byte[] remap) {
+     if(bases==null){return 0;}
+     int changed=0;
+     for(int pos=0; pos<bases.length; pos++){
+         final byte before=bases[pos];
+         final byte after=remap[before];
+         if(before==after){continue;}
+         bases[pos]=after;
+         changed++;
+     }
+     return changed;
+ }
+
+ /**
+  * Overwrites with b every base that AminoAcid.baseToNumberACGTN maps to a negative value,
+  * and zeroes the matching quality entry when quality is present.
+  * Note that when b itself is negative, every position is overwritten without consulting the table.
+  * @param b Replacement base value
+  * @return Number of positions overwritten (0 if bases is null)
+  */
+ public int convertUndefinedTo(byte b){
+     if(bases==null){return 0;}
+     final boolean replaceAll=(b<0);
+     int count=0;
+     for(int pos=0; pos<bases.length; pos++){
+         final boolean replace=(replaceAll || AminoAcid.baseToNumberACGTN[bases[pos]]<0);
+         if(!replace){continue;}
+         count++;
+         bases[pos]=b;
+         if(quality!=null){quality[pos]=0;}
+     }
+     return count;
+ }
+
+ /**
+  * Exchanges the bases and quality arrays of this read with those of its mate.
+  * With no mate this trips an assertion (when assertions are enabled) and otherwise does nothing.
+  */
+ public void swapBasesWithMate(){
+     if(mate==null){
+         assert(false);
+         return;
+     }
+     final byte[] myBases=bases;
+     final byte[] myQuality=quality;
+     bases=mate.bases;
+     quality=mate.quality;
+     mate.bases=myBases;
+     mate.quality=myQuality;
+ }
+
+ /** @return The stored insert size if insertvalid() is true, otherwise -1 */
+ public int insert(){
+     if(insertvalid()){return insert;}
+     return -1;
+ }
+
+ /** Instance form of insertSizeMapped(Read, Read, boolean), applied to this read and its mate. */
+ public int insertSizeMapped(boolean ignoreStrand){
+     final Read first=this, second=mate;
+     return insertSizeMapped(first, second, ignoreStrand);
+ }
+
+ /**
+  * Chooses between the unstranded and the strand-aware insert size calculation.
+  * The unstranded form is used when ignoreStrand is set, r2 is null, either read is unmapped,
+  * or both reads report the same strand.
+  */
+ public static int insertSizeMapped(Read r1, Read r2, boolean ignoreStrand){
+     final boolean unstranded=(ignoreStrand || r2==null || !r1.mapped() || !r2.mapped() || r1.strand()==r2.strand());
+     if(unstranded){
+         return insertSizeMapped_Unstranded(r1, r2);
+     }
+     return insertSizeMapped_PlusLeft(r1, r2);
+ }
+
+ /**
+  * Strand-aware insert size for two reads: (r2.start-r1.stop-1) plus both read lengths,
+  * after ordering the reads so that r1 has the lower strand() value.
+  * Falls back to insertSizeMapped_Unstranded when the strands are equal, when r1 starts after r2 stops,
+  * or when the implied overlap is at least the combined read length.
+  * TODO: This is not correct when the insert is shorter than a read's bases with same-strand reads
+  * @param r1 First read
+  * @param r2 Second read; dereferenced without a null check
+  * @return The computed insert size; 0 if the reads are on different chroms or either has start==stop
+  */
+ public static int insertSizeMapped_PlusLeft(Read r1, Read r2){
+ //Recurse with swapped arguments so that r1 carries the lower strand value
+ if(r1.strand()>r2.strand()){return insertSizeMapped_PlusLeft(r2, r1);}
+ if(r1.strand()==r2.strand() || r1.start>r2.stop){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left.
+// if(!mapped() || !mate.mapped()){return 0;}
+ if(r1.chrom!=r2.chrom){return 0;}
+ if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //???
+
+ int a=r1.length();
+ int b=r2.length();
+ //Gap between the end of r1 and the start of r2; negative when they overlap
+ int mid=r2.start-r1.stop-1;
+ if(-mid>=a+b){return insertSizeMapped_Unstranded(r1, r2);} //Not properly oriented; plus read is to the right of minus read
+ return mid+a+b;
+ }
+
+ /**
+  * Insert size computed from coordinates only, without regard to strand.
+  * For a single read (r2 null) this is its mapped span, or 0 when start==stop.
+  * For two reads, the one with the lower start is treated as the left read and the result is
+  * (gap between them)+(both lengths); reads starting at the same position yield the shorter length.
+  * @return The computed insert size; 0 if either read has start==stop, the chroms differ,
+  * or the implied overlap is at least the combined read length
+  */
+ public static int insertSizeMapped_Unstranded(Read r1, Read r2){
+     if(r2==null){
+         if(r1.start==r1.stop){return 0;}
+         return r1.stop-r1.start+1;
+     }
+
+     //Recurse with swapped arguments so that r1 is always on the left side.
+     if(r1.start>r2.start){return insertSizeMapped_Unstranded(r2, r1);}
+
+     if(r1.start==r1.stop || r2.start==r2.stop || r1.chrom!=r2.chrom){return 0;}
+
+     final int len1=r1.length();
+     final int len2=r2.length();
+     if(r1.start==r2.start){return Tools.min(len1, len2);}
+
+     final int gap=r2.start-r1.stop-1;
+     //Strange situation, no way to guess insert size
+     return (-gap>=len1+len2) ? 0 : gap+len1+len2;
+ }
+
+ /**
+  * Insert size derived from the originalSite of this read and of its mate.
+  * @return Without a mate: the span of originalSite, or -1 if it is null.
+  * With a mate: 0 if either originalSite is null, otherwise the result of insertSize for the two sites.
+  */
+ public int insertSizeOriginalSite(){
+     if(mate==null){
+         if(originalSite==null){return -1;}
+         return originalSite.stop-originalSite.start+1;
+     }
+
+     final SiteScore mine=originalSite, theirs=mate.originalSite;
+     final int x=(mine==null || theirs==null) ? 0 : insertSize(mine, theirs, bases.length, mate.length());
+
+     //Both reads of a pair should agree; only checked from the read with the lower pair number
+     assert(pairnum()>=mate.pairnum() || x==mate.insertSizeOriginalSite());
+     return x;
+ }
+
+ /** Unpacks chrom, start and stop from both sites and delegates to the coordinate-based insertSize. */
+ public static int insertSize(SiteScore ssa, SiteScore ssb, int lena, int lenb){
+     final int chroma=ssa.chrom, chromb=ssb.chrom;
+     final int starta=ssa.start, startb=ssb.start;
+     final int stopa=ssa.stop, stopb=ssb.stop;
+     return insertSize(chroma, chromb, starta, startb, stopa, stopb, lena, lenb);
+ }
+
+ public static int insertSize(int chroma, int chromb, int starta, int startb, int stopa, int stopb, int lena, int lenb){
+
+ final int x;
+
+ // if(mate==null || ){return bases==null ? 0 : bases.length;}
+ if(chroma!=chromb){x=0;}
+ else{
+
+ if(Tools.overlap(starta, stopa, startb, stopb)){
+ x=Tools.max(stopa, stopb)-Tools.min(starta, startb)+1;
+// System.err.println("C: "+x);
+ }else{
+ if(starta<=startb){
+ int mid=startb-stopa-1;
+ // assert(false) : mid+", "+a+", "+b;
+ x=mid+lena+lenb;
+// System.err.println("D: "+x);
+ }else{
+ int mid=starta-stopb-1;
+ // assert(false) : mid+", "+a+", "+b;
+ x=mid+lena+lenb;
+// System.err.println("E: "+x);
+ }
+ }
+ }
+ return x;
+ }
+
+ /**
+  * Creates a mateless clone of this read whose bases (and quality, when present) are a copy of
+  * the range [from, to). All other fields are whatever clone() carries over.
+  */
+ public Read subRead(int from, int to){
+     final Read piece=this.clone();
+     piece.bases=Arrays.copyOfRange(bases, from, to);
+     if(quality==null){
+         piece.quality=null;
+     }else{
+         piece.quality=Arrays.copyOfRange(quality, from, to);
+     }
+     piece.mate=null;
+     return piece;
+ }
+
+ /**
+  * Joins this read with its mate using the stored insert size.
+  * @return This read unchanged if insert is below 1, there is no mate, or the insert is not flagged valid;
+  * otherwise the result of joinRead(this, mate, insert)
+  */
+ public Read joinRead(){
+     final boolean joinable=(insert>=1 && mate!=null && insertvalid());
+     if(!joinable){return this;}
+     assert(insert>9 || bases.length<20) : "Perhaps old read format is being used? This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n";
+     return joinRead(this, mate, insert);
+ }
+
+ /**
+  * Joins this read with its mate using a caller-supplied insert size.
+  * @param x Insert size to use
+  * @return This read unchanged if x is below 1 or there is no mate; otherwise joinRead(this, mate, x)
+  */
+ public Read joinRead(int x){
+     final boolean joinable=(x>=1 && mate!=null);
+     if(!joinable){return this;}
+     assert(x>9 || bases.length<20) : "Perhaps old read format is being used? This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n";
+     return joinRead(this, mate, x);
+ }
+
+ /**
+  * Builds one merged read of length 'insert' from reads a and b.
+  * When insert is at least the combined read length, a's bases go to the front, b's to the end,
+  * and any positions between them are filled with 'N' (quality 0).
+  * Otherwise a's bases are copied first and b's bases are laid over the result right-aligned,
+  * walking from the end backwards; disagreeing positions are resolved and counted as mismatches.
+  * NOTE(review): b's bases are used exactly as given; presumably the caller has already put b
+  * in the same orientation as a - confirm.
+  * @param a First read; supplies id, numericID, flags, copies and chrom of the result.
+  * a.chrom may be set to 0 as a side effect.
+  * @param b Second read
+  * @param insert Length of the merged read; asserted to be positive
+  * @return A new Read flagged as unpaired, with its insert set to 'insert'
+  */
+ public static Read joinRead(Read a, Read b, int insert){
+ assert(a!=null && b!=null && insert>0);
+ final int lengthSum=a.length()+b.length();
+ //Positive only when insert is shorter than the combined read length
+ final int overlap=Tools.min(insert, lengthSum-insert);
+
+// System.err.println(insert);
+ final byte[] bases=new byte[insert];
+ //Quality is only produced when both inputs have it
+ final byte[] quals=(a.quality==null || b.quality==null ? null : new byte[insert]);
+
+ //Counted in the overlap branch; only referenced by commented-out diagnostics further down
+ int mismatches=0;
+
+ int start, stop;
+
+ if(overlap<=0){//Simple join in which there is no overlap
+ //Index in the merged read at which b's bases begin
+ int lim=insert-b.length();
+ if(quals==null){
+ for(int i=0; i<a.length(); i++){
+ bases[i]=a.bases[i];
+ }
+ for(int i=a.length(); i<lim; i++){
+ bases[i]='N';
+ }
+ for(int i=0; i<b.length(); i++){
+ bases[i+lim]=b.bases[i];
+ }
+ }else{
+ for(int i=0; i<a.length(); i++){
+ bases[i]=a.bases[i];
+ quals[i]=a.quality[i];
+ }
+ for(int i=a.length(); i<lim; i++){
+ bases[i]='N';
+ quals[i]=0;
+ }
+ for(int i=0; i<b.length(); i++){
+ bases[i+lim]=b.bases[i];
+ quals[i+lim]=b.quality[i];
+ }
+ }
+
+ start=Tools.min(a.start, b.start);
+// stop=start+insert-1;
+ stop=Tools.max(a.stop, b.stop);
+
+// }else if(insert>=a.length() && insert>=b.length()){ //Overlapped join, proper orientation
+// final int lim1=a.length()-overlap;
+// final int lim2=a.length();
+// for(int i=0; i<lim1; i++){
+// bases[i]=a.bases[i];
+// quals[i]=a.quality[i];
+// }
+// for(int i=lim1, j=0; i<lim2; i++, j++){
+// assert(false) : "TODO";
+// bases[i]='N';
+// quals[i]=0;
+// }
+// for(int i=lim2, j=overlap; i<bases.length; i++, j++){
+// bases[i]=b.bases[j];
+// quals[i]=b.quality[j];
+// }
+ }else{ //reads go off ends of molecule.
+ if(quals==null){
+ for(int i=0; i<a.length() && i<bases.length; i++){
+ bases[i]=a.bases[i];
+ }
+ //Lay b over the result right-aligned, from the last position backwards
+ for(int i=bases.length-1, j=b.length()-1; i>=0 && j>=0; i--, j--){
+ byte ca=bases[i], cb=b.bases[j];
+ if(ca==0 || ca=='N'){
+ bases[i]=cb;
+ }else if(ca==cb){
+ }else{
+ //NOTE(review): unlike the quality-aware branch below there is no cb=='N' case here,
+ //so an 'N' in b replaces any base of a whose byte value is lower than 'N' - confirm intended
+ bases[i]=(ca>=cb ? ca : cb);
+ if(ca!='N' && cb!='N'){mismatches++;}
+ }
+ }
+ }else{
+ for(int i=0; i<a.length() && i<bases.length; i++){
+ bases[i]=a.bases[i];
+ quals[i]=a.quality[i];
+ }
+ //Lay b over the result right-aligned, from the last position backwards
+ for(int i=bases.length-1, j=b.length()-1; i>=0 && j>=0; i--, j--){
+ byte ca=bases[i], cb=b.bases[j];
+ byte qa=quals[i], qb=b.quality[j];
+ if(ca==0 || ca=='N'){
+ bases[i]=cb;
+ quals[i]=qb;
+ }else if(cb==0 || cb=='N'){
+ //do nothing
+ }else if(ca==cb){
+ //Agreement: higher quality plus a quarter of the lower one, capped at MAX_MERGE_QUALITY
+ quals[i]=(byte)Tools.min((Tools.max(qa, qb)+Tools.min(qa, qb)/4), MAX_MERGE_QUALITY);
+ }else{
+ //Disagreement: keep the higher-quality call ('N' on a tie); quality becomes the difference
+ bases[i]=(qa>qb ? ca : qa<qb ? cb : (byte)'N');
+ quals[i]=(byte)(Tools.max(qa, qb)-Tools.min(qa, qb));
+ if(ca!='N' && cb!='N'){mismatches++;}
+ }
+ }
+ }
+
+ if(a.strand()==0){
+ start=a.start;
+// stop=start+insert-1;
+ stop=b.stop;
+ }else{
+ stop=a.stop;
+// start=stop-insert+1;
+ start=b.start;
+ }
+ //Fall back to the outermost coordinates if the strand-based choice came out inverted
+ if(start>stop){
+ start=Tools.min(a.start, b.start);
+ stop=Tools.max(a.stop, b.stop);
+ }
+ }
+// assert(mismatches>=countMismatches(a, b, insert, 999));
+// System.err.println(mismatches);
+ //Also writes 0 into a.chrom
+ if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){start=stop=a.chrom=0;}
+
+// System.err.println(bases.length+", "+start+", "+stop);
+
+ Read r=new Read(bases, a.chrom, start, stop, a.id, null, a.numericID, a.flags);
+ r.quality=quals; //This prevents quality from getting capped.
+ //NOTE(review): this is the same condition that zeroed the coordinates above, yet it sets mapped=true - confirm intended
+ if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){r.setMapped(true);}
+ r.setInsert(insert);
+ r.setPaired(false);
+ r.copies=a.copies;
+ r.mapScore=a.mapScore+b.mapScore; //Overwritten by both branches below
+ if(overlap<=0){
+ r.mapScore=a.mapScore+b.mapScore;
+ r.errors=a.errors+b.errors;
+ //TODO r.gaps=?
+ }else{//Hard to calculate
+ //Length-weighted sum of both scores, divided by the merged length
+ r.mapScore=(int)((a.mapScore*(long)a.length()+b.mapScore*(long)b.length())/insert);
+ r.errors=a.errors;
+ }
+
+
+ assert(r.insertvalid()) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
+ assert(r.insert()==r.length()) : r.insert()+"\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
+// assert(false) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
+
+ //TODO: Triggered by BBMerge in useratio mode for some reason.
+// assert(Shared.anomaly || (a.insertSizeMapped(false)>0 == r.insertSizeMapped(false)>0)) :
+// "\n"+r.length()+"\n"+r.insert()+"\n"+r.insertSizeMapped(false)+"\n"+a.insert()+"\n"+a.insertSizeMapped(false)+
+// "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
+
+ return r;
+ }
+
+ /**
+  * Splits this read into consecutive pieces.
+  * A read no longer than maxlen is returned as the only element of the list (the same object, not a copy).
+  * Longer reads are cut into ceil(len/maxlen) pieces of max(minlen, len/parts) bases, truncated to an int;
+  * each piece gets id+"_"+index as its id and the matching range of quality values.
+  * The multi-piece path is unfinished upstream: it is guarded by assert(false), and because the
+  * piece length is truncated the pieces do not necessarily cover every base.
+  * @param minlen Reads shorter than this yield null; also a lower bound on the piece length
+  * @param maxlen Longest read that is returned unsplit; determines the number of pieces
+  * @return List of pieces, or null if this read is shorter than minlen
+  */
+ public ArrayList<Read> split(int minlen, int maxlen) {
+     int len=bases==null ? 0 : bases.length;
+     if(len<minlen){return null;}
+     int parts=(len+maxlen-1)/maxlen;
+     ArrayList<Read> subreads=new ArrayList<Read>(parts);
+     if(len<=maxlen){
+         subreads.add(this);
+     }else{
+         float ideal=Tools.max(minlen, len/(float)parts);
+         int actual=(int)ideal;
+         assert(false) : "TODO"; //Some assertion goes here, I forget what
+         for(int i=0; i<parts; i++){
+             int a=i*actual;
+             int b=(i+1)*actual;
+             if(b>bases.length){b=bases.length;}
+             byte[] subbases=Arrays.copyOfRange(bases, a, b);
+             //Same half-open range as the bases. The previous upper bound of b+1 made every quality
+             //piece one element longer than its bases, zero-padded on the final piece.
+             byte[] subquals=(quality==null ? null : Arrays.copyOfRange(quality, a, b));
+             Read r=new Read(subbases, -1, -1, -1, id+"_"+i, subquals, numericID, flags);
+             subreads.add(r);
+         }
+     }
+     return subreads;
+ }
+
+ /**
+  * Generates one k-mer per start position of this read, packed 2 bits per base into a long.
+  * Positions whose window contains a base that is not fully defined keep the value -1.
+  * For k above 31 the work is delegated to toLongKmers.
+  * @param k K-mer length
+  * @param gap Must not be positive; gapped k-mers are not implemented
+  * @param kmers Optional array to reuse; replaced if null or not exactly bases.length-k+1 long
+  * @param makeCanonical If true, store the larger of the forward and reverse-complement k-mer
+  * @param longkmer Scratch object, only passed on to toLongKmers
+  * @return The filled array, or null if bases is null or shorter than k+gap
+  */
+ public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) {
+     if(gap>0){throw new RuntimeException("Gapped reads: TODO");}
+     if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);}
+     if(bases==null || bases.length<k+gap){return null;}
+
+     final int arraylen=bases.length-k+1;
+     if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
+     Arrays.fill(kmers, -1);
+
+     final int bits=2*k;
+     final int topShift=bits-2;
+     final long mask=~((-1L)<<bits);
+     long forward=0, reverse=0;
+     int run=0; //Number of consecutive fully defined bases ending at the current position
+
+     for(int pos=0; pos<bases.length; pos++){
+         final byte base=bases[pos];
+         final long code=Dedupe.baseToNumber[base];
+         final long rcode=Dedupe.baseToComplementNumber[base];
+         forward=((forward<<2)|code)&mask;
+         reverse=(reverse>>>2)|(rcode<<topShift);
+         run=(AminoAcid.isFullyDefined(base) ? run+1 : 0);
+         if(run<k){continue;}
+         kmers[pos-k+1]=(makeCanonical ? Tools.max(forward, reverse) : forward);
+     }
+     return kmers;
+ }
+
+// /** Generate and return an array of canonical kmers for this read */
+// public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) {
+// if(gap>0){throw new RuntimeException("Gapped reads: TODO");}
+// if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);}
+// if(bases==null || bases.length<k+gap){return null;}
+//
+// final int kbits=2*k;
+// final long mask=~((-1L)<<(kbits));
+//
+// int len=0;
+// long kmer=0;
+// final int arraylen=bases.length-k+1;
+// if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
+// Arrays.fill(kmers, -1);
+//
+// for(int i=0; i<bases.length; i++){
+// byte b=bases[i];
+// int x=AminoAcid.baseToNumber[b];
+// if(x<0){
+// len=0;
+// kmer=0;
+// }else{
+// kmer=((kmer<<2)|x)&mask;
+// len++;
+//
+// if(len>=k){
+// kmers[i-k+1]=kmer;
+// }
+// }
+// }
+//
+//// System.out.println(new String(bases));
+//// System.out.println(Arrays.toString(kmers));
+//
+// if(makeCanonical){
+// this.reverseComplement();
+// len=0;
+// kmer=0;
+// for(int i=0, j=bases.length-1; i<bases.length; i++, j--){
+// byte b=bases[i];
+// int x=AminoAcid.baseToNumber[b];
+// if(x<0){
+// len=0;
+// kmer=0;
+// }else{
+// kmer=((kmer<<2)|x)&mask;
+// len++;
+//
+// if(len>=k){
+// assert(kmer==AminoAcid.reverseComplementBinaryFast(kmers[j], k));
+// kmers[j]=Tools.max(kmers[j], kmer);
+// }
+// }
+// }
+// this.reverseComplement();
+//
+//// System.out.println(Arrays.toString(kmers));
+// }
+//
+//
+// return kmers;
+// }
+
+ /**
+  * Generates one value per k-mer start position for k above 31, using the supplied Kmer object
+  * as a rolling window and storing its xor() once it holds at least k bases.
+  * Positions whose window contains a base that is not fully defined keep the value -1.
+  * @param k K-mer length; asserted to exceed 31
+  * @param kmers Optional array to reuse; replaced if null or not exactly bases.length-k+1 long
+  * @param makeCanonical Asserted to be true
+  * @param kmer Rolling window; cleared on entry and again at every undefined base
+  * @return The filled array, or null if bases is null or shorter than k
+  */
+ public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer kmer) {
+     assert(k>31) : k;
+     assert(makeCanonical);
+     if(bases==null || bases.length<k){return null;}
+     kmer.clear();
+
+     final int arraylen=bases.length-k+1;
+     if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
+     Arrays.fill(kmers, -1);
+
+     //slot is the start position of the window that ends at pos
+     for(int pos=0, slot=1-k; pos<bases.length; pos++, slot++){
+         final byte base=bases[pos];
+         kmer.addRight(base);
+         if(!AminoAcid.isFullyDefined(base)){kmer.clear();}
+         if(kmer.len>=k){
+             kmers[slot]=kmer.xor();
+         }
+     }
+
+     return kmers;
+ }
+
+// /** Generate and return an array of canonical kmers for this read */
+// public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer longkmer) {
+// assert(k>31) : k;
+// if(bases==null || bases.length<k){return null;}
+//
+// final int kbits=2*k;
+// final long mask=Long.MAX_VALUE;
+//
+// int len=0;
+// long kmer=0;
+// final int arraylen=bases.length-k+1;
+// if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
+// Arrays.fill(kmers, -1);
+//
+//
+// final int tailshift=k%32;
+// final int tailshiftbits=tailshift*2;
+//
+// for(int i=0; i<bases.length; i++){
+// byte b=bases[i];
+// int x=AminoAcid.baseToNumber[b];
+// if(x<0){
+// len=0;
+// kmer=0;
+// }else{
+// kmer=Long.rotateLeft(kmer, 2);
+// kmer=kmer^x;
+// len++;
+//
+// if(len>=k){
+// long x2=AminoAcid.baseToNumber[bases[i-k]];
+// kmer=kmer^(x2<<tailshiftbits);
+// kmers[i-k+1]=kmer;
+// }
+// }
+// }
+// if(makeCanonical){
+// this.reverseComplement();
+// len=0;
+// kmer=0;
+// for(int i=0, j=bases.length-1; i<bases.length; i++, j--){
+// byte b=bases[i];
+// int x=AminoAcid.baseToNumber[b];
+// if(x<0){
+// len=0;
+// kmer=0;
+// }else{
+// kmer=Long.rotateLeft(kmer, 2);
+// kmer=kmer^x;
+// len++;
+//
+// if(len>=k){
+// long x2=AminoAcid.baseToNumber[bases[i-k]];
+// kmer=kmer^(x2<<tailshiftbits);
+// kmers[j]=mask&(Tools.max(kmers[j], kmer));
+// }
+// }
+// }
+// this.reverseComplement();
+// }else{
+// assert(false) : "Long kmers should be made canonical here because they cannot be canonicized later.";
+// }
+//
+// return kmers;
+// }
+
+ /** Checks the sites of a read with sort verification requested. */
+ public static final boolean CHECKSITES(Read r, byte[] basesM){
+     final boolean verifySorted=true;
+     return CHECKSITES(r.sites, r.bases, basesM, r.numericID, verifySorted);
+ }
+
+ /** Checks the sites of a read, passing the read's own site list, bases and numericID to the list-based overload. */
+ public static final boolean CHECKSITES(Read r, byte[] basesM, boolean verifySorted){
+     final ArrayList<SiteScore> list=r.sites;
+     final byte[] basesP=r.bases;
+     return CHECKSITES(list, basesP, basesM, r.numericID, verifySorted);
+ }
+
+ /** Checks a site list with sort verification requested. */
+ public static final boolean CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id){
+     final boolean verifySorted=true;
+     return CHECKSITES(list, basesP, basesM, id, verifySorted);
+ }
+
+ /**
+  * Consistency check for a list of sites. Currently disabled: all arguments are ignored
+  * and the method always returns true. The previous implementation is kept below as a comment;
+  * it validated each site with CHECKSITE and, when verifySorted was set, required scores to be non-increasing.
+  * @return Always true
+  */
+ public static final boolean CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id, boolean verifySorted){
+ return true; //Temporarily disabled
+// if(list==null || list.isEmpty()){return true;}
+// SiteScore prev=null;
+// for(int i=0; i<list.size(); i++){
+// SiteScore ss=list.get(i);
+// if(ss.strand==Gene.MINUS && basesM==null && basesP!=null){basesM=AminoAcid.reverseComplementBases(basesP);}
+// byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
+// if(verbose){System.err.println("Checking site "+i+": "+ss);}
+// boolean b=CHECKSITE(ss, bases, id);
+// assert(b) : id+"\n"+new String(basesP)+"\n"+ss+"\n";
+// if(verbose){System.err.println("Checked site "+i+" = "+ss+"\nss.p="+ss.perfect+", ss.sp="+ss.semiperfect);}
+// if(!b){
+//// System.err.println("Error at SiteScore "+i+": ss.p="+ss.perfect+", ss.sp="+ss.semiperfect);
+// return false;
+// }
+// if(verifySorted && prev!=null && ss.score>prev.score){
+// if(verbose){System.err.println("verifySorted failed.");}
+// return false;
+// }
+// prev=ss;
+// }
+// return true;
+ }
+
+ /**
+  * Verifies that scores never increase from one list element to the next.
+  * @return true if the list is null, has fewer than 2 elements, or is in non-increasing score order
+  */
+ public static final boolean CHECKORDER(ArrayList<SiteScore> list){
+     if(list==null || list.size()<2){return true;}
+     SiteScore previous=list.get(0);
+     for(int i=1; i<list.size(); i++){
+         final SiteScore current=list.get(i);
+         if(current.score>previous.score){return false;}
+         previous=current;
+     }
+     return true;
+ }
+
+ /** Selects basesP when ss.plus() is true and basesM otherwise, then delegates to CHECKSITE(ss, bases, id). */
+ public static final boolean CHECKSITE(SiteScore ss, byte[] basesP, byte[] basesM, long id){
+     final byte[] strandBases;
+     if(ss.plus()){
+         strandBases=basesP;
+     }else{
+         strandBases=basesM;
+     }
+     return CHECKSITE(ss, strandBases, id);
+ }
+
+ /**
+  * Consistency check for a single site. Make sure 'bases' is for correct strand!
+  * Currently disabled: all arguments are ignored and the method always returns true.
+  * The previous implementation is kept below as a comment; it checked the gap endpoints,
+  * the relationship between the paired, quick and slow scores, the perfect/semiperfect flags
+  * against the bases, and that the match length equals the mapped length.
+  * @return Always true
+  */
+ public static final boolean CHECKSITE(SiteScore ss, byte[] bases, long id){
+ return true; //Temporarily disabled
+// if(ss==null){return true;}
+//// System.err.println("Checking site "+ss+"\nss.p="+ss.perfect+", ss.sp="+ss.semiperfect+", bases="+new String(bases));
+// if(ss.perfect){assert(ss.semiperfect) : ss+"\n"+new String(bases);}
+// if(ss.gaps!=null){
+// if(ss.gaps[0]!=ss.start || ss.gaps[ss.gaps.length-1]!=ss.stop){return false;}
+//// assert(ss.gaps[0]==ss.start && ss.gaps[ss.gaps.length-1]==ss.stop);
+// }
+//
+// if(!(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore)){
+// System.err.println("Site paired score violation: "+ss.quickScore+", "+ss.slowScore+", "+ss.pairedScore);
+// return false;
+// }
+//
+// final boolean xy=ss.matchContainsXY();
+// if(bases!=null){
+//
+// final boolean p0=ss.perfect;
+// final boolean sp0=ss.semiperfect;
+// final boolean p1=ss.isPerfect(bases);
+// final boolean sp1=(p1 ? true : ss.isSemiPerfect(bases));
+//
+// assert(p0==p1 || (xy && p1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+
+// "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n";
+// assert(sp0==sp1 || (xy && sp1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+
+// "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n";
+//
+//// ss.setPerfect(bases, false);
+//
+// assert(p0==ss.perfect) :
+// p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+
+// Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n";
+// assert(sp0==ss.semiperfect) :
+// p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+
+// Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n";
+// if(ss.perfect){assert(ss.semiperfect);}
+// }
+// if(ss.match!=null && ss.matchLength()!=ss.mappedLength()){
+// if(verbose){System.err.println("Returning false because matchLength!=mappedLength:\n"+ss.matchLength()+", "+ss.mappedLength()+"\n"+ss);}
+// return false;
+// }
+// return true;
+ }
+
+ /** Sets (true) or clears (false) the PERFECTMASK bit of flags. */
+ public void setPerfect(boolean b){
+     flags=(b ? (flags|PERFECTMASK) : (flags&~PERFECTMASK));
+ }
+
+ /** Sets (true) or clears (false) the RESCUEDMASK bit of flags. */
+ public void setRescued(boolean b){
+     flags=(b ? (flags|RESCUEDMASK) : (flags&~RESCUEDMASK));
+ }
+
+ /** Sets (true) or clears (false) the MAPPEDMASK bit of flags. */
+ public void setMapped(boolean b){
+     flags=(b ? (flags|MAPPEDMASK) : (flags&~MAPPEDMASK));
+ }
+
+ /** Sets (true) or clears (false) the DISCARDMASK bit of flags. */
+ public void setDiscarded(boolean b){
+     flags=(b ? (flags|DISCARDMASK) : (flags&~DISCARDMASK));
+ }
+
+ /** Sets (true) or clears (false) the INVALIDMASK bit of flags. */
+ public void setInvalid(boolean b){
+     flags=(b ? (flags|INVALIDMASK) : (flags&~INVALIDMASK));
+ }
+
+ /** Sets (true) or clears (false) the SWAPMASK bit of flags. */
+ public void setSwapped(boolean b){
+     flags=(b ? (flags|SWAPMASK) : (flags&~SWAPMASK));
+ }
+
+ /** Sets (true) or clears (false) the SHORTMATCHMASK bit of flags. */
+ public void setShortMatch(boolean b){
+     flags=(b ? (flags|SHORTMATCHMASK) : (flags&~SHORTMATCHMASK));
+ }
+
+ /** Sets (true) or clears (false) the INSERTMASK bit of flags. */
+ public void setInsertValid(boolean b){
+     flags=(b ? (flags|INSERTMASK) : (flags&~INSERTMASK));
+ }
+
+ /** Sets (true) or clears (false) the ADAPTERMASK bit of flags. */
+ public void setHasAdapter(boolean b){
+     flags=(b ? (flags|ADAPTERMASK) : (flags&~ADAPTERMASK));
+ }
+
+ /** Sets (true) or clears (false) the SECONDARYMASK bit of flags. */
+ public void setSecondary(boolean b){
+     flags=(b ? (flags|SECONDARYMASK) : (flags&~SECONDARYMASK));
+ }
+
+ /** Sets (true) or clears (false) the AAMASK bit of flags. */
+ public void setAminoAcid(boolean b){
+     flags=(b ? (flags|AAMASK) : (flags&~AAMASK));
+ }
+
+ /** Sets (true) or clears (false) the JUNKMASK bit of flags. */
+ public void setJunk(boolean b){
+     flags=(b ? (flags|JUNKMASK) : (flags&~JUNKMASK));
+ }
+
+ /** Sets (true) or clears (false) the VALIDATEDMASK bit of flags. */
+ public void setValidated(boolean b){
+     flags=(b ? (flags|VALIDATEDMASK) : (flags&~VALIDATEDMASK));
+ }
+
+ /**
+  * Stores the insert size on this read and, if present, on its mate, updating the insert-valid flag of both.
+  * @param x Insert size; any value below 1 is stored as -1 and flagged invalid
+  */
+ public void setInsert(int x){
+     final int value=(x<1 ? -1 : x);
+     final boolean valid=(value>0);
+// assert(x==-1 || x>9 || length()<20) : x+", "+length(); //Invalid assertion for synthetic reads.
+     insert=value;
+     setInsertValid(valid);
+     if(mate==null){return;}
+     mate.insert=value;
+     mate.setInsertValid(valid);
+ }
+
+ private static int[] makeMaskArray(int max) {
+ int[] r=new int[max+1];
+ for(int i=0; i<r.length; i++){r[i]=(1<<i);}
+ return r;
+ }
+
+
+
+ /**
+  * Returns a quality array of the requested length with every entry set to 30.
+  * Lengths below QUALCACHE.length are served from a lazily filled cache, so callers receive
+  * the same array instance on repeated calls and must not modify it; longer lengths get a fresh array.
+  * @param len Desired array length
+  * @return Array of len entries, each 30
+  */
+ public static byte[] getFakeQuality(int len){
+ //Too long for the cache: allocate a private array
+ if(len>=QUALCACHE.length){
+ byte[] r=new byte[len];
+ Arrays.fill(r, (byte)30);
+ return r;
+ }
+ //Checked once without the lock and again while holding it, so only one thread creates the entry.
+ //NOTE(review): the slot is published before Arrays.fill runs and array elements are not volatile,
+ //so another thread may briefly observe a partially filled array - confirm this is acceptable
+ if(QUALCACHE[len]==null){
+ synchronized(QUALCACHE){
+ if(QUALCACHE[len]==null){
+ QUALCACHE[len]=new byte[len];
+ Arrays.fill(QUALCACHE[len], (byte)30);
+ }
+ }
+ }
+ return QUALCACHE[len];
+ }
+
+ /**
+  * Looks up the name of the scaffold at the midpoint of this read's mapped range.
+  * @param requireSingleScaffold If true, a name is only returned when Data.isSingleScaffold accepts the range
+  * @return The scaffold name, or null if the read is unmapped or the single-scaffold requirement is not met
+  */
+ public byte[] getScaffoldName(boolean requireSingleScaffold){
+     if(!mapped()){return null;}
+     if(requireSingleScaffold && !Data.isSingleScaffold(chrom, start, stop)){return null;}
+     final int midpoint=(start+stop)/2;
+     final int idx=Data.scaffoldIndex(chrom, midpoint);
+     return Data.scaffoldNames[chrom][idx];
+ }
+
+ public void bisulfite(boolean AtoG, boolean CtoT, boolean GtoA, boolean TtoC){
+ for(int i=0; i<bases.length; i++){
+ final int x=AminoAcid.baseToNumber[bases[i]];
+ if(x==0 && AtoG){bases[i]='G';}
+ else if(x==1 && CtoT){bases[i]='T';}
+ else if(x==2 && GtoA){bases[i]='A';}
+ else if(x==3 && TtoC){bases[i]='C';}
+ }
+ }
+
+ /**
+  * Creates a copy of this read that shares no arrays or site objects with the original:
+  * bases, quality, match, gaps, originalSite and every element of sites are cloned.
+  * The copy has no mate.
+  */
+ public Read copy(){
+     final Read dup=clone();
+     dup.mate=null;
+     if(dup.bases!=null){dup.bases=dup.bases.clone();}
+     if(dup.quality!=null){dup.quality=dup.quality.clone();}
+     if(dup.match!=null){dup.match=dup.match.clone();}
+     if(dup.gaps!=null){dup.gaps=dup.gaps.clone();}
+     if(dup.originalSite!=null){dup.originalSite=dup.originalSite.clone();}
+
+     if(dup.sites!=null){
+         //Clone the list itself first, then replace each element with its own clone
+         dup.sites=(ArrayList<SiteScore>)dup.sites.clone();
+         final int count=dup.sites.size();
+         for(int i=0; i<count; i++){
+             final SiteScore original=dup.sites.get(i);
+             dup.sites.set(i, original.clone());
+         }
+     }
+     return dup;
+ }
+
+ /**
+  * Shallow copy via Object.clone(): arrays, sites and the mate reference are shared with the original.
+  * @return The cloned read
+  * @throws RuntimeException wrapping the CloneNotSupportedException if cloning is not supported
+  */
+ @Override
+ public Read clone(){
+     try {
+         return (Read) super.clone();
+     } catch (CloneNotSupportedException e) {
+         //Keep the original exception as the cause instead of discarding it
+         throw new RuntimeException(e);
+     }
+ }
+
+ /**
+  * Converts this amino acid read to nucleotides. The result is a clone with the amino acid flag
+  * cleared and bases replaced by AminoAcid.toNTs. When quality is present, the new quality array
+  * is three times as long: the quality of residue i, raised by 5 and capped at MAX_CALLED_QUALITY,
+  * is written to positions 3*i, 3*i+1 and 3*i+2.
+  * @return This protein in canonical nucleotide space.
+  */
+ public Read aminoToNucleic() {
+     assert(aminoacid()) : "This read is not flagged as an amino acid sequence.";
+     Read r=this.clone();
+     r.setAminoAcid(false);
+     r.bases=AminoAcid.toNTs(r.bases);
+     if(quality!=null){
+         byte[] ntquals=new byte[quality.length*3];
+         for(int i=0; i<quality.length; i++){
+             byte q=quality[i];
+             byte q2=(byte)Tools.min(q+5, MAX_CALLED_QUALITY);
+             //Residue i owns three consecutive slots. Indexing with i, i+1, i+2 (as before)
+             //overlapped neighbouring residues and left the tail of the array at 0.
+             final int j=3*i;
+             ntquals[j]=ntquals[j+1]=ntquals[j+2]=q2;
+         }
+         r.quality=ntquals;
+     }
+     return r;
+ }
+
+ /** Cache of shared fake-quality arrays indexed by length; filled lazily by getFakeQuality */
+ private static final byte[][] QUALCACHE=new byte[1000][];
+
+
+ /* Single-bit masks for the boolean properties packed into 'flags' */
+ public static final int STRANDMASK=1;
+ public static final int MAPPEDMASK=(1<<1);
+ public static final int PAIREDMASK=(1<<2);
+ public static final int PERFECTMASK=(1<<3);
+ public static final int AMBIMASK=(1<<4);
+ public static final int RESCUEDMASK=(1<<5);
+ public static final int COLORMASK=(1<<6);
+ public static final int SYNTHMASK=(1<<7);
+ public static final int DISCARDMASK=(1<<8);
+ public static final int INVALIDMASK=(1<<9);
+ public static final int SWAPMASK=(1<<10);
+ public static final int SHORTMATCHMASK=(1<<11);
+
+ /** Bit position of the pair number within flags */
+ public static final int PAIRNUMSHIFT=12;
+ public static final int PAIRNUMMASK=(1<<PAIRNUMSHIFT);
+
+ public static final int INSERTMASK=(1<<13);
+ public static final int ADAPTERMASK=(1<<14);
+ public static final int SECONDARYMASK=(1<<15);
+ public static final int AAMASK=(1<<16);
+ public static final int JUNKMASK=(1<<17);
+ public static final int VALIDATEDMASK=(1<<18);
+
+ /** maskArray[i] holds (1<<i) for i from 0 through 18; VALIDATEDMASK is currently the highest flag */
+ private static final int[] maskArray=makeMaskArray(18); //Be sure this is big enough for all flags!
+
+// public static byte ASCII_OFFSET=33;
+ private static final byte ASCII_OFFSET=33;
+ public static boolean CHANGE_QUALITY=true; //Cap all quality values between MIN_CALLED_QUALITY and MAX_CALLED_QUALITY
+ public static byte MIN_CALLED_QUALITY=2;
+ /** Upper cap for called quality; also caps the boosted quality in aminoToNucleic */
+ public static byte MAX_CALLED_QUALITY=41;
+ /** Upper cap for the quality of agreeing overlapped bases in joinRead */
+ public static byte MAX_MERGE_QUALITY=41;
+ public static boolean TO_UPPER_CASE=false;
+ public static boolean LOWER_CASE_TO_N=false;
+ public static final boolean OTHER_SYMBOLS_TO_N=true;
+ public static boolean AVERAGE_QUALITY_BY_PROBABILITY=true;
+ public static boolean FIX_HEADER=false;
+
+ public static boolean FLAG_JUNK=false;
+ public static boolean FIX_JUNK=false;
+ public static boolean U_TO_T=false;
+ public static boolean COMPRESS_MATCH_BEFORE_WRITING=true;
+ public static boolean DECOMPRESS_MATCH_ON_LOAD=true; //Set to false for some applications, like sorting, perhaps
+
+ public static boolean ADD_BEST_SITE_TO_LIST_FROM_TEXT=true;
+ public static boolean NULLIFY_BROKEN_QUALITY=false;
+ public static boolean FLAT_IDENTITY=true;
+ public static boolean VALIDATE_IN_CONSTRUCTOR=true;
+
+ /** Enables diagnostic output; referenced by the disabled site checks in this class */
+ public static boolean verbose=false;
+}
diff --git a/current/stream/ReadInputStream.java b/current/stream/ReadInputStream.java
new file mode 100755
index 0000000..496d597
--- /dev/null
+++ b/current/stream/ReadInputStream.java
@@ -0,0 +1,46 @@
+package stream;
+
+import java.util.ArrayList;
+
+public abstract class ReadInputStream {
+
+
+ public abstract Read next();
+
+// public final ArrayList<Read> fetchAll(){
+// ArrayList<Read> out=new ArrayList<Read>();
+// for(ArrayList<Read> list=nextList(); list!=null && list.size()>0; list=nextList()){
+// out.addAll(list);
+// }
+// close();
+// return out;
+// }
+
+ public abstract ArrayList<Read> nextList();
+
+ public abstract boolean hasMore();
+
+ public abstract void restart();
+
+ /** Returns true if there was an error, false otherwise */
+ public abstract boolean close();
+
+ public abstract boolean paired();
+
+ protected static final ArrayList<Read> toList(Read[] array){
+ if(array==null || array.length==0){return null;}
+ ArrayList<Read> list=new ArrayList<Read>(array.length);
+ for(int i=0; i<array.length; i++){list.add(array[i]);}
+ return list;
+ }
+
+ /** Return true if this stream has detected an error */
+ public boolean errorState(){return errorState;}
+ /** TODO */
+ protected boolean errorState=false;
+
+ public final boolean preferLists(){return true;}
+
+ public abstract void start();
+
+}
diff --git a/current/stream/ReadStreamByteWriter.java b/current/stream/ReadStreamByteWriter.java
new file mode 100755
index 0000000..d4dccfa
--- /dev/null
+++ b/current/stream/ReadStreamByteWriter.java
@@ -0,0 +1,594 @@
+package stream;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+public class ReadStreamByteWriter extends ReadStreamWriter {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Creates a byte-oriented writer. All arguments are handed to the ReadStreamWriter constructor,
+  * together with two fixed values: makeWriter=false and buffered=the class constant {@code buffered}.
+  * @param ff Output file format descriptor
+  * @param qfname_ Name of the separate quality file, or null for none
+  * @param read1_ Passed through as the superclass 'read1_' argument (this writer handles read 1 when true)
+  * @param bufferSize Passed through to the superclass
+  * @param header Passed through to the superclass
+  * @param useSharedHeader Passed through to the superclass
+  */
+ public ReadStreamByteWriter(FileFormat ff, String qfname_, boolean read1_, int bufferSize, CharSequence header, boolean useSharedHeader){
+ super(ff, qfname_, read1_, bufferSize, header, false, buffered, useSharedHeader);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Execution ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Thread entry point. Delegates to run2(); an IOException marks the writer as
+  * unsuccessful and is rethrown wrapped in a RuntimeException.
+  */
+ @Override
+ public void run() {
+     try {
+         run2();
+     } catch (final IOException failure) {
+         //Record the failure before propagating it as an unchecked exception.
+         finishedSuccessfully=false;
+         throw new RuntimeException(failure);
+     }
+ }
+
+ /**
+  * Runs the full write cycle: header, queued jobs, then final flush and close.
+  * @throws IOException if any write fails
+  */
+ private void run2() throws IOException{
+     writeHeader();
+
+     //One staging buffer for the main output; a second one only when a quality stream exists.
+     final ByteBuilder mainBuffer=new ByteBuilder(65000);
+     final ByteBuilder qualBuffer;
+     if(myQOutstream==null){
+         qualBuffer=null;
+     }else{
+         qualBuffer=new ByteBuilder(65000);
+     }
+
+     processJobs(mainBuffer, qualBuffer);
+     finishWriting(mainBuffer, qualBuffer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private void writeHeader() throws IOException {
+ if(!OUTPUT_SAM && !OUTPUT_FASTQ && !OUTPUT_FASTA && !OUTPUT_ATTACHMENT && !OUTPUT_HEADER){
+ if(OUTPUT_INTERLEAVED){
+// assert(false) : OUTPUT_SAM+", "+OUTPUT_FASTQ+", "+OUTPUT_FASTA+", "+OUTPUT_ATTACHMENT+", "+OUTPUT_INTERLEAVED+", "+SITES_ONLY;
+ myOutstream.write("#INTERLEAVED\n".getBytes());
+ }
+ if(SITES_ONLY){
+ myOutstream.write(("#"+SiteScore.header()+"\n").getBytes());
+ }else if(!OUTPUT_ATTACHMENT){
+ myOutstream.write(("#"+Read.header()+"\n").getBytes());
+ }
+ }
+ }
+
+ /**
+  * Takes jobs off the queue and writes them until a poison job is received.
+  * Each non-empty job is sent to the quality writer (when a quality stream exists) and then
+  * to the writer for the configured output format. A job flagged 'close' has the staging
+  * buffer flushed to its stream, and that stream is then finished.
+  * @param bb Staging buffer for the main output
+  * @param bbq Staging buffer for quality output; only used when myQOutstream is non-null
+  * @throws IOException if any write fails
+  */
+ private void processJobs(final ByteBuilder bb, final ByteBuilder bbq) throws IOException{
+
+     while(true){
+
+         //Block until a job arrives; an interrupt is printed and the take is retried.
+         Job job=null;
+         do{
+             try {
+                 job=queue.take();
+             } catch (InterruptedException e) {
+                 // TODO Auto-generated catch block
+                 e.printStackTrace();
+             }
+         }while(job==null);
+
+         if(job.poison){break;}
+
+         final OutputStream os=job.outstream;
+
+         if(!job.isEmpty()){
+             if(myQOutstream!=null){writeQuality(job, bbq);}
+
+             if(OUTPUT_SAM){writeSam(job, bb, os);}
+             else if(SITES_ONLY){writeSites(job, bb, os);}
+             else if(OUTPUT_FASTQ){writeFastq(job, bb, os);}
+             else if(OUTPUT_FASTA){writeFasta(job, bb, os);}
+             else if(OUTPUT_ATTACHMENT){writeAttachment(job, bb, os);}
+             else if(OUTPUT_HEADER){writeHeader(job, bb, os);}
+             else{writeBread(job, bb, os);}
+         }
+
+         if(job.close){
+             //Push out whatever is still staged before the job's stream is finished.
+             if(bb.length>0){
+                 os.write(bb.array, 0, bb.length);
+                 bb.setLength(0);
+             }
+             assert(job.outstream!=null && job.outstream!=myOutstream);
+             ReadWrite.finishWriting(null, job.outstream, fname, allowSubprocess); //TODO: This should be job.fname
+         }
+     }
+ }
+
+ /**
+ * @throws IOException
+ *
+ */
+ private void finishWriting(final ByteBuilder bb, final ByteBuilder bbq) throws IOException {
+ if(myOutstream!=null){
+ if(bb.length>0){
+ myOutstream.write(bb.array, 0, bb.length);
+ bb.setLength(0);
+ }
+ ReadWrite.finishWriting(null, myOutstream, fname, allowSubprocess);
+ }
+ if(myQOutstream!=null){
+ if(bbq.length>0){
+ myQOutstream.write(bbq.array, 0, bbq.length);
+ bbq.setLength(0);
+ }
+ ReadWrite.finishWriting(null, myQOutstream, qfname, allowSubprocess);
+ }
+ finishedSuccessfully=true;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Writes one quality record per read in the job to the quality stream. Each record is a
+  * '>' line with the read id followed by the output of toQualityB.
+  * A read-1 writer emits each read and, when interleaving, its mate; a read-2 writer emits
+  * only the mate of each read.
+  *
+  * The buffer is flushed whenever it reaches 32768 bytes and again unconditionally at the end.
+  * The previous version never flushed the remainder in the read-2 branch and then cleared the
+  * buffer at the start of the next call, silently dropping records. Because of the final
+  * flush the buffer is always empty on return, so no reset is needed on entry.
+  *
+  * @param job Job whose reads are written
+  * @param bbq Staging buffer for quality output
+  * @throws IOException if writing to the quality stream fails
+  */
+ private void writeQuality(final Job job, final ByteBuilder bbq) throws IOException{
+     for(final Read r : job.list){
+         if(r!=null){
+             final Read r2=r.mate;
+             if(read1){
+                 appendQualityRecord(r, bbq);
+                 if(OUTPUT_INTERLEAVED && r2!=null){appendQualityRecord(r2, bbq);}
+             }else{
+                 assert(r2!=null && r2.mate==r && r2!=r) : r.toText(false);
+                 //With assertions disabled a missing mate is skipped instead of causing an NPE,
+                 //matching how writeFasta/writeFastq treat a null mate.
+                 if(r2!=null){appendQualityRecord(r2, bbq);}
+             }
+         }
+         if(bbq.length>=32768){
+             myQOutstream.write(bbq.array, 0, bbq.length);
+             bbq.setLength(0);
+         }
+     }
+
+     //Flush the remainder so nothing is carried over into (or lost before) the next job.
+     if(bbq.length>0){
+         myQOutstream.write(bbq.array, 0, bbq.length);
+         bbq.setLength(0);
+     }
+ }
+
+ /** Appends a single quality record ('>' + id, newline, quality text, newline) for r to bbq. */
+ private void appendQualityRecord(final Read r, final ByteBuilder bbq) throws IOException{
+     bbq.append('>');
+     bbq.append(r.id);
+     bbq.append('\n');
+     if(r.bases!=null){toQualityB(r.quality, r.length(), FASTA_WRAP, bbq);}
+     bbq.append('\n');
+ }
+
+ /**
+  * Writes a job's reads via Read.toText(true, ...), one line per read, and updates the
+  * read/base counters. A read-1 writer emits each read plus, when interleaving, its mate;
+  * a read-2 writer emits only the mate and writes nothing if the mate is missing.
+  * The buffer is flushed to os once it holds at least 32768 bytes; any remainder stays in bb.
+  * @param job Job whose reads are written
+  * @param bb Staging buffer
+  * @param os Destination stream
+  * @throws IOException if writing to os fails
+  */
+ private void writeBread(Job job, ByteBuilder bb, OutputStream os) throws IOException {
+     for(final Read lead : job.list){
+         if(lead!=null){
+             final Read partner=lead.mate;
+             //Slot 0 is the read this writer owns; slot 1 is the interleaved mate, if any.
+             final Read slot0=(read1 ? lead : partner);
+             final Read slot1=(read1 && OUTPUT_INTERLEAVED ? partner : null);
+             for(int slot=0; slot<2; slot++){
+                 final Read x=(slot==0 ? slot0 : slot1);
+                 if(x==null){continue;}
+                 x.toText(true, bb).append('\n');
+                 final int len=(x.bases==null ? 0 : x.length());
+                 readsWritten++;
+                 basesWritten+=len;
+                 if(x.valid() && x.mapped()){
+                     validReadsWritten++;
+                     validBasesWritten+=len;
+                 }
+             }
+         }
+         if(bb.length>=32768){
+             os.write(bb.array, 0, bb.length);
+             bb.setLength(0);
+         }
+     }
+ }
+
+ /**
+  * Writes the string form of each read's attached object (Read.obj), one per line. Reads
+  * with no attachment produce no output but are still counted. A read-1 writer handles each
+  * read plus, when interleaving, its mate; a read-2 writer handles only the mate.
+  * Only the read counters are updated (no base counts). The buffer is flushed to os once it
+  * holds at least 32768 bytes.
+  * @param job Job whose reads are written
+  * @param bb Staging buffer
+  * @param os Destination stream
+  * @throws IOException if writing to os fails
+  */
+ private void writeAttachment(Job job, ByteBuilder bb, OutputStream os) throws IOException {
+     for(final Read lead : job.list){
+         if(lead!=null){
+             final Read partner=lead.mate;
+             final Read slot0=(read1 ? lead : partner);
+             final Read slot1=(read1 && OUTPUT_INTERLEAVED ? partner : null);
+             for(int slot=0; slot<2; slot++){
+                 final Read x=(slot==0 ? slot0 : slot1);
+                 if(x==null){continue;}
+                 if(x.obj!=null){
+                     bb.append(x.obj.toString()).append('\n');
+                 }
+                 readsWritten++;
+                 if(x.valid() && x.mapped()){validReadsWritten++;}
+             }
+         }
+         if(bb.length>=32768){
+             os.write(bb.array, 0, bb.length);
+             bb.setLength(0);
+         }
+     }
+ }
+
+ /**
+  * Writes only the id of each read, one per line, and updates the read counters (no base
+  * counts). A read-1 writer emits each read's id plus, when interleaving, its mate's id;
+  * a read-2 writer emits only the mate's id and nothing when the mate is missing.
+  * The buffer is flushed to os once it holds at least 32768 bytes.
+  * @param job Job whose reads are written
+  * @param bb Staging buffer
+  * @param os Destination stream
+  * @throws IOException if writing to os fails
+  */
+ private void writeHeader(Job job, ByteBuilder bb, OutputStream os) throws IOException {
+     for(final Read lead : job.list){
+         if(lead!=null){
+             final Read partner=lead.mate;
+             final Read slot0=(read1 ? lead : partner);
+             final Read slot1=(read1 && OUTPUT_INTERLEAVED ? partner : null);
+             for(int slot=0; slot<2; slot++){
+                 final Read x=(slot==0 ? slot0 : slot1);
+                 if(x==null){continue;}
+                 bb.append(x.id).append('\n');
+                 readsWritten++;
+                 if(x.valid() && x.mapped()){validReadsWritten++;}
+             }
+         }
+         if(bb.length>=32768){
+             os.write(bb.array, 0, bb.length);
+             bb.setLength(0);
+         }
+     }
+ }
+
+ /**
+  * Writes a job's reads as fasta (wrapped at FASTA_WRAP) and updates the read/base counters.
+  * A read-1 writer emits each read plus, when interleaving, its mate; a read-2 writer emits
+  * only the mate, asserting proper pairing unless ignorePairAssertions is set, and skipping
+  * reads whose mate is null.
+  * The buffer is flushed to os once it holds at least 32768 bytes.
+  * @param job Job whose reads are written
+  * @param bb Staging buffer
+  * @param os Destination stream
+  * @throws IOException if writing to os fails
+  */
+ private void writeFasta(Job job, ByteBuilder bb, OutputStream os) throws IOException {
+     for(final Read lead : job.list){
+         if(lead!=null){
+             final Read partner=lead.mate;
+             if(!read1){
+                 assert(ignorePairAssertions || (partner!=null && partner.mate==lead && partner!=lead)) : "\n"+lead.toText(false)+"\n\n"+(partner==null ? "null" : partner.toText(false)+"\n");
+             }
+             final Read slot0=(read1 ? lead : partner);
+             final Read slot1=(read1 && OUTPUT_INTERLEAVED ? partner : null);
+             for(int slot=0; slot<2; slot++){
+                 final Read x=(slot==0 ? slot0 : slot1);
+                 if(x==null){continue;}
+                 x.toFasta(FASTA_WRAP, bb).append('\n');
+                 final int len=(x.bases==null ? 0 : x.length());
+                 readsWritten++;
+                 basesWritten+=len;
+                 if(x.valid() && x.mapped()){
+                     validReadsWritten++;
+                     validBasesWritten+=len;
+                 }
+             }
+         }
+         if(bb.length>=32768){
+             os.write(bb.array, 0, bb.length);
+             bb.setLength(0);
+         }
+     }
+ }
+
+ /**
+  * Writes a job's reads as fastq and updates the read/base counters.
+  * A read-1 writer emits each read plus, when interleaving, its mate; a read-2 writer emits
+  * only the mate, asserting proper pairing unless ignorePairAssertions is set, and skipping
+  * reads whose mate is null.
+  * The buffer is flushed to os once it holds at least 32768 bytes.
+  * @param job Job whose reads are written
+  * @param bb Staging buffer
+  * @param os Destination stream
+  * @throws IOException if writing to os fails
+  */
+ private void writeFastq(Job job, ByteBuilder bb, OutputStream os) throws IOException {
+     for(final Read lead : job.list){
+         if(lead!=null){
+             final Read partner=lead.mate;
+             if(!read1){
+                 assert(ignorePairAssertions || (partner!=null && partner.mate==lead && partner!=lead)) : "\n"+lead.toText(false)+"\n\n"+(partner==null ? "null" : partner.toText(false)+"\n");
+             }
+             final Read slot0=(read1 ? lead : partner);
+             final Read slot1=(read1 && OUTPUT_INTERLEAVED ? partner : null);
+             for(int slot=0; slot<2; slot++){
+                 final Read x=(slot==0 ? slot0 : slot1);
+                 if(x==null){continue;}
+                 x.toFastq(bb).append('\n');
+                 final int len=(x.bases==null ? 0 : x.length());
+                 readsWritten++;
+                 basesWritten+=len;
+                 if(x.valid() && x.mapped()){
+                     validReadsWritten++;
+                     validBasesWritten+=len;
+                 }
+             }
+         }
+         if(bb.length>=32768){
+             os.write(bb.array, 0, bb.length);
+             bb.setLength(0);
+         }
+     }
+ }
+
+ /**
+ * @param job
+ * @param bb
+ * @param os
+ * @throws IOException
+ */
+ private void writeSites(Job job, ByteBuilder bb, OutputStream os) throws IOException {
+ assert(read1);
+ for(final Read r : job.list){
+ Read r2=(r==null ? null : r.mate);
+
+ if(r!=null && r.sites!=null){
+ r.toSitesB(bb).append('\n');
+
+ readsWritten++;
+ basesWritten+=(r.bases!=null ? r.length() : 0);
+ validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0);
+ validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.length() : 0);
+ }
+ if(r2!=null){
+ r2.toSitesB(bb).append('\n');
+
+ readsWritten++;
+ basesWritten+=(r2.bases!=null ? r2.length() : 0);
+ validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0);
+ validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? r2.length() : 0);
+ }
+ if(bb.length>=32768){
+ os.write(bb.array, 0, bb.length);
+ bb.setLength(0);
+ }
+ }
+ }
+
+ /**
+ * Writes a job's reads as sam lines and updates the read/base counters.
+ * For each read and its mate, the primary line comes from the SamLine attached to the read
+ * (when USE_ATTACHED_SAMLINE is set and Read.obj is non-null) or from a freshly built SamLine.
+ * When OUTPUT_SAM_SECONDARY_ALIGNMENTS is set, every site after the first in Read.sites is
+ * also written as a secondary line; secondary lines are not counted in the statistics.
+ * Only valid for a read-1 writer (asserted). The buffer is flushed to os once it holds at
+ * least 32768 bytes; any remainder is left in bb for the caller.
+ * @param job Job whose reads are written
+ * @param bb Staging buffer
+ * @param os Destination stream
+ * @throws IOException if writing to os fails
+ */
+ private void writeSam(Job job, ByteBuilder bb, OutputStream os) throws IOException {
+
+ assert(read1);
+ for(final Read r : job.list){
+ Read r2=(r==null ? null : r.mate);
+
+ //Both primary lines are built up front, before anything is appended to the buffer.
+ SamLine sl1=(r==null ? null : (USE_ATTACHED_SAMLINE && r.obj!=null ? (SamLine)r.obj : new SamLine(r, 0)));
+ SamLine sl2=(r2==null ? null : (USE_ATTACHED_SAMLINE && r2.obj!=null ? (SamLine)r2.obj : new SamLine(r2, 1)));
+
+ if(r!=null){
+
+ //Debug dump of every site; unreachable while the class constant 'verbose' is false.
+ if(verbose && r.numSites()>0){
+ int ssnum=0;
+ final Read clone=r.clone();
+ for(SiteScore ss : r.sites){
+
+ clone.setFromSite(ss);
+ clone.setSecondary(true);
+ SamLine sl=new SamLine(clone, 0);
+
+ System.err.println("\n[*** ss"+ssnum+":\n"+ss+"\n*** clone: \n"+clone+"\n*** sl: \n"+sl+"\n***]\n");
+ ssnum++;
+ }
+ }
+
+ assert(!ASSERT_CIGAR || !r.mapped() || sl1.cigar!=null) : r;
+ sl1.toBytes(bb).append('\n');
+
+ readsWritten++;
+ basesWritten+=(r.bases!=null ? r.length() : 0);
+ validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0);
+ validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.length() : 0);
+ ArrayList<SiteScore> list=r.sites;
+ if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){
+ //One clone is reused for all secondary sites; index 0 is skipped as the primary site.
+ final Read clone=r.clone();
+ for(int i=1; i<list.size(); i++){
+ SiteScore ss=list.get(i);
+ clone.match=null;
+ clone.setFromSite(ss);
+ clone.setSecondary(true);
+
+// System.err.println(r.numericID+": "+(ss.match==null ? "null" : new String(ss.match)));
+
+// assert(false) : r.mapScore+"\n"+ss.header()+"\n"+r.sites+"\n";
+ SamLine sl=new SamLine(clone, 0);
+ assert(!sl.primary());
+// sl.setPrimary(false);
+
+
+ assert(!ASSERT_CIGAR || sl.cigar!=null) : r;
+
+ sl.toBytes(bb).append('\n');
+
+// readsWritten++;
+// basesWritten+=(r.bases!=null ? r.length() : 0);
+// validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0);
+// validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.length() : 0);
+ }
+ }
+ }
+ if(r2!=null){
+ assert(!ASSERT_CIGAR || !r2.mapped() || sl2.cigar!=null) : r2;
+ //Unless names are being kept, the mate's line is given the same qname as read 1's line.
+ if(!SamLine.KEEP_NAMES && sl1!=null && ((sl2.qname==null) || !sl2.qname.equals(sl1.qname))){
+ sl2.qname=sl1.qname;
+ }
+ sl2.toBytes(bb).append('\n');
+
+ readsWritten++;
+ basesWritten+=(r2.bases!=null ? r2.length() : 0);
+ validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0);
+ validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? r2.length() : 0);
+
+ ArrayList<SiteScore> list=r2.sites;
+ if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){
+ final Read clone=r2.clone();
+ for(int i=1; i<list.size(); i++){
+ SiteScore ss=list.get(i);
+ clone.match=null;
+ clone.setFromSite(ss);
+ clone.setSecondary(true);
+// assert(false) : r.mapScore+"\n"+ss.header()+"\n"+r.list+"\n";
+ //NOTE(review): the mate's primary line is built with second argument 1, but its
+ //secondary lines use 0 - confirm against the SamLine constructor whether this is intended.
+ SamLine sl=new SamLine(clone, 0);
+ assert(!sl.primary());
+// sl.setPrimary(false);
+
+ assert(!ASSERT_CIGAR || sl.cigar!=null) : r2;
+
+ sl.toBytes(bb).append('\n');
+
+// readsWritten++;
+// basesWritten+=(r.bases!=null ? r.length() : 0);
+// validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0);
+// validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.length() : 0);
+ }
+ }
+ }
+ //Flush once per read pair when the staged output reaches the threshold.
+ if(bb.length>=32768){
+ os.write(bb.array, 0, bb.length);
+ bb.setLength(0);
+ }
+
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Value passed by the constructor as the superclass 'buffered' argument. */
+ private static final boolean buffered=true;
+ /** When true, writeSam dumps every site of each read to System.err; compiled off here. */
+ private static final boolean verbose=false;
+
+}
diff --git a/current/stream/ReadStreamStringWriter.java b/current/stream/ReadStreamStringWriter.java
new file mode 100755
index 0000000..7671e5b
--- /dev/null
+++ b/current/stream/ReadStreamStringWriter.java
@@ -0,0 +1,364 @@
+package stream;
+
+import java.util.ArrayList;
+
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+public class ReadStreamStringWriter extends ReadStreamWriter {
+
+ /**
+  * Legacy constructor that takes a file name instead of a FileFormat. It builds the format
+  * with FileFormat.testOutput using FileFormat.BREAD as the format argument, then delegates
+  * to the main constructor with no quality file, no header and useSharedHeader=false.
+  * NOTE(review): the meaning of the three trailing boolean arguments to testOutput is not
+  * visible from this file - check FileFormat before changing them.
+  * @param fname_ Output file name
+  * @param read1_ Passed through as the 'read1_' argument
+  * @param bufferSize Passed through as the 'bufferSize' argument
+  * @param allowSubprocess_ Passed to FileFormat.testOutput
+  */
+ @Deprecated
+ public ReadStreamStringWriter(String fname_, boolean read1_, int bufferSize, boolean allowSubprocess_){
+ this(FileFormat.testOutput(fname_, FileFormat.BREAD, null, allowSubprocess_, false, false, true), null, read1_, bufferSize, null, false);
+ }
+
+ /**
+  * Creates a text-oriented writer. All arguments are handed to the ReadStreamWriter
+  * constructor, together with two fixed values: makeWriter=true and buffered=true.
+  * @param ff Output file format descriptor
+  * @param qfname_ Name of the separate quality file, or null for none
+  * @param read1_ Passed through as the superclass 'read1_' argument
+  * @param bufferSize Passed through to the superclass
+  * @param header Passed through to the superclass
+  * @param useSharedHeader Passed through to the superclass
+  */
+ public ReadStreamStringWriter(FileFormat ff, String qfname_, boolean read1_, int bufferSize, CharSequence header, boolean useSharedHeader){
+ super(ff, qfname_, read1_, bufferSize, header, true, true, useSharedHeader);
+ }
+
+ /**
+  * Thread entry point. Prints the optional '#' header, then takes jobs from the queue until a
+  * poison job arrives. Each non-empty job is written first to the quality writer (if any) and
+  * then to job.writer in the configured format (sam, sites, fastq, fasta, attachment, header,
+  * or the native text format). Jobs flagged 'close' have their own writer finished. Finally
+  * the main and quality writers are finished and the writer is marked successful.
+  *
+  * Fix relative to the previous version: in header-only mode with interleaved output, the
+  * mate's id is now printed for the mate (the old code printed read 1's id twice).
+  */
+ @Override
+ public void run() {
+
+     if(!OUTPUT_SAM && !OUTPUT_FASTQ && !OUTPUT_FASTA && !OUTPUT_ATTACHMENT && !OUTPUT_HEADER){
+         if(OUTPUT_INTERLEAVED){
+             myWriter.print("#INTERLEAVED\n");
+         }
+         if(SITES_ONLY){
+             myWriter.println("#"+SiteScore.header());
+         }else if(!OUTPUT_ATTACHMENT){
+             myWriter.println("#"+Read.header());
+         }
+     }
+
+     //Block until the first job arrives; an interrupt is printed and the take is retried.
+     Job job=null;
+     while(job==null){
+         try {
+             job=queue.take();
+         } catch (InterruptedException e) {
+             // TODO Auto-generated catch block
+             e.printStackTrace();
+         }
+     }
+
+     while(job!=null && !job.poison){
+         if(!job.isEmpty()){
+
+             //Separate quality file: one '>' id line plus the quality text per read.
+             if(myQWriter!=null){
+                 if(read1){
+                     for(final Read r : job.list){
+                         if(r!=null){
+                             {
+                                 CharSequence cs=(r.bases==null ? "\n" : toQualitySB(r.quality, r.length(), FASTA_WRAP).append('\n'));
+                                 myQWriter.print('>');
+                                 myQWriter.println(r.id);
+                                 myQWriter.print(cs);
+                             }
+                             Read r2=r.mate;
+                             if(OUTPUT_INTERLEAVED && r2!=null){
+                                 CharSequence cs=(r2.bases==null ? "\n" : toQualitySB(r2.quality, r2.length(), FASTA_WRAP).append('\n'));
+                                 myQWriter.print('>');
+                                 myQWriter.println(r2.id);
+                                 myQWriter.print(cs);
+                             }
+                         }
+                     }
+                 }else{
+                     for(final Read r1 : job.list){
+                         if(r1!=null){
+                             final Read r2=r1.mate;
+                             assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false);
+                             CharSequence cs=(r2.bases==null ? "\n" : toQualitySB(r2.quality, r2.length(), FASTA_WRAP).append('\n'));
+                             myQWriter.print('>');
+                             myQWriter.println(r2.id);
+                             myQWriter.print(cs);
+                         }
+                     }
+                 }
+             }
+
+             if(OUTPUT_SAM){
+                 assert(read1);
+                 for(final Read r : job.list){
+                     Read r2=(r==null ? null : r.mate);
+
+                     //Both primary lines are built before anything is printed.
+                     SamLine sl1=(r==null ? null : (USE_ATTACHED_SAMLINE && r.obj!=null ? (SamLine)r.obj : new SamLine(r, 0)));
+                     SamLine sl2=(r2==null ? null : (USE_ATTACHED_SAMLINE && r2.obj!=null ? (SamLine)r2.obj : new SamLine(r2, 1)));
+
+                     if(r!=null){
+                         job.writer.print(sl1.toText().append('\n'));
+                         tallyRead(r, true);
+
+                         ArrayList<SiteScore> list=r.sites;
+                         if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){
+                             //Secondary alignments are printed but not counted.
+                             final Read clone=r.clone();
+                             for(int i=1; i<list.size(); i++){
+                                 SiteScore ss=list.get(i);
+                                 clone.match=null;
+                                 clone.setFromSite(ss);
+                                 clone.setSecondary(true);
+                                 SamLine sl=new SamLine(clone, 0);
+                                 assert(!sl.primary());
+
+                                 job.writer.print(sl.toText().append('\n'));
+                             }
+                         }
+                     }
+                     if(r2!=null){
+                         //Unless names are being kept, the mate's line gets read 1's qname.
+                         if(!SamLine.KEEP_NAMES && sl1!=null && ((sl2.qname==null) || !sl2.qname.equals(sl1.qname))){
+                             sl2.qname=sl1.qname;
+                         }
+                         job.writer.print(sl2.toText().append('\n'));
+                         tallyRead(r2, true);
+
+                         ArrayList<SiteScore> list=r2.sites;
+                         if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){
+                             final Read clone=r2.clone();
+                             for(int i=1; i<list.size(); i++){
+                                 SiteScore ss=list.get(i);
+                                 clone.match=null;
+                                 clone.setFromSite(ss);
+                                 clone.setSecondary(true);
+                                 SamLine sl=new SamLine(clone, 0);
+                                 assert(!sl.primary());
+
+                                 job.writer.print(sl.toText().append('\n'));
+                             }
+                         }
+                     }
+
+                 }
+             }else if(SITES_ONLY){
+                 assert(read1);
+                 for(final Read r : job.list){
+                     Read r2=(r==null ? null : r.mate);
+
+                     if(r!=null && r.sites!=null){
+                         job.writer.print(r.toSites().append('\n'));
+                         tallyRead(r, true);
+                     }
+                     if(r2!=null){
+                         job.writer.print(r2.toSites().append('\n'));
+                         tallyRead(r2, true);
+                     }
+                 }
+             }else if(OUTPUT_FASTQ){
+                 if(read1){
+                     for(final Read r : job.list){
+                         if(r!=null){
+                             job.writer.print(r.toFastq().append('\n'));
+                             tallyRead(r, true);
+                             Read r2=r.mate;
+                             if(OUTPUT_INTERLEAVED && r2!=null){
+                                 job.writer.print(r2.toFastq().append('\n'));
+                                 tallyRead(r2, true);
+                             }
+                         }
+                     }
+                 }else{
+                     for(final Read r1 : job.list){
+                         if(r1!=null){
+                             final Read r2=r1.mate;
+                             assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false);
+                             job.writer.print(r2.toFastq().append('\n'));
+                             tallyRead(r2, true);
+                         }
+                     }
+                 }
+             }else if(OUTPUT_FASTA){
+                 if(read1){
+                     for(final Read r : job.list){
+                         if(r!=null){
+                             job.writer.print(r.toFasta(FASTA_WRAP).append('\n'));
+                             tallyRead(r, true);
+                             Read r2=r.mate;
+                             if(OUTPUT_INTERLEAVED && r2!=null){
+                                 job.writer.print(r2.toFasta(FASTA_WRAP).append('\n'));
+                                 tallyRead(r2, true);
+                             }
+                         }
+                     }
+                 }else{
+                     for(final Read r1 : job.list){
+                         if(r1!=null){
+                             final Read r2=r1.mate;
+                             assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false);
+                             job.writer.print(r2.toFasta(FASTA_WRAP).append('\n'));
+                             tallyRead(r2, true);
+                         }
+                     }
+                 }
+             }else if(OUTPUT_ATTACHMENT){
+                 //Attachment mode: a read without an attached object is printed as ".".
+                 if(read1){
+                     for(final Read r : job.list){
+                         if(r!=null){
+                             job.writer.println(r.obj==null ? "." : r.obj.toString());
+                             tallyRead(r, false);
+                             Read r2=r.mate;
+                             if(OUTPUT_INTERLEAVED && r2!=null){
+                                 job.writer.println(r2.obj==null ? "." : r2.obj.toString());
+                                 tallyRead(r2, false);
+                             }
+                         }
+                     }
+                 }else{
+                     for(final Read r1 : job.list){
+                         if(r1!=null){
+                             final Read r2=r1.mate;
+                             if(r2!=null){
+                                 job.writer.println(r2.obj==null ? "." : r2.obj.toString());
+                                 tallyRead(r2, false);
+                             }else{
+                                 job.writer.println(".");
+                             }
+                         }
+                     }
+                 }
+             }else if(OUTPUT_HEADER){
+                 if(read1){
+                     for(final Read r : job.list){
+                         if(r!=null){
+                             job.writer.println(r.id);
+                             tallyRead(r, false);
+                             Read r2=r.mate;
+                             if(OUTPUT_INTERLEAVED && r2!=null){
+                                 //Bug fix: print the mate's own id (was r.id).
+                                 job.writer.println(r2.id);
+                                 tallyRead(r2, false);
+                             }
+                         }
+                     }
+                 }else{
+                     for(final Read r1 : job.list){
+                         if(r1!=null){
+                             final Read r2=r1.mate;
+                             if(r2!=null){
+                                 job.writer.println(r2.id);
+                                 tallyRead(r2, false);
+                             }else{
+                                 job.writer.println(".");
+                             }
+                         }
+                     }
+                 }
+             }else{
+                 //Native text format; a missing mate in a read-2 file is printed as ".".
+                 if(read1){
+                     for(final Read r : job.list){
+                         if(r!=null){
+                             job.writer.print(r.toText(true).append('\n'));
+                             tallyRead(r, true);
+                             Read r2=r.mate;
+                             if(OUTPUT_INTERLEAVED && r2!=null){
+                                 job.writer.print(r2.toText(true).append('\n'));
+                                 tallyRead(r2, true);
+                             }
+                         }
+                     }
+                 }else{
+                     for(final Read r1 : job.list){
+                         if(r1!=null){
+                             final Read r2=r1.mate;
+                             if(r2!=null){
+                                 job.writer.print(r2.toText(true).append('\n'));
+                                 tallyRead(r2, true);
+                             }else{
+                                 job.writer.print(".\n");
+                             }
+                         }
+                     }
+                 }
+             }
+         }
+         if(job.close){
+             assert(job.writer!=null && job.writer!=myWriter);
+             ReadWrite.finishWriting(job.writer, job.outstream, fname, allowSubprocess);
+         }
+
+         //Block until the next job arrives.
+         job=null;
+         while(job==null){
+             try {
+                 job=queue.take();
+             } catch (InterruptedException e) {
+                 // TODO Auto-generated catch block
+                 e.printStackTrace();
+             }
+         }
+     }
+
+     if(myWriter!=null){
+         ReadWrite.finishWriting(myWriter, myOutstream, fname, allowSubprocess);
+     }
+     if(myQWriter!=null){
+         ReadWrite.finishWriting(myQWriter, myQOutstream, qfname, allowSubprocess);
+     }
+     finishedSuccessfully=true;
+ }
+
+ /**
+  * Updates the output counters for one written read.
+  * @param r The read that was written; must not be null
+  * @param countBases When true the base counters are updated as well as the read counters
+  */
+ private void tallyRead(final Read r, final boolean countBases){
+     readsWritten++;
+     if(countBases){
+         basesWritten+=(r.bases!=null ? r.length() : 0);
+     }
+     validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0);
+     if(countBases){
+         validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.length() : 0);
+     }
+ }
+
+}
diff --git a/current/stream/ReadStreamWriter.java b/current/stream/ReadStreamWriter.java
new file mode 100755
index 0000000..8b616c1
--- /dev/null
+++ b/current/stream/ReadStreamWriter.java
@@ -0,0 +1,416 @@
+package stream;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import align2.Shared;
+
+import dna.Data;
+
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+public abstract class ReadStreamWriter extends Thread {
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ protected ReadStreamWriter(FileFormat ff, String qfname_, boolean read1_, int bufferSize, CharSequence header,
+ boolean makeWriter, boolean buffered, boolean useSharedHeader){
+// assert(false) : useSharedHeader+", "+header;
+ assert(ff!=null);
+ assert(ff.write()) : "FileFormat is not in write mode for "+ff.name();
+
+ assert(!ff.text() && !ff.unknownFormat()) : "Unknown format for "+ff;
+ OUTPUT_FASTQ=ff.fastq();
+ OUTPUT_FASTA=ff.fasta();
+// boolean bread=(ext==TestFormat.txt);
+ OUTPUT_SAM=ff.samOrBam();
+ OUTPUT_BAM=ff.bam();
+ OUTPUT_ATTACHMENT=ff.attachment();
+ OUTPUT_HEADER=ff.header();
+ SITES_ONLY=ff.sites();
+ OUTPUT_STANDARD_OUT=ff.stdio();
+ FASTA_WRAP=Shared.FASTA_WRAP;
+ assert(((OUTPUT_SAM ? 1 : 0)+(OUTPUT_FASTQ ? 1 : 0)+(OUTPUT_FASTA ? 1 : 0)+(OUTPUT_ATTACHMENT ? 1 : 0)+
+ (OUTPUT_HEADER ? 1 : 0)+(SITES_ONLY ? 1 : 0))<=1) :
+ OUTPUT_SAM+", "+SITES_ONLY+", "+OUTPUT_FASTQ+", "+OUTPUT_FASTA+", "+OUTPUT_ATTACHMENT;
+
+ fname=ff.name();
+ qfname=qfname_;
+ read1=read1_;
+ allowSubprocess=ff.allowSubprocess();
+// assert(fname==null || (fname.contains(".sam") || fname.contains(".bam"))==OUTPUT_SAM) : "Outfile name and sam output mode flag disagree: "+fname;
+ assert(read1 || !OUTPUT_SAM) : "Attempting to output paired reads to different sam files.";
+
+ if(qfname==null){
+ myQOutstream=null;
+ myQWriter=null;
+ }else{
+ myQOutstream=ReadWrite.getOutputStream(qfname, (ff==null ? false : ff.append()), buffered, allowSubprocess);
+ myQWriter=(makeWriter ? new PrintWriter(myQOutstream) : null);
+ }
+
+ if(header==null){header=HEADER;} //new line; test.
+
+
+ if(fname==null && !OUTPUT_STANDARD_OUT){
+ myOutstream=null;
+ myWriter=null;
+ }else{
+ if(OUTPUT_STANDARD_OUT){myOutstream=System.out;}
+ else if(!OUTPUT_BAM || !Data.SAMTOOLS() || !Data.SH()){
+ myOutstream=ReadWrite.getOutputStream(ff, buffered);
+ }else{
+ if(!allowSubprocess){System.err.println("Warning! Spawning a samtools process when allowSubprocess="+allowSubprocess);}
+ myOutstream=ReadWrite.getOutputStreamFromProcess(fname, "samtools view -S -b -h - ", true, ff.append(), true, true);
+ }
+
+
+
+ myWriter=(makeWriter ? new PrintWriter(myOutstream) : null);
+
+ final boolean supressHeader=(NO_HEADER || (ff.append() && ff.exists()));
+ final boolean supressHeaderSequences=(NO_HEADER_SEQUENCES);
+// assert(false) : ff.append()+", "+ff.exists();
+
+ if(header!=null && !supressHeader){
+ if(myWriter!=null){
+ myWriter.println(header);
+ }else{
+ byte[] temp=new byte[header.length()];
+ for(int i=0; i<temp.length; i++){temp[i]=(byte)header.charAt(i);}
+ try {
+ myOutstream.write(temp);
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }else if(OUTPUT_SAM && !supressHeader){
+ if(useSharedHeader){
+// assert(false);
+ ArrayList<byte[]> list=SamReadInputStream.getSharedHeader(true);
+ if(list==null){
+ System.err.println("Header was null.");
+ }else{
+ try {
+ if(supressHeaderSequences){
+ for(byte[] line : list){
+ boolean sq=(line!=null && line.length>2 && line[0]=='@' && line[1]=='S' && line[2]=='Q' && line[3]=='\t');
+ if(!sq){
+ myOutstream.write(line);
+ myOutstream.write('\n');
+ }
+ }
+ }else{
+ for(byte[] line : list){
+ myOutstream.write(line);
+ myOutstream.write('\n');
+ //myWriter.println(new String(line));
+ }
+ }
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }else{
+ if(myWriter!=null){
+ myWriter.println(SamHeader.header0());
+ int a=(MINCHROM==-1 ? 1 : MINCHROM);
+ int b=(MAXCHROM==-1 ? Data.numChroms : MAXCHROM);
+ for(int chrom=a; chrom<=b; chrom++){
+ // myWriter.print(SamHeader.header1(chrom, chrom));
+ SamHeader.printHeader1(chrom, chrom, myWriter);
+ }
+ myWriter.println(SamHeader.header2());
+ }else{
+ ByteBuilder bb=new ByteBuilder(4096);
+ SamHeader.header0B(bb);
+ bb.append('\n');
+ int a=(MINCHROM==-1 ? 1 : MINCHROM);
+ int b=(MAXCHROM==-1 ? Data.numChroms : MAXCHROM);
+ if(!supressHeaderSequences){
+ for(int chrom=a; chrom<=b; chrom++){
+ SamHeader.printHeader1B(chrom, chrom, bb, myOutstream);
+ }
+ }
+ SamHeader.header2B(bb);
+ bb.append('\n');
+
+
+ try {
+ if(bb.length>0){myOutstream.write(bb.array, 0, bb.length);}
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }else if(ff.bread() && !supressHeader){
+ if(myWriter!=null){
+ myWriter.println("#"+Read.header());
+ }else{
+ try {
+ myOutstream.write(("#"+Read.header()).getBytes());
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ assert(bufferSize>=1);
+ queue=new ArrayBlockingQueue<Job>(bufferSize);
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /**
+  * Thread body, supplied by each concrete writer.
+  * NOTE(review): presumably drains Jobs from the queue until a poison Job arrives and then
+  * finishes the output streams - confirm against the subclasses.
+  */
+ @Override
+ public abstract void run();
+
+ /**
+  * Intended to convert reads to text on the calling thread and leave only the disk write to the writer thread.
+  * Not implemented yet: with assertions enabled it fails immediately; otherwise it queues the list
+  * for the default writer/stream exactly like addList(list).
+  */
+ public final synchronized void addListAsText(ArrayList<Read> list){
+ 	assert(false) : "TODO";
+ 	//Same job that addList(list, myWriter, myOutstream, false) would build: no close, no shutdown.
+ 	addJob(new Job(list, myWriter, myOutstream, false, false));
+ }
+
+ /** Queues a Job with no list, writer or stream whose poison flag is set. */
+ public final synchronized void poison(){
+ 	final Job terminator=new Job(null, null, null, false, true);
+ 	addJob(terminator);
+ }
+
+ /** Queues a list of reads for this writer's own writer and output stream, leaving them open afterwards. */
+ public final synchronized void addList(ArrayList<Read> list){
+ 	//Equivalent to addList(list, myWriter, myOutstream, false); close=false means the job is never poisonous.
+ 	addJob(new Job(list, myWriter, myOutstream, false, false));
+ }
+
+ /**
+  * Queues a list of reads for a specific writer/stream.
+  * @param l Reads to write.
+  * @param w Writer stored in the job.
+  * @param o Output stream stored in the job.
+  * @param c Stored as the job's close flag. If w is also this object's own (non-null) writer,
+  * the job is additionally marked as a poison job.
+  */
+ public final synchronized void addList(ArrayList<Read> l, PrintWriter w, OutputStream o, boolean c){
+ 	final boolean closesOwnWriter;
+ 	if(c && w!=null){
+ 		closesOwnWriter=(w==myWriter);
+ 	}else{
+ 		closesOwnWriter=false;
+ 	}
+ 	addJob(new Job(l, w, o, c, closesOwnWriter));
+ }
+
+ /**
+  * Places a job on the queue, blocking while the queue is full.
+  * An interrupt during the wait is printed and the put is retried until it succeeds.
+  */
+ public final synchronized void addJob(Job j){
+ 	while(true){
+ 		try {
+ 			queue.put(j);
+ 			return;
+ 		} catch (InterruptedException e) {
+ 			e.printStackTrace();
+ 			assert(!queue.contains(j)); //Hopefully it was not added.
+ 		}
+ 	}
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ protected static final StringBuilder toQualitySB(final byte[] quals, final int len, final int wrap){
+ if(quals==null){return fakeQualitySB(30, len, wrap);}
+ assert(quals.length==len);
+ StringBuilder sb=new StringBuilder(NUMERIC_QUAL ? len*3+1 : len+1);
+ if(NUMERIC_QUAL){
+ if(len>0){sb.append(quals[0]);}
+ for(int i=1, w=1; i<len; i++, w++){
+ if(w>=wrap){
+ sb.append('\n');
+ w=0;
+ }else{
+ sb.append(' ');
+ }
+ sb.append(quals[i]);
+ }
+ }else{
+ final byte b=FASTQ.ASCII_OFFSET_OUT;
+ for(int i=0; i<len; i++){
+ sb.append((char)(b+quals[i]));
+ }
+ }
+ return sb;
+ }
+
+ /**
+  * Produces quality text in which every position has the same score q.
+  * Formatting follows toQualitySB: decimal numbers with space/newline separators when NUMERIC_QUAL is set,
+  * otherwise one character (q plus FASTQ.ASCII_OFFSET_OUT) per position.
+  * @param q Quality score to repeat.
+  * @param len Number of positions.
+  * @param wrap Values per line in numeric mode.
+  * @return A new StringBuilder holding the text.
+  */
+ protected static final StringBuilder fakeQualitySB(final int q, final int len, final int wrap){
+ 	if(!NUMERIC_QUAL){
+ 		final StringBuilder ascii=new StringBuilder(len+1);
+ 		final char symbol=(char)(q+FASTQ.ASCII_OFFSET_OUT);
+ 		for(int pos=0; pos<len; pos++){ascii.append(symbol);}
+ 		return ascii;
+ 	}
+
+ 	final StringBuilder numeric=new StringBuilder(len*3+1);
+ 	if(len>0){numeric.append(q);}
+ 	int onLine=1; //values already on the current line
+ 	for(int pos=1; pos<len; pos++){
+ 		if(onLine>=wrap){
+ 			numeric.append('\n');
+ 			onLine=0;
+ 		}else{
+ 			numeric.append(' ');
+ 		}
+ 		numeric.append(q);
+ 		onLine++;
+ 	}
+ 	return numeric;
+ }
+
+ /**
+  * Appends quality scores to a ByteBuilder.
+  * With NUMERIC_QUAL the values are appended as decimal numbers separated by spaces, with a newline
+  * in place of the space after every 'wrap' values; otherwise each value is appended as a single byte,
+  * offset by FASTQ.ASCII_OFFSET_OUT, with no wrapping.
+  * @param quals Quality values; if null, fakeQualityB(30, len, wrap, bb) is used instead.
+  * @param len Number of values to append; asserted equal to quals.length.
+  * @param wrap Values per line in numeric mode.
+  * @param bb Destination buffer.
+  * @return bb
+  */
+ protected static final ByteBuilder toQualityB(final byte[] quals, final int len, final int wrap, final ByteBuilder bb){
+ 	if(quals==null){return fakeQualityB(30, len, wrap, bb);}
+ 	assert(quals.length==len);
+ 	bb.ensureExtra(NUMERIC_QUAL ? len*3+1 : len+1);
+ 	if(NUMERIC_QUAL){
+ 		//The int casts deliberately select the overload that writes the decimal number.
+ 		if(len>0){bb.append((int)quals[0]);}
+ 		for(int i=1, w=1; i<len; i++, w++){
+ 			if(w>=wrap){
+ 				bb.append('\n');
+ 				w=0;
+ 			}else{
+ 				bb.append(' ');
+ 			}
+ 			bb.append((int)quals[i]);
+ 		}
+ 	}else{
+ 		final byte b=FASTQ.ASCII_OFFSET_OUT;
+ 		for(int i=0; i<len; i++){
+ 			//byte+byte is promoted to int; cast back so a single quality character is appended
+ 			//rather than the decimal text of the sum.
+ 			bb.append((byte)(b+quals[i]));
+ 		}
+ 	}
+ 	return bb;
+ }
+
+ /**
+  * Appends quality text in which every position has the same score q.
+  * Formatting follows toQualityB: decimal numbers with space/newline separators when NUMERIC_QUAL is set,
+  * otherwise one byte (q plus FASTQ.ASCII_OFFSET_OUT) per position.
+  * @param q Quality score to repeat.
+  * @param len Number of positions.
+  * @param wrap Values per line in numeric mode.
+  * @param bb Destination buffer.
+  * @return bb
+  */
+ protected static final ByteBuilder fakeQualityB(final int q, final int len, final int wrap, final ByteBuilder bb){
+ 	bb.ensureExtra(NUMERIC_QUAL ? len*3+1 : len+1);
+ 	if(NUMERIC_QUAL){
+ 		//q is an int, so these calls write its decimal representation.
+ 		if(len>0){bb.append(q);}
+ 		for(int i=1, w=1; i<len; i++, w++){
+ 			if(w>=wrap){
+ 				bb.append('\n');
+ 				w=0;
+ 			}else{
+ 				bb.append(' ');
+ 			}
+ 			bb.append(q);
+ 		}
+ 	}else{
+ 		final byte c=(byte)(q+FASTQ.ASCII_OFFSET_OUT);
+ 		for(int i=0; i<len; i++){bb.append(c);}
+ 	}
+ 	return bb;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Getters ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ public String fname(){return fname;}
+ public long readsWritten(){return readsWritten;}
+ public long basesWritten(){return basesWritten;}
+ public long validReadsWritten(){return validReadsWritten;}
+ public long validBasesWritten(){return validBasesWritten;}
+
+ /** Return true if this stream has detected an error */
+ public final boolean errorState(){return errorState;}
+ /** Return true if this stream has finished */
+ public final boolean finishedSuccessfully(){return finishedSuccessfully;}
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Value returned by errorState(). TODO: review which code paths set this flag. */
+ protected boolean errorState=false;
+ /** Value returned by finishedSuccessfully(); starts false. */
+ protected boolean finishedSuccessfully=false;
+
+ // Output mode flags, each copied from the FileFormat in the constructor.
+ /** ff.samOrBam() */
+ public final boolean OUTPUT_SAM;
+ /** ff.bam() */
+ public final boolean OUTPUT_BAM;
+ /** ff.fastq() */
+ public final boolean OUTPUT_FASTQ;
+ /** ff.fasta() */
+ public final boolean OUTPUT_FASTA;
+ /** ff.header() */
+ public final boolean OUTPUT_HEADER;
+ /** ff.attachment() */
+ public final boolean OUTPUT_ATTACHMENT;
+ /** ff.stdio(); when true the primary stream is System.out. */
+ public final boolean OUTPUT_STANDARD_OUT;
+ /** ff.sites() */
+ public final boolean SITES_ONLY;
+ /** Not set by the constructor. NOTE(review): presumably makes a read-1 writer also emit each mate - confirm in subclasses. */
+ public boolean OUTPUT_INTERLEAVED=false;
+
+ /** Copy of Shared.FASTA_WRAP taken at construction time. */
+ protected final int FASTA_WRAP;
+
+ /** ff.allowSubprocess(); passed on when opening and finishing streams. */
+ protected final boolean allowSubprocess;
+
+ /** Constructor argument read1_; SAM output asserts that this is true. */
+ protected final boolean read1;
+ /** Primary output name (ff.name()); may be null. */
+ protected final String fname;
+ /** Quality output name; null when there is no separate quality file. */
+ protected final String qfname;
+ /** Primary output stream; null when fname is null and output is not standard out. */
+ protected final OutputStream myOutstream;
+ /** PrintWriter over myOutstream; null unless the constructor was asked to make writers. */
+ protected final PrintWriter myWriter;
+ /** Quality output stream; null when qfname is null. */
+ protected final OutputStream myQOutstream;
+ /** PrintWriter over myQOutstream; null unless the constructor was asked to make writers. */
+ protected final PrintWriter myQWriter;
+ /** Bounded queue of pending jobs; capacity is the constructor's bufferSize. */
+ protected final ArrayBlockingQueue<Job> queue;
+
+ // Output counters exposed through the same-named getters; not modified within this class.
+ protected long readsWritten=0;
+ protected long basesWritten=0;
+ protected long validReadsWritten=0;
+ protected long validBasesWritten=0;
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** First chromosome listed in a generated sam header; -1 means start at 1. */
+ public static int MINCHROM=-1; //For generating sam header
+ /** Last chromosome listed in a generated sam header; -1 means Data.numChroms. */
+ public static int MAXCHROM=-1; //For generating sam header
+ /** Default header text, used when the constructor's header argument is null. */
+ public static CharSequence HEADER;
+ /** Selects decimal-number output in the toQuality and fakeQuality methods instead of offset characters. */
+ public static boolean NUMERIC_QUAL=true;
+ /** Not read within this class. NOTE(review): presumably consulted by sam-writing subclasses - confirm. */
+ public static boolean OUTPUT_SAM_SECONDARY_ALIGNMENTS=false;
+
+ /** Not read within this class. NOTE(review): presumably relaxes pairing assertions in subclasses - confirm. */
+ public static boolean ignorePairAssertions=false;
+ /** Not read within this class. NOTE(review): presumably enables cigar validation in subclasses - confirm. */
+ public static boolean ASSERT_CIGAR=false;
+ /** When true the constructor writes no header at all. */
+ public static boolean NO_HEADER=false;
+ /** Read by the constructor to leave @SQ sequence lines out of sam headers; see the header-writing code for the paths affected. */
+ public static boolean NO_HEADER_SEQUENCES=false;
+ /** Not read within this class. NOTE(review): presumably reuses a SamLine attached to the read - confirm. */
+ public static boolean USE_ATTACHED_SAMLINE=false;
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+
+ /** Immutable unit of work passed to the writer thread through the queue. */
+ protected static class Job{
+
+ 	/** Reads to write; may be null. */
+ 	public final ArrayList<Read> list;
+ 	/** Writer associated with this job; may be null. */
+ 	public final PrintWriter writer;
+ 	/** Output stream associated with this job; may be null. */
+ 	public final OutputStream outstream;
+ 	/** Constructor argument closeWhenDone_. */
+ 	public final boolean close;
+ 	/** Constructor argument shutdownThread_. */
+ 	public final boolean poison;
+
+ 	/*--------------------------------------------------------------*/
+
+ 	/** Creates a job that neither closes its writer nor shuts the thread down, with no output stream. */
+ 	public Job(ArrayList<Read> list_, PrintWriter writer_){
+ 		this(list_, writer_, null, false, false);
+ 	}
+
+ 	public Job(ArrayList<Read> list_, PrintWriter writer_, OutputStream outstream_, boolean closeWhenDone_,
+ 			boolean shutdownThread_){
+ 		this.list=list_;
+ 		this.writer=writer_;
+ 		this.outstream=outstream_;
+ 		this.close=closeWhenDone_;
+ 		this.poison=shutdownThread_;
+ 	}
+
+ 	/** @return true when there is no list or the list has no elements. */
+ 	public boolean isEmpty(){
+ 		if(list==null){return true;}
+ 		return list.isEmpty();
+ 	}
+
+ }
+
+}
diff --git a/current/stream/SamHeader.java b/current/stream/SamHeader.java
new file mode 100755
index 0000000..695b951
--- /dev/null
+++ b/current/stream/SamHeader.java
@@ -0,0 +1,360 @@
+package stream;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import align2.Shared;
+import align2.Tools;
+
+import dna.Data;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 7, 2014
+ *
+ */
+public class SamHeader {
+
+ /**
+  * Appends the @HD line (without a trailing newline) to bb.
+  * The version field is "1.3" when SamLine.VERSION is below 1.4 and "1.4" otherwise; sort order is "unsorted".
+  * @param bb Destination buffer.
+  * @return bb
+  */
+ public static ByteBuilder header0B(ByteBuilder bb){
+ 	bb.append("@HD\tVN:");
+ 	if(SamLine.VERSION<1.4f){
+ 		bb.append("1.3");
+ 	}else{
+ 		bb.append("1.4");
+ 	}
+ 	bb.append("\tSO:unsorted");
+ 	return bb;
+ }
+
+ /**
+  * Builds the @HD line (without a trailing newline).
+  * The version field is "1.3" when SamLine.VERSION is below 1.4 and "1.4" otherwise; sort order is "unsorted".
+  * @return A new StringBuilder holding the line.
+  */
+ public static StringBuilder header0(){
+ 	final String version=(SamLine.VERSION<1.4f ? "1.3" : "1.4");
+ 	final StringBuilder line=new StringBuilder();
+ 	line.append("@HD\tVN:").append(version).append("\tSO:unsorted");
+ 	return line;
+ }
+
+ /**
+  * Builds one newline-terminated "@SQ\tSN:name\tLN:length" line per scaffold of the requested chromosomes.
+  * @param minChrom First chromosome to include.
+  * @param maxChrom Last chromosome to include; the loop also stops at Data.numChroms.
+  * @param sort If true the lines are sorted with Collections.sort before being returned.
+  * @return The list of header lines.
+  */
+ static ArrayList<String> scaffolds(int minChrom, int maxChrom, boolean sort){
+ 	final ArrayList<String> lines=new ArrayList<String>(4000);
+ 	final StringBuilder line=new StringBuilder(1000); //reused for every scaffold
+ 	for(int chrom=minChrom; chrom<=maxChrom && chrom<=Data.numChroms; chrom++){
+ 		final byte[][] names=Data.scaffoldNames[chrom];
+ 		for(int scaf=0; scaf<Data.chromScaffolds[chrom]; scaf++){
+ 			final byte[] name=names[scaf];
+ 			line.setLength(0);
+ 			line.append("@SQ\tSN:");
+ 			if(name!=null){
+ 				appendScafName(line, name);
+ 			}else{
+ 				assert(false) : "scaffoldName["+chrom+"]["+scaf+"] = null";
+ 				line.append("null");
+ 			}
+ 			line.append("\tLN:").append(Tools.min(Integer.MAX_VALUE, (Data.scaffoldLengths[chrom][scaf])));
+ 			line.append('\n');
+ 			lines.add(line.toString());
+ 		}
+ 	}
+ 	if(sort){
+ 		Collections.sort(lines);
+ 	}
+ 	return lines;
+ }
+
+ /**
+  * Builds the @SQ section of a sam header as a single StringBuilder.
+  * When SamLine.SORT_SCAFFOLDS is set, the sorted lines from scaffolds() are concatenated;
+  * otherwise lines are generated in chromosome/scaffold order.
+  * @param minChrom First chromosome to include.
+  * @param maxChrom Last chromosome to include; the loop also stops at Data.numChroms.
+  * @return The newline-terminated @SQ lines.
+  */
+ public static StringBuilder header1(int minChrom, int maxChrom){
+ 	final StringBuilder out=new StringBuilder(20000);
+
+ 	if(SamLine.SORT_SCAFFOLDS){
+ 		final ArrayList<String> sorted=scaffolds(minChrom, maxChrom, true);
+ 		for(int idx=0; idx<sorted.size(); idx++){
+ 			//set() returns the previous element and clears the slot in one step.
+ 			out.append(sorted.set(idx, null));
+ 		}
+ 		return out;
+ 	}
+
+ 	for(int chrom=minChrom; chrom<=maxChrom && chrom<=Data.numChroms; chrom++){
+ 		final byte[][] names=Data.scaffoldNames[chrom];
+ 		for(int scaf=0; scaf<Data.chromScaffolds[chrom]; scaf++){
+ 			final byte[] name=names[scaf];
+ 			out.append("@SQ\tSN:");
+ 			if(name!=null){
+ 				appendScafName(out, name);
+ 			}else{
+ 				assert(false) : "scaffoldName["+chrom+"]["+scaf+"] = null";
+ 				out.append("null");
+ 			}
+ 			out.append("\tLN:").append(Tools.min(Integer.MAX_VALUE, (Data.scaffoldLengths[chrom][scaf])));
+ 			out.append('\n');
+ 		}
+ 	}
+ 	return out;
+ }
+
+ /**
+  * Prints the @SQ section of a sam header to a PrintWriter, one line per scaffold.
+  * When SamLine.SORT_SCAFFOLDS is set, the sorted lines from scaffolds() are printed instead.
+  * @param minChrom First chromosome to include.
+  * @param maxChrom Last chromosome to include; the loop also stops at Data.numChroms.
+  * @param pw Destination writer.
+  */
+ public static void printHeader1(int minChrom, int maxChrom, PrintWriter pw){
+ 	if(SamLine.SORT_SCAFFOLDS){
+ 		final ArrayList<String> sorted=scaffolds(minChrom, maxChrom, true);
+ 		for(int idx=0; idx<sorted.size(); idx++){
+ 			final String line=sorted.set(idx, null); //take the line and clear the slot
+ 			pw.print(line);
+ 		}
+ 		return;
+ 	}
+
+ 	for(int chrom=minChrom; chrom<=maxChrom && chrom<=Data.numChroms; chrom++){
+ 		final byte[][] names=Data.scaffoldNames[chrom];
+ 		final StringBuilder line=new StringBuilder(256); //reused for each scaffold of this chromosome
+ 		for(int scaf=0; scaf<Data.chromScaffolds[chrom]; scaf++){
+ 			final byte[] name=names[scaf];
+ 			line.append("@SQ\tSN:");
+ 			if(name!=null){
+ 				appendScafName(line, name);
+ 			}else{
+ 				assert(false) : "scaffoldName["+chrom+"]["+scaf+"] = null";
+ 				line.append("null");
+ 			}
+ 			line.append("\tLN:").append(Tools.min(Integer.MAX_VALUE, (Data.scaffoldLengths[chrom][scaf])));
+ 			line.append('\n');
+
+ 			pw.print(line);
+ 			line.setLength(0);
+ 		}
+ 	}
+ }
+
+ /**
+  * Appends the @SQ section of a sam header to bb, writing bb out to os whenever it reaches 32768 bytes.
+  * Anything still in bb on return is left for the caller to write.
+  * When SamLine.SORT_SCAFFOLDS is set, the sorted lines from scaffolds() are used instead.
+  * @param minChrom First chromosome to include.
+  * @param maxChrom Last chromosome to include; the loop also stops at Data.numChroms.
+  * @param bb Buffer to append to.
+  * @param os Stream that receives the buffer contents when it fills.
+  * @throws RuntimeException wrapping any IOException raised while writing to os.
+  */
+ public static void printHeader1B(int minChrom, int maxChrom, ByteBuilder bb, OutputStream os){
+ 	if(verbose){System.err.println("printHeader1B("+minChrom+", "+maxChrom+")");}
+
+ 	if(SamLine.SORT_SCAFFOLDS){
+ 		if(verbose){System.err.println("Sorting scaffolds");}
+ 		ArrayList<String> scaffolds=scaffolds(minChrom, maxChrom, true);
+ 		for(int i=0; i<scaffolds.size(); i++){
+ 			String s=scaffolds.set(i, null);
+ 			bb.append(s);
+ 			flushIfFull(bb, os);
+ 		}
+ 		return;
+ 	}
+
+ 	if(verbose){System.err.println("Iterating over chroms");}
+ 	for(int chrom=minChrom; chrom<=maxChrom && chrom<=Data.numChroms; chrom++){
+ 		final byte[][] inames=Data.scaffoldNames[chrom];
+ 		final int numScafs=Data.chromScaffolds[chrom];
+ 		assert(inames.length==numScafs) : "Mismatch between number of scaffolds and names for chrom "+chrom+": "+inames.length+" != "+numScafs;
+ 		for(int scaf=0; scaf<numScafs; scaf++){
+ 			final byte[] scafName=inames[scaf];
+ 			bb.append("@SQ\tSN:");
+ 			if(scafName==null){
+ 				assert(false) : "scaffoldName["+chrom+"]["+scaf+"] = null";
+ 				//Write the text "null", as the StringBuilder-based variants do, rather than passing a null array.
+ 				bb.append("null");
+ 			}else{
+ 				appendScafName(bb, scafName);
+ 			}
+ 			bb.append("\tLN:");
+ 			bb.append(Tools.min(Integer.MAX_VALUE, (Data.scaffoldLengths[chrom][scaf])));
+ 			bb.append('\n');
+
+ 			flushIfFull(bb, os);
+ 		}
+ 	}
+ }
+
+ /** Writes bb to os and empties it once it holds at least 32768 bytes; otherwise does nothing. */
+ private static void flushIfFull(ByteBuilder bb, OutputStream os){
+ 	if(bb.length<32768){return;}
+ 	try {
+ 		os.write(bb.array, 0, bb.length);
+ 	} catch (IOException e) {
+ 		throw new RuntimeException(e);
+ 	}
+ 	bb.setLength(0);
+ }
+
+ /**
+  * Prints the @SQ section of a sam header to a TextStreamWriter, one line per scaffold.
+  * When SamLine.SORT_SCAFFOLDS is set, the sorted lines from scaffolds() are printed instead.
+  * @param minChrom First chromosome to include.
+  * @param maxChrom Last chromosome to include; the loop also stops at Data.numChroms.
+  * @param tsw Destination writer.
+  */
+ public static void printHeader1(int minChrom, int maxChrom, TextStreamWriter tsw){
+ 	if(SamLine.SORT_SCAFFOLDS){
+ 		final ArrayList<String> sorted=scaffolds(minChrom, maxChrom, true);
+ 		for(int idx=0; idx<sorted.size(); idx++){
+ 			final String line=sorted.set(idx, null); //take the line and clear the slot
+ 			tsw.print(line);
+ 		}
+ 		return;
+ 	}
+
+ 	for(int chrom=minChrom; chrom<=maxChrom && chrom<=Data.numChroms; chrom++){
+ 		final byte[][] names=Data.scaffoldNames[chrom];
+ 		final StringBuilder line=new StringBuilder(256); //reused for each scaffold of this chromosome
+ 		for(int scaf=0; scaf<Data.chromScaffolds[chrom]; scaf++){
+ 			final byte[] name=names[scaf];
+ 			line.append("@SQ\tSN:");
+ 			if(name!=null){
+ 				appendScafName(line, name);
+ 			}else{
+ 				assert(false) : "scaffoldName["+chrom+"]["+scaf+"] = null";
+ 				line.append("null");
+ 			}
+ 			line.append("\tLN:").append(Tools.min(Integer.MAX_VALUE, (Data.scaffoldLengths[chrom][scaf])));
+ 			line.append('\n');
+
+ 			tsw.print(line);
+ 			line.setLength(0);
+ 		}
+ 	}
+ }
+
+ /**
+  * Appends a scaffold name to sb.
+  * When Data.scaffoldPrefixes is set, only the bytes after the first '$' are appended
+  * (nothing if there is no '$'); otherwise the whole name is appended.
+  * @param sb Destination.
+  * @param scn Scaffold name bytes; must not be null.
+  */
+ static void appendScafName(StringBuilder sb, byte[] scn){
+ 	if(!Data.scaffoldPrefixes){
+ 		//Widen the bytes into the char buffer obtained from Shared.getTLCB, then append in one call.
+ 		final char[] chars=Shared.getTLCB(scn.length);
+ 		for(int pos=0; pos<scn.length; pos++){chars[pos]=(char)scn[pos];}
+ 		sb.append(chars, 0, scn.length);
+ 		return;
+ 	}
+
+ 	int pos=0;
+ 	while(pos<scn.length && scn[pos]!='$'){pos++;}
+ 	//Skip the '$' itself and copy the remainder.
+ 	for(pos++; pos<scn.length; pos++){
+ 		sb.append((char)scn[pos]);
+ 	}
+ }
+
+ /**
+  * Appends a scaffold name to a ByteBuilder.
+  * When Data.scaffoldPrefixes is set, only the bytes after the first '$' are appended
+  * (nothing if there is no '$'); otherwise the whole name is appended.
+  * @param sb Destination.
+  * @param scn Scaffold name bytes; must not be null.
+  */
+ static void appendScafName(ByteBuilder sb, byte[] scn){
+ 	if(!Data.scaffoldPrefixes){
+ 		sb.append(scn);
+ 		return;
+ 	}
+
+ 	int pos=0;
+ 	while(pos<scn.length && scn[pos]!='$'){pos++;}
+ 	//Skip the '$' itself and copy the remainder.
+ 	for(pos++; pos<scn.length; pos++){
+ 		sb.append(scn[pos]);
+ 	}
+ }
+
+ /**
+  * Builds the trailing part of a sam header: an optional @RG line followed by the @PG line.
+  * The @RG line is produced only when SamLine.READGROUP_ID is set and carries every other non-null
+  * READGROUP_* field. The @PG line always names BBMap and its version; when Shared.BBMAP_CLASS is set it
+  * also records a command line made of "java", the JVM arguments, the class name and the program arguments.
+  * The @PG line is not newline-terminated.
+  * @return A new StringBuilder holding the text.
+  */
+ public static StringBuilder header2(){
+ 	final StringBuilder sb=new StringBuilder(1000);
+ 	//An unconditional @RG line is deliberately not written: if RG is in the header, reads may need extra fields.
+
+ 	if(SamLine.READGROUP_ID!=null){
+ 		sb.append("@RG\tID:");
+ 		sb.append(SamLine.READGROUP_ID);
+ 		if(SamLine.READGROUP_CN!=null){sb.append("\tCN:"); sb.append(SamLine.READGROUP_CN);}
+ 		if(SamLine.READGROUP_DS!=null){sb.append("\tDS:"); sb.append(SamLine.READGROUP_DS);}
+ 		if(SamLine.READGROUP_DT!=null){sb.append("\tDT:"); sb.append(SamLine.READGROUP_DT);}
+ 		if(SamLine.READGROUP_FO!=null){sb.append("\tFO:"); sb.append(SamLine.READGROUP_FO);}
+ 		if(SamLine.READGROUP_KS!=null){sb.append("\tKS:"); sb.append(SamLine.READGROUP_KS);}
+ 		if(SamLine.READGROUP_LB!=null){sb.append("\tLB:"); sb.append(SamLine.READGROUP_LB);}
+ 		if(SamLine.READGROUP_PG!=null){sb.append("\tPG:"); sb.append(SamLine.READGROUP_PG);}
+ 		if(SamLine.READGROUP_PI!=null){sb.append("\tPI:"); sb.append(SamLine.READGROUP_PI);}
+ 		if(SamLine.READGROUP_PL!=null){sb.append("\tPL:"); sb.append(SamLine.READGROUP_PL);}
+ 		if(SamLine.READGROUP_PU!=null){sb.append("\tPU:"); sb.append(SamLine.READGROUP_PU);}
+ 		if(SamLine.READGROUP_SM!=null){sb.append("\tSM:"); sb.append(SamLine.READGROUP_SM);}
+ 		sb.append('\n');
+ 	}
+
+ 	sb.append("@PG\tID:BBMap\tPN:BBMap\tVN:").append(Shared.BBMAP_VERSION_STRING);
+
+ 	if(Shared.BBMAP_CLASS!=null){
+ 		sb.append("\tCL:java");
+ 		final List<String> jvmArgs=Shared.JVM_ARGS();
+ 		if(jvmArgs!=null){
+ 			for(String arg : jvmArgs){
+ 				sb.append(' ').append(arg);
+ 			}
+ 		}
+ 		sb.append(" align2."+Shared.BBMAP_CLASS);
+ 		if(Shared.COMMAND_LINE!=null){
+ 			for(String arg : Shared.COMMAND_LINE){
+ 				sb.append(' ').append(arg);
+ 			}
+ 		}
+ 	}
+
+ 	return sb;
+ }
+
+ /**
+  * Appends the trailing part of a sam header to a ByteBuilder: an optional @RG line followed by the @PG line.
+  * The @RG line is produced only when SamLine.READGROUP_ID is set and carries every other non-null
+  * READGROUP_* field. The @PG line always names BBMap and its version; when Shared.BBMAP_CLASS is set it
+  * also records a command line made of "java", the JVM arguments, the class name and the program arguments.
+  * The @PG line is not newline-terminated.
+  * @param sb Destination buffer.
+  * @return sb
+  */
+ public static ByteBuilder header2B(ByteBuilder sb){
+
+ 	if(SamLine.READGROUP_ID!=null){
+ 		sb.append("@RG\tID:");
+ 		sb.append(SamLine.READGROUP_ID);
+ 		if(SamLine.READGROUP_CN!=null){sb.append("\tCN:"); sb.append(SamLine.READGROUP_CN);}
+ 		if(SamLine.READGROUP_DS!=null){sb.append("\tDS:"); sb.append(SamLine.READGROUP_DS);}
+ 		if(SamLine.READGROUP_DT!=null){sb.append("\tDT:"); sb.append(SamLine.READGROUP_DT);}
+ 		if(SamLine.READGROUP_FO!=null){sb.append("\tFO:"); sb.append(SamLine.READGROUP_FO);}
+ 		if(SamLine.READGROUP_KS!=null){sb.append("\tKS:"); sb.append(SamLine.READGROUP_KS);}
+ 		if(SamLine.READGROUP_LB!=null){sb.append("\tLB:"); sb.append(SamLine.READGROUP_LB);}
+ 		if(SamLine.READGROUP_PG!=null){sb.append("\tPG:"); sb.append(SamLine.READGROUP_PG);}
+ 		if(SamLine.READGROUP_PI!=null){sb.append("\tPI:"); sb.append(SamLine.READGROUP_PI);}
+ 		if(SamLine.READGROUP_PL!=null){sb.append("\tPL:"); sb.append(SamLine.READGROUP_PL);}
+ 		if(SamLine.READGROUP_PU!=null){sb.append("\tPU:"); sb.append(SamLine.READGROUP_PU);}
+ 		if(SamLine.READGROUP_SM!=null){sb.append("\tSM:"); sb.append(SamLine.READGROUP_SM);}
+ 		sb.append('\n');
+ 	}
+
+ 	sb.append("@PG\tID:BBMap\tPN:BBMap\tVN:");
+ 	sb.append(Shared.BBMAP_VERSION_STRING);
+
+ 	if(Shared.BBMAP_CLASS!=null){
+ 		sb.append("\tCL:java");
+ 		final List<String> jvmArgs=Shared.JVM_ARGS();
+ 		if(jvmArgs!=null){
+ 			for(String arg : jvmArgs){
+ 				sb.append(' ');
+ 				sb.append(arg);
+ 			}
+ 		}
+ 		sb.append(" align2."+Shared.BBMAP_CLASS);
+ 		if(Shared.COMMAND_LINE!=null){
+ 			for(String arg : Shared.COMMAND_LINE){
+ 				sb.append(' ');
+ 				sb.append(arg);
+ 			}
+ 		}
+ 	}
+
+ 	return sb;
+ }
+
+ /** Compile-time switch for the progress messages that printHeader1B prints to System.err. */
+ private static final boolean verbose=false;
+
+}
diff --git a/current/stream/SamLine.java b/current/stream/SamLine.java
new file mode 100755
index 0000000..4d418e1
--- /dev/null
+++ b/current/stream/SamLine.java
@@ -0,0 +1,2230 @@
+package stream;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Shared;
+import align2.Tools;
+
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+import dna.ScafLoc;
+
+
+public class SamLine implements Serializable {
+
+// 426_647_582 161 chr1 10159 0 26M9H chr3 170711991 0 TCCCTAACCCTAACCCTAACCTAACC IIFIIIIIIIIIIIIIIIIIICH2<> RG:Z:20110708003021394 NH:i:3 CM:i:2 SM:i:1 CQ:Z:A9?(BB?:<A?>=>B67=:7A);.%8'%))/%*%' CS:Z:G12002301002301002301023010200000003 XS:A:+
+
+// 1 QNAME String [!-?A-~]{1,255} Query template NAME
+// 2 FLAG Int [0,2^16-1] bitwise FLAG
+// 3 RNAME String \*|[!-()+-<>-~][!-~]* Reference sequence NAME
+// 4 POS Int [0,2^29-1] 1-based leftmost mapping POSition
+// 5 MAPQ Int [0,2^8-1] MAPping Quality
+// 6 CIGAR String \*|([0-9]+[MIDNSHPX=])+ CIGAR string
+// 7 RNEXT String \*|=|[!-()+-<>-~][!-~]* Ref. name of the mate/next fragment
+// 8 PNEXT Int [0,2^29-1] Position of the mate/next fragment
+// 9 TLEN Int [-2^29+1,2^29-1] observed Template LENgth
+// 10 SEQ String \*|[A-Za-z=.]+ fragment SEQuence
+// 11 QUAL String [!-~]+ ASCII of Phred-scaled base QUALity+33
+
+
+// FCB062MABXX:1:1101:1177:2115#GGCTACAA 147 chr11 47765857 29 90M = 47765579 -368 CCTCTGTGGCCCGGGTTGGAGTGCAGTGTCATGATCATGGCTCGCTGTAGCTACACCCTTCTGAGCTCAAGCAATCCTCCCACCTCTCCC ############################################################A@@><D<AAAB<=A2BD/BC<7:<4<%679 XT:A:M NM:i:5 SM:i:29 AM:i:29 XM:i:5 XO:i:0 XG:i:0 MD:Z:7T4A15G26A30A3
+// FCB062MABXX:1:1101:1193:2122#GGCTACAA 77 * 0 0 * * 0 0 TATATATGTGCTATGTACAGCATTGGAATTCACACCCTACACTTTCAAAAGNGAGCCCTAAATAAATGTTAGATCGGAAGAGCACACGTC FCFCFDDDADDEDEBDAEDFEDEFFGGFGGHEEFHHHHHHEDDDEDFFEFB#CBBA@B8BGGFGEEEC>DGGGDFBGGGGHHHHH9<@##
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Serialization version identifier; SamLine implements Serializable.
+ */
+ private static final long serialVersionUID = -4180486051387471116L;
+
+ /**
+  * Creates a SamLine from one line of sam text by splitting it on tabs
+  * and delegating to the String[] constructor.
+  * @param s A tab-delimited sam line; must not be null.
+  */
+ public SamLine(String s){
+ this(s.split("\t"));
+ }
+
+ /**
+  * Parses a tab-delimited sam line after copying selected fields into fresh String objects,
+  * which prevents references to the original string, in case of e.g. very long MD tags.
+  * Fields 0, 5, 9 and 10 and every field from index 11 onward are copied, so the line
+  * must contain at least 11 fields.
+  * @param s A tab-delimited sam line.
+  * @return A new SamLine built from the copied fields.
+  */
+ public SamLine toSamLine(String s){
+ 	final String[] fields=s.split("\t");
+ 	final int[] copiedColumns={0, 5, 9, 10};
+ 	for(int col : copiedColumns){
+ 		fields[col]=new String(fields[col]);
+ 	}
+ 	//Everything from index 11 onward is copied too.
+ 	for(int col=11; col<fields.length; col++){
+ 		fields[col]=new String(fields[col]);
+ 	}
+ 	return new SamLine(fields);
+ }
+
+ /**
+  * Copies every field of another SamLine into this one. Object-typed fields are copied
+  * by reference (the two lines share them), not cloned.
+  * @param sl Source line; must not be null.
+  */
+ private void setFrom(SamLine sl){
+ 	//Query name and flag
+ 	this.qname=sl.qname;
+ 	this.flag=sl.flag;
+
+ 	//Reference name, position, mapping quality and cigar
+ 	this.rname=sl.rname;
+ 	this.rnameS=sl.rnameS;
+ 	this.pos=sl.pos;
+ 	this.mapq=sl.mapq;
+ 	this.cigar=sl.cigar;
+
+ 	//Mate fields and template length
+ 	this.rnext=sl.rnext;
+ 	this.pnext=sl.pnext;
+ 	this.tlen=sl.tlen;
+
+ 	//Sequence, qualities and optional fields
+ 	this.seq=sl.seq;
+ 	this.qual=sl.qual;
+ 	this.optional=sl.optional;
+ }
+
+
+ public SamLine(Read r1, int fragNum){
+
+ if(verbose){
+ System.err.println("new SamLine for read with match "+(r1.match==null ? "null" : new String(r1.match)));
+ }
+
+ Read r2=r1.mate;
+ final boolean perfect=r1.perfect();
+
+ if(Data.scaffoldLocs==null && r1.obj!=null){
+ if(r1.obj.getClass()==SamLine.class){
+ assert(SET_FROM_OK) : "Sam format cannot be used as input to this program when no genome build is loaded.\n" +
+ "Please index the reference first and rerun with e.g. 'build=1', or use a different input format.";
+ setFrom((SamLine)r1.obj);
+ }
+ return;
+ }
+
+// qname=r.id.replace(' ', '_').replace('\t', '_');
+// qname=r.id.split("\\s+")[0];
+ qname=r1.id.replace('\t', '_');
+// if(!KEEP_NAMES && qname.length()>2 && r2!=null){
+// if(qname.endsWith("/1") || qname.endsWith("/2") || qname.endsWith(" 1") || qname.endsWith(" 2")){}
+// }
+
+ if(!KEEP_NAMES && qname.length()>2 && r2!=null){
+ char c=qname.charAt(qname.length()-2);
+ int num=(qname.charAt(qname.length()-1))-'1';
+ if((num==0 || num==1) && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);}
+// if(r.pairnum()==num && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);}
+ }
+// flag=Integer.parseInt(s[1]);
+
+ int idx1=-1, idx2=-1;
+ int chrom1=-1, chrom2=-1;
+ int start1=-1, start2=-1, a1=0, a2=0;
+ int stop1=-1, stop2=-1, b1=0, b2=0;
+ int scaflen=0, scafloc=0, scaflen2=0;
+ byte[] name1=bytestar, name2=bytestar;
+ if(r1.mapped()){
+ assert(r1.chrom>=0);
+ chrom1=r1.chrom;
+ start1=r1.start;
+ stop1=r1.stop;
+ if(Data.isSingleScaffold(chrom1, start1, stop1)){
+ assert(Data.scaffoldLocs!=null) : "\n\n"+r1+"\n\n"+r1.obj+"\n\n";
+ idx1=Data.scaffoldIndex(chrom1, (start1+stop1)/2);
+ name1=Data.scaffoldNames[chrom1][idx1];
+ scaflen=Data.scaffoldLengths[chrom1][idx1];
+ scafloc=Data.scaffoldLocs[chrom1][idx1];
+ a1=Data.scaffoldRelativeLoc(chrom1, start1, idx1);
+ b1=a1-start1+stop1;
+ }else{
+ if(verbose){System.err.println("------------- Found multi-scaffold alignment! -------------");}
+ r1.setMapped(false);
+ r1.setPaired(false);
+ r1.match=null;
+ if(r2!=null){r2.setPaired(false);}
+ }
+ }
+ if(r2!=null && r2.mapped()){
+ chrom2=r2.chrom;
+ start2=r2.start;
+ stop2=r2.stop;
+ if(Data.isSingleScaffold(chrom2, start2, stop2)){
+ idx2=Data.scaffoldIndex(chrom2, (start2+stop2)/2);
+ name2=Data.scaffoldNames[chrom2][idx2];
+ scaflen2=Data.scaffoldLengths[chrom2][idx2];
+ a2=Data.scaffoldRelativeLoc(chrom2, start2, idx2);
+ b2=a2-start2+stop2;
+ }else{
+ if(verbose){System.err.println("------------- Found multi-scaffold alignment for r2! -------------");}
+ r2.setMapped(false);
+ r2.setPaired(false);
+ r2.match=null;
+ if(r1!=null){r1.setPaired(false);}
+ }
+ }
+
+ final boolean sameScaf=(r2!=null && idx1>-1 && idx1==idx2 && r1.chrom==r2.chrom);
+ flag=makeFlag(r1, r2, fragNum, sameScaf);
+
+ rname=r1.mapped() ? name1 : ((r2!=null && r2.mapped()) ? name2 : null);
+
+ {
+ int pos0, pos0_mate; //start pos
+ int pos1, pos1_mate; //stop pos
+
+ if(r1.mapped()){
+// int leadingClip=countLeadingClip(cigar);
+ int clip=countLeadingClip(r1.match);
+ int clippedIndels=countLeadingIndels(a1, r1.match);
+ int tclip=countTrailingClip(r1.match);
+ int tclippedIndels=countTrailingIndels(b1, scaflen, r1.match);
+
+ if(verbose){
+ System.err.println("leadingClip="+clip);
+ System.err.println("clippedDels="+clippedIndels);
+ }
+ pos0=(a1+1)+clip+clippedIndels;
+ pos1=(b1+1)-tclip-tclippedIndels;
+ if(pos1>scaflen){pos1=scaflen;}
+
+ if(pos0<1){
+ //This is necessary to prevent mapped reads from having POS less than 1.
+ pos0=1;
+ }
+ assert(pos1>=pos0) : pos0+", "+pos1+"\n"+r1+"\n"+r2+"\n";
+
+ }else{
+ pos0=0;
+ pos1=0;
+ }
+
+ if(r2!=null && r2.mapped()){
+ int clip=countLeadingClip(r2.match);
+ int clippedIndels=countLeadingIndels(a2, r2.match);
+ int tclip=countTrailingClip(r2.match);
+ int tclippedIndels=countTrailingIndels(b2, scaflen, r2.match);
+ if(verbose){
+ System.err.println("leadingClip="+clip);
+ System.err.println("clippedDels="+clippedIndels);
+ }
+ pos0_mate=(a2+1)+clip+clippedIndels;
+ pos1_mate=(b2+1)-tclip-tclippedIndels;
+ if(pos1_mate>scaflen){pos1=scaflen;}
+
+ if(pos0_mate<1){
+ //This is necessary to prevent mapped reads from having POS less than 1.
+ pos0_mate=1;
+ }
+ assert(!sameScaf || pos1_mate>=pos0_mate) : pos0_mate+", "+pos1_mate+", "+scaflen+"\n"+r1+"\n"+r2+"\n";
+
+ }else{
+ pos0_mate=0;
+ pos1_mate=0;
+ }
+
+ if(r2==null){
+ pos=pos0;
+ pnext=pos0_mate;
+ tlen=0;
+ assert(((pos>0 && r1.mapped()) || (pos==0 && !r1.mapped())) && pnext==0);
+ }else{
+ if(r1.mapped() && r2.mapped()){
+ pos=pos0;
+ pnext=pos0_mate;
+ if(sameScaf){
+// tlen=1+(Data.max(r.stop, r2.stop)-Data.min(r.start, r2.start));
+ tlen=1+(Data.max(pos1, pos1_mate)-Data.min(pos0, pos0_mate));
+ }else{
+ tlen=0;
+ }
+ assert(pos>0) : pos+"\n"+r1+"\n"+r2;
+ assert(pnext>0) : pnext+"\n"+r1+"\n"+r2;
+ }else if(r1.mapped() && !r2.mapped()){
+ pos=pos0;
+ pnext=pos0;
+ tlen=0;
+ assert(pos>0 && pnext>0);
+ }else if(!r1.mapped() && r2.mapped()){
+ pos=pos0_mate;
+ pnext=pos0_mate;
+ tlen=0;
+ assert(pos>0 && pnext>0);
+ }else if(!r1.mapped() && !r2.mapped()){
+ pos=pos0;
+ pnext=pos0_mate;
+ tlen=0;
+ assert(pos==0 && pnext==0);
+ }else{assert(false);}
+ }
+
+ assert(pos>=0) : "Negative coordinate "+pos+" for read:\n\n"+r1+"\n\n"+r2+"\n\n"+this+"\n\na1="+a1+", a2="+a2+
+ ", pos0="+pos0+", pos0_mate="+pos0_mate+", clip="+countLeadingClip(cigar, true, false)+", clipM="+countLeadingClip(r1.match);
+ assert(pnext>=0) : "Negative coordinate "+pnext+" for mate:\n\n"+r1+"\n\n"+r2+"\n\n"+this+"\n\na1="+a1+", a2="+a2+
+ ", pos0="+pos0+", pos0_mate="+pos0_mate+", clip="+countLeadingClip(cigar, true, false);
+ }
+
+ mapq=toMapq(r1, null);
+
+ if(verbose){
+ System.err.println("Making cigar for "+(r1.match==null ? "null" : new String(r1.match)));
+ }
+
+ final boolean inbounds=!r1.mapped() ? false : (a1>=0 && b1<scaflen);
+ final boolean inbounds2=(r2==null ? true : !r2.mapped() ? false : (a2>=0 && b2<scaflen2));
+ if(r1.bases!=null && r1.mapped() && r1.match!=null){
+ if(VERSION>1.3f){
+ if(inbounds && perfect && !r1.containsNonM()){//r.containsNonM() should be unnecessary... it's there in case of clipping...
+ cigar=(r1.length()+"=");
+// System.err.println("SETTING cigar14="+cigar);
+//
+// byte[] match=r.match;
+// if(r.shortmatch()){match=Read.toLongMatchString(match);}
+// cigar=toCigar13(match, a1, b1, scaflen, r.bases);
+// System.err.println("RESETTING cigar14="+cigar+" from toCigar14("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
+ }else{
+ byte[] match=r1.match;
+ if(r1.shortmatch()){match=Read.toLongMatchString(match);}
+ cigar=toCigar14(match, a1, b1, scaflen, r1.bases);
+// System.err.println("CALLING toCigar14("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
+ }
+ }else{
+ if(inbounds && (perfect || !r1.containsNonNMS())){
+ cigar=(r1.length()+"M");
+// System.err.println("SETTING cigar13="+cigar);
+//
+// byte[] match=r.match;
+// if(r.shortmatch()){match=Read.toLongMatchString(match);}
+// cigar=toCigar13(match, a1, b1, scaflen, r.bases);
+// System.err.println("RESETTING cigar13="+cigar+" from toCigar13("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
+ }else{
+ byte[] match=r1.match;
+ if(r1.shortmatch()){match=Read.toLongMatchString(match);}
+ cigar=toCigar13(match, a1, b1, scaflen, r1.bases);
+// System.err.println("CALLING toCigar13("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
+ }
+ }
+ }
+
+ if(verbose){
+ System.err.println("cigar="+cigar);
+ }
+
+// assert(false);
+
+// assert(primary() || cigar.equals(stringstar)) : cigar;
+// if(pos<0){pos=0;cigar=null;rname=bytestar;mapq=0;flag|=0x4;}
+
+// assert(false) : "\npos="+pos+"\ncigar='"+cigar+"'\nVERSION="+VERSION+"\na1="+a1+", b1="+b1+"\n\n"+r.toString();
+
+// rnext=(r2==null ? stringstar : (r.mapped() && !r2.mapped()) ? "chr"+Gene.chromCodes[r.chrom] : "chr"+Gene.chromCodes[r2.chrom]);
+ rnext=((r2==null || (!r1.mapped() && !r2.mapped())) ? bytestar : (r1.mapped() && r2.mapped()) ? (sameScaf ? byteequals : name2) : byteequals);
+
+ assert(rnext!=byteequals || name1==name2 || name1==bytestar || name2==bytestar) :
+ new String(rname)+", "+new String(rnext)+", "+new String(name1)+", "+new String(name2)+"\n"+r1+"\n"+r2;
+
+// assert(r1.pairnum()==0) : r1.mapped()+", "+r2.mapped()+"fragNum="+fragNum+
+// "\nname1="+new String(name1)+"\nname2="+new String(name2)+"\nrname="+new String(rname)+"\nrnext="+new String(rnext)+
+// "\nname1="+name1+"\nname2="+name2+"\nrname="+rname+"\nrnext="+rnext+"\nidx1="+idx1+"\nidx2="+idx2;
+
+ if(Data.scaffoldPrefixes){
+ if(rname!=null && rname!=bytestar){
+ int k=Tools.indexOf(rname, (byte)'$');
+ rname=Arrays.copyOfRange(rname, k+1, rname.length);
+ }
+ if(rnext!=null && rnext!=bytestar){
+ int k=Tools.indexOf(rnext, (byte)'$');
+ rnext=Arrays.copyOfRange(rnext, k+1, rnext.length);
+ }
+ }
+
+// if(r2==null || r.stop<=r2.start){
+// //plus sign
+// }else if(r2.stop<=r.start){
+// //minus sign
+// tlen=-tlen;
+// }else{
+// //They overlap... a lot. Physically shorter than read length.
+// if(r.start<=r2.start){
+//
+// }else{
+// tlen=-tlen;
+// }
+// }
+ //This version is less technically correct (does not account for very short insert reads) but probably more in line with what is expected
+ if(r2==null || r1.start<r2.start || (r1.start==r2.start && r1.pairnum()==0)){
+ //plus sign
+ }else{
+ //minus sign
+ tlen=-tlen;
+ }
+
+// if(r.secondary()){
+//// seq=qual=stringstar;
+// seq=qual=bytestar;
+// }else{
+// if(r.strand()==Gene.PLUS){
+//// seq=new String(r.bases);
+// seq=r.bases.clone();
+// if(r.quality==null){
+//// qual=stringstar;
+// qual=bytestar;
+// }else{
+//// StringBuilder q=new StringBuilder(r.quality.length);
+//// for(byte b : r.quality){
+//// q.append((char)(b+33));
+//// }
+//// qual=q.toString();
+// qual=new byte[r.quality.length];
+// for(int i=0, j=qual.length-1; i<qual.length; i++, j--){
+// qual[i]=(byte)(r.quality[j]+33);
+// }
+// }
+// }else{
+//// seq=new String(AminoAcid.reverseComplementBases(r.bases));
+// seq=AminoAcid.reverseComplementBases(r.bases);
+// if(r.quality==null){
+//// qual=stringstar;
+// qual=bytestar;
+// }else{
+//// StringBuilder q=new StringBuilder(r.quality.length);
+//// for(int i=r.quality.length-1; i>=0; i--){
+//// q.append((char)(r.quality[i]+33));
+//// }
+//// qual=q.toString();
+// qual=new byte[r.quality.length];
+// for(int i=0, j=qual.length-1; i<qual.length; i++, j--){
+// qual[i]=(byte)(r.quality[j]+33);
+// }
+// }
+// }
+// }
+
+ if(r1.secondary() && SECONDARY_ALIGNMENT_ASTERISKS){
+// seq=qual=bytestar;
+ seq=qual=null;
+ }else{
+ seq=r1.bases;
+ if(r1.quality==null){
+// qual=bytestar;
+ qual=null;
+ }else{
+ qual=r1.quality;
+ }
+ }
+
+
+ optional=makeOptionalTags(r1, r2, perfect, scafloc, scaflen, inbounds, inbounds2);
+// assert(r.pairnum()==1) : "\n"+r.toText(false)+"\n"+this+"\n"+r2;
+ }
+
+ public SamLine(String[] s){
+ assert(!s[0].startsWith("@")) : "Tried to make a SamLine from a header: "+s[0];
+ assert(s.length>=11) : "\nNot all required fields are present: "+s.length+"\nline='"+Arrays.toString(s)+"'\n";
+ if(s.length<11){
+ System.err.println("Invalid SamLine: "+Arrays.toString(s));
+ return;
+ }
+ qname=s[0];
+ flag=Integer.parseInt(s[1]);
+ rname=s[2].getBytes();
+ pos=Integer.parseInt(s[3]);
+// try {
+// Integer.parseInt(s[4]);
+// } catch (NumberFormatException e) {
+// System.err.println(Arrays.toString(s));
+// }
+ mapq=Character.isDigit(s[4].charAt(0)) ? Integer.parseInt(s[4]) : 99; //Added for non-compliant mappers that put * here
+ cigar=s[5];
+ rnext=s[6].getBytes();
+ pnext=(s[7].charAt(0)=='*' ? 0 : Integer.parseInt(s[7]));
+ tlen=Character.isDigit(s[8].charAt(0)) ? Integer.parseInt(s[8]) : 0; //Added for non-compliant mappers that put * here
+// seq=s[9];
+// qual=s[10];
+ seq=(s[9].equals(stringstar) ? null : s[9].getBytes());
+ qual=(s[10].equals(stringstar) ? null : s[10].getBytes());
+
+ if(mapped() && strand()==Gene.MINUS){
+ if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);}
+ if(qual!=bytestar){Tools.reverseInPlace(qual);}
+ }
+
+ if(qual!=null && qual!=bytestar){
+ for(int i=0; i<qual.length; i++){qual[i]-=33;}
+ }
+
+ if(!PARSE_OPTIONAL){return;}
+
+ if(s.length>11){
+ optional=new ArrayList<String>(s.length-11);
+ for(int i=11; i<s.length; i++){
+ optional.add(s[i]);
+ }
+ }
+ }
+
+ /**
+  * Builds a SamLine directly from the raw bytes of one tab-delimited SAM record.
+  * The line is scanned once: 'a' marks the start of the current field and 'b' is advanced to the
+  * next tab (or end of line), so each field occupies s[a..b).
+  * Several fields are only stored when their PARSE_* flag is set.
+  * A field consisting solely of '*' is stored as null (or 0 for pnext).
+  * @param s Bytes of one SAM alignment line; must not be a header line.
+  */
+ public SamLine(byte[] s){
+ assert(s[0]!='@') : "Tried to make a SamLine from a header: "+new String(s);
+
+ int a=0, b=0;
+
+ //Field 0: query name
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 0: "+new String(s);
+ if(PARSE_0){qname=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));}
+ b++;
+ a=b;
+
+ //Field 1: flag
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 1: "+new String(s);
+ flag=Tools.parseInt(s, a, b);
+ b++;
+ a=b;
+
+ //Field 2: reference name, kept either as bytes (rname) or as a String (rnameS)
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 2: "+new String(s);
+ if(RNAME_AS_BYTES){
+ rname=(b==a+1 && s[a]=='*' ? null : Arrays.copyOfRange(s, a, b));
+ }else{
+ rnameS=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));
+ }
+ b++;
+ a=b;
+
+ //Field 3: position
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 3: "+new String(s);
+ pos=Tools.parseInt(s, a, b);
+ b++;
+ a=b;
+
+ //Field 4: mapping quality
+ //NOTE(review): unlike the String[] constructor there is no special case for '*' here - confirm how Tools.parseInt treats it
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 4: "+new String(s);
+ mapq=Tools.parseInt(s, a, b);
+ b++;
+ a=b;
+
+ //Field 5: cigar
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 5: "+new String(s);
+ cigar=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));
+ b++;
+ a=b;
+
+ //Field 6: mate reference name
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 6: "+new String(s);
+ if(PARSE_6){rnext=(b==a+1 && s[a]=='*' ? null : Arrays.copyOfRange(s, a, b));}
+ b++;
+ a=b;
+
+ //Field 7: mate position
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 7: "+new String(s);
+ if(PARSE_7){pnext=(b==a+1 && s[a]=='*' ? 0 :Tools.parseInt(s, a, b));}
+ b++;
+ a=b;
+
+ //Field 8: template length
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 8: "+new String(s);
+ if(PARSE_8){tlen=Tools.parseInt(s, a, b);}
+ b++;
+ a=b;
+
+ //Field 9: bases
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 9: "+new String(s);
+// seq=new String(s, a, b-a);
+ seq=(b==a+1 && s[a]=='*' ? null : Arrays.copyOfRange(s, a, b));
+ b++;
+ a=b;
+
+ //Field 10: qualities
+ while(b<s.length && s[b]!='\t'){b++;}
+ assert(b>a) : "Missing field 10: "+new String(s);
+// qual=new String(s, a, b-a);
+ if(PARSE_10){qual=(b==a+1 && s[a]=='*' ? null : Arrays.copyOfRange(s, a, b));}
+ b++;
+ a=b;
+
+ assert((seq==bytestar)==(Tools.equals(seq, bytestar)));
+ assert((qual==bytestar)==(Tools.equals(qual, bytestar)));
+
+ //Minus-strand records are flipped in place.
+ //NOTE(review): seq/qual may be null here when the field was '*'; verify the helpers accept null
+ if(mapped() && strand()==Gene.MINUS){
+ if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);}
+ if(qual!=bytestar){Tools.reverseInPlace(qual);}
+ }
+
+ //Remove the ASCII offset of 33 from each quality character
+ if(qual!=null && qual!=bytestar){
+ for(int i=0; i<qual.length; i++){qual[i]-=33;}
+ }
+
+ if(!PARSE_OPTIONAL){return;}
+
+ //Every remaining non-empty field is stored verbatim as an optional tag
+ if(b<s.length){
+ optional=new ArrayList<String>(4);
+ while(b<s.length){
+ while(b<s.length && s[b]!='\t'){b++;}
+ if(b>a){
+ String x=new String(s, a, b-a);
+ optional.add(x);
+ }else{
+ //Empty field
+ }
+ b++;
+ a=b;
+ }
+ }
+ }
+
+ /**
+  * Reads only the flag (the second tab-delimited field) from the raw bytes of a SAM line.
+  * @param s Bytes of one SAM line; must be non-null and non-empty.
+  * @return The value Tools.parseInt produces for field 1, or -1 if the line starts with '@'.
+  */
+ public static final int parseFlagOnly(byte[] s){
+ 	assert(s!=null && s.length>0) : "Blank line.";
+ 	if(s[0]=='@'){return -1;}
+
+ 	int fieldStart=0, fieldEnd=0;
+
+ 	//Skip over field 0 (the name)
+ 	for(; fieldEnd<s.length && s[fieldEnd]!='\t'; fieldEnd++){}
+ 	assert(fieldEnd>fieldStart) : "Missing field 0: "+new String(s);
+ 	fieldEnd++;
+ 	fieldStart=fieldEnd;
+
+ 	//Delimit field 1 (the flag)
+ 	for(; fieldEnd<s.length && s[fieldEnd]!='\t'; fieldEnd++){}
+ 	assert(fieldEnd>fieldStart) : "Missing field 1: "+new String(s);
+ 	return Tools.parseInt(s, fieldStart, fieldEnd);
+ }
+
+ /**
+  * Reads only the query name (the first tab-delimited field) from the raw bytes of a SAM line.
+  * @param s Bytes of one SAM line; must be non-null and non-empty.
+  * @return The name, or null if the line starts with '@' or the name field is a lone '*'.
+  */
+ public static final String parseNameOnly(byte[] s){
+ 	assert(s!=null && s.length>0) : "Blank line.";
+ 	if(s[0]=='@'){return null;}
+
+ 	//Locate the end of the first field
+ 	int nameEnd=0;
+ 	while(nameEnd<s.length && s[nameEnd]!='\t'){nameEnd++;}
+ 	assert(nameEnd>0) : "Missing field 0: "+new String(s);
+
+ 	final boolean unavailable=(nameEnd==1 && s[0]=='*');
+ 	if(unavailable){return null;}
+ 	return new String(s, 0, nameEnd);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Cigar ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static String toCigar13(byte[] match, int readStart, int readStop, int reflen, byte[] bases){
+ if(match==null || readStart==readStop){return null;}
+ StringBuilder sb=new StringBuilder(8);
+ int count=0;
+ char mode='=';
+ char lastMode='=';
+
+ int refloc=readStart;
+
+ int cigarlen=0; //for debugging
+ int opcount=0; //for debugging
+
+ for(int mpos=0; mpos<match.length; mpos++){
+
+ byte m=match[mpos];
+
+ boolean sfdflag=false;
+ if(SOFT_CLIP && (refloc<0 || refloc>=reflen)){
+ mode='S'; //soft-clip out-of-bounds
+ if(m!='I'){refloc++;}
+ if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions!
+ }else if(m=='m' || m=='s' || m=='S' || m=='N' || m=='B'){//Little 's' is for a match classified as a sub to improve the affine score.
+ mode='M';
+ refloc++;
+ }else if(m=='I' || m=='X' || m=='Y'){
+ mode='I';
+ }else if(m=='D'){
+ mode='D';
+ refloc++;
+ }else if(m=='C'){
+ mode='S';
+ refloc++;
+ }else{
+ throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii). " +
+ "Match string should be in long format here.");
+ }
+
+ if(mode!=lastMode){
+ if(count>0){//Prevents an initial length-0 match
+ sb.append(count);
+// sb.append(lastMode);
+ if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');}
+ else{sb.append(lastMode);}
+ if(lastMode!='D'){cigarlen+=count;}
+ opcount+=count;
+ }
+ count=0;
+ lastMode=mode;
+ }
+
+ count++;
+ if(sfdflag){count--;}
+ }
+ sb.append(count);
+ if(mode=='D' && count>INTRON_LIMIT){sb.append('N');}
+ else{sb.append(mode);}
+ if(mode!='D'){cigarlen+=count;}
+ opcount+=count;
+
+ assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" +
+ "cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n";
+
+ return sb.toString();
+ }
+
+ /**
+  * Converts a SAM 1.4-style cigar string to the SAM 1.3 style by replacing every
+  * '=' (match) and 'X' (mismatch) operator with 'M'.
+  * Operation lengths are left untouched, so adjacent operations are not merged.
+  * @param cigar14 Cigar string that may contain '=' and 'X' operators; may be null.
+  * @return The converted cigar string, or null if cigar14 was null.
+  */
+ public static String toCigar13(String cigar14) {
+ 	if(cigar14==null){return null;}
+ 	//Operate on chars rather than platform-charset bytes so the result never depends on the default encoding.
+ 	return cigar14.replace('X', 'M').replace('=', 'M');
+ }
+
+
+ public static String toCigar14(byte[] match, int readStart, int readStop, int reflen, byte[] bases){
+ if(match==null || readStart==readStop){return null;}
+ StringBuilder sb=new StringBuilder(8);
+ int count=0;
+ char mode='=';
+ char lastMode='=';
+
+ int refloc=readStart;
+
+ int cigarlen=0; //for debugging
+ int opcount=0; //for debugging
+
+ for(int mpos=0; mpos<match.length; mpos++){
+
+ byte m=match[mpos];
+
+ boolean sfdflag=false;
+ if(SOFT_CLIP && (refloc<0 || refloc>=reflen)){
+ mode='S'; //soft-clip out-of-bounds
+ if(m!='I'){refloc++;}
+ if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions!
+ }else if(m=='m' || m=='s'){//Little 's' is for a match classified as a sub to improve the affine score.
+ mode='=';
+ refloc++;
+ }else if(m=='S'){
+ mode='X';
+ refloc++;
+ }else if(m=='I' || m=='X' || m=='Y'){
+ mode='I';
+ }else if(m=='D'){
+ mode='D';
+ refloc++;
+ }else if(m=='C'){
+ mode='S';
+ refloc++;
+ }else if(m=='N' || m=='B'){
+ mode='M';
+ refloc++;
+ }else{
+ throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii). " +
+ "Match string should be in long format here.");
+ }
+
+ if(mode!=lastMode){
+ if(count>0){//Prevents an initial length-0 match
+ sb.append(count);
+ if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');}
+ else{sb.append(lastMode);}
+ if(lastMode!='D'){cigarlen+=count;}
+ opcount+=count;
+ }
+ count=0;
+ lastMode=mode;
+ }
+
+ count++;
+ if(sfdflag){count--;}
+ }
+ sb.append(count);
+ if(mode=='D' && count>INTRON_LIMIT){
+ sb.append('N');
+ }else{
+ sb.append(mode);
+ }
+ if(mode!='D'){cigarlen+=count;}
+ opcount+=count;
+
+ assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" +
+ "cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n";
+
+ return sb.toString();
+ }
+
+ /**
+  * Reference length of this line's own cigar string; delegates to the static overload.
+  * @param includeSoftClip Count 'S' operations.
+  * @param includeHardClip Count 'H' operations.
+  * @return Summed length, or 0 when this line has no cigar.
+  */
+ public int calcCigarLength(boolean includeSoftClip, boolean includeHardClip){
+ 	final String ownCigar=this.cigar;
+ 	return calcCigarLength(ownCigar, includeSoftClip, includeHardClip);
+ }
+
+ /** Reference length of cigar string */
+ public static int calcCigarLength(String cigar, boolean includeSoftClip, boolean includeHardClip){
+ if(cigar==null){return 0;}
+ int len=0;
+ int current=0;
+ for(int i=0; i<cigar.length(); i++){
+ char c=cigar.charAt(i);
+ if(Character.isDigit(c)){
+ current=(current*10)+(c-'0');
+ }else{
+ if(c=='M' || c=='=' || c=='X' || c=='D' || c=='N'){
+ len+=current;
+ }else if(c=='S'){
+ if(includeSoftClip){len+=current;}
+ }else if (c=='H'){
+ //In this case, the base string is the wrong length since letters were truncated.
+ //Therefore, the bases cannot be used for calling variations after mapping.
+ //Hard clipping messes up original location verification.
+ //Therefore... len+=current would be best in practice, but for GRADING purposes, leaving it disabled is best.
+
+ if(includeHardClip){len+=current;}
+ }else if(c=='I'){
+ //do nothing
+ }else if(c=='P'){
+ throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
+ //'P' is currently poorly defined
+ }else{
+ throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
+ }
+ current=0;
+ }
+ }
+ return len;
+ }
+
+ /** Number of query bases in cigar string */
+ public static int calcCigarBases(String cigar, boolean includeSoftClip, boolean includeHardClip){
+ if(cigar==null){return 0;}
+ int len=0;
+ int current=0;
+ for(int i=0; i<cigar.length(); i++){
+ char c=cigar.charAt(i);
+ if(Character.isDigit(c)){
+ current=(current*10)+(c-'0');
+ }else{
+ if(c=='M' || c=='=' || c=='X' || c=='I'){
+ len+=current;
+ }else if(c=='D' || c=='N'){
+ //do nothing
+ }else if (c=='H'){
+ if(includeHardClip){len+=current;}
+ }else if(c=='S'){
+ if(includeSoftClip){len+=current;}
+ }else if(c=='P'){
+ throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
+ //'P' is currently poorly defined
+ }else{
+ throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
+ }
+ current=0;
+ }
+ }
+ return len;
+ }
+
+ /**
+  * Length of the clipping operations at the start of a cigar string.
+  * Used to calculate the correct start location of clipped reads.
+  * Scanning stops at the first operator that is neither 'H' nor 'S'.
+  * @param cigar Cigar string; null yields 0.
+  * @param includeSoftClip Count leading 'S' operations.
+  * @param includeHardClip Count leading 'H' operations.
+  * @return Summed length of the requested leading clip operations.
+  */
+ public static int countLeadingClip(String cigar, boolean includeSoftClip, boolean includeHardClip){
+ 	if(cigar==null || (!includeSoftClip && !includeHardClip)){return 0;}
+ 	int clipped=0;
+ 	int pending=0;
+ 	for(int i=0; i<cigar.length(); i++){
+ 		final char c=cigar.charAt(i);
+ 		final boolean isOperator=(c=='=' || Character.isLetter(c));
+ 		if(!isOperator){
+ 			pending=(pending*10)+(c-'0');
+ 			continue;
+ 		}
+ 		if(c=='H'){
+ 			if(includeHardClip){clipped+=pending;}
+ 		}else if(c=='S'){
+ 			if(includeSoftClip){clipped+=pending;}
+ 		}else{
+ 			//First non-clip operation ends the leading clip
+ 			return clipped;
+ 		}
+ 		pending=0;
+ 	}
+ 	return clipped;
+ }
+
+ /**
+  * Length of the clipping operations at the end of a cigar string.
+  * Used to calculate the correct stop location of clipped reads.
+  * The cigar is walked backwards over its trailing 'H' and 'S' operations until the first
+  * operation of any other type is reached.
+  * If the clip operations extend all the way to the start of the cigar (there is no non-clip
+  * operation), 0 is returned, because such clipping is not trailing.
+  * @param cigar Cigar string; null yields 0.
+  * @param includeSoftClip Count trailing 'S' operations.
+  * @param includeHardClip Count trailing 'H' operations.
+  * @return Summed length of the requested trailing clip operations.
+  */
+ public static int countTrailingClip(String cigar, boolean includeSoftClip, boolean includeHardClip){
+ 	if(cigar==null || (!includeSoftClip && !includeHardClip)){return 0;}
+ 	int len=0;
+ 	int end=cigar.length(); //Exclusive end of the operation currently examined
+ 	while(end>0){
+ 		final char op=cigar.charAt(end-1);
+ 		if(op!='S' && op!='H'){return len;} //Reached the aligned part of the read
+
+ 		//Parse the operation's length, right to left
+ 		int start=end-1;
+ 		int count=0;
+ 		int mult=1;
+ 		while(start>0 && Character.isDigit(cigar.charAt(start-1))){
+ 			start--;
+ 			count+=(cigar.charAt(start)-'0')*mult;
+ 			mult*=10;
+ 		}
+ 		if(start==0){return 0;} //Clipping reaches the start of the cigar, so none of it is trailing
+
+ 		if(op=='S' ? includeSoftClip : includeHardClip){len+=count;}
+ 		end=start;
+ 	}
+ 	return 0;
+ }
+
+ /**
+  * Length of the hard-clip operation at the very end of a cigar string.
+  * Used to calculate the correct stop location of clipped reads.
+  * @param cigar Cigar string; null yields 0.
+  * @return Length of the final 'H' operation, or 0 if the cigar does not end with 'H'
+  * or consists of nothing but that single operation.
+  */
+ public static int countTrailingHardClip(String cigar){
+ 	if(cigar==null || cigar.isEmpty() || cigar.charAt(cigar.length()-1)!='H'){return 0;}
+
+ 	//Parse the digits in front of the final 'H', right to left
+ 	int len=0;
+ 	int mult=1;
+ 	int i=cigar.length()-2;
+ 	for(; i>=0 && Character.isDigit(cigar.charAt(i)); i--){
+ 		len+=(cigar.charAt(i)-'0')*mult;
+ 		mult*=10;
+ 	}
+ 	if(i<0){return 0;} //The 'H' is the first operation, so it is not a trailing clip
+ 	return len;
+ }
+
+ /**
+  * Number of consecutive 'C' (clip) symbols at the start of a match string.
+  * @param match Match string without digits (long format); null yields 0.
+  * @return Count of leading 'C' symbols.
+  */
+ public static int countLeadingClip(byte[] match){
+ 	if(match==null){return 0;}
+ 	int idx=0;
+ 	while(idx<match.length){
+ 		final byte symbol=match[idx];
+ 		assert(!Character.isDigit(symbol));
+ 		if(symbol!='C'){break;}
+ 		idx++;
+ 	}
+ 	return idx;
+ }
+
+ /**
+  * Number of consecutive 'C' (clip) symbols at the end of a match string.
+  * @param match Match string without digits (long format); null yields 0.
+  * @return Count of trailing 'C' symbols.
+  */
+ public static int countTrailingClip(byte[] match){
+ 	if(match==null){return 0;}
+ 	final int lastIdx=match.length-1;
+ 	int idx=lastIdx;
+ 	while(idx>=0){
+ 		final byte symbol=match[idx];
+ 		assert(!Character.isDigit(symbol));
+ 		if(symbol!='C'){break;}
+ 		idx--;
+ 	}
+ 	return lastIdx-idx;
+ }
+
+ /**
+  * Net length of the insertions and deletions in the out-of-bounds (negative reference position)
+  * portion at the start of an alignment.
+  * The match string is walked while the reference position is below 0; every symbol except 'I'
+  * advances the reference position.
+  * @param rloc Reference position of the first match symbol; values of 0 or more yield 0.
+  * @param match Match string without digits (long format); null yields 0.
+  * @return Deletions minus insertions seen before the reference position reaches 0.
+  */
+ public static int countLeadingIndels(int rloc, byte[] match){
+ 	if(match==null || rloc>=0){return 0;}
+ 	int net=0;
+ 	for(int i=0; i<match.length && rloc<0; i++){
+ 		final byte symbol=match[i];
+ 		assert(!Character.isDigit(symbol));
+ 		if(symbol=='I'){
+ 			net--;
+ 		}else{
+ 			if(symbol=='D'){net++;}
+ 			rloc++;
+ 		}
+ 	}
+ 	return net;
+ }
+
+ /**
+  * Net length of the insertions and deletions in the out-of-bounds portion at the end of an
+  * alignment, i.e. where the reference position is at or beyond rlen.
+  * The match string is walked backwards from its last symbol while the reference position is
+  * at least rlen; every symbol except 'I' moves the reference position back by one.
+  * Mirrors countLeadingIndels.
+  * @param rloc Reference position of the last match symbol; values below rlen yield 0.
+  * @param rlen Length of the reference sequence.
+  * @param match Match string without digits (long format); null yields 0.
+  * @return Deletions minus insertions seen while the reference position is out of bounds.
+  */
+ public static int countTrailingIndels(int rloc, int rlen, byte[] match){
+ 	//Only alignments that run past the end of the reference have anything to count.
+ 	//The previous guard (rloc>=0) was copied from the leading variant and made this method always return 0.
+ 	if(match==null || rloc<rlen){return 0;}
+ 	int dels=0;
+ 	int inss=0;
+ 	//Start at the last valid index; match.length itself is out of range.
+ 	for(int mloc=match.length-1; mloc>=0 && rloc>=rlen; mloc--){
+ 		byte b=match[mloc];
+ 		assert(!Character.isDigit(b));
+ 		if(b=='D'){
+ 			dels++;
+ 			rloc--;
+ 		}else if(b=='I'){
+ 			inss++;
+ 		}else{
+ 			rloc--;
+ 		}
+ 	}
+ 	return dels-inss;
+ }
+
+ /**
+ * @param cigar
+ * @return Max consecutive match, sub, del, ins, or clip symbols
+ */
+ public static final int[] cigarToMdsiMax(String cigar) {
+ if(cigar==null){return null;}
+ int[] msdic=new int[5];
+
+ int current=0;
+ for(int i=0; i<cigar.length(); i++){
+ char c=cigar.charAt(i);
+ if(Character.isDigit(c)){
+ current=(current*10)+(c-'0');
+ }else{
+ if(c=='M' || c=='='){
+ msdic[0]=Tools.max(msdic[0], current);
+ }else if(c=='X'){
+ msdic[1]=Tools.max(msdic[1], current);
+ }else if(c=='D' || c=='N'){
+ msdic[2]=Tools.max(msdic[2], current);
+ }else if(c=='I'){
+ msdic[3]=Tools.max(msdic[3], current);
+ }else if(c=='S' || c=='H' || c=='P'){
+ msdic[4]=Tools.max(msdic[4], current);
+ }
+ current=0;
+ }
+ }
+ return msdic;
+ }
+
+ /**
+ * @param cigar
+ * @return Total number of match, sub, del, ins, or clip symbols
+ */
+ public static final int[] cigarToMsdic(String cigar) {
+ if(cigar==null){return null;}
+ int[] msdic=new int[5];
+
+ int current=0;
+ for(int i=0; i<cigar.length(); i++){
+ char c=cigar.charAt(i);
+ if(Character.isDigit(c)){
+ current=(current*10)+(c-'0');
+ }else{
+ if(c=='M' || c=='='){
+ msdic[0]+=current;
+ }else if(c=='X'){
+ msdic[1]+=current;
+ }else if(c=='D' || c=='N'){
+ msdic[2]+=current;
+ }else if(c=='I'){
+ msdic[3]+=current;
+ }else if(c=='S' || c=='H' || c=='P'){
+ msdic[4]+=current;
+ }
+ current=0;
+ }
+ }
+ return msdic;
+ }
+
+ /**
+ * @param cigar
+ * @return Match string of this cigar string when possible, otherwise null
+ */
+ public static final byte[] cigarToShortMatch(String cigar, boolean allowM) {
+ if(cigar==null || cigar.equals(stringstar)){return null;}
+
+ int total=0;
+ int current=0;
+
+// int totalLen=0;
+// int currentLen=0;
+ for(int i=0; i<cigar.length(); i++){
+ char c=cigar.charAt(i);
+ if(Character.isDigit(c)){
+ current=(current*10)+(c-'0');
+ }else{
+
+ if(c=='M'){
+ if(!allowM){return null;} //Possible loss of information
+ }else if(c=='H'){
+ current=0; //Information destroyed
+ }else if(c=='P'){
+ return null; //Undefined symbol
+ }
+
+ total+=current;
+ current=0;
+ }
+ }
+
+ if(total<1){return null;}
+
+ ByteBuilder sb=new ByteBuilder(cigar.length());
+
+ for(int i=0; i<cigar.length(); i++){
+ char c=cigar.charAt(i);
+ if(Character.isDigit(c)){
+ current=(current*10)+(c-'0');
+ }else{
+ if(c=='='){
+ sb.append('m');
+ if(current>1){sb.append(current);}
+ }else if(c=='X'){
+ sb.append('S');
+ if(current>1){sb.append(current);}
+ }else if(c=='D' || c=='N'){
+ sb.append('D');
+ if(current>1){sb.append(current);}
+ }else if(c=='I'){
+ sb.append('I');
+ if(current>1){sb.append(current);}
+ }else if(c=='S'){
+ sb.append('C');
+ if(current>1){sb.append(current);}
+ }else if(c=='M'){
+// sb.append('B');
+ sb.append('N');
+ if(current>1){sb.append(current);}
+ }
+ current=0;
+ }
+ }
+
+ if(sb.array.length==sb.length()){return sb.array;}
+ return sb.toBytes();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Tags ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Builds the YS:i: tag holding the stop position of an alignment.
+  * @param pos Start position of the alignment.
+  * @param seqLength Read length, used as the span when cigar is null or the alignment is perfect.
+  * @param cigar Cigar string; otherwise its length including soft clips (excluding hard clips) is the span.
+  * @param perfect Whether the alignment is a perfect match.
+  * @return "YS:i:" followed by pos+span-1.
+  */
+ public static String makeStopTag(int pos, int seqLength, String cigar, boolean perfect){
+ 	final int span;
+ 	if(cigar==null || perfect){
+ 		span=seqLength;
+ 	}else{
+ 		span=calcCigarLength(cigar, true, false);
+ 	}
+ 	final int stop=pos+span-1;
+ 	return "YS:i:"+stop;
+ }
+
+ /**
+  * Builds the YL:Z: tag holding two comma-separated lengths.
+  * For a null cigar or perfect alignment both are seqLength.
+  * Otherwise the first is seqLength minus the leading soft clip and the second is the cigar's
+  * length excluding all clipping.
+  * @param pos Unused.
+  * @param seqLength Read length.
+  * @param cigar Cigar string; may be null.
+  * @param perfect Whether the alignment is a perfect match.
+  * @return The formatted tag.
+  */
+ public static String makeLengthTag(int pos, int seqLength, String cigar, boolean perfect){
+ 	final int first, second;
+ 	if(cigar==null || perfect){
+ 		first=seqLength;
+ 		second=seqLength;
+ 	}else{
+ 		first=seqLength-countLeadingClip(cigar, true, false);
+ 		second=calcCigarLength(cigar, false, false);
+ 	}
+ 	return "YL:Z:"+first+","+second;
+ }
+
+ /**
+  * Builds the YI:f: tag holding the alignment identity as a percentage.
+  * @param match Match string handed to Read.identity; not consulted when perfect is true.
+  * @param perfect Whether the alignment is a perfect match.
+  * @return "YI:f:100" for perfect alignments, otherwise "YI:f:" plus 100 times the identity with two decimals.
+  */
+ public static String makeIdentityTag(byte[] match, boolean perfect){
+ 	if(perfect){return "YI:f:100";}
+ 	float f=Read.identity(match);
+ 	//Pin the locale: the default locale may use ',' as the decimal separator,
+ 	//which would not be a parseable float value in the tag.
+ 	return String.format(java.util.Locale.ROOT, "YI:f:%.2f", (100*f));
+ }
+
+ /**
+  * Builds the YR:i: tag.
+  * @param score Value to place in the tag.
+  * @return "YR:i:" followed by the score in decimal.
+  */
+ public static String makeScoreTag(int score){
+ 	final StringBuilder tag=new StringBuilder("YR:i:");
+ 	tag.append(score);
+ 	return tag.toString();
+ }
+
+ /**
+  * Looks up the X2:Z: tag among this line's optional tags.
+  * @return The first optional tag starting with "X2:Z:" (prefix included), or null if there is none.
+  */
+ public String matchTag(){
+ 	if(optional!=null){
+ 		for(final String tag : optional){
+ 			if(tag.startsWith("X2:Z:")){return tag;}
+ 		}
+ 	}
+ 	return null;
+ }
+
+ private String makeXSTag(Read r){
+ if(r.mapped() && cigar!=null && cigar.indexOf('N')>=0){
+// System.err.println("For read "+r.pairnum()+" mapped to strand "+r.strand());
+ boolean plus=(r.strand()==Gene.PLUS); //Assumes secondstrand=false
+// System.err.println("plus="+plus);
+ if(r.pairnum()!=0){plus=!plus;}
+// System.err.println("plus="+plus);
+ if(XS_SECONDSTRAND){plus=!plus;}
+// System.err.println("plus="+plus);
+ return (plus ? XSPLUS : XSMINUS);
+ }else{
+ return null;
+ }
+ }
+
+
+ /**
+  * Builds the MD:Z: tag (a description of the reference bases at mismatches and deletions)
+  * by walking the match string alongside the reference.
+  * Match symbols: 'm' and 's' count as matches; 'S' is a substitution; 'N' is compared against the
+  * reference and treated as a match or substitution accordingly; 'I', 'X' and 'Y' consume a read
+  * base only; 'D' consumes a reference base only; 'C' and any position outside
+  * [scafloc, scafloc+scaflen) is treated as clipped and contributes nothing.
+  * Deletion runs longer than INTRON_LIMIT are left out of the tag.
+  * @param chrom Chromosome number handed to Data.getChromosome; a negative value yields null.
+  * @param refstart Reference position of the first match symbol.
+  * @param match Match string, one symbol per position; null yields null.
+  * @param call Read bases.
+  * @param scafloc Reference position where the scaffold starts.
+  * @param scaflen Length of the scaffold.
+  * @return The MD tag including its "MD:Z:" prefix, or null.
+  */
+ public static String makeMdTag(int chrom, int refstart, byte[] match, byte[] call, int scafloc, int scaflen){
+ 	if(match==null || chrom<0){return null;}
+ 	StringBuilder md=new StringBuilder(8);
+ 	md.append("MD:Z:");
+
+ 	ChromosomeArray cha=Data.getChromosome(chrom);
+
+ 	final int scafstop=scafloc+scaflen;
+
+ 	byte prevM='?';
+ 	int count=0; //Matching bases since the last emitted token
+ 	int dels=0; //Length of the deletion run currently being accumulated
+ 	boolean prevSub=false; //Whether the last emitted token was a substitution base
+ 	for(int mpos=0, rpos=refstart, cpos=0; mpos<match.length; mpos++){
+ 		assert(cpos>=0 && cpos<call.length) : "\n"+new String(match)+"\n"+new String(call)+"\n"+mpos+", "+cpos+", "+dels+", "+INTRON_LIMIT;
+ 		final byte c=call[cpos];
+ 		final byte m=match[mpos];
+
+ 		if(prevM=='D' && m!='D'){
+ 			//A deletion run just ended
+ 			if(dels<=INTRON_LIMIT){//Otherwise, ignore it
+ 				md.append(count);
+ 				count=0;
+ 				md.append('^');
+ 				for(int i=rpos-dels; i<rpos; i++){
+ 					md.append((char)cha.get(i));
+ 				}
+ 				//The last token is now a deletion, so a following substitution needs a '0'
+ 				//separator to keep it distinguishable from the deleted bases.
+ 				prevSub=false;
+ 			}
+ 			//Always start the next run from zero; otherwise a skipped intron's length would be
+ 			//added to every later deletion and cause those to be skipped too.
+ 			dels=0;
+ 		}
+
+ 		if(m=='C' || rpos<scafloc || rpos>=scafstop){ //Do nothing for clipped bases
+ 			rpos++;
+ 			if(m!='D'){cpos++;}
+ 		}else if(m=='m' || m=='s'){
+ 			count++;
+ 			rpos++;
+ 			cpos++;
+ 		}else if(m=='S'){
+ 			if(count>0 || !prevSub){md.append(count);}
+ 			md.append((char)cha.get(rpos));
+
+ 			count=0;
+ 			rpos++;
+ 			cpos++;
+ 			prevSub=true;
+ 		}else if(m=='N'){
+
+ 			final byte r=cha.get(rpos);
+
+ 			if(c==r){//Act like match
+ 				count++;
+ 				rpos++;
+ 				cpos++;
+ 			}else{//Act like sub
+ 				if(count>0 || !prevSub){md.append(count);}
+ 				md.append((char)r);
+
+ 				count=0;
+ 				rpos++;
+ 				cpos++;
+ 				prevSub=true;
+ 			}
+ 		}else if(m=='I' || m=='X' || m=='Y'){
+ 			cpos++;
+ 		}else if(m=='D'){
+ 			//Bases are emitted once the run ends, when its total length is known
+ 			rpos++;
+ 			dels++;
+ 		}
+ 		prevM=m;
+
+ 	}
+ 	//The tag always ends with a count, even if it is 0
+ 	md.append(count);
+
+ 	return md.toString();
+ }
+
+ /**
+  * Length of the soft clip at the start of a cigar string.
+  * @param cig Cigar string; null yields 0.
+  * @param id Unused.
+  * @return Length of the first operation if it is 'S', otherwise 0.
+  */
+ public static int calcLeftClip(String cig, String id){
+ 	if(cig==null){return 0;}
+ 	final int limit=cig.length();
+
+ 	//Parse the length of the first operation
+ 	int idx=0;
+ 	int len=0;
+ 	while(idx<limit && Character.isDigit(cig.charAt(idx))){
+ 		len=len*10+(cig.charAt(idx)-'0');
+ 		idx++;
+ 	}
+ 	if(idx>=limit){return 0;} //No operator found
+
+ 	final char op=cig.charAt(idx);
+ 	assert(op!='S' || idx<limit-1);//ban entirely soft-clipped reads
+ 	return (op=='S') ? len : 0;
+ }
+
+ /**
+  * Length of the soft clip at the end of a cigar string.
+  * @param cig Cigar string; null or empty yields 0.
+  * @param id Read identifier, used only in the assertion message.
+  * @return Length of the last operation if the cigar ends with 'S', otherwise 0.
+  */
+ public static int calcRightClip(String cig, String id){
+ 	if(cig==null || cig.length()<1){return 0;}
+ 	final int lastIdx=cig.length()-1;
+ 	if(cig.charAt(lastIdx)!='S'){return 0;}
+
+ 	//Walk left over the digits belonging to the final 'S'
+ 	int opIdx=lastIdx-1;
+ 	while(opIdx>=0 && Character.isDigit(cig.charAt(opIdx))){opIdx--;}
+
+ 	assert(opIdx>0) : cig+", id="+id+", pos="+opIdx;//ban entirely soft-clipped reads
+
+ 	int clip=0;
+ 	for(int i=opIdx+1; i<lastIdx; i++){
+ 		clip=clip*10+(cig.charAt(i)-'0');
+ 	}
+ 	return clip;
+ }
+
+ public ArrayList<String> makeOptionalTags(Read r, Read r2, boolean perfect, int scafloc, int scaflen, boolean inbounds, boolean inbounds2){
+ if(NO_TAGS){return null;}
+ final boolean mapped=r.mapped();
+ if(!mapped && READGROUP_ID==null && !MAKE_CUSTOM_TAGS && !MAKE_TIME_TAG){return null;}
+
+ ArrayList<String> optionalTags=new ArrayList<String>(8);
+
+ if(mapped){
+ if(!r.secondary() && r.ambiguous()){optionalTags.add("XT:A:R");} //Not sure what do do for secondary alignments
+
+// int nm=r.length();
+// int dels=0;
+
+ int nm=0;
+
+// //Only works for cigar strings in format 1.4+
+// if(perfect){nm=0;}else if(cigar!=null){
+// int len=0;
+// for(int i=0; i<cigar.length(); i++){
+// char c=cigar.charAt(i);
+// if(Character.isDigit(c)){
+// len=len*10+(c-'0');
+// }else{
+// if(c=='X' || c=='I' || c=='D' || c=='M'){
+// nm+=len;
+// }
+// len=0;
+// }
+// }
+//// System.err.println("\nRead "+r.id+": nm="+nm+"\n"+cigar+"\n"+new String(r.match));
+// System.err.println("\nRead "+r.id+": nm="+nm);
+// }
+
+ if(perfect){nm=0;}else if(r.match!=null){
+ nm=0;
+ int leftclip=calcLeftClip(cigar, r.id), rightclip=calcRightClip(cigar, r.id);
+ final int from=leftclip, to=r.length()-rightclip;
+ int delsCurrent=0;
+ for(int i=0, cpos=0; i<r.match.length; i++){
+ final byte b=r.match[i];
+
+// System.err.println("i="+i+", cpos="+cpos+", from="+from+", ")
+
+ if(cpos>=from && cpos<to){
+ if(b=='I' || b=='S' || b=='N' || b=='X' || b=='Y'){nm++;}
+
+ if(b=='D'){delsCurrent++;}
+ else{
+ if(delsCurrent<=INTRON_LIMIT){nm+=delsCurrent;}
+ delsCurrent=0;
+ }
+ }
+ if(b!='D'){cpos++;}
+ }
+ if(delsCurrent<=INTRON_LIMIT){nm+=delsCurrent;}
+ // assert(false) : nm+", "+dels+", "+delsCurrent+", "+r.length()+", "+r.match.length;
+
+// assert(false) : "rlen="+r.length()+", nm="+nm+", dels="+delsCurrent+", intron="+INTRON_LIMIT+", inbound1="+inbounds+", ib2="+inbounds2+"\n"+new String(r.match);
+
+// System.err.println("\nRead "+r.id+": left="+leftclip+", right="+rightclip+", nm="+nm+"\n"+cigar+"\n"+new String(r.match));
+
+ }
+
+ if(MAKE_NM_TAG){
+ if(perfect){optionalTags.add("NM:i:0");}
+ else if(r.match!=null){optionalTags.add("NM:i:"+(nm));}
+ }
+ if(MAKE_SM_TAG){optionalTags.add("SM:i:"+mapq);}
+ if(MAKE_AM_TAG){optionalTags.add("AM:i:"+Data.min(mapq, r2==null ? mapq : (r2.mapped() ? Data.max(1, r2.mapScore/r2.length()) : 0)));}
+
+ if(MAKE_TOPHAT_TAGS){
+ optionalTags.add("AS:i:0");
+ if(cigar==null || cigar.indexOf('N')<0){
+ optionalTags.add("XN:i:0");
+ }else{
+ }
+ optionalTags.add("XM:i:0");
+ optionalTags.add("XO:i:0");
+ optionalTags.add("XG:i:0");
+ if(cigar==null || cigar.indexOf('N')<0){
+ optionalTags.add("YT:Z:UU");
+ }else{
+ }
+ optionalTags.add("NH:i:1");
+ }else if(MAKE_XM_TAG){//XM tag. For bowtie compatibility; unfortunately it is poorly defined.
+ int x=0;
+ if(r.discarded() || (!r.ambiguous() && !mapped)){
+ x=0;//TODO: See if the flag needs to be present in this case.
+ }else if(mapped){
+ x=1;
+ if(r.numSites()>0 && r.numSites()>0){
+ int z=r.topSite().score;
+ for(int i=1; i<r.sites.size(); i++){
+ SiteScore ss=r.sites.get(i);
+ if(ss!=null && ss.score==z){x++;}
+ }
+ }
+ if(r.ambiguous()){x=Tools.max(x, 2);}
+ }
+ if(x>=0){optionalTags.add("XM:i:"+x);}
+ }
+
+ //XS tag
+ if(MAKE_XS_TAG){
+ String xs=makeXSTag(r);
+ if(xs!=null){
+ optionalTags.add(xs);
+ assert(r2==null || r.pairnum()!=r2.pairnum());
+ // assert(r2==null || !r2.mapped() || r.strand()==r2.strand() || makeXSTag(r2)==xs) :
+ // "XS problem:\n"+r+"\n"+r2+"\n"+xs+"\n"+makeXSTag(r2)+"\n";
+ }
+ }
+
+ if(MAKE_MD_TAG){
+ String md=makeMdTag(r.chrom, r.start, r.match, r.bases, scafloc, scaflen);
+ if(md!=null){optionalTags.add(md);}
+ }
+
+ if(r.mapped() && MAKE_NH_TAG){
+ if(ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS && r.numSites()>1){
+ optionalTags.add("NH:i:"+r.sites.size());
+ }else{
+ optionalTags.add("NH:i:1");
+ }
+ }
+
+ if(MAKE_STOP_TAG && (perfect || (r.match!=null && r.bases!=null))){optionalTags.add(makeStopTag(pos, r.length(), cigar, perfect));}
+
+ if(MAKE_LENGTH_TAG && (perfect || (r.match!=null && r.bases!=null))){optionalTags.add(makeLengthTag(pos, r.length(), cigar, perfect));}
+
+ if(MAKE_IDENTITY_TAG && (perfect || r.match!=null)){optionalTags.add(makeIdentityTag(r.match, perfect));}
+
+ if(MAKE_SCORE_TAG && r.mapped()){optionalTags.add(makeScoreTag(r.mapScore));}
+
+ if(MAKE_INSERT_TAG && r2!=null){
+ if(r.mapped() ||r.originalSite!=null){
+ optionalTags.add("X8:Z:"+r.insertSizeMapped(false)+(r.originalSite==null ? "" : ","+r.insertSizeOriginalSite()));
+ }
+ }
+ if(MAKE_CORRECTNESS_TAG){
+ final SiteScore ss0=r.originalSite;
+ if(ss0!=null){
+ optionalTags.add("X9:Z:"+(ss0.isCorrect(r.chrom, r.strand(), r.start, r.stop, 0) ? "T" : "F"));
+ }
+ }
+ }
+
+ if(READGROUP_ID!=null){
+ assert(READGROUP_TAG!=null);
+ optionalTags.add(READGROUP_TAG);
+ }
+
+ if(MAKE_CUSTOM_TAGS){
+ int sites=r.numSites() + (r.originalSite==null ? 0 : 1);
+ if(sites>0){
+ StringBuilder sb=new StringBuilder();
+ sb.append("X1:Z:");
+ if(r.sites!=null){
+ for(SiteScore ss : r.sites){
+ sb.append('$');
+ sb.append(ss.toText());
+ }
+ }
+ if(r.originalSite!=null){
+ sb.append('$');
+ sb.append('*');
+ sb.append(r.originalSite.toText());
+ }
+ optionalTags.add(sb.toString());
+ }
+
+ if(mapped){
+ if(r.match!=null){
+ byte[] match=r.match;
+ if(!r.shortmatch()){
+ match=Read.toShortMatchString(match);
+ }
+ optionalTags.add("X2:Z:"+new String(match));
+ }
+
+ optionalTags.add("X3:i:"+r.mapScore);
+ }
+ optionalTags.add("X5:Z:"+r.numericID);
+ optionalTags.add("X6:i:"+(r.flags|(r.match==null ? 0 : Read.SHORTMATCHMASK)));
+ if(r.copies>1){optionalTags.add("X7:i:"+r.copies);}
+ }
+
+ if(MAKE_TIME_TAG){
+ assert(r.obj!=null && r.obj.getClass()==Long.class) : r.obj;
+ optionalTags.add("X0:i:"+(r.obj==null ? 0 : r.obj));
+ }
+
+ if(MAKE_BOUNDS_TAG){
+ String a=(r.mapped() ? inbounds ? "I" : "O" : "U");
+ if(r2==null){
+ optionalTags.add("XB:Z:"+a);
+ }else{
+ String b=(r2.mapped() ? inbounds2 ? "I" : "O" : "U");
+ optionalTags.add("XB:Z:"+a+b);
+ }
+ }
+
+ return optionalTags;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ? ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Number of bases in this record: the length of seq when seq is present,
+  * otherwise the value of calcCigarBases(cigar, true, false).
+  */
+ public int length(){
+     assert((seq!=null && (seq.length!=1 || seq[0]!='*')) || cigar!=null) :
+         "This program requires bases or a cigar string for every sam line. Problem line:\n"+this+"\n";
+     if(seq!=null){return seq.length;}
+     return calcCigarBases(cigar, true, false);
+ }
+
+// public int length(boolean includeSoftClip){
+// assert((seq!=null && (seq.length!=1 || seq[0]!='*')) || cigar!=null) :
+// "This program requires bases or a cigar string for every sam line. Problem line:\n"+this+"\n";
+// return seq==null ? calcCigarBases(cigar, includeSoftClip, false) : seq.length;
+// }
+
+ /**
+  * Mapping quality for a read, scored from the given site when one is supplied
+  * (its slowScore) and from the read's own mapScore otherwise.
+  */
+ public static int toMapq(Read r, SiteScore ss){
+     assert(r!=null);
+     final int score;
+     if(ss==null){
+         score=r.mapScore;
+     }else{
+         score=ss.slowScore;
+     }
+     return toMapq(score, r.length(), r.mapped(), r.ambiguous());
+ }
+
+ /**
+  * Converts an alignment score into a mapping quality.
+  * Unmapped reads and reads with length below 1 yield 0.
+  * Ambiguous reads (when PENALIZE_AMBIG is set) are scaled by 3/(100*length) and floored at 1;
+  * all other reads use a length-dependent scale and are floored at 4.
+  * @param score Alignment score
+  * @param length Read length
+  * @param mapped Whether the read is mapped
+  * @param ambig Whether the read is ambiguously mapped
+  * @return The mapping quality
+  */
+ public static int toMapq(int score, int length, boolean mapped, boolean ambig){
+     if(length<1 || !mapped){return 0;}
+
+     if(!(ambig && PENALIZE_AMBIG)){
+         final float rescaled=(score-length*40)*1.6f;
+         final float ceiling=1.5f*((float)Tools.log2(length))+36;
+         final float quality=(rescaled*ceiling)/(100f*length);
+         return Tools.max(4, (int)Math.round(quality));
+     }
+
+     final float ambigCeiling=3;
+     final float ambigQuality=(score*ambigCeiling)/(100f*length);
+     return Tools.max(1, (int)Math.round(ambigQuality));
+ }
+
+
+ /**
+  * Builds a Read from this line's bases and qualities, taking the true origin from a
+  * custom-formatted name whose first five underscore-delimited fields are
+  * id, chromosome, strand, start and stop.
+  * @return The new Read, or null if the name is missing or does not follow that format
+  */
+ public Read parseName(){
+     if(qname==null){return null;}
+     final String[] answer=qname.split("_");
+     if(answer.length<5){
+         //Too few fields used to escape as an ArrayIndexOutOfBoundsException,
+         //unlike malformed numbers, which already returned null.
+         System.err.println("Failed to parse "+qname);
+         return null;
+     }
+     try {
+         long id=Long.parseLong(answer[0]);
+         int trueChrom=Gene.toChromosome(answer[1]);
+         byte trueStrand=Byte.parseByte(answer[2]);
+         int trueLoc=Integer.parseInt(answer[3]);
+         int trueStop=Integer.parseInt(answer[4]);
+         return new Read(seq, trueChrom, trueStrand, trueLoc, trueStop, qname, qual, id);
+     } catch (NumberFormatException e) {
+         //A field was present but not numeric; report it and signal failure with null.
+         e.printStackTrace();
+         return null;
+     }
+ }
+
+ /** Parses the portion of qname before the first underscore as a long. */
+ public long parseNumericId(){
+     final int underscore=qname.indexOf('_');
+     final String prefix=qname.substring(0, underscore);
+     return Long.parseLong(prefix);
+ }
+
+ /** Converts this line to a Read without counting hard-clipped bases toward its coordinates. */
+ public Read toRead(boolean parseCustom){
+     final boolean includeHardClip=false;
+     return toRead(parseCustom, includeHardClip);
+ }
+
+ public Read toRead(boolean parseCustom, boolean includeHardClip){
+
+ SiteScore originalSite=null;
+ long numericId_=0;
+ boolean synthetic=false;
+
+ if(parseCustom){
+ try {
+ String[] answer=qname.split("_");
+ numericId_=Long.parseLong(answer[0]);
+ int trueChrom=Gene.toChromosome(answer[1]);
+ byte trueStrand=Byte.parseByte(answer[2]);
+ int trueLoc=Integer.parseInt(answer[3]);
+ int trueStop=Integer.parseInt(answer[4]);
+
+ originalSite=new SiteScore(trueChrom, trueStrand, trueLoc, trueStop, 0, 0);
+ synthetic=true;
+
+ } catch (NumberFormatException e) {
+ System.err.println("Failed to parse "+qname);
+ } catch (NullPointerException e) {
+ System.err.println("Bad read with no name.");
+ return null;
+ }
+ }
+// assert(false) : originalSite;
+
+
+ if(Data.GENOME_BUILD>=0){
+
+ }
+
+ int chrom_=-1;
+ byte strand_=strand();
+ int start_=start(true, includeHardClip);
+ int stop_=stop(start_, true, includeHardClip);
+ assert(start_<=stop_) : start_+", "+stop_+"\n"+this+"\n";
+
+ if(Data.GENOME_BUILD>=0 && rname!=null && (rname.length!=1 || rname[0]!='*')){
+ ScafLoc sc=Data.getScafLoc(rname);
+ assert(sc!=null) : "Can't find scaffold in reference with name "+new String(rname)+"\n"+this;
+ if(sc!=null){
+ chrom_=sc.chrom;
+ start_+=sc.loc;
+ stop_+=sc.loc;
+ }
+ }
+
+//// byte[] quals=(qual==null || (qual.length()==1 && qual.charAt(0)=='*')) ? null : qual.getBytes();
+//// byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual.clone();
+// byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual;
+// byte[] bases=seq==null ? null : seq.clone();
+// if(strand_==Gene.MINUS){//Minus-mapped SAM lines have bases and quals reversed
+// AminoAcid.reverseComplementBasesInPlace(bases);
+// Tools.reverseInPlace(quals);
+// }
+// Read r=new Read(bases, chrom_, strand_, start_, stop_, qname, quals, cs_, numericId_);
+
+ final Read r;
+ {
+ byte[] seqX=(seq==null || (seq.length==1 && seq[0]=='*')) ? null : seq;
+ byte[] qualX=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual;
+ String qnameX=(qname==null || qname.equals(stringstar)) ? null : qname;
+ r=new Read(seqX, chrom_, strand_, start_, stop_, qnameX, qualX, numericId_);
+ }
+
+ r.setMapped(mapped());
+ r.setSynthetic(synthetic);
+// r.setPairnum(pairnum()); //TODO: Enable after fixing assertions that this will break in read input streams.
+ if(originalSite!=null){
+ r.originalSite=originalSite;
+ }
+
+ r.mapScore=mapq;
+ r.setSecondary(!primary());
+
+// if(mapped()){
+// r.list=new ArrayList<SiteScore>(1);
+// r.list.add(new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0));
+// }
+
+// System.out.println(optional);
+ if(optional!=null){
+ for(String s : optional){
+ if(s.equals("XT:A:R")){
+ r.setAmbiguous(true);
+ }else if(s.startsWith("X1:Z:")){
+// System.err.println("Found X1 tag!\t"+s);
+ String[] split=s.split("\\$");
+// assert(false) : Arrays.toString(split);
+ ArrayList<SiteScore> list=new ArrayList<SiteScore>(3);
+
+ for(int i=1; i<split.length; i++){
+// System.err.println("Processing ss\t"+split[i]);
+ String s2=split[i];
+ SiteScore ss=SiteScore.fromText(s2);
+ if(s2.charAt(0)=='*'){
+ r.originalSite=ss;
+ }else{
+ list.add(ss);
+ }
+ }
+// System.err.println("List size = "+list.size());
+ if(list.size()>0){r.sites=list;}
+ }else if(s.startsWith("X2:Z:")){
+ String s2=s.substring(5);
+ r.match=s2.getBytes();
+ }else if(s.startsWith("X3:i:")){
+ String s2=s.substring(5);
+// r.mapScore=Integer.parseInt(s2); //Messes up generation of ROC curve
+ }else if(s.startsWith("X5:Z:")){
+ String s2=s.substring(5);
+ r.numericID=Long.parseLong(s2);
+ }else if(s.startsWith("X6:i:")){
+ String s2=s.substring(5);
+ r.flags=Integer.parseInt(s2);
+ }else if(s.startsWith("X7:i:")){
+ String s2=s.substring(5);
+ r.copies=Integer.parseInt(s2);
+ }else{
+// System.err.println("Unknown SAM field:"+s);
+ }
+ }
+ }
+
+ if(r.match==null && cigar!=null && (CONVERT_CIGAR_TO_MATCH || cigar.indexOf('=')>=0)){
+ r.match=cigarToShortMatch(cigar, true);
+
+ if(r.match!=null){
+ r.setShortMatch(true);
+ if(Tools.indexOf(r.match, (byte)'B')>=0){
+ boolean success=r.fixMatchB();
+// if(!success){r.match=null;}
+// assert(false) : new String(r.match);
+ }
+// assert(false) : new String(r.match);
+ }
+// assert(false) : new String(r.match);
+// System.err.println(">\n"+cigar+"\n"+(r.match==null ? "null" : new String(r.match)));
+ }
+// assert(false) : new String(r.match);
+
+// System.err.println("Resulting read: "+r.toText());
+
+ return r;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- toString ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Approximate length of result of SamLine.toText() */
+ public int textLength(){
+     //11 tabs, plus the fixed allowances 3+9+3+9 (presumably flag, pos, mapq and pnext) and one for tlen
+     int total=11+(3+9+3+9)+(tlen>999 ? 9 : 3);
+
+     //Variable-width columns; a missing column is written as a single '*'
+     total+=(qname==null ? 1 : qname.length());
+     total+=(rname==null ? 1 : rname.length);
+     total+=(rnext==null ? 1 : rnext.length);
+     total+=(cigar==null ? 1 : cigar.length());
+     total+=(seq==null ? 1 : seq.length);
+     total+=(qual==null ? 1 : qual.length);
+
+     if(optional==null){return total;}
+
+     //One tab per optional field, plus the text of each field
+     total+=optional.size();
+     for(String tag : optional){
+         total+=tag.length();
+     }
+     return total;
+ }
+
+ /**
+  * Appends this line in tab-delimited sam text form: qname, flag, rname, pos, mapq, cigar,
+  * rnext, pnext, tlen, seq, qual, then any optional fields.
+  * Bases are reverse-complemented and qualities reversed when the line is mapped to the minus strand.
+  * @param bb Destination; a new ByteBuilder is allocated when null
+  * @return The ByteBuilder that was written to
+  */
+ public ByteBuilder toBytes(ByteBuilder bb){
+
+     //The original also computed an unused 'buflen' here; the ByteBuilder path needs no scratch buffer.
+     if(bb==null){bb=new ByteBuilder(textLength()+4);}
+     if(qname==null){bb.append('*').append('\t');}else{bb.append(qname).append('\t');}
+     bb.append(flag).append('\t');
+     append(bb, rname).append('\t');
+     bb.append(pos).append('\t');
+     bb.append(mapq).append('\t');
+     if(cigar==null){bb.append('*').append('\t');}else{bb.append(cigar).append('\t');}
+     append(bb, rnext).append('\t');
+     bb.append(pnext).append('\t');
+     bb.append(tlen).append('\t');
+
+     if(mapped() && strand()==Gene.MINUS){
+         appendReverseComplimented(bb, seq).append('\t');
+         appendQualReversed(bb, qual);
+     }else{
+         append(bb, seq).append('\t');
+         appendQual(bb, qual);
+     }
+
+     if(optional!=null){
+         for(String s : optional){
+             bb.append('\t').append(s);
+         }
+     }
+     return bb;
+ }
+
+ /**
+  * Renders this line as tab-delimited sam text: qname, flag, rname, pos, mapq, cigar,
+  * rnext, pnext, tlen, seq, qual, then any optional fields.
+  * Bases are reverse-complemented and qualities reversed when the line is mapped to the minus strand.
+  */
+ public StringBuilder toText(){
+
+     //Scratch char buffer sized for the longest byte[] column
+     final int longest=Tools.max((rname==null ? 1 : rname.length), (rnext==null ? 1 : rnext.length), (seq==null ? 1 : seq.length), (qual==null ? 1 : qual.length));
+     final char[] scratch=Shared.getTLCB(longest);
+
+     final StringBuilder text=new StringBuilder(textLength()+4);
+     if(qname!=null){
+         text.append(qname);
+     }else{
+         text.append('*');
+     }
+     text.append('\t');
+     text.append(flag).append('\t');
+     append(text, rname, scratch).append('\t');
+     text.append(pos).append('\t');
+     text.append(mapq).append('\t');
+     if(cigar!=null){
+         text.append(cigar);
+     }else{
+         text.append('*');
+     }
+     text.append('\t');
+     append(text, rnext, scratch).append('\t');
+     text.append(pnext).append('\t');
+     text.append(tlen).append('\t');
+
+     final boolean minus=(mapped() && strand()==Gene.MINUS);
+     if(minus){
+         appendReverseComplimented(text, seq, scratch).append('\t');
+         appendQualReversed(text, qual, scratch);
+     }else{
+         append(text, seq, scratch).append('\t');
+         appendQual(text, qual, scratch);
+     }
+
+     if(optional!=null){
+         for(String tag : optional){
+             text.append('\t').append(tag);
+         }
+     }
+     return text;
+ }
+
+ public String toString(){return toText().toString();}
+
+
+
+ private static StringBuilder append(StringBuilder sb, byte[] a, char[] buffer){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+ {//This is actually faster
+ assert(buffer.length>=a.length);
+ for(int i=0; i<a.length; i++){
+ buffer[i]=(char)a[i];
+ }
+ sb.append(buffer, 0, a.length);
+ }
+// for(byte b : a){
+// sb.append((char)b);
+// }
+ return sb;
+ }
+
+ private static StringBuilder appendReverseComplimented(StringBuilder sb, byte[] a, char[] buffer){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+ {//This is actually faster
+ assert(buffer.length>=a.length);
+ for(int i=0, j=a.length-1; j>=0; i++, j--){buffer[i]=(char)AminoAcid.baseToComplementExtended[a[j]];}
+ sb.append(buffer, 0, a.length);
+ }
+// for(int i=a.length-1; i>=0; i--){
+// sb.append((char)AminoAcid.baseToComplementEbuffertended[a[i]]);
+// }
+ return sb;
+ }
+
+ private static StringBuilder appendQual(StringBuilder sb, byte[] a, char[] buffer){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+ {//This is actually faster
+ assert(buffer.length>=a.length);
+ for(int i=0; i<a.length; i++){buffer[i]=(char)(a[i]+33);}
+ sb.append(buffer, 0, a.length);
+ }
+// for(byte b : a){
+// sb.append((char)(b+33));
+// }
+ return sb;
+ }
+
+ private static StringBuilder appendQualReversed(StringBuilder sb, byte[] a, char[] buffer){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+ {//This is actually faster
+ assert(buffer.length>=a.length);
+ for(int i=0, j=a.length-1; j>=0; i++, j--){buffer[i]=(char)(a[j]+33);}
+ sb.append(buffer, 0, a.length);
+ }
+// for(int i=a.length-1; i>=0; i--){
+// sb.append((char)(a[i]+33));
+// }
+ return sb;
+ }
+
+ private static ByteBuilder append(ByteBuilder sb, byte[] a){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+ return sb.append(a);
+ }
+
+ private static ByteBuilder appendReverseComplimented(ByteBuilder sb, byte[] a){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+
+ sb.ensureExtra(a.length);
+ byte[] buffer=sb.array;
+ int i=sb.length;
+ for(int j=a.length-1; j>=0; i++, j--){buffer[i]=AminoAcid.baseToComplementExtended[a[j]];}
+ sb.length+=a.length;
+
+ return sb;
+ }
+
+ private static ByteBuilder appendQual(ByteBuilder sb, byte[] a){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+
+ sb.ensureExtra(a.length);
+ byte[] buffer=sb.array;
+ int i=sb.length;
+ for(int j=0; j<a.length; i++, j++){buffer[i]=(byte)(a[j]+33);}
+ sb.length+=a.length;
+
+ return sb;
+ }
+
+ private static ByteBuilder appendQualReversed(ByteBuilder sb, byte[] a){
+ if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
+
+ sb.ensureExtra(a.length);
+ byte[] buffer=sb.array;
+ int i=sb.length;
+ for(int j=a.length-1; j>=0; i++, j--){buffer[i]=(byte)(a[j]+33);}
+ sb.length+=a.length;
+
+ return sb;
+ }
+
+ /**
+  * Assumes a custom name including original location.
+  * @return The bytes of qname following its sixth underscore, or null if it has fewer than six
+  */
+ public byte[] originalContig(){
+     int underscores=0;
+     for(int i=0; i<qname.length(); i++){
+         if(qname.charAt(i)=='_' && ++underscores==6){
+             return qname.substring(i+1).getBytes();
+         }
+     }
+     return null;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Flag ----------------*/
+ /*--------------------------------------------------------------*/
+
+// Bit Description
+// 0x1 template having multiple fragments in sequencing
+// 0x2 each fragment properly aligned according to the aligner
+// 0x4 fragment unmapped
+// 0x8 next fragment in the template unmapped
+// 0x10 SEQ being reverse complemented
+// 0x20 SEQ of the next fragment in the template being reversed
+// 0x40 the first fragment in the template
+// 0x80 the last fragment in the template
+// 0x100 secondary alignment
+// 0x200 not passing quality controls
+// 0x400 PCR or optical duplicate
+// 0x800 supplementary alignment
+
+
+ /**
+  * Assembles the sam flag bits for read r.
+  * @param r The read being written
+  * @param r2 Its mate, or null for an unpaired read
+  * @param fragNum 0 marks r as the first fragment (0x40); a positive value marks it as the last (0x80)
+  * @param sameScaf Required, together with both reads being mapped, valid and having match strings, for the proper-pair bit (0x2)
+  * @return The flag value
+  */
+ public static int makeFlag(Read r, Read r2, int fragNum, boolean sameScaf){
+     final boolean hasMate=(r2!=null);
+     int bits=0;
+     if(hasMate){
+         bits|=0x1;
+
+         //r2 is non-null in this branch, so the pair test reduces to both reads' own state
+         final boolean proper=r.mapped() && r.valid() && r.match!=null
+             && sameScaf && r.paired() && r2.mapped() && r2.valid() && r2.match!=null;
+         if(proper){bits|=0x2;}
+         if(fragNum==0){
+             bits|=0x40;
+         }else if(fragNum>0){
+             bits|=0x80;
+         }
+     }
+     if(!r.mapped()){bits|=0x4;}
+     if(hasMate && !r2.mapped()){bits|=0x8;}
+     if(r.strand()==Gene.MINUS){bits|=0x10;}
+     if(hasMate && r2.strand()==Gene.MINUS){bits|=0x20;}
+     if(r.secondary()){bits|=0x100;}
+     if(r.discarded()){bits|=0x200;}
+     return bits;
+ }
+
+
+ /** True when flag bit 0x1 (template has multiple fragments) is set. */
+ public boolean hasMate(){
+     return (flag&0x1)!=0;
+ }
+
+ /** True when flag bit 0x2 (properly aligned) is set. */
+ public boolean properPair(){
+     return (flag&0x2)!=0;
+ }
+
+ /** True when the unmapped bit (0x4) of the given flag is clear. */
+ public static boolean mapped(int flag){
+     final int unmappedBit=(flag&0x4);
+     return unmappedBit==0;
+ }
+
+ /** Strand encoded in the given flag: 1 when bit 0x10 (reverse complemented) is set, otherwise 0. */
+ public static byte strand(int flag){
+     if((flag&0x10)!=0){
+         return (byte)1;
+     }
+     return (byte)0;
+ }
+
+ /** True when this line's unmapped bit (0x4) is clear. */
+ public boolean mapped(){
+     final int unmappedBit=(flag&0x4);
+     return unmappedBit==0;
+ }
+
+ /** True when the "next fragment unmapped" bit (0x8) is clear. */
+ public boolean nextMapped(){
+     final int mateUnmappedBit=(flag&0x8);
+     return mateUnmappedBit==0;
+ }
+
+ /** Strand of this line: 1 when flag bit 0x10 (reverse complemented) is set, otherwise 0. */
+ public byte strand(){
+     if((flag&0x10)!=0){
+         return (byte)1;
+     }
+     return (byte)0;
+ }
+
+ /** Strand of the next fragment: 1 when flag bit 0x20 is set, otherwise 0. */
+ public byte nextStrand(){
+     if((flag&0x20)!=0){
+         return (byte)1;
+     }
+     return (byte)0;
+ }
+
+ /** True when flag bit 0x40 (first fragment in the template) is set. */
+ public boolean firstFragment(){
+     return (flag&0x40)!=0;
+ }
+
+ /** True when flag bit 0x80 (last fragment in the template) is set. */
+ public boolean lastFragment(){
+     return (flag&0x80)!=0;
+ }
+
+ /** Returns 1 only when this line is flagged as the last fragment and not as the first; otherwise 0. */
+ public int pairnum(){
+     if(!firstFragment() && lastFragment()){
+         return 1;
+     }
+     return 0;
+ }
+
+ public boolean primary(){return (flag&0x100)==0;}
+ /**
+  * Marks this line as a primary or secondary alignment.
+  * Bit 0x100 means "secondary alignment", and primary() reports true only when it is clear,
+  * so making a line primary must clear the bit. The original set it instead, which
+  * inverted the meaning of the argument.
+  * @param b true for a primary alignment, false for a secondary one
+  */
+ public void setPrimary(boolean b){
+     if(b){
+         flag=flag&~0x100;
+     }else{
+         flag=flag|0x100;
+     }
+ }
+
+ /** True when flag bit 0x200 (not passing quality controls) is set. */
+ public boolean discarded(){
+     return (flag&0x200)!=0;
+ }
+
+ /** True when flag bit 0x400 (PCR or optical duplicate) is set. */
+ public boolean duplicate(){
+     return (flag&0x400)!=0;
+ }
+
+ /** True when flag bit 0x800 (supplementary alignment) is set. */
+ public boolean supplementary(){
+     return (flag&0x800)!=0;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- ? ----------------*/
+ /*--------------------------------------------------------------*/
+
+// /** Assumes rname is an integer. */
+// public int chrom(){
+// if(Data.GENOME_BUILD<0){return -1;}
+// HashMap sc
+// }
+
+ /**
+  * Assumes rname is an integer.
+  * Obsolete: the leading assert(false) makes this fail whenever assertions are enabled.
+  * @return rname parsed as a decimal number, or -1 if rname is missing, empty, or does not
+  * both start and end with a digit
+  */
+ public int chrom_old(){
+     assert(false);
+     //Reject a missing or empty name before indexing into it; the original tested for null
+     //only after rname[0] had already been dereferenced.
+     if(rname==null || rname.length==0){return -1;}
+     if(!Character.isDigit(rname[0]) && !Character.isDigit(rname[rname.length-1])){
+         if(warning){
+             //Warn only once
+             warning=false;
+             System.err.println("Warning - sam lines need a chrom field.");
+         }
+         return -1;
+     }
+     assert(Shared.anomaly || '*'==rname[0] || (Character.isDigit(rname[0]) && Character.isDigit(rname[rname.length-1]))) :
+         "This is no longer correct, considering that sam lines are named by scaffold. They need a chrom field.\n"+new String(rname);
+     if(Arrays.equals(rname, bytestar) || !(Character.isDigit(rname[0]) && Character.isDigit(rname[rname.length-1]))){return -1;}
+     //Manual decimal parse; avoids allocating a String
+     final byte z='0';
+     int x=rname[0]-z;
+     for(int i=1; i<rname.length; i++){
+         x=(x*10)+(rname[i]-z);
+     }
+     return x;
+ }
+
+ /**
+  * Returns the zero-based starting location of this read on the sequence:
+  * pos-1, moved left by the leading clip counted by countLeadingClip.
+  */
+ public int start(boolean includeSoftClip, boolean includeHardClip){
+     final int leadingClip=countLeadingClip(cigar, includeSoftClip, includeHardClip);
+     final int zeroBasedPos=pos-1;
+     return zeroBasedPos-leadingClip;
+ }
+
+ /**
+  * Returns the zero-based stop location of this read on the sequence.
+  * With a usable cigar this is start plus the cigar length minus 1; for unmapped lines or a
+  * missing/'*' cigar it is start plus seq.length-1 (or start itself when seq is null or empty).
+  */
+ public int stop(int start, boolean includeSoftClip, boolean includeHardClip){
+     final boolean usableCigar=(mapped() && cigar!=null && cigar.charAt(0)!='*');
+     if(usableCigar){
+         final int span=calcCigarLength(cigar, includeSoftClip, includeHardClip);
+         return start+span-1;
+     }
+     final int extra=(seq==null ? 0 : Tools.max(0, seq.length-1));
+     return start+extra;
+ }
+
+ /**
+  * Variant of stop(): defers to stop() when the line is mapped with a usable cigar;
+  * otherwise returns start+seq.length, or -1 when seq is null.
+  */
+ public int stop2(final int start, final boolean includeSoftClip, final boolean includeHardClip){
+     final boolean usableCigar=(mapped() && cigar!=null && cigar.charAt(0)!='*');
+     if(usableCigar){
+         return stop(start, includeSoftClip, includeHardClip);
+     }
+     if(seq==null){
+         return -1;
+     }
+     return start+seq.length;
+ }
+
+ /** Always returns 0; this class stores no numeric id of its own (see parseNumericId() for the name-derived id). */
+ public long numericId(){
+ return 0;
+ }
+
+ /** True when rnext is "=" or when rnext has the same contents as rname. */
+ public boolean pairedOnSameChrom(){
+     if(Tools.equals(rnext, byteequals)){
+         return true;
+     }
+     return Arrays.equals(rname, rnext);
+ }
+
+ /** Assumes a custom name including original location */
+ public int originalContigStart(){
+// assert(PARSE_CUSTOM);
+ int loc=-1;
+ int count=0;
+ for(int i=0; i<qname.length() && loc==-1; i++){
+ if(qname.charAt(i)=='_'){
+ count++;
+ if(count==5){loc=i;}
+ }
+ }
+ if(loc==-1){
+ return -1;
+ }
+
+ int sum=0;
+ int mult=1;
+ for(int i=loc+1; i<qname.length(); i++){
+ char c=qname.charAt(i);
+ if(!Character.isDigit(c)){
+ if(i==loc+1 && c=='-'){mult=-1;}
+ else{break;}
+ }else{
+ sum=(sum*10)+(c-'0');
+ }
+ }
+ return sum*mult;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Getters ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns the reference name as raw bytes; asserts that RNAME_AS_BYTES is enabled. */
+ public byte[] rname(){
+ assert(RNAME_AS_BYTES);
+ return rname;
+ }
+ /** Returns the next-fragment reference name as raw bytes (may be null). */
+ public byte[] rnext(){return rnext;}
+
+ public String rnameS(){return rnameS!=null ? rnameS : rname==null ? null : new String(rname);}
+ public String rnextS(){return rnext==null ? null : new String(rnext);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Query name; written as '*' when null */
+ public String qname;
+ /** Bitwise flag; decoded by the accessors in the Flag section */
+ public int flag;
+ /** Position as written in the line; start() converts it to zero-based by subtracting 1 and any leading clip */
+ public int pos;
+ /** Mapping quality column; toRead() copies it into Read.mapScore */
+ public int mapq;
+ /** Cigar string; written as '*' when null */
+ public String cigar;
+ /** Written after rnext (column 8 of the output) */
+ public int pnext;
+ /** Written after pnext (column 9 of the output) */
+ public int tlen;
+ /** Bases; toBytes()/toText() write them reverse-complemented when the line is mapped to the minus strand */
+ public byte[] seq;
+ /** Qualities as stored values; 33 is added to each on output, and the order is reversed for minus-strand lines */
+ public byte[] qual;
+ /** Optional fields, each written after a tab following qual; may be null */
+ public ArrayList<String> optional;
+
+ /** Arbitrary attachment; not read or written by the code visible in this section */
+ public Object obj;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Reference name as bytes; a lone '*' is treated as absent */
+ private byte[] rname;
+ /** Reference name of the next fragment as bytes; "=" is treated by pairedOnSameChrom() as same as rname */
+ private byte[] rnext;
+
+ /** String form of the reference name; rnameS() returns it in preference to converting rname */
+ private String rnameS;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private static final String stringstar="*";
+ private static final byte[] bytestar=new byte[] {(byte)'*'};
+ private static final byte[] byteequals=new byte[] {(byte)'='};
+ private static final String XSPLUS="XS:A:+", XSMINUS="XS:A:-";
+// private static final double inv100=0.01d;
+// private static float minratio=0.4f;
+
+ private static boolean warning=System.getProperty("user.dir").contains("/bushnell/");
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static String READGROUP_ID=null;
+ public static String READGROUP_CN=null;
+ public static String READGROUP_DS=null;
+ public static String READGROUP_DT=null;
+ public static String READGROUP_FO=null;
+ public static String READGROUP_KS=null;
+ public static String READGROUP_LB=null;
+ public static String READGROUP_PG=null;
+ public static String READGROUP_PI=null;
+ public static String READGROUP_PL=null;
+ public static String READGROUP_PU=null;
+ public static String READGROUP_SM=null;
+
+ public static String READGROUP_TAG=null;
+
+ /** Turn this off for RNAseq or long indels */
+ public static boolean MAKE_MD_TAG=false;
+
+ public static boolean NO_TAGS=false;
+
+// public static boolean MAKE_RG_TAG=false;
+ //The flags below select which optional tags the tag-building method adds.
+ //NOTE(review): most are tested inside an enclosing block whose condition starts above this section - confirm.
+ /** Adds "AM:i:": the smaller of mapq and a value derived from the mate (mapq itself if there is no mate, 0 if the mate is unmapped) */
+ public static boolean MAKE_AM_TAG=true;
+ /** Adds "NM:i:": 0 for perfect reads, otherwise the computed nm when a match string exists */
+ public static boolean MAKE_NM_TAG=true;
+ /** Adds "SM:i:" followed by mapq */
+ public static boolean MAKE_SM_TAG=false;
+ /** Adds the bowtie-style "XM:i:" count; only consulted when MAKE_TOPHAT_TAGS is false */
+ public static boolean MAKE_XM_TAG=false;
+ /** Adds the tag returned by makeXSTag(r) when it is non-null */
+ public static boolean MAKE_XS_TAG=false;
+ public static boolean MAKE_AS_TAG=false; //TODO: Alignment score from aligner
+ /** Adds "NH:i:" for mapped reads: the site count when secondary alignments are output and there is more than one site, else 1 */
+ public static boolean MAKE_NH_TAG=false;
+ /** Adds the fixed group AS:i:0, XM:i:0, XO:i:0, XG:i:0, NH:i:1, plus XN:i:0 and YT:Z:UU when the cigar has no 'N' */
+ public static boolean MAKE_TOPHAT_TAGS=false;
+ /** Not referenced in the visible code; presumably consulted by makeXSTag - TODO confirm */
+ public static boolean XS_SECONDSTRAND=false;
+ /** Adds the tag from makeIdentityTag when the read is perfect or has a match string */
+ public static boolean MAKE_IDENTITY_TAG=false;
+ /** Adds the tag from makeScoreTag(r.mapScore) for mapped reads */
+ public static boolean MAKE_SCORE_TAG=false;
+ /** Adds the tag from makeStopTag when the read is perfect or has both match string and bases */
+ public static boolean MAKE_STOP_TAG=false;
+ /** Adds the tag from makeLengthTag when the read is perfect or has both match string and bases */
+ public static boolean MAKE_LENGTH_TAG=false;
+ /** Adds the custom tags X1 (site list), X2 (match string), X3 (mapScore), X5 (numericID), X6 (flags) and X7 (copies); toRead() parses them back */
+ public static boolean MAKE_CUSTOM_TAGS=false;
+ /** Adds "X8:Z:" with the mapped insert size (and the original-site insert size if known) for paired reads */
+ public static boolean MAKE_INSERT_TAG=false;
+ /** Adds "X9:Z:T" or "X9:Z:F" according to originalSite.isCorrect(...) when an original site is known */
+ public static boolean MAKE_CORRECTNESS_TAG=false;
+ /** Adds "X0:i:" followed by r.obj, which is asserted to be a Long */
+ public static boolean MAKE_TIME_TAG=false;
+ /** Adds "XB:Z:" with one letter per read: I (in bounds), O (out of bounds) or U (unmapped) */
+ public static boolean MAKE_BOUNDS_TAG=false;
+
+ public static boolean PENALIZE_AMBIG=true;
+ public static boolean CONVERT_CIGAR_TO_MATCH=true;
+ public static boolean SOFT_CLIP=true;
+ public static boolean SECONDARY_ALIGNMENT_ASTERISKS=true;
+ /** OK to use the "setFrom" function which uses the old SamLine instead of translating the read, if a genome is not loaded. Should be false when processing occurs. */
+ public static boolean SET_FROM_OK=false;
+ /** For paired reads, keep original names rather than changing read2's name to match read1 */
+ public static boolean KEEP_NAMES=false;
+ public static float VERSION=1.4f;
+ /** Tells program when to use 'N' rather than 'D' in cigar strings */
+ public static int INTRON_LIMIT=Integer.MAX_VALUE;
+ public static boolean RNAME_AS_BYTES=true;//Effect on speed is negligible for pileup...
+
+ public static boolean setxs=false;
+ public static boolean setintron=false;
+
+// /** SSAHA2 incorrectly calculates the start position of reads with soft-clipped starts, and needs this enabled. */
+// public static boolean SUBTRACT_LEADING_SOFT_CLIP=true;
+ /** Sort header scaffolds in alphabetical order to be more compatible with Tophat */
+ public static boolean SORT_SCAFFOLDS=false;
+
+ public static boolean PARSE_0=true;
+ public static boolean PARSE_6=true;
+ public static boolean PARSE_7=true;
+ public static boolean PARSE_8=true;
+ public static boolean PARSE_10=true;
+ public static boolean PARSE_OPTIONAL=true;
+
+ public static boolean verbose=false;
+
+}
diff --git a/current/stream/SamReadInputStream.java b/current/stream/SamReadInputStream.java
new file mode 100755
index 0000000..79cfe55
--- /dev/null
+++ b/current/stream/SamReadInputStream.java
@@ -0,0 +1,213 @@
+package stream;
+
+import java.util.ArrayList;
+
+import align2.Shared;
+import align2.Tools;
+
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+
+/**
+ * Reads a sam file through a ByteFile and converts each alignment line into a Read,
+ * buffering up to BUF_LEN reads at a time. Header lines (starting with '@') are collected
+ * when loadHeader is set and published through the static shared header.
+ */
+public class SamReadInputStream extends ReadInputStream {
+
+ /** Prints the first read of the sam file named by args[0], followed by the SamLine attached to it. */
+ public static void main(String[] args){
+
+     SamReadInputStream sris=new SamReadInputStream(args[0], false, false, true);
+
+     Read r=sris.next();
+     if(r==null){
+         //next() returns null when the file holds no reads; the original dereferenced it unconditionally.
+         System.err.println("No reads found in "+args[0]);
+         return;
+     }
+     System.out.println(r.toText(false));
+     System.out.println();
+     System.out.println(r.obj.toString());
+     System.out.println();
+ }
+
+ /** Opens fname as a sam input via FileFormat.testInput and delegates to the FileFormat constructor. */
+ public SamReadInputStream(String fname, boolean loadHeader_, boolean interleaved_, boolean allowSubprocess_){
+     this(FileFormat.testInput(fname, FileFormat.SAM, null, allowSubprocess_, false), loadHeader_, interleaved_);
+ }
+
+ /**
+  * @param ff Input file description; a warning is printed if it is not sam or bam
+  * @param loadHeader_ Collect header lines and publish them as the shared header
+  * @param interleaved_ Treat a line with the paired flag (0x1) and the line after it as mates
+  */
+ public SamReadInputStream(FileFormat ff, boolean loadHeader_, boolean interleaved_){
+     loadHeader=loadHeader_;
+//     assert(loadHeader);
+//     interleaved=((tf.is==System.in || stdin) ? FASTQ.FORCE_INTERLEAVED : true);
+     interleaved=interleaved_;
+
+     stdin=ff.stdio();
+     if(!ff.samOrBam()){
+         System.err.println("Warning: Did not find expected sam file extension for filename "+ff.name());
+     }
+
+     tf=ByteFile.makeByteFile(ff, false);
+     header=new ArrayList<byte[]>();
+
+ }
+
+ /** No-op; reading happens lazily in fillBuffer(). */
+ @Override
+ public void start() {
+//     if(cris!=null){cris.start();}
+ }
+
+
+ /** Refills the buffer from the file if it is exhausted and the file is still open, then reports whether a read is available. */
+ @Override
+ public boolean hasMore() {
+     if(buffer==null || next>=buffer.size()){
+         if(tf.isOpen()){
+             fillBuffer();
+         }else{
+             assert(generated>0) : "Was the file empty?";
+         }
+     }
+     return (buffer!=null && next<buffer.size());
+ }
+
+ /** Returns the next buffered read, or null when no more reads are available. */
+ @Override
+ public Read next() {
+     if(!hasMore()){return null;}
+     //Null out the slot so the buffer does not keep the read alive
+     Read r=buffer.set(next, null);
+     next++;
+     consumed++;
+     return r;
+ }
+
+ /**
+  * Returns the whole current buffer, refilling it first if needed, or null when it is empty.
+  * Must not be mixed with next(): throws if any read of the current buffer was taken individually.
+  */
+ @Override
+ public synchronized ArrayList<Read> nextList() {
+     if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+     if(buffer==null || next>=buffer.size()){fillBuffer();}
+     ArrayList<Read> list=buffer;
+     buffer=null;
+     if(list!=null && list.size()==0){list=null;}
+     consumed+=(list==null ? 0 : list.size());
+//     System.err.println(hashCode()+" produced "+r[0].numericID);
+     return list;
+ }
+
+ /** Replaces the buffer with up to BUF_LEN freshly parsed reads and closes the file once a short batch is returned. */
+ private synchronized void fillBuffer(){
+
+     assert(buffer==null || next>=buffer.size());
+
+     buffer=null;
+     next=0;
+
+     buffer=toReadList(tf, BUF_LEN, nextReadID, FASTQ.PARSE_CUSTOM);
+     nextReadID+=buffer.size();
+     generated+=buffer.size();
+
+     //Fewer reads than requested means the file ran out of lines
+     if(buffer.size()<BUF_LEN){tf.close();}
+ }
+
+ /**
+  * Parses sam lines into reads until buflen reads have been produced or the file ends.
+  * Header lines are skipped (and stored when loadHeader is set). In interleaved mode a line
+  * with the paired flag is joined to the following line as its mate.
+  * @param tf2 File to read lines from
+  * @param buflen Maximum number of reads to return; a pair counts as one
+  * @param nextReadID2 Numeric id assigned to the first read; incremented per list entry
+  * @param parseCustom Passed to SamLine.toRead
+  * @return The reads parsed; shorter than buflen only when the file ran out of lines
+  */
+ private final ArrayList<Read> toReadList(ByteFile tf2, int buflen, long nextReadID2, boolean parseCustom) {
+     ArrayList<Read> list=new ArrayList<Read>(buflen);
+     while(list.size()<buflen){
+         byte[] line=tf2.nextLine();
+//         System.out.println("A: Read line "+new String(line));
+         while(line!=null && line[0]=='@'){
+//             System.out.println(">"+new String(line));
+             if(loadHeader){header.add(line);}
+             line=tf2.nextLine();
+//             assert(false) : new String(line)+"\n"+header.size()+", "+SHARED_HEADER;
+//             System.out.println("B: Read line "+new String(line));
+         }
+         //Publish the header before the first read is produced
+         if(loadHeader && nextReadID2==0){setSharedHeader(header);}
+         if(line==null){return list;}
+         SamLine sl1=new SamLine(line);
+         Read r1=sl1.toRead(parseCustom);
+         r1.obj=sl1;
+         r1.numericID=nextReadID2;
+         list.add(r1);
+         if(interleaved && (sl1.flag&0x1)!=0){
+             assert((sl1.flag&0x40)!=0) : r1+"\n\n"+sl1;
+             byte[] line2=tf2.nextLine();
+             SamLine sl2=null;
+             Read r2=null;
+             if(line2!=null){
+                 sl2=new SamLine(line2);
+                 r2=sl2.toRead(parseCustom);
+                 //NOTE(review): unlike r1, the mate does not get its SamLine attached as obj - confirm this is intended.
+                 r2.numericID=nextReadID2;
+             }else{
+                 assert(false) : r1+"\n\n"+sl1;
+             }
+             if(sl2!=null){
+                 assert((sl2.flag&0x1)!=0);
+                 assert((sl2.flag&0x80)!=0) : r2+"\n\n"+sl2+"\nflag="+Integer.toBinaryString(sl2.flag)+"\n";
+                 r1.mate=r2;
+                 r2.mate=r1;
+
+                 //Names must agree up to the first '/' or whitespace
+                 int lim=Tools.min(sl1.qname.length(), sl2.qname.length());
+                 for(int i=0; i<lim; i++){
+                     char a=sl1.qname.charAt(i);
+                     char b=sl2.qname.charAt(i);
+                     if(a=='/' || b=='/' || Character.isWhitespace(a) || Character.isWhitespace(b)){break;}
+                     assert(a==b) : "Name mismatch for paired reads: '"+sl1.qname+"' != '"+sl2.qname+"'\n\n"+sl1+"\n\n"+sl2;
+                 }
+
+             }
+         }
+         nextReadID2++;
+     }
+     return list;
+ }
+
+ /** Closes the underlying file and returns the result of its close(). */
+ public boolean close(){
+     return tf.close();
+ }
+
+ /** Resets all counters, discards the buffer and header, and rewinds the file. */
+ @Override
+ public synchronized void restart() {
+     generated=0;
+     consumed=0;
+     next=0;
+     nextReadID=0;
+     buffer=null;
+     header=new ArrayList<byte[]>();
+     tf.reset();
+ }
+
+ /**
+  * Returns the header most recently published by setSharedHeader.
+  * @param wait If true, block (polling once per second) until a header has been published;
+  * interrupts are logged and the wait continues
+  * @return The shared header, or null if none is available and wait is false
+  */
+ public static synchronized ArrayList<byte[]> getSharedHeader(boolean wait){
+     if(!wait || SHARED_HEADER!=null){return SHARED_HEADER;}
+     System.err.println("Waiting on header to be read from a sam file.");
+     while(SHARED_HEADER==null){
+         try {
+             SamReadInputStream.class.wait(1000);
+         } catch (InterruptedException e) {
+             // TODO Auto-generated catch block
+             e.printStackTrace();
+         }
+     }
+     return SHARED_HEADER;
+ }
+
+ /** Publishes a header and wakes any threads blocked in getSharedHeader(true). */
+ public static synchronized void setSharedHeader(ArrayList<byte[]> list){
+//     assert(false) : list.size();
+     SHARED_HEADER=list;
+     SamReadInputStream.class.notifyAll();
+ }
+
+ /** Header shared across all instances; guarded by the class lock */
+ private static ArrayList<byte[]> SHARED_HEADER;
+// private static boolean SET_SHARED_HEADER;
+
+ /** Reports the interleaved setting given at construction. */
+ @Override
+ public boolean paired() {return interleaved;}
+
+ /** Reads parsed but not yet handed out */
+ private ArrayList<Read> buffer=null;
+ /** Header lines collected from this file when loadHeader is set */
+ private ArrayList<byte[]> header=null;
+ /** Index into buffer of the next read that next() will return */
+ private int next=0;
+
+ private final ByteFile tf;
+ private final boolean interleaved;
+ private final boolean loadHeader;
+
+ /** Maximum number of reads parsed per fillBuffer() call */
+ private final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+
+ /** Number of reads parsed from the file so far */
+ public long generated=0;
+ /** Number of reads handed out through next() or nextList() */
+ public long consumed=0;
+ /** Numeric id to assign to the next read parsed */
+ private long nextReadID=0;
+
+ /** Value of FileFormat.stdio() for the input */
+ public final boolean stdin;
+
+}
diff --git a/current/stream/ScaffoldCoordinates.java b/current/stream/ScaffoldCoordinates.java
new file mode 100755
index 0000000..995c1ef
--- /dev/null
+++ b/current/stream/ScaffoldCoordinates.java
@@ -0,0 +1,85 @@
+package stream;
+
+import dna.Data;
+
+/**
+ * Transforms BBMap index coordinates into scaffold-relative coordinates.
+ * @author Brian Bushnell
+ * @date Aug 26, 2014
+ *
+ */
+public class ScaffoldCoordinates {
+
+    /*--------------------------------------------------------------*/
+    /*----------------         Constructors         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Creates an invalid, empty instance; fill it with one of the set methods. */
+    public ScaffoldCoordinates(){}
+
+    /** Creates an instance from a read's mapping; check 'valid' afterwards. */
+    public ScaffoldCoordinates(Read r){set(r);}
+
+    /** Creates an instance from a site's coordinates; check 'valid' afterwards. */
+    public ScaffoldCoordinates(SiteScore ss){set(ss);}
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Methods           ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * Fills this object from a read's mapped location.
+     * An unmapped read clears every field, matching what setFromIndex does on failure,
+     * so stale coordinates from an earlier call can never be read back.
+     * @param r Read to take chrom, start, stop and strand from
+     * @return true if the coordinates were translated
+     */
+    public boolean set(Read r){
+        if(r.mapped()){return setFromIndex(r.chrom, r.start, r.stop, r.strand(), r);}
+        clear();
+        return valid;
+    }
+
+    /**
+     * Fills this object from a site's location.
+     * @param ss Site to take chrom, start, stop and strand from
+     * @return true if the coordinates were translated
+     */
+    public boolean set(SiteScore ss){
+        return setFromIndex(ss.chrom, ss.start, ss.stop, ss.strand, ss);
+    }
+
+    /**
+     * Translates index coordinates into scaffold-relative coordinates.
+     * Succeeds only when iChrom_ is non-negative and Data.isSingleScaffold accepts the range;
+     * otherwise all fields are cleared.
+     * @param iChrom_ Index chromosome number
+     * @param iStart_ Start location in index coordinates
+     * @param iStop_ Stop location in index coordinates
+     * @param strand_ Strand, stored as a byte
+     * @param o Object included in the assertion message only
+     * @return true if the coordinates were translated
+     */
+    public boolean setFromIndex(int iChrom_, int iStart_, int iStop_, int strand_, Object o){
+        valid=false;
+        if(iChrom_>=0){
+            iChrom=iChrom_;
+            iStart=iStart_;
+            iStop=iStop_;
+            if(Data.isSingleScaffold(iChrom, iStart, iStop)){
+                assert(Data.scaffoldLocs!=null) : "\n\n"+o+"\n\n";
+                //The scaffold is looked up by the midpoint of the range.
+                scafIndex=Data.scaffoldIndex(iChrom, (iStart+iStop)/2);
+                name=Data.scaffoldNames[iChrom][scafIndex];
+                scafLength=Data.scaffoldLengths[iChrom][scafIndex];
+                start=Data.scaffoldRelativeLoc(iChrom, iStart, scafIndex);
+                //Same span as the index range, shifted to the scaffold-relative start.
+                stop=start-iStart+iStop;
+                strand=(byte)strand_;
+                valid=true;
+            }
+        }
+        if(!valid){clear();}
+        return valid;
+    }
+
+    /** Resets every field to its initial "unset" value and marks this object invalid. */
+    public void clear(){
+        valid=false;
+        scafIndex=-1;
+        iChrom=-1;
+        iStart=-1;
+        start=-1;
+        iStop=-1;
+        stop=-1;
+        strand=-1;
+        scafLength=0;
+        name=null;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Fields            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Index of the scaffold within its chromosome, or -1 */
+    public int scafIndex=-1;
+    /** Index chromosome number, or -1 */
+    public int iChrom=-1;
+    /** Location in index coordinates, or -1 */
+    public int iStart=-1, iStop=-1;
+    /** Location in scaffold-relative coordinates, or -1 */
+    public int start=-1, stop=-1;
+    public byte strand=-1;
+    public int scafLength=0;
+    /** Scaffold name taken from Data.scaffoldNames, or null */
+    public byte[] name=null;
+    /** True only after a successful translation */
+    public boolean valid=false;
+
+}
diff --git a/current/stream/ScarfReadInputStream.java b/current/stream/ScarfReadInputStream.java
new file mode 100755
index 0000000..ed345d2
--- /dev/null
+++ b/current/stream/ScarfReadInputStream.java
@@ -0,0 +1,138 @@
+package stream;
+
+import java.util.ArrayList;
+
+import align2.Shared;
+
+import fileIO.ByteFile;
+import fileIO.FileFormat;
+
+public class ScarfReadInputStream extends ReadInputStream {
+
+ public static void main(String[] args){
+
+ ScarfReadInputStream fris=new ScarfReadInputStream(args[0], true);
+
+ Read r=fris.next();
+ System.out.println(r.toText(false));
+
+ }
+
+ public ScarfReadInputStream(String fname, boolean allowSubprocess_){
+ this(FileFormat.testInput(fname, FileFormat.SCARF, null, allowSubprocess_, false));
+ }
+
+ public ScarfReadInputStream(FileFormat ff){
+ if(verbose){System.err.println("ScarfReadInputStream("+ff.name()+")");}
+
+ stdin=ff.stdio();
+ if(!ff.scarf()){
+ System.err.println("Warning: Did not find expected scarf file extension for filename "+ff.name());
+ }
+
+ tf=ByteFile.makeByteFile(ff, false);
+
+ interleaved=FASTQ.FORCE_INTERLEAVED;//((tf.is()==System.in || stdin) ? FASTQ.FORCE_INTERLEAVED : FASTQ.isInterleaved(tf.name));
+// assert(false) : interleaved;
+ }
+
+ @Override
+ public void start() {
+// if(cris!=null){cris.start();}
+ }
+
+
+ @Override
+ public boolean hasMore() {
+ if(buffer==null || next>=buffer.size()){
+ if(tf.isOpen()){
+ fillBuffer();
+ }else{
+ assert(generated>0) : "Was the file empty?";
+ }
+ }
+ return (buffer!=null && next<buffer.size());
+ }
+
+ @Override
+ public Read next() {
+ if(!hasMore()){return null;}
+ Read r=buffer.set(next, null);
+ next++;
+ consumed++;
+ return r;
+ }
+
+ @Override
+ public synchronized ArrayList<Read> nextList() {
+ if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+ if(buffer==null || next>=buffer.size()){fillBuffer();}
+ ArrayList<Read> list=buffer;
+ buffer=null;
+ if(list!=null && list.size()==0){list=null;}
+ consumed+=(list==null ? 0 : list.size());
+// System.err.println(hashCode()+" produced "+r[0].numericID);
+ return list;
+ }
+
+ private synchronized void fillBuffer(){
+
+ assert(buffer==null || next>=buffer.size());
+
+ buffer=null;
+ next=0;
+
+ buffer=FASTQ.toScarfReadList(tf, BUF_LEN, nextReadID, interleaved);
+ int bsize=(buffer==null ? 0 : buffer.size());
+ nextReadID+=bsize;
+ if(bsize<BUF_LEN){tf.close();}
+
+ generated+=bsize;
+ if(buffer==null){
+ if(!errorState){
+ errorState=true;
+ System.err.println("Null buffer in ScarfReadInputStream.");
+ }
+ }
+ }
+
+ public boolean close(){
+ if(verbose){System.err.println("Closing "+this.getClass().getName()+" for "+tf.name()+"; errorState="+errorState);}
+ errorState|=tf.close();
+ if(verbose){System.err.println("Closed "+this.getClass().getName()+" for "+tf.name()+"; errorState="+errorState);}
+ return errorState;
+ }
+
+ @Override
+ public synchronized void restart() {
+ generated=0;
+ consumed=0;
+ next=0;
+ nextReadID=0;
+ buffer=null;
+ tf.reset();
+ }
+
+ @Override
+ public boolean paired() {return interleaved;}
+
+ /** Return true if this stream has detected an error */
+ public boolean errorState(){return errorState || FASTQ.errorState();}
+
+ private ArrayList<Read> buffer=null;
+ private int next=0;
+
+ private final ByteFile tf;
+ private final boolean interleaved;
+
+ private final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+ private final long MAX_DATA=Shared.READ_BUFFER_MAX_DATA; //TODO - lot of work for unlikely case of super-long scarf reads. Must be disabled for paired-ends.
+
+ public long generated=0;
+ public long consumed=0;
+ private long nextReadID=0;
+
+ public final boolean stdin;
+ public static boolean verbose=false;
+
+}
diff --git a/current/stream/SequentialReadInputStream.java b/current/stream/SequentialReadInputStream.java
new file mode 100755
index 0000000..72501ac
--- /dev/null
+++ b/current/stream/SequentialReadInputStream.java
@@ -0,0 +1,193 @@
+package stream;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Shared;
+import align2.Tools;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+
+/**
+ * Generates synthetic reads by walking along the loaded reference,
+ * one chromosome after another, instead of reading them from a file.
+ */
+public class SequentialReadInputStream extends ReadInputStream {
+
+    /**
+     * @param maxReads_ Maximum number of reads to generate; negative means unlimited
+     * @param readlen_ Length of each generated read
+     * @param minreadlen_ Shortest acceptable read after trimming undefined bases from the ends
+     * @param overlap_ Number of bases shared by consecutive reads; must be less than readlen_
+     * @param alternateStrand_ If true, reads with an odd numeric ID are reverse-complemented
+     */
+    public SequentialReadInputStream(long maxReads_, int readlen_, int minreadlen_, int overlap_, boolean alternateStrand_){
+
+        maxReads=(maxReads_<0 ? Long.MAX_VALUE : maxReads_);
+        readlen=readlen_;
+        minReadlen=minreadlen_;
+        POSITION_INCREMENT=readlen;
+        overlap=overlap_;
+        alternateStrand=alternateStrand_;
+        assert(overlap<POSITION_INCREMENT);
+
+        //NOTE(review): the limit comes from chromosome 1 only but is applied to every chromosome - confirm this is intended.
+        maxPosition=Data.chromLengths[1];
+        maxChrom=Data.numChroms;
+
+        restart();
+    }
+
+    /** No-op; nothing needs starting. */
+    public void start(){}
+
+    /** Returns to the start of chromosome 1 and zeroes the counters.  The read ID counter is not reset. */
+    @Override
+    public void restart(){
+        position=0;
+        chrom=1;
+        generated=0;
+        consumed=0;
+        next=0;
+        buffer=null;
+    }
+
+    @Override
+    public boolean paired() {
+        return false;
+    }
+
+    /** Nothing to close; always returns false. */
+    @Override
+    public boolean close() {return false;}
+
+    /**
+     * Returns true if a read is already buffered, or if more reference remains to be scanned.
+     * A true result while reference remains does not guarantee that a read will be produced,
+     * because the remaining sequence may be entirely undefined.
+     */
+    @Override
+    public boolean hasMore() {
+        if(verbose){
+            System.out.println("Called hasMore(): "+(id>=maxReads)+", "+(chrom<maxChrom)+", "+(position<=maxPosition)+", "+(buffer==null || next>=BUF_LEN));
+            System.out.println(id+", "+maxReads+", "+chrom+", "+maxChrom+", "+position+", "+maxPosition+", "+buffer+", "+next+", "+(buffer==null ? -1 : BUF_LEN));
+        }
+        //Reads that were already generated must stay available even after id reaches maxReads.
+        if(buffer!=null && next<buffer.size()){return true;}
+        if(id>=maxReads){return false;}
+        //Past the last chromosome nothing more can be generated; position has been reset to 0 by then.
+        if(chrom>maxChrom){return false;}
+        return (chrom<maxChrom || position<=maxPosition);
+    }
+
+    /** Returns the next read, or null when none can be produced.  Must not be mixed with nextList(). */
+    @Override
+    public Read next() {
+        if(!hasMore()){return null;}
+        if(buffer==null || next>=buffer.size()){fillBuffer();}
+        //fillBuffer leaves the buffer null when the rest of the reference yields no reads.
+        if(buffer==null || next>=buffer.size()){return null;}
+        Read r=buffer.get(next);
+        buffer.set(next, null);
+        next++;
+        consumed++;
+        return r;
+    }
+
+    /**
+     * Returns the next whole buffer of reads, or null when none can be produced.
+     * @throws RuntimeException if next() has already consumed part of the current buffer
+     */
+    @Override
+    public synchronized ArrayList<Read> nextList() {
+        if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");}
+        if(!hasMore()){return null;}
+        if(buffer==null || next>=buffer.size()){fillBuffer();}
+        ArrayList<Read> r=buffer;
+        buffer=null;
+        if(r!=null && r.size()==0){r=null;}
+        consumed+=(r==null ? 0 : r.size());
+        return r;
+    }
+
+    /**
+     * Generates up to BUF_LEN reads from the current chromosome, starting at 'position'.
+     * When the chromosome yields nothing it advances to the next one and retries;
+     * the buffer stays null once every chromosome has been used.
+     */
+    private synchronized void fillBuffer(){
+//      System.out.println("fill "+chrom+", "+position);
+        buffer=null;
+        if(chrom>maxChrom){return;}
+        ChromosomeArray cha=Data.getChromosome(chrom);
+        next=0;
+
+        if(position==0){
+            //Skip the undefined bases at the start of the chromosome.
+            while(position<=maxPosition && !AminoAcid.isFullyDefined((char)cha.get(position))){position++;}
+        }
+
+        ArrayList<Read> reads=new ArrayList<Read>(BUF_LEN);
+        int index=0;
+
+        //The limit is the buffer capacity; 'buffer' itself is null at this point.
+        while(position<=maxPosition && index<BUF_LEN && id<maxReads){
+            int start=position;
+            int stop=Tools.min(position+readlen-1, cha.maxIndex);
+            byte[] s=cha.getBytes(start, stop);
+//          assert(s.length==readlen) : s.length+", "+readlen;
+
+            if(s.length<1 || !AminoAcid.isFullyDefined(s)){
+                //Trim to the span between the first and last defined base, if that span is long enough.
+                int firstGood=-1, lastGood=-1;
+                for(int i=0; i<s.length; i++){
+                    if(AminoAcid.isFullyDefined(s[i])){
+                        lastGood=i;
+                        if(firstGood==-1){firstGood=i;}
+                    }
+                }
+                if(lastGood-firstGood+1>=minReadlen){
+                    start=start+firstGood;
+                    stop=stop-(s.length-lastGood-1);
+                    s=Arrays.copyOfRange(s, firstGood, lastGood+1);
+                    assert(s.length==lastGood-firstGood+1);
+                }else{
+                    s=null;
+                }
+            }
+
+            if(s!=null){
+                Read r=new Read(s, chrom, Gene.PLUS, start, stop, id, null);
+                if(alternateStrand && (r.numericID&1)==1){r.reverseComplement();}
+                r.setSynthetic(true);
+//              System.out.println("Made read: "+r);
+//              assert(id!=54406) : "\n"+r.toString()+"\nbases: "+s.length+"\nstart: "+start+"\nstop: "+stop+"\nminlen: "+minReadlen+"\n";
+
+                reads.add(r);
+                index++;
+                position+=(POSITION_INCREMENT-overlap);
+                id++;
+            }else{
+                //Move to the next defined position
+                //NOTE(review): the first loop has no upper bound on position - confirm cha.get tolerates indices past the end.
+                while(AminoAcid.isFullyDefined((char)cha.get(position))){position++;}
+                while(position<=maxPosition && !AminoAcid.isFullyDefined((char)cha.get(position))){position++;}
+            }
+        }
+//      System.out.println("got "+index+" from "+chrom+", "+position);
+
+        if(index==0){
+            //Nothing produced here; move on to the next chromosome.
+            if(UNLOAD && chrom>0){Data.unload(chrom, true);}
+            chrom++;
+            position=0;
+            buffer=null;
+            fillBuffer();
+            return;
+        }
+
+        generated+=index;
+
+        buffer=reads;
+    }
+
+    /** Numeric ID of the next read to generate; also the count of reads generated since construction */
+    private long id=0;
+
+    /** Current location within the current chromosome */
+    public int position=0;
+    public int maxPosition;
+
+    /** Chromosome currently being scanned */
+    private int chrom;
+
+    /** Reads not yet handed out */
+    private ArrayList<Read> buffer=null;
+    /** Index of the next read to return from buffer */
+    private int next=0;
+
+    private final int BUF_LEN=Shared.READ_BUFFER_LENGTH;
+    /** If true, a chromosome is unloaded once it yields no more reads */
+    public static boolean UNLOAD=false;
+
+    public long generated=0;
+    public long consumed=0;
+
+    public final long maxReads;
+    public final int readlen;
+    public final int POSITION_INCREMENT;
+    public final int minReadlen;
+    public final int maxChrom;
+    public final int overlap;
+    public final boolean alternateStrand;
+
+    public static boolean verbose=false;
+
+}
diff --git a/current/stream/SiteScore.java b/current/stream/SiteScore.java
new file mode 100755
index 0000000..b7cc54e
--- /dev/null
+++ b/current/stream/SiteScore.java
@@ -0,0 +1,1021 @@
+package stream;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import align2.GapTools;
+import align2.MSA;
+import align2.Shared;
+import align2.Tools;
+
+import dna.AminoAcid;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.Gene;
+
+
+
+public final class SiteScore implements Comparable<SiteScore>, Cloneable, Serializable{
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -8096245242590075081L;
+
+ /**
+  * Creates a site from its location and quick score; the overall score starts out equal to the quick score.
+  * Asserts that start_ does not exceed stop_.
+  */
+ public SiteScore(int chrom_, byte strand_, int start_, int stop_, int hits_, int quickScore_){
+     chrom=chrom_;
+     strand=strand_;
+     start=start_;
+     stop=stop_;
+     hits=hits_;
+     quickScore=quickScore_;
+     score=quickScore_;
+     //The assertion message prints this object, so it has to come after the assignments.
+     assert(start_<=stop_) : this.toText()+"\nchrom_="+chrom_+", strand_="+strand_+", start_="+start_+", stop_="+stop_+", hits_="+hits_+", quickScore_="+quickScore_;
+ }
+
+ /**
+  * Creates a site from its location, quick score, and rescued/perfect flags.
+  * The overall score starts out equal to the quick score, and semiperfect starts out equal to perfect_.
+  * Asserts that start_ does not exceed stop_.
+  */
+ public SiteScore(int chrom_, byte strand_, int start_, int stop_, int hits_, int quickScore_, boolean rescued_, boolean perfect_){
+     chrom=chrom_;
+     strand=strand_;
+     start=start_;
+     stop=stop_;
+     hits=hits_;
+     quickScore=quickScore_;
+     score=quickScore_;
+     rescued=rescued_;
+     perfect=perfect_;
+     semiperfect=perfect_;
+     assert(start_<=stop_) : this.toText();
+ }
+
+ /**
+  * Orders sites by descending score, slowScore, pairedScore and quickScore,
+  * then by ascending chrom and start.  Each key is consulted only if all earlier keys tie.
+  */
+ @Override
+ public int compareTo(SiteScore other) {
+     int dif=other.score-score;
+     if(dif==0){dif=other.slowScore-slowScore;}
+     if(dif==0){dif=other.pairedScore-pairedScore;}
+     if(dif==0){dif=other.quickScore-quickScore;}
+     if(dif==0){dif=chrom-other.chrom;}
+     if(dif==0){dif=start-other.start;}
+     return dif;
+ }
+
+ /**
+  * Two sites are equal when compareTo reports 0, i.e. when all four scores, chrom and start agree.
+  * Null and objects of other types are never equal, as the Object.equals contract requires.
+  * Note that hashCode is not overridden alongside this method in the code visible here.
+  */
+ @Override
+ public boolean equals(Object other){
+     if(this==other){return true;}
+     if(!(other instanceof SiteScore)){return false;}
+     return compareTo((SiteScore)other)==0;
+ }
+
+ /** Returns the text form produced by toText(), as a String. */
+ public String toString(){
+     final CharSequence text=toText();
+     return text.toString();
+ }
+
+// 9+2+1+9+9+1+1+4+4+4+4+gaps
+ public CharSequence toText(){
+ StringBuilder sb=new StringBuilder(53+(gaps==null ? 0 : gaps.length*10));
+ sb.append(chrom);
+ sb.append(',');
+ sb.append(strand);
+ sb.append(',');
+ sb.append(start);
+ sb.append(',');
+ sb.append(stop);
+ sb.append(',');
+ sb.append((rescued ? 1 : 0));
+ sb.append(',');
+ sb.append((semiperfect ? 1 : 0));
+ sb.append((perfect ? 1 : 0));
+ sb.append(',');
+ sb.append(hits);
+ sb.append(',');
+ sb.append(quickScore);
+ sb.append(',');
+ sb.append(slowScore);
+ sb.append(',');
+ sb.append(pairedScore);
+ sb.append(',');
+ sb.append(score);
+
+ if(gaps!=null){
+ sb.append(',');
+ for(int i=0; i<gaps.length; i++){
+ if(i>0){sb.append('~');}
+ sb.append(gaps[i]);
+ }
+ }
+
+ if(match!=null){
+ if(gaps==null){sb.append(',');}
+ sb.append(',');
+ final char[] buffer=Shared.getTLCB(match.length);
+ for(int i=0; i<match.length; i++){buffer[i]=(char)match[i];}
+ sb.append(buffer, 0, match.length);
+ }
+
+ return sb;
+// chrom+","+strand+","+start+","+stop+","+(rescued ? 1 : 0)+","+
+// (perfect ? 1 : 0)+","+quickScore+","+slowScore+","+pairedScore+","+score;
+ }
+
+// 9+2+1+9+9+1+1+4+4+4+4+gaps
+ /**
+ * Appends this site to a ByteBuilder as comma-delimited fields, in the same field order as toText():
+ * chrom, strand, start, stop, rescued, semiperfect+perfect, hits, quickScore, slowScore,
+ * pairedScore, score, then optionally gaps ('~'-delimited) and the match string.
+ * @param sb Destination; a new ByteBuilder is allocated if this is null
+ * @return The ByteBuilder that was appended to
+ */
+ public ByteBuilder toBytes(ByteBuilder sb){
+ if(sb==null){sb=new ByteBuilder(53+(gaps==null ? 0 : gaps.length*10));}
+ sb.append(chrom);
+ sb.append(',');
+ sb.append((int)strand);
+ sb.append(',');
+ sb.append(start);
+ sb.append(',');
+ sb.append(stop);
+ sb.append(',');
+ sb.append((rescued ? 1 : 0));
+ sb.append(',');
+ sb.append((semiperfect ? 1 : 0));
+ sb.append((perfect ? 1 : 0));
+ sb.append(',');
+ sb.append(hits);
+ sb.append(',');
+ sb.append(quickScore);
+ sb.append(',');
+ sb.append(slowScore);
+ sb.append(',');
+ sb.append(pairedScore);
+ sb.append(',');
+ sb.append(score);
+
+ if(gaps!=null){
+ sb.append(',');
+ for(int i=0; i<gaps.length; i++){
+ if(i>0){sb.append('~');}
+ sb.append(gaps[i]);
+ }
+ }
+
+ if(match!=null){
+ //An absent gaps field is written as an empty field so the match string keeps its column.
+ if(gaps==null){sb.append(',');}
+ sb.append(',');
+ sb.append(match);
+ }
+
+ return sb;
+// chrom+","+strand+","+start+","+stop+","+(rescued ? 1 : 0)+","+
+// (perfect ? 1 : 0)+","+quickScore+","+slowScore+","+pairedScore+","+score;
+ }
+
+ /**
+  * Tests whether the read matches the reference at this site, tolerating only positions
+  * where the reference is 'N' (at most half the read length of them).
+  * Read positions that fall outside the reference array are not compared.
+  * @param bases Read bases, in the orientation of this site
+  * @return false if the length differs from the site length, the read contains 'N' in the
+  * compared range, or any mismatch is against a defined reference base
+  */
+ public boolean isSemiPerfect(byte[] bases){
+     final int len=bases.length;
+     if(len!=stop-start+1){return false;}
+     final byte[] ref=Data.getChromosome(chrom).array;
+
+     //Clamp the compared window to the part of the read that lies on the reference.
+     final int from=(start<0 ? 0-start : 0);
+     final int refEnd=start+len;
+     final int to=(refEnd>ref.length ? len-(refEnd-ref.length) : len);
+     int norefBudget=len/2;
+
+     for(int i=from; i<to; i++){
+         final byte c=bases[i];
+         if(c=='N'){return false;}
+         final byte r=ref[start+i];
+         if(c!=r){
+             norefBudget--;
+             if(norefBudget<0 || r!='N'){return false;}
+         }
+     }
+     return true;
+ }
+
+ /**
+  * Tests whether the read matches the reference exactly at this site.
+  * @param bases Read bases, in the orientation of this site
+  * @return true only if the site lies fully inside the reference, the lengths agree,
+  * every base equals its reference base, and the read contains no 'N'
+  */
+ public boolean isPerfect(byte[] bases){
+     if(start<0 || bases.length!=stop-start+1){return false;}
+     final byte[] ref=Data.getChromosome(chrom).array;
+     if(stop>=ref.length){return false;}
+
+     for(int i=0, j=start; i<bases.length; i++, j++){
+         final byte c=bases[i];
+         final byte r=ref[j];
+         assert(Character.isUpperCase(c) && Character.isUpperCase(r)) : "Lowercase letters detected: ref="+(char)r+", read="+(char)c+"\n"+new String(bases)+"\n"+
+             "Please re-run with the 'tuc=t' flag (touppercase=true).";
+
+         if(c=='N' || c!=r){return false;}
+     }
+     return true;
+ }
+
+
+ /**
+  * Sets the perfect and semiperfect flags, taking a shortcut when slowScore equals maxScore.
+  * @param maxScore Score to compare slowScore against
+  * @param bases Read bases, in the orientation of this site
+  * @return The resulting value of the perfect flag
+  */
+ public boolean setPerfectFlag(int maxScore, byte[] bases){
+     if(maxScore!=slowScore){
+         return setPerfect(bases, false);
+     }
+     assert(isPerfect(bases));
+     semiperfect=true;
+     perfect=true;
+     return true;
+ }
+
+ /** Sets "perfect" and "semiperfect" flags */
+ public boolean setPerfect(byte[] bases){return setPerfect(bases, false);}
+
+ /**
+ * Sets "perfect" and "semiperfect" flags, optionally assuming "perfect" flag is correct.
+ * The read is compared with the reference from start to stop. Read bases that hang off either
+ * end of the reference, and mismatches against a reference 'N', are counted in N; the site stays
+ * semiperfect only while N does not exceed half the read length and the read itself has no 'N'.
+ * It is perfect only if it is semiperfect, fully on the reference, and N is 0.
+ * @param bases Read bases, in the orientation of this site
+ * @param assumePerfectCorrect If true, assertions fire when a site already flagged perfect turns out not to be
+ * @return The "perfect" flag, except at the early exit inside the loop, which returns "semiperfect" (false at that point)
+ */
+ public boolean setPerfect(byte[] bases, boolean assumePerfectCorrect){
+ //A length disagreement (e.g. an indel) rules out both flags.
+ if(bases.length!=stop-start+1){
+ assert(!perfect || !assumePerfectCorrect) : perfect+", "+toString()+", "+
+ new String(Data.getChromosome(chrom).array, Tools.max(0, start), (Tools.min(Data.chromLengths[chrom], stop)-start));
+ perfect=false;
+ semiperfect=false;
+ assert(Read.CHECKSITE(this, bases, 0)) : new String(bases)+"\n"+this+"\n"; //123
+ return perfect;
+ }
+ byte[] ref=Data.getChromosome(chrom).array;
+
+ perfect=semiperfect=true;
+ int refloc=start, readloc=0, N=0, max=Tools.min(stop, ref.length-1), nlimit=bases.length/2;
+ //Bases before the start of the reference: count them and skip ahead to reference position 0.
+ if(start<0){
+ N-=start;
+ readloc-=start;
+ refloc-=start;
+ assert(!perfect || !assumePerfectCorrect);
+ perfect=false;
+ }
+ //Bases past the end of the reference: count them; 'max' already stops the loop at the last reference base.
+ if(stop>=ref.length){
+ N+=(stop-ref.length+1);
+ assert(!perfect || !assumePerfectCorrect);
+ perfect=false;
+ }
+ if(N>nlimit){
+ perfect=semiperfect=false;
+ assert(Read.CHECKSITE(this, bases, 0)); //123
+ return perfect;
+ }
+
+ final byte bn=(byte)'N';
+ for(; refloc<=max; refloc++, readloc++){
+ final byte c=bases[readloc];
+ final byte r=ref[refloc];
+ assert(Character.isUpperCase(r) && Character.isUpperCase(c)) :
+ "\nAn input read appears to contain a non-upper-case base. Please rerun with the 'touppercase' flag.\n"+
+ "ref base = "+r+", read base = "+c+", TO_UPPER_CASE = "+Read.TO_UPPER_CASE+"\n"+(bases.length<=500 ? new String(bases) : "")+"\n";
+ if(c!=r || c==bn){
+ perfect=false;
+ if(c==bn){semiperfect=false;}
+ //A mismatch is tolerated only against a reference 'N', and only while the N budget lasts.
+ if(r!=bn || (N=N+1)>nlimit){
+ semiperfect=false;
+ assert(Read.CHECKSITE(this, bases, 0)); //123
+ return semiperfect;
+ }
+ }
+ }
+
+ semiperfect=(semiperfect && (N<=nlimit));
+ perfect=(perfect && semiperfect && (N==0));
+ assert(Read.CHECKSITE(this, bases, 0)); //123
+ return perfect;
+ }
+
+ /** Returns true if the other site is on the same chromosome and strand and its range intersects this one. */
+ public final boolean overlaps(SiteScore ss){
+     if(chrom!=ss.chrom || strand!=ss.strand){return false;}
+     return overlap(start, stop, ss.start, ss.stop);
+ }
+ /**
+  * Returns true if the other site is on the same chromosome and its range intersects this one.
+  * @param ignoreStrand If false, the strands must also agree
+  */
+ public final boolean overlaps(SiteScore ss, boolean ignoreStrand){
+     if(chrom!=ss.chrom){return false;}
+     if(!ignoreStrand && strand!=ss.strand){return false;}
+     return overlap(start, stop, ss.start, ss.stop);
+ }
+ private static boolean overlap(int a1, int b1, int a2, int b2){
+ assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+ return a2<=b1 && b2>=a1;
+ }
+
+ /** Returns the comma-delimited column names describing the text form of a site. */
+ public static String header() {
+     final String columns="chrom,strand,start,stop,rescued,semiperfect+perfect,hits,quickScore,slowScore,pairedScore,score,match";
+     return columns;
+ }
+
+ /**
+  * Parses a site from the comma-delimited form written by toText().
+  * Fields, in order: chrom (an optional leading '*' is ignored), strand, start, stop, rescued,
+  * semiperfect+perfect as two binary digits, hits, quickScore, slowScore, pairedScore, score,
+  * then optionally gaps ('~'-delimited) and the match string.
+  * @param s Text with 11 to 13 comma-delimited fields
+  * @return The parsed site
+  * @throws NumberFormatException if a numeric field is malformed
+  */
+ public static SiteScore fromText(String s){
+//  System.err.println("Trying to make a SS from "+s);
+     String line[]=s.split(",");
+
+     SiteScore ss;
+
+     assert(line.length>=11 && line.length<=13) : "\n"+line.length+"\n"+s+"\n"+Arrays.toString(line)+"\n";
+     //chrom is an int and toText() writes it as one, so it must not be parsed as a byte (which fails above 127).
+     int chrom=Integer.parseInt(line[0].charAt(0)=='*' ? line[0].substring(1) : line[0]);
+     byte strand=Byte.parseByte(line[1]);
+     int start=Integer.parseInt(line[2]);
+     int stop=Integer.parseInt(line[3]);
+     boolean rescued=Integer.parseInt(line[4])==1;
+//  [1, 1, 9397398, 9398220, 0, 00, 20, 8701, 9084, 0, 9084, 9397398~9397471~9398145~9398220]
+     //Two binary digits: the high bit is semiperfect, the low bit is perfect.
+     int p=Integer.parseInt(line[5], 2);
+//  assert(false) : line[5]+"->"+p;
+     boolean perfect=(p&1)==1;
+     boolean semiperfect=(p&2)==2;
+     int hits=Integer.parseInt(line[6]);
+     int quickScore=Integer.parseInt(line[7]);
+     int swscore=Integer.parseInt(line[8]);
+     int pairedScore=Integer.parseInt(line[9]);
+     int score=Integer.parseInt(line[10]);
+     ss=new SiteScore(chrom, strand, start, stop, hits, quickScore, rescued, perfect);
+     ss.setScore(score);
+     ss.setSlowPairedScore(swscore, pairedScore);
+     //The constructor copies perfect into semiperfect, so restore the parsed value.
+     ss.semiperfect=semiperfect;
+
+     if(line.length>11){
+         //The gaps field is empty when only a match string was written.
+         if(line[11]!=null && line[11].length()>0){
+             String[] gstring=line[11].split("~");
+             ss.gaps=new int[gstring.length];
+             for(int i=0; i<gstring.length; i++){
+                 ss.gaps[i]=Integer.parseInt(gstring[i]);
+             }
+         }
+     }
+
+     if(line.length>12){
+         ss.match=line[12].getBytes();
+     }
+
+     return ss;
+ }
+
+ /**
+  * Tests whether another site occupies exactly the same location.
+  * @param b Site to compare with
+  * @param testGaps If true, the gap arrays must also be identical (or both absent)
+  * @return true if chrom, strand, start and stop agree, and the gaps agree when requested
+  */
+ public boolean positionalMatch(SiteScore b, boolean testGaps){
+     final boolean sameRange=(chrom==b.chrom && strand==b.strand && start==b.start && stop==b.stop);
+     if(!sameRange){return false;}
+     if(!testGaps){return true;}
+     //Arrays.equals treats two nulls as equal and a single null as different, then compares length and contents.
+     return Arrays.equals(gaps, b.gaps);
+ }
+
+ /**
+  * Looks up the name of the scaffold containing the midpoint of this site.
+  * @param requireSingleScaffold If true, null is returned unless Data.isSingleScaffold accepts this site's range
+  * @return The scaffold name, or null
+  */
+ public byte[] getScaffoldName(boolean requireSingleScaffold){
+     if(requireSingleScaffold && !Data.isSingleScaffold(chrom, start, stop)){return null;}
+     final int midpoint=(start+stop)/2;
+     final int idx=Data.scaffoldIndex(chrom, midpoint);
+     return Data.scaffoldNames[chrom][idx];
+ }
+
+ /**
+ * Orders sites by location rather than by score.
+ * Keys, in order: chrom, start, stop, strand (all ascending); then score, slowScore,
+ * quickScore (all descending); then perfect sites before imperfect ones; then
+ * non-rescued sites before rescued ones.
+ * The constructor is private, so instances can only be created from within SiteScore.
+ */
+ public static class PositionComparator implements Comparator<SiteScore>{
+
+ private PositionComparator(){}
+
+ @Override
+ public int compare(SiteScore a, SiteScore b) {
+ if(a.chrom!=b.chrom){return a.chrom-b.chrom;}
+ if(a.start!=b.start){return a.start-b.start;}
+ if(a.stop!=b.stop){return a.stop-b.stop;}
+ if(a.strand!=b.strand){return a.strand-b.strand;}
+ //Same location: the better-scoring site sorts first.
+ if(a.score!=b.score){return b.score-a.score;}
+ if(a.slowScore!=b.slowScore){return b.slowScore-a.slowScore;}
+ if(a.quickScore!=b.quickScore){return b.quickScore-a.quickScore;}
+ if(a.perfect!=b.perfect){return a.perfect ? -1 : 1;}
+ if(a.rescued!=b.rescued){return a.rescued ? 1 : -1;}
+ return 0;
+ }
+
+ /** Sorts the list in place with this comparator; null and lists shorter than 2 are left alone. */
+ public void sort(List<SiteScore> list){
+ if(list==null || list.size()<2){return;}
+ Collections.sort(list, this);
+ }
+
+ /** Sorts the array in place with this comparator; null and arrays shorter than 2 are left alone. */
+ public void sort(SiteScore[] list){
+ if(list==null || list.length<2){return;}
+ Arrays.sort(list, this);
+ }
+
+ }
+
+ /**
+  * Returns a clone of this site that owns its own copy of the gaps array.
+  * Everything else is copied as clone() copies it.
+  */
+ public SiteScore copy(){
+     final SiteScore dup=this.clone();
+     if(dup.gaps!=null){
+         dup.gaps=dup.gaps.clone();
+     }
+     return dup;
+ }
+
+ /**
+  * Returns a field-by-field copy of this site made by Object.clone(); arrays are shared with the original.
+  * @throws RuntimeException wrapping the CloneNotSupportedException, which is not expected
+  * because this class implements Cloneable
+  */
+ @Override
+ public SiteScore clone(){
+     try {
+         return (SiteScore)super.clone();
+     } catch (CloneNotSupportedException e) {
+         //Keep the original exception as the cause rather than printing it and throwing a blank one.
+         throw new RuntimeException(e);
+     }
+ }
+
+ /** Returns true if this site lies entirely within its chromosome, i.e. start is non-negative and stop does not exceed maxIndex. */
+ public boolean isInBounds(){
+     final ChromosomeArray ca=Data.getChromosome(chrom);
+     if(start<0){return false;}
+     return stop<=ca.maxIndex;
+ }
+
+ /**
+  * Returns true if the match string begins or ends with 'X' or 'Y'.
+  * Only the first and last symbols are examined; a null or empty match string gives false.
+  */
+ public boolean matchContainsXY(){
+     if(match==null || match.length<1){return false;}
+     final byte first=match[0];
+     final byte last=match[match.length-1];
+     final boolean firstXY=(first=='X' || first=='Y');
+     final boolean lastXY=(last=='X' || last=='Y');
+     return firstXY || lastXY;
+ }
+
+ /**
+  * Returns true if the match string begins or ends with 'C'.
+  * Only the first and last symbols are examined; a null or empty match string gives false.
+  */
+ public boolean matchContainsC(){
+     if(match==null || match.length<1){return false;}
+     final byte first=match[0];
+     final byte last=match[match.length-1];
+     return (first=='C' || last=='C');
+ }
+
+ /**
+  * Tests whether this site agrees with an expected location.
+  * @param thresh If positive, it is enough for either the start or the stop to be within
+  * this distance of the expected value; otherwise both must match exactly
+  * @return false whenever the chromosome or strand differs
+  */
+ public boolean isCorrect(int chrom_, byte strand_, int start_, int stop_, int thresh){
+     final boolean sameSequence=(chrom_==chrom && strand_==strand);
+     if(!sameSequence){return false;}
+     if(thresh>0){
+         return Tools.absdif(start_, start)<=thresh || Tools.absdif(stop_, stop)<=thresh;
+     }
+     return start_==start && stop_==stop;
+ }
+
+ public int leftPaddingNeeded(int tiplen, int maxIndel){
+ if(match==null || match.length<1){return 0;}
+
+ int neutral=0, insertion=0, deletion=0, xy=0;
+ {
+ int mloc=0;
+ for(; mloc<match.length; mloc++){
+ byte c=match[mloc];
+ if(c=='I'){insertion++;}
+ else if(c=='X' || c=='Y'){xy++;}
+ else if(c=='D'){return insertion+xy;}
+ else{
+ neutral++;
+ if(mloc>=tiplen){break;}
+ }
+ }
+ }
+
+ if(insertion>maxIndel || xy>0 || match[0]=='I'){return insertion+xy;}
+ return 0;
+ }
+
+ /**
+  * Counts the insertion and X/Y symbols near the right end of the match string.
+  * The scan runs from the right and stops at the first 'D', or at the first symbol
+  * other than I/X/Y/D whose index is at most lastIndex-tiplen.  This is the same
+  * bound clipRightTipIndel uses for its scan from the right.
+  * @param tiplen Number of symbols from the right end after which an ordinary symbol ends the scan
+  * @param maxIndel Insertion count that must be exceeded for insertions alone to trigger a nonzero result
+  * @return insertions+XY if the scan hit a 'D', if insertions exceed maxIndel, if any X/Y was seen,
+  * or if the match string ends with 'I'; otherwise 0
+  */
+ public int rightPaddingNeeded(int tiplen, int maxIndel){
+     if(match==null || match.length<1){return 0;}
+     final int lastIndex=match.length-1;
+     //Leftmost index belonging to the right tip.
+     final int min=lastIndex-tiplen;
+
+     int insertion=0, xy=0;
+     for(int mloc=lastIndex; mloc>=0; mloc--){
+         final byte c=match[mloc];
+         if(c=='I'){insertion++;}
+         else if(c=='X' || c=='Y'){xy++;}
+         else if(c=='D'){return insertion+xy;}
+         else if(mloc<=min){break;} //Testing mloc>=tiplen here would end the scan at the first ordinary symbol.
+     }
+
+     if(insertion>maxIndel || xy>0 || match[lastIndex]=='I'){return insertion+xy;}
+     return 0;
+ }
+
+ /**
+  * Clips tip indels using whichever base array matches this site's strand:
+  * bases when plus() is true, basesM otherwise.
+  * @return true if either end was clipped
+  */
+ public boolean clipTipIndels(byte[] bases, byte[] basesM, int tiplen, int maxIndel, MSA msa){
+     final byte[] oriented=(this.plus() ? bases : basesM);
+     return clipTipIndels(oriented, tiplen, maxIndel, msa);
+ }
+
+ /**
+ * Clips indels near both ends of the alignment, then rescores the site.
+ * If either end was clipped, the clipped symbols are re-evaluated against the reference by unclip(),
+ * slowScore is recomputed from the match string, score is shifted by the change in slowScore,
+ * and the perfect/semiperfect flags are recalculated.
+ * @param bases Read bases, in the orientation of this site
+ * @param tiplen Tip length passed to the left/right clipping methods
+ * @param maxIndel Indel threshold passed on; also, match strings shorter than this are left alone
+ * @param msa Aligner used to score the modified match string
+ * @return true if either end was clipped
+ */
+ public boolean clipTipIndels(byte[] bases, int tiplen, int maxIndel, MSA msa){
+ if(match==null || match.length<maxIndel){return false;}
+ if(verbose){
+ System.err.println("Calling clipTipIndels:\n"+new String(match));
+ System.err.println("slowScore="+slowScore+", pairedScore="+pairedScore);
+ }
+ assert(lengthsAgree());
+ boolean left=clipLeftTipIndel(bases, tiplen, maxIndel);
+ assert(lengthsAgree());
+ boolean right=clipRightTipIndel(bases, tiplen, maxIndel);
+ assert(lengthsAgree());
+
+ if(verbose){System.err.println("left="+left+", right="+right+", match="+new String(match));}
+ if(left || right){
+ unclip(bases);
+ assert(lengthsAgree());
+ //Captured before rescoring so the overall score can be shifted by the same amount.
+ int oldScore=slowScore;
+ if(verbose){System.err.println("oldScore="+oldScore+", slowScore="+slowScore+", pairedScore="+pairedScore+", newPairedScore="+(pairedScore+(slowScore-oldScore)));}
+ setSlowScore(msa.score(match));
+ setScore(score+(slowScore-oldScore));
+ this.setPerfect(bases);
+ }
+ if(verbose){System.err.println("After clipTipIndels:\n"+new String(match));}
+ return left | right;
+ }
+
+ /**
+ * Clips indels near the left end of the alignment.
+ * Scans the match string from the left, counting insertions, deletions and other ("neutral")
+ * symbols until a neutral symbol at index tiplen or beyond is reached. If the counts exceed the
+ * thresholds, deletion symbols are removed from that prefix, the remaining prefix symbols are
+ * overwritten with 'C', and the start is moved via incrementStart(deletion-insertion).
+ * This may replace the match array.
+ * @param bases Read bases; used only in assertion messages here
+ * @param tiplen Index from which a neutral symbol ends the scan
+ * @param maxIndel Clipping needs more than maxIndel insertions or more than 4*maxIndel deletions
+ * @return true if the alignment was modified
+ */
+ public boolean clipLeftTipIndel(byte[] bases, int tiplen, int maxIndel){
+ if(match==null || match.length<maxIndel){return false;}
+ //Already clipped, or out of bounds, on this side.
+ if(match[0]=='C' || match[0]=='Y' || match[0]=='X'){return false;}
+
+ int neutral=0, insertion=0, deletion=0;
+ {
+ int mloc=0;
+ for(; mloc<match.length; mloc++){
+ byte c=match[mloc];
+ if(c=='I'){insertion++;}
+ else if(c=='D'){deletion++;}
+ else{
+ neutral++;
+ if(mloc>=tiplen){break;}
+ }
+ }
+ //Back up over trailing matches so they are not clipped.
+ //NOTE(review): if the loop above finishes without a break, mloc==match.length and match[mloc] appears to be out of bounds - confirm callers guarantee match.length>tiplen.
+ while(mloc>=0 && match[mloc]=='m'){mloc--; neutral--;}
+ }
+ if(insertion<=maxIndel && deletion<=4*maxIndel){return false;}
+ assert(mappedLength()==matchLength() || matchContainsXY()) :
+ "start="+start+", stop="+stop+", maplen="+mappedLength()+", matchlen="+matchLength()+"\n"+new String(match)+"\n"+new String(bases)+"\n\n"+this;
+
+ int sum=neutral+insertion+deletion;
+ if(deletion>0){
+ //Copy the match string, dropping the 'D' symbols found in the first 'sum' positions.
+ byte[] temp=new byte[match.length-deletion];
+ int i=0, j=0;
+ for(; i<sum; i++){
+ byte c=match[i];
+ if(c=='D'){
+ //Do nothing
+ }else{
+ temp[j]=match[i];
+ j++;
+ }
+ }
+ for(; i<match.length; i++, j++){
+ temp[j]=match[i];
+ }
+ assert(i==match.length && j==temp.length) : i+", "+j+", "+match.length+", "+temp.length+"\n"+new String(match)+"\n"+new String(bases)+"\n"+this+"\n";
+ match=temp; //Be sure to percolate this to the read!
+ }
+
+ //Mark what is left of the scanned prefix as clipped.
+ sum=neutral+insertion;
+ for(int i=0; i<sum; i++){
+ match[i]='C';
+ }
+
+ final int dif=(insertion-deletion);
+ incrementStart(-dif);
+ assert(mappedLength()==matchLength() || matchContainsXY());
+
+ return true;
+ }
+
+ /**
+ * Clips indels near the right end of the alignment; the mirror image of clipLeftTipIndel.
+ * Scans the match string from the right, counting insertions, deletions and other ("neutral")
+ * symbols until a neutral symbol at index lastIndex-tiplen or below is reached. If the counts
+ * exceed the thresholds, deletion symbols are removed from that suffix, the remaining suffix
+ * symbols are overwritten with 'C', and the stop is moved via incrementStop(insertion-deletion).
+ * This may replace the match array.
+ * @param bases Read bases; used only in assertion messages here
+ * @param tiplen Number of symbols from the right end after which a neutral symbol ends the scan
+ * @param maxIndel Clipping needs more than maxIndel insertions or more than 4*maxIndel deletions
+ * @return true if the alignment was modified
+ */
+ public boolean clipRightTipIndel(final byte[] bases, final int tiplen, final int maxIndel){
+ if(match==null || match.length<maxIndel){return false;}
+ final int lastIndex=match.length-1;
+ //Already clipped, or out of bounds, on this side.
+ if(match[lastIndex]=='C' || match[lastIndex]=='Y' || match[lastIndex]=='X'){return false;}
+
+ if(verbose){System.err.println("mappedLength="+mappedLength()+", matchLength()="+matchLength());}
+
+ int neutral=0, insertion=0, deletion=0;
+ {
+ int mloc=lastIndex;
+ for(int min=lastIndex-tiplen; mloc>=0; mloc--){
+ byte c=match[mloc];
+ if(c=='I'){insertion++;}
+ else if(c=='D'){deletion++;}
+ else{
+ neutral++;
+ if(mloc<=min){break;}
+ }
+ }
+ //Move forward over leading matches so they are not clipped.
+ //NOTE(review): if the loop above finishes without a break, mloc==-1 and match[mloc] appears to be out of bounds - confirm callers guarantee match.length>tiplen.
+ while(mloc<match.length && match[mloc]=='m'){mloc++; neutral--;}
+ }
+ if(insertion<=maxIndel && deletion<=4*maxIndel){return false;}
+ assert(mappedLength()==matchLength() || matchContainsXY()) : mappedLength()+", "+matchLength()+"\n"+new String(match)+"\n"+new String(bases);
+
+ int sum=neutral+insertion+deletion;
+ //Index of the first symbol of the suffix being clipped.
+ final int limit=match.length-sum;
+ if(deletion>0){
+ //Copy the match string, dropping the 'D' symbols found from 'limit' onward.
+ byte[] temp=new byte[match.length-deletion];
+ int i=0, j=0;
+ for(; i<limit; i++, j++){
+ temp[j]=match[i];
+ }
+ for(; i<match.length; i++){
+ byte c=match[i];
+ if(c=='D'){
+ //Do nothing
+ }else{
+ temp[j]=match[i];
+ j++;
+ }
+ }
+ assert(i==match.length && j==temp.length) : i+", "+j+", "+match.length+", "+temp.length+
+ "\n"+new String(match)+"\n"+new String(temp)+"\n"+new String(bases)+"\n"+this+"\n";
+ match=temp; //Be sure to percolate this to the read!
+ }
+
+ //Mark what is left of the scanned suffix as clipped. The recomputed 'sum' is not used below.
+ sum=neutral+insertion;
+ for(int i=limit; i<match.length; i++){
+ match[i]='C';
+ }
+
+
+ if(verbose){System.err.println("Final: "+new String(match));}
+ final int dif=(insertion-deletion);
+ if(verbose){System.err.println("mappedLength="+mappedLength()+", matchLength()="+matchLength()+", dif="+dif);}
+ incrementStop(dif);
+ assert(mappedLength()==matchLength() || matchContainsXY()) : mappedLength()+", "+matchLength()+", "+neutral+", "+insertion+", "+deletion+", "+dif;
+
+ return true;
+ }
+
+ /**
+ * Replaces clipped ('C') symbols in the match string with real alignment symbols.
+ * Walks the match string alongside the read and the reference; each 'C' becomes 'N' if either
+ * base is not fully defined, 'm' if the bases are equal, and 'S' otherwise. The match array
+ * is modified in place.
+ * @param bases Read bases, in the orientation of this site
+ * @return false if there is no match string or neither end of it is 'C'; true otherwise
+ * @throws RuntimeException if the match string contains a symbol other than C, m, N, S, X, Y, I or D
+ */
+ public boolean unclip(final byte[] bases){
+ if(match==null || match.length<1){return false;}
+ if(verbose){System.err.println("Calling unclip on "+new String(match));}
+ {
+ byte first=match[0], last=match[match.length-1];
+// if(first=='X' || last=='Y'){return false;}
+ if(first!='C' && last!='C'){
+ if(verbose){System.err.println("No unclipping needed.");}
+ return false;
+ }
+ }
+ assert(lengthsAgree()) : new String(bases)+"\n"+this;
+ final ChromosomeArray ca=Data.getChromosome(chrom);
+ //rloc = reference position, cloc = read position, mloc = match string position.
+ for(int rloc=start, cloc=0, mloc=0; mloc<match.length; mloc++){
+ final byte m=match[mloc];
+
+ if(m=='C'){
+ final byte c=bases[cloc];
+ final byte r=ca.get(rloc);
+ if(!AminoAcid.isFullyDefined(c) || !AminoAcid.isFullyDefined(r)){
+ match[mloc]='N';
+ }else{
+ match[mloc]=(byte)(c==r ? 'm' : 'S');
+ }
+ rloc++;
+ cloc++;
+ }else if(m=='m' || m=='N' || m=='S'){
+ rloc++;
+ cloc++;
+ }else if(m=='X' || m=='Y'){
+ rloc++;
+ cloc++;
+ }else if(m=='I'){
+ //An insertion consumes a read base only.
+ cloc++;
+ }else if(m=='D'){
+ //A deletion consumes a reference base only.
+ rloc++;
+ }else{
+ throw new RuntimeException("Unsupported symbol: ASCII "+(int)m);
+ }
+ }
+ if(verbose){System.err.println("After unclip: "+new String(match));}
+ assert(lengthsAgree()) : new String(bases)+"\n"+this;
+ return true;
+ }
+
+ /** TODO: Test
+ * Attempt to extend match/N symbols where there are X and Y symbols. Returns true on success (or when there is no X/Y); on failure returns false and nulls match if nullifyOnFailure.
+ * */
+ public boolean fixXY(byte[] bases, boolean nullifyOnFailure, MSA msa){
+ if(verbose && match!=null){System.err.println("Calling fixXY:\n"+new String(match));}
+ if(!matchContainsXY()){return true;}
+ if(verbose){System.err.println("lengthsAgree: "+this.lengthsAgree());}
+
+ boolean disable=false;
+ if(disable){
+ if(nullifyOnFailure){
+ match=null;
+ }
+ return false;
+ }
+
+// if(match==null || match.length<1){return false;} //Already covered
+ final ChromosomeArray ca=Data.getChromosome(chrom);
+// final int tip=3;
+ boolean success=true;
+ final float maxSubRate=0.4f; //A side is clipped only when its substitutions exceed both maxSubs and numX*maxSubRate
+ final int maxSubs=5;
+
+ {//Process left side
+ if(verbose){System.err.println("Processing left side. Success="+success+", start="+start+", stop="+stop+", match=\n"+new String(match));}
+ int mloc=0;
+ while(mloc<match.length && (match[mloc]=='X' || match[mloc]=='Y')){mloc++;}
+ if(mloc>=match.length || mloc>=bases.length){success=false;}
+ else if(mloc>0){
+ mloc--;//Location of last X or Y on left side
+ final int numX=mloc+1;
+ int rloc=start+mloc, cloc=mloc;
+ int subs=0, firstSub=-1;
+ while(mloc>=0){ //Walk leftward, replacing each X/Y with N, m, or S by comparing read base to reference base
+ byte m=match[mloc];
+ byte c=bases[cloc];
+ byte r=ca.get(rloc);
+ assert(m=='X' || m=='Y') : (char)m+", "+mloc+", "+(char)c+", "+(char)r+"\n"+new String(bases)+"\n"+this.toString();
+ if(r=='N' || c=='N'){match[mloc]='N';}
+ else if(c==r){match[mloc]='m';}
+// else if(mloc<=tip){match[mloc]='S';}
+ else{
+ match[mloc]='S';
+ subs++;
+ if(subs==1){firstSub=mloc;} //Walking leftward, so this is the rightmost substitution
+ }
+// else{
+// if(verbose){System.err.println("A: Set success to false.");}
+// success=false;
+// break;
+// }
+ mloc--;
+ rloc--;
+ cloc--;
+ }
+ if(success && mappedLength()!=matchLength()){incrementStart(-numX);}
+ if(subs>maxSubs && subs>numX*maxSubRate){
+ if(verbose){System.err.println("Failed to correct alignment; clipping left side of read.");}
+ for(int i=0; i<=firstSub; i++){
+ match[i]='C';
+ }
+ if(!lengthsAgree()){
+ assert(false);
+ incrementStart(firstSub+1); //firstSub+1 symbols (indices 0..firstSub) were clipped
+ assert(lengthsAgree());
+ }
+ }
+ }
+ if(verbose){System.err.println("Finished left side. Success="+success+", start="+start+", stop="+stop+", match=\n"+new String(match));}
+ if(verbose){System.err.println("lengthsAgree: "+this.lengthsAgree());}
+ }
+
+ if(success){//Process right side
+ if(verbose){System.err.println("Processing right side. Success="+success+", start="+start+", stop="+stop+", match=\n"+new String(match));}
+ int mloc=match.length-1;
+ while(mloc>=0 && (match[mloc]=='X' || match[mloc]=='Y')){mloc--;}
+ int dif=match.length-1-mloc;
+ if(mloc<0){
+ if(verbose){System.err.println("B: Set success to false.");}
+ success=false;
+ }
+ else if(dif>0){
+ mloc++;//Location of first X or Y on right side
+ final int numX=match.length-mloc;
+ int rloc=stop-dif+1, cloc=bases.length-dif;
+ int subs=0, firstSub=-1;
+ if(cloc<0){
+ if(verbose){System.err.println("C: Set success to false.");}
+ success=false;
+ }else{
+// final int tip2=match.length-tip;
+ while(mloc<match.length){ //Walk rightward, replacing each X/Y with N, m, or S
+ byte m=match[mloc];
+ byte c=bases[cloc];
+ byte r=ca.get(rloc);
+ assert(m=='X' || m=='Y') : (char)m+", "+mloc+", "+(char)c+", "+(char)r+"\n"+new String(bases)+"\n"+this.toString();
+ if(r=='N' || c=='N'){match[mloc]='N';}
+ else if(c==r){match[mloc]='m';}
+// else if(mloc>=tip2){match[mloc]='S';}
+ else{
+ match[mloc]='S';
+ subs++;
+ if(subs==1){firstSub=mloc;} //Walking rightward, so this is the leftmost substitution
+ }
+// else{
+// if(verbose){System.err.println("D: Set success to false.");}
+// success=false;
+// break;
+// }
+ mloc++;
+ rloc++;
+ cloc++;
+ }
+ }
+ if(success){
+ if(verbose){System.err.println("A: Start="+start+", stop="+stop+", numX="+numX+", lengthsAgree()="+lengthsAgree());}
+ if(mappedLength()!=matchLength()){incrementStop(numX);}
+ if(verbose){System.err.println("B: Start="+start+", stop="+stop+", numX="+numX+", lengthsAgree()="+lengthsAgree());}
+ if(subs>maxSubs && subs>numX*maxSubRate){
+ if(verbose){System.err.println("Failed to correct alignment; clipping right side of read.");}
+ for(int i=firstSub; i<match.length; i++){
+ match[i]='C';
+ }
+ int clipped=match.length-firstSub; //Exactly the symbols just set to 'C' (indices firstSub..match.length-1); was off by one
+ if(!lengthsAgree()){
+ assert(false);
+ incrementStop(-clipped);
+ assert(lengthsAgree());
+ }
+ if(verbose){System.err.println("C: Start="+start+", stop="+stop+", numX="+numX+", lengthsAgree()="+lengthsAgree());}
+ }
+ if(verbose){System.err.println("mappedLength()="+mappedLength()+", matchLength()="+matchLength()+", numX="+numX);}
+ }
+ }
+ if(verbose){System.err.println("Finished right side. Success="+success+", start="+start+", stop="+stop+", match=\n"+new String(match));}
+ if(verbose){System.err.println("lengthsAgree: "+this.lengthsAgree());}
+ }
+
+ success=success && !matchContainsXY()/* && mappedLength()==matchLength()*/;
+ if(!success){
+ if(verbose){System.err.println("E: Set success to false.");}
+ if(nullifyOnFailure){match=null;}
+ }else{
+ if(verbose){System.err.println("E: Success!");}
+ }
+
+ if(match!=null){ //Rescore the edited match string and shift score by the same delta as slowScore
+ int oldScore=slowScore;
+ setSlowScore(msa.score(match));
+ setScore(score+(slowScore-oldScore));
+ }
+ if(verbose){System.err.println("lengthsAgree: "+this.lengthsAgree());}
+
+ setPerfect(bases); //Fixes a rare bug
+ return success;
+ }
+
+ public boolean lengthsAgree(){ //True when there is no match string, or its length equals the mapped reference span
+ return match==null || matchLength()==mappedLength();
+ }
+
+ public int mappedLength(){
+ return stop-start+1;
+ }
+
+ public int matchLength(){
+ assert(match!=null);
+ return Read.calcMatchLength(match);
+ }
+
+// public boolean plus(){return strand()==Gene.PLUS;}
+// public boolean minus(){return strand()==Gene.MINUS;}
+//
+// public final byte strand(){return (byte)(flags&strandMask);}
+// public boolean rescued(){return (flags&rescuedMask)!=0;}
+// public boolean perfect(){return (flags&perfectMask)!=0;}
+// public boolean semiperfect(){return (flags&semiperfectMask)!=0;}
+//
+// public final int setStrand(int x){
+// assert(x==0 || x==1);
+// if(x==0){flags=(flags&~strandMask);}
+// else{flags=(flags|strandMask);}
+// assert(strand()==x);
+// return x;
+// }
+// public boolean setRescued(boolean b){
+// if(b){flags=(flags|rescuedMask);}
+// else{flags=(flags&~rescuedMask);}
+// assert(rescued()==b);
+// return b;
+// }
+// public boolean setPerfect(boolean b){
+// if(b){flags=(flags|semiperfectMask);}
+// else{flags=(flags&~semiperfectMask);}
+// assert(perfect()==b);
+// return b;
+// }
+// public boolean setSemiperfect(boolean b){
+// if(b){flags=(flags|semiperfectMask);}
+// else{flags=(flags&~semiperfectMask);}
+// assert(semiperfect()==b);
+// return b;
+// }
+
+ public boolean plus(){return strand==Gene.PLUS;}
+ public boolean minus(){return strand==Gene.MINUS;}
+ public boolean perfect(){return perfect;}
+ public boolean semiperfect(){return semiperfect;}
+ public boolean rescued(){return rescued;}
+ public byte strand(){return strand;}
+
+ public final byte strand;
+ public boolean rescued=false;
+ public boolean perfect=false;
+ public boolean semiperfect=false;
+
+ public void setPerfect(){
+ perfect=semiperfect=true;
+ gaps=null;
+ }
+ public void incrementStart(int x){setStart(start+x);}
+ public void incrementStop(int x){setStop(stop+x);}
+ public void setLimits(int a, int b){
+ start=a;
+ stop=b;
+ if(gaps!=null){
+ gaps[0]=a;
+ gaps[gaps.length-1]=b;
+ if(!CHECKGAPS()){gaps=GapTools.fixGaps(this);}
+ assert(CHECKGAPS()) : Arrays.toString(gaps);
+ }
+ }
+
+ public void fixLimitsXY(){
+ if(match==null || match.length<1){return;}
+ int x=0, y=0;
+ for(int i=0; i<match.length; i++){
+ if(match[i]=='X'){x++;}else{break;}
+ }
+ for(int i=match.length-1; i>=0; i--){
+ if(match[i]=='Y'){y++;}else{break;}
+ }
+// if((x!=0 || y!=0) && !lengthsAgree()){
+// setLimits(start-x, stop+y);
+// }
+ if((y!=0)){
+ setLimits(start, stop+y);
+ }
+ }
+
+ public void setStart(int a){
+ start=a;
+ if(gaps!=null){
+ gaps[0]=a;
+ if(gaps[0]>gaps[1]){
+ gaps=GapTools.fixGaps(this);
+ }
+ assert(CHECKGAPS()) : Arrays.toString(gaps);
+ }
+ }
+ public void setStop(int b){ //Sets the right limit and keeps the last gaps[] boundary in sync with it
+ stop=b;
+ if(gaps!=null){
+ gaps[gaps.length-1]=b; //The final boundary always mirrors stop
+ if(gaps[gaps.length-2]>gaps[gaps.length-1]){gaps=GapTools.fixGaps(this);} //Repair only when the new stop fell below the preceding boundary (mirrors setStart); the old test compared indices and was always true
+ assert(CHECKGAPS()) : Arrays.toString(gaps);
+ }
+ }
+ public boolean CHECKGAPS(){ //Validates the gaps array against start and stop
+ if(gaps==null){return true;} //Having no gap array is always valid
+ if(gaps.length<1 || (gaps.length%2)!=0){return false;} //Length must be nonzero and even
+ for(int prev=0, next=1; next<gaps.length; prev++, next++){
+ if(gaps[next]<gaps[prev]){return false;} //Boundaries must be non-decreasing
+ }
+ return start==gaps[0] && stop==gaps[gaps.length-1]; //Outer boundaries must equal start and stop
+ }
+
+ public int start(){return start;}
+ public int stop(){return stop;}
+ public void setSlowScore(int x){
+// assert(x!=-1);
+ if(verbose){System.err.println("Before setSlowScore: x="+x+", quick="+quickScore+", slow="+slowScore+", paired="+pairedScore);}
+// assert(slowScore<=0 || pairedScore<=0 || pairedScore>=slowScore) : "x="+x+", quick="+quickScore+", slow="+slowScore+", paired="+pairedScore+"\n"+this; //Correct, but temporarily disabled for stability
+ if(x<=0){
+ pairedScore=slowScore=x;
+ }else if(pairedScore<=0){
+ slowScore=x;
+ }else{
+ assert(pairedScore>=slowScore) : this;
+ if(pairedScore>0){
+ if(slowScore>0){
+ pairedScore=x+(pairedScore-slowScore);
+ }else{
+ pairedScore=x+1;
+ }
+ }
+ }
+ slowScore=x;
+ if(verbose){System.err.println("After setSlowScore: "+this);}
+// assert(pairedScore<=0 || pairedScore>=slowScore) : "quick="+quickScore+", slow="+slowScore+", paired="+pairedScore+"\n"+this; //Correct, but temporarily disabled for stability
+ }
+ public void setPairedScore(int x){
+// assert(x==0 || slowScore<=0 || x>=slowScore) : "x="+x+", quick="+quickScore+", slow="+slowScore+", paired="+pairedScore+"\n"+this; //Correct, but temporarily disabled for stability
+ pairedScore=x;
+// assert(slowScore<=0 || pairedScore<=0 || pairedScore>=slowScore) : "x="+x+", quick="+quickScore+", slow="+slowScore+", paired="+pairedScore+"\n"+this; //Correct, but temporarily disabled for stability
+ }
+ public void setSlowPairedScore(int x, int y){
+// assert(slowScore<=0 || pairedScore<=0 || pairedScore>=slowScore) : "x="+x+", quick="+quickScore+", slow="+slowScore+", paired="+pairedScore+"\n"+this; //Correct, but temporarily disabled for stability
+ slowScore=x;
+ pairedScore=y;
+// assert(pairedScore<=0 || pairedScore>=slowScore) : this; //Correct, but temporarily disabled for stability
+ }
+ public void setScore(int x){
+ score=x;
+ }
+
+ public int start;
+ public int stop;
+ public int quickScore;
+ public int score;
+ public int slowScore;
+ public int pairedScore;
+ public int hits;
+ public final int chrom;
+
+ public long flags; //TODO Use this instead of fields
+
+ public int[] gaps; //Limits of large gaps
+ public byte[] match;
+
+
+ public static final PositionComparator PCOMP=new PositionComparator();
+ public static final long strandMask=(1L<<0);
+ public static final long rescuedMask=(1L<<1);
+ public static final long perfectMask=(1L<<2);
+ public static final long semiperfectMask=(1L<<3);
+ public static boolean verbose=false;
+
+}
diff --git a/current/stream/SiteScoreR.java b/current/stream/SiteScoreR.java
new file mode 100755
index 0000000..4063798
--- /dev/null
+++ b/current/stream/SiteScoreR.java
@@ -0,0 +1,285 @@
+package stream;
+
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 16, 2012
+ *
+ */
+public final class SiteScoreR implements Comparable<SiteScoreR>{
+
+ public SiteScoreR(SiteScore ss, int readlen_, long numericID_, byte pairnum_){
+ this(ss.chrom, ss.strand, ss.start, ss.stop, readlen_, numericID_, pairnum_, ss.score, ss.pairedScore, ss.perfect, ss.semiperfect);
+ }
+
+ public SiteScoreR(int chrom_, byte strand_, int start_, int stop_, int readlen_, long numericID_, byte pairnum_, int score_, int pscore_, boolean perfect_, boolean semiperfect_){
+ chrom=chrom_;
+ strand=strand_;
+ start=start_;
+ stop=stop_;
+ readlen=readlen_;
+ numericID=numericID_;
+ pairnum=pairnum_;
+ score=score_;
+ pairedScore=pscore_;
+ perfect=perfect_;
+ semiperfect=semiperfect_|perfect_;
+ assert(start_<=stop_) : this.toText();
+ }
+
+ @Override
+ public int compareTo(SiteScoreR other) {
+ int x=other.score-score;
+ if(x!=0){return x;}
+
+ x=other.pairedScore-pairedScore;
+ if(x!=0){return x;}
+
+ x=chrom-other.chrom;
+ if(x!=0){return x;}
+
+ x=strand-other.strand;
+ if(x!=0){return x;}
+
+ x=start-other.start;
+ return x;
+ }
+
+ public boolean equals(Object other){ //Equal iff other is a SiteScoreR and compareTo reports 0; null and foreign types are never equal. Note: hashCode() is not overridden in this class
+ return (other instanceof SiteScoreR) && compareTo((SiteScoreR)other)==0;
+ }
+
+ public boolean equals(SiteScore other){
+ if(other.start!=start){return false;}
+ if(other.stop!=stop){return false;}
+ if(other.chrom!=chrom){return false;}
+ if(other.strand!=strand){return false;}
+ return true;
+ }
+
+ public boolean equals(SiteScoreR other){
+ return compareTo(other)==0;
+ }
+
+ public String toString(){
+// StringBuilder sb=new StringBuilder();
+// sb.append('\t');
+// sb.append(start);
+// int spaces=10-sb.length();
+// for(int i=0; i<spaces; i++){
+// sb.append(" ");
+// }
+// sb.append('\t');
+// sb.append(quickScore);
+// sb.append('\t');
+// sb.append(score);
+//
+// return "chr"+chrom+"\t"+Gene.strandCodes[strand]+sb;
+ return toText().toString();
+ }
+
+// 9+2+1+9+9+1+1+4+4+4+4+gaps
+ public StringBuilder toText(){
+ StringBuilder sb=new StringBuilder(50);
+ if(correct){sb.append('*');}
+ sb.append(chrom);
+ sb.append(',');
+ sb.append(strand);
+ sb.append(',');
+ sb.append(start);
+ sb.append(',');
+ sb.append(stop);
+ sb.append(',');
+ sb.append(readlen);
+ sb.append(',');
+ sb.append(numericID);
+ sb.append(',');
+ sb.append(pairnum);
+ sb.append(',');
+ sb.append((semiperfect ? 1 : 0));
+ sb.append((perfect ? 1 : 0));
+ sb.append(',');
+ sb.append(pairedScore);
+ sb.append(',');
+ sb.append(score);
+// sb.append(',');
+// sb.append((long)normalizedScore);
+ return sb;
+// chrom+","+strand+","+start+","+stop+","+(rescued ? 1 : 0)+","+
+// (perfect ? 1 : 0)+","+quickScore+","+slowScore+","+pairedScore+","+score;
+ }
+
+ public final boolean overlaps(SiteScoreR ss){
+ return chrom==ss.chrom && strand==ss.strand && overlap(start, stop, ss.start, ss.stop);
+ }
+ private static boolean overlap(int a1, int b1, int a2, int b2){ //Inclusive-interval intersection test for [a1,b1] and [a2,b2]
+ assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+ return !(a2>b1 || b2<a1); //Disjoint only when one interval lies entirely beyond the other
+ }
+
+ public static String header() { //Column names in the order toText() writes them (10 fields; the old header also listed quickScore and slowScore, which toText() does not write)
+ return "chrom,strand,start,stop,readlen,numericID,pairnum,semiperfect+perfect,pairedScore,score";
+ }
+
+ public static SiteScoreR fromText(String s){ //Parses one comma-delimited record in the field order written by toText()
+// System.err.println("Trying to make a SS from "+s);
+ String line[]=s.split(",");
+
+ SiteScoreR ss;
+
+ assert(line.length==10 || line.length==11) : "\n"+line.length+"\n"+s+"\n"+Arrays.toString(line)+"\n";
+ boolean correct=false;
+ if(line[0].charAt(0)=='*'){ //A leading '*' marks the site as correct, as written by toText()
+ correct=true;
+ line[0]=line[0].substring(1);
+ }
+ int chrom=Integer.parseInt(line[0]); //chrom is an int field; Byte.parseByte threw NumberFormatException above 127
+ byte strand=Byte.parseByte(line[1]);
+ int start=Integer.parseInt(line[2]);
+ int stop=Integer.parseInt(line[3]);
+ int readlen=Integer.parseInt(line[4]);
+ long numericID=Long.parseLong(line[5]);
+ byte pairnum=Byte.parseByte(line[6]);
+ int p=Integer.parseInt(line[7], 2); //Two binary digits: high bit semiperfect, low bit perfect
+ boolean perfect=(p&1)==1;
+ boolean semiperfect=(p&2)==2;
+ int pairedScore=Integer.parseInt(line[8]);
+ int score=Integer.parseInt(line[9]);
+ ss=new SiteScoreR(chrom, strand, start, stop, readlen, numericID, pairnum, score, pairedScore, perfect, semiperfect);
+ ss.correct=correct;
+
+ return ss;
+ }
+
+ public static SiteScoreR[] fromTextArray(String s){
+ String[] split=s.split("\t");
+ SiteScoreR[] out=new SiteScoreR[split.length];
+ for(int i=0; i<split.length; i++){out[i]=fromText(split[i]);}
+ return out;
+ }
+
+ public boolean positionalMatch(SiteScoreR b){
+ //Positional identity: same chromosome and strand, and the same start and stop
+ if(chrom==b.chrom && strand==b.strand){
+ return start==b.start && stop==b.stop;
+ }
+ return false;
+ }
+
+ public static class PositionComparator implements Comparator<SiteScoreR>{
+
+ private PositionComparator(){}
+
+ @Override
+ public int compare(SiteScoreR a, SiteScoreR b) {
+ if(a.chrom!=b.chrom){return a.chrom-b.chrom;}
+ if(a.start!=b.start){return a.start-b.start;}
+ if(a.stop!=b.stop){return a.stop-b.stop;}
+ if(a.strand!=b.strand){return a.strand-b.strand;}
+ if(a.score!=b.score){return b.score-a.score;}
+ if(a.perfect!=b.perfect){return a.perfect ? -1 : 1;}
+ return 0;
+ }
+
+ public void sort(List<SiteScoreR> list){
+ if(list==null || list.size()<2){return;}
+ Collections.sort(list, this);
+ }
+
+ public void sort(SiteScoreR[] list){
+ if(list==null || list.length<2){return;}
+ Arrays.sort(list, this);
+ }
+
+ }
+
+ public static class NormalizedComparator implements Comparator<SiteScoreR>{
+
+ private NormalizedComparator(){}
+
+ @Override
+ public int compare(SiteScoreR a, SiteScoreR b) {
+ if((int)a.normalizedScore!=(int)b.normalizedScore){return (int)b.normalizedScore-(int)a.normalizedScore;}
+ if(a.score!=b.score){return b.score-a.score;}
+ if(a.pairedScore!=b.pairedScore){return b.pairedScore-a.pairedScore;}
+ if(a.retainVotes!=b.retainVotes){return b.retainVotes-a.retainVotes;}
+ if(a.perfect!=b.perfect){return a.perfect ? -1 : 1;}
+ if(a.chrom!=b.chrom){return a.chrom-b.chrom;}
+ if(a.start!=b.start){return a.start-b.start;}
+ if(a.stop!=b.stop){return a.stop-b.stop;}
+ if(a.strand!=b.strand){return a.strand-b.strand;}
+ return 0;
+ }
+
+ public void sort(List<SiteScoreR> list){
+ if(list==null || list.size()<2){return;}
+ Collections.sort(list, this);
+ }
+
+ public void sort(SiteScoreR[] list){
+ if(list==null || list.length<2){return;}
+ Arrays.sort(list, this);
+ }
+
+ }
+
+ public static class IDComparator implements Comparator<SiteScoreR>{
+
+ private IDComparator(){}
+
+ @Override
+ public int compare(SiteScoreR a, SiteScoreR b) {
+ if(a.numericID!=b.numericID){return a.numericID>b.numericID ? 1 : -1;}
+ if(a.pairnum!=b.pairnum){return a.pairnum-b.pairnum;}
+
+ if(a.chrom!=b.chrom){return a.chrom-b.chrom;}
+ if(a.start!=b.start){return a.start-b.start;}
+ if(a.stop!=b.stop){return a.stop-b.stop;}
+ if(a.strand!=b.strand){return a.strand-b.strand;}
+ if(a.score!=b.score){return b.score-a.score;}
+ if(a.perfect!=b.perfect){return a.perfect ? -1 : 1;}
+ return 0;
+ }
+
+ public void sort(List<SiteScoreR> list){
+ if(list==null || list.size()<2){return;}
+ Collections.sort(list, this);
+ }
+
+ public void sort(SiteScoreR[] list){
+ if(list==null || list.length<2){return;}
+ Arrays.sort(list, this);
+ }
+
+ }
+
+ public static final PositionComparator PCOMP=new PositionComparator();
+ public static final NormalizedComparator NCOMP=new NormalizedComparator();
+ public static final IDComparator IDCOMP=new IDComparator();
+
+ public int reflen(){return stop-start+1;}
+
+ public int start;
+ public int stop;
+ public int readlen;
+ public int score;
+ public int pairedScore;
+ public final int chrom;
+ public final byte strand;
+ public boolean perfect;
+ public boolean semiperfect;
+ public final long numericID;
+ public final byte pairnum;
+ public float normalizedScore;
+// public int weight=0; //Temp variable, for calculating normalized score
+ public boolean correct=false;
+ public int retainVotes=0;
+
+}
diff --git a/current/stream/mpi/ConcurrentReadInputStreamMPI.java b/current/stream/mpi/ConcurrentReadInputStreamMPI.java
new file mode 100755
index 0000000..3a37eff
--- /dev/null
+++ b/current/stream/mpi/ConcurrentReadInputStreamMPI.java
@@ -0,0 +1,77 @@
+package stream.mpi;
+
+import align2.ListNum;
+
+import stream.Read;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadInputStreamD;
+import stream.mpi.MPIWrapper;
+
+/**
+ * The MPI implementation of ConcurrentReadInputStreamD.
+ * @author Jonathan Rood
+ * @date Dec 9, 2014
+ *
+ */
+public class ConcurrentReadInputStreamMPI extends ConcurrentReadInputStreamD {
+
+ /**
+ * @param cris_
+ * @param master_
+ * @param keepAll_
+ */
+ public ConcurrentReadInputStreamMPI(ConcurrentReadInputStream cris_, boolean master_, boolean keepAll_) {
+ super(cris_, master_, keepAll_);
+ }
+
+ protected void broadcast(ListNum<Read> ln){ //Sends a list to one rank (when !keepAll) or to all ranks
+ //if(!keepAll && ln.size()>0){//Decide how to send this list
+ if(!keepAll){//Decide how to send this list
+ final int toRank=(int)(ln.id%ranks); //Destination is chosen from the list id modulo ranks
+ if(toRank==rank){return;}
+ unicast(ln, toRank);
+ //unicast() already logs this transfer when verbose, so it is not logged a second time here
+ return;
+ }
+ MPIWrapper.broadcastList(ln);
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " broadcasted reads.");}
+ }
+
+ protected void unicast(ListNum<Read> ln, final int toRank){ //Sends a list to a single rank; a no-op when the target is this rank
+ if(toRank==rank){return;}
+ MPIWrapper.sendList(ln, toRank);
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " unicasted to rank: " + toRank);}
+ }
+
+ protected void broadcastPaired(boolean b){ //Broadcasts the pairing flag via MPIWrapper.broadcastBoolean
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " broadcasting pairing status of " + b + ".");}
+ MPIWrapper.broadcastBoolean(b);
+ }
+
+ protected void broadcastKeepall(boolean b){ //Broadcasts the keepAll flag via MPIWrapper.broadcastBoolean
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " broadcasting keepAll status of " + b + ".");}
+ MPIWrapper.broadcastBoolean(b);
+ }
+
+ protected ListNum<Read> listen(){ //Receives the next list, always from rank 0
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " listening for reads.");}
+ return MPIWrapper.listenForListCris(0);
+ }
+
+ protected boolean listenPaired(){ //Receives the pairing flag from a boolean broadcast
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " listening for pairing status.");}
+ boolean b=MPIWrapper.listenForBooleanFromBroadcast();
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " received paired status of " + b + ".");}
+ return b;
+ }
+
+ protected boolean listenKeepall(){ //Receives the keepAll flag from a boolean broadcast
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " listening for keepAll status.");}
+ boolean b=MPIWrapper.listenForBooleanFromBroadcast();
+ if(verbose){System.err.println("crisMPI: Rank: " + rank + " received keepAll status of " + b + ".");}
+ return b;
+ }
+
+ private boolean verbose=false;
+
+}
diff --git a/current/stream/mpi/ConcurrentReadOutputStreamMPI.java b/current/stream/mpi/ConcurrentReadOutputStreamMPI.java
new file mode 100755
index 0000000..53f1eb9
--- /dev/null
+++ b/current/stream/mpi/ConcurrentReadOutputStreamMPI.java
@@ -0,0 +1,69 @@
+package stream.mpi;
+
+import align2.ListNum;
+
+import stream.Read;
+import stream.ConcurrentReadOutputStream;
+import stream.ConcurrentReadOutputStreamD;
+import stream.mpi.MPIWrapper;
+
+/**
+ * The MPI implementation of ConcurrentReadOutputStreamD.
+ * @author Jonathan Rood
+ * @date Dec 9, 2014
+ *
+ */
+public class ConcurrentReadOutputStreamMPI extends ConcurrentReadOutputStreamD {
+
+ /**
+ * @param cros_
+ * @param master_
+ */
+ public ConcurrentReadOutputStreamMPI(ConcurrentReadOutputStream cros_, boolean master_) {
+ super(cros_, master_);
+ }
+
+ @Override
+ protected void unicast(ListNum<Read> ln, final int toRank){
+ if(toRank==rank){return;}
+ MPIWrapper.sendList(ln, toRank);
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " unicasted to rank: " + toRank);}
+ }
+
+ @Override
+ protected ListNum<Read> listen(final int fromRank){
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " listening for reads from rank " + fromRank + ".");}
+ return MPIWrapper.listenForListCros(fromRank);
+ }
+
+ @Override
+ protected boolean listenForJoin(){
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " listening for join status.");}
+ boolean b=MPIWrapper.listenForBooleanFromBroadcast();
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " received join status of " + b + ".");}
+ return b;
+ }
+
+ @Override
+ protected boolean listenFinishedSuccessfully(){
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " listening for finished successfully status.");}
+ boolean b=MPIWrapper.listenForBooleanFromBroadcast();
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " received finished successfully status of " + b + ".");}
+ return b;
+ }
+
+ @Override
+ protected void broadcastJoin(boolean b){
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " broadcasting join status of " + b + ".");}
+ MPIWrapper.broadcastBoolean(b);
+ }
+
+ @Override
+ protected void broadcastFinishedSuccessfully(boolean b){
+ if(verbose){System.err.println("crosMPI: Rank: " + rank + " broadcasting finished successfully status.");}
+ MPIWrapper.broadcastBoolean(b);
+ }
+
+ private boolean verbose=false;
+
+}
diff --git a/current/stream/mpi/MPIWrapper.java b/current/stream/mpi/MPIWrapper.java
new file mode 100755
index 0000000..1e0383a
--- /dev/null
+++ b/current/stream/mpi/MPIWrapper.java
@@ -0,0 +1,311 @@
+package stream.mpi;
+
+import mpi.*;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+import java.nio.ByteBuffer;
+
+import align2.Shared;
+import align2.ListNum;
+
+import stream.Read;
+
+/**
+ * Wraps MPI class functions for access by other programs.
+ * All MPI calls should go through this class.
+ * It should also set Shared.MPI fields such as MPI_RANK.
+ *
+ * @author Jonathan Rood
+ * @date Dec 9, 2014
+ *
+ */
+
+public class MPIWrapper {
+
+ public static void mpiInit(String[] args) {
+ if(Shared.USE_MPI && Shared.USE_CRISMPI) {
+ if(verbose){System.out.println("Running MPI Init.");}
+ if(!blocking) {
+ bb=new ByteBuffer[msgsInFlight];
+ bbLength=new ByteBuffer[msgsInFlight];
+ iReq=new Request[msgsInFlight];
+ iReqLength=new Request[msgsInFlight];
+ n=-1;
+ for(int i=0; i<msgsInFlight; i++){
+ bb[i]=null;
+ bbLength[i]=null;
+ iReq[i]=null;
+ iReqLength[i]=null;
+ }
+ }
+ try {
+ MPI.Init(args);
+ Shared.MPI_RANK=MPI.COMM_WORLD.getRank();
+ Shared.MPI_NUM_RANKS=MPI.COMM_WORLD.getSize();
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " finished MPI Init.");}
+ }
+
+ public static void mpiFinalize() {
+ if(Shared.USE_MPI && Shared.USE_CRISMPI) {
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " running MPI Finalize.");}
+ try {
+ //MPI.COMM_WORLD.barrier();
+ MPI.Finalize();
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ public static void broadcastList(ListNum<Read> ln){
+ if(blocking){
+ blockingBroadcastList(ln); //blocking
+ }else{
+ nonblockingBroadcastList(ln); //non-blocking
+ }
+ }
+
+ private static void blockingBroadcastList(ListNum<Read> ln){
+ byte[] b=serialize(ln);
+ int[] bLength={b.length};
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " broadcasting message of size " + bLength[0] + ".");}
+ try {
+ MPI.COMM_WORLD.bcast(bLength,1,MPI.INT,0); // can't probe a broadcast, so send message size first
+ MPI.COMM_WORLD.bcast(b,b.length,MPI.BYTE,0); // broadcast the actual message
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static void sendList(ListNum<Read> ln, final int toRank){
+ if(blocking){
+ blockingSendList(ln, toRank); //blocking
+ }else{
+ nonblockingSendList(ln, toRank); //non-blocking
+ }
+ }
+
+ private static void blockingSendList(ListNum<Read> ln, final int toRank){
+ byte[] b=serialize(ln);
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " sending message of size " + b.length + " to rank " + toRank + ".");}
+ try {
+ MPI.COMM_WORLD.send(b,b.length,MPI.BYTE,toRank,50);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private static void nonblockingSendList(ListNum<Read> ln, final int toRank){
+ n++;
+ int m=(int)(n%msgsInFlight);
+ byte[] b=serialize(ln);
+ if(iReq[m]!=null) {
+ try {
+ iReq[m].waitFor(); // wait on oldest message in flight
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+ bb[m]=ByteBuffer.allocateDirect(b.length);
+ bb[m].put(b);
+ bb[m].clear();
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " sending message of size " + b.length + " to rank " + toRank + ".");}
+ try {
+ iReq[m]=MPI.COMM_WORLD.iSend(bb[m],b.length,MPI.BYTE,toRank,50);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private static void nonblockingBroadcastList(ListNum<Read> ln){
+ n++;
+ int m=(int)(n%msgsInFlight);
+ byte[] b=serialize(ln);
+ int[] bLength={b.length};
+ if(iReqLength[m]!=null) {
+ try {
+ iReqLength[m].waitFor();
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+ bbLength[m]=ByteBuffer.allocateDirect(4);
+ bbLength[m].putInt(bLength[0]);
+ bbLength[m].clear();
+ try {
+ iReqLength[m]=MPI.COMM_WORLD.iBcast(bbLength[m],4,MPI.BYTE,0);
+ if(iReq[m]!=null) {
+ iReq[m].waitFor();
+ }
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ bb[m]=ByteBuffer.allocateDirect(b.length);
+ bb[m].put(b);
+ bb[m].clear();
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " broadcasting message of size " + bLength[0] + ".");}
+ try {
+ iReq[m]=MPI.COMM_WORLD.iBcast(bb[m],b.length,MPI.BYTE,0);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private static ListNum<Read> blockingListenForListFromBroadcast(int fromRank){
+ int[] bLength={0};
+ try {
+ MPI.COMM_WORLD.bcast(bLength,1,MPI.INT,fromRank);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ byte[] b=new byte[bLength[0]];
+ try {
+ MPI.COMM_WORLD.bcast(b,bLength[0],MPI.BYTE,fromRank);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " received message of size " + bLength[0] + ".");}
+ ListNum<Read> ln=(ListNum<Read>) deserialize(b);
+ return ln;
+ }
+
+ private static ListNum<Read> blockingListenForListFromSend(int fromRank){
+ Status s=null;
+ int bLength=0;
+ try {
+ s=MPI.COMM_WORLD.probe(fromRank,50);
+ bLength=s.getCount(MPI.BYTE);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ byte[] b=new byte[bLength];
+ try {
+ MPI.COMM_WORLD.recv(b,bLength,MPI.BYTE,fromRank,50);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " received message of size " + bLength + ".");}
+ ListNum<Read> ln=(ListNum<Read>) deserialize(b);
+ return ln;
+ }
+
+ private static ListNum<Read> nonblockingListenForListFromBroadcast(int fromRank){
+ int[] bLength={0};
+ bbLength2=ByteBuffer.allocateDirect(4);
+ Request req=null;
+ try {
+ req=MPI.COMM_WORLD.iBcast(bbLength2,4,MPI.BYTE,fromRank);
+ req.waitFor();
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ bbLength2.clear();
+ bLength[0]=bbLength2.getInt();
+ byte[] b=new byte[bLength[0]];
+ bb2=ByteBuffer.allocateDirect(bLength[0]);
+ try {
+ req=MPI.COMM_WORLD.iBcast(bb2,bLength[0],MPI.BYTE,fromRank);
+ req.waitFor();
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ bb2.clear();
+ bb2.get(b);
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " received message of size " + bLength[0] + ".");}
+ ListNum<Read> ln=(ListNum<Read>) deserialize(b);
+ return ln;
+ }
+
+ public static void broadcastBoolean(boolean b){
+ boolean[] array={b};
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " broadcasting boolean of " + array[0] + ".");}
+ try {
+ MPI.COMM_WORLD.bcast(array,1,MPI.BOOLEAN,0);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static boolean listenForBooleanFromBroadcast(){
+ boolean[] array={false};
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " listening for boolean.");}
+ try {
+ MPI.COMM_WORLD.bcast(array,1,MPI.BOOLEAN,0);
+ } catch (MPIException e) {
+ e.printStackTrace();
+ }
+ if(verbose){System.err.println("MPI: Rank " + Shared.MPI_RANK + " received boolean of " + array[0] + ".");}
+ return array[0];
+ }
+
+ private static byte[] serialize(Object obj) {
+ ByteArrayOutputStream bos = null;
+ ObjectOutputStream oos = null;
+ try {
+ bos = new ByteArrayOutputStream();
+ oos = new ObjectOutputStream(bos);
+ oos.writeObject(obj);
+ oos.flush();
+ oos.close();
+ bos.close();
+ } catch(IOException ioe) {
+ ioe.printStackTrace();
+ }
+ return bos.toByteArray();
+ }
+
+ private static Object deserialize(byte[] bytes) { //Rebuilds an object from Java-serialized bytes; returns null if reading fails
+ Object obj = null;
+ ByteArrayInputStream bis = null;
+ ObjectInputStream ois = null;
+ try {
+ bis = new ByteArrayInputStream(bytes);
+ ois = new ObjectInputStream(bis);
+ obj = ois.readObject(); //NOTE(review): callers pass bytes received over MPI from other ranks; Java deserialization is only safe if every rank is trusted
+ ois.close();
+ bis.close();
+ } catch(IOException ioe) {
+ ioe.printStackTrace(); //Failure is only logged; obj stays null
+ } catch(ClassNotFoundException cnfe) {
+ cnfe.printStackTrace(); //Failure is only logged; obj stays null
+ }
+ return obj;
+ }
+
+ public static ListNum<Read> listenForListCris(int fromRank){
+ if(Shared.MPI_KEEP_ALL){
+ if(blocking){
+ return blockingListenForListFromBroadcast(fromRank); //blocking broadcast
+ }else{
+ return nonblockingListenForListFromBroadcast(fromRank); //non-blocking broadcast
+ }
+ }else{
+ return blockingListenForListFromSend(fromRank); //only need to use blocking receive for unicast
+ }
+ }
+
+ public static ListNum<Read> listenForListCros(int fromRank){
+ return blockingListenForListFromSend(fromRank); //only need to use blocking receive for unicast
+ }
+
+ private static boolean verbose=false;
+ private static Request[] iReq;
+ private static Request[] iReqLength;
+ private static ByteBuffer[] bb;
+ private static ByteBuffer[] bbLength;
+ private static ByteBuffer bb2;
+ private static ByteBuffer bbLength2;
+ private static long n=-1;
+ private static int msgsInFlight=2;
+ private static boolean blocking=true;
+
+}
diff --git a/current/tax/FilterByTaxa.java b/current/tax/FilterByTaxa.java
new file mode 100755
index 0000000..cd99711
--- /dev/null
+++ b/current/tax/FilterByTaxa.java
@@ -0,0 +1,441 @@
+package tax;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Filters sequences according to their taxonomy,
+ * as determined by the sequence name. Sequences should
+ * be labeled with a gi number or NCBI taxID.
+ *
+ * @author Brian Bushnell
+ * @date November 23, 2015
+ *
+ */
+public class FilterByTaxa {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Code entrance from the command line.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ //Create the Timer first, then build the filter from the arguments and run it
+ final Timer timer=new Timer();
+ new FilterByTaxa(args).process(timer);
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public FilterByTaxa(String[] args){
+
+ //Process any config files
+ args=Parser.parseConfig(args);
+
+ //Detect whether the user needs help
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Print the program name and arguments
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether interleaved was explicitly set.
+
+ //Cap the shared read buffers and enable pigz/unpigz; these are static settings on Shared and ReadWrite
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ //Create a parser object
+ Parser parser=new Parser();
+
+ //Parse each argument
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+
+ //Break arguments into their constituent parts, in the form of "a=b" (any text after a second '=' is dropped)
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+ //Flags are tried in order: the shared Parser, then 'verbose', then anything TaxFilter accepts
+ if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(TaxFilter.validArgument(a)){
+ //do nothing here; TaxFilter.makeFilter(args) receives the full argument array at the end of this constructor
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i];
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+ qfin1=parser.qfin1;
+ qfin2=parser.qfin2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+ qfout1=parser.qfout1;
+ qfout2=parser.qfout2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ //Expand '#' in the input name into a 1/2 file pair, unless a file with that literal name exists
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+
+ //Expand '#' in the output name into a 1/2 file pair
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+
+ //Adjust interleaved detection based on the number of input files
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Ensure there is an input file
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ //Adjust the number of threads for input file reading
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //Ensure out2 is not set without out1
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+
+ //Adjust interleaved settings based on number of output files
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //Ensure output files can be written
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Ensure input files can be read
+ if(!Tools.testInputFiles(false, true, in1, in2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+
+ //Ensure that no file was specified multiple times
+ if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ //Create output FileFormat objects
+ ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+ ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
+
+ //Create input FileFormat objects
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+
+ //Make the actual filter from the complete argument array
+ filter=TaxFilter.makeFilter(args);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Create read streams, filter all reads, close the streams, and print a summary; throws RuntimeException if errorState ends up set. */
+ public void process(Timer t){
+
+ //Create a read input stream
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
+ cris.start(); //Start the stream
+ if(verbose){outstream.println("Started cris");}
+ }
+ boolean paired=cris.paired();
+ if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ //Optionally create a read output stream
+ final ConcurrentReadOutputStream ros;
+ if(ffout1!=null){
+ final int buff=4;
+
+ if(cris.paired() && out2==null && (in1!=null && !ffin1.samOrBam() && !ffout1.samOrBam())){
+ outstream.println("Writing interleaved.");
+ }
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, false);
+ ros.start(); //Start the stream
+ }else{ros=null;}
+
+ //Reset all four counters; the output counters are included so that a repeated call does not report accumulated totals
+ readsProcessed=basesProcessed=0;
+ readsOut=basesOut=0;
+
+ //Process the read stream
+ processInner(cris, ros);
+
+ if(verbose){outstream.println("Finished; closing streams.");}
+
+ //Write anything that was accumulated by ReadStats
+ errorState|=ReadStats.writeAll();
+ //Close the read streams
+ errorState|=ReadWrite.closeStreams(cris, ros);
+
+ //Report timing and results
+ {
+ t.stop();
+
+ //Calculate units per nanosecond
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ //Add "k" and "m" for large numbers
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ //Left-pad the strings to 8 characters so that they are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Reads In: \t"+readsProcessed+" reads \t"+basesProcessed+" bases");
+ outstream.println("Reads Out: \t"+readsOut+" reads \t"+basesOut+" bases");
+ outstream.println();
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ }
+
+ //Throw an exception if an error was recorded while writing statistics or closing the streams
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /** Iterate through the reads */
+ void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
+
+ //Pre-processing hook (nothing to do at present)
+
+ {
+ //Pull the first batch from the input stream...
+ ListNum<Read> batch=cris.nextList();
+ //...and unwrap its read list (null if the stream gave nothing)
+ ArrayList<Read> list=(batch==null ? null : batch.list);
+
+ //Sanity check: unless the input is sam/bam, the first read has a mate exactly when the stream is paired
+ if(list!=null && !list.isEmpty()){
+ Read first=list.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (first.mate!=null)==cris.paired());
+ }
+
+ //Keep going until a null or empty batch arrives
+ while(list!=null && !list.isEmpty()){
+ if(verbose){outstream.println("Fetched "+list.size()+" reads.");}
+
+ //Visit every read (and its mate, if any) in this batch
+ for(int i=0; i<list.size(); i++){
+ final Read r1=list.get(i);
+ final Read r2=r1.mate;
+
+ //Lengths are captured up front so the in and out statistics use the same values
+ final int len1=r1.length();
+ final int len2=r1.mateLength();
+
+ //Input statistics
+ readsProcessed+=1+r1.mateCount();
+ basesProcessed+=len1+len2;
+
+ if(processReadPair(r1, r2)){
+ readsOut+=1+r1.mateCount();
+ basesOut+=len1+len2;
+ }else{ //rejected pairs are nulled out in place
+ list.set(i, null);
+ }
+ }
+
+ //Hand the (possibly thinned) batch to the writer
+ if(ros!=null){ros.add(list, batch.id);}
+
+ //Give the batch back to the input stream
+ cris.returnList(batch.id, batch.list.isEmpty());
+ if(verbose){outstream.println("Returned a list.");}
+
+ //Advance to the next batch
+ batch=cris.nextList();
+ list=(batch==null ? null : batch.list);
+ }
+
+ //Return the terminating batch, if there was one
+ if(batch!=null){
+ cris.returnList(batch.id, batch.list==null || batch.list.isEmpty());
+ }
+ }
+
+ //Post-processing hook (nothing to do at present)
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Process a single read pair.
+ * @param r1 Read 1
+ * @param r2 Read 2 (may be null)
+ * @return True if the reads should be kept, false if they should be discarded.
+ */
+ boolean processReadPair(final Read r1, final Read r2){
+ return filter.passesFilter(r1.id); //Only r1's id is consulted; r2 is never examined
+ }
+
+ /** Prints a brief usage hint; the constructor calls this for help requests and just before its missing-input and out2-without-out1 errors. */
+ private void printOptions(){
+ outstream.println("Usage: java "+getClass().getName()+" key=value ... (at least one input file is required)"); //Must not throw: callers exit or raise a more specific error next
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file path */
+ private String in1=null;
+ /** Secondary input file path */
+ private String in2=null;
+
+ private String qfin1=null;
+ private String qfin2=null;
+
+ /** Primary output file path */
+ private String out1=null;
+ /** Secondary output file path */
+ private String out2=null;
+
+ private String qfout1=null;
+ private String qfout2=null;
+
+ /** Override input file extension */
+ private String extin=null;
+ /** Override output file extension */
+ private String extout=null;
+
+ /** The actual filter */
+ private final TaxFilter filter;
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of reads processed */
+ protected long readsProcessed=0;
+ /** Number of bases processed */
+ protected long basesProcessed=0;
+
+ /** Number of reads out */
+ public long readsOut=0;
+ /** Number of bases out */
+ public long basesOut=0;
+
+ /** Quit after processing this many input reads; -1 means no limit */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file */
+ private final FileFormat ffin1;
+ /** Secondary input file */
+ private final FileFormat ffin2;
+
+ /** Primary output file */
+ private final FileFormat ffout1;
+ /** Secondary output file */
+ private final FileFormat ffout2;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+ /** Overwrite existing output files */
+ private boolean overwrite=false;
+ /** Append to existing output files */
+ private boolean append=false;
+ /** This flag has no effect on singlethreaded programs */
+ private final boolean ordered=false;
+
+}
diff --git a/current/tax/GiToNcbi.java b/current/tax/GiToNcbi.java
new file mode 100755
index 0000000..c364b3a
--- /dev/null
+++ b/current/tax/GiToNcbi.java
@@ -0,0 +1,245 @@
+package tax;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import align2.Tools;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 10, 2015
+ *
+ */
+public class GiToNcbi {
+
+ public static void main(String[] args){
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.ZIPLEVEL=8;
+ initialize(args[0]); //args[0] names the gi table
+ if(args.length==2){
+ //Exactly two arguments: write the loaded array to args[1]
+ ReadWrite.write(array, args[1], true);
+ }else if(args.length>2){
+ test(args); //Three or more arguments: run the built-in test
+ }
+ }
+
+ public static void test(String[] args){
+ //Manual smoke test: everything is printed to stderr and nothing is verified.
+ //Part 1: look up a fixed set of gi numbers and print the results
+ final int[] sampleGis={
+ 1000, 10000,
+ 10001, 10002,
+ 10003, 10004,
+ 10005, 100000,
+ 1000000, 10000000
+ };
+ for(int gi : sampleGis){
+ System.err.println(getID(gi));
+ }
+
+ //The tree is optional; when present it is built from args[1] and args[2]
+ final TaxTree tree=(args.length>1 ? new TaxTree(args[1], args[2]) : null);
+
+ //Part 2: sample headers, each paired with the amount passed to incrementRaw for it
+ final String[] headers={
+ "gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44",
+ "gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds",
+ "gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence"
+ };
+ final int[] amounts={30, 40, 20};
+
+ System.err.println("Strings:");
+ for(int i=0; i<headers.length; i++){
+ //Resolve the header, print the ID, and with a tree also print and increment its node
+ final int id=getID(headers[i]);
+ System.err.println(id);
+ if(tree!=null){
+ System.err.println(tree.getNode(id));
+ tree.incrementRaw(id, amounts[i]);
+ }
+ }
+
+ //Part 3 needs the tree
+ if(tree==null){return;}
+
+ //Percolate the counts, then print every node returned for a limit of 35
+ tree.percolateUp();
+ final ArrayList<TaxNode> gathered=tree.gatherNodesAtLeastLimit(35);
+ for(TaxNode node : gathered){
+ System.err.println(node);
+ }
+ }
+
+ /** Parse a gi number from a header such as "gi|123|..." (optionally preceded by '>'), or return -1 if formatted incorrectly. */
+ private static int parseGiNumber(String s){
+ if(s==null || s.length()<4){return -1;}
+ //After a leading '>', parse the remainder here; getID() would return a taxID that getID() then indexes as if it were a gi number
+ if(s.charAt(0)=='>'){return parseGiNumber(s.substring(1));}
+ if(!s.startsWith("gi")){return -1;}
+ char delimiter='|';
+ int initial=s.indexOf(delimiter);
+ if(initial<0){delimiter='_';}
+ initial=s.indexOf(delimiter);
+ //A digit must follow the delimiter; a header ending at the delimiter is malformed, not an index error
+ if(initial<0 || initial+1>=s.length() || !Character.isDigit(s.charAt(initial+1))){return -1;}
+ int number=0;
+ for(int i=initial+1; i<s.length(); i++){
+ char c=s.charAt(i);
+ if(c==delimiter){break;}
+ assert(Character.isDigit(c));
+ number=(number*10)+(c-'0');
+ }
+ return number;
+ }
+
+ /** Parse a taxID from a header such as "ncbi|123|..." (optionally preceded by '>'), or return -1 if formatted incorrectly. */
+ private static int parseNcbiNumber(String s){
+ if(s==null || s.length()<6){return -1;}
+ //After a leading '>', parse the remainder here instead of routing back through getID() and its gi-table lookup
+ if(s.charAt(0)=='>'){return parseNcbiNumber(s.substring(1));}
+ if(!s.startsWith("ncbi")){return -1;}
+ char delimiter='|';
+ int initial=s.indexOf(delimiter);
+ if(initial<0){delimiter='_';}
+ initial=s.indexOf(delimiter);
+ //A digit must follow the delimiter; a header ending at the delimiter is malformed, not an index error
+ if(initial<0 || initial+1>=s.length() || !Character.isDigit(s.charAt(initial+1))){return -1;}
+ int number=0;
+ for(int i=initial+1; i<s.length(); i++){
+ char c=s.charAt(i);
+ if(c==delimiter){break;}
+ assert(Character.isDigit(c));
+ number=(number*10)+(c-'0');
+ }
+ return number;
+ }
+
+ /** Get the taxID from a header starting with a taxID or gi number; -1 if neither parses or the gi number lies beyond the loaded table. */
+ public static int getID(String s){
+ int x=parseGiNumber(s);
+ if(x>=0){return (x<array.length ? array[x] : -1);} //Guarded lookup: a gi number past the end of the table is unknown (-1), not an index error
+ return parseNcbiNumber(s);
+ }
+
+ /** Parse a gi number from raw header bytes beginning with "gi" or ">gi", or return -1 if formatted incorrectly. */
+ private static int parseGiNumber(byte[] s){
+ if(s==null || s.length<4){return -1;}
+ if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
+ char delimiter='|';
+ int initial=Tools.indexOf(s, (byte)delimiter);
+ if(initial<0){delimiter='_';}
+ initial=Tools.indexOf(s, (byte)delimiter);
+ //A digit must follow the delimiter; a header ending at the delimiter is malformed, not an index error
+ if(initial<0 || initial+1>=s.length || !Character.isDigit(s[initial+1])){return -1;}
+
+ int number=0;
+ for(int i=initial+1; i<s.length; i++){
+ byte c=s[i];
+ if(c==delimiter){break;}
+ assert(Character.isDigit(c));
+ number=(number*10)+(c-'0');
+ }
+ return number;
+ }
+
+ /** Parse a taxID from raw header bytes beginning with "ncbi" or ">ncbi", or return -1 if formatted incorrectly. */
+ private static int parseNcbiNumber(byte[] s){
+ if(s==null || s.length<4){return -1;}
+ if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi")){return -1;}
+ char delimiter='|';
+ int initial=Tools.indexOf(s, (byte)delimiter);
+ if(initial<0){delimiter='_';}
+ initial=Tools.indexOf(s, (byte)delimiter);
+ //A digit must follow the delimiter; a header ending at the delimiter is malformed, not an index error
+ if(initial<0 || initial+1>=s.length || !Character.isDigit(s[initial+1])){return -1;}
+
+ int number=0;
+ for(int i=initial+1; i<s.length; i++){
+ byte c=s[i];
+ if(c==delimiter){break;}
+ assert(Character.isDigit(c));
+ number=(number*10)+(c-'0');
+ }
+ return number;
+ }
+
+ /** Get the taxID from header bytes starting with a taxID or gi number; -1 if neither parses or the gi number lies beyond the loaded table. */
+ public static int getID(byte[] s){
+ int x=parseGiNumber(s);
+ if(x>=0){return (x<array.length ? array[x] : -1);} //Guarded lookup: a gi number past the end of the table is unknown (-1), not an index error
+ return parseNcbiNumber(s);
+ }
+
+ /** Get the taxID from a gi number by direct table lookup; the range checks are assertions only, so they do nothing when assertions are disabled. */
+ public static int getID(int gi){
+ assert(gi>=0) : gi;
+ assert(gi<array.length) : gi+", "+array.length;
+ return array[gi];
+ }
+
+ public static void initialize(String fname){
+ assert(fname!=null);
+ //All checks happen under the class lock: an unlocked pre-check on 'file' could
+ //match fname and return while another thread was still loading the table.
+ synchronized(GiToNcbi.class){
+ if(!initialized || file==null || !file.equals(fname)){
+ if(fname.contains(".int1d")){
+ array=ReadWrite.read(int[].class, fname, true);
+ }else{
+ array=makeArray(fname);
+ }
+ file=fname; //Recorded only after the table is in place, so a failed load is retried on the next call
+ }
+ initialized=true;
+ }
+ }
+
+ public static boolean isInitialized(){return initialized;} //Set to true at the end of initialize(); reset to false by unload()
+
+ public static synchronized void unload(){
+ initialized=false; //a later initialize() will load again
+ file=null;
+ array=null; //drops this class's reference to the table
+ }
+
+ private static int[] makeArray(String fname){
+ ByteFile bf=ByteFile.makeByteFile(fname, false, true);
+ //Pass 1: find the largest gi number (the text before the tab) so that the table can be sized
+ long count=0, max=0;
+ byte[] line=bf.nextLine();
+ while(line!=null){
+ count++;
+ int tab=Tools.indexOf(line, (byte)'\t');
+ long gi=Tools.parseLong(line, 0, tab);
+ max=Tools.max(max, gi);
+ line=bf.nextLine();
+ }
+ if(max>=Integer.MAX_VALUE){bf.close(); throw new RuntimeException("Overflow: gi number "+max+" does not fit an int-indexed table.");} //Enforced even when assertions are disabled
+ int[] ret=new int[(int)max+1];
+ Arrays.fill(ret, -1);
+ //Pass 2: rewind and store each line's second column at the index given by its gi number; unlisted gi numbers stay -1
+ bf.reset();
+ line=bf.nextLine();
+ long count2=0;
+ while(line!=null){
+ count2++;
+ int tab=Tools.indexOf(line, (byte)'\t');
+ int gi=Tools.parseInt(line, 0, tab);
+ int ncbi=Tools.parseInt(line, tab+1, line.length);
+ ret[gi]=ncbi;
+ line=bf.nextLine();
+ }
+ if(verbose){System.err.println("Count: "+count+", "+count2);}
+ bf.close();
+ return ret;
+ }
+
+ private static int[] array;
+ private static String file;
+
+ public static boolean verbose=false;
+ private static boolean initialized=false;
+}
diff --git a/current/tax/PrintTaxonomy.java b/current/tax/PrintTaxonomy.java
new file mode 100755
index 0000000..a03cfc8
--- /dev/null
+++ b/current/tax/PrintTaxonomy.java
@@ -0,0 +1,301 @@
+package tax;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Prints taxonomic information for names or IDs given on the
+ * command line or, one per line, in an input file. An optional
+ * per-node count file can also be written.
+ *
+ * @author Brian Bushnell
+ * @date November 23, 2015
+ *
+ */
+public class PrintTaxonomy {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Code entrance from the command line.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ //Create the Timer first, then build the printer from the arguments and run it
+ final Timer timer=new Timer();
+ new PrintTaxonomy(args).process(timer);
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public PrintTaxonomy(String[] args){
+
+ //Process any config files
+ args=Parser.parseConfig(args);
+
+ //Detect whether the user needs help
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Print the program name and arguments
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ //Set some shared static variables regarding PIGZ
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ //Create a parser object
+ Parser parser=new Parser();
+
+ //Parse each argument
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+
+ //Break arguments into their constituent parts, in the form of "a=b" (any text after a second '=' is dropped)
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+ //'out' and 'counts' are handled here, ahead of the shared Parser
+ if(a.equals("out")){
+ out1=b;
+ }else if(a.equals("counts")){
+ countFile=b;
+ }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
+ tableFile=b;
+ if("auto".equalsIgnoreCase(b)){tableFile=TaxTree.DefaultTableFile;}
+ }else if(a.equals("tree") || a.equals("taxtree")){
+ treeFile=b;
+ if("auto".equalsIgnoreCase(b)){treeFile=TaxTree.DefaultTreeFile;}
+ }else if(a.equals("level") || a.equals("taxlevel")){
+ if(Character.isDigit(b.charAt(0))){ //NOTE(review): b is null for a bare 'level' flag, which fails here
+ taxLevel=Integer.parseInt(b);
+ }else{
+ taxLevel=TaxTree.stringToLevel(b.toLowerCase());
+ }
+ }else if(b!=null && (a.equals("name") || a.equals("names") || a.equals("id") || a.equals("ids"))){
+ for(String s : b.split(",")){
+ names.add(s);
+ }
+ }else{
+ names.add(arg); //Anything unrecognized is treated as a name to look up
+ }
+ }
+
+ {//Process parser fields
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ in1=parser.in1;
+ }
+
+ //Ensure output files can be written
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
+ }
+
+ //Create output FileFormat objects
+ ffout1=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, overwrite, append, ordered);
+
+ ffcount=FileFormat.testOutput(countFile, FileFormat.TEXT, null, true, overwrite, append, ordered);
+
+ //Create input FileFormat objects
+ ffin1=FileFormat.testInput(in1, FileFormat.TEXT, null, true, false);
+ //The gi table is optional; the tree is mandatory
+ if(tableFile!=null){
+ outstream.println("Loading gi table.");
+ GiToNcbi.initialize(tableFile);
+ }
+ if(treeFile!=null){
+ outstream.println("Loading tree.");
+ tree=ReadWrite.read(TaxTree.class, treeFile, true);
+ if(tree.nameMap==null){
+ outstream.println("Hashing names.");
+ tree.hashNames();
+ }
+ assert(tree.nameMap!=null);
+ }else{
+ tree=null;
+ throw new RuntimeException("No tree specified.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Create read streams and process all data */
+ void process(Timer t){
+
+ //Main output is optional; with no output format there is no writer
+ final TextStreamWriter mainWriter=(ffout1==null ? null : new TextStreamWriter(ffout1));
+ if(mainWriter!=null){mainWriter.start();}
+
+ //Names come from the input file when one was given, otherwise from the command line
+ if(ffin1==null){
+ processNames(mainWriter);
+ }else{
+ processFile(new TextFile(ffin1), mainWriter);
+ }
+
+ //The result of poisonAndWait is OR-ed into errorState
+ if(mainWriter!=null){errorState|=mainWriter.poisonAndWait();}
+
+ //Optional second report: raw count and name of every node with a positive count
+ if(ffcount!=null){
+ final TextStreamWriter countWriter=new TextStreamWriter(ffcount);
+ countWriter.start();
+ for(TaxNode node : tree.nodes){
+ if(node==null || node.countRaw<=0){continue;}
+ countWriter.println(node.countRaw+"\t"+node.name);
+ }
+ errorState|=countWriter.poisonAndWait();
+ }
+
+ t.stop();
+
+ //Fail loudly if either writer reported an error
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /** Iterate through the names */
+ void processNames(final TextStreamWriter tsw){
+ for(int i=0; i<names.size(); i++){
+ printTaxonomy(names.get(i), tsw); //one printTaxonomy call per collected name, in the order given
+ }
+ }
+
+ /** Call printTaxLevel for every line of the file, then close the file. */
+ void processFile(final TextFile tf, final TextStreamWriter tsw){
+ for(String name=tf.nextLine(); name!=null; name=tf.nextLine()){printTaxLevel(name, tsw);}
+ //process() passes a freshly created TextFile and keeps no reference to it, so it has to be released here
+ tf.close();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ void printTaxonomy(String name, final TextStreamWriter tsw){
+ //Resolve the name via getNode first, then via getNodeByName
+ TaxNode tn=tree.getNode(name);
+ if(tn==null){tn=tree.getNodeByName(name);}
+ //tsw may be null (process() creates it only when ffout1 is non-null); counting still happens, as in printTaxLevel
+ if(tsw!=null){tsw.print("\n");}
+ if(tn==null){
+ if(tsw!=null){tsw.println("Could not find node for '"+name+"'");}
+ return;
+ }
+ do{//Walk from the node toward the root; an ancestor that is its own parent ends the walk unprinted
+ if(tn.level<=taxLevel){tn.incrementRaw(1);}
+ if(tsw!=null){tsw.println(tn.levelString()+"\t"+tn.id+"\t"+tn.name);}
+ tn=tree.getNode(tn.pid);
+ }while(tn!=null && tn.id!=tn.pid);
+ }
+
+ void printTaxLevel(String name, final TextStreamWriter tsw){
+ TaxNode tn=tree.getNode(name);
+ if(tn==null){tn=tree.getNodeByName(name);}
+ if(tn==null){tn=unknown;}
+ while(tn!=null && tn.id!=tn.pid && tn.level<taxLevel){tn=tree.getNode(tn.pid);}
+ if(tn==null){tn=unknown;} //The climb can end on a missing parent; count it as UNKNOWN instead of dereferencing null
+ if(tsw!=null){tsw.println(tn.name);}
+ tn.incrementRaw(1);
+ }
+
+ void printTaxCounts(String name, final TextStreamWriter tsw){
+ //Same contract as printTaxLevel, to which this delegates:
+ //resolve the name (falling back to the UNKNOWN node), climb while the
+ //node's level is below taxLevel, print the resulting name if a writer
+ //is present, and increment that node's raw count.
+ //Keeping a second copy of those statements here would only let the
+ //two methods drift apart.
+ printTaxLevel(name, tsw);
+ }
+
+ /** Prints a brief usage hint; the constructor calls this when help is requested, immediately before System.exit(0). */
+ private void printOptions(){
+ outstream.println("Usage: java "+getClass().getName()+" key=value ... [names or IDs to look up]"); //Must not throw, or the clean exit that follows is never reached
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Optional input file path */
+ private String in1=null;
+
+ /** Primary output file path */
+ private String out1="stdout.txt";
+
+ private String countFile=null;
+
+ private String tableFile=null;;
+ private String treeFile=TaxTree.DefaultTreeFile;
+
+ private final TaxTree tree;
+
+ /** Level to print */
+ private int taxLevel=TaxTree.stringToLevel("phylum");
+
+ private ArrayList<String> names=new ArrayList<String>();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Optional input file */
+ private final FileFormat ffin1;
+
+ /** Primary output file */
+ private final FileFormat ffout1;
+
+ private final FileFormat ffcount;
+
+ private final TaxNode unknown=new TaxNode(-99, -99, taxLevel, "UNKNOWN");
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+ /** Overwrite existing output files */
+ private boolean overwrite=false;
+ /** Append to existing output files */
+ private boolean append=false;
+ /** This flag has no effect on singlethreaded programs */
+ private final boolean ordered=false;
+
+}
diff --git a/current/tax/RenameGiToNcbi.java b/current/tax/RenameGiToNcbi.java
new file mode 100755
index 0000000..b1e489f
--- /dev/null
+++ b/current/tax/RenameGiToNcbi.java
@@ -0,0 +1,270 @@
+package tax;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Arrays;
+
+import kmer.HashArray1D;
+
+import stream.ConcurrentGenericReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 10, 2015
+ *
+ */
+public class RenameGiToNcbi {
+
+ public static void main(String[] args){
+ //Command-line entry point; the timer is created before the constructor parses arguments and calls GiToNcbi.initialize
+ final Timer stopwatch=new Timer();
+ new RenameGiToNcbi(args).process(stopwatch);
+ }
+
+ public RenameGiToNcbi(String[] args){
+ //Process any config files, then handle a help request (prints the options and exits)
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ //The assignments below write static fields, so they are visible to every user of these classes
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
+ //Parse each argument as key=value; anything the shared Parser accepts is consumed by it
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null; //Only the text between the first and second '=' is kept as the value
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("prefix")){
+ prefix=Tools.parseBoolean(b);
+ }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
+ tableFile=b;
+ if("auto".equalsIgnoreCase(b)){tableFile=TaxTree.DefaultTableFile;}
+ }else if(a.equals("invalid")){
+ outInvalid=b;
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ parser.in1=arg; //A bare first argument that names stdin or an existing file is taken as the input
+ }else if(parser.out1==null && i==1 && !arg.contains("=")){
+ parser.out1=arg; //A bare second argument is taken as the output
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+
+ in1=parser.in1;
+
+ out1=parser.out1;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){ //If neither ByteFile mode was forced, force BF1
+ ByteFile.FORCE_MODE_BF2=false;
+ ByteFile.FORCE_MODE_BF1=true;
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+ //Input and both outputs are treated as FASTA unless the file name indicates otherwise (NOTE(review): confirm against FileFormat)
+ ffout1=FileFormat.testOutput(out1, FileFormat.FA, null, true, overwrite, append, false);
+ ffoutInvalid=FileFormat.testOutput(outInvalid, FileFormat.FA, null, true, overwrite, append, false);
+ ffin1=FileFormat.testInput(in1, FileFormat.FA, null, true, true);
+ //NOTE(review): called even when no table= flag was given (tableFile is then null); SortByTaxa guards this call
+ GiToNcbi.initialize(tableFile);
+ }
+
+ void process(Timer t){
+ //Streams the input line by line, renames '>' headers to ncbi IDs and routes sequence lines; t is stopped here for the report
+ ByteFile bf=ByteFile.makeByteFile(ffin1, false);
+ ByteStreamWriter bsw=new ByteStreamWriter(ffout1); //NOTE(review): built unconditionally although bsw is null-checked at close; confirm out=null is supported
+ bsw.start();
+
+ ByteStreamWriter bswInvalid=null;
+ if(ffoutInvalid!=null){
+ bswInvalid=new ByteStreamWriter(ffoutInvalid);
+ bswInvalid.start();
+ }
+ //Per-ID counter, only used when headers are not kept as a suffix
+ final HashArray1D counts=(countTable && !prefix) ? new HashArray1D(256000, true) : null;
+
+ long readsProcessed=0, basesProcessed=0;
+
+ byte[] line=bf.nextLine();
+ boolean valid=false; //Whether the most recent header resolved to an ID; applies to the sequence lines that follow it
+ while(line!=null){
+ if(line.length>0 && line[0]=='>'){
+ if(maxReads>0 && readsProcessed>=maxReads){break;} //Test the limit before counting, so the reported total never exceeds maxReads
+ readsProcessed++;
+ final int number=GiToNcbi.getID(line);
+ valid=(number>=0);
+ if(valid){
+ validReads++;
+ bsw.print(ncbi);
+ bsw.print(number);
+ if(prefix){
+ bsw.print('|');
+ for(int i=1; i<line.length; i++){ //Append the original header, minus its leading '>'
+ bsw.print(line[i]);
+ }
+ }else if(counts!=null){
+ bsw.print('|');
+ int count=counts.increment(number); //Running count for this ID, written as a suffix after '|'
+ bsw.print(count);
+ if(count==1){taxaCounted++;}
+ }
+ bsw.println();
+ }else{
+ invalidReads++;
+ if(bswInvalid!=null){
+ bswInvalid.println(line);
+ }
+ }
+ }else{
+ basesProcessed+=line.length;
+ if(valid){
+ validBases+=line.length;
+ bsw.println(line);
+ }else{
+ invalidBases+=line.length;
+ if(bswInvalid!=null && keepInvalidSequence){ //By default only the headers of invalid sequences are written
+ bswInvalid.println(line);
+ }
+ }
+ }
+ line=bf.nextLine();
+ }
+
+ errorState|=bf.close();
+ if(bsw!=null){errorState|=bsw.poisonAndWait();}
+ if(bswInvalid!=null){errorState|=bswInvalid.poisonAndWait();}
+
+ t.stop();
+ //Rates are computed per unit of t.elapsed and then scaled for display
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ outstream.println();
+ outstream.println("Valid Sequences: \t"+validReads);
+ outstream.println("Valid Bases: \t"+validBases);
+ outstream.println("Invalid Sequences: \t"+invalidReads);
+ outstream.println("Invalid Bases: \t"+invalidBases);
+ if(counts!=null){
+ outstream.println("Unique Taxa: \t"+taxaCounted);
+ }
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){
+ assert(false) : "printOptions: TODO";
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String out1=null;
+ private String outInvalid=null;
+
+ private String tableFile=null;
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ private long validReads=0;
+ private long validBases=0;
+ private long invalidReads=0;
+ private long invalidBases=0;
+ private long taxaCounted=0;
+
+ private boolean prefix=false;
+ private boolean countTable=true;
+ private boolean keepInvalidSequence=false;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+ private final FileFormat ffout1;
+ private final FileFormat ffoutInvalid;
+
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+ private static final byte[] ncbi=">ncbi|".getBytes();
+
+}
diff --git a/current/tax/SortByTaxa.java b/current/tax/SortByTaxa.java
new file mode 100755
index 0000000..77e61ae
--- /dev/null
+++ b/current/tax/SortByTaxa.java
@@ -0,0 +1,582 @@
+package tax;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+
+import stream.ByteBuilder;
+import stream.ConcurrentGenericReadInputStream;
+import stream.ConcurrentReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.Read;
+import align2.ListNum;
+import align2.Shared;
+import align2.Tools;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ByteFile1;
+import fileIO.ByteFile2;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Mar 11, 2015
+ *
+ */
+public class SortByTaxa {
+
+ public static void main(String[] args){
+ //Command-line entry point; the timer is created before the constructor parses arguments and loads the table and tree
+ final Timer stopwatch=new Timer();
+ new SortByTaxa(args).process(stopwatch);
+ }
+
+ public SortByTaxa(String[] args){
+ //Process any config files, then handle a help request (prints the options and exits)
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+
+ //The assignments below write static fields, so they are visible to every user of these classes
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+
+ FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
+ //Parse each argument as key=value; anything the shared Parser accepts is consumed by it
+ Parser parser=new Parser();
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null; //Only the text between the first and second '=' is kept as the value
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens
+
+ if(parser.parse(arg, a, b)){
+ //do nothing
+ }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
+ tableFile=b;
+ if("auto".equalsIgnoreCase(b)){tableFile=TaxTree.DefaultTableFile;}
+ }else if(a.equals("tree") || a.equals("taxtree")){
+ treeFile=b;
+ if("auto".equalsIgnoreCase(b)){treeFile=TaxTree.DefaultTreeFile;}
+ }else if(a.equals("fuse")){
+ fuse=Tools.parseBoolean(b);
+ }else if(a.equals("dummyreads") || a.equals("adddummies") || a.equals("dummy")){
+ addDummyReads=Tools.parseBoolean(b);
+ }else if(a.equals("dummylevel")){
+ dummyLevel=Integer.parseInt(b); //Numeric level only; a level name is not accepted here
+ }else if(a.equals("promote")){
+ promote=Integer.parseInt(b); //Numeric level only; a level name is not accepted here
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ ByteFile1.verbose=verbose;
+ ByteFile2.verbose=verbose;
+ stream.FastaReadInputStream.verbose=verbose;
+ ConcurrentGenericReadInputStream.verbose=verbose;
+// align2.FastaReadInputStream2.verbose=verbose;
+ stream.FastqReadInputStream.verbose=verbose;
+ ReadWrite.verbose=verbose;
+ }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
+ parser.in1=arg; //A bare first argument that names stdin or an existing file is taken as the input
+ }else if(parser.out1==null && i==1 && !arg.contains("=")){
+ parser.out1=arg; //A bare second argument is taken as the output
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=parser.overwrite;
+ append=parser.append;
+
+ in1=parser.in1;
+
+ out1=parser.out1;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){ //If neither ByteFile mode was forced, force BF1
+ ByteFile.FORCE_MODE_BF2=false;
+ ByteFile.FORCE_MODE_BF1=true;
+ }
+
+ if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
+
+ if(!Tools.testOutputFiles(overwrite, append, false, out1)){
+ outstream.println((out1==null)+", "+out1);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
+ }
+
+ ffout1=FileFormat.testOutput(out1, FileFormat.FA, null, true, overwrite, append, false);
+ ffin1=FileFormat.testInput(in1, FileFormat.FA, null, true, true);
+ //Both the gi table and the tree are optional; tree stays null when no tree= flag was given
+ if(tableFile!=null){
+ outstream.println("Loading gi table.");
+ GiToNcbi.initialize(tableFile);
+ }
+ if(treeFile!=null){
+ outstream.println("Loading tree.");
+ tree=ReadWrite.read(TaxTree.class, treeFile, true);
+ }else{
+ tree=null;
+ }
+ }
+
+ void process(Timer t){
+
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null, null, null);
+ cris.start();
+ if(verbose){outstream.println("Started cris");}
+ }
+
+ long readsProcessed=0;
+ long basesProcessed=0;
+
+ long dummiesAdded=0;
+
+ ArrayList<Read> all=new ArrayList<Read>();
+
+ {
+ outstream.println("Loading sequences.");
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ if(reads!=null && !reads.isEmpty()){
+ Read r=reads.get(0);
+ assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+ }
+
+ while(reads!=null && reads.size()>0){
+
+ for(int idx=0; idx<reads.size(); idx++){
+ final Read r1=reads.get(idx);
+
+ final int initialLength1=r1.length();
+
+ if(tree!=null){
+ TaxNode tn=tree.getNode(r1.id);
+ if(tn!=null){tn.incrementRaw(1);}
+ }
+
+ readsProcessed++;
+ basesProcessed+=initialLength1;
+ }
+
+ all.addAll(reads);
+
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ if(ln!=null){
+ cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+ }
+ }
+
+ errorState|=ReadWrite.closeStream(cris);
+
+ if(addDummyReads){
+ outstream.println("Adding dummies.");
+ for(TaxNode n : tree.nodes){
+ if(n!=null && n.level>=dummyLevel && n.countRaw<1){
+ Read r=new Read(dummyBases, -1, -1, -1, "ncbi|"+n.id, null, all.size(), 0);
+ all.add(r);
+ dummiesAdded++;
+ }
+ }
+ }
+
+ {
+ outstream.println("Sorting.");
+ Collections.sort(all, taxaComparator);
+ }
+
+ if(fuse){
+ outstream.println("Fusing.");
+ ArrayList<Read> fused=new ArrayList<Read>();
+ ArrayList<Read> current=new ArrayList<Read>();
+ final ByteBuilder bb=new ByteBuilder(1000000);
+
+ int taxid=-2;
+ int segment=0;
+ long currentLength=0;
+ for(int i=0; i<all.size(); i++){
+ Read r=all.remove(i);
+ int tax=GiToNcbi.getID(r.id);
+ if(promote>-1){
+ TaxNode n=tree.getNode(tax);
+ assert(n!=null) : "Can't find node for "+r.id+", "+r.numericID+", "+r.length();
+ while(n.level<promote){
+ n=tree.getNode(n.pid);
+ }
+ tax=n.id;
+ }
+ if(tax!=taxid || r.length()+currentLength>MAX_FUSE_LENGTH){
+ Read x=fuse(current, taxid, segment, bb);
+ current.clear();
+ currentLength=0;
+ if(tax==taxid){segment++;}
+ else{segment=0;}
+ if(x!=null){
+ fused.add(x);
+ }
+ }
+ current.add(r);
+ currentLength+=(r.length()+padding);
+ taxid=tax;
+ }
+ {
+ Read x=fuse(current, taxid, segment, bb);
+ current.clear();
+ if(x!=null){
+ fused.add(x);
+ }
+ }
+ all=fused;
+ }
+
+ if(out1!=null){
+ outstream.println("Writing output.");
+ final ConcurrentReadOutputStream ros;
+ final int buff=4;
+
+ assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
+
+ ros=ConcurrentReadOutputStream.getStream(ffout1, null, null, null, buff, null, false);
+ ros.start();
+
+ ArrayList<Read> list=new ArrayList<Read>();
+ long num=0;
+ for(Read r : all){
+ list.add(r);
+ if(list.size()>=200){
+ ros.add(list, num);
+ num++;
+ list=new ArrayList<Read>();
+ }
+ }
+ if(list.size()>0){
+ ros.add(list, num);
+ num++;
+ }
+
+ errorState|=ReadWrite.closeStream(ros);
+ }
+
+ t.stop();
+
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println();
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+
+ outstream.println();
+ if(dummiesAdded>0){
+ outstream.println("Dummies Added: "+dummiesAdded);
+ }
+
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+ /**
+ * Merges a list of reads into a single read by concatenating their bases.
+ * Padding 'N' bases are inserted before a non-empty read whenever the buffer already holds bases.
+ * The first read in the list is reused as the result, and its quality is discarded when merging.
+ * @param current Reads to merge; left unmodified (the caller clears it)
+ * @param taxid Tax ID used to build the name of the result
+ * @param segment Segment index; only appended to the name when the list holds a single read
+ * @param bb Scratch buffer; reset at the start of a multi-read merge
+ * @return The merged read, or null if current is empty
+ */
+ private Read fuse(ArrayList<Read> current, int taxid, int segment, ByteBuilder bb) {
+ if(current.isEmpty()){return null;}
+ if(current.size()==1){
+ Read x=current.get(0);
+ x.id="ncbi|"+taxid+"|"+segment;
+ return x;
+ }
+ Read x=current.get(0);
+ bb.setLength(0);
+ long lensum=0;
+ Read last=null;
+ try {
+ for(int i=0; i<current.size(); i++){
+ Read r=current.get(i); //Was remove(i), which shifted the list and skipped every second read
+ last=r;
+ lensum+=r.length();
+ if(bb.length()>0 && r.length()>0){
+ for(int j=0; j<padding; j++){
+ bb.append('N');
+ lensum++;
+ }
+ }
+ bb.append(r.bases);
+ }
+ } catch (Throwable e) {//Best effort: report and return what was accumulated; the diagnostics must not throw themselves
+ System.err.println(lensum+", "+(last==null ? "null" : last.length()+", "+last.id)+"\n"+current.size()+", "+taxid+", "+(tree==null ? null : tree.getNode(taxid)));
+ System.err.println(e);
+ }
+ x.bases=bb.toBytes();
+ x.quality=null;
+ //NOTE(review): unlike the single-read case, this name omits the segment index; confirm whether that is intended
+ x.id="ncbi|"+taxid;
+ return x;
+ }
+
+ @SuppressWarnings("unused")
+ private void validate(ArrayList<Read> all){ //Debug helper: checks taxaComparator on every pair and triple of reads; all checks are asserts
+ for(Read a : all){
+ for(Read b : all){
+ assert(taxaComparator.compare(a, b)==-taxaComparator.compare(b, a)) : (verbose=true)+"\n"+a.id+", "+b.id+"\n"+ //Swapping arguments must negate the result exactly; a failure also sets verbose
+ taxaComparator.compare(a, b)+"\n"+taxaComparator.compare(b, a);
+ for(Read c : all){ //Third nested loop: the number of comparisons grows with the cube of the list size
+ int ab=taxaComparator.compare(a, b);
+ int bc=taxaComparator.compare(b, c);
+ int ca=taxaComparator.compare(c, a);
+ int zero=(ab==0 ? 1 : 0)+(bc==0 ? 1 : 0)+(ca==0 ? 1 : 0); //How many of the three comparisons are ties
+ int more=(ab>0 ? 1 : 0)+(bc>0 ? 1 : 0)+(ca>0 ? 1 : 0); //How many are positive
+ int less=(ab<0 ? 1 : 0)+(bc<0 ? 1 : 0)+(ca<0 ? 1 : 0); //How many are negative
+ assert(zero+more+less==3) : a.id+", "+b.id+", "+c.id+"; "+ab+", "+bc+", "+ca;
+ assert(zero==0 || zero==1 || zero==3) : a.id+", "+b.id+", "+c.id+"; "+ab+", "+bc+", "+ca; //Exactly two ties would force the third to be a tie as well
+ if(ab==0 && bc==0){assert(ca==0) : a.id+", "+b.id+", "+c.id+"; "+ab+", "+bc+", "+ca;}
+ if(zero==0){ //Around the cycle a->b->c->a the signs cannot all agree
+ assert(less>0 && more>0) : a.id+", "+b.id+", "+c.id+"; "+ab+", "+bc+", "+ca;
+ }else if(zero==1){ //No check is active for this case; the one below is disabled
+// assert(less==2 || more==2) : a.id+", "+b.id+", "+c.id+"; "+ab+", "+bc+", "+ca;
+ }
+ }
+ }
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+
+ private void printOptions(){
+ //Prints usage rather than failing an assertion, so that callers can still exit normally or throw their own error
+ outstream.println("Syntax:\n");
+ outstream.println("java -ea -cp <path> tax.SortByTaxa <infile> <outfile> table=<gi table> tree=<tax tree>");
+ outstream.println("\nOther parameters and their defaults:\n");
+ outstream.println("table=<file> \t(gi, gitable) Table for translating gi numbers; 'auto' uses the default file");
+ outstream.println("tree=<file> \t(taxtree) Taxonomy tree file; 'auto' uses the default file");
+ outstream.println("fuse=false \tConcatenate sequences that share a taxID into a single sequence");
+ outstream.println("promote=0 \tTreat each sequence as belonging to its ancestor at this numeric level or higher");
+ outstream.println("dummyreads=true \t(adddummies, dummy) Add a 1-base dummy sequence for tree nodes with no sequences");
+ outstream.println("dummylevel=<int> \tOnly add dummies for nodes at this numeric level or higher (default: species)");
+ outstream.println("verbose=false \tPrint verbose messages");
+ //Flags handled by the shared Parser (for example overwrite and append) are accepted as well
+ }
+
+ public final class TaxaComparator implements Comparator<Read>{
+
+ @Override
+ public int compare(Read a, Read b) {
+ //Sort keys, in order: position in the tree (if one is loaded), tax ID, length (longest first), name
+ final int taxA=GiToNcbi.getID(a.id);
+ final int taxB=GiToNcbi.getID(b.id);
+
+ //Key 1: tree-based order; zero means the tree does not separate the two IDs
+ final int byTree=(tree==null ? 0 : compareWithTree(taxA, taxB));
+ if(byTree!=0){
+ return byTree;
+ }
+
+ //Key 2: ascending numeric tax ID
+ final int byTax=taxA-taxB;
+ if(byTax!=0){
+ return byTax;
+ }
+
+ //Key 3: longer sequences first
+ final int lenA=a.length(), lenB=b.length();
+ if(lenA!=lenB){
+ return lenB-lenA;
+ }
+ //Key 4: name, in String order
+ return a.id.compareTo(b.id);
+ }
+
+// public int compareWithTree(int a, int b){
+// if(a==b){return 0;}
+// if(a==-1){return 1;}
+// if(b==-1){return -1;}
+// TaxNode na=tree.getNode(a);
+// TaxNode nb=tree.getNode(b);
+// if(na==nb){return 0;}
+// if(na==null){return 1;}
+// if(nb==null){return -1;}
+// while(na.level<promote){na=tree.getNode(na.pid);}
+// while(nb.level<promote){nb=tree.getNode(nb.pid);}
+// while(na.pid!=nb.pid){
+// if(na.pid==nb.id){
+// return 1;
+// }else if(nb.pid==na.id){
+// return -1;
+// }
+// if(na.level<=nb.level){
+// na=tree.getNode(na.pid);
+// }else{
+// nb=tree.getNode(nb.pid);
+// }
+// }
+// assert(na.id>-1 && nb.id>-1);
+// return na.id-nb.id;
+// }
+
+// public int compareWithTree(int a, int b){
+// if(verbose){System.err.print("e");}
+// if(a==b){return 0;}
+// if(verbose){System.err.print("f");}
+// if(a==-1){return 1;}
+// if(verbose){System.err.print("g");}
+// if(b==-1){return -1;}
+// if(verbose){System.err.print("h");}
+// TaxNode na=tree.getNode(a);
+// TaxNode nb=tree.getNode(b);
+// if(na==nb){return 0;}
+// if(verbose){System.err.print("i");}
+// if(na==null){return 1;}
+// if(verbose){System.err.print("j");}
+// if(nb==null){return -1;}
+// if(verbose){System.err.print("k");}
+// while(na.level<promote){na=tree.getNode(na.pid);}
+// while(nb.level<promote){nb=tree.getNode(nb.pid);}
+// while(na.pid!=nb.pid && na.pid!=nb.id && nb.pid!=na.id){
+// TaxNode pa=tree.getNode(na.pid);
+// TaxNode pb=tree.getNode(nb.pid);
+// if(pa.level<=pb.level){
+// if(verbose){System.err.println(na.id+", lv "+na.level+" promoted to "+na.pid+", level "+pa.level);}
+// na=pa;
+// }else{
+// if(verbose){System.err.println(nb.id+", lv "+nb.level+" promoted to "+nb.pid+", level "+pb.level);}
+// nb=pb;
+// }
+// }
+// if(na==nb){return 0;}
+// if(na.pid==nb.id){
+// if(verbose){System.err.println("\na -> b");
+// System.err.println(a+" -> "+b);
+// System.err.println(na+" -> "+nb);}
+// return 1;
+// }else if(nb.pid==na.id){
+// if(verbose){System.err.println("\nb -> a");
+// System.err.println(b+" -> "+a);
+// System.err.println(nb+" -> "+na);}
+// return -1;
+// }
+// if(verbose){System.err.print("n");}
+// assert(na.id>-1 && nb.id>-1);
+// return na.id-nb.id;
+// }
+
+ public int compareWithTree(int a, int b){ //Orders two tax IDs by their position in the tree; 0 means the tree does not separate them
+ if(a==b){return 0;}
+ if(a==-1){return 1;} //An ID of -1 sorts after any other ID
+ if(b==-1){return -1;}
+ TaxNode na=tree.getNode(a);
+ TaxNode nb=tree.getNode(b);
+ if(na==nb){return 0;}
+ if(na==null){return 1;} //An ID with no node in the tree sorts after one that has a node
+ if(nb==null){return -1;}
+ while(na.level<promote){na=tree.getNode(na.pid);} //Climb to the first ancestor whose level is at least promote
+ while(nb.level<promote){nb=tree.getNode(nb.pid);} //NOTE(review): both climbs assume getNode(pid) never returns null - confirm
+ while(na.pid!=nb.pid && na.pid!=nb.id && nb.pid!=na.id){ //Climb until the nodes share a parent or one is the other's parent
+ TaxNode pa=tree.getNode(na.pid);
+ TaxNode pb=tree.getNode(nb.pid);
+ if(pa.level<=pb.level){ //Raise the side whose parent has the lower level; ties raise a
+ na=pa;
+ }else{
+ nb=pb;
+ }
+ }
+ if(na==nb){return 0;}
+ if(na.pid==nb.id){ //a is now a direct child of b: the child sorts after its parent
+ return 1;
+ }else if(nb.pid==na.id){
+ return -1;
+ }
+ assert(na.id>-1 && nb.id>-1);
+ return na.id-nb.id; //Same parent: order by node ID
+ }
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+
+ private String in1=null;
+ private String out1=null;
+
+ private String tableFile=null;
+ private String treeFile=null;
+
+ private boolean fuse=false;
+ private int promote=0;
+ private int padding=3;
+
+ private boolean addDummyReads=true;
+ private int dummyLevel=TaxTree.stringToLevel("species");
+
+ private final TaxTree tree;
+ private final TaxaComparator taxaComparator=new TaxaComparator();
+ private final byte[] dummyBases=new byte[] {'N'};
+
+ /*--------------------------------------------------------------*/
+
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+
+ private final FileFormat ffin1;
+
+ private final FileFormat ffout1;
+
+ /*--------------------------------------------------------------*/
+
+ private PrintStream outstream=System.err;
+ public static boolean verbose=false;
+ public boolean errorState=false;
+ private boolean overwrite=false;
+ private boolean append=false;
+
+ private static int MAX_FUSE_LENGTH=500000000;
+
+}
diff --git a/current/tax/SplitByTaxa.java b/current/tax/SplitByTaxa.java
new file mode 100755
index 0000000..fbf9af3
--- /dev/null
+++ b/current/tax/SplitByTaxa.java
@@ -0,0 +1,426 @@
+package tax;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import stream.ConcurrentReadInputStream;
+import stream.FASTQ;
+import stream.FastaReadInputStream;
+import stream.ConcurrentReadOutputStream;
+import stream.Read;
+
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteFile;
+import fileIO.ReadWrite;
+import fileIO.FileFormat;
+
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+
+/**
+ * Filters sequences according to their taxonomy,
+ * as determined by the sequence name. Sequences should
+ * be labeled with a gi number or NCBI taxID.
+ *
+ * @author Brian Bushnell
+ * @date November 23, 2015
+ *
+ */
+public class SplitByTaxa {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Code entrance from the command line.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+ //The timer is created before the constructor parses arguments and loads the gi table and tree
+ final Timer stopwatch=new Timer();
+ new SplitByTaxa(args).process(stopwatch);
+ }
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public SplitByTaxa(String[] args){
+
+ //Process any config files
+ args=Parser.parseConfig(args);
+
+ //Detect whether the user needs help
+ if(Parser.parseHelp(args, true)){
+ printOptions();
+ System.exit(0);
+ }
+
+ //Print the program name and arguments
+ outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ boolean setInterleaved=false; //Whether interleaved was explicitly set.
+
+ //Set some shared static variables regarding PIGZ
+ Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH);
+ Shared.capBuffers(4);
+ ReadWrite.USE_UNPIGZ=true;
+ ReadWrite.USE_PIGZ=false;
+ ReadWrite.USE_GZIP=false;
+ ReadWrite.MAX_ZIP_THREADS=Shared.threads();
+ //Paths to the gi table and taxonomy tree; only used during construction, so they are locals rather than fields
+ String tableFile=null;
+ String treeFile=null;
+
+ //Create a parser object
+ Parser parser=new Parser();
+
+ //Parse each argument
+ for(int i=0; i<args.length; i++){
+ String arg=args[i];
+
+ //Break arguments into their constituent parts, in the form of "a=b"
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null; //Only the text between the first and second '=' is kept as the value
+ if(b==null || b.equalsIgnoreCase("null")){b=null;}
+ while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+
+ if(parser.parse(arg, a, b)){//Parse standard flags in the parser
+ //do nothing
+ }else if(a.equals("verbose")){
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("taxlevel") || a.equals("level")){
+ taxLevel=Integer.parseInt(b); //Numeric level only; a level name is not accepted here
+ }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
+ tableFile=b;
+ if("auto".equalsIgnoreCase(b)){tableFile=TaxTree.DefaultTableFile;}
+ }else if(a.equals("tree") || a.equals("taxtree")){
+ treeFile=b;
+ if("auto".equalsIgnoreCase(b)){treeFile=TaxTree.DefaultTreeFile;}
+ }else{
+ outstream.println("Unknown parameter "+args[i]);
+ assert(false) : "Unknown parameter "+args[i]; //Only fatal when assertions are enabled
+ // throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ maxReads=parser.maxReads;
+
+ overwrite=ReadStats.overwrite=parser.overwrite;
+ append=ReadStats.append=parser.append;
+ setInterleaved=parser.setInterleaved;
+
+ in1=parser.in1;
+ in2=parser.in2;
+
+ out1=parser.out1;
+ out2=parser.out2;
+
+ extin=parser.extin;
+ extout=parser.extout;
+ }
+
+ //Do input file # replacement
+ if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
+ in2=in1.replace("#", "2");
+ in1=in1.replace("#", "1");
+ }
+
+ //Do output file # replacement
+ if(out1!=null && out2==null && out1.indexOf('#')>-1){
+ out2=out1.replace("#", "2");
+ out1=out1.replace("#", "1");
+ }
+ //Output names are patterns that must contain '%' (only enforced when assertions are enabled)
+ assert(out1==null || out1.contains("%")) : "Output filename must contain % symbol.";
+ assert(out2==null || out2.contains("%")) : "Output filename must contain % symbol.";
+
+ //Adjust interleaved detection based on the number of input files
+ if(in2!=null){
+ if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ }
+
+ assert(FastaReadInputStream.settingsOK());
+
+ //Ensure there is an input file
+ if(in1==null){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ //Adjust the number of threads for input file reading
+ if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
+ ByteFile.FORCE_MODE_BF2=true;
+ }
+
+ //Ensure out2 is not set without out1
+ if(out1==null){
+ if(out2!=null){
+ printOptions();
+ throw new RuntimeException("Error - cannot define out2 without defining out1.");
+ }
+ }
+
+ //Adjust interleaved settings based on number of output files
+ if(!setInterleaved){
+ assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
+ if(in2!=null){ //If there are 2 input streams.
+ FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }else{ //There is one input stream.
+ if(out2!=null){
+ FASTQ.FORCE_INTERLEAVED=true;
+ FASTQ.TEST_INTERLEAVED=false;
+ outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
+ }
+ }
+ }
+
+ //Ensure output files can be written
+ if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
+ outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
+ throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
+ }
+
+ //Ensure input files can be read
+ if(!Tools.testInputFiles(false, true, in1, in2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+
+ //Ensure that no file was specified multiple times
+ if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){
+ throw new RuntimeException("\nSome file names were specified multiple times.\n");
+ }
+
+ //Create input FileFormat objects
+ ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
+ ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
+ //Load the gi table and the taxonomy tree through TaxFilter
+ TaxFilter.loadGiTable(tableFile);
+ tree=TaxFilter.loadTree(treeFile);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Create read streams and process all data */
+ public void process(Timer t){
+ //t is stopped here and used for the throughput report
+ //Create a read input stream
+ final ConcurrentReadInputStream cris;
+ {
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
+ cris.start(); //Start the stream
+ if(verbose){outstream.println("Started cris");}
+ }
+ boolean paired=cris.paired();
+ if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
+
+ //Reset counters
+ readsProcessed=0;
+ basesProcessed=0;
+ //Output streams keyed by String; the map is filled by processInner and closed below
+ final HashMap<String, ConcurrentReadOutputStream> map=new HashMap<String, ConcurrentReadOutputStream>();
+
+ //Process the read stream
+ processInner(cris, map);
+
+ if(verbose){outstream.println("Finished; closing streams.");}
+
+ //Write anything that was accumulated by ReadStats
+ errorState|=ReadStats.writeAll();
+ //Close the read streams
+ errorState|=ReadWrite.closeStream(cris);
+ //Close every output stream, recording failures so that a bad write is reported below instead of being lost
+ for(ConcurrentReadOutputStream ros : map.values()){
+ errorState|=ReadWrite.closeStream(ros);
+ }
+
+ //Report timing and results
+ {
+ t.stop();
+
+ //Calculate units per nanosecond
+ double rpnano=readsProcessed/(double)(t.elapsed);
+ double bpnano=basesProcessed/(double)(t.elapsed);
+
+ //Add "k" and "m" for large numbers
+ String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m");
+ String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m");
+
+ //Left-pad the strings so that they are right-justified
+ while(rpstring.length()<8){rpstring=" "+rpstring;}
+ while(bpstring.length()<8){bpstring=" "+bpstring;}
+
+ outstream.println("Reads In: \t"+readsProcessed+" reads \t"+basesProcessed+" bases");
+ outstream.println();
+
+ outstream.println("Time: \t"+t);
+ outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000));
+ outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000));
+ }
+
+ //Throw an exception if there was an error in a thread
+ if(errorState){
+ throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
+ }
+ }
+
+    /**
+     * Iterate through the reads, resolve each read's taxonomic node from its id,
+     * promote that node to the configured taxLevel, and append the read to the
+     * output stream belonging to the resulting taxon.
+     * Output streams are created on first use; the '%' symbol in out1/out2 is
+     * replaced by the taxon name (whitespace becomes '_', slashes and
+     * backslashes are removed).
+     * @param cris Started input stream to drain
+     * @param map Output streams keyed by taxon name; new streams are added here
+     * and are closed by the caller
+     */
+    void processInner(final ConcurrentReadInputStream cris, HashMap<String, ConcurrentReadOutputStream> map){
+
+        //Grab the first ListNum of reads
+        ListNum<Read> ln=cris.nextList();
+        //Grab the actual read list from the ListNum
+        ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+        //Check to ensure pairing is as expected
+        if(reads!=null && !reads.isEmpty()){
+            Read r=reads.get(0);
+            assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
+        }
+
+        //As long as there is a nonempty read list...
+        while(reads!=null && reads.size()>0){
+            if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
+
+            //Loop through each read in the list
+            for(int idx=0; idx<reads.size(); idx++){
+                final Read r1=reads.get(idx);
+
+                //Track the initial length for statistics
+                final int initialLength1=r1.length();
+                final int initialLength2=(r1.mateLength());
+
+                //Increment counters
+                readsProcessed+=1+r1.mateCount();
+                basesProcessed+=initialLength1+initialLength2;
+
+                //Resolve the node by number first, then by name; fall back to UNKNOWN
+                TaxNode tn=tree.getNode(r1.id);
+                if(tn==null){tn=tree.getNodeByName(r1.id);}
+                if(tn==null){tn=unknown;}
+                //Climb until the requested level or a self-parented node is reached
+                while(tn.level<taxLevel && tn.id!=tn.pid){tn=tree.getNode(tn.pid);}
+
+                if(out1!=null){
+                    ConcurrentReadOutputStream ros=map.get(tn.name);
+                    if(ros==null){
+                        final int buff=4;
+                        //Sanitize the name once.  It is quoted because replaceFirst
+                        //interprets '$' and '\' in the replacement string.
+                        final String token=java.util.regex.Matcher.quoteReplacement(
+                                tn.name.replaceAll("\\s+", "_").replaceAll("[/\\\\]", ""));
+                        final FileFormat ffout1=FileFormat.testOutput(out1.replaceFirst("%", token), FileFormat.FASTQ, extout, false, overwrite, append, ordered);
+                        final FileFormat ffout2=(out2==null ? null :
+                            FileFormat.testOutput(out2.replaceFirst("%", token), FileFormat.FASTQ, extout, false, overwrite, append, ordered));
+                        ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false);
+                        ros.start(); //Start the stream
+                        map.put(tn.name, ros);
+                    }
+                    ArrayList<Read> temp=new ArrayList<Read>(1); //Kind of inefficient
+                    temp.add(r1);
+                    ros.add(temp, 0);
+                }
+            }
+
+            //Notify the input stream that the list was used
+            cris.returnList(ln.id, ln.list.isEmpty());
+            if(verbose){outstream.println("Returned a list.");}
+
+            //Fetch a new list
+            ln=cris.nextList();
+            reads=(ln!=null ? ln.list : null);
+        }
+
+        //Notify the input stream that the final list was used
+        if(ln!=null){
+            cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+        }
+    }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Intended to be called if the program runs with no parameters.
+ * Not implemented: it unconditionally throws a RuntimeException with the
+ * message "TODO" instead of printing usage information.
+ */
+ private void printOptions(){
+ throw new RuntimeException("TODO");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file path */
+ private String in1=null;
+ /** Secondary input file path */
+ private String in2=null;
+
+ /** Primary output file path; the first '%' symbol is replaced by the taxon name */
+ private String out1=null;
+ /** Secondary output file path; the first '%' symbol is replaced by the taxon name */
+ private String out2=null;
+
+ /** Override input file extension */
+ private String extin=null;
+ /** Override output file extension */
+ private String extout=null;
+
+ /** Taxonomic level at which reads are grouped: a read's node is promoted to
+ * its ancestors until this level is reached. Defaults to phylum. */
+ private int taxLevel=TaxTree.stringToLevel("phylum");
+
+ /*--------------------------------------------------------------*/
+
+ /** Number of reads processed */
+ protected long readsProcessed=0;
+ /** Number of bases processed */
+ protected long basesProcessed=0;
+
+ /** Quit after processing this many input reads; -1 means no limit */
+ private long maxReads=-1;
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Primary input file */
+ private final FileFormat ffin1;
+ /** Secondary input file */
+ private final FileFormat ffin2;
+
+ /** Taxonomy tree used to resolve read names to nodes */
+ private final TaxTree tree;
+
+ /** Placeholder node for reads that cannot be resolved. Its id equals its
+ * parent id, so the ancestor walk stops on it. This initializer runs before
+ * the constructor body, so the node's level is the default taxLevel (phylum)
+ * even if the constructor later assigns a different taxLevel. */
+ private final TaxNode unknown=new TaxNode(-99, -99, taxLevel, "UNKNOWN");
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Common Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Print status messages to this output stream */
+ private PrintStream outstream=System.err;
+ /** Print verbose messages */
+ public static boolean verbose=false;
+ /** True if an error was encountered */
+ public boolean errorState=false;
+ /** Overwrite existing output files */
+ private boolean overwrite=false;
+ /** Append to existing output files */
+ private boolean append=false;
+ /** This flag has no effect on singlethreaded programs */
+ private final boolean ordered=false;
+
+}
diff --git a/current/tax/TaxFilter.java b/current/tax/TaxFilter.java
new file mode 100755
index 0000000..3212173
--- /dev/null
+++ b/current/tax/TaxFilter.java
@@ -0,0 +1,242 @@
+package tax;
+
+import java.io.PrintStream;
+import java.util.HashSet;
+
+import stream.Read;
+
+import fileIO.ReadWrite;
+
+import align2.Tools;
+
+/**
+ * Filters sequences by taxonomy.  A filter holds a set of NCBI TaxIDs; a name or
+ * node passes when one of the tested nodes on its ancestor path is in the set
+ * (include mode) or when none is (exclude mode).
+ * @author Brian Bushnell
+ * @date Nov 30, 2015
+ *
+ */
+public class TaxFilter {
+
+    /*--------------------------------------------------------------*/
+    /*----------------         Constructors         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * Factory method that builds a filter from command line arguments of the
+     * form "key=value".  Recognized keys are listed in {@link #validArgument(String)};
+     * unrecognized arguments are silently ignored.
+     * @param args Command line arguments
+     * @return A filter with the gi table and tree loaded (when specified) and
+     * all requested names and ids added
+     * @throws IllegalArgumentException if level/taxlevel is given without a value
+     */
+    public static TaxFilter makeFilter(String[] args){
+
+        String names=null;
+        String ids=null;
+
+        String tableFile=null;
+        String treeFile=null;
+
+        int taxLevel=0;
+        boolean include=false;
+
+        for(int i=0; i<args.length; i++){
+            String arg=args[i];
+
+            //Break arguments into their constituent parts, in the form of "a=b"
+            String[] split=arg.split("=");
+            String a=split[0].toLowerCase();
+            String b=split.length>1 ? split[1] : null;
+            if(b==null || b.equalsIgnoreCase("null")){b=null;}
+            while(a.startsWith("-")){a=a.substring(1);} //Strip leading hyphens
+
+            if(a.equals("table") || a.equals("gi")){
+                tableFile=b;
+                if("auto".equalsIgnoreCase(b)){tableFile=TaxTree.DefaultTableFile;}
+            }else if(a.equals("tree") || a.equals("taxtree")){
+                treeFile=b;
+                if("auto".equalsIgnoreCase(b)){treeFile=TaxTree.DefaultTreeFile;}
+            }else if(a.equals("level") || a.equals("taxlevel")){
+                //A missing or empty value used to fail with an opaque
+                //NullPointerException/StringIndexOutOfBoundsException on charAt(0)
+                if(b==null || b.length()<1){
+                    throw new IllegalArgumentException("No value specified for '"+a+"'; expected a number or a level name.");
+                }
+                if(Character.isDigit(b.charAt(0))){
+                    taxLevel=Integer.parseInt(b);
+                }else{
+                    taxLevel=TaxTree.stringToLevel(b.toLowerCase());
+                }
+            }else if(a.equals("name") || a.equals("names")){
+                names=b;
+            }else if(a.equals("include")){
+                include=Tools.parseBoolean(b);
+            }else if(a.equals("exclude")){
+                include=!Tools.parseBoolean(b);
+            }else if(a.equals("id") || a.equals("ids") || a.equals("taxid") || a.equals("taxids")){
+                ids=b;
+            }
+        }
+
+        TaxFilter filter=new TaxFilter(tableFile, treeFile, taxLevel, include, null);
+        filter.addNames(names);
+        filter.addNumbers(ids);
+
+        return filter;
+    }
+
+    /**
+     * Constructor for an already-loaded tree.
+     * @param tree_ Taxonomy tree used for all lookups
+     * @param taxLevel_ Level up to which ancestors of added nodes are also added
+     * @param include_ True to pass only matching taxa; false to pass only non-matching taxa
+     * @param taxSet_ Initial set of TaxIDs; a new empty set is created when null
+     */
+    public TaxFilter(TaxTree tree_, int taxLevel_, boolean include_, HashSet<Integer> taxSet_){
+        tree=tree_;
+        taxLevel=taxLevel_;
+        include=include_;
+        taxSet=(taxSet_==null ? new HashSet<Integer>() : taxSet_);
+    }
+
+    /**
+     * Constructor that loads the gi table and the tree from files.
+     * @param tableFile Path of the gi table, or null to skip loading it
+     * @param treeFile Path of the serialized TaxTree; when null the tree stays null
+     * @param taxLevel_ Level up to which ancestors of added nodes are also added
+     * @param include_ True to pass only matching taxa; false to pass only non-matching taxa
+     * @param taxSet_ Initial set of TaxIDs; a new empty set is created when null
+     */
+    public TaxFilter(String tableFile, String treeFile, int taxLevel_, boolean include_, HashSet<Integer> taxSet_){
+        taxLevel=taxLevel_;
+        include=include_;
+        taxSet=(taxSet_==null ? new HashSet<Integer>() : taxSet_);
+
+        loadGiTable(tableFile);
+        tree=loadTree(treeFile);
+    }
+
+    /**
+     * @param a Argument key, already lower-cased and stripped of leading hyphens
+     * @return True if makeFilter recognizes this key
+     */
+    public static boolean validArgument(String a){
+        if(a.equals("table") || a.equals("gi")){
+        }else if(a.equals("tree") || a.equals("taxtree")){
+        }else if(a.equals("level") || a.equals("taxlevel")){
+        }else if(a.equals("name") || a.equals("names")){
+        }else if(a.equals("include")){
+        }else if(a.equals("exclude")){
+        }else if(a.equals("id") || a.equals("ids") || a.equals("taxid") || a.equals("taxids")){
+        }else{
+            return false;
+        }
+        return true;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Initialization        ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Initialize GiToNcbi from the given file; does nothing when fname is null. */
+    static void loadGiTable(String fname){
+        if(fname==null){return;}
+        if(PRINT_STUFF){outstream.println("Loading gi table.");}
+        GiToNcbi.initialize(fname);
+    }
+
+    /**
+     * Read a serialized TaxTree and make sure its name maps exist.
+     * @param fname Path of the tree file
+     * @return The loaded tree, or null when fname is null
+     */
+    static TaxTree loadTree(String fname){
+        if(fname==null){return null;}
+        if(PRINT_STUFF){outstream.println("Loading tree.");}
+        TaxTree tt=ReadWrite.read(TaxTree.class, fname, true);
+        if(tt.nameMap==null){
+            if(PRINT_STUFF){outstream.println("Hashing names.");}
+            tt.hashNames();
+        }
+        assert(tt.nameMap!=null);
+        return tt;
+    }
+
+    /** Add every name in a comma-delimited list; does nothing when names is null. */
+    public void addNames(String names){
+        if(names==null){return;}
+        String[] array=names.split(",");
+        for(String name : array){
+            addName(name);
+        }
+    }
+
+    /**
+     * Add the node with this name; the name is looked up first by name and then
+     * through TaxTree.getNode(String).
+     * @return True if a node was found and added
+     */
+    public boolean addName(String name){
+        TaxNode tn=tree.getNodeByName(name);
+        if(tn==null){tn=tree.getNode(name);}
+        assert(tn!=null) : "Could not find a node for '"+name+"'";
+        return addNode(tn);
+    }
+
+    /** Add every TaxID in a comma-delimited list; does nothing when numbers is null. */
+    public void addNumbers(String numbers){
+        if(numbers==null){return;}
+        String[] array=numbers.split(",");
+        for(String s : array){
+            final int x=Integer.parseInt(s);
+            addNumber(x);
+        }
+    }
+
+    /**
+     * Add the node with this TaxID.
+     * @return True if a node was found and added
+     */
+    public boolean addNumber(int taxID){
+        TaxNode tn=tree.getNode(taxID);
+        assert(tn!=null) : "Could not find a node for '"+taxID+"'";
+        return addNode(tn);
+    }
+
+    /**
+     * Add a node to the set, followed by each ancestor whose level is at most
+     * taxLevel.  The walk stops at the first ancestor that is missing, exceeds
+     * taxLevel, or is its own parent.
+     * @return False if tn is null, otherwise true
+     */
+    public boolean addNode(TaxNode tn){
+        if(tn==null){return false;}
+        do{
+            taxSet.add(tn.id);
+            tn=tree.getNode(tn.pid);
+        }while(tn!=null && tn.level<=taxLevel && tn.id!=tn.pid);
+        return true;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Outer Methods         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Test a read using its id as the name. */
+    boolean passesFilter(final Read r){
+        return passesFilter(r.id);
+    }
+
+    /**
+     * Test a sequence name.  An empty filter passes everything in exclude mode
+     * and nothing in include mode.
+     * NOTE(review): when the name cannot be resolved and assertions are
+     * disabled, the null node is passed on and dereferenced.
+     */
+    boolean passesFilter(final String name){
+        if(taxSet.isEmpty()){return !include;}
+        TaxNode tn=tree.getNode(name);
+        if(tn==null){tn=tree.getNodeByName(name);}
+        assert(tn!=null) : "Could not find node for '"+name+"'";
+        return passesFilter(tn);
+    }
+
+    /** Test a numeric TaxID; see {@link #passesFilter(TaxNode)}. */
+    boolean passesFilter(final int id){
+        if(taxSet.isEmpty()){return !include;}
+        TaxNode tn=tree.getNode(id);
+        assert(tn!=null) : "Could not find node number "+id;
+        return passesFilter(tn);
+    }
+
+    /**
+     * Test a node: the node itself and then its ancestors are looked up in the
+     * set.  The walk ends at the first hit, at a missing parent, or before a
+     * self-parented ancestor (which is therefore not tested unless it is the
+     * starting node).
+     * @return True if the match result agrees with the include flag
+     */
+    boolean passesFilter(TaxNode tn){
+        if(taxSet.isEmpty()){return !include;}
+        assert(tn!=null) : "Null TaxNode.";
+        boolean found=false;
+        do{
+            found=taxSet.contains(tn.id);
+            tn=tree.getNode(tn.pid);
+        }while(!found && tn!=null && tn.id!=tn.pid);
+        return include==found;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Fields            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Taxonomy tree used for lookups; null when no tree file was given */
+    private final TaxTree tree;
+
+    /** Level at which to filter */
+    private final int taxLevel;
+
+    /** Set of numeric NCBI TaxIDs */
+    private final HashSet<Integer> taxSet;
+
+    /** True to retain matching taxa, false to discard them */
+    private final boolean include;
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Static Fields         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** Print status messages to this output stream */
+    private static PrintStream outstream=System.err;
+
+    /** Print loading messages */
+    static boolean PRINT_STUFF=true;
+
+}
diff --git a/current/tax/TaxNode.java b/current/tax/TaxNode.java
new file mode 100755
index 0000000..4ef0396
--- /dev/null
+++ b/current/tax/TaxNode.java
@@ -0,0 +1,101 @@
+package tax;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+import align2.Tools;
+
+/**
+ * Represents a taxonomic identifier, such as a specific genus.
+ * Includes the name, NCBI numeric id, parent id, and taxonomic level.
+ * @author Brian Bushnell
+ * @date Mar 6, 2015
+ *
+ */
+public class TaxNode implements Serializable{
+
+    /**
+     *
+     */
+    private static final long serialVersionUID = -926484721933977347L;
+
+    /** Create a node whose parent and level are not yet known (both -1). */
+    public TaxNode(int id_, String name_){
+        this(id_, -1, -1, name_);
+    }
+
+    /**
+     * @param id_ Numeric id of this node
+     * @param parent_ Numeric id of the parent node
+     * @param level_ Taxonomic level, or a negative number if unknown
+     * @param name_ Name of this node
+     */
+    public TaxNode(int id_, int parent_, int level_, String name_){
+        id=id_;
+        pid=parent_;
+        level=level_;
+        name=name_;
+    }
+
+    /**
+     * Test whether a name path matches this node and its ancestors:
+     * split[idx] is compared (ignoring case) with this node, split[idx-1]
+     * with its parent, and so on down to split[0].
+     * @param split Name components, most general first
+     * @param idx Index of the component that must match this node; a negative
+     * index matches trivially
+     * @param tree Tree used to look up parents
+     * @return True if every component from idx down to 0 matches.  False is
+     * returned (rather than throwing) if an ancestor is missing while
+     * components remain.
+     */
+    public boolean matchesName(String[] split, int idx, TaxTree tree) {
+        TaxNode node=this;
+        for(int i=idx; i>=0; i--){
+            if(node==null || !split[i].equalsIgnoreCase(node.name)){return false;}
+            //Only dereferenced on the next iteration, so a missing parent of
+            //the last matched component is harmless
+            node=tree.getNode(node.pid);
+        }
+        return true;
+    }
+
+    public String toString(){
+        return "("+id+","+pid+","+countRaw+","+countSum+",'"+(level<0 ? "?" : TaxTree.levelToString(level))+",'"+(canonical ? "T" : "F")+",'"+name+"')";
+    }
+
+    /**
+     * Field-wise comparison of id, pid, level, canonical and name.
+     * NOTE(review): this overloads rather than overrides Object.equals, so
+     * collections still use identity equality even though hashCode is overridden.
+     * @return False when b is null
+     */
+    public boolean equals(TaxNode b){
+        if(b==null){return false;}
+        if(id!=b.id || pid!=b.pid || level!=b.level || canonical!=b.canonical){return false;}
+        if(name==b.name){return true;}
+        if((name==null) != (b.name==null)){return false;}
+        return name.equals(b.name);
+    }
+
+    /**
+     * Add amt to the raw count.
+     * @return The raw count after the addition
+     */
+    public long incrementRaw(long amt){
+        if(amt==0){return countRaw;}
+        if(verbose){System.err.println("incrementRaw("+amt+") node: "+this);}
+        countRaw+=amt;
+        assert(countRaw>=0) : "Overflow! "+countRaw+", "+amt;
+        return countRaw;
+    }
+
+    /**
+     * Add amt to the summed count.
+     * @return The summed count after the addition
+     */
+    public long incrementSum(long amt){
+        if(amt==0){return countSum;}
+        if(verbose){System.err.println("incrementSum("+amt+") node: "+this);}
+        countSum+=amt;
+        assert(countSum>=0) : "Overflow! "+countSum+", "+amt;
+        return countSum;
+    }
+
+    /** Name of this node's level, or "unknown" for a negative level. */
+    public String levelString(){return level<0 ? "unknown" : TaxTree.levelToString(level);}
+
+    /** Orders nodes by descending countSum, then ascending level, then ascending id. */
+    public static class CountComparator implements Comparator<TaxNode>{
+
+        @Override
+        public int compare(TaxNode a, TaxNode b) {
+            long x=b.countSum-a.countSum;
+//            System.err.println("x="+x+" -> "+Tools.longToInt(x));
+            if(x!=0){return Tools.longToInt(x);}
+            return a.level==b.level ? a.id-b.id : a.level-b.level;
+        }
+
+    }
+
+    @Override
+    public final int hashCode(){return id;}
+
+    /** Numeric id of this node */
+    public final int id;
+    /** Name of this node */
+    public final String name;
+    /** Numeric id of the parent node */
+    public int pid;
+    /** Taxonomic level; negative if unknown */
+    public int level;
+    /** False when the level came from a synonym or was inferred (set by TaxTree) */
+    public boolean canonical=true;
+
+    /** Count assigned directly to this node */
+    public long countRaw=0;
+    /** Count accumulated through incrementSum */
+    public long countSum=0;
+
+    public static final boolean verbose=false;
+    public static final CountComparator countComparator=new CountComparator();
+
+
+}
diff --git a/current/tax/TaxTree.java b/current/tax/TaxTree.java
new file mode 100755
index 0000000..278a966
--- /dev/null
+++ b/current/tax/TaxTree.java
@@ -0,0 +1,661 @@
+package tax;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.regex.Pattern;
+
+import dna.Timer;
+
+import align2.IntList;
+import align2.Tools;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+/**
+ * Taxonomy tree built from a names file and a nodes file.  Nodes are stored in
+ * an array indexed by their numeric id and, per taxonomic level, in treeLevels.
+ * @author Brian Bushnell
+ * @date Mar 6, 2015
+ *
+ */
+public class TaxTree implements Serializable{
+
+    /**
+     *
+     */
+    private static final long serialVersionUID = 1682832560435175041L;
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Initialization        ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * Build a tree, print the number of nodes per level, and optionally write
+     * the tree to a file.
+     * @param args names file, nodes file, and optionally an output file
+     */
+    public static void main(String[] args){
+        if(args==null || args.length<2){
+            throw new RuntimeException("Usage: TaxTree <names file> <nodes file> [output tree file]");
+        }
+        ReadWrite.USE_UNPIGZ=true;
+        ReadWrite.USE_PIGZ=true;
+        ReadWrite.ZIPLEVEL=9;
+        Timer t=new Timer();
+        TaxTree tree=new TaxTree(args[0], args[1]);
+        t.stop();
+        System.out.println("Retained "+tree.nodeCount+" nodes:");
+        for(int i=tree.treeLevels.length-1; i>=0; i--){
+            System.out.print(tree.nodesPerLevel[i]+"\t"+taxaNames[i]);
+            if(verbose){
+                //Print the first and the last 'lim' nodes of the level, without repeats
+                int lim=10;
+                for(int j=0; j<lim && j<tree.treeLevels[i].length; j++){
+                    TaxNode n=tree.treeLevels[i][j];
+                    System.out.print("\n"+n+" -> "+tree.nodes[n.pid]);
+                }
+                for(int j=tree.treeLevels[i].length-lim; j<tree.treeLevels[i].length; j++){
+                    if(j>=lim){
+                        TaxNode n=tree.treeLevels[i][j];
+                        System.out.print("\n"+n+" -> "+tree.nodes[n.pid]);
+                    }
+                }
+            }
+            System.out.println();
+        }
+        System.out.println();
+        System.out.println("Time: \t"+t);
+
+        if(args.length>2){//Write a tree
+            ReadWrite.write(tree, args[2], true);
+        }
+    }
+
+    /**
+     * Load names and nodes, optionally simplify the tree, and index the
+     * surviving nodes by level.
+     * @param namesFile File with one name record per line
+     * @param nodesFile File with one node record (id, parent id, rank) per line
+     */
+    public TaxTree(String namesFile, String nodesFile){
+
+        nodes=getNames(namesFile);
+        getNodes(nodesFile, nodes);
+
+        if(simplify){
+            int removed=simplify(nodes);
+            if(verbose){System.out.println("Removed "+removed+" nodes.");}
+        }
+
+        //Count the nodes of each level...
+        for(TaxNode n : nodes){
+            if(n!=null){
+                nodesPerLevel[n.level]++;
+            }
+        }
+
+        //...allocate one array per level...
+        for(int i=0; i<nodesPerLevel.length; i++){
+            treeLevels[i]=new TaxNode[nodesPerLevel[i]];
+        }
+
+        //...and fill the arrays
+        {
+            int[] temp=new int[nodesPerLevel.length];
+            for(TaxNode n : nodes){
+                if(n!=null){
+                    int level=n.level;
+                    treeLevels[level][temp[level]]=n;
+                    temp[level]++;
+                }
+            }
+        }
+        nodeCount=(int)Tools.sum(nodesPerLevel);
+
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------         Construction         ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * Create one node per line containing "scientific name".  The root (id 1,
+     * name "root") is renamed to "Life".
+     * @param fname Names file
+     * @return Array in which the node with id x is stored at index x
+     */
+    private static TaxNode[] getNames(String fname){
+        ArrayList<TaxNode> list=new ArrayList<TaxNode>(200000);
+        int max=0;
+
+        TextFile tf=new TextFile(fname, false, false);
+        for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+            if(s.contains("scientific name")){
+                String[] split=delimiter.split(s, 3);
+                assert(split.length==3) : s;
+                int id=Integer.parseInt(split[0]);
+                String name=split[1];
+                if(id==1 && name.equalsIgnoreCase("root")){name="Life";}
+                max=Tools.max(max, id);
+                list.add(new TaxNode(id, name));
+            }
+        }
+        tf.close(); //Release the file handle; it was previously leaked
+
+        TaxNode[] nodes=new TaxNode[max+1];
+        for(TaxNode n : list){
+            assert(nodes[n.id]==null || nodes[n.id].equals(n)) : nodes[n.id]+" -> "+n;
+            nodes[n.id]=n;
+        }
+
+        return nodes;
+    }
+
+    /**
+     * Build nameMap and nameMapLower, which map a name (exact and lower-case)
+     * to all nodes carrying it.  Nodes named "environmental samples" are
+     * skipped.  May only be called once.
+     */
+    public void hashNames(){
+        assert(nameMap==null);
+        assert(nameMapLower==null);
+        nameMap=new HashMap<String, ArrayList<TaxNode>>((int)Tools.mid(2, nodes.length*1.5, Integer.MAX_VALUE));
+        nameMapLower=new HashMap<String, ArrayList<TaxNode>>((int)Tools.mid(2, nodes.length*1.5, Integer.MAX_VALUE));
+        for(TaxNode n : nodes){
+            if(n!=null){
+                if(n.name!=null && !n.name.equals("environmental samples")){
+                    {
+                        ArrayList<TaxNode> list=nameMap.get(n.name);
+                        if(list==null){
+                            list=new ArrayList<TaxNode>();
+                            nameMap.put(n.name, list);
+                        }
+                        list.add(n);
+                    }
+                    {
+                        String lc=n.name.toLowerCase();
+                        ArrayList<TaxNode> list=nameMapLower.get(lc);
+                        if(list==null){
+                            list=new ArrayList<TaxNode>();
+                            nameMapLower.put(lc, list);
+                        }
+                        list.add(n);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Fill in parent id, level and canonical flag of every node listed in the
+     * nodes file.  A rank found only in altLevelMap makes the node
+     * non-canonical; a self-parented node gets the level "life"; an unknown
+     * rank leaves the level at -1 and is counted.
+     * @param fname Nodes file
+     * @param nodes Array produced by getNames; modified in place
+     * @return The same array
+     */
+    private static TaxNode[] getNodes(String fname, TaxNode[] nodes){
+
+        LinkedHashMap<String, int[]> oddNames=new LinkedHashMap<String, int[]>();
+
+        TextFile tf=new TextFile(fname, false, false);
+        for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+            String[] split=delimiter.split(s, 4);
+            assert(split.length==4) : s;
+            int id=-1, pid=-1, level=-1;
+
+            id=Integer.parseInt(split[0]);
+            try {
+                pid=Integer.parseInt(split[1]);
+            } catch (NumberFormatException e) {
+                //Best effort: report the line and continue with pid=-1
+                e.printStackTrace();
+                System.err.println("Bad line: "+s+"\n"+Arrays.toString(split));
+            }
+            boolean alt=false;
+            {
+                String key=split[2];
+                Integer obj=levelMap.get(key);
+                if(obj==null){
+                    obj=altLevelMap.get(key);
+                    alt=true;
+                }
+                if(obj!=null){
+                    level=obj;
+                    if(id==pid){
+                        level=levelMap.get("life");
+                        alt=false;
+                    }
+                }else{
+                    if(id==pid){
+                        level=levelMap.get("life");
+                        alt=false;
+                    }else{
+                        int[] count=oddNames.get(key);
+                        if(count==null){
+                            count=new int[1];
+                            oddNames.put(key, count);
+                        }
+                        count[0]++;
+                    }
+                }
+            }
+            TaxNode n=nodes[id];
+            assert(n!=null && n.pid<0) : n+" -> "+s;
+            n.pid=pid;
+            n.level=level;
+            n.canonical=!alt;
+        }
+        tf.close(); //Release the file handle; it was previously leaked
+
+        if(oddNames.size()>0){
+            System.out.println("Found "+oddNames.size()+" unknown taxonomic levels:");
+            if(verbose){
+                for(String s : oddNames.keySet()){
+                    System.out.println(oddNames.get(s)[0]+"\t"+s);
+                }
+            }
+        }
+
+        return nodes;
+    }
+
+    /**
+     * Simplify the tree in place: optionally bypass and drop nodes of unknown
+     * level, infer a level for "no rank" nodes from their parent, re-parent
+     * nodes past redundant ancestors, and drop remaining nodes with a
+     * negative level.
+     * @param nodes Node array indexed by id; modified in place
+     * @return Number of nodes removed (set to null)
+     */
+    private int simplify(TaxNode nodes[]){
+
+        int failed=test(nodes);
+
+        int removed=0;
+        int reassigned=0;
+
+        if(eliminateUnknown){//Eliminate nodes with unknown taxa
+            if(verbose){System.out.println("A0");}
+            for(int i=0; i<nodes.length; i++){
+                TaxNode n=nodes[i];
+                if(n!=null){
+                    int pid=n.pid;
+                    TaxNode parent=nodes[pid];
+                    assert(parent!=null) : n;
+                    assert(parent!=n || pid==1) : n+", "+pid;
+                    //Skip over ancestors of unknown level
+                    while(parent.level<0){
+                        assert(parent.id!=parent.pid);
+                        parent=nodes[parent.pid];
+                        n.pid=parent.id;
+                        reassigned++;
+                    }
+                }
+            }
+
+            for(int i=0; i<nodes.length; i++){
+                if(nodes[i]!=null && nodes[i].level<0){
+                    nodes[i]=null;
+                    removed++;
+                }
+            }
+            if(verbose){System.out.println("reassigned: "+reassigned+", removed: "+removed);}
+        }
+
+        if(inferRankLimit>0){//Infer level for unset nodes (from "no rank")
+            if(verbose){System.out.println("A");}
+            //Repeat until a pass changes nothing, since inferred levels cascade to children
+            int changed=1;
+            while(changed>0){
+                changed=0;
+                for(final TaxNode n : nodes){
+                    if(n!=null){
+                        if(n.level==0){
+                            TaxNode parent=nodes[n.pid];
+                            if(n!=parent && parent.level>0 && parent.level<=inferRankLimit+1){
+                                n.level=Tools.max(1, parent.level-1);
+                                assert(n.level>0 && n.level<=parent.level && n.level<=inferRankLimit);
+                                n.canonical=false;
+                                changed++;
+                            }
+                        }
+                    }
+                }
+                if(verbose){System.out.println("changed: "+changed);}
+            }
+
+//            System.out.println("B");
+//            for(TaxNode n : nodes){
+//                if(n!=null && n.level==0){
+//                    n.level=-1;
+//                }
+//            }
+        }
+
+        failed=test(nodes);
+
+        {//Skip nodes with duplicate taxa
+            if(verbose){System.out.println("D");}
+            int changed=1;
+            while(changed>0){
+                changed=0;
+                for(final TaxNode n : nodes){
+                    if(n!=null){
+                        TaxNode parent=nodes[n.pid];
+                        TaxNode grandparent=nodes[parent.pid];
+                        assert(n.level<=parent.level || parent.level<1 || !parent.canonical) : n+" -> "+parent+" -> "+grandparent;
+                        assert(parent.level<=grandparent.level || grandparent.level<1 || !grandparent.canonical) : n+" -> "+parent+" -> "+grandparent;
+
+                        //Re-parent to the grandparent while the parent is unknown,
+                        //a non-canonical duplicate of the grandparent's level,
+                        //or not strictly above this node
+                        while(parent!=grandparent && (parent.level<0 || (parent.level==grandparent.level && !parent.canonical) ||
+                                n.level>parent.level || (n.level==parent.level))){
+                            parent=grandparent;
+                            grandparent=nodes[parent.pid];
+                            n.pid=parent.id;
+                            reassigned++;
+                            changed++;
+                        }
+                    }
+                }
+                if(verbose){System.out.println("changed: "+changed);}
+            }
+            if(verbose){System.out.println("E");}
+            for(int i=0; i<nodes.length; i++){
+                if(nodes[i]!=null && nodes[i].level<0){
+                    nodes[i]=null;
+                    removed++;
+                }
+            }
+        }
+
+        failed=test(nodes);
+
+        if(verbose){System.out.println("F");}
+        {//Ensure the tree is now clean
+            for(int i=0; i<nodes.length; i++){
+                TaxNode n=nodes[i];
+                if(n!=null){
+                    TaxNode parent=nodes[n.pid];
+                    TaxNode grandparent=nodes[parent.pid];
+                    assert(n==parent || n.level<parent.level || !n.canonical) : n+" -> "+parent+" -> "+grandparent;
+                    assert(parent==grandparent || parent.level<grandparent.level) : n+" -> "+parent+" -> "+grandparent;
+                }
+            }
+        }
+
+        if(verbose){System.err.println("Reassignments: "+reassigned);}
+
+        return removed;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------          Validation          ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * Count canonical nodes whose level is not strictly below that of their
+     * canonical parent (parents of level below 1 are exempt).
+     * @return Number of such nodes
+     */
+    private static int test(TaxNode[] nodes){
+        int failed=0;
+        for(final TaxNode n : nodes){
+            if(n!=null){
+                TaxNode parent=nodes[n.pid];
+                assert(n==parent || n.level<=parent.level || parent.level<1 || !parent.canonical) : n+" -> "+parent;
+//                assert(n==parent || n.level<parent.level || parent.level<1 || !n.canonical || !parent.canonical) : n+" -> "+parent;
+                if(n!=parent && n.level>=parent.level && parent.level>=1 && n.canonical && parent.canonical){
+                    if(verbose){System.out.println("Error: "+n+" -> "+parent);}
+                    failed++;
+                }
+                assert(n!=parent || n.id<=1) : n;
+            }
+        }
+        if(verbose){System.out.println(failed+" nodes failed.");}
+        return failed;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------        Outer Methods         ----------------*/
+    /*--------------------------------------------------------------*/
+
+
+    /** Delegates to GiToNcbi.getID. */
+    public static int getID(String s){return GiToNcbi.getID(s);}
+
+    /** Delegates to GiToNcbi.getID. */
+    public static int getID(byte[] s){return GiToNcbi.getID(s);}
+
+    /** Return the ancestor with taxonomic level at least minLevel */
+    public TaxNode getNode(String s, int minLevel){
+        TaxNode tn=getNode(s);
+        while(tn!=null && tn.level<minLevel && tn.pid!=tn.id){
+            tn=getNode(tn.pid);
+        }
+        return tn;
+    }
+
+    /**
+     * Look up a node from a sequence name.  Names of the form "gi|NUMBER",
+     * "gi_NUMBER", "ncbi|NUMBER" or "ncbi_NUMBER" are translated through
+     * GiToNcbi; names of at most 9 characters starting with a digit are parsed
+     * as a numeric id.
+     * @return The node, or null if the name is null, empty or not resolvable
+     */
+    public TaxNode getNode(String s){
+        if(s==null || s.length()<1){return null;}
+        {
+            int index=s.indexOf('|');
+            if(index<0){index=s.indexOf("_");}
+            //The digit must directly follow the delimiter.  The old test looked
+            //one character too far, which rejected single-digit numbers and
+            //threw on names such as "gi|5".
+            if(index>0 && s.length()>index+1 && Character.isDigit(s.charAt(index+1)) &&
+                    ((index==2 && s.startsWith("gi")) || (index==4 && s.startsWith("ncbi")))){
+//                System.err.println("Looking for gi or ncbi number.");
+                int number=GiToNcbi.getID(s);
+                if(number>=0){return getNode(number);}
+            }
+        }
+//        System.err.println("Can't process name "+s);
+        if(Character.isDigit(s.charAt(0)) && s.length()<=9){
+            try {
+                return getNode(Integer.parseInt(s));
+            } catch (NumberFormatException e) {
+                //ignore
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Byte-array variant of {@link #getNode(String)}.  Any name containing '|'
+     * is handed directly to GiToNcbi; otherwise "gi_NUMBER", "ncbi_NUMBER" and
+     * plain numeric names are handled as in the String variant.
+     * @return The node, or null if the name is null, empty or not resolvable
+     */
+    public TaxNode getNode(byte[] s){
+        if(s==null || s.length<1){return null;}
+        if(Tools.indexOf(s, (byte)'|')>=0){return getNode(GiToNcbi.getID(s));}
+
+        {
+            //No '|' is present past this point, so only '_' can be the delimiter
+            final int index=Tools.indexOf(s, (byte)'_');
+            //Same off-by-one correction as in getNode(String)
+            if(index>0 && s.length>index+1 && Character.isDigit(s[index+1]) &&
+                    ((index==2 && Tools.startsWith(s, "gi")) || (index==4 && Tools.startsWith(s, "ncbi")))){
+//                System.err.println("Looking for gi or ncbi number.");
+                int number=GiToNcbi.getID(s);
+                if(number>=0){return getNode(number);}
+            }
+        }
+
+        if(Character.isDigit(s[0]) && s.length<=9){
+            try {
+                return getNode(Tools.parseInt(s, 0, s.length));
+            } catch (NumberFormatException e) {
+                //ignore
+            }
+        }
+        return null;
+    }
+
+    /**
+     * @param id Numeric id
+     * @return The node with this id, or null if the id is negative, beyond the
+     * end of the node array, or unoccupied
+     */
+    public TaxNode getNode(int id){return (id<0 || id>=nodes.length) ? null : nodes[id];}
+
+    /**
+     * Look up a node by name, first with exact case and then lower-cased.
+     * Requires that hashNames was called.
+     */
+    public TaxNode getNodeByName(String s){
+        TaxNode tn=getNodeByName(s, false);
+        if(tn==null){tn=getNodeByName(s, true);}
+        return tn;
+    }
+
+    /**
+     * Look up a node by name.  Underscores are treated as spaces.  The whole
+     * name is tried first; otherwise the name is split on whitespace and the
+     * last component is looked up, using the preceding components to choose
+     * among several candidates via TaxNode.matchesName.
+     * @param lowercase True to compare lower-cased names using nameMapLower
+     * @return The unique matching node, or null
+     */
+    private TaxNode getNodeByName(String s, boolean lowercase){
+        if(s.indexOf('_')>=0){s=s.replace('_', ' ');}
+        if(lowercase){s=s.toLowerCase();}
+//        System.err.println("Searching for "+s);
+        final HashMap<String, ArrayList<TaxNode>> map=(lowercase ? nameMapLower : nameMap);
+        ArrayList<TaxNode> list=map.get(s);
+        if(list!=null){
+            if(list.size()==1){return list.get(0);}
+            assert(false) : "Found multiple nodes for '"+s+"':\n"+list+"\n";
+        }
+//        System.err.println("No matches for '"+s+"'");
+
+//        assert(false) : nameMap.containsKey(s)+", "+nameMapLower.containsKey(s);
+
+        String[] split=delimiter2.split(lowercase ? s.toLowerCase() : s, 8);
+//        System.err.println("Array: "+Arrays.toString(split));
+        list=map.get(split[split.length-1]);
+//        System.err.println(list==null ? "No matches for "+split[split.length-1] : "Found list( "+list.size()+")");
+        if(list==null || list.isEmpty()){
+            return null;
+        }
+        if(list.size()==1){return list.get(0);}
+
+        TaxNode matching=null;
+        for(TaxNode tn : list){
+            if(tn.matchesName(split, split.length-1, this)){
+                assert(matching==null) : "Found two nodes for '"+s+"':\n"+matching+"\n"+tn;
+                matching=tn;
+            }
+        }
+        return matching;
+    }
+
+    /**
+     * @return The node with this id followed by its ancestors, excluding the
+     * self-parented root
+     */
+    public ArrayList<TaxNode> getAncestors(int id){
+        TaxNode current=getNode(id);
+        ArrayList<TaxNode> list=new ArrayList<TaxNode>();
+        while(current!=null && current.pid!=current.id){//ignores root
+            list.add(current);
+            current=getNode(current.pid);
+        }
+        //optionally add root here
+        return list;
+    }
+
+    /**
+     * Add the number of occurrences of each id to the raw count of its node.
+     * @param ids Ids to count; sorted in place
+     * @param counts Scratch list handed to ids.getUniqueCounts
+     * @param sync True to update the nodes while holding this tree's monitor
+     */
+    public void increment(IntList ids, IntList counts, boolean sync){
+
+        ids.sort();
+        ids.getUniqueCounts(counts);
+
+        if(!sync){
+            incrementAll(ids, counts);
+        }else{
+            synchronized(this){
+                incrementAll(ids, counts);
+            }
+        }
+    }
+
+    /** Apply counts.get(i) to the node ids.get(i) for every index below ids.size. */
+    private void incrementAll(IntList ids, IntList counts){
+        for(int i=0; i<ids.size; i++){
+            int id=ids.get(i);
+            int count=counts.get(i);
+            incrementRaw(id, count);
+        }
+    }
+
+    /** Add amt to the raw count of the node with this id. */
+    public void incrementRaw(int id, long amt){
+        nodes[id].incrementRaw(amt);
+    }
+
+    /** Percolate counts upward level by level, starting with level 0. */
+    public void percolateUp(){
+        for(int i=0; i<treeLevels.length; i++){
+            percolateUp(i);
+        }
+    }
+
+    /**
+     * For every node of the level: add its raw count to its own sum, then add
+     * the resulting sum to its parent's sum.
+     */
+    public void percolateUp(final int fromLevel){
+        final TaxNode[] stratum=treeLevels[fromLevel];
+        for(final TaxNode n : stratum){
+            n.incrementSum(n.countRaw);
+            TaxNode parent=nodes[n.pid];
+            if(n!=parent){
+                parent.incrementSum(n.countSum);
+            }
+        }
+    }
+
+    /** Add this amount to the node and all its ancestors. */
+    public void percolateUp(TaxNode node, long amt){
+        if(amt==0){return;}
+        if(verbose){System.err.println("percolateUp("+amt+") node: "+node);}
+        while(node.id!=node.pid){
+            node.incrementSum(amt);
+            node=nodes[node.pid];
+        }
+        node.incrementSum(amt);
+    }
+
+    /** Gather from all levels; see the three-argument variant. */
+    public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit){
+        return gatherNodesAtLeastLimit(limit, 0, nodesPerLevel.length-1);
+    }
+
+    /**
+     * Gather nodes with countSum of at least limit from minLevel to maxLevel
+     * (inclusive), lowest level first, and return them sorted by
+     * TaxNode.countComparator.
+     */
+    public ArrayList<TaxNode> gatherNodesAtLeastLimit(final long limit, final int minLevel, final int maxLevel){
+        ArrayList<TaxNode> list=new ArrayList<TaxNode>();
+        for(int i=minLevel; i<nodesPerLevel.length && i<=maxLevel; i++){
+            list.addAll(gatherNodesAtLeastLimit(i, limit));
+        }
+        Collections.sort(list, TaxNode.countComparator);
+        return list;
+    }
+
+    /**
+     * Gather the nodes of one level with countSum of at least limit.  Each
+     * gathered node's sum is subtracted from all of its ancestors, so this
+     * method modifies the tree's counts.
+     */
+    public ArrayList<TaxNode> gatherNodesAtLeastLimit(final int fromLevel, final long limit){
+        ArrayList<TaxNode> list=new ArrayList<TaxNode>();
+        final TaxNode[] stratum=treeLevels[fromLevel];
+        for(final TaxNode n : stratum){
+            if(n.countSum>=limit){
+                list.add(n);
+                TaxNode parent=nodes[n.pid];
+                if(n!=parent){
+                    percolateUp(parent, -n.countSum);
+                }
+            }
+        }
+        Collections.sort(list, TaxNode.countComparator);
+        return list;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------     Static Initializers      ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * @return Map from each name in taxaNames to its index
+     */
+    private static HashMap<String, Integer> makeLevelMap() {
+        HashMap<String, Integer> map=new HashMap<String, Integer>(31);
+        for(int i=0; i<taxaNames.length; i++){
+            map.put(taxaNames[i], i);
+        }
+        return map;
+    }
+
+    /**
+     * @return Map from each name in taxaNames, and from synonymous
+     * intermediate ranks, to a level index
+     */
+    private static HashMap<String, Integer> makeAltLevelMap() {
+        HashMap<String, Integer> map=new HashMap<String, Integer>(67);
+        for(int i=0; i<taxaNames.length; i++){
+            map.put(taxaNames[i], i);
+        }
+
+        //Add synonyms
+        map.put("subfamily", map.get("family"));
+        map.put("tribe", map.get("family"));
+        map.put("varietas", map.get("subspecies"));
+        map.put("subgenus", map.get("genus"));
+        map.put("forma", map.get("subspecies"));
+        map.put("species group", map.get("genus"));
+        map.put("subclass", map.get("class"));
+        map.put("species subgroup", map.get("species"));
+        map.put("infraorder", map.get("order"));
+        map.put("superorder", map.get("class"));
+        map.put("subphylum", map.get("phylum"));
+        map.put("infraclass", map.get("class"));
+        map.put("parvorder", map.get("order"));
+        map.put("superclass", map.get("phylum"));
+        map.put("superphylum", map.get("kingdom"));
+        map.put("subkingdom", map.get("kingdom"));
+        map.put("superfamily", map.get("order"));
+        //An earlier mapping of "superkingdom" to the non-existent level
+        //"division" stored null and was overwritten here; it has been removed.
+        map.put("superkingdom", map.get("domain"));
+        map.put("suborder", map.get("order"));
+        map.put("subtribe", map.get("family"));
+//        map.put("no rank", map.get("subspecies"));
+
+        return map;
+    }
+
+    /*--------------------------------------------------------------*/
+    /*----------------            Fields            ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /** All nodes, indexed by numeric id; unused slots are null */
+    public final TaxNode[] nodes;
+    /** Number of nodes at each level */
+    public final int[] nodesPerLevel=new int[taxaNames.length];
+    /** Total number of nodes retained */
+    public final int nodeCount;
+
+    /** Nodes grouped by level: treeLevels[level][i] */
+    public final TaxNode[][] treeLevels=new TaxNode[taxaNames.length][];
+
+    /** Exact name to nodes; null until hashNames is called */
+    HashMap<String, ArrayList<TaxNode>> nameMap;
+    /** Lower-case name to nodes; null until hashNames is called */
+    HashMap<String, ArrayList<TaxNode>> nameMapLower;
+
+    public int minValidTaxa=0;
+
+    /** Run simplify during construction */
+    public boolean simplify=true;
+    public boolean inferSpecies=true;
+    /** During simplify, bypass and remove nodes of unknown level first */
+    public boolean eliminateUnknown=false;
+    /** Highest level that simplify may infer for "no rank" nodes */
+    public int inferRankLimit=levelMap.get("species");
+
+    /*--------------------------------------------------------------*/
+    /*----------------          Constants           ----------------*/
+    /*--------------------------------------------------------------*/
+
+    /**
+     * Convert a level name, or one of the synonyms in altLevelMap, to its index.
+     * @param s Level name exactly as stored in the maps (lower case)
+     * @return Index into taxaNames
+     * @throws IllegalArgumentException if the name is unknown; this replaces
+     * the NullPointerException that unboxing the missing entry used to cause
+     */
+    public static final int stringToLevel(String s){
+        Integer level=levelMap.get(s);
+        if(level==null){level=altLevelMap.get(s);}
+        if(level==null){throw new IllegalArgumentException("Unknown taxonomic level '"+s+"'");}
+        return level;
+    }
+
+    /** Convert a level index to its name; the index must be within taxaNames. */
+    public static final String levelToString(int x){return taxaNames[x];}
+
+    /** Level names ordered from most specific to most general; the index is the level */
+    private static final String[] taxaNames=new String[] {
+        "no rank", "subspecies", "species", "genus",
+        "family", "order", "class", "phylum",
+        "kingdom", "domain", "life"
+    };
+
+    private static final HashMap<String, Integer> levelMap=makeLevelMap();
+    private static final HashMap<String, Integer> altLevelMap=makeAltLevelMap();
+
+    /** Field delimiter of the names and nodes files: tab, pipe, tab */
+    private static final Pattern delimiter = Pattern.compile("\t\\|\t");
+    /** Delimiter between name components: whitespace or underscores */
+    private static final Pattern delimiter2 = Pattern.compile("[\\s_]+");
+
+    public static final String DefaultTableFile="/global/projectb/sandbox/gaag/bbtools/tax/gitable.int1d.gz";
+    public static final String DefaultTreeFile="/global/projectb/sandbox/gaag/bbtools/tax/tree.taxtree.gz";
+
+    public static boolean verbose=false;
+
+}
diff --git a/current/ukmer/AbstractKmerTableU.java b/current/ukmer/AbstractKmerTableU.java
new file mode 100755
index 0000000..1fa89b1
--- /dev/null
+++ b/current/ukmer/AbstractKmerTableU.java
@@ -0,0 +1,563 @@
+package ukmer;
+
+import java.util.concurrent.atomic.AtomicIntegerArray;
+import java.util.concurrent.locks.Lock;
+
+import stream.ByteBuilder;
+import stream.KillSwitch;
+import align2.Shared;
+import align2.Tools;
+import dna.AminoAcid;
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 23, 2013
+ *
+ */
+public abstract class AbstractKmerTableU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Kmer methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns count */
+ public abstract int increment(Kmer kmer);
+
+ /** Returns number of entries created */
+ public abstract int incrementAndReturnNumCreated(final Kmer kmer);
+
+ public abstract int set(Kmer kmer, int value);
+
+ public abstract int set(Kmer kmer, int[] vals);
+
+ /** Returns number of kmers added */
+ public abstract int setIfNotPresent(Kmer kmer, int value);
+
+ /**
+ * Fetch the value associated with a kmer.
+ * @param kmer
+ * @return A value. -1 means the kmer was not present.
+ */
+ public abstract int getValue(Kmer kmer);
+
+ /**
+ * Fetch the values associated with a kmer.
+ * @param kmer
+ * @param singleton A blank array of length 1.
+ * @return An array filled with values. Values of -1 are invalid.
+ */
+ public abstract int[] getValues(Kmer kmer, int[] singleton);
+
+ public abstract boolean contains(Kmer kmer);
+
+// public abstract boolean contains(Kmer kmer, int v);
+//
+// public abstract boolean contains(Kmer kmer, int[] vals);
+//
+// public abstract Object get(Kmer kmer);
+
+ /**
+ * Compares two keys word by word, most significant word (index 0) first.
+ * Words are compared directly instead of by subtraction: key1[i]-key2[i] can overflow
+ * and report the wrong sign when the operands are far apart (e.g. Long.MIN_VALUE vs 1).
+ * Only the first key1.length words are examined, so key2 must be at least that long.
+ * @param key1 First key
+ * @param key2 Second key
+ * @return -1, 0 or 1 when key1 is less than, equal to, or greater than key2
+ */
+ public static final int compare(long[] key1, long[] key2){
+ for(int i=0; i<key1.length; i++){
+ final long a=key1[i], b=key2[i];
+ if(a!=b){return a<b ? -1 : 1;}
+ }
+ return 0;
+ }
+
+ /**
+ * Tests whether two keys hold the same words.
+ * Only the first key1.length words are examined.
+ */
+ public static final boolean equals(long[] key1, long[] key2){
+ final int len=key1.length;
+ for(int idx=0; idx<len; idx++){
+ if(key1[idx]!=key2[idx]){return false;}
+ }
+ return true;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Abstract Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public abstract int getValue(long[] key, long xor);
+
+// /** Returns count */
+// public final int increment(long[] key){throw new RuntimeException();}
+//
+// /** Returns number of entries created */
+// public final int incrementAndReturnNumCreated(final long[] key){throw new RuntimeException();}
+//
+// public final int set(long[] key, int value){throw new RuntimeException();}
+//
+// public final int set(long[] key, int[] vals){throw new RuntimeException();}
+//
+// /** Returns number of kmers added */
+// public final int setIfNotPresent(long[] key, int value){throw new RuntimeException();}
+//
+// /**
+// * Fetch the value associated with a kmer.
+// * @param kmer
+// * @return A value. -1 means the kmer was not present.
+// */
+// final int getValue(long[] key){throw new RuntimeException();}
+//
+// /**
+// * Fetch the values associated with a kmer.
+// * @param kmer
+// * @param singleton A blank array of length 1.
+// * @return An array filled with values. Values of -1 are invalid.
+// */
+// public final int[] getValues(long[] key, int[] singleton){throw new RuntimeException();}
+//
+// public final boolean contains(long[] key){throw new RuntimeException();}
+
+ /**
+ * Test-mode check for whether v is among the values stored for a kmer.
+ * The stored values are scanned up to the first -1 entry.
+ */
+ public final boolean contains(Kmer kmer, int v){
+ assert(TESTMODE);
+ final int[] stored=getValues(kmer, new int[] {-1});
+ if(stored==null){return false;}
+ for(int i=0; i<stored.length && stored[i]!=-1; i++){
+ if(stored[i]==v){return true;}
+ }
+ return false;
+ }
+
+ /**
+ * Test-mode check for whether every entry of vals, up to its first -1, is among
+ * the values stored for a kmer.
+ */
+ public final boolean contains(Kmer kmer, int[] vals){
+ assert(TESTMODE);
+ final int[] stored=getValues(kmer, new int[] {-1});
+ if(stored==null){return false;}
+ for(final int wanted : vals){
+ if(wanted==-1){return true;}//-1 terminates the list; everything before it was found
+ boolean found=false;
+ for(int i=0; i<stored.length && !found; i++){
+ found=(stored[i]==wanted);
+ }
+ if(!found){return false;}
+ }
+ return true;
+ }
+
+ public abstract void rebalance();
+
+ public abstract long size();
+ public abstract int arrayLength();
+ public abstract boolean canRebalance();
+
+ public abstract boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount);
+ public abstract boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount);
+ public abstract boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount);
+
+ public abstract void fillHistogram(long[] ca, int max);
+
+ Object get(Kmer kmer){return get(kmer.key());}
+ abstract Object get(long[] key);
+ abstract void resize();
+ abstract boolean canResize();
+
+
+
+ /**
+ * Removes entries with a value of zero or less.
+ * Rehashes the remainder.
+ * @return Number removed.
+ */
+ abstract long regenerate();
+
+ final void lock(){getLock().lock();}
+ final void unlock(){getLock().unlock();}
+ final boolean tryLock(){return getLock().tryLock();}
+ Lock getLock(){
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*--------------- Allocation Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Allocates an AtomicIntegerArray of the given length; an OutOfMemoryError is passed to reportOutOfMemory. */
+ final AtomicIntegerArray allocAtomicInt(int len){
+ try {
+ return new AtomicIntegerArray(len);
+ } catch (OutOfMemoryError e) {
+ reportOutOfMemory(e);
+ return null;
+ }
+ }
+
+ /** Allocates a long[len]; an OutOfMemoryError is passed to reportOutOfMemory. */
+ final long[] allocLong1D(int len){
+ try {
+ return new long[len];
+ } catch (OutOfMemoryError e) {
+ reportOutOfMemory(e);
+ return null;
+ }
+ }
+
+ /** Allocates a long[mult][len]; an OutOfMemoryError is passed to reportOutOfMemory. */
+ final long[][] allocLong2D(int mult, int len){
+ try {
+ return new long[mult][len];
+ } catch (OutOfMemoryError e) {
+ reportOutOfMemory(e);
+ return null;
+ }
+ }
+
+ /** Allocates an int[len]; an OutOfMemoryError is passed to reportOutOfMemory. */
+ final int[] allocInt1D(int len){
+ try {
+ return new int[len];
+ } catch (OutOfMemoryError e) {
+ reportOutOfMemory(e);
+ return null;
+ }
+ }
+
+ /** Allocates an int[len][] whose rows are left null; an OutOfMemoryError is passed to reportOutOfMemory. */
+ final int[][] allocInt2D(int len){
+ try {
+ return new int[len][];
+ } catch (OutOfMemoryError e) {
+ reportOutOfMemory(e);
+ return null;
+ }
+ }
+
+ /** Allocates a KmerNodeU[len]; an OutOfMemoryError is passed to reportOutOfMemory. */
+ final KmerNodeU[] allocKmerNodeArray(int len){
+ try {
+ return new KmerNodeU[len];
+ } catch (OutOfMemoryError e) {
+ reportOutOfMemory(e);
+ return null;
+ }
+ }
+
+ /**
+ * Shared handler for failed allocations: while holding the killMessage monitor,
+ * prints the stack trace and killMessage to stderr, then calls KillSwitch.killSilent().
+ */
+ private static void reportOutOfMemory(final OutOfMemoryError e){
+ synchronized(killMessage){
+ e.printStackTrace();
+ System.err.println(killMessage);
+ KillSwitch.killSilent();
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*--------------- Ownership Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Set the thread owning this kmer. Return the new owner.
+ * Will only change the owner if newOwner is greater than current owner. */
+ public abstract int setOwner(Kmer kmer, int newOwner);
+
+ /** Reset owner to -1 if this is the current owner. */
+ public abstract boolean clearOwner(Kmer kmer, int owner);
+
+ /** Return the thread ID owning this kmer, or -1. */
+ public abstract int getOwner(Kmer kmer);
+
+ /** Create data structures needed for ownership representation */
+ public abstract void initializeOwnership();
+
+ /** Eliminate ownership data structures or set them to -1. */
+ public abstract void clearOwnership();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static final StringBuilder toText(Kmer kmer){
+ return toText(kmer.key(), kmer.k);
+ }
+
+ /**
+ * Renders every word of a key as k bases, 2 bits per base, and concatenates them.
+ * @param array Key words
+ * @param k Number of bases decoded from each word
+ * @return A new StringBuilder holding the bases
+ */
+ public static final StringBuilder toText(long[] array, int k){
+ final StringBuilder text=new StringBuilder(k*array.length);
+ for(final long word : array){append(word, k, text);}
+ return text;
+ }
+
+ static final StringBuilder toText(long[] array, int count, int k){
+ StringBuilder sb=new StringBuilder(k+10);
+ return toText(array, count, k, sb);
+ }
+
+ static final ByteBuilder toBytes(long[] array, int count, int k){
+ ByteBuilder bb=new ByteBuilder(k+10);
+ return toBytes(array, count, k, bb);
+ }
+
+ static final StringBuilder toText(long[] array, int[] values, int k){
+ StringBuilder sb=new StringBuilder(k+10);
+ return toText(array, values, k, sb);
+ }
+
+ static final ByteBuilder toBytes(long[] array, int[] values, int k){
+ ByteBuilder bb=new ByteBuilder(k+10);
+ return toBytes(array, values, k, bb);
+ }
+
+ /**
+ * Appends a kmer and its count to sb.
+ * With FASTA_DUMP the output is ">count", a newline, then the bases;
+ * otherwise it is the bases, a tab, then the count.
+ */
+ static final StringBuilder toText(long[] array, int count, int k, StringBuilder sb){
+ final boolean fasta=FASTA_DUMP;
+ if(fasta){sb.append('>').append(count).append('\n');}
+ for(final long word : array){append(word, k, sb);}
+ if(!fasta){sb.append('\t').append(count);}
+ return sb;
+ }
+
+ /**
+ * Appends a kmer and its comma-separated values to sb; the value list ends at the first -1.
+ * With FASTA_DUMP the output is ">values", a newline, then the bases;
+ * otherwise it is the bases, a tab, then the values.
+ */
+ static final StringBuilder toText(long[] array, int[] values, int k, StringBuilder sb){
+ final boolean fasta=FASTA_DUMP;
+ if(fasta){
+ sb.append('>');
+ }else{
+ for(final long word : array){append(word, k, sb);}
+ sb.append('\t');
+ }
+ for(int i=0; i<values.length && values[i]!=-1; i++){
+ if(i>0){sb.append(',');}
+ sb.append(values[i]);
+ }
+ if(fasta){
+ sb.append('\n');
+ for(final long word : array){append(word, k, sb);}
+ }
+ return sb;
+ }
+
+ /** Decodes the low 2*k bits of kmer, highest base first, through AminoAcid.numberToBase and appends them to sb. */
+ private static final void append(long kmer, int k, StringBuilder sb){
+ int i=k;
+ while(--i>=0){
+ final int code=(int)((kmer>>(2*i))&3);
+ sb.append((char)AminoAcid.numberToBase[code]);
+ }
+ }
+
+ /**
+ * Appends a kmer and its count to sb.
+ * With FASTA_DUMP the output is ">count", a newline, then the bases;
+ * otherwise it is the bases, a tab, then the count.
+ */
+ public static final ByteBuilder toBytes(long[] array, int count, int k, ByteBuilder sb){
+ final boolean fasta=FASTA_DUMP;
+ if(fasta){
+ sb.append('>');
+ sb.append(count);
+ sb.append('\n');
+ }
+ for(final long word : array){append(word, k, sb);}
+ if(!fasta){
+ sb.append('\t');
+ sb.append(count);
+ }
+ return sb;
+ }
+
+ /**
+ * Appends a kmer and its comma-separated values to sb; the value list ends at the first -1.
+ * With FASTA_DUMP the output is ">values", a newline, then the bases;
+ * otherwise it is the bases, a tab, then the values.
+ */
+ public static final ByteBuilder toBytes(long[] array, int[] values, int k, ByteBuilder sb){
+ final boolean fasta=FASTA_DUMP;
+ if(fasta){
+ sb.append('>');
+ }else{
+ for(final long word : array){append(word, k, sb);}
+ sb.append('\t');
+ }
+ for(int i=0; i<values.length && values[i]!=-1; i++){
+ if(i>0){sb.append(',');}
+ sb.append(values[i]);
+ }
+ if(fasta){
+ sb.append('\n');
+ for(final long word : array){append(word, k, sb);}
+ }
+ return sb;
+ }
+
+ /** Decodes the low 2*k bits of kmer, highest base first, through AminoAcid.numberToBase and appends them to sb. */
+ private static final void append(long kmer, int k, ByteBuilder sb){
+ int i=k;
+ while(--i>=0){
+ final int code=(int)((kmer>>(2*i))&3);
+ sb.append((char)AminoAcid.numberToBase[code]);
+ }
+ }
+
+
+// static void appendKmerText(long kmer, int count, int k, StringBuilder sb){
+// sb.setLength(0);
+// toText(kmer, count, k, sb);
+// sb.append('\n');
+// }
+
+ static void appendKmerText(long[] array, int count, int k, ByteBuilder bb){
+ bb.setLength(0);
+ toBytes(array, count, k, bb);
+ bb.append('\n');
+ }
+
+
+ /** For buffered tables. */
+ long flush(){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ /**
+ * Allocates all tables using a small pool of AllocThreads. Unfortunately, this does not lead to any speedup, at least for ARRAY type.
+ * Allocator number i fills table slots i, i+t, i+2t, ... where t is the number of allocators.
+ * @param ways Number of tables to create; the length of the returned array.
+ * @param tableType Table type constant (e.g. ARRAY1D, FOREST1D, ARRAY2D, FOREST2D, ARRAYH) selecting the implementation.
+ * @param initialSize Initial size passed to each table's constructor.
+ * @param kbig Passed to the constructors of the array-based table types.
+ * @param growable Passed to each table's constructor.
+ * @return The array of tables, every slot non-null.
+ * @throws RuntimeException If any slot is still null once all allocators have terminated.
+ */
+ public static final AbstractKmerTableU[] preallocate(int ways, int tableType, int initialSize, int kbig, boolean growable){
+
+ final AbstractKmerTableU[] tables=new AbstractKmerTableU[ways];
+
+ {
+ //Number of allocator threads; the Tools.min/Tools.max arguments keep it at least 1 (presumably capped by threads, 2 and ways).
+ final int t=Tools.max(1, Tools.min(Shared.threads(), 2, ways));
+ final AllocThread[] allocators=new AllocThread[t];
+ for(int i=0; i<t; i++){
+ allocators[i]=new AllocThread(tableType, initialSize, i, t, kbig, growable, tables);
+ }
+ for(AllocThread at : allocators){at.start();}
+ for(AllocThread at : allocators){
+ //Keep joining until the thread has really terminated, even if this thread is interrupted.
+ while(at.getState()!=Thread.State.TERMINATED){
+ try {
+ at.join();
+ } catch (InterruptedException e) {
+ //The interrupt is only reported; the loop then retries the join.
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ //Allocators write slots while holding the tables monitor, so read them under the same monitor.
+ synchronized(tables){
+ for(int i=0; i<tables.length; i++){
+ final AbstractKmerTableU akt=tables[i];
+ if(akt==null){
+ throw new RuntimeException("KmerTable allocation failed, probably due to lack of RAM: "+i+", "+tables.length);
+ }
+ }
+ }
+
+ return tables;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nested Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private static class AllocThread extends Thread{
+
+ AllocThread(int type_, int initialSize_, int mod_, int div_, int kbig_, boolean growable_, AbstractKmerTableU[] tables_){
+ type=type_;
+ size=initialSize_;
+ mod=mod_;
+ div=div_;
+ growable=growable_;
+ tables=tables_;
+ kbig=kbig_;
+ }
+
+ @Override
+ public void run(){
+ for(int i=mod; i<tables.length; i+=div){
+// System.err.println("T"+i+" allocating "+i);
+ final AbstractKmerTableU akt;
+ if(type==FOREST1D){
+ akt=new HashForestU(size, growable, false);
+ }else if(type==ARRAY1D){
+ akt=new HashArrayU1D(size, kbig, growable);
+ }else if(type==NODE1D){
+ throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
+// akt=new KmerNode2(-1, 0);
+ }else if(type==FOREST2D){
+ akt=new HashForestU(size, growable, true);
+ }else if(type==ARRAY2D){
+ akt=new HashArrayU2D(size, kbig, growable);
+ }else if(type==NODE2D){
+ throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
+// akt=new KmerNode(-1, 0);
+ }else if(type==ARRAYH){
+ akt=new HashArrayUHybrid(size, kbig, growable);
+ }else{
+ throw new RuntimeException("Must use forest, table, or array data structure. Type="+type);
+ }
+ synchronized(tables){
+ tables[i]=akt;
+ }
+// System.err.println("T"+i+" allocated "+i);
+ }
+ }
+
+ private final int type;
+ private final int size;
+ private final int mod;
+ private final int div;
+ private final int kbig;
+ private final boolean growable;
+ final AbstractKmerTableU[] tables;
+
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public static boolean FASTA_DUMP=true;
+ public static boolean NUMERIC_DUMP=false;
+
+ public static final boolean verbose=false; //slow
+ public static final boolean TESTMODE=false; //slow
+
+ public static final int UNKNOWN=0, ARRAY1D=1, FOREST1D=2, NODE1D=4, ARRAY2D=5, FOREST2D=6, NODE2D=8, ARRAYH=9;
+
+ public static final int NOT_PRESENT=-1, HASH_COLLISION=-2;
+ public static final int NO_OWNER=-1;
+
+ private final static String killMessage=new String("\nThis program ran out of memory. Try increasing the -Xmx flag and setting prealloc.");
+
+}
diff --git a/current/ukmer/DumpThreadU.java b/current/ukmer/DumpThreadU.java
new file mode 100755
index 0000000..6e4a920
--- /dev/null
+++ b/current/ukmer/DumpThreadU.java
@@ -0,0 +1,73 @@
+package ukmer;
+
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import kmer.DumpThread;
+
+import align2.Shared;
+import align2.Tools;
+
+import stream.ByteBuilder;
+
+import fileIO.ByteStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 16, 2015
+ *
+ */
+public class DumpThreadU extends Thread{
+
+ /**
+ * Dumps all tables to bsw using several DumpThreadU workers that share one table counter.
+ * DumpThread.NUM_THREADS is used as the worker count when positive; otherwise the count is derived
+ * from tables.length and Shared.threads().
+ * @return true if every worker reported success
+ */
+ public static boolean dump(final int k, final int mincount, final AbstractKmerTableU[] tables, final ByteStreamWriter bsw){
+ final int threads;
+ if(DumpThread.NUM_THREADS>0){
+ threads=DumpThread.NUM_THREADS;
+ }else{
+ threads=Tools.min(tables.length, (Tools.mid(1, Shared.threads()-1, 6)));
+ }
+ final AtomicInteger counter=new AtomicInteger(0);
+ final ArrayList<DumpThreadU> workers=new ArrayList<DumpThreadU>(threads);
+ while(workers.size()<threads){
+ workers.add(new DumpThreadU(k, mincount, counter, tables, bsw));
+ }
+ for(DumpThreadU worker : workers){worker.start();}
+ boolean allSucceeded=true;
+ for(DumpThreadU worker : workers){
+ //Retry the join until the worker has terminated, even if interrupted.
+ while(worker.getState()!=Thread.State.TERMINATED){
+ try {
+ worker.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ allSucceeded&=worker.success;
+ }
+ return allSucceeded;
+ }
+
+ public DumpThreadU(final int k_, final int mincount_, final AtomicInteger nextTable_, final AbstractKmerTableU[] tables_, final ByteStreamWriter bsw_){
+ k=k_;
+ mincount=mincount_;
+ nextTable=nextTable_;
+ tables=tables_;
+ bsw=bsw_;
+ }
+
+ /** Claims table indices from the shared counter until none remain, then hands any leftover bytes to bsw. */
+ @Override
+ public void run(){
+ final ByteBuilder bb=new ByteBuilder(16300);
+ int i=nextTable.getAndIncrement();
+ while(i<tables.length){
+ tables[i].dumpKmersAsBytes_MT(bsw, bb, k, mincount);
+ i=nextTable.getAndIncrement();
+ }
+ if(bb.length()>0){
+ synchronized(bsw){bsw.addJob(bb);}
+ }
+ success=true;
+ }
+
+ final int k;
+ final int mincount;
+ final AtomicInteger nextTable;
+ final AbstractKmerTableU[] tables;
+ final ByteStreamWriter bsw;
+ boolean success=false;
+
+}
diff --git a/current/ukmer/HashArrayU.java b/current/ukmer/HashArrayU.java
new file mode 100755
index 0000000..5441a31
--- /dev/null
+++ b/current/ukmer/HashArrayU.java
@@ -0,0 +1,586 @@
+package ukmer;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicIntegerArray;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import stream.ByteBuilder;
+
+import kmer.Primes;
+
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and values in an int[][], with a victim cache.
+ * @author Brian Bushnell
+ * @date Nov 7, 2014
+ *
+ */
+public abstract class HashArrayU extends AbstractKmerTableU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ HashArrayU(int initialSize, int kbig_, boolean autoResize_, boolean twod){
+ if(initialSize>1){
+ initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize));
+ }else{
+ initialSize=1;
+ }
+ prime=initialSize;
+ sizeLimit=(long)(sizeLimit=(long)(maxLoadFactor*prime));
+ kbig=kbig_;
+ mult=Kmer.getMult(kbig);
+ arrays=new long[mult][];
+ for(int i=0; i<mult; i++){
+ arrays[i]=allocLong1D(prime+extra);
+ Arrays.fill(arrays[i], NOT_PRESENT);
+ }
+ victims=new HashForestU(Tools.max(10, initialSize/8), autoResize_, twod);
+ autoResize=autoResize_;
+ TWOD=twod;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+// public final int set_Test(final long kmer, final int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// int[] old=getValues(kmer, new int[1]);
+// assert(old==null || contains(kmer, old));
+// if(verbose){System.err.println("Fetched "+Arrays.toString(old));}
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+v+", kmer="+kmer+
+// ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v==old || !contains(kmer, old));
+// }
+// return x;
+// }
+//
+// public final int set_Test(final long kmer, final int v[]){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// final int[] singleton=new int[1];
+// int[] old=getValues(kmer, singleton);
+// assert(old==null || contains(kmer, old));
+// if(verbose){System.err.println("Before: old="+Arrays.toString(old)+", v="+Arrays.toString(v));}
+// x=set0(kmer, v);
+// if(verbose){System.err.println("After: old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", get()="+Arrays.toString(getValues(kmer, singleton)));}
+// assert(old==null || contains(kmer, old)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
+// ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+// assert(contains(kmer, v)) : "old="+Arrays.toString(old)+", v="+Arrays.toString(v)+", kmer="+kmer+
+// ", get(kmer)="+(Arrays.toString(getValues(kmer, new int[1])));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v[0]==old || !contains(kmer, old));
+// }
+// return x;
+// }
+//
+// public final int setIfNotPresent_Test(long kmer, int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+//// int[] vals=getValues(kmer, null);
+//// assert(vals==null || contains(kmer, vals));
+//// x=setIfNotPresent(kmer, v);
+//// assert(contains(kmer, vals));
+//// assert(contains(kmer, v));
+// x=0;
+// assert(false);
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=setIfNotPresent0(kmer, v);
+// assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
+// }
+// return x;
+// }
+
+ /**
+ * Stores a set of values for a kmer.
+ * Kmers for which no cell is available in the probe window (HASH_COLLISION) are delegated to the victim cache.
+ * @param kmer Kmer to store
+ * @param v Values to associate with the kmer
+ * @return 1 if the kmer was newly added to the main arrays, 0 if it was already there;
+ * for victim-cache kmers, the result of victims.set.
+ */
+ @Override
+ public final int set(Kmer kmer, final int[] v){
+ final int cell=findKmerOrEmpty(kmer);
+
+ if(cell==HASH_COLLISION){
+ //Arrays.toString is required; concatenating the array itself prints only its identity hash.
+ if(verbose){System.err.println("C2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
+ final int x=victims.set(kmer, v);
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ if(verbose){System.err.println("C2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+ return x;
+ }
+ final long[] key=kmer.key();
+
+ assert(cell>=0);
+
+ final boolean notpresent=(arrays[0][cell]==NOT_PRESENT);
+ if(notpresent){
+ if(verbose){System.err.println("B2: Setting cell "+cell+" to kmer "+kmer);}
+ setKmer(key, cell);
+ }
+
+ if(verbose){System.err.println("A2: Adding "+kmer+", "+Arrays.toString(v)+", "+cell);}
+ insertValue(key, v, cell);
+ if(verbose){System.err.println("A2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+
+ if(notpresent){
+ size++;
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return 1;
+ }else{
+ return 0;
+ }
+ }
+
+ public final void setKmer(long[] key, int cell){
+ if(verbose){System.err.println();}
+ for(int i=0; i<mult; i++){
+ arrays[i][cell]=key[i];
+ }
+ }
+
+ /**
+ * Stores a single value for a kmer.
+ * @return 1 if the kmer was newly added to the main arrays, 0 if it was already there;
+ * for victim-cache kmers, the result of victims.set.
+ */
+ @Override
+ public final int set(final Kmer kmer, final int v){
+ assert(kmer.mult==mult && kmer.len>=kmer.kbig);
+ final int cell=findKmerOrEmpty(kmer);
+
+ if(cell==HASH_COLLISION){
+ if(verbose){System.err.println("C2: Adding "+kmer+", "+v+", "+cell);}
+ final int added=victims.set(kmer, v);
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ if(verbose){System.err.println("C2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+ return added;
+ }
+ assert(cell>=0);
+ final long[] key=kmer.key();
+
+ final boolean wasEmpty=(arrays[0][cell]==NOT_PRESENT);
+ if(wasEmpty){
+ if(verbose){System.err.println("B2: Setting cell "+cell+" to kmer "+kmer);}
+ setKmer(key, cell);
+ }
+
+ if(verbose){System.err.println("A2: Adding "+kmer+", "+v+", "+cell);}
+ insertValue(key, v, cell);
+ if(verbose){System.err.println("A2: getValues("+kmer+") = "+Arrays.toString(getValues(kmer, new int[1])));}
+
+ if(!wasEmpty){return 0;}
+ size++;
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return 1;
+ }
+
+
+// protected LongList ll=new LongList(); //123
+// protected IntList il=new IntList();
+
+ /**
+ * Stores a value for a kmer only if the kmer is not already in the table.
+ * findKmerOrEmpty returns a cell index or HASH_COLLISION, never NOT_PRESENT, so emptiness
+ * is tested on the cell's contents. Comparing the index itself with NOT_PRESENT can never
+ * succeed, which would leave new kmers out of the main arrays.
+ * @param kmer Kmer to store
+ * @param value Value to associate with the kmer if it is new
+ * @return Number of kmers added: 1 if inserted into the main arrays, 0 if already present;
+ * for victim-cache kmers, the result of victims.setIfNotPresent.
+ */
+ @Override
+ public final int setIfNotPresent(Kmer kmer, int value){
+ final int cell=findKmerOrEmpty(kmer);
+
+ if(cell==HASH_COLLISION){
+ int x=victims.setIfNotPresent(kmer, value);
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return x;
+ }
+ assert(cell>=0);
+ final long[] key=kmer.key();
+
+ if(arrays[0][cell]==NOT_PRESENT){
+ setKmer(key, cell);
+ insertValue(key, value, cell);
+ size++;
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return 1;
+ }else{
+ return 0;
+ }
+ }
+
+ @Override
+ public final int getValue(Kmer kmer){
+ int cell=findKmer(kmer);
+ if(cell==NOT_PRESENT){return NOT_PRESENT;}
+ if(cell==HASH_COLLISION){return victims.getValue(kmer);}
+ return readCellValue(cell);
+ }
+
+ /* (non-Javadoc)
+ * @see ukmer.AbstractKmerTableU#getValue(long[], long)
+ */
+ @Override
+ public int getValue(long[] key, long xor) {
+ throw new RuntimeException("Unimplemented");
+ }
+
+ protected final long[] fillKey(int cell, long[] temp) {
+ return fillKey(cell, temp, arrays);
+ }
+
+ public final Kmer fillKmer(int cell, Kmer kmer) {
+ return fillKmer(cell, kmer, arrays);
+ }
+
+ public final Kmer fillKmer(int cell, Kmer kmer, long[][] matrix) {
+ long[] x=fillKey(cell, kmer.array1(), matrix);
+// assert(false) : x+"\ngetKmer("+cell+", kmer, matrix)"; //123
+ if(x==null){return null;}
+ kmer.fillArray2();
+ if(verbose){System.err.println("Filled kmer "+kmer+": a1="+Arrays.toString(kmer.array1())+", a2="+Arrays.toString(kmer.array2())+", key="+Arrays.toString(kmer.key()));}
+ return kmer;
+ }
+
+ protected final long[] fillKey(int cell, long[] temp, long[][] matrix) {
+ assert(temp.length==mult);
+ if(matrix[0][cell]<0){
+// assert(false) : matrix[0][cell]+"\ngetKmer("+cell+", kmer, matrix)\n"+Arrays.toString(matrix[0]); //123
+ return null;
+ }
+ for(int i=0; i<temp.length; i++){
+ temp[i]=matrix[i][cell];
+ }
+ if(verbose){System.err.println("cell="+cell+", matrix[0][cell]="+matrix[0][cell]+", temp="+Arrays.toString(temp)+"\nmatrix[0]="+Arrays.toString(matrix[0]));}
+ return temp;
+ }
+
+ /**
+ * Fetches the values stored for a kmer, consulting the victim cache on HASH_COLLISION.
+ * When the kmer is absent, singleton[0] is set to NOT_PRESENT and singleton is returned.
+ */
+ @Override
+ public final int[] getValues(Kmer kmer, int[] singleton){
+ final int cell=findKmer(kmer);
+ if(cell==HASH_COLLISION){return victims.getValues(kmer, singleton);}
+ if(cell!=NOT_PRESENT){return readCellValues(cell, singleton);}
+ singleton[0]=NOT_PRESENT;
+ return singleton;
+ }
+
+ /** Returns true if the kmer is stored in the main arrays or, on HASH_COLLISION, in the victim cache. */
+ @Override
+ public final boolean contains(Kmer kmer){
+ final int cell=findKmer(kmer);
+ return cell==HASH_COLLISION ? victims.contains(kmer) : cell!=NOT_PRESENT;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final void initializeOwnership(){
+ assert(owners==null);
+ owners=allocAtomicInt(arrays[0].length);
+ for(int i=0; i<arrays[0].length; i++){
+ owners.set(i, NO_OWNER);
+ }
+ victims.initializeOwnership();
+ }
+
+ @Override
+ public final void clearOwnership(){
+ owners=null;
+ victims.clearOwnership();
+ }
+
+ /** Locates the kmer and sets its owner, delegating to the victim cache on HASH_COLLISION. The kmer must be present. */
+ @Override
+ public final int setOwner(final Kmer kmer, final int newOwner){
+ final int cell=findKmer(kmer);
+ assert(cell!=NOT_PRESENT);
+ return (cell==HASH_COLLISION ? victims.setOwner(kmer, newOwner) : setOwner(kmer, newOwner, cell));
+ }
+
+ /**
+ * Raises the owner of a known cell to newOwner if newOwner is greater than the current owner.
+ * @param kmer Kmer expected to be stored in the cell (checked by assertion only)
+ * @param newOwner Candidate owner
+ * @param cell Cell index in the main arrays
+ * @return The owner after the call: newOwner if it was installed, otherwise the existing owner.
+ */
+ public final int setOwner(final Kmer kmer, final int newOwner, final int cell){
+// kmer.verify(true);
+ assert(matches(kmer.key(), cell)) : "cell="+cell+", key="+Arrays.toString(kmer.key())+", row="+Arrays.toString(cellToArray(cell))+"\n" +
+ "kmer="+kmer+", array1="+Arrays.toString(kmer.array1())+", array2="+Arrays.toString(kmer.array2())+", row="+AbstractKmerTableU.toText(cellToArray(cell), kmer.k);
+ final int original=owners.get(cell);
+ int current=original;
+ //Retry the compare-and-set until it succeeds or the stored owner is already >= newOwner.
+ while(current<newOwner){
+ boolean success=owners.compareAndSet(cell, current, newOwner);
+ //On failure the stored owner differed from 'current'; re-read it and re-evaluate.
+ if(!success){current=owners.get(cell);}
+ else{current=newOwner;}
+ }
+ assert(current>=original) : "original="+original+", current="+current+", newOwner="+newOwner+", re-read="+owners.get(cell);
+ return current;
+ }
+
+ /** Locates the kmer and clears its owner if it equals owner, delegating to the victim cache on HASH_COLLISION. The kmer must be present. */
+ @Override
+ public final boolean clearOwner(final Kmer kmer, final int owner){
+ final int cell=findKmer(kmer);
+ assert(cell!=NOT_PRESENT);
+ return (cell==HASH_COLLISION ? victims.clearOwner(kmer, owner) : clearOwner(kmer, owner, cell));
+ }
+
+ /** Resets the cell's owner to NO_OWNER if it currently equals owner; returns whether the reset happened. */
+ public final boolean clearOwner(final Kmer kmer, final int owner, final int cell){
+ assert(matches(kmer.key(), cell));
+ return owners.compareAndSet(cell, owner, NO_OWNER);
+ }
+
+ /** Returns the owner recorded for the kmer, delegating to the victim cache on HASH_COLLISION. The kmer must be present. */
+ @Override
+ public final int getOwner(final Kmer kmer){
+ final int cell=findKmer(kmer);
+ assert(cell!=NOT_PRESENT);
+ return (cell==HASH_COLLISION ? victims.getOwner(kmer) : getCellOwner(cell));
+ }
+
+ public final int getCellOwner(final int cell){
+ return owners.get(cell);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected abstract void insertValue(final long[] kmer, final int v, final int cell);
+
+ protected abstract void insertValue(final long[] kmer, final int[] vals, final int cell);
+
+ protected abstract int readCellValue(int cell);
+ protected abstract int[] readCellValues(int cell, int[] singleton);
+
+ @Override
+ final Object get(long[] kmer){
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ final int findKmer(Kmer kmer){
+ return findKmer(kmer.key(), kmer.xor());
+ }
+
+ /**
+ * Probes up to 'extra' consecutive cells starting at xor%prime for the given key.
+ * @return The matching cell index, NOT_PRESENT if an empty cell is reached first,
+ * or HASH_COLLISION if the probe window holds neither the key nor an empty cell.
+ */
+ final int findKmer(long[] key, long xor){
+ final int start=(int)(xor%prime);
+ final int limit=start+extra;
+ for(int cell=start; cell<limit; cell++){
+ final long first=arrays[0][cell];
+ if(first==key[0]){
+ //First word matches; compare the remaining words.
+ int word=1;
+ while(word<mult && key[word]==arrays[word][cell]){word++;}
+ if(word>=mult){return cell;}
+ }else if(first==NOT_PRESENT){
+ return NOT_PRESENT;
+ }
+ }
+ return HASH_COLLISION;
+ }
+
+ /**
+ * Probes up to 'extra' consecutive cells starting at kmer.xor()%prime.
+ * @return The first cell that is empty or already holds the kmer, or HASH_COLLISION if the probe window has neither.
+ */
+ final int findKmerOrEmpty(Kmer kmer){
+ final int start=(int)(kmer.xor()%prime);
+ if(verbose){System.err.println("Started at cell "+start+" for "+kmer);}
+
+ final long[] key=kmer.key();
+ final int limit=start+extra;
+ for(int cell=start; cell<limit; cell++){
+ if(arrays[0][cell]==NOT_PRESENT){
+ if(verbose){System.err.println("Chose empty cell "+cell+" for "+kmer);}
+ return cell;
+ }
+ int word=0;
+ while(word<mult && key[word]==arrays[word][cell]){word++;}
+ if(word>=mult){
+ if(verbose){System.err.println("Found cell "+cell+" containing "+kmer);}
+ return cell;
+ }
+ }
+ return HASH_COLLISION;
+ }
+
+ /** Returns true if all mult words of key equal the words stored in the given cell. */
+ final boolean matches(long[] key, int cell){
+ assert(cell>=0);
+ for(int word=0; word<mult; word++){
+ if(key[word]!=arrays[word][cell]){return false;}
+ }
+ return true;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ final boolean canResize() {return true;}
+
+ @Override
+ final public long size() {return size;}
+
+ @Override
+ final public int arrayLength() {return arrays[0].length;}
+
+ @Override
+ protected abstract void resize();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected long[] cellToArray(int cell){throw new RuntimeException("Unimplemented");}
+
+ /**
+ * Prints every stored kmer to tsw, one per line, then dumps the victim cache.
+ * For 1D tables only kmers whose value is at least mincount are printed; 2D tables print all kmers with their value lists.
+ * @return true
+ */
+ @Override
+ public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount){
+ final long[] key=new long[mult];
+ final int cells=arrays[0].length;
+ final int[] singleton=(TWOD ? new int[1] : null);
+ for(int cell=0; cell<cells; cell++){
+ final long[] words=fillKey(cell, key);
+ if(words==null){continue;}
+ if(TWOD){
+ tsw.print(toText(words, readCellValues(cell, singleton), k).append('\n'));
+ }else if(readCellValue(cell)>=mincount){
+ tsw.print(toText(words, readCellValue(cell), k).append('\n'));
+ }
+ }
+ if(victims!=null){
+ victims.dumpKmersAsText(tsw, k, mincount);
+ }
+ return true;
+ }
+
+ /**
+ * Writes every stored kmer to bsw, then dumps the victim cache.
+ * For 1D tables only kmers whose value is at least mincount are written; 2D tables write all kmers with their value lists.
+ * @return true
+ */
+ @Override
+ public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ final long[] key=new long[mult];
+ final int cells=arrays[0].length;
+ final int[] singleton=(TWOD ? new int[1] : null);
+ for(int cell=0; cell<cells; cell++){
+ final long[] words=fillKey(cell, key);
+ if(words==null){continue;}
+ if(TWOD){
+ bsw.printlnKmer(words, readCellValues(cell, singleton), k);
+ }else if(readCellValue(cell)>=mincount){
+ bsw.printlnKmer(words, readCellValue(cell), k);
+ }
+ }
+ if(victims!=null){
+ victims.dumpKmersAsBytes(bsw, k, mincount);
+ }
+ return true;
+ }
+
+ /**
+ * Appends every stored kmer to bb, handing a copy of bb to bsw whenever it reaches 16000 bytes,
+ * then dumps the victim cache into the same buffer. Bytes left in bb are not flushed here.
+ * For 1D tables only kmers whose value is at least mincount are written; 2D tables write all kmers with their value lists.
+ * @return true
+ */
+ @Override
+ public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ final long[] key=new long[mult];
+ final int cells=arrays[0].length;
+ final int[] singleton=(TWOD ? new int[1] : null);
+ for(int cell=0; cell<cells; cell++){
+ final long[] words=fillKey(cell, key);
+ if(words==null){continue;}
+ if(TWOD){
+ toBytes(words, readCellValues(cell, singleton), k, bb);
+ }else{
+ if(readCellValue(cell)<mincount){continue;}
+ toBytes(words, readCellValue(cell), k, bb);
+ }
+ bb.append('\n');
+ flushIfFull(bsw, bb);
+ }
+ if(victims!=null){
+ victims.dumpKmersAsBytes_MT(bsw, bb, k, mincount);
+ }
+ return true;
+ }
+
+ /** Once bb holds at least 16000 bytes, queues a copy of it on bsw (under the bsw monitor) and clears bb. */
+ private static void flushIfFull(final ByteStreamWriter bsw, final ByteBuilder bb){
+ if(bb.length()>=16000){
+ final ByteBuilder copy=new ByteBuilder(bb);
+ synchronized(bsw){bsw.addJob(copy);}
+ bb.clear();
+ }
+ }
+
+ /**
+ * Adds one to ca[min(value, max)] for every occupied cell, then lets the victim cache do the same.
+ * @param ca Histogram to increment
+ * @param max Highest histogram index to use
+ */
+ @Override
+ public final void fillHistogram(long[] ca, int max){
+ final int cells=arrays[0].length;
+ for(int cell=0; cell<cells; cell++){
+ if(arrays[0][cell]==NOT_PRESENT){continue;}
+ final int bin=Tools.min(readCellValue(cell), max);
+ ca[bin]++;
+ }
+ if(victims!=null){
+ victims.fillHistogram(ca, max);
+ }
+ }
+
+ public HashForestU victims(){
+ return victims;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ AtomicIntegerArray owners;
+ long[][] arrays;
+ int prime;
+ long size=0;
+ long sizeLimit;
+ final HashForestU victims;
+ final boolean autoResize;
+ final int kbig;
+ final int mult;//Length of Kmer arrays.
+ public final boolean TWOD;
+ private final Lock lock=new ReentrantLock();
+
+ public AtomicIntegerArray owners() {return owners;}
+
+ @Override
+ final Lock getLock(){return lock;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ final static int extra=21;
+ final static int maxPrime=(int)Primes.primeAtMost(Integer.MAX_VALUE-extra);
+ final static float resizeMult=2f; //Resize by a minimum of this much
+ final static float minLoadFactor=0.58f; //Resize by enough to get the load above this factor
+ final static float maxLoadFactor=0.905f; //Reaching this load triggers resizing
+ final static float minLoadMult=1/minLoadFactor;
+ final static float maxLoadMult=1/maxLoadFactor;
+
+}
diff --git a/current/ukmer/HashArrayU1D.java b/current/ukmer/HashArrayU1D.java
new file mode 100755
index 0000000..a8293dc
--- /dev/null
+++ b/current/ukmer/HashArrayU1D.java
@@ -0,0 +1,298 @@
+package ukmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import kmer.Primes;
+
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and counts in an int[], with a victim cache.
+ * @author Brian Bushnell
+ * @date Oct 25, 2013
+ *
+ */
+public final class HashArrayU1D extends HashArrayU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashArrayU1D(int initialSize, int mult_, boolean autoResize_){
+ super(initialSize, mult_, autoResize_, false);
+ values=allocInt1D(prime+extra);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final int increment(final Kmer kmer){
+ final int cell=findKmerOrEmpty(kmer);
+
+ if(cell==HASH_COLLISION){
+ int x=victims.increment(kmer);
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return x;
+ }else if(arrays[0][cell]==NOT_PRESENT){
+ setKmer(kmer.key(), cell);
+ size++;
+ values[cell]=1;
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return 1;
+ }else{
+ values[cell]++;
+ if(values[cell]<0){values[cell]=Integer.MAX_VALUE;}
+ return values[cell];
+ }
+ }
+
+ @Override
+ public final int incrementAndReturnNumCreated(final Kmer kmer){
+// assert(kmer.verify(false));
+//// System.err.println("***");
+//// System.err.println("Incrementing kmer "+kmer+"\n"+kmer.arraysToString());
+//// System.err.println("Initial state:"+Arrays.toString(arrays[0])+"\n"+Arrays.toString(values)+"\nVictims.size: "+victims.size);
+// final int a=getValue(kmer);
+// final int x=incrementAndReturnNumCreated0(kmer);
+// final int b=getValue(kmer);
+//// System.err.println("Kmer is now "+kmer+"\n"+kmer.arraysToString());
+// assert(kmer.verify(false));
+// assert((a==-1 && b==1) || (a+1==b)) : a+", "+b+", "+kmer+"\n"+kmer.arraysToString()+"\n"+Arrays.toString(arrays[0])+"\n"+Arrays.toString(values);
+// return x;
+// }
+//
+// public final int incrementAndReturnNumCreated0(final Kmer kmer){
+ final int cell=findKmerOrEmpty(kmer);
+// assert(victims.size<size+100);
+// System.err.println("size="+size+", victims="+victims.size+", sizeLimit="+sizeLimit+", autoResize="+autoResize);//123
+ if(cell==HASH_COLLISION){
+// if(verbose || true){System.err.println("HASH_COLLISION - sending to victims.");}
+ final int x=victims.incrementAndReturnNumCreated(kmer);
+ if(autoResize && size+victims.size>sizeLimit){
+ if(verbose){System.err.println("Exceeded size limit - resizing.");}
+ resize();
+ }
+// else{
+ assert(!autoResize || size+victims.size<=sizeLimit+1) : sizeLimit+"<"+(size+victims.size)+", size="+size+", victims="+victims.size+", prime="+prime;
+// }
+ return x;
+ }else if(arrays[0][cell]==NOT_PRESENT){
+ setKmer(kmer.key(), cell);
+ size++;
+ values[cell]=1;
+ if(verbose){System.err.println("Added kmer "+kmer+", key "+Arrays.toString(kmer.key())+
+ ", a1 "+Arrays.toString(kmer.array1())+", a2 "+Arrays.toString(kmer.array2())+", xor "+kmer.xor()+", to cell "+cell+"\n" +
+ " array:"/*+Arrays.toString(arrays[0])*/);}
+ if(autoResize && size+victims.size>sizeLimit){
+ if(verbose){System.err.println("Exceeded size limit - resizing.");}
+ resize();
+ }
+// else{
+ assert(!autoResize || size+victims.size<=sizeLimit+1) : sizeLimit+"<"+(size+victims.size)+", size="+size+", victims="+victims.size+", prime="+prime;
+// }
+ return 1;
+ }else{
+ if(verbose){System.err.println("Already present - incrementing.");}
+ assert(!autoResize || size+victims.size<=sizeLimit+1) : sizeLimit+"<"+(size+victims.size)+", size="+size+", victims="+victims.size+", prime="+prime;
+ values[cell]++;
+ if(values[cell]<0){values[cell]=Integer.MAX_VALUE;}
+ return 0;
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final int readCellValue(int cell) {
+ return values[cell];
+ }
+
+ @Override
+ protected final int[] readCellValues(int cell, int[] singleton) {
+ singleton[0]=values[cell];
+ return singleton;
+ }
+
+ @Override
+ protected final void insertValue(long[] kmer, int v, int cell) {
+ assert(matches(kmer, cell));
+ values[cell]=v;
+ }
+
+ @Override
+ protected final void insertValue(long[] kmer, int[] vals, int cell) {
+ assert(matches(kmer, cell));
+ assert(vals.length==1);
+ values[cell]=vals[0];
+ }
+
+ protected long[] cellToArray(int cell){
+ long[] r=new long[mult];
+ for(int i=0; i<mult; i++){r[i]=arrays[i][cell];}
+ return r;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ @Override
+ protected synchronized void resize(){
+ if(verbose){System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));}
+ final int oldPrime=prime;
+ if(prime>=maxPrime){
+ sizeLimit=0xFFFFFFFFFFFFL;
+ return;
+ }
+
+ final long oldSize=size, oldVSize=victims.size;
+ final long totalSize=oldSize+oldVSize;
+
+ final long maxAllowedByLoadFactor=(long)(totalSize*minLoadMult);
+ final long minAllowedByLoadFactor=(long)(totalSize*maxLoadMult);
+
+// sizeLimit=Tools.min((long)(maxLoadFactor*prime), maxPrime);
+
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ if(maxAllowedByLoadFactor<prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ return;
+ }
+
+ long x=10+(long)(prime*resizeMult);
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+
+ if(prime2<=prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ assert(prime2==prime) : "Resizing to smaller array? "+totalSize+", "+prime+", "+x;
+ return;
+ }
+
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ long[][] oldk=arrays;
+ int[] oldc=values;
+// KmerNodeU[] oldv=victims.array;
+ arrays=allocLong2D(mult, prime2+extra);
+ for(int i=0; i<mult; i++){
+ Arrays.fill(arrays[i], NOT_PRESENT);
+ }
+ values=allocInt1D(prime2+extra);
+ ArrayList<KmerNodeU> list=victims.toList();
+ victims.clear();
+ size=0;
+ sizeLimit=Long.MAX_VALUE;
+// assert(false);
+ final Kmer kmer=new Kmer(kbig);
+// long kmersProcessed=0; //123
+ {
+ for(int i=0; i<oldk[0].length; i++){
+// assert(false) : oldk[0][i];
+ if(oldk[0][i]>NOT_PRESENT){
+// kmersProcessed++;
+// assert(false) : oldk[0][i];
+ Kmer temp=fillKmer(i, kmer, oldk);
+ assert(temp==kmer);
+ if(verbose){
+ System.err.println("In cell "+i+", found kmer "+kmer+"; key="+Arrays.toString(kmer.key())+"; " +
+ "a1="+Arrays.toString(kmer.array1())+"; a2="+Arrays.toString(kmer.array2()));
+ System.err.println(Arrays.toString(oldk[0]));
+ System.err.println(Arrays.toString(arrays[0]));
+ }
+ assert(temp!=null) : i+", "+kmer+", "+oldk[0][i];
+ set(temp, oldc[i]);
+
+// assert(getValue(temp)==oldc[i]); //123
+
+ if(verbose){
+ System.err.println("prime="+prime+", xor="+kmer.xor()+", mod="+(kmer.xor()%prime));
+ System.err.println("After set: kmer "+kmer+"; key="+Arrays.toString(kmer.key())+"; " +
+ "a1="+Arrays.toString(kmer.array1())+"; a2="+Arrays.toString(kmer.array2()));
+ System.err.println(Arrays.toString(arrays[0]));
+ }
+// assert(kmer.verify(false)); //123
+ }
+ }
+ }
+
+ for(KmerNodeU n : list){
+ if(n.pivot[0]>NOT_PRESENT){
+ kmer.setFrom(n.pivot());
+ set(kmer, n.value());
+// assert(getValue(kmer)==n.value()); //123 slow
+ }
+ else{assert(false) : "pivot="+n.pivot()+", n="+n;}
+ }
+
+ assert(oldSize+oldVSize==size+victims.size) : oldSize+", "+oldVSize+" -> "+size+", "+victims.size+"; totalSize="+totalSize+", new total="+(size+victims.size)+
+ "\noldPrime="+oldPrime+", prime="+prime+(prime<1000 ? (
+ "\noldArray:"+Arrays.toString(oldk[0])+
+ "\nnewArray:"+Arrays.toString(arrays[0])
+ ) : "");
+
+ sizeLimit=(long)(maxLoadFactor*prime);
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance(){
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Override
+ public long regenerate(){
+ long sum=0;
+ assert(owners==null) : "Clear ownership before regeneration.";
+ final Kmer kmer=new Kmer(kbig);
+ for(int pos=0; pos<values.length; pos++){
+ Kmer key=fillKmer(pos, kmer);
+ if(key!=null){
+ final int value=values[pos];
+ values[pos]=NOT_PRESENT;
+ arrays[0][pos]=NOT_PRESENT;
+ size--;
+ if(value>0){
+ set(key, value);
+ }else{
+ sum++;
+ }
+ }
+ }
+
+ ArrayList<KmerNodeU> nodes=victims.toList();
+ victims.clear();
+ for(KmerNodeU node : nodes){
+ int value=node.value();
+ if(value<1){
+ sum++;
+ }else{
+ kmer.setFrom(node.pivot());
+ set(kmer, node.value());
+ }
+ }
+
+ return sum;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int[] values;
+
+ public int[] values(){return values;}
+
+
+
+}
diff --git a/current/ukmer/HashArrayU2D.java b/current/ukmer/HashArrayU2D.java
new file mode 100755
index 0000000..0cf7ccb
--- /dev/null
+++ b/current/ukmer/HashArrayU2D.java
@@ -0,0 +1,226 @@
+package ukmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import kmer.Primes;
+
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and values in an int[][], with a victim cache.
+ * @author Brian Bushnell
+ * @date Nov 7, 2014
+ *
+ */
+public final class HashArrayU2D extends HashArrayU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashArrayU2D(int initialSize, int mult_, boolean autoResize_){
+ super(initialSize, mult_, autoResize_, true);
+ values=allocInt2D(prime+extra);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Deprecated
+ @Override
+ public int increment(final Kmer kmer){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ public int incrementAndReturnNumCreated(final Kmer kmer){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ protected final int readCellValue(int cell) {
+ int[] set=values[cell];
+ return set==null ? 0 : set[0];
+ }
+
+ @Override
+ protected final int[] readCellValues(int cell, int[] singleton) {
+ return values[cell];
+ }
+
+ /** Returns number of values added */
+ protected final void insertValue(final long[] kmer, final int v, final int cell){
+ assert(matches(kmer, cell));
+ if(values[cell]==null){
+ values[cell]=new int[] {v, NOT_PRESENT};
+ return;
+ }
+ int[] set=values[cell];
+ assert(set!=null);
+
+ for(int i=0; i<set.length; i++){
+ if(set[i]==v){return;}
+ else if(set[i]<0){set[i]=v;return;}
+ }
+ final int oldSize=set.length;
+ final int newSize=(int)Tools.min(Integer.MAX_VALUE, oldSize*2L);
+ assert(newSize>set.length) : "Overflow.";
+ set=Arrays.copyOf(set, newSize);
+ set[oldSize]=v;
+ Arrays.fill(set, oldSize+1, newSize, NOT_PRESENT);
+ values[cell]=set;
+ }
+
+ /** Returns number of values added */
+ protected final void insertValue(final long[] kmer, final int[] vals, final int cell){
+ assert(matches(kmer, cell));
+ if(values[cell]==null){
+ values[cell]=vals;
+ }else{
+ for(int v : vals){
+ if(v<0){break;}
+ insertValue(kmer, v, cell);
+ }
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ @Override
+ protected synchronized void resize(){
+// assert(false);
+// System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ if(prime>=maxPrime){
+ sizeLimit=0xFFFFFFFFFFFFL;
+ return;
+ }
+
+ final long oldSize=size, oldVSize=victims.size;
+ final long totalSize=oldSize+oldVSize;
+
+ final long maxAllowedByLoadFactor=(long)(totalSize*minLoadMult);
+ final long minAllowedByLoadFactor=(long)(totalSize*maxLoadMult);
+
+// sizeLimit=Tools.min((long)(maxLoadFactor*prime), maxPrime);
+
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ if(maxAllowedByLoadFactor<prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ return;
+ }
+
+ long x=10+(long)(prime*resizeMult);
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+
+ if(prime2<=prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ assert(prime2==prime) : "Resizing to smaller array? "+totalSize+", "+prime+", "+x;
+ return;
+ }
+// System.err.println("Resizing from "+prime+" to "+prime2+"; size="+size);
+
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ long[][] oldk=arrays;
+ int[][] oldc=values;
+ arrays=allocLong2D(mult, prime2+extra);
+ for(int i=0; i<mult; i++){
+ Arrays.fill(arrays[i], NOT_PRESENT);
+ }
+ values=allocInt2D(prime2+extra);
+ ArrayList<KmerNodeU> list=victims.toList();
+ victims.clear();
+ size=0;
+ sizeLimit=Long.MAX_VALUE;
+
+ final int[] singleton=new int[] {NOT_PRESENT};
+ final Kmer kmer=new Kmer(kbig);
+ {
+ for(int i=0; i<oldk.length; i++){
+ if(oldk[0][i]>NOT_PRESENT){
+ set(fillKmer(i, kmer, oldk), oldc[i]);
+ }
+ }
+ }
+
+ for(KmerNodeU n : list){
+ if(n.pivot[0]>NOT_PRESENT){
+ kmer.setFrom(n.pivot());
+ set(kmer, n.values(singleton));
+ }
+ else{assert(false);}
+ }
+
+ assert(oldSize+oldVSize==size+victims.size) : oldSize+", "+oldVSize+" -> "+size+", "+victims.size;
+
+ sizeLimit=(long)(maxLoadFactor*prime);
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance(){
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ public long regenerate(){
+ assert(false) : "This is not tested or intended for use.";
+ long sum=0;
+ assert(owners==null) : "Clear ownership before regeneration.";
+ final Kmer kmer=new Kmer(kbig);
+ for(int pos=0; pos<values.length; pos++){
+ Kmer key=fillKmer(pos, kmer);
+ if(key!=null){
+ final int[] value=values[pos];
+ values[pos]=null;
+ arrays[0][pos]=NOT_PRESENT;
+ size--;
+ if(value!=null){
+ assert(value[0]>0);
+ set(key, value);
+ }else{
+ sum++;
+ }
+ }
+ }
+
+ ArrayList<KmerNodeU> nodes=victims.toList();
+ victims.clear();
+ for(KmerNodeU node : nodes){
+ int value=node.value();
+ if(value<1){
+ sum++;
+ }else{
+ kmer.setFrom(node.pivot());
+ set(kmer, node.values(null));//TODO: Probably unsafe or unwise. Should test for singletons, etc.
+ }
+ }
+
+ return sum;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int[][] values;
+
+
+
+}
diff --git a/current/ukmer/HashArrayUHybrid.java b/current/ukmer/HashArrayUHybrid.java
new file mode 100755
index 0000000..7765869
--- /dev/null
+++ b/current/ukmer/HashArrayUHybrid.java
@@ -0,0 +1,314 @@
+package ukmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import kmer.Primes;
+
+import align2.IntList2;
+import align2.Tools;
+
+/**
+ * Stores kmers in a long[] and counts in an int[], with a victim cache.
+ * @author Brian Bushnell
+ * @date Oct 25, 2013
+ *
+ */
+public final class HashArrayUHybrid extends HashArrayU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashArrayUHybrid(int initialSize, int mult_, boolean autoResize_){
+ super(initialSize, mult_, autoResize_, true);
+ values=allocInt1D(prime+extra);
+ setList=new IntList2();
+ setList.add(null);
+ setList.add(null);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final int increment(final Kmer kmer){
+ final int cell=findKmerOrEmpty(kmer);
+
+ if(cell==HASH_COLLISION){
+ int x=victims.increment(kmer);
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return x;
+ }else if(arrays[0][cell]==NOT_PRESENT){
+ setKmer(kmer.key(), cell);
+ size++;
+ values[cell]=1;
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return 1;
+ }else{
+ values[cell]++;
+ if(values[cell]<0){values[cell]=Integer.MAX_VALUE;}
+ return values[cell];
+ }
+ }
+
+ @Override
+ public final int incrementAndReturnNumCreated(final Kmer kmer){
+ final int cell=findKmerOrEmpty(kmer);
+
+ if(cell==HASH_COLLISION){
+ int x=victims.incrementAndReturnNumCreated(kmer);
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return x;
+ }else if(arrays[0][cell]==NOT_PRESENT){
+ setKmer(kmer.key(), cell);
+ size++;
+ values[cell]=1;
+ if(autoResize && size+victims.size>sizeLimit){resize();}
+ return 1;
+ }else{
+ values[cell]++;
+ if(values[cell]<0){values[cell]=Integer.MAX_VALUE;}
+ return 0;
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ protected final int readCellValue(int cell) {
+ final int x=values[cell];
+ if(x>-2){return x;}
+ return setList.get(0-x)[0];
+ }
+
+ @Override
+ protected final int[] readCellValues(int cell, int[] singleton) {
+ final int x=values[cell];
+ if(x>-2){
+ singleton[0]=values[cell];
+ return singleton;
+ }
+ return setList.get(0-x);
+ }
+
+ @Override
+ protected final void insertValue(long[] kmer, int[] vals, int cell) {
+ if(verbose){System.err.println("insertValue("+kmer+", "+Arrays.toString(vals)+", "+cell+"); old="+values[cell]);}
+ assert(matches(kmer, cell));
+ if(vals.length==1){
+ if(verbose){System.err.println("A: length=1");}
+ insertValue(kmer, vals[0], cell);
+ return;
+ }
+ final int old=values[cell];
+ if(old==vals[0] && vals[1]==NOT_PRESENT){
+ if(verbose){System.err.println("B: old==vals[0] && vals[1]==-1");}
+ return; //Nothing to do
+ }else if(old<-1){//An array already exists
+ if(verbose){System.err.println("C: old<-1");}
+ for(int i : vals){
+ if(i==-1){break;}
+ insertIntoList(i, -old);
+ }
+ }else{//Add the list
+ final int[] temp;
+ if(old>0){//Move the old value to a new array. Note that this will probably never be used.
+ if(verbose){System.err.println("D: old>0");}
+ temp=allocInt1D(vals.length+1);
+ temp[0]=old;
+ for(int i=0; i<vals.length; i++){temp[i+1]=vals[i];}
+ }else{
+ if(verbose){System.err.println("E: old>0");}
+ temp=vals;
+ }
+ values[cell]=-setList.size;
+ setList.add(temp);
+ }
+ }
+
+ @Override
+ protected final void insertValue(long[] kmer, int v, int cell) {
+ assert(matches(kmer, cell));
+ assert(v>0);
+ final int cc=values[cell];
+ if(cc==v){
+ return;
+ }else if(cc<-1){
+ insertIntoList(v, -cc);
+ }else if(cc>0){
+ values[cell]=-setList.size;
+ setList.add(new int[] {cc, v, -1, -1});
+ }else{
+ values[cell]=v;
+ }
+ }
+
+ private final int insertIntoList(final int v, final int loc){
+
+ if(loc>=setList.size){
+ assert(loc==setList.size);
+ setList.add(null);
+ }
+
+ int[] set=setList.get(loc);
+ if(set==null){
+ set=new int[] {-1, -1};
+ setList.set(loc, set);
+ }
+
+ for(int i=0; i<set.length; i++){
+ if(set[i]==v){return 0;}
+ if(set[i]<0){set[i]=v;return 1;}
+ }
+ final int oldSize=set.length;
+ final int newSize=(int)Tools.min(Integer.MAX_VALUE, oldSize*2L);
+ assert(newSize>set.length) : "Overflow.";
+ set=Arrays.copyOf(set, newSize);
+ set[oldSize]=v;
+ Arrays.fill(set, oldSize+1, newSize, -1);
+ setList.set(loc, set);
+ return 1;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ @Override
+ protected synchronized void resize(){
+
+ if(verbose){
+ System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ }
+
+// assert(TESTMODE);
+// if(TESTMODE){
+// for(int i=0; i<ll.size; i++){
+// assert(contains(ll.get(i), il.get(i)));
+// assert(!contains(ll.get(i), Integer.MAX_VALUE));
+// }
+// }
+
+// System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ if(prime>=maxPrime){
+ sizeLimit=0xFFFFFFFFFFFFL;
+ return;
+ }
+
+ final long oldSize=size, oldVSize=victims.size;
+ final long totalSize=this.size+victims.size;
+
+ final long maxAllowedByLoadFactor=(long)(totalSize*minLoadMult);
+ final long minAllowedByLoadFactor=(long)(totalSize*maxLoadMult);
+
+// sizeLimit=Tools.min((long)(maxLoadFactor*prime), maxPrime);
+
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ if(maxAllowedByLoadFactor<prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ return;
+ }
+
+ long x=10+(long)(prime*resizeMult);
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+
+ if(prime2<=prime){
+ sizeLimit=(long)(maxLoadFactor*prime);
+ assert(prime2==prime) : "Resizing to smaller array? "+totalSize+", "+prime+", "+x;
+ return;
+ }
+
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ long[][] oldk=arrays;
+ int[] oldc=values;
+ arrays=allocLong2D(mult, prime2+extra);
+ for(int i=0; i<mult; i++){
+ Arrays.fill(arrays[i], NOT_PRESENT);
+ }
+ IntList2 oldList=setList;
+ setList=new IntList2();
+ setList.add(null);
+ setList.add(null);//TODO: May have to add 3 of them to avoid HASH_COLLISION at -2
+ values=allocInt1D(prime2+extra);
+ ArrayList<KmerNodeU> list=victims.toList();
+ victims.clear();
+ size=0;
+ sizeLimit=Long.MAX_VALUE;
+
+ final int[] singleton=new int[] {NOT_PRESENT};
+ final Kmer kmer=new Kmer(kbig);
+ {
+ for(int i=0; i<oldk.length; i++){
+ if(oldk[0][i]>NOT_PRESENT){
+ final int v=oldc[i];
+ fillKmer(i, kmer, oldk);
+ if(v>=0){
+ set(kmer, v);
+ }else{
+ set(kmer, oldList.get(-v));
+ }
+ }
+ }
+ }
+
+ for(KmerNodeU n : list){
+ if(n.pivot[0]>NOT_PRESENT){
+ kmer.setFrom(n.pivot());
+ if(n.numValues()>1){
+ set(kmer, n.values(singleton));
+ }else{
+ set(kmer, n.value());
+ }
+ }else{assert(false);}
+ }
+
+ assert(oldSize+oldVSize==size+victims.size) : oldSize+" + "+oldVSize+" = "+(oldSize+oldVSize)+" -> "+size+" + "+victims.size+" = "+(size+victims.size);
+
+ if(verbose){System.err.println("Resized to "+prime+". "+oldSize+" + "+oldVSize+" = "+(oldSize+oldVSize)+" -> "+size+" + "+victims.size+" = "+(size+victims.size));}
+
+ sizeLimit=(long)(maxLoadFactor*prime);
+
+// assert(TESTMODE);
+// if(TESTMODE){
+// for(int i=0; i<ll.size; i++){
+// long[] kmer=ll.get(i);
+// int v=il.get(i);
+// assert(contains(kmer, v)) : i+", "+ll.size+", "+kmer+", "+v+", "+Arrays.toString(getValues(kmer, new int[1]));
+// assert(!contains(kmer, Integer.MAX_VALUE));
+// }
+// }
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance(){
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ public long regenerate(){
+ throw new RuntimeException("Not supported.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int[] values;
+ private IntList2 setList;
+
+
+
+}
diff --git a/current/ukmer/HashBufferU.java b/current/ukmer/HashBufferU.java
new file mode 100755
index 0000000..1079e1a
--- /dev/null
+++ b/current/ukmer/HashBufferU.java
@@ -0,0 +1,312 @@
+package ukmer;
+
+import stream.ByteBuilder;
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Nov 22, 2013
+ *
+ */
+public class HashBufferU extends AbstractKmerTableU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashBufferU(AbstractKmerTableU[] tables_, int buflen_, int kbig_, boolean initValues){
+ tables=tables_;
+ buflen=buflen_;
+ kmer=new Kmer(kbig_);
+ mult=kmer.mult;
+ buflen2=buflen*mult;
+ halflen2=((buflen+1)/2)*mult;
+ ways=tables.length;
+ buffers=new KmerBufferU[ways];
+ for(int i=0; i<ways; i++){
+ buffers[i]=new KmerBufferU(buflen, kmer.kbig, initValues);
+ }
+// tempKey=new long[mult];
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+// @Override
+// public int incrementAndReturnNumCreated(Kmer kmer) {
+// assert(kmer.mult==mult) : kmer.mult+"!="+mult+", kbig="+kmer.kbig+", k="+kmer.k;
+// final int way=getWay(kmer);
+// KmerBufferU buffer=buffers[way];
+// final int size=buffer.add(kmer);
+// if(size==halflen2 || size>=buflen2){
+// return dumpBuffer(way, size>=buflen2);
+// }
+// return 0;
+// }
+
+ @Override
+ public int incrementAndReturnNumCreated(Kmer kmer) {
+ assert(kmer.mult==mult) : kmer.mult+"!="+mult+", kbig="+kmer.kbig+", k="+kmer.k;
+ final int way=getWay(kmer);
+ KmerBufferU buffer=buffers[way];
+ final int size=buffer.add(kmer);
+ if(size>=halflen2 && (size>=buflen2 || (size&SIZEMASK)==0)){
+// if(size==halflen2 || size>=buflen2){
+ return dumpBuffer(way, size>=buflen2);
+ }
+ return 0;
+ }
+
+ @Override
+ public final long flush(){
+ long added=0;
+ for(int i=0; i<ways; i++){added+=dumpBuffer(i, true);}
+ return added;
+ }
+
+ @Override
+ public int set(Kmer kmer, int value) {
+ throw new RuntimeException("Unimplemented method; this class lacks value buffers");
+ }
+
+ @Override
+ public int set(Kmer kmer, int[] vals) {
+ throw new RuntimeException("Unimplemented method; this class lacks value buffers");
+ }
+
+ @Override
+ public int setIfNotPresent(Kmer kmer, int value) {
+ throw new RuntimeException("Unimplemented method; this class lacks value buffers");
+ }
+
+ @Override
+ public int getValue(Kmer kmer) {
+ final int way=getWay(kmer);
+ return tables[way].getValue(kmer);
+ }
+
+ @Override
+ public int getValue(long[] key, long xor) {
+ final int way=(int)(xor%ways);
+ return tables[way].getValue(key, xor);
+ }
+
+ @Override
+ public int[] getValues(Kmer kmer, int[] singleton){
+ final int way=getWay(kmer);
+ return tables[way].getValues(kmer, singleton);
+ }
+
+ @Override
+ public boolean contains(Kmer kmer) {
+ final int way=getWay(kmer);
+ return tables[way].contains(kmer);
+ }
+
+ public final int getWay(Kmer kmer){return (int)(kmer.xor()%ways);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final void initializeOwnership(){
+ for(AbstractKmerTableU t : tables){t.initializeOwnership();}
+ }
+
+ @Override
+ public final void clearOwnership(){
+ for(AbstractKmerTableU t : tables){t.clearOwnership();}
+ }
+
+ @Override
+ public final int setOwner(final Kmer kmer, final int newOwner){
+ final int way=getWay(kmer);
+ return tables[way].setOwner(kmer, newOwner);
+ }
+
+ @Override
+ public final boolean clearOwner(final Kmer kmer, final int owner){
+ final int way=getWay(kmer);
+ return tables[way].clearOwner(kmer, owner);
+ }
+
+ @Override
+ public final int getOwner(final Kmer kmer){
+ final int way=getWay(kmer);
+ return tables[way].getOwner(kmer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ Object get(Kmer kmer) {
+ final int way=getWay(kmer);
+ return tables[way].get(kmer);
+ }
+
+ @Override
+ Object get(long[] kmer) {
+ throw new RuntimeException();
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private int dumpBuffer(final int way, boolean force){
+ final KmerBufferU buffer=buffers[way];
+ final AbstractKmerTableU table=tables[way];
+ final int lim=buffer.size();
+ if(lim<0){return 0;}
+ if(force){table.lock();}
+ else if(!table.tryLock()){return 0;}
+ final int x=dumpBuffer_inner(way);
+ table.unlock();
+ return x;
+ }
+
+ private int dumpBuffer_inner(final int way){
+ if(verbose){System.err.println("Dumping buffer for way "+way+" of "+ways);}
+ final KmerBufferU buffer=buffers[way];
+ final int lim=buffer.size();
+ if(lim<1){return 0;}
+ final long[] kmers=buffer.kmers.array;
+ final int[] values=(buffer.values==null ? null : buffer.values.array);
+ int added=0;
+ final AbstractKmerTableU table=tables[way];
+ final long array1[]=kmer.array1();
+// synchronized(table){
+ if(values==null){
+// System.err.println("way="+way);
+ for(int j=0; j<lim;){
+ for(int x=0; x<mult; x++, j++){
+ if(verbose){System.err.println("x="+x+", j="+j);}
+ array1[x]=kmers[j];
+ }
+ kmer.fillArray2();
+ if(verbose){System.err.println("Incrementing "+kmer+"; xor="+kmer.xor());}
+// assert(kmer.mod(ways)==way) : kmer+", "+way+", "+ways+", "+kmer.mod(ways)+", "+kmer.xor()+"\n"+
+// Arrays.toString(kmer.array1())+"\n"+Arrays.toString(kmer.array2())+"\n"+Arrays.toString(kmer.key());
+// assert(kmer.verify(false));
+ int x=table.incrementAndReturnNumCreated(kmer);
+ added+=x;
+ }
+ }else{
+ for(int i=0, j=0; j<lim; i++){
+ for(int x=0; x<mult; x++, j++){
+ array1[x]=kmers[j];
+ }
+ kmer.fillArray2();
+ added+=table.setIfNotPresent(kmer, values[i]);
+ }
+ }
+// }
+ buffer.clear();
+ return added;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ final boolean canResize() {return false;}
+
+ @Override
+ public final boolean canRebalance() {return false;}
+
+ @Deprecated
+ @Override
+ public long size() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ public int arrayLength() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ void resize() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance() {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ public long regenerate(){
+ long sum=0;
+ for(AbstractKmerTableU table : tables){
+ sum+=table.regenerate();
+ }
+ return sum;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount){
+ for(AbstractKmerTableU table : tables){
+ table.dumpKmersAsText(tsw, k, mincount);
+ }
+ return true;
+ }
+
+ @Override
+ public boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ for(AbstractKmerTableU table : tables){
+ table.dumpKmersAsBytes(bsw, k, mincount);
+ }
+ return true;
+ }
+
+ @Override
+ @Deprecated
+ public boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Override
+ public void fillHistogram(long[] ca, int max){
+ for(AbstractKmerTableU table : tables){
+ table.fillHistogram(ca, max);
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public int increment(Kmer kmer) {
+ throw new RuntimeException("Unsupported");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final AbstractKmerTableU[] tables;
+ private final int buflen;
+ private final int buflen2;
+ private final int halflen2;
+ private final int mult;
+ private final int ways;
+ private final KmerBufferU[] buffers;
+ private final Kmer kmer;
+
+ private final static int SIZEMASK=15;
+
+}
diff --git a/current/ukmer/HashForestU.java b/current/ukmer/HashForestU.java
new file mode 100755
index 0000000..8a3a70a
--- /dev/null
+++ b/current/ukmer/HashForestU.java
@@ -0,0 +1,540 @@
+package ukmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import stream.ByteBuilder;
+
+import kmer.Primes;
+
+
+import fileIO.ByteStreamWriter;
+import fileIO.TextStreamWriter;
+
+import align2.Tools;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 23, 2013
+ *
+ */
+public final class HashForestU extends AbstractKmerTableU implements Iterable<KmerNodeU> {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public HashForestU(int initialSize, boolean autoResize_){
+ this(initialSize, autoResize_, false);
+ }
+
+ public HashForestU(int initialSize, boolean autoResize_, boolean twod_){
+ if(initialSize>1){
+ initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize));
+ }else{
+ initialSize=1;
+ }
+ prime=initialSize;
+ sizeLimit=(long) (initialSize*resizeMult);
+ array=allocKmerNodeArray(prime);
+ autoResize=autoResize_;
+ TWOD=twod_;
+ }
+
+ // Convenience overloads: build a node from a Kmer by using its key() array as the pivot.
+ private KmerNodeU makeNode(Kmer kmer, int val){return makeNode(kmer.key(), val);}
+ private KmerNodeU makeNode(Kmer kmer, int[] vals){return makeNode(kmer.key(), vals);}
+
+ /** Creates a single-value node; the concrete class depends on the TWOD flag. */
+ private KmerNodeU makeNode(long[] kmer, int val){
+ return (TWOD ? new KmerNodeU2D(kmer, val) : new KmerNodeU1D(kmer, val));
+ }
+
+ /** Creates a multi-value node; only valid when this forest is in TWOD mode (asserted). */
+ private KmerNodeU makeNode(long[] kmer, int[] vals){
+ assert(TWOD);
+ return new KmerNodeU2D(kmer, vals);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Same as findParent(long[], int), using kmer.key() as the search key. */
+ public KmerNodeU findParent(Kmer kmer, final int cell){return findParent(kmer.key(), cell);}
+
+ /**
+  * Walks the tree rooted at array[cell] looking for kmer.
+  * @param kmer Key to search for.
+  * @param cell Index into array of the tree to search.
+  * @return The last node visited before the search ended: the parent of the matching node,
+  * or the node under which kmer would be attached. Null if the cell is empty or the root matches.
+  */
+ public KmerNodeU findParent(final long[] kmer, final int cell){
+     KmerNodeU parent=null;
+     KmerNodeU node=array[cell];
+     while(node!=null){
+         final int cmp=compare(kmer, node.pivot());
+         if(cmp==0){break;}
+         parent=node;
+         node=(cmp<0 ? node.left : node.right);
+     }
+     return parent;
+ }
+
+ /**
+  * Adds one to the count stored for kmer, creating a node with value 1 if no node matches.
+  * May trigger resize() after an insertion when autoResize is set.
+  * @param kmer K-mer to count; the cell comes from kmer.mod(prime) and the search key from kmer.key().
+  * @return The value of the matching (or newly created) node after the operation.
+  */
+ @Override
+ public int increment(Kmer kmer){
+     final int cell=kmer.mod(prime);
+     final long[] key=kmer.key();
+     KmerNodeU parent=null;
+     KmerNodeU node=array[cell];
+     while(node!=null){
+         final int cmp=compare(key, node.pivot());
+         if(cmp==0){break;}
+         parent=node;
+         node=(cmp<0 ? node.left : node.right);
+     }
+     if(node!=null){
+         //Existing entry: let the node bump its own value.
+         node.increment(kmer);
+         return node.value();
+     }
+     node=makeNode(kmer, 1);
+     size++;
+     if(parent==null){
+         array[cell]=node;
+     }else if(compare(key, parent.pivot)<0){
+         parent.left=node;
+     }else{
+         parent.right=node;
+     }
+     if(autoResize && size>sizeLimit){resize();}
+     return node.value();
+ }
+
+ /**
+  * Adds one to the count stored for kmer, creating a node with value 1 if no node matches.
+  * May trigger resize() after an insertion when autoResize is set.
+  * @param kmer K-mer to count.
+  * @return 1 if a new node was inserted, 0 if an existing node was incremented.
+  */
+ @Override
+ public int incrementAndReturnNumCreated(Kmer kmer){
+     final int cell=kmer.mod(prime);
+     if(verbose){System.err.println("Placed in cell "+cell+": "+Arrays.toString(kmer.key()));}
+     final long[] key=kmer.key();
+     KmerNodeU parent=null;
+     KmerNodeU node=array[cell];
+     while(node!=null){
+         final int cmp=compare(key, node.pivot());
+         if(cmp==0){break;}
+         parent=node;
+         node=(cmp<0 ? node.left : node.right);
+     }
+     if(node!=null){
+         node.increment(kmer);
+         return 0;
+     }
+     node=makeNode(kmer, 1);
+     size++;
+     if(parent==null){
+         array[cell]=node;
+     }else if(compare(key, parent.pivot)<0){
+         parent.left=node;
+     }else{
+         parent.right=node;
+     }
+     if(autoResize && size>sizeLimit){resize();}
+     return 1;
+ }
+
+// public final int set_Test(final long[] kmer, final int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// int[] old=getValues(kmer, null);
+// assert(old==null || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v==old || !contains(kmer, old));
+// }
+// return x;
+// }
+//
+// public final int setIfNotPresent_Test(Kmer kmer, int v){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+//// int[] vals=getValues(kmer, null);
+//// assert(vals==null || contains(kmer, vals));
+//// x=setIfNotPresent(kmer, v);
+//// assert(contains(kmer, vals));
+//// assert(contains(kmer, v));
+// x=0;
+// assert(false);
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=setIfNotPresent0(kmer, v);
+// assert((old<1 && contains(kmer, v)) || (old>0 && contains(kmer, old))) : kmer+", "+old+", "+v;
+// }
+// return x;
+// }
+//
+// public final int set_Test(final long[] kmer, final int v[]){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD){
+// int[] old=getValues(kmer, null);
+// assert(old==null || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v[0]==old || !contains(kmer, old));
+// }
+// return x;
+// }
+
+
+ /**
+  * Stores value for kmer, inserting a node if needed.
+  * @param kmer K-mer whose entry is written.
+  * @param value Value to store.
+  * @return Number of nodes added: 1 when the cell was empty, otherwise whatever the root node's set() reports.
+  */
+ @Override
+ public int set(Kmer kmer, int value){
+     final int cell=kmer.mod(prime);
+     final KmerNodeU root=array[cell];
+     final int added;
+     if(root!=null){
+         added=root.set(kmer, value);
+     }else{
+         array[cell]=makeNode(kmer, value);
+         added=1;
+     }
+     size+=added;
+     if(autoResize && size>sizeLimit){resize();}
+     return added;
+ }
+
+ /**
+  * Stores an array of values for kmer, inserting a node if needed.
+  * @param kmer K-mer whose entry is written.
+  * @param vals Values to store.
+  * @return Number of nodes added: 1 when the cell was empty, otherwise whatever the root node's set() reports.
+  */
+ @Override
+ public int set(Kmer kmer, int[] vals) {
+     final int cell=kmer.mod(prime);
+     final KmerNodeU root=array[cell];
+     final int added;
+     if(root!=null){
+         added=root.set(kmer, vals);
+     }else{
+         array[cell]=makeNode(kmer, vals);
+         added=1;
+     }
+     size+=added;
+     if(autoResize && size>sizeLimit){resize();}
+     return added;
+ }
+
+ /**
+  * Stores value for kmer only if the tree has no node for it yet.
+  * @param kmer K-mer whose entry may be created.
+  * @param value Value for the new node.
+  * @return Number of nodes added: 1 when the cell was empty, otherwise whatever the root node's setIfNotPresent() reports.
+  */
+ @Override
+ public int setIfNotPresent(Kmer kmer, int value){
+     final int cell=kmer.mod(prime);
+     final KmerNodeU root=array[cell];
+     final int added;
+     if(root!=null){
+         added=root.setIfNotPresent(kmer, value);
+     }else{
+         array[cell]=makeNode(kmer, value);
+         added=1;
+     }
+     size+=added;
+     if(autoResize && size>sizeLimit){resize();}
+     return added;
+ }
+
+ @Override
+ public final int getValue(Kmer kmer){
+ return getValue(kmer.key(), kmer.xor());
+ }
+
+ int getValue(KmerNodeU n){
+ return getValue(n.pivot, n.xor());
+ }
+
+ /**
+  * Looks up the value stored for a key whose hash has already been computed.
+  * @param key Key array to search for.
+  * @param xor Hash of the key; the cell is xor modulo prime.
+  * @return -1 if the cell is empty, otherwise the result of the root node's getValue(key).
+  */
+ @Override
+ public int getValue(long[] key, long xor) {
+     final int cell=(int)(xor%prime);
+     final KmerNodeU root=array[cell];
+     if(verbose){System.err.println("Looking in cell "+cell+": "+root);}
+     if(root==null){return -1;}
+     return root.getValue(key);
+ }
+
+ @Override
+ Object get(long[] key) {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ /**
+  * Looks up the values stored for kmer.
+  * @param kmer K-mer to look up.
+  * @param singleton Scratch array handed to the node-level lookup.
+  * @return Null if the cell is empty, otherwise the result of the root node's getValues().
+  */
+ @Override
+ public int[] getValues(Kmer kmer, int[] singleton){
+     final KmerNodeU root=array[kmer.mod(prime)];
+     if(root==null){return null;}
+     return root.getValues(kmer, singleton);
+ }
+
+ @Override
+ public boolean contains(Kmer kmer){
+ return get(kmer)!=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Calls initializeOwnership() on the root of every non-empty cell. */
+ @Override
+ public final void initializeOwnership(){
+     for(int cell=0; cell<array.length; cell++){
+         final KmerNodeU root=array[cell];
+         if(root==null){continue;}
+         root.initializeOwnership();
+     }
+ }
+
+ @Override
+ public final void clearOwnership(){initializeOwnership();}
+
+ /**
+  * Delegates to the root node of kmer's cell.
+  * The cell is only checked by assertion, so kmer is expected to be present already.
+  * @return Whatever the node-level setOwner() returns.
+  */
+ @Override
+ public final int setOwner(final Kmer kmer, final int newOwner){
+ final int cell=kmer.mod(prime);
+ KmerNodeU n=array[cell];
+ assert(n!=null);
+ return n.setOwner(kmer, newOwner);
+ }
+
+ /**
+  * Delegates to the root node of kmer's cell.
+  * The cell is only checked by assertion, so kmer is expected to be present already.
+  * @return Whatever the node-level clearOwner() returns.
+  */
+ @Override
+ public final boolean clearOwner(final Kmer kmer, final int owner){
+ final int cell=kmer.mod(prime);
+ KmerNodeU n=array[cell];
+ assert(n!=null);
+ return n.clearOwner(kmer, owner);
+ }
+
+ /**
+  * Delegates to the root node of kmer's cell.
+  * The cell is only checked by assertion, so kmer is expected to be present already.
+  * @return Whatever the node-level getOwner() returns.
+  */
+ @Override
+ public final int getOwner(final Kmer kmer){
+ final int cell=kmer.mod(prime);
+ KmerNodeU n=array[cell];
+ assert(n!=null);
+ return n.getOwner(kmer);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Finds the node whose pivot equals kmer's key.
+  * @param kmer K-mer to look up.
+  * @return The matching node, or null if the cell's tree does not contain it.
+  */
+ @Override
+ final KmerNodeU get(Kmer kmer){
+     final int cell=kmer.mod(prime);
+     KmerNodeU node=array[cell];
+     final long[] key=kmer.key();
+     while(node!=null){
+         final int cmp=compare(key, node.pivot());
+         if(cmp==0){return node;}
+         node=(cmp<0 ? node.left : node.right);
+     }
+     return null;
+ }
+
+ public final KmerNodeU getNode(int cell){
+ KmerNodeU n=array[cell];
+ return n;
+ }
+
+ boolean insert(KmerNodeU n){
+ n.left=null;
+ n.right=null;
+ int cell=(int)(n.xor()%prime);
+ if(array[cell]==null){
+ array[cell]=n;
+ return true;
+ }
+ return array[cell].insert(n);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {return true;}
+
+ @Override
+ public boolean canRebalance() {return true;}
+
+ @Override
+ public long size() {return size;}
+
+ @Override
+ public int arrayLength() {return array.length;}
+
+ /**
+  * Grows the cell array to a larger prime and re-inserts every node.
+  * Returns without changing the array when the load-factor bounds do not permit a larger prime;
+  * sizeLimit is raised first, so those early returns still push the next trigger point forward.
+  */
+ @Override
+ synchronized void resize(){
+// assert(false);
+// System.err.println("Resizing from "+prime+"; load="+(size*1f/prime));
+ //Raise the trigger threshold up front so the early-return paths below do not leave it stale.
+ sizeLimit=Tools.max((long)(size*1.4), (long)(maxLoadFactor*prime));
+
+ //Cell-count bounds implied by the load factors: size/minLoadFactor (upper) and size/maxLoadFactor (lower).
+ final long maxAllowedByLoadFactor=(long)(size*minLoadMult);
+ final long minAllowedByLoadFactor=(long)(size*maxLoadMult);
+ assert(maxAllowedByLoadFactor>=minAllowedByLoadFactor);
+ //Already at or beyond the largest cell count the minimum load factor allows.
+ if(maxAllowedByLoadFactor<prime){return;}
+
+ //Target: grow by resizeMult, then clamp into the load-factor window.
+ long x=10+(long)(prime*resizeMult);
+ x=Tools.max(x, minAllowedByLoadFactor);
+ x=Tools.min(x, maxAllowedByLoadFactor);
+
+ int prime2=(int)Tools.min(maxPrime, Primes.primeAtLeast(x));
+
+ if(prime2<=prime){return;}
+
+ prime=prime2;
+// System.err.println("Resized to "+prime+"; load="+(size*1f/prime));
+ KmerNodeU[] old=array;
+ array=allocKmerNodeArray(prime2);
+ ArrayList<KmerNodeU> list=new ArrayList<KmerNodeU>(1000);
+ for(int i=0; i<old.length; i++){
+ if(old[i]!=null){
+ //Flatten the old tree into the list first; insert() clears each node's child links.
+ old[i].traverseInfix(list);
+ for(KmerNodeU n : list){
+ insert(n);
+// assert(getValue(n)==n.value());//123 slow
+ }
+ list.clear();
+ }
+ }
+ sizeLimit=Tools.max((long)(size*1.4), (long)(maxLoadFactor*prime));
+ }
+
+ /** Rebalances the tree in every non-empty cell, replacing each root with the node returned by the node-level rebalance. */
+ @Override
+ public void rebalance(){
+     final ArrayList<KmerNodeU> scratch=new ArrayList<KmerNodeU>(1000);
+     for(int cell=0; cell<array.length; cell++){
+         final KmerNodeU root=array[cell];
+         if(root==null){continue;}
+         array[cell]=root.rebalance(scratch);
+     }
+ }
+
+ public void clear() {
+ size=0;
+ Arrays.fill(array, null);
+ }
+
+ @Override
+ long regenerate() {
+ throw new RuntimeException("Not implemented.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Writes the k-mers of every non-empty cell to tsw as text.
+  * The root's own value is deliberately NOT compared with mincount here: a root's count says nothing
+  * about its descendants, so gating on it would drop whole subtrees that qualify.
+  * mincount is forwarded to the node-level dump instead.
+  * NOTE(review): relies on the node-level dump filtering each node by mincount - confirm in KmerNodeU1D/KmerNodeU2D.
+  * @param tsw Destination writer.
+  * @param k K-mer length, forwarded to the nodes.
+  * @param mincount Count threshold, forwarded to the nodes.
+  * @return Always true.
+  */
+ @Override
+ public boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount){
+     for(int i=0; i<array.length; i++){
+         final KmerNodeU node=array[i];
+         if(node!=null){
+             node.dumpKmersAsText(tsw, k, mincount);
+         }
+     }
+     return true;
+ }
+
+ /**
+  * Writes the k-mers of every non-empty cell to bsw as bytes.
+  * The root's own value is deliberately NOT compared with mincount here: a root's count says nothing
+  * about its descendants, so gating on it would drop whole subtrees that qualify.
+  * mincount is forwarded to the node-level dump instead.
+  * NOTE(review): relies on the node-level dump filtering each node by mincount - confirm in KmerNodeU1D/KmerNodeU2D.
+  * @param bsw Destination writer.
+  * @param k K-mer length, forwarded to the nodes.
+  * @param mincount Count threshold, forwarded to the nodes.
+  * @return Always true.
+  */
+ @Override
+ public boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+     for(int i=0; i<array.length; i++){
+         final KmerNodeU node=array[i];
+         if(node!=null){
+             node.dumpKmersAsBytes(bsw, k, mincount);
+         }
+     }
+     return true;
+ }
+
+ /**
+  * Writes the k-mers of every non-empty cell through the shared ByteBuilder/ByteStreamWriter pair.
+  * The root's own value is deliberately NOT compared with mincount here: a root's count says nothing
+  * about its descendants, so gating on it would drop whole subtrees that qualify.
+  * mincount is forwarded to the node-level dump instead.
+  * NOTE(review): relies on the node-level dump filtering each node by mincount - confirm in KmerNodeU1D/KmerNodeU2D.
+  * @param bsw Destination writer.
+  * @param bb Buffer handed to the nodes.
+  * @param k K-mer length, forwarded to the nodes.
+  * @param mincount Count threshold, forwarded to the nodes.
+  * @return Always true.
+  */
+ @Override
+ public boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+     for(int i=0; i<array.length; i++){
+         final KmerNodeU node=array[i];
+         if(node!=null){
+             node.dumpKmersAsBytes_MT(bsw, bb, k, mincount);
+         }
+     }
+     return true;
+ }
+
+ /**
+  * Asks the root of every non-empty cell to add its tree's counts to the histogram.
+  * @param ca Histogram array, passed through unchanged.
+  * @param max Upper bin limit, passed through unchanged.
+  */
+ @Override
+ public void fillHistogram(long[] ca, int max){
+     for(KmerNodeU root : array){
+         if(root==null){continue;}
+         root.fillHistogram(ca, max);
+     }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Iteration ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public Iterator<KmerNodeU> iterator() {
+ return toList().iterator();
+ }
+
+ /**
+  * Collects every node of every tree into one list, cell by cell.
+  * Asserts that the number collected matches the size counter.
+  * @return A new list containing all nodes.
+  */
+ public ArrayList<KmerNodeU> toList(){
+     assert(size<Integer.MAX_VALUE);
+     final ArrayList<KmerNodeU> nodes=new ArrayList<KmerNodeU>((int)size);
+     for(KmerNodeU root : array){
+         if(root!=null){root.traverseInfix(nodes);}
+     }
+     assert(nodes.size()==size);
+     return nodes;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNodeU[] array() {return array;}
+
+ KmerNodeU[] array;
+ int prime;
+ long size=0;
+ long sizeLimit;
+ final boolean autoResize;
+ final boolean TWOD;
+ private final Lock lock=new ReentrantLock();
+
+ @Override
+ final Lock getLock(){return lock;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Static Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ final static int maxPrime=(int)Primes.primeAtMost(Integer.MAX_VALUE);
+ final static float resizeMult=2.5f; //Resize by a minimum of this much
+ final static float minLoadFactor=0.75f; //Resize by enough to get the load above this factor
+ final static float maxLoadFactor=2.5f; //Resize by enough to get the load under this factor
+ final static float minLoadMult=1/minLoadFactor;
+ final static float maxLoadMult=1/maxLoadFactor;
+
+
+
+}
diff --git a/current/ukmer/Kmer.java b/current/ukmer/Kmer.java
new file mode 100755
index 0000000..32c1e97
--- /dev/null
+++ b/current/ukmer/Kmer.java
@@ -0,0 +1,347 @@
+package ukmer;
+
+import java.util.Arrays;
+
+import align2.Tools;
+
+import stream.ByteBuilder;
+import dna.AminoAcid;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 9, 2015
+ *
+ */
+public class Kmer implements Cloneable {
+
+ public Kmer(Kmer o){
+ this(o.k, o.mult);
+ setFrom(o);
+ }
+
+ public Kmer(int kbig_){
+ this(getK(kbig_), getMult(kbig_));
+ }
+
+ public Kmer clone(){
+ return new Kmer(this);
+ }
+
+ /**
+  * Creates an empty k-mer made of mult words of k bases each (kbig=k*mult bases in total).
+  * @param k_ Bases per word.
+  * @param mult_ Number of words.
+  */
+ public Kmer(int k_, int mult_){
+ k=k_;
+ mult=mult_;
+ maxindex=mult-1;
+ shift=2*k; //Bits used per word: 2 bits per base
+ shift2=shift-2; //Bit offset of the highest base within a word
+ mask=~((-1L)<<shift); //Low 'shift' bits set
+
+ kbig=k*mult;
+ array1=new long[mult];
+ array2=new long[mult];
+ key=null; //Chosen lazily by update()
+ }
+
+ public static int getMult(int kbig){
+ final int mult=getMult0(kbig);
+// assert(mult==getMult0(kbig*(mult/mult))) : mult+", "+getMult0(mult*(kbig/mult));
+ return mult;
+ }
+
+ public static int getKbig(int kbig){
+ int x=getMult(kbig)*getK(kbig);
+ assert(x<=kbig) : x+", "+kbig;
+ assert(kbig>31 || x==kbig);
+ return x;
+ }
+
+ /**
+  * Chooses how many words to split kbig bases into, with at most 31 bases per word.
+  * Compares the rounded-up and rounded-down word counts and keeps whichever covers more bases
+  * (covered = words * min(31, kbig/words)); ties go to the smaller word count.
+  * @param kbig Requested total k-mer length.
+  * @return Number of words.
+  */
+ private static int getMult0(int kbig){
+     final int word=31;
+
+     final int multCeil=(kbig+word-1)/word;
+     final int multFloor=Tools.max(1, kbig/word);
+     if(multCeil==multFloor){return multCeil;}
+
+     final int coveredCeil=Tools.min(word, kbig/multCeil)*multCeil;
+     final int coveredFloor=Tools.min(word, kbig/multFloor)*multFloor;
+
+     assert(coveredCeil<=kbig);
+     assert(coveredFloor<=kbig);
+     assert(multFloor<=multCeil);
+
+     return (coveredFloor>=coveredCeil ? multFloor : multCeil);
+ }
+
+ /**
+  * Bases per word for a requested total length: kbig divided (integer division) by getMult(kbig).
+  * @param kbig Requested total k-mer length.
+  * @return Bases per word; asserted to be at most 31.
+  */
+ public static int getK(int kbig){
+     final int words=getMult(kbig);
+     final int perWord=kbig/words;
+     assert(perWord*words<=kbig) : perWord+", "+kbig;
+     assert(perWord<=31) : kbig+", "+words+", "+perWord;
+     return perWord;
+ }
+
+ /**
+  * Copies the contents of another k-mer into this one.
+  * Both word arrays and len are copied; the cached key/hash are invalidated by bumping incarnation.
+  * @param o Source k-mer; it must have at least as many words as this one.
+  * @return This k-mer.
+  */
+ public Kmer setFrom(Kmer o){
+     for(int i=0; i<mult; i++){
+         array1[i]=o.array1[i];
+         array2[i]=o.array2[i];
+     }
+     //Copied once, outside the loop: it does not depend on i, and must be copied even if mult is 0.
+     len=o.len;
+     incarnation++;
+     return this;
+ }
+
+ public Kmer setFrom(long[] array){
+ for(int i=0; i<mult; i++){
+ array1[i]=array[i];
+ }
+ fillArray2();
+ incarnation++;
+ return this;
+ }
+
+ /** Resets this k-mer to empty: zero length, all words zeroed, cached key/hash invalidated. */
+ public void clear() {
+     len=0;
+     //Both arrays are allocated with exactly mult elements, so a full fill matches the per-index loop.
+     Arrays.fill(array1, 0L);
+     Arrays.fill(array2, 0L);
+     incarnation++;
+ }
+
+ public boolean verify(boolean update){
+// boolean b=verify();
+// if(b){
+// if(update){update();}
+// b=verify();
+// assert(len<kbig || incarnation==lastIncarnation);
+// }
+ if(update){
+ update();
+ assert(len<kbig || incarnation==lastIncarnation) : "incarnation="+incarnation+", last="+lastIncarnation+", len="+len+", kbig="+kbig;
+ }
+ boolean b=verify();
+ return b;
+ }
+
+ /**
+  * Checks that array2 is the word-reversed reverse complement of array1.
+  * A k-mer that is not yet full (len&lt;kbig) is reported as valid without checking.
+  * @return False on the first mismatching word pair, true otherwise.
+  */
+ private boolean verify(){
+ if(len<kbig){return true;}
+ //Word i of the forward array pairs with word (maxindex-i) of the reverse array.
+ for(int i=maxindex, j=0; j<mult; j++, i--){
+ long kmer=array1[i];
+ long rkmer=array2[j];
+ if(kmer!=AminoAcid.reverseComplementBinaryFast(rkmer, k)){
+// assert(false) : Arrays.toString(array1);
+ return false;
+ }
+ }
+ //With assertions enabled, a full and consistent k-mer must also have an up-to-date cache.
+ assert(incarnation==lastIncarnation) : "incarnation="+incarnation+", last="+lastIncarnation+", len="+len+", kbig="+kbig;
+ return true;
+ }
+
+ public byte addRight(final byte b){
+ long x=AminoAcid.baseToNumber[b];
+ return AminoAcid.numberToBase[(int)addRightNumeric(x)];
+ }
+
+ public byte addLeft(final byte b){
+ long x=AminoAcid.baseToNumber[b];
+ return AminoAcid.numberToBase[(int)addLeftNumeric(x)];
+ }
+
+ /**
+  * Shifts a 2-bit base code in at the right end of the forward k-mer and its complement in at the
+  * left end of the reverse k-mer.
+  * @param x Base code; a negative value is treated as 0, and resets len to 0.
+  * @return The 2-bit code that was sitting in the top base position of array1[0] before the shift.
+  */
+ public long addRightNumeric(long x){
+ long x2;
+
+ if(x<0){
+ //Negative code: restart the length count and shift in 0 (forward) / 3 (reverse).
+ x=0;
+ x2=3;
+ len=0;
+ }else{
+ x2=AminoAcid.numberToComplement[(int)x];
+ len++;
+ }
+
+ //Forward words are processed from last to first, reverse words from first to last;
+ //the base pushed out of each word is carried into the next word processed.
+ for(int i=maxindex, j=0; j<mult; j++, i--){
+
+ long y=(array1[i]>>>shift2)&3L; //Top base of this forward word, about to be shifted out
+ long y2=array2[j]&3L; //Bottom base of this reverse word, about to be shifted out
+
+ //Update kmers
+ array1[i]=((array1[i]<<2)|x)&mask;
+ array2[j]=(array2[j]>>>2)|(x2<<shift2);
+
+ x=y;
+ x2=y2;
+ }
+ incarnation++;
+ return x;
+ }
+
+ /**
+  * Shifts a 2-bit base code in at the left end of the forward k-mer and its complement in at the
+  * right end of the reverse k-mer. len is not modified; the k-mer must already be full (asserted).
+  * @param x Base code in the range 0-3 (asserted).
+  * @return The 2-bit code that was sitting in the bottom base position of array1[maxindex] before the shift.
+  */
+ public long addLeftNumeric(long x){
+ assert(x>=0 && x<4) : x;
+ long x2=AminoAcid.numberToComplement[(int)x];
+
+ assert(x>=0);
+ assert(len>=kbig);
+
+ //Forward words are processed from first to last, reverse words from last to first;
+ //the base pushed out of each word is carried into the next word processed.
+ for(int i=0, j=maxindex; i<mult; i++, j--){
+
+ long y=array1[i]&3L; //Bottom base of this forward word, about to be shifted out
+ long y2=(array2[j]>>>shift2)&3L; //Top base of this reverse word, about to be shifted out
+
+ //Update kmers
+ array1[i]=(array1[i]>>>2)|(x<<shift2);
+ array2[j]=((array2[j]<<2)|x2)&mask;
+
+ x=y;
+ x2=y2;
+ }
+ incarnation++;
+ return x;
+ }
+
+ public void fillArray2() {
+ for(int i=maxindex, j=0; j<mult; j++, i--){
+ array2[j]=AminoAcid.reverseComplementBinaryFast(array1[i], k);
+ }
+ len=kbig;
+ incarnation++;
+ }
+
+ public String toString(){
+// update();
+ assert(verify(true));
+ ByteBuilder bb=new ByteBuilder();
+ for(int i=0; i<mult; i++){
+ bb.appendKmer(array1[i], k);
+// bb.append(" ");
+ }
+//// bb.append("~");
+// for(int i=0; i<mult; i++){
+// bb.appendKmer(array2[i], k);
+//// bb.append(" ");
+// }
+ return bb.toString();
+ }
+
+ public boolean equals(Kmer x){
+ if(xor()!=x.xor()){return false;}
+ return AbstractKmerTableU.equals(key(), x.key());
+ }
+
+ /**
+  * Orders two k-mers by their key arrays.
+  * Calls AbstractKmerTableU.compare directly, as equals(Kmer) does with AbstractKmerTableU.equals;
+  * the static Kmer.compare helper starts with assert(false) and so cannot be used when assertions are enabled.
+  * @param x K-mer to compare against.
+  * @return The result of AbstractKmerTableU.compare(key(), x.key()).
+  */
+ public int compareTo(Kmer x){
+     return AbstractKmerTableU.compare(key(), x.key());
+ }
+
+ public int compareTo(long[] key2){
+ assert(false);
+ return compare(key(), key2);
+ }
+
+ public static int compare(long[] key1, long[] key2){
+ assert(false);
+ return AbstractKmerTableU.compare(key1, key2);
+ }
+
+ public static boolean equals(long[] key1, long[] key2){
+ assert(false);
+ return AbstractKmerTableU.equals(key1, key2);
+ }
+
+ public long[] array1(){return array1;}
+
+ public long[] array2(){return array2;}
+
+ /** WARNING!
+ * Do not confuse this with xor()! */
+ public long[] key(){
+ update();
+// assert(verify(false));
+ return key;
+ }
+
+ private void setKey0(){
+ key=array1;
+ for(int i=0; i<mult; i++){
+ if(array1[i]>array2[i]){break;}
+ else if(array1[i]<array2[i]){
+ key=array2;
+ break;
+ }
+ }
+ }
+
+ /**
+  * Hashes a key array: starting from key[0], each further word is folded in by rotating the
+  * running hash left by 25 bits and XORing the word. The result is masked with mask63.
+  * @param key Key array with at least one element.
+  * @return The masked hash.
+  */
+ public static long xor(long[] key){
+     long hash=key[0];
+     int i=1;
+     while(i<key.length){
+         hash=Long.rotateLeft(hash, 25)^key[i];
+         i++;
+     }
+     return hash&mask63;
+ }
+
+ /** WARNING!
+ * Do not confuse this with key()! */
+ public long xor(){
+ update();
+ return lastXor;
+ }
+
+ /**
+ * Reduces this k-mer's hash to a table index.
+ * @param value Modulus (callers in this package pass the table's cell count).
+ * @return xor() modulo value, narrowed to int.
+ */
+ public int mod(int value) {
+ int x=(int)(xor()%value);
+// System.err.println(xor()+"%"+value+"="+x);
+ return x;
+ }
+
+ /**
+  * Swaps the forward and reverse arrays in place.
+  * incarnation is not changed, so the cached key and hash are left as they are.
+  */
+ public void rcomp() {
+ long[] temp=array1;
+ array1=array2;
+ array2=temp;
+ }
+
+ /**
+  * Refreshes the cached key and hash if the k-mer changed since the last refresh.
+  * Change is detected by comparing incarnation with lastIncarnation.
+  * Asserts that the k-mer is full (len>=kbig) unless TESTMODE is set.
+  */
+ private void update(){
+ if(verbose){System.err.println("update() - len="+len);}
+ assert(TESTMODE || len>=kbig) : len+", "+kbig;
+ if(incarnation==lastIncarnation){return;} //Cache is current
+ setKey0();
+ lastXor=xor0();
+ lastIncarnation=incarnation;
+ if(verbose){System.err.println("After update - kmer "+this+"; key="+Arrays.toString(key)+"; a1="+Arrays.toString(array1())+"; a2="+Arrays.toString(array2()));}
+ }
+
+ private long xor0(){
+ return xor(key);
+ }
+
+ public String arraysToString() {
+ return "key="+Arrays.toString(key)+", a1="+Arrays.toString(array1)+", a2="+Arrays.toString(array2);
+ }
+
+ private long lastXor=-1;
+ private long incarnation=0;
+ private long lastIncarnation=-1;
+ private long[] key=null;
+
+ private long[] array1;
+ private long[] array2;
+ public final int kbig;
+ public final int k;
+ final int mult, maxindex;
+
+ private final int shift;
+ private final int shift2;
+ private final long mask;
+
+ public int len=0;
+
+ private static final long mask63=Long.MAX_VALUE;
+ private final static boolean TESTMODE=false; //123
+ private final static boolean verbose=false;
+}
diff --git a/current/ukmer/KmerBufferU.java b/current/ukmer/KmerBufferU.java
new file mode 100755
index 0000000..0ec5d6b
--- /dev/null
+++ b/current/ukmer/KmerBufferU.java
@@ -0,0 +1,67 @@
+package ukmer;
+
+import stream.ByteBuilder;
+import align2.IntList;
+import align2.LongList;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 9, 2015
+ *
+ */
+ public class KmerBufferU {
+
+     /**
+      * @param buflen Number of k-mers to plan for; the key list is constructed with buflen*mult
+      * (presumably its initial capacity - confirm against LongList).
+      * @param kbig Requested total k-mer length; split into mult words of k bases via Kmer.getMult/Kmer.getK.
+      * @param initValues If true a parallel value list is created; otherwise values is null and only
+      * the key-only add methods may be used.
+      */
+     public KmerBufferU(int buflen, int kbig, boolean initValues){
+         k=Kmer.getK(kbig);
+         mult=Kmer.getMult(kbig);
+         kmers=new LongList(buflen*mult);
+         values=(initValues ? new IntList(buflen) : null);
+     }
+
+     /**
+      * Appends the key of kmer (key-only mode).
+      * @return Raw size of the key list after the append, in words.
+      */
+     public int add(Kmer kmer){
+         add(kmer.key());
+         return kmers.size;
+     }
+
+     /** Appends the key of kmer together with a value (value mode). */
+     public void add(Kmer kmer, int value){
+         add(kmer.key(), value);
+     }
+
+     /** Appends a key of exactly mult words; only valid when the buffer has no value list (asserted). */
+     public void add(long[] kmer){
+         assert(values==null);
+         assert(kmer.length==mult) : kmer.length+", "+mult+", "+k;
+         kmers.append(kmer);
+     }
+
+     /** Appends a key of exactly mult words and its value; only valid when the buffer has a value list (asserted). */
+     public void add(long[] kmer, int value){
+         //Mirror of the check in add(long[]): fail with a clear message instead of a bare NullPointerException below.
+         assert(values!=null) : "This buffer was created without a value list.";
+         assert(kmer.length==mult);
+         kmers.append(kmer);
+         values.add(value);
+         assert(values.size*mult==kmers.size);
+     }
+
+     /** Empties the key list and, if present, the value list. */
+     public void clear(){
+         kmers.clear();
+         if(values!=null){values.clear();}
+     }
+
+     //Returns raw size of kmers array, rather than actual number of kmers
+     final int size(){return kmers.size;}
+
+     /** Renders every stored word as a k-base string, comma-separated; words of one key are not grouped. */
+     public String toString(){
+         ByteBuilder bb=new ByteBuilder();
+         for(int i=0; i<kmers.size; i++){
+             if(i>0){bb.append(',');}
+             bb.appendKmer(kmers.get(i), k);
+         }
+         return bb.toString();
+     }
+
+     /** Words per key. */
+     private final int mult;
+     /** Bases per word. */
+     private final int k;
+     /** Flat list of key words, mult per key. */
+     final LongList kmers;
+     /** One value per key, or null in key-only mode. */
+     final IntList values;
+
+ }
diff --git a/current/ukmer/KmerNodeU.java b/current/ukmer/KmerNodeU.java
new file mode 100755
index 0000000..c5d442a
--- /dev/null
+++ b/current/ukmer/KmerNodeU.java
@@ -0,0 +1,387 @@
+package ukmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import stream.ByteBuilder;
+
+import align2.Tools;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 22, 2013
+ *
+ */
+public abstract class KmerNodeU extends AbstractKmerTableU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected KmerNodeU(long[] pivot_){
+ pivot=pivot_.clone();
+ }
+
+ public abstract KmerNodeU makeNode(long[] pivot_, int value_);
+ public abstract KmerNodeU makeNode(long[] pivot_, int[] values_);
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final int increment(Kmer kmer){return increment(kmer.key());}
+
+ public final int increment(long[] kmer){
+ final int cmp=compare(kmer, pivot);
+ if(cmp<0){
+ if(left==null){left=makeNode(kmer, 1); return 1;}
+ return left.increment(kmer);
+ }else if(cmp>0){
+ if(right==null){right=makeNode(kmer, 1); return 1;}
+ return right.increment(kmer);
+ }else{
+ if(value()<Integer.MAX_VALUE){set(value()+1);}
+ return value();
+ }
+ }
+
+ @Override
+ public final int incrementAndReturnNumCreated(Kmer kmer){return incrementAndReturnNumCreated(kmer.key());}
+
+ /**
+  * Increments kmer and reports whether a node was created.
+  * Creation is inferred from the resulting count being exactly 1.
+  * NOTE(review): an existing node whose value was 0 before the call would also be reported as created - confirm whether such nodes can occur.
+  * @return 1 if increment() returned 1, otherwise 0.
+  */
+ public final int incrementAndReturnNumCreated(long[] kmer) {
+ int x=increment(kmer);
+ return x==1 ? 1 : 0;
+ }
+
+ /**
+  * Stores value for kmer in this subtree, attaching a new node where the search ends if none matches.
+  * The single-letter messages printed in verbose mode trace which branch was taken at each level.
+  * @param kmer Key to write.
+  * @param value Value to store.
+  * @return Number of nodes added: 1 if a node was created, 0 if an existing node was overwritten.
+  */
+ public final int set(long[] kmer, int value){
+ if(verbose){System.err.println("Set0: kmer="+Arrays.toString(kmer)+", v="+value+", old="+Arrays.toString(values(new int[1])));}
+ if(verbose){System.err.println("A");}
+ final int cmp=compare(kmer, pivot);
+ if(cmp<0){
+ if(verbose){System.err.println("B");}
+ if(left==null){left=makeNode(kmer, value); return 1;}
+ if(verbose){System.err.println("C");}
+ return left.set(kmer, value);
+ }else if(cmp>0){
+ if(verbose){System.err.println("D");}
+ if(right==null){right=makeNode(kmer, value); return 1;}
+ if(verbose){System.err.println("E");}
+ return right.set(kmer, value);
+ }else{
+ if(verbose){System.err.println("F");}
+ //Key matches this node: overwrite in place.
+ set(value);
+ }
+ if(verbose){System.err.println("G");}
+ return 0;
+ }
+
+
+ /**
+  * Stores value for kmer only if this subtree has no node for it; an existing node is left untouched.
+  * @param kmer Key to write.
+  * @param value Value for the new node.
+  * @return Number of nodes added: 1 if a node was created, 0 if the key was already present.
+  */
+ public final int setIfNotPresent(long[] kmer, int value){
+     //Debug trace: print the key's contents (not the array's identity) and use a 1-element scratch
+     //array, matching set(long[], int); KmerNodeU1D.values() requires length 1.
+     if(verbose){System.err.println("setIfNotPresent0: kmer="+Arrays.toString(kmer)+", v="+value+", old="+Arrays.toString(values(new int[1])));}
+     final int cmp=compare(kmer, pivot);
+     if(cmp<0){
+         if(left==null){left=makeNode(kmer, value); return 1;}
+         return left.setIfNotPresent(kmer, value);
+     }else if(cmp>0){
+         if(right==null){right=makeNode(kmer, value); return 1;}
+         return right.setIfNotPresent(kmer, value);
+     }
+     return 0;
+ }
+
+ public final int getValue(long[] kmer){
+ KmerNodeU n=get(kmer);
+ return n==null ? -1 : n.value();
+ }
+
+ public final int[] getValues(long[] kmer, int[] singleton){
+ KmerNodeU n=get(kmer);
+ return n==null ? null : n.values(singleton);
+ }
+
+ public final boolean contains(long[] kmer){
+ KmerNodeU node=get(kmer);
+ return node!=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNodeU left(){return left;}
+ public KmerNodeU right(){return right;}
+ public long[] pivot(){return pivot;}
+ public int owner(){return owner;}
+
+ public int count(){return value();}
+ protected abstract int value();
+ protected abstract int[] values(int[] singleton);
+ /** Returns new value */
+ public abstract int set(int value_);
+ protected abstract int set(int[] values_);
+
+ /**
+  * Searches this subtree for the node whose pivot equals kmer.
+  * @param kmer Key to search for.
+  * @return The matching node, or null if the search runs off the tree.
+  */
+ @Override
+ final KmerNodeU get(final long[] kmer){
+     KmerNodeU node=this;
+     do{
+         final int cmp=compare(kmer, node.pivot);
+         if(cmp==0){return node;}
+         node=(cmp<0 ? node.left : node.right);
+     }while(node!=null);
+     return null;
+ }
+
+ final KmerNodeU getNodeOrParent(long[] kmer){
+ final int cmp=compare(kmer, pivot);
+ if(cmp==0){return this;}
+ if(cmp<0){return left==null ? this : left.getNodeOrParent(kmer);}
+ return right==null ? this : right.getNodeOrParent(kmer);
+ }
+
+ final boolean insert(KmerNodeU n){
+ assert(pivot!=null);
+ final int cmp=compare(n.pivot, pivot);
+ if(cmp<0){
+ if(left==null){left=n; return true;}
+ return left.insert(n);
+ }else if(cmp>0){
+ if(right==null){right=n; return true;}
+ return right.insert(n);
+ }else{
+ return false;
+ }
+ }
+
+ /**
+  * Appends this subtree to list in the order: left subtree, this node, right subtree.
+  * NOTE: despite the name, this is an in-order walk; rebalance() depends on that order when it
+  * picks the middle element of the list as the new root.
+  */
+ final void traversePrefix(ArrayList<KmerNodeU> list){
+ if(left!=null){left.traversePrefix(list);}
+ list.add(this);
+ if(right!=null){right.traversePrefix(list);}
+ }
+
+ /**
+  * Appends this subtree to list in the order: this node, left subtree, right subtree.
+  * NOTE: despite the name, this is a pre-order walk, so every node appears before its descendants.
+  */
+ final void traverseInfix(ArrayList<KmerNodeU> list){
+ list.add(this);
+ if(left!=null){left.traverseInfix(list);}
+ if(right!=null){right.traverseInfix(list);}
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Counts the nodes in this subtree.
+  * A node whose value is below 1 reports 0 for itself AND its descendants, which are not visited.
+  * @return Number of counted nodes.
+  */
+ @Override
+ public final long size() {
+     if(value()<1){return 0;}
+     final long leftCount=(left==null ? 0 : left.size());
+     final long rightCount=(right==null ? 0 : right.size());
+     return 1+leftCount+rightCount;
+ }
+
+ /**
+  * Rebuilds this subtree from a flattened listing of its nodes.
+  * Subtrees of one or two nodes are returned unchanged.
+  * @param list Empty scratch list (asserted); it is cleared again before returning.
+  * @return The root of the rebuilt subtree.
+  */
+ final KmerNodeU rebalance(ArrayList<KmerNodeU> list){
+     assert(list.isEmpty());
+     traversePrefix(list);
+     final KmerNodeU root=(list.size()>2 ? rebalance(list, 0, list.size()-1) : this);
+     list.clear();
+     return root;
+ }
+
+ /**
+  * Rebuilds the nodes in list[a..b] (inclusive) into a tree and returns its root.
+  * The element at a+size/2 becomes the root; elements before it form the left subtree and elements
+  * after it the right subtree. Child links of every node in the range are overwritten.
+  * @param list Nodes in the order produced by traversePrefix().
+  * @param a First index of the range.
+  * @param b Last index of the range.
+  * @return Root of the rebuilt range.
+  */
+ private static final KmerNodeU rebalance(ArrayList<KmerNodeU> list, int a, int b){
+ final int size=b-a+1;
+ final int middle=a+size/2;
+ final KmerNodeU n=list.get(middle);
+ if(size<4){
+ //Small ranges are wired by hand instead of recursing.
+ if(size==1){
+ n.left=n.right=null;
+ }else if(size==2){
+ //middle==b here, so the other element hangs on the left.
+ KmerNodeU n1=list.get(a);
+ n.left=n1;
+ n.right=null;
+ n1.left=n1.right=null;
+ }else{
+ assert(size==3);
+ KmerNodeU n1=list.get(a), n2=list.get(b);
+ n.left=n1;
+ n.right=n2;
+ n1.left=n1.right=null;
+ n2.left=n2.right=null;
+ }
+ }else{
+ n.left=rebalance(list, a, middle-1);
+ n.right=rebalance(list, middle+1, b);
+ }
+ return n;
+ }
+
+ @Override
+ public long regenerate(){
+ throw new RuntimeException("Not supported.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean dumpKmersAsText(TextStreamWriter tsw, int k, int mincount) {
+ tsw.print(dumpKmersAsText(new StringBuilder(32), k, mincount));
+ return true;
+ }
+
+ protected abstract StringBuilder dumpKmersAsText(StringBuilder sb, int k, int mincount);
+
+ protected abstract ByteBuilder dumpKmersAsText(ByteBuilder bb, int k, int mincount);
+
+ /**
+  * Adds this subtree's counts to a histogram: bin min(value, max) is incremented for each node visited.
+  * A node whose value is below 1 contributes nothing and its descendants are not visited.
+  * @param ca Histogram array, indexed by count.
+  * @param max Highest bin index; larger counts are placed in this bin.
+  */
+ @Override
+ public final void fillHistogram(long[] ca, int max){
+     final int count=value();
+     if(count>=1){
+         ca[Tools.min(count, max)]++;
+         if(left!=null){left.fillHistogram(ca, max);}
+         if(right!=null){right.fillHistogram(ca, max);}
+     }
+ }
+
+ public String toString(){return Arrays.toString(pivot);}
+
+ abstract boolean TWOD();
+ abstract int numValues();
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Ownership ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final void initializeOwnership(){
+ owner=-1;
+ if(left!=null){left.initializeOwnership();}
+ if(right!=null){right.initializeOwnership();}
+ }
+
+ @Override
+ public final void clearOwnership(){initializeOwnership();}
+
+
+ /**
+  * Raises the owner of kmer's node to newOwner if newOwner is larger than the current owner.
+  * The node must exist (asserted). The update itself happens while holding the node's monitor;
+  * the initial comparison and the returned value are read outside it.
+  * @return The node's owner as read after the attempt.
+  */
+ public final int setOwner(final long[] kmer, final int newOwner){
+ KmerNodeU n=get(kmer);
+ assert(n!=null);
+ if(n.owner<=newOwner){
+ synchronized(n){
+ //Re-check under the lock before writing.
+ if(n.owner<newOwner){
+ n.owner=newOwner;
+ }
+ }
+ }
+ return n.owner;
+ }
+
+
+ /**
+  * Resets the owner of kmer's node to -1, but only if it is currently equal to owner.
+  * The node must exist (asserted).
+  * @return True if the owner was cleared, false if the node was owned by a different value.
+  */
+ public final boolean clearOwner(final long[] kmer, final int owner){
+ KmerNodeU n=get(kmer);
+ assert(n!=null);
+ synchronized(n){
+ if(n.owner==owner){
+ n.owner=-1;
+ return true;
+ }
+ }
+ return false;
+ }
+
+
+ /**
+  * Reads the owner of kmer's node without locking. The node must exist (asserted).
+  * @return The node's current owner field.
+  */
+ public final int getOwner(final long[] kmer){
+ KmerNodeU n=get(kmer);
+ assert(n!=null);
+ return n.owner;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ abstract int set(long[] kmer, int[] vals);
+
+ @Override
+ public int set(Kmer kmer, int value) {
+ return set(kmer.key(), value);
+ }
+
+ @Override
+ public int set(Kmer kmer, int[] vals) {
+ return set(kmer.key(), vals);
+ }
+
+ @Override
+ public int setIfNotPresent(Kmer kmer, int value) {
+ return setIfNotPresent(kmer.key(), value);
+ }
+
+ @Override
+ public int getValue(Kmer kmer) {
+ return getValue(kmer.key());
+ }
+
+ @Override
+ public int[] getValues(Kmer kmer, int[] singleton) {
+ return getValues(kmer.key(), singleton);
+ }
+
+ @Override
+ public boolean contains(Kmer kmer) {
+ return contains(kmer.key());
+ }
+
+ @Override
+ public int getValue(long[] key, long xor) {
+ return getValue(key);
+ }
+
+ @Override
+ public int setOwner(Kmer kmer, int newOwner) {
+ return setOwner(kmer.key(), newOwner);
+ }
+
+ @Override
+ public boolean clearOwner(Kmer kmer, int owner) {
+ return clearOwner(kmer.key(), owner);
+ }
+
+ @Override
+ public int getOwner(Kmer kmer) {
+ return getOwner(kmer.key());
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ long xor() {
+ return Kmer.xor(pivot);
+ }
+
+ final long[] pivot;
+ int owner=-1;
+ KmerNodeU left, right;
+
+}
diff --git a/current/ukmer/KmerNodeU1D.java b/current/ukmer/KmerNodeU1D.java
new file mode 100755
index 0000000..216e775
--- /dev/null
+++ b/current/ukmer/KmerNodeU1D.java
@@ -0,0 +1,161 @@
+package ukmer;
+
+import stream.ByteBuilder;
+import fileIO.ByteStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Oct 22, 2013
+ *
+ */
+public class KmerNodeU1D extends KmerNodeU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNodeU1D(long[] pivot_){
+ super(pivot_);
+ }
+
+ public KmerNodeU1D(long[] pivot_, int value_){
+ super(pivot_);
+ value=value_;
+ }
+
+ public final KmerNodeU makeNode(long[] pivot_, int value_){
+ return new KmerNodeU1D(pivot_, value_);
+ }
+
+ public final KmerNodeU makeNode(long[] pivot_, int[] values_){
+ throw new RuntimeException("Unimplemented");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final int set(long[] kmer, int[] vals) {
+ throw new RuntimeException("Unimplemented.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected int value(){return value;}
+
+ protected int[] values(int[] singleton){
+ assert(singleton.length==1);
+ singleton[0]=value;
+ return singleton;
+ }
+
+ public int set(int value_){return value=value_;}
+
+ protected int set(int[] values_){
+ throw new RuntimeException("Unimplemented");
+ }
+
+ int numValues(){return value<1 ? 0 : 1;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {
+ return false;
+ }
+
+ @Override
+ public boolean canRebalance() {
+ return true;
+ }
+
+ @Deprecated
+ @Override
+ public int arrayLength() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ void resize() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance() {
+ throw new RuntimeException("Please call rebalance(ArrayList<KmerNode>) instead, with an empty list.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ if(value<1){return true;}
+ if(value>=mincount){bsw.printlnKmer(pivot, value, k);}
+ if(left!=null){left.dumpKmersAsBytes(bsw, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes(bsw, k, mincount);}
+ return true;
+ }
+
+ @Override
+ public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ if(value<1){return true;}
+ if(value>=mincount){
+ toBytes(pivot, value, k, bb);
+ bb.append('\n');
+ if(bb.length()>=16000){
+ ByteBuilder bb2=new ByteBuilder(bb);
+ synchronized(bsw){bsw.addJob(bb2);}
+ bb.clear();
+ }
+ }
+ if(left!=null){left.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ return true;
+ }
+
+ @Override
+ protected final StringBuilder dumpKmersAsText(StringBuilder sb, int k, int mincount){
+ if(value<1){return sb;}
+ if(sb==null){sb=new StringBuilder(32);}
+ if(value>=mincount){sb.append(AbstractKmerTableU.toText(pivot, value, k)).append('\n');}
+ if(left!=null){left.dumpKmersAsText(sb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(sb, k, mincount);}
+ return sb;
+ }
+
+ @Override
+ protected final ByteBuilder dumpKmersAsText(ByteBuilder bb, int k, int mincount){
+ if(value<1){return bb;}
+ if(bb==null){bb=new ByteBuilder(32);}
+ if(value>=mincount){bb.append(AbstractKmerTableU.toBytes(pivot, value, k)).append('\n');}
+ if(left!=null){left.dumpKmersAsText(bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(bb, k, mincount);}
+ return bb;
+ }
+
+ final boolean TWOD(){return false;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ int value;
+
+}
diff --git a/current/ukmer/KmerNodeU2D.java b/current/ukmer/KmerNodeU2D.java
new file mode 100755
index 0000000..ff17dc6
--- /dev/null
+++ b/current/ukmer/KmerNodeU2D.java
@@ -0,0 +1,237 @@
+package ukmer;
+
+import java.util.Arrays;
+
+import stream.ByteBuilder;
+import fileIO.ByteStreamWriter;
+
+import align2.Tools;
+
+/**
+ * Allows multiple values per kmer.
+ * @author Brian Bushnell
+ * @date Nov 7, 2014
+ *
+ */
+public class KmerNodeU2D extends KmerNodeU {
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Initialization ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public KmerNodeU2D(long[] pivot_){
+ super(pivot_);
+ }
+
+ public KmerNodeU2D(long[] pivot_, int value_){
+ super(pivot_);
+ assert(value_>=0 || value_==-1);
+ values=new int[] {value_, -1};
+ }
+
+ public KmerNodeU2D(long[] pivot_, int[] vals_){
+ super(pivot_);
+ values=vals_;
+ }
+
+ public final KmerNodeU makeNode(long[] pivot_, int value_){
+ return new KmerNodeU2D(pivot_, value_);
+ }
+
+ public final KmerNodeU makeNode(long[] pivot_, int[] values_){
+ return new KmerNodeU2D(pivot_, values_);
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Public Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+// public final int set_Test(final long[] kmer, final int v[]){
+// assert(TESTMODE);
+// final int x;
+// if(TWOD()){
+// int[] old=getValues(kmer, null);
+// assert(old==null || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(old==null || contains(kmer, old));
+// assert(contains(kmer, v));
+// }else{
+// int old=getValue(kmer);
+// assert(old==0 || old==-1 || contains(kmer, old));
+// x=set0(kmer, v);
+// assert(contains(kmer, v)) : "old="+old+", v="+v+", kmer="+kmer+", get(kmer)="+getValue(kmer);
+// assert(v[0]==old || !contains(kmer, old));
+// }
+// return x;
+// }
+
+ /** Returns number of nodes added */
+ @Override
+ public int set(long[] kmer, int vals[]){
+ final int cmp=compare(kmer, pivot);
+ if(cmp<0){
+ if(left==null){left=new KmerNodeU2D(kmer, vals); return 1;}
+ return left.set(kmer, vals);
+ }else if(cmp>0){
+ if(right==null){right=new KmerNodeU2D(kmer, vals); return 1;}
+ return right.set(kmer, vals);
+ }else{
+ insertValue(vals);
+ }
+ return 0;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Nonpublic Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ protected int value(){return values==null ? 0 : values[0];}
+
+ protected int[] values(int[] singleton){
+ return values;
+ }
+
+ public int set(int value_){
+ insertValue(value_);
+ return value_;
+ }
+
+ protected int set(int[] values_){
+ int ret=(values==null ? 1 : 0);
+ insertValue(values_);
+ return ret;
+ }
+
+ int numValues(){
+ if(values==null){return 0;}
+ for(int i=0; i<values.length; i++){
+ if(values[i]==-1){return i;}
+ }
+ return values.length;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Private Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /** Returns number of values added */
+ private int insertValue(int v){
+ for(int i=0; i<values.length; i++){
+ if(values[i]==v){return 0;}
+ if(values[i]==-1){values[i]=v;return 1;}
+ }
+ final int oldSize=values.length;
+ final int newSize=(int)Tools.min(Integer.MAX_VALUE, oldSize*2L);
+ assert(newSize>values.length) : "Overflow.";
+ values=Arrays.copyOf(values, newSize);
+ values[oldSize]=v;
+ Arrays.fill(values, oldSize+1, newSize, -1);
+ return 1;
+ }
+
+ /** Returns number of values added */
+ private int insertValue(int[] vals){
+ if(values==null){
+ values=vals;
+ return 1;
+ }
+ for(int v : vals){
+ if(v<0){break;}
+ insertValue(v);
+ }
+ return 0;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Resizing and Rebalancing ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ boolean canResize() {
+ return false;
+ }
+
+ @Override
+ public boolean canRebalance() {
+ return true;
+ }
+
+ @Deprecated
+ @Override
+ public int arrayLength() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ void resize() {
+ throw new RuntimeException("Unsupported.");
+ }
+
+ @Deprecated
+ @Override
+ public void rebalance() {
+ throw new RuntimeException("Please call rebalance(ArrayList<KmerNode>) instead, with an empty list.");
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Info Dumping ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public final boolean dumpKmersAsBytes(ByteStreamWriter bsw, int k, int mincount){
+ if(values==null){return true;}
+ bsw.printlnKmer(pivot, values, k);
+ if(left!=null){left.dumpKmersAsBytes(bsw, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes(bsw, k, mincount);}
+ return true;
+ }
+
+ @Override
+ public final boolean dumpKmersAsBytes_MT(final ByteStreamWriter bsw, final ByteBuilder bb, final int k, final int mincount){
+ if(values==null){return true;}
+ toBytes(pivot, values, k, bb);
+ bb.append('\n');
+ if(bb.length()>=16000){
+ ByteBuilder bb2=new ByteBuilder(bb);
+ synchronized(bsw){bsw.addJob(bb2);}
+ bb.clear();
+ }
+ if(left!=null){left.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsBytes_MT(bsw, bb, k, mincount);}
+ return true;
+ }
+
+ @Override
+ protected final StringBuilder dumpKmersAsText(StringBuilder sb, int k, int mincount){
+ if(values==null){return sb;}
+ if(sb==null){sb=new StringBuilder(32);}
+ sb.append(AbstractKmerTableU.toText(pivot, values, k)).append('\n');
+ if(left!=null){left.dumpKmersAsText(sb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(sb, k, mincount);}
+ return sb;
+ }
+
+ @Override
+ protected final ByteBuilder dumpKmersAsText(ByteBuilder bb, int k, int mincount){
+ if(values==null){return bb;}
+ if(bb==null){bb=new ByteBuilder(32);}
+ bb.append(AbstractKmerTableU.toBytes(pivot, values, k)).append('\n');
+ if(left!=null){left.dumpKmersAsText(bb, k, mincount);}
+ if(right!=null){right.dumpKmersAsText(bb, k, mincount);}
+ return bb;
+ }
+
+ final boolean TWOD(){return true;}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Invalid Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Fields ----------------*/
+ /*--------------------------------------------------------------*/
+
+ int[] values;
+
+}
diff --git a/current/ukmer/KmerTableSetU.java b/current/ukmer/KmerTableSetU.java
new file mode 100755
index 0000000..b2b16dd
--- /dev/null
+++ b/current/ukmer/KmerTableSetU.java
@@ -0,0 +1,1141 @@
+package ukmer;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import jgi.BBMerge;
+import kmer.AbstractKmerTableSet;
+import kmer.DumpThread;
+import kmer.Primes;
+import stream.ByteBuilder;
+import stream.ConcurrentReadInputStream;
+import stream.FastaReadInputStream;
+import stream.Read;
+import align2.IntList;
+import align2.ListNum;
+import align2.ReadStats;
+import align2.Shared;
+import align2.Tools;
+import align2.TrimRead;
+import bloom.KmerCountAbstract;
+import dna.AminoAcid;
+import dna.Parser;
+import dna.Timer;
+import fileIO.ByteStreamWriter;
+import fileIO.FileFormat;
+import fileIO.ReadWrite;
+
+
+/**
+ * Loads and holds kmers for Tadpole/KmerCountExact
+ * @author Brian Bushnell
+ * @date Jun 22, 2015
+ *
+ */
+public class KmerTableSetU extends AbstractKmerTableSet {
+
+ /**
+ * Code entrance from the command line: builds a KmerTableSetU from args and calls process on it.
+ * @param args Command line arguments
+ */
+ public static void main(String[] args){
+
+ args=Parser.parseConfig(args);
+ if(Parser.parseHelp(args, true)){//Help requested: print the options and exit with status 0
+ printOptions();
+ System.exit(0);
+ }
+
+ Timer t=new Timer(), t2=new Timer();//t is handed to process(); t2 times construction only
+ t.start();
+ t2.start();
+
+ //Create a new KmerTableSetU instance
+ KmerTableSetU set=new KmerTableSetU(args);
+ t2.stop();
+ outstream.println("Initialization Time: \t"+t2);
+
+ //And run it
+ set.process(t);
+ }
+
+ /**
+ * Constructor used by main; delegates to the two-argument constructor with extraBytesPerKmer_=12.
+ * @param args Command line arguments
+ */
+ private KmerTableSetU(String[] args){
+ this(args, 12);//+5 if using ownership and building contigs
+ }
+
+
+ /**
+ * Constructor.
+ * @param args Command line arguments
+ */
+ public KmerTableSetU(String[] args, int extraBytesPerKmer_){
+ System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n");
+
+ /* Initialize local variables with defaults */
+ Parser parser=new Parser();
+ boolean prealloc_=false;
+ int kbig_=62;
+ int ways_=-1;
+ int filterMax_=2;
+ boolean ecco_=false, merge_=false;
+ boolean rcomp_=true;
+ double minProb_=defaultMinprob;
+
+ /* Parse arguments */
+ for(int i=0; i<args.length; i++){
+
+ final String arg=args[i];
+ String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+ while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseCommonStatic(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseQuality(arg, a, b)){
+ //do nothing
+ }else if(Parser.parseFasta(arg, a, b)){
+ //do nothing
+ }else if(parser.parseInterleaved(arg, a, b)){
+ //do nothing
+ }else if(parser.parseTrim(arg, a, b)){
+ //do nothing
+ }else if(a.equals("in") || a.equals("in1")){
+ in1.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in1.add(ss);
+ }
+ }
+ }else if(a.equals("in2")){
+ in2.clear();
+ if(b!=null){
+ String[] s=b.split(",");
+ for(String ss : s){
+ in2.add(ss);
+ }
+ }
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.equals("initialsize")){
+ initialSize=(int)Tools.parseKMG(b);
+ }else if(a.equals("showstats") || a.equals("stats")){
+ showStats=Tools.parseBoolean(b);
+ }else if(a.equals("ways")){
+ ways_=(int)Tools.parseKMG(b);
+ }else if(a.equals("buflen") || a.equals("bufflen") || a.equals("bufferlength")){
+ buflen=(int)Tools.parseKMG(b);
+ }else if(a.equals("k")){
+ assert(b!=null) : "\nk needs an integer value such as k=50. Default is 62.\n";
+ kbig_=(int)Tools.parseKMG(b);
+ }else if(a.equals("threads") || a.equals("t")){
+ THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
+ }else if(a.equals("showspeed") || a.equals("ss")){
+ showSpeed=Tools.parseBoolean(b);
+ }else if(a.equals("ecco")){
+ ecco_=Tools.parseBoolean(b);
+ }else if(a.equals("merge")){
+ merge_=Tools.parseBoolean(b);
+ }else if(a.equals("verbose")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose=Tools.parseBoolean(b);
+ }else if(a.equals("verbose2")){
+// assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
+ verbose2=Tools.parseBoolean(b);
+ }else if(a.equals("minprob")){
+ minProb_=Double.parseDouble(b);
+ }else if(a.equals("minprobprefilter") || a.equals("mpp")){
+ minProbPrefilter=Tools.parseBoolean(b);
+ }else if(a.equals("minprobmain") || a.equals("mpm")){
+ minProbMain=Tools.parseBoolean(b);
+ }else if(a.equals("reads") || a.startsWith("maxreads")){
+ maxReads=Tools.parseKMG(b);
+ }else if(a.equals("prealloc") || a.equals("preallocate")){
+ if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
+ prealloc_=Tools.parseBoolean(b);
+ }else{
+ preallocFraction=Tools.max(0, Double.parseDouble(b));
+ prealloc_=(preallocFraction>0);
+ }
+ }else if(a.equals("prefilter")){
+ if(b==null || b.length()<1 || !Character.isDigit(b.charAt(0))){
+ prefilter=Tools.parseBoolean(b);
+ }else{
+ filterMax_=(int)Tools.parseKMG(b);
+ prefilter=filterMax_>0;
+ }
+ }else if(a.equals("prefiltersize") || a.equals("prefilterfraction") || a.equals("pff")){
+ prefilterFraction=Tools.max(0, Double.parseDouble(b));
+ assert(prefilterFraction<=1) : "prefiltersize must be 0-1, a fraction of total memory.";
+ prefilter=prefilterFraction>0;
+ }else if(a.equals("prehashes") || a.equals("hashes")){
+ prehashes=(int)Tools.parseKMG(b);
+ }else if(a.equals("prefilterpasses") || a.equals("prepasses")){
+ if(b.equalsIgnoreCase("auto")){
+ prepasses=-1;
+ }else{
+ prepasses=(int)Tools.parseKMG(b);
+ }
+ }else if(a.equals("onepass")){
+ onePass=Tools.parseBoolean(b);
+ }else if(a.equals("passes")){
+ int passes=(int)Tools.parseKMG(b);
+ onePass=(passes<2);
+ }else if(a.equals("rcomp")){
+ rcomp_=Tools.parseBoolean(b);
+ }else if(IGNORE_UNKNOWN_ARGS){
+ //Do nothing
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ {//Process parser fields
+ Parser.processQuality();
+
+ qtrimLeft=parser.qtrimLeft;
+ qtrimRight=parser.qtrimRight;
+ trimq=parser.trimq;
+
+ minAvgQuality=parser.minAvgQuality;
+ minAvgQualityBases=parser.minAvgQualityBases;
+ }
+
+ if(prepasses==0 || !prefilter){
+ prepasses=0;
+ prefilter=false;
+ }
+
+ {
+ long memory=Runtime.getRuntime().maxMemory();
+ double xmsRatio=Shared.xmsRatio();
+// long tmemory=Runtime.getRuntime().totalMemory();
+ usableMemory=(long)Tools.max(((memory-96000000)*(xmsRatio>0.97 ? 0.82 : 0.75)), memory*0.45);
+ if(prepasses==0 || !prefilter){
+ filterMemory0=filterMemory1=0;
+ }else{
+ double low=Tools.min(prefilterFraction, 1-prefilterFraction);
+ double high=1-low;
+ if(prepasses<0 || (prepasses&1)==1){//odd passes
+ filterMemory0=(long)(usableMemory*low);
+ filterMemory1=(long)(usableMemory*high);
+ }else{//even passes
+ filterMemory0=(long)(usableMemory*high);
+ filterMemory1=(long)(usableMemory*low);
+ }
+ }
+ tableMemory=(long)(usableMemory*.95-filterMemory0);
+ }
+
+ mult=Kmer.getMult(kbig_);
+ k=Kmer.getK(kbig_);
+ kbig=k*mult;
+ kbig2=kbig-1;
+ assert(k<=31);
+
+ prealloc=prealloc_;
+ bytesPerKmer=4+8*mult+extraBytesPerKmer_;
+ assert(bytesPerKmer>=4+8*mult) : bytesPerKmer+", "+mult+", "+k+", "+kbig+", "+(4+8*mult);
+ if(ways_<1){
+ long maxKmers=(2*tableMemory)/bytesPerKmer;
+ long minWays=Tools.min(10000, maxKmers/Integer.MAX_VALUE);
+ ways_=(int)Tools.max(31, (int)(THREADS*2.5), minWays);
+ ways_=(int)Primes.primeAtLeast(ways_);
+ assert(ways_>0);
+// System.err.println("ways="+ways_);
+ }
+// assert(false) : extraBytesPerKmer_+bytesPerKmer+", "+mult+", "+k+", "+kbig+", "+(4+8*mult)+", "+ways_;
+
+ /* Set final variables; post-process and validate argument combinations */
+
+ onePass=onePass&prefilter;
+ ways=ways_;
+ filterMax=Tools.min(filterMax_, 0x7FFFFFFF);
+ ecco=ecco_;
+ merge=merge_;
+ minProb=(float)minProb_;
+ rcomp=rcomp_;
+ estimatedKmerCapacity=(long)((tableMemory*1.0/bytesPerKmer)*((prealloc && preallocFraction==1) ? 0.9 : 0.6));
+ KmerCountAbstract.minProb=(minProbPrefilter ? minProb : 0);
+
+ if(kbig!=kbig_){
+ System.err.println("K was changed from "+kbig_+" to "+kbig);
+ }
+
+ if(k<1){throw new RuntimeException("\nk needs an integer value above 0, such as k=27. Default is 62.\n");}
+
+ if(initialSize<1){
+ final long memOverWays=tableMemory/(bytesPerKmer*ways);
+ final double mem2=(prealloc ? preallocFraction : 1)*tableMemory;
+ initialSize=(prealloc || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(bytesPerKmer*ways))) : initialSizeDefault);
+ if(initialSize!=initialSizeDefault){
+ System.err.println("Initial size set to "+initialSize);
+ }
+ }
+
+ /* Adjust I/O settings and filenames */
+
+ assert(FastaReadInputStream.settingsOK());
+
+ if(in1.isEmpty()){
+ printOptions();
+ throw new RuntimeException("Error - at least one input file is required.");
+ }
+
+ for(int i=0; i<in1.size(); i++){
+ String s=in1.get(i);
+ if(s!=null && s.contains("#") && !new File(s).exists()){
+ int pound=s.lastIndexOf('#');
+ String a=s.substring(0, pound);
+ String b=s.substring(pound+1);
+ in1.set(i, a+1+b);
+ in2.add(a+2+b);
+ }
+ }
+
+ {
+ boolean allowDuplicates=true;
+ if(!Tools.testInputFiles(allowDuplicates, true, in1, in2)){
+ throw new RuntimeException("\nCan't read to some input files.\n");
+ }
+ }
+ assert(THREADS>0);
+
+ if(DISPLAY_PROGRESS){
+ outstream.println("Initial:");
+ outstream.println("Ways="+ways+", initialSize="+initialSize+", prefilter="+(prefilter ? "t" : "f")+", prealloc="+(prealloc ? (""+preallocFraction) : "f"));
+ Shared.printMemory();
+ outstream.println();
+ }
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Outer Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ public void clear(){//Drops the reference to the table array; no other field is reset here
+ tables=null;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ @Override
+ protected void allocateTables(){//Creates the table array via AbstractKmerTableU.preallocate, using the ARRAY1D table type
+ assert(tables==null);//Only checked when assertions are enabled
+ tables=null;
+ final int tableType=AbstractKmerTableU.ARRAY1D;
+ tables=AbstractKmerTableU.preallocate(ways, tableType, initialSize, kbig, (!prealloc || preallocFraction<1));//Last argument is false only when preallocating with preallocFraction>=1; its meaning is defined by preallocate - confirm there
+ }
+
+ /**
+ * Load reads from fname1 and fname2 into tables using THREADS LoadThreads; returns the sum of the threads' added counters.
+ */
+ public long loadKmers(String fname1, String fname2){
+
+ /* Create read input stream */
+ final ConcurrentReadInputStream cris;
+ {
+ FileFormat ff1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true);
+ FileFormat ff2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true);
+ cris=ConcurrentReadInputStream.getReadInputStream(maxReads, false, ff1, ff2);
+ cris.start(); //4567
+ }
+
+ /* Create LoadThreads; all of them read from the same cris */
+ ArrayList<LoadThread> alpt=new ArrayList<LoadThread>(THREADS);
+ for(int i=0; i<THREADS; i++){alpt.add(new LoadThread(cris));}
+ for(LoadThread pt : alpt){pt.start();}
+
+ long added=0;
+
+ /* Wait for threads to die, and gather statistics */
+ for(LoadThread pt : alpt){
+ while(pt.getState()!=Thread.State.TERMINATED){//Retry join() until the thread has actually terminated
+ try {
+ pt.join();
+ } catch (InterruptedException e) {
+ //An interrupt is only logged; the enclosing loop then calls join() again
+ e.printStackTrace();
+ }
+ }
+ added+=pt.added;
+
+ readsIn+=pt.readsInT;//Per-thread counters are folded into this object's totals only after the thread has terminated
+ basesIn+=pt.basesInT;
+ lowqReads+=pt.lowqReadsT;
+ lowqBases+=pt.lowqBasesT;
+ readsTrimmed+=pt.readsTrimmedT;
+ basesTrimmed+=pt.basesTrimmedT;
+ }
+
+ /* Shut down I/O streams; capture error status */
+ errorState|=ReadWrite.closeStreams(cris);
+ return added;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Inner Classes ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+ * Loads kmers.
+ * Each thread pulls read lists from the shared input stream, filters and trims the reads,
+ * optionally merges or error-corrects pairs, and adds the kmers through its own HashBufferU.
+ */
+ private class LoadThread extends Thread{
+
+ /**
+ * Constructor
+ * @param cris_ Read input stream
+ */
+ public LoadThread(ConcurrentReadInputStream cris_){
+ cris=cris_;
+ table=new HashBufferU(tables, buflen, kbig, false);
+ kmer=new Kmer(k, mult);
+ }
+
+ /** Processes read lists until the stream returns a null or empty list, then flushes the buffer. */
+ @Override
+ public void run(){
+ ListNum<Read> ln=cris.nextList();
+ ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+ //While there are more reads lists...
+ while(reads!=null && reads.size()>0){
+
+ //For each read (or pair) in the list...
+ for(int i=0; i<reads.size(); i++){
+ Read r1=reads.get(i);
+ Read r2=r1.mate;
+
+ if(!r1.validated()){r1.validate(true);}
+ if(r2!=null && !r2.validated()){r2.validate(true);}
+
+ if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));}
+
+ readsInT++;
+ basesInT+=r1.length();
+ if(r2!=null){
+ readsInT++;
+ basesInT+=r2.length();
+ }
+
+ //Determine whether to discard the reads based on average quality
+ if(minAvgQuality>0){
+ if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);}
+ if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);}
+ }
+
+ //Quality-trim if requested, then discard reads shorter than kbig
+ if(r1!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r1.length()<kbig){r1.setDiscarded(true);}
+ }
+ if(r2!=null){
+ if(qtrimLeft || qtrimRight){
+ int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, 1);
+ basesTrimmedT+=x;
+ readsTrimmedT+=(x>0 ? 1 : 0);
+ }
+ if(r2.length()<kbig){r2.setDiscarded(true);}
+ }
+
+ //merge takes precedence over ecco; a successful merge replaces r1 with the joined read and drops r2
+ if((ecco || merge) && r1!=null && r2!=null && !r1.discarded() && !r2.discarded()){
+ if(merge){
+ final int insert=BBMerge.findOverlapStrict(r1, r2, false);
+ if(insert>0){
+ r2.reverseComplement();
+ r1=r1.joinRead(insert);
+ r2=null;
+ }
+ }else if(ecco){
+ BBMerge.findOverlapStrict(r1, r2, true);
+ }
+ }
+
+ if(r1!=null){
+ if(r1.discarded()){
+ lowqBasesT+=r1.length();
+ lowqReadsT++;
+ }else{
+ long temp=addKmersToTable(r1, kmer);
+ added+=temp;
+ if(verbose){System.err.println("A: Added "+temp);}
+ }
+ }
+ if(r2!=null){
+ if(r2.discarded()){
+ lowqBasesT+=r2.length();
+ lowqReadsT++;
+ }else{
+ long temp=addKmersToTable(r2, kmer);
+ added+=temp;
+ if(verbose){System.err.println("B: Added "+temp);}
+ }
+ }
+ }
+
+ //Fetch a new read list
+ cris.returnList(ln.id, ln.list.isEmpty());
+ ln=cris.nextList();
+ reads=(ln!=null ? ln.list : null);
+ }
+ //The loop exits when ln or ln.list is null or the list is empty, so guard both before returning the last list
+ if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
+ long temp=table.flush();
+ if(verbose){System.err.println("Flush: Added "+temp);}
+ added+=temp;
+ }
+
+
+ /**
+ * Adds the kmers of r to the table, delegating to the one-pass variant when onePass is set.
+ * A kmer is added once len reaches kbig and prob is at least minProb2, and,
+ * when prefilter is on, only if the prefilter count for kmer.xor() exceeds filterMax2.
+ * @param r Read to process
+ * @param kmer Scratch Kmer; cleared before use
+ * @return Sum of the values returned by table.incrementAndReturnNumCreated; 0 for a null or too-short read
+ */
+ private final int addKmersToTable(final Read r, Kmer kmer){
+ if(onePass){return addKmersToTable_onePass(r, kmer);}
+ if(r==null || r.bases==null){return 0;}
+ final float minProb2=(minProbMain ? minProb : 0);
+ final byte[] bases=r.bases;
+ final byte[] quals=r.quality;
+ int created=0;
+ int len=0;
+
+ //Return 0, not -1: run() adds this result into its running total, and a merged read can be shorter than kbig
+ if(bases==null || bases.length<kbig){return 0;}
+ kmer.clear();
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ float prob=1;
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+// assert(x>=0) : ((char)b)+", "+x+", "+new String(bases);
+
+ //Update kmers
+// kmer.addRight(b);
+ kmer.addRightNumeric(x);
+
+ if(minProb2>0 && quals!=null){//Update probability
+ prob=prob*PROB_CORRECT[quals[i]];
+ //NOTE(review): len has not been incremented for this base yet, so the oldest quality only leaves the product once len>kbig - confirm the window is meant to span kbig+1 bases
+ if(len>kbig){
+ byte oldq=quals[i-kbig];
+ prob=prob*PROB_CORRECT_INVERSE[oldq];
+ }
+ }
+
+ //Handle Ns
+ if(x<0){
+ len=0;
+ prob=1;
+ }else{len++;}
+
+ assert(len==kmer.len);
+
+// if(verbose){System.err.println("A: Scanning i="+i+", len="+len+", kmer="+kmer.toString()+"\t"+new String(bases, Tools.max(0, i-kbig2), Tools.min(i+1, kbig)));}
+ if(len>=kbig && prob>=minProb2){
+// System.err.println("kmer="+kmer+"; xor()="+kmer.xor()+"; filterMax2="+filterMax2+"; prefilter="+prefilter);
+// System.err.println("prefilterArray.read(xor.key())="+prefilterArray.read(kmer.xor())+"");
+// System.err.println("prefilterArray.read(kmer.key())="+prefilterArray.read(kmer.key())+"");
+ if(!prefilter || prefilterArray.read(kmer.xor())>filterMax2){
+ int temp=table.incrementAndReturnNumCreated(kmer);
+// System.err.println("kmer="+kmer+"; xor()="+kmer.xor()+"; temp="+temp+" ");
+ created+=temp;
+ if(verbose){System.err.println("C: Added "+temp);}
+ }
+ }
+ }
+
+ return created;
+ }
+
+
+ /**
+ * One-pass variant: increments the prefilter for every qualifying kmer and adds the kmer
+ * to the table once the count read before the increment has reached filterMax2.
+ * @param r Read to process
+ * @param kmer Scratch Kmer; cleared before use
+ * @return Sum of the values returned by table.incrementAndReturnNumCreated; 0 for a null or too-short read
+ */
+ private final int addKmersToTable_onePass(final Read r, Kmer kmer){
+ assert(prefilter);
+ if(r==null || r.bases==null){return 0;}
+ final byte[] bases=r.bases;
+ final byte[] quals=r.quality;
+ int created=0;
+ int len=0;
+
+ //Return 0, not -1: the result is added into the caller's running total
+ if(bases==null || bases.length<kbig){return 0;}
+ kmer.clear();
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ float prob=1;
+ for(int i=0; i<bases.length; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+
+ //Update kmers
+ kmer.addRight(b);
+
+ if(minProb>0 && quals!=null){//Update probability
+ prob=prob*PROB_CORRECT[quals[i]];
+ if(len>kbig){
+ byte oldq=quals[i-kbig];
+ prob=prob*PROB_CORRECT_INVERSE[oldq];
+ }
+ }
+
+ //Handle Ns
+ if(x<0){
+ len=0;
+ prob=1;
+ }else{len++;}
+
+ assert(len==kmer.len);
+
+ if(verbose){System.err.println("B: Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-kbig2), Tools.min(i+1, kbig)));}
+ if(len>=kbig && prob>=minProb){
+ final long xor=kmer.xor();
+ int count=prefilterArray.incrementAndReturnUnincremented(xor, 1);
+ if(count>=filterMax2){
+ int temp=table.incrementAndReturnNumCreated(kmer);
+ created+=temp;
+ if(verbose){System.err.println("D: Added "+temp);}
+ }
+ }
+ }
+ return created;
+ }
+
+ /*--------------------------------------------------------------*/
+
+ /** Input read stream */
+ private final ConcurrentReadInputStream cris;
+
+ /** Per-thread buffer in front of the shared tables */
+ private final HashBufferU table;
+
+ /** Running total of the values returned by addKmersToTable and table.flush() */
+ public long added=0;
+
+ private long readsInT=0;
+ private long basesInT=0;
+ private long lowqReadsT=0;
+ private long lowqBasesT=0;
+ private long readsTrimmedT=0;
+ private long basesTrimmedT=0;
+ /** Scratch kmer reused for every read handled by this thread */
+ private final Kmer kmer;
+
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Convenience ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public void regenerateCounts(byte[] bases, IntList counts, final int a, final Kmer kmer){//Recomputes counts[j] for j from a+1 up to (but not including) lim; counts[a] itself is not written
+ final int loc=a+kbig;//Index of the first base after the initial kmer window
+ final int lim=Tools.min(counts.size, a+kbig+1);//Exclusive upper bound on j, so at most kbig entries are rewritten
+ int len=0;
+ kmer.clear();
+
+ //Generate initial kmer from bases[a] through bases[loc-1]; no count is stored for it
+ for(int i=a; i<loc; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+
+ kmer.addRight(b);
+
+ if(x<0){//A base that maps to a negative number restarts the valid run
+ len=0;
+ }else{len++;}
+ assert(len==kmer.len);
+ }
+ assert(len==kbig || Tools.indexOf(bases, (byte)'N')>=a) : new String(bases)+"\n"+a+", "+len;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=loc, j=a+1; j<lim; i++, j++){//i is the base being appended, j is the counts index being rewritten
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+
+ kmer.addRight(b);
+
+ if(x<0){len=0;}
+ else{len++;}
+ assert(len==kmer.len);
+
+ if(len>=kbig){//Full-length valid kmer: store its count from getCount
+ int count=getCount(kmer);
+ counts.set(j, count);
+ }else{//Window still contains an invalid base: store 0
+ counts.set(j, 0);
+ }
+ }
+ }
+
+ public int fillCounts(byte[] bases, IntList counts, final Kmer kmer){//Clears counts, then appends one entry per kmer start position; returns how many positions had a full-length valid kmer
+ counts.clear();
+
+ {
+ Kmer x=leftmostKmer(bases, bases.length, kmer);//Loads the first kbig bases into kmer
+ assert((x!=null)==(kmer.len==kbig));
+ }
+ int len=kmer.len;
+ int valid=0;
+ if(len>=kbig){//First position: store its count, or 0 if the first window is not a valid kmer
+ valid++;
+ int count=getCount(kmer);
+ counts.add(count);
+ }else{
+ counts.add(0);
+ }
+ assert(kmer.len==len);
+ assert(len<=kbig) : len+", "+kbig;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=kbig; i<bases.length; i++){//Each further base adds exactly one entry to counts
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+
+ kmer.addRight(b);
+
+ if(x<0){len=0;}//A base that maps to a negative number restarts the valid run
+ else{len++;}
+ assert(len==kmer.len);
+
+ if(len>=kbig){
+ valid++;
+ int count=getCount(kmer);
+ counts.add(count);
+ }else{
+ counts.add(0);
+ }
+ }
+ return valid;
+ }
+
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Helper Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public long regenerate(){
+ //Call regenerate() on every table in array order and total the returned values.
+ final AbstractKmerTableU[] all=tables;
+ long total=0;
+ for(int t=0; t<all.length; t++){
+ total+=all[t].regenerate();
+ }
+ return total;
+ }
+
+ /** Returns the table selected by kmer.mod(ways), cast to HashArrayU1D. */
+ public HashArrayU1D getTable(Kmer kmer){
+     final int way=kmer.mod(ways);
+     return (HashArrayU1D) tables[way];
+ }
+
+ /** Returns tables[tnum], cast to HashArrayU1D. */
+ public HashArrayU1D getTable(int tnum){
+     final AbstractKmerTableU selected=tables[tnum];
+     return (HashArrayU1D) selected;
+ }
+
+ /**
+  * Allocates a histogram of histMax+1 bins and lets every table add to it.
+  * @param histMax Highest bin index
+  * @return The filled histogram array
+  */
+ @Override
+ public long[] fillHistogram(int histMax) {
+     final long[] histogram=new long[histMax+1];
+     final AbstractKmerTableU[] all=tables;
+     for(int t=0; t<all.length; t++){
+         all[t].fillHistogram(histogram, histMax);
+     }
+     return histogram;
+ }
+
+ /** Calls initializeOwnership() on every table. */
+ public void initializeOwnership(){
+     final AbstractKmerTableU[] all=tables;
+     for(int t=0; t<all.length; t++){
+         all[t].initializeOwnership();
+     }
+ }
+
+ /** Calls clearOwnership() on every table. */
+ public void clearOwnership(){
+     final AbstractKmerTableU[] all=tables;
+     for(int t=0; t<all.length; t++){
+         all[t].clearOwnership();
+     }
+ }
+
+ /** Convenience overload: applies rightmostKmer to the builder's backing array and length. */
+ public Kmer rightmostKmer(final ByteBuilder bb, Kmer kmer){
+     final byte[] array=bb.array;
+     final int used=bb.length();
+     return rightmostKmer(array, used, kmer);
+ }
+
+ public Kmer rightmostKmer(final byte[] bases, final int blen, final Kmer kmer){
+ kmer.clear();
+ if(blen<kbig){return null;}
+ int len=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
+ {
+ for(int i=blen-kbig; i<blen; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ kmer.addRight(b);
+
+ if(x<0){len=0;}
+ else{len++;}
+ assert(len==kmer.len);
+
+ //if(verbose){outstream.println("C: Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-kbig2), Tools.min(i+1, kbig)));}
+ }
+ }
+
+ if(len<kbig){return null;}
+ else{assert(len==kbig);}
+ return kmer;
+ }
+
+ /** Convenience overload: applies leftmostKmer to the builder's backing array and length. */
+ public Kmer leftmostKmer(final ByteBuilder bb, final Kmer kmer){
+     final byte[] array=bb.array;
+     final int used=bb.length();
+     return leftmostKmer(array, used, kmer);
+ }
+
+ public Kmer leftmostKmer(final byte[] bases, final int blen, final Kmer kmer){
+ kmer.clear();
+ if(blen<kbig){return null;}
+ int len=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
+ {
+ for(int i=0; i<kbig; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ kmer.addRight(b);
+
+ if(x<0){len=0;}
+ else{len++;}
+ assert(len==kmer.len);
+
+ if(verbose){outstream.println("D: Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-kbig2), Tools.min(i+1, kbig)));}
+ }
+ }
+
+ if(len<kbig){return null;}
+ else{assert(len==kbig);}
+ return kmer;
+ }
+
+ /** Convenience overload: applies doubleClaim to the builder's backing array and length. */
+ public boolean doubleClaim(final ByteBuilder bb, final int id, Kmer kmer){
+     final byte[] array=bb.array;
+     final int used=bb.length();
+     return doubleClaim(array, used, id, kmer);
+ }
+
+ /**
+  * Ensures there can be only one owner, by claiming twice: first under id,
+  * then, only if that succeeded, under id+CLAIM_OFFSET. Both passes exit early.
+  * @return true only if both claim passes succeed
+  */
+ public boolean doubleClaim(final byte[] bases, final int blength, final int id, Kmer kmer){
+     final boolean firstPass=claim(bases, blength, id, true, kmer);
+     if(verbose){outstream.println("success1="+firstPass+", id="+id+", blength="+blength);}
+     if(firstPass){
+         final boolean secondPass=claim(bases, blength, id+CLAIM_OFFSET, true, kmer);
+         if(verbose){outstream.println("success2="+secondPass+", id="+id+", blength="+blength);}
+         return secondPass;
+     }
+     return false;
+ }
+
+ /** Convenience overload: applies claim to the builder's backing array and length. */
+ public boolean claim(final ByteBuilder bb, final int id, final boolean exitEarly, Kmer kmer){
+     final byte[] array=bb.array;
+     final int used=bb.length();
+     return claim(array, used, id, exitEarly, kmer);
+ }
+
+ public float calcCoverage(final byte[] bases, final int blen, final Kmer kmer){
+ if(blen<kbig){return 0;}
+ int len=0;
+ kmer.clear();
+ long sum=0;
+ int kmers=0;
+
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts, to get the rightmost kmer */
+ for(int i=0; i<blen; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ kmer.addRight(b);
+
+ if(x<0){len=0;}
+ else{len++;}
+ assert(len==kmer.len);
+
+ if(len>=kbig){
+ int count=getCount(kmer);
+ sum+=count;
+ kmers++;
+ }
+ }
+ return sum==0 ? 0 : sum/(float)kmers;
+ }
+
+ public boolean claim(final byte[] bases, final int blen, final int id, boolean exitEarly, final Kmer kmer){
+ if(blen<kbig){return false;}
+ if(verbose){outstream.println("Thread "+id+" claim start.");}
+ int len=0;
+ kmer.clear();
+ boolean success=true;
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<blen && success; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ kmer.addRight(b);
+
+ if(x<0){len=0;}
+ else{len++;}
+ assert(len==kmer.len);
+
+ if(len>=kbig){
+ success=claim(kmer, id/*, rid, i*/);
+ success=(success || !exitEarly);
+ }
+ }
+ return success;
+ }
+
+ /**
+  * Attempts to set id as the owner of a single kmer.
+  * @return true if the table reports a negative value for the kmer (nothing to
+  * claim), or if setOwner reports id as the owner; false otherwise
+  */
+ public boolean claim(Kmer kmer, final int id/*, final long rid, final int pos*/){
+     final AbstractKmerTableU table=tables[kmer.mod(ways)];
+     final int stored=table.getValue(kmer);
+     assert(stored==-1 || stored>0) : stored;
+     if(stored<0){return true;}
+     assert(stored>0) : stored;
+
+     final int owner=table.setOwner(kmer, id);
+     if(verbose){outstream.println("owner="+owner+".");}
+     return owner==id;
+ }
+
+ /** Convenience overload: applies release to the builder's backing array and length. */
+ public void release(ByteBuilder bb, final int id, final Kmer kmer){
+     final byte[] array=bb.array;
+     final int used=bb.length();
+     release(array, used, id, kmer);
+ }
+
+ public void release(final byte[] bases, final int blen, final int id, final Kmer kmer){
+ if(verbose /*|| true*/){outstream.println("*Thread "+id+" release start.");}
+ int len=0;
+ kmer.clear();
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<blen; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ kmer.addRight(b);
+
+ if(x<0){len=0;}
+ else{len++;}
+ assert(len==kmer.len);
+
+ if(len>=kbig){
+ release(kmer, id);
+ }
+ }
+ }
+
+ /**
+  * Clears id's ownership of a single kmer.
+  * @return true without touching the table when the stored value is below 1;
+  * otherwise the result of clearOwner
+  */
+ public boolean release(Kmer kmer, final int id){
+     final AbstractKmerTableU table=tables[kmer.mod(ways)];
+     final int stored=table.getValue(kmer);
+     return stored<1 || table.clearOwner(kmer, id);
+ }
+
+ /** Convenience overload: applies findOwner to the builder's backing array and length. */
+ public int findOwner(ByteBuilder bb, final int id, final Kmer kmer){
+     final byte[] array=bb.array;
+     final int used=bb.length();
+     return findOwner(array, used, id, kmer);
+ }
+
+ public int findOwner(final byte[] bases, final int blen, final int id, final Kmer kmer){
+ int len=0;
+ kmer.clear();
+ int maxOwner=-1;
+ /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */
+ for(int i=0; i<blen; i++){
+ final byte b=bases[i];
+ final long x=AminoAcid.baseToNumber[b];
+ kmer.addRight(b);
+
+ if(x<0){len=0;}
+ else{len++;}
+ assert(len==kmer.len);
+ //if(verbose){System.err.println("E: Scanning i="+i+", len="+len+", kmer="+kmer+"\t"+new String(bases, Tools.max(0, i-kbig2), Tools.min(i+1, kbig)));}
+ if(len>=kbig){
+ int owner=findOwner(kmer);
+ maxOwner=Tools.max(owner, maxOwner);
+ if(maxOwner>id){break;}
+ }
+ }
+ return maxOwner;
+ }
+
+ /**
+  * Looks up the owner of a single kmer.
+  * @return -1 when the table reports a negative value for the kmer; otherwise getOwner's result
+  */
+ public int findOwner(final Kmer kmer){
+     final AbstractKmerTableU table=tables[kmer.mod(ways)];
+     final int stored=table.getValue(kmer);
+     return (stored<0 ? -1 : table.getOwner(kmer));
+ }
+
+ /** Returns getValue(kmer) from the table selected by kmer.mod(ways). */
+ public int getCount(Kmer kmer){
+     final AbstractKmerTableU table=tables[kmer.mod(ways)];
+     return table.getValue(kmer);
+ }
+
+ /**
+  * Looks up the count of each of the four kmers produced by adding symbol
+  * 0..3 on the right, restoring the kmer after each trial by adding the
+  * displaced value back on the left.
+  * @param kmer Full-length kmer; temporarily modified
+  * @param counts Output; counts[i] receives the count for symbol i, with negative values stored as 0
+  * @return Index of the first symbol holding the largest count
+  */
+ public int fillRightCounts(Kmer kmer, int[] counts){
+     assert(kmer.len>=kbig);
+     if(verbose){outstream.println("fillRightCounts: "+kmer);}
+     int best=-1;
+     int bestSymbol=0;
+
+     for(int symbol=0; symbol<4; symbol++){
+         final long displaced=kmer.addRightNumeric(symbol);
+         if(verbose){outstream.println("kmer: "+kmer);}
+
+         final int raw=tables[kmer.mod(ways)].getValue(kmer);
+         assert(raw==NOT_PRESENT || raw>=0);
+         final int clamped=Tools.max(raw, 0);
+         counts[symbol]=clamped;
+         if(clamped>best){
+             best=clamped;
+             bestSymbol=symbol;
+         }
+
+         //Undo the trial extension before trying the next symbol
+         kmer.addLeftNumeric(displaced);
+     }
+     return bestSymbol;
+ }
+
+ /**
+  * Looks up the count of each of the four kmers produced by adding symbol
+  * 0..3 on the left, restoring the kmer after each trial by adding the
+  * displaced value back on the right.
+  * @param kmer Full-length kmer; temporarily modified
+  * @param counts Output; counts[i] receives the count for symbol i, with negative values stored as 0
+  * @return Index of the first symbol holding the largest count
+  */
+ public int fillLeftCounts(final Kmer kmer, int[] counts){
+     assert(kmer.len>=kbig);
+     if(verbose){outstream.println("fillLeftCounts: "+kmer);}
+     int best=-1;
+     int bestSymbol=0;
+
+     for(int i=0; i<4; i++){
+         if(verbose){
+             outstream.println("kmer: "+kmer+" (key==array1 ? "+(kmer.key()==kmer.array1()));
+         }
+         final long old=kmer.addLeftNumeric(i);
+         if(verbose){
+             outstream.println("after: "+kmer+" (key==array1 ? "+(kmer.key()==kmer.array1()));
+             outstream.println("i="+i+", old="+old);
+         }
+
+         final int raw=tables[kmer.mod(ways)].getValue(kmer);
+         assert(raw==NOT_PRESENT || raw>=0);
+         final int clamped=Tools.max(raw, 0);
+         counts[i]=clamped;
+         if(clamped>best){
+             best=clamped;
+             bestSymbol=i;
+         }
+
+         //Undo the trial extension before trying the next symbol
+         kmer.addRightNumeric(old);
+         if(verbose){outstream.println("restored: "+kmer);}
+     }
+     return bestSymbol;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Printing Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ /**
+  * Writes the kmers of every table to fname through one ByteStreamWriter, one table after another.
+  * @param fname Output file; nothing is written when null
+  * @param minToDump Passed through to each table's dumpKmersAsBytes
+  * @param printTime If true, print the elapsed time to outstream
+  * @return false when fname is null; otherwise the writer's errorState
+  */
+ public boolean dumpKmersAsBytes(String fname, int minToDump, boolean printTime){
+     if(fname==null){return false;}
+     final Timer timer=new Timer();
+
+     final ByteStreamWriter writer=new ByteStreamWriter(fname, overwrite, false, true);
+     writer.start();
+     final AbstractKmerTableU[] all=tables;
+     for(int t=0; t<all.length; t++){
+         all[t].dumpKmersAsBytes(writer, k, minToDump);
+     }
+     writer.poisonAndWait();
+
+     timer.stop();
+     if(printTime){outstream.println("Kmer Dump Time: \t"+timer);}
+     return writer.errorState;
+ }
+
+ /**
+  * Variant of dumpKmersAsBytes that hands the tables to DumpThreadU.dump.
+  * Delegates to dumpKmersAsBytes when min(Shared.threads(), tables.length) is
+  * below 3 or DumpThread.NUM_THREADS is 1.
+  * @param fname Output file; nothing is written when null
+  * @param minToDump Passed through to the dump routine
+  * @param printTime If true, print the elapsed time to outstream
+  * @return false when fname is null; otherwise the writer's errorState (or the delegate's result)
+  */
+ public boolean dumpKmersAsBytes_MT(String fname, int minToDump, boolean printTime){
+
+     final int usable=Tools.min(Shared.threads(), tables.length);
+     final boolean singleThreaded=(usable<3 || DumpThread.NUM_THREADS==1);
+     if(singleThreaded){
+         return dumpKmersAsBytes(fname, minToDump, printTime);
+     }
+
+     if(fname==null){return false;}
+     final Timer timer=new Timer();
+
+     final ByteStreamWriter writer=new ByteStreamWriter(fname, overwrite, false, true);
+     writer.start();
+     DumpThreadU.dump(k, minToDump, tables, writer);
+     writer.poisonAndWait();
+
+     timer.stop();
+     if(printTime){outstream.println("Kmer Dump Time: \t"+timer);}
+     return writer.errorState;
+ }
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Recall Methods ----------------*/
+ /*--------------------------------------------------------------*/
+
+ private final StringBuilder toText(long[] kmer){return AbstractKmerTableU.toText(kmer, k);}
+
+ /*--------------------------------------------------------------*/
+ /*---------------- Final Primitives ----------------*/
+ /*--------------------------------------------------------------*/
+
+ public int kbig(){return kbig;}
+ public long filterMemory(int pass){return ((pass&1)==0) ? filterMemory0 : filterMemory1;}
+ public boolean ecco(){return ecco;}
+ public boolean qtrimLeft(){return qtrimLeft;}
+ public boolean qtrimRight(){return qtrimRight;}
+ public byte minAvgQuality(){return minAvgQuality;}
+ public long tableMemory(){return tableMemory;}
+ public long estimatedKmerCapacity(){return estimatedKmerCapacity;}
+
+ /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in tables[Y] */
+ private AbstractKmerTableU[] tables;
+
+ private final int bytesPerKmer;
+
+ private final long usableMemory;
+ private final long filterMemory0;
+ private final long filterMemory1;
+ private final long tableMemory;
+ private final long estimatedKmerCapacity;
+
+ /** Preallocation flag (presumably: allocate table memory up front; confirm against the constructor) */
+ private final boolean prealloc;
+
+ /** Number of tables (and threads, during loading) */
+ public final int ways;
+
+ /** Total kmer length */
+ public final int kbig;
+ /** Normal kmer length */
+ public final int k;
+ /** kbig-1; used in some expressions */
+ public final int kbig2;
+ /** Number of little kmers in a big kmer */
+ public final int mult;
+
+ /** Look for reverse-complements as well as forward kmers. Default: true */
+ private final boolean rcomp;
+
+ /** Quality-trim the left side */
+ public final boolean qtrimLeft;
+ /** Quality-trim the right side */
+ public final boolean qtrimRight;
+ /** Trim bases at this quality or below. Default: 4 */
+ public final byte trimq;
+
+ /** Throw away reads below this average quality before trimming. Default: 0 */
+ public final byte minAvgQuality;
+ /** If positive, calculate average quality from the first X bases only. Default: 0 */
+ public final int minAvgQualityBases;
+
+ /** Ignore kmers with probability of correctness less than this */
+ public final float minProb;
+
+ /** Correct via overlap */
+ private final boolean ecco;
+
+ /** Attempt to merge via overlap prior to counting kmers */
+ private final boolean merge;
+
+}
diff --git a/current/var/ApplyVarsToReference.java b/current/var/ApplyVarsToReference.java
new file mode 100755
index 0000000..b33bf6a
--- /dev/null
+++ b/current/var/ApplyVarsToReference.java
@@ -0,0 +1,325 @@
+package var;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+
+import align2.IndexMaker4;
+import align2.ReadStats;
+import align2.Tools;
+import dna.ChromArrayMaker;
+import dna.ChromosomeArray;
+import dna.Data;
+import dna.FastaToChromArrays2;
+import dna.Gene;
+import dna.Timer;
+import fileIO.ReadWrite;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 23, 2012
+ *
+ */
+public class ApplyVarsToReference {
+
+ public static void main(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+ Timer t=new Timer();
+
+ String inPattern=args[0];
+
+ int minChrom=-1;
+ int maxChrom=-1;
+ int outgenome=-1;
+ Data.GENOME_BUILD=-1;
+ String name=null;
+
+ for(int i=1; i<args.length; i++){
+ final String arg=args[i].toLowerCase();
+ String[] split=arg.split("=");
+ String a=split[0];
+ String b=(split.length>1 ? split[1] : null);
+
+ if(a.equals("ingenome")){
+ Data.setGenome(Integer.parseInt(b));
+ if(minChrom==-1){minChrom=1;}
+ if(maxChrom==-1){maxChrom=Data.numChroms;}
+ }else if(a.equals("outgenome")){
+ outgenome=Integer.parseInt(b);
+ }else if(a.equals("minchrom")){
+ minChrom=Integer.parseInt(b);
+ }else if(a.equals("maxchrom")){
+ maxChrom=Integer.parseInt(b);
+ }else if(a.equals("threads") || a.equals("t")){
+ THREADS=Integer.parseInt(b);
+ }else if(a.equals("nblocksize")){
+ N_BLOCK_SIZE=Integer.parseInt(b);
+ }else if(a.equals("nblocktrigger")){
+ N_BLOCK_TRIGGER=Integer.parseInt(b);
+ }else if(a.equals("staynearref")){
+ STAY_NEAR_REF=Tools.parseBoolean(b);
+ }else if(a.equals("append") || a.equals("app")){
+ append=ReadStats.append=Tools.parseBoolean(b);
+ }else if(a.equals("overwrite") || a.equals("ow")){
+ overwrite=Tools.parseBoolean(b);
+ }else if(a.startsWith("regen")){
+ REGEN_N_BLOCKS=Tools.parseBoolean(b);
+ }else if(a.startsWith("name=")){
+ REGEN_N_BLOCKS=Tools.parseBoolean(b);
+ }else{
+ System.err.println("Unknown argument "+arg);
+ }
+ }
+
+ assert(Data.GENOME_BUILD>-1);
+ assert(outgenome>-1);
+// assert(Data.GENOME_BUILD!=outgenome);
+ if(Data.GENOME_BUILD==outgenome){
+ System.out.println("Warning! Overwriting input genome "+outgenome);
+ }
+
+ String fname=Data.chromFname(minChrom, outgenome);
+ File f=new File(fname.substring(0, fname.lastIndexOf('/')));
+// assert(false) : f.getAbsolutePath();
+ if(!f.exists()){f.mkdirs();}
+
+ for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+ String outName=Data.chromFname(chrom, outgenome);
+ assert(overwrite || !new File(outName).exists()) : "Destination "+outName+" already exists.";
+// assert(false) : inPattern+", "+outName;
+ process(inPattern.replaceFirst("#", ""+chrom), outName, chrom);
+ }
+
+ FastaToChromArrays2.writeInfo(outgenome, maxChrom, (name==null ? Data.name : name), ""+Data.GENOME_BUILD+"_plus_variations", false, false);
+
+ t.stop();
+
+ {
+ String path=IndexMaker4.fname(1, 1, 12, 1);
+ int lastSlash=path.lastIndexOf('/');
+ path=path.substring(0, lastSlash);
+ File dir=new File(path);
+ if(dir.exists()){
+ System.out.println("Deleting old index for "+outgenome);
+ for(File f2 : dir.listFiles()){
+ if(f2.isFile() && (f2.getName().contains(".int2d") || f2.getName().endsWith(".txt"))){
+ f2.delete();
+ }
+ }
+ }
+ }
+
+// System.out.println("Vars in: \t"+VARS_IN);
+// System.out.println("Vars out:\t"+VARS_OUT);
+ System.out.println();
+ System.out.println("Time: \t"+t);
+
+ }
+
+ /**
+ * @param replaceFirst
+ * @param chromFname
+ * @param chrom
+ */
+ public static void process(String inVarsName, String outChromName, int chrom) {
+ ArrayList<Varlet> vars=Varlet.fromTextFile(inVarsName);
+ ChromosomeArray cha=Data.getChromosome(chrom);
+ ChromosomeArray chb=new ChromosomeArray(chrom, Gene.PLUS);
+
+ //Next location to read in a
+ int aloc=0;
+ //Next location to set in b
+ int bloc=0;
+
+ for(int i=0; i<vars.size(); i++){
+
+ Varlet v=vars.get(i);
+ assert(v.beginLoc>=aloc) : i+"\n"+vars.get(i-1)+"\n"+v+"\n"; //Overlapping variations
+
+ while(v.beginLoc<aloc){//skip it, for now.
+ System.err.print("e");
+ i++;
+ if(i>=vars.size()){break;}
+ v=vars.get(i);
+ }
+
+ if(STAY_NEAR_REF && Tools.absdif(aloc, bloc)>=REF_LIMIT){
+ int dif=v.lengthDif();
+
+ if(aloc<bloc){//skip insertions
+ while(dif>0){
+// System.err.print("i");
+ i++;
+ if(i>=vars.size()){break;}
+ v=vars.get(i);
+ dif=v.lengthDif();
+ }
+ }else{//skip deletions
+ while(dif<0){
+// System.err.print("d");
+ i++;
+ if(i>=vars.size()){break;}
+ v=vars.get(i);
+ dif=v.lengthDif();
+ }
+ }
+ }
+
+ //Advance to variation's beginning
+ while(aloc<v.beginLoc){
+ byte b=cha.get(aloc);
+ chb.set(bloc, b);
+ aloc++;
+ bloc++;
+ }
+
+ //Apply variation
+ if(v.varType==Variation.SNP){
+ String call=v.call;
+ String ref=v.ref;
+ if(ref!=null && ref.equals("=")){ref=null;}
+ for(int j=0; j<call.length(); j++){
+ char c=call.charAt(j);
+ if(ref!=null){
+ assert(ref.charAt(j)==cha.get(aloc)) : "\n"+i+", "+v;
+ }
+ chb.set(bloc, c);
+ aloc++;
+ bloc++;
+ }
+ }else if(v.varType==Variation.DELINS){
+ String call=v.call;
+ for(int j=0; j<call.length(); j++){
+ char c=call.charAt(j);
+ chb.set(bloc, c);
+ bloc++;
+ }
+ aloc+=v.lengthRef();
+ }else if(v.varType==Variation.NOCALL){
+ //Do nothing. But, it should have been removed already.
+ if(!foundNocall){
+ System.err.println("*** Warning - found a nocall in input variations ***");
+ foundNocall=true;
+ }
+ }else if(v.varType==Variation.NOREF){
+ String call=v.call;
+ for(int j=0; j<call.length(); j++){
+ char c=call.charAt(j);
+ assert(cha.get(aloc)=='N') : cha.get(aloc);
+ chb.set(bloc, c);
+ aloc++;
+ bloc++;
+ }
+ }else if(v.varType==Variation.INS){
+ String call=v.call;
+ for(int j=0; j<call.length(); j++){
+ char c=call.charAt(j);
+ chb.set(bloc, c);
+ bloc++;
+ }
+ }else if(v.varType==Variation.DEL){
+ int len=v.lengthRef();
+ assert(len>0);
+ aloc+=len;
+ }
+ }
+
+ //Finish writing array
+ while(aloc<cha.array.length || aloc<=cha.maxIndex){
+ byte c=cha.get(aloc);
+ chb.set(bloc, c);
+ aloc++;
+ bloc++;
+ }
+
+ System.out.println("Length Shift for chr"+chrom+": \t"+(bloc-aloc));
+
+ Data.unload(chrom, true);
+ cha=null;
+
+ if(REGEN_N_BLOCKS){
+ chb=regenNBlocks(chb, N_BLOCK_SIZE, N_BLOCK_TRIGGER, N_BLOCK_END_SIZE);
+ }
+
+ chb.resize(chb.maxIndex+1);
+
+ //Can't do this because it is read later
+// if(THREADS==1){ReadWrite.writeObjectInThread(cac, outChromName);}
+// else{ReadWrite.write(cac, outChromName);}
+
+ ReadWrite.write(chb, outChromName, false);
+ }
+
+ /**
+  * Copies a chromosome while normalizing its runs of 'N'.
+  * Leading Ns are copied and padded up to endsize. An interior run of at
+  * least trigger Ns is padded up to blocksize (longer runs are not shortened).
+  * Trailing Ns are padded up to endsize.
+  * @param cha Source chromosome
+  * @param blocksize Minimum length of an interior N run that reached the trigger
+  * @param trigger Interior N-run length at which padding is applied
+  * @param endsize Minimum number of Ns at each end
+  * @return A new chromosome array holding the adjusted sequence
+  */
+ public static ChromosomeArray regenNBlocks(ChromosomeArray cha, int blocksize, int trigger, int endsize){
+     ChromosomeArray chb=new ChromosomeArray(cha.chromosome, cha.strand, cha.minIndex, cha.maxIndex);
+     chb.maxIndex=-1;
+
+     int aloc=0;
+     int bloc=0;
+     int ns=0;
+
+     //Process start
+     //Bounded by maxIndex so an all-N source cannot scan past its end
+     while(aloc<=cha.maxIndex && cha.get(aloc)=='N'){
+         chb.set(bloc, 'N');
+         ns++;
+         aloc++;
+         bloc++;
+     }
+     while(ns<endsize){
+         chb.set(bloc, 'N');
+         ns++;
+         bloc++;
+     }
+     ns=0;
+
+
+     //Process middle
+     while(aloc<=cha.maxIndex){
+         byte b=cha.get(aloc);
+         if(b=='N'){
+             ns++;
+         }else{
+             if(ns>=trigger){
+                 while(ns<blocksize){
+                     chb.set(bloc, 'N');
+                     bloc++;
+                     ns++;
+                 }
+             }
+             ns=0;
+         }
+         chb.set(bloc, b);
+         aloc++;
+         bloc++;
+     }
+
+
+     //Process end
+     //Count the Ns already at the tail; the old scan never incremented ns, so endsize extra Ns were always appended
+     ns=0;
+     for(int i=chb.maxIndex; i>=0 && chb.get(i)=='N'; i--){
+         ns++;
+     }
+     //Appends at maxIndex+1 each time; relies on set() advancing maxIndex, as the original code did
+     while(ns<endsize){
+         chb.set(chb.maxIndex+1, 'N');
+         ns++;
+     }
+
+     return chb;
+ }
+
+ public static int THREADS=1; //Set by the threads/t argument in main
+
+ private static boolean foundNocall=false; //Set once process has printed its nocall warning, so it prints only once
+ private static boolean STAY_NEAR_REF=false; //Set by the staynearref argument; enables skipping of insertions/deletions in process
+ private static final int REF_LIMIT=20; //Read/write position difference at which STAY_NEAR_REF skipping starts
+ public static boolean REGEN_N_BLOCKS=true; //When true, process passes its output through regenNBlocks
+ public static int N_BLOCK_END_SIZE=2000; //Passed to regenNBlocks as endsize
+ public static int N_BLOCK_SIZE=300; //Passed to regenNBlocks as blocksize; set by the nblocksize argument
+ public static int N_BLOCK_TRIGGER=80; //Passed to regenNBlocks as trigger; set by the nblocktrigger argument
+ /** Permission to overwrite existing files */
+ public static boolean overwrite=false;
+ /** Permission to append to existing files */
+ public static boolean append=false;
+
+}
diff --git a/current/var/GenerateConsensusVariations.java b/current/var/GenerateConsensusVariations.java
new file mode 100755
index 0000000..e301f74
--- /dev/null
+++ b/current/var/GenerateConsensusVariations.java
@@ -0,0 +1,247 @@
+package var;
+
+import java.util.Arrays;
+
+
+import align2.Tools;
+import dna.ChromosomeArray;
+import dna.CoverageArray;
+import dna.Data;
+import dna.Timer;
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+import fileIO.TextStreamWriter;
+
+/**
+ * @author Brian Bushnell
+ * @date Jul 23, 2012
+ *
+ */
+public class GenerateConsensusVariations {
+
+ /**
+  * Command-line entry point. args[0], args[1] and args[2] are the variation,
+  * coverage and output file patterns, in each of which the first '#' is
+  * replaced by the chromosome number. Remaining arguments are key=value
+  * options (mincov*, consensus*, genome/build, verbose, minchrom, maxchrom,
+  * threads/t, noref*/undef*). Each chromosome in [minChrom, maxChrom] is
+  * processed, then summary statistics are printed.
+  * @param args Command-line arguments
+  */
+ public static void main(String[] args){
+     System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+     final Timer timer=new Timer();
+
+     final String inVarsPattern=args[0];
+     final String inCovPattern=args[1];
+     final String outPattern=args[2];
+
+     assert(!inVarsPattern.equalsIgnoreCase(outPattern));
+     assert(!inCovPattern.equalsIgnoreCase(outPattern));
+
+     int minChrom=-1;
+     int maxChrom=-1;
+     int minCoverage=1;
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+     for(int argnum=3; argnum<args.length; argnum++){
+         final String arg=args[argnum].toLowerCase();
+         final String[] parts=arg.split("=");
+         final String key=parts[0];
+         final String value=(parts.length>1 ? parts[1] : null);
+
+         if(key.startsWith("mincov")){
+             minCoverage=Integer.parseInt(value);
+             assert(minCoverage>0);
+         }else if(key.startsWith("consensus")){
+             consensusRatio=Float.parseFloat(value);
+             assert(consensusRatio>=0f && consensusRatio<=1f);
+             consensusRatioNR=1-(1-consensusRatio)*.5f; //Lower multiplier is more accurate
+         }else if(key.equals("genome") || key.equals("build")){
+             Data.setGenome(Integer.parseInt(value));
+             if(minChrom==-1){minChrom=1;}
+             if(maxChrom==-1){maxChrom=Data.numChroms;}
+         }else if(key.equals("verbose")){
+             verbose=Tools.parseBoolean(value);
+         }else if(key.equals("minchrom")){
+             minChrom=Integer.parseInt(value);
+         }else if(key.equals("maxchrom")){
+             maxChrom=Integer.parseInt(value);
+         }else if(key.equals("threads") || key.equals("t")){
+             THREADS=Integer.parseInt(value);
+         }else if(key.startsWith("noref") || key.startsWith("undef")){
+             NOREF_CAP=Integer.parseInt(value);
+         }else{
+             System.err.println("Unknown argument "+arg);
+         }
+     }
+
+     for(int chrom=minChrom; chrom<=maxChrom; chrom++){
+         final String varsFile=inVarsPattern.replaceFirst("#", ""+chrom);
+         final String covFile=inCovPattern.replaceFirst("#", ""+chrom);
+         final String outFile=outPattern.replaceFirst("#", ""+chrom);
+         process(varsFile, covFile, outFile, chrom, minCoverage);
+     }
+
+     timer.stop();
+
+     System.out.println();
+     System.out.println("Vars in: \t"+(VARS_IN-NOREFS_IN));
+     System.out.println("Length Delta in: \t"+VARLEN_IN);
+     System.out.println("No-refs in: \t"+NOREFS_IN);
+     System.out.println();
+     System.out.println("Vars out: \t"+(VARS_OUT-NOREFS_OUT));
+     System.out.println("Length Delta out: \t"+VARLEN_OUT);
+     System.out.println("No-refs out: \t"+NOREFS_OUT);
+     System.out.println();
+     System.out.println("Time: \t"+timer);
+
+ }
+
+ /**
+  * Filters the variations of one chromosome and writes the survivors.
+  * Lines that are empty or start with '#' are ignored. A variation is kept
+  * if it passes passesFilter. When it overlaps the pending one (its beginLoc
+  * is at or before the pending endLoc), it replaces the pending one only if
+  * it passes the filter and has a strictly higher score; otherwise it is dropped.
+  * @param invars Input variation file
+  * @param incov Input coverage array file
+  * @param outfile Output variation file
+  * @param chrom Chromosome number
+  * @param mincov Minimum coverage, forwarded to passesFilter
+  */
+ public static void process(final String invars, final String incov, final String outfile, final int chrom, final int mincov){
+     TextFile tf=new TextFile(invars, true, false);
+     CoverageArray ca=ReadWrite.read(CoverageArray.class, incov, true);
+     TextStreamWriter tsw=new TextStreamWriter(outfile, true, false, true);
+     tsw.start();
+
+     ChromosomeArray cha=Data.getChromosome(chrom);
+
+     //Most recent accepted variation, held back until the next non-overlapping one arrives
+     Varlet prev=null;
+
+     tsw.println(Varlet.header());
+
+     for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+         //Guard against blank lines; charAt(0) would throw on an empty string
+         if(s.length()>0 && s.charAt(0)!='#'){
+             Varlet v=Varlet.fromText(s);
+             VARS_IN++;
+             VARLEN_IN+=v.lengthDif();
+             if(v.varType==Variation.NOREF){NOREFS_IN++;}
+
+             boolean passes=passesFilter(v, ca, cha, mincov);
+             boolean overlap=(prev==null ? false : v.beginLoc<=prev.endLoc);
+
+             if(!overlap){
+                 if(prev!=null){writeAndCount(prev, tsw);}
+                 prev=null;
+             }else{
+                 if(passes && v.score()>prev.score()){
+                     prev=null;
+                 }else{
+                     v=null;
+                 }
+             }
+
+             if(passes && v!=null){
+                 prev=v;
+             }
+         }
+     }
+
+     if(prev!=null){writeAndCount(prev, tsw);}
+
+     tf.close();
+     tsw.poison();
+     Data.unload(chrom, true);
+
+ }
+
+ /** Writes one variation as a text line and updates the output statistics. */
+ private static void writeAndCount(final Varlet v, final TextStreamWriter tsw){
+     StringBuilder sb=v.toText().append('\n');
+     tsw.print(sb);
+     VARS_OUT++;
+     VARLEN_OUT+=v.lengthDif();
+     if(v.varType==Variation.NOREF){NOREFS_OUT++;}
+ }
+
+
+ /**
+ * @param v
+ * @param ca
+ * @return
+ */
+ private static boolean passesFilter(Varlet v, CoverageArray ca, ChromosomeArray cha, int minCoverageToPass) {
+
+ int dif=v.lengthDif();
+
+ int midLoc=(v.beginLoc+v.endLoc)/2;
+ int midCov=ca.get(midLoc);
+ int maxCov=midCov, minCov=midCov;
+
+ int bound1, bound2;
+ float ratio;
+
+ if(verbose){System.err.println("\nConsidering varlet "+v);}
+
+ if(v.varType==Variation.NOREF){
+ bound1=v.beginLoc;
+ bound2=v.endLoc;
+ minCoverageToPass=minCoverageToPass*2+5;
+ ratio=consensusRatioNR;
+ }else{
+ bound1=v.beginLoc;
+ bound2=v.endLoc;
+ ratio=consensusRatio;
+// if(dif<0){minCoverageToPass++;} //Helps reduce deletion bias
+ }
+
+ for(int i=bound1; i<=bound2; i++){
+ int cov=ca.get(i);
+ minCov=Tools.min(minCov, cov);
+ maxCov=Tools.max(maxCov, cov);
+ if(verbose){System.err.println("minCov = "+minCov+", maxCov = "+maxCov);}
+ }
+// if(dif<)
+
+ if(minCov<minCoverageToPass){
+ if(verbose){System.err.println("Low coverage, "+minCov+"<"+minCoverageToPass+"\n"+v);}
+ return false;
+ }
+ int minReads=(int)Math.ceil(ratio*minCov);
+ if(v.numReads<minReads){
+ if(verbose){System.err.println("Low reads, mincov="+minCov+", "+v.numReads+"<"+minReads+"\n"+v);}
+ return false;
+ }
+ if(v.minStrandReads()<1 && v.numSemiUniqueReads<2*minCoverageToPass){
+ if(verbose){System.err.println("Low strands, mincov="+minCov+", "+v.minStrandReads()+"<"+1+"\n"+v);}
+ return false;
+ }
+
+ //Check noref
+ if(v.varType==Variation.NOREF){
+ if(NOREF_CAP>=0){
+ int a=Tools.max(v.beginLoc-NOREF_CAP, cha.minIndex);
+ int b=Tools.min(v.endLoc+NOREF_CAP, cha.maxIndex);
+ if(cha.isFullyUndefined(a, b)){
+ if(verbose){System.err.println("Noref cap, mincov="+minCov+"\n"+v);}
+ return false;
+ }
+ }
+ }
+ if(verbose){System.err.println("Retaining variation.");}
+ return true;
+ }
+
+
+ /** TODO */
+ public static int THREADS=1; //Set by the threads/t argument in main
+ public static int NOREF_CAP=-1; //Set by noref*/undef* arguments; a negative value disables the undefined-flank check in passesFilter
+ public static float consensusRatio=1f; //Set by consensus* arguments; passesFilter requires numReads>=ceil(ratio*min coverage)
+ public static float consensusRatioNR=1f; //Ratio used for NOREF variations; derived from consensusRatio in main
+ public static long VARS_IN=0; //Variations read by process
+ public static long VARLEN_IN=0; //Sum of lengthDif() over variations read
+ public static long NOREFS_IN=0; //NOREF variations read
+ public static long VARS_OUT=0; //Variations written by process
+ public static long VARLEN_OUT=0; //Sum of lengthDif() over variations written
+ public static long NOREFS_OUT=0; //NOREF variations written
+ public static boolean verbose=false; //Set by the verbose argument; enables diagnostic output in passesFilter
+
+}
diff --git a/current/var/GenerateVarlets.java b/current/var/GenerateVarlets.java
new file mode 100755
index 0000000..b6b89a2
--- /dev/null
+++ b/current/var/GenerateVarlets.java
@@ -0,0 +1,664 @@
+package var;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.zip.ZipOutputStream;
+
+import pacbio.CalcCoverageFromSites;
+import pacbio.SiteR;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+import stream.SiteScoreR;
+
+
+import dna.Data;
+import dna.Gene;
+import dna.Timer;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+import align2.ListNum;
+import align2.MSA;
+import align2.MultiStateAligner9ts;
+import align2.Tools;
+import align2.TranslateColorspaceRead;
+
+public class GenerateVarlets {
+
+
+ /**
+  * Command-line entry point.
+  * The first three arguments are positional: reads1, reads2 (or "null"), and the output name pattern.
+  * Every later argument is lowercased, stripped of underscores, and parsed as a flag or key=value pair.
+  * @param args Command-line arguments
+  * @throws RuntimeException If positional arguments are missing or an argument is not recognized
+  */
+ public static void main(String[] args){
+
+     //Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
+     if(args==null || args.length<3){
+         throw new RuntimeException("Usage: GenerateVarlets <reads1> <reads2|null> <outname> [key=value ...]");
+     }
+
+     Data.GENOME_BUILD=-1;
+
+     String reads1=args[0];
+     String reads2=args[1].equalsIgnoreCase("null") ? null : args[1];
+     String outname=args[2];
+
+     String sitesfile=null;
+
+     int minChrom=1;
+     int maxChrom=1;
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+     for(int i=3; i<args.length; i++){
+         String arg=args[i].toLowerCase().replace("_", "");
+         String[] split=arg.split("=");
+         if(arg.equals("condense")){CONDENSE=true;}
+         else if(arg.startsWith("condense=")){
+             CONDENSE=Tools.parseBoolean(split[1]);
+         }else if(arg.equals("condensesnps")){CONDENSE_SNPS=true;}
+         else if(arg.startsWith("condensesnps=")){
+             CONDENSE_SNPS=Tools.parseBoolean(split[1]);
+         }else if(arg.startsWith("splitsubs=")){
+             SPLIT_SUBS=Tools.parseBoolean(split[1]);
+         }else if(arg.equals("tosssolo1")){
+             TOSS_SOLO1=true;
+         }else if(arg.equals("tosssolo2")){
+             TOSS_SOLO2=true;
+         }else if(arg.startsWith("tosssolo1=")){
+             TOSS_SOLO1=(split[1].equals("1") || split[1].startsWith("t"));
+         }else if(arg.startsWith("tosssolo2=")){
+             TOSS_SOLO2=(split[1].equals("1") || split[1].startsWith("t"));
+         }else if(arg.startsWith("minchrom=")){
+             minChrom=Byte.parseByte(split[1]);
+         }else if(arg.startsWith("maxchrom=")){
+             maxChrom=Byte.parseByte(split[1]);
+         }else if(arg.startsWith("build=") || arg.startsWith("genomebuild=") || arg.startsWith("genome=")){
+             Data.setGenome(Integer.parseInt(split[1]));
+             System.out.println("Set GENOME_BUILD to "+Data.GENOME_BUILD);
+         }else if(arg.startsWith("threads=")){
+             THREADS=(Integer.parseInt(split[1]));
+         }else if(arg.startsWith("buffer=") || arg.startsWith("writebuffer=")){
+             WRITE_BUFFER=(Integer.parseInt(split[1]));
+         }else if(arg.startsWith("maxreads=")){
+             MAX_READS=(Long.parseLong(split[1]));
+         }else if(arg.startsWith("sites=") || arg.startsWith("sitesfile=")){
+             //Re-split the untouched argument so the file name keeps its case and underscores.
+             final String arg0=args[i]; split=arg0.split("=");
+             sitesfile=split[1];
+         }else{
+             //Previously only an assert, so unknown arguments were ignored when assertions were disabled.
+             throw new RuntimeException("Unknown argument: "+arg);
+         }
+     }
+     assert(minChrom<=maxChrom && minChrom>=0);
+
+     if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;}
+     GenerateVarlets gv=new GenerateVarlets(reads1, reads2, outname, minChrom, maxChrom, MAX_READS, sitesfile);
+     gv.process();
+ }
+
+ /**
+  * Builds a generator that reads from one or two read files.
+  * @param fname1 First read file
+  * @param fname2 Second read file, or null
+  * @param outname_ Output name pattern containing '#'
+  * @param minChrom Lowest chromosome number to open an output for
+  * @param maxChrom Highest chromosome number to open an output for
+  * @param maxReads Read limit passed to the input streams
+  * @param sitesfile_ Optional sites file, or null
+  */
+ public GenerateVarlets(String fname1, String fname2, String outname_, int minChrom, int maxChrom, long maxReads, String sitesfile_){
+     this(new RTextInputStream(fname1, fname2, maxReads), outname_, minChrom, maxChrom, maxReads, sitesfile_);
+     //The check necessarily runs after delegation, because this(...) must be the first statement.
+     assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name.";
+ }
+
+ public GenerateVarlets(RTextInputStream stream_, String outname_, int minChrom, int maxChrom, long maxReads, String sitesfile_){
+ sitesfile=sitesfile_;
+ stream=stream_;
+ outname=outname_;
+ assert(outname.contains("#")) : "Output file name must contain the character '#' to be used for chromosome number.";
+
+ outArray=new OutputStream[maxChrom+1];
+ printArray=new PrintWriter[maxChrom+1];
+ for(int i=minChrom; i<outArray.length; i++){
+ outArray[i]=ReadWrite.getOutputStream(outname.replace("#", ""+i), false, true, false);
+ printArray[i]=new PrintWriter(outArray[i]);
+ printArray[i].println("#Chromosome "+i);
+ printArray[i].println(Varlet.textHeader());
+ }
+ cris=(USE_CRIS ? new ConcurrentLegacyReadInputStream(stream, maxReads) : null);
+ if(CONDENSE_SNPS){assert(!SPLIT_SUBS);}
+ }
+
+ public void finish(){
+
+ for(int i=0; i<printArray.length; i++){
+ if(printArray[i]!=null){
+ synchronized(printArray[i]){
+ printArray[i].flush();
+ if(outArray[i].getClass()==ZipOutputStream.class){
+ ZipOutputStream zos=(ZipOutputStream)outArray[i];
+ try {
+ zos.closeEntry();
+ zos.finish();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ printArray[i].close();
+ try {
+ outArray[i].close();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+// if(cris!=null){cris.shutdown();}
+// stream.shutdown();
+
+ if(cris!=null){ReadWrite.closeStream(cris);}
+ else{stream.close();}
+ }
+
+ /**
+  * Runs the whole job: optionally loads the sites file, starts THREADS worker threads,
+  * waits for each to finish, sums their counters, closes all outputs, and prints a summary.
+  */
+ public void process(){
+
+     Timer t=new Timer();
+
+     if(sitesfile!=null){
+         sitemap=loadSites(sitesfile);
+     }
+
+     //cris is null when USE_CRIS is false; workers then read from 'stream' directly.
+     if(cris!=null){cris.start();}
+
+     ProcessThread[] threadHandles=new ProcessThread[THREADS];
+     for(int i=0; i<THREADS; i++){
+         threadHandles[i]=new ProcessThread();
+         threadHandles[i].start();
+     }
+
+     long varsMade=0;
+     long norefsMade=0;
+     long snpMade=0;
+     long delMade=0;
+     long subnMade=0;
+     long subdMade=0;
+     long subiMade=0;
+     long insMade=0;
+     long deltaLen=0;
+
+     for(ProcessThread pt : threadHandles){
+         //Poll with a timeout so a missed notification cannot stall this loop indefinitely.
+         while(!pt.finished()){
+             synchronized(pt){
+                 try {
+                     pt.wait(1000);
+                 } catch (InterruptedException e) {
+                     e.printStackTrace();
+                 }
+             }
+         }
+         varsMade+=pt.varsMade;
+         norefsMade+=pt.norefsMade;
+         snpMade+=pt.snpMade;
+         delMade+=pt.delMade;
+         subnMade+=pt.subnMade;
+         subdMade+=pt.subdMade;
+         subiMade+=pt.subiMade;
+         insMade+=pt.insMade;
+         deltaLen+=pt.deltaLen;
+     }
+
+     finish();
+
+     t.stop();
+
+     System.out.println("\nOutput variations count");
+     System.out.println("Total (minus no-ref): \t"+(varsMade-norefsMade));
+     System.out.println("Deletions: \t"+(delMade));
+     System.out.println("D-type subs: \t"+(subdMade));
+     System.out.println("Insertions: \t"+(insMade));
+     System.out.println("I-type subs: \t"+(subiMade));
+     System.out.println("Snps: \t"+(snpMade));
+     System.out.println("N-type subs: \t"+(subnMade));
+     System.out.println("No-refs: \t"+(norefsMade));
+     System.out.println("Delta Length: \t"+(deltaLen));
+     System.out.println();
+     System.out.println("Time:\t"+t);
+ }
+
+
+ /**
+  * Older loader that groups sites into lists keyed by read id.
+  * The key is the numeric id, negated when the site's pair number is odd.
+  * @param fname Path of the sites file
+  * @return Map from signed read id to the sites listed for that read
+  */
+ private static final HashMap<Long, ArrayList<SiteScoreR>> loadSites_old(String fname) {
+     HashMap<Long, ArrayList<SiteScoreR>> map=new HashMap<Long, ArrayList<SiteScoreR>>(4096);
+     TextFile tf=new TextFile(fname, false, false);
+
+     try{
+         for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             SiteScoreR[] array=CalcCoverageFromSites.toSites(s);
+             for(SiteScoreR ssr : array){
+                 long key=ssr.numericID;
+                 if((ssr.pairnum&1)==1){
+                     key=-key;
+                     assert(key<0);
+                 }
+                 ArrayList<SiteScoreR> list=map.get(key);
+                 if(list==null){
+                     list=new ArrayList<SiteScoreR>(4);
+                     map.put(key, list);
+                 }
+                 list.add(ssr);
+             }
+         }
+     }finally{
+         tf.close(); //The file was previously left open.
+     }
+     return map;
+ }
+
+
+ /**
+  * Loads a sites file into a map of singly-linked SiteR chains.
+  * Each new site is pushed onto the front of the chain stored under its idPairnum key,
+  * so a chain holds its sites in reverse file order.
+  * @param fname Path of the sites file
+  * @return Map from idPairnum to the head of that key's site chain
+  */
+ private static final HashMap<Long, SiteR> loadSites(String fname) {
+     HashMap<Long, SiteR> map=new HashMap<Long, SiteR>(4096);
+     TextFile tf=new TextFile(fname, false, false);
+
+     try{
+         for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             SiteScoreR[] array=CalcCoverageFromSites.toSites(s);
+             for(SiteScoreR ssr : array){
+                 SiteR sr=new SiteR(ssr);
+                 Long key=sr.idPairnum;
+
+                 SiteR head=map.get(key);
+                 sr.next=head;
+                 map.put(key, sr);
+             }
+         }
+     }finally{
+         tf.close(); //The file was previously left open.
+     }
+     return map;
+ }
+
+
+ private void writeList(ArrayList<Varlet> list){
+
+ assert(list!=null && list.size()>0);
+ int chrom=list.get(0).chromosome;
+
+ PrintWriter out=printArray[chrom];
+ synchronized(out){
+ for(Varlet v : list){
+ out.println(v.toText());
+ }
+ }
+
+ }
+
+
+ private final class ProcessThread extends Thread {
+
+ /** Allocates one varlet buffer per chromosome slot; slot 0 is left null. */
+ public ProcessThread(){
+     int chrom=1;
+     while(chrom<lists.length){
+         lists[chrom]=new ArrayList<Varlet>(WRITE_BUFFER);
+         chrom++;
+     }
+ }
+
+ /**
+  * Worker loop: pulls read lists from the input until it is exhausted or terminate is set,
+  * turns them into varlets, then writes out whatever remains in the per-chromosome buffers.
+  * The finished flag is always set and waiters are always notified, even if processing throws,
+  * so the thread waiting in process() cannot block forever.
+  */
+ @Override
+ public void run(){
+
+     try{
+         if(cris!=null){
+             ListNum<Read> ln=cris.nextList();
+             ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+             while(!terminate && reads!=null && reads.size()>0){
+                 processReads(reads);
+                 cris.returnList(ln.id, ln.list.isEmpty());
+                 ln=cris.nextList();
+                 reads=(ln!=null ? ln.list : null);
+             }
+             //ln is null when the stream returned no list; returning it unconditionally threw an NPE.
+             if(ln!=null){
+                 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+             }
+         }else{
+             ArrayList<Read> reads=stream.nextList();
+             while(!terminate && reads!=null && reads.size()>0){
+                 processReads(reads);
+                 reads=stream.nextList();
+             }
+         }
+
+         //Flush the partially filled buffers.
+         for(ArrayList<Varlet> list : lists){
+             if(list!=null && !list.isEmpty()){
+                 if(MERGE_EQUAL_VARLETS){
+                     mergeEqualVarlets(list);
+                 }else{
+                     Collections.sort(list);
+                 }
+                 writeList(list);
+             }
+         }
+     }finally{
+         finished=true;
+         synchronized(this){this.notifyAll();}
+     }
+ }
+
+ private void processReads(ArrayList<Read> reads){
+
+ if(sitemap==null){
+ for(Read r : reads){
+ Read r2=r.mate;
+ assert(r2==null || r.mate.mate==r);
+
+ if(r2==null){
+ processRead(r);
+ }else{
+ if(!TOSS_SOLO1 || r.paired()){processRead(r);}
+ if(!TOSS_SOLO2 || r2.paired()){processRead(r2);}
+ }
+ }
+ }else{
+ for(Read r : reads){
+ Read r2=r.mate;
+ assert(r2==null || r.mate.mate==r);
+
+ if(r2==null){
+ multiprocessRead(r);
+ }else{
+ if(!TOSS_SOLO1 || r.paired()){multiprocessRead(r);}
+ if(!TOSS_SOLO2 || r2.paired()){multiprocessRead(r2);}
+ }
+ }
+ }
+ }
+
+ @Deprecated
+ private void multiprocessRead_old(Read r){
+ long key=r.numericID;
+ if((r.pairnum()&1)==1){
+ key=-key;
+ assert(key<0);
+ }
+ if(true){throw new RuntimeException("Deprecated.");}
+ ArrayList<SiteScoreR> alssr=null;//sitemap.get(key);
+ if(alssr==null){return;}
+
+
+ for(SiteScoreR ssr : alssr){
+ SiteScore ss=find(ssr, r.sites);
+ assert(ss!=null) : "\nCan't find ssr "+ssr+" in read\n"+r+"\n";
+
+ r.clearSite();
+ r.setFromSite(ss);
+ r.match=null;
+
+ r.setPaired(ss.pairedScore>0);
+ r.setPerfect(ss.perfect);
+ r.setRescued(ss.rescued);
+
+ processRead(r);
+ }
+ }
+
+ private void multiprocessRead(Read r){
+ long key=r.numericID;
+ if((r.pairnum()&1)==1){
+ key=-key;
+ assert(key<0);
+ }
+
+
+ SiteR head=sitemap.get(key);
+
+// assert(head==null) : "\n"+r.pairnum()+", "+key+",\n"+r.list+",\n"+r.mate.list+"\n"+head.toTextRecursive(null)+"\n";
+
+ while(head!=null){
+ SiteScore ss=find(head, r.sites);
+ assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r+"\n";
+
+ r.clearSite();
+ r.setFromSite(ss);
+ r.match=null;
+
+ r.setPaired(ss.pairedScore>0);
+ r.setPerfect(ss.perfect);
+ r.setRescued(ss.rescued);
+
+ processRead(r);
+ SiteR old=head;
+ head=old.next;
+ old.next=null; //Clears up memory.
+ }
+ }
+
+ /**
+  * Linear search for the first SiteScore that the given SiteScoreR reports as equal.
+  * @param ssr Site to look for
+  * @param list Candidate sites of a read
+  * @return The first matching SiteScore, or null if none matches
+  */
+ private SiteScore find(SiteScoreR ssr, ArrayList<SiteScore> list) {
+     SiteScore found=null;
+     for(SiteScore candidate : list){
+         if(ssr.equals(candidate)){
+             found=candidate;
+             break;
+         }
+     }
+     return found;
+ }
+
+ /**
+  * Linear search for the first SiteScore that the given SiteR reports as equal.
+  * @param sr Site to look for
+  * @param list Candidate sites of a read
+  * @return The first matching SiteScore, or null if none matches
+  */
+ private SiteScore find(SiteR sr, ArrayList<SiteScore> list) {
+     SiteScore found=null;
+     for(SiteScore candidate : list){
+         if(sr.equals(candidate)){
+             found=candidate;
+             break;
+         }
+     }
+     return found;
+ }
+
+
+ /**
+  * Converts one mapped read into varlets and buffers those far enough from the read ends.
+  * Unmapped, invalid and perfectly matching reads produce nothing. A read without a match
+  * string is realigned first; a read that still has no match string, or whose match string
+  * starts with 'X', is reported on stderr and skipped.
+  * @param r Read to process
+  */
+ private void processRead(Read r){
+
+     assert((r.chrom>=1)==r.mapped()) : r.toText(false);
+     if(!r.mapped()){//Unmapped.
+         assert(r.sites==null || r.sites.isEmpty()) : r.toText(false);
+         return;
+     }
+     if(r.invalid()){return;} //Probably trimmed too short to be used.
+
+     if(r.match!=null){
+         if(r.perfect()){//Hopefully this will be set correctly...
+             assert(TranslateColorspaceRead.perfectMatch(r.match));
+             return;
+         }else if(TranslateColorspaceRead.perfectMatch(r.match)){
+             return;
+         }
+     }
+
+     if(r.match==null){
+         tcr.realign_new(r, 20, true, 0, false); //Also generates the match string
+         if(TranslateColorspaceRead.perfectMatch(r.match)){return;}
+     }
+     r.errors=r.estimateErrors();
+
+     if(r.match==null){
+         System.err.println("Could not align read "+r.numericID);
+         return;
+     }else if(r.match[0]=='X'){
+         System.err.println("Could not align read "+r.numericID+": "+new String(r.match));
+         return;
+     }
+
+     assert(CONDENSE);
+     ArrayList<Varlet> vars=tcr.toVars(r, CONDENSE, CONDENSE_SNPS, SPLIT_SUBS);
+
+     if(vars==null){return;}
+
+     for(Varlet v : vars){
+         if(v.endDist>=MIN_END_DIST){
+             assert(v.numUniqueReads==1);
+             assert(v.numSemiUniqueReads==1);
+             assert(v.numPlusReads1+v.numMinusReads1+v.numPlusReads2+v.numMinusReads2==1);
+             assert(v.numReads>=1);
+             assert(v.numReads==r.copies);
+             assert(v.readLen==r.length());
+             tallyVar(v);
+             addVar(v);
+         }
+     }
+ }
+
+ /** Updates this thread's per-type counters and length delta for one accepted varlet. */
+ private void tallyVar(Varlet v){
+     varsMade++;
+     if(v.varType==Variation.NOREF){norefsMade++;}
+     else if(v.varType==Variation.SNP){snpMade++;}
+     else if(v.varType==Variation.DEL){delMade++;}
+     else if(v.varType==Variation.INS){insMade++;}
+     else if(v.varType==Variation.DELINS){
+         //Substitutions are split by whether they keep, shrink or grow the reference length.
+         int a=v.lengthRef();
+         int b=v.lengthVar();
+         if(a==b){subnMade++;}
+         else if(a>b){subdMade++;}
+         else{subiMade++;}
+     }
+     deltaLen+=v.lengthDif();
+ }
+
+ private void addVar(Varlet v){
+ ArrayList<Varlet> list=lists[v.chromosome];
+ list.add(v);
+ if(list.size()>=WRITE_BUFFER){
+
+ if(MERGE_EQUAL_VARLETS){
+ mergeEqualVarlets(list);
+ }else{
+ Collections.sort(list);
+ }
+
+ writeList(list);
+ lists[v.chromosome]=new ArrayList<Varlet>(WRITE_BUFFER);
+ }
+ }
+
+ /**
+  * Sorts the list and collapses each run of equal varlets into a single merged varlet, in place.
+  * Every slot is nulled as it is consumed; the merged result of a run is stored in the run's
+  * last slot, and Tools.condenseStrict is then applied (presumably to drop the null slots).
+  * @param vars Varlets to merge; modified in place
+  */
+ private void mergeEqualVarlets(ArrayList<Varlet> vars){
+
+     Collections.sort(vars);
+     ArrayList<Varlet> list=new ArrayList<Varlet>(8); //Current run of equal varlets.
+     for(int i=0; i<vars.size(); i++){
+         Varlet a=vars.get(i);
+         vars.set(i, null);
+         Varlet b=(list.isEmpty() ? null : list.get(0));
+         if(b==null || a.equals(b)){
+             list.add(a);
+         }else{//purge
+             Varlet c=StackVariations.mergeEqualVarlets(list);
+             vars.set(i-1, c); //Slot i-1 is the last slot of the run that just ended.
+             list.clear();
+             list.add(a);
+         }
+     }
+     if(!list.isEmpty()){
+         Varlet c=StackVariations.mergeEqualVarlets(list);
+         //The final run ends at the last slot of vars. Using list.size()-1 here
+         //overwrote an earlier merged varlet near the front of the list.
+         vars.set(vars.size()-1, c);
+     }
+     Tools.condenseStrict(vars);
+ }
+
+ /** @return true once run() has completed */
+ protected boolean finished(){return finished;}
+ /** Asks the worker loop to stop after the read list it is currently processing. */
+ protected void terminate(){terminate=true;}
+
+ //NOTE(review): both branches of this ternary construct the same aligner type, so PAC_BIO_MODE has no effect here - confirm intent.
+ private final TranslateColorspaceRead tcr=new TranslateColorspaceRead(PAC_BIO_MODE ?
+     new MultiStateAligner9ts(ALIGN_ROWS, ALIGN_COLUMNS) : new MultiStateAligner9ts(ALIGN_ROWS, ALIGN_COLUMNS));
+ /** Per-chromosome varlet buffers, indexed by chromosome number. */
+ private final ArrayList<Varlet> lists[]=new ArrayList[Gene.chromCodes.length];
+ /** Written by this thread and read by the thread waiting in process(); volatile so that the read also publishes the counters. */
+ private volatile boolean finished=false;
+ /** Written by another thread via terminate() and read by the worker loop. */
+ private volatile boolean terminate=false;
+ private long varsMade=0;
+ private long norefsMade=0;
+ private long snpMade=0;
+ private long delMade=0;
+ private long subnMade=0;
+ private long subdMade=0;
+ private long subiMade=0;
+ private long insMade=0;
+ private long deltaLen=0;
+
+
+ }
+
+ public final String outname;
+ public final String sitesfile;
+// private HashMap<Long, ArrayList<SiteScoreR>> sitemap=null;
+ private HashMap<Long, SiteR> sitemap=null;
+ private final RTextInputStream stream;
+ private final ConcurrentLegacyReadInputStream cris;
+ private final OutputStream[] outArray;
+ private final PrintWriter[] printArray;
+
+ public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads.
+
+ public static int THREADS=5;
+ public static int WRITE_BUFFER=20000; //Bigger number uses more memory, for less frequent writes.
+
+ public static boolean CONDENSE=true;
+ public static boolean CONDENSE_SNPS=true;
+ public static boolean SPLIT_SUBS=false;
+
+ public static boolean TOSS_SOLO1=false;
+ public static boolean TOSS_SOLO2=false;
+
+ public static boolean MERGE_EQUAL_VARLETS=false;
+ public static boolean PAC_BIO_MODE=true;
+ public static int ALIGN_ROWS=2020;
+ public static int ALIGN_COLUMNS=3000;
+
+
+ public static long MAX_READS=-1;
+ public static final int MIN_END_DIST=4;
+
+}
diff --git a/current/var/GenerateVarlets2.java b/current/var/GenerateVarlets2.java
new file mode 100755
index 0000000..1c4494f
--- /dev/null
+++ b/current/var/GenerateVarlets2.java
@@ -0,0 +1,647 @@
+package var;
+
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+
+import pacbio.CalcCoverageFromSites;
+import pacbio.SiteR;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+import stream.SiteScoreR;
+
+
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+import align2.ListNum;
+import align2.MultiStateAligner9ts;
+import align2.Tools;
+import align2.TranslateColorspaceRead;
+
+/** Splits output files across blocks for low memory usage */
+public class GenerateVarlets2 {
+
+
+ /**
+  * Command-line entry point.
+  * The first three arguments are positional: reads1, reads2 (or "null"), and the output name pattern.
+  * Every later argument is parsed as key=value; a key without a value is treated as "true".
+  * @param args Command-line arguments
+  * @throws RuntimeException If positional arguments are missing, a parameter is unknown, or no genome was set
+  */
+ public static void main(String[] args){
+
+     //Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
+     if(args==null || args.length<3){
+         throw new RuntimeException("Usage: GenerateVarlets2 <reads1> <reads2|null> <outname> [key=value ...]");
+     }
+
+     Data.GENOME_BUILD=-1;
+
+     String reads1=args[0];
+     String reads2=args[1].equalsIgnoreCase("null") ? null : args[1];
+     String outname=args[2];
+
+     String sitesfile=null;
+
+     byte minChrom=1;
+     byte maxChrom=1;
+     ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+     for(int i=3; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=(split.length>1 ? split[1] : "true");
+         //Accept the one-letter boolean shorthands.
+         if("t".equals(b)){b="true";}
+         if("f".equals(b)){b="false";}
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(Parser.parseZip(arg, a, b)){
+             //do nothing
+         }else if(a.equals("condense")){
+             CONDENSE=Tools.parseBoolean(b);
+         }else if(a.equals("condensesnps")){
+             CONDENSE_SNPS=Tools.parseBoolean(b);
+         }else if(a.startsWith("splitsubs")){
+             SPLIT_SUBS=Tools.parseBoolean(b);
+         }else if(a.equals("tosssolo1")){
+             TOSS_SOLO1=Tools.parseBoolean(b);
+         }else if(a.equals("tosssolo2")){
+             TOSS_SOLO2=Tools.parseBoolean(b);
+         }else if(a.startsWith("minchrom")){
+             minChrom=Byte.parseByte(b);
+         }else if(a.startsWith("maxchrom")){
+             maxChrom=Byte.parseByte(b);
+         }else if(a.startsWith("build") || a.startsWith("genomebuild") || a.startsWith("genome")){
+             Data.setGenome(Integer.parseInt(b));
+             System.out.println("Set GENOME_BUILD to "+Data.GENOME_BUILD);
+         }else if(a.equals("threads") || a.equals("t")){
+             THREADS=(Integer.parseInt(b));
+         }else if(a.startsWith("buffer") || a.startsWith("writebuffer")){
+             WRITE_BUFFER=(Integer.parseInt(b));
+         }else if(a.startsWith("maxreads")){
+             MAX_READS=(Long.parseLong(b));
+         }else if(a.equals("blocksize")){
+             BLOCKSIZE=(Integer.parseInt(b));
+         }else if(a.startsWith("sites") || a.startsWith("sitesfile")){
+             sitesfile=b;
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+     assert(minChrom<=maxChrom && minChrom>=0);
+     if(Data.GENOME_BUILD<0){throw new RuntimeException("Please set genome number.");}
+
+     if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;}
+     GenerateVarlets2 gv=new GenerateVarlets2(reads1, reads2, outname, minChrom, maxChrom, MAX_READS, sitesfile);
+     gv.process();
+ }
+
+ /**
+  * Builds a generator that reads from one or two read files.
+  * @param fname1 First read file
+  * @param fname2 Second read file, or null
+  * @param outname_ Output name pattern containing '#', or null
+  * @param minChrom Passed through to the stream-based constructor
+  * @param maxChrom Passed through to the stream-based constructor
+  * @param maxReads Read limit passed to the input streams
+  * @param sitesfile_ Optional sites file, or null
+  */
+ public GenerateVarlets2(String fname1, String fname2, String outname_, byte minChrom, byte maxChrom, long maxReads, String sitesfile_){
+     this(new RTextInputStream(fname1, fname2, maxReads), outname_, minChrom, maxChrom, maxReads, sitesfile_);
+     //The check necessarily runs after delegation, because this(...) must be the first statement.
+     assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name.";
+ }
+
+ /**
+  * Builds a generator on top of an existing read stream and prepares the per-block buffers
+  * and output files through makeKeyMap(). minChrom and maxChrom are accepted but not used here.
+  * @param stream_ Source of reads
+  * @param outname_ Output name pattern containing '#', or null
+  * @param minChrom Unused in this constructor
+  * @param maxChrom Unused in this constructor
+  * @param maxReads Read limit passed to the concurrent input stream
+  * @param sitesfile_ Optional sites file, or null
+  */
+ public GenerateVarlets2(RTextInputStream stream_, String outname_, byte minChrom, byte maxChrom, long maxReads, String sitesfile_){
+     sitesfile=sitesfile_;
+     stream=stream_;
+     outname=outname_;
+     assert(outname==null || outname.contains("#")) : "Output file name must contain the character '#' to be used for key number.";
+     makeKeyMap();
+
+     if(USE_CRIS){
+         cris=new ConcurrentLegacyReadInputStream(stream, maxReads);
+     }else{
+         cris=null;
+     }
+     assert(!CONDENSE_SNPS || !SPLIT_SUBS);
+ }
+
+ public void finish(){
+
+ ArrayList<Long> keys=new ArrayList<Long>();
+ keys.addAll(keymap.keySet());
+ Collections.sort(keys);
+ for(long k : keys){
+ ArrayList<Varlet> vars=keymap.remove(k);
+ if(!vars.isEmpty()){writeList(vars);}
+ }
+
+ if(cris!=null){ReadWrite.closeStream(cris);}
+ else{stream.close();}
+
+ }
+
+ /**
+  * Runs the whole job: optionally loads the sites file, starts THREADS worker threads,
+  * waits for each to finish, sums their counters, writes the remaining buffers, and prints a summary.
+  */
+ public void process(){
+
+     Timer t=new Timer();
+
+     if(sitesfile!=null){
+         sitemap=loadSites(sitesfile);
+     }
+
+     //cris is null when USE_CRIS is false; workers then read from 'stream' directly.
+     if(cris!=null){cris.start();}
+
+     ProcessThread[] threadHandles=new ProcessThread[THREADS];
+     for(int i=0; i<THREADS; i++){
+         threadHandles[i]=new ProcessThread();
+         threadHandles[i].start();
+     }
+
+     long varsMade=0;
+     long norefsMade=0;
+     long snpMade=0;
+     long delMade=0;
+     long subnMade=0;
+     long subdMade=0;
+     long subiMade=0;
+     long insMade=0;
+     long deltaLen=0;
+     long sitesProcessed=0;
+     long readsProcessed=0;
+
+     for(ProcessThread pt : threadHandles){
+         //Poll with a timeout so a missed notification cannot stall this loop indefinitely.
+         while(!pt.finished()){
+             synchronized(pt){
+                 try {
+                     pt.wait(1000);
+                 } catch (InterruptedException e) {
+                     e.printStackTrace();
+                 }
+             }
+         }
+         varsMade+=pt.varsMade;
+         norefsMade+=pt.norefsMade;
+         snpMade+=pt.snpMade;
+         delMade+=pt.delMade;
+         subnMade+=pt.subnMade;
+         subdMade+=pt.subdMade;
+         subiMade+=pt.subiMade;
+         insMade+=pt.insMade;
+         deltaLen+=pt.deltaLen;
+         sitesProcessed+=pt.sitesProcessed;
+         readsProcessed+=pt.readsProcessed;
+     }
+
+     finish();
+
+     t.stop();
+
+     System.out.println("\nOutput variations count");
+     System.out.println("Total (minus no-ref): \t"+(varsMade-norefsMade));
+     System.out.println("Deletions: \t"+(delMade));
+     System.out.println("D-type subs: \t"+(subdMade));
+     System.out.println("Insertions: \t"+(insMade));
+     System.out.println("I-type subs: \t"+(subiMade));
+     System.out.println("Snps: \t"+(snpMade));
+     System.out.println("N-type subs: \t"+(subnMade));
+     System.out.println("No-refs: \t"+(norefsMade));
+     System.out.println("Delta Length: \t"+(deltaLen));
+     System.out.println("Sites Processed: \t"+(sitesProcessed));
+     System.out.println("Reads Processed: \t"+(readsProcessed));
+     System.out.println();
+     System.out.println("Time:\t"+t);
+ }
+
+
+ /**
+  * Loads a sites file into a map of singly-linked SiteR chains.
+  * Each new site is pushed onto the front of the chain stored under its idPairnum key,
+  * so a chain holds its sites in reverse file order.
+  * @param fname Path of the sites file
+  * @return Map from idPairnum to the head of that key's site chain
+  */
+ private static final HashMap<Long, SiteR> loadSites(String fname) {
+     HashMap<Long, SiteR> map=new HashMap<Long, SiteR>(4096);
+     TextFile tf=new TextFile(fname, false, false);
+
+     try{
+         for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             SiteScoreR[] array=CalcCoverageFromSites.toSites(s);
+             for(SiteScoreR ssr : array){
+                 SiteR sr=new SiteR(ssr);
+                 Long key=sr.idPairnum;
+
+                 SiteR head=map.get(key);
+                 sr.next=head;
+                 map.put(key, sr);
+             }
+         }
+     }finally{
+         tf.close(); //The file was previously left open.
+     }
+     return map;
+ }
+
+
+ private void writeList(ArrayList<Varlet> list){
+ assert(list!=null && list.size()>0);
+ long key=key(list.get(0).chromosome, list.get(0).beginLoc);
+ String fname=fname(key, outname);
+ boolean allowSubprocess=false;
+ OutputStream os=ReadWrite.getOutputStream(fname, true, true, allowSubprocess);
+ PrintWriter pw=new PrintWriter(os);
+
+
+ for(Varlet v : list){
+ pw.println(v.toText());
+ }
+ ReadWrite.finishWriting(pw, os, fname, allowSubprocess);
+ }
+
+
+ private final class ProcessThread extends Thread {
+
+ /** Creates a worker; all state is initialized by the field declarations. */
+ public ProcessThread(){
+ }
+
+ /**
+  * Worker loop: pulls read lists from the input until it is exhausted or terminate is set
+  * and turns them into varlets. The finished flag is always set and waiters are always
+  * notified, even if processing throws, so the thread waiting in process() cannot block forever.
+  */
+ @Override
+ public void run(){
+
+     try{
+         if(cris!=null){
+             ListNum<Read> ln=cris.nextList();
+             ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+             while(!terminate && reads!=null && reads.size()>0){
+                 processReads(reads);
+                 cris.returnList(ln.id, ln.list.isEmpty());
+                 ln=cris.nextList();
+                 reads=(ln!=null ? ln.list : null);
+             }
+             //ln is null when the stream returned no list; returning it unconditionally threw an NPE.
+             if(ln!=null){
+                 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+             }
+         }else{
+             ArrayList<Read> reads=stream.nextList();
+             while(!terminate && reads!=null && reads.size()>0){
+                 processReads(reads);
+                 reads=stream.nextList();
+             }
+         }
+     }finally{
+         finished=true;
+         synchronized(this){this.notifyAll();}
+     }
+ }
+
+ private void processReads(ArrayList<Read> reads){
+
+ if(sitemap==null){
+ for(Read r : reads){
+ Read r2=r.mate;
+ assert(r2==null || r.mate.mate==r);
+
+ if(r2==null){
+ processRead(r);
+ }else{
+ if(!TOSS_SOLO1 || r.paired()){processRead(r);}
+ if(!TOSS_SOLO2 || r2.paired()){processRead(r2);}
+ }
+ }
+ }else{
+ for(Read r : reads){
+ Read r2=r.mate;
+ assert(r2==null || r.mate.mate==r);
+
+ if(r2==null){
+ multiprocessRead(r);
+ }else{
+ if(!TOSS_SOLO1 || r.paired()){multiprocessRead(r);}
+ if(!TOSS_SOLO2 || r2.paired()){multiprocessRead(r2);}
+ }
+ }
+ }
+ }
+
+ @Deprecated
+ private void multiprocessRead_old(Read r){
+ long key=r.numericID;
+ if((r.pairnum()&1)==1){
+ key=-key;
+ assert(key<0);
+ }
+ if(true){throw new RuntimeException("Deprecated.");}
+ ArrayList<SiteScoreR> alssr=null;//sitemap.get(key);
+ if(alssr==null){return;}
+
+
+ for(SiteScoreR ssr : alssr){
+ SiteScore ss=find(ssr, r.sites);
+ assert(ss!=null) : "\nCan't find ssr "+ssr+" in read\n"+r+"\n";
+
+ r.clearSite();
+ r.setFromSite(ss);
+ r.match=null;
+
+ r.setPaired(ss.pairedScore>0);
+ r.setPerfect(ss.perfect);
+ r.setRescued(ss.rescued);
+
+ processRead(r);
+ }
+ }
+
+ private void multiprocessRead(Read r){
+ long key=r.numericID;
+ if((r.pairnum()&1)==1){
+ key=-key;
+ assert(key<0);
+ }
+
+
+ SiteR head=sitemap.get(key);
+ if(head!=null){readsProcessed++;}
+
+// assert(head==null) : "\n"+r.pairnum()+", "+key+",\n"+r.list+",\n"+r.mate.list+"\n"+head.toTextRecursive(null)+"\n";
+
+ while(head!=null){
+ SiteScore ss=find(head, r.sites);
+ assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r+"\n";
+
+ r.clearSite();
+ r.setFromSite(ss);
+ r.match=null;
+
+ r.setPaired(ss.pairedScore>0);
+ r.setPerfect(ss.perfect);
+ r.setRescued(ss.rescued);
+
+ processRead(r);
+ SiteR old=head;
+ head=old.next;
+ old.next=null; //Clears up memory.
+ }
+ }
+
+ /**
+  * Linear search for the first SiteScore that the given SiteScoreR reports as equal.
+  * @param ssr Site to look for
+  * @param list Candidate sites of a read
+  * @return The first matching SiteScore, or null if none matches
+  */
+ private SiteScore find(SiteScoreR ssr, ArrayList<SiteScore> list) {
+     SiteScore found=null;
+     for(SiteScore candidate : list){
+         if(ssr.equals(candidate)){
+             found=candidate;
+             break;
+         }
+     }
+     return found;
+ }
+
+ /**
+  * Linear search for the first SiteScore that the given SiteR reports as equal.
+  * @param sr Site to look for
+  * @param list Candidate sites of a read
+  * @return The first matching SiteScore, or null if none matches
+  */
+ private SiteScore find(SiteR sr, ArrayList<SiteScore> list) {
+     SiteScore found=null;
+     for(SiteScore candidate : list){
+         if(sr.equals(candidate)){
+             found=candidate;
+             break;
+         }
+     }
+     return found;
+ }
+
+
+ /**
+  * Converts one mapped read into varlets and buffers those far enough from the read ends.
+  * Every call increments sitesProcessed. Unmapped, invalid and perfectly matching reads
+  * produce nothing. A read without a match string is realigned first; a read that still has
+  * no match string, or whose match string starts with 'X', is reported on stderr and skipped.
+  * @param r Read to process
+  */
+ private void processRead(Read r){
+     sitesProcessed++;
+
+     assert((r.chrom>=1)==r.mapped()) : r.toText(false);
+     if(!r.mapped()){//Unmapped.
+         assert(r.sites==null || r.sites.isEmpty()) : r.toText(false);
+         return;
+     }
+     if(r.invalid()){return;} //Probably trimmed too short to be used.
+
+     if(r.match!=null){
+         if(r.perfect()){//Hopefully this will be set correctly...
+             assert(TranslateColorspaceRead.perfectMatch(r.match));
+             return;
+         }else if(TranslateColorspaceRead.perfectMatch(r.match)){
+             return;
+         }
+     }
+
+     if(r.match==null){
+         tcr.realign_new(r, 20, true, 0, false); //Also generates the match string
+         if(TranslateColorspaceRead.perfectMatch(r.match)){return;}
+     }
+     r.errors=r.estimateErrors();
+
+     if(r.match==null){
+         System.err.println("Could not align read "+r.numericID);
+         return;
+     }else if(r.match[0]=='X'){
+         System.err.println("Could not align read "+r.numericID+": "+new String(r.match));
+         return;
+     }
+
+     assert(CONDENSE);
+     ArrayList<Varlet> vars=tcr.toVars(r, CONDENSE, CONDENSE_SNPS, SPLIT_SUBS);
+
+     if(vars==null){return;}
+
+     for(Varlet v : vars){
+         if(v.endDist>=MIN_END_DIST){
+             assert(v.numUniqueReads==1);
+             assert(v.numSemiUniqueReads==1);
+             assert(v.numPlusReads1+v.numMinusReads1+v.numPlusReads2+v.numMinusReads2==1);
+             assert(v.numReads>=1);
+             assert(v.numReads==r.copies);
+             assert(v.readLen==r.length());
+             tallyVar(v);
+             addVar(v);
+         }
+     }
+ }
+
+ /** Updates this thread's per-type counters and length delta for one accepted varlet. */
+ private void tallyVar(Varlet v){
+     varsMade++;
+     if(v.varType==Variation.NOREF){norefsMade++;}
+     else if(v.varType==Variation.SNP){snpMade++;}
+     else if(v.varType==Variation.DEL){delMade++;}
+     else if(v.varType==Variation.INS){insMade++;}
+     else if(v.varType==Variation.DELINS){
+         //Substitutions are split by whether they keep, shrink or grow the reference length.
+         int a=v.lengthRef();
+         int b=v.lengthVar();
+         if(a==b){subnMade++;}
+         else if(a>b){subdMade++;}
+         else{subiMade++;}
+     }
+     deltaLen+=v.lengthDif();
+ }
+
+ /** TODO: Synchronize once per read, not once per varlet */
+ private void addVar(Varlet v){
+ long key=key(v.chromosome, v.beginLoc);
+ ArrayList<Varlet> list=keymap.get(key);
+ assert(list!=null) : "\nCan't find "+key+" in "+keymap.keySet()+"\n";
+ synchronized(list){
+ list.add(v);
+ if(list.size()>=WRITE_BUFFER){
+
+ if(MERGE_EQUAL_VARLETS){
+ mergeEqualVarlets(list);
+ }else{
+ Collections.sort(list);
+ }
+
+ writeList(list);
+ list.clear();
+ }
+ }
+ }
+
+ /**
+  * Sorts the list and collapses each run of equal varlets into a single merged varlet, in place.
+  * Every slot is nulled as it is consumed; the merged result of a run is stored in the run's
+  * last slot, and Tools.condenseStrict is then applied (presumably to drop the null slots).
+  * @param vars Varlets to merge; modified in place
+  */
+ private void mergeEqualVarlets(ArrayList<Varlet> vars){
+
+     Collections.sort(vars);
+     ArrayList<Varlet> list=new ArrayList<Varlet>(8); //Current run of equal varlets.
+     for(int i=0; i<vars.size(); i++){
+         Varlet a=vars.get(i);
+         vars.set(i, null);
+         Varlet b=(list.isEmpty() ? null : list.get(0));
+         if(b==null || a.equals(b)){
+             list.add(a);
+         }else{//purge
+             Varlet c=StackVariations.mergeEqualVarlets(list);
+             vars.set(i-1, c); //Slot i-1 is the last slot of the run that just ended.
+             list.clear();
+             list.add(a);
+         }
+     }
+     if(!list.isEmpty()){
+         Varlet c=StackVariations.mergeEqualVarlets(list);
+         //The final run ends at the last slot of vars. Using list.size()-1 here
+         //overwrote an earlier merged varlet near the front of the list.
+         vars.set(vars.size()-1, c);
+     }
+     Tools.condenseStrict(vars);
+ }
+
+ /** @return true once run() has completed */
+ protected boolean finished(){return finished;}
+ /** Asks the worker loop to stop after the read list it is currently processing. */
+ protected void terminate(){terminate=true;}
+
+ //NOTE(review): both branches of this ternary construct the same aligner type, so PAC_BIO_MODE has no effect here - confirm intent.
+ private final TranslateColorspaceRead tcr=new TranslateColorspaceRead(PAC_BIO_MODE ?
+     new MultiStateAligner9ts(ALIGN_ROWS, ALIGN_COLUMNS) : new MultiStateAligner9ts(ALIGN_ROWS, ALIGN_COLUMNS));
+ /** Written by this thread and read by the thread waiting in process(); volatile so that the read also publishes the counters. */
+ private volatile boolean finished=false;
+ /** Written by another thread via terminate() and read by the worker loop. */
+ private volatile boolean terminate=false;
+ private long varsMade=0;
+ private long norefsMade=0;
+ private long snpMade=0;
+ private long delMade=0;
+ private long subnMade=0;
+ private long subdMade=0;
+ private long subiMade=0;
+ private long insMade=0;
+ private long deltaLen=0;
+ private long sitesProcessed=0;
+ private long readsProcessed=0;
+
+
+ }
+
+
+ /**
+  * Packs a chromosome number and a block index into one key: the chromosome occupies the
+  * bits above 32 and the block index (start clamped to 0, divided by BLOCKSIZE) is added below.
+  * @param chrom Chromosome number
+  * @param start Start location; negative values are treated as 0
+  * @return Combined block key
+  */
+ protected static final long key(int chrom, int start){
+     final int block=Tools.max(start, 0)/BLOCKSIZE;
+     final long chromBits=((long)chrom)<<32;
+     return chromBits+block;
+ }
+
+ /**
+  * Lists the block keys of one chromosome, covering its length plus 1000 extra positions.
+  * @param chrom Chromosome number
+  * @return One key per block, in ascending block order
+  */
+ protected static final long[] keys(final int chrom){
+     final int lastBlock=(Data.chromLengths[chrom]+1000)/BLOCKSIZE;
+     final long[] result=new long[lastBlock+1];
+     int block=0;
+     while(block<=lastBlock){
+         result[block]=key(chrom, block*BLOCKSIZE);
+         block++;
+     }
+     return result;
+ }
+
+ /**
+  * Builds the file name for a block by replacing '#' in the pattern with the genome build and key.
+  * @param key Block key
+  * @param outname Name pattern containing '#'; null selects "GV2TempFile_#.txt"
+  * @return File name for the block
+  */
+ protected static final String fname(long key, String outname){
+     final String pattern=(outname==null ? "GV2TempFile_#.txt" : outname);
+     assert(pattern.contains("#")) : pattern;
+     //Compressed extensions are rejected by assertion.
+     assert(!pattern.endsWith(".gz") && !pattern.endsWith(".zip") && !pattern.endsWith(".bz2")) : pattern;
+     final String tag="b"+Data.GENOME_BUILD+"_"+key;
+     return pattern.replace("#", tag);
+ }
+
+ /**
+  * Creates an empty buffer for every block of chromosomes 1 through Data.numChroms
+  * and writes the varlet header line to each block's file.
+  */
+ private final void makeKeyMap(){
+     final String header=Varlet.textHeader()+"\n";
+     keymap=new HashMap<Long, ArrayList<Varlet>>();
+     int chrom=1;
+     while(chrom<=Data.numChroms){
+         final long[] blockKeys=keys(chrom);
+         for(int i=0; i<blockKeys.length; i++){
+             final long blockKey=blockKeys[i];
+             keymap.put(blockKey, new ArrayList<Varlet>(WRITE_BUFFER));
+             ReadWrite.writeString(header, fname(blockKey, outname), false);
+         }
+         chrom++;
+     }
+ }
+
+ private HashMap<Long, ArrayList<Varlet>> keymap;
+
+ public final String outname;
+ public final String sitesfile;
+// private HashMap<Long, ArrayList<SiteScoreR>> sitemap=null;
+ private HashMap<Long, SiteR> sitemap=null;
+ private final RTextInputStream stream;
+ private final ConcurrentLegacyReadInputStream cris;
+
+
+ public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads.
+
+ public static int THREADS=7;
+ public static int WRITE_BUFFER=200000; //Bigger number uses more memory, for less frequent writes.
+
+ public static boolean CONDENSE=true;
+ public static boolean CONDENSE_SNPS=true;
+ public static boolean SPLIT_SUBS=false;
+
+ public static boolean TOSS_SOLO1=false;
+ public static boolean TOSS_SOLO2=false;
+
+ public static boolean MERGE_EQUAL_VARLETS=false;
+ public static boolean PAC_BIO_MODE=true;
+ public static int ALIGN_ROWS=2020;
+ public static int ALIGN_COLUMNS=3000;
+
+ public static long MAX_READS=-1;
+ public static final int MIN_END_DIST=4;
+ public static int BLOCKSIZE=1000000;
+
+}
diff --git a/current/var/GenerateVarlets3.java b/current/var/GenerateVarlets3.java
new file mode 100755
index 0000000..eca7def
--- /dev/null
+++ b/current/var/GenerateVarlets3.java
@@ -0,0 +1,865 @@
+package var;
+
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+
+import pacbio.SiteR;
+
+import stream.ConcurrentLegacyReadInputStream;
+import stream.RTextInputStream;
+import stream.Read;
+import stream.SiteScore;
+import stream.SiteScoreR;
+
+import dna.CoverageArray;
+import dna.Data;
+import dna.Parser;
+import dna.Timer;
+
+import fileIO.ReadWrite;
+import fileIO.TextFile;
+
+import align2.ListNum;
+import align2.MultiStateAligner9PacBio;
+import align2.MultiStateAligner9ts;
+import align2.Tools;
+import align2.TranslateColorspaceRead;
+
+/** Splits output files across blocks for low memory usage.
+ * Uses id-sorted site list for even lower memory usage. */
+public class GenerateVarlets3 {
+
+
+ public static void main(String[] args){
+ System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+ Data.GENOME_BUILD=-1;
+
+ String reads1=args[0];
+ String reads2=args[1].equalsIgnoreCase("null") ? null : args[1];
+ String outname=args[2];
+ String pcovFile=null;
+ String covFile=null;
+// assert(outname.contains("#"));
+
+ String sitesfile=null;
+
+ int minChrom=-1;
+ int maxChrom=-1;
+
+ int distFromDefined=-1;
+ ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
+
+ for(int i=3; i<args.length; i++){
+ final String arg=args[i];
+ final String[] split=arg.split("=");
+ String a=split[0].toLowerCase();
+ String b=split.length>1 ? split[1] : null;
+ if("null".equalsIgnoreCase(b)){b=null;}
+
+ if(Parser.isJavaFlag(arg)){
+ //jvm argument; do nothing
+ }else if(Parser.parseZip(arg, a, b)){
+ //do nothing
+ }else if(a.equals("condense")){
+ CONDENSE=Tools.parseBoolean(b);
+ }else if(a.equals("condensesnps")){
+ CONDENSE_SNPS=Tools.parseBoolean(b);
+ }else if(a.startsWith("splitsubs")){
+ SPLIT_SUBS=Tools.parseBoolean(b);
+ }else if(a.startsWith("illumina")){
+ PAC_BIO_MODE=!Tools.parseBoolean(b);
+ }else if(a.startsWith("pacbio")){
+ PAC_BIO_MODE=Tools.parseBoolean(b);
+ }else if(a.equals("tosssolo1")){
+ TOSS_SOLO1=Tools.parseBoolean(b);
+ }else if(a.equals("tosssolo2")){
+ TOSS_SOLO2=Tools.parseBoolean(b);
+ }else if(a.startsWith("minchrom")){
+ minChrom=Integer.parseInt(b);
+ }else if(a.startsWith("maxchrom")){
+ maxChrom=Integer.parseInt(b);
+ }else if(a.startsWith("build") || a.startsWith("genomebuild") || a.startsWith("genome")){
+ Data.setGenome(Integer.parseInt(b));
+ System.out.println("Set GENOME_BUILD to "+Data.GENOME_BUILD);
+ }else if(a.equals("threads") || a.equals("t")){
+ THREADS=(Integer.parseInt(b));
+ }else if(a.startsWith("buffer") || a.startsWith("writebuffer")){
+ WRITE_BUFFER=(Integer.parseInt(b));
+ }else if(a.startsWith("maxreads")){
+ MAX_READS=(Long.parseLong(b));
+ }else if(a.startsWith("minenddist")){
+ MIN_END_DIST=Integer.parseInt(b);
+ }else if(a.startsWith("alignrow")){
+ ALIGN_ROWS=Integer.parseInt(b);
+ }else if(a.startsWith("aligncol")){
+ ALIGN_COLUMNS=Integer.parseInt(b);
+ }else if(a.startsWith("pcovtipdist")){
+ PCOV_TIP_DIST=Integer.parseInt(b);
+ }else if(a.equals("blocksize")){
+ BLOCKSIZE=(Integer.parseInt(b));
+ }else if(a.equals("norefcap") || a.equals("distfromdefined") || a.equals("maxdistfromdefined")){
+ distFromDefined=(Integer.parseInt(b));
+ }else if(a.startsWith("sites") || a.startsWith("sitesfile")){
+ sitesfile=(b==null || b.equalsIgnoreCase("null") ? null : b);
+ }else if(a.startsWith("pcov") || a.startsWith("perfectcov")){
+ pcovFile=(b==null || b.equalsIgnoreCase("null") ? null : b);
+ }else if(a.equals("cov") || a.startsWith("coverage")){
+ covFile=(b==null || b.equalsIgnoreCase("null") ? null : b);
+ }else{
+ throw new RuntimeException("Unknown parameter "+args[i]);
+ }
+ }
+
+ if(Data.GENOME_BUILD<0){throw new RuntimeException("Please set genome number.");}
+ if(minChrom<0){minChrom=1;}
+ if(maxChrom<0){maxChrom=Data.numChroms;}
+
+ assert(minChrom<=maxChrom && minChrom>=0);
+
+ if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;}
+ GenerateVarlets3 gv=new GenerateVarlets3(reads1, reads2, outname, MAX_READS, sitesfile, pcovFile, distFromDefined);
+ gv.process();
+ }
+
+ /**
+ * Convenience constructor: wraps the two read files in an RTextInputStream and delegates
+ * to the stream-based constructor.
+ * @param fname1 First reads file
+ * @param fname2 Second reads file, or null; asserted to differ from fname1
+ * @param outname_ Output name pattern
+ * @param maxReads Passed to both the RTextInputStream and the delegate constructor
+ * @param sitesfile_ Sites file path
+ * @param pcovFile Perfect-coverage file pattern, or null
+ * @param distFromDefined_ Passed through to the delegate constructor
+ */
+ public GenerateVarlets3(String fname1, String fname2, String outname_, long maxReads, String sitesfile_, String pcovFile, int distFromDefined_){
+ this(new RTextInputStream(fname1, fname2, maxReads), outname_, maxReads, sitesfile_, pcovFile, distFromDefined_);
+ //The check runs after delegation because the this(...) call must be the first statement.
+ assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name.";
+ }
+
+ public GenerateVarlets3(RTextInputStream stream_, String outname_, long maxReads, String sitesfile_, String pcovFile, int distFromDefined_){
+ sitesfile=sitesfile_;
+ sitesTextFile=new TextFile(sitesfile, false, false);
+ stream=stream_;
+ outname=outname_;
+ assert(outname==null || outname.contains("#")) : "Output file name must contain the character '#' to be used for key number.";
+ makeKeyMap();
+
+ cris=(USE_CRIS ? new ConcurrentLegacyReadInputStream(stream, maxReads) : null);
+ if(CONDENSE_SNPS){assert(!SPLIT_SUBS);}
+
+ maxDistFromDefined=distFromDefined_;
+
+ if(maxDistFromDefined>0){
+ //Unfortunately, this serializes the chromosome loading.
+ nearestDefinedBase=new char[Data.numChroms+1][];
+ for(int i=1; i<=Data.numChroms; i++){
+ nearestDefinedBase[i]=Data.getChromosome(i).nearestDefinedBase();
+ }
+ }else{
+ nearestDefinedBase=null;
+ }
+
+ if(pcovFile!=null){
+ assert(pcovFile.contains("#") || Data.numChroms<2);
+ pcov=new CoverageArray[Data.numChroms+1];
+ for(int i=1; i<=Data.numChroms; i++){
+ String fname=pcovFile.replaceFirst("#", ""+i);
+ pcov[i]=ReadWrite.read(CoverageArray.class, fname, true);
+ }
+ }else{
+ pcov=null;
+ }
+
+ }
+
+ public void finish(){
+
+ ArrayList<Long> keys=new ArrayList<Long>();
+ keys.addAll(keymap.keySet());
+ Collections.sort(keys);
+ for(long k : keys){
+ ArrayList<Varlet> vars=keymap.remove(k);
+ if(!vars.isEmpty()){writeList(vars);}
+ }
+
+ if(cris!=null){ReadWrite.closeStream(cris);}
+ else{stream.close();}
+
+ }
+
+ /**
+  * Runs the whole pipeline: starts the read stream (when one is in use) and THREADS worker
+  * threads, waits for each worker to finish, sums their counters, closes the sites file,
+  * flushes remaining varlets via finish(), and prints a summary to stdout.
+  */
+ public void process(){
+
+     Timer t=new Timer();
+
+     if(sitesfile==null){
+         sitemap=null;
+     }
+
+     //cris is null when USE_CRIS is false; workers then read from 'stream' directly.
+     if(cris!=null){cris.start();}
+     ProcessThread[] threadHandles=new ProcessThread[THREADS];
+     for(int i=0; i<THREADS; i++){
+         threadHandles[i]=new ProcessThread();
+         threadHandles[i].start();
+     }
+
+     long varsMade=0;
+     long norefsMade=0;
+     long snpMade=0;
+     long delMade=0;
+     long subnMade=0;
+     long subdMade=0;
+     long subiMade=0;
+     long insMade=0;
+     long deltaLen=0;
+     long sitesProcessed=0;
+     long readsProcessed=0;
+
+     for(int i=0; i<threadHandles.length; i++){
+         ProcessThread pt=threadHandles[i];
+         //Poll with a timeout so a missed notification cannot block forever.
+         while(!pt.finished()){
+             synchronized(pt){
+                 try {
+                     pt.wait(1000);
+                 } catch (InterruptedException e) {
+                     //Keep waiting for the worker; the interruption is only reported.
+                     e.printStackTrace();
+                 }
+             }
+         }
+         varsMade+=pt.varsMade;
+         norefsMade+=pt.norefsMade;
+         snpMade+=pt.snpMade;
+         delMade+=pt.delMade;
+         subnMade+=pt.subnMade;
+         subdMade+=pt.subdMade;
+         subiMade+=pt.subiMade;
+         insMade+=pt.insMade;
+         deltaLen+=pt.deltaLen;
+         sitesProcessed+=pt.sitesProcessed;
+         readsProcessed+=pt.readsProcessed;
+     }
+
+     sitesTextFile.close();
+     assert(sitemap==null || sitemap.size()==0) : sitemap;
+
+     finish();
+
+     t.stop();
+
+     System.out.println("\nOutput variations count");
+     System.out.println("Total (minus no-ref): \t"+(varsMade-norefsMade));
+     System.out.println("Deletions: \t"+(delMade));
+     System.out.println("D-type subs: \t"+(subdMade));
+     System.out.println("Insertions: \t"+(insMade));
+     System.out.println("I-type subs: \t"+(subiMade));
+     System.out.println("Snps: \t"+(snpMade));
+     System.out.println("N-type subs: \t"+(subnMade));
+     System.out.println("No-refs: \t"+(norefsMade));
+     System.out.println("Delta Length: \t"+(deltaLen));
+     System.out.println("Lines Loaded: \t"+(linesLoaded));
+     System.out.println("Lines Retained: \t"+(linesRetained));
+     System.out.println("Reads Processed: \t"+(readsProcessed));
+     System.out.println("Sites Loaded: \t"+(sitesLoaded));
+     System.out.println("Sites Retained: \t"+(sitesRetained));
+     System.out.println("Sites Processed: \t"+(sitesProcessed));
+     System.out.println();
+     System.out.println("Max Site Table Size: \t"+maxSiteTableSize);
+     System.out.println();
+     System.out.println("Time:\t"+t);
+ }
+
+
+ /**
+ * Reads further lines from the sites file into sitemap until a retained entry with an id
+ * greater than maxID has been stored or the file is exhausted. Does nothing to the file when
+ * maxID is below maxSiteRead or the file is already closed.
+ * Holds the sitemap lock for the whole call.
+ * @param tf Open sites file; closed here once its last line has been consumed
+ * @param maxID Highest read id the caller needs sites for
+ * @return The updated maxSiteRead (Long.MAX_VALUE once the file has been fully read)
+ */
+ private final long readSites(TextFile tf, long maxID) {
+ long maxFound=-1;
+
+ final boolean retainSemiperfect=maxDistFromDefined!=0;
+ synchronized(sitemap){
+// System.out.print("Sync for "+maxID+".");
+ if(maxID>=maxSiteRead && tf.isOpen()){
+// System.out.print(" ... ");
+// System.out.println("Looking for ")
+ String s;
+ for(s=tf.nextLine(); s!=null; s=tf.nextLine()){
+// SiteScoreR[] array=CalcCoverageFromSites.toSites(s);
+// SiteR head=new SiteR(array[0]);
+// sitemap.put(head.idPairnum, head);
+// for(int i=1; i<array.length; i++){
+// head.next=new SiteR(array[i]);
+// assert(head.idPairnum==head.next.idPairnum) : "Not sorted correctly.";
+// head=head.next;
+// }
+ SiteR head=toImperfectSites(s, retainSemiperfect);
+ if(head!=null){
+ sitemap.put(head.idPairnum, head);
+ long id=head.numericID();
+ //The file is expected to be sorted by id, so ids must never decrease.
+ assert(id>=maxFound);
+ maxFound=id;
+ }
+ //Stop once an id beyond the requested one has been stored.
+ if(maxFound>maxID){break;}
+ }
+ maxSiteRead=Tools.max(maxFound, maxSiteRead);
+ if(s==null){
+ //End of file: close it and mark every id as already read.
+ tf.close();
+// System.out.println(" closing file at maxFound="+maxFound+", maxRead="+maxSiteRead+", lines="+linesLoaded);
+ maxSiteRead=Long.MAX_VALUE;
+ }
+ }
+// System.out.println(" maxFound="+maxFound+", maxRead="+maxSiteRead+", lines="+linesLoaded);
+ if(maxSiteRead<=maxID){assert(!tf.isOpen());}
+ maxSiteTableSize=Tools.max(maxSiteTableSize, sitemap.size());
+
+ }
+
+ return maxSiteRead;
+ }
+
+ /**
+  * Parses one tab-separated line of site records and links the retained ones into a list.
+  * Perfect sites are always dropped; semiperfect sites are dropped unless retainSemiperfect is set;
+  * other sites are dropped when pcov is loaded and every position from start-PCOV_TIP_DIST to
+  * stop+PCOV_TIP_DIST has coverage of at least MIN_PCOV_DEPTH_TO_TOSS.
+  * Updates the static line and site counters.
+  * @param s One line of the sites file
+  * @param retainSemiperfect Whether semiperfect sites are kept
+  * @return Head of the linked list of retained sites, or null when none were retained
+  */
+ public SiteR toImperfectSites(String s, boolean retainSemiperfect){
+     final String[] entries=s.split("\t");
+
+     sitesLoaded+=entries.length;
+     linesLoaded++;
+
+     SiteR head=null;
+     SiteR tail=null;
+     for(String entry : entries){
+         final SiteScoreR ssr=SiteScoreR.fromText(entry);
+
+         if(ssr.perfect || (ssr.semiperfect && !retainSemiperfect)){continue;}
+
+         //Note that this relies on the semiperfect tag being correct in order to generate no-refs from semiperfect reads.
+         if(!ssr.semiperfect && pcov!=null){
+             final CoverageArray ca=pcov[ssr.chrom];
+             boolean fullyCovered=true;
+             int pos=ssr.start-PCOV_TIP_DIST;
+             while(fullyCovered && pos<=ssr.stop+PCOV_TIP_DIST){
+                 fullyCovered=(ca.get(pos)>=MIN_PCOV_DEPTH_TO_TOSS);
+                 pos++;
+             }
+             if(fullyCovered){continue;}
+         }
+
+         final SiteR node=new SiteR(ssr);
+         if(head==null){
+             head=node;
+         }else{
+             assert(node.idPairnum==tail.idPairnum) : "Not sorted correctly.";
+             tail.next=node;
+         }
+         tail=node;
+     }
+
+     if(head!=null){
+         sitesRetained+=head.listLength();
+         linesRetained++;
+     }
+     return head;
+ }
+
+
+ public static SiteR toImperfectSites2(String s){
+ SiteScoreR[] array=SiteScoreR.fromTextArray(s);
+ if(array!=null && array.length>0){
+ SiteR[] a2=new SiteR[array.length];
+ for(int i=0; i<a2.length; i++){
+ a2[i]=new SiteR(array[i]);
+ if(i>0){a2[i-1].next=a2[i];}
+ }
+ return a2[0];
+ }
+ return null;
+ }
+
+ private void writeList(ArrayList<Varlet> list){
+ assert(list!=null && list.size()>0);
+ long key=key(list.get(0).chromosome, list.get(0).beginLoc);
+ String fname=fname(key, outname);
+ boolean allowSubprocess=false;
+ OutputStream os=ReadWrite.getOutputStream(fname, true, true, allowSubprocess);
+ PrintWriter pw=new PrintWriter(os);
+
+
+ for(Varlet v : list){
+ pw.println(v.toText());
+ }
+ ReadWrite.finishWriting(pw, os, fname, allowSubprocess);
+ }
+
+
+ private final class ProcessThread extends Thread {
+
+ /** Creates a worker; all per-thread state is set up by the field initializers. */
+ public ProcessThread(){
+ }
+
+ private void fixReadSites(ArrayList<Read> reads){
+ assert(sitemap!=null);
+ if(reads==null || reads.size()==0){return;}
+ long max=-2;
+ for(Read r : reads){
+ max=Tools.max(max, r.numericID);
+ }
+ synchronized(sitemap){
+ if(max>=maxSiteRead){
+ readSites(sitesTextFile, max);
+ }
+ for(Read r : reads){
+ {
+ long key=r.numericID;
+ if((r.pairnum()&1)==1){
+ key=-key;
+ assert(key<0);
+ }
+ SiteR head=sitemap.get(key);
+
+ ArrayList<SiteScore> old=r.sites;
+ r.sites=null;
+ if(head!=null){
+ r.sites=new ArrayList<SiteScore>();
+ sitemap.remove(key);
+ while(head!=null){
+ SiteScore ss=find(head, old); //Note - I can accelerate this by sorting SiteR and r.list by the same metric, e.g. position.
+ assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r+"\nlist:\n"+old;
+ r.sites.add(ss);
+ head=head.next;
+ }
+ }
+ }
+
+ Read r2=r.mate;
+ if(r2!=null){
+ long key=r2.numericID;
+ if((r2.pairnum()&1)==1){
+ key=-key;
+ assert(key<0);
+ }
+ SiteR head=sitemap.get(key);
+
+ ArrayList<SiteScore> old=r2.sites;
+ r2.sites=null;
+ if(head!=null){
+ r2.sites=new ArrayList<SiteScore>();
+ sitemap.remove(key);
+ while(head!=null){
+ SiteScore ss=find(head, old); //Note - I can accelerate this by sorting SiteR and r2.list by the same metric, e.g. position.
+ assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r2+"\nlist:\n"+old;
+ r2.sites.add(ss);
+ }
+ }
+ }
+
+ }
+ }
+ }
+
+ /**
+  * Worker loop: pulls batches of reads from the concurrent stream (or directly from 'stream'
+  * when no concurrent stream is in use) and processes them until input ends or terminate is set.
+  * Always marks the thread finished and wakes waiters, even if processing throws.
+  */
+ @Override
+ public void run(){
+
+     final boolean processReads=true;
+     if(!processReads){System.err.println("Warning: Skipping read processing.");}
+
+     try{
+         if(cris!=null){
+             ListNum<Read> ln=cris.nextList();
+             ArrayList<Read> reads=(ln!=null ? ln.list : null);
+
+             while(!terminate && reads!=null && reads.size()>0){
+                 if(processReads){processReads(reads);}
+                 cris.returnList(ln.id, ln.list.isEmpty());
+                 ln=cris.nextList();
+                 reads=(ln!=null ? ln.list : null);
+             }
+             //The loop also exits when nextList() returned null, so guard the final return.
+             if(ln!=null){
+                 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
+             }
+         }else{
+             ArrayList<Read> reads=stream.nextList();
+             while(!terminate && reads!=null && reads.size()>0){
+                 if(processReads){processReads(reads);}
+                 reads=stream.nextList();
+             }
+         }
+     }finally{
+         //Without this, a worker that died with an exception would leave process() polling forever.
+         finished=true;
+         synchronized(this){this.notifyAll();}
+     }
+ }
+
+ private void processReads(ArrayList<Read> reads){
+ if(sitemap==null){
+ for(Read r : reads){
+ Read r2=r.mate;
+ assert(r2==null || r.mate.mate==r);
+
+ if(r2==null){
+ processRead(r);
+ }else{
+ if(!TOSS_SOLO1 || r.paired()){processRead(r);}
+ if(!TOSS_SOLO2 || r2.paired()){processRead(r2);}
+ }
+ }
+ }else{
+ fixReadSites(reads);
+
+ for(Read r : reads){
+ Read r2=r.mate;
+ assert(r2==null || r.mate.mate==r);
+
+ if(r2==null){
+ multiprocessRead(r);
+ }else{
+ if(!TOSS_SOLO1 || r.paired()){multiprocessRead(r);}
+ if(!TOSS_SOLO2 || r2.paired()){multiprocessRead(r2);}
+ }
+ }
+ }
+ }
+
+ /**
+ * Runs processRead once for every site of the read: for each site the read is reset, loaded
+ * from that site, its match string cleared, and its paired/perfect/rescued flags copied from the site.
+ * Reads with no sites are skipped and not counted in readsProcessed.
+ * @param r Read whose sites are to be processed individually
+ */
+ private void multiprocessRead(Read r){
+
+// assert(head==null) : "\n"+r.pairnum()+", "+key+",\n"+r.list+",\n"+r.mate.list+"\n"+head.toTextRecursive(null)+"\n";
+
+ if(r.numSites()==0){return;}
+
+ readsProcessed++;
+ for(SiteScore ss : r.sites){
+ r.clearSite();
+ r.setFromSite(ss);
+ //Clearing the match string makes processRead realign the read for this site.
+ r.match=null;
+
+ r.setPaired(ss.pairedScore>0);
+ r.setPerfect(ss.perfect);
+ r.setRescued(ss.rescued);
+
+ processRead(r);
+ }
+ }
+
+ /**
+ * @param ssr
+ * @param list
+ * @return
+ */
+ private SiteScore find(SiteScoreR ssr, ArrayList<SiteScore> list) {
+ for(SiteScore ss : list){
+ if(ssr.equals(ss)){return ss;}
+ }
+ return null;
+ }
+
+ private SiteScore find(SiteR sr, ArrayList<SiteScore> list) {
+ for(SiteScore ss : list){
+ if(sr.equals(ss)){return ss;}
+ }
+ return null;
+ }
+
+
+ /**
+ * Turns one mapped, imperfect read into varlets and buffers the ones that pass filtering.
+ * Returns early for unmapped, invalid, or perfectly matching reads and for reads that cannot be aligned.
+ * A read without a match string is realigned first. Each varlet at least MIN_END_DIST from the read end
+ * is then checked against the no-ref distance limit and the perfect-coverage filter; retained varlets
+ * update this thread's counters and are passed to addVar.
+ * @param r Read to process, already set to the site being examined
+ */
+ private void processRead(Read r){
+ sitesProcessed++;
+
+ assert(r.numericID<Integer.MAX_VALUE) : r.toText(false);
+
+ //Debug tracing; permanently disabled by the leading 'false'.
+ boolean flag=false;
+ if(false && (/*r.numericID==30719442 || r.numericID==107055007 || */ r.numericID==42829556) /*&& r.length()<=35*/){
+ System.err.println("Processing read:");
+ System.err.println("\n"+r.toText(false));
+ System.err.println("\n"+r.strand());
+ System.err.println("\n");
+ System.err.println(new String(r.bases));
+ System.err.println(r.match==null ? "null" : new String(r.match));
+ System.err.println("\n");
+ tcr.verbose=true;
+ flag=true;
+ System.err.println("Mapped Length: "+(r.stop-r.start+1));
+ }
+
+
+// if(r.chrom<1 && r.list!=null && r.list.size()>0){
+// SiteScore ss=r.list.get(0); //Should not be necessary
+// r.start=ss.start;
+// r.stop=ss.stop;
+// r.chrom=ss.chrom;
+// r.setStrand(ss.strand);
+// }
+ assert((r.chrom>=1)==r.mapped()) : r.toText(false);
+ if(!r.mapped()){//Unmapped.
+ assert(r.sites==null || r.sites.isEmpty()) : r.toText(false);
+ return;
+ }
+ if(r.invalid()){return;} //Probably trimmed too short to be used.
+
+ //Perfect matches contain no variations, so there is nothing to emit.
+ if(r.match!=null){
+ if(r.perfect()){//Hopefully this will be set correctly...
+ assert(TranslateColorspaceRead.perfectMatch(r.match));
+ return;
+ }else if(TranslateColorspaceRead.perfectMatch(r.match)){
+ return;
+ }
+ }
+
+ assert(r.numericID<Integer.MAX_VALUE) : r.toText(false);
+
+ if(flag){
+ System.err.println("r.match = "+(r.match==null ? null : new String(r.match)));
+ System.err.println("Mapped Length: "+(r.stop-r.start+1));
+ }
+// if(r.match!=null){
+// for(int i=0; i<r.match.length; i++){
+// if(r.match[i]=='I'){
+// r.match=null;
+// if(flag){System.err.println("nullified match string");}
+// break;
+// }
+// }
+// }
+
+// r.match=null; //TODO - why are some match strings backwards?
+ if(r.match==null){
+ if(flag){
+ System.err.println("realigning match string");
+ System.err.println("Mapped Length: "+(r.stop-r.start+1));
+ }
+ tcr.realign_new(r, 20, true, 0, false); //Also generates the match string
+ if(TranslateColorspaceRead.perfectMatch(r.match)){return;}
+ if(flag){
+ System.err.println("new match string:\n"+(r.match==null ? null : new String(r.match)));
+ System.err.println("Mapped Length: "+(r.stop-r.start+1));
+ }
+ }
+ assert(r.numericID<Integer.MAX_VALUE) : r.toText(false);
+ r.errors=r.estimateErrors();
+ assert(r.numericID<Integer.MAX_VALUE) : r.toText(false);
+
+ //Realignment can fail: either no match string at all, or one starting with 'X'.
+ if(r.match==null){
+ System.err.println("Could not align read "+r.numericID);
+ return;
+ }else if(r.match[0]=='X'){
+ System.err.println("Could not align read "+r.numericID+": "+new String(r.match));
+ return;
+ }
+
+ assert(r.numericID<Integer.MAX_VALUE) : r.toText(false);
+
+// assert(CONDENSE);
+// assert(false) : r+"\n"+CONDENSE+"\n"+CONDENSE_SNPS+"\n"+SPLIT_SUBS;
+ ArrayList<Varlet> vars=tcr.toVars(r, CONDENSE, CONDENSE_SNPS, SPLIT_SUBS);
+
+ if(vars==null){return;}
+
+// if(r.numericID==36858949){
+// System.err.println(r.toText(false));
+// System.err.println(r.copies);
+// System.err.println(r.mate.toText(false));
+// System.err.println(r.mate.copies);
+// System.err.println();
+//
+// for(Varlet v : vars){
+// System.err.println(v.toText());
+// System.err.println(v.numReads);
+// }
+// assert(false);
+// }
+
+ //Both tables are optional; they are null when the corresponding filter is not loaded.
+ char[] nearest=(nearestDefinedBase == null ? null : nearestDefinedBase[r.chrom]);
+ CoverageArray ca=(pcov==null ? null : pcov[r.chrom]);
+
+ for(Varlet v : vars){
+ if(v.endDist>=MIN_END_DIST){
+ assert(v.numUniqueReads==1);
+ assert(v.numSemiUniqueReads==1);
+ assert(v.numPlusReads1+v.numMinusReads1+v.numPlusReads2+v.numMinusReads2==1);
+ assert(v.numReads>=1);
+ // assert(!TranslateColorspaceReadPacBio.COUNT_DUPLICATES_WHEN_MAKING_VARLETS || v.numReads==1);
+ assert(v.numReads==r.copies);
+ assert(v.readLen==r.length());
+
+ boolean retain=true;
+ //No-ref varlets: with a limit of 0 the distance is fixed at 1, so all of them are dropped;
+ //otherwise the smaller nearest-defined-base value at the two ends is compared to the limit.
+ if(maxDistFromDefined>=0 && v.varType==Variation.NOREF){
+ char dist=(maxDistFromDefined==0 ? 1 : Tools.min(nearest[v.beginLoc], nearest[v.endLoc]));
+ if(dist>maxDistFromDefined){retain=false;}
+ }
+
+ //Other varlets are dropped when every position within PCOV_TIP_DIST of the varlet
+ //has coverage of at least MIN_PCOV_DEPTH_TO_TOSS in the perfect-coverage array.
+ if(retain && v.varType!=Variation.NOREF && ca!=null){
+ boolean toss=true;
+ assert(PCOV_TIP_DIST>0);
+ for(int j=v.beginLoc-PCOV_TIP_DIST; toss && j<=v.endLoc+PCOV_TIP_DIST; j++){
+ toss=ca.get(j)>=MIN_PCOV_DEPTH_TO_TOSS;
+ }
+ if(toss){retain=false;}
+ }
+
+ if(retain){
+ varsMade++;
+ if(v.varType==Variation.NOREF){norefsMade++;}
+ else if(v.varType==Variation.SNP){snpMade++;}
+ else if(v.varType==Variation.DEL){delMade++;}
+ else if(v.varType==Variation.INS){insMade++;}
+ else if(v.varType==Variation.DELINS){
+ //Substitutions are classified by comparing reference length to variant length.
+ int a=v.lengthRef();
+ int b=v.lengthVar();
+ if(a==b){subnMade++;}
+ else if(a>b){subdMade++;}
+ else{subiMade++;}
+ }
+ deltaLen+=v.lengthDif();
+ addVar(v);
+ }
+
+ }
+ }
+// System.out.println(varsMade+", "+norefsMade);
+ }
+
+ /** TODO: Synchronize once per read, not once per varlet */
+ private void addVar(Varlet v){
+ long key=key(v.chromosome, v.beginLoc);
+ ArrayList<Varlet> list=keymap.get(key);
+ assert(list!=null) : "\nCan't find "+key+" in "+keymap.keySet()+"\n";
+ synchronized(list){
+ list.add(v);
+ if(list.size()>=WRITE_BUFFER){
+
+ if(MERGE_EQUAL_VARLETS){
+ mergeEqualVarlets(list);
+ }else{
+ Collections.sort(list);
+ }
+
+ writeList(list);
+ list.clear();
+ }
+ }
+ }
+
+ /**
+  * Sorts the list and collapses each run of equal varlets into one merged varlet, in place.
+  * Every slot is first nulled; the merged result of a run is stored in the run's last slot,
+  * and the remaining nulls are removed by Tools.condenseStrict.
+  * @param vars List to sort and merge; modified in place
+  */
+ private void mergeEqualVarlets(ArrayList<Varlet> vars){
+
+     Collections.sort(vars);
+     ArrayList<Varlet> list=new ArrayList<Varlet>(8);
+     for(int i=0; i<vars.size(); i++){
+         Varlet a=vars.get(i);
+         vars.set(i, null);
+         Varlet b=(list.isEmpty() ? null : list.get(0));
+         if(b==null || a.equals(b)){
+             list.add(a);
+         }else{//purge
+             Varlet c=StackVariations.mergeEqualVarlets(list);
+             vars.set(i-1, c); //Last slot of the run that just ended.
+             list.clear();
+             list.add(a);
+         }
+     }
+     if(!list.isEmpty()){
+         Varlet c=StackVariations.mergeEqualVarlets(list);
+         //The final run ends at the last slot of vars. Indexing by list.size()-1 (the run length)
+         //could overwrite an earlier merged varlet and break the sorted order.
+         vars.set(vars.size()-1, c);
+     }
+     Tools.condenseStrict(vars);
+ }
+
+ /** Returns whether run() has completed. NOTE(review): the flag is a plain (non-volatile) field polled from another thread - confirm visibility is adequate. */
+ protected boolean finished(){return finished;}
+ /** Asks run() to stop before fetching further batches; the flag is checked in its loop condition. */
+ protected void terminate(){terminate=true;}
+
+ private final TranslateColorspaceRead tcr=new TranslateColorspaceRead(PAC_BIO_MODE ?
+ new MultiStateAligner9PacBio(ALIGN_ROWS, ALIGN_COLUMNS) : new MultiStateAligner9ts(ALIGN_ROWS, ALIGN_COLUMNS));
+ private boolean finished=false;
+ private boolean terminate=false;
+ private long varsMade=0;
+ private long norefsMade=0;
+ private long snpMade=0;
+ private long delMade=0;
+ private long subnMade=0;
+ private long subdMade=0;
+ private long subiMade=0;
+ private long insMade=0;
+ private long deltaLen=0;
+ private long sitesProcessed=0;
+ private long readsProcessed=0;
+
+
+ }
+
+
+ /**
+  * Computes the block key for a position: the chromosome number shifted into the upper
+  * 32 bits plus the block index (start clamped to at least 0, divided by BLOCKSIZE).
+  * @param chrom Chromosome number
+  * @param start Position on the chromosome; negative values are treated as 0
+  * @return The block key
+  */
+ protected static final long key(int chrom, int start){
+     final long chromBits=((long)chrom)<<32;
+     final long blockIndex=(Tools.max(start, 0))/BLOCKSIZE;
+     return chromBits+blockIndex;
+ }
+
+ /**
+  * Lists the block keys of one chromosome, from block 0 up to the block containing
+  * position chromLength+1000.
+  * @param chrom Chromosome number, used as index into Data.chromLengths
+  * @return One key per block, in ascending block order
+  */
+ protected static final long[] keys(final int chrom){
+     final int lastBlock=(Data.chromLengths[chrom]+1000)/BLOCKSIZE;
+     final long[] result=new long[lastBlock+1];
+     int block=0;
+     while(block<=lastBlock){
+         result[block]=key(chrom, block*BLOCKSIZE);
+         block++;
+     }
+     return result;
+ }
+
+ /**
+  * Builds the output file name for one block.
+  * @param key Block key to embed in the name
+  * @param outname Name pattern containing '#'; when null, "GV2TempFile_#.txt" is used
+  * @return The pattern with every '#' replaced by "b" + Data.GENOME_BUILD + "_" + key
+  */
+ protected static final String fname(long key, String outname){
+     final String pattern=(outname==null ? "GV2TempFile_#.txt" : outname);
+     assert(pattern.contains("#")) : pattern;
+     final boolean compressedName=pattern.endsWith(".gz") || pattern.endsWith(".zip") || pattern.endsWith(".bz2");
+     assert(!compressedName) : pattern;
+     final String blockTag="b"+Data.GENOME_BUILD+"_"+key;
+     return pattern.replace("#", blockTag);
+ }
+
+ /**
+  * Creates an empty varlet buffer for every block key of every chromosome (1..Data.numChroms)
+  * and writes the Varlet text header to each block's output file.
+  */
+ private final void makeKeyMap(){
+     final String headerLine=Varlet.textHeader()+"\n";
+     keymap=new HashMap<Long, ArrayList<Varlet>>();
+     for(int c=1; c<=Data.numChroms; c++){
+         final long[] blockKeys=keys(c);
+         for(int i=0; i<blockKeys.length; i++){
+             final long blockKey=blockKeys[i];
+             keymap.put(blockKey, new ArrayList<Varlet>(WRITE_BUFFER));
+             //NOTE(review): the trailing 'false' is presumably the append flag - confirm against ReadWrite.writeString.
+             ReadWrite.writeString(headerLine, fname(blockKey, outname), false);
+         }
+     }
+ }
+
+ private HashMap<Long, ArrayList<Varlet>> keymap;
+ private final char[][] nearestDefinedBase;
+ private final int maxDistFromDefined;
+
+ private final CoverageArray[] pcov;
+
+ public final String outname;
+ public final String sitesfile;
+ private TextFile sitesTextFile;
+ private static long maxSiteRead=-1;
+ private static long maxSiteTableSize=-1;
+
+ private static long sitesLoaded=0;
+ private static long sitesRetained=0;
+ private static long linesLoaded=0;
+ private static long linesRetained=0;
+
+ private HashMap<Long, SiteR> sitemap=new HashMap<Long, SiteR>(4096);
+ private final RTextInputStream stream;
+ private final ConcurrentLegacyReadInputStream cris;
+
+ public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads.
+
+ public static int THREADS=Data.LOGICAL_PROCESSORS;
+ public static int WRITE_BUFFER=16000; //Bigger number uses more memory, for less frequent writes.
+
+ public static boolean CONDENSE=true;
+ public static boolean CONDENSE_SNPS=true;
+ public static boolean SPLIT_SUBS=false;
+
+ public static boolean TOSS_SOLO1=false;
+ public static boolean TOSS_SOLO2=false;
+
+ public static boolean MERGE_EQUAL_VARLETS=false;
+ public static boolean PAC_BIO_MODE=true;
+ public static int ALIGN_ROWS=2020;
+ public static int ALIGN_COLUMNS=3000;
+
+ public static long MAX_READS=-1;
+ public static int MIN_END_DIST=4;
+ public static int BLOCKSIZE=1000000;
+ /** Imperfect reads fully covered by perfect reads to this depth or more will be tossed. */
+ public static int MIN_PCOV_DEPTH_TO_TOSS=2;
+ /** Extend perfect coverage depth requirement by this much of the tips of variations and reads before tossing them.
+ * A higher number means more varlets will be retained. */
+ public static int PCOV_TIP_DIST=8;
+
+}
diff --git a/current/var/StackVariations.java b/current/var/StackVariations.java
new file mode 100755
index 0000000..6853113
--- /dev/null
+++ b/current/var/StackVariations.java
@@ -0,0 +1,738 @@
+package var;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+
+
+import dna.Data;
+import dna.Gene;
+import dna.Timer;
+import fileIO.TextStreamWriter;
+
+import align2.Tools;
+
+public class StackVariations {
+
+ /**
+  * Command-line entry point.
+  * Positional arguments: args[0]=input file pattern, args[1]=output file pattern ('#' is replaced
+  * by the chromosome number). Later arguments are flags: filter[=t|f|1|0], strict[=t|f|1|0],
+  * genome=/build=, minchrom=, maxchrom=, threads=/t=. Unrecognised flags are ignored.
+  * @param args Command-line arguments
+  * @throws RuntimeException If fewer than two arguments are given or a filter/strict value is not recognised
+  */
+ public static void main(String[] args){
+     System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+     //Fail with a readable message rather than an ArrayIndexOutOfBoundsException.
+     if(args==null || args.length<2){
+         throw new RuntimeException("Usage: StackVariations <inPattern> <outPattern> [key=value ...]");
+     }
+
+     Timer t=new Timer();
+
+     String inPattern=args[0];
+     String outPattern=args[1];
+
+     assert(!inPattern.equalsIgnoreCase(outPattern));
+
+     int minChrom=-1;
+     int maxChrom=-1;
+
+     boolean filter=false;
+
+     //Start after the two positional patterns so a file name is never parsed as a flag.
+     for(int i=2; i<args.length; i++){
+         final String arg=args[i];
+         final String s=arg.toLowerCase();
+         String[] split=s.split("=");
+         String a=split[0];
+         String b=(split.length>1 ? split[1] : null);
+
+         if(a.startsWith("filter")){
+             //A bare flag enables filtering; an explicit value is honoured, so filter=false works.
+             if(b==null){filter=true;}
+             else if(b.equals("1") || b.startsWith("t")){filter=true;}
+             else if(b.equals("0") || b.startsWith("f")){filter=false;}
+             else{throw new RuntimeException("Unknown parameter "+arg);}
+         }else if(a.equalsIgnoreCase("strict")){
+             if(b==null){STRICT=true;}
+             else if(b.equals("1") || b.startsWith("t")){STRICT=true;}
+             else if(b.equals("0") || b.startsWith("f")){STRICT=false;}
+             else{throw new RuntimeException("Unknown parameter "+arg);}
+         }else if(a.equals("genome") || a.equals("build")){
+             Data.setGenome(Integer.parseInt(b));
+             //Default the chromosome range to the whole genome unless already set.
+             if(minChrom==-1){minChrom=1;}
+             if(maxChrom==-1){maxChrom=Data.numChroms;}
+         }else if(a.equals("minchrom")){
+             minChrom=Integer.parseInt(b);
+         }else if(a.equals("maxchrom")){
+             maxChrom=Integer.parseInt(b);
+         }else if(a.equals("threads") || a.equals("t")){
+             THREADS=Integer.parseInt(b);
+         }else{
+//             System.err.println("************* "+s);
+         }
+     }
+
+     assert(minChrom>=0 && maxChrom>=minChrom) : "Please set minchrom and maxchrom.";
+
+//     for(byte i=minChrom; i<=maxChrom; i++){
+//         String fname1=inPattern.replace("#", i+"");
+//         String fname2=outPattern.replace("#", i+"");
+//         assert(new File(fname1).exists());
+//         assert(!new File(fname2).exists());
+//         processFile(fname1, fname2, filter);
+//     }
+
+     runThreaded(inPattern, outPattern, minChrom, maxChrom, filter);
+
+     t.stop();
+     System.out.println("Input Vars: \t"+(totalIn_global-totalInNR_global));
+     System.out.println("Input No-ref: \t"+totalInNR_global);
+     System.out.println("Input Delta Length:\t"+deltaLenIn_global);
+     System.out.println();
+     System.out.println("Kept Vars: \t"+(totalKept_global-totalKeptNR_global));
+     System.out.println("Kept No-ref: \t"+totalKeptNR_global);
+     System.out.println("Kept Snp: \t"+snpKept_global);
+     System.out.println("Kept Del: \t"+delKept_global+"\t\tLength: \t"+delLenKept_global);
+     System.out.println("Kept Ins: \t"+insKept_global+"\t\tLength: \t"+insLenKept_global);
+     System.out.println("Kept Sub: \t"+subKept_global+"\t\tLength: \t"+subLenKept_global);
+     System.out.println("Kept Delta Length: \t"+deltaLenKept_global);
+     System.out.println("Kept Avg Score: \t"+(scoreKept_global/(Tools.max(1, totalKept_global))));
+     System.out.println();
+     System.out.println("Dropped Vars: \t"+(totalDropped_global-totalDroppedNR_global));
+     System.out.println("Dropped No-ref: \t"+totalDroppedNR_global);
+     System.out.println("Dropped Avg Score: \t"+(scoreDropped_global/Tools.max(1, totalDropped_global)));
+     System.out.println();
+     System.out.println("Time: \t"+t);
+ }
+
+ /**
+ * Starts one SVThread per chromosome in [minChrom, maxChrom], waits until addThread(0) reports
+ * no remaining threads, then adds every thread's counters to the class-level *_global totals.
+ * @param inPattern Input file pattern; '#' is replaced by the chromosome number
+ * @param outPattern Output file pattern; '#' is replaced by the chromosome number
+ * @param minChrom First chromosome to process
+ * @param maxChrom Last chromosome to process (inclusive)
+ * @param filter Passed to each SVThread
+ */
+ public static final void runThreaded(String inPattern, String outPattern, int minChrom, int maxChrom, boolean filter){
+ ArrayList<SVThread> svts=new ArrayList<SVThread>();
+ for(int i=minChrom; i<=maxChrom; i++){
+ String fname1=inPattern.replace("#", i+"");
+ String fname2=outPattern.replace("#", i+"");
+ assert(!fname1.equalsIgnoreCase(fname2));
+ assert(new File(fname1).exists());
+// assert(!new File(fname2).exists());
+ //NOTE(review): addThread(1) presumably registers a running thread and may block while the thread limit is reached - confirm against addThread.
+ addThread(1);
+ SVThread svt=new SVThread(fname1, fname2, filter);
+ svts.add(svt);
+ new Thread(svt).start();
+ }
+ //Spin until addThread(0) no longer reports a positive count.
+ while(addThread(0)>0){}
+ for(SVThread svt : svts){
+
+ snpKept_global+=svt.snpKept;
+ delKept_global+=svt.delKept;
+ insKept_global+=svt.insKept;
+ subKept_global+=svt.subKept;
+ delLenKept_global+=svt.delLenKept;
+ insLenKept_global+=svt.insLenKept;
+ subLenKept_global+=svt.subLenKept;
+ deltaLenKept_global+=svt.deltaLenKept;
+
+ deltaLenIn_global+=svt.deltaLenIn;
+ totalIn_global+=svt.totalIn;
+ totalInNR_global+=svt.totalInNR;
+ totalKept_global+=svt.totalKept;
+ totalDropped_global+=svt.totalDropped;
+ totalKeptNR_global+=svt.totalKeptNR;
+ totalDroppedNR_global+=svt.totalDroppedNR;
+ scoreKept_global+=svt.scoreKept;
+ scoreDropped_global+=svt.scoreDropped;
+ }
+ }
+
+
+ /**
+ * Quality filter for SNP varlets. In STRICT mode the varlet must clear a set of thresholds
+ * selected by v.minStrandReads() (>=2, >=1, or 0); the fewer reads on the weaker strand, the
+ * stricter the thresholds. The first failing threshold returns false.
+ * When STRICT is false the method only asserts "disabled"; with assertions off it returns true.
+ * @param v Varlet to test
+ * @return true if the varlet passes every applicable threshold
+ */
+ public static boolean passesFilterSNP(Varlet v){
+
+
+ //Best so far:
+
+ if(STRICT){
+
+ //Applied to all three tiers.
+ if(v.endDist<3){return false;}
+ if(v.tailDist<10){return false;}
+
+ //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required.
+ if(v.minStrandReads()>=2){
+
+ if(v.errors>2){return false;}
+ if(v.expectedErrors>1.5f){return false;}
+// if(v.expectedErrors-v.errors>3f){return false;}
+ if(v.maxReadQuality()<18){return false;}
+ if(v.avgReadQuality()<13){return false;}
+ if(v.maxVarQuality()<26){return false;}
+ if(v.avgVarQuality()<18){return false;}
+ if(v.numReads<4){return false;}
+ if(v.numSemiUniqueReads<4){return false;}
+ if(v.numUniqueReads<2){return false;}
+ if(v.paired<3){return false;}
+
+ }else if(v.minStrandReads()>=1){
+
+ if(v.errors>2){return false;}
+ if(v.expectedErrors>1.2f){return false;}
+// if(v.expectedErrors-v.errors>3f){return false;}
+ if(v.maxReadQuality()<19){return false;}
+ if(v.avgReadQuality()<14){return false;}
+ if(v.maxVarQuality()<28){return false;}
+ if(v.avgVarQuality()<19){return false;}
+ if(v.numReads<3){return false;}
+ if(v.numSemiUniqueReads<3){return false;}
+ if(v.numUniqueReads<2){return false;}
+ if(v.paired<3){return false;}
+
+ }else{
+ //Strictest tier: also tightens the end/tail distance limits and adds a score minimum.
+ if(v.endDist<8){return false;}
+ if(v.tailDist<14){return false;}
+
+ if(v.errors>0){return false;}
+ if(v.expectedErrors>0.5f){return false;}
+// if(v.expectedErrors-v.errors>2f){return false;}
+ if(v.maxReadQuality()<21){return false;}
+ if(v.avgReadQuality()<17){return false;}
+ if(v.maxVarQuality()<30){return false;}
+ if(v.avgVarQuality()<21){return false;}
+ if(v.numReads<6){return false;}
+ if(v.numSemiUniqueReads<5){return false;}
+ if(v.numUniqueReads<3){return false;}
+ if(v.paired<5){return false;}
+ if(v.score()<8100){return false;}
+ }
+
+// else{
+// if(v.endDist<8){return false;}
+// if(v.tailDist<14){return false;}
+//
+// if(v.errors>0){return false;}
+// if(v.expectedErrors>0.5f){return false;}
+//// if(v.expectedErrors-v.errors>2f){return false;}
+// if(v.maxReadQuality()<21){return false;}
+// if(v.avgReadQuality()<17){return false;}
+// if(v.maxVarQuality()<30){return false;}
+// if(v.avgVarQuality()<21){return false;}
+// if(v.numReads<5){return false;}
+// if(v.numSemiUniqueReads<4){return false;}
+// if(v.numUniqueReads<2){return false;}
+// if(v.paired<4){return false;}
+// if(v.score()<8100){return false;}
+// }
+
+ }else{
+
+ //Non-strict filtering is not implemented; this only trips when assertions are enabled.
+ assert(false) : "disabled";
+
+ }
+
+
+
+ return true;
+ }
+
+ /**
+  * Applies fixed quality, error and read-support thresholds to a varlet.
+  * Three threshold tiers are selected by v.minStrandReads(): at least 2, exactly 1-level
+  * support (>=1), and everything else; the last tier is the strictest and also checks score().
+  * @param v Varlet to test.
+  * @return true if no threshold rejects the varlet.
+  */
+ public static boolean passesFilterOther(Varlet v){
+     // Positional floor applied to every varlet, whatever its strand support.
+     if(v.endDist<3 || v.tailDist<10){return false;}
+
+     //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required.
+     final boolean rejected;
+     if(v.minStrandReads()>=2){
+         // (disabled check: v.expectedErrors-v.errors>3f)
+         rejected=v.errors>2
+             || v.expectedErrors>1.5f
+             || v.maxReadQuality()<16
+             || v.avgReadQuality()<12
+             || v.maxVarQuality()<26
+             || v.avgVarQuality()<16
+             || v.numReads<4
+             || v.numSemiUniqueReads<4
+             || v.numUniqueReads<2
+             || v.paired<3;
+     }else if(v.minStrandReads()>=1){
+         // (disabled check: v.expectedErrors-v.errors>3f)
+         rejected=v.errors>2
+             || v.expectedErrors>1.2f
+             || v.maxReadQuality()<17
+             || v.avgReadQuality()<13
+             || v.maxVarQuality()<28
+             || v.avgVarQuality()<17
+             || v.numReads<4
+             || v.numSemiUniqueReads<4
+             || v.numUniqueReads<2
+             || v.paired<3;
+     }else{
+         // (disabled check: v.expectedErrors-v.errors>2f)
+         rejected=v.endDist<8
+             || v.tailDist<14
+             || v.errors>0
+             || v.expectedErrors>0.5f
+             || v.maxReadQuality()<20
+             || v.avgReadQuality()<16
+             || v.maxVarQuality()<30
+             || v.avgVarQuality()<20
+             || v.numReads<6
+             || v.numSemiUniqueReads<5
+             || v.numUniqueReads<3
+             || v.paired<5
+             || v.score()<6500;
+     }
+     return !rejected;
+ }
+
+
+ /**
+  * Sorts the varlets, merges each run of equal varlets into one (see mergeEqualVarlets),
+  * and keeps the merged results that satisfy the minimum-read rules.
+  * The same keep rule is now applied to every group, including the last one; previously
+  * the final group was kept based only on how many list entries it had.
+  * @param vars Varlets to merge; sorted in place as a side effect.
+  * @return Merged varlets in sorted order, or null if vars is null or empty.
+  */
+ public static ArrayList<Varlet> mergeAll(ArrayList<Varlet> vars){
+     if(vars==null || vars.size()==0){return null;}
+     ArrayList<Varlet> out=new ArrayList<Varlet>(8+vars.size()/16);
+     Collections.sort(vars);
+
+     // temp holds the current run of varlets that are equal to its first element.
+     ArrayList<Varlet> temp=new ArrayList<Varlet>(64);
+     for(int i=0; i<vars.size(); i++){
+         Varlet v=vars.get(i);
+         if(!temp.isEmpty() && !v.equals(temp.get(0))){
+             // v starts a new run: flush the previous one.
+             Varlet result=mergeEqualVarlets(temp);
+             if(keepAfterMerge(result)){out.add(result);}
+             temp.clear();
+         }
+         temp.add(v);
+     }
+
+     // Flush the final run with the same rule as all the others.
+     if(!temp.isEmpty()){
+         Varlet result=mergeEqualVarlets(temp);
+         if(keepAfterMerge(result)){out.add(result);}
+         temp.clear();
+     }
+
+     {//For testing
+         Collections.sort(out); //Should already be sorted...
+         for(int i=1; i<out.size(); i++){
+             assert(!out.get(i).equals(out.get(i-1)));
+         }
+     }
+
+     if(verbose){System.err.println("out.size="+out.size());}
+
+     return out;
+ }
+
+ /**
+  * Keep rule for a merged varlet: more than MIN_READS_TO_KEEP reads always passes, fewer never
+  * passes, and exactly MIN_READS_TO_KEEP passes only if the *_AT_MIN_READS limits are met.
+  */
+ private static boolean keepAfterMerge(Varlet merged){
+     if(merged.numReads>MIN_READS_TO_KEEP){return true;}
+     if(merged.numReads<MIN_READS_TO_KEEP){return false;}
+     return merged.maxVarQuality()>=MIN_QUALITY_AT_MIN_READS
+         && merged.errors<=MAX_ERRORS_AT_MIN_READS
+         && merged.expectedErrors<=MAX_EXPECTED_ERRORS_AT_MIN_READS
+         && (merged.paired>0 || !REQUIRE_PAIRED_AT_MIN_READS);
+ }
+
+
+ /**
+  * Collapses a list of varlets that the caller treats as the same variation into one summary Varlet.
+  * A single-element list is returned as-is (the same object, not a copy).
+  * Read counts, per-strand/per-mate counts and paired counts are summed; the average qualities are
+  * weighted by each varlet's numReads; errors and expectedErrors take the minimum; mapScore,
+  * head/tail/end distances and the read span take the extremes.
+  * Chromosome, locations, match range, type, ref, call, readID and pairNum are copied from the
+  * "best" input varlet chosen inside the loop.
+  * @param vars Non-empty list of varlets; an empty list fails when bestVar is dereferenced.
+  * @return The only element when vars.size()==1, otherwise a newly constructed Varlet.
+  */
+ public static Varlet mergeEqualVarlets(ArrayList<Varlet> vars){
+
+// System.err.println("Merging "+vars.size()+" vars.");
+
+ if(vars.size()==1){return vars.get(0);}
+
+ // Keyed by read coordinate (readStart for plus, readStop for minus); only the number of
+ // distinct keys is used afterwards, via numUniqueReads and the size() assertions.
+ HashMap<Integer, ArrayList<Varlet>> plus=new HashMap<Integer, ArrayList<Varlet>>(Tools.min(8, vars.size()));
+ HashMap<Integer, ArrayList<Varlet>> minus=new HashMap<Integer, ArrayList<Varlet>>(Tools.min(8, vars.size()));
+
+ int numReads=0;
+ int numSemiUniqueReads=0;
+ int numUniqueReads=0;
+ int pairedReads=0;
+ int plusReads1=0;
+ int minusReads1=0;
+ int plusReads2=0;
+ int minusReads2=0;
+
+ int totalQuality=0;
+ int totalVarQuality=0;
+
+ int maxReadQuality=0;
+ int maxVarQuality=0;
+
+ int maxMapScore=0;
+ int bestLen=0;
+ int minReadStart=Integer.MAX_VALUE;
+ int maxReadStop=-999999;
+
+ int maxHeadDist=-1;
+ int maxTailDist=-1;
+ int maxEndDist=-1;
+
+ Varlet bestVar=null;
+
+ int minErrors=999;
+ float minExpectedErrors=999f;
+
+ for(Varlet v : vars){
+
+ numReads+=v.numReads;
+ numSemiUniqueReads+=v.numSemiUniqueReads;
+ plusReads1+=v.numPlusReads1;
+ minusReads1+=v.numMinusReads1;
+ plusReads2+=v.numPlusReads2;
+ minusReads2+=v.numMinusReads2;
+
+ // Order matters: minErrors and maxReadQuality still hold the values from earlier varlets here,
+ // so v wins if it has fewer errors, or no more errors and a higher max read quality.
+ if(v.errors<minErrors || (v.errors<=minErrors && v.maxReadQuality()>maxReadQuality)){
+ bestVar=v;
+ }
+
+ // Quality totals are weighted by the number of reads each varlet already represents.
+ totalQuality+=v.avgReadQuality()*v.numReads;
+ maxReadQuality=Tools.max(maxReadQuality, v.maxReadQuality());
+
+ totalVarQuality+=v.avgVarQuality()*v.numReads;
+ maxVarQuality=Tools.max(maxVarQuality, v.maxVarQuality());
+
+ // Uses maxMapScore from earlier varlets; it is updated further down.
+ if(bestLen==0 || (v.mapScore>=maxMapScore && v.readLen>=bestLen)){
+ bestLen=v.readLen;
+ }
+
+ maxHeadDist=Tools.max(maxHeadDist, v.headDist);
+ maxTailDist=Tools.max(maxTailDist, v.tailDist);
+ maxEndDist=Tools.max(maxEndDist, v.endDist);
+
+ minErrors=Tools.min(minErrors, v.errors);
+ minExpectedErrors=Tools.min(minExpectedErrors, v.expectedErrors);
+ maxMapScore=Tools.max(maxMapScore, v.mapScore);
+ minReadStart=Tools.min(minReadStart, v.readStart);
+ maxReadStop=Tools.max(maxReadStop, v.readStop);
+ assert(minReadStart<maxReadStop) : "\n"+minReadStart+"\n"+maxReadStop+"\n"+v.toText();
+
+ pairedReads+=v.paired;
+
+ // A read counts as "unique" the first time its coordinate is seen on its strand.
+ // Any strand other than Gene.PLUS is handled by the minus branch.
+ if(v.strand==Gene.PLUS){
+ ArrayList<Varlet> value=plus.get(v.readStart);
+ if(value==null){
+ numUniqueReads++;
+ value=new ArrayList<Varlet>(2);
+ plus.put(v.readStart, value);
+ }
+ value.add(v);
+ }else{
+ ArrayList<Varlet> value=minus.get(v.readStop);
+ if(value==null){
+ numUniqueReads++;
+ value=new ArrayList<Varlet>(2);
+ minus.put(v.readStop, value);
+ }
+ value.add(v);
+ }
+ }
+
+// byte plusReads=(byte) ((plus.isEmpty() ? 0 : 1)+(minus.isEmpty() ? 0 : 1));
+
+ float avgVarQuality=totalVarQuality/(float)numReads;
+ float avgReadQuality=totalQuality/(float)numReads;
+
+ // "Net" qualities are the midpoint of the weighted average and the maximum, rounded up.
+ int netQuality=(int)Math.ceil((avgVarQuality+maxVarQuality)/2);
+ int netReadQuality=(int)Math.ceil((avgReadQuality+maxReadQuality)/2);
+
+ // Strand is Gene.PLUS when both plus and minus read counts are nonzero, otherwise bestVar's strand.
+ Varlet v=new Varlet(bestVar.chromosome, ((plusReads1+plusReads2>0) && (minusReads1+minusReads2>0) ? Gene.PLUS : bestVar.strand),
+ bestVar.beginLoc, bestVar.endLoc, bestVar.matchStart, bestVar.matchStop, bestVar.varType, bestVar.ref, bestVar.call,
+ netQuality, netReadQuality, maxMapScore, minErrors, minExpectedErrors, pairedReads, bestVar.readID, bestLen,
+ minReadStart, maxReadStop, numReads, maxHeadDist, maxTailDist, maxEndDist, bestVar.pairNum());
+
+
+ v.setMaxReadQuality(maxReadQuality);
+ v.setMaxVarQuality(maxVarQuality);
+ v.setAvgReadQuality((int)Math.ceil(avgReadQuality));
+ v.setAvgVarQuality((int)Math.ceil(avgVarQuality));
+
+ // Narrowing casts: a total above Short.MAX_VALUE would wrap silently.
+ v.numSemiUniqueReads=(short)numSemiUniqueReads;
+ v.numUniqueReads=(short)numUniqueReads;
+ v.numPlusReads1=(short)plusReads1;
+ v.numMinusReads1=(short)minusReads1;
+ v.numPlusReads2=(short)plusReads2;
+ v.numMinusReads2=(short)minusReads2;
+ assert(plusReads1+minusReads1+plusReads2+minusReads2==numSemiUniqueReads);
+
+ assert(v.numReads>=v.numSemiUniqueReads);
+ assert(v.numSemiUniqueReads>=v.numUniqueReads);
+
+ //This assertion is only correct if stacking is done from raw, uncombined varlets.
+ assert(v.numSemiUniqueReads==vars.size()) : "\n"+vars.size()+", "+v.numReads+", "+v.numSemiUniqueReads+", "+v.numUniqueReads
+ +"\n"+v.toText();
+
+ assert(v.numUniqueReads<=v.numReads && v.numUniqueReads>0);
+ assert(v.numUniqueReads==plus.size()+minus.size()) : "numUniqueReads="+numUniqueReads+
+ ", v.numUniqueReads="+v.numUniqueReads+", v.numReads="+v.numReads
+ +", plus.size()="+plus.size()+", minus.size()="+minus.size()+"\n"+vars+"\n";
+
+ return v;
+ }
+
+
+ /**
+  * Worker that loads varlets from one file, merges them, optionally filters them,
+  * and writes the survivors to an output file while tallying per-thread statistics.
+  * The caller is expected to have reserved a thread slot (addThread(1)) before starting it;
+  * run() releases that slot when it finishes, whether or not processing succeeded.
+  */
+ private static class SVThread implements Runnable {
+
+     /**
+      * @param fname1_ Input varlet file name.
+      * @param fname2_ Output file name; null means nothing is written.
+      * @param filter_ Whether to run filterLight on the merged varlets.
+      */
+     public SVThread(String fname1_, String fname2_, boolean filter_){
+         fname1=fname1_;
+         fname2=fname2_;
+         filter=filter_;
+     }
+
+     @Override
+     public void run() {
+//         addThread(1);
+         assert(activeThreads>0);
+         try{
+             processFile(fname1, fname2, filter);
+         }finally{
+             // Always give the slot back, otherwise anything waiting on activeThreads never proceeds.
+             addThread(-1);
+         }
+     }
+
+     /**
+      * Loads, merges, optionally filters and writes the varlets of one file.
+      * @param inName Input varlet file.
+      * @param outName Output file, or null to skip writing.
+      * @param filter Whether to apply filterLight.
+      * @return The kept varlets; never null (empty when nothing was read or kept).
+      */
+     private final ArrayList<Varlet> processFile(String inName, String outName, boolean filter){
+
+         ArrayList<Varlet> initial=Varlet.fromTextFile(inName);
+
+         if(initial!=null){
+             for(Varlet v : initial){
+                 if(v.varType==Variation.NOREF){totalInNR++;}
+                 totalIn++;
+             }
+         }
+
+         if(verbose){System.err.println("Initial: \t"+(initial==null ? 0 : initial.size()));}
+         ArrayList<Varlet> merged=mergeAll(initial);
+         initial=null;
+         // mergeAll returns null for empty input; normalize so both paths below are safe.
+         if(merged==null){merged=new ArrayList<Varlet>(1);}
+         if(verbose){System.err.println("Merged: \t"+merged.size());}
+         ArrayList<Varlet> out;
+         if(!filter){
+//             System.out.println("Not filtering.");
+             out=merged;
+             for(Varlet v : out){
+                 if(v!=null){
+                     totalKept++;
+                     scoreKept+=v.score();
+                 }
+             }
+         }else{
+//             System.out.println("Filtering.");
+             out=filterLight(merged);
+//             System.out.println("Filtered: \t"+out.size());
+         }
+         merged=null;
+         // filterLight returns null for an empty list.
+         if(out==null){out=new ArrayList<Varlet>(1);}
+         out.trimToSize();
+         if(verbose){System.err.println("Out: \t"+out.size());}
+
+         if(outName!=null){
+
+             TextStreamWriter tsw=new TextStreamWriter(outName, true, false, false);
+             tsw.start();
+             tsw.println(Varlet.textHeader());
+             for(Varlet v : out){
+                 StringBuilder sb=v.toText();
+                 sb.append('\n');
+                 tsw.print(sb);
+             }
+             tsw.poison();
+
+         }
+         return out;
+     }
+
+
+     /**
+      * Drops varlets that fail the basic checks (and passesFilterLight when STRICT is set),
+      * updating the kept/dropped counters of this thread.
+      * @param vars Merged varlets; rejected entries are removed from this list in place.
+      * @return The same list with rejected varlets removed, or null if vars is null or empty.
+      */
+     private final ArrayList<Varlet> filterLight(ArrayList<Varlet> vars){
+         if(vars==null || vars.size()==0){return null;}
+
+         int dropped=0;
+         for(int i=0; i<vars.size(); i++){
+             Varlet v=vars.get(i);
+
+             int dif=v.lengthDif();
+             deltaLenIn+=dif;
+
+             boolean passes=true;
+             if(v.varType==Variation.NOCALL){
+                 passes=false;
+             }else if(v.numSemiUniqueReads<2){
+                 passes=false;
+             }else if(v.endDist<6 || v.tailDist<10){
+                 passes=false;
+             }else if(v.maxVarQuality()<24){
+                 passes=false;
+             }else if(v.expectedErrors>2){
+                 passes=false;
+             }
+
+             if(passes && STRICT){
+                 passes=passesFilterLight(v);
+             }
+
+             if(passes){
+                 if(v.varType==Variation.NOREF){totalKeptNR++;}
+                 else if(v.varType==Variation.SNP){snpKept++;}
+                 else if(v.varType==Variation.DEL){
+                     delKept++;
+//                     delLenKept-=v.lengthRef();
+                     delLenKept+=dif;
+                 }
+                 else if(v.varType==Variation.INS){
+                     insKept++;
+//                     insLenKept+=v.lengthVar();
+                     insLenKept+=dif;
+                 }
+                 else if(v.varType==Variation.DELINS){
+                     subKept++;
+//                     subLenKept+=(v.lengthRef()-v.lengthVar());
+                     subLenKept+=dif;
+                 }
+                 totalKept++;
+                 scoreKept+=v.score();
+                 deltaLenKept+=dif;
+             }else{
+                 // Null out now; the nulls are squeezed out once after the loop.
+                 vars.set(i, null);
+                 if(v.varType==Variation.NOREF){totalDroppedNR++;}
+                 dropped++;
+                 scoreDropped+=v.score();
+             }
+         }
+         totalDropped+=dropped;
+         if(dropped>0){
+             Tools.condenseStrict(vars);
+         }
+         return vars;
+     }
+
+     /**
+      * Extra thresholds used by filterLight when STRICT is enabled. The tier is chosen by
+      * v.minStrandReads() (>=2, >=1, otherwise); weaker strand support means tighter limits.
+      * @return true if the varlet passes every threshold of its tier.
+      */
+     private static boolean passesFilterLight(Varlet v){
+         if(v.endDist<4){return false;}
+         if(v.tailDist<10){return false;}
+
+         //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required.
+         if(v.minStrandReads()>=2){
+
+             if(v.errors>2){return false;}
+             if(v.expectedErrors>1.4f){return false;}
+             // if(v.expectedErrors-v.errors>3f){return false;}
+             if(v.maxReadQuality()<17){return false;}
+             if(v.avgReadQuality()<13){return false;}
+             if(v.maxVarQuality()<26){return false;}
+             if(v.avgVarQuality()<17){return false;}
+             if(v.numReads<3){return false;}
+             if(v.numSemiUniqueReads<3){return false;}
+             if(v.numUniqueReads<2){return false;}
+//             if(v.paired<3){return false;}
+             if(v.score()<8200){return false;}
+
+         }else if(v.minStrandReads()>=1){
+             if(v.endDist<7){return false;}
+             if(v.tailDist<12){return false;}
+
+             if(v.errors>2){return false;}
+             if(v.expectedErrors>1.1f){return false;}
+             // if(v.expectedErrors-v.errors>3f){return false;}
+             if(v.maxReadQuality()<18){return false;}
+             if(v.avgReadQuality()<14){return false;}
+             if(v.maxVarQuality()<28){return false;}
+             if(v.avgVarQuality()<18){return false;}
+             if(v.numReads<4){return false;}
+             if(v.numSemiUniqueReads<3){return false;}
+             if(v.numUniqueReads<2){return false;}
+//             if(v.paired<3){return false;}
+             if(v.score()<8020){return false;}
+         }else{
+             if(v.endDist<8){return false;}
+             if(v.tailDist<14){return false;}
+
+             if(v.errors>0){return false;}
+             if(v.expectedErrors>0.5f){return false;}
+             // if(v.expectedErrors-v.errors>2f){return false;}
+             if(v.maxReadQuality()<21){return false;}
+             if(v.avgReadQuality()<17){return false;}
+             if(v.maxVarQuality()<30){return false;}
+             if(v.avgVarQuality()<21){return false;}
+             if(v.numReads<6){return false;}
+             if(v.numSemiUniqueReads<5){return false;}
+             if(v.numUniqueReads<3){return false;}
+//             if(v.paired<5){return false;}
+             if(v.score()<7670){return false;}
+         }
+         return true;
+     }
+
+     // Per-thread tallies of kept varlets, by type, plus summed length differences.
+     private long deltaLenKept=0;
+     private long snpKept=0;
+     private long delKept=0;
+     private long insKept=0;
+     private long subKept=0;
+     private long delLenKept=0;
+     private long insLenKept=0;
+     private long subLenKept=0;
+
+     // Per-thread input/kept/dropped totals and score sums.
+     private long deltaLenIn=0;
+     private long totalIn=0;
+     private long totalInNR=0;
+     private long totalKept=0;
+     private long totalKeptNR=0;
+     private long totalDropped=0;
+     private long totalDroppedNR=0;
+     private long scoreKept=0;
+     private long scoreDropped=0;
+
+     private final String fname1;
+     private final String fname2;
+     private final boolean filter;
+ }
+
+ /**
+  * Adjusts the active-thread counter under THREADLOCK.
+  * A positive x blocks until activeThreads is below THREADS before adding; zero or negative x
+  * is applied immediately. Releasing a slot (x<0) now wakes any waiters right away instead of
+  * leaving them to notice on their next 200 ms timeout.
+  * @param x Amount to add to activeThreads (1 to reserve a slot, -1 to release, 0 to just read).
+  * @return The value of activeThreads after the adjustment.
+  */
+ private static int addThread(int x){
+     synchronized(THREADLOCK){
+         while(x>0 && activeThreads>=THREADS){
+             try {
+                 // Timed wait kept as a safety net in case a notification is missed.
+                 THREADLOCK.wait(200);
+             } catch (InterruptedException e) {
+                 // Report and keep waiting for a free slot.
+                 e.printStackTrace();
+             }
+         }
+         activeThreads+=x;
+         if(x<0){THREADLOCK.notifyAll();}
+         return activeThreads;
+     }
+ }
+
+
+ // Program-wide statistics. Their names mirror the per-thread counters in SVThread
+ // (same name without the _global suffix).
+ // NOTE(review): the code that adds the per-thread values into these is not in this part of
+ // the file - presumably the thread launcher does it; confirm.
+ public static long deltaLenKept_global=0;
+ public static long deltaLenIn_global=0;
+
+ public static long snpKept_global=0;
+ public static long delKept_global=0;
+ public static long insKept_global=0;
+ public static long subKept_global=0;
+ public static long delLenKept_global=0;
+ public static long insLenKept_global=0;
+ public static long subLenKept_global=0;
+
+ public static long totalIn_global=0;
+ public static long totalInNR_global=0;
+ public static long totalKept_global=0;
+ public static long totalDropped_global=0;
+ public static long totalKeptNR_global=0;
+ public static long totalDroppedNR_global=0;
+ public static long scoreKept_global=0;
+ public static long scoreDropped_global=0;
+
+ // Number of reserved worker slots; read and written only inside addThread's synchronized block
+ // (and read by an assert in SVThread.run).
+ private static int activeThreads=0;
+
+ // Lock object for addThread. new String(...) creates a distinct object rather than
+ // reusing the interned literal.
+ private static final String THREADLOCK=new String("THREADLOCK");
+ // addThread blocks a reservation while activeThreads>=THREADS.
+ private static int THREADS=3;
+ // mergeAll keeps a merged varlet outright when it has more reads than this; with exactly
+ // this many it must also satisfy the four *_AT_MIN_READS settings below.
+ public static final int MIN_READS_TO_KEEP=1;
+ public static final int MIN_QUALITY_AT_MIN_READS=14;
+ public static final int MAX_ERRORS_AT_MIN_READS=2;
+ public static final int MAX_EXPECTED_ERRORS_AT_MIN_READS=4;
+ public static final boolean REQUIRE_PAIRED_AT_MIN_READS=true;
+ // When set, SVThread.filterLight additionally applies passesFilterLight.
+ public static boolean STRICT=false;
+ // Not referenced in the visible part of this class.
+ public static boolean VSTRICT=false;
+ public static boolean USTRICT=false;
+
+ // Compile-time switch for the diagnostic System.err output.
+ public static final boolean verbose=false;
+}
diff --git a/current/var/StackVariations2.java b/current/var/StackVariations2.java
new file mode 100755
index 0000000..11a628b
--- /dev/null
+++ b/current/var/StackVariations2.java
@@ -0,0 +1,833 @@
+package var;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+
+
+import dna.Data;
+import dna.Gene;
+import dna.Parser;
+import dna.Timer;
+import fileIO.TextStreamWriter;
+
+import align2.Tools;
+
+public class StackVariations2 {
+
+ /**
+  * Command-line entry point.
+  * args[0] is the input pattern (the literal "null" means no input pattern), args[1] is the
+  * output pattern, and the remaining arguments are key=value options: filter, strict,
+  * genome/build, minchrom, maxchrom, threads/t, minreads, blocksize and the delete* flags.
+  * Runs one worker per chromosome via runThreaded and prints summary statistics to stdout.
+  * @param args Command-line arguments as described above.
+  * @throws RuntimeException for too few arguments, an unknown option, or a missing genome number.
+  */
+ public static void main(String[] args){
+     System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n");
+
+     if(args.length<2){
+         throw new RuntimeException("Usage: <inPattern|null> <outPattern> [key=value ...]");
+     }
+
+     Timer t=new Timer();
+
+     String inPattern=(args[0].equalsIgnoreCase("null") ? null : args[0]);
+     String outPattern=args[1];
+
+     // inPattern is null when "null" was passed, so guard before comparing.
+     assert(inPattern==null || !inPattern.equalsIgnoreCase(outPattern));
+
+     int minChrom=-1;
+     int maxChrom=-1;
+
+     boolean filter=false;
+     Data.GENOME_BUILD=-1;
+
+     for(int i=2; i<args.length; i++){
+         final String arg=args[i];
+         final String[] split=arg.split("=");
+         String a=split[0].toLowerCase();
+         String b=(split.length>1 ? split[1] : null);
+
+         if(Parser.isJavaFlag(arg)){
+             //jvm argument; do nothing
+         }else if(a.startsWith("filter")){
+             // A bare "filter" enables filtering; with a value the value decides,
+             // so "filter=false" no longer turns filtering on.
+             if(b==null){
+                 if(!a.equals("filter")){throw new RuntimeException("Unknown parameter "+args[i]);}
+                 filter=true;
+             }
+             else if(b.equals("1") || b.startsWith("t")){filter=true;}
+             else if(b.equals("0") || b.startsWith("f")){filter=false;}
+             else{throw new RuntimeException("Unknown parameter "+args[i]);}
+         }else if(a.equalsIgnoreCase("strict")){
+             if(b==null){STRICT=true;}
+             else if(b.equals("1") || b.startsWith("t")){STRICT=true;}
+             else if(b.equals("0") || b.startsWith("f")){STRICT=false;}
+             else{throw new RuntimeException("Unknown parameter "+args[i]);}
+         }else if(a.equals("genome") || a.equals("build")){
+             Data.setGenome(Integer.parseInt(b));
+             // Default the chromosome range to the whole genome unless already given.
+             if(minChrom==-1){minChrom=1;}
+             if(maxChrom==-1){maxChrom=Data.numChroms;}
+         }else if(a.equals("minchrom")){
+             minChrom=Integer.parseInt(b);
+         }else if(a.equals("maxchrom")){
+             maxChrom=Integer.parseInt(b);
+         }else if(a.equals("threads") || a.equals("t")){
+             THREADS=Integer.parseInt(b);
+         }else if(a.equals("minreads")){
+             MIN_READS_TO_KEEP=Integer.parseInt(b);
+         }else if(a.equals("blocksize")){
+             GenerateVarlets2.BLOCKSIZE=(Integer.parseInt(b));
+         }else if(a.equals("deletefiles") || a.startsWith("deletetemp") || a.startsWith("deleteinput") || a.equals("delete")){
+             DELETE_INPUT=(Tools.parseBoolean(b));
+         }else{
+             throw new RuntimeException("Unknown parameter "+args[i]);
+         }
+     }
+
+     assert(minChrom>=0 && maxChrom>=minChrom) : "Please set minchrom and maxchrom.";
+     if(Data.GENOME_BUILD<0){throw new RuntimeException("Please set genome number.");}
+     THREADS=Tools.max(1, THREADS);
+
+//     for(byte i=minChrom; i<=maxChrom; i++){
+//         String fname1=inPattern.replace("#", i+"");
+//         String fname2=outPattern.replace("#", i+"");
+//         assert(new File(fname1).exists());
+//         assert(!new File(fname2).exists());
+//         processFile(fname1, fname2, filter);
+//     }
+
+     runThreaded(inPattern, outPattern, minChrom, maxChrom, filter);
+
+     t.stop();
+     System.out.println("Input Vars: \t"+(totalIn_global-totalInNR_global));
+     System.out.println("Input No-ref: \t"+totalInNR_global);
+     System.out.println("Input Delta Length:\t"+deltaLenIn_global);
+     System.out.println();
+     System.out.println("Kept Vars: \t"+(totalKept_global-totalKeptNR_global));
+     System.out.println("Kept No-ref: \t"+totalKeptNR_global);
+     System.out.println("Kept Snp: \t"+snpKept_global);
+     System.out.println("Kept Del: \t"+delKept_global+"\t\tLength: \t"+delLenKept_global);
+     System.out.println("Kept Ins: \t"+insKept_global+"\t\tLength: \t"+insLenKept_global);
+     System.out.println("Kept Sub: \t"+subKept_global+"\t\tLength: \t"+subLenKept_global);
+     System.out.println("Kept Delta Length: \t"+deltaLenKept_global);
+     // Tools.max(1, ...) avoids dividing by zero when nothing was kept or dropped.
+     System.out.println("Kept Avg Score: \t"+(scoreKept_global/(Tools.max(1, totalKept_global))));
+     System.out.println();
+     System.out.println("Dropped Vars: \t"+(totalDropped_global-totalDroppedNR_global));
+     System.out.println("Dropped No-ref: \t"+totalDroppedNR_global);
+     System.out.println("Dropped Avg Score: \t"+(scoreDropped_global/Tools.max(1, totalDropped_global)));
+     System.out.println();
+     System.out.println("Time: \t"+t);
+ }
+
+ /**
+  * Starts one SVThread per chromosome in [minChrom, maxChrom], waits for all of them to
+  * finish, then adds each worker's counters into the *_global totals.
+  * addThread(1) is called before each start, so it decides how many workers run at once.
+  * Completion is detected with Thread.join rather than by spinning on the thread counter,
+  * so a worker that dies early cannot leave this method looping forever.
+  * @param inPattern Input pattern handed unchanged to every worker; may be null.
+  * @param outPattern Output pattern; "#" is replaced by the chromosome number.
+  * @param minChrom First chromosome number (inclusive).
+  * @param maxChrom Last chromosome number (inclusive).
+  * @param filter Whether workers filter the merged varlets.
+  */
+ public static final void runThreaded(String inPattern, String outPattern, int minChrom, int maxChrom, boolean filter){
+     ArrayList<SVThread> svts=new ArrayList<SVThread>();
+     ArrayList<Thread> workers=new ArrayList<Thread>();
+     for(int i=minChrom; i<=maxChrom; i++){
+         assert(inPattern==null || !inPattern.equalsIgnoreCase(outPattern));
+         String fname1=inPattern;
+         String fname2=outPattern.replace("#", i+"");
+         addThread(1);
+         SVThread svt=new SVThread(fname1, fname2, i, filter);
+         svts.add(svt);
+         Thread worker=new Thread(svt);
+         workers.add(worker);
+         worker.start();
+     }
+
+     // join() also makes each worker's counter writes visible to this thread.
+     for(Thread worker : workers){
+         while(worker.isAlive()){
+             try {
+                 worker.join();
+             } catch (InterruptedException e) {
+                 // Report and keep waiting; the totals below need every worker finished.
+                 e.printStackTrace();
+             }
+         }
+     }
+
+     for(SVThread svt : svts){
+
+         snpKept_global+=svt.snpKept;
+         delKept_global+=svt.delKept;
+         insKept_global+=svt.insKept;
+         subKept_global+=svt.subKept;
+         delLenKept_global+=svt.delLenKept;
+         insLenKept_global+=svt.insLenKept;
+         subLenKept_global+=svt.subLenKept;
+         deltaLenKept_global+=svt.deltaLenKept;
+
+         deltaLenIn_global+=svt.deltaLenIn;
+         totalIn_global+=svt.totalIn;
+         totalInNR_global+=svt.totalInNR;
+         totalKept_global+=svt.totalKept;
+         totalDropped_global+=svt.totalDropped;
+         totalKeptNR_global+=svt.totalKeptNR;
+         totalDroppedNR_global+=svt.totalDroppedNR;
+         scoreKept_global+=svt.scoreKept;
+         scoreDropped_global+=svt.scoreDropped;
+     }
+ }
+
+
+ public static boolean passesFilterSNP(Varlet v){
+
+
+ //Best so far:
+
+ if(STRICT){
+
+ if(v.endDist<3){return false;}
+ if(v.tailDist<10){return false;}
+
+ //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required.
+ if(v.minStrandReads()>=2){
+
+ if(v.errors>2){return false;}
+ if(v.expectedErrors>1.5f){return false;}
+// if(v.expectedErrors-v.errors>3f){return false;}
+ if(v.maxReadQuality()<18){return false;}
+ if(v.avgReadQuality()<13){return false;}
+ if(v.maxVarQuality()<26){return false;}
+ if(v.avgVarQuality()<18){return false;}
+ if(v.numReads<4){return false;}
+ if(v.numSemiUniqueReads<4){return false;}
+ if(v.numUniqueReads<2){return false;}
+ if(v.paired<3){return false;}
+
+ }else if(v.minStrandReads()>=1){
+
+ if(v.errors>2){return false;}
+ if(v.expectedErrors>1.2f){return false;}
+// if(v.expectedErrors-v.errors>3f){return false;}
+ if(v.maxReadQuality()<19){return false;}
+ if(v.avgReadQuality()<14){return false;}
+ if(v.maxVarQuality()<28){return false;}
+ if(v.avgVarQuality()<19){return false;}
+ if(v.numReads<3){return false;}
+ if(v.numSemiUniqueReads<3){return false;}
+ if(v.numUniqueReads<2){return false;}
+ if(v.paired<3){return false;}
+
+ }else{
+ if(v.endDist<8){return false;}
+ if(v.tailDist<14){return false;}
+
+ if(v.errors>0){return false;}
+ if(v.expectedErrors>0.5f){return false;}
+// if(v.expectedErrors-v.errors>2f){return false;}
+ if(v.maxReadQuality()<21){return false;}
+ if(v.avgReadQuality()<17){return false;}
+ if(v.maxVarQuality()<30){return false;}
+ if(v.avgVarQuality()<21){return false;}
+ if(v.numReads<6){return false;}
+ if(v.numSemiUniqueReads<5){return false;}
+ if(v.numUniqueReads<3){return false;}
+ if(v.paired<5){return false;}
+ if(v.score()<8100){return false;}
+ }
+
+// else{
+// if(v.endDist<8){return false;}
+// if(v.tailDist<14){return false;}
+//
+// if(v.errors>0){return false;}
+// if(v.expectedErrors>0.5f){return false;}
+//// if(v.expectedErrors-v.errors>2f){return false;}
+// if(v.maxReadQuality()<21){return false;}
+// if(v.avgReadQuality()<17){return false;}
+// if(v.maxVarQuality()<30){return false;}
+// if(v.avgVarQuality()<21){return false;}
+// if(v.numReads<5){return false;}
+// if(v.numSemiUniqueReads<4){return false;}
+// if(v.numUniqueReads<2){return false;}
+// if(v.paired<4){return false;}
+// if(v.score()<8100){return false;}
+// }
+
+ }else{
+
+ assert(false) : "disabled";
+
+ }
+
+
+
+ return true;
+ }
+
+ /**
+  * Applies fixed quality, error and read-support thresholds to a varlet; the limits are
+  * somewhat looser than those in passesFilterSNP and do not depend on STRICT.
+  * The tier is chosen by v.minStrandReads() (>=2, >=1, otherwise); the last tier is the
+  * strictest and also checks score().
+  * @param v Varlet to test.
+  * @return true if no threshold rejects the varlet.
+  */
+ public static boolean passesFilterOther(Varlet v){
+     // Positional floor applied to every varlet, whatever its strand support.
+     if(v.endDist<3 || v.tailDist<10){return false;}
+
+     //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required.
+     final boolean rejected;
+     if(v.minStrandReads()>=2){
+         // (disabled check: v.expectedErrors-v.errors>3f)
+         rejected=v.errors>2
+             || v.expectedErrors>1.5f
+             || v.maxReadQuality()<16
+             || v.avgReadQuality()<12
+             || v.maxVarQuality()<26
+             || v.avgVarQuality()<16
+             || v.numReads<4
+             || v.numSemiUniqueReads<4
+             || v.numUniqueReads<2
+             || v.paired<3;
+     }else if(v.minStrandReads()>=1){
+         // (disabled check: v.expectedErrors-v.errors>3f)
+         rejected=v.errors>2
+             || v.expectedErrors>1.2f
+             || v.maxReadQuality()<17
+             || v.avgReadQuality()<13
+             || v.maxVarQuality()<28
+             || v.avgVarQuality()<17
+             || v.numReads<4
+             || v.numSemiUniqueReads<4
+             || v.numUniqueReads<2
+             || v.paired<3;
+     }else{
+         // (disabled check: v.expectedErrors-v.errors>2f)
+         rejected=v.endDist<8
+             || v.tailDist<14
+             || v.errors>0
+             || v.expectedErrors>0.5f
+             || v.maxReadQuality()<20
+             || v.avgReadQuality()<16
+             || v.maxVarQuality()<30
+             || v.avgVarQuality()<20
+             || v.numReads<6
+             || v.numSemiUniqueReads<5
+             || v.numUniqueReads<3
+             || v.paired<5
+             || v.score()<6500;
+     }
+     return !rejected;
+ }
+
+
+ /**
+  * Sorts the varlets, merges each run of equal varlets into one (see mergeEqualVarlets),
+  * and keeps the merged results that satisfy the minimum-read rules.
+  * The same keep rule is now applied to every group, including the last one; previously
+  * the final group was kept based only on how many list entries it had.
+  * @param vars Varlets to merge; sorted in place as a side effect.
+  * @return Merged varlets in sorted order, or null if vars is null or empty.
+  */
+ public static ArrayList<Varlet> mergeAll(ArrayList<Varlet> vars){
+     if(vars==null || vars.size()==0){return null;}
+     ArrayList<Varlet> out=new ArrayList<Varlet>(8+vars.size()/16);
+     Collections.sort(vars);
+
+     // temp holds the current run of varlets that are equal to its first element.
+     ArrayList<Varlet> temp=new ArrayList<Varlet>(64);
+     for(int i=0; i<vars.size(); i++){
+         Varlet v=vars.get(i);
+         if(!temp.isEmpty() && !v.equals(temp.get(0))){
+             // v starts a new run: flush the previous one.
+             Varlet result=mergeEqualVarlets(temp);
+             if(keepAfterMerge(result)){out.add(result);}
+             temp.clear();
+         }
+         temp.add(v);
+     }
+
+     // Flush the final run with the same rule as all the others.
+     if(!temp.isEmpty()){
+         Varlet result=mergeEqualVarlets(temp);
+         if(keepAfterMerge(result)){out.add(result);}
+         temp.clear();
+     }
+
+     {//For testing
+         Collections.sort(out); //Should already be sorted...
+         for(int i=1; i<out.size(); i++){
+             assert(!out.get(i).equals(out.get(i-1)));
+         }
+     }
+
+     if(verbose){System.err.println("out.size="+out.size());}
+
+     return out;
+ }
+
+ /**
+  * Keep rule for a merged varlet: more than MIN_READS_TO_KEEP reads always passes, fewer never
+  * passes, and exactly MIN_READS_TO_KEEP passes only if the *_AT_MIN_READS limits are met.
+  */
+ private static boolean keepAfterMerge(Varlet merged){
+     if(merged.numReads>MIN_READS_TO_KEEP){return true;}
+     if(merged.numReads<MIN_READS_TO_KEEP){return false;}
+     return merged.maxVarQuality()>=MIN_QUALITY_AT_MIN_READS
+         && merged.errors<=MAX_ERRORS_AT_MIN_READS
+         && merged.expectedErrors<=MAX_EXPECTED_ERRORS_AT_MIN_READS
+         && (merged.paired>0 || !REQUIRE_PAIRED_AT_MIN_READS);
+ }
+
+
+ /**
+  * Collapses a list of varlets that the caller treats as the same variation into one summary Varlet.
+  * A single-element list is returned as-is (the same object, not a copy).
+  * Read counts, per-strand/per-mate counts and paired counts are summed; the average qualities are
+  * weighted by each varlet's numReads; errors and expectedErrors take the minimum; mapScore,
+  * head/tail/end distances and the read span take the extremes.
+  * Chromosome, locations, match range, type, ref, call, readID and pairNum are copied from the
+  * "best" input varlet chosen inside the loop.
+  * @param vars Non-empty list of varlets; an empty list fails when bestVar is dereferenced.
+  * @return The only element when vars.size()==1, otherwise a newly constructed Varlet.
+  */
+ public static Varlet mergeEqualVarlets(ArrayList<Varlet> vars){
+
+// System.err.println("Merging "+vars.size()+" vars.");
+
+ if(vars.size()==1){return vars.get(0);}
+
+ // Keyed by read coordinate (readStart for plus, readStop for minus); only the number of
+ // distinct keys is used afterwards, via numUniqueReads and the size() assertions.
+ HashMap<Integer, ArrayList<Varlet>> plus=new HashMap<Integer, ArrayList<Varlet>>(Tools.min(8, vars.size()));
+ HashMap<Integer, ArrayList<Varlet>> minus=new HashMap<Integer, ArrayList<Varlet>>(Tools.min(8, vars.size()));
+
+ int numReads=0;
+ int numSemiUniqueReads=0;
+ int numUniqueReads=0;
+ int pairedReads=0;
+ int plusReads1=0;
+ int minusReads1=0;
+ int plusReads2=0;
+ int minusReads2=0;
+
+ int totalQuality=0;
+ int totalVarQuality=0;
+
+ int maxReadQuality=0;
+ int maxVarQuality=0;
+
+ int maxMapScore=0;
+ int bestLen=0;
+ int minReadStart=Integer.MAX_VALUE;
+ int maxReadStop=-999999;
+
+ int maxHeadDist=-1;
+ int maxTailDist=-1;
+ int maxEndDist=-1;
+
+ Varlet bestVar=null;
+
+ int minErrors=999;
+ float minExpectedErrors=999f;
+
+ for(Varlet v : vars){
+
+ numReads+=v.numReads;
+ numSemiUniqueReads+=v.numSemiUniqueReads;
+ plusReads1+=v.numPlusReads1;
+ minusReads1+=v.numMinusReads1;
+ plusReads2+=v.numPlusReads2;
+ minusReads2+=v.numMinusReads2;
+
+ // Order matters: minErrors and maxReadQuality still hold the values from earlier varlets here,
+ // so v wins if it has fewer errors, or no more errors and a higher max read quality.
+ if(v.errors<minErrors || (v.errors<=minErrors && v.maxReadQuality()>maxReadQuality)){
+ bestVar=v;
+ }
+
+ // Quality totals are weighted by the number of reads each varlet already represents.
+ totalQuality+=v.avgReadQuality()*v.numReads;
+ maxReadQuality=Tools.max(maxReadQuality, v.maxReadQuality());
+
+ totalVarQuality+=v.avgVarQuality()*v.numReads;
+ maxVarQuality=Tools.max(maxVarQuality, v.maxVarQuality());
+
+ // Uses maxMapScore from earlier varlets; it is updated further down.
+ if(bestLen==0 || (v.mapScore>=maxMapScore && v.readLen>=bestLen)){
+ bestLen=v.readLen;
+ }
+
+ maxHeadDist=Tools.max(maxHeadDist, v.headDist);
+ maxTailDist=Tools.max(maxTailDist, v.tailDist);
+ maxEndDist=Tools.max(maxEndDist, v.endDist);
+
+ minErrors=Tools.min(minErrors, v.errors);
+ minExpectedErrors=Tools.min(minExpectedErrors, v.expectedErrors);
+ maxMapScore=Tools.max(maxMapScore, v.mapScore);
+ minReadStart=Tools.min(minReadStart, v.readStart);
+ maxReadStop=Tools.max(maxReadStop, v.readStop);
+ assert(minReadStart<maxReadStop) : "\n"+minReadStart+"\n"+maxReadStop+"\n"+v.toText();
+
+ pairedReads+=v.paired;
+
+ // A read counts as "unique" the first time its coordinate is seen on its strand.
+ // Any strand other than Gene.PLUS is handled by the minus branch.
+ if(v.strand==Gene.PLUS){
+ ArrayList<Varlet> value=plus.get(v.readStart);
+ if(value==null){
+ numUniqueReads++;
+ value=new ArrayList<Varlet>(2);
+ plus.put(v.readStart, value);
+ }
+ value.add(v);
+ }else{
+ ArrayList<Varlet> value=minus.get(v.readStop);
+ if(value==null){
+ numUniqueReads++;
+ value=new ArrayList<Varlet>(2);
+ minus.put(v.readStop, value);
+ }
+ value.add(v);
+ }
+ }
+
+// byte plusReads=(byte) ((plus.isEmpty() ? 0 : 1)+(minus.isEmpty() ? 0 : 1));
+
+ float avgVarQuality=totalVarQuality/(float)numReads;
+ float avgReadQuality=totalQuality/(float)numReads;
+
+ // "Net" qualities are the midpoint of the weighted average and the maximum, rounded up.
+ int netQuality=(int)Math.ceil((avgVarQuality+maxVarQuality)/2);
+ int netReadQuality=(int)Math.ceil((avgReadQuality+maxReadQuality)/2);
+
+ // Strand is Gene.PLUS when both plus and minus read counts are nonzero, otherwise bestVar's strand.
+ Varlet v=new Varlet(bestVar.chromosome, ((plusReads1+plusReads2>0) && (minusReads1+minusReads2>0) ? Gene.PLUS : bestVar.strand),
+ bestVar.beginLoc, bestVar.endLoc, bestVar.matchStart, bestVar.matchStop, bestVar.varType, bestVar.ref, bestVar.call,
+ netQuality, netReadQuality, maxMapScore, minErrors, minExpectedErrors, pairedReads, bestVar.readID, bestLen,
+ minReadStart, maxReadStop, numReads, maxHeadDist, maxTailDist, maxEndDist, bestVar.pairNum());
+
+
+ v.setMaxReadQuality(maxReadQuality);
+ v.setMaxVarQuality(maxVarQuality);
+ v.setAvgReadQuality((int)Math.ceil(avgReadQuality));
+ v.setAvgVarQuality((int)Math.ceil(avgVarQuality));
+
+ // The summed counters overwrite whatever the constructor set for these fields.
+ v.numSemiUniqueReads=numSemiUniqueReads;
+ v.numUniqueReads=numUniqueReads;
+ v.numPlusReads1=plusReads1;
+ v.numMinusReads1=minusReads1;
+ v.numPlusReads2=plusReads2;
+ v.numMinusReads2=minusReads2;
+ assert(plusReads1+minusReads1+plusReads2+minusReads2==numSemiUniqueReads);
+
+ assert(v.numReads>=v.numSemiUniqueReads);
+ assert(v.numSemiUniqueReads>=v.numUniqueReads);
+
+ //This assertion is only correct if stacking is done from raw, uncombined varlets.
+ assert(v.numSemiUniqueReads==vars.size()) : "\n"+vars.size()+", "+v.numReads+", "+v.numSemiUniqueReads+", "+v.numUniqueReads
+ +"\n"+v.toText();
+
+ assert(v.numUniqueReads<=v.numReads && v.numUniqueReads>0);
+ assert(v.numUniqueReads==plus.size()+minus.size()) : "numUniqueReads="+numUniqueReads+
+ ", v.numUniqueReads="+v.numUniqueReads+", v.numReads="+v.numReads
+ +", plus.size()="+plus.size()+", minus.size()="+minus.size()+"\n"+vars+"\n";
+
+ return v;
+ }
+
+
+ private static class SVThread implements Runnable {
+
+ /**
+  * Creates a worker for one chromosome.
+  * @param fname1_ Input name handed to processFile.
+  * @param fname2_ Output file name handed to processFile.
+  * @param chrom_ Chromosome number this worker handles.
+  * @param filter_ Whether merged varlets are filtered before being written.
+  */
+ public SVThread(String fname1_, String fname2_, final int chrom_, boolean filter_){
+     this.chrom=chrom_;
+     this.filter=filter_;
+     this.fname1=fname1_;
+     this.fname2=fname2_;
+ }
+
+ /**
+  * Processes this worker's chromosome and then releases its thread slot.
+  * The slot is released in a finally block so that a failure in processFile cannot leave
+  * the active-thread counter permanently raised.
+  */
+ @Override
+ public void run() {
+//     addThread(1);
+     assert(activeThreads>0);
+     try{
+         processFile(fname1, fname2);
+     }finally{
+         addThread(-1);
+     }
+ }
+
+ /**
+  * Processes every block of this worker's chromosome: for each key from
+  * GenerateVarlets2.keys(chrom) it loads the block file named by GenerateVarlets2.fname(key, inName),
+  * adds the varlets to the input counters, and merges/writes them through mergeAll2.
+  * Afterwards the writer is shut down and, if DELETE_INPUT is set, the block files are deleted.
+  * @param inName Input name used to derive the block file names.
+  * @param outName Output file the merged varlets are written to.
+  */
+ private final void processFile(final String inName, final String outName){
+
+ final long[] keys=GenerateVarlets2.keys(chrom);
+ // NOTE(review): the writer targets outName but is only created when inName is non-null;
+ // verify that inName (not outName) is the intended condition. The possibly-null writer is
+ // passed on to mergeAll2 below.
+ final TextStreamWriter tsw=(inName==null ? null : new TextStreamWriter(outName, true, false, false));
+ if(tsw!=null){
+ tsw.start();
+ tsw.println(Varlet.textHeader());
+ }
+
+ for(final long key : keys){
+ String blockname=GenerateVarlets2.fname(key, inName);
+
+ ArrayList<Varlet> initial=Varlet.fromTextFile(blockname);
+
+ // Input statistics are taken from the raw varlets, before any merging.
+ for(Varlet v : initial){
+ if(v.varType==Variation.NOREF){totalInNR++;}
+ totalIn++;
+
+ int dif=v.lengthDif();
+ deltaLenIn+=dif;
+ }
+
+ if(verbose){System.err.println("Initial: \t"+initial.size());}
+
+ int merged=mergeAll2(initial, tsw);
+
+ // Drop the reference so the block's list can be collected before the next block is loaded.
+ initial=null;
+ if(verbose){System.err.println("Merged: \t"+merged);}
+
+ }
+
+ if(tsw!=null){
+ tsw.poison();
+ if(DELETE_INPUT){
+ // Wait for the writer to finish before deleting inputs: up to 10 join(10000) attempts.
+ for(int i=0; i<10 && tsw.isAlive(); i++){
+ try {
+ tsw.join(10000);
+ } catch (InterruptedException e) {
+ // Interrupted while waiting; report it and let the loop try again.
+ e.printStackTrace();
+ }
+ }
+ if(tsw.isAlive()){
+ System.err.println(tsw.getClass().getName()+" for "+outName+" refused to die.");
+ assert(false);
+ }
+ }
+ }
+
+ if(DELETE_INPUT){
+ for(final long key : keys){
+ String blockname=GenerateVarlets2.fname(key, inName);
+// System.out.println("Deleting "+blockname);
+ // The boolean result of delete() is ignored, so a failed deletion goes unreported.
+ new File(blockname).delete();
+ }
+ }
+ }
+
+
+
+
+
+ /**
+  * Sorts the varlets, groups consecutive ones that are equal to the first member
+  * of the current group, merges each group with mergeEqualVarlets, and passes
+  * the result to processMergedVar.
+  * The input list is consumed: each slot is set to null as it is read.
+  * @param vars Varlets from one block; may be null or empty
+  * @param tsw Destination writer, forwarded to processMergedVar
+  * @return Number of groups merged (counted whether or not processMergedVar kept the result)
+  */
+ private final int mergeAll2(ArrayList<Varlet> vars, TextStreamWriter tsw){
+ if(vars==null || vars.size()==0){return 0;}
+
+ Collections.sort(vars);
+ int out=0;
+
+ //Holds the run of equal varlets currently being accumulated.
+ ArrayList<Varlet> temp=new ArrayList<Varlet>(64);
+ for(int i=0; i<vars.size(); i++){
+// while(vars.get(i).beginLoc<3746582){i++;}
+// Varlet v=vars.get(i);
+ //set(i, null) fetches the element and clears the slot in one step.
+ final Varlet v=vars.set(i, null);
+// System.err.println("Grabbed "+v.beginLoc+" ~ "+v.call);
+ if(temp.isEmpty()){
+// System.err.println("Adding "+v.beginLoc+" ~ "+v.call);
+ temp.add(v);
+ }else{
+ if(v.equals(temp.get(0))){
+ temp.add(v);
+// System.err.println("Adding "+v.beginLoc+" ~ "+v.call);
+ }else{
+// System.err.println("Merging "+temp.size()+" x "+v.beginLoc+" ~ "+v.call);
+ //A different varlet ends the run: flush it, then start a new run with v.
+ Varlet result=mergeEqualVarlets(temp);
+
+ processMergedVar(result, tsw);
+ out++;
+
+ temp.clear();
+ temp.add(v);
+ }
+ }
+ }
+
+ //Flush the final run.  NOTE(review): only this flush pre-checks MIN_READS_TO_KEEP; the in-loop flush relies on processMergedVar.
+ if(!temp.isEmpty()){
+ if(temp.size()>=MIN_READS_TO_KEEP){
+ Varlet result=mergeEqualVarlets(temp);
+ out++;
+ processMergedVar(result, tsw);
+ }
+ temp.clear();
+ }
+
+ return out;
+ }
+
+
+ /**
+  * Decides whether a merged varlet is kept and, if so, writes its text form.
+  * Varlets with fewer than MIN_READS_TO_KEEP reads are always rejected.  Varlets
+  * with exactly MIN_READS_TO_KEEP reads must also meet the *_AT_MIN_READS limits:
+  * quality at least the minimum, errors and expected errors no greater than
+  * their maxima, and (optionally) at least one paired read.
+  * When filtering is disabled the kept counters are updated here; otherwise
+  * filterLight maintains them.
+  * @param v Merged varlet; null is rejected
+  * @param tsw Output writer; may be null, in which case nothing is written
+  * @return true if the varlet was kept
+  */
+ private final boolean processMergedVar(Varlet v, TextStreamWriter tsw){
+
+     if(v==null){return false;}
+     if(v.numReads<MIN_READS_TO_KEEP){return false;}
+     if(v.numReads==MIN_READS_TO_KEEP){
+         //The MAX_* constants are upper bounds, so reject only when they are exceeded.
+         if(v.maxVarQuality()<MIN_QUALITY_AT_MIN_READS ||
+                 v.errors>MAX_ERRORS_AT_MIN_READS ||
+                 v.expectedErrors>MAX_EXPECTED_ERRORS_AT_MIN_READS ||
+                 (v.paired<1 && REQUIRE_PAIRED_AT_MIN_READS)){
+             return false;
+         }
+     }
+
+     final boolean keep;
+
+     if(filter){
+         keep=filterLight(v);
+     }else{
+         keep=true;
+         totalKept++;
+         scoreKept+=v.score();
+     }
+
+     //processFile may legitimately pass a null writer.
+     if(keep && tsw!=null){
+         StringBuilder sb=v.toText();
+         sb.append('\n');
+         tsw.print(sb);
+     }
+     return keep;
+ }
+
+
+ private final boolean filterLight(Varlet v){
+ int dropped=0;
+
+ int dif=v.lengthDif();
+// deltaLenIn+=dif;
+
+ boolean passes=true;
+ if(v.varType==Variation.NOCALL){
+ passes=false;
+ }else if(v.numSemiUniqueReads<2){
+ passes=false;
+ }else if(v.endDist<6 || v.tailDist<10){
+ passes=false;
+ }else if(v.maxVarQuality()<24){
+ passes=false;
+ }else if(v.expectedErrors>2){
+ passes=false;
+ }
+
+ if(passes && STRICT){
+ passes=passesFilterLight(v);
+ }
+
+ if(passes){
+ if(v.varType==Variation.NOREF){totalKeptNR++;}
+ else if(v.varType==Variation.SNP){snpKept++;}
+ else if(v.varType==Variation.DEL){
+ delKept++;
+ // delLenKept-=v.lengthRef();
+ delLenKept+=dif;
+ }
+ else if(v.varType==Variation.INS){
+ insKept++;
+ // insLenKept+=v.lengthVar();
+ insLenKept+=dif;
+ }
+ else if(v.varType==Variation.DELINS){
+ subKept++;
+ // subLenKept+=(v.lengthRef()-v.lengthVar());
+ subLenKept+=dif;
+ }
+ totalKept++;
+ scoreKept+=v.score();
+ deltaLenKept+=dif;
+ }else{
+ if(v.varType==Variation.NOREF){totalDroppedNR++;}
+ dropped++;
+ scoreDropped+=v.score();
+ }
+
+ totalDropped+=dropped;
+ return passes;
+ }
+
+ /**
+  * Stricter screen applied by filterLight when STRICT is set.
+  * The thresholds are chosen by v.minStrandReads(): at least 2, exactly 1, or less than 1.
+  * Most thresholds tighten as minStrandReads drops; the minimum score is the exception (8200, 8020, 7670).
+  * @param v Varlet to test
+  * @return true if every threshold of the applicable tier is met
+  */
+ private static boolean passesFilterLight(Varlet v){
+ //Limits shared by all tiers.
+ if(v.endDist<4){return false;}
+ if(v.tailDist<10){return false;}
+
+ //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required.
+ //Tier 1: minStrandReads of 2 or more.
+ if(v.minStrandReads()>=2){
+
+ if(v.errors>2){return false;}
+ if(v.expectedErrors>1.4f){return false;}
+ // if(v.expectedErrors-v.errors>3f){return false;}
+ if(v.maxReadQuality()<17){return false;}
+ if(v.avgReadQuality()<13){return false;}
+ if(v.maxVarQuality()<26){return false;}
+ if(v.avgVarQuality()<17){return false;}
+ if(v.numReads<3){return false;}
+ if(v.numSemiUniqueReads<3){return false;}
+ if(v.numUniqueReads<2){return false;}
+// if(v.paired<3){return false;}
+ if(v.score()<8200){return false;}
+
+ //Tier 2: minStrandReads of exactly 1.
+ }else if(v.minStrandReads()>=1){
+ if(v.endDist<7){return false;}
+ if(v.tailDist<12){return false;}
+
+ if(v.errors>2){return false;}
+ if(v.expectedErrors>1.1f){return false;}
+ // if(v.expectedErrors-v.errors>3f){return false;}
+ if(v.maxReadQuality()<18){return false;}
+ if(v.avgReadQuality()<14){return false;}
+ if(v.maxVarQuality()<28){return false;}
+ if(v.avgVarQuality()<18){return false;}
+ if(v.numReads<4){return false;}
+ if(v.numSemiUniqueReads<3){return false;}
+ if(v.numUniqueReads<2){return false;}
+// if(v.paired<3){return false;}
+ if(v.score()<8020){return false;}
+ //Tier 3: minStrandReads below 1.
+ }else{
+ if(v.endDist<8){return false;}
+ if(v.tailDist<14){return false;}
+
+ if(v.errors>0){return false;}
+ if(v.expectedErrors>0.5f){return false;}
+ // if(v.expectedErrors-v.errors>2f){return false;}
+ if(v.maxReadQuality()<21){return false;}
+ if(v.avgReadQuality()<17){return false;}
+ if(v.maxVarQuality()<30){return false;}
+ if(v.avgVarQuality()<21){return false;}
+ if(v.numReads<6){return false;}
+ if(v.numSemiUniqueReads<5){return false;}
+ if(v.numUniqueReads<3){return false;}
+// if(v.paired<5){return false;}
+ if(v.score()<7670){return false;}
+ }
+ return true;
+ }
+
+ //Per-thread statistics for varlets that passed filterLight.
+ //The *LenKept fields accumulate v.lengthDif(), i.e. signed length differences.
+ private long deltaLenKept=0;
+ private long snpKept=0;
+ private long delKept=0;
+ private long insKept=0;
+ private long subKept=0;
+ private long delLenKept=0;
+ private long insLenKept=0;
+ private long subLenKept=0;
+
+ //Per-thread input statistics, updated in processFile before merging.
+ private long deltaLenIn=0;
+ private long totalIn=0;
+ private long totalInNR=0;
+
+ //Kept/dropped totals and score sums; the NR variants count only NOREF varlets.
+ private long totalKept=0;
+ private long totalKeptNR=0;
+ private long totalDropped=0;
+ private long totalDroppedNR=0;
+ private long scoreKept=0;
+ private long scoreDropped=0;
+
+ /** Input name passed to processFile */
+ private final String fname1;
+ /** Output name passed to processFile */
+ private final String fname2;
+ /** Whether processMergedVar applies filterLight */
+ private final boolean filter;
+ /** Chromosome used to look up block keys */
+ private final int chrom;
+ }
+ }
+
+ /**
+  * Adjusts the count of active worker threads under THREADLOCK.
+  * A positive x blocks while activeThreads is at or above THREADS; a negative x
+  * releases slots and wakes any callers waiting for one, so they do not have
+  * to sit out the 200ms poll interval.
+  * @param x Amount to add to activeThreads (negative to release)
+  * @return The updated value of activeThreads
+  */
+ private static int addThread(int x){
+     synchronized(THREADLOCK){
+         while(x>0 && activeThreads>=THREADS){
+             try {
+                 //Timed wait kept as a safety net in case a notification is missed.
+                 THREADLOCK.wait(200);
+             } catch (InterruptedException e) {
+                 e.printStackTrace();
+             }
+         }
+         activeThreads+=x;
+         if(x<0){THREADLOCK.notifyAll();}
+         return activeThreads;
+     }
+ }
+
+
+ //Global counterparts of the per-thread SVThread counters.
+ //NOTE(review): nothing in the visible code writes these; presumably they are aggregated elsewhere in this class - confirm.
+ public static long deltaLenKept_global=0;
+ public static long deltaLenIn_global=0;
+
+ public static long snpKept_global=0;
+ public static long delKept_global=0;
+ public static long insKept_global=0;
+ public static long subKept_global=0;
+ public static long delLenKept_global=0;
+ public static long insLenKept_global=0;
+ public static long subLenKept_global=0;
+
+ public static long totalIn_global=0;
+ public static long totalInNR_global=0;
+ public static long totalKept_global=0;
+ public static long totalDropped_global=0;
+ public static long totalKeptNR_global=0;
+ public static long totalDroppedNR_global=0;
+ public static long scoreKept_global=0;
+ public static long scoreDropped_global=0;
+
+ /** Number of worker threads currently holding a slot; guarded by THREADLOCK */
+ private static int activeThreads=0;
+
+ /** Monitor used by addThread.  Created with new String so it is a distinct object rather than the literal itself. */
+ private static final String THREADLOCK=new String("THREADLOCK");
+ /** Maximum number of concurrently active worker threads enforced by addThread */
+ private static int THREADS=7;
+ /** If true, processFile deletes the input block files after the writer finishes */
+ private static boolean DELETE_INPUT=false;
+ /** Merged varlets with fewer reads than this are discarded */
+ public static int MIN_READS_TO_KEEP=1;
+ //Extra limits applied by processMergedVar to varlets with exactly MIN_READS_TO_KEEP reads.
+ public static final int MIN_QUALITY_AT_MIN_READS=14;
+ public static final int MAX_ERRORS_AT_MIN_READS=2;
+ public static final int MAX_EXPECTED_ERRORS_AT_MIN_READS=4;
+ public static final boolean REQUIRE_PAIRED_AT_MIN_READS=false;
+ /** If true, filterLight additionally applies passesFilterLight */
+ public static boolean STRICT=false;
+ //NOTE(review): VSTRICT and USTRICT are not referenced in the visible code.
+ public static boolean VSTRICT=false;
+ public static boolean USTRICT=false;
+
+ /** Enables progress messages on stderr in processFile */
+ public static final boolean verbose=false;
+}
+}
diff --git a/current/var/VarLine.java b/current/var/VarLine.java
new file mode 100755
index 0000000..81e5b0e
--- /dev/null
+++ b/current/var/VarLine.java
@@ -0,0 +1,249 @@
+package var;
+
+import java.io.Serializable;
+
+import dna.Gene;
+
+
+public class VarLine extends Variation implements Serializable, Cloneable{
+
+ public static final long serialVersionUID = -4089933371294357462L;
+
+// >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
+
+ /** Creates an empty VarLine; fields keep their declared defaults. */
+ public VarLine(){}
+
+ /**
+  * Parses one tab-delimited line of a var file.
+  * Columns, by index: 1 ploidy, 2 haplotype, 3 chromosome, 4 begin, 5 end, 6 varType,
+  * 7 reference, 8 alleleSeq; the trailing score/hapLink columns depend on version.
+  * Column 0 (locus) is ignored.
+  * @param s The raw line
+  * @param version Format version; below 2 reads totalScore and hapLink from columns 9 and 10,
+  * otherwise reads varScoreVAF, varScoreEAF, varQuality and hapLink from columns 9 to 12
+  */
+ public VarLine(String s, float version){
+ //Limit of -1 keeps trailing empty columns.
+ String[] line=s.split("\t", -1);
+
+ //Normalize: trimmed, with empty cells replaced by null.
+ for(int i=0; i<line.length; i++){
+ line[i]=line[i].trim();
+ if(line[i].length()<1){
+ line[i]=null;
+ }
+ }
+
+
+
+// varType=(byte)find((line.length>6 ? line[6] : "null"), varTypeMap);
+ Byte b=varTypeMap2.get(line.length>6 ? line[6] : "null");
+ //NOTE(review): the message indexes line[6] even when line.length<=6, and unboxing b below throws NPE if assertions are off.
+ assert(b!=null) : "Can't find "+line[6]+" in "+varTypeMap2.keySet()+"\n\nLine: "+s+"\n";
+ varType=b;
+
+
+// locus=Integer.parseInt(line[0]);
+
+ b=(Byte)ploidyMap.get(line[1]);
+ assert(b!=null) : "\n\n"+line[1]+"\n\n"+s+"\n\n";
+ ploidy=b;
+
+
+ haplotype=(byte)find(line[2], haploMap);
+ assert(haplotype>=0) : line[2];
+
+ chromosome=Gene.toChromosome(line[3]);
+ assert(chromosome>0) : line[3]+" -> "+line[3].substring(3);
+
+ beginLoc=Integer.parseInt(line[4]);
+ //The end column is reduced by one, then clamped so that endLoc is never before beginLoc.
+ int tempInt=Integer.parseInt(line[5])-1; //Note: 0,1 based
+ tempInt=max(tempInt, beginLoc);
+ endLoc=tempInt;
+
+ String temp;
+
+ //A "?" in the reference or allele column is treated as missing.
+ temp=line.length>7 ? line[7] : null;
+ if("?".equals(temp)){temp=null;}
+ ref=temp;
+
+ temp=line.length>8 ? line[8] : null;
+ if("?".equals(temp)){temp=null;}
+ call=temp;
+
+
+ if(version<2){
+
+ //Missing score or link columns are stored as -1.
+ totalScore=((line.length<=9 || line[9]==null || line[9].length()<1) ? -1 : Integer.parseInt(line[9]));
+ hapLink=((line.length<=10 || line[10]==null || line[10].length()<1) ? -1 : Integer.parseInt(line[10]));
+
+ assert(beginLoc<=endLoc) : s;
+
+ // System.out.println("\n"+this+"\n"+new Variation(this)+"\n");
+ }else{
+
+// return "#locus\tploidy\tallele\tchromosome\tbegin\tend\tvarType\treference\talleleSeq\t
+// varScoreVAF\tvarScoreEAF\tvarQuality\thapLink\txRef
+
+ //varScoreEAF and VQ are parsed (which validates them) but not stored.
+ int varScoreVAF=((line.length<=9 || line[9]==null || line[9].length()<1) ? -1 : Integer.parseInt(line[9]));
+ int varScoreEAF=((line.length<=10 || line[10]==null || line[10].length()<1) ? -1 : Integer.parseInt(line[10]));
+ byte VQ=((line.length<=11 || line[11]==null || line[11].length()<1) ? (byte)0 : (byte)find(line[11], VQARRAY));
+
+ totalScore=varScoreVAF;
+ hapLink=((line.length<=12 || line[12]==null || line[12].length()<1) ? -1 : Integer.parseInt(line[12]));
+
+ assert(beginLoc<=endLoc) : s;
+
+// System.out.println("\n"+this+"\n"+new Variation(this)+"\n");
+ }
+
+ //Types that carry an allele must have a call.  NOTE(review): "+line+" prints the array identity, not its contents.
+ assert(!((varType==Variation.INS || varType==Variation.DELINS || varType==Variation.SNP)
+ && call==null)) : "\nversion="+version+"\n"+s+"\n"+line+"\nline.ref="+ref+"\nline.call="+call+"\nref="+ref+"\ncall="+call;
+
+ intern();
+ }
+
+ /** Returns a copy of this VarLine produced by the superclass clone. */
+ public VarLine clone(){
+     return (VarLine) super.clone();
+ }
+
+ /**
+  * Splits a line whose haplotype is 3 ("all") into two clones,
+  * one on haplotype 1 and one on haplotype 2.
+  * @return Array of the two clones, haplotype 1 first
+  */
+ public VarLine[] splitLine(){
+     assert(haplotype==3) : this;
+     final VarLine first=this.clone();
+     final VarLine second=this.clone();
+     final VarLine[] pair=new VarLine[] {first, second};
+     assert(this.equals(pair[0]) && pair[0].equals(this));
+     first.haplotype=1;
+     second.haplotype=2;
+     return pair;
+ }
+
+ /** Clones this point-type line into a REFPOINT with ref and call cleared. */
+ public VarLine spawnEqualPoint(){
+     assert(this.isPoint());
+     final VarLine point=this.clone();
+     point.varType=REFPOINT;
+     point.ref=null;
+     point.call=null;
+     return point;
+ }
+
+ /**
+  * Builds a REFPOINT line at a single location.
+  * @param chrom Chromosome number
+  * @param loc Used as both beginLoc and endLoc
+  * @param hap Haplotype
+  */
+ public static VarLine makeEqualPoint(byte chrom, int loc, byte hap){
+     final VarLine point=new VarLine();
+     point.varType=REFPOINT;
+     point.haplotype=hap;
+     point.chromosome=chrom;
+     point.beginLoc=loc;
+     point.endLoc=loc;
+     return point;
+ }
+
+ /** Returns the superclass string form, bypassing this class's toString. */
+ public String toSuperString(){
+     return super.toString();
+ }
+
+
+ /**
+  * Tab-delimited text form: ploidy, haplotype, chromosome, begin, end, type,
+  * ref, call, totalScore, hapLink; each field is followed by a tab.
+  * Null ref/call and a score or link of -1 are written as empty fields.
+  */
+ public String toString(){
+     final StringBuilder sb=new StringBuilder(256);
+
+ // sb.append(locus+"\t");
+     sb.append(ploidyMap.get(ploidy)).append('\t');
+     sb.append(haploMap[haplotype]).append('\t');
+     sb.append("chr"+Gene.chromCodes[chromosome]).append('\t');
+     sb.append(beginLoc).append('\t');
+     sb.append(endLoc).append('\t');
+
+     sb.append(varTypeMap[varType]).append('\t');
+     if(ref!=null){sb.append(ref);}
+     sb.append('\t');
+     if(call!=null){sb.append(call);}
+     sb.append('\t');
+     if(totalScore!=-1){sb.append(totalScore);} //TODO: Note the collision with a true -1
+     sb.append('\t');
+     if(hapLink!=-1){sb.append(hapLink);} //TODO "
+     sb.append('\t');
+
+     return sb.toString();
+ }
+
+ /**
+  * Returns the header line of the source format that has totalScore and hapLink columns
+  * (the layout the constructor reads when version is below 2).
+  * The later layout is kept in the comments for reference.
+  */
+ public static String sourceHeader(){
+ return "#locus\tploidy\tallele\tchromosome\tbegin\tend\tvarType\treference\talleleSeq\ttotalScore\thapLink\txRef";
+// return "#locus\tploidy\tallele\tchromosome\tbegin\tend\tvarType\treference\talleleSeq\t
+// varScoreVAF\tvarScoreEAF\tvarQuality\thapLink\txRef
+ }
+ //locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
+
+ /**
+  * Writes this line in the column order of sourceHeader(), with a literal 0 as the locus.
+  * The end column is beginLoc for insertions and endLoc+1 otherwise.
+  */
+ public String toSourceString(){
+     final StringBuilder sb=new StringBuilder(256);
+
+     sb.append(0).append('\t');
+     sb.append(ploidyMap.get(ploidy)).append('\t');
+     sb.append(haploMap[haplotype]).append('\t');
+     sb.append("chr"+Gene.chromCodes[chromosome]).append('\t');
+     sb.append(beginLoc).append('\t');
+
+     final int endColumn=(varType==INS ? beginLoc : endLoc+1);
+     sb.append(endColumn).append('\t');
+
+     sb.append(varTypeMap[varType]).append('\t');
+     if(ref!=null){sb.append(ref);}
+     sb.append('\t');
+     if(call!=null){sb.append(call);}
+     sb.append('\t');
+     if(totalScore!=-1){sb.append(totalScore);} //TODO: Note the collision with a true -1
+     sb.append('\t');
+     if(hapLink!=-1){sb.append(hapLink);} //TODO "
+     sb.append('\t');
+
+     return sb.toString();
+ }
+
+
+ /**
+  * Compact text form: haplotype name padded to 3 characters, location, type, ref and call,
+  * each followed by a tab.  Score and link are omitted.
+  */
+ public String toShortString(){
+     final StringBuilder sb=new StringBuilder(256);
+
+     sb.append(haploMap[haplotype]);
+     for(int len=sb.length(); len<3; len++){sb.append(' ');}
+     sb.append('\t');
+     sb.append(locationString()).append('\t');
+
+     sb.append(varTypeMap[varType]).append('\t');
+     if(ref!=null){sb.append(ref);}
+     sb.append('\t');
+     if(call!=null){sb.append(call);}
+     sb.append('\t');
+
+     return sb.toString();
+ }
+
+ /** Returns the smaller of two ints. */
+ @SuppressWarnings("unused")
+ private static final int min(int x, int y){return Math.min(x, y);}
+ /** Returns the larger of two ints. */
+ private static final int max(int x, int y){return Math.max(x, y);}
+
+
+ /**
+  * Compares against any Variation.  Objects whose class is exactly VarLine
+  * use the haplotype-aware comparison; everything else uses the superclass ordering.
+  */
+ @Override
+ public int compareTo(Variation other) {
+     return (other.getClass()==VarLine.class) ? compareTo((VarLine)other) : super.compareTo(other);
+ }
+
+ /** Superclass ordering first, with haplotype as the tie-breaker. */
+ public int compareTo(VarLine other) {
+     final int base=super.compareTo((Variation)other);
+     return (base!=0) ? base : haplotype-other.haplotype;
+ }
+
+ /**
+  * Equality against an arbitrary object.  Returns false for null, as the
+  * Object.equals contract requires, instead of throwing.  Objects whose class
+  * is exactly VarLine use the haplotype-aware comparison; anything else is
+  * delegated to the superclass.
+  */
+ public boolean equals(Object other){
+     if(other==null){return false;}
+     if(other.getClass()==VarLine.class){
+         return equals((VarLine)other);
+     }
+     return super.equals(other);
+ }
+
+ /** Two VarLines are equal when compareTo reports 0; null is never equal. */
+ public boolean equals(VarLine other){
+     return other!=null && compareTo(other)==0;
+ }
+
+ /** Equality against a plain Variation, delegated to the superclass. */
+ public boolean equals(Variation other){
+     return super.equals(other);
+ }
+
+ /** Ploidy code, looked up from the ploidy column via ploidyMap */
+ public byte ploidy;
+
+ /** Which copy this is on */
+ public byte haplotype;
+ /** Score from column 9 of the source line, or -1 if absent */
+ public int totalScore;
+ /** Haplotype link id from the source line, or -1 if absent */
+ public int hapLink;
+
+ /** Recognized values of the varQuality column; the constructor looks values up by index with find() */
+ public static final String[] VQARRAY=new String[] {"?", "VQLOW", "VQHIGH"};
+
+}
diff --git a/current/var/Variation.java b/current/var/Variation.java
new file mode 100755
index 0000000..4b9fea3
--- /dev/null
+++ b/current/var/Variation.java
@@ -0,0 +1,869 @@
+package var;
+import java.io.Serializable;
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.HashMap;
+import java.util.Set;
+
+
+import dna.Data;
+import dna.Gene;
+import dna.GeneSet;
+import dna.Range;
+import driver.Search;
+
+
+
+public class Variation implements Comparable<Variation>, Serializable, Cloneable {
+
+// >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -3847258470952802740l;
+
+ /**
+  * Builds a Variation from the location, type, ref and call of a VarLine.
+  * With assertions enabled, verifies that allele-carrying types have a call and
+  * that the new object and the source line are equal in both directions.
+  * @param line Source line
+  */
+ public Variation(VarLine line){
+// this(line.chromosome, line.beginLoc, line.endLoc, line.xRef, line.varType, line.ref, line.call);
+ this(line.chromosome, line.beginLoc, line.endLoc, line.varType, line.ref, line.call);
+
+ assert(!((varType==INS || varType==DELINS || varType==SNP) && call==null)) : "\n"+line+"\n"+this+
+ "\nline.ref="+line.ref+"\nline.call="+line.call+"\nref="+ref+"\ncall="+call;
+
+ assert(beginLoc<=endLoc) : line.toString();
+
+ assert(this.equals(line)) : "\n\n"+this+"\n!=\n"+line;
+ assert(line.equals(this)) : "\n\n"+this+"\n!=\n"+line;
+
+
+// if(xRef==11429487){
+// System.out.println("\n"+this.toString());
+// }
+ }
+
+// public Variation(GeneVarLine line){
+//// this(line.chromosome, line.beginLoc, line.endLoc, line.xRef, line.varType, line.ref, line.call);
+// this(line.chromosome, line.beginLoc, line.endLoc, line.xRef, line.xRefArray, line.varType, line.ref, line.call);
+//
+// assert(beginLoc<=endLoc) : line.toString();
+//
+// assert(this.equals(line)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line;
+// assert(line.equals(this)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line;
+//
+// }
+
+ /**
+  * Copy constructor.  Copies location, type, ref and call, and (with assertions
+  * enabled) checks that the copy equals the original in both directions.
+  * @param line Variation to copy
+  */
+ public Variation(Variation line){
+ this(line.chromosome, line.beginLoc, line.endLoc, line.varType, line.ref, line.call);
+
+ assert(beginLoc<=endLoc) : line.toString();
+
+ assert(this.equals(line)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line;
+ assert(line.equals(this)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line;
+
+ }
+
+ /**
+  * Field-wise constructor.
+  * @param chr Chromosome number
+  * @param bLoc Begin location
+  * @param eLoc End location; must not be less than bLoc
+  * @param vType One of the type constants of this class
+  * @param rf Reference sequence; stored only for types that keep a ref (see setDetails)
+  * @param ca Called sequence; stored only for types that keep a call (see setDetails)
+  */
+ public Variation(int chr, int bLoc, int eLoc, byte vType, String rf, String ca){
+ chromosome=chr;
+ beginLoc=bLoc;
+ endLoc=eLoc;
+ varType=vType;
+
+ //Stores ref/call according to the type and interns them.
+ setDetails(vType, rf, ca);
+
+ assert(beginLoc<=endLoc) : toString();
+
+ }
+
+ /** Creates an empty Variation; fields keep their declared defaults. */
+ public Variation(){}
+
+
+ /**
+  * Returns a shallow copy made by Object.clone().
+  * @return The copy, or null if cloning is not supported (the stack trace is printed)
+  */
+ public Variation clone(){
+     try {
+         return (Variation) super.clone();
+     } catch (CloneNotSupportedException e) {
+         // TODO Auto-generated catch block
+         e.printStackTrace();
+         return null;
+     }
+ }
+
+
+ /**
+  * Converts lines to a set of distinct Variations.
+  * @param array Lines to convert
+  * @param retainEqual If false, Variations of type REF are left out
+  * @return The set of Variations; for equal Variations the first one encountered is the one kept
+  */
+ public static final HashSet<Variation> toVariations(VarLine[] array, boolean retainEqual){
+     final HashSet<Variation> unique=new HashSet<Variation>(array.length);
+     for(int i=0; i<array.length; i++){
+         final Variation candidate=new Variation(array[i]);
+         if(!retainEqual && candidate.varType==Variation.REF){continue;}
+         //add() leaves the set untouched when an equal element is already present.
+         unique.add(candidate);
+     }
+     return unique;
+ }
+
+ /**
+  * Converts several arrays of lines into one sorted array of distinct Variations.
+  * An empty outer array yields an empty result.
+  * @param array Arrays of lines
+  * @param retainEqual If false, Variations of type REF are left out
+  * @return Sorted distinct Variations
+  */
+ public static final Variation[] toVariationArray(VarLine[][] array, boolean retainEqual){
+     final HashSet<Variation> set=new HashSet<Variation>();
+     for(VarLine[] lines : array){
+         set.addAll(toVariations(lines, retainEqual));
+     }
+     final Variation[] vars=set.toArray(new Variation[set.size()]);
+     Arrays.sort(vars);
+     return vars;
+ }
+
+ /**
+  * Converts one array of lines into a sorted array of distinct Variations.
+  * @param array Lines to convert
+  * @param retainEqual If false, Variations of type REF are left out
+  * @return Sorted distinct Variations
+  */
+ public static final Variation[] toVariationArray(VarLine[] array, boolean retainEqual){
+     final HashSet<Variation> set=toVariations(array, retainEqual);
+     final Variation[] vars=set.toArray(new Variation[set.size()]);
+     Arrays.sort(vars);
+     return vars;
+ }
+
+ /**
+  * Copies a set into a newly allocated, sorted array of its element type.
+  * @param c Element class, used to create the array
+  * @param set Elements to copy; not modified
+  * @return Sorted array containing every element of the set
+  */
+ @SuppressWarnings("unchecked")
+ public static final <X extends Comparable<? super X>> X[] toArray(Class<X> c, Set<X> set){
+     //An exactly-sized array is filled in place by Set.toArray, so no second copy is needed.
+     final X[] array=set.toArray((X[])Array.newInstance(c, set.size()));
+     Arrays.sort(array);
+     return array;
+ }
+
+
+ /**
+  * Returns the lines that intersect one of the chromosome's gene ranges,
+  * each at most once.  Lines of type REF and NOREF are never returned.
+  * @param variances Lines to filter
+  * @param chrom Chromosome used to look up the ranges
+  * @param nearby If true use Data.geneNearbyRangeMatrix, otherwise Data.geneCodeAndExonRangeMatrix
+  * @return Matching lines, in input order
+  */
+ public static VarLine[] filterCodingVariances(VarLine[] variances, int chrom, boolean nearby){
+     Range[] ranges=(nearby ? Data.geneNearbyRangeMatrix(chrom) : Data.geneCodeAndExonRangeMatrix(chrom));
+
+     ArrayList<VarLine> list=new ArrayList<VarLine>(8+variances.length/8);
+
+     for(VarLine var : variances){
+
+         if(var.varType!=VarLine.REF && var.varType!=VarLine.NOREF){
+             int rnum=Search.findPointBinary(var.beginLoc, ranges);
+
+             //Scan forward from the located range.  The scan starts at rnum itself, so a
+             //separate check of ranges[rnum] would add the same line twice.
+             for(int i=rnum; i<ranges.length; i++){
+                 Range r=ranges[i];
+                 if(r.a>var.endLoc){break;} //Out of range
+
+                 if(r.intersects(var.beginLoc, var.endLoc)){
+                     list.add(var);
+                     break;
+                 }
+             }
+         }
+     }
+
+     return list.toArray(new VarLine[list.size()]);
+ }
+
+
+
+
+ /**
+  * Generates an array of non-overlapping Ranges, sorted by index, ascending.
+  * To each is attached (in obj1) a list of all overlapping Variations from the input array.
+  * Ranges that intersect are merged and their Variation lists are concatenated.
+  * @param va Variations to cover; may be null or empty
+  * @return The array of ranges; empty if the input was null or empty
+  */
+ public static Range[] makeVarRanges(Variation[] va){
+ // System.out.println("va.length="+va.length);
+
+ if(va==null || va.length==0){
+ return new Range[0];
+ }
+
+ //Step 1: one single-variation Range per input element.
+ ArrayList<Range> ra=new ArrayList<Range>(va.length);
+ for(Variation v : va){
+ Range r=new Range(v.beginLoc, v.endLoc);
+ r.obj1=new ArrayList<Variation>();
+ ((ArrayList<Variation>)r.obj1).add(v);
+ ra.add(r);
+ }
+ Collections.sort(ra);
+ //Step 2: sweep the sorted ranges, merging each one into the current range while they intersect.
+ ArrayList<Range> ra2=new ArrayList<Range>(va.length);
+ Range current=null;
+ // System.out.println("ra.size="+ra.size());
+ for(Range r : ra){
+ // System.out.println("\ncurrent="+current+", r="+r);
+ if(current==null){current=r;}
+ else if(current.intersects(r)){
+ // System.out.println("merged");
+ //The merged range reuses current's list and absorbs r's variations.
+ Range temp=current.merge(r);
+ temp.obj1=current.obj1;
+ ((ArrayList<Variation>)temp.obj1).addAll((ArrayList<Variation>)r.obj1);
+ current=temp;
+ }else{
+ // System.out.println("added");
+ ra2.add(current);
+ current=r;
+ }
+ // System.out.println("current="+current+", r="+r);
+ }
+ // System.out.println("\ncurrent="+current);
+ // System.out.println("ra2.size="+ra2.size());
+ //The last range in progress has not been emitted yet.
+ assert(current!=null); //Note: this could be null if input was empty, I guess...
+ assert(ra2.size()==0 || ra2.get(ra2.size()-1)!=current);
+ ra2.add(current);
+ return ra2.toArray(new Range[ra2.size()]);
+ }
+
+ /** Alias for xRefToId. */
+ public static final int toRsid(String s){return xRefToId(s);}
+
+ /**
+  * Extracts the numeric id from a cross-reference such as "dbsnp:rs123".
+  * Everything up to the first ':' is dropped, then leading non-digit characters are skipped.
+  * @param s Cross-reference text; may be null
+  * @return The id, or -1 if s is null, empty, or contains no digit after the prefix
+  * @throws NumberFormatException if the text after the first digit is not a valid int
+  */
+ public static final int xRefToId(String s){
+     if(s==null || s.length()==0){return -1;}
+
+     final int colon=s.indexOf(':');
+     final String body=(colon>=0 ? s.substring(colon+1) : s);
+
+     final int len=body.length();
+     int firstDigit=0;
+     while(firstDigit<len && !Character.isDigit(body.charAt(firstDigit))){firstDigit++;}
+
+     if(firstDigit>=len){
+         //A lone "." is the only digit-free value expected here.
+         assert(body.equals(".")) : body;
+         return -1;
+     }
+     return Integer.parseInt(body.substring(firstDigit));
+ }
+
+ /** Alias for xRefToIdArray. */
+ public static final int[] toRsidArray(String s){return xRefToIdArray(s);}
+
+ /**
+  * Parses a list of cross-references separated by ',' or ';'.
+  * @param s Delimited cross-references; may be null
+  * @return One id per token; null if s is null or empty, or if the only token has no id
+  */
+ public static final int[] xRefToIdArray(String s){
+     if(s==null || s.length()<1){return null;}
+     final String[] tokens=s.split("[,;]");
+     final int[] ids=new int[tokens.length];
+     int idx=0;
+     for(String token : tokens){
+         final int id=xRefToId(token);
+         ids[idx++]=id;
+         if(id==-1){
+             if(ids.length==1){return null;}
+
+             //This can be safely disabled. But it is best to fix this case by making the array smaller.
+             assert(false) : "Not a real rsID: "+s;
+         }
+     }
+     return ids;
+ }
+
+ /**
+  * Exact match: same chromosome, begin, end and type, and (for types that
+  * carry a call) the same call.
+  * @param line Variation to compare against; null never matches
+  */
+ public boolean matches(Variation line){
+     if(line==null){return false;}
+     final boolean sameSite=(chromosome==line.chromosome && beginLoc==line.beginLoc
+             && endLoc==line.endLoc && varType==line.varType);
+     return sameSite && matches(line.varType, line.ref, line.call);
+ }
+
+// public boolean matchesLoose(VarLine line){
+// if(line==null || chromosome!=line.chromosome || !intersects(line)){
+// return false;
+// }
+// if(isEqual() && line.isEqual()){return true;}
+// if(varType!=line.varType){return false;}
+// return matches(line.varType, line.ref, line.call);
+// }
+
+ /**
+  * Overlap and don't contradict each other.
+  * True variations must share type and call; reference types match any
+  * reference type; unsure types match any unsure type.
+  * @param line Line to compare against; null never matches
+  */
+ public boolean matchesLoose(VarLine line){
+     if(line==null || chromosome!=line.chromosome || !intersects(line)){
+         return false;
+     }
+
+     if(isTrueVariation()){
+         return varType==line.varType && matches(line.varType, line.ref, line.call);
+     }
+     if(isRef()){
+         return line.isRef();
+     }
+     assert(isUnsureVariation()) : this;
+     return line.isUnsureVariation();
+ }
+
+ /**
+  * Compares allele content for a given type.  REF, REFPOINT, DEL and NOCALL
+  * always match; every other type matches when the calls are equal.  Two
+  * absent (null) calls count as equal, so types stored without a call no
+  * longer cause a NullPointerException.
+  * @param type Variation type being compared
+  * @param ref2 Reference sequence of the other variation (not used)
+  * @param call2 Call of the other variation; may be null
+  */
+ private boolean matches(int type, String ref2, String call2){
+     if(type==REF || type==REFPOINT || type==DEL || type==NOCALL){
+         return true;
+     }
+     return call==null ? call2==null : call.equals(call2);
+ }
+
+ /**
+  * Stores ref and call according to the variation type, then interns them.
+  * Both fields are cleared first; each type keeps only the fields listed below.
+  * @param vt Variation type
+  * @param rf Candidate reference sequence
+  * @param ca Candidate called sequence
+  */
+ private void setDetails(byte vt, String rf, String ca){
+
+     ref=null;
+     call=null;
+
+     switch(vt){
+         //Types that keep both ref and call.
+         case SNP:
+         case DELINS:
+         case REFCON:
+         case REFINCON:
+         case PAR:
+         case NULL:
+             ref=rf;
+             call=ca;
+             break;
+
+         //Types that keep only the call.  (Unclear whether norefs need N or null refs.)
+         case INS:
+         case NOREF:
+             call=ca;
+             break;
+
+         //Type that keeps only the ref.
+         case DEL:
+             ref=rf;
+             break;
+
+         //Types that keep neither.  (Unclear whether nocalls need N or null calls.)
+         case REF:
+         case NOCALL:
+         case REFPOINT:
+             break;
+
+         default:
+             assert(false);
+     }
+     intern();
+ }
+
+ /** Location string using the stored coordinates unchanged (base 0). */
+ public String locationString(){
+     return locationString(0);
+ }
+
+ /**
+  * Formats the location as "(begin)" for a single position or "(begin - end)" otherwise.
+  * @param base Offset added to both coordinates; must be 0 or 1
+  */
+ public String locationString(int base){
+     assert(base==0 || base==1);
+
+     final StringBuilder sb=new StringBuilder();
+     sb.append('(').append(beginLoc+base);
+     if(beginLoc!=endLoc){
+         sb.append(" - ").append(endLoc+base);
+     }
+     return sb.append(')').toString();
+ }
+
+ /** Same as toString() here; subclasses override it to reach this class's form. */
+ public String toSuperString(){
+     return toString();
+ }
+
+ /** Text form with coordinates printed unchanged (base 0). */
+ public String toString(){
+     return toString(0);
+ }
+
+ /**
+  * Tab-delimited text form: chromosome (padded to 5 characters), location,
+  * type, ref, call, and a trailing tab.  Null ref/call are written as empty fields.
+  * @param base Offset added to the printed coordinates; must be 0 or 1
+  */
+ public String toString(int base){
+     final StringBuilder sb=new StringBuilder();
+
+     sb.append("chr"+Gene.chromCodes[chromosome]);
+     for(int len=sb.length(); len<5; len++){sb.append(' ');}
+     sb.append('\t').append(locationString(base)).append('\t');
+
+     sb.append(varTypeMap[varType]);
+
+     sb.append('\t');
+     if(ref!=null){sb.append(ref);}
+     sb.append('\t');
+     if(call!=null){sb.append(call);}
+
+     sb.append('\t');
+
+     return sb.toString();
+ }
+
+ /**
+  * Tab-delimited form: chromosome, begin, end, type, ref, call, each followed by a tab.
+  * The end column is beginLoc for insertions and endLoc+1 otherwise.
+  */
+ public String toSourceString(){
+     final StringBuilder sb=new StringBuilder(64);
+
+     sb.append("chr"+Gene.chromCodes[chromosome]).append('\t');
+     sb.append(beginLoc).append('\t');
+
+     final int endColumn=(varType==INS ? beginLoc : endLoc+1);
+     sb.append(endColumn).append('\t');
+
+     sb.append(varTypeMap[varType]).append('\t');
+     if(ref!=null){sb.append(ref);}
+     sb.append('\t');
+     if(call!=null){sb.append(call);}
+     sb.append('\t');
+
+     return sb.toString();
+ }
+
+ /** Column names for the tab-delimited form.  Note that toSourceString() does not write the rsID column. */
+ public static String header(){
+     return "chrom\tstart\tstop\ttype\tref\tcall\trsID";
+ }
+
+ /** Compact form: location and type, followed by ref and call only when present. */
+ public String toShortString(){
+     final StringBuilder sb=new StringBuilder();
+
+     sb.append(locationString()).append('\t');
+     sb.append(varTypeMap[varType]);
+
+     if(ref!=null){sb.append('\t').append(ref);}
+     if(call!=null){sb.append('\t').append(call);}
+
+     return sb.toString();
+ }
+
+
+ /**
+  * Linear search for a string in an array.
+  * @param a String to look for; null is treated as not found rather than throwing
+  * @param array Candidates
+  * @return Index of the first element equal to a, or -1 if absent (asserts in that case)
+  */
+ public static final int find(String a, String[] array){
+     if(a!=null){
+         for(int i=0; i<array.length; i++){
+             if(a.equals(array[i])){return i;}
+         }
+     }
+     assert(false) : "Can't find "+a+" in "+Arrays.toString(array);
+     return -1;
+ }
+
+ /**
+  * Length of the reference span: 1 for a SNP, 0 for INS and REFPOINT,
+  * and endLoc-beginLoc+1 for every other type.
+  */
+ public final int lengthRef(){
+     if(varType==SNP){
+         assert(endLoc-beginLoc+1==1) : "\n"+endLoc+"-"+beginLoc+"+1 = "+(endLoc-beginLoc+1)+" != "+1+"\n"+this.toString()+"\n";
+         assert(call!=null && call.length()==1) : "\ncall= '"+call+"'\n"+this.toString();
+         return 1;
+     }
+     if(varType==INS || varType==REFPOINT){
+         return 0;
+     }
+     return endLoc-beginLoc+1;
+ }
+
+ /** Larger of the reference length and the variant length */
+ public final int lengthMax(){return max(lengthRef(), lengthVar());}
+ /** Smaller of the reference length and the variant length */
+ public final int lengthMin(){return min(lengthRef(), lengthVar());}
+ /** Variant length minus reference length; 0 for no-call and no-ref types, whose length is unknown */
+ public final int lengthDif(){return isNR_or_NC() ? 0 : lengthVar()-lengthRef();}
+
+ /**
+  * Length of the variant allele.
+  * Types with a call report the call length, SNP is 1, DEL and REFPOINT are 0,
+  * and reference-like types report the span.  For REFINCON, NOCALL and NOREF the
+  * length is not known for certain: an assertion fires and the span is returned otherwise.
+  * @throws RuntimeException for type NULL or an unrecognized type
+  */
+ public final int lengthVar(){
+     switch(varType){
+
+         case REF:
+         case REFCON:
+             return endLoc-beginLoc+1;
+
+         case SNP:
+             assert(endLoc-beginLoc+1==1);
+             assert(call!=null && call.length()==1);
+             return 1;
+
+         case INS:
+         case DELINS:
+         case PAR:
+             assert(call!=null);
+             return call.length();
+
+         case REFPOINT:
+             return 0;
+
+         case DEL:
+             assert(call==null);
+             return 0;
+
+         case REFINCON:
+         case NOCALL:
+         case NOREF:
+             assert(false) : "Warning - Length cannot be known for certain.";
+             return endLoc-beginLoc+1;
+
+         case NULL:
+             assert(false);
+             throw new RuntimeException();
+
+         default:
+             throw new RuntimeException();
+     }
+ }
+
+
+// //TODO Note that this may be wrong for e.g. insertions, deletions, and if/when changed to half-open numbering.
+// public final int length(){
+// if(varType==INS){return 0;}
+// return endLoc-beginLoc+1;
+// }
+// public final int length2(){
+// if(varType==INS){return call==null ? 0 : call.length();}
+// if(varType==DELINS){return call==null ? (endLoc-beginLoc+1) : max(call.length(), endLoc-beginLoc+1);}
+// return endLoc-beginLoc+1;
+// }
+
+ public final boolean isPoint(){
+ return varType==INS || varType==REFPOINT;
+ }
+
+ public final boolean isRef(){
+ return varType==REF || varType==REFPOINT;
+ }
+
+ public final boolean isTrueVariation(){
+ return varType==SNP || varType==INS || varType==DEL || varType==DELINS;
+ }
+
+ public final boolean isNoCall(){
+ return varType==NOCALL || varType==REFCON || varType==REFINCON;
+ }
+
+ public final boolean isNR_or_NC(){
+ return varType==NOCALL || varType==NOREF || varType==REFCON || varType==REFINCON;
+ }
+
+ public final boolean isUnsureVariation(){
+ return varType==NOCALL || varType==NOREF || varType==REFINCON || varType==REFCON;
+ }
+
+
+// /** TODO May be slow. Perhaps add a boolean field. */
+// public boolean isCoding(){
+// int middle=((beginLoc+endLoc)/2);
+// GeneSet[] sets=Data.getNearestGeneSets(chromosome, middle);
+// for(GeneSet gs : sets){
+// for(Gene g : gs.genes){
+// if(g.intersectsCodeAndExon(beginLoc, endLoc)){
+// return true;
+// }
+// }
+// }
+// return false;
+// }
+
+
+ /**
+  * Does this variation intersect within (range) of a coding region or splice site?
+  * Looks up the gene sets nearest to the span widened by range on both sides.
+  */
+ public boolean isNearCodingOrSplice(int range, boolean includeExonsForUntranslatedGenes, boolean includeSplice){
+     assert(beginLoc<=endLoc);
+     final GeneSet[] nearby=Data.getNearestGeneSets(chromosome, beginLoc-range, endLoc+range);
+     return isNearCodingOrSplice(range, includeExonsForUntranslatedGenes, nearby, includeSplice);
+ }
+
+
+ /** Does this variation intersect within (range) of a coding region or splice site?  Splice sites are included. */
+ public boolean isNearCodingOrSplice(int range, boolean includeExonsForUntranslatedGenes){
+     final boolean includeSplice=true;
+     return isNearCodingOrSplice(range, includeExonsForUntranslatedGenes, includeSplice);
+ }
+
+
+ /** Does this variation lie at least partially within an intron? */
+ public boolean intersectsIntron(){
+     assert(beginLoc<=endLoc);
+     final GeneSet[] nearby=Data.getNearestGeneSets(chromosome, beginLoc, endLoc);
+     return intersectsIntron(nearby);
+ }
+
+
+ /**
+  * Does this variation intersect within (range) of a coding region or splice site?
+  * @param range Number of bases by which the span is widened on each side
+  * @param includeExonsForUntranslatedGenes If true, exons of untranslated genes also count
+  * @param sets Candidate gene sets; null is treated as a hit (expected only for chromosome 25 and above)
+  * @param includeSplice If true, a splice site within range of the unwidened span also counts
+  */
+ public boolean isNearCodingOrSplice(int range, boolean includeExonsForUntranslatedGenes, GeneSet[] sets, boolean includeSplice){
+     assert(beginLoc<=endLoc);
+     final int lo=beginLoc-range;
+     final int hi=endLoc+range;
+
+     if(sets==null){
+         assert(chromosome>=25);
+         return true;
+     }
+
+     for(GeneSet gs : sets){
+         for(Gene g : gs.genes){
+
+             //Translated genes are tested against coding exons; untranslated genes only on request.
+             final boolean exonHit;
+             if(!g.untranslated){
+                 exonHit=g.intersectsCodeAndExon(lo, hi);
+             }else{
+                 exonHit=(includeExonsForUntranslatedGenes && g.intersectsExon(lo, hi));
+             }
+             if(exonHit){return true;}
+
+             if(includeSplice){
+                 //Element 0 of the result is compared against range.
+                 final int[] splice=g.nearestSpliceSite(beginLoc, endLoc);
+                 if(splice[0]<=range){return true;}
+             }
+
+         }
+     }
+     return false;
+ }
+
+
+ /**
+  * Does this variation lie at least partially within an intron?
+  * @param sets Candidate gene sets; null is treated as a hit (expected only for chromosome 25 and above)
+  */
+ public boolean intersectsIntron(GeneSet[] sets){
+     assert(beginLoc<=endLoc);
+
+     if(sets==null){
+         assert(chromosome>=25);
+         return true;
+     }
+
+     for(int i=0; i<sets.length; i++){
+         for(Gene g : sets[i].genes){
+             if(g.intersectsIntron(beginLoc, endLoc)){return true;}
+         }
+     }
+     return false;
+ }
+
+ /** First position of the variation; -2 until set */
+ public int beginLoc=-2;
+ /** Last position of the variation (inclusive, per lengthRef); -2 until set */
+ public int endLoc=-2;
+
+ /** Chromosome number; -1 until set */
+ public int chromosome=-1;
+ /** One of the type constants of this class (REF, SNP, ...); -1 until set */
+ public byte varType=-1;
+
+ /** Reference sequence, or null when the type does not keep one */
+ public String ref=null;
+ /** Called sequence, or null when the type does not keep one */
+ public String call=null;
+
+
+ /** Two-way ploidy lookup: Byte and Integer codes map to text, and text maps to a Byte code */
+ public static final HashMap<Object, Object> ploidyMap=makePloidyMap();
+ /** Haplotype names indexed by haplotype code */
+ public static final String[] haploMap={"0","1","2","all"};
+
+ /** Type names indexed by the type constants (REF=0 ... REFPOINT=11) */
+ public static final String[] varTypeMap={"ref","snp","ins","del","sub",
+ "no-call-rc","no-call-ri","no-call","no-ref","PAR-called-in-X","null","refpoint"};
+
+ /** Type name to type constant, including alternate spellings */
+ public static final HashMap<String, Byte> varTypeMap2=makeVarTypeMap();
+
+ private static final HashMap<String, Byte> makeVarTypeMap(){
+ HashMap<String, Byte> r=new HashMap<String, Byte>(32);
+
+ for(byte i=0; i<varTypeMap.length; i++){r.put(varTypeMap[i], i);}
+ r.put("=", REF);
+ r.put("ref-consistent", REFCON);
+ r.put("ref-inconsistent", REFINCON);
+// r.put("no-call-rc", REFCON);
+// r.put("no-call-ri", REFINCON);
+ r.put("delins", DELINS);
+
+ return r;
+ }
+
+ /* Variation type codes. Each value is the index of the matching name in varTypeMap. */
+ public static final byte REF=0; //varTypeMap[0]: "ref"
+ public static final byte SNP=1; //varTypeMap[1]: "snp"
+ public static final byte INS=2; //varTypeMap[2]: "ins"
+ public static final byte DEL=3; //varTypeMap[3]: "del"
+ public static final byte DELINS=4; //varTypeMap[4]: "sub"; makeVarTypeMap also maps "delins" here
+ public static final byte REFCON=5; //varTypeMap[5]: "no-call-rc"; also "ref-consistent"
+ public static final byte REFINCON=6; //varTypeMap[6]: "no-call-ri"; also "ref-inconsistent"
+ public static final byte NOCALL=7; //varTypeMap[7]: "no-call"
+ public static final byte NOREF=8; //varTypeMap[8]: "no-ref"
+ public static final byte PAR=9; //varTypeMap[9]: "PAR-called-in-X"
+ public static final byte NULL=10; //varTypeMap[10]: "null"
+ public static final byte REFPOINT=11; //varTypeMap[11]: "refpoint"
+
+ /** Replaces ref and call, when non-null, with the instances returned by Data.intern. */
+ public void intern(){
+     if(ref!=null){
+         ref=Data.intern(ref);
+     }
+     if(call!=null){
+         call=Data.intern(call);
+     }
+ }
+
+ private static HashMap<Object, Object> makePloidyMap(){
+ HashMap<Object, Object> hashy=new HashMap<Object, Object>(64);
+ for(int i=0; i<10; i++){
+ hashy.put((Byte)(byte)i, i+"");
+ hashy.put((Integer)i, i+"");
+ hashy.put(i+"", (Byte)(byte)i);
+ }
+ hashy.put((Byte)(byte)-1, "?");
+ hashy.put((Integer)(-1), "?");
+ hashy.put("?",(Byte)(byte)-1);
+ return hashy;
+ }
+
+ private static final int min(int x, int y){return x<y ? x : y;}
+ private static final int max(int x, int y){return x>y ? x : y;}
+
+ /**
+  * Hash built from chromosome, varType, beginLoc and the span (endLoc-beginLoc+1);
+  * ref and call do not contribute. The fields are mixed into a 64-bit value which is
+  * folded to 32 bits with the same formula Long.hashCode() uses, so the result is
+  * identical to the earlier boxed version without allocating a Long on every call.
+  */
+ @Override
+ public final int hashCode(){
+     long x=chromosome;
+     x=x<<4;
+     x^=varType;
+     x=x<<28;
+     x^=beginLoc;
+     x=x<<16;
+     x^=(endLoc-beginLoc+1);
+     return (int)(x^(x>>>32)); //Same fold as Long.hashCode(), minus the boxing
+ }
+
+ /**
+  * Orders by chromosome, then beginLoc, endLoc and varType (all ascending).
+  * REF and NOCALL variations that tie on those fields compare equal; for every other type
+  * the call string breaks the tie, with a null call sorting before any non-null call.
+  */
+ @Override
+ public int compareTo(Variation other) {
+     if(chromosome!=other.chromosome){return chromosome<other.chromosome ? -1 : 1;}
+     if(beginLoc!=other.beginLoc){return beginLoc<other.beginLoc ? -1 : 1;}
+     if(endLoc!=other.endLoc){return endLoc<other.endLoc ? -1 : 1;}
+     if(varType!=other.varType){return varType<other.varType ? -1 : 1;}
+     if(varType==REF || varType==NOCALL){return 0;}
+
+     final String mine=call, theirs=other.call;
+     if(mine==null){
+         return theirs==null ? 0 : -1;
+     }
+     if(theirs==null){
+         return 1;
+     }
+     return mine.compareTo(theirs);
+ }
+
+ /**
+  * Equality as required by Object.equals: null and objects that are not Variations are
+  * simply unequal (previously these caused a NullPointerException or ClassCastException).
+  * Variations are compared through equals(Variation).
+  */
+ @Override
+ public boolean equals(Object other){
+     if(!(other instanceof Variation)){return false;} //instanceof is false for null too
+     return equals((Variation)other);
+ }
+
+ /**
+  * Two variations are equal when compareTo reports no difference.
+  * @param other Variation to compare against; null is never equal
+  */
+ public boolean equals(Variation other){
+     return other!=null && compareTo(other)==0;
+ }
+
+ /** True if point lies inside the closed range beginLoc..endLoc. */
+ public boolean intersects(int point){
+     return beginLoc<=point && endLoc>=point;
+ }
+
+ /** True if point lies inside the range, or directly next to either end of it. */
+ public boolean touches(int point){
+     return beginLoc-1<=point && endLoc+1>=point;
+ }
+
+ /** Do the closed ranges (a1, b1) and (a2, b2) share at least one position? */
+ public static boolean overlap(int a1, int b1, int a2, int b2){
+     assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+     //The ranges are disjoint only when one starts after the other ends
+     return !(a2>b1 || b2<a1);
+ }
+
+ /** Like overlap, but ranges that are merely adjacent (no gap between them) also count. */
+ public static boolean touch(int a1, int b1, int a2, int b2){
+     assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+     return !(a2>(b1+1) || b2<(a1-1));
+ }
+
+ /** Is (a1, b1) within (a2, b2) ? Shared endpoints are allowed. */
+ public static boolean isWithin(int a1, int b1, int a2, int b2){
+     assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+     return a2<=a1 && b2>=b1;
+ }
+
+ /** Is (a1, b1) strictly inside (a2, b2), sharing neither endpoint? */
+ public static boolean isWithinNotTouching(int a1, int b1, int a2, int b2){
+     assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
+     return a2<a1 && b2>b1;
+ }
+
+ //Slow if not inlined
+ public boolean intersects(int a2, int b2){return overlap(beginLoc, endLoc, a2, b2);}
+
+ public boolean isWithin(int a2, int b2){return isWithin(beginLoc, endLoc, a2, b2);}
+
+ public boolean isWithinNotTouching(int a2, int b2){return isWithinNotTouching(beginLoc, endLoc, a2, b2);}
+
+ /**
+  * Does this variation intersect v?
+  * Variations on different chromosomes never intersect. The test is always evaluated from
+  * the variation with the longer (or equal) lengthRef(), so the call may be re-dispatched
+  * as v.intersects(this).
+  * @param v The other variation
+  * @return true if the two variations are considered to intersect
+  */
+ public boolean intersects(Variation v){
+
+ if(v.chromosome!=chromosome){
+ return false;
+ }
+
+ int len1=lengthRef();
+ int len2=v.lengthRef();
+
+ //Swap roles so the receiver is never the shorter of the two
+ if(len1<len2){
+ return v.intersects(this);
+ }
+
+ //Now, this is at least as long (ref-wise) as v.
+
+ //Quick reject: the ranges must at least be adjacent to continue
+ if(!touch(beginLoc, endLoc, v.beginLoc, v.endLoc)){return false;}
+
+ //Special handling when v.isPoint() (isPoint is defined outside this view)
+ if(v.isPoint()){
+ if(isPoint()){
+ //Both are points: they intersect only at the same begin location
+
+ return beginLoc==v.beginLoc;
+ }
+ if(this.isNoCall()){
+ if(len1>0){return overlap(beginLoc, endLoc, v.beginLoc, v.endLoc);} //Normal case
+ else{
+ //TODO: Bad news! Original MAY have been a length 0 no-call in half-open coordinates.
+ //The range is widened by one on the right to allow for that
+ return overlap(beginLoc, endLoc+1, v.beginLoc, v.endLoc);
+ }
+ }
+ //A point at or before this variation's first position does not count
+ if(v.beginLoc<=beginLoc){return false;}
+ }
+
+ return overlap(beginLoc, endLoc, v.beginLoc, v.endLoc);
+ }
+}
diff --git a/current/var/Varlet.java b/current/var/Varlet.java
new file mode 100755
index 0000000..3385f05
--- /dev/null
+++ b/current/var/Varlet.java
@@ -0,0 +1,403 @@
+package var;
+
+import java.util.ArrayList;
+
+import align2.QualityTools;
+import align2.Tools;
+
+
+import dna.Gene;
+import fileIO.TextFile;
+
+public class Varlet extends var.Variation {
+
+
+ public Varlet(int chrom_, byte strand_, int start_, int stop_, int matchStart_, int matchStop_, byte vType, String rf, String ca,
+ int varQuality_, int readQuality_, int mapScore_, int errors_, float expectedErrors_, int paired_, long readID_,
+ int readLen_,
+ int readStart_, int readStop_, int readCopies_, int headDist_, int tailDist_, int endDist_, int pairnum){
+ super(chrom_, start_, stop_, vType, rf, ca);
+ strand=strand_;
+
+ setQvector(varQuality_, readQuality_, varQuality_, readQuality_);
+
+ mapScore=mapScore_;
+ errors=errors_;
+ expectedErrors=expectedErrors_;
+ paired=paired_;
+
+ matchStart=matchStart_;
+ matchStop=matchStop_;
+
+ readID=readID_;
+ readLen=readLen_;
+
+ readStart=readStart_;
+ readStop=readStop_;
+
+ numReads=Tools.min(readCopies_, Short.MAX_VALUE);
+
+
+ headDist=headDist_;
+ tailDist=tailDist_;
+ endDist=endDist_;
+
+ if(pairnum==0){
+ if(strand==Gene.PLUS){numPlusReads1=1;}
+ else{numMinusReads1=1;}
+ }else{
+ if(strand==Gene.PLUS){numPlusReads2=1;}
+ else{numMinusReads2=1;}
+ }
+
+ assert(pairnum==0 || pairnum==1) : pairnum+"\n"+this;
+// assert(readID_<Integer.MAX_VALUE) : readID_+"\n"+this;
+ assert(readCopies_>=1) : readCopies_+"\n"+this;
+// assert(readCopies_<Short.MAX_VALUE) : readCopies_+"\n"+this;
+
+ assert(endDist<=tailDist) : this;
+ assert(endDist<=headDist) : this;
+
+ assert(readStart<readStop) : this;
+ }
+
+ public String toString(){return toText().toString();}
+
+ public static String header(){return textHeader().toString();}
+
+ /**
+  * Builds the '#'-prefixed, tab-delimited header naming the 29 columns, in the same order
+  * in which toText() writes them.
+  * @return The header, without a trailing tab or newline
+  */
+ public static CharSequence textHeader(){
+     final String[] columns={
+         "chrom", "strand", "readStart", "readStop", "varStart", "varStop",
+         "type", "mapScore", "errors", "expectedErrors", "readID", "readLen",
+         "headDist", "tailDist", "endDist",
+         "avgVarQuality", "maxVarQuality", "avgReadQuality", "maxReadQuality",
+         "numReads", "numSemiUnique", "numUniqueReads", "paired",
+         "plusReads1", "minusReads1", "plusReads2", "minusReads2",
+         "ref", "call"
+     };
+
+     final StringBuilder sb=new StringBuilder(64);
+     sb.append('#');
+     for(int i=0; i<columns.length; i++){
+         if(i>0){sb.append('\t');}
+         sb.append(columns[i]);
+     }
+     return sb;
+ }
+
+ /**
+  * Serializes this varlet as one tab-delimited line of 29 columns, in the order named by
+  * textHeader(). A null or empty ref/call is written as ".".
+  * expectedErrors is formatted with Locale.ROOT so the decimal separator is always '.',
+  * whatever the JVM's default locale; fromText() reads it back with Float.parseFloat,
+  * which does not accept a comma.
+  * @return A new StringBuilder holding the line, without a trailing newline
+  */
+ public final StringBuilder toText(){
+     StringBuilder sb=new StringBuilder(64);
+
+     sb.append(chromosome).append('\t');
+     sb.append(Gene.strandCodes[strand]).append('\t');
+     sb.append(readStart).append('\t');
+     sb.append(readStop).append('\t');
+     sb.append(beginLoc).append('\t');
+     sb.append(endLoc).append('\t');
+     sb.append(Variation.varTypeMap[varType]).append('\t');
+
+     sb.append(mapScore).append('\t');
+     sb.append(errors).append('\t');
+     //Locale.ROOT keeps the output machine-readable in comma-decimal locales
+     sb.append(String.format(java.util.Locale.ROOT, "%.1f", expectedErrors)).append('\t');
+     sb.append(readID).append('\t');
+     sb.append(readLen).append('\t');
+     sb.append(headDist).append('\t');
+     sb.append(tailDist).append('\t');
+     sb.append(endDist).append('\t');
+
+     sb.append(avgVarQuality()).append('\t');
+     sb.append(maxVarQuality()).append('\t');
+     sb.append(avgReadQuality()).append('\t');
+     sb.append(maxReadQuality()).append('\t');
+
+     sb.append(numReads).append('\t');
+     sb.append(numSemiUniqueReads).append('\t');
+     sb.append(numUniqueReads).append('\t');
+     sb.append(paired).append('\t');
+     sb.append(numPlusReads1).append('\t');
+     sb.append(numMinusReads1).append('\t');
+     sb.append(numPlusReads2).append('\t');
+     sb.append(numMinusReads2).append('\t');
+
+     sb.append(ref==null || ref.length()==0 ? "." : ref).append('\t');
+     sb.append(call==null || call.length()==0 ? "." : call);
+
+     return sb;
+ }
+
+ /**
+  * Reads every varlet from a text file written in the toText() format.
+  * Lines starting with '#' are skipped as headers. Empty lines are skipped as well
+  * (they used to fail in charAt(0)). The file is closed even if a line fails to parse.
+  * @param fname Path of the file to read
+  * @return The parsed varlets, in file order, in a list trimmed to size
+  */
+ public static final ArrayList<Varlet> fromTextFile(String fname){
+     final TextFile tf=new TextFile(fname, false, false);
+     final ArrayList<Varlet> list=new ArrayList<Varlet>(2000);
+
+     try{
+         for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){
+             if(s.length()>0 && s.charAt(0)!='#'){
+                 list.add(Varlet.fromText(s));
+             }
+         }
+     }finally{
+         tf.close();
+     }
+     list.trimToSize();
+     return list;
+ }
+
+ /**
+  * Parses one tab-delimited line into a Varlet.
+  * <p>
+  * toText() and textHeader() write 29 columns, with headDist in column 12 and call in
+  * column 28. This method used to read headDist from column 13 and call from column 29,
+  * so it threw ArrayIndexOutOfBoundsException on toText() output. Lines with 29 columns
+  * are now read with the toText() layout; lines with 30 or more columns keep the previous
+  * indices, so any input that parsed before is parsed identically.
+  * <p>
+  * readID is parsed as a long (the field is a long) and the chromosome as an int.
+  * A "." in the ref or call column becomes null. matchStart and matchStop are not stored
+  * in the text form and are set to -1.
+  * @param line One data line (not a '#' header line)
+  * @return The reconstructed varlet
+  */
+ public static final Varlet fromText(String line){
+     final String[] split=line.split("\t");
+
+     //Index of the headDist column: 12 in the toText() layout, 13 in the legacy layout
+     final int base=(split.length>=30 ? 13 : 12);
+
+     int chrom=Integer.parseInt(split[0]);
+     byte strand=Gene.toStrand(split[1]);
+     int readStart=Integer.parseInt(split[2]);
+     int readStop=Integer.parseInt(split[3]);
+     int start=Integer.parseInt(split[4]);
+     int stop=Integer.parseInt(split[5]);
+     byte varType=Variation.varTypeMap2.get(split[6]);
+
+     int mapScore=Integer.parseInt(split[7]);
+     int errors=Integer.parseInt(split[8]);
+     float expectedErrors=Float.parseFloat(split[9]);
+
+     long readID=Long.parseLong(split[10]);
+     int readLen=Integer.parseInt(split[11]);
+     int headDist=Integer.parseInt(split[base]);
+     int tailDist=Integer.parseInt(split[base+1]);
+     int endDist=Integer.parseInt(split[base+2]);
+
+     int avgVarQuality=Integer.parseInt(split[base+3]);
+     int maxVarQuality=Integer.parseInt(split[base+4]);
+     int avgReadQuality=Integer.parseInt(split[base+5]);
+     int maxReadQuality=Integer.parseInt(split[base+6]);
+     int numReads=Integer.parseInt(split[base+7]);
+     int numSemiUniqueReads=Integer.parseInt(split[base+8]);
+     int numUniqueReads=Integer.parseInt(split[base+9]);
+     int paired=Integer.parseInt(split[base+10]);
+     int numPlusReads1=Integer.parseInt(split[base+11]);
+     int numMinusReads1=Integer.parseInt(split[base+12]);
+     int numPlusReads2=Integer.parseInt(split[base+13]);
+     int numMinusReads2=Integer.parseInt(split[base+14]);
+
+     String ref=split[base+15];
+     String call=split[base+16];
+     if(ref.length()==1 && ref.charAt(0)=='.'){ref=null;}
+     if(call.length()==1 && call.charAt(0)=='.'){call=null;}
+
+     //pairnum=1 is a placeholder; the strand counters are overwritten below
+     Varlet v=new Varlet(chrom, strand, start, stop, -1, -1, varType, ref, call, avgVarQuality, avgReadQuality,
+             mapScore, errors, expectedErrors, paired, readID, readLen, readStart, readStop, numReads,
+             headDist, tailDist, endDist, 1);
+
+     v.setQvector(avgVarQuality, avgReadQuality, maxVarQuality, maxReadQuality);
+     v.numPlusReads1=numPlusReads1;
+     v.numMinusReads1=numMinusReads1;
+     v.numPlusReads2=numPlusReads2;
+     v.numMinusReads2=numMinusReads2;
+     v.numSemiUniqueReads=numSemiUniqueReads;
+     v.numUniqueReads=numUniqueReads;
+
+     return v;
+ }
+
+
+ /** Equality uses the Variation ordering only; the read-level fields of a Varlet are ignored. */
+ @Override
+ public boolean equals(Variation other){
+     final int cmp=super.compareTo(other);
+     return cmp==0;
+ }
+
+ //DO NOT add an equals(Varlet) based on the Varlet ordering!
+ //Varlets should use equality based on Variation data only.
+
+ /** Delegates to compareTo(Varlet); the argument is cast to Varlet without a type check. */
+ @Override
+ public int compareTo(Variation other) {
+     final Varlet that=(Varlet)other;
+     return compareTo(that);
+ }
+
+ /**
+  * Orders by chromosome, beginLoc, endLoc and varType. REF and NOCALL varlets that tie on
+  * those compare equal. Otherwise ties are broken by call (null first), then readStart,
+  * readStop and strand, and finally by maxVarQuality with the higher quality first.
+  */
+ public int compareTo(Varlet other) {
+     if(chromosome!=other.chromosome){return chromosome-other.chromosome;}
+     if(beginLoc!=other.beginLoc){return beginLoc<other.beginLoc ? -1 : 1;}
+     if(endLoc!=other.endLoc){return endLoc<other.endLoc ? -1 : 1;}
+     if(varType!=other.varType){return varType-other.varType;}
+     if(varType==REF || varType==NOCALL){return 0;}
+
+     if(call==null){
+         if(other.call!=null){return -1;}
+     }else if(other.call==null){
+         return 1;
+     }else{
+         final int byCall=call.compareTo(other.call);
+         if(byCall!=0){return byCall;}
+     }
+
+     if(readStart!=other.readStart){return readStart-other.readStart;}
+     if(readStop!=other.readStop){return readStop-other.readStop;}
+     if(strand!=other.strand){return strand-other.strand;}
+
+     final int mine=maxVarQuality(), theirs=other.maxVarQuality();
+     if(mine!=theirs){return theirs<mine ? -1 : 1;}
+
+     return 0;
+ }
+
+ /**
+  * Heuristic score for this varlet; every term is added to one running total.
+  * Terms cover error counts, quality values (via QualityTools.PROB_ERROR), pairing and
+  * read-support counts, distance from the read ends, read length, mapScore per read base,
+  * and the smaller of the two per-strand read counts.
+  * TODO: Add expected errors, tailDist, endDist
+  */
+ public int score(){
+     final int maxRead=maxReadQuality(), maxVar=maxVarQuality();
+     final int avgVar=avgVarQuality(), avgRead=avgReadQuality();
+
+     //Error counts
+     int total=1000/(errors+1);
+     total+=(int)(500/(expectedErrors+1));
+
+     //Quality
+     total+=Tools.max(0, 1000-(int)(16000*QualityTools.PROB_ERROR[maxRead]));
+     total+=Tools.max(0, 1000-(int)(16000*QualityTools.PROB_ERROR[maxVar]));
+     total+=10*Tools.min(35, maxVar);
+     total+=Tools.max(0, 200-(int)(8000*QualityTools.PROB_ERROR[avgVar]));
+     total+=Tools.max(0, 200-(int)(8000*QualityTools.PROB_ERROR[avgRead]));
+
+     //Pairing and read support
+     total+=1000-2000/(paired+2);
+     total+=500-1000/(numSemiUniqueReads+2);
+     total+=500-1000/(numUniqueReads+2);
+     total+=200-400/(numReads+2);
+
+     //Distance from the read ends
+     total+=50*Tools.min(20, tailDist);
+     total+=50*Tools.min(10, endDist);
+
+     //Read length, capped at 100
+     final int cappedLen=Tools.min(readLen, 100);
+     total+=(1000*cappedLen)/(cappedLen+100);
+
+     total+=Tools.min(1000, (10*mapScore)/readLen); //TODO: This is temporary, until Read correctly supports mapLen in toText()
+     total+=1000-1000/(1+minStrandReads());
+     return total;
+ }
+
+
+ /**
+  * Four 8-bit quality values packed into one int. From the lowest byte upward:
+  * avgVarQuality, avgReadQuality, maxVarQuality, maxReadQuality.
+  */
+ private int qvector;
+
+ public int avgVarQuality(){return qvector&0xFF;}
+ public int avgReadQuality(){return (qvector>>>8)&0xFF;}
+ public int maxVarQuality(){return (qvector>>>16)&0xFF;}
+ public int maxReadQuality(){return (qvector>>>24)&0xFF;}
+
+ /* Each setter replaces one byte of qvector; only the low 8 bits of value are kept. */
+ public void setAvgVarQuality(int value){
+     qvector=(qvector&~0xFF)|(value&0xFF);
+ }
+ public void setAvgReadQuality(int value){
+     qvector=(qvector&~(0xFF<<8))|((value&0xFF)<<8);
+ }
+ public void setMaxVarQuality(int value){
+     qvector=(qvector&~(0xFF<<16))|((value&0xFF)<<16);
+ }
+ public void setMaxReadQuality(int value){
+     qvector=(qvector&~(0xFF<<24))|((value&0xFF)<<24);
+ }
+
+ /** Sets all four values at once; each argument is truncated to its low 8 bits. */
+ public void setQvector(int avq, int arq, int mvq, int mrq){
+     qvector=((mrq&0xFF)<<24)|((mvq&0xFF)<<16)|((arq&0xFF)<<8)|(avq&0xFF);
+ }
+
+
+
+ public int mapScore;
+ public int errors;
+
+ public float expectedErrors;
+
+ public int matchStart;
+ public int matchStop;
+
+ public int readStart;
+ public int readStop;
+
+ public int headDist;
+ public int tailDist;
+ public int endDist;
+
+ public byte strand;
+ public int paired;
+
+ public long readID;
+
+ /** Length of read when used for calling vars; ie, after being trimmed, and after colorspace conversion. */
+ public int readLen;
+
+ public int numReads;
+ public int numSemiUniqueReads=1;
+ public int numUniqueReads=1;
+// public int coverageAtLoc=0;
+
+// public byte numStrands=1;
+
+ /** Varlets from read 1 mapped to plus strand */
+ public int numPlusReads1=0;
+
+ /** Varlets from read 1 mapped to minus strand */
+ public int numMinusReads1=0;
+
+ /** Varlets from read 2 mapped to plus strand */
+ public int numPlusReads2=0;
+
+ /** Varlets from read 2 mapped to minus strand */
+ public int numMinusReads2=0;
+
+ /** Number of reads1 and reads2 mapped to the plus strand */
+ public int numPlusMappedReads(){
+     final int total=numPlusReads1+numPlusReads2;
+     return total;
+ }
+
+ /** Number of reads1 and reads2 from which the original molecule (i.e., read 1) mapped to the plus strand */
+ public int numPlusOriginReads(){
+     final int total=numPlusReads1+numMinusReads2;
+     return total;
+ }
+
+ /** Number of reads1 and reads2 mapped to the minus strand */
+ public int numMinusMappedReads(){
+     final int total=numMinusReads1+numMinusReads2;
+     return total;
+ }
+
+ /** Number of reads1 and reads2 from which the original molecule (i.e., read 1) mapped to the minus strand */
+ public int numMinusOriginReads(){
+     final int total=numMinusReads1+numPlusReads2;
+     return total;
+ }
+
+ /** The smaller of numPlusMappedReads() and numMinusMappedReads() */
+ public int minStrandReads(){
+     final int plus=numPlusMappedReads();
+     final int minus=numMinusMappedReads();
+     return Tools.min(plus, minus);
+ }
+
+// public byte numStrands(){return (byte)((numPlusReads>0 ? 1 : 0)+(numMinusReads>0 ? 1 : 0));}
+ public int minStrandReads4(){return Tools.min(numPlusReads1, numMinusReads1, numPlusReads2, numMinusReads2);}
+ public int minStrandReads3(){//return second lowest number
+
+ final int a, b, c, d;
+ if(numPlusReads1<=numMinusReads1){a=numPlusReads1; b=numMinusReads1;}
+ else{b=numPlusReads1; a=numMinusReads1;}
+ if(numPlusReads2<=numMinusReads2){c=numPlusReads2; d=numMinusReads2;}
+ else{d=numPlusReads2; c=numMinusReads2;}
+
+ return Tools.min(b, d, (a>=c ? a : c));
+
+ }
+ public int strandReadCount(){
+ return (numPlusReads1>0 ? 1 : 0)+(numMinusReads1>0 ? 1 : 0)+(numPlusReads2>0 ? 1 : 0)+(numMinusReads2>0 ? 1 : 0);
+ }
+
+ public int pairNum(){
+ return (numPlusReads1+numMinusReads1)>0 ? 0 : 1;
+ }
+
+}
diff --git a/cutprimers.sh b/cutprimers.sh
new file mode 100755
index 0000000..395d5d5
--- /dev/null
+++ b/cutprimers.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#cutprimers in=<file> out=<file>
+
+# Prints the help text for cutprimers.sh to stdout.
+# Called when the script is run with no arguments, -h or --help.
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Cuts out sequences corresponding to primers identified in sam files.
+
+Usage: cutprimers.sh in=<file> out=<file> sam1=<file> sam2=<file>
+
+Parameters:
+
+in=<file> File containing reads. in=stdin.fa will pipe from stdin.
+out=<file> Output sequences. out=stdout will pipe to stdout.
+sam1=<file> Sam file containing mapped locations of primer sequence 1.
+sam2=<file> Sam file containing mapped locations of primer sequence 2.
+fake=t Output 1bp 'N' reads in cases where there is no primer.
+include=f Include the flanking primer sequences in output.
+
+Java Parameters:
+
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will specify
+ 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Find the directory holding this script, following symlinks one hop at a time.
+# pushd/popd put the caller's working directory back afterwards.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+# Java classpath: the current/ directory next to this script.
+CP="$DIR""current/"
+
+# Default JVM flags; calcXmx (below) may replace z and z2.
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+# calcXmx returns early when this is 1. Presumably parseXmx in calcmem.sh sets it
+# when the user passes -Xmx; confirm there.
+set=0
+
+# No arguments, -h or --help: print the help text and stop.
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 2000m 42
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+# Runs jgi.CutPrimers on the JVM with the chosen flags and all script arguments.
+# The command is held as an argv array and executed directly, not flattened into a
+# string and passed to eval. Each argument therefore reaches Java exactly as the user
+# typed it, including file names or install paths that contain spaces or shell
+# metacharacters. The command line is still echoed to stderr first.
+# On NERSC genepool the required environment modules are loaded beforehand.
+cutprimers() {
+ if [[ $NERSC_HOST == genepool ]]; then
+  module unload oracle-jdk
+  module load oracle-jdk/1.7_64bit
+  module load pigz
+ fi
+ # $EA and $z stay unquoted on purpose: an empty value must add no argument.
+ local -a CMD=(java $EA $z -cp "$CP" jgi.CutPrimers "$@")
+ echo "${CMD[*]}" >&2
+ "${CMD[@]}"
+}
+
+cutprimers "$@"
diff --git a/decontaminate.sh b/decontaminate.sh
new file mode 100755
index 0000000..131ee49
--- /dev/null
+++ b/decontaminate.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+#decontaminate reads=<file,file> ref=<file,file> out=<directory>
+
+usage(){
+echo "
+Written by Brian Bushnell.
+Last modified September 29, 2015
+
+Description: Decontaminates multiplexed assemblies via normalization and mapping.
+
+Usage: decontaminate.sh reads=<file,file> ref=<file,file> out=<directory>
+or
+decontaminate.sh readnamefile=<file> refnamefile=<file> out=<directory>
+
+Input Parameters:
+reads=<file,file> Input reads, one file per library.
+ref=<file,file> Input assemblies, one file per library.
+readnamefile=<file> List of input reads, one line per library.
+refnamefile=<file> List of input assemblies, one line per library.
+
+interleaved=auto True forces paired/interleaved input; false forces single-ended mapping.
+ If not specified, interleaved status will be autodetected from read names.
+unpigz=t Spawn a pigz (parallel gzip) process for faster decompression. Requires pigz to be installed.
+touppercase=t (tuc) Convert lowercase letters in reads to upper case (otherwise they will not match the reference).
+
+Output Parameters:
+pigz=f Spawn a pigz (parallel gzip) process for faster compression. Requires pigz to be installed.
+tmpdir=. Write temp files here. By default is uses the system's $TMPDIR or current directory.
+outdir=. Write ouput files here.
+
+Mapping Parameters:
+kfilter=55 Set to a positive number N to require minimum N contiguous matches for a mapped read.
+ambig=random Determines how coverage will be calculated for ambiguously-mapped reads.
+ first: Add coverage only at first genomic mapping location.
+ random: Add coverage at a random best-scoring location.
+ all: Add coverage at all best-scoring locations.
+ toss: Discard ambiguously-mapped reads without adding coverage.
+
+Filtering Parameters:
+minc=3.5 Min average coverage to retain scaffold.
+minp=20 Min percent coverage to retain scaffold.
+minr=18 Min mapped reads to retain scaffold.
+minl=500 Min length to retain scaffold.
+ratio=1.2 Contigs will not be removed by minc unless the coverage changed by at least this factor. 0 disables this filter.
+mapraw=t Set true to map the unnormalized reads. Required to filter by 'ratio'.
+basesundermin=-1 If positive, removes contigs with at least this many bases in low-coverage windows.
+window=500 Sliding window size
+windowcov=5 Average coverage below this will be classified as low.
+
+
+Normalization Parameters:
+mindepth=2 Min depth of reads to keep.
+target=20 Target normalization depth.
+hashes=4 Number of hashes in Bloom filter.
+passes=1 Normalization passes.
+minprob=0.5 Min probability of correctness to add a kmer.
+dp=0.75 (depthpercentile) Percentile to use for depth proxy (0.5 means median).
+ecc=f Error-correction.
+aecc=f Agressive error-correction.
+cecc=f Conservative error-correction.
+prefilter=t Prefilter, for large datasets.
+filterbits=32 (fbits) Bits per cell in primary filter.
+prefilterbits=2 (pbits) Bits per cell in prefilter.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx800m will specify 800 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Find the directory holding this script, following symlinks one hop at a time.
+# pushd/popd put the caller's working directory back afterwards.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+# Java classpath (current/) and native library directory (jni/) next to this script.
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+# Default JVM flags; calcXmx (below) may replace z and z2.
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+# calcXmx returns early when this is 1. Presumably parseXmx in calcmem.sh sets it
+# when the user passes -Xmx; confirm there.
+set=0
+
+# No arguments, -h or --help: print the help text and stop.
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 15000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+
+# Runs jgi.DecontaminateByNormalization on the JVM with the chosen flags and all
+# script arguments. The command is held as an argv array and executed directly, not
+# flattened into a string and passed to eval. Each argument therefore reaches Java
+# exactly as the user typed it, including file names or install paths that contain
+# spaces or shell metacharacters. The command line is still echoed to stderr first.
+# On NERSC genepool the required environment modules are loaded beforehand.
+decontaminate() {
+ if [[ $NERSC_HOST == genepool ]]; then
+  module unload oracle-jdk
+  module load oracle-jdk/1.7_64bit
+  module load pigz
+ fi
+ # $EA, $z and $z2 stay unquoted on purpose: an empty value must add no argument.
+ local -a CMD=(java "-Djava.library.path=$NATIVELIBDIR" $EA $z $z2 -cp "$CP" jgi.DecontaminateByNormalization "$@")
+ echo "${CMD[*]}" >&2
+ "${CMD[@]}"
+}
+
+decontaminate "$@"
diff --git a/dedupe.sh b/dedupe.sh
new file mode 100755
index 0000000..78470b4
--- /dev/null
+++ b/dedupe.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+#dedupe in=<infile> out=<outfile>
+
+# Prints the help text for dedupe.sh to stdout.
+# Called when the script is run with no arguments, -h or --help.
+# Lines inside the text that begin with '#' are part of the printed help, not shell comments.
+usage(){
+echo "
+Written by Brian Bushnell and Jonathan Rood
+Last modified October 27, 2015
+
+Description: Accepts one or more files containing sets of sequences (reads or scaffolds).
+Removes duplicate sequences, which may be specified to be exact matches, subsequences, or sequences within some percent identity.
+Can also find overlapping sequences and group them into clusters.
+
+Usage: dedupe.sh in=<file or stdin> out=<file or stdout>
+
+An example of running Dedupe for clustering short reads:
+dedupe.sh in=x.fq am=f ac=f fo c pc rnc=f mcs=4 mo=100 s=1 pto cc qin=33 csf=stats.txt pattern=cluster_%.fq dot=graph.dot
+
+Input may be fasta or fastq, compressed or uncompressed.
+Output may be stdout or a file. With no output parameter, data will be written to stdout.
+If 'out=null', there will be no output, but statistics will still be printed.
+You can also use 'dedupe <infile> <outfile>' without the 'in=' and 'out='.
+
+I/O Parameters
+in=<file,file> A single file or a comma-delimited list of files.
+out=<file> Destination for all output contigs.
+pattern=<file> Clusters will be written to individual files, where the '%' symbol in the pattern is replaced by cluster number.
+outd=<file> Optional; removed duplicates will go here.
+csf=<file> (clusterstatsfile) Write a list of cluster names and sizes.
+dot=<file> (graph) Write a graph in dot format. Requires 'fo' and 'pc' flags.
+threads=auto (t) Set number of threads to use; default is number of logical processors.
+overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
+showspeed=t (ss) Set to 'f' to suppress display of processing speed.
+minscaf=0 (ms) Ignore contigs/scaffolds shorter than this.
+interleaved=auto If true, forces fastq input to be paired and interleaved.
+ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
+
+Output Format Parameters
+storename=t (sn) Store scaffold names (set false to save memory).
+#addpairnum=f Add .1 and .2 to numeric id of read1 and read2.
+storequality=t (sq) Store quality values for fastq assemblies (set false to save memory).
+uniquenames=t (un) Ensure all output scaffolds have unique names. Uses more memory.
+numbergraphnodes=t (ngn) Label dot graph nodes with read numbers rather than read names.
+sort=f Sort output by scaffold length (otherwise it will be random).
+ 'a' for ascending, 'd' for descending, 'f' for false (no sorting).
+renameclusters=f (rnc) Rename contigs to indicate which cluster they are in.
+printlengthinedges=f (ple) Print the length of contigs in edges.
+
+Processing Parameters
+absorbrc=t (arc) Absorb reverse-complements as well as normal orientation.
+absorbmatch=t (am) Absorb exact matches of contigs.
+absorbcontainment=t (ac) Absorb full containments of contigs.
+#absorboverlap=f (ao) Absorb (merge) non-contained overlaps of contigs (TODO).
+findoverlap=f (fo) Find overlaps between contigs (containments and non-containments). Necessary for clustering.
+uniqueonly=f (uo) If true, all copies of duplicate reads will be discarded, rather than keeping 1.
+rmn=f (requirematchingnames) If true, both names and sequence must match.
+usejni=f (jni) Do alignments in C code, which is faster, if an edit distance is allowed.
+ This will require compiling the C code; details are in /jni/README.txt.
+
+Subset Parameters
+subsetcount=1 (sstc) Number of subsets used to process the data; higher uses less memory.
+subset=0 (sst) Only process reads whose ((ID%subsetcount)==subset).
+
+Clustering Parameters
+cluster=f (c) Group overlapping contigs into clusters.
+pto=f (preventtransitiveoverlaps) Do not look for new edges between nodes in the same cluster.
+minclustersize=1 (mcs) Do not output clusters smaller than this.
+pbr=f (pickbestrepresentative) Only output the single highest-quality read per cluster.
+
+Cluster Postprocessing Parameters
+processclusters=f (pc) Run the cluster processing phase, which performs the selected operations in this category.
+fixmultijoins=t (fmj) Remove redundant overlaps between the same two contigs.
+removecycles=t (rc) Remove all cycles so clusters form trees.
+cc=t (canonicizeclusters) Flip contigs so clusters have a single orientation.
+fcc=f (fixcanoncontradictions) Truncate graph at nodes with canonization disputes.
+foc=f (fixoffsetcontradictions) Truncate graph at nodes with offset disputes.
+mst=f (maxspanningtree) Remove cyclic edges, leaving only the longest edges that form a tree.
+
+Overlap Detection Parameters
+exact=t (ex) Only allow exact symbol matches. When false, an 'N' will match any symbol.
+touppercase=t (tuc) Convert input bases to upper-case; otherwise, lower-case will not match.
+maxsubs=0 (s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits.
+maxedits=0 (e) Allow up to this many edits (subs or indels). Higher is slower.
+minidentity=100 (mid) Absorb contained sequences with percent identity of at least this (includes indels).
+minlengthpercent=0 (mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed.
+minoverlappercent=0 (mop) Overlap must be at least this percent of smaller contig's length to cluster and merge.
+minoverlap=200 (mo) Overlap must be at least this long to cluster and merge.
+depthratio=0 (dr) When non-zero, overlaps will only be formed between reads with a depth ratio of at most this.
+ Should be above 1. Depth is determined by parsing the read names; this information can be added
+ by running KmerNormalize (khist.sh, bbnorm.sh, or ecc.sh) with the flag 'rename'
+k=31 Seed length used for finding containments and overlaps. Anything shorter than k will not be found.
+numaffixmaps=1 (nam) Number of prefixes/suffixes to index per contig. Higher is more sensitive, if edits are allowed.
+#ignoreaffix1=f (ia1) Ignore first affix (for testing).
+#storesuffix=f (ss) Store suffix as well as prefix. Automatically set to true when doing inexact matches.
+
+Other Parameters
+forcetrimleft=-1 (ftl) If positive, trim bases to the left of this position (exclusive, 0-based).
+forcetrimright=-1 (ftr) If positive, trim bases to the right of this position (exclusive, 0-based).
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Find the directory holding this script, following symlinks one hop at a time.
+# pushd/popd put the caller's working directory back afterwards.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+# Java classpath (current/) and native library directory (jni/) next to this script.
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+# Default JVM flags; calcXmx (below) may replace z and z2.
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+# calcXmx returns early when this is 1. Presumably parseXmx in calcmem.sh sets it
+# when the user passes -Xmx; confirm there.
+set=0
+
+# No arguments, -h or --help: print the help text and stop.
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+# Runs jgi.Dedupe on the JVM with the chosen flags and all script arguments.
+# The command is held as an argv array and executed directly, not flattened into a
+# string and passed to eval. Each argument therefore reaches Java exactly as the user
+# typed it, including file names or install paths that contain spaces or shell
+# metacharacters. The command line is still echoed to stderr first.
+# On NERSC genepool the required environment modules are loaded beforehand.
+dedupe() {
+ if [[ $NERSC_HOST == genepool ]]; then
+  module unload oracle-jdk
+  module load oracle-jdk/1.7_64bit
+  module load pigz
+ fi
+ # $EA, $z and $z2 stay unquoted on purpose: an empty value must add no argument.
+ local -a CMD=(java "-Djava.library.path=$NATIVELIBDIR" $EA $z $z2 -cp "$CP" jgi.Dedupe "$@")
+ echo "${CMD[*]}" >&2
+ "${CMD[@]}"
+}
+
+dedupe "$@"
diff --git a/dedupe2.sh b/dedupe2.sh
new file mode 100755
index 0000000..6c47055
--- /dev/null
+++ b/dedupe2.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#dedupe in=<infile> out=<outfile>
+
+usage() { # Print the Dedupe2 help text to stdout.
+printf '%s\n' "
+Written by Brian Bushnell and Jonathan Rood
+Last modified September 15, 2015
+
+Dedupe2 is identical to Dedupe except it supports hashing unlimited kmer
+prefixes and suffixes per sequence. Dedupe supports at most 2 of each,
+but uses slightly more memory. You can manually set the number of kmers to
+hash per read with the numaffixmaps (nam) flag. Dedupe will automatically
+call Dedupe2 if necessary (if nam=3 or higher).
+
+For documentation, please consult dedupe.sh; syntax is identical.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}" # path this script was invoked through (may be a symlink)
+while [ -h "$DIR" ]; do # follow symlinks until DIR no longer names a link
+ cd "$(dirname "$DIR")" # cd first so a relative link target resolves against the link's own directory
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory holding the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # Java classpath, passed to java via -cp
+NATIVELIBDIR="$DIR""jni/" # passed to java as -Djava.library.path
+
+z="-Xmx1g" # default max-heap flag; calcXmx may overwrite it
+z2="-Xms1g" # default initial-heap flag; calcXmx may overwrite it
+EA="-ea" # JVM flag enabling Java assertions
+set=0 # calcXmx skips automatic sizing when this is 1 (presumably set by parseXmx in calcmem.sh - confirm)
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+calcXmx () { # Set the JVM heap flags z/z2 from calcmem.sh helpers, given the script arguments.
+ . "${DIR}/calcmem.sh"
+ parseXmx "$@"
+ # Auto-size from freeRam's $RAM only when $set is not 1 after parseXmx.
+ if [[ $set != 1 ]]; then
+  freeRam 3200m 84
+  z="-Xmx${RAM}m"
+  z2="-Xms${RAM}m"
+ fi
+}
+calcXmx "$@"
+
+dedupe() { # Launch jgi.Dedupe2 in a JVM, forwarding every script argument; returns java's exit status.
+ if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: select Java and pigz via environment modules
+  module unload oracle-jdk
+  module load oracle-jdk/1.7_64bit
+  module load pigz
+ fi
+ local -a CMD=(java "-Djava.library.path=$NATIVELIBDIR" $EA $z $z2 -cp "$CP" jgi.Dedupe2 "$@") # argv array keeps paths/arguments with spaces intact; $EA/$z/$z2 stay unquoted so empty flags vanish
+ echo "${CMD[*]}" >&2 # log the command line to stderr
+ "${CMD[@]}" # run directly instead of eval, so arguments are never re-parsed by the shell
+}
+
+dedupe "$@"
diff --git a/dedupebymapping.sh b/dedupebymapping.sh
new file mode 100755
index 0000000..4a9c5fd
--- /dev/null
+++ b/dedupebymapping.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#dedupebymapping in=<infile> out=<outfile>
+
+usage() { # Print the dedupebymapping.sh help text to stdout.
+printf '%s\n' "
+Written by Brian Bushnell
+Last modified February 25, 2015
+
+Description: Deduplicates mapped reads based on pair mapping coordinates.
+
+Usage: dedupebymapping.sh in=<file> out=<file>
+
+Input may be stdin or a sam/bam file, compressed or uncompressed.
+
+
+Parameters:
+in=<file> The 'in=' flag is needed if the input file is not the
+ first parameter. 'in=stdin' will pipe from standard in.
+out=<file> The 'out=' flag is needed if the output file is not the
+ second parameter. 'out=stdout' will pipe to standard out.
+overwrite=t (ow) Set to false to force the program to abort rather
+ than overwrite an existing file.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change
+ compression level; lower compression is faster.
+keepunmapped=t (ku) Keep unmapped reads. This refers to unmapped
+ single-ended reads or pairs with both unmapped.
+monitor=f Kill this process if it crashes. monitor=600,0.01
+ would kill after 600 seconds under 1% usage.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will
+ specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}" # path this script was invoked through (may be a symlink)
+while [ -h "$DIR" ]; do # follow symlinks until DIR no longer names a link
+ cd "$(dirname "$DIR")" # cd first so a relative link target resolves against the link's own directory
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory holding the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # Java classpath, passed to java via -cp
+
+z="-Xmx3g" # default max-heap flag; calcXmx may overwrite it
+z2="-Xms3g" # default initial-heap flag; calcXmx may overwrite it
+EA="-ea" # JVM flag enabling Java assertions
+set=0 # calcXmx skips automatic sizing when this is 1 (presumably set by parseXmx in calcmem.sh - confirm)
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+calcXmx () { # Set the JVM heap flags z/z2 from calcmem.sh helpers, given the script arguments.
+ . "${DIR}/calcmem.sh"
+ parseXmx "$@"
+ # Auto-size from freeRam's $RAM only when $set is not 1 after parseXmx.
+ if [[ $set != 1 ]]; then
+  freeRam 3000m 84
+  z="-Xmx${RAM}m"
+  z2="-Xms${RAM}m"
+ fi
+}
+calcXmx "$@"
+
+dedupebymapping() { # Launch jgi.DedupeByMapping in a JVM, forwarding every script argument; returns java's exit status.
+ if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: select Java, pigz and samtools via environment modules
+  module unload oracle-jdk
+  module load oracle-jdk/1.7_64bit
+  module load pigz
+  module load samtools
+ fi
+ local -a CMD=(java $EA $z $z2 -cp "$CP" jgi.DedupeByMapping "$@") # argv array keeps paths/arguments with spaces intact; $EA/$z/$z2 stay unquoted so empty flags vanish
+ echo "${CMD[*]}" >&2 # log the command line to stderr
+ "${CMD[@]}" # run directly instead of eval, so arguments are never re-parsed by the shell
+}
+
+dedupebymapping "$@"
diff --git a/demuxbyname.sh b/demuxbyname.sh
new file mode 100755
index 0000000..49b604b
--- /dev/null
+++ b/demuxbyname.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+#demuxbyname in=<infile> out=<outfile>
+
+function usage(){ # Print the demuxbyname.sh help text to stdout.
+echo "
+Written by Brian Bushnell
+Last modified April 14, 2015
+
+Description: Demultiplexes reads based on their name (suffix or prefix) into multiple files.
+
+Usage: demuxbyname.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> names=<string,string,string...>
+
+in2 and out2 are for paired reads and are optional.
+If input is paired and there is only one output file, it will be written interleaved.
+Output filenames MUST contain a '%' symbol.
+
+Parameters and their defaults:
+
+in=<file> Input file.
+out=<file> Output files for reads with matched names.
+outu=<file> Output file for reads with unmatched names.
+prefixmode=t (pm) Match prefix of read name. If false, match suffix of read name.
+names= List of strings (or files containing strings) to parse from read names.
+length=0 If positive, use a suffix or prefix of this length from read name instead of or in addition to the list of names.
+ow=f (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+int=f (interleaved) Determines whether INPUT file is considered interleaved.
+fastawrap=80 Length of lines in fasta output.
+fastareadlen=0 Set to a non-zero number to break fasta files into reads of at most this length.
+minscaf=1 Ignore fasta reads shorter than this.
+tuc=f (touppercase) Change lowercase letters in reads to uppercase.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+qfake=30 Quality value used for fasta to fastq reformatting.
+tossbrokenreads=f (tbr) Discard reads that have different numbers of bases and qualities. By default this will be detected and cause a crash.
+ignorebadquality=f (ibq) Fix out-of-range quality values instead of crashing with a warning.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Supported input formats are fastq, fasta, fasta+qual, scarf, and bread (BBMap's native format)
+Supported output formats are fastq, fasta, fasta+qual, bread, sam, and bam (bam only if samtools is installed)
+Supported compression formats are gz, zip, and bz2
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; restored by popd below
+DIR="${BASH_SOURCE[0]}" # path this script was invoked through (may be a symlink)
+while [ -h "$DIR" ]; do # follow symlinks until DIR no longer names a link
+ cd "$(dirname "$DIR")" # cd first so a relative link target resolves against the link's own directory
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory holding the script, with a trailing slash
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # Java classpath, passed to java via -cp
+
+z="-Xmx400m" # default max-heap flag (presumably replaced by parseXmx when -Xmx is given - confirm in calcmem.sh)
+EA="-ea" # JVM flag enabling Java assertions
+set=0 # not read in this script; presumably consumed by calcmem.sh - confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments or a help flag: print usage and stop
+ usage
+ exit
+fi
+
+function calcXmx { # Load calcmem.sh and hand the script arguments to its parseXmx.
+ . "${DIR}/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+function demuxbyname() { # Launch jgi.DemuxByName in a JVM, forwarding every script argument; returns java's exit status.
+ if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: select Java, pigz and samtools via environment modules
+  module unload oracle-jdk
+  module unload samtools
+  module load oracle-jdk/1.7_64bit
+  module load pigz
+  module load samtools
+ fi
+ local -a CMD=(java $EA $z -cp "$CP" jgi.DemuxByName "$@") # argv array keeps paths/arguments with spaces intact; $EA/$z stay unquoted so empty flags vanish
+ echo "${CMD[*]}" >&2 # log the command line to stderr
+ "${CMD[@]}" # run directly instead of eval, so arguments are never re-parsed by the shell
+}
+
+demuxbyname "$@"
diff --git a/docs/Legal.txt b/docs/Legal.txt
new file mode 100755
index 0000000..5165b82
--- /dev/null
+++ b/docs/Legal.txt
@@ -0,0 +1,9 @@
+BBTools Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
+
+
+
+If you have questions about your rights to use or distribute this software, please contact Technology Transfer and IP Management at TTD at lbl.gov referring to " BB Tools (LBNL Ref 2014-042)."
+
+
+
+NOTICE. This software was developed under funding from the U.S. Department of Energy. As such, the U.S. Government has been granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, and perform publicly and display publicly. Beginning five (5) years after the date permission to assert copyright is obtained from the U.S. Department of Energy, and subject to any subsequent five (5) y [...]
diff --git a/docs/Legal_Illumina.txt b/docs/Legal_Illumina.txt
new file mode 100755
index 0000000..9ff311e
--- /dev/null
+++ b/docs/Legal_Illumina.txt
@@ -0,0 +1,3 @@
+For the Illumina-specific nucleotide sequences included in the resources directory, e.g. truseq.fa and nextera.fa:
+
+Oligonucleotide sequences © 2007-2013 Illumina, Inc. All rights reserved.
\ No newline at end of file
diff --git a/docs/ToolDescriptions.txt b/docs/ToolDescriptions.txt
new file mode 100755
index 0000000..7ba6140
--- /dev/null
+++ b/docs/ToolDescriptions.txt
@@ -0,0 +1,520 @@
+Concise descriptions of BBTools.
+For complete documentation of a specific tool, please see its shellscript, and its guide if available.
+
+
+
+Note on threads:
+
+Virtually all BBTools are multithreaded. If a description indicates that a tool is singlethreaded, that generally means there is only 1 worker thread. File input and output are usually in separate threads, so a "singlethreaded" program like ReformatReads may be observed using over 250% of the resources of a single core (in other words, 2.5 threads on average, with 1 input file and 1 output file). Programs listed as multithreaded, on the other hand, will automatically use all available [...]
+
+Note on memory:
+
+The memory usage classification of "low" or "high" is based on assumptions; with the exception of AssemblyStats (which uses a fixed amount of memory), the actual amount of memory needed varies based on the parameters and input files. While all programs can be forced to use a specific amount of memory with the -Xmx flag, the tools classified as low memory will try to grab only a small amount of memory by default when run via the shellscript, while the ones listed as high memory will try [...]
+
+
+Alignment and Coverage-Related
+
+Name: align2.BBMap
+Shellscript: bbmap.sh, removehuman.sh, removehuman2.sh, mapnt.sh
+Description: Fast and accurate splice-aware read aligner for DNA and RNA. Finds optimal global alignments. Maximum read length is 600bp.
+Notes: Multithreaded, high memory. Memory usage depends on the size of the reference; roughly 6 bytes per base, or 3 bytes per base with the flag "usemodulo".
+Additional Shellscripts: removehuman.sh calls BBMap with a prebuilt index and parameters designed to remove human contamination with zero false-positives; removehuman2.sh is designed to minimize false-negatives at the expense of allowing some false-positives. mapnt.sh calls BBMap with a prebuilt index and parameters designed to allow mapping to nt while running on a 120GB node. All of these are designed exclusively for Genepool and will not function elsewhere, so should not be distrib [...]
+
+Name: align2.BBMapPacBio
+Shellscript: mapPacBio.sh
+Description: Version of BBMap for long reads up to 6kbp. Designed for PacBio and Nanopore reads; uses alignment penalties weighted for PacBio's error model.
+Notes: Multithreaded, high memory. Memory usage depends on the size of the reference and number of threads.
+
+Name: align2.BBMapPacBioSkimmer
+Shellscript: bbmapskimmer.sh
+Description: Version of BBMap for mapping reads to all sites above a certain score threshold, rather than finding the single best mapping location. Uses alignment penalties weighted for PacBio's error model, as it was originally created to map Illumina reads to PacBio reads for error-correction.
+Notes: Multithreaded, high memory. Memory usage depends on the size of the reference and number of threads.
+
+Name: align2.BBSplit
+Shellscript: bbsplit.sh
+Description: Uses BBMap to map to multiple references simultaneously, and output one file per reference, containing all the reads that match it better than the other references. Used for metagenomic binning, distinguishing between closely-related organisms, and contamination removal.
+Notes: See BBMap.
+
+Name: align2.BBWrap
+Shellscript: bbwrap.sh
+Description: Allows multiple runs of BBMap on different input files without reloading the reference. Useful when the reference is very large.
+Notes: See BBMap.
+
+Name: jgi.CoveragePileup
+Shellscript: pileup.sh
+Description: Calculates coverage information from an unsorted or sorted sam or bam file. Outputs per-scaffold coverage, per-base coverage, binned coverage, normalized coverage, per-ORF coverage (using PRODIGAL's format), coverage histograms, stranded coverage, physical coverage, FPKMs, and various others.
+Notes: Singlethreaded, high memory. TODO: Would not be overly difficult to make a multithreaded version using A_SampleMT, but would require locks or queues.
+
+Name: driver.SummarizeCoverage
+Shellscript: summarizescafstats.sh
+Description: Summarizes the scafstats output of BBMap for evaluation of cross-contamination. The intended use is to map multiple libraries or assemblies, of different multiplexed organisms, to a concatenated reference containing one fused scaffold per organism. This will convert all of the resulting stats files (one per library) to a single text file, with multiple columns, indicating how much of the input hit the primary versus nonprimary scaffolds. See also BBMap, Pileup, Summarize [...]
+Notes: Singlethreaded, low memory.
+
+Name: jgi.FilterByCoverage
+Shellscript: filterbycoverage.sh
+Description: Filters an assembly by contig coverage, to remove contigs below a coverage cutoff, or with fewer than some percent of their bases covered. Uses coverage stats produced by BBMap or Pileup.
+Notes: Singlethreaded, low memory.
+
+Name: driver.MergeCoverageOTU
+Shellscript: mergeOTUs.sh
+Description: Merges coverage stats lines (from Pileup) for the same OTU, according to some custom naming scheme. See also CoveragePileup.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.SamToEst
+Shellscript: bbest.sh
+Description: Calculates EST (expressed sequence tags) capture by an assembly from a sam file. Designed to use BBMap output generated with these flags: k=13 maxindel=100000 customtag ordered
+Notes: Singlethreaded, low memory.
+
+Name: assemble.Postfilter
+Shellscript: postfilter.sh
+Description: Maps reads, then filters an assembly by contig coverage. Intended to reduce misassembly rate of SPAdes by removing suspicious contigs. See also BBMap and FilterByCoverage.
+Notes: Multithreaded, high memory.
+
+
+Kmer Matching
+
+Name: jgi.BBDukF
+Shellscript: bbduk.sh
+Description: Multipurpose tool for read preprocessing, which does adapter-trimming, quality-trimming, contaminant filtering, entropy filtering, sequence masking, quality score recalibration, format conversion, histogram generation, barcode filtering, gc filtering, kmer cardinality estimation, and many similar tasks.
+Notes: Multithreaded, high memory. Memory usage depends on the size of the reference (roughly 20 bytes per kmer) and whether hdist or edist are set (they multiply memory consumption by a large factor); if no reference is loaded, little memory is needed.
+
+Name: jgi.BBDuk2
+Shellscript: bbduk2.sh
+Description: Version of BBDuk that can do multiple kmer-based operations at once - left-trim, right-trim, filter, and mask.
+Notes: See BBDuk.
+
+Name: jgi.Seal
+Shellscript: seal.sh
+Description: Performs high-speed alignment-free sequence quantification or binning, by counting the number of long kmers that match between a read and a set of reference sequences. Designed for RNA-seq versus a transcriptome, metagenomic binning and abundance analysis, quantifying contamination, and similar. Very similar to BBDuk except that Seal associates each kmer with multiple reference sequences instead of just one, so it is superior in situations where multiple reference sequenc [...]
+Notes: Multithreaded, high memory. Memory usage depends on the size of the reference (roughly 30 bytes per kmer) and whether hdist or edist are set (they multiply memory consumption by a large factor).
+
+Name: driver.SummarizeSealStats
+Shellscript: summarizeseal.sh
+Description: Summarizes the stats output of Seal for evaluation of cross-contamination. The intended use is to map multiple libraries or assemblies, of different multiplexed organisms, to a concatenated reference containing one fused scaffold per organism. This will convert all of the resulting stats files (one per library) to a single text file, with multiple columns, indicating how much of the input hit the primary versus nonprimary scaffolds. Also allows filtering of certain librar [...]
+Notes: Singlethreaded, low memory.
+
+
+Kmer Counting
+
+Name: jgi.LogLog
+Shellscript: loglog.sh
+Description: Estimates the number of unique kmers within a dataset to within ~10%.
+Notes: Multithreaded, low memory. This can also be done with other programs such as BBDuk by adding the loglog flag.
+
+Name: jgi.KmerCountExact
+Shellscript: kmercountexact.sh
+Description: Counts kmers in sequence data. Capable of outputting the kmers and their counts as fasta or 2-column tsv, as well as a frequency histogram. No kmer length limits.
+Notes: Multithreaded, high memory.
+
+Name: jgi.KmerNormalize (generally referred to as BBNorm)
+Shellscript: bbnorm.sh, ecc.sh, khist.sh
+Description: Uses a lossy data structure (count-min sketch) to perform kmer-based normalization, error-correction, and/or depth-binning on reads.
+Notes: Multithreaded, high memory. BBNorm will never run out of memory; rather, as the amount of data increases, the accuracy decreases. Therefore you should always use all available memory for best accuracy. The error correction by Tadpole is superior, but Tadpole can run out of memory with large datasets.
+Additional Shellscripts: KmerNormalize is called by 3 different shellscripts, which differ only in their default parameters (which can be overridden). bbnorm.sh does 2-pass normalization only; ecc.sh does error-correction only; and khist.sh only makes a kmer histogram, without ignoring the low-quality kmers (as is done by ecc and bbnorm). But if you add the flag "ecc" to bbnorm.sh, it will do error-correction also, and so forth - with the same parameters they are all identical.
+
+Name: jgi.CalcUniqueness
+Shellscript: bbcountunique.sh
+Description: Generates a kmer uniqueness histogram, binned by file position. Designed to analyze library complexity, and determine how much sequencing is needed before reaching saturation. Outputs both single-read uniqueness and pair uniqueness.
+Notes: Singlethreaded, high memory (around 100 bytes per read pair).
+
+Name: jgi.SmallKmerFrequency
+Shellscript: commonkmers.sh
+Description: Prints the most common kmers in a sequence, their counts, and the sequence header. K is limited to 15.
+Notes: Singlethreaded, low memory. Memory is proportional to 4^k, and is trivial for short kmers under 10.
+
+Name: jgi.KmerCoverage
+Shellscript: kmercoverage.sh
+Description: Annotates reads with their kmer depth.
+Notes: Deprecated. Multithreaded, high memory.
+
+Name: jgi.CallPeaks
+Shellscript: callpeaks.sh
+Description: Calls peaks from a kmer frequency histogram, such as that from BBNorm or KmerCountExact. Also estimates genome size and other statistics.
+Notes: Singlethreaded, low memory. Normally called automatically by programs that make the histogram. The peak-calling logic is not very sophisticated and could be improved.
+
+
+Assembly
+
+Name: assemble.Tadpole
+Shellscript: tadpole.sh
+Description: Very fast kmer-based assembler, designed for haploid organisms. Performs well on single cells, viruses, organelles, and in other situations with small genomes and potentially uneven or very high coverage. Also has modes for read error-correction and extension, instead of assembly; Tadpole's error-correction is superior to BBNorm's. No upper limit on kmer length. See also KmerCountExact, KmerCompressor, LogLog, BBMerge, KmerNormalize.
+Notes: Multithreaded, high memory. Memory consumption is a strict function of the number of unique input kmers.
+
+Name: assemble.TadpoleWrapper
+Shellscript: tadwrapper.sh
+Description: Generates multiple assemblies with Tadpole to estimate the optimal kmer length.
+Notes: Multithreaded, high memory.
+
+Name: assemble.KmerCompressor
+Shellscript: kcompress.sh
+Description: Generates a minimal fasta file containing each kmer from the input sequence exactly once. Optionally allows the inclusion only of kmers within a certain depth range. Arbitrary kmer set operations are possible via multiple passes. Very similar to an assembler.
+Notes: Multithreaded, high memory. Contains a singlethreaded phase.
+
+Name: jgi.AssemblyStats2
+Shellscript: stats.sh
+Description: Generates basic assembly statistics such as scaffold count, N50, L50, GC content, gap percent, etc. Also generates per-scaffold length and base content statistics, and can estimate BBMap's memory requirements for an assembly. See also StatsWrapper.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.AssemblyStatsWrapper
+Shellscript: statswrapper.sh
+Description: Generates stats on multiple assemblies, allowing tab-delimited columns with one assembly per row, and only one header.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.CountGC
+Shellscript: countgc.sh
+Description: Counts GC content of reads or scaffolds.
+Notes: Deprecated; superseded by AssemblyStats.
+
+Name: jgi.FungalRelease
+Shellscript: fungalrelease.sh
+Description: Reformats a fungal assembly for release. Also creates contig and agp files.
+Notes: Singlethreaded, low memory.
+
+
+Taxonomy
+
+Name: tax.FilterByTaxa
+Shellscript: filterbytaxa.sh
+Description: Filters sequences according to their taxonomy, as determined by the sequence name. Sequences should be labeled with a gi number, NCBI taxID, or species name. Relies on NCBI taxdump processed using taxtree.sh and gitable.sh.
+Notes: Singlethreaded, low memory.
+
+Name: tax.RenameGiToNcbi
+Shellscript: gi2taxid.sh
+Description: Renames sequences with gi numbers to NCBI taxa IDs. This allows taxonomy processing without a gi number lookup.
+Notes: Singlethreaded, high memory. TODO: Can be made low memory if slightly altered to accept gitable.int1d files.
+
+Name: tax.GiToNcbi
+Shellscript: gitable.sh
+Description: Condenses gi_taxid_nucl.dmp from NCBI taxdmp to gitable.int1d, a more efficient representation, used by other tools for translating gi numbers to taxIDs. See also TaxTree.
+Notes: Singlethreaded, high memory.
+
+Name: tax.SortByTaxa
+Shellscript: sortbytaxa.sh
+Description: Sorts sequences into taxonomic order by some depth-first traversal of the Tree of Life as defined by NCBI taxdump. Sequences must be labelled with taxonomic identifiers.
+Notes: Singlethreaded, high memory.
+
+Name: tax.SplitByTaxa
+Shellscript: splitbytaxa.sh
+Description: Splits sequences according to their taxonomy, as determined by the sequence name. Sequences should be labeled with a gi number, NCBI taxID, or species name.
+Notes: Multithreaded, high memory. If the number of threads is restricted and the sequences are fairly short, regardless of the total number, this may be run using low memory.
+
+Name: tax.PrintTaxonomy
+Shellscript: taxonomy.sh
+Description: Prints the full taxonomy of a given taxonomic identifier (such as homo_sapiens).
+Notes: Singlethreaded, low memory.
+
+Name: tax.TaxTree
+Shellscript: taxtree.sh
+Description: Creates tree.taxtree from names.dmp and nodes.dmp, which are in NCBI tax dump. The taxtree file is needed for programs that can deal with taxonomy, like Seal and SortByTaxa.
+Notes: Singlethreaded, high memory.
+
+Name: driver.ReduceSilva
+Shellscript: reducesilva.sh
+Description: Reduces Silva entries down to one entry per specified taxonomic level. Designed to increase the efficiency of operations like mapping, in which having thousands of substrains represented is not helpful.
+Notes: Singlethreaded, low memory.
+
+
+Cross-Contamination
+
+Name: jgi.SynthMDA
+Shellscript: synthmda.sh
+Description: Generates synthetic reads following an MDA-amplified single cell's coverage distribution. Designed for single-cell assembly and analysis optimization. See also CrossContaminate, RandomReads.
+Notes: Singlethreaded, medium memory (needs around 4GB).
+
+Name: jgi.CrossContaminate
+Shellscript: crosscontaminate.sh
+Description: Generates synthetic cross-contaminated files from clean files. Intended for use with synthetic reads generated by SynthMDA or RandomReads. Designed to evaluate the effects of cross-contamination on assembly, and the efficacy of decontamination methods.
+Notes: Singlethreaded, high memory.
+
+Name: jgi.DecontaminateByNormalization
+Shellscript: decontaminate.sh, crossblock.sh
+Description: Removes contaminant contigs from assemblies of multiplexed libraries via normalization and mapping.
+Notes: Multithreaded, high memory. Mostly a wrapper for other programs like BBMap, BBNorm, and FilterByCoverage.
+
+
+Deduplication and Clustering
+
+Name: jgi.Dedupe
+Shellscript: dedupe.sh
+Description: Accepts one or more files containing sets of sequences (reads or scaffolds). Removes duplicate sequences, which may be specified to be exact matches, fully contained subsequences, or subsequence within some edit distance. Can also find overlapping sequences and group them into clusters based on transitive reachability; for example, clustering full-length 16S PacBio reads by species.
+Notes: Multithreaded, high memory. This program has a jni mode which increases speed dramatically if an edit distance is used.
+
+Name: jgi.Dedupe2
+Shellscript: dedupe2.sh
+Description: Allows more kmer seeds than Dedupe. This will be automatically called by Dedupe if needed.
+Notes: See Dedupe.
+
+Name: jgi.DedupeByMapping
+Shellscript: dedupebymapping.sh
+Description: Removes duplicate reads or read pairs from a sam/bam file based on mapping coordinates. The sam file does not need to be sorted.
+Notes: Singlethreaded, high memory.
+
+Name: clump.Clumpify
+Shellscript: clumpify.sh
+Description: Rearranges unsorted reads into small clumps of reads, such that each clump shares a kmer, and thus probably overlaps. Can also create consensus sequence from these clumps.
+Notes: Multithreaded, low or high memory. Memory consumption may be made arbitrarily small by using a user-specified number of temp files for bucket-sorting. By default, it will try to grab all available memory.
+
+
+Read Merging
+
+Name: jgi.BBMerge
+Shellscript: bbmerge.sh, bbmerge-auto.sh
+Description: Merges paired reads into single reads by overlap detection. With sufficient coverage, can also merge nonoverlapping reads by kmer extension.
+Notes: Multithreaded, low memory. If kmers are used (for extension or error-correction), it will need much more memory, and the shellscript bbmerge-auto.sh should be used, which tries to acquire all available RAM. This program has a jni mode which increases speed by around 20%.
+
+Name: jgi.MateReadsMT
+Shellscript: bbmergegapped.sh
+Description: Uses gapped kmers to merge nonoverlapping reads.
+Notes: Deprecated; superseded by BBMerge.
+
+
+Synthetic Read Generation and Benchmarking
+
+Name: align2.RandomReads3
+Shellscript: randomreads.sh
+Description: Generates random synthetic reads from a reference genome, annotated with their genomic origin. Allows precise customization of things like insert size and synthetic mutation type, sizes, and rates. Read names are parsed by various other BBTools to grade accuracy.
+Notes: Singlethreaded, high memory.
+
+Name: jgi.FakeReads
+Shellscript: bbfakereads.sh
+Description: Generates fake read pairs from ends of contigs or single reads. Intended for use in generating a fake LMP library for scaffolding, using additional information like another assembly, or very long reads (like PacBio). This can also be accomplished with RandomReads.
+Notes: Singlethreaded, low memory.
+
+Name: align2.GradeSamFile
+Shellscript: gradesam.sh
+Description: Grades the accuracy of an aligner (such as BBMap) by parsing the output. The reads must be single-ended and annotated as though generated by RandomReads.
+Notes: Singlethreaded, low memory.
+
+Name: align2.MakeRocCurve
+Shellscript: samtoroc.sh
+Description: Creates an ROC plot (technically, true-positive versus false-positive) from a sam or bam file of mapped reads. The reads should be single-ended with headers generated by RandomReads.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.AddAdapters
+Shellscript: addadapters.sh
+Description: Randomly adds adapters to a file, or grades a trimmed file. The input is a set of reads, paired or unpaired. The output is those same reads with adapter sequence replacing some of the bases in some reads. For paired reads, adapters are located in the same position in read1 and read2. This is designed for benchmarking adapter-trimming software (such as BBDuk), and evaluating methodology. Adapters can alternately be added by RandomReads, in which case insert size is used to [...]
+Notes: Singlethreaded, low memory.
+
+Name: jgi.GradeMergedReads
+Shellscript: grademerge.sh
+Description: Grades the accuracy of a read-merging program (such as BBMerge) by parsing the output. The reads must be annotated by their insert size. This can be done by generating them with RandomReads and renaming with RenameReads.
+Notes: Singlethreaded, low memory.
+
+Name: align2.PrintTime
+Shellscript: printtime.sh
+Description: Prints time elapsed since last called on the same file.
+Notes: Singlethreaded, low memory.
+
+
+16S, Primers, and Amplicons
+
+Name: jgi.FindPrimers
+Shellscript: msa.sh
+Description: Aligns a query sequence to reference sequences. Outputs the best matching position per reference sequence. If there are multiple queries, only the best-matching query will be used. Designed to find primer binding sites in a sequence that may contain indels, such as a PacBio read, using a MultiStateAligner.
+Notes: Singlethreaded, high memory. TODO: Could easily be made multithreaded using A_SampleMT.
+
+Name: jgi.CutPrimers
+Shellscript: cutprimers.sh
+Description: Cuts out sequences corresponding to primers identified in sam files. Used in conjunction with FindPrimers (msa.sh).
+Notes: Singlethreaded, low memory.
+
+Name: jgi.IdentityMatrix
+Shellscript: idmatrix.sh
+Description: Generates an identity matrix via all-to-all alignment of sequences in a file. Intended for 16S or other amplicon analysis. See also CorrelateIdentity.
+Notes: Multithreaded, high memory. Time complexity is O(N^2) with the number of reads.
+
+Name: driver.CorrelateIdentity
+Shellscript: matrixtocolumns.sh
+Description: Transforms two matched identity matrices into 2-column format, one row per entry, one column per matrix. Designed for comparing different 16S subregions. See also IdentityMatrix, FindPrimers.
+Notes: Singlethreaded, high memory. The actual amount of memory just depends on the matrix sizes.
+
+
+Barcodes
+
+Name: jgi.CountBarcodes
+Shellscript: countbarcodes.sh
+Description: Counts the number of reads with each barcode. Assumes read names have the barcode at the end.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.CorrelateBarcodes
+Shellscript: filterbarcodes.sh
+Description: Filters barcodes by quality, and generates quality histograms. See also MergeBarcodes.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.MergeBarcodes
+Shellscript: mergebarcodes.sh
+Description: Concatenates barcodes and barcode quality onto read names. Designed to analyze the effects of barcode quality on library misassignment. See also CorrelateBarcodes.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.RemoveBadBarcodes
+Shellscript: removebadbarcodes.sh
+Description: Removes reads with improper barcodes - either with no barcode, or a barcode containing a degenerate base.
+Notes: Singlethreaded, low memory. Mostly a test case for extending BBTool_ST.
+
+
+Filtering and Demultiplexing
+
+Name: jgi.DemuxByName
+Shellscript: demuxbyname.sh
+Description: Demultiplexes reads into multiple files based on their name, by matching a suffix or prefix.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.FilterBySequence
+Shellscript: filterbysequence.sh
+Description: Filters reads by exact sequence match. Allows case-sensitive or insensitive matches, and reverse-complement matches or only forward matches.
+Notes: Multithreaded, high memory.
+
+Name: driver.FilterReadsByName
+Shellscript: filterbyname.sh
+Description: Filters reads by name. Allows substring matching, though that is much slower.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.FilterReadsWithSubs
+Shellscript: filtersubs.sh
+Description: Filters a sam file to select only reads with substitution errors for bases with quality scores in a certain interval. Used for manually examining specific reads that may contain incorrectly calibrated quality scores.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.GetReads
+Shellscript: getreads.sh
+Description: Fetches the reads with specified numeric IDs (unrelated to their names). The first read (or pair) in a file has ID 0, the second read (or pair) has ID 1, etc.
+Notes: Singlethreaded, low memory.
+
+Name: driver.EstherFilter
+Shellscript: estherfilter.sh
+Description: BLASTs queries against reference, and filters out hits with scores less than 'cutoff'.
+Notes: All the work is done by blastall, which dictates the performance characteristics.
+
+
+JGI-Exclusive Preprocessing Wrappers
+
+Name: jgi.BBQC
+Shellscript: bbqc.sh
+Description: Wrapper for various read preprocessing operations.
+Notes: Deprecated; superseded by RQCFilter. Designed exclusively for Genepool and will not function elsewhere, so should not be distributed outside LBL.
+
+Name: jgi.RQCFilter
+Shellscript: rqcfilter.sh
+Description: Acts as a wrapper/pipeline for read preprocessing. Performs quality-trimming, artifact removal, linker-trimming, adapter trimming, spike-in removal, vertebrate contaminant removal, microbial contaminant removal, and generates various histogram and statistics files used by RQC.
+Notes: Multithreaded, high memory. Currently requires 39500m RAM and thus can run on a 40G node, but it's recommended to submit it exclusive, as all stages are fully multithreaded. Designed exclusively for Genepool and will not function elsewhere, so should not be distributed outside LBL.
+
+
+Shredding and Sorting
+
+Name: jgi.Shred
+Shellscript: shred.sh
+Description: Shreds long sequences into shorter sequences, with overlap length and variable-length options. See also Fuse.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.FuseSequence
+Shellscript: fuse.sh
+Description: Fuses sequences together, padding junctions with Ns. Does not support total length greater than 2 billion. Designed for use with Seal or BBDuk to make kmer tracking for a given genome more efficient. See also Shred.
+Notes: Singlethreaded, high memory.
+
+Name: jgi.Shuffle
+Shellscript: shuffle.sh
+Description: Reorders reads randomly, keeping pairs together. Also supports some sorting operations, like alphabetically by name or by sequence.
+Notes: Singlethreaded, high memory. All operations are in-memory.
+
+
+Non-Sequence-Related
+
+Name: Calcmem - Shellscript Only
+Shellscript: calcmem.sh
+Description: Calculates available memory for other shellscripts. Designed for Genepool but works fine on many Linux configurations.
+Notes: If java is being killed for allocating too much memory, this is the script to fix.
+
+Name: fileIO.TextFile
+Shellscript: textfile.sh
+Description: Displays contents of a text file, optionally between a start and stop line. Useful mainly in Windows where there are few command-line utilities.
+Notes: Singlethreaded, low memory.
+
+Name: driver.CountSharedLines
+Shellscript: countsharedlines.sh
+Description: Counts the number of lines shared between sets of files. One output file will be printed for each input file. For example, an output file for a file in the 'in1' set will contain one line per file in the 'in2' set, indicating how many lines are shared. This is not designed for sequence data, but more for things like sequence names or organism names. See filterlines.sh for actually filtering shared lines in a more normal fashion.
+Notes: Singlethreaded, low memory.
+
+Name: driver.FilterLines
+Shellscript: filterlines.sh
+Description: Filters lines by exact match or substring. This is not designed for sequence data, but for things like sequence names or organism names.
+Notes: Singlethreaded, low memory.
+
+
+Other Tools
+
+Name: jgi.A_SampleMT
+Shellscript: a_sample_mt.sh
+Description: Does nothing. Serves as a template for easily making new BBTools by dropping in code.
+Notes: Multithreaded, high memory. Be sure to modify the shellscript line " freeRam 4000m 84" as needed. The first is the amount of memory used if available memory cannot be calculated, the second is the percentage of free memory to use if it can be calculated.
+
+Name: jgi.BBMask
+Shellscript: bbmask.sh
+Description: Masks sequences of low-complexity, or containing repeat kmers, or covered by mapped reads. Used to make masked versions of human, cat, dog, and mouse genomes; these are used for filtering vertebrate contamination from fungal/plant/microbial data without risk of false-positive removals.
+Notes: Multithreaded, high memory. Uses around 2 bytes per reference base.
+
+Name: jgi.CalcTrueQuality
+Shellscript: calctruequality.sh
+Description: Generates matrices used for quality-score recalibration. Requires one or more mapped sam files as input. The actual recalibration is done with another program such as BBDuk.
+Notes: Multithreaded, low memory.
+
+Name: jgi.MakeChimeras
+Shellscript: makechimeras.sh
+Description: Makes chimeric sequences by randomly fusing together nonchimeric sequences. Designed for analyzing chimera removal effectiveness.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.PhylipToFasta
+Shellscript: phylip2fasta.sh
+Description: Transforms interleaved phylip to fasta.
+Notes: Singlethreaded, high memory.
+
+Name: jgi.MakeLengthHistogram
+Shellscript: readlength.sh
+Description: Makes a length histogram of sequences.
+Notes: Singlethreaded, low memory. Can also be accomplished with Reformat or BBDuk, but with less flexibility.
+
+Name: jgi.ReformatReads
+Shellscript: reformat.sh
+Description: Reformats sequence data into another format, such as interleaved ASCII-33 fastq to twin-file ASCII-64. Also supports a huge collection of simple optional operations, like trimming, filtering, reverse-complementing, modifying read names, and modifying read sequence.
+Notes: Singlethreaded, low memory.
+
+Name: pacbio.RemoveAdapters2
+Shellscript: removesmartbell.sh
+Description: Detects or removes SmartBell adapters from PacBio reads, by aligning the adapter using a customized version of the MultiStateAligner.
+Notes: Multithreaded, low memory.
+
+Name: jgi.RenameReads
+Shellscript: rename.sh
+Description: Renames reads according to some specified prefix. Can also rename by insert size or mapping location.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.SplitPairsAndSingles
+Shellscript: repair.sh, bbsplitpairs.sh
+Description: Separates paired reads into files of pairs and singletons by removing reads that are shorter than a min length, or have no mate. Can also reorder arbitrarily-ordered reads in files where the pairing order was desynchronized. See also Reformat's vint flag.
+Notes: Singlethreaded, low or high memory. All operations are low-memory except reordering arbitrarily disordered files, which is optional.
+
+Name: jgi.SplitNexteraLMP
+Shellscript: splitnextera.sh
+Description: Trims and splits Nextera LMP libraries into subsets based on linker orientation: LMP, fragment, unknown, and singleton.
+Notes: Singlethreaded, low memory. TODO: Should be reimplemented using A_SampleMT.
+
+Name: jgi.SplitSamFile
+Shellscript: splitsam.sh
+Description: Splits a sam file into three files: Plus-mapped reads, Minus-mapped reads, and Unmapped.
+Notes: Singlethreaded, low memory.
+
+Name: fileIO.FileFormat
+Shellscript: testformat.sh
+Description: Tests the format of a sequence-containing file. Determines format (fasta, fastq, etc), quality encoding, compression type, interleaving, and read length. All BBTools use this to determine how to process a file.
+Notes: Singlethreaded, low memory.
+
+Name: jgi.TranslateSixFrames
+Shellscript: translate6frames.sh
+Description: Translates nucleotide sequences to all 6 amino acid frames, or amino acids to a canonical nucleotide representation.
+Notes: Singlethreaded, low memory.
+
+
+Template
+
+Name:
+Shellscript:
+Description:
+Notes:
diff --git a/docs/UsageGuide.txt b/docs/UsageGuide.txt
new file mode 100755
index 0000000..f2975b8
--- /dev/null
+++ b/docs/UsageGuide.txt
@@ -0,0 +1,306 @@
+BBMap/BBTools usage guide.
+Last updated December 11, 2015
+
+Table of Contents
+
+System Requirements
+Installation
+Terminology Notes
+Usage
+Standard Syntax
+Paired Reads
+Multiple Output and % Symbol
+File Formats
+Piping
+Memory and Java Flags
+Threads
+Subprocesses
+Additional Help
+Standard flags
+Help Flags
+Config Flags
+Input Flags
+Output Flags
+Sampling Flags
+Compression Flags
+Quality-Related Flags
+Length-Related Flags
+Histogram Flags
+Advanced Flags
+Buffer Flags
+MPI and JNI Flags
+
+
+System Requirements
+
+BBTools is written in Java and requires Java 1.7 or higher for full functionality. It is tested on Oracle's JDK, not OpenJDK. Most tools will work with Java 1.6 (if not, they will throw a ClassNotFound exception), and most tools will work with OpenJDK, but if you experience a problem with Java 1.6 or OpenJDK it is recommended that you install Oracle's latest JDK, which is currently 1.8. All operating systems that support Java are supported. Note that many of the tools require a subst [...]
+
+java -Xmx90m -version
+
+Installation (for non-Genepool users)
+
+BBTools can be installed by downloading the gzipped tar file from Sourceforge (http://sourceforge.net/projects/bbmap/files/latest/download) and decompressing it. In Linux, the command would be:
+
+tar xvzf BBMap_35.74.tar.gz
+
+Then, optionally, you can export the path of the shellscripts to your environment to make it easier to run. The Java code is already compiled and does not need recompilation. Source code is also available from bitbucket (https://bitbucket.org/berkeleylab/jgi-bbtools), but this is currently a private Berkeley account.
+
+BBTools includes bash wrapper scripts to make the command lines shorter. The package also contains C code that can accelerate certain programs, and experimental MPI code that can make it difficult to compile BBTools on systems without MPI support. None of these is required. But, you can accelerate BBMerge, Dedupe, and BBMap by following the instructions in /jni/README.txt if you have a C compiler.
+
+Installation (for Genepool users)
+
+BBTools is a module that can be loaded with the command "module load bbtools". This will put all of the shellscripts in your path.
+Installation (for Genepool admins)
+
+A new version of the module is created automatically by Jenkins when BBTools is committed to the Bitbucket repo berkeleylab/jgi-bbtools, with a new version number. Process:
+Update files in the local repository copy of the master branch (e.g. /global/projectb/sandbox/gaag/bbtools/jgi-bbtools)
+Ensure everything looks correct: git status
+Add the new files: git add .
+Commit changes: git commit -m "changes go here"
+Tag the new version: git tag -a v35.75 -m 'Version 35.75'
+Push the changes: git push origin master --tags
+Compiled code (.class files, etc) is ignored in this repo; when Jenkins sees the push, it will recompile everything, using the Eclipse JDK, which produces faster bytecode than Oracle's.
+
+Terminology Notes
+
+"Read" in this file is used synonymously with "sequence", whether it is a contig in a fasta file or a short read produced by a sequencing platform. "Paired reads" or "pair" refer to 2 reads that are generated by sequencing both ends of a single fragment of DNA. These are typically delivered in two fastq files, named something like "read1.fastq.gz" and "read2.fastq.gz". The alternative is single-ended reads, in which only one end of the molecule is sequenced. When paired reads are avail [...]
+
+
+Usage
+
+Most BBTools use the same syntax and operate with a set of standard flags. Individual tools also have specific flags - for example, kmer-based tools support the flag "k" to specify the kmer length, and non-kmer-based tools don't. This guide describes the standard syntax and most common flags. Custom syntax and flags for a given tool are described in that tool's shellscript.
+
+
+Standard Syntax
+
+Most BBTools (such as Reformat or BBNorm) process genomic sequences in some fashion, and are executed like this:
+
+reformat.sh in=reads.fq out=processed.fq
+
+The shellscript allows autodetection of memory (in some cases) and classpath.
+The above command is equivalent to this:
+
+java -ea -Xmx200m -cp /path/to/bbmap/current/ jgi.ReformatReads in=reads.fq out=processed.fq
+
+Note that "/path/to/bbmap/current/" needs to be replaced with an actual path. While the shellscript will only work in bash (or some other Linux/Unix/MacOS shells),
+the full Java command will work on any environment with Java installed, such as Windows.
+
+Tools that use a reference (such as BBMap, BBDuk, and Seal) will also need the additional flag "ref=":
+
+bbmap.sh in=reads.fq out=mapped.sam ref=genome.fasta
+
+In each of the above cases, the flags can be arranged in any order.
+
+
+Paired Reads
+
+Most BBTools support paired reads. These may be in two files, or interleaved in a single file, which BBTools will autodetect based on the read names. When the reads are in two files, you can use the in2 and out2 flags, like this:
+
+reformat.sh in1=read1.fq in2=read2.fq out1=processed1.fq out2=processed2.fq
+
+It is also possible to specify paired files like this:
+
+reformat.sh in=read#.fq out=processed#.fq
+
+...which is equivalent to the above command.
+
+It is important to process paired files together in one command so that they are kept in the proper order. If you have dual input files and only 1 output file, the output will be written interleaved, and vice-versa. All tools that support paired reads will keep pairs together. For example, Reformat supports subsampling; if read 1 is discarded, read 2 will also be discarded. This prevents a loss of synchronization that corrupts the output.
+
+
+Multiple Output and % Symbol
+
+Some tools (such as Seal, BBSplit, BBMap, Dedupe) can use the % symbol as a wildcard, to be replaced by some other word when generating many files from a single input. It is recommended that the % symbol be avoided in filenames. As an example, assume you run Seal to bin some reads based on matching sequences in the fasta file "ref.fa", which contains the genomes of e.coli and salmonella:
+
+seal.sh in=reads.fq pattern=out_%.fq ref=ref.fa
+
+This would produce the output files "out_e.coli.fq" and "out_salmonella.fq".
+
+
+File Formats and Extensions
+
+BBTools support most standard sequence formats, including fastq, fasta, fasta+qual, scarf, sam, and (if samtools is installed) bam. They also support gzip and (if bzip2 or pbzip2 is installed) bzip2. The tools are sensitive to file extensions. For example:
+
+reformat.sh in=reads.fq.gz out=processed.fa
+
+In this case, reformat will try to read a gzip-compressed fastq file and output an uncompressed fasta file. For BBMap, this means that it will output a sam file if you name the output ".sam", bam if you name it ".bam", fastq if you name it ".fastq", and so forth. BBTools are usually capable of autodetecting input format (for example, if you feed it a fasta file called "stuff.txt" it will be able to determine that it is in fasta format), but this is not recommended. Also, it is possible t [...]
+
+List of supported file extensions:
+
+Fastq: fastq, fq
+Fasta: fasta, fa, fas, fna, ffn, frn, seq, fsa, faa
+Bread: bread (BBMap internal format; deprecated)
+Sam: sam
+Bam: bam
+Qual: qual (should be accompanied with fasta)
+Scarf: scarf (an old Illumina format; input only)
+Phylip: phylip (only supported by phylip2fasta; input only)
+Text: txt (used for logs, stats, and histograms)
+Header: header (use this extension to write read names only)
+
+List of supported compression extensions:
+
+Gzip: gzip, gz
+Bzip2: bz2
+Zip: zip
+
+Piping and Screen Output
+
+Most tools can accept input from stdin and write output to stdout, with notable exceptions being BBNorm and Tadpole in some processing modes, which require reading the input file multiple times. Piping works like this:
+
+cat reads.fq.gz | reformat.sh in=stdin.fq.gz out=stdout.fa int=f > x.fa
+
+Note that the extensions are added to stdin and stdout so that Reformat knows how to interpret the data; when piping, it cannot first autodetect the file format. Similarly, it cannot autodetect whether the reads are interleaved or not. So, "int=f" (equivalent to "interleaved=false") was added to force it to treat the data as single-ended.
+
+By default, all tools write status information to stderr, not stdout. To capture a program's screen output, do this:
+
+reformat.sh in=a.fq out=b.fq 1>out.txt 2>err.txt
+
+Or, to direct both to a single file:
+
+reformat.sh in=a.fq out=b.fq 1>out.txt 2>&1
+
+Memory and Java Flags
+
+There are two flags that are passed by the shellscripts directly to Java rather than to BBTools, "-Xmx" and "-da".
+Java does not dynamically grow virtual memory as needed like C programs. The amount of virtual memory must be specified up front, and it will immediately be grabbed; the physical memory used will only increase as needed. The shellscripts will try to autodetect memory and set it to an appropriate value, but sometimes this will need to be overridden (for example, if you are using a shared node and don't really need all the memory, or not enough memory was allocated and the program crashe [...]
+
+reformat.sh in=reads.fq out=processed.fq -Xmx3g
+
+That's the equivalent of:
+
+java -ea -Xmx3g -cp /path/to/bbmap/current/ jgi.ReformatReads in=reads.fq out=processed.fq
+
+The "-ea" flag means "enable assertions", which will make BBTools crash if they detect a problem. If you want to ignore the problem and force it to run anyway, you can use the "-da" flag. The -da flag may also increase speed slightly.
+
+
+Threads
+
+Most BBTools are multithreaded, and will automatically detect and use all available threads. This is usually desirable when you have exclusive use of a computer, but may not be on a shared node. The number of threads can be capped at X with the flag "t=X" (threads=X). The total CPU usage may still go higher, though, due to several factors:
+1) Input and output are handled in separate threads; "t=X" only regulates the number of worker threads.
+2) Java uses additional threads for garbage collection and other virtual machine tasks.
+3) When subprocesses (such as pigz) are spawned, they also individually obey the thread limit, but if you set "t=4" and the process spawns 3 pigz instances, you could still potentially use over 16 threads - 4 worker threads, 4 threads for each pigz process, plus other threads for the JVM and I/O.
+If you have exclusive use of a computer, you don't need to worry about spawning too many threads; this is only an issue with regards to fairness on shared nodes.
+
+
+Subprocesses
+
+If they are installed, BBTools will automatically use samtools for sam<->bam conversion, and bzip2 or pbzip2 for processing bz2 files. It may use pigz to accelerate processing of gzipped files, depending on the number of threads available. This is generally fine on a standalone computer, but in some circumstances, depending on the cluster configuration, the scheduler may kill a process that spawns a subprocess for violating virtual memory limits (Amazon instances may do this). In that [...]
+
+
+Additional Help
+
+There are many forum threads on SeqAnswers describing the usage of different BBTools, linked from this thread:
+http://seqanswers.com/forums/showthread.php?t=41057
+
+
+*Standard flags*
+
+The flags below work with many or all BBTools that process reads, but the list is not complete because it does not include flags specific to only one or a few tools - those are listed in the shellscript. They are listed with their default setting, but some of the defaults differ between tools; the specific default is also listed in the shellscript. Where the description starts with something in parentheses, like "(in1)", that is an acceptable alternative version of the flag.
+
+
+Flag Syntax
+
+With the exception of certain special flags like help flags (--help, --version) and Java flags (-Xmx, -da), all flags are in the same format: "a=b" where "a" is the name of the flag, and is not case-sensitive, and "b" is the value, which is case-sensitive (for filenames). Flags may be in any order, and never need leading hyphens, except for those special flags mentioned above. If a flag is set twice, the later value will override the former; for example, "reformat.sh in=x.fq in=y.fq" w [...]
+For boolean variables, "null" is equivalent to "true", and values may be abbreviated "t" and "f". So, these are all equivalent: "overwrite", "overwrite=t", and "overwrite=true".
+
+Help Flags
+
+--help Print the usage information from the shellscript (when run from a shellscript). Alternately you can just look at the shellscript with a text editor.
+--version Print the version of BBTools.
+
+Config Files
+
+config=<file> A file or a comma-delimited list of files. If this flag is present, the contents of the config file will be added to the command line. Config files must contain one argument per line. Config files are never required, but may be useful when a command line would be too long or when arguments contain whitespace. See readme_config.txt for more information.
+
+Input Flags
+
+in=<file> (in1) Main input.
+in2=<file> Input for 2nd read of pairs in a different file.
+interleaved=auto (int) t/f overrides interleaved autodetection.
+samplerate=1 Set lower to only process a fraction of input reads.
+qfin=<.qual file> Read qualities from this qual file, for the reads coming from 'in'
+qfin2=<.qual file> Read qualities from this qual file, for the reads coming from 'in2'
+extin= Allows overriding of input file format. For example, "extin=.fq" would force the input to be read in fastq format regardless of the file name.
+trimreaddescription=f (trd) Trim the names of reads after the first whitespace.
+touppercase=f (tuc) Convert lowercase letters in reads to upper case.
+lowercaseton=f (lctn) Convert lowercase letters in reads to N.
+utot=f Convert U bases to T.
+
+Output Flags
+
+out=<file> (out1) Main output.
+out2=<file> Output for 2nd read of pairs in a different file.
+qfout=<.qual file> Write qualities to this qual file, for the reads going to 'out'
+qfout2=<.qual file> Write qualities to this qual file, for the reads going to 'out2'
+extout= Allows overriding of output file format. For example, "extout=.fq" would force the output to be written in fastq format regardless of the file name.
+fastawrap=70 Length of lines in fasta output.
+overwrite=f Allow overwriting of existing files.
+append=f Append to existing files.
+
+Sampling Flags
+
+reads=-1 Set to a positive number to only process this many input reads (or pairs), then quit.
+samplerate=1 Randomly output only this fraction of reads; 1 means sampling is disabled.
+sampleseed=-1 Set to a positive number to use that prng seed for sampling (allowing deterministic sampling). A negative number will use a random seed.
+
+Threading Flags
+
+threads=auto (t) Number of worker threads to spawn.
+
+Compression Flags
+
+ziplevel=2 (zl) Compression level for zip or gzip output; 1-9.
+unpigz= Spawn a pigz process for faster decompression. Requires pigz to be installed. Valid values are t or f; the default varies by program.
+pigz= Spawn a pigz process for faster compression. Requires pigz to be installed. Valid values are t, f, or a number; the default varies by program. "pigz=X" will enable pigz, and also force all pigz processes to use exactly X threads.
+
+Quality-Related Flags
+
+qin=auto Input quality offset: 33 (Sanger), 64, or auto.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+qfake=30 Quality value used for fasta to fastq reformatting.
+maxcalledquality=41 Cap quality values at this upper level.
+mincalledquality=0 Cap quality values at this lower level.
+ignorebadquality (ibq) Don't crash if quality values appear to be incorrect.
+qtrim=f Enable or disable quality trimming. May be set to r, l, or rl to trim the right, left, or both sides.
+trimq= Trim bases below this quality value.
+
+Length-Related Flags
+
+fastareadlen= Fasta sequences longer than this are broken into subsequences of at most this length, and given a suffix such as _part_1. Only works with fasta files; generally designed for mapping very long sequences with BBMap.
+fastaminlen= Discard fasta sequences shorter than this.
+maxlen= Has different meanings depending on the program. For BBMap, reads longer than this will be broken to pieces this length. For most other programs, it acts as a filter.
+minlen= Has different meanings depending on the program. Typically, reads shorter than this will be discarded.
+
+Histogram Flags
+
+bhist=<file> Base composition histogram by position.
+qhist=<file> Quality histogram by position.
+qchist=<file> Count of bases with each quality value.
+aqhist=<file> Histogram of average read quality.
+bqhist=<file> Quality histogram designed for box plots.
+lhist=<file> Read length histogram.
+gchist=<file> Read GC content histogram.
+gcbins=100 Number of gchist bins. Set to 'auto' to use read length.
+
+*Advanced Flags*
+Debugging and Benchmarking Flags
+
+
+verbose=f Print status messages for debugging.
+parsecustom=f Parse synthetic read names for custom data stored by RandomReads.
+
+Buffering and I/O Flags
+
+readbufferlength=200 Number of reads to store per ListNum. A ListNum is the smallest unit of work sent to a worker thread.
+readbuffers= Number of ListNums to store in the queue waiting for worker threads. The default is 150% of the number of threads.
+bf1= Set to true to force ByteFile1 to be used for reading files.
+bf2= Set to true to force ByteFile2 to be used for reading files (faster).
+
+MPI and JNI Flags
+
+usejni=f Set to true to enable JNI-accelerated versions of BBMerge, BBMap, and Dedupe. Requires the C code to be compiled.
+mpi=0 Inform the program of how many MPI processes will be used. Most programs are not currently MPI-capable.
+crismpi=f Use an MPI version of ConcurrentReadInputStreams.
+mpikeepall=t If using MPI, send all reads to all processes.
+
+
diff --git a/docs/changelog.txt b/docs/changelog.txt
new file mode 100755
index 0000000..4a788b2
--- /dev/null
+++ b/docs/changelog.txt
@@ -0,0 +1,2134 @@
+BBTools changelog and todo list.
+
+V35.
+35.00
+Changed Gene.toChromosome to return an int rather than a byte.
+Changed gitable.int2d name to gitable.int1d since it is a 1D array.
+Added taxa support for ArrayListSet.
+Added % support for output in Reformat. Requested by Alex Copeland.
+Added gitable.sh script for generating gitable.int1d.gz tax translator.
+35.01
+Fixed BBDuk crash when K>31 and stats output was enabled. Noted by Alex Spunde.
+Fixed repair.sh failure on fint flag.
+Fixed SplitPairsAndSingles not working on interleaved input anymore.
+Split Tadpole's mincount flag into mincountseed and mincountextend (mcs and mce).
+Added rcomp flag to BBMap. Requested by Bryce Foster.
+Added merge flag to KmerCountExact.
+35.02
+maq flag now accepts 2 arguments: maq=Q,B. If second argument is specified, only the initial B bases will be used to calculate the quality.
+Added minprob and maq flags to Tadpole and KmerCountExact.
+Fixed memory detection in calcmem.sh not working when ulimit=unlimited. Thanks for the debugging help from Jason S!
+Added some getters to KmerForest and KmerNode.
+Enabled Tadpole kmer harvesting from victim buffer.
+Greatly accelerated Tadpole by allowing threads to compete for tables, rather than using fixed allocation.
+Accelerated Tadpole by increasing default number of tables per thread.
+35.03
+Wrote Shred and shred.sh.
+BBMap can now output mapping stats to a file with the statsfile= flag. Requested by Vasanth.
+35.04
+Integrated extension into BBMerge (extend= flag).
+BBNorm now does ecc after deciding whether to discard a read, not before.
+35.05
+Fixed FilterByCoverage ignoring minCoverage if pre-normalization covstats not given.
+35.06
+Added BBMap lengthtag. Requested by Esther Singer.
+35.07
+Fixed rbb flag in BBNorm not working (conflated with parser flag).
+Integrated a shellscript modification that allows shellscripts to be symlinked and still find the correct classpath. Thanks Elmar Pruess!
+35.08
+Fixed rcompmate flag; it was triggering an assertion error.
+Added BBMap showprogress2 flag.
+Got rid of ReadInputStream.preferBlocks and associated methods.
+Simplified how Reformat works with in1 vs ffin1, and sam files.
+Fixed bug in which Reformat was dropping header lines. Reported by Gloria F.
+Fixed bug in BBMergeOverlapper pfilter for reads of different length.
+Fixed bug in BBMergeOverlapper for reads of different length.
+35.09
+Removed BBMerge hist2 and hist3 which were redundant; added showhiststats flag.
+Added BBMerge prealloc and prefilter flags.
+Removed some old BBMerge functionality (outinsert, perfectonly, etc).
+Changed extend to extend1 and extend2.
+Completely rewrote BBMerge's code path to break it into small modular functions.
+Memory allocation exceptions in HashArray are now handled gracefully.
+BBMerge now uses tadpole for kfilter.
+BBMerge can now extend before or after merge attempts.
+Tadpole can now do error correction via pincer.
+Tadpole can now do error correction via tail also.
+Added genome size estimation to KmerCountExact (via CallPeaks). This will be printed in the peaks output header.
+Fixed BBMap slowdown caused by rescue in LMP libraries. Thanks Marc Strous and Xiaoli Dong (Metawatt team) for helping me track it down!
+35.10
+Removed a debugging line from Tadpole that made it crash when extending reads.
+35.11
+Removed a debug assertion from SamReadInputStream. Found by Kurt LaButti.
+Improved descriptions in kmercountexact.sh.
+Centralized memory statistics printing in Shared.
+Separated Tadpole's load phase (KmerTableSet) from build phase (Tadpole).
+Added catch blocks for memory exceptions when reading objects from disk.
+Added catch blocks for memory exceptions when indexing.
+Added catch blocks for memory exceptions in ChromosomeArrays and CoverageArrays.
+RandomReads now correctly outputs names in fasta format.
+RandomReads now has simple names without custom BBMap coordinates.
+KmerCountExact now uses KmerTableSet.
+Parsing is more robust for Tadpole, KmerCountExact, and KmerTableSet.
+Coverage estimate (based on first peak) now in KmerCountExact. Requested by Vasanth.
+Added ihist PercentOfPairs header line.
+Added trim support to KmerTableSet.
+Added triangle filter for smoothing histograms before peak calling. Vastly improves result. Requested by Alex Copeland.
+35.12
+Updated shellscripts to have consistent formatting, and fixed various typos.
+Reimplemented outinsert for BBMerge. Requested by Matt Nolan.
+35.13
+Wrote Tadpole.explore.
+Removed debugging parameters (rid, pos) from Tadpole/KmerTableSet ownership functions.
+Fixed massive performance problem in KmerArray - victim buffer was being searched for nonexistent kmers.
+Wrote function to clear and regenerate tables after shaving.
+Shaving now seems to work correctly.
+Reduced mcs (minclustersize) in Dedupe from 2 to 1, to match the documentation.
+Added Shaver bubble-removal and improved statistics tracking.
+Added Tadpole markBadBases (mbb) flag for turning bases covered by low-count kmers into N.
+Added Tadpole mode=correct/ecc for correction without extension.
+Tadpole now uses in/out when in extend/ecc mode and ine/oute are not specified.
+35.14
+Added iterative seeding with decreasing depth to Tadpole, via contigpasses and contigpassmult flags.
+Added Tadpole mdo (markdeltaonly) flag; default true.
+Tadpole can now do error marking (mbb) without error correction (ecc).
+Tadpole ownership is now automatic.
+Added driver.FilterLines and filterlines.sh for filtering text lines.
+Verified that an issue with transcriptome mapping is due to an incorrect transcriptome rather than a bug.
+Fixed bug in which BBMap subfilter passed sites with no cigar string. Noted by lankage (SeqAnswers).
+mapq of reads with primary site filtered out is now very low (under 4).
+BBDuk can now stop after X outm or outu bases. Requested by R. Westerman.
+35.15
+Fixed minor bug in Seal in which unmatched reads were not being incremented, causing unmatched read rate to be displayed as 0.
+Fixed a bug in parseKMG for decimal values.
+BBMerge now supports error-correction with Tadpole.
+BBMerge now supports iterative extension.
+BBMerge will now always output the original read sequence for reads that don't get merged, rather than the extended or error-corrected sequence.
+BBMerge minor output formatting bugs fixed.
+BBMerge now calls Tadpole rather than Tadpole_old.
+Wrote a shellscript for TaxTree.
+Wrote Postfilter and postfilter.sh, a wrapper for BBMap and FilterByCoverage to postfilter SPAdes assemblies.
+35.16
+Fixed a bug in FilterByCoverage that was filtering everything if cov0 was not specified.
+Found and fixed some small bugs in Tadpole, such as not adding the very last base of a contig.
+Fixed non-determinism in Tadpole by looking for hidden back branches and using extension return codes.
+Created ukmer package and Tadpole2, which supports unlimited kmer lengths. Tadpole2 will be automatically called by Tadpole if K>31.
+35.17
+Made Tadpole an abstract superclass of Tadpole1 and Tadpole2, with massive reduction in duplicate code.
+BBMerge now supports unlimited kmer lengths.
+Made AbstractKmerTableSet superclass of KmerTableSet and KmerTableSetU.
+KmerCountExact now supports unlimited kmer lengths.
+35.18
+Made Shaver an abstract superclass for Shaver1 and Shaver2.
+KmerCountExact now supports shave and rinse operations.
+Prefilter now works optimally with K>31, thanks to new hash routine.
+Fixed KmerCountExact not writing peaks without khist set.
+35.19
+Fixed a crash in read extension with K>31.
+35.20
+Re-added lines for unmerged read to BBMerge outinsert stream. Requested by Matt Nolan.
+Added some new header lines to KmerCountExact peaks output - ploidy, het rate, repeat content, etc.
+35.21
+Fixed Tadpole contig coverage estimation.
+Added Tadpole mincoverage flag.
+Fixed crash in ReadWrite when attempting to create filenames containing the pipe symbol.
+Fixed an invalid assertion in HashArrayHybrid resize().
+Added IntList.contains().
+Fixed a tricky bug with Seal qhdist not looking for matches with substitutions if it first found a match without substitutions. Noted by sdriscoll.
+Dedupe for some reason had interleaved name detection disabled. This is now enabled. Noted by Bede Constantinides.
+Fixed new BBMerge crash bug with outinsert. Noted by Matt Nolan.
+35.22
+Added truncateheadersymbol flag to filterbyname.
+Added some postfilter flags with defaults optimized to increase speed.
+Fixed Seal qhdist again; I had forgotten to sort a list. Noted by sdriscoll.
+Enabled pigz by default in BBNorm.
+Made some improvements to peak-calling.
+35.23
+Fixed a Tadpole2 assertion error when error-correcting with K>31 and variable-length reads.
+Added minconsecutivebases flag to Reformat/BBDuk.
+Added locking and lock testing to HashBuffer; unclear whether speed increased.
+35.24
+Added BBDuk maskfullycovered flag.
+Added SummarizeSealStats ignoresametaxa and ignoresamebarcode flags.
+Wrote ReduceSilva and reducesilva.sh for shrinking Silva database by removing entries with redundant taxonomy.
+35.25
+Added SummarizeSealStats ignoresamelocation flag.
+35.26
+Fixed ignoresamelocation pulling from incorrect field.
+Documented mlf flag. Requested by Alex Copeland.
+Added kmg support to minlength and maxlength. Requested by Bill A.
+Fixed a bug in BBMap when handling subfilter on multimapped reads. Noted by vout.
+Improved BBMap fixXY() function.
+Fixed major bug in BBMerge; outu read2 was reverse-complemented.
+Added a function to soft-clip reads with a long indel anchored by very few bases.
+Added ftm, ftl, ftr, ftr2 flags to BBMerge.
+Added qtrim2 flag to BBMerge (trim on overlap failure).
+Fixed implementation of shave and rinse to properly handle backward branches.
+Fixed shave mindepth at 1 instead of variable.
+35.27
+Fixed a crash bug in BBMap tip indel clipping during fast mode.
+35.28
+Checksites now verifies correct site ordering.
+ensureMatchStringOnPrimary now ensures correct ss ordering if it changes anything.
+ensureMatchStringsOnSiteScores now ensures correct ss ordering if it changes anything.
+These changes resolved an assertion crash bug.
+35.29
+Fixed an assertion bug in tip indel clipping. Noted by Bryce Foster.
+Fixed a couple places where clipped bases were not counted when calculating match position.
+Fixed another bug with fast match strings related to clipping tip indels.
+35.30
+Fixed another bug related to clipping tip indels and resorting. Noted by Bryce Foster and Matt Nolan.
+Clipped tip indels are now replaced with matches or mismatches.
+Fixed a missing else in SamLine.
+35.31
+Fixed a bug in toLocalAlignment() when a read has zero matches to reference.
+35.32
+Fixed an instance where alignments exceeding window size yielded ss with inconsistent lengths.
+Improved calculation of amount of padding needed when alignments exceed window.
+35.33
+Removed kmersamplerate and readsamplerate from bloom package to simplify code.
+Corrected handling of minprob in bloom package.
+Added Tadpole minprobprefilter and minprobmain flags.
+KmerCountExact now disables minProbMain when prefilter is enabled.
+Tadpole can now do multipass prefiltering.
+Tadpole can prefilter for an automatic number of passes.
+Tadpole now supports 1-bit final-pass prefiltering.
+Fixed a bug in fixLimitsXY() - only Y needs adjustment, not X.
+Fixed a bug in generateMatchString in which sorting was not redone when results changed.
+Added functionality to Bloom prefilter to allow arbitrary cutoffs, rather than just using the filter's max value.
+35.34
+Fixed a compile error due to Bloom filter changes.
+35.35
+Added MergeBigelow for combining custom Bigelow text files.
+Updated Shred to add equal flag, to shred reads into equal lengths rather than a fixed length.
+Tracked down a few bugs regarding ss score-setting order.
+Temporarily disabled CHECKSITES and SiteScore.setScore() assertions, which are mainly of interest for efficiency, not correctness.
+35.36
+Added mlf flag to BBDuk.
+Fixed qhdist crash with values over 1 (params were reversed).
+Stripped qfin/qfout support from rqcfilter since nobody will ever use it.
+Made files of the common kmers found in ribosomes (/global/projectb/sandbox/gaag/bbtools/ribo) using ReduceSilva, Dedupe, and KmerCountExact.
+Added ribo filtering to RQCFilter.
+35.37
+Fixed a ribo filtering flag for RQCFilter.
+Added RQCFilter ribodb, ribohdist, riboedist flags.
+Added RQCFilter extend flag (allows BBMerge read extension).
+Fixed path in file-list (directory was being prepended).
+35.38
+Improved Tadpole help info.
+Added Pileup coverage standard deviation calculation. Requested by Bill A.
+Fixed one last (?) assertion error in BBMap. Reported by Vasanth and Shijie.
+35.39
+Postfilter now unloads Data after mapping.
+Added trim flag to postfilter and filterbycoverage.
+35.40
+Fixed a null pointer in Read.validate().
+Fixed read extension in Tadpole when K<=31; the wrong method was called, causing a crash. Noted by Westerman.
+Added Tadpole contig trimming flag.
+Fixed colossal BBMerge bug - read 2 was being merged as a reverse complement. Not sure when that started...
+35.41
+Added spaceslash flag to RandomReads to allow space to be omitted from read names prior to slash pairnum. Requested by Rob Egan.
+35.42
+Slightly altered Tadpole1 to allow condensed assembly of kmer sets; added flag ibo (ignore bad owner).
+Made KmerCompressor and kcompress.sh. Generates a concise fasta representation of the set of kmers occurring at least N times.
+35.43
+Fixed crash in BBDuk when using MinKmerFraction (mkf) flag on single-ended reads.
+Added fuse flag to KmerCompressor.
+Fixed a crash in BBMapPacBio versions, caused by not percolating over a change in normal BBMap. Noted by Teshome.
+35.44
+KmerCountExact khist was overflowing if there were more than 2 billion kmers of a given depth. Converted counts to long array.
+Wrote AbstractRemoveThread for removing kmers with counts outside of a certain range.
+Added mincr and maxcr (min count to retain and max count to retain) flags to Tadpole.
+Fixed incorrect haploid_fold_coverage in peaks.txt. Noted by Kurt.
+Fixed KmerCountExact not writing peaks file if no khist was specified.
+Fixed Tadpole differentiation between in/out and ine/oute. Now only in and out are needed.
+Added Reads and Bases columns to Dedupe output. Requested by Esther S.
+35.45
+Added driver.CountSharedLines and countsharedlines.sh. Requested by Esther S.
+35.46
+Added smoothing control flags for KmerCountExact.
+Caught invalid values of K in BBMap.
+Added some additional header lines in peaks output.
+35.47
+Fixed BBMap incorrect NM tags for reads with soft-clipping. Noted by Rob Egan.
+35.48
+Disabled second parameter being automatically interpreted as an output file when = is not specified, in most cases. This is ambiguous as the second parameter might be a file for input read 2.
+Fixed a new bug in newly fixed NM tag gen ^^;. Also noted by Rob Egan.
+Identity calculations no longer penalize regions skipped as introns if the intronlen flag is set. Suggested by Rob Egan.
+35.49
+Clarified error messages for reads failing barcode filter.
+Added cutprimers flag to include flanking primer sequence.
+BBMerge trimq can now be an array for multiple attempts.
+Multithreaded memory allocation for bloom filters; moderate speed increase.
+Added mingc/maxgc to BBDuk and BBDuk2.
+Added BBDuk mcf (min covered fraction) flag.
+35.50
+Added KmerCompressor max flag.
+Clarified crossblock help regarding input file lists.
+Enclosed all iterations of Dedupe overlapLists with a null check.
+KmerCompressor is not deterministic when multithreaded (kmers may be used more than once); reduced buildthreads to 1.
+Added LogLog class for cardinality estimation, and loglog.sh.
+Enabled loglog flag for Reformat and BBDuk.
+35.51
+Added X bit to bamscript generated by BBMap.
+Loglog can now accept multiple files.
+Changed settings of removehuman in rqcfilter to be faster (requires 2 hits now).
+Fixed a null pointer exception in BBMerge with quality.
+Fixed a bug with BBMerge ecco flag being ignored.
+Upgraded Seal to allow requiring full containments of ref sequences. Requested by Ernst O.
+35.52
+Fixed issue with "ignorebadquality" flag being ignored in some cases. Noted by Alicia C.
+35.53
+Added mouse to RQCFilter.
+Switched RQCFilter and RemoveHuman to k=14 for a 4x speedup.
+Modified rqcfilter.sh to allow mouse, cat, dog, and human removal concurrently on 40GB nodes.
+Disabled test for too-high quality scores because it was annoying when dealing with PacBio reads.
+35.54
+Added TadpoleWrapper and tadwrapper.sh, which runs Tadpole multiple times with different kmer lengths and recommends the best length. Requested by Alex Copeland.
+Added normandcorrectwrapper.sh, which runs BBNorm then Tadpole. Requested by Stephan Trong.
+Added clear() operation to KmerTableSets.
+35.55
+Changed Character.isAlphabetic() calls to Character.isLetter().
+Modified CountBarcodes to add more flags (for dual barcodes).
+RQCFilter pigz and unpigz now default to true.
+Moved parsing of threads and recalpairnum from parseCommon to parseCommonStatic.
+Increased sensitivity of ribo removal (96.6% to 98.8%) by using a larger kmer set.
+Adjusted BBMap default per-thread memory usage estimate after a crash. Noted by Matt Nolan.
+35.57
+Fixed a change to removehuman.sh default memory allocation.
+35.58
+Fixed SynthMDA handling of minlen flag.
+35.59
+Added sam 1.4 -> 1.3 support to reformat, via sam=1.3 flag.
+Added RQCFilter filterqhdist flag. Requested by Adam Rivers.
+Slightly reduced default mininsert in BBDuk from 50 to 40.
+Added some additional comments to BBDuk.
+Added # support for BBMap output files. Requested by Adrian Pelin.
+Fixed rqcfilter.sh not grabbing enough memory on slot-scheduled Mendel nodes... hopefully.
+Added name flag to FuseSequence.
+35.60
+Data.clear() now also clears scaffold information in BBSplitter.
+Added removemicrobes flag to RQCFilter.
+Added removehuman2.sh for aggressive human contaminant removal versus an unmasked reference. Requested by Alicia Clum.
+Modified RQCFilter to support unmasked mouse, cat, dog, and human references. Requested by Alicia Clum.
+Allowed entropyfilter bbduk flag instead of just entropy.
+Unpigz is now used in certain cases where it was prevented before, like reading lists of names in filterbyname.
+35.61
+Corrected some names in RQCFilter file-list.txt.
+35.62
+Split writeReproduceHeader off from writeReproduceFile.
+Added BBTools version and RQCFilter command to RQCFilter reproduce.sh log.
+Added humanpath, catpath, dogpath, mousepath flags to RQCFilter and clarified them in the documentation.
+Improved documentation of bbduk2.sh.
+Fixed BBMergeOverlapper.c to match java version.
+35.63
+Fixed some BBMergeOverlapper.c syntax errors.
+35.64
+Fixed more BBMergeOverlapper.c syntax errors.
+35.65
+Fixed a BBMergeOverlapper.c runtime error in quality-free mode.
+Finally working again!
+35.66
+Changed testQuality() to assign ASCII-64 to the specific case of N bases with quality B.
+Added preliminary support for dsrc compression. However, the program does not appear to work correctly in Windows.
+Added header output (.header extension). Requested by Matt Nolan.
+Added coverage calculation ignoring deletions.
+AssemblyStats (stats.sh) now has fastq support; requested by several people.
+Added file type and extension documentation as readme_filetypes.txt.
+Removed obsolete changelogs for BBDuk and Reformat.
+35.67
+Added "_part" suffix before the part number to names of automatic-split fasta reads. This fixes a problem with underscore-number-named sequences in BBEst. Noted by Kurt L.
+Fixed a corner-case in filterbycoverage's handling of trimmed reads that drop below the length cutoff. Noted by Stephan Trong.
+35.68
+Wrote KmerComparator and KmerComparator2 for comparing reads by pivot kmers.
+Wrote KmerSort for sorting reads by pivots.
+Wrote KmerSplit for binning reads by pivots.
+Wrote Clump class for storing ordered overlapping reads.
+Wrote Clumpify to wrap KmerSplit and KmerSort.
+Wrote ClumpList to turn a list of clumped reads into a list of clumps.
+Added preliminary consensus operations to Clump and KmerSort.
+Added KmerReduce to produce the set of pivot kmers.
+Fixed an out-of-bounds error in CutPrimers. Noted by vmikk (SeqAnswers).
+Moved UnicodeToAscii (which did not work), TableLoaderLockFreeU, and TableReaderU to z_old, since they cause compilation problems. Noted by Martin M.
+Fixed prefilter onepass mode causing a crash.
+Added clump package.
+Added kmer-count restrictions to Clump pivot selection. Not clear whether it is useful.
+Added local maximum capability to KmerComparator.
+Fixed BBNorm to work with kmers>31, for normalization (not error-correction yet). Not fully tested, though. Noted by Kurt L.
+KmerTableSet read loading now does read validation per thread. This allows better multithreaded scaling.
+BBDuk and Seal also now do validation per thread.
+35.69
+Multithreaded kmer table dumping by KmerCountExact and Tadpole; over 4x speedup.
+35.70
+Tadpole now does validation per-thread when error-correcting. Slight speed increase.
+Fixed a bug in KmerCountExact in which prefilter did not work with K>31, due to using key() instead of xor(). Noted by Kurt L.
+35.71
+Added A_SampleMT, with full line-by-line comments.
+Improved A_Sample's comments.
+Added kmer histogram generation to rqcfilter (khist flag).
+Reorganized rqcfilter.sh documentation.
+Added a_sample_mt.sh.
+Fixed documentation in shuffle.sh.
+Running any program with -version, -help, etc now prints a useful message.
+35.72
+LogLog now retains last cardinality estimate in a static field.
+RQCFilter now chooses BBNorm or KmerCountExact for the khist depending on the estimated cardinality.
+Kmer histograms now have a header by default.
+35.73
+HashBufferU now only tries to acquire a lock every 16th time, like HashBuffer.
+Removed some checks for the literal string null.
+Fixed some else-if fallthroughs where else was missing.
+Addressed some compiler warnings in kmer, ukmer packages.
+Wrote FilterByTaxa for filtering of sequences labelled with their taxonomy (gi number or ncbi taxID).
+Wrote PrintTaxonomy.
+Wrote taxonomy.sh and filterbytaxa.sh.
+35.74
+Added peaks output to rqcfilter.
+Made TaxFilter class and revised FilterByTaxa to use it.
+Added FilterByTaxa support to RQCFilter for microbial decontamination.
+Fixed a bug in which empty files had their format misdetected.
+Fixed a couple array-out-of-bounds errors with unicode characters in genetic sequence. They are now converted to N.
+35.75
+Enabled worker thread read validation in A_SampleMT.
+Wrote SplitByTaxa and splitbytaxa.sh.
+35.76
+Changed documentation structure. There is now changelog.txt, readme.txt, UsageGuide.txt, and ToolDescriptions.txt.
+Fixed IntList resize overflow bug. Noted by jazz710 (SeqAnswers).
+Removed unicode2ascii.sh since it does not work.
+Wrote FungalRelease and fungalrelease.sh. Requested by Kurt and Jasmyn.
+35.77
+BBDuk can now call CalcTrueQuality to generate matrices if given a sam file.
+Added scaffold name remapping legend to FungalRelease. Requested by Kurt and Jasmyn.
+Wrote BBDukGuide.
+Wrote BBMergeGuide.
+Wrote TadpoleGuide.
+35.78
+Fixed infinite recursion when setting threadcount. Found by Matt Nolan.
+35.79
+Changed BBNorm defaults to target=100 min=5.
+Wrote Reformat guide.
+Wrote Seal guide.
+Wrote Taxonomy guide.
+Added Tools.startsWith(byte[], String)
+Revised GiToNcbi and TaxTree functions to allow gi_ as well as gi|, to avoid pipe symbol.
+35.79
+Standardized syntax of gitable and taxtree flags, and added "auto" option.
+35.80
+Added FilterBySequence and filterbysequence.sh.
+Wrote PreprocessingGuide.
+Wrote DedupeGuide.
+Wrote BBNormGuide.
+Removed bbmap20.sh, bbnorm20.sh, bbsplit20.sh, and khist20.sh since they can now have memory set explicitly.
+35.81
+Unified shellscripts between private and public release - module load commands now only run if NERSC_HOST==genepool, and "-l" removed from /bin/bash header.
+JNI mode is now enabled by default if NERSC_HOST==genepool.
+35.82
+Fixed a double-print of BBMap version number.
+Updated projectb pre-deploy version of BBTools compiled jni code and disabled auto-JNI-enable when NERSC_HOST==genepool.
+Wrote guides for A_Sample, BBMask, Stats, CalcUniqueness, Repair, SplitNextera, Clumpify, and AddAdapters.
+Wrote BBMapGuide.
+35.83
+Added "banns" to RandomReads.
+35.84
+Fixed an unnecessary assertion for negative values of pairedScore in Tools.removeLowQualitySitesUnpaired2. Noted by Jason S.
+Fixed a bug in SamLine.makeMdTag for handling deletions called off the end of a scaffold. Noted by Jason S.
+35.85
+Fixed a bug in which BBMap was verifying the presence of the wrong input file. Noted by Adrian P.
+
+
+
+TODO: Add minprob to LogLog.
+
+TODO: Heejung encountered a random null-pointer exception in ByteFile2.run() at "list[loc]=s;". However, I manually examined the code and this state appears to be impossible. Perhaps it is a JVM bug?
+TODO: Autoset bits and prefilter for khist based on cardinality.
+
+TODO: KmerSet prefilter=1 onepass does not work (prefilter=2 onepass does work).
+
+TODO: Validate BBNorm results with k>31.
+TODO: Add summary of how many reads got removed to BBDuk. (Hemant).
+TODO: Add Tadpole ability to screen reads with kmers only occurring at most N times, or having errors/Ns after correction. (Torben).
+
+TODO: Program that can demux a sequence file into multiple sequence files randomly.
+TODO: SynthMDA with a short reference outputs lots of reads with Ns (Alex Copeland).
+TODO: Parser.parse should go at the end, not beginning, of parse blocks for all programs.
+TODO: Tadpole should keep nodes with only outward branches.
+TODO: Print kmer coverage information after Tadpole assemblies (Alex Copeland).
+***TODO: Replace QuadHeap with a heap of longs. The current implementation is very slow on NUMA machines.
+**TODO: Compare Seal performance and correctness with countvector flag. One may be faster for large numbers of ref sequences.
+TODO: Seal mcf flag.
+TODO: Represent covariant depth as a vector with 1.0 for max depth for binning.
+*TODO: Allow kcompress direct set subtraction and intersection.
+*TODO: Add outu support to filterbyname
+*TODO: Speed up BBMap indexing.
+
+TODO: Print information about which reference sequences hit which locations in which reads, for Seal.
+TODO: Second extra base for BBDuk edit distance...?
+TODO: Thoroughly vet the assertions in CHECKSITES and SiteScore.setScore() to ensure they do not incur false positive error messages.
+TODO: BBMap shave and rinse are reducing contig length at level 2.
+TODO: bbcountunique should use a longer K and have an offset rather than just looking at the first K bases.
+TODO: Pincer could handle arbitrary problems - indels, error bursts, etc.
+TODO: Tail can handle bursts if it simply continues until X bases concur.
+TODO: Port pincer/tail over to BBNorm.
+TODO: Use entropy to determine how many bases to extend past errors.
+
+TODO: BBMap is not handling pairing when ambig=all. Pairing should be done at a SS level. (Elmar P).
+TODO: Tadpole multipass prefilter, and auto prefilter passes.
+TODO: BBMap MPI mode.
+TODO: Seal needs behavior with qhdist to be toggleable between searching or not searching for mutant kmers if a nonmutant kmer is found.
+TODO: BBDuk with hdist should reprocess the reference multiple times, first with hdist=0, then hdist=1, etc. That will improve specificity.
+*TODO: qtrim=r trimq=6 (or even 3) improves BBMerge rate for 2x250, 422 insert library.
+TODO: BBMerge - Track number of errors detected/corrected and error locations.
+TODO: Use small heap to reorder HashArray1D to optimize it.
+TODO: Dump kmers to text by way when max size is exceeded, then reload by way and re-dump low count kmers.
+
+TODO: Tadpole degenerate contig output.
+TODO: Locked/managed HashArray expansion.
+TODO: Fractional (1/4) way allocation per build thread.
+*TODO: extendToRight should return an exit code, not just true or false. May not need to be released.
+*TODO: Tadpole - first, classify all kmers as junction or non-junction (via ownership).
+***TODO: Always verify that left max yields prev kmer/evicted base. If not, that is a hidden branch.
+*TODO: Allocation schedule for HashArrays.
+*TODO: Optional synchronized resize on final schedule slot to minimize memory use.
+
+TODO: MS state for MSA, always, for M1 state.
+TODO: extin and extout flags for BBMap.
+TODO: FastaReadInputStream asserts false for headers with no sequence.
+
+TODO: Speed up shaving (exploration) where possible.
+
+TODO: Seal kmer rank promotion with 1D arrays.
+TODO: Partition program, round-robin with equal number of bp or equal number of sequences per output.
+TODO: msa.sh should accept a file instead of literal.
+TODO: BBMap bed format (Alex C).
+TODO: BBMap fix for crash in filterbyname on sam file - SamLine 1490, assert(start_<=stop_).
+TODO: Reformat lhist and readlength.sh should have equivalent information. "I prefer readlength.sh info"
+TODO: Tadpole/KCE double-lock and double-buffer with LongLists for loading.
+TODO: xmx=auto or percentage
+TODO: reformat: multithread?
+TODO: bbduk: move read validation to per-thread
+TODO: (write scaffolder)
+TODO: (write polishing/consensus tool)
+TODO: (write breaker)
+
+
+V34.
+34.00
+Fixed a bug in BandedAlignerConcrete related to width being allowed to be even.
+34.01
+IdentityMatrix is now much faster for high-identity sequences, and allows the 'width' flag to increase speed.
+Updated FilterReadsByName to allow "names=<read filename>", supporting fastq, fasta, and sam. So, one file will be filtered according to the names of reads in a second file. "names=<file>" where the file is just a list of names is still supported.
+34.02
+Fixed a couple errors in ConcurrentReadInputStreamD.
+Added fetching of a dummy list from "empty" for crisD, both master and slave.
+Added A_SampleD, which uses crisD. It now works correctly for master.
+Renamed various ConcurrentReadStreamInterface classes.
+Added an abstract superclass for all ConcurrentReadInputStreams, which extends Thread. Now, cris can be started directly without making a new thread.
+Changed all instances of wrapping cris in a thread to just use start directly. These are mostly commented with "//4567" to find if something was missed (like starting the cris twice).
+Increased cris stability by removing "returnList(ListNum, boolean)" and replacing it with "returnList(long, boolean)". Lists may no longer be recycled.
+34.03
+Added scaffoldstats to BBQC and RQCFilter fileList logs. Requested by Bryce F.
+Fixed a strange deadlock in Dedupe/ConcurrentCollectionReadInputStream caused by making CRIS a Thread subclass. This will still occur if CRIS goes back to being a Thread. Noted by Shoudan.
+34.04
+Removed hitCount tracking from Seal.
+"qtrim=<integer>" is now allowed for all classes using Parser.parseTrim().
+Parser.parseZip, parseInterleaved, parseQuality, parseTrim, parseFasta, and parseCommonStatic were integrated into most classes; reduced code size by almost 200kb.
+Parser.parseTrim got some extra functionality, like maxNs.
+Made an abstract superclass for KmerCount* classes, allowing removal of some code.
+Removed all KmerCount.countFasta() methods; they must now use a CRIS.
+Retired ErrorCorrectMT (superseded by KmerNormalize).
+Fixed bug in BBDuk, Seal, and ReformatReads - when quality trimming and force-trimming, count of trimmed reads could go over 100%. Now these counts are independent. Noted by ysnapus (SeqAnswers).
+Removed "minscaf" and "mincontig" flags from Parser.parseFasta() because they were conflated.
+Determined cause of Kurt's error message in Dedupe - lower-case letters can trigger a failure.
+Dedupe now defaults to "tuc=t" (all input is made upper-case).
+Moved CRIS factory from CGRIS to CRIS.
+Copied cc2-cc5 to /global/projectb/sandbox/gaag/TestData/SingleCell/SimMockCommunity/plate*/. These are simulated cross-contaminated single cell plates.
+Removed conflated "qual" flag from RandomReads; "q" should be used instead to set all read quality values to a single number.
+Fixed conflated "renamebymapping" flag in RenameReads.
+"tbr" flag is conflated in KmerNormalize; adjusted so that it now controls both "tossBadReads" (reads with errors) and "tossBrokenReads" (reads with the wrong number of quality scores).
+Conflated "gzip" flag in ChromArrayMaker/FastaToChromArrays changed to "gz".
+Handled conflated "ziplevel" flag in AbstractMapper.
+Conflated "fakequality" flag resolved by moving from BBMap to Parser and renaming "fakefastaquality"/"ffq".
+Added hdist2 and edist2 to BBDuk. These allow independently specifying hdist/edist for full-length kmers and short kmers when using mink.
+Added trimhdist2 to RQCFilter/BBQC.
+*** Added path and mapref flags to RQCFilter/BBQC; they can now map to an arbitrary genome instead of just human.
+Added Shared.USE_MPI field (parsed by Parser.parseCommonStatic; "mpi" or "usempi").
+Added Shared.MPI_RANK field (should be set automatically).
+Added Shared.MPI_KEEP_ALL field. This controls whether CRISD objects retain all reads, or just some of them.
+CRIS now automatically returns a CRISD when USE_MPI is enabled, as a slave or master depending on whether rank==0.
+ListNum is now Serializable.
+CRISD now transmits ListNum objects rather than ArrayLists, so that the number is preserved.
+Added Maxns flag to reformat.
+Fixed BBQC and RQCFilter's unnecessary addition of "usejni" to BBMap phase, since it is now already parsed by parseCommonStatic.
+BBQC now defaults to normalization and ecc off, but can be enabled with the "ecc" and "norm" flags, and supports cecc flag.
+Added notes on compiling JNI version suggested by sdriscoll.
+34.05
+Commented out a reference to ErrorCorrectMT in MateReadsMT.
+34.06
+FindPrimers (msa.sh) now accepts multiple queries (primers) and will use the best-matching of them.
+Added a BBMap flag to disable score penalty due to ambiguous alignments ("penalizeambiguous" or "pambig"). Requested by Matthias.
+Fixed failure to start CRIS in A_SampleD.
+Fixed some incorrect division in CRISD.
+Added MPI_NUM_RANKS to Shared. This is parsed by parser via e.g. "mpi=4".
+Added BBMap flags subfilter, insfilter, delfilter, inslenfilter, dellenfilter, indelfilter, editfilter. These function similarly to idfilter. Requested by sdriscoll.
+34.07
+Dedupe now automatically calls Dedupe2 if more than 2 affixes are requested.
+Added "subset" (sst) and "subsetcount" (sstc) flags to Dedupe.
+Added "printLengthInEdges" (ple) flag to Dedupe.
+34.08
+Finished Dedupe subset processing for graph file generation.
+34.09
+Fixed bug where 'k' was not added to filename in RQCFilter. Noted by Vasanth.
+34.10
+Documented "ordered" and "trd" flags for BBDuk/Seal.
+Added crismpi flag to allow switching between crisd and crismpi.
+Added shared.mpi package, containing MPIWrapper and ConcurrentReadInputStreamMPI.
+34.11
+Added detection of read length, interleaving, and quality coding to FileFormat objects, but these fields are not currently read.
+FileFormat.main() now outputs read length, if in fastq format.
+Reformat will now allow sam -> sam conversion; not useful in practice, but maybe useful in testing.
+Added flag "mpikeepall", default true.
+Fixed deadlock when mpikeepall=false. Noted by Jon Rood.
+34.12
+Added 'auto' option to gcbins and idbins flags. Requested by Seung-jin.
+Added dedupe "addpairnum" flag to control whether ".1" is appended to numbered graph nodes.
+Added real quality to qhist plot, when mhist is being generated.
+Moved maxns and maq to AFTER quality trimming in RQCFilter and BBDuk.
+Added "ftm" (forcetrimmodulo) flag to BBDuk/Reformat/RQCFilter/BBQC. Default 5 for RQCFilter/BBQC, 0 otherwise.
+34.13
+Fixed a missing "else" in RQCFilter/BBQC. Noted by Kurt LaButti.
+34.14
+Added .size() to ListNum.
+CrisD gained "unicast" method. Also, unicast and listen now have mandatory toRank parameter.
+Made CrisD MPI methods protected rather than private, so they can be overridden.
+Refactored RTextOutputStream3.
+34.15
+Added Shared.LOW_MEMORY:
+Disables multithreaded index gen.
+Disables multithreaded ReadWrite writeObjectInThread method.
+Disables ByteFile2.
+For some reason it does not really seem to reduce memory consumption...
+Added BBMap qfin1 and qfin2 flags.
+Updated BBMap to use more modern input stream initialization.
+Added mapnt.sh for mapping to nt on a 120g node.
+34.16
+Changed RQCFilter "t" to mean "trimmed"; "k" was removed.
+Added parser noheadersequences (nhs) flag for sam files with millions of ref sequences.
+Documented "ambig" flag in Seal.
+Fixed issue where Shared.READ_BUFFER_NUM_BUFFERS was not getting changed when THREADS was changed. Now both are private and get set together.
+Verified that mapnt.sh works on 120G nodes.
+34.17
+RTextOutputStream3 renamed to ConcurrentReadOutputStream.
+ReadStreamByteWriter refactored to be cleaner.
+Merged MPI dev branch into master.
+34.18
+Moved Seal's maxns/maq to after trimming.
+Added chastity filter to bbduk and reformat (reads containing " 1:Y:" or " 2:Y:"). Requested by Lynn A.
+Dedupe outd stream now produces correctly interleaved reads. Requested by Lynn A.
+Replaced Dedupe TextStreamWriters with ByteStreamWriters, for read output.
+34.19
+Added parseCommon() to BBDuk, allowing samplerate flag.
+34.20
+FASTA_WRAP moved to Shared.
+Numeric qual output is now wrapped after the same number of bases as fasta output.
+"Low quality discards:" line is now triggered by chastity filter.
+SPLIT_READS and MIN_READ_LEN are now disabled when processing reference in BBDuk/Seal.
+Seal gained parseCommon and parseQuality.
+34.21
+Fixed MIN_READ_LEN bug (set to 0; should have been set to 1)
+34.22
+Added qfin (qual file) flags to BBDuk/Seal.
+Applied BBDuk restrictleft and restrictright to filtering and masking; before, it was only valid for trimming.
+Added calcCigarBases.
+Required includeHardClip parameter for all calls to calcCigarLength(), start(), or stop().
+Fixed bug in pileup caused by hard-clipped reads. Noted by Casey B.
+34.23
+DecontaminateByNormalization was excluding contigs with length under 50bp, which caused an assertion error.
+Fixed a crash in BBDuk2 when not using a reference. Noted by Dave O.
+Added entropy filter to BBDuk/BBDuk2. Set "entropy=X" where X is 0 to 1 to filter reads with less entropy than X.
+34.24
+Added maxreads flag to readlength.sh.
+Fixed bug in BBMap - when directly outputting coverage, secondary alignments were never being used.
+BBMap now uses the "ambig" and "secondary" flags to determine whether to include secondary site coverage. Specifically, "ambig=all" will use secondary sites, while other modes will not unless "secondary=t". In other words, use of secondary sites in coverage will be exactly the same as use of them in a sam output file. Removed "uscov=t Include secondary alignments when calculating coverage." from shellscript.
+Fixed minid trumping minratio when both were specified. Now, the last one specified will be used.
+Added pileup support for reads with asterisks instead of bases, as long as they have a cigar string. Also sped up calculation of read stop position.
+Cigar string 'M' symbols are now converted to match string 'N' symbols if there is no reference.
+34.25
+BBMerge initialization order bug fixed; it was preventing jni from being used with the "loose" or "vloose" flags. Noted by sarvidsson (SeqAnswers).
+34.26
+Fixed semiperfect mode allowing non-semiperfect rescued alignments. Noted by Dave O.
+Fixed ReadStats columns header for qhist when mhist was also generated.
+Fixed an inequality in BBMergeOverlapper that favored shorter overlaps with an equal number of mismatches, in some cases. Had no impact on a normal 1M read benchmark except when margin=0, where it tripled the false-positive rate.
+34.27
+Enabled verbose mode in BBMergeOverlapper.
+34.28
+Added "align2." to sam header command line of BBMap.
+Fixed bug in BBMap that could cause "=" to be printed for "rnext" even when pairs were on different scaffolds. Noted by rkanwar (SeqAnswers).
+34.30
+Reformat can now produce indelhist from sam files prior to v1.4.
+Fixed a crash bug in BBMap caused by an improper assertion. Noted by Rob Egan.
+34.31
+BBDuk/Seal now recognize "scafstats" flag as equivalent to "stats".
+Seal now defaults to 5 stats columns (includes #bp).
+Wrote BBTool_ST, an abstract superclass for singlethreaded BBTools.
+Clarified documentation of "trimq=X" as meaning "regions with average quality under X will be trimmed".
+Fixed major bug in RQCFilter/BBQC: "forcetrimmod" was being set to same value as "ktrim". Noted by Brian Foster.
+34.32
+Changed the way BBMerge handles qualities to make it 40% faster (in java mode). Reduced size of jni matrix accordingly.
+Fixed lack of readgroup tags for unmapped reads in sam format. Noted by Rahul (SeqAnswers).
+Ensured Read.CHANGE_QUALITY affects both lower (<0) and upper (>41) values.
+34.33
+Pushed BBMergeOverlapper.c to commit.
+34.34
+Documented trimfragadapter and removehuman in RQCFilter.
+Added Parser flag for Shared.READ_BUFFER_LENGTH (readbufferlength).
+Added Parser flag for Shared.READ_BUFFER_MAX_DATA (readbufferdata).
+Added Parser flag for Shared.READ_BUFFER_NUM_BUFFERS (readbuffers).
+RQCFilter now accepts multiple references for decontamination by mapping.
+Added FuseSequence (the first BBTool_ST subclass) and fuse.sh, for gluing contigs together with Ns.
+Reformatted many scripts' help info to remove echo statements.
+Fixed bugs in stats and countgc; they were not including undefined bases when printing the length in gcformat=1 and gcformat=4.
+Replaced all instances of .bases.length with .length(), to prevent null pointer exceptions (for example in sam lines with no bases).
+Added cat and dog flags to rqcfilter.
+Changed defaults of BBMask to reduce amount masked in cat and dog to ~1% of genome. This still masks all of the coincidental low-complexity hits from fungi.
+Determined that dog is contaminated with fungus, particularly chr7 and chr13.
+34.35
+Fixed a bug in which data was retained from the prior index when indexing a second fasta file in nodisk mode.
+34.36
+Disabled an assertion in BBMerge that the input is paired; it crashes if the input file is empty.
+34.37
+NSLOTS is now ignored if at least 16, to account for new 20-core nodes.
+ReadWrite.getOutputStream now creates the directory structure if it does not already exist. Problem discovered by Brian Foster.
+BBQC and RQCFilter now strip directory names before writing temp files.
+BBDuk now correctly reports number of reads quality-filtered.
+Added "unmappedonly" flag to Reformat.
+RQCFilter now defaults to using TMPDIR.
+34.38
+BBMap now prints reads/second correctly. Before, it actually displayed pairs/second with paired data.
+Added maxq flag to BBMerge, which allows quality values over 41 where reads overlap. Requested by Eric J.
+Changed CoveragePileup from TextFiles to ByteFiles; increased read speed by 3.65x.
+Changed CoveragePileup from TextStreamWriters to ByteStreamWriters; increased write speed by 1.46x.
+Fixed a bug in BBQC/RQCFilter: paired input and interleaved output was getting its paired status lost. Noted by Simon P.
+Reformat, when in "mappedonly" or "unmappedonly" mode, now excludes reads with no bases or secondary alignments.
+34.39
+Human contaminant removal is now optional in BBQC.
+34.40
+ConcurrentReadOutputStream made abstract superclass.
+Added ConcurrentGenericReadInputStream, the default implementation.
+Added ConcurrentReadOutputStreamD, distributed template.
+Merged some duplicate methods in MPIWrapper/ConcurrentReadInputStreamMPI.
+34.41
+Added some features to CoveragePileup, FilterByCoverage, and DecontaminateByNormalization to quantify low-coverage regions on otherwise high-coverage contigs.
+Added parser fastadump flag to toggle dumping of kmers as fasta vs 2-columns.
+Fixed a couple bugs in RQCFilter which mixed up names of stats files for trimming and filtering.
+RQCFilter will now map to cat, dog, and human together with BBSplit if all three are specified, and produce "refstats.txt".
+BBDuk/Seal now support ambiguous IUPAC codes in reference sequences.
+34.42
+ByteFile now returns empty lines as byte[0] instead of null. This allows processing of fastq files with 0-length reads. Noted by lankage (SeqAnswers).
+Fixed a bug in FastaToChromArrays2 - blank lines in fasta files were interpreted as breaks between sequences. Noted by Alex Spunde.
+Fixed "unmappedonly" flag in reformat.sh - it was providing inverted output. Noted by Kristin T.
+34.43
+Improved MD tag generation. Reference Ns were not being counted, and unnecessary zeros were appearing between adjacent substitutions. Noted by Jason S.
+expectedErrors() and averageQuality() both now require a boolean parameter, includeUndefined.
+Fixed a bug in BBQC's output directory - primary output was going to scratch. Noted by Simon P.
+Added path to BBSplit's help menu. Noted by Ed K.
+34.44
+TranslateSixFrames now can accept AA input and produce NT output.
+Merged dev branch into master.
+Enabled CrosMPI to be created when CRIS_MPI is set to true.
+BBDuk and Seal now use MPI streams correctly for reading the reference (when MPI is enabled).
+Added truseq_rna.fa.gz to resources.
+34.45
+Added BBDuk skipr1/skipr2 flags. Requested by Stephanie H.
+Fixed a null pointer in ConcurrentReadOutputStreamD.
+34.46
+Added SamLine.parseFlagOnly(byte[]) for rapid classification of sam lines.
+Revised SplitSamFile and added splitsam.sh to the public distribution. It's now fast (~540MB/s).
+Added a table of contents to /resources/.
+34.47
+Fixed a parenthesization bug in FindTipDeletions; it was sometimes running when it should have been disabled.
+Added swap flag to Reformat, for substituting one base for another (as in bisulfite treatment).
+Added underscore flag to Reformat.
+Fixed threads flag in BBDuk; it was getting parsed in 2 places and never set.
+Added qhdist/qhdist2 flags to BBDuk/Seal for mutating query kmers. Suggested by sdriscoll (SeqAnswers).
+Corrected mkh flag in Seal. Noted by Vasanth S.
+FastaReadInputStream now has a mandatory amino field in constructor.
+34.48
+Bitsets and coverage arrays can now both be disabled in pileup.
+Reorganized buffer lengths in BBIndexPacBio to reduce memory usage and support long (6000bp) reads with shorter kmers, down to 9bp.
+Added small rna adapter path to BBQC/RQCFilter.
+Fixed FilterReadsByName processing of sam files; bug found by Marissa Miller.
+Accelerated and reduced memory usage of FilterReadsByName; moved name parsing over to Tools.
+Added ReadStreamWriter.USE_ATTACHED_SAMLINE.
+34.49
+Fixed qin/qout flags in many classes; they were being ignored. Noted by Jason H.
+Added Nextera LMP adapter sequences.
+34.50
+Added AssemblyStats format=7. Requested by Andrew Tritt.
+34.51
+Added physical (aka fragment) coverage flag to Pileup and BBMap.
+Added rpkm/fpkm output to Pileup and BBMap. Requested by Vasanth.
+Changed Seal FPKM calculation; it was dividing by number of mapped reads rather than number of mapped fragments.
+34.52
+Added SmallKmerFrequency and commonkmers.sh. Requested by Bill A.
+Fixed a bug in ReadStreamByteWriter; "attachment" mode was printing a period instead of newline.
+34.53
+Added graphical display of GC level to gchist. The gcplot flag works with all programs that use gchist. Requested by Kecia D.
+Added reparse to BBTool_ST. This allows parsing of subclass fields which are otherwise overwritten by their defaults.
+Added count output to SmallKmerFrequency.
+34.54
+Added cumulative column to gchist. This is also enabled by the gcplot flag. Requested by Seung-Jin.
+Added BBMap normcov and normcovo flags. Requested by Vasanth.
+Added support for out= to stats and statswrapper. Requested by Brian Foster.
+Fixed a bug in which stdout was being closed by closing a PrintWriter that wrapped it.
+Disabled a message about read pairing for sam input.
+Finished DedupeByMapping and created dedupebymapping.sh.
+34.55
+Fixed a bug in BBMap's coverage flags; normcovo was called normcovOverall. Noted by Matt Nolan.
+34.56
+Fixed qin flag being ignored by BBMap. Noted by Adrian P.
+Removed obsolete classes ReformatFasta and ReverseComplement (both handled by ReformatReads now).
+34.57
+Added BBMap timetag flag and thist output.
+Fixed bug in AssemblyStats GC output. Noted by Jasmyn P.
+Added format=0 to stats (no output).
+34.58
+Version bump.
+34.59
+BBSplit now supports # operator in filenames. Requested by Vicente G.
+BBMap now prevents cross-scaffold alignments if any output file is sam or bam, not just the primary one.
+Reformat now has a primaryonly flag to prevent output of secondary alignments.
+Added KillSwitch class. This will kill the process after X seconds with under Y CPU utilization. It is invoked by the command line argument "monitor" for most programs.
+34.60
+BBDuk can now remove reads with less than X% of any single base. Requested by Alicia C.
+Added reformat 'filterbits' and 'requiredbits' flags.
+Removed obsolete colorspace-specific fields Read.expectedErrors and Read.mapLength.
+Wrote SplitNexteraLMP.java and splitnextera.sh.
+Added Read.subRead(from, to).
+Added BBMap call to ReadStats.checkFiles() to force a crash before running, rather than after running, if there are problems.
+Changed nextera_LMP_linker.fa.gz to a double linker after examining real data.
+Modified BBTool_ST for greater flexibility with additional IO streams.
+Wrote MultiStateAligner9XFlat.java. For testing. A flatter, faster MSA.
+34.61
+Completely removed all support for Solid colorspace.
+Removed ChromosomeArrayCompressed.
+Removed MultiStateAligner9fs.
+Removed FastaStream, QualStream, FastqReadInputStream_old.
+Renamed FastaQualReadInputStream3 to FastaQualReadInputStream.
+Added kmer.TableLoaderLockfree. This unifies the load portion of BBDuk and Seal for filling AbstractKmerTables.
+Added kmer.TableReader. This makes it easy to read data from kmer tables.
+Added error message for KmerCountExact if k<1 or k>31.
+Added warning to BBDuk if no kmers are loaded but a kmer operation is specified.
+Added kmask flag to BBDuk.sh help and clarified ktrim flag.
+Timer now automatically self-starts upon creation.
+Added NexteraLMP support to rqcfilter.
+Added versions of Illumina contaminant files without Nextera adapter junctions.
+34.62
+Fixed null pointer in BBDukF. Found by Alicia C.
+34.63
+Created tax package for processing NCBI taxonomy data.
+Added IntList.getUniqueCount()
+Added TaxNode and TaxTree, for accumulating taxa counts.
+Added GiToNcbi, for translating gi numbers to taxa ids.
+Added RenameGiToNcbi, for renaming sequences (e.g. nt) with their taxa id.
+Added SortByTaxa, for sorting sequences based on taxonomy for better compression.
+TaxTree and GiToNcbi now support serialized input; much smaller.
+Integrated taxonomy support into Seal.
+Seal now uses 9 ways, and uses pigz when loading the reference.
+Added ftr2 (forcetrimright2) flag, which allows trimming a fixed number of bases on the rightmost end.
+Added BBMap parsing logic to prevent bad values of maxsites and maxsites2.
+Fixed BBTools failure to find primes.txt.gz if there is a space in the classpath (Matt Kearse).
+All calls to average quality now require a max number of bases to process.
+Added maqb flag (min average quality bases); maq calculation will be restricted to that many leading bases. (Shoudan)
+Retired FilterReads (superseded by Reformat and BBNorm).
+Added reformat "tossjunk", "fixjunk", and "aminoin" flags.
+34.64
+Fixed off-by-one error in forcetrimright2.
+34.65
+Made gi2taxid.sh, which calls RenameGiToNcbi.
+RenameGiToNcbi updated to split input into valid and invalid output, where invalid gets anything with no taxid.
+Added FileFormat.isFasta(String) method.
+Improved SortByTaxa. Now does preorder traversal of tree, and supports dummy nodes, fusing, and promotion.
+Added sortbytaxa.sh.
+BBSplit ref= can now point to directories. Requested by Manuel K.
+34.66
+Fixed an uncaught overflow in ByteBuilder.expand().
+Added SortByTaxa max fusion length.
+Added barcode filtering to Reformat and BBDuk.
+Sped up chastity filtering and allowed it to process reads with / before read number.
+Added chastity filtering and barcode filtering to RQCFilter.
+34.67
+Fixed BBDuk double-counting chastity/barcode-filtered reads.
+Fixed BBDuk overcounting of reads that were trimmed by overlap.
+BBMap nzo flag now affects refstats and scafstats in addition to covstats (Vasanth).
+Added BBMap sortstats flag.
+Added BBMap rebuild flag (Vasanth).
+34.68
+Added qtrim=window flag (Alicia).
+Added slashspace flag to Reformat (disable space when adding /1 and /2 to read names).
+Added clearzone to Seal (Vasanth).
+Added stoptag to Reformat.
+Added boundstag to Parser (Shoudan).
+34.69
+Slight fix for samline inbounds detection.
+Fixed a corrupt Truseq RNA adapter sequence.
+Added Truseq RNA adapters to adapters.fa.
+34.70
+Added BBMerge useratio mode (enabled by default in vloose mode).
+Added BBMerge adapter processing.
+Added BBDuk kmask=lowercase.
+34.71
+Added BBMerge uloose and vstrict modes.
+Added BBMerge requireratiomatch, ratiominoverlapreduction, and ratiooffset flags.
+34.72
+Adjusted BBMerge uloose settings.
+34.73
+Added RandomReads path flag.
+Increased BBMerge defaults to -Xmx1000m and readbufferlen=400 to improve scaling.
+baseToComplementExtended array now maps lowercase letters to lowercase letters.
+Changed BBMerge to use floating-point probabilities.
+Fixed missing quote mark in bbsplit.sh. Noted by Manuel K.
+34.74
+Added Reformat quantize flag (Alicia Clum).
+34.75
+RandomReads ignored q=x in perfect mode, and perfect flag did not work without a number.
+Added Reformat skipreads flag.
+Added DualCris, for dual input files of unequal length.
+Repair.sh now works if r1 and r2 files are unequal length.
+Added flags for BBMerge normalmode and ratiomode.
+Made BBMerge default to ratiomode-only for default and loose stringencies.
+BBMerge efilter now enabled by default in all modes.
+Improved BBMerge efilter (now occurs after both merge modes).
+Updated BBMerge efilter to examine only the trailing X bases depending on mininsert setting.
+Made normalmode and ratiomode more independent, so requireratiomatch flag is more efficient.
+Added ratiomode settings for strict, vstrict and ustrict. Ustrict has requireratiomatch enabled.
+Increased readlength.sh default memory and reduced number of reads in buffer.
+Added BBMerge ordered flag; disabled by default.
+34.76
+Accelerated BBMerge by using a buffer for quality values translated to probabilities.
+Added early exits before mininsert0 and minoverlap, and added mininsert0 flag.
+Added ecc mode, for correction only and no merging.
+Tested removal of runtime division; speed unchanged.
+34.77
+Added BBMerge pfilter flag; discards overlaps with low probability mismatches.
+Fixed BBMergeOverlapper.expectedMismatches() and probability(). Both were considering the wrong bases.
+Increased mateByOverlapRatioJava speed with altBadlimt. Slightly increases false-positive rate.
+Added reformat itn flag (convert iupac symbols to N).
+Simplified mateByOverlap and mateByOverlapRatio - removed no-quality loop.
+BBMerge uloose settings tweaked; no longer uses normalmode.
+34.78
+Fixed BBDuk compatibility with new BBMergeOverlapper float array requirement.
+34.79
+Integrated new BBMergeOverlapper ratiomode call into BBDuk. Uses settings similar to strict mode.
+Adjusted BBMerge defaults some more.
+34.80
+Added outu flag to demuxbyname.
+Added overlapWithoutQuality (owq) and overlapUsingQuality (ouq) flags to BBMerge; default is ouq=f owq=t.
+Made a quality-free loop in BBMergeOverlapper now that quality is disabled by default; 20% faster.
+Removed minoi flag and some legacy fields from Overlapper that dealt with mapping information.
+34.81
+Split read verification into a different function outside of constructor.
+Added a mode for worker threads to verify reads, instead of at construction time.
+Added input file check to BBMerge.
+Greatly increased speed of overlapper with findBestRatio function.
+34.82
+Minor changes to BBMerge entropy calculations.
+Added BBMerge static function errorCorrect().
+Changed fast from normalmode to ratiomode.
+Added ecc flag to BBDuk and Seal.
+34.83
+Fixed DualCris and SplitPairsAndSingles (repair.sh). Crash bug found by GenoMax.
+34.84
+Added overlap flag to BBNorm to regulate whether overlapping is used for error correction.
+BBMerge now has static functions mergeableFraction() and makeInsertHistogram().
+Added ecc flag to kmercountexact.
+Added normbins flag to BBMap and fixed normcovo flag.
+34.85
+Finished ArrayListSet.
+Created MultiCros and a functional reference implementation in main().
+Upgraded Seal handling of refnames mode; now all sequences for a ref file get the same ID (in that mode).
+Added BBSplit-style output for Seal refstats. Requested by Vasanth.
+Removed outsingle from Seal.
+Added multiple output streams to Seal with the pattern flag; acts like basename in BBSplit.
+DemuxByName now supports arbitrary output streams without needing a name list.
+Moved directory parsing from BBSplit to Tools.getFileOrFiles().
+34.86
+Fixed text string for output stats of FilterByName. Noted by Alex C.
+Tiny change to BBMergeOverlapper regarding Ns.
+Added mkf (minkmerfraction) flag to Seal. Requested by Vasanth S.
+34.87
+Added /ref/qual/ directory for recalibration matrices.
+Added recalibrate() function to CalcTrueQuality.
+recalibrate (recal) is now a parser flag and has been enabled for BBMerge, Reformat, and BBDuk.
+Added fixheader flag to parser. Requested by Shijie.
+Changed parseInt to parseLong for matrix loading.
+Added qb123 matrix.
+Added estimate by max observed error rate rather than average.
+34.89
+Added unicode2ascii.sh for fixing files with strange symbols.
+34.90
+Fixed FastaReadInputStream's ability to process extended ascii characters (128-255).
+Fixed BBDuk, Reformat, and Seal sometimes ignoring the ftr2 flag.
+Added adapter sequence detection to BBMerge (outa flag).
+Made CalcTrueQuality multithreaded.
+Fixed ROOT_QUALITY not getting set with the path flag.
+Added path flag to BBDuk and BBMerge.
+Added notags flag to BBMap.
+34.91
+CalcTrueQuality now tracks paired reads independently.
+BBDuk and various programs no longer deadlock waiting for sam header to be read.
+Added BBMerge iupacton (itn) flag.
+Seal can now write sam output from sam input.
+BBDuk can write sam output from sam input, but only for quality recalibration, not other operations.
+Made RemapQuality to test recalibration.
+Internal 2-pass recalibration is now working.
+Changed $CMD to eval $CMD in all shellscripts, which allows escaping spaces in filenames using backslash. Thanks Jon!
+Added qchist (qualityCountHistogram). Gives counts of bases with each quality score.
+Fixed BBDuk not testing to see if qahist was set.
+Changed CalcTrueQuality default observationcutoff to 100, because higher settings cause odd-looking graphs.
+34.92
+Added RenameReads prefixonly flag.
+Added driver.SummarizeCoverage to summarize cross-contamination scafstats files.
+Changed calcmem.sh to work correctly even if ulimit fails. Noted by Tomas B. Thanks to vladr (stackoverflow)!
+Changed bbduk.sh/bbduk2.sh default ram to 1400m from 2000m so that they should work on 32-bit MacOS systems without setting the -Xmx flag.
+Removed some redundancy from TextFile.
+DecontaminateByNormalization now named CrossBlock.
+Added Read.uToT and parser utot flag for converting uracil to thymine in reads.
+BBMap now converts U to T when generating the reference, and all degenerate bases to N.
+Changed SummarizeCoverage to be memory-efficient with large files (e.g. coverage vs nt).
+Removed colorspace from ChromosomeArray.
+Fixed a bug in which cigar strings were sometimes not printed for secondary alignments or when using filters. Noted by Jason S.
+Added qb12 matrix to CalcTrueQuality.
+Added support for adjusting read quality score limits beyond 2~41 with the mincalledquality and maxcalledquality flags.
+Recalibration matrices may be extended above Q41 with the recalqmax flag, for processing consensus reads.
+34.93
+Fixed version flag in parser; it was being ignored if there were arguments.
+Fixed calcmem.sh behavior on a Mac (or other system without /proc/mem) when ulimit=unlimited.
+Fixed issue where BBTool_ST subclasses were having params overridden by defaults.
+ReadStats now correctly tracks read1 and read2 for qhist using sam input, instead of lumping them together.
+Fixed CalcTrueQuality's recalibration of sam input; it was applying the read1 profile to both reads. Finally works perfectly.
+Added BBSplit force-rebuild logic.
+Coverage histogram now goes to 1 million in 32bit mode, instead of 64k.
+Added pjet to rqcfilter/bbqc.
+Fixed a division-by-zero bug in ReadStats. Noted by Seung-Jin.
+Removed confusing readme line stating that BBMap is free for noncommercial use. This is true, but it is also free for commercial use.
+Added stdev to covstats output (as long as arrays are enabled).
+Added driver.SummarizeSealStats and summarizeseal.sh for analyzing cross-contamination results using Seal stats output.
+Added summarizescafstats.sh script for driver.SummarizeCoverage.
+Added jgi.FilterReadsWithSubs; it selects only those reads with substitution errors for bases in a specified quality range.
+Added phix_adapters.fa.gz to /resources and updated contents.
+Added qahist average deviation header line.
+34.94
+BBMap now produces an error message if indexing fastq files rather than just crashing.
+Added config file support to all BBTools via the config= flag.
+Added some sample config files and a config file readme.
+Moved bloom filter and count-min sketch data structures to bloom package.
+34.95
+Added trd flag to reformat.sh help. Noted missing by Esther Singer.
+Added merge support to SplitNexteraLMP. Currently unknown which is better, merge=t or f.
+Added better output names to RQCFilter. Requested by Bryce F.
+Added kmer ownership to AbstractKmerTable.
+KmerLink is now an AbstractKmerTable subclass.
+Wrote Tadpole.
+Fixed bug when reading empty fasta files. Noted by Matt K.
+Added SamLine.countTrailingClip, and modified countLeadingClip. Now both have soft/hard clipping toggles.
+Removed static SamLine.SUBTRACT_LEADING_SOFT_CLIP and replaced with required parameter.
+Reduced default initial CoverageArray size from 16m to 500.
+Added mincov, maxcov, and delcov flags to BBMask.
+Updated BBMask readme.
+Split IntList/LongList toString method into SetView and ListView.
+Added Pileup toggle for including soft-clipped bases; default false.
+Made Tadpole shell script.
+Fixed Seal stats header indicating over 100% matched sequences with ambig=all. Noted by Esther Singer.
+AbstractKmerTable classes now correctly return -1 instead of 0 if a key is not present.
+Added AbstractKmerTable.clearOwner() to clean up ownership trails of abandoned contigs.
+Fixed various bugs in Tadpole.
+Added substring and case flags to FilterByName. Requested by Esther Singer.
+Tadpole now works correctly multithreaded.
+34.96
+Fixed a bug in FastaReadInputStream not shutting down subprocesses when done.
+Added LMP insert-size detection mode to Tadpole.
+Added ByteBuilder.appendKmer(kmer, k).
+Added Tadpole read-extension mode.
+Tadpole now builds contigs from kmer seeds rather than contig seeds. Slower but more consistent.
+Tadpole is now a complete assembler.
+34.97
+Added length, coverage, and GC to Tadpole contig names.
+Added directional substrings for filterbyname. Requested by Esther Singer.
+Disabled module lines in reformat.sh. Noted by Xiaoli D.
+Renamed summarizecoverage to summarizescafstats.
+Added ambig and kfilter flags to CrossBlock. Requested by Ken H.
+34.98
+BBDuk/BBDuk2 default maxrskip set to 1 (disabled), to reduce confusion.
+Fixed Seal generating ArrayListSets even when pattern output was not specified.
+Fixed Seal bug classifying both read1 and read2 as matched when only one matched in kpt=f mode (this IS the intended behavior in default kpt=t mode). Noted by Alex Spunde.
+Fixed string compare bug in FilterReadsByName making substring=header and substring=name fail. Noted by Esther Singer.
+34.99
+Fixed BBMap not printing coverage statistics when machineout=t. Noted by Vasanth Singan.
+Added minlen flag to filterbyname.
+Changed Dedupe primary structure from HashMap to LinkedHashMap to (somewhat) preserve input order.
+
+
+
+TODO: BBDuk crashes with K>31 (Alex Spunde).
+TODO: Memory autodetection does not work on Amazon.
+TODO: BBMap machineout to file (Vasanth).
+TODO: chrombits and CHROMS_PER_BLOCK may be obsolete and ready to remove.
+TODO: out=stdout.bam does not work.
+TODO: Include deletions toggle for Pileup.
+TODO: Soft-clipping coverage flag.
+TODO: Add match/cigar/SamLine trimming to TrimRead.
+TODO: Write Hollow.
+TODO: Multithread splitnextera.
+TODO: config flag in Parser
+TODO: Normalize CalcTrueQuality on 50% GC by tracking GC rates (etc) observed in reads.
+TODO: Make Recalibrate class and recalibrate.sh to automate everything.
+TODO: Track quality-score accuracy per base location.
+TODO: Track quality-score accuracy per base letter.
+TODO: Tool to extract reads mapped to a specific locus.
+TODO: Make it easy to test a decontam tool on the synth datasets.
+TODO: Map unknowns in 48-sample-plate.
+TODO: BBMerge return codes. -1 no solution, -2 ambig, -3 too long (short overlap), -4 too short.
+TODO: Seal speed and mkf flags should work together.
+TODO: Apply Seal refnames upgrade to taxonomy handling, if not already done.
+TODO: BBNorm histout with 1pass/ecc does not seem to generate anything.
+TODO: randomreads does not name reads by origin in fasta format.
+TODO: Hamming distance for demuxbyname.
+TODO: BBMerge has a lower merge rate when r1 and r2 are different lengths. (simple fix - swap them [done]).
+TODO: MultiCros wrapper and hash-based multi-listnum object.
+TODO: Reformat should be able to trim mapped sam files (Aldo J).
+TODO: Mask bases overlapping from Dedupe graph (Shoudan).
+TODO: Seal split capability, or BBSplit for short sequences (Manuel K).
+TODO: RQCFilter - dynamically switch between $TMPDIR or /dev/shm depending on input size and available disk space.
+TODO: BBMerge - trim adapters for unmerged reads (?)
+TODO: Fungal pipeline: FindErrors?
+*TODO: BBMap calls calcCorrectness even when data is not synthetic.
+TODO: BBMap File containing all reads/pairs that are not completely contained within a single contig. (Shoudan)
+TODO: BBDuk/Seal - enable tracking of kmers by reference file rather than reference sequence.
+TODO: Batch setting for BBDuk to operate on multiple files and auto-name output.
+TODO: Get data from Chris B, count mismatched pairs, send to E.
+TODO: Stats does not accurately estimate BBMap RAM usage for K=15.
+TODO: Accelerate maxindel=0 mode for BBMap by banning MSA usage.
+*TODO: BBMerge poster.
+TODO: Redo DedupeByMapping so that it can handle sorted input using a heap.
+TODO: MSA Flat - remove states to increase speed.
+TODO: Dedupe does not work with sam input. (Lynn A.)
+TODO: Change all instances of "remove bases with quality below minq" to "...trimq" in shellscripts.
+TODO: Parse extra part of sam lines into a byte array (optionally).
+*TODO: Dedupe crash on input in C:\temp\dd1\bad.fa (Shoudan).
+TODO: Tile-based statistics and filtering for BBMap, BBDuk, etc.
+TODO: Pileup could calculate ref/nonref coverage.
+TODO: Stats needs a fastq mode.
+TODO: Marcel wants a program to essentially sort reads and remove duplicates that are at least X identity.
+TODO: Move parsing of "threads" to parseCommonStatic and adjust all relevant classes.
+TODO: Add 'remap' from Reformat flag to BBMap.
+*TODO: BBMerge won't go below 17bp in normal mode or 26bp in loose mode, regardless of minoi flag.
+TODO: BBMerge dynamic mode - test to determine best overlap limits.
+TODO: Bed output of masked regions by BBMask, or regions with Ns.
+TODO: Bed output of regions with coverage above or below X (Bob Bower).
+TODO: Document append in shellscripts.
+TODO: Genbank format parser (Sam D). Looks confusing.
+TODO: Decontam should break at (or N-mask) low-coverage areas rather than discarding the whole contig.
+TODO: BED support for pileup. And make Pileup faster by ignoring irrelevant sam fields.
+TODO: CrossMask. Accept set of files; for each, mask using BBDuk with all others as ref.
+TODO: Study bisulfite data on BBMap. Possibly use multiple reference copies with different transforms (C->T, A-G, both, neither).
+TODO: Shellscripts are not able to handle paths containing spaces.
+TODO: Add mininsert flag to BBMap. And maybe maxinsert.
+TODO: Parse MD tag when available.
+TODO: CC rates for all 3 platforms in one chart; ignore R1/R2 differences.
+*TODO: Dedupe loses reads when using paired data and run multithreaded.
+TODO: document nhs flag.
+TODO: Filter cross-contam plates with only depth and length, test cc rates.
+TODO: Fix dedupe crash when minclustersize=1.
+TODO: Clarify or fix what minid does in Dedupe.
+TODO: Add ribosomal filtering to rqc.
+TODO: Update BandedAlignerJNI for quicker width reset.
+TODO: Optional penalty when seq ends before ref in banded.
+TODO: Make sure AddAdapters is adding them correctly, i.e., reverse-complemented (or not).
+TODO: Make list of proposed higher stringency adapter trimming changes and send to Vasanth/Erika.
+TODO: Retire ErrorCorrect, and move the functionality over to another class.
+TODO: Implement ErrorCorrectBulk in KmerNormalize. It is used in MateReadsMT.
+TODO: BBMerge should allow optional inline error-correction for reads that fail to merge, and revert if they still fail.
+TODO: Retire KmerCount7MT (non-atomic version).
+*TODO: It appears that timeslip is being correctly applied by fillLimited (etc), but not by calcDelScore() or calcAffineScore().
+TODO: Dedupe should warn if lowercase letters are present. (Kurt)
+
+
+v33.
+Added "usemodulo" flag to BBMap. Allows throwing away 80% of reference kmers to save memory. Slight reduction in sensitivity. Requested by Rob Egan.
+Moved GetReads back to jgi package and fixed shellscript.
+Fixed rare crash when using "local" mode on paired-end data on highly-repetitive genomes (Creinhardtii). Found by Vasanth S.
+Improved "usemodulo" mode - it was biased against minus-strand hits. Now, it keeps kmers where (kmer%5==rkmer%5). Result is virtually no reduction in sensitivity (zero in error-free reads, and less than 0.01% in reads with 8% error).
+BBMap will now discard reads shorter than "minlen".
+Added "idhistbins" or "idbins" flag to BBMap; allows setting the number of bins used in the idhist.
+Rescaled BBMap's MAPQ to be lower. It is now 0 for unmapped, 1-3 for ambiguous, and roughly 4-45 otherwise, with higher values allowed for longer reads.
+Added a much flatter MSA version, "MultiStateAligner9Flat", requested by JJ Chai.
+Fixed SNR output formatting.
+Added "forcesectionname" flag; fasta reads will always get an "_1" at the end, even if they are not broken into multiple pieces. (requested by Shoudan)
+Changed "fastareadlen" suffixes to only be appended when read is > maxlen rather than >=
+Reorganized SamLine and created SamHeader class.
+Modified CountBarcodes to append sub distance from expected barcodes and 'valid' for valid barcodes.
+Fixed null pointer exception related to "qhist", "aqhist", and "qahist". Noted by Harald (seqanswers).
+Fixed issue of readlength.sh breaking up reads when processing fasta files without a fasta extension.
+Updated BBDuk documentation.
+Added "maxlength" and qahist support to BBDuk.
+Added "minoverlap" and "mininsert" to BBDuk.
+Added "maxlength" to BBMerge.
+Created countbarcodes.sh
+Added edit distance column to CountBarcodes output.
+Added raw mapping score tag, YS:i:, controlled by "scoretag" flag and disabled by default.
+Added 'cq' (changequality) flag to reformat. Default: true.
+Fixed mhist being generated from sam files.
+Added readgroup support; a readgroup field "xx" can be specified with the flag "rgxx=value".
+Updated 'usemodulo' flag to use (kmer%9==0 || rkmer%9==0). Requiring the remainders to be equal unevenly affected palindromes and thus even kmer lengths.
+Updated RemoveHuman to use 'usemodulo' flag and reduced RAM allotment from 23g to 10g. Updated index location of HG19 masked.
+Added "idfilter" to BBMap.
+Made BandedAligner abstract superclass and created BandedAlignerConcrete for the Java implementation, and BandedAlignerJNI for the C version.
+Made file extension detection more robust against capitalization.
+Added outsingle to BBDuk.
+Replaced FastaToChromArrays with ChromArrayMaker. Now, indexing can be done from fastq files instead of just fasta.
+Fixed MAJOR bug in which reference was split up into pieces (as of 33.12).
+Reverted to old version of reference loader (as of 33.13) as there was still a bug (skipping every other scaffold).
+BBDuk (and BBDuk2) now better support kmer masking! Every occurrence of a kmer is individually masked.
+Added parseQuality (qin, qout, etc) to Dedupe.
+Changed Dedupe default cluster stats cutoff to 2 (from 10), min cluster size to 2, and by default these values are linked.
+Added 'outbest' to Dedupe, writing the representative read per cluster (regardless of 'pbr' flag). This is mainly for 16s clustering.
+Fixed sorting of depths in pileup.sh. Noted by Alicia Clum.
+Fixed 'outbest' of Dedupe (was writing to wrong stream).
+Slightly accelerated read trimming.
+Added read/base count tracking to ConcurrentReadStreamInterface.
+Added display of exact number of input and output bases and reads to reformat.sh (requested by Seung-Jin).
+Fixed capital letters changing to lower-case in output filenames when using the "basename" flag with BBSplit. Noted by Shoudan Liang.
+Added Tools.condenseStrict(array).
+Fixed fast/slow flags with BBSplit. Noted by Shoudan Liang.
+Added 3-frames option to TranslateSixFrames by adding the flag "frames=3". Requested by Anne M.
+TranslateSixFrames now defaults to fasta format when the file extension is unclear.
+Added "estherfilter.sh" for filtering blastall queries.
+Added option of getting an input stream from a process with null file argument.
+Wrote FastaToChromArrays2 based on ByteFile/ByteBuilder for slightly better indexing speed and lower memory use.
+Modified ChromosomeArray to work with ByteBuilder.
+Fixed reformat displaying wrong number of input reads when run interleaved (due to recent changes).
+Added minratio, maxindel, minhits, and fast flags to BBQC, for controlling BBMap.
+Fixed "assert(false)" statement accidentally left in SamPileup from testing. Noted by Brian Foster.
+Added kfilter and local flags to BBQC.
+Fixed "bs" (bamscript) flag with BBSplit. Previously, it did not include the per-reference output streams.
+Added Jonathan Rood's C code and JNI class for Dedupe.
+Modified dedupe shellscripts to allow JNI code.
+BBSplit was not outputting any reads when reference files had uppercase letters (as a result of the recent case-sensitivity change). This has been fixed. Noted by Shoudan Liang.
+BBMap can now output fastq files with reads renamed to indicate mapping location, using the flags "rbm" and "don" (renamebymapping and deleteoldname).
+FastaQualInputStream replaced by FastaQualInputStream3. At least 2.5x faster, and correctly reads input in which fasta and qual lines are wrapped at different lengths. Bug noted by Kurt LaButti.
+Added bqhist, which allows box plots of read quality-per-base-location.
+Fixed a slowdown when making quality histograms due to recalculating probability rather than using cached value.
+Default sam format is now 1.4.
+RemoveHuman/BBQC/RQCFilter now default to minhits=1 because 'usemodulo' reduces the number of valid keys.
+Programs no longer default to outputting to stdout when "out=" is not specified because it's annoying. To write to stdout set "out=stdout.fq" (for example).
+AssemblyStats now counts IUPAC and invalid characters separately. X and N now denote gaps between contigs, but no other symbols do. The code was also cleaned somewhat. The output formatting changed slightly.
+Preliminarily integrated Jon Rood's JNI versions of BandedAligner and MultiStateAligner into both Java code and shellscripts to test Genepool deployment.
+C code is now in /jni/ folder, at same level as /resources/ and /docs/.
+Clarified documentation of BBMap, BBSplit, and BBWrap to differentiate some parameters. For example, "refstats" only works with BBSplit.
+Added LW and RW (whisker values) columns to bqhist output, set at the 2nd and 98th percentiles. Requested by Seung-Jin Sul.
+BBQC will now compress intermediate files to level 2 instead of level 4, to save time.
+Fixed incompatibility of dot graph output and other output in Dedupe.
+Reverted to default "minhits=2" for RemoveHuman, because minhits=1 took 5x as long.
+Added median, mean, and stdev to gchist. Requested by Seung-Jin.
+Added obqhist (overall base quality histogram). Requested by Seung-Jin.
+Fixed various places, such as BBDuk, where the "int=true" flag caused references to be loaded interleaved. Noted by Jessica Jarett.
+Added some parser flags to allow dynamically enabling verbose mode and assertions specifically for certain classes.
+Fixed a bug in BBMap that made secondary alignments sometimes not get cigar strings.
+Added "addprefix" mode to rename reads, which simply prepends a prefix to the existing name.
+Clarified documentation of different histogram outputs in shellscripts.
+Ported BBMapThread changes over to BBMap variants.
+Restructured SamPileup and renamed it to CoveragePileup. Now supports Read objects (instead of just SamLines).
+Integrated CoveragePileup with BBMap and documented new flags.
+CoveragePileup: Added a concise coverage output, stranded coverage, and read-start-only coverage.
+Removed obsolete Java classes and some shellscripts.
+Increased robustness of BBDuk's detection of invalid file arguments, and clarified the error messages. Noted by Scott D.
+Fixed a problem with interleaving not being forced on fasta input.
+Paired output files will now force BBDuk input to be treated as interleaved.
+BBDuk now tracks statistics on which reference sequences were trimmed or masked - previously, it just tracked what was filtered.
+Reverse-complemented Nextera adapters and added them to official release (/resources/nextera.fa.gz).
+Added Illumina adapter sequence legal disclaimer to /docs/Legal_Illumina.txt
+Implemented GC calculation from index, for generating coverage stats while mapping.
+Tracked down strangeness with BBDuk. It is possible for "rcomp=f" to slightly reduce sensitivity when "mm=t" using an even kmer length, due to asymmetry. This appears to be correct.
+Merged in revised JNI Dedupe version that should be working correctly. Verified that it returns same answer as non-JNI version. Tests indicate roughly triple speed, when working with PacBio reads of insert.
+BBMap JNI version now seems roughly 30% faster than Java version.
+Added insert size quartiles to BBMap and BBMerge. Requested by Alex Copeland.
+Fixed rare bug related to SiteScore.fixXY(), caused by aligning reads with insufficient padding, fixing the tips, but not changing the start/stop positions. Found by Brian Foster.
+Fixed a race condition in TextStreamWriter that could randomly cause a deadlock in numerous different programs. Found by Shoudan Liang.
+Added "maxsites2" flag to allow over 800 alignments for a given read.
+Fixed bounds of kmer masking in BBDuk; they were off by 2 (too big).
+Fixed unintended debug print line. Noted by Shoudan Liang.
+Updated RandomReadInputStream to work with the newer RandomReads3 class.
+ConcurrentGenericReadInputStream now supports RandomReadInputStream3 as a producer.
+Fixed kmer dumping from CountKmersExact.
+Fixed length of vector created in BBMergeOverlapper (4->5). Noted by Jon Rood.
+Changed default kmer length in BBDuk to 27 so that the 'maskmiddle' base will be in the middle for both forward and reverse kmers.
+"pairlen" flag accidentally deleted from BBMap; restored. Noted by HGV (seqanswers).
+BBMerge now has a JNI version from Jonathan Rood - 60% faster than pure Java. Requires compiling the C code; details are in /jni/README.txt.
+Wrapped BBMerge JNI initializer in a conditional, so it will not try to load unless "usejni" is specified.
+Added "parseCommonStatic" to BBMerge and BBDuk (to allow JNI flag parsing).
+Commented out "module load" and "module unload" statements in public version.
+Added 'printlastbin' or 'plb' flag to countunique to produce a final bin smaller than binsize. Suggested for use in cumulative mode. Requested by Andrew Tritt.
+Added support for bzip2 and pbzip2 compression and decompression. The programs must be installed to use bz2 format.
+Eliminated use of "sh" when launching subprocesses. This also allows pigz compression support in Windows.
+Files were not being closed after "testInterleaved()". Fixed.
+Improved error messages when improper quality values are observed.
+Updated hard-coded adapter path to include Nextera adapters. This affects BBQC and RQCFilter.
+Improved file format detection. Now FileFormat (testformat.sh) will print a warning when the contents and extension don't match, and it can differentiate between sam and fastq. Problem noted by Vasanth Singan.
+Fixed issue where "scafstats" output was printing inflated numbers with chimeric paired reads, or pairs with only one mapped read. Noted by HGV (seqanswers).
+Closed stream after reading in FileFormat.
+Unrolled, debranched, and removed assertion function calls from BBMerge inner loop.
+Fixed a bug in which findTipDeletions was not changing the bounds of the gap array.
+Added getters and setters for SiteScores that enforce gap correctness.
+Improved GapTools to test for and fix non-ascending points.
+Forced use of setters in TranslateColorspaceRead, AbstractMapThread, and BBIndex* classes; this caught some inconsistencies that should increase stability and correctness.
+Enabled jni-mode alignment by default for BBQC and removehuman.
+Added a BBMap output line indicating how many reads survived for use with, e.g., removehuman. Requested by Brian Foster.
+Added messages to BBQC to indicate which phase is executing. Requested by Brian Foster.
+SiteScore start and stop are exclusively set by methods now. Fixed a bug with local flag noted by Vasanth Singan.
+Added MaximumSpanningTree generation to Dedupe (mst flag).
+Merged in faster BBMerge overlapper JNI version; now 90% faster than Java with fastq and 70% faster with fasta.
+Improved Dedupe's support for paired reads: fixed an assertion, and added "in1" and "in2".
+Fixed an assertion involving semiperfect alignments of repetitive reads that go out of the alignment window. Found by Alicia Clum.
+Fixed idhist mean calculation. Added mode, median, stdev, both by read count and base count.
+Better documented ConcurrentReadStreamInterface.
+Fixed a crash in CoveragePileup when using 32-bit mode.
+Fixed a couple instances in which the first two arguments being unrecognized would not be noticed.
+Fixed a bug in pileup causing coverage fraction to be reported incorrectly, if arrays were not being used. Noted by Vasanth Singan.
+Fixed a twocolumn mode in pileup; it was generating no output.
+Added additional parse flags to pileup, such as "stats" and "outcov".
+Added additional output fields to coverage stats - total number of covered bases, and number of reads mapped to plus and minus strands.
+CountKmersExact: Added preallocation (faster, less memory) and a one-pass-mode for the prefilter (faster, but nondeterministic).
+Replaced most instances of "Long.parseLong" with "Tools.parseKMG" to support kilo, mega, and giga abbreviated suffixes.
+Added jgi.PhylipToFasta and phylip2fasta.sh, for converting interleaved phylip files to fasta. Requested by Esther Singer.
+v33.58
+Began listing point-version numbers in this readme.
+Added jgi.A_Sample2, a simpler template for a concurrent pipe-filter stage.
+Added jgi.MakeChimeras, a tool for making chimeric PacBio reads from input non-chimeric reads. Also, makechimeras.sh. Requested by Esther Singer.
+Added support for normalized binning to CoveragePileup. Requested by Vasanth Singan.
+v33.59
+Fixed pileup's normalized scaling when dealing with 0-coverage scaffolds.
+v33.60
+Added driver.FilterReadsByName.java and filterbyname.sh. Allows inclusion or exclusion of reads by name.
+Added midpad flag to RandomReads (allows defining inter-scaffold padding).
+v33.61
+Added ConcurrentReadInputStreamD, prototype for MPI-version of input stream.
+Made Read and all classes that might be attached to reads Serializable.
+Added DemuxByName and demuxbyname.sh which allows a single file to be split into multiple files based on read names.
+v33.62
+Added FilterByCoverage and filterbycoverage.sh to filter assemblies based on contig coverage stats (from Pileup).
+Added CovStatsLine, an object representation of Pileup's coverage stats.
+Added '#' symbol to coverage stats header.
+v33.63
+Fixed path in filterbycoverage.sh
+v33.64
+Added custom scripts driver.MergeCoverageOTU and mergeOTUs.sh for Esther.
+Added DecontaminateByNormalization, for automating SAG plate decontamination.
+Fixed legacy code that set KmerNormalize to use 8 threads in some cases.
+Added "fixquality" for capping quality scores at 41. Requested by Bryce Foster.
+Added fasta output to kmercountexact. Requested by Alex Copeland.
+Added kmer histogram to kmercountexact (2-column and 3-column). Requested by Alex Copeland.
+Added multiple memory-related and output formatting flags to kmercountexact.
+Made KmerNode a subclass of AbstractKmerTable.
+Improved Data's "unloadall" to also clear scaffold-related data.
+Removed obsolete class CoverageArray1.
+v33.65
+Reduced preallocated memory in kmercountexact to avoid a crash on high memory machines. Also reduced total number of threads.
+v33.66
+"CountKmersExact.java" renamed to "KmerCountExact.java".
+kmercountexact now writes histogram and kmer dump simultaneously in separate threads.
+kmercountexact.sh now specifies both -Xms and -Xmx.
+CountKmersExact will no longer run out of memory if -Xms is not specified; instead, it will preallocate a smaller table.
+v33.67
+Messed with MDA amp in RandomReads a bit.
+Added parser "ztd" ("zipthreaddivisor") flag. Defaults to 2 for removehuman.sh.
+Added BBMerge flags "maq" (minaveragequality) and "mee" (maxexpectederrors). Reads violating these will not be attempted to merge.
+Added BBMerge "efilter" flag, to allow disabling of the efilter. Efilter bans merges of reads that have more than the expected number of errors, based on quality scores.
+Closed A_Sample2 I/O streams after completion. Noted by Jon Rood.
+Created SynthMDA, a program to make a synthetic MDA'd single cell genome. This genome would be used as a reference for RandomReads.
+Added Reformat "vpair" or (verifypairing) flag, which allows validation of pair names. Before, it was just interleaved reads.
+Pair name validation will now accept identical names, if the "ain" (allowidenticalnames) flag is set.
+Updated reformat.sh, repair.sh, bbsplitpairs.sh with new flags.
+Removed FastaReadInputStream_old.java.
+Added "forcelength" flag to MakeChimeras.
+v33.68
+Added "ihist" flag to rqcfilter, default "ihist.txt". Unless this is set to null, BBMerge will run to generate the insert size histogram after filtering completes.
+AbstractKmerTable preallocation is now multithreaded. Unfortunately, this did not result in a speedup.
+Added ByteBuilder-related methods to certain Read output formats.
+Added ByteStreamWriter. This is a threaded writer with low overhead, and is substantially faster than TextStreamWriter (perhaps 2x speed).
+Fixed a bug in KmerNode (traversing wrong branch during dump).
+All AbstractKmerTable subclasses now dump kmers using bsw/ByteBuilder instead of tsw/StringBuilder.
+Added ForceTrimLeft/ForceTrimRight flags to Dedupe (requested by Bryce/Seung-Jin).
+v33.69
+FilterByCoverage (and thus DecontaminatebyNormalization) now produce a log file indicating which contigs were removed.
+FilterByCoverage and DecontaminatebyNormalization can now optionally process coverage before and after normalization, and not remove contigs unless the coverage changes by at least some ratio (default 2). Enable with "mapraw" and optionally "minratio" flag.
+Added ihist to file-list.txt. TODO: Verify success.
+Reads longer than 200bp are now detected as ASCII-33 regardless of their quality values. This helps with handling PacBio CCS/ROI data.
+Added support in FixPairsAndSingles (repair.sh) for reads with names that do not contain whitespace, but still end with "/1" and "/2".
+Added qout flag to RandomReads3.
+Refactored TextStreamWriter to be more like ByteStreamWriter.
+Added gcformat 0 (no base content info printed) to AssemblyStats2 (stats.sh).
+v33.70
+Updated RQCFilter and BBQC to bring them closer together and improve some of their defaults. RQCFilter now has more parameters such as k for filtering and trimming.
+RQCFilter now correctly produces the insert size histogram.
+v33.71
+Fixed a bug in Dedupe preventing overlap detection when 'absorb match' and 'absorb containment' were both disabled. Noted by Shoudan Liang.
+Optimized synthetic MDA procedure.
+v33.72
+Fixed a bug in SynthMDA.java. Further tweaked parameters.
+Added synthmda.sh.
+v33.73
+Further tweaked SynthMDA defaults to better match some real data sent to me by Shoudan and Alex.
+Fixed a bug in BBDuk's mask mode in which all bases in a masked read were assigned quality 0. Noted by luc (SeqAnswers).
+Fixed a small error in KmerCountExact's preallocation calculation.
+Added preallocation to BBDuk/BBDuk2. Not recommended for BBDuk2 because the tables may need unequal sizes.
+Added "restrictleft" and "restrictright" flags to BBDuk (not BBDuk2). These allow only looking for kmer matches in the leftmost or rightmost X bases. Requested by lankage (SeqAnswers).
+v33.74
+Added jgi.Shuffle.java to input a read set and output it in random order. It can also sort by various things (coordinates, sequence, name, and numericID).
+Added CallPeaks, which can call peaks from a histogram. Requested by Kurt LaButti.
+Integrated peak calling into BBNorm and KmerCountExact.
+BBNorm now has a "histogramcolumns" flag, so it can produce Jellyfish-compatible output.
+Added callpeaks.sh.
+v33.75
+CallPeaks now calls by raw kmer count rather than unique kmer count. This better detects higher-order peaks.
+Finished CrossContaminate.java and added crosscontaminate.sh.
+Added "header" and "headerpound" to pileup.sh, to control header presence and whether they start with "#".
+Added "prefix" flag to SynthMDA and RandomReads3, to better track origin of reads during cross-contamination trials.
+RQCFilter and BBQC now parse 'usejni' flag; rqcfilter.sh and bbqc.sh default to this being enabled.
+Added "uselowerdepth" flag to BBNorm (default true). Allows normalization by depth of higher or lower read. Set to false by DecontaminateByNormalization.
+v33.76
+Fixed a bug in synthmda.sh command line.
+Fixed build number not being parsed by SynthMDA.
+Added some error handling to CrossContaminate, so it shouldn't hang as a result of missing files.
+v33.77
+SynthMDA now nullifies reference in memory prior to generating reads.
+Parser was not correctly setting the number of compression threads when exactly 1 was requested.
+Shuffle is now multithreaded, and CrossContaminate defaults to shufflethreads=3.
+Shuffle now removes reads as they are printed, reducing memory usage.
+Created shellscript templates for generating and assembling full plates of synth MDA data, and ran successfully.
+*SamLine was fixed when generating pnext from clipped reads. Still needs work; pos1 and pos2 need to be recalculated considering clipping.
+BBDuk now tracks #contaminant bases as well as #contaminant reads per scaffold for stats. Additional flag "columns=5" enables this output.
+BBDuk stats are now sorted by #bases, not #reads.
+BBDuk counting arrays changed from int to long to handle potential overflow.
+v33.78
+Modified DemuxByName to handle affixes of variable length (though it's less efficient with multiple lengths).
+v33.79
+Changed the way "pos" and "pnext" are calculated for paired reads to be consistent. Bug had been noted with soft-clipped reads by Rob Egan.
+Changed LOCAL_ALIGN_TIP_LENGTH from 8 to 1. Previously, soft-clipping would only occur if at least 8 bases would be clipped; not sure why I did that.
+Changed the way "tlen" is calculated to compensate for clipping.
+v33.80
+Changed default decontaminate minratio from 2 to 0 (disabling it) because of false negatives.
+Changed default decontaminate mincov from 4 to 5 due to a false negative.
+Changed default decontaminate kfilter from 63 to 55 to better reflect Spades defaults.
+Fixed a bug in filterbycoverage which was outputting contaminant contigs instead of clean contigs.
+Added outd (outdirty) flag to FilterByCoverage.
+v33.81
+Changed decontaminate normalization target from 100 to 50, and minlength from 0 to 500.
+Changed decontaminate minc and minp flags from int to float.
+v33.82
+Changed cross contaminate probability root from 2 to 3 (increasing amount of lower-level contamination).
+Fixed a crash bug in sam file generation caused by the change in the way pos was calculated.
+v33.83
+Added aecc=f, cecc=f, minprob=0.5, depthpercentile=0.8 flags to DecontaminateByNormalization. Defaults are as listed.
+Dropped mindepth to 3 and maxdepth to target; target default changed to 20.
+Changed the way mindepth is handled in normalization; now it is based on the depth of the higher read.
+v33.84
+Added BBNorm prebits flag for setting prefilter cell size (default 2).
+Added Decontaminate filterbits and prefilterbits flags, default 32 and 4. 4 was chosen because MDA data has high error kmer counts.
+v33.85
+Fixed parsing of decontaminate minc and minp (parsed as ints; should have been floats)
+Changed default minc to 3.5.
+Changed default ratio to 1.2.
+v33.86
+Changed decontaminate default dp to 0.75.
+Changed decontaminate default prebits to 2.
+Changed decontaminate default minr (min reads) to 20. Some tiny (~500bp) low-coverage contigs were getting through.
+Changed decontaminate mindepth to 2.
+Decontaminate results now prints extra columns for read counts and pre-norm coverage.
+v33.87
+Added "covminscaf" flag to BBMap and Pileup, to suppress output of really short contigs. Default 0.
+Changed CrossContaminate coverage distribution from cubic to geometric.
+v33.88
+Shuffle removing reads caused incredible slowness; it should have set reads to null. Fixed.
+v33.89
+Added HashArrayA, HashForestA, KmerNodeA and updated AbstractKmerTable to allow sets of values per kmer.
+Refactored all AbstractKmerTable subclasses.
+Added scaffold length tracking to BBDuk (for RPKM).
+Added RPKM output to BBDuk (enable with "rpkm" flag).
+BBDuk now unloads kmers after finishing processing reads.
+v33.90
+BBDuk counter arrays are now local per-thread, to prevent cache-thrashing.
+Added IntList.toString()
+Created Seal class, based on BBDuk with values stored in arrays.
+Adjusted auto skip settings of BBDuk (increased size threshold for longer skips).
+Added BBDuk skip flag (controls minskip and maxskip).
+Fixed a bug in DemuxByName/DecontaminateByNormalization/CrossContaminate: attempt to read directories as files.
+v33.91
+Fixed a bug in BBDuk related to clearing data too early. Noted by Brian Foster.
+v33.92
+Added per-reference-file stats counting to BBDuk/Seal, and "refstats" flag.
+Added returnList(boolean) to ConcurrentReadStreamInterface.
+Removed an extra listen() call from ConcurrentReadInputStreamD.
+Documented "addname" flag for stats.sh.
+Implemented restrictleft and restrictright for BBDuk2.
+Added "nzo" flag for BBDuk/Seal.
+Added sdriscoll's reformatted shellscript help for BBDuk and BBMap. Thanks!
+Added more documentation to bbmap.sh (usequality flag).
+Added maq (minaveragequality) flag to BBMap, at request of sdriscoll.
+Added rename flag to BBDuk/Seal - renames reads based on what sequences they matched.
+Added userefnames flag BBDuk/Seal - the names of reference files are used, rather than scaffold IDs.
+
+v33.93
+maxindel flag now allows KMG suffix.
+Added "speed" flag to BBDuk/Seal.
+Added read processing time to BBDuk/Seal output.
+BBDuk "fbm" (findbestmatch) mode is now much faster, using variable rather than fixed-length counters.
+Fixed BBDuk2 not working when using the "ref" flag rather than "filterref".
+Changed AbstractKmerTable subclass names to *1D and *2D.
+Made KmerNode a superclass of KmerNode1D and KmerNode2D and eliminated redundant methods.
+Eliminated 2D version of HashForest; it now works with 1D and 2D nodes.
+Made HashArray a superclass of HashArray1D and HashArray2D.
+Created HashArrayHybrid.
+Added slow debugging methods to AbstractKmerTable classes, to verify that values were present after being added.
+Fixed bug in KmerNode1D; was never changing its value on 'set'. Probably only affected Seal. Seal 1D now appears to produce identical output for prealloc and non-prealloc.
+Finished debugging KmerNode2D, KmerForest, HashArray2D, HashArrayHybrid, and Seal.
+Added "fbm" and "fum" to Seal.
+Seal now defaults to 7 ways.
+Adjusted Seal's memory preallocation.
+Added -Xms flag to BBMergeGapped and BBNorm shellscripts.
+v33.94
+Added -Xms flag to BBDuk and Seal.
+Added qskip flag to BBDuk and Seal (for skipping query kmers).
+v33.95
+Seal now defaults to HashArrayHybrid rather than HashArrayArray2D
+v33.96
+Fixed a slowdown in Seal and BBDuk caused by sorting list of ID hits.
+v33.97
+Wrote driver.CorrelateIdentity and matrixtocolumns.sh for identity correlations between 16S and V4.
+Wrote jgi.IdentityMatrix and idmatrix.sh for all-to-all alignment.
+Added BandedAligner.alignQuadruple() to check all orientations.
+BandedAligner now does not clear the full arrays, only the used portion, which can vary depending on read length.
+v33.98
+No change - build failure.
+v33.99
+Changed BandedAligner.PenalizeOffCenter(). Indels were getting double-penalized when they led to length mismatches between query and ref.
+Added AlignDouble(), but it looks like AlignQuadruple is the only viable method for calculating full identity when the sequences do not start or stop at the same place.
+Added test method to ReadStats to ensure the files are safe to write (ReadStats.testFiles()).
+Fixed a bug in bqhist output giving read 1 and read 2 the same values. Noted by Shoudan/Bryce.
+Fixed a bug in BBDuk initialization when no kmer input supplied. Noted by Bill A.
+Fixed a bug in BBDuk/Seal giving a spurious warning.
+Detected race condition in ByteFile2 triggered by closing early. Not very important.
+Added jni path flags to BBDuk shellscript command line.
+Wrote FindPrimers and msa.sh to locate primer sites. Uses MultiStateAligner; outputs in sam format.
+Wrote CutPrimers and cutprimers.sh to cut regions flanked by mapped primer locations from sequences, e.g. V4.
+
+TODO: Plot correlation of V4 and 16s.
+TODO: Add length into edges of Dedupe output. (Ted)
+TODO: Benchmark Seal. Speed seems inconsistent.
+TODO: Locking version of Seal.
+TODO: HashArray resize - grow fast up to a limit, then resize to exactly the max allowable.
+TODO: Alicia BBMap PacBio slowdown (try an older version...)
+TODO: BBMerge rename mode with insert sizes.
+TODO: Dump info about Seal kmer copy histogram.
+TODO: rpkm for pileup / BBMap.
+TODO: Dedupe crash bug. (Kurt)
+TODO: CallPeaks minwidth should be a subsumption threshold, not creation threshold.
+TODO: CallPeaks should not subsume peaks with valleys in between that are very low.
+*TODO: Make TextStreamWriter an abstract superclass.
+TODO: BBDuk split mode
+TODO: Add option for BBMap to convert U to T. (Asaf Levy)
+TODO: Add dedupe support for graphing containments and matches.
+TODO: Log normalization.
+TODO: Prefilterpasses (prepasses)
+TODO: Test forcing msa.scoreNoIndels to always run bidirectionally.
+TODO: Message for BBNorm indicating pairing (this is nontrivial)
+TODO: Average quality for pileup.sh
+TODO: Fix ChromArrayMaker which may skip every other scaffold (for now I have reverted to old, correct version). ***Possibly fixed by disabling interleaving; TODO: Test.
+TODO: Consider changing ConcurrentGenericReadInputStream to put read/base statistics into incrementGenerated(), or at least in a function.
+TODO: BBSplit produces alignments to the wrong reference in the output for a specific reference. (Shoudan)
+TODO: Change the way Ns are handled in cigar strings, both input and output.
+TODO: Add #clipped reads/bases to BBMap output.
+TODO: Add method for counting number of clipped bases in a read and unclipped length.
+TODO: Orientation statistics for BBMap ihist.
+TODO: Clarify documentation of 'reads' flag to note that it means reads OR pairs.
+TODO: bs flag does not work with BBWrap (Shoudan).
+TODO: Fasta input sometimes tries to keep reading from the file when a limited number of reads is specified. Gives error message but output is fine.
+TODO: 'saa' flag sometimes does not work (Shoudan).
+TODO: Kmer transition probabilities for binning.
+TODO: One coverage file per scaffold; abort if over X scaffolds. (Andrew Tritt)
+TODO: Enable JNI by default for BBMap and Dedupe on Genepool.
+TODO: Disable cigar string generation when dumping coverage only (?). This will disable stats, though.
+TODO: Pipethread spawned when decompressing from standard in with an external process.
+TODO: FileFormat should test interleaving and quality individually on files rather than relying on a static field.
+TODO: Refstats (BBSplit) still reports inflated rates for pairs that don't map to the same reference. This behavior is difficult to change because it is conflated with BBSplit's output streams.
+
+
+v32.
+Revised all shellscripts to better detect memory in Linux. This should massively increase reliability and ease of use.
+Added append flag. Allows appending to output files instead of overwriting.
+Append flag now should work with BBWrap, with sam files, and with gzipped files.
+All statistics are now stored in longs, rather than ints.
+Added statistics tracking of # bases as well as # reads. Updated human-readable output to show 4 columns.
+Split bbmerge into gapped (split kmer) and ungapped (overlap only) versions. bbmerge.sh calls the ungapped version.
+Added "qahist" to bbmap - match/sub/ins/del histogram by quality score.
+Fixed "pairlen" flag; it was only being used if greater than the default. (Noted by Harald on seqanswers)
+Added insert size median and standard deviation to output stats. The 'ihist=' flag must be set to enable this, otherwise the data won't be tracked. (Requested by Harald on seqanswers)
+Fixed bug in which non-ACGTN IUPAC symbols were not being converted to N. (Noted by Leanne on seqanswers)
+Changed shellscripts from DOS to Unix EOL encoding.
+Added support for "-h" and "--help" in shellscripts (before it was just in java files).
+Created Dedupe2 - faster, and supports 1-cluster-per-file output.
+Created Dedupe3 - supports more than 2 affix tables. Uses slightly more memory.
+BBMap now generates "sort" shellscripts even if the output is in bam format.
+pileup.sh now prints a coverage summary to standard out.
+Added 'split' flag to BBMask.
+Fixed bug in randomreads allowing paired reads to come from 'nearby' scaffolds.
+Documented randomreads.sh.
+Added gaussian insert size distribution to randomreads.
+Fixed a bug in calcmem.sh that prevented requesting memory that Linux considered 'cached'.
+TODO: Penalize score of sites with errors near read tips, and long deletions.
+Added "Median_fold" column to pileup. You need to set 'bitset=
+Changed default quality-filtering mode to average probability rather than average quality score.
+Default number of threads now takes the environment variable NSLOTS into consideration. However, because Mendel nodes have hyperthreading enabled, if NSLOTS>8 and (# processors)==NSLOTS*2, then #processors will be used instead. So it is still recommended that you set threads manually if you don't have exclusive access to a node.
+Fixed bbmerge, which was crashing on fasta input.
+Fixed gaussian insert size distribution in randomreads (it was causing a crash).
+Enabled unpigz support in Windows (decompression only).
+TODO: BBNorm needs in1/in2/out1/out2 support.
+Added mingc and maxgc to reformat.
+Added 'passes' flag to BBQC and reduced default passes to 1 if normalization is disabled.
+Swapped FileFormat's method signature "allowFileRead" and "allowSubprocess" parms for some functions, as they were inconsistent. This may have unknown effects.
+TODO: unclear if fasta files are currently checked for interleaving. Method added to "FASTQ".
+TODO: FileFormat should perhaps test for quality format and interleaving.
+Fixed reversed variables in "machineout" stats for %mapped and %unambiguous. Found by Michael Barton.
+Added "testformat.sh".
+Fixed dedupe "csf" output to work even when no other outputs specified.
+Fixed dedupe erroneous assumption that "bandwidth" had not been custom-specified.
+Changed MakeLengthHistogram (readlength.sh) default behavior to place reads in lower bins rather than closest bins. Toggle with "round" flag.
+Added "repair" flag to SplitPairsAndSingles. Created "repair.sh".
+Fixed a bug in which tabs were not allowed in fasta headers.
+Improved BBMerge: default minqo 7->8, made margin a parameter, added 'strict' macro that reduces false positive rate.
+Added "samestrand" flag to RandomReads.
+Fixed a dedupe bug with "pto" and paired reads; read2 was not getting a UnitID.
+Fixed a bug in which the BBMap stats for insertion rate was sometimes higher than the true value.
+Fixed bugs in BBMerge; increased speed slightly.
+Created grademerge.sh to grade merged reads.
+Added 'variance' flag to randomreads; used to make qualities less uniform between reads.
+BBDuk now has overwrite=true by default.
+calcmem.sh now sets -Xmx and -Xms from each other if only one was specified.
+
+Fixed bug with "ambig=all" and "stoptag" flags being used together. Found by WhatSoEver (seqanswers).
+Added 'findbestmatch'/'fbm' flag to BBDuk; reports the reference sequence sharing the greatest number of kmers with the read.
+Shellscripts no longer try to calculate memory before displaying help (noted by Kjiersten Fagnan).
+-ea and -da are now valid parameters for all shellscripts.
+Improved documentation of Dedupe.
+Added "loose" and "vloose" modes to BBMerge.
+Added novel-kmer-filtering to BBMerge - bans merged reads that create a novel kmer. Does not seem to help.
+Added entropy-detection to BBMerge - minimum allowed overlap is determined by entropy rather than a constant. Moderate improvement.
+Fixed bug causing "repair.sh" script to not work. Noted by SES (seqanswers).
+Added "fast" mode to BBMerge.
+Fixed a rounding problem in RandomReads that caused gaussian distribution to have 2x frequency of intended reads at exactly insert size of double read length.
+Added exponential decay insert size distribution to RandomReads, for use in LMP libraries.
+TODO: Track different paired read orientation rates (innie, outie, same direction, etc) with BBMap.
+Added sssr (secondarysitescoreratio) and ssao (secondarysiteasambiguousonly) flags. Response to WhatSoEver (seqanswers).
+Ambiguously-mapped reads that print a primary site now print a minimum of 1 secondary site, and all sites with the same score as the top secondary site.
+Improved error message for paired reads with unequal number of read 1 vs read 2. Response to Salvatore (seqanswers).
+Updated bbcountunique.sh help message.
+Changed AddAdapters default to "arc=f" (no reverse-complement adapters). Added "addpaired" flag (adds adapter to same location of both reads).
+Added BBDuk/BBDuk2 "tbo" (trimbyoverlap) flag. Vastly reduces false-negatives with no increase in false-positives.
+Added "fragadapter" flag to RandomReads. Also added ability to handle multiple different adapters for both read 1 and read 2. Adapters are added to paired reads with insert size shorter than read length.
+Added "ordered" flag to BBDuk/BBDuk2.
+Added "tpe" (trimpairsevenly) flag to BBDuk/BBDuk2. This works in conjunction with kmer-trimming to the right. Slightly decreases false negatives and doubles false positives.
+Updated rqcfilter and bbqc with 'tbo' and 'tpe' flags.
+TODO: Migrate RQCFilter to BBDuk2.
+Improved addadapters to better handle reads annotated by renamereads.
+BBMap's fillLimited routine is now affected by 'sssr' flag, if secondary sites are enabled. This will make things slightly slower when secondary sites are enabled, if sssr uses a low value (default is 0.95).
+statswrapper now allows comma-delimited files.
+Added standard deviation to BBMerge (requested by Bryce F).
+Added "tbo" (trimbyoverlap) flag to BBMerge, as an alternative to joining.
+Updated help for 'ambig' in bbmap.sh to remove the obsolete information that 'ambig=all' did not support sam output.
+Updated BBMapSkimmer and its shellscript to default to 'ambig=all', which is its intended mode.
+BBDuk no longer defaults to "out=stdout.fq" because that was incredibly annoying. Now it defaults to "out=null".
+Changed BBDuk default mink from 4 to 6.
+Changed BBDuk, Reformat, SplitPairsAndSingles default trimq from 4 to 6.
+Added "ftr"/"ftl" flags to BBDuk.
+Added "bbmapskimmer" to the list of options parsed by BBWrap. (Noted by JJ Chai)
+Corrected documentation of idtag and stoptag - both default to false, not true. (Noted by JJ Chai)
+Added "mappedonly" flag to reformat. (Requested by Kristen T)
+Added "rmn" (requirematchingnames) flag to Dedupe. Requested by Alex Copeland.
+Added ehist, indelhist, idhist, gchist, lhist flags to BBMap, BBDuk, and Reformat.
+Added removesmartbell.sh wrapper for pacbio.RemoveAdapters2.
+Fixed instance in KmerCoverage where input stream was being started twice. Noted by Alicia Clum.
+Added "ngn" (NumberGraphNodes) flag to dedupe; default true. Allows toggling of labelling graph nodes with read number or read name.
+"slow" flag now disables a heuristic that skipped mapping reads containing only kmers that are highly overrepresented in the reference. Problem noted by Shoudan Liang.
+Added MergeBarcodes and mergebarcodes.sh
+Identity is now calculated neutrally by default.
+Added "qin" and "qout" documentation to bbnorm shellscripts. Noted by muol (seqanswers).
+Changed qhist to output additional columns - both linear averages and logarithmic averages.
+Added mode to BBMerge output.
+Added mode, min, max, median, and standard deviation to ReadLength output. The mode and std dev are affected by bin size, so will only be exactly correct when bin size is 1.
+Added "nzo" (nonzeroonly) flag to ReadLength.
+Created "A_Sample", a template for programs that input reads, perform some function, and output reads.
+BBNorm now works correctly with dual input and output files. Noted by Olaf (seqanswers).
+Added mode to BBMap insert size statistics.
+Added CorrelateBarcodes and filterbarcodes.sh, for analyzing and filtering reads by barcode quality.
+Added "aqhist" (average quality histogram) to ReadStats - can be used by BBMap, BBDuk, Reformat.
+
+
+v31.
+TODO: Change pipethreads to redirects (where possible), and hash pipethreads by process, not by filename.
+TODO: Improve scoring function by using Gumbel distribution and/or accounting for read length.
+TextStreamWriter was improperly testing for output format 'other'. Noted by Brian Foster.
+Fixed bug for read stream 2 in RTextOutputStream3. Found by Brian Foster.
+Fixed bug in MateReadsMT creating an unwanted read stream 2. Found by Brian Foster.
+TrimRead.testOptimal() mode added, and made default when quality trimming is performed; old mode can be used with 'otf=f' flag.
+Fixed a couple cases where output file format was set to "ordered" even though the process was singlethreaded; this had caused an out-of-memory crash noted by Bill A.
+Changed shellscripts of MapPacBio classes to remove "interleaved=false" term.
+Reduced Shared.READ_BUFFER_LENGTH from 500 to 200 and Shared.READ_BUFFER_MAX_DATA from 1m to 500k, to reduce ram usage of buffers.
+Noticed small bug in trimming; somehow a read had a 'T' with quality 0, which triggered assertion error. I disabled the assertion but I'm not sure how it happened.
+Fixed bug in which pigz was not used to decompress fasta files.
+All program message information now defaults to stderr.
+Added "ignorebadquality" (ibq) flag for reads with out-of-range quality.
+TODO: mask by information content
+Added "mtl"/"mintrimlength" flag (default 60). Reads will not be trimmed shorter than that.
+Made 'tuc' (to uppercase) default to true for bbmap, to prevent assertion errors. Reads MUST be uppercase to match reference.
+Added new tool, BBMask.
+Reads and SamLines can now be created with null bases.
+SamLines to Read is now faster, skipping colorspace check.
+Added deprecated 'SOH' symbol support to FastaInputStream. This will be replaced with a '>'. Needed to process NCBI's NT database.
+Added "sampad" or "sp" flag to BBMask, to allow masking beyond bounds of mapped reads.
+TODO: %reads with ins, del, splice
+TODO: #bases mapped/unmapped, avg read length mapped/unmapped
+Dedupe now tracks and prints scaffolds that were duplicates with "outd=". (request by Andrew Tritt)
+Updated all shellscripts to support the -h and --help flags. (suggested by Westerman)
+RAM detection is now skipped if user supplies -Xmx flag, preventing a false warning. (noted by Westerman)
+Created AddAdapters.java. Capable of adding adapter sequence to a fastq file, and grading the trimmed file for correctness.
+Removed some debug code from FileFormat causing a crash on "stdin" with no extension. Noted by Matt Nolan.
+Added BBWrap and bbwrap.sh. Wraps BBMap to allow multiple input/output files without reloading the reference.
+Added support for breaking long fastq reads into shorter reads (maxlength and minlength flags). Requested by James Han.
+Added Pileup support for residual bins smaller than binsize. Flag "ksb", "keepshortbins". Requested by Kurt LaButti.
+Fixed support for breaking long reads; was failing on the last read in the set. Noted by James Han.
+Improved accuracy slightly by better detecting when padding is needed.
+Improved verbose output from MSA.
+Created TranslateSixFrames, first step toward amino acid mapping.
+Improved RandomReads ability to simulate PacBio error profile.
+Fixed crash when using BBSplit in PacBio mode. (Noted by Esther Singer)
+May have improved ability to read relatively-pathed files if "." is not in $PATH. (nope, seems not)
+Fixed crash when using "usequality=f" flag with fasta input reads. (Noted by Esther Singer)
+Corrected behaviour of minlength with regards to trimming; it was not always working correctly.
+Added "bhist" (base composition histogram) flag.
+
+v30.
+Disabled compression/decompression subprocesses when total system threads allowed is less than 3.
+Fixed assertion error in calcCorrectness in which SiteScores are not necessarily sorted if AMBIGUOUS_RANDOM=true. Noted by Brian Foster.
+Fixed bug in toLocalAlignment with respect to considering XY as insertions, not subs.
+TODO: XY should be standardized as substitutions.
+Added scarf input support. Requested by Alex Copeland.
+TODO: Allow sam input with interleaved flag.
+TODO: Make pigz a module dependency or script load.
+Fixed bug with nodisk mode dropping the name of the first scaffold of every 500MB chunk after the first. Noted by Vasanth Singan.
+Overhaul of I/O channel creation. Sequence files are now initialized with a FileFormat object which contains information about the format, permission to overwrite, etc.
+Increased limit of number of index threads in Windows in nodisk mode (since disk fragmentation is no longer relevant).
+Renamed Read.list to sites; added Read.topSite() and Read.numSites(); replaced many instances of things like "r.sites!=null && !r.sites.isEmpty()"
+Refactored to put Read and all read-streaming I/O classes in 'stream' package.
+Moved kmer hashing and indexing classes to kmer package.
+Moved Variation, subclasses, and related classes to var package.
+Moved FastaToChrom and ChromToFasta to dna package.
+Moved pacbio error correction classes to pacbio package.
+Removed stack, stats, primes, and other packages; prefixed all unused packages with z_.
+TODO: Sites failing Data.isSingleScaffold() test should be clipped, not discarded.
+RandomReads3 no longer adds /1 and /2 to paired fastq read names by default (can be enabled with 'addpairnum' flag).
+Added "inserttag" flag; adds the insert size to sam output.
+Fixed insert size histogram anomaly. There was a blip at insert==(read1.length+read2.length) because the algorithm used to calculate insert size was different for reads that overlap and reads that don't overlap.
+Skimmer now defaults to cigar=true.
+Added maxindel1 and maxindel2 (or maxindelsum) flags.
+Removed OUTER_DIST_MULT2 because it caused assertion errors when different from OUTER_DIST_MULT; changed OUTER_DIST_MULT from 15 to 14.
+Added shellscript for skimmer, bbmapskimmer.sh
+TODO: Document above changes to parameters.
+
+
+
+v29.
+New version since major refactoring.
+Added FRACTION_GENOME_TO_EXCLUDE flag (fgte). Setting this lower increases sensitivity at expense of speed. Range is 0-1 and default is around 0.03.
+Added setFractionGenometoExclude() to Skimmer index.
+LMP libraries were not being paired correctly. Now "rcs=f" may be used to ignore orientation when pairing. Noted by Kurt LaButti.
+Allocating memory to alignment score matrices caused uncaught out-of-memory error on low-memory machines, resulting in a hang. This is now caught and results in an exit. Noted by Alicia Clum.
+GPINT machines are now detected and restricted to 4 threads max. This helps prevent out-of-memory errors with PacBio mode.
+Fixed sam output bug in which an unmapped read would get pnext of 0 rather than 1 when its mate mapped off the beginning of a scaffold. Noted by Rob Egan.
+Added memory test prior to allocating mapping threads. Thread count will be reduced if there is not enough memory. This is to address the issue noted by James Han, in which the PacBio versions would crash after running out of memory on low-memory nodes.
+TODO: Detect and prevent low-memory crashes while loading the index by aborting.
+Fixed assertion error caused by strictmaxindel mode (noted by James Han).
+Added flag "trd" (trimreaddescriptions) which truncates read names at the first whitespace.
+Added "usequality/uq" flag to turn on/off usage of quality information when mapping. Requested by Rob Egan.
+Added "keepbadkeys/kbk" flag to prevent discarding of keys due to low quality. Requested by Rob Egan.
+Fixed crash with very long reads and very small kmers due to exceeding length of various kmer array buffers.
+Avg Initial Sites and etc no longer printed for read 2 data.
+TODO: Support for selecting long-mate-pair orientation has been requested by Alex C.
+Fixed possible bug in read trimming when the entire read was below the quality threshold.
+Fixed trim mode bug: "trim=both" was only trimming the right side. "qtrim" is also now an alias for "trim".
+Fixed bug in ConcurrentGenericReadInputStream causing an incorrect assertion error for input in paired files and read sampling. Found by Alex Copeland.
+Added insert size histogram: ihist=<file>
+Added "machineout" flag for machine-readable output stats.
+TODO: reads_B1_100000x150bp_0S_0I_0D_0U_0N_interleaved.fq.gz (ecoli) has 0% rescued for read1 and 0.7% rescued for read 2. After swapping r1 and r2, .664% of r2 is rescued and .001% of r1 is rescued. Why are they not symmetric?
+Added 'slow' flag to bbmap for increased accuracy. Still in progress.
+Added MultiStateAligner11ts to MSA minIdToMinRatio().
+Changed the way files are tested for permission to write (moved to Tools).
+Fixed various places in which version string was parsed as an integer.
+Added test for "help" and "version" flags.
+Fixed bug in testing for file existence; noted by Bryce Foster.
+Fixed issue with scaffold names not being trimmed on whitespace boundaries when 'trd=t'. Noted by Rob Egan.
+Added pigz (parallel gzip) support, at suggestion of Rob Egan.
+Improved support for subprocesses and pipethreads; they are now automatically killed when not needed, even if the I/O stream is not finished. This allows gunzip/unpigz when a file is being partially read.
+Added shellscript test for the hostname 'gpint'; in that case, memory will be capped at 4G per process.
+Changed the way cris/ros are shut down. All must now go through ReadWrite.closeStreams()
+TODO: Force rtis and tsw to go through that too.
+TODO: Add "Job.fname" field.
+Made output threads kill processes also.
+Modified TrimRead to require minlength parameter.
+Fixed a bug with gathering statistics in BBMapPacBioSkimmer (found by Matt Scholz).
+Fixed a bug in which reads with match string containing X/Y were not eligible to be semiperfect (Found by Brian Foster).
+Fixed a bug related to improving the prior fix; I had inverted an == operator (Found by Brian Foster).
+Added SiteScore.fixXY(), a fast method to fix reads that go out-of-bounds during alignment. Unfinished; score needs to be altered as a result.
+Added "pairsonly" or "po" flag. Enabling it will treat unpaired reads as unmapped, so they will be sent to 'outu' instead of 'outm'. Suggested by James Han and Alex Copeland.
+Added shellscript support for java -Xmx flag (Suggested by James Han).
+Changed behavior: with 'quickmatch' enabled secondary sites will now get cigar strings (mostly, not all of them).
+"fast" flag now enables quickmatch (50% speedup in e.coli with low-identity reads). Very minor effect on accuracy.
+Fixed bug with overflowing gref due to GREFLIMIT2_CUSHION padding. Found by Alicia Clum.
+Fixed bug in which writing the index would use pigz rather than native gzip, allowing reads from scaffolds.txt.gz before the (buffered) writing finished. Rare race condition. Found by Brian Foster.
+Fixed stdout.fa.gz writing uncompressed via ReadStreamWriter.
+Added "allowSubprocess" flag to all constructors of TextFile and TextStreamWriter, and made TextFile 'tryAllExtensions' flag the last param.
+allowSubprocess currently defaults to true for ByteFiles and ReadInput/Output Streams.
+TODO: TextFile and TextStreamWriter (and maybe others?) may ignore ReadWrite.killProcess().
+TODO: RTextOutputStream3 - make allowSubprocess a parameter
+TODO: Assert that first symbol of reference fasta is '>' to help detect corrupt fastas.
+Improved TextStreamWriter, TextFile, and all ReadStream classes usage of ReadWrite's InputStream/OutputStream creation/destruction methods.
+All InputStream and OutputStream creation/destruction now has an allowSubprocesses flag.
+Added verbose output to all ReadWrite methods.
+Fixed bug in which realigned SiteScores were not given a new perfect/semiperfect status. Noted by Brian Foster and Will Andreopoulos.
+
+
+v28.
+New version because the new I/O system seems to be stable now.
+Re-enabled bam input/output (via samtools subprocess). Lowered shellscript memory from 85% to 84% to provide space for samtools.
+Added "-l" to "#!/bin/bash" at top. This may make it less likely for the environment to be messed up. Thanks to Alex Boyd for the tip.
+Addressed potential bug in start/stop index padding calculation for scaffolds that began or ended with non-ACGT bases.
+Made superclass for Index.
+Made superclass for BBMap.
+Removed around 5000 lines of code as a result of dereplication into superclasses.
+Added MultiStateAligner11ts, which uses arrays for affine transform instead of if blocks. Changing insertions gave a ~5% speedup; subs gave an immeasurably small speedup.
+Found bug in calculation of insert penalties during mapping. Fixing this bug increases speed but decreases accuracy, so it was modified toward a compromise.
+
+
+v27.
+Added command line to sam file header.
+Added "msa=" flag. You can specify which msa to use by entering the classname.
+Added initial banded mode. Specify "bandwidth=X" or "bandwidthratio=X" to accelerate alignment.
+Cleaned up argument parsing a bit.
+Improved nodisk mode; now does not use the disk at all for indexing. BBSplitter still uses the disk.
+Added "fast" flag, which changes some parameters to make mapping go faster, with slightly lower sensitivity.
+Improved error handling; corrupt input files should be more likely to crash with an error message and less likely to hang. Noted by Alex Copeland.
+Improved SAM input, particularly coordinates and cigar-string parsing; this should now be correct but requires an indexed reference. Of course this information is irrelevant for mapping so this parsing is turned off by default for bbmap.
+Increased maximum read speed with ByteFile2, by using 2 threads per file. May be useful in input-speed limited scenarios, as when reading compressed input on a node with many cores. Also accelerates sam input.
+TODO: Consider moving THREADS to Shared.
+Updated match/cigar flag syntax.
+Updated shellscript documentation.
+Changed ByteFile2 from array lists to arrays; should reduce overhead.
+TODO: Increase speed of sam input.
+TODO: Increase speed of output, for all formats.
+TODO: Finish ReadStreamWriter.addStringList(), which allows formatting to be done in the host.
+In progress: Moving all MapThread fields to abstract class.
+MapThread now passes reverse-complemented bases to functions to prevent replication of this array.
+Fixed very rare bug when a non-semiperfect site becomes semiperfect after realignment, but subsequently is no longer highest-ranked.
+strictmaxindel can now be assigned a number (e.g. strictmaxindel=5).
+If a fasta read is broken into pieces, now all pieces will receive the _# suffix in their name. Previously, the first piece was exempt.
+TODO: Consider changing SamLine.rname to a String and seq, qual to byte[].
+Changed SamLine.seq, qual to byte[]. Now stored in original read order and only reversed for minus strand during I/O.
+Added sortscaffolds flag (requested by Vasanth Singan).
+Fixed XS tag bug; in some cases read 2 was getting opposite flag (noted by Vasanth Singan).
+Fixed bug when reading sam files without qualities (noted by Brian Foster).
+Fixed bug where absent cigar strings were printed as "null" instead of "*" as a result of recent changes to sam I/O (noted by Vasanth Singan).
+Found error when a read goes off the beginning of a block. Ref padding seems to be absent, because Ns were replaced by random sequence. Cause is unknown; cannot replicate.
+Fixed Block.getHitList(int, int).
+Changed calcAffineScore() to require base array for information when throwing exceptions.
+Changed generated bamscript to unload samtools module before loading samtools/0.1.19.
+sam file idflag and stopflag are both now faster, particularly for perfect mappings. But both default to off because they are still slow nonetheless.
+Fixed bug in BBIndex in which a site was considered perfect because all bases matched the reference, but some of the bases were N. Canonically, reads with Ns can never be perfect even if the ref has Ns in the same locations.
+Fixed above bug again because it was not fully fixed: CHECKSITES was allowing a read to be classified as perfect even if it contained an N.
+Increased sam read speed by ~2x; 30MB/s to 66MB/s
+Increased sam write speed from ~18MB/s to ~32MB/s on my 4-core computer (during mapping), with mapping at peak 42MB/s with out=null. Standalone (no mapping) sam output seems to run at 51MB/s but it's hard to tell.
+Increased fasta write from 118MB/s to 140 MB/s
+Increased fastq write from 70MB/s to 100MB/s
+Increased fastq read from 120MB/s (I think) to 296MB/s (663 megabytes/sec!) with 2 threads or 166MB/s with 1 thread
+Some of these speed increases come from writing byte[] into char[] buffer held in a ThreadLocal, instead of turning them into Strings or appending them byte-by-byte.
+All of these speed optimizations caused a few I/O bugs that temporarily affected some users between Oct 1 and Oct 4, 2013. Sorry!
+Flipped XS tag from + to - or vice versa. I seem to have misinterpreted the Cufflinks documentation (noted by Vasanth Singan).
+Fixed bug in which (as a result of speed optimizations) reads outside scaffold boundaries, in sam 1.3 format, were not getting clipped (Noted by Brian Foster).
+Changed default behavior of all shellscripts to run with -Xmx4g if maximum memory cannot be detected (typically, because ulimit=infinity). Was 31. Unfortunately things will break either way.
+Fixed off-by-1 error in sam TLEN calculation; also simplified it to give sign based on leftmost POS and always give a plus and minus even when POS is equal.
+Added sam NH tag (when ambig=all).
+Disabled sam XM tag because the bowtie documentation and output do not make any sense.
+Changed sam MD and NM tags to account for 'N' symbol in cigar strings.
+Made sam SM tag score compatible with mapping score.
+Fixed bug in SamLine when cigar=f (null pointer when parsing match string). (Found by Vasanth Singan)
+Fixed bug in BBMapThread* when local=true and ambiguous=toss (null pointer to read.list). (Found by Alexander Spunde)
+Changed synthetic read naming and parsing (parsecustom flag) to use " /1" and " /2" at the end of paired read names. (Requested by Kurt LaButti)
+Increased fastq write to 200MB/s (590 megabytes/s)
+Increased fasta write to 212MB/s (624 megabytes/s measured by fastq input)
+Increased sam write to 167MB/s (492 megabytes/s measured by fastq input)
+Increased bread write to 196MB/s (579 megabytes/s measured by fastq input)
+bf2 (multithreaded input) is now enabled by default on systems with >4 cores, or in ReformatReads always.
+Fixed RTextOutputStream3.finishedSuccessfully() returning false when output was in 2 files.
+Changed output streams to unbuffered. No notable speed increase.
+Fixed bug in ByteFile2 in which reads would be recycled when end of file was hit (found by Brian Foster, Bryce Foster, and Kecia Duffy).
+
+
+v26.
+Fixed crash from consecutive newlines in ByteFile.
+Made SiteScore clonable/copyable.
+Removed @RG line from headers. It implies that reads should be annotated with additional fields based on the RG line information.
+Changed sam flags (at advice of Joel Martin). Now single-ended reads will never have flags 0x2, 0x40, or 0x80 set.
+Added correct insert size average to output stats, in place of old inner distance and mapping length.
+Fixed crash when detecting length of SamLines with no cigar string. (Found by Shayna Stein)
+Added flag "keepnames" which keeps the read names unchanged when writing in sam format. Normally, a trailing "/1", "/2", " 1", or " 2" are stripped off, and if read 2's name differs from read 1's name, read 1's name is used for both. This is to remain spec-compliant with the sam format. However, in some cases (such as grading synthetic reads tagged with the correct mapping location) it is useful to retain the original name of each read.
+Added local alignment option, "local". Translates global alignments into local alignments using the same affine transform (and soft-clips ends).
+Changed killbadpairs default to false. Now by default improperly paired reads are allowed.
+Merged TranslateColorspaceRead versions into a single class.
+Added interleaved input and output for bread format. May be useful for error correction pipeline.
+TODO: Mode where reads are mapped to multiple scaffolds, but are mapped at most one time per scaffold. I.e., remove all but top site per scaffold (and forbid self-mapping).
+Fixed yet another instance of negative coordinates appearing in an unmapped read, which the new version of samtools can't handle.
+Fixed bug in counting ambiguous reads; was improperly including in statistics reads that were ambiguous but had a score lower than minratio.
+Fixed rare crash found related to realignment of reads with ambiguous mappings (found by Rob Egan).
+Unified many of the differences between the MapThread variants, and added a new self-checking function (checkTopSite) to ensure a Read is self-consistent.
+Added some bitflag fetch functions to SamLine and fixed 'pairedOnSameChrom()' which was not handling the '=' symbol.
+TODO: Make GENERATE_BASE_SCORES_FROM_QUALITY a parameter, default false in BBMapPacBio and true elsewhere. (I verified this should work fine)
+TODO: Make GENERATE_KEY_SCORES_FROM_QUALITY a parameter, default true (probably even in BBMapPacBio). (I verified this should work fine)
+Updated LongM (merged with LongM from Dedupe).
+Fixed bug in SamLine in which clipped leading indels were not considered, causing potential negative coordinates. (Found by Brian Foster)
+TODO: Match strings like NNNNNNDDDDDNNNNNmmmmmmmmmmmmmmmmm...mmmmmmm should never exist in the first place. Why did that happen?
+Added "strictmaxindel" flag (default: strictmaxindel=f). Attempts to kill mappings in which there is a single indel event longer than the "maxindel" setting. Requested by James Han.
+TODO: Ensure strictmaxindel works in all situations, including rescued paired ends and recursively regenerated padded match strings.
+TODO: Redo msa to be strictly subtractive. Start with score=100*bases, then use e.g. 0 for match, -1 for del, -370 for sub, -100 for N, etc. No need for negative values.
+Changed TIMEBITS in MultiStateAligner9PacBio from 10 to 9 to address a score underflow assertion error found by Alicia Clum. The underflow occurred around length 5240; new limit should be around 10480.
+TODO: Alicia found an error of exceeding gref bounds.
+Fixed race condition in TextStreamWriter.
+Improved functionality of splitter. Now you can index once and map subsequently using "basename" without specifying "ref=" every single time.
+"Reads Used" in output now displays the number of reads used. Before, for paired reads, it would display the number of pairs (half as many).
+Added bases used to reads used at Kurt's request.
+Improved bam script generation. Now correctly sets samtools memory based on detected memory, and warns user that crashes may be memory-related.
+Fixed an obsolete assertion in SamLine found by Alicia.
+Added XS tag option ("xstag=t") for Cufflinks; the need for this was noted by Vasanth Singan.
+Added 'N' cigar operation for deletions longer than X bases (intronlen=X). Also needed by Cufflinks.
+Secondary alignments now get "*" for bases and qualities, as recommended by the SAM spec. This saves space, but may cause problems when converting sam into other formats.
+Fixed bug that caused interleaved=true to override in2. Now if you set in and in2, interleaved input will be disabled. (noted by Andrew Tritt).
+Fixed some low-level bugs in I/O streams. When shutting down streams I was waiting until !Thread.isAlive() rather than Thread.getState()==Thread.State.TERMINATED, which caused a race condition (since a thread is not alive before it starts execution).
+Added debugging file with random name written to /ref/ directory. This should help debugging if somewhere deep in a pipeline multiple processes try to index at the same location simultaneously. Suggested by Bryce Foster.
+Fixed log file generation causing a crash if the /ref/ directory did not exist, found by Vasanth Singan. Also logging is now disabled by default but enabled if you set "log=t".
+Input sequence data will now translate '.' and '-' to 'N' automatically, as some fasta databases appear to use '.' instead of 'N'. (Thanks to Kecia Duffy and James Han)
+Added capability to convert lowercase reads to upper case (crash on lowercase noted by Vasanth Singan).
+
+
+v25.
+Increased BBMapPacBio max read length to 6000, and BBMapPacBioSkimmer to 4000.
+Fixed bugs in padding calculations during match string generation.
+Improved some assertion error output.
+Added flag "maxsites" for max alignments to print.
+Added match field to sitescore.
+Made untrim() affect sitescores as well.
+Decreased read array buffer from 500 to 20 in MapPacBio.
+TODO: stitcher for super long reads.
+TODO: wrapper for split reference mapping and merging.
+Improved fillAndScoreLimited to return additional information.
+Added flag "secondary" to print secondary alignments. Does not yet ensure that all secondary alignments will get cigar strings, but most do.
+Added flag "quickmatch" to generate match strings for SiteScores during slow align. Speeds up the overall process somewhat (at least on my PC; have not tested it on cluster).
+Improved pruning during slow align by dynamically increasing msa limit.
+Addressed a bug in which reads sometimes have additional sites aligned to the same coordinates as the primary site. The bug can still occur (typically during match generation or as a result of padding), but is detected and corrected during runtime.
+Tracked down and fixed a bug relating to negative coordinates in sam output for unmapped reads paired with reads mapped off the beginning of a scaffold, with help from Rob Egan.
+Disabled frowny-face warning message which had caused some confusion.
+TODO: Add verification of match strings on site scores.
+Made superclass for MSA. This will allow merging of redundant code over the various BBMap versions.
+Fixed a crash-hang out-of-memory error caused by initialization order. Now crashes cleanly and terminates. Found by James Han.
+Fixed bug in output related to detecting cigar string length under sam 1.4 specification (found by Rob Egan).
+Added flag "killbadpairs"/"kbp".
+Added flag "fakequality" for fasta.
+Permanently fixed bugs related to unexpected short match strings caused by error messages.
+Increased speed of dynamic program phase when dealing with lots of Ns.
+TODO: In-line generation of short match string when printing a read, rather than mutating the read. (mutation is now temporary)
+Added flag, "stoptag". Allows generation of SAM tag YS:i:<read stop location>
+Added flag, "idtag". Allows generation of SAM tag YI:f:<percent identity>
+
+v24.
+Fixed bug that slightly reduced accuracy for reads with exactly 1 mismatch. They were always skipping slow align, sometimes preventing ambiguous reads from being detected.
+Increased speed of MakeRocCurve (for automatic grading of sam files from synthetic reads). Had used 1 pass per quality level; now it uses only 1 pass total.
+Increased accuracy of processing reads and contigs with ambiguous bases (in mapping phase).
+Adjusted clearzones to use gradient functions and asymptotes rather than step functions. Reduces false positives and increases true positives, especially near the old step cutoffs.
+Fixed trimSitesBelowCutoff assertion that failed for paired reads.
+Added single scaffold toggle to RandomReads. (Default 'singlescaffold=true'; forces reads to come from a single scaffold). This can cause non-termination if no scaffolds are long enough, and may bias against shorter scaffolds.
+Added min scaffold overlap to RandomReads. Default 'overlap=1'; forces reads to overlap a scaffold at least this much. This can cause non-termination if no scaffolds are long enough, and may bias against shorter scaffolds.
+Fixed setPerfect(). Previously, reads with 'N' overlapping 'N' in the reference could be considered perfect matches, but no reads containing 'N' should ever be considered a perfect mapping to anything.
+Formalized definition of semiperfect to require read having no ambiguous bases, and fixed "isSemiperfect()" function accordingly.
+Shortened and clarified executable names.
+Fixed soft-clipped read start position calculation (mainly relevant to grading).
+Prevented reads from being double-counted when grading, when a program gives multiple primary alignments for a read.
+Fixed a bug in splitter initialization.
+Added "ambiguous2". Reads that map to multiple references can now be written to distinct files (prefixed by "AMBIGUOUS_") or thrown away, independently of whether they are ambiguous in the normal sense (which includes ambiguous within a single reference).
+Added statistics tracking per reference and per scaffold. Enable with "scafstats=<file>" or "refstats=<file>".
+"ambiguous" may now be shortened to "ambig" on the command line.
+"true" and "false" may now be shortened to t, 1, or f, 0. If omitted entirely, "true" is assumed; e.g. "overwrite" is equivalent to "overwrite=true".
+Added stderr as a valid output destination specified from the command line.
+BBSplitter now has a flag, "mapmode"; can be set to normal, accurate, pacbio, or pacbioskimmer.
+Fixed issue where stuff was being written to stdout instead of stderr and ended up in SAM files (found by Brian Foster).
+TODO: Add secondary alignments.
+TODO: Unlimited length reads.
+TODO: Protein mapping.
+TODO: Soft clipping in both bbmap and GradeSamFile. Should universally adjust coords by soft-clip amount when reported in SAM format.
+Fixed assertion error concerning reads containing Ns marked as perfect, when aligned to reference Ns (found by Rob Egan).
+Fixed potential null-pointer error in "showprogress" flag.
+
+v23.
+Created BBSplitter wrapper for BBMap that allows merging any number of references together and splitting the output into different streams.
+Added support for ambiguous=random with paired reads (before it was limited to unpaired).
+TODO: Iterative anchored alignment for very long reads, with a full master gref.
+TODO: untrim=c/m/s/n/r
+TODO: mode=vfast/veryfast: k=14 minratio=0.8 minhits=2 maxindel=20
+TODO: mode=slow/accurate: BBMapi
+TODO: mode=pacbio: BBMapPacBio k=12
+TODO: mode=rnaseq
+TODO: Put untrim in caclStatistics section
+TODO: Test with MEGAN.
+Finished new random read generator. Much faster, and solves coordinate problem with multiple indels.
+Improved error message on read parsing failures.
+TODO: Insert size histogram
+TODO: "outp=", output for reads that mapped paired
+TODO: "outs=", output for reads that mapped singly
+Corrected assertion in "isSingleScaffold()"
+Fixed a rare bug preventing recursive realignment when ambiguous=random (found by Brian Foster)
+Added samversion/samv flag. Set to 1.3 for cigar strings with 'M' or 1.4 for cigar strings with '=' and 'X'. Default is 1.3.
+Added enforcement of thread limit when indexing.
+Added internal autodetection of gpint machines. Set default threadcount for gpints at 2.
+Improved ability to map with maxindel=0
+Added XM:i:<N> optional SAM flag because some programs seem to demand it. Like all extra flags, this is omitted if the read is not mapped. Otherwise, it is set to 1 for unambiguously mapped reads, and 2 or more for ambiguously mapped reads. The number can range as high as the total number of equal-scoring sites, but this is not guaranteed unless the "ambiguous=random" flag is used.
+Fixed bug in autodetection of paired ends, found by Rob Egan.
+
+
+
+v22.
+Added match histogram support.
+Added quality histogram support.
+Added interleaving support to random read generator.
+Added ability to disable pair rescue ("rescue=false" flag), which can speed things up in some cases.
+Disabled dynamic-programming slow alignment phase when no indels are allowed.
+Accelerated rescue in perfect and semiperfect mode.
+Vastly accelerated paired mapping against references with a very low expected mapping rate.
+Fixed crash in rescue caused by reads without quality strings (e.g. paired fasta files). (found by Brian Foster)
+
+
+v21.
+If reference specified is same as already-processed reference, the old index will not be deleted.
+Added BBMap memory usage estimator to assembly statistics tool: java -Xmx120m jgi.AssemblyStats2 <fasta file> k=<kmer size for BBMap>
+Added support for multiple output read streams: all reads (set by out=), mapped reads (set by outm=), and unmapped reads (set by outu=). They can be in different formats and any combination can be used at once. You can set pair output to secondary files with out2, outm2, and outu2.
+Changed definition of "out=". You can no longer specify split output streams implicitly by using a "#" in the filename; it must be explicit. The "#" wildcard is still allowed for input streams.
+Fixed a bug with sam input not working. (found by Brian Foster)
+Added additional interleaved autodetection pattern for reads named "xxxxx 1:xxxx" and "xxxxx 2:xxxx"
+Fixed a bug with soft-clipped deletions causing an incorrect cigar length. (found by Brian Foster)
+Fixed a bug with parsing of negative numbers in byte arrays.
+TODO: Found a new situation in which poly-N reads preferentially map to poly-N reference (probably tip search?)
+Fixed a bug in which paired reads occasionally are incorrectly considered non-semiperfect. (found by Brian Foster)
+Added more assertion tests for perfection/imperfection status.
+Added blacklist support. This allows selection of output stream based on the name of the scaffold to which a read maps.
+Created Blacklist class, allowing creation of blacklists and whitelists.
+Added outb (aka outblacklist) and outb2 streams, to output reads that mapped to blacklisted scaffolds.
+Added flag "outputblacklisted=<true/false>" which controls whether blacklisted reads are printed to the "out=" stream. Default is true.
+Added support for streaming references. e.g. "cat ref1.fa ref2.fa | java BBMap ref=stdin.fa"
+Updated and reorganized this readme.
+Removed a dependency on Java 7 libraries (so that the code runs in Java 6).
+Added per-read error rate histogram. Enable with qhist=<filename>
+TODO: generate standard deviation.
+Added per-base-position M/S/D/I/N rate tracking. Enable with mhist=<filename>
+Added quality trimming. Reads may be trimmed prior to mapping, and optionally untrimmed after mapping, so that no data is lost. Trimmed bases are reported as soft-clipped in this case.
+Trimming will extend until at least 2 consecutive bases have a quality greater than trimq (default 5).
+Added flags: trim=<left/right/both/false>, trimq=<5>, untrim=<true/false>
+TODO: Correct insert size in realtime for trim length.
+TODO: Consider adding a TrimRead pointer to reads, rather than using obj.
+TODO: Consider extending match string as 'M' rather than 'C' as long as clipped bases match.
+Found and made safe some instances where reads could be trimmed to less than kmer length.
+Found and fixed instance where rescue was attempted for length-zero reads.
+Fixed an instance where perfect reads were not marked perfect (while making match string).
+
+
+v20.1 (not differentiated from v20 since the differences are minor)
+Fixed a minor, longstanding bug that prevented minus-strand alignment of reads that only had a single valid key (due to low complexity or low quality).
+Increased accuracy of perfectmode and semiperfectmode, by allowing mapping of reads with only one valid key, without loss of speed. They still don't quite match normal mode since they use fewer keys.
+Added detection of and error messages for reads that are too long to map.
+Improved shell script usage information.
+
+
+v20.
+Made all MapThreads subclasses of MapThread, eliminating duplicate code.
+Any exception thrown by a MapThread will now be detected, allowing the process to complete normally without hanging.
+Exceptions (e.g. OutOfMemory) when loading reference genome are now detected, typically causing a crash exit instead of a hang.
+Exceptions (e.g. OutOfMemory) when generating index are now detected, causing a crash exit instead of a hang.
+Exceptions in output stream (RTextOutputStream) subthreads are now detected, throwing an exception.
+Added support for soft clipping. All reads that go off the ends of scaffolds will be soft-clipped when output to SAM format. (The necessity of this was noted by Rob Egan, as negative scaffold indices can cause software such as samtools to crash)
+
+
+v19.
+Added support for leading FASTA comments (denoted by semicolon).
+Fixed potential problem in FASTA read input stream with very long reads.
+Recognizes additional FASTA file extensions: .seq, .fna, .ffn, .frn, .fsa, .fas
+Disabled gzip subprocesses to circumvent a bug in UGE: Forking can cause a program to be terminated. Gzip is still supported.
+Slightly reduced memory allocation in shellscript.
+Ported "Analyze Index" improvement over to all versions (except v5).
+Added flags: fastaminread, showprogress
+Fixed problem noted by Rob Egan in which paired-end reads containing mostly 'N' could be rescued by aligning to the poly-N section off the end of a contig.
+Fixed: Synthetic read headers were being improperly parsed by new FASTQ input stream.
+Made a new, faster, more correct version of "isSemiperfect".
+Added "semiperfect" test for reads changed during findDeletions.
+Identified locations in "scoreNoIndels" where call 'N' == ref 'N' is considered a match. Does not seem to cause problems.
+Noted that SAM flag 0x40 and 0x80 definitions differ from my usage.
+
+
+v18.
+Fastq read input speed doubled.
+Fasta read input speed increased 50%.
+Increased speed of "Analyze Index" by a factor of 3+ (just for BBMap so far; have not yet ported change over to other versions).
+Fixed an array out-of-bounds bug found by Alicia Clum.
+Added bam output option (relies on Samtools being installed).
+Allows gzip subprocesses, which can sometimes improve gzipping and gunzipping speed over Java's implementation (will be used automatically if gzip is installed). This can be disabled with the flags "usegzip=false" and "usegunzip=false".
+Started a 32-bit mode which allows 4GB per block instead of 2GB, for a slight memory savings (not finished yet).
+Added nondeterministic random read sampling option.
+Added flags: minscaf, startpad, stoppad, samplerate, sampleseed, kfilter, usegzip, usegunzip
+
+
+v17.
+Changed the way error rate statistics are displayed. All now use match string length as denominator.
+Identified error in random read generator regarding multiple insertions. It will be hard to fix but does not matter much.
+Found out-of-bounds error when filling gref. Fixed (but maybe not everywhere...).
+Added random mapping for ambiguous reads.
+Changed index from 2d array to single array (saves a lot of memory).
+Increased speed by ~10%.
+Improved index generation and loading speed (typically more than doubled).
+Changed chrom format to gzipped.
+Added "nodisk" flag; index is not written to disk.
+Fixed a rare out-of-bounds error.
+Increased speed of perfect read mapping.
+Fixed rare human PAR bug.
+
+
+v16. Changes since last version:
+Supports unlimited number of unscaffolded contigs.
+Supports piping in and out. Set "out=stdout.sam" and "in=stdin.fq" to pipe in a fastq file and pipe out a sam file (other extensions are also supported).
+Ambiguously named files (without proper extensions) will be autodetected as fasta or fastq (though I suggest not relying on that).
+Added additional flags (described in parameters section): minapproxhits, padding, tipsearch, maxindel.
+minapproxhits has a huge impact on speed. Going from 1 to 2 will typically at least double the speed (on a large genome) at some cost to accuracy.
+
+
+v15. Changes since last version:
+Contig names are retained for output.
+SAM header @SQ tags fixed.
+SAM header @PG tag added.
+An out-of-bounds error was fixed.
+An error related to short match strings was found and possibly handled.
+All versions now give full statistics related to %matches, %substitutions, %deletions, and %insertions (unless match string generation is disabled).
+Increased speed and accuracy for tiny (<20MB) genomes.
+Added dynamic detection of scaffold sizes to better partition index, reducing memory in some cases.
+Added command-line specification of kmer length.
+Added more command line flags and described them in this readme.
+Allowed overwriting of existing indices, for ease of use (only when overwrite=true). For efficiency you should still only specify "ref=" the first time you map to a particular reference, and just specify the build number subsequently.
diff --git a/docs/compiling.txt b/docs/compiling.txt
new file mode 100755
index 0000000..cbf0703
--- /dev/null
+++ b/docs/compiling.txt
@@ -0,0 +1,9 @@
+To install BBTools, unzip it to a directory and it will run as long as Java is installed. It is also possible to increase performance with some additional steps.
+
+BBTools has 4 components:
+1) Java code. This is the bulk of the code and all that is strictly necessary. It is already compiled for Java 6 and does not need recompiling. It can run under Java 7 or higher, and most components (aside from BBNorm) can also run in Java 6. No components will work with Java versions below 6.
+2) Bash shellscripts. These are present to make it easier to invoke the Java code (by automatically detecting and setting memory limits, for example). If you are not using Bash and Linux, then the shellscripts probably won't work, but you can still invoke the Java code from the command line. Shellscripts are interpreted and do not require compiling.
+3) C code. This is currently under development by Jonathan Rood to accelerate certain programs (currently BBMap, BBMerge, and Dedupe). It can be enabled with the "usejni" flag, but requires platform-specific compilation first.
+4) MPI code. This is also under development by Jonathan Rood and support requires a recent version of OpenMPI. It's currently experimental. If you want to recompile BBTools, you will need the "mpi.jar" file that is present in systems with OpenMPI installed.
+
+To use the accelerated versions of BBMap, BBMerge, or Dedupe, the C code must first be compiled for your specific platform. The instructions are in "/jni/README.txt"
diff --git a/docs/guides/A_SampleGuide.txt b/docs/guides/A_SampleGuide.txt
new file mode 100755
index 0000000..af5d906
--- /dev/null
+++ b/docs/guides/A_SampleGuide.txt
@@ -0,0 +1,18 @@
+A_Sample Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+A_Sample and A_SampleMT are sample BBTools that do some kind of manipulation on reads. They are designed for rapid creation of custom high-performance tools, following BBTools best practices and taking advantage of its existing infrastructure such as read input streams. A_Sample is singlethreaded, and A_SampleMT is multithreaded. Neither currently does anything to the reads (they act as a "null transform"); but there is an empty function, "processReadPair", into which new code can be [...]
+
+
+*Notes*
+
+
+Java Code:
+
+These programs are not for end-users, only developers. The general idea is that you make a copy of A_Sample or A_SampleMT, rename it, and change the processReadPair function. Usually, you will need to add any additional fields at the bottom and additional parsing terms at the top. Be sure that a new parse keyword you add does not conflict with one already in the Parser class.
+
+
+Shellscript:
+
+After the java code is done, copy a_sample_mt.sh and rename that, too. Then modify the shellscript's usage function at the top, the execution function at the bottom (where the java command is called, e.g. "java $EA $z -cp $CP jgi.A_SampleMT $@"), and the default amounts of memory (default of variable "z" and the line with "freeRam").
diff --git a/docs/guides/AddAdaptersGuide.txt b/docs/guides/AddAdaptersGuide.txt
new file mode 100755
index 0000000..ba9ca9e
--- /dev/null
+++ b/docs/guides/AddAdaptersGuide.txt
@@ -0,0 +1,23 @@
+AddAdapters Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+AddAdapters is designed for grading the performance of adapter-trimming tools. It can add adapters to reads, and annotate the reads with their correct post-trimming length; and it can be run on trimmed reads, to calculate the rates of correct and incorrect trimming. However, it does not understand insert size, so for adding adapters to paired reads, it's better to use RandomReads.
+
+
+*Usage Examples*
+
+
+To add adapters to reads:
+addadapters.sh in=a.fq out=b.fq adapters=adapters.fa
+
+
+To grade trimmed reads:
+addadapters.sh in=trimmed.fq grade
+
+
+To use RandomReads instead, to add adapters in the correct location according to insert size:
+randomreads.sh ref=ref.fa out=reads.fq len=150 paired reads=100k mininsert=50 maxinsert=350 fragadapter1=ACTG fragadapter2=ACTG
+rename.sh in=reads.fq out=renamed.fq renamebytrim interleaved
+
+The result of this will still be named correctly for grading by addadapters. "ACTG" would normally be a much longer adapter sequence.
diff --git a/docs/guides/BBDukGuide.txt b/docs/guides/BBDukGuide.txt
new file mode 100755
index 0000000..8e25eeb
--- /dev/null
+++ b/docs/guides/BBDukGuide.txt
@@ -0,0 +1,178 @@
+BBDuk Guide
+Written by Brian Bushnell
+Last updated December 14, 2015
+
+"Duk" stands for Decontamination Using Kmers. BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single high-performance tool. It is capable of quality-trimming and filtering, adapter-trimming, contaminant-filtering via kmer matching, sequence masking, GC-filtering, length filtering, entropy-filtering, format conversion, histogram generation, subsampling, quality-score recalibration, kmer cardinality estimation, and various [...]
+
+BBDuk's parameters are described in its shellscript (bbduk.sh). This file provides usage examples of various common tasks.
+
+
+*Notes*
+
+
+Paired reads:
+
+Paired reads interleaved in a single file will be autodetected based on their names; this can be overridden with the "interleaved" flag. The commands in this document assume either single-ended reads or paired reads in a single file. BBDuk also supports input or output of paired reads in dual files using the in1, in2, out1, and out2 flags, for example:
+
+bbduk.sh in1=read1.fq in2=read2.fq out1=clean1.fq out2=clean2.fq
+
+When dealing with paired reads in 2 files they should always be processed together, not one at a time. Pairs are always kept together - either both reads are kept, or both are discarded.
+
+
+Kmer-processing modes:
+
+If a reference is specified, BBDuk will operate on kmers in one of 4 modes: right-trimming, left-trimming, masking, or filtering. The default is filtering - any read matching a reference kmer will be discarded. In order to kmer-trim, the flag "ktrim=r" or "ktrim=l" must be used. In ktrim=r mode, once a reference kmer is matched in a read, that kmer and all the bases to the right will be trimmed, leaving only the bases to the left; this is the normal mode for adapter trimming. For ktr [...]
+
+
+Memory:
+
+BBDuk's shellscript will try to autodetect the available memory and use about half of it. You can override this with the -Xmx flag, e.g. "bbduk.sh -Xmx1g in=reads.fq". That command will force it to use 1 GB. Most operations such as adapter-trimming and quality-trimming need only a tiny amount of memory. Only processing large references, or using a high value of "hdist" or "edist", actually need a lot of memory. The only factor determining how much memory BBDuk needs is the numb [...]
+
+
+Kmers, hdist, qhdist, and edist:
+
+A 4.6Mbp genome like E.coli contains around 4.6 million unique kmers. If a hamming distance is used, such as hdist=1, then the number of kmers stored will be multiplied by 1+(3*k)^hdist. So, for E.coli with K=31 and hdist=0, there are 4554207 kmers stored, using around 140MB, taking about 0.5 seconds; with hdist=1, there are 427998710 kmers stored (94 times as many), using 15GB, and taking 104 seconds. Edit distance creates even more kmers, at 1+(8*k)^edist. BBDuk needs around 20 byt [...]
+
+
+Mink:
+
+When you set K=25, BBDuk will store all 25-mers in the reference, and try to match them against 25-mers in the queries (reads). However, for adapter-trimming, this may not be desirable - for example, if the last 12bp of a read are adapter sequence, it will not match a reference 25-mer, because it is too short. "mink=8" will additionally look for shorter kmers with lengths 24 to 8 (in this case). For the query, these kmers will only come from the end of the read - for example, with "kt [...]
+
+
+Hdist2, edist2, and qhdist2
+
+Because they are shorter, the appropriate hamming distance may differ for short kmers generated with the mink flag. For primary kmers, K=25 has a very high specificity; a given 25-mer has a 1/1125899906842624 chance of matching random sequence (meaning you will never see this happen). Even with hdist=3, the chance is increased to ~1/18130433280 (still very low). However, k=8 only has a 1/65536 chance at hdist=0, becoming a 1/36 chance at hdist=3, which would yield too many false [...]
+If hdist is set and hdist2 is not set, hdist will control the hamming distance for all kmers, short and long. If both are set, then hdist will control the hamming distance for full-length kmers, and hdist2 will control it for short kmers. The same applies to edist/edist2 and qhdist/qhdist2.
+
+
+Kmer length:
+
+BBDuk supports kmers of length 1-31. The longer a kmer, the higher the specificity; except in certain highly-conserved areas (such as ribosomes), or in very low-complexity areas (like ATATATAT...), it is very unlikely for two unrelated organisms to share a 31-mer. Note that it is impossible to do kmer matching for reference sequences that are shorter than K. When kmer filtering, you can specify a kmer length greater than 31 (such as k=40) for even higher specificity. For k=40, BBDuk wi [...]
+
+
+Kmer skipping:
+
+To reduce memory or increase speed, it is possible to ignore some reference or query kmers. This can be done with the rskip, qskip, or speed flags (do not use more than one at a time). rskip=4, for example, will only store every 4th reference kmer, saving memory. qskip=4 will only look up every 4th query kmer, increasing speed. The speed flag is a bit different. speed=X, where X runs from 0 to 15, will ignore X/16 of the kmers in both the read and reference, and the same ones will b [...]
+
+
+Number of kmer hits:
+
+By default, a read is considered to match the reference if they share a single kmer. This can be changed with the flag "mkh" (minkmerhits), or overridden with "mkf" (minkmerfraction) or "mcf" (mincoveredfraction). For example, "mkh=2" would require 2 kmer hits to consider a read as matching; "mkf=0.5" would require 50% of the kmers to match; and "mcf=0.5" would require 50% of the bases to be covered by reference kmers. "mcf" is probably more stable than "mkf", because a single error c [...]
+
+
+Maskmiddle and rcomp:
+
+By default, BBDuk has maskmiddle and rcomp set to true. Maskmiddle ignores the middle base of a kmer, which increases sensitivity. Rcomp looks for kmers and their reverse-complements, rather than just forward kmers. However, sometimes this confuses people when BBDuk reports that a sequence matches some reference, but using some tool like "Grep", they can't find any shared kmers. This is usually caused by maskmiddle and/or rcomp. So if you only want exact matches, disable maskmiddle ( [...]
+
+
+Seal and kmers in multiple references:
+
+BBDuk associates each kmer with a single number. Specifically, if a kmer appears in the first reference sequence, that kmer is associated with the number 1; kmers from a second reference sequence will be associated with the number 2, etc. If two reference sequences contain the same kmer, the kmer will only be associated with the first reference. Therefore, when reporting statistics about how many reads matched which reference sequence, these statistics may not be correct in situations [...]
+
+
+Threads:
+
+BBDuk operates in 2 phases. Phase 1 is loading the reference kmers (if you specified a reference). This uses a fixed number of threads, currently 7, which cannot be changed (except by editing the "WAYS=7" line in BBDuk's source code and recompiling). With a small reference such as adapter sequences or a bacterial genome, this phase will take under a second. The second phase will autodetect and use all available processors, unless restricted with the "t" flag; for example, t=1 will re [...]
+
+
+Output streams:
+
+BBDuk has 3 standard output streams, "out", "outm", and "outs". None are required; any or all may be used. "out" (aka "outu" or "outunmatched") will get all the reads that pass all filtering criteria. That means that after any trimming operations, reads must be at least as long as minlen; and if kmer-filtering is being performed, the read must not match any reference kmer; if a minimum average quality is specified, the read's average quality must be at least that high; etc. Reads fai [...]
+
+bbduk.sh in=pairs.fq out=pass_pairs.fq outm=fail_pairs.fq outs=pass_singletons.fq ref=contam.fasta
+
+In this case, pairs in which neither read matches the reference will go to pass_pairs.fq. Pairs in which either read or both match the reference will go to fail_pairs.fq. And if one read matches the reference but the other one does not, the read not matching the reference will go to pass_singletons.fq (in addition to fail_pairs.fq).
+
+
+Preprocessing Steps and Order:
+
+To preprocess raw sequence data with BBDuk, I recommend a specific order; please see PreprocessingGuide for details.
+
+
+*Usage Examples*
+
+Adapter trimming:
+bbduk.sh in=reads.fq out=clean.fq ref=adapters.fa ktrim=r k=23 mink=11 hdist=1 tpe tbo
+
+(if your data is very low quality, you may wish to use more sensitive settings of hdist=2 k=21)
+
+...where "ktrim=r" is for right-trimming (3' adapters), and "ktrim=l" is for left-trimming (5' adapters). "k" specifies the kmer size to use (must be at most the length of the adapters) while "mink" allows it to use shorter kmers at the ends of the read (for example, k=11 for the last 11 bases). "hdist" means "hamming distance"; this allows one mismatch. Instead of "ref=file" you can alternately (or additionally) say "literal=ACTGGT,TTTGGTG" for those two literal strings. The BBTools pac [...]
+
+
+Quality trimming:
+bbduk.sh in=reads.fq out=clean.fq qtrim=r trimq=10
+
+This will quality-trim to Q10 using the Phred algorithm, which is more accurate than naive trimming. "qtrim=r" means it will trim the right side only; you can alternatively set "qtrim=l" for left or "qtrim=rl" for both. If quality trimming is enabled, it happens after all kmer-based operations.
+
+
+Force-Trimming:
+bbduk.sh in=reads.fq out=clean.fq ftl=10 ftr=139
+
+This will trim the leftmost 10 bases (ftl=10) and also trim the right end after position 139 (zero-based). The resulting read would be 130bp long. For example, a 150bp read would have the first 10 bases trimmed (bases 0-9, keeping 10+) and the last 10 bases trimmed (bases 140-149, keeping 139 and lower).
+
+
+Force-Trim Modulo:
+bbduk.sh in=reads.fq out=clean.fq ftm=5
+
+This will trim the right end so that the read's length is equal to zero modulo 5 (ftm=5). The reason for this is that with Illumina sequencing, normal runs are usually a multiple of 5 in length (50bp, 75bp, 100bp, etc), but sometimes they are generated with an extra base (51bp, 76bp, 151bp, etc). This last base is very inaccurate and has badly calibrated quality as well, so it's best to trim it before doing anything else. But you don't want to simply always trim the last base, because sometimes the [...]
+
+
+Quality filtering:
+bbduk.sh in=reads.fq out=clean.fq maq=10
+
+This will discard reads with average quality below 10. If quality-trimming is enabled, the average quality will be calculated on the trimmed read.
+
+
+Kmer filtering:
+bbduk.sh in=reads.fq out=unmatched.fq outm=matched.fq ref=phix.fa k=31 hdist=1 stats=stats.txt
+
+This will remove all reads that have a 31-mer match to PhiX (a common Illumina spikein, which is included in /bbmap/resources/), allowing one mismatch. The "outm" stream will catch reads that matched a reference kmer. This allows you to split a set of reads based on the presence of something. "stats" will produce a report of which contaminant sequences were seen, and how many reads had them.
+
+
+Kmer masking:
+bbduk.sh in=ecoli.fa out=ecoli_masked.fa ref=salmonella.fa k=25 ktrim=N
+
+This will mask all 25-mers in E.coli that are also shared by Salmonella, by converting them to N. You can change them to some other letter instead, like X.
+
+
+Entropy filtering:
+bbduk.sh in=reads.fq out=filtered.fq entropy=0.5 entropywindow=50 entropyk=5
+
+This will filter out reads that have an average entropy of under 0.5. This is a measure of read complexity and varies from 0 to 1. A homopolymer such as AAAAAAAAAAAAAA would have entropy of zero; completely random sequence would have entropy approaching 1.
+
+
+Quality recalibration:
+bbduk.sh in=reads.fq out=recalibrated.fq recalibrate sam=mapped.sam
+
+This will recalibrate quality scores to be more accurate, using the mapping information from the sam file provided. If matrix files have already been generated from a mapped sam or bam file of the reads using CalcTrueQuality, then the sam flag should not be used. The sam file must have cigar strings with = and X symbols for match and mismatch, like BBMap produces, rather than M for match or mismatch. The sam file does not need to contain the same reads as the input - it just has to be [...]
+
+
+Histogram generation:
+bbduk.sh in=reads.fq bhist=bhist.txt qhist=qhist.txt gchist=gchist.txt aqhist=aqhist.txt lhist=lhist.txt gcbins=auto
+
+This will generate histograms of various aspects of the reads - base frequency, quality scores, gc content, average quality, and length. BBMap can generate even more histograms by using mapping information (such as quality accuracy and indel length); BBDuk can also generate these histograms if it is fed a sam file in which the cigar strings use = and X to denote match and mismatch.
+
+
+Barcode and chastity filtering:
+bbduk.sh in=reads.fq out=filtered.fq barcodes=ACGTT,GGCCG barcodefilter chastityfilter
+
+This will remove reads that fail Illumina chastity filtering, or have barcodes that do not exactly match the list provided. The ability to process these depends on Illumina's header format, which can change between software versions and platforms.
+
+
+Cardinality estimation:
+bbduk.sh in=reads.fq loglog loglogk=31
+
+This will generate an accurate estimation of the number of unique kmers in the dataset using the LogLog algorithm, requiring very little memory. There is no upper bound for this kmer length. Note that "k=" and "loglogk=" are completely unrelated.
+
+
+Matching degenerate sequences such as primers:
+bbduk.sh in=reads.fq out=matching.fq literal=ACGTTNNNNNGTC copyundefined k=13 mm=f
+
+This will clone the reference sequences to represent every possibility for the degenerate bases (Ns and other non-ACGT IUPAC symbols). For example, this would create ACGTTAAAAAGTC, ACGTTAAAACGTC, ACGTTAAAAGGTC, and so forth (all 1024 possibilities). If you are interested in searching for new life by mining shotgun metagenomic reads for 16s sequences that do not quite match your primers... this (and hdist) might be a good place to start! But it's also useful for adapters with barcodes.
+
+
+Length filtering:
+bbduk.sh in=reads.fq out=clean.fq qtrim=r trimq=10 minlen=100
+
+This will discard reads shorter than 100bp after trimming to Q10. Alternatively, using "mlf=50" (minlengthfraction=50) would discard reads under 50% of their original length after trimming. Either of these flags applies to any trim operation, whether force-trim (ftr, ftl, ftm), quality-trimming (qtrim), or kmer-trimming (ktrim). "mlf" compares the final length after all trim operations to the initial length before any trim operations.
+
diff --git a/docs/guides/BBMapGuide.txt b/docs/guides/BBMapGuide.txt
new file mode 100755
index 0000000..c40e94b
--- /dev/null
+++ b/docs/guides/BBMapGuide.txt
@@ -0,0 +1,186 @@
+BBMap Guide
+Written by Brian Bushnell
+Last updated December 23, 2015
+
+
+BBMap is a splice-aware global aligner for DNA and RNA sequencing reads. It can align reads from all major platforms - Illumina, 454, Sanger, Ion Torrent, Pac Bio, and Nanopore. BBMap is fast and extremely accurate, particularly with highly mutated genomes or reads with long indels, even whole-gene deletions over 100kbp long. It has no upper limit to genome size or number of contigs, and has been successfully used for mapping to an 85 gigabase soil metagenome with over 200 million con [...]
+BBMap has a large array of options, described in its shellscript. It can output many different statistics files, such as an empirical read quality histogram, insert-size distribution, and genome coverage, with or without generating a sam file. As a result, it is useful in quality control of libraries and sequencing runs, or evaluating new sequencing platforms. The derivative program BBSplit is also useful in binning or refining metagenomic reads.
+
+
+*Notes*
+
+
+Algorithm:
+
+This guide will not describe BBMap's algorithm, other than to say it uses a multi-kmer-seed-and-extend approach, analogous to growing polycrystalline silicon. For those interested, there is a paper describing many of the technical details here: http://bib.irb.hr/datoteka/773708.Josip_Maric_diplomski.pdf
+
+
+Memory:
+
+BBMap normally uses roughly 6 bytes per reference base. It also has a low-memory mode (triggered by the flag "usemodulo") that will use approximately 3 bytes per base, with a slight reduction in sensitivity. Some additional memory is needed per thread for alignment matrices; this is relatively small in normal mode, but bigger in PacBio mode due to the longer reads. Also, the amount of memory needed for the index increases with kmer length. The memory needed for a specific kmer length [...]
+
+
+Indexing and Disk Use:
+
+BBMap must index a reference before mapping to it, which is relatively fast. By default, it will write this index to disk so that it can be loaded more quickly next time, but this can be suppressed with the "nodisk" flag. The index is written to the location /ref/. In other words, if you run BBMap from the location /bob/work/, then the directory /bob/work/ref/ will be created and an index written to it; if there is already an index at that location which matches the reference you are [...]
+1) "bbmap.sh in=reads.fq" will look for an index in /ref/, not find anything, and so will quit without mapping.
+2) "bbmap.sh in=reads.fq ref=A.fa nodisk" will generate an index in memory and write nothing to disk.
+3) "bbmap.sh in=reads.fq ref=A.fa" will generate an index and write it to /ref/.
+4) "bbmap.sh in=reads.fq" will look for an index in /ref/, and load it.
+5) "bbmap.sh in=reads.fq ref=A.fa" will look in ref, see that A is already indexed, and load the existing index.
+6) "bbmap.sh in=reads.fq ref=B.fa" will overwrite the index for A with a new index for B.
+7) "bbmap.sh in=reads.fq ref=C.fa build=2" will write a new index for C in the /ref/ folder. At this point there will be an index for B (stored as build 1) and an index for C (stored as build 2).
+8) "bbmap.sh in=reads.fq ref=D.fa path=/another/location/" will write an index for D into /another/location/ref/.
+
+Specifying a path or build number when "nodisk" is in the command will have no effect. Do not attempt to have multiple processes write an index to the same location at the same time, or you will get a corrupt index that needs to be deleted and regenerated. If you need to map many files to a single reference, build the index once (e.g. "bbmap.sh ref=a.fa"), then wait for it to finish. Then, you can map all the read files simultaneously if you want (without a ref= flag). Alternately, y [...]
+
+
+Performance, Threads, and Sensitivity:
+
+BBMap is multithreaded for both indexing and mapping. It will use all available threads unless capped with the "t=" flag, but it scales near-linearly with processor cores so there is rarely a good reason to restrict it unless operating on a shared system.
+After indexing, there are two stages of processing for each read, mapping (finding candidate locations via kmer matching) and alignment (scoring how well the read matches each candidate location). Normally, BBMap spends most of its time in the alignment (rather than mapping) phase. You can speed up alignment by using the "fast" flag, reducing maxindel, and adjusting sensitivity flags like "minratio/minid" and "bandwidth"; you can speed up mapping by increasing minhits and kmer length. [...]
+Generally, any flag that increases speed reduces sensitivity, and vice-versa.
+
+
+Maxindel Flag:
+
+"maxindel" determines the maximum length of insertions and deletions that will be sought. It is a soft limit - it is possible to find indels much longer than maxindel, they just won't explicitly be searched for. Maxindel has more impact on insertions than deletions, because deletions (relative to the reference) can be found that are much longer than read length, but it is impossible to find an insertion longer than read length from mapping a single read. Typically, insertions can be f [...]
+The default for maxindel is 16000. This is fine for many purposes, but if you want to map RNA-seq reads to a genome in an organism with long introns (such as human), you should set it to a higher value like 200000 (maxindel=200k). The same is true if you are looking for severe mutations like knocked-out genes. To increase speed, or to avoid spurious long indels caused by chimeric sequences (MDA, for example), you can reduce it to a lower value like 80. But unless you use the flag "st [...]
+
+
+Post-filtering:
+
+There are various optional flags such as idfilter and subfilter that ban alignments failing those filters. For example, if "subfilter=2" then any alignment with more than 2 substitutions will be eliminated. These are called "post-filtering" for a reason - they occur after all mapping and alignment is complete. So if, hypothetically, a read has a best mapping score at a site with 1 deletion, and the second-best has 8 substitutions, and you set "delfilter=0", then the read will be reported as unmapped; it will not be reassigned to the second-best site.
+
+
+File Formats:
+
+BBMap requires read input to be fasta or fastq, compressed or raw. Paired reads can be in two files or interleaved in a single file. It cannot process both paired and unpaired reads in the same run (except by using BBWrap). The indexing phase requires fasta format only (compressed is OK).
+Output formats are fasta, fastq, sam, or bam (if samtools is installed). The alignment information will be lost if reads are output as fasta or fastq, though that's still useful for filtering operations.
+All other output (statistics, histograms, coverage, etc) are tab-delimited text, with one or more header rows (starting with #) and the rest data.
+
+
+Output Streams:
+
+BBMap's primary output stream is "out", e.g. "out=mapped.sam". All reads go to out. It also supports "outu" and "outm", which are streams only for unmapped or mapped reads, respectively. Pairs are always kept together; if one read is mapped and the other is unmapped, both will go to outm.
+
+
+Paired Reads:
+
+BBMap supports paired reads. When mapped as pairs, reads in the normal "innie" fragment orientation (left read mapped to plus strand, right read mapped to minus strand), with a pair distance within some margin of the average, will be considered properly paired and the mapping score will be increased. If one read maps and the other does not map nearby, a "rescue" operation is performed to look for a good mapping location by brute force. However, some library types (mainly long-mate-pai [...]
+For information on the syntax of using paired reads, please see UsageGuide.txt.
+
+
+Coverage Output:
+
+BBMap generates coverage information by internally using Pileup. So, the results are the same as generating a sam file with BBMap and feeding it to pileup.sh, if the parameters are the same. However, Pileup supports a wider variety of parameters, so there may be cases where it is preferable.
+
+
+Sam Format and Cigar Strings:
+
+The cigar string is a required field in a Sam file, which tells you how the read aligned to the reference. Unfortunately, early versions used the symbol "M" to designate a base matching or mismatching the reference, which has caused many problems. That was fixed several years ago in the Sam 1.4 specification, which allows "X" for substitutions and "=" for matches; it still allows "M" for either one. BBMap supports the modern sam specification and by default will output cigar strings w [...]
+
+
+Global versus Local Alignment:
+
+BBMap is a global aligner. That means it looks for the highest-scoring alignment taking into account all bases in a sequence. A local aligner would look for the best-scoring local alignment, meaning an alignment where the ends are possibly clipped off. So, if there were two possible alignment locations for a 100bp read, one with 3 mismatches scattered through a read, and one with 5 mismatches all in the last 10bp of a read, BBMap would place the read at the location with 3 mismatches, [...]
+BBMap has a "local" flag, which will convert its global alignments into local alignments. That does not make it a local aligner - it still looks for the best global alignment. If the local flag is enabled, then the alignment will be clipped if that yields a higher score. So, BBMap will create local alignments, but it will not guarantee that it finds the optimal local alignment - rather, it will produce local alignments from the optimal global alignments.
+
+
+Minratio and Minid:
+
+Internally, BBMap uses a custom affine-transform matrix to generate alignment scores. Whether a read is considered "mapped" depends on whether the ratio between its best actual score and the maximum possible score (meaning 100% of bases match the reference) is at least minratio. The score for a base matching the reference is +100 points; for a single mismatch, -127 points. These numbers were determined empirically through extensive testing. A second consecutive mismatch only gets a - [...]
+However, this is very confusing to users. So while you can directly set "minratio", you can alternatively set "minid", which then adjusts minratio to the level that approximately matches. For example, if you set "minid=0.9", BBMap will print "Set MINIMUM_ALIGNMENT_SCORE_RATIO to 0.816". So, "minid=0.9" is equivalent to "minratio=0.816". How is that decided? It calculates the score ratio you would get at 90% identity, if all the differences from the reference were caused by noncontig [...]
+BBMap will ignore candidate sites if it can prove they cannot get close to the minratio, and when doing an alignment, the amount of the alignment matrix that is filled in depends on the minratio - if it's high, less work is done. So, setting either of these higher will increase speed at the expense of sensitivity.
+
+
+Perfectmode and Semiperfectmode:
+
+BBMap can run fastest in "perfectmode", meaning reads must match the reference perfectly - no substitutions or indels. There is a second, similar mode, called "semiperfectmode". This is almost as fast, and also requires all bases to match the reference, but allows read bases to map to reference Ns, or for reads to go off the end of contigs.
+
+
+Read Length and BBMap Versions:
+
+Normal BBMap supports reads up to 700bp. There is a variant BBMapPacBio (called by the shellscript mapPacBio.sh) which supports reads up to 6kbp. bbmapskimmer.sh also supports reads up to 6kbp. Reads longer than the max read length can be automatically shredded and renamed by adding the flag maxlen, e.g. "maxlen=500". This will shred long reads into 500bp pieces, map them independently, and add a "_1", "_2", etc to the original name.
+The PacBio version has a different error weight profile designed for long reads with a high error rate, dominated by short indels. It can process Illumina data but the globally optimal alignments will occasionally differ between the two versions. It is also the recommended version for Nanopore data.
+Skimmer is designed to find all alignments above a certain threshold, as opposed to the normal versions, which attempt to find the single best alignment, and some alignments that are almost as good (like the second and third best), to quantify whether the best alignment is ambiguous. Skimmer, however, was intended for mapping Illumina reads to PacBio reads for error-correction; in that case, you expect each read to have many correct alignments, with very different alignment scores.
+
+
+BBSplit:
+
+BBSplit internally uses BBMap to map reads to multiple genomes at once, and determine which genome they match best. This is different than with ordinary mapping. If a genome (say, human) contains an exact repeat somewhere, reads mapping to it will be mapped ambiguously. But if you want to determine whether reads are mouse or human, it does not matter whether they map ambiguously within human, only whether they are ambiguous between human and mouse. BBSplit tracks this additional ambi [...]
+
+
+BBWrap:
+
+BBWrap is a simple wrapper that allows BBMap to be run multiple times without reloading the index each time. So, it can save some compute resources (particularly with a small number of reads and large reference), and is also handy for things like mapping paired and unpaired reads to the same reference, then outputting them in the same file.
+
+
+*Usage Examples*
+
+
+To build an index:
+bbmap.sh ref=contigs.fa
+
+
+To map to an index in the present directory:
+bbmap.sh in=reads.fq out=mapped.sam
+
+
+To index and map at the same time:
+bbmap.sh in=reads.fq out=mapped.sam ref=ref.fa
+
+
+To build an index in-memory without writing to disk:
+bbmap.sh in=reads.fq out=mapped.sam ref=ref.fa nodisk
+
+
+To split input into mapped and unmapped, in fastq format:
+bbmap.sh in=reads.fq outm=mapped.fq outu=unmapped.fq
+
+
+To calculate coverage:
+bbmap.sh in=reads.fq covstats=covstats.txt covhist=covhist.txt basecov=basecov.txt bincov=bincov.txt
+
+That will generate per-scaffold coverage statistics, a histogram of coverage depth, and the precise coverage for every genomic base, as well as binned coverage which is easier to plot.
+
+
+To output a bam file (if samtools is installed):
+bbmap.sh in=reads.fq out=mapped.bam
+
+
+To generate a sorted, indexed bam file:
+bbmap.sh in=reads.fq out=mapped.sam bamscript=bs.sh; sh bs.sh
+
+
+To map vertebrate RNA-seq reads to a genome:
+bbmap.sh in=reads.fq out=mapped.sam maxindel=200k ambig=random intronlen=20 xstag=us
+
+
+To map faster:
+bbmap.sh in=reads.fq out=mapped.sam fast
+
+
+To map with high sensitivity:
+bbmap.sh in=reads.fq out=mapped.sam slow k=12
+
+
+To map with super-high sensitivity (useful for very-low-quality data, or remote homologies):
+mapPacBio.sh in=reads.fq out=mapped.sam vslow k=8 maxindel=200 minratio=0.1
+
+
+To map in low-memory mode:
+bbmap.sh ref=ref.fa usemodulo
+bbmap.sh in=reads.fq out=mapped.sam usemodulo
+
+Note that the "usemodulo" flag must be present both when generating the reference and when mapping.
+
+
+To map quickly with very high precision and lower sensitivity, as when removing contaminant reads specific to a genome without risking false-positives:
+bbmap.sh minratio=0.9 maxindel=3 bwr=0.16 bw=12 fast minhits=2 qtrim=r trimq=10 untrim idtag printunmappedcount kfilter=25 maxsites=1 k=14
+
+
+To generate histograms and statistics:
+bbmap.sh in=reads.fq bhist=bhist.txt qhist=qhist.txt aqhist=aqhist.txt lhist=lhist.txt ihist=ihist.txt ehist=ehist.txt qahist=qahist.txt indelhist=indelhist.txt mhist=mhist.txt gchist=gchist.txt idhist=idhist.txt scafstats=scafstats.txt
+
+
diff --git a/docs/guides/BBMap_old_readme.txt b/docs/guides/BBMap_old_readme.txt
new file mode 100755
index 0000000..2528bc4
--- /dev/null
+++ b/docs/guides/BBMap_old_readme.txt
@@ -0,0 +1,237 @@
+This is a readme for BBMap. However, it has not been maintained and is superseded by the information in the shellscript, bbmap.sh.
+
+Basic Syntax:
+
+(Using shellscript, under Unix, which autodetects RAM to set -Xmx parameter. You can also include a flag like '-Xmx31g' in the shellscript arguments to set RAM usage.)
+To index:
+bbmap.sh ref=<reference.fa>
+To map:
+bbmap.sh in=<reads.fq> out=<mapped.sam>
+
+(without shellscript)
+To index:
+java -ea -Xmx31g -cp <PATH> align2.BBMap ref=<reference.fa>
+To map:
+java -ea -Xmx31g -cp <PATH> align2.BBMap in=<reads.fq> out=<mapped.sam>
+
+...where "<PATH>" should indicate the path to the directory containing all the source code directories; e.g. "/usr/bin/bbmap/current"
+
+Please note, the reference is only needed for building the index the first time; subsequently, just specify the build number which corresponds to that reference.
+So for example the first time you map to e.coli you might specify "ref=ecoli_reference.fa build=3"; after that, just specify "build=3".
+The index files would then be stored in ./ref/genome/3/ and ./ref/index/3/
+Also, the -Xmx parameter should specify approximately 85% of the physical memory of the target machine; so, 21G for a 24GB node. The process needs approximately 8 bytes per reference base (plus a several hundred MB overhead).
+
+
+Advanced Syntax:
+
+
+Indexing Parameters (required when building the index):
+path=<.> Base directory to store index files. Default is the local directory. The index will always be placed in a subdirectory "ref".
+ref=<ref.fasta> Use this file to build the index. Needs to be specified only once; subsequently, the build number should be used.
+build=<1> Write the index to this location (build=1 would be stored in /ref/genome/1/ and /ref/index/1/). Can be any integer. This parameter defaults to 1, but using additional numbers allows multiple references to be indexed in the same directory.
+k=<13> Use length 13 kmers for indexing. Suggested values are 9-15, with lower typically being slower and more accurate. 13 is usually optimal. 14 is better for RNA-SEQ and very large references >4GB; 12 is better for PacBio and cross-species mapping.
+midpad=<300> Put this many "N" in between scaffolds when making the index. 300 is fine for metagenomes with millions of contigs; for a finished genome like human with 25 scaffolds, this should be set to 100000+ to prevent cross-scaffold mapping.
+startpad=<8000> Put this many "N" at the beginning of a "chrom" file when making index. It's best if this is longer than your longest expected read.
+stoppad=<8000> Put this many "N" at the end of a "chrom" file when making index. It's best if this is longer than your longest expected read.
+minscaf=<1> Do not include scaffolds shorter than this when generating index. Useful for assemblies with millions of fairly worthless unscaffolded contigs under 100bp. There's no reason to make this shorter than the kmer length.
+usemodulo=<f> Throw away ~80% of kmers based on their remainder modulo a number. Reduces memory usage by around 50%, and reduces sensitivity slightly. Must be specified when indexing and when mapping.
+
+
+Input Parameters:
+path=<.> Base directory to read index files.
+build=<1> Use the index at this location (same as when indexing).
+in=<reads.fq> Use this as the input file for reads. Also accepts fasta. "in=sequential length=200" will break a genome into 200bp pieces and map them to itself. "in=stdin" will accept piped input. The format of piped input can be specified with e.g. "in=stdin.fq.gz" or "in=stdin.fa"; default is uncompressed fastq.
+in2=<reads2.fq> Run mapping paired, with reads2 in the file "reads2.fq"
+ NOTE: As a shorthand, "in=reads#.fq" is equivalent to "in=reads1.fq in2=reads2.fq"
+interleaved=<auto> Or "int". Set to "true" to run mapping paired, forcing the reads to be considered interleaved from a single input file. By default the reader will try to determine whether a file is interleaved based on the read names; so if you don't want this, set interleaved=false.
+qin=<auto> Set to 33 or 64 to specify input quality value ASCII offset.
+fastareadlen=<500> If fasta is used for input, breaks the fasta file up into reads of about this length. Useful if you want to map one reference against another, since BBMap currently has internal buffers limited to 500bp. I can change this easily if desired.
+fastaminread=<1> Ignore fasta reads shorter than this. Useful if, say, you set fastareadlen=500, and get a length 518 read; this will be broken into a 500bp read and an 18bp read. But it's not usually worth mapping the 18bp read, which will often be ambiguous.
+maxlen=<0> Break long fastq reads into pieces of this length.
+minlen=<0> Throw away remainder of read that is shorter than this.
+fakequality=<-1> Set to a positive number 1-50 to generate fake quality strings for fasta input reads. Less than one turns this function off.
+blacklist=<a.fa,b.fa> Set a list of comma-delimited fasta files. Any read mapped to a scaffold name in these files will be considered "blacklisted" and can be handled differently by using the "outm", "outb", and "outputblacklisted" flags. The blacklist fasta files should also be merged with other fasta files to make a single combined fasta file; this combined file should be specified with the "ref=" flag when indexing.
+touppercase=<f> Set true to convert lowercase read bases to upper case. This is required if any reads have lowercase letters (which real reads should never have).
+
+
+Sampling Parameters:
+reads=<-1> Process at most N reads, then stop. Useful for benchmarking. A negative number will use all reads.
+samplerate=<1.0> Set to a fraction of 1 if you want to randomly sample reads. For example, samplerate=0.25 would randomly use a quarter of the reads and ignore the rest. Useful for huge datasets where all you want to know is the % mapped.
+sampleseed=<1> Set to the RNG seed for random sampling. If this is set to a negative number, a random seed is used; for positive numbers, the number itself is the seed. Since the default is 1, this is deterministic unless you explicitly change it to a negative number.
+idmodulo=<1> Set to a higher number if you want to map only every Nth read (for sampling huge datasets).
+
+
+Mapping Parameters:
+fast=<f> The fast flag is a macro. It will set many other parameters so that BBMap will run much faster, at slightly reduced sensitivity for most applications. Not recommended for RNAseq, cross-species alignment, or other situations where long deletions or low identity matches are expected.
+minratio=<0.56> Alignment sensitivity as a fraction of a read's max possible mapping score. Lower is slower and more sensitive but gives more false positives. Ranges from 0 (very bad alignment) to 1 (perfect alignment only). Default varies between BBMap versions.
+minidentity=<> Or "minid". Use this flag to set minratio more easily. If you set minid=0.9, for example, minratio will be set to a value that will be APPROXIMATELY equivalent to 90% identity alignments.
+minapproxhits=<1> Controls minimum number of seed hits to examine a site. Higher is less accurate but faster (on large genomes). 2 is maybe 2.5x as fast and 3 is maybe 5x as fast on a genome with several gigabases. Does not speed up genomes under 100MB or so very much.
+padding=<4> Sets extra padding for slow-aligning. Higher numbers are more accurate for indels near the tips of reads, but slower.
+tipsearch=<100> Controls how far to look for possible deletions near tips of reads by brute force. tipsearch=0 disables this function. Higher is more accurate.
+maxindel=<16000> Sets the maximum size of indels allowed during the quick mapping phase. Set higher (~100,000) for RNA-SEQ and lower (~20) for large assemblies with mostly very short contigs. Lower is faster.
+strictmaxindel=<f> Set to true to disallow mappings with indels longer than maxindel. Alternately, for an integer X, 'strictmaxindel=X' is equivalent to the pair of flags 'strictmaxindel=t maxindel=X'.
+pairlen=<32000> Maximum distance between mates allowed for pairing.
+requirecorrectstrand=<t> Or "rcs". Requires correct strand orientation when pairing reads. Please set this to false for long mate pair libraries!
+samestrandpairs=<f> Or "ssp". Defines correct strand orientation when pairing reads. Default is false, meaning opposite strands, as in Illumina fragment libraries. "ssp=true" mode is not fully tested.
+killbadpairs=<f> Or "kbp". When true, if a read pair is mapped with an inappropriate insert size or orientation, the read with the lower mapping quality is marked unmapped.
+rcompmate=<f> ***TODO*** Set to true if you wish the mate of paired reads to be reverse-complemented prior to mapping (to allow better pairing of same-strand pair libraries).
+kfilter=<-1> If set to a positive number X, all potential mapping locations that do not have X contiguous perfect matches with the read will be ignored. So, reads that map with "kfilter=51" are assured to have at least 51 contiguous bases that match the reference. Useful for mapping to assemblies generated by a De Bruijn graph assembly that used a kmer length of X, so that you know which reads were actually used in the assembly.
+threads=<?> Or "t". Set number of threads. Default is # of logical cores. The total number of active threads will be higher than this, because input and output are in separate threads.
+perfectmode=<f> Only accept perfect mappings. Everything goes much faster.
+semiperfectmode=<f> Only accept perfect or "semiperfect" mappings. Semiperfect means there are no mismatches of defined bases, but up to half of the reference is 'N' (to allow mapping to the edge of a contig).
+rescue=<t> Controls whether paired reads may be rescued by searching near the mapping location of a mate. Increases accuracy, with usually a minor speed penalty.
+expectedsites=<1> For BBMapPacBioSkimmer only, sets the expected number of correct mapping sites in the target reference. Useful if you are mapping reads to other reads with some known coverage depth.
+msa=<> Advanced option, not recommended. Set classname of MSA to use.
+bandwidth=0 Or "bw". When above zero, restricts alignment band to this width. Runs faster, but with reduced accuracy for reads with many or long indels.
+bandwidthratio=0 Or "bwr". When above zero, restricts alignment band to this fraction of a read's length. Runs faster, but with reduced accuracy for reads with many or long indels.
+usequality=<t> Or "uq". Set to false to ignore quality values when mapping. This will allow very low quality reads to be attempted to be mapped rather than discarded.
+keepbadkeys=<f> Or "kbk". With kbk=false (default), read keys (kmers) have their probability of being incorrect evaluated from quality scores, and keys with a 94%+ chance of being wrong are discarded. This increases both speed and accuracy.
+usejni=<f> Or "jni". Do alignments in C code, which is faster. Requires first compiling the C code; details are in /jni/README.txt. This will produce identical output.
+maxsites2=<800> Don't analyze (or print) more than this many alignments per read.
+minaveragequality=<0> (maq) Discard reads with average quality below this.
+
+Post-Filtering Parameters:
+
+idfilter=0 Different than "minid". No alignments will be allowed with an identity score lower than this value. This filter occurs at the very end and is unrelated to minratio, and has no effect on speed unless set to 1. Range is 0-1.
+subfilter=-1 Ban alignments with more than this many substitutions.
+insfilter=-1 Ban alignments with more than this many insertions.
+delfilter=-1 Ban alignments with more than this many deletions.
+indelfilter=-1 Ban alignments with more than this many indels.
+editfilter=-1 Ban alignments with more than this many edits.
+inslenfilter=-1 Ban alignments with an insertion longer than this.
+dellenfilter=-1 Ban alignments with a deletion longer than this.
+
+Output Parameters:
+out=<outfile.sam> Write output to this file. If out=null, output is suppressed. If you want to output paired reads to paired files, use a "#" symbol, like out=mapped#.sam. Then reads1 will go to mapped1.sam and reads2 will go to mapped2.sam. (NOTE: split output currently disabled for .sam format, but allowed for native .txt format). To print to standard out, use "out=stdout"
+outm=<> Write only mapped reads to this file (excluding blacklisted reads, if any).
+outu=<> Write only unmapped reads to this file.
+outb=<> Write only blacklisted reads to this file. If a pair has one end mapped to a non-blacklisted scaffold, it will NOT go to this file. (see: blacklist)
+out2=<> If you set out2, outu2, outm2, or outb2, the second read in each pair will go to this file. Not currently allowed for SAM format, but OK for others (such as fasta, fastq, bread).
+overwrite=<f> Or "ow". Overwrite output file if it exists, instead of aborting.
+append=<f> Or "app". Append to output file if it exists, instead of aborting.
+ambiguous=<best> Or "ambig". Sets how to handle ambiguous reads. "first" or "best" uses the first encountered best site (fastest). "all" returns all best sites. "random" selects a random site from all of the best sites (does not yet work with paired-ends). "toss" discards all sites and considers the read unmapped (same as discardambiguous=true). Note that for all options (aside from toss) ambiguous reads in SAM format will have the extra field "XT:A:R" while unambiguous reads will h [...]
+ambiguous2=<best> (for BBSplit only) Or "ambig2". Only for splitter mode. Ambiguous2 strictly refers to any read that maps to more than one reference set, regardless of whether it has multiple mappings within a reference set. This may be set to "best" (aka "first"), in which case the read will be written only to the first reference to which it has a best mapping; "all", in which case a read will be written to outputs for all references to which it maps; "toss", in which case it will be [...]
+outputunmapped=<t> Outputs unmapped reads to primary output stream (otherwise they are dropped).
+outputblacklisted=<t> Outputs blacklisted reads to primary output stream (otherwise they are dropped).
+ordered=<f> Set to true if you want reads to be output in the same order they were input. This takes more memory, and can be slower, due to buffering in multithreaded execution. Not needed for singlethreaded execution.
+ziplevel=<2> Sets output compression level, from 1 (fast) to 9 (slow). I/O is multithreaded, and thus faster when writing paired reads to two files rather than one interleaved file.
+nodisk=<f> "true" will not write the index to disk, and may load slightly faster. Prevents collisions between multiple bbmap instances writing indexes to the same location at the same time.
+usegzip=<f> If gzip is installed, output file compression is done with a gzip subprocess instead of with Java's native deflate method. Can be faster when set to true. The output file must end in a compressed file extension for this to have effect.
+usegunzip=<f> If gzip is installed, input file decompression is done with a gzip subprocess instead of with Java's native inflate method. Can be faster when set to true.
+pigz=<f> Spawn a pigz (parallel gzip) process for faster compression than Java or gzip. Requires pigz to be installed.
+unpigz=<f> Spawn a pigz process for faster decompression than Java or gzip. Requires pigz to be installed.
+bamscript=<filename> (bs for short) Writes a shell script to <filename> with the command line to translate the sam output of BBMap into a sorted bam file, assuming you have samtools in your path.
+maxsites=<5> Sets maximum alignments to print per read, if secondary alignments are allowed. Currently secondary alignments may lack cigar strings.
+secondary=<f> Print secondary alignments.
+sssr=<0.95> (secondarysitescoreratio) Print only secondary alignments with score of at least this fraction of primary.
+ssao=<f> (secondarysiteasambiguousonly) Only print secondary alignments for ambiguously-mapped reads.
+quickmatch=<f> Generate cigar strings during the initial alignment (before the best site is known). Currently, this must be enabled to generate cigar strings for secondary alignments. It increases overall speed but may in some very rare cases yield inferior alignments due to less padding.
+local=<f> Output local alignments instead of global alignments. The mapping will still be based on the best global alignment, but the mapping score, cigar string, and mapping coordinate will reflect a local alignment (using the same affine matrix as the global alignment).
+sortscaffolds=<f> Sort scaffolds alphabetically in SAM headers to allow easier comparisons with Tophat (in cuffdiff, etc). Default is in same order as source fasta.
+trimreaddescriptions=<f> (trd) Truncate read names at the first whitespace, assuming that the remainder is a comment or description.
+machineout=<f> Set to true to output statistics in machine-friendly 'key=value' format.
+forcesectionname=<f> All fasta reads get an _# at the end of their name. The number is 1 for the first shred and continues ascending.
+
+
+Sam settings and flags:
+samversion=<1.4> SAM specification version. Set to 1.3 for cigar strings with 'M' or 1.4 for cigar strings with '=' and 'X'. Samtools 0.1.18 and earlier are incompatible with sam format version 1.4 and greater.
+saa=<t> (secondaryalignmentasterisks) Use asterisks instead of bases for sam secondary alignments.
+cigar=<t> Generate cigar strings (for bread format, this means match strings). cigar=false is faster. "cigar=" is synonymous with "match=". This must be enabled if match/insertion/deletion/substitution statistics are desired, but the program will run faster with cigar strings disabled.
+keepnames=<f> Retain original names of paired reads, rather than ensuring both reads have the same name when written in sam format by renaming read2 to the same as read1. If this is set to true then the output may not be sam compliant.
+mdtag=<f> Generate MD tags for SAM files. Requires that cigar=true. I do not recommend generating MD tags for RNASEQ or other data where long deletions are expected because they will be incredibly long.
+xstag=<f> Generate XS (strand) tags for Cufflinks. This should be used with a stranded RNA-seq protocol.
+xmtag=<t> Generate XM tag. Indicates number of best alignments. May only work correctly with ambig=all.
+nhtag=<f> Write NH tags.
+intronlen=<999999999> Set to a lower number like 10 to change 'D' to 'N' in cigar strings for deletions of at least that length. This is used by Cufflinks; 'N' implies an intron while 'D' implies a deletion, but they are otherwise identical.
+stoptag=<f> Allows generation of custom SAM tag YS:i:<read stop location>
+idtag=<f> Allows generation of custom SAM tag YI:f:<percent identity>
+scoretag=<f> Allows generation of custom SAM tag YR:i:<raw mapping score>
+inserttag=<f> Write a tag indicating insert size, prefixed by X8:Z:
+rgid=<> Set readgroup ID. All other readgroup fields can be set similarly, with the flag rgXX=value.
+noheader=<f> Suppress generation of output header lines.
+
+
+Statistics and Histogram Parameters:
+showprogress=<f> Set to true to print out a '.' once per million reads processed. You can also change the interval with e.g. showprogress=20000.
+qhist=<file> Output a per-base average quality histogram to <file>.
+aqhist=<file> Write histogram of average read quality to <file>.
+bqhist=<file> Write a quality histogram designed for box plots to <file>.
+obqhist=<file> Write histogram of overall base counts per quality score to <file>.
+qahist=<file> Quality accuracy histogram; correlates claimed phred quality score with observed quality based on substitution, insertion, and deletion rates.
+mhist=<file> Output a per-base match histogram to <file>. Requires cigar strings to be enabled. The columns give fraction of bases at each position having each match string operation: match, substitution, deletion, insertion, N, or other.
+ihist=<file> Output a per-read-pair insert size histogram to <file>.
+bhist=<file> Output a per-base composition histogram to <file>.
+indelhist=<file> Output an indel length histogram.
+lhist=<file> Output a read length histogram.
+ehist=<file> Output an errors-per-read histogram.
+gchist=<file> Output a gc content histogram.
+gchistbins=<100> (gcbins) Set the number of bins in the gc content histogram.
+idhist=<file> Write a percent identity histogram.
+idhistbins=<100> (idbins) Set the number of bins in the identity histogram.
+scafstats=<file> Track mapping statistics per scaffold, and output to <file>.
+refstats=<file> For BBSplitter, enable or disable tracking of read mapping statistics on a per-reference-set basis, and output to <file>.
+verbosestats=<0> From 0-3; higher numbers will print more information about internal program counters.
+printunmappedcount=<f> Set true to print the count of reads that were unmapped. For paired reads this only includes reads whose mate was also unmapped.
+
+
+Coverage output parameters (these may reduce speed and use more RAM):
+covstats=<file> Per-scaffold coverage info.
+covhist=<file> Histogram of # occurrences of each depth level.
+basecov=<file> Coverage per base location.
+bincov=<file> Print binned coverage per location (one line per X bases).
+covbinsize=1000 Set the binsize for binned coverage output.
+nzo=f Only print scaffolds with nonzero coverage.
+twocolumn=f Change to true to print only ID and Avg_fold instead of all 6 columns to the 'out=' file.
+32bit=f Set to true if you need per-base coverage over 64k.
+bitset=f Store coverage data in BitSets.
+arrays=t Store coverage data in Arrays.
+ksb=t Keep residual bins shorter than binsize.
+strandedcov=f Track coverage for plus and minus strand independently. Requires a # symbol in coverage output filenames which will be replaced by 1 for plus strand and 2 for minus strand.
+startcov=f Only track start positions of reads.
+concisecov=f Write basecov in a more concise format.
+
+
+Trimming Parameters:
+qtrim=<f> Options are false, left, right, or both. Allows quality-trimming of read ends before mapping.
+ false: Disable trimming.
+ left (l): Trim left (leading) end only.
+ right (r): Trim right (trailing) end only. This is the end with lower quality on many platforms.
+ both (lr): Trim both ends.
+trimq=<5> Set the quality cutoff. Bases will be trimmed until there are 2 consecutive bases with quality GREATER than this value; default is 5. If the read is from fasta and has no quality scores, Ns will be trimmed instead, as long as this is set to at least 1.
+untrim=<f> Untrim the read after mapping, restoring the trimmed bases. The mapping position will be corrected (if necessary) and the restored bases will be classified as soft-clipped in the cigar string.
+
+
+Java Parameters:
+-Xmx If running from the shellscript, include it with the rest of the arguments and it will be passed to Java to set memory usage, overriding the shellscript's automatic memory detection. -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max allowed is typically 85% of physical memory.
+-da Disable assertions. Alternative is -ea which is the default.
+
+
+Splitting Parameters:
+The splitter is invoked by calling bbsplit.sh (or align2.BBSplitter) instead of bbmap.sh, for the indexing phase. It allows combining multiple references and outputting reads to different files depending on which one they mapped to best. The order in which references are specified is important in cases of ambiguous mappings; when a read has 2 identically-scoring mapping locations from different references, it will be mapped to the first reference.
+All parameters are the same as BBMap with the exception of the ones listed below. You can still use "outu=" to capture unmapped reads.
+ref_<name>=<fasta files> Defines a named set of organisms with a single fasta file or list. For example, ref_a=foo.fa,bar.fa defines the references for named set "a"; any read that maps to foo.fa or bar.fa will be considered a member of set a.
+out_<name>=<output file> Sets the output file name for reads mapping to set <name>. out_a=stuff.sam would capture all the reads mapping to ref_a.
+basename=<example%.sam> This is shorthand for mass-specifying all output files, where the % symbol is a wildcard for the set name. For example, "ref_a=a.fa ref_b=b.fa basename=mapped_%.sam" would expand to "ref_a=a.fa ref_b=b.fa out_a=mapped_a.sam out_b=mapped_b.sam"
+ref=<fasta files> When run through the splitter, this is shorthand for writing a bunch of ref_<name> entries. "ref=a.fa,b.fa" would expand to "ref_a=a.fa ref_b=b.fa".
+
+
+Formats and Extensions
+.gz,.gzip,.zip,.bz2 These file extensions are allowed on input and output files and will force reading/writing compressed data.
+.fa,.fasta,.txt,.fq,.fastq These file extensions are allowed on input and output files. Having one is REQUIRED. So, reads.fq and reads.fq.zip are valid, but reads.zip is NOT valid. Note that outputting in fasta or fastq will not retain mapping locations.
+.sam This is only allowed on output files.
+.bam This is allowed on output files if samtools is installed. Beware of memory usage; samtools will run in a subprocess, and it can consume over 1kb per scaffold of the reference genome.
+
+
+Different versions:
+BBMap (bbmap.sh) Fastest version. Finds single best mapping location.
+BBMapPacBio (mapPacBio.sh) Optimized for PacBio's error profile (more indels, fewer substitutions). Finds single best mapping location. PacBio reads should be in fasta format.
+BBMapPacBioSkimmer (bbmapskimmer.sh) Designed to find ALL mapping locations with alignment score above a certain threshold; also optimized for PacBio reads.
+BBSplitter (bbsplit.sh) Uses BBMap or BBMapPacBio to map to multiple references simultaneously, and output the reads to the file corresponding to the best-matching reference. Designed to split metagenomes or contaminated datasets prior to assembly.
+BBWrap (bbwrap.sh) Maps multiple read files to the same reference, producing one sam file per input file. The advantage is that the reference/index only needs to be read once.
+
+
+
+
+Notes.
+File types are autodetected by parsing the filename. So you can name files, say, out.fq.gz or out.fastq.gz or reads1.fasta.bz2 or data.sam and it will work as long as the extensions are correct.
+
diff --git a/docs/guides/BBMaskGuide.txt b/docs/guides/BBMaskGuide.txt
new file mode 100755
index 0000000..6156f65
--- /dev/null
+++ b/docs/guides/BBMaskGuide.txt
@@ -0,0 +1,45 @@
+BBMask Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+
+BBMask is designed for masking sequence, primarily to prevent false-positive matches in highly-conserved or low-complexity regions of genomes. It was designed as a replacement for tools like Dust which are incredibly slow and do not work well for that purpose. It does three types of masking, all optional:
+1) Low-entropy (complexity)
+2) Tandem-repeated kmers
+3) Sam-file coverage
+
+
+*Notes*
+
+
+Entropy:
+
+Entropy is calculated using Shannon Entropy of kmers in a window, and varies from 0 (mask nothing) to 1 (mask everything). It's a little hard to figure out exactly what threshold you should set, but for reference, the default of 0.7 masks only 107 bases in the E.coli genome and about 0.7% of the human genome. 0.9 masks about 8kbp of E.coli and 7% of the human genome.
+
+
+Memory:
+
+BBMask loads all sequences into memory to allow multiple masking operations. For that reason, it requires about 1 byte per base (for fasta).
+
+
+Human, Cat, Dog, and Mouse removal:
+
+Masked references of various vertebrate organisms were prepared using BBMask. These are used by JGI to remove contaminant reads from libraries with zero false-positives. To produce them, BBMask was used, with default settings. Additionally, all of Mycocosm and Phytozome were shredded into 80bp overlapping pieces and mapped to the genomes; the resulting sam files were used for masking. Some organisms such as Maize were not used due to a high level of human contamination; this was asce [...]
+A handful of animals were also used for masking, including zebra danio (the only vertebrate used). The goal here was to bottleneck homologous regions. This is a highly simplified explanation, but my justification is this: If a gene in human is shared, largely unchanged, with a fungus, that means it had to go through fish, since fish are evolutionarily between humans and fungi. Therefore, masking human with fish should remove any sequences that are shared largely unchanged between hum [...]
+
+
+*Usage Examples*
+
+
+To mask using just entropy:
+bbmask.sh in=ref.fa out=masked.fa entropy=0.7
+
+
+To mask sequences in genome A similar to those in genome B, plus low-entropy sequences:
+shred.sh in=B.fa out=shredded.fa length=80 minlength=70 overlap=40
+bbmap.sh ref=A.fa in=shredded.fa outm=mapped.sam minid=0.85 maxindel=2
+bbmask.sh in=A.fa out=masked.fa entropy=0.7 sam=mapped.sam
+
+
+To filter low-entropy sequences rather than masking them:
+See the BBDuk Guide.
diff --git a/docs/guides/BBMergeGuide.txt b/docs/guides/BBMergeGuide.txt
new file mode 100755
index 0000000..9274d8f
--- /dev/null
+++ b/docs/guides/BBMergeGuide.txt
@@ -0,0 +1,68 @@
+BBMerge Guide
+Written by Brian Bushnell
+Last updated December 11, 2015
+
+BBMerge is designed to merge two overlapping paired reads into a single read. For example, a 2x150bp read pair with an insert size of 270bp would result in a single 270bp read. This is useful in amplicon studies, as clustering and consensus are far easier with single reads than paired reads, and also in assembly, where longer reads allow the use of longer kmers (for kmer-based assemblers) or fewer comparisons (for overlap-based assemblers). And in either case, the quality of the overl [...]
+
+BBMerge's parameters are described in its shellscript (bbmerge.sh). This file provides usage examples of various common tasks.
+
+
+*Notes*
+
+
+Memory:
+
+BBMerge has 2 shellscripts, bbmerge.sh and bbmerge-auto.sh. They are equivalent except for memory usage; so if you override memory with the -Xmx flag, they are equivalent. bbmerge.sh is designed for overlap-based merging only, and uses a fixed 1GB of RAM (though it can function with much less than that). bbmerge-auto.sh is designed for merging overlapping and non-overlapping reads, and attempts to grab all available physical memory, which is needed for storing kmers.
+
+
+Output streams:
+
+BBMerge supports "out" (aka "outm" or "outmerged") and "outu" ("outunmerged"). Reads that are merged, or mergeable, go to out, and the rest go to outu. There is a "join" flag (default true) that controls whether mergeable reads get merged. If it is set to false, mergeable reads will be written interleaved to out. All output streams are optional.
+
+
+Threads and speed:
+
+BBMerge is multithreaded and scales linearly with the number of processor cores, so it's best to let it automatically use all of them. You can restrict the number of worker threads with the "t" flag if you are working on a shared node. To achieve the maximal speed on a system with many (20+) cores, BBMerge should be fed two files (using in1 and in2) rather than a single interleaved file.
+
+
+JNI acceleration:
+
+BBMerge has an optional C component (written by Jonathan Rood) which will accelerate merging by approximately 20%. This can be activated with the "jni" flag, but it must first be compiled. For details on compiling it, see /bbmap/jni/README.txt
+
+
+Strictness:
+
+BBMerge has a lot of settings controlling merging stringency, such as maxratio, ratiomargin, entropy, efilter, etc. Advanced users should feel free to tune these as needed. But it's a lot simpler to use the predefined strictness levels which adjust the specific settings according to the results of extensive benchmarking. To use a predefined strictness level, simply add a flag like "loose" (you don't need to add one for "default"). The predefined strictness levels, from strictest to loosest:
+xstrict, ustrict, vstrict, strict, default, loose, vloose, uloose, xloose
+Stricter settings have lower merge rates and fewer false positives; looser settings have higher merge rates and more false positives. Loose settings are generally not necessary except with low-quality data (which often happens in low-diversity amplicon sequencing using long reads). A false-positive means a read pair that merged with the wrong overlap length - these can cause problems in assembly or clustering (leading to spurious clusters). However, at any level of strictness, BBMerge [...]
+
+
+When not to use:
+
+If you run BBMerge, and under, say, 15% of the reads merge, even at very loose stringency, it's probably a waste of time to merge - you'll just make the workflow more complicated, and possibly get a lot of false-positives. Also, don't try to merge single-ended libraries or long-mate-pair libraries that are not in an "innie" orientation.
+
+
+*Usage Examples*
+
+Basic merging:
+bbmerge.sh in=reads.fq out=merged.fq outu=unmerged.fq ihist=ihist.txt
+
+This will merge the reads by overlap. If no best overlap is found, the pair will go to outu; otherwise, the reads will be merged and sent to out. After finishing, an insert size histogram will be written to ihist.txt.
+
+
+Overlap-based error-correction:
+bbmerge.sh in=reads.fq out=corrected.fq ecco mix
+
+This will correct reads that overlap, rather than merging them. Where the two reads agree, the quality score will be increased; where they disagree, the score will be reduced, and the base call will be changed to the base with the higher quality. If the bases differ and the scores are equal, the base will be replaced with N.
+
+
+Merging of nonoverlapping reads:
+bbmerge-auto.sh in=reads.fq out=merged.fq outu=unmerged.fq ihist=ihist.txt ecct extend2=20 iterations=5
+
+This will attempt to merge each pair by overlap. If unsuccessful, both reads will be error-corrected using Tadpole, and then merging will be tried again. If still unsuccessful, both reads will be extended by 20bp, then merging will be attempted again. This will repeat up to 5 times, or until neither of the reads can be extended any more due to a branch or dead-end in the kmer graph. If the reads are not merged, all of the changes are undone and the original pair will be sent to outu. [...]
+
+
+Discovering adapter sequences:
+bbmerge.sh in=reads.fq outa=adapters.fa
+
+This will report the consensus adapter sequences of pairs with insert size shorter than read length.
diff --git a/docs/guides/BBNormGuide.txt b/docs/guides/BBNormGuide.txt
new file mode 100755
index 0000000..88e4b68
--- /dev/null
+++ b/docs/guides/BBNormGuide.txt
@@ -0,0 +1,112 @@
+BBNorm Guide
+Written by Brian Bushnell
+Last updated December 16, 2015
+
+BBNorm is designed to normalize coverage by down-sampling reads over high-depth areas of a genome, to result in a flat coverage distribution. This process can dramatically accelerate assembly and render intractable datasets tractable, and often improve assembly quality. It can also do depth-binning, kmer frequency histogram generation, error-correction, error-marking, and genome-size estimation. BBNorm has 4 particularly notable features:
+1) It stores kmers in a probabilistic data structure called a count-min sketch. This means it will never run out of memory, or swap to disk, on any dataset. Rather, as the number of unique kmers increases, accuracy gradually declines.
+2) It has numerous features such as multipass normalization, which reduce the average error rate in the normalized output; whereas standard normalization enriches for reads containing errors.
+3) It is extremely fast and easy-to-use compared to other normalization programs.
+4) It supports unlimited kmer lengths.
+
+
+*Notes*
+
+
+Data Structures:
+
+A Count-Min Sketch (CMS) is also called a "counting Bloom filter". It is a type of hash table that only stores values, not keys, and ignores collisions. To prevent the negative effects of collisions, values are stored in multiple locations, in the hopes that at least one of them won't collide with anything else; when reading kmer counts, all locations are read, and the lowest value is used.
+BBNorm can use CMSs with 1, 2, 4, 8, 16, or 32 bits per cell. The more bits, the higher the maximum count (up to 2^bits-1), but the fewer cells are available; for example, 1GB RAM will accommodate 4 billion 2-bit cells, with counts up to 3, or 500 million 16-bit cells, with counts up to 65535. If your data has expected coverage of 200x, there is little reason to use 32-bit cells.
+Also, the number of locations used for storing a kmer's count (the number of "hashes") can be specified, from 1 to infinity (default 3). More hashes are more accurate (until the table becomes too full), but slower. To determine the optimal number of hashes, please read about Bloom filters.
+
+
+Memory and Capacity:
+
+BBNorm should be run using all available memory (which is what the shellscript will try to do by default). The more memory available, the more accurate. It is possible to process an arbitrarily large dataset with even a tiny amount of memory. However, that will result in a warning message like this:
+
+"Made hash table: hashes = 1 mem = 581.26 MB cells = 152.38M used = 93.540%
+Warning: This table is extremely full, which may reduce accuracy. Ideal load is under 60% used.
+For better accuracy, use the 'prefilter' flag; run on a node with more memory; quality-trim or error-correct reads; or increase the values of the minprob flag to reduce spurious kmers. In practice you should still get good normalization results even with loads over 90%, but the histogram and statistics will be off."
+
+Please don't ignore this message! The memory can be used more efficiently by specifying "prefilter", which stores low-count kmers in smaller cells (2-bit, by default) and high-count kmers in bigger cells (32-bit, by default). Prefilter is by default false, as it makes things slower, but should always be enabled when maximal accuracy is desired or if the tables become too full (say, over 50% or so for normalization; lower for error-correction). You can also reduce the size of the prima [...]
+
+
+Shellscripts:
+
+BBNorm (whose java file name is jgi.KmerNormalize) has 3 shellscripts - bbnorm.sh, ecc.sh, and khist.sh. They all call KmerNormalize and just use different default parameters. It is possible to make kmer frequency histograms while doing error-correction and normalization at the same time with a single command from any of these shellscripts; they are only for convenience.
+
+
+Dumping Kmers, Exact Counts, and Error Correction:
+
+BBNorm cannot dump kmers and their counts because it only stores counts, not kmers. For this purpose, please use KmerCountExact instead, which explicitly tracks both kmers and their exact counts. Also, KmerCountExact can report the exact kmer frequency histogram. However, KmerCountExact cannot handle unlimited input data in finite memory like BBNorm can.
+Tadpole uses the same exact data structures as KmerCountExact, and as a result, error-correction by Tadpole is generally better than error-correction by BBNorm. Therefore, while BBNorm supports error-correction, it is recommended that Tadpole be used when there is sufficient memory.
+
+
+When Not To Use BBNorm:
+
+For normalization, BBNorm is mainly intended for use in assembly, and with short reads. Normalization is often useful if you have too much data (for example, 600x average coverage when you only want 100x) or uneven coverage (amplified single-cell, RNA-seq, viruses, metagenomes, etc). It is not useful if you have smooth coverage and approximately the right amount of data, or too little data. BBNorm cannot inflate low coverage (bring 15x coverage up to 100x), only reduce it. Never norm [...]
+Also, error-correction is not advisable when you are looking for rare variants. It should generally be fine with relatively high-depth coverage of heterozygous mutations in a diploid (where you expect a 50/50 allele split), but with low-depth coverage (like 5x), or very lopsided distributions (like a 1/100 allele split), it may correct the minority allele into the majority allele, so should be used with caution.
+
+
+Temp Files and Piping:
+
+BBNorm needs to read input files multiple times (twice per pass), which means it is unable to accept piped input. In multipass mode, it also needs to generate temp files. The location of temp files can be specified with the "tmpdir" flag; it defaults to the environment variable $TMPDIR, which on Genepool points to local disk when available. Temp files will be cleaned up once BBNorm finishes.
+
+
+Threads:
+
+BBNorm is fully multithreaded both when counting kmers and when doing other operations such as normalization. The counting is lock-free, using atomic counters. As a result, it will default to using all available hardware threads; this can be adjusted with the "t" flag.
+
+
+*Usage Examples*
+
+
+Estimating Memory Requirements:
+loglog.sh in=reads.fq
+
+This will estimate the number of unique kmers in a dataset, which will dictate how much memory is needed by kmer-counting programs such as BBNorm. It does so very quickly while using virtually no memory, so it is recommended prior to running BBNorm (or any kmer-counting tool) if you need to know how much memory is needed. For BBNorm, if LogLog reports 1 billion kmers (for example), then using 16-bit cells and 3 hashes, you would need roughly 3hashes*16bits/kmer/8bits/byte*1000000000km [...]
+Estimating the memory requirement is not really necessary, though.
+
+
+To normalize read coverage:
+bbnorm.sh in=reads.fq out=normalized.fq target=100 min=5
+
+This will run 2-pass normalization to produce an output file of reads with an average depth of 100x. Reads with an apparent depth of under 5x will be presumed to be errors and discarded.
+
+
+To error-correct reads:
+ecc.sh in=reads.fq out=corrected.fq
+or equivalently
+bbnorm.sh in=reads.fq out=corrected.fq ecc=t keepall passes=1 bits=16 prefilter
+
+This will do error correction without discarding any reads. "bits=16 prefilter" are not really necessary but will typically make the correction more accurate by storing kmers more efficiently.
+
+
+To generate a kmer-frequency histogram:
+khist.sh in=reads.fq khist=khist.txt peaks=peaks.txt
+or equivalently
+bbnorm.sh in=reads.fq khist=khist.txt peaks=peaks.txt passes=1 prefilter minprob=0 minqual=0 mindepth=0
+
+The histogram shows the number of unique kmers at a given depth. For example, a point at "Depth=10, UniqueKmers=248028" indicates that there are 248028 kmers that each occur 10 times in the input data. This should be plotted on a log-log scale. The peaks file contains the locations of peaks in the histogram, as well as estimates of genome size and ploidy. These estimates will only be accurate for randomly-sheared isolate genomic DNA with little contamination, and a ploidy of at most [...]
+The additional arguments to bbnorm.sh (minprob=0 minqual=0 mindepth=0) are there to prevent low-depth kmers from being discarded.
+
+
+To normalize and error-correct reads, creating before and after kmer histograms:
+bbnorm.sh in=reads.fq out=normalized.fq target=100 min=5 prefilter ecc khist=khist_before.txt khistout=khist_after.txt
+
+
+To make a high-pass or low-pass filter:
+bbnorm.sh in=reads.fq out=highpass.fq outt=lowpass.fq passes=1 target=999999999 min=10
+
+This will pass only reads with a depth of at least 10 to "out", and low-depth reads under 10 to "outt" (outtoss).
+
+
+To split by depth into 3 bins:
+bbnorm.sh in=reads.fq outlow=low.fq outmid=mid.fq outhigh=high.fq passes=1 lowbindepth=10 highbindepth=80
+
+This will put reads with coverage under 10x in low.fq; at least 80x in high.fq; and all others in mid.fq. Specifically, for pairs, if one read is below the low cutoff and the other is above the high cutoff, both go into mid.
+
+
+Using additional files for kmer counts:
+bbnorm.sh in=reads.fq out=corrected.fq passes=1 ecc extra=genome.fa,more_reads.fq
+
+This will error-correct reads.fq using additional kmer count information from genome.fa and more_reads.fq. It can also be applied to other operations like normalization. The arguments to "extra" will be used only for kmer frequency data, but will not be part of the output.
diff --git a/docs/guides/CalcUniquenessGuide.txt b/docs/guides/CalcUniquenessGuide.txt
new file mode 100755
index 0000000..0e7b512
--- /dev/null
+++ b/docs/guides/CalcUniquenessGuide.txt
@@ -0,0 +1,43 @@
+CalcUniqueness Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+CalcUniqueness is designed to plot the fraction of unique reads produced by a sequencing run, as a function of the number of reads sequenced. In other words, the output is similar to a rarefaction curve. It can help determine library complexity and whether additional sequencing might be useful. The way it determines whether a read has already been seen is probabilistic, by storing kmers from fixed locations (e.g., the first kmer in the read); if a kmer has already been seen, it is assu [...]
+
+
+*Notes*
+
+
+Memory:
+
+CalcUniqueness grabs all available memory, even though normally it doesn't really need it. It needs approximately 50 bytes per unique read.
+
+
+Legacy Aspects:
+
+CalcUniqueness was designed to replace an existing, inefficient pipeline. And it was designed to provide output matching that old pipeline, which I did not design. As a result, some of the features do not make a lot of sense, such as using K=20 (which is too short) and the "random kmer" columns (which are of questionable utility; I ignore them).
+
+
+Data Quality:
+
+Kmer matches must be exact. As a result, low quality data will give artificially high uniqueness estimates. For the same reason, this program cannot be used on raw PacBio data. Interestingly, you can see where on the flowcell the sequencing machine has quality issues by looking at the graphs from this program; they show up as spikes.
+
+
+Histogram Output:
+
+There are 3 columns for single reads, 6 columns for paired:
+count number of reads or pairs processed
+r1_first percent unique 1st kmer of read 1
+r1_rand percent unique random kmer of read 1
+r2_first percent unique 1st kmer of read 2
+r2_rand percent unique random kmer of read 2
+pair percent unique concatenated kmer from read 1 and 2
+
+One line is printed every X reads (default is 25000) showing the percent of reads that were unique in the last interval. "cumulative=t" will still print once per interval, but will print the number of reads that were unique overall (which is generally a higher number, and not useful in most cases).
+
+
+*Usage Examples*
+
+
+To generate a uniqueness plot:
+bbcountunique.sh in=reads out=histogram.txt
diff --git a/docs/guides/ClumpifyGuide.txt b/docs/guides/ClumpifyGuide.txt
new file mode 100755
index 0000000..3b93427
--- /dev/null
+++ b/docs/guides/ClumpifyGuide.txt
@@ -0,0 +1,40 @@
+Clumpify Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+Clumpify is a tool designed to rapidly group overlapping reads into clumps. This can be used as a way to increase file compression, accelerate overlap-based assembly, or accelerate applications, such as mapping, that are cache-sensitive. Clumpify can also generate consensus sequence from these clusters, though this is currently rudimentary. The clusters are not guaranteed to be overlapping; rather, they are guaranteed to share a kmer, meaning they are likely to overlap. It is design [...]
+
+
+*Notes*
+
+
+Paired reads:
+
+Clumpify supports paired reads, in which case it will clump based on read 1 only. However, it's much more effective to treat reads as unpaired. For example, merge the reads with BBMerge, then concatenate the merged reads with the unmerged pairs, and clump them all together as unpaired.
+
+
+Memory, Disk, and Phases:
+
+Clumpify stores all sequences in memory while clumping. But it is also capable of operating in two phases, KmerSplit and KmerSort. KmerSplit will break the data up into an arbitrary number of temporary files which can then be sorted independently, and subsequently merged. As a result, Clumpify does not have a strict bound on how much memory it needs or how many sequences it can process, since the user can specify however many groups are desired to make the files arbitrarily small. Th [...]
+
+
+Compression:
+
+Gzip compression is more efficient when similar sequences are nearby, as they can be replaced by pointers to prior copies of that sequence. So, a clumpified file will compress smaller than a randomly-ordered file. Linebreaks, headers, and quality values take up the majority of the space in a compressed clumpified file. The most efficient way to compress a sequence file, then, is to store it in fasta format with unlimited line-wrap length and replace the headers with short strings; the [...]
+
+
+*Usage Examples*
+
+
+To clumpify reads:
+clumpify.sh in=reads.fq.gz out=clumped.fq.gz groups=16
+
+This will use 16 temp files during clumpification.
+
+
+To maximally compress sequence data:
+rename.sh in=reads.fq out=renamed.fq prefix=x
+bbmerge.sh in=renamed.fq out=merged.fq mix
+clumpify.sh in=merged.fq out=clumped.fa.gz zl=9 pigz fastawrap=100000
+
+This will strip off the names, merge the reads when possible, and then clumpify everything. The output will be fasta-formatted to remove the quality values and have one read per line (unless the reads are over 100kbp long). If quality values need to be saved, then output as "clumped.fq.gz" instead.
diff --git a/docs/guides/DedupeGuide.txt b/docs/guides/DedupeGuide.txt
new file mode 100755
index 0000000..d005851
--- /dev/null
+++ b/docs/guides/DedupeGuide.txt
@@ -0,0 +1,136 @@
+Dedupe Guide
+Written by Brian Bushnell
+Last updated December 18, 2015
+
+Dedupe was written to eliminate duplicate contigs in assemblies, and later expanded to find all contained and overlapping sequences in a dataset, allowing a specified number of substitutions or edit distance. It is now also capable of clustering sequences based on similarity, and printing dot-formatted all-to-all overlap graphs.
+Kmer-based assemblers do not typically create redundant contigs when working correctly, though an exception can be made in the case of transcriptome assemblers. However, overlap-based assemblers may create duplicate sequences, and merged kmer-based assemblies (such as 5 assemblies of the same reads with different kmer lengths) will usually contain massive redundancy. Also, public databases such as nt and RefSeq often have hundreds of thousands of duplicate sequences due to poor curatio [...]
+
+
+*Notes*
+
+
+Memory:
+
+Dedupe stores all unique sequences in memory. The cost is around 500 bytes per unique sequence, plus the sequences themselves (1 byte per base). It is possible to run Dedupe in subset mode to deduplicate datasets that do not fit in memory, but that will not be covered in this guide.
+
+
+Threads and Scaling:
+
+Dedupe is fully multithreaded, and scales near-linearly with the number of cores. Finding exact duplicates is so fast that it typically becomes bottlenecked by the file input streams, which max out at around 500 Mbp/s each. When deduplicating multiple references, using "in=a.fasta,b.fasta,c.fasta" allows each to be read in a different stream, increasing the maximal throughput compared to first concatenating all references into a single file.
+
+
+Phases:
+
+Dedupe has 6 phases, most of which are optional and depend on the processing mode. They are always executed (or skipped) in the same order.
+1) Exact Matches.
+During this required phase, sequences are loaded into memory, and exact duplicates (including reverse-complements) are detected and discarded. Hashtables are filled with sequence hash codes of sequences. If containments or overlaps will be examined in later stages, kmers from the ends of sequences will also be added to hash tables. After this phase, the input files will not be used again.
+2) Absorb Containments.
+If "absorbcontainments" is enabled (default), every read X is scanned for kmers; each kmer is looked up in a hashtable. If the kmer occurs in some other read Y, then Y is aligned with X to see if X contains Y (meaning Y is equal in length or shorter than X, and the number of substitutions or edits is at most the values specified with the "s" and "e" flags). If so, Y is discarded.
+3) Find Overlaps.
+If "findoverlaps" is enabled (non-default), overlaps will be sought using the same process as containment-absorption, but X will not need to contain Y; they must simply have an overlap of at least minoverlap (default 200). Neither is absorbed, nor are they merged; the overlap is simply recorded.
+4) Make Clusters.
+If "cluster" is enabled (non-default), clusters will be created by searching the overlap graph. Each cluster is the set of all reads reachable via transitive overlaps. For example, if X overlaps Y, and Y overlaps Z, then X, Y, and Z will form a cluster, even if X does not overlap Z. That means if there is even a single edge between 2 clusters, they will become one cluster.
+5) Process Clusters.
+If "processclusters" is enabled (non-default), the clusters will be postprocessed to simplify them. This involves various graph simplification operations (which can be individually toggled) like removing redundant edges and (when possible) flipping some of the sequences so that they are all in the same orientation.
+6) Output.
+Finally, all of the output files are generated.
+
+
+Read Deduplication:
+
+Dedupe can be used for deduplicating read sets, and supports paired reads as well (in which case it requires a pair to be the duplicate of another pair). However, due to memory usage, it is not particularly efficient in this role, considering the volume of data that can be generated on modern sequencing platforms. Dedupe can easily handle a 10M read MiSeq run, but a HiSeq lane with 300M reads might take hundreds of GB of RAM. In those cases, deduplication methods based on sorting wo [...]
+Also, the current implementation of Dedupe is strictly limited to 2 billion unique sequences regardless of how much memory is available.
+
+
+Pair Limitations:
+
+Dedupe supports paired reads, but it was not really designed for them. When processing paired reads, some parts of Dedupe are restricted to a single thread due to a complication that causes non-deterministic output. As such, processing paired reads is slower than unpaired reads. Also, pair support is limited to exact matches and overlaps, not containments.
+
+
+JNI acceleration:
+
+Dedupe has an optional C component (written by Jonathan Rood) which will accelerate overlap and containment detection by a lot (at least double). However, it only has an effect if an edit distance is allowed. This can be activated with the "jni" flag, but it must first be compiled. For details on compiling it, see /bbmap/jni/README.txt. When clustering amplicons and allowing an edit distance, the "jni" and "pto" flags are highly recommended as they will dramatically increase speed.
+
+
+Dedupe versus Dedupe2:
+
+Dedupe and Dedupe2 are identical except that Dedupe2 allows an unlimited number of affixes (kmer prefixes and suffixes used for seeding overlap detection). This is only useful when searching for overlaps with a relatively low identity, since kmers are required to have exact matches. More affixes use more memory and slow things down, so don't go overboard. You can call dedupe.sh or dedupe2.sh; internally, either Dedupe or Dedupe2 will get used depending on how many affixes were request [...]
+
+
+*Usage Examples*
+
+
+Exact duplicate removal only:
+dedupe.sh in=X.fa out=Z.fa ac=f
+
+The "ac=f" flag disables containment removal.
+
+
+Exact duplicate and contained sequence removal:
+dedupe.sh in=X.fa out=Y.fa
+
+
+Finding duplicate sequences:
+dedupe.sh in=X.fa out=Y.fa outd=duplicates.fa
+
+All removed sequences will end up in "duplicates.fa".
+
+
+Deduplication of multiple files:
+dedupe.sh in=X1.fa,X2.fa,X3.fa out=Y.fa
+
+
+Deduplication allowing mismatches:
+dedupe.sh in=X.fa out=Y.fa s=5 e=2
+
+This will allow up to 5 substitutions, or 2 edits. What does this mean? Well, 5 substitutions is OK. 2 insertions or 2 deletions is OK. 2 insertions and 3 substitutions is OK. But, 5 insertions is not OK, because the edit distance specifies the bandwidth of the banded aligner, and more than 2 insertions or deletions would go out of bounds. "s=5" alone would allow 5 substitutions and zero indels, while "e=2" alone would allow up to 2 of any mutations (2 subs, 1 sub 1 insertion, etc).
+
+
+Deduplication with a minimum identity:
+dedupe.sh in=X.fa out=Y.fa minidentity=99
+
+This will consider two sequences to be duplicates if their identity is at least 99%. Indels are not allowed unless you specifically set the "e" flag. So, "minidentity=99" would consider 2 1000bp sequences to be duplicates if they had up to 1000*1% = 10 substitutions. "minidentity=99 e=5" would consider 2 1000bp sequences to be duplicates if they had up to 10 mutations, but only up to 5 of them could be indels. "minidentity=99 e=20" would consider 2 1000bp sequences to be duplicates i [...]
+
+
+Clustering by overlap:
+dedupe.sh in=X.fq pattern=cluster%.fq ac=f am=f s=1 mo=200 c pc csf=stats.txt outbest=best.fq fo c mcs=3 cc dot=graph.dot
+
+This will find overlaps (fo) using a min overlap length (mo) of 200 and allowing 1 substitution (s). Then, reads will be clustered (c), and clusters of at least size 3 (mcs) will be written to individual files: cluster1.fq, cluster2.fq, etc. Also, the single best representative of each cluster (based on length and quality scores) will be written to best.fq (this makes more sense for amplicon clustering than assembly). A graph representing the overlaps will be written to graph.dot, [...]
+
+
+Clustering full-length PacBio 16s reads of insert:
+reformat.sh in=reads_of_insert.fastq out=filtered.fq minlen=1420 maxlen=1640 maq=20 qin=33
+then:
+dedupe.sh in=filtered.fq csf=stats_e26.txt outbest=best_e26.fq qin=33 usejni=t am=f ac=f fo c rnc=f mcs=3 k=27 mo=1420 ow cc pto nam=4 e=26 pattern=cluster_%.fq dot=graph.dot
+
+This first filters out low-quality data and probable chimeras (based on length) using Reformat. Then, clustering is done allowing up to 26 edits (this was chosen to allow roughly 99% accurate 1540bp amplicons to overlap; it should be adjusted depending on the accuracy and length of the data). A minimum overlap length is set to 1420bp. "nam=4 k=27" means 4 nonoverlapping 27-mers are used as seeds from each end of the sequences.
+
+
+*Set Operations*
+
+It is possible to do arbitrary set operations (intersection, union, subtraction) with Dedupe, though it's not trivial. They are made possible by the "uniqueonly" flag, which discards all copies of sequences that have duplicates, rather than retaining exactly one. Note that similar operations are possible on kmer sets rather than sequence sets using kcompress.sh.
+
+
+Set creation:
+dedupe.sh in=X.fa out=X2.fa ac=f
+dedupe.sh in=Y.fa out=Y2.fa ac=f
+
+This is a necessary first step to ensure that X2 and Y2 are sets, meaning they have no duplicates.
+
+
+Set union:
+dedupe.sh in=X.fa,Y.fa out=union.fa ac=f
+
+
+Set subtraction:
+dedupe.sh in=X2.fa,union.fa out=Y2_minus_X2.fa uniqueonly ac=f
+
+
+Set symmetric difference:
+dedupe.sh in=X2.fa,Y2.fa out=symmetric_difference.fa uniqueonly ac=f
+
+
+Set intersection:
+dedupe.sh in=X2.fa,symmetric_difference.fa out=intersection.fa uniqueonly ac=f
+
+
diff --git a/docs/guides/PreprocessingGuide.txt b/docs/guides/PreprocessingGuide.txt
new file mode 100755
index 0000000..4d43560
--- /dev/null
+++ b/docs/guides/PreprocessingGuide.txt
@@ -0,0 +1,39 @@
+Preprocessing Guide
+Written by Brian Bushnell
+Last updated December 18, 2015
+
+Prior to doing anything with raw reads - mapping, clustering, assembly, etc - it is usually prudent to do certain preprocessing steps. And these steps are best done in a specific order, which I have detailed below, along with the suggested tool. Note that many of them (like quality-trimming) are optional, so if you do them, do them in this order; but you don't have to do them. Others, like adapter-trimming, are not optional and should always be done.
+
+These steps replicate the QA protocol implemented at JGI for Illumina reads. There is a program "RQCFilter" which implements them as a pipeline, but that is not publicly available because it has numerous hard-coded paths to reference datasets of contaminants.
+
+
+0) Format conversion, if necessary. The simplest format for the subsequent steps is gzipped fastq, with the reads interleaved in a single file if they are paired, but that's not required. However, H5 and SRA formats are not supported, and unaligned bam should be converted to fastq first. Tool: Reformat, Samtools, SRA Toolkit, etc.
+
+1) Adapter-trimming. Always recommended. Tool: BBDuk.
+1b) If chastity-filtering and barcode-filtering were not already done, they can be done here.
+1c) If reads have an extra base at the end (like 2x151bp reads versus 2x150bp), it should be trimmed here with the "ftm=5" flag. That will occur before adapter-trimming.
+
+2) Contaminant filtering for synthetic molecules and spike-ins such as PhiX. Always recommended. Tool: BBDuk.
+2b) Quality-trimming and/or quality-filtering. Optional; only recommended if you have very low-quality data or are doing something very sensitive to low-quality data, like calling very rare variants.
+
+3) Nextera LMP library splitting. Mandatory when processing Nextera long-mate-pair libraries (NOT normal paired Nextera libraries). Tool: SplitNexteraLMP (splitnextera.sh).
+
+4) Human contaminant removal. Optional; only for non-vertebrate studies. Should be done by mapping. JGI also removes cat, dog, and mouse sequences, and we use masked versions of the references to avoid false positives. Tool: BBMap.
+
+5) Quality recalibration. Optional; mainly for when quality scores are very inaccurate, or binned, as in the NextSeq or HiSeq3000+ platforms. Tool: BBMap plus BBDuk.
+5b) This step requires mapping, which requires an assembly. If no assembly exists, one can be generated rapidly with Tadpole.
+
+6) Deduplication. Optional; mainly for exome-capture. This is not actually part of RQCFilter because JGI does not typically do exome-capture. Tool: Either Dedupe or DedupeByMapping can be used if you have sufficient memory. If not, there are 3rd-party deduplication tools based on sorting that do not need much memory.
+
+7) Normalization or subsampling. Optional; mainly for assembly of data with high or uneven coverage. Tool: BBNorm for normalization, Reformat for subsampling.
+
+8) Error correction. Optional; requires adequate coverage. Tool: Tadpole, or BBNorm if Tadpole runs out of memory. If BBNorm will be used, and normalization is desired, error correction can be done at the same time as normalization.
+
+9) Paired-read merging. Optional; mainly for assembly, clustering, or insert-size calculation. Tool: BBMerge.
+9b) RQCFilter runs BBMerge on all paired libraries for insert-size calculation, and uses the "cardinality" flag to simultaneously calculate the approximate number of unique kmers in the dataset, which can help estimate memory needs for assembly.
+
+10) Kmer depth distribution. Optional; mainly for assembly and contamination detection. Tool: BBNorm (khist.sh).
+
+11) BLAST or similar search against wide-taxonomy database such as RefSeq Microbial or nt. This can be done on an assembly of the reads, or a handful of reads. Optional; just for checking for contamination before proceeding. Mainly useful on isolates of a known organism such as human or fruitfly. Tool: BLAST, LAST, etc.
+
+At this point the data is ready to use!
diff --git a/docs/guides/ReformatGuide.txt b/docs/guides/ReformatGuide.txt
new file mode 100755
index 0000000..8354689
--- /dev/null
+++ b/docs/guides/ReformatGuide.txt
@@ -0,0 +1,140 @@
+Reformat Guide
+Written by Brian Bushnell
+Last updated December 14, 2015
+
+Reformat is designed for generic streaming read-processing tasks that have low memory or computational demands, such as format conversion, subsampling, and various filtering operations. Some of its functionality (like quality-trimming, length-filtering, histogram generation) is shared with BBDuk, in which case BBDuk will be faster; but much of it (like converting degenerate bases to N) is unique to Reformat. Because of its lower resource consumption, Reformat is often preferable to BBD [...]
+
+Reformat's parameters are described in its shellscript (reformat.sh). This file provides usage examples of various common tasks.
+
+
+*Notes*
+
+
+Memory:
+
+Reformat needs only a trivial amount of memory for processing short reads, regardless of how many there are. The only situation it would need more memory is when processing very long sequences, such as the human genome, since by default Reformat buffers several hundred sequences in memory at a time; with the human genome, that would be the whole thing (over 3GB). In that situation you can reduce the number of buffered reads with the flags "readbufferlength=1 readbuffers=1", and/or incr [...]
+
+
+Threads:
+
+Reformat only uses a single worker thread, but can use multiple I/O threads and potentially even more compression threads if pigz is installed. The "t" flag will not impact the number of worker threads, but it can be used to cap the number of compression and I/O threads used. However, even with "t=1", Reformat will generally use over 2 CPU cores on average since the I/O is in separate threads.
+
+
+Output streams:
+
+Reformat has 2 standard output streams, "out" and "outs". Normal reads passing any filters being used go to "out"; "outs" only captures singleton reads that pass a filter but whose mate fails the filter.
+
+
+Formats supported:
+
+Please see readme_filetypes.txt.
+
+
+Related tools:
+
+Reformat shares a lot of functionality with BBDuk, which is typically faster but more resource-intensive. However, there is also similar functionality (low-resource, streaming operations) in some tools that seems like it would be in Reformat, but isn't:
+rename.sh does various renaming operations;
+repair.sh/bbsplitpairs.sh does reordering of paired reads that have lost synchronization;
+readlength.sh has more advanced length histogram control options;
+filterbyname.sh for name-based filtering;
+fuse.sh and split.sh for shredding and concatenating sequence;
+phylip2fasta for phylip reformatting;
+translate6frames for AminoAcid<->Nucleotide conversion.
+
+
+*Usage Examples*
+
+To reformat fastq to fasta:
+reformat.sh in=reads.fastq out=reads.fasta
+
+This command is analogous for any file format conversion. For example, "reformat.sh in=reads.fa.gz out=reads.sam" will convert gzipped fasta reads to uncompressed sam. They won't be mapped, of course. If you wish to convert sam to fastq, it is recommended that you add the "primaryonly" flag to avoid getting duplicates of reads.
+
+
+To run reformat in a loop, and automatically rename files appropriately:
+reformat.sh in=read#.fq out=%.fa
+
+This will convert read1.fq and read2.fq (expanded from the # symbol) to read1.fa and read2.fa (expanded from the % symbol).
+
+
+To change quality encoding:
+reformat.sh in=reads.fq out=reads.fq qin=33 qout=64
+
+This will convert ASCII-33 qualities (Sanger, modern Illumina, and all other platforms) to ASCII-64 (obsolete Illumina).
+
+
+To convert between fastq and fasta+qual:
+reformat.sh in=reads.fq out=reads.fa qfout=reads.qual
+or
+reformat.sh in=reads.fa qfin=reads.qual out=reads.fq
+
+
+To interleave or deinterleave paired reads:
+reformat.sh in=reads.fq out1=read1.fq out2=read2.fq
+or
+reformat.sh in1=read1.fq in2=read2.fq out=reads.fq
+or to be concise,
+reformat.sh in=read#.fq out=reads.fq
+
+
+To change fasta word-wrap limits:
+reformat.sh in=reads.fa out=wrapped.fa fastawrap=70
+
+
+To verify that reads appear to be correctly paired, based on their names:
+reformat.sh in=reads.fq vint
+or (for reads in 2 files)
+reformat.sh in=read#.fq vpair
+
+If it is acceptable for reads to have identical names, rather than the usual /1 and /2 or 1: and 2: at the end, add the flag "allowidenticalnames".
+
+
+To discard reads that have mismatching lengths of bases and qualities:
+reformat.sh in=reads.fq out=fixed.fq tossbrokenreads
+
+Note that this should be used with caution as it normally means the input file is corrupt.
+
+
+To add a "/1" and "/2" to the names of paired reads that don't have them:
+reformat.sh in=reads.fq out=renamed.fq addslash int
+
+
+To change whitespace in read names to underscores:
+reformat.sh in=reads.fq out=renamed.fq underscore
+
+Or, to trim read names after the first whitespace:
+reformat.sh in=reads.fq out=renamed.fq trd
+
+BBTools by default always use the full name of a sequence. However, some other programs ignore everything after the first whitespace, so these options are often useful for compatibility with them.
+
+
+To reverse-complement reads:
+reformat.sh in=reads.fq out=out.fq rcomp
+or, for just read2:
+reformat.sh in=reads.fq out=out.fq rcompmate
+
+
+To change lowercase letters to uppercase:
+reformat.sh in=reads.fq out=out.fq tuc
+
+
+To perform arbitrary remapping of input bases:
+reformat.sh in=reads.fq out=out.fq remap=aZGP
+
+The map consists of a series of pairs, in this case "aZ" and "GP". This will change "a" to "Z" and "G" to "P", and ignore all other characters.
+
+
+To convert degenerate bases (IUPAC characters) to Ns:
+reformat.sh in=reads.fq out=out.fq iupacton
+
+
+To ensure all sequences in a file have unique names:
+reformat.sh in=reads.fq out=out.fq uniquenames
+
+If a name is duplicated, the additional copies will have "_<number>" appended to them to ensure all names are unique. Note that unlike most other functions, this is NOT streaming and requires storing all names in memory. As a result, it can use a substantial amount of memory.
+
+
+To cap quality scores into a certain range:
+reformat.sh in=reads.fq out=out.fq mincalledquality=2 maxcalledquality=41
+
+This is useful for preventing abnormal quality scores (such as in error-corrected PacBio reads) that can break some programs.
+
diff --git a/docs/guides/RepairGuide.txt b/docs/guides/RepairGuide.txt
new file mode 100755
index 0000000..f4dde4a
--- /dev/null
+++ b/docs/guides/RepairGuide.txt
@@ -0,0 +1,29 @@
+Repair Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+Repair (or re-pair) is designed to fix files of paired reads that became disordered. With paired reads in 2 files, the first read in file 1 must be the mate of the first read in file 2, etc. For paired reads in a single interleaved file, the second read is the mate of the first read, and the 4th read is the mate of the 3rd read, etc. Using old, non-pair-aware software like Fastx Toolkit is the primary cause of corrupting these files to break the pairing order; when that happens, it's [...]
+
+
+*Notes*
+
+
+Memory:
+
+Repair has two shellscripts, repair.sh and bbsplitpairs.sh. Both call jgi.SplitPairsAndSingles, but bbsplitpairs.sh requests a small amount of memory by default and repair.sh requests all available memory by default. Repairing (repair flag) arbitrarily disordered files will take a lot of memory - potentially, all reads need to be stored in memory. However, fixing a file that was interleaved but processed as unpaired (fint flag) only needs a small amount of memory. "Repair" can also b [...]
+
+
+*Usage Examples*
+
+
+Repairing an arbitrarily disordered file:
+repair.sh in=broken.fq out=fixed.fq outs=singletons.fq repair
+
+
+Repairing disordered dual files:
+repair.sh in1=broken1.fq in2=broken2.fq out1=fixed1.fq out2=fixed2.fq outs=singletons.fq repair
+
+
+Fixing broken interleaving:
+bbsplitpairs.sh in=broken.fq out=fixed.fq outs=singletons.fq fint
+
diff --git a/docs/guides/SealGuide.txt b/docs/guides/SealGuide.txt
new file mode 100755
index 0000000..ab33852
--- /dev/null
+++ b/docs/guides/SealGuide.txt
@@ -0,0 +1,97 @@
+Seal Guide
+Written by Brian Bushnell
+Last updated December 15, 2015
+
+Seal stands for "Sequence Expression AnaLyzer". Seal can be thought of as BBDuk's sibling; the two programs are very similar. So, this guide will focus on the differences; for more details on basics, please see the BBDuk guide. BBDuk associates one kmer with one number (for example, a kmer with the reference it came from). Thus if two references share a kmer, BBDuk will associate it with the first one only; reads containing that kmer will be considered as matching the first reference [...]
+Seal can associate a kmer with an unlimited number of numbers. So it is better in cases where different references may share sequence - related organisms, for example, or adapters that differ only by the barcode... or different isoforms of a gene, which share one or more exons. The uses of Seal are slightly different - it does not do kmer-trimming or kmer-masking. It does kmer-filtering, kmer-binning, and hit stats counting. Unlike BBDuk, Seal does not provide emulated support for K> [...]
+Seal also supports some taxonomic classification operations, though that aspect is still in progress.
+
+Seal's parameters are described in its shellscript (seal.sh). This file provides usage examples of various common tasks.
+
+
+*Notes*
+
+
+Memory:
+
+Seal uses a similar amount of memory as BBDuk (20 bytes) for unique kmers. Additional copies of kmers cost more to store. So, 2 copies of the E.coli genome would require the same amount of memory as 1 copy, with BBDuk; for Seal, it would require somewhat more memory - a lump sum of perhaps 32 extra bytes for each nonunique kmer, plus 4 bytes per extra copy.
+
+
+Ambig Modes:
+
+Like BBMap, Seal has "ambig modes" for detailing how to handle ambiguously-mapping reads (meaning reads that match more than one reference). The modes:
+first: Use the first best-matching reference sequence.
+toss: Consider unmapped.
+random: Select one best-matching reference sequence randomly.
+all: Use all best-matching reference sequences.
+Default is "random", meaning every matching read will get assigned to exactly one reference; if it matches more than one, it will be assigned to one at random, chosen from all best-matching references. For example, if a read shares 2 kmers with reference A, 2 with reference B, and 1 with reference C, it will choose between A and B since they are equally good and both better than C. With ambig=first, ambig=toss, and ambig=random, the sum of the number of reads assigned to various refere [...]
+
+
+Clearzone:
+
+The clearzone is the maximum number of kmer matches separating the best-matching reference from the worst-matching reference. The default is zero, meaning if the best-matching reference has even 1 kmer hit more than the second-best-matching reference, it will still be considered unambiguous. For a concrete example, say a read R shares 10 kmers with ref A, 8 kmers with B, 3 kmers with C, and 0 kmers with D. At clearzone=0, this read unambiguously matches A. At clearzone=2, it ambiguou [...]
+
+
+Match Modes:
+
+Seal has 3 modes for determining how to count reference kmer matches, with the default being "all":
+all: Attempt to match all kmers in each read.
+first: Quit after the first matching kmer.
+unique: Quit after the first uniquely matching kmer.
+"All" is of course the slowest; all kmers are counted, then the references are ordered by the number of shared kmers. "First" is the fastest; as soon as a kmer is matched, counting will stop. The read can still map ambiguously if that first kmer was present in multiple references. "Unique" is in-between; counting will continue until a kmer is encountered that only occurs in exactly one reference (meaning that, errors aside, the read clearly came from that reference). The speed of "un [...]
+
+
+Refnames and Fuse:
+
+By default, references are tracked on a per-sequence basis. That means that one ref file containing 10 sequences would be equivalent to 10 ref files, each containing one sequence; when printing stats, either would yield 10 lines, for example. If you have 2 bacterial assemblies (let's call them A and B) each with 300 contigs, and you just want to see the proportion of reads that best match A versus B, this is really annoying since your stats file will have 300 lines in it (whereas BBSpl [...]
+1) Run fuse.sh on each ref file to concatenate all the sequences into a single sequence. This is (currently) the best approach, as duplicate kmers within a genome will only be stored once. But, it does not work for sequences more than 2Gbp long.
+or
+2) Set "refnames=t". This will report results on a per-reference-file basis rather than a per-sequence basis, though kmers are (currently) still stored on a per-sequence basis. Also, binning will create only 1 output file per reference file.
+
+
+Splitting and output streams:
+
+Like BBSplit, Seal can split input into multiple output streams, creating one output file per reference, containing all the reads that best match that reference (depending to the ambig mode, etc). Unlike BBSplit, Seal does this by kmer-matching rather than alignment, so it is generally faster but uses more memory. Also, the syntax is different; and furthermore, by default, one output file is created per reference sequence (rather than per reference file, in BBSplit). Binning can be ha [...]
+Seal also supports "out" and "outm", which have the same definitions as BBDuk; "out" gets everything NOT MATCHING the references, and "outm" gets everything MATCHING the references.
+
+
+Stats reporting:
+
+Seal has 3 stats outputs - stats, refstats, and rpkm. Stats reports the number and fraction of reads and bases mapping to each ref sequence. RPKM reports fold coverage, RPKM, raw counts, and FPKM of reads and bases mapping to each ref sequence. Refstats is supposed to be like stats but on a per-reference-file basis, but it currently prints the rpkm output on a per-reference-file basis instead.
+
+
+Summarizing stats:
+
+There is a tool called summarizeseal.sh that summarizes multiple sets of seal "stats=" summary files. It's designed for use in cross-contamination analysis, but could be useful in other areas.
+
+
+Paired reads:
+
+Seal can assign reads together, by summing kmer counts of individuals, or independently, using the "kpt" (keeppairstogether) flag, default true.
+
+
+Seal versus BBSplit:
+
+Seal and BBSplit both bin reads into multiple files, or generate statistics, based on which reference they match best. So, which should you use?
+Seal is generally much faster, but uses roughly 3x as much memory (around 20 bytes/base as opposed to BBSplit's 6 bytes/base), though both BBSplit and Seal can be run in lower-memory modes (3 bytes/base for BBSplit, and arbitrarily low for Seal) with a reduction in sensitivity. BBSplit typically has higher sensitivity and specificity. Seal, however, can handle reads (query sequences) of unlimited length, while BBSplit is capped at 6000bp maximum (default 600bp). Also, BBSplit slows do [...]
+
+
+*Usage Examples*
+
+
+To analyze and quantify expression or abundance:
+seal.sh in=reads.fq ref=transcripts.fa stats=sealstats.txt rpkm=sealrpkm.txt ambig=random
+
+
+To summarize statistics of multiple Seal runs on different files:
+summarizeseal.sh sealstats*.txt out=summary.txt
+
+
+To split reads into files by best organism match:
+seal.sh in=reads.fq ref=bacterial_genomes.fa pattern=out_%.fq outu=unmapped.fq ambig=all
+
+
+To display taxonomic information from a dataset:
+seal.sh in=reads.fq ref=organisms.fasta minlevel=phylum maxlevel=phylum tax=tax_results.txt tree=tree.taxtree.gz gi=gitable.int1d.gz
+
+This will list the number of reads hitting various taxonomic groups, at the phylum level. The reference sequences must be annotated with NCBI identifiers (gi numbers or NCBI taxonomy ID numbers). See the TaxonomyGuide for more details.
diff --git a/docs/guides/SplitNexteraGuide.txt b/docs/guides/SplitNexteraGuide.txt
new file mode 100755
index 0000000..3e85cbc
--- /dev/null
+++ b/docs/guides/SplitNexteraGuide.txt
@@ -0,0 +1,22 @@
+SplitNextera Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+SplitNextera splits Nextera LMP libraries into subsets based on linker orientation. It is designed strictly for Nextera LMP (long-mate-pair) reads, not for normal libraries using a Nextera kit. Nextera LMP libraries must be split prior to further processing; they are not usable raw. Adapter-trimming should still be done on Nextera LMP libraries prior to splitting.
+
+
+*Usage Examples*
+
+
+Processing a Nextera LMP library:
+bbduk.sh in=reads.fq out=trimmed.fq ref=adapters.fa ktrim=r k=23 mink=11 hdist=1 tpe tbo
+splitnextera.sh in=trimmed.fq out=lmp.fq outf=fragments.fq outu=unknown.fq outs=singletons.fq mask
+
+This will produce 4 output files - long-mate pairs, fragments (short pairs), singletons, and unknown. The unknown are typically long-mate pairs, but the linker was not found so they might be short pairs. The "mask" flag tells the program to look for the junction. It's possible to alternately look for the junction with BBDuk, instead (see below).
+
+
+Processing a Nextera LMP library, but finding the junctions with BBDuk:
+bbduk.sh in=reads.fq out=trimmed.fq ref=adapters.fa ktrim=r k=23 mink=11 hdist=1 tpe tbo
+bbduk.sh in=trimmed.fq out=stdout.fq ktmask=J k=19 hdist=1 mink=11 hdist2=0 literal=CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG | splitnextera.sh in=stdin.fq out=lmp.fq outf=fragments.fq outu=unknown.fq outs=singletons.fq
+
+This is somewhat faster but will yield the same output.
diff --git a/docs/guides/StatsGuide.txt b/docs/guides/StatsGuide.txt
new file mode 100755
index 0000000..4dec181
--- /dev/null
+++ b/docs/guides/StatsGuide.txt
@@ -0,0 +1,33 @@
+Stats Guide
+Written by Brian Bushnell
+Last updated December 22, 2015
+
+Stats is designed to generate basic assembly statistics such as scaffold count, N50, L50, GC content, gap percent, etc. It can also generate per-sequence GC-content information. The reason for the existence of stats is to replace prior tools that had similar function, but could not scale to large metagenomes; Stats is capable of processing an assembly of practically unbounded size, with sequences of practically unbounded length. And it does this rapidly, in a small amount of memory. [...]
+
+
+*Notes*
+
+
+Memory:
+
+Stats uses 120MB of RAM regardless of the assembly size.
+
+
+Threads:
+
+Stats is singlethreaded; it does not do garbage-collection or even use independent threads for I/O streams, unlike other BBTools.
+
+
+*Usage Examples*
+
+
+To get stats on an assembly:
+stats.sh in=contigs.fa
+
+
+To compare multiple assemblies:
+statswrapper.sh in=a.fa,b.fa,c.fa format=6
+
+
+To print GC and length information per sequence:
+stats.sh in=contigs.fa gc=gc.txt gcformat=4
diff --git a/docs/guides/TadpoleGuide.txt b/docs/guides/TadpoleGuide.txt
new file mode 100755
index 0000000..c1fb870
--- /dev/null
+++ b/docs/guides/TadpoleGuide.txt
@@ -0,0 +1,66 @@
+Tadpole Guide
+Written by Brian Bushnell
+Last updated December 11, 2015
+
+Tadpole is a kmer-based assembler, with additional capabilities of error-correcting and extending reads. It does not do any complicated graph analysis or scaffolding, and therefore, is not particularly good for diploid organisms. However, compared to most other assemblers, it is incredibly fast, has a very low misassembly rate, and is very adept at handling extremely irregular or superhigh coverage distributions. It does not have any annoying side-effects of generating temp files and [...]
+
+Tadpole's parameters are described in its shellscript (tadpole.sh). This file provides usage examples of various common tasks.
+
+
+*Notes*
+
+
+Memory:
+
+Tadpole will, by default, attempt to claim all available memory. It uses approximately 20 bytes per unique kmer for k=1-31, 30 bytes per kmer for k=32-62, and so forth in increments of 31. However, with most datasets, the bulk of the kmers (and thus memory) are unwanted error kmers rather than genomic kmers. It is possible to save memory by making Tadpole ignore low-quality kmers using the "minprob" flag (this ignores kmers that, based on their quality scores, have less than a specifi [...]
+
+
+Processing modes and output streams:
+
+The default mode is contig-building; reads are processed, kmers are counted, then contigs are made from kmers and written to a file. The alternate mode is error correction / extension, which can be entered with the flag "mode=correct" or "mode=extend"; either of those modes supports both error-correction and extension (making the reads longer by assembling at their ends). In contig mode, the reads will be processed once, and the contigs will be written to "out". In correct or extend m [...]
+
+
+Threads:
+
+Tadpole is fully multithreaded, both for kmer-counting and for the output phase (contig-building, error-correction, or extension). You should allow it to use all available processors except when operating on a shared node, in which case you may need to cap the number of threads with the "t" flag.
+
+
+Kmer Length:
+
+Tadpole supports unlimited kmer length, but it does not support all kmer lengths. Specifically, it supports every value of K from 1-31, every multiple of 2 from 32-62 (meaning 32, 34, 36, etc), every multiple of 3 from 63-93, and so forth. There is a wrapper script, tadwrapper.sh, that will assemble a range of different kmer lengths to determine which is best. Typically, about 2/3rds of read length is a good value for K for assembly. For error-correction, about 1/3rd of read length i [...]
+
+
+Shave and Rinse:
+
+These flags examine the graph immediately after kmer-counting is finished, to remove kmers that cause error-induced branches. Specifically, "shave" removes kmers along dead-end paths with very low depth that branch off from a higher-depth path, and "rinse" removes kmers along very-low-depth bubbles that start and end at branches off a higher-depth path. Both are optional and can be applied to any processing mode. They do not currently seem to make a significant difference.
+
+
+Continuity and fragmentation:
+
+Tadpole is designed to be conservative and avoid misassemblies in repetitive regions. As a result, the assemblies may sometimes be more fragmented than necessary. With sufficient coverage and read length, fragmentation can often be reduced by choosing a longer kmer. Alternately, reducing the value of branchmult1 and branchmult2 (to, say, "bm1=8 bm2=2") can often increase the continuity of an assembly, though that does come with an increased risk of misassemblies.
+
+
+*Usage Examples*
+
+Assembly:
+tadpole.sh in=reads.fq out=contigs.fa k=93
+
+This will assemble the reads into contigs. Each contig will consist of unique kmers, so contigs will not overlap by more than K-1 bases. Contigs end when there is a branch or dead-end in the kmer graph. The specific triggers for detecting a branch or dead-end are controlled by the flags mincountextend, branchmult1, branchmult2, and branchlower. Contigs will only be assembled starting with kmers with depth at least mincountseed, and contigs shorter than mincontig or with average cover [...]
+
+
+Error correction:
+tadpole.sh in=reads.fq out=ecc.fa mode=correct k=50
+
+This corrects the reads and outputs corrected reads. Correction is handled by two algorithms, "pincer" and "tail". Pincer corrects errors bidirectionally, using kmers on the left and right; therefore, it can only work on bases in the middle of the read, at least K away from either end. Tail is not as robust, but is able to work on the ends of the read. So, it's best to leave them both enabled, in which case the middle bases are corrected with pincer, and the end bases are corrected w [...]
+
+
+Error marking:
+tadpole.sh in=reads.fq out=ecc.fa mode=correct k=50 ecc=f mbb=2
+
+This will not correct bases, but simply mark bases that appear to be errors by replacing them with N. A base is considered a probable error (in this mode) if it is fully covered by kmers with depth below the value (in this case, 2). Mbb and ecc can be used together.
+
+
+Read Extension:
+tadpole.sh in=reads.fq out=extended.fq mode=extend k=93 el=50 er=50
+
+This will extend reads by up to 50bp to the left and 50bp to the right. Extension will stop prematurely if a branch or dead-end is encountered. Read extension and error-correction may be done at the same time, but that's not always ideal, as they may have different optimal values of K. Error-correction should use kmers shorter than 1/2 read length at the longest; otherwise, the middle of the read can't get corrected.
diff --git a/docs/guides/TaxonomyGuide.txt b/docs/guides/TaxonomyGuide.txt
new file mode 100755
index 0000000..0a908a7
--- /dev/null
+++ b/docs/guides/TaxonomyGuide.txt
@@ -0,0 +1,91 @@
+Taxonomy Guide
+Written by Brian Bushnell
+Last updated December 16, 2015
+
+
+BBTools contains a taxonomy package designed for processing NCBI taxonomy information, which is in the form of taxonomy files, and sequence name annotations in sequence files from NCBI's ftp site. The related tools allow you to filter or bin annotated sequences by taxonomy, or use Seal to classify read abundance at a specific taxonomic level.
+
+
+*Notes*
+
+
+Acquiring taxonomy data:
+
+First, the taxonomy files must be downloaded from NCBI. They are currently available here:
+ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
+and
+ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
+In Linux, it is most convenient to fetch them like this:
+wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
+Before further use, these must be processed by gitable.sh and taxtree.sh. taxdmp.zip needs to be unzipped first. For more details see the Usage Examples section.
+
+
+Acquiring sequence data:
+
+NCBI is constantly rearranging its site, but currently, you can get sequence data here:
+ftp://ftp.ncbi.nlm.nih.gov/genomes/
+And specifically, ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Bacteria/all.fna.tar.gz contains all of the RefSeq bacterial data. The current version should be available here: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/bacteria/*.fna*, though that's inconvenient as it's in many files. There are also other clades in ftp://ftp.ncbi.nlm.nih.gov/refseq/release/
+
+
+Sequence naming conventions:
+
+The only requirement for reference sequence data is that it is named in a recognizable manner. This includes 3 possibilities:
+1) It starts with a gi number, in this format:
+gi|123|other stuff
+This is the current naming convention used by all NCBI data.
+2) It starts with an NCBI taxonomic identifier, in this format:
+ncbi|123|other stuff
+3) It is a proper Genus_species pair, with whitespace replaced by underscores, like this:
+Homo_sapiens
+Note that 3 is not recommended because there could be a name collision, but in general, it should be fine. It is not strictly necessary to replace whitespace with underscores, but that is often convenient. For taxa above the species level, just a single taxonomic name is fine, such as "Gammaproteobacteria". Sequence names are case-insensitive.
+
+
+Specifying processed taxonomy files:
+
+Most taxonomy-aware tools (Seal, FilterByTaxa, etc) require the path to the tree and/or gitable files to be specified in the command line. The default locations of processed taxonomy files are hard-coded in TaxTree.DefaultTreeFile and DefaultTableFile, currently (on Genepool, JGI's cluster) at /global/projectb/sandbox/gaag/bbtools/tax. So, on Genepool, you can use the flag "tree=auto" instead of the full path, and the default will be used. For non-JGI users the full path would be needed.
+
+
+Memory and Threads:
+
+The taxonomy-related tools (other than Seal) are all single-threaded and relatively low-memory.
+
+
+*Usage Examples*
+
+
+Creating a TaxTree file:
+taxtree.sh names.dmp nodes.dmp tree.taxtree.gz
+
+names.dmp and nodes.dmp come from unzipping taxdmp.zip (see "Acquiring taxonomy data"). The TaxTree file is needed by every tool processing taxonomy. Note that this command's flags are order-sensitive.
+
+
+Creating a GiTable file:
+gitable.sh gi_taxid_nucl.dmp.gz gitable.int1d.gz
+
+gi_taxid_nucl.dmp.gz comes from NCBI (see "Acquiring taxonomy data"). The GiTable is needed by tools processing taxonomy, but only if reference sequences are named with gi numbers (e.g. gi|123|stuff). Note that this command's flags are order-sensitive.
+
+
+Renaming gi numbers to NCBI Tax ID numbers:
+gi2taxid.sh in=bacteria.fa out=renamed.fa gi=gitable.int1d.gz
+
+This is an optional step, but will rename sequences such as "gi|123|stuff" to "ncbi|456|stuff". Replacing the gi number with a Tax ID number means in subsequent steps the gitable is no longer needed, which is more efficient.
+
+
+Filtering sequences by taxonomy, according to sequence names:
+filterbytaxa.sh in=bacteria.fa out=filtered.fa tree=tree.taxtree.gz table=gitable.int1d.gz names=Escherichia_coli level=phylum include=t
+
+This will create a file, "filtered.fa", containing all the sequences in the same phylum as E.coli. It is also possible to use numeric taxonomic IDs with the "ids" flag, or create a file containing everything except E.coli's phylum using the "include=f" flag. Gitable is not needed unless the sequences are named with gi numbers.
+
+
+Binning sequences by taxonomy, according to sequence names:
+splitbytaxa.sh in=bacteria.fa out=%.fa tree=tree.taxtree.gz table=gitable.int1d.gz level=phylum
+
+This will split the file into many output files, such as Deinococcus-Thermus.fa and Bacteroidetes.fa. For taxonomic binning based on sequence content rather than names, see Seal. Gitable is not needed unless the sequences are named with gi numbers.
+
+
+Printing the taxonomic ancestry of a taxon:
+taxonomy.sh tree=tree.taxtree.gz table=gitable.int1d.gz homo_sapiens meiothermus_ruber 123 gi_123
+
+This will print the complete taxonomy of Homo sapiens, Meiothermus ruber, and the organism has a sequence with an NCBI identifier of 123 (Pirellula), and the organism with a gi number of 123 (Bos taurus). Note that an underscore was used instead of the vertical line (gi_123) because vertical line is a reserved symbol for piping, so it's annoying to use on the command line. Also note that "123" or "ncbi_123" indicate a taxonomy number, while "gi_123" indicates a gi number. Gitable is n [...]
+
+
diff --git a/docs/readme.txt b/docs/readme.txt
new file mode 100755
index 0000000..4c90b0e
--- /dev/null
+++ b/docs/readme.txt
@@ -0,0 +1,34 @@
+BBMap/BBTools readme
+Written by Brian Bushnell
+Last updated December 23, 2015
+
+The BBTools package was written by Brian Bushnell, with the exception of the (optional, but faster) C, JNI, and MPI components, which were written by Jonathan Rood.
+
+All tools in the BBTools package are free to use. If you use BBTools in work leading to a publication, and BBTools has not yet been published, please cite it something like this:
+BBMap - Bushnell B. - sourceforge.net/projects/bbmap/
+
+License:
+
+The BBMap package is open source and free to use with no restrictions. For more information, please read Legal.txt and license.txt.
+
+Documentation:
+
+Documentation is in the /bbmap/docs/ directory, and in each tool's shellscript in /bbmap/.
+readme.txt: This file.
+UsageGuide.txt: Contains basic installation and usage information. Please read this first!
+ToolDescriptions.txt: Contains a list of all BBTools, a description of what they do, and their hardware requirements.
+compiling.txt: Information on compiling JNI code.
+readme_config.txt: Usage information about config files.
+readme_filetypes.txt: More detailed information on file formats supported by BBTools.
+changelog.txt: List of changes by version, and current known issues.
+
+Tool-specific Guides:
+
+Some tools have specific guides, like BBDukGuide.txt. They are in /bbmap/docs/guides/. For complete documentation of a tool, I recommend that you read UsageGuide.txt first (which covers the shared functionality of all tools), then the tool's specific guide if it has one (such as ReformatGuide.txt), then the tool's shellscript (such as reformat.sh) which lists all of the flags.
+
+If you have any questions not answered in the documentation, please look at the relevant SeqAnswers thread (linked from here: http://seqanswers.com/forums/showthread.php?t=41057) and post a question there if it is not already answered. You can also contact JGI's BBTools team at bbtools at lbl.gov, or me at bbushnell at lbl.gov. But please read the documentation first.
+
+Special thanks for help with shellscripts goes to:
+Alex Copeland (JGI), Douglas Jacobsen (JGI/NERSC), Bill Andreopoulos (JGI), sdriscoll (SeqAnswers), Jon Rood (JGI/NERSC), and Elmar Pruesse (UC Denver).
+
+Special thanks for helping to support BBTools goes to Genomax (SeqAnswers).
diff --git a/docs/readme_config.txt b/docs/readme_config.txt
new file mode 100755
index 0000000..2d4e965
--- /dev/null
+++ b/docs/readme_config.txt
@@ -0,0 +1,30 @@
+BBTools Config File Readme
+Written by Brian Bushnell
+Last updated May 12, 2015
+
+A config file is a text file with a set of parameters that will be added to the command line.
+The format is one parameter per line, with the # symbol indicating comments.
+To use a config file, use the config=file flag. For example, take BBDuk:
+
+bbduk.sh in=reads.fq out=trimmed.fq ref=ref.fa k=23 mink=11 hdist=1 tbo tpe
+
+That is equivalent to:
+
+bbduk.sh in=reads.fq out=trimmed.fq ref=ref.fa config=trimadapters.txt
+...if trimadapters.txt contained these lines:
+k=23
+mink=11
+hdist=1
+tbo
+tpe
+
+
+Any parameter placed AFTER the config file will override the same parameter if it is in the config file.
+For example, in this case k=20 will be used:
+bbduk.sh in=reads.fq out=trimmed.fq ref=ref.fa config=trimadapters.txt k=20
+
+But in this case, k=23 will be used, from the config file:
+bbduk.sh in=reads.fq out=trimmed.fq ref=ref.fa k=20 config=trimadapters.txt
+
+What are config files for? Well, mainly, to overcome difficulties like whitespace in file paths, or command lines that are too long.
+There are some example config files in bbmap/config/. They are not used unless you specifically tell a program to use them.
diff --git a/docs/readme_filetypes.txt b/docs/readme_filetypes.txt
new file mode 100755
index 0000000..d281dd2
--- /dev/null
+++ b/docs/readme_filetypes.txt
@@ -0,0 +1,34 @@
+BBTools are sensitive to filename extensions. For example, this command:
+reformat.sh in=reads.fq out=reads.fa.gz
+...will convert reads from fastq format to gzipped fasta. The recognized sequence file extensions are as follows:
+
+fastq (fq)
+fasta (fa, fna, fas, ffn, frn, seq, fsa, faa)
+sam
+bam [requires samtools]
+qual
+scarf [input only]
+phylip [input only; only supported by phylip2fasta.sh]
+header [output only]
+
+The recognized compression extensions:
+
+gzip (gz) [can be accelerated by pigz]
+zip
+bz2 [requires bzip2 or pbzip2]
+
+In order to stream using standard in or standard out, it is recommended to include the format. For example:
+cat data.fq.gz | reformat.sh in=stdin.fq.gz out=stdout.fa > file.fa
+This allows the tool to determine the format. Otherwise it will revert to the default.
+
+BBTools can usually determine the type of sequence data by examining the contents. To test this, run:
+fileformat.sh in=file
+
+...which will print the way the data is detected, e.g. Sanger (ASCII-33) quality, interleaved, etc. These can normally be overridden with the "qin" and "interleaved" flags.
+
+When BBTools are processing gzipped files, they may, if possible, attempt to spawn a pigz process to accelerate it. This behavior can be forced with the "pigz=t unpigz=t" flags, or prevented with "pigz=f unpigz=f"; otherwise, the default behavior depends on the tool. In some cluster configurations, and some Amazon nodes, spawning a process may cause the program to be killed with an indication that it used too much virtual memory. I recommend pigz be enabled unless that scenario occurs.
+
+The most recent extension added is "header". You can use it like this:
+reformat.sh in=reads.fq out=reads.header minlen=100
+
+That will create a file containing headers of reads that pass the "minlen" filter.
diff --git a/ecc.sh b/ecc.sh
new file mode 100755
index 0000000..bfcf215
--- /dev/null
+++ b/ecc.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#ecc.sh in=<infile> out=<outfile>
+
+usage(){
+echo "
+Description: Corrects substitution errors in reads using kmer depth information.
+Can also normalize and/or bin reads by kmer depth.
+
+Usage: ecc.sh in=<input> out=<reads to keep> outt=<reads to toss> hist=<histogram output>
+
+Please see bbnorm.sh for more information.
+All the flags are the same, only the parameters (near the bottom of this file) differ.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx31g"
+z2="-Xms31g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 31000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+correct() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z $z2 -cp $CP jgi.KmerNormalize bits=16 ecc=t passes=1 keepall dr=f prefilter $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+correct "$@"
diff --git a/estherfilter.sh b/estherfilter.sh
new file mode 100755
index 0000000..6b0a2be
--- /dev/null
+++ b/estherfilter.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#estherfilter.sh <query> <reference> <cutoff>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified January 21, 2015
+
+Description: BLASTs queries against reference, and filters out hits with
+ scores less than 'cutoff'. The score is taken from column 12
+ of the BLAST output. The specific BLAST command is:
+ blastall -p blastn -i QUERY -d REFERENCE -e 0.00001 -m 8
+
+Usage: estherfilter.sh <query> <reference> <cutoff>
+
+For example:
+
+estherfilter.sh reads.fasta genes.fasta 1000 > results.txt
+
+'fasta' can be used as a fourth argument to get output in Fasta format. Requires more memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx3200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+estherfilter() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module load oracle-jdk/1.7_64bit
+ module load blast
+ fi
+ local CMD="java $EA $z -cp $CP driver.EstherFilter $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+estherfilter "$@"
diff --git a/filterbarcodes.sh b/filterbarcodes.sh
new file mode 100755
index 0000000..92b26f9
--- /dev/null
+++ b/filterbarcodes.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#filterbarcodes in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Filters barcodes by quality, and generates quality histograms.
+
+Usage: filterbarcodes.sh in=<file> out=<file> maq=<integer>
+
+
+Input parameters:
+in=<file> Reads that have already been muxed with barcode qualities using mergebarcodes.sh.
+interleaved=auto (int) If true, forces fastq input to be paired and interleaved.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+
+Output parameters:
+out=<file> Write filtered reads here. 'out=stdout.fq' will pipe to standard out.
+cor=<file> Correlation between read and index qualities.
+bqhist=<file> Barcode quality histogram by position.
+baqhist=<file> Barcode average quality histogram.
+bmqhist=<file> Barcode min quality histogram.
+overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
+fastawrap=80 Length of lines in fasta output.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+maq=0 Filter reads with barcode average quality less than this.
+mmq=0 Filter reads with barcode minimum quality less than this.
+
+Other parameters:
+pigz=t Use pigz to compress. If argument is a number, that will set the number of pigz threads.
+unpigz=t Use pigz to decompress.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+filterbarcodes() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.CorrelateBarcodes $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+filterbarcodes "$@"
diff --git a/filterbycoverage.sh b/filterbycoverage.sh
new file mode 100755
index 0000000..f58dc7c
--- /dev/null
+++ b/filterbycoverage.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+#filterbycoverage in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 4, 2015
+
+Description: Filters an assembly by contig coverage.
+
+Usage: filterbycoverage.sh in=<assembly> cov=<coverage stats> out=<filtered assembly> mincov=5
+
+in2 and out2 are for paired reads and are optional.
+If input is paired and there is only one output file, it will be written interleaved.
+
+
+Parameters:
+in=<file> File containing input assembly.
+cov=<file> File containing coverage stats generated by pileup.
+cov0=<file> Optional file containing coverage stats before normalization.
+out=<file> Destination of clean output assembly.
+outd=<file> (outdirty) Destination of dirty output containing only removed contigs.
+minc=5 (mincov) Discard contigs with lower average coverage.
+minp=40 (minpercent) Discard contigs with a lower percent covered bases.
+minr=0 (minreads) Discard contigs with fewer mapped reads.
+minl=1 (minlength) Discard contigs shorter than this (after trimming).
+trim=0 (trimends) Trim the first and last X bases of each sequence.
+ratio=0 If cov0 is set, contigs will not be removed unless the coverage ratio (of cov to cov0) is at least this (0 disables it).
+ow=t (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+int=f (interleaved) Determines whether INPUT file is considered interleaved.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx800m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 800m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+function filterbycoverage() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP jgi.FilterByCoverage $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+filterbycoverage "$@"
diff --git a/filterbyname.sh b/filterbyname.sh
new file mode 100755
index 0000000..302d2a2
--- /dev/null
+++ b/filterbyname.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#filterbyname in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified July 23, 2015
+
+Description: Filters reads by name.
+
+Usage: filterbyname.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> names=<string,string,string> include=<t/f>
+
+in2 and out2 are for paired reads and are optional.
+If input is paired and there is only one output file, it will be written interleaved.
+
+
+Parameters:
+include=f Set to 'true' to include the filtered names rather than excluding them.
+substring=f Allow one name to be a substring of the other, rather than a full match.
+ f: No substring matching.
+ t: Bidirectional substring matching.
+ header: Allow input read headers to be substrings of names in list.
+ name: Allow names in list to be substrings of input read headers.
+casesensitive=t (case) Match case also.
+ow=t (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+int=f (interleaved) Determines whether INPUT file is considered interleaved.
+names= A list of strings or files. The files can have one name per line, or
+ be a standard read file (fasta, fastq, or sam).
+minlen=0 Do not output reads shorter than this.
+truncate=f (ths) Ignore a leading @ or > symbol in the names file.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx800m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 800m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+function filterbyname() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP driver.FilterReadsByName $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+filterbyname "$@"
diff --git a/filterbysequence.sh b/filterbysequence.sh
new file mode 100755
index 0000000..799347c
--- /dev/null
+++ b/filterbysequence.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#filterbysequence in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 18, 2015
+
+Description: Filters sequences by exact sequence matches.
+
+Usage: filterbysequence.sh in=<file> out=<file> ref=<file> include=<t/f>
+
+
+I/O Parameters:
+in= Primary input. 'in2' will specify a second file.
+out= Primary out. 'out2' will specify a second file.
+ref= A reference file or comma-delimited list of files.
+literal= A literal sequence or comma-delimited list of sequences.
+ow=t (overwrite) Overwrites files that already exist.
+zl=2 (ziplevel) Set compression level, 1 (low) to 9 (max).
+
+Processing Parameters:
+include=f Set to 'true' to include the filtered sequences rather
+ than excluding them.
+rcomp=t Match reverse complements as well.
+casesensitive=f (case) Require matching case.
+storebases=t (sb) Store ref bases. Requires more memory. If false,
+ case-sensitive matching cannot be done, and the matching
+                    will be probabilistic based on 128-bit hashcodes.
+threads=auto (t) Specify the number of worker threads.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx800m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 800m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+function filterbysequence() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP jgi.FilterBySequence $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+filterbysequence "$@"
diff --git a/filterbytaxa.sh b/filterbytaxa.sh
new file mode 100755
index 0000000..5a88922
--- /dev/null
+++ b/filterbytaxa.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+#filterbytaxa in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 15, 2015
+
+Description: Filters sequences according to their taxonomy,
+as determined by the sequence name. Sequences should
+be labeled with a gi number, NCBI taxID, or species name.
+
+Usage: filterbytaxa.sh in=<input file> out=<output file> tree=<tree file> table=<table file> ids=<numbers> level=<name or number>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Standard parameters:
+in=<file> Primary input, or read 1 input.
+out=<file> Primary output, or read 1 output.
+overwrite=f (ow) Set to false to force the program to abort rather than
+ overwrite an existing file.
+showspeed=t (ss) Set to 'f' to suppress display of processing speed.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression
+ level; lower compression is faster.
+
+Processing parameters:
+level= Taxonomic level, such as phylum. Filtering will operate on
+ sequences within the same taxonomic level as specified ids.
+ids= Comma-delimited list of NCBI numeric IDs.
+names= Alternately, a list of names (such as 'Homo sapiens').
+ Note that spaces need special handling.
+include=f 'f' will discard filtered sequences, 't' will keep them.
+tree= A taxonomic tree made by TaxTree, such as tree.taxtree.gz.
+table= A table translating gi numbers to NCBI taxIDs.
+ Only needed if gi numbers will be used.
+* Note *
+Tree and table files are in /global/projectb/sandbox/gaag/bbtools/tax
+For non-Genepool users, or to make new ones, use taxtree.sh and gitable.sh
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx4g"
+z2="-Xms4g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 1000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+filterbytaxa() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP tax.FilterByTaxa $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+filterbytaxa "$@"
diff --git a/filterlines.sh b/filterlines.sh
new file mode 100755
index 0000000..4e7c511
--- /dev/null
+++ b/filterlines.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#filterlines in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified July 6, 2015
+
+Description: Filters lines by exact match or substring.
+
+Usage: filterlines.sh in=<file> out=<file> names=<file> include=<t/f>
+
+
+Parameters:
+include=f Set to 'true' to include the filtered names rather than excluding them.
+prefix=f Allow matching of only the line's prefix (all characters up to first whitespace).
+substring=f Allow one name to be a substring of the other, rather than a full match.
+ f: No substring matching.
+ t: Bidirectional substring matching.
+ line: Allow input lines to be substrings of names in list.
+ name: Allow names in list to be substrings of input lines.
+casesensitive=t (case) Match case also.
+ow=t (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+names= A list of strings or files, comma-delimited. Files must have one name per line.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx800m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 800m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+function filterlines() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP driver.FilterLines $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+filterlines "$@"
diff --git a/filtersubs.sh b/filtersubs.sh
new file mode 100755
index 0000000..b496155
--- /dev/null
+++ b/filtersubs.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#filtersubs in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified August 24, 2015
+
+Description: Filters a sam file to select only reads with substitution errors
+for bases with quality scores in a certain interval. Used for manually
+examining specific reads that may have incorrectly calibrated quality scores.
+
+Usage: filtersubs.sh in=<file> out=<file> minq=<number> maxq=<number>
+
+Parameters:
+in=<file> Input sam or bam file.
+out=<file> Output file.
+minq=0 Keep only reads with substitutions of at least this quality.
+maxq=99 Keep only reads with substitutions of at most this quality.
+countindels=t Also keep reads with indels in the quality range.
+keepperfect=f Also keep error-free reads.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx120m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+filtersubs() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP jgi.FilterReadsWithSubs $@"
+# echo $CMD >&2
+ eval $CMD
+}
+
+filtersubs "$@"
diff --git a/fungalrelease.sh b/fungalrelease.sh
new file mode 100755
index 0000000..34d03c1
--- /dev/null
+++ b/fungalrelease.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+#fungalrelease in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 10, 2015
+
+Description: Reformats a fungal assembly for release.
+Also creates contig and agp files.
+
+Usage: fungalrelease.sh in=<input file> out=<output file>
+
+
+File parameters:
+in=<file> Input scaffolds.
+out=<file> Output scaffolds.
+outc=<file> Output contigs.
+qfin=<file> Optional quality scores input.
+qfout=<file> Optional quality scores output.
+qfoutc=<file> Optional contig quality scores output.
+agp=<file> Output AGP file.
+legend=<file> Output name legend file.
+overwrite=f (ow) Set to false to force the program to abort rather than
+ overwrite an existing file.
+
+Processing parameters:
+fastawrap=60 Wrap length for fasta lines.
+tuc=t Convert sequence to upper case.
+baniupac=t Crash on encountering a non-ACGTN base call.
+mingap=10 Expand all gaps (Ns) to be at least this long.
+mingapin=1 Only expand gaps that are at least this long.
+sortcscaffolds=t Sort scaffolds descending by length.
+sortcontigs=f Sort contigs descending by length.
+renamescaffolds=t Rename scaffolds to 'scaffold_#'.
+scafnum=1 Number of first scaffold.
+renamecontigs=f Rename contigs to 'contig_#' instead of 'scafname_c#'.
+contignum=1 Number of first contig; only used if renamecontigs=t.
+minscaf=1 Only retain scaffolds at least this long.
+mincontig=1 Only retain contigs at least this long.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx4g"
+z2="-Xms4g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+}
+calcXmx "$@"
+
+fungalrelease() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.FungalRelease $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+fungalrelease "$@"
diff --git a/fuse.sh b/fuse.sh
new file mode 100755
index 0000000..991ccff
--- /dev/null
+++ b/fuse.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#fuse in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified October 23, 2015
+
+Description: Fuses sequences together, padding gaps with Ns.
+Does not support total length greater than 2 billion.
+
+Usage: fuse.sh in=<input file> out=<output file> pad=<number of Ns>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+Optional parameters (and their defaults)
+
+in=<file> The 'in=' flag is needed if the input file is not the
+ first parameter. 'in=stdin' will pipe from standard in.
+out=<file> The 'out=' flag is needed if the output file is not the
+ second parameter. 'out=stdout' will pipe to standard out.
+pad=300 Pad this many N between sequences.
+quality=30 Fake quality scores, if generating fastq from fasta.
+overwrite=t (ow) Set to false to force the program to abort rather
+ than overwrite an existing file.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change
+ compression level; lower compression is faster.
+fusepairs=f Default mode fuses all sequences into one long sequence.
+ Setting fusepairs=t will instead fuse each pair together.
+name= Set name of output sequence. Default is the name of
+ the first input sequence.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will
+ specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx2g"
+z2="-Xms2g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 2000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+fuse() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.FuseSequence $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+fuse "$@"
diff --git a/getreads.sh b/getreads.sh
new file mode 100755
index 0000000..3b9753d
--- /dev/null
+++ b/getreads.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Selects reads with designated numeric IDs.
+
+Usage: getreads.sh in=<file> id=<number,number,number...> out=<file>
+
+The first read (or pair) has ID 0, the second read (or pair) has ID 1, etc.
+
+Parameters:
+in=<file> Specify the input file, or stdin.
+out=<file> Specify the output file, or stdout.
+id= Comma delimited list of numbers or ranges, in any order.
+ For example: id=5,93,17-31,8,0,12-13
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Let calcmem.sh inspect the arguments; no free-memory probing is done in this script.
+ source "$DIR""/calcmem.sh" # expected to define parseXmx, which is not defined in this script
+ parseXmx "$@" # presumably overrides z when the user supplied -Xmx; confirm in calcmem.sh
+}
+calcXmx "$@"
+
+function tf() { # Run jgi.GetReads with the selected JVM flags and every script argument.
+ if [[ $NERSC_HOST == genepool ]]; then # environment-module setup, only when NERSC_HOST is 'genepool'
+ module load oracle-jdk/1.7_64bit
+ fi
+ local CMD="java $EA $z -cp $CP jgi.GetReads $@" # "$@" is flattened into this single string
+ echo $CMD >&2 # show the command line on stderr
+ eval $CMD # eval re-parses the string, so arguments containing spaces or shell metacharacters are split/interpreted again
+}
+
+tf "$@"
\ No newline at end of file
diff --git a/gi2taxid.sh b/gi2taxid.sh
new file mode 100755
index 0000000..672f3de
--- /dev/null
+++ b/gi2taxid.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#gi2taxid in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell.
+Last modified December 15, 2015
+
+Description: Renames fasta sequences with gi numbers to NCBI taxa IDs.
+
+Usage: gi2taxid.sh in=<file> out=<file> gi=<file>
+
+Parameters:
+in=<file> Input sequences; required parameter. Must be fasta.
+out=<file> Destination for renamed sequences.
+invalid=<file> Destination for headers with no taxid.
+gi=<file> 2-column tsv with gi and taxid numbers.
+prefix=f Append the taxid as a prefix to the old header.
+ziplevel=2 (zl) Compression level for gzip output.
+pigz=f Spawn a pigz (parallel gzip) process for faster
+ compression than Java. Requires pigz to be installed.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx800m will specify 800 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+z="-Xmx7g"
+z2="-Xms7g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Choose the JVM heap flags z (-Xmx) and z2 (-Xms) for this run.
+ source "$DIR""/calcmem.sh" # expected to define parseXmx and freeRam, which are not defined in this script
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then # presumably parseXmx sets 'set' to 1 when the user supplied -Xmx; confirm in calcmem.sh
+ return
+ fi
+ freeRam 7000m 84 # argument meaning is defined in calcmem.sh; presumably leaves a size in megabytes in RAM - TODO confirm
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+
+gi2taxid() { # Run tax.RenameGiToNcbi with the selected JVM flags and every script argument.
+ if [[ $NERSC_HOST == genepool ]]; then # environment-module setup, only when NERSC_HOST is 'genepool'
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z $z2 -cp $CP tax.RenameGiToNcbi $@" # both $z and $z2 are passed; "$@" is flattened into this single string
+ echo $CMD >&2 # show the command line on stderr
+ eval $CMD # eval re-parses the string, so arguments containing spaces or shell metacharacters are split/interpreted again
+}
+
+gi2taxid "$@"
diff --git a/gitable.sh b/gitable.sh
new file mode 100755
index 0000000..e7c0f70
--- /dev/null
+++ b/gitable.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#gitable gi_taxid_nucl.dmp.gz gitable.int1d.gz
+
+usage(){
+echo "
+Written by Brian Bushnell.
+Last modified December 15, 2015
+
+Description: Creates gitable.int1d from gi_taxid_nucl.dmp.
+gitable.int1d is a much more efficient representation,
+allowing easy translation of gi numbers to ncbi taxids.
+gi_taxid_nucl.dmp is at ftp://ftp.ncbi.nih.gov/pub/taxonomy/
+
+Usage: gitable.sh gi_taxid_nucl.dmp.gz gitable.int1d.gz
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+z="-Xmx12g"
+z2="-Xms12g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Choose the JVM heap flags z (-Xmx) and z2 (-Xms) for this run.
+ source "$DIR""/calcmem.sh" # expected to define parseXmx and freeRam, which are not defined in this script
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then # presumably parseXmx sets 'set' to 1 when the user supplied -Xmx; confirm in calcmem.sh
+ return
+ fi
+ freeRam 12000m 84 # argument meaning is defined in calcmem.sh; presumably leaves a size in megabytes in RAM - TODO confirm
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+
+gitable() { # Run tax.GiToNcbi with the selected JVM flags and every script argument.
+ if [[ $NERSC_HOST == genepool ]]; then # environment-module setup, only when NERSC_HOST is 'genepool'
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z $z2 -cp $CP tax.GiToNcbi $@" # both $z and $z2 are passed; "$@" is flattened into this single string
+ echo $CMD >&2 # show the command line on stderr
+ eval $CMD # eval re-parses the string, so arguments containing spaces or shell metacharacters are split/interpreted again
+}
+
+gitable "$@"
diff --git a/grademerge.sh b/grademerge.sh
new file mode 100755
index 0000000..3efd646
--- /dev/null
+++ b/grademerge.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#grademerge in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Grades correctness of merging synthetic reads with headers
+ generated by RandomReads and re-headered by RenameReads.
+
+Usage: grademerge.sh in=<file>
+
+Parameters:
+in=<file> Specify the input file, or 'stdin'.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+function grademerge() { # Run jgi.GradeMergedReads with a fixed 200 MB heap and every script argument.
+ if [[ $NERSC_HOST == genepool ]]; then # environment-module setup, only when NERSC_HOST is 'genepool'
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA -Xmx200m -cp $CP jgi.GradeMergedReads $@" # heap is hard-coded here; "$@" is flattened into this single string
+# echo $CMD >&2
+ eval $CMD # eval re-parses the string, so arguments containing spaces or shell metacharacters are split/interpreted again
+}
+
+grademerge "$@"
diff --git a/gradesam.sh b/gradesam.sh
new file mode 100755
index 0000000..bd8104e
--- /dev/null
+++ b/gradesam.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#gradesam in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified May 23, 2014
+
+Description: Grades mapping correctness of a sam file of synthetic reads with headers generated by RandomReads3.java
+
+Usage: gradesam.sh in=<sam file> reads=<number of reads>
+
+
+Parameters:
+in=<file> Specify the input sam file, or stdin.
+reads=<int> Number of reads in mapper's input (i.e., the fastq file).
+thresh=20 Max deviation from correct location to be considered 'loosely correct'.
+blasr=f Set to 't' for BLASR output; fixes extra information added to read names.
+ssaha2=f Set to 't' for SSAHA2 or SMALT output; fixes incorrect soft-clipped read locations.
+quality=3 Reads with a mapping quality of this or below will be considered ambiguously mapped.
+bitset=t Track read ID's to detect secondary alignments.
+ Necessary for mappers that incorrectly output multiple primary alignments per read.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+function gradesam() { # Run align2.GradeSamFile with a fixed 200 MB heap and every script argument.
+ if [[ $NERSC_HOST == genepool ]]; then # environment-module setup, only when NERSC_HOST is 'genepool'
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load samtools
+ module load pigz
+ fi
+ local CMD="java $EA -Xmx200m -cp $CP align2.GradeSamFile $@" # heap is hard-coded here; "$@" is flattened into this single string
+# echo $CMD >&2
+ eval $CMD # eval re-parses the string, so arguments containing spaces or shell metacharacters are split/interpreted again
+}
+
+gradesam "$@"
diff --git a/idmatrix.sh b/idmatrix.sh
new file mode 100755
index 0000000..44bc6e2
--- /dev/null
+++ b/idmatrix.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#idmatrix in=<file> out=<file>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 25, 2014
+
+Description: Generates an identity matrix via all-to-all alignment.
+
+Usage: idmatrix.sh in=<file> out=<file>
+
+Parameters:
+
+in=<file> File containing reads. in=stdin.fa will pipe from stdin.
+out=<file> Matrix output. out=stdout will pipe to stdout.
+threads=auto (t) Set number of threads to use; default is number of
+ logical processors.
+percent=f Output identity as percent rather than a fraction.
+edits= Allow at most this much edit distance. Default is the
+ length of the longest input sequence. Lower is faster.
+width= Alignment bandwidth, lower is faster. Default: 2*edits+1.
+usejni=f (jni) Do alignments faster, in C code. Requires
+ compiling the C code; details are in /jni/README.txt.
+
+Java Parameters:
+
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will specify
+ 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+z="-Xmx2g"
+z2="-Xms2g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () { # Choose the JVM heap flags z (-Xmx) and z2 (-Xms) for this run.
+ source "$DIR""/calcmem.sh" # expected to define parseXmx and freeRam, which are not defined in this script
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then # presumably parseXmx sets 'set' to 1 when the user supplied -Xmx; confirm in calcmem.sh
+ return
+ fi
+ freeRam 3200m 84 # argument meaning is defined in calcmem.sh; presumably leaves a size in megabytes in RAM - TODO confirm
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+idmatrix() { # Run jgi.IdentityMatrix with the selected JVM flags and every script argument.
+ if [[ $NERSC_HOST == genepool ]]; then # environment-module setup, only when NERSC_HOST is 'genepool'
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP jgi.IdentityMatrix $@" # java.library.path points the JVM at the jni/ directory; only $z is passed, $z2 is not
+ echo $CMD >&2 # show the command line on stderr
+ eval $CMD # eval re-parses the string, so arguments containing spaces or shell metacharacters are split/interpreted again
+}
+
+idmatrix "$@"
diff --git a/jni/BBMergeOverlapper.c b/jni/BBMergeOverlapper.c
new file mode 100755
index 0000000..4e6e76f
--- /dev/null
+++ b/jni/BBMergeOverlapper.c
@@ -0,0 +1,523 @@
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "jgi_BBMergeOverlapper.h"
+
+// C doesn't have min() or max() so we define our own
+#define max(a,b) \
+ ({ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a > _b ? _a : _b; })
+
+#define min(a,b) \
+ ({ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a < _b ? _a : _b; })
+
+#define BAD_MULT 6
+#define GOOD_MULT_1 8
+#define GOOD_MULT_2 400
+
+jint mid(int x, int y, int z){return x<y ? (x<z ? min(y, z) : x) : (y<z ? min(x, z) : y);} //Returns the median of x, y and z; mid(lo, v, hi) therefore clamps v to [lo, hi] when lo<=hi.
+
+//Looks fine on cursory inspection - BB Oct 27 2015
+jint mateByOverlap(jbyte * a_bases, const jint a_bases_length, jbyte * b_bases, const jint b_bases_length, jbyte * a_quality, jbyte * b_quality, jfloat * aprob, jfloat * bprob, jint * rvector, jint minOverlap0, const jint minOverlap, const jint minInsert0, jint margin, const jint maxMismatches0, const jint maxMismatches, const jint minq) {
+ //Tries each overlap length of reads a and b, counting matching (good) and mismatching (bad) base pairs; returns alen+blen-bestOverlap for the accepted overlap, or -1 if none is accepted or the result is ambiguous.
+ minOverlap0=min(max(1, minOverlap0), minOverlap);
+ margin=max(margin, 0);
+ //Outputs: aprob[0..alen) and bprob[0..blen) are overwritten with per-base probabilities; rvector[2] receives bestBad and rvector[4] the ambiguity flag (1 or 0).
+ const jbyte *abases=a_bases, *bbases=b_bases;
+ jbyte *aqual=NULL;
+ jbyte *bqual=NULL;
+ if(a_quality!=NULL){
+ aqual=a_quality;
+ }
+ if(b_quality!=NULL){
+ bqual=b_quality;
+ }
+ const jint alen=a_bases_length, blen=b_bases_length;
+
+ jint bestOverlap=-1;
+ jint bestGood=-1;
+ jint bestBad=maxMismatches0;
+
+ jboolean ambig=0;
+ const jint maxOverlap=alen+blen-max(minOverlap, minInsert0);
+
+ const jfloat probCorrect[71] =
+ {0.000f, 0.251f, 0.369f, 0.499f, 0.602f, 0.684f, 0.749f, 0.800f, 0.842f, 0.874f, 0.900f, 0.921f, 0.937f, 0.950f, 0.960f, 0.968f,
+ 0.975f, 0.980f, 0.984f, 0.987f, 0.990f, 0.992f, 0.994f, 0.995f, 0.996f, 0.997f, 0.997f, 0.998f, 0.998f, 0.999f, 0.999f, 0.999f,
+ 0.999f, 0.999f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ //NOTE(review): quality bytes index probCorrect directly; this assumes every value is within 0..70 - confirm against the Java caller.
+ if(aqual!=NULL && bqual!=NULL){
+ for(jint i=0; i<alen; i++){aprob[i]=probCorrect[aqual[i]];}
+ for(jint i=0; i<blen; i++){bprob[i]=probCorrect[bqual[i]];}
+ }else{
+ for(jint i=0; i<alen; i++){aprob[i]=0.98f;}
+ for(jint i=0; i<blen; i++){bprob[i]=0.98f;}
+ }
+
+ const jfloat minprob=probCorrect[mid(1, minq, 41)];
+
+ for(jint overlap=max(minOverlap0, 0); overlap<maxOverlap; overlap++){
+ jint good=0, bad=0;
+ jint istart=(overlap<=alen ? 0 : overlap-alen);
+ jint jstart=(overlap<=alen ? alen-overlap : 0);
+ {
+ const jint iters=min(overlap-istart, min(blen-istart, alen-jstart));
+ const jint imax=istart+iters;
+ const jint badlim=bestBad+margin;
+ //i indexes read b and j indexes read a, so each per-base array must be read with the index of its own read.
+ for(jint i=istart, j=jstart; i<imax && bad<=badlim; i++, j++){
+ const jbyte ca1=abases[j], cb1=bbases[i];
+ const jfloat pc=aprob[j]*bprob[i]; //was bprob[j]: wrong base of read b, and past the filled part of bprob whenever j>=blen
+
+ if(pc<=minprob){//do nothing
+ }else if(ca1==cb1){good++;}
+ else{bad++;}
+ }
+ }
+
+ if(bad*2<good){
+ if(good>minOverlap){//Candidate
+ if(bad<=bestBad){
+ if(bad<bestBad || (bad==bestBad && good>bestGood)){//Current winner
+ if(bestBad-bad<margin){ambig=1;}
+ bestOverlap=overlap;
+ bestBad=bad;
+ bestGood=good;
+ }else if(bad==bestBad){
+ ambig=1;
+ }
+
+ if(ambig && bestBad<margin){
+ rvector[2]=bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+ return -1;
+ }
+ }
+ }else if(bad<margin){
+ ambig=1;
+ rvector[2]=bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+ return -1;
+ }else{
+ }
+ }
+ }
+
+ if(!ambig && bestBad>maxMismatches-margin){bestOverlap=-1;}
+
+ rvector[2]=bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+
+ return (bestOverlap<0 ? -1 : alen+blen-bestOverlap);
+}
+//Fixed - BB Oct 26 2015
+jfloat findBestRatio(jbyte * a_bases, const jint a_bases_length, jbyte * b_bases, const jint b_bases_length,
+ const jint minOverlap0, const jint minOverlap, const jint minInsert, const jfloat maxRatio, const jfloat offset, const jfloat gIncr, const jfloat bIncr) {
+ const jbyte *abases=a_bases, *bbases=b_bases;
+ const jint alen=a_bases_length, blen=b_bases_length;
+ //Scans insert sizes from alen+blen-minOverlap down to minInsert and returns the lowest (bad+offset)/overlapLength seen, or maxRatio+0.0001f if no insert beats that starting value.
+ jfloat bestRatio=maxRatio+0.0001f;
+ const jint maxOverlap=alen+blen-max(minOverlap, minInsert); //not referenced again in this function
+// const jfloat altBadlimit=max(maxRatio, 0.07f)*2f*alen+1;
+ const jfloat halfmax=maxRatio*0.5f;
+ const jbyte N='N';
+ //Each matching non-N pair adds gIncr to good; each mismatching pair adds bIncr to bad.
+ const jint largestInsertToTest=(alen+blen-minOverlap);
+ const jint smallestInsertToTest=minInsert;
+ for(jint insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+ const jint istart=(insert<=blen ? 0 : insert-blen);
+ const jint jstart=(insert>=blen ? 0 : blen-insert);
+ const jint overlapLength=min(alen-istart, min(blen-jstart, insert));
+
+// const jfloat badlimit=(min(altBadlimit, bestRatio*overlapLength));
+ const jfloat badlimit=bestRatio*overlapLength;
+ jfloat good=0, bad=0;
+ //Here i indexes read a and j indexes read b; the scan of this insert stops once bad exceeds badlimit.
+ const jint imax=istart+overlapLength;
+ for(jint i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ const jbyte ca=abases[i], cb=bbases[j];
+
+ if(ca==cb){
+ if(ca!=N){good+=gIncr;} //a pair of matching N's counts as neither good nor bad
+ }else{bad+=bIncr;}
+ }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ return 100.0f; //mismatch-free overlap whose good score lies strictly between minOverlap0 and minOverlap: fixed sentinel value
+ }
+
+ jfloat ratio=(bad+offset)/overlapLength;
+
+ if(ratio<bestRatio){
+ bestRatio=ratio;
+ if(good>=minOverlap && ratio<halfmax){return bestRatio;} //early exit: enough good score and ratio below half of maxRatio
+ }
+ }
+ }
+
+ return bestRatio;
+}
+//Fixed - BB Oct 27 2015
+//TODO: remove a_quality and b_quality
+jfloat findBestRatio_WithQualities(jbyte * a_bases, const jint a_bases_length, jbyte * b_bases, const jint b_bases_length,
+ jbyte * a_quality, jbyte * b_quality, //TODO: Not needed
+ jfloat * aprob, jfloat * bprob,
+ const jint minOverlap0, const jint minOverlap, const jint minInsert, const jfloat maxRatio, const jfloat offset) {
+ const jbyte *abases=a_bases, *bbases=b_bases;
+ const jint alen=a_bases_length, blen=b_bases_length;
+ //Scans insert sizes from alen+blen-minOverlap down to minInsert and returns the lowest (bad+offset)/overlapLength seen, or maxRatio+0.0001f if no insert beats that starting value.
+ jfloat bestRatio=maxRatio+0.0001f;
+// const jfloat altBadlimit=max(maxRatio, 0.07f)*2f*alen+1;
+ const jfloat halfmax=maxRatio*0.5f;
+ //aprob and bprob are only read here; a_quality and b_quality are not read at all.
+
+ const jint largestInsertToTest=(alen+blen-minOverlap); //TODO: test speed with minOverlap0
+ const jint smallestInsertToTest=minInsert;
+ for(jint insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+
+
+ const jint istart=(insert<=blen ? 0 : insert-blen);
+ const jint jstart=(insert>=blen ? 0 : blen-insert);
+ const jint overlapLength=min(alen-istart, min(blen-jstart, insert));
+
+// const jfloat badlimit=(min(altBadlimit, bestRatio*overlapLength));
+ const jfloat badlimit=bestRatio*overlapLength;
+ jfloat good=0, bad=0;
+ //Here i indexes read a and j indexes read b; the scan of this insert stops once bad exceeds badlimit.
+ const jint imax=istart+overlapLength;
+ for(jint i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ const jbyte ca=abases[i], cb=bbases[j];
+ const jfloat x=aprob[i]*bprob[j]; //weight of this position: product of the two per-base probabilities
+
+ if(ca==cb){good+=x;}
+ else{bad+=x;}
+ }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ return 100.0f; //mismatch-free overlap whose good score lies strictly between minOverlap0 and minOverlap: fixed sentinel value
+ }
+
+ jfloat ratio=(bad+offset)/overlapLength;
+
+ if(ratio<bestRatio){
+ bestRatio=ratio;
+ if(good>=minOverlap && ratio<halfmax){return bestRatio;} //early exit: enough good score and ratio below half of maxRatio
+ }
+ }
+ }
+
+ return bestRatio;
+}
+//Fixed - BB Oct 27 2015
+jint mateByOverlapRatio_WithQualities(jbyte * a_bases, const jint a_bases_length, jbyte * b_bases, const jint b_bases_length,
+ jbyte * a_quality, jbyte * b_quality,
+ jfloat * aprob, jfloat * bprob,
+ jint * rvector, jint minOverlap0, jint minOverlap, jint minInsert0, jint minInsert, jfloat maxRatio, const jfloat margin, const jfloat offset) {
+ minOverlap=max(4, max(minOverlap0, minOverlap));
+ minOverlap0=mid(4, minOverlap0, minOverlap);
+ //Returns the insert size with the lowest probability-weighted mismatch ratio, or -1 if none is accepted; rvector[2] receives (jint)bestBad (or minLength on the early reject) and rvector[4] the ambiguity flag.
+ const jbyte *abases=a_bases, *bbases=b_bases, *aqual=a_quality, *bqual=b_quality;
+ const jint alen=a_bases_length, blen=b_bases_length;
+ const jint minLength=min(alen, blen);
+ //NOTE(review): aqual and bqual are indexed below without a NULL check and index probCorrect directly; this assumes non-NULL qualities with values in 0..70 - confirm against the Java caller.
+ {
+ const jfloat probCorrect[71] =
+ {0.000f, 0.251f, 0.369f, 0.499f, 0.602f, 0.684f, 0.749f, 0.800f, 0.842f, 0.874f, 0.900f, 0.921f, 0.937f, 0.950f, 0.960f, 0.968f,
+ 0.975f, 0.980f, 0.984f, 0.987f, 0.990f, 0.992f, 0.994f, 0.995f, 0.996f, 0.997f, 0.997f, 0.998f, 0.998f, 0.999f, 0.999f, 0.999f,
+ 0.999f, 0.999f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ //Fill the caller-provided aprob/bprob buffers from the quality bytes.
+ for(jint i=0; i<alen; i++){aprob[i]=probCorrect[aqual[i]];}
+ for(jint i=0; i<blen; i++){bprob[i]=probCorrect[bqual[i]];}
+ }
+ {
+ jfloat x=findBestRatio_WithQualities(a_bases, a_bases_length, b_bases, b_bases_length, a_quality, b_quality, aprob, bprob, minOverlap0, minOverlap, minInsert, maxRatio, offset);
+ if(x>maxRatio){
+ rvector[2]=minLength;
+ rvector[4]=0;
+ return -1;
+ }
+ maxRatio=min(maxRatio, x);
+ }
+ //Second pass: rescan with the tightened maxRatio, over the wider range alen+blen-minOverlap0 down to minInsert0, tracking ambiguity.
+ const jfloat altBadlimit=max(maxRatio, 0.07f)*2.0f*alen+1;
+ const jfloat margin2=(margin+offset)/minLength;
+
+ jint bestInsert=-1;
+ jfloat bestBad=minLength;
+ jfloat bestRatio=1;
+ jboolean ambig=0;
+
+ const jint largestInsertToTest=(alen+blen-minOverlap0);
+ const jint smallestInsertToTest=minInsert0;
+ for(jint insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+ jfloat good=0, bad=0;
+
+ const jint istart=(insert<=blen ? 0 : insert-blen);
+ const jint jstart=(insert>=blen ? 0 : blen-insert);
+
+ const jint overlapLength=min(alen-istart, min(blen-jstart, insert));
+ const jfloat badlimit=min(altBadlimit, min(bestRatio, maxRatio)*margin*overlapLength);
+ //Here i indexes read a and j indexes read b; the scan of this insert stops once bad exceeds badlimit.
+ const jint imax=istart+overlapLength;
+ for(jint i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ const jbyte ca=abases[i], cb=bbases[j];
+ const jfloat x=aprob[i]*bprob[j];
+
+ if(ca==cb){good+=x;}
+ else{bad+=x;}
+ }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ rvector[2]=(jint)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+
+ jfloat ratio=(bad+offset)/overlapLength;
+
+ if(ratio<bestRatio*margin){
+ //This insert is within a factor of margin of the best so far; it is ambiguous if the current best is not beaten by that same factor or if good is below minOverlap.
+ ambig=(ratio*margin>=bestRatio || good<minOverlap);
+ if(ratio<bestRatio){
+ bestInsert=insert;
+ bestBad=bad;
+ bestRatio=ratio;
+ }
+ if(ambig && bestRatio<margin2){
+ rvector[2]=(jint)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+ }
+ }
+ }
+
+ if(!ambig && bestRatio>maxRatio){bestInsert=-1;}
+
+ rvector[2]=(jint)bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+
+ return (bestInsert<0 ? -1 : bestInsert);
+}
+//Fixed - BB Oct 27 2015
+jint mateByOverlapRatio(jbyte * a_bases, const jint a_bases_length, jbyte * b_bases, const jint b_bases_length, jint * rvector, jint minOverlap0, jint minOverlap, const jint minInsert0, const jint minInsert, jfloat maxRatio, const jfloat margin, const jfloat offset, const jfloat gIncr, const jfloat bIncr) {
+ //Returns the insert size with the lowest mismatch ratio, or -1 if none is accepted; rvector[2] receives (jint)bestBad (or minLength on the early reject) and rvector[4] the ambiguity flag.
+ minOverlap=max(4, max(minOverlap0, minOverlap));
+ minOverlap0=mid(4, minOverlap0, minOverlap);
+
+ const jbyte *abases=a_bases, *bbases=b_bases;
+ const jint alen=a_bases_length, blen=b_bases_length;
+ const jint minLength=min(alen, blen);
+ {
+ jfloat x=findBestRatio(a_bases, a_bases_length, b_bases, b_bases_length, minOverlap0, minOverlap, minInsert, maxRatio, offset, gIncr, bIncr);
+ if(x>=maxRatio){
+ rvector[2]=minLength;
+ rvector[4]=0;
+ return -1;
+ }
+ maxRatio=min(maxRatio, x);
+ }
+ //Second pass: rescan with the tightened maxRatio, over the wider range alen+blen-minOverlap0 down to minInsert0, tracking ambiguity.
+ const jfloat altBadlimit=max(maxRatio, 0.07f)*2.0f*alen+1;
+ const jfloat margin2=(margin+offset)/minLength;
+ const jbyte N='N';
+
+ jint bestInsert=-1;
+ jfloat bestBad=minLength;
+ jfloat bestRatio=1;
+ jboolean ambig=0;
+
+ const jint largestInsertToTest=(alen+blen-minOverlap0);
+ const jint smallestInsertToTest=minInsert0;
+ for(jint insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
+ const jint istart=(insert<=blen ? 0 : insert-blen);
+ const jint jstart=(insert>=blen ? 0 : blen-insert);
+ const jint overlapLength=min(alen-istart, min(blen-jstart, insert));
+
+ const jfloat badlimit=(min(altBadlimit, min(bestRatio, maxRatio)*margin*overlapLength));
+ jfloat good=0, bad=0;
+ //Here i indexes read a and j indexes read b; the scan of this insert stops once bad exceeds badlimit.
+ const int imax=istart+overlapLength;
+ for(jint i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
+ const jbyte ca=abases[i], cb=bbases[j];
+
+ if(ca==cb){
+ if(ca!=N){good+=gIncr;} //a pair of matching N's counts as neither good nor bad
+ }else{bad+=bIncr;}
+ }
+
+ if(bad<=badlimit){
+ if(bad==0 && good>minOverlap0 && good<minOverlap){
+ rvector[2]=(jint)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+
+ jfloat ratio=(bad+offset)/overlapLength;
+
+ if(ratio<bestRatio*margin){
+ //This insert is within a factor of margin of the best so far; it is ambiguous if the current best is not beaten by that same factor or if good is below minOverlap.
+ ambig=(ratio*margin>=bestRatio || good<minOverlap);
+ if(ratio<bestRatio){
+ bestInsert=insert;
+ bestBad=bad;
+ bestRatio=ratio;
+ }
+ if(ambig && bestRatio<margin2){
+ rvector[2]=(int)bestBad;
+ rvector[4]=1;
+ return -1;
+ }
+ }
+ }
+ }
+
+ if(!ambig && bestRatio>maxRatio){bestInsert=-1;}
+
+ rvector[2]=(jint)bestBad;
+ rvector[4]=(ambig ? 1 : 0);
+
+ return (bestInsert<0 ? -1 : bestInsert);
+}
+
+JNIEXPORT jint JNICALL Java_jgi_BBMergeOverlapper_mateByOverlapJNI_WithQualities(
+ JNIEnv *env,
+ jobject obj,
+ jbyteArray a_bases,
+ jbyteArray b_bases,
+ jbyteArray a_quality,
+ jbyteArray b_quality,
+ jfloatArray aprob,
+ jfloatArray bprob,
+ jintArray rvector,
+ jint minOverlap0,
+ jint minOverlap,
+ jint minInsert0,
+ jint minInsert,
+ jfloat maxRatio,
+ jfloat margin,
+ jfloat offset
+ ) { // JNI entry point: exposes the Java arrays to mateByOverlapRatio_WithQualities and returns its result
+ jbyte * ja_quality = NULL;
+ jbyte * jb_quality = NULL;
+ jfloat * japrob = NULL;
+ jfloat * jbprob = NULL;
+ // NOTE(review): these stay NULL when the Java argument is null, but mateByOverlapRatio_WithQualities indexes qualities and probabilities unconditionally
+ // Get the lengths of the two base arrays (done before any critical pointer is obtained)
+ const jint a_bases_length = (*env)->GetArrayLength(env, a_bases);
+ const jint b_bases_length = (*env)->GetArrayLength(env, b_bases);
+
+ // Obtain critical pointers to the Java arrays (the JVM may pin the array or hand back a copy); results are not checked for NULL
+ jbyte * ja_bases = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, a_bases, NULL);
+ jbyte * jb_bases = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, b_bases, NULL);
+ if(a_quality!=NULL) {ja_quality = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, a_quality, NULL);}
+ if(b_quality!=NULL) {jb_quality = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, b_quality, NULL);}
+ if(aprob!=NULL) {japrob = (jfloat*)(*env)->GetPrimitiveArrayCritical(env, aprob, NULL);}
+ if(bprob!=NULL) {jbprob = (jfloat*)(*env)->GetPrimitiveArrayCritical(env, bprob, NULL);}
+ jint * jrvector = (jint*)(*env)->GetPrimitiveArrayCritical(env, rvector, NULL);
+
+ const jint returnVal = mateByOverlapRatio_WithQualities(ja_bases, a_bases_length, jb_bases, b_bases_length, ja_quality, jb_quality, japrob, jbprob, jrvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset);
+ // aprob/bprob are written by the native code but released with JNI_ABORT below, so Java is not guaranteed to see those values
+ // Release Java arrays; 0 copies the array back to Java, JNI_ABORT does not copy the current array values to Java
+ (*env)->ReleasePrimitiveArrayCritical(env, a_bases, ja_bases, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, b_bases, jb_bases, JNI_ABORT);
+ if(ja_quality!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, a_quality, ja_quality, JNI_ABORT);}
+ if(jb_quality!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, b_quality, jb_quality, JNI_ABORT);}
+ if(japrob!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, aprob, japrob, JNI_ABORT);}
+ if(jbprob!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, bprob, jbprob, JNI_ABORT);}
+ (*env)->ReleasePrimitiveArrayCritical(env, rvector, jrvector, 0); // rvector carries results back, hence mode 0
+
+ return returnVal;
+}
+
+JNIEXPORT jint JNICALL Java_jgi_BBMergeOverlapper_mateByOverlapJNI(
+ JNIEnv *env,
+ jobject obj,
+ jbyteArray a_bases,
+ jbyteArray b_bases,
+ jbyteArray a_quality,
+ jbyteArray b_quality,
+ jfloatArray aprob,
+ jfloatArray bprob,
+ jintArray rvector,
+ jint minOverlap0,
+ jint minOverlap,
+ jint minInsert0,
+ jint margin,
+ jint maxMismatches0,
+ jint maxMismatches,
+ jint minq
+ ) { // JNI entry point: exposes the Java arrays to mateByOverlap and returns its result
+ jbyte * ja_quality = NULL;
+ jbyte * jb_quality = NULL;
+ jfloat * japrob = NULL;
+ jfloat * jbprob = NULL;
+ // NOTE(review): mateByOverlap tolerates NULL qualities but writes aprob/bprob unconditionally, so those two must be non-null
+ // Get the lengths of the two base arrays (done before any critical pointer is obtained)
+ const jint a_bases_length = (*env)->GetArrayLength(env, a_bases);
+ const jint b_bases_length = (*env)->GetArrayLength(env, b_bases);
+
+ // Obtain critical pointers to the Java arrays (the JVM may pin the array or hand back a copy); results are not checked for NULL
+ jbyte * ja_bases = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, a_bases, NULL);
+ jbyte * jb_bases = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, b_bases, NULL);
+ if(a_quality!=NULL) {ja_quality = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, a_quality, NULL);}
+ if(b_quality!=NULL) {jb_quality = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, b_quality, NULL);}
+ if(aprob!=NULL) {japrob = (jfloat*)(*env)->GetPrimitiveArrayCritical(env, aprob, NULL);}
+ if(bprob!=NULL) {jbprob = (jfloat*)(*env)->GetPrimitiveArrayCritical(env, bprob, NULL);}
+ jint * jrvector = (jint*)(*env)->GetPrimitiveArrayCritical(env, rvector, NULL);
+
+ const jint returnVal = mateByOverlap(ja_bases, a_bases_length, jb_bases, b_bases_length, ja_quality, jb_quality, japrob, jbprob, jrvector, minOverlap0, minOverlap, minInsert0, margin, maxMismatches0, maxMismatches, minq);
+ // aprob/bprob are written by the native code but released with JNI_ABORT below, so Java is not guaranteed to see those values
+ // Release Java arrays; 0 copies the array back to Java, JNI_ABORT does not copy the current array values to Java
+ (*env)->ReleasePrimitiveArrayCritical(env, a_bases, ja_bases, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, b_bases, jb_bases, JNI_ABORT);
+ if(ja_quality!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, a_quality, ja_quality, JNI_ABORT);}
+ if(jb_quality!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, b_quality, jb_quality, JNI_ABORT);}
+ if(japrob!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, aprob, japrob, JNI_ABORT);}
+ if(jbprob!=NULL) {(*env)->ReleasePrimitiveArrayCritical(env, bprob, jbprob, JNI_ABORT);}
+ (*env)->ReleasePrimitiveArrayCritical(env, rvector, jrvector, 0); // rvector carries results back, hence mode 0
+
+ return returnVal;
+}
+
+JNIEXPORT jint JNICALL Java_jgi_BBMergeOverlapper_mateByOverlapRatioJNI(
+ JNIEnv *env,
+ jobject obj,
+ jbyteArray a_bases,
+ jbyteArray b_bases,
+ jintArray rvector,
+ jint minOverlap0,
+ jint minOverlap,
+ jint minInsert0,
+ jint minInsert,
+ jfloat maxRatio,
+ jfloat margin,
+ jfloat offset,
+ jfloat gIncr,
+ jfloat bIncr
+ ) { // JNI entry point: exposes the Java arrays to mateByOverlapRatio and returns its result
+ // Get the lengths of the two base arrays (done before any critical pointer is obtained)
+ const jint a_bases_length = (*env)->GetArrayLength(env, a_bases);
+ const jint b_bases_length = (*env)->GetArrayLength(env, b_bases);
+
+ // Obtain critical pointers to the Java arrays (the JVM may pin the array or hand back a copy); results are not checked for NULL
+ jbyte * ja_bases = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, a_bases, NULL);
+ jbyte * jb_bases = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, b_bases, NULL);
+ jint * jrvector = (jint*)(*env)->GetPrimitiveArrayCritical(env, rvector, NULL);
+
+ const jint returnVal = mateByOverlapRatio(ja_bases, a_bases_length, jb_bases, b_bases_length, jrvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset, gIncr, bIncr);
+
+ // Release Java arrays; 0 copies the array back to Java, JNI_ABORT does not copy the current array values to Java
+ (*env)->ReleasePrimitiveArrayCritical(env, a_bases, ja_bases, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, b_bases, jb_bases, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, rvector, jrvector, 0); // rvector carries results back, hence mode 0
+
+ return returnVal;
+}
+
diff --git a/jni/BandedAlignerJNI.c b/jni/BandedAlignerJNI.c
new file mode 100755
index 0000000..7c4eb84
--- /dev/null
+++ b/jni/BandedAlignerJNI.c
@@ -0,0 +1,758 @@
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "align2_BandedAlignerJNI.h"
+
+// C has no built-in min()/max(); these macros copy each argument into a local first, so it is evaluated only once (uses the GNU ({ ... }) and __typeof__ extensions)
+#define max(a,b) \
+ ({ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a > _b ? _a : _b; })
+
+#define min(a,b) \
+ ({ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a < _b ? _a : _b; })
+
+// The original author reports that BandedAligner.java holds this value and that nothing
+// in BBMap appears to change it, so it is fixed here as a compile-time constant;
+// defining it as true lets the compiler optimize the if statements out
+#define penalizeOffCenter 1
+
+// Forward declarations: alignForward and alignReverse call themselves, and alignForwardRC and alignReverseRC call each other
+jint alignForward( // query index increases, ref index increases
+ jbyte * query,
+ jbyte * ref,
+ jint query_length,
+ jint ref_length,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint * lastQueryLoc,
+ jint * lastRefLoc,
+ jint * lastRow,
+ jint * lastEdits,
+ jint * lastOffset,
+ jint maxWidth,
+ jbyte * baseToNumber
+ );
+
+jint alignForwardRC( // query index decreases and each query base is mapped through baseToComplementExtended, ref index increases
+ jbyte * query,
+ jbyte * ref,
+ jint query_length,
+ jint ref_length,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint * lastQueryLoc,
+ jint * lastRefLoc,
+ jint * lastRow,
+ jint * lastEdits,
+ jint * lastOffset,
+ jint maxWidth,
+ jbyte * baseToNumber,
+ jbyte * baseToComplementExtended
+ );
+
+jint alignReverse( // query index decreases, ref index decreases
+ jbyte * query,
+ jbyte * ref,
+ jint query_length,
+ jint ref_length,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint * lastQueryLoc,
+ jint * lastRefLoc,
+ jint * lastRow,
+ jint * lastEdits,
+ jint * lastOffset,
+ jint maxWidth,
+ jbyte * baseToNumber
+ );
+
+jint alignReverseRC( // query index increases and each query base is mapped through baseToComplementExtended, ref index decreases
+ jbyte * query,
+ jbyte * ref,
+ jint query_length,
+ jint ref_length,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint * lastQueryLoc,
+ jint * lastRefLoc,
+ jint * lastRow,
+ jint * lastEdits,
+ jint * lastOffset,
+ jint maxWidth,
+ jbyte * baseToNumber,
+ jbyte * baseToComplementExtended
+ );
+
+jint lastOffsetFunc(jint * array, jint halfWidth){
+    // Finds the smallest value within halfWidth cells of the band center (index halfWidth+1) and
+    // returns center minus its index; only a strictly smaller value replaces the current choice.
+    const jint mid=halfWidth+1;
+    jint best=mid;
+    for(jint dist=1; dist<=halfWidth; dist++){
+        const jint right=mid+dist;
+        const jint left=mid-dist;
+        if(array[right]<array[best]){best=right;}
+        if(array[left]<array[best]){best=left;}
+    }
+    return mid-best;
+}
+
+jint penalizeOffCenterFunc(jint * array, jint halfWidth, jint big){
+    // Adds each cell's distance from the band center to its value (capped at big) and returns the smallest resulting value.
+    jint * const mid=array+halfWidth+1;
+    jint best=mid[0];
+    for(jint dist=1; dist<=halfWidth; dist++){
+        mid[dist]=min(big, mid[dist]+dist);
+        mid[-dist]=min(big, mid[-dist]+dist);
+        best=min(best, min(mid[dist], mid[-dist]));
+    }
+    return best;
+}
+
+jint alignForward(
+ jbyte * query, // sequence that supplies one element per row, starting at qstart
+ jbyte * ref, // sequence scanned inside the band on every row
+ jint query_length, // number of elements in query
+ jint ref_length, // number of elements in ref
+ jint qstart, // query index used for the first row
+ jint rstart, // ref index at the center of the band on the first row
+ jint maxEdits, // sets the band width (2*maxEdits+1, capped by maxWidth) and ends the row loop once a row's best score exceeds it
+ jboolean exact, // when false, a base whose baseToNumber entry is negative is scored as a match
+ jint * lastQueryLoc, // out: query position reported for the end of the alignment
+ jint * lastRefLoc, // out: ref position reported for the end of the alignment
+ jint * lastRow, // out: index of the last row processed (-1 when no row was processed)
+ jint * lastEdits, // out: same value as the return value
+ jint * lastOffset, // out: lastOffsetFunc() result for the final row
+ jint maxWidth, // upper bound on the band width; also sizes the two score rows
+ jbyte * baseToNumber // lookup table indexed by base value
+ ) {
+ // Banded edit-distance pass over query[qstart..] and ref[rstart..], moving forward in both; returns the best score of the last processed row after the off-center penalty.
+ if(query_length-qstart>ref_length-rstart){ // more query than ref left: recurse with the sequences swapped, then swap the two reported locations back
+ jint x=alignForward(ref, query, ref_length, query_length, rstart,
+ qstart, maxEdits, exact, lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, baseToNumber);
+ const jint temp = *lastQueryLoc;
+ *lastQueryLoc = *lastRefLoc;
+ *lastRefLoc = temp;
+ return x;
+ }
+
+ const jint big=999; // fill value for unused cells and cap for penalized scores
+ jint edits=0;
+ jint row=0;
+ *lastRow=-1;
+ *lastEdits=0;
+ *lastOffset=0;
+
+ const jint width=min(maxWidth, (maxEdits*2)+1); // band width
+ const jint halfWidth=width/2;
+ const jboolean inexact=!exact;
+
+ jint qloc=qstart; // query index of the current row
+ jint rsloc=rstart-halfWidth; // ref index held by band slot 1 on the current row (can be negative)
+ const jint xlines=query_length-qstart;
+ const jint ylines=ref_length-rstart;
+ const jint len=min(xlines, ylines); // number of rows to process
+
+ if(len<1){ // nothing to align; lastQueryLoc and lastRefLoc are not written on this path
+ return 0;
+ }
+ // NOTE(review): the two rows below are variable-length stack arrays sized by maxWidth; assumes maxWidth is small and non-negative — TODO confirm
+ jint arrayCurrentArray[maxWidth+2];
+ jint arrayPrevArray[maxWidth+2];
+ jint * arrayCurrent=arrayCurrentArray;
+ jint * arrayPrev=arrayPrevArray;
+ jint * arrayTemp;
+
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ arrayPrev[i]=big;
+ }
+ // NOTE(review): baseToNumber is indexed with signed jbyte values; assumes base bytes are never negative — TODO confirm
+ { // first row: each in-range band cell gets 0 for a match and 1 for a mismatch
+ const jbyte q=query[qloc];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ edits=big;
+ jint mloc=1+(colStart-rsloc); // band slot that corresponds to ref column colStart
+ for(jint col=colStart; col<colLimit; mloc++, col++){
+ const jbyte r=ref[col];
+ const jint score=(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ row++; qloc++; rsloc++;
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ for(row=1; row<len; row++, qloc++, rsloc++){
+ arrayTemp=arrayCurrent; // swap the two rows: the previous "current" row becomes arrayPrev
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ const jbyte q=query[qloc];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ }
+ edits=big;
+ jint mloc=1+(colStart-rsloc);
+ const jint forceDiag=(row==len-1); // on the final row only the diagonal move is considered
+ for(jint col=colStart; col<colLimit; mloc++, col++){
+ const jbyte r=ref[col];
+ const jint scoreUp=arrayPrev[mloc+1]+1; // the band shifts by one ref position per row, so the same ref column sits one slot higher in arrayPrev
+ const jint scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ const jint scoreLeft=arrayCurrent[mloc-1]+1;
+ const jint score=(forceDiag || col==ref_length-1) ? scoreDiag : min(scoreUp, min(scoreDiag, scoreLeft));
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ if(edits>maxEdits){ // best score in this row is over budget: stop early
+ row++; // keeps row-1 below equal to the row just processed
+ break;
+ }
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ *lastRow=row-1;
+ *lastEdits=edits;
+ *lastQueryLoc=qloc-1;
+ *lastOffset=lastOffsetFunc(arrayCurrent, halfWidth);
+ *lastRefLoc=rsloc+halfWidth-(*lastOffset)-1;
+ while((*lastRefLoc)>=ref_length || (*lastQueryLoc)>=query_length){(*lastRefLoc)--;(*lastQueryLoc)--;} // pull both locations back together until each is inside its sequence
+
+ return edits;
+}
+
+jint alignForwardRC(
+ jbyte * query, // sequence that supplies one element per row, read backward from qstart
+ jbyte * ref, // sequence scanned inside the band on every row
+ jint query_length, // number of elements in query
+ jint ref_length, // number of elements in ref
+ jint qstart, // query index used for the first row
+ jint rstart, // ref index at the center of the band on the first row
+ jint maxEdits, // sets the band width (2*maxEdits+1, capped by maxWidth) and ends the row loop once a row's best score exceeds it
+ jboolean exact, // when false, a base whose baseToNumber entry is negative is scored as a match
+ jint * lastQueryLoc, // out: query position reported for the end of the alignment
+ jint * lastRefLoc, // out: ref position reported for the end of the alignment
+ jint * lastRow, // out: index of the last row processed (-1 when no row was processed)
+ jint * lastEdits, // out: same value as the return value
+ jint * lastOffset, // out: lastOffsetFunc() result for the final row
+ jint maxWidth, // upper bound on the band width; also sizes the two score rows
+ jbyte * baseToNumber, // lookup table indexed by base value
+ jbyte * baseToComplementExtended // lookup table applied to every query base before comparison
+ ) {
+ // Banded edit-distance pass that walks query backward from qstart (mapping each base through baseToComplementExtended) while walking ref forward from rstart.
+ if(qstart+1>ref_length-rstart){ // more query than ref left: delegate to alignReverseRC with the sequences swapped, then swap the two reported locations back
+ jint x=alignReverseRC(ref, query, ref_length, query_length, rstart,
+ qstart, maxEdits, exact, lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, baseToNumber, baseToComplementExtended);
+ const jint temp = *lastQueryLoc;
+ *lastQueryLoc = *lastRefLoc;
+ *lastRefLoc = temp;
+ return x;
+ }
+
+ const jint big=999; // fill value for unused cells and cap for penalized scores
+ jint edits=0;
+ jint row=0;
+ *lastRow=-1;
+ *lastEdits=0;
+ *lastOffset=0;
+
+ const jint width=min(maxWidth, (maxEdits*2)+1); // band width
+ const jint halfWidth=width/2;
+ const jboolean inexact=!exact;
+
+ jint qloc=qstart; // query index of the current row (decreases)
+ jint rsloc=rstart-halfWidth; // ref index held by band slot 1 on the current row (can be negative)
+ const jint xlines=qstart+1; // query elements available when walking down to index 0
+ const jint ylines=ref_length-rstart;
+ const jint len=min(xlines, ylines); // number of rows to process
+
+ if(len<1){ // nothing to align; lastQueryLoc and lastRefLoc are not written on this path
+ return 0;
+ }
+ // NOTE(review): the two rows below are variable-length stack arrays sized by maxWidth; assumes maxWidth is small and non-negative — TODO confirm
+ jint arrayCurrentArray[maxWidth+2];
+ jint arrayPrevArray[maxWidth+2];
+ jint * arrayCurrent=arrayCurrentArray;
+ jint * arrayPrev=arrayPrevArray;
+ jint * arrayTemp;
+
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ arrayPrev[i]=big;
+ }
+ // NOTE(review): both lookup tables are indexed with signed jbyte values; assumes base bytes are never negative — TODO confirm
+ { // first row: each in-range band cell gets 0 for a match and 1 for a mismatch
+ const jbyte q=baseToComplementExtended[query[qloc]];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ edits=big;
+ jint mloc=1+(colStart-rsloc); // band slot that corresponds to ref column colStart
+ for(jint col=colStart; col<colLimit; mloc++, col++){
+ const jbyte r=ref[col];
+ const jint score=(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ row++; qloc--; rsloc++;
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ for(row=1; row<len; row++, qloc--, rsloc++){
+ arrayTemp=arrayCurrent; // swap the two rows: the previous "current" row becomes arrayPrev
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ const jbyte q=baseToComplementExtended[query[qloc]];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ }
+ edits=big;
+ jint mloc=1+(colStart-rsloc);
+ const jint forceDiag=(row==len-1); // on the final row only the diagonal move is considered
+ for(jint col=colStart; col<colLimit; mloc++, col++){
+ const jbyte r=ref[col];
+ const jint scoreUp=arrayPrev[mloc+1]+1; // the band shifts by one ref position per row, so the same ref column sits one slot higher in arrayPrev
+ const jint scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ const jint scoreLeft=arrayCurrent[mloc-1]+1;
+ const jint score=(forceDiag || col==ref_length-1) ? scoreDiag : min(scoreUp, min(scoreDiag, scoreLeft));
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ if(edits>maxEdits){row++; break;} // over budget: stop early; row++ keeps row-1 below equal to the row just processed
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ *lastRow=row-1;
+ *lastEdits=edits;
+ *lastOffset=lastOffsetFunc(arrayCurrent, halfWidth);
+ *lastQueryLoc=qloc+1;
+ *lastRefLoc=rsloc+halfWidth-(*lastOffset)-1;
+ while((*lastRefLoc)>=ref_length || (*lastQueryLoc)<0){(*lastRefLoc)--; (*lastQueryLoc)++;} // step both locations back along the walk until each is inside its sequence
+
+ return edits;
+}
+
+jint alignReverse(
+ jbyte * query, // sequence that supplies one element per row, read backward from qstart
+ jbyte * ref, // sequence scanned inside the band on every row
+ jint query_length, // only forwarded to the recursive call and not otherwise read here
+ jint ref_length, // number of elements in ref
+ jint qstart, // query index used for the first row
+ jint rstart, // ref index at the center of the band on the first row
+ jint maxEdits, // sets the band width (2*maxEdits+1, capped by maxWidth) and ends the row loop once a row's best score exceeds it
+ jboolean exact, // when false, a base whose baseToNumber entry is negative is scored as a match
+ jint * lastQueryLoc, // out: query position reported for the end of the alignment
+ jint * lastRefLoc, // out: ref position reported for the end of the alignment
+ jint * lastRow, // out: index of the last row processed (-1 when no row was processed)
+ jint * lastEdits, // out: same value as the return value
+ jint * lastOffset, // out: lastOffsetFunc() result for the final row
+ jint maxWidth, // upper bound on the band width; also sizes the two score rows
+ jbyte * baseToNumber // lookup table indexed by base value
+ ) {
+ // Banded edit-distance pass that walks both query (from qstart) and ref (from rstart) backward toward index 0.
+ if(qstart>rstart){ // more query than ref left: recurse with the sequences swapped, then swap the two reported locations back
+ jint x=alignReverse(ref, query, ref_length, query_length, rstart,
+ qstart, maxEdits, exact, lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, baseToNumber);
+ const jint temp = *lastQueryLoc;
+ *lastQueryLoc = *lastRefLoc;
+ *lastRefLoc = temp;
+ return x;
+ }
+
+ const jint big=999; // fill value for unused cells and cap for penalized scores
+ jint edits=0;
+ jint row=0;
+ *lastRow=-1;
+ *lastEdits=0;
+ *lastOffset=0;
+
+ const jint width=min(maxWidth, (maxEdits*2)+1); // band width
+ const jint halfWidth=width/2;
+ const jboolean inexact=!exact;
+
+ jint qloc=qstart; // query index of the current row (decreases)
+ jint rsloc=rstart-halfWidth; // lowest ref index covered by the band on the current row (can be negative)
+ const jint xlines=qstart+1; // query elements available when walking down to index 0
+ const jint ylines=rstart+1; // ref elements available when walking down to index 0
+ const jint len=min(xlines, ylines); // number of rows to process
+
+ if(len<1){ // nothing to align; lastQueryLoc and lastRefLoc are not written on this path
+ return 0;
+ }
+ // NOTE(review): the two rows below are variable-length stack arrays sized by maxWidth; assumes maxWidth is small and non-negative — TODO confirm
+ jint arrayCurrentArray[maxWidth+2];
+ jint arrayPrevArray[maxWidth+2];
+ jint * arrayCurrent=arrayCurrentArray;
+ jint * arrayPrev=arrayPrevArray;
+ jint * arrayTemp;
+
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ arrayPrev[i]=big;
+ }
+ // NOTE(review): baseToNumber is indexed with signed jbyte values; assumes base bytes are never negative — TODO confirm
+ { // first row: each in-range band cell gets 0 for a match and 1 for a mismatch
+ const jbyte q=query[qloc];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ edits=big;
+ jint mloc=1+width-(colLimit-rsloc); // band slot for ref column colLimit-1; slots rise while ref columns fall
+ for(jint col=colLimit-1; col>=colStart; mloc++, col--){
+ const jbyte r=ref[col];
+ const jint score=(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ row++; qloc--; rsloc--;
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ for(row=1; row<len; row++, qloc--, rsloc--){
+ arrayTemp=arrayCurrent; // swap the two rows: the previous "current" row becomes arrayPrev
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ const jbyte q=query[qloc];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ }
+ edits=big;
+ jint mloc=1+width-(colLimit-rsloc);
+ const jint forceDiag=(row==len-1); // on the final row only the diagonal move is considered
+ for(jint col=colLimit-1; col>=colStart; mloc++, col--){
+ const jbyte r=ref[col];
+ const jint scoreUp=arrayPrev[mloc+1]+1; // the band shifts by one ref position per row, so the same ref column sits one slot higher in arrayPrev
+ const jint scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ const jint scoreLeft=arrayCurrent[mloc-1]+1;
+ const jint score=(forceDiag || col==0) ? scoreDiag : min(scoreUp, min(scoreDiag, scoreLeft));
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ if(edits>maxEdits){row++; break;} // over budget: stop early; row++ keeps row-1 below equal to the row just processed
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ *lastRow=row-1;
+ *lastEdits=edits;
+ *lastOffset=lastOffsetFunc(arrayCurrent, halfWidth);
+ *lastQueryLoc=qloc+1;
+ *lastRefLoc=rsloc+halfWidth+(*lastOffset)+1;
+ while((*lastRefLoc)<0 || (*lastQueryLoc)<0){(*lastRefLoc)++; (*lastQueryLoc)++;} // step both locations forward together until neither is negative
+
+ return edits;
+}
+
+jint alignReverseRC(
+ jbyte * query, // sequence that supplies one element per row, read forward from qstart
+ jbyte * ref, // sequence scanned inside the band on every row
+ jint query_length, // number of elements in query
+ jint ref_length, // number of elements in ref
+ jint qstart, // query index used for the first row
+ jint rstart, // ref index at the center of the band on the first row
+ jint maxEdits, // sets the band width (2*maxEdits+1, capped by maxWidth) and ends the row loop once a row's best score exceeds it
+ jboolean exact, // when false, a base whose baseToNumber entry is negative is scored as a match
+ jint * lastQueryLoc, // out: query position reported for the end of the alignment
+ jint * lastRefLoc, // out: ref position reported for the end of the alignment
+ jint * lastRow, // out: index of the last row processed (-1 when no row was processed)
+ jint * lastEdits, // out: same value as the return value
+ jint * lastOffset, // out: lastOffsetFunc() result for the final row
+ jint maxWidth, // upper bound on the band width; also sizes the two score rows
+ jbyte * baseToNumber, // lookup table indexed by base value
+ jbyte * baseToComplementExtended // lookup table applied to every query base before comparison
+ ) {
+ // Banded edit-distance pass that walks query forward from qstart (mapping each base through baseToComplementExtended) while walking ref backward from rstart.
+ if(query_length-qstart>rstart+1){ // more query than ref left: delegate to alignForwardRC with the sequences swapped, then swap the two reported locations back
+ jint x=alignForwardRC(ref, query, ref_length, query_length, rstart,
+ qstart, maxEdits, exact, lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, baseToNumber, baseToComplementExtended);
+ const jint temp = *lastQueryLoc;
+ *lastQueryLoc = *lastRefLoc;
+ *lastRefLoc = temp;
+ return x;
+ }
+
+ const jint big=999; // fill value for unused cells and cap for penalized scores
+ jint edits=0;
+ jint row=0;
+ *lastRow=-1;
+ *lastEdits=0;
+ *lastOffset=0;
+
+ const jint width=min(maxWidth, (maxEdits*2)+1); // band width
+ const jint halfWidth=width/2;
+ const jboolean inexact=!exact;
+
+ jint qloc=qstart; // query index of the current row (increases)
+ jint rsloc=rstart-halfWidth; // lowest ref index covered by the band on the current row (can be negative)
+ const jint xlines=query_length-qstart;
+ const jint ylines=rstart+1; // ref elements available when walking down to index 0
+ const jint len=min(xlines, ylines); // number of rows to process
+
+ if(len<1){ // nothing to align; lastQueryLoc and lastRefLoc are not written on this path
+ return 0;
+ }
+ // NOTE(review): the two rows below are variable-length stack arrays sized by maxWidth; assumes maxWidth is small and non-negative — TODO confirm
+ jint arrayCurrentArray[maxWidth+2];
+ jint arrayPrevArray[maxWidth+2];
+ jint * arrayCurrent=arrayCurrentArray;
+ jint * arrayPrev=arrayPrevArray;
+ jint * arrayTemp;
+
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ arrayPrev[i]=big;
+ }
+ // NOTE(review): both lookup tables are indexed with signed jbyte values; assumes base bytes are never negative — TODO confirm
+ { // first row: each in-range band cell gets 0 for a match and 1 for a mismatch
+ const jbyte q=baseToComplementExtended[query[qloc]];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ edits=big;
+ jint mloc=1+width-(colLimit-rsloc); // band slot for ref column colLimit-1; slots rise while ref columns fall
+ for(jint col=colLimit-1; col>=colStart; mloc++, col--){
+ const jbyte r=ref[col];
+ const jint score=(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ row++; qloc++; rsloc--;
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ for(row=1; row<len; row++, qloc++, rsloc--){
+ arrayTemp=arrayCurrent; // swap the two rows: the previous "current" row becomes arrayPrev
+ arrayCurrent=arrayPrev;
+ arrayPrev=arrayTemp;
+ const jbyte q=baseToComplementExtended[query[qloc]];
+ const jint colStart=max(0, rsloc);
+ const jint colLimit=min(rsloc+width, ref_length);
+ for(jint i=0; i<maxWidth+2; i++) {
+ arrayCurrent[i]=big;
+ }
+ edits=big;
+ jint mloc=1+width-(colLimit-rsloc);
+ const jint forceDiag=(row==len-1); // on the final row only the diagonal move is considered
+ for(jint col=colLimit-1; col>=colStart; mloc++, col--){
+ const jbyte r=ref[col];
+ const jint scoreUp=arrayPrev[mloc+1]+1; // the band shifts by one ref position per row, so the same ref column sits one slot higher in arrayPrev
+ const jint scoreDiag=arrayPrev[mloc]+(q==r || (inexact && (!(baseToNumber[q]>=0) || !(baseToNumber[r]>=0))) ? 0 : 1);
+ const jint scoreLeft=arrayCurrent[mloc-1]+1;
+ const jint score=(forceDiag || col==0) ? scoreDiag : min(scoreUp, min(scoreDiag, scoreLeft));
+ arrayCurrent[mloc]=score;
+ edits=min(edits, score);
+ }
+ if(edits>maxEdits){row++; break;} // over budget: stop early; row++ keeps row-1 below equal to the row just processed
+ }
+ if(penalizeOffCenter){
+ edits=penalizeOffCenterFunc(arrayCurrent, halfWidth, big);
+ }
+
+ *lastRow=row-1;
+ *lastEdits=edits;
+ *lastOffset=lastOffsetFunc(arrayCurrent, halfWidth);
+ *lastQueryLoc=qloc-1;
+ *lastRefLoc=rsloc+halfWidth+(*lastOffset)+1;
+ while((*lastRefLoc)<0 || (*lastQueryLoc)>=query_length){(*lastRefLoc)++; (*lastQueryLoc)--;} // step both locations back along the walk until each is inside its sequence
+ return edits;
+}
+
+// The other three JNICALL functions are almost identical
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignForwardJNI( // JNI entry point wrapping alignForward(); returns its result
+ JNIEnv *env,
+ jobject obj, // not used in this function
+ jbyteArray query,
+ jbyteArray ref,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint maxWidth,
+ jbyteArray baseToNumber,
+ jintArray returnVals // out: slots 0..4 receive lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset; assumes length >= 5 — TODO confirm
+ ) {
+ jint edits = 0;
+
+ // Get the lengths of the reference and query arrays
+ jint ref_length = (*env)->GetArrayLength(env, ref);
+ jint query_length = (*env)->GetArrayLength(env, query);
+
+ // Obtain direct pointers to the Java arrays. NOTE(review): the returned pointers are not checked for NULL.
+ jbyte * jbaseToNumber = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, baseToNumber, NULL);
+ jbyte * jref = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, ref, NULL);
+ jbyte * jquery = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, query, NULL);
+ jint * jreturnVals = (jint*)(*env)->GetPrimitiveArrayCritical(env, returnVals, NULL);
+
+ // Using pointers for variables that need to be passed back to Java so the called functions can update them
+ jint * lastQueryLoc = &jreturnVals[0];
+ jint * lastRefLoc = &jreturnVals[1];
+ jint * lastRow = &jreturnVals[2];
+ jint * lastEdits = &jreturnVals[3];
+ jint * lastOffset = &jreturnVals[4];
+
+ // Call alignForward() in C; its five out-values are written through the pointers above into jreturnVals[0..4]
+ edits = alignForward(jquery, jref, query_length, ref_length, qstart, rstart, maxEdits, exact,
+ lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, jbaseToNumber);
+
+ // Release Java arrays; 0 copies the array back to Java, JNI_ABORT does not copy the current array values to Java
+ (*env)->ReleasePrimitiveArrayCritical(env, baseToNumber, jbaseToNumber, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, ref, jref, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, query, jquery, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, returnVals, jreturnVals, 0);
+
+ return edits;
+}
+
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignForwardRCJNI( // JNI entry point wrapping alignForwardRC(); returns its result
+ JNIEnv *env,
+ jobject obj, // not used in this function
+ jbyteArray query,
+ jbyteArray ref,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint maxWidth,
+ jbyteArray baseToNumber,
+ jbyteArray baseToComplementExtended,
+ jintArray returnVals // out: slots 0..4 receive lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset; assumes length >= 5 — TODO confirm
+ ) {
+ jint edits = 0;
+ // Lengths of the reference and query arrays
+ jint ref_length = (*env)->GetArrayLength(env, ref);
+ jint query_length = (*env)->GetArrayLength(env, query);
+ // Obtain direct pointers to the Java arrays. NOTE(review): the returned pointers are not checked for NULL.
+ jbyte * jbaseToComplementExtended = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, baseToComplementExtended, NULL);
+ jbyte * jbaseToNumber = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, baseToNumber, NULL);
+ jbyte * jref = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, ref, NULL);
+ jbyte * jquery = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, query, NULL);
+ jint * jreturnVals = (jint*)(*env)->GetPrimitiveArrayCritical(env, returnVals, NULL);
+ // The five out-values point straight into returnVals so alignForwardRC() updates them in place
+ jint * lastQueryLoc = &jreturnVals[0];
+ jint * lastRefLoc = &jreturnVals[1];
+ jint * lastRow = &jreturnVals[2];
+ jint * lastEdits = &jreturnVals[3];
+ jint * lastOffset = &jreturnVals[4];
+
+ edits = alignForwardRC(jquery, jref, query_length, ref_length, qstart, rstart, maxEdits, exact,
+ lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, jbaseToNumber, jbaseToComplementExtended);
+ // Release the arrays: JNI_ABORT for the inputs, mode 0 for returnVals so the out-values reach Java
+ (*env)->ReleasePrimitiveArrayCritical(env, baseToComplementExtended, jbaseToComplementExtended, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, baseToNumber, jbaseToNumber, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, ref, jref, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, query, jquery, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, returnVals, jreturnVals, 0);
+
+ return edits;
+}
+
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignReverseJNI( // JNI entry point wrapping alignReverse(); returns its result
+ JNIEnv *env,
+ jobject obj, // not used in this function
+ jbyteArray query,
+ jbyteArray ref,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint maxWidth,
+ jbyteArray baseToNumber,
+ jintArray returnVals // out: slots 0..4 receive lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset; assumes length >= 5 — TODO confirm
+ ) {
+ jint edits = 0;
+ // Lengths of the reference and query arrays
+ jint ref_length = (*env)->GetArrayLength(env, ref);
+ jint query_length = (*env)->GetArrayLength(env, query);
+ // Obtain direct pointers to the Java arrays. NOTE(review): the returned pointers are not checked for NULL.
+ jbyte * jbaseToNumber = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, baseToNumber, NULL);
+ jbyte * jref = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, ref, NULL);
+ jbyte * jquery = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, query, NULL);
+ jint * jreturnVals = (jint*)(*env)->GetPrimitiveArrayCritical(env, returnVals, NULL);
+ // The five out-values point straight into returnVals so alignReverse() updates them in place
+ jint * lastQueryLoc = &jreturnVals[0];
+ jint * lastRefLoc = &jreturnVals[1];
+ jint * lastRow = &jreturnVals[2];
+ jint * lastEdits = &jreturnVals[3];
+ jint * lastOffset = &jreturnVals[4];
+
+ edits = alignReverse(jquery, jref, query_length, ref_length, qstart, rstart, maxEdits, exact,
+ lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, jbaseToNumber);
+ // Release the arrays: JNI_ABORT for the inputs, mode 0 for returnVals so the out-values reach Java
+ (*env)->ReleasePrimitiveArrayCritical(env, baseToNumber, jbaseToNumber, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, ref, jref, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, query, jquery, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, returnVals, jreturnVals, 0);
+
+ return edits;
+}
+
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignReverseRCJNI( // JNI entry point wrapping alignReverseRC(); returns its result
+ JNIEnv *env,
+ jobject obj, // not used in this function
+ jbyteArray query,
+ jbyteArray ref,
+ jint qstart,
+ jint rstart,
+ jint maxEdits,
+ jboolean exact,
+ jint maxWidth,
+ jbyteArray baseToNumber,
+ jbyteArray baseToComplementExtended,
+ jintArray returnVals // out: slots 0..4 receive lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset; assumes length >= 5 — TODO confirm
+ ) {
+ jint edits = 0;
+ // Lengths of the reference and query arrays
+ jint ref_length = (*env)->GetArrayLength(env, ref);
+ jint query_length = (*env)->GetArrayLength(env, query);
+ // Obtain direct pointers to the Java arrays. NOTE(review): the returned pointers are not checked for NULL.
+ jbyte * jbaseToComplementExtended = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, baseToComplementExtended, NULL);
+ jbyte * jbaseToNumber = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, baseToNumber, NULL);
+ jbyte * jref = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, ref, NULL);
+ jbyte * jquery = (jbyte*)(*env)->GetPrimitiveArrayCritical(env, query, NULL);
+ jint * jreturnVals = (jint*)(*env)->GetPrimitiveArrayCritical(env, returnVals, NULL);
+ // The five out-values point straight into returnVals so alignReverseRC() updates them in place
+ jint * lastQueryLoc = &jreturnVals[0];
+ jint * lastRefLoc = &jreturnVals[1];
+ jint * lastRow = &jreturnVals[2];
+ jint * lastEdits = &jreturnVals[3];
+ jint * lastOffset = &jreturnVals[4];
+
+ edits = alignReverseRC(jquery, jref, query_length, ref_length, qstart, rstart, maxEdits, exact,
+ lastQueryLoc, lastRefLoc, lastRow, lastEdits, lastOffset, maxWidth, jbaseToNumber, jbaseToComplementExtended);
+ // Release the arrays: JNI_ABORT for the inputs, mode 0 for returnVals so the out-values reach Java
+ (*env)->ReleasePrimitiveArrayCritical(env, baseToComplementExtended, jbaseToComplementExtended, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, baseToNumber, jbaseToNumber, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, ref, jref, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, query, jquery, JNI_ABORT);
+ (*env)->ReleasePrimitiveArrayCritical(env, returnVals, jreturnVals, 0);
+
+ return edits;
+}
+
diff --git a/jni/CMakeLists.txt b/jni/CMakeLists.txt
new file mode 100755
index 0000000..d15a370
--- /dev/null
+++ b/jni/CMakeLists.txt
@@ -0,0 +1,55 @@
+#Project settings
+cmake_minimum_required (VERSION 2.8)
+project(BBToolsJNI)
+
+#Find required packages; fall back to a manual search when FindJava did not locate javah
+find_package(Java REQUIRED)
+find_package(JNI REQUIRED)
+if(NOT EXISTS "${Java_JAVAH_EXECUTABLE}")
+ FIND_PROGRAM(Java_JAVAH_EXECUTABLE
+ NAMES javah
+ HINTS ${_JAVA_HINTS}
+ PATHS ${_JAVA_PATHS}
+ )
+ if(EXISTS "${Java_JAVAH_EXECUTABLE}")
+ message(STATUS "Found Java_JAVAH_EXECUTABLE=${Java_JAVAH_EXECUTABLE}")
+ else()
+ message(FATAL_ERROR "Java generator 'javah' could not be found. Ensure it is installed and in your PATH.")
+ endif()
+endif()
+
+#Set C_FLAGS on Unix
+if(UNIX OR APPLE)
+ set(CMAKE_MACOSX_RPATH TRUE)
+ set(CMAKE_C_FLAGS "-O3 -std=c99")
+endif()
+
+#Set the filename prefixes
+set(NAME1 "BandedAlignerJNI")
+set(NAME2 "MultiStateAligner11tsJNI")
+set(NAME3 "BBMergeOverlapper")
+
+#Set names of source and header files (every source is listed with its explicit .c extension; see CMake policy CMP0115)
+set(SOURCES "${NAME1}.c" "${NAME2}.c" "${NAME3}.c")
+set(HEADERS "align2_${NAME1}.h" "align2_${NAME2}.h" "jgi_${NAME3}.h")
+
+#Generate header files with javah, one custom command per class
+set(_stubDir "${CMAKE_CURRENT_BINARY_DIR}")
+set(_classDir "${CMAKE_BINARY_DIR}")
+add_custom_command(
+ OUTPUT align2_${NAME1}.h
+ COMMAND ${Java_JAVAH_EXECUTABLE} -verbose -classpath ${_classDir} -d ${_stubDir} -jni align2.${NAME1}
+ )
+add_custom_command(
+ OUTPUT align2_${NAME2}.h
+ COMMAND ${Java_JAVAH_EXECUTABLE} -verbose -classpath ${_classDir} -d ${_stubDir} -jni align2.${NAME2}
+ )
+add_custom_command(
+ OUTPUT jgi_${NAME3}.h
+ COMMAND ${Java_JAVAH_EXECUTABLE} -verbose -classpath ${_classDir} -d ${_stubDir} -jni jgi.${NAME3}
+ )
+
+#Build the shared library target bbtoolsjni from the sources and generated headers
+include_directories(${CMAKE_CURRENT_BINARY_DIR} ${JNI_INCLUDE_DIRS})
+add_library(bbtoolsjni SHARED ${SOURCES} ${HEADERS})
+
diff --git a/jni/MultiStateAligner11tsJNI.c b/jni/MultiStateAligner11tsJNI.c
new file mode 100755
index 0000000..64e7b63
--- /dev/null
+++ b/jni/MultiStateAligner11tsJNI.c
@@ -0,0 +1,813 @@
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "align2_MultiStateAligner11tsJNI.h"
+
+// C has no built-in min()/max(); these macros copy each argument into a local first, so it is evaluated only once (uses the GNU ({ ... }) and __typeof__ extensions)
+#define max(a,b) \
+ ({ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a > _b ? _a : _b; })
+
+#define min(a,b) \
+ ({ __typeof__ (a) _a = (a); \
+ __typeof__ (b) _b = (b); \
+ _a < _b ? _a : _b; })
+
+#define GAPLEN 128
+#define GAPCOST max(1,GAPLEN/64)
+
+#define GAPC '-'
+// Matrix selectors; the fill routines multiply them by the per-matrix size to locate a state inside packed[]
+#define MODE_MS 0
+#define MODE_DEL 1
+#define MODE_INS 2
+#define MODE_SUB 3
+
+#define AFFINE_ARRAYS 1
+// A packed cell keeps a streak counter ("time") in its low TIMEBITS bits and the score in the remaining SCOREBITS bits
+#define TIMEBITS 11
+#define SCOREBITS (32-TIMEBITS)
+#define MAX_TIME ((1<<TIMEBITS)-1)
+#define MAX_SCORE (((1<<(SCOREBITS-1))-1)-2000)
+#define MIN_SCORE (0-MAX_SCORE) //Keeps it 1 point above "BAD".
+
+#define SCOREOFFSET TIMEBITS
+// Masks are built without shifting a negative value (undefined in C); SCOREMASK is the complement of TIMEMASK because SCOREBITS+TIMEBITS==32
+#define TIMEMASK ((1<<TIMEBITS)-1)
+#define SCOREMASK (~TIMEMASK)
+
+#define POINTS_NOREF 0 //default -110
+#define POINTS_NOCALL 0
+#define POINTS_MATCH 70 //default 50
+#define POINTS_MATCH2 100 //Note: Changing to 90 substantially reduces false positives
+#define POINTS_COMPATIBLE 50
+#define POINTS_SUB -127 //default -133
+#define POINTS_SUBR -147 //increased penalty if prior match streak was at most 1 (I have no idea why this improves things)
+#define POINTS_SUB2 -51 //default -47
+#define POINTS_SUB3 -25
+#define POINTS_MATCHSUB -10
+#define POINTS_INS -395 //default -251
+#define POINTS_INS2 -39 //default -61
+#define POINTS_INS3 -23 //default -20
+#define POINTS_INS4 -8 //default -20
+#define POINTS_DEL -472 //default -239
+#define POINTS_DEL2 -33 //default -30
+#define POINTS_DEL3 -9 //default -7
+#define POINTS_DEL4 -1 //default -1
+#define POINTS_DEL5 -1 //default -1
+#define POINTS_DEL_REF_N -10 //default -10
+#define POINTS_GAP (0-GAPCOST) //default -10
+
+#define TIMESLIP 4
+#define MASK5 (TIMESLIP-1)
+
+#define BARRIER_I1 2
+#define BARRIER_D1 3
+
+#define LIMIT_FOR_COST_3 5
+#define LIMIT_FOR_COST_4 20
+#define LIMIT_FOR_COST_5 80
+
+#define BAD (MIN_SCORE-1)
+// Scores moved into the score field of a packed cell. Multiplying by (1<<SCOREOFFSET) gives the same values as a left shift but stays defined for the negative scores.
+#define POINTSoff_NOREF (POINTS_NOREF*(1<<SCOREOFFSET))
+#define POINTSoff_NOCALL (POINTS_NOCALL*(1<<SCOREOFFSET))
+#define POINTSoff_MATCH (POINTS_MATCH*(1<<SCOREOFFSET))
+#define POINTSoff_MATCH2 (POINTS_MATCH2*(1<<SCOREOFFSET))
+#define POINTSoff_COMPATIBLE (POINTS_COMPATIBLE*(1<<SCOREOFFSET))
+#define POINTSoff_SUB (POINTS_SUB*(1<<SCOREOFFSET))
+#define POINTSoff_SUBR (POINTS_SUBR*(1<<SCOREOFFSET))
+#define POINTSoff_SUB2 (POINTS_SUB2*(1<<SCOREOFFSET))
+#define POINTSoff_SUB3 (POINTS_SUB3*(1<<SCOREOFFSET))
+#define POINTSoff_MATCHSUB (POINTS_MATCHSUB*(1<<SCOREOFFSET))
+#define POINTSoff_INS (POINTS_INS*(1<<SCOREOFFSET))
+#define POINTSoff_INS2 (POINTS_INS2*(1<<SCOREOFFSET))
+#define POINTSoff_INS3 (POINTS_INS3*(1<<SCOREOFFSET))
+#define POINTSoff_INS4 (POINTS_INS4*(1<<SCOREOFFSET))
+#define POINTSoff_DEL (POINTS_DEL*(1<<SCOREOFFSET))
+#define POINTSoff_DEL2 (POINTS_DEL2*(1<<SCOREOFFSET))
+#define POINTSoff_DEL3 (POINTS_DEL3*(1<<SCOREOFFSET))
+#define POINTSoff_DEL4 (POINTS_DEL4*(1<<SCOREOFFSET))
+#define POINTSoff_DEL5 (POINTS_DEL5*(1<<SCOREOFFSET))
+#define POINTSoff_GAP (POINTS_GAP*(1<<SCOREOFFSET))
+#define POINTSoff_DEL_REF_N (POINTS_DEL_REF_N*(1<<SCOREOFFSET))
+#define BADoff (BAD*(1<<SCOREOFFSET))
+#define MAXoff_SCORE (MAX_SCORE*(1<<SCOREOFFSET))
+#define MINoff_SCORE (MIN_SCORE*(1<<SCOREOFFSET))
+
+void fillUnlimited( // fills the MS, DEL and INS score matrices for every read row and window column, then reports the best cell of the last row
+ jbyte * read, // read bases; row r uses read[r-1]
+ jbyte * ref, // reference bases; column c uses ref[refStartLoc+c-1]
+ jsize read_length, // number of rows filled
+ jsize ref_length, // not used in this function
+ jint refStartLoc, // first reference index of the window
+ jint refEndLoc, // last reference index of the window (inclusive)
+ jint * result, // out: [0]=rows, [1]=best column, [2]=best state, [3]=best score with the score offset removed
+ jlong * iterationsUnlimited, // in/out: incremented once per cell visited
+ jint * packed, // three consecutive matrices of (maxRows+1)*(maxColumns+1) cells, selected by MODE_MS/MODE_DEL/MODE_INS
+ jint * POINTSoff_SUB_ARRAY, // substitution score table, read at index streak+1
+ jint * POINTSoff_INS_ARRAY, // insertion score table, read at index streak+1
+ jint maxRows, // row dimension used to size and index each matrix
+ jint maxColumns // column dimension used to size and index each matrix
+ ) {
+ // NOTE(review): row 0 and column 0 of packed[] are read below but never written here; presumably the caller initializes them — verify against caller
+ const jint rows=read_length;
+ const jint columns=refEndLoc-refStartLoc+1;
+
+ const jint maxGain=(read_length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; // score of a path that matches at every row
+ const jint subfloor=0-2*maxGain; // value stored in cells that are ruled out
+ const jint BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ const jint BARRIER_D2=rows-BARRIER_D1;
+
+ const int sizeXY=(maxRows+1)*(maxColumns+1); // cells per matrix
+ const int idxMS=MODE_MS*sizeXY;
+ const int idxDEL=MODE_DEL*sizeXY;
+ const int idxINS=MODE_INS*sizeXY;
+
+ //temporary, for finding a bug
+ if(rows>maxRows || columns>maxColumns){
+ printf("error\n"); exit(0); // NOTE(review): this ends the whole process with exit status 0
+ }
+
+ for(int row=1; row<=rows; row++){
+ const int tmp1=(row-1)*(maxColumns+1); // offset of the previous row inside a matrix
+ const int tmp2=(row)*(maxColumns+1); // offset of the current row inside a matrix
+ for(int col=1; col<=columns; col++){
+ (*iterationsUnlimited)++;
+
+ const jbyte call0=(row<2 ? (jbyte)'?' : read[row-2]); // previous read base, or '?' on the first row
+ const jbyte call1=read[row-1];
+ const jbyte ref0=(col<2 ? (jbyte)'!' : ref[refStartLoc+col-2]); // previous reference base, or '!' on the first column
+ const jbyte ref1=ref[refStartLoc+col-1];
+
+ const jboolean match=(call1==ref1 && ref1!='N');
+ const jboolean prevMatch=(call0==ref0 && ref0!='N');
+
+ const jboolean gap=(ref1==GAPC);
+
+ if(gap){
+ packed[idxMS+tmp2+col]=subfloor;
+ }else{//Calculate match and sub scores
+
+ const jint scoreFromDiag=packed[idxMS+tmp1+col-1]&SCOREMASK; // all three sources are the cell at (row-1, col-1)
+ const jint scoreFromDel=packed[idxDEL+tmp1+col-1]&SCOREMASK;
+ const jint scoreFromIns=packed[idxINS+tmp1+col-1]&SCOREMASK;
+ const jint streak=(packed[idxMS+tmp1+col-1]&TIMEMASK);
+
+ {//Calculate match/sub score
+
+ if(match){
+
+ const jint scoreMS=scoreFromDiag+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ const jint scoreD=scoreFromDel+POINTSoff_MATCH;
+ const jint scoreI=scoreFromIns+POINTSoff_MATCH;
+
+ jint score;
+ jint time;
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ // ties prefer MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ }else{
+ score=scoreI;
+ time=1;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} // keep the streak counter inside its TIMEBITS field
+ packed[idxMS+tmp2+col]=(score|time);
+
+ }else{
+
+ jint scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ POINTSoff_SUB_ARRAY[streak+1]);
+ }else{
+ scoreMS=scoreFromDiag+POINTSoff_NOCALL;
+ }
+
+ const jint scoreD=scoreFromDel+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ const jint scoreI=scoreFromIns+POINTSoff_SUB;
+
+ jint score;
+ jint time;
+ jbyte prevState; // NOTE(review): assigned below but never read in this function
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ packed[idxMS+tmp2+col]=(score|time);
+ }
+ }
+ }
+
+ if(row<BARRIER_D1 || row>BARRIER_D2){ // the DEL state is ruled out in the first and last few rows
+ packed[idxDEL+tmp2+col]=subfloor;
+ }else{//Calculate DEL score
+
+ const jint streak=packed[idxDEL+tmp2+col-1]&TIMEMASK; // DEL reads the cell at (row, col-1)
+
+ const jint scoreFromDiag=packed[idxMS+tmp2+col-1]&SCOREMASK;
+ const jint scoreFromDel=packed[idxDEL+tmp2+col-1]&SCOREMASK;
+
+ jint scoreMS=scoreFromDiag+POINTSoff_DEL;
+ jint scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : // the per-step cost depends on how long the current streak is
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ jint score;
+ jint time;
+ jbyte prevState; // NOTE(review): assigned below but never read in this function
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ packed[idxDEL+tmp2+col]=(score|time);
+ }
+
+ //Calculate INS score
+ if(gap || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[idxINS+tmp2+col]=subfloor;
+ }else{//Calculate INS score
+
+ const jint streak=packed[idxINS+tmp1+col]&TIMEMASK; // INS reads the cell at (row-1, col)
+
+ const jint scoreFromDiag=packed[idxMS+tmp1+col]&SCOREMASK;
+ const jint scoreFromIns=packed[idxINS+tmp1+col]&SCOREMASK;
+
+ const jint scoreMS=scoreFromDiag+POINTSoff_INS;
+ const jint scoreI=scoreFromIns+POINTSoff_INS_ARRAY[streak+1];
+
+ jint score;
+ jint time;
+ jbyte prevState; // NOTE(review): assigned below but never read in this function
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ packed[idxINS+tmp2+col]=(score|time);
+ }
+ }
+ }
+ // Scan the last row of all three matrices for the highest score; the first cell found wins ties
+ jint maxCol=-1;
+ jint maxState=-1;
+ jint maxScore=INT_MIN;
+
+ const int tmp=rows*(maxColumns+1);
+ for(int state=0; state<3; state++){
+ for(int col=1; col<=columns; col++){
+ const int x=packed[(state)*sizeXY+tmp+col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+ maxScore>>=SCOREOFFSET; // remove the score offset before reporting
+
+ result[0]=rows;
+ result[1]=maxCol;
+ result[2]=maxState;
+ result[3]=maxScore;
+ return;
+}
+
+/*
+ * Total offset-shifted penalty for a deletion spanning `len` reference bases.
+ * A non-positive length costs nothing. Otherwise the first base is charged
+ * POINTSoff_DEL and the remainder is charged tier by tier, longest tier first:
+ *   beyond LIMIT_FOR_COST_5: POINTSoff_DEL5 per ((excess+MASK5)/TIMESLIP)
+ *   beyond LIMIT_FOR_COST_4: POINTSoff_DEL4 per base
+ *   beyond LIMIT_FOR_COST_3: POINTSoff_DEL3 per base
+ *   beyond the first base:   POINTSoff_DEL2 per base
+ */
+jint calcDelScoreOffset(jint len){
+    jint total=0;
+
+    if(len>0){
+        jint remaining=len;
+        total=POINTSoff_DEL;
+
+        /* Peel off the cheapest (longest) tier first, then clamp and continue. */
+        if(remaining>LIMIT_FOR_COST_5){
+            total+=((remaining-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5;
+            remaining=LIMIT_FOR_COST_5;
+        }
+        if(remaining>LIMIT_FOR_COST_4){
+            total+=(remaining-LIMIT_FOR_COST_4)*POINTSoff_DEL4;
+            remaining=LIMIT_FOR_COST_4;
+        }
+        if(remaining>LIMIT_FOR_COST_3){
+            total+=(remaining-LIMIT_FOR_COST_3)*POINTSoff_DEL3;
+            remaining=LIMIT_FOR_COST_3;
+        }
+        if(remaining>1){
+            total+=(remaining-1)*POINTSoff_DEL2;
+        }
+    }
+
+    return total;
+}
+
+/*
+ * Total offset-shifted penalty for an insertion of `len` read bases.
+ * A non-positive length costs nothing. When AFFINE_ARRAYS is enabled the
+ * value is read from POINTSoff_INS_ARRAY_C[len]; otherwise it is built from
+ * POINTSoff_INS for the first base plus per-base tier costs
+ * (POINTSoff_INS4 / INS3 / INS2), longest tier first.
+ */
+jint calcInsScoreOffset(jint len,
+    jint * POINTSoff_INS_ARRAY_C
+    ){
+    if(len<1){return 0;}
+
+    /* Table-driven path. */
+    if(AFFINE_ARRAYS){return POINTSoff_INS_ARRAY_C[len];}
+
+    /* Computed path: same tiering scheme as the deletion penalty. */
+    jint remaining=len;
+    jint total=POINTSoff_INS;
+    if(remaining>LIMIT_FOR_COST_4){
+        total+=(remaining-LIMIT_FOR_COST_4)*POINTSoff_INS4;
+        remaining=LIMIT_FOR_COST_4;
+    }
+    if(remaining>LIMIT_FOR_COST_3){
+        total+=(remaining-LIMIT_FOR_COST_3)*POINTSoff_INS3;
+        remaining=LIMIT_FOR_COST_3;
+    }
+    if(remaining>1){
+        total+=(remaining-1)*POINTSoff_INS2;
+    }
+    return total;
+}
+
+void fillLimitedX( // Score-pruned (optionally banded) fill of the three packed planes MS/DEL/INS; reports the best last-row cell in result[0..4]
+ jbyte * read,
+ jbyte * ref,
+ jsize read_length, // becomes the number of matrix rows
+ jsize ref_length, // not referenced in this function body
+ jint refStartLoc, // first reference index used; columns=refEndLoc-refStartLoc+1
+ jint refEndLoc,
+ jint minScore, // unshifted minimum score; shifted left by SCOREOFFSET before use
+ jint * result, // out: {rows, maxCol, maxState, maxScore, flag}; flag is 1 when the best score is below the minimum
+ jlong * iterationsLimited, // in/out counter, incremented once per visited cell
+ jint * packed, // three planes of (maxRows+1)*(maxColumns+1) cells; each cell is written as score|time
+ jint * POINTSoff_SUB_ARRAY, // indexed by streak+1 when scoring a substitution
+ jint * POINTSoff_INS_ARRAY, // indexed by streak+1 when extending an insertion
+ jint maxRows,
+ jint maxColumns,
+ jint bandwidth, // values <1 mean "no fixed band width"
+ jfloat bandwidthRatio, // values <=0 mean "no ratio-based band width"
+ jint * vertLimit, // written here for indices 0..rows: per-row score threshold
+ jint * horizLimit, // written here for indices 0..columns: per-column score threshold
+ jbyte * baseToNumber, // lookup by base byte; a negative entry is handled as an undefined base
+ jint * POINTSoff_INS_ARRAY_C // only passed through to calcInsScoreOffset
+ ) {
+
+ const jint rows=read_length;
+ const jint columns=refEndLoc-refStartLoc+1;
+
+ const int sizeXY=(maxRows+1)*(maxColumns+1); // cells per plane
+ const int idxMS=MODE_MS*sizeXY; // base offset of each plane inside packed[]
+ const int idxDEL=MODE_DEL*sizeXY;
+ const int idxINS=MODE_INS*sizeXY;
+
+ const jint halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 : // 0 turns banding off (see colStart/colStop below)
+ max(min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(jint)(rows*bandwidthRatio)), (columns-rows+8))/2;
+
+ const jint BARRIER_I2=rows-BARRIER_I1, BARRIER_I2b=columns-1;
+ const jint BARRIER_D2=rows-BARRIER_D1;
+
+ const int tmp=rows*(maxColumns+1); // offset of the last row; reused by the final scan
+ for(int x=0; x<3; x++){ // preset the last row of all three planes to BADoff
+ for(int i=1; i<columns+1; i++) {
+ packed[x*sizeXY+tmp+i]=BADoff;
+ }
+ }
+
+ jint minGoodCol=1; // window of columns that held a viable cell in the previous row
+ jint maxGoodCol=columns;
+
+ const jint minScore_off=(minScore<<SCOREOFFSET);
+ const jint maxGain=(read_length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; // one MATCH plus MATCH2 for every further read base
+ const jint floor=minScore_off-maxGain; // lower clamp for the limits computed below
+ const jint subfloor=floor-5*POINTSoff_MATCH2; // value stored in pruned or disallowed cells
+
+ vertLimit[rows]=minScore_off; // build vertLimit backwards from the last row
+ jboolean prevDefined=0;
+ for(int i=rows-1; i>=0; i--){
+ jbyte c=read[i];
+ //if(AminoAcid.isFullyDefined(c)){
+ if(baseToNumber[c]>=0){ // NOTE(review): c is a signed jbyte used as an index - confirm callers never pass negative bytes
+ vertLimit[i]=max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=1;
+ }else{
+ vertLimit[i]=max(vertLimit[i+1]-POINTSoff_NOCALL, floor);
+ prevDefined=0;
+ }
+ }
+
+ horizLimit[columns]=minScore_off; // same construction along the reference
+ prevDefined=0;
+ for(int i=columns-1; i>=0; i--){
+ jbyte c=ref[refStartLoc+i];
+ if(baseToNumber[c]>=0){ // NOTE(review): same signed-index caveat as for the read above
+ horizLimit[i]=max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor);
+ prevDefined=1;
+ }else{
+ horizLimit[i]=max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor);
+ prevDefined=0;
+ }
+ }
+
+ for(int row=1; row<=rows; row++){
+ const jint colStart=(halfband<1 ? minGoodCol : max(minGoodCol, row-halfband)); // intersect last row's good window with the band
+ const jint colStop=(halfband<1 ? maxGoodCol : min(maxGoodCol, row+halfband*2-1));
+
+ minGoodCol=-1; // reset; recomputed from this row's viable cells
+ maxGoodCol=-2;
+
+ const jint vlimit=vertLimit[row];
+
+ if(colStart<0 || colStop<colStart){break;} // no viable column left, stop filling
+
+ if(colStart>1){ // write subfloor into the cell left of colStart, which this row reads as col-1
+ const int tmp3=(row)*(maxColumns+1)+(colStart-1);
+ packed[idxMS+tmp3]=subfloor;
+ packed[idxINS+tmp3]=subfloor;
+ packed[idxDEL+tmp3]=subfloor;
+ }
+
+ const int tmp1=(row-1)*(maxColumns+1); // offset of the previous row
+ const int tmp2=(row)*(maxColumns+1); // offset of the current row
+ for(int col=colStart; col<=columns; col++){ // normally left early through the colStop check at the bottom
+ const jbyte call0=(row<2 ? (jbyte)'?' : read[row-2]);
+ const jbyte call1=read[row-1];
+ const jbyte ref0=(col<2 ? (jbyte)'!' : ref[refStartLoc+col-2]);
+ const jbyte ref1=ref[refStartLoc+col-1];
+
+ const jboolean gap=(ref1==GAPC);
+
+ const jboolean match=(call1==ref1 && ref1!='N');
+ const jboolean prevMatch=(call0==ref0 && ref0!='N');
+
+ (*iterationsLimited)++;
+ const jint limit=max(vlimit, horizLimit[col]); // threshold for this cell
+ const jint limit3=max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); // threshold applied to the MS predecessors
+
+ const jint delNeeded=max(0, row-col-1); // derived from the cell's offset from the diagonal
+ const jint insNeeded=max(0, (rows-row)-(columns-col)-1); // derived from remaining rows versus remaining columns
+
+ const jint delPenalty=calcDelScoreOffset(delNeeded);
+ const jint insPenalty=calcInsScoreOffset(insNeeded,POINTSoff_INS_ARRAY_C);
+
+ const jint scoreFromDiag_MS=packed[idxMS+tmp1+col-1]&SCOREMASK; // diagonal predecessors (previous row, previous column)
+ const jint scoreFromDel_MS=packed[idxDEL+tmp1+col-1]&SCOREMASK;
+ const jint scoreFromIns_MS=packed[idxINS+tmp1+col-1]&SCOREMASK;
+
+ const jint scoreFromDiag_DEL=packed[idxMS+tmp2+col-1]&SCOREMASK; // left predecessors (same row, previous column)
+ const jint scoreFromDel_DEL=packed[idxDEL+tmp2+col-1]&SCOREMASK;
+
+ const jint scoreFromDiag_INS=packed[idxMS+tmp1+col]&SCOREMASK; // upper predecessors (previous row, same column)
+ const jint scoreFromIns_INS=packed[idxINS+tmp1+col]&SCOREMASK;
+
+ if(gap || (scoreFromDiag_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){
+ packed[idxMS+tmp2+col]=subfloor;
+ }else{//Calculate match and sub scores
+ const jint streak=(packed[idxMS+tmp1+col-1]&TIMEMASK);
+ {//Calculate match/sub score
+ jint score;
+ jint time;
+ jbyte prevState; // assigned below but never read in this function
+
+ if(match){
+ const jint scoreMS=scoreFromDiag_MS+(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH);
+ const jint scoreD=scoreFromDel_MS+POINTSoff_MATCH;
+ const jint scoreI=scoreFromIns_MS+POINTSoff_MATCH;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){ // ties favour MS, then DEL, then INS
+ score=scoreMS;
+ time=(prevMatch ? streak+1 : 1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+ }else{
+ jint scoreMS;
+ if(ref1!='N' && call1!='N'){
+ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) :
+ POINTSoff_SUB_ARRAY[streak+1]);
+ }else{
+ scoreMS=scoreFromDiag_MS+POINTSoff_NOCALL;
+ }
+
+ const jint scoreD=scoreFromDel_MS+POINTSoff_SUB; //+2 to move it as close as possible to the deletion / insertion
+ const jint scoreI=scoreFromIns_MS+POINTSoff_SUB;
+
+ if(scoreMS>=scoreD && scoreMS>=scoreI){
+ score=scoreMS;
+ time=(prevMatch ? 1 : streak+1);
+ prevState=MODE_MS;
+ }else if(scoreD>=scoreI){
+ score=scoreD;
+ time=1;
+ prevState=MODE_DEL;
+ }else{
+ score=scoreI;
+ time=1;
+ prevState=MODE_INS;
+ }
+ }
+
+ jint limit2; // limit adjusted by the indel penalty computed above
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else{
+ limit2=limit;
+ }
+
+ if(score>=limit2){ // viable cell: extend the good-column window used by the next row
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;} // clamp the streak counter
+ packed[idxMS+tmp2+col]=(score|time); // store score and streak in one cell
+ }
+ }
+
+ if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || row<BARRIER_D1 || row>BARRIER_D2){
+ packed[idxDEL+tmp2+col]=subfloor;
+ }else{//Calculate DEL score
+ const jint streak=packed[idxDEL+tmp2+col-1]&TIMEMASK;
+
+ jint scoreMS=scoreFromDiag_DEL+POINTSoff_DEL;
+ jint scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL : // extension cost depends on the current deletion streak
+ streak<LIMIT_FOR_COST_3 ? POINTSoff_DEL2 :
+ streak<LIMIT_FOR_COST_4 ? POINTSoff_DEL3 :
+ streak<LIMIT_FOR_COST_5 ? POINTSoff_DEL4 :
+ ((streak&MASK5)==0 ? POINTSoff_DEL5 : 0));
+
+ if(ref1=='N'){
+ scoreMS+=POINTSoff_DEL_REF_N;
+ scoreD+=POINTSoff_DEL_REF_N;
+ }else if(gap){
+ scoreMS+=POINTSoff_GAP;
+ scoreD+=POINTSoff_GAP;
+ }
+
+ jint score;
+ jint time;
+ jbyte prevState; // assigned below but never read in this function
+ if(scoreMS>=scoreD){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreD;
+ time=streak+1;
+ prevState=MODE_DEL;
+ }
+
+ jint limit2;
+ if(insNeeded>0){
+ limit2=limit-insPenalty;
+ }else if(delNeeded>0){
+ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); // charge only the part of the deletion beyond the current streak
+ }else{
+ limit2=limit;
+ }
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ packed[idxDEL+tmp2+col]=(score|time);
+ }
+
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || (row<BARRIER_I1 && col>1) || (row>BARRIER_I2 && col<BARRIER_I2b)){
+ packed[idxINS+tmp2+col]=subfloor;
+ }else{//Calculate INS score
+ const jint streak=packed[idxINS+tmp1+col]&TIMEMASK;
+
+ const jint scoreMS=scoreFromDiag_INS+POINTSoff_INS;
+ const jint scoreI=scoreFromIns_INS+POINTSoff_INS_ARRAY[streak+1];
+
+ jint score;
+ jint time;
+ jbyte prevState; // assigned below but never read in this function
+ if(scoreMS>=scoreI){
+ score=scoreMS;
+ time=1;
+ prevState=MODE_MS;
+ }else{
+ score=scoreI;
+ time=streak+1;
+ prevState=MODE_INS;
+ }
+
+ jint limit2;
+ if(delNeeded>0){
+ limit2=limit-delPenalty;
+ }else if(insNeeded>0){
+ limit2=limit-calcInsScoreOffset(time+insNeeded,POINTSoff_INS_ARRAY_C)+calcInsScoreOffset(time,POINTSoff_INS_ARRAY_C); // charge only the part of the insertion beyond the current streak
+ }else{
+ limit2=limit;
+ }
+
+ if(score>=limit2){
+ maxGoodCol=col;
+ if(minGoodCol<0){minGoodCol=col;}
+ }else{
+ score=subfloor;
+ }
+
+ if(time>MAX_TIME){time=MAX_TIME-MASK5;}
+ packed[idxINS+tmp2+col]=(score|time);
+ }
+
+ if(col>=colStop){
+ if(col>colStop && (maxGoodCol<col || halfband>0)){break;} // past colStop: stop when banding, or when this column produced no viable cell
+ if(row>1){ // write subfloor into the previous row at col+1, which the next column reads as its upper neighbour
+ const int tmp3=tmp1+(col+1);
+ packed[idxMS+tmp3]=subfloor;
+ packed[idxINS+tmp3]=subfloor;
+ packed[idxDEL+tmp3]=subfloor;
+ }
+ }
+ }
+ }
+
+ jint maxCol=-1;
+ jint maxState=-1;
+ jint maxScore=INT_MIN;
+
+ //const int tmp=rows*(maxColumns+1);
+ for(int state=0; state<3; state++){ // scan the last row of each plane for the highest score
+ for(int col=1; col<=columns; col++){
+ const int x=packed[(state)*sizeXY+tmp+col]&SCOREMASK;
+ if(x>maxScore){
+ maxScore=x;
+ maxCol=col;
+ maxState=state;
+ }
+ }
+ }
+
+ if(maxScore<minScore_off){ // below the minimum: maxScore is reported without the SCOREOFFSET shift and result[4]=1
+ result[0]=rows;
+ result[1]=maxCol;
+ result[2]=maxState;
+ result[3]=maxScore;
+ result[4]=1;
+ return;
+ }
+
+ maxScore>>=SCOREOFFSET; // drop the time bits so the score is in unshifted units
+ result[0]=rows;
+ result[1]=maxCol;
+ result[2]=maxState;
+ result[3]=maxScore;
+ result[4]=0;
+ return;
+}
+
+
+/*
+ * JNI entry point for MultiStateAligner11tsJNI.fillUnlimitedJNI.
+ *
+ * Acquires direct (critical) access to the Java arrays, runs the C
+ * fillUnlimited() kernel on them and releases the arrays again. The four
+ * result values are delivered through the `result` array.
+ *
+ * GetPrimitiveArrayCritical may return NULL. Acquisition therefore stops at
+ * the first failure, the kernel is skipped, and only the arrays that were
+ * actually obtained are released. In that case `result` is left unmodified
+ * (the JVM may additionally have an exception pending for the Java caller).
+ */
+JNIEXPORT void JNICALL Java_align2_MultiStateAligner11tsJNI_fillUnlimitedJNI(
+    JNIEnv *env,
+    jobject obj,
+    jbyteArray read,
+    jbyteArray ref,
+    jint refStartLoc,
+    jint refEndLoc,
+    jintArray result,
+    jlongArray iterationsUnlimited,
+    jintArray packed,
+    jintArray POINTSoff_SUB_ARRAY,
+    jintArray POINTSoff_INS_ARRAY,
+    jint maxRows,
+    jint maxColumns
+    ) {
+    // Get the size of the read and the reference arrays (must happen before entering the critical region)
+    jsize read_length=(*env)->GetArrayLength(env, read);
+    jsize ref_length=(*env)->GetArrayLength(env, ref);
+
+    // All pointers start as NULL so the release code can tell which arrays were obtained
+    jint * jPOINTSoff_INS_ARRAY=NULL;
+    jint * jPOINTSoff_SUB_ARRAY=NULL;
+    jint * jpacked=NULL;
+    jbyte * jread=NULL;
+    jbyte * jref=NULL;
+    jint * jresult=NULL;
+    jlong * jiterationsUnlimited=NULL;
+
+    // Acquire the arrays in the original order; && short-circuits at the first failure
+    if((jPOINTSoff_INS_ARRAY=(jint*)(*env)->GetPrimitiveArrayCritical(env, POINTSoff_INS_ARRAY, NULL))!=NULL
+        && (jPOINTSoff_SUB_ARRAY=(jint*)(*env)->GetPrimitiveArrayCritical(env, POINTSoff_SUB_ARRAY, NULL))!=NULL
+        && (jpacked=(jint*)(*env)->GetPrimitiveArrayCritical(env, packed, NULL))!=NULL
+        && (jread=(jbyte*)(*env)->GetPrimitiveArrayCritical(env, read, NULL))!=NULL
+        && (jref=(jbyte*)(*env)->GetPrimitiveArrayCritical(env, ref, NULL))!=NULL
+        && (jresult=(jint*)(*env)->GetPrimitiveArrayCritical(env, result, NULL))!=NULL
+        && (jiterationsUnlimited=(jlong*)(*env)->GetPrimitiveArrayCritical(env, iterationsUnlimited, NULL))!=NULL){
+
+        // Call the fillUnlimited function in C; the 4 return values will be in jresult[].
+        // jiterationsUnlimited points at element 0, which the kernel updates in place.
+        fillUnlimited(jread,jref,read_length,ref_length,refStartLoc,refEndLoc,jresult,jiterationsUnlimited,jpacked,jPOINTSoff_SUB_ARRAY,jPOINTSoff_INS_ARRAY,maxRows,maxColumns);
+    }
+
+    // Release Java arrays; 0 copies the array back to Java, JNI_ABORT does not copy the current array values to Java
+    if(jresult!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, result, jresult, 0);}
+    if(jiterationsUnlimited!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, iterationsUnlimited, jiterationsUnlimited, 0);}
+    if(jread!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, read, jread, JNI_ABORT);}
+    if(jref!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, ref, jref, JNI_ABORT);}
+    if(jpacked!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, packed, jpacked, 0);}
+    if(jPOINTSoff_SUB_ARRAY!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, POINTSoff_SUB_ARRAY, jPOINTSoff_SUB_ARRAY, JNI_ABORT);}
+    if(jPOINTSoff_INS_ARRAY!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, POINTSoff_INS_ARRAY, jPOINTSoff_INS_ARRAY, JNI_ABORT);}
+
+    return;
+}
+
+/*
+ * JNI entry point for MultiStateAligner11tsJNI.fillLimitedXJNI.
+ *
+ * Acquires direct (critical) access to the Java arrays, runs the C
+ * fillLimitedX() kernel on them and releases the arrays again. The five
+ * result values are delivered through the `result` array; `vertLimit` and
+ * `horizLimit` are written by the kernel and copied back as well.
+ *
+ * GetPrimitiveArrayCritical may return NULL. Acquisition therefore stops at
+ * the first failure, the kernel is skipped, and only the arrays that were
+ * actually obtained are released. In that case `result` is left unmodified
+ * (the JVM may additionally have an exception pending for the Java caller).
+ */
+JNIEXPORT void JNICALL Java_align2_MultiStateAligner11tsJNI_fillLimitedXJNI(
+    JNIEnv *env,
+    jobject obj,
+    jbyteArray read,
+    jbyteArray ref,
+    jint refStartLoc,
+    jint refEndLoc,
+    jint minScore,
+    jintArray result,
+    jlongArray iterationsLimited,
+    jintArray packed,
+    jintArray POINTSoff_SUB_ARRAY,
+    jintArray POINTSoff_INS_ARRAY,
+    jint maxRows,
+    jint maxColumns,
+    jint bandwidth,
+    jfloat bandwidthRatio,
+    jintArray vertLimit,
+    jintArray horizLimit,
+    jbyteArray baseToNumber,
+    jintArray POINTSoff_INS_ARRAY_C
+    ) {
+    // Get the size of the read and the reference arrays (must happen before entering the critical region)
+    jsize read_length=(*env)->GetArrayLength(env, read);
+    jsize ref_length=(*env)->GetArrayLength(env, ref);
+
+    // All pointers start as NULL so the release code can tell which arrays were obtained
+    jint * jPOINTSoff_INS_ARRAY=NULL;
+    jint * jPOINTSoff_SUB_ARRAY=NULL;
+    jint * jpacked=NULL;
+    jbyte * jread=NULL;
+    jbyte * jref=NULL;
+    jint * jresult=NULL;
+    jlong * jiterationsLimited=NULL;
+    jint * jvertLimit=NULL;
+    jint * jhorizLimit=NULL;
+    jbyte * jbaseToNumber=NULL;
+    jint * jPOINTSoff_INS_ARRAY_C=NULL;
+
+    // Acquire the arrays in the original order; && short-circuits at the first failure
+    if((jPOINTSoff_INS_ARRAY=(jint*)(*env)->GetPrimitiveArrayCritical(env, POINTSoff_INS_ARRAY, NULL))!=NULL
+        && (jPOINTSoff_SUB_ARRAY=(jint*)(*env)->GetPrimitiveArrayCritical(env, POINTSoff_SUB_ARRAY, NULL))!=NULL
+        && (jpacked=(jint*)(*env)->GetPrimitiveArrayCritical(env, packed, NULL))!=NULL
+        && (jread=(jbyte*)(*env)->GetPrimitiveArrayCritical(env, read, NULL))!=NULL
+        && (jref=(jbyte*)(*env)->GetPrimitiveArrayCritical(env, ref, NULL))!=NULL
+        && (jresult=(jint*)(*env)->GetPrimitiveArrayCritical(env, result, NULL))!=NULL
+        && (jiterationsLimited=(jlong*)(*env)->GetPrimitiveArrayCritical(env, iterationsLimited, NULL))!=NULL
+        && (jvertLimit=(jint*)(*env)->GetPrimitiveArrayCritical(env, vertLimit, NULL))!=NULL
+        && (jhorizLimit=(jint*)(*env)->GetPrimitiveArrayCritical(env, horizLimit, NULL))!=NULL
+        && (jbaseToNumber=(jbyte*)(*env)->GetPrimitiveArrayCritical(env, baseToNumber, NULL))!=NULL
+        && (jPOINTSoff_INS_ARRAY_C=(jint*)(*env)->GetPrimitiveArrayCritical(env, POINTSoff_INS_ARRAY_C, NULL))!=NULL){
+
+        // Call the fillLimitedX function in C; the 5 return values will be in jresult[].
+        // jiterationsLimited points at element 0, which the kernel updates in place.
+        fillLimitedX(jread,jref,read_length,ref_length,refStartLoc,refEndLoc,minScore,jresult,jiterationsLimited,jpacked,jPOINTSoff_SUB_ARRAY,jPOINTSoff_INS_ARRAY,maxRows,maxColumns,bandwidth,bandwidthRatio,jvertLimit,jhorizLimit,jbaseToNumber,jPOINTSoff_INS_ARRAY_C);
+    }
+
+    // Release Java arrays; 0 copies the array back to Java, JNI_ABORT does not copy the current array values to Java
+    if(jresult!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, result, jresult, 0);}
+    if(jiterationsLimited!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, iterationsLimited, jiterationsLimited, 0);}
+    if(jread!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, read, jread, JNI_ABORT);}
+    if(jref!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, ref, jref, JNI_ABORT);}
+    if(jpacked!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, packed, jpacked, 0);}
+    if(jPOINTSoff_SUB_ARRAY!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, POINTSoff_SUB_ARRAY, jPOINTSoff_SUB_ARRAY, JNI_ABORT);}
+    if(jPOINTSoff_INS_ARRAY!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, POINTSoff_INS_ARRAY, jPOINTSoff_INS_ARRAY, JNI_ABORT);}
+    if(jvertLimit!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, vertLimit, jvertLimit, 0);}
+    if(jhorizLimit!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, horizLimit, jhorizLimit, 0);}
+    if(jbaseToNumber!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, baseToNumber, jbaseToNumber, JNI_ABORT);}
+    if(jPOINTSoff_INS_ARRAY_C!=NULL){(*env)->ReleasePrimitiveArrayCritical(env, POINTSoff_INS_ARRAY_C, jPOINTSoff_INS_ARRAY_C, JNI_ABORT);}
+
+    return;
+}
+
diff --git a/jni/README.txt b/jni/README.txt
new file mode 100755
index 0000000..5b17d22
--- /dev/null
+++ b/jni/README.txt
@@ -0,0 +1,15 @@
+The accelerated versions of BBMap, Dedupe and BBMerge rely on the addition of a small amount of C code that you will need to compile on your specific machine to take full advantage of its specific architecture. Most C compilers should suffice. On Linux and OS X, we use gcc (gcc.gnu.org). The compiling process will create a library that Java can then load and use during execution. Simple makefiles for OS X and Linux have been provided. To compile the accelerated versions of BBTools on OS X or [...]
+
+Linux:
+make -f makefile.linux
+
+OS X:
+make -f makefile.osx
+
+Windows:
+If you are familiar with cmake and have it installed on your system, there is a CMakeLists.txt file that should get you most of the way there, but a full Windows build has not been tested at the moment.
+
+After the "make" command, a "libbbtoolsjni.xx" file should appear in the "jni" directory. Once you have the libraries built, run BBMap, Dedupe or BBMerge with the addition of the "usejni" flag. If Java complains that it can't find the library you just compiled, the -Djava.library.path=<dir> flag in the bash scripts is what tells Java where to look for the native library, and it should already be pointing to the "jni" directory. Java also looks for specific library suffixes on different opera [...]
+
+Note:
+Before building this, you must have a JDK installed (Java 7) and the JAVA_HOME environment variable set (to something like /usr/common/usg/languages/java/jdk/oracle/1.7.0_51_x86_64).
\ No newline at end of file
diff --git a/jni/align2_BandedAlignerJNI.h b/jni/align2_BandedAlignerJNI.h
new file mode 100755
index 0000000..204cc31
--- /dev/null
+++ b/jni/align2_BandedAlignerJNI.h
@@ -0,0 +1,47 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class align2_BandedAlignerJNI */
+
+#ifndef _Included_align2_BandedAlignerJNI
+#define _Included_align2_BandedAlignerJNI
+#ifdef __cplusplus
+extern "C" {
+#endif
+#undef align2_BandedAlignerJNI_big
+#define align2_BandedAlignerJNI_big 99999999L
+/*
+ * Class: align2_BandedAlignerJNI
+ * Method: alignForwardJNI
+ * Signature: ([B[BIIIZI[B[I)I
+ */
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignForwardJNI
+ (JNIEnv *, jobject, jbyteArray, jbyteArray, jint, jint, jint, jboolean, jint, jbyteArray, jintArray);
+
+/*
+ * Class: align2_BandedAlignerJNI
+ * Method: alignForwardRCJNI
+ * Signature: ([B[BIIIZI[B[B[I)I
+ */
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignForwardRCJNI
+ (JNIEnv *, jobject, jbyteArray, jbyteArray, jint, jint, jint, jboolean, jint, jbyteArray, jbyteArray, jintArray);
+
+/*
+ * Class: align2_BandedAlignerJNI
+ * Method: alignReverseJNI
+ * Signature: ([B[BIIIZI[B[I)I
+ */
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignReverseJNI
+ (JNIEnv *, jobject, jbyteArray, jbyteArray, jint, jint, jint, jboolean, jint, jbyteArray, jintArray);
+
+/*
+ * Class: align2_BandedAlignerJNI
+ * Method: alignReverseRCJNI
+ * Signature: ([B[BIIIZI[B[B[I)I
+ */
+JNIEXPORT jint JNICALL Java_align2_BandedAlignerJNI_alignReverseRCJNI
+ (JNIEnv *, jobject, jbyteArray, jbyteArray, jint, jint, jint, jboolean, jint, jbyteArray, jbyteArray, jintArray);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/jni/align2_MultiStateAligner11tsJNI.h b/jni/align2_MultiStateAligner11tsJNI.h
new file mode 100755
index 0000000..dc66fd1
--- /dev/null
+++ b/jni/align2_MultiStateAligner11tsJNI.h
@@ -0,0 +1,179 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class align2_MultiStateAligner11tsJNI */
+
+#ifndef _Included_align2_MultiStateAligner11tsJNI
+#define _Included_align2_MultiStateAligner11tsJNI
+#ifdef __cplusplus
+extern "C" {
+#endif
+#undef align2_MultiStateAligner11tsJNI_GAPBUFFER
+#define align2_MultiStateAligner11tsJNI_GAPBUFFER 64L
+#undef align2_MultiStateAligner11tsJNI_GAPBUFFER2
+#define align2_MultiStateAligner11tsJNI_GAPBUFFER2 128L
+#undef align2_MultiStateAligner11tsJNI_GAPLEN
+#define align2_MultiStateAligner11tsJNI_GAPLEN 128L
+#undef align2_MultiStateAligner11tsJNI_MINGAP
+#define align2_MultiStateAligner11tsJNI_MINGAP 256L
+#undef align2_MultiStateAligner11tsJNI_GAPC
+#define align2_MultiStateAligner11tsJNI_GAPC 45L
+#undef align2_MultiStateAligner11tsJNI_GREFLIMIT2_CUSHION
+#define align2_MultiStateAligner11tsJNI_GREFLIMIT2_CUSHION 128L
+#undef align2_MultiStateAligner11tsJNI_MODE_MS
+#define align2_MultiStateAligner11tsJNI_MODE_MS 0L
+#undef align2_MultiStateAligner11tsJNI_MODE_DEL
+#define align2_MultiStateAligner11tsJNI_MODE_DEL 1L
+#undef align2_MultiStateAligner11tsJNI_MODE_INS
+#define align2_MultiStateAligner11tsJNI_MODE_INS 2L
+#undef align2_MultiStateAligner11tsJNI_MODE_SUB
+#define align2_MultiStateAligner11tsJNI_MODE_SUB 3L
+#undef align2_MultiStateAligner11tsJNI_MIN_SCORE_ADJUST
+#define align2_MultiStateAligner11tsJNI_MIN_SCORE_ADJUST 120L
+#undef align2_MultiStateAligner11tsJNI_TIMEBITS
+#define align2_MultiStateAligner11tsJNI_TIMEBITS 11L
+#undef align2_MultiStateAligner11tsJNI_SCOREBITS
+#define align2_MultiStateAligner11tsJNI_SCOREBITS 21L
+#undef align2_MultiStateAligner11tsJNI_MAX_TIME
+#define align2_MultiStateAligner11tsJNI_MAX_TIME 2047L
+#undef align2_MultiStateAligner11tsJNI_MAX_SCORE
+#define align2_MultiStateAligner11tsJNI_MAX_SCORE 1046575L
+#undef align2_MultiStateAligner11tsJNI_MIN_SCORE
+#define align2_MultiStateAligner11tsJNI_MIN_SCORE -1046575L
+#undef align2_MultiStateAligner11tsJNI_SCOREOFFSET
+#define align2_MultiStateAligner11tsJNI_SCOREOFFSET 11L
+#undef align2_MultiStateAligner11tsJNI_TIMEMASK
+#define align2_MultiStateAligner11tsJNI_TIMEMASK 2047L
+#undef align2_MultiStateAligner11tsJNI_SCOREMASK
+#define align2_MultiStateAligner11tsJNI_SCOREMASK -2048L
+#undef align2_MultiStateAligner11tsJNI_MODE_MS
+#define align2_MultiStateAligner11tsJNI_MODE_MS 0L
+#undef align2_MultiStateAligner11tsJNI_MODE_DEL
+#define align2_MultiStateAligner11tsJNI_MODE_DEL 1L
+#undef align2_MultiStateAligner11tsJNI_MODE_INS
+#define align2_MultiStateAligner11tsJNI_MODE_INS 2L
+#undef align2_MultiStateAligner11tsJNI_MODE_SUB
+#define align2_MultiStateAligner11tsJNI_MODE_SUB 3L
+#undef align2_MultiStateAligner11tsJNI_POINTS_NOREF
+#define align2_MultiStateAligner11tsJNI_POINTS_NOREF 0L
+#undef align2_MultiStateAligner11tsJNI_POINTS_NOCALL
+#define align2_MultiStateAligner11tsJNI_POINTS_NOCALL 0L
+#undef align2_MultiStateAligner11tsJNI_POINTS_MATCH
+#define align2_MultiStateAligner11tsJNI_POINTS_MATCH 70L
+#undef align2_MultiStateAligner11tsJNI_POINTS_MATCH2
+#define align2_MultiStateAligner11tsJNI_POINTS_MATCH2 100L
+#undef align2_MultiStateAligner11tsJNI_POINTS_COMPATIBLE
+#define align2_MultiStateAligner11tsJNI_POINTS_COMPATIBLE 50L
+#undef align2_MultiStateAligner11tsJNI_POINTS_SUB
+#define align2_MultiStateAligner11tsJNI_POINTS_SUB -127L
+#undef align2_MultiStateAligner11tsJNI_POINTS_SUBR
+#define align2_MultiStateAligner11tsJNI_POINTS_SUBR -147L
+#undef align2_MultiStateAligner11tsJNI_POINTS_SUB2
+#define align2_MultiStateAligner11tsJNI_POINTS_SUB2 -51L
+#undef align2_MultiStateAligner11tsJNI_POINTS_SUB3
+#define align2_MultiStateAligner11tsJNI_POINTS_SUB3 -25L
+#undef align2_MultiStateAligner11tsJNI_POINTS_MATCHSUB
+#define align2_MultiStateAligner11tsJNI_POINTS_MATCHSUB -10L
+#undef align2_MultiStateAligner11tsJNI_POINTS_INS
+#define align2_MultiStateAligner11tsJNI_POINTS_INS -395L
+#undef align2_MultiStateAligner11tsJNI_POINTS_INS2
+#define align2_MultiStateAligner11tsJNI_POINTS_INS2 -39L
+#undef align2_MultiStateAligner11tsJNI_POINTS_INS3
+#define align2_MultiStateAligner11tsJNI_POINTS_INS3 -23L
+#undef align2_MultiStateAligner11tsJNI_POINTS_INS4
+#define align2_MultiStateAligner11tsJNI_POINTS_INS4 -8L
+#undef align2_MultiStateAligner11tsJNI_POINTS_DEL
+#define align2_MultiStateAligner11tsJNI_POINTS_DEL -472L
+#undef align2_MultiStateAligner11tsJNI_POINTS_DEL2
+#define align2_MultiStateAligner11tsJNI_POINTS_DEL2 -33L
+#undef align2_MultiStateAligner11tsJNI_POINTS_DEL3
+#define align2_MultiStateAligner11tsJNI_POINTS_DEL3 -9L
+#undef align2_MultiStateAligner11tsJNI_POINTS_DEL4
+#define align2_MultiStateAligner11tsJNI_POINTS_DEL4 -1L
+#undef align2_MultiStateAligner11tsJNI_POINTS_DEL5
+#define align2_MultiStateAligner11tsJNI_POINTS_DEL5 -1L
+#undef align2_MultiStateAligner11tsJNI_POINTS_DEL_REF_N
+#define align2_MultiStateAligner11tsJNI_POINTS_DEL_REF_N -10L
+#undef align2_MultiStateAligner11tsJNI_TIMESLIP
+#define align2_MultiStateAligner11tsJNI_TIMESLIP 4L
+#undef align2_MultiStateAligner11tsJNI_MASK5
+#define align2_MultiStateAligner11tsJNI_MASK5 3L
+#undef align2_MultiStateAligner11tsJNI_BARRIER_I1
+#define align2_MultiStateAligner11tsJNI_BARRIER_I1 2L
+#undef align2_MultiStateAligner11tsJNI_BARRIER_D1
+#define align2_MultiStateAligner11tsJNI_BARRIER_D1 3L
+#undef align2_MultiStateAligner11tsJNI_LIMIT_FOR_COST_3
+#define align2_MultiStateAligner11tsJNI_LIMIT_FOR_COST_3 5L
+#undef align2_MultiStateAligner11tsJNI_LIMIT_FOR_COST_4
+#define align2_MultiStateAligner11tsJNI_LIMIT_FOR_COST_4 20L
+#undef align2_MultiStateAligner11tsJNI_LIMIT_FOR_COST_5
+#define align2_MultiStateAligner11tsJNI_LIMIT_FOR_COST_5 80L
+#undef align2_MultiStateAligner11tsJNI_BAD
+#define align2_MultiStateAligner11tsJNI_BAD -1046576L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_NOREF
+#define align2_MultiStateAligner11tsJNI_POINTSoff_NOREF 0L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_NOCALL
+#define align2_MultiStateAligner11tsJNI_POINTSoff_NOCALL 0L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_MATCH
+#define align2_MultiStateAligner11tsJNI_POINTSoff_MATCH 143360L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_MATCH2
+#define align2_MultiStateAligner11tsJNI_POINTSoff_MATCH2 204800L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_COMPATIBLE
+#define align2_MultiStateAligner11tsJNI_POINTSoff_COMPATIBLE 102400L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_SUB
+#define align2_MultiStateAligner11tsJNI_POINTSoff_SUB -260096L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_SUBR
+#define align2_MultiStateAligner11tsJNI_POINTSoff_SUBR -301056L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_SUB2
+#define align2_MultiStateAligner11tsJNI_POINTSoff_SUB2 -104448L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_SUB3
+#define align2_MultiStateAligner11tsJNI_POINTSoff_SUB3 -51200L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_MATCHSUB
+#define align2_MultiStateAligner11tsJNI_POINTSoff_MATCHSUB -20480L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_INS
+#define align2_MultiStateAligner11tsJNI_POINTSoff_INS -808960L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_INS2
+#define align2_MultiStateAligner11tsJNI_POINTSoff_INS2 -79872L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_INS3
+#define align2_MultiStateAligner11tsJNI_POINTSoff_INS3 -47104L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_INS4
+#define align2_MultiStateAligner11tsJNI_POINTSoff_INS4 -16384L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_DEL
+#define align2_MultiStateAligner11tsJNI_POINTSoff_DEL -966656L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_DEL2
+#define align2_MultiStateAligner11tsJNI_POINTSoff_DEL2 -67584L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_DEL3
+#define align2_MultiStateAligner11tsJNI_POINTSoff_DEL3 -18432L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_DEL4
+#define align2_MultiStateAligner11tsJNI_POINTSoff_DEL4 -2048L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_DEL5
+#define align2_MultiStateAligner11tsJNI_POINTSoff_DEL5 -2048L
+#undef align2_MultiStateAligner11tsJNI_POINTSoff_DEL_REF_N
+#define align2_MultiStateAligner11tsJNI_POINTSoff_DEL_REF_N -20480L
+#undef align2_MultiStateAligner11tsJNI_BADoff
+#define align2_MultiStateAligner11tsJNI_BADoff -2143387648L
+#undef align2_MultiStateAligner11tsJNI_MAXoff_SCORE
+#define align2_MultiStateAligner11tsJNI_MAXoff_SCORE 2143385600L
+#undef align2_MultiStateAligner11tsJNI_MINoff_SCORE
+#define align2_MultiStateAligner11tsJNI_MINoff_SCORE -2143385600L
+#undef align2_MultiStateAligner11tsJNI_AFFINE_ARRAYS
+#define align2_MultiStateAligner11tsJNI_AFFINE_ARRAYS 1L
+/*
+ * Class: align2_MultiStateAligner11tsJNI
+ * Method: fillUnlimitedJNI
+ * Signature: ([B[BII[I[J[I[I[III)V
+ */
+JNIEXPORT void JNICALL Java_align2_MultiStateAligner11tsJNI_fillUnlimitedJNI
+ (JNIEnv *, jobject, jbyteArray, jbyteArray, jint, jint, jintArray, jlongArray, jintArray, jintArray, jintArray, jint, jint);
+
+/*
+ * Class: align2_MultiStateAligner11tsJNI
+ * Method: fillLimitedXJNI
+ * Signature: ([B[BIII[I[J[I[I[IIIIF[I[I[B[I)V
+ */
+JNIEXPORT void JNICALL Java_align2_MultiStateAligner11tsJNI_fillLimitedXJNI
+ (JNIEnv *, jobject, jbyteArray, jbyteArray, jint, jint, jint, jintArray, jlongArray, jintArray, jintArray, jintArray, jint, jint, jint, jfloat, jintArray, jintArray, jbyteArray, jintArray);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/jni/jgi_BBMergeOverlapper.h b/jni/jgi_BBMergeOverlapper.h
new file mode 100755
index 0000000..b6c3775
--- /dev/null
+++ b/jni/jgi_BBMergeOverlapper.h
@@ -0,0 +1,45 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class jgi_BBMergeOverlapper */
+
+#ifndef _Included_jgi_BBMergeOverlapper
+#define _Included_jgi_BBMergeOverlapper
+#ifdef __cplusplus
+extern "C" {
+#endif
+#undef jgi_BBMergeOverlapper_BAD_MULT
+#define jgi_BBMergeOverlapper_BAD_MULT 6L
+#undef jgi_BBMergeOverlapper_GOOD_MULT_1
+#define jgi_BBMergeOverlapper_GOOD_MULT_1 8L
+#undef jgi_BBMergeOverlapper_GOOD_MULT_2
+#define jgi_BBMergeOverlapper_GOOD_MULT_2 400L
+#undef jgi_BBMergeOverlapper_verbose
+#define jgi_BBMergeOverlapper_verbose 0L
+/*
+ * Class: jgi_BBMergeOverlapper
+ * Method: mateByOverlapJNI
+ * Signature: ([B[B[B[B[F[F[IIIIIIII)I
+ */
+JNIEXPORT jint JNICALL Java_jgi_BBMergeOverlapper_mateByOverlapJNI
+ (JNIEnv *, jclass, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jfloatArray, jfloatArray, jintArray, jint, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class: jgi_BBMergeOverlapper
+ * Method: mateByOverlapRatioJNI_WithQualities
+ * Signature: ([B[B[B[B[F[F[IIIIIFFF)I
+ */
+JNIEXPORT jint JNICALL Java_jgi_BBMergeOverlapper_mateByOverlapRatioJNI_1WithQualities
+ (JNIEnv *, jclass, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jfloatArray, jfloatArray, jintArray, jint, jint, jint, jint, jfloat, jfloat, jfloat);
+
+/*
+ * Class: jgi_BBMergeOverlapper
+ * Method: mateByOverlapRatioJNI
+ * Signature: ([B[B[IIIIIFFFFF)I
+ */
+JNIEXPORT jint JNICALL Java_jgi_BBMergeOverlapper_mateByOverlapRatioJNI
+ (JNIEnv *, jclass, jbyteArray, jbyteArray, jintArray, jint, jint, jint, jint, jfloat, jfloat, jfloat, jfloat, jfloat);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/jni/makefile.linux b/jni/makefile.linux
new file mode 100755
index 0000000..b46ec80
--- /dev/null
+++ b/jni/makefile.linux
@@ -0,0 +1,16 @@
+CC=gcc
+INCS=-I"$(JAVA_HOME)/include" -I"$(JAVA_HOME)/include/linux"
+CFLAGS=-O3 -Wall -std=c99 -fPIC
+LDFLAGS=-O3 -Wall -std=c99 -fPIC -shared
+OBJ=BandedAlignerJNI.o MultiStateAligner11tsJNI.o BBMergeOverlapper.o
+
+%.o: %.c # compile each C source into an object file using CFLAGS and the JDK include paths from INCS (needs JAVA_HOME)
+ $(CC) $(CFLAGS) $(INCS) -c -o $@ $<
+
+libbbtoolsjni.so: $(OBJ) # link the objects listed in OBJ into the shared library; -shared comes from LDFLAGS
+ $(CC) -o $@ $^ $(LDFLAGS)
+
+.PHONY: clean # clean names a command, not a file to build
+
+clean: # remove object files, editor backups, a file named core, and the built library
+ rm -f *.o *~ core libbbtoolsjni.so
diff --git a/jni/makefile.osx b/jni/makefile.osx
new file mode 100755
index 0000000..94b78d1
--- /dev/null
+++ b/jni/makefile.osx
@@ -0,0 +1,16 @@
+CC=gcc
+INCS=-I"$(JAVA_HOME)/include" -I"$(JAVA_HOME)/include/darwin"
+CFLAGS=-O3 -Wall -std=c99 -fPIC
+LDFLAGS=-O3 -Wall -std=c99 -fPIC -dynamiclib
+OBJ=BandedAlignerJNI.o MultiStateAligner11tsJNI.o BBMergeOverlapper.o
+
+%.o: %.c # compile each C source into an object file using CFLAGS and the JDK include paths from INCS (needs JAVA_HOME)
+ $(CC) $(CFLAGS) $(INCS) -c -o $@ $<
+
+libbbtoolsjni.dylib: $(OBJ) # link the objects listed in OBJ into the dynamic library; -dynamiclib comes from LDFLAGS
+ $(CC) -o $@ $^ $(LDFLAGS)
+
+.PHONY: clean # clean names a command, not a file to build
+
+clean: # remove object files, editor backups, a file named core, and the built library
+ rm -f *.o *~ core libbbtoolsjni.dylib
diff --git a/kcompress.sh b/kcompress.sh
new file mode 100755
index 0000000..d61bf06
--- /dev/null
+++ b/kcompress.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#kcompress in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified September 28, 2015
+
+Description: Compresses sequence data into a fasta file containing each kmer
+exactly once. Allows arbitrary kmer set operations via multiple passes.
+
+Usage:
+Generation: kcompress.sh in=<reads> out=<contigs> min=<1> max=<2147483647>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Input parameters:
+in=<file> Primary input file for reads to use as kmer data.
+in2=<file> Second input file for paired data.
+reads=-1 Only process this number of reads, then quit (-1 means all).
+
+Output parameters:
+out=<file> Write contigs (in contig mode).
+showstats=t Print assembly statistics after writing contigs.
+fuse=0 Fuse output sequences into chunks at least this long.
+
+Prefiltering parameters:
+prefilter=0 If set to a positive integer, use a countmin sketch
+ to ignore kmers with depth of that value or lower.
+prehashes=2 Number of hashes for prefilter.
+prefiltersize=0.2 (pff) Fraction of memory to use for prefilter.
+minprobprefilter=t (mpp) Use minprob for the prefilter.
+prepasses=1 Use this many prefiltering passes; higher may be more thorough
+ if the filter is very full. Set to 'auto' to iteratively
+ prefilter until the remaining kmers will fit in memory.
+
+Hashing parameters:
+k=31 Kmer length (1 to 31).
+prealloc=t Pre-allocate memory rather than dynamically growing;
+ faster and more memory-efficient. A float fraction (0-1)
+ may be specified; default is 1.
+minprob=0.5 Ignore kmers with overall probability of correctness below this.
+minprobmain=t (mpm) Use minprob for the primary kmer counts.
+threads=X Spawn X threads (default is number of logical processors).
+
+Assembly parameters:
+mincount=1 (min) Only retain kmers that occur at least this many times.
+maxcount=2147483647 (max) Only retain kmers that occur at most this many times.
+requiresamecount (rsc) Only build contigs from kmers with exactly the same count.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx14g"; z2="-Xms14g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 15000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+kcompress() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} ${z2} -cp ${CP} assemble.KmerCompressor $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+kcompress "$@"
diff --git a/khist.sh b/khist.sh
new file mode 100755
index 0000000..15b28ec
--- /dev/null
+++ b/khist.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#khist in=<infile> out=<outfile>
+
+usage(){
+echo "
+Description: Generates a histogram of kmer counts for the input reads or assemblies.
+
+Usage: khist.sh in=<input> hist=<histogram output>
+
+Please see bbnorm.sh for more information.
+All the flags are the same, only the parameters (near the bottom of this file) differ.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx31g"; z2="-Xms31g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 31000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+khist() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} ${z2} -cp ${CP} jgi.KmerNormalize bits=32 ecc=f passes=1 keepall dr=f prefilter hist=stdout minprob=0 minqual=0 mindepth=0 minkmers=1 hashes=3 $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+khist "$@"
diff --git a/kmercountexact.sh b/kmercountexact.sh
new file mode 100755
index 0000000..68c16b2
--- /dev/null
+++ b/kmercountexact.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+#kmercountexact in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 18, 2015
+
+Description: Counts the number of unique kmers in a file.
+Generates a kmer frequency histogram and genome size estimate (in peaks output),
+and prints a file containing all kmers and their counts.
+Supports K=1 to infinity, though not all values are allowed.
+SEE ALSO: bbnorm.sh/khist.sh, which have similar functionality.
+
+Usage: kmercountexact.sh in=<file> khist=<file> peaks=<file> out=<file>
+
+Input may be fasta or fastq, compressed or uncompressed.
+Output may be stdout or a file. out, khist, and peaks are optional.
+
+
+Input parameters:
+in=<file> Primary input file.
+in2=<file> Second input file for paired reads.
+
+Output parameters:
+out=<file> Print kmers and their counts.
+fastadump=t Print kmers and counts as fasta versus 2-column tsv.
+mincount=1 Only print kmers with at least this depth.
+reads=-1 Only process this number of reads, then quit (-1 means all).
+dumpthreads=-1 Use this number of threads for dumping kmers (-1 means auto).
+
+Hashing parameters:
+k=31 Kmer length (1-31 is fastest).
+prealloc=t Pre-allocate memory rather than dynamically growing; faster and more memory-efficient. A float fraction (0-1) may be specified, default 1.
+prefilter=0 If set to a positive integer, use a countmin sketch to ignore kmers with depth of that value or lower.
+prehashes=2 Number of hashes for prefilter.
+prefiltersize=0.2 Fraction of memory to use for prefilter.
+minq=6 Ignore kmers containing bases with quality below this. (TODO)
+minprob=0.0 Ignore kmers with overall probability of correctness below this.
+threads=X Spawn X hashing threads (default is number of logical processors).
+onepass=f If true, prefilter will be generated in same pass as kmer counts. Much faster but counts will be lower, by up to prefilter's depth limit.
+rcomp=t Store and count each kmer together with its reverse-complement.
+
+Histogram parameters:
+khist=<file> Print kmer frequency histogram.
+histcolumns=2 2 columns: (depth, count). 3 columns: (depth, rawCount, count).
+histmax=100000 Maximum depth to print in histogram output.
+histheader=t Set true to print a header line.
+nzo=t (nonzeroonly) Only print lines for depths with a nonzero kmer count.
+smooth=f Smooth the kmer histogram.
+smoothradius=1 Initial radius of progressive smoothing function.
+maxradius=10 Maximum radius of progressive smoothing function.
+progressivemult=2 Increment radius each time depth increases by this factor.
+
+
+Peak calling parameters:
+peaks=<file> Write the peaks to this file. Default is stdout.
+ Also contains the genome size estimate in bp.
+minHeight=2 (h) Ignore peaks shorter than this.
+minVolume=2 (v) Ignore peaks with less area than this.
+minWidth=2 (w) Ignore peaks narrower than this.
+minPeak=2 (minp) Ignore peaks with an X-value below this.
+maxPeak=BIG (maxp) Ignore peaks with an X-value above this.
+maxPeakCount=12 (maxpc) Print up to this many peaks (prioritizing height).
+ploidy=-1 Specify ploidy; otherwise it will be autodetected.
+
+Quality parameters:
+qtrim=f Trim read ends to remove bases with quality below minq.
+ Values: t (trim both ends), f (neither end), r (right end only), l (left end only).
+trimq=4 Trim quality threshold.
+minavgquality=0 (maq) Reads with average quality (before trimming) below this will be discarded.
+
+Overlap parameters (for overlapping paired-end reads only):
+merge=f Attempt to merge reads before counting kmers.
+ecco=f Error correct via overlap, but do not merge reads.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx1g"; z2="-Xms1g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+kmercountexact() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} ${z2} -cp ${CP} jgi.KmerCountExact $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+kmercountexact "$@"
diff --git a/kmercoverage.sh b/kmercoverage.sh
new file mode 100755
index 0000000..9dd7063
--- /dev/null
+++ b/kmercoverage.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#kmercoverage in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified May 23, 2014
+
+*** DEPRECATED: This should still work but is no longer maintained. ***
+
+Description: Annotates reads with their kmer depth.
+
+Usage: kmercoverage.sh in=<input> out=<read output> hist=<histogram output>
+
+
+Optional parameters (and their defaults)
+
+Input parameters:
+in2=null Second input file for paired reads
+extra=null Additional files to use for input (generating hash table) but not for output
+fastareadlen=2^31 Break up FASTA reads longer than this. Can be useful when processing scaffolded genomes
+tablereads=-1 Use at most this many reads when building the hashtable (-1 means all)
+kmersample=1 Process every nth kmer, and skip the rest
+readsample=1 Process every nth read, and skip the rest
+
+Output parameters:
+hist=null Specify a file to output the depth histogram
+histlen=10000 Max depth displayed on histogram
+reads=-1 Only process this number of reads, then quit (-1 means all)
+sampleoutput=t Use sampling on output as well as input (not used if sample rates are 1)
+printcoverage=f Only print coverage information instead of reads
+useheader=f Append coverage info to the read's header
+minmedian=0 Don't output reads with median coverage below this
+minaverage=0 Don't output reads with average coverage below this
+zerobin=f Set to true if you want kmers with a count of 0 to go in the 0 bin instead of the 1 bin in histograms.
+ Default is false, to prevent confusion about how there can be 0-count kmers.
+ The reason is that based on the 'minq' and 'minprob' settings, some kmers may be excluded from the bloom filter.
+
+Hashing parameters:
+k=31 Kmer length (values under 32 are most efficient, but arbitrarily high values are supported)
+cbits=8 Bits per cell in bloom filter; must be 2, 4, 8, 16, or 32. Maximum kmer depth recorded is 2^cbits.
+ Large values decrease accuracy for a fixed amount of memory.
+hashes=4 Number of times a kmer is hashed. Higher is slower.
+ Higher is MORE accurate if there is enough memory, and LESS accurate if there is not enough memory.
+prefilter=f True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable.
+prehashes=2 Number of hashes for prefilter.
+passes=1 More passes can sometimes increase accuracy by iteratively removing low-depth kmers
+minq=7 Ignore kmers containing bases with quality below this
+minprob=0.5 Ignore kmers with overall probability of correctness below this
+threads=X Spawn exactly X hashing threads (default is number of logical processors). Total active threads may exceed X by up to 4.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx1g"; z2="-Xms1g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+kmercoverage() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} jgi.KmerCoverage prefilter=true bits=16 interleaved=false $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+kmercoverage "$@"
diff --git a/license.txt b/license.txt
new file mode 100755
index 0000000..46a0c96
--- /dev/null
+++ b/license.txt
@@ -0,0 +1,25 @@
+BBTools Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
+
+
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+
+
+(1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+
+
+(2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+
+
+(3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFI [...]
+
+
+
+You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the features, functionality or performance of the source code ("Enhancements") to anyone; however, if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley National Laboratory, without imposing a separate written license agreement for such Enhancements, then you hereby grant the following license: a non-exclusive, royalty-free perpetual license to install, [...]
diff --git a/loglog.sh b/loglog.sh
new file mode 100755
index 0000000..fc66dba
--- /dev/null
+++ b/loglog.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#loglog in=<infile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified October 9, 2015
+
+Description: Estimates cardinality of unique kmers in sequence data.
+
+Usage: loglog.sh in=<file> k=<31>
+
+
+Parameters and their defaults:
+
+in=<file> (in1) Input file, or comma-delimited list of files.
+in2=<file> Optional second file for paired reads.
+k=31 Use this kmer length for counting.
+buckets=1999 Use this many buckets for counting; higher decreases variance.
+bits=8 Hash this many bits per cycle.
+seed=-1 Use this seed for hash functions. A negative number forces a random seed.
+
+Shortcuts:
+The # symbol will be substituted for 1 and 2.
+For example:
+loglog.sh in=read#.fq
+...is equivalent to:
+loglog.sh in1=read1.fq in2=read2.fq
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Supported input formats are fastq, fasta, scarf, sam, and bam
+Supported compression formats are gzip, zip, and bz2
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags used when building the java command line.
+z="-Xmx200m"; EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+	# parseXmx is not defined in this script; calcmem.sh is expected to supply it.
+	source "${DIR}/calcmem.sh"; parseXmx "$@"
+}
+calcXmx "$@"
+
+function loglog() {
+	# NERSC genepool only: swap in the JDK, pigz and samtools modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module unload samtools
+		module load oracle-jdk/1.7_64bit
+		module load pigz; module load samtools
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} jgi.LogLog $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+loglog "$@"
diff --git a/makechimeras.sh b/makechimeras.sh
new file mode 100755
index 0000000..f9008e0
--- /dev/null
+++ b/makechimeras.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#makechimeras in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Makes chimeric sequences from nonchimeric sequences.
+Designed for PacBio reads.
+
+Usage: makechimeras.sh in=<input> out=<output> chimeras=<integer>
+
+
+Input Parameters:
+in=<file> The input file containing nonchimeric reads.
+unpigz=t Decompress with pigz for faster decompression.
+
+Output Parameters:
+out=<file> Fasta output destination.
+chimeras=-1 Number of chimeras to create (required parameter).
+forcelength=0 If a positive number X, one parent will be length X, and the other will be length-X.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx1g"; z2="-Xms1g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 800m 82
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+makechimeras() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} jgi.MakeChimeras $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+makechimeras "$@"
diff --git a/mapPacBio.sh b/mapPacBio.sh
new file mode 100755
index 0000000..12c4b98
--- /dev/null
+++ b/mapPacBio.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#mapPacBio in=<infile> out=<outfile> ref=<reference>
+
+usage(){ # Runs bbmap.sh with no arguments (presumably so that it prints its own help text).
+	bash "${DIR}bbmap.sh"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+NATIVELIBDIR="${DIR}jni/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx1g"; z2="-Xms1g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+mapPacBio() {
+	# NERSC genepool only: swap in the JDK, pigz and samtools modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module unload samtools
+		module load oracle-jdk/1.7_64bit
+		module load pigz; module load samtools
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java -Djava.library.path=${NATIVELIBDIR} ${EA} ${z} -cp ${CP} align2.BBMapPacBio build=1 overwrite=true minratio=0.40 fastareadlen=6000 ambiguous=best minscaf=100 startpad=10000 stoppad=10000 midpad=6000 $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+mapPacBio "$@"
diff --git a/matrixtocolumns.sh b/matrixtocolumns.sh
new file mode 100755
index 0000000..e8719ad
--- /dev/null
+++ b/matrixtocolumns.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#matrixtocolumns in1=<infile> in2=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified January 21, 2015
+
+Description: Transforms two matched identity matrices into 2-column format,
+ one row per entry, one column per matrix.
+
+Usage: matrixtocolumns.sh in1=<matrix1> in2=<matrix2> out=<file>
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx2g"; z2="-Xms2g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+matrixtocolumns() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} driver.CorrelateIdentity $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+matrixtocolumns "$@"
diff --git a/mergeOTUs.sh b/mergeOTUs.sh
new file mode 100755
index 0000000..ea78adb
--- /dev/null
+++ b/mergeOTUs.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+#mergeOTUs in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified January 21, 2015
+
+Description: Merges coverage stats lines (from pileup) for the same OTU,
+ according to some custom naming scheme.
+
+Usage: mergeOTUs.sh in=<file> out=<file>
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags used when building the java command line.
+z="-Xmx1g"; EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+	# parseXmx is not defined in this script; calcmem.sh is expected to supply it.
+	source "${DIR}/calcmem.sh"; parseXmx "$@"
+}
+calcXmx "$@"
+
+function mergeOTUs() {
+	# NERSC genepool only: load the JDK module.
+	[[ $NERSC_HOST == genepool ]] && module load oracle-jdk/1.7_64bit
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} driver.MergeCoverageOTU $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+mergeOTUs "$@"
diff --git a/mergebarcodes.sh b/mergebarcodes.sh
new file mode 100755
index 0000000..7027f46
--- /dev/null
+++ b/mergebarcodes.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#mergebarcodes in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Concatenates barcodes and quality onto read names.
+
+Usage: mergebarcodes.sh in=<file> out=<file> barcode=<file>
+
+Input may be stdin or a fasta or fastq file, raw or gzipped.
+If you pipe via stdin/stdout, please include the file type; e.g. for gzipped fasta input, set in=stdin.fa.gz
+
+
+Optional parameters (and their defaults)
+
+Input parameters:
+in=<file> Input reads. 'in=stdin.fq' will pipe from standard in.
+bar=<file> File containing barcodes.
+interleaved=auto (int) If true, forces fastq input to be paired and interleaved.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+
+Output parameters:
+out=<file> Write muxed sequences here. 'out=stdout.fa' will pipe to standard out.
+overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+
+Other parameters:
+pigz=t Use pigz to compress. If argument is a number, that will set the number of pigz threads.
+unpigz=t Use pigz to decompress.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx1g"; z2="-Xms1g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+mergebarcodes() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} jgi.MergeBarcodes $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+mergebarcodes "$@"
diff --git a/msa.sh b/msa.sh
new file mode 100755
index 0000000..b2a1805
--- /dev/null
+++ b/msa.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#msa in=<file> out=<file>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 5, 2014
+
+Description: Aligns a query sequence to reference sequences.
+Outputs the best matching position per reference sequence.
+If there are multiple queries, only the best-matching query will be used.
+MSA in this context stands for MultiStateAligner, not Multiple Sequence Alignment.
+
+Usage: msa.sh in=<file> out=<file> literal=<literal,literal,...>
+
+Parameters:
+
+in=<file> File containing reads.
+out=<file> Sam output file.
+literal= A sequence of bases to match, or a comma-delimited list.
+
+Java Parameters:
+
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will specify
+ 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx1g"; z2="-Xms1g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 2000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+msa() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} jgi.FindPrimers $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+msa "$@"
diff --git a/phylip2fasta.sh b/phylip2fasta.sh
new file mode 100755
index 0000000..1455ec6
--- /dev/null
+++ b/phylip2fasta.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+#convert in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified October 3, 2014
+
+Description: Transforms interleaved phylip to fasta.
+
+Usage: phylip2fasta.sh in=<input> out=<output>
+
+Input may be stdin or an interleaved phylip file, compressed or uncompressed.
+
+Input Parameters:
+in=<phylip file> The input file; this is the only required parameter.
+unpigz=<true> Decompress with pigz for faster decompression.
+
+Output Parameters:
+out=<file> Fasta output destination.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+# Locate the directory that really holds this script, following symlinks.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [[ -h "$DIR" ]]; do
+	cd "$(dirname "$DIR")"
+	DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="${DIR}current/"
+
+# Default JVM flags; calcXmx may overwrite z and z2.
+z="-Xmx1g"; z2="-Xms1g"
+EA="-ea"; set=0
+
+# No arguments, -h or --help: print the help text and stop.
+case "$1" in
+	""|-h|--help) usage; exit ;;
+esac
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 800m 82
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+convert() {
+	# NERSC genepool only: swap in the JDK and pigz modules.
+	if [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk; module load oracle-jdk/1.7_64bit; module load pigz
+	fi
+	# Build one command string, echo it to stderr, then eval it (the shell re-parses the arguments).
+	local CMD="java ${EA} ${z} -cp ${CP} jgi.PhylipToFasta $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+convert "$@"
diff --git a/pileup.sh b/pileup.sh
new file mode 100755
index 0000000..67e456f
--- /dev/null
+++ b/pileup.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+#pileup in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified August 25, 2015
+
+Description: Calculates per-scaffold coverage information from an unsorted sam or bam file.
+
+Usage: pileup.sh in=<input> out=<output>
+
+
+Input Parameters:
+in=<file> The input sam file; this is the only required parameter.
+ref=<file> Scans a reference fasta for per-scaffold GC counts, not otherwise needed.
+fastaorf=<file> An optional fasta file with ORF header information in PRODIGAL's output format. Must also specify 'outorf'.
+unpigz=t Decompress with pigz for faster decompression.
+
+Output Parameters:
+out=<file> (covstats) Per-scaffold coverage info.
+rpkm=<file> Per-scaffold RPKM/FPKM counts.
+twocolumn=f Change to true to print only ID and Avg_fold instead of all 6 columns.
+countgc=t Enable/disable counting of read GC content.
+outorf=<file> Per-orf coverage info to this file (only if 'fastaorf' is specified).
+outsam=<file> Print the input sam stream to this file (or stdout). Useful for piping data.
+hist=<file> Histogram of # occurrences of each depth level.
+basecov=<file> Coverage per base location.
+bincov=<file> Binned coverage per location (one line per X bases).
+binsize=1000 Binsize for binned coverage output.
+keepshortbins=t (ksb) Keep residual bins shorter than binsize.
+normcov=<file> Normalized coverage by normalized location (X lines per scaffold).
+normcovo=<file> Overall normalized coverage by normalized location (X lines for the entire assembly).
+normb=-1 If positive, use a fixed number of bins per scaffold; affects 'normcov' and 'normcovo'.
+normc=f Normalize coverage to fraction of max per scaffold; affects 'normcov' and 'normcovo'.
+delta=f Only print base coverage lines when the coverage differs from the previous base.
+nzo=f Only print scaffolds with nonzero coverage.
+concise=f Write 'basecov' in a more concise format.
+header=t (hdr) Include headers in output files.
+headerpound=t (#) Prepend header lines with '#' symbol.
+stdev=t Calculate coverage standard deviation.
+covminscaf=0 (minscaf) Don't print coverage for scaffolds shorter than this.
+covwindow=0 Calculate how many bases are in windows of this size with
+ low average coverage. Produces an extra stats column.
+covwindowavg=5 Average coverage below this will be classified as low.
+
+Processing Parameters:
+strandedcov=f Track coverage for plus and minus strand independently.
+startcov=f Only track start positions of reads.
+secondary=t Use secondary alignments, if present.
+softclip=f Include soft-clipped bases in coverage.
+minmapq=0 (minq) Ignore alignments with mapq below this.
+physical=f (physcov) Calculate physical coverage for paired reads. This includes the unsequenced bases.
+tlen=t Track physical coverage from the tlen field rather than recalculating it.
+arrays=auto Set to t/f to manually force the use of coverage arrays. Arrays and bitsets are mutually exclusive.
+bitsets=auto Set to t/f to manually force the use of coverage bitsets.
+32bit=f Set to true if you need per-base coverage over 64k; does not affect per-scaffold coverage precision.
+ This option will double RAM usage (when calculating per-base coverage).
+delcoverage=t (delcov) Count bases covered by deletions as covered.
+ True is faster than false.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Output format (tab-delimited):
+ID, Avg_fold, Length, Ref_GC, Covered_percent, Covered_bases, Plus_reads, Minus_reads, Median_fold, Read_GC
+
+ID: Scaffold ID
+Length: Scaffold length
+Ref_GC: GC ratio of reference
+Avg_fold: Average fold coverage of this scaffold
+Covered_percent: Percent of scaffold with any coverage (only if arrays or bitsets are used)
+Covered_bases: Number of bases with any coverage (only if arrays or bitsets are used)
+Plus_reads: Number of reads mapped to plus strand
+Minus_reads: Number of reads mapped to minus strand
+Median_fold: Median fold coverage of this scaffold (only if arrays are used)
+Read_GC: Average GC ratio of reads mapped to this scaffold
+
+Notes:
+
+Only supports SAM format for reads and FASTA for reference (though either may be gzipped).
+Sorting is not needed, so output may be streamed directly from a mapping program.
+Requires approximately 1 bit per reference base plus 100 bytes per scaffold (even if no reference is specified).
+This script will attempt to autodetect and correctly specify the -Xmx parameter to use all memory on the target node.
+If this fails with a message including 'Error: Could not create the Java Virtual Machine.', then...
+Please decrease the -Xmx parameter. It should be set to around 85% of the available memory.
+For example, -Xmx20g needs around 23 GB of virtual (and physical) memory when qsubbed.
+If the program fails with a message including 'java.lang.OutOfMemoryError:', then...
+-Xmx needs to be increased, which probably also means it needs to be qsubbed with a higher memory allocation.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+pileup() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP jgi.CoveragePileup $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+pileup "$@"
diff --git a/postfilter.sh b/postfilter.sh
new file mode 100755
index 0000000..9bfc451
--- /dev/null
+++ b/postfilter.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified July 27, 2015
+
+Description: Maps reads, then filters an assembly by contig coverage.
+Intended to reduce misassembly rate of SPAdes by removing suspicious contigs.
+
+Usage: postfilter.sh in=<reads> ref=<contigs> out=<filtered contigs>
+
+
+Standard Parameters:
+in=<file> File containing input reads.
+in2=<file> Optional file containing read mates.
+ref=<file> File containing input assembly.
+cov=covstats.txt File to write coverage stats generated by pileup.
+out=filtered.fa Destination of clean output assembly.
+outdirty=<file> (outd) Destination of removed contigs; optional.
+ow=f (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+int=f (interleaved) Determines whether input reads are considered interleaved.
+
+Filtering Parameters:
+minc=2 (mincov) Discard contigs with lower average coverage.
+minp=95 (minpercent) Discard contigs with a lower percent covered bases.
+minr=6 (minreads) Discard contigs with fewer mapped reads.
+minl=400 (minlength) Discard shorter contigs.
+trim=0 (trimends) Trim the first and last X bases of each sequence.
+
+Mapping Parameters (unlisted params will use BBMap defaults)
+minhits=2
+maxindel=0
+tipsearch=0
+bw=20
+rescue=f
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Other parameters will be passed directly to BBMap.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx800m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 800m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+function postfilter() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP assemble.Postfilter $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+postfilter "$@"
diff --git a/printtime.sh b/printtime.sh
new file mode 100755
index 0000000..afca7ca
--- /dev/null
+++ b/printtime.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#printtime <filename>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified January 21, 2015
+
+Description: Prints time elapsed since last called on the same file.
+
+Usage: printtime.sh <filename>
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+function printtime() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module load oracle-jdk/1.7_64bit
+ fi
+ local CMD="java $EA -Xmx8m -cp $CP align2.PrintTime $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+printtime "$@"
diff --git a/randomreads.sh b/randomreads.sh
new file mode 100755
index 0000000..03d6222
--- /dev/null
+++ b/randomreads.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified September 1, 2015
+
+Description: Generates random synthetic reads from a reference genome. Read names indicate their genomic origin.
+Allows precise customization of things like insert size and synthetic mutation type, sizes, and rates.
+Read names generated by this program are used by MakeRocCurve (samtoroc.sh) and GradeSamFile (gradesam.sh).
+They can also be used by BBMap (bbmap.sh) and BBMerge (bbmerge.sh) to automatically calculate
+true and false positive rates, if the flag 'parsecustom' is used.
+
+Usage: randomreads.sh ref=<file> out=<file> length=<number> reads=<number>
+
+
+Basic parameters:
+out=null Output file. If blank, filename(s) will be autogenerated.
+ref=null Reference file. Not needed if the reference is already indexed.
+build=1 If multiple references are indexed in the same directory,
+ each needs a unique build ID.
+midpad=300 Specifies space between scaffolds in packed index.
+reads=0 Generate this many reads (or pairs).
+minlength=100 Generate reads of at least this length.
+maxlength=100 Generate reads of at most this length.
+length=100 Generate reads of exactly this length.
+overwrite=t Set to false to disallow overwriting of existing files.
+replacenoref=f Set to true to replace Ns in the reference sequence
+ with random letters.
+simplenames=f Set to true to generate read names that clearly indicate
+ genomic origin, without BBMap internal coordinates.
+illuminanames=f Set to true to have matching names for paired reads,
+ rather than naming by location.
+spaceslash=f Set true to add a space before slash read pairnum.
+prefix=null Generated reads will start with this prefix,
+ rather than naming by location.
+seed=0 Use this to set the random number generator seed;
+ use -1 for a random seed.
+
+Pairing parameters:
+paired=f Set to true for paired reads.
+interleaved=f Set to true for interleaved output (rather than in two files).
+mininsert= Controls minimum insert length. Default depends on read length.
+maxinsert= Controls maximum insert length. Default depends on read length.
+triangle=t Make a triangular insert size distribution.
+flat=f Make a roughly flat insert size distribution.
+superflat=f Make a perfectly flat insert size distribution.
+gaussian=f Make a bell-shaped insert size distribution, with
+ standard deviation of (maxinsert-mininsert)/6.
+samestrand=f Generate paired reads on the same strand.
+
+Mutation parameters:
+snprate=0 Add snps to reads with this probability (0-1).
+insrate=0 Add insertions to reads with this probability (0-1).
+delrate=0 Add deletions to reads with this probability (0-1).
+subrate=0 Add contiguous substitutions to reads with this probability (0-1).
+nrate=0 Add nocalls to reads with this probability (0-1).
+
+Note: With a 'rate' of X, each read has an X chance of getting at least
+1 mutation, X^2 chance of 2+ mutations, X^3 chance of 3+ mutations,
+and so forth up to the maximum allowed number of mutations of that type.
+
+maxsnps=3 Add at most this many snps per read.
+maxinss=2 Add at most this many insertions per read.
+maxdels=2 Add at most this many deletions per read.
+maxsubs=2 Add at most this many contiguous substitutions per read.
+maxns=0 Add at most this many blocks of Ns per read.
+
+maxinslen=12 Max length of insertions.
+maxdellen=400 Max length of deletions.
+maxsublen=12 Max length of contiguous substitutions.
+maxnlen=1 Max length of N blocks.
+
+mininslen=1 Min length of insertions.
+mindellen=1 Min length of deletions.
+minsublen=2 Min length of contiguous substitutions.
+minnlen=1 Min length of N blocks.
+
+Illumina quality parameters:
+maxq=36 Upper bound of quality values.
+midq=32 Approximate average of quality values.
+minq=28 Lower bound of quality values.
+q= Sets maxq, midq, and minq to the same value.
+adderrors=t Add substitution errors based on quality values, after mutations.
+
+PacBio quality parameters:
+pacbio=f Use a PacBio error model, rather than Illumina
+ error model, and add PacBio errors after mutations.
+pbmin=0.13 Minimum rate of PacBio errors for a read.
+pbmax=0.17 Maximum rate of PacBio errors for a read.
+
+Other Parameters:
+overlap=1 Require reads to overlap scaffold end by at least this much.
+randomscaffold=f Choose random scaffolds without respect to length.
+amp=1 Simulate highly-amplified MDA single-cell data by
+ setting this to a higher number like 1000.
+replacenoref=f Replace intra- and inter-scaffold Ns with random bases.
+#colorspace=f Generate Solid colorspace reads.
+pbadapter= Add adapter sequence to some reads using this literal string.
+fragadapter= Add this sequence to paired reads with insert size
+ shorter than read length.
+fragadapter2= Use this sequence for read 2.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the
+ program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify
+ 200 megs. The max is typically 85% of physical memory.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 3200m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+randomreads() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP align2.RandomReads3 build=1 $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+randomreads "$@"
diff --git a/readlength.sh b/readlength.sh
new file mode 100755
index 0000000..f7ed72d
--- /dev/null
+++ b/readlength.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#readlength in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified April 4, 2015
+Description: Generates a length histogram of input reads.
+
+Usage: readlength.sh in=<input file>
+
+
+in=<file> The 'in=' flag is needed only if the input file is not the first parameter. 'in=stdin.fq' will pipe from standard in.
+in2=<file> Use this if 2nd read of pairs are in a different file.
+out=<file> Write the histogram to this file. Default is stdout.
+bin=10 Set the histogram bin size.
+max=80000 Set the max read length to track.
+round=f Places reads in the closest bin, rather than the highest bin of at least readlength.
+nzo=f (nonzeroonly) Do not print empty bins.
+reads=-1 If nonnegative, stop after this many reads.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx400m"
+z2="-Xms400m"	#Initial heap size (-Xms), matching the -Xmx default above; this script passes only $z to java
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+stats() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.MakeLengthHistogram $@"
+# echo $CMD >&2
+ eval $CMD
+}
+
+stats "$@"
diff --git a/reducesilva.sh b/reducesilva.sh
new file mode 100755
index 0000000..26d8147
--- /dev/null
+++ b/reducesilva.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#reducesilva in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified July 31, 2015
+
+Description: Reduces Silva entries down to one entry per taxon.
+
+Usage: reducesilva.sh in=<file> out=<file> column=<1>
+
+
+Parameters:
+column The taxonomic level. 0=species, 1=genus, etc.
+ow=f (overwrite) Overwrites files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+fastawrap=70 Length of lines in fasta output.
+
+
+Sampling parameters:
+reads=-1 Set to a positive number to only process this many INPUT sequences, then quit.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx1g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+function reducesilva() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP driver.ReduceSilva $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+reducesilva "$@"
diff --git a/reformat.sh b/reformat.sh
new file mode 100755
index 0000000..2cd224a
--- /dev/null
+++ b/reformat.sh
@@ -0,0 +1,206 @@
+#!/bin/bash
+#reformat in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified October 23, 2015
+
+Description: Reformats reads to change ASCII quality encoding, interleaving, file format, or compression format.
+Optionally performs additional functions such as quality trimming, subsetting, and subsampling.
+Supports sam, fastq, fasta, fasta+qual, scarf, gzip, zip.
+
+Usage: reformat.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2>
+
+in2 and out2 are for paired reads and are optional.
+If input is paired and there is only one output file, it will be written interleaved.
+
+
+Other parameters and their defaults:
+
+ow=f (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+int=f (interleaved) Determines whether INPUT file is considered interleaved.
+fastawrap=70 Length of lines in fasta output.
+fastareadlen=0 Set to a non-zero number to break fasta files into reads of at most this length.
+fastaminlen=1 Ignore fasta reads shorter than this.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+qfake=30 Quality value used for fasta to fastq reformatting.
+qfin=<.qual file> Read qualities from this qual file, for the reads coming from 'in=<fasta file>'
+qfin2=<.qual file> Read qualities from this qual file, for the reads coming from 'in2=<fasta file>'
+qfout=<.qual file> Write qualities to this qual file, for the reads going to 'out=<fasta file>'
+qfout2=<.qual file> Write qualities to this qual file, for the reads going to 'out2=<fasta file>'
+outsingle=<file> (outs) If a read is longer than minlength and its mate is shorter, the longer one goes here.
+
+Processing Parameters:
+
+verifypaired=f (vpair) When true, checks reads to see if the names look paired. Prints an error message if not.
+verifyinterleaved=f (vint) sets 'vpair' to true and 'interleaved' to true.
+allowidenticalnames=f (ain) When verifying pair names, allows identical names, instead of requiring /1 and /2 or 1: and 2:
+tossbrokenreads=f (tbr) Discard reads that have different numbers of bases and qualities. By default this will be detected and cause a crash.
+ignorebadquality=f (ibq) Fix out-of-range quality values instead of crashing with a warning.
+addslash=f Append ' /1' and ' /2' to read names, if not already present. Please include the flag 'int=t' if the reads are interleaved.
+spaceslash=t Put a space before the slash in addslash mode.
+underscore=f Change whitespace in read names to underscores.
+rcomp=f (rc) Reverse-complement reads.
+rcompmate=f (rcm) Reverse-complement read 2 only.
+changequality=t (cq) N bases always get a quality of 0 and ACGT bases get a min quality of 2.
+quantize=f Quantize qualities to a subset of values like NextSeq. Can also be used with comma-delimited list, like quantize=0,8,13,22,27,32,37
+tuc=f (touppercase) Change lowercase letters in reads to uppercase.
+uniquenames=f Make duplicate names unique by appending _<number>.
+remap= A set of pairs: remap=CTGN will transform C>T and G>N.
+ Use remap1 and remap2 to specify read 1 or 2.
+iupacToN=f (itn) Convert non-ACGTN symbols to N.
+monitor=f Kill this process if it crashes. monitor=600,0.01 would kill after 600 seconds under 1% usage.
+tossjunk=f Discard reads with invalid characters as bases.
+fixjunk=f Convert invalid bases to N.
+fixheaders=f Convert nonstandard header characters to standard ASCII.
+recalibrate=f (recal) Recalibrate quality scores. Must first generate matrices with CalcTrueQuality.
+maxcalledquality=41 Quality scores capped at this upper bound.
+mincalledquality=2 Quality scores of ACGT bases will be capped at lower bound.
+trimreaddescription=f (trd) Trim the names of reads after the first whitespace.
+
+Histogram output parameters:
+
+bhist=<file> Base composition histogram by position.
+qhist=<file> Quality histogram by position.
+qchist=<file> Count of bases with each quality value.
+aqhist=<file> Histogram of average read quality.
+bqhist=<file> Quality histogram designed for box plots.
+lhist=<file> Read length histogram.
+gchist=<file> Read GC content histogram.
+gcbins=100 Number gchist bins. Set to 'auto' to use read length.
+gcplot=f Add a graphical representation to the gchist.
+
+Histograms for sam files only (requires sam format 1.4 or higher):
+
+ehist=<file> Errors-per-read histogram.
+qahist=<file> Quality accuracy histogram of error rates versus quality score.
+indelhist=<file> Indel length histogram.
+mhist=<file> Histogram of match, sub, del, and ins rates by read location.
+idhist=<file> Histogram of read count versus percent identity.
+idbins=100 Number idhist bins. Set to 'auto' to use read length.
+
+Sampling parameters:
+
+reads=-1 Set to a positive number to only process this many INPUT reads (or pairs), then quit.
+skipreads=-1 Skip (discard) this many INPUT reads before processing the rest.
+samplerate=1 Randomly output only this fraction of reads; 1 means sampling is disabled.
+sampleseed=-1 Set to a positive number to use that prng seed for sampling (allowing deterministic sampling).
+samplereadstarget=0 (srt) Exact number of OUTPUT reads (or pairs) desired.
+samplebasestarget=0 (sbt) Exact number of OUTPUT bases desired.
+ Important: srt/sbt flags should not be used with stdin, samplerate, qtrim, minlength, or minavgquality.
+
+Trimming and filtering parameters:
+
+qtrim=f Trim read ends to remove bases with quality below trimq.
+ Values: t (trim both ends), f (neither end), r (right end only), l (left end only), w (sliding window).
+trimq=6 Regions with average quality BELOW this will be trimmed.
+minlength=0 (ml) Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter.
+mlf=0 (mlf) Reads shorter than this fraction of original length after trimming will be discarded.
+maxlength=0 If nonzero, reads longer than this after trimming will be discarded.
+breaklength=0 If nonzero, reads longer than this will be broken into multiple reads of this length. Does not work for paired reads.
+requirebothbad=t (rbb) Only discard pairs if both reads are shorter than minlen.
+minavgquality=0 (maq) Reads with average quality (after trimming) below this will be discarded.
+maqb=0 If positive, calculate maq from this many initial bases.
+chastityfilter=f (cf) Reads with names containing ' 1:Y:' or ' 2:Y:' will be discarded.
+barcodefilter=f Remove reads with unexpected barcodes if barcodes is set, or barcodes containing 'N' otherwise.
+ A barcode must be the last part of the read header.
+barcodes= Comma-delimited list of barcodes or files of barcodes.
+maxns=-1 If 0 or greater, reads with more Ns than this (after trimming) will be discarded.
+minconsecutivebases=0 (mcb) Discard reads without at least this many consecutive called bases.
+forcetrimleft=0 (ftl) If nonzero, trim left bases of the read to this position (exclusive, 0-based).
+forcetrimright=0 (ftr) If nonzero, trim right bases of the read after this position (exclusive, 0-based).
+forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
+forcetrimmod=5 (ftm) If positive, trim length to be equal to zero modulo this number.
+mingc=0 Discard reads with GC content below this.
+maxgc=1 Discard reads with GC content above this.
+
+Sam and bam processing options:
+
+mappedonly=f Toss unmapped reads.
+unmappedonly=f Toss mapped reads.
+primaryonly=f Toss secondary alignments. Set this to true for sam to fastq conversion.
+requiredbits=0 (rbits) Toss sam lines with any of these flag bits unset. Similar to samtools -f.
+filterbits=0 (fbits) Toss sam lines with any of these flag bits set. Similar to samtools -F.
+stoptag=f Set to true to write a tag indicating read stop location, prefixed by YS:i:
+sam= Set to 'sam=1.3' to convert '=' and 'X' cigar symbols (from sam 1.4+ format) to 'M'.
+
+Cardinality estimation:
+cardinality=f (loglog) Count unique kmers using the LogLog algorithm.
+loglogk=31 Use this kmer length for counting.
+loglogbuckets=1999 Use this many buckets for counting.
+
+Shortcuts:
+The # symbol will be substituted for 1 and 2. The % symbol in out will be substituted for input name minus extensions.
+For example:
+reformat.sh in=read#.fq out=%.fa
+...is equivalent to:
+reformat.sh in1=read1.fq in2=read2.fq out1=read1.fa out2=read2.fa
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Supported input formats are fastq, fasta, fasta+qual, scarf, and bread (BBMap's native format)
+Supported output formats are fastq, fasta, fasta+qual, bread, sam, and bam (bam only if samtools is installed)
+Supported compression formats are gz, zip, and bz2
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+#Old sam options - these probably only work with samv1.4+ input. TODO: test.
+#build=<integer> Assign a genome's build id. You can index like this: bbmap.sh ref=<file> build=1
+#sam=1.4 Set to 1.4 to write Sam version 1.4 cigar strings, with = and X, or 1.3 to use M.
+#md=f Set to true to write MD tags.
+#xs=f Set to 'ss', 'fs', or 'us' to write XS tags for RNAseq using secondstrand, firststrand,
+# or unstranded libraries. Needed by Cufflinks. JGI mainly uses 'firststrand'.
+#idtag=t Set to true to write a tag indicating percent identity, prefixed by YI:f:
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+function reformat() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP jgi.ReformatReads $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+reformat "$@"
diff --git a/removebadbarcodes.sh b/removebadbarcodes.sh
new file mode 100755
index 0000000..50404b6
--- /dev/null
+++ b/removebadbarcodes.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#removebadbarcodes in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell.
+Last modified March 16, 2015
+
+Description: Removes reads with improper barcodes.
+
+Usage: removebadbarcodes.sh in=<file> out=<file>
+
+
+Parameters:
+in=<file> Input reads; required parameter.
+out=<file> Destination for good reads; optional.
+ziplevel=2 (zl) Compression level for gzip output.
+pigz=f Spawn a pigz (parallel gzip) process for faster
+ compression than Java. Requires pigz to be installed.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx800m will specify 800 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+NATIVELIBDIR="$DIR""jni/"
+
+z="-Xmx200m"
+z2="-Xms200m"	#Initial heap size; must be -Xms (not a second -Xmx) because both $z and $z2 are passed to java below
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+
+removebadbarcodes() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z $z2 -cp $CP jgi.RemoveBadBarcodes $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+removebadbarcodes "$@"
diff --git a/removesmartbell.sh b/removesmartbell.sh
new file mode 100755
index 0000000..d8e9ca4
--- /dev/null
+++ b/removesmartbell.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#removesmartbell in=<infile> out=<outfile>
+
+usage(){ #Prints the help text for removesmartbell.sh to stdout.
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Remove Smart Bell adapters from PacBio reads.
+
+Usage: removesmartbell.sh in=<input> out=<output> split=t
+
+Input may be fasta or fastq, compressed or uncompressed (not H5 files).
+
+
+Parameters:
+in=file Specify the input file, or stdin.
+out=file Specify the output file, or stdout.
+split=f 'split=t' splits reads at adapters; split=f masks adapters.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx400m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+removesmartbell() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ fi
+ local CMD="java $EA $z -cp $CP pacbio.RemoveAdapters2 $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+removesmartbell "$@"
diff --git a/rename.sh b/rename.sh
new file mode 100755
index 0000000..f82e11d
--- /dev/null
+++ b/rename.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+#rename in=<infile> out=<outfile>
+
+function usage(){ #Prints the help text for rename.sh to stdout.
+echo "
+Written by Brian Bushnell
+Last modified June 24, 2015
+
+Description: Renames reads to <prefix>_<number> where you specify the prefix and the numbers are ordered.
+
+Usage: rename.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> prefix=<>
+
+in2 and out2 are for paired reads and are optional.
+If input is paired and there is only one output file, it will be written interleaved.
+
+
+Parameters:
+ow=f (overwrite) Overwrites files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+int=f (interleaved) Determines whether INPUT file is considered interleaved.
+fastawrap=70 Length of lines in fasta output.
+fastareadlen=0 Set to a non-zero number to break fasta files into reads of at most this length.
+minscaf=1 Ignore fasta reads shorter than this.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+qfin=<.qual file> Read qualities from this qual file, for the reads coming from 'in=<fasta file>'
+qfin2=<.qual file> Read qualities from this qual file, for the reads coming from 'in2=<fasta file>'
+qfout=<.qual file> Write qualities to this qual file, for the reads going to 'out=<fasta file>'
+qfout2=<.qual file> Write qualities to this qual file, for the reads going to 'out2=<fasta file>'
+ignorebadquality=f (ibq) Fix out-of-range quality values instead of crashing with a warning.
+
+Renaming modes (if not default):
+renamebyinsert=f Rename the read to indicate its correct insert size.
+renamebymapping=f Rename the read to indicate its correct mapping coordinates.
+renamebytrim=f Rename the read to indicate its correct post-trimming length.
+addprefix=f Rename the read by prepending the prefix to the existing name.
+prefixonly=f Only use the prefix; don't add _<number>
+
+Sampling parameters:
+reads=-1 Set to a positive number to only process this many INPUT reads (or pairs), then quit.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Supported input formats are fastq, fasta, fasta+qual, scarf, and bread (BBMap's native format)
+Supported output formats are fastq, fasta, fasta+qual, bread, sam, and bam (bam only if samtools is installed)
+Supported compression formats are gz, zip, and bz2
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx1g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+function rename() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.RenameReads $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+rename "$@"
diff --git a/repair.sh b/repair.sh
new file mode 100755
index 0000000..49f40c9
--- /dev/null
+++ b/repair.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#repair in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified June 2, 2015
+
+Description: Re-pairs reads that became disordered or had some mates eliminated.
+
+Usage: repair.sh in=<input file> out=<pair output> outs=<singleton output>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Parameters:
+in=<file> The 'in=' flag is needed if the input file is not the first
+ parameter. 'in=stdin' will pipe from standard in.
+in2=<file> Use this if 2nd read of pairs are in a different file.
+out=<file> The 'out=' flag is needed if the output file is not the second
+ parameter. 'out=stdout' will pipe to standard out.
+out2=<file> Use this to write 2nd read of pairs to a different file.
+outs=<file> (outsingle) Write singleton reads here.
+overwrite=t (ow) Set to false to force the program to abort rather than
+ overwrite an existing file.
+showspeed=t (ss) Set to 'f' to suppress display of processing speed.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression
+ level; lower compression is faster.
+fint=f (fixinterleaving) Fixes corrupted interleaved files using read
+ names. Only use on files with broken interleaving - correctly
+ interleaved files from which some reads were removed.
+repair=t (rp) Fixes arbitrarily corrupted paired reads by using read
+ names. Uses much more memory than 'fint' mode.
+ain=f (allowidenticalnames) When detecting pair names, allows
+ identical names, instead of requiring /1 and /2 or 1: and 2:
+monitor=f Kill this process if it crashes. monitor=600,0.01 would kill
+ after 600 seconds under 1% usage.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx4g"
+z2="-Xms4g"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 4000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+repair() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.SplitPairsAndSingles rp $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+repair "$@"
diff --git a/resources/adapters.fa b/resources/adapters.fa
new file mode 100755
index 0000000..d6702cf
--- /dev/null
+++ b/resources/adapters.fa
@@ -0,0 +1,304 @@
+>Reverse_adapter
+AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Universal_Adapter
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
+>pcr_dimer
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG
+>PCR_Primers
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
+>TruSeq_Adapter_Index_1_6
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_2
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_3
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_4
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_5
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_6
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_7
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_8
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_9
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_10
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_11
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_12
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_13
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_14
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_15
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_16
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_18_7
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_19
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_20
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_21
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_22
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_23
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_25
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG
+>TruSeq_Adapter_Index_27
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG
+>I5_Nextera_Transposase_1
+CTGTCTCTTATACACATCTGACGCTGCCGACGA
+>I7_Nextera_Transposase_1
+CTGTCTCTTATACACATCTCCGAGCCCACGAGAC
+>I5_Nextera_Transposase_2
+CTGTCTCTTATACACATCTCTGATGGCGCGAGGGAGGC
+>I7_Nextera_Transposase_2
+CTGTCTCTTATACACATCTCTGAGCGGGCTGGCAAGGC
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]501
+GACGCTGCCGACGAGCGATCTAGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]502
+GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]503
+GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]504
+GACGCTGCCGACGATCTACTCTGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]505
+GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]506
+GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]507
+GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]508
+GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]517
+GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N701
+CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N702
+CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N703
+CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N704
+CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N705
+CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N706
+CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N707
+CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N708
+CCGAGCCCACGAGACCAGAGAGGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N709
+CCGAGCCCACGAGACGCTACGCTATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N710
+CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N711
+CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N712
+CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG
+>I5_Primer_Nextera_XT_Index_Kit_v2_S502
+GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S503
+GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S505
+GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S506
+GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S507
+GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S508
+GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S510
+GACGCTGCCGACGAATTAGACGGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S511
+GACGCTGCCGACGACGGAGAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S513
+GACGCTGCCGACGACTAGTCGAGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S515
+GACGCTGCCGACGAAGCTAGAAGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S516
+GACGCTGCCGACGAACTCTAGGGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S517
+GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S518
+GACGCTGCCGACGACTTAATAGGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S520
+GACGCTGCCGACGAATAGCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S521
+GACGCTGCCGACGATAAGGCTCGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I5_Primer_Nextera_XT_Index_Kit_v2_S522
+GACGCTGCCGACGATCGCATAAGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I7_Primer_Nextera_XT_Index_Kit_v2_N701
+CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N702
+CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N703
+CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N704
+CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N705
+CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N706
+CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N707
+CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N710
+CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N711
+CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N712
+CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N714
+CCGAGCCCACGAGACGCTCATGAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N715
+CCGAGCCCACGAGACATCTCAGGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N716
+CCGAGCCCACGAGACACTCGCTAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N718
+CCGAGCCCACGAGACGGAGCTACATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N719
+CCGAGCCCACGAGACGCGTAGTAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N720
+CCGAGCCCACGAGACCGGAGCCTATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N721
+CCGAGCCCACGAGACTACGCTGCATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N722
+CCGAGCCCACGAGACATGCGCAGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N723
+CCGAGCCCACGAGACTAGCGCTCATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N724
+CCGAGCCCACGAGACACTGAGCGATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N726
+CCGAGCCCACGAGACCCTAAGACATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N727
+CCGAGCCCACGAGACCGATCAGTATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N728
+CCGAGCCCACGAGACTGCAGCTAATCTCGTATGCCGTCTTCTGCTTG
+>I7_Primer_Nextera_XT_Index_Kit_v2_N729
+CCGAGCCCACGAGACTCGACGTCATCTCGTATGCCGTCTTCTGCTTG
+>I5_Adapter_Nextera
+CTGATGGCGCGAGGGAGGCGTGTAGATCTCGGTGGTCGCCGTATCATT
+>I7_Adapter_Nextera_No_Barcode
+CTGAGCGGGCTGGCAAGGCAGACCGATCTCGTATGCCGTCTTCTGCTTG
+>Nextera_LMP_Read1_External_Adapter
+GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
+>Nextera_LMP_Read2_External_Adapter
+GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+>RNA_Adapter_(RA5)_part_#_15013205
+GATCGTCGGACTGTAGAACTCTGAAC
+>RNA_Adapter_(RA3)_part_#_15013207
+CCTTGGCACCCGAGAATTCCA
+>Stop_Oligo_(STP)_8
+CCACGGGAACGTGGTGGAATTC
+>RNA_RT_Primer_(RTP)_part_#_15013981
+TGGAATTCTCGGGTGCCAAGGC
+>RNA_PCR_Primer_(RP1)_part_#_15013198
+TCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT
+>RNA_PCR_Primer_Index_1_(RPI1)_2,9
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_2_(RPI2)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_3_(RPI3)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_4_(RPI4)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_5_(RPI5)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_6_(RPI6)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_7_(RPI7)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_8_(RPI8)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_9_(RPI9)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_10_(RPI10)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_11_(RPI11)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_12_(RPI12)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_13_(RPI13)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTCAAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_14_(RPI14)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTTCCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_15_(RPI15)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGTCAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_16_(RPI16)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_17_(RPI17)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTAGAGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_18_(RPI18)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_19_(RPI19)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGAAAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_20_(RPI20)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGGCCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_21_(RPI21)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTTTCGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_22_(RPI22)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGTACGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_23_(RPI23)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGAGTGGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_24_(RPI24)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGTAGCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_25_(RPI25)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_26_(RPI26)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGAGCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_27_(RPI27)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATTCCTATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_28_(RPI28)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAAAAGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_29_(RPI29)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAACTAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_30_(RPI30)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACCGGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_31_(RPI31)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACGATATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_32_(RPI32)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACTCAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_33_(RPI33)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGGCGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_34_(RPI34)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATGGCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_35_(RPI35)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATTTTATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_36_(RPI36)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCAACAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_37_(RPI37)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGGAATATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_38_(RPI38)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTAGCTATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_39_(RPI39)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTATACATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_40_(RPI40)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTCAGAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_41_(RPI41)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGACGACATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_42_(RPI42)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAATCGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_43_(RPI43)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTACAGCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_44_(RPI44)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTATAATATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_45_(RPI45)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCATTCATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_46_(RPI46)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCCCGAATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_47_(RPI47)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGAAGATCTCGTATGCCGTCTTCTGCTTG
+>RNA_PCR_Primer_Index_48_(RPI48)
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGGCAATCTCGTATGCCGTCTTCTGCTTG
+>PhiX_read1_adapter
+AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAA
+>PhiX_read2_adapter
+AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA
diff --git a/resources/contents.txt b/resources/contents.txt
new file mode 100755
index 0000000..0eb0f87
--- /dev/null
+++ b/resources/contents.txt
@@ -0,0 +1,33 @@
+Note: Unless otherwise mentioned, adapter sequences in this directory should be trimmed to the right. For BBDuk, that means using the flag "ktrim=r".
+
+adapters.fa
+All adapters in one file, provided for convenience.
+
+truseq.fa.gz
+Illumina Truseq DNA adapters.
+
+truseq_rna.fa.gz
+Illumina Truseq RNA adapters.
+
+nextera.fa.gz
+Illumina Nextera adapter sequences.
+
+nextera_LMP_adapter.fa.gz
+Illumina Nextera long-mate-pair adapters.
+
+nextera_LMP_linker.fa.gz
+Illumina Nextera sequence that joins the circularized DNA in LMP libraries. This should be handled specially, e.g. with SplitNexteraLMP.
+
+phix174_ill.ref.fa.gz
+Illumina PhiX spike-in reference genome, for filtering with BBDuk or mapping for quality metrics.
+
+phix_adapters.fa.gz
+A set of adapters found on a PhiX library; may be subject to change.
+
+sample1.fq.gz
+sample2.fq.gz
+Sample read pairs for testing whether the programs are working. These are from PhiX.
+
+primes.txt.gz
+A prime selection of numbers.
+
diff --git a/resources/nextera.fa.gz b/resources/nextera.fa.gz
new file mode 100755
index 0000000..df9939d
Binary files /dev/null and b/resources/nextera.fa.gz differ
diff --git a/resources/nextera_LMP_adapter.fa.gz b/resources/nextera_LMP_adapter.fa.gz
new file mode 100755
index 0000000..397247d
Binary files /dev/null and b/resources/nextera_LMP_adapter.fa.gz differ
diff --git a/resources/nextera_LMP_linker.fa.gz b/resources/nextera_LMP_linker.fa.gz
new file mode 100755
index 0000000..e46f61e
Binary files /dev/null and b/resources/nextera_LMP_linker.fa.gz differ
diff --git a/resources/phix174_ill.ref.fa.gz b/resources/phix174_ill.ref.fa.gz
new file mode 100755
index 0000000..b743e4f
Binary files /dev/null and b/resources/phix174_ill.ref.fa.gz differ
diff --git a/resources/phix_adapters.fa.gz b/resources/phix_adapters.fa.gz
new file mode 100755
index 0000000..8bbbbff
Binary files /dev/null and b/resources/phix_adapters.fa.gz differ
diff --git a/resources/primes.txt.gz b/resources/primes.txt.gz
new file mode 100755
index 0000000..d337921
Binary files /dev/null and b/resources/primes.txt.gz differ
diff --git a/resources/sample1.fq.gz b/resources/sample1.fq.gz
new file mode 100755
index 0000000..e4f0879
Binary files /dev/null and b/resources/sample1.fq.gz differ
diff --git a/resources/sample2.fq.gz b/resources/sample2.fq.gz
new file mode 100755
index 0000000..b5fecaf
Binary files /dev/null and b/resources/sample2.fq.gz differ
diff --git a/resources/truseq.fa.gz b/resources/truseq.fa.gz
new file mode 100755
index 0000000..1d9f434
Binary files /dev/null and b/resources/truseq.fa.gz differ
diff --git a/resources/truseq_rna.fa.gz b/resources/truseq_rna.fa.gz
new file mode 100755
index 0000000..042d54f
Binary files /dev/null and b/resources/truseq_rna.fa.gz differ
diff --git a/samtoroc.sh b/samtoroc.sh
new file mode 100755
index 0000000..7b86d7e
--- /dev/null
+++ b/samtoroc.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+#samtoroc in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified May 23, 2014
+
+Description: Creates a ROC curve from a sam file of synthetic reads with headers generated by RandomReads3.java
+
+Usage: samtoroc.sh in=<sam file> reads=<number of reads in input fastq>
+
+Parameters:
+in=<file> Specify the input sam file, or stdin.
+thresh=20 Max deviation from correct location to be considered 'loosely correct'.
+blasr=f Set to 't' for BLASR output; fixes extra information added to read names.
+ssaha2=f Set to 't' for SSAHA2 or SMALT output; fixes incorrect soft-clipped read locations.
+bitset=t Track read ID's to detect secondary alignments.
+ Necessary for mappers that incorrectly output multiple primary alignments per read.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+ cd "$(dirname "$DIR")"
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx200m"
+EA="-ea"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+ usage
+ exit
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+}
+calcXmx "$@"
+
+samtoroc() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module unload samtools
+ module load oracle-jdk/1.7_64bit
+ module load samtools
+ fi
+ local CMD="java $EA $z -cp $CP align2.MakeRocCurve $@"
+# echo $CMD >&2
+ eval $CMD
+}
+
+samtoroc "$@"
diff --git a/seal.sh b/seal.sh
new file mode 100755
index 0000000..cafdd72
--- /dev/null
+++ b/seal.sh
@@ -0,0 +1,225 @@
+#!/bin/bash
+#seal in=<file> out=<file> ref=<ref file>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 15, 2015
+
+Description: Performs high-speed alignment-free sequence quantification,
+by counting the number of long kmers that match between a read and
+a set of reference sequences. Designed for RNA-seq with alternative splicing.
+
+Usage: seal.sh in=<input file> ref=<file,file,file...> rpkm=<file>
+
+Input may be fasta or fastq, compressed or uncompressed.
+If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
+fasta input, set in=stdin.fa.gz
+
+
+Input parameters:
+in=<file> Main input. in=stdin.fq will pipe from stdin.
+in2=<file> Input for 2nd read of pairs in a different file.
+ref=<file,file> Comma-delimited list of reference files or directories.
+literal=<seq,seq> Comma-delimited list of literal reference sequences.
+touppercase=f (tuc) Change all bases upper-case.
+interleaved=auto (int) t/f overrides interleaved autodetection.
+qin=auto Input quality offset: 33 (Sanger), 64, or auto.
+reads=-1 If positive, quit after processing X reads or pairs.
+copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all
+ possible unambiguous copies. Intended for short motifs
+ or adapter barcodes, as time/memory use is exponential.
+
+Output parameters:
+out=<file> (outmatch) Write reads here that contain kmers matching
+ the reference. 'out=stdout.fq' will pipe to standard out.
+out2=<file> (outmatch2) Use this to write 2nd read of pairs to a
+ different file.
+outu=<file> (outunmatched) Write reads here that do not contain kmers
+ matching the database.
+outu2=<file> (outunmatched2) Use this to write 2nd read of pairs to a
+ different file.
+pattern=<file> Use this to write reads to one stream per ref sequence
+ match, replacing the % character with the sequence name.
+ For example, pattern=%.fq for ref sequences named dog and
+ cat would create dog.fq and cat.fq.
+stats=<file> Write statistics about which contaminants were detected.
+refstats=<file> Write statistics on a per-reference-file basis.
+rpkm=<file> Write RPKM for each reference sequence (for RNA-seq).
+dump=<file> Dump kmer tables to a file, in fasta format.
+nzo=t Only write statistics about ref sequences with nonzero hits.
+overwrite=t (ow) Grant permission to overwrite files.
+showspeed=t (ss) 'f' suppresses display of processing speed.
+ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
+fastawrap=80 Length of lines in fasta output.
+qout=auto Output quality offset: 33 (Sanger), 64, or auto.
+statscolumns=5 (cols) Number of columns for stats output, 3 or 5.
+ 5 includes base counts.
+rename=f Rename reads to indicate which sequences they matched.
+refnames=f Use names of reference files rather than scaffold IDs.
+ With multiple reference files, this is more efficient
+ than tracking statistics on a per-sequence basis.
+trd=f Truncate read and ref names at the first whitespace.
+ordered=f Set to true to output reads in same order as input.
+kpt=t (keepPairsTogether) Paired reads will always be assigned
+ to the same ref sequence.
+
+Processing parameters:
+k=31 Kmer length used for finding contaminants. Contaminants
+ shorter than k will not be found. k must be at least 1.
+rcomp=t Look for reverse-complements of kmers in addition to
+ forward kmers.
+maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to
+ increase sensitivity in the presence of errors.
+minkmerhits=1 (mkh) A read needs at least this many kmer hits to be
+ considered a match.
+minkmerfraction=0.0 (mkf) A read needs at least this fraction of its total
+ kmers to hit a ref, in order to be considered a match.
+hammingdistance=0 (hdist) Maximum Hamming distance for ref kmers (subs only).
+ Memory use is proportional to (3*K)^hdist.
+qhdist=0 Hamming distance for query kmers; impacts speed, not memory.
+editdistance=0 (edist) Maximum edit distance from ref kmers (subs and
+ indels). Memory use is proportional to (8*K)^edist.
+forbidn=f (fn) Forbids matching of read kmers containing N.
+ By default, these will match a reference 'A' if hdist>0
+ or edist>0, to increase sensitivity.
+match=all Determines when to quit looking for kmer matches. Values:
+ all: Attempt to match all kmers in each read.
+ first: Quit after the first matching kmer.
+ unique: Quit after the first uniquely matching kmer.
+ambiguous=random (ambig) Set behavior on ambiguously-mapped reads (with an
+ equal number of kmer matches to multiple sequences).
+ first: Use the first best-matching sequence.
+ toss: Consider unmapped.
+ random: Select one best-matching sequence randomly.
+ all: Use all best-matching sequences.
+clearzone=0 (cz) Threshold for ambiguity. If the best match shares X
+ kmers with the read, the read will be considered
+ also ambiguously mapped to any sequence sharing at least
+ [X minus clearzone] kmers.
+ecco=f For overlapping paired reads only. Performs error-
+ correction with BBMerge prior to kmer operations.
+
+Containment parameters:
+processcontainedref=f Require a reference sequence to be fully contained by
+ an input sequence
+storerefbases=f Store reference bases so that ref containments can be
+ validated. If this is set to false and processcontainedref
+ is true, then it will only require that the read share the
+ same number of bases as are present in the ref sequence.
+
+Taxonomy parameters (only use when doing taxonomy):
+tax=<file> Output destination for taxonomy information.
+taxtree=<file> (tree) A serialized TaxTree (tree.taxtree.gz).
+gi=<file> A serialized GiTable (gitable.int1d.gz). Only needed if
+ reference sequence names start with 'gi|'.
+mincount=1 Only display taxa with at least this many hits.
+maxnodes=-1 If positive, display at most this many top hits.
+minlevel=subspecies Do not display nodes below this taxonomic level.
+maxlevel=life Do not display nodes above this taxonomic level.
+Valid levels are subspecies, species, genus, family, order, class,
+phylum, kingdom, domain, life
+
+Speed and Memory parameters:
+threads=auto (t) Set number of threads to use; default is number of
+ logical processors.
+prealloc=f Preallocate memory in table. Allows faster table loading
+ and more efficient memory usage, for a large reference.
+monitor=f Kill this process if CPU usage drops to zero for a long
+ time. monitor=600,0.01 would kill after 600 seconds
+ under 1% usage.
+rskip=1 Skip reference kmers to reduce memory usage.
+ 1 means use all, 2 means use every other kmer, etc.
+qskip=1 Skip query kmers to increase speed. 1 means use all.
+speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both
+ reads and reference. Increases speed and reduces memory.
+Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
+
+Trimming/Masking parameters:
+qtrim=f Trim read ends to remove bases with quality below trimq.
+ Performed AFTER looking for kmers. Values:
+ t (trim both ends),
+ f (neither end),
+ r (right end only),
+ l (left end only).
+trimq=6 Regions with average quality BELOW this will be trimmed.
+minlength=1 (ml) Reads shorter than this after trimming will be
+ discarded. Pairs will be discarded only if both are shorter.
+maxlength= Reads longer than this after trimming will be discarded.
+ Pairs will be discarded only if both are longer.
+minavgquality=0 (maq) Reads with average quality (after trimming) below
+ this will be discarded.
+maqb=0 If positive, calculate maq from this many initial bases.
+maxns=-1 If non-negative, reads with more Ns than this
+ (after trimming) will be discarded.
+forcetrimleft=0 (ftl) If positive, trim bases to the left of this position
+ (exclusive, 0-based).
+forcetrimright=0 (ftr) If positive, trim bases to the right of this position
+ (exclusive, 0-based).
+forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
+forcetrimmod=0 (ftm) If positive, right-trim length to be equal to zero,
+ modulo this number.
+restrictleft=0 If positive, only look for kmer matches in the
+ leftmost X bases.
+restrictright=0 If positive, only look for kmer matches in the
+ rightmost X bases.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding
+ the program's automatic memory detection. -Xmx20g will specify
+ 20 gigs of RAM, and -Xmx200m will specify 200 megs.
+ The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx1g"  # default JVM max-heap flag; calcXmx may overwrite it
+z2="-Xms1g"  # default JVM initial-heap flag; calcXmx may overwrite it
+EA="-ea"  # JVM flag enabling assertions
+set=0  # compared against 1 in calcXmx right after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 2000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+seal() {
+    # Start jgi.Seal in a JVM with both heap flags, handing over every script
+    # argument; the assembled command line is written to stderr first.
+    if [ "$NERSC_HOST" = genepool ]; then  # only on the genepool host: swap modules
+        module unload oracle-jdk; module unload samtools
+        module load oracle-jdk/1.7_64bit; module load pigz; module load samtools
+    fi
+    # eval re-parses the flattened string, so arguments are shell-expanded a second time.
+    local launch="java $EA $z $z2 -cp $CP jgi.Seal $@"
+    echo $launch >&2
+    eval $launch
+}
+
+seal "$@"
diff --git a/shred.sh b/shred.sh
new file mode 100755
index 0000000..ec849f9
--- /dev/null
+++ b/shred.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#shred in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified August 21, 2015
+Description: Shreds sequences into shorter, potentially overlapping sequences.
+
+Usage: shred.sh in=<file> out=<file> length=<number> minlength=<number> overlap=<number>
+
+
+in=<file> Input sequences.
+out=<file> Destination of output shreds.
+length=500 Desired length of shreds.
+minlength=1 Shortest allowed shred. The last shred of each input sequence may be shorter than desired length.
+overlap=0 Amount of overlap between successive reads.
+reads=-1 If nonnegative, stop after this many input sequences.
+equal=f Shred each sequence into subsequences of equal size of at most 'length', instead of a fixed size.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx1400m"  # default JVM max-heap flag
+EA="-ea"  # JVM flag enabling assertions
+set=0  # NOTE(review): presumably switched to 1 by parseXmx (calcmem.sh) when the user passes -Xmx -- confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {  # Load the memory helpers, then let parseXmx inspect the script arguments.
+    . "${DIR}/calcmem.sh"
+    parseXmx "$@"
+}
+calcXmx "$@"
+
+stats() {  # NOTE(review): starts jgi.Shred despite its name; looks copy-pasted from stats.sh -- confirm before renaming.
+    if [ "$NERSC_HOST" = genepool ]; then  # only on the genepool host: swap modules
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit; module load pigz
+    fi
+    # All script arguments are flattened into one string that eval re-parses.
+    local launch="java $EA $z -cp $CP jgi.Shred $@"
+#    echo $launch >&2
+    eval $launch
+}
+
+stats "$@"
diff --git a/shuffle.sh b/shuffle.sh
new file mode 100755
index 0000000..36ed7c1
--- /dev/null
+++ b/shuffle.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#shuffle in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 19, 2015
+
+Description: Reorders reads randomly, keeping pairs together.
+
+Usage: shuffle.sh in=<file> out=<file>
+
+
+Standard parameters:
+in=<file> The 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in.
+in2=<file> Use this if 2nd read of pairs are in a different file.
+out=<file> The 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out.
+out2=<file> Use this to write 2nd read of pairs to a different file.
+overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
+interleaved=auto (int) Set to t or f to override interleaving autodetection.
+
+Processing parameters:
+shuffle Randomly reorders reads (default).
+name Sort reads by name.
+coordinate Sort reads by mapping location.
+sequence Sort reads by sequence.
+
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx2g"  # default JVM max-heap flag; calcXmx may overwrite it
+z2="-Xms2g"  # default JVM initial-heap flag; calcXmx may overwrite it
+EA="-ea"  # JVM flag enabling assertions
+set=0  # compared against 1 in calcXmx right after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 2000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+shuffle() {  # Run jgi.Shuffle in a JVM, forwarding all script arguments; the command is echoed to stderr first.
+    if [[ $NERSC_HOST == genepool ]]; then  # only on the genepool host: swap modules
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit; module load pigz
+    fi
+    # Pass $z2 (-Xms) along with $z (-Xmx): this script sets and recomputes both,
+    # but the initial-heap flag was never handed to java.
+    local CMD="java $EA $z $z2 -cp $CP jgi.Shuffle $@"
+    echo $CMD >&2
+    eval $CMD  # eval re-parses the flattened string (second round of shell expansion)
+}
+
+shuffle "$@"
diff --git a/sortbytaxa.sh b/sortbytaxa.sh
new file mode 100755
index 0000000..a0fbfd6
--- /dev/null
+++ b/sortbytaxa.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#sortbytaxa in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell.
+Last modified March 16, 2015
+
+Description: Sorts sequences into taxonomic order.
+
+Usage: sortbytaxa.sh in=<file> out=<file> gi=<file> tree=<file>
+
+
+Parameters:
+in=<file> Input sequences; required parameter.
+out=<file> Destination for sorted sequences.
+tree=<file> A TaxTree file.
+gi=<file> 2-column tsv with gi and taxid numbers, or gitable.int1d.
+ Only needed if sequences have not already been renamed by taxa.
+fuse=f Fuse sequences of the same taxa together to save memory.
+promote=-1 Promote to this taxonomic level before comparing or fusing.
+dummy=f Create dummy sequences for nodes with no sequences.
+dummyLevel=2 Minimum taxonomic level for creating dummy sequences.
+promote=-1 Promote to this taxonomic level before comparing or fusing.
+ziplevel=2 (zl) Compression level for gzip output.
+pigz=f Spawn a pigz (parallel gzip) process for faster
+ compression than Java. Requires pigz to be installed.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx800m will specify 800 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+NATIVELIBDIR="$DIR""jni/"  # NOTE(review): not referenced again in this script; possibly read by calcmem.sh -- confirm
+
+z="-Xmx2g"  # default JVM max-heap flag; calcXmx may overwrite it
+z2="-Xms2g"  # default JVM initial-heap flag; calcXmx may overwrite it
+EA="-ea"  # JVM flag enabling assertions
+set=0  # compared against 1 in calcXmx right after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 2000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+
+sortbytaxa() {  # Start tax.SortByTaxa in a JVM with both heap flags, handing over every script argument.
+    if [ "$NERSC_HOST" = genepool ]; then  # only on the genepool host: swap modules
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit; module load pigz
+    fi
+    # The command is built as one flat string, written to stderr, then eval'd.
+    local launch="java $EA $z $z2 -cp $CP tax.SortByTaxa $@"
+    echo $launch >&2
+    eval $launch
+}
+
+sortbytaxa "$@"
diff --git a/splitbytaxa.sh b/splitbytaxa.sh
new file mode 100755
index 0000000..9914b85
--- /dev/null
+++ b/splitbytaxa.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#splitbytaxa in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 15, 2015
+
+Description: Splits sequences according to their taxonomy,
+as determined by the sequence name. Sequences should
+be labeled with a gi number, NCBI taxID, or species name.
+
+Usage: splitbytaxa.sh in=<input file> out=<output pattern> tree=<tree file> table=<table file> level=<name or number>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Standard parameters:
+in=<file> Primary input.
+out=<file> Output pattern; must contain % symbol.
+overwrite=f (ow) Set to false to force the program to abort rather than
+ overwrite an existing file.
+showspeed=t (ss) Set to 'f' to suppress display of processing speed.
+ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression
+ level; lower compression is faster.
+
+Processing parameters:
+level=phylum Taxonomic level, such as phylum. Filtering will operate on
+ sequences within the same taxonomic level as specified ids.
+tree= A taxonomic tree made by TaxTree, such as tree.taxtree.gz.
+table= A table translating gi numbers to NCBI taxIDs.
+ Only needed if gi numbers will be used.
+* Note *
+Tree and table files are in /global/projectb/sandbox/gaag/bbtools/tax
+For non-Genepool users, or to make new ones, use taxtree.sh and gitable.sh
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx4g"  # default JVM max-heap flag; calcXmx may overwrite it
+z2="-Xms4g"  # default JVM initial-heap flag; calcXmx may overwrite it
+EA="-ea"  # JVM flag enabling assertions
+set=0  # compared against 1 in calcXmx right after parseXmx runs
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {
+ source "$DIR""/calcmem.sh"
+ parseXmx "$@"
+ if [[ $set == 1 ]]; then
+ return
+ fi
+ freeRam 1000m 84
+ z="-Xmx${RAM}m"
+ z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+splitbytaxa() {  # Run tax.SplitByTaxa in a JVM, forwarding all script arguments; the command is echoed to stderr first.
+    if [[ $NERSC_HOST == genepool ]]; then  # only on the genepool host: swap modules
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit; module load pigz
+    fi
+    # Pass $z2 (-Xms) along with $z (-Xmx): this script sets and recomputes both,
+    # but the initial-heap flag was never handed to java.
+    local CMD="java $EA $z $z2 -cp $CP tax.SplitByTaxa $@"
+    echo $CMD >&2
+    eval $CMD  # eval re-parses the flattened string (second round of shell expansion)
+}
+
+splitbytaxa "$@"
diff --git a/splitnextera.sh b/splitnextera.sh
new file mode 100755
index 0000000..2cdc145
--- /dev/null
+++ b/splitnextera.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+#splitnextera in=<infile> out=<outfile>
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified March 6, 2015
+
+Description: Splits Nextera LMP libraries into subsets based on linker orientation:
+LMP, fragment, unknown, and singleton.
+
+Usage: splitnextera.sh in=<file> out=<file> outf=<file> outu=<file> outs=<file>
+
+For pairs in two files, use in1, in2, out1, out2, etc.
+
+*** Note ***
+For maximal speed, before running splitnextera, the linkers should be replaced with a constant, like this:
+bbduk.sh in=reads.fq out=replaced.fq ktmask=J k=19 hdist=1 mink=11 hdist2=0 literal=CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG
+
+
+I/O parameters:
+in=<file> Input reads. Set to 'stdin.fq' to read from stdin.
+out=<file> Output for pairs with LMP orientation.
+outf=<file> Output for pairs with fragment orientation.
+outu=<file> Pairs with unknown orientation.
+outs=<file> Singleton output.
+ow=f (overwrite) Overwrites files that already exist.
+app=f (append) Append to files that already exist.
+zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
+int=f (interleaved) Determines whether INPUT file is considered interleaved.
+qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
+qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
+
+Processing Parameters:
+mask=f Set to true if you did not already convert junctions to some symbol, and it will be done automatically, but the program will run slower.
+junction=J Look for this symbol to designate the junction bases.
+innerlmp=f Generate long mate pairs from the inner pair also, when the junction is found in both reads.
+rename=t Rename read 2 of output when using single-ended input.
+minlength=40 (ml) Do not output reads shorter than this.
+merge=f Attempt to merge overlapping reads before looking for junctions.
+testmerge=0.0 If nonzero, only merge reads if at least this fraction of input reads are mergeable.
+
+Sampling parameters:
+
+reads=-1 Set to a positive number to only process this many INPUT reads (or pairs), then quit.
+samplerate=1 Randomly output only this fraction of reads; 1 means sampling is disabled.
+sampleseed=-1 Set to a positive number to use that prng seed for sampling (allowing deterministic sampling).
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Supported input formats are fastq, fasta, fasta+qual, scarf.
+Supported output formats are fastq, fasta, fasta+qual.
+Supported compression formats are gz, zip, and bz2
+To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx200m"  # default JVM max-heap flag
+EA="-ea"  # JVM flag enabling assertions
+set=0  # NOTE(review): presumably switched to 1 by parseXmx (calcmem.sh) when the user passes -Xmx -- confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {  # Load the memory helpers, then let parseXmx inspect the script arguments.
+    . "${DIR}/calcmem.sh"
+    parseXmx "$@"
+}
+calcXmx "$@"
+
+function splitnextera() {
+ if [[ $NERSC_HOST == genepool ]]; then
+ module unload oracle-jdk
+ module load oracle-jdk/1.7_64bit
+ module load pigz
+ fi
+ local CMD="java $EA $z -cp $CP jgi.SplitNexteraLMP $@"
+ echo $CMD >&2
+ eval $CMD
+}
+
+splitnextera "$@"
diff --git a/splitsam.sh b/splitsam.sh
new file mode 100755
index 0000000..5cc5300
--- /dev/null
+++ b/splitsam.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 9, 2015
+
+Description: Splits a sam file into three files:
+Plus-mapped reads, Minus-mapped reads, and Unmapped.
+If 'header' is the 5th argument, header lines will be included.
+
+Usage: splitsam <input> <plus output> <minus output> <unmapped output>
+
+Input may be stdin or a sam file, raw or gzipped.
+Outputs must be sam files, and may be gzipped.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+EA="-ea"  # JVM flag enabling assertions
+set=0  # not read anywhere else in this script
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+function splitsam() {  # Run jgi.SplitSamFile in a JVM with a fixed -Xmx128m heap, forwarding all script arguments.
+    if [[ $NERSC_HOST == genepool ]]; then  # only on the genepool host: load modules
+        module load oracle-jdk/1.7_64bit; module load pigz
+    fi
+    local CMD="java $EA -Xmx128m -cp $CP jgi.SplitSamFile $@"
+    # Log the command on stderr (it used to go to stdout) so it cannot mix with captured output.
+    echo $CMD >&2
+    eval $CMD  # eval re-parses the flattened string (second round of shell expansion)
+}
+
+splitsam "$@"
diff --git a/stats.sh b/stats.sh
new file mode 100755
index 0000000..a6870a9
--- /dev/null
+++ b/stats.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#stats in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 2, 2015
+
+Description: Generates basic assembly statistics such as scaffold count,
+ N50, L50, GC content, gap percent, etc. For multiple files,
+ please use statswrapper.sh. Works with fasta and fastq only
+ (gzipped is fine).
+
+Usage: stats.sh in=<file>
+
+
+Parameters:
+in=<file> Specify the input fasta file, or stdin.
+gc=<file> Writes ACGTN content per scaffold to a file.
+gchist=<file> Filename to output scaffold gc content histogram.
+shist=<file> Filename to output cumulative scaffold length histogram.
+gcbins=<200> Number of bins for gc histogram.
+n=<10> Number of contiguous Ns to signify a break between contigs.
+k=<13> Estimate memory usage of BBMap with this kmer length.
+minscaf=<0> Ignore scaffolds shorter than this.
+phs=<f> (printheaderstats) Set to true to print total size of headers.
+pdl=<f> (printduplicatelines) Set to true to print lines in the
+ scaffold size table where the counts did not change.
+n_=<t> This flag will prefix the terms 'contigs' and 'scaffolds'
+ with 'n_' in formats 3-6.
+addname=<f> Adds a column for input file name, for formats 3-6.
+
+format=<0-7> Format of the stats information; default 1.
+ format=0 prints no assembly stats.
+ format=1 uses variable units like MB and KB, and is designed for compatibility with existing tools.
+ format=2 uses only whole numbers of bases, with no commas in numbers, and is designed for machine parsing.
+ format=3 outputs stats in 2 rows of tab-delimited columns: a header row and a data row.
+ format=4 is like 3 but with scaffold data only.
+ format=5 is like 3 but with contig data only.
+ format=6 is like 3 but the header starts with a #.
+ format=7 is like 1 but only prints contig info.
+
+gcformat=<0-4> Select GC output format; default 1.
+ gcformat=0: (no base content info printed)
+ gcformat=1: name length A C G T N GC
+ gcformat=2: name GC
+ gcformat=4: name length GC
+ Note that in gcformat 1, A+C+G+T=1 even when N is nonzero.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx120m"  # default JVM max-heap flag
+EA="-ea"  # JVM flag enabling assertions
+set=0  # NOTE(review): presumably switched to 1 by parseXmx (calcmem.sh) when the user passes -Xmx -- confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {  # Load the memory helpers, then let parseXmx inspect the script arguments.
+    . "${DIR}/calcmem.sh"
+    parseXmx "$@"
+}
+calcXmx "$@"
+
+stats() {  # Start jgi.AssemblyStats2 in a JVM, handing over every script argument.
+    if [ "$NERSC_HOST" = genepool ]; then  # only on the genepool host: swap modules
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit; module load pigz
+    fi
+    # All script arguments are flattened into one string that eval re-parses.
+    local launch="java $EA $z -cp $CP jgi.AssemblyStats2 $@"
+#    echo $launch >&2
+    eval $launch
+}
+
+stats "$@"
diff --git a/statswrapper.sh b/statswrapper.sh
new file mode 100755
index 0000000..85f0a5e
--- /dev/null
+++ b/statswrapper.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#stats in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 13, 2014
+
+Description: Runs stats.sh on multiple assemblies to produce one output line per file.
+
+Usage: statswrapper.sh in=<input file>
+
+
+Parameters:
+in=<file> Specify the input fasta file, or stdin. For multiple files a, b, and c: 'statswrapper.sh in=a in=b in=c'.
+ 'in=' may be omitted if this is the first arg, and asterisks may be used; e.g. statswrapper.sh *.fa
+gc=<file> Writes ACGTN content per scaffold to a file.
+gchist=<file> Filename to output scaffold gc content histogram.
+gcbins=<200> Number of bins for gc histogram.
+n=<10> Number of contiguous Ns to signify a break between contigs.
+k=<13> Estimate memory usage of BBMap with this kmer length.
+minscaf=<0> Ignore scaffolds shorter than this.
+n_=<t> This flag will prefix the terms 'contigs' and 'scaffolds' with 'n_' in formats 3-6.
+addname=<t> Adds a column for input file name, for formats 3-6.
+
+format=<1 through 6> Format of the stats information.
+ format=1 uses variable units like MB and KB, and is designed for compatibility with existing tools.
+ format=2 uses only whole numbers of bases, with no commas in numbers, and is designed for machine parsing.
+ format=3 outputs stats in 2 rows of tab-delimited columns: a header row and a data row.
+ format=4 is like 3 but with scaffold data only.
+ format=5 is like 3 but with contig data only.
+ format=6 is like 3 but the header starts with a #.
+
+gcformat=<1 or 2> Select GC output format.
+ gcformat=1: name start stop A C G T N GC
+ gcformat=2: name GC
+ Note that in gcformat 1, A+C+G+T=1 even when N is nonzero.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx200m"  # default JVM max-heap flag
+EA="-ea"  # JVM flag enabling assertions
+set=0  # NOTE(review): presumably switched to 1 by parseXmx (calcmem.sh) when the user passes -Xmx -- confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+calcXmx () {  # Load the memory helpers, then let parseXmx inspect the script arguments.
+    . "${DIR}/calcmem.sh"
+    parseXmx "$@"
+}
+calcXmx "$@"
+
+stats() {  # Start jgi.AssemblyStatsWrapper in a JVM, handing over every script argument.
+    if [ "$NERSC_HOST" = genepool ]; then  # only on the genepool host: swap the JDK module
+        module unload oracle-jdk; module load oracle-jdk/1.7_64bit
+    fi
+    # The command is built as one flat string, written to stderr, then eval'd.
+    local launch="java $EA $z -cp $CP jgi.AssemblyStatsWrapper $@"
+    echo $launch >&2
+    eval $launch
+}
+
+stats "$@"
diff --git a/summarizescafstats.sh b/summarizescafstats.sh
new file mode 100755
index 0000000..0f77210
--- /dev/null
+++ b/summarizescafstats.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+#summarizescafstats in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified May 26, 2015
+
+Description: Summarizes the scafstats output of BBMap for evaluation
+of cross-contamination. The intended use is to map multiple libraries or
+assemblies, of different multiplexed organisms, to a concatenated reference
+containing one fused scaffold per organism. This will convert all of the
+resulting stats files (one per library) to a single text file, with multiple
+columns, indicating how much of the input hit the primary versus nonprimary
+scaffolds.
+
+Usage: summarizescafstats.sh in=<file,file...> out=<file>
+
+You can alternatively use a wildcard, like this:
+summarizescafstats.sh scafstats_*.txt out=summary.txt
+
+Parameters:
+in=<file> A list of stats files, or a text file containing one stats file name per line.
+out=<file> Destination for summary.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null  # remember the caller's working directory; output discarded
+DIR="${BASH_SOURCE[0]}"  # start from the path of this script file
+while [ -h "$DIR" ]; do  # as long as DIR is a symlink, follow it one hop
+ cd "$(dirname "$DIR")"  # enter the link's directory so a relative target resolves
+ DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"  # absolute directory of the resolved script, with trailing slash
+popd > /dev/null  # go back to the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"  # classpath passed to java via -cp by the launcher function
+
+z="-Xmx120m"  # JVM max-heap flag; this script never calls parseXmx, so it is used as-is
+EA="-ea"  # JVM flag enabling assertions
+set=0  # not read anywhere else in this script
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then  # no arguments, -h, or --help
+ usage
+ exit  # bare exit: status of the preceding usage call
+fi
+
+summarizescafstats() {  # Start driver.SummarizeCoverage in a JVM, handing over every script argument.
+    if [ "$NERSC_HOST" = genepool ]; then  # only on the genepool host: load modules
+        module load oracle-jdk/1.7_64bit; module load pigz
+    fi
+    # All script arguments are flattened into one string that eval re-parses.
+    local launch="java $EA $z -cp $CP driver.SummarizeCoverage $@"
+#    echo $launch >&2
+    eval $launch
+}
+
+summarizescafstats "$@"
diff --git a/summarizeseal.sh b/summarizeseal.sh
new file mode 100755
index 0000000..49ddb91
--- /dev/null
+++ b/summarizeseal.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#summarizeseal in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified Aug 3, 2015
+
+Description: Summarizes the stats output of Seal for evaluation of
+cross-contamination. The intended use is to map multiple libraries or
+assemblies, of different multiplexed organisms, to a concatenated reference
+containing one fused scaffold per organism. This will convert all of the
+resulting stats files (one per library) to a single text file, with multiple
+columns, indicating how much of the input hit the primary versus nonprimary
+scaffolds.
+
+If ignoresametaxa or ignoresamebarcode are used, ref names must be
+in this format:
+barcode,library,tax,location
+For example:
+6-G,N0296,gammaproteobacteria_bacterium,deep_ocean
+
+
+Usage: summarizeseal.sh in=<file,file...> out=<file>
+
+You can alternately run 'summarizeseal.sh *.txt out=out.txt'
+
+Parameters:
+in=<file> A list of stats files, or a text file containing one stats file name per line.
+out=<file> Destination for summary.
+ignoresametaxa=f Ignore secondary hits sharing taxonomy.
+ignoresamebarcode=f Ignore secondary hits sharing a barcode.
+ignoresamelocation=f Ignore secondary hits sharing a sampling site.
+totaldenominator=f (td) Use all bases as denominator rather than mapped bases.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+
+z="-Xmx120m" # JVM maximum-heap option passed to java by summarizeseal()
+EA="-ea" # JVM flag that enables assertions
+set=0 # not read anywhere else in this script
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+summarizeseal() { # Run driver.SummarizeSealStats, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: swap in the pinned JDK and load pigz
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit
+        module load pigz
+    fi
+    local -a CMD=(java $EA $z -cp "$CP" driver.SummarizeSealStats "$@") # argv array: arguments and paths containing spaces or shell metacharacters stay intact
+#    echo "${CMD[*]}" >&2
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+summarizeseal "$@"
diff --git a/synthmda.sh b/synthmda.sh
new file mode 100755
index 0000000..8e9746a
--- /dev/null
+++ b/synthmda.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#synthmda in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified October 22, 2015
+
+Description: Generates synthetic reads following an MDA-amplified single cell's coverage distribution.
+
+Usage: synthmda.sh in=<reference> out=<reads out file>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Optional parameters (and their defaults)
+
+reads=12000000 Generate this many reads.
+paired=t Generate paired reads.
+length=150 Reads should be this long.
+minlen=4000 Min length of MDA contig.
+maxlen=150000 Max length of MDA contig.
+cycles=9 Number of MDA cycles; higher is more spiky.
+initialratio=1.3 Fraction of genome initially replicated;
+ lower is more spiky.
+ratio=1.7 Fraction of genome replicated per cycle.
+refout=null Write MDA'd genome to this file.
+perfect=0 This fraction of reads will be error-free.
+amp=200 'amp' flag sent to RandomReads (higher is more spiky).
+build=7 Index MDA'd genome in this location.
+prefix=null Generated reads will start with this prefix.
+overwrite=t (ow) Set to false to force the program to abort rather
+ than overwrite an existing file.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+
+z="-Xmx4g" # default JVM maximum-heap option; calcXmx may overwrite it
+z2="-Xms4g" # default JVM initial-heap option; overwritten in calcXmx but not passed to java by synthmda()
+EA="-ea" # JVM flag that enables assertions
+set=0 # tested in calcXmx after parseXmx has run
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+calcXmx () {
+    # Size the JVM heap automatically unless parseXmx left set=1 (presumably an explicit -Xmx argument; confirm in calcmem.sh).
+    source "$DIR""/calcmem.sh"
+    parseXmx "$@"
+    if [[ $set != 1 ]]; then
+        freeRam 4000m 80 # presumably assigns RAM, which the next two lines read - confirm in calcmem.sh
+        z="-Xmx${RAM}m"
+        z2="-Xms${RAM}m"
+    fi
+}
+calcXmx "$@"
+
+synthmda() { # Run jgi.SynthMDA, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: load pigz
+        module load pigz
+    fi
+    local -a CMD=(java $EA $z -cp "$CP" jgi.SynthMDA "$@") # argv array: arguments and paths containing spaces or shell metacharacters stay intact
+    echo "${CMD[*]}" >&2 # show the command line on stderr before running it
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+synthmda "$@"
diff --git a/tadpole.sh b/tadpole.sh
new file mode 100755
index 0000000..2a8d151
--- /dev/null
+++ b/tadpole.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+#tadpole in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified November 18, 2015
+
+Description: Uses kmer counts to assemble contigs, extend sequences,
+or error-correct reads. Tadpole has no upper bound for kmer length,
+but some values are not supported. Specifically, it allows 1-31,
+multiples of 2 from 32-62, multiples of 3 from 63-93, etc.
+
+Usage:
+Generation: tadpole.sh in=<reads> out=<contigs>
+Extension: tadpole.sh in=<reads> ine=<sequences> oute=<extended> mode=extend
+Correction: tadpole.sh in=<reads> out=<corrected> mode=correct
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Input parameters:
+in=<file> Primary input file for reads to use as kmer data.
+in2=<file> Second input file for paired data.
+extend=<file> Primary input file for sequences to extend.
+extend2=<file> Second input file for paired reads.
+reads=-1 Only process this number of reads, then quit (-1 means all).
+
+Output parameters:
+out=<file> Write contigs (in contig mode).
+oute=<file> Write extended reads (in extend mode).
+ihist=<file> Write insert size histogram (in insert mode).
+dump=<file> Write kmers and their counts.
+fastadump=t Write kmers and counts as fasta versus 2-column tsv.
+mincounttodump=1 Only dump kmers with at least this depth.
+showstats=t Print assembly statistics after writing contigs.
+
+Prefiltering parameters:
+prefilter=0 If set to a positive integer, use a countmin sketch
+ to ignore kmers with depth of that value or lower.
+prehashes=2 Number of hashes for prefilter.
+prefiltersize=0.2 (pff) Fraction of memory to use for prefilter.
+minprobprefilter=t (mpp) Use minprob for the prefilter.
+prepasses=1 Use this many prefiltering passes; higher is more thorough
+ if the filter is very full. Set to 'auto' to iteratively
+ prefilter until the remaining kmers will fit in memory.
+onepass=f If true, prefilter will be generated in same pass as kmer
+ counts. Much faster but counts will be lower, by up to
+ prefilter's depth limit.
+
+Hashing parameters:
+k=31 Kmer length (1 to infinity). Memory use increases with K.
+prealloc=t Pre-allocate memory rather than dynamically growing;
+ faster and more memory-efficient. A float fraction (0-1)
+ may be specified; default is 1.
+minprob=0.5 Ignore kmers with overall probability of correctness below this.
+minprobmain=t (mpm) Use minprob for the primary kmer counts.
+threads=X Spawn X hashing threads (default is number of logical processors).
+rcomp=t Store and count each kmer together with its reverse-complement.
+
+Assembly parameters:
+mincountseed=3 (mcs) Minimum kmer count to seed a new contig or begin extension.
+mincountextend=2 (mce) Minimum kmer count to continue extension of a read or contig.
+mincountretain=0 (mincr) Discard kmers with count below this.
+maxcountretain=INF (maxcr) Discard kmers with count above this.
+branchmult1=20 (bm1) Min ratio of 1st to 2nd-greatest path depth at high depth.
+branchmult2=3 (bm2) Min ratio of 1st to 2nd-greatest path depth at low depth.
+branchlower=3 (blc) Max value of 2nd-greatest path depth to be considered low.
+minextension=1 (mine) Do not keep contigs that did not extend at least this much.
+mincontig=100 (minc) Do not write contigs shorter than this.
+mincoverage=1 (mincov) Do not write contigs with average coverage below this.
+trimends=0 (trim) Trim contig ends by this much. Trimming by K/2
+ may yield more accurate genome size estimation.
+contigpasses=16 Build contigs with decreasing seed depth for this many iterations.
+contigpassmult=1.7 Ratio between seed depth of two iterations.
+ownership=auto For concurrency; do not touch.
+
+Processing modes:
+mode=contig contig: Make contigs from kmers.
+ extend: Extend sequences to be longer, and optionally
+ perform error correction.
+ correct: Error correct only.
+ insert: Measure insert sizes.
+
+Extension parameters:
+extendleft=100 (el) Extend to the left by at most this many bases.
+extendright=100 (er) Extend to the right by at most this many bases.
+ibb=t (ignorebackbranches) Do not stop at backward branches.
+
+Error-correction parameters:
+ecc=f Error correct via kmer counts.
+pincer=t If ecc is enabled, use the pincer algorithm.
+tail=t If ecc is enabled, use the tail algorithm.
+aggressive=f (aecc) Correct with bidirectional double-pass.
+markbadbases=0 (mbb) Any base fully covered by kmers with count
+ below this will be changed to N.
+markdeltaonly=t (mdo) Only mark bad bases adjacent to good bases.
+errormult1=60 (em1) Min ratio between kmer depths to call an error.
+errormult2=3 (em2) Alternate ratio between kmer depths.
+errorlowerconst=3 (elc) Use mult2 when the lower kmer is at most this deep.
+mincountcorrect=4 (mcc) Don't correct to kmers with count under this.
+pathsimilarityfraction=0.3 (psf) Max difference ratio considered similar.
+ Controls whether a path appears to be continuous.
+pathsimilarityconstant=3 (psc) Absolute differences below this are ignored.
+errorextensionpincer=5 (eep) Verify this many additional bases after the
+ correction as matching current bases, for pincer.
+errorextensiontail=9 (eet) Verify this many additional bases after the
+ correction as matching current bases, for tail.
+
+Shaving parameters:
+shave=f Remove dead ends (aka hair).
+rinse=f Remove bubbles.
+maxshavedepth=1 (msd) Shave or rinse kmers at most this deep.
+maxshavedepth=1 (msd) Shave or rinse kmers at most this deep.
+exploredist=100 (sed) Quit after exploring this far.
+discardlength=150 (sdl) Discard shavings up to this long.
+
+
+Overlap parameters (for overlapping paired-end reads only):
+merge=f Attempt to merge reads before counting kmers.
+ecco=f Error correct via overlap, but do not merge reads.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+
+z="-Xmx14g" # default JVM maximum-heap option; calcXmx may overwrite it
+z2="-Xms14g" # default JVM initial-heap option; calcXmx may overwrite it
+EA="-ea" # JVM flag that enables assertions
+set=0 # tested in calcXmx after parseXmx has run
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+calcXmx () {
+    # Size the JVM heap automatically unless parseXmx left set=1 (presumably an explicit -Xmx argument; confirm in calcmem.sh).
+    source "$DIR""/calcmem.sh"
+    parseXmx "$@"
+    if [[ $set != 1 ]]; then
+        freeRam 15000m 84 # presumably assigns RAM, which the next two lines read - confirm in calcmem.sh
+        z="-Xmx${RAM}m"
+        z2="-Xms${RAM}m"
+    fi
+}
+calcXmx "$@"
+
+tadpole() { # Run assemble.Tadpole, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: swap in the pinned JDK and load pigz
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit
+        module load pigz
+    fi
+    local -a CMD=(java $EA $z $z2 -cp "$CP" assemble.Tadpole "$@") # argv array: arguments and paths containing spaces or shell metacharacters stay intact
+    echo "${CMD[*]}" >&2 # show the command line on stderr before running it
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+tadpole "$@"
diff --git a/tadwrapper.sh b/tadwrapper.sh
new file mode 100755
index 0000000..af9f50c
--- /dev/null
+++ b/tadwrapper.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#tadwrapper in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified October 15, 2015
+
+Description: Generates multiple assemblies with Tadpole
+to estimate the optimal kmer length.
+
+Usage:
+tadwrapper.sh in=reads.fq out=contigs%.fq k=31,62,93
+
+
+Parameters:
+out=<file> Output file name. Must contain a % symbol.
+k=31 Comma-delimited list of kmer lengths.
+
+All other parameters are passed to Tadpole.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+
+z="-Xmx14g" # default JVM maximum-heap option; calcXmx may overwrite it
+z2="-Xms14g" # default JVM initial-heap option; calcXmx may overwrite it
+EA="-ea" # JVM flag that enables assertions
+set=0 # tested in calcXmx after parseXmx has run
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+calcXmx () {
+    # Size the JVM heap automatically unless parseXmx left set=1 (presumably an explicit -Xmx argument; confirm in calcmem.sh).
+    source "$DIR""/calcmem.sh"
+    parseXmx "$@"
+    if [[ $set != 1 ]]; then
+        freeRam 15000m 84 # presumably assigns RAM, which the next two lines read - confirm in calcmem.sh
+        z="-Xmx${RAM}m"
+        z2="-Xms${RAM}m"
+    fi
+}
+calcXmx "$@"
+
+tadwrapper() { # Run assemble.TadpoleWrapper, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: swap in the pinned JDK and load pigz
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit
+        module load pigz
+    fi
+    local -a CMD=(java $EA $z $z2 -cp "$CP" assemble.TadpoleWrapper "$@") # argv array: arguments and paths containing spaces or shell metacharacters stay intact
+    echo "${CMD[*]}" >&2 # show the command line on stderr before running it
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+tadwrapper "$@"
diff --git a/taxonomy.sh b/taxonomy.sh
new file mode 100755
index 0000000..0639894
--- /dev/null
+++ b/taxonomy.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#taxonomy in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified December 15, 2015
+
+Description: Prints the full taxonomy of a string.
+String may be a gi number, NCBI taxID, or Latin name.
+An NCBI identifier should just be a number or ncbi|number.
+A gi number should be gi|number.
+
+Usage: taxonomy.sh tree=<tree file> <identifier>
+
+For example, taxonomy.sh tree=tree.taxtree.gz homo_sapiens
+
+Processing parameters:
+tree= A taxonomic tree made by TaxTree, such as tree.taxtree.gz.
+table= A table translating gi numbers to NCBI taxIDs.
+ Only needed if gi numbers will be used.
+* Note *
+Tree and table files are in /global/projectb/sandbox/gaag/bbtools/tax
+For non-Genepool users, or to make new ones, use taxtree.sh and gitable.sh
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+
+z="-Xmx4g" # default JVM maximum-heap option; calcXmx may overwrite it
+z2="-Xms4g" # default JVM initial-heap option; overwritten in calcXmx but not passed to java by taxonomy()
+EA="-ea" # JVM flag that enables assertions
+set=0 # tested in calcXmx after parseXmx has run
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+calcXmx () {
+    # Size the JVM heap automatically unless parseXmx left set=1 (presumably an explicit -Xmx argument; confirm in calcmem.sh).
+    source "$DIR""/calcmem.sh"
+    parseXmx "$@"
+    if [[ $set != 1 ]]; then
+        freeRam 2000m 84 # presumably assigns RAM, which the next two lines read - confirm in calcmem.sh
+        z="-Xmx${RAM}m"
+        z2="-Xms${RAM}m"
+    fi
+}
+calcXmx "$@"
+
+taxonomy() { # Run tax.PrintTaxonomy, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: swap in the pinned JDK and load pigz
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit
+        module load pigz
+    fi
+    local -a CMD=(java $EA $z -cp "$CP" tax.PrintTaxonomy "$@") # argv array: multi-word identifiers and paths with spaces stay intact
+    echo "${CMD[*]}" >&2 # show the command line on stderr before running it
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+taxonomy "$@"
diff --git a/taxtree.sh b/taxtree.sh
new file mode 100755
index 0000000..420233a
--- /dev/null
+++ b/taxtree.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell.
+Last modified December 15, 2015
+
+Description: Creates tree.taxtree from names.dmp and nodes.dmp.
+These are in taxdmp.zip available at ftp://ftp.ncbi.nih.gov/pub/taxonomy/
+The taxtree file is needed for programs that can deal with taxonomy,
+like Seal and SortByTaxa.
+
+Usage: taxtree.sh names.dmp nodes.dmp tree.taxtree.gz
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+NATIVELIBDIR="$DIR""jni/" # not referenced anywhere else in this script
+
+z="-Xmx2g" # default JVM maximum-heap option; calcXmx may overwrite it
+z2="-Xms2g" # default JVM initial-heap option; calcXmx may overwrite it
+EA="-ea" # JVM flag that enables assertions
+set=0 # tested in calcXmx after parseXmx has run
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+calcXmx () {
+    # Size the JVM heap automatically unless parseXmx left set=1 (presumably an explicit -Xmx argument; confirm in calcmem.sh).
+    source "$DIR""/calcmem.sh"
+    parseXmx "$@"
+    if [[ $set != 1 ]]; then
+        freeRam 2000m 84 # presumably assigns RAM, which the next two lines read - confirm in calcmem.sh
+        z="-Xmx${RAM}m"
+        z2="-Xms${RAM}m"
+    fi
+}
+calcXmx "$@"
+
+
+taxtree() { # Run tax.TaxTree, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: swap in the pinned JDK and load pigz
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit
+        module load pigz
+    fi
+    local -a CMD=(java $EA $z $z2 -cp "$CP" tax.TaxTree "$@") # argv array: arguments and paths containing spaces or shell metacharacters stay intact
+    echo "${CMD[*]}" >&2 # show the command line on stderr before running it
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+taxtree "$@"
diff --git a/testformat.sh b/testformat.sh
new file mode 100755
index 0000000..48b0da3
--- /dev/null
+++ b/testformat.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#testformat in=<infile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Tests file extensions and contents to determine format, quality, compression, interleaving, and read length.
+
+Usage: testformat.sh <file>
+
+More than one file may be specified.
+Note that ASCII-33 (sanger) and ASCII-64 (illumina) cannot always be differentiated.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+
+z="-Xmx120m" # default JVM maximum-heap option; presumably parseXmx (called from calcXmx) may replace it - confirm
+EA="-ea" # JVM flag that enables assertions
+set=0 # not read in this script; presumably used by parseXmx in calcmem.sh - confirm
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+calcXmx () { # Source calcmem.sh and hand the script arguments to parseXmx.
+ source "$DIR""/calcmem.sh" # DIR already ends in "/", so this path contains "//", which resolves the same
+ parseXmx "$@" # not defined in this script; presumably provided by calcmem.sh and may override z - confirm
+}
+calcXmx "$@"
+
+testformat() { # Run fileIO.FileFormat, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: swap in the pinned JDK
+        module unload oracle-jdk
+        module load oracle-jdk/1.7_64bit
+    fi
+    local -a CMD=(java $EA $z -cp "$CP" fileIO.FileFormat "$@") # argv array: file names containing spaces or shell metacharacters stay intact
+#    echo "${CMD[*]}" >&2
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+testformat "$@"
diff --git a/textfile.sh b/textfile.sh
new file mode 100755
index 0000000..e4c096e
--- /dev/null
+++ b/textfile.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+function usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Displays contents of a text file.
+
+Usage: textfile.sh <file> <start line> <stop line>
+
+Start line and stop line are zero-based.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+EA="-ea" # JVM flag that enables assertions
+set=0 # not read anywhere else in this script
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+function tf() { # Run fileIO.TextFile with a fixed 120 MB heap, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: load the pinned JDK
+        module load oracle-jdk/1.7_64bit
+    fi
+    local -a CMD=(java $EA -Xmx120m -cp "$CP" fileIO.TextFile "$@") # argv array: file names containing spaces or shell metacharacters stay intact
+    echo "${CMD[*]}" >&2 # show the command line on stderr before running it
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+tf "$@"
diff --git a/translate6frames.sh b/translate6frames.sh
new file mode 100755
index 0000000..e711988
--- /dev/null
+++ b/translate6frames.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#translate6frames in=<infile> out=<outfile>
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified February 17, 2015
+
+Description: Translates nucleotide sequences to all 6 amino acid frames,
+or amino acids to a canonical nucleotide representation.
+
+Usage: translate6frames.sh in=<input file> out=<output file>
+
+Input may be fasta or fastq, compressed or uncompressed.
+
+
+Optional parameters (and their defaults)
+
+Input parameters:
+in=<file> Main input. in=stdin.fa will pipe from stdin.
+in2=<file> Input for 2nd read of pairs in a different file.
+interleaved=auto (int) t/f overrides interleaved autodetection.
+qin=auto Input quality offset: 33 (Sanger), 64, or auto.
+aain=f False if input is nucleotides, true for amino acids.
+reads=-1 If positive, quit after processing X reads or pairs.
+
+Output parameters:
+out=<file> Write output here. 'out=stdout.fa' goes to standard out.
+out2=<file> Use this to write 2nd read of pairs to a different file.
+overwrite=t (ow) Grant permission to overwrite files.
+append=f Append to existing files.
+ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
+fastawrap=80 Length of lines in fasta output.
+qout=auto Output quality offset: 33 (Sanger), 64, or auto.
+aaout=t False to output nucleotides, true for amino acids.
+tag=t Tag read id with the frame, adding e.g. ' fr1'
+frames=6 Only print this many frames.
+ If you already know the sense, set 'frames=3'.
+
+Java Parameters:
+-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+ -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
+
+Please contact Brian Bushnell at bbushnell at lbl.gov if you encounter any problems.
+"
+
+}
+
+pushd . > /dev/null # remember the caller's working directory; the cd calls below change it
+DIR="${BASH_SOURCE[0]}" # start from the path this script was invoked through
+while [ -h "$DIR" ]; do # loop while DIR is a symbolic link
+ cd "$(dirname "$DIR")" # enter the link's directory so a relative link target resolves correctly
+ DIR="$(readlink "$(basename "$DIR")")" # replace DIR with the link's target
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/" # absolute directory containing the resolved script, with a trailing slash
+popd > /dev/null # restore the caller's working directory
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/" # classpath handed to java via -cp
+
+z="-Xmx2g" # default JVM maximum-heap option; calcXmx may overwrite it
+z2="-Xms2g" # default JVM initial-heap option; overwritten in calcXmx but not passed to java by translate6frames()
+EA="-ea" # JVM flag that enables assertions
+set=0 # tested in calcXmx after parseXmx has run
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then # no arguments, or help requested
+ usage
+ exit # bare exit: the status is that of the usage call above
+fi
+
+calcXmx () {
+    # Size the JVM heap automatically unless parseXmx left set=1 (presumably an explicit -Xmx argument; confirm in calcmem.sh).
+    source "$DIR""/calcmem.sh"
+    parseXmx "$@"
+    if [[ $set != 1 ]]; then
+        freeRam 2000m 42 # presumably assigns RAM, which the next two lines read - confirm in calcmem.sh
+        z="-Xmx${RAM}m"
+        z2="-Xms${RAM}m"
+    fi
+}
+calcXmx "$@"
+
+translate6frames() { # Run jgi.TranslateSixFrames, forwarding every script argument unchanged.
+    if [[ $NERSC_HOST == genepool ]]; then # only when NERSC_HOST is genepool: reload the JDK, pigz and samtools modules
+        module unload oracle-jdk
+        module unload samtools
+        module load oracle-jdk/1.7_64bit
+        module load pigz
+        module load samtools
+    fi
+    local -a CMD=(java $EA $z -cp "$CP" jgi.TranslateSixFrames "$@") # argv array: arguments and paths containing spaces or shell metacharacters stay intact
+    echo "${CMD[*]}" >&2 # show the command line on stderr before running it
+    "${CMD[@]}" # run directly; eval on a flattened string would re-split and re-interpret the arguments
+}
+
+translate6frames "$@"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bbmap.git
More information about the debian-med-commit
mailing list