[med-svn] [snap-aligner] 01/02: Imported Upstream version 0.18~1.0beta.18
Michael Crusoe
misterc-guest at moszumanska.debian.org
Sun Sep 20 04:08:20 UTC 2015
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository snap-aligner.
commit f3ecaa5ce33f6a5e74ad2898ab4dcacc021aaa0e
Author: Michael R. Crusoe <michael.crusoe at gmail.com>
Date: Sat Sep 19 20:16:13 2015 -0700
Imported Upstream version 0.18~1.0beta.18
.gitignore | 47 +
COPYING | 13 +
LICENSE | 202 +
Makefile | 79 +
README.md | 27 +
SNAPLib/AlignerContext.cpp | 510 +
SNAPLib/AlignerContext.h | 166 +
SNAPLib/AlignerOptions.cpp | 975 +
SNAPLib/AlignerOptions.h | 129 +
SNAPLib/AlignerStats.cpp | 111 +
SNAPLib/AlignerStats.h | 80 +
SNAPLib/AlignmentResult.cpp | 107 +
SNAPLib/AlignmentResult.h | 93 +
SNAPLib/ApproximateCounter.cpp | 40 +
SNAPLib/ApproximateCounter.h | 30 +
SNAPLib/Bam.cpp | 1841 +
SNAPLib/Bam.h | 439 +
SNAPLib/BaseAligner.cpp | 1583 +
SNAPLib/BaseAligner.h | 341 +
SNAPLib/BiasTables.cpp | 78991 +++++++++++++++++++
SNAPLib/BigAlloc.cpp | 572 +
SNAPLib/BigAlloc.h | 177 +
SNAPLib/BufferedAsync.cpp | 223 +
SNAPLib/BufferedAsync.h | 66 +
SNAPLib/ChimericPairedEndAligner.cpp | 208 +
SNAPLib/ChimericPairedEndAligner.h | 101 +
SNAPLib/CommandProcessor.cpp | 182 +
SNAPLib/CommandProcessor.h | 29 +
SNAPLib/Compat.cpp | 2217 +
SNAPLib/Compat.h | 516 +
SNAPLib/DataReader.cpp | 2535 +
SNAPLib/DataReader.h | 210 +
SNAPLib/DataWriter.cpp | 876 +
SNAPLib/DataWriter.h | 271 +
SNAPLib/Error.cpp | 89 +
SNAPLib/Error.h | 34 +
SNAPLib/FASTA.cpp | 198 +
SNAPLib/FASTA.h | 49 +
SNAPLib/FASTQ.cpp | 669 +
SNAPLib/FASTQ.h | 213 +
SNAPLib/FileFormat.h | 108 +
SNAPLib/FixedSizeMap.h | 296 +
SNAPLib/FixedSizeSet.h | 131 +
SNAPLib/FixedSizeVector.h | 72 +
SNAPLib/GenericFile.cpp | 109 +
SNAPLib/GenericFile.h | 75 +
SNAPLib/GenericFile_Blob.cpp | 125 +
SNAPLib/GenericFile_Blob.h | 66 +
SNAPLib/GenericFile_HDFS.cpp | 382 +
SNAPLib/GenericFile_HDFS.h | 106 +
SNAPLib/GenericFile_map.cpp | 71 +
SNAPLib/GenericFile_map.h | 43 +
SNAPLib/GenericFile_stdio.cpp | 94 +
SNAPLib/GenericFile_stdio.h | 44 +
SNAPLib/Genome.cpp | 492 +
SNAPLib/Genome.h | 309 +
SNAPLib/GenomeIndex.cpp | 2067 +
SNAPLib/GenomeIndex.h | 299 +
SNAPLib/GzipDataWriter.cpp | 439 +
SNAPLib/GzipDataWriter.h | 101 +
SNAPLib/HashTable.cpp | 343 +
SNAPLib/HashTable.h | 219 +
SNAPLib/Histogram.cpp | 103 +
SNAPLib/Histogram.h | 55 +
SNAPLib/IntersectingPairedEndAligner.cpp | 1423 +
SNAPLib/IntersectingPairedEndAligner.h | 476 +
SNAPLib/LandauVishkin.cpp | 766 +
SNAPLib/LandauVishkin.h | 512 +
SNAPLib/MultiInputReadSupplier.cpp | 298 +
SNAPLib/MultiInputReadSupplier.h | 111 +
SNAPLib/PairedAligner.cpp | 657 +
SNAPLib/PairedAligner.h | 91 +
SNAPLib/PairedEndAligner.h | 62 +
SNAPLib/PairedReadMatcher.cpp | 433 +
SNAPLib/ParallelTask.cpp | 144 +
SNAPLib/ParallelTask.h | 299 +
SNAPLib/PriorityQueue.h | 117 +
SNAPLib/ProbabilityDistance.cpp | 135 +
SNAPLib/ProbabilityDistance.h | 56 +
SNAPLib/RangeSplitter.cpp | 260 +
SNAPLib/RangeSplitter.h | 132 +
SNAPLib/Read.cpp | 53 +
SNAPLib/Read.h | 860 +
SNAPLib/ReadReader.cpp | 57 +
SNAPLib/ReadSupplierQueue.cpp | 729 +
SNAPLib/ReadSupplierQueue.h | 221 +
SNAPLib/ReadWriter.cpp | 548 +
SNAPLib/SAM.cpp | 1721 +
SNAPLib/SAM.h | 236 +
SNAPLib/SNAPLib.vcxproj | 276 +
SNAPLib/SNAPLib.vcxproj.filters | 342 +
SNAPLib/Seed.cpp | 56 +
SNAPLib/Seed.h | 200 +
SNAPLib/SeedSequencer.cpp | 109 +
SNAPLib/SeedSequencer.h | 388 +
SNAPLib/SingleAligner.cpp | 304 +
SNAPLib/SingleAligner.h | 63 +
SNAPLib/SortedDataWriter.cpp | 507 +
SNAPLib/Tables.cpp | 94 +
SNAPLib/Tables.h | 64 +
SNAPLib/Util.cpp | 177 +
SNAPLib/Util.h | 538 +
SNAPLib/VariableSizeMap.h | 665 +
SNAPLib/VariableSizeVector.h | 250 +
SNAPLib/WindowsFileMapper.h | 36 +
SNAPLib/directions.h | 36 +
SNAPLib/exit.cpp | 42 +
SNAPLib/exit.h | 30 +
SNAPLib/mapq.cpp | 45 +
SNAPLib/mapq.h | 68 +
SNAPLib/options.h | 32 +
SNAPLib/stdafx.cpp | 8 +
SNAPLib/stdafx.h | 53 +
SNAPLib/targetver.h | 8 +
apps/ComputeROC/.gitignore | 2 +
apps/ComputeROC/ComputeROC.cpp | 431 +
apps/ComputeROC/ComputeROC.vcxproj | 173 +
apps/ComputeROC/ComputeROC.vcxproj.filters | 33 +
apps/ComputeROC/stdafx.cpp | 8 +
apps/ComputeROC/stdafx.h | 5 +
apps/ComputeROC/targetver.h | 8 +
apps/DistanceHist/DistanceHist.cpp | 234 +
apps/DistanceHist/DistanceHist.vcxproj | 173 +
apps/DistanceHist/DistanceHist.vcxproj.filters | 33 +
apps/DistanceHist/stdafx.cpp | 8 +
apps/DistanceHist/stdafx.h | 50 +
apps/DistanceHist/targetver.h | 8 +
apps/ExtractReads/ExtractReads.cpp | 109 +
apps/ExtractReads/ExtractReads.vcxproj | 171 +
apps/ExtractReads/ExtractReads.vcxproj.filters | 33 +
apps/ExtractReads/stdafx.cpp | 8 +
apps/ExtractReads/stdafx.h | 5 +
apps/ExtractReads/targetver.h | 8 +
apps/RandomizePIfastq/GoodRandom.cpp | 159 +
apps/RandomizePIfastq/GoodRandom.h | 33 +
apps/RandomizePIfastq/RandomizePIfastq.cpp | 177 +
apps/RandomizePIfastq/RandomizePIfastq.vcxproj | 175 +
.../RandomizePIfastq.vcxproj.filters | 39 +
apps/RandomizePIfastq/stdafx.cpp | 8 +
apps/RandomizePIfastq/stdafx.h | 5 +
apps/RandomizePIfastq/targetver.h | 8 +
apps/SNAPCommand/SNAPCommand.cpp | 105 +
apps/SNAPCommand/SNAPCommand.vcxproj | 162 +
apps/SNAPCommand/SNAPCommand.vcxproj.filters | 30 +
apps/SNAPCommand/stdafx.cpp | 8 +
apps/SNAPCommand/stdafx.h | 50 +
apps/ToFASTQ/ToFASTQ.cpp | 223 +
apps/ToFASTQ/ToFASTQ.vcxproj | 162 +
apps/ToFASTQ/ToFASTQ.vcxproj.filters | 30 +
apps/ToFASTQ/stdafx.cpp | 8 +
apps/ToFASTQ/stdafx.h | 5 +
apps/snap/Main.cpp | 33 +
apps/snap/snap.vcxproj | 184 +
apps/snap/snap.vcxproj.filters | 30 +
apps/snap/stdafx.cpp | 8 +
apps/snap/stdafx.h | 50 +
apps/stringz/GoodRandom.cpp | 159 +
apps/stringz/GoodRandom.h | 33 +
apps/stringz/stdafx.cpp | 8 +
apps/stringz/stdafx.h | 5 +
apps/stringz/stringz.cpp | 245 +
apps/stringz/stringz.vcxproj | 177 +
apps/stringz/stringz.vcxproj.filters | 39 +
apps/stringz/targetver.h | 8 +
apps/wc/stdafx.cpp | 8 +
apps/wc/stdafx.h | 5 +
apps/wc/targetver.h | 8 +
apps/wc/wc.cpp | 259 +
apps/wc/wc.vcxproj | 173 +
apps/wc/wc.vcxproj.filters | 33 +
docs/Manual.docx | Bin 0 -> 37458 bytes
docs/Manual.pdf | Bin 0 -> 213231 bytes
docs/QuickStart.docx | Bin 0 -> 20352 bytes
docs/QuickStart.pdf | Bin 0 -> 57172 bytes
import/libhdfs.lib | Bin 0 -> 499840 bytes
import/pdclibhdfs/inc/exception.h | 172 +
import/pdclibhdfs/inc/expect.h | 120 +
import/pdclibhdfs/inc/hdfs.h | 768 +
import/pdclibhdfs/inc/hdfs_test.h | 39 +
import/pdclibhdfs/inc/jni_helper.h | 128 +
import/pdclibhdfs/inc/native_mini_dfs.h | 104 +
import/pdclibhdfs/inc/stdint.h-xx | 259 +
import/pdclibhdfs/inc/uthash.h | 948 +
import/pdclibhdfs/libhdfs.sln | 26 +
import/pdclibhdfs/libhdfs/ReadMe.txt | 37 +
import/pdclibhdfs/libhdfs/libhdfs.vcxproj | 165 +
import/pdclibhdfs/libhdfs/libhdfs.vcxproj.filters | 66 +
import/pdclibhdfs/libhdfs/stdafx.cpp | 8 +
import/pdclibhdfs/libhdfs/stdafx.h | 14 +
import/pdclibhdfs/libhdfs/targetver.h | 8 +
import/pdclibhdfs/src/.cmake.state | Bin 0 -> 746068 bytes
import/pdclibhdfs/src/LICENSE.txt | 271 +
import/pdclibhdfs/src/NOTICE.txt | 2 +
import/pdclibhdfs/src/TstOpsHdfs.tdprj.xml | 40 +
import/pdclibhdfs/src/TstReadHdfs.tdprj.xml | 40 +
import/pdclibhdfs/src/TstWriteHdfs.tdprj.xml | 40 +
import/pdclibhdfs/src/exception.c | 232 +
import/pdclibhdfs/src/hdfs.c | 3016 +
import/pdclibhdfs/src/jni_helper.c | 1181 +
import/pdclibhdfs/src/makefile.twb | 88 +
import/pdclibhdfs/src/makefile.twb.options | 5 +
import/pdclibhdfs/src/native_mini_dfs.c | 274 +
import/pdclibhdfs/src/pdclibhdfs.tdprj.xml | 196 +
import/pdclibhdfs/src/test_libhdfs_ops.c | 582 +
import/pdclibhdfs/src/test_libhdfs_read.c | 79 +
import/pdclibhdfs/src/test_libhdfs_threaded.c | 310 +
import/pdclibhdfs/src/test_libhdfs_write.c | 103 +
import/pdclibhdfs/src/test_native_mini_dfs.c | 41 +
import/zconf.h | 511 +
import/zlib.h | 1768 +
import/zlibstat.lib | Bin 0 -> 773824 bytes
scripts/testhdfs.bat | 28 +
snap.sln | 189 +
tests/.gitignore | 1 +
tests/EventTest.cpp | 137 +
tests/LandauVishkinTest.cpp | 129 +
tests/ProbabilityDistanceTest.cpp | 70 +
tests/TestLib.cpp | 43 +
tests/TestLib.h | 152 +
tests/bin/ValidateSamFile.jar | Bin 0 -> 976301 bytes
tests/bin/diff.exe | Bin 0 -> 68608 bytes
tests/bin/grep.exe | Bin 0 -> 81408 bytes
tests/bin/gzip.exe | Bin 0 -> 50688 bytes
tests/bin/msys-1.0.dll | Bin 0 -> 743240 bytes
tests/bin/samtools.exe | Bin 0 -> 460800 bytes
tests/bin/sort.exe | Bin 0 -> 22528 bytes
tests/datatest.py | 109 +
tests/datatest/.gitignore | 1 +
tests/datatest/ValidateSamFile.jar | Bin 0 -> 947804 bytes
tests/datatest/correct-fq-datatest.sam | 5 +
tests/datatest/correct-fq-datatest2.sam | 6 +
tests/datatest/correct-sam-datatest.sam | 6 +
tests/datatest/correct-sam-datatest2.sam | 7 +
tests/datatest/datatest.bam | Bin 0 -> 612 bytes
tests/datatest/datatest.fa | 3 +
tests/datatest/datatest.fq | 8 +
tests/datatest/datatest.sam | 6 +
tests/datatest/datatest2.fa | 6 +
tests/dup_reads.py | 42 +
tests/filetest.py | 175 +
tests/main.cpp | 7 +
tests/tests.vcxproj | 172 +
tests/tests.vcxproj.filters | 39 +
243 files changed, 133510 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7bcbe86
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,47 @@
+# Compiled Object files
+# Compiled Dynamic libraries
+# Compiled Static libraries
+# Visual Studio files
+# Compiled binaries
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..c234b11
--- /dev/null
@@ -0,0 +1,13 @@
+Copyright 2012, Regents of the University of California.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
@@ -0,0 +1,202 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+ 1. Definitions.
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
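+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the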
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+ END OF TERMS AND CONDITIONS
+ APPENDIX: How to apply the Apache License to your work.
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+ Copyright [yyyy] [name of copyright owner]
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0f3cda8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,79 @@
+ifndef CXXFLAGS
+ CXXFLAGS = -O3 -Wno-format
+LDFLAGS += -pthread
+#LIBHDFS_HOME = ../hadoop-2.2.0-src/hadoop-hdfs-project/hadoop-hdfs/src/main/native/libhdfs
+#JAVA_HOME = /usr/lib/jvm/java-7-oracle
+ LDFLAGS += -L$(LIBHDFS_HOME) -L$(JAVA_HOME)/jre/lib/amd64/server -L$(JAVA_HOME)/jre/lib/amd64
+ LIBS += -lhdfs -ljvm
+UNAME := $(shell uname)
+ifeq ($(UNAME), Linux)
+ LIBS += -lrt -lz
+ifeq ($(UNAME), Darwin)
+ LIBS += -lz
+CXX = g++
+LIB_SRC = $(wildcard SNAPLib/*.cpp)
+LIB_OBJ = $(patsubst %.cpp, %.o, $(LIB_SRC))
+SNAP_SRC = $(wildcard apps/snap/*.cpp)
+TEST_SRC = $(wildcard tests/*.cpp)
+ROC_SRC = $(wildcard apps/ComputeROC/*.cpp)
+SNAPCOMMAND_SRC = $(wildcard apps/SNAPCommand/*.cpp)
+SNAP_OBJ = $(patsubst %.cpp, %.o, $(SNAP_SRC))
+TEST_OBJ = $(patsubst %.cpp, %.o, $(TEST_SRC))
+ROC_OBJ = $(patsubst %.cpp, %.o, $(ROC_SRC))
+SNAPCOMMAND_OBJ = $(patsubst %.cpp, %.o, $(SNAPCOMMAND_SRC))
+DEPS = $(pathsubst %.o, %.d, $(ALL_OBJ))
+EXES = snap-aligner unit_tests SNAPCommand
+default: $(EXES)
+-include $(pathsubst %.o, %.d, $(ALL_OBJ))
+$(OBJS): %.o : %.cpp
+ $(CXX) -o $@ $(CXXFLAGS) -c $<
+snap-aligner: $(LIB_OBJ) $(SNAP_OBJ)
+ $(CXX) -o $@ $(CXXFLAGS) -Itests $(LDFLAGS) $^ $(LIBS)
+ echo SNAP_OBJ is $(SNAP_OBJ)
+ $(CXX) -o $@ $(CXXFLAGS) $(LDFLAGS) $^ $(LIBS)
+ make clean
+ mv snap snapxl
+ make clean
+roc: $(LIB_OBJ) $(ROC_OBJ)
+ $(CXX) -o $@ $(CXXFLAGS) -Itests $(LDFLAGS) $^ $(LIBS)
+unit_tests: $(LIB_OBJ) $(TEST_OBJ)
+ $(CXX) -o $@ $(CXXFLAGS) -Itests $(LDFLAGS) $^ $(LIBS)
+ rm -f $(ALL_OBJ) $(DEPS) $(EXES) snap SNAP
+.phony: clean default
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..27be663
--- /dev/null
+++ b/README.md
@@ -0,0 +1,27 @@
+Scalable Nucleotide Alignment Program - <http://snap.cs.berkeley.edu>
+## Overview
+SNAP is a fast and accurate aligner for short DNA reads. It is optimized for
+modern read lengths of 100 bases or higher, and takes advantage of these reads
+to align data quickly through a hash-based indexing scheme.
+## Documentation
+A quick start guide and user manual are available in the `docs` folder, with
+additional documentation at <http://snap.cs.berkeley.edu>.
+## Building
+SNAP runs on Windows, Linux and Mac OS X.
+For Windows, we provide a Visual C++ project, `snap.sln`. Requirements:
+- Visual Studio 2012 (11.0)
+For Linux and OS X, simply type `make`. Requirements:
+- g++ version 4.6
+- zlib 1.2.8 from http://zlib.net/
diff --git a/SNAPLib/AlignerContext.cpp b/SNAPLib/AlignerContext.cpp
new file mode 100644
index 0000000..90a0646
--- /dev/null
+++ b/SNAPLib/AlignerContext.cpp
@@ -0,0 +1,510 @@
+Module Name:
+ AlignerContext.cpp
+ Common parameters for running single & paired alignment.
+ Ravi Pandya, May, 2012
+ User mode service.
+Revision History:
+ Integrated from SingleAligner.cpp & PairedAligner.cpp
+#include "stdafx.h"
+#include "Compat.h"
+#include "options.h"
+#include "AlignerOptions.h"
+#include "AlignerContext.h"
+#include "AlignerStats.h"
+#include "BaseAligner.h"
+#include "FileFormat.h"
+#include "exit.h"
+#include "PairedAligner.h"
+#include "Error.h"
+#include "Util.h"
+#include "CommandProcessor.h"
+using std::max;
+using std::min;
+// Save the index & index directory globally so that we don't need to reload them on multiple runs.
+GenomeIndex *g_index = NULL;
+char *g_indexDirectory = NULL;
+AlignerContext::AlignerContext(int i_argc, const char **i_argv, const char *i_version, AlignerExtension* i_extension)
+ :
+ index(NULL),
+ writerSupplier(NULL),
+ options(NULL),
+ stats(NULL),
+ extension(i_extension != NULL ? i_extension : new AlignerExtension()),
+ readWriter(NULL),
+ argc(i_argc),
+ argv(i_argv),
+ version(i_version),
+ perfFile(NULL)
+ delete extension;
+ if (NULL != perfFile) {
+ fclose(perfFile);
+ }
+void AlignerContext::runAlignment(int argc, const char **argv, const char *version, unsigned *argsConsumed)
+ options = parseOptions(argc, argv, version, argsConsumed, isPaired());
+ if (NULL == options) { // Didn't parse correctly
+ *argsConsumed = argc;
+ return;
+ }
+#ifdef _MSC_VER
+ useTimingBarrier = options->useTimingBarrier;
+ if (!initialize()) {
+ return;
+ }
+ extension->initialize();
+ if (! extension->skipAlignment()) {
+ WriteStatusMessage("Aligning.\n");
+ beginIteration();
+ runTask();
+ finishIteration();
+ printStatsHeader();
+ printStats();
+ nextIteration(); // This probably should get rolled into something else; it's really cleanup code, not "next iteration"
+ }
+ extension->finishAlignment();
+ PrintBigAllocProfile();
+ PrintWaitProfile();
+ void
+ stats = newStats(); // separate copy per thread
+ stats->extra = extension->extraStats();
+ readWriter = writerSupplier != NULL ? writerSupplier->getWriter() : NULL;
+ extension = extension->copy();
+ void
+ extension->beginThread();
+ runIterationThread();
+ if (readWriter != NULL) {
+ readWriter->close();
+ delete readWriter;
+ }
+ extension->finishThread();
+ void
+AlignerContext::finishThread(AlignerContext* common)
+ common->stats->add(stats);
+ delete stats;
+ stats = NULL;
+ delete extension;
+ extension = NULL;
+ bool
+ if (g_indexDirectory == NULL || strcmp(g_indexDirectory, options->indexDir) != 0) {
+ delete g_index;
+ g_index = NULL;
+ delete g_indexDirectory;
+ g_indexDirectory = new char [strlen(options->indexDir) + 1];
+ strcpy(g_indexDirectory, options->indexDir);
+ if (strcmp(options->indexDir, "-") != 0) {
+ WriteStatusMessage("Loading index from directory... ");
+ fflush(stdout);
+ _int64 loadStart = timeInMillis();
+ index = GenomeIndex::loadFromDirectory((char*) options->indexDir, options->mapIndex, options->prefetchIndex);
+ if (index == NULL) {
+ WriteErrorMessage("Index load failed, aborting.\n");
+ return false;
+ }
+ g_index = index;
+ _int64 loadTime = timeInMillis() - loadStart;
+ WriteStatusMessage("%llds. %u bases, seed size %d\n",
+ loadTime / 1000, index->getGenome()->getCountOfBases(), index->getSeedLength());
+ } else {
+ WriteStatusMessage("no alignment, input/output only\n");
+ }
+ } else {
+ index = g_index;
+ }
+ maxHits_ = options->maxHits;
+ maxDist_ = options->maxDist;
+ extraSearchDepth = options->extraSearchDepth;
+ noUkkonen = options->noUkkonen;
+ noOrderedEvaluation = options->noOrderedEvaluation;
+ noTruncation = options->noTruncation;
+ maxSecondaryAlignmentAdditionalEditDistance = options->maxSecondaryAlignmentAdditionalEditDistance;
+ maxSecondaryAlignments = options->maxSecondaryAlignments;
+ maxSecondaryAlignmentsPerContig = options->maxSecondaryAlignmentsPerContig;
+ if (maxSecondaryAlignmentAdditionalEditDistance < 0 && (maxSecondaryAlignments < 1000000 || maxSecondaryAlignmentsPerContig > 0)) {
+ WriteErrorMessage("You set -omax and/or -mpc without setting -om. They're meaningful only in the context of -om, so you probably didn't really mean to do that.\n");
+ soft_exit(1);
+ }
+ minReadLength = options->minReadLength;
+ if (index != NULL && (int)minReadLength < index->getSeedLength()) {
+ WriteErrorMessage("The min read length (%d) must be at least the seed length (%d), or there's no hope of aligning reads that short.\n", minReadLength, index->getSeedLength());
+ return false;
+ }
+ if (options->perfFileName != NULL) {
+ perfFile = fopen(options->perfFileName,"a");
+ if (NULL == perfFile) {
+ WriteErrorMessage("Unable to open perf file '%s'\n", options->perfFileName);
+ return false;
+ }
+ }
+ DataSupplier::ThreadCount = options->numThreads;
+ return true;
+ void
+ WriteStatusMessage("Total Reads Aligned, MAPQ >= %2d Aligned, MAPQ < %2d Unaligned Too Short/Too Many Ns %%Pairs\tReads/s Time in Aligner (s)\n", MAPQ_LIMIT_FOR_SINGLE_HIT, MAPQ_LIMIT_FOR_SINGLE_HIT);
+ void
+ writerSupplier = NULL;
+ alignStart = timeInMillis();
+ clipping = options->clipping;
+ totalThreads = options->numThreads;
+ bindToProcessors = options->bindToProcessors;
+ maxDist = maxDist_;
+ maxHits = maxHits_;
+ numSeedsFromCommandLine = options->numSeedsFromCommandLine;
+ seedCoverage = options->seedCoverage;
+ minWeightToCheck = options->minWeightToCheck;
+ if (stats != NULL) {
+ delete stats;
+ }
+ stats = newStats();
+ stats->extra = extension->extraStats();
+ extension->beginIteration();
+ memset(&readerContext, 0, sizeof(readerContext));
+ readerContext.clipping = options->clipping;
+ readerContext.defaultReadGroup = options->defaultReadGroup;
+ readerContext.genome = index != NULL ? index->getGenome() : NULL;
+ readerContext.ignoreSecondaryAlignments = options->ignoreSecondaryAlignments;
+ readerContext.ignoreSupplementaryAlignments = options->ignoreSecondaryAlignments; // Maybe we should split them out
+ DataSupplier::ExpansionFactor = options->expansionFactor;
+ typeSpecificBeginIteration();
+ if (UnknownFileType != options->outputFile.fileType) {
+ const FileFormat* format;
+ if (SAMFile == options->outputFile.fileType) {
+ format = FileFormat::SAM[options->useM];
+ } else if (BAMFile == options->outputFile.fileType) {
+ format = FileFormat::BAM[options->useM];
+ } else {
+ //
+ // This shouldn't happen, because the command line parser should catch it. Perhaps you've added a new output file format and just
+ // forgotten to add it here.
+ //
+ WriteErrorMessage("AlignerContext::beginIteration(): unknown file type %d for '%s'\n", options->outputFile.fileType, options->outputFile.fileName);
+ soft_exit(1);
+ }
+ format->setupReaderContext(options, &readerContext);
+ writerSupplier = format->getWriterSupplier(options, readerContext.genome);
+ ReadWriter* headerWriter = writerSupplier->getWriter();
+ headerWriter->writeHeader(readerContext, options->sortOutput, argc, argv, version, options->rgLineContents, options->outputFile.omitSQLines);
+ headerWriter->close();
+ delete headerWriter;
+ }
+ void
+ extension->finishIteration();
+ if (NULL != writerSupplier) {
+ writerSupplier->close();
+ delete writerSupplier;
+ writerSupplier = NULL;
+ }
+ alignTime = /*timeInMillis() - alignStart -- use the time from ParallelTask.h, that may exclude memory allocation time*/ time;
+ bool
+ //
+ // This thing is a vestige of when we used to allow parameter ranges.
+ //
+ typeSpecificNextIteration();
+ return false;
+extern char *FormatUIntWithCommas(_uint64 val, char *outputBuffer, size_t outputBufferSize); // Relying on the one in Util.h results in an "internal compiler error" for Visual Studio.
+// Take an integer and a percentage, and turn it into a string of the form "number (percentage%)<padding>" where
+// number has commas and the whole thing is padded out with spaces to a specific length.
+char *numPctAndPad(char *buffer, _uint64 num, double pct, size_t desiredWidth, size_t bufferLen)
+ _ASSERT(desiredWidth < bufferLen); // < to leave room for trailing null.
+ FormatUIntWithCommas(num, buffer, bufferLen);
+ const size_t percentageBufferSize = 100; // Plenty big enough for any value
+ char percentageBuffer[percentageBufferSize];
+ sprintf(percentageBuffer, " (%.02f%%)", pct);
+ if (strlen(percentageBuffer) + strlen(buffer) >= bufferLen) {
+ WriteErrorMessage("numPctAndPad: overflowed output buffer\n");
+ soft_exit(1);
+ }
+ strcat(buffer, percentageBuffer);
+ for (size_t x = strlen(buffer); x < desiredWidth; x++) {
+ strcat(buffer, " ");
+ }
+ return buffer;
+ //
+ // Write the end-of-run statistics: a one-line summary via WriteStatusMessage, a tab-separated
+ // record appended to perfFile when one is open, the per-read alignment time histogram, the
+ // BSD/Hamming timing counters, and finally whatever the extension prints.
+ //
+ void
+ double usefulReads = max((double) stats->usefulReads, 1.0);
+ const size_t strBufLen = 50; // Way more than enough for 64 bit numbers with commas
+ char tooShort[strBufLen];
+ char single[strBufLen];
+ char multi[strBufLen];
+ char unaligned[strBufLen];
+ char numReads[strBufLen];
+ char readsPerSecond[strBufLen];
+ char alignTimeString[strBufLen];
+ //
+ // A literal percent sign is "%%", so the pairs percentage is "%.02f%%" followed by the tab.
+ // (A third '%' would start an invalid conversion with the tab as its specifier.)
+ //
+ WriteStatusMessage("%-14s %s %s %s %s %.02f%%\t%-9s %s\n",
+     FormatUIntWithCommas(stats->totalReads, numReads, strBufLen),
+     numPctAndPad(single, stats->singleHits, 100.0 * stats->singleHits / stats->totalReads, 22, strBufLen),
+     numPctAndPad(multi, stats->multiHits, 100.0 * stats->multiHits / stats->totalReads, 22, strBufLen),
+     numPctAndPad(unaligned, stats->notFound, 100.0 * stats->notFound / stats->totalReads, 22, strBufLen),
+     numPctAndPad(tooShort, stats->totalReads - stats->usefulReads, 100.0 * (stats->totalReads - stats->usefulReads) / max(stats->totalReads, (_int64)1), 22, strBufLen),
+     100.0 * stats->alignedAsPairs / stats->totalReads,
+     FormatUIntWithCommas((unsigned _int64)(1000 * stats->totalReads / max(alignTime, (_int64)1)), readsPerSecond, strBufLen), // Aligntime is in ms
+     FormatUIntWithCommas((alignTime + 500) / 1000, alignTimeString, strBufLen)
+ );
+ if (NULL != perfFile) {
+     //
+     // The arguments are listed in the same order as the conversions: two ints, five
+     // percentages (%0.2f), two integer counters (%lld), then reads per second.  The pairs
+     // percentage therefore has to come before lvCalls.
+     //
+     // NOTE(review): the "\tt" before the last field looks like a typo for "\t"; it is left
+     // unchanged in case something parses the existing perf-file layout -- confirm.
+     //
+     fprintf(perfFile, "%d\t%d\t%0.2f%%\t%0.2f%%\t%0.2f%%\t%0.2f%%\t%0.2f%%\t%lld\t%lld\tt%.0f\n",
+         maxHits_, maxDist_,
+         100.0 * usefulReads / max(stats->totalReads, (_int64) 1),
+         100.0 * stats->singleHits / stats->totalReads,
+         100.0 * stats->multiHits / stats->totalReads,
+         100.0 * stats->notFound / stats->totalReads,
+         100.0 * stats->alignedAsPairs / stats->totalReads,
+         stats->lvCalls,
+         stats->totalReads,
+         (1000.0 * usefulReads) / max(alignTime, (_int64) 1));
+     fprintf(perfFile,"\n");
+ }
+ WriteStatusMessage("Per-read alignment time histogram:\nlog2(ns)\tcount\ttotal time (ns)\n");
+ for (int i = 0; i < 31; i++) {
+     WriteStatusMessage("%d\t%lld\t%lld\n", i, stats->countByTimeBucket[i], stats->nanosByTimeBucket[i]);
+ }
+ stats->printHistograms(stdout);
+ // Timers are kept in nanoseconds; dividing by 1000000000 reports whole seconds.
+ WriteStatusMessage("%llds, %lld calls in BSD noneClose, not -1\n", stats->nanosTimeInBSD[0][1]/1000000000, stats->BSDCounts[0][1]);
+ WriteStatusMessage("%llds, %lld calls in BSD noneClose, -1\n", stats->nanosTimeInBSD[0][0]/1000000000, stats->BSDCounts[0][0]);
+ WriteStatusMessage("%llds, %lld calls in BSD close, not -1\n", stats->nanosTimeInBSD[1][1]/1000000000, stats->BSDCounts[1][1]);
+ WriteStatusMessage("%llds, %lld calls in BSD close, -1\n", stats->nanosTimeInBSD[1][0]/1000000000, stats->BSDCounts[1][0]);
+ WriteStatusMessage("%llds, %lld calls in Hamming\n", stats->hammingNanos/1000000000, stats->hammingCount);
+ extension->printStats();
+ //
+ // Parse the command line for one alignment run.
+ //
+ // i_argc/i_argv/i_version are remembered in the argc/argv/version members.  argv[1] is the
+ // index directory; everything from argv[2] on is either an input file specifier or an option,
+ // up to the end of the arguments or a lone "," that separates this run from the next one.
+ //
+ // On success returns a newly allocated options object and stores in *argsConsumed the index
+ // of the first argument that was not consumed.  On any error an error message is written,
+ // everything allocated here is freed, and NULL is returned.
+ //
+ AlignerOptions*
+ int i_argc,
+ const char **i_argv,
+ const char *i_version,
+ unsigned *argsConsumed,
+ bool paired)
+ argc = i_argc;
+ argv = i_argv;
+ version = i_version;
+ AlignerOptions *options;
+ if (paired) {
+     options = new PairedAlignerOptions("snap paired <index-dir> <inputFile(s)> [<options>] where <input file(s)> is a list of files to process.\n");
+ } else {
+     options = new AlignerOptions("snap single <index-dir> <inputFile(s)> [<options>] where <input file(s)> is a list of files to process.\n");
+ }
+ options->extra = extension->extraOptions();
+ if (argc < 3) {
+     WriteErrorMessage("Too few parameters\n");
+     options->usage();
+     delete options;
+     return NULL;
+ }
+ options->indexDir = argv[1];
+ struct InputList {
+     SNAPFile input;
+     InputList* next;
+     // Free every node of a (possibly empty) list; used on the error paths below.
+     static void destroy(InputList *head) {
+         while (NULL != head) {
+             InputList *dying = head;
+             head = head->next;
+             delete dying;
+         }
+     }
+ } *inputList = NULL;
+ //
+ // Now build the input array and parse options.
+ //
+ bool inputFromStdio = false;
+ int i;
+ int nInputs = 0;
+ for (i = 2; i < argc; i++) { // Starting at 2 skips single/paired and the index
+     if (',' == argv[i][0] && '\0' == argv[i][1]) {
+         i++; // Consume the comma
+         break;
+     }
+     int inputArgsConsumed; // Distinct name so it doesn't hide the argsConsumed out-parameter.
+     SNAPFile input;
+     if (SNAPFile::generateFromCommandLine(argv+i, argc-i, &inputArgsConsumed, &input, paired, true)) {
+         if (input.isStdio) {
+             if (CommandPipe != NULL) {
+                 WriteErrorMessage("You may not use stdin/stdout in daemon mode\n");
+                 InputList::destroy(inputList);
+                 delete options;
+                 return NULL;
+             }
+             if (inputFromStdio) {
+                 WriteErrorMessage("You specified stdin ('-') specified for more than one input, which isn't permitted.\n");
+                 InputList::destroy(inputList);
+                 delete options;
+                 return NULL;
+             } else {
+                 inputFromStdio = true;
+             }
+         }
+         InputList *listEntry = new InputList;
+         listEntry->input = input;
+         listEntry->next = inputList;
+         inputList = listEntry; // Yes, this puts them in backwards. a) We reverse them at the end and b) it doesn't matter anyway
+         nInputs++;
+         i += inputArgsConsumed - 1; // -1 because the loop increment accounts for one argument
+         continue;
+     }
+     bool done;
+     int oldI = i; // parse() may advance i, so remember where this option started for the message
+     if (!options->parse(argv, argc, i, &done)) {
+         WriteErrorMessage("Didn't understand options starting at %s\n", argv[oldI]);
+         options->usage();
+         InputList::destroy(inputList);
+         delete options;
+         return NULL;
+     }
+     if (done) {
+         i++; // For the ',' arg
+         break;
+     }
+ }
+ if (0 == nInputs) {
+     WriteErrorMessage("No input files specified.\n");
+     delete options;
+     return NULL;
+ }
+ if (options->maxDist + options->extraSearchDepth >= MAX_K) {
+     WriteErrorMessage("You specified too large of a maximum edit distance combined with extra search depth. The must add up to less than %d.\n", MAX_K);
+     WriteErrorMessage("Either reduce their sum, or change MAX_K in LandauVishkin.h and recompile.\n");
+     InputList::destroy(inputList);
+     delete options;
+     return NULL;
+ }
+ if (options->maxSecondaryAlignmentAdditionalEditDistance > (int)options->extraSearchDepth) {
+     WriteErrorMessage("You can't have the max edit distance for secondary alignments (-om) be bigger than the max search depth (-D)\n");
+     InputList::destroy(inputList);
+     delete options;
+     return NULL;
+ }
+ options->nInputs = nInputs;
+ options->inputs = new SNAPFile[nInputs];
+ for (int j = nInputs - 1; j >= 0; j --) {
+     // The loop runs backwards so that we reverse the reversing that we did when we built it. Not that it matters anyway.
+     _ASSERT(NULL != inputList);
+     options->inputs[j] = inputList->input;
+     InputList *dying = inputList;
+     inputList = inputList->next;
+     delete dying;
+ }
+ _ASSERT(NULL == inputList);
+ *argsConsumed = i;
+ return options;
diff --git a/SNAPLib/AlignerContext.h b/SNAPLib/AlignerContext.h
new file mode 100644
index 0000000..5226788
--- /dev/null
+++ b/SNAPLib/AlignerContext.h
@@ -0,0 +1,166 @@
+Module Name:
+ AlignerContext.h
+ Common parameters for running single & paired alignment.
+ Ravi Pandya, May, 2012
+ User mode service.
+Revision History:
+ Integrated from SingleAligner.cpp & PairedAligner.cpp
+#pragma once
+#include "stdafx.h"
+#include "Genome.h"
+#include "RangeSplitter.h"
+#include "AlignerOptions.h"
+#include "AlignerStats.h"
+#include "ParallelTask.h"
+#include "GenomeIndex.h"
+class AlignerExtension;
+ Common context state shared across threads during alignment process
+class AlignerContext : public TaskContextBase
+ AlignerContext(int i_argc, const char **i_argv, const char *i_version, AlignerExtension* i_extension = NULL);
+ ~AlignerContext();
+ // running alignment
+ void runAlignment(int argc, const char **argv, const char *version, unsigned *nArgsConsumed);
+ // ParallelTask template
+ void initializeThread();
+ void runThread();
+ void finishThread(AlignerContext* common);
+ void printStatsHeader();
+ void printStats();
+ void beginIteration();
+ void finishIteration();
+ // advance to next iteration in range, return false when past end
+ bool nextIteration();
+ // overrideable by concrete single/paired alignment subclasses
+ // parse options from the command line
+ AlignerOptions* parseOptions(int argc, const char **argv, const char *version, unsigned *argsConsumed, bool paired);
+ // initialize from options
+ virtual bool initialize();
+ // new stats object
+ virtual AlignerStats* newStats() = 0;
+ // instantiate and run a parallel task
+ virtual void runTask() = 0;
+ // run single thread within single iteration
+ virtual void runIterationThread() = 0;
+ virtual void typeSpecificBeginIteration() = 0;
+ virtual void typeSpecificNextIteration() = 0;
+ virtual bool isPaired() = 0;
+ friend class AlignerContext2;
+ // common state across all threads
+ GenomeIndex *index;
+ ReadWriterSupplier *writerSupplier;
+ ReaderContext readerContext;
+ _int64 alignStart;
+ _int64 alignTime;
+ AlignerOptions *options;
+ AlignerStats *stats;
+ AlignerExtension *extension;
+ unsigned maxDist;
+ unsigned numSeedsFromCommandLine;
+ double seedCoverage;
+ unsigned minWeightToCheck;
+ int maxHits;
+ bool detailedStats;
+ ReadClippingType clipping;
+ unsigned extraSearchDepth;
+ int argc;
+ const char **argv;
+ const char *version;
+ FILE *perfFile;
+ bool noUkkonen;
+ bool noOrderedEvaluation;
+ bool noTruncation;
+ int maxSecondaryAlignmentAdditionalEditDistance;
+ int maxSecondaryAlignments;
+ int maxSecondaryAlignmentsPerContig;
+ unsigned minReadLength;
+ // iteration variables
+ int maxHits_;
+ int maxDist_;
+ // Per-thread context state used during alignment process
+ ReadWriter *readWriter;
+// Base class for extending the base context.  Every hook below has a default body that does
+// nothing (or returns NULL/false), so a subclass overrides only the ones it needs; copy()
+// shows that the base class itself can be instantiated.
+class AlignerExtension
+ virtual ~AlignerExtension() {}
+ // Extra options and stats objects supplied by the extension; NULL means there are none.
+ virtual AbstractOptions* extraOptions() { return NULL; }
+ virtual AbstractStats* extraStats() { return NULL; }
+ // Defaults to false.  NOTE(review): presumably true lets an extension skip the normal alignment -- confirm at the call site.
+ virtual bool skipAlignment() { return false; }
+ virtual void initialize() {}
+ virtual void beginIteration() {}
+ // Returns a newly allocated extension object; the default creates a plain AlignerExtension.
+ virtual AlignerExtension* copy() { return new AlignerExtension(); }
+ virtual void beginThread() {}
+ // One overload for paired-end and one for single-end read suppliers; both default to returning false.
+ virtual bool runIterationThread(PairedReadSupplier* supplier, AlignerContext* threadContext) { return false; }
+ virtual bool runIterationThread(ReadSupplier* supplier, AlignerContext* threadContext) { return false; }
+ virtual void finishThread() {}
+ virtual void finishIteration() {}
+ virtual void printStats() {}
+ virtual void finishAlignment() {}
diff --git a/SNAPLib/AlignerOptions.cpp b/SNAPLib/AlignerOptions.cpp
new file mode 100644
index 0000000..85ee223
--- /dev/null
+++ b/SNAPLib/AlignerOptions.cpp
@@ -0,0 +1,975 @@
+Module Name:
+ AlignerOptions.cpp
+ Common parameters for running single & paired alignment.
+ Ravi Pandya, May, 2012
+ User mode service.
+Revision History:
+ Integrated from SingleAligner.cpp & PairedAligner.cpp
+#include "stdafx.h"
+#include "options.h"
+#include "AlignerOptions.h"
+#include "FASTQ.h"
+#include "SAM.h"
+#include "Bam.h"
+#include "exit.h"
+#include "Error.h"
+#include "BaseAligner.h"
+#include "CommandProcessor.h"
+ const char* i_commandLine,
+ bool forPairedEnd)
+ :
+ commandLine(i_commandLine),
+ indexDir(NULL),
+ similarityMapFile(NULL),
+ numThreads(GetNumberOfProcessors()),
+ bindToProcessors(true),
+ ignoreMismatchedIDs(false),
+ clipping(ClipBack),
+ sortOutput(false),
+ noIndex(false),
+ noDuplicateMarking(false),
+ noQualityCalibration(false),
+ sortMemory(0),
+ filterFlags(0),
+ explorePopularSeeds(false),
+ stopOnFirstHit(false),
+ useM(true),
+ gapPenalty(0),
+ extra(NULL),
+ rgLineContents("@RG\tID:FASTQ\tPL:Illumina\tPU:pu\tLB:lb\tSM:sm"),
+ perfFileName(NULL),
+ useTimingBarrier(false),
+ extraSearchDepth(2),
+ defaultReadGroup("FASTQ"),
+ seedCountSpecified(false),
+ minWeightToCheck(1),
+ numSeedsFromCommandLine(0),
+ ignoreSecondaryAlignments(true),
+ maxSecondaryAlignmentAdditionalEditDistance(-1),
+ maxSecondaryAlignments(0x7fffffff),
+ maxSecondaryAlignmentsPerContig(-1), // -1 means don't limit
+ preserveClipping(false),
+ expansionFactor(1.0),
+ noUkkonen(false),
+ noOrderedEvaluation(false),
+ noTruncation(false),
+ maxDistFraction(0.0),
+ mapIndex(false),
+ prefetchIndex(false),
+ writeBufferSize(16 * 1024 * 1024)
+ if (forPairedEnd) {
+ maxDist = 15;
+ seedCoverage = 0;
+ numSeedsFromCommandLine = 8;
+ maxHits = 300;
+ } else {
+ maxDist = 14;
+ numSeedsFromCommandLine = 25;
+ maxHits = 300;
+ seedCoverage = 0;
+ }
+ initializeLVProbabilitiesToPhredPlus33();
+ void
+ usageMessage();
+ void
+ WriteErrorMessage(
+ "Usage: \n%s\n"
+ "Options:\n"
+ " -o filename output alignments to filename in SAM or BAM format, depending on the file extension or\n"
+ " explicit type specifier (see below). Use a dash with an explicit type specifier to write to\n"
+ " stdout, so for example -o -sam - would write SAM output to stdout\n"
+ " -d maximum edit distance allowed per read or pair (default: %d)\n"
+ " -n number of seeds to use per read\n"
+ " -sc Seed coverage (i.e., readSize/seedSize). Floating point. Exclusive with -n. (default uses -n)\n"
+ " -h maximum hits to consider per seed (default: %d)\n"
+ " -ms minimum seed matches per location (default: %d)\n"
+ " -t number of threads (default is one per core)\n"
+ " -b bind each thread to its processor (this is the default)\n"
+ " --b Don't bind each thread to its processor (note the double dash)\n"
+ " -P disables cache prefetching in the genome; may be helpful for machines\n"
+ " with small caches or lots of cores/cache\n"
+ " -so sort output file by alignment location\n"
+ " -sm memory to use for sorting in Gb\n"
+ " -x explore some hits of overly popular seeds (useful for filtering)\n"
+ " -f stop on first match within edit distance limit (filtering mode)\n"
+ " -F filter output (a=aligned only, s=single hit only (MAPQ >= %d), u=unaligned only, l=long enough to align (see -mrl))\n"
+ " -S suppress additional processing (sorted BAM output only)\n"
+ " i=index, d=duplicate marking\n"
+ " -I ignore IDs that don't match in the paired-end aligner\n"
+#ifdef _MSC_VER // Only need this on Windows, since memory allocation is fast on Linux
+ " -B Insert barrier after per-thread memory allocation to improve timing accuracy\n"
+#endif // _MSC_VER
+ " -Cxx must be followed by two + or - symbols saying whether to clip low-quality\n"
+ " bases from front and back of read respectively; default: back only (-C-+)\n"
+ " -M indicates that CIGAR strings in the generated SAM file should use M (alignment\n"
+ " match) rather than = and X (sequence (mis-)match). This is the default\n"
+ " -= use the new style CIGAR strings with = and X rather than M. The opposite of -M\n"
+ " -G specify a gap penalty to use when generating CIGAR strings\n"
+ " -pf specify the name of a file to contain the run speed\n"
+ " --hp Indicates not to use huge pages (this may speed up index load and slow down alignment) This is the default\n"
+ " -hp Indicates to use huge pages (this may speed up alignment and slow down index load).\n"
+ " -D Specifies the extra search depth (the edit distance beyond the best hit that SNAP uses to compute MAPQ). Default 2\n"
+ " -rg Specify the default read group if it is not specified in the input file\n"
+ " -R Specify the entire read group line for the SAM/BAM output. This must include an ID tag. If it doesn't start with\n"
+ " '@RG' SNAP will add that. Specify tabs by \\t. Two backslashes will generate a single backslash.\n"
+ " backslash followed by anything else is illegal. So, '-R @RG\\tID:foo\\tDS:my data' would generate reads\n"
+ " with defualt tag foo, and an @RG line that also included the DS:my data field.\n"
+ " -sa Include reads from SAM or BAM files with the secondary (0x100) or supplementary (0x800) flag set; default is to drop them.\n"
+ " -om Output multiple alignments. Takes as a parameter the maximum extra edit distance relative to the best alignment\n"
+ " to allow for secondary alignments\n"
+ " -omax Limit the number of alignments per read generated by -om. This means that if -om would generate more\n"
+ " than -omax secondary alignments, SNAP will write out only the best -omax of them, where 'best' means\n"
+ " 'with the lowest edit distance'. Ties are broken arbitrarily.\n"
+ " -mpc Limit the number of alignments generated by -om to this many per contig (chromosome/FASTA entry);\n"
+ " 'mpc' means 'max per contig; default unlimited. This filter is applied prior to -omax. The primary alignment\n"
+ " is counted.\n"
+ " -pc Preserve the soft clipping for reads coming from SAM or BAM files\n"
+ " -xf Increase expansion factor for BAM and GZ files (default %.1f)\n"
+ " -hdp Use Hadoop-style prefixes (reporter:status:...) on error messages, and emit hadoop-style progress messages\n"
+ " -mrl Specify the minimum read length to align, reads shorter than this (after clipping) stay unaligned. This should be\n"
+ " a good bit bigger than the seed length or you might get some questionable alignments. Default %d\n"
+ " -map Use file mapping to load the index rather than reading it. This might speed up index loading in cases\n"
+ " where SNAP is run repatedly on the same index, and the index is larger than half of the memory size\n"
+ " of the machine. On some operating systems, loading an index with -map is much slower than without if the\n"
+ " index is not in memory. You might consider adding -pre to prefetch the index into system cache when loading\n"
+ " with -map when you don't expect the index to be in cache.\n"
+ " -pre Prefetch the index into system cache. This is only meaningful with -map, and only helps if the index is not\n"
+ " already in memory and your operating system is slow at reading mapped files (i.e., some versions of Linux,\n"
+ " but not Windows).\n"
+ " -lp Run SNAP at low scheduling priority (Only implemented on Windows)\n"
+#ifdef LONG_READS
+ " -dp Edit distance as a percentage of read length (single only, overrides -d)\n"
+ " -nu No Ukkonen: don't reduce edit distance search based on prior candidates. This option is purely for\n"
+ " evalutating the performance effect of using Ukkonen's algorithm rather than Smith-Waterman, and specifying\n"
+ " it will slow down execution without improving the alignments.\n"
+ " -no No Ordering: don't order the evalutation of reads so as to select more likely candidates first. This option\n"
+ " is purely for evaluating the performance effect of the read evaluation order, and specifying it will slow\n"
+ " down execution without improving alignments.\n"
+ " -nt Don't truncate searches based on missed seed hits. This option is purely for evaluating the performance effect\n"
+ " of candidate truncation, and specifying it will slow down execution without improving alignments.\n"
+ " -wbs Write buffer size in megabytes. Don't specify this unless you've gotten an error message saying to make it bigger. Default 16.\n"
+ ,
+ commandLine,
+ maxDist,
+ maxHits,
+ minWeightToCheck,
+ expansionFactor,
+ if (extra != NULL) {
+ extra->usageMessage();
+ }
+ //
+ // Explain how to chain several alignments on one command line.  All of the literals below are
+ // adjacent so that they concatenate into a single message; a comma between any two of them
+ // would end the message there and turn the rest into an unused argument.
+ //
+ WriteErrorMessage("\n\n"
+ "You may process more than one alignment without restarting SNAP, and if possible without reloading\n"
+ "the index. In order to do this, list on the command line all of the parameters for the first\n"
+ "alignment, followed by a comma (separated by a space from the other parameters) followed by the\n"
+ "parameters for the next alignment (including single or paired). You may have as many of these\n"
+ "as you please. If two consecutive alignments use the same index, it will not be reloaded.\n"
+ "So, for example, you could do 'snap single hg19-20 foo.fq -o foo.sam , paired hg19-20 end1.fq end2.fq -o paired.sam'\n"
+ "and it would not reload the index between the single and paired alignments.\n"
+ "SNAP doesn't parse the options for later runs until the earlier ones have completed, so if you make\n"
+ "an error in one, it may take a while for you to notice. So, be careful (or check back shortly after\n"
+ "you think each run will have completed).\n\n");
+ WriteErrorMessage("When specifying an input or output file, you can simply list the filename, in which case\n"
+ "SNAP will infer the type of the file from the file extension (.sam or .bam for example),\n"
+ "or you can explicitly specify the file type by preceeding the filename with one of the\n"
+ " following type specifiers (which are case sensitive):\n"
+ " -fastq\n"
+ " -compressedFastq\n"
+ " -sam\n"
+ " -bam\n"
+ " -pairedFastq\n"
+ " -pairedInterleavedFastq\n"
+ " -pairedCompressedInterleavedFastq\n"
+ "\n"
+ "So, for example, you could specify -bam input.file to make SNAP treat input.file as a BAM file,\n"
+ "even though it would ordinarily assume a FASTQ file for input or a SAM file for output when it\n"
+ "doesn't recoginize the file extension.\n"
+ "In order to use a file name that begins with a '-' and not have SNAP treat it as a switch, you must\n"
+ "explicitly specify the type. But really, that's just confusing and you shouldn't do it.\n"
+ "Input and output may also be from/to stdin/stdout. To do that, use a - for the input or output file\n"
+ "name and give an explicit type specifier. So, for example, \n"
+ "snap single myIndex -fastq - -o -sam -\n"
+ "would read FASTQ from stdin and write SAM to stdout.\n"
+ );
+ bool
+ const char** argv,
+ int argc,
+ int& n,
+ bool *done)
+ *done = false;
+ if (strcmp(argv[n], "-d") == 0) {
+ if (n + 1 < argc) {
+ maxDist = atoi(argv[n+1]);
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-n") == 0) {
+ if (n + 1 < argc) {
+ if (seedCountSpecified) {
+ WriteErrorMessage("-sc and -n are mutually exclusive. Please use only one.\n");
+ return false;
+ }
+ seedCountSpecified = true;
+ numSeedsFromCommandLine = atoi(argv[n+1]);
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-sc") == 0) {
+ if (n + 1 < argc) {
+ if (seedCountSpecified) {
+ WriteErrorMessage("-sc and -n are mutually exclusive. Please use only one.\n");
+ return false;
+ }
+ seedCountSpecified = true;
+ seedCoverage = atof(argv[n+1]);
+ numSeedsFromCommandLine = 0;
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-ms") == 0) {
+     // -ms <count>: minimum seed matches per location.
+     if (n + 1 < argc) {
+         minWeightToCheck = (unsigned) atoi(argv[n+1]);
+         //
+         // Enforce both ends of the documented range.  A negative value turns into a huge
+         // unsigned number in the cast above, so the upper bound catches it.
+         //
+         if (minWeightToCheck < 1 || minWeightToCheck > 1000) {
+             WriteErrorMessage("-ms must be between 1 and 1000\n");
+             return false;
+         }
+         n++;
+         return true;
+     }
+ } else if (strcmp(argv[n], "-h") == 0) {
+ if (n + 1 < argc) {
+ maxHits = atoi(argv[n+1]);
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-c") == 0) { // conf diff is deprecated, but we just ignore it rather than throwing an error.
+ if (n + 1 < argc) {
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-a") == 0) { // adaptive conf diff is deprecated, but we just ignore it rather than throwing an error.
+ if (n + 1 < argc) {
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-t") == 0) {
+ if (n + 1 < argc) {
+ numThreads = atoi(argv[n+1]);
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-o") == 0) {
+ int argsConsumed;
+ if (!SNAPFile::generateFromCommandLine(argv + n + 1, argc - n - 1, &argsConsumed, &outputFile, false, false)) {
+ WriteErrorMessage("Must have a file specifier after -o\n");
+ return false;
+ }
+ if (outputFile.isStdio) {
+ AlignerOptions::outputToStdout = true;
+ }
+ n += argsConsumed;
+ return true;
+ } else if (strcmp(argv[n], "-P") == 0) {
+ doAlignerPrefetch = false;
+ return true;
+ } else if (strcmp(argv[n], "-b") == 0) {
+ bindToProcessors = true;
+ return true;
+ } else if (strcmp(argv[n], "--b") == 0) {
+ bindToProcessors = false;
+ return true;
+ } else if (strcmp(argv[n], "-so") == 0) {
+ sortOutput = true;
+ return true;
+ } else if (strcmp(argv[n], "-map") == 0) {
+ mapIndex = true;
+ return true;
+ } else if (strcmp(argv[n], "-pre") == 0) {
+ prefetchIndex = true;
+ return true;
+ }
+ else if (strcmp(argv[n], "-S") == 0) {
+ if (n + 1 < argc) {
+ n++;
+ for (const char* p = argv[n]; *p; p++) {
+ switch (*p) {
+ case 'i':
+ noIndex = true;
+ break;
+ case 'd':
+ noDuplicateMarking = true;
+ break;
+ case 'q':
+ noQualityCalibration = true;
+ break;
+ default:
+ return false;
+ }
+ }
+ return true;
+ }
+ } else if (strcmp(argv[n], "-sm") == 0) {
+ if (n + 1 < argc && argv[n+1][0] >= '0' && argv[n+1][0] <= '9') {
+ sortMemory = atoi(argv[n+1]);
+ n++;
+ return true;
+ }
+ } else if (strcmp(argv[n], "-F") == 0) {
+ if (n + 1 < argc) {
+ n++;
+ if (strcmp(argv[n], "a") == 0) {
+ if (0 != filterFlags) {
+ WriteErrorMessage("Specified -F %s after a previous -F option. Choose one (or put -F b after -F %s)\n", argv[n], argv[n]);
+ return false;
+ }
+ filterFlags = FilterSingleHit | FilterMultipleHits | FilterTooShort;
+ } else if (strcmp(argv[n], "s") == 0) {
+ if (0 != filterFlags) {
+ WriteErrorMessage("Specified -F %s after a previous -F option. Choose one (or put -F b after -F %s)\n", argv[n], argv[n]);
+ return false;
+ }
+ filterFlags = FilterSingleHit | FilterTooShort;
+ } else if (strcmp(argv[n], "u") == 0) {
+ if (0 != filterFlags) {
+ WriteErrorMessage("Specified -F %s after a previous -F option. Choose one (or put -F b after -F %s)\n", argv[n], argv[n]);
+ return false;
+ }
+ filterFlags = FilterUnaligned | FilterTooShort;
+ } else if (strcmp(argv[n], "l") == 0) {
+ if (0 != filterFlags) {
+ WriteErrorMessage("Specified -F %s after a previous -F option. Choose one (or put -F b after -F %s)\n", argv[n], argv[n]);
+ return false;
+ }
+ filterFlags = FilterSingleHit | FilterMultipleHits | FilterUnaligned;
+ } else if (strcmp(argv[n], "b") == 0) {
+ // ignore paired-end option(s)
+ } else {
+ WriteErrorMessage("Unknown option type after -F: %s\n", argv[n]);
+ return false;
+ }
+ return true;
+ }
+ } else if (strcmp(argv[n], "-x") == 0) {
+ explorePopularSeeds = true;
+ return true;
+ } else if (strcmp(argv[n], "-f") == 0) {
+ stopOnFirstHit = true;
+ return true;
+ } else if (strcmp(argv[n], "-I") == 0) {
+ ignoreMismatchedIDs = true;
+ return true;
+#ifdef _MSC_VER
+ } else if (strcmp(argv[n], "-B") == 0) {
+ useTimingBarrier = true;
+ return true;
+#endif // _MSC_VER
+ } else if (strcmp(argv[n], "-M") == 0) {
+ useM = true;
+ return true;
+ } else if (strcmp(argv[n], "-=") == 0) {
+ useM = false;
+ return true;
+ } else if (strcmp(argv[n], "-sa") == 0) {
+ ignoreSecondaryAlignments = false;
+ return true;
+ } else if (strcmp(argv[n], "-om") == 0) {
+ if (n + 1 >= argc) {
+ WriteErrorMessage("-om requires an additional value\n");
+ return false;
+ }
+ //
+ // Check that the parameter is actually numeric. This is to avoid having someone do "-om -anotherSwitch" and
+ // having the additional switch silently consumed here.
+ //
+ if (argv[n + 1][0] < '0' || argv[n + 1][0] > '9') {
+ WriteErrorMessage("-om requires a numerical parameter.\n");
+ return false;
+ }
+ maxSecondaryAlignmentAdditionalEditDistance = atoi(argv[n + 1]);
+ n++;
+ return true;
+ } else if (strcmp(argv[n], "-omax") == 0) {
+     // -omax <count>: cap on the number of secondary alignments written per read.
+     if (n + 1 >= argc) {
+         WriteErrorMessage("-omax requires an additional value\n");
+         return false;
+     }
+     maxSecondaryAlignments = atoi(argv[n + 1]);
+     if (maxSecondaryAlignments <= 0) {
+         // Reject the value rather than only reporting it, the same way -mpc does.
+         WriteErrorMessage("-omax must be strictly positive\n");
+         return false;
+     }
+     n++;
+     return true;
+ } else if (strcmp(argv[n], "-mpc") == 0) {
+ if (n + 1 >= argc) {
+ WriteErrorMessage("-mpc requires an additional value\n");
+ return false;
+ }
+ maxSecondaryAlignmentsPerContig = atoi(argv[n + 1]);
+ if (maxSecondaryAlignmentsPerContig <= 0) {
+ WriteErrorMessage("-mpc must be strictly positive\n");
+ return false;
+ }
+ n++;
+ return true;
+ } else if (strcmp(argv[n], "-wbs") == 0) {
+     // -wbs <megabytes>: size of the write buffer.
+     if (n + 1 >= argc) {
+         WriteErrorMessage("-wbs requires an additional value\n");
+         return false;
+     }
+     //
+     // Check that the parameter is actually numeric. This is to avoid having someone do "-wbs -anotherSwitch" and
+     // having the additional switch silently consumed here.
+     //
+     if (argv[n + 1][0] < '0' || argv[n + 1][0] > '9') {
+         WriteErrorMessage("-wbs requires a numerical parameter.\n");
+         return false;
+     }
+     //
+     // Validate the megabyte count before scaling it, and do the scaling in size_t: multiplying
+     // in int overflows for 2048 MB and up.
+     // NOTE(review): assumes writeBufferSize is at least as wide as size_t -- confirm in AlignerOptions.h.
+     //
+     int megabytes = atoi(argv[n + 1]);
+     if (megabytes <= 0) {
+         WriteErrorMessage("-wbs must be bigger than zero");
+         return false;
+     }
+     writeBufferSize = (size_t)megabytes * 1024 * 1024;
+     n++;
+     return true;
+ } else if (strcmp(argv[n], "-xf") == 0) {
+ if (n + 1 < argc) {
+ n++;
+ expansionFactor = (float)atof(argv[n]);
+ return expansionFactor > 0;
+ }
+ } else if (strcmp(argv[n], "-pc") == 0) {
+ preserveClipping = true;
+ return true;
+ } else if (strcmp(argv[n], "-G") == 0) {
+ if (n + 1 < argc) {
+ gapPenalty = atoi(argv[n+1]);
+ if (gapPenalty < 1) {
+ WriteErrorMessage("Gap penalty must be at least 1.\n");
+ soft_exit(1);
+ }
+ n++;
+ return true;
+ } else {
+ WriteErrorMessage("Must have the gap penalty value after -G\n");
+ }
+ } else if (strcmp(argv[n], "-mrl") == 0) {
+ if (n + 1 < argc) {
+ n++;
+ minReadLength = atoi(argv[n]);
+ return minReadLength > 0;
+ }
+ } else if (strcmp(argv[n], "-dp") == 0) {
+ if (n + 1 < argc) {
+ n++;
+ maxDistFraction = (float) (0.01 * atof(argv[n]));
+ return (! isPaired()) && maxDistFraction > 0.0 && maxDistFraction < 1.0;
+ }
+ } else if (strcmp(argv[n], "-R") == 0) {
+     if (n + 1 < argc) {
+         //
+         // Check the line for sanity. It must consist either of @RG\t<fields> or just <fields> (in which
+         // case we add the @RG part). It must contain a field called ID. Fields are separated by tabs.
+         // We don't require that the fields be things that are listed in the SAM spec, however, because
+         // new ones might be added.
+         //
+         bool needsRG = !(argv[n+1][0] == '@' && argv[n+1][1] == 'R' && argv[n+1][2] == 'G' && argv[n+1][3] == '\\' && argv[n+1][4] == 't');
+         const unsigned buflen = (unsigned) (strlen(argv[n + 1]) + 1 + (needsRG ? 4 : 0));
+         char *buffer = new char[buflen];
+         char *copyToPtr = buffer;
+         const char *copyFromPtr = argv[n+1];
+         if (needsRG) {
+             memcpy(copyToPtr, "@RG\t", 4);
+             copyToPtr += 4;
+         }
+         //
+         // First copy the line, converting the two-character escapes (backslash-t and
+         // backslash-backslash) into a tab and a single backslash.  An escape only ever makes
+         // the text shorter, so the result always fits in buflen.
+         //
+         bool pendingBackslash = false;
+         for (; *copyFromPtr != '\0'; copyFromPtr++) {
+             if (pendingBackslash) {
+                 if (*copyFromPtr != 't' && *copyFromPtr != '\\') {
+                     WriteErrorMessage("Unrecognized escape character in -R parameter. A backslash must be followed by a t or another backslash.\n");
+                     delete [] buffer;
+                     return false;
+                 }
+                 _ASSERT((unsigned)(copyToPtr - buffer) < buflen);
+                 *copyToPtr = (*copyFromPtr == 't') ? '\t' : '\\';
+                 copyToPtr++;
+                 pendingBackslash = false;
+             } else if (*copyFromPtr == '\\') {
+                 pendingBackslash = true;
+             } else {
+                 //
+                 // Emit the character literally.
+                 //
+                 _ASSERT((unsigned)(copyToPtr - buffer) < buflen);
+                 *copyToPtr = *copyFromPtr;
+                 copyToPtr++;
+             }
+         }
+         if (pendingBackslash) {
+             //
+             // The parameter ended in the middle of an escape, which is just as illegal as a
+             // backslash followed by the wrong character.
+             //
+             WriteErrorMessage("Unrecognized escape character in -R parameter. A backslash must be followed by a t or another backslash.\n");
+             delete [] buffer;
+             return false;
+         }
+         _ASSERT((unsigned)(copyToPtr - buffer) < buflen);
+         *copyToPtr = '\0'; // Null terminate the string
+         //
+         // Now look for <tab>ID:..., and use that to set the default read group.  The ID value
+         // runs up to the next tab or the end of the line.  defaultReadGroup is only replaced
+         // once the whole line has been accepted.
+         //
+         static const char *idMarker = "\tID:";
+         const char *idField = strstr(buffer, idMarker);
+         if (NULL == idField) {
+             WriteErrorMessage("The string specified after -R must include an ID field.\n");
+             delete [] buffer;
+             return false;
+         }
+         const char *idValue = idField + strlen(idMarker);
+         size_t idTagSize = strcspn(idValue, "\t");
+         if (0 == idTagSize) {
+             WriteErrorMessage("The ID tag on the read group line specified by -R must not be empty\n");
+             delete [] buffer;
+             return false;
+         }
+         if (NULL != strstr(idValue + idTagSize, idMarker)) {
+             WriteErrorMessage("read group string specified with -R contained more than one ID field.\n");
+             delete [] buffer;
+             return false;
+         }
+         char *newReadGroup = new char[idTagSize + 1]; // +1 for null.
+         memcpy(newReadGroup, idValue, idTagSize);
+         newReadGroup[idTagSize] = '\0';
+         defaultReadGroup = newReadGroup;
+         rgLineContents = buffer; // This leaks, but so what?
+         n++;
+         return true;
+     } else {
+         WriteErrorMessage("-R requires a value");
+         return false;
+     }
+ } else if (strcmp(argv[n], "-pf") == 0) {
+ if (n + 1 < argc) {
+ perfFileName = argv[n+1];
+ n++;
+ return true;
+ } else {
+ WriteErrorMessage("Must specify the name of the perf file after -pf\n");
+ }
+ } else if (strcmp(argv[n], "-rg") == 0) {
+ if (n + 1 < argc) {
+ char *newReadGroup = new char[strlen(argv[n+1]) + 1];
+ strcpy(newReadGroup, argv[n+1]);
+ defaultReadGroup = newReadGroup;
+ n++;
+ static const char* format = "@RG\tID:%s\tPL:Illumina\tPU:pu\tLB:lb\tSM:sm";
+ char* s = new char[1 + strlen(defaultReadGroup) + strlen(format)];
+ sprintf(s, format, defaultReadGroup);
+ rgLineContents = s;
+ return true;
+ } else {
+ WriteErrorMessage("Must specify the default read group after -rg\n");
+ }
+ } else if (strcmp(argv[n], "--hp") == 0) {
+ BigAllocUseHugePages = false;
+ return true;
+ } else if (strcmp(argv[n], "-hp") == 0) {
+ BigAllocUseHugePages = true;
+ return true;
+ } else if (strcmp(argv[n], "-hdp") == 0) {
+ AlignerOptions::useHadoopErrorMessages = true;
+ return true;
+ } else if (strcmp(argv[n], "-lp") == 0) {
+ SetToLowSchedulingPriority();
+ return true;
+ } else if (strcmp(argv[n], "-nu") == 0) {
+ noUkkonen = true;
+ return true;
+ } else if (strcmp(argv[n], "-no") == 0) {
+ noOrderedEvaluation = true;
+ return true;
+ } else if (strcmp(argv[n], "-nt") == 0) {
+ noTruncation = true;
+ return true;
+ } else if (strcmp(argv[n], "-D") == 0) {
+ if (n + 1 < argc) {
+ extraSearchDepth = atoi(argv[n+1]);
+ n++;
+ return true;
+ } else {
+ WriteErrorMessage("Must specify the desired extra search depth after -D\n");
+ }
+ } else if (strlen(argv[n]) >= 2 && '-' == argv[n][0] && 'C' == argv[n][1]) {
+ if (strlen(argv[n]) != 4 || '-' != argv[n][2] && '+' != argv[n][2] ||
+ '-' != argv[n][3] && '+' != argv[n][3]) {
+ WriteErrorMessage("Invalid -C argument.\n\n");
+ return false;
+ }
+ if ('-' == argv[n][2]) {
+ if ('-' == argv[n][3]) {
+ clipping = NoClipping;
+ } else {
+ clipping = ClipBack;
+ }
+ } else {
+ if ('-' == argv[n][3]) {
+ clipping = ClipFront;
+ } else {
+ clipping = ClipFrontAndBack;
+ }
+ }
+ return true;
+ } else if (strcmp(argv[n], ",") == 0) {
+ //
+ // End of args for this run.
+ //
+ *done = true;
+ return true;
+ } else if (extra != NULL) {
+ return extra->parse(argv, argc, n, done);
+ }
+ return false;
+ bool // Decides whether a read's alignment outcome is kept, based on the filterFlags bit mask. NOTE(review): the qualified-name line is absent from this excerpt; presumably AlignerOptions::passFilter.
+ Read* read, // Not referenced by the visible body.
+ AlignmentResult result,
+ bool tooShort)
+ if (filterFlags == 0) { // No filter bits set: everything passes.
+ return true;
+ }
+ if (tooShort && (filterFlags & FilterTooShort) == 0) { // Too-short reads are rejected unless FilterTooShort is set.
+ return false;
+ }
+ switch (result) { // Otherwise the result passes only if the bit for its status is set.
+ case NotFound:
+ case UnknownAlignment: // Both statuses are treated as "unaligned".
+ return (filterFlags & FilterUnaligned) != 0;
+ case SingleHit:
+ return (filterFlags & FilterSingleHit) != 0;
+ case MultipleHits:
+ return (filterFlags & FilterMultipleHits) != 0;
+ default:
+ return false; // shouldn't happen!
+ }
+ PairedReadSupplierGenerator * // Dispatches on fileType to the matching reader's paired-read supplier factory.
+SNAPFile::createPairedReadSupplierGenerator(int numThreads, bool quicklyDropUnpairedReads, const ReaderContext& context)
+ _ASSERT(fileType == SAMFile || fileType == BAMFile || fileType == InterleavedFASTQFile || secondFileName != NULL); // Caller's responsibility to check this
+ switch (fileType) {
+ case SAMFile:
+ return SAMReader::createPairedReadSupplierGenerator(fileName, numThreads, quicklyDropUnpairedReads, context);
+ case BAMFile:
+ return BAMReader::createPairedReadSupplierGenerator(fileName,numThreads, quicklyDropUnpairedReads, context);
+ case FASTQFile: // Two-file FASTQ: the only case that uses secondFileName.
+ return PairedFASTQReader::createPairedReadSupplierGenerator(fileName, secondFileName, numThreads, context, isCompressed);
+ case InterleavedFASTQFile:
+ return PairedInterleavedFASTQReader::createPairedReadSupplierGenerator(fileName, numThreads, context, isCompressed);
+ default: // UnknownFileType, CRAMFile or a corrupt value.
+ _ASSERT(false);
+ WriteErrorMessage("SNAPFile::createPairedReadSupplierGenerator: invalid file type (%d)\n", fileType);
+ soft_exit(1);
+ return NULL; // Reached only if soft_exit returns.
+ }
+ ReadSupplierGenerator * // Dispatches on fileType to the matching reader's single-end supplier factory.
+SNAPFile::createReadSupplierGenerator(int numThreads, const ReaderContext& context)
+ _ASSERT(secondFileName == NULL); // Single-end input must not carry a second file name.
+ switch (fileType) {
+ case SAMFile:
+ return SAMReader::createReadSupplierGenerator(fileName, numThreads, context);
+ case BAMFile:
+ return BAMReader::createReadSupplierGenerator(fileName,numThreads, context);
+ case FASTQFile:
+ return FASTQReader::createReadSupplierGenerator(fileName, numThreads, context, isCompressed);
+ default: // Includes InterleavedFASTQFile, which has no single-end case here.
+ _ASSERT(false);
+ WriteErrorMessage("SNAPFile::createReadSupplierGenerator: invalid file type (%d)\n", fileType);
+ soft_exit(1);
+ return NULL; // Reached only if soft_exit returns.
+ }
+ bool // Parses one file specification (optional type flag plus one or two names) from args into *snapFile; returns false on failure and reports the number of arguments used through *argsConsumed.
+SNAPFile::generateFromCommandLine(const char **args, int nArgs, int *argsConsumed, SNAPFile *snapFile, bool paired, bool isInput)
+ snapFile->fileName = NULL; // Reset the output fields; NOTE(review): fileType and omitSQLines are not reset here.
+ snapFile->secondFileName = NULL;
+ snapFile->isCompressed = false;
+ *argsConsumed = 0;
+ snapFile->isStdio = false;
+ if (0 == nArgs) {
+ return false;
+ }
+ //
+ // Check to see if this is an explicit file type.
+ //
+ if ('-' == args[0][0] && '\0' != args[0][1]) { // starts with - but isn't just a - (which means to use stdio without a type specifier)
+ if (1 == nArgs) { // A type flag must be followed by at least one file name.
+ return false;
+ }
+ if (!strcmp(args[1], "-")) { // A file name of "-" selects stdio.
+ snapFile->isStdio = true;
+ }
+ if (!strcmp(args[0], "-fastq") || !strcmp(args[0], "-compressedFastq")) {
+ if (!isInput) {
+ WriteErrorMessage("%s is not a valid output file type.\n", args[0]);
+ return false;
+ }
+ if (paired && nArgs < 3) {
+ WriteErrorMessage("Expected a pair of fastQ files, but instead just got one\n");
+ return false;
+ }
+ snapFile->isCompressed = !strcmp(args[0], "-compressedFastq");
+ if (paired) {
+ if (nArgs < 3) { // NOTE(review): unreachable; the same paired && nArgs < 3 condition already returned above.
+ WriteErrorMessage("paired FASTQ requires two consecutive input files, and the last item on your command line is the first half of a FASTQ pair.\n");
+ return false;
+ }
+ snapFile->fileType = FASTQFile;
+ snapFile->secondFileName = args[2];
+ if (!strcmp("-", args[2])) {
+ if (snapFile->isStdio) {
+ WriteErrorMessage("Can't have both halves of paired FASTQ files be stdin ('-'). Did you mean to use the interleaved FASTQ type?\n");
+ return false;
+ }
+ snapFile->isStdio = true;
+ }
+ *argsConsumed = 3; // type flag + two file names
+ } else {
+ snapFile->fileType = FASTQFile;
+ *argsConsumed = 2; // type flag + one file name
+ }
+ } else if (!strcmp(args[0], "-sam")) {
+ snapFile->fileType = SAMFile;
+ *argsConsumed = 2;
+ } else if (!strcmp(args[0], "-samNoSQ") && !isInput) { // No header is only valid for output file types
+ snapFile->fileType = SAMFile;
+ snapFile->omitSQLines = true;
+ *argsConsumed = 2;
+ } else if (!strcmp(args[0], "-bam")) {
+ snapFile->fileType = BAMFile;
+ snapFile->isCompressed = true;
+ *argsConsumed = 2;
+ } else if (!strcmp(args[0], "-pairedInterleavedFastq") || !strcmp(args[0], "-pairedCompressedInterleavedFastq")) {
+ if (!paired) {
+ WriteErrorMessage("Specified %s for a single-end alignment. To treat it as single-end, just use ordinary fastq (or compressed fastq, as appropriate)\n", args[0]);
+ return false;
+ }
+ snapFile->fileType = InterleavedFASTQFile;
+ snapFile->isCompressed = !strcmp(args[0], "-pairedCompressedInterleavedFastq");
+ *argsConsumed = 2;
+ } else {
+ //
+ // starts with '-' but isn't a file-type specifier
+ //
+ return false; // Also reached for -samNoSQ on an input file; no message is written on this path.
+ }
+ snapFile->fileName = args[1];
+ return true;
+ }
+ //
+ // Just a filename. Infer the type.
+ //
+ *argsConsumed = 1;
+ snapFile->fileName = args[0];
+ snapFile->isStdio = '-' == args[0][0] && '\0' == args[0][1];
+ if (util::stringEndsWith(args[0], ".sam")) {
+ snapFile->fileType = SAMFile;
+ snapFile->isCompressed = false;
+ } else if (util::stringEndsWith(args[0], ".bam")) {
+ snapFile->fileType = BAMFile;
+ snapFile->isCompressed = true;
+ } else if (!isInput) {
+ //
+ // No default output file type.
+ //
+ WriteErrorMessage("You specified an output file with name '%s', which doesn't end in .sam or .bam, and doesn't have an explicit type\n"
+ "specifier. There is no default output file type. Consider doing something like '-o -bam %s'\n", args[0], args[0]);
+ return false;
+ } else if (util::stringEndsWith(args[0], ".fq") || util::stringEndsWith(args[0], ".fastq") ||
+ util::stringEndsWith(args[0], ".fq.gz") || util::stringEndsWith(args[0], ".fastq.gz") ||
+ util::stringEndsWith(args[0], ".fq.gzip") || util::stringEndsWith(args[0], ".fastq.gzip")) {
+ //
+ // It's a fastq input file (either by default or because it's got a .fq or .fastq extension, we don't
+ // need to check). See if it's also compressed.
+ //
+ snapFile->fileType= FASTQFile;
+ if (util::stringEndsWith(args[0], ".gz") || util::stringEndsWith(args[0], ".gzip")) {
+ snapFile->isCompressed = true;
+ } else {
+ snapFile->isCompressed = false;
+ }
+ snapFile->isStdio = !strcmp(args[0], "-"); // NOTE(review): repeats the assignment made just after the "Infer the type" comment.
+ if (paired) {
+ if (nArgs < 2) {
+ WriteErrorMessage("paired FASTQ requires two input files, and the last item on your command line is the first half of a FASTQ pair.\n");
+ return false;
+ }
+ snapFile->secondFileName = args[1];
+ if (!strcmp(args[1], "-")) {
+ if (snapFile->isStdio) {
+ WriteErrorMessage("Can't have both halves of paired FASTQ files be stdin ('-'). Did you mean to use the interleaved FASTQ type?\n");
+ return false;
+ }
+ if (CommandPipe != NULL) { // NOTE(review): this branch is only reached for input files, yet the message talks about writing to stdout; the explicit-type path above has no equivalent check.
+ WriteErrorMessage("You may not write to stdout in daemon mode\n");
+ return false;
+ }
+ snapFile->isStdio = true;
+ }
+ *argsConsumed = 2; // two file names, no type flag
+ }
+ } else {
+ if (snapFile->isStdio) {
+ WriteErrorMessage("Stdio IO always requires an explicit file type. So, for example, do 'snap single index-directory -fastq -' to read FASTQ from stdin\n");
+ } else {
+ WriteErrorMessage("Unknown file type for file name '%s', please specify file type with -fastq, -sam, -bam, etc.\n", snapFile->fileName);
+ }
+ return false;
+ }
+ return true;
+ bool
+AlignerOptions::useHadoopErrorMessages= false;
+ bool
+AlignerOptions::outputToStdout = false;
diff --git a/SNAPLib/AlignerOptions.h b/SNAPLib/AlignerOptions.h
new file mode 100644
index 0000000..f578d44
--- /dev/null
+++ b/SNAPLib/AlignerOptions.h
@@ -0,0 +1,129 @@
+Module Name:
+ AlignerOptions.h
+ Common parameters for running single & paired alignment.
+ Ravi Pandya, May, 2012
+ User mode service.
+Revision History:
+ Integrated from SingleAligner.cpp & PairedAligner.cpp
+#pragma once
+#include "stdafx.h"
+#include "options.h"
+#include "Genome.h"
+#include "Read.h"
+struct AbstractOptions // Interface for an option set that can print its usage text and parse command-line arguments.
+ virtual void usageMessage() = 0;
+ virtual bool parse(const char** argv, int argc, int& n, bool *done) = 0; // n is taken by reference so an implementation can advance it past consumed values; AlignerOptions::parse forwards unrecognized arguments to extra->parse with these same four arguments.
+enum FileType {UnknownFileType, SAMFile, FASTQFile, BAMFile, InterleavedFASTQFile, CRAMFile}; // Add more as needed
+struct SNAPFile { // Describes one input or output file named on the command line.
+ // Give every member a defined value. isCompressed used to be left uninitialized, so reading it
+ // on a default-constructed SNAPFile before generateFromCommandLine() ran was undefined behaviour.
+ SNAPFile() : fileName(NULL), secondFileName(NULL), fileType(UnknownFileType), isCompressed(false), isStdio(false), omitSQLines(false) {}
+ const char *fileName;
+ const char *secondFileName; // Second file of a two-file paired FASTQ input; NULL otherwise
+ FileType fileType;
+ bool isCompressed;
+ bool isStdio; // Only applies to the first file for two-file inputs
+ bool omitSQLines; // Special undocumented option for Charles Chiu's group. Mostly a bad idea.
+ PairedReadSupplierGenerator *createPairedReadSupplierGenerator(int numThreads, bool quicklyDropUnpairedReads, const ReaderContext& context);
+ ReadSupplierGenerator *createReadSupplierGenerator(int numThreads, const ReaderContext& context);
+ static bool generateFromCommandLine(const char **args, int nArgs, int *argsConsumed, SNAPFile *snapFile, bool paired, bool isInput); // Fills *snapFile from args; returns false on a parse error
+struct AlignerOptions : public AbstractOptions // Option set shared by single-end and paired-end alignment runs.
+ AlignerOptions(const char* i_commandLine, bool forPairedEnd = false);
+ const char *commandLine;
+ const char *indexDir;
+ const char *similarityMapFile;
+ int numThreads;
+ unsigned maxDist;
+ float maxDistFraction;
+ unsigned numSeedsFromCommandLine;
+ double seedCoverage; // Exclusive with numSeeds; this is readSize/seedSize
+ bool seedCountSpecified; // Has either -n or -sc been specified? This bool is used to make sure they're not both specified on the command line
+ unsigned maxHits;
+ int minWeightToCheck;
+ bool bindToProcessors;
+ bool ignoreMismatchedIDs;
+ SNAPFile outputFile;
+ int nInputs;
+ SNAPFile *inputs;
+ ReadClippingType clipping; // Set by the -C option (NoClipping, ClipBack, ClipFront or ClipFrontAndBack)
+ bool sortOutput;
+ bool noIndex;
+ bool noDuplicateMarking;
+ bool noQualityCalibration;
+ unsigned sortMemory; // total output sorting buffer size in Gb
+ unsigned filterFlags; // Bit mask of FilterFlags values; consulted by passFilter()
+ bool explorePopularSeeds;
+ bool stopOnFirstHit;
+ bool useM; // Should we generate CIGAR strings using = and X, or using the old-style M?
+ unsigned gapPenalty; // if non-zero use gap penalty aligner
+ AbstractOptions *extra; // extra options
+ const char *rgLineContents; // Read-group header line: the -R string, or a line built from the -rg value
+ const char *perfFileName; // Set by -pf
+ bool useTimingBarrier;
+ unsigned extraSearchDepth; // Set by -D (parsed with atoi)
+ const char *defaultReadGroup; // if not specified in input
+ bool ignoreSecondaryAlignments; // on input, default true
+ int maxSecondaryAlignmentAdditionalEditDistance;
+ int maxSecondaryAlignments;
+ int maxSecondaryAlignmentsPerContig;
+ bool preserveClipping;
+ float expansionFactor;
+ bool noUkkonen; // Set by -nu
+ bool noOrderedEvaluation; // Set by -no
+ bool noTruncation; // Set by -nt
+ unsigned minReadLength;
+ bool mapIndex;
+ bool prefetchIndex;
+ size_t writeBufferSize;
+ static bool useHadoopErrorMessages; // This is static because it's global (and I didn't want to push the options object to every place in the code)
+ static bool outputToStdout; // Likewise
+ void usage();
+ virtual void usageMessage();
+ virtual bool parse(const char** argv, int argc, int& n, bool *done);
+ enum FilterFlags // Bit values combined in filterFlags
+ {
+ FilterUnaligned = 0x0001,
+ FilterSingleHit = 0x0002,
+ FilterMultipleHits = 0x0004,
+ FilterBothMatesMatch = 0x0008,
+ FilterTooShort = 0x0010
+ };
+ bool passFilter(Read* read, AlignmentResult result, bool tooShort);
+ virtual bool isPaired() { return false; }
diff --git a/SNAPLib/AlignerStats.cpp b/SNAPLib/AlignerStats.cpp
new file mode 100644
index 0000000..9ba0f25
--- /dev/null
+++ b/SNAPLib/AlignerStats.cpp
@@ -0,0 +1,111 @@
+Module Name:
+ AlignerStats.cpp
+ Common statistics for running single & paired alignment.
+ Ravi Pandya, May, 2012
+ User mode service.
+Revision History:
+ Integrated from SingleAligner.cpp & PairedAligner.cpp
+#include "stdafx.h"
+#include "options.h"
+#include "AlignerStats.h"
+AlignerStats::AlignerStats(AbstractStats* i_extra) // Zeroes every counter and histogram and stores i_extra in extra.
+ totalReads(0),
+ usefulReads(0),
+ singleHits(0),
+ multiHits(0),
+ notFound(0),
+ alignedAsPairs(0),
+ extra(i_extra),
+ lvCalls(0)
+ for (int i = 0; i <= AlignerStats::maxMapq; i++) { // Inclusive bound: mapqHistogram has maxMapq+1 entries
+ mapqHistogram[i] = 0;
+ }
+ for (int i = 0; i < maxMaxHits; i++) {
+ countOfBestHitsByWeightDepth[i] = 0;
+ countOfAllHitsByWeightDepth[i] = 0;
+ probabilityMassByWeightDepth[i] = 0;
+ }
+ for (unsigned i = 0; i < 31; i++) { // 31 is the declared size of both time-bucket arrays in AlignerStats.h
+ countByTimeBucket[i] = nanosByTimeBucket[i] = 0;
+ }
+ if (extra != NULL) { // NOTE(review): this looks like the destructor body (the ~AlignerStats signature line is absent from this excerpt); it deletes extra.
+ delete extra;
+ }
+ void // Prints nothing itself and delegates to extra when present. NOTE(review): the qualified-name line is absent from this excerpt; presumably AlignerStats::printHistograms.
+ FILE* out)
+ // nothing
+ if (extra != NULL) {
+ extra->printHistograms(out);
+ }
+ // Folds the counters and histograms of another AlignerStats into this one.
+ // i_other is cast to AlignerStats* without a check, so it must actually point to an AlignerStats.
+ // extra is merged only when both objects have one.
+ void
+ const AbstractStats* i_other)
+ AlignerStats* other = (AlignerStats*) i_other;
+ totalReads += other->totalReads;
+ usefulReads += other->usefulReads;
+ singleHits += other->singleHits;
+ multiHits += other->multiHits;
+ notFound += other->notFound;
+ alignedAsPairs += other->alignedAsPairs;
+ lvCalls += other->lvCalls;
+ if (extra != NULL && other->extra != NULL) {
+ extra->add(other->extra);
+ }
+ for (int i = 0; i <= AlignerStats::maxMapq; i++) {
+ mapqHistogram[i] += other->mapqHistogram[i];
+ }
+ for (int i = 0; i < maxMaxHits; i++) {
+ countOfBestHitsByWeightDepth[i] += other->countOfBestHitsByWeightDepth[i];
+ countOfAllHitsByWeightDepth[i] += other->countOfAllHitsByWeightDepth[i];
+ // Accumulate like every other field; plain assignment kept only the last merged object's mass.
+ probabilityMassByWeightDepth[i] += other->probabilityMassByWeightDepth[i];
+ }
+ for (unsigned i = 0; i < 31; i++) {
+ countByTimeBucket[i] += other->countByTimeBucket[i];
+ nanosByTimeBucket[i] += other->nanosByTimeBucket[i];
+ }
diff --git a/SNAPLib/AlignerStats.h b/SNAPLib/AlignerStats.h
new file mode 100644
index 0000000..625182a
--- /dev/null
+++ b/SNAPLib/AlignerStats.h
@@ -0,0 +1,80 @@
+Module Name:
+ AlignerStats.h
+ Common statistics for running single & paired alignment.
+ Ravi Pandya, May, 2012
+ User mode service.
+Revision History:
+ Integrated from SingleAligner.cpp & PairedAligner.cpp
+#pragma once
+#include "stdafx.h"
+#include "Compat.h"
+struct AbstractStats // Interface for statistics objects that can be merged and printed.
+ virtual ~AbstractStats(); // Virtual so that deleting through an AbstractStats* (as done with AlignerStats::extra) runs the derived destructor
+ virtual void add(const AbstractStats* other) = 0; // Merge other's statistics into this object
+ virtual void printHistograms(FILE* out) = 0;
+struct AlignerStats : public AbstractStats // Counters and histograms gathered during an alignment run.
+ AlignerStats(AbstractStats* i_extra = NULL); // Stores i_extra in extra; the implementation in AlignerStats.cpp deletes extra when non-NULL
+ // TODO: This should also count both-aligned vs one-aligned etc.
+ _int64 totalReads;
+ _int64 usefulReads;
+ _int64 singleHits;
+ _int64 multiHits;
+ _int64 notFound;
+ _int64 alignedAsPairs;
+ _int64 lvCalls;
+ static const unsigned maxMapq = 70;
+ unsigned mapqHistogram[maxMapq+1]; // One slot per MAPQ value 0..maxMapq inclusive
+ //
+ // Histogram of alignment times. Time buckets are divided by powers-of-two nanoseconds, so time bucket 0 is
+ // <= 1 ns, time bucket 10 is <= 1.024 us, etc. Time bucket 30 is > 1s.
+ //
+ _int64 countByTimeBucket[31];
+ _int64 nanosByTimeBucket[31];
+ static const unsigned maxMaxHits = 50;
+ unsigned countOfBestHitsByWeightDepth[maxMaxHits];
+ unsigned countOfAllHitsByWeightDepth[maxMaxHits];
+ double probabilityMassByWeightDepth[maxMaxHits];
+ AbstractStats* extra; // Optional additional statistics; add() and printHistograms() forward to it when non-NULL
+ virtual ~AlignerStats();
+ virtual void add(const AbstractStats* other);
+ virtual void printHistograms(FILE* out);
diff --git a/SNAPLib/AlignmentResult.cpp b/SNAPLib/AlignmentResult.cpp
new file mode 100644
index 0000000..73f5868
--- /dev/null
+++ b/SNAPLib/AlignmentResult.cpp
@@ -0,0 +1,107 @@
+Module Name:
+Code for SNAP genome alignment results
+Bill Bolosky, March, 2015
+Revision History:
+#include "stdafx.h"
+#include "AlignmentResult.h"
+#include "GenomeIndex.h"
+ int // qsort()-style comparator: orders by contig number first, then by ascending score.
+SingleAlignmentResult::compareByContigAndScore(const void *first_, const void *second_)
+ extern GenomeIndex *g_index; // Sorry, but no easy way to get this into here
+ const SingleAlignmentResult *first = (SingleAlignmentResult *)first_;
+ const SingleAlignmentResult *second = (SingleAlignmentResult *)second_;
+ int firstContig = g_index->getGenome()->getContigNumAtLocation(first->location); // Contig lookup goes through the global index
+ int secondContig = g_index->getGenome()->getContigNumAtLocation(second->location);
+ if (firstContig < secondContig) {
+ return -1;
+ } else if (firstContig > secondContig) {
+ return 1;
+ } else if (first->score < second->score) { // Same contig: fall back to score
+ return -1;
+ } else if (first->score > second->score) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ SingleAlignmentResult::compareByScore(const void *first_, const void *second_) // qsort()-style comparator: ascending by score only.
+ const SingleAlignmentResult *first = (SingleAlignmentResult *)first_;
+ const SingleAlignmentResult *second = (SingleAlignmentResult *)second_;
+ if (first->score < second->score) {
+ return -1;
+ } else if (first->score > second->score) {
+ return 1;
+ } else {
+ return 0; // Equal scores
+ }
+ // qsort()-style comparator for paired results: orders by the contig of the first read's location,
+ // then by ascending combined score of both reads (the same score definition compareByScore uses).
+ int
+PairedAlignmentResult::compareByContigAndScore(const void *first_, const void *second_)
+ extern GenomeIndex *g_index; // Sorry, but no easy way to get this into here
+ const PairedAlignmentResult *first = (PairedAlignmentResult *)first_;
+ const PairedAlignmentResult *second = (PairedAlignmentResult *)second_;
+ int firstContig = g_index->getGenome()->getContigNumAtLocation(first->location[0]);
+ int secondContig = g_index->getGenome()->getContigNumAtLocation(second->location[0]);
+ // score is an int[NUM_READS_PER_PAIR] array, so "first->score < second->score" compared the two
+ // arrays' addresses rather than the scores. Compare the summed per-read scores instead.
+ int firstScore = first->score[0] + first->score[1];
+ int secondScore = second->score[0] + second->score[1];
+ if (firstContig < secondContig) {
+ return -1;
+ } else if (firstContig > secondContig) {
+ return 1;
+ } else if (firstScore < secondScore) {
+ return -1;
+ } else if (firstScore > secondScore) {
+ return 1;
+ } else {
+ return 0;
+ }
+PairedAlignmentResult::compareByScore(const void *first_, const void *second_) // qsort()-style comparator: ascending by the sum of both reads' scores.
+ const PairedAlignmentResult *first = (PairedAlignmentResult *)first_;
+ const PairedAlignmentResult *second = (PairedAlignmentResult *)second_;
+ int firstScore = first->score[0] + first->score[1]; // Combined score of the pair
+ int secondScore = second->score[0] + second->score[1];
+ if (firstScore < secondScore) {
+ return -1;
+ } else if (firstScore > secondScore) {
+ return 1;
+ } else {
+ return 0;
+ }
\ No newline at end of file
diff --git a/SNAPLib/AlignmentResult.h b/SNAPLib/AlignmentResult.h
new file mode 100644
index 0000000..eb229fb
--- /dev/null
+++ b/SNAPLib/AlignmentResult.h
@@ -0,0 +1,93 @@
+Module Name:
+ AlignmentResult.h
+ Header for SNAP genome alignment results
+ Bill Bolosky, May, 2014
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+ Pulled out of other places in SNAP
+#pragma once
+#include "Genome.h"
+#include "directions.h"
+class Read;
+enum AlignmentResult {NotFound, SingleHit, MultipleHits, UnknownAlignment}; // BB: Changed Unknown to UnknownAlignment because of a conflict w/Windows headers
+bool isAValidAlignmentResult(AlignmentResult result);
+inline const char *AlignmentResultToString(AlignmentResult result) { // Returns a static, human-readable name for result.
+ switch (result) {
+ case NotFound: return "NotFound";
+ case SingleHit: return "SingleHit";
+ case MultipleHits: return "MultipleHits";
+ case UnknownAlignment: return "Unknown"; // The enumerator was renamed to UnknownAlignment; the printed name is still "Unknown"
+ default: return "Unknown alignment result type"; // Value outside the enum
+ }
+struct SingleAlignmentResult { // Alignment outcome for one read.
+ AlignmentResult status;
+ GenomeLocation location; // Aligned genome location.
+ Direction direction; // Did we match the reverse complement?
+ int score; // score of each end if matched
+ int mapq; // mapping quality, encoded like a Phred score (but as an integer, not ASCII Phred + 33).
+ static int compareByContigAndScore(const void *first, const void *second); // qsort()-style compare routine
+ static int compareByScore(const void *first, const void *second); // qsort()-style compare routine
+// Does an AlignmentResult represent a single location? True only for SingleHit.
+inline bool isOneLocation(AlignmentResult result) {
+ return result == SingleHit;
+const int NUM_READS_PER_PAIR = 2; // This is just to make it clear what the array subscripts are, it doesn't ever make sense to change
+struct PairedAlignmentResult { // Alignment outcome for a read pair; each array is indexed by read (0 or 1).
+ AlignmentResult status[NUM_READS_PER_PAIR]; // SingleHit or CertainHit if aligned, MultipleHit if matches DB
+ // but not confidently aligned, or NotFound. NOTE(review): the AlignmentResult enum in this header has no CertainHit and spells the other value MultipleHits.
+ GenomeLocation location[NUM_READS_PER_PAIR];// Genome location of each read.
+ Direction direction[NUM_READS_PER_PAIR]; // Did we match the reverse complement? In general the two reads should have
+ // opposite orientations because they're part of the same original fragment,
+ // but it seems possible for a piece of the genome to get cut cleanly and flip
+ // in a translocation event, which would cause both ends of a fragment aligning
+ // there to be in the same orientation w.r.t. the reference genome.
+ int score[NUM_READS_PER_PAIR]; // score of each end if matched
+ int mapq[NUM_READS_PER_PAIR]; // mapping quality of each end, encoded like a Phred score (but as an integer, not ASCII Phred + 33).
+ bool fromAlignTogether; // Was this alignment created by aligning both reads together, rather than from some combination of single-end aligners?
+ bool alignedAsPair; // Were the reads aligned as a pair, or separately?
+ _int64 nanosInAlignTogether;
+ unsigned nLVCalls;
+ unsigned nSmallHits;
+ static int compareByContigAndScore(const void *first, const void *second); // qsort()-style compare routine
+ static int compareByScore(const void *first, const void *second); // qsort()-style compare routine
\ No newline at end of file
diff --git a/SNAPLib/ApproximateCounter.cpp b/SNAPLib/ApproximateCounter.cpp
new file mode 100644
index 0000000..8ea1367
--- /dev/null
+++ b/SNAPLib/ApproximateCounter.cpp
@@ -0,0 +1,40 @@
+#include "stdafx.h"
+#include <math.h>
+#include "ApproximateCounter.h"
+using namespace std;
+ buckets.resize(BUCKETS);
+void ApproximateCounter::add(_uint64 value) // Records one value in the sketch by setting a single bit in one bucket.
+ _uint64 h = hash(value);
+ unsigned bucket = (unsigned) h % BUCKETS; // The cast binds to h before the modulo; BUCKETS is 1 << SHIFT, so the bucket is the low SHIFT bits of the hash
+ unsigned rest = (unsigned)(h >> SHIFT); // Hash bits above the bucket bits, truncated to unsigned
+ unsigned long firstZero;
+ if (rest == 0) {
+ firstZero = 64 - SHIFT; // No set bit in rest: use the largest position
+ } else {
+ CountTrailingZeroes(rest, firstZero); // Helper defined elsewhere; presumably stores the index of the lowest set bit of rest in firstZero
+ }
+ buckets[bucket] |= (1LL << firstZero);
+unsigned ApproximateCounter::getCount() // Estimates the number of distinct values added so far.
+ double s = 0; // Sum over buckets of the run length computed below
+ for (int i = 0; i < BUCKETS; i++) {
+ _uint64 r = 0;
+ while (r < 64 && (buckets[i] & (1LL << r)) != 0) { // r = number of consecutive set bits starting at bit 0
+ r++;
+ }
+ s += r;
+ }
+ return (unsigned) (BUCKETS / 0.77351 * pow(2, s / BUCKETS)); // Scales 2^(mean run length) by BUCKETS / 0.77351; the result is truncated to unsigned
diff --git a/SNAPLib/ApproximateCounter.h b/SNAPLib/ApproximateCounter.h
new file mode 100644
index 0000000..dc2c6b6
--- /dev/null
+++ b/SNAPLib/ApproximateCounter.h
@@ -0,0 +1,30 @@
+#pragma once
+#include "Compat.h"
+// Counts the number of distinct items in a stream approximately using Flajolet-Martin.
+class ApproximateCounter // Approximate distinct-value counter backed by BUCKETS 64-bit bitmaps.
+ ApproximateCounter();
+ void add(_uint64 value); // Record one value
+ unsigned getCount(); // Estimated number of distinct values added
+ static const int SHIFT = 9; // Number of hash bits used to choose a bucket
+ static const int BUCKETS = 1 << SHIFT; // 512 buckets
+ std::vector<_uint64> buckets;
+ // MurmurHash3 finalization step from http://sites.google.com/site/murmurhash
+ inline _uint64 hash(_uint64 value) {
+ value ^= (value >> 33);
+ value *= 0xff51afd7ed558ccdLL;
+ value ^= (value >> 33);
+ value *= 0xc4ceb9fe1a85ec53LL;
+ value ^= (value >> 33);
+ return value;
+ }
diff --git a/SNAPLib/Bam.cpp b/SNAPLib/Bam.cpp
new file mode 100644
index 0000000..47049a1
--- /dev/null
+++ b/SNAPLib/Bam.cpp
@@ -0,0 +1,1841 @@
+Module Name:
+ Bam.cpp
+ Binary Alignment Map (BAM) file writer and reader.
+ User mode service.
+ BamWriter and BamReader (and their subclasses) aren't thread safe.
+#include "stdafx.h"
+#include "SAM.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "Read.h"
+#include "Bam.h"
+#include "Tables.h"
+#include "RangeSplitter.h"
+#include "ParallelTask.h"
+#include "ReadSupplierQueue.h"
+#include "Util.h"
+#include "FileFormat.h"
+#include "AlignerOptions.h"
+#include "exit.h"
+#include "VariableSizeMap.h"
+#include "PairedAligner.h"
+#include "GzipDataWriter.h"
+#include "Error.h"
+using std::max;
+using std::min;
+using util::strnchr;
+BAMReader::BAMReader(const ReaderContext& i_context) : ReadReader(i_context)
+ bool // Stub that always returns false without touching its arguments. NOTE(review): the qualified-name line is absent from this excerpt; presumably a BAMReader paired-read method.
+ Read *read1,
+ Read *read2,
+ PairedAlignmentResult *alignmentResult,
+ unsigned *mapQ,
+ const char **cigar)
+ return false;
+ void // Opens fileName through a gzip/BAM data reader, reads the header when starting at offset 0, then positions the reader past the header. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMReader::init.
+ const char *fileName,
+ int bufferCount,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess)
+ // todo: integrate supplier models
+ // might need up to 3x extra for expanded sequence + quality + cigar data
+ if (!strcmp("-", fileName)) { // "-" selects the stdio supplier
+ data = DataSupplier::GzipBamStdio->getDataReader(bufferCount, MAX_RECORD_LENGTH, 3.0 * DataSupplier::ExpansionFactor, 0);
+ } else {
+ data = DataSupplier::GzipBamDefault->getDataReader(bufferCount, MAX_RECORD_LENGTH, 3.0 * DataSupplier::ExpansionFactor, 0);
+ }
+ if (! data->init(fileName)) {
+ WriteErrorMessage("Unable to read file %s\n", fileName);
+ soft_exit(1);
+ }
+ if (startingOffset == 0) { // The header is only read when starting at the beginning of the file
+ readHeader(fileName);
+ }
+ _ASSERT(context.headerBytes > 0); // Expects headerBytes to be set already, either by readHeader above or in the context supplied at construction
+ reinit(startingOffset, amountOfFileToProcess);
+ if ((size_t) startingOffset < context.headerBytes) { // The requested range begins inside the header: skip what remains of it
+ _int64 bytesToSkip = context.headerBytes - startingOffset;
+ while (bytesToSkip > 0) {
+ char* p;
+ _int64 valid, start;
+ bool ok = data->getData(&p, &valid, &start);
+ if (!ok) {
+ WriteErrorMessage("failure reading file %s\n", fileName);
+ soft_exit(1);
+ }
+ _int64 bytesToSkipThisTime = __min(valid, bytesToSkip);
+ data->advance(bytesToSkipThisTime);
+ if (bytesToSkipThisTime > start) { // NOTE(review): the meaning of start comes from DataReader::getData, which is not visible here; confirm this is the intended batch-advance condition
+ data->nextBatch();
+ }
+ data->getData(&p, &valid, &start); // NOTE(review): the return value is ignored here, unlike the checked call at the top of the loop
+ bytesToSkip -= bytesToSkipThisTime;
+ }
+ }
+ // Reads and validates the BAM header of fileName, parses its SAM text via SAMReader::parseHeader,
+ // and records in context a NUL-terminated copy of the text (header), its length (headerLength)
+ // and the offset of the first byte after the reference-sequence records (headerBytes).
+ // Any failure writes an error message and calls soft_exit(1).
+ void
+ const char* fileName)
+ _ASSERT(context.header == NULL);
+ _int64 headerSize = 1024 * 1024; // 1M header initially
+ bool sawWholeHeader;
+ BAMHeader* header;
+ _int64 textHeaderSize;
+ char* buffer;
+ buffer = data->readHeader(&headerSize);
+ // Compare as signed, like the later size checks in this function. Comparing directly against
+ // sizeof() converted headerSize to unsigned, so a negative size would have passed as "large enough".
+ if (headerSize < (_int64)sizeof(BAMHeader)) {
+ WriteErrorMessage("Malformed BAM file '%s', too small to contain even a header.\n", fileName);
+ soft_exit(1);
+ }
+ header = (BAMHeader*)buffer;
+ if (header->magic != BAMHeader::BAM_MAGIC) {
+ WriteErrorMessage("BAMReader: Not a valid BAM file\n");
+ soft_exit(1);
+ }
+ textHeaderSize = header->l_text;
+ if (textHeaderSize + (_int64)sizeof(BAMHeader) > headerSize) {
+ // The text does not fit in what was read; ask for exactly enough and validate again.
+ headerSize = textHeaderSize + (_int64)sizeof(BAMHeader);
+ buffer = data->readHeader(&headerSize);
+ if (textHeaderSize + (_int64)sizeof(BAMHeader) > headerSize) {
+ WriteErrorMessage("Unable to read entire header of BAM file '%s', it may be malformed.\n", fileName);
+ soft_exit(1);
+ }
+ header = (BAMHeader*)buffer;
+ if (header->magic != BAMHeader::BAM_MAGIC) {
+ WriteErrorMessage("BAMReader: Not a valid BAM file\n");
+ soft_exit(1);
+ }
+ _ASSERT(textHeaderSize == header->l_text); // We got the same thing this time
+ }
+ // parseHeader receives &textHeaderSize and may update it; the copy below uses the resulting value.
+ if (!SAMReader::parseHeader(fileName, header->text(), header->text() + headerSize - sizeof(BAMHeader), context.genome, &textHeaderSize, &context.headerMatchesIndex, &sawWholeHeader)) {
+ WriteErrorMessage("BAMReader: failed to parse header on '%s'\n", fileName);
+ soft_exit(1);
+ }
+ if (!sawWholeHeader) {
+ WriteErrorMessage("We had the entire header loaded for file '%s', but it didn't parse correctly\n", fileName);
+ soft_exit(1);
+ }
+ int n_ref = header->n_ref();
+ BAMHeaderRefSeq* refSeq = header->firstRefSeq();
+ // NOTE(review): the re-read above sizes the buffer for the text only, while this walk steps over
+ // n_ref reference records with no check against headerSize; confirm the buffer always covers them.
+ for (int i = 0; i < n_ref; i++, refSeq = refSeq->next()) {
+ // just advance
+ }
+ char* p = new char[textHeaderSize + 1];
+ memcpy(p, header->text(), textHeaderSize);
+ p[textHeaderSize] = 0;
+ context.header = p;
+ context.headerLength = textHeaderSize;
+ context.headerBytes = (char*) refSeq - buffer;
+ BAMReader* // Allocates a BAMReader with new, initializes it for the given file range and returns it. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMReader::create.
+ const char *fileName,
+ int bufferCount,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess,
+ const ReaderContext& context)
+ BAMReader* reader = new BAMReader(context);
+ reader->init(fileName, bufferCount, startingOffset, amountOfFileToProcess);
+ return reader;
+ void // Repositions the underlying data reader on a new file range and clears extraOffset. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMReader::reinit.
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess)
+ data->reinit(startingOffset, amountOfFileToProcess);
+ extraOffset = 0;
+ ReadSupplierGenerator * // Creates a BAMReader, wraps it in a ReadSupplierQueue, starts the queue's readers and returns the queue. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMReader::createReadSupplierGenerator.
+ const char *fileName,
+ int numThreads,
+ const ReaderContext& context)
+ BAMReader* reader = create(fileName, ReadSupplierQueue::BufferCount(numThreads), 0, 0, context); // Offset 0 and amount 0 are passed through to init()
+ ReadSupplierQueue* queue = new ReadSupplierQueue((ReadReader*)reader);
+ queue->startReaders();
+ return queue;
+ PairedReadSupplierGenerator * // Creates a BAMReader, wraps it in a pair matcher and a ReadSupplierQueue, starts the readers and returns the queue. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMReader::createPairedReadSupplierGenerator.
+ const char *fileName,
+ int numThreads,
+ bool quicklyDropUnmatchedReads,
+ const ReaderContext& context,
+ int matchBufferSize) // Not referenced in the visible body
+ BAMReader* reader = create(fileName,
+ ReadSupplierQueue::BufferCount(numThreads) + PairedReadReader::MatchBuffers, 0, 0, context); // Extra buffers are requested on top of the per-thread count
+ PairedReadReader* matcher = PairedReadReader::PairMatcher(reader, quicklyDropUnmatchedReads);
+ ReadSupplierQueue* queue = new ReadSupplierQueue(matcher);
+ queue->startReaders();
+ return queue;
+const char* BAMAlignment::CodeToSeq = "=ACMGRSVTWYHKDBN"; // 16 entries: 4-bit base code -> base letter
+const char *BAMAlignment::CodeToSeqRC = "NTGKCYWBASRDMHVN"; // Bill's best guess for things other than ATCG, not that it matters for SNAP
+_uint16 BAMAlignment::CodeToSeqPair[256]; // Packed byte (two base codes) -> two letters; filled by the static initializer
+_uint16 BAMAlignment::CodeToSeqPairRC[256]; // Same, using CodeToSeqRC with the two letters swapped
+_uint8 BAMAlignment::SeqToCode[256]; // Letter -> 4-bit code; filled by the static initializer
+const char* BAMAlignment::CodeToCigar = "MIDNSHP=X"; // CIGAR op code 0..8 -> op letter
+_uint8 BAMAlignment::CigarToCode[256]; // Op letter -> op code; filled by the static initializer
+_uint8 BAMAlignment::CigarCodeToRefBase[9] = {1, 0, 1, 1, 0, 0, 1, 1, 1}; // Indexed by op code in CodeToCigar order. NOTE(review): presumably 1 means the op consumes reference bases; index 6 (P) is 1 here, confirm against the SAM specification
+const _uint8 BAM_CIGAR_M = 0; // The constants below are the indices of the matching letters in CodeToCigar
+const _uint8 BAM_CIGAR_I = 1;
+const _uint8 BAM_CIGAR_D = 2;
+const _uint8 BAM_CIGAR_N = 3;
+const _uint8 BAM_CIGAR_S = 4;
+const _uint8 BAM_CIGAR_H = 5;
+const _uint8 BAM_CIGAR_P = 6;
+const _uint8 BAM_CIGAR_EQUAL = 7;
+const _uint8 BAM_CIGAR_X = 8;
+BAMAlignment::_init BAMAlignment::_init_; // Static object defined here; NOTE(review): presumably its constructor is the "static initializer" that fills the lookup tables
+ void // Expands packed 4-bit base codes into letters, two bases per input byte. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMAlignment::decodeSeq.
+ char* o_sequence,
+ const _uint8* nibbles,
+ int bases)
+ _uint16 *o_sequence_pairs = (_uint16 *)o_sequence; // Output is written two characters at a time through this 16-bit view
+ int pairs = bases / 2;
+ for (int i = 0; i < pairs; i++) {
+ o_sequence_pairs[i] = CodeToSeqPair[nibbles[i]]; // One table lookup yields both letters of the byte
+ }
+ if (bases % 2 == 1) {
+ o_sequence[bases - 1] = CodeToSeq[nibbles[bases / 2] >> 4]; // Odd length: the last base is the high nibble of the final byte
+ }
+#ifdef _DEBUG // Make sure the new one does the same thing as the old.
+ for (int i = 0; i < bases; i++) {
+ int bit = 1 ^ (i & 1);
+ int n = (*nibbles >> (bit << 2)) & 0xf; // extract nibble without branches
+ nibbles += 1 ^ bit; // Advance to the next byte after reading its low nibble
+ _ASSERT(o_sequence[i] == BAMAlignment::CodeToSeq[n]);
+ }
+#endif // _DEBUG
+ // Expands packed 4-bit base codes into the reverse complement of the sequence.
+ // o_sequence receives exactly `bases` characters; nibbles holds two base codes per byte, high nibble first.
+ void
+char* o_sequence,
+const _uint8* nibbles,
+int bases)
+ int pairs = bases / 2;
+ // With an odd length the unpaired last base lands in output slot 0, so every decoded pair
+ // starts one byte later. The old code always stored pair i at byte 2*(pairs-i-1), which for
+ // odd lengths shifted all pairs one byte early and never wrote o_sequence[bases-1].
+ int oddShift = (bases % 2 == 1) ? 1 : 0;
+ for (int i = 0; i < pairs; i++) {
+ _uint16 pair = CodeToSeqPairRC[nibbles[i]];
+ // memcpy because the destination is at an odd byte offset when oddShift is 1.
+ memcpy(o_sequence + oddShift + 2 * (pairs - i - 1), &pair, sizeof(pair));
+ }
+ if (oddShift) {
+ o_sequence[0] = CodeToSeqRC[nibbles[pairs] >> 4];
+ }
+ void // Translates each quality byte through the CIGAR_QUAL_TO_SAM table, preserving order. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMAlignment::decodeQual.
+ char* o_qual,
+ char* quality,
+ int bases)
+ for (int i = 0; i < bases; i++) {
+ o_qual[i] = CIGAR_QUAL_TO_SAM[((_uint8*)quality)[i]]; // Read as unsigned so the table index is never negative
+ }
+ void // Same translation as the forward version, but writes the output in reverse order. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMAlignment::decodeQualRC.
+ char* o_qual,
+ char* quality,
+ int bases)
+ for (int i = 0; i < bases; i++) {
+ o_qual[bases-i-1] = CIGAR_QUAL_TO_SAM[((_uint8*)quality)[i]]; // Input position i maps to output position bases-i-1
+ }
+ bool // Renders packed CIGAR ops as text in o_cigar; returns true only if every op fit. NOTE(review): the qualified-name line is absent from this excerpt; presumably BAMAlignment::decodeCigar.
+ char* o_cigar,
+ int cigarSize,
+ _uint32* cigar,
+ int ops)
+ int i = 0; // Write position in o_cigar
+ _uint32 lastOp = 99999; // Sentinel that matches no real op code
+ while (ops > 0 && i < cigarSize - 11) { // 9 decimal digits (28 bits) + 1 cigar char + null terminator
+ i += sprintf(o_cigar + i, "%u", *cigar >> 4); // The upper 28 bits hold the op length
+ _ASSERT((*cigar & 0xf) <= 8);
+ _uint32 op = *cigar & 0xf; // The low 4 bits hold the op code
+ o_cigar[i++] = BAMAlignment::CodeToCigar[op];
+ _ASSERT(op != lastOp); // Debug check that consecutive ops differ
+ lastOp = op;
+ ops--;
+ cigar++;
+ }
+ o_cigar[i++] = 0; // NOTE(review): the terminator is written even when the loop never ran, so cigarSize must be at least 1
+ return ops == 0;
+ void // presumably BAMAlignment::getClippingFromCigar (the name line is missing from this extract): reports leading/trailing H and S lengths of a packed cigar
+ _uint32 *cigar, // packed ops: length in the bits above the low 4, op code in the low 4 bits
+ int ops, // number of ops in cigar
+ unsigned *o_frontClipping, // receives the leading S length, 0 if none
+ unsigned *o_backClipping, // receives the trailing S length, 0 if none
+ unsigned *o_frontHardClipping, // receives the leading H length, 0 if none
+ unsigned *o_backHardClipping) // receives the trailing H length, 0 if none
+ *o_frontHardClipping = 0; // Gets overwritten if we have any
+ *o_frontClipping = 0;
+ *o_backHardClipping = 0;
+ *o_backClipping = 0;
+ if (0 == ops) return;
+ if ((*cigar & 0xf) == BAM_CIGAR_H) { // a leading H, if present, is the first op
+ *o_frontHardClipping = *cigar >> 4;
+ cigar++;
+ ops--;
+ if (0 == ops) {
+ return; // What a strange cigar string, all hard clip!
+ }
+ }
+ if ((*cigar & 0xf) == BAM_CIGAR_S) { // a leading S is looked for right after any leading H
+ *o_frontClipping = *cigar >> 4;
+ cigar++;
+ ops--;
+ if (0 == ops) {
+ return;
+ }
+ }
+ if ((cigar[ops - 1] & 0xf) == BAM_CIGAR_H) { // consumed front ops were skipped, so cigar[ops - 1] is still the last op
+ *o_backHardClipping = cigar[ops - 1] >> 4;
+ ops--;
+ if (0 == ops) {
+ return;
+ }
+ }
+ if ((cigar[ops - 1] & 0xf) == BAM_CIGAR_S) { // trailing S sits just inside any trailing H
+ *o_backClipping = cigar[ops - 1] >> 4;
+ }
+ void // presumably BAMAlignment::encodeSeq (the name line is missing from this extract)
+ _uint8* encoded, // output: (length + 1) / 2 packed bytes
+ char* ascii, // input base characters
+ int length) // number of input characters
+ // Packs base characters two per byte via the SeqToCode table, earlier base in the high nibble.
+ // The characters are cast to _uint8 before indexing: SeqToCode has 256 entries and a plain
+ // char may be signed, in which case a byte >= 0x80 would produce a negative index.
+ _uint8* p = encoded;
+ for (int i = 0; i + 1 < length; i += 2) {
+ *p++ = (BAMAlignment::SeqToCode[(_uint8)ascii[i]] << 4) | BAMAlignment::SeqToCode[(_uint8)ascii[i+1]];
+ }
+ if (length % 2) {
+ // odd length: final base goes in the high nibble, low nibble left as zero
+ *p = BAMAlignment::SeqToCode[(_uint8)ascii[length - 1]] << 4;
+ }
+ int // fragment of a member that sums cigar op lengths; its name line and first condition are missing from this extract
+ return 0;
+ }
+ if (n_cigar_op == 0) {
+ return l_seq; // no cigar ops: the sequence length is returned instead
+ }
+ _uint32* p = cigar();
+ int len = 0;
+ static const int op_ref[16] = {1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0}; // per-op-code multiplier; NOTE(review): codes 4-6 (S, H, P) are 1 here, while the SAM spec lists them as not consuming reference - confirm intent
+ for (int i = 0; i < n_cigar_op; i++) {
+ _uint32 op = *p++;
+ len += op_ref[(op & 15)] * (op >> 4); // low 4 bits select the multiplier, remaining bits are the op length
+ }
+ return len;
+// static initializer: fills the lookup tables derived from CodeToSeq, CodeToSeqRC and CodeToCigar
+ memset(SeqToCode, 0, 256); // characters with no code map to 0
+ for (int i = 1; i < 16; i++) {
+ SeqToCode[CodeToSeq[i]] = i; // inverse of CodeToSeq for codes 1..15
+ }
+ for (int i = 0; i < 256; i++) {
+ CodeToSeqPair[i] = CodeToSeq[i >> 4] | (CodeToSeq[i & 0xf] << 8); // If this looks backwards, recall that the machines are little-endian
+ CodeToSeqPairRC[i] = (CodeToSeqRC[i >> 4] << 8) | CodeToSeqRC[i & 0xf]; // Doubled backwards == forward
+ }
+ memset(CigarToCode, 0, 256); // characters that are not cigar ops map to 0
+ for (int i = 1; i < 9; i++) {
+ CigarToCode[CodeToCigar[i]] = i; // inverse of CodeToCigar for codes 1..8
+ }
+ int // presumably BAMAlignment::reg2bin (the name line is missing from this extract): bin number for the interval [beg, end)
+ int beg, // start of the interval
+ int end) // one past the last position; decremented below to make it inclusive
+ --end;
+ if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); // both ends in the same 2^14-sized bin (smallest level)
+ if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
+ if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
+ if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
+ if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); // both ends in the same 2^26-sized bin (largest non-root level)
+ return 0; // no single sub-bin contains the interval: bin 0
+ int // lists every bin, at every level, that overlaps [beg, end); returns how many were written (name line missing from this extract)
+ int beg, // start of the interval
+ int end, // one past the last position; decremented below to make it inclusive
+ _uint16* list) // output array; caller must provide enough room - capacity is not checked here
+ int i = 0, k;
+ --end;
+ list[i++] = 0; // bin 0 is always included
+ for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; // level offsets 1, 9, 73, 585, 4681 match the ((1<<n)-1)/7 terms used by the single-bin computation
+ for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
+ for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
+ for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
+ for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+ return i;
+ void // presumably BAMAlignment::validate (the name line is missing from this extract)
+ // Debug-only sanity checks of one alignment record; every check is an _ASSERT.
+ _ASSERT(block_size < 0x100000); // sanity check, should be <1MB!
+ _ASSERT(size(l_read_name, n_cigar_op, l_seq, 0) <= block_size + sizeof(block_size));
+ _ASSERT(refID >= -1 && refID <= (int) 0x100000);
+ // todo: validate bin, requires more info
+ _ASSERT(MAPQ <= 80 || MAPQ == 255);
+ _ASSERT(FLAG <= 0x7ff);
+ // upper bound is checked on next_refID itself (previously re-tested refID by mistake)
+ _ASSERT(next_refID >= -1 && next_refID <= (int) 0x100000);
+ // read name: printable ASCII followed by a terminating null
+ for (char* p = read_name(); p < read_name() + l_read_name - 1; p++) {
+ _ASSERT(*p >= ' ' && *p <= '~');
+ }
+ _ASSERT(read_name()[l_read_name - 1] == 0);
+ // can't validate seq, all values are valid (though some are unlikely!)
+ char* q = qual();
+ for (int i = 0; i < l_seq; i++) {
+ _ASSERT(q[i] >= -1 && q[i] <= 80);
+ }
+ // walk the auxiliary fields; the walk must end exactly at the end of the aux area
+ BAMAlignAux* aux = firstAux();
+ for (; (char*)aux - (char*)firstAux() < auxLen(); aux = aux->next()) {
+ _ASSERT(aux->tag[0] >= ' ' && aux->tag[0] <= '~' && aux->tag[1] >= ' ' && aux->tag[1] <= '~');
+ _ASSERT(strchr("AcCsSiIfZHB", aux->val_type) != NULL);
+ }
+ _ASSERT((char*) aux - (char*) firstAux() == auxLen());
+ bool // presumably the BAM reader's getNextRead (the name line is missing from this extract): false when no more data is available, true once the outputs are filled
+ Read *read,
+ AlignmentResult *alignmentResult,
+ GenomeLocation *genomeLocation,
+ bool *isRC,
+ unsigned *mapQ,
+ unsigned *flag, // may be NULL; a local is substituted so the loop condition can always read it
+ bool ignoreEndOfRange,
+ const char **cigar)
+ unsigned local_flag;
+ if (NULL == flag) {
+ flag = &local_flag;
+ }
+ do {
+ char* buffer;
+ _int64 bytes;
+ if (! data->getData(&buffer, &bytes)) { // nothing left in the current batch: move to the next one and retry once
+ data->nextBatch();
+ if (! data->getData(&buffer, &bytes)) {
+ return false;
+ }
+ extraOffset = 0; // new batch: allocations from the extra area start over
+ }
+ BAMAlignment* bam = (BAMAlignment*) buffer;
+ if ((_uint64)bytes < sizeof(bam->block_size) || (_uint64)bytes < bam->size()) { // the whole record must be present in the available bytes
+ WriteErrorMessage("Insufficient buffer space for BAM file, increase -xf parameter\n");
+ soft_exit(1);
+ }
+ data->advance(bam->size()); // consume this record
+ size_t lineLength;
+ getReadFromLine(context.genome, buffer, buffer + bytes, read, alignmentResult, genomeLocation,
+ isRC, mapQ, &lineLength, flag, cigar, context.clipping);
+ unsigned auxLen = bam->auxLen();
+ read->setReadGroup(context.defaultReadGroup); // default, replaced below when the record carries its own RG:Z tag
+ if (auxLen > 0) {
+ read->setAuxiliaryData((char*) bam->firstAux(), auxLen);
+ for (BAMAlignAux* aux = bam->firstAux(); aux < bam->endAux(); aux = aux->next()) {
+ if (aux->val_type == 'Z' && aux->tag[0] == 'R' && aux->tag[1] == 'G') {
+ read->setReadGroup(READ_GROUP_FROM_AUX);
+ break;
+ }
+ }
+ }
+ } while ((context.ignoreSecondaryAlignments && (*flag & SAM_SECONDARY)) || // keep reading while the record is one the context says to ignore
+ (context.ignoreSupplementaryAlignments && (*flag & SAM_SUPPLEMENTARY)));
+ _ASSERT(read->getData()[0]);
+ return true;
+ void // getReadFromLine (the name line is missing from this extract)
+ const Genome *genome,
+ char *line,
+ char *endOfBuffer,
+ Read *read,
+ AlignmentResult *alignmentResult,
+ GenomeLocation *out_genomeLocation,
+ bool *isRC,
+ unsigned *mapQ,
+ size_t *lineLength,
+ unsigned *flag,
+ const char **cigar,
+ ReadClippingType clipping)
+ // Decodes the BAM record at `line` into whichever outputs are non-NULL.
+ // Sequence, quality and cigar text are expanded into space obtained from getExtra().
+ _ASSERT(endOfBuffer - line >= sizeof(BAMHeader));
+ BAMAlignment* bam = (BAMAlignment*) line;
+ _ASSERT((size_t)(endOfBuffer - line) >= bam->size());
+ bam->validate();
+ GenomeLocation genomeLocation = bam->getLocation(genome);
+ if (NULL != out_genomeLocation) {
+ _ASSERT(-1 <= bam->refID && bam->refID < (int)genome->getNumContigs());
+ *out_genomeLocation = genomeLocation;
+ }
+ if (NULL != cigar) {
+ const char* cigarBuffer;
+ {
+ // The size handed to decodeCigar must be the size actually allocated; previously
+ // min(MAX_K * 5, MAX_SEQ_LENGTH) bytes were allocated but MAX_SEQ_LENGTH was claimed.
+ const int cigarBufferSize = min(MAX_K * 5, MAX_SEQ_LENGTH);
+ char *writableCigarBuffer = getExtra(cigarBufferSize);
+ if (!BAMAlignment::decodeCigar(writableCigarBuffer, cigarBufferSize, bam->cigar(), bam->n_cigar_op)) {
+ cigarBuffer = ""; // todo: fail?
+ }
+ else {
+ cigarBuffer = writableCigarBuffer;
+ }
+ }
+ *cigar = cigarBuffer;
+ }
+ if (NULL != read) {
+ _ASSERT(bam->l_seq < MAX_SEQ_LENGTH);
+ char* seqBuffer = getExtra(bam->l_seq);
+ char* qualBuffer = getExtra(bam->l_seq);
+ unsigned originalFrontClipping, originalBackClipping, originalFrontHardClipping, originalBackHardClipping;
+ BAMAlignment::decodeSeqRC(seqBuffer, bam->seq(), bam->l_seq);
+ BAMAlignment::decodeQualRC(qualBuffer, bam->qual(), bam->l_seq);
+ //
+ // Get the clipping, but reverse the outputs front/back because this is an RC read.
+ //
+ BAMAlignment::getClippingFromCigar(bam->cigar(), bam->n_cigar_op, &originalBackClipping, &originalFrontClipping, &originalBackHardClipping, &originalFrontHardClipping);
+ } else {
+ BAMAlignment::decodeSeq(seqBuffer, bam->seq(), bam->l_seq);
+ BAMAlignment::decodeQual(qualBuffer, bam->qual(), bam->l_seq);
+ BAMAlignment::getClippingFromCigar(bam->cigar(), bam->n_cigar_op, &originalFrontClipping, &originalBackClipping, &originalFrontHardClipping, &originalBackHardClipping);
+ }
+ const char *rnext;
+ unsigned rnextLen;
+ // Without a genome there is no contig table to look the mate's contig up in, so fall
+ // back to "*" instead of dereferencing a NULL genome in the else branch.
+ if (bam->next_refID < 0 || genome == NULL || bam->next_refID >= genome->getNumContigs()) {
+ rnext = "*";
+ rnextLen = 1;
+ } else {
+ rnext = genome->getContigs()[bam->next_refID].name;
+ rnextLen = genome->getContigs()[bam->next_refID].nameLength;
+ }
+ read->init(bam->read_name(), bam->l_read_name - 1, seqBuffer, qualBuffer, bam->l_seq, genomeLocation, bam->MAPQ, bam->FLAG,
+ originalFrontClipping, originalBackClipping, originalFrontHardClipping, originalBackHardClipping, rnext, rnextLen, bam->next_pos + 1, true);
+ read->setBatch(data->getBatch());
+ read->clip(clipping);
+ }
+ if (NULL != alignmentResult) {
+ _ASSERT(bam->FLAG & SAM_UNMAPPED || bam->refID >= 0);
+ *alignmentResult = bam->FLAG & SAM_UNMAPPED ? NotFound : SingleHit; // todo: look at MAPQ?
+ }
+ if (NULL != isRC) {
+ *isRC = (bam->FLAG & SAM_REVERSE_COMPLEMENT) != 0;
+ }
+ if (NULL != mapQ) {
+ *mapQ = bam->MAPQ;
+ }
+ if (NULL != flag) {
+ *flag = bam->FLAG;
+ }
+ char* // getExtra (the name line is missing from this extract): hands out `bytes` bytes from the data source's extra area
+ _int64 bytes) // number of bytes requested
+ char* extra;
+ _int64 limit;
+ data->getExtra(&extra, &limit); // base and size of the extra area
+ _ASSERT(extra != NULL && bytes >= 0 && limit - extraOffset >= bytes);
+ if (limit - extraOffset < bytes) { // same capacity check as the assert, enforced in all builds
+ WriteErrorMessage("error: not enough space for expanding BAM file - increase expansion factor, currently -xf %.1f\n", DataSupplier::ExpansionFactor);
+ soft_exit(1);
+ }
+ char* result = extra + extraOffset;
+ extraOffset += max((_int64) 0, bytes); // a negative request does not move the offset backwards
+ return result;
+class BAMFormat : public FileFormat // FileFormat implementation for BAM; the brace and access-specifier lines are missing from this extract
+ BAMFormat(bool i_useM) : useM(i_useM) {} // useM is forwarded to the cigar computation
+ virtual void getSortInfo(const Genome* genome, char* buffer, _int64 bytes, GenomeLocation* o_location, GenomeDistance* o_readBytes, int* o_refID, int* o_pos) const;
+ virtual void setupReaderContext(AlignerOptions* options, ReaderContext* readerContext) const
+ { FileFormat::setupReaderContext(options, readerContext, true); }
+ virtual ReadWriterSupplier* getWriterSupplier(AlignerOptions* options, const Genome* genome) const;
+ virtual bool writeHeader(
+ const ReaderContext& context, char *header, size_t headerBufferSize, size_t *headerActualSize,
+ bool sorted, int argc, const char **argv, const char *version, const char *rgLine, bool omitSQLines) const;
+ virtual bool writeRead(
+ const ReaderContext& context, LandauVishkinWithCigar * lv, char * buffer, size_t bufferSpace,
+ size_t * spaceUsed, size_t qnameLen, Read * read, AlignmentResult result,
+ int mapQuality, GenomeLocation genomeLocation, Direction direction, bool secondaryAlignment, int * o_addFrontClipping,
+ bool hasMate = false, bool firstInPair = false, Read * mate = NULL,
+ AlignmentResult mateResult = NotFound, GenomeLocation mateLocation = 0, Direction mateDirection = FORWARD,
+ bool alignedAsPair = false) const;
+ static int computeCigarOps(const Genome * genome, LandauVishkinWithCigar * lv, // returns the number of packed cigar ops written to cigarBuf
+ char * cigarBuf, int cigarBufLen,
+ const char * data, unsigned dataLength, unsigned basesClippedBefore, unsigned extraBasesClippedBefore, unsigned basesClippedAfter,
+ unsigned frontHardClipping, unsigned backHardClipping,
+ GenomeLocation genomeLocation, bool isRC, bool useM, int * o_editDistance, int * o_addFrontClipping);
+ const bool useM;
+const FileFormat* FileFormat::BAM[] = { new BAMFormat(false), new BAMFormat(true) }; // index 0: useM == false, index 1: useM == true
+ void // BAMFormat::getSortInfo (the name line is missing from this extract)
+ const Genome* genome,
+ char* buffer,
+ _int64 bytes,
+ GenomeLocation* o_location,
+ GenomeDistance* o_readBytes,
+ int* o_refID,
+ int* o_pos) const
+ // Extracts the sort keys of the BAM record at `buffer`; each output is optional (NULL to skip).
+ // A record without a usable position of its own is located at its mate's position,
+ // or at UINT32_MAX when the mate has none either.
+ BAMAlignment* bam = (BAMAlignment*) buffer;
+ _ASSERT((size_t) bytes >= sizeof(BAMAlignment) && bam->size() <= (size_t) bytes && bam->refID < genome->getNumContigs());
+ if (o_location != NULL) {
+ if (bam->refID < 0 || bam->refID >= genome->getNumContigs() || bam->pos < 0) {
+ // next_refID indexes getContigs() below, so getNumContigs() itself is out of range (was '>')
+ if (bam->next_refID < 0 || bam->next_refID >= genome->getNumContigs() || bam->next_pos < 0) {
+ *o_location = UINT32_MAX;
+ } else {
+ *o_location = genome->getContigs()[bam->next_refID].beginningLocation + bam->next_pos;
+ }
+ } else {
+ *o_location = genome->getContigs()[bam->refID].beginningLocation + bam->pos;
+ }
+ }
+ if (o_readBytes != NULL) {
+ *o_readBytes = (unsigned) bam->size();
+ }
+ if (o_refID != NULL) {
+ *o_refID = bam->refID;
+ }
+ if (o_pos != NULL) {
+ *o_pos = bam->pos;
+ }
+ ReadWriterSupplier* // BAMFormat::getWriterSupplier (the name line is missing from this extract): builds the gzip-compressed, optionally sorted, BAM output pipeline
+ AlignerOptions* options,
+ const Genome* genome) const
+ DataWriterSupplier* dataSupplier;
+ GzipWriterFilterSupplier* gzipSupplier =
+ DataWriterSupplier::gzip(true, BAM_BLOCK, max(1, options->numThreads - 1), false, options->sortOutput);
+ // (leave a thread free for main, and let OS map threads to cores to allow system IO etc.)
+ if (options->sortOutput) {
+ size_t len = strlen(options->outputFile.fileName);
+ // todo: this is going to leak, but there's no easy way to free it, and it's small...
+ char* tempFileName = (char*) malloc(5 + len); // room for the 4-character suffix plus the terminating null
+ strcpy(tempFileName, options->outputFile.fileName);
+ strcpy(tempFileName + len, ".tmp");
+ // todo: make markDuplicates optional?
+ DataWriter::FilterSupplier* filters = gzipSupplier;
+ if (! options->noDuplicateMarking) {
+ filters = DataWriterSupplier::markDuplicates(genome)->compose(filters);
+ }
+ if (! options->noIndex) {
+ char* indexFileName = (char*) malloc(5 + len); // same sizing as tempFileName; not freed here either
+ strcpy(indexFileName, options->outputFile.fileName);
+ strcpy(indexFileName + len, ".bai");
+ filters = DataWriterSupplier::bamIndex(indexFileName, genome, gzipSupplier)->compose(filters);
+ }
+ dataSupplier = DataWriterSupplier::sorted(this, genome, tempFileName,
+ options->sortMemory * (1ULL << 30), // sortMemory is scaled by 2^30 here, i.e. presumably given in GB
+ options->numThreads, options->outputFile.fileName, filters, options->writeBufferSize,
+ FileEncoder::gzip(gzipSupplier, options->numThreads, options->bindToProcessors));
+ } else {
+ dataSupplier = DataWriterSupplier::create(options->outputFile.fileName, options->writeBufferSize, gzipSupplier); // unsorted: write straight to the output file through gzip
+ }
+ return ReadWriterSupplier::create(this, dataSupplier, genome);
+ bool // BAMFormat::writeHeader (the name line is missing from this extract): writes the BAM header into `header`; false if it does not fit or the SAM header text cannot be produced
+ const ReaderContext& context,
+ char *header, // output buffer
+ size_t headerBufferSize, // capacity of header in bytes
+ size_t *headerActualSize, // receives the number of bytes written
+ bool sorted,
+ int argc,
+ const char **argv,
+ const char *version,
+ const char *rgLine,
+ bool omitSQLines) const
+ _ASSERT(!omitSQLines); // This is just for SAM files, at least for now.
+ if (headerBufferSize < BAMHeader::size(0)) {
+ return false;
+ }
+ size_t cursor = 0;
+ BAMHeader* bamHeader = (BAMHeader*) header;
+ bamHeader->magic = BAMHeader::BAM_MAGIC;
+ size_t samHeaderSize;
+ bool ok = FileFormat::SAM[0]->writeHeader(context, bamHeader->text(), headerBufferSize - BAMHeader::size(0), &samHeaderSize, // the SAM writer produces the text portion in place
+ sorted, argc, argv, version, rgLine, omitSQLines);
+ if (! ok) {
+ return false;
+ }
+ bamHeader->l_text = (int)samHeaderSize;
+ cursor = BAMHeader::size((int)samHeaderSize);
+ // Write a RefSeq record for each chromosome / contig in the genome
+ // todo: handle null genome index case - reparse header & translate into BAM
+ bamHeader->n_ref() = 0; // in case of overflow or no genome
+ if (context.genome != NULL) {
+ const Genome::Contig *contigs = context.genome->getContigs();
+ int numContigs = context.genome->getNumContigs();
+ bamHeader->n_ref() = numContigs;
+ BAMHeaderRefSeq* refseq = bamHeader->firstRefSeq();
+ GenomeDistance genomeLen = context.genome->getCountOfBases();
+ for (int i = 0; i < numContigs; i++) {
+ int len = (int)strlen(contigs[i].name) + 1; // stored name length includes the terminating null
+ cursor += BAMHeaderRefSeq::size(len);
+ if (cursor > headerBufferSize) { // checked before anything of this record is written
+ return false;
+ }
+ refseq->l_name = len;
+ memcpy(refseq->name(), contigs[i].name, len);
+ GenomeLocation start = contigs[i].beginningLocation;
+ GenomeLocation end = ((i + 1 < numContigs) ? contigs[i+1].beginningLocation : genomeLen) - context.genome->getChromosomePadding(); // contig ends where the next begins (or at the genome end), minus the padding
+ refseq->l_ref() = (int)(end - start);
+ refseq = refseq->next();
+ _ASSERT((char*) refseq - header == cursor);
+ }
+ }
+ *headerActualSize = cursor;
+ return true;
+ bool // BAMFormat::writeRead (the name line is missing from this extract)
+ const ReaderContext& context,
+ LandauVishkinWithCigar * lv,
+ char * buffer,
+ size_t bufferSpace,
+ size_t * spaceUsed,
+ size_t qnameLen,
+ Read * read,
+ AlignmentResult result,
+ int mapQuality,
+ GenomeLocation genomeLocation,
+ Direction direction,
+ bool secondaryAlignment,
+ int *o_addFrontClipping,
+ bool hasMate,
+ bool firstInPair,
+ Read * mate,
+ AlignmentResult mateResult,
+ GenomeLocation mateLocation,
+ Direction mateDirection,
+ bool alignedAsPair) const
+ // Serializes one read as a BAM alignment record into `buffer`.
+ // Returns false when the record does not fit in bufferSpace, when the SAM-level fields
+ // cannot be produced, or when the cigar computation asks for extra front clipping
+ // (*o_addFrontClipping != 0); on success *spaceUsed (if non-NULL) gets the record size.
+ const int cigarBufSize = MAX_READ;
+ _uint32 cigarBuf[cigarBufSize];
+ int flags = 0;
+ const char *contigName = "*";
+ int contigIndex = -1;
+ GenomeDistance positionInContig = 0;
+ int cigarOps = 0;
+ const char *mateContigName = "*";
+ int mateContigIndex = -1;
+ GenomeDistance matePositionInContig = 0;
+ _int64 templateLength = 0;
+ char data[MAX_READ];
+ char quality[MAX_READ];
+ const char* clippedData;
+ unsigned fullLength;
+ unsigned clippedLength;
+ unsigned basesClippedBefore;
+ GenomeDistance extraBasesClippedBefore;
+ unsigned basesClippedAfter;
+ // Initialized because the NM field below is written even when the cigar computation
+ // is skipped (unaligned read); previously that path read an uninitialized value.
+ int editDistance = 0;
+ if (!SAMFormat::createSAMLine(context.genome, lv,
+ // outputs:
+ data, quality, MAX_READ, contigName, contigIndex,
+ flags, positionInContig, mapQuality, mateContigName, mateContigIndex, matePositionInContig, templateLength,
+ fullLength, clippedData, clippedLength, basesClippedBefore, basesClippedAfter,
+ // inputs:
+ qnameLen, read, result, genomeLocation, direction, secondaryAlignment, useM,
+ hasMate, firstInPair, alignedAsPair, mate, mateResult, mateLocation, mateDirection,
+ &extraBasesClippedBefore))
+ {
+ return false;
+ }
+ if (genomeLocation != InvalidGenomeLocation) {
+ cigarOps = computeCigarOps(context.genome, lv, (char*)cigarBuf, cigarBufSize * sizeof(_uint32),
+ clippedData, clippedLength, basesClippedBefore, (unsigned)extraBasesClippedBefore, basesClippedAfter,
+ read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(),
+ genomeLocation, direction == RC, useM, &editDistance, o_addFrontClipping);
+ if (*o_addFrontClipping != 0) {
+ return false;
+ }
+ }
+ // Write the BAM entry
+ unsigned auxLen;
+ bool auxSAM;
+ char* aux = read->getAuxiliaryData(&auxLen, &auxSAM);
+ static bool warningPrinted = false;
+ bool translateReadGroupFromSAM = false;
+ if (aux != NULL && auxSAM) {
+ if (! warningPrinted) {
+ warningPrinted = true;
+ WriteErrorMessage("warning: translating optional data from SAM->BAM is not yet implemented, optional data will not appear in BAM\n");
+ }
+ if (read->getReadGroup() == READ_GROUP_FROM_AUX) {
+ // only the RG:Z: field of SAM-style aux data is carried over
+ for (char* p = aux; p != NULL && p < aux + auxLen; p = SAMReader::skipToBeyondNextFieldSeparator(p, aux + auxLen)) {
+ if (strncmp(p, "RG:Z:", 5) == 0) {
+ size_t fieldLen;
+ SAMReader::skipToBeyondNextFieldSeparator(p, aux + auxLen, &fieldLen);
+ aux = p;
+ auxLen = (unsigned) fieldLen;
+ translateReadGroupFromSAM = true;
+ break;
+ }
+ }
+ }
+ if (! translateReadGroupFromSAM) {
+ aux = NULL;
+ auxLen = 0;
+ }
+ }
+ size_t bamSize = BAMAlignment::size((unsigned)qnameLen + 1, cigarOps, fullLength, auxLen);
+ if (read->getReadGroup() != NULL && read->getReadGroup() != READ_GROUP_FROM_AUX) {
+ if (strcmp(read->getReadGroup(), context.defaultReadGroup) != 0) {
+ bamSize += 4 + strlen(read->getReadGroup());
+ } else {
+ bamSize += context.defaultReadGroupAuxLen;
+ }
+ }
+ bamSize += 12; // NM:C PG:Z:SNAP fields
+ if (bamSize > bufferSpace) {
+ return false;
+ }
+ BAMAlignment* bam = (BAMAlignment*) buffer;
+ bam->block_size = (int)bamSize - 4;
+ bam->refID = contigIndex;
+ if (positionInContig > INT32_MAX || matePositionInContig > INT32_MAX) {
+ WriteErrorMessage("Can't write read to BAM file because aligned position (or mate position) within contig > 2^31, which is the limit for the BAM format.\n");
+ soft_exit(1);
+ }
+ bam->pos = (int)(positionInContig - 1);
+ if (qnameLen > 254) {
+ // cast so the argument type matches the %lld conversion
+ WriteErrorMessage("BAM format: QNAME field must be less than 254 characters long, instead it's %lld\n", (long long) qnameLen);
+ soft_exit(1);
+ }
+ bam->l_read_name = (_uint8)qnameLen + 1;
+ bam->MAPQ = mapQuality;
+ int refLength = cigarOps > 0 ? 0 : fullLength;
+ for (int i = 0; i < cigarOps; i++) {
+ refLength += BAMAlignment::CigarCodeToRefBase[cigarBuf[i] & 0xf] * (cigarBuf[i] >> 4);
+ }
+ bam->bin = genomeLocation != InvalidGenomeLocation ? BAMAlignment::reg2bin((int)positionInContig-1, (int)positionInContig-1 + refLength) :
+ // unmapped is at mate's position, length 1
+ mateLocation != InvalidGenomeLocation ? BAMAlignment::reg2bin((int)matePositionInContig-1, (int)matePositionInContig) :
+ // otherwise at -1, length 1
+ BAMAlignment::reg2bin(-1, 0);
+ bam->n_cigar_op = cigarOps;
+ bam->FLAG = flags;
+ bam->l_seq = fullLength;
+ bam->next_refID = mateContigIndex;
+ bam->next_pos = (int)matePositionInContig - 1;
+ bam->tlen = (int)templateLength;
+ memcpy(bam->read_name(), read->getId(), qnameLen);
+ bam->read_name()[qnameLen] = 0;
+ memcpy(bam->cigar(), cigarBuf, cigarOps * 4);
+ BAMAlignment::encodeSeq(bam->seq(), data, fullLength);
+ // stored qualities are the text qualities with the '!' offset removed
+ for (unsigned i = 0; i < fullLength; i++) {
+ quality[i] -= '!';
+ }
+ memcpy(bam->qual(), quality, fullLength);
+ if (aux != NULL && auxLen > 0) {
+ if (((char*)bam->firstAux()) + auxLen > buffer + bufferSpace) {
+ return false;
+ }
+ if (! translateReadGroupFromSAM) {
+ memcpy(bam->firstAux(), aux, auxLen);
+ } else {
+ // hack, build just RG field from SAM opt field
+ BAMAlignAux* auxData = bam->firstAux();
+ auxData->tag[0] = 'R';
+ auxData->tag[1] = 'G';
+ auxData->val_type = 'Z';
+ memcpy(auxData->value(), aux + 5, auxLen - 5);
+ ((char*)auxData->value())[auxLen-5] = 0;
+ auxLen -= 1; // RG:Z:xxx -> RGZxxx\0
+ }
+ }
+ // RG
+ if (read->getReadGroup() != NULL && read->getReadGroup() != READ_GROUP_FROM_AUX) {
+ if (strcmp(read->getReadGroup(), context.defaultReadGroup) != 0) {
+ if ((char*)bam->firstAux() + auxLen + 4 + strlen(read->getReadGroup()) > buffer + bufferSpace) {
+ return false;
+ }
+ BAMAlignAux* rg = (BAMAlignAux*)(auxLen + (char*)bam->firstAux());
+ rg->tag[0] = 'R'; rg->tag[1] = 'G'; rg->val_type = 'Z';
+ strcpy((char*)rg->value(), read->getReadGroup());
+ auxLen += (unsigned)rg->size();
+ } else {
+ if ((char*)bam->firstAux() + auxLen + context.defaultReadGroupAuxLen > buffer + bufferSpace) {
+ return false;
+ }
+ memcpy((char*)bam->firstAux() + auxLen, context.defaultReadGroupAux, context.defaultReadGroupAuxLen);
+ auxLen += context.defaultReadGroupAuxLen;
+ }
+ }
+ // PG
+ BAMAlignAux* pg = (BAMAlignAux*) (auxLen + (char*) bam->firstAux());
+ pg->tag[0] = 'P'; pg->tag[1] = 'G'; pg->val_type = 'Z';
+ strcpy((char*) pg->value(), "SNAP");
+ auxLen += (unsigned) pg->size();
+ // NM
+ BAMAlignAux* nm = (BAMAlignAux*) (auxLen + (char*) bam->firstAux());
+ nm->tag[0] = 'N'; nm->tag[1] = 'M'; nm->val_type = 'C';
+ *(_uint8*)nm->value() = (_uint8)editDistance;
+ auxLen += (unsigned) nm->size();
+ if (NULL != spaceUsed) {
+ *spaceUsed = bamSize;
+ }
+ // debugging: _ASSERT(0 == memcmp(bam->firstAux()->tag, "RG", 2) && 0 == memcmp(bam->firstAux()->next()->tag, "PG", 2) && 0 == memcmp(bam->firstAux()->next()->next()->tag, "NM", 2));
+ bam->validate();
+ return true;
+// Compute the CIGAR edit sequence operations in BAM format for a read against a given genome location
+// Returns number of operations (or 0 if there was a problem)
+// if returns with *o_addFrontClipping set non-zero, need to adjust front clipping & rerun
+// cigarBuf receives packed 32-bit ops; cigarBufLen and the internal `used` counter are in bytes.
+ int
+ const Genome * genome,
+ LandauVishkinWithCigar * lv,
+ char * cigarBuf,
+ int cigarBufLen,
+ const char * data,
+ unsigned dataLength,
+ unsigned basesClippedBefore,
+ unsigned extraBasesClippedBefore,
+ unsigned basesClippedAfter,
+ unsigned frontHardClipping,
+ unsigned backHardClipping,
+ GenomeLocation genomeLocation,
+ bool isRC,
+ bool useM,
+ int * o_editDistance,
+ int * o_addFrontClipping
+ GenomeDistance extraBasesClippedAfter = 0;
+ int used = 0;
+ unsigned clippingWordsBefore = ((basesClippedBefore + extraBasesClippedBefore > 0) ? 1 : 0) + ((frontHardClipping > 0) ? 1 : 0);
+ // extraBasesClippedAfter is only produced by computeCigar below, so whether a trailing
+ // soft-clip word is needed is not known yet; always reserve room for it. Previously no
+ // room was reserved when basesClippedAfter was 0, although the word could still be written.
+ unsigned clippingWordsAfter = 1 + ((backHardClipping > 0) ? 1 : 0);
+ // the alignment ops are written after the space kept free for the leading clipping words
+ SAMFormat::computeCigar(BAM_CIGAR_OPS, genome, lv, cigarBuf + 4 * clippingWordsBefore, cigarBufLen - 4 * (clippingWordsBefore + clippingWordsAfter), data, dataLength, basesClippedBefore, extraBasesClippedBefore,
+ basesClippedAfter, &extraBasesClippedAfter, genomeLocation, useM, o_editDistance, &used, o_addFrontClipping);
+ if (*o_addFrontClipping != 0) {
+ return 0;
+ }
+ if (*o_editDistance == -2) {
+ WriteErrorMessage("WARNING: computeEditDistance returned -2; cigarBuf may be too small\n");
+ return 0;
+ } else if (*o_editDistance == -1) {
+ static bool warningPrinted = false;
+ if (!warningPrinted) {
+ WriteErrorMessage("WARNING: computeEditDistance returned -1; this shouldn't happen\n");
+ warningPrinted = true;
+ }
+ return 0;
+ } else {
+ //
+ // If we have hard clipping, add in the cigar string for it.
+ //
+ if (frontHardClipping > 0) {
+ *(_uint32*)cigarBuf = (frontHardClipping << 4) | BAMAlignment::CigarToCode['H'];
+ used += 4;
+ }
+ // Add some CIGAR instructions for soft-clipping if we've ignored some bases in the read.
+ if (basesClippedBefore + extraBasesClippedBefore > 0) {
+ *((_uint32*)cigarBuf + ((frontHardClipping > 0) ? 1 : 0)) = ((basesClippedBefore + extraBasesClippedBefore) << 4) | BAMAlignment::CigarToCode['S'];
+ used += 4;
+ }
+ // `used` now covers the leading words plus the alignment ops, so trailing words are appended at cigarBuf + used
+ if (basesClippedAfter + extraBasesClippedAfter > 0) {
+ *(_uint32*)(cigarBuf + used) = ((int)(basesClippedAfter + extraBasesClippedAfter) << 4) | BAMAlignment::CigarToCode['S'];
+ used += 4;
+ }
+ if (backHardClipping > 0) {
+ *(_uint32*)(cigarBuf + used) = (backHardClipping << 4) | BAMAlignment::CigarToCode['H'];
+ used += 4;
+ }
+ return used / 4;
+ }
+class BAMFilter : public DataWriter::Filter // base class for filters that look at each BAM record of a written batch; brace/access lines are missing from this extract
+ BAMFilter(DataWriter::FilterType i_type) : Filter(i_type), offsets(1000), header(false) {}
+ virtual ~BAMFilter() {}
+ virtual void inHeader(bool flag) // while set, advances are not recorded as reads
+ { header = flag; }
+ virtual void onAdvance(DataWriter* writer, size_t batchOffset, char* data, GenomeDistance bytes, GenomeLocation location);
+ virtual size_t onNextBatch(DataWriter* writer, size_t offset, size_t bytes);
+ virtual void onRead(BAMAlignment* bam, size_t fileOffset, int batchIndex) = 0; // called once per recorded read when a batch completes
+ BAMAlignment* getRead(size_t fileOffset);
+ BAMAlignment* getNextRead(BAMAlignment* read, size_t* o_fileOffset = NULL);
+ BAMAlignment* tryFindRead(size_t offset, size_t endOffset, const char* id, size_t* o_offset);
+ bool header;
+ VariableSizeVector<size_t> offsets; // batch-relative offsets of the reads seen since the last batch boundary
+ DataWriter* currentWriter; // the fields below are set only for the duration of onNextBatch
+ char* currentBuffer;
+ size_t currentBufferBytes; // # of valid bytes
+ size_t currentOffset; // logical file offset of beginning of current buffer
+ size_t // BAMFilter::onNextBatch (the name line is missing from this extract)
+ DataWriter* writer,
+ size_t offset,
+ size_t bytes)
+ // Replays every read offset recorded by onAdvance for the batch that just completed,
+ // calling onRead for each, then clears the per-batch state. Returns `bytes` unchanged.
+ // (The address-of arguments below had been corrupted to a currency sign by HTML-entity decoding.)
+ bool ok = writer->getBatch(-1, &currentBuffer, NULL, NULL, NULL, &currentBufferBytes, &currentOffset);
+ _ASSERT(ok);
+ currentWriter = writer;
+ int index = 0;
+ for (VariableSizeVector<size_t>::iterator i = offsets.begin(); i != offsets.end(); i++) {
+ onRead((BAMAlignment*) (currentBuffer + *i), currentOffset + *i, index++);
+ }
+ offsets.clear();
+ currentWriter = NULL;
+ currentBuffer = NULL;
+ currentBufferBytes = 0;
+ currentOffset = 0;
+ return bytes;
+ void // BAMFilter::onAdvance (the name line is missing from this extract): remembers where each read starts within the batch
+ DataWriter* writer,
+ size_t batchOffset, // offset of the item within the current batch
+ char* data,
+ GenomeDistance bytes,
+ GenomeLocation location)
+ if (! header) { // header bytes are not reads, so they are not recorded
+ offsets.push_back(batchOffset);
+ }
+ BAMAlignment* // BAMFilter::getRead (the name line is missing from this extract): maps a logical file offset to the in-memory record, or NULL if no available batch holds it
+ size_t offset) // logical file offset of the record
+ if (offset >= currentOffset && offset < currentOffset + currentBufferBytes) { // inside the current batch
+ return (BAMAlignment*) (currentBuffer + (offset - currentOffset));
+ }
+ for (int i = -2; ; i--) { // otherwise try batches -2, -3, ... until the writer has no more
+ char* buffer;
+ size_t bufferFileOffset, bufferUsed; // logical
+ if (! currentWriter->getBatch(i, &buffer, NULL, NULL, NULL, &bufferUsed, &bufferFileOffset)) {
+ break;
+ }
+ if (offset >= bufferFileOffset && offset < bufferFileOffset + bufferUsed) {
+ return (BAMAlignment*) (buffer + (offset - bufferFileOffset));
+ }
+ }
+ return NULL;
+ BAMAlignment* // BAMFilter::getNextRead (the name line is missing from this extract)
+ BAMAlignment* bam,
+ size_t* io_offset)
+ // Returns the record that follows `bam`, or NULL when there is none in the available batches.
+ // io_offset, when non-NULL, holds the logical file offset of `bam` on entry and is advanced
+ // by the size of `bam`. The declaration defaults this pointer to NULL, so it must not be
+ // dereferenced unconditionally (it previously was).
+ char* p = (char*) bam;
+ size_t size = bam->size();
+ if (io_offset != NULL) {
+ *io_offset += size;
+ }
+ if (p >= currentBuffer && p < currentBuffer + currentBufferBytes) {
+ p += size;
+ if (p >= currentBuffer + currentBufferBytes) {
+ return NULL;
+ }
+ _ASSERT(io_offset == NULL || *io_offset == currentOffset + (p - currentBuffer));
+ _ASSERT(((BAMAlignment*)p)->refID >= -1);
+ return (BAMAlignment*) p;
+ }
+ // not in the current batch: look for the batch that contains `bam`
+ for (int i = -2; ; i--) {
+ char* buffer;
+ size_t bufferOffset, bufferUsed; // logical
+ if (! currentWriter->getBatch(i, &buffer, NULL, NULL, NULL, &bufferUsed, &bufferOffset)) {
+ break;
+ }
+ if (p >= buffer && p < buffer+ bufferUsed) {
+ p += size;
+ _ASSERT(io_offset == NULL || *io_offset == bufferOffset + (p - buffer));
+ // crossing into another batch: resolve the next record by its file offset
+ size_t nextOffset = io_offset != NULL ? *io_offset : bufferOffset + (p - buffer);
+ return p < buffer + bufferUsed? (BAMAlignment*) p : getRead(nextOffset);
+ }
+ }
+ return NULL;
+ BAMAlignment* // BAMFilter::tryFindRead (the name line is missing from this extract): scans records in [offset, endOffset) for one whose name matches `id`; NULL if none
+ size_t offset, // logical file offset to start at
+ size_t endOffset, // scanning stops once the offset reaches this value
+ const char* id, // read name to look for
+ size_t* o_offset) // optional; receives the file offset of the match
+ BAMAlignment* bam = getRead(offset);
+ while (bam != NULL && offset < endOffset) {
+ if (readIdsMatch(bam->read_name(), id)) {
+ if (o_offset != NULL) {
+ *o_offset = offset;
+ }
+ return bam;
+ }
+ bam = getNextRead(bam, &offset); // also advances offset past the record just examined
+ }
+ return NULL;
+struct DuplicateReadKey // key built from a read's and its mate's locations and orientations; brace lines are missing from this extract
+ DuplicateReadKey()
+ { memset(this, 0, sizeof(DuplicateReadKey)); }
+ DuplicateReadKey(const BAMAlignment* bam, const Genome* genome)
+ {
+ if (bam == NULL) {
+ locations[0] = locations[1] = UINT32_MAX;
+ isRC[0] = isRC[1] = false;
+ } else {
+ locations[0] = bam->getLocation(genome);
+ locations[1] = bam->getNextLocation(genome);
+ isRC[0] = (bam->FLAG & SAM_REVERSE_COMPLEMENT) != 0;
+ isRC[1] = (bam->FLAG & SAM_NEXT_REVERSED) != 0;
+ if (((((_uint64) GenomeLocationAsInt64(locations[0])) << 1) | (isRC[0] ? 1 : 0)) > ((((_uint64) GenomeLocationAsInt64(locations[1])) << 1) | (isRC[1] ? 1 : 0))) { // order the two ends by (location, isRC) so both mates of a pair produce the same key
+ const GenomeLocation t = locations[1];
+ locations[1] = locations[0];
+ locations[0] = t;
+ const bool f = isRC[1];
+ isRC[1] = isRC[0];
+ isRC[0] = f;
+ }
+ }
+ }
+ bool operator==(const DuplicateReadKey& b) const
+ {
+ return locations[0] == b.locations[0] && locations[1] == b.locations[1] &&
+ isRC[0] == b.isRC[0] && isRC[1] == b.isRC[1];
+ }
+ bool operator!=(const DuplicateReadKey& b) const
+ {
+ return ! ((*this) == b);
+ }
+ bool operator<(const DuplicateReadKey& b) const // lexicographic: locations[0], then locations[1], then the two RC flags
+ {
+ return locations[0] < b.locations[0] ||
+ (locations[0] == b.locations[0] &&
+ (locations[1] < b.locations[1] ||
+ (locations[1] == b.locations[1] &&
+ isRC[0] * 2 + isRC[1] < b.isRC[0] *2 + b.isRC[1])));
+ }
+ // required for use as a key in VariableSizeMap template
+ DuplicateReadKey(int x)
+ { locations[0] = locations[1] = x; isRC[0] = isRC[1] = false; }
+ bool operator==(int x) const // compares only the two locations against x; the RC flags are ignored
+ { return locations[0] == (_uint32) x && locations[1] == (_uint32) x; }
+ bool operator!=(int x) const
+ { return locations[0] != (_uint32) x || locations[1] != (_uint32) x; }
+ operator _uint64() // numeric form: locations[1] (xor its RC bit) shifted left 32 bits, or'ed with locations[0] (xor its RC bit)
+ { return ((_uint64) (GenomeLocationAsInt64(locations[1]) ^ (isRC[1] ? 1 : 0))) << 32 | (_uint64) (GenomeLocationAsInt64(locations[0]) ^ (isRC[0] ? 1 : 0)); }
+ GenomeLocation locations[2];
+ bool isRC[2];
+struct DuplicateMateInfo
+ // Per-key bookkeeping for duplicate marking: where the run of candidate duplicates is
+ // and which read(s) are currently considered best. All fields start zeroed.
+ DuplicateMateInfo() { memset(this, 0, sizeof(DuplicateMateInfo)); }
+ size_t firstRunOffset; // first read in duplicate set
+ size_t firstRunEndOffset;
+ size_t bestReadOffset[4]; // file offsets of first/second/new first/old second best reads
+ int bestReadQuality[2]; // total quality of first/both best reads
+ char bestReadId[120];
+ // strncpy does not terminate when the source fills the buffer, so copy at most
+ // sizeof - 1 characters and terminate explicitly; longer ids are truncated.
+ void setBestReadId(const char* id) { strncpy(bestReadId, id, sizeof(bestReadId) - 1); bestReadId[sizeof(bestReadId) - 1] = 0; }
+ const char* getBestReadId() { return bestReadId; }
+class BAMDupMarkFilter : public BAMFilter // BAMFilter that tracks runs of reads at the same location for duplicate marking; brace/access lines are missing from this extract
+ BAMDupMarkFilter(const Genome* i_genome) :
+ BAMFilter(DataWriter::ModifyFilter),
+ genome(i_genome), runOffset(0), runLocation(UINT32_MAX), runCount(0), mates()
+ {}
+ ~BAMDupMarkFilter()
+ {
+ if (mates.size() > 0) { // entries still in the map at destruction are reported as unmatched
+ WriteErrorMessage("duplicate matching ended with %d unmatched reads:\n", mates.size());
+ for (MateMap::iterator i = mates.begin(); i != mates.end(); i = mates.next(i)) {
+ WriteErrorMessage("%u%s/%u%s\n", i->key.locations[0], i->key.isRC[0] ? "rc" : "", i->key.locations[1], i->key.isRC[1] ? "rc" : "");
+ }
+ }
+ }
+ static bool isDuplicate(const BAMAlignment* a, const BAMAlignment* b)
+ { return a->pos == b->pos && a->refID == b->refID &&
+ virtual void onRead(BAMAlignment* bam, size_t fileOffset, int batchIndex);
+ static int getTotalQuality(BAMAlignment* bam);
+ const Genome* genome;
+ size_t runOffset; // offset in file of first read in run
+ GenomeLocation runLocation; // location in genome
+ int runCount; // number of aligned reads
+ typedef VariableSizeMap<DuplicateReadKey,DuplicateMateInfo,150,MapNumericHash<DuplicateReadKey>,70,0,-2> MateMap;
+ static const _uint64 RunKey = 0xffffffffc0000000UL; // mask selecting the sort key part of a run entry: upper 32 bits plus the two RC flag bits
+ static const _uint64 RunRC = 0x80000000; // flag bit 31 of a run entry
+ static const _uint64 RunNextRC = 0x40000000; // flag bit 30 of a run entry
+ static const _uint64 RunOffset = 0x3fffffff; // low 30 bits of a run entry: offset of the record relative to runOffset
+ typedef VariableSizeVector<_uint64> RunVector;
+ RunVector run; // packed entries for the reads of the run being analyzed
+ MateMap mates;
+ // Handles one read. Consecutive reads that share a logical location (the read's own
+ // location, or the mate's location when the read itself has none) form a run. When a
+ // read at a different location arrives, the finished run is analysed: entries with the
+ // same RunKey are treated as duplicates of one another, the one with the highest total
+ // quality is kept, and the others get SAM_DUPLICATE (unmapped reads are never marked).
+ //   lastBam    - the read being handled
+ //   lastOffset - file offset of lastBam
+ //   (unnamed)  - batch index, not used
+ // Fixes in this version:
+ //   - the failed-backpatch fixup used "FLAG |= ~SAM_DUPLICATE", which set every flag bit
+ //     except SAM_DUPLICATE; it now marks the true best and un-marks the first-pass best,
+ //     as the comment in that section describes
+ //   - the in-memory backpatch fetched newBest but never updated it; it now marks oldBest
+ //     and un-marks newBest
+ //   - failedBackpatch was allocated with new and never freed
+ void
+BAMDupMarkFilter::onRead(BAMAlignment* lastBam, size_t lastOffset, int)
+ if ((lastBam->FLAG & SAM_SECONDARY) != 0) {
+ return; // ignore secondary alignments; todo: mark them as dups too?
+ }
+ GenomeLocation location = lastBam->getLocation(genome);
+ GenomeLocation nextLocation = lastBam->getNextLocation(genome);
+ GenomeLocation logicalLocation = location != InvalidGenomeLocation ? location : nextLocation;
+ if (logicalLocation == UINT32_MAX) {
+ return;
+ }
+ if (logicalLocation == runLocation) {
+ runCount++;
+ } else {
+ // if there was more than one read with same location, then analyze the run
+ if (runCount > 1) {
+ // partition by duplicate key, find best read in each partition
+ size_t offset = runOffset;
+ run.clear();
+ // sort run by other coordinate & RC flags to get sub-runs
+ for (BAMAlignment* record = getRead(offset); record != NULL && record != lastBam; record = getNextRead(record, &offset)) {
+ // use opposite of logical location to sort records
+ _uint64 entry = record->getLocation(genome) == UINT32_MAX
+ ? (((_uint64) UINT32_MAX) << 32) |
+ ((record->FLAG & SAM_REVERSE_COMPLEMENT) ? RunNextRC : 0) |
+ ((record->FLAG & SAM_NEXT_REVERSED) ? RunRC : 0)
+ : (((_uint64) GenomeLocationAsInt64(record->getNextLocation(genome))) << 32) |
+ ((record->FLAG & SAM_REVERSE_COMPLEMENT) ? RunRC : 0) |
+ ((record->FLAG & SAM_NEXT_REVERSED) ? RunNextRC : 0);
+ entry |= (_uint64) ((offset - runOffset) & RunOffset);
+ _ASSERT(offset - runOffset <= RunOffset);
+ run.push_back(entry);
+ }
+ if (run.size() == 0) {
+ goto done; // todo: handle runs > n buffers (but should be rare!)
+ }
+ // ensure that adjacent half-mapped pairs stay together
+ std::stable_sort(run.begin(), run.end());
+ bool foundRun = false;
+ for (RunVector::iterator i = run.begin(); i != run.end(); i++) {
+ // skip singletons
+ if ((i == run.begin() || (*i & RunKey) != (*(i-1) & RunKey)) &&
+ (i + 1 == run.end() || (*i & RunKey) != (*(i+1) & RunKey))) {
+ continue;
+ }
+ offset = runOffset + (*i & RunOffset);
+ BAMAlignment* record = getRead(offset);
+ _ASSERT(record->refID >= -1 && record->refID < genome->getNumContigs()); // simple sanity check
+ // skip adjacent half-mapped pairs, they're not really runs
+ if (i + 1 < run.end() && readIdsMatch(record->read_name(), getRead(runOffset + (*(i+1) & RunOffset))->read_name())) {
+ i++;
+ continue;
+ }
+ foundRun = true;
+ DuplicateReadKey key(record, genome);
+ MateMap::iterator f = mates.find(key);
+ DuplicateMateInfo* info;
+ if (f == mates.end()) {
+ mates.put(key, DuplicateMateInfo());
+ info = &mates[key];
+ //fprintf(stderr, "add %u%s/%u%s -> %d\n", key.locations[0], key.isRC[0] ? "rc" : "", key.locations[1], key.isRC[1] ? "rc" : "", mates.size());
+ info->firstRunOffset = runOffset;
+ info->firstRunEndOffset = lastOffset;
+ } else {
+ info = &f->value;
+ }
+ int totalQuality = getTotalQuality(record);
+ size_t mateOffset = 0;
+ BAMAlignment* mate = NULL;
+ // optimize case for half-mapped pairs with adjacent reads
+ if ((record->FLAG & SAM_MULTI_SEGMENT) != 0) {
+ mate = tryFindRead(info->firstRunOffset, info->firstRunEndOffset, record->read_name(), &mateOffset);
+ if (mate == record) {
+ mate = NULL;
+ }
+ }
+ bool isSecond = mate != NULL;
+ if (isSecond) {
+ totalQuality += getTotalQuality(mate);
+ }
+ if (totalQuality > info->bestReadQuality[isSecond]) {
+ info->bestReadQuality[isSecond] = totalQuality;
+ info->bestReadOffset[isSecond] = offset;
+ if (isSecond) {
+ info->bestReadOffset[2] = mateOffset;
+ }
+ info->setBestReadId(record->read_name());
+ }
+ // NOTE(review): the stored id may just have been overwritten by the block above, in which
+ // case this records the current read; confirm that is the intended meaning of slot 3
+ if (isSecond && readIdsMatch(info->getBestReadId(), record->read_name())) {
+ info->bestReadOffset[3] = offset;
+ }
+ }
+ if (! foundRun) {
+ goto done; // avoid useless looping
+ }
+ // go back and adjust flags
+ offset = runOffset;
+ VariableSizeVector<DuplicateMateInfo*>* failedBackpatch = NULL;
+ for (RunVector::iterator i = run.begin(); i != run.end(); i++) {
+ // skip singletons
+ if ((i == run.begin() || (*i & RunKey) != (*(i-1) & RunKey)) &&
+ (i + 1 == run.end() || (*i & RunKey) != (*(i+1) & RunKey))) {
+ continue;
+ }
+ offset = runOffset + (*i & RunOffset);
+ BAMAlignment* record = getRead(offset);
+ if (i + 1 < run.end() && readIdsMatch(record->read_name(), getRead(runOffset + (*(i+1) & RunOffset))->read_name())) {
+ i++;
+ continue;
+ }
+ DuplicateReadKey key(record, genome);
+ MateMap::iterator m = mates.find(key);
+ if (m == mates.end()) {
+ continue; // one end in a run, other not
+ }
+ DuplicateMateInfo* minfo = &m->value;
+ bool pass = minfo->bestReadQuality[1] != 0; // 1 for second pass, 0 for first pass
+ bool isSecond = minfo->firstRunOffset != runOffset;
+ static const int index[2][2] = {{0, 3}, {2, 1}};
+ if (offset != minfo->bestReadOffset[index[pass][isSecond]]) {
+ // Picard markDuplicates will not mark unmapped reads
+ if ((record->FLAG & SAM_UNMAPPED) == 0) {
+ record->FLAG |= SAM_DUPLICATE;
+ }
+ } else if (pass == 1 && minfo->bestReadOffset[2] != 0 && minfo->bestReadOffset[0] != 0 && minfo->bestReadOffset[2] != minfo->bestReadOffset[0]) {
+ // backpatch reads in first matelist if they're still in memory
+ BAMAlignment* oldBest = getRead(minfo->bestReadOffset[0]);
+ BAMAlignment* newBest = getRead(minfo->bestReadOffset[2]);
+ if (oldBest != NULL && newBest != NULL) {
+ // the read kept by the first pass is no longer the best: mark it (same
+ // unmapped exception as above) and keep the mate of the best pair instead
+ if ((oldBest->FLAG & SAM_UNMAPPED) == 0) {
+ oldBest->FLAG |= SAM_DUPLICATE;
+ }
+ newBest->FLAG &= ~SAM_DUPLICATE;
+ } else {
+ if (failedBackpatch == NULL) {
+ failedBackpatch = new VariableSizeVector<DuplicateMateInfo*>();
+ }
+ failedBackpatch->push_back(minfo);
+ }
+ }
+ }
+ // fixup any that failed
+ if (failedBackpatch != NULL) {
+ for (VariableSizeVector<DuplicateMateInfo*>::iterator i = failedBackpatch->begin(); i != failedBackpatch->end(); i++) {
+ // couldn't go back and patch first set to have correct best for second set
+ // so patch second set to have same best as first set even though it's not really the best
+ BAMAlignment* trueBestSecond = getRead((*i)->bestReadOffset[1]);
+ BAMAlignment* firstBestSecond = getRead((*i)->bestReadOffset[3]);
+ _ASSERT(trueBestSecond != NULL && firstBestSecond != NULL);
+ if (trueBestSecond != NULL && firstBestSecond != NULL) {
+ // mark first, then clear: if both offsets name the same read it ends up kept
+ if ((trueBestSecond->FLAG & SAM_UNMAPPED) == 0) {
+ trueBestSecond->FLAG |= SAM_DUPLICATE;
+ }
+ firstBestSecond->FLAG &= ~SAM_DUPLICATE;
+ }
+ }
+ // the vector only holds borrowed pointers into the mates map; free the vector itself
+ delete failedBackpatch;
+ failedBackpatch = NULL;
+ }
+ // clean up
+ for (RunVector::iterator i = run.begin(); i != run.end(); i++) {
+ // skip singletons
+ if ((i == run.begin() || (*i & RunKey) != (*(i-1) & RunKey)) &&
+ (i + 1 == run.end() || (*i & RunKey) != (*(i+1) & RunKey))) {
+ continue;
+ }
+ offset = runOffset + (*i & RunOffset);
+ BAMAlignment* record = getRead(offset);
+ if (i + 1 < run.end() && readIdsMatch(record->read_name(), getRead(runOffset + (*(i+1) & RunOffset))->read_name())) {
+ i++;
+ continue;
+ }
+ DuplicateReadKey key(record, genome);
+ MateMap::iterator m = mates.find(key);
+ if (m != mates.end() && m->value.firstRunOffset != runOffset) {
+ mates.erase(key);
+ //fprintf(stderr, "erase %u%s/%u%s -> %d\n", key.locations[0], key.isRC[0] ? "rc" : "", key.locations[1], key.isRC[1] ? "rc" : "", mates.size());
+ }
+ }
+ }
+ runLocation = logicalLocation;
+ runOffset = lastOffset;
+ runCount = 1;
+ }
+ // todo: preserve this across batches - need to block-copy entire memory for reads
+ // Returns the sum of the l_seq quality bytes of a read; a byte equal to 255 contributes 0.
+ // NOTE(review): presumably the body of BAMDupMarkFilter::getTotalQuality (declared as
+ // "static int getTotalQuality(BAMAlignment* bam)"); the qualified-name line is not present here.
+ int
+ BAMAlignment* bam)
+ int result = 0;
+ _uint8* p = (_uint8*) bam->qual();
+ for (int i = 0; i < bam->l_seq; i++) {
+ int q = *p++;
+ // multiply by the comparison result (0 or 1) instead of branching on q == 255
+ result += (q != 255) * q; // avoid branch?
+ }
+ return result;
+// Filter supplier whose getFilter() creates a new BAMDupMarkFilter for the given genome.
+// NOTE(review): this supplier passes DataWriter::ReadFilter to its base class while
+// BAMDupMarkFilter passes DataWriter::ModifyFilter to BAMFilter; confirm the mismatch is intended.
+class BAMDupMarkSupplier : public DataWriter::FilterSupplier
+ BAMDupMarkSupplier(const Genome* i_genome) :
+ FilterSupplier(DataWriter::ReadFilter), genome(i_genome) {}
+ // each call allocates a new filter with new; ownership passes to the caller
+ virtual DataWriter::Filter* getFilter()
+ { return new BAMDupMarkFilter(genome); }
+ // no work is needed when the writer closes
+ virtual void onClosing(DataWriterSupplier* supplier) {}
+ virtual void onClosed(DataWriterSupplier* supplier) {}
+ const Genome* genome;
+ // Factory: returns a newly allocated BAMDupMarkSupplier for the given genome.
+ DataWriter::FilterSupplier*
+DataWriterSupplier::markDuplicates(const Genome* genome)
+ return new BAMDupMarkSupplier(genome);
+// forward declaration: BAMIndexFilter holds a pointer to its supplier
+class BAMIndexSupplier;
+// Read filter that holds a pointer to the BAMIndexSupplier that created it.
+class BAMIndexFilter : public BAMFilter
+ BAMIndexFilter(BAMIndexSupplier* i_supplier)
+ : BAMFilter(DataWriter::ReadFilter), supplier(i_supplier) {}
+ virtual void onRead(BAMAlignment* bam, size_t fileOffset, int batchIndex);
+ BAMIndexSupplier* supplier;
+// Collects, per reference sequence, the bin chunks and linear-index intervals of the reads
+// passed to onRead(), and writes them to indexFileName as a BAI file in onClosed().
+class BAMIndexSupplier : public DataWriter::FilterSupplier
+ BAMIndexSupplier(const char* i_indexFileName, const Genome* i_genome, GzipWriterFilterSupplier* i_gzipSupplier) :
+ FilterSupplier(DataWriter::ReadFilter),
+ indexFileName(i_indexFileName),
+ genome(i_genome),
+ gzipSupplier(i_gzipSupplier),
+ lastRefId(-1),
+ // firstBamStart is now initialised as well, so no member starts out indeterminate
+ lastBin(0), binStart(0), firstBamStart(0), lastBamEnd(0)
+ {
+ // one RefInfo per contig; no array at all when there is no genome
+ refs = genome ? new RefInfo[genome->getNumContigs()] : NULL;
+ readCounts[0] = readCounts[1] = 0;
+ }
+ // every filter created here points back at this supplier
+ virtual DataWriter::Filter* getFilter()
+ { return new BAMIndexFilter(this); }
+ virtual void onClosing(DataWriterSupplier* supplier) {}
+ virtual void onClosed(DataWriterSupplier* supplier);
+ friend class BAMIndexFilter;
+ // a [start, end) range of file offsets
+ struct BAMChunk {
+ BAMChunk() : start(0), end(0) {}
+ BAMChunk(const BAMChunk& a) : start(a.start), end(a.end) {}
+ _uint64 start, end;
+ };
+ typedef VariableSizeVector<BAMChunk> ChunkVec;
+ typedef VariableSizeMap<_uint32,ChunkVec,150,MapNumericHash<_uint32>,80,-1,-2> BinMap;
+ typedef VariableSizeVector<_uint64> LinearMap;
+ // index data for one reference sequence
+ struct RefInfo {
+ BinMap bins; // bin number -> chunks
+ LinearMap intervals; // one file offset per 16384-base window (see addInterval)
+ };
+ RefInfo* getRefInfo(int refId);
+ void onRead(BAMAlignment* bam, size_t fileOffset, int batchIndex);
+ void addChunk(int refId, _uint32 bin, _uint64 start, _uint64 end);
+ void addInterval(int refId, int begin, int end, _uint64 fileOffset);
+ const char* indexFileName;
+ const Genome* genome;
+ int lastRefId; // refID of the previous read, -1 before the first one
+ _uint32 lastBin; // bin of the previous read
+ _uint64 binStart; // file offset where the current bin's chunk starts
+ _uint64 firstBamStart; // file offset of the first read of the current reference
+ _uint64 lastBamEnd; // file offset just past the previous read
+ _uint64 readCounts[2]; // mapped, unmapped
+ RefInfo* refs;
+ GzipWriterFilterSupplier* gzipSupplier;
+ // Forwards the read to the owning supplier, which accumulates the index data.
+ // NOTE(review): presumably the body of BAMIndexFilter::onRead; the qualified-name line is not present here.
+ void
+ BAMAlignment* bam,
+ size_t fileOffset,
+ int batchIndex)
+ supplier->onRead(bam, fileOffset, batchIndex);
+ // Factory: returns a newly allocated BAMIndexSupplier built from the three arguments.
+ // NOTE(review): the line naming this function is not present here.
+ DataWriter::FilterSupplier*
+ const char* indexFileName,
+ const Genome* genome,
+ GzipWriterFilterSupplier* gzipSupplier)
+ return new BAMIndexSupplier(indexFileName, genome, gzipSupplier);
+ // Updates the index state for one read located at fileOffset.
+ // NOTE(review): presumably the body of BAMIndexSupplier::onRead; the qualified-name line is not present here.
+ void
+ BAMAlignment* bam,
+ size_t fileOffset,
+ int batchIndex)
+ //fprintf(stderr, "index onRead %d:%d+%d @ %lld %d\n", bam->refID, bam->pos, bam->l_ref(), fileOffset, batchIndex);
+ if (bam->refID != lastRefId) {
+ // reference changed: store the previous reference's two metadata chunks
+ // (file range, then mapped/unmapped read counts) under BAM_EXTRA_BIN
+ if (lastRefId != -1) {
+ addChunk(lastRefId, BAMAlignment::BAM_EXTRA_BIN, firstBamStart, lastBamEnd);
+ addChunk(lastRefId, BAMAlignment::BAM_EXTRA_BIN, readCounts[0], readCounts[1]);
+ readCounts[0] = readCounts[1] = 0;
+ }
+ firstBamStart = fileOffset;
+ }
+ readCounts[(bam->FLAG & SAM_UNMAPPED) ? 1 : 0]++;
+ // a new reference or bin closes the chunk of the previous bin at this read's offset
+ // (addChunk ignores the call when lastRefId has no RefInfo, e.g. -1)
+ if (bam->refID != lastRefId || bam->bin != lastBin || lastRefId == -1) {
+ addChunk(lastRefId, lastBin, binStart, fileOffset);
+ lastBin = bam->bin;
+ lastRefId = bam->refID;
+ binStart = fileOffset;
+ }
+ // only mapped reads contribute to the linear index
+ if (! (bam->FLAG & SAM_UNMAPPED)) {
+ _ASSERT(bam->pos != -1 && bam->refID != -1);
+ addInterval(bam->refID, bam->pos, bam->pos + bam->l_ref() - 1, fileOffset);
+ }
+ lastBamEnd = fileOffset + bam->size();
+ // Flushes the pending chunks for the last reference and writes the index file:
+ // magic "BAI\1", n_ref, then for each reference its bins (bin number, n_chunk, chunks)
+ // followed by its linear-index offsets. Chunk and interval offsets go through
+ // gzipSupplier->toVirtualOffset(); the second BAM_EXTRA_BIN chunk holds raw read counts.
+ // If the index file cannot be opened, an error is reported and nothing is written
+ // (previously a NULL FILE* was passed to fwrite/fclose). A NULL genome yields n_ref = 0.
+ // NOTE(review): presumably the body of BAMIndexSupplier::onClosed; the qualified-name line is not present here.
+ void
+ DataWriterSupplier* supplier)
+ // add final chunk
+ if (lastRefId != -1) {
+ addChunk(lastRefId, lastBin, binStart, lastBamEnd);
+ addChunk(lastRefId, BAMAlignment::BAM_EXTRA_BIN, firstBamStart, lastBamEnd);
+ addChunk(lastRefId, BAMAlignment::BAM_EXTRA_BIN, readCounts[0], readCounts[1]);
+ }
+ // write out index file
+ FILE* index = fopen(indexFileName, "wb");
+ if (index == NULL) {
+ WriteErrorMessage("Unable to open BAM index file '%s' for writing\n", indexFileName);
+ return;
+ }
+ char magic[4] = {'B', 'A', 'I', 1};
+ fwrite(magic, sizeof(magic), 1, index);
+ // the constructor accepts a NULL genome (refs is NULL then), so do not dereference it blindly
+ _int32 n_ref = genome != NULL ? genome->getNumContigs() : 0;
+ fwrite(&n_ref, sizeof(n_ref), 1, index);
+ for (int i = 0; i < n_ref; i++) {
+ RefInfo* info = getRefInfo(i);
+ _int32 n_bin, n_intv;
+ if (info == NULL) {
+ // empty entry: zero bins and zero intervals
+ n_bin = 0;
+ fwrite(&n_bin, sizeof(n_bin), 1, index);
+ n_intv = 0;
+ fwrite(&n_intv, sizeof(n_intv), 1, index);
+ continue;
+ }
+ n_bin = info->bins.size();
+ fwrite(&n_bin, sizeof(n_bin), 1, index);
+ for (BinMap::iterator j = info->bins.begin(); j != info->bins.end(); j = info->bins.next(j)) {
+ _uint32 bin = j->key;
+ fwrite(&bin, sizeof(bin), 1, index);
+ _int32 n_chunk = (_int32) j->value.size();
+ fwrite(&n_chunk, sizeof(n_chunk), 1, index);
+ if (bin != BAMAlignment::BAM_EXTRA_BIN) {
+ for (ChunkVec::iterator k = j->value.begin(); k != j->value.end(); k++) {
+ _uint64 chunk[2] = {gzipSupplier->toVirtualOffset(k->start), gzipSupplier->toVirtualOffset(k->end)};
+ fwrite(&chunk, sizeof(chunk), 1, index);
+ }
+ } else {
+ // NOTE(review): exactly two chunks are written here while n_chunk above is the
+ // vector's size; this assumes the extra bin holds exactly two entries - confirm
+ _uint64 chunk[2] = {gzipSupplier->toVirtualOffset(j->value[0].start), gzipSupplier->toVirtualOffset(j->value[0].end)};
+ fwrite(&chunk, sizeof(chunk), 1, index);
+ // second entry carries the mapped/unmapped counts, written without offset translation
+ chunk[0] = j->value[1].start;
+ chunk[1] = j->value[1].end;
+ fwrite(&chunk, sizeof(chunk), 1, index);
+ }
+ }
+ n_intv = (_int32) info->intervals.size();
+ fwrite(&n_intv, sizeof(n_intv), 1, index);
+ for (LinearMap::iterator m = info->intervals.begin(); m != info->intervals.end(); m++) {
+ _uint64 ioffset = gzipSupplier->toVirtualOffset(*m);
+ fwrite(&ioffset, sizeof(ioffset), 1, index);
+ }
+ }
+ fclose(index);
+ // Returns the RefInfo for refId, or NULL when refId is outside [0, number of contigs).
+ // refs is NULL exactly when the supplier was built without a genome; checking it first
+ // avoids dereferencing the NULL genome in that case.
+ BAMIndexSupplier::RefInfo*
+ int refId)
+ return refs != NULL && refId >= 0 && refId < genome->getNumContigs() ? &refs[refId] : NULL;
+ // Appends the chunk [start, end] to the chunk list of `bin` for reference refId,
+ // creating the list if the bin has none yet. Does nothing when refId has no RefInfo.
+ // NOTE(review): presumably the body of BAMIndexSupplier::addChunk; the qualified-name line is not present here.
+ void
+ int refId,
+ _uint32 bin,
+ _uint64 start,
+ _uint64 end)
+ RefInfo* info = getRefInfo(refId);
+ if (info == NULL) {
+ return;
+ }
+ ChunkVec* chunks = info->bins.tryFind(bin);
+ if (chunks == NULL) {
+ // first chunk for this bin: insert an empty list and get a pointer to the stored copy
+ ChunkVec empty;
+ info->bins.tryAdd(bin, empty, &chunks);
+ }
+ BAMChunk chunk;
+ chunk.start = start;
+ chunk.end = end;
+ chunks->push_back(chunk);
+ // Extends the linear index of reference refId so that the 16384-base window containing
+ // `end` has an entry. Windows skipped on the way are filled with UINT64_MAX and the new
+ // last window gets fileOffset; existing entries are never changed. `begin` is not used.
+ // NOTE(review): presumably the body of BAMIndexSupplier::addInterval; the qualified-name line is not present here.
+ void
+ int refId,
+ int begin,
+ int end,
+ _uint64 fileOffset)
+ RefInfo* info = getRefInfo(refId);
+ if (info == NULL) {
+ return;
+ }
+ int slot = end <= 0 ? 0 : ((end - 1) / 16384);
+ if (slot >= info->intervals.size()) {
+ for (_int64 i = info->intervals.size(); i < slot; i++) {
+ info->intervals.push_back(UINT64_MAX);
+ }
+ info->intervals.push_back(fileOffset);
+ }
+ // Checks that `buffer` consists of a whole number of well-formed BGZF blocks.
+ //   buffer - start of the compressed data
+ //   bytes  - number of bytes in buffer
+ // Returns true only if every block passes the per-block validate() and the blocks end
+ // exactly at buffer + bytes. Unlike the previous version, no header field is read unless
+ // the fixed header fits in the remaining bytes, and ISIZE() (which reads the last four
+ // bytes of the block) is called only once the whole block is known to lie inside the buffer.
+ bool
+BgzfHeader::validate(char* buffer, size_t bytes)
+ char* p;
+ char* end = buffer + bytes;
+ for (p = buffer; p < end; ) {
+ if ((size_t) (end - p) < sizeof(BgzfHeader)) {
+ return false; // truncated header
+ }
+ BgzfHeader* h = (BgzfHeader*) p;
+ unsigned bsize = h->BSIZE() + 1;
+ // block must hold at least the header plus the trailing ISIZE word, and must not run past the buffer
+ if (bsize < sizeof(BgzfHeader) + sizeof(_uint32) || bsize > (size_t) (end - p)) {
+ return false;
+ }
+ unsigned isize = h->ISIZE();
+ if (bsize > BAM_BLOCK || isize > BAM_BLOCK ||
+ bsize > max(2 * isize, isize+1000) || ! h->validate(bsize, isize)) {
+ return false;
+ }
+ p += bsize;
+ }
+ return p == end;
+ // Per-block check: the gzip magic bytes, CM == 8, FLG == 4, zero MTIME/XFL/OS, and the
+ // sizes stored in the block (ISIZE, BSIZE + 1) must equal the expected uncompressed and
+ // compressed sizes.
+ // NOTE(review): presumably the body of BgzfHeader::validate(size_t, size_t); the qualified-name line is not present here.
+ bool
+ size_t compressed,
+ size_t uncompressed)
+ return ID1 == 0x1f && ID2 == 0x8b && CM == 8 && FLG == 4 &&
+ MTIME == 0 && XFL == 0 && OS == 0 &&
+ ISIZE() == uncompressed&&
+ BSIZE() + 1 == compressed;
diff --git a/SNAPLib/Bam.h b/SNAPLib/Bam.h
new file mode 100644
index 0000000..438753f
--- /dev/null
+++ b/SNAPLib/Bam.h
@@ -0,0 +1,439 @@
+Module Name:
+ Bam.h
+ Binary Alignment Map (BAM) file writer.
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+#pragma once
+#include "Compat.h"
+#include "LandauVishkin.h"
+#include "PairedEndAligner.h"
+#include "VariableSizeVector.h"
+#include "BufferedAsync.h"
+#include "SAM.h"
+#include "Read.h"
+#include "DataReader.h"
+// for debugging file I/O, validate BAM records on input & output
+//#define VALIDATE_BAM
+// BAM format layout
+// SAM Format Specification v1.4-r985
+// header information in each BGZF compression block
+#define BAM_BLOCK 65536
+#pragma pack(push, 1)
+struct BAMHeaderRefSeq;
+// Overlay for the start of a BAM file: magic and l_text are real fields; the header text,
+// n_ref and the reference records follow in memory and are reached by pointer arithmetic.
+struct BAMHeader
+ static const _uint32 BAM_MAGIC = 0x014d4142; // 'BAM\1'
+ _uint32 magic;
+ _int32 l_text;
+ // header text starts right after the two fixed fields
+ char* text() // not necessarily null terminated
+ { return sizeof(magic) + sizeof(l_text) + (char*) this; }
+ // n_ref is stored immediately after the l_text bytes of text
+ _int32& n_ref()
+ { return * (_int32*) (l_text + text()); }
+ // first reference record starts after n_ref
+ BAMHeaderRefSeq* firstRefSeq()
+ { return (BAMHeaderRefSeq*) (size(l_text) + (char*) this); }
+ BAMHeader()
+ : magic(BAM_MAGIC), l_text(0)
+ {}
+ size_t size()
+ { return size(l_text); }
+ // bytes required for a given text length, not including reference sequence blocks
+ static size_t size(_int32 ltext)
+ { return sizeof(BAMHeader) + sizeof(_int32)/*n_ref*/ + ltext; }
+// header information for each reference sequence record
+// layout in memory: l_name, then l_name bytes of name, then a 32-bit l_ref
+struct BAMHeaderRefSeq
+ _int32 l_name;
+ // name starts 4 bytes in, i.e. right after l_name
+ char* name()
+ { return 4 + (char*) this; }
+ // l_ref is stored right after the name bytes
+ _int32& l_ref()
+ { return * (_int32*) (l_name + name()); }
+ // the next record starts size(l_name) bytes after this one
+ BAMHeaderRefSeq* next()
+ { return (BAMHeaderRefSeq*) (size(l_name) + (char*) this); }
+ // total record size for a given name length (the parameter shadows the member here)
+ static size_t size(_int32 l_name)
+ { return sizeof(l_name) + sizeof(_int32)/*l_ref*/ + l_name; }
+// information for each alignment record
+// The fixed fields below are followed in memory by the variable-length parts, in this
+// order: read name, cigar ops, packed sequence, qualities, auxiliary fields. The accessor
+// methods compute the address of each part from the lengths stored in the fixed fields.
+struct BAMAlignAux;
+struct BAMAlignment
+ _int32 block_size;
+ _int32 refID;
+ _int32 pos;
+ _uint8 l_read_name;
+ _uint8 MAPQ;
+ _uint16 bin;
+ _uint16 n_cigar_op;
+ _uint16 FLAG;
+ _int32 l_seq;
+ _int32 next_refID;
+ _int32 next_pos;
+ _int32 tlen;
+ // read name starts right after tlen, the last fixed field
+ char* read_name()
+ { return sizeof(tlen) + (char*) &this->tlen; }
+ // cigar starts after l_read_name bytes of name
+ _uint32* cigar()
+ { return (_uint32*) (l_read_name + read_name()); }
+ // sequence starts after n_cigar_op 32-bit cigar entries
+ _uint8* seq()
+ { return (_uint8*) (n_cigar_op + cigar()); }
+ // qualities start after (l_seq + 1) / 2 bytes of sequence (two bases per byte)
+ char* qual()
+ { return (char*) ((l_seq + 1) / 2 + seq()); }
+ // auxiliary data starts after l_seq quality bytes
+ BAMAlignAux* firstAux()
+ { return (BAMAlignAux*) (l_seq + qual()); }
+ unsigned auxLen()
+ { return (unsigned) (size() - ((char*) firstAux() - (char*) this)); }
+ BAMAlignAux* endAux()
+ { return (BAMAlignAux*) (auxLen() + (char*) firstAux()); }
+ // total record size: block_size does not count the block_size field itself
+ size_t size()
+ { return block_size + sizeof(block_size); }
+ static size_t size(unsigned l_read_name, unsigned n_cigar_op, unsigned l_seq, unsigned l_aux)
+ { return sizeof(BAMAlignment) + l_read_name + n_cigar_op * sizeof(_uint32) + (l_seq + 1) / 2 + l_seq + l_aux; }
+ // conversions
+ static const char* CodeToSeq;
+ static const char* CodeToSeqRC;
+ static _uint16 CodeToSeqPair[256];
+ static _uint16 CodeToSeqPairRC[256];
+ static _uint8 SeqToCode[256];
+ static const char* CodeToCigar;
+ static _uint8 CigarToCode[256];
+ static _uint8 CigarCodeToRefBase[9];
+ // a cigar entry keeps the op code in its low 4 bits and the count in the remaining bits
+ static int GetCigarOpCode(_uint32 op) { return op & 0xf; }
+ static int GetCigarOpCount(_uint32 op) { return op >> 4; }
+ static void decodeSeq(char* o_sequence, const _uint8* nibbles, int bases);
+ static void decodeQual(char* o_qual, char* quality, int bases);
+ static void decodeSeqRC(char* o_sequence, const _uint8* nibbles, int bases);
+ static void decodeQualRC(char* o_qual, char* quality, int bases);
+ static bool decodeCigar(char* o_cigar, int cigarSize, _uint32* cigar, int ops);
+ static void getClippingFromCigar(_uint32 *cigar, int ops, unsigned *o_frontClipping, unsigned *o_backClipping, unsigned *o_frontHardClipping, unsigned *o_backHardClipping);
+ static void encodeSeq(_uint8* nibbles, char* ascii, int length);
+ int l_ref(); // length of reference aligned to read
+ class _init { public: _init(); };
+ static _init _init_;
+ // binning
+ static const _uint32 BAM_EXTRA_BIN = 37450; // extra bin for metadata
+ /* calculate bin given an alignment covering [beg,end) (zero-based, half-close-half-open) */
+ static int reg2bin(int beg, int end);
+ /* calculate the list of bins that may overlap with region [beg,end) (zero-based) */
+ static const int MAX_BIN = (((1<<18)-1)/7);
+ static int reg2bins(int beg, int end, _uint16* list/*[MAX_BIN]*/);
+ // absolute genome locations
+ // both return UINT32_MAX when there is no genome, the position or contig id is out of
+ // range, or the corresponding "unmapped" flag is set
+ GenomeLocation getLocation(const Genome* genome) const
+ { return genome == NULL || pos < 0 || refID < 0 || refID >= genome->getNumContigs() || (FLAG & SAM_UNMAPPED)
+ ? UINT32_MAX : (genome->getContigs()[refID].beginningLocation + pos); }
+ // now applies the same genome == NULL and contig-range checks as getLocation, so a bad
+ // next_refID can no longer index past the contig array
+ GenomeLocation getNextLocation(const Genome* genome) const
+ { return genome == NULL || next_pos < 0 || next_refID < 0 || next_refID >= genome->getNumContigs() || (FLAG & SAM_NEXT_UNMAPPED)
+ ? UINT32_MAX : (genome->getContigs()[next_refID].beginningLocation + next_pos); }
+ void validate();
+ inline void validate() {} // inline noop
+// one-character type codes used in the val_type byte of an auxiliary field
+#define INT8_VAL_TYPE 'c'
+#define UINT8_VAL_TYPE 'C'
+#define INT16_VAL_TYPE 's'
+#define UINT16_VAL_TYPE 'S'
+#define INT32_VAL_TYPE 'i'
+#define UINT32_VAL_TYPE 'I'
+#define FLOAT_VAL_TYPE 'f'
+#define CHAR_VAL_TYPE 'A'
+#define STRING_VAL_TYPE 'Z'
+#define HEX_VAL_TYPE 'H'
+#define ARRAY_VAL_TYPE 'B'
+// header for each auxiliary data field
+// layout: 2-byte tag, 1-byte val_type, then the value. For arrays the value is a 1-byte
+// element type, a 32-bit count, and the elements.
+struct BAMAlignAux
+ char tag[2];
+ char val_type;
+ // accessors for single-valued fields
+ void* value()
+ { return 3 + (char*) this; }
+ _int8 int8Value()
+ { _ASSERT(val_type == INT8_VAL_TYPE); return * (_int8*) value(); }
+ _uint8 uint8Value()
+ { _ASSERT(val_type == UINT8_VAL_TYPE); return * (_uint8*) value(); }
+ _int16 int16Value()
+ { _ASSERT(val_type == INT16_VAL_TYPE); return * (_int16*) value(); }
+ _uint16 uint16Value()
+ { _ASSERT(val_type == UINT16_VAL_TYPE); return * (_uint16*) value(); }
+ _int32 int32Value()
+ { _ASSERT(val_type == INT32_VAL_TYPE); return * (_int32*) value(); }
+ _uint32 uint32Value()
+ { _ASSERT(val_type == UINT32_VAL_TYPE); return * (_uint32*) value(); }
+ float floatValue()
+ { _ASSERT(val_type == FLOAT_VAL_TYPE); return * (float*) value(); }
+ // accessors for array fields
+ _int32 count()
+ { _ASSERT(val_type == ARRAY_VAL_TYPE); return * (_uint32*) (1 + (char*) value()); }
+ char arrayValType()
+ { _ASSERT(val_type == ARRAY_VAL_TYPE); return * (char*) value(); }
+ void* data()
+ { _ASSERT(val_type == ARRAY_VAL_TYPE); return 5 + (char*) value(); }
+ _int8* int8Array()
+ { _ASSERT(arrayValType() == INT8_VAL_TYPE); return (_int8*) data(); }
+ _uint8* uint8Array()
+ { _ASSERT(arrayValType() == UINT8_VAL_TYPE); return (_uint8*) data(); }
+ _int16* int16Array()
+ { _ASSERT(arrayValType() == INT16_VAL_TYPE); return (_int16*) data(); }
+ _uint16* uint16Array()
+ { _ASSERT(arrayValType() == UINT16_VAL_TYPE); return (_uint16*) data(); }
+ _int32* int32Array()
+ { _ASSERT(arrayValType() == INT32_VAL_TYPE); return (_int32*) data(); }
+ _uint32* uint32Array()
+ { _ASSERT(arrayValType() == UINT32_VAL_TYPE); return (_uint32*) data(); }
+ float* floatArray()
+ { _ASSERT(arrayValType() == FLOAT_VAL_TYPE); return (float*) data(); }
+ // compute overall size
+ // strings ('Z') and hex strings ('H') are both NUL-terminated text, so both are measured
+ // with strlen: 3 header bytes + characters + terminator
+ size_t size()
+ {
+ return val_type == STRING_VAL_TYPE || val_type == HEX_VAL_TYPE ? strlen((const char*) value()) + 4
+ : val_type == ARRAY_VAL_TYPE ? size(arrayValType(), count())
+ : size(val_type);
+ }
+ static size_t size(char val_type)
+ { return 3 + valueSize(val_type); }
+ // 8 = tag (2) + val_type (1) + element type (1) + count (4)
+ static size_t size(char array_val_type, _uint32 count)
+ { return 8 + valueSize(array_val_type) * (size_t) count; }
+ // size in bytes of one fixed-width value of the given type; float ('f', 4 bytes) and
+ // char ('A', 1 byte) are now handled instead of falling into the asserting default
+ static size_t valueSize(char val_type)
+ {
+ switch (val_type) {
+ case INT8_VAL_TYPE:
+ case UINT8_VAL_TYPE:
+ case CHAR_VAL_TYPE:
+ return 1;
+ case INT16_VAL_TYPE:
+ case UINT16_VAL_TYPE:
+ return 2;
+ case INT32_VAL_TYPE:
+ case UINT32_VAL_TYPE:
+ case FLOAT_VAL_TYPE:
+ return 4;
+ case HEX_VAL_TYPE:
+ default:
+ // variable-length or unknown types have no fixed value size
+ // todo: remove? log?
+ _ASSERT(false);
+ return 1;
+ }
+ }
+ BAMAlignAux* next()
+ { return (BAMAlignAux*) (size() + (char*) this); }
+// One subfield of the gzip "extra" area: two id bytes, a 16-bit payload length, then the payload.
+struct BgzfExtra
+ _uint8 SI1;
+ _uint8 SI2;
+ _uint16 SLEN;
+ // payload starts right after this 4-byte subfield header
+ void* data()
+ { return this + 1; }
+ // the next subfield starts SLEN bytes after the payload start
+ BgzfExtra* nextExtra()
+ { return (BgzfExtra*) (SLEN + (char*) data()); }
+// Overlay for the fixed gzip header at the start of a BGZF block; the extra subfields
+// follow XLEN in memory.
+struct BgzfHeader
+ _uint8 ID1; // 0x1f
+ _uint8 ID2; // 0x8b (magic numbers)
+ _uint8 CM; // Compression method (8 == deflate)
+ _uint8 FLG; // flags
+ _uint32 MTIME; // Mod time
+ _uint8 XFL; // extra flags
+ _uint8 OS; // operating system
+ _uint16 XLEN; // extra length
+ // first subfield starts right after XLEN
+ BgzfExtra* firstExtra()
+ { return (BgzfExtra*) (sizeof(_uint16) + (char*) &this->XLEN); }
+ // Scans the XLEN bytes of extra subfields for the one with ids 66, 67 and returns its
+ // 16-bit payload; asserts and returns 0 if no such subfield is found.
+ _uint16 BSIZE()
+ {
+ for (BgzfExtra* x = firstExtra(); (char*) x < XLEN + (char*) firstExtra(); x = x->nextExtra()) {
+ if (x->SI1 == 66 && x->SI2 == 67) {
+ _ASSERT(x->SLEN == 2);
+ return * (_uint16*) x->data();
+ }
+ }
+ _ASSERT(false);
+ return 0;
+ }
+ // Reads the 32-bit value stored BSIZE() - 3 bytes after the start of the header; since
+ // callers treat BSIZE() + 1 as the block length, that is the last four bytes of the block.
+ _uint32 ISIZE()
+ {
+ _uint32 result = * (_uint32*) (BSIZE() - 3 + (char*) this);
+ _ASSERT(result <= BAM_BLOCK);
+ return result;
+ }
+ // checks one block against expected sizes
+ bool validate(size_t compressed, size_t uncompressed);
+ // checks a buffer holding a sequence of blocks
+ static bool validate(char* buffer, size_t bytes);
+#pragma pack(pop)
+// Reader for BAM input that implements both the single-read (ReadReader) and the
+// paired-read (PairedReadReader) interfaces on top of a DataReader.
+class BAMReader : public PairedReadReader, public ReadReader {
+ BAMReader(const ReaderContext& i_context);
+ virtual ~BAMReader();
+ void init(const char *fileName, int bufferCount, _int64 startingOffset, _int64 amountOfFileToProcess);
+ // read only; no alignment information is returned
+ virtual bool getNextRead(Read *readToUpdate)
+ {
+ return getNextRead(readToUpdate, NULL, NULL, NULL, NULL, NULL, false, NULL);
+ }
+ // read plus alignment information; forwards with ignoreEndOfRange = false
+ virtual bool getNextRead(Read *read, AlignmentResult *alignmentResult, GenomeLocation *genomeLocation, bool *isRC, unsigned *mapQ,
+ unsigned *flag, const char **cigar)
+ {
+ return getNextRead(read,alignmentResult,genomeLocation,isRC,mapQ,flag,false,cigar);
+ }
+ //
+ // In getNextReadPair mapQ points to an array of two unsigneds.
+ //
+ virtual bool getNextReadPair(Read *read1, Read *read2, PairedAlignmentResult *alignmentResult, unsigned *mapQ, const char **cigar);
+ //
+ // The PairedReadReader version of getNextReadPair, which throws away the alignment, mapQ and cigar values.
+ //
+ bool getNextReadPair(Read *read1, Read *read2)
+ {
+ return getNextReadPair(read1,read2,NULL,NULL,NULL);
+ }
+ // batch hold/release is delegated to the underlying DataReader
+ void holdBatch(DataBatch batch)
+ { data->holdBatch(batch); }
+ bool releaseBatch(DataBatch batch)
+ { return data->releaseBatch(batch); }
+ // NOTE(review): if ReadReader::getContext is virtual, the call through the cast pointer
+ // dispatches back to this override; confirm, or call ReadReader::getContext() explicitly
+ virtual ReaderContext* getContext()
+ { return ((ReadReader*)this)->getContext(); }
+ static BAMReader* create(const char *fileName, int bufferCount,
+ _int64 startingOffset, _int64 amountOfFileToProcess,
+ const ReaderContext& context);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ static ReadSupplierGenerator *createReadSupplierGenerator(const char *fileName, int numThreads, const ReaderContext& context);
+ static PairedReadSupplierGenerator *createPairedReadSupplierGenerator(const char *fileName, int numThreads, bool quicklyDropUnmatchedReads,
+ const ReaderContext& context, int matchBufferSize = 5000);
+ static const int MAX_SEQ_LENGTH;
+ static const int MAX_RECORD_LENGTH;
+ // full form that the two shorter getNextRead overloads forward to
+ virtual bool getNextRead(Read *read, AlignmentResult *alignmentResult,
+ GenomeLocation *genomeLocation, bool *isRC, unsigned *mapQ, unsigned *flag, bool ignoreEndOfRange, const char **cigar);
+ void getReadFromLine(const Genome *genome, char *line, char *endOfBuffer, Read *read, AlignmentResult *alignmentResult,
+ GenomeLocation *genomeLocation, bool *isRC, unsigned *mapQ,
+ size_t *lineLength, unsigned *flag, const char **cigar, ReadClippingType clipping);
+ void readHeader(const char* fileName);
+ char* getExtra(_int64 bytes);
+ DataReader* data;
+ //unsigned n_ref; // number of reference sequences
+ //unsigned* refOffset; // array mapping ref sequence ID to contig location
+ _int64 extraOffset; // offset into extra data
diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp
new file mode 100644
index 0000000..eb9fb4c
--- /dev/null
+++ b/SNAPLib/BaseAligner.cpp
@@ -0,0 +1,1583 @@
+Module Name:
+ BaseAligner.cpp
+ SNAP genome aligner
+ Bill Bolosky, August, 2011
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#include "stdafx.h"
+#include "BaseAligner.h"
+#include "Compat.h"
+#include "LandauVishkin.h"
+#include "BigAlloc.h"
+#include "mapq.h"
+#include "SeedSequencer.h"
+#include "exit.h"
+#include "AlignerOptions.h"
+#include "Error.h"
+using std::min;
+#ifdef TRACE_ALIGNER // If you turn this on, then stdout writing won't work.
+#define TRACE printf
+#define TRACE(...) {}
+ GenomeIndex *i_genomeIndex,
+ unsigned i_maxHitsToConsider,
+ unsigned i_maxK,
+ unsigned i_maxReadSize,
+ unsigned i_maxSeedsToUseFromCommandLine,
+ double i_maxSeedCoverage,
+ unsigned i_minWeightToCheck,
+ unsigned i_extraSearchDepth,
+ bool i_noUkkonen,
+ bool i_noOrderedEvaluation,
+ bool i_noTruncation,
+ int i_maxSecondaryAlignmentsPerContig,
+ LandauVishkin<1>*i_landauVishkin,
+ LandauVishkin<-1>*i_reverseLandauVishkin,
+ AlignerStats *i_stats,
+ BigAllocator *allocator) :
+ genomeIndex(i_genomeIndex), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK),
+ maxReadSize(i_maxReadSize), maxSeedsToUseFromCommandLine(i_maxSeedsToUseFromCommandLine),
+ maxSeedCoverage(i_maxSeedCoverage), readId(-1), extraSearchDepth(i_extraSearchDepth),
+ explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats),
+ noUkkonen(i_noUkkonen), noOrderedEvaluation(i_noOrderedEvaluation), noTruncation(i_noTruncation),
+ minWeightToCheck(max(1u, i_minWeightToCheck)), maxSecondaryAlignmentsPerContig(i_maxSecondaryAlignmentsPerContig)
+Routine Description:
+ Constructor for the BaseAligner class. Aligners align reads against an indexed genome.
+ i_genomeIndex - The index against which to do the alignments
+ i_maxHitsToConsider - The maximum number of hits to use from a seed lookup. Any lookups that return more
+ than this are ignored.
+ i_maxK - The largest string difference to consider for any comparison.
+ i_maxReadSize - Bound on the number of bases in any read. There's no reason to make it tight, it just affects a little memory allocation.
+ i_maxSeedsToUseFromCommandLine - The maximum number of seeds to use when aligning any read (not counting ones ignored because they resulted in too many
+ hits). Once we've looked up this many seeds, we just score what we've got. Zero means derive the limit from i_maxSeedCoverage instead.
+ i_maxSeedCoverage - The maximum number of seeds to use expressed as readSize/seedSize
+ i_extraSearchDepth - How deeply beyond bestScore do we search?
+ i_noUkkonen - Don't use Ukkonen's algorithm (i.e., don't reduce the max edit distance depth as we score candidates)
+ i_noOrderedEvaluation - Don't order evaluating the reads by the hit count in order to drive down the max edit distance more quickly
+ i_noTruncation - Don't truncate searches based on count of disjoint seed misses
+ i_maxSecondaryAlignmentsPerContig - Maximum secondary alignments per contig; -1 means don't limit this
+ i_landauVishkin - an externally supplied LandauVishkin string edit distance object. This is useful if we're expecting repeated computations and use the LV cache.
+ i_reverseLandauVishkin - the same for the reverse direction.
+ i_stats - an object into which we report out statistics
+ allocator - an allocator that's used to allocate our local memory. This is useful for TLB optimization. If this is supplied, the caller
+ is responsible for deallocation, we'll not deallocate any dynamic memory in our destructor.
+ --*/
+ // Remember whether the caller supplied the allocator.
+ hadBigAllocator = (NULL != allocator);
+ // Cache what we need from the index.
+ genome = genomeIndex->getGenome();
+ seedLen = genomeIndex->getSeedLength();
+ doesGenomeIndexHave64BitLocations = genomeIndex->doesGenomeIndexHave64BitLocations();
+ // All of the running counters start at zero.
+ nIndelsMerged = 0;
+ nReadsIgnoredBecauseOfTooManyNs = 0;
+ nHitsIgnoredBecauseOfTooHighPopularity = 0;
+ nLocationsScored = 0;
+ nHashTableLookups = 0;
+ probDistance = new ProbabilityDistance(SNP_PROB, GAP_OPEN_PROB, GAP_EXTEND_PROB); // Match Mason
+ //
+ // The two Landau-Vishkin objects come as a pair: either the caller supplies both, or we create both.
+ //
+ const bool callerGaveForward = (NULL != i_landauVishkin);
+ const bool callerGaveReverse = (NULL != i_reverseLandauVishkin);
+ if (callerGaveForward != callerGaveReverse) {
+ WriteErrorMessage("Must supply both or neither of forward & reverse Landau-Vishkin objects. You tried exactly one.\n");
+ soft_exit(1);
+ }
+ if (callerGaveForward) {
+ // Borrow the caller's objects; we don't own them.
+ landauVishkin = i_landauVishkin;
+ reverseLandauVishkin = i_reverseLandauVishkin;
+ ownLandauVishkin = false;
+ } else {
+ // Create our own pair, from the supplied allocator when there is one.
+ if (NULL != allocator) {
+ landauVishkin = new (allocator) LandauVishkin<>;
+ reverseLandauVishkin = new (allocator) LandauVishkin<-1>;
+ } else {
+ landauVishkin = new LandauVishkin<>;
+ reverseLandauVishkin = new LandauVishkin<-1>;
+ }
+ ownLandauVishkin = true;
+ }
+ //
+ // Seed budget: an explicit command-line count wins; otherwise derive it from the coverage target
+ // and the largest read we may be asked to align.
+ //
+ unsigned maxSeedsToUse;
+ if (0 == maxSeedsToUseFromCommandLine) {
+ maxSeedsToUse = (int)(maxSeedCoverage * maxReadSize / genomeIndex->getSeedLength());
+ } else {
+ maxSeedsToUse = maxSeedsToUseFromCommandLine;
+ }
+ // One weight list per possible seed-hit count, 0 through maxSeedsToUse.
+ numWeightLists = 1 + maxSeedsToUse;
+ // Sizes scale with (hits per seed) x (seeds): x1.5 for hash table slack, x2 for RC.
+ candidateHashTablesSize = (3 * maxHitsToConsider * maxSeedsToUse) / 2;
+ hashTableElementPoolSize = 2 * maxHitsToConsider * maxSeedsToUse;
+ //
+ // Scratch space for the reverse complement of the read: the bases go in the first maxReadSize
+ // bytes and the quality string in the second maxReadSize bytes of the same buffer.
+ //
+ if (allocator) {
+ rcReadData = (char *)allocator->allocate(sizeof(char) * maxReadSize * 2); // The *2 is to allocate space for the quality string
+ } else {
+ rcReadData = (char *)BigAlloc(sizeof(char) * maxReadSize * 2); // The *2 is to allocate space for the quality string
+ }
+ rcReadQuality = rcReadData + maxReadSize;
+ //
+ // rcReadData used to be re-assigned here by an unconditional BigAlloc(maxReadSize). That discarded the only
+ // pointer to the buffer allocated above (rcReadQuality still pointed into it), ignored a caller-supplied
+ // allocator, and left the data and quality halves in two different buffers. The buffer above is already
+ // big enough for both, so the second allocation is gone.
+ //
+ if (allocator) {
+ reversedRead[FORWARD] = (char *)allocator->allocate(sizeof(char) * maxReadSize * 4 + 2 * MAX_K); // Times 4 to also hold RC version and genome data (+2MAX_K is for genome data)
+ } else {
+ reversedRead[FORWARD] = (char *)BigAlloc(sizeof(char) * maxReadSize * 4 + 2 * MAX_K); // Times 4 to also hold RC version and genome data (+2MAX_K is for genome data)
+ }
+ reversedRead[RC] = reversedRead[FORWARD] + maxReadSize;
+ // For reverse complementing, treat everything but ACTG like N.
+ for (unsigned i = 0; i < 256; i++) {
+ nTable[i] = 1;
+ rcTranslationTable[i] = 'N';
+ }
+ rcTranslationTable['A'] = 'T';
+ rcTranslationTable['G'] = 'C';
+ rcTranslationTable['C'] = 'G';
+ rcTranslationTable['T'] = 'A';
+ rcTranslationTable['N'] = 'N';
+ // nTable is then cleared again, so in the end only a literal 'N' counts as an N.
+ memset(nTable, 0, sizeof(nTable));
+ nTable['N'] = 1;
+ //
+ // One bit per possible seed start, rounded up to whole bytes, with 128 extra bits of slack
+ // so that the vector extends beyond both ends of the bits we actually use.
+ //
+ const size_t seedUsedBytes = (sizeof(BYTE) * (maxReadSize + 7 + 128) / 8);
+ seedUsedAsAllocated = (NULL != allocator) ? (BYTE *)allocator->allocate(seedUsedBytes) : (BYTE *)BigAlloc(seedUsedBytes); // Save the pointer for the delete.
+ seedUsed = seedUsedAsAllocated + 8; // This moves the pointer up an _int64, so we now have the appropriate before buffer.
+ nUsedHashTableElements = 0;
+ //
+ // Work out the byte counts once, then take every buffer from the same source: the caller's
+ // allocator if there is one, BigAlloc otherwise. The order of the requests is kept.
+ //
+ const size_t anchorTableBytes = sizeof(HashTableAnchor) * candidateHashTablesSize;
+ const size_t weightListBytes = sizeof(HashTableElement) * numWeightLists;
+ const size_t elementPoolBytes = sizeof(HashTableElement) * hashTableElementPoolSize;
+ const size_t depthCountBytes = sizeof(*hitCountByExtraSearchDepth) * extraSearchDepth;
+ if (NULL == allocator) {
+ candidateHashTable[FORWARD] = (HashTableAnchor *)BigAlloc(anchorTableBytes);
+ candidateHashTable[RC] = (HashTableAnchor *)BigAlloc(anchorTableBytes);
+ weightLists = (HashTableElement *)BigAlloc(weightListBytes);
+ hashTableElementPool = (HashTableElement *)BigAlloc(elementPoolBytes);
+ hitCountByExtraSearchDepth = (unsigned *)BigAlloc(depthCountBytes);
+ } else {
+ candidateHashTable[FORWARD] = (HashTableAnchor *)allocator->allocate(anchorTableBytes);
+ candidateHashTable[RC] = (HashTableAnchor *)allocator->allocate(anchorTableBytes);
+ weightLists = (HashTableElement *)allocator->allocate(weightListBytes);
+ hashTableElementPool = (HashTableElement *)allocator->allocate(elementPoolBytes); // Allocate last, because it's biggest and usually unused. This puts all of the commonly used stuff into one large page.
+ hitCountByExtraSearchDepth = (unsigned *)allocator->allocate(depthCountBytes);
+ }
+ // The per-contig counters exist only when a positive per-contig limit was requested.
+ if (maxSecondaryAlignmentsPerContig > 0) {
+ if (NULL == allocator) {
+ hitsPerContigCounts = (HitsPerContigCounts *)BigAlloc(sizeof(*hitsPerContigCounts) * genome->getNumContigs());
+ } else {
+ hitsPerContigCounts = (HitsPerContigCounts *)allocator->allocate(sizeof(*hitsPerContigCounts) * genome->getNumContigs());
+ }
+ memset(hitsPerContigCounts, 0, sizeof(*hitsPerContigCounts) * genome->getNumContigs());
+ } else {
+ hitsPerContigCounts = NULL;
+ }
+ for (unsigned i = 0; i < hashTableElementPoolSize; i++) {
+ hashTableElementPool[i].init();
+ }
+ for (unsigned i = 0; i < maxSeedsToUse + 1; i++) {
+ weightLists[i].init();
+ }
+ for (Direction rc = 0; rc < NUM_DIRECTIONS; rc++) {
+ memset(candidateHashTable[rc],0,sizeof(HashTableAnchor) * candidateHashTablesSize);
+ }
+ hashTableEpoch = 0;
+#ifdef _DEBUG
+bool _DumpAlignments = false;
+#endif // _DEBUG
+ void
+ Read *inputRead,
+ SingleAlignmentResult *primaryResult,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ int maxSecondaryResults,
+ SingleAlignmentResult *secondaryResults // The caller passes in a buffer of secondaryResultBufferSize and it's filled in by AlignRead()
+ )
+Routine Description:
+ Align a particular read, possibly constraining the search around a given location.
+ read - the read to align
+ primaryResult - the best alignment result found
+ maxEditDistanceForSecondaryResults - How much worse than the primary result should we look?
+ secondaryResultBufferSize - the size of the secondaryResults buffer. If provided, it must be at least maxK * maxSeeds * 2.
+ nSecondaryResults - returns the number of secondary results found
+ maxSecondaryResults - limit the number of secondary results to this
+ secondaryResults - returns the secondary results
+Return Value:
+ None; the routine is declared void, so the old note "true if there was enough space in secondaryResults, false otherwise" no longer applies.
+ // Reset the per-read bookkeeping before looking at the new read.
+ memset(hitCountByExtraSearchDepth, 0, sizeof(*hitCountByExtraSearchDepth) * extraSearchDepth);
+ if (NULL != nSecondaryResults) {
+ *nSecondaryResults = 0;
+ }
+ firstPassSeedsNotSkipped[FORWARD] = firstPassSeedsNotSkipped[RC] = 0;
+ //
+ // "Nothing skipped yet" sentinel for the running minimum kept when popular seeds are skipped.
+ // This was 0x8fffffffffffffff, which has the sign bit set: stored in a signed 64-bit value it is
+ // negative, so no hit count could ever be smaller. 0x7fffffffffffffff is the largest positive
+ // signed 64-bit value and works as an upper bound whether the array is signed or unsigned.
+ // NOTE(review): the declaration of smallestSkippedSeed isn't visible here -- confirm its type.
+ //
+ smallestSkippedSeed[FORWARD] = smallestSkippedSeed[RC] = 0x7fffffffffffffff;
+ highestWeightListChecked = 0;
+ unsigned maxSeedsToUse;
+ if (0 != maxSeedsToUseFromCommandLine) {
+ maxSeedsToUse = maxSeedsToUseFromCommandLine;
+ } else {
+ maxSeedsToUse = (int)(2 * maxSeedCoverage * inputRead->getDataLength() / genomeIndex->getSeedLength()); // 2x is for FORWARD/RC
+ }
+ primaryResult->location = InvalidGenomeLocation; // Value to return if we don't find a location.
+ primaryResult->direction = FORWARD; // So we deterministically print the read forward in this case.
+ primaryResult->score = UnusedScoreValue;
+ primaryResult->status = NotFound;
+ unsigned lookupsThisRun = 0;
+ popularSeedsSkipped = 0;
+ //
+ // A bitvector for used seeds, indexed on the starting location of the seed within the read.
+ //
+ if (inputRead->getDataLength() > maxReadSize) {
+ WriteErrorMessage("BaseAligner:: got too big read (%d > %d)\n"
+ "Increase MAX_READ_LENGTH at the beginning of Read.h and recompile\n", inputRead->getDataLength(), maxReadSize);
+ soft_exit(1);
+ }
+ if ((int)inputRead->getDataLength() < seedLen) {
+ //
+ // Too short to have any seeds, it's hopeless.
+ // No need to finalize secondary results, since we don't have any.
+ //
+ return;
+ }
+ printf("Aligning read '%.*s':\n%.*s\n%.*s\n", inputRead->getIdLength(), inputRead->getId(), inputRead->getDataLength(), inputRead->getData(),
+ inputRead->getDataLength(), inputRead->getQuality());
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("BaseAligner: aligning read ID '%.*s', data '%.*s'\n", inputRead->getIdLength(), inputRead->getId(), inputRead->getDataLength(), inputRead->getData());
+ }
+#endif // _DEBUG
+ //
+ // Clear out the seed used array.
+ //
+ memset(seedUsed, 0, (inputRead->getDataLength() + 7) / 8);
+ //
+ // Walk the read once, filling in the reverse complement (bases and quality), the reversed
+ // forward read, the un-reversed complement, and the count of N bases.
+ //
+ unsigned readLen = inputRead->getDataLength();
+ const char *readData = inputRead->getData();
+ const char *readQuality = inputRead->getQuality();
+ unsigned countOfNs = 0;
+ for (unsigned i = 0; i < readLen; i++) {
+ //
+ // The lookup tables are filled for indices 0..255, so index them with an unsigned byte.
+ // Where plain char is signed, a byte >= 0x80 would otherwise be a negative index.
+ //
+ unsigned char baseByte = (unsigned char)readData[i];
+ char complement = rcTranslationTable[baseByte];
+ rcReadData[readLen - i - 1] = complement;
+ rcReadQuality[readLen - i - 1] = readQuality[i];
+ reversedRead[FORWARD][readLen - i - 1] = (char)baseByte;
+ reversedRead[RC][i] = complement;
+ countOfNs += nTable[baseByte];
+ }
+ if (countOfNs > maxK) {
+ nReadsIgnoredBecauseOfTooManyNs++;
+ // No need to finalize secondary results, since we don't have any.
+ return;
+ }
+ //
+ // Block off any seeds that would contain an N.
+ //
+ if (countOfNs > 0) {
+ int minSeedToConsiderNing = 0; // In English, any word can be verbed. Including, apparently, "N."
+ for (int i = 0; i < (int) readLen; i++) {
+ if (BASE_VALUE[readData[i]] > 3) {
+ int limit = __min(i + seedLen - 1, readLen-1);
+ for (int j = __max(minSeedToConsiderNing, i - (int) seedLen + 1); j <= limit; j++) {
+ SetSeedUsed(j);
+ }
+ minSeedToConsiderNing = limit+1;
+ if (minSeedToConsiderNing >= (int) readLen)
+ break;
+ }
+ }
+ }
+ Read reverseComplimentRead;
+ Read *read[NUM_DIRECTIONS];
+ read[FORWARD] = inputRead;
+ read[RC] = &reverseComplimentRead;
+ read[RC]->init(NULL, 0, rcReadData, rcReadQuality, readLen);
+ clearCandidates();
+ //
+ // Initialize the bases table, which represents which bases we've checked.
+ // We have readSize - seeds size + 1 possible seeds.
+ //
+ unsigned nPossibleSeeds = readLen - seedLen + 1;
+ TRACE("nPossibleSeeds: %d\n", nPossibleSeeds);
+ unsigned nextSeedToTest = 0;
+ unsigned wrapCount = 0;
+ lowestPossibleScoreOfAnyUnseenLocation[FORWARD] = lowestPossibleScoreOfAnyUnseenLocation[RC] = 0;
+ mostSeedsContainingAnyParticularBase[FORWARD] = mostSeedsContainingAnyParticularBase[RC] = 1; // Instead of tracking this for real, we're just conservative and use wrapCount+1. It's faster.
+ bestScore = UnusedScoreValue;
+ secondBestScore = UnusedScoreValue;
+ nSeedsApplied[FORWARD] = nSeedsApplied[RC] = 0;
+ lvScores = 0;
+ lvScoresAfterBestFound = 0;
+ probabilityOfAllCandidates = 0.0;
+ probabilityOfBestCandidate = 0.0;
+ scoreLimit = maxK + extraSearchDepth; // For MAPQ computation
+ while (nSeedsApplied[FORWARD] + nSeedsApplied[RC] < maxSeedsToUse) {
+ //
+ // Choose the next seed to use. Choose the first one that isn't used
+ //
+ if (nextSeedToTest >= nPossibleSeeds) {
+ //
+ // We're wrapping. We want to space the seeds out as much as possible, so if we had
+ // a seed length of 20 we'd want to take 0, 10, 5, 15, 2, 7, 12, 17. To make the computation
+ // fast, we use a table lookup.
+ //
+ wrapCount++;
+ if (wrapCount >= seedLen) {
+ //
+ // We tried all possible seeds without matching or even getting enough seeds to
+ // exceed our seed count. Do the best we can with what we have.
+ //
+ // This message goes to stderr, so it needs fprintf; printf(stderr, ...) passed the FILE * as the format string.
+ fprintf(stderr, "Calling score with force=true because we wrapped around enough\n");
+ score(
+ true,
+ read,
+ primaryResult,
+ maxEditDistanceForSecondaryResults,
+ secondaryResultBufferSize,
+ nSecondaryResults,
+ secondaryResults);
+#ifdef _DEBUG
+ if (_DumpAlignments) printf("\tFinal result score %d MAPQ %d (%e probability of best candidate, %e probability of all candidates) at %u\n",
+ primaryResult->score, primaryResult->mapq, probabilityOfBestCandidate, probabilityOfAllCandidates, primaryResult->location);
+#endif // _DEBUG
+ finalizeSecondaryResults(*primaryResult, nSecondaryResults, secondaryResults, maxSecondaryResults, maxEditDistanceForSecondaryResults, bestScore);
+ return;
+ }
+ // Start the next pass at the offset chosen for this wrap count.
+ nextSeedToTest = GetWrappedNextSeedToTest(seedLen, wrapCount);
+ mostSeedsContainingAnyParticularBase[FORWARD] = mostSeedsContainingAnyParticularBase[RC] = wrapCount + 1;
+ }
+ while (nextSeedToTest < nPossibleSeeds && IsSeedUsed(nextSeedToTest)) {
+ //
+ // This seed is already used. Try the next one.
+ //
+ TRACE("Skipping due to IsSeedUsed\n");
+ nextSeedToTest++;
+ }
+ if (nextSeedToTest >= nPossibleSeeds) {
+ //
+ // Unusable seeds have pushed us past the end of the read. Go back around the outer loop so we wrap properly.
+ //
+ TRACE("Eek, we're past the end of the read\n");
+ continue;
+ }
+ SetSeedUsed(nextSeedToTest);
+ if (!Seed::DoesTextRepresentASeed(read[FORWARD]->getData() + nextSeedToTest, seedLen)) {
+ continue;
+ }
+ Seed seed(read[FORWARD]->getData() + nextSeedToTest, seedLen);
+ _int64 nHits[NUM_DIRECTIONS]; // Number of times this seed hits in the genome
+ const GenomeLocation *hits[NUM_DIRECTIONS]; // The actual hits (of size nHits)
+ GenomeLocation singletonHits[NUM_DIRECTIONS]; // Storage for single hits (this is required for 64 bit genome indices, since they might use fewer than 8 bytes internally)
+ const unsigned *hits32[NUM_DIRECTIONS];
+ if (doesGenomeIndexHave64BitLocations) {
+ genomeIndex->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], &singletonHits[FORWARD], &singletonHits[RC]);
+ } else {
+ genomeIndex->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]);
+ }
+ nHashTableLookups++;
+ lookupsThisRun++;
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("\tSeed offset %2d, %4d hits, %4d rcHits.", nextSeedToTest, nHits[0], nHits[1]);
+ for (int rc = 0; rc < 2; rc++) {
+ for (unsigned i = 0; i < __min(nHits[rc], 5); i++) {
+ printf(" %sHit at %9llu.", rc == 1 ? "RC " : "", doesGenomeIndexHave64BitLocations ? hits[rc][i] : (_int64)hits32[rc][i]);
+ }
+ }
+ printf("\n");
+ }
+#endif // _DEBUG
+ //
+ // Trace output for the seed we just looked up. The hit counts are 64-bit, so they're printed with %lld.
+ // The locations come from hits[] for 64-bit indices and from hits32[] otherwise, since only one of
+ // the two is filled in by the lookup above.
+ //
+ printf("Looked up seed %.*s (offset %u): hits=%lld, rchits=%lld\n",
+ (int)seedLen, inputRead->getData() + nextSeedToTest, nextSeedToTest, (long long)nHits[0], (long long)nHits[1]);
+ for (int rc = 0; rc < 2; rc++) {
+ if (nHits[rc] <= maxHitsToConsider) {
+ printf("%sHits:", rc == 1 ? "RC " : "");
+ for (_int64 i = 0; i < nHits[rc]; i++) {
+ printf(" %lld", doesGenomeIndexHave64BitLocations ? (long long)GenomeLocationAsInt64(hits[rc][i]) : (long long)hits32[rc][i]);
+ }
+ printf("\n");
+ }
+ }
+ bool appliedEitherSeed = false;
+ for (Direction direction = 0; direction < NUM_DIRECTIONS; direction++) {
+ if (nHits[direction] > maxHitsToConsider && !explorePopularSeeds) {
+ //
+ // This seed is matching too many places. Just pretend we never looked and keep going.
+ //
+ nHitsIgnoredBecauseOfTooHighPopularity++;
+ popularSeedsSkipped++;
+ smallestSkippedSeed[direction] = __min(nHits[direction], smallestSkippedSeed[direction]);
+ } else {
+ if (0 == wrapCount) {
+ firstPassSeedsNotSkipped[direction]++;
+ }
+ //
+ // Update the candidates list with any hits from this seed. If lowest possible score of any unseen location is
+ // more than best_score + confDiff then we know that if this location is newly seen then its location won't ever be a
+ // winner, and we can ignore it.
+ //
+ unsigned offset;
+ if (direction == FORWARD) {
+ offset = nextSeedToTest;
+ } else {
+ //
+ // The RC seed is at offset ReadSize - SeedSize - seed offset in the RC seed.
+ //
+ // To see why, imagine that you had a read that looked like 0123456 (where the digits
+ // represented some particular bases, and digit' is the base's complement). Then the
+ // RC of that read is 6'5'4'3'2'1'. So, when we look up the hits for the seed at
+ // offset 0 in the forward read (i.e. 012 assuming a seed size of 3) then the index
+ // will also return the results for the seed's reverse complement, i.e., 3'2'1'.
+ // This happens as the last seed in the RC read.
+ //
+ offset = readLen - seedLen - nextSeedToTest;
+ }
+ const unsigned prefetchDepth = 30;
+ _int64 limit = min(nHits[direction], (_int64)maxHitsToConsider) + prefetchDepth;
+ for (unsigned iBase = 0 ; iBase < limit; iBase += prefetchDepth) {
+ //
+ // This works in two phases: we launch prefetches for a group of hash table lines,
+ // then we do all of the inserts, and then repeat.
+ //
+ _int64 innerLimit = min((_int64)iBase + prefetchDepth, min(nHits[direction], (_int64)maxHitsToConsider));
+ if (doAlignerPrefetch) {
+ for (unsigned i = iBase; i < innerLimit; i++) {
+ if (doesGenomeIndexHave64BitLocations) {
+ prefetchHashTableBucket(GenomeLocationAsInt64(hits[direction][i]) - offset, direction);
+ } else {
+ prefetchHashTableBucket(hits32[direction][i] - offset, direction);
+ }
+ }
+ }
+ for (unsigned i = iBase; i < innerLimit; i++) {
+ //
+ // Find the genome location where the beginning of the read would hit, given a match on this seed.
+ //
+ GenomeLocation genomeLocationOfThisHit;
+ if (doesGenomeIndexHave64BitLocations) {
+ genomeLocationOfThisHit = hits[direction][i] - offset;
+ } else {
+ genomeLocationOfThisHit = hits32[direction][i] - offset;
+ }
+ Candidate *candidate = NULL;
+ HashTableElement *hashTableElement;
+ findCandidate(genomeLocationOfThisHit, direction, &candidate, &hashTableElement);
+ if (NULL != hashTableElement) {
+ if (!noOrderedEvaluation) { // If noOrderedEvaluation, just leave them all on the one-hit weight list so they get evaluated in whatever order
+ incrementWeight(hashTableElement);
+ }
+ candidate->seedOffset = offset;
+ _ASSERT((unsigned)candidate->seedOffset <= readLen - seedLen);
+ } else if (lowestPossibleScoreOfAnyUnseenLocation[direction] <= scoreLimit || noTruncation) {
+ _ASSERT(offset <= readLen - seedLen);
+ allocateNewCandidate(genomeLocationOfThisHit, direction, lowestPossibleScoreOfAnyUnseenLocation[direction],
+ offset, &candidate, &hashTableElement);
+ }
+ }
+ }
+ nSeedsApplied[direction]++;
+ appliedEitherSeed = true;
+ } // not too popular
+ } // directions
+#if 1
+ nextSeedToTest += seedLen;
+#else // 0
+ //
+ // If we don't have enough seeds left to reach the end of the read, space out the seeds more-or-less evenly.
+ //
+ if ((maxSeedsToUse - (nSeedsApplied[FORWARD] + nSeedsApplied[RC]) + 1) * seedLen + nextSeedToTest < nPossibleSeeds) {
+ _ASSERT((nPossibleSeeds + nextSeedToTest) / (maxSeedsToUse - (nSeedsApplied[FORWARD] + nSeedsApplied[RC]) + 1) > seedLen);
+ nextSeedToTest += (nPossibleSeeds + nextSeedToTest) / (maxSeedsToUse - (nSeedsApplied[FORWARD] + nSeedsApplied[RC]) + 1);
+ } else {
+ nextSeedToTest += seedLen;
+ }
+#endif // 0
+ if (appliedEitherSeed) {
+ //
+ // And finally, try scoring.
+ //
+ if (score(
+ false,
+ read,
+ primaryResult,
+ maxEditDistanceForSecondaryResults,
+ secondaryResultBufferSize,
+ nSecondaryResults,
+ secondaryResults)) {
+#ifdef _DEBUG
+ if (_DumpAlignments) printf("\tFinal result score %d MAPQ %d at %u\n", primaryResult->score, primaryResult->mapq, primaryResult->location);
+#endif // _DEBUG
+ finalizeSecondaryResults(*primaryResult, nSecondaryResults, secondaryResults, maxSecondaryResults, maxEditDistanceForSecondaryResults, bestScore);
+ return;
+ }
+ }
+ }
+ //
+ // Do the best with what we've got.
+ //
+ printf("Calling score with force=true because we ran out of seeds\n");
+ score(
+ true,
+ read,
+ primaryResult,
+ maxEditDistanceForSecondaryResults,
+ secondaryResultBufferSize,
+ nSecondaryResults,
+ secondaryResults);
+#ifdef _DEBUG
+ if (_DumpAlignments) printf("\tFinal result score %d MAPQ %d (%e probability of best candidate, %e probability of all candidates) at %u\n", primaryResult->score, primaryResult->mapq, probabilityOfBestCandidate, probabilityOfAllCandidates, primaryResult->location);
+#endif // _DEBUG
+ finalizeSecondaryResults(*primaryResult, nSecondaryResults, secondaryResults, maxSecondaryResults, maxEditDistanceForSecondaryResults, bestScore);
+ return;
+ bool
+ bool forceResult,
+ Read *read[NUM_DIRECTIONS],
+ SingleAlignmentResult *primaryResult,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ SingleAlignmentResult *secondaryResults)
+Routine Description:
+ Make progress in scoring a possibly partial alignment. This is a private method of the BaseAligner class that's used
+ only by AlignRead.
+ It does a number of things. First, it computes the lowest possible score of any unseen location. This is useful
+ because once we have a scored hit that's more than confDiff better than all unseen locations, there's no need to
+ lookup more of them, we can just score what we've got and be sure that the answer is right (unless errors have
+ pushed the read to be closer to a wrong location than to the correct one, in which case it's hopeless).
+ It then decides whether it should score a location, and if so what one to score. It chooses the unscored
+ location that's got the highest weight (i.e., appeared in the most hash table lookups), since that's most
+ likely to be the winner. If there are multiple candidates with the same best weight, it breaks the tie using
+ the best possible score for the candidates (which is determined when they're first hit). Remaining ties are
+ broken arbitrarily.
+ It merges indels with scored candidates. If there's an insertion or deletion in the read, then we'll get very
+ close but unequal results out of the hash table lookup for parts of the read on opposite sides of the
+ insertion or deletion. This throws out the one with the worse score.
+ It then figures out if we have a definitive answer, and says what that is.
+ forceResult - should we generate an answer even if it's not definitive?
+ read - the read we're aligning in both directions
+ result - returns the result if we reach one
+ singleHitGenomeLocation - returns the location in the genome if we return a single hit
+ hitDirection - if we return a single hit, indicates its direction
+ candidates - in/out the array of candidates that have hit and possibly been scored
+ mapq - returns the map quality if we've reached a final result
+ secondary - returns secondary alignment locations & directions (optional)
+Return Value:
+ true iff we've reached a result. When called with forceResult, we'll always return true.
+ printf("score() called with force=%d nsa=%d nrcsa=%d best=%u bestloc=%u 2nd=%u\n",
+ forceResult, nSeedsApplied[FORWARD], nSeedsApplied[RC], bestScore, bestScoreGenomeLocation, secondBestScore);
+ //printf("Candidates:\n");
+ //for (int i = 0; i < nCandidates; i++) {
+ // Candidate* c = candidates + i;
+ // printf(" loc=%u rc=%d weight=%u minps=%u scored=%d score=%u r=%u-%u\n",
+ // c->genomeLocation, c->isRC, c->weight, c->minPossibleScore, c->scored,
+ // c->score, c->minRange, c->maxRange);
+ //}
+ //printf("\n\n");
+ if (0 == mostSeedsContainingAnyParticularBase[FORWARD] && 0 == mostSeedsContainingAnyParticularBase[RC]) {
+ //
+ // The only way we can get here is if we've tried all of the seeds that we're willing
+ // to try and every one of them generated too many hits to process. Give up.
+ //
+ _ASSERT(forceResult);
+ primaryResult->status = NotFound;
+ primaryResult->mapq = 0;
+ return true;
+ }
+ //
+ // Recompute lowestPossibleScore.
+ //
+ for (Direction direction = 0; direction < 2; direction++) {
+ if (0 != mostSeedsContainingAnyParticularBase[direction]) {
+ lowestPossibleScoreOfAnyUnseenLocation[direction] =
+ __max(lowestPossibleScoreOfAnyUnseenLocation[direction],
+ nSeedsApplied[direction] / mostSeedsContainingAnyParticularBase[direction]);
+ }
+ }
+ printf("Lowest possible scores for unseen locations: %d (fwd), %d (RC)\n",
+ lowestPossibleScoreOfAnyUnseenLocation[FORWARD],
+ lowestPossibleScoreOfAnyUnseenLocation[RC]);
+ unsigned weightListToCheck = highestUsedWeightList;
+ do {
+ //
+ // Grab the next element to score, and score it.
+ //
+ while (weightListToCheck > 0 && weightLists[weightListToCheck].weightNext == &weightLists[weightListToCheck]) {
+ weightListToCheck--;
+ highestUsedWeightList = weightListToCheck;
+ }
+ if ((__min(lowestPossibleScoreOfAnyUnseenLocation[FORWARD],lowestPossibleScoreOfAnyUnseenLocation[RC]) > scoreLimit && !noTruncation) || forceResult) {
+ if (weightListToCheck< minWeightToCheck) {
+ //
+ // We've scored all live candidates and excluded all non-candidates, or we've checked enough that we've hit the cutoff. We have our
+ // answer.
+ //
+ primaryResult->score = bestScore;
+ if (bestScore <= maxK) {
+ primaryResult->location = bestScoreGenomeLocation;
+ primaryResult->mapq = computeMAPQ(probabilityOfAllCandidates, probabilityOfBestCandidate, bestScore, popularSeedsSkipped);
+ if (primaryResult->mapq >= MAPQ_LIMIT_FOR_SINGLE_HIT) {
+ primaryResult->status = SingleHit;
+ } else {
+ primaryResult->status = MultipleHits;
+ }
+ return true;
+ } else {
+ primaryResult->status = NotFound;
+ primaryResult->mapq = 0;
+ return true;
+ }
+ }
+ //
+ // Nothing that we haven't already looked up can possibly be the answer. Score what we've got and exit.
+ //
+ forceResult = true;
+ } else if (weightListToCheck == 0) {
+ //
+ // No candidates, look for more.
+ //
+ return false;
+ }
+ HashTableElement *elementToScore = weightLists[weightListToCheck].weightNext;
+ _ASSERT(!elementToScore->allExtantCandidatesScored);
+ _ASSERT(elementToScore->candidatesUsed != 0);
+ _ASSERT(elementToScore != &weightLists[weightListToCheck]);
+ if (doAlignerPrefetch) {
+ //
+ // Our prefetch pipeline is one loop out we get the genome data for the next loop, and two loops out we get the element to score.
+ //
+ _mm_prefetch((const char *)(elementToScore->weightNext->weightNext), _MM_HINT_T2); // prefetch the next element, it's likely to be the next thing we score.
+ genome->prefetchData(elementToScore->weightNext->baseGenomeLocation);
+ }
+ if (elementToScore->lowestPossibleScore <= scoreLimit) {
+ unsigned long candidateIndexToScore;
+ _uint64 candidatesMask = elementToScore->candidatesUsed;
+ while (_BitScanForward64(&candidateIndexToScore,candidatesMask)) {
+ _uint64 candidateBit = ((_uint64)1 << candidateIndexToScore);
+ candidatesMask &= ~candidateBit;
+ if ((elementToScore->candidatesScored & candidateBit) != 0) {
+ // Already scored it, or marked it as scored due to using ProbabilityDistance
+ continue;
+ }
+ bool anyNearbyCandidatesAlreadyScored = elementToScore->candidatesScored != 0;
+ elementToScore->candidatesScored |= candidateBit;
+ _ASSERT(candidateIndexToScore < hashTableElementSize);
+ Candidate *candidateToScore = &elementToScore->candidates[candidateIndexToScore];
+ GenomeLocation genomeLocation = elementToScore->baseGenomeLocation + candidateIndexToScore;
+ GenomeLocation elementGenomeLocation = genomeLocation; // This is the genome location prior to any adjustments for indels
+ //
+ // We're about to run edit distance computation on the genome. Launch a prefetch for it
+ // so that it's in cache when we do (or at least on the way).
+ //
+ if (doAlignerPrefetch) {
+ genomeIndex->prefetchGenomeData(genomeLocation);
+ }
+ unsigned score = -1;
+ double matchProbability = 0;
+ unsigned readDataLength = read[elementToScore->direction]->getDataLength();
+ GenomeDistance genomeDataLength = readDataLength + MAX_K; // Leave extra space in case the read has deletions
+ const char *data = genome->getSubstring(genomeLocation, genomeDataLength);
+#if 0 // This only happens when we're in the padding region, and genomeLocations there just lead to problems. Just say no.
+ if (NULL == data) {
+ //
+ // We're up against the end of a chromosome. Reduce the extra space enough that it isn't too
+ // long. We're willing to reduce it to less than the length of a read, because the read could
+ // butt up against the end of the chromosome and have insertions in it.
+ //
+ const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation);
+ if (contig != NULL) {
+ GenomeLocation endLocation;
+ if (genomeLocation + readDataLength + MAX_K >= GenomeLocation(0) + genome->getCountOfBases()) {
+ endLocation = GenomeLocation(0) + genome->getCountOfBases();
+ } else {
+ const Genome::Contig *nextContig = genome->getNextContigAfterLocation(genomeLocation);
+ _ASSERT(contig->beginningLocation <= genomeLocation && contig != nextContig);
+ endLocation = nextContig->beginningLocation;
+ }
+ genomeDataLength = endLocation - genomeLocation - 1;
+ if (genomeDataLength >= readDataLength - MAX_K) {
+ data = genome->getSubstring(genomeLocation, genomeDataLength);
+ _ASSERT(NULL != data);
+ }
+ }
+ }
+#endif // 0
+ if (data != NULL) {
+ Read *readToScore = read[elementToScore->direction];
+ _ASSERT(candidateToScore->seedOffset + seedLen <= readToScore->getDataLength());
+ //
+ // Compute the distance separately in the forward and backward directions from the seed, to allow
+ // arbitrary offsets at both the start and end.
+ //
+ double matchProb1, matchProb2;
+ int score1, score2;
+ // First, do the forward direction from where the seed aligns to past of it
+ int readLen = readToScore->getDataLength();
+ int seedLen = genomeIndex->getSeedLength();
+ int seedOffset = candidateToScore->seedOffset; // Since the data is reversed
+ int tailStart = seedOffset + seedLen;
+ _ASSERT(!memcmp(data+seedOffset, readToScore->getData() + seedOffset, seedLen));
+ int textLen = (int)__min(genomeDataLength - tailStart, 0x7ffffff0);
+ score1 = landauVishkin->computeEditDistance(data + tailStart, textLen, readToScore->getData() + tailStart, readToScore->getQuality() + tailStart, readLen - tailStart,
+ scoreLimit, &matchProb1);
+ if (score1 == -1) {
+ score = -1;
+ } else {
+ // The tail of the read matched; now let's reverse match the reference genome and the head
+ int limitLeft = scoreLimit - score1;
+ int genomeLocationOffset;
+ score2 = reverseLandauVishkin->computeEditDistance(data + seedOffset, seedOffset + MAX_K, reversedRead[elementToScore->direction] + readLen - seedOffset,
+ read[OppositeDirection(elementToScore->direction)]->getQuality() + readLen - seedOffset, seedOffset, limitLeft, &matchProb2,
+ &genomeLocationOffset);
+ if (score2 == -1) {
+ score = -1;
+ } else {
+ score = score1 + score2;
+ // Map probabilities for substrings can be multiplied, but make sure to count seed too
+ matchProbability = matchProb1 * matchProb2 * pow(1 - SNP_PROB, seedLen);
+ //
+ // Adjust the genome location based on any indels that we found.
+ //
+ genomeLocation += genomeLocationOffset;
+ //
+ // We could mark as scored anything in between the old and new genome offsets, but it's probably not worth the effort since this is
+ // so rare and all it would do is save time.
+ //
+ }
+ }
+ } else { // if we had genome data to compare against
+ matchProbability = 0;
+ }
+ printf("Computing distance at %u (RC) with limit %d: %d (prob %g)\n",
+ genomeLocation, scoreLimit, score, matchProbability);
+#ifdef _DEBUG
+ if (_DumpAlignments) printf("Scored %9u weight %2d limit %d, result %2d %s\n", genomeLocation, elementToScore->weight, scoreLimit, score, elementToScore->direction ? "RC" : "");
+#endif // _DEBUG
+ candidateToScore->score = score;
+ nLocationsScored++;
+ lvScores++;
+ lvScoresAfterBestFound++;
+ //
+ // Handle the special case where we just scored a different offset for a region that's already been scored. This can happen when
+ // there are indels in the read. In this case, we want to treat them as a single alignment, not two different ones (which would
+ // cause us to lose confidence in the alignment, since they're probably both pretty good).
+ //
+ if (anyNearbyCandidatesAlreadyScored) {
+ if (elementToScore->bestScore < score || elementToScore->bestScore == score && matchProbability <= elementToScore->matchProbabilityForBestScore) {
+ //
+ // This is a no better mapping than something nearby that we already tried. Just ignore it.
+ //
+ continue;
+ }
+ } else {
+ _ASSERT(elementToScore->matchProbabilityForBestScore == 0.0);
+ }
+ elementToScore->bestScoreGenomeLocation = genomeLocation;
+ //
+ // Look up the hash table element that's closest to the genomeLocation but that doesn't
+ // contain it, to check if this location is already scored.
+ //
+ // We do this computation in a strange way in order to avoid generating a branch instruction that
+ // the processor's branch predictor will get wrong half of the time. Think about it like this:
+ // The genome location lies in a bucket of size hashTableElementSize. Its offset in the bucket
+ // is genomeLocation % hashTableElementSize. If we take that quantity and integer divide it by
+ // hashTableElementSize / 2, we get 0 if it's in the first half and 1 if it's in the second. Double that and subtract
+ // one, and you're at the right place with no branches.
+ //
+ HashTableElement *nearbyElement;
+ GenomeLocation nearbyGenomeLocation;
+ if (-1 != score) {
+ nearbyGenomeLocation = elementGenomeLocation + (2*(GenomeLocationAsInt64(elementGenomeLocation) % hashTableElementSize / (hashTableElementSize/2)) - 1) * (hashTableElementSize/2);
+ _ASSERT((GenomeLocationAsInt64(elementGenomeLocation) % hashTableElementSize >= (hashTableElementSize/2) ? elementGenomeLocation + (hashTableElementSize/2) : elementGenomeLocation - (hashTableElementSize/2)) == nearbyGenomeLocation); // Assert that the logic in the above comment is right.
+ findElement(nearbyGenomeLocation, elementToScore->direction, &nearbyElement);
+ } else {
+ nearbyElement = NULL;
+ }
+ if (NULL != nearbyElement && nearbyElement->candidatesScored != 0) {
+ //
+ // Just because there's a "nearby" element doesn't mean it's really within the maxMergeDist. Check that now.
+ //
+ if (!genomeLocationIsWithin(genomeLocation, nearbyElement->bestScoreGenomeLocation, maxMergeDist)) {
+ //
+ // There's a nearby element, but its best score is too far away to merge. Forget it.
+ //
+ nearbyElement = NULL;
+ }
+ if (NULL != nearbyElement) {
+ if (nearbyElement->bestScore < score || nearbyElement->bestScore == score && nearbyElement->matchProbabilityForBestScore >= matchProbability) {
+ //
+ // Again, this no better than something nearby we already tried. Give up.
+ //
+ continue;
+ }
+ anyNearbyCandidatesAlreadyScored = true;
+ probabilityOfAllCandidates = __max(0.0, probabilityOfAllCandidates - nearbyElement->matchProbabilityForBestScore);
+ nearbyElement->matchProbabilityForBestScore = 0; // keeps us from backing it out twice
+ }
+ }
+ probabilityOfAllCandidates = __max(0.0, probabilityOfAllCandidates - elementToScore->matchProbabilityForBestScore); // need the max due to floating point lossage.
+ probabilityOfAllCandidates += matchProbability; // Don't combine this with the previous line, it introduces floating point unhappiness.
+ elementToScore->matchProbabilityForBestScore = matchProbability;
+ elementToScore->bestScore = score;
+ if (bestScore > score ||
+ (bestScore == score && matchProbability > probabilityOfBestCandidate)) {
+ //
+ // We have a new best score. The old best score becomes the second best score, unless this is the same as the best or second best score
+ //
+ if ((secondBestScore == UnusedScoreValue || !(secondBestScoreGenomeLocation + maxMergeDist > genomeLocation && secondBestScoreGenomeLocation < genomeLocation + maxMergeDist)) &&
+ (bestScore == UnusedScoreValue || !(bestScoreGenomeLocation + maxMergeDist > genomeLocation && bestScoreGenomeLocation < genomeLocation + maxMergeDist)) &&
+ (!anyNearbyCandidatesAlreadyScored || (GenomeLocationAsInt64(bestScoreGenomeLocation) / maxMergeDist != GenomeLocationAsInt64(genomeLocation) / maxMergeDist &&
+ GenomeLocationAsInt64(secondBestScoreGenomeLocation) / maxMergeDist != GenomeLocationAsInt64(genomeLocation) / maxMergeDist))) {
+ secondBestScore = bestScore;
+ secondBestScoreGenomeLocation = bestScoreGenomeLocation;
+ secondBestScoreDirection = primaryResult->direction;
+ }
+ //
+ // If we're tracking secondary alignments, put the old best score in as a new secondary alignment
+ //
+ if (NULL != secondaryResults && (int)(bestScore - score) <= maxEditDistanceForSecondaryResults) { // bestScore is initialized to UnusedScoreValue, which is large, so this won't fire if this is the first candidate
+ if (secondaryResultBufferSize <= *nSecondaryResults) {
+ WriteErrorMessage("Out of secondary result buffer in BaseAliner::score(), which shouldn't be possible");
+ soft_exit(1);
+ }
+ SingleAlignmentResult *result = &secondaryResults[*nSecondaryResults];
+ result->direction = primaryResult->direction;
+ result->location = bestScoreGenomeLocation;
+ result->mapq = 0;
+ result->score = bestScore;
+ result->status = MultipleHits;
+ _ASSERT(result->score != -1);
+ (*nSecondaryResults)++;
+ }
+ bestScore = score;
+ probabilityOfBestCandidate = matchProbability;
+ _ASSERT(probabilityOfBestCandidate <= probabilityOfAllCandidates);
+ bestScoreGenomeLocation = genomeLocation;
+ primaryResult->location = bestScoreGenomeLocation;
+ primaryResult->score = bestScore;
+ primaryResult->direction = elementToScore->direction;
+ lvScoresAfterBestFound = 0;
+ } else {
+ if (secondBestScore > score) {
+ //
+ // A new second best.
+ //
+ secondBestScore = score;
+ secondBestScoreGenomeLocation = genomeLocation;
+ secondBestScoreDirection = elementToScore->direction;
+ }
+ //
+ // If this is close enough, record it as a secondary alignment.
+ //
+ if (-1 != maxEditDistanceForSecondaryResults && NULL != secondaryResults && (int)(bestScore - score) <= maxEditDistanceForSecondaryResults && score != -1) {
+ if (secondaryResultBufferSize <= *nSecondaryResults) {
+ WriteErrorMessage("Out of secondary result buffer in BaseAliner::score(), which shouldn't be possible");
+ soft_exit(1);
+ }
+ SingleAlignmentResult *result = &secondaryResults[*nSecondaryResults];
+ result->direction = elementToScore->direction;
+ result->location = genomeLocation;
+ result->mapq = 0;
+ result->score = score;
+ result->status = MultipleHits;
+ _ASSERT(result->score != -1);
+ (*nSecondaryResults)++;
+ }
+ }
+ if (stopOnFirstHit && bestScore <= maxK) {
+ // The user just wanted to find reads that match the database within some distance, but doesn't
+ // care about the best alignment. Stop now but mark the result as MultipleHits because we're not
+ // confident that it's the best one. We don't support mapq in this scenario, because we haven't
+ // explored enough to compute it.
+ primaryResult->status = MultipleHits;
+ primaryResult->mapq = 0;
+ return true;
+ }
+ // Update scoreLimit since we may have improved bestScore or secondBestScore
+ if (!noUkkonen) { // If we've turned off Ukkonen, then don't drop the score limit, just leave it at maxK + extraSearchDepth always
+ scoreLimit = min(bestScore, maxK) + extraSearchDepth;
+ } else {
+ _ASSERT(scoreLimit == maxK + extraSearchDepth);
+ }
+ } // While candidates exist in the element
+ } // If the element could possibly affect the result
+ //
+ // Remove the element from the weight list.
+ //
+ elementToScore->allExtantCandidatesScored = true;
+ elementToScore->weightNext->weightPrev = elementToScore->weightPrev;
+ elementToScore->weightPrev->weightNext = elementToScore->weightNext;
+ elementToScore->weightNext = elementToScore->weightPrev = elementToScore;
+ } while (forceResult);
+ return false;
+ void // Issues a prefetch for the candidate hash table anchor that genomeLocation hashes to in the given direction; nothing is looked up or modified.
+BaseAligner::prefetchHashTableBucket(GenomeLocation genomeLocation, Direction direction)
+ HashTableAnchor *hashTable = candidateHashTable[direction];
+ _uint64 lowOrderGenomeLocation;
+ _uint64 highOrderGenomeLocation;
+ decomposeGenomeLocation(genomeLocation, &highOrderGenomeLocation, &lowOrderGenomeLocation); // Only the high-order (element-aligned) part picks the bucket; the low-order offset is not used here.
+ _uint64 hashTableIndex = hash(highOrderGenomeLocation) % candidateHashTablesSize;
+ _mm_prefetch((const char *)&hashTable[hashTableIndex], _MM_HINT_T2); // Prefetches the anchor slot only, not the element chain hanging off it.
+ bool // NOTE(review): the line naming this function is missing from this extract; the signature matches findElement as declared in BaseAligner.h. On a hit it stores the element in *hashTableElement and returns true; otherwise it stores NULL and returns false.
+ GenomeLocation genomeLocation,
+ Direction direction,
+ HashTableElement **hashTableElement)
+ HashTableAnchor *hashTable = candidateHashTable[direction];
+ _uint64 lowOrderGenomeLocation;
+ _uint64 highOrderGenomeLocation;
+ decomposeGenomeLocation(genomeLocation, &highOrderGenomeLocation, &lowOrderGenomeLocation);
+ _uint64 hashTableIndex = hash(highOrderGenomeLocation) % candidateHashTablesSize;
+ HashTableAnchor *anchor = &hashTable[hashTableIndex];
+ if (anchor->epoch != hashTableEpoch) {
+ //
+ // It's empty: the anchor was last written in a different epoch, so its element pointer is stale.
+ //
+ *hashTableElement = NULL;
+ return false;
+ }
+ HashTableElement *lookedUpElement = anchor->element;
+ while (NULL != lookedUpElement && lookedUpElement->baseGenomeLocation != highOrderGenomeLocation) { // Walk the bucket's chain until an element with a matching base location is found or the chain ends.
+ lookedUpElement = lookedUpElement->next;
+ }
+ *hashTableElement = lookedUpElement;
+ return lookedUpElement != NULL;
+ void // NOTE(review): the line naming this function is missing from this extract; the signature matches findCandidate as declared in BaseAligner.h.
+ GenomeLocation genomeLocation,
+ Direction direction,
+ Candidate **candidate,
+ HashTableElement **hashTableElement)
+Routine Description:
+ Find a candidate in the hash table, optionally allocating it if it doesn't exist (but the element does).
+ genomeLocation - the location of the candidate we'd like to look up
+ candidate - The candidate that was found or created
+ hashTableElement - the hashTableElement for the candidate that was found.
+ allocateNew - if this doesn't already exist, should we allocate it? // NOTE(review): stale documentation; the signature above has no allocateNew parameter, and when no element exists both outputs are simply set to NULL.
+ _uint64 lowOrderGenomeLocation;
+ decomposeGenomeLocation(genomeLocation, NULL, &lowOrderGenomeLocation); // Only the offset within the element is needed here; findElement() recomputes the high-order part itself.
+ if (!findElement(genomeLocation, direction, hashTableElement)) {
+ *hashTableElement = NULL;
+ *candidate = NULL;
+ return;
+ }
+ _uint64 bitForThisCandidate = (_uint64)1 << lowOrderGenomeLocation;
+ *candidate = &(*hashTableElement)->candidates[lowOrderGenomeLocation];
+ (*hashTableElement)->allExtantCandidatesScored = (*hashTableElement)->allExtantCandidatesScored && ((*hashTableElement)->candidatesUsed & bitForThisCandidate); // Remains true only if this candidate's bit was already set in candidatesUsed.
+ (*hashTableElement)->candidatesUsed |= bitForThisCandidate; // Looking a candidate up marks it as used even if it was not before.
+bool doAlignerPrefetch = true; // Global switch consulted before the genome-data prefetch and the hash-table-element prefetches in this file; declared extern in BaseAligner.h.
+ void
+ GenomeLocation genomeLocation,
+ Direction direction,
+ unsigned lowestPossibleScore,
+ int seedOffset,
+ Candidate ** candidate,
+ HashTableElement ** hashTableElement)
+Routine Description:
+ // Takes the next unused element from hashTableElementPool, initializes it for the element-aligned
+ // region that contains genomeLocation, puts it at the tail of weight list 1 and links it at the head
+ // of its hash bucket. The candidate slot for genomeLocation is marked used and given seedOffset.
+ // The caller is expected to have checked that no element exists yet for this region (the DBG-only
+ // block below asserts that).
+ //
+ // genomeLocation - location of the candidate to create
+ // direction - which candidate hash table (and read direction) the element belongs to
+ // lowestPossibleScore - stored in the new element
+ // seedOffset - stored in the new candidate
+ // candidate - out: the candidate slot inside the new element
+ // hashTableElement - out: the new element
+Return Value:
+ // None; results are returned through candidate and hashTableElement.
+ HashTableAnchor *hashTable = candidateHashTable[direction];
+ _uint64 lowOrderGenomeLocation;
+ _uint64 highOrderGenomeLocation;
+ decomposeGenomeLocation(genomeLocation, &highOrderGenomeLocation, &lowOrderGenomeLocation);
+ unsigned hashTableIndex = hash(highOrderGenomeLocation) % candidateHashTablesSize;
+ HashTableAnchor *anchor = &hashTable[hashTableIndex];
+ if (doAlignerPrefetch) {
+ _mm_prefetch((const char *)anchor, _MM_HINT_T2); // Prefetch our anchor. We don't have enough computation to completely hide the prefetch, but at least we get some for free here.
+ }
+ HashTableElement *element;
+#if DBG
+ //
+ // Verify that no element for this region is already in the bucket. The element's key field is
+ // baseGenomeLocation (HashTableElement has no genomeLocation member), which is also what
+ // findElement() compares against.
+ //
+ element = hashTable[hashTableIndex].element;
+ while (anchor->epoch == hashTableEpoch && NULL != element && element->baseGenomeLocation != highOrderGenomeLocation) {
+ element = element->next;
+ }
+ _ASSERT(NULL == element || anchor->epoch != hashTableEpoch);
+#endif // DBG
+ _ASSERT(nUsedHashTableElements < hashTableElementPoolSize);
+ element = &hashTableElementPool[nUsedHashTableElements];
+ nUsedHashTableElements++;
+ if (doAlignerPrefetch) {
+ //
+ // Fetch the next candidate so we don't cache miss next time around.
+ //
+ _mm_prefetch((const char *)&hashTableElementPool[nUsedHashTableElements], _MM_HINT_T2);
+ }
+ //
+ // Pool elements are reused across alignments, so every field that score() reads is reset here.
+ //
+ element->candidatesUsed = (_uint64)1 << lowOrderGenomeLocation;
+ element->candidatesScored = 0;
+ element->lowestPossibleScore = lowestPossibleScore;
+ element->direction = direction;
+ element->weight = 1;
+ element->baseGenomeLocation = highOrderGenomeLocation;
+ element->bestScore = UnusedScoreValue;
+ element->allExtantCandidatesScored = false;
+ element->matchProbabilityForBestScore = 0;
+ //
+ // And insert it at the end of weight list 1.
+ //
+ element->weightNext = &weightLists[1];
+ element->weightPrev = weightLists[1].weightPrev;
+ element->weightNext->weightPrev = element;
+ element->weightPrev->weightNext = element;
+ *candidate = &element->candidates[lowOrderGenomeLocation];
+ (*candidate)->seedOffset = seedOffset;
+ *hashTableElement = element;
+ highestUsedWeightList = __max(highestUsedWeightList,(unsigned)1);
+ //
+ // Link the element at the head of its bucket. An anchor from an older epoch is treated as empty,
+ // so its stale element pointer is dropped rather than chained.
+ //
+ if (anchor->epoch == hashTableEpoch) {
+ element->next = anchor->element;
+ } else {
+ anchor->epoch = hashTableEpoch;
+ element->next = NULL;
+ }
+ anchor->element = element;
+Routine Description:
+Return Value:
+ delete probDistance; // NOTE(review): the function header line is missing from this extract; this body presumably belongs to BaseAligner::~BaseAligner() (declared virtual in BaseAligner.h) — confirm against upstream.
+ if (hadBigAllocator) {
+ //
+ // Since these got allocated with the allocator rather than new, we want to call
+ // their destructors without freeing their memory (which is the responsibility of
+ // the owner of the allocator).
+ //
+ if (ownLandauVishkin) {
+ if (NULL != landauVishkin) {
+ landauVishkin->~LandauVishkin();
+ }
+ if (NULL != reverseLandauVishkin) {
+ reverseLandauVishkin->~LandauVishkin();
+ }
+ }
+ } else { // No allocator was supplied, so every buffer below is released explicitly.
+ if (ownLandauVishkin) {
+ if (NULL != landauVishkin) {
+ delete landauVishkin;
+ }
+ if (NULL != reverseLandauVishkin) {
+ delete reverseLandauVishkin;
+ }
+ }
+ BigDealloc(rcReadData);
+ rcReadData = NULL;
+ BigDealloc(reversedRead[FORWARD]);
+ reversedRead[FORWARD] = NULL;
+ reversedRead[RC] = NULL; // Not freed separately; presumably both directions share the FORWARD allocation — confirm against the constructor.
+ BigDealloc(seedUsedAsAllocated); // Free through the pointer kept for deletion (see seedUsedAsAllocated in BaseAligner.h), not through seedUsed.
+ seedUsed = NULL;
+ BigDealloc(candidateHashTable[FORWARD]);
+ candidateHashTable[FORWARD] = NULL;
+ BigDealloc(candidateHashTable[RC]);
+ candidateHashTable[RC] = NULL;
+ BigDealloc(weightLists);
+ weightLists = NULL;
+ BigDealloc(hashTableElementPool);
+ hashTableElementPool = NULL;
+ if (NULL != hitsPerContigCounts) {
+ BigDealloc(hitsPerContigCounts);
+ hitsPerContigCounts = NULL;
+ }
+ }
+ init(); // NOTE(review): the header line is missing from this extract; presumably the body of BaseAligner::HashTableElement::HashTableElement(), which BaseAligner.h declares without an inline body — confirm against upstream.
+ void
+ //
+ // Resets a HashTableElement to its empty state: off every list, no candidates used or scored,
+ // and no score recorded. (The header line naming this function is missing from this extract; the
+ // fields assigned here are those of HashTableElement in BaseAligner.h.)
+ //
+ weightNext = NULL;
+ weightPrev = NULL;
+ next = NULL;
+ candidatesUsed = 0;
+ candidatesScored = 0; // score() tests this on elements returned by findElement(); don't leave it indeterminate.
+ baseGenomeLocation = 0;
+ weight = 0;
+ lowestPossibleScore = UnusedScoreValue;
+ bestScore = UnusedScoreValue;
+ bestScoreGenomeLocation = 0; // Only meaningful once something has been scored, but give it a defined value.
+ direction = FORWARD;
+ allExtantCandidatesScored = false;
+ matchProbabilityForBestScore = 0;
+ void // NOTE(review): header line missing from this extract; presumably BaseAligner::Candidate::init(), which the Candidate constructor in BaseAligner.h calls — confirm against upstream.
+ score = UnusedScoreValue; // Only score is reset here; seedOffset is left as it was.
+ void // Empties the candidate hash tables, the element pool and the weight lists without touching the table storage itself.
+BaseAligner::clearCandidates() {
+ hashTableEpoch++; // Anchors whose epoch no longer matches are treated as empty by findElement() and allocateNewCandidate().
+ nUsedHashTableElements = 0; // Pool elements are handed out again from the start.
+ highestUsedWeightList = 0;
+ for (unsigned i = 1; i < numWeightLists; i++) { // Weight list 0 is not reset here; new elements start on list 1.
+ weightLists[i].weightNext = weightLists[i].weightPrev = &weightLists[i]; // An empty list is a head that points at itself.
+ }
+ void // Moves an element that is still on a weight list from its current list to the tail of the next-higher one.
+BaseAligner::incrementWeight(HashTableElement *element)
+ if (element->allExtantCandidatesScored) {
+ //
+ // It's already scored, so it shouldn't be on a weight list.
+ //
+ _ASSERT(element->weightNext == element);
+ _ASSERT(element->weightPrev == element);
+ return;
+ }
+ //
+ // It's possible to have elements with weight > maxSeedsToUse. This
+ // happens when a single seed occurs more than once within a particular
+ // element (imagine an element with bases ATATATATATATATAT..., it will
+ // match the appropriate seed at offset 0, 2, 4, etc.) If that happens,
+ // just don't let the weight get too big.
+ //
+ if (element->weight >= numWeightLists - 1) { // Already on the highest list; weight is used below as an index into weightLists.
+ return;
+ }
+ //
+ // Remove it from its existing list.
+ //
+ element->weightNext->weightPrev = element->weightPrev;
+ element->weightPrev->weightNext = element->weightNext;
+ element->weight++;
+ highestUsedWeightList = __max(highestUsedWeightList,element->weight);
+ //
+ // And insert it at the tail of the new list.
+ //
+ element->weightNext = &weightLists[element->weight];
+ element->weightPrev = weightLists[element->weight].weightPrev;
+ element->weightNext->weightPrev = element;
+ element->weightPrev->weightNext = element;
+ size_t // Returns the sum of the sizes listed below, i.e. the space to reserve in a BigAllocator for one BaseAligner with these parameters.
+BaseAligner::getBigAllocatorReservation(GenomeIndex *index, bool ownLandauVishkin, unsigned maxHitsToConsider, unsigned maxReadSize,
+ unsigned seedLen, unsigned numSeedsFromCommandLine, double seedCoverage, int maxSecondaryAlignmentsPerContig)
+ unsigned maxSeedsToUse;
+ if (0 != numSeedsFromCommandLine) {
+ maxSeedsToUse = numSeedsFromCommandLine;
+ } else {
+ maxSeedsToUse = (unsigned)(maxReadSize * seedCoverage / seedLen); // No explicit seed count, so derive it from the requested coverage of the longest read.
+ }
+ size_t candidateHashTablesSize = (maxHitsToConsider * maxSeedsToUse * 3)/2; // *1.5 for hash table slack. NOTE(review): the product is computed in unsigned before being widened to size_t, so it can wrap for very large inputs.
+ size_t hashTableElementPoolSize = maxHitsToConsider * maxSeedsToUse * 2 ; // *2 for RC. NOTE(review): same unsigned-before-widening arithmetic as the line above.
+ size_t contigCounters;
+ if (maxSecondaryAlignmentsPerContig > 0) {
+ contigCounters = sizeof(HitsPerContigCounts)* index->getGenome()->getNumContigs(); // Per-contig counters are only needed when the per-contig limit is enabled.
+ } else {
+ contigCounters = 0;
+ }
+ return
+ contigCounters +
+ sizeof(_uint64) * 14 + // allow for alignment
+ sizeof(BaseAligner) + // our own member variables
+ (ownLandauVishkin ?
+ LandauVishkin<>::getBigAllocatorReservation() +
+ LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects
+ sizeof(char) * maxReadSize * 2 + // rcReadData
+ sizeof(char) * maxReadSize * 4 + 2 * MAX_K + // reversed read (both)
+ sizeof(BYTE) * (maxReadSize + 7 + 128) / 8 + // seed used
+ sizeof(HashTableElement) * hashTableElementPoolSize + // hash table element pool
+ sizeof(HashTableAnchor) * candidateHashTablesSize * 2 + // candidate hash table (both)
+ sizeof(HashTableElement) * (maxSeedsToUse + 1); // weight lists
+ void // NOTE(review): the line naming this function is missing from this extract; the signature matches finalizeSecondaryResults as declared in BaseAligner.h. It prunes secondaryResults in place in three passes: by score, by per-contig count, then by total count.
+ SingleAlignmentResult primaryResult,
+ int *nSecondaryResults, // in/out
+ SingleAlignmentResult *secondaryResults,
+ int maxSecondaryResults,
+ int maxEditDistanceForSecondaryResults,
+ int bestScore)
+ //
+ // There's no guarantee that the results are actually within the bound; the aligner records anything that's
+ // within the bound when it's scored, but if we subsequently found a better fit, then it may no longer be
+ // close enough. Get rid of those now.
+ //
+ // NB: This code is very similar to code at the end of IntersectingPairedEndAligner::align(). Sorry.
+ //
+ int worstScoreToKeep = min((int)maxK, bestScore + maxEditDistanceForSecondaryResults);
+ int i = 0;
+ while (i < *nSecondaryResults) {
+ if (secondaryResults[i].score > worstScoreToKeep) {
+ //
+ // This one is too bad to keep. Move the last one from the array here and decrement the
+ // count. Don't move up i, because the one we just moved in may also be too
+ // bad.
+ //
+ secondaryResults[i] = secondaryResults[(*nSecondaryResults)-1];
+ (*nSecondaryResults)--;
+ } else {
+ i++;
+ }
+ }
+ if (maxSecondaryAlignmentsPerContig > 0 && primaryResult.status != NotFound) {
+ //
+ // Run through the results and count the number of results per contig, to see if any of them are too big.
+ //
+ bool anyContigHasTooManyResults = false;
+ int primaryResultContigNum = genome->getContigNumAtLocation(primaryResult.location);
+ hitsPerContigCounts[primaryResultContigNum].hits = 1; // The primary result counts against its contig's limit.
+ hitsPerContigCounts[primaryResultContigNum].epoch = hashTableEpoch;
+ for (i = 0; i < *nSecondaryResults; i++) {
+ int contigNum = genome->getContigNumAtLocation(secondaryResults[i].location);
+ if (hitsPerContigCounts[contigNum].epoch != hashTableEpoch) { // Counter left over from an earlier epoch; reset it lazily instead of clearing the whole array.
+ hitsPerContigCounts[contigNum].epoch = hashTableEpoch;
+ hitsPerContigCounts[contigNum].hits = 0;
+ }
+ hitsPerContigCounts[contigNum].hits++;
+ if (hitsPerContigCounts[contigNum].hits > maxSecondaryAlignmentsPerContig) {
+ anyContigHasTooManyResults = true;
+ break; // One overfull contig is enough to trigger the sort-and-filter pass below.
+ }
+ }
+ if (anyContigHasTooManyResults) {
+ //
+ // Just sort them all, in order of contig then hit depth.
+ //
+ qsort(secondaryResults, *nSecondaryResults, sizeof(*secondaryResults), SingleAlignmentResult::compareByContigAndScore);
+ //
+ // Now run through and eliminate any contigs with too many hits. We can't use the same trick at the first loop above, because the
+ // counting here relies on the results being sorted. So, instead, we just copy them as we go.
+ //
+ int currentContigNum = -1;
+ int currentContigCount = 0;
+ int destResult = 0;
+ for (int sourceResult = 0; sourceResult < *nSecondaryResults; sourceResult++) {
+ int contigNum = genome->getContigNumAtLocation(secondaryResults[sourceResult].location);
+ if (contigNum != currentContigNum) {
+ currentContigNum = contigNum;
+ currentContigCount = (contigNum == primaryResultContigNum) ? 1 : 0; // Start at 1 on the primary's contig so the primary is counted here too.
+ }
+ currentContigCount++;
+ if (currentContigCount <= maxSecondaryAlignmentsPerContig) {
+ //
+ // Keep it. If we don't get here, then we don't copy the result and
+ // don't increment destResult. And yes, this will sometimes copy a
+ // result over itself. That's harmless.
+ //
+ secondaryResults[destResult] = secondaryResults[sourceResult];
+ destResult++;
+ }
+ } // for each source result
+ *nSecondaryResults = destResult;
+ }
+ } // if maxSecondaryAlignmentsPerContig > 0
+ if (*nSecondaryResults > maxSecondaryResults) {
+ qsort(secondaryResults, *nSecondaryResults, sizeof(*secondaryResults), SingleAlignmentResult::compareByScore); // Sort by score first so that the truncation below drops the tail of that ordering.
+ *nSecondaryResults = maxSecondaryResults; // Just truncate it
+ }
+ unsigned
+BaseAligner::getMaxSecondaryResults(unsigned maxSeedsToUse, double maxSeedCoverage, unsigned maxReadSize, unsigned maxHits, unsigned seedLength)
+ //
+ // Upper bound on the number of secondary results a single read can produce: there can't be more
+ // alignments than total possible hits, i.e. seeds * hits per seed * directions.
+ //
+ if (0 == maxSeedsToUse) {
+ //
+ // No explicit seed count was given, so derive the number of seeds from the coverage.
+ //
+ const unsigned seedsFromCoverage = (unsigned)((maxSeedCoverage * maxReadSize + seedLength) / seedLength);
+ return seedsFromCoverage * maxHits * NUM_DIRECTIONS;
+ }
+ return maxHits * maxSeedsToUse * NUM_DIRECTIONS;
diff --git a/SNAPLib/BaseAligner.h b/SNAPLib/BaseAligner.h
new file mode 100644
index 0000000..61a24cb
--- /dev/null
+++ b/SNAPLib/BaseAligner.h
@@ -0,0 +1,341 @@
+Module Name:
+ BaseAligner.h
+ Header for SNAP genome aligner
+ Bill Bolosky, August, 2011
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#include "AlignmentResult.h"
+#include "LandauVishkin.h"
+#include "BigAlloc.h"
+#include "ProbabilityDistance.h"
+#include "AlignerStats.h"
+#include "directions.h"
+#include "GenomeIndex.h"
+extern bool doAlignerPrefetch;
+class BaseAligner {
+ BaseAligner(
+ GenomeIndex *i_genomeIndex,
+ unsigned i_maxHitsToConsider,
+ unsigned i_maxK,
+ unsigned i_maxReadSize,
+ unsigned i_maxSeedsToUse,
+ double i_maxSeedCoverage,
+ unsigned i_minWeightToCheck,
+ unsigned i_extraSearchDepth,
+ bool i_noUkkonen,
+ bool i_noOrderedEvaluation,
+ bool i_noTruncation,
+ int i_maxSecondaryAlignmentsPerContig,
+ LandauVishkin<1>*i_landauVishkin = NULL,
+ LandauVishkin<-1>*i_reverseLandauVishkin = NULL,
+ AlignerStats *i_stats = NULL,
+ BigAllocator *allocator = NULL);
+ static unsigned getMaxSecondaryResults(unsigned maxSeedsToUse, double maxSeedCoverage, unsigned maxReadSize, unsigned maxHits, unsigned seedLength);
+ virtual ~BaseAligner();
+ void
+ AlignRead(
+ Read *read,
+ SingleAlignmentResult *primaryResult,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ int maxSecondaryResults, // The most secondary results to return; always return the best ones
+ SingleAlignmentResult *secondaryResults // The caller passes in a buffer of secondaryResultBufferSize and it's filled in by AlignRead()
+ ); // NOTE(review): declared void, so nothing is returned. The comment that used to be here described a boolean result (true if there was enough room in the secondary alignment buffer for everything that was found) and looks stale.
+ //
+ // Statistics gathering.
+ //
+ _int64 getNHashTableLookups() const {return nHashTableLookups;}
+ _int64 getLocationsScored() const {return nLocationsScored;}
+ _int64 getNHitsIgnoredBecauseOfTooHighPopularity() const {return nHitsIgnoredBecauseOfTooHighPopularity;}
+ _int64 getNReadsIgnoredBecauseOfTooManyNs() const {return nReadsIgnoredBecauseOfTooManyNs;}
+ _int64 getNIndelsMerged() const {return nIndelsMerged;}
+ void addIgnoredReads(_int64 newlyIgnoredReads) {nReadsIgnoredBecauseOfTooManyNs += newlyIgnoredReads;}
+ const char *getRCTranslationTable() const {return rcTranslationTable;}
+ inline int getMaxK() const {return maxK;}
+ inline void setMaxK(int maxK_) {maxK = maxK_;}
+ inline void setReadId(int readId_) {readId = readId_;}
+ const char *getName() const {return "Base Aligner";}
+ inline bool checkedAllSeeds() {return popularSeedsSkipped == 0;}
+ void *operator new(size_t size) {return BigAlloc(size);}
+ void operator delete(void *ptr) {BigDealloc(ptr);}
+ void *operator new(size_t size, BigAllocator *allocator) {_ASSERT(size == sizeof(BaseAligner)); return allocator->allocate(size);}
+ void operator delete(void *ptr, BigAllocator *allocator) {/* do nothing. Memory gets cleaned up when the allocator is deleted.*/}
+ inline bool getExplorePopularSeeds() {return explorePopularSeeds;}
+ inline void setExplorePopularSeeds(bool newValue) {explorePopularSeeds = newValue;}
+ inline bool getStopOnFirstHit() {return stopOnFirstHit;}
+ inline void setStopOnFirstHit(bool newValue) {stopOnFirstHit = newValue;}
+ static size_t getBigAllocatorReservation(GenomeIndex *index, bool ownLandauVishkin, unsigned maxHitsToConsider, unsigned maxReadSize, unsigned seedLen,
+ unsigned numSeedsFromCommandLine, double seedCoverage, int maxSecondaryAlignmentsPerContig);
+ bool hadBigAllocator;
+ LandauVishkin<> *landauVishkin;
+ LandauVishkin<-1> *reverseLandauVishkin;
+ bool ownLandauVishkin;
+ ProbabilityDistance *probDistance;
+ // Maximum distance to merge candidates that differ in indels over.
+#ifdef LONG_READS
+ static const unsigned maxMergeDist = 64; // Must be even and <= 64
+ static const unsigned maxMergeDist = 48; // Must be even and <= 64
+ char rcTranslationTable[256];
+ _int64 nHashTableLookups;
+ _int64 nLocationsScored;
+ _int64 nHitsIgnoredBecauseOfTooHighPopularity;
+ _int64 nReadsIgnoredBecauseOfTooManyNs;
+ _int64 nIndelsMerged;
+ //
+ // A bitvector indexed by offset in the read indicating whether this seed is used.
+ // This is here to avoid doing a memory allocation in the aligner.
+ //
+ BYTE *seedUsed;
+ BYTE *seedUsedAsAllocated; // Use this for deleting seedUsed.
+ inline bool IsSeedUsed(unsigned indexInRead) const {
+ // seedUsed is a bitvector: byte indexInRead / 8, bit indexInRead % 8.
+ const unsigned byteIndex = indexInRead / 8;
+ const int bitMask = 1 << (indexInRead % 8);
+ return 0 != (seedUsed[byteIndex] & bitMask);
+ }
+ inline void SetSeedUsed(unsigned indexInRead) {
+ // Turn on the bit for this read offset in the seedUsed bitvector.
+ BYTE &containingByte = seedUsed[indexInRead / 8];
+ containingByte |= (1 << (indexInRead % 8));
+ }
+ struct Candidate { // One possible alignment location inside a HashTableElement.
+ Candidate() {init();}
+ void init();
+ unsigned score; // Result stored by the scoring loop; UnusedScoreValue after init().
+ int seedOffset; // Offset of the seed that produced this candidate; the scoring loop checks seedOffset + seedLen against the read length.
+ };
+ static const unsigned hashTableElementSize = maxMergeDist; // The code depends on this, don't change it
+ void decomposeGenomeLocation(GenomeLocation genomeLocation, _uint64 *highOrder, _uint64 *lowOrder)
+ {
+ // Split a location into its offset within a hash table element (*lowOrder) and the
+ // element-aligned remainder (*highOrder). highOrder may be NULL if the caller only wants the offset.
+ const _uint64 rawLocation = (_uint64)GenomeLocationAsInt64(genomeLocation);
+ const _uint64 offsetInElement = rawLocation % hashTableElementSize;
+ *lowOrder = offsetInElement;
+ if (NULL == highOrder) {
+ return;
+ }
+ *highOrder = rawLocation - offsetInElement;
+ }
+ struct HashTableElement { // All candidates that fall within one hashTableElementSize-aligned stretch of the genome, for one direction.
+ HashTableElement();
+ void init();
+ //
+ // Doubly linked list for the weight buckets.
+ //
+ HashTableElement *weightNext;
+ HashTableElement *weightPrev;
+ //
+ // Singly linked list for the hash table buckets.
+ //
+ HashTableElement *next;
+ _uint64 candidatesUsed; // Really candidates we still need to score. Bit i corresponds to candidates[i].
+ _uint64 candidatesScored; // presumably the same one-bit-per-candidate layout as candidatesUsed — confirm against score()
+ GenomeLocation baseGenomeLocation; // Element-aligned location (the high-order part from decomposeGenomeLocation) that keys this element in the hash table.
+ unsigned weight; // Index of the weight list this element is on.
+ unsigned lowestPossibleScore;
+ unsigned bestScore; // UnusedScoreValue until a candidate in this element has been scored.
+ GenomeLocation bestScoreGenomeLocation; // Written when a candidate is scored; not reset when the element is allocated.
+ Direction direction;
+ bool allExtantCandidatesScored; // Set when the element is taken off its weight list after scoring.
+ double matchProbabilityForBestScore;
+ Candidate candidates[hashTableElementSize]; // One slot per location offset within the element.
+ };
+ //
+ // Clearing out all of the pointers in the hash tables is expensive relative to running
+ // an alignment, because usually the table is much bigger than the number of entries in it.
+ // So, we avoid that expense by simply not clearing out the table at all. Instead, along with
+ // the pointers we keep an epoch number. There's a corresponding epoch number in the
+ // BaseAligner object, and if the two differ then the hash table bucket is empty. We increment
+ // the epoch number in the BaseAligner at the beginning of each alignment, thus effectively
+ // clearing the hash table from the last run.
+ //
+ struct HashTableAnchor { // One bucket of a candidate hash table.
+ HashTableElement *element; // Head of the bucket's singly linked chain; only meaningful when epoch matches.
+ _int64 epoch; // The bucket is treated as empty unless this equals BaseAligner::hashTableEpoch.
+ };
+ _int64 hashTableEpoch;
+ unsigned nUsedHashTableElements;
+ unsigned hashTableElementPoolSize;
+ HashTableElement *hashTableElementPool;
+ const HashTableElement emptyHashTableElement;
+ unsigned candidateHashTablesSize;
+ HashTableAnchor *candidateHashTable[NUM_DIRECTIONS];
+ HashTableElement *weightLists;
+ unsigned highestUsedWeightList;
+ static inline _uint64 hash(_uint64 key) {
+ // Believe it or not, we spend a long time computing the hash, so we're better off with more table entries and a dopey function.
+ return key * 131;
+ }
+ static const unsigned UnusedScoreValue = 0xffff;
+ // MAPQ parameters, currently not set to match Mason. Using #define because VC won't allow "static const double".
+#define SNP_PROB 0.001
+#define GAP_OPEN_PROB 0.001
+#define GAP_EXTEND_PROB 0.5
+ //
+ // Storage that's used during a call to AlignRead, but that's also needed by the
+ // score function. Since BaseAligner is single threaded, it's easier just to make
+ // them member variables than to pass them around.
+ //
+ unsigned lowestPossibleScoreOfAnyUnseenLocation[NUM_DIRECTIONS];
+ unsigned mostSeedsContainingAnyParticularBase[NUM_DIRECTIONS];
+ unsigned nSeedsApplied[NUM_DIRECTIONS];
+ unsigned bestScore;
+ GenomeLocation bestScoreGenomeLocation;
+ unsigned secondBestScore;
+ GenomeLocation secondBestScoreGenomeLocation;
+ int secondBestScoreDirection;
+ unsigned scoreLimit;
+ unsigned lvScores;
+ unsigned lvScoresAfterBestFound;
+ double probabilityOfAllCandidates;
+ double probabilityOfBestCandidate;
+ int firstPassSeedsNotSkipped[NUM_DIRECTIONS];
+ _int64 smallestSkippedSeed[NUM_DIRECTIONS];
+ unsigned highestWeightListChecked;
+ double totalProbabilityByDepth[AlignerStats::maxMaxHits];
+ void updateProbabilityMass();
+ bool
+ score(
+ bool forceResult,
+ Read *read[NUM_DIRECTIONS],
+ SingleAlignmentResult *primaryResult,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ SingleAlignmentResult *secondaryResults);
+ void clearCandidates();
+ bool findElement(GenomeLocation genomeLocation, Direction direction, HashTableElement **hashTableElement);
+ void findCandidate(GenomeLocation genomeLocation, Direction direction, Candidate **candidate, HashTableElement **hashTableElement);
+ void allocateNewCandidate(GenomeLocation genomeLoation, Direction direction, unsigned lowestPossibleScore, int seedOffset, Candidate **candidate, HashTableElement **hashTableElement);
+ void incrementWeight(HashTableElement *element);
+ void prefetchHashTableBucket(GenomeLocation genomeLocation, Direction direction);
+ const Genome *genome;
+ GenomeIndex *genomeIndex;
+ unsigned seedLen;
+ unsigned maxHitsToConsider;
+ unsigned maxK;
+ unsigned maxReadSize;
+ unsigned maxSeedsToUseFromCommandLine; // Max number of seeds to look up in the hash table
+ double maxSeedCoverage; // Max seeds to used expressed as readSize/seedSize this is mutually exclusive with maxSeedsToUseFromCommandLine
+ unsigned minWeightToCheck;
+ unsigned extraSearchDepth;
+ unsigned numWeightLists;
+ bool noUkkonen;
+ bool noOrderedEvaluation;
+ bool noTruncation;
+ bool doesGenomeIndexHave64BitLocations;
+ int maxSecondaryAlignmentsPerContig;
+ struct HitsPerContigCounts { // Per-contig tally used when limiting secondary alignments per contig.
+ _int64 epoch; // Compared against hashTableEpoch so stale counts are reset lazily, the same trick the hash table anchors use
+ int hits; // Results counted for this contig in the current epoch.
+ };
+ HitsPerContigCounts *hitsPerContigCounts; // How many alignments are we reporting for each contig. Used to implement -mpc, otherwise unallocated.
+ char *rcReadData;
+ char *rcReadQuality;
+ char *reversedRead[NUM_DIRECTIONS];
+ unsigned nTable[256];
+ int readId;
+ // How many overly popular (> maxHits) seeds we skipped this run
+ unsigned popularSeedsSkipped;
+ bool explorePopularSeeds; // Whether we should explore the first maxHits hits even for overly
+ // popular seeds (useful for filtering reads that come from a database
+ // with many very similar sequences).
+ bool stopOnFirstHit; // Whether to stop the first time a location matches with less than
+ // maxK edit distance (useful when using SNAP for filtering only).
+ AlignerStats *stats;
+ unsigned *hitCountByExtraSearchDepth; // How many hits at each depth bigger than the current best edit distance.
+ // So if the current best hit has edit distance 2, then hitCountByExtraSearchDepth[0] would
+ // be the count of hits at edit distance 2, while hitCountByExtraSearchDepth[2] would be the count
+ // of hits at edit distance 4.
+ void finalizeSecondaryResults(
+ SingleAlignmentResult primaryResult,
+ int *nSecondaryResults, // in/out
+ SingleAlignmentResult *secondaryResults,
+ int maxSecondaryResults,
+ int maxEditDistanceForSecondaryResults,
+ int bestScore);
diff --git a/SNAPLib/BiasTables.cpp b/SNAPLib/BiasTables.cpp
new file mode 100644
index 0000000..c68b9fb
--- /dev/null
+++ b/SNAPLib/BiasTables.cpp
@@ -0,0 +1,78991 @@
+Module Name:
+ BiasTables.cpp
+ Headers for the index builder for the SNAP sequencer
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#include "stdafx.h"
+#include "GenomeIndex.h"
+static double hg19_biasTable16_4_large[] = {
+static double hg19_biasTable17_4_large[] = {
+static double hg19_biasTable18_4_large[] = {
+static double hg19_biasTable19_4_large[] = {
+static double hg19_biasTable20_4_large[] = {
+static double hg19_biasTable21_4_large[] = {
+static double hg19_biasTable22_4_large[] = {
+static double hg19_biasTable23_4_large[] = {
+static double hg19_biasTable24_4_large[] = {
+static double hg19_biasTable25_4_large[] = {
+static double hg19_biasTable20_5_large[] = {
+static double hg19_biasTable21_5_large[] = {
+static double hg19_biasTable22_5_large[] = {
+static double hg19_biasTable23_5_large[] = {
+static double hg19_biasTable24_5_large[] = {
+static double hg19_biasTable25_5_large[] = {
+static double hg19_biasTable26_5_large[] = {
+static double hg19_biasTable27_5_large[] = {
+static double hg19_biasTable24_6_large[] = {
+static double hg19_biasTable25_6_large[] = {
+static double hg19_biasTable26_6_large[] = {
+static double hg19_biasTable27_6_large[] = {
+static double hg19_biasTable28_6_large[] = {
+static double hg19_biasTable29_6_large[] = {
+static double hg19_biasTable30_6_large[] = {
+static double hg19_biasTable31_6_large[] = {
+static double hg19_biasTable28_7_large[] = {
+static double hg19_biasTable29_7_large[] = {
+static double hg19_biasTable30_7_large[] = {
+static double hg19_biasTable31_7_large[] = {
+static double hg19_biasTable32_7_large[] = {
+static double hg19_biasTable32_8_large[] = {
+static double hg19_biasTable16_4[] = {
+static double hg19_biasTable17_4[] = {
+static double hg19_biasTable18_4[] = {
+static double hg19_biasTable19_4[] = {
+static double hg19_biasTable20_4[] = {
+static double hg19_biasTable21_4[] = {
+static double hg19_biasTable22_4[] = {
+static double hg19_biasTable23_4[] = {
+static double hg19_biasTable24_4[] = {
+static double hg19_biasTable25_4[] = {
+static double hg19_biasTable20_5[] = {
+static double hg19_biasTable21_5[] = {
+static double hg19_biasTable22_5[] = {
+static double hg19_biasTable23_5[] = {
+static double hg19_biasTable24_5[] = {
+static double hg19_biasTable25_5[] = {
+static double hg19_biasTable26_5[] = {
+static double hg19_biasTable27_5[] = {
+static double hg19_biasTable24_6[] = {
+static double hg19_biasTable25_6[] = {
+static double hg19_biasTable26_6[] = {
+static double hg19_biasTable27_6[] = {
+static double hg19_biasTable28_6[] = {
+static double hg19_biasTable29_6[] = {
+static double hg19_biasTable30_6[] = {
+static double hg19_biasTable31_6[] = {
+static double hg19_biasTable28_7[] = {
+static double hg19_biasTable29_7[] = {
+static double hg19_biasTable30_7[] = {
+static double hg19_biasTable31_7[] = {
+static double hg19_biasTable32_7[] = {
+static double hg19_biasTable32_8[] = {
+double *GenomeIndex::hg19_biasTables_large[GenomeIndex::largestKeySize + 1][GenomeIndex::largestBiasTable + 1] = {
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable16_4_large, hg19_biasTable17_4_large, hg19_biasTable18_4_large, hg19_biasTable19_4_large, hg19_biasTable20_4_large, hg19_biasTable21_4_large, hg19_biasTable22_4_large, hg19_biasTable23_4_large, hg19_biasTable24_4_large, hg19_biasTable25_4_large, NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable20_5_large, hg19_biasTable21_5_large, hg19_biasTable22_5_large, hg19_biasTable23_5_large, hg19_biasTable24_5_large, hg19_biasTable25_5_large, hg19_biasTable26_5_large, hg19_biasTable27_5_large, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable24_6_large, hg19_biasTable25_6_large, hg19_biasTable26_6_large, hg19_biasTable27_6_large, hg19_biasTable28_6_large, hg19_biasTable29_6_large, hg19_biasTable30_6_large, hg19_biasTable31_6_large, NULL },
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable28_7_large, hg19_biasTable29_7_large, hg19_biasTable30_7_large, hg19_biasTable31_7_large, hg19_biasTable32_7_large },
+double *GenomeIndex::hg19_biasTables[GenomeIndex::largestKeySize+1][GenomeIndex::largestBiasTable+1] = {
+ {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable16_4, hg19_biasTable17_4, hg19_biasTable18_4, hg19_biasTable19_4, hg19_biasTable20_4, hg19_biasTable21_4, hg19_biasTable22_4, hg19_biasTable23_4, hg19_biasTable24_4, hg19_biasTable25_4, NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable20_5, hg19_biasTable21_5, hg19_biasTable22_5, hg19_biasTable23_5, hg19_biasTable24_5, hg19_biasTable25_5, hg19_biasTable26_5, hg19_biasTable27_5, NULL, NULL, NULL, NULL, NULL},
+ {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable24_6, hg19_biasTable25_6, hg19_biasTable26_6, hg19_biasTable27_6, hg19_biasTable28_6, hg19_biasTable29_6, hg19_biasTable30_6, hg19_biasTable31_6, NULL},
+ {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, hg19_biasTable28_7, hg19_biasTable29_7, hg19_biasTable30_7, hg19_biasTable31_7, hg19_biasTable32_7},
diff --git a/SNAPLib/BigAlloc.cpp b/SNAPLib/BigAlloc.cpp
new file mode 100644
index 0000000..d0f2d6b
--- /dev/null
+++ b/SNAPLib/BigAlloc.cpp
@@ -0,0 +1,572 @@
+Module Name:
+ bigalloc.cpp
+ Allocator that uses big pages where appropriate and possible.
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "exit.h"
+#include "Error.h"
+bool BigAllocUseHugePages = false;
+struct ProfileEntry
+ ProfileEntry() : caller(NULL), total(0), count(0) {}
+ const char* caller;
+ size_t total;
+ size_t count;
+static const int MaxCallers = 1000;
+static int NCallers = 0;
+static int LastCaller = 0;
+static ProfileEntry AllocProfile[1000];
+static ProfileEntry ProfileTotal;
+static ProfileEntry LastPrintProfile;
+void *BigAllocInternal(
+ size_t sizeToAllocate,
+ size_t *sizeAllocated,
+ bool reserveOnly = FALSE,
+ size_t *pageSize = NULL);
+//
+// Record one allocation of `bytes` bytes in the BigAlloc profiling tables.
+//
+//   bytes  - size of the allocation being recorded
+//   caller - name of the requesting function, or NULL; a NULL caller is counted
+//            only in the global total and is printed as "?"
+//
+// Each distinct caller string gets one slot in AllocProfile (at most MaxCallers
+// slots); once the table is full, new callers are counted only in the global total.
+// A progress line is written to stderr every 1000 allocations or every 1GB
+// allocated since the last line, whichever comes first.
+//
+void RecordAllocProfile(size_t bytes, const char* caller)
+{
+    if (caller) {
+        // Fast path: the same caller as last time.  Otherwise search the table.
+        if (LastCaller >= NCallers || strcmp(AllocProfile[LastCaller].caller, caller)) {
+            LastCaller = NCallers;
+            for (int i = 0; i < NCallers; i++) {
+                if (0 == strcmp(AllocProfile[i].caller, caller)) {
+                    LastCaller = i;
+                    break;
+                }
+            }
+            if (LastCaller == NCallers && NCallers < MaxCallers) {
+                // New caller: keep a private copy of the name, since the string passed in
+                // may be a temporary buffer owned by the caller.  Only publish the slot
+                // (NCallers++) once the copy exists, so a failed malloc is never
+                // dereferenced here or by later strcmp calls.
+                char* buffer = (char*) malloc(strlen(caller) + 1);
+                if (buffer != NULL) {
+                    strcpy(buffer, caller);
+                    AllocProfile[LastCaller].caller = buffer;
+                    AllocProfile[LastCaller].total = AllocProfile[LastCaller].count = 0;
+                    NCallers++;
+                }
+            }
+        }
+        // LastCaller == NCallers here means "no slot" (table full or out of memory).
+        if (LastCaller < NCallers) {
+            AllocProfile[LastCaller].count++;
+            AllocProfile[LastCaller].total += bytes;
+        }
+    }
+    ProfileTotal.count++;
+    ProfileTotal.total += bytes;
+    if (ProfileTotal.count - LastPrintProfile.count >= 1000 || ProfileTotal.total - LastPrintProfile.total >= ((size_t)1 << 30)) {
+        // %lld requires long long arguments; size_t is not guaranteed to be the same type.
+        fprintf(stderr, "BigAllocProfile %lld allocs, %lld total; caller %s alloc %lld\n", (long long) ProfileTotal.count, (long long) ProfileTotal.total, caller ? caller : "?", (long long) bytes);
+        LastPrintProfile = ProfileTotal;
+    }
+}
+void *BigAllocProfile(
+ size_t sizeToAllocate,
+ size_t *sizeAllocated,
+ const char *caller)
+ RecordAllocProfile(sizeToAllocate, caller);
+ return BigAllocInternal(sizeToAllocate, sizeAllocated);
+#ifdef _MSC_VER
+// Assert an NT privilege for this thread.
+ IN LPCSTR PrivilegeName
+ )
+ BOOL b;
+ HANDLE hThread;
+ HANDLE hProcess;
+ TOKEN_PRIVILEGES tokenPrivileges, oldTokenPrivileges;
+ DWORD oldPrivilegesLength;
+ b = OpenThreadToken(GetCurrentThread(), TOKEN_ADJUST_PRIVILEGES |
+ TOKEN_QUERY, TRUE, &hThread);
+ if (!b) {
+ if (GetLastError() != ERROR_NO_TOKEN) {
+ return b;
+ }
+ b = OpenProcessToken(GetCurrentProcess(), TOKEN_DUPLICATE, &hProcess);
+ if (!b) {
+ return b;
+ }
+ b = DuplicateTokenEx(hProcess, TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY |
+ TOKEN_IMPERSONATE, NULL, SecurityImpersonation,
+ TokenImpersonation, &hThread);
+ if (!b) {
+ CloseHandle(hProcess);
+ return b;
+ }
+ b = SetThreadToken(NULL, hThread);
+ if (!b) {
+ CloseHandle(hProcess);
+ CloseHandle(hThread);
+ return b;
+ }
+ CloseHandle(hProcess);
+ }
+ ZeroMemory(&tokenPrivileges, sizeof(tokenPrivileges));
+ b = LookupPrivilegeValue(NULL,PrivilegeName,&tokenPrivileges.Privileges[0].Luid);
+ if (!b) {
+ return b;
+ }
+ tokenPrivileges.PrivilegeCount = 1;
+ tokenPrivileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+ b = AdjustTokenPrivileges(hThread, FALSE, &tokenPrivileges,
+ sizeof(tokenPrivileges), &oldTokenPrivileges,
+ &oldPrivilegesLength);
+ CloseHandle(hThread);
+ return b;
+void *BigAllocInternal(
+ size_t sizeToAllocate,
+ size_t *sizeAllocated,
+ bool reserveOnly,
+ size_t *pageSize)
+Routine Description:
+ Allocate memory, using large pages if both appropriate and possible, and always using
+ VirtualAlloc (meaning that this will always use at least one VM page, so you shouldn't
+ use it for small stuff, only gigantic data structures for which you want to reduce TLB
+ misses and cache misses on the page table). Use malloc or new for ordinary allocations.
+ sizeToAllocate - The amount of memory that is needed
+ sizeAllocated - Optional parameter that if provided returns the amount of memory actually allocated, which
+ will always be >= sizeToAllocate (unless the allocation fails).
+ reserveOnly - If TRUE, will only reserve address space, must call BigCommit to commit memory
+ pageSize - Optional parameter that if provided returns the page size (not large page size)
+Return Value:
+ pointer to the memory allocated, or NULL if the allocation failed.
+ if (sizeToAllocate == 0) {
+ sizeToAllocate = 1;
+ }
+ static bool warningPrinted = false;
+ void *allocatedMemory;
+ SYSTEM_INFO systemInfo[1];
+ GetSystemInfo(systemInfo);
+ size_t virtualAllocSize = ((sizeToAllocate + systemInfo->dwPageSize - 1) / systemInfo->dwPageSize) * systemInfo->dwPageSize;
+ if (pageSize != NULL) {
+ *pageSize = systemInfo->dwPageSize;
+ }
+ //
+ // Try to do the VirtualAlloc using large pages if the size we're getting is at least one large page.
+ // Callers should have asserted the SeLockMemoryPrivilege if they want large pages.
+ //
+ size_t largePageSize = GetLargePageMinimum();
+ DWORD commitFlag = reserveOnly ? 0 : MEM_COMMIT;
+ if (0 != largePageSize && virtualAllocSize >= largePageSize) {
+ //
+ // Start by asserting the SeLockMemoryPrivilege, which is necessary for large page allocations. It's overkill to
+ // do this every time, it only has to happen once/thread. However, a BigAllocation is a big deal and shouldn't be
+ // happening very much, so we just don't worry about the extra cost.
+ //
+ BOOL assertPrivilegeWorked = AssertPrivilege("SeLockMemoryPrivilege");
+ DWORD assertPrivilegeError = GetLastError();
+ size_t largePageSizeToAllocate = ((virtualAllocSize + largePageSize - 1) / largePageSize) * largePageSize;
+#if _DEBUG
+ largePageSizeToAllocate += largePageSize; // For the guard page.
+#endif // DEBUG
+ allocatedMemory = (BYTE *)VirtualAlloc(0,largePageSizeToAllocate,commitFlag|MEM_RESERVE|((BigAllocUseHugePages && !reserveOnly) ? MEM_LARGE_PAGES : 0),PAGE_READWRITE);
+ if (NULL != allocatedMemory) {
+#if _DEBUG
+ DWORD oldProtect;
+ if (!VirtualProtect((char *)allocatedMemory + virtualAllocSize, systemInfo->dwPageSize, PAGE_NOACCESS, &oldProtect)) {
+ static bool printedVirtualProtectedWarning = false;
+ if (! printedVirtualProtectedWarning) {
+ //WriteErrorMessage("VirtualProtect for guard page failed, %d\n", GetLastError());
+ printedVirtualProtectedWarning = true;
+ }
+ }
+ largePageSizeToAllocate -= largePageSize; // Back out the guard page
+#endif // DEBUG
+ if (NULL != sizeAllocated) {
+ *sizeAllocated = largePageSizeToAllocate;
+ }
+ return allocatedMemory;
+ } else if (!warningPrinted) {
+ //
+ // The first time we fail, print out a warning and then fall back to VirtualAlloc. We want to be able to use
+ // the fallback because the caller might not be able to assert the appropriate privilege and we'd still like
+ // to run. The check for printing only once isn't thread safe, so you might get more than one printed
+ // if multiple threads fail at the same time.
+ //
+ warningPrinted = true;
+ WriteErrorMessage("BigAlloc: WARNING: Unable to allocate large page memory, %d. Falling back to VirtualAlloc. Performance may be adversely affected. Size = %lld\n", GetLastError(), largePageSizeToAllocate);
+ if (!assertPrivilegeWorked || GetLastError() == 1314) { // TODO: Look up the error code name for 1314.
+ WriteErrorMessage("BigAlloc: Unable to assert the SeLockMemoryPrivilege (%d), which is probably why it failed.\n"
+ "Try secpol.msc, then SecuritySettings, Local Policies, User Rights Assignment.\n"
+ "Then double click 'Lock Pages in Memory,' add the current user directly or by being\n"
+ "In a group and then reboot (you MUST reboot) for it to work.\n", GetLastError());
+ }
+ }
+ }
+ allocatedMemory = (BYTE *)VirtualAlloc(0,virtualAllocSize,commitFlag|MEM_RESERVE,PAGE_READWRITE);
+ if (NULL != allocatedMemory && NULL != sizeAllocated) {
+ *sizeAllocated = virtualAllocSize;
+ }
+ if (NULL == allocatedMemory) {
+ WriteErrorMessage("BigAlloc of size %lld failed.\n", sizeToAllocate);
+ PrintBigAllocProfile();
+ soft_exit(1);
+ }
+ return allocatedMemory;
+void *BigAlloc(
+ size_t sizeToAllocate,
+ size_t *sizeAllocated)
+ return BigAllocInternal(sizeToAllocate, sizeAllocated, FALSE, NULL);
+void BigDealloc(void *memory)
+Routine Description:
+ Free memory allocated by BigAlloc.
+ memory - address of the memory to free.
+ if (NULL == memory) return;
+ VirtualFree(memory,0,MEM_RELEASE);
+//
+// Profiling wrapper around a reserve-only BigAllocInternal call.
+//
+//   sizeToReserve - amount of address space requested
+//   sizeReserved  - passed through to BigAllocInternal
+//   pageSize      - passed through to BigAllocInternal
+//   caller        - name of the requesting function; may be NULL, in which case
+//                   the request is recorded under "?"
+//
+// The request is recorded under the tag "<caller>(RESERVE)".
+//
+void *BigReserveProfile(
+    size_t sizeToReserve,
+    size_t *sizeReserved,
+    size_t *pageSize,
+    const char* caller)
+{
+    static const char suffix[] = "(RESERVE)";
+    char buffer[1000];
+    if (NULL == caller) {
+        caller = "?";
+    }
+    // strncpy does not terminate the destination when it truncates, so terminate
+    // explicitly, and leave room so that the suffix always fits after the name.
+    strncpy(buffer, caller, sizeof(buffer) - sizeof(suffix));
+    buffer[sizeof(buffer) - sizeof(suffix)] = '\0';
+    // strncat's count is the space remaining in the destination, not its total size.
+    strncat(buffer, suffix, sizeof(buffer) - strlen(buffer) - 1);
+    RecordAllocProfile(sizeToReserve, buffer);
+    return BigAllocInternal(sizeToReserve, sizeReserved, TRUE, pageSize);
+}
+//
+// Profiling wrapper that commits previously reserved memory with VirtualAlloc.
+//
+//   memoryToCommit - start address of the range to commit
+//   sizeToCommit   - number of bytes to commit
+//   caller         - name of the requesting function; may be NULL, in which case
+//                    the request is recorded under "?"
+//
+// The request is recorded under the tag "<caller>(COMMIT)".
+// Returns true if the commit succeeded; on failure an error message is written
+// and false is returned.
+//
+bool BigCommitProfile(
+    void *memoryToCommit,
+    size_t sizeToCommit,
+    const char* caller)
+{
+    static const char suffix[] = "(COMMIT)";
+    char buffer[1000];
+    if (NULL == caller) {
+        caller = "?";
+    }
+    // strncpy does not terminate the destination when it truncates, so terminate
+    // explicitly, and leave room so that the suffix always fits after the name.
+    strncpy(buffer, caller, sizeof(buffer) - sizeof(suffix));
+    buffer[sizeof(buffer) - sizeof(suffix)] = '\0';
+    // strncat's count is the space remaining in the destination, not its total size.
+    strncat(buffer, suffix, sizeof(buffer) - strlen(buffer) - 1);
+    RecordAllocProfile(sizeToCommit, buffer);
+    void* allocatedMemory = VirtualAlloc(memoryToCommit, sizeToCommit, MEM_COMMIT, PAGE_READWRITE);
+    if (allocatedMemory == NULL) {
+        WriteErrorMessage("BigCommit VirtualAlloc failed with error 0x%x\n", GetLastError());
+    }
+    return allocatedMemory != NULL;
+}
+void *BigReserve(
+ size_t sizeToReserve,
+ size_t *sizeReserved,
+ size_t *pageSize)
+ return BigAllocInternal(sizeToReserve, sizeReserved, TRUE, pageSize);
+bool BigCommit(
+ void *memoryToCommit,
+ size_t sizeToCommit)
+ void* allocatedMemory = VirtualAlloc(memoryToCommit, sizeToCommit, MEM_COMMIT, PAGE_READWRITE);
+ if (allocatedMemory == NULL) {
+ WriteErrorMessage("BigCommit VirtualAlloc failed with error 0x%x\n", GetLastError());
+ }
+ return allocatedMemory != NULL;
+#else /* no _MSC_VER */
+void *BigAllocInternal(
+void *BigAlloc(
+ size_t sizeToAllocate,
+ size_t *sizeAllocated)
+ // Make space to include the allocated size at the start of our region; this is necessary
+ // so that we can BigDealloc the memory later.
+ sizeToAllocate += sizeof(size_t);
+ const size_t ALIGN_SIZE = 4096;
+ if (sizeToAllocate % ALIGN_SIZE != 0) {
+ sizeToAllocate += ALIGN_SIZE - (sizeToAllocate % ALIGN_SIZE);
+ }
+ if (sizeAllocated != NULL) {
+ *sizeAllocated = sizeToAllocate - sizeof(size_t);
+ }
+ flags |= MAP_HUGETLB;
+ char *mem = (char *) mmap(NULL, sizeToAllocate, PROT_READ|PROT_WRITE, flags, -1, 0);
+ if (mem == MAP_FAILED) {
+ perror("mmap");
+ soft_exit(1);
+ }
+#if (defined(MADV_HUGEPAGE) && !defined(USE_HUGETLB))
+ // Tell Linux to use huge pages for this range
+ if (BigAllocUseHugePages) {
+ if (madvise(mem, sizeToAllocate, MADV_HUGEPAGE) == -1) {
+ WriteErrorMessage("WARNING: failed to enable huge pages -- your kernel may not support it\n");
+ }
+ }
+ // Remember the size allocated in the first sizeof(size_t) bytes
+ *((size_t *) mem) = sizeToAllocate;
+ return (void *) (mem + sizeof(size_t));
+void BigDealloc(void *memory)
+ if (NULL == memory) return;
+ // Figure out the size we had allocated
+ char *startAddress = ((char *) memory) - sizeof(size_t);
+ size_t sizeAllocated = *((size_t *) startAddress);
+ if (munmap(startAddress, sizeAllocated) != 0) {
+ perror("munmap");
+ soft_exit(1);
+ }
+void *BigReserve(
+ size_t sizeToReserve,
+ size_t *sizeReserved,
+ size_t *pageSize)
+ // TODO: use actual reserve/commit API; this is a temporary hack
+ if (pageSize != NULL) {
+ *pageSize = 4096;
+ }
+ return BigAllocInternal(sizeToReserve, sizeReserved);
+ return BigAlloc(sizeToReserve, sizeReserved);
+bool BigCommit(
+ void *memoryToCommit,
+ size_t sizeToCommit)
+ // TODO: use actual reserve/commit API; this is a temporary hack
+ return true;
+#endif /* _MSC_VER */
+BigAllocator::BigAllocator(size_t i_maxMemory, size_t i_allocationGranularity) : maxMemory(i_maxMemory), allocationGranularity(i_allocationGranularity)
+#if _DEBUG
+ maxMemory += maxCanaries * sizeof(unsigned);
+#endif // DEBUG
+ basePointer = (char *)BigAlloc(__max(maxMemory, 2 * 1024 * 1024)); // The 2MB minimum is to assure this lands in a big page
+ allocPointer = basePointer;
+#if _DEBUG
+ //
+ // Stick a canary at the beginning of the array so that we can detect underflows for whatever's allocated first.
+ //
+ canaries[0] = (unsigned *) allocPointer;
+ *canaries[0] = canaryValue;
+ nCanaries = 1;
+ allocPointer += sizeof(unsigned);
+#endif // _DEBUG
+ BigDealloc(basePointer);
+void *
+BigAllocator::allocate(size_t amountToAllocate)
+ //
+ // Round up to the allocation granularity.
+ //
+ if ((size_t)allocPointer % allocationGranularity != 0) {
+ allocPointer = (char *)((size_t)allocPointer + allocationGranularity - (size_t)allocPointer % allocationGranularity);
+ _ASSERT((size_t)allocPointer % allocationGranularity == 0);
+ }
+ if (allocPointer + amountToAllocate > basePointer + maxMemory) {
+ WriteErrorMessage("BigAllocator: allocating too much memory, %lld > %lld\n", allocPointer + amountToAllocate - basePointer , maxMemory);
+ soft_exit(1);
+ }
+ void *retVal = allocPointer;
+ allocPointer += amountToAllocate;
+#if _DEBUG
+ if (nCanaries < maxCanaries) {
+ _ASSERT(allocPointer + sizeof(unsigned) <= basePointer + maxMemory);
+ canaries[nCanaries] = (unsigned *)allocPointer;
+ *canaries[nCanaries] = canaryValue;
+ nCanaries++;
+ allocPointer += sizeof(unsigned);
+ }
+#endif // DEBUG
+ return retVal;
+#if _DEBUG
+ void
+ bool allOK = true;
+ for (unsigned i = 0; i < nCanaries; i++) {
+ if (*canaries[i] != canaryValue) {
+ WriteErrorMessage("Memory corruption detected: canary at 0x%llx has value 0x%llx\n",canaries[i], *canaries[i]);
+ allOK = false;
+ }
+ }
+ _ASSERT(allOK);
+#endif // DEBUG
+ void *
+CountingBigAllocator::allocate(size_t sizeToAllocate)
+ size += sizeToAllocate + allocationGranularity - 1; // Add in the max roundoff
+ Allocation *allocation = new Allocation;
+ allocation->next = allocations;
+ allocation->ptr = malloc(sizeToAllocate);
+ allocations = allocation;
+ return allocation->ptr;
+ while (NULL != allocations) {
+ Allocation *allocation = allocations;
+ allocations = allocation->next;
+ free(allocation->ptr);
+ delete allocation;
+ }
+void PrintBigAllocProfile()
+ WriteStatusMessage("BigAlloc usage\n");
+ for (int i = 0; i < NCallers; i++) {
+ WriteStatusMessage("%7.1f Mb %7lld %s\n",
+ AllocProfile[i].total * 1e-6, AllocProfile[i].count, AllocProfile[i].caller);
+ }
+void* zalloc(void* opaque, unsigned items, unsigned size)
+ size_t bytes = items * (size_t) size;
+ void* result = ((ThreadHeap*) opaque)->alloc(bytes);
+ static int printed = 0;
+ if ((! result) && printed++ < 10) {
+ WriteErrorMessage("warning: zalloc using malloc for %lld bytes\n", bytes);
+ }
+ return result ? result : malloc(bytes);
+void zfree(void* opaque, void* p)
+ if (! ((ThreadHeap*) opaque)->free(p)) {
+ free(p);
+ }
diff --git a/SNAPLib/BigAlloc.h b/SNAPLib/BigAlloc.h
new file mode 100644
index 0000000..9416569
--- /dev/null
+++ b/SNAPLib/BigAlloc.h
@@ -0,0 +1,177 @@
+Module Name:
+ bigalloc.h
+ Headers for an allocator that uses big pages where appropriate and possible.
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+#pragma once
+inline unsigned RoundUpToPageSize(unsigned size)
+ const unsigned pageSize = 4096;
+ return ((size + pageSize - 1) / pageSize) * pageSize;
+#define BigAlloc(s) BigAllocProfile((s), NULL, __FUNCTION__)
+#define BigAlloc2(s,p) BigAllocProfile((s), (p), __FUNCTION__)
+#define BigReserve(s) BigReserveProfile((s), NULL, NULL, __FUNCTION__)
+#define BigCommit(p, s) BigCommitProfile((p), (s), __FUNCTION__)
+void *BigAllocProfile(
+ size_t sizeToAllocate,
+ size_t *sizeAllocated = NULL,
+ const char* caller = NULL);
+void *BigReserveProfile(
+ size_t sizeToReserve,
+ size_t *sizeReserved = NULL,
+ size_t *pageSize = NULL,
+ const char* caller = NULL);
+bool BigCommitProfile(
+ void *memoryToCommit,
+ size_t sizeToCommit,
+ const char* caller = NULL);
+void *BigAlloc(
+ size_t sizeToAllocate,
+ size_t *sizeAllocated = NULL);
+#define BigAlloc2(s,p) BigAlloc((s), (p))
+void *BigReserve(
+ size_t sizeToReserve,
+ size_t *sizeReserved = NULL,
+ size_t *pageSize = NULL);
+bool BigCommit(
+ void *memoryToCommit,
+ size_t sizeToCommit);
+void PrintBigAllocProfile();
+void BigDealloc(void *memory);
+// This class is used to allocate a group of objects all onto a single set of big pages. It requires knowing
+// the amount of memory to be allocated when it's created. It does not support deleting memory other than
+// all at once.
+class BigAllocator {
+ BigAllocator(size_t i_maxMemory, size_t i_allocationGranularity = 8);
+ ~BigAllocator();
+ virtual void *allocate(size_t amountToAllocate);
+#if _DEBUG
+ void checkCanaries();
+#else // DEBUG
+ void checkCanaries() {}
+#endif // DEBUG
+ char *basePointer;
+ char *allocPointer;
+ size_t maxMemory;
+ size_t allocationGranularity;
+#if _DEBUG
+ //
+ // Stick a canary between each allocation and
+ unsigned nCanaries;
+ static const unsigned maxCanaries = 100;
+ static const unsigned canaryValue = 0xca4a71e5;
+ unsigned *canaries[maxCanaries];
+#endif // DEBUG
+// An allocator that doesn't actually allocate, it just counts how much it would allocate. The idea is that
+// you can write allocations in a fairly normal looking way, call them with this to see how much would be
+// allocated, then create a real BigAllocator with that amount of memory. That way, you don't need to
+// keep in sync the actual allocation and the code that knows how much memory will be needed.
+class CountingBigAllocator : public BigAllocator
+ CountingBigAllocator(size_t i_allocationGranularity = 8) :size(0), allocations(NULL), BigAllocator(0), allocationGranularity(i_allocationGranularity) {}
+ ~CountingBigAllocator();
+ virtual void *allocate(size_t amountToAllocate);
+ virtual void assertAllMemoryUsed() {}
+ size_t getMemoryUsed() {return size;}
+ size_t size;
+ size_t allocationGranularity;
+ struct Allocation {
+ void *ptr;
+ Allocation *next;
+ } *allocations;
+extern bool BigAllocUseHugePages;
+// trivial per-thread heap for use in zalloc
+struct ThreadHeap
+ char* start;
+ char* end;
+ char* next;
+ ThreadHeap(size_t bytes)
+ {
+ next = start = (char*) BigAlloc(bytes);
+ end = start + bytes;
+ }
+ void* alloc(size_t bytes)
+ {
+ if (next + bytes <= end) {
+ void* result = next;
+ next += bytes;
+ return result;
+ }
+ return NULL;
+ }
+ bool free(void* p)
+ {
+ return (char*)p >= start && (char*) p <= end;
+ }
+ void reset()
+ {
+ next = start;
+ }
+ ~ThreadHeap()
+ {
+ BigDealloc(start);
+ }
+void* zalloc(void* opaque, unsigned items, unsigned size);
+void zfree(void* opaque, void* p);
diff --git a/SNAPLib/BufferedAsync.cpp b/SNAPLib/BufferedAsync.cpp
new file mode 100644
index 0000000..e2a00d5
--- /dev/null
+++ b/SNAPLib/BufferedAsync.cpp
@@ -0,0 +1,223 @@
+Module Name:
+ BufferedAsync.cpp
+ Double-buffered asynchronous file I/O
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+#include "stdafx.h"
+#include "Compat.h"
+#include "BigAlloc.h"
+#include "BufferedAsync.h"
+#include "Error.h"
+using std::min;
+ bool
+ AsyncFile* file,
+ size_t offset,
+ size_t bytes,
+ size_t i_bufferSize,
+ bool async,
+ void* buffer0,
+ void* buffer1)
+ _ASSERT((buffer0 == NULL) == (buffer1 == NULL)); // both null or both non-null
+ waitTime = 0;
+ reader[0] = file->getReader();
+ reader[1] = file->getReader();
+ bufferSize = i_bufferSize;
+ fileSize = offset + bytes;
+ ownBuffer = buffer0 == NULL;
+ buffer[0] = (char*) (buffer0 != NULL ? buffer0 : BigAlloc(bufferSize));
+ buffer[1] = (char*) (buffer1 != NULL ? buffer1 : BigAlloc(bufferSize));
+ if (reader[0] == NULL || reader[1] == NULL || buffer[0] == NULL || buffer[1] == NULL) {
+ WriteErrorMessage("unable to setup temp file reader\n");
+ return false;
+ }
+ length[0] = min(bytes, bufferSize);
+ _int64 start = timeInNanos();
+ reader[0]->beginRead(buffer[0],length[0], offset, NULL);
+ if (bytes > bufferSize) {
+ length[1] = min(bytes - bufferSize, bufferSize);
+ reader[1]->beginRead(buffer[1], length[1], offset + bufferSize, NULL);
+ } else {
+ length[1] = 0;
+ }
+ waitTime += timeInNanos() - start;
+ reading = 0;
+ readOffset = 0;
+ nextFileOffset = offset + min(bytes, 2 * bufferSize);
+ if (! async) {
+ endOpen();
+ }
+ return true;
+ void
+ _int64 start = timeInNanos();
+ reader[reading]->waitForCompletion();
+ waitTime += timeInNanos() - start;
+ bool
+ return length[reading] == 0;
+ bool
+ void* data,
+ size_t bytes)
+ if (bytes == 0) {
+ return true;
+ }
+ if (atEnd()) {
+ return false;
+ }
+ size_t first = min(bytes, length[reading] - readOffset);
+ memcpy(data, buffer[reading] + readOffset, first);
+ readOffset += bytes;
+ if (readOffset >= length[reading]) {
+ // switch buffers
+ readOffset -= length[reading];
+ reading = 1 - reading;
+ _int64 start = timeInNanos();
+ reader[reading]->waitForCompletion();
+ waitTime += timeInNanos() - start;
+ if (readOffset > 0) {
+ // copy second part of read segment
+ // todo: allow for longer reads
+ if (readOffset > length[reading]) {
+ WriteErrorMessage("read length too big\n");
+ return false;
+ }
+ memcpy((char*) data + first, buffer[reading], readOffset);
+ }
+ // begin read of next block
+ if (nextFileOffset < fileSize) {
+ length[1 - reading] = min(fileSize - nextFileOffset, bufferSize);
+ start = timeInNanos();
+ reader[1 - reading]->beginRead(buffer[1 - reading], length[1 - reading], nextFileOffset, NULL);
+ waitTime += timeInNanos() - start;
+ nextFileOffset += length[1 - reading];
+ } else {
+ length[1 - reading] = 0;
+ }
+ }
+ return true;
+ bool
+ delete reader[0];
+ delete reader[1];
+ if (ownBuffer) {
+ BigDealloc(buffer[0]);
+ BigDealloc(buffer[1]);
+ }
+ return true;
+ bool
+ AsyncFile* file,
+ size_t i_bufferSize,
+ volatile _int64* sharedOffset)
+ waitTime = 0;
+ writer[0] = file->getWriter();
+ writer[1] = file->getWriter();
+ bufferSize = i_bufferSize;
+ buffer[0] = (char*) BigAlloc(bufferSize);
+ buffer[1] = (char*) BigAlloc(bufferSize);
+ writeOffset = 0;
+ writing = 0;
+ privateFileOffset = 0;
+ nextFileOffset = sharedOffset != NULL ? sharedOffset : &privateFileOffset;
+ return writer[0] != NULL && writer[1] != NULL && buffer[0] != NULL && buffer[1] != NULL;
+ bool
+ void* data,
+ size_t bytes)
+ void* p = forWrite(bytes);
+ if (p == NULL) {
+ // todo: allow bytes > bufferSize using synchronous writes
+ return false;
+ }
+ memcpy(p, data, bytes);
+ return true;
+ void*
+ size_t bytes)
+ bool ok = true;
+ if (writeOffset + bytes <= bufferSize) {
+ writeOffset += bytes;
+ return buffer[writing] + writeOffset - bytes;
+ } else {
+ _int64 fileOffset = InterlockedAdd64AndReturnNewValue(nextFileOffset, writeOffset) - writeOffset;
+ _int64 start = timeInNanos();
+ ok = writer[writing]->beginWrite(buffer[writing], writeOffset, fileOffset, NULL);
+ ok &= writer[1 - writing]->waitForCompletion();
+ waitTime += timeInNanos() - start;
+ writing = 1 - writing;
+ if (! ok) {
+ WriteErrorMessage("BufferedAsyncWriter write failed\n");
+ return NULL;
+ }
+ if (bytes <= bufferSize) {
+ writeOffset = bytes;
+ return buffer[writing];
+ } else {
+ // too big to be async
+ WriteErrorMessage("BufferedAsyncWriter write too large\n");
+ return NULL;
+ }
+ }
+ bool
+ _int64 start = timeInNanos();
+ bool ok = writer[1 - writing]->waitForCompletion();
+ waitTime += timeInNanos() - start;
+ if (writeOffset > 0) {
+ _int64 fileOffset = InterlockedAdd64AndReturnNewValue(nextFileOffset, writeOffset) - writeOffset;
+ _int64 start = timeInNanos();
+ ok &= writer[writing]->beginWrite(buffer[writing], writeOffset, fileOffset, NULL);
+ ok &= writer[writing]->waitForCompletion();
+ waitTime += timeInNanos() - start;
+ }
+ delete writer[0];
+ delete writer[1];
+ BigDealloc(buffer[0]);
+ BigDealloc(buffer[1]);
+ return ok;
\ No newline at end of file
diff --git a/SNAPLib/BufferedAsync.h b/SNAPLib/BufferedAsync.h
new file mode 100644
index 0000000..d6955c3
--- /dev/null
+++ b/SNAPLib/BufferedAsync.h
@@ -0,0 +1,66 @@
+Module Name:
+ BufferedAsync.h
+ Double-buffered asynchronous file I/O
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+#pragma once
+#include "stdafx.h"
+#include "Compat.h"
+class BufferedAsyncReader
+ bool open(AsyncFile* file, size_t offset, size_t length, size_t bufferSize, bool async = false, void* buffer0 = NULL, void* buffer1 = NULL);
+ void endOpen();
+ bool atEnd();
+ bool read(void* data, size_t bytes);
+ bool close();
+ _int64 getWaitTimeInMillis() { return waitTime / 1000000; }
+ int reading; // 0 or 1
+ size_t readOffset;
+ size_t bufferSize;
+ size_t nextFileOffset;
+ size_t fileSize;
+ AsyncFile::Reader* reader[2];
+ char* buffer[2];
+ size_t length[2];
+ bool ownBuffer;
+ _int64 waitTime; // in nanos
+class BufferedAsyncWriter
+ bool open(AsyncFile* file, size_t bufferSize, volatile _int64* sharedOffset = NULL);
+ bool write(void* data, size_t bytes);
+ void* forWrite(size_t bytes);
+ bool close();
+ _int64 getWaitTimeInMillis() { return waitTime / 1000000; }
+ int writing; // 0 or 1
+ size_t writeOffset; // within buffer
+ size_t bufferSize;
+ volatile _int64 privateFileOffset; // used by default
+ volatile _int64* nextFileOffset; // for current buffer to write
+ AsyncFile::Writer* writer[2];
+ char* buffer[2];
+ _int64 waitTime; // in nanos
diff --git a/SNAPLib/ChimericPairedEndAligner.cpp b/SNAPLib/ChimericPairedEndAligner.cpp
new file mode 100644
index 0000000..27d4bf1
--- /dev/null
+++ b/SNAPLib/ChimericPairedEndAligner.cpp
@@ -0,0 +1,208 @@
+Module Name:
+ ChimericPairedEndAligner.cpp
+ A paired-end aligner calls into a different paired-end aligner, and if
+ it fails to find an alignment, aligns each of the reads singly. This handles
+ chimeric reads that would otherwise be unalignable.
+ Bill Bolosky, June, 2013
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "ChimericPairedEndAligner.h"
+#include "mapq.h"
+#include "directions.h"
+#include "BigAlloc.h"
+#include "Util.h"
+using namespace std;
+#define TRACE printf
+#define TRACE(...) {}
+ GenomeIndex *index_, // constructor parameters: stored in the index member
+ unsigned maxReadSize,
+ unsigned maxHits,
+ unsigned maxK,
+ unsigned maxSeedsFromCommandLine,
+ double seedCoverage,
+ unsigned minWeightToCheck,
+ bool forceSpacing_, // stored in forceSpacing; checked by align() after the paired attempt
+ unsigned extraSearchDepth,
+ bool noUkkonen,
+ bool noOrderedEvaluation,
+ bool noTruncation,
+ PairedEndAligner *underlyingPairedEndAligner_, // the paired-end aligner that align() tries first
+ unsigned minReadLength_, // reads shorter than this are not aligned by align()
+ int maxSecondaryAlignmentsPerContig,
+ BigAllocator *allocator) // also used to allocate the single-end BaseAligner below
+ : underlyingPairedEndAligner(underlyingPairedEndAligner_), forceSpacing(forceSpacing_), index(index_), minReadLength(minReadLength_)
+ // Create single-end aligners.
+ singleAligner = new (allocator) BaseAligner(index, maxHits, maxK, maxReadSize, // allocated from `allocator`, so it is never individually freed
+ maxSeedsFromCommandLine, seedCoverage, minWeightToCheck, extraSearchDepth, noUkkonen, noOrderedEvaluation, noTruncation, maxSecondaryAlignmentsPerContig, &lv, &reverseLV, NULL, allocator);
+ underlyingPairedEndAligner->setLandauVishkin(&lv, &reverseLV); // the paired aligner is handed the same lv/reverseLV objects as the single-end aligner
+ singleSecondary[0] = singleSecondary[1] = NULL;
+ size_t // returns a byte count: the BaseAligner reservation plus room for this object
+ GenomeIndex * index,
+ unsigned maxReadSize,
+ unsigned maxHits,
+ unsigned seedLen,
+ unsigned maxSeedsFromCommandLine,
+ double seedCoverage,
+ unsigned maxEditDistanceToConsider, // not used in the computation below
+ unsigned maxExtraSearchDepth, // not used in the computation below
+ unsigned maxCandidatePoolSize, // not used in the computation below
+ int maxSecondaryAlignmentsPerContig)
+ return BaseAligner::getBigAllocatorReservation(index, false, maxHits, maxReadSize, seedLen, maxSeedsFromCommandLine, seedCoverage, maxSecondaryAlignmentsPerContig) + sizeof(ChimericPairedEndAligner)+sizeof(_uint64); // the extra sizeof(_uint64) is presumably alignment slack -- TODO confirm
+ singleAligner->~BaseAligner();
+#ifdef _DEBUG
+extern bool _DumpAlignments;
+#endif // _DEBUG
+void ChimericPairedEndAligner::align( // Try the underlying paired-end aligner first; if it does not place both reads, align each read on its own with the single-end aligner
+ Read *read0,
+ Read *read1,
+ PairedAlignmentResult *result,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ PairedAlignmentResult *secondaryResults, // The caller passes in a buffer of secondaryResultBufferSize and it's filled in by AlignRead()
+ int singleSecondaryBufferSize,
+ int maxSecondaryAlignmentsToReturn,
+ int *nSingleEndSecondaryResultsForFirstRead,
+ int *nSingleEndSecondaryResultsForSecondRead,
+ SingleAlignmentResult *singleEndSecondaryResults // Single-end secondary alignments for when the paired-end alignment didn't work properly
+ )
+ result->status[0] = result->status[1] = NotFound; // start from "nothing found" with all output counts zeroed
+ *nSecondaryResults = 0;
+ *nSingleEndSecondaryResultsForFirstRead = 0;
+ *nSingleEndSecondaryResultsForSecondRead = 0;
+ if (read0->getDataLength() < minReadLength && read1->getDataLength() < minReadLength) { // both reads too short: report an empty result and stop
+ TRACE("Reads are both too short -- returning");
+ for (int whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+ result->location[whichRead] = 0;
+ result->mapq[whichRead] = 0;
+ result->score[whichRead] = 0;
+ result->status[whichRead] = NotFound;
+ }
+ result->alignedAsPair = false;
+ result->fromAlignTogether = false;
+ result->nanosInAlignTogether = 0;
+ result->nLVCalls = 0;
+ result->nSmallHits = 0;
+ return;
+ }
+ _int64 start = timeInNanos();
+ if (read0->getDataLength() >= minReadLength && read1->getDataLength() >= minReadLength) { // the paired aligner is only tried when both reads are long enough
+ //
+ // Let the LVs use the cache that we built up.
+ //
+ underlyingPairedEndAligner->align(read0, read1, result, maxEditDistanceForSecondaryResults, secondaryResultBufferSize, nSecondaryResults, secondaryResults,
+ singleSecondaryBufferSize, maxSecondaryAlignmentsToReturn, nSingleEndSecondaryResultsForFirstRead, nSingleEndSecondaryResultsForSecondRead,
+ singleEndSecondaryResults);
+ _int64 end = timeInNanos();
+ result->nanosInAlignTogether = end - start; // time spent inside the underlying paired aligner
+ result->fromAlignTogether = true;
+ result->alignedAsPair = true;
+ if (forceSpacing) { // with forceSpacing set, the paired result is final and the single-end fallback below is never used
+ if (result->status[0] == NotFound) {
+ result->fromAlignTogether = false;
+ }
+ else {
+ _ASSERT(result->status[1] != NotFound); // If one's not found, so is the other
+ }
+ return;
+ }
+ if (result->status[0] != NotFound && result->status[1] != NotFound) {
+ //
+ // Not a chimeric read.
+ //
+ return;
+ }
+ }
+ //
+ // If the intersecting aligner didn't find an alignment for these reads, then they may be
+ // chimeric and so we should just align them with the single end aligner and apply a MAPQ penalty.
+ //
+ Read *read[NUM_READS_PER_PAIR] = {read0, read1};
+ int *resultCount[2] = {nSingleEndSecondaryResultsForFirstRead, nSingleEndSecondaryResultsForSecondRead}; // per-read output counters, indexed like read[]
+ for (int r = 0; r < NUM_READS_PER_PAIR; r++) {
+ SingleAlignmentResult singleResult;
+ int singleEndSecondaryResultsThisTime = 0;
+ if (read[r]->getDataLength() < minReadLength) { // a too-short read is reported as unaligned without calling the single-end aligner
+ result->status[r] = NotFound;
+ result->mapq[r] = 0;
+ result->direction[r] = FORWARD;
+ result->location[r] = 0;
+ result->score[r] = 0;
+ } else {
+ // We're using *nSingleEndSecondaryResultsForFirstRead because it's either 0 or what all we've seen (i.e., we know NUM_READS_PER_PAIR is 2)
+ singleAligner->AlignRead(read[r], &singleResult, maxEditDistanceForSecondaryResults,
+ singleSecondaryBufferSize - *nSingleEndSecondaryResultsForFirstRead, &singleEndSecondaryResultsThisTime, // remaining room in the shared secondary-result buffer
+ maxSecondaryAlignmentsToReturn, singleEndSecondaryResults + *nSingleEndSecondaryResultsForFirstRead); // the second read's results are written after the first read's
+ *(resultCount[r]) = singleEndSecondaryResultsThisTime;
+ result->status[r] = singleResult.status;
+ result->mapq[r] = singleResult.mapq / 3; // Heavy quality penalty for chimeric reads
+ result->direction[r] = singleResult.direction;
+ result->location[r] = singleResult.location;
+ result->score[r] = singleResult.score;
+ }
+ }
+ result->fromAlignTogether = false; // the result now comes from the single-end fallback, not from the paired aligner
+ result->alignedAsPair = false;
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("ChimericPairedEndAligner: (%u, %u) score (%d, %d), MAPQ (%d, %d)\n\n\n",result->location[0], result->location[1],
+ result->score[0], result->score[1], result->mapq[0], result->mapq[1]);
+ }
+#endif // _DEBUG
diff --git a/SNAPLib/ChimericPairedEndAligner.h b/SNAPLib/ChimericPairedEndAligner.h
new file mode 100644
index 0000000..cb1f3b3
--- /dev/null
+++ b/SNAPLib/ChimericPairedEndAligner.h
@@ -0,0 +1,101 @@
+Module Name:
+ ChimericPairedEndAligner.h
+ A paired-end aligner calls into a different paired-end aligner, and if
+ it fails to find an alignment, aligns each of the reads singly. This handles
+ chimeric reads that would otherwise be unalignable.
+ Bill Bolosky, June, 2013
+ User mode service.
+Revision History:
+#pragma once
+#include "PairedEndAligner.h"
+#include "BaseAligner.h"
+#include "BigAlloc.h"
+class ChimericPairedEndAligner : public PairedEndAligner { // Wraps another PairedEndAligner and falls back to single-end alignment of each read when the pair cannot be aligned together
+ ChimericPairedEndAligner(
+ GenomeIndex *index_,
+ unsigned maxReadSize,
+ unsigned maxHits,
+ unsigned maxK,
+ unsigned maxSeedsFromCommandLine,
+ double seedCoverage,
+ unsigned minWeightToCheck,
+ bool forceSpacing_,
+ unsigned extraSearchDepth,
+ bool noUkkonen,
+ bool noOrderedEvaluation,
+ bool noTruncation,
+ PairedEndAligner *underlyingPairedEndAligner_,
+ unsigned minReadLength_,
+ int maxSecondaryAlignmentsPerContig,
+ BigAllocator *allocator);
+ virtual ~ChimericPairedEndAligner();
+ static unsigned getMaxSingleEndSecondaryResults(unsigned maxSeedsToUse, double maxSeedCoverage, unsigned maxReadSize, unsigned maxHits, unsigned seedLength) // the single-end bound for one read, multiplied by the number of reads in a pair
+ {
+ return BaseAligner::getMaxSecondaryResults(maxSeedsToUse, maxSeedCoverage, maxReadSize, maxHits, seedLength) * NUM_READS_PER_PAIR;
+ }
+ static size_t getBigAllocatorReservation(GenomeIndex * index, unsigned maxReadSize, unsigned maxHits, unsigned seedLen, unsigned maxSeedsFromCommandLine,
+ double seedCoverage, unsigned maxEditDistanceToConsider, unsigned maxExtraSearchDepth, unsigned maxCandidatePoolSize,
+ int maxSecondaryAlignmentsPerContig);
+ void *operator new(size_t size, BigAllocator *allocator) {_ASSERT(size == sizeof(ChimericPairedEndAligner)); return allocator->allocate(size);} // placement form: storage comes from the supplied BigAllocator
+ void operator delete(void *ptr, BigAllocator *allocator) {/* do nothing. Memory gets cleaned up when the allocator is deleted.*/}
+ virtual void align(
+ Read *read0,
+ Read *read1,
+ PairedAlignmentResult *result,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ PairedAlignmentResult *secondaryResults, // The caller passes in a buffer of secondaryResultBufferSize and it's filled in by AlignRead()
+ int singleSecondaryBufferSize,
+ int maxSecondaryAlignmentsToReturn,
+ int *nSingleEndSecondaryResultsForFirstRead,
+ int *nSingleEndSecondaryResultsForSecondRead,
+ SingleAlignmentResult *singleEndSecondaryResults // Single-end secondary alignments for when the paired-end alignment didn't work properly
+ );
+ void *operator new(size_t size) {return BigAlloc(size);} // plain form: storage comes from BigAlloc and is released by the matching operator delete
+ void operator delete(void *ptr) {BigDealloc(ptr);}
+ virtual _int64 getLocationsScored() const { // sum of the counts reported by the paired aligner and the single-end aligner
+ return underlyingPairedEndAligner->getLocationsScored() + singleAligner->getLocationsScored();
+ }
+ bool forceSpacing;
+ BaseAligner *singleAligner; // single-end aligner used for the fallback path
+ PairedEndAligner *underlyingPairedEndAligner; // paired aligner tried first
+ // avoid allocation in aligner calls
+ IdPairVector* singleSecondary[2];
+ LandauVishkin<1> lv; // passed to both singleAligner and underlyingPairedEndAligner in the constructor
+ LandauVishkin<-1> reverseLV; // passed to both singleAligner and underlyingPairedEndAligner in the constructor
+ GenomeIndex *index;
+ unsigned minReadLength;
diff --git a/SNAPLib/CommandProcessor.cpp b/SNAPLib/CommandProcessor.cpp
new file mode 100644
index 0000000..c0ed872
--- /dev/null
+++ b/SNAPLib/CommandProcessor.cpp
@@ -0,0 +1,182 @@
+Module Name:
+Code for running the top-level commands of SNAP
+Bill Bolosky, November, 2014
+User mode service.
+Revision History:
+Pulled from the main program and expanded to handle daemon mode
+#include "stdafx.h"
+#include "options.h"
+#include "FASTA.h"
+#include "GenomeIndex.h"
+#include "SingleAligner.h"
+#include "PairedAligner.h"
+#include "exit.h"
+#include "SeedSequencer.h"
+#include "AlignerOptions.h"
+#include "CommandProcessor.h"
+#include "Error.h"
+#include "Compat.h"
+const char *SNAP_VERSION = "1.0beta.18";
+static void usage() // Print the top-level command summary through WriteErrorMessage
+ WriteErrorMessage(
+ "Usage: snap <command> [<options>]\n"
+ "Commands:\n"
+ " index build a genome index\n"
+ " single align single-end reads\n"
+ " paired align paired-end reads\n"
+ " daemon run in daemon mode--accept commands remotely\n"
+ "Type a command without arguments to see its help.\n");
+void ProcessNonDaemonCommands(int argc, const char **argv) { // Dispatch on argv[1]: "index", or one or more chained "single"/"paired" alignment commands
+ if (strcmp(argv[1], "index") == 0) {
+ if (CommandPipe == NULL) { // CommandPipe is only set by RunDaemonMode, so NULL means a normal command-line run
+ GenomeIndex::runIndexer(argc - 2, argv + 2);
+ } else {
+ //
+ // The error cases in index build don't really free memory properly, so we just don't allow it in daemon mode.
+ //
+ WriteErrorMessage("The index command is not available in daemon mode. Please run 'snap index' directly.\n");
+ }
+ } else if (strcmp(argv[1], "single") == 0 || strcmp(argv[1], "paired") == 0) {
+ for (int i = 1; i < argc; /* i is increased below */) { // several alignment commands may follow one another on the same command line
+ unsigned nArgsConsumed;
+ if (strcmp(argv[i], "single") == 0) {
+ SingleAlignerContext single;
+ single.runAlignment(argc - i, argv + i, SNAP_VERSION, &nArgsConsumed);
+ } else if (strcmp(argv[i], "paired") == 0) {
+ PairedAlignerContext paired;
+ paired.runAlignment(argc - i, argv + i, SNAP_VERSION, &nArgsConsumed);
+ } else {
+ fprintf(stderr, "Invalid command: %s\n\n", argv[i]); // NOTE(review): goes straight to stderr, unlike the WriteErrorMessage call used for the same message below -- confirm this is intended
+ usage();
+ return;
+ }
+ _ASSERT(nArgsConsumed > 0); // otherwise the loop would never advance
+ i += nArgsConsumed; // skip past the arguments the alignment run just used
+ }
+ } else {
+ WriteErrorMessage("Invalid command: %s\n\n", argv[1]);
+ usage();
+ }
+static void daemonUsage() // Print the daemon-mode usage line to stderr and exit with status 1
+ fprintf(stderr, "Usage: snap daemon [Named pipe name]\n");
+ soft_exit_no_print(1); // Don't use soft_exit, it's confusing people to get an "error" message after the usage
+// Run SNAP as a daemon: open a named pipe (argv[2] if given, otherwise
+// DEFAULT_NAMED_PIPE_NAME), then loop forever reading commands from it and
+// executing them with ProcessNonDaemonCommands. Each command is an argument
+// count followed by that many arguments, one per read. The loop ends when the
+// pipe closes, when an argument cannot be read, or when an "exit" command arrives.
+void RunDaemonMode(int argc, const char **argv)
+ if (argc < 2 || argc > 3) {
+ daemonUsage();
+ }
+ printf("SNAP in daemon mode, waiting for commands to execute\n");
+ const char *pipeName = argc == 3 ? argv[2] : DEFAULT_NAMED_PIPE_NAME;
+ CommandPipe = OpenNamedPipe(pipeName, true);
+ if (NULL == CommandPipe) {
+ WriteErrorMessage("Unable to open named pipe for command IO.\n");
+ soft_exit(1);
+ }
+ const size_t commandBufferSize = 10000; // Yes, this is fixed size, no it's not a buffer overflow. The named pipe reader just quits if it's too long.
+ char commandBuffer[commandBufferSize];
+ //
+ // Format of commands is argc (in ascii) followed by argc arguments, each in one line.
+ //
+ for (;;) {
+ if (!ReadFromNamedPipe(CommandPipe, commandBuffer, commandBufferSize)) {
+ CloseNamedPipe(CommandPipe);
+ CommandPipe = NULL;
+ WriteStatusMessage("Named pipe closed. Exiting\n");
+ soft_exit_no_print(0);
+ }
+ int argc = atoi(commandBuffer);
+ //
+ // atoi yields 0 for non-numeric text, but a client can also send a negative number.
+ // A negative count must never reach the array allocation below, so reject both.
+ //
+ if (argc <= 0) {
+ WriteErrorMessage("Expected argument count on named pipe, got '%s'; ignoring.\n", commandBuffer);
+ } else {
+ char **argv = new char*[argc];
+ for (int i = 0; i < argc; i++) {
+ argv[i] = new char[commandBufferSize];
+ if (!ReadFromNamedPipe(CommandPipe, argv[i], commandBufferSize)) {
+ CloseNamedPipe(CommandPipe);
+ CommandPipe = NULL;
+ WriteStatusMessage("Error reading argument #%d from named pipe.\n", i);
+ soft_exit(1);
+ }
+ } // for each arg
+ if (argc > 1 && strcmp(argv[1], "exit") == 0) {
+ WriteStatusMessage("SNAP server exiting by request\n");
+ WriteToNamedPipe(CommandPipe, CommandExecutedString);
+ soft_exit_no_print(1);
+ }
+ printf("Executing command: ");
+ for (int i = 1; i < argc; i++) {
+ printf("%s ", argv[i]);
+ }
+ printf("\n");
+ ProcessNonDaemonCommands(argc, (const char **) argv);
+ printf("\n");
+ for (int i = 0; i < argc; i++) {
+ delete[] argv[i];
+ argv[i] = NULL;
+ }
+ delete[] argv;
+ argv = NULL;
+ }
+ // Tell the client the command has finished, whether it ran or was ignored.
+ WriteToNamedPipe(CommandPipe, CommandExecutedString);
+ }
+void ProcessTopLevelCommands(int argc, const char **argv) // Entry point for command handling: print the banner, then run either daemon mode or a single command line
+ fprintf(stderr, "Welcome to SNAP version %s.\n\n", SNAP_VERSION); // Can't use WriteStatusMessage, because we haven't parsed args yet to determine if -hdp is specified. Just stick with stderr.
+ InitializeSeedSequencers();
+ if (argc < 2) { // no command given: show usage and exit with status 1
+ usage();
+ soft_exit_no_print(1);
+ }
+ if (strcmp(argv[1], "daemon") == 0) {
+ RunDaemonMode(argc, argv);
+ } else {
+ ProcessNonDaemonCommands(argc, argv);
+ }
+NamedPipe *CommandPipe = NULL;
+const char *CommandExecutedString = "***SNAP Command completed execution***";
diff --git a/SNAPLib/CommandProcessor.h b/SNAPLib/CommandProcessor.h
new file mode 100644
index 0000000..cdd0319
--- /dev/null
+++ b/SNAPLib/CommandProcessor.h
@@ -0,0 +1,29 @@
+Module Name:
+Header for running the top-level commands of SNAP
+Bill Bolosky, November, 2014
+User mode service.
+Revision History:
+#pragma once
+#include "Compat.h"
+extern void ProcessTopLevelCommands(int argc, const char **argv);
+extern NamedPipe *CommandPipe;
+extern const char *CommandExecutedString; // Sent back along the command pipe to indicate that the whole thing is done and SNAPCommand should exit
diff --git a/SNAPLib/Compat.cpp b/SNAPLib/Compat.cpp
new file mode 100644
index 0000000..c74e749
--- /dev/null
+++ b/SNAPLib/Compat.cpp
@@ -0,0 +1,2217 @@
+Module Name:
+ compat.cpp
+ Functions that provide compatibility between the Windows and Linux versions,
+ and mostly that serve to keep #ifdef's out of the main code in order to
+ improve readability.
+ Bill Bolosky, November, 2011
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Compat.h"
+#include "BigAlloc.h"
+#ifndef _MSC_VER
+#include <fcntl.h>
+#include <aio.h>
+#include <err.h>
+#include <unistd.h>
+#include <signal.h>
+#include "exit.h"
+#include <map>
+#include "DataWriter.h"
+#include "Error.h"
+using std::min;
+using std::max;
+#undef AcquireExclusiveLock
+#undef WaitForSingleWaiterObject
+#undef WaitForEvent
+void AcquireUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock);
+bool WaitForSingleWaiterObject(SingleWaiterObject *singleWaiterObject);
+void WaitForEvent(EventObject *eventObject);
+using std::map;
+using std::string;
+static map<string,_int64> times;
+void addTime(const char* fn, int line, _int64 time) // Add `time` to the running total kept in the file-level `times` map under the key "<fn>:<line>"
+ if (time > 0) { // zero or negative durations are not recorded
+ char s[20];
+ sprintf(s, ":%d", line);
+ string key = string(fn) + string(s);
+ times[key] += time; // operator[] creates the entry at 0 the first time a key is seen
+ }
+void AcquireExclusiveLockProfile(ExclusiveLock *lock, const char* fn, int line) // Acquire the lock and charge the elapsed milliseconds to the call site fn:line
+ _int64 start = timeInMillis();
+ AcquireExclusiveLock(lock);
+ addTime(fn, line, timeInMillis() - start);
+bool WaitForSingleWaiterObjectProfile(SingleWaiterObject *singleWaiterObject, const char* fn, int line) // Wait on the object, charge the elapsed milliseconds to fn:line, and pass the wait's result through
+ _int64 start = timeInMillis();
+ bool result = WaitForSingleWaiterObject(singleWaiterObject);
+ addTime(fn, line, timeInMillis() - start);
+ return result;
+void WaitForEventProfile(EventObject *eventObject, const char* fn, int line) // Wait for the event and charge the elapsed milliseconds to the call site fn:line
+ _int64 start = timeInMillis();
+ WaitForEvent(eventObject);
+ addTime(fn, line, timeInMillis() - start);
+// Print one line per profiled call site ("function:line") with the total time
+// spent waiting there, in seconds.
+void PrintWaitProfile()
+ printf("function:line wait_time (s)\n");
+ for (map<string,_int64>::iterator lt = times.begin(); lt != times.end(); lt++) {
+ // The totals are sums of timeInMillis() differences, so milliseconds -> seconds is a factor of 0.001.
+ printf("%s %.3f\n", lt->first.data(), lt->second * 0.001);
+ }
+#ifdef _MSC_VER
+const void* memmem(const void* data, const size_t dataLength, const void* pattern, const size_t patternLength) // Return a pointer to the first occurrence of pattern inside data, or NULL if there is none
+ if (dataLength < patternLength) { // the pattern cannot fit at all
+ return NULL;
+ }
+ const void* p = data;
+ const char first = *(char*)pattern; // NOTE(review): reads the first pattern byte even when patternLength is 0 -- confirm callers never pass an empty pattern
+ size_t count = dataLength - patternLength + 1; // number of start positions at which the whole pattern still fits
+ while (count > 0) {
+ const void* q = memchr(p, first, count); // jump to the next byte that matches the pattern's first byte
+ if (q == NULL) {
+ return NULL;
+ }
+ if (0 == memcmp(q, pattern, patternLength)) {
+ return q;
+ }
+ count -= ((char*)q - (char*)p) + 1; // drop everything up to and including the candidate that failed
+ p = (char*)q + 1;
+ }
+ return NULL;
+_int64 timeInMillis()
+ * Get the current time in milliseconds since some arbitrary starting point
+ * (e.g. system boot or epoch)
+ */
+ return GetTickCount64();
+// Return the Windows performance counter converted to nanoseconds.
+// The counter frequency is queried once and cached in a function-local static.
+_int64 timeInNanos()
+ static _int64 performanceFrequency = 0;
+ if (0 == performanceFrequency) {
+ QueryPerformanceFrequency((PLARGE_INTEGER)&performanceFrequency);
+ }
+ LARGE_INTEGER perfCount;
+ QueryPerformanceCounter(&perfCount);
+ //
+ // Multiplying the raw tick count by 10^9 overflows a signed 64-bit integer once the
+ // count passes roughly 9.2 * 10^9 ticks. Convert the whole seconds and the leftover
+ // ticks separately; the sum equals floor(ticks * 10^9 / frequency) without the
+ // oversized intermediate product.
+ //
+ const _int64 wholeSeconds = perfCount.QuadPart / performanceFrequency;
+ const _int64 leftoverTicks = perfCount.QuadPart % performanceFrequency;
+ return wholeSeconds * 1000000000 + leftoverTicks * 1000000000 / performanceFrequency;
+void AcquireUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock) {
+ EnterCriticalSection(lock);
+void ReleaseUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock) {
+ LeaveCriticalSection(lock);
+bool InitializeUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock) {
+ InitializeCriticalSection(lock);
+ return true;
+bool DestroyUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock) {
+ DeleteCriticalSection(lock);
+ return true;
+bool CreateSingleWaiterObject(SingleWaiterObject *newWaiter)
+ *newWaiter = CreateEvent(NULL,TRUE,FALSE,NULL);
+ if (NULL == *newWaiter) {
+ return false;
+ }
+ return true;
+void DestroySingleWaiterObject(SingleWaiterObject *waiter)
+ CloseHandle(*waiter);
+void SignalSingleWaiterObject(SingleWaiterObject *singleWaiterObject) {
+ SetEvent(*singleWaiterObject);
+bool WaitForSingleWaiterObject(SingleWaiterObject *singleWaiterObject) {
+ DWORD retVal = WaitForSingleObject(*singleWaiterObject,INFINITE);
+ if (WAIT_OBJECT_0 != retVal) {
+ return false;
+ }
+ return true;
+void ResetSingleWaiterObject(SingleWaiterObject *singleWaiterObject) {
+ ResetEvent(*singleWaiterObject);
+// In Windows, the single waiter objects are already implemented using events.
+void CreateEventObject(EventObject *newEvent) {CreateSingleWaiterObject(newEvent);}
+void DestroyEventObject(EventObject *eventObject) {DestroySingleWaiterObject(eventObject);}
+void AllowEventWaitersToProceed(EventObject *eventObject) {SignalSingleWaiterObject(eventObject);}
+void PreventEventWaitersFromProceeding(EventObject *eventObject) {ResetSingleWaiterObject(eventObject);}
+void WaitForEvent(EventObject *eventObject) {WaitForSingleWaiterObject(eventObject);}
+bool WaitForEventWithTimeout(EventObject *eventObject, _int64 timeoutInMillis) // Wait for the event for at most timeoutInMillis; true if it was signalled, false on timeout
+ DWORD retVal = WaitForSingleObjectEx(*eventObject, (unsigned)timeoutInMillis, FALSE); // note: the 64-bit timeout is narrowed to unsigned here
+ if (retVal == WAIT_OBJECT_0) {
+ return true;
+ } else if (retVal == WAIT_TIMEOUT) {
+ return false;
+ }
+ WriteErrorMessage("WaitForSingleObject returned unexpected result %d (error %d)\n", retVal, GetLastError()); // any other return value is treated as fatal
+ soft_exit(1);
+ return false; // NOTREACHED: Just to avoid the compiler complaining.
+void BindThreadToProcessor(unsigned processorNumber) // This hard binds a thread to a processor. You can no-op it at some perf hit.
+ if (!SetThreadAffinityMask(GetCurrentThread(),((unsigned _int64)1) << processorNumber)) { // the mask has only the bit for processorNumber set
+ WriteErrorMessage("Binding thread to processor %d failed, %d\n",processorNumber,GetLastError()); // failure is reported but execution continues
+ }
+int InterlockedIncrementAndReturnNewValue(volatile int *valueToIncrement)
+ return InterlockedIncrement((volatile long *)valueToIncrement);
+int InterlockedDecrementAndReturnNewValue(volatile int *valueToDecrement)
+ return InterlockedDecrement((volatile long *)valueToDecrement);
+_uint32 InterlockedCompareExchange32AndReturnOldValue(volatile _uint32 *valueToUpdate, _uint32 replacementValue, _uint32 desiredPreviousValue)
+ return (_uint32) InterlockedCompareExchange(valueToUpdate, replacementValue, desiredPreviousValue);
+_uint64 InterlockedCompareExchange64AndReturnOldValue(volatile _uint64 *valueToUpdate, _uint64 replacementValue, _uint64 desiredPreviousValue)
+ return (_uint64) InterlockedCompareExchange(valueToUpdate, replacementValue, desiredPreviousValue);
+void* InterlockedCompareExchangePointerAndReturnOldValue(void * volatile *valueToUpdate, void* replacementValue, void* desiredPreviousValue)
+ return InterlockedCompareExchangePointer(valueToUpdate, replacementValue, desiredPreviousValue);
+struct WrapperThreadContext {
+ ThreadMainFunction mainFunction;
+ void *mainFunctionParameter;
+WrapperThreadMain(PVOID Context)
+ WrapperThreadContext *context = (WrapperThreadContext *)Context;
+ (*context->mainFunction)(context->mainFunctionParameter);
+ delete context;
+ context = NULL;
+ return 0;
+bool StartNewThread(ThreadMainFunction threadMainFunction, void *threadMainFunctionParameter) // Start a thread that runs threadMainFunction(threadMainFunctionParameter); returns false if it could not be created
+ WrapperThreadContext *context = new WrapperThreadContext; // handed to the new thread, which deletes it in WrapperThreadMain
+ if (NULL == context) {
+ return false;
+ }
+ context->mainFunction = threadMainFunction;
+ context->mainFunctionParameter = threadMainFunctionParameter;
+ HANDLE hThread;
+ DWORD threadId;
+ hThread = CreateThread(NULL,0,WrapperThreadMain,context,0,&threadId);
+ if (NULL == hThread) {
+ WriteErrorMessage("Create thread failed, %d\n",GetLastError());
+ delete context; // the thread never started, so the context is freed here instead
+ context = NULL;
+ return false;
+ }
+ CloseHandle(hThread); // the handle is not kept; the thread keeps running after it is closed
+ hThread = NULL;
+ return true;
+void SleepForMillis(unsigned millis)
+ Sleep(millis);
+unsigned GetNumberOfProcessors()
+ SYSTEM_INFO systemInfo[1];
+ GetSystemInfo(systemInfo);
+ return systemInfo->dwNumberOfProcessors;
+_int64 QueryFileSize(const char *fileName) {
+ if (INVALID_HANDLE_VALUE == hFile) {
+ WriteErrorMessage("Unable to open file '%s' for QueryFileSize, %d\n", fileName, GetLastError());
+ soft_exit(1);
+ }
+ if (!GetFileSizeEx(hFile,&fileSize)) {
+ WriteErrorMessage("GetFileSize failed, %d\n",GetLastError());
+ soft_exit(1);
+ }
+ CloseHandle(hFile);
+ return fileSize.QuadPart;
+ bool
+ const char* filename)
+ return DeleteFile(filename) ? true : false;
+ bool
+ const char* oldFileName,
+ const char* newFileName)
+ return MoveFile(oldFileName, newFileName) ? true : false;
+class LargeFileHandle
+ HANDLE handle;
+ // Open `filename` in mode "r", "w" or "a" and wrap the Windows handle in a newly
+ // allocated LargeFileHandle. Returns NULL (after reporting the error) on failure.
+ LargeFileHandle*
+ const char* filename,
+ const char* mode)
+ _ASSERT(strlen(mode) == 1 && (*mode == 'r' || *mode == 'w' || *mode == 'a'));
+ LargeFileHandle* result = new LargeFileHandle();
+ result->handle = CreateFile(filename,
+ *mode == 'r' ? GENERIC_READ :
+ *mode == 'a' ? FILE_APPEND_DATA
+ 0 /* exclusive */,
+ NULL);
+ if (result->handle == INVALID_HANDLE_VALUE) { // CreateFile reports failure with INVALID_HANDLE_VALUE, never NULL
+ WriteErrorMessage("open large file %s failed with 0x%x\n", filename, GetLastError());
+ delete result; // delete through the real type; deleting a void* is undefined behavior
+ return NULL;
+ }
+ return result;
+ size_t
+ LargeFileHandle* file,
+ void* buffer,
+ size_t bytes)
+ size_t count = bytes;
+ while (count > 0) {
+ DWORD step = 0;
+ if ((! WriteFile(file->handle, buffer, (DWORD) min(count, (size_t) 0x2000000), &step, NULL)) || step == 0) {
+ WriteErrorMessage("WriteLargeFile failed at %lu of %lu bytes with 0x%x\n", bytes - count, bytes, GetLastError());
+ return bytes - count;
+ }
+ count -= step;
+ buffer = ((char*) buffer) + step;
+ }
+ return bytes;
+ size_t
+ LargeFileHandle* file,
+ void* buffer,
+ size_t bytes)
+ size_t count = bytes;
+ while (count > 0) {
+ DWORD step = 0;
+ if ((! ReadFile(file->handle, buffer, (DWORD) min(count, (size_t) 0x1000000), &step, NULL)) || step == 0) {
+ WriteErrorMessage("ReadLargeFile failed at %lu of %lu bytes with 0x%x\n", bytes - count, bytes, GetLastError());
+ return bytes - count;
+ }
+ count -= step;
+ buffer = ((char*) buffer) + step;
+ }
+ return bytes;
+ void
+ LargeFileHandle* file)
+ if (CloseHandle(file->handle)) {
+ delete (void*) file;
+ }
+class MemoryMappedFile
+ HANDLE fileHandle;
+ HANDLE fileMapping;
+ void* mappedAddress;
+ // Open `filename`, create a file mapping for it and map `length` bytes starting at
+ // `offset`. On success the mapped address is stored in *o_contents and a newly
+ // allocated MemoryMappedFile is returned. On failure the error is reported, every
+ // handle opened so far is closed, and NULL is returned.
+ MemoryMappedFile*
+ const char* filename,
+ size_t offset,
+ size_t length,
+ void** o_contents,
+ bool write,
+ bool sequential)
+ MemoryMappedFile* result = new MemoryMappedFile();
+ result->fileHandle = CreateFile(filename, (write ? GENERIC_WRITE : 0) | GENERIC_READ, 0, NULL, OPEN_EXISTING,
+ if (result->fileHandle == INVALID_HANDLE_VALUE) { // CreateFile reports failure with INVALID_HANDLE_VALUE, never NULL
+ WriteErrorMessage("unable to open mapped file %s error 0x%x\n", filename, GetLastError());
+ delete result;
+ return NULL;
+ }
+ result->fileMapping = CreateFileMapping(result->fileHandle, NULL, write ? PAGE_READWRITE : PAGE_READONLY, 0, 0, NULL);
+ if (result->fileMapping == NULL) {
+ WriteErrorMessage("unable to create file mapping %s error 0x%x\n", filename, GetLastError());
+ CloseHandle(result->fileHandle); // do not leak the file handle opened above
+ delete result;
+ return NULL;
+ }
+ *o_contents = result->mappedAddress = MapViewOfFile(result->fileMapping,
+ (DWORD) (offset >> (8 * sizeof(DWORD))),
+ (DWORD) offset,
+ length);
+ if (*o_contents == NULL) {
+ WriteErrorMessage("unable to map file %s error 0x%x\n", filename, GetLastError());
+ CloseHandle(result->fileMapping); // do not leak the mapping or the file handle
+ CloseHandle(result->fileHandle);
+ delete result;
+ return NULL;
+ }
+ return result;
+ void
+ MemoryMappedFile* mappedFile)
+ bool ok = UnmapViewOfFile(mappedFile->mappedAddress) &&
+ CloseHandle(mappedFile->fileMapping) &&
+ CloseHandle(mappedFile->fileHandle);
+ if (ok) {
+ delete (void*) mappedFile;
+ } else {
+ WriteErrorMessage("unable to close memory mapped file, error 0x%x\n", GetLastError());
+ }
+void AdviseMemoryMappedFilePrefetch(const MemoryMappedFile *mappedFile)
+ // No-op on Windows.
+class WindowsAsyncFile : public AsyncFile
+ static WindowsAsyncFile* open(const char* filename, bool write);
+ WindowsAsyncFile(HANDLE i_hFile);
+ virtual bool close();
+ class Writer : public AsyncFile::Writer
+ {
+ public:
+ Writer(WindowsAsyncFile* i_file);
+ virtual bool close();
+ virtual bool beginWrite(void* buffer, size_t length, size_t offset, size_t *bytesWritten);
+ virtual bool waitForCompletion();
+ private:
+ WindowsAsyncFile* file;
+ bool writing;
+ };
+ virtual AsyncFile::Writer* getWriter();
+ class Reader : public AsyncFile::Reader
+ {
+ public:
+ Reader(WindowsAsyncFile* i_file);
+ virtual bool close();
+ virtual bool beginRead(void* buffer, size_t length, size_t offset, size_t *bytesRead);
+ virtual bool waitForCompletion();
+ private:
+ WindowsAsyncFile* file;
+ bool reading;
+ };
+ virtual AsyncFile::Reader* getReader();
+ HANDLE hFile;
+ WindowsAsyncFile*
+ const char* filename,
+ bool write)
+ HANDLE hFile = CreateFile(filename,
+ write ? 0 : FILE_SHARE_READ,
+ NULL);
+ if (INVALID_HANDLE_VALUE == hFile) {
+ WriteErrorMessage("Unable to create SAM file '%s', %d\n",filename,GetLastError());
+ return NULL;
+ }
+ return new WindowsAsyncFile(hFile);
+ HANDLE i_hFile)
+ : hFile(i_hFile)
+ bool
+ return CloseHandle(hFile) ? true : false;
+ AsyncFile::Writer*
+ return new Writer(this);
+WindowsAsyncFile::Writer::Writer(WindowsAsyncFile* i_file)
+ : file(i_file), writing(false)
+ lap.hEvent = CreateEvent(NULL,FALSE,FALSE,NULL);
+ bool
+ waitForCompletion();
+ return CloseHandle(lap.hEvent) ? true : false;
+ bool
+ void* buffer,
+ size_t length,
+ size_t offset,
+ size_t *bytesWritten)
+ if (! waitForCompletion()) {
+ return false;
+ }
+ lap.OffsetHigh = (DWORD) (offset >> (8 * sizeof(DWORD)));
+ lap.Offset = (DWORD) offset;
+ if (!WriteFile(file->hFile,buffer, (DWORD) length, (LPDWORD) bytesWritten, &lap)) {
+ if (ERROR_IO_PENDING != GetLastError()) {
+ WriteErrorMessage("WindowsAsyncFile: WriteFile failed, %d\n",GetLastError());
+ return false;
+ }
+ }
+ writing = true;
+ return true;
+ bool
+ if (writing) {
+ DWORD nBytesTransferred;
+ if (!GetOverlappedResult(file->hFile,&lap,&nBytesTransferred,TRUE)) {
+ return false;
+ }
+ writing = false;
+ }
+ return true;
+ AsyncFile::Reader*
+ return new Reader(this);
+ WindowsAsyncFile* i_file)
+ : file(i_file), reading(false)
+ lap.hEvent = CreateEvent(NULL,FALSE,FALSE,NULL);
+ bool
+ return CloseHandle(lap.hEvent) ? true : false;
+ // Start an overlapped read of `length` bytes at file `offset` into `buffer`, after
+ // waiting for any read still outstanding on this Reader. Returns false if that wait
+ // fails or if ReadFile fails with anything other than ERROR_IO_PENDING.
+ bool
+ void* buffer,
+ size_t length,
+ size_t offset,
+ size_t* bytesRead)
+ if (! waitForCompletion()) {
+ return false;
+ }
+ // The OVERLAPPED structure carries the 64-bit file offset as two 32-bit halves.
+ lap.OffsetHigh = (DWORD) (offset >> (8 * sizeof(DWORD)));
+ lap.Offset = (DWORD) offset;
+ if (!ReadFile(file->hFile, buffer,(DWORD) length, (LPDWORD) bytesRead, &lap)) {
+ if (ERROR_IO_PENDING != GetLastError()) {
+ // Name the class and the call that actually failed.
+ WriteErrorMessage("WindowsAsyncFile: ReadFile failed, %d\n",GetLastError());
+ return false;
+ }
+ }
+ reading = true;
+ return true;
+ bool
+ if (reading) {
+ DWORD nBytesTransferred;
+ if (!GetOverlappedResult(file->hFile,&lap,&nBytesTransferred,TRUE)) {
+ return false;
+ }
+ reading = false;
+ }
+ return true;
+_int64 InterlockedAdd64AndReturnNewValue(volatile _int64 *valueToWhichToAdd, _int64 amountToAdd)
+ return InterlockedAdd64((volatile LONGLONG *)valueToWhichToAdd,(LONGLONG)amountToAdd);
+int _fseek64bit(FILE *stream, _int64 offset, int origin)
+ return _fseeki64(stream,offset,origin);
+int getpagesize()
+ SYSTEM_INFO systemInfo;
+ GetSystemInfo(&systemInfo);
+ return systemInfo.dwAllocationGranularity;
+ hMapping = NULL;
+ initialized = false;
+ pagesize = getpagesize();
+ mapCount = 0;
+#if 0
+ lap->hEvent = NULL;
+ prefetchBuffer = BigAlloc(prefetchBufferSize);
+ isPrefetchOutstanding = false;
+ lastPrefetch = 0;
+ millisSpentInReadFile = 0;
+ countOfImmediateCompletions = 0;
+ countOfDelayedCompletions = 0;
+ countOfFailures = 0;
+FileMapper::init(const char *i_fileName)
+ if (initialized) {
+ if (strcmp(fileName, i_fileName)) {
+ WriteErrorMessage("FileMapper already initialized with %s, cannot init with %s\n", fileName, i_fileName);
+ return false;
+ }
+ return true;
+ }
+ fileName = i_fileName;
+ if (INVALID_HANDLE_VALUE == hFile) {
+ WriteErrorMessage("Failed to open '%s', error %d\n",fileName, GetLastError());
+ return false;
+ }
+#if 0
+ if (INVALID_HANDLE_VALUE == hFilePrefetch) {
+ WriteErrorMessage("Failed to open '%s' for prefetch, error %d\n",fileName, GetLastError());
+ CloseHandle(hFile);
+ return false;
+ }
+ if (!GetFileInformationByHandle(hFile,&fileInfo)) {
+ WriteErrorMessage("Unable to get file information for '%s', error %d\n", fileName, GetLastError());
+ CloseHandle(hFile);
+#if 0
+ CloseHandle(hFilePrefetch);
+ return false;
+ }
+ liFileSize.HighPart = fileInfo.nFileSizeHigh;
+ liFileSize.LowPart = fileInfo.nFileSizeLow;
+ fileSize = liFileSize.QuadPart;
+ hMapping = CreateFileMapping(hFile,NULL,PAGE_READONLY,0,0,NULL);
+ if (NULL == hMapping) {
+ WriteErrorMessage("Unable to create mapping to file '%s', %d\n", fileName, GetLastError());
+ CloseHandle(hFile);
+#if 0
+ CloseHandle(hFilePrefetch);
+ return false;
+ }
+#if 0
+ lap->hEvent = CreateEvent(NULL,FALSE,FALSE,NULL);
+ initialized = true;
+ return true;
+char * // returns the address corresponding to `offset`, or NULL if the view could not be mapped
+FileMapper::createMapping(size_t offset, size_t amountToMap, void** o_mappedBase) // *o_mappedBase receives the start of the view, which is the pointer unmap() passes to UnmapViewOfFile
+ size_t beginRounding = offset % pagesize; // distance from the preceding pagesize boundary to `offset`
+ LARGE_INTEGER liStartingOffset;
+ liStartingOffset.QuadPart = offset - beginRounding; // the view starts on a pagesize boundary
+ size_t endRounding = 0;
+ if ((amountToMap + beginRounding) % pagesize != 0) {
+ endRounding = pagesize - (amountToMap + beginRounding) % pagesize; // pad the end up to the next pagesize boundary
+ }
+ size_t mapRequestSize = beginRounding + amountToMap + endRounding;
+ _ASSERT(mapRequestSize % pagesize == 0);
+ if (mapRequestSize + liStartingOffset.QuadPart >= fileSize) {
+ mapRequestSize = 0; // Says to just map the whole thing.
+ }
+ char* mappedBase = (char *)MapViewOfFile(hMapping,FILE_MAP_READ,liStartingOffset.HighPart,liStartingOffset.LowPart, mapRequestSize);
+ if (NULL == mappedBase) {
+ WriteErrorMessage("Unable to map file, %d\n", GetLastError());
+ return NULL;
+ }
+ char* mappedRegion = mappedBase + beginRounding; // step past the leading padding to the byte the caller asked for
+#if 0
+ prefetch(0);
+ InterlockedIncrementAndReturnNewValue(&mapCount); // count of live mappings; decremented again in unmap()
+ *o_mappedBase = mappedBase;
+ return mappedRegion;
+FileMapper::unmap(void* mappedBase)
+ _ASSERT(mapCount > 0);
+ if (mapCount > 0) {
+ int n = InterlockedDecrementAndReturnNewValue(&mapCount);
+ _ASSERT(n >= 0);
+ if (!UnmapViewOfFile(mappedBase)) {
+ WriteErrorMessage("Unmap of file failed, %d\n", GetLastError());
+ }
+ }
+ _ASSERT(mapCount == 0);
+#if 0
+ if (isPrefetchOutstanding) {
+ DWORD numberOfBytesTransferred;
+ GetOverlappedResult(hFile,lap,&numberOfBytesTransferred,TRUE);
+ }
+ BigDealloc(prefetchBuffer);
+ prefetchBuffer = NULL;
+ CloseHandle(hFilePrefetch);
+ CloseHandle(lap->hEvent);
+ CloseHandle(hMapping);
+ CloseHandle(hFile);
+ WriteErrorMessage("FileMapper: %lld immediate completions, %lld delayed completions, %lld failures, %lld ms in readfile (%lld ms/call)\n",countOfImmediateCompletions, countOfDelayedCompletions, countOfFailures, millisSpentInReadFile,
+ millisSpentInReadFile/max(1, countOfImmediateCompletions + countOfDelayedCompletions + countOfFailures));
+#if 0
+FileMapper::prefetch(size_t currentRead)
+ if (currentRead + prefetchBufferSize / 2 <= lastPrefetch || lastPrefetch + prefetchBufferSize >= amountMapped) {
+ //
+ // Nothing to do; we're either not ready for more prefetching or we're at the end of our region.
+ //
+ return;
+ }
+ if (isPrefetchOutstanding) {
+ //
+ // See if the last prefetch is done.
+ //
+ DWORD numberOfBytesTransferred;
+ if (GetOverlappedResult(hFile,lap,&numberOfBytesTransferred,FALSE)) {
+ isPrefetchOutstanding = false;
+ } else {
+#if DBG
+ if (GetLastError() != ERROR_IO_PENDING) {
+ WriteErrorMessage("mapped file prefetcher: GetOverlappedResult failed, %d\n", GetLastError());
+ }
+#endif // DBG
+ return; // There's still IO on outstanding, we can't start more.
+ }
+ }
+ DWORD amountToRead = (DWORD)__min(prefetchBufferSize, amountMapped - lastPrefetch);
+ _ASSERT(amountToRead > 0); // Else we should have failed the initial check and returned
+ LARGE_INTEGER liReadOffset;
+ lastPrefetch += prefetchBufferSize;
+ liReadOffset.QuadPart = lastPrefetch;
+ lap->OffsetHigh = liReadOffset.HighPart;
+ lap->Offset = liReadOffset.LowPart;
+ DWORD nBytesRead;
+ _int64 start = timeInMillis();
+ if (!ReadFile(hFilePrefetch,prefetchBuffer,amountToRead,&nBytesRead,lap)) {
+ if (GetLastError() == ERROR_IO_PENDING) {
+ InterlockedAdd64AndReturnNewValue(&countOfDelayedCompletions,1);
+ isPrefetchOutstanding = true;
+ } else {
+ InterlockedAdd64AndReturnNewValue(&countOfFailures,1);
+#if DBG
+ if (GetLastError() != ERROR_IO_PENDING) {
+ WriteErrorMessage("mapped file prefetcher: ReadFile failed, %d\n", GetLastError());
+ }
+#endif // DBG
+ isPrefetchOutstanding = false; // Just ignore it
+ }
+ } else {
+ InterlockedAdd64AndReturnNewValue(&countOfImmediateCompletions,1);
+ isPrefetchOutstanding = false;
+ }
+ InterlockedAdd64AndReturnNewValue(&millisSpentInReadFile,timeInMillis() - start);
+void PreventMachineHibernationWhileThisThreadIsAlive()
+void SetToLowSchedulingPriority()
+ if (!SetPriorityClass(GetCurrentProcess(), IDLE_PRIORITY_CLASS)) {
+ WriteErrorMessage("Unable to set process to background priority class, %d. Ignoring and proceeding at normal priority\n", GetLastError());
+ }
+struct NamedPipe {
+ HANDLE hPipe;
+NamedPipe *OpenNamedPipe(const char *pipeName, bool serverSide)
+ NamedPipe *pipe = new NamedPipe;
+ const char *prefix = "\\\\.\\pipe\\";
+ char *fullyQualifiedPipeName = new char[strlen(prefix) + strlen(pipeName) + 1]; // +1 for null
+ sprintf(fullyQualifiedPipeName, "%s%s", prefix, pipeName);
+ if (serverSide) {
+ if (INVALID_HANDLE_VALUE == pipe->hPipe) {
+ WriteErrorMessage("OpenNamedPipe('%s'): unable to open pipe, %d\n", fullyQualifiedPipeName, GetLastError());
+ delete pipe;
+ return NULL;
+ }
+ if (!ConnectNamedPipe(pipe->hPipe, NULL) && ERROR_PIPE_CONNECTED != GetLastError()) {
+ WriteErrorMessage("Unable to connect named pipe, %d\n", GetLastError());
+ delete pipe;
+ return NULL;
+ }
+ } else {
+ pipe->hPipe = CreateFile(fullyQualifiedPipeName, GENERIC_READ | GENERIC_WRITE, 0, NULL, OPEN_EXISTING, 0, NULL);
+ while (INVALID_HANDLE_VALUE == pipe->hPipe && GetLastError() == ERROR_PIPE_BUSY) {
+ WriteStatusMessage("Server is busy. Waiting.\n");
+ if (!WaitNamedPipe(fullyQualifiedPipeName, NMPWAIT_WAIT_FOREVER)) {
+ fprintf(stderr, "Waiting for server connection failed, %d\n", GetLastError());
+ delete pipe;
+ return NULL;
+ }
+ pipe->hPipe = CreateFile(fullyQualifiedPipeName, GENERIC_READ | GENERIC_WRITE, 0, NULL, OPEN_EXISTING, 0, NULL);
+ }
+ if (INVALID_HANDLE_VALUE == pipe->hPipe) {
+ WriteErrorMessage("Unable to open server connection '%s', %d\n", fullyQualifiedPipeName, GetLastError());
+ delete pipe;
+ return NULL;
+ }
+ }
+ return pipe;
+bool ReadFromNamedPipe(NamedPipe *pipe, char *outputBuffer, size_t outputBufferSize)
+ DWORD bytesRead;
+ while (!ReadFile(pipe->hPipe, outputBuffer, (DWORD)outputBufferSize, &bytesRead, NULL)) {
+ if (GetLastError() != ERROR_BROKEN_PIPE && GetLastError() != ERROR_NO_DATA) {
+ fprintf(stderr, "Read named pipe failed, %d\n", GetLastError()); // Don't use WriteErrorMessage, it will try to send on the pipe
+ return false;
+ }
+ if (GetLastError() == ERROR_BROKEN_PIPE) {
+ if (!DisconnectNamedPipe(pipe->hPipe)) {
+ fprintf(stderr, "Disconnect named pipe failed, %d; ignoring\n", GetLastError());
+ }
+ if (!ConnectNamedPipe(pipe->hPipe, NULL) && ERROR_PIPE_CONNECTED != GetLastError()) {
+ fprintf(stderr, "ReadFromNamedPipe: reconnecting to pipe failed, %d\n", GetLastError());
+ return false;
+ }
+ }
+ }
+ return true;
+bool WriteToNamedPipe(NamedPipe *pipe, const char *stringToWrite)
+ DWORD bytesWritten;
+ if (!WriteFile(pipe->hPipe, stringToWrite, (DWORD)strlen(stringToWrite) + 1, &bytesWritten, NULL)) { // +1 sends terminating NULL
+ fprintf(stderr, "WriteToNamedPipe: write failed, %d\n", GetLastError());
+ return false;
+ }
+ FlushFileBuffers(pipe->hPipe);
+ if (bytesWritten != strlen(stringToWrite) + 1) {
+ fprintf(stderr, "WriteToNamedPipe: expected to write %lld bytes, actually wrote %d\n", strlen(stringToWrite) + 1, bytesWritten);
+ }
+ return bytesWritten == strlen(stringToWrite) + 1;
+void CloseNamedPipe(NamedPipe *pipe)
+ CloseHandle(pipe->hPipe);
+ delete pipe;
+#else // _MSC_VER
+#if defined(__MACH__)
+#include <mach/clock.h>
+#include <mach/mach.h>
+_int64 timeInMillis()
+ * Get the current time in milliseconds since some arbitrary starting point
+ * (e.g. system boot or epoch)
+ */
+ timeval t;
+ gettimeofday(&t, NULL);
+ return ((_int64) t.tv_sec) * 1000 + ((_int64) t.tv_usec) / 1000;
+_int64 timeInNanos()
+ timespec ts;
+#if defined(__linux__)
+ clock_gettime(CLOCK_REALTIME, &ts); // Works on Linux
+#elif defined(__MACH__)
+ clock_serv_t cclock;
+ mach_timespec_t mts;
+ host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+ clock_get_time(cclock, &mts);
+ mach_port_deallocate(mach_task_self(), cclock);
+ ts.tv_sec = mts.tv_sec;
+ ts.tv_nsec = mts.tv_nsec;
+ #error "Don't know how to get time in nanos on your platform"
+ return ((_int64) ts.tv_sec) * 1000000000 + (_int64) ts.tv_nsec;
+// Blocks until the calling thread holds the pthread mutex behind *lock.
+void AcquireUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock)
+{
+    pthread_mutex_lock(lock);
+}
+
+// Gives up the pthread mutex behind *lock.
+void ReleaseUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock)
+{
+    pthread_mutex_unlock(lock);
+}
+
+// Initializes *lock with default mutex attributes; true when pthread_mutex_init reports success.
+bool InitializeUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock)
+{
+    const int status = pthread_mutex_init(lock, NULL);
+    return 0 == status;
+}
+
+// Destroys the mutex behind *lock; true when pthread_mutex_destroy reports success.
+bool DestroyUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock)
+{
+    const int status = pthread_mutex_destroy(lock);
+    return 0 == status;
+}
+// Waitable flag built from a pthread mutex and condition variable.
+// signal() sets the flag and wakes a waiter; wait() and waitWithTimeout() block
+// until the flag is set. The flag stays set until init() is run again.
+class SingleWaiterObjectImpl {
+protected:
+    // Protected rather than private: EventObjectImpl derives from this class and
+    // manipulates these members directly.
+    pthread_mutex_t lock;   // guards 'set'; held around every wait on 'cond'
+    pthread_cond_t cond;    // signalled when 'set' becomes true
+    bool set;               // true once signalled
+
+public:
+    // Initializes the mutex and condition variable and clears the flag.
+    // Returns false (leaving nothing initialized) if either pthread call fails.
+    bool init() {
+        if (pthread_mutex_init(&lock, NULL) != 0) {
+            return false;
+        }
+        if (pthread_cond_init(&cond, NULL) != 0) {
+            pthread_mutex_destroy(&lock);
+            return false;
+        }
+        set = false;
+        return true;
+    }
+
+    // Sets the flag and wakes one thread blocked in wait()/waitWithTimeout().
+    void signal() {
+        pthread_mutex_lock(&lock);
+        set = true;
+        pthread_cond_signal(&cond);
+        pthread_mutex_unlock(&lock);
+    }
+
+    // Blocks until the flag is set. The loop absorbs spurious wakeups.
+    void wait() {
+        pthread_mutex_lock(&lock);
+        while (!set) {
+            pthread_cond_wait(&cond, &lock);
+        }
+        pthread_mutex_unlock(&lock);
+    }
+
+    // Blocks until the flag is set or timeoutInMillis milliseconds have passed.
+    // Returns true if the flag was set, false if the wait timed out.
+    bool waitWithTimeout(_int64 timeoutInMillis) {
+        struct timespec wakeTime;
+        // Step 1: fetch "now" as an absolute wall-clock time. The Linux test must be
+        // spelled __linux__ (the macro compilers define); __LINUX__ is never defined,
+        // which left wakeTime uninitialized on Linux.
+#if defined(__linux__)
+        clock_gettime(CLOCK_REALTIME, &wakeTime);
+#elif defined(__MACH__)
+        clock_serv_t cclock;
+        mach_timespec_t mts;
+        host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+        clock_get_time(cclock, &mts);
+        mach_port_deallocate(mach_task_self(), cclock);
+        wakeTime.tv_sec = mts.tv_sec;
+        wakeTime.tv_nsec = mts.tv_nsec;
+#endif
+        // Step 2: add the timeout. Seconds and the sub-second remainder are added
+        // separately so tv_nsec cannot overflow even where 'long' is 32 bits.
+        wakeTime.tv_sec += timeoutInMillis / 1000;
+        wakeTime.tv_nsec += (timeoutInMillis % 1000) * 1000000;
+        wakeTime.tv_sec += wakeTime.tv_nsec / 1000000000;
+        wakeTime.tv_nsec = wakeTime.tv_nsec % 1000000000;
+
+        bool timedOut = false;
+        pthread_mutex_lock(&lock);
+        while (!set) {
+            int retVal = pthread_cond_timedwait(&cond, &lock, &wakeTime);
+            if (retVal == ETIMEDOUT) {
+                timedOut = true;
+                break;
+            }
+        }
+        pthread_mutex_unlock(&lock);
+        return !timedOut;
+    }
+
+    // Destroys the condition variable and mutex.
+    // Returns true only if both pthread destroy calls succeeded. The function is
+    // declared bool but previously fell off the end without returning a value.
+    bool destroy() {
+        const int condStatus = pthread_cond_destroy(&cond);
+        const int mutexStatus = pthread_mutex_destroy(&lock);
+        return condStatus == 0 && mutexStatus == 0;
+    }
+};
+// Allocates and initializes a waiter, storing it in *waiter on success.
+// Returns false, leaving *waiter untouched, when allocation or init() fails.
+bool CreateSingleWaiterObject(SingleWaiterObject *waiter)
+{
+    SingleWaiterObjectImpl *impl = new SingleWaiterObjectImpl;
+    if (NULL != impl && impl->init()) {
+        *waiter = impl;
+        return true;
+    }
+    delete impl;    // deleting NULL is a no-op, so this covers both failure cases
+    return false;
+}
+// Tears down the pthread objects inside the waiter, then frees the implementation object.
+void DestroySingleWaiterObject(SingleWaiterObject *waiter)
+ (*waiter)->destroy();
+ delete *waiter;
+// Marks the waiter as set and wakes a thread blocked on it.
+void SignalSingleWaiterObject(SingleWaiterObject *waiter)
+ (*waiter)->signal();
+// Blocks until the waiter has been signalled; always returns true.
+bool WaitForSingleWaiterObject(SingleWaiterObject *waiter)
+ (*waiter)->wait();
+ return true;
+// Re-arms the waiter by running init() again, which clears its 'set' flag.
+// NOTE(review): init() calls pthread_mutex_init/pthread_cond_init on objects that are
+// already initialized, which POSIX leaves undefined -- consider a dedicated reset path.
+void ResetSingleWaiterObject(SingleWaiterObject *waiter)
+ (*waiter)->init();
+// Multi-waiter variant of SingleWaiterObjectImpl: reuses its lock/cond/set members
+// but wakes every waiter at once and lets the flag be cleared again.
+class EventObjectImpl : public SingleWaiterObjectImpl
+ // Sets the flag and wakes all threads blocked on the condition variable.
+ void signalAll()
+ {
+ pthread_mutex_lock(&lock);
+ set = true;
+ pthread_cond_broadcast(&cond);
+ pthread_mutex_unlock(&lock);
+ }
+ // Clears the flag under the lock so later waiters block until the next signalAll().
+ void blockAll()
+ {
+ pthread_mutex_lock(&lock);
+ set = false;
+ pthread_mutex_unlock(&lock);
+ }
+// Allocates and initializes an event object and stores it in *newEvent.
+// The function returns void, so failure is reported by storing NULL in *newEvent;
+// previously *newEvent was left unassigned on failure, handing the caller an
+// indeterminate pointer.
+void CreateEventObject(EventObject *newEvent)
+{
+    *newEvent = NULL;
+    EventObjectImpl* obj = new EventObjectImpl();
+    if (obj == NULL) {
+        return;
+    }
+    if (!obj->init()) {
+        delete obj;
+        return;
+    }
+    *newEvent = obj;
+}
+// Destroys the pthread objects inside the event, then frees the implementation object.
+void DestroyEventObject(EventObject *eventObject)
+ (*eventObject)->destroy();
+ delete *eventObject;
+// Sets the event and wakes every thread currently waiting on it.
+void AllowEventWaitersToProceed(EventObject *eventObject)
+ (*eventObject)->signalAll();
+// Clears the event so that subsequent waiters block.
+void PreventEventWaitersFromProceeding(EventObject *eventObject)
+ (*eventObject)->blockAll();
+// Blocks until the event is set.
+void WaitForEvent(EventObject *eventObject)
+ (*eventObject)->wait();
+// Blocks until the event is set or timeoutInMillis elapses; true means the event was set.
+bool WaitForEventWithTimeout(EventObject *eventObject, _int64 timeoutInMillis)
+ return (*eventObject)->waitWithTimeout(timeoutInMillis);
+// Atomically adds one to *valueToDecrement (the parameter name is historical) and
+// returns the value after the increment.
+int InterlockedIncrementAndReturnNewValue(volatile int *valueToDecrement)
+{
+    return (int) __sync_add_and_fetch((volatile int*) valueToDecrement, 1);
+}
+
+// Atomically subtracts one from *valueToDecrement and returns the value after the decrement.
+int InterlockedDecrementAndReturnNewValue(volatile int *valueToDecrement)
+{
+    return __sync_sub_and_fetch(valueToDecrement, 1);
+}
+
+// Atomically adds amountToAdd to *valueToWhichToAdd and returns the resulting sum.
+_int64 InterlockedAdd64AndReturnNewValue(volatile _int64 *valueToWhichToAdd, _int64 amountToAdd)
+{
+    return __sync_add_and_fetch(valueToWhichToAdd, amountToAdd);
+}
+
+// Atomic 32-bit compare-and-swap: stores replacementValue only when the current value
+// equals desiredPreviousValue. Returns the value observed before the operation.
+_uint32 InterlockedCompareExchange32AndReturnOldValue(volatile _uint32 *valueToUpdate, _uint32 replacementValue, _uint32 desiredPreviousValue)
+{
+    const _uint32 observed = __sync_val_compare_and_swap(valueToUpdate, desiredPreviousValue, replacementValue);
+    return observed;
+}
+
+// 64-bit form of the compare-and-swap above.
+_uint64 InterlockedCompareExchange64AndReturnOldValue(volatile _uint64 *valueToUpdate, _uint64 replacementValue, _uint64 desiredPreviousValue)
+{
+    volatile _int64 *target = (volatile _int64 *) valueToUpdate;
+    return (_uint64) __sync_val_compare_and_swap(target, desiredPreviousValue, replacementValue);
+}
+
+// Pointer-sized form of the compare-and-swap above.
+void* InterlockedCompareExchangePointerAndReturnOldValue(void * volatile *valueToUpdate, void* replacementValue, void* desiredPreviousValue)
+{
+    void *observed = __sync_val_compare_and_swap(valueToUpdate, desiredPreviousValue, replacementValue);
+    return observed;
+}
+namespace {
+
+// POSIX thread functions need to return void*, so we wrap the ThreadMainFunction in our API
+struct ThreadInfo {
+    ThreadMainFunction function;    // caller's entry point
+    void *parameter;                // opaque argument passed to 'function'
+    ThreadInfo(ThreadMainFunction f, void *p): function(f), parameter(p) {}
+};
+
+// pthread entry point: runs the wrapped function, then frees the heap-allocated
+// ThreadInfo that StartNewThread handed over.
+void* runThread(void* infoVoidPtr) {
+    ThreadInfo *info = (ThreadInfo*) infoVoidPtr;
+    info->function(info->parameter);
+    delete info;
+    return NULL;
+}
+
+} // anonymous namespace
+
+// Starts a thread running threadMainFunction(threadMainFunctionParameter).
+// Returns true if the thread was created, false otherwise.
+bool StartNewThread(ThreadMainFunction threadMainFunction, void *threadMainFunctionParameter)
+{
+    ThreadInfo *info = new ThreadInfo(threadMainFunction, threadMainFunctionParameter);
+    pthread_t thread;
+    if (pthread_create(&thread, NULL, runThread, info) != 0) {
+        // The thread never started, so runThread will not free 'info'; free it here.
+        delete info;
+        return false;
+    }
+    // The pthread_t is discarded, so nobody can join this thread. Detach it so its
+    // resources are reclaimed when it exits.
+    pthread_detach(thread);
+    return true;
+}
+void BindThreadToProcessor(unsigned processorNumber)
+#ifdef __linux__
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(processorNumber, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) != 0) {
+ perror("sched_setaffinity");
+ }
+unsigned GetNumberOfProcessors()
+ return (unsigned) sysconf(_SC_NPROCESSORS_ONLN);
+// Suspends the calling thread for about 'millis' milliseconds. Like usleep(), the
+// sleep may end early if a signal is delivered.
+// nanosleep() is used because usleep(millis*1000) overflows the unsigned
+// multiplication for long sleeps, and POSIX does not require usleep() to accept
+// arguments of 1,000,000 or more.
+void SleepForMillis(unsigned millis)
+{
+    struct timespec duration;
+    duration.tv_sec = millis / 1000;
+    duration.tv_nsec = (long)(millis % 1000) * 1000000L;
+    nanosleep(&duration, NULL);
+}
+// Returns the size in bytes of the named file, or -1 if it cannot be opened or stat'ed.
+// The asserts still fire in debug builds. The explicit checks cover release builds,
+// where _ASSERT does nothing and the function used to return an uninitialized st_size.
+_int64 QueryFileSize(const char *fileName)
+{
+    int fd = open(fileName, O_RDONLY);
+    _ASSERT(fd != -1);
+    if (fd == -1) {
+        return -1;
+    }
+    struct stat sb;
+    int r = fstat(fd, &sb);
+    _ASSERT(r != -1);
+    _int64 fileSize = (r == -1) ? -1 : (_int64) sb.st_size;
+    close(fd);
+    return fileSize;
+}
+ bool
+ const char* filename)
+ return unlink(filename) == 0;
+ bool
+ const char* from,
+ const char* to)
+ return rename(from, to) == 0;
+class LargeFileHandle
+ FILE* file;
+ LargeFileHandle*
+ const char* filename,
+ const char* mode)
+ _ASSERT(strlen(mode) == 1 && (*mode == 'r' || *mode == 'w' || *mode == 'a'));
+ char fmode[3];
+ fmode[0] = mode[0]; fmode[1] = 'b'; fmode[2] = '\0';
+ FILE* file = fopen(filename, fmode);
+ if (file == NULL) {
+ return NULL;
+ }
+ LargeFileHandle* result = new LargeFileHandle();
+ result->file = file;
+ return result;
+ size_t
+ LargeFileHandle* file,
+ void* buffer,
+ size_t bytes)
+ return fwrite(buffer, 1, bytes, file->file);
+ size_t
+ LargeFileHandle* file,
+ void* buffer,
+ size_t bytes)
+ return fread(buffer, 1, bytes, file->file);
+ void
+ LargeFileHandle* file)
+ if (0 == fclose(file->file)) {
+ delete file;
+ }
+class MemoryMappedFile
+ int fd;
+ void* map;
+ size_t length;
+ MemoryMappedFile*
+ const char* filename,
+ size_t offset,
+ size_t length,
+ void** o_contents,
+ bool write,
+ bool sequential)
+ int fd = open(filename, write ? O_CREAT | O_RDWR : O_RDONLY, S_IRUSR | S_IWUSR);
+ if (fd < 0) {
+ warn("OpenMemoryMappedFile %s failed", filename);
+ return NULL;
+ }
+ // todo: large page support
+ size_t page = getpagesize();
+ size_t extra = offset % page;
+ void* map = mmap(NULL, length + extra, (write ? PROT_WRITE : 0) | PROT_READ, MAP_PRIVATE, fd, offset - extra);
+ if (map == NULL || map == MAP_FAILED) {
+ warn("OpenMemoryMappedFile %s mmap failed", filename);
+ close(fd);
+ return NULL;
+ }
+ int e = madvise(map, length + extra, sequential ? MADV_SEQUENTIAL : MADV_RANDOM);
+ if (e < 0) {
+ warn("OpenMemoryMappedFile %s madvise failed", filename);
+ }
+ MemoryMappedFile* result = new MemoryMappedFile();
+ result->fd = fd;
+ result->map = map;
+ result->length = length + extra;
+ *o_contents = (char*)map + extra;
+ return result;
+ void
+ MemoryMappedFile* mappedFile)
+ int e = munmap(mappedFile->map, mappedFile->length);
+ int e2 = close(mappedFile->fd);
+ if (e != 0 || e2 != 0) {
+ WriteErrorMessage("CloseMemoryMapped file failed\n");
+ }
+// Hints to the kernel that the whole mapping will be read sequentially and soon.
+// Both hints are optional: a failure is reported and otherwise ignored.
+void AdviseMemoryMappedFilePrefetch(const MemoryMappedFile *mappedFile)
+{
+    const int sequentialStatus = madvise(mappedFile->map, mappedFile->length, MADV_SEQUENTIAL);
+    if (0 != sequentialStatus) {
+        WriteErrorMessage("madvise MADV_SEQUENTIAL failed (since it's only an optimization, this is OK).  Errno %d\n", errno);
+    }
+
+    const int willNeedStatus = madvise(mappedFile->map, mappedFile->length, MADV_WILLNEED);
+    if (0 != willNeedStatus) {
+        WriteErrorMessage("madvise MADV_WILLNEED failed (since it's only an optimization, this is OK).  Errno %d\n", errno);
+    }
+}
+#ifdef __linux__
+class PosixAsyncFile : public AsyncFile
+ static PosixAsyncFile* open(const char* filename, bool write);
+ PosixAsyncFile(int i_fd);
+ virtual bool close();
+ class Writer : public AsyncFile::Writer
+ {
+ public:
+ Writer(PosixAsyncFile* i_file);
+ virtual bool close();
+ virtual bool beginWrite(void* buffer, size_t length, size_t offset, size_t *bytesWritten);
+ virtual bool waitForCompletion();
+ private:
+ PosixAsyncFile* file;
+ bool writing;
+ SingleWaiterObject ready;
+ struct aiocb aiocb;
+ size_t* result;
+ };
+ virtual AsyncFile::Writer* getWriter();
+ class Reader : public AsyncFile::Reader
+ {
+ public:
+ Reader(PosixAsyncFile* i_file);
+ virtual bool close();
+ virtual bool beginRead(void* buffer, size_t length, size_t offset, size_t *bytesRead);
+ virtual bool waitForCompletion();
+ private:
+ PosixAsyncFile* file;
+ bool reading;
+ SingleWaiterObject ready;
+ struct aiocb aiocb;
+ size_t* result;
+ };
+ virtual AsyncFile::Reader* getReader();
+ int fd;
+ PosixAsyncFile*
+ const char* filename,
+ bool write)
+ int fd = ::open(filename, write ? O_CREAT | O_RDWR | O_TRUNC : O_RDONLY, write ? S_IRWXU | S_IRGRP : 0);
+ if (fd < 0) {
+ WriteErrorMessage("Unable to create SAM file '%s', %d\n",filename,errno);
+ return NULL;
+ }
+ return new PosixAsyncFile(fd);
+ int i_fd)
+ : fd(i_fd)
+ bool
+ return ::close(fd) == 0;
+ AsyncFile::Writer*
+ return new Writer(this);
+PosixAsyncFile::Writer::Writer(PosixAsyncFile* i_file)
+ : file(i_file), writing(false)
+ memset(&aiocb, 0, sizeof(aiocb));
+ if (! CreateSingleWaiterObject(&ready)) {
+ WriteErrorMessage("PosixAsyncFile: cannot create waiter\n");
+ soft_exit(1);
+ }
+ bool
+ waitForCompletion();
+ DestroySingleWaiterObject(&ready);
+ return true;
+ void
+ union sigval val)
+ SignalSingleWaiterObject((SingleWaiterObject*) val.sival_ptr);
+ void
+ struct aiocb* control,
+ SingleWaiterObject* ready,
+ int fd,
+ void* buffer,
+ size_t length,
+ size_t offset)
+ control->aio_fildes = fd;
+ control->aio_buf = buffer;
+ control->aio_nbytes = length;
+ control->aio_offset = offset;
+ control->aio_sigevent.sigev_notify = SIGEV_THREAD;
+ control->aio_sigevent.sigev_value.sival_ptr = ready;
+ control->aio_sigevent.sigev_notify_function = sigev_ready;
+ bool
+ void* buffer,
+ size_t length,
+ size_t offset,
+ size_t *bytesWritten)
+ if (! waitForCompletion()) {
+ return false;
+ }
+ aio_setup(&aiocb, &ready, file->fd, buffer, length, offset);
+ result = bytesWritten;
+ if (aio_write(&aiocb) < 0) {
+ warn("PosixAsyncFile aio_write failed");
+ return false;
+ }
+ writing = true;
+ return true;
+ bool
+ if (writing) {
+ WaitForSingleWaiterObject(&ready);
+ ResetSingleWaiterObject(&ready);
+ writing = false;
+ ssize_t ret = aio_return(&aiocb);
+ if (ret < 0 && errno != 0) {
+ warn("PosixAsyncFile Writer aio_return failed");
+ return false;
+ }
+ if (result != NULL) {
+ *result = max((ssize_t)0, ret);
+ }
+ }
+ return true;
+ AsyncFile::Reader*
+ return new Reader(this);
+ PosixAsyncFile* i_file)
+ : file(i_file), reading(false)
+ memset(&aiocb, 0, sizeof(aiocb));
+ if (! CreateSingleWaiterObject(&ready)) {
+ WriteErrorMessage("PosixAsyncFile cannot create waiter\n");
+ soft_exit(1);
+ }
+ bool
+ DestroySingleWaiterObject(&ready);
+ return true;
+ bool
+ void* buffer,
+ size_t length,
+ size_t offset,
+ size_t* bytesRead)
+ if (! waitForCompletion()) {
+ return false;
+ }
+ aio_setup(&aiocb, &ready, file->fd, buffer, length, offset);
+ result = bytesRead;
+ if (aio_read(&aiocb) < 0) {
+ warn("PosixAsyncFile Reader aio_read failed");
+ return false;
+ }
+ reading = true;
+ return true;
+ bool
+ if (reading) {
+ WaitForSingleWaiterObject(&ready);
+ ResetSingleWaiterObject(&ready);
+ reading = false;
+ ssize_t ret = aio_return(&aiocb);
+ if (ret < 0 && errno != 0) {
+ warn("PosixAsyncFile Reader aio_return");
+ return false;
+ }
+ if (result != NULL) {
+ *result = max((ssize_t)0, ret);
+ }
+ }
+ return true;
+// todo: make this actually async!
+class OsxAsyncFile : public AsyncFile
+ static OsxAsyncFile* open(const char* filename, bool write);
+ OsxAsyncFile(int i_fd);
+ virtual bool close();
+ class Writer : public AsyncFile::Writer
+ {
+ public:
+ Writer(OsxAsyncFile* i_file);
+ virtual bool close();
+ virtual bool beginWrite(void* buffer, size_t length, size_t offset, size_t *bytesWritten);
+ virtual bool waitForCompletion();
+ private:
+ OsxAsyncFile* file;
+ bool writing;
+ SingleWaiterObject ready;
+ struct aiocb aiocb;
+ size_t* result;
+ };
+ virtual AsyncFile::Writer* getWriter();
+ class Reader : public AsyncFile::Reader
+ {
+ public:
+ Reader(OsxAsyncFile* i_file);
+ virtual bool close();
+ virtual bool beginRead(void* buffer, size_t length, size_t offset, size_t *bytesRead);
+ virtual bool waitForCompletion();
+ private:
+ OsxAsyncFile* file;
+ bool reading;
+ size_t* result;
+ };
+ virtual AsyncFile::Reader* getReader();
+ int fd;
+ OsxAsyncFile*
+ const char* filename,
+ bool write)
+ int fd = ::open(filename, write ? O_CREAT | O_RDWR | O_TRUNC : O_RDONLY, write ? S_IRWXU | S_IRGRP : 0);
+ if (fd < 0) {
+ WriteErrorMessage("Unable to create SAM file '%s', %d\n",filename,errno);
+ return NULL;
+ }
+ return new OsxAsyncFile(fd);
+ int i_fd)
+ : fd(i_fd)
+ bool
+ return ::close(fd) == 0;
+ AsyncFile::Writer*
+ return new Writer(this);
+OsxAsyncFile::Writer::Writer(OsxAsyncFile* i_file)
+ : file(i_file), writing(false)
+ bool
+ return true;
+ bool
+ void* buffer,
+ size_t length,
+ size_t offset,
+ size_t *bytesWritten)
+ size_t m = ::lseek(file->fd, offset, SEEK_SET);
+ if (m == -1) {
+ return false;
+ }
+ size_t n = ::write(file->fd, buffer, length);
+ if (bytesWritten) {
+ *bytesWritten = n;
+ }
+ return n != -1;
+ bool
+ return true;
+ AsyncFile::Reader*
+ return new Reader(this);
+ OsxAsyncFile* i_file)
+ : file(i_file), reading(false)
+ bool
+ return true;
+ bool
+ void* buffer,
+ size_t length,
+ size_t offset,
+ size_t* bytesRead)
+ size_t m = ::lseek(file->fd, offset, SEEK_SET);
+ if (m == -1) {
+ return false;
+ }
+ size_t n = ::read(file->fd, buffer, length);
+ if (bytesRead) {
+ *bytesRead = n;
+ }
+ return n != -1;
+ bool
+ return true;
+// Seeks 'stream' to a 64-bit offset relative to 'origin' (SEEK_SET/SEEK_CUR/SEEK_END).
+// Returns 0 on success and -1 on failure, as fseeko does.
+int _fseek64bit(FILE *stream, _int64 offset, int origin)
+{
+#ifdef __APPLE__
+    // Apple's file pointers are already 64-bit so just use fseeko.
+    // Its result must be returned; the function used to fall through without one.
+    return fseeko(stream, offset, origin);
+#else
+    return fseeko64(stream, offset, origin);
+#endif
+}
+ fd = -1;
+ initialized = false;
+ mapCount = 0;
+ pagesize = getpagesize();
+// Opens i_fileName read-only and records its size so createMapping() can map it.
+// Calling init() again with the same name succeeds without doing anything;
+// calling it with a different name fails.
+// Returns true on success, false if the file cannot be opened or stat'ed.
+bool
+FileMapper::init(const char *i_fileName)
+{
+    if (initialized) {
+        if (strcmp(fileName, i_fileName)) {
+            WriteErrorMessage("FileMapper already initialized with %s, cannot init with %s\n", fileName, i_fileName);
+            return false;
+        }
+        return true;
+    }
+    fileName = i_fileName;
+    fd = open(fileName, O_RDONLY);
+    if (fd == -1) {
+        WriteErrorMessage("Failed to open %s\n", fileName);
+        return false;
+    }
+
+    struct stat sb;
+    int r = fstat(fd, &sb);
+    if (r == -1) {
+        WriteErrorMessage("Failed to stat %s\n", fileName);
+        // Close the descriptor so it does not leak. 'initialized' stays false, so
+        // nothing else would close it on this path.
+        close(fd);
+        fd = -1;
+        return false;
+    }
+    fileSize = sb.st_size;
+
+    initialized = true;
+    return true;
+}
+// Maps amountToMap bytes of the file starting at 'offset' (read-only, shared).
+// mmap needs a page-aligned file offset, so the mapping starts at the enclosing
+// page boundary and the returned pointer is advanced to the requested byte.
+//   o_token  receives an UnmapToken that must later be passed to unmap().
+// Returns a pointer to the byte at 'offset', or NULL if mmap fails.
+char *
+FileMapper::createMapping(size_t offset, size_t amountToMap, void** o_token)
+{
+    size_t beginRounding = offset % pagesize;
+    size_t mappedLength = amountToMap + beginRounding;  // bytes actually mapped
+
+    char* mappedBase = (char *) mmap(NULL, mappedLength, PROT_READ, MAP_SHARED, fd, offset - beginRounding);
+    if (mappedBase == MAP_FAILED) {
+        WriteErrorMessage("mmap failed.\n");
+        return NULL;
+    }
+
+    // madvise advice values are an enumeration, not bit flags, so OR-ing them
+    // together selects some single other value. Issue one call per hint.
+    size_t adviseLength = min((size_t) madviseSize, mappedLength);
+    int r = madvise(mappedBase, adviseLength, MADV_SEQUENTIAL);
+    _ASSERT(r == 0);
+    r = madvise(mappedBase, adviseLength, MADV_WILLNEED);
+    _ASSERT(r == 0);
+    lastPosMadvised = 0;
+
+    InterlockedIncrementAndReturnNewValue(&mapCount);
+    // Record the full mapped length, including the alignment padding, so that
+    // unmap() releases the whole mapping and not just amountToMap bytes of it.
+    *o_token = new UnmapToken(mappedBase, mappedLength);
+    return mappedBase + beginRounding;
+}
+// Releases a mapping handed out by createMapping(). i_token is the UnmapToken that
+// createMapping() stored through its o_token argument; it is freed here.
+FileMapper::unmap(void* i_token)
+ _ASSERT(mapCount > 0);
+ if (mapCount > 0) {
+ // One fewer outstanding mapping; the count must never go negative.
+ int n = InterlockedDecrementAndReturnNewValue(&mapCount);
+ _ASSERT(n >= 0);
+ // token->first is the mapping base address and token->second the length to unmap.
+ UnmapToken* token = (UnmapToken*) i_token;
+ munmap(token->first, token->second);
+ delete token;
+ }
+ _ASSERT(mapCount == 0);
+ close(fd);
+#if 0
+FileMapper::prefetch(size_t currentRead)
+ if (currentRead > lastPosMadvised + madviseSize / 2) {
+ _uint64 offset = lastPosMadvised + madviseSize;
+ _uint64 len = (offset > amountMapped ? 0 : min(amountMapped - offset, (_uint64) madviseSize));
+ if (len > 0) {
+ // Start reading new range
+ int r = madvise(mappedBase + offset, len, MADV_WILLNEED);
+ _ASSERT(r == 0);
+ }
+ if (lastPosMadvised > 0) {
+ // Unload the range we had before our current one
+ int r = madvise(mappedBase + lastPosMadvised - madviseSize, madviseSize, MADV_DONTNEED);
+ _ASSERT(r == 0);
+ }
+ lastPosMadvised = offset;
+ }
+void PreventMachineHibernationWhileThisThreadIsAlive()
+ // Only implemented for Windows
+void SetToLowSchedulingPriority()
+ // Only implemented for Windows (the Linux version is per-thread, and I'm too lazy to do it now).
+ WriteErrorMessage("The Linux code for running at low priority is not implemented, so SNAP will run at normal priority\n");
+// Linux named pipes are unidirectional, so we need two of them.
+struct NamedPipe {
+ bool serverSide;
+ char * pipeName;
+ FILE * input;
+ FILE * output;
+// Creates the FIFO at fullyQualifiedPipeName with owner read/write permission.
+// A FIFO that already exists counts as success. Any other failure is reported
+// through WriteErrorMessage and returns false.
+bool createPipe(const char *fullyQualifiedPipeName)
+{
+    if (0 == mkfifo(fullyQualifiedPipeName, S_IRUSR | S_IWUSR)) {
+        return true;
+    }
+
+    const int failure = errno;  // capture before any further library call can change it
+    switch (failure) {
+    case EEXIST:
+        // Someone already created it; that is fine.
+        return true;
+
+    case ENOENT:
+        WriteErrorMessage("OpenNamedPipe: unable to create named pipe at path '%s' because a directory in the path doesn't exist.  Please create it or use a different pipe name\n", fullyQualifiedPipeName);
+        break;
+
+    case EACCES:
+        WriteErrorMessage("OpenNamedPipe: unable to create named pipe at path '%s' because you do not have sufficient permissions.\n", fullyQualifiedPipeName);
+        break;
+
+    case ENOTDIR:
+        WriteErrorMessage("OpenNamedPipe: a component of your pipe path isn't a directory.  '%s'\n", fullyQualifiedPipeName);
+        break;
+
+    default:
+        WriteErrorMessage("OpenNamedPipe: unexpectedly failed to create named pipe '%s', errno %d\n", fullyQualifiedPipeName, failure);
+        break;
+    }
+    return false;
+}
+// Opens one end of an existing FIFO as a stdio stream: for reading when forInput is
+// true, for writing otherwise. Reports the failure and returns NULL if fopen fails.
+FILE *connectPipe(char *fullyQualifiedPipeName, bool forInput)
+ FILE *pipeFile = fopen(fullyQualifiedPipeName, forInput ? "r" : "w");
+ if (NULL == pipeFile) {
+ WriteErrorMessage("OpenNamedPipe: unable to open pipe file '%s', errno %d\n", fullyQualifiedPipeName, errno);
+ }
+ return pipeFile;
+// Builds the filesystem path of one of the two FIFOs that make up a named pipe.
+// Relative names are placed under /tmp/; absolute names are used as given.
+// The suffix names the direction: the server's input and the client's output are
+// both the "-toServer" FIFO, and the other two ends are "-toClient".
+// The result is allocated with new[] and must be freed by the caller with delete [].
+char *createFullyQualifiedPipeName(const char *pipeName, bool serverSide, bool forInput)
+{
+    const char *toServer = "-toServer";
+    const char *toClient = "-toClient";
+
+    const char *pipeDirectory = ('/' == pipeName[0]) ? "" : "/tmp/";
+    const char *directionSuffix = (serverSide == forInput) ? toServer : toClient;
+
+    const size_t bufferSize = strlen(pipeDirectory) + strlen(pipeName) + __max(strlen(toServer), strlen(toClient)) + 1;    // +1 for trailing null
+    char *fullyQualifiedPipeName = new char[bufferSize];
+    sprintf(fullyQualifiedPipeName, "%s%s%s", pipeDirectory, pipeName, directionSuffix);
+    return fullyQualifiedPipeName;
+}
+// Closes any streams a NamedPipe still holds and frees the pipe and its name buffer.
+static void destroyNamedPipe(NamedPipe *pipe)
+{
+    if (pipe->input != NULL) {
+        fclose(pipe->input);
+    }
+    if (pipe->output != NULL) {
+        fclose(pipe->output);
+    }
+    delete [] pipe->pipeName;
+    delete pipe;
+}
+
+// (Re)opens both FIFOs of 'pipe', closing any streams it already holds.
+// Returns true when both directions are open (and, on the server, the unlock
+// request succeeded). On failure the pipe object itself is left alive and owned by
+// the caller. This function used to delete it on one error path, after which
+// OpenNamedPipe deleted it a second time.
+bool connectNamedPipes(NamedPipe *pipe)
+{
+    char *inputPipeName = createFullyQualifiedPipeName(pipe->pipeName, pipe->serverSide, true);
+    char *outputPipeName = createFullyQualifiedPipeName(pipe->pipeName, pipe->serverSide, false);
+
+    if (pipe->input != NULL) {
+        fclose(pipe->input);
+        pipe->input = NULL;
+    }
+
+    if (pipe->output != NULL) {
+        fclose(pipe->output);
+        pipe->output = NULL;
+    }
+
+    bool connected;
+
+    //
+    // Connecting pipes is synchronous, so the server and client need to connect
+    // in opposite order.
+    //
+    if (pipe->serverSide) {
+        signal(SIGPIPE, SIG_IGN);   // If the client hits ^C, we'll get this.  Ignore it, let the fwrite fail, and continue
+        pipe->input = connectPipe(inputPipeName, true);
+        pipe->output = connectPipe(outputPipeName, false);
+        connected = pipe->input != NULL && pipe->output != NULL;
+
+        if (connected) {    // fileno(NULL) would crash, so only touch the lock once both ends are open
+            //
+            // Release any exclusive lock that may have been left by a now-dead client.
+            // NOTE(review): the original comment says this targets the toServer pipe, but
+            // on the server pipe->output is the toClient FIFO -- confirm which is intended.
+            //
+            struct flock lock;
+            lock.l_type = F_UNLCK;
+            lock.l_whence = SEEK_SET;
+            lock.l_start = 0;
+            lock.l_len = 1;
+            lock.l_pid = 0;
+            if (fcntl(fileno(pipe->output), F_SETLKW, &lock) < 0) {
+                fprintf(stderr,"Unable to clear named pipe lock, errno %d\n", errno);
+                connected = false;
+            }
+        }
+    } else {
+        pipe->output = connectPipe(outputPipeName, false);
+        pipe->input = connectPipe(inputPipeName, true);
+        connected = pipe->input != NULL && pipe->output != NULL;
+    }
+
+    delete [] inputPipeName;
+    delete [] outputPipeName;
+
+    return connected;
+}
+
+// Opens (and, on the server side, first creates) the pair of FIFOs named after
+// pipeName and connects to them. A client also takes an exclusive lock on its
+// output FIFO so that only one client sends at a time.
+// Returns the new pipe, or NULL on failure after releasing everything acquired so far.
+NamedPipe *OpenNamedPipe(const char *pipeName, bool serverSide)
+{
+    NamedPipe *pipe = new NamedPipe;
+    pipe->pipeName = new char[strlen(pipeName) + 1];
+    strcpy(pipe->pipeName, pipeName);
+    pipe->input = pipe->output = NULL;
+    pipe->serverSide = serverSide;
+
+    if (serverSide) {
+        char *inputPipeName = createFullyQualifiedPipeName(pipeName, serverSide, true);
+        char *outputPipeName = createFullyQualifiedPipeName(pipeName, serverSide, false);
+        bool created = createPipe(inputPipeName) && createPipe(outputPipeName);
+        delete [] inputPipeName;
+        delete [] outputPipeName;
+        if (!created) {
+            destroyNamedPipe(pipe);
+            return NULL;
+        }
+    }
+
+    if (!connectNamedPipes(pipe)) {
+        destroyNamedPipe(pipe);
+        return NULL;
+    }
+
+    if (!serverSide) {
+        //
+        // Take an exclusive lock on the toServer pipe so that only one client is sending at a time.
+        //
+        struct flock lock;
+        lock.l_type = F_WRLCK;
+        lock.l_whence = SEEK_SET;
+        lock.l_start = 0;
+        lock.l_len = 1;
+        lock.l_pid = 0;
+        if (fcntl(fileno(pipe->output), F_SETLKW, &lock) < 0) {
+            fprintf(stderr,"OpenNamedPipe: F_SETLKW failed, errno %d\n", errno);
+            destroyNamedPipe(pipe);
+            return NULL;
+        }
+    }
+
+    return pipe;
+}
+// Our Linux version of named pipe IO sends strings with 4 byte byte counts first.
+// Reads one length-prefixed message from pipe->input into outputBuffer and
+// null-terminates it. outputBufferSize must exceed the message length by at least
+// one byte to leave room for the terminator.
+// On the server side a failed read is treated as a client disconnect: the pipes are
+// reconnected and the read restarts from the length prefix. On the client side a
+// failed read returns false.
+// Returns true once a complete message has been read.
+bool ReadFromNamedPipe(NamedPipe *pipe, char *outputBuffer, size_t outputBufferSize)
+ unsigned int size;
+ for (;;) {
+ // Step 1: the byte count that precedes every message.
+ if (1 != fread(&size, sizeof(size), 1, pipe->input)) {
+ if (!pipe->serverSide) {
+ return false;
+ }
+ if (!connectNamedPipes(pipe)) {
+ return false;
+ }
+ continue;
+ }
+ // Reject messages that would not fit together with the trailing null.
+ // NOTE(review): %d/%lld are paired with unsigned int/size_t arguments here -- confirm the format specifiers.
+ if (size >= outputBufferSize) {
+ WriteErrorMessage("Trying to read too big a chunk from named pipe, %d >= %lld\n", size, outputBufferSize);
+ return false;
+ }
+ // Step 2: the message body, read as a single item of 'size' bytes.
+ if (1 != fread(outputBuffer, size, 1, pipe->input)) {
+ if (!pipe->serverSide) {
+ return false;
+ }
+ if (!connectNamedPipes(pipe)) {
+ return false;
+ }
+ continue;
+ }
+ break;
+ }
+ outputBuffer[size] = '\0';
+ return true;
+// Sends stringToWrite over pipe->output as an unsigned-int byte count followed by
+// the characters (no terminating null), then flushes the stream.
+// Returns false as soon as either fwrite does not report one complete item.
+bool WriteToNamedPipe(NamedPipe *pipe, const char *stringToWrite)
+{
+    const unsigned int length = (unsigned int)strlen(stringToWrite);
+
+    const bool headerSent = (1 == fwrite(&length, sizeof(length), 1, pipe->output));
+    if (!headerSent) {
+        return false;
+    }
+
+    const bool bodySent = (1 == fwrite(stringToWrite, length, 1, pipe->output));
+    if (!bodySent) {
+        return false;
+    }
+
+    fflush(pipe->output);
+    return true;
+}
+// Closes both directions of the pipe and frees the NamedPipe along with its name buffer.
+void CloseNamedPipe(NamedPipe *pipe)
+{
+    // Either stream may be NULL after a failed reconnect; fclose(NULL) is undefined.
+    if (pipe->input != NULL) {
+        fclose(pipe->input);
+    }
+    if (pipe->output != NULL) {
+        fclose(pipe->output);
+    }
+    // pipeName was allocated with new[] when the pipe was opened and was previously leaked here.
+    delete [] pipe->pipeName;
+    delete pipe;
+}
+#endif // _MSC_VER
+// Factory for the platform's AsyncFile implementation. Writing to the file name "-"
+// selects StdoutAsyncFile; every other request goes to the implementation chosen
+// by the preprocessor tests below.
+AsyncFile* AsyncFile::open(const char* filename, bool write)
+ if (!strcmp("-", filename) && write) {
+ return StdoutAsyncFile::open("-", true);
+ }
+#ifdef _MSC_VER
+ return WindowsAsyncFile::open(filename, write);
+#ifdef __linux__
+ return PosixAsyncFile::open(filename, write);
+ // Reached when neither of the tests above selected an implementation.
+ return OsxAsyncFile::open(filename, write);
diff --git a/SNAPLib/Compat.h b/SNAPLib/Compat.h
new file mode 100644
index 0000000..97177bb
--- /dev/null
+++ b/SNAPLib/Compat.h
@@ -0,0 +1,516 @@
+Module Name:
+ compat.h
+ Functions that provide compatibility between the Windows and Linux versions,
+ and mostly that serve to keep #ifdef's out of the main code in order to
+ improve readability.
+ Bill Bolosky, November, 2011
+ User mode service.
+Revision History:
+#pragma once
+#ifdef _MSC_VER
+#include <Windows.h>
+typedef unsigned _int64 _uint64;
+typedef unsigned _int32 _uint32;
+typedef unsigned char _uint8;
+typedef unsigned short _uint16;
+// <http://stackoverflow.com/questions/126279/c99-stdint-h-header-and-ms-visual-studio>
+const _uint64 UINT64_MAX = MAXUINT64;
+const _int64 INT64_MAX = MAXINT64;
+const _int64 INT64_MIN = MININT64;
+const _uint32 UINT32_MAX = MAXUINT32;
+const _int32 INT32_MIN = MININT32;
+const _int32 INT32_MAX = MAXINT32;
+const _uint16 UINT16_MAX = MAXUINT16;
+const _int16 INT16_MAX = MAXINT16;
+const _int16 INT16_MIN = MININT16;
+static const double LOG10 = log(10.0);
+inline double exp10(double x) { return exp(x * LOG10); }
+const void* memmem(const void* data, const size_t dataLength, const void* pattern, const size_t patternLength);
+typedef CRITICAL_SECTION UnderlyingExclusiveLock;
+typedef HANDLE SingleWaiterObject; // This is an event in Windows. It's just a synchronization object that you can wait for and set. "Single" means only one thread can wait on it at a time.
+typedef HANDLE EventObject;
+#define PATH_SEP '\\'
+#define snprintf _snprintf
+#define mkdir(path, mode) _mkdir(path)
+#define strdup(s) _strdup(s)
+// <http://stackoverflow.com/questions/9021502/whats-the-difference-between-strtok-r-and-strtok-s-in-c>
+#define strtok_r strtok_s
+#define strncasecmp _strnicmp
+#define atoll(S) _atoi64(S)
+#define bit_rotate_right(value, shift) _rotr(value, shift)
+#define bit_rotate_left(value, shift) _rotl(value, shift)
+#define bit_rotate_right64(value, shift) _rotr64(value, shift)
+#define bit_rotate_left64(value, shift) _rotl64(value, shift)
+int getpagesize();
+#else // _MSC_VER
+#include <pthread.h>
+// <http://stackoverflow.com/questions/986426/what-do-stdc-limit-macros-and-stdc-constant-macros-mean>
+#include <stdint.h>
+#include <assert.h>
+#include <float.h>
+#ifdef __linux__
+#include <sched.h> // For sched_setaffinity
+#ifndef __APPLE__
+#include <xmmintrin.h> // This is currently (in Dec 2013) broken on Mac OS X 10.9 (Apple clang-500.2.79)
+#define _mm_prefetch(...) {}
+typedef int64_t _int64;
+typedef uint64_t _uint64;
+typedef int32_t _int32;
+typedef uint32_t _uint32;
+typedef uint16_t _uint16;
+typedef int16_t _int16;
+typedef uint8_t BYTE;
+typedef uint8_t _uint8;
+typedef int8_t _int8;
+typedef void *PVOID;
+// TODO: check if Linux libs have exp10 function
+#include <math.h>
+static const double LOG10 = log(10.0);
+inline double exp10(double x) { return exp(x * LOG10); }
+#define __in /* nothing */
+#define PATH_SEP '/'
+#ifdef DEBUG
+#define _ASSERT assert
+#ifndef _DEBUG
+#define _DEBUG 1 // Compat with Windows version
+#endif // !_DEBUG
+#define _ASSERT(x) {}
+#define __min(x,y) ((x)<(y) ? (x) : (y))
+#define __max(x,y) ((x)>(y) ? (x) : (y))
+#ifdef max
+#undef max
+#ifdef min
+#undef min
+#define MAX_PATH 4096
+#define __cdecl __attribute__((__cdecl__))
+#define _stricmp strcasecmp
+// Non-MSVC stand-in for the Windows _BitScanForward64 intrinsic: stores the index of the lowest set bit of x
+// in *result and returns true, or returns false when x has no set bits.
+inline bool _BitScanForward64(unsigned long *result, _uint64 x) {
+ if (x == 0) {
+ // __builtin_ctzll(0) is undefined, so do not call it; *result is left untouched, just as the
+ // Windows intrinsic leaves its index output undefined when no bit is set.
+ return false;
+ }
+ *result = __builtin_ctzll(x);
+ return true;
+// We implement SingleWaiterObject using a mutex because POSIX unnamed semaphores don't work on OS X
+class SingleWaiterObjectImpl;
+typedef pthread_mutex_t UnderlyingExclusiveLock;
+typedef SingleWaiterObjectImpl *SingleWaiterObject; // "Single" means only one thread can wait on it at a time.
+class EventObjectImpl;
+typedef EventObjectImpl *EventObject;
+inline unsigned bit_rotate_right(unsigned value, unsigned shift) // Rotates value right by shift bits, treating it as 32 bits wide (shift is taken modulo 32)
+ if (shift%32 == 0) return value; // a rotation by a multiple of 32 is the identity; also keeps the left shift below from being a shift by 32
+ return value >> (shift%32) | (value << (32 - shift%32));
+inline unsigned bit_rotate_left(unsigned value, unsigned shift) // Rotates value left by shift bits, treating it as 32 bits wide (shift is taken modulo 32)
+ if (shift%32 == 0) return value; // a rotation by a multiple of 32 is the identity; also keeps the right shift below from being a shift by 32
+ return value << (shift %32) | (value >> (32 - shift%32));
+inline _uint64 bit_rotate_right64(_uint64 value, unsigned shift) // Rotates a 64-bit value right by shift bits (shift is taken modulo 64)
+ if (shift%64 == 0) return value; // a rotation by a multiple of 64 is the identity; also keeps the left shift below from being a shift by 64
+ return value >> (shift%64) | (value << (64 - shift%64));
+inline _uint64 bit_rotate_left64(_uint64 value, unsigned shift) // Rotates a 64-bit value left by shift bits (shift is taken modulo 64)
+ if (shift%64 == 0) return value; // a rotation by a multiple of 64 is the identity; also keeps the right shift below from being a shift by 64
+ return value << (shift %64) | (value >> (64 - shift%64));
+#endif // _MSC_VER
+struct NamedPipe; // It's bi-directional, which in Unix means it's actually two pipes
+extern NamedPipe *OpenNamedPipe(const char *pipeName, bool serverSide);
+extern bool ReadFromNamedPipe(NamedPipe *pipe, char *outputBuffer, size_t outputBufferSize);
+extern bool WriteToNamedPipe(NamedPipe *pipe, const char *stringToWrite); // Null-terminated string
+extern void CloseNamedPipe(NamedPipe *pipe);
+extern const char *DEFAULT_NAMED_PIPE_NAME;
+// Get the time since some predefined time. The predefined time must not change during any particular program run.
+_int64 timeInMillis();
+_int64 timeInNanos();
+//#define PROFILE_WAIT
+void PrintWaitProfile();
+// Exclusive locks. These have the obvious semantics: At most one thread can acquire one at any time, the others block
+// until the first one releases it. In the DEBUG build we wrap the lock in a class that ensures that it's initialized before
+// it's used (which we found out the hard way isn't always so obvious).
+extern void AcquireUnderlyingExclusiveLock(UnderlyingExclusiveLock *);
+bool InitializeUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock);
+void ReleaseUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock);
+bool DestroyUnderlyingExclusiveLock(UnderlyingExclusiveLock *lock);
+#ifdef _DEBUG
+class ExclusiveLock { // Debug-build wrapper around UnderlyingExclusiveLock that tracks whether the lock has been initialized
+ UnderlyingExclusiveLock lock; // the platform lock being wrapped
+ bool initialized; // set by InitializeExclusiveLock, cleared by DestroyExclusiveLock
+ bool wholeProgramScope; // when set, the destructor does not insist on a prior DestroyExclusiveLock
+#ifdef _MSC_VER
+ DWORD holderThreadId; // id of the thread currently holding the lock, 0 when nobody holds it
+#endif // _MSC_VER
+ ExclusiveLock() : initialized(false), holderThreadId(0), wholeProgramScope(false) {} // NOTE(review): holderThreadId is declared only under _MSC_VER above; confirm this initializer is guarded the same way in the upstream file
+ ~ExclusiveLock() {_ASSERT(!initialized || wholeProgramScope);} // Must DestroyExclusiveLock first
+inline void SetExclusiveLockWholeProgramScope(ExclusiveLock *lock) // Marks the lock as living for the whole program, which relaxes the destructor's assertion
+ lock->wholeProgramScope = true; // note that DestroyExclusiveLock asserts this flag is NOT set
+inline void AcquireExclusiveLock(ExclusiveLock *lock) // Debug-build acquire: asserts the lock was initialized, then takes the underlying lock
+ _ASSERT(lock->initialized);
+ AcquireUnderlyingExclusiveLock(&lock->lock);
+#ifdef _MSC_VER
+ // If you see this go off, you're probably trying a recursive lock acquisition (i.e., twice on the same thread),
+ // which is legal in Windows and a deadlock in Linux.
+ _ASSERT(lock->holderThreadId == 0);
+ lock->holderThreadId = GetCurrentThreadId(); // remember the owner so release and AssertExclusiveLockHeld can check it
+#endif // _MSC_VER
+inline void AssertExclusiveLockHeld(ExclusiveLock *lock) // Asserts that the calling thread is the recorded holder; only the _MSC_VER build performs a check
+#ifdef _MSC_VER
+ _ASSERT(GetCurrentThreadId() == lock->holderThreadId);
+#endif // _MSC_VER
+// Debug-build initializer: asserts the lock is not already initialized, initializes the underlying lock,
+// and returns whether that succeeded.
+inline bool InitializeExclusiveLock(ExclusiveLock *lock)
+ _ASSERT(!lock->initialized);
+ // Only mark the wrapper initialized when the underlying lock really was, so that later
+ // _ASSERT(lock->initialized) checks catch use of a lock whose initialization failed.
+ lock->initialized = InitializeUnderlyingExclusiveLock(&lock->lock);
+ return lock->initialized;
+inline void ReleaseExclusiveLock (ExclusiveLock *lock) // Debug-build release: asserts initialization (and ownership under _MSC_VER), then releases the underlying lock
+ _ASSERT(lock->initialized);
+#ifdef _MSC_VER
+ _ASSERT(GetCurrentThreadId() == lock->holderThreadId);
+ lock->holderThreadId = 0; // cleared before the underlying release, while this thread still holds the lock
+#endif // _MSC_VER
+ ReleaseUnderlyingExclusiveLock(&lock->lock);
+inline void DestroyExclusiveLock(ExclusiveLock *lock) // Debug-build destroy: asserts the lock is unheld, not whole-program scoped, and initialized, then destroys it
+#ifdef _MSC_VER
+ _ASSERT(lock->holderThreadId == 0);
+#endif // _MSC_VER
+ _ASSERT(!lock->wholeProgramScope);
+ _ASSERT(lock->initialized);
+ lock->initialized = false; // lets the destructor's assertion pass afterwards
+ DestroyUnderlyingExclusiveLock(&lock->lock); // the bool result is not propagated; this wrapper returns void
+#else // _DEBUG
+#define ExclusiveLock UnderlyingExclusiveLock
+#define InitializeExclusiveLock InitializeUnderlyingExclusiveLock
+#define ReleaseExclusiveLock ReleaseUnderlyingExclusiveLock
+#define DestroyExclusiveLock DestroyUnderlyingExclusiveLock
+#define AssertExclusiveLockHeld(l) /* nothing */
+#define SetExclusiveLockWholeProgramScope(l) /*nothing*/
+#endif // _DEBUG
+#define AcquireExclusiveLock(lock) AcquireExclusiveLockProfile((lock), __FUNCTION__, __LINE__)
+void AcquireExclusiveLockProfile(ExclusiveLock *lock, const char* fn, int line);
+#elif _DEBUG
+// already defined above
+#else // !debug, !profile_wait
+#define AcquireExclusiveLock AcquireUnderlyingExclusiveLock
+// Single waiter objects. The semantics are that a single thread can wait on one of these, and when it's
+// set by any thread, the waiter will proceed. It works regardless of the order of waiting and signalling.
+// Can be reset back to unsignalled state.
+bool CreateSingleWaiterObject(SingleWaiterObject *newWaiter);
+void DestroySingleWaiterObject(SingleWaiterObject *waiter);
+void SignalSingleWaiterObject(SingleWaiterObject *singleWaiterObject);
+#define WaitForSingleWaiterObject(o) WaitForSingleWaiterObjectProfile((o), __FUNCTION__, __LINE__)
+bool WaitForSingleWaiterObjectProfile(SingleWaiterObject *singleWaiterObject, const char* fn, int line);
+bool WaitForSingleWaiterObject(SingleWaiterObject *singleWaiterObject);
+void ResetSingleWaiterObject(SingleWaiterObject *singleWaiterObject);
+// An Event is a synchronization object that acts as a gateway: it can either be open
+// or closed. Open events allow all waiters to proceed, while closed ones block all
+// waiters. Events can be opened and closed multiple times, and can have any number of
+// waiters.
+void CreateEventObject(EventObject *newEvent);
+void DestroyEventObject(EventObject *eventObject);
+void AllowEventWaitersToProceed(EventObject *eventObject);
+void PreventEventWaitersFromProceeding(EventObject *eventObject);
+#define WaitForEvent(o) WaitForEventProfile((o), __FUNCTION__, __LINE__)
+void WaitForEventProfile(EventObject *eventObject, const char* fn, int line);
+void WaitForEvent(EventObject *eventObject);
+bool WaitForEventWithTimeout(EventObject *eventObject, _int64 timeoutInMillis); // Returns true if the event was set, false if the timeout happened
+// Thread-safe read-modify-write operations
+int InterlockedIncrementAndReturnNewValue(volatile int *valueToIncrement);
+int InterlockedDecrementAndReturnNewValue(volatile int *valueToDecrement);
+_int64 InterlockedAdd64AndReturnNewValue(volatile _int64 *valueToWhichToAdd, _int64 amountToAdd);
+_uint32 InterlockedCompareExchange32AndReturnOldValue(volatile _uint32 *valueToUpdate, _uint32 replacementValue, _uint32 desiredPreviousValue);
+_uint64 InterlockedCompareExchange64AndReturnOldValue(volatile _uint64 *valueToUpdate, _uint64 replacementValue, _uint64 desiredPreviousValue);
+void* InterlockedCompareExchangePointerAndReturnOldValue(void * volatile *valueToUpdate, void* replacementValue, void* desiredPreviousValue);
+// Functions for creating and binding threads.
+typedef void (*ThreadMainFunction) (void *threadMainFunctionParameter);
+bool StartNewThread(ThreadMainFunction threadMainFunction, void *threadMainFunctionParameter);
+void BindThreadToProcessor(unsigned processorNumber); // This hard binds a thread to a processor. You can no-op it at some perf hit.
+#ifdef _MSC_VER
+#define GetThreadId() GetCurrentThreadId()
+#else // _MSC_VER
+#define GetThreadId() pthread_self()
+#endif // _MSC_VER
+void SleepForMillis(unsigned millis);
+unsigned GetNumberOfProcessors();
+_int64 QueryFileSize(const char *fileName);
+// returns true on success
+bool DeleteSingleFile(const char* filename); // DeleteFile is a Windows macro...
+// returns true on success
+bool MoveSingleFile(const char* oldFileName, const char* newFileName);
+class LargeFileHandle;
+// open binary file, supports "r" for read, "w" for rewrite/create, "a" for append
+LargeFileHandle* OpenLargeFile(const char* filename, const char* mode);
+size_t WriteLargeFile(LargeFileHandle* file, void* buffer, size_t bytes);
+size_t ReadLargeFile(LargeFileHandle* file, void* buffer, size_t bytes);
+// closes and deallocates
+void CloseLargeFile(LargeFileHandle* file);
+// open and close memory mapped files
+// currently just readonly, could add flags for r/w if necessary
+class MemoryMappedFile;
+MemoryMappedFile* OpenMemoryMappedFile(const char* filename, size_t offset, size_t length, void** o_contents, bool write = false, bool sequential = false);
+// closes and deallocates the file structure
+void CloseMemoryMappedFile(MemoryMappedFile* mappedFile);
+void AdviseMemoryMappedFilePrefetch(const MemoryMappedFile *mappedFile);
+class AsyncFile // Abstract interface for asynchronous file IO; concrete instances come from the static open() factory
+ // open a new file for reading and/or writing
+ static AsyncFile* open(const char* filename, bool write);
+ // free resources; must have destroyed all readers & writers first
+ virtual bool close() = 0;
+ // abstract class for asynchronous writes
+ class Writer
+ {
+ public:
+ // waits for all writes to complete, frees resources
+ virtual bool close() = 0;
+ // begin a write; if there is already a write in progress, might wait for it to complete
+ virtual bool beginWrite(void* buffer, size_t length, size_t offset, size_t *bytesWritten) = 0;
+ // wait for all prior beginWrites to complete
+ virtual bool waitForCompletion() = 0;
+ };
+ // get a new writer, e.g. for another thread to use
+ virtual Writer* getWriter() = 0;
+ // abstract class for asynchronous reads
+ class Reader
+ {
+ public:
+ // waits for all reads to complete, frees resources
+ virtual bool close() = 0;
+ // begin a new read; if there is already a read in progress, might wait for it to complete
+ virtual bool beginRead(void* buffer, size_t length, size_t offset, size_t *bytesRead) = 0;
+ // wait for all prior beginReads to complete
+ virtual bool waitForCompletion() = 0;
+ };
+ // get a new reader, e.g. for another thread to use
+ virtual Reader* getReader() = 0;
+// Macros for counting leading/trailing zeros of a 64-bit value and for byte-swapping it. NOTE(review): _BitScanReverse64 stores the index of the highest set bit while __builtin_clzll returns the number of leading zero bits, so the two CountLeadingZeroes definitions appear to disagree; confirm against callers.
+#ifdef _MSC_VER
+#define CountLeadingZeroes(x, ans) {_BitScanReverse64(&ans, x);}
+#define CountTrailingZeroes(x, ans) {_BitScanForward64(&ans, x);}
+#define ByteSwapUI64(x) (_byteswap_uint64(x))
+#define CountLeadingZeroes(x, ans) {ans = __builtin_clzll(x);}
+#define CountTrailingZeroes(x, ans) {ans = __builtin_ctzll(x);}
+#define ByteSwapUI64(x) (__builtin_bswap64(x))
+// 64 bit version of fseek.
+int _fseek64bit(FILE *stream, _int64 offset, int origin);
+#ifndef _MSC_VER
+#define MININT32 ((int32_t) 0x80000000)
+#define MAXINT32 ((int32_t) 0x7fffffff)
+// Class for handling mapped files. It's got the same interface for both platforms, but different implementations.
+class FileMapper { // Hands out mappings onto a single file; callers unmap each mapping through the token they were given
+ FileMapper();
+ ~FileMapper();
+ // can only be called once - only usable for a single file
+ bool init(const char *fileName);
+ const size_t getFileSize() { // size recorded in fileSize; asserts that init() has run
+ _ASSERT(initialized);
+ return fileSize;
+ }
+ // can get multiple mappings on the same file
+ char *createMapping(size_t offset, size_t amountToMap, void** o_token);
+ // MUST call unmap on each token out of createMapping, the destructor WILL NOT cleanup
+ void unmap(void* token);
+ bool initialized;
+ const char* fileName;
+ size_t fileSize;
+ size_t pagesize;
+ int mapCount; // simple count of mappings that have not yet been unmapped
+#ifdef _MSC_VER
+ HANDLE hFile;
+ HANDLE hMapping;
+ _int64 millisSpentInReadFile;
+ _int64 countOfImmediateCompletions;
+ _int64 countOfDelayedCompletions;
+ _int64 countOfFailures;
+#else // _MSC_VER
+ static const int madviseSize = 4 * 1024 * 1024; // presumably the granularity used for madvise calls; confirm in the implementation
+ typedef std::pair<void*,size_t> UnmapToken; // presumably (address, length) of a mapping, handed out as the opaque token; confirm in the implementation
+ int fd;
+ _uint64 lastPosMadvised;
+#endif // _MSC_VER
+// Call to keep the OS from putting the machine asleep
+void PreventMachineHibernationWhileThisThreadIsAlive();
+// Reduce our scheduling priority to be nicer to other jobs.
+void SetToLowSchedulingPriority();
diff --git a/SNAPLib/DataReader.cpp b/SNAPLib/DataReader.cpp
new file mode 100644
index 0000000..be829d1
--- /dev/null
+++ b/SNAPLib/DataReader.cpp
@@ -0,0 +1,2535 @@
+Module Name:
+ DataReader.cpp
+ Concrete implementation classes for DataReader and DataSupplier.
+ These are completely opaque, and are only exposed through static supplier objects
+ defined in DataReader.h
+ User mode service.
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "RangeSplitter.h"
+#include "ParallelTask.h"
+#include "DataReader.h"
+#include "Bam.h"
+#include "zlib.h"
+#include "exit.h"
+#include "Error.h"
+using std::max;
+using std::min;
+using std::map;
+using std::string;
+// Read-Based
+// A data reader that uses a read-type call to get its data (as opposed to memory mapping).
+// This class contains the generic implementation, it must be subclassed to implement
+// startIo() and waitForBuffer(), which do the actual IO.
+class ReadBasedDataReader : public DataReader // Generic buffer-pool reader; subclasses supply init(), getFilename(), startIo() and waitForBuffer()
+ ReadBasedDataReader(unsigned i_nBuffers, _int64 i_overflowBytes, double extraFactor, size_t bufferSpace = 0);
+ virtual ~ReadBasedDataReader();
+ virtual bool init(const char* fileName) = 0;
+ char* readHeader(_int64* io_headerSize);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ virtual bool getData(char** o_buffer, _int64* o_validBytes, _int64* o_startBytes = NULL);
+ virtual void advance(_int64 bytes);
+ virtual void nextBatch();
+ virtual bool isEOF();
+ virtual DataBatch getBatch();
+ virtual void holdBatch(DataBatch batch);
+ virtual bool releaseBatch(DataBatch batch);
+ virtual _int64 getFileOffset();
+ virtual void getExtra(char** o_extra, _int64* o_length);
+ virtual const char* getFilename() = 0;
+ // must hold the lock to call
+ virtual void startIo() = 0;
+ // must hold the lock to call
+ virtual void waitForBuffer(unsigned bufferNumber) = 0;
+ // must hold the lock to call
+ virtual void addBuffer();
+ static const unsigned BUFFER_SIZE = 4 * 1024 * 1024 - 4096; // default per-buffer size, used when the constructor is given no bufferSpace
+ enum BufferState {Empty, Reading, Full, InUse};
+ struct BufferInfo
+ {
+ char *buffer;
+ BufferState state;
+ unsigned validBytes; // number of data bytes in buffer; getDataInternal reports validBytes - offset
+ unsigned nBytesThatMayBeginARead; // once offset reaches this, getDataInternal treats the buffer's section as finished
+ bool isEOF;
+ unsigned offset; // How far has the consumer gotten in current buffer
+ _int64 fileOffset;
+ _uint32 batchID;
+ int holds; // count of outstanding holdBatch() calls on this buffer
+ char* extra;
+ int next, previous; // index of next/previous in free/ready list, -1 if end
+ bool headerBuffer; // Set if this is a special buffer that holds the rewound header. These get read once and deallocated.
+ void operator=(BufferInfo &peer) { // memberwise copy; used when reinit() moves entries into its reallocated array
+ buffer = peer.buffer;
+ state = peer.state;
+ validBytes = peer.validBytes;
+ nBytesThatMayBeginARead = peer.nBytesThatMayBeginARead;
+ isEOF = peer.isEOF;
+ offset = peer.offset;
+ fileOffset = peer.fileOffset;
+ batchID = peer.batchID;
+ holds = peer.holds;
+ extra = peer.extra;
+ next = peer.next;
+ previous = peer.previous;
+ headerBuffer = peer.headerBuffer;
+ }
+ };
+ unsigned nBuffers; // number of buffers currently set up (the constructor commits memory for this many)
+ const unsigned maxBuffers; // number of BufferInfo slots the constructor allocates and reserves memory for
+ int headerBuffersOutstanding;
+ bool startedReadingHeader;
+ _int64 extraBytes; // size of each buffer's "extra" area
+ _int64 overflowBytes;
+ BufferInfo* bufferInfo;
+ _uint32 nextBatchID;
+ int nextBufferForReader; // list head (singly linked), -1 if empty
+ int nextBufferForConsumer; // list head (doubly linked), -1 if empty
+ int lastBufferForConsumer; // list tail, -1 if empty
+ EventObject releaseEvent;
+ _int64 releaseWaitInMillis; // how long nextBatch() waits on releaseEvent before calling addBuffer()
+ ExclusiveLock lock;
+ virtual bool getDataInternal(char** o_buffer, _int64* o_validBytes, _int64* o_startBytes = NULL);
+ //
+ // Stuff for handling the header read. We allow arbitrarily large header reads, and service them by copying data from the underlying
+ // data reader into a local buffer. We might wind up reading more than the actual header, so we serve reads out of the header buffer
+ // until it's used up.
+ //
+ char *headerBuffer;
+ _int64 headerBufferSize;
+ char *headerExtra; // Allocated in one go with the headerBuffer
+ _int64 headerExtraSize;
+ _int64 amountAdvancedThroughUnderlyingStoreByUs;
+ unsigned nHeaderBuffersAllocated;
+ bool hitEOFReadingHeader;
+ const size_t bufferSize;
+ unsigned i_nBuffers, // number of buffers to set up initially; must be > 0
+ _int64 i_overflowBytes, // bytes appended to each buffer's main region
+ double extraFactor, // size of each buffer's "extra" area as a multiple of (bufferSize + overflowBytes); must be >= 0
+ size_t i_bufferSpace) // total space budget; 0 selects BUFFER_SIZE per buffer
+ : DataReader(), nBuffers(i_nBuffers), overflowBytes(i_overflowBytes),
+ maxBuffers(i_nBuffers * (i_nBuffers == 1 ? 2 : 4)),
+ bufferSize(i_bufferSpace > 0 ? i_bufferSpace / (i_nBuffers * 2) : BUFFER_SIZE),
+ headerBuffer(NULL), headerBufferSize(0), amountAdvancedThroughUnderlyingStoreByUs(0),
+ headerExtra(NULL), headerExtraSize(0), startedReadingHeader(false), headerBuffersOutstanding(0), nHeaderBuffersAllocated(0),
+ hitEOFReadingHeader(false)
+ //
+ // Initialize the buffer info struct.
+ //
+ // allocate all the data in one big block
+ // NOTE: buffers are not null-terminated (since memmap version can't do it)
+ _ASSERT(extraFactor >= 0 && i_nBuffers > 0);
+ bufferInfo = new BufferInfo[maxBuffers];
+ extraBytes = max((_int64) 0, (_int64) ((bufferSize + overflowBytes) * extraFactor));
+ // Reserve address space for maxBuffers buffers, but commit memory only for the nBuffers in use at the start.
+ char* allocated = (char*) BigReserve(maxBuffers * (bufferSize + extraBytes + overflowBytes));
+ // Check the reservation before using it: a NULL base address must never be handed to BigCommit.
+ if (NULL == allocated) {
+ WriteErrorMessage("ReadBasedDataReader: unable to allocate IO buffer\n");
+ soft_exit(1);
+ }
+ BigCommit(allocated, nBuffers * (bufferSize + extraBytes + overflowBytes));
+ // Carve the block into per-buffer [data + overflow][extra] regions and chain the buffers into the reader's list.
+ for (unsigned i = 0 ; i < nBuffers; i++) {
+ bufferInfo[i].buffer = allocated;
+ allocated += bufferSize + overflowBytes;
+ bufferInfo[i].extra = extraBytes > 0 ? allocated : NULL;
+ allocated += extraBytes;
+ bufferInfo[i].state = Empty;
+ bufferInfo[i].isEOF = false;
+ bufferInfo[i].offset = 0;
+ bufferInfo[i].next = i < nBuffers - 1 ? i + 1 : -1;
+ bufferInfo[i].previous = i > 0 ? i - 1 : -1;
+ bufferInfo[i].holds = 0;
+ bufferInfo[i].headerBuffer = false;
+ }
+ nextBatchID = 1;
+ nextBufferForConsumer = -1;
+ lastBufferForConsumer = -1;
+ nextBufferForReader = 0;
+ CreateEventObject(&releaseEvent);
+ releaseWaitInMillis = 5; // wait up to 5 ms before allocating a new buffer
+ InitializeExclusiveLock(&lock);
+ // Destructor body. bufferInfo[0].buffer is the base of the single block the constructor carved all buffers from.
+ BigDealloc(bufferInfo[0].buffer);
+ for (unsigned i = 0; i < nBuffers; i++) {
+ bufferInfo[i].buffer = bufferInfo[i].extra = NULL;
+ }
+ // The bufferInfo array is allocated with new[] (in the constructor, and again in reinit) and owned by this object, so free it here.
+ delete[] bufferInfo;
+ bufferInfo = NULL;
+ if (NULL != headerBuffer) {
+ delete[] headerBuffer;
+ headerBuffer = NULL;
+ }
+ if (NULL != headerExtra) {
+ delete[] headerExtra;
+ headerExtra = NULL;
+ }
+ DestroyExclusiveLock(&lock);
+ DestroyEventObject(&releaseEvent);
+ char *
+ReadBasedDataReader::readHeader(_int64* io_headerSize) // Returns a buffer holding up to *io_headerSize bytes from the start of the data; on EOF, *io_headerSize is reduced to the bytes actually available
+ _ASSERT(!startedReadingHeader);
+ _int64 validBytesInHeader;
+ if (NULL != headerBuffer) {
+ if (*io_headerSize <= headerBufferSize) {
+ return headerBuffer; // an earlier call already read at least this much
+ }
+ //
+ // We need more data for the header. Reallocate the buffer, copy the data from the old buffer into the new, and then get more
+ // data from the underlying reader.
+ //
+ char *newHeaderBuffer = new char[*io_headerSize];
+ memcpy(newHeaderBuffer, headerBuffer, headerBufferSize);
+ delete[] headerBuffer;
+ headerBuffer = newHeaderBuffer;
+ validBytesInHeader = headerBufferSize;
+ headerBufferSize = *io_headerSize;
+ } else {
+ reinit(0, 0);
+ headerBufferSize = *io_headerSize;
+ headerBuffer = new char[headerBufferSize];
+ validBytesInHeader = 0;
+ }
+ //
+ // Run through the underlying data provider getting data until we've filled the header buffer or hit EOF.
+ //
+ _int64 bytesLeftToGet = headerBufferSize - validBytesInHeader;
+ _ASSERT(bytesLeftToGet);
+ while (bytesLeftToGet != 0) {
+ if (amountAdvancedThroughUnderlyingStoreByUs < validBytesInHeader) {
+ _int64 amountToAdvance = validBytesInHeader - amountAdvancedThroughUnderlyingStoreByUs - overflowBytes; // Leave overflowBytes unconsumed, so that some overlap remains for subsequent readers (see the adjustment further down)
+ if (amountToAdvance <= 0) {
+ //
+ // We're probably almost at EOF. Consume the overflow bytes, too.
+ //
+ amountToAdvance = validBytesInHeader - amountAdvancedThroughUnderlyingStoreByUs;
+ }
+ advance(amountToAdvance);
+ amountAdvancedThroughUnderlyingStoreByUs += amountToAdvance;
+ }
+ char *dataFromUnderlyingStore;
+ _int64 dataSizeFromUnderlyingStore;
+ if (!getDataInternal(&dataFromUnderlyingStore, &dataSizeFromUnderlyingStore)) {
+ nextBatch(); // current buffer is used up; move to the next one and try once more
+ if (!getDataInternal(&dataFromUnderlyingStore, &dataSizeFromUnderlyingStore)) {
+ //
+ // Hit EOF while reading header.
+ //
+ hitEOFReadingHeader = true;
+ headerBufferSize = *io_headerSize = validBytesInHeader;
+ return headerBuffer;
+ }
+ }
+ //
+ // Adjust for the fact that we don't advance as far as we've read, so that we leave some overlap for
+ // subsequent readers who want the data, not the header.
+ //
+ _ASSERT(amountAdvancedThroughUnderlyingStoreByUs <= validBytesInHeader); // We haven't advanced over something we need.
+ _int64 offsetIntoBuffer = validBytesInHeader - amountAdvancedThroughUnderlyingStoreByUs; // bytes at the front of the returned data that are already in headerBuffer
+ _ASSERT(dataSizeFromUnderlyingStore >= offsetIntoBuffer);
+ dataSizeFromUnderlyingStore -= offsetIntoBuffer;
+ _int64 bytesToCopy = __min(dataSizeFromUnderlyingStore, bytesLeftToGet);
+ memcpy(headerBuffer + validBytesInHeader, dataFromUnderlyingStore + offsetIntoBuffer, bytesToCopy);
+ bytesLeftToGet -= bytesToCopy;
+ validBytesInHeader += bytesToCopy;
+ }
+ return headerBuffer; // No need to reset *io_headerSize, we read as much as was requested
+// This gets called only for subclasses that can't implement their own. It's able to put the header reads
+// back on the queue, but can't seek anywhere else. (The signature line is missing from this copy; the error messages below identify this as ReadBasedDataReader::reinit.)
+ void
+ _int64 startingOffset, // number of bytes to consume (skip) from the front before returning
+ _int64 amountOfFileToProcess) // must be 0; any other value is a fatal error here
+ AcquireExclusiveLock(&lock);
+ if (0 != amountOfFileToProcess) {
+ WriteErrorMessage("ReadBasedDataReader:reinit called with non-zero amountOfFileToProcess (%lld, %lld)\n", startingOffset, amountOfFileToProcess);
+ soft_exit(1);
+ }
+ if (startedReadingHeader || 0 != headerBuffersOutstanding) {
+ WriteErrorMessage("ReadBasedDataReader:reinit called after reading some data (%lld, %lld)\n", startingOffset, amountOfFileToProcess);
+ soft_exit(1);
+ }
+ //
+ // We've already read a bunch of data from the underlying reader during header read. Create some new virtual buffers that point into the
+ // header buffer, and stick them at the head of the "already read" list.
+ //
+ _ASSERT(!startedReadingHeader && 0 == headerBuffersOutstanding);
+ if (0 != headerBufferSize) {
+ startedReadingHeader = true;
+ }
+ //
+ // First let any pending IO complete.
+ //
+ for (unsigned i = 0; i < nBuffers; i++) {
+ if (bufferInfo[i].state == Reading) {
+ waitForBuffer(i);
+ }
+ }
+ _ASSERT(amountAdvancedThroughUnderlyingStoreByUs <= headerBufferSize);
+ if (amountAdvancedThroughUnderlyingStoreByUs == 0) {
+ nHeaderBuffersAllocated = 0;
+ } else {
+ nHeaderBuffersAllocated = (int)((amountAdvancedThroughUnderlyingStoreByUs + bufferSize - 1) / (bufferSize - overflowBytes)); // Round up, in case we read the last buffer.
+ }
+ int totalBuffersNeeded = (int)(maxBuffers + nHeaderBuffersAllocated);
+ //
+ // Reallocate the buffers array.
+ //
+ BufferInfo *newBuffers = new BufferInfo[totalBuffersNeeded];
+ for (unsigned i = 0; i < maxBuffers; i++) {
+ newBuffers[i] = bufferInfo[i];
+ }
+ delete[] bufferInfo;
+ bufferInfo = newBuffers;
+ //
+ // Don't increase maxBuffers, so the buffer adder won't use the headerBuffers for anything.
+ //
+ //
+ // Now construct the header buffers.
+ //
+ headerExtraSize = extraBytes * nHeaderBuffersAllocated;
+ headerExtra = new char[headerExtraSize];
+ char *headerPointer = headerBuffer;
+ char *headerExtraPointer = headerExtra;
+ _int64 fileOffset = 0;
+ _int64 bytesRemaining = amountAdvancedThroughUnderlyingStoreByUs;
+ for (int i = maxBuffers; i < totalBuffersNeeded; i++) { // header buffers occupy the slots past maxBuffers
+ bufferInfo[i].state = Full;
+ bufferInfo[i].isEOF = false;
+ bufferInfo[i].offset = 0;
+ bufferInfo[i].next = (i == totalBuffersNeeded - 1) ? nextBufferForConsumer : i + 1; // the last header buffer links to the current head of the consumer list
+ bufferInfo[i].previous = (i == maxBuffers) ? -1 : i - 1;
+ bufferInfo[i].holds = 0;
+ bufferInfo[i].headerBuffer = true;
+ bufferInfo[i].validBytes = (int)__min(bytesRemaining, (_int64) bufferSize);
+ bufferInfo[i].nBytesThatMayBeginARead = (int)((bytesRemaining <= (_int64) bufferSize) ? bytesRemaining : __max(bufferInfo[i].validBytes - overflowBytes, 0));
+ bufferInfo[i].offset = 0;
+ bufferInfo[i].fileOffset = fileOffset;
+ bufferInfo[i].batchID = nextBatchID++;
+ bufferInfo[i].buffer = headerPointer;
+ headerPointer += bufferInfo[i].nBytesThatMayBeginARead; // NB: don't add overflowBytes; these buffers overlap
+ fileOffset += bufferInfo[i].nBytesThatMayBeginARead;
+ bufferInfo[i].extra = headerExtraPointer;
+ headerExtraPointer += extraBytes;
+ headerBuffersOutstanding++;
+ bytesRemaining -= bufferInfo[i].nBytesThatMayBeginARead;
+ }
+ if (nHeaderBuffersAllocated > 0) {
+ _ASSERT(bufferInfo[nextBufferForConsumer].previous == -1); // NOTE(review): indexes with nextBufferForConsumer, so this relies on it not being -1 here; confirm
+ bufferInfo[nextBufferForConsumer].previous = totalBuffersNeeded - 1;
+ nextBufferForConsumer = maxBuffers; // the first header buffer becomes the new head of the consumer list
+ if (hitEOFReadingHeader) {
+ bufferInfo[totalBuffersNeeded - 1].isEOF = true;
+ }
+ }
+ //
+ // Kick off IO, wait for the first buffer to be read
+ //
+ startIo();
+ waitForBuffer(nextBufferForConsumer);
+ ReleaseExclusiveLock(&lock);
+ //
+ // Now, consume data until we've gotten to startingOffset.
+ //
+ _int64 bytesToSkip = startingOffset;
+ while (bytesToSkip > 0) {
+ char *p;
+ _int64 valid, start;
+ bool ok = getData(&p, &valid, &start);
+ if (!ok) {
+ WriteErrorMessage("ReadBasedDataReader::init() failure getting data\n");
+ soft_exit(1);
+ }
+ _int64 bytesToSkipThisTime = __min(valid, bytesToSkip);
+ advance(bytesToSkipThisTime);
+ if (bytesToSkipThisTime > start) { // skipped beyond the bytes that may begin a read in this buffer, so move to the next batch
+ nextBatch();
+ }
+ getData(&p, &valid, &start); // NOTE(review): the result of this call is not checked
+ bytesToSkip -= bytesToSkipThisTime;
+ }
+ bool
+ char** o_buffer, // presumably ReadBasedDataReader::getData; its signature line is missing from this copy
+ _int64* o_validBytes,
+ _int64* o_startBytes)
+ if (NULL != headerBuffer && !startedReadingHeader) { // a header buffer exists but was never turned into header buffers by reinit, so it can be freed
+ delete[] headerBuffer;
+ headerBuffer = NULL;
+ _ASSERT(NULL == headerExtra);
+ }
+ return getDataInternal(o_buffer, o_validBytes, o_startBytes);
+ bool
+ char** o_buffer, // receives a pointer to the consumer's current position in the current buffer
+ _int64* o_validBytes, // receives the number of bytes available from that position
+ _int64* o_startBytes) // optional; receives nBytesThatMayBeginARead minus the current offset
+ _ASSERT(nextBufferForConsumer >= 0);
+ BufferInfo *info = &bufferInfo[nextBufferForConsumer];
+ if (info->isEOF && info->offset >= info->validBytes) {
+ //
+ // EOF.
+ //
+ return false;
+ }
+ if (info->offset >= info->nBytesThatMayBeginARead) {
+ //
+ // Past the end of our section.
+ //
+ return false;
+ }
+ if (info->state != Full) { // the buffer's data is not ready yet; wait for it under the lock
+ _ASSERT(info->state != InUse);
+ AcquireExclusiveLock(&lock);
+ waitForBuffer(nextBufferForConsumer);
+ ReleaseExclusiveLock(&lock);
+ }
+ *o_buffer = info->buffer + info->offset;
+ *o_validBytes = info->validBytes - info->offset;
+ if (o_startBytes != NULL) {
+ *o_startBytes = info->nBytesThatMayBeginARead - info->offset;
+ }
+ return true;
+ void
+ _int64 bytes) // presumably ReadBasedDataReader::advance (signature line missing from this copy): moves the consumer offset forward in the current buffer
+ BufferInfo* info = &bufferInfo[nextBufferForConsumer];
+ _ASSERT(info->validBytes >= info->offset && bytes >= 0 && bytes <= info->validBytes - info->offset);
+ info->offset += min(info->validBytes - info->offset, (unsigned)max((_int64)0, bytes)); // clamped to [0, remaining valid bytes] even when the assertion is compiled out
+ void
+ AcquireExclusiveLock(&lock); // presumably the body of ReadBasedDataReader::nextBatch (signature line missing from this copy)
+ _ASSERT(nextBufferForConsumer >= 0);
+ BufferInfo* info = &bufferInfo[nextBufferForConsumer];
+ if (info->isEOF) { // nothing follows the EOF buffer: release it if unheld and stay put
+ ReleaseExclusiveLock(&lock);
+ if (info->holds == 0) {
+ releaseBatch(DataBatch(info->batchID));
+ }
+ return;
+ }
+ DataBatch priorBatch = DataBatch(info->batchID);
+ info->state = InUse;
+ _uint32 overflow = max((unsigned) info->offset, info->nBytesThatMayBeginARead) - info->nBytesThatMayBeginARead; // bytes consumed beyond this buffer's section; applied as the starting offset of the next buffer
+ _int64 nextStart = info->fileOffset + info->nBytesThatMayBeginARead; // for validation
+ //fprintf(stderr, "ReadBasedDataReader:nextBatch() finished buffer %d, starting buffer %d\n", nextBufferForConsumer, info->next);
+ //fprintf(stderr, "ReadBasedDataReader:nextBatch() skipping %u overflow bytes used in previous batch\n", overflow);
+ nextBufferForConsumer = info->next;
+ bool first = true;
+ while (nextBufferForConsumer == -1) { // no buffer queued for the consumer: start IO, and on later passes wait for a release or add a buffer
+ nextStart = 0; // can no longer count on getting sequential buffers from file
+ ReleaseExclusiveLock(&lock);
+ if (! first) {
+ //fprintf(stderr, "ReadBasedDataReader::nextBatch thread %d wait for release\n", GetCurrentThreadId());
+ _int64 start = timeInNanos();
+ bool waitSucceeded = WaitForEventWithTimeout(&releaseEvent, releaseWaitInMillis);
+ InterlockedAdd64AndReturnNewValue(&ReleaseWaitTime, timeInNanos() - start);
+ //fprintf(stderr, "ReadBasedDataReader::nextBatch thread %d released\n", GetCurrentThreadId());
+ if (!waitSucceeded) { // timed out waiting for a release, so grow the pool instead
+ AcquireExclusiveLock(&lock);
+ addBuffer();
+ ReleaseExclusiveLock(&lock);
+ }
+ }
+ first = false;
+ AcquireExclusiveLock(&lock);
+ startIo();
+ }
+ if (bufferInfo[nextBufferForConsumer].state != Full) {
+ waitForBuffer(nextBufferForConsumer);
+ }
+ bufferInfo[nextBufferForConsumer].offset = overflow;
+ bufferInfo[nextBufferForConsumer].holds = 0;
+ //fprintf(stderr,"emitting buffer starting at 0x%llx\n", info->fileOffset);
+ //if (nextStart != 0) fprintf(stderr, "checking NextStart 0x%llx\n", nextStart);
+ _ASSERT(nextStart == 0 || nextStart == bufferInfo[nextBufferForConsumer].fileOffset || bufferInfo[nextBufferForConsumer].isEOF);
+ ReleaseExclusiveLock(&lock);
+ if (info->holds == 0) { // NOTE(review): info->holds is read here after the lock has been released; confirm this is intended
+ releaseBatch(priorBatch);
+ }
+ bool // presumably the isEOF() accessor; its name/brace lines are absent from this excerpt
+ return bufferInfo[nextBufferForConsumer].isEOF; // EOF flag of the buffer indexed by nextBufferForConsumer (read without taking the lock)
+ DataBatch // presumably the getBatch() accessor; its name/brace lines are absent from this excerpt
+ return DataBatch(bufferInfo[nextBufferForConsumer].batchID); // wraps the batch id of the buffer indexed by nextBufferForConsumer
+ void // presumably holdBatch (see the disabled trace below); the name line is absent from this excerpt
+ DataBatch batch) // batch whose buffers should gain one hold
+ AcquireExclusiveLock(&lock);
+ for (unsigned i = 0; i < maxBuffers + nHeaderBuffersAllocated; i = (i == nBuffers - 1) ? maxBuffers : i+1) { // Goofy loop is because headerBuffers get tacked on beyond maxBuffers
+ BufferInfo *info = &bufferInfo[i];
+ if (info->batchID == batch.batchID) { // every buffer carrying this batch id is bumped
+ //fprintf(stderr, "%x holdBatch batch 0x%x, holds on buffer %d now %d\n", (unsigned) this, batch.batchID, i, info->holds);
+ info->holds++;
+ }
+ }
+ ReleaseExclusiveLock(&lock);
+ bool // presumably releaseBatch (see the error text below); the name line is absent from this excerpt
+ DataBatch batch) // batch whose buffers should lose one hold
+ AcquireExclusiveLock(&lock);
+ bool released = false; // set when an InUse buffer drops its last hold; triggers releaseEvent below
+ bool result = true; // stays true when no buffer matches; otherwise reflects the last matching buffer
+ for (unsigned i = 0; i < maxBuffers + nHeaderBuffersAllocated; i = (i == nBuffers - 1) ? maxBuffers : i + 1) { // Goofy loop is because headerBuffers get tacked on beyond maxBuffers
+ BufferInfo* info = &bufferInfo[i];
+ if (info->batchID == batch.batchID) {
+ switch (info->state) {
+ case Empty:
+ // should never happen
+ break;
+ case Reading:
+ // should never happen
+ _ASSERT(false);
+ break;
+ case InUse:
+ released = info->holds <= 1; // this release will bring holds to zero
+ // fall through
+ case Full:
+ if (info->holds > 0) {
+ info->holds--;
+ }
+ if (info->holds == 0) {
+ //fprintf(stderr,"%x releaseBatch batch %d, releasing %s buffer %d\n", (unsigned) this, batch.batchID, info->state == InUse ? "InUse" : "Full", i);
+ info->state = Empty;
+ // remove from ready list
+ if (i == nextBufferForConsumer) {
+ //fprintf(stderr, "ReadBasedDataReader::releaseBatch change nextBufferForConsumer %d->%d\n", nextBufferForConsumer, info->next);
+ nextBufferForConsumer = info->next;
+ }
+ if (i == lastBufferForConsumer) {
+ lastBufferForConsumer = info->previous;
+ }
+ if (info->next != -1) { // unlink from the doubly linked consumer list
+ bufferInfo[info->next].previous = info->previous;
+ }
+ if (info->previous != -1) {
+ bufferInfo[info->previous].next = info->next;
+ }
+ if (info->headerBuffer) {
+ // Header buffers never get reused. Just get rid of it.
+ info->buffer = NULL;
+ info->extra = NULL;
+ _ASSERT(headerBuffersOutstanding > 0);
+ if (headerBuffersOutstanding > 0) {
+ headerBuffersOutstanding--;
+ if (0 == headerBuffersOutstanding) { // last header buffer gone: free the shared header storage
+ delete[] headerBuffer;
+ delete[] headerExtra;
+ headerBuffer = headerExtra = NULL;
+ nHeaderBuffersAllocated = 0;
+ }
+ }
+ } else {
+ // add to head of free list
+ info->next = nextBufferForReader;
+ info->batchID = 0;
+#ifdef _DEBUG
+ memset(info->buffer, 0xde, bufferSize + extraBytes); // poison the released buffer; NOTE(review): the #endif closing the _DEBUG guard is not visible in this excerpt
+ nextBufferForReader = i;
+ }
+ result = true;
+ } else {
+ //fprintf(stderr,"%x releaseBatch batch %d, holds on buffer %d now %d\n", (unsigned) this, batch.batchID, i, info->holds);
+ result = false; // still held by someone else
+ }
+ break;
+ default:
+ WriteErrorMessage("ReadBasedDataReader::releaseBatch():invalid enum\n");
+ soft_exit(1);
+ }
+ }
+ }
+ startIo(); // freed buffers can be refilled immediately
+ if (released) {
+ //fprintf(stderr, "releaseBatch set releaseEvent\n");
+ AllowEventWaitersToProceed(&releaseEvent);
+ }
+ ReleaseExclusiveLock(&lock);
+ return result;
+ _int64 // presumably the getFileOffset() accessor; its name/brace lines are absent from this excerpt
+ return bufferInfo[nextBufferForConsumer].fileOffset + bufferInfo[nextBufferForConsumer].offset; // buffer's file offset plus the position consumed within it
+ void // presumably getExtra(); its name/brace lines are absent from this excerpt
+ char** o_extra, // out: pointer to the current buffer's extra area
+ _int64* o_length) // out: size of that area (extraBytes)
+ // hack: return valid buffer even when no consumer buffers - this may happen when reading header
+ *o_extra = bufferInfo[max(0, nextBufferForConsumer)].extra; // index -1 is clamped to buffer 0
+ *o_length = extraBytes;
+ void // presumably ReadBasedDataReader::addBuffer (see the messages below); the name line is absent from this excerpt
+ if (nBuffers == maxBuffers) { // pool already at its maximum: nothing to add
+ //fprintf(stderr, "ReadBasedDataReader: addBuffer at limit\n");
+ return;
+ }
+ _ASSERT(nBuffers < maxBuffers);
+ //fprintf(stderr, "ReadBasedDataReader: addBuffer %d of %d\n", nBuffers, maxBuffers);
+ size_t bytes = bufferSize + extraBytes + overflowBytes; // footprint of one buffer
+ bufferInfo[nBuffers].buffer = bufferInfo[nBuffers-1].buffer + bytes; // new buffer starts right after the previous one; assumes nBuffers >= 1 -- TODO confirm
+ if (! BigCommit(bufferInfo[nBuffers].buffer, bytes)) {
+ WriteErrorMessage("ReadBasedDataReader: unable to commit IO buffer\n");
+ soft_exit(1);
+ }
+ bufferInfo[nBuffers].extra = extraBytes > 0 ? bufferInfo[nBuffers].buffer + bytes - extraBytes : NULL; // extra area is the last extraBytes of the buffer
+ bufferInfo[nBuffers].state = Empty;
+ bufferInfo[nBuffers].isEOF= false;
+ bufferInfo[nBuffers].offset = 0;
+ bufferInfo[nBuffers].next = nextBufferForReader; // push onto the head of the reader free list
+ bufferInfo[nBuffers].previous = -1;
+ bufferInfo[nBuffers].headerBuffer = false;
+ nextBufferForReader = nBuffers;
+ nBuffers++;
+ _ASSERT(nBuffers <= maxBuffers);
+ if (nBuffers == maxBuffers) { // no further growth possible, so stop using short timeouts
+ releaseWaitInMillis = 1000 * 3600 * 24 * 7; // A week
+ }
+class StdioDataReader : public ReadBasedDataReader // synchronous reader over stdin; brace and access-specifier lines are absent from this excerpt
+ StdioDataReader(unsigned i_nBuffers, _int64 i_overflowBytes, double extraFactor);
+ ~StdioDataReader();
+ virtual bool init(const char* i_fileName);
+ virtual const char* getFilename()
+ { return "-"; }
+ protected:
+ // must hold the lock to call
+ virtual void startIo();
+ // must hold the lock to call
+ virtual void waitForBuffer(unsigned bufferNumber);
+ //
+ // Because reads don't necessarily divide evenly into buffers, we have to assure that
+ // the buffers that we read can overlap. In file-IO based readers, we do this by reading
+ // a buffer's worth of data each time, but advancing the file pointer only by
+ // bufferSize - overflowBytes, so each buffer overlaps with its predecessor by a little.
+ // That doesn't work for stdio, since it can't rewind. So, instead, we allocate
+ // storage on the side to hold a copy of the last overflowBytes
+ // and then just copy those bytes into the beginning of the next buffer to read.
+ // We also use this buffer to hold the header (the first read), and to allow
+ // reading the header plus some extra data, parsing the header, and then seeking
+ // backward to the actual end of the header.
+ //
+ char *overflowBuffer;
+ bool overflowBufferFilled; // For the very first read, there may be no overlap buffer data.
+ bool started; // set by startIo() on entry
+ bool hitEOF; // set once fread() comes up short with feof(stdin) true
+ _int64 readOffset; // total bytes read from stdin so far
+StdioDataReader::StdioDataReader(unsigned i_nBuffers, _int64 i_overflowBytes, double extraFactor) : // forwards sizing to the base class; empty body lines are absent from this excerpt
+ ReadBasedDataReader(i_nBuffers, i_overflowBytes, extraFactor), started(false), hitEOF(false), overflowBufferFilled(false), // members are initialised in declaration order, not the order written here
+ readOffset(0), overflowBuffer(NULL) // overflow buffer is allocated lazily (see startIo)
+ BigDealloc(overflowBuffer); // presumably the ~StdioDataReader body (signature line absent); overflowBuffer may still be NULL here -- confirm BigDealloc accepts NULL
+ overflowBuffer = NULL;
+StdioDataReader::init(const char * i_fileName) // accepts only "-" as the file name; return-type and brace lines are absent from this excerpt
+ if (strcmp(i_fileName, "-")) { // non-zero means the name differs from "-"
+ WriteErrorMessage("StdioDataReader: must have filename of '-', got '%s'\n", i_fileName);
+ soft_exit(1);
+ }
+#ifdef _MSC_VER
+ int result = _setmode( _fileno( stdin ), _O_BINARY ); // puts stdin in to non-translated mode, so if we're reading compressed data windows' CRLF processing doesn't destroy it.
+ if (-1 == result) {
+ WriteErrorMessage("StdioDataReader::freopen to change to untranslated mode failed\n");
+ soft_exit(1);
+ }
+#endif // _MSC_VER
+ return true; // failures above exit the process instead of returning false
+ AssertExclusiveLockHeld(&lock); // presumably the StdioDataReader::startIo body (see the disabled traces below); signature lines are absent from this excerpt
+ started = true;
+ //
+ // Synchronously read data into whatever buffers are ready.
+ //
+ while (nextBufferForReader != -1) {
+ // remove from free list
+ BufferInfo* info = &bufferInfo[nextBufferForReader];
+ _ASSERT(info->state == Empty);
+ int index = nextBufferForReader;
+ nextBufferForReader = info->next;
+ info->batchID = nextBatchID++; // each fill gets a fresh batch id
+ // add to end of consumer list
+ if (lastBufferForConsumer != -1) {
+ _ASSERT(bufferInfo[lastBufferForConsumer].next == -1);
+ bufferInfo[lastBufferForConsumer].next = index;
+ }
+ info->next = -1;
+ info->previous = lastBufferForConsumer;
+ lastBufferForConsumer = index;
+ if (nextBufferForConsumer == -1) { // consumer list was empty: this buffer becomes its head
+ //fprintf(stderr, "StdioDataReader::startIo set nextBufferForConsumder -1 -> %d\n", index);
+ nextBufferForConsumer = index;
+ }
+ if (hitEOF) { // stdin already exhausted: hand out an empty EOF buffer and stop
+ info->validBytes = 0;
+ info->buffer[0] = '\0';
+ info->nBytesThatMayBeginARead = 0;
+ info->isEOF = true;
+ info->state = Full;
+ return;
+ }
+ size_t amountToRead;
+ size_t bufferOffset; // where fresh stdin data starts inside the buffer
+ if (overflowBufferFilled) {
+ //
+ // Copy the bytes from the overflow buffer into our buffer.
+ //
+ memcpy(info->buffer, overflowBuffer, overflowBytes);
+ bufferOffset = overflowBytes;
+ amountToRead = bufferSize - overflowBytes;
+ info->fileOffset = readOffset - overflowBytes; // buffer logically starts at the saved overlap
+ } else {
+ amountToRead = bufferSize;
+ bufferOffset = 0;
+ info->fileOffset = readOffset;
+ }
+ //
+ // We have to run this holding the lock, because otherwise there's no way to make the overflow buffer work properly.
+ //
+ size_t bytesRead = fread(info->buffer + bufferOffset, 1, amountToRead, stdin);
+ //fprintf(stderr,"StdioDataReader:startIO(): Read offset 0x%llx into buffer at 0x%llx, size %d, copied 0x%x overflow bytes, start at 0x%llx, tid %d\n", readOffset, info->buffer, bytesRead, bufferOffset, readOffset - bufferOffset, GetCurrentThreadId());
+ readOffset += bytesRead;
+ if (bytesRead != amountToRead) { // short read: either EOF or a hard error
+ if (feof(stdin)) {
+ info->isEOF = true;
+ hitEOF = true;
+ } else {
+ WriteErrorMessage("StdinDataReader: Error reading stdin (but not EOF).\n");
+ soft_exit(1);
+ }
+ } else {
+ info->isEOF = false;
+ }
+ info->validBytes = (unsigned)(bytesRead + bufferOffset); // overlap bytes plus freshly read bytes
+ if (hitEOF) {
+ info->nBytesThatMayBeginARead = (unsigned)(bytesRead + bufferOffset); // final buffer: every valid byte may begin a read
+ overflowBufferFilled = false;
+ } else {
+ info->nBytesThatMayBeginARead = (unsigned)(bytesRead + bufferOffset - overflowBytes); // last overflowBytes are repeated at the start of the next buffer
+ //
+ // Fill the overflow buffer with the last bytes from this buffer.
+ //
+ if (NULL == overflowBuffer) {
+ //
+ // We can get here if we never called readHeader(). If so, we know we never will and so
+ // we can just allocate the overflow buffer to be the size of the header.
+ //
+ overflowBuffer = (char *)BigAlloc(overflowBytes); // NOTE(review): allocation result is not checked here -- confirm BigAlloc cannot return NULL
+ }
+ memcpy(overflowBuffer, info->buffer + bufferOffset + bytesRead - overflowBytes, overflowBytes); // reached only after a full read (bytesRead == amountToRead)
+ overflowBufferFilled = true;
+ }
+ info->state = Full;
+ }
+ if (nextBufferForConsumer == -1) { // nothing for the consumer: make waiters on releaseEvent block
+ //fprintf(stderr, "startIo thread %x reset releaseEvent\n", GetCurrentThreadId());
+ PreventEventWaitersFromProceeding(&releaseEvent);
+ }
+ void // presumably StdioDataReader::waitForBuffer (see the disabled trace below); the name line is absent from this excerpt
+ unsigned bufferNumber) // index into bufferInfo of the buffer to wait for
+ _ASSERT(bufferNumber >= 0 && (bufferNumber < nBuffers || bufferNumber >= maxBuffers && 0 != headerBuffersOutstanding)); // bufferNumber is unsigned, so the >= 0 term is always true
+ BufferInfo *info = &bufferInfo[bufferNumber];
+ while (info->state == InUse) {
+ //fprintf(stderr, "StdioDataReader::waitForBuffer %d InUse...\n", bufferNumber);
+ // must already have lock to call, release & wait & reacquire
+ ReleaseExclusiveLock(&lock);
+ // TODO: implement timed wait on Linux
+#ifdef _MSC_VER
+ _int64 start = timeInNanos();
+ _uint32 waitTime;
+ if (releaseWaitInMillis > 0xffffffff) { // too large for a 32-bit timeout: wait forever
+ waitTime = INFINITE;
+ } else {
+ waitTime = (_uint32)releaseWaitInMillis;
+ }
+ _uint32 result = WaitForSingleObject(releaseEvent, waitTime);
+ InterlockedAdd64AndReturnNewValue(&ReleaseWaitTime, timeInNanos() - start);
+ WaitForEvent(&releaseEvent); // NOTE(review): the #else/#endif lines for the guard above are not visible in this excerpt; presumably this is the non-Windows branch
+ AcquireExclusiveLock(&lock);
+#ifdef _MSC_VER
+ if (result == WAIT_TIMEOUT) {
+ // this isn't going to directly make this buffer available, but will reduce pressure
+ addBuffer();
+ }
+ }
+ if (info->state == Full) {
+ return;
+ }
+ _ASSERT(info->state != Reading); // We're synchronous, we don't use Reading
+ startIo(); // reads synchronously into every free buffer
+ info->state = Full;
+ info->buffer[info->validBytes] = 0; // NUL-terminate right after the valid data
+class StdioDataSupplier : public DataSupplier // hands out at most one StdioDataReader per process; brace/access-specifier lines are absent from this excerpt
+ StdioDataSupplier() : DataSupplier() {}
+ virtual DataReader* getDataReader(int bufferCount, _int64 overflowBytes, double extraFactor = 0.0, size_t bufferSpace = 0) // bufferSpace is not used by this supplier
+ {
+ if (supplied) { // a second request is a fatal usage error
+ WriteErrorMessage("You can only use stdin input for one run per execution of SNAP (i.e., if you use ',' to run SNAP more than once without reloading the index, you can only use stdin once)\n");
+ soft_exit_no_print(1);
+ }
+ supplied = true;
+ return new StdioDataReader(bufferCount, overflowBytes, extraFactor);
+ }
+ static bool supplied; // process-wide flag, not guarded by any lock in this class
+bool StdioDataSupplier::supplied = false;
+#ifdef _MSC_VER
+class WindowsOverlappedDataReader : public ReadBasedDataReader // reader built on Win32 overlapped ReadFile; brace/access-specifier lines are absent from this excerpt
+ WindowsOverlappedDataReader(unsigned i_nBuffers, _int64 i_overflowBytes, double extraFactor, size_t bufferSpace);
+ virtual ~WindowsOverlappedDataReader();
+ virtual bool init(const char* i_fileName);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+// virtual char* readHeader(_int64* io_headerSize);
+ virtual const char* getFilename()
+ { return fileName; }
+ protected:
+ // must hold the lock to call
+ virtual void startIo();
+ // must hold the lock to call
+ virtual void waitForBuffer(unsigned bufferNumber);
+ // must hold the lock to call
+ virtual void addBuffer();
+ OVERLAPPED *bufferLaps; // one OVERLAPPED per buffer, indexed like bufferInfo
+ const char* fileName; // pointer passed to init(); not copied
+ HANDLE hFile;
+ LARGE_INTEGER readOffset; // file position of the next read to issue
+ _int64 endingOffset; // reads stop starting at this file offset (set in reinit)
+WindowsOverlappedDataReader::WindowsOverlappedDataReader(unsigned i_nBuffers, _int64 i_overflowBytes, double extraFactor, size_t bufferSpace) : // brace lines are absent from this excerpt
+ ReadBasedDataReader(i_nBuffers, i_overflowBytes, extraFactor, bufferSpace), fileName(NULL), hFile(INVALID_HANDLE_VALUE), endingOffset(0)
+ readOffset.QuadPart = 0;
+ bufferLaps = (OVERLAPPED *)malloc(sizeof(OVERLAPPED) * maxBuffers); // NOTE(review): result is unchecked and the memory is not zeroed -- confirm
+ for (unsigned i = 0; i < i_nBuffers; i++) { // events are created only for the initial buffers; addBuffer() creates the rest
+ bufferLaps[i].hEvent = CreateEvent(NULL,TRUE,FALSE,NULL); // manual-reset, initially non-signalled
+ if (NULL == bufferLaps[i].hEvent) {
+ WriteErrorMessage("WindowsOverlappedDataReader: Unable to create event\n");
+ soft_exit(1);
+ }
+ }
+ for (unsigned i = 0; i < nBuffers; i++) { // presumably the destructor body (signature line absent): close one event per buffer currently in the pool
+ CloseHandle(bufferLaps[i].hEvent);
+ }
+ free(bufferLaps);
+ bufferLaps = NULL;
+ CloseHandle(hFile); // NOTE(review): hFile may still be INVALID_HANDLE_VALUE if init() never succeeded -- confirm
+WindowsOverlappedDataReader::init(const char* i_fileName) // return-type and brace lines are absent from this excerpt
+ fileName = i_fileName; // stores the caller's pointer, not a copy
+ if (INVALID_HANDLE_VALUE == hFile) { // NOTE(review): the statement that opens hFile is not visible in this excerpt -- confirm against the full file
+ return false;
+ }
+ if (!GetFileSizeEx(hFile,&fileSize)) { // fileSize is later used to bound reads in reinit()/startIo()
+ WriteErrorMessage("WindowsOverlappedDataReader: unable to get file size of '%s', %d\n",fileName,GetLastError());
+ return false;
+ }
+ return true;
+ void // presumably WindowsOverlappedDataReader::reinit (see the assert comment below); the name line is absent from this excerpt
+ _int64 i_startingOffset, // file offset at which reading restarts
+ _int64 amountOfFileToProcess) // byte count to cover; 0 means through end of file
+ _ASSERT(INVALID_HANDLE_VALUE != hFile); // Must call init() before reinit()
+ AcquireExclusiveLock(&lock);
+ //
+ // First let any pending IO complete.
+ //
+ for (unsigned i = 0; i < nBuffers; i++) {
+ if (bufferInfo[i].state == Reading) {
+ waitForBuffer(i);
+ }
+ bufferInfo[i].state = Empty; // every buffer is reset regardless of its prior state
+ bufferInfo[i].isEOF= false;
+ bufferInfo[i].offset = 0;
+ bufferInfo[i].next = i < nBuffers - 1 ? i + 1 : -1; // rebuild the free list as 0 -> 1 -> ... -> nBuffers-1
+ bufferInfo[i].previous = i > 0 ? i - 1 : -1;
+ }
+ nextBufferForConsumer = -1; // consumer list starts empty
+ lastBufferForConsumer = -1;
+ nextBufferForReader = 0;
+ readOffset.QuadPart = i_startingOffset;
+ if (amountOfFileToProcess == 0) {
+ //
+ // This means just read the whole file.
+ //
+ endingOffset = fileSize.QuadPart;
+ } else {
+ endingOffset = min(fileSize.QuadPart,i_startingOffset + amountOfFileToProcess); // clamp to the file size
+ }
+ //
+ // Kick off IO, wait for the first buffer to be read
+ //
+ startIo();
+ waitForBuffer(nextBufferForConsumer); // relies on startIo() having set nextBufferForConsumer (needs at least one buffer)
+ ReleaseExclusiveLock(&lock);
+ void // presumably WindowsOverlappedDataReader::startIo (see the error text below); the name line is absent from this excerpt
+ //
+ // Launch reads on whatever buffers are ready.
+ //
+ AssertExclusiveLockHeld(&lock);
+ while (nextBufferForReader != -1) {
+ // remove from free list
+ BufferInfo* info = &bufferInfo[nextBufferForReader];
+ OVERLAPPED *bufferLap = &bufferLaps[nextBufferForReader];
+ _ASSERT(info->state == Empty);
+ int index = nextBufferForReader;
+ nextBufferForReader = info->next;
+ info->batchID = nextBatchID++; // each read gets a fresh batch id
+ // add to end of consumer list
+ if (lastBufferForConsumer != -1) {
+ _ASSERT(bufferInfo[lastBufferForConsumer].next == -1);
+ bufferInfo[lastBufferForConsumer].next = index;
+ }
+ info->next = -1;
+ info->previous = lastBufferForConsumer;
+ lastBufferForConsumer = index;
+ if (nextBufferForConsumer == -1) {
+ nextBufferForConsumer = index;
+ }
+ if (readOffset.QuadPart >= fileSize.QuadPart || readOffset.QuadPart >= endingOffset) { // nothing left in range: emit an empty EOF buffer and stop
+ info->validBytes = 0;
+ info->nBytesThatMayBeginARead = 0;
+ info->isEOF = true;
+ info->state = Full;
+ SetEvent(bufferLap->hEvent);
+ return;
+ }
+ unsigned amountToRead;
+ _int64 finalOffset = min(fileSize.QuadPart, endingOffset + overflowBytes); // last byte that may be read (range end plus overlap)
+ _int64 finalStartOffset = min(fileSize.QuadPart, endingOffset); // last offset at which a read may begin
+ amountToRead = (unsigned)min(finalOffset - readOffset.QuadPart, (_int64) bufferSize); // Cast OK because can't be longer than unsigned bufferSize
+ info->isEOF = readOffset.QuadPart + amountToRead == finalOffset; // this read reaches the end of the range
+ info->nBytesThatMayBeginARead = (unsigned)min((_int64)bufferSize - overflowBytes, finalStartOffset - readOffset.QuadPart);
+ _ASSERT(amountToRead >= info->nBytesThatMayBeginARead && (!info->isEOF || finalOffset == readOffset.QuadPart + amountToRead));
+ ResetEvent(bufferLap->hEvent);
+ bufferLap->Offset = readOffset.LowPart;
+ bufferLap->OffsetHigh = readOffset.HighPart;
+ info->fileOffset = readOffset.QuadPart;
+ readOffset.QuadPart += info->nBytesThatMayBeginARead; // advance by less than amountToRead so consecutive buffers overlap
+ info->state = Reading;
+ info->offset = 0;
+ //fprintf(stderr, "startIo on %d at %lld for %uB\n", index, readOffset, amountToRead);
+ if (!ReadFile(
+ hFile,
+ info->buffer,
+ amountToRead,
+ (DWORD *)&info->validBytes,
+ bufferLap)) {
+ if (GetLastError() != ERROR_IO_PENDING) { // ERROR_IO_PENDING just means the overlapped read is in flight
+ WriteErrorMessage("WindowsOverlappedDataReader::startIo(): readFile failed, %d\n",GetLastError());
+ soft_exit(1);
+ }
+ }
+ }
+ if (nextBufferForConsumer == -1) {
+ //fprintf(stderr, "startIo thread %x reset releaseEvent\n", GetCurrentThreadId());
+ ResetEvent(releaseEvent);
+ }
+ void // presumably WindowsOverlappedDataReader::waitForBuffer (see the disabled trace below); the name line is absent from this excerpt
+ unsigned bufferNumber) // index into bufferInfo / bufferLaps
+ _ASSERT(bufferNumber >= 0 && bufferNumber < nBuffers); // bufferNumber is unsigned, so only the upper bound is effective
+ BufferInfo *info = &bufferInfo[bufferNumber];
+ OVERLAPPED *bufferLap = &bufferLaps[bufferNumber];
+ while (info->state == InUse) {
+ //fprintf(stderr, "WindowsOverlappedDataReader::waitForBuffer %d InUse...\n", bufferNumber);
+ // must already have lock to call, release & wait & reacquire
+ ReleaseExclusiveLock(&lock);
+ _int64 start = timeInNanos();
+ DWORD waitTime;
+ if (releaseWaitInMillis > 0xffffffff) { // too large for a DWORD timeout: wait forever
+ waitTime = INFINITE;
+ } else {
+ waitTime = (DWORD)releaseWaitInMillis;
+ }
+ DWORD result = WaitForSingleObject(releaseEvent, waitTime);
+ InterlockedAdd64AndReturnNewValue(&ReleaseWaitTime, timeInNanos() - start);
+ AcquireExclusiveLock(&lock);
+ if (result == WAIT_TIMEOUT) {
+ // this isn't going to directly make this buffer available, but will reduce pressure
+ addBuffer();
+ }
+ }
+ if (info->state == Full) {
+ return;
+ }
+ if (info->state != Reading) { // buffer is neither Full nor Reading: try to launch a read for it
+ startIo();
+ }
+ _int64 start = timeInNanos();
+ if (!GetOverlappedResult(hFile, bufferLap, (DWORD *)&info->validBytes,TRUE)) { // TRUE = block until the overlapped read completes
+ WriteErrorMessage("Error reading FASTQ file, %d\n",GetLastError());
+ soft_exit(1);
+ }
+ InterlockedAdd64AndReturnNewValue(&ReadWaitTime, timeInNanos() - start); // accumulate time blocked on I/O
+ info->state = Full;
+ info->buffer[info->validBytes] = 0; // NUL-terminate right after the valid data
+ ResetEvent(bufferLap->hEvent);
+ if (nBuffers == maxBuffers) { // presumably the WindowsOverlappedDataReader::addBuffer body (signature lines absent from this excerpt)
+ WriteErrorMessage("WindowsOverlappedDataReader: addBuffer at limit\n");
+ return;
+ }
+ _ASSERT(nBuffers < maxBuffers);
+ bufferLaps[nBuffers].hEvent = CreateEvent(NULL,TRUE,FALSE,NULL); // event for the buffer slot the base class is about to fill
+ if (NULL == bufferLaps[nBuffers].hEvent) {
+ WriteErrorMessage("WindowsOverlappedDataReader: Unable to create event\n");
+ soft_exit(1);
+ }
+ ReadBasedDataReader::addBuffer(); // base class performs the actual buffer setup
+class WindowsOverlappedDataSupplier : public DataSupplier // factory for WindowsOverlappedDataReader; brace/access-specifier lines are absent from this excerpt
+ WindowsOverlappedDataSupplier() : DataSupplier() {}
+ virtual DataReader* getDataReader(int bufferCount, _int64 overflowBytes, double extraFactor, size_t bufferSpace)
+ {
+ // add some buffers for read-ahead
+ return new WindowsOverlappedDataReader(bufferCount + (bufferCount > 1 ? 4 : 0), overflowBytes, extraFactor, bufferSpace); // 4 extra buffers only when more than one was requested
+ }
+DataSupplier* DataSupplier::WindowsOverlapped = new WindowsOverlappedDataSupplier(); // singleton instance created at static-initialisation time
+#endif // _MSC_VER
+// Decompress
+static const int windowBits = 15; // passed to inflateInit2 below, OR-ed with ENABLE_ZLIB_GZIP
+static const int ENABLE_ZLIB_GZIP = 32; // per the zlib manual, adding 32 to windowBits enables automatic zlib/gzip header detection
+static const double MIN_FACTOR = 1.2; // readHeader divides the requested header size by this to size the compressed read
+static const double MAX_FACTOR = 10.0; // not referenced in this excerpt
+class DecompressDataReader : public DataReader // wraps an inner DataReader and exposes its data decompressed; brace/access-specifier lines are absent from this excerpt
+ DecompressDataReader(DataReader* i_inner, int i_count, _int64 totalExtra, _int64 i_extraBytes, _int64 i_overflowBytes, int i_chunkSize = BAM_BLOCK);
+ virtual ~DecompressDataReader();
+ virtual bool init(const char* fileName);
+ virtual char* readHeader(_int64* io_headerSize);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ virtual bool getData(char** o_buffer, _int64* o_validBytes, _int64* o_startBytes = NULL);
+ virtual void advance(_int64 bytes);
+ virtual void nextBatch();
+ virtual bool isEOF();
+ virtual DataBatch getBatch();
+ virtual void holdBatch(DataBatch batch);
+ virtual bool releaseBatch(DataBatch batch);
+ virtual _int64 getFileOffset();
+ virtual void getExtra(char** o_extra, _int64* o_length);
+ virtual const char* getFilename()
+ { return inner->getFilename(); }
+ enum DecompressMode { SingleBlock, ContinueMultiBlock, StartMultiBlock }; // selects inflate flush mode and whether the stream is (re)initialised; see decompress()
+ static bool decompress(z_stream* zstream, ThreadHeap* heap, char* input, _int64 inputSize, _int64* o_inputUsed,
+ char* output, _int64 outputSize, _int64* o_outputUsed, DecompressMode mode);
+ // debugging
+ char* findPointer(void* p);
+ static void decompressThread(void *context);
+ static void decompressThreadContinuous(void *context);
+ friend class DecompressManager;
+ friend class DecompressWorker;
+ enum EntryState
+ {
+ EntryReady, // for reading by client, on first list
+ EntryHeld, // finished reading but not released, not on a list
+ EntryAvailable, // released by client, on available list
+ EntryReading // reading or decompressing, not on a list
+ };
+ struct Entry
+ {
+ Entry* next; // next entry on first/available list
+ EntryState state;
+ DataBatch batch;
+ char* compressed;
+ _int64 compressedStart; // limit to start a new zip block
+ _int64 compressedValid; // total available data
+ char* decompressed;
+ _int64 decompressedStart;
+ _int64 decompressedValid;
+ bool allocated; // if decompressed has been allocated specially, not from inner extra data
+ };
+ // use only these routines to manipulate the linked lists
+ Entry* peekReady(); // from first, block if none
+ void popReady(); // from first
+ void enqueueReady(Entry* entry); // as last
+ Entry* dequeueAvailable(); // as available, block if none
+ void enqueueAvailable(Entry* entry); // from available
+ DataReader* inner; // inner reader for compressed data
+ const _int64 extraBytes; // number of bytes of extra that I get to use
+ const _int64 overflowBytes; // overflow between batches
+ const _int64 totalExtra; // total extra data
+ const int chunkSize; // max size of decompressed data
+ _int64 offset; // into current entry
+ bool threadStarted; // whether thread has been started
+ bool eof; // true when we've read to eof of previous
+ volatile bool stopping; // set to stop everything
+ EventObject decompressThreadDone; // signalled by background thread on exit
+ // entry lists
+ Entry* entries; // ring buffer of batches from inner reader
+ int count; // # of entries
+ Entry* first; // first ready buffer, NULL if none, currently being read by client
+ Entry* last; // last ready buffer, NULL if none
+ EventObject readyEvent; // signalled by bg thread when first goes NULL->non-NULL
+ Entry* available; // first non-ready buffer (head of freelist), NULL if none
+ EventObject availableEvent; // signalled by main thread when available goes NULL->non-NULL
+ ExclusiveLock lock; // lock on linked list pointers in this object and in Entry
+ DataReader* i_inner, // presumably the DecompressDataReader constructor's parameter list (name line absent from this excerpt)
+ int i_count, // number of Entry slots to allocate
+ _int64 i_totalExtra,
+ _int64 i_extraBytes,
+ _int64 i_overflowBytes,
+ int i_chunkSize)
+ : DataReader(), inner(i_inner), count(i_count), offset(i_overflowBytes), // offset starts just past the overflow area
+ totalExtra(i_totalExtra), extraBytes(i_extraBytes), overflowBytes(i_overflowBytes),
+ chunkSize(i_chunkSize), threadStarted(false), eof(false), stopping(false)
+ entries = new Entry[count];
+ for (int i = 0; i < count; i++) { // chain all entries into the available list
+ Entry* entry = &entries[i];
+ entry->state = EntryAvailable;
+ entry->next = i < count - 1 ? &entries[i + 1] : NULL;
+ entry->decompressed = NULL;
+ entry->allocated = false;
+ entry->batch = DataBatch(0, 0); // NOTE(review): compressed/compressedValid and the decompressed sizes are left unset here
+ }
+ available = entries;
+ first = last = NULL; // ready list starts empty
+ CreateEventObject(&readyEvent);
+ PreventEventWaitersFromProceeding(&readyEvent); // nothing ready yet
+ CreateEventObject(&availableEvent);
+ AllowEventWaitersToProceed(&availableEvent); // every entry is available
+ CreateEventObject(&decompressThreadDone);
+ PreventEventWaitersFromProceeding(&decompressThreadDone);
+ InitializeExclusiveLock(&lock);
+ if (threadStarted) { // presumably the ~DecompressDataReader body (signature line absent): stop the background thread first
+ stopping = true;
+ AllowEventWaitersToProceed(&availableEvent); // wake the thread if it is blocked waiting for an available entry
+ WaitForEvent(&decompressThreadDone);
+ }
+ for (int i = 0; i < count; i++) {
+ if (entries[i].allocated) { // only specially allocated decompressed buffers are freed here
+ BigDealloc(entries[i].decompressed);
+ }
+ }
+ DestroyExclusiveLock(&lock); // NOTE(review): no delete[] of entries and no teardown of the three event objects is visible in this excerpt -- confirm against the full file
+ delete inner; // this reader owns the inner reader
+ bool // presumably DecompressDataReader::init (name line absent from this excerpt)
+ const char* fileName)
+ return inner->init(fileName); // pure delegation to the inner reader
+ char* // presumably DecompressDataReader::readHeader (name line absent from this excerpt)
+ _int64* io_headerSize) // in: requested decompressed header size; out: bytes actually decompressed
+ z_stream zstream;
+ ThreadHeap heap(max(chunkSize,1000));
+ _int64 compressedBytes = (_int64)(*io_headerSize / MIN_FACTOR); // request fewer compressed bytes than the decompressed size wanted
+ char* compressed = inner->readHeader(&compressedBytes);
+ char* header;
+ _int64 total;
+ inner->getExtra(&header, &total); // decompress into the inner reader's extra area
+ _ASSERT(total >= totalExtra);
+ _int64 headerSize = 0;
+ while (headerSize < *io_headerSize && compressedBytes > 0) { // NOTE(review): decompress()'s return value is ignored; a call that consumes no input would not advance this loop
+ _int64 compressedBlockSize, decompressedBlockSize;
+ //fprintf(stderr,"decompress chunkSize %d compressedBytes %d headerSize %d totalExtra %d\n", chunkSize, compressedBytes, headerSize, totalExtra);
+ decompress(&zstream, chunkSize != 0 ? &heap : NULL,
+ compressed, compressedBytes, &compressedBlockSize,
+ header + headerSize, totalExtra - headerSize, &decompressedBlockSize,
+ StartMultiBlock);
+ // This just gets reinit()'ed later, and in the interim confuses the non-rewind stdio data reader. inner->advance(compressedBlockSize);
+ compressed += compressedBlockSize;
+ compressedBytes -= compressedBlockSize;
+ headerSize += decompressedBlockSize;
+ }
+ *io_headerSize = headerSize;
+ return header; // points into the inner reader's extra area
+ void // presumably DecompressDataReader::reinit (see the error text below); the name line is absent from this excerpt
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess)
+ if (threadStarted) { // only one reinit per reader is supported
+ WriteErrorMessage("DecompressDataReader reinit called twice\n");
+ soft_exit(1);
+ }
+ // todo: transform start/amount to add for compression? I don't think so...
+ inner->reinit(startingOffset, amountOfFileToProcess);
+ threadStarted = true;
+ if (! StartNewThread(chunkSize > 0 ? decompressThread : decompressThreadContinuous, this)) { // chunkSize picks the chunked or the continuous worker
+ WriteErrorMessage("failed to start decompressThread\n");
+ soft_exit(1);
+ }
+ bool // presumably DecompressDataReader::getData (name line absent from this excerpt)
+ char** o_buffer, // out: pointer to decompressed data at the current offset
+ _int64* o_validBytes, // out: bytes available from that pointer
+ _int64* o_startBytes) // out (optional): distance from the current offset to decompressedStart
+ if (eof) {
+ return false;
+ }
+ Entry* entry = peekReady(); // per its declaration comment, blocks if no entry is ready
+ if (offset >= entry->decompressedStart) { // consumed everything up to this entry's decompressedStart limit
+ return false;
+ }
+ *o_buffer = entry->decompressed + offset;
+ *o_validBytes = entry->decompressedValid - offset;
+ if (o_startBytes != NULL) {
+ *o_startBytes = entry->decompressedStart- offset;
+ }
+ return true;
+ void // presumably DecompressDataReader::advance (name line absent from this excerpt)
+ _int64 bytes) // bytes to consume; negative values are treated as 0
+ offset = min(offset + max(bytes, (_int64) 0), peekReady()->decompressedValid); // clamp to the end of the entry's valid data
+ void // presumably DecompressDataReader::nextBatch (see the disabled trace below); the name line is absent from this excerpt
+ if (eof) {
+ return;
+ }
+ Entry* old = peekReady();
+ popReady();
+ if (old->decompressedValid == overflowBytes) { // entry holds nothing beyond the overflow area: treat as end of data
+ eof = true;
+ return;
+ }
+ Entry* next = peekReady();
+ _ASSERT(next->state == EntryReady && next->decompressed != NULL);
+ _int64 copy = old->decompressedValid - max(offset, old->decompressedStart); // unconsumed tail of the old entry
+ memcpy(next->decompressed + overflowBytes - copy, old->decompressed + old->decompressedValid - copy, copy); // place the tail so it ends at next's overflowBytes mark; assumes copy <= overflowBytes -- TODO confirm
+ offset = overflowBytes - copy; // resume reading at the start of the carried-over tail
+ //fprintf(stderr,"DecompressDataReader nextBatch %d:%d #%d -> %d:%d #%d copy %lld + %lld/%lld\n", old->batch.fileID, old->batch.batchID, old-entries, next->batch.fileID, next->batch.batchID, next-entries, copy, next->decompressedStart, next->decompressedValid);
+ releaseBatch(old->batch); // holdBatch was called in decompress thread, release now if no customers added holds
+ if (offset == next->decompressedValid) { // nothing to read in the next entry either
+ eof = true;
+ _ASSERT(inner->isEOF());
+ }
+ bool // presumably DecompressDataReader::isEOF (name line absent from this excerpt)
+ return eof; // flag maintained by nextBatch()
+ DataBatch // presumably DecompressDataReader::getBatch (name line absent from this excerpt)
+ return peekReady()->batch; // batch of the first ready entry; peekReady is declared as blocking if none is ready
+ void // return type; brace lines are absent from this excerpt
+DecompressDataReader::holdBatch(DataBatch batch)
+ inner->holdBatch(batch); // holds are tracked entirely by the inner reader
+ bool // return type; brace lines are absent from this excerpt
+DecompressDataReader::releaseBatch(DataBatch batch)
+ if (! inner->releaseBatch(batch)) { // inner reader reports the batch is still held
+ return false;
+ }
+ // truly released, find matching entry & put back on available list
+ AcquireExclusiveLock(&lock);
+ for (int i = 0; i < count; i++) {
+ Entry* entry = &entries[i];
+ if (entry->batch == batch) {
+ //fprintf(stderr,"DecompressDataReader releaseBatch %d:0x%x #%d\n", batch.fileID, batch.batchID, i);
+ if (entry->state == EntryHeld) {
+ enqueueAvailable(entry);
+ } else {
+ _ASSERT(entry->state == EntryAvailable); // any other state here is unexpected
+ }
+ break; // only the first matching entry is handled
+ }
+ }
+ ReleaseExclusiveLock(&lock);
+ return true;
+ _int64 // presumably DecompressDataReader::getFileOffset (name line absent from this excerpt)
+ return inner->getFileOffset(); // offset as reported by the inner reader
+ void // presumably DecompressDataReader::getExtra (name line absent from this excerpt)
+ char** o_extra, // out: start of the extra area left for the caller
+ _int64* o_length) // out: size of that area
+ *o_extra = peekReady()->decompressed + extraBytes; // first extraBytes of the decompressed buffer are used by this reader
+ *o_length = totalExtra - extraBytes;
+ bool // presumably the static DecompressDataReader::decompress (name line absent from this excerpt)
+ z_stream* zstream, // caller-provided zlib stream state
+ ThreadHeap* heap, // optional allocator for zlib; NULL selects zlib's default allocation
+ char* input,
+ _int64 inputBytes,
+ _int64* o_inputRead, // out (optional): input bytes consumed
+ char* output,
+ _int64 outputBytes,
+ _int64* o_outputWritten, // out (optional): output bytes produced
+ DecompressMode mode)
+ if (inputBytes > 0xffffffff || outputBytes > 0xffffffff) { // zlib's avail_in/avail_out are uInt
+ WriteErrorMessage("GzipDataReader: inputBytes or outputBytes > max unsigned int\n");
+ soft_exit(1);
+ }
+ zstream->next_in = (Bytef*) input;
+ zstream->avail_in = (uInt)inputBytes;
+ zstream->next_out = (Bytef*) output;
+ zstream->avail_out = (uInt)outputBytes;
+ if (heap != NULL) {
+ zstream->zalloc = zalloc;
+ zstream->zfree = zfree;
+ zstream->opaque = heap;
+ } else {
+ zstream->zalloc = NULL;
+ zstream->zfree = NULL;
+ }
+ uInt oldAvailOut, oldAvailIn;
+ int block = 0;
+ bool multiBlock = true; // not read anywhere in the visible body
+ int status;
+ do {
+ if (mode != ContinueMultiBlock || block != 0) { // ContinueMultiBlock reuses the existing stream state for its first block only
+ if (heap != NULL) {
+ heap->reset();
+ }
+ status = inflateInit2(zstream, windowBits | ENABLE_ZLIB_GZIP); // NOTE(review): no inflateEnd call is visible in this body -- confirm stream state is not leaked when heap is NULL
+ if (status < 0) {
+ WriteErrorMessage("GzipDataReader: inflateInit2 failed with %d\n", status);
+ return false;
+ }
+ }
+ oldAvailOut = zstream->avail_out;
+ oldAvailIn = zstream->avail_in;
+ status = inflate(zstream, mode == SingleBlock ? Z_NO_FLUSH : Z_FINISH);
+ // fprintf(stderr, "decompress block #%d %lld -> %lld = %d\n", block, zstream.next_in - lastIn, zstream.next_out - lastOut, status);
+ block++;
+ if (status < 0 && status != Z_BUF_ERROR) { // Z_BUF_ERROR is tolerated here and examined next
+ WriteErrorMessage("GzipDataReader: inflate failed with %d\n", status);
+ soft_exit(1);
+ }
+ if (status < 0 && zstream->avail_out == 0 && zstream->avail_in > 0) { // output exhausted with input remaining
+ WriteErrorMessage("insufficient decompression buffer space - increase expansion factor, currently -xf %.1f\n", DataSupplier::ExpansionFactor);
+ soft_exit(1);
+ }
+ } while (zstream->avail_in != 0 && (zstream->avail_out != oldAvailOut || zstream->avail_in != oldAvailIn) && mode != SingleBlock); // loop while input remains and the last pass made progress
+ // fprintf(stderr, "end decompress status=%d, avail_in=%lld, last block=%lld->%lld, avail_out=%lld\n", status, zstream.avail_in, zstream.next_in - lastIn, zstream.next_out - lastOut, zstream.avail_out);
+ if (o_inputRead) {
+ *o_inputRead = inputBytes - zstream->avail_in;
+ }
+ if (o_outputWritten) {
+ *o_outputWritten = outputBytes - zstream->avail_out;
+ }
+ return zstream->avail_in == 0; // true only when all input was consumed
+ char* // presumably DecompressDataReader::findPointer, declared under "debugging" in the class (name line absent from this excerpt)
+ void* p) // address to classify against each entry's buffers
+ static char result[100]; // shared static buffer: the returned text is overwritten by the next call
+ sprintf(result, "not found");
+ for (int i = 0; i < count; i++) {
+ Entry* e = &entries[i];
+ if (e->compressed <= p && p < e->compressed + e->compressedValid) { // inside this entry's compressed data
+ sprintf(result, "compressed #%d @ %lld", i, (char*)p - e->compressed);
+ break;
+ }
+ if (e->decompressed <= p && p < e->decompressed + extraBytes) { // inside the first extraBytes of the decompressed buffer
+ sprintf(result, "decompressed #%d %lld", i, (char*) p - e->decompressed);
+ break;
+ }
+ if (e->decompressed + extraBytes <= p && p < e->decompressed + totalExtra) { // inside the remainder up to totalExtra
+ sprintf(result, "extra #%d %lld", i, (char*) p - e->decompressed - extraBytes);
+ break;
+ }
+ }
+ return result;
+typedef VariableSizeVector<_int64> OffsetVector;
+class DecompressWorker : public ParallelWorker // per-thread worker owning its own zlib stream and heap; brace/access-specifier lines are absent from this excerpt
+ DecompressWorker();
+ virtual void step();
+ z_stream zstream; // allocator hooks are set in the constructor
+ ThreadHeap heap; // backing store handed to zlib via zstream.opaque
+class DecompressManager: public ParallelWorkerManager // shares block offset tables and the current entry with the workers; brace/access-specifier lines are absent from this excerpt
+ DecompressManager(OffsetVector* i_inputs, OffsetVector* i_outputs)
+ : inputs(i_inputs), outputs(i_outputs) // entry is not initialised by this constructor
+ {}
+ virtual ParallelWorker* createWorker()
+ { return new DecompressWorker(); }
+ OffsetVector* inputs; // offsets into entry->compressed; consecutive pairs delimit one block
+ OffsetVector* outputs; // matching offsets into entry->decompressed
+ DecompressDataReader::Entry* entry; // entry the workers read from and write into
+ friend class DecompressWorker;
+ : heap(BAM_BLOCK) // presumably the DecompressWorker constructor's initialiser list (signature line absent from this excerpt)
+ zstream.zalloc = zalloc; // route zlib allocations through the worker's ThreadHeap
+ zstream.zfree = zfree;
+ zstream.opaque = &heap;
+ void // body of the worker step: decompress this worker's share of the current entry's blocks
+ DecompressManager* manager = (DecompressManager*) getManager();
+ for (int i = getThreadNum(); i < manager->inputs->size() - 1; i += getNumThreads()) { // blocks are interleaved across workers; the last offset is only an end marker
+ _int64 inputUsed, outputUsed;
+ DecompressDataReader::decompress(&zstream,
+ &heap,
+ manager->entry->compressed + (*manager->inputs)[i], // start of compressed block i
+ (*manager->inputs)[i + 1] - (*manager->inputs)[i], // compressed length of block i
+ &inputUsed,
+ manager->entry->decompressed + (*manager->outputs)[i], // destination for block i
+ (*manager->outputs)[i + 1] - (*manager->outputs)[i], // space reserved for block i
+ &outputUsed,
+ DecompressDataReader::SingleBlock);
+ _ASSERT(inputUsed == (*manager->inputs)[i + 1] - (*manager->inputs)[i] && // the block must consume and produce exactly the precomputed sizes
+ outputUsed == (*manager->outputs)[i + 1] - (*manager->outputs)[i]);
+ }
+ void // body of the block-parallel decompression thread (the debug traces below name it decompressThread)
+ void* context)
+ DecompressDataReader* reader = (DecompressDataReader*) context;
+ OffsetVector inputs, outputs; // per-block start offsets in the compressed / decompressed buffers
+ DecompressManager manager(&inputs, &outputs);
+ ParallelCoworker coworker(min(8, DataSupplier::ThreadCount), false, &manager); // never more than 8 helper workers
+ coworker.start();
+ // keep reading & decompressing entries until stopped
+ bool stop = false;
+ while (! stop) {
+ Entry* entry = reader->dequeueAvailable();
+ if (reader->stopping) {
+ break;
+ }
+ // always starts with a fresh batch - advances after reading it all
+ bool ok = reader->inner->getData(&entry->compressed, &entry->compressedValid, &entry->compressedStart);
+ int index = (int) (entry - reader->entries);
+ if (! ok) {
+ //fprintf(stderr, "decompressThread #%d %d:%d eof\n", index, reader->inner->getBatch().fileID, reader->inner->getBatch().batchID);
+ if (! reader->inner->isEOF()) {
+ WriteErrorMessage("error reading file at offset %lld\n", reader->getFileOffset());
+ soft_exit(1);
+ }
+ // mark as eof - no data
+ entry->decompressedValid = entry->decompressedStart = reader->overflowBytes;
+ DataBatch b = reader->inner->getBatch();
+ entry->batch = DataBatch(b.batchID + 1, b.fileID);
+ // decompressed buffer is same as next-to-last batch, need to allocate own buffer
+ entry->decompressed = (char*) BigAlloc(reader->totalExtra);
+ entry->allocated = true;
+ stop = true;
+ } else {
+ _int64 extraSize;
+ reader->inner->getExtra(&entry->decompressed, &extraSize);
+ _ASSERT(extraSize >= reader->extraBytes && extraSize >= reader->overflowBytes);
+ // figure out offsets and advance inner data
+ inputs.clear();
+ outputs.clear();
+ _int64 input = 0;
+ _int64 output = reader->overflowBytes; // decompressed data is written after the overflow area at the front of the buffer
+ do {
+ inputs.push_back(input);
+ outputs.push_back(output);
+ BgzfHeader* zip = (BgzfHeader*) (entry->compressed + input);
+ input += zip->BSIZE() + 1;
+ output += zip->ISIZE();
+ // report fatal errors through WriteErrorMessage, like the read-error path above
+ if (output > reader->extraBytes) {
+ WriteErrorMessage("insufficient decompression space, increase -xf parameter\n");
+ soft_exit(1);
+ }
+ if (input > entry->compressedValid || zip->BSIZE() >= BAM_BLOCK || zip->ISIZE() > BAM_BLOCK) {
+ WriteErrorMessage("error reading BAM file at offset %lld\n", reader->getFileOffset());
+ soft_exit(1);
+ }
+ } while (input < entry->compressedStart);
+ // append final offsets
+ inputs.push_back(input);
+ outputs.push_back(output);
+ //fprintf(stderr, "decompressThread read #%d %lld->%lld\n", index, input, output);
+ reader->inner->advance(input);
+ entry->decompressedValid = output;
+ entry->decompressedStart = output - reader->overflowBytes;
+ entry->batch = reader->inner->getBatch();
+ reader->holdBatch(entry->batch); // hold batch while decompressing
+ reader->inner->nextBatch(); // start reading next batch
+ // decompress all chunks synchronously on multiple threads
+ manager.entry = entry;
+ coworker.step();
+ }
+ // make buffer available for clients & go on to next
+ //fprintf(stderr, "decompressThread #%d %d:%d ready\n", index, entry->batch.fileID, entry->batch.batchID);
+ reader->enqueueReady(entry);
+ }
+ coworker.stop();
+ AllowEventWaitersToProceed(&reader->decompressThreadDone); // signal that this thread has finished
+ void // body of the single-stream decompression thread (the debug traces below name it decompressThreadContinuous)
+ void* context)
+ DecompressDataReader* reader = (DecompressDataReader*) context;
+ z_stream zstream; // one zlib stream reused across all batches
+ bool first = true; // selects StartMultiBlock for the first batch, ContinueMultiBlock afterwards
+ bool stop = false;
+ while (! stop) {
+ Entry* entry = reader->dequeueAvailable();
+ if (reader->stopping) {
+ break;
+ }
+ // always starts with a fresh batch - advances after reading it all
+ bool ok = reader->inner->getData(&entry->compressed, &entry->compressedValid, &entry->compressedStart);
+ int index = (int) (entry - reader->entries); // only referenced by the commented-out traces
+ if (! ok) {
+ //fprintf(stderr, "decompressThreadContinuous#%d %d:%d eof\n", index, reader->inner->getBatch().fileID, reader->inner->getBatch().batchID);
+ if (! reader->inner->isEOF()) {
+ WriteErrorMessage("error reading file at offset %lld\n", reader->getFileOffset());
+ soft_exit(1);
+ }
+ // mark as eof - no data
+ entry->decompressedValid = entry->decompressedStart = reader->overflowBytes;
+ DataBatch b = reader->inner->getBatch();
+ entry->batch = DataBatch(b.batchID + 1, b.fileID); // end-of-file marker gets the batch ID after the inner reader's current one
+ entry->decompressed = (char*) BigAlloc(reader->totalExtra);
+ entry->allocated = true;
+ stop = true;
+ } else {
+ // figure out offsets and advance inner data
+ _int64 ignore;
+ reader->inner->getExtra(&entry->decompressed, &ignore);
+ _ASSERT(ignore >= reader->extraBytes && ignore >= reader->overflowBytes);
+ _int64 compressedRead, decompressedWritten;
+ entry->batch = reader->inner->getBatch();
+ reader->holdBatch(entry->batch); // hold batch while decompressing
+ reader->inner->advance(entry->compressedValid); // the whole batch is consumed in one call
+ reader->inner->nextBatch(); // start reading next batch
+ decompress(&zstream, NULL,
+ entry->compressed, entry->compressedValid, &compressedRead,
+ entry->decompressed + reader->overflowBytes, reader->extraBytes - reader->overflowBytes, &decompressedWritten, // output is written after the overflow area
+ first ? StartMultiBlock : ContinueMultiBlock);
+ _ASSERT(compressedRead == entry->compressedValid && decompressedWritten <= reader->extraBytes - reader->overflowBytes);
+ entry->decompressedValid = reader->overflowBytes + decompressedWritten;
+ entry->decompressedStart = decompressedWritten;
+ first = false;
+ }
+ // make buffer available for clients & go on to next
+ //fprintf(stderr, "decompressThreadContinuous#%d %d:%d ready\n", index, entry->batch.fileID, entry->batch.batchID);
+ reader->enqueueReady(entry);
+ }
+ AllowEventWaitersToProceed(&reader->decompressThreadDone); // signal that this thread has finished
+ DecompressDataReader::Entry* // returns the head of the ready list without removing it (name line not present in this extract)
+ // not thread-safe relative to popReady!
+ if (first == NULL) { // nothing ready yet: block until readyEvent is set
+ WaitForEvent(&readyEvent);
+ }
+ _ASSERT(first->state == EntryReady);
+ return first;
+ void // removes the head of the ready list and marks it held (the debug trace below names this popReady)
+ while (true) {
+ AcquireExclusiveLock(&lock);
+ if (first != NULL) {
+ _ASSERT(first->state == EntryReady);
+ //fprintf(stderr, "popReady %d:%d #%d -> held\n", first->batch.fileID, first->batch.batchID, first - entries);
+ first->state = EntryHeld;
+ if (first->next == NULL) { // removing the only element: list becomes empty
+ _ASSERT(last == first);
+ last = NULL;
+ PreventEventWaitersFromProceeding(&readyEvent); // later callers must wait for the next enqueueReady
+ }
+ first = first->next;
+ _ASSERT(first == NULL || first->state == EntryReady);
+ ReleaseExclusiveLock(&lock);
+ return;
+ }
+ ReleaseExclusiveLock(&lock); // nothing ready: drop the lock before waiting, then retry
+ WaitForEvent(&readyEvent);
+ }
+ void // appends an entry to the tail of the ready list
+DecompressDataReader::enqueueReady(Entry* entry)
+ AcquireExclusiveLock(&lock);
+ _ASSERT(entry->state == EntryReading);
+ entry->next = NULL;
+ entry->state = EntryReady;
+ if (last == NULL) { // list was empty: entry becomes both head and tail, and waiters are released
+ first = last = entry;
+ AllowEventWaitersToProceed(&readyEvent);
+ } else {
+ last->next = entry;
+ last = entry;
+ }
+ ReleaseExclusiveLock(&lock);
+ DecompressDataReader::Entry* // takes an entry off the available list, waiting if needed; returns NULL once stopping is set (the debug trace below names this dequeueAvailable)
+ while (true) {
+ AcquireExclusiveLock(&lock);
+ //fprintf(stderr, "dequeueAvailable #%d\n", available == NULL ? -1 : available - entries);
+ if (available!= NULL) {
+ _ASSERT(available->state == EntryAvailable);
+ available->state = EntryReading;
+ Entry* result = available;
+ available = available->next;
+ if (available == NULL) { // list is now empty: later callers must wait
+ PreventEventWaitersFromProceeding(&availableEvent);
+ }
+ ReleaseExclusiveLock(&lock);
+ return result;
+ }
+ ReleaseExclusiveLock(&lock); // nothing available: drop the lock before waiting
+ WaitForEvent(&availableEvent);
+ if (stopping) { // checked only after a wait, so a non-empty list is still served above
+ return NULL;
+ }
+ }
+ void // pushes a held entry onto the front of the available list
+DecompressDataReader::enqueueAvailable(Entry* entry)
+ AssertExclusiveLockHeld(&lock); // the caller must already hold lock
+ _ASSERT(entry->state == EntryHeld);
+ entry->state = EntryAvailable;
+ entry->next = available;
+ available = entry;
+ if (entry->next == NULL) { // list was empty before this push: release anyone waiting in dequeueAvailable
+ AllowEventWaitersToProceed(&availableEvent);
+ }
+class DecompressDataReaderSupplier : public DataSupplier // supplier that wraps another supplier's readers in a DecompressDataReader
+ DecompressDataReaderSupplier(DataSupplier* i_inner, int i_blockSize = BAM_BLOCK)
+ : DataSupplier(), inner(i_inner), blockSize(i_blockSize)
+ {}
+ virtual DataReader* getDataReader(int bufferCount, _int64 overflowBytes, double extraFactor, size_t bufferSpace);
+ DataSupplier* inner; // supplier of the underlying (compressed) data
+ const int blockSize; // BAM_BLOCK by default; the factories in this file also pass 0
+ DataReader* // builds a DecompressDataReader on top of a reader obtained from the inner supplier
+ int bufferCount,
+ _int64 overflowBytes,
+ double extraFactor,
+ size_t bufferSpace)
+ // adjust extra factor for compression ratio
+ double expand = MAX_FACTOR * DataSupplier::ExpansionFactor;
+ double totalFactor = expand * (1.0 + extraFactor);
+ // get inner reader; blockSize is passed as its overflow, the caller's overflowBytes is handled by this layer
+ // add 2 buffers for the decompression thread
+ DataReader* data = inner->getDataReader(bufferCount + 2, blockSize, totalFactor, bufferSpace);
+ // compute how many extra bytes are owned by this layer
+ char* p;
+ _int64 totalExtra;
+ data->getExtra(&p, &totalExtra);
+ _int64 mine = (_int64)(totalExtra * expand / totalFactor); // this layer's share of the extra area is expand / totalFactor of the total
+ // create new reader, telling it how many bytes it owns
+ // it will subtract overflow off the end of each batch
+ return new DecompressDataReader(data, bufferCount, totalExtra, mine, overflowBytes, blockSize);
+ DataSupplier* // factory: decompressing supplier with BAM_BLOCK block size (presumably DataSupplier::GzipBam; name line not present in this extract)
+ DataSupplier* inner)
+ return new DecompressDataReaderSupplier(inner, BAM_BLOCK);
+ DataSupplier* // factory: decompressing supplier with block size 0 (presumably DataSupplier::Gzip; name line not present in this extract)
+ DataSupplier* inner)
+ return new DecompressDataReaderSupplier(inner, 0);
+ DataSupplier* // factory: returns a new StdioDataSupplier (presumably DataSupplier::StdioSupplier; name line not present in this extract)
+ return new StdioDataSupplier();
+// MemMap
+class MemMapDataSupplier : public DataSupplier // creates MemMapDataReaders and shares one FileMapper per file name among them
+ MemMapDataSupplier();
+ virtual ~MemMapDataSupplier();
+ virtual DataReader* getDataReader(int bufferCount, _int64 overflowBytes, double extraFactor, size_t bufferSpace);
+ FileMapper* getMapper(const char* fileName); // returns the shared mapper for fileName and increments its refcount
+ void releaseMapper(const char* fileName); // decrements the refcount; deletes the mapper when it reaches 0
+ ExclusiveLock lock; // taken by getMapper and releaseMapper around the two maps below
+ map<string,FileMapper*> mappers; // file name -> mapper (NULL after the last release)
+ map<string,int> refcounts; // file name -> number of outstanding getMapper calls
+class MemMapDataReader : public DataReader // DataReader that serves batches out of a memory-mapped region of the file
+ MemMapDataReader(MemMapDataSupplier* i_supplier, int i_batchCount, _int64 i_batchSize, _int64 i_overflowBytes, _int64 i_batchExtra);
+ virtual ~MemMapDataReader();
+ virtual bool init(const char* fileName);
+ virtual char* readHeader(_int64* io_headerSize);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ virtual bool getData(char** o_buffer, _int64* o_validBytes, _int64* o_startBytes = NULL);
+ virtual void advance(_int64 bytes);
+ virtual void nextBatch();
+ virtual bool isEOF();
+ virtual DataBatch getBatch();
+ virtual void holdBatch(DataBatch batch);
+ virtual bool releaseBatch(DataBatch batch);
+ virtual _int64 getFileOffset();
+ virtual void getExtra(char** o_extra, _int64* o_length);
+ virtual const char* getFilename()
+ { return fileName; }
+ void acquireLock() // takes lock unless there is exactly one batch
+ {
+ if (batchCount != 1) {
+ AcquireExclusiveLock(&lock);
+ }
+ }
+ void releaseLock() // counterpart of acquireLock; also a no-op when batchCount is 1
+ {
+ if (batchCount != 1) {
+ ReleaseExclusiveLock(&lock);
+ }
+ }
+ const int batchCount; // number of batches
+ const _int64 batchSizeParam; // bytes per batch, 0 for entire file
+ _int64 batchSize; // actual batch size for this file
+ const _int64 overflowBytes;
+ const _int64 batchExtra; // extra bytes per batch
+ const char* fileName; // current file name for diagnostics
+ _int64 fileSize; // total size of current file
+ char* currentMap; // currently mapped block if non-NULL
+ _int64 currentMapOffset; // current file offset of mapped region
+ _int64 currentMapStartSize; // start size of mapped region (not incl overflow)
+ _int64 currentMapSize; // total valid size of mapped region (incl overflow)
+ void* currentMappedBase; // mapped base for unmap
+ char* extra; // extra data buffer
+ int extraUsed; // number of extra data buffers in use
+ DataBatch* extraBatches; // non-zero for each extra buffer that is in use
+ int* extraHolds; // keeps hold count for each extra buffer
+ int currentExtraIndex; // index of extra block for current batch
+ _int64 offset; // into current batch
+ _uint32 currentBatch; // current batch number starting at 1
+ _int64 startBytes; // in current batch
+ _int64 validBytes; // in current batch
+ MemMapDataSupplier* supplier; // owner of the shared FileMapper objects
+ FileMapper* mapper; // mapper for the current file, obtained from supplier in init
+ SingleWaiterObject waiter; // flow control
+ ExclusiveLock lock; // lock around flow control members (currentBatch, extraUsed, etc.)
+// Sets up batch bookkeeping. When i_batchExtra > 0, allocates one extra-data area per batch plus the per-batch ID and hold-count arrays.
+MemMapDataReader::MemMapDataReader(MemMapDataSupplier* i_supplier, int i_batchCount, _int64 i_batchSize, _int64 i_overflowBytes, _int64 i_batchExtra)
+ : DataReader(),
+ batchCount(i_batchCount),
+ batchSizeParam(i_batchSize),
+ overflowBytes(i_overflowBytes),
+ batchExtra(i_batchExtra),
+ currentBatch(1),
+ extraUsed(0),
+ currentMap(NULL),
+ currentMapOffset(0),
+ currentMapSize(0),
+ currentExtraIndex(0),
+ supplier(i_supplier),
+ mapper(NULL)
+ _ASSERT(batchCount > 0 && batchSizeParam >= 0 && batchExtra >= 0);
+ if (batchExtra > 0) {
+ extra = (char*) BigAlloc(batchCount * batchExtra); // all extra areas live in one allocation
+ extraBatches = new DataBatch[batchCount];
+ memset(extraBatches, 0, batchCount * sizeof(DataBatch));
+ extraHolds = new int[batchCount];
+ memset(extraHolds, 0, batchCount * sizeof(int));
+ } else {
+ extra = NULL;
+ extraBatches = NULL;
+ extraHolds = NULL; // keep all three pointers in a defined state when no extra data is requested
+ }
+ if (! (CreateSingleWaiterObject(&waiter) && InitializeExclusiveLock(&lock))) {
+ WriteErrorMessage("MemMapDataReader: CreateSingleWaiterObject failed\n");
+ soft_exit(1);
+ }
+ // destructor body: frees the extra-data buffers, the synchronization objects, and the mapping
+ if (extra != NULL) {
+ BigDealloc(extra);
+ extra = NULL;
+ }
+ if (extraBatches != NULL) {
+ delete [] extraBatches;
+ // extraHolds is allocated by the constructor together with extraBatches, so free it under the same guard
+ delete [] extraHolds;
+ }
+ DestroyExclusiveLock(&lock);
+ DestroySingleWaiterObject(&waiter);
+ if (mapper != NULL) {
+ if (currentMap != NULL) {
+ mapper->unmap(currentMappedBase);
+ }
+ supplier->releaseMapper(fileName); // drop this reader's reference on the shared mapper
+ }
+ bool // binds the reader to a file; returns false when the supplier gives no mapper (matches init(const char*) in the class declaration)
+ const char* i_fileName)
+ if (mapper != NULL) { // re-initialisation: give back the reference on the previous file's mapper
+ supplier->releaseMapper(fileName);
+ }
+ mapper = supplier->getMapper(i_fileName);
+ if (mapper == NULL) {
+ return false;
+ }
+ fileName = i_fileName; // the pointer is stored, not copied
+ fileSize = mapper->getFileSize();
+ batchSize = batchSizeParam == 0 ? fileSize : batchSizeParam; // 0 means the whole file is a single batch
+ return true;
+ char* // maps the start of the file and returns a pointer to it (matches readHeader in the class declaration)
+ _int64* io_headerSize)
+ *io_headerSize = min(*io_headerSize, fileSize); // clamp the request to the file size and report the clamped value
+ reinit(0, *io_headerSize);
+ return currentMap;
+ void // remaps the requested file range and restarts batching at batch 1 (matches reinit in the class declaration)
+ _int64 i_startingOffset,
+ _int64 amountOfFileToProcess)
+ _ASSERT(i_startingOffset >= 0 && amountOfFileToProcess >= 0);
+ if (currentMap != NULL) {
+ mapper->unmap(currentMappedBase);
+ }
+ // an amount of 0 means "to end of file"; otherwise clamp the amount to what remains
+ _int64 startSize = amountOfFileToProcess == 0 ? fileSize - i_startingOffset
+ : max((_int64) 0, min(fileSize - i_startingOffset, amountOfFileToProcess));
+ // map the overflow bytes past the start range too, where the file has them
+ amountOfFileToProcess = max((_int64)0, min(startSize + overflowBytes, fileSize - i_startingOffset));
+ currentMap = mapper->createMapping(i_startingOffset, amountOfFileToProcess, &currentMappedBase);
+ if (currentMap == NULL) {
+ WriteErrorMessage("MemMapDataReader: fail to map %s at %lld,%lld\n", fileName, i_startingOffset, amountOfFileToProcess);
+ soft_exit(1);
+ }
+ acquireLock();
+ currentMapOffset = i_startingOffset;
+ currentMapStartSize = startSize;
+ currentMapSize = amountOfFileToProcess;
+ offset = 0;
+ // reset the batch number first: the sizes below are derived from it
+ currentBatch = 1;
+ startBytes = min(batchSize, currentMapStartSize - (currentBatch - 1) * batchSize);
+ validBytes = min(batchSize + overflowBytes, currentMapSize - (currentBatch - 1) * batchSize);
+ extraUsed = 1;
+ currentExtraIndex = 0;
+ if (extraBatches != NULL) {
+ memset(extraBatches, 0, sizeof(DataBatch) * batchCount);
+ extraBatches[currentExtraIndex] = currentBatch;
+ memset(extraHolds, 0, sizeof(int) * batchCount);
+ }
+ releaseLock();
+ if (batchCount != 1) {
+ SignalSingleWaiterObject(&waiter);
+ }
+ bool // exposes the unread part of the current batch (matches getData in the class declaration)
+ char** o_buffer,
+ _int64* o_validBytes,
+ _int64* o_startBytes)
+ if (offset >= startBytes) { // everything this batch owns is consumed; outputs are left untouched
+ return false;
+ }
+ *o_buffer = currentMap + (currentBatch - 1) * batchSize + offset; // batches are laid out back to back in the mapping
+ *o_validBytes = validBytes - offset;
+ if (o_startBytes) {
+ *o_startBytes = max((_int64)0, startBytes - offset);
+ }
+ return *o_validBytes > 0;
+ void // moves the read offset forward within the current batch (matches advance in the class declaration)
+ _int64 bytes)
+ _ASSERT(bytes >= 0);
+ offset = min(offset + max((_int64)0, bytes), validBytes); // negative counts are treated as 0; offset never passes validBytes
+ void // moves to the next batch, waiting for a free extra buffer when all are in use (the debug trace below names this nextBatch)
+ if (isEOF()) {
+ return;
+ }
+ while (true) {
+ acquireLock();
+ if (extraBatches == NULL || extraUsed < batchCount) { // proceed when no extra buffers exist or at least one is free
+ currentBatch++;
+ if (extraBatches != NULL) {
+ bool found = false;
+ for (int i = 0; i < batchCount; i++) {
+ if (extraBatches[i].batchID == 0) { // batchID 0 marks a free slot
+ extraBatches[i].batchID = currentBatch;
+ currentExtraIndex = i;
+ found = true;
+ break;
+ }
+ }
+ _ASSERT(found);
+ extraUsed++;
+ //fprintf(stderr, "MemMap nextBatch %d:%d = index %d used %d of %d\n", 0, currentBatch, currentExtraIndex, extraUsed, batchCount);
+ if (extraUsed == batchCount) { // last free slot taken: the next call must wait for a release
+ ResetSingleWaiterObject(&waiter);
+ }
+ }
+ releaseLock();
+ offset = max(offset, startBytes) - startBytes; // bytes consumed past the old batch's start range carry over into the new batch
+ startBytes = min(batchSize, currentMapStartSize - (currentBatch - 1) * batchSize);
+ validBytes = min(batchSize + overflowBytes, currentMapSize - (currentBatch - 1) * batchSize);
+ _ASSERT(validBytes >= 0);
+ return;
+ }
+ releaseLock(); // no free slot: drop the lock, wait for the waiter to be signalled, then retry
+ WaitForSingleWaiterObject(&waiter);
+ }
+ bool // true when the current batch reaches or passes the end of the mapped region (matches isEOF in the class declaration)
+ return currentBatch * batchSize >= currentMapSize;
+ DataBatch // identifier of the current batch; fileID takes the DataBatch constructor default of 0
+ return DataBatch(currentBatch);
+ void // increments the hold count of the extra buffer assigned to batch (matches holdBatch in the class declaration)
+ DataBatch batch)
+ if (extraBatches == NULL) { // no extra buffers, so nothing to track
+ return;
+ }
+ acquireLock();
+ for (int i = 0; i < batchCount; i++) {
+ if (extraBatches[i] == batch) {
+ extraHolds[i]++;
+ break;
+ }
+ }
+ releaseLock(); // a batch that is not found is silently ignored
+ bool // drops one hold on batch; returns false only while other holds remain (the debug trace below names this releaseBatch)
+ DataBatch batch)
+ if (extraBatches == NULL) { // no extra buffers, so nothing to track
+ return true;
+ }
+ bool result = true;
+ acquireLock();
+ for (int i = 0; i < batchCount; i++) {
+ if (extraBatches[i] == batch) {
+ if (extraHolds[i] > 0) {
+ extraHolds[i]--;
+ }
+ if (extraHolds[i] == 0) {
+ extraBatches[i].batchID = 0; // mark the slot free for nextBatch
+ _ASSERT(extraUsed > 0);
+ extraUsed--;
+ //fprintf(stderr,"MemMap: releaseBatch %d:%d = index %d now using %d of %d\n", batch.fileID, batch.batchID, i, extraUsed, batchCount);
+ if (extraUsed == batchCount - 1) { // went from full to one free slot: wake a waiting nextBatch
+ SignalSingleWaiterObject(&waiter);
+ }
+ // the lock is released once, after the loop; releasing here as well unlocked it twice
+ } else {
+ result = false;
+ }
+ break;
+ }
+ }
+ releaseLock();
+ return result;
+ _int64 // file offset of the read position: mapping offset + start of the current batch + offset within the batch
+ return currentMapOffset + (currentBatch - 1) * batchSize + offset;
+ void // returns the extra-data area assigned to the current batch (matches getExtra in the class declaration)
+ char** o_extra,
+ _int64* o_length)
+ if (extra == NULL) { // reader was created without extra data
+ *o_extra = NULL;
+ *o_length = 0;
+ } else {
+ *o_extra = extra + currentExtraIndex * batchExtra; // the areas are consecutive slices of one allocation
+ *o_length = batchExtra;
+ }
+MemMapDataSupplier::MemMapDataSupplier() : DataSupplier() // constructor: only initialises the lock
+ InitializeExclusiveLock(&lock);
+ DestroyExclusiveLock(&lock); // presumably the body of ~MemMapDataSupplier (its name line is not present in this extract)
+ DataReader* // creates a MemMapDataReader: whole file as one batch when extraFactor is 0, otherwise fixed-size batches
+ int bufferCount,
+ _int64 overflowBytes,
+ double extraFactor,
+ size_t bufferSpace /*not relevant*/)
+ _ASSERT(extraFactor >= 0 && overflowBytes >= 0);
+ if (extraFactor == 0) {
+ // no per-batch expansion factor, so can read entire file as a batch
+ return new MemMapDataReader(this, 1, 0, overflowBytes, 0);
+ } else {
+ // break up into 4Mb batches
+ _int64 batch = 4 * 1024 * 1024;
+ _int64 extra = (_int64)(batch * extraFactor); // extra bytes per batch scale with the batch size
+ return new MemMapDataReader(this, bufferCount, batch, overflowBytes, extra);
+ }
+ FileMapper* // returns the shared mapper for fileName, creating it on first use, and counts the reference
+ const char* fileName)
+ AcquireExclusiveLock(&lock);
+ FileMapper* result = mappers[fileName]; // operator[] inserts a NULL entry for an unknown name
+ if (result == NULL) {
+ result = new FileMapper();
+ result->init(fileName); // NOTE(review): the result of init is not checked here - confirm whether it can report failure
+ mappers[fileName] = result;
+ }
+ ++refcounts[fileName];
+ ReleaseExclusiveLock(&lock);
+ return result;
+ void // drops one reference on fileName's mapper and deletes the mapper when the count reaches 0
+ const char* fileName)
+ AcquireExclusiveLock(&lock);
+ int n = refcounts[fileName];
+ if (n > 0 && 0 == --refcounts[fileName]) { // a count that is already 0 is left alone
+ delete mappers[fileName];
+ mappers[fileName] = NULL; // the map entry stays, so getMapper will create a fresh mapper next time
+ }
+ ReleaseExclusiveLock(&lock);
+// BatchTracker
+BatchTracker::BatchTracker(int i_capacity) // the capacity is forwarded to the pending map
+ : pending(i_capacity)
+ bool // adds one reference to batch; returns true when this is its first reference (matches holdBatch in the BatchTracker declaration)
+ DataBatch batch)
+ DataBatch::Key key = batch.asKey();
+ unsigned* p = pending.tryFind(key);
+ int n = 1; // only referenced by the commented-out diagnostics below
+ if (p != NULL) {
+ n = ++(*p);
+ } else {
+ pending.put(key, 1);
+ }
+ //_ASSERT(pending.tryFind(DataBatch(batch.batchID, 1^batch.fileID).asKey) != p);
+ //unsigned* q = pending.tryFind(key); _ASSERT(q && (p == NULL || p == q) && *q == n);
+ //fprintf(stderr, "thread %d tracker %lx addRead %u:%u = %d\n", GetCurrentThreadId(), this, batch.fileID, batch.batchID, n);
+ return p == NULL;
+ bool // removes one reference from a batch; returns true when none remain (matches releaseBatch in the BatchTracker declaration)
+ DataBatch removed)
+ DataBatch::Key key = removed.asKey();
+ unsigned* p = pending.tryFind(key);
+ //fprintf(stderr, "thread %d tracker %lx removeRead %u:%u = %d\n", GetCurrentThreadId(), this, removed.fileID, removed.batchID, p ? *p - 1 : -1);
+ _ASSERT(p != NULL && *p > 0);
+ if (p != NULL) {
+ if (*p > 1) { // other references remain: store the decremented count
+ pending.put(key, *p - 1);
+ //unsigned* q = pending.tryFind(key); _ASSERT(q == p);
+ //_ASSERT(pending.tryFind(DataBatch(removed.batchID, 1^removed.fileID).asKey) != p);
+ return false;
+ }
+ pending.erase(key); // last reference: forget the batch
+ }
+ return true; // also reached when the batch was never tracked
+// public static suppliers
+DataSupplier* DataSupplier::MemMap = new MemMapDataSupplier();
+// Platform default: overlapped I/O when built with MSVC, memory mapping otherwise.
+#ifdef _MSC_VER
+DataSupplier* DataSupplier::Default = DataSupplier::WindowsOverlapped;
+#else
+DataSupplier* DataSupplier::Default = DataSupplier::MemMap;
+#endif
+// Decompressing suppliers layered over the platform default and over stdio.
+DataSupplier* DataSupplier::GzipDefault = DataSupplier::Gzip(DataSupplier::Default);
+DataSupplier* DataSupplier::GzipBamDefault = DataSupplier::GzipBam(DataSupplier::Default);
+DataSupplier* DataSupplier::Stdio = DataSupplier::StdioSupplier();
+DataSupplier* DataSupplier::GzipStdio = DataSupplier::Gzip(DataSupplier::Stdio);
+DataSupplier* DataSupplier::GzipBamStdio = DataSupplier::GzipBam(DataSupplier::Stdio);
+int DataSupplier::ThreadCount = 1;
+double DataSupplier::ExpansionFactor = 1.0;
+volatile _int64 DataReader::ReadWaitTime = 0;
+volatile _int64 DataReader::ReleaseWaitTime = 0;
diff --git a/SNAPLib/DataReader.h b/SNAPLib/DataReader.h
new file mode 100644
index 0000000..15eeef1
--- /dev/null
+++ b/SNAPLib/DataReader.h
@@ -0,0 +1,210 @@
+Module Name:
+ DataReader.h
+ Headers for the DataReader & related classes for the SNAP sequencer
+ Ravi Pandya, Jan 2013
+ User mode service.
+Revision History:
+#pragma once
+#include "Compat.h"
+#include "VariableSizeMap.h"
+// This defines a family of composable classes for efficiently reading data with flow control.
+// DataReader
+// Reads data from one or more files either sequentially, in ranges, or memory-mapped.
+// A DataReader should be accessed by only one thread at a time,
+// except for holdBatch()/releaseBatch(), which may be called from any thread.
+// Divides data into sequential batches each of which is identified by a file ID and batch ID.
+// Data in a batch will remain stable until it is released by the consumer.
+// Consumers should release batches as soon as possible to make buffers free for read-ahead.
+// Batches may include extra data for higher layers that also remains stable.
+// Extra data size is defined as a factor of the underlying data size, and/or a fixed number of bytes.
+// DataSupplier
+// A factory for DataReaders, which may be called from multiple threads.
+// Identifies one batch of data by file ID and batch ID; ordering compares fileID first, then batchID.
+struct DataBatch
+ _uint32 fileID;
+ _uint32 batchID;
+ inline DataBatch() : fileID(0), batchID(0) {}
+ // note the argument order: batch ID first, file ID second (defaulting to 0)
+ inline DataBatch(_uint32 i_batchID, _uint32 i_fileID = 0) : fileID(i_fileID), batchID(i_batchID) {}
+ inline DataBatch(const DataBatch& o) : fileID(o.fileID), batchID(o.batchID) {}
+ static bool comparator(const DataBatch& a, const DataBatch& b)
+ { return a.fileID < b.fileID || (a.fileID == b.fileID && a.batchID < b.batchID); }
+ inline bool operator<=(const DataBatch& b) const
+ { return fileID < b.fileID || (fileID == b.fileID && batchID <= b.batchID); }
+ inline bool operator<(const DataBatch& b) const
+ { return fileID < b.fileID || (fileID == b.fileID && batchID < b.batchID); }
+ inline bool operator==(const DataBatch& b) const
+ { return fileID == b.fileID && batchID == b.batchID; }
+ inline bool operator!=(const DataBatch& b) const
+ { return batchID != b.batchID || fileID != b.fileID; }
+ inline DataBatch Min(const DataBatch& b) const
+ { return *this <= b ? *this : b; }
+ inline bool isZero() const
+ { return fileID == 0 && batchID == 0; }
+ // convert to _int64 for use as a hashtable key
+ typedef _int64 Key;
+ // fileID goes in the high 32 bits and batchID in the low 32 bits; const so it can be called on const batches
+ inline Key asKey() const
+ { return (((_int64) fileID) << 32) + (_int64) batchID; }
+ // inverse of asKey()
+ inline DataBatch(Key key) : fileID((_uint32) (key >> 32)), batchID((_uint32) key) {}
+// read data from a file or other source
+// should all be called from a single thread, except for hold/releaseBatch which are thread-safe
+class DataReader // abstract interface: every operation declared below is pure virtual
+ DataReader() {}
+ virtual ~DataReader() {}
+ // initialize to use a specific filename
+ virtual bool init(const char* fileName) = 0;
+ // read bytes from the beginning of the file for the header
+ virtual char* readHeader(_int64* io_headerSize) = 0;
+ // seek to a particular range in the file
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) = 0;
+ // get all remaining data in current batch
+ // return false if no more data in current batch
+ // startBytes is data "owned" by this block in which reads may start
+ // validBytes may also include overflow bytes to handle records spanning batches
+ // if you advance() past startBytes, nextBatch() will start offset at that point
+ virtual bool getData(char** o_buffer, _int64* o_validBytes, _int64* o_startBytes = NULL) = 0;
+ // advance through data in current batch, reducing results from next getData call
+ virtual void advance(_int64 bytes) = 0;
+ // advance to next batch
+ // by default automatically releases previous batch
+ virtual void nextBatch() = 0;
+ // whether current batch is last in file
+ virtual bool isEOF() = 0;
+ // get current batch identifier
+ virtual DataBatch getBatch() = 0;
+ // hold buffers associated with this batch for reuse, increments refcount
+ // NOTE: this may be called from another thread,
+ // so anything it touches must be thread-safe!
+ virtual void holdBatch(DataBatch batch) = 0;
+ // release buffers associated with this batch for reuse
+ // decrements refcount, returns true if last release
+ // NOTE: this may be called from another thread,
+ // so anything it touches must be thread-safe!
+ virtual bool releaseBatch(DataBatch batch) = 0;
+ // get current offset into file
+ virtual _int64 getFileOffset() = 0;
+ // get pointer to extra data area for current batch
+ // todo: allow this to grow dynamically while keeping stable pointers to previous data
+ virtual void getExtra(char** o_extra, _int64* o_length) = 0;
+ // get filename for debugging / error printing
+ virtual const char* getFilename() = 0;
+ // timing for performance tuning (in nanos)
+ static volatile _int64 ReadWaitTime; // shared by all readers (static)
+ static volatile _int64 ReleaseWaitTime; // shared by all readers (static)
+class DataSupplier // abstract factory for DataReaders, plus the statically shared supplier instances
+ DataSupplier() {}
+ virtual ~DataSupplier() {}
+ virtual DataReader* getDataReader(int bufferCount, _int64 overflowBytes, double extraFactor, size_t bufferSpace) = 0;
+ //
+ // creating specific factories
+ //
+ //
+ static DataSupplier* GzipBam(DataSupplier* inner); // wraps inner in a decompressing supplier
+ static DataSupplier* Gzip(DataSupplier* inner); // wraps inner in a decompressing supplier
+ static DataSupplier* StdioSupplier();
+ // memmap works on both platforms (but better on Linux)
+ static DataSupplier* MemMap;
+#ifdef _MSC_VER
+ // overlapped is only on Windows
+ static DataSupplier* WindowsOverlapped;
+ // default raw data supplier for platform
+ static DataSupplier* Default;
+ static DataSupplier* GzipDefault; // Gzip(Default)
+ static DataSupplier* GzipBamDefault; // GzipBam(Default)
+ static DataSupplier* GzipStdio; // Gzip(Stdio)
+ static DataSupplier* Stdio;
+ static DataSupplier* GzipBamStdio; // GzipBam(Stdio)
+ // hack: must be set to communicate thread count into suppliers
+ static int ThreadCount;
+ // hack: global for additional expansion factor
+ static double ExpansionFactor;
+// manages lifetime tracking for batches of reads
+class BatchTracker // reference counts per batch, keyed by DataBatch::Key
+ BatchTracker(int i_capacity);
+ // reference was added from a batch, increment reference count
+ // return true if first hold
+ bool holdBatch(DataBatch batch);
+ // reference was removed from a batch
+ // returns true if the batch has no more references
+ bool releaseBatch(DataBatch batch);
+ typedef VariableSizeMap<DataBatch::Key,unsigned> BatchMap;
+ BatchMap pending; // batch key -> number of outstanding references
diff --git a/SNAPLib/DataWriter.cpp b/SNAPLib/DataWriter.cpp
new file mode 100644
index 0000000..9c32ef7
--- /dev/null
+++ b/SNAPLib/DataWriter.cpp
@@ -0,0 +1,876 @@
+Module Name:
+ DataWriter.cpp
+ General file writer.
+ User mode service.
+ Not thread safe.
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "DataWriter.h"
+#include "ParallelTask.h"
+#include "exit.h"
+#include "Bam.h"
+#include "Error.h"
+using std::min;
+using std::max;
+// DataWriterSupplier that hands out AsyncDataWriter instances sharing one AsyncFile,
+// and that owns the shared physical/logical file offsets those writers reserve space from.
+class AsyncDataWriterSupplier : public DataWriterSupplier
+ AsyncDataWriterSupplier(const char* i_filename, DataWriter::FilterSupplier* i_filterSupplier,
+ FileEncoder* i_encoder, int i_bufferCount, size_t i_bufferSize);
+ // creates a new AsyncDataWriter on the shared file
+ virtual DataWriter* getWriter();
+ // notifies the filter supplier, closes the file, destroys the lock
+ virtual void close();
+ friend class AsyncDataWriter;
+ friend class FileEncoder;
+ // under the lock: return the current shared offsets in o_physical/o_logical,
+ // then bump them by `physical` and `logical` bytes respectively
+ void advance(size_t physical, size_t logical, size_t* o_physical, size_t* o_logical);
+ const char* filename;
+ AsyncFile* file;
+ DataWriter::FilterSupplier* filterSupplier;
+ FileEncoder* encoder;
+ const int bufferCount;
+ const size_t bufferSize;
+ // guards sharedOffset and sharedLogical inside advance()
+ ExclusiveLock lock;
+ size_t sharedOffset;
+ size_t sharedLogical;
+ // set by close(); writers created afterwards get no filter and no encoder
+ bool closing;
+// DataWriter that cycles through `count` fixed-size buffers ("batches"), each with its own
+// AsyncFile::Writer, so one buffer can be filled while earlier ones are being written
+// (and optionally encoded by a FileEncoder first).
+class AsyncDataWriter : public DataWriter
+ AsyncDataWriter(AsyncFile* i_file, AsyncDataWriterSupplier* i_supplier,
+ int i_count, size_t i_bufferSize, Filter* i_filter, FileEncoder* i_encoder);
+ // releases the per-batch writers, the shared buffer block, the encoder and the lock;
+ // the per-batch `encoded` events are destroyed in close(), not here
+ virtual ~AsyncDataWriter()
+ {
+ for (int i = 0; i < count; i++) {
+ delete batches[i].file;
+ }
+ BigDealloc(batches[0].buffer); // all in one big block
+ delete [] batches;
+ if (encoder != NULL) {
+ delete encoder;
+ }
+ DestroyExclusiveLock(&lock);
+ }
+ virtual bool getBuffer(char** o_buffer, size_t* o_size);
+ virtual void advance(GenomeDistance bytes, GenomeLocation location = 0);
+ virtual bool getBatch(int relative, char** o_buffer, size_t* o_size, size_t* o_used, size_t* o_offset, size_t* o_logicalUsed = 0, size_t* o_logicalOffset = NULL);
+ virtual bool nextBatch();
+ virtual void close();
+ // the lock is only taken when an encoder is attached; without one these are no-ops
+ void acquireLock()
+ { if (encoder != NULL) { AcquireExclusiveLock(&lock); } }
+ void releaseLock()
+ { if (encoder != NULL) { ReleaseExclusiveLock(&lock); } }
+ // one write buffer and its bookkeeping
+ struct Batch
+ {
+ char* buffer; // start of this batch's slice of the shared block
+ AsyncFile::Writer* file; // writer used to flush this batch
+ size_t used; // bytes currently in buffer (physical size to write)
+ size_t fileOffset; // physical offset in the output file
+ size_t logicalUsed; // size before encoding/transformation
+ size_t logicalOffset; // offset in the logical (unencoded) stream
+ EventObject encoded; // only created when an encoder is attached
+ };
+ Batch* batches;
+ const int count;
+ const size_t bufferSize;
+ AsyncDataWriterSupplier* supplier;
+ // index of the batch currently being filled
+ int current;
+ FileEncoder* encoder;
+ ExclusiveLock lock;
+ friend class FileEncoder;
+ // NOTE(review): presumably the FileEncoder constructor (signature line missing from this listing).
+ // With numThreads == 0 no coworker is created and coworker stays NULL.
+ int numThreads,
+ bool bindToProcessors,
+ ParallelWorkerManager* i_manager)
+ :
+ encoderRunning(false),
+ coworker(numThreads == 0 ? NULL
+ : new ParallelCoworker(numThreads, bindToProcessors, i_manager, FileEncoder::outputReadyCallback, this))
+ // NOTE(review): presumably FileEncoder::initialize (signature line missing from this listing).
+ // Binds the encoder to its writer, shares the writer's lock, and starts the coworker if any.
+ void
+ AsyncDataWriter* i_writer)
+ writer = i_writer;
+ lock = &writer->lock;
+ // start one slot behind batch 0: checkForInput() looks at (encoderBatch + 1) % count first
+ encoderBatch = writer->count - 1;
+ if (coworker != NULL) {
+ coworker->getManager()->initialize(this);
+ coworker->start();
+ }
+ // NOTE(review): presumably FileEncoder::inputReady (signature line missing from this listing).
+ // Under the lock, kicks off a scan for work unless an encode is already in flight.
+ void
+ AcquireExclusiveLock(lock);
+ if (! encoderRunning) {
+ checkForInput();
+ }
+ ReleaseExclusiveLock(lock);
+ // NOTE(review): presumably FileEncoder::close (signature line missing from this listing).
+ // Waits for every batch between the encoder position and the writer's current batch
+ // to be signalled as encoded, then stops the coworker.
+ void
+ // wait for pending encodes
+ AcquireExclusiveLock(lock);
+ int start = encoderBatch;
+ // number of batches from `start` up to (not including) writer->current, modulo count
+ int pending = (writer->current + writer->count - start) % writer->count;
+ ReleaseExclusiveLock(lock);
+ for (int i = 0; i < pending; i++) {
+ WaitForEvent(&writer->batches[(start + i) % writer->count].encoded);
+ }
+ // NOTE(review): coworker is dereferenced without the NULL check used in initialize(); confirm it cannot be NULL here
+ coworker->stop();
+ // NOTE(review): presumably FileEncoder::outputReadyCallback (signature line missing from this listing).
+ // Static trampoline: p is the FileEncoder passed as `this` to the ParallelCoworker constructor.
+ void
+ void *p)
+ ((FileEncoder*) p)->outputReady();
+ // NOTE(review): presumably FileEncoder::outputReady (signature line missing from this listing).
+ // Runs after a batch has been encoded: reserves its physical file range, starts the
+ // asynchronous write, signals the batch as encoded, then looks for the next batch to encode.
+ void
+ AcquireExclusiveLock(lock);
+ encoderRunning = false;
+ // begin writing the buffer to disk
+ AsyncDataWriter::Batch* write = &writer->batches[encoderBatch];
+ // logical advance is 0 here; the logical size is reserved by the writer in nextBatch()
+ writer->supplier->advance(write->used, 0, &write->fileOffset, &write->logicalOffset);
+ //fprintf(stderr, "outputReady write batch %d @%lld:%lld\n", encoderBatch, write->fileOffset, write->used);
+ if (! write->file->beginWrite(write->buffer, write->used, write->fileOffset, NULL)) {
+ WriteErrorMessage("error: file write %lld bytes at offset %lld failed\n", write->used, write->fileOffset);
+ soft_exit(1);
+ }
+ AllowEventWaitersToProceed(&write->encoded);
+ // check for more work
+ checkForInput();
+ ReleaseExclusiveLock(lock);
+ // NOTE(review): presumably FileEncoder::checkForInput (signature line missing from this listing).
+ // Advances encoderBatch towards the writer's current batch, skipping empty batches,
+ // and starts the coworker on the first non-empty one. The caller must hold the lock.
+ void
+ // look for another block ready to encode
+ while (true) {
+ int nextBatch = (encoderBatch + 1) % writer->count;
+ if (nextBatch == writer->current) {
+ // caught up with the batch the writer is still filling
+ break;
+ }
+ encoderBatch = nextBatch;
+ AsyncDataWriter::Batch* encode = &writer->batches[encoderBatch];
+ if (encode->used > 0) {
+ encoderRunning = true;
+ coworker->step();
+ break;
+ }
+ }
+ // NOTE(review): presumably FileEncoder::setupEncode (signature line missing from this listing).
+ // Points encoderBatch at the batch `relative` slots from the writer's current one, wrapped modulo count.
+ void
+ int relative)
+ encoderBatch = (writer->current + relative + writer->count) % writer->count;
+ // NOTE(review): presumably FileEncoder::getEncodeBatch (signature line missing from this listing).
+ // Returns the buffer, capacity and used byte count of the batch at encoderBatch.
+ void
+ char** o_batch,
+ size_t* o_batchSize,
+ size_t* o_batchUsed)
+ AsyncDataWriter::Batch* batch = &writer->batches[encoderBatch];
+ *o_batch = batch->buffer;
+ *o_batchSize = writer->bufferSize;
+ *o_batchUsed = batch->used;
+ //fprintf(stderr, "getEncodeBatch #%d: %lld/%lld\n", encoderBatch, batch->used, writer->bufferSize);
+ // NOTE(review): presumably FileEncoder::getOffsets (signature line missing from this listing).
+ // Reports the logical offset stored in the batch and the supplier's current shared physical offset.
+ // sharedOffset is read here without taking the supplier's lock.
+ void
+ size_t* o_logicalOffset,
+ size_t* o_physicalOffset)
+ // logical has already been set correctly in batch
+ *o_logicalOffset = writer->batches[encoderBatch].logicalOffset;
+ // physical is not yet updated, use shared
+ *o_physicalOffset = writer->supplier->sharedOffset;
+ // NOTE(review): presumably FileEncoder::setEncodedBatchSize (signature line missing from this listing).
+ // Records the post-encoding size of the batch at encoderBatch; the previous `used`
+ // value is preserved as logicalUsed. Nothing is touched when the size is unchanged.
+ void
+ size_t newSize)
+ size_t old = writer->batches[encoderBatch].used;
+//fprintf(stderr, "setEncodedBatchSize #%d %lld -> %lld\n", encoderBatch, old, newSize);
+ if (newSize != old) {
+ AcquireExclusiveLock(lock);
+ AsyncDataWriter::Batch* batch = &writer->batches[encoderBatch];
+ batch->logicalUsed = batch->used;
+ batch->used = newSize;
+ ReleaseExclusiveLock(lock);
+ }
+ // NOTE(review): presumably the AsyncDataWriter constructor (signature line missing from this listing).
+ // Allocates one block of count * bufferSize bytes, slices it into `count` batches,
+ // gives each batch its own file writer, and attaches the encoder if one was supplied.
+ AsyncFile* i_file,
+ AsyncDataWriterSupplier* i_supplier,
+ int i_count,
+ size_t i_bufferSize,
+ Filter* i_filter,
+ FileEncoder* i_encoder)
+ :
+ DataWriter(i_filter),
+ encoder(i_encoder),
+ supplier(i_supplier),
+ count(i_count),
+ bufferSize(i_bufferSize),
+ current(0)
+ _ASSERT(count >= 2);
+ char* block = (char*) BigAlloc(count * bufferSize);
+ if (block == NULL) {
+ WriteErrorMessage("Unable to allocate %lld bytes for write buffers\n", count * bufferSize);
+ soft_exit(1);
+ }
+ batches = new Batch[count];
+ for (int i = 0; i < count; i++) {
+ // batch i owns the i-th bufferSize-byte slice of the block
+ batches[i].buffer = block + i * bufferSize;
+ batches[i].file = i_file->getWriter();
+ batches[i].used = 0;
+ batches[i].fileOffset = 0;
+ batches[i].logicalUsed = 0;
+ batches[i].logicalOffset = 0;
+ if (encoder != NULL) {
+ CreateEventObject(&batches[i].encoded);
+ AllowEventWaitersToProceed(&batches[i].encoded); // initialize so empty bufs are available
+ }
+ }
+ InitializeExclusiveLock(&lock);
+ if (encoder != NULL) {
+ encoder->initialize(this);
+ }
+ // NOTE(review): presumably AsyncDataWriter::getBuffer (signature line missing from this listing).
+ // Returns the unused tail of the current batch: pointer past the used bytes and the space left.
+ // Always returns true.
+ bool
+ char** o_buffer,
+ size_t* o_size)
+ *o_buffer = batches[current].buffer + batches[current].used;
+ *o_size = bufferSize - batches[current].used;
+ return true;
+ // NOTE(review): presumably AsyncDataWriter::advance (signature line missing from this listing).
+ // Marks `bytes` more bytes of the current batch as used (clamped to bufferSize) and
+ // notifies the filter, if any, about the chunk that was just appended.
+ void
+ GenomeDistance bytes,
+ GenomeLocation location)
+ _ASSERT((size_t)bytes <= bufferSize - batches[current].used);
+ // start of the newly written chunk, captured before `used` is advanced
+ char* data = batches[current].buffer + batches[current].used;
+ size_t batchOffset = batches[current].used;
+ batches[current].used = min<long long>(bufferSize, batchOffset + bytes);
+ if (filter != NULL) {
+ //_int64 start = timeInNanos();
+ filter->onAdvance(this, batchOffset, data, bytes, location);
+ //InterlockedAdd64AndReturnNewValue(&FilterTime, timeInNanos() - start);
+ }
+ // NOTE(review): presumably AsyncDataWriter::getBatch (signature line missing from this listing).
+ // Looks up the batch `relative` slots from the current one. For relative <= 0 the stored
+ // sizes/offsets are reported; for relative > 0 they are reported as 0. For relative >= 0
+ // the call blocks until the batch has been encoded (if applicable) and its write completed.
+ // Returns false when `relative` is out of range.
+ bool
+ int relative,
+ char** o_buffer,
+ size_t* o_size,
+ size_t* o_used,
+ size_t* o_offset,
+ size_t* o_logicalUsed,
+ size_t* o_logicalOffset)
+ if (relative < 1 - count || relative > count - 1) {
+ return false;
+ }
+ // with an encoder, reject indices at or behind the encoder's batch
+ // (the encoder position is expressed here as a negative offset from current)
+ if (encoder != NULL && relative <= ((encoder->encoderBatch - current + count) % count) - count) {
+ return false;
+ }
+ int index = (current + relative + count) % count; // ensure non-negative
+ Batch* batch = &batches[index];
+ // o_buffer is the only output written unconditionally
+ *o_buffer = batch->buffer;
+ if (o_size != NULL) {
+ *o_size = bufferSize;
+ }
+ if (o_used != NULL) {
+ *o_used = relative <= 0 ? batch->used : 0;
+ }
+ if (o_offset != NULL) {
+ *o_offset = relative <= 0 ? batch->fileOffset : 0;
+ }
+ if (o_logicalUsed != NULL) {
+ *o_logicalUsed = relative <=0 ? batch->logicalUsed: 0;
+ }
+ if (o_logicalOffset != NULL) {
+ *o_logicalOffset = relative <=0 ? batch->logicalOffset : 0;
+ }
+ if (relative >= 0) {
+ if (encoder != NULL) {
+ WaitForEvent(&batch->encoded);
+ }
+ batch->file->waitForCompletion();
+ }
+ return true;
+ // NOTE(review): presumably AsyncDataWriter::nextBatch (signature line missing from this listing).
+ // Finishes the current batch and moves to the next one:
+ //  1. wait until the next slot is no longer being encoded,
+ //  2. reserve file space and run the filter's onNextBatch under the lock,
+ //  3. either start the disk write directly (no encoder) or hand the batch to the encoder,
+ //  4. wait for any outstanding write on the new current batch.
+ // Always returns true; write failures terminate via soft_exit(1).
+ bool
+ _int64 start = timeInNanos();
+ if (encoder != NULL) {
+ WaitForEvent(&batches[(current + 1) % count].encoded);
+ }
+ acquireLock();
+ int written = current;
+ Batch* write = &batches[written];
+ write->logicalUsed = write->used;
+ current = (current + 1) % count;
+ //fprintf(stderr, "nextBatch reset %d used=0\n", current);
+ batches[current].used = 0;
+ // newBuffer: the filter produces its output in the new current buffer
+ // newSize: the filter may change the number of bytes to write
+ bool newBuffer = filter != NULL && (filter->filterType == CopyFilter || filter->filterType == TransformFilter);
+ bool newSize = filter != NULL && (filter->filterType == TransformFilter || filter->filterType == ResizeFilter);
+ if (newSize) {
+ // advisory only
+ write->fileOffset = supplier->sharedOffset;
+ write->logicalOffset = supplier->sharedLogical;
+ } else {
+ // with an encoder the physical size is reserved later, in FileEncoder::outputReady
+ supplier->advance(encoder == NULL ? write->used : 0, write->logicalUsed, &write->fileOffset, &write->logicalOffset);
+ }
+ if (filter != NULL) {
+ size_t n = filter->onNextBatch(this, write->fileOffset, write->used);
+ if (newSize) {
+ // the final size is only known now, so reserve the real range
+ write->used = n;
+ supplier->advance(encoder == NULL ? write->used : 0, write->logicalUsed, &write->fileOffset, &write->logicalOffset);
+ }
+ if (newBuffer) {
+ // current has used>0, written has logicalUsed>0, for compressed & uncompressed data respectively
+ batches[current].used = write->used;
+ batches[current].fileOffset = write->fileOffset;
+ batches[current].logicalUsed = 0;
+ batches[current].logicalOffset = write->logicalOffset;
+ write->used = 0;
+ // the batch to flush is now the one holding the filter output; advance once more
+ written = current;
+ write = &batches[written];
+ current = (current + 1) % count;
+ batches[current].used = 0;
+ batches[current].logicalUsed = 0;
+ }
+ }
+ _int64 start2 = timeInNanos();
+ releaseLock();
+ // everything up to here (including the initial event wait) is accounted as filter time
+ InterlockedAdd64AndReturnNewValue(&FilterTime, start2 - start);
+ if (encoder == NULL) {
+ //fprintf(stderr, "nextBatch beginWrite #%d @%lld: %lld bytes\n", write-batches, write->fileOffset, write->used);
+ //_ASSERT(BgzfHeader::validate(write->buffer, write->used)); //!! remove before checkin
+ if (! write->file->beginWrite(write->buffer, write->used, write->fileOffset, NULL)) {
+ WriteErrorMessage("error: file write %lld bytes at offset %lld failed\n", write->used, write->fileOffset);
+ soft_exit(1);
+ }
+ } else {
+ // mark the batch as not yet encoded before waking the encoder
+ PreventEventWaitersFromProceeding(&write->encoded);
+ encoder->inputReady();
+ }
+ if (! batches[current].file->waitForCompletion()) {
+ WriteErrorMessage("error: file write failed\n");
+ soft_exit(1);
+ }
+ InterlockedAdd64AndReturnNewValue(&WaitTime, timeInNanos() - start2);
+ return true;
+ // NOTE(review): presumably AsyncDataWriter::close (signature line missing from this listing).
+ // Flushes the last batch, shuts down the encoder and its per-batch events, then closes every batch writer.
+ void
+ nextBatch(); // ensure last buffer gets written
+ if (encoder != NULL) {
+ encoder->close();
+ for (int i = 0; i < count; i++) {
+ DestroyEventObject(&batches[i].encoded);
+ }
+ }
+ for (int i = 0; i < count; i++) {
+ batches[i].file->close();
+ }
+ // NOTE(review): presumably the AsyncDataWriterSupplier constructor (signature line missing from this listing).
+ // Stores the configuration, opens the output file for writing (exiting on failure) and creates the lock.
+ const char* i_filename,
+ DataWriter::FilterSupplier* i_filterSupplier,
+ FileEncoder* i_encoder,
+ int i_bufferCount,
+ size_t i_bufferSize)
+ :
+ filename(i_filename),
+ filterSupplier(i_filterSupplier),
+ encoder(i_encoder),
+ bufferCount(i_bufferCount),
+ bufferSize(i_bufferSize),
+ sharedOffset(0),
+ sharedLogical(0),
+ closing(false)
+ file = AsyncFile::open(filename, true);
+ if (file == NULL) {
+ WriteErrorMessage("failed to open %s for write\n", filename);
+ soft_exit(1);
+ }
+ InitializeExclusiveLock(&lock);
+ // NOTE(review): presumably AsyncDataWriterSupplier::getWriter (signature line missing from this listing).
+ // Creates a writer on the shared file. Once close() has set `closing`, new writers
+ // get neither a filter nor the encoder.
+ DataWriter*
+ return new AsyncDataWriter(file, this, bufferCount, bufferSize,
+ filterSupplier && ! closing ? filterSupplier->getFilter() : NULL,
+ closing ? NULL : encoder);
+ // NOTE(review): presumably AsyncDataWriterSupplier::close (signature line missing from this listing).
+ // Notifies the filter supplier before and after closing the file, then destroys the lock.
+ // NOTE(review): `file` is closed but not deleted here; confirm who owns it.
+ void
+ closing = true;
+ if (filterSupplier != NULL) {
+ filterSupplier->onClosing(this);
+ }
+ file->close();
+ if (filterSupplier != NULL) {
+ filterSupplier->onClosed(this);
+ }
+ DestroyExclusiveLock(&lock);
+ // NOTE(review): presumably AsyncDataWriterSupplier::advance (signature line missing from this listing).
+ // Under the lock: hands back the current shared physical/logical offsets and then
+ // advances them by `physical` and `logical` bytes.
+ void
+ size_t physical,
+ size_t logical,
+ size_t* o_physical,
+ size_t* o_logical)
+ AcquireExclusiveLock(&lock);
+ *o_physical = sharedOffset;
+ sharedOffset += physical;
+ *o_logical = sharedLogical;
+ sharedLogical += logical;
+ //fprintf(stderr, "advance %lld + %lld = %lld, logical %lld + %lld = %lld\n", *o_physical, physical, sharedOffset, *o_logical, logical, sharedLogical);
+ ReleaseExclusiveLock(&lock);
+ // NOTE(review): presumably DataWriterSupplier::create (signature line missing from this listing).
+ // Factory for the asynchronous supplier; note the argument order differs from the constructor's.
+ DataWriterSupplier*
+ const char* filename,
+ size_t bufferSize,
+ DataWriter::FilterSupplier* filterSupplier,
+ FileEncoder* encoder,
+ int count)
+ return new AsyncDataWriterSupplier(filename, filterSupplier, encoder, count, bufferSize);
+// Filter that chains two filters: every callback is forwarded to `a` first, then to `b`.
+// Takes ownership of both filters (they are deleted in the destructor).
+class ComposeFilter : public DataWriter::Filter
+ // the composite's type is the larger of the two FilterType enum values
+ ComposeFilter(DataWriter::Filter* i_a, DataWriter::Filter* i_b) :
+ Filter(max(i_a->filterType, i_b->filterType)), a(i_a), b(i_b) {}
+ virtual ~ComposeFilter()
+ { delete a; delete b; }
+ virtual void inHeader(bool flag)
+ {
+ a->inHeader(flag);
+ b->inHeader(flag);
+ }
+ virtual void onAdvance(DataWriter* writer, size_t batchOffset, char* data, GenomeDistance bytes, GenomeLocation location)
+ {
+ a->onAdvance(writer, batchOffset, data, bytes, location);
+ b->onAdvance(writer, batchOffset, data, bytes, location);
+ }
+ virtual size_t onNextBatch(DataWriter* writer, size_t offset, size_t bytes)
+ {
+ // b sees the byte count returned by a; b's result is the composite's result
+ size_t sa = a->onNextBatch(writer, offset, bytes);
+ size_t sb = b->onNextBatch(writer, offset, sa);
+ return sb;
+ }
+ DataWriter::Filter* a;
+ DataWriter::Filter* b;
+// FilterSupplier that pairs two suppliers: getFilter() builds a ComposeFilter from one filter
+// of each, and close notifications go to `a` then `b`. Takes ownership of both suppliers.
+class ComposeFilterSupplier : public DataWriter::FilterSupplier
+ // the composite's type is the larger of the two FilterType enum values
+ ComposeFilterSupplier(DataWriter::FilterSupplier* i_a, DataWriter::FilterSupplier* i_b) :
+ FilterSupplier(max(i_a->filterType, i_b->filterType)), a(i_a), b(i_b) {}
+ virtual ~ComposeFilterSupplier()
+ { delete a; delete b; }
+ virtual DataWriter::Filter* getFilter()
+ { return new ComposeFilter(a->getFilter(), b->getFilter()); }
+ virtual void onClosing(DataWriterSupplier* supplier)
+ {
+ a->onClosing(supplier);
+ b->onClosing(supplier);
+ }
+ virtual void onClosed(DataWriterSupplier* supplier)
+ {
+ a->onClosed(supplier);
+ b->onClosed(supplier);
+ }
+ DataWriter::FilterSupplier* a;
+ DataWriter::FilterSupplier* b;
+ // NOTE(review): presumably DataWriter::FilterSupplier::compose (signature line missing from this listing).
+ // Returns a new supplier that runs this supplier's filters before `other`'s;
+ // the returned ComposeFilterSupplier deletes both in its destructor.
+ DataWriter::FilterSupplier*
+ DataWriter::FilterSupplier* other)
+ return new ComposeFilterSupplier(this, other);
+volatile _int64 DataWriter::WaitTime = 0;
+volatile _int64 DataWriter::FilterTime = 0;
+ // NOTE(review): presumably the StdoutAsyncFile constructor (signature line missing from this listing).
+ // Enforces one instance per process, switches stdout to binary mode on Windows,
+ // initialises the empty write queue and its events, and starts the consumer thread.
+ if (anyCreated) {
+ // NOTE(review): the message below has an unbalanced "(" in the original text
+ WriteErrorMessage("You can only ever write to stdout once per SNAP run (even if you're doing multiple runs with the comma syntax\n");
+ soft_exit(1);
+ }
+ anyCreated = true;
+#ifdef _MSC_VER
+ int result = _setmode( _fileno( stdout ), _O_BINARY ); // puts stdout in to non-translated mode, so if we're writing compressed data windows' CRLF processing doesn't destroy it.
+ if (-1 == result) {
+ WriteErrorMessage("StdoutAsyncFile::freopen to change to untranslated mode failed\n");
+ soft_exit(1);
+ }
+#endif // _MSC_VER
+ // the queue header points at itself: an empty circular doubly-linked list
+ writeElementQueue->next = writeElementQueue->prev = writeElementQueue;
+ highestOffsetCompleted = 0;
+ InitializeExclusiveLock(&lock);
+ CreateEventObject(&unexaminedElementsOnQueue);
+ CreateEventObject(&elementsCompleted);
+ // both events start in the blocking state
+ PreventEventWaitersFromProceeding(&unexaminedElementsOnQueue);
+ PreventEventWaitersFromProceeding(&elementsCompleted);
+ CreateSingleWaiterObject(&consumerThreadDone);
+ closing = false;
+ StartNewThread(ConsumerThreadMain, this);
+//
+// Factory for the stdout-backed AsyncFile.
+//
+// filename must be exactly "-" and write must be true; anything else (including a NULL
+// filename) is reported via WriteErrorMessage and terminates the run with soft_exit(1).
+// Returns a newly allocated StdoutAsyncFile.
+//
+ StdoutAsyncFile *
+StdoutAsyncFile::open(const char *filename, bool write)
+{
+    // Check for NULL first so strcmp is never handed a null pointer.
+    if (NULL == filename || strcmp("-", filename) != 0 || !write) {
+        WriteErrorMessage("StdoutAsyncFile must be named - and must be opened for write.\n");
+        soft_exit(1);
+    }
+    return new StdoutAsyncFile();
+}
+// AsyncFile::Writer that forwards writes to a shared StdoutAsyncFile and remembers the
+// end offset of its most recent write so it can wait for that write to reach stdout.
+class StdoutAsyncFileWriter : public AsyncFile::Writer
+ StdoutAsyncFileWriter(StdoutAsyncFile *i_asyncFile);
+ ~StdoutAsyncFileWriter() {}
+ // waits for all writes to complete, frees resources
+ bool close();
+ // begin a write; if there is already a write in progress, might wait for it to complete
+ bool beginWrite(void* buffer, size_t length, size_t offset, size_t *bytesWritten);
+ // wait for all prior beginWrites to complete
+ bool waitForCompletion();
+ // false until the first beginWrite; waitForCompletion() returns immediately while false
+ bool anyWritesStarted;
+ // offset + length of the most recent beginWrite
+ size_t highestOffsetWritten;
+ StdoutAsyncFile *asyncFile;
+// Binds the writer to its shared stdout file; no write has been issued yet.
+StdoutAsyncFileWriter::StdoutAsyncFileWriter(StdoutAsyncFile *i_asyncFile)
+    : anyWritesStarted(false),
+      highestOffsetWritten(0),
+      asyncFile(i_asyncFile)
+{
+}
+ // NOTE(review): presumably StdoutAsyncFileWriter::close (signature line missing from this listing).
+ // Closing a writer just waits for its last write to finish.
+ bool
+ {
+ return waitForCompletion();
+ }
+//
+// Queues `length` bytes from `buffer` for output at logical stream position `offset`.
+//
+// The request is passed to the shared StdoutAsyncFile, which performs the write on its
+// consumer thread; bytesWritten (may be NULL) is forwarded unchanged. Always returns true,
+// because the underlying StdoutAsyncFile::beginWrite is declared void.
+//
+ bool
+StdoutAsyncFileWriter::beginWrite(void* buffer, size_t length, size_t offset, size_t *bytesWritten)
+{
+    // highestOffsetWritten is the END (offset + length) of the previous write, so a write that
+    // starts exactly there (a contiguous or zero-length follow-up) is legal: compare with >=.
+    _ASSERT(offset >= highestOffsetWritten || !anyWritesStarted);
+    //fprintf(stderr, "StdoutAsyncFileWriter::beginWrite(0x%llx, %lld, %lld)\n", buffer, length, offset);
+    asyncFile->beginWrite(buffer, length, offset, bytesWritten);
+    highestOffsetWritten = offset + length;
+    anyWritesStarted = true;
+    return true;
+}
+ // NOTE(review): presumably StdoutAsyncFileWriter::waitForCompletion (signature line missing from this listing).
+ // Blocks until the shared file has completed output up to this writer's last write; always returns true.
+ bool
+ if (!anyWritesStarted) {
+ return true;
+ }
+ asyncFile->waitForCompletion(highestOffsetWritten);
+ return true;
+ // NOTE(review): presumably the StdoutAsyncFile destructor (signature line missing from this listing).
+ // Releases the lock and the two events created in the constructor.
+ DestroyExclusiveLock(&lock);
+ DestroyEventObject(&unexaminedElementsOnQueue);
+ DestroyEventObject(&elementsCompleted);
+ // NOTE(review): presumably StdoutAsyncFile::close (signature line missing from this listing).
+ // Sets `closing`, wakes the consumer thread, and blocks until it has signalled completion.
+ bool
+ AcquireExclusiveLock(&lock);
+ closing = true;
+ // wake the consumer so it can notice `closing`
+ AllowEventWaitersToProceed(&unexaminedElementsOnQueue);
+ ReleaseExclusiveLock(&lock);
+ WaitForSingleWaiterObject(&consumerThreadDone);
+ return true;
+ // NOTE(review): presumably StdoutAsyncFile::getWriter (signature line missing from this listing).
+ // Each call returns a new writer bound to this file.
+ AsyncFile::Writer*
+ return new StdoutAsyncFileWriter(this);
+ // NOTE(review): presumably StdoutAsyncFile::getReader (signature line missing from this listing).
+ // Reading is not supported: reports an error and calls soft_exit(1).
+ AsyncFile::Reader*
+ WriteErrorMessage("StdoutAsyncFile::getReader() called.\n");
+ soft_exit(1);
+ return NULL;
+//
+// Thread entry point for the stdout consumer. param is the StdoutAsyncFile that started
+// the thread; the completion object is signalled once runConsumer() returns.
+//
+ void
+StdoutAsyncFile::ConsumerThreadMain(void *param)
+{
+    StdoutAsyncFile *self = (StdoutAsyncFile *)param;
+    // Take the address of the completion object before running the consumer.
+    SingleWaiterObject *completion = &self->consumerThreadDone;
+
+    self->runConsumer();
+    SignalSingleWaiterObject(completion);
+}
+ // Queues a write of `length` bytes at logical position `offset`. Elements are kept sorted
+ // by offset; the consumer thread is only woken when the new element is the next one due
+ // (its offset equals highestOffsetCompleted). Zero-length writes are dropped without
+ // touching *o_bytesWritten.
+ void
+StdoutAsyncFile::beginWrite(void *buffer, size_t length, size_t offset, size_t *o_bytesWritten)
+ if (0 == length) {
+ return;
+ }
+ // the consumer deletes the element after writing it (see runConsumer)
+ WriteElement *element = new WriteElement;
+ element->buffer = buffer;
+ element->length = length;
+ element->offset = offset;
+ element->o_bytesWritten = o_bytesWritten;
+ AcquireExclusiveLock(&lock);
+ _ASSERT(offset >= highestOffsetCompleted);
+ //
+ // The queue is in order. See if this element goes first.
+ //
+ if (isQueueEmpty() || offset < writeElementQueue->next->offset) {
+ _ASSERT(isQueueEmpty() || offset <= writeElementQueue->next->offset); // It fits entirely before the next element
+ // inserting after the queue header puts the element at the front
+ element->enqueue(writeElementQueue);
+ if (element->offset == highestOffsetCompleted) {
+ //
+ // Wake the consumer, this is ready to write.
+ //
+ AllowEventWaitersToProceed(&unexaminedElementsOnQueue);
+ }
+ } else {
+ //
+ // It isn't the first thing on the queue. Figure out where it goes.
+ //
+ // walk forward until the next element is the header or has an offset >= ours
+ WriteElement *possiblePredecessor = writeElementQueue->next;
+ while (possiblePredecessor->next != writeElementQueue && possiblePredecessor->next->offset < offset) {
+ possiblePredecessor = possiblePredecessor->next;
+ }
+ _ASSERT(possiblePredecessor->offset < offset);
+ element->enqueue(possiblePredecessor);
+ }
+ ReleaseExclusiveLock(&lock);
+ // Blocks until the consumer has written everything up to `offset`
+ // (i.e. until highestOffsetCompleted >= offset).
+ void
+StdoutAsyncFile::waitForCompletion(size_t offset)
+ AcquireExclusiveLock(&lock);
+ while (offset > highestOffsetCompleted) {
+ // reset the event while still holding the lock, then wait outside it;
+ // the consumer sets the event under the same lock after each completed element
+ PreventEventWaitersFromProceeding(&elementsCompleted);
+ ReleaseExclusiveLock(&lock);
+ WaitForEvent(&elementsCompleted);
+ AcquireExclusiveLock(&lock);
+ }
+ ReleaseExclusiveLock(&lock);
+ // NOTE(review): presumably StdoutAsyncFile::runConsumer (signature line missing from this listing).
+ // Consumer loop: repeatedly takes the head of the ordered queue once its offset equals
+ // highestOffsetCompleted, writes it to stdout outside the lock, then advances
+ // highestOffsetCompleted and wakes waiters. Returns when the queue is empty and closing is set.
+ void
+ // upper bound on a single fwrite; halved on ENOMEM (see below)
+ size_t maxWriteSize = 1024 * 1024;
+ AcquireExclusiveLock(&lock);
+ for (;;) {
+ if (isQueueEmpty() && closing) {
+ ReleaseExclusiveLock(&lock);
+ //
+ // Done. The caller is responsible for signalling the consumerThreadDone object.
+ //
+ return;
+ }
+ if (isQueueEmpty() || writeElementQueue->next->offset != highestOffsetCompleted) {
+ //
+ // Wait for work.
+ //
+ ReleaseExclusiveLock(&lock);
+ WaitForEvent(&unexaminedElementsOnQueue);
+ AcquireExclusiveLock(&lock);
+ // reset the event under the lock, then re-check the queue from the top
+ PreventEventWaitersFromProceeding(&unexaminedElementsOnQueue);
+ continue;
+ }
+ //
+ // We have the next write queued. Write it. Use a loop in case fwrite doesn't take the whole thing at once.
+ //
+ WriteElement *element = writeElementQueue->next;
+ //fprintf(stderr,"StdoutAsyncFile::runConsumer(): writing buffer at 0x%llx, size %lld\n", element->buffer, element->length);
+ // the element stays on the queue while it is written without the lock held
+ ReleaseExclusiveLock(&lock);
+ size_t bytesLeftToWrite = element->length;
+ size_t totalBytesWritten = 0;
+ while (bytesLeftToWrite > 0) {
+ size_t bytesToWrite = __min(bytesLeftToWrite, maxWriteSize);
+ size_t bytesWritten = fwrite((char *)element->buffer + totalBytesWritten, 1, bytesToWrite, stdout);
+ _ASSERT(bytesWritten <= bytesToWrite);
+ if (0 == bytesWritten) {
+ // NOTE(review): errno is not cleared before fwrite, so this may read a stale value; confirm
+ if (ENOMEM == errno && maxWriteSize > 1024) {
+ //
+ // For whatever reason, sometimes trying to write too much to stdout generates an ENOMEM (though we have tons of memory).
+ // If we see that and we're not already at a small size, just reduce our max write size and try again.
+ //
+ maxWriteSize /= 2;
+ } else {
+ WriteErrorMessage("StdoutAsyncFile::runConsumer(): fwrite failed %d\n", errno);
+ soft_exit(1);
+ }
+ }
+ bytesLeftToWrite -= bytesWritten;
+ totalBytesWritten += bytesWritten;
+ }
+ if (NULL != element->o_bytesWritten) {
+ *element->o_bytesWritten = totalBytesWritten;
+ }
+ AcquireExclusiveLock(&lock);
+ _ASSERT(writeElementQueue->next == element);
+ element->dequeue();
+ highestOffsetCompleted = element->offset + element->length;
+ AllowEventWaitersToProceed(&elementsCompleted);
+ delete element;
+ }
+//
+// Links this element into the circular doubly-linked queue immediately after `previous`.
+//
+ void
+StdoutAsyncFile::WriteElement::enqueue(WriteElement *previous)
+{
+    WriteElement *following = previous->next;
+
+    // Point this element at its two neighbours...
+    prev = previous;
+    next = following;
+
+    // ...then splice it in between them.
+    previous->next = this;
+    following->prev = this;
+}
+ // NOTE(review): presumably StdoutAsyncFile::WriteElement::dequeue (signature line missing from this listing).
+ // Unlinks this element from its neighbours and clears its own links.
+ void
+ next->prev = prev;
+ prev->next = next;
+ next = prev = NULL;
+bool StdoutAsyncFile::anyCreated = false;
diff --git a/SNAPLib/DataWriter.h b/SNAPLib/DataWriter.h
new file mode 100644
index 0000000..52679e7
--- /dev/null
+++ b/SNAPLib/DataWriter.h
@@ -0,0 +1,271 @@
+Module Name:
+ DataWriter.h
+ Headers for the DataWriter & related classes for the SNAP sequencer
+ Ravi Pandya, Feb 2013
+ User mode service.
+Revision History:
+#pragma once
+#include "Compat.h"
+#include "Read.h"
+#include "ParallelTask.h"
+#include "Genome.h"
+class DataWriterSupplier;
+// per-thread writer for data into a single destination
+// Abstract per-thread writer: callers fill the current buffer via getBuffer()/advance(),
+// move on with nextBatch(), and an optional Filter observes or rewrites each batch.
+class DataWriter
+ // NOTE(review): the enum order matters to ComposeFilter/ComposeFilterSupplier, which take max() of two values
+ enum FilterType
+ {
+ ReadFilter, // reads data but does not modify it
+ ModifyFilter, // modifies data in place
+ CopyFilter, // copies data into new buffer, same size
+ TransformFilter, // copies data into new buffer, possibly different size
+ ResizeFilter, // rewrites data in same buffer, possibly different size
+ };
+ // single filter instance per thread
+ // points to filterSupplier for common data
+ class Filter
+ {
+ public:
+ Filter (FilterType i_filterType) : filterType(i_filterType) {}
+ const FilterType filterType;
+ virtual ~Filter() {}
+ // called to set whether we're writing a header vs. individual reads
+ virtual void inHeader(bool flag) {} // default do nothing
+ // called when a chunk of data (i.e. a single read) has been written into the file
+ virtual void onAdvance(DataWriter* writer, size_t batchOffset, char* data, GenomeDistance bytes, GenomeLocation location) = 0;
+ // called when a batch has been completed, after advancing to the next
+ // e.g. so use getBatch(-1, ...) to get the one that was just completed
+ // TransformFilters return #byte of transformed data in current buffer, so we need to advance again
+ // TransformFilters should call getBatch(0) to ensure current buffer has been written before they write into it
+ virtual size_t onNextBatch(DataWriter* writer, size_t offset, size_t bytes) = 0;
+ };
+ // factory for per-thread filters
+ class FilterSupplier
+ {
+ public:
+ FilterSupplier (FilterType i_filterType) : filterType(i_filterType) {}
+ const FilterType filterType;
+ virtual ~FilterSupplier() {}
+ // returns a new supplier chaining this one with `other`
+ FilterSupplier* compose(FilterSupplier* other);
+ virtual Filter* getFilter() = 0;
+ // called when entire file is done; onClosing before file is closed, onClosed after
+ virtual void onClosing(DataWriterSupplier* supplier) = 0;
+ virtual void onClosed(DataWriterSupplier* supplier) = 0;
+ };
+ // i_filter may be NULL (no filtering); this base class does not delete it in its destructor
+ DataWriter(Filter* i_filter) : filter(i_filter) {}
+ virtual ~DataWriter() {}
+ void inHeader(bool flag)
+ { if (filter != NULL) { filter->inHeader(flag); } }
+ // get remaining space in current buffer for writing
+ virtual bool getBuffer(char** o_buffer, size_t* o_size) = 0;
+ // advance within current buffer, reducing available space
+ // should be called on each read, with the location
+ virtual void advance(GenomeDistance bytes, GenomeLocation location = 0) = 0;
+ // get complete data buffer in batch, relative==0 is current, relative==-1 is previous, etc.
+ // if negative gets old data written, else waits for write to complete so you can write into it
+ // o_offset gets physical offset (e.g. compressed), o_logical gets data offset (e.g. uncompressed)
+ virtual bool getBatch(int relative, char** o_buffer, size_t* o_size = NULL, size_t* o_used = NULL, size_t* o_offset = NULL, size_t* o_logicalUsed = 0, size_t* o_logicalOffset = NULL) = 0;
+ // advance to next buffer
+ virtual bool nextBatch() = 0;
+ // this thread is complete
+ virtual void close() = 0;
+ // nanosecond timers
+ static volatile _int64 FilterTime;
+ static volatile _int64 WaitTime;
+ Filter* filter;
+class FileFormat;
+class Genome;
+class GzipWriterFilterSupplier;
+class FileEncoder;
+// creates writers for multiple threads
+// Abstract factory for DataWriter instances, plus static constructors for the
+// concrete suppliers and the standard filter suppliers.
+class DataWriterSupplier
+ virtual ~DataWriterSupplier() {}
+ virtual DataWriter* getWriter() = 0;
+ // call when all threads are done, all filters destroyed
+ virtual void close() = 0;
+ // asynchronous writer supplier; `count` is the number of buffers per writer
+ static DataWriterSupplier* create(
+ const char* filename,
+ size_t bufferSize,
+ DataWriter::FilterSupplier* filterSupplier = NULL,
+ FileEncoder* encoder = NULL,
+ int count = 4);
+ // NOTE(review): implementation not in this listing; presumably writes via a temp file and produces sortedFileName
+ static DataWriterSupplier* sorted(
+ const FileFormat* format,
+ const Genome* genome,
+ const char* tempFileName,
+ size_t tempBufferMemory,
+ int numThreads,
+ const char* sortedFileName,
+ DataWriter::FilterSupplier* sortedFilterSupplier,
+ size_t maxBufferSize,
+ FileEncoder* encoder = NULL);
+ // defaults follow BAM output spec
+ static GzipWriterFilterSupplier* gzip(bool bamFormat, size_t chunkSize, int numThreads, bool bindToProcessors, bool multiThreaded);
+ static DataWriter::FilterSupplier* markDuplicates(const Genome* genome);
+ static DataWriter::FilterSupplier* bamIndex(const char* indexFileName, const Genome* genome, GzipWriterFilterSupplier* gzipSupplier);
+class AsyncDataWriter;
+// Encodes completed AsyncDataWriter batches on a ParallelCoworker before they are written.
+// Shares the writer's lock and walks the writer's batch ring via encoderBatch.
+class FileEncoder
+ FileEncoder(int numThreads, bool bindToProcessors, ParallelWorkerManager* i_supplier);
+ // stops and deletes the coworker if one was created
+ ~FileEncoder()
+ {
+ if (coworker != NULL) {
+ _ASSERT(! encoderRunning); coworker->stop(); delete coworker;
+ }
+ }
+ static FileEncoder* gzip(GzipWriterFilterSupplier* filterSupplier, int numThreads, bool bindToProcessor, size_t chunkSize = 65536, bool bam = true);
+ // post-construction initialization
+ void initialize(AsyncDataWriter* i_writer);
+ // called by writer when there is data to encode; threadsafe
+ void inputReady();
+ // waits for pending encodes, then stops the coworker
+ void close();
+ // called by supplier to get/set information about current batch
+ void setupEncode(int relative);
+ void getEncodeBatch(char** o_batch, size_t* o_batchSize, size_t* o_batchUsed);
+ void getOffsets(size_t* o_logicalOffset, size_t* o_physicalOffset);
+ void setEncodedBatchSize(size_t newSize);
+ // static callback for encoder; threadsafe
+ static void outputReadyCallback(void *p);
+ // called by encoder when a block of data has been encoded; threadsafe
+ void outputReady();
+ // scans writer and kicks off encoder if there is something ready; must hold lock
+ void checkForInput();
+ AsyncDataWriter* writer;
+ // NULL when constructed with numThreads == 0
+ ParallelCoworker* coworker;
+ // points at the writer's lock (set in initialize)
+ ExclusiveLock* lock;
+ bool encoderRunning;
+ // index into writer->batches of the batch being (or last) encoded
+ int encoderBatch;
+ friend class AsyncDataWriter;
+// AsyncFile that serialises writes from many writers onto stdout. Writes are queued by
+// logical offset and a single consumer thread emits them strictly in offset order.
+class StdoutAsyncFile : public AsyncFile
+ StdoutAsyncFile();
+ virtual ~StdoutAsyncFile();
+ // tells the consumer thread to finish and waits for it
+ bool close();
+ // filename must be "-" and write must be true
+ static StdoutAsyncFile *open(const char *filename, bool write);
+ AsyncFile::Writer* getWriter();
+ // not supported; reports an error and exits
+ AsyncFile::Reader* getReader();
+ void beginWrite(void *buffer, size_t length, size_t offset, size_t *o_bytesWritten);
+ void waitForCompletion(size_t offset);
+ // guards the queue, highestOffsetCompleted and closing
+ ExclusiveLock lock;
+ // node of the circular doubly-linked write queue
+ struct WriteElement {
+ void *buffer;
+ size_t length;
+ size_t offset;
+ size_t *o_bytesWritten;
+ WriteElement *next;
+ WriteElement *prev;
+ // insert this element right after `previous`
+ void enqueue(WriteElement *previous);
+ void dequeue();
+ };
+ bool isQueueEmpty() {
+ return writeElementQueue->next == writeElementQueue;
+ }
+ // end offset of the last element written to stdout
+ size_t highestOffsetCompleted;
+ //
+ // The queue is kept in order, and the writer writes without gaps, so if you put on blocks 10 and 12, the writer will write
+ // 10, and then leave 12 on the queue and wait for 11 to be added and written before processing 12.
+ //
+ // one-element array used as the list header (sentinel), so it can be used like a pointer
+ WriteElement writeElementQueue[1];
+ EventObject unexaminedElementsOnQueue; // This gets set when a writer puts a block on the queue, and cleared when the consumer has seen it.
+ EventObject elementsCompleted; // This gets set when any element is completed by the consumer, and reset when a waiter starts
+ SingleWaiterObject consumerThreadDone;
+ bool closing;
+ static void ConsumerThreadMain(void *param);
+ void runConsumer();
+ static bool anyCreated; // Because there's no way to multiplex stdout, you only get one per run of SNAP
diff --git a/SNAPLib/Error.cpp b/SNAPLib/Error.cpp
new file mode 100644
index 0000000..66c16ba
--- /dev/null
+++ b/SNAPLib/Error.cpp
@@ -0,0 +1,89 @@
+Module Name:
+ Error.cpp
+ SNAP error-message writer
+ Bill Bolosky, February, 2014
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Compat.h"
+#include "Error.h"
+#include "AlignerOptions.h"
+#include "CommandProcessor.h"
+//
+// Emits an already-formatted message.
+//
+// In Hadoop mode the message always goes to stderr twice: once with the "reporter:status:"
+// prefix and once bare. Otherwise it goes to `file`, except that messages aimed at stdout
+// are redirected to stderr when stdout is carrying the program's output.
+//
+ void
+WriteMessageToFile(FILE *file, const char *message)
+{
+    if (AlignerOptions::useHadoopErrorMessages) {
+        fprintf(stderr,"reporter:status:%s", message); // Always use stderr in Hadoop mode, regardless of whether this is an error
+        fprintf(stderr, "%s", message); // And also print without the prefix, so it shows up in both logs
+        fflush(stderr);
+        return;
+    }
+
+    // Keep status text out of stdout when stdout is the data channel.
+    FILE *destination = (AlignerOptions::outputToStdout && stdout == file) ? stderr : file;
+    fprintf(destination, "%s", message);
+    fflush(destination);
+}
+//
+// printf-style error reporting.
+//
+// Formats `message` and its arguments into a fixed 10240-byte buffer (longer output is
+// truncated), sends it to stderr through WriteMessageToFile, and also forwards it to the
+// command pipe when one is open.
+//
+ void
+WriteErrorMessage(const char *message, ...)
+{
+    const size_t bufferSize = 10240;
+    char buffer[bufferSize];
+
+    va_list args;
+    va_start(args, message);
+    vsnprintf(buffer, bufferSize - 1, message, args);
+    va_end(args); // every va_start must be matched by va_end before returning
+
+    // Some vsnprintf implementations leave the buffer unterminated when the output is
+    // truncated; the last byte is never written by the call above, so terminate it here.
+    buffer[bufferSize - 1] = '\0';
+
+    WriteMessageToFile(stderr, buffer);
+    if (NULL != CommandPipe) {
+        WriteToNamedPipe(CommandPipe, buffer);
+    }
+}
+//
+// printf-style status reporting.
+//
+// Formats `message` and its arguments into a fixed 10240-byte buffer (longer output is
+// truncated), sends it to stdout through WriteMessageToFile (which may redirect it to
+// stderr), and also forwards it to the command pipe when one is open.
+//
+ void
+WriteStatusMessage(const char *message, ...)
+{
+    const size_t bufferSize = 10240;
+    char buffer[bufferSize];
+
+    va_list args;
+    va_start(args, message);
+    vsnprintf(buffer, bufferSize - 1, message, args);
+    va_end(args); // every va_start must be matched by va_end before returning
+
+    // Some vsnprintf implementations leave the buffer unterminated when the output is
+    // truncated; the last byte is never written by the call above, so terminate it here.
+    buffer[bufferSize - 1] = '\0';
+
+    WriteMessageToFile(stdout, buffer);
+    if (NULL != CommandPipe) {
+        WriteToNamedPipe(CommandPipe, buffer);
+    }
+}
+//
+// Reports a counter increment on stderr in the "reporter:counter:SNAP,<name>,<increment>"
+// form. Does nothing unless Hadoop error messages are enabled.
+//
+ void
+WriteProgressCounter(const char *counterName, _int64 increment)
+{
+    // Counters are only emitted in Hadoop mode.
+    if (AlignerOptions::useHadoopErrorMessages) {
+        fprintf(stderr,"reporter:counter:SNAP,%s,%lld\n", counterName, increment);
+        fflush(stderr);
+    }
+}
diff --git a/SNAPLib/Error.h b/SNAPLib/Error.h
new file mode 100644
index 0000000..cd014ea
--- /dev/null
+++ b/SNAPLib/Error.h
@@ -0,0 +1,34 @@
+Module Name:
+ Error.h
+ Header for SNAP error-message writer
+ Bill Bolosky, February, 2014
+ User mode service.
+Revision History:
+#pragma once
+#include "Compat.h"
+ // printf-style: formats the message and writes it as an error
+ void
+WriteErrorMessage(const char *message, ...);
+ // printf-style: formats the message and writes it as a status message
+ void
+WriteStatusMessage(const char *message, ...);
+ // reports an increment for the named progress counter
+ void
+WriteProgressCounter(const char *counterName, _int64 increment);
\ No newline at end of file
diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp
new file mode 100644
index 0000000..712cbab
--- /dev/null
+++ b/SNAPLib/FASTA.cpp
@@ -0,0 +1,198 @@
+Module Name:
+ FASTA.cpp
+ FASTA reader
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#include "stdafx.h"
+#include "Compat.h"
+#include "FASTA.h"
+#include "Error.h"
+#include "exit.h"
+using namespace std;
+ const Genome *
+ReadFASTAGenome(
+    const char *fileName,
+    const char *pieceNameTerminatorCharacters,
+    bool spaceIsAPieceNameTerminator,
+    unsigned chromosomePaddingSize)
+{
+    //
+    // Read a FASTA file into a newly allocated Genome.
+    //
+    //  fileName                        path of the FASTA file to read.
+    //  pieceNameTerminatorCharacters   if non-NULL, each character in this string truncates a contig name at its first occurrence.
+    //  spaceIsAPieceNameTerminator     if true, contig names are also truncated at the first space and the first tab.
+    //  chromosomePaddingSize           number of 'n' bases inserted before every contig and once more at the end of the genome.
+    //
+    // Returns the new Genome, or NULL if the file can't be opened.  Exits via soft_exit() if sequence
+    // data appears before the first '>' line.
+    //
+    //
+    // We need to know a bound on the size of the genome before we create the Genome object.
+    // A bound is the number of bytes in the FASTA file, because we store at most one base per
+    // byte. Get the file size to use for this bound.
+    //
+    _int64 fileSize = QueryFileSize(fileName);
+    bool isValidGenomeCharacter[256];
+    for (int i = 0; i < 256; i++) {
+        isValidGenomeCharacter[i] = false;
+    }
+    isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true;
+    isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true;
+    FILE *fastaFile = fopen(fileName, "r");
+    if (fastaFile == NULL) {
+        WriteErrorMessage("Unable to open FASTA file '%s' (even though we already got its size)\n",fileName);
+        return NULL;
+    }
+    const size_t lineBufferSize = 4096;
+    char lineBuffer[lineBufferSize];
+    //
+    // Count the chromosomes
+    //
+    unsigned nChromosomes = 0;
+    while (NULL != fgets(lineBuffer,lineBufferSize,fastaFile)) {
+        if (lineBuffer[0] == '>') {
+            nChromosomes++;
+        }
+    }
+    rewind(fastaFile);
+    Genome *genome = new Genome(fileSize + (nChromosomes+1) * (size_t)chromosomePaddingSize, fileSize + (nChromosomes+1) * (size_t)chromosomePaddingSize, chromosomePaddingSize, nChromosomes + 1);
+    char *paddingBuffer = new char[chromosomePaddingSize+1];
+    for (unsigned i = 0; i < chromosomePaddingSize; i++) {
+        paddingBuffer[i] = 'n';
+    }
+    paddingBuffer[chromosomePaddingSize] = '\0';
+    bool warningIssued = false;
+    bool inAContig = false;
+    while (NULL != fgets(lineBuffer,lineBufferSize,fastaFile)) {
+        if (lineBuffer[0] == '>') {
+            inAContig = true;
+            //
+            // A new contig. Add in the padding first.
+            //
+            genome->addData(paddingBuffer);
+            //
+            // Now supply the chromosome name.
+            //
+            if (NULL != pieceNameTerminatorCharacters) {
+                // Compute the length once rather than on every loop iteration.
+                const size_t nTerminators = strlen(pieceNameTerminatorCharacters);
+                for (size_t i = 0; i < nTerminators; i++) {
+                    char *terminator = strchr(lineBuffer+1, pieceNameTerminatorCharacters[i]);
+                    if (NULL != terminator) {
+                        *terminator = '\0';
+                    }
+                }
+            }
+            if (spaceIsAPieceNameTerminator) {
+                char *terminator = strchr(lineBuffer, ' ');
+                if (NULL != terminator) {
+                    *terminator = '\0';
+                }
+                terminator = strchr(lineBuffer, '\t');
+                if (NULL != terminator) {
+                    *terminator = '\0';
+                }
+            }
+            char *terminator = strchr(lineBuffer, '\n');
+            if (NULL != terminator) {
+                *terminator = '\0';
+            }
+            terminator = strchr(lineBuffer, '\r');
+            if (NULL != terminator) {
+                *terminator = '\0';
+            }
+            genome->startContig(lineBuffer+1);
+        } else {
+            if (!inAContig) {
+                WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n");
+                soft_exit(1);
+            }
+            //
+            // Convert it to upper case and truncate the line ending before adding it to the genome.
+            // Strip a carriage return as well as the newline (as is already done for contig name
+            // lines), so that a CRLF file doesn't get a bogus "invalid base" for the '\r' on every line.
+            //
+            char *newline = strchr(lineBuffer, '\n');
+            if (NULL != newline) {
+                *newline = 0;
+            }
+            char *carriageReturn = strchr(lineBuffer, '\r');
+            if (NULL != carriageReturn) {
+                *carriageReturn = 0;
+            }
+            //
+            // But convert any 'N' to 'n'. This is so we don't match the N from the genome with N
+            // in reads (where we just do a straight text comparison).
+            //
+            size_t lineLen = strlen(lineBuffer);
+            for (size_t i = 0; i < lineLen; i++) {
+                // toupper() requires a value representable as unsigned char; a plain char may be negative.
+                lineBuffer[i] = (char)toupper((unsigned char)lineBuffer[i]);
+            }
+            for (size_t i = 0; i < lineLen; i++) {
+                if ('N' == lineBuffer[i]) {
+                    lineBuffer[i] = 'n';
+                }
+                if (!isValidGenomeCharacter[(unsigned char)lineBuffer[i]]) {
+                    if (!warningIssued) {
+                        WriteErrorMessage("\nFASTA file contained a character that's not a valid base (or N): '%c', full line '%s'; \nconverting to 'N'. This may happen again, but there will be no more warnings.\n", lineBuffer[i], lineBuffer);
+                        warningIssued = true;
+                    }
+                    lineBuffer[i] = 'N';
+                }
+            }
+            genome->addData(lineBuffer);
+        }
+    }
+    //
+    // And finally add padding at the end of the genome.
+    //
+    genome->addData(paddingBuffer);
+    genome->fillInContigLengths();
+    genome->sortContigsByName();
+    fclose(fastaFile);
+    delete [] paddingBuffer;
+    return genome;
+}
+// TODO: Reduce code duplication with the mutator.
+//
+// Write every contig of genome to the already-open stream fasta in FASTA form: a ">" line made of
+// prefix followed by the contig name, then all of the contig's bases on a single line.
+// A contig's bases run from its beginningLocation up to the next contig's beginningLocation (or to
+// getCountOfBases() for the last contig).
+// Returns true if the stream's error indicator is clear afterwards.
+// NOTE(review): FASTA.h declares AppendFASTAGenome(const Genome *, FILE *) without the prefix
+// parameter, which is a different signature from this definition -- confirm which one callers link against.
+//
+bool AppendFASTAGenome(const Genome *genome, FILE *fasta, const char *prefix="")
+ int nContigs = genome->getNumContigs();
+ const Genome::Contig *contigs = genome->getContigs();
+ for (int i = 0; i < nContigs; ++i) {
+ const Genome::Contig &contig = contigs[i];
+ GenomeLocation start = contig.beginningLocation;
+ // The contig ends where the next one begins; the last one ends at the end of the genome.
+ GenomeLocation end = i + 1 < nContigs ? contigs[i + 1].beginningLocation : genome->getCountOfBases();
+ GenomeDistance size = end - start;
+ const char *bases = genome->getSubstring(start, size);
+ fprintf(fasta, ">%s%s\n", prefix, contig.name);
+ fwrite(bases, 1, size, fasta);
+ fputc('\n', fasta);
+ }
+ return !ferror(fasta);
diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h
new file mode 100644
index 0000000..6e542f3
--- /dev/null
+++ b/SNAPLib/FASTA.h
@@ -0,0 +1,49 @@
+Module Name:
+ FASTA reader
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#include "Genome.h"
+ const Genome *
+ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize);
+// The FASTA appending functions return whether the write was successful.
+// WARNING: They write very long lines.
+// According to Wikipedia, a FASTA file's line limit should be 120, or better, 79.
+// Unix workaround if the piece names aren't too long: 'fold -w 79'.
+ bool
+AppendFASTAGenome(const Genome *, FILE *fasta);
+// This is arbitrary; is there some existing convention?
+inline const char *diploidFASTASexPrefix(bool male)
+ return male ? "PATERNAL|" : "MATERNAL|";
diff --git a/SNAPLib/FASTQ.cpp b/SNAPLib/FASTQ.cpp
new file mode 100644
index 0000000..f297f5f
--- /dev/null
+++ b/SNAPLib/FASTQ.cpp
@@ -0,0 +1,669 @@
+Module Name:
+ FASTQ.cpp
+ Fast FASTQ genome "query" reader.
+ Bill Bolosky, August, 2011
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#include "stdafx.h"
+#include "FASTQ.h"
+#include "Compat.h"
+#include "BigAlloc.h"
+#include "Read.h"
+#include "Util.h"
+#include "exit.h"
+#include "Error.h"
+using std::min;
+using util::strnchr;
+ DataReader* i_data,
+ const ReaderContext& i_context)
+ :
+ ReadReader(i_context),
+ data(i_data)
+ delete data;
+ data = NULL;
+ FASTQReader*
+ DataSupplier* supplier,
+ const char *fileName,
+ int bufferCount,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess,
+ const ReaderContext& context)
+ DataReader* data = supplier->getDataReader(bufferCount, maxReadSizeInBytes, 0.0, 0);
+ FASTQReader* fastq = new FASTQReader(data, context);
+ if (! fastq->init(fileName)) {
+ WriteErrorMessage("Unable to initialize FASTQReader for file %s\n", fileName);
+ soft_exit(1);
+ }
+ fastq->reinit(startingOffset, amountOfFileToProcess);
+ return fastq;
+ void
+ const char* fileName,
+ ReaderContext& context)
+ // no header in FQ files
+ context.header = NULL;
+ context.headerLength = context.headerBytes = 0;
+ bool
+ const char* i_fileName)
+ fileName = i_fileName;
+ return data->init(fileName);
+ void
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess)
+ data->reinit(startingOffset, amountOfFileToProcess);
+ char* buffer;
+ _int64 bytes;
+ if (! data->getData(&buffer, &bytes)) {
+ return;
+ }
+ // If we're not at the start of the file, we might have the tail end of a read that someone
+ // who got the previous range will process; advance past that. This is fairly tricky because
+ // there can be '@' signs in the quality string (and maybe even in read names?).
+ if (startingOffset != 0) {
+ if (!skipPartialRecord(data)) {
+ //
+ // There wasn't a whole record in our range. Skip over what we had.
+ //
+ data->advance(bytes);
+ return;
+ }
+ }
+ bool
+FASTQReader::skipPartialRecord(DataReader *data)
+{
+    //
+    // Advance data past any partial record at the start of its current buffer, so that it is
+    // positioned at the '@' that begins the first complete FASTQ record.  Returns true if such a
+    // record was found (and data was advanced to it), false otherwise (data is not advanced).
+    //
+    // Just assume that a single FASTQ read is smaller than our buffer, so we won't exceed the buffer here.
+    // Look for the pattern '{0|\n}@*\n{A|T|C|G|N}*\n+' That is, either the beginning of the buffer or a
+    // newline, followed by an '@' and some text, another newline followed by a list of bases and a newline,
+    // and then a plus.
+    //
+    char* buffer;
+    _int64 validBytes;
+    if (!data->getData(&buffer, &validBytes) || validBytes <= 0) {
+        // Nothing to look at; buffer would be uninitialized.
+        return false;
+    }
+    char *firstLineCandidate = buffer;
+    if (*firstLineCandidate != '@') {
+        //
+        // Check for "no newline at all" before doing pointer arithmetic on the result.
+        //
+        char *firstNewline = strnchr(buffer, '\n', validBytes);
+        if (NULL == firstNewline) {
+            return false;
+        }
+        firstLineCandidate = firstNewline + 1;
+    }
+    for (;;) {
+        if (firstLineCandidate - buffer >= validBytes) {
+            // This happens for very small files.
+            return false;
+        }
+        char *endOfFirstLine = strnchr(firstLineCandidate, '\n', validBytes - (firstLineCandidate - buffer));
+        if (NULL == endOfFirstLine) {
+            // The file offset is 64 bits wide (it's printed with %lld elsewhere in this file).
+            WriteErrorMessage("Unable to find a read in FASTQ buffer (2) at %lld\n", data->getFileOffset());
+            return false;
+        }
+        char *secondLineCandidate = endOfFirstLine + 1;
+        if (*firstLineCandidate != '@') {
+            firstLineCandidate = secondLineCandidate;
+            continue;
+        }
+        //
+        // Scan through the second line making sure it's all bases (or 'N'). We don't have to
+        // check for end-of-buffer because we know there's a null there.
+        //
+        char *thirdLineCandidate = secondLineCandidate;
+        while (*thirdLineCandidate == 'A' || *thirdLineCandidate == 'C' || *thirdLineCandidate == 'T' || *thirdLineCandidate == 'G' ||
+               *thirdLineCandidate == 'N' || *thirdLineCandidate == 'a' || *thirdLineCandidate == 'c' || *thirdLineCandidate == 't' ||
+               *thirdLineCandidate == 'g' || *thirdLineCandidate == 'n') {
+            thirdLineCandidate++;
+        }
+        if (*thirdLineCandidate == '\r') {
+            //
+            // CRLF text; skip the CR.
+            //
+            thirdLineCandidate++;
+        }
+        if (*thirdLineCandidate != '\n') {
+            //
+            // We found something that's not a base and not a newline. It wasn't a read data (second) line. Move up a line
+            // and try again.
+            //
+            firstLineCandidate = secondLineCandidate;
+            continue;
+        }
+        thirdLineCandidate++;
+        if (*thirdLineCandidate != '+') {
+            firstLineCandidate = secondLineCandidate;
+            continue;
+        }
+        break;
+    }
+    data->advance(firstLineCandidate - buffer);
+    return true;
+}
+// Read the next FASTQ record from the underlying DataReader into readToUpdate.
+// If the current batch has no data left, moves to the next batch and tries once more.
+// Returns false if no data is available or if getReadFromBuffer() consumed nothing;
+// otherwise advances the DataReader past the record and returns true.
+ bool
+FASTQReader::getNextRead(Read *readToUpdate)
+ //
+ // Get the current buffer, moving on to the next batch if this one is used up.
+ //
+ char* buffer;
+ _int64 validBytes;
+ if (! data->getData(&buffer, &validBytes)) {
+ data->nextBatch();
+ if (! data->getData(&buffer, &validBytes)) {
+ return false;
+ }
+ }
+ _int64 bytesConsumed = getReadFromBuffer(buffer, validBytes, readToUpdate, fileName, data, context);
+ if (bytesConsumed == 0) {
+ return false;
+ }
+ data->advance(bytesConsumed);
+ return true;
+// static char LAST[100000]; static int LASTLEN = 0;
+//
+// Parse one four-line FASTQ record starting at buffer and initialize readToUpdate from it.
+// The Read points into buffer (nothing is copied here).
+//
+//  buffer / validBytes    the data to parse and how many bytes of it are valid.
+//  fileName, data         used for error messages, the EOF test and the batch recorded in the read.
+//  context                supplies the clipping mode and default read group.
+//
+// Returns the number of bytes consumed, or 0 if all that's left is a single ^Z at end of file.
+// Exits via soft_exit() on malformed input.
+//
+ _int64
+FASTQReader::getReadFromBuffer(char *buffer, _int64 validBytes, Read *readToUpdate, const char *fileName, DataReader *data, const ReaderContext &context)
+{
+    //
+    // Get the next four lines.
+    //
+    char* lines[nLinesPerFastqQuery];
+    unsigned lineLengths[nLinesPerFastqQuery];
+    char* scan = buffer;
+    for (unsigned i = 0; i < nLinesPerFastqQuery; i++) {
+        char *newLine = strnchr(scan, '\n', validBytes - (scan - buffer));
+        if (NULL == newLine) {
+            if (validBytes - (scan - buffer) == 1 && *scan == 0x1a && data->isEOF()) {
+                // sometimes DOS files will have extra ^Z at end
+                return 0;
+            }
+            //
+            // There was no next newline
+            //
+            if (data->isEOF()) {
+                // validBytes is an _int64, so it needs %lld rather than %d.
+                WriteErrorMessage("FASTQ file doesn't end with a newline! Failing. fileOffset = %lld, validBytes = %lld\n",
+                    data->getFileOffset(),validBytes);
+                soft_exit(1);
+            }
+            WriteErrorMessage("FASTQ record larger than buffer size at %s:%lld\n", fileName, data->getFileOffset());
+            soft_exit(1);
+        }
+        const size_t lineLen = newLine - scan;
+        if (0 == lineLen) {
+            WriteErrorMessage("Syntax error in FASTQ file: blank line.\n");
+            soft_exit(1);
+        }
+        //
+        // The table is indexed by the kind of line that came before this one, hence (i + 3) % 4.
+        // Index with an unsigned char: a plain char is negative for bytes >= 0x80 on many platforms,
+        // which would read outside the table.
+        //
+        if (! isValidStartingCharacterForNextLine[(i + 3) % 4][(unsigned char)*scan]) {
+            WriteErrorMessage("FASTQ file %s has invalid starting character at offset %lld, line type %d, char %c\n", data->getFilename(), data->getFileOffset(), i, *scan);
+            // The precision argument of %.*s must be an int, not a size_t.
+            WriteErrorMessage("Line in question: '%.*s'\n", (int)lineLen, scan);
+            //WriteErrorMessage("Preceding record: '%.*s'\n", LASTLEN, LAST);
+            soft_exit(1);
+        }
+        lines[i] = scan;
+        // Don't count the CR of a CRLF line ending as part of the line.
+        lineLengths[i] = (unsigned) lineLen - (scan[lineLen-1] == '\r' ? 1 : 0);
+        // NOTE(review): this looks one byte past the newline; presumably that relies on the buffer
+        // having a terminator after the valid bytes (skipPartialRecord says as much) -- confirm.
+        scan = newLine + (newLine[1] == '\r' ? 2 : 1);
+    }
+    const char *id = lines[0] + 1; // The '@' on the first line is not part of the ID
+    // The ID handed to the Read stops at the first space, if there is one.
+    const char* space = strnchr(id, ' ', lineLengths[0] - 1);
+    readToUpdate->init(id, space != NULL ? (unsigned) (space - id) : (unsigned) lineLengths[0] - 1, lines[1], lines[3], lineLengths[1]);
+    readToUpdate->clip(context.clipping);
+    readToUpdate->setBatch(data->getBatch());
+    readToUpdate->setReadGroup(context.defaultReadGroup);
+    // memcpy(LAST, buffer, scan - buffer); LASTLEN = scan - buffer;
+    return scan - buffer;
+}
+// static data & initialization
+//
+// isValidStartingCharacterForNextLine[k][c] says whether character c may start the line that
+// follows a line of kind k.  getReadFromBuffer() looks up line i of a record with index
+// (i + 3) % 4, so index 3 describes the descriptor line, 0 the bases, 1 the '+' line and 2 the
+// quality line.
+//
+bool FASTQReader::isValidStartingCharacterForNextLine[FASTQReader::nLinesPerFastqQuery][256];
+FASTQReader::_init FASTQReader::_initializer;
+ //
+ // Initialize the isValidStartingCharacterForNextLine array.
+ //
+ memset(isValidStartingCharacterForNextLine, 0, sizeof(isValidStartingCharacterForNextLine));
+ //
+ // The first line is the descriptor line and must start with an '@'
+ //
+ isValidStartingCharacterForNextLine[3]['@'] = true;
+ //
+ // The second line is the read itself and must start with a base or an
+ // 'N' in either case. A . is just a different way to encode an N.
+ //
+ for (const char*p = "ACTGNURYKMSWBDHVNX."; *p; p++) {
+ isValidStartingCharacterForNextLine[0][*p] = true;
+ isValidStartingCharacterForNextLine[0][tolower(*p)] = true;
+ }
+ //
+ //
+ // The third line is additional sequence identifier info and must
+ // start with a '+'.
+ //
+ isValidStartingCharacterForNextLine[1]['+'] = true;
+ //
+ // Line 4 is the quality line. It starts with a printable ascii character.
+ // It would be nice to rule out the bases, N, + and @ because it might confuse the parser,
+ // but some quality lines do start with those...
+ //
+ for (char i = '!'; i <= '~'; i++) {
+ isValidStartingCharacterForNextLine[2][i] = true;
+ }
+// PairedInterleavedFASTQReader
+ DataReader* i_data,
+ const ReaderContext& i_context) :
+ data(i_data), context(i_context)
+//
+// Build a PairedInterleavedFASTQReader on top of a DataReader obtained from supplier, initialize it
+// for fileName and position it on the given range.  Exits via soft_exit() if init() fails, so a
+// returned pointer is always a usable reader.
+//
+ PairedInterleavedFASTQReader*
+PairedInterleavedFASTQReader::create(DataSupplier* supplier, const char *fileName, int bufferCount, _int64 startingOffset, _int64 amountOfFileToProcess,
+ const ReaderContext& context)
+ DataReader* data = supplier->getDataReader(bufferCount, 2 * maxReadSizeInBytes, 0.0, 0); // 2* because we read in pairs
+ PairedInterleavedFASTQReader* fastq = new PairedInterleavedFASTQReader(data, context);
+ if (! fastq->init(fileName)) {
+ WriteErrorMessage("Unable to initialize PairedInterleavedFASTQReader for file %s\n", fileName);
+ soft_exit(1);
+ }
+ fastq->reinit(startingOffset, amountOfFileToProcess);
+ return fastq;
+//
+// Remember the file name and initialize the underlying DataReader with it.
+// Only the pointer is stored (no copy is made), so i_fileName must stay valid for as long as
+// this reader uses it.  Returns whatever data->init() returns.
+//
+ bool
+PairedInterleavedFASTQReader::init(const char* i_fileName)
+ fileName = i_fileName;
+ return data->init(fileName);
+ bool
+PairedInterleavedFASTQReader::getNextReadPair(Read *read0, Read *read1)
+{
+    //
+    // Read the next two consecutive FASTQ records as a pair.  Returns false when there is no more
+    // data (or only an unpaired final read); exits via soft_exit() if the IDs don't end in /1 and /2.
+    //
+    char* buffer;
+    _int64 validBytes;
+    if (! data->getData(&buffer, &validBytes)) {
+        data->nextBatch();
+        if (! data->getData(&buffer, &validBytes)) {
+            return false;
+        }
+    }
+    _int64 bytesConsumed = FASTQReader::getReadFromBuffer(buffer, validBytes, read0, fileName, data, context);
+    if (bytesConsumed == 0) {
+        //
+        // getReadFromBuffer() consumed nothing (e.g., a lone ^Z at the end of the file), so read0
+        // was not filled in.  Treat it as end of input, just as FASTQReader::getNextRead() does.
+        //
+        return false;
+    }
+    if (bytesConsumed == validBytes) {
+        WriteErrorMessage("Input file seems to have an odd number of reads. Ignoring the last one.\n");
+        return false;
+    }
+    _int64 bytesForSecondRead = FASTQReader::getReadFromBuffer(buffer + bytesConsumed, validBytes - bytesConsumed, read1, fileName, data, context);
+    if (bytesForSecondRead == 0) {
+        //
+        // Nothing parseable followed the first read, so it has no mate and read1 was not filled in.
+        //
+        WriteErrorMessage("Input file seems to have an odd number of reads. Ignoring the last one.\n");
+        return false;
+    }
+    bytesConsumed += bytesForSecondRead;
+    //
+    // Validate the Read IDs.
+    //
+    if (read0->getIdLength() < 2 || memcmp(read0->getId() + read0->getIdLength() - 2, "/1", 2)) {
+        WriteErrorMessage("PairedInterleavedFASTQReader: first read of batch doesn't have ID ending with /1: '%.*s'\n", read0->getIdLength(), read0->getId());
+        soft_exit(1);
+    }
+    if (read1->getIdLength() < 2 || memcmp(read1->getId() + read1->getIdLength() - 2, "/2", 2)) {
+        WriteErrorMessage("PairedInterleavedFASTQReader: second read of batch doesn't have ID ending with /2: '%.*s'\n", read1->getIdLength(), read1->getId());
+        soft_exit(1);
+    }
+    data->advance(bytesConsumed);
+    return true;
+}
+ void
+PairedInterleavedFASTQReader::reinit(_int64 startingOffset, _int64 amountOfFileToProcess)
+{
+    //
+    // Position the reader on the given range of the file, then move forward to the first read
+    // that is the /1 half of a pair.  Exits via soft_exit() if the read IDs don't end in /1 or /2.
+    //
+    data->reinit(startingOffset, amountOfFileToProcess);
+    char* buffer;
+    _int64 bytes;
+    if (! data->getData(&buffer, &bytes)) {
+        return;
+    }
+    // If we're not at the start of the file, we might have the tail end of a read that someone
+    // who got the previous range will process; advance past that. This is fairly tricky because
+    // there can be '@' signs in the quality string (and maybe even in read names?).
+    if (startingOffset != 0) {
+        if (!FASTQReader::skipPartialRecord(data)) {
+            //
+            // There wasn't a whole record in our range. Skip over what we had, as
+            // FASTQReader::reinit() does, so the fragment isn't later parsed as if it were a read.
+            //
+            data->advance(bytes);
+            return;
+        }
+    }
+    //
+    // Grab the first read from the buffer, and see if it's /1 or /2.
+    //
+    if (!data->getData(&buffer, &bytes)) {
+        return;
+    }
+    Read read;
+    _int64 bytesForFirstRead = FASTQReader::getReadFromBuffer(buffer, bytes, &read, fileName, data, context);
+    if (0 == bytesForFirstRead) {
+        //
+        // Nothing was parsed (getReadFromBuffer() returns 0 for a lone ^Z at end of file), so
+        // read was never filled in and there is no ID to look at.
+        //
+        return;
+    }
+    if (read.getIdLength() < 2 || read.getId()[read.getIdLength() - 2] != '/' || (read.getId()[read.getIdLength() - 1] != '1' && read.getId()[read.getIdLength() -1] != '2') ) {
+        WriteErrorMessage("PairedInterleavedFASTQReader: read ID doesn't appear to end with /1 or /2, you can't use this as a paired FASTQ file: '%.*s'\n", read.getIdLength(), read.getId());
+        soft_exit(1);
+    }
+    if (read.getId()[read.getIdLength()-1] == '2') {
+        //
+        // This is the second half of a pair. Skip it.
+        //
+        data->advance(bytesForFirstRead);
+        //
+        // Now make sure that the next read is /1.
+        //
+        if (!data->getData(&buffer, &bytes)) {
+            //
+            // This was the last read in the file.
+            //
+            return;
+        }
+        if (0 == FASTQReader::getReadFromBuffer(buffer, bytes, &read, fileName, data, context)) {
+            //
+            // Nothing parseable is left; read still holds the /2 record, so don't re-check it.
+            //
+            return;
+        }
+        if (read.getIdLength() < 2 || read.getId()[read.getIdLength()-2] != '/' || read.getId()[read.getIdLength()-1] != '1') {
+            WriteErrorMessage("PairedInterleavedFASTQReader: first read of pair doesn't appear to have an ID that ends in /1: '%.*s'\n", read.getIdLength(), read.getId());
+            soft_exit(1);
+        }
+    }
+}
+// FASTQWriter
+//
+// Open filename for writing in binary mode ("wb") and wrap it in a new FASTQWriter.
+// Returns NULL if the file can't be opened.
+//
+ FASTQWriter *
+FASTQWriter::Factory(const char *filename)
+ FILE *file = fopen(filename,"wb");
+ if (NULL == file) {
+ return NULL;
+ }
+ return new FASTQWriter(file);
+//
+// Append one read to the in-memory output buffer as a four-line FASTQ record
+// ("@<id>", bases, "+", qualities).  The buffer is flushed to the file first if the estimated
+// record size doesn't fit in the remaining space.  Always returns true.
+//
+ bool
+FASTQWriter::writeRead(Read *read)
+ // Upper bound on the formatted size: the ID, the bases and qualities, and the fixed punctuation.
+ size_t len = read->getIdLength() + 2 * (read->getDataLength() + 2) /* @ and + */ + 10 /* crlf + padding + null */;
+ if (bufferSize - bufferOffset <= len) {
+ flushBuffer();
+ }
+ size_t bytesUsed = snprintf(buffer + bufferOffset, bufferSize - bufferOffset, "@%.*s\n%.*s\n+\n%.*s\n", read->getIdLength(), read->getId(), read->getDataLength(), read->getData(), read->getDataLength(), read->getQuality());
+ // NOTE(review): the snprintf result is used unchecked; this assumes a record always fits in an empty buffer.
+ bufferOffset += bytesUsed;
+ return true;
+ for (int i = 0; i < 2; i++) {
+ delete readers[i];
+ readers[i] = NULL;
+ }
+ PairedFASTQReader *
+ DataSupplier* supplier,
+ const char *fileName0,
+ const char *fileName1,
+ int bufferCount,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess,
+ const ReaderContext& context)
+ PairedFASTQReader *reader = new PairedFASTQReader;
+ reader->readers[0] = FASTQReader::create(supplier, fileName0, bufferCount, startingOffset, amountOfFileToProcess, context);
+ reader->readers[1] = FASTQReader::create(supplier, fileName1, bufferCount, startingOffset, amountOfFileToProcess, context);
+ for (int i = 0; i < 2; i++) {
+ if (NULL == reader->readers[i]) {
+ delete reader;
+ reader = NULL;
+ return NULL;
+ }
+ }
+ return reader;
+//
+// Get the next read from each of the two underlying FASTQ readers.
+// Returns true if both produced a read and false if both are exhausted.  If one produced a read and
+// the other didn't, the files are out of step: report it and exit via soft_exit().
+//
+ bool
+PairedFASTQReader::getNextReadPair(Read *read0, Read *read1)
+ bool worked = readers[0]->getNextRead(read0);
+ if (readers[1]->getNextRead(read1) != worked) {
+ WriteErrorMessage("PairedFASTQReader: reads of both ends responded differently. The FASTQ files may not match properly.\n");
+ soft_exit(1);
+ }
+ return worked;
+ PairedReadSupplierGenerator *
+ const char *fileName0,
+ const char *fileName1,
+ int numThreads,
+ const ReaderContext& context,
+ bool gzip)
+ const char *fileNames[2] = {fileName0, fileName1};
+ //
+ // Decide whether to use the range splitter or a queue based on whether the files are the same size.
+ //
+ if (!strcmp("-", fileNames[0]) || !strcmp("-", fileNames[1]) || QueryFileSize(fileNames[0]) != QueryFileSize(fileNames[1]) || gzip) {
+ //WriteStatusMessage("FASTQ using supplier queue\n");
+ DataSupplier* dataSupplier[2];
+ size_t fileSize[2];
+ for (int i = 0; i < 2; i++) {
+ if (!strcmp(fileNames[i], "-")) {
+ fileSize[i] = 0;
+ if (gzip) {
+ dataSupplier[i] = DataSupplier::GzipStdio;
+ } else {
+ dataSupplier[i] = DataSupplier::Stdio;
+ }
+ } else {
+ fileSize[i] = QueryFileSize(fileNames[i]);
+ if (gzip) {
+ dataSupplier[i] = DataSupplier::GzipDefault;
+ } else {
+ dataSupplier[i] = DataSupplier::Default;
+ }
+ }
+ }
+ int bufferCount = ReadSupplierQueue::BufferCount(numThreads);
+ ReadReader *reader1 = FASTQReader::create(dataSupplier[0], fileName0, bufferCount, 0, fileSize[0],context);
+ ReadReader *reader2 = FASTQReader::create(dataSupplier[1], fileName1, bufferCount, 0, fileSize[1],context);
+ if (NULL == reader1 || NULL == reader2) {
+ delete reader1;
+ delete reader2;
+ return NULL;
+ }
+ ReadSupplierQueue *queue = new ReadSupplierQueue(reader1,reader2);
+ queue->startReaders();
+ return queue;
+ } else {
+ //WriteStatusMessage("FASTQ using range splitter\n");
+ return new RangeSplittingPairedReadSupplierGenerator(fileName0, fileName1, FASTQFile, numThreads, false, context);
+ }
+ ReadSupplierGenerator *
+ const char *fileName,
+ int numThreads,
+ const ReaderContext& context,
+ bool gzip)
+ bool isStdin = !strcmp(fileName,"-");
+ if (! gzip && !isStdin) {
+ //
+ // Single ended uncompressed FASTQ files can be handled by a range splitter.
+ //
+ return new RangeSplittingReadSupplierGenerator(fileName, false, numThreads, context);
+ } else {
+ ReadReader* fastq;
+ //
+ // Because we can only have one stdin reader, we need to use a queue if we're reading from stdin
+ //
+ if (isStdin) {
+ if (gzip) {
+ fastq = FASTQReader::create(DataSupplier::GzipStdio, fileName, ReadSupplierQueue::BufferCount(numThreads), 0, 0, context);
+ } else {
+ fastq = FASTQReader::create(DataSupplier::Stdio, fileName, ReadSupplierQueue::BufferCount(numThreads), 0, 0, context);
+ }
+ } else {
+ fastq = FASTQReader::create(DataSupplier::GzipDefault, fileName, ReadSupplierQueue::BufferCount(numThreads), 0, QueryFileSize(fileName), context);
+ }
+ if (fastq == NULL) {
+ delete fastq;
+ return NULL;
+ }
+ ReadSupplierQueue *queue = new ReadSupplierQueue(fastq);
+ queue->startReaders();
+ return queue;
+ }
+ PairedReadSupplierGenerator *
+PairedInterleavedFASTQReader::createPairedReadSupplierGenerator(
+    const char *fileName,
+    int numThreads,
+    const ReaderContext& context,
+    bool gzip)
+{
+    //
+    // Create the supplier generator for an interleaved paired FASTQ file.  Compressed input and
+    // stdin ("-") are read through a single reader feeding a ReadSupplierQueue; plain files are
+    // handled by a range splitter.
+    //
+    bool isStdin = !strcmp(fileName,"-");
+    if (gzip || isStdin) {
+        //WriteStatusMessage("PairedInterleavedFASTQ using supplier queue\n");
+        DataSupplier *dataSupplier;
+        if (isStdin) {
+            if (gzip) {
+                dataSupplier = DataSupplier::GzipStdio;
+            } else {
+                dataSupplier = DataSupplier::Stdio;
+            }
+        } else {
+            dataSupplier = DataSupplier::GzipDefault;
+        }
+        //
+        // Test the local isStdin flag here, not the C library's stdin stream (which is never NULL
+        // and so made this always pass 0).  This matches FASTQReader::createReadSupplierGenerator,
+        // which passes the file size for non-stdin input.
+        //
+        PairedReadReader *reader = PairedInterleavedFASTQReader::create(dataSupplier, fileName,
+            ReadSupplierQueue::BufferCount(numThreads), 0,(isStdin ? 0 : QueryFileSize(fileName)),context);
+        if (NULL == reader ) {
+            return NULL;
+        }
+        ReadSupplierQueue *queue = new ReadSupplierQueue(reader);
+        queue->startReaders();
+        return queue;
+    } else {
+        //WriteStatusMessage("PairedInterleavedFASTQ using range splitter\n");
+        return new RangeSplittingPairedReadSupplierGenerator(fileName, NULL, InterleavedFASTQFile, numThreads, false, context);
+    }
+}
diff --git a/SNAPLib/FASTQ.h b/SNAPLib/FASTQ.h
new file mode 100644
index 0000000..78572a2
--- /dev/null
+++ b/SNAPLib/FASTQ.h
@@ -0,0 +1,213 @@
+Module Name:
+ Headers for fast FASTQ genome "query" reader.
+ Bill Bolosky, August, 2011
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#include "Compat.h"
+#include "Read.h"
+#include "ReadSupplierQueue.h"
+#include "RangeSplitter.h"
+#include "DataReader.h"
+#include "Error.h"
+class FASTQReader : public ReadReader {
+ FASTQReader(DataReader* data, const ReaderContext& i_context);
+ virtual ~FASTQReader();
+ static FASTQReader* create(DataSupplier* supplier,
+ const char *fileName,
+ int bufferCount,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess,
+ const ReaderContext& i_context);
+ static void readHeader(const char* fileName, ReaderContext& context);
+ bool init(const char* i_fileName);
+ static ReadSupplierGenerator *createReadSupplierGenerator(const char *fileName, int numThreads, const ReaderContext& context, bool gzip = false);
+ virtual bool getNextRead(Read *readToUpdate);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ virtual void holdBatch(DataBatch batch)
+ { data->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch)
+ { return data->releaseBatch(batch); }
+ static _int64 getReadFromBuffer(char *buffer, _int64 bufferSize, Read *readToUpdate, const char *fileName, DataReader *data, const ReaderContext &context); // Returns the number of bytes consumed.
+ static bool skipPartialRecord(DataReader *data);
+ static const int maxReadSizeInBytes = MAX_READ_LENGTH * 2 + 1000; // Read as in sequencer read, not read-from-the-filesystem. +1000 is for ID string, + line, newlines, etc.
+ DataReader* data;
+ const char* fileName;
+ static const unsigned maxLineLen = MAX_READ_LENGTH + 500;
+ static const unsigned nLinesPerFastqQuery = 4;
+ static bool isValidStartingCharacterForNextLine[nLinesPerFastqQuery][256];
+ static class _init
+ {
+ public:
+ _init();
+ } _initializer;
+// Get read pairs from an interleaved FASTQ. It's the same as an ordinary FASTQ reader, except that it has a different version of
+// skipPartialRecord() that goes until it hits the first read in a pair. It identifies the pairs by looking for /1 and /2 at the
+// end of the read IDs.
+class PairedInterleavedFASTQReader : public PairedReadReader {
+ PairedInterleavedFASTQReader(DataReader* data, const ReaderContext& i_context);
+ virtual ~PairedInterleavedFASTQReader() {}
+ static PairedInterleavedFASTQReader* create(DataSupplier* supplier, const char *fileName, int bufferCount, _int64 startingOffset, _int64 amountOfFileToProcess,
+ const ReaderContext& i_context);
+ static void readHeader(const char* fileName, ReaderContext& context) {
+ FASTQReader::readHeader(fileName, context);
+ }
+ bool init(const char* i_fileName);
+ static PairedReadSupplierGenerator *createPairedReadSupplierGenerator(const char *fileName, int numThreads, const ReaderContext& context, bool gzip);
+ virtual bool getNextReadPair(Read *read0, Read *read1);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ virtual void holdBatch(DataBatch batch)
+ { data->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch)
+ { return data->releaseBatch(batch); }
+ virtual ReaderContext* getContext()
+ { return &context; }
+ static const int maxReadSizeInBytes = MAX_READ_LENGTH * 2 + 1000; // Read as in sequencer read, not read-from-the-filesystem. +1000 is for ID string, + line, newlines, etc.
+ DataReader* data;
+ const char* fileName;
+ ReaderContext context;
+class PairedFASTQReader: public PairedReadReader {
+ virtual ~PairedFASTQReader();
+ static PairedFASTQReader* create(DataSupplier* supplier, const char *fileName0, const char *fileName1,
+ int bufferCount, _int64 startingOffset, _int64 amountOfFileToProcess, const ReaderContext& context);
+ virtual bool getNextReadPair(Read *read0, Read *read1);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) {
+ for (int i = 0; i < 2; i++) {
+ readers[i]->reinit(startingOffset,amountOfFileToProcess);
+ }
+ }
+ virtual ReadReader *getReaderToInitializeRead(int whichHalfOfPair)
+ {
+ _ASSERT(0 == whichHalfOfPair || 1 == whichHalfOfPair);
+ return readers[whichHalfOfPair];
+ }
+ static PairedReadSupplierGenerator *createPairedReadSupplierGenerator(const char *fileName0, const char *fileName1, int numThreads, const ReaderContext& context, bool gzip = false);
+ virtual void holdBatch(DataBatch batch)
+ { _ASSERT(false); /* not supported */ }
+ virtual bool releaseBatch(DataBatch batch)
+ { _ASSERT(false); /* not supported */ return false; }
+ virtual ReaderContext* getContext()
+ { return readers[0]->getContext(); }
+ PairedFASTQReader()
+ {
+ for (int i =0; i < 2; i++) {
+ readers[i] = NULL;
+ }
+ }
+ FASTQReader *readers[2];
+class FASTQWriter {
+ ~FASTQWriter() {flushBuffer(); delete [] buffer; fclose(outputFile);}
+ static FASTQWriter *Factory(const char *filename);
+ bool writeRead(Read *readToWrite);
+ void flushBuffer()
+ {
+ if (0 == bufferOffset) {
+ return;
+ }
+ if (1 != fwrite(buffer, bufferOffset, 1, outputFile)) {
+ WriteErrorMessage("FASTQWriter: error writing file\n");
+ }
+ bufferOffset = 0;
+ }
+ FASTQWriter(FILE *i_outputFile) : outputFile(i_outputFile) {
+ bufferSize = 20 * 1024 * 1024;
+ buffer = new char[bufferSize];
+ bufferOffset = 0;
+ }
+ FILE *outputFile;
+ char *buffer;
+ size_t bufferSize;
+ size_t bufferOffset;
diff --git a/SNAPLib/FileFormat.h b/SNAPLib/FileFormat.h
new file mode 100644
index 0000000..74734f3
--- /dev/null
+++ b/SNAPLib/FileFormat.h
@@ -0,0 +1,108 @@
+Module Name:
+ FileFormat.h
+ Headers for the FileFormat class for the SNAP sequencer
+ Ravi Pandya, February 2013
+ User mode service.
+Revision History:
+#pragma once
+#include "Compat.h"
+#include "Tables.h"
+#include "Read.h"
+#include "Genome.h"
+#include "LandauVishkin.h"
+#include "AlignerOptions.h"
+#include "Genome.h"
+// abstract class defining format-specific operations
+// for reading and writing files of reads
+class FileFormat
+ //
+ // files
+ //
+ // reading
+ //
+ virtual void getSortInfo(const Genome* genome, char* buffer, _int64 bytes, GenomeLocation* o_location, GenomeDistance* o_readBytes, int* o_refID = NULL, int* o_pos = NULL) const = 0;
+ /*
+ virtual ReadReader* createReader(const DataSupplier* supplier, const char *fileName,
+ const Genome *genome, _int64 startingOffset, _int64 amountOfFileToProcess,
+ ReadClippingType clipping = ClipBack) = 0;
+ virtual PairedReadReader* createPairedReader(const DataSupplier* supplier, const char *fileName,
+ const Genome *genome, _int64 startingOffset, _int64 amountOfFileToProcess,
+ ReadClippingType clipping = ClipBack) = 0;
+ virtual ReadSupplierGenerator *createReadSupplierGenerator(const char *fileName, int numThreads,
+ const Genome *genome, ReadClippingType clipping = ClipBack) = 0;
+ virtual PairedReadSupplierGenerator *createPairedReadSupplierGenerator(const char *fileName,
+ int numThreads, const Genome *genome, ReadClippingType clipping = ClipBack) = 0;
+ // parse header and match against genome
+ virtual bool parseHeader(const char *fileName, char *firstLine, char *endOfBuffer,
+ const Genome *genome, _int64 *o_headerSize) = 0;
+ */
+ static const char* RGLineToAux;
+ static void setupReaderContext(AlignerOptions* options, ReaderContext* readerContext, bool bam);
+ virtual void setupReaderContext(AlignerOptions* options, ReaderContext* readerContext) const = 0;
+ //
+ // writing
+ //
+ virtual ReadWriterSupplier* getWriterSupplier(AlignerOptions* options, const Genome* genome) const = 0;
+ virtual bool writeHeader(
+ const ReaderContext& context, char *header, size_t headerBufferSize, size_t *headerActualSize,
+ bool sorted, int argc, const char **argv, const char *version, const char *rgLine, bool omitSQLines) const = 0;
+ virtual bool writeRead(
+ const ReaderContext& context, LandauVishkinWithCigar * lv, char * buffer, size_t bufferSpace,
+ size_t * spaceUsed, size_t qnameLen, Read * read, AlignmentResult result,
+ int mapQuality, GenomeLocation genomeLocation, Direction direction, bool secondaryAlignment, int* o_addFrontClipping,
+ bool hasMate = false, bool firstInPair = false, Read * mate = NULL,
+ AlignmentResult mateResult = NotFound, GenomeLocation mateLocation = 0, Direction mateDirection = FORWARD,
+ bool alignedAsPair = false) const = 0;
+ //
+ // formats
+ //
+ static const FileFormat* SAM[2]; // 0 for =, 1 for M (useM flag)
+ static const FileFormat* BAM[2];
+ static const FileFormat* FASTQ;
+ static const FileFormat* FASTQZ;
diff --git a/SNAPLib/FixedSizeMap.h b/SNAPLib/FixedSizeMap.h
new file mode 100644
index 0000000..bca97a5
--- /dev/null
+++ b/SNAPLib/FixedSizeMap.h
@@ -0,0 +1,296 @@
+#pragma once
+#include "Compat.h"
+#include "BigAlloc.h"
+#include "exit.h"
+#include "Error.h"
+// A hash function for numeric types.
+template<typename T>
+class NumericHash
+ inline _uint64 operator() (T value) { // hash = key scaled by the constant 131, widened to 64 bits
+ return (_uint64) (value * 131); // NOTE(review): the multiply happens before the widening cast, so it is done in T's promoted type and may wrap for narrow T - confirm intended
+ }
+// A fixed-size hash map that allows for efficient clearing and reuse through epochs
+// and does not perform any memory allocation.
+// This class only allows the capacity to be a power of 2.
+// Use epoch + 1 as a tombstone for deleted values
+// K must be a numeric type that supports shift, mask and xor operators.
+template< typename K, typename V, typename Hash = NumericHash<K> >
+class FixedSizeMap
+ FixedSizeMap(unsigned capacity_ = 16): entries(NULL), size(0) { // start with no table; reserve() allocates it
+ reserve(capacity_); // reserve() calls soft_exit(1) if capacity_ is not a power of 2
+ }
+ ~FixedSizeMap() {
+ delete[] entries; // Entry::operator delete[] routes this to BigDealloc
+ }
+ void reserve(unsigned capacity) { // (re)allocate the table with `capacity` slots; only permitted while the map is empty
+ if (!isPowerOf2(capacity)) { // slot index is computed as hash & mask, with mask = capacity - 1
+ WriteErrorMessage("FixedSizeMap capacity must be a power of 2\n");
+ soft_exit(1);
+ }
+ if (entries != NULL) {
+ if (size > 0) {
+ WriteErrorMessage("reserve() called on a non-empty FixedSizeMap\n");
+ soft_exit(1);
+ }
+ delete[] entries;
+ }
+ this->capacity = capacity;
+ this->mask = capacity - 1;
+ entries = new Entry[capacity];
+ for (unsigned i = 0; i < capacity; i++) {
+ entries[i].epoch = 0; // 0 is below every current epoch, so get() treats the slot as never used
+ }
+ epoch = 1;
+ clearBloomFilter();
+ }
+ void clear() { // empty the map by moving to a new epoch instead of rewriting every slot
+ size = 0;
+ epoch += 2; // step over epoch + 1, the value erase() writes as a tombstone
+ if (epoch > 100000000) { // epoch counter got large: do the one-off full reset
+ // Reset the epoch of every bucket to 0 and the current epoch to 1
+ for (unsigned i = 0; i < capacity; i++) {
+ entries[i].epoch = 0;
+ }
+ epoch = 1;
+ }
+ clearBloomFilter();
+ }
+ void resize(unsigned size) // placeholder: currently only asserts that the requested size fits in the table
+ {
+ // Do something here to limit the size of the hash table to reduce cache missing.
+ _ASSERT(size <= capacity);
+ }
+ static const unsigned MaxQuadraticProbes = 4; // the first 4 probe steps grow (1, 2, 3, 4); later steps are 1
+ inline V get(K key) { // look up key in the current epoch; returns a default-constructed V when absent
+ unsigned pos = hash(key) & mask;
+#if 0
+ //
+ // Prefetch the data. If it hits in the Bloom Filter then we can overlap the cache fetch
+ // with the Bloom Filter computation, making the latter essentially free. If it's not in the
+ // Bloom Filter, then this will bring the cache line in for the add that's doubtless coming
+ // soon after.
+ //
+ _mm_prefetch((const char *)(&entries[pos]),_MM_HINT_T2);
+ if (!checkBloomFilter(key)) {
+ //
+ // Not in the Bloom Filter means not in the cache.
+ //
+ return V();
+ }
+#endif // 0
+ unsigned i = 1;
+ while (true) {
+ if (entries[pos].epoch < epoch) { // slot not written in this epoch: the probe chain ends here
+ return V();
+ } else if (entries[pos].key == key && entries[pos].epoch == epoch) { // live match (a tombstone has epoch + 1 and is skipped)
+ return entries[pos].value;
+ } else {
+ pos = (pos + (i <= MaxQuadraticProbes ? i : 1)) & mask;
+ i++;
+ if (i > capacity + MaxQuadraticProbes) { // probe budget exhausted: give up and report absent
+ return V();
+ }
+ }
+ }
+ }
+ // Insert key -> value, or overwrite the value if key is already live in the current epoch.
+ //
+ // A slot tombstoned by erase() (epoch + 1) is remembered for reuse but does not end the probe:
+ // the key may still be live further along its probe sequence, and storing at the tombstone
+ // straight away would leave two copies of the key in the table. Only a slot that has not been
+ // written in this epoch proves the key is absent.
+ //
+ // If the whole table has been probed without finding the key or a reusable slot, the map has
+ // overflowed; this is reported and the process exits, instead of looping forever when
+ // _ASSERT is compiled out.
+ inline void put(K key, V value) {
+     _ASSERT(size < capacity);
+//    addToBloomFilter(key);
+     unsigned pos = hash(key) & mask;
+     unsigned i = 1;
+     Entry *target = NULL; // first reusable slot (tombstone or unused) seen so far
+     while (true) {
+         Entry *slot = &entries[pos];
+         if (slot->epoch == epoch) {
+             if (slot->key == key) {
+                 // Key already present: overwrite in place, size is unchanged.
+                 slot->value = value;
+                 return;
+             }
+         } else if (slot->epoch < epoch) {
+             // Unused in this epoch, so the key cannot be stored beyond this point.
+             if (target == NULL) {
+                 target = slot;
+             }
+             break;
+         } else if (target == NULL) {
+             // Tombstone: remember it, but keep looking for a live copy of the key.
+             target = slot;
+         }
+         pos = (pos + (i <= MaxQuadraticProbes ? i : 1)) & mask;
+         i++;
+         if (i > capacity + MaxQuadraticProbes) {
+             // Every slot has been visited.
+             break;
+         }
+     }
+     if (target == NULL) {
+         WriteErrorMessage("FixedSizeMap overflowed. Code bug.\n");
+         soft_exit(1);
+         return;
+     }
+     target->key = key;
+     target->value = value;
+     target->epoch = epoch;
+     size++;
+ }
+ inline void erase(K key) { // remove key if it is live in the current epoch; no-op otherwise
+ _ASSERT(size <= capacity);
+ unsigned pos = hash(key) & mask;
+ unsigned i = 1;
+ while (true) {
+ if (entries[pos].epoch < epoch) { // slot not written in this epoch: key is not in the map
+ return;
+ } else if (entries[pos].key == key && entries[pos].epoch == epoch) {
+ entries[pos].epoch = epoch + 1; // mark with tombstone
+ size--;
+ return;
+ } else {
+ pos = (pos + (i <= MaxQuadraticProbes ? i : 1)) & mask;
+ i++;
+ _ASSERT(i <= capacity + MaxQuadraticProbes); // todo: overflow condition? (only checked when _ASSERT is active)
+ }
+ }
+ }
+ inline int getSize() { return size; } // number of live entries in the current epoch
+ void *operator new(size_t size) {return BigAlloc(size);} // map objects themselves are allocated through BigAlloc
+ void operator delete(void *ptr) {BigDealloc(ptr);}
+ typedef void* iterator;
+ // Return an iterator to the first slot that is live in the current epoch, or end() if there is none.
+ iterator begin()
+ {
+     // Scan forward from slot 0 directly. The previous form seeded next() with &entries[-1],
+     // which forms a pointer before the start of the array (undefined behavior), only to
+     // increment it again immediately.
+     Entry* final = &entries[capacity];
+     Entry* x = entries;
+     while (x < final && x->epoch != epoch) {
+         x++;
+     }
+     return x;
+ }
+ iterator next(iterator i) // advance to the next slot that is live in the current epoch, or to end()
+ {
+ Entry* final = &entries[capacity];
+ Entry* x = (Entry*) i;
+ if (x < final) { // already at end(): stay there
+ do {
+ x++;
+ } while (x < final && x->epoch != epoch); // skip unused slots and tombstones
+ }
+ return x;
+ }
+ iterator end() // one-past-the-last slot; the value next() returns when no live slot remains
+ {
+ return &entries[capacity];
+ }
+ K key(iterator i) // key stored in the slot the iterator points at
+ {
+ return ((Entry*)i)->key;
+ }
+ V& value(iterator i) // mutable reference to the value stored in the slot the iterator points at
+ {
+ return ((Entry*)i)->value;
+ }
+ //
+ // To avoid cache misses on failed lookups, we have a cheesy Bloom filter. It's fixed at 512 bits (which is 64 bytes, typically
+ // a cache line), and two features.
+ //
+ static const unsigned bloomFilterFeatureSizeInBits = 9; // Must be >=3. Using 9 results in 64 bytes of Bloom Filter, which is cache-line sized (though not necessarily aligned)
+ static const unsigned bloomFilterSizeInChar = (1 << (bloomFilterFeatureSizeInBits - 3)); // filter size in bytes: 2^bits / 8
+ static const _uint64 bloomFilterFeatureMask = (1 << bloomFilterFeatureSizeInBits) - 1;
+ unsigned char bloomFilter[bloomFilterSizeInChar];
+ static inline void getBloomFilterFeatures(K key, unsigned *feature0Word, unsigned *feature0Bit, unsigned *feature1Word, unsigned *feature1Bit) // map key to two (byte index, bit index) positions in bloomFilter
+ {
+ //
+ // We know the bloom filter is 2^bloomFilterFeatureSizeInBits bits wide. Use alternating bloomFilterFeatureSizeInBits bit chunks from the key to build up each of the features.
+ //
+ _uint64 feature[2] = {0, 0};
+ for (int i = 0; i < sizeof(K) * 8; i += bloomFilterFeatureSizeInBits * 2) {
+ feature[0] ^= ((key >> i) & bloomFilterFeatureMask); // even-numbered chunks fold into feature 0
+ feature[1] ^= ((key >> (i+bloomFilterFeatureSizeInBits)) & bloomFilterFeatureMask); // odd-numbered chunks fold into feature 1
+ }
+ *feature0Word = feature[0] / 8; // byte within bloomFilter
+ *feature0Bit = feature[0] % 8; // bit within that byte
+ *feature1Word = feature[1] / 8;
+ *feature1Bit = feature[1] % 8;
+ _ASSERT(*feature0Word < bloomFilterSizeInChar && *feature1Word < bloomFilterSizeInChar);
+ }
+ //
+ // false means that this entry is NOT in the cache. true means we can't say for sure.
+ //
+ inline bool checkBloomFilter(K key)
+ {
+ unsigned feature0Word, feature0Bit, feature1Word, feature1Bit;
+ getBloomFilterFeatures(key, &feature0Word, &feature0Bit, &feature1Word, &feature1Bit);
+ return (bloomFilter[feature0Word] & (1 << feature0Bit)) && (bloomFilter[feature1Word] & (1 << feature1Bit)); // both feature bits must be set
+ }
+ inline void addToBloomFilter(K key) // set both feature bits for key
+ {
+ unsigned feature0Word, feature0Bit, feature1Word, feature1Bit;
+ getBloomFilterFeatures(key, &feature0Word, &feature0Bit, &feature1Word, &feature1Bit);
+ bloomFilter[feature0Word] |= 1 << feature0Bit;
+ bloomFilter[feature1Word] |= 1 << feature1Bit;
+ }
+ void clearBloomFilter() // zero the whole filter; called from reserve() and clear()
+ {
+ memset(bloomFilter, 0, bloomFilterSizeInChar * sizeof(bloomFilter[0]));
+ }
+ struct Entry { // one slot of the table
+ K key;
+ V value;
+ int epoch; // == map epoch: live; == map epoch + 1: erased (tombstone); lower: unused in this epoch
+ void *operator new[](size_t size) {return BigAlloc(size);} // slot arrays are allocated through BigAlloc
+ void operator delete[](void *ptr) {BigDealloc(ptr);}
+ };
+ Entry *entries; // slot array of `capacity` elements
+ unsigned capacity; // always a power of 2 (enforced by reserve())
+ unsigned size; // number of live entries
+ int mask; // capacity - 1, used to wrap slot indices
+ int epoch; // current epoch; advanced by 2 on every clear()
+ Hash hash;
+ // True iff n is a positive power of two (1, 2, 4, 8, ...); zero and negative values are rejected.
+ bool isPowerOf2(int n) {
+     // A positive power of two has exactly one bit set, so clearing its lowest set bit leaves zero.
+     return n > 0 && (n & (n - 1)) == 0;
+ }
diff --git a/SNAPLib/FixedSizeSet.h b/SNAPLib/FixedSizeSet.h
new file mode 100644
index 0000000..66eea28
--- /dev/null
+++ b/SNAPLib/FixedSizeSet.h
@@ -0,0 +1,131 @@
+#pragma once
+#include "Compat.h"
+#include "BigAlloc.h"
+#include "FixedSizeMap.h"
+#include "exit.h"
+#include "Error.h"
+// A fixed-capacity hash set that allows for efficient clearing and reuse through epochs
+// and does not perform any memory allocation.
+// This class only allows the capacity to be a power of 2.
+template< typename K, typename Hash = NumericHash<K> >
+class FixedSizeSet
+ FixedSizeSet(int capacity_ = 16): entries(NULL), size(0) { // start with no table; reserve() allocates it
+ reserve(capacity_);
+ }
+ ~FixedSizeSet() {
+ delete[] entries;
+ }
+ void reserve(int capacity) { // (re)allocate the table; only permitted while the set is empty
+ if (!isPowerOf2(capacity)) { // slot index is computed as hash & mask, with mask = capacity - 1
+ WriteErrorMessage("FixedSizeSet capacity must be a power of 2\n");
+ soft_exit(1);
+ }
+ if (entries != NULL) {
+ if (size > 0) {
+ WriteErrorMessage("reserve() called on a non-empty FixedSizeSet\n");
+ soft_exit(1);
+ }
+ delete[] entries;
+ }
+ this->capacity = capacity;
+ this->mask = capacity - 1;
+ entries = new Entry[capacity];
+ for (int i = 0; i < capacity; i++) {
+ entries[i].epoch = 0; // differs from every current epoch (which starts at 1), so the slot reads as empty
+ }
+ epoch = 1;
+ }
+ void clear() { // empty the set by moving to a new epoch instead of rewriting every slot
+ size = 0;
+ epoch++;
+ if (epoch > 100000000) {
+ // Reset the epoch of every bucket to 0 and the current epoch to 1
+ for (int i = 0; i < capacity; i++) {
+ entries[i].epoch = 0;
+ }
+ epoch = 1;
+ }
+ }
+ inline bool contains(K key) { // true iff key was added in the current epoch
+ unsigned pos = hash(key) & mask;
+ int i = 1;
+ while (true) { // terminates only on a match or an empty slot; add() keeps at least one slot empty
+ if (entries[pos].epoch != epoch) {
+ return false;
+ } else if (entries[pos].key == key) {
+ return true;
+ } else {
+ pos = (pos + i) & mask; // step grows by one on each probe
+ i++;
+ }
+ }
+ }
+ inline void add(K key) { // insert key if absent; exits the process if the table would become full
+ _ASSERT(size < capacity);
+ unsigned pos = hash(key) & mask;
+ int i = 1;
+ while (true) {
+ if (entries[pos].epoch != epoch) { // empty slot in this epoch: claim it
+ entries[pos].key = key;
+ entries[pos].epoch = epoch;
+ size++;
+ if (size >= capacity) { // Can't be exactly equal, because then contains with a non-existent element infinite loops
+ WriteErrorMessage("FixedSizeSet overflowed. Code bug.\n");
+ soft_exit(1);
+ }
+ return;
+ } else if (entries[pos].key == key) { // already present
+ return;
+ } else {
+ pos = (pos + i) & mask;
+ i++;
+ }
+ }
+ }
+ void *operator new(size_t size) {return BigAlloc(size);}
+ void operator delete(void *ptr) {BigDealloc(ptr);}
+ struct Entry { // one slot of the table
+ K key;
+ int epoch; // == set epoch: occupied; anything else: empty in this epoch
+ void *operator new[](size_t size) {return BigAlloc(size);}
+ void operator delete[](void *ptr) {BigDealloc(ptr);}
+ };
+ Entry *entries;
+ int capacity; // always a power of 2 (enforced by reserve())
+ int size; // number of keys added in the current epoch
+ int maxSize; // NOTE(review): not read or written anywhere in this class as shown - confirm whether it is still needed
+ int mask; // capacity - 1, used to wrap slot indices
+ int epoch;
+ Hash hash;
+ bool isPowerOf2(int n) { // true iff n is a positive power of two
+ while (n > 0) {
+ if (n == 1) {
+ return true;
+ } else if (n % 2 == 1) {
+ return false;
+ } else {
+ n /= 2;
+ }
+ }
+ return false;
+ }
diff --git a/SNAPLib/FixedSizeVector.h b/SNAPLib/FixedSizeVector.h
new file mode 100644
index 0000000..ad29f6e
--- /dev/null
+++ b/SNAPLib/FixedSizeVector.h
@@ -0,0 +1,72 @@
+#pragma once
+#include "exit.h"
+#include "Error.h"
+// A fixed-size vector that does not perform any memory allocation.
+template<typename V>
+class FixedSizeVector
+ FixedSizeVector(int capacity_ = 16): entries(NULL), curSize(0) { // empty vector with room for capacity_ elements
+ reserve(capacity_);
+ }
+ // Create a fixed size vector initialized to size copies of an initialValue
+ FixedSizeVector(int size, V initialValue): entries(NULL), curSize(0) {
+ reserve(size); // capacity equals size, so the vector is full once the loop below finishes
+ for (int i = 0; i < size; i++) {
+ push_back(initialValue);
+ }
+ }
+ ~FixedSizeVector() {
+ delete[] entries;
+ }
+ void reserve(int capacity) { // (re)allocate storage; only permitted while the vector is empty
+ if (entries != NULL) {
+ if (curSize > 0) {
+ WriteErrorMessage("reserve() called on a non-empty FixedSizeVector\n");
+ soft_exit(1);
+ }
+ delete[] entries;
+ }
+ this->capacity = capacity;
+ entries = new V[capacity];
+ }
+ void clear() { // forget all elements; storage is kept and elements are not destroyed
+ curSize = 0;
+ }
+ int size() {
+ return curSize;
+ }
+ inline void push_back(const V& value) { // append by assignment; capacity is only checked by _ASSERT
+ _ASSERT(curSize < capacity);
+ entries[curSize++] = value;
+ }
+ inline V& operator[] (int index) { // unchecked element access
+ return entries[index];
+ }
+ typedef V *iterator;
+ iterator begin() {
+ return entries;
+ }
+ iterator end() { // one past the last pushed element, not the end of the allocated storage
+ return entries + curSize;
+ }
+ V *entries;
+ int capacity;
+ int curSize;
diff --git a/SNAPLib/GenericFile.cpp b/SNAPLib/GenericFile.cpp
new file mode 100755
index 0000000..82b87f5
--- /dev/null
+++ b/SNAPLib/GenericFile.cpp
@@ -0,0 +1,109 @@
+Module Name:
+ GenericFile.cpp
+ Generic IO class for SNAP that can read from either the filesystem or HDFS.
+ Jeremy Elson, February 2014
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include <string.h>
+#include "Compat.h"
+#include "GenericFile.h"
+#include "GenericFile_stdio.h"
+#ifdef SNAP_HDFS
+# include "GenericFile_HDFS.h"
+const char *GenericFile::HDFS_PREFIX = "hdfs:/";
+ _filename = NULL;
+ free(_filename);
+GenericFile *GenericFile::open(const char *filename, Mode mode) // factory: choose a backend from the filename prefix; returns NULL on failure
+ if (NULL == filename) {
+ return NULL;
+ }
+ GenericFile *retval = NULL;
+ if (0 == strncmp(filename, HDFS_PREFIX, strlen(HDFS_PREFIX))) { // name starts with HDFS_PREFIX ("hdfs:/")
+#ifdef SNAP_HDFS
+ retval = GenericFile_HDFS::open(filename, mode);
+ fprintf(stderr, "SNAP not compiled with HDFS support. Set HADOOP_HOME and recompile.\n"); // NOTE(review): presumably the non-HDFS branch; the #else/#endif lines are not visible in this excerpt
+ retval = NULL;
+ } else {
+ retval = GenericFile_stdio::open(filename, mode);
+ }
+ if (NULL != retval) {
+ retval->_filename = strdup(filename); // private copy of the name; NOTE(review): strdup result is not checked
+ retval->_mode = mode;
+ }
+ return retval;
+// gets -- read until a newline. Based on the K&R implementation.
+// Reads at most count-1 characters through getchar(), stopping after a newline (which is stored)
+// or at EOF, and always NUL-terminates buf. Returns buf, or NULL if count is 0 or if EOF is
+// reached before any character has been stored.
+char *GenericFile::_gets_impl(char *buf, size_t count)
+    // Start from a non-EOF value: when count == 1 the loop below never calls getchar(), and the
+    // EOF test in the return statement would otherwise read an uninitialized variable.
+    int c = 0;
+    char *next;
+    if (count == 0) {
+        return NULL;
+    }
+    next = buf;
+    while (--count > 0 && (c = getchar()) != EOF) {
+        // put the input char into the current pointer position, then increment it.
+        // if a newline is encountered, break
+        if ((*next++ = c) == '\n')
+            break;
+    }
+    *next = '\0';
+    return (c == EOF && next == buf) ? NULL : buf;
+ _int64
+ const size_t ioSize = 128 * 1024 * 1024;
+ char *buffer = new char[ioSize];
+ for (;;) {
+ if (0 == read(buffer, ioSize)) {
+ delete[] buffer;
+ return 0;
+ }
+ }
\ No newline at end of file
diff --git a/SNAPLib/GenericFile.h b/SNAPLib/GenericFile.h
new file mode 100755
index 0000000..3950d27
--- /dev/null
+++ b/SNAPLib/GenericFile.h
@@ -0,0 +1,75 @@
+Module Name:
+ GenericFile.h
+ Generic IO class for SNAP that can read from either stdlib or HDFS.
+ Jeremy Elson, February 2014
+ User mode service.
+Revision History:
+#pragma once
+#include "Compat.h"
+class GenericFile
+ enum Mode
+ {
+ ReadOnly,
+ WriteOnly,
+ };
+ static const char *HDFS_PREFIX;
+ // Factory that returns either:
+ // * a GenericFile_HDFS object if the filename starts with HDFS_PREFIX ("hdfs:/")
+ // * a GenericFile_stdio object otherwise
+ static GenericFile *open(const char *fileName, Mode mode);
+ // Read 'count' bytes into the memory pointed at by 'ptr'.
+ // Returns the actual number of bytes read. NOTE(review): the return type is unsigned, so an
+ // error cannot be reported as -1; confirm how each backend signals failure.
+ virtual size_t read(void *ptr, size_t count) = 0;
+ // Gets a single character from the stream, like stdio getc.
+ // On success, returns the char promoted to an int.
+ // On failure, returns EOF.
+ virtual int getchar() = 0;
+ // Gets a string from the file and stores it as a C string in 'buf' until (count-1)
+ // characters have been read or either a newline or end-of-file is reached,
+ // whichever happens first.
+ virtual char *gets(char *buf, size_t count) = 0;
+ // Advance forward or back by byteOffset bytes in the file.
+ virtual int advance(long long byteOffset) = 0;
+ // Close the file.
+ virtual void close() = 0;
+ // Return the name of the file.
+ char *getFilename() { return _filename; }
+ virtual ~GenericFile();
+ virtual _int64 prefetch(); // Ignore the return value, it's just to trick the compiler into not optimizing it away.
+ char *_gets_impl(char *buf, size_t count); // shared gets() implementation built on the virtual getchar()
+ GenericFile();
+ Mode _mode;
+ char *_filename; // set by open() with strdup()
diff --git a/SNAPLib/GenericFile_Blob.cpp b/SNAPLib/GenericFile_Blob.cpp
new file mode 100644
index 0000000..d207232
--- /dev/null
+++ b/SNAPLib/GenericFile_Blob.cpp
@@ -0,0 +1,125 @@
+Module Name:
+ GenericFile_Blob.cpp
+ Generic IO class for SNAP that can read from an in-memory blob.
+ Bill Bolosky, March, 2014
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "GenericFile_Blob.h"
+GenericFile_Blob::GenericFile_Blob(void *i_blob, size_t i_blobSize) : blob((char *)i_blob), readPointer((char *)i_blob), blobSize(i_blobSize), blobEnd((char *)i_blob + i_blobSize)
+GenericFile_Blob *
+GenericFile_Blob::open(void *i_blob, size_t i_blobSize) // wrap an existing memory range; the range is not copied
+ GenericFile_Blob *file = new GenericFile_Blob(i_blob, i_blobSize);
+ return file;
+GenericFile_Blob::read(void *ptr, size_t count) // copy up to count bytes out of the blob and advance the read pointer
+ size_t bytesReturned;
+ void *base = mapAndAdvance(count, &bytesReturned); // clamps to what is left in the blob
+ memcpy(ptr, base, bytesReturned);
+ return bytesReturned; // may be less than count near the end of the blob
+GenericFile_Blob::advance(long long offset)
+    // Move the read pointer by offset bytes (a negative offset moves backward).
+    // Returns 0 on success, 1 if a backward move would land before the start of the blob, and
+    // EOF if a forward move would pass the end. The read pointer is unchanged on failure.
+    if (offset < 0) {
+        // blob - readPointer is the most negative offset that still lands inside the blob.
+        // The original comparison was inverted: it rejected every rewind that fit and let
+        // through the ones that moved the pointer before the start of the blob.
+        if (offset < blob - readPointer) {
+            return 1;
+        }
+        readPointer += offset;
+        return 0;
+    }
+    long long amountRemaining = blobEnd - readPointer;
+    if (amountRemaining < offset) {
+        return EOF;
+    }
+    readPointer += offset;
+    return 0;
+ void
+ blob = readPointer = blobEnd = NULL;
+ blobSize = 0;
+ close();
+ int
+ if (readPointer >= blobEnd) {
+ return EOF;
+ }
+ unsigned char c = *(unsigned char *)readPointer;
+ readPointer++;
+ return (int)c;
+char *GenericFile_Blob::gets(char *buf, size_t count) // line read built on getchar() via the shared helper
+ return _gets_impl(buf, count);
+ //
+ // This gives a pointer into the blob rather than copying it.
+ // It's the caller's responsibility to assure that the blob doesn't
+ // go away while this pointer's still in use.
+ //
+ void *
+GenericFile_Blob::mapAndAdvance(size_t count, size_t *bytesReturned)
+ size_t amountRemaining = blobEnd - readPointer;
+ if (count > amountRemaining) { // asked for more than is left: hand back only the remainder
+ *bytesReturned = amountRemaining;
+ } else {
+ *bytesReturned = count;
+ }
+ void *retVal = readPointer; // start of the returned range
+ readPointer += *bytesReturned;
+ return retVal;
+ size_t
+ return readPointer - blob;
diff --git a/SNAPLib/GenericFile_Blob.h b/SNAPLib/GenericFile_Blob.h
new file mode 100644
index 0000000..82633fc
--- /dev/null
+++ b/SNAPLib/GenericFile_Blob.h
@@ -0,0 +1,66 @@
+Module Name:
+ GenericFile_Blob.h
+ Generic IO class for SNAP that can read from an in-memory blob.
+ Bill Bolosky, March, 2014
+ User mode service.
+Revision History:
+#pragma once
+#include "GenericFile.h"
+class GenericFile_Blob: public GenericFile
+ //
+ // This object does not take ownership of the blob, so it's the caller's
+ // responsibility to assure that it continues to exist as long as the
+ // GenericFile_Blob object does, and to free the memory as necessary.
+ // In addition, if anyone calls mapAndAdvance, the caller must
+ // assure that the blob continues to exist until all uses of that
+ // pointer are finished as well.
+ //
+ static GenericFile_Blob *open(void *i_blob, size_t i_blobSize);
+ virtual size_t read(void *ptr, size_t count);
+ virtual int getchar();
+ virtual char *gets(char *buf, size_t count);
+ virtual int advance(long long offset);
+ virtual void close();
+ virtual ~GenericFile_Blob();
+ virtual size_t getAmountUsed();
+ //
+ // This gives a pointer into the blob rather than copying it.
+ // It's the caller's responsibility to assure that the blob doesn't
+ // go away while this pointer's still in use.
+ //
+ void *mapAndAdvance(size_t count, size_t *bytesReturned);
+ GenericFile_Blob(void *i_blob, size_t i_blobSize);
+ char *blob; // first byte of the wrapped range
+ char *blobEnd; // one past the last byte (blob + blobSize)
+ char *readPointer; // current position, between blob and blobEnd
+ size_t blobSize;
diff --git a/SNAPLib/GenericFile_HDFS.cpp b/SNAPLib/GenericFile_HDFS.cpp
new file mode 100755
index 0000000..0f44e08
--- /dev/null
+++ b/SNAPLib/GenericFile_HDFS.cpp
@@ -0,0 +1,382 @@
+Module Name:
+ GenericFile_HDFS.cpp
+ Generic IO class for SNAP that can read from HDFS.
+ Jeremy Elson, February 2014
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#ifdef SNAP_HDFS
+#include "Compat.h"
+#include "Error.h"
+#include "GenericFile.h"
+#include "GenericFile_HDFS.h"
+#include "Util.h"
+ExclusiveLock GenericFile_HDFS::_staticLock;
+hdfsFS GenericFile_HDFS::_fs;
+// this goes last so staticInit runs after others are initialized;
+// it has side-effects that initialize the other members
+int GenericFile_HDFS::_initFlag = GenericFile_HDFS::_staticInit();
+int GenericFile_HDFS::_staticInit() // one-time setup of the class-wide lock and connection handle
+ InitializeExclusiveLock(&_staticLock);
+ SetExclusiveLockWholeProgramScope(&_staticLock);
+ _fs = NULL; // the connection itself is made lazily in open()
+ srand(unsigned(time(NULL))); // NOTE(review): presumably seeds the random_shuffle in the work-queue constructor - confirm
+ return 1;
+ _file = NULL;
+GenericFile_HDFS *GenericFile_HDFS::open(const char *filename, Mode mode) // connect to HDFS if needed, then open filename; NULL on failure
+ GenericFile_HDFS *retval = new GenericFile_HDFS();
+ bool noFs = false;
+ AcquireExclusiveLock(&_staticLock); // _fs is shared by all instances, so connect under the class-wide lock
+ if (NULL == _fs) {
+ // note, because we're now holding this globally due to HDFS' ridiculous
+ // bug (see: https://issues.apache.org/jira/browse/HDFS-925), this never gets closed.
+ // sorry. feel free to refcount it and hdfsDisconnect() it in ::close(). --JE
+ _fs = hdfsConnect("default", 0);
+ if (NULL == _fs) {
+ fprintf(stderr, "can't open HDFS");
+ noFs = true; // defer the jump to the failure path until the lock has been released
+ }
+ }
+ ReleaseExclusiveLock(&_staticLock);
+ if (noFs) {
+ goto fail;
+ }
+ switch (mode) {
+ case ReadOnly:
+ retval->_file = hdfsOpenFile(retval->_fs, filename, O_RDONLY, 0, 0, 0);
+ break;
+ case WriteOnly:
+ retval->_file = hdfsOpenFile(retval->_fs, filename, O_WRONLY | O_CREAT, 0, 0, 0);
+ break;
+ default:
+ fprintf(stderr, "GenericFile_HDFS::open::unknown file mode"); // _file is left untouched, so the check below is what rejects this case
+ break;
+ }
+ if (0 == retval->_file) {
+ fprintf(stderr, "couldn't open hdfs file");
+ goto fail;
+ }
+ return retval;
+ delete retval; // NOTE(review): presumably the target of the `goto fail` statements; the label line is not visible in this excerpt
+ return NULL;
+// for debugging; ignore
+static char *startingBytes(char *ptr, size_t totalRead) // hex-dump up to the first 10 bytes of ptr as "xx:xx:..."
+ static char buf[100]; // shared static buffer: the result is overwritten by the next call
+ for (size_t i = 0; i < min(totalRead, 10); i++)
+ sprintf(buf+i*3, "%02x:", (unsigned char) ptr[i]); // 3 characters per byte
+ return buf;
+size_t GenericFile_HDFS::_readSingleThreaded(void *ptr, size_t count) // loop over hdfsRead until count bytes, end of data, or an error; returns bytes read
+ size_t totalRead = 0;
+ while (true) {
+ // HDFS takes read arguments as signed 32-bit ints, so do smaller
+ // reads if we got a larger argument
+ size_t desiredRead = count - totalRead;
+ tSize readSize;
+ if (desiredRead > INT32_MAX) {
+ readSize = INT32_MAX;
+ } else {
+ readSize = (tSize) desiredRead;
+ }
+ // WriteErrorMessage("reading 0x%x from 0x%llx --> %x\n", readSize, hdfsTell(_fs, _file), (char *) ptr+totalRead);
+ tSize retval = hdfsRead(_fs, _file, ((char *) ptr) + totalRead, readSize);
+ if (retval < 0) {
+ perror("hdfsRead");
+ return totalRead; // on error, report what was read so far (a short count)
+ } else {
+ totalRead += retval;
+ if (retval == 0 || totalRead >= count) { // a zero-byte read ends the loop, as does a satisfied request
+ // WriteErrorMessage("...read at 0x%x, %ld bytes, starts with %s\n", ptr, totalRead, startingBytes((char *)ptr, totalRead));
+ return totalRead;
+ }
+ }
+ }
+/// Implementation of multi-threaded work queue for HDFS reads
+GenericFile_HDFS::_HdfsWorkQueue::_HdfsWorkQueue(GenericFile_HDFS *gFile, void *void_ptr, tOffset startOffset, size_t count) // split one large read into chunk-sized work items
+ InitializeExclusiveLock(&_workQueueLock);
+ _error = false;
+ this->_gFile = gFile;
+ // cast to char ptr necessary to do pointer math
+ char *ptr = (char *) void_ptr;
+ while (count > 0) {
+ long readSize = count >= _READ_CHUNK_SIZE ? _READ_CHUNK_SIZE : (long) count; // the last item may be shorter than a full chunk
+ _workItemList.push_back(new _HdfsWorkItem(ptr, startOffset, readSize)); // heap-allocated; the queue stores raw pointers
+ ptr += readSize;
+ startOffset += readSize;
+ count -= readSize;
+ };
+ std::random_shuffle(_workItemList.begin(), _workItemList.end()); // items are handed out in shuffled rather than file order
+size_t GenericFile_HDFS::_HdfsWorkQueue::size() // number of items still queued, read under the queue lock
+ AcquireExclusiveLock(&_workQueueLock);
+ size_t retval = _workItemList.size();
+ ReleaseExclusiveLock(&_workQueueLock);
+ return retval;
+GenericFile_HDFS::_HdfsWorkItem *GenericFile_HDFS::_HdfsWorkQueue::getOne() // pop one item, or NULL when the queue is empty or an error was signalled
+ _HdfsWorkItem *retval = NULL;
+ // If an error has been thrown, don't bother with the rest of the work
+ if (isErrorThrown()) { // NOTE(review): _error is read here without holding the lock - confirm this is acceptable
+ return NULL;
+ }
+ AcquireExclusiveLock(&_workQueueLock);
+ if (_workItemList.empty()) {
+ retval = NULL;
+ } else {
+ retval = _workItemList.back();
+ _workItemList.pop_back(); // the popped pointer is handed to the caller
+ }
+ ReleaseExclusiveLock(&_workQueueLock);
+ return retval;
+void GenericFile_HDFS::_HdfsWorkQueue::createNWaiter(size_t n)
+ _nWaiter = new NWaiter(n);
+void GenericFile_HDFS::_HdfsWorkQueue::signalThreadDone()
+ _nWaiter->signal();
+void GenericFile_HDFS::_HdfsWorkQueue::wait()
+ _nWaiter->wait();
+ if (!_workItemList.empty()) {
+ WriteErrorMessage("HDFS work queue not empty when destroyed! This is a bug. A grave one.");
+ exit(1);
+ }
+ DestroyExclusiveLock(&_workQueueLock);
+ delete _nWaiter;
+size_t GenericFile_HDFS::_readMultiThreaded(void *ptr, size_t count) // read count bytes from the current offset using several reader threads; returns count, or 0 on any error
+ _HdfsWorkQueue *workQueue = new _HdfsWorkQueue(this, ptr, hdfsTell(_fs, _file), count);
+ size_t numThreads = workQueue->size(); // one thread per chunk, capped below
+ if (numThreads > _MAX_READ_THREADS) {
+ numThreads = _MAX_READ_THREADS;
+ }
+ // WriteErrorMessage("Reading %lld bytes with %d threads (%lld work queue items)\n", count, numThreads, workQueue->size());
+ // Tell the work queue how many threads we're creating
+ workQueue->createNWaiter(numThreads);
+ for (size_t i = 0; i < numThreads; i++) {
+ StartNewThread(_hdfsReaderThread, workQueue);
+ }
+ // Wait for all threads to finish
+ workQueue->wait();
+ size_t retval;
+ if (workQueue->isErrorThrown()) {
+ retval = 0; // all-or-nothing: partial progress is not reported
+ } else {
+ retval = count;
+ }
+ delete workQueue;
+ return retval;
+void GenericFile_HDFS::_hdfsReaderThread(void *workQueueVoid)
+ _HdfsWorkQueue *workQueue = static_cast<_HdfsWorkQueue*>(workQueueVoid);
+ if (NULL == workQueue) {
+ WriteErrorMessage("HDFS reader thread didn't get a valid work queue!\n");
+ exit(-1); // can't even signal the thread is done
+ }
+ // we need a separate file structure so that we can seek independently
+ GenericFile_HDFS *localFile = GenericFile_HDFS::open(workQueue->getFile()->getFilename(), ReadOnly);
+ if (NULL == localFile) {
+ WriteErrorMessage("HDFS reader thread could not open file!\n");
+ workQueue->signalError();
+ goto done;
+ }
+ while (true) {
+ _HdfsWorkItem *nextItem = workQueue->getOne();
+ if (NULL == nextItem)
+ goto done;
+ localFile->seek(nextItem->startOffset);
+ // WriteErrorMessage("readerThread: reading 0x%x from %x --> %x\n", nextItem->count, nextItem->startOffset, nextItem->ptr);
+ size_t retval = localFile->_readSingleThreaded(nextItem->ptr, nextItem->count);
+ if (retval != nextItem->count) {
+ WriteErrorMessage("HDFS read error: starting at offset %lld, tried to read %lld, only got %lld\n",
+ nextItem->startOffset, nextItem->count, retval);
+ workQueue->signalError();
+ }
+ }
+ if (NULL != localFile) {
+ localFile->close();
+ delete localFile;
+ }
+ workQueue->signalThreadDone();
+size_t GenericFile_HDFS::read(void *ptr, size_t count) // small reads go straight to HDFS; reads larger than one chunk are split across threads
+ if (count <= _READ_CHUNK_SIZE || _MAX_READ_THREADS == 0) {
+ return _readSingleThreaded(ptr, count);
+ } else {
+ size_t retval = _readMultiThreaded(ptr, count);
+ // move the file pointer forward on the "main" file,
+ // because the multi threaded reader will open its own file pointers
+ advance(count); // NOTE(review): this also runs when retval is 0 (error) - confirm that is intended
+ return retval;
+ }
+// Ridiculously slow implementation of getchar: every byte costs a full read() call.
+// Returns the byte as a value in 0..255, or EOF if no byte could be read.
+int GenericFile_HDFS::getchar()
+    char buf[1];
+    if (1 == read(buf, sizeof(buf))) {
+        // Convert through unsigned char, as GenericFile_Blob::getchar does. Where plain char is
+        // signed, returning buf[0] directly would turn byte 0xFF into -1, which callers cannot
+        // tell apart from EOF, and would make every byte >= 0x80 negative.
+        return (int) (unsigned char) buf[0];
+    } else {
+        return EOF;
+    }
+// very slow because we're going all the way out to the JVM
+// for each character. We can buffer locally if perf hurts too much.
+char *GenericFile_HDFS::gets(char *buf, size_t count)
+ return _gets_impl(buf, count);
+int GenericFile_HDFS::advance(long long offset) // relative seek, implemented as tell + absolute seek
+ if (offset == 0)
+ return 0;
+ tOffset currOffset = hdfsTell(_fs, _file); // NOTE(review): the result is used without an error check - confirm hdfsTell cannot fail here
+ return hdfsSeek(_fs, _file, currOffset + offset);
+int GenericFile_HDFS::seek(long long offset) // absolute seek; returns hdfsSeek's result
+ return hdfsSeek(_fs, _file, offset);
+void GenericFile_HDFS::close() // flush (write mode only), then close the file handle; the shared _fs connection stays open
+ if (_mode == GenericFile::WriteOnly) {
+ if (hdfsFlush(_fs, _file)) {
+ fprintf(stderr, "Failed to flush %s!\n", _filename);
+ }
+ }
+ hdfsCloseFile(_fs, _file); // _file is not reset afterwards
+#endif // SNAP_HDFS
+#endif // SNAP_HDFS
diff --git a/SNAPLib/GenericFile_HDFS.h b/SNAPLib/GenericFile_HDFS.h
new file mode 100755
index 0000000..5aa203f
--- /dev/null
+++ b/SNAPLib/GenericFile_HDFS.h
@@ -0,0 +1,106 @@
+Module Name:
+ GenericFile_HDFS.h
+ Generic IO class for SNAP that can read from HDFS.
+ Jeremy Elson, February 2014
+ User mode service.
+Revision History:
+#pragma once
+#include "GenericFile.h"
+#include "Util.h"
+#include "hdfs.h"
+#include <vector>
+class GenericFile_HDFS : public GenericFile
+ static GenericFile_HDFS *open(const char *filename, Mode mode);
+ virtual size_t read(void *ptr, size_t count);
+ virtual int getchar();
+ virtual char *gets(char *buf, size_t count);
+ virtual int advance(long long offset);
+ int seek(long long offset); // absolute seek; not part of the GenericFile interface
+ virtual void close();
+ virtual ~GenericFile_HDFS();
+ // private constructor -- must use factory
+ GenericFile_HDFS();
+ // private methods and data
+ static int _initFlag; // exists so that _staticInit() runs during static initialization
+ static int _staticInit();
+ static ExclusiveLock _staticLock; // guards creation of the shared _fs connection
+ size_t _readMultiThreaded(void *ptr, size_t count);
+ size_t _readSingleThreaded(void *ptr, size_t count);
+ // this is static because of an apparent bug in the HDFS library
+ // that prevents clients from holding more than one handle. If you
+ // open two connections to the same filesystem, then close one,
+ // the other is also closed. Ugh. As a work-around we'll just keep
+ // one global.
+ // See: https://issues.apache.org/jira/browse/HDFS-925
+ static hdfsFS _fs;
+ hdfsFile _file;
+ class _HdfsWorkItem { // one chunk of a multi-threaded read
+ public:
+ void *ptr; // destination buffer for this chunk
+ tOffset startOffset; // file offset the chunk starts at
+ size_t count; // number of bytes to read
+ _HdfsWorkItem(void *ptrArg, tOffset startOffsetArg, size_t countArg)
+ {
+ this->ptr = ptrArg;
+ this->startOffset = startOffsetArg;
+ this->count = countArg;
+ }
+ };
+ class _HdfsWorkQueue { // lock-protected list of work items shared by the reader threads
+ public:
+ _HdfsWorkQueue(GenericFile_HDFS *gFile, void *ptr, tOffset startOffset, size_t count);
+ size_t size();
+ _HdfsWorkItem *getOne();
+ GenericFile_HDFS *getFile() { return _gFile; }
+ void createNWaiter(size_t n);
+ void wait();
+ void signalThreadDone();
+ void signalError() { _error = true; } // plain bool write, no lock taken
+ bool isErrorThrown() { return _error; }
+ ~_HdfsWorkQueue();
+ private:
+ GenericFile_HDFS *_gFile;
+ ExclusiveLock _workQueueLock;
+ NWaiter *_nWaiter;
+ std::vector<_HdfsWorkItem *> _workItemList;
+ bool _error;
+ };
+ static void _hdfsReaderThread(void *workQueueObject);
+ static const size_t _MAX_READ_THREADS = 32;
+ static const size_t _READ_CHUNK_SIZE = 16*1024*1024; // 16MB
diff --git a/SNAPLib/GenericFile_map.cpp b/SNAPLib/GenericFile_map.cpp
new file mode 100644
index 0000000..4a1b91b
--- /dev/null
+++ b/SNAPLib/GenericFile_map.cpp
@@ -0,0 +1,71 @@
+Module Name:
+Generic IO class for SNAP that can map an input file.
+Bill Bolosky, September, 2014
+User mode service.
+Revision History:
+#include "stdafx.h"
+#include "GenericFile_map.h"
+#include "Error.h"
+#include "exit.h"
+GenericFile_map *GenericFile_map::open(const char *filename)
+    // Map the whole of filename into memory and wrap the mapping in a GenericFile_map.
+    // Returns NULL if the file could not be mapped.
+    size_t fileSize = QueryFileSize(filename);
+    void *contents = NULL;
+    MemoryMappedFile *mappedFile = OpenMemoryMappedFile(filename, 0, fileSize, &contents);
+    if (NULL == mappedFile) {
+        // Do not build an object around a missing mapping and an unset contents pointer.
+        // NOTE(review): assumes OpenMemoryMappedFile reports failure by returning NULL - confirm.
+        return NULL;
+    }
+    return new GenericFile_map(mappedFile, contents, fileSize);
+GenericFile_map::GenericFile_map(MemoryMappedFile *i_mappedFile, void *i_contents, size_t i_fileSize) : mappedFile(i_mappedFile), contents((const char *)i_contents), fileSize(i_fileSize), GenericFile_Blob(i_contents, i_fileSize)
+ void
+ if (NULL != mappedFile) {
+ CloseMemoryMappedFile(mappedFile);
+ mappedFile = NULL;
+ GenericFile_Blob::close();
+ }
+ close();
+ _int64
+ AdviseMemoryMappedFilePrefetch(mappedFile);
+ int pageSize = getpagesize();
+ _int64 total = 0;
+ for (size_t offset = 0; offset < fileSize / sizeof (_int64); offset += 4 ) {
+ total += ((_int64 *)contents)[offset];
+ }
+ return total; // We're returning this just to keep the compiler from optimizing away the whole thing.
diff --git a/SNAPLib/GenericFile_map.h b/SNAPLib/GenericFile_map.h
new file mode 100644
index 0000000..3d0bbca
--- /dev/null
+++ b/SNAPLib/GenericFile_map.h
@@ -0,0 +1,43 @@
+Module Name:
+Generic IO class for SNAP that can map an input file.
+Bill Bolosky, September, 2014
+User mode service.
+Revision History:
+#pragma once
+#include "GenericFile_Blob.h"
+#include "Compat.h"
+class GenericFile_map : public GenericFile_Blob
+ static GenericFile_map *open(const char *filename); // maps the whole file and exposes it through the blob interface
+ virtual ~GenericFile_map();
+ virtual _int64 prefetch(); // Ignore the return value, it's just to trick the compiler into not optimizing it away.
+ virtual void close();
+ GenericFile_map(MemoryMappedFile *i_mappedFile, void *i_contents, size_t i_fileSize);
+ MemoryMappedFile *mappedFile; // handle returned by OpenMemoryMappedFile
+ const char *contents; // start of the mapped bytes
+ size_t fileSize;
diff --git a/SNAPLib/GenericFile_stdio.cpp b/SNAPLib/GenericFile_stdio.cpp
new file mode 100755
index 0000000..3e161fa
--- /dev/null
+++ b/SNAPLib/GenericFile_stdio.cpp
@@ -0,0 +1,94 @@
+Module Name:
+ GenericFile_stdio.cpp
+ Generic IO class for SNAP that can read from stdio.
+ Jeremy Elson, February 2014
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Compat.h"
+#include "GenericFile_stdio.h"
+#include "Error.h"
+ _file = NULL;
+GenericFile_stdio *GenericFile_stdio::open(const char *filename, Mode mode)
+    // Open filename with stdio in binary mode ("rb" for ReadOnly, "wb" for WriteOnly).
+    // Returns NULL if the mode is not recognized or fopen fails.
+    const char *fMode = NULL;
+    switch (mode) {
+    case ReadOnly:
+        fMode = "rb";
+        break;
+    case WriteOnly:
+        fMode = "wb";
+        break;
+    default:
+        // Unknown mode: refuse, rather than passing a NULL mode string to fopen.
+        return NULL;
+    }
+    // Allocate only after the mode is known to be valid.
+    GenericFile_stdio *retval = new GenericFile_stdio();
+    retval->_mode = mode;
+    retval->_file = fopen(filename, fMode);
+    if (retval->_file == NULL) {
+        delete retval;
+        return NULL;
+    }
+    return retval;
+GenericFile_stdio *GenericFile_stdio::open(const char *filename) // convenience overload: open for reading
+ return open(filename, ReadOnly);
+size_t GenericFile_stdio::read(void *ptr, size_t count)
+ return fread(ptr, 1, count, _file); // element size 1, so the result is a byte count
+int GenericFile_stdio::getchar()
+ return fgetc(_file);
+char *GenericFile_stdio::gets(char *buf, size_t count)
+ return fgets(buf, (int) count, _file); // count is narrowed to int for fgets
+int GenericFile_stdio::advance(long long offset)
+ return _fseek64bit(_file, offset, SEEK_CUR); // seek relative to the current position
+void GenericFile_stdio::close()
+ fclose(_file); // _file is not reset afterwards
diff --git a/SNAPLib/GenericFile_stdio.h b/SNAPLib/GenericFile_stdio.h
new file mode 100755
index 0000000..d9cc5fc
--- /dev/null
+++ b/SNAPLib/GenericFile_stdio.h
@@ -0,0 +1,44 @@
+Module Name:
+ GenericFile_stdio.h
+ Generic IO class for SNAP that can read from stdio.
+ Jeremy Elson, February 2014
+ User mode service.
+Revision History:
+#pragma once
+#include "GenericFile.h"
+// GenericFile implementation backed by a C stdio FILE*.
+class GenericFile_stdio : public GenericFile
+ // Factories: return NULL when the file cannot be opened.
+ static GenericFile_stdio *open(const char *filename, Mode mode);
+ static GenericFile_stdio *open(const char *filename); // no Mode means ReadOnly
+ // Thin forwards to fread / fgetc / fgets / a 64-bit relative seek on _file.
+ virtual size_t read(void *ptr, size_t count);
+ virtual int getchar();
+ virtual char *gets(char *buf, size_t count);
+ virtual int advance(long long offset);
+ virtual ~GenericFile_stdio();
+ virtual void close();
+ GenericFile_stdio();
+ FILE *_file; // The stdio stream; set by open().
diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
new file mode 100755
index 0000000..102cdd7
--- /dev/null
+++ b/SNAPLib/Genome.cpp
@@ -0,0 +1,492 @@
+Module Name:
+ Genome.cpp
+ Genome class for the SNAP sequencer
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#include "stdafx.h"
+#include "Genome.h"
+#include "GenericFile.h"
+#include "GenericFile_map.h"
+#include "Compat.h"
+#include "BigAlloc.h"
+#include "exit.h"
+#include "Error.h"
+// Builds an empty genome that can hold up to i_maxBases bases. Storage for nBasesStored bases plus
+// N_PADDING bytes of 'n' on either side is allocated up front, and the contig table starts with room
+// for i_maxContigs entries.
+Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs)
+: maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs),
+ mappedFile(NULL)
+ // Test the raw allocation before offsetting it: once N_PADDING has been added the pointer can no
+ // longer compare equal to NULL, so a failed allocation would slip through.
+ char *allocation = (char *) BigAlloc(nBasesStored + 2 * N_PADDING);
+ if (NULL == allocation) {
+ WriteErrorMessage("Genome: unable to allocate memory for %lld bases\n", GenomeLocationAsInt64(maxBases));
+ soft_exit(1);
+ }
+ bases = allocation + N_PADDING;
+ // Add N's for the N_PADDING bases before and after the genome itself
+ memset(bases - N_PADDING, 'n', N_PADDING);
+ memset(bases + nBasesStored, 'n', N_PADDING);
+ nBases = 0;
+ nContigs = 0;
+ contigs = new Contig[maxContigs];
+ contigsByName = NULL;
+ void
+Genome::addData(const char *data, GenomeDistance len)
+ if (nBases + len > GenomeLocationAsInt64(maxBases)) {
+ WriteErrorMessage("Tried to write beyond allocated genome size (or tried to write into a genome that was loaded from a file).\n"
+ "Size = %lld\n", GenomeLocationAsInt64(maxBases));
+ soft_exit(1);
+ }
+ memcpy(bases + nBases,data,len);
+ nBases += (unsigned)len;
+ // Appends a NUL-terminated string of bases; the length is taken with strlen.
+ void
+Genome::addData(const char *data)
+ addData(data, strlen(data));
+ // Starts a new contig at the current end of the genome (nBases) and records a private copy of its
+ // name. The contig table is grown when it is full.
+ void
+Genome::startContig(const char *contigName)
+ if (nContigs == maxContigs) {
+ //
+ // Reallocate (maybe we're sequencing a tree that's got lots of chromosomes).
+ // Doubling a capacity of zero would stay at zero and the write below would run off the end
+ // of the array, so grow to at least one entry.
+ //
+ int newMaxContigs = (maxContigs > 0) ? maxContigs * 2 : 1;
+ Contig *newContigs = new Contig[newMaxContigs];
+ if (NULL == newContigs) {
+ WriteErrorMessage("Genome: unable to reallocate contig array to size %d\n", newMaxContigs);
+ soft_exit(1);
+ }
+ for (int i = 0; i < nContigs; i++) {
+ newContigs[i] = contigs[i];
+ }
+ delete [] contigs;
+ contigs = newContigs;
+ maxContigs = newMaxContigs;
+ }
+ contigs[nContigs].beginningLocation = nBases;
+ size_t len = strlen(contigName) + 1; // includes the terminating NUL
+ contigs[nContigs].name = new char[len];
+ contigs[nContigs].nameLength = (unsigned)len-1;
+ strncpy(contigs[nContigs].name,contigName,len);
+ contigs[nContigs].name[len-1] = '\0';
+ nContigs++;
+ // NOTE(review): these statements look like the body of Genome::~Genome(); the signature line is not present here.
+ // Releases the base buffer (including its leading padding), every contig name, both contig arrays
+ // and, when set, the mapped file.
+ // NOTE(review): loadFromFile(map == true) replaces `bases` with the pointer returned by mapAndAdvance,
+ // so in that case this call receives an address BigAlloc did not hand out -- confirm and guard.
+ BigDealloc(bases - N_PADDING);
+ for (int i = 0; i < nContigs; i++) {
+ delete [] contigs[i].name;
+ contigs[i].name = NULL;
+ }
+ delete [] contigs;
+ // Only the array is freed here: sortContigsByName fills it with a memcpy of contigs, so the name
+ // pointers inside it are the ones already deleted above.
+ if (contigsByName) {
+ delete [] contigsByName;
+ }
+ contigs = NULL;
+ if (NULL != mappedFile) {
+ mappedFile->close();
+ delete mappedFile;
+ }
+ bool
+Genome::saveToFile(const char *fileName) const
+ //
+ // Save file format is (in binary) the number of bases, the number of contigs, followed by
+ // the contigs themselves, rounded up to 4K, followed by the bases.
+ //
+ FILE *saveFile = fopen(fileName,"wb");
+ if (saveFile == NULL) {
+ WriteErrorMessage("Genome::saveToFile: unable to open file '%s'\n",fileName);
+ return false;
+ }
+ fprintf(saveFile,"%lld %d\n",nBases, nContigs);
+ char *curChar = NULL;
+ for (int i = 0; i < nContigs; i++) {
+ for (int n = 0; n < strlen(contigs[i].name); n++){
+ curChar = contigs[i].name + n;
+ if (*curChar == ' '){ *curChar = '_'; }
+ }
+ fprintf(saveFile,"%lld %s\n",contigs[i].beginningLocation, contigs[i].name);
+ }
+ //
+ // Write it out in (big) chunks. For whatever reason, fwrite with really big sizes seems not to
+ // work as well as one would like.
+ //
+ const size_t max_chunk_size = 1 * 1024 * 1024 * 1024; // 1 GB (or GiB for the obsessively precise)
+ size_t bases_to_write = nBases;
+ size_t bases_written = 0;
+ while (bases_to_write > 0) {
+ size_t bases_this_write = __min(bases_to_write, max_chunk_size);
+ if (bases_this_write != fwrite(bases + bases_written, 1, bases_this_write, saveFile)) {
+ WriteErrorMessage("Genome::saveToFile: fwrite failed\n");
+ fclose(saveFile);
+ return false;
+ }
+ bases_to_write -= bases_this_write;
+ bases_written += bases_this_write;
+ }
+ _ASSERT(bases_written == nBases);
+ fclose(saveFile);
+ return true;
+ // Loads a genome previously written by saveToFile. `minLocation` and `length` select a slice of
+ // the stored bases (length == 0 means "through the end"); with `map` set the bases are taken from a
+ // memory mapping instead of being read into the allocated buffer. Returns NULL on failure.
+ const Genome *
+Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLocation minLocation, GenomeDistance length, bool map)
+ GenericFile *loadFile;
+ GenomeDistance nBases;
+ unsigned nContigs;
+ if (!openFileAndGetSizes(fileName, &loadFile, &nBases, &nContigs, map)) {
+ //
+ // It already printed an error. Just fail.
+ //
+ return NULL;
+ }
+ //
+ // Validate the requested start before it is used to size anything; otherwise the length computed
+ // below would be negative and be handed to the allocator.
+ //
+ if (GenomeLocationAsInt64(minLocation) >= nBases) {
+ WriteErrorMessage("Genome::loadFromFile: specified minOffset %lld >= nBases %lld\n", GenomeLocationAsInt64(minLocation), nBases);
+ soft_exit(-1);
+ }
+ GenomeLocation maxLocation(nBases);
+ if (0 == length) {
+ length = maxLocation - minLocation;
+ } else {
+ //
+ // Don't let length go beyond nBases.
+ //
+ length = __min(length, maxLocation - minLocation);
+ maxLocation = minLocation + length;
+ }
+ Genome *genome = new Genome(nBases, length, chromosomePadding);
+ genome->nBases = nBases;
+ genome->nContigs = genome->maxContigs = nContigs;
+ // The constructor already allocated a default-sized contig array; release it before replacing it.
+ delete [] genome->contigs;
+ genome->contigs = new Contig[nContigs];
+ genome->minLocation = minLocation;
+ genome->maxLocation = maxLocation;
+ static const unsigned contigNameBufferSize = 512;
+ char contigNameBuffer[contigNameBufferSize];
+ unsigned n;
+ size_t contigSize;
+ char *curName;
+ for (unsigned i = 0; i < nContigs; i++) {
+ if (NULL == loadFile->gets(contigNameBuffer, contigNameBufferSize)){
+ WriteErrorMessage("Unable to read contig description\n");
+ loadFile->close();
+ delete loadFile;
+ delete genome;
+ return NULL;
+ }
+ //
+ // Each line is "<start> <name>\n". Find the separating space, but stop at the end of the string
+ // so stale bytes from an earlier, longer line are never examined.
+ //
+ for (n = 0; n < contigNameBufferSize && contigNameBuffer[n] != '\0' && contigNameBuffer[n] != ' '; n++) {
+ }
+ if (n >= contigNameBufferSize || contigNameBuffer[n] != ' ') {
+ WriteErrorMessage("Unable to parse contig description in genome file '%s', '%s'\n", fileName, contigNameBuffer);
+ soft_exit(1);
+ }
+ contigNameBuffer[n] = '\0'; // terminate the number so sscanf sees only the start offset
+ _int64 contigStart;
+ if (1 != sscanf(contigNameBuffer, "%lld", &contigStart)) {
+ WriteErrorMessage("Unable to parse contig start in genome file '%s', '%s'\n", fileName, contigNameBuffer);
+ soft_exit(1);
+ }
+ genome->contigs[i].beginningLocation = GenomeLocation(contigStart);
+ contigNameBuffer[n] = ' ';
+ n++; // increment n so we start copying at the position after the space
+ contigSize = strlen(contigNameBuffer + n);
+ if (contigSize > 0 && contigNameBuffer[n + contigSize - 1] == '\n') {
+ contigSize--; // don't include the final \n
+ }
+ genome->contigs[i].name = new char[contigSize + 1];
+ genome->contigs[i].nameLength = (unsigned)contigSize;
+ curName = genome->contigs[i].name;
+ for (unsigned pos = 0; pos < contigSize; pos++) {
+ curName[pos] = contigNameBuffer[pos + n];
+ }
+ curName[contigSize] = '\0';
+ }
+ if (0 != loadFile->advance(GenomeLocationAsInt64(minLocation))) {
+ WriteErrorMessage("Genome::loadFromFile: _fseek64bit failed\n");
+ soft_exit(1);
+ }
+ size_t readSize;
+ if (map) {
+ GenericFile_map *mappedFile = (GenericFile_map *)loadFile;
+ // NOTE(review): this overwrites the buffer pointer set up by the constructor without freeing it,
+ // and the destructor later passes the mapped pointer to BigDealloc -- confirm the intended ownership.
+ genome->bases = (char *)mappedFile->mapAndAdvance(length, &readSize);
+ genome->mappedFile = mappedFile;
+ mappedFile->prefetch();
+ // The genome owns the mapping from here on (it is closed and deleted together with the genome),
+ // so drop the local handle to avoid deleting it a second time on the failure path below.
+ loadFile = NULL;
+ } else {
+ readSize = loadFile->read(genome->bases, length);
+ loadFile->close();
+ delete loadFile;
+ loadFile = NULL;
+ }
+ if (length != readSize) {
+ WriteErrorMessage("Genome::loadFromFile: fread of bases failed; wanted %lld, got %lld\n", length, (_int64)readSize);
+ delete genome;
+ return NULL;
+ }
+ genome->fillInContigLengths();
+ genome->sortContigsByName();
+ return genome;
+ // NOTE(review): the function-name line is missing here; this is presumably the `contigComparator`
+ // that sortContigsByName hands to std::sort. Returns true when a's name sorts before b's (strcmp order).
+ bool
+ const Genome::Contig& a,
+ const Genome::Contig& b)
+ return strcmp(a.name, b.name) < 0;
+ // NOTE(review): the signature line is missing here; presumably Genome::sortContigsByName().
+ // Rebuilds contigsByName as a copy of contigs ordered by contigComparator.
+ void
+ if (contigsByName) {
+ delete [] contigsByName;
+ }
+ contigsByName = new Contig[nContigs];
+ // Bitwise copy: each entry's name pointer is shared with the matching entry in contigs, not duplicated.
+ memcpy(contigsByName, contigs, nContigs * sizeof(Contig));
+ std::sort(contigsByName, contigsByName + nContigs, contigComparator);
+ // Opens a saved genome (memory-mapped when `map` is set) and parses the first line,
+ // "<nBases> <nContigs>". On success *file is left open just past that line and true is returned;
+ // on failure an error is printed, any opened file is closed and deleted, and false is returned.
+ bool
+Genome::openFileAndGetSizes(const char *filename, GenericFile **file, GenomeDistance *nBases, unsigned *nContigs, bool map)
+ if (map) {
+ *file = GenericFile_map::open(filename);
+ } else {
+ *file = GenericFile::open(filename, GenericFile::ReadOnly);
+ }
+ if (*file == NULL) {
+ WriteErrorMessage("Genome::openFileAndGetSizes: unable to open file '%s'\n",filename);
+ return false;
+ }
+ char linebuf[2000];
+ char *retval = (*file)->gets(linebuf, sizeof(linebuf));
+ // nContigs is an unsigned *, so it must be scanned with %u rather than %d.
+ if (NULL == retval || 2 != sscanf(linebuf,"%lld %u\n", nBases, nContigs)) {
+ (*file)->close();
+ delete *file;
+ *file = NULL;
+ WriteErrorMessage("Genome::openFileAndGetSizes: unable to read header\n");
+ return false;
+ }
+ return true;
+ // Reads only the size header of a saved genome. Either output pointer may be NULL when the caller
+ // does not need that value. Returns false if the header could not be read.
+ bool
+Genome::getSizeFromFile(const char *fileName, GenomeDistance *nBases, unsigned *nContigs)
+ // Scratch slots that receive the values the caller did not ask for.
+ GenomeDistance scratchBases;
+ unsigned scratchContigs;
+ GenomeDistance *basesOut = nBases ? nBases : &scratchBases;
+ unsigned *contigsOut = nContigs ? nContigs : &scratchContigs;
+ GenericFile *sizeFile;
+ if (openFileAndGetSizes(fileName, &sizeFile, basesOut, contigsOut, false)) {
+ sizeFile->close();
+ delete sizeFile;
+ return true;
+ }
+ return false;
+ // Looks a contig up by name. On success stores its starting location in *location and its position
+ // within the contigs array in *index (either pointer may be NULL) and returns true; returns false
+ // when no contig has that name. Uses binary search over contigsByName when it has been built,
+ // otherwise a linear scan of contigs.
+ bool
+Genome::getLocationOfContig(const char *contigName, GenomeLocation *location, int * index) const
+ if (contigsByName) {
+ int low = 0;
+ int high = nContigs - 1;
+ while (low <= high) {
+ int mid = (low + high) / 2;
+ int c = strcmp(contigsByName[mid].name, contigName);
+ if (c == 0) {
+ if (location != NULL) {
+ *location = contigsByName[mid].beginningLocation;
+ }
+ if (index != NULL) {
+ // mid is a position in the name-sorted copy, not in contigs. Translate it so this path
+ // reports the same kind of index as the linear scan below.
+ *index = getContigNumAtLocation(contigsByName[mid].beginningLocation);
+ }
+ return true;
+ } else if (c < 0) {
+ low = mid + 1;
+ } else {
+ high = mid - 1;
+ }
+ }
+ return false;
+ }
+ for (int i = 0; i < nContigs; i++) {
+ if (!strcmp(contigName,contigs[i].name)) {
+ if (NULL != location) {
+ *location = contigs[i].beginningLocation;
+ }
+ if (index != NULL) {
+ *index = i;
+ }
+ return true;
+ }
+ }
+ return false;
+ // Binary-searches contigs (ordered by beginningLocation) for the contig whose range contains
+ // `location`: the last contig starting at or before it. Returns NULL if there is none.
+ const Genome::Contig *
+Genome::getContigAtLocation(GenomeLocation location) const
+ _ASSERT(location < nBases);
+ int lo = 0;
+ int hi = nContigs - 1;
+ while (lo <= hi) {
+ int probe = (lo + hi) / 2;
+ if (contigs[probe].beginningLocation > location) {
+ // Starts after the location: the answer lies to the left.
+ hi = probe - 1;
+ continue;
+ }
+ if (probe == nContigs-1 || contigs[probe+1].beginningLocation > location) {
+ return &contigs[probe];
+ }
+ // The following contig also starts at or before the location: keep looking to the right.
+ lo = probe + 1;
+ }
+ return NULL; // Should not be reached
+ // Returns the index within contigs of the contig containing `location`, or -1 when
+ // getContigAtLocation finds none.
+ int
+Genome::getContigNumAtLocation(GenomeLocation location) const
+ const Contig *contig = getContigAtLocation(location);
+ if (NULL == contig) {
+ // Subtracting from a NULL pointer would produce a meaningless index.
+ return -1;
+ }
+ return (int)(contig - contigs);
+ // Returns the contig that follows the one containing `location`. A location before the first
+ // contig yields the first contig; a location inside the last contig yields NULL.
+ const Genome::Contig *
+Genome::getNextContigAfterLocation(GenomeLocation location) const
+ _ASSERT(location < nBases);
+ if (nContigs > 0 && location < contigs[0].beginningLocation) {
+ return &contigs[0];
+ }
+ int lo = 0;
+ int hi = nContigs - 1;
+ while (lo <= hi) {
+ int probe = (lo + hi) / 2;
+ if (contigs[probe].beginningLocation > location) {
+ hi = probe - 1;
+ continue;
+ }
+ bool probeIsLast = (probe == nContigs-1);
+ if (!probeIsLast && contigs[probe+1].beginningLocation <= location) {
+ lo = probe + 1;
+ continue;
+ }
+ // probe is the contig holding the location.
+ if (probeIsLast) {
+ //
+ // This location landed in the last contig, so return NULL for the next one.
+ //
+ return NULL;
+ }
+ return &contigs[probe+1];
+ }
+ return NULL; // Should not be reached
+// Absolute distance between two genome locations, whichever order they are given in.
+GenomeDistance DistanceBetweenGenomeLocations(GenomeLocation locationA, GenomeLocation locationB)
+ return (locationA > locationB) ? (locationA - locationB) : (locationB - locationA);
+// Derives every contig's length from the start of the contig after it; the final contig runs to nBases.
+void Genome::fillInContigLengths()
+ if (nContigs == 0) return;
+ const int last = nContigs - 1;
+ for (int i = 0; i < last; i++) {
+ contigs[i].length = contigs[i+1].beginningLocation - contigs[i].beginningLocation;
+ }
+ contigs[last].length = nBases - GenomeLocationAsInt64(contigs[last].beginningLocation);
+// Picks the contig a read of `readLength` bases aligned at `location` belongs to. When the read
+// starts before a contig and spills into it, the following contig is returned and
+// *extraBasesClippedBefore receives how many leading bases fall outside it; otherwise that output
+// is 0. Returns NULL (with the output set to 0) if no following contig exists.
+const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const
+ const Contig *contig = getContigAtLocation(location);
+ //
+ // Sometimes, a read aligns before the beginning of a chromosome (imagine prepending a few bases to the read).
+ // In that case, we want to handle it by soft-clipping the bases off of the beginning of the read. We detect it
+ // here by looking to see if the aligned location plus the read length crosses a contig boundary. It also might
+ // happen that it is aligned before the first contig, in which case contig will be NULL.
+ //
+ if (NULL == contig || location + readLength > contig->beginningLocation + contig->length) {
+ //
+ // We should never align over the end of a chromosome, only before the beginning. So move this into the next
+ // chromosome.
+ //
+ contig = getNextContigAfterLocation(location);
+ _ASSERT(NULL != contig);
+ if (NULL == contig) {
+ // getNextContigAfterLocation returns NULL for a location in the last contig. The assert above
+ // may not be active in every build, so don't dereference the missing contig.
+ *extraBasesClippedBefore = 0;
+ return NULL;
+ }
+ _ASSERT(contig->beginningLocation > location && contig->beginningLocation < location + readLength);
+ *extraBasesClippedBefore = contig->beginningLocation - location;
+ } else {
+ *extraBasesClippedBefore = 0;
+ }
+ return contig;
+GenomeLocation InvalidGenomeLocation; // Gets set on genome build/load
\ No newline at end of file
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
new file mode 100644
index 0000000..84e94f5
--- /dev/null
+++ b/SNAPLib/Genome.h
@@ -0,0 +1,309 @@
+Module Name:
+ Genome.h
+ Genome class for the SNAP sequencer
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#include "Compat.h"
+#include "GenericFile.h"
+#include "GenericFile_map.h"
+// We have two different classes to represent a place in a genome and a distance between places in a genome.
+// In reality, they're both just 64 bit ints, but the classes are set up to encourage the user to keep
+// in mind the difference. So, a genome location might be something
+// like "chromosome 12, base 12345" which would be represented in (0-based) genome coordinates as some
+// 64 bit int that's the base of chromosome 12 + 12344 (one less because we're 0-based and the nomenclature
+// uses 1-based).
+// In the non-debug build, GenomeLocation is just defined as an _int64, so that no matter how dumb the compiler
+// can't possibly screw it up. However, in the debug build you get the happy type checking.
+typedef _int64 GenomeDistance;
+#ifdef _DEBUG
+// Debug-build wrapper around a single _int64 location. It offers construction from and assignment
+// of raw integers, increment/decrement, the six comparisons against another GenomeLocation, and
+// arithmetic in which location +/- distance yields a location while location - location yields a
+// GenomeDistance.
+class GenomeLocation {
+ GenomeLocation(_int64 i_location) : location(i_location) {}
+ GenomeLocation(const GenomeLocation &peer) : location(peer.location) {}
+ GenomeLocation() {location = -1;} // default-constructed locations hold -1
+ // Both assignment operators return a copy of *this rather than a reference.
+ inline GenomeLocation operator=(const GenomeLocation &peer) {
+ location = peer.location;
+ return *this;
+ }
+ inline GenomeLocation operator=(const _int64 value) {
+ location = value;
+ return *this;
+ }
+ // Prefix forms: modify, then return the new value (by copy).
+ inline GenomeLocation operator++() {
+ location++;
+ return *this;
+ }
+ inline GenomeLocation operator--() {
+ location--;
+ return *this;
+ }
+ // The postfix versions
+ // They modify first and then rebuild the previous value arithmetically for the return.
+ inline GenomeLocation operator++(int foo) {
+ location++;
+ return *this - 1;
+ }
+ inline GenomeLocation operator--(int foo) {
+ location--;
+ return *this + 1;
+ }
+ inline bool operator==(const GenomeLocation &peer) const {
+ return location == peer.location;
+ }
+ inline bool operator>=(const GenomeLocation &peer) const {
+ return location >= peer.location;
+ }
+ inline bool operator>(const GenomeLocation &peer) const {
+ return location > peer.location;
+ }
+ inline bool operator<=(const GenomeLocation &peer) const {
+ return location <= peer.location;
+ }
+ inline bool operator<(const GenomeLocation &peer) const {
+ return location < peer.location;
+ }
+ inline bool operator!=(const GenomeLocation &peer) const {
+ return location != peer.location;
+ }
+ // location + distance -> location
+ inline GenomeLocation operator+(const GenomeDistance distance) const {
+ GenomeLocation retVal(location + distance);
+ return retVal;
+ }
+ // location - location -> distance
+ inline GenomeDistance operator-(const GenomeLocation &otherLoc) const {
+ return location - otherLoc.location;
+ }
+ // location - distance -> location (the integer result converts through the _int64 constructor)
+ inline GenomeLocation operator-(const GenomeDistance distance) const {
+ return location - distance;
+ }
+ inline GenomeLocation operator+=(const GenomeDistance distance) {
+ location += distance;
+ return *this;
+ }
+ inline GenomeLocation operator-=(const GenomeDistance distance) {
+ location -= distance;
+ return *this;
+ }
+ _int64 location; // the wrapped value; read directly by GenomeLocationAsInt64/GenomeLocationAsInt32
+// Debug-build accessors that unwrap a GenomeLocation into a plain integer.
+inline _int64 GenomeLocationAsInt64(GenomeLocation genomeLocation) {
+ return genomeLocation.location;
+// 32-bit variant: asserts that the value lies in [0, 0xffffffff] before narrowing it to unsigned.
+inline unsigned GenomeLocationAsInt32(GenomeLocation genomeLocation) {
+ _ASSERT(genomeLocation.location <= 0xffffffff && genomeLocation.location >= 0);
+ return (unsigned)genomeLocation.location;
+#else // _DEBUG
+typedef _int64 GenomeLocation;
+// Non-debug accessors: GenomeLocation is already an _int64 here, so the 64-bit form returns its argument unchanged.
+inline _int64 GenomeLocationAsInt64(GenomeLocation genomeLocation)
+ return genomeLocation;
+// 32-bit variant: narrows to unsigned; the range check is the _ASSERT below.
+inline unsigned GenomeLocationAsInt32(GenomeLocation genomeLocation) {
+ _ASSERT(genomeLocation <= 0xffffffff && genomeLocation>= 0); // One might wonder about the value of an _ASSERT in code that's only non-_DEBUG. Think of it as an uppity comment. :-)
+ return (unsigned)genomeLocation;
+#endif // _DEBUG
+typedef _int64 GenomeDistance;
+extern GenomeLocation InvalidGenomeLocation;
+class Genome {
+ //
+ // Methods for building a genome.
+ //
+ //
+ // Create a new genome. It's got a limit on the number of bases, but there's no need to
+ // store that many, it's just an upper bound. It will, of course, use memory proportional
+ // to the bound.
+ //
+ Genome(
+ GenomeDistance i_maxBases,
+ GenomeDistance nBasesStored,
+ unsigned i_chromosomePadding,
+ unsigned maxContigs = 32);
+ void startContig(
+ const char *contigName);
+ void addData(
+ const char *data);
+ void addData(const char *data, GenomeDistance len);
+ const unsigned getChromosomePadding() const {return chromosomePadding;}
+ ~Genome();
+ //
+ // Methods for loading a genome from a file, and saving one to a file. When you save and
+ // then load a genome the space used is reduced from the max that was reserved when it was
+ // first created to the amount actually used (rounded up to a page size). However, saved
+ // and loaded genomes can't be added to, they're read only.
+ //
+ // minOffset and length are used to read in only a part of a whole genome.
+ //
+ static const Genome *loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLocation i_minLocation = 0, GenomeDistance length = 0, bool map = false);
+ // This loads from a genome save
+ // file, not a FASTA file. Use
+ // FASTA.h for FASTA loads.
+ static bool getSizeFromFile(const char *fileName, GenomeDistance *nBases, unsigned *nContigs);
+ bool saveToFile(const char *fileName) const;
+ //
+ // Methods to read the genome.
+ //
+ // Returns a pointer to `lengthNeeded` reference bases starting at `location`, or NULL when the
+ // request is out of range, falls outside any contig, or runs up to or past the end of its contig.
+ // `bases` holds the genome starting at minLocation, so every access is made relative to it.
+ inline const char *getSubstring(GenomeLocation location, GenomeDistance lengthNeeded) const {
+ if (location > nBases || location + lengthNeeded > nBases + N_PADDING) {
+ // The first part of the test is for the unsigned version of a negative offset.
+ return NULL;
+ }
+ // If we're in the padding, then the base will be an n, and we can't short circuit. Recall that we use lower case n in the reference so it won't match with N in the read.
+ // The probe must use the same minLocation-relative offset as the returned pointer; indexing by the
+ // absolute location looks at the wrong base whenever only a slice of the genome is loaded.
+ if (lengthNeeded <= chromosomePadding && location >= minLocation && bases[location - minLocation] != 'n') {
+ return bases + (location - minLocation);
+ }
+ _ASSERT(location >= minLocation && location + lengthNeeded <= maxLocation + N_PADDING); // If the caller asks for a genome slice, it's only legal to look within it.
+ if (lengthNeeded == 0) {
+ return bases + (location - minLocation);
+ }
+ const Contig *contig = getContigAtLocation(location);
+ if (NULL == contig) {
+ return NULL;
+ }
+ _ASSERT(contig->beginningLocation <= location && contig->beginningLocation + contig->length >= location);
+ if (contig->beginningLocation + contig->length <= location + lengthNeeded) {
+ return NULL;
+ }
+ return bases + (location - minLocation);
+ }
+ inline GenomeDistance getCountOfBases() const {return nBases;}
+ bool getLocationOfContig(const char *contigName, GenomeLocation *location, int* index = NULL) const;
+ // Issues cache prefetch hints for the bases at `genomeLocation` and 64 bytes beyond. The address is
+ // computed relative to minLocation, matching how getSubstring addresses `bases`.
+ inline void prefetchData(GenomeLocation genomeLocation) const {
+ _mm_prefetch(bases + (genomeLocation - minLocation), _MM_HINT_T2);
+ _mm_prefetch(bases + (genomeLocation - minLocation) + 64, _MM_HINT_T2);
+ }
+ // One named stretch of the genome (typically a chromosome).
+ struct Contig {
+ Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL) {}
+ GenomeLocation beginningLocation; // where the contig starts in genome coordinates
+ GenomeDistance length; // filled in by fillInContigLengths()
+ unsigned nameLength; // strlen(name), i.e. without the terminating NUL
+ char *name; // heap-allocated, NUL-terminated
+ };
+ inline const Contig *getContigs() const { return contigs; }
+ inline int getNumContigs() const { return nContigs; }
+ const Contig *getContigAtLocation(GenomeLocation location) const;
+ const Contig *getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const;
+ const Contig *getNextContigAfterLocation(GenomeLocation location) const;
+ int getContigNumAtLocation(GenomeLocation location) const; // Returns the contig number, which runs from 0 .. getNumContigs() - 1.
+// unused Genome *copy() const {return copy(true,true,true);}
+// unused Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);}
+ //
+ // These are only public so creators of new genomes (i.e., FASTA) can use them.
+ //
+ void fillInContigLengths();
+ void sortContigsByName();
+ static const int N_PADDING = 100; // Padding to add on either end of the genome to allow substring reads past it
+ //
+ // The actual genome.
+ char *bases; // Will point to offset N_PADDING in an array of nBases + 2 * N_PADDING
+ GenomeDistance nBases; // bases added so far (or, after a load, the count read from the file header)
+ GenomeLocation maxBases; // capacity passed to the constructor; addData refuses to go past it
+ GenomeLocation minLocation; // first genome location held in `bases` (0 unless a slice was loaded)
+ GenomeLocation maxLocation; // end of the range held in `bases`
+ //
+ // A genome is made up of a bunch of contigs, typically chromosomes. Contigs have names,
+ // which are stored here.
+ //
+ int nContigs;
+ int maxContigs; // allocated size of `contigs`
+ Contig *contigs; // This is always in order (it's not possible to express it otherwise in FASTA).
+ Contig *contigsByName; // NULL until sortContigsByName() builds it; getLocationOfContig binary-searches it by name
+ Genome *copy(bool copyX, bool copyY, bool copyM) const;
+ static bool openFileAndGetSizes(const char *filename, GenericFile **file, GenomeDistance *nBases, unsigned *nContigs, bool map);
+ const unsigned chromosomePadding;
+ GenericFile_map *mappedFile; // set only by loadFromFile(map == true); NULL otherwise
+GenomeDistance DistanceBetweenGenomeLocations(GenomeLocation locationA, GenomeLocation locationB);
+// True when the two locations are no more than `distance` apart, in either order.
+inline bool genomeLocationIsWithin(GenomeLocation locationA, GenomeLocation locationB, GenomeDistance distance)
+ return DistanceBetweenGenomeLocations(locationA, locationB) <= distance;
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
new file mode 100644
index 0000000..3bfe61c
--- /dev/null
+++ b/SNAPLib/GenomeIndex.cpp
@@ -0,0 +1,2067 @@
+Module Name:
+ GenomeIndex.cpp
+ Index (hash table) builder for the SNAP sequencer
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#include "stdafx.h"
+#include "ApproximateCounter.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "FASTA.h"
+#include "FixedSizeSet.h"
+#include "FixedSizeVector.h"
+#include "GenericFile.h"
+#include "GenericFile_stdio.h"
+#include "Genome.h"
+#include "GenomeIndex.h"
+#include "HashTable.h"
+#include "Seed.h"
+#include "exit.h"
+#include "Error.h"
+#include "directions.h"
+using namespace std;
+static const int DEFAULT_SEED_SIZE = 20;
+static const double DEFAULT_SLACK = 0.3;
+static const unsigned DEFAULT_PADDING = 500;
+static const unsigned DEFAULT_KEY_BYTES = 4;
+static const unsigned DEFAULT_LOCATION_SIZE = 4;
+static void usage()
+ WriteErrorMessage(
+ "Usage: snap index <input.fa> <output-dir> [<options>]\n"
+ "Options:\n"
+ " -s Seed size (default: %d)\n"
+ " -h Hash table slack (default: %.1f)\n"
+ " -hg19 Use pre-computed table bias for hg19, which results in better speed, balance, and a smaller index, but only works for the complete human reference.\n"
+ " -Ofactor This parameter is deprecated and will be ignored.\n"
+ " -tMaxThreads Specify the maximum number of threads to use. Default is the number of cores.\n"
+ " -B<chars> Specify characters to use as chromosome name terminators in the FASTA header line; these characters and anything after are\n"
+ " not part of the chromosome name. You must specify all characters on a single -B switch. So, for example, with -B_|,\n"
+ " the FASTA header line '>chr1|Chromosome 1' would generate a chromosome named 'chr1'. There's a separate flag for\n"
+ " indicating that a space is a terminator.\n"
+ " -bSpace Indicates that the space character is a terminator for chromosome names (see -B above). This may be used in addition\n"
+ " to other terminators specified by -B. -B and -bSpace are case sensitive.\n"
+ " -pPadding Specify the number of Ns to put as padding between chromosomes. This must be as large as the largest\n"
+ " edit distance you'll ever use, and there's a performance advantage to have it be bigger than any\n"
+ " read you'll process. Default is %d\n"
+ " -HHistogramFile Build a histogram of seed popularity. This is just for information, it's not used by SNAP.\n"
+ " -exact Compute hash table sizes exactly. This will slow down index build, but usually will result in smaller indices.\n"
+ " -keysize The number of bytes to use for the hash table key. Larger values increase SNAP's memory footprint, but allow larger seeds. Default: %d\n"
+ " -large Build a larger index that's a little faster, particualrly for runs with quick/inaccurate parameters. Increases index size by\n"
+ " about 30%%, depending on the other index parameters and the contents of the reference genome\n"
+ " -locationSize The size of the genome locations stored in the index. This can be from 4 to 8 bytes. The locations need to be big enough\n"
+ " not only to index the genome, but also to allow some space for representing seeds that occur multiple times. For the\n"
+ " human genome, it will fit with four byte locations if the seed size is 19 or larger, but needs 5 (or more) for smaller\n"
+ " seeds. Making the location size bigger than necessary will just waste (lots of) space, so unless you're doing something\n"
+ " quite unusual, the right answer is 4 or 5. Default is %d\n"
+ " -sm Use a temp file to work better in smaller memory. This only helps a little, but can be the difference if you're close.\n"
+ " In particular, this will generally use less memory than the index will use once it's built, so if this doesn't work you\n"
+ " won't be able to use the index anyway. However, if you've got sufficient memory to begin with, this option will just\n"
+ " slow down the index build by doing extra, useless IO.\n"
+ ,
+ soft_exit_no_print(1); // Don't use soft-exit, it's confusing people to get an error message after the usage
+ // NOTE(review): the function-name line is missing here; this is the command-line entry point for
+ // "snap index". argv[0] is the FASTA file and argv[1] the output directory; the remaining arguments
+ // are options. It validates the options, loads the FASTA file, builds the index into the output
+ // directory and reports timing. Invalid input ends in usage() or soft_exit(1).
+ void
+ int argc,
+ const char **argv)
+ if (argc < 2) {
+ usage();
+ }
+ const char *fastaFile = argv[0];
+ const char *outputDir = argv[1];
+ unsigned maxThreads = GetNumberOfProcessors();
+ int seedLen = DEFAULT_SEED_SIZE;
+ double slack = DEFAULT_SLACK;
+ bool computeBias = true;
+ const char *pieceNameTerminatorCharacters = NULL;
+ bool spaceIsAPieceNameTerminator = false;
+ const char *histogramFileName = NULL;
+ unsigned chromosomePadding = DEFAULT_PADDING;
+ bool forceExact = false;
+ unsigned keySizeInBytes = DEFAULT_KEY_BYTES;
+ bool large = false;
+ unsigned locationSize = DEFAULT_LOCATION_SIZE;
+ bool smallMemory = false;
+ // Options come in two shapes: "-x value" (the value is the next argument, so n is advanced an extra
+ // step) and "-xVALUE" (the value is glued to the switch and read from argv[n] + 2).
+ for (int n = 2; n < argc; n++) {
+ if (strcmp(argv[n], "-s") == 0) {
+ if (n + 1 < argc) {
+ seedLen = atoi(argv[n+1]);
+ n++;
+ } else {
+ usage();
+ }
+ } else if (strcmp(argv[n], "-h") == 0) {
+ if (n + 1 < argc) {
+ slack = atof(argv[n+1]);
+ n++;
+ } else {
+ usage();
+ }
+ } else if (strcmp(argv[n], "-exact") == 0) {
+ forceExact = true;
+ } else if (strcmp(argv[n], "-hg19") == 0) {
+ computeBias = false;
+ } else if (strcmp(argv[n], "-locationSize") == 0) {
+ if (n + 1 < argc) {
+ locationSize = atoi(argv[n+1]);
+ if (locationSize < 4 || locationSize > 8) {
+ WriteErrorMessage("Location size must be between 4 and 8 inclusive\n");
+ soft_exit(1);
+ }
+ n++;
+ } else {
+ usage();
+ }
+ } else if (strcmp(argv[n], "-large") == 0) {
+ large = true;
+ } else if (argv[n][0] == '-' && argv[n][1] == 'H') {
+ histogramFileName = argv[n] + 2;
+ } else if (argv[n][0] == '-' && argv[n][1] == 'O') {
+ // Deprecated, ignored parameter
+ } else if (argv[n][0] == '-' && argv[n][1] == 't') {
+ maxThreads = atoi(argv[n]+2);
+ if (maxThreads < 1 || maxThreads > 100) {
+ WriteErrorMessage("maxThreads must be between 1 and 100 inclusive (and you need not to leave a space after '-t')\n");
+ soft_exit(1);
+ }
+ } else if (argv[n][0] == '-' && argv[n][1] == 'p') {
+ chromosomePadding = atoi(argv[n] + 2);
+ if (0 == chromosomePadding) {
+ WriteErrorMessage("Invalid chromosome padding specified, must be at least one (and in practice as large as any max edit distance you might use).\n");
+ soft_exit(1);
+ }
+ } else if (argv[n][0] == '-' && argv[n][1] == 's' && argv[n][2] == 'm') {
+ smallMemory = true;
+ }
+ else if (strcmp(argv[n], "-keysize") == 0) {
+ if (n + 1 < argc) {
+ keySizeInBytes = atoi(argv[n+1]);
+ if (keySizeInBytes < 4 || keySizeInBytes > 8) {
+ WriteErrorMessage("Key size must be between 4 and 8 inclusive\n");
+ soft_exit(1);
+ }
+ n++;
+ } else {
+ usage();
+ }
+ } else if (argv[n][0] == '-' && argv[n][1] == 'B') {
+ pieceNameTerminatorCharacters = argv[n] + 2;
+ } else if (!strcmp(argv[n], "-bSpace")) {
+ spaceIsAPieceNameTerminator = true;
+ } else {
+ WriteErrorMessage("Invalid argument: %s\n\n", argv[n]);
+ usage();
+ }
+ }
+ if (seedLen < 16 || seedLen > 32) {
+ // Seeds are stored in 64 bits, so they can't be larger than 32 bases for now.
+ WriteErrorMessage("Seed length must be between 16 and 32, inclusive\n");
+ soft_exit(1);
+ }
+ if (seedLen < 19 && !computeBias && locationSize < 5) {
+ WriteErrorMessage("For hg19 with seedLen < 19, you'll need to use 5 byte location size (which will use more memory). Setting that option for you.\n");
+ locationSize = 5;
+ }
+ // A seed occupies two bits per base; it has to be at least as wide as the key ...
+ if ((unsigned)seedLen * 2 < keySizeInBytes * 8) {
+ WriteErrorMessage("You must specify a smaller keysize or a larger seed size. The seed must be big enough to fill the key\n"
+ "and takes two bits per base of seed.\n");
+ soft_exit(1);
+ }
+ // ... and may exceed it by at most 16 bits (the previous check guarantees the difference is not negative).
+ if (seedLen * 2 - keySizeInBytes * 8 > 16) {
+ WriteErrorMessage("You must specify a biger keysize or smaller seed len. SNAP restricts the number of hash tables to 4^8,\n"
+ "and needs 4^{excess seed len} hash tables, where excess seed len is the seed size minus the four times the key size.\n");
+ soft_exit(1);
+ }
+ WriteStatusMessage("Hash table slack %lf\nLoading FASTA file '%s' into memory...", slack, fastaFile);
+ BigAllocUseHugePages = false;
+ _int64 start = timeInMillis();
+ const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding);
+ if (NULL == genome) {
+ WriteErrorMessage("Unable to read FASTA file\n");
+ soft_exit(1);
+ }
+ WriteStatusMessage("%llds\n", (timeInMillis() + 500 - start) / 1000); // +500 rounds to the nearest second
+ // Captured now because the genome is no longer available once BuildIndexToDirectory returns.
+ GenomeDistance nBases = genome->getCountOfBases();
+ if (!GenomeIndex::BuildIndexToDirectory(genome, seedLen, slack, computeBias, outputDir, maxThreads, chromosomePadding, forceExact, keySizeInBytes,
+ large, histogramFileName, locationSize, smallMemory)) {
+ WriteErrorMessage("Genome index build failed\n");
+ soft_exit(1);
+ }
+ genome = NULL; // It's deleted by BuildIndexToDirectory.
+ _int64 end = timeInMillis();
+ // The divisor is clamped to at least one second so a very fast build cannot divide by zero.
+ WriteStatusMessage("Index build and save took %llds (%lld bases/s)\n",
+ (end - start) / 1000, nBases / max((end - start) / 1000, (_int64) 1));
+// Compute the value of InvalidGenomeLocation based on the number of bytes we're using in the hash table to
+// store genome locations.
+ // Sets InvalidGenomeLocation to the all-ones value that fits in `locationSize` bytes.
+ void
+SetInvalidGenomeLocation(unsigned locationSize)
+ if (locationSize != 8) {
+ InvalidGenomeLocation = ((_int64) 1 << (locationSize * 8)) - 1;
+ } else {
+ // A shift by 64 bits is not usable, so the full-width case is written as a literal.
+ InvalidGenomeLocation = 0xffffffffffffffff;
+ }
+//
+// Build an index for genome and write it into directoryName.
+//
+// The steps, in order:
+//   1. Create the directory and save the genome as "<directoryName>/Genome".
+//   2. Pick the bias table: the precomputed hg19 one unless computeBias is set (or no
+//      precomputed table exists for this seed length/key size), in which case it is computed.
+//   3. Allocate the hash tables and fill them using min(GetNumberOfProcessors(), maxThreads)
+//      worker threads, each of which handles one contiguous chunk of the genome.
+//   4. Delete the genome (this function takes ownership of it).
+//   5. Walk every hash table, turning backpointer chains into the final overflow table and
+//      appending each finished table to "<directoryName>/GenomeIndexHash".
+//   6. Write the optional histogram, "<directoryName>/OverflowTable" and the
+//      "<directoryName>/GenomeIndex" metadata file.
+//
+// Parameters:
+//   genome                - the genome to index; deleted by this function once the hash tables are built.
+//   seedLen               - seed length in bases.
+//   slack                 - hash table slack, passed through to allocateHashTables.
+//   computeBias           - true to compute bias tables from the genome rather than use the hg19 ones.
+//   directoryName         - output directory; created if it does not exist.
+//   maxThreads            - upper bound on the number of worker threads.
+//   chromosomePaddingSize - only recorded in the GenomeIndex metadata file here.
+//   forceExact            - passed through to ComputeBiasTable.
+//   hashTableKeySize      - hash table key size in bytes.
+//   large                 - when true every hash table entry carries NUM_DIRECTIONS values instead of one.
+//   histogramFileName     - optional file that receives a seed-occurrence histogram; NULL for none.
+//   locationSize          - number of bytes used to store a genome location.
+//   smallMemory           - spill the backpointer table and the half-built hash tables to disk to
+//                           reduce the memory needed.
+//
+// Returns true on success and false when one of the output files cannot be written.  Several
+// other failures are fatal and go through soft_exit(1).
+//
+ bool
+GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double slack, bool computeBias, const char *directoryName,
+    unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, unsigned hashTableKeySize,
+    bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory)
+{
+    PreventMachineHibernationWhileThisThreadIsAlive();
+    SetInvalidGenomeLocation(locationSize);
+    bool buildHistogram = (histogramFileName != NULL);
+    FILE *histogramFile = NULL;    // Only valid while buildHistogram is true.
+    if (buildHistogram) {
+        histogramFile = fopen(histogramFileName, "w");
+        if (NULL == histogramFile) {
+            WriteErrorMessage("Unable to open histogram file '%s', skipping it.\n", histogramFileName);
+            buildHistogram = false;
+        }
+    }
+    if (mkdir(directoryName, 0777) != 0 && errno != EEXIST) {
+        WriteErrorMessage("BuildIndex: failed to create directory %s\n",directoryName);
+        return false;
+    }
+    const unsigned filenameBufferSize = MAX_PATH+1;
+    char filenameBuffer[filenameBufferSize];
+    fprintf(stderr,"Saving genome...");
+    _int64 start = timeInMillis();
+    snprintf(filenameBuffer,filenameBufferSize,"%s%cGenome",directoryName,PATH_SEP);
+    if (!genome->saveToFile(filenameBuffer)) {
+        WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n");
+        return false;
+    }
+    fprintf(stderr,"%llds\n", (timeInMillis() + 500 - start) / 1000);
+    GenomeIndex *index = new GenomeIndex();
+    index->genome = NULL; // We always delete the index when we're done, but we delete the genome first to save space during the overflow table build.
+    GenomeDistance countOfBases = genome->getCountOfBases();
+    if (locationSize != 8 && countOfBases > ((_int64) 1 << (locationSize*8)) - 16) {
+        WriteErrorMessage("Genome is too big for %d byte genome locations. Specify a larger location size with -locationSize\n", locationSize);
+        soft_exit(1);
+    }
+    // Compute bias table sizes, unless we're using the precomputed ones hardcoded in BiasTables.cpp
+    double *biasTable = NULL;
+    if (!computeBias) {
+        if (large) {
+            biasTable = hg19_biasTables_large[hashTableKeySize][seedLen];
+        } else {
+            biasTable = hg19_biasTables[hashTableKeySize][seedLen];
+        }
+        if (NULL == biasTable) {
+            WriteErrorMessage("-hg19 not available for this seed length/key size/small-or-large combo. Computing bias tables the hard way.\n");
+            computeBias = true;
+        }
+    }
+    if (computeBias) {
+        unsigned nHashTables = 1 << ((max((unsigned)seedLen, hashTableKeySize * 4) - hashTableKeySize * 4) * 2);
+        biasTable = new double[nHashTables];
+        ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large);
+    }
+    WriteStatusMessage("Allocating memory for hash tables...");
+    start = timeInMillis();
+    unsigned nHashTables;
+    SNAPHashTable** hashTables = index->hashTables =
+        allocateHashTables(&nHashTables, countOfBases, slack, seedLen, hashTableKeySize, large, locationSize, biasTable);
+    index->nHashTables = nHashTables;
+    //
+    // Set up the hash tables. Each table has a key value of the lower 32 bits of the seed, and data
+    // of two integers. There is one integer each for the seed and its reverse complement (i.e., what you'd
+    // get from the complementary DNA strand, A<->T and G<->C with the string order reversed).
+    // The first integer is always for the version of the seed with "lower" value, using an arbitrary
+    // total order that we define in Seed.h. Some seeds are their own reverse complements (e.g.,
+    // AGCT), in which case only the first integer is used.
+    //
+    // NOTE(review): 0x8effffffffffffff has its top bit set, so the (_int64) cast below yields a negative
+    // number when locationSize == 8.  Left untouched here; confirm this is what OverflowBackpointerAnchor expects.
+    //
+    OverflowBackpointerAnchor *overflowAnchor = new OverflowBackpointerAnchor(__min(((locationSize == 8) ? (_int64)0x8effffffffffffff : GenomeLocationAsInt64(InvalidGenomeLocation)) - countOfBases, countOfBases)); // i.e., as much as the address space will allow.
+    WriteStatusMessage("%llds\nBuilding hash tables.\n", (timeInMillis() + 500 - start) / 1000);
+    start = timeInMillis();
+    volatile _int64 nextOverflowBackpointer = 0;
+    volatile _int64 nonSeeds = 0;
+    volatile _int64 seedsWithMultipleOccurrences = 0;
+    volatile _int64 genomeLocationsInOverflowTable = 0; // Number of extra hits on duplicate indices. This should come out once we implement the overflow table.
+    volatile _int64 bothComplementsUsed = 0; // Number of hash buckets where both complements are used
+    volatile _int64 noBaseAvailable = 0; // Number of places where getSubstring returned null.
+    volatile _int64 nBasesProcessed = 0;
+    volatile int runningThreadCount;
+    SingleWaiterObject doneObject;
+    CreateSingleWaiterObject(&doneObject);
+    unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads);
+    BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads];
+    ExclusiveLock *hashTableLocks = new ExclusiveLock[nHashTables];
+    for (unsigned i = 0; i < nHashTables; i++) {
+        InitializeExclusiveLock(&hashTableLocks[i]);
+    }
+    runningThreadCount = nThreads;
+    GenomeDistance nextChunkToProcess = 0;
+    _int64 * lastBackpointerIndexUsedByThread = NULL;
+    ExclusiveLock backpointerSpillLock;
+    FILE *backpointerSpillFile = NULL;
+    char *backpointerSpillFileName = NULL;
+    InitializeExclusiveLock(&backpointerSpillLock);
+    if (smallMemory) {
+        lastBackpointerIndexUsedByThread = new _int64[nThreads];
+        for (unsigned i = 0; i < nThreads; i++) {
+            lastBackpointerIndexUsedByThread[i] = 0;
+        }
+#define BACKPOINTER_TABLE_SPILL_FILE_NAME "BackpointerTableSpillFile"
+        backpointerSpillFileName = new char[strlen(directoryName) + 1 + strlen(BACKPOINTER_TABLE_SPILL_FILE_NAME) + 1];
+        sprintf(backpointerSpillFileName, "%s%c%s", directoryName, PATH_SEP, BACKPOINTER_TABLE_SPILL_FILE_NAME);
+        backpointerSpillFile = fopen(backpointerSpillFileName, "w+b");
+        if (NULL == backpointerSpillFile) {
+            WriteErrorMessage("Unable to create spill file '%s' for -sm\n", backpointerSpillFileName);
+            soft_exit(1);
+        }
+    }
+    //
+    // Hand each worker one contiguous chunk of the genome.  The last worker's chunk ends at
+    // countOfBases - seedLen - 1.
+    //
+    for (unsigned i = 0; i < nThreads; i++) {
+        threadContexts[i].whichThread = i;
+        threadContexts[i].nThreads = nThreads;
+        threadContexts[i].doneObject = &doneObject;
+        threadContexts[i].genome = genome;
+        threadContexts[i].genomeChunkStart = nextChunkToProcess;
+        if (i == nThreads - 1) {
+            nextChunkToProcess = countOfBases - seedLen - 1;
+        } else {
+            nextChunkToProcess += (countOfBases - seedLen) / nThreads;
+        }
+        threadContexts[i].genomeChunkEnd = nextChunkToProcess;
+        threadContexts[i].nBasesProcessed = &nBasesProcessed;
+        threadContexts[i].index = index;
+        threadContexts[i].runningThreadCount = &runningThreadCount;
+        threadContexts[i].seedLen = seedLen;
+        threadContexts[i].noBaseAvailable = &noBaseAvailable;
+        threadContexts[i].nonSeeds = &nonSeeds;
+        threadContexts[i].seedsWithMultipleOccurrences = &seedsWithMultipleOccurrences;
+        threadContexts[i].genomeLocationsInOverflowTable = &genomeLocationsInOverflowTable;
+        threadContexts[i].bothComplementsUsed = &bothComplementsUsed;
+        threadContexts[i].overflowAnchor = overflowAnchor;
+        threadContexts[i].nextOverflowBackpointer = &nextOverflowBackpointer;
+        threadContexts[i].hashTableLocks = hashTableLocks;
+        threadContexts[i].hashTableKeySize = hashTableKeySize;
+        threadContexts[i].large = large;
+        threadContexts[i].locationSize = locationSize;
+        threadContexts[i].backpointerSpillLock = &backpointerSpillLock;
+        threadContexts[i].lastBackpointerIndexUsedByThread = lastBackpointerIndexUsedByThread;
+        threadContexts[i].backpointerSpillFile = backpointerSpillFile;
+        StartNewThread(BuildHashTablesWorkerThreadMain, &threadContexts[i]);
+    }
+    WaitForSingleWaiterObject(&doneObject);
+    DestroySingleWaiterObject(&doneObject);
+    DestroyExclusiveLock(&backpointerSpillLock);
+    delete[] lastBackpointerIndexUsedByThread;
+    //
+    // The workers are finished, so the per-table locks and the thread contexts can go as well
+    // (previously they were leaked).
+    //
+    for (unsigned i = 0; i < nHashTables; i++) {
+        DestroyExclusiveLock(&hashTableLocks[i]);
+    }
+    delete[] hashTableLocks;
+    hashTableLocks = NULL;
+    delete[] threadContexts;
+    threadContexts = NULL;
+    if (locationSize != 8 && seedsWithMultipleOccurrences + genomeLocationsInOverflowTable + (_int64)genome->getCountOfBases() > ((_int64)1 << (8 * locationSize)) - 15) { // Only really need -1 for InvalidGenomeLocation, the rest is just spare
+        WriteErrorMessage("Ran out of overflow table namespace. This genome cannot be indexed with this seed and location size. Increase at least one.\n");
+        soft_exit(1);   // Same exit path as every other fatal error in this function.
+    }
+    size_t totalUsedHashTableElements = 0;
+    for (unsigned j = 0; j < index->nHashTables; j++) {
+        totalUsedHashTableElements += hashTables[j]->GetUsedElementCount();
+//        printf("HashTable[%d] has %lld used elements, loading %lld%%\n",j,(_int64)hashTables[j]->GetUsedElementCount(),
+//            (_int64)hashTables[j]->GetUsedElementCount() * 100 / (_int64)hashTables[j]->GetTableSize());
+    }
+    WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n",
+        seedsWithMultipleOccurrences,
+        (seedsWithMultipleOccurrences * 100) / countOfBases,
+        genomeLocationsInOverflowTable,
+        genomeLocationsInOverflowTable * 100 / countOfBases,
+        nonSeeds,
+        (nonSeeds * 100) / countOfBases,
+        bothComplementsUsed,
+        noBaseAvailable);
+    WriteStatusMessage("Hash table build took %llds\n",(timeInMillis() + 500 - start) / 1000);
+    //
+    // We're done with the raw genome. Delete it to save some memory.
+    //
+    delete genome;
+    genome = NULL;
+    char *halfBuiltHashTableSpillFileName = NULL;
+    if (smallMemory) {
+        //
+        // In the hash table build, we use the backpointer table sequentially, and the hash tables randomly. In the
+        // overflow table build, it's the opposite. So, we spill out the half-built hash tables (except for #0, which
+        // we need immediately anyway), and then load back in the backpointer table.
+        //
+        _int64 startSpill = timeInMillis();
+        WriteStatusMessage("Spilling half-built hash tables to disk..");
+        halfBuiltHashTableSpillFileName = new char[strlen(directoryName) + 1 + strlen(HALF_BUILT_HASH_TABLE_SPILL_FILE_NAME) + 20]; // +20 is for the number and trailing null
+        for (unsigned i = 1; i < nHashTables; i++) {
+            sprintf(halfBuiltHashTableSpillFileName, "%s%c%s.%d", directoryName, PATH_SEP, HALF_BUILT_HASH_TABLE_SPILL_FILE_NAME, i);
+            size_t bytesWritten;
+            //
+            // The table is deleted right after this, so a failed spill would lose its contents.  Treat it as fatal
+            // instead of discovering the problem when the table is reloaded.
+            //
+            if (!hashTables[i]->saveToFile(halfBuiltHashTableSpillFileName, &bytesWritten)) {
+                WriteErrorMessage("Unable to spill hash table %d to file '%s'\n", i, halfBuiltHashTableSpillFileName);
+                soft_exit(1);
+            }
+            delete hashTables[i];
+            hashTables[i] = NULL;
+        }
+        _int64 spillDone = timeInMillis();
+        WriteStatusMessage("%llds\nReloading backpointer table from disk...", (spillDone - startSpill + 500) / 1000);
+        overflowAnchor->loadFromFile(backpointerSpillFile);
+        fclose(backpointerSpillFile);
+        DeleteSingleFile(backpointerSpillFileName);
+        delete[] backpointerSpillFileName;
+        WriteStatusMessage("%llds\n", (timeInMillis() - spillDone + 500) / 1000);
+    }
+    WriteStatusMessage("Building overflow table.\n");
+    start = timeInMillis();
+    fflush(stdout);
+    //
+    // Now build the real overflow table and simultaneously fixup the hash table entries.
+    // If locationSize == 4, then it's built from 32 bit entries, otherwise from 64.
+    // Its format is one entry of the number of genome locations matching the
+    // particular seed, followed by that many genome locations, reverse sorted
+    // (the reverse part is for historical reasons, but it's necessary for correct functioning).
+    // For each seed with multiple occurrences in the genome, there is one count.
+    // For each genome location that's not unique, there is one list entry. So, the size
+    // of the overflow table is the number of non-unique seeds plus the number of non-unique
+    // genome locations.
+    //
+    index->overflowTableSize = seedsWithMultipleOccurrences + genomeLocationsInOverflowTable;
+    if (locationSize > 4) {
+        index->overflowTable64 = (_int64 *)BigAlloc(index->overflowTableSize * sizeof(*index->overflowTable64));
+    } else {
+        index->overflowTable32 = (unsigned *)BigAlloc(index->overflowTableSize * sizeof(*index->overflowTable32));
+    }
+    if ((_int64)index->overflowTableSize + countOfBases >= GenomeLocationAsInt64(InvalidGenomeLocation) - 15) {
+        WriteErrorMessage("Not enough address space to index this genome with this seed size. Try a larger seed or location size.\n");
+        soft_exit(1);
+    }
+    _uint64 nBackpointersProcessed = 0;
+    _int64 lastPrintTime = timeInMillis();
+    const unsigned maxHistogramEntry = 500000;
+    _uint64 countOfTooBigForHistogram = 0;
+    _uint64 sumOfTooBigForHistogram = 0;
+    _uint64 largestSeed = 0;
+    unsigned *histogram = NULL;
+    if (buildHistogram) {
+        histogram = new unsigned[maxHistogramEntry+1];
+        for (unsigned i = 0; i <= maxHistogramEntry; i++) {
+            histogram[i] = 0;
+        }
+    }
+    //
+    // Build the overflow table by walking each of the hash tables and looking for elements to fix up.
+    // Write the hash tables as we go so that we can free their memory on the fly.
+    //
+    snprintf(filenameBuffer,filenameBufferSize,"%s%cGenomeIndexHash", directoryName, PATH_SEP);
+    FILE *tablesFile = fopen(filenameBuffer, "wb");
+    if (NULL == tablesFile) {
+        WriteErrorMessage("Unable to open hash table file '%s'\n", filenameBuffer);
+        soft_exit(1);
+    }
+    size_t totalBytesWritten = 0;
+    _uint64 overflowTableIndex = 0;
+    _uint64 duplicateSeedsProcessed = 0;
+    for (unsigned whichHashTable = 0; whichHashTable < nHashTables; whichHashTable++) {
+        if (NULL == hashTables[whichHashTable]) {
+            // This table was spilled to disk above; bring it back and remove the spill file.
+            _ASSERT(smallMemory);
+            sprintf(halfBuiltHashTableSpillFileName, "%s%c%s.%d", directoryName, PATH_SEP, HALF_BUILT_HASH_TABLE_SPILL_FILE_NAME, whichHashTable);
+            GenericFile_stdio *file = GenericFile_stdio::open(halfBuiltHashTableSpillFileName);
+            if (NULL == file) {
+                WriteErrorMessage("Unable to open file '%s' to reload spilled hash table.\n", halfBuiltHashTableSpillFileName);
+                soft_exit(1);
+            }
+            hashTables[whichHashTable] = SNAPHashTable::loadFromGenericFile(file);
+            file->close();
+            DeleteSingleFile(halfBuiltHashTableSpillFileName);
+        }
+        for (_uint64 whichEntry = 0; whichEntry < hashTables[whichHashTable]->GetTableSize(); whichEntry++) {
+            unsigned *values32 = (unsigned *)hashTables[whichHashTable]->getEntryValues(whichEntry);
+            char *values64 = (char *)values32; // char * because it's variable sized
+            for (int i = 0; i < (large ? NUM_DIRECTIONS : 1); i++) {
+                _int64 value;
+                if (locationSize > 4) {
+                    value = 0;
+                    memcpy((char *)&value, values64 + locationSize * i, locationSize); // assumes little endian
+                } else {
+                    value = values32[i];
+                }
+                if (value >= countOfBases && value != GenomeLocationAsInt64(InvalidGenomeLocation) && value != GenomeLocationAsInt64(InvalidGenomeLocation) - 1) {
+                    //
+                    // This is an overflow pointer. Fix it up. Count the number of occurrences of this
+                    // seed by walking the overflow chain.
+                    //
+                    duplicateSeedsProcessed++;
+                    _uint64 nOccurrences = 0;
+                    _int64 backpointerIndex = value - countOfBases;
+                    while (backpointerIndex != -1) {
+                        nOccurrences++;
+                        OverflowBackpointer *backpointer = overflowAnchor->getBackpointer(backpointerIndex);
+                        _ASSERT(overflowTableIndex + nOccurrences < index->overflowTableSize);
+                        if (locationSize > 4) {
+                            index->overflowTable64[overflowTableIndex + nOccurrences] = GenomeLocationAsInt64(backpointer->genomeLocation);
+                        } else {
+                            index->overflowTable32[overflowTableIndex + nOccurrences] = GenomeLocationAsInt32(backpointer->genomeLocation);
+                        }
+                        backpointerIndex = backpointer->nextIndex;
+                    }
+                    _ASSERT(nOccurrences > 1);
+                    //
+                    // Fill the count in as the first thing in the overflow table
+                    // and patch the value into the hash table.
+                    //
+                    _ASSERT(overflowTableIndex < index->overflowTableSize);
+                    if (locationSize > 4) {
+                        index->overflowTable64[overflowTableIndex] = nOccurrences;
+                        _int64 newValue = overflowTableIndex + countOfBases;
+                        memcpy(values64 + locationSize * i, &newValue, locationSize); // Assumes little endian
+                    } else {
+                        index->overflowTable32[overflowTableIndex] = (unsigned)nOccurrences;
+                        values32[i] = (unsigned)(overflowTableIndex + countOfBases);
+                    }
+                    overflowTableIndex += 1 + nOccurrences;
+                    _ASSERT(overflowTableIndex <= index->overflowTableSize);
+                    nBackpointersProcessed += nOccurrences;
+                    //
+                    // Sort the overflow table entries, because the paired-end aligner relies on this. Sort them backwards, because that's
+                    // what it expects. For those who are desperately curious, this is because it was originally built this way by accident
+                    // before there was any concept of doing binary search over a seed's hits. When the binary search was built, it relied
+                    // on this. Then, when the index build was parallelized it was easier just to preserve the old order than to change the
+                    // code in the aligner. So now you know.
+                    //
+                    if (locationSize > 4) {
+                        qsort(&index->overflowTable64[overflowTableIndex -nOccurrences], nOccurrences, sizeof(index->overflowTable64[0]), BackwardsInt64Compare);
+                    } else {
+                        qsort(&index->overflowTable32[overflowTableIndex -nOccurrences], nOccurrences, sizeof(index->overflowTable32[0]), BackwardsUnsignedCompare);
+                    }
+                    if (timeInMillis() - lastPrintTime > 60 * 1000) {
+                        WriteStatusMessage("%lld/%lld duplicate seeds, %lld/%lld backpointers, %d/%d hash tables processed\n",
+                            duplicateSeedsProcessed, seedsWithMultipleOccurrences, nBackpointersProcessed, genomeLocationsInOverflowTable,
+                            whichHashTable, nHashTables);
+                        lastPrintTime = timeInMillis();
+                    }
+                    //
+                    // If we're building a histogram, update it.
+                    //
+                    if (buildHistogram) {
+                        if (nOccurrences > maxHistogramEntry) {
+                            countOfTooBigForHistogram++;
+                            sumOfTooBigForHistogram += nOccurrences;
+                        } else {
+                            histogram[nOccurrences]++;
+                        }
+                        largestSeed = __max(largestSeed, nOccurrences);
+                    }
+                } // If this entry needs patching
+            } // forward and RC if large table
+        } // for each entry in the hash table
+        //
+        // We're done with this hash table, free it to relieve memory pressure.
+        //
+        size_t bytesWrittenThisHashTable;
+        if (!hashTables[whichHashTable]->saveToFile(tablesFile, &bytesWrittenThisHashTable)) {
+            WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save hash table %d\n", whichHashTable);
+            return false;
+        }
+        totalBytesWritten += bytesWrittenThisHashTable;
+        delete hashTables[whichHashTable];
+        hashTables[whichHashTable] = NULL;
+    } // for each hash table
+    fclose(tablesFile);
+    delete[] halfBuiltHashTableSpillFileName;   // NULL unless smallMemory; previously leaked.
+    halfBuiltHashTableSpillFileName = NULL;
+    _ASSERT(overflowTableIndex == index->overflowTableSize); // We used exactly what we expected to use.
+    delete overflowAnchor;
+    overflowAnchor = NULL;
+    if (buildHistogram) {
+        histogram[1] = (unsigned)(totalUsedHashTableElements - seedsWithMultipleOccurrences);
+        for (unsigned i = 0; i <= maxHistogramEntry; i++) {
+            if (histogram[i] != 0) {
+                fprintf(histogramFile,"%u\t%u\n", i, histogram[i]);
+            }
+        }
+        //
+        // The three counters are 64 bits wide; printing them with %d is undefined behavior, so cast
+        // them to a type that has a matching conversion specifier.
+        //
+        fprintf(histogramFile, "%llu larger than %u with %llu total genome locations, largest seed %llu\n",
+            (unsigned long long)countOfTooBigForHistogram, maxHistogramEntry,
+            (unsigned long long)sumOfTooBigForHistogram, (unsigned long long)largestSeed);
+        fclose(histogramFile);
+        delete [] histogram;
+    }
+    //
+    // Now save out the part of the index that's independent of the genome itself.
+    //
+    WriteStatusMessage("Overflow table build and hash table save took %llds\nSaving overflow table...", (timeInMillis() + 500 - start)/1000);
+    start = timeInMillis();
+    snprintf(filenameBuffer, filenameBufferSize, "%s%cOverflowTable", directoryName, PATH_SEP);
+    FILE* fOverflowTable = fopen(filenameBuffer, "wb");
+    if (fOverflowTable == NULL) {
+        WriteErrorMessage("Unable to open overflow table file, '%s', %d\n", filenameBuffer, errno);
+        return false;
+    }
+    // Write the overflow table in pieces of at most writeSize bytes.
+    const unsigned writeSize = 32 * 1024 * 1024;
+    unsigned overflowElementSize = (locationSize > 4) ? sizeof(*index->overflowTable64) : sizeof(*index->overflowTable32);
+    char *tableToWriteAsChar = (locationSize > 4) ? (char *)index->overflowTable64 : (char *)index->overflowTable32;
+    for (size_t writeOffset = 0; writeOffset < index->overflowTableSize * overflowElementSize; ) {
+        unsigned amountToWrite = (unsigned)__min((size_t)writeSize,(size_t)index->overflowTableSize * overflowElementSize - writeOffset);
+        size_t amountWritten = fwrite(tableToWriteAsChar + writeOffset, 1, amountToWrite, fOverflowTable);
+        if (amountWritten < amountToWrite) {
+            WriteErrorMessage("GenomeIndex::saveToDirectory: fwrite failed, %d\n",errno);
+            fclose(fOverflowTable);
+            return false;
+        }
+        writeOffset += amountWritten;
+    }
+    fclose(fOverflowTable);
+    fOverflowTable = NULL;
+    //
+    // The save format is:
+    // file 'GenomeIndex' contains in order major version, minor version, nHashTables, overflowTableSize, seedLen,
+    // chromosomePaddingSize, hashTableKeySize, total bytes of hash table data, 0 for a large table or 1 otherwise,
+    // and locationSize.
+    // File 'OverflowTable' holds the overflow table.
+    // All of the hash tables are saved one after another in file 'GenomeIndexHash'.
+    // And the genome itself is already saved in the same directory in its own format.
+    //
+    snprintf(filenameBuffer, filenameBufferSize, "%s%cGenomeIndex", directoryName, PATH_SEP);
+    FILE *indexFile = fopen(filenameBuffer,"w");
+    if (indexFile == NULL) {
+        WriteErrorMessage("Unable to open file '%s' for write.\n", filenameBuffer);
+        return false;
+    }
+    // The two %lld arguments are cast so that the argument width matches the format regardless of their declared types.
+    fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", GenomeIndexFormatMajorVersion, GenomeIndexFormatMinorVersion, index->nHashTables,
+        (long long)index->overflowTableSize, seedLen, chromosomePaddingSize, hashTableKeySize, (long long)totalBytesWritten, large ? 0 : 1, locationSize);
+    fclose(indexFile);
+    delete index;
+    if (computeBias && biasTable != NULL) {
+        // Only a table we computed ourselves is heap allocated; the hg19 tables are not ours to free.
+        delete[] biasTable;
+    }
+    WriteStatusMessage("%llds\n", (timeInMillis() + 500 - start) / 1000);
+    return true;
+}
+//
+// Validate the index parameters and allocate one empty SNAPHashTable per distinct value of the
+// seed bases that do not fit in the hash key.
+//
+// Parameters:
+//   o_nTables        - receives the number of tables allocated, 4^(seedLen - hashTableKeySize * 4).
+//   countOfBases     - number of bases in the genome.
+//   slack            - extra capacity as a fraction of the genome size; must be positive.
+//   seedLen          - seed length in bases; must be within [hashTableKeySize * 4, hashTableKeySize * 4 + 9].
+//   hashTableKeySize - key size in bytes, 4-8 inclusive.
+//   large            - when true every entry holds two values, otherwise one.
+//   locationSize     - bytes per genome location, 4-8 inclusive.
+//   biasTable        - one multiplier per table, applied to the average table size.  Must not be NULL.
+//
+// Returns a new[]-allocated array of *o_nTables table pointers.  Any invalid parameter is fatal
+// (soft_exit(1)).
+//
+SNAPHashTable** GenomeIndex::allocateHashTables(
+    unsigned* o_nTables,
+    GenomeDistance countOfBases,
+    double slack,
+    int seedLen,
+    unsigned hashTableKeySize,
+    bool large,
+    unsigned locationSize,
+    double* biasTable)
+{
+    _ASSERT(NULL != biasTable);
+    BigAllocUseHugePages = false; // Huge pages just slow down allocation and don't help much for hash table build, so don't use them.
+    if (slack <= 0) {
+        WriteErrorMessage("allocateHashTables: must have positive slack for the hash table to work. 0.3 is probably OK, 0.1 is minimal, less will wreak havoc with perf.\n");
+        soft_exit(1);
+    }
+    if (seedLen <= 0) {
+        WriteErrorMessage("allocateHashTables: seedLen is too small (must be > 0, and practically should be >= 15 or so.\n");
+        soft_exit(1);
+    }
+    if (hashTableKeySize < 4 || hashTableKeySize > 8) {
+        WriteErrorMessage("allocateHashTables: key size must be 4-8 inclusive\n");
+        soft_exit(1);
+    }
+    if ((unsigned)seedLen < hashTableKeySize * 4) {
+        WriteErrorMessage("allocateHashTables: key size too large for seedLen.\n");
+        soft_exit(1);
+    }
+    if ((unsigned)seedLen > hashTableKeySize * 4 + 9) {
+        WriteErrorMessage("allocateHashTables: key size too small for seedLen.\n");
+        soft_exit(1);
+    }
+    if (locationSize < 4 || locationSize > 8) {
+        WriteErrorMessage("Location size must be between 4 and 8 inclusive.\n");
+        soft_exit(1);
+    }
+    //
+    // Make an array of HashTables, size depending on the seed size. The way the index works is that we use
+    // the low bits of the seed as a hash key. Any remaining bases are used as an index into the
+    // particular hash table in question. The division between "low" and "high" depends on the hash table key size.
+    //
+    unsigned nHashTablesToBuild = 1 << ((seedLen - hashTableKeySize * 4) * 2);
+    if (nHashTablesToBuild > 256 * 1024) {
+        WriteErrorMessage("allocateHashTables: key size too small for seedLen. Try specifying -keySize and giving it a larger value.\n");
+        soft_exit(1);
+    }
+    //
+    // Average size of the hash table. We bias this later based on the actual content of the genome.
+    //
+    size_t hashTableSize = (size_t) ((double)countOfBases * (slack + 1.0) / nHashTablesToBuild);
+    SNAPHashTable **hashTables = new SNAPHashTable*[nHashTablesToBuild];
+    for (unsigned i = 0; i < nHashTablesToBuild; i++) {
+        //
+        // Create the actual hash tables. It turns out that the human genome is highly non-uniform in its
+        // sequences of bases, so we bias the hash table sizes based on their popularity (which is empirically
+        // measured), or use the estimates that we generated and passed in as "biasTable."
+        //
+        double bias = biasTable[i];
+        //
+        // Keep the biased size in a size_t.  Narrowing the product to unsigned here is undefined once it
+        // exceeds 2^32 - 1, which a big genome with few tables can reach.
+        // NOTE(review): this assumes SNAPHashTable's size parameter is wider than 32 bits, as the call in
+        // ComputeBiasTable that hands it a GenomeDistance suggests - confirm against HashTable.h.
+        //
+        size_t biasedSize = (size_t) (hashTableSize * bias);
+        if (biasedSize < 100) {
+            biasedSize = 100;
+        }
+        hashTables[i] = new SNAPHashTable(biasedSize, hashTableKeySize, locationSize, large ? 2 : 1, GenomeLocationAsInt64(InvalidGenomeLocation));
+        if (NULL == hashTables[i]) {
+            WriteErrorMessage("IndexBuilder: unable to allocate HashTable %d of %d\n", i+1, nHashTablesToBuild);
+            soft_exit(1);
+        }
+    }
+    *o_nTables = nHashTablesToBuild;
+    return hashTables;
+}
+//
+// Construct an empty index: no hash tables, no overflow table, no genome and no mapped files.
+//
+GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL)
+{
+}
+//
+// Release everything the index holds: each hash table and the table array, then either the
+// mapped files (when the tables were mapped) or the BigAlloc'ed overflow tables and table blob,
+// and finally the genome.
+//
+// NOTE(review): the destructor header and the braces around both bodies were missing from this
+// copy of the patch and have been reconstructed - confirm against the upstream GenomeIndex.cpp.
+//
+GenomeIndex::~GenomeIndex()
+{
+    if (NULL != hashTables) {
+        for (unsigned i = 0; i < nHashTables; i++) {
+            delete hashTables[i];
+            hashTables[i] = NULL;
+        }
+    }
+    delete [] hashTables;
+    hashTables = NULL;
+    if (NULL != mappedTables) {
+        mappedTables->close();
+        // Guard the second mapping separately so a half-initialized index cannot dereference NULL here.
+        if (NULL != mappedOverflowTable) {
+            mappedOverflowTable->close();
+        }
+    } else {
+        if (NULL != overflowTable32) {
+            BigDealloc(overflowTable32);
+            overflowTable32 = NULL;
+        }
+        if (NULL != overflowTable64) {
+            BigDealloc(overflowTable64);
+            overflowTable64 = NULL;
+        }
+        if (NULL != tablesBlob) {
+            BigDealloc(tablesBlob);
+            tablesBlob = NULL;
+        }
+    }
+    delete genome;
+    genome = NULL;
+}
+/**
+ * Fill in table with the table size biases for a given genome and seed size.
+ * We assume that table is already of the correct size for our seed size
+ * (namely 4**(seedLen-hashTableKeySize*4)), and just fill in the values.
+ *
+ * If the genome is less than 2^20 bases (or forceExact is set), we count the seeds in each
+ * table exactly; otherwise, we estimate them using approximate counters that are filled in by
+ * min(GetNumberOfProcessors(), maxThreads) worker threads.
+ *
+ * Each entry is written as (distinct seeds counted for that table * number of tables) / countOfBases.
+ *
+ * When large is set, a seed that is bigger than its reverse complement is replaced by the
+ * reverse complement before it is counted.
+ */
+ void
+GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large)
+{
+    _int64 start = timeInMillis();
+    WriteStatusMessage("Computing bias table.\n");
+    unsigned nHashTables = ((unsigned)seedLen <= (hashTableKeySize * 4) ? 1 : 1 << (((unsigned)seedLen - hashTableKeySize * 4) * 2));
+    GenomeDistance countOfBases = genome->getCountOfBases();
+    static const unsigned GENOME_SIZE_FOR_EXACT_COUNT = 1 << 20; // Needs to be a power of 2 for hash sets
+    bool computeExactly = (countOfBases < GENOME_SIZE_FOR_EXACT_COUNT) || forceExact;
+    if (countOfBases >= (((_int64)1) << 62) && forceExact) {
+        WriteErrorMessage("You can't use -exact for genomes with >= 2^62 bases (not that you have that much memory or disk anyway).\n");
+        soft_exit(1);
+    }
+    _uint64 *numExactSeeds = NULL;
+    vector<ApproximateCounter> approxCounters(nHashTables);
+    _int64 validSeeds = 0;
+    if (computeExactly) {
+        numExactSeeds = new _uint64[nHashTables];
+        for (unsigned i = 0; i < nHashTables; i++) {
+            numExactSeeds[i] = 0;
+        }
+        //
+        // Create a hash table to record all of the seeds we've already seen. The key is the seed, and the value is just one byte
+        // that the hash table package needs to be able to differentiate empty from non-empty entries. The *11/10 is to leave some slack
+        // in the hash table. In any case, this table should be smaller than the final index (because it doesn't need
+        // any genome locations, not to mention an overflow table), so it should fit in memory.
+        //
+        SNAPHashTable *seedsSeen = new SNAPHashTable((countOfBases * 11) / 10, ((seedLen + 3) * 2) / 8, 1, 1, 0xff);
+        for (_int64 i = 0; i < countOfBases - seedLen; i++) {
+            if (i % 100000000 == 0) {
+                WriteStatusMessage("Bias computation: %lld / %lld\n",(_int64)i, (_int64)countOfBases);
+            }
+            const char *bases = genome->getSubstring(i,seedLen);
+            //
+            // Check it for NULL, because Genome won't return strings that cross contig boundaries.
+            //
+            if (NULL == bases) {
+                continue;
+            }
+            //
+            // We don't build seeds out of sections of the genome that contain 'N.' If this is one, skip it.
+            //
+            if (!Seed::DoesTextRepresentASeed(bases, seedLen)) {
+                continue;
+            }
+            Seed seed(bases, seedLen);
+            validSeeds++;
+            if (large && seed.isBiggerThanItsReverseComplement()) {
+                // For large hash tables, because seeds and their reverse complements are stored
+                // together, figure out which one is used for the hash table key, and use that
+                // one.
+                seed = ~seed;
+            }
+            _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
+            if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) {
+                // First time we've seen this seed: remember it and count it against its table.
+                _uint64 value = 42;
+                seedsSeen->Insert(seed.getBases(), &value);
+                numExactSeeds[seed.getHighBases(hashTableKeySize)]++;
+            }
+        }
+//        for (unsigned i = 0; i < nHashTables; i++) printf("Hash table %d is predicted to have %lld entries\n", i, numExactSeeds[i]);
+        delete seedsSeen;
+        seedsSeen = NULL;
+    } else {
+        //
+        // Run through the table in parallel.
+        //
+        unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads);
+        volatile int runningThreadCount = nThreads;
+        volatile _int64 nBasesProcessed = 0;
+        SingleWaiterObject doneObject;
+        ExclusiveLock *locks;
+        locks = new ExclusiveLock[nHashTables];
+        for (unsigned i = 0; i < nHashTables; i++) {
+            InitializeExclusiveLock(&locks[i]);
+        }
+        CreateSingleWaiterObject(&doneObject);
+        ComputeBiasTableThreadContext *contexts = new ComputeBiasTableThreadContext[nThreads];
+        GenomeDistance nextChunkToProcess = 0;
+        for (unsigned i = 0; i < nThreads; i++) {
+            contexts[i].approxCounters = &approxCounters;
+            contexts[i].doneObject = &doneObject;
+            contexts[i].genomeChunkStart = nextChunkToProcess;
+            if (i == nThreads - 1) {
+                nextChunkToProcess = countOfBases - seedLen - 1;
+            } else {
+                nextChunkToProcess += (countOfBases - seedLen) / nThreads;
+            }
+            contexts[i].genomeChunkEnd = nextChunkToProcess;
+            contexts[i].nHashTables = nHashTables;
+            contexts[i].hashTableKeySize = hashTableKeySize;
+            contexts[i].runningThreadCount = &runningThreadCount;
+            contexts[i].genome = genome;
+            contexts[i].nBasesProcessed = &nBasesProcessed;
+            contexts[i].seedLen = seedLen;
+            contexts[i].validSeeds = &validSeeds;
+            contexts[i].approximateCounterLocks = locks;
+            contexts[i].large = large;
+            StartNewThread(ComputeBiasTableWorkerThreadMain, &contexts[i]);
+        }
+        WaitForSingleWaiterObject(&doneObject);
+        DestroySingleWaiterObject(&doneObject);
+        for (unsigned i = 0; i < nHashTables; i++) {
+            DestroyExclusiveLock(&locks[i]);
+        }
+        delete [] locks;
+        delete [] contexts;     // The workers are done with their contexts; this array used to leak.
+    }
+    for (unsigned i = 0; i < nHashTables; i++) {
+        _uint64 count = computeExactly ? numExactSeeds[i] : approxCounters[i].getCount();
+        table[i] = ((double)count * nHashTables) / (double)countOfBases;
+    }
+    delete [] numExactSeeds;    // Allocated with new[], so it must be released with delete[].
+    numExactSeeds = NULL;
+    WriteStatusMessage("Computed bias table in %llds\n", (timeInMillis() + 500 - start) / 1000);
+}
+//
+// A fixed-capacity buffer of seed low bases waiting to be added to one ApproximateCounter.
+// Seeds are queued with addSeed() and pushed into the counter in one go with apply(), so the
+// caller only needs to take that counter's lock once per batch.
+//
+struct PerCounterBatch {
+    static const unsigned nSeedsPerBatch = 1000;    // Capacity of lowBases
+    unsigned nUsed;                                 // Number of entries of lowBases currently queued
+    _uint64 lowBases[nSeedsPerBatch];
+    PerCounterBatch() : nUsed(0) {}
+    //
+    // Queue one seed.  Returns true once the batch is full, at which point it has to be
+    // apply()ed before another seed is added.
+    //
+    bool addSeed(_uint64 seedLowBases) {
+        _ASSERT(nUsed < nSeedsPerBatch);
+        lowBases[nUsed++] = seedLowBases;
+        return nSeedsPerBatch <= nUsed;
+    }
+    //
+    // Add every queued seed to counter, in the order they were queued, and empty the batch.
+    //
+    void apply(ApproximateCounter *counter) {
+        const _uint64 *const end = lowBases + nUsed;
+        for (const _uint64 *queued = lowBases; queued != end; ++queued) {
+            counter->add(*queued);
+        }
+        nUsed = 0;
+    }
+};
+ //
+ // Worker thread routine for the bias table computation. param is this thread's
+ // ComputeBiasTableThreadContext. The thread walks the genome locations in
+ // [genomeChunkStart, genomeChunkEnd), builds a Seed at every location that has one, and feeds the
+ // seed's low bases into the approximate counter selected by the seed's high bases. Before returning
+ // it adds its count of valid seeds to context->validSeeds and, if it is the last running thread
+ // (runningThreadCount drops to 0), signals context->doneObject.
+ //
+ void
+GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param)
+ ComputeBiasTableThreadContext *context = (ComputeBiasTableThreadContext *)param;
+ bool large = context->large;
+ GenomeDistance countOfBases = context->genome->getCountOfBases();
+ _int64 validSeeds = 0;
+ //
+ // Batch the insertions into the approximate counters, because otherwise we spend all of
+ // our time acquiring and releasing locks.
+ //
+ PerCounterBatch *batches = new PerCounterBatch[context->nHashTables];
+ //
+ // Locations skipped since this thread last updated context->nBasesProcessed. They get folded into
+ // the next update so that the progress count covers them too.
+ //
+ _uint64 unrecordedSkippedSeeds = 0;
+ const _uint64 printBatchSize = 100000000;
+ for (GenomeDistance i = context->genomeChunkStart; i < context->genomeChunkEnd; i++) {
+ const char *bases = context->genome->getSubstring(i, context->seedLen);
+ //
+ // Check it for NULL, because Genome won't return strings that cross contig boundaries.
+ //
+ // NOTE(review): unlike the 'N' case below, this skip does not bump unrecordedSkippedSeeds, so these
+ // locations are never added to nBasesProcessed. BuildHashTablesWorkerThread counts both cases.
+ //
+ if (NULL == bases) {
+ continue;
+ }
+ //
+ // We don't build seeds out of sections of the genome that contain 'N.' If this is one, skip it.
+ //
+ if (!Seed::DoesTextRepresentASeed(bases, context->seedLen)) {
+ unrecordedSkippedSeeds++;
+ continue;
+ }
+ Seed seed(bases, context->seedLen);
+ validSeeds++;
+ if (large && seed.isBiggerThanItsReverseComplement()) {
+ //
+ // Figure out if we're using this base or its complement.
+ //
+ seed = ~seed; // Couldn't resist using ~ for this.
+ }
+ unsigned whichHashTable = seed.getHighBases(context->hashTableKeySize);
+ _ASSERT(whichHashTable < context->nHashTables);
+ //
+ // addSeed() returns true when the batch has just become full. In that case apply it to the shared
+ // counter under that counter's lock, then update the shared progress count and print a status line
+ // if the count crossed a multiple of printBatchSize.
+ //
+ if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) {
+ PerCounterBatch *batch = &batches[whichHashTable];
+ AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
+ batch->apply(&(*context->approxCounters)[whichHashTable]);
+ ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
+ _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds);
+ if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds)/printBatchSize) {
+ WriteStatusMessage("Bias computation: %lld / %lld\n",(basesProcessed/printBatchSize)*printBatchSize, (_int64)countOfBases);
+ }
+ unrecordedSkippedSeeds= 0; // We've now recorded them.
+ }
+ }
+ //
+ // Flush the partially filled batches. The progress count is updated before each batch is applied
+ // (batches[i].nUsed is reset to 0 by apply()).
+ //
+ for (unsigned i = 0; i < context->nHashTables; i++) {
+ _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[i].nUsed + unrecordedSkippedSeeds);
+ if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - batches[i].nUsed - unrecordedSkippedSeeds)/printBatchSize) {
+ WriteStatusMessage("Bias computation: %lld / %lld\n",(basesProcessed/printBatchSize)*printBatchSize, (_int64)countOfBases);
+ }
+ unrecordedSkippedSeeds = 0; // All except the first time through the loop this will be 0.
+ AcquireExclusiveLock(&context->approximateCounterLocks[i]);
+ batches[i].apply(&(*context->approxCounters)[i]);
+ ReleaseExclusiveLock(&context->approximateCounterLocks[i]);
+ }
+ delete [] batches;
+ InterlockedAdd64AndReturnNewValue(context->validSeeds, validSeeds);
+ if (0 == InterlockedDecrementAndReturnNewValue(context->runningThreadCount)) {
+ SignalSingleWaiterObject(context->doneObject);
+ }
+//
+// Thread entry point for hash table building.  param is a BuildHashTablesThreadContext; all this
+// does is hand the context to BuildHashTablesWorkerThread() on the index the context names.
+//
+    void
+GenomeIndex::BuildHashTablesWorkerThreadMain(void *param)
+{
+    BuildHashTablesThreadContext *threadContext = static_cast<BuildHashTablesThreadContext *>(param);
+    GenomeIndex *owner = threadContext->index;
+
+    owner->BuildHashTablesWorkerThread(threadContext);
+}
+ //
+ // Per-thread body of hash table construction. Walks the genome locations in
+ // [context->genomeChunkStart, context->genomeChunkEnd), hands every location that yields a seed to
+ // indexSeed(), flushes the leftover batches with completeIndexing(), adds this thread's statistics to
+ // the shared totals in the context, and signals context->doneObject if it was the last running thread.
+ //
+ void
+GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context)
+ GenomeDistance countOfBases = context->genome->getCountOfBases(); // NOTE(review): not used in the visible body
+ const Genome *genome = context->genome;
+ unsigned seedLen = context->seedLen;
+ bool large = context->large;
+ //
+ // Batch the insertions into the hash tables, because otherwise we spend all of
+ // our time acquiring and releasing locks.
+ //
+ PerHashTableBatch *batches = new PerHashTableBatch[nHashTables];
+ IndexBuildStats stats; // Thread-local counters; merged into the context at the end
+ for (GenomeLocation genomeLocation = context->genomeChunkStart; genomeLocation < context->genomeChunkEnd; genomeLocation++) {
+ const char *bases = genome->getSubstring(genomeLocation, seedLen);
+ //
+ // Check it for NULL, because Genome won't return strings that cross contig boundaries.
+ //
+ if (NULL == bases) {
+ stats.noBaseAvailable++;
+ stats.unrecordedSkippedSeeds++;
+ continue;
+ }
+ //
+ // We don't build seeds out of sections of the genome that contain 'N.' If this is one, skip it.
+ //
+ if (!Seed::DoesTextRepresentASeed(bases, seedLen)) {
+ stats.nonSeeds++;
+ stats.unrecordedSkippedSeeds++;
+ continue;
+ }
+ Seed seed(bases, seedLen);
+ indexSeed(genomeLocation, seed, batches, context, &stats, large);
+ } // For each genome base in our area
+ //
+ // Now apply the updates from the batches that were left over
+ //
+ completeIndexing(batches, context, &stats, large);
+ //
+ // Fold this thread's statistics into the shared totals.
+ //
+ InterlockedAdd64AndReturnNewValue(context->noBaseAvailable, stats.noBaseAvailable);
+ InterlockedAdd64AndReturnNewValue(context->nonSeeds, stats.nonSeeds);
+ InterlockedAdd64AndReturnNewValue(context->bothComplementsUsed, stats.bothComplementsUsed);
+ InterlockedAdd64AndReturnNewValue(context->genomeLocationsInOverflowTable, stats.genomeLocationsInOverflowTable);
+ InterlockedAdd64AndReturnNewValue(context->seedsWithMultipleOccurrences, stats.seedsWithMultipleOccurrences);
+ delete [] batches;
+ if (0 == InterlockedDecrementAndReturnNewValue(context->runningThreadCount)) {
+ SignalSingleWaiterObject(context->doneObject);
+ }
+// A progress line is printed each time the shared count of processed bases crosses a multiple of this.
+const _int64 GenomeIndex::printPeriod = 100000000;
+ //
+ // Queue one seed occurrence for insertion into the hash tables. For large tables the seed is replaced
+ // by its reverse complement when seed.isBiggerThanItsReverseComplement() says so. The update goes
+ // into the per-thread batch for the hash table chosen by the seed's high bases; when addSeed() reports
+ // that the batch is full, the whole batch is applied under that table's lock, the shared progress
+ // count is advanced (including stats->unrecordedSkippedSeeds, which is then reset) and the batch is cleared.
+ //
+ void
+GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large)
+ bool usingComplement = large && seed.isBiggerThanItsReverseComplement();
+ if (usingComplement) {
+ seed = ~seed; // Couldn't resist using ~ for this.
+ }
+ unsigned whichHashTable = seed.getHighBases(context->hashTableKeySize);
+ _ASSERT(whichHashTable < nHashTables);
+ if (batches[whichHashTable].addSeed(genomeLocation, seed.getLowBases(context->hashTableKeySize), usingComplement)) {
+ AcquireExclusiveLock(&context->hashTableLocks[whichHashTable]);
+ for (unsigned i = 0; i < batches[whichHashTable].nUsed; i++) {
+ ApplyHashTableUpdate(context, whichHashTable, batches[whichHashTable].entries[i].genomeLocation,
+ batches[whichHashTable].entries[i].lowBases, batches[whichHashTable].entries[i].usingComplement,
+ &stats->bothComplementsUsed, &stats->genomeLocationsInOverflowTable, &stats->seedsWithMultipleOccurrences, large);
+ }
+ ReleaseExclusiveLock(&context->hashTableLocks[whichHashTable]);
+ //
+ // Progress reporting: print only when this update moved the shared count across a printPeriod boundary.
+ //
+ _int64 newNBasesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[whichHashTable].nUsed + stats->unrecordedSkippedSeeds);
+ if ((unsigned)(newNBasesProcessed / printPeriod) > (unsigned)((newNBasesProcessed - batches[whichHashTable].nUsed - stats->unrecordedSkippedSeeds) / printPeriod)) {
+ WriteStatusMessage("Indexing %lld / %lld\n", (newNBasesProcessed / printPeriod) * printPeriod, context->genome->getCountOfBases());
+ }
+ stats->unrecordedSkippedSeeds = 0;
+ batches[whichHashTable].clear();
+ } // If we filled a batch
+//
+// Record in hash table whichHashTable that the seed with low bases lowBases occurs at genomeLocation.
+// indexSeed() and completeIndexing() call this while holding context->hashTableLocks[whichHashTable].
+//
+//  context             - the calling thread's build context (index, genome, location size, overflow state)
+//  whichHashTable      - index into index->hashTables
+//  genomeLocation      - where the seed occurs in the genome
+//  lowBases            - hash table key
+//  usingComplement     - true if the seed was reverse complemented before being keyed; only legal when large
+//  bothComplementsUsed, genomeLocationsInOverflowTable, seedsWithMultipleOccurrences
+//                      - caller-owned statistics counters that are incremented here
+//  large               - true if each hash table entry holds two values (seed and reverse complement)
+//
+// A hash table value is one of: a genome location (less than the count of bases), the "unused" marker
+// InvalidGenomeLocation - 1 (large tables only), or countOfBases plus the index of the head of a chain
+// of overflow backpointers.  If the hash table refuses a new entry this reports the per-table usage and
+// calls soft_exit(1).
+//
+    void
+GenomeIndex::ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement,
+                _int64 *bothComplementsUsed, _int64 *genomeLocationsInOverflowTable, _int64 *seedsWithMultipleOccurrences, bool large)
+{
+    _ASSERT(large || !usingComplement);
+
+    GenomeIndex *index = context->index;
+    GenomeDistance countOfBases = context->genome->getCountOfBases();
+    SNAPHashTable *hashTable = index->hashTables[whichHashTable];
+    unsigned locationSize = context->locationSize;
+
+    unsigned *entry32 = (unsigned *)hashTable->SlowLookup(lowBases); // use SlowLookup because we might have overflowed the table. Cast is OK because valueSize == 4 when we use entry32
+    char *entry64 = (char *)entry32; // Char * because it's variable sized
+
+    if (NULL == entry64) {
+        SNAPHashTable::ValueType newEntry[2]; // We only use [0] if !large, but it doesn't hurt to declare two
+        if (large) {
+            //
+            // We haven't yet seen either this seed or its complement. Make a new hash table
+            // entry.
+            //
+            if (!usingComplement) {
+                newEntry[0] = GenomeLocationAsInt64(genomeLocation);
+                newEntry[1] = GenomeLocationAsInt64(InvalidGenomeLocation) - 1; // Use 0xfffffffe for unused, because we gave 0xffffffff to the hash table package.
+            } else {
+                newEntry[0] = GenomeLocationAsInt64(InvalidGenomeLocation) - 1; // Use 0xfffffffe for unused, because we gave 0xffffffff to the hash table package.
+                newEntry[1] = GenomeLocationAsInt64(genomeLocation);
+            }
+        } else {
+            newEntry[0] = GenomeLocationAsInt64(genomeLocation);
+        }
+
+        _ASSERT(0 != GenomeLocationAsInt64(genomeLocation));
+
+        if (!hashTable->Insert(lowBases, newEntry)) {
+            for (unsigned j = 0; j < index->nHashTables; j++) {
+                WriteErrorMessage("HashTable[%d] has %lld used elements\n",j,(_int64)index->hashTables[j]->GetUsedElementCount());
+            }
+            //
+            // whichHashTable is 64 bits wide, so it has to be printed with a 64 bit conversion.
+            //
+            WriteErrorMessage("IndexBuilder: exceeded size of hash table %lld.\n"
+                "If you're indexing a non-human genome, make sure not to pass the -hg19 option. Otheriwse, use -exact or increase slack with -h.\n",
+                (_int64)whichHashTable);
+            soft_exit(1);
+        }
+    } else {
+        //
+        // This entry already exists in the hash table. It might just be because we've already seen the seed's complement
+        // in which case we update our half of the entry. Otherwise, it's a repeat in the genome, and we need to insert
+        // it in the overflow table.
+        //
+        int entryIndex = usingComplement ? 1 : 0;
+        void *entryPointer = entry64 + locationSize * entryIndex;
+
+        if (locationSize > 4) {
+            entry32 = NULL; // Using this would be bad
+
+            _int64 entryValue = 0;
+            memcpy(&entryValue, entryPointer, locationSize); // Assumes little endian
+
+            if (large && GenomeLocationAsInt64(InvalidGenomeLocation) - 1 == entryValue) {
+                //
+                // Our half of the entry was unused; just fill it in.
+                //
+                _int64 locationAsInt64 = GenomeLocationAsInt64(genomeLocation);
+                memcpy(entryPointer, &locationAsInt64, locationSize); // Assumes little endian
+                (*bothComplementsUsed)++;
+            } else if (entryValue < countOfBases) {
+                //
+                // The entry held a single genome location.  Start an overflow chain holding the old and new
+                // locations and point the entry at it.
+                //
+                (*seedsWithMultipleOccurrences)++;
+                (*genomeLocationsInOverflowTable) += 2;
+
+                _int64 overflowIndex = AddOverflowBackpointer(-1, context, entryValue);
+                overflowIndex = AddOverflowBackpointer(overflowIndex, context, GenomeLocationAsInt64(genomeLocation));
+
+                _int64 updatedEntryValue = overflowIndex + countOfBases;
+                memcpy(entryPointer, &updatedEntryValue, locationSize);
+            } else {
+                //
+                // Stick another entry in the existing overflow bucket.
+                //
+                _int64 overflowIndex = AddOverflowBackpointer(entryValue - countOfBases, context, genomeLocation);
+
+                _int64 updatedEntryValue = overflowIndex + countOfBases;
+                memcpy(entryPointer, &updatedEntryValue, locationSize); // Assumes little endian
+                (*genomeLocationsInOverflowTable)++;
+            } // If the existing entry had the complement empty, needed a new overflow entry or extended an old one
+        } else {
+            entry64 = NULL; // Using this would be bad
+
+            if (large && GenomeLocationAsInt32(InvalidGenomeLocation) - 1 == entry32[entryIndex]) {
+                entry32[entryIndex] = GenomeLocationAsInt32(genomeLocation);
+                (*bothComplementsUsed)++;
+            } else if (entry32[entryIndex] < (unsigned)countOfBases) { // cast OK, because locationSize <= 4
+                (*seedsWithMultipleOccurrences)++;
+                (*genomeLocationsInOverflowTable) += 2;
+
+                _int64 overflowIndex = AddOverflowBackpointer(-1, context, entry32[entryIndex]);
+                overflowIndex = AddOverflowBackpointer(overflowIndex, context, GenomeLocationAsInt64(genomeLocation));
+
+                entry32[entryIndex] = (unsigned)(overflowIndex + countOfBases);
+            } else {
+                //
+                // Stick another entry in the existing overflow bucket.
+                //
+                _int64 overflowIndex = AddOverflowBackpointer(entry32[entryIndex] - countOfBases, context, genomeLocation);
+                entry32[entryIndex] = (unsigned)(overflowIndex + countOfBases);
+                (*genomeLocationsInOverflowTable)++;
+            } // If the existing entry had the complement empty, needed a new overflow entry or extended an old one
+        }
+    } // If new or existing entry.
+}
+ //
+ // Allocate the next overflow backpointer, fill it in with genomeLocation, link it in front of
+ // previousOverflowBackpointer (which callers pass as -1 to start a new chain) and return its index.
+ // (The line carrying the function name is not present in this excerpt; callers refer to it as
+ // AddOverflowBackpointer.)
+ //
+ _int64
+ _int64 previousOverflowBackpointer,
+ BuildHashTablesThreadContext*context,
+ GenomeLocation genomeLocation)
+ // The index is claimed by an interlocked add on the shared counter; the - 1 turns the new value into the slot we own.
+ _int64 overflowBackpointerIndex = InterlockedAdd64AndReturnNewValue(context->nextOverflowBackpointer, 1) - 1;
+ OverflowBackpointer *newBackpointer = context->overflowAnchor->getBackpointer(overflowBackpointerIndex);
+ newBackpointer->nextIndex = previousOverflowBackpointer;
+ newBackpointer->genomeLocation = genomeLocation;
+ //
+ // When per-thread tracking is enabled (lastBackpointerIndexUsedByThread is non-NULL), every 100000
+ // indices record how far this thread has got, take the minimum over all threads, and let the anchor
+ // spill the batches below that index to the spill file. All of this happens under backpointerSpillLock.
+ //
+ if (overflowBackpointerIndex % 100000 == 1 && NULL != context->lastBackpointerIndexUsedByThread) {
+ AcquireExclusiveLock(context->backpointerSpillLock);
+ context->lastBackpointerIndexUsedByThread[context->whichThread] = overflowBackpointerIndex - 1;
+ _int64 trimToIndex = context->lastBackpointerIndexUsedByThread[0];
+ for (unsigned i = 1; i < context->nThreads; i++) {
+ trimToIndex = __min(trimToIndex, context->lastBackpointerIndexUsedByThread[i]);
+ }
+ context->overflowAnchor->trimTo(trimToIndex, context->backpointerSpillFile);
+ ReleaseExclusiveLock(context->backpointerSpillLock);
+ }
+ return overflowBackpointerIndex;
+//
+// qsort comparison routine for unsigned ints that orders them from largest to smallest (SNAP expects
+// them to be backwards due to a historical artifact).  Returns -1 if *first is the larger value, 0 if
+// the two are equal and 1 otherwise.
+//
+    int
+GenomeIndex::BackwardsUnsignedCompare(const void *first, const void *second)
+{
+    const unsigned lhs = *static_cast<const unsigned *>(first);
+    const unsigned rhs = *static_cast<const unsigned *>(second);
+
+    if (lhs == rhs) {
+        return 0;
+    }
+
+    return (lhs > rhs) ? -1 : 1;
+}
+//
+// qsort comparison routine for _int64s that orders them from largest to smallest.  Returns -1 if
+// *first is the larger value, 0 if the two are equal and 1 otherwise.
+//
+    int
+GenomeIndex::BackwardsInt64Compare(const void *first, const void *second)
+{
+    const _int64 lhs = *static_cast<const _int64 *>(first);
+    const _int64 rhs = *static_cast<const _int64 *>(second);
+
+    if (lhs == rhs) {
+        return 0;
+    }
+
+    return (lhs > rhs) ? -1 : 1;
+}
+//
+// Flush whatever is still sitting in this thread's per-hash-table batches once the thread has walked
+// its whole chunk of the genome.  For each hash table in turn: add the batch size (plus any skipped
+// seeds not yet reported) to the shared progress count, print a status line if that crossed a
+// printPeriod boundary, then apply the batch's updates while holding the table's lock.
+//
+    void
+GenomeIndex::completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large)
+{
+    for (unsigned tableIndex = 0; tableIndex < nHashTables; tableIndex++) {
+        PerHashTableBatch *batch = &batches[tableIndex];
+
+        _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batch->nUsed + stats->unrecordedSkippedSeeds);
+
+        const _uint64 periodsAfter = (_uint64)basesProcessed / printPeriod;
+        const _uint64 periodsBefore = ((_uint64)basesProcessed - batch->nUsed - stats->unrecordedSkippedSeeds) / printPeriod;
+        if (periodsAfter > periodsBefore) {
+            WriteStatusMessage("Indexing %lld / %lld\n",(basesProcessed/printPeriod)*printPeriod, context->genome->getCountOfBases());
+        }
+
+        //
+        // The skipped seeds are reported with the first table only; after that there are none left.
+        //
+        stats->unrecordedSkippedSeeds = 0;
+
+        AcquireExclusiveLock(&context->hashTableLocks[tableIndex]);
+        for (unsigned entryIndex = 0; entryIndex < batch->nUsed; entryIndex++) {
+            ApplyHashTableUpdate(
+                context,
+                tableIndex,
+                batch->entries[entryIndex].genomeLocation,
+                batch->entries[entryIndex].lowBases,
+                batch->entries[entryIndex].usingComplement,
+                &stats->bothComplementsUsed,
+                &stats->genomeLocationsInOverflowTable,
+                &stats->seedsWithMultipleOccurrences,
+                large);
+        }
+        ReleaseExclusiveLock(&context->hashTableLocks[tableIndex]);
+    }
+}
+//
+// Set up an anchor able to hold maxOverflowEntries_ backpointers.  Backpointers live in batches of
+// batchSize entries; this only allocates the table of batch pointers (enough batches to cover
+// maxOverflowEntries_, rounding up) and sets them all to NULL.  It also initializes the lock.
+//
+GenomeIndex::OverflowBackpointerAnchor::OverflowBackpointerAnchor(_int64 maxOverflowEntries_) : maxOverflowEntries(maxOverflowEntries_)
+{
+    _ASSERT(maxOverflowEntries > 0);
+
+    // Number of batches needed, counting a trailing partial batch as a whole one.
+    const _int64 nTableSlots = (maxOverflowEntries + batchSize - 1) / batchSize;
+
+    table = new OverflowBackpointer *[nTableSlots];
+    for (unsigned slot = 0; slot < nTableSlots; slot++) {
+        table[slot] = NULL;
+    }
+
+    InitializeExclusiveLock(&lock);
+}
+//
+// Tear down the anchor: free every batch that is still in memory, then the table of batch pointers,
+// then the lock.
+//
+GenomeIndex::OverflowBackpointerAnchor::~OverflowBackpointerAnchor()
+{
+    //
+    // The constructor sizes table[] by rounding maxOverflowEntries up to a whole number of batches.
+    // Walk the same number of slots here, since otherwise a trailing partial batch is never freed.
+    //
+    const _int64 nTableSlots = (maxOverflowEntries + batchSize - 1) / batchSize;
+
+    for (_int64 i = 0; i < nTableSlots; i++) {
+        //
+        // Slots that trimTo() has spilled point at the static spilledTableSlot marker rather than at
+        // memory from BigAlloc, so they must not be handed to BigDealloc.
+        //
+        if (table[i] != NULL && table[i] != &spilledTableSlot) {
+            BigDealloc(table[i]);
+        }
+        table[i] = NULL;
+    }
+
+    delete [] table;
+    table = NULL;
+
+    DestroyExclusiveLock(&lock);
+}
+ //
+ // Return a pointer to the backpointer with the given index, allocating the batch that contains it if
+ // that batch doesn't exist yet. Calls soft_exit(1) if index is at or beyond maxOverflowEntries, or if
+ // the batch has already been spilled to disk by trimTo().
+ //
+ GenomeIndex::OverflowBackpointer *
+GenomeIndex::OverflowBackpointerAnchor::getBackpointer(_int64 index)
+ if (index >= maxOverflowEntries) {
+ WriteErrorMessage("Trying to use too many overflow entries. To index this genome, you either need a larger seed size or a larger location size.\n");
+ soft_exit(1);
+ }
+ _int64 tableSlot = index / batchSize;
+ //
+ // The slot is first tested without the lock and then tested again once the lock is held, so only one
+ // caller allocates a given batch.
+ //
+ if (table[tableSlot] == NULL) {
+ AcquireExclusiveLock(&lock);
+ if (table[tableSlot] == NULL) {
+ OverflowBackpointer *newTableEntry = (OverflowBackpointer *)BigAlloc(batchSize * sizeof(OverflowBackpointer));
+ // Fill the new batch with all-ones values in both fields.
+ for (unsigned i = 0; i < batchSize; i++) {
+ newTableEntry[i].genomeLocation = 0xffffffffffffffff;
+ newTableEntry[i].nextIndex = 0xffffffffffffffff;
+ }
+ //
+ // Don't fill in the table[] pointer until initialization is complete in order to avoid racing with someone writing while we're
+ // initializing.
+ //
+ table[tableSlot] = newTableEntry;
+ }
+ ReleaseExclusiveLock(&lock);
+ } else {
+ // A slot that points at spilledTableSlot has been written out by trimTo() and is no longer in memory.
+ if (&spilledTableSlot == table[tableSlot]) {
+ WriteErrorMessage("Looking up spilled table slot. Something is very wrong. Try not using -sm and contact the developers.\n");
+ soft_exit(1);
+ }
+ }
+ return &table[tableSlot][index % batchSize];
+//
+// Spill to trimFile every in-memory batch whose indices all lie below trimToIndex.  A spilled batch
+// is freed and its table slot is pointed at spilledTableSlot so that it is recognizable later.  Slots
+// that were never allocated or that have already been spilled are left alone.  A short write is
+// reported and followed by soft_exit(1).
+//
+    void
+GenomeIndex::OverflowBackpointerAnchor::trimTo(_int64 trimToIndex, FILE *trimFile)
+{
+    for (_int64 slot = 0; (slot + 1) * batchSize < trimToIndex; slot++) {
+        OverflowBackpointer *batch = table[slot];
+
+        if (NULL == batch || &spilledTableSlot == batch) {
+            continue;   // Nothing in memory to write for this slot
+        }
+
+        const size_t nWritten = fwrite(batch, sizeof(*batch), batchSize, trimFile);
+        if (batchSize != nWritten) {
+            WriteErrorMessage("Failure writing to trim file. Maybe you're out of disk space or encountered some other error. Perhaps try without -sm.\n");
+            soft_exit(1);
+        }
+
+        BigDealloc((void *)batch);
+        table[slot] = &spilledTableSlot;
+    }
+}
+//
+// Bring back into memory every batch that trimTo() spilled.  The file is rewound and then, for each
+// table slot marked as spilled (in increasing slot order), a fresh batch is allocated and filled with
+// the next batchSize entries from the file.  A short read is reported and followed by soft_exit(1).
+//
+    void
+GenomeIndex::OverflowBackpointerAnchor::loadFromFile(FILE *file)
+{
+    rewind(file);
+
+    for (int i = 0; i < maxOverflowEntries / batchSize; i++) {
+        if (table[i] == &spilledTableSlot) {
+            table[i] = (OverflowBackpointer *)BigAlloc(batchSize * sizeof(OverflowBackpointer));
+            if (batchSize != fread(table[i], sizeof(OverflowBackpointer), batchSize, file)) {
+                //
+                // The message used to contain a literal "i" with no conversion, so the batch number was never printed.
+                //
+                WriteErrorMessage("Failed to read overflow table batch %d from spill file\n", i);
+                soft_exit(1);
+            }
+        }
+    }
+}
+// Number of OverflowBackpointers in each separately allocated batch of the anchor's table.
+const unsigned GenomeIndex::OverflowBackpointerAnchor::batchSize = 1024 * 1024;
+// Only the address of this object is used: table slots that have been spilled to disk point at it.
+GenomeIndex::OverflowBackpointer GenomeIndex::OverflowBackpointerAnchor::spilledTableSlot;
+ //
+ // Body of a routine (its signature line is not present in this excerpt) that prints the built-in hg19
+ // bias tables to stdout as C source: one "static double hg19_biasTable<seedSize>_<keySize>[_large][]"
+ // array initializer per non-NULL table, ten values to a line. The first pair of loops handles
+ // hg19_biasTables_large and the second the same thing for hg19_biasTables.
+ //
+ void
+ for (int keySize = 0; keySize <= largestKeySize; keySize++) {
+ for (int seedSize = 0; seedSize <= largestBiasTable; seedSize++) {
+ if (NULL != hg19_biasTables_large[keySize][seedSize]) {
+ printf("static double hg19_biasTable%d_%d_large[] = {\n", seedSize, keySize);
+ // A seed has 2 bits per base; the bits not covered by the key select the hash table.
+ // NOTE(review): the shift below is only well defined when 0 <= bitsOfSeed - bitsOfKey < 32; presumably
+ // tables only exist for such combinations -- confirm.
+ unsigned bitsOfSeed = seedSize * 2;
+ unsigned bitsOfKey = keySize * 8; // 8 == nBits / byte
+ unsigned numHashTables = 1 << (bitsOfSeed - bitsOfKey);
+ for (unsigned hashTable = 0; hashTable < numHashTables; hashTable++) {
+ // Pick the shortest format that suits the magnitude: "0", scientific for very large or very small
+ // values, one decimal above 10, three decimals otherwise.
+ if (hg19_biasTables_large[keySize][seedSize][hashTable] == 0) {
+ printf("0");
+ } else if (hg19_biasTables_large[keySize][seedSize][hashTable] > 1000 || hg19_biasTables_large[keySize][seedSize][hashTable] < .01) {
+ printf("%1.2e", hg19_biasTables_large[keySize][seedSize][hashTable]);
+ } else if (hg19_biasTables_large[keySize][seedSize][hashTable] > 10) {
+ printf("%1.1f", hg19_biasTables_large[keySize][seedSize][hashTable]);
+ } else {
+ printf("%1.3f", hg19_biasTables_large[keySize][seedSize][hashTable]);
+ }
+ if (hashTable != numHashTables-1) {
+ printf(",");
+ }
+ if (hashTable % 10 == 9) {
+ printf("\n");
+ }
+ } // for each hash table
+ printf("\n};\n\n");
+ } // if there is a bias entry for this key * seed size
+ } // for each seed size
+ } // for each key size
+ // Same again for the tables without the _large suffix.
+ for (int keySize = 0; keySize <= largestKeySize; keySize++) {
+ for (int seedSize = 0; seedSize <= largestBiasTable; seedSize++) {
+ if (NULL != hg19_biasTables[keySize][seedSize]) {
+ printf("static double hg19_biasTable%d_%d[] = {\n", seedSize, keySize);
+ unsigned bitsOfSeed = seedSize * 2;
+ unsigned bitsOfKey = keySize * 8; // 8 == nBits / byte
+ unsigned numHashTables = 1 << (bitsOfSeed - bitsOfKey);
+ for (unsigned hashTable = 0; hashTable < numHashTables; hashTable++) {
+ if (hg19_biasTables[keySize][seedSize][hashTable] == 0) {
+ printf("0");
+ } else if (hg19_biasTables[keySize][seedSize][hashTable] > 1000 || hg19_biasTables[keySize][seedSize][hashTable] < .01) {
+ printf("%1.2e", hg19_biasTables[keySize][seedSize][hashTable]);
+ } else if (hg19_biasTables[keySize][seedSize][hashTable] > 10) {
+ printf("%1.1f", hg19_biasTables[keySize][seedSize][hashTable]);
+ } else {
+ printf("%1.3f", hg19_biasTables[keySize][seedSize][hashTable]);
+ }
+ if (hashTable != numHashTables-1) {
+ printf(",");
+ }
+ if (hashTable % 10 == 9) {
+ printf("\n");
+ }
+ } // for each hash table
+ printf("\n};\n\n");
+ } // if there is a bias entry for this key * seed size
+ } // for each seed size
+ } // for each key size
+//
+// Load an index that was previously written to directoryName.
+//
+//  directoryName   - directory holding the GenomeIndex, OverflowTable, GenomeIndexHash and Genome files
+//  map             - if true, the overflow table and hash tables are used through file mappings rather
+//                    than being read into allocated memory
+//  prefetch        - only looked at when map is true; each file is opened and prefetch()ed before it is mapped
+//
+// Returns the loaded index, or NULL (after printing a message) if the index header can't be opened or
+// parsed, a file has the wrong size, a hash table fails to load or has the wrong value count, or the
+// genome fails to load.  Some other failures call soft_exit(1) instead of returning.
+//
+    GenomeIndex *
+GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch)
+{
+    const unsigned filenameBufferSize = MAX_PATH+1;
+    char filenameBuffer[filenameBufferSize];
+
+    //
+    // Read and parse the small text header file.
+    //
+    snprintf(filenameBuffer,filenameBufferSize,"%s%cGenomeIndex",directoryName,PATH_SEP);
+
+    GenericFile *indexFile = GenericFile::open(filenameBuffer, GenericFile::ReadOnly);
+    if (NULL == indexFile) {
+        WriteErrorMessage("Unable to open file '%s' for read.\n",filenameBuffer);
+        return NULL;
+    }
+
+    char indexFileBuf[1000];
+    size_t indexFileSize = indexFile->read(indexFileBuf, sizeof(indexFileBuf) - 1);
+    indexFileBuf[indexFileSize] = 0;
+
+    unsigned seedLen;
+    unsigned majorVersion, minorVersion, chromosomePadding;
+    int nRead;
+    size_t hashTablesFileSize;
+    unsigned nHashTables;
+    _int64 overflowTableSize;
+    unsigned hashTableKeySize;
+    unsigned smallHashTable;
+    unsigned locationSize;
+
+    //
+    // NOTE(review): hashTablesFileSize is a size_t but is scanned with %lld, which relies on size_t
+    // being 64 bits wide.
+    //
+    if (10 != (nRead = sscanf(indexFileBuf,"%d %d %d %lld %d %d %d %lld %d %d", &majorVersion, &minorVersion, &nHashTables, &overflowTableSize, &seedLen, &chromosomePadding,
+                                            &hashTableKeySize, &hashTablesFileSize, &smallHashTable, &locationSize))) {
+        if (3 == nRead || 6 == nRead || 7 == nRead || 9 == nRead) {
+            WriteErrorMessage("Indices built by versions before 1.0dev.21 are no longer supported. Please rebuild your index.\n");
+        } else {
+            WriteErrorMessage("GenomeIndex::LoadFromDirectory: didn't read initial values\n");
+        }
+        indexFile->close();
+        delete indexFile;
+        return NULL;
+    }
+    indexFile->close();
+    delete indexFile;
+
+    if (majorVersion != GenomeIndexFormatMajorVersion) {
+        WriteErrorMessage("This genome index appears to be from a different version of SNAP than this, and so we can't read it. Index version %d, SNAP index format version %d\n",
+            majorVersion, GenomeIndexFormatMajorVersion);
+        soft_exit(1);
+    }
+
+    if (0 == seedLen) {
+        WriteErrorMessage("GenomeIndex::LoadFromDirectory: saw seed size of 0.\n");
+        return NULL;
+    }
+
+    SetInvalidGenomeLocation(locationSize);
+
+    GenomeIndex *index;
+    index = new GenomeIndex();
+    index->nHashTables = nHashTables;
+    index->overflowTableSize = overflowTableSize;
+    index->hashTableKeySize = hashTableKeySize;
+    index->seedLen = seedLen;
+    index->locationSize = locationSize;
+    index->largeHashTable = !smallHashTable;
+
+    //
+    // Load (or map) the overflow table.  Its entries are 64 bits wide when locations need more than 4
+    // bytes and 32 bits wide otherwise.
+    //
+    unsigned overflowEntrySize = (locationSize > 4) ? sizeof(*index->overflowTable64) : sizeof(*index->overflowTable32);
+    size_t overflowTableSizeInBytes = (size_t)index->overflowTableSize * overflowEntrySize;
+
+    snprintf(filenameBuffer,filenameBufferSize, "%s%cOverflowTable", directoryName, PATH_SEP);
+
+    if (map) {
+        if (prefetch) {
+            GenericFile *overflowTableFile = GenericFile::open(filenameBuffer, GenericFile::ReadOnly);
+            if (NULL == overflowTableFile) {
+                WriteErrorMessage("Unable to open file '%s'\n", filenameBuffer);
+                soft_exit(1);
+            }
+            overflowTableFile->prefetch();
+            overflowTableFile->close();
+            delete overflowTableFile;
+        }
+
+        index->mappedOverflowTable = GenericFile_map::open(filenameBuffer);
+        if (NULL == index->mappedOverflowTable) {
+            WriteErrorMessage("Unable to open file '%s'\n", filenameBuffer);
+            soft_exit(1);
+        }
+
+        size_t bytesMapped;
+        if (locationSize > 4) {
+            index->overflowTable64 = (_int64 *)index->mappedOverflowTable->mapAndAdvance(overflowTableSizeInBytes, &bytesMapped);
+        } else {
+            index->overflowTable32 = (unsigned *)index->mappedOverflowTable->mapAndAdvance(overflowTableSizeInBytes, &bytesMapped);
+        }
+
+        if (bytesMapped != overflowTableSizeInBytes) {
+            WriteErrorMessage("read (via mapping) only %lld bytes of '%s', expected %lld\n", bytesMapped, filenameBuffer, overflowTableSizeInBytes);
+            soft_exit(1);
+        }
+
+        index->mappedOverflowTable->prefetch(); // NB: This is different than the -pre prefetch. This one maps the whole thing (and reads it sequentially in case you didn't use -pre)
+    } else {
+        char *tableAsCharStar;
+        if (locationSize > 4) {
+            index->overflowTable64 = (_int64 *)BigAlloc(overflowTableSizeInBytes);
+            tableAsCharStar = (char *)index->overflowTable64;
+            _ASSERT(NULL == index->overflowTable32);
+        } else {
+            index->overflowTable32 = (unsigned *)BigAlloc(overflowTableSizeInBytes);
+            tableAsCharStar = (char *)index->overflowTable32;
+            _ASSERT(NULL == index->overflowTable64);
+        }
+
+        GenericFile *fOverflowTable = GenericFile::open(filenameBuffer, GenericFile::ReadOnly);
+        if (NULL == fOverflowTable) {
+            WriteErrorMessage("Unable to open overflow table file, '%s', %d\n", filenameBuffer, errno);
+            delete index;
+            return NULL;
+        }
+
+        size_t amountRead = fOverflowTable->read(tableAsCharStar, overflowTableSizeInBytes);
+        if (amountRead != overflowTableSizeInBytes) {
+            WriteErrorMessage("Error reading overflow table, %lld != %lld bytes read.\n", amountRead, overflowTableSizeInBytes);
+            soft_exit(1);
+        }
+
+        fOverflowTable->close();
+        delete fOverflowTable;
+        fOverflowTable = NULL;
+    }
+
+    //
+    // Load (or map) the hash tables.  They are all stored back to back in a single file.
+    //
+    index->hashTables = new SNAPHashTable*[index->nHashTables];
+    for (unsigned i = 0; i < index->nHashTables; i++) {
+        index->hashTables[i] = NULL; // We need to do this so the destructor doesn't crash if loading a hash table fails.
+    }
+
+    snprintf(filenameBuffer, filenameBufferSize, "%s%cGenomeIndexHash", directoryName, PATH_SEP);
+
+    GenericFile_Blob *blobFile = NULL;
+    GenericFile *tablesFile = NULL;
+
+    if (map) {
+        if (prefetch) {
+            GenericFile *hashTableFile = GenericFile::open(filenameBuffer, GenericFile::ReadOnly);
+            if (NULL == hashTableFile) {
+                WriteErrorMessage("Unable to open genome hash table file '%s'\n", filenameBuffer);
+                soft_exit(1);
+            }
+            hashTableFile->prefetch();
+            hashTableFile->close();
+            delete hashTableFile;
+        }
+
+        if (QueryFileSize(filenameBuffer) != hashTablesFileSize) {
+            WriteErrorMessage("File '%s' had unexpected size, %lld != %lld\n", filenameBuffer, QueryFileSize(filenameBuffer), hashTablesFileSize);
+            delete index;
+            return NULL;
+        }
+
+        index->mappedTables = GenericFile_map::open(filenameBuffer);
+        if (NULL == index->mappedTables) {
+            //
+            // Handle a failed mapping the same way as for the overflow table above rather than dereferencing NULL.
+            //
+            WriteErrorMessage("Unable to open genome hash table file '%s'\n", filenameBuffer);
+            soft_exit(1);
+        }
+        index->mappedTables->prefetch();
+        blobFile = index->mappedTables;
+        index->tablesBlob = NULL;
+    } else {
+        tablesFile = GenericFile::open(filenameBuffer, GenericFile::ReadOnly);
+        if (NULL == tablesFile) {
+            WriteErrorMessage("Unable to open genome hash table file '%s'\n", filenameBuffer);
+            soft_exit(1);
+        }
+
+        index->tablesBlob = BigAlloc(hashTablesFileSize);
+        size_t amountRead = tablesFile->read(index->tablesBlob, hashTablesFileSize);
+        if (amountRead != hashTablesFileSize) {
+            WriteErrorMessage("Read incorrect amount for GenomeIndexHash file, %lld != %lld\n", hashTablesFileSize, amountRead);
+            tablesFile->close();    // Don't leak the file on the way out
+            delete tablesFile;
+            delete index;
+            return NULL;
+        }
+
+        blobFile = GenericFile_Blob::open(index->tablesBlob, hashTablesFileSize);
+    }
+
+    for (unsigned i = 0; i < index->nHashTables; i++) {
+        if (NULL == (index->hashTables[i] = SNAPHashTable::loadFromBlob(blobFile))) {
+            WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load hash table %d\n",i);
+            if (!map) {
+                //
+                // In the non-mapped case these two objects belong to this function, not to the index.
+                //
+                tablesFile->close();
+                delete tablesFile;
+                blobFile->close();
+                delete blobFile;
+            }
+            delete index;
+            return NULL;
+        }
+
+        unsigned expectedValueCount;
+        if (smallHashTable) {
+            expectedValueCount = 1;
+        } else {
+            expectedValueCount = 2;
+        }
+
+        if (index->hashTables[i]->GetValueCount() != expectedValueCount) {
+            WriteErrorMessage("Expected loaded hash table to have value count of %d, but it had %d. Index corrupt\n", expectedValueCount, index->hashTables[i]->GetValueCount());
+            if (!map) {
+                tablesFile->close();
+                delete tablesFile;
+                blobFile->close();
+                delete blobFile;
+            }
+            delete index;
+            return NULL;
+        }
+    }
+
+    if (!map) {
+        tablesFile->close();
+        delete tablesFile;
+        tablesFile = NULL;
+
+        blobFile->close();
+        delete blobFile;
+        blobFile = NULL;
+    }
+
+    //
+    // Finally, the genome itself.
+    //
+    snprintf(filenameBuffer,filenameBufferSize,"%s%cGenome",directoryName,PATH_SEP);
+    if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) {
+        WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n");
+        delete index;
+        return NULL;
+    }
+
+    if ((_int64)index->genome->getCountOfBases() + (_int64)index->overflowTableSize > 0xfffffff0 && locationSize == 4) {
+        WriteErrorMessage("\nThis index has too many overflow entries to be valid. Some early versions of SNAP\n"
+                        "allowed building indices with too small of a seed size, and this appears to be such\n"
+                        "an index. You can no longer build indices like this, and you also can't use them\n"
+                        "because they are corrupt and would produce incorrect results. Please use an index\n"
+                        "built with a larger seed size. For hg19, the seed size must be at least 19.\n"
+                        "For other reference genomes this quantity will vary.\n");
+        soft_exit(1);
+    }
+
+    return index;
+}
+ //
+ // Look up a seed and its reverse complement in an index with 4 byte locations, returning the number
+ // of hits and a pointer to the hits for each. (The line carrying the function name is not present in
+ // this excerpt; the parameter list matches the lookupSeed32 declaration in GenomeIndex.h.)
+ // When a count comes back as 0 the corresponding hits pointer is not written.
+ //
+ void
+ Seed seed,
+ _int64 *nHits,
+ const unsigned **hits,
+ _int64 *nRCHits,
+ const unsigned **rcHits)
+ _ASSERT(locationSize == 4); // This is the caller's responsibility to check.
+ if (largeHashTable) {
+ //
+ // Large tables store a seed and its reverse complement in one entry, keyed by whichever of the two
+ // is not "bigger"; so one lookup answers both directions.
+ //
+ bool lookedUpComplement;
+ lookedUpComplement = seed.isBiggerThanItsReverseComplement();
+ if (lookedUpComplement) {
+ seed = ~seed;
+ }
+ _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
+ _uint64 lowBases = seed.getLowBases(hashTableKeySize);
+ _ASSERT(hashTables[seed.getHighBases(hashTableKeySize)]->GetValueSizeInBytes() == 4);
+ const unsigned *entry = (const unsigned *)hashTables[seed.getHighBases(hashTableKeySize)]->GetFirstValueForKey(lowBases); // Cast OK because valueSize == 4
+ if (NULL == entry) {
+ *nHits = 0;
+ *nRCHits = 0;
+ return;
+ }
+ //
+ // Fill in the caller's answers for the main and complement of the seed looked up.
+ // Because of our hash table design, we may have had to take the complement before the
+ // lookup, in which case we reverse the results so the caller gets the right thing.
+ // Also, if the seed is its own reverse complement, we need to fill the same hits
+ // in both return arrays.
+ //
+ fillInLookedUpResults32((lookedUpComplement ? entry + 1 : entry), nHits, hits);
+ if (seed.isOwnReverseComplement()) {
+ *nRCHits = *nHits;
+ *rcHits = *hits;
+ } else {
+ fillInLookedUpResults32((lookedUpComplement ? entry : entry + 1), nRCHits, rcHits);
+ }
+ } else {
+ //
+ // Small tables hold one value per entry, so look up the seed and then (after complementing it at the
+ // bottom of the loop) its reverse complement separately.
+ //
+ for (int dir = 0; dir < NUM_DIRECTIONS; dir++) {
+ _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
+ _uint64 lowBases = seed.getLowBases(hashTableKeySize);
+ _ASSERT(hashTables[seed.getHighBases(hashTableKeySize)]->GetValueSizeInBytes() == 4);
+ unsigned *entry = (unsigned int *)hashTables[seed.getHighBases(hashTableKeySize)]->GetFirstValueForKey(lowBases); // Cast OK because valueSize == 4
+ if (NULL == entry) {
+ if (FORWARD == dir) {
+ *nHits = 0;
+ } else {
+ *nRCHits = 0;
+ }
+ } else if (FORWARD == dir) {
+ fillInLookedUpResults32(entry, nHits, hits);
+ } else {
+ fillInLookedUpResults32(entry, nRCHits, rcHits);
+ }
+ seed = ~seed;
+ } // For each direction
+ }
+ //
+ // Turn one 32 bit hash table value (pointed to by subEntry) into a hit count and a pointer to the hits.
+ // (The line carrying the function name is not present in this excerpt; callers refer to it as
+ // fillInLookedUpResults32.) The value is either a genome location, the 0xfffffffe "unused" marker,
+ // or the count of bases plus an offset into overflowTable32. In the unused case *hits is not written.
+ //
+ void
+ const unsigned *subEntry,
+ _int64 *nHits,
+ const unsigned **hits)
+ //
+ // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at
+ // hits[-1]. It doesn't care about the value, but it must not be a bogus pointer. This
+ // is true with the current layout (where it will either be the hit count, the key or
+ // forward pointer in the hash table entry or some intermediate hit in the case where the
+ // search is constrained by minLocation/maxLocation). If you change this, be sure to look
+ // at the code and fix it.
+ //
+ if (*subEntry < genome->getCountOfBases()) {
+ //
+ // It's a singleton.
+ //
+ *nHits = 1;
+ *hits = subEntry;
+ } else if (*subEntry == 0xfffffffe) {
+ //
+ // It's unused, the other complement must exist.
+ //
+ _ASSERT(largeHashTable);
+ *nHits = 0;
+ } else {
+ //
+ // Multiple hits. Recall that the overflow table format is first a count of
+ // the number of hits for that seed, followed by the list of hits.
+ //
+ unsigned overflowTableOffset = *subEntry - (unsigned)genome->getCountOfBases();
+ _ASSERT(overflowTableOffset < overflowTableSize);
+ int hitCount = overflowTable32[overflowTableOffset];
+ _ASSERT(hitCount >= 2);
+ _ASSERT(hitCount + overflowTableOffset < overflowTableSize);
+ *nHits = hitCount;
+ *hits = &overflowTable32[overflowTableOffset + 1];
+ }
+ //
+ // Look up a seed and its reverse complement in an index whose locations take more than 4 (and at most
+ // 8) bytes, returning the number of hits and a pointer to the hits for each. (The line carrying the
+ // function name is not present in this excerpt; the parameter list matches the lookupSeed declaration
+ // in GenomeIndex.h.) singleHit and singleRCHit are caller-supplied storage that is handed on to
+ // fillInLookedUpResults, which uses it to hold a lone hit. When a count comes back as 0 the
+ // corresponding hits pointer is not written.
+ //
+ void
+ Seed seed,
+ _int64 * nHits,
+ const GenomeLocation ** hits,
+ _int64 * nRCHits,
+ const GenomeLocation ** rcHits,
+ GenomeLocation * singleHit,
+ GenomeLocation * singleRCHit)
+ _ASSERT(locationSize > 4 && locationSize <= 8);
+ if (largeHashTable) {
+ bool lookedUpComplement;
+ lookedUpComplement = seed.isBiggerThanItsReverseComplement();
+ if (lookedUpComplement) {
+ seed = ~seed;
+ }
+ _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
+ _uint64 lowBases = seed.getLowBases(hashTableKeySize);
+ _ASSERT(hashTables[seed.getHighBases(hashTableKeySize)]->GetValueSizeInBytes() > 4);
+ const char *entry = (char *)hashTables[seed.getHighBases(hashTableKeySize)]->GetFirstValueForKey(lowBases);
+ if (NULL == entry) {
+ *nHits = 0;
+ *nRCHits = 0;
+ return;
+ }
+ //
+ // The two values in the entry are locationSize bytes each, so copy them out into zeroed, full-width
+ // GenomeLocations before using them.
+ //
+ GenomeLocation entryByValue[NUM_DIRECTIONS];
+ entryByValue[0] = 0;
+ entryByValue[1] = 0;
+ memcpy(&entryByValue[0], entry, locationSize); // Works because we're little-endian
+ memcpy(&entryByValue[1], entry + locationSize, locationSize); // Again, requires little-endianness.
+ //
+ // Fill in the caller's answers for the main and complement of the seed looked up.
+ // Because of our hash table design, we may have had to take the complement before the
+ // lookup, in which case we reverse the results so the caller gets the right thing.
+ // Also, if the seed is its own reverse complement, we need to fill the same hits
+ // in both return arrays.
+ //
+ fillInLookedUpResults(entryByValue[lookedUpComplement ? 1 : 0], nHits, hits, singleHit);
+ if (seed.isOwnReverseComplement()) {
+ *nRCHits = *nHits;
+ *rcHits = *hits;
+ } else {
+ fillInLookedUpResults(entryByValue[lookedUpComplement ? 0 : 1], nRCHits, rcHits, singleRCHit);
+ }
+ } else {
+ //
+ // One value per entry: look up the seed, complement it, and look up again for the other direction.
+ //
+ for (int dir = 0; dir < NUM_DIRECTIONS; dir++) {
+ _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
+ _uint64 lowBases = seed.getLowBases(hashTableKeySize);
+ _ASSERT(hashTables[seed.getHighBases(hashTableKeySize)]->GetValueSizeInBytes() > 4);
+ const char *entry = (char *)hashTables[seed.getHighBases(hashTableKeySize)]->GetFirstValueForKey(lowBases);
+ if (NULL == entry) {
+ if (FORWARD == dir) {
+ *nHits = 0;
+ } else {
+ *nRCHits = 0;
+ }
+ } else {
+ GenomeLocation entryByValue = 0;
+ memcpy(&entryByValue, entry, locationSize); // Assumes little endian
+ if (FORWARD == dir) {
+ fillInLookedUpResults(entryByValue, nHits, hits, singleHit);
+ } else {
+ fillInLookedUpResults(entryByValue, nRCHits, rcHits, singleRCHit);
+ }
+ }
+ seed = ~seed;
+ } // For each direction
+ }
+//
+// Turn one hash table value (already widened to a GenomeLocation) into a hit count and a pointer to
+// the hits.  The value is one of:
+//   - a genome location: one hit, which is stored in *singleHitLocation and pointed to by *hits;
+//   - InvalidGenomeLocation - 1: this half of the entry is unused, so there are no hits and *hits is
+//     not written;
+//   - anything else: the count of bases plus an offset into overflowTable64, where a hit count is
+//     followed by that many hits.
+//
+    void
+GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation)
+{
+    //
+    // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at
+    // hits[-1]. It doesn't care about the value, but it must not be a bogus pointer. This
+    // is true with the current layout (where it will either be the hit count, the key or
+    // forward pointer in the hash table entry or some intermediate hit in the case where the
+    // search is constrained by minLocation/maxLocation). If you change this, be sure to look
+    // at the code and fix it. You don't need to worry about this in the case of singleHitLocation,
+    // that's the caller's problem.
+    //
+
+    if (lookedUpLocation < genome->getCountOfBases()) {
+        // A single hit; hand back the caller's own storage.
+        *nHits = 1;
+        *hits = singleHitLocation;
+        *singleHitLocation = lookedUpLocation;
+        return;
+    }
+
+    if (lookedUpLocation == InvalidGenomeLocation - 1) {
+        // Unused half of a large hash table entry; the other complement must exist.
+        _ASSERT(largeHashTable);
+        *nHits = 0;
+        return;
+    }
+
+    // Several hits, stored in the overflow table as a count followed by the locations.
+    _int64 offsetInOverflowTable = GenomeLocationAsInt64(lookedUpLocation) - genome->getCountOfBases();
+    _ASSERT(offsetInOverflowTable < (_int64)overflowTableSize);
+
+    _int64 nOverflowHits = overflowTable64[offsetInOverflowTable];
+    _ASSERT(nOverflowHits >= 2);
+    _ASSERT(nOverflowHits + offsetInOverflowTable < (_int64)overflowTableSize);
+
+    *nHits = nOverflowHits;
+    *hits = (const GenomeLocation *)&overflowTable64[offsetInOverflowTable + 1];
+}
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
new file mode 100644
index 0000000..2165461
--- /dev/null
+++ b/SNAPLib/GenomeIndex.h
@@ -0,0 +1,299 @@
+Module Name:
+ GenomeIndex.h
+ Headers for the index builder for the SNAP sequencer
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#include "HashTable.h"
+#include "Seed.h"
+#include "Genome.h"
+#include "ApproximateCounter.h"
+#include "GenericFile_map.h"
+class GenomeIndex {
+ const Genome *getGenome() {return genome;}
+ //
+ // This looks up a seed and its reverse complement, and returns the number and list of hits for each.
+ // It guarantees that if the lookup succeeds that hits[-1] and rcHits[-1] are valid memory with
+ // arbitrary values. The -32 version is used for indices with 32 bit genome offsets; using the version
+ // that doesn't match the genome index GenomeLocation size is an error. Check the index type with
+ // doesGenomeIndexHave64BitLocations();
+ //
+ // The 64 bit version requires the caller to supply a special location for storing a single forward or
+ // reverse hit. This is because in the hash table (but not overflow table), these may be stored in
+ // 5-7 bytes in order to save space. This means that there's no address in the hash table that can
+ // be pointed to as a return value. When only a single hit is returned, *hits == singleHit, so there's
+ // no need to check on the caller's side.
+ //
+ void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit);
+ void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits);
+ bool doesGenomeIndexHave64BitLocations() const {return locationSize > 4;}
+ //
+ // Looks up a seed and its reverse complement, restricting the search to a given range of locations,
+ // and returns the number and list of hits for each.
+ //
+// virtual void lookupSeed(Seed seed, unsigned minLocation, unsigned maxLocation,
+// unsigned *nHits, const unsigned **hits, unsigned *nRCHits, const unsigned **rcHits) = 0;
+ //
+ // This issues a compiler prefetch for the genome data.
+ //
+ inline void prefetchGenomeData(GenomeLocation genomeLocation) const {
+ genome->prefetchData(genomeLocation);
+ }
+ inline int getSeedLength() const { return seedLen; }
+ virtual ~GenomeIndex();
+ //
+ // run the indexer from command line arguments
+ //
+ static void runIndexer(int argc, const char **argv);
+ static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch);
+ static void printBiasTables();
+ int seedLen;
+ unsigned hashTableKeySize;
+ unsigned nHashTables;
+ const Genome *genome;
+ bool largeHashTable;
+ unsigned locationSize;
+ //
+ // The overflow table is indexed by numbers greater than the number of bases in the genome.
+ // The hash table(s) point into the overflow table when they have a seed that's got more
+ // than one instance in the genome. For locationSize <= 4, the table is made of 32
+ // bit entries (and pointed to by overflowTable32), otherwise it's 64 bit entries.
+ //
+ _uint64 overflowTableSize;
+ unsigned *overflowTable32;
+ _int64 *overflowTable64;
+ GenericFile_map *mappedOverflowTable;
+ void *tablesBlob; // All of the hash tables in one giant blob
+ GenericFile_map *mappedTables;
+ //
+ // We have to build the overflow table in two stages. While we're walking the genome, we first
+ // assign tentative overflow table locations, and build up a list of places where each repeat
+ // occurs. Once we've read the whole thing (and so know the exact number of instances of each
+ // repeated seed) we build the actual overflow table and go back and update the entries in the
+ // hash table.
+ //
+ // The list of repeats works as a singly linked list, headed by the hash table entry. The entries
+ // use the index in the overflow table as links, rather than using real pointers, in order to save
+ // space. So that we can dynamically allocate overflow entries while still using indices to
+ // find them, they're built in a two level table.
+ //
+ // One node of the singly linked list of repeats described in the comment above.
+ struct OverflowBackpointer {
+ _int64 nextIndex; // Link to the next list entry, stored as an index rather than a real pointer to save space
+ GenomeLocation genomeLocation; // NOTE(review): presumably the genome location of this occurrence of the repeated seed -- confirm against AddOverflowBackpointer
+ };
+ class OverflowBackpointerAnchor {
+ public:
+ OverflowBackpointerAnchor(_int64 maxOverflowEntries_);
+ ~OverflowBackpointerAnchor();
+ OverflowBackpointer *getBackpointer(_int64 index);
+ void trimTo(_int64 trimToIndex, FILE *trimFile);
+ void loadFromFile(FILE *tripFile);
+ private:
+ ExclusiveLock lock;
+ static const unsigned batchSize;
+ _int64 maxOverflowEntries;
+ OverflowBackpointer **table;
+ static OverflowBackpointer spilledTableSlot; // This value is used to indicate that the table slot in question has been spilled
+ };
+ //
+ // Build a genome index and write it to a directory. If you don't already have a saved index
+ // the only way to get one is to build it into a directory and then load it from the directory.
+ // NB: This deletes the Genome that's passed into it.
+ //
+ static bool BuildIndexToDirectory(const Genome *genome, int seedLen, double slack,
+ bool computeBias, const char *directory,
+ unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact,
+ unsigned hashTableKeySize, bool large, const char *histogramFileName,
+ unsigned locationSize, bool smallMemory);
+ //
+ // Allocate set of hash tables indexed by seeds with bias
+ //
+ static SNAPHashTable** allocateHashTables(unsigned* o_nTables, GenomeDistance countOfBases, double slack,
+ int seedLen, unsigned hashTableKeySize, bool large, unsigned locationSize, double* biasTable = NULL);
+ static const unsigned GenomeIndexFormatMajorVersion = 5;
+ static const unsigned GenomeIndexFormatMinorVersion = 0;
+ static const unsigned largestBiasTable = 32; // Can't be bigger than the biggest seed size, which is set in Seed.h. Bigger than 32 means a new Seed structure.
+ static const unsigned largestKeySize = 8;
+ static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1];
+ static double *hg19_biasTables_large[largestKeySize+1][largestBiasTable+1];
+ static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large);
+ struct ComputeBiasTableThreadContext {
+ SingleWaiterObject *doneObject;
+ volatile int *runningThreadCount;
+ GenomeDistance genomeChunkStart;
+ GenomeDistance genomeChunkEnd;
+ unsigned nHashTables;
+ unsigned hashTableKeySize;
+ std::vector<ApproximateCounter> *approxCounters;
+ const Genome *genome;
+ volatile _int64 *nBasesProcessed;
+ unsigned seedLen;
+ volatile _int64 *validSeeds;
+ bool large;
+ ExclusiveLock *approximateCounterLocks;
+ };
+ static void ComputeBiasTableWorkerThreadMain(void *param);
+ struct OverflowBackpointer;
+ struct BuildHashTablesThreadContext {
+ unsigned nThreads;
+ unsigned whichThread;
+ SingleWaiterObject *doneObject;
+ volatile int *runningThreadCount;
+ GenomeLocation genomeChunkStart;
+ GenomeLocation genomeChunkEnd;
+ const Genome *genome;
+ volatile _int64 *nBasesProcessed;
+ unsigned seedLen;
+ volatile _int64 *noBaseAvailable;
+ volatile _int64 *nonSeeds;
+ volatile _int64 *seedsWithMultipleOccurrences;
+ volatile _int64 *bothComplementsUsed;
+ GenomeIndex *index;
+ OverflowBackpointerAnchor *overflowAnchor;
+ volatile _int64 *nextOverflowBackpointer;
+ volatile _int64 *genomeLocationsInOverflowTable;
+ unsigned hashTableKeySize;
+ bool large;
+ unsigned locationSize;
+ //
+ // The "small memory" option causes SNAP to write out the backpointer table as it's
+ // built in order to save the memory it uses, because it's accessed sequentially as
+ // the table is built. In order to be able to tell when it's safe to free some of the
+ // table, each thread occasionally records the largest backpointer index that it's done
+ // with. It's safe to free table chunks that are less than the least of these. The lock
+ // is used to keep more than one thread from trying to spill table at a time.
+ //
+ _int64 *lastBackpointerIndexUsedByThread;
+ ExclusiveLock *backpointerSpillLock;
+ FILE *backpointerSpillFile;
+ ExclusiveLock *hashTableLocks;
+ ExclusiveLock *overflowTableLock;
+ };
+ //
+ // A fixed-capacity batch of seeds. Seeds are appended with addSeed() until it reports
+ // that the batch is full; clear() resets the batch so that it can be reused.
+ //
+ struct PerHashTableBatch {
+     static const unsigned nSeedsPerBatch = 1000;
+
+     PerHashTableBatch() : nUsed(0) {}
+
+     // Number of valid entries at the front of entries[].
+     unsigned nUsed;
+
+     struct Entry {
+         bool usingComplement;
+         _uint64 lowBases;
+         GenomeLocation genomeLocation;
+     };
+     Entry entries[nSeedsPerBatch];
+
+     //
+     // Appends one seed to the batch. The batch must not already be full. Returns true
+     // once the batch has reached capacity after this addition.
+     //
+     bool addSeed(GenomeLocation genomeLocation, _uint64 seedLowBases, bool seedUsingComplement) {
+         _ASSERT(nUsed < nSeedsPerBatch);
+         Entry &slot = entries[nUsed];
+         slot.genomeLocation = genomeLocation;
+         slot.usingComplement = seedUsingComplement;
+         slot.lowBases = seedLowBases;
+         ++nUsed;
+         return !(nUsed < nSeedsPerBatch);
+     }
+
+     // Forgets every entry in the batch; the storage itself is left untouched.
+     void clear() { nUsed = 0; }
+ };
+ //
+ // Counters gathered while building an index. Every counter starts at zero.
+ //
+ struct IndexBuildStats {
+     // The initializers are listed in member declaration order, which is the order in which
+     // C++ actually runs them; this keeps the list honest and avoids -Wreorder warnings.
+     IndexBuildStats() : noBaseAvailable(0), nonSeeds(0), bothComplementsUsed(0), genomeLocationsInOverflowTable(0),
+         seedsWithMultipleOccurrences(0), unrecordedSkippedSeeds(0) {}
+     _int64 noBaseAvailable;
+     _int64 nonSeeds;
+     _int64 bothComplementsUsed;
+     _int64 genomeLocationsInOverflowTable;
+     _int64 seedsWithMultipleOccurrences;
+     _uint64 unrecordedSkippedSeeds;
+ };
+ static const _int64 printPeriod;
+ virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
+ virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
+ static void BuildHashTablesWorkerThreadMain(void *param);
+ void BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context);
+ static void ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement,
+ _int64 *bothComplementsUsed, _int64 *genomeLocationsInOverflowTable, _int64 *seedsWithMultipleOccurrences, bool large);
+ static int BackwardsUnsignedCompare(const void *, const void *);
+ static int BackwardsInt64Compare(const void *, const void *);
+ GenomeIndex();
+ SNAPHashTable **hashTables;
+ static _int64 AddOverflowBackpointer(
+ _int64 previousOverflowBackpointer,
+ BuildHashTablesThreadContext*context,
+ GenomeLocation genomeLocation);
+ void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits);
+ void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation);
diff --git a/SNAPLib/GzipDataWriter.cpp b/SNAPLib/GzipDataWriter.cpp
new file mode 100644
index 0000000..cfc2767
--- /dev/null
+++ b/SNAPLib/GzipDataWriter.cpp
@@ -0,0 +1,439 @@
+Module Name:
+ GzipDataWriter.cpp
+ File writer that compresses data into gzip format.
+ User mode service.
+ Not thread safe.
+#include "stdafx.h"
+#include "GzipDataWriter.h"
+#include "BigAlloc.h"
+#include "VariableSizeVector.h"
+#include "ParallelTask.h"
+#include "RangeSplitter.h"
+#include "Bam.h"
+#include "zlib.h"
+#include "exit.h"
+#include "Error.h"
+using std::min;
+using std::max;
+using std::pair;
+class GzipWriterFilterSupplier;
+class GzipCompressWorkerManager : public ParallelWorkerManager
+ GzipCompressWorkerManager(GzipWriterFilterSupplier* i_filterSupplier)
+ : filterSupplier(i_filterSupplier), buffer(NULL),
+ chunkSize(i_filterSupplier->chunkSize), bam(i_filterSupplier->bamFormat)
+ {}
+ virtual ~GzipCompressWorkerManager();
+ virtual void initialize(void* i_writer);
+ virtual ParallelWorker* createWorker();
+ virtual void beginStep();
+ virtual void finishStep();
+ VariableSizeVector<size_t> sizes;
+ volatile int nChunks;
+ const size_t chunkSize;
+ const bool bam;
+ FileEncoder* encoder;
+ GzipWriterFilterSupplier* filterSupplier;
+ char* input;
+ size_t inputSize;
+ size_t inputUsed;
+ char* buffer;
+ VariableSizeVector< pair<_uint64,_uint64> > translation;
+ friend class GzipCompressWorker;
+class GzipCompressWorker : public ParallelWorker
+ GzipCompressWorker() : heap(NULL) {}
+ virtual ~GzipCompressWorker() { delete heap; }
+ virtual void step();
+ static size_t compressChunk(z_stream& zstream, bool bamFormat, char* toBuffer, size_t toSize, char* fromBuffer, size_t fromUsed);
+ z_stream zstream;
+ ThreadHeap* heap;
+// used for case where each thread compresses by itself
+class GzipWriterFilter : public DataWriter::Filter
+ GzipWriterFilter(GzipWriterFilterSupplier* i_supplier);
+ virtual void onAdvance(DataWriter* writer, size_t batchOffset, char* data, GenomeDistance bytes, GenomeLocation location);
+ virtual size_t onNextBatch(DataWriter* writer, size_t offset, size_t bytes);
+ GzipWriterFilterSupplier* supplier;
+ // if doing inline compression, filled in with minimally initialized objects
+ GzipCompressWorkerManager* manager;
+ ParallelWorker* worker;
+ FileEncoder* encoder;
+ if (buffer != NULL) {
+ // buffer is obtained from BigAlloc(), so it has to be released with BigDealloc(); handing it to operator delete is undefined behavior.
+ BigDealloc(buffer);
+ }
+ void
+ void* i_encoder)
+ encoder = (FileEncoder*) i_encoder;
+ ParallelWorker*
+ return new GzipCompressWorker();
+ void
+ if (filterSupplier->closing) {
+ nChunks = 0;
+ return;
+ }
+ encoder->getEncodeBatch(&input, &inputSize, &inputUsed);
+ nChunks = (int) ((inputUsed + chunkSize - 1) / chunkSize);
+ sizes.clear();
+ sizes.extend(nChunks);
+ if (buffer == NULL) {
+ buffer = (char*) BigAlloc(inputSize);
+ }
+ void
+ if (filterSupplier->closing) {
+ return;
+ }
+ size_t toUsed = 0, logicalOffset, physicalOffset;
+ encoder->getOffsets(&logicalOffset, &physicalOffset);
+ for (int i = 0; i < nChunks; i++) {
+ translation.push_back(pair<_uint64,_uint64>(logicalOffset, physicalOffset + toUsed));
+ _ASSERT(i * chunkSize < inputUsed);
+ _ASSERT(sizes[i] <= chunkSize);
+ size_t logicalChunk = min(chunkSize, inputUsed - i * chunkSize);
+ logicalOffset += logicalChunk;
+ _ASSERT(((BgzfHeader*)(buffer + i * chunkSize))->validate(sizes[i], logicalChunk));
+ memcpy(input + toUsed, buffer + i * chunkSize, sizes[i]);
+ toUsed += sizes[i];
+ }
+ _ASSERT(BgzfHeader::validate(input, toUsed));
+ encoder->setEncodedBatchSize(toUsed);
+ filterSupplier->addTranslations(&translation);
+ translation.clear();
+ void
+ GzipCompressWorkerManager* supplier = (GzipCompressWorkerManager*) getManager();
+ if (heap == NULL) {
+ heap = new ThreadHeap(supplier->chunkSize * 8); // appears to use 4*chunkSize per run
+ zstream.zalloc = zalloc;
+ zstream.zfree = zfree;
+ zstream.opaque = heap;
+ }
+ //fprintf(stderr, "zip task thread %d begin\n", GetCurrentThreadId());
+ _int64 start = timeInMillis();
+ int begin = (getThreadNum() * supplier->nChunks) / getNumThreads();
+ int end = ((1 + getThreadNum()) * supplier->nChunks) / getNumThreads();
+ for (int i = begin; i < end; i++) {
+ size_t bytes = min(supplier->chunkSize, supplier->inputUsed - i * supplier->chunkSize);
+ supplier->sizes[i] = compressChunk(zstream, supplier->bam,
+ supplier->buffer + i * supplier->chunkSize, supplier->chunkSize,
+ supplier->input + i * supplier->chunkSize, bytes);
+ _ASSERT(supplier->sizes[i] <= supplier->chunkSize); // can't grow!
+ }
+ size_t
+ z_stream& zstream,
+ bool bamFormat,
+ char* toBuffer,
+ size_t toSize,
+ char* fromBuffer,
+ size_t fromUsed)
+ if (bamFormat && fromUsed > BAM_BLOCK) {
+ WriteErrorMessage("exceeded BAM chunk size\n");
+ soft_exit(1);
+ }
+ if (zstream.opaque != NULL) {
+ ((ThreadHeap*)zstream.opaque)->reset();
+ }
+ // set up BAM header structure
+ gz_header header;
+ _uint8 bamExtraData[6];
+ if (bamFormat) {
+ header.text = false;
+ header.time = 0;
+ header.xflags = 0;
+ header.os = 0;
+ header.extra = bamExtraData;
+ header.extra_len = 6;
+ header.extra_max = 6;
+ header.name = NULL;
+ header.name_max = 0;
+ header.comment = NULL;
+ header.comm_max = 0;
+ header.hcrc = false;
+ header.done = true;
+ bamExtraData[0] = 'B';
+ bamExtraData[1] = 'C';
+ bamExtraData[2] = 2;
+ bamExtraData[3] = 0;
+ bamExtraData[4] = 3; // will be filled in later
+ bamExtraData[5] = 7; // will be filled in later
+ }
+ if (fromUsed > 0xffffffff || toSize > 0xffffffff) {
+ WriteErrorMessage("GZipDataWriter: fromUsed or toSize too big\n");
+ soft_exit(1);
+ }
+ // based on sample code at http://www.lemoda.net/c/zlib-open-write/index.html
+ const int windowBits = 15;
+ const int GZIP_ENCODING = 16;
+ zstream.next_in = (Bytef*) fromBuffer;
+ zstream.avail_in = (uInt)fromUsed;
+ zstream.next_out = (Bytef*) toBuffer;
+ zstream.avail_out = (uInt)toSize;
+ uInt oldAvail;
+ int status;
+ status = deflateInit2(&zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowBits | GZIP_ENCODING, 8, Z_DEFAULT_STRATEGY);
+ if (status < 0) {
+ WriteErrorMessage("GzipWriterFilter: deflateInit2 failed with %d\n", status);
+ soft_exit(1);
+ }
+ if (bamFormat) {
+ // Attach the gzip header carrying the BAM extra field that was set up above.
+ status = deflateSetHeader(&zstream, &header);
+ if (status != Z_OK) {
+ WriteErrorMessage("GzipWriterFilter: deflateSetHeader failed with %d\n", status);
+ soft_exit(1);
+ }
+ }
+ // Remember the output space before compressing so that "nothing was written" can be detected below.
+ oldAvail = zstream.avail_out;
+ status = deflate(&zstream, Z_FINISH);
+ if (status < 0 && status != Z_BUF_ERROR) {
+ WriteErrorMessage("GzipWriterFilter: deflate failed with %d\n", status);
+ soft_exit(1);
+ }
+ // make sure it all got written out in a single compressed block
+ if (zstream.avail_in != 0) {
+ WriteErrorMessage("GzipWriterFilter: deflate failed to read all input\n");
+ soft_exit(1);
+ }
+ if (zstream.avail_out == oldAvail) {
+ WriteErrorMessage("GzipWriterFilter: deflate failed to write output\n");
+ soft_exit(1);
+ }
+ status = deflateEnd(&zstream);
+ if (status < 0) {
+ WriteErrorMessage("GzipWriterFilter: deflateEnd failed with %d\n", status);
+ soft_exit(1);
+ }
+ size_t toUsed = toSize - zstream.avail_out;
+ if (bamFormat) {
+ // backpatch compressed block size into gzip header
+ if (toUsed >= BAM_BLOCK) {
+ WriteErrorMessage("exceeded BAM chunk size\n");
+ soft_exit(1);
+ }
+ * (_uint16*) (toBuffer + 16) = (_uint16) (toUsed - 1);
+ }
+ return toUsed;
+GzipWriterFilter::GzipWriterFilter(GzipWriterFilterSupplier* i_supplier)
+ : DataWriter::Filter(DataWriter::ResizeFilter), supplier(i_supplier), manager(NULL), worker(NULL)
+ void
+ DataWriter* writer,
+ size_t batchOffset,
+ char* data,
+ GenomeDistance bytes,
+ GenomeLocation location)
+ // nothing
+ size_t
+ DataWriter* writer,
+ size_t offset,
+ size_t bytes)
+ char* fromBuffer;
+ size_t fromSize, fromUsed, physicalOffset, logicalOffset;
+ writer->getBatch(-1, &fromBuffer, &fromSize, &fromUsed, &physicalOffset, NULL, &logicalOffset);
+ if (fromUsed == 0 || supplier->multiThreaded || supplier->closing) {
+ return fromUsed;
+ }
+ // do compress buffer synchronously in-place
+ if (manager == NULL) {
+ manager = new GzipCompressWorkerManager(supplier);
+ worker = manager->createWorker();
+ encoder = new FileEncoder(0, false, manager);
+ encoder->initialize((AsyncDataWriter*) writer);
+ manager->initialize(encoder);
+ manager->configure(worker, 0, 1);
+ }
+ encoder->setupEncode(-1);
+ manager->beginStep();
+ worker->step();
+ manager->finishStep();
+ writer->getBatch(-1, &fromBuffer, &fromSize, &fromUsed, &physicalOffset, NULL, &logicalOffset);
+ return fromUsed;
+ GzipWriterFilterSupplier*
+ bool bamFormat,
+ size_t chunkSize,
+ int numThreads,
+ bool bindToProcessors,
+ bool multiThreaded)
+ return new GzipWriterFilterSupplier(bamFormat, chunkSize, numThreads, bindToProcessors, multiThreaded);
+ DataWriter::Filter*
+ return new GzipWriterFilter(this);
+ void
+ DataWriterSupplier* supplier)
+ if (bamFormat) {
+ closing = true;
+ DataWriter* writer = supplier->getWriter();
+ // write empty block as BAM end of file marker
+ static _uint8 eof[] = {
+ 0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43,
+ 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ };
+ char* buffer;
+ size_t bytes;
+ if (! (writer->getBuffer(&buffer, &bytes) && bytes >= sizeof(eof))) {
+ WriteErrorMessage("no space to write eof marker\n");
+ soft_exit(1);
+ }
+ memcpy(buffer, eof, sizeof(eof));
+ writer->advance(sizeof(eof));
+ // add final translation for last empty block
+ writer->nextBatch();
+ char* ignore;
+ pair<_uint64,_uint64> last;
+ size_t used;
+ writer->getBatch(-1, &ignore, NULL, &used, (size_t*) &last.second, NULL, (size_t*) &last.first);
+ last.second += used;
+ translation.push_back(last);
+ writer->close();
+ delete writer;
+ }
+ // sort translations
+ std::sort(translation.begin(), translation.end(), translationComparator);
+ void
+ VariableSizeVector< pair<_uint64,_uint64> >* moreTranslations)
+ AcquireExclusiveLock(&lock);
+ translation.append(moreTranslations);
+ ReleaseExclusiveLock(&lock);
+ bool
+ _uint64 logical,
+ _uint64* o_physical,
+ _uint64* o_logicalDelta)
+ pair<_uint64,_uint64> value;
+ value.first = logical;
+ value.second = 0; //ignored
+ pair<_uint64,_uint64>* upper = std::upper_bound(translation.begin(), translation.end(), value, translationComparator);
+ if (upper == translation.begin()) {
+ return false;
+ }
+ upper--;
+ *o_physical = upper->second;
+ *o_logicalDelta = logical - upper->first;
+ return true;
+ bool
+ const pair<_uint64,_uint64>& a,
+ const pair<_uint64,_uint64>& b)
+ return a.first < b.first;
+ FileEncoder*
+ GzipWriterFilterSupplier* filterSupplier,
+ int numThreads,
+ bool bindToProcessor,
+ size_t chunkSize,
+ bool bam)
+ return new FileEncoder(numThreads, bindToProcessor, new GzipCompressWorkerManager(filterSupplier));
diff --git a/SNAPLib/GzipDataWriter.h b/SNAPLib/GzipDataWriter.h
new file mode 100644
index 0000000..f70d2a4
--- /dev/null
+++ b/SNAPLib/GzipDataWriter.h
@@ -0,0 +1,101 @@
+Module Name:
+ GzipDataWriter.h
+ Headers for the GzipDataWriter & related classes for the SNAP sequencer
+ Ravi Pandya, Mar 2013
+ User mode service.
+Revision History:
+#pragma once
+#include "Compat.h"
+#include "Read.h"
+#include "Compat.h"
+#include "DataWriter.h"
+#include "VariableSizeVector.h"
+#include "stdafx.h"
+#include "zlib.h"
+#include "Error.h"
+using std::pair;
+class GzipWriterFilterSupplier : public DataWriter::FilterSupplier
+ GzipWriterFilterSupplier(bool i_bamFormat, size_t i_chunkSize, int i_numThreads, bool i_bindToProcessors, bool i_multiThreaded)
+ :
+ FilterSupplier(DataWriter::ResizeFilter),
+ bamFormat(i_bamFormat),
+ chunkSize(i_chunkSize),
+ numThreads(i_numThreads),
+ bindToProcessors(i_bindToProcessors),
+ multiThreaded(i_multiThreaded),
+ closing(false)
+ {
+ InitializeExclusiveLock(&lock);
+ }
+ virtual ~GzipWriterFilterSupplier()
+ {
+ DestroyExclusiveLock(&lock);
+ }
+ const bool multiThreaded;
+ virtual DataWriter::Filter* getFilter();
+ virtual void onClosing(DataWriterSupplier* supplier);
+ virtual void onClosed(DataWriterSupplier* supplier) {}
+ void addTranslations(VariableSizeVector< pair<_uint64,_uint64> >* translation);
+ bool translate(_uint64 logical, _uint64* o_physical, _uint64* delta);
+ // translate to BAM virtual offset format
+ _uint64 toVirtualOffset(_uint64 logical)
+ {
+     //
+     // Packs the translation of a logical offset as (physical << 16) | delta.
+     // Returns 0 when logical is UINT64_MAX, when translate() fails, or (after
+     // writing an error message) when the translated pair does not fit that layout.
+     //
+     _uint64 physical, delta;
+     if (logical == UINT64_MAX || !translate(logical, &physical, &delta)) {
+         return 0;
+     }
+
+     const bool fits = delta < 65536 && physical < ((_uint64) 1 << 48);
+     if (!fits) {
+         WriteErrorMessage("Invalid virtual file offset, logical=%llu, start=%llu, delta=%llu\n", logical, physical, delta);
+         return 0;
+     }
+
+     return (physical << 16) | delta;
+ }
+ friend class GzipWriterFilter;
+ friend class GzipCompressWorkerManager;
+ void addTranslation(_uint64 logical, _uint64 physical)
+ {
+ AcquireExclusiveLock(&lock);
+ translation.push_back(pair<_uint64,_uint64>(logical, physical));
+ ReleaseExclusiveLock(&lock);
+ }
+ static bool translationComparator(const pair<_uint64,_uint64>& a, const pair<_uint64,_uint64>& b);
+ const bool bamFormat;
+ const size_t chunkSize;
+ const int numThreads;
+ const bool bindToProcessors;
+ ExclusiveLock lock;
+ VariableSizeVector< pair<_uint64,_uint64> > translation;
+ bool closing;
diff --git a/SNAPLib/HashTable.cpp b/SNAPLib/HashTable.cpp
new file mode 100755
index 0000000..1e20730
--- /dev/null
+++ b/SNAPLib/HashTable.cpp
@@ -0,0 +1,343 @@
+Module Name:
+ HashTable.cpp
+ Large closed hash table, specialized for SNAP
+ Bill Bolosky, March, 2011
+#include "stdafx.h"
+#include "HashTable.h"
+#include "BigAlloc.h"
+#include "exit.h"
+#include "Genome.h"
+#include "Error.h"
+#include "GenericFile_Blob.h"
+ _int64 i_tableSize,
+ unsigned i_keySizeInBytes,
+ unsigned i_valueSizeInBytes,
+ unsigned i_valueCount,
+ _uint64 i_invalidValueValue)
+Routine Description:
+ Constructor for a new, empty closed hash table.
+ tableSize - How many slots should the table have.
+ keySizeInBytes = i_keySizeInBytes;
+ valueSizeInBytes = i_valueSizeInBytes;
+ valueCount = i_valueCount;
+ invalidValueValue = i_invalidValueValue;
+ elementSize = keySizeInBytes + valueSizeInBytes * valueCount;
+ tableSize = i_tableSize;
+ usedElementCount = 0;
+ Table = NULL;
+ if (tableSize <= 0) {
+ tableSize = 0;
+ return;
+ }
+ Table = BigAlloc(tableSize * elementSize);
+ ownsMemoryForTable = true;
+ //
+ // Run through the table and set all of the first values to invalidValueValue, which means
+ // unused.
+ //
+ for (size_t i = 0; i < tableSize; i++) {
+ void *entry = getEntry(i);
+ _ASSERT(entry >= Table && entry <= (char *)Table + tableSize * elementSize);
+ clearKey(entry);
+ memcpy(getEntry(i), &invalidValueValue, valueSizeInBytes);
+ }
+SNAPHashTable *SNAPHashTable::loadFromBlob(GenericFile_Blob *loadFile)
+ SNAPHashTable *table = loadCommon(loadFile);
+ size_t bytesMapped;
+ table->Table = loadFile->mapAndAdvance(table->tableSize * table->elementSize, &bytesMapped);
+ if (bytesMapped != table->tableSize * table->elementSize) {
+ WriteErrorMessage("SNAPHashTable: unable to map table\n");
+ soft_exit(1);
+ }
+ table->ownsMemoryForTable = false;
+ return table;
+SNAPHashTable *SNAPHashTable::loadFromGenericFile(GenericFile *loadFile)
+ SNAPHashTable *table = loadCommon(loadFile);
+ table->Table = BigAlloc(table->tableSize * table->elementSize);
+ loadFile->read(table->Table, table->tableSize * table->elementSize);
+ table->ownsMemoryForTable = true;
+ return table;
+SNAPHashTable *SNAPHashTable::loadCommon(GenericFile *loadFile)
+ SNAPHashTable *table = new SNAPHashTable();
+ unsigned fileMagic;
+ if (sizeof(magic) != loadFile->read(&fileMagic, sizeof(magic))) {
+ // The read itself came up short, so fileMagic holds no data: report a read failure instead of printing an uninitialized value as a "mismatch".
+ WriteErrorMessage("SNAPHashTable: unable to read magic number on hash table load\n");
+ soft_exit(1);
+ }
+ if (fileMagic != magic) {
+ WriteErrorMessage("SNAPHashTable: magic number mismatch. Perhaps you have a corruped index. %d != %d\n", fileMagic, magic);
+ soft_exit(1);
+ }
+ if (sizeof(table->tableSize) != loadFile->read(&table->tableSize, sizeof(table->tableSize))) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fread table size failed\n");
+ soft_exit(1);
+ }
+ if (sizeof(table->usedElementCount) != loadFile->read(&table->usedElementCount, sizeof(table->usedElementCount))) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fread used element count failed\n");
+ soft_exit(1);
+ }
+ if (sizeof(table->keySizeInBytes) != loadFile->read(&table->keySizeInBytes, sizeof(table->keySizeInBytes))) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fread keySizeInBytes size failed. Perhaps this is an old format hash table and needs to be rebuilt.\n");
+ soft_exit(1);
+ }
+ if (table->keySizeInBytes < 4 || table->keySizeInBytes > 8) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable Key size must be between 4 and 8 inclusive. Perhaps this is an old format hash table and needs to be rebuilt.\n");
+ soft_exit(1);
+ }
+ if (sizeof(table->valueSizeInBytes) != loadFile->read(&table->valueSizeInBytes, sizeof(table->valueSizeInBytes))) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fread dataSizeInBytes size failed. Perhaps this is an old format hash table and needs to be rebuilt.\n");
+ soft_exit(1);
+ }
+ if (table->valueSizeInBytes == 0 || table->valueSizeInBytes > sizeof(_uint64)) {
+ //
+ // It must be at least one byte, because we need that much for the unused value value. The code stuffs
+ // values into _uint64, so it can't be bigger than that.
+ //
+ WriteErrorMessage(
+ "SNAPHashTable::SNAPHashTable value size in bytes (%d) must be between 1 and 8. Perhaps you have a hash table from a future version of SNAP? Or else it's corrupt.\n", table->valueSizeInBytes);
+ soft_exit(1);
+ }
+ if (sizeof(table->valueCount) != loadFile->read(&table->valueCount, sizeof(table->valueCount))) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable: value count failed to read.\n");
+ soft_exit(1);
+ }
+ if (table->valueCount == 0 || table->valueCount > 2) {
+ // Technically, > 2 would work fine with the code, but SNAP doesn't use it, so the check is here to detect corruption.
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable: invalid value count (%d), possible corruption or bad file format.\n", table->valueCount);
+ soft_exit(1);
+ }
+ table->invalidValueValue = 0; // Need this in case valueSizeInBytes < sizeof(ValueType)
+ if (table->valueSizeInBytes != loadFile->read(&table->invalidValueValue, table->valueSizeInBytes)) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable: unable to read invalid value value\n");
+ soft_exit(1);
+ }
+ if (table->tableSize <= 0) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable Zero or negative hash table size\n");
+ soft_exit(1);
+ }
+ table->elementSize = table->keySizeInBytes + table->valueSizeInBytes * table->valueCount;
+ return table;
+ if (ownsMemoryForTable) {
+ BigDealloc(Table);
+ }
+ bool
+SNAPHashTable::saveToFile(const char *saveFileName, size_t *bytesWritten)
+ FILE *saveFile = fopen(saveFileName,"wb");
+ if (saveFile == NULL) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable(%s) fopen failed\n",saveFileName);
+ return false;
+ }
+ bool worked = saveToFile(saveFile, bytesWritten);
+ fclose(saveFile);
+ return worked;
+SNAPHashTable::saveToFile(FILE *saveFile, size_t *bytesWritten)
+ *bytesWritten = 0;
+ if (1 != fwrite(&magic,sizeof(magic), 1, saveFile)) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fwrite magic number failed\n");
+ return false;
+ }
+ (*bytesWritten) += sizeof(magic);
+ if (1 != fwrite(&tableSize,sizeof(tableSize), 1, saveFile)) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fwrite table size failed\n");
+ return false;
+ }
+ (*bytesWritten) += sizeof(tableSize);
+ if (1 != fwrite(&usedElementCount,sizeof(usedElementCount), 1, saveFile)) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fwrite used element count size failed\n");
+ return false;
+ }
+ (*bytesWritten) += sizeof(usedElementCount);
+ if (1 != fwrite(&keySizeInBytes, sizeof(keySizeInBytes), 1, saveFile)) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fwrite key size failed\n");
+ return false;
+ }
+ (*bytesWritten) += sizeof(keySizeInBytes);
+ if (1 != fwrite(&valueSizeInBytes, sizeof(valueSizeInBytes), 1, saveFile)) {
+ WriteErrorMessage("SNAPHashTable::SNAPHashTable fwrite data size failed\n");
+ return false;
+ }
+ (*bytesWritten) += sizeof(valueSizeInBytes);
+ if (1 != fwrite(&valueCount, sizeof(valueCount), 1, saveFile)) {
+ WriteErrorMessage("SNAPHashTable: fwrite value count failed\n");
+ return false;
+ }
+ (*bytesWritten) += sizeof(valueCount);
+ if (1 != fwrite(&invalidValueValue, valueSizeInBytes, 1, saveFile)) {
+ WriteErrorMessage("SNAPHashTable: fwrite invalid value value failed\n");
+ return false;
+ }
+ (*bytesWritten) += valueSizeInBytes;
+ size_t maxWriteSize = 100 * 1024 * 1024;
+ size_t writeOffset = 0;
+ while (writeOffset < tableSize * elementSize) {
+ size_t amountToWrite = __min(maxWriteSize,tableSize * elementSize - writeOffset);
+ size_t thisWrite = fwrite((char*)Table + writeOffset, 1, amountToWrite, saveFile);
+ if (thisWrite < amountToWrite) {
+ WriteErrorMessage("SNAPHashTable::saveToFile: fwrite failed, %d\n"
+ "handle %p, addr %p, atr: %lu, &bw %p\n",errno, saveFile,(char*)Table + writeOffset, amountToWrite, &bytesWritten);
+ return false;
+ }
+ writeOffset += thisWrite;
+ (*bytesWritten) += thisWrite;
+ }
+ return true;
+_int64 nCallsToGetEntryForKey = 0;
+_int64 nProbesInGetEntryForKey = 0;
+void *
+SNAPHashTable::getEntryForKey(KeyType key) const
+ nCallsToGetEntryForKey++;
+ _uint64 tableIndex = hash(key) % tableSize;
+ bool wrapped = false;
+ _uint64 nProbes = 1;
+ //
+ // Chain through the table until we hit either a match on the key or an unused element
+ //
+ void *entry = getEntry(tableIndex);
+ while (!isKeyEqual(entry, key) && !doesEntryHaveInvalidValue(entry)) {
+ nProbesInGetEntryForKey++;
+ tableIndex += nProbes * nProbes;
+ } else {
+ tableIndex++;
+ }
+ nProbes++;
+ if (tableIndex >= tableSize) {
+ if (wrapped) {
+ return NULL;
+ }
+ wrapped = true;
+ tableIndex = tableIndex % tableSize;
+ }
+ entry = getEntry(tableIndex);
+ }
+ nProbesInGetEntryForKey++;
+ return entry;
+ SNAPHashTable::ValueType *
+SNAPHashTable::SlowLookup(KeyType key)
+ void *entry = getEntryForKey(key);
+ if (NULL == entry || doesEntryHaveInvalidValue(entry)) {
+ return NULL;
+ }
+ return (ValueType *)entry;
+ bool
+SNAPHashTable::Insert(KeyType key, ValueType *data)
+ void *entry = getEntryForKey(key);
+ if (NULL == entry) {
+ return false;
+ }
+ if (!isKeyEqual(entry, key)) {
+ setKey(entry, key);
+ usedElementCount++;
+ }
+ for (unsigned i = 0; i < valueCount; i++) {
+ memcpy((char *)entry + i * valueSizeInBytes, &data[i], valueSizeInBytes); // Assumes little endian
+ }
+ return true;
+const unsigned SNAPHashTable::magic = 0xb111b010;
diff --git a/SNAPLib/HashTable.h b/SNAPLib/HashTable.h
new file mode 100644
index 0000000..87238e3
--- /dev/null
+++ b/SNAPLib/HashTable.h
@@ -0,0 +1,219 @@
+Module Name:
+ HashTable.h
+ Headers for large closed hash table that used to be general, but now just handles
+ seeds for SNAP.
+ Bill Bolosky, March, 2011
+#pragma once
+#include "Compat.h"
+#include "GenericFile_Blob.h"
+#include "Genome.h"
+class SNAPHashTable {
+ public:
+ typedef _uint64 ValueType; // Values can be smaller than this, but they're expanded in the interface
+ typedef _uint64 KeyType; // Likewise for keys.
+ SNAPHashTable(
+ _int64 i_tableSize,
+ unsigned i_keySizeInBytes,
+ unsigned i_valueSizeInBytes,
+ unsigned i_valueCount,
+ _uint64 i_invalidValueValue);
+ //
+ // Load from file.
+ //
+ static SNAPHashTable *loadFromBlob(GenericFile_Blob *loadFile);
+ static SNAPHashTable *loadFromGenericFile(GenericFile *loadFile);
+ ~SNAPHashTable();
+ bool saveToFile(const char *saveFileName, size_t *bytesWritten);
+ bool saveToFile(FILE *saveFile, size_t *bytesWritten);
+ //
+ // Fails (returns false) if the table is full. NOTE(review): Insert() in HashTable.cpp overwrites the
+ // values of a key that already exists rather than failing. Inserts ALL values for a key.
+ //
+ bool Insert(KeyType key, ValueType *data);
+ size_t GetUsedElementCount() const {return usedElementCount;}
+ size_t GetTableSize() const {return tableSize;}
+ unsigned GetKeySizeInBytes() const {return keySizeInBytes;}
+ unsigned GetValueSizeInBytes() const {return valueSizeInBytes;}
+ unsigned GetValueCount() const {return valueCount;}
+ void *getEntryValues(_uint64 whichEntry)
+ {
+ _ASSERT(whichEntry < GetTableSize());
+ return getEntry(whichEntry);
+ }
+ static inline _uint64 hash(_uint64 key) {
+ //
+ // Hash the key. Use the hash finalizer from the 64 bit MurmurHash3, http://code.google.com/p/smhasher/wiki/MurmurHash3,
+ // which is public domain code.
+ //
+ key ^= key >> 33;
+ key *= 0xff51afd7ed558ccd;
+ key ^= key >> 33;
+ key *= 0xc4ceb9fe1a85ec53;
+ key ^= key >> 33;
+ return key;
+ }
+ inline ValueType *GetFirstValueForKey(KeyType key) const {
+ _ASSERT(keySizeInBytes == 8 || (key & ~((((_uint64)1) << (keySizeInBytes * 8)) - 1)) == 0); // High bits of the key aren't set.
+ _uint64 tableIndex = hash(key) % tableSize;
+ void *entry = getEntry(tableIndex);
+ if (isKeyEqual(entry, key) && !doesEntryHaveInvalidValue(entry)) { // Fast path: the key is in its home slot.
+ return (ValueType *)entry;
+ } else {
+ unsigned nProbes = 0;
+ void* entry; // Shadows the outer 'entry' declared above.
+ do {
+ nProbes++;
+ if (nProbes > tableSize + QUADRATIC_CHAINING_DEPTH) { // Probe budget exhausted: report the key as absent.
+ return NULL;
+ }
+ tableIndex = (tableIndex + nProbes * nProbes) % tableSize; // NOTE(review): the "if" line guarding this quadratic step is missing from this patch text; presumably "if (nProbes < QUADRATIC_CHAINING_DEPTH) {" - confirm against upstream.
+ } else {
+ tableIndex = (tableIndex + 1) % tableSize;
+ }
+ entry = getEntry(tableIndex);
+ } while (!isKeyEqual(entry, key) && !doesEntryHaveInvalidValue(entry));
+ extern _int64 nProbesInGetEntryForKey; // Defined in HashTable.cpp.
+ nProbesInGetEntryForKey += nProbes;
+ if (doesEntryHaveInvalidValue(entry)) {
+ return NULL;
+ } else {
+ return (ValueType *)entry;
+ }
+ }
+ }
+ inline bool Lookup(KeyType key, unsigned nValuesToFill, ValueType *values) const {
+ _ASSERT(nValuesToFill <= valueCount);
+ char *entry = (char *)GetFirstValueForKey(key);
+ if (NULL == entry) {
+ return false;
+ }
+ for (unsigned i = 0; i < nValuesToFill; i++) {
+ values[i] = 0; // Zero first: only valueSizeInBytes bytes are copied in below.
+ memcpy(values + i, entry + i * valueSizeInBytes, valueSizeInBytes);
+ }
+ return true;
+ }
+ //
+ // A version of Lookup that works properly when the table is (nearly) full and the key being looked up isn't
+ // there. It's, as you might imagine, slower than Lookup.
+ //
+ ValueType *SlowLookup(KeyType key);
+ SNAPHashTable() {} // Empty body: no members are initialized here.
+ static const unsigned QUADRATIC_CHAINING_DEPTH = 5; // Chain quadratically for this long, then linearly. Set to 0 for linear chaining
+ static SNAPHashTable *loadCommon(GenericFile *loadFile);
+ //
+ // A hash table entry consists of a set of valueCount values, each of valueSizeInBytes bytes, followed by
+ // a key of keySizeInBytes bytes. The key size must be between 4 and 8 bytes, inclusive.
+ //
+ // Because the size of the fields and the count of values is variable, we can't use a struct and are stuck
+ // with ugly memory manipulation to access the hash table entries.
+ //
+ // The format is 1 or 2 (valueCount) values of size valueSize, followed by keySize bytes of key.
+ //
+#if 0
+ struct Entry {
+ unsigned value1;
+ unsigned value2;
+ unsigned char key[1]; // Actual size of key determined by keySizeInBytes
+ };
+#endif // 0
+ // Free Entries have the first valueSizeInBytes bytes of value == invalidValueValue. The following methods
+ // understand the format and try to make it less opaque to use them.
+ inline void *getEntry(_uint64 whichEntry) const {
+ return ((char *)Table + elementSize * whichEntry);
+ }
+ inline bool doesEntryHaveInvalidValue(void *entry) const
+ {
+ return !memcmp(entry, &invalidValueValue, valueSizeInBytes);
+ }
+ inline ValueType getValueFromEntry(void *entry, unsigned whichValue) const
+ {
+ _ASSERT(whichValue < valueCount);
+ ValueType value = 0; // Need =0 because valueSizeInBytes might be < sizeof(ValueType)
+ memcpy(&value, (char *)entry + whichValue * valueSizeInBytes, valueSizeInBytes); // Assumes little-endian
+ return value;
+ }
+ inline bool isKeyEqual(const void *entry, KeyType key) const
+ {
+ return !memcmp((const char *)entry + valueSizeInBytes * valueCount, &key, keySizeInBytes);
+ }
+ inline void clearKey(void *entry)
+ {
+ memset((char *)entry + valueSizeInBytes * valueCount, 0 , keySizeInBytes);
+ }
+ inline void setKey(void *entry, KeyType key)
+ {
+ memcpy((char *)entry + valueSizeInBytes * valueCount, &key, keySizeInBytes);
+ }
+ void *Table;
+ size_t tableSize;
+ unsigned keySizeInBytes;
+ unsigned elementSize;
+ size_t usedElementCount;
+ bool ownsMemoryForTable;
+ unsigned valueSizeInBytes;
+ unsigned valueCount;
+ ValueType invalidValueValue;
+ //
+ // Returns either the entry for this key, or else the entry where the key would be
+ // inserted if it's not in the table. Returns NULL if probing gives up without finding either.
+ //
+ void* getEntryForKey(__in KeyType key) const;
+ friend class SeedCountIterator;
+ static const unsigned magic;
diff --git a/SNAPLib/Histogram.cpp b/SNAPLib/Histogram.cpp
new file mode 100644
index 0000000..84d280a
--- /dev/null
+++ b/SNAPLib/Histogram.cpp
@@ -0,0 +1,103 @@
+Module Name:
+ histogram.cpp
+ Cheezy histogram class
+ Bill Bolosky, September, 2011
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+#include "stdafx.h"
+#include "Compat.h"
+#include "Histogram.h"
+#include "exit.h"
+Histogram::Histogram(unsigned i_nBuckets, bool i_isExponential) : // Builds i_nBuckets zero-count buckets with upper bounds 1, 2, 4, ... (exponential) or 1, 2, 3, ... (linear).
+ nBuckets(i_nBuckets), isExponential(i_isExponential)
+ _ASSERT(nBuckets > 0);
+ buckets = new Bucket[nBuckets];
+ buckets[0].maxValue = 1;
+ buckets[0].count = 0;
+ for (unsigned i = 1; i < nBuckets; i++) {
+ if (isExponential) {
+ buckets[i].maxValue = buckets[i-1].maxValue * 2; // NOTE(review): unsigned doubling wraps to 0 at i == 32 if unsigned is 32 bits.
+ } else {
+ buckets[i].maxValue = buckets[i-1].maxValue +1;
+ }
+ buckets[i].count = 0;
+ }
+ unsigned // Returns the count accumulated in bucket whichBucket.
+Histogram::getBucketCount(unsigned whichBucket) const
+ _ASSERT(whichBucket < nBuckets);
+ return buckets[whichBucket].count;
+ unsigned // Returns the largest value that bucket whichBucket accepts (inclusive).
+Histogram::getBucketMax(unsigned whichBucket) const
+ _ASSERT(whichBucket < nBuckets);
+ return buckets[whichBucket].maxValue;
+ unsigned // Returns the smallest value that lands in bucket whichBucket.
+Histogram::getBucketMin(unsigned whichBucket) const
+ _ASSERT(whichBucket < nBuckets);
+ if (0 == whichBucket) {
+ return 0; // The first bucket starts at 0.
+ }
+ return buckets[whichBucket-1].maxValue + 1; // One past the previous bucket's upper bound.
+ void // Adds amountToAdd to the first bucket whose maxValue is >= value.
+Histogram::addToCount(unsigned value, unsigned amountToAdd)
+ for (unsigned i = 0 ; i < nBuckets; i++) { // Linear scan over the buckets. It is called "cheezy" after all
+ if (value <= buckets[i].maxValue) {
+ buckets[i].count += amountToAdd;
+ return;
+ }
+ }
+ //
+ // Overflow: value is larger than every bucket's maxValue. Just drop it.
+ //
+ //
+ // Prints a two-column table to stdout: one row per bucket with its upper bound and its count.
+ //
+ void
+Histogram::print() const
+ printf("MaxValue Count\n");
+ printf("-------- --------\n");
+ for (unsigned i = 0; i < nBuckets; i++) {
+ printf("%8u %8u\n",buckets[i].maxValue, buckets[i].count); // Both fields are unsigned, so %u rather than %d.
+ }
+ delete [] buckets;
+ buckets = NULL;
diff --git a/SNAPLib/Histogram.h b/SNAPLib/Histogram.h
new file mode 100644
index 0000000..efab46f
--- /dev/null
+++ b/SNAPLib/Histogram.h
@@ -0,0 +1,55 @@
+Module Name:
+ histogram.h
+ Header for cheezy histogram class
+ Bill Bolosky, September, 2011
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+#pragma once
+class Histogram { // Fixed-bucket counting histogram. Not thread safe (see the file header).
+ Histogram(unsigned i_nBuckets, bool i_isExponential);
+ ~Histogram();
+ unsigned getNBuckets() const {return nBuckets;}
+ bool getIsExponential() const {return isExponential;}
+ unsigned getBucketCount(unsigned whichBucket) const;
+ unsigned getBucketMax(unsigned whichBucket) const;
+ unsigned getBucketMin(unsigned whichBucket) const;
+ void print() const;
+ void addToCount(unsigned value, unsigned amountToAdd = 1); // Values above the last bucket's maxValue are dropped.
+ unsigned nBuckets;
+ bool isExponential; // True: bucket upper bounds double; false: they increase by 1.
+ struct Bucket {
+ unsigned maxValue; // Largest value counted in this bucket (inclusive).
+ unsigned count;
+ };
+ Bucket *buckets; // Array of nBuckets buckets allocated by the constructor.
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
new file mode 100644
index 0000000..df63cec
--- /dev/null
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -0,0 +1,1423 @@
+Module Name:
+ IntersectingPairedEndAligner.cpp
+ A paired-end aligner based on set intersections to narrow down possible candidate locations.
+ Bill Bolosky, February, 2013
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "IntersectingPairedEndAligner.h"
+#include "SeedSequencer.h"
+#include "mapq.h"
+#include "exit.h"
+#include "Error.h"
+#include "BigAlloc.h"
+#include "AlignerOptions.h"
+#ifdef _DEBUG
+extern bool _DumpAlignments; // From BaseAligner.cpp
+#endif // _DEBUG
+ GenomeIndex *index_, // NOTE(review): the constructor's opening signature line is missing from this patch text.
+ unsigned maxReadSize_,
+ unsigned maxHits_,
+ unsigned maxK_,
+ unsigned numSeedsFromCommandLine_,
+ double seedCoverage_,
+ unsigned minSpacing_, // Minimum distance to allow between the two ends.
+ unsigned maxSpacing_, // Maximum distance to allow between the two ends.
+ unsigned maxBigHits_,
+ unsigned extraSearchDepth_,
+ unsigned maxCandidatePoolSize,
+ int maxSecondaryAlignmentsPerContig_,
+ BigAllocator *allocator,
+ bool noUkkonen_,
+ bool noOrderedEvaluation_,
+ bool noTruncation_) :
+ index(index_), maxReadSize(maxReadSize_), maxHits(maxHits_), maxK(maxK_), numSeedsFromCommandLine(__min(MAX_MAX_SEEDS,numSeedsFromCommandLine_)), minSpacing(minSpacing_), maxSpacing(maxSpacing_),
+ landauVishkin(NULL), reverseLandauVishkin(NULL), maxBigHits(maxBigHits_), seedCoverage(seedCoverage_),
+ extraSearchDepth(extraSearchDepth_), nLocationsScored(0), noUkkonen(noUkkonen_), noOrderedEvaluation(noOrderedEvaluation_), noTruncation(noTruncation_),
+ maxSecondaryAlignmentsPerContig(maxSecondaryAlignmentsPerContig_)
+ doesGenomeIndexHave64BitLocations = index->doesGenomeIndexHave64BitLocations(); // Must be set before allocateDynamicMemory(), which reads it.
+ unsigned maxSeedsToUse;
+ if (0 != numSeedsFromCommandLine) { // An explicit seed count (already capped at MAX_MAX_SEEDS above) wins over seedCoverage.
+ maxSeedsToUse = numSeedsFromCommandLine;
+ } else {
+ maxSeedsToUse = (unsigned)(maxReadSize * seedCoverage / index->getSeedLength());
+ }
+ allocateDynamicMemory(allocator, maxReadSize, maxBigHits, maxSeedsToUse, maxK, extraSearchDepth, maxCandidatePoolSize, maxSecondaryAlignmentsPerContig);
+ rcTranslationTable['A'] = 'T'; // Reverse-complement table. NOTE(review): only 'A', 'G', 'C', 'T' and 'N' are assigned in this constructor.
+ rcTranslationTable['G'] = 'C';
+ rcTranslationTable['C'] = 'G';
+ rcTranslationTable['T'] = 'A';
+ rcTranslationTable['N'] = 'N';
+ for (unsigned i = 0; i < 256; i++) {
+ nTable[i] = 0;
+ }
+ nTable['N'] = 1; // nTable[c] is 1 only for 'N'; align() sums it to count the Ns in a read.
+ seedLen = index->getSeedLength();
+ genome = index->getGenome();
+ genomeSize = genome->getCountOfBases();
+ //
+ // Returns the number of bytes one aligner needs: sizeof the object plus whatever allocateDynamicMemory()
+ // requests, measured by running it against a CountingBigAllocator and reading back getMemoryUsed().
+ // The seed count is derived exactly as in the constructor. The seedLen parameter is not used; the seed
+ // length is read from index.
+ //
+ size_t
+IntersectingPairedEndAligner::getBigAllocatorReservation(GenomeIndex * index, unsigned maxBigHitsToConsider, unsigned maxReadSize, unsigned seedLen, unsigned numSeedsFromCommandLine,
+ double seedCoverage, unsigned maxEditDistanceToConsider, unsigned maxExtraSearchDepth, unsigned maxCandidatePoolSize,
+ int maxSecondaryAlignmentsPerContig)
+ unsigned maxSeedsToUse;
+ if (0 != numSeedsFromCommandLine) {
+ maxSeedsToUse = numSeedsFromCommandLine;
+ } else {
+ maxSeedsToUse = (unsigned)(maxReadSize * seedCoverage / index->getSeedLength());
+ }
+ CountingBigAllocator countingAllocator;
+ {
+ IntersectingPairedEndAligner aligner; // This has to be in a nested scope so its destructor is called before that of the countingAllocator
+ aligner.index = index;
+ //
+ // allocateDynamicMemory() passes this flag to HashTableHitSet::firstInit(), so it has to be set the
+ // same way the real constructor sets it; otherwise the measurement depends on an unset member.
+ //
+ aligner.doesGenomeIndexHave64BitLocations = index->doesGenomeIndexHave64BitLocations();
+ aligner.allocateDynamicMemory(&countingAllocator, maxReadSize, maxBigHitsToConsider, maxSeedsToUse, maxEditDistanceToConsider, maxExtraSearchDepth, maxCandidatePoolSize,
+ maxSecondaryAlignmentsPerContig);
+ return sizeof(aligner) + countingAllocator.getMemoryUsed();
+ }
+ void // Carves every per-aligner buffer out of allocator; also run against a counting allocator to size the reservation.
+IntersectingPairedEndAligner::allocateDynamicMemory(BigAllocator *allocator, unsigned maxReadSize, unsigned maxBigHitsToConsider, unsigned maxSeedsToUse,
+ unsigned maxEditDistanceToConsider, unsigned maxExtraSearchDepth, unsigned maxCandidatePoolSize,
+ int maxSecondaryAlignmentsPerContig)
+ seedUsed = (BYTE *) allocator->allocate(100 + (maxReadSize + 7) / 8); // One bit per read position rounded up to bytes, plus 100 bytes of slack.
+ for (unsigned whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+ rcReadData[whichRead] = (char *)allocator->allocate(maxReadSize);
+ rcReadQuality[whichRead] = (char *)allocator->allocate(maxReadSize);
+ for (Direction dir = 0; dir < NUM_DIRECTIONS; dir++) {
+ reversedRead[whichRead][dir] = (char *)allocator->allocate(maxReadSize);
+ hashTableHitSets[whichRead][dir] =(HashTableHitSet *)allocator->allocate(sizeof(HashTableHitSet)); /*new HashTableHitSet();*/
+ hashTableHitSets[whichRead][dir]->firstInit(maxSeedsToUse, maxMergeDistance, allocator, doesGenomeIndexHave64BitLocations); // Reads the member flag, so it must be set before this call.
+ }
+ }
+ scoringCandidatePoolSize = min(maxCandidatePoolSize, maxBigHitsToConsider * maxSeedsToUse * NUM_READS_PER_PAIR); // NOTE(review): the product is presumably computed in unsigned arithmetic and could wrap for very large arguments.
+ scoringCandidates = (ScoringCandidate **) allocator->allocate(sizeof(ScoringCandidate *) * (maxEditDistanceToConsider + maxExtraSearchDepth + 1)); //+1 is for 0.
+ scoringCandidatePool = (ScoringCandidate *)allocator->allocate(sizeof(ScoringCandidate) * scoringCandidatePoolSize);
+ for (unsigned i = 0; i < NUM_READS_PER_PAIR; i++) {
+ scoringMateCandidates[i] = (ScoringMateCandidate *) allocator->allocate(sizeof(ScoringMateCandidate) * scoringCandidatePoolSize / NUM_READS_PER_PAIR); // Each mate pool is sized at 1/NUM_READS_PER_PAIR of the candidate pool.
+ }
+ mergeAnchorPoolSize = scoringCandidatePoolSize;
+ mergeAnchorPool = (MergeAnchor *)allocator->allocate(sizeof(MergeAnchor) * mergeAnchorPoolSize);
+ if (maxSecondaryAlignmentsPerContig > 0) { // Per-contig hit counters are only allocated when this limit is positive.
+ size_t size = sizeof(*hitsPerContigCounts) * index->getGenome()->getNumContigs();
+ hitsPerContigCounts = (HitsPerContigCounts *)allocator->allocate(size);
+ memset(hitsPerContigCounts, 0, size);
+ contigCountEpoch = 0;
+ } else {
+ hitsPerContigCounts = NULL;
+ }
+ void
+ Read *read0,
+ Read *read1,
+ PairedAlignmentResult *result,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ PairedAlignmentResult *secondaryResults, // The caller passes in a buffer of secondaryResultBufferSize and it's filled in by align()
+ int singleSecondaryBufferSize,
+ int maxSecondaryResultsToReturn,
+ int *nSingleEndSecondaryResultsForFirstRead,
+ int *nSingleEndSecondaryResultsForSecondRead,
+ SingleAlignmentResult *singleEndSecondaryResults // Single-end secondary alignments for when the paired-end alignment didn't work properly
+ )
+ result->nLVCalls = 0;
+ result->nSmallHits = 0;
+ *nSecondaryResults = 0;
+ *nSingleEndSecondaryResultsForFirstRead = 0;
+ *nSingleEndSecondaryResultsForSecondRead = 0;
+ int maxSeeds;
+ if (numSeedsFromCommandLine != 0) {
+ maxSeeds = (int)numSeedsFromCommandLine;
+ } else {
+ maxSeeds = (int)(max(read0->getDataLength(), read1->getDataLength()) * seedCoverage / index->getSeedLength());
+ }
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("\nIntersectingAligner aligning reads '%*.s' and '%.*s' with data '%.*s' and '%.*s'\n", read0->getIdLength(), read0->getId(), read1->getIdLength(), read1->getId(), read0->getDataLength(), read0->getData(), read1->getDataLength(), read1->getData());
+ }
+#endif // _DEBUG
+ lowestFreeScoringCandidatePoolEntry = 0;
+ for (unsigned k = 0; k <= maxK + extraSearchDepth; k++) {
+ scoringCandidates[k] = NULL;
+ }
+ for (unsigned i = 0; i < NUM_SET_PAIRS; i++) {
+ lowestFreeScoringMateCandidate[i] = 0;
+ }
+ firstFreeMergeAnchor = 0;
+ Read rcReads[NUM_READS_PER_PAIR];
+ GenomeLocation bestResultGenomeLocation[NUM_READS_PER_PAIR];
+ Direction bestResultDirection[NUM_READS_PER_PAIR];
+ unsigned bestResultScore[NUM_READS_PER_PAIR];
+ unsigned popularSeedsSkipped[NUM_READS_PER_PAIR];
+ reads[0][FORWARD] = read0;
+ reads[1][FORWARD] = read1;
+ //
+ // Don't bother if one or both reads are too short. The minimum read length here is the seed length, but usually there's a longer
+ // minimum enforced by our caller.
+ //
+ if (read0->getDataLength() < seedLen || read1->getDataLength() < seedLen) {
+ return;
+ }
+ //
+ // Build the RC reads.
+ //
+ unsigned countOfNs = 0;
+ for (unsigned whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+ Read *read = reads[whichRead][FORWARD];
+ readLen[whichRead] = read->getDataLength();
+ popularSeedsSkipped[whichRead] = 0;
+ countOfHashTableLookups[whichRead] = 0;
+#if 0
+ hitLocations[whichRead]->clear();
+ mateHitLocations[whichRead]->clear();
+#endif // 0
+ for (Direction dir = FORWARD; dir < NUM_DIRECTIONS; dir++) {
+ totalHashTableHits[whichRead][dir] = 0;
+ largestHashTableHit[whichRead][dir] = 0;
+ hashTableHitSets[whichRead][dir]->init();
+ }
+ if (readLen[whichRead] > maxReadSize) {
+ WriteErrorMessage("IntersectingPairedEndAligner:: got too big read (%d > %d)\n"
+ "Change MAX_READ_LENTH at the beginning of Read.h and recompile.\n", readLen[whichRead], maxReadSize);
+ soft_exit(1);
+ }
+ for (unsigned i = 0; i < reads[whichRead][FORWARD]->getDataLength(); i++) {
+ rcReadData[whichRead][i] = rcTranslationTable[read->getData()[readLen[whichRead] - i - 1]];
+ rcReadQuality[whichRead][i] = read->getQuality()[readLen[whichRead] - i - 1];
+ countOfNs += nTable[read->getData()[i]];
+ }
+ reads[whichRead][RC] = &rcReads[whichRead];
+ reads[whichRead][RC]->init(read->getId(), read->getIdLength(), rcReadData[whichRead], rcReadQuality[whichRead], read->getDataLength());
+ }
+ if (countOfNs > maxK) {
+ return;
+ }
+ //
+ // Build the reverse data for both reads in both directions for the backwards LV to use.
+ //
+ for (unsigned whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+ for (Direction dir = 0; dir < NUM_DIRECTIONS; dir++) {
+ Read *read = reads[whichRead][dir];
+ for (unsigned i = 0; i < read->getDataLength(); i++) {
+ reversedRead[whichRead][dir][i] = read->getData()[read->getDataLength() - i - 1];
+ }
+ }
+ }
+ unsigned thisPassSeedsNotSkipped[NUM_READS_PER_PAIR][NUM_DIRECTIONS] = {{0,0}, {0,0}};
+ //
+ // Initialize the member variables that are effectively stack locals, but are in the object
+ // to avoid having to pass them to score.
+ //
+ double probabilityOfBestPair = 0;
+ localBestPairProbability[0] = 0;
+ localBestPairProbability[1] = 0;
+ double probabilityOfAllPairs = 0;
+ unsigned bestPairScore = 65536;
+ unsigned scoreLimit = maxK + extraSearchDepth;
+ //
+ // Phase 1: do the hash table lookups for each of the seeds for each of the reads and add them to the hit sets.
+ //
+ for (unsigned whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+ int nextSeedToTest = 0;
+ unsigned wrapCount = 0;
+ int nPossibleSeeds = (int)readLen[whichRead] - seedLen + 1;
+ memset(seedUsed, 0, (__max(readLen[0], readLen[1]) + 7) / 8);
+ bool beginsDisjointHitSet[NUM_DIRECTIONS] = {true, true};
+ while (countOfHashTableLookups[whichRead] < nPossibleSeeds && countOfHashTableLookups[whichRead] < maxSeeds) {
+ if (nextSeedToTest >= nPossibleSeeds) {
+ wrapCount++;
+ beginsDisjointHitSet[FORWARD] = beginsDisjointHitSet[RC] = true;
+ if (wrapCount >= seedLen) {
+ //
+ // There aren't enough valid seeds in this read to reach our target.
+ //
+ break;
+ }
+ nextSeedToTest = GetWrappedNextSeedToTest(seedLen, wrapCount);
+ }
+ while (nextSeedToTest < nPossibleSeeds && IsSeedUsed(nextSeedToTest)) {
+ //
+ // This seed is already used. Try the next one.
+ //
+ nextSeedToTest++;
+ }
+ if (nextSeedToTest >= nPossibleSeeds) {
+ //
+ // Unusable seeds have pushed us past the end of the read. Go back around the outer loop so we wrap properly.
+ //
+ continue;
+ }
+ SetSeedUsed(nextSeedToTest);
+ if (!Seed::DoesTextRepresentASeed(reads[whichRead][FORWARD]->getData() + nextSeedToTest, seedLen)) {
+ //
+ // It's got Ns in it, so just skip it.
+ //
+ nextSeedToTest++;
+ continue;
+ }
+ Seed seed(reads[whichRead][FORWARD]->getData() + nextSeedToTest, seedLen);
+ //
+ // Find all instances of this seed in the genome.
+ //
+ _int64 nHits[NUM_DIRECTIONS];
+ const GenomeLocation *hits[NUM_DIRECTIONS];
+ const unsigned *hits32[NUM_DIRECTIONS];
+ if (doesGenomeIndexHave64BitLocations) {
+ index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC],
+ hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation());
+ } else {
+ index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]);
+ }
+ countOfHashTableLookups[whichRead]++;
+ for (Direction dir = FORWARD; dir < NUM_DIRECTIONS; dir++) {
+ int offset;
+ if (dir == FORWARD) {
+ offset = nextSeedToTest;
+ } else {
+ offset = readLen[whichRead] - seedLen - nextSeedToTest;
+ }
+ if (nHits[dir] < maxBigHits) {
+ totalHashTableHits[whichRead][dir] += nHits[dir];
+ if (doesGenomeIndexHave64BitLocations) {
+ hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], beginsDisjointHitSet[dir]);
+ } else {
+ hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], beginsDisjointHitSet[dir]);
+ }
+ beginsDisjointHitSet[dir]= false;
+ } else {
+ popularSeedsSkipped[whichRead]++;
+ }
+ }
+ //
+ // If we don't have enough seeds left to reach the end of the read, space out the seeds more-or-less evenly.
+ //
+ if ((maxSeeds - countOfHashTableLookups[whichRead] + 1) * (int)seedLen + nextSeedToTest < nPossibleSeeds) {
+ _ASSERT((nPossibleSeeds - nextSeedToTest - 1) / (maxSeeds - countOfHashTableLookups[whichRead] + 1) >= (int)seedLen);
+ nextSeedToTest += (nPossibleSeeds - nextSeedToTest - 1) / (maxSeeds - countOfHashTableLookups[whichRead] + 1);
+ _ASSERT(nextSeedToTest < nPossibleSeeds); // We haven't run off the end of the read.
+ } else {
+ nextSeedToTest += seedLen;
+ }
+ } // while we need to lookup seeds for this read
+ } // for each read
+ readWithMoreHits = totalHashTableHits[0][FORWARD] + totalHashTableHits[0][RC] > totalHashTableHits[1][FORWARD] + totalHashTableHits[1][RC] ? 0 : 1;
+ readWithFewerHits = 1 - readWithMoreHits;
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("Read 0 has %d hits, read 1 has %d hits\n", totalHashTableHits[0][FORWARD] + totalHashTableHits[0][RC], totalHashTableHits[1][FORWARD] + totalHashTableHits[1][RC]);
+ }
+#endif // _DEBUG
+ Direction setPairDirection[NUM_SET_PAIRS][NUM_READS_PER_PAIR] = {{FORWARD, RC}, {RC, FORWARD}};
+ //
+ // Phase 2: find all possible candidates and add them to candidate lists (for the reads with fewer and more hits).
+ //
+ unsigned maxUsedBestPossibleScoreList = 0;
+ for (unsigned whichSetPair = 0; whichSetPair < NUM_SET_PAIRS; whichSetPair++) {
+ HashTableHitSet *setPair[NUM_READS_PER_PAIR];
+ if (whichSetPair == 0) {
+ setPair[0] = hashTableHitSets[0][FORWARD];
+ setPair[1] = hashTableHitSets[1][RC];
+ } else {
+ setPair[0] = hashTableHitSets[0][RC];
+ setPair[1] = hashTableHitSets[1][FORWARD];
+ }
+ unsigned lastSeedOffsetForReadWithFewerHits;
+ GenomeLocation lastGenomeLocationForReadWithFewerHits;
+ GenomeLocation lastGenomeLocationForReadWithMoreHits;
+ unsigned lastSeedOffsetForReadWithMoreHits;
+ bool outOfMoreHitsLocations = false;
+ //
+ // Seed the intersection state by doing a first lookup.
+ //
+ if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) {
+ //
+ // No hits in this direction.
+ //
+ continue; // The outer loop over set pairs.
+ }
+ lastGenomeLocationForReadWithMoreHits = InvalidGenomeLocation;
+ //
+ // Loop over the candidates for the read with more hits. At the top of the loop, we have a candidate but don't know if it has
+ // a mate. Each pass through the loop considers a single hit on the read with fewer hits.
+ //
+ for (;;) {
+ //
+ // Loop invariant: lastGenomeLocationForReadWithFewerHits is the highest genome offset that has not been considered.
+ // lastGenomeLocationForReadWithMoreHits is also the highest genome offset on that side that has not been
+ // considered (or is InvalidGenomeLocation), but higher ones within the appropriate range might already be in scoringMateCandidates.
+ // We go once through this loop for each hit on the read with fewer hits.
+ //
+ if (lastGenomeLocationForReadWithMoreHits > lastGenomeLocationForReadWithFewerHits + maxSpacing) {
+ //
+ // The more hits side is too high to be a mate candidate for the fewer hits side. Move it down to the largest
+ // location that's not too high.
+ //
+ if (!setPair[readWithMoreHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithFewerHits + maxSpacing,
+ &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) {
+ break; // End of all of the mates. We're done with this set pair.
+ }
+ }
+ if ((lastGenomeLocationForReadWithMoreHits + maxSpacing < lastGenomeLocationForReadWithFewerHits || outOfMoreHitsLocations) &&
+ (0 == lowestFreeScoringMateCandidate[whichSetPair] ||
+ !genomeLocationIsWithin(scoringMateCandidates[whichSetPair][lowestFreeScoringMateCandidate[whichSetPair]-1].readWithMoreHitsGenomeLocation, lastGenomeLocationForReadWithFewerHits, maxSpacing))) {
+ //
+ // No mates for the hit on the read with fewer hits. Skip to the next candidate.
+ //
+ if (outOfMoreHitsLocations) {
+ //
+ // Nothing left on the more hits side, we're done with this set pair.
+ //
+ break;
+ }
+ if (!setPair[readWithFewerHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithMoreHits + maxSpacing, &lastGenomeLocationForReadWithFewerHits,
+ &lastSeedOffsetForReadWithFewerHits)) {
+ //
+ // No more candidates on the read with fewer hits side. We're done with this set pair.
+ //
+ break;
+ }
+ continue;
+ }
+ //
+ // Add all of the mate candidates for this fewer side hit.
+ //
+ GenomeLocation previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits;
+ while (lastGenomeLocationForReadWithMoreHits + maxSpacing >= lastGenomeLocationForReadWithFewerHits && !outOfMoreHitsLocations) {
+ unsigned bestPossibleScoreForReadWithMoreHits;
+ if (noTruncation) {
+ bestPossibleScoreForReadWithMoreHits = 0;
+ } else {
+ bestPossibleScoreForReadWithMoreHits = setPair[readWithMoreHits]->computeBestPossibleScoreForCurrentHit();
+ }
+ if (lowestFreeScoringMateCandidate[whichSetPair] >= scoringCandidatePoolSize / NUM_READS_PER_PAIR) {
+ WriteErrorMessage("Ran out of scoring candidate pool entries. Perhaps trying with a larger value of -mcp will help.\n");
+ soft_exit(1);
+ }
+ scoringMateCandidates[whichSetPair][lowestFreeScoringMateCandidate[whichSetPair]].init(
+ lastGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits);
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("SetPair %d, added more hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n",
+ whichSetPair, lowestFreeScoringMateCandidate[whichSetPair], lastGenomeLocationForReadWithMoreHits,
+ bestPossibleScoreForReadWithMoreHits,
+ lastSeedOffsetForReadWithMoreHits);
+ }
+#endif // _DEBUG
+ lowestFreeScoringMateCandidate[whichSetPair]++;
+ previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits;
+ if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) {
+ lastGenomeLocationForReadWithMoreHits = 0;
+ outOfMoreHitsLocations = true;
+ break; // out of the loop looking for candidates on the more hits side.
+ }
+ }
+ //
+ // And finally add the hit from the fewer hit side. To compute its best possible score, we need to look at all of the mates; we couldn't do it in the
+ // loop immediately above because some of them might have already been in the mate list from a different, nearby fewer hit location.
+ //
+ unsigned bestPossibleScoreForReadWithFewerHits;
+ if (noTruncation) {
+ bestPossibleScoreForReadWithFewerHits = 0;
+ } else {
+ bestPossibleScoreForReadWithFewerHits = setPair[readWithFewerHits]->computeBestPossibleScoreForCurrentHit();
+ }
+ unsigned lowestBestPossibleScoreOfAnyPossibleMate = maxK + extraSearchDepth;
+ for (int i = lowestFreeScoringMateCandidate[whichSetPair] - 1; i >= 0; i--) {
+ if (scoringMateCandidates[whichSetPair][i].readWithMoreHitsGenomeLocation > lastGenomeLocationForReadWithFewerHits + maxSpacing) {
+ break;
+ }
+ lowestBestPossibleScoreOfAnyPossibleMate = __min(lowestBestPossibleScoreOfAnyPossibleMate, scoringMateCandidates[whichSetPair][i].bestPossibleScore);
+ }
+ if (lowestBestPossibleScoreOfAnyPossibleMate + bestPossibleScoreForReadWithFewerHits <= maxK + extraSearchDepth) {
+ //
+ // There's a set of ends that we can't prove doesn't have too large of a score. Allocate a fewer hit candidate and stick it in the
+ // correct weight list.
+ //
+ if (lowestFreeScoringCandidatePoolEntry >= scoringCandidatePoolSize) {
+ WriteErrorMessage("Ran out of scoring candidate pool entries. Perhaps rerunning with a larger value of -mcp will help.\n");
+ soft_exit(1);
+ }
+ //
+ // If we have noOrderedEvaluation set, just stick everything on list 0, regardless of what it really is. This will cause us to
+ // evaluate the candidates in more-or-less inverse genome order.
+ //
+ unsigned bestPossibleScore = noOrderedEvaluation ? 0 : lowestBestPossibleScoreOfAnyPossibleMate + bestPossibleScoreForReadWithFewerHits;
+ scoringCandidatePool[lowestFreeScoringCandidatePoolEntry].init(lastGenomeLocationForReadWithFewerHits, whichSetPair, lowestFreeScoringMateCandidate[whichSetPair] - 1,
+ lastSeedOffsetForReadWithFewerHits, bestPossibleScoreForReadWithFewerHits,
+ scoringCandidates[bestPossibleScore]);
+ scoringCandidates[bestPossibleScore] = &scoringCandidatePool[lowestFreeScoringCandidatePoolEntry];
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("SetPair %d, added fewer hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n",
+ whichSetPair, lowestFreeScoringCandidatePoolEntry, lastGenomeLocationForReadWithFewerHits,
+ lowestBestPossibleScoreOfAnyPossibleMate + bestPossibleScoreForReadWithFewerHits,
+ lastSeedOffsetForReadWithFewerHits);
+ }
+#endif // _DEBUG
+ lowestFreeScoringCandidatePoolEntry++;
+ maxUsedBestPossibleScoreList = max(maxUsedBestPossibleScoreList, bestPossibleScore);
+ }
+ if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) {
+ break;
+ }
+ }
+ } // For each set pair
+ //
+ // Phase 3: score and merge the candidates we've found.
+ //
+ unsigned currentBestPossibleScoreList = 0;
+ scoreLimit = maxK + extraSearchDepth;
+ //
+ // Loop until we've scored all of the candidates, or proven that what's left must have too high of a score to be interesting.
+ //
+ while (currentBestPossibleScoreList <= maxUsedBestPossibleScoreList && currentBestPossibleScoreList <= scoreLimit) {
+ if (scoringCandidates[currentBestPossibleScoreList] == NULL) {
+ //
+ // No more candidates on this list. Skip to the next one.
+ //
+ currentBestPossibleScoreList++;
+ continue;
+ }
+ //
+ // Grab the first candidate on the highest list and score it.
+ //
+ ScoringCandidate *candidate = scoringCandidates[currentBestPossibleScoreList];
+ unsigned fewerEndScore;
+ double fewerEndMatchProbability;
+ int fewerEndGenomeLocationOffset;
+ scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsGenomeLocation,
+ candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset);
+ _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore);
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("Scored fewer end candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n", (int)(candidate - scoringCandidatePool),
+ candidate->whichSetPair, readWithFewerHits, candidate->readWithFewerHitsGenomeLocation, candidate->seedOffset,
+ scoreLimit, fewerEndScore, fewerEndGenomeLocationOffset);
+ }
+#endif // DEBUG
+ if (fewerEndScore != -1) {
+ //
+ // Find and score mates. The index in scoringMateCandidateIndex is the lowest mate (i.e., the highest index number).
+ //
+ unsigned mateIndex = candidate->scoringMateCandidateIndex;
+ for (;;) {
+ ScoringMateCandidate *mate = &scoringMateCandidates[candidate->whichSetPair][mateIndex];
+ _ASSERT(genomeLocationIsWithin(mate->readWithMoreHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation, maxSpacing));
+ if (!genomeLocationIsWithin(mate->readWithMoreHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation, minSpacing) && mate->bestPossibleScore <= scoreLimit - fewerEndScore) {
+ //
+ // It's within the range and not necessarily too poor of a match. Consider it.
+ //
+ //
+ // If we haven't yet scored this mate, or we've scored it and not gotten an answer, but had a higher score limit than we'd
+ // use now, score it.
+ //
+ if (mate->score == -2 || mate->score == -1 && mate->scoreLimit < scoreLimit - fewerEndScore) {
+ scoreLocation(readWithMoreHits, setPairDirection[candidate->whichSetPair][readWithMoreHits], mate->readWithMoreHitsGenomeLocation,
+ mate->seedOffset, scoreLimit - fewerEndScore, &mate->score, &mate->matchProbability,
+ &mate->genomeOffset);
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("Scored mate candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n",
+ (int)(mate - scoringMateCandidates[candidate->whichSetPair]), candidate->whichSetPair, readWithMoreHits, mate->readWithMoreHitsGenomeLocation,
+ mate->seedOffset, scoreLimit - fewerEndScore, mate->score, mate->genomeOffset);
+ }
+#endif // _DEBUG
+ _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore);
+ mate->scoreLimit = scoreLimit - fewerEndScore;
+ }
+ if (mate->score != -1) {
+ double pairProbability = mate->matchProbability * fewerEndMatchProbability;
+ unsigned pairScore = mate->score + fewerEndScore;
+ //
+ // See if this should be ignored as a merge, or if we need to back out a previously scored location
+ // because it's a worse version of this location.
+ //
+ MergeAnchor *mergeAnchor = candidate->mergeAnchor;
+ if (NULL == mergeAnchor) {
+ //
+ // Look up and down the array of candidates to see if we have possible merge candidates.
+ //
+ for (ScoringCandidate *mergeCandidate = candidate - 1;
+ mergeCandidate >= scoringCandidatePool &&
+ genomeLocationIsWithin(mergeCandidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, 50) &&
+ mergeCandidate->whichSetPair == candidate->whichSetPair;
+ mergeCandidate--) {
+ if (mergeCandidate->mergeAnchor != NULL) {
+ candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor;
+ break;
+ }
+ }
+ if (NULL == mergeAnchor) {
+ //
+ // Scan upward (toward higher pool indices) from the candidate. The scan has to advance with ++: stepping
+ // downward from candidate + 1 would only revisit the entries the previous loop already rejected and, having
+ // no lower bound check, could walk off the front of scoringCandidatePool.
+ //
+ for (ScoringCandidate *mergeCandidate = candidate + 1;
+ mergeCandidate < scoringCandidatePool + lowestFreeScoringCandidatePoolEntry &&
+ genomeLocationIsWithin(mergeCandidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, 50) &&
+ mergeCandidate->whichSetPair == candidate->whichSetPair;
+ mergeCandidate++) {
+ if (mergeCandidate->mergeAnchor != NULL) {
+ candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor;
+ break;
+ }
+ }
+ }
+ }
+ bool merged;
+ double oldPairProbability;
+ if (NULL == mergeAnchor) {
+ if (firstFreeMergeAnchor >= mergeAnchorPoolSize) {
+ WriteErrorMessage("Ran out of merge anchor pool entries. Perhaps rerunning with a larger value of -mcp will help\n");
+ soft_exit(1);
+ }
+ mergeAnchor = &mergeAnchorPool[firstFreeMergeAnchor];
+ firstFreeMergeAnchor++;
+ mergeAnchor->init(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset,
+ pairProbability, pairScore);
+ merged = false;
+ oldPairProbability = 0;
+ candidate->mergeAnchor = mergeAnchor;
+ } else {
+ merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset,
+ pairProbability, pairScore, &oldPairProbability);
+ }
+ if (!merged) {
+ //
+ // Back out the probability of the old match that we're merged with, if any. The max
+ // is necessary because a + b - b is not necessarily a in floating point. If there
+ // was no merge, the oldPairProbability is 0.
+ //
+ probabilityOfAllPairs = __max(0, probabilityOfAllPairs - oldPairProbability);
+ bool isBestHit = false;
+ if (pairScore <= maxK && (pairScore < bestPairScore ||
+ (pairScore == bestPairScore && pairProbability > probabilityOfBestPair))) {
+ //
+ // A new best hit.
+ //
+ //
+ // In this branch pairScore <= bestPairScore, so the gap between the old best and the new best is
+ // bestPairScore - pairScore. Subtracting the other way around wraps in unsigned arithmetic and would
+ // only ever save the old best when the two scores are equal.
+ //
+ if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= bestPairScore - pairScore) {
+ //
+ // Move the old best to be a secondary alignment. This won't happen on the first time we get a valid alignment,
+ // because bestPairScore is initialized to be very large.
+ //
+ //
+ if (*nSecondaryResults >= secondaryResultBufferSize) {
+ WriteErrorMessage("IntersectingPairedEndAligner::align(): out of secondary result buffer\n");
+ soft_exit(1);
+ }
+ PairedAlignmentResult *result = &secondaryResults[*nSecondaryResults];
+ result->alignedAsPair = true;
+ result->fromAlignTogether = true;
+ for (int r = 0; r < NUM_READS_PER_PAIR; r++) {
+ result->direction[r] = bestResultDirection[r];
+ result->location[r] = bestResultGenomeLocation[r];
+ result->mapq[r] = 0;
+ result->score[r] = bestResultScore[r];
+ result->status[r] = MultipleHits;
+ }
+ (*nSecondaryResults)++;
+ }
+ bestPairScore = pairScore;
+ probabilityOfBestPair = pairProbability;
+ bestResultGenomeLocation[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset;
+ bestResultGenomeLocation[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset;
+ bestResultScore[readWithFewerHits] = fewerEndScore;
+ bestResultScore[readWithMoreHits] = mate->score;
+ bestResultDirection[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits];
+ bestResultDirection[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits];
+ if (!noUkkonen) {
+ scoreLimit = bestPairScore + extraSearchDepth;
+ }
+ isBestHit = true;
+ } else {
+ if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= pairScore - bestPairScore) {
+ //
+ // A secondary result to save.
+ //
+ if (*nSecondaryResults >= secondaryResultBufferSize) {
+ WriteErrorMessage("IntersectingPairedEndAligner::align(): out of secondary result buffer. Read ID %.*s\n", read0->getIdLength(), read0->getId());
+ soft_exit(1);
+ }
+ PairedAlignmentResult *result = &secondaryResults[*nSecondaryResults];
+ result->alignedAsPair = true;
+ result->direction[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits];
+ result->direction[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits];
+ result->fromAlignTogether = true;
+ result->location[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset;
+ result->location[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset;
+ result->mapq[0] = result->mapq[1] = 0;
+ result->score[readWithMoreHits] = mate->score;
+ result->score[readWithFewerHits] = fewerEndScore;
+ result->status[readWithFewerHits] = result->status[readWithMoreHits] = MultipleHits;
+ (*nSecondaryResults)++;
+ }
+ }
+ probabilityOfAllPairs += pairProbability;
+ #ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("Added %e (= %e * %e) @ (%u, %u), giving new probability of all pairs %e, score %d = %d + %d%s\n",
+ pairProbability, mate->matchProbability , fewerEndMatchProbability,
+ candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, mate->readWithMoreHitsGenomeLocation + mate->genomeOffset,
+ probabilityOfAllPairs,
+ pairScore, fewerEndScore, mate->score, isBestHit ? " New best hit" : "");
+ }
+ #endif // _DEBUG
+ if (probabilityOfAllPairs >= 4.9 && -1 == maxEditDistanceForSecondaryResults) {
+ //
+ // Nothing will rescue us from a 0 MAPQ, so just stop looking.
+ //
+ goto doneScoring;
+ }
+ }
+ }// if the mate has a non -1 score
+ }
+ if (mateIndex == 0 || !genomeLocationIsWithin(scoringMateCandidates[candidate->whichSetPair][mateIndex-1].readWithMoreHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation, maxSpacing)) {
+ //
+ // Out of mate candidates.
+ //
+ break;
+ }
+ mateIndex--;
+ }
+ }
+ //
+ // Remove us from the head of the list and proceed to the next candidate to score.
+ //
+ scoringCandidates[currentBestPossibleScoreList] = candidate->scoreListNext;
+ }
+ if (bestPairScore == 65536) {
+ //
+ // Found nothing.
+ //
+ for (unsigned whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+ result->location[whichRead] = InvalidGenomeLocation;
+ result->mapq[whichRead] = 0;
+ result->score[whichRead] = -1;
+ result->status[whichRead] = NotFound;
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("No sufficiently good pairs found.\n");
+ }
+#endif // DEBUG
+ }
+ } else {
+ for (unsigned whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+ result->location[whichRead] = bestResultGenomeLocation[whichRead];
+ result->direction[whichRead] = bestResultDirection[whichRead];
+ result->mapq[whichRead] = computeMAPQ(probabilityOfAllPairs, probabilityOfBestPair, bestResultScore[whichRead], popularSeedsSkipped[0] + popularSeedsSkipped[1]);
+ result->status[whichRead] = result->mapq[whichRead] > MAPQ_LIMIT_FOR_SINGLE_HIT ? SingleHit : MultipleHits;
+ result->score[whichRead] = bestResultScore[whichRead];
+ }
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("Returned %u %s %u %s with MAPQ %d and %d, probability of all pairs %e, probability of best pair %e\n",
+ result->location[0], result->direction[0] == RC ? "RC" : "", result->location[1], result->direction[1] == RC ? "RC" : "", result->mapq[0], result->mapq[1],
+ probabilityOfAllPairs, probabilityOfBestPair);
+ }
+#endif // DEBUG
+ }
+ //
+ // Get rid of any secondary results that are too far away from the best score. (NB: the rest of the code in align() is very similar to BaseAligner::finalizeSecondaryResults. Sorry)
+ //
+ int i = 0;
+ while (i < *nSecondaryResults) {
+ if ((int)(secondaryResults[i].score[0] + secondaryResults[i].score[1]) > (int)bestPairScore + maxEditDistanceForSecondaryResults) {
+ secondaryResults[i] = secondaryResults[(*nSecondaryResults) - 1];
+ (*nSecondaryResults)--;
+ } else {
+ i++;
+ }
+ }
+ //
+ // Now check to see if there are too many for any particular contig.
+ //
+ if (maxSecondaryAlignmentsPerContig > 0 && result->status[0] != NotFound) {
+ //
+ // Run through the results and count the number of results per contig, to see if any of them are too big.
+ // First, record the primary result.
+ //
+ bool anyContigHasTooManyResults = false;
+ contigCountEpoch++;
+ int primaryContigNum = genome->getContigNumAtLocation(result->location[0]);
+ hitsPerContigCounts[primaryContigNum].hits = 1;
+ hitsPerContigCounts[primaryContigNum].epoch = contigCountEpoch;
+ for (i = 0; i < *nSecondaryResults; i++) {
+ int contigNum = genome->getContigNumAtLocation(secondaryResults[i].location[0]); // We know they're on the same contig, so either will do
+ if (hitsPerContigCounts[contigNum].epoch != contigCountEpoch) {
+ hitsPerContigCounts[contigNum].epoch = contigCountEpoch;
+ hitsPerContigCounts[contigNum].hits = 0;
+ }
+ hitsPerContigCounts[contigNum].hits++;
+ if (hitsPerContigCounts[contigNum].hits > maxSecondaryAlignmentsPerContig) {
+ anyContigHasTooManyResults = true;
+ break;
+ }
+ }
+ if (anyContigHasTooManyResults) {
+ //
+ // Just sort them all, in order of contig then hit depth.
+ //
+ qsort(secondaryResults, *nSecondaryResults, sizeof(*secondaryResults), PairedAlignmentResult::compareByContigAndScore);
+ //
+ // Now run through and eliminate any contigs with too many hits. We can't use the same trick at the first loop above, because the
+ // counting here relies on the results being sorted. So, instead, we just copy them as we go.
+ //
+ int currentContigNum = -1;
+ int currentContigCount = 0;
+ int destResult = 0;
+ for (int sourceResult = 0; sourceResult < *nSecondaryResults; sourceResult++) {
+ int contigNum = genome->getContigNumAtLocation(secondaryResults[sourceResult].location[0]);
+ if (contigNum != currentContigNum) {
+ currentContigNum = contigNum;
+ currentContigCount = (contigNum == primaryContigNum) ? 1 : 0;
+ }
+ currentContigCount++;
+ if (currentContigCount <= maxSecondaryAlignmentsPerContig) {
+ //
+ // Keep it. If we don't get here, then we don't copy the result and
+ // don't increment destResult. And yes, this will sometimes copy a
+ // result over itself. That's harmless.
+ //
+ secondaryResults[destResult] = secondaryResults[sourceResult];
+ destResult++;
+ }
+ } // for each source result
+ *nSecondaryResults = destResult;
+ }
+ } // if we're limiting by contig
+ if (*nSecondaryResults > maxSecondaryResultsToReturn) {
+ qsort(secondaryResults, *nSecondaryResults, sizeof(*secondaryResults), PairedAlignmentResult::compareByScore);
+ *nSecondaryResults = maxSecondaryResultsToReturn; // Just truncate it
+ }
+ void // NOTE(review): the function-name line and the function's braces appear to be missing from this copy of the patch; the parameter list matches the scoreLocation() calls made from align(). Scores one read at one candidate location.
+ unsigned whichRead, // Which read of the pair to score; first index into reads[][] and reversedRead[][].
+ Direction direction, // Orientation to score; second index into reads[][] and reversedRead[][].
+ GenomeLocation genomeLocation, // Candidate location; start of the genome substring fetched below.
+ unsigned seedOffset, // Offset within the read of the seed that produced this candidate.
+ unsigned scoreLimit, // Edit distance budget shared by the tail and head scoring passes.
+ unsigned *score, // Out: score1 + score2 on success, -1 if either pass fails or the genome data is unavailable.
+ double *matchProbability, // Out: match probability for the whole read; 0 whenever *score is -1.
+ int *genomeLocationOffset) // Out: only passed to (and presumably written by) the reverse (head) scoring pass.
+ nLocationsScored++;
+ Read *readToScore = reads[whichRead][direction];
+ unsigned readDataLength = readToScore->getDataLength();
+ GenomeDistance genomeDataLength = readDataLength + MAX_K; // Leave extra space in case the read has deletions
+ const char *data = genome->getSubstring(genomeLocation, genomeDataLength);
+#if 0 // This only happens when genomeLocation is in the padding, which can lead to no good. Just say no.
+ if (NULL == data) {
+ //
+ // We're up against the end of a contig. Reduce the extra space enough that it isn't too
+ // long. We're willing to reduce it to less than the length of a read, because the read could
+ // butt up against the end of the contig and have insertions in it.
+ //
+ const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation);
+ GenomeLocation endLocation;
+ if (genomeLocation + readDataLength + MAX_K >= genome->getCountOfBases()) {
+ endLocation = genome->getCountOfBases();
+ } else {
+ const Genome::Contig *nextContig = genome->getContigAtLocation(genomeLocation + readDataLength + MAX_K);
+ _ASSERT(NULL != contig && contig->beginningLocation <= genomeLocation && contig != nextContig);
+ endLocation = nextContig->beginningLocation;
+ }
+ genomeDataLength = endLocation - genomeLocation - 1;
+ if (genomeDataLength >= readDataLength - MAX_K) {
+ data = genome->getSubstring(genomeLocation, genomeDataLength);
+ _ASSERT(NULL != data);
+ }
+ }
+#endif // 0 This only happens when genomeLocation is in the padding, which can lead to no good. Just say no.
+ if (NULL == data) {
+ *score = -1;
+ *matchProbability = 0;
+ return; // NOTE(review): *genomeLocationOffset is left unwritten on this path; callers must not read it when *score is -1.
+ }
+ // Compute the distance separately in the forward and backward directions from the seed, to allow
+ // arbitrary offsets at both the start and end but not have to pay the cost of exploring all start
+ // shifts in BoundedStringDistance
+ double matchProb1, matchProb2;
+ int score1, score2;
+ // First, do the forward direction from where the seed aligns to past of it
+ int readLen = readToScore->getDataLength();
+ int seedLen = index->getSeedLength();
+ int tailStart = seedOffset + seedLen; // First read base after the seed; the tail pass starts here.
+ _ASSERT(!memcmp(data+seedOffset, readToScore->getData() + seedOffset, seedLen)); // that the seed actually matches
+ int textLen;
+ if (genomeDataLength - tailStart > INT32_MAX) {
+ textLen = INT32_MAX; // Clamp so the length fits in the int passed to computeEditDistance.
+ } else {
+ textLen = (int)(genomeDataLength - tailStart);
+ }
+ score1 = landauVishkin->computeEditDistance(data + tailStart, textLen, readToScore->getData() + tailStart, readToScore->getQuality() + tailStart, readLen - tailStart,
+ scoreLimit, &matchProb1);
+ if (score1 == -1) {
+ *score = -1; // Tail did not fit within scoreLimit. NOTE(review): *genomeLocationOffset is not written on this path either.
+ } else {
+ // The tail of the read matched; now let's reverse the reference genome data and match the head
+ int limitLeft = scoreLimit - score1; // Whatever budget the tail did not use is available to the head.
+ score2 = reverseLandauVishkin->computeEditDistance(data + seedOffset, seedOffset + MAX_K, reversedRead[whichRead][direction] + readLen - seedOffset,
+ reads[whichRead][OppositeDirection(direction)]->getQuality() + readLen - seedOffset, seedOffset, limitLeft, &matchProb2, genomeLocationOffset);
+ if (score2 == -1) {
+ *score = -1;
+ } else {
+ *score = score1 + score2;
+ _ASSERT(*score <= scoreLimit);
+ // Map probabilities for substrings can be multiplied, but make sure to count seed too
+ *matchProbability = matchProb1 * matchProb2 * pow(1 - SNP_PROB, seedLen);
+ }
+ }
+ if (*score == -1) {
+ *matchProbability = 0; // A failed score always reports zero probability, whichever pass failed.
+ }
+ void
+ IntersectingPairedEndAligner::HashTableHitSet::firstInit(unsigned maxSeeds_, unsigned maxMergeDistance_, BigAllocator *allocator, bool doesGenomeIndexHave64BitLocations_)
+ {
+     //
+     // One-time setup: remember the configuration and take the lookup and disjoint hit set arrays
+     // from the supplied allocator. Only the lookup array that matches the index's location width
+     // is allocated; the other pointer is left NULL.
+     //
+     doesGenomeIndexHave64BitLocations = doesGenomeIndexHave64BitLocations_;
+     maxMergeDistance = maxMergeDistance_;
+     maxSeeds = maxSeeds_;
+     nLookupsUsed = 0;
+     lookups32 = NULL;
+     lookups64 = NULL;
+     if (doesGenomeIndexHave64BitLocations) {
+         lookups64 = (HashTableLookup<GenomeLocation> *)allocator->allocate(sizeof(HashTableLookup<GenomeLocation>) * maxSeeds);
+     } else {
+         lookups32 = (HashTableLookup<unsigned> *)allocator->allocate(sizeof(HashTableLookup<unsigned>) * maxSeeds);
+     }
+     disjointHitSets = (DisjointHitSet *)allocator->allocate(sizeof(DisjointHitSet) * maxSeeds);
+ }
+ void // NOTE(review): the function-name line and braces are missing from this copy of the patch; presumably this is the HashTableHitSet reset run before a new set of lookups is recorded — confirm against the header.
+ nLookupsUsed = 0; // Forget every lookup recorded so far.
+ currentDisjointHitSet = -1; // No disjoint hit set yet; recordLookup() increments this when one begins.
+ if (doesGenomeIndexHave64BitLocations) {
+ lookupListHead64->nextLookupWithRemainingMembers = lookupListHead64->prevLookupWithRemainingMembers = lookupListHead64; // Empty circular list: the head links to itself.
+ lookupListHead32->nextLookupWithRemainingMembers = lookupListHead32->prevLookupWithRemainingMembers = NULL; // The 32-bit list is not used in this mode.
+ } else {
+ lookupListHead32->nextLookupWithRemainingMembers = lookupListHead32->prevLookupWithRemainingMembers = lookupListHead32; // Empty circular list: the head links to itself.
+ lookupListHead64->nextLookupWithRemainingMembers = lookupListHead64->prevLookupWithRemainingMembers = NULL; // The 64-bit list is not used in this mode.
+ }
+// recordLookup() needs two versions, one for 32-bit and one for 64-bit genome locations. The options were copying the code or generating both from a macro
+// that takes the types as parameters. The macro was chosen: ugly, but the two versions cannot accidentally diverge, and it is isolated to the HashTableHitSet class.
+// Each expansion records one seed lookup: when beginsDisjointHitSet is set it starts a new disjoint hit set; a lookup with no hits only bumps that set's countOfExhaustedHits; otherwise the hits are stored, trailing hits smaller than seedOffset are trimmed, and the lookup is linked in right after the list head. NOTE(review): the line closing the generated function body appears to be missing from this copy of the patch.
+#define RL(lookups, glType, lookupListHead) \
+ void \
+IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _int64 nHits, const glType *hits, bool beginsDisjointHitSet) \
+{ \
+ _ASSERT(nLookupsUsed < maxSeeds); \
+ if (beginsDisjointHitSet) { \
+ currentDisjointHitSet++; \
+ _ASSERT(currentDisjointHitSet < (int)maxSeeds); \
+ disjointHitSets[currentDisjointHitSet].countOfExhaustedHits = 0; \
+ } \
+ \
+ if (0 == nHits) { \
+ disjointHitSets[currentDisjointHitSet].countOfExhaustedHits++; \
+ } else { \
+ _ASSERT(currentDisjointHitSet != -1); /* Essentially that beginsDisjointHitSet is set for the first recordLookup call */ \
+ lookups[nLookupsUsed].currentHitForIntersection = 0; \
+ lookups[nLookupsUsed].hits = hits; \
+ lookups[nLookupsUsed].nHits = nHits; \
+ lookups[nLookupsUsed].seedOffset = seedOffset; \
+ lookups[nLookupsUsed].whichDisjointHitSet = currentDisjointHitSet; \
+ \
+ /* Trim off any hits that are smaller than seedOffset, since they are clearly meaningless. */ \
+ \
+ while (lookups[nLookupsUsed].nHits > 0 && lookups[nLookupsUsed].hits[lookups[nLookupsUsed].nHits - 1] < lookups[nLookupsUsed].seedOffset) { \
+ lookups[nLookupsUsed].nHits--; \
+ } \
+ \
+ /* Add this lookup into the non-empty lookup list. */ \
+ \
+ lookups[nLookupsUsed].prevLookupWithRemainingMembers = lookupListHead; \
+ lookups[nLookupsUsed].nextLookupWithRemainingMembers = lookupListHead->nextLookupWithRemainingMembers; \
+ lookups[nLookupsUsed].prevLookupWithRemainingMembers->nextLookupWithRemainingMembers = \
+ lookups[nLookupsUsed].nextLookupWithRemainingMembers->prevLookupWithRemainingMembers = &lookups[nLookupsUsed]; \
+ \
+ if (doAlignerPrefetch) { \
+ _mm_prefetch((const char *)&lookups[nLookupsUsed].hits[lookups[nLookupsUsed].nHits / 2], _MM_HINT_T2); \
+ } \
+ \
+ nLookupsUsed++; \
+ } \
+RL(lookups32, unsigned, lookupListHead32)
+RL(lookups64, GenomeLocation, lookupListHead64)
+#undef RL
+ unsigned // NOTE(review): the function-name line and braces are missing from this copy of the patch. Returns the largest miss count of any disjoint hit set, measured against mostRecentLocationReturned.
+ //
+ // Now compute the best possible score for the hit. This is the largest number of misses in any disjoint hit set.
+ // Each set starts from the number of its lookups that had no hits at all (countOfExhaustedHits).
+ //
+ for (int i = 0; i <= currentDisjointHitSet; i++) {
+ disjointHitSets[i].missCount = disjointHitSets[i].countOfExhaustedHits;
+ }
+ //
+ // Another macro. Sorry again. It walks the list of lookups with remaining members and charges a miss to a lookup's disjoint hit set
+ // when neither its current hit nor the hit just before it is within maxMergeDistance of mostRecentLocationReturned + seedOffset.
+ //
+#define loop(glType, lookupListHead) \
+ for (HashTableLookup<glType> *lookup = lookupListHead->nextLookupWithRemainingMembers; lookup != lookupListHead; \
+ lookup = lookup->nextLookupWithRemainingMembers) { \
+ \
+ if (!(lookup->currentHitForIntersection != lookup->nHits && \
+ genomeLocationIsWithin(lookup->hits[lookup->currentHitForIntersection], mostRecentLocationReturned + lookup->seedOffset, maxMergeDistance) || \
+ lookup->currentHitForIntersection != 0 && \
+ genomeLocationIsWithin(lookup->hits[lookup->currentHitForIntersection-1], mostRecentLocationReturned + lookup->seedOffset, maxMergeDistance))) { \
+ \
+ /* This one was not close enough. */ \
+ \
+ disjointHitSets[lookup->whichDisjointHitSet].missCount++; \
+ } \
+ }
+ if (doesGenomeIndexHave64BitLocations) {
+ loop(GenomeLocation, lookupListHead64);
+ } else {
+ loop(unsigned, lookupListHead32);
+ }
+#undef loop
+ unsigned bestPossibleScoreSoFar = 0;
+ for (int i = 0; i <= currentDisjointHitSet; i++) {
+ bestPossibleScoreSoFar = max(bestPossibleScoreSoFar, disjointHitSets[i].missCount); // The worst disjoint hit set bounds the score from below.
+ }
+ return bestPossibleScoreSoFar;
+ bool // Returns true if some lookup has a hit whose seed-adjusted location is <= maxGenomeLocationToFind; the largest such location and its seed offset are written to the out parameters. NOTE(review): the braces are missing from this copy of the patch.
+IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound)
+ bool anyFound = false;
+ GenomeLocation bestLocationFound = 0; // Largest (hit - seedOffset) accepted so far across all lookups.
+ for (unsigned i = 0; i < nLookupsUsed; i++) {
+ //
+ // Binary search from the current starting offset to either the right place or the end.
+ //
+ _int64 limit[2]; // Inclusive [low, high] index bounds of the search window within this lookup's hits.
+ GenomeLocation maxGenomeLocationToFindThisSeed; // The hits are seed locations, so shift the target by this lookup's seed offset.
+ if (doesGenomeIndexHave64BitLocations) {
+ limit[0] = (_int64)lookups64[i].currentHitForIntersection;
+ limit[1] = (_int64)lookups64[i].nHits - 1;
+ maxGenomeLocationToFindThisSeed = maxGenomeLocationToFind + lookups64[i].seedOffset;
+ } else {
+ limit[0] = (_int64)lookups32[i].currentHitForIntersection;
+ limit[1] = (_int64)lookups32[i].nHits - 1;
+ maxGenomeLocationToFindThisSeed = maxGenomeLocationToFind + lookups32[i].seedOffset;
+ }
+ while (limit[0] <= limit[1]) {
+ _int64 probe = (limit[0] + limit[1]) / 2;
+ if (doAlignerPrefetch) { // not clear this helps. We're probably not far enough ahead.
+ if (doesGenomeIndexHave64BitLocations) {
+ _mm_prefetch((const char *)&lookups64[i].hits[(limit[0] + probe) / 2 - 1], _MM_HINT_T2);
+ _mm_prefetch((const char *)&lookups64[i].hits[(limit[1] + probe) / 2 + 1], _MM_HINT_T2);
+ } else {
+ _mm_prefetch((const char *)&lookups32[i].hits[(limit[0] + probe) / 2 - 1], _MM_HINT_T2);
+ _mm_prefetch((const char *)&lookups32[i].hits[(limit[1] + probe) / 2 + 1], _MM_HINT_T2);
+ }
+ }
+ //
+ // Recall that the hit sets are sorted from largest to smallest, so the strange looking logic is actually right.
+ // We're evaluating the expression "lookups[i].hits[probe] <= maxGenomeOffsetToFindThisSeed && (probe == 0 || lookups[i].hits[probe-1] > maxGenomeOffsetToFindThisSeed)"
+ // It's written in this strange way just so the profile tool will show us where the time's going.
+ //
+ GenomeLocation probeHit;
+ GenomeLocation probeMinusOneHit;
+ unsigned seedOffset;
+ if (doesGenomeIndexHave64BitLocations) {
+ probeHit = lookups64[i].hits[probe];
+ probeMinusOneHit = lookups64[i].hits[probe-1]; // NOTE(review): reads hits[-1] when probe == 0; the value is ignored in that case because clause2 short-circuits the test below.
+ seedOffset = lookups64[i].seedOffset;
+ } else {
+ probeHit = lookups32[i].hits[probe];
+ probeMinusOneHit = lookups32[i].hits[probe-1]; // NOTE(review): reads hits[-1] when probe == 0; the value is ignored in that case because clause2 short-circuits the test below.
+ seedOffset = lookups32[i].seedOffset;
+ }
+ unsigned clause1 = probeHit <= maxGenomeLocationToFindThisSeed;
+ unsigned clause2 = probe == 0;
+ if (clause1 && (clause2 || probeMinusOneHit > maxGenomeLocationToFindThisSeed)) {
+ if (probeHit - seedOffset > bestLocationFound) { // Strictly greater: a seed-adjusted location of 0 is never reported, since bestLocationFound starts at 0.
+ anyFound = true;
+ mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset;
+ *seedOffsetFound = seedOffset;
+ }
+ if (doesGenomeIndexHave64BitLocations) {
+ lookups64[i].currentHitForIntersection = probe; // Remember the position so later searches in this lookup start from here.
+ } else {
+ lookups32[i].currentHitForIntersection = probe; // Remember the position so later searches in this lookup start from here.
+ }
+ break;
+ }
+ if (probeHit > maxGenomeLocationToFindThisSeed) { // Recode this without the if to avoid the hard-to-predict branch.
+ limit[0] = probe + 1;
+ } else {
+ limit[1] = probe - 1;
+ }
+ } // While we're looking
+ if (limit[0] > limit[1]) {
+ // We're done with this lookup.
+ if (doesGenomeIndexHave64BitLocations) {
+ lookups64[i].currentHitForIntersection = lookups64[i].nHits;
+ } else {
+ lookups32[i].currentHitForIntersection = lookups32[i].nHits;
+ }
+ }
+ } // For each lookup
+ _ASSERT(!anyFound || *actualGenomeLocationFound <= maxGenomeLocationToFind);
+ return anyFound;
+ bool // Finds the highest seed-adjusted location among the first hits of all lookups. NOTE(review): the braces are missing from this copy of the patch.
+IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound)
+ bool anyFound = false;
+ *genomeLocation = 0; // Doubles as the running maximum inside the loop below.
+ //
+ // Yet another macro. This makes me want to write in a better language sometimes. But then it would be too slow. :-(
+ // For every lookup that has hits, it compares hits[0] - seedOffset with the best location so far and keeps the larger one.
+ //
+#define LOOP(lookups) \
+ for (unsigned i = 0; i < nLookupsUsed; i++) { \
+ if (lookups[i].nHits > 0 && lookups[i].hits[0] - lookups[i].seedOffset > GenomeLocationAsInt64(*genomeLocation)) { \
+ mostRecentLocationReturned = *genomeLocation = lookups[i].hits[0] - lookups[i].seedOffset; \
+ *seedOffsetFound = lookups[i].seedOffset; \
+ anyFound = true; \
+ } \
+ }
+ if (doesGenomeIndexHave64BitLocations) {
+ LOOP(lookups64);
+ } else {
+ LOOP(lookups32);
+ }
+#undef LOOP
+ return !anyFound; // NOTE(review): true means NO hit was found, the opposite sense of getNextLowerHit() and getNextHitLessThanOrEqualTo() — confirm that callers expect this.
+ bool // Returns true and fills the out parameters if some lookup still has a hit below mostRecentLocationReturned; returns false when none has. NOTE(review): the braces are missing from this copy of the patch.
+IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound)
+ //
+ // Look through all of the lookups and find the one with the highest location smaller than the current one.
+ //
+ GenomeLocation foundLocation = 0; // Highest seed-adjusted location accepted so far.
+ bool anyFound = false;
+ //
+ // Run through the lookups pushing up any that are at the most recently returned
+ //
+ for (unsigned i = 0; i < nLookupsUsed; i++) {
+ _int64 *currentHitForIntersection; // Points into the lookup, so the increment below advances the lookup itself.
+ _int64 nHits;
+ GenomeLocation hitLocation; // Only assigned when the lookup still has hits left.
+ unsigned seedOffset;
+ //
+ // A macro to initialize stuff that we need to avoid a bigger macro later.
+ //
+#define initVars(lookups) \
+ currentHitForIntersection = &lookups[i].currentHitForIntersection; \
+ nHits = lookups[i].nHits; \
+ seedOffset = lookups[i].seedOffset; \
+ if (nHits != *currentHitForIntersection) { \
+ hitLocation = lookups[i].hits[*currentHitForIntersection]; \
+ }
+ if (doesGenomeIndexHave64BitLocations) {
+ initVars(lookups64);
+ } else {
+ initVars(lookups32);
+ }
+#undef initVars
+ _ASSERT(*currentHitForIntersection == nHits || hitLocation - seedOffset <= mostRecentLocationReturned || hitLocation < seedOffset);
+ if (*currentHitForIntersection != nHits && hitLocation - seedOffset == mostRecentLocationReturned) {
+ (*currentHitForIntersection)++; // This lookup sits on the location returned last time; step past it.
+ if (*currentHitForIntersection == nHits) {
+ continue; // That was this lookup's last hit.
+ }
+ if (doesGenomeIndexHave64BitLocations) {
+ hitLocation = lookups64[i].hits[*currentHitForIntersection];
+ } else {
+ hitLocation = lookups32[i].hits[*currentHitForIntersection];
+ }
+ }
+ if (*currentHitForIntersection != nHits) {
+ if (foundLocation < hitLocation - seedOffset && // found location is OK
+ hitLocation >= seedOffset) // found location isn't too small to push us before the beginning of the genome
+ {
+ *genomeLocation = foundLocation = hitLocation - seedOffset;
+ *seedOffsetFound = seedOffset;
+ anyFound = true;
+ }
+ }
+ }
+ if (anyFound) {
+ mostRecentLocationReturned = foundLocation; // Only updated on success, so a failed call leaves the cursor where it was.
+ }
+ return anyFound;
+ bool // Returns true when the new pair should be ignored because an equal-or-better pair is already recorded at this anchor, false when the caller should count it. NOTE(review): the braces are missing from this copy of the patch.
+IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore,
+ double *oldMatchProbability) // Out: probability of the pair being replaced (0.0 if nothing is replaced); not written when the function returns true.
+ if (locationForReadWithMoreHits == InvalidGenomeLocation || !doesRangeMatch(newMoreHitLocation, newFewerHitLocation)) {
+ //
+ // No merge. Remember the new one.
+ //
+ locationForReadWithMoreHits = newMoreHitLocation;
+ locationForReadWithFewerHits = newFewerHitLocation;
+ matchProbability = newMatchProbability;
+ pairScore = newPairScore;
+ *oldMatchProbability = 0.0;
+ return false;
+ } else {
+ //
+ // Within merge distance. Keep the better score (or if they're tied the better match probability).
+ //
+ if (newPairScore < pairScore || newPairScore == pairScore && newMatchProbability > matchProbability) {
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("Merge replacement at anchor (%u, %u), loc (%u, %u), old match prob %e, new match prob %e, old pair score %d, new pair score %d\n",
+ locationForReadWithMoreHits, locationForReadWithFewerHits, newMoreHitLocation, newFewerHitLocation,
+ matchProbability, newMatchProbability, pairScore, newPairScore);
+ }
+#endif // DEBUG
+ *oldMatchProbability = matchProbability; // Hand back the old probability so the caller can subtract it from its running total.
+ matchProbability = newMatchProbability;
+ pairScore = newPairScore; // Only the probability and score are replaced here; the anchor's two locations stay as they were.
+ return false;
+ } else {
+ //
+ // The new one should just be ignored.
+ //
+#ifdef _DEBUG
+ if (_DumpAlignments) {
+ printf("Merged at anchor (%u, %u), loc (%u, %u), old match prob %e, new match prob %e, old pair score %d, new pair score %d\n",
+ locationForReadWithMoreHits, locationForReadWithFewerHits, newMoreHitLocation, newFewerHitLocation,
+ matchProbability, newMatchProbability, pairScore, newPairScore);
+ }
+#endif // DEBUG
+ return true;
+ }
+ }
+const unsigned IntersectingPairedEndAligner::maxMergeDistance = 31;
diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h
new file mode 100644
index 0000000..9bc6029
--- /dev/null
+++ b/SNAPLib/IntersectingPairedEndAligner.h
@@ -0,0 +1,476 @@
+Module Name:
+ IntersectingPairedEndAligner.h
+ A paired-end aligner based on set intersections to narrow down possible candidate locations.
+ Bill Bolosky, February, 2013
+ User mode service.
+Revision History:
+#pragma once
+#include "PairedEndAligner.h"
+#include "BaseAligner.h"
+#include "BigAlloc.h"
+#include "directions.h"
+#include "LandauVishkin.h"
+#include "FixedSizeMap.h"
+const unsigned DEFAULT_MAX_CANDIDATE_POOL_SIZE = 1000000;
+class IntersectingPairedEndAligner : public PairedEndAligner
+ IntersectingPairedEndAligner(
+ GenomeIndex *index_,
+ unsigned maxReadSize_,
+ unsigned maxHits_,
+ unsigned maxK_,
+ unsigned maxSeedsFromCommandLine_,
+ double seedCoverage_,
+ unsigned minSpacing_, // Minimum distance to allow between the two ends.
+ unsigned maxSpacing_, // Maximum distance to allow between the two ends.
+ unsigned maxBigHits_,
+ unsigned extraSearchDepth_,
+ unsigned maxCandidatePoolSize,
+ int maxSecondaryAlignmentsPerContig_,
+ BigAllocator *allocator,
+ bool noUkkonen_,
+ bool noOrderedEvaluation_,
+ bool noTruncation_);
+ static unsigned getMaxSecondaryResults(unsigned numSeedsFromCommandLine, double seedCoverage, unsigned maxReadSize, unsigned maxHits, unsigned seedLength, unsigned minSpacing, unsigned maxSpacing)
+ {
+     //
+     // An explicit seed count from the command line takes precedence; otherwise the count is derived
+     // from the requested coverage of the longest read.
+     //
+     const unsigned seedsPerRead = (0 != numSeedsFromCommandLine)
+         ? numSeedsFromCommandLine
+         : (unsigned)(maxReadSize * seedCoverage / seedLength);
+     //
+     // Upper bound on results: every seed can produce a result for every hit, paired with every possible mate
+     // position for that hit. Mate positions span min to max spacing on either side, but positions within
+     // max merge distance of one another are merged.
+     //
+     const unsigned spacingSpan = maxSpacing - minSpacing + 1 + maxMergeDistance - 1;
+     return NUM_DIRECTIONS * maxHits * seedsPerRead * spacingSpan / maxMergeDistance * 2;
+ }
+ void setLandauVishkin(
+ LandauVishkin<1> *landauVishkin_,
+ LandauVishkin<-1> *reverseLandauVishkin_)
+ {
+ landauVishkin = landauVishkin_;
+ reverseLandauVishkin = reverseLandauVishkin_;
+ }
+ virtual ~IntersectingPairedEndAligner();
+ virtual void align(
+ Read *read0,
+ Read *read1,
+ PairedAlignmentResult *result,
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults,
+ PairedAlignmentResult *secondaryResults, // The caller passes in a buffer of secondaryResultBufferSize and it's filled in by align()
+ int singleSecondaryBufferSize,
+ int maxSecondaryResultsToReturn,
+ int *nSingleEndSecondaryResultsForFirstRead,
+ int *nSingleEndSecondaryResultsForSecondRead,
+ SingleAlignmentResult *singleEndSecondaryResults // Single-end secondary alignments for when the paired-end alignment didn't work properly
+ );
+ static size_t getBigAllocatorReservation(GenomeIndex * index, unsigned maxBigHitsToConsider, unsigned maxReadSize, unsigned seedLen, unsigned maxSeedsFromCommandLine,
+ double seedCoverage, unsigned maxEditDistanceToConsider, unsigned maxExtraSearchDepth, unsigned maxCandidatePoolSize,
+ int maxSecondaryAlignmentsPerContig);
+ void *operator new(size_t size, BigAllocator *allocator) {_ASSERT(size == sizeof(IntersectingPairedEndAligner)); return allocator->allocate(size);}
+ void operator delete(void *ptr, BigAllocator *allocator) {/* do nothing. Memory gets cleaned up when the allocator is deleted.*/}
+ void *operator new(size_t size) {return BigAlloc(size);}
+ void operator delete(void *ptr) {BigDealloc(ptr);}
+ virtual _int64 getLocationsScored() const {
+ return nLocationsScored;
+ }
+ IntersectingPairedEndAligner() {} // This is for the counting allocator, it doesn't build a useful object
+ static const int NUM_SET_PAIRS = 2; // A "set pair" is read0 FORWARD + read1 RC, or read0 RC + read1 FORWARD. Again, it doesn't make sense to change this.
+ void allocateDynamicMemory(BigAllocator *allocator, unsigned maxReadSize, unsigned maxBigHitsToConsider, unsigned maxSeedsToUse,
+ unsigned maxEditDistanceToConsider, unsigned maxExtraSearchDepth, unsigned maxCandidatePoolSize,
+ int maxSecondaryAlignmentsPerContig);
+ GenomeIndex * index;
+ const Genome * genome;
+ GenomeDistance genomeSize;
+ unsigned maxReadSize;
+ unsigned maxHits;
+ unsigned maxBigHits;
+ unsigned extraSearchDepth;
+ unsigned maxK;
+ unsigned numSeedsFromCommandLine;
+ double seedCoverage;
+ static const unsigned MAX_MAX_SEEDS = 30;
+ unsigned minSpacing;
+ unsigned maxSpacing;
+ unsigned seedLen;
+ bool doesGenomeIndexHave64BitLocations;
+ _int64 nLocationsScored;
+ bool noUkkonen;
+ bool noOrderedEvaluation;
+ bool noTruncation;
+ static const unsigned maxMergeDistance;
+ //
+ // It's a template, because we
+ // have different sizes of genome locations depending on the hash table format. So, GL must be unsigned or GenomeLocation
+ //
+ template<class GL> struct HashTableLookup {
+ unsigned seedOffset; // seed offset recorded for this lookup (see HashTableHitSet::recordLookup)
+ _int64 nHits; // presumably the number of entries in hits — confirm
+ const GL * hits; // hit locations for this seed, in the index's native location width
+ unsigned whichDisjointHitSet; // presumably an index into the owning hit set's disjointHitSets — confirm
+ //
+ // We keep the hash table lookups that haven't been exhausted in a circular list.
+ //
+ HashTableLookup<GL> *nextLookupWithRemainingMembers;
+ HashTableLookup<GL> *prevLookupWithRemainingMembers;
+ //
+ // State for handling the binary search of a location in this lookup.
+ // This would ordinarily be stack local state in the binary search
+ // routine, but because a) we want to interleave the steps of the binary
+ // search in order to allow cache prefetches to have time to execute;
+ // and b) we don't want to do dynamic memory allocation (really at all),
+ // it gets stuck here.
+ //
+ int limit[2]; // The upper and lower limits of the current binary search in hits
+ GL maxGenomeLocationToFindThisSeed;
+ //
+ // A linked list of lookups that haven't yet completed this binary search. This is a linked
+ // list with no header element, so testing for emptiness needs to happen at removal time.
+ // It's done that way to avoid a comparison for list head that would result in a hard-to-predict
+ // branch.
+ //
+ HashTableLookup<GL> *nextLookupForCurrentBinarySearch;
+ HashTableLookup<GL> *prevLookupForCurrentBinarySearch;
+ _int64 currentHitForIntersection; // presumably an index into hits — confirm
+ //
+ // A place for the hash table to write in singletons. We need this because when the hash table is
+ // built with > 4 byte genome locations, it usually doesn't store 8 bytes, so we need to
+ // provide the lookup function a place to write the result. Since we need one per
+ // lookup, it goes here.
+ //
+ GL singletonGenomeLocation[2]; // The [2] is because we need to look one before sometimes, and that allows space
+ };
+ //
+ // A set of seed hits, represented by the lookups that came out of the big hash table. It can be over 32 or
+ // 64 bit indices, but its external interface is always 64 bits (it extends on the way out if necessary).
+ //
+ class HashTableHitSet {
+ public:
+ HashTableHitSet() {} // does nothing; see firstInit() and init()
+ void firstInit(unsigned maxSeeds_, unsigned maxMergeDistance_, BigAllocator *allocator, bool doesGenomeIndexHave64BitLocations_);
+ //
+ // Reset to empty state.
+ //
+ void init();
+ //
+ // Record a hash table lookup. All recording must be done before any
+ // calls to getNextHitLessThanOrEqualTo. A disjoint hit set is a set of hits
+ // that don't share any bases in the read. This is interesting because the edit
+ // distance of a read must be at least the number of seeds that didn't hit for
+ // any disjoint hit set (because there must be a difference in the read within a
+ // seed for it not to hit, and since the reads are disjoint there can't be a case
+ // where the same difference caused two seeds to miss).
+ // There is one overload per hit width: unsigned hits and GenomeLocation hits.
+ void recordLookup(unsigned seedOffset, _int64 nHits, const unsigned *hits, bool beginsDisjointHitSet);
+ void recordLookup(unsigned seedOffset, _int64 nHits, const GenomeLocation *hits, bool beginsDisjointHitSet);
+ //
+ // This efficiently works through the set looking for the next hit at or below this address.
+ // A HashTableHitSet only allows a single iteration through its address space per call to
+ // init().
+ //
+ bool getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound);
+ //
+ // Walk down just one step, don't binary search.
+ //
+ bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound);
+ //
+ // Find the highest genome address.
+ //
+ bool getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound);
+ unsigned computeBestPossibleScoreForCurrentHit();
+ //
+ // This is bit of storage that the 64 bit lookup needs in order to extend singleton hits into 64 bits, since they may be
+ // stored in the index in fewer.
+ //
+ GenomeLocation *getNextSingletonLocation()
+ {
+ return &lookups64[nLookupsUsed].singletonGenomeLocation[1]; // slot [1] of the lookups64 entry at index nLookupsUsed
+ }
+ private:
+ struct DisjointHitSet {
+ unsigned countOfExhaustedHits;
+ unsigned missCount;
+ };
+ int currentDisjointHitSet;
+ DisjointHitSet * disjointHitSets;
+ HashTableLookup<unsigned> * lookups32; // presumably used when the index has 32-bit locations — confirm
+ HashTableLookup<GenomeLocation> * lookups64; // presumably used when the index has 64-bit locations — confirm
+ HashTableLookup<unsigned> lookupListHead32[1];
+ HashTableLookup<GenomeLocation> lookupListHead64[1];
+ unsigned maxSeeds;
+ unsigned nLookupsUsed;
+ GenomeLocation mostRecentLocationReturned;
+ unsigned maxMergeDistance;
+ bool doesGenomeIndexHave64BitLocations;
+ };
+ HashTableHitSet * hashTableHitSets[NUM_READS_PER_PAIR][NUM_DIRECTIONS];
+ int countOfHashTableLookups[NUM_READS_PER_PAIR];
+ _int64 totalHashTableHits[NUM_READS_PER_PAIR][NUM_DIRECTIONS];
+ _int64 largestHashTableHit[NUM_READS_PER_PAIR][NUM_DIRECTIONS];
+ unsigned readWithMoreHits;
+ unsigned readWithFewerHits;
+ //
+ // A location that's been scored (or waiting to be scored). This is needed in order to do merging
+ // of close-together hits and to track potential mate pairs.
+ //
+ struct HitLocation {
+ GenomeLocation genomeLocation;
+ int genomeLocationOffset; // This is needed because we might get an offset back from scoring (because it's really scoring a range).
+ unsigned seedOffset;
+ bool isScored; // Mate pairs are sometimes not scored when they're inserted (the upstream comment is cut off here); presumably false until score is valid — confirm
+ unsigned score; // unsigned, so the "-1" mentioned for maxK below is stored as the all-ones value
+ unsigned maxK; // The maxK that this was scored with (we may need to rescore if we need a higher maxK and score is -1)
+ double matchProbability;
+ unsigned bestPossibleScore;
+ //
+ // We have to be careful in the case where lots of offsets in a row match well against the read (think
+ // about repetitive short sequences, i.e., ATTATTATTATT...). We want to merge the close ones together,
+ // but if the repetitive sequence extends longer than maxMerge, we don't want to just slide the window
+ // over the whole range and declare it all to be one. There is really no good definition for the right
+ // thing to do here, so instead all we do is that when we declare two candidates to be matched we
+ // pick one of them to be the match primary and then coalesce all matches that are within maxMatchDistance
+ // of the match primary. No one can match with any of the locations in the set that's beyond maxMatchDistance
+ // from the set primary. This means that in the case of repetitive sequences that we'll declare locations
+ // right next to one another not to be matches. There's really no way around this while avoiding
+ // matching things that are possibly much more than maxMatchDistance apart.
+ // NOTE(review): "maxMerge" and "maxMatchDistance" presumably both refer to maxMergeDistance — confirm.
+ GenomeLocation genomeLocationOfNearestMatchedCandidate;
+ };
+ char *rcReadData[NUM_READS_PER_PAIR]; // the reverse complement of the data for each read
+ char *rcReadQuality[NUM_READS_PER_PAIR]; // the reversed quality strings for each read
+ unsigned readLen[NUM_READS_PER_PAIR];
+ Read *reads[NUM_READS_PER_PAIR][NUM_DIRECTIONS]; // These are the reads that are provided in the align call, together with their reverse complements, which are computed.
+ char *reversedRead[NUM_READS_PER_PAIR][NUM_DIRECTIONS]; // The reversed data for each read for forward and RC. This is used in the backwards LV
+ LandauVishkin<> *landauVishkin;
+ LandauVishkin<-1> *reverseLandauVishkin;
+ char rcTranslationTable[256];
+ unsigned nTable[256];
+ BYTE *seedUsed;
+ inline bool IsSeedUsed(_int64 indexInRead) const {
+     // seedUsed is a bitmap with one bit per read offset: pick the byte, then test the bit inside it.
+     const _int64 byteIndex = indexInRead / 8;
+     const int bitMask = 1 << (indexInRead % 8);
+     return 0 != (seedUsed[byteIndex] & bitMask);
+ }
+ inline void SetSeedUsed(_int64 indexInRead) {
+     // Turn on the bit for this read offset in the seedUsed bitmap (one bit per offset, eight per byte).
+     const _int64 byteIndex = indexInRead / 8;
+     const int bitMask = 1 << (indexInRead % 8);
+     seedUsed[byteIndex] |= bitMask;
+ }
+ //
+ // "Local probability" means the probability that each end is correct given that the pair itself is correct.
+ // Consider the example where there's exactly one decent match for one read, but the other one has several
+ // that are all within the correct range for the first one. Then the local probability for the second read
+ // is lower than the first. The overall probability of an alignment then is
+ // pairProbability * localProbability/ allPairProbability.
+ //
+ double localBestPairProbability[NUM_READS_PER_PAIR];
+ void scoreLocation(
+ unsigned whichRead,
+ Direction direction,
+ GenomeLocation genomeLocation,
+ unsigned seedOffset,
+ unsigned scoreLimit,
+ unsigned *score,
+ double *matchProbability,
+ int *genomeLocationOffset // The computed offset for genomeLocation (which is needed because we scan several different possible starting locations)
+ );
+ //
+ // These are used to keep track of places where we should merge together candidate locations for MAPQ purposes, because they're sufficiently
+ // close in the genome.
+ //
+ struct MergeAnchor {
+ double matchProbability; // match probability currently recorded for this anchor
+ GenomeLocation locationForReadWithMoreHits; // checkMerge() treats InvalidGenomeLocation here as "anchor not set"
+ GenomeLocation locationForReadWithFewerHits;
+ int pairScore; // pair score currently recorded; checkMerge() prefers lower values
+ void init(GenomeLocation locationForReadWithMoreHits_, GenomeLocation locationForReadWithFewerHits_, double matchProbability_, int pairScore_) {
+ locationForReadWithMoreHits = locationForReadWithMoreHits_;
+ locationForReadWithFewerHits = locationForReadWithFewerHits_;
+ matchProbability = matchProbability_;
+ pairScore = pairScore_;
+ }
+ //
+ // Returns whether this candidate is a match for this merge anchor.
+ // Both computed distances must be less than 50. NOTE(review): 50 is a literal here rather than maxMergeDistance — confirm that is intended.
+ bool doesRangeMatch(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation) {
+ GenomeDistance deltaMore = DistanceBetweenGenomeLocations(locationForReadWithMoreHits, newMoreHitLocation);
+ GenomeDistance deltaFewer = DistanceBetweenGenomeLocations(locationForReadWithFewerHits, newFewerHitLocation);
+ return deltaMore < 50 && deltaFewer < 50;
+ }
+ //
+ // Returns true if this candidate should be eliminated due to a match. *oldMatchProbability is written only when false
+ // is returned: 0.0 when the anchor is newly set, or the displaced probability when the anchor's score is replaced.
+ bool checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore,
+ double *oldMatchProbability);
+ };
+ //
+ // We keep track of pairs of locations to score using two structs, one for each end. The ends for the read with fewer hits points into
+ // a list of structs for the end with more hits, so that we don't need one struct for each pair, just one for each end, and also so that
+ // we don't need to score the mates more than once if they could be paired with more than one location from the end with fewer hits.
+ //
+ struct ScoringMateCandidate {
+ //
+ // These are kept in arrays in decreasing genome order, one for each set pair, so you can find the next largest location by just looking one
+ // index lower, and vice versa.
+ //
+ double matchProbability;
+ GenomeLocation readWithMoreHitsGenomeLocation;
+ unsigned bestPossibleScore;
+ unsigned score;
+ unsigned scoreLimit; // The scoreLimit with which score was computed
+ unsigned seedOffset;
+ int genomeOffset;
+ void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_) {
+ readWithMoreHitsGenomeLocation = readWithMoreHitsGenomeLocation_;
+ bestPossibleScore = bestPossibleScore_;
+ seedOffset = seedOffset_;
+ score = -2; // stored in an unsigned field, so this wraps to a very large value; presumably a "not yet scored" sentinel — confirm
+ scoreLimit = -1; // likewise wraps to the maximum unsigned value
+ matchProbability = 0;
+ genomeOffset = 0;
+ }
+ };
+ struct ScoringCandidate {
+ ScoringCandidate * scoreListNext; // This is a singly-linked list
+ MergeAnchor * mergeAnchor; // NULL after init(); no anchor is attached at initialization
+ unsigned scoringMateCandidateIndex; // Index into the array of scoring mate candidates where we should look
+ GenomeLocation readWithFewerHitsGenomeLocation;
+ unsigned whichSetPair; // must be less than NUM_SET_PAIRS (asserted in init())
+ unsigned seedOffset;
+ unsigned bestPossibleScore;
+ void init(GenomeLocation readWithFewerHitsGenomeLocation_, unsigned whichSetPair_, unsigned scoringMateCandidateIndex_, unsigned seedOffset_,
+ unsigned bestPossibleScore_, ScoringCandidate *scoreListNext_)
+ {
+ readWithFewerHitsGenomeLocation = readWithFewerHitsGenomeLocation_;
+ whichSetPair = whichSetPair_;
+ _ASSERT(whichSetPair < NUM_SET_PAIRS); // You wouldn't think this would be necessary, but...
+ scoringMateCandidateIndex = scoringMateCandidateIndex_;
+ seedOffset = seedOffset_;
+ bestPossibleScore = bestPossibleScore_;
+ scoreListNext = scoreListNext_;
+ mergeAnchor = NULL;
+ }
+ };
+ //
+ // A pool of scoring candidates. For each alignment call, we free them all by resetting lowestFreeScoringCandidatePoolEntry to 0,
+ // and then fill in the content when they're initialized. This means that for alignments with few candidates we'll be using the same
+ // entries over and over, so they're likely to be in the cache. We have maxK * maxSeeds * 2 of these in the pool, so we can't possibly run
+ // out. We rely on their being allocated in descending genome order within a set pair.
+ //
+ ScoringCandidate *scoringCandidatePool;
+ unsigned scoringCandidatePoolSize;
+ unsigned lowestFreeScoringCandidatePoolEntry;
+ //
+ // maxK + 1 lists of Scoring Candidates. The lists correspond to bestPossibleScore for the candidate and its best mate.
+ //
+ ScoringCandidate **scoringCandidates;
+ //
+ // The scoring mates. Each set holds scoringCandidatePoolSize / 2 of them.
+ //
+ ScoringMateCandidate * scoringMateCandidates[NUM_SET_PAIRS];
+ unsigned lowestFreeScoringMateCandidate[NUM_SET_PAIRS];
+ //
+ // Merge anchors. Again, we allocate an upper bound number of them, which is the same as the number of scoring candidates.
+ //
+ MergeAnchor *mergeAnchorPool;
+ unsigned firstFreeMergeAnchor;
+ unsigned mergeAnchorPoolSize;
+ struct HitsPerContigCounts {
+ _int64 epoch; // Rather than zeroing this whole array every time, we just bump the epoch number; results with an old epoch are considered zero
+ int hits;
+ };
+ HitsPerContigCounts *hitsPerContigCounts; // How many alignments are we reporting for each contig. Used to implement -mpc, otherwise unallocated.
+ int maxSecondaryAlignmentsPerContig;
+ _int64 contigCountEpoch; // presumably the current epoch that HitsPerContigCounts::epoch is compared against — confirm
diff --git a/SNAPLib/LandauVishkin.cpp b/SNAPLib/LandauVishkin.cpp
new file mode 100644
index 0000000..a68512d
--- /dev/null
+++ b/SNAPLib/LandauVishkin.cpp
@@ -0,0 +1,766 @@
+#include "stdafx.h"
+#include "Compat.h"
+#include "LandauVishkin.h"
+#include "mapq.h"
+#include "Read.h"
+#include "BaseAligner.h"
+#include "Bam.h"
+#include "exit.h"
+#include "Error.h"
+using std::make_pair;
+using std::min;
+ for (int i = 0; i < MAX_K+1; i++) {
+ for (int j = 0; j < 2*MAX_K+1; j++) {
+ L[i][j] = -2;
+ }
+ }
+ totalIndels[0][MAX_K] = 0;
+ Write cigar to buffer, return true if it fits
+ null-terminates buffer if it returns false (i.e. fills up buffer)
+bool writeCigar(char** o_buf, int* o_buflen, int count, char code, CigarFormat format)
+ // Appends one CIGAR operation (count repetitions of code) to the buffer at *o_buf in the requested format,
+ // advancing *o_buf and reducing *o_buflen by the bytes consumed. Returns false when the operation did not
+ // fit; a count of zero or less writes nothing and returns true.
+ // NOTE(review): the case labels of the switch below are not visible in this excerpt; each branch is
+ // described by what its code does.
+ _ASSERT(count >= 0);
+ if (count <= 0) {
+ return true;
+ }
+ switch (format) {
+ // Expanded text form: the op code is repeated count times, truncated to the space available.
+ int n = min(*o_buflen, count);
+ for (int i = 0; i < n; i++) {
+ *(*o_buf)++ = code;
+ }
+ *o_buflen -= n;
+ if (*o_buflen == 0) {
+ // Buffer is full: overwrite the last character written so the string stays terminated.
+ *(*o_buf - 1) = '\0';
+ }
+ return *o_buflen > 0;
+ }
+ // Compact text form: "<count><code>", e.g. "12M".
+ if (*o_buflen == 0) {
+ *(*o_buf - 1) = '\0';
+ return false;
+ }
+ int written = snprintf(*o_buf, *o_buflen, "%d%c", count, code);
+ if (written < 0 || written > *o_buflen - 1) {
+ // The op did not fit (or formatting failed). Terminate the string at the current write position.
+ // This has to store through the pointer: assigning to *o_buf would null out the caller's buffer
+ // pointer rather than writing a terminator. A negative return is treated as failure so that the
+ // pointer is never moved backwards.
+ **o_buf = '\0';
+ return false;
+ } else {
+ *o_buf += written;
+ *o_buflen -= written;
+ return true;
+ }
+ }
+ // binary format with non-zero count byte followed by char (easier to examine programmatically)
+ // Counts above 255 are split across several (count, code) pairs.
+ while (true) {
+ if (*o_buflen < 3) {
+ *(*o_buf) = '\0';
+ return false;
+ }
+ *(*o_buf)++ = min(count, 255);
+ *(*o_buf)++ = code;
+ *o_buflen -= 2;
+ if (count <= 255) {
+ return true;
+ }
+ count -= 255;
+ }
+ // 32-bit op word: the count in the upper 28 bits and the op code in the low 4 bits.
+ if (*o_buflen < 4 || count >= (1 << 28)) {
+ return false;
+ }
+ *(_uint32*)*o_buf = (count << 4) | BAMAlignment::CigarToCode[code];
+ *o_buf += 4;
+ *o_buflen -= 4;
+ return true;
+ default:
+ WriteErrorMessage( "invalid cigar format %d\n", format);
+ soft_exit(1);
+ return false; // Not reached. This is just here to suppress a compiler warning.
+ } // switch
+#if 0
+inline void validateAction(char& last, char current)
+ _ASSERT(last != current);
+ last = current;
+inline void validateAction(char& last, char current) {}
+ void
+ char* buffer,
+ int bufferSize,
+ unsigned variant)
+ _ASSERT(bufferSize >= 12);
+ int inserts = (variant >> CigarInsertCShift) & CigarInsertCount;
+ if (inserts > 0) {
+ *buffer++ = '0' + inserts;
+ *buffer++ = 'I';
+ for (int i = 0; i < inserts; i++) {
+ *buffer++ = VALUE_BASE[(variant >> (CigarInsertBShift + 2 * i)) & 3];
+ }
+ }
+ unsigned op = variant & CigarOpcode;
+ if (op >= CigarReplace && op < CigarDelete) {
+ *buffer++ = 'X';
+ *buffer++ = VALUE_BASE[op - CigarReplace];
+ } else if (op == CigarDelete) {
+ *buffer++ = 'D';
+ }
+ *buffer++ = 0;
+#if 0
+static const int PrevDelta[3][3] = // Version that minimizes NET indels (ie., |#ins - #del|
+ {{+1, 0, -1}, // d < 0
+ {0, -1, +1}, // d == 0
+ {-1, 0, +1}}; // d > 0
+#else // 0
+ static const int PrevDelta[3][3] = // Version that minimizes absolute indels (ie., #ins + #del)
+ { { 0, +1, -1}, // d < 0
+ { 0, +1, -1 }, // d == 0
+ { 0, -1, +1 } }; // d > 0
+#endif // 0
+int LandauVishkinWithCigar::computeEditDistance( // returns the edit distance (0..k); -1 if text is NULL or no alignment within k edits is found; -2 if writeCigar reports failure
+ const char* text, int textLen,
+ const char* pattern, int patternLen,
+ int k,
+ char *cigarBuf, int cigarBufLen, bool useM, // useM: emit 'M' for matches and mismatches alike instead of '=' / 'X'
+ CigarFormat format, int* o_cigarBufUsed, int* o_textUsed, // optional outs: bytes written to cigarBuf, and amount of text consumed
+ int *o_netIndel) // optional out: deletions minus insertions in the emitted CIGAR; may be NULL
+ int localNetIndel;
+ if (NULL == o_netIndel) {
+ //
+ // If the user doesn't want netIndel, just use a stack local to avoid
+ // having to check it all the time.
+ //
+ o_netIndel = &localNetIndel;
+ }
+ _ASSERT(k < MAX_K);
+ *o_netIndel = 0;
+ _ASSERT(patternLen >= 0 && textLen >= 0);
+ _ASSERT(k < MAX_K); // NOTE(review): duplicate of the assert above
+ const char* p = pattern;
+ const char* t = text;
+ char* cigarBufStart = cigarBuf;
+ if (NULL == text) {
+ return -1; // This happens when we're trying to read past the end of the genome.
+ }
+ int end = min(patternLen, textLen);
+ const char* pend = pattern + end;
+ while (p < pend) { // compares 8 bytes per step; NOTE(review): whole 64-bit words are read, so presumably both buffers are readable past 'end' — confirm
+ _uint64 x = *((_uint64*) p) ^ *((_uint64*) t);
+ if (x) {
+ unsigned long zeroes;
+ CountTrailingZeroes(x, zeroes);
+ zeroes >>= 3; // bits to bytes; presumably the offset of the first differing byte within the word — confirm
+ L[0][MAX_K] = min((int)(p - pattern) + (int)zeroes, end);
+ goto done1;
+ }
+ p += 8;
+ t += 8;
+ }
+ L[0][MAX_K] = end;
+ if (L[0][MAX_K] == end) {
+ // We matched the text exactly; fill the CIGAR string with all ='s (or M's)
+ if (useM) {
+ if (! writeCigar(&cigarBuf, &cigarBufLen, patternLen, 'M', format)) {
+ return -2;
+ }
+ // todo: should this also write X's like '=' case? or is 'M' special?
+ } else {
+ if (! writeCigar(&cigarBuf, &cigarBufLen, end, '=', format)) {
+ return -2;
+ }
+ if (patternLen > end) {
+ // Also need to write a bunch of X's past the end of the text
+ if (! writeCigar(&cigarBuf, &cigarBufLen, patternLen - end, 'X', format)) {
+ return -2;
+ }
+ }
+ }
+ // todo: should this null-terminate?
+ if (o_cigarBufUsed != NULL) {
+ *o_cigarBufUsed = (int)(cigarBuf - cigarBufStart);
+ }
+ if (o_textUsed != NULL) {
+ *o_textUsed = end;
+ }
+ return 0;
+ }
+ char lastAction = '*';
+ int e;
+ int lastBestIndels = MAX_K + 1;
+ int lastBestD = MAX_K + 1; // MAX_K + 1 means "no diagonal has reached the end of the pattern yet"
+ int lastBestBest;
+ for (e = 1; e <= k; e++) {
+ // Go through the offsets, d, in the order 0, -1, 1, -2, 2, etc, in order to find CIGAR strings
+ // with few indels first if possible.
+ for (int d = 0; d != -(e+1); d = (d >= 0 ? -(d+1) : -d)) {
+ int bestdelta = 0;
+ int bestbest = -1;
+ int bestBestIndels = MAX_K + 1;
+ // extend previous solutions as far as possible, pick best, minimizing indels
+ int dy = (d >= 0) + (d > 0); // PrevDelta row: 0 for d < 0, 1 for d == 0, 2 for d > 0
+ for (int dx = 0; dx < 3; dx++) {
+ int delta = PrevDelta[dy][dx];
+ int best = L[e-1][MAX_K+d + delta] + (delta >= 0);
+ int bestIndels = totalIndels[e - 1][MAX_K + d + delta] + (delta != 0); // Our parent, plus one if this is an indel
+ if (best < 0) {
+ continue;
+ }
+ const char* p = pattern + best;
+ const char* t = (text + d) + best;
+ if (*p == *t) {
+ int end = min(patternLen, textLen - d);
+ const char* pend = pattern + end;
+ while (true) {
+ _uint64 x = *((_uint64*) p) ^ *((_uint64*) t);
+ if (x) {
+ unsigned long zeroes;
+ CountTrailingZeroes(x, zeroes);
+ zeroes >>= 3;
+ best = min((int)(p - pattern) + (int)zeroes, end);
+ break;
+ }
+ p += 8;
+ if (p >= pend) {
+ best = end;
+ break;
+ }
+ t += 8;
+ }
+ }
+ if (best > bestbest || best == bestbest && bestIndels < bestBestIndels) {
+ bestbest = best;
+ bestdelta = delta;
+ bestBestIndels = bestIndels;
+ }
+ }
+ int best = bestbest;
+ A[e][MAX_K+d] = "DXI"[bestdelta + 1]; // delta -1 -> 'D', 0 -> 'X', +1 -> 'I'
+ L[e][MAX_K+d] = best;
+ totalIndels[e][MAX_K + d] = bestBestIndels;
+ if (best == patternLen) {
+ if (bestBestIndels == 0) {
+ lastBestIndels = bestBestIndels;
+ lastBestD = d;
+ lastBestBest = best;
+ goto got_answer;
+ }
+ if (abs(lastBestIndels) > bestBestIndels) {
+ lastBestIndels = bestBestIndels;
+ lastBestD = d;
+ lastBestBest = best;
+ }
+ } // if best == patternlen
+ } // for d
+ if (lastBestD != MAX_K + 1) {
+ goto got_answer;
+ }
+ } // for e
+ // Could not align strings with at most K edits
+ *(cigarBuf - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string
+ return -1;
+ // We're done. First, let's see whether we can reach e errors with no indels. Otherwise, we'll
+ // trace back through the dynamic programming array to build up the CIGAR string.
+ int straightMismatches = 0;
+ for (int i = 0; i < end; i++) {
+ if (pattern[i] != text[i]) {
+ straightMismatches++;
+ }
+ }
+ straightMismatches += patternLen - end; // pattern bases beyond the end of the text count as mismatches
+ if (straightMismatches == e) {
+ // We can match with no indels; let's do that
+ if (useM) {
+ //
+ // No inserts or deletes, and with useM equal and SNP look the same, so just
+ // emit a simple string.
+ //
+ validateAction(lastAction, 'M');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, patternLen, 'M', format)) {
+ return -2;
+ }
+ }
+ else {
+ int streakStart = 0;
+ bool matching = (pattern[0] == text[0]);
+ for (int i = 0; i < end; i++) {
+ bool newMatching = (pattern[i] == text[i]);
+ if (newMatching != matching) {
+ validateAction(lastAction, matching ? '=' : 'X');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, i - streakStart, (matching ? '=' : 'X'), format)) {
+ return -2;
+ }
+ matching = newMatching;
+ streakStart = i;
+ }
+ }
+ // Write the last '=' or 'X' streak
+ if (patternLen > streakStart) {
+ if (!matching) {
+ // Write out X's all the way to patternLen
+ validateAction(lastAction, 'X');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, patternLen - streakStart, 'X', format)) {
+ return -2;
+ }
+ }
+ else {
+ // Write out some ='s and then possibly X's if pattern is longer than text
+ validateAction(lastAction, '=');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, end - streakStart, '=', format)) {
+ return -2;
+ }
+ if (patternLen > end) {
+ validateAction(lastAction, 'X');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, patternLen - end, 'X', format)) {
+ return -2;
+ }
+ }
+ }
+ }
+ }
+ *(cigarBuf - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string; NOTE(review): unlike the final path below, this is not skipped for BAM_CIGAR_OPS — confirm intended
+ if (o_cigarBufUsed != NULL) {
+ *o_cigarBufUsed = (int)(cigarBuf - cigarBufStart);
+ }
+ if (o_textUsed != NULL) {
+ *o_textUsed = end;
+ }
+ return e;
+ }
+#ifdef TRACE_LV
+ // Dump the contents of the various arrays
+ printf("Done with e=%d, d=%d\n", e, d);
+ for (int ee = 0; ee <= e; ee++) {
+ for (int dd = -e; dd <= e; dd++) {
+ if (dd >= -ee && dd <= ee)
+ printf("%3d ", L[ee][MAX_K + dd]);
+ else
+ printf(" ");
+ }
+ printf("\n");
+ }
+ for (int ee = 0; ee <= e; ee++) {
+ for (int dd = -e; dd <= e; dd++) {
+ if (dd >= -ee && dd <= ee)
+ printf("%3c ", A[ee][MAX_K + dd]);
+ else
+ printf(" ");
+ }
+ printf("\n");
+ }
+ // Trace backward to build up the CIGAR string. We do this by filling in the backtraceAction,
+ // backtraceMatched and backtraceD arrays, then going through them in the forward direction to
+ // figure out our string.
+ int curD = lastBestD;
+ for (int curE = e; curE >= 1; curE--) {
+ backtraceAction[curE] = A[curE][MAX_K + curD];
+ if (backtraceAction[curE] == 'I') {
+ backtraceD[curE] = curD + 1;
+ backtraceMatched[curE] = L[curE][MAX_K + curD] - L[curE - 1][MAX_K + curD + 1] - 1;
+ }
+ else if (backtraceAction[curE] == 'D') {
+ backtraceD[curE] = curD - 1;
+ backtraceMatched[curE] = L[curE][MAX_K + curD] - L[curE - 1][MAX_K + curD - 1];
+ }
+ else { // backtraceAction[curE] == 'X'
+ backtraceD[curE] = curD;
+ backtraceMatched[curE] = L[curE][MAX_K + curD] - L[curE - 1][MAX_K + curD] - 1;
+ }
+ curD = backtraceD[curE];
+#ifdef TRACE_LV
+ printf("%d %d: %d %c %d %d\n", curE, curD, L[curE][MAX_K + curD],
+ backtraceAction[curE], backtraceD[curE], backtraceMatched[curE]);
+ }
+ int accumulatedMs; // Count of Ms that we need to emit before an I or D (or ending).
+ if (useM) {
+ accumulatedMs = L[0][MAX_K + 0];
+ }
+ else {
+ // Write out ='s for the first patch of exact matches that brought us to L[0][0]
+ if (L[0][MAX_K + 0] > 0) {
+ validateAction(lastAction, '=');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, L[0][MAX_K + 0], '=', format)) {
+ return -2;
+ }
+ }
+ }
+ int curE = 1;
+ while (curE <= e) {
+ // First write the action, possibly with a repeat if it occurred multiple times with no exact matches
+ char action = backtraceAction[curE];
+ int actionCount = 1;
+ while (curE + 1 <= e && backtraceMatched[curE] == 0 && backtraceAction[curE + 1] == action) {
+ actionCount++;
+ curE++;
+ }
+ if (action == 'I') {
+ *o_netIndel -= actionCount;
+ } else if (action == 'D') {
+ *o_netIndel += actionCount;
+ }
+ if (useM) {
+ if (action == '=' || action == 'X') {
+ accumulatedMs += actionCount;
+ }
+ else {
+ if (accumulatedMs != 0) {
+ validateAction(lastAction, 'M');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, accumulatedMs, 'M', format)) {
+ return -2;
+ }
+ accumulatedMs = 0;
+ }
+ validateAction(lastAction, action);
+ if (!writeCigar(&cigarBuf, &cigarBufLen, actionCount, action, format)) {
+ return -2;
+ }
+ }
+ }
+ else {
+ validateAction(lastAction, action);
+ if (!writeCigar(&cigarBuf, &cigarBufLen, actionCount, action, format)) {
+ return -2;
+ }
+ }
+ // Next, write out ='s for the exact match
+ if (backtraceMatched[curE] > 0) {
+ if (useM) {
+ accumulatedMs += backtraceMatched[curE];
+ }
+ else {
+ validateAction(lastAction, '=');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, backtraceMatched[curE], '=', format)) {
+ return -2;
+ }
+ }
+ }
+ curE++;
+ }
+ if (useM && accumulatedMs != 0) {
+ //
+ // Write out the trailing Ms.
+ //
+ validateAction(lastAction, 'M');
+ if (!writeCigar(&cigarBuf, &cigarBufLen, accumulatedMs, 'M', format)) {
+ return -2;
+ }
+ }
+ if (format != BAM_CIGAR_OPS) {
+ *(cigarBuf - (cigarBufLen == 0 ? 1 : 0)) = '\0'; // terminate string
+ }
+ if (o_cigarBufUsed != NULL) {
+ *o_cigarBufUsed = (int)(cigarBuf - cigarBufStart);
+ }
+ if (o_textUsed != NULL) {
+ *o_textUsed = min(textLen, lastBestBest + lastBestD);
+ }
+ return e;
+int LandauVishkinWithCigar::computeEditDistanceNormalized(
+ const char* text, int textLen,
+ const char* pattern, int patternLen,
+ int k,
+ char *cigarBuf, int cigarBufLen, bool useM,
+ CigarFormat format, int* o_cigarBufUsed,
+ int* o_addFrontClipping,
+ int *o_netIndel)
+ if (format != BAM_CIGAR_OPS && format != COMPACT_CIGAR_STRING) {
+ WriteErrorMessage("LandauVishkinWithCigar::computeEditDistanceNormalized invalid parameter\n");
+ soft_exit(1);
+ }
+ int bamBufLen = (format == BAM_CIGAR_OPS ? 1 : 2) * cigarBufLen; // should be enough
+ char* bamBuf = (char*)alloca(bamBufLen);
+ int bamBufUsed, textUsed;
+ int score = computeEditDistance(text, (int)textLen, pattern, (int)patternLen, k, bamBuf, bamBufLen,
+ useM, BAM_CIGAR_OPS, &bamBufUsed, &textUsed, o_netIndel);
+ if (score < 0) {
+ return score;
+ }
+ _uint32* bamOps = (_uint32*)bamBuf;
+ int bamOpCount = bamBufUsed / sizeof(_uint32);
+#if 0 // Not sure this is necessary, and it seems to cause problems with the new LV that won't put indels at the end
+ bool hasIndels = false;
+ for (int i = 0; i < bamOpCount; i++) {
+ char c = BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[i])];
+ if (c == 'I' || c == 'D') {
+ hasIndels = true;
+ break;
+ }
+ }
+ if (hasIndels) {
+ // run it again in reverse so it pushes indels towards the beginning
+ char* text2 = (char*)alloca(textLen + 1);
+ _ASSERT(textUsed <= textLen);
+ util::memrevcpy(text2, text, textUsed);
+ char* pattern2 = (char*)alloca(patternLen + 1);
+ util::memrevcpy(pattern2, pattern, patternLen);
+ char* bamBuf2 = (char*)alloca(bamBufLen);
+ int bamBufUsed2, textUsed2;
+ int score2 = computeEditDistance(text2, textUsed, pattern2, patternLen, k, bamBuf2, bamBufLen,
+ useM, BAM_CIGAR_OPS, &bamBufUsed2, &textUsed2);
+ if (score == score2 /* && bamBufUsed2 == bamBufUsed && textUsed2 == textUsed*/) {
+ bamBuf = bamBuf2;
+ bamBufUsed = bamBufUsed2;
+ bamOpCount = bamBufUsed2 / sizeof(_uint32);
+ textUsed = textUsed2;
+ // reverse the operations
+ for (int i = 0; i < bamOpCount; i++) {
+ bamOps[i] = ((_uint32*)bamBuf2)[bamOpCount - 1 - i];
+ }
+ } else if (false) { // debugging
+ text2[textUsed2] = 0;
+ pattern2[patternLen] = 0;
+ WriteErrorMessage("inconsistent forward/reverse comparison\nreverse score %d, textUsed %d, bamUsed %d, text/pattern:\n%s\n%s\n",
+ score2, textUsed2, bamBufUsed2, text2, pattern2);
+ memcpy(text2, text, textLen);
+ text2[textLen] = 0;
+ memcpy(pattern2, pattern, patternLen);
+ pattern2[patternLen] = 0;
+ WriteErrorMessage("forward score %d, textUsed %d, bamUsed %d, text/pattern:\n%s\n%s\n",
+ score, textUsed, bamBufUsed, text2, pattern2);
+ }
+ }
+#endif // 0 // Not sure this is necessary, and it seems to cause problems with the new LV that won't put indels at the end
+#if 0 // This shouldn't happen anymore, the basic computeEditDistance doesn't allow it
+ //
+ // Trim out any trailing insertions, which can just be changed to, or merged into, X (or M as the case may be).
+ //
+ _ASSERT(bamOpCount > 0);
+ char lastCode = BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[bamOpCount - 1])];
+ if ('I' == lastCode) {
+ //
+ // See if it merges into the previous cigar code (which it will if it's M or X, but not D or =).
+ //
+ if (bamOpCount != 1) {
+ char previousOp = BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[bamOpCount - 2])];
+ if ('X' == previousOp || 'M' == previousOp) {
+ int newCount = BAMAlignment::GetCigarOpCount(bamOps[bamOpCount - 1]) + BAMAlignment::GetCigarOpCount(bamOps[bamOpCount - 2]);
+ bamOps[bamOpCount - 2] = (newCount << 4) | BAMAlignment::CigarToCode[previousOp];
+ bamOpCount--;
+ bamBufUsed -= sizeof(_uint32);
+ } else if ('=' == previousOp) {
+ //
+ // The previous op was =, which this obviously doesn't. Convert the final code to X or M.
+ //
+ bamOps[bamOpCount - 1] = (BAMAlignment::GetCigarOpCount(bamOps[bamOpCount - 1]) << 4) | BAMAlignment::CigarToCode[useM ? 'M' : 'X'];
+ }
+ }
+ }
+#endif // 0 // This shouldn't happen anymore, the basic computeEditDistance doesn't allow it. Just assert it
+ _ASSERT('I' != BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[bamOpCount - 1])]);
+ //
+ // Turn leading 'D' into soft clipping, and 'I' into an alignment change followed by an X.
+ //
+ if (o_addFrontClipping != NULL) {
+ char firstCode = BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[0])];
+ if (firstCode == 'D') {
+ *o_addFrontClipping = BAMAlignment::GetCigarOpCount(bamOps[0]);
+ if (*o_addFrontClipping != 0) {
+ return 0; // can fail, will be rerun with new clipping
+ }
+ } else if (firstCode == 'I') {
+ *o_addFrontClipping = -1 * BAMAlignment::GetCigarOpCount(bamOps[0]);
+ } else {
+ *o_addFrontClipping = 0;
+ }
+ }
+ _ASSERT(bamOpCount <= 1 || BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[bamOpCount - 1])] != 'I'); // We should have cleared all of these out
+ _ASSERT(bamOpCount <= 1 || BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[bamOpCount - 1])] != 'D'); // And none of these should happen, either.
+// Seems to happen; TODO: fix this _ASSERT(bamOpCount <= 1 || BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[0])] != 'D');
+// Seems to happen; TODO: fix this _ASSERT(bamOpCount <= 1 || BAMAlignment::CodeToCigar[BAMAlignment::GetCigarOpCode(bamOps[0])] != 'I');
+ // copy out cigar info
+ if (format == BAM_CIGAR_OPS) {
+ memcpy(cigarBuf, bamOps, bamBufUsed);
+ if (o_cigarBufUsed != NULL) {
+ *o_cigarBufUsed = bamBufUsed;
+ }
+ } else {
+ bool ok = BAMAlignment::decodeCigar(cigarBuf, cigarBufLen, bamOps, bamOpCount);
+ if (! ok) {
+ return -1;
+ }
+ if (o_cigarBufUsed != NULL) {
+ *o_cigarBufUsed = (int)strlen(cigarBuf) + 1;
+ }
+ }
+ return score;
+ int
+ _uint16* o_linear,
+ int referenceSize,
+ char* cigar,
+ int cigarSize,
+ char* sample,
+ int sampleSize)
+ memset(o_linear, 0, referenceSize * 2); // zero-init
+ int ic = 0, ir = 0, is = 0; // index into cigar, linear/reference, and sample
+ while (ic < cigarSize) {
+ int n = (unsigned char) cigar[ic++];
+ char code = cigar[ic++];
+ int ii, base;
+ for (int i = 0; i < n; i++) {
+ if ((code != 'I' && ir >= referenceSize) || (code != 'D' && is >= sampleSize)) {
+ return ir;
+ }
+ if (code != 'D') {
+ base = sample[is] != 'N' ? BASE_VALUE[sample[is]] : 0;
+ is++;
+ }
+ switch (code) {
+ case '=':
+ ir++;
+ break;
+ case 'X':
+ o_linear[ir++] |= CigarReplace + base;
+ break;
+ case 'D':
+ o_linear[ir++] |= CigarDelete;
+ break;
+ case 'I':
+ ii = (o_linear[ir] >> CigarInsertCShift) & CigarInsertCount;
+ if (ii < 4) {
+ o_linear[ir] = (base << (2 * ii + CigarInsertBShift)) | ((ii + 1) << CigarInsertCShift);
+ } else if (ii < 7) {
+ o_linear[ir] = (o_linear[ir] & CigarInsertBases) | ((ii + 1) << CigarInsertCShift);
+ }
+ break;
+ default:
+ _ASSERT(false);
+ }
+ }
+ }
+ return ir;
+ void
+setLVProbabilities(double *i_indelProbabilities, double *i_phredToProbability, double mutationProbability)
+ lv_indelProbabilities = i_indelProbabilities;
+ // The caller's indel table pointer is stored as-is above (not copied). The loop below rebuilds the phred
+ // table so that it incorporates the mutation probability, assuming that machine errors and mutations
+ // are independent (which there's no reason not to think is the case). If P(A) and P(B) are independent, then
+ // P(A or B) = P(not (not-A and not-B)) = 1-(1-P(A))(1-P(B)).
+ // NOTE(review): the loop stops at i = 254, so lv_phredToProbability[255] is left unchanged -- confirm this is intended.
+ for (unsigned i = 0; i < 255; i++) {
+ lv_phredToProbability[i] = 1.0-(1.0 - i_phredToProbability[i]) * (1.0 - mutationProbability);
+ }
+ void
+    //
+    // Builds the global lv_phredToProbability, lv_indelProbabilities and lv_perfectMatchProbability tables
+    // for Phred+33 quality characters, then calls initializeMapqTables().  Only the first call does any
+    // work; later calls return immediately.
+    //
+    static bool alreadyInitialized = false;
+    if (alreadyInitialized) {
+        return;
+    }
+    alreadyInitialized = true;
+    _ASSERT(NULL == lv_phredToProbability);
+    lv_phredToProbability = (double *)BigAlloc(sizeof(double) * 256);
+    static const int maxIndels = 10000; // Way more than we'll see, and in practice enough to result in p=0.0;
+    const double mutationRate = SNP_PROB;
+    //
+    // Indel probabilities by length: 1.0 for length 0, GAP_OPEN_PROB for a one-base indel, and a further
+    // factor of GAP_EXTEND_PROB for each additional base.  Indices 0..maxIndels are written below, so the
+    // table needs maxIndels + 1 entries.  Allocate it exactly once: a second allocation assigned to the
+    // same pointer would simply leak the first one.
+    //
+    _ASSERT(NULL == lv_indelProbabilities);
+    lv_indelProbabilities = new double[maxIndels+1];
+    lv_indelProbabilities[0] = 1.0;
+    lv_indelProbabilities[1] = GAP_OPEN_PROB;
+    for (int i = 2; i <= maxIndels; i++) {
+        lv_indelProbabilities[i] = lv_indelProbabilities[i-1] * GAP_EXTEND_PROB;
+    }
+    //
+    // Use SNP_PROB as the probability of a real SNP, then or it with the Phred+33 error probability:
+    // P(error or mutation) = 1 - (1 - P(error)) * (1 - P(mutation)).
+    //
+    for (int i = 0; i < 33; i++) {
+        lv_phredToProbability[i] = mutationRate; // This isn't a sensible Phred score
+    }
+    for (int i = 33; i <= 93 + 33; i++) {
+        lv_phredToProbability[i] = 1.0-(1.0 - pow(10.0,-1.0 * (i - 33.0) / 10.0)) * (1.0 - mutationRate);
+    }
+    for (int i = 93 + 33 + 1; i < 256; i++) {
+        lv_phredToProbability[i] = mutationRate; // This isn't a sensible Phred score
+    }
+    //
+    // lv_perfectMatchProbability[n] is the chance that n bases all carry no mutation: (1 - SNP_PROB)^n.
+    //
+    _ASSERT(NULL == lv_perfectMatchProbability);
+    lv_perfectMatchProbability = new double[MaxReadLength+1];
+    lv_perfectMatchProbability[0] = 1.0;
+    for (unsigned i = 1; i <= MaxReadLength; i++) {
+        lv_perfectMatchProbability[i] = lv_perfectMatchProbability[i - 1] * (1 - SNP_PROB);
+    }
+    initializeMapqTables();
+double *lv_phredToProbability = NULL;
+double *lv_indelProbabilities = NULL;
+double *lv_perfectMatchProbability = NULL;
diff --git a/SNAPLib/LandauVishkin.h b/SNAPLib/LandauVishkin.h
new file mode 100644
index 0000000..5f8ad20
--- /dev/null
+++ b/SNAPLib/LandauVishkin.h
@@ -0,0 +1,512 @@
+#pragma once
+#include "Compat.h"
+#include "FixedSizeMap.h"
+#include "BigAlloc.h"
+#include "exit.h"
+#include "Genome.h"
+const int MAX_K = 63;
+// These are global so there are only one for both senses of the template
+extern double *lv_indelProbabilities; // Maps indels by length to probability of occurrence.
+extern double *lv_phredToProbability; // Maps ASCII phred character to probability of error, including the probability of a real mutation
+extern double *lv_perfectMatchProbability; // Probability that a read of this length has no mutations
+struct LVResult {
+ short k;
+ short result;
+ short netIndel;
+ double matchProbability;
+ LVResult() { k = -1; result = -1; netIndel = 0;}
+ LVResult(short k_, short result_, short netIndel_, double matchProbability_) {
+ k = k_;
+ result = result_;
+ netIndel = netIndel_;
+ matchProbability = matchProbability_;
+ }
+ inline bool isValid() { return k != -1; }
+static inline void memsetint(int* p, int value, int count)
+// this is required to get around a GCC optimization bug
+#ifndef _MSC_VER
+ volatile
+ int * q = p;
+ for (int i = 0; i < count; i++) {
+ q[i] = value;
+ }
+// Computes the edit distance between two strings without returning the edits themselves.
+// Set TEXT_DIRECTION to -1 to run backwards through the text.
+template<int TEXT_DIRECTION = 1> class LandauVishkin {
+// Macros to make arrays with negative indices seem "natural" in the code.
+#define L(e,d) L_zero [(e) * (2 * MAX_K + 1) + (d)]
+#define A(e,d) A_zero [(e) * (2 * MAX_K + 1) + (d)]
+ LandauVishkin()
+ if (TEXT_DIRECTION != 1 && TEXT_DIRECTION != -1) {
+ fprintf(stderr, "You can't possibly be serious.\n");
+ soft_exit(1);
+ }
+ memsetint(L_space, -2, (MAX_K + 1) * (2 * MAX_K + 1));
+ L_zero = L_space + MAX_K; // The address of L(0,0)
+ A_zero = A_space + MAX_K; // The address of A(0,0)
+ //
+ // Initialize dTable, which is used to avoid a branch misprediction in our inner loop.
+ // The d values are 0, 1, -1, 2, -2, etc.
+ //
+ for (int i = 0, d = 0; i < 2 * (MAX_K + 1) + 1; i++, d = (d > 0 ? -d : -d+1)) {
+ dTable[i] = d;
+ }
+ void pushBackCacheStats()
+ {
+ if (NULL != cache) {
+ cache->pushBackCacheStats();
+ }
+ }
+ static size_t getBigAllocatorReservation() {return sizeof(LandauVishkin<TEXT_DIRECTION>);} // maybe we should worry about allocating the cache with a BigAllocator, but not for now.
+ ~LandauVishkin()
+ // Compute the edit distance between two strings, if it is <= k, or return -1 otherwise.
+ //
+ // The essential method is to build up the L array row by row. L[e][d] is the farthest that you can get
+ // through the pattern (read data) with e changes (single base substitution, insert or delete) and a net indel of d.
+ // Once you get to the end of the read, you've computed the best edit distance (e). L[e][d] can be computed
+ // by looking at L[e-1][d-1 .. d+1], depending on whether the next change is a deletion, insertion or
+ // substitution.
+ //
+ // Because d can be negative, the L array doesn't really use L[e][d]. Instead, it uses L[e][MAX_K+d], because MAX_K is
+ // the largest possible edit distance, and hence d can never be less than -MAX_K, so MAX_K + d >= 0. However, the L and A macros
+ // conceal this internally.
+ //
+ // Also, because of the way the alignment algorithms work, sometimes SNAP wants to run the edit distance
+ // backward. This is built as a template with TEXT_DIRECTION either 1 for forward or -1 for backward, just to make
+ // it extra confusing.
+ //
+ int computeEditDistance(
+ const char* text,
+ int textLen,
+ const char* pattern,
+ const char *qualityString,
+ int patternLen,
+ int k,
+ double *matchProbability,
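+ int *o_netIndel = NULL) // the net of insertions and deletions in the alignment. Negative for insertions, positive for deletions (and 0 if there are none in net). Filled in only if matchProbability is non-NULL. NOTE(review): the backtrace below adds to it for 'I' and subtracts for 'D' -- confirm the sign convention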
+ int localNetIndel;
+ int d;
+ if (NULL == o_netIndel) {
+ //
+ // If the user doesn't want netIndel, just use a stack local to avoid
+ // having to check it all the time.
+ //
+ o_netIndel = &localNetIndel;
+ }
+ _ASSERT(k < MAX_K);
+ *o_netIndel = 0;
+ k = __min(MAX_K - 1, k); // enforce limit even in non-debug builds
+ if (NULL == text) {
+ // This happens when we're trying to read past the end of the genome.
+ if (NULL != matchProbability) {
+ *matchProbability = 0.0;
+ }
+ return -1;
+ }
+ if (NULL != matchProbability) {
+ //
+ // Start with perfect match probability and work our way down.
+ //
+ *matchProbability = 1.0;
+ }
+ if (TEXT_DIRECTION == -1) {
+ text--; // so now it points at the "first" character of t, not after it.
+ }
+ const char* p = pattern;
+ const char* t = text;
+ int end = __min(patternLen, textLen);
+ const char* pend = pattern + end;
+ L(0, 0) = countPerfectMatch(p, t, end);
+ if (L(0, 0) == end) {
+ int result = (patternLen > end ? patternLen - end : 0); // Could need some deletions at the end
+ if (NULL != matchProbability) {
+ *matchProbability = lv_perfectMatchProbability[patternLen]; // Because the chance of a perfect match is < 1
+ }
+ if (result > k) {
+ //
+ // The deletions at the end pushed us over the score limit.
+ //
+ return -1;
+ }
+ return result;
+ }
+ int lastBestD = MAX_K + 1;
+ int e;
+ for (e = 1; e <= k; e++) {
+ // Search d's in the order 0, 1, -1, 2, -2, etc to find an alignment with as few indels as possible.
+ // dTable is just precomputed d = (d > 0 ? -d : -d+1) to save the branch misprediction from (d > 0)
+ int i =0;
+ for (d = 0; d != e+1 ; i++, d = dTable[i]) {
+ int best = L(e-1, d) + 1; // up
+ A(e, d) = 'X';
+ const char* p = pattern + best;
+ const char* t = (text + d * TEXT_DIRECTION) + best * TEXT_DIRECTION;
+ if (*p == *t && best >= 0) {
+ int end = __min(patternLen, textLen - d);
+ const char* pend = pattern + end;
+ best += countPerfectMatch(p, t, (int)(end - (p - pattern)));
+ }
+ int left = L(e-1, d-1);
+ p = pattern + left;
+ t = (text + d * TEXT_DIRECTION) + left * TEXT_DIRECTION;
+ if (*p == *t && left >= 0) {
+ int end = __min(patternLen, textLen - d);
+ const char* pend = pattern + end;
+ left += countPerfectMatch(p, t, (int)(end - (p - pattern)));
+ }
+ if (left > best) {
+ best = left;
+ A(e, d) = 'D';
+ }
+ int right = L(e-1, d+1) + 1;
+ p = pattern + right;
+ t = (text + d * TEXT_DIRECTION) + right * TEXT_DIRECTION;
+ if (*p == *t && right >= 0) {
+ int end = __min(patternLen, textLen - d);
+ const char* pend = pattern + end;
+ right += countPerfectMatch(p, t, (int)(end - (p - pattern)));
+ }
+ if (right > best) {
+ best = right;
+ A(e, d) = 'I';
+ }
+ if (best == patternLen) {
+ //
+ // We're through on this iteration.
+ //
+ if ('X' == A(e, d)) {
+ //
+ // The last step wasn't an indel, so we're sure it's the right one.
+ //
+ lastBestD = d;
+ goto got_answer;
+ } else {
+ //
+ // We're done on this round, but maybe there's a better answer, so keep looking.
+ //
+ if (abs(d) < abs(lastBestD)) {
+ lastBestD = d;
+ }
+ }
+ } // if best==patternLen
+ L(e, d) = best;
+ } // for d
+ if (MAX_K + 1 != lastBestD) {
+ break;
+ }
+ } // for e
+ if (MAX_K + 1 == lastBestD) {
+ return -1;
+ }
+ _ASSERT(abs(lastBestD) < MAX_K + 1);
+ if (NULL != matchProbability) {
+ _ASSERT(*matchProbability == 1.0);
+ //
+ // We're done. Compute the match probability.
+ //
+ //
+ // Trace backward to build up the CIGAR string. We do this by filling in the backtraceAction,
+ // backtraceMatched and backtraceD arrays, then going through them in the forward direction to
+ // figure out our string.
+ int curD = lastBestD;
+ for (int curE = e; curE >= 1; curE--) {
+ backtraceAction[curE] = A(curE, curD);
+ if (backtraceAction[curE] == 'I') {
+ backtraceD[curE] = curD + 1;
+ backtraceMatched[curE] = L(curE, curD) - L(curE - 1, curD + 1) - 1;
+ } else if (backtraceAction[curE] == 'D') {
+ backtraceD[curE] = curD - 1;
+ backtraceMatched[curE] = L(curE, curD) - L(curE - 1, curD - 1);
+ } else { // backtraceAction[curE] == 'X'
+ backtraceD[curE] = curD;
+ backtraceMatched[curE] = L(curE, curD) - L(curE - 1, curD) - 1;
+ }
+ curD = backtraceD[curE];
+#ifdef TRACE_LV
+ printf("%d %d: %d %c %d %d\n", curE, curD, L(curE, curD),
+ backtraceAction[curE], backtraceD[curE], backtraceMatched[curE]);
+ }
+ int curE = 1;
+ int offset = L(0, 0);
+ _ASSERT(*o_netIndel == 0);
+ while (curE <= e) {
+ // First write the action, possibly with a repeat if it occurred multiple times with no exact matches
+ char action = backtraceAction[curE];
+ int actionCount = 1;
+ while (curE + 1 <= e && backtraceMatched[curE] == 0 && backtraceAction[curE + 1] == action) {
+ actionCount++;
+ curE++;
+ }
+ if (action == 'I') {
+ *matchProbability *= lv_indelProbabilities[actionCount];
+ offset += actionCount;
+ *o_netIndel += actionCount;
+ }
+ else if (action == 'D') {
+ *matchProbability *= lv_indelProbabilities[actionCount];
+ offset -= actionCount;
+ *o_netIndel -= actionCount;
+ }
+ else {
+ _ASSERT(action == 'X');
+ for (int i = 0; i < actionCount; i++) {
+ *matchProbability *= lv_phredToProbability[qualityString[/*BUGBUG - think about what to do here*/__min(patternLen - 1, __max(offset, 0))]];
+ offset++;
+ }
+ }
+ offset += backtraceMatched[curE]; // Skip over the matching bases.
+ curE++;
+ }
+ *matchProbability *= lv_perfectMatchProbability[patternLen - e]; // Accounting for the < 1.0 chance of no changes for matching bases
+ } else {
+ //
+ // Not tracking match probability.
+ //
+ }
+ _ASSERT(e <= k);
+ return e;
+ // Version that does not require match probability and quality string
+ inline int computeEditDistance(
+ const char* text,
+ int textLen,
+ const char* pattern,
+ int patternLen,
+ int k)
+ {
+ return computeEditDistance(text, textLen, pattern, NULL, patternLen, k, NULL);
+ }
+ void *operator new(size_t size) {return BigAlloc(size);}
+ void operator delete(void *ptr) {BigDealloc(ptr);}
+ void *operator new(size_t size, BigAllocator *allocator) {_ASSERT(size == sizeof(LandauVishkin<TEXT_DIRECTION>)); return allocator->allocate(size);}
+ void operator delete(void *ptr, BigAllocator *allocator) {/*Do nothing. The memory is freed when the allocator is deleted.*/}
+ //
+ // Count characters of a perfect match until a mismatch or the end of one or the other string, the
+ // minimum length of which is represented by the end parameter. Advances p & t to the first mismatch
+ // or first character beyond the end.
+ //
+ inline int countPerfectMatch(const char *& p, const char *& t, int availBytes) // This is essentially duplicated in LandauVishkinWithCigar
+ {
+ const char *pBase = p;
+ const char *pend = p + availBytes;
+ while (true) {
+ _uint64 x;
+ if (TEXT_DIRECTION == 1) {
+ x = *((_uint64*)p) ^ *((_uint64*)t);
+ } else {
+ _uint64 T = *(_uint64 *)(t - 7);
+ _uint64 tSwap = ByteSwapUI64(T);
+ x = *((_uint64*)p) ^ tSwap;
+ }
+ if (x) {
+ unsigned long zeroes;
+ CountTrailingZeroes(x, zeroes);
+ zeroes >>= 3;
+ return __min((int)(p - pBase) + (int)zeroes, availBytes);
+ } // if (x)
+ p += 8;
+ if (p >= pend) {
+ return availBytes;
+ }
+ t += 8 * TEXT_DIRECTION;
+ } // while true
+ return 0;
+ }
+ //
+ // Table of d values for the inner loop in computeEditDistance. This allows us to avoid the line d = (d > 0 ? -d : -d+1), which causes
+ // a branch misprediction every time.
+ //
+ int dTable[2 * (MAX_K + 1) + 1];
+ //
+ // Note on state arrays:
+ //
+ // We have several arrays that need to be indexed on edit distance and net indels. Because net indels is signed, we want them
+ // to have their second coordinate (d) run from [-MAX_K .. MAX_K]. When computing the opening or closing of an indel, we add more than
+ // one edit distance, which means we compute LInsert[e][x] based on L[e-OpenPenalty][x+1]. When we're at edit distance < the gap open
+ // penalty, of course we can't fill in LInsert; however, rather than just checking it each time (and incurring a branch prediction miss),
+ // we just let it happily index into negative space, which is initialized with -2 and so will never be used. So, we want our arrays to
+ // run from [-MAX_GAP..MAX_K][-MAX_K .. MAX_K]. To do this, we just allocate the space and compute a pointer that would be at [0][0].
+ // We use macros to do the indexing, because it's tricky to convince C++ to do this kind of thing statically.
+ //
+ // Also, conceptually these arrays are local to each computation. They're here to save memory allocation and initialization overhead.
+ // Note that the important parts for the initialization is never overwritten, though the rest is.
+ //
+ // TODO: For long reads, we should include a version that only has L be 2 x (2*MAX_K+1) cells
+ int L_space[(MAX_K + 1) * (2 * MAX_K + 1)];
+ int *L_zero;
+ // Action we did to get to each position: 'D' = deletion, 'I' = insertion, 'X' = substitution. This is needed to compute match probability.
+ char A_space[(MAX_K + 1) * (2 * MAX_K + 1)];
+ char *A_zero;
+ // Arrays for backtracing the actions required to match two strings
+ char backtraceAction[MAX_K+1];
+ int backtraceMatched[MAX_K+1];
+ int backtraceD[MAX_K+1];
+#undef L
+#undef A
+void setLVProbabilities(double *i_indelProbabilities, double *i_phredToProbability, double mutationProbability);
+void initializeLVProbabilitiesToPhredPlus33();
+// Computes the edit distance between two strings and returns a CIGAR string for the edits.
+enum CigarFormat
+// express cigar as 2 byte per reference base summarizing the changes
+// at that location; may lose information for longer inserts
+enum LinearCigarFlags
+ CigarInsertFlags = 0xfff8, // 13 bits for insert
+ CigarInsertCShift= 3, // shift to get count
+ CigarInsertCount = 0x7, // after shifting, mask to get # insertions
+ CigarInsertBShift= 6, // shift to get bases
+ CigarInsertBases = 0xffc0, // up to 5 inserted bases, low-order bits first
+ CigarInsertNBases= 5,
+ CigarOpcode = 0x07, // opcode for ref base
+ CigarNoop = 0x00, // no change
+ CigarReplace = 0x01, // base is n-1
+ CigarDelete = 0x05, // delete
+class LandauVishkinWithCigar {
+    LandauVishkinWithCigar();
+    // Compute the edit distance between two strings and write the CIGAR string in cigarBuf.
+    // Returns -1 if the edit distance exceeds k or -2 if we run out of space in cigarBuf.
+    int computeEditDistance(const char* text, int textLen, const char* pattern, int patternLen, int k,
+        char* cigarBuf, int cigarBufLen, bool useM,
+        CigarFormat format = COMPACT_CIGAR_STRING,
+        int* o_cigarBufUsed = NULL,
+        int* o_textUsed = NULL,
+        int *o_netIndel = NULL);
+    // same, but places indels as early as possible, following BWA & VCF conventions.
+    // If o_addFrontClipping is non-NULL it receives the count of a leading 'D' op (in which case, when that
+    // count is non-zero, the call returns 0 so the caller can rerun with new clipping), minus the count of a
+    // leading 'I' op, or 0 when the CIGAR starts with neither.  The parameter is named to match the definition.
+    int computeEditDistanceNormalized(const char* text, int textLen, const char* pattern, int patternLen, int k,
+        char* cigarBuf, int cigarBufLen, bool useM,
+        CigarFormat format = COMPACT_CIGAR_STRING,
+        int* o_cigarBufUsed = NULL,
+        int* o_addFrontClipping = NULL,
+        int *o_netIndel = NULL);
+    // take a compact cigar binary format and turn it into one 16-bit entry (two bytes) per reference base
+    // describing the difference from the reference at that location
+    // might lose information for large inserts
+    // returns the reference index reached (number of o_linear entries stepped over), not a byte count
+    static int linearizeCompactBinary(_uint16* o_linear, int referenceSize,
+        char* cigar, int cigarSize, char* sample, int sampleSize);
+    static void printLinear(char* buffer, int bufferSize, unsigned variant);
+    // L[e][MAX_K + d]-style state table, indexed by edit distance and (offset) net indel.
+    int L[MAX_K+1][2 * MAX_K + 1];
+    // Action we did to get to each position: 'D' = deletion, 'I' = insertion, 'X' = substitution.
+    char A[MAX_K+1][2 * MAX_K + 1];
+    //
+    // Total (not net) indels at this point. Parallel to L and A arrays. Used to select the least-indel path
+    // consistent with the lowest edit distance.
+    //
+    int totalIndels[MAX_K + 1][2 * MAX_K + 1];
+    // Arrays for backtracing the actions required to match two strings
+    char backtraceAction[MAX_K+1];
+    int backtraceMatched[MAX_K+1];
+    int backtraceD[MAX_K+1];
diff --git a/SNAPLib/MultiInputReadSupplier.cpp b/SNAPLib/MultiInputReadSupplier.cpp
new file mode 100644
index 0000000..ef38b56
--- /dev/null
+++ b/SNAPLib/MultiInputReadSupplier.cpp
@@ -0,0 +1,298 @@
+Module Name:
+ MultiInputReadSupplier.cpp
+ A read supplier that combines other read suppliers. It's used when there are multiple input files to process.
+ Bill Bolosky, November, 2012
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Read.h"
+#include "Compat.h"
+#include "MultiInputReadSupplier.h"
+MultiInputReadSupplier::MultiInputReadSupplier(int i_nReadSuppliers, ReadSupplier **i_readSuppliers)
+ readSuppliers = i_readSuppliers; // We get to own the array
+ nRemainingReadSuppliers = nReadSuppliers = i_nReadSuppliers;
+ nextReadSupplier = 0;
+ activeReadSuppliers = new ActiveRead[nReadSuppliers];
+ for (int i = 0; i < nReadSuppliers; i++) {
+ activeReadSuppliers[i].index = i;
+ activeReadSuppliers[i].lastBatch = DataBatch();
+ activeReadSuppliers[i].firstReadInNextBatch = NULL;
+ }
+ for (int i = 0; i < nReadSuppliers; i++) {
+ delete readSuppliers[i];
+ readSuppliers[i] = NULL;
+ }
+ delete [] readSuppliers;
+ delete [] activeReadSuppliers;
+ Read *
+ while (true) {
+ if (0 == nRemainingReadSuppliers) {
+ return NULL;
+ }
+ _ASSERT(nextReadSupplier < nRemainingReadSuppliers);
+ ActiveRead* active = &activeReadSuppliers[nextReadSupplier];
+ DataBatch last = active->lastBatch;
+ Read *read;
+ if (active->firstReadInNextBatch != NULL) {
+ read = active->firstReadInNextBatch;
+ active->firstReadInNextBatch = NULL;
+ active->lastBatch = read->getBatch();
+ return read;
+ }
+ read = readSuppliers[active->index]->getNextRead();
+ if (read != NULL) {
+ read->setBatch(DataBatch(read->getBatch().batchID,
+ read->getBatch().fileID * nReadSuppliers + active->index));
+ active->lastBatch = read->getBatch();
+ if (read->getBatch() == last || last == DataBatch()) {
+ return read;
+ }
+ // end of batch from current supplier, round-robin through suppliers
+ active->firstReadInNextBatch = read;
+ nextReadSupplier = (nextReadSupplier + 1) % nRemainingReadSuppliers;
+ } else {
+ //
+ // This supplier is done. Update our array to pull the
+ // last live read supplier into the slot that we just vacated (this will result
+ // in violating a strict round robin, but we don't promise any such thing
+ // anyway). Can't delete because it might be retaining read data in use downstream.
+ //
+ nRemainingReadSuppliers--;
+ activeReadSuppliers[nextReadSupplier] = activeReadSuppliers[nRemainingReadSuppliers];
+ nextReadSupplier = 0; // (Bluntly) handles the case where nextReadSupplier is the last one.
+ }
+ }
+ void
+ DataBatch batch)
+ int index = batch.fileID % nReadSuppliers;
+ _ASSERT(index >= 0 && index < nReadSuppliers);
+ readSuppliers[index]->holdBatch(DataBatch(batch.batchID, batch.fileID / nReadSuppliers));
+ bool
+ DataBatch batch)
+ int index = batch.fileID % nReadSuppliers;
+ _ASSERT(index >= 0 && index < nReadSuppliers);
+ return readSuppliers[index]->releaseBatch(DataBatch(batch.batchID, batch.fileID / nReadSuppliers));
+MultiInputPairedReadSupplier::MultiInputPairedReadSupplier(int i_nReadSuppliers, PairedReadSupplier **i_pairedReadSuppliers)
+ pairedReadSuppliers = i_pairedReadSuppliers; // We get to own the array
+ nRemainingReadSuppliers = nReadSuppliers = i_nReadSuppliers;
+ nextReadSupplier = 0;
+ activeReadSuppliers = new ActiveRead[nReadSuppliers];
+ for (int i = 0; i < nReadSuppliers; i++) {
+ activeReadSuppliers[i].index = i;
+ activeReadSuppliers[i].lastBatch[0] = activeReadSuppliers[i].lastBatch[1] = DataBatch();
+ activeReadSuppliers[i].firstReadInNextBatch[0] = activeReadSuppliers[i].firstReadInNextBatch[1] = NULL;
+ }
+ for (int i = 0; i < nReadSuppliers; i++) {
+ delete pairedReadSuppliers[i];
+ pairedReadSuppliers[i] = NULL;
+ }
+ delete [] pairedReadSuppliers;
+ delete [] activeReadSuppliers;
+ bool
+MultiInputPairedReadSupplier::getNextReadPair(Read **read0, Read **read1)
+    //
+    // Hands out the next read pair, switching round-robin to the next underlying supplier whenever the
+    // current one crosses a batch boundary.  Returns false once every supplier is exhausted.  Before a pair
+    // is returned, each read's batch is re-tagged so that its fileID also encodes which supplier it came
+    // from; holdBatch()/releaseBatch() decode that with fileID % nReadSuppliers and fileID / nReadSuppliers.
+    //
+    if (0 == nRemainingReadSuppliers) {
+        return false;   // bool function: say false rather than NULL
+    }
+    _ASSERT(nextReadSupplier < nRemainingReadSuppliers);
+    ActiveRead* active = &activeReadSuppliers[nextReadSupplier];
+    bool hasReads = pairedReadSuppliers[active->index]->getNextReadPair(read0, read1);
+    bool nextBatch = hasReads && ((*read0)->getBatch() != active->lastBatch[0] || (*read1)->getBatch() != active->lastBatch[1]);
+    if (nextBatch) {
+        // Park the first pair of the new batch; it is handed out when we come back around to this supplier.
+        active->firstReadInNextBatch[0] = *read0;
+        active->firstReadInNextBatch[1] = *read1;
+    }
+    if (nextBatch || ! hasReads) {
+        while (true) {
+            // end of batch from current supplier, round-robin through suppliers
+            if (nextBatch) {
+                nextReadSupplier = (nextReadSupplier + 1) % nRemainingReadSuppliers;
+                nextBatch = false;
+            }
+            active = &activeReadSuppliers[nextReadSupplier];
+            if (active->firstReadInNextBatch[0] != NULL) {
+                _ASSERT(active->firstReadInNextBatch[1] != NULL);
+                *read0 = active->firstReadInNextBatch[0];
+                *read1 = active->firstReadInNextBatch[1];
+                active->firstReadInNextBatch[0] = active->firstReadInNextBatch[1] = NULL;
+                break;
+            }
+            if (pairedReadSuppliers[active->index]->getNextReadPair(read0, read1)) {
+                break;
+            }
+            nRemainingReadSuppliers--;
+            if (0 == nRemainingReadSuppliers) {
+                return false;
+            }
+            //
+            // This supplier is done. Update our array to pull the
+            // last live read supplier into the slot that we just vacated (this will result
+            // in violating a strict round robin, but we don't promise any such thing
+            // anyway). Can't delete because it might be retaining read data in use downstream.
+            //
+            activeReadSuppliers[nextReadSupplier] = activeReadSuppliers[nRemainingReadSuppliers];
+            nextReadSupplier = 0; // (Bluntly) handles the case where nextReadSupplier is the last one.
+        }
+    }
+    //
+    // Remember the supplier's own batch ids (used for the batch-boundary test above), then re-tag the reads.
+    // The arguments follow the (batchID, fileID) order used by MultiInputReadSupplier::getNextRead and by the
+    // decode in holdBatch()/releaseBatch(); passing them the other way round put the supplier index in the
+    // wrong field, so the batch could not be routed back to its supplier.
+    //
+    active->lastBatch[0] = (*read0)->getBatch();
+    (*read0)->setBatch(DataBatch(active->lastBatch[0].batchID, active->lastBatch[0].fileID * nReadSuppliers + active->index));
+    active->lastBatch[1] = (*read1)->getBatch();
+    (*read1)->setBatch(DataBatch(active->lastBatch[1].batchID, active->lastBatch[1].fileID * nReadSuppliers + active->index));
+    return true;
+ void
+ DataBatch batch)
+ int index = batch.fileID % nReadSuppliers;
+ _ASSERT(index >= 0 && index < nReadSuppliers);
+ pairedReadSuppliers[index]->holdBatch(DataBatch(batch.batchID, batch.fileID / nReadSuppliers));
+ bool
+ DataBatch batch)
+ int index = batch.fileID % nReadSuppliers;
+ _ASSERT(index >= 0 && index < nReadSuppliers);
+ return pairedReadSuppliers[index]->releaseBatch(DataBatch(batch.batchID, batch.fileID / nReadSuppliers));
+MultiInputReadSupplierGenerator::MultiInputReadSupplierGenerator(int i_nReadSuppliers, ReadSupplierGenerator **i_readSupplierGenerators)
+ nReadSuppliers = i_nReadSuppliers;
+ readSupplierGenerators = i_readSupplierGenerators; // We take ownership of the array
+ for (int i = 0; i < nReadSuppliers; i++) {
+ delete readSupplierGenerators[i];
+ }
+ delete [] readSupplierGenerators;
+ readSupplierGenerators = NULL;
+ ReadSupplier *
+    //
+    // Asks every underlying generator for a new supplier and wraps the ones that produced one in a
+    // MultiInputReadSupplier.  Returns NULL when none of them produced a supplier.
+    //
+    // Generators that return NULL are skipped.  The member nReadSuppliers is deliberately left alone:
+    // shrinking it here would keep later generators from ever being asked and would make the destructor
+    // skip deleting them.
+    //
+    ReadSupplier **readSuppliers = new ReadSupplier *[nReadSuppliers];
+    int nGenerated = 0;
+    for (int i = 0; i < nReadSuppliers; i++) {
+        ReadSupplier *supplier = readSupplierGenerators[i]->generateNewReadSupplier();
+        if (NULL != supplier) {
+            readSuppliers[nGenerated] = supplier;
+            nGenerated++;
+        }
+    }
+    if (0 == nGenerated) {
+        delete [] readSuppliers;
+        return NULL;
+    }
+    return new MultiInputReadSupplier(nGenerated,readSuppliers); // The Supplier owns the array and suppliers we created
+ ReaderContext*
+ return readSupplierGenerators[0]->getContext();
+MultiInputPairedReadSupplierGenerator::MultiInputPairedReadSupplierGenerator(int i_nReadSuppliers, PairedReadSupplierGenerator **i_readSupplierGenerators)
+ nReadSuppliers = i_nReadSuppliers;
+ readSupplierGenerators = i_readSupplierGenerators; // We own the array and the generators.
+ for (int i = 0; i < nReadSuppliers; i++) {
+ delete readSupplierGenerators[i];
+ readSupplierGenerators[i] = NULL;
+ }
+ delete [] readSupplierGenerators;
+ readSupplierGenerators = NULL;
+ PairedReadSupplier *
+    //
+    // Asks every underlying generator for a new paired supplier and wraps the ones that produced one in a
+    // MultiInputPairedReadSupplier.  Returns NULL when none of them produced a supplier.
+    //
+    // Generators that return NULL are skipped.  The member nReadSuppliers is deliberately left alone:
+    // shrinking it here would keep later generators from ever being asked and would make the destructor
+    // skip deleting them.
+    //
+    PairedReadSupplier **readSuppliers = new PairedReadSupplier *[nReadSuppliers];
+    int nGenerated = 0;
+    for (int i = 0; i < nReadSuppliers; i++) {
+        PairedReadSupplier *supplier = readSupplierGenerators[i]->generateNewPairedReadSupplier();
+        if (NULL != supplier) {
+            readSuppliers[nGenerated] = supplier;
+            nGenerated++;
+        }
+    }
+    if (0 == nGenerated) {
+        delete [] readSuppliers;
+        return NULL;
+    }
+    return new MultiInputPairedReadSupplier(nGenerated,readSuppliers); // The Supplier owns the array and suppliers we created
+ ReaderContext*
+ return readSupplierGenerators[0]->getContext();
diff --git a/SNAPLib/MultiInputReadSupplier.h b/SNAPLib/MultiInputReadSupplier.h
new file mode 100644
index 0000000..a3cb193
--- /dev/null
+++ b/SNAPLib/MultiInputReadSupplier.h
@@ -0,0 +1,111 @@
+Module Name:
+ MultiInputReadSupplier.h
+ Headers for a read supplier that combines other read suppliers. It's used when there are multiple input files to process.
+ Bill Bolosky, November, 2012
+ User mode service.
+Revision History:
+#pragma once
+#include "Read.h"
+#include "Compat.h"
+class MultiInputReadSupplier: public ReadSupplier { // Presents several ReadSuppliers (one per input) through the single ReadSupplier interface
+ MultiInputReadSupplier(int nReadSuppliers, ReadSupplier **i_readSuppliers); // The generator that builds this object documents that the supplier owns the array and the suppliers in it
+ virtual ~MultiInputReadSupplier();
+ virtual Read *getNextRead();
+ virtual void holdBatch(DataBatch batch);
+ virtual bool releaseBatch(DataBatch batch);
+ // info for a currently active supplier
+ struct ActiveRead
+ {
+ int index; // index in readSuppliers array
+ DataBatch lastBatch; // last batch read from this supplier
+ Read* firstReadInNextBatch; // NOTE(review): presumably a read already fetched from the following batch and held back -- confirm against the .cpp
+ };
+ int nRemainingReadSuppliers; // NOTE(review): presumably the number of entries of activeReadSuppliers still in use -- confirm
+ int nReadSuppliers; // size of the readSuppliers array as passed to the constructor -- TODO confirm it is never changed afterwards
+ int nextReadSupplier; // NOTE(review): looks like a rotating cursor over the active suppliers -- confirm
+ ReadSupplier **readSuppliers;
+ ActiveRead *activeReadSuppliers;
+class MultiInputPairedReadSupplier: public PairedReadSupplier { // Paired-end counterpart: presents several PairedReadSuppliers through the single PairedReadSupplier interface
+ MultiInputPairedReadSupplier(int nReadSuppliers, PairedReadSupplier **i_pairedReadSuppliers); // NOTE(review): presumably takes ownership of the array and its suppliers, like the single-end class -- confirm
+ virtual ~MultiInputPairedReadSupplier();
+ virtual bool getNextReadPair(Read **read0, Read **read1);
+ virtual void holdBatch(DataBatch batch);
+ virtual bool releaseBatch(DataBatch batch);
+ // info for a currently active supplier
+ struct ActiveRead
+ {
+ int index; // index in readSuppliers array
+ DataBatch lastBatch[2]; // last batch read from this supplier
+ Read* firstReadInNextBatch[2]; // two entries, presumably one per read of the pair -- TODO confirm
+ };
+ int nRemainingReadSuppliers; // NOTE(review): presumably the number of entries of activeReadSuppliers still in use -- confirm
+ int nReadSuppliers; // size of the pairedReadSuppliers array as passed to the constructor -- TODO confirm it is never changed afterwards
+ int nextReadSupplier; // NOTE(review): looks like a rotating cursor over the active suppliers -- confirm
+ PairedReadSupplier **pairedReadSuppliers;
+ ActiveRead *activeReadSuppliers;
+class MultiInputReadSupplierGenerator: public ReadSupplierGenerator // Builds MultiInputReadSuppliers from one ReadSupplierGenerator per input
+ MultiInputReadSupplierGenerator(int i_nReadSuppliers, ReadSupplierGenerator **i_readSupplierGenerators); // takes ownership of the generator array (per the comment in the constructor)
+ virtual ~MultiInputReadSupplierGenerator();
+ virtual ReadSupplier *generateNewReadSupplier(); // may return NULL when there is nothing left to supply
+ virtual ReaderContext* getContext(); // returns the context of the first underlying generator
+ int nReadSuppliers; // number of entries in readSupplierGenerators
+ ReadSupplierGenerator **readSupplierGenerators;
+class MultiInputPairedReadSupplierGenerator: public PairedReadSupplierGenerator // Builds MultiInputPairedReadSuppliers from one PairedReadSupplierGenerator per input
+ MultiInputPairedReadSupplierGenerator(int i_nReadSuppliers, PairedReadSupplierGenerator **i_readSupplierGenerators); // owns the array and the generators (per the comment in the constructor)
+ virtual ~MultiInputPairedReadSupplierGenerator();
+ virtual PairedReadSupplier *generateNewPairedReadSupplier(); // may return NULL when there is nothing left to supply
+ virtual ReaderContext* getContext(); // returns the context of the first underlying generator
+ int nReadSuppliers; // number of entries in readSupplierGenerators
+ PairedReadSupplierGenerator **readSupplierGenerators;
diff --git a/SNAPLib/PairedAligner.cpp b/SNAPLib/PairedAligner.cpp
new file mode 100644
index 0000000..89bad2f
--- /dev/null
+++ b/SNAPLib/PairedAligner.cpp
@@ -0,0 +1,657 @@
+Module Name:
+ PairedAligner.cpp
+ Functions for running the paired end aligner sub-program.
+ Matei Zaharia, February, 2012
+ User mode service.
+Revision History:
+ Adapted from cSNAP, which was in turn adapted from the scala prototype
+// TODO: This is really similar to the single-end aligner overall. It would be nice
+// to avoid code duplication.
+#include "stdafx.h"
+#include "options.h"
+#include <time.h>
+#include "Compat.h"
+#include "RangeSplitter.h"
+#include "GenomeIndex.h"
+#include "SAM.h"
+#include "ChimericPairedEndAligner.h"
+#include "Tables.h"
+#include "AlignerOptions.h"
+#include "AlignerContext.h"
+#include "AlignerStats.h"
+#include "FASTQ.h"
+#include "PairedAligner.h"
+#include "MultiInputReadSupplier.h"
+#include "Util.h"
+#include "IntersectingPairedEndAligner.h"
+#include "exit.h"
+#include "Error.h"
+using namespace std;
+using util::stringEndsWith;
+static const int DEFAULT_MIN_SPACING = 50;
+static const int DEFAULT_MAX_SPACING = 1000;
+//
+// Statistics gathered by the paired-end aligner on top of the common AlignerStats: a histogram of
+// distances between mates, a 2-d histogram of the two mates' scores, and several histograms keyed
+// by MAPQ and by power-of-two buckets of time, small-hit count and LV-call count.
+//
+struct PairedAlignerStats : public AlignerStats
+{
+    // TODO: make these constants configurable
+    static const int MAX_DISTANCE = 1000;
+    static const int MAX_SCORE = 15;
+
+    _int64 sameComplement;
+    _int64* distanceCounts; // histogram of distances
+    // TODO: could save a bit of memory & time since this is a triangular matrix
+    _int64* scoreCounts; // 2-d histogram of scores for paired ends
+
+    static const unsigned maxMapq = 70;
+    static const unsigned nTimeBuckets = 32;
+    static const unsigned nHitsBuckets = 32;
+    static const unsigned nLVCallsBuckets = 32;
+
+    _int64 alignTogetherByMapqHistogram[maxMapq+1][nTimeBuckets];
+    _int64 totalTimeByMapqHistogram[maxMapq+1][nTimeBuckets];
+    _int64 nSmallHitsByTimeHistogram[nHitsBuckets][nTimeBuckets];
+    _int64 nLVCallsByTimeHistogram[nLVCallsBuckets][nTimeBuckets];
+    _int64 mapqByNLVCallsHistogram[maxMapq+1][nLVCallsBuckets];
+    _int64 mapqByNSmallHitsHistogram[maxMapq+1][nHitsBuckets];
+
+    PairedAlignerStats(AbstractStats* i_extra = NULL);
+
+    virtual ~PairedAlignerStats();
+
+    // Count one pair at the given distance; distances outside [0, MAX_DISTANCE] are clamped.
+    inline void incrementDistance(int distance) {
+        distanceCounts[max(0, min(MAX_DISTANCE, distance))]++;
+    }
+
+    // Count one pair with the given scores; each score is clamped to [0, MAX_SCORE] and the pair
+    // is stored with the smaller score first, so only the upper triangle of the matrix is used.
+    inline void incrementScore(int s0, int s1)
+    {
+        // ensure s0 <= s1, both within range
+        s0 = max(0, min(MAX_SCORE, s0));
+        s1 = max(0, min(MAX_SCORE, s1));
+        if (s0 > s1) {
+            int t = s0; s0 = s1; s1 = t;
+        }
+        scoreCounts[s0*(MAX_SCORE+1)+s1]++;
+    }
+
+    //
+    // Record one align-together result.  Time, hit count and LV-call count are each mapped to a
+    // bucket equal to the number of times the value can be halved before reaching zero, capped at
+    // the last bucket of the corresponding histogram.  mapq must be <= maxMapq.
+    //
+    inline void recordAlignTogetherMapqAndTime(unsigned mapq, _int64 timeInNanos, unsigned nSmallHits, unsigned nLVCalls) {
+        _ASSERT(mapq <= maxMapq);
+
+        int timeBucket;
+        _int64 dividedTime = timeInNanos;
+        for (timeBucket = 0; timeBucket < nTimeBuckets-1; timeBucket++) {
+            if (dividedTime == 0) break;
+            dividedTime /= 2;
+        }
+        alignTogetherByMapqHistogram[mapq][timeBucket]++;
+        totalTimeByMapqHistogram[mapq][timeBucket] += timeInNanos;
+
+        //
+        // The loop stops at the last valid bucket (as the time loop above does) so the index can
+        // never become nHitsBuckets, and the working copy stays unsigned so that counts of 2^31
+        // and above are not reinterpreted as negative numbers.
+        //
+        int nHitsBucket;
+        unsigned dividedHits = nSmallHits;
+        for (nHitsBucket = 0; nHitsBucket < nHitsBuckets-1; nHitsBucket++) {
+            if (0 == dividedHits) break;
+            dividedHits /= 2;
+        }
+        _ASSERT((char *)&nSmallHitsByTimeHistogram[nHitsBucket][timeBucket] < (char *)(this + 1));
+        nSmallHitsByTimeHistogram[nHitsBucket][timeBucket]++;
+
+        // Same capping and unsigned arithmetic for the LV-call count.
+        int nLVCallsBucket;
+        unsigned dividedLVCalls = nLVCalls;
+        for (nLVCallsBucket = 0; nLVCallsBucket < nLVCallsBuckets-1; nLVCallsBucket++) {
+            if (dividedLVCalls == 0) break;
+            dividedLVCalls /= 2;
+        }
+        _ASSERT((char *)&nLVCallsByTimeHistogram[nLVCallsBucket][timeBucket] < (char *)(this + 1));
+        nLVCallsByTimeHistogram[nLVCallsBucket][timeBucket]++;
+
+        _ASSERT((char *)&mapqByNLVCallsHistogram[mapq][nLVCallsBucket] < (char *)(this + 1));
+        mapqByNLVCallsHistogram[mapq][nLVCallsBucket]++;
+
+        _ASSERT((char *)&mapqByNSmallHitsHistogram[mapq][nHitsBucket] < (char *)(this + 1));
+        mapqByNSmallHitsHistogram[mapq][nHitsBucket]++;
+    }
+
+    virtual void add(const AbstractStats * other);
+
+    virtual void printHistograms(FILE* output);
+};
+const int PairedAlignerStats::MAX_DISTANCE;
+const int PairedAlignerStats::MAX_SCORE;
+//
+// Build an empty statistics object: the two heap histograms are allocated with BigAlloc and
+// cleared, and every in-object histogram is set to zero.
+//
+PairedAlignerStats::PairedAlignerStats(AbstractStats* i_extra)
+    : AlignerStats(i_extra),
+    sameComplement(0)
+{
+    // Distance histogram: one slot per distance in [0, MAX_DISTANCE].
+    const int distanceBytes = sizeof(_int64) * (MAX_DISTANCE+1);
+    distanceCounts = (_int64*)BigAlloc(distanceBytes);
+    memset(distanceCounts, 0, distanceBytes);
+
+    // Score histogram: a square (MAX_SCORE+1) x (MAX_SCORE+1) matrix stored row by row.
+    const int scoreBytes = sizeof(_int64) * (MAX_SCORE+1)*(MAX_SCORE+1);
+    scoreCounts = (_int64*)BigAlloc(scoreBytes);
+    memset(scoreCounts, 0, scoreBytes);
+
+    // The fixed-size histograms live inside the object, so each can be cleared in one go.
+    memset(alignTogetherByMapqHistogram, 0, sizeof(alignTogetherByMapqHistogram));
+    memset(totalTimeByMapqHistogram, 0, sizeof(totalTimeByMapqHistogram));
+    memset(mapqByNSmallHitsHistogram, 0, sizeof(mapqByNSmallHitsHistogram));
+    memset(mapqByNLVCallsHistogram, 0, sizeof(mapqByNLVCallsHistogram));
+    memset(nSmallHitsByTimeHistogram, 0, sizeof(nSmallHitsByTimeHistogram));
+    memset(nLVCallsByTimeHistogram, 0, sizeof(nLVCallsByTimeHistogram));
+}
+ BigDealloc(distanceCounts);
+ BigDealloc(scoreCounts);
+//
+// Accumulate another PairedAlignerStats into this one.  i_other must really be a
+// PairedAlignerStats (it is cast without a check).  The base-class counters are merged by
+// AlignerStats::add; every paired-end counter and histogram declared in this struct is merged here.
+//
+void PairedAlignerStats::add(const AbstractStats * i_other)
+{
+    AlignerStats::add(i_other);
+    PairedAlignerStats* other = (PairedAlignerStats*) i_other;
+
+    // This scalar used to be left out, so the merged total only reflected the receiving object.
+    sameComplement += other->sameComplement;
+
+    for (int i = 0; i < MAX_DISTANCE + 1; i++) {
+        distanceCounts[i] += other->distanceCounts[i];
+    }
+    for (int i = 0; i < (MAX_SCORE + 1) * (MAX_SCORE + 1); i++) {
+        scoreCounts[i] += other->scoreCounts[i];
+    }
+
+    // Histograms whose first dimension is MAPQ.
+    for (unsigned mapq = 0; mapq <= maxMapq; mapq++) {
+        for (unsigned timeBucket = 0; timeBucket < nTimeBuckets; timeBucket++) {
+            alignTogetherByMapqHistogram[mapq][timeBucket] += other->alignTogetherByMapqHistogram[mapq][timeBucket];
+            totalTimeByMapqHistogram[mapq][timeBucket] += other->totalTimeByMapqHistogram[mapq][timeBucket];
+        }
+        for (unsigned smallHits = 0; smallHits < nHitsBuckets; smallHits++) {
+            mapqByNSmallHitsHistogram[mapq][smallHits] += other->mapqByNSmallHitsHistogram[mapq][smallHits];
+        }
+        for (unsigned lvCalls = 0; lvCalls < nLVCallsBuckets; lvCalls++) {
+            mapqByNLVCallsHistogram[mapq][lvCalls] += other->mapqByNLVCallsHistogram[mapq][lvCalls];
+        }
+    }
+
+    // Histograms whose second dimension is the time bucket.
+    for (unsigned timeBucket = 0; timeBucket < nTimeBuckets; timeBucket++) {
+        for (unsigned smallHits = 0; smallHits < nHitsBuckets; smallHits++) {
+            nSmallHitsByTimeHistogram[smallHits][timeBucket] += other->nSmallHitsByTimeHistogram[smallHits][timeBucket];
+        }
+        for (unsigned lvCalls = 0; lvCalls < nLVCallsBuckets; lvCalls++) {
+            nLVCallsByTimeHistogram[lvCalls][timeBucket] += other->nLVCallsByTimeHistogram[lvCalls][timeBucket];
+        }
+    }
+}
+void PairedAlignerStats::printHistograms(FILE* output)
+ AlignerStats::printHistograms(output); // Only the base-class histograms are emitted; the paired-end histograms kept in this struct are not printed here.
+//
+// Paired-end options start from the common AlignerOptions and add the mate-spacing settings.
+// minSpacing/maxSpacing get the defaults advertised by the usage text (DEFAULT_MIN_SPACING and
+// DEFAULT_MAX_SPACING), so they hold defined values when -s is not given.
+//
+PairedAlignerOptions::PairedAlignerOptions(const char* i_commandLine)
+    : AlignerOptions(i_commandLine, true),
+    minSpacing(DEFAULT_MIN_SPACING),
+    maxSpacing(DEFAULT_MAX_SPACING),
+    forceSpacing(false),
+    quicklyDropUnpairedReads(true)
+{
+    // NOTE(review): intersectingAlignerMaxHits and maxCandidatePoolSize are not initialised in the
+    // visible text and the names of their default constants are not visible either -- confirm that
+    // they receive their documented defaults (-H / -mcp) before they are read.
+}
+void PairedAlignerOptions::usageMessage()
+ AlignerOptions::usageMessage(); // base-class usage text first, then the paired-end-specific options below
+ WriteErrorMessage( // NOTE(review): the format has four %d placeholders; the matching arguments and the closing parenthesis are not present in this text
+ "\n"
+ " -s min and max spacing to allow between paired ends (default: %d %d).\n"
+ " -fs force spacing to lie between min and max.\n"
+ " -H max hits for intersecting aligner (default: %d).\n"
+ " -mcp specifies the maximum candidate pool size (An internal data structure. \n"
+ " Only increase this if you get an error message saying to do so. If you're running\n"
+ " out of memory, you may want to reduce it. Default: %d)\n"
+ " -F b additional option to -F to require both mates to satisfy filter (default is just one)\n"
+ " If you specify -F b together with one of the other -F options, -F b MUST be second\n"
+ " -ku Keep unpaired-looking reads in SAM/BAM input. Ordinarily, if a read doesn't specify\n"
+ " mate information (RNEXT field is * and/or PNEXT is 0) then the code that matches reads will immdeiately\n"
+ " discard it. Specifying this flag may cause large memory usage for some input files,\n"
+ " but may be necessary for some strangely formatted input files. You'll also need to specify this\n"
+ " flag for SAM/BAM files that were aligned by a single-end aligner.\n"
+ ,
+//
+// Try to consume the paired-end-specific option at argv[n].  On success n is advanced past any
+// option arguments (the caller is left pointing at the last word consumed) and true is returned.
+// false means the option was recognised but its arguments are missing.  Anything not recognised
+// here is handed to AlignerOptions::parse.
+//
+bool PairedAlignerOptions::parse(const char** argv, int argc, int& n, bool *done)
+{
+    *done = false;
+    const char *option = argv[n];
+
+    // Switches that take no argument.
+    if (0 == strcmp(option, "-fs")) {
+        forceSpacing = true;
+        return true;
+    }
+    if (0 == strcmp(option, "-ku")) {
+        quicklyDropUnpairedReads = false;
+        return true;
+    }
+
+    // -s <min> <max>
+    if (0 == strcmp(option, "-s")) {
+        if (n + 2 >= argc) {
+            return false;
+        }
+        minSpacing = atoi(argv[n+1]);
+        maxSpacing = atoi(argv[n+2]);
+        n += 2;
+        return true;
+    }
+
+    // -H <max hits>
+    if (0 == strcmp(option, "-H")) {
+        if (n + 1 >= argc) {
+            return false;
+        }
+        intersectingAlignerMaxHits = atoi(argv[n+1]);
+        n += 1;
+        return true;
+    }
+
+    // -mcp <pool size>
+    if (0 == strcmp(option, "-mcp")) {
+        if (n + 1 >= argc) {
+            return false;
+        }
+        maxCandidatePoolSize = atoi(argv[n+1]);
+        n += 1;
+        return true;
+    }
+
+    // "-F b" is handled here; every other -F form falls through to the base class.
+    if (0 == strcmp(option, "-F") && n + 1 < argc && 0 == strcmp(argv[n + 1],"b")) {
+        filterFlags |= FilterBothMatesMatch;
+        n += 1;
+        return true;
+    }
+
+    return AlignerOptions::parse(argv, argc, n, done);
+}
+PairedAlignerContext::PairedAlignerContext(AlignerExtension* i_extension)
+ : AlignerContext( 0, NULL, NULL, i_extension) // only the extension is forwarded; the first three base-class arguments are passed as 0/NULL
+//
+// Run the common initialisation, then copy the paired-end settings out of the options object
+// into this context.  Always returns true.
+//
+bool PairedAlignerContext::initialize()
+{
+    // NOTE(review): whatever AlignerContext::initialize() returns is not examined here.
+    AlignerContext::initialize();
+
+    // The options object of a paired-end run is treated as a PairedAlignerOptions.
+    PairedAlignerOptions* pairedOptions = (PairedAlignerOptions*) options;
+
+    // Settings shared with the base options.
+    noUkkonen = options->noUkkonen;
+    noOrderedEvaluation = options->noOrderedEvaluation;
+
+    // Paired-end-specific settings.
+    minSpacing = pairedOptions->minSpacing;
+    maxSpacing = pairedOptions->maxSpacing;
+    forceSpacing = pairedOptions->forceSpacing;
+    maxCandidatePoolSize = pairedOptions->maxCandidatePoolSize;
+    intersectingAlignerMaxHits = pairedOptions->intersectingAlignerMaxHits;
+    ignoreMismatchedIDs = pairedOptions->ignoreMismatchedIDs;
+    quicklyDropUnpairedReads = pairedOptions->quicklyDropUnpairedReads;
+
+    return true;
+}
+AlignerStats* PairedAlignerContext::newStats()
+ return new PairedAlignerStats(); // heap-allocated with no extra-stats object; disposal is left to the caller
+void PairedAlignerContext::runTask()
+ ParallelTask<PairedAlignerContext> task(this); // wrap this context in a ParallelTask...
+ task.run(); // ...and run it; NOTE(review): presumably this is what calls runIterationThread() on each worker thread -- confirm
+//
+// Per-thread work loop of the paired-end aligner.  Obtains a paired read supplier, and then either
+//   - lets the extension handle the whole thread,
+//   - copies input to output unaligned when there is no index, or
+//   - aligns every pair, filters the primary/secondary results and hands them to the writer.
+// All aligner state for the thread lives in one BigAllocator that is released at the end.
+//
+void PairedAlignerContext::runIterationThread()
+{
+    PreventMachineHibernationWhileThisThreadIsAlive();
+
+    PairedReadSupplier *supplier = pairedReadSupplierGenerator->generateNewPairedReadSupplier();
+
+    if (NULL == supplier) {
+        //
+        // No work for this thread to do.
+        //
+        return;
+    }
+
+    if (extension->runIterationThread(supplier, this)) {
+        delete supplier;
+        return;
+    }
+
+    Read *reads[NUM_READS_PER_PAIR];
+    int nSingleResults[2] = { 0, 0 };
+
+    if (index == NULL) {
+        // no alignment, just input/output
+        PairedAlignmentResult result;
+        memset(&result, 0, sizeof(result));
+        result.location[0] = result.location[1] = InvalidGenomeLocation;
+
+        while (supplier->getNextReadPair(&reads[0],&reads[1])) {
+            // Check that the two IDs form a pair; they will usually be foo/1 and foo/2 for some foo.
+            if (!ignoreMismatchedIDs && !readIdsMatch(reads[0], reads[1])) {
+                // Copy at most 200 characters of each ID into NUL-terminated buffers for the message.
+                unsigned n[2] = {min(reads[0]->getIdLength(), 200u), min(reads[1]->getIdLength(), 200u)};
+                char* p[2] = {(char*) alloca(n[0] + 1), (char*) alloca(n[1] + 1)};
+                memcpy(p[0], reads[0]->getId(), n[0]); p[0][n[0]] = 0;
+                memcpy(p[1], reads[1]->getId(), n[1]); p[1][n[1]] = 0;
+                WriteErrorMessage( "Unmatched read IDs '%s' and '%s'. Use the -I option to ignore this.\n", p[0], p[1]);
+                soft_exit(1);
+            }
+            stats->totalReads += 2;
+
+            bool pass0 = options->passFilter(reads[0], result.status[0], false);
+            bool pass1 = options->passFilter(reads[1], result.status[1], false);
+            bool pass = (options->filterFlags & AlignerOptions::FilterBothMatesMatch)
+                ? (pass0 && pass1) : (pass0 || pass1);
+
+            if (pass && NULL != readWriter) {
+                readWriter->writePairs(readerContext, reads, &result, 1, NULL, nSingleResults, true);
+            }
+        }
+        delete supplier;
+        return;
+    }
+
+    //
+    // Size one allocator for both aligners plus the result buffers, then build everything in it.
+    //
+    int maxReadSize = MAX_READ_LENGTH;
+    size_t memoryPoolSize = IntersectingPairedEndAligner::getBigAllocatorReservation(index, intersectingAlignerMaxHits, maxReadSize, index->getSeedLength(),
+                                numSeedsFromCommandLine, seedCoverage, maxDist, extraSearchDepth, maxCandidatePoolSize,
+                                maxSecondaryAlignmentsPerContig);
+
+    memoryPoolSize += ChimericPairedEndAligner::getBigAllocatorReservation(index, maxReadSize, maxHits, index->getSeedLength(), numSeedsFromCommandLine, seedCoverage, maxDist,
+                        extraSearchDepth, maxCandidatePoolSize, maxSecondaryAlignmentsPerContig);
+
+    unsigned maxPairedSecondaryHits;
+    unsigned maxSingleSecondaryHits;
+
+    if (maxSecondaryAlignmentAdditionalEditDistance < 0) {
+        maxPairedSecondaryHits = 0;
+        maxSingleSecondaryHits = 0;
+    } else {
+        maxPairedSecondaryHits = IntersectingPairedEndAligner::getMaxSecondaryResults(numSeedsFromCommandLine, seedCoverage, maxReadSize, maxHits, index->getSeedLength(), minSpacing, maxSpacing);
+        maxSingleSecondaryHits = ChimericPairedEndAligner::getMaxSingleEndSecondaryResults(numSeedsFromCommandLine, seedCoverage, maxReadSize, maxHits, index->getSeedLength());
+    }
+
+    memoryPoolSize += maxPairedSecondaryHits * sizeof(PairedAlignmentResult) + maxSingleSecondaryHits * sizeof(SingleAlignmentResult);
+
+    BigAllocator *allocator = new BigAllocator(memoryPoolSize);
+
+    IntersectingPairedEndAligner *intersectingAligner = new (allocator) IntersectingPairedEndAligner(index, maxReadSize, maxHits, maxDist, numSeedsFromCommandLine,
+                                                                seedCoverage, minSpacing, maxSpacing, intersectingAlignerMaxHits, extraSearchDepth,
+                                                                maxCandidatePoolSize, maxSecondaryAlignmentsPerContig ,allocator, noUkkonen, noOrderedEvaluation, noTruncation);
+
+    ChimericPairedEndAligner *aligner = new (allocator) ChimericPairedEndAligner(
+        index,
+        maxReadSize,
+        maxHits,
+        maxDist,
+        numSeedsFromCommandLine,
+        seedCoverage,
+        minWeightToCheck,
+        forceSpacing,
+        extraSearchDepth,
+        noUkkonen,
+        noOrderedEvaluation,
+        noTruncation,
+        intersectingAligner,
+        minReadLength,
+        maxSecondaryAlignmentsPerContig,
+        allocator);
+
+    allocator->checkCanaries();
+
+    PairedAlignmentResult *results = (PairedAlignmentResult *)allocator->allocate((1 + maxPairedSecondaryHits) * sizeof(*results)); // 1 + is for the primary result
+    SingleAlignmentResult *singleSecondaryResults = (SingleAlignmentResult *)allocator->allocate(maxSingleSecondaryHits * sizeof(*singleSecondaryResults));
+
+    ReadWriter *readWriter = this->readWriter;
+
+#ifdef _MSC_VER
+    if (options->useTimingBarrier) {
+        if (0 == InterlockedDecrementAndReturnNewValue(nThreadsAllocatingMemory)) {
+            AllowEventWaitersToProceed(memoryAllocationCompleteBarrier);
+        } else {
+            WaitForEvent(memoryAllocationCompleteBarrier);
+        }
+    }
+#endif // _MSC_VER
+
+    // Align the reads.
+    _uint64 lastReportTime = timeInMillis();
+    _uint64 readsWhenLastReported = 0;
+
+    while (supplier->getNextReadPair(&reads[0],&reads[1])) {
+        // Check that the two IDs form a pair; they will usually be foo/1 and foo/2 for some foo.
+        if (!ignoreMismatchedIDs) {
+            Read::checkIdMatch(reads[0], reads[1]);
+        }
+
+        stats->totalReads += 2;
+
+        // Skip the pair if there are too many Ns or 2s.
+        int maxDist = this->maxDist;
+        bool useful0 = reads[0]->getDataLength() >= minReadLength && (int)reads[0]->countOfNs() <= maxDist;
+        bool useful1 = reads[1]->getDataLength() >= minReadLength && (int)reads[1]->countOfNs() <= maxDist;
+        if (!useful0 && !useful1) {
+            // Neither read can be aligned: report the pair as not found (subject to the filter).
+            PairedAlignmentResult result;
+            result.status[0] = NotFound;
+            result.status[1] = NotFound;
+            result.location[0] = InvalidGenomeLocation;
+            result.location[1] = InvalidGenomeLocation;
+            nSingleResults[0] = nSingleResults[1] = 0;
+            bool pass0 = options->passFilter(reads[0], result.status[0], false);
+            bool pass1 = options->passFilter(reads[1], result.status[1], false);
+            bool pass = (options->filterFlags & AlignerOptions::FilterBothMatesMatch)
+                ? (pass0 && pass1) : (pass0 || pass1);
+            if (pass && NULL != readWriter) {
+                readWriter->writePairs(readerContext, reads, &result, 1, NULL, nSingleResults, true);
+            }
+            continue;
+        } else {
+            // Here one of the reads might still be hopeless, but maybe we can align the other.
+            stats->usefulReads += (useful0 && useful1) ? 2 : 1;
+        }
+
+        if (AlignerOptions::useHadoopErrorMessages && stats->totalReads % 10000 == 0 && timeInMillis() - lastReportTime > 10000) {
+            fprintf(stderr,"reporter:counter:SNAP,readsAligned,%lu\n",stats->totalReads - readsWhenLastReported);
+            readsWhenLastReported = stats->totalReads;
+            lastReportTime = timeInMillis();
+        }
+
+        _int64 startTime = timeInNanos();
+
+        int nSecondaryResults;
+        int nSingleSecondaryResults[2];
+
+        // results[0] receives the primary result; secondary paired results go to results[1..].
+        aligner->align(reads[0], reads[1], results, maxSecondaryAlignmentAdditionalEditDistance, maxPairedSecondaryHits, &nSecondaryResults, results + 1,
+            maxSingleSecondaryHits, maxSecondaryAlignments, &nSingleSecondaryResults[0], &nSingleSecondaryResults[1], singleSecondaryResults);
+
+        _int64 runTime = timeInNanos() - startTime;
+
+        int timeBucket = min(30, cheezyLogBase2(runTime));
+        stats->countByTimeBucket[timeBucket]++;
+        stats->nanosByTimeBucket[timeBucket] += runTime;
+
+        if (forceSpacing && isOneLocation(results[0].status[0]) != isOneLocation(results[0].status[1])) {
+            // either both align or neither do
+            results[0].status[0] = results[0].status[1] = NotFound;
+            results[0].location[0] = results[0].location[1] = InvalidGenomeLocation;
+        }
+
+        if (NULL != readWriter) {
+            bool firstIsPrimary = true;
+            //
+            // Valid entries are results[0 .. nSecondaryResults] (primary + secondaries).  The bound
+            // must not go one further: results[nSecondaryResults + 1] was never filled in by align()
+            // and, when the secondary buffer is full, lies outside the allocated array.
+            //
+            for (int i = 0; i <= nSecondaryResults; i++) {  // Loop runs to <= nSecondaryResults because there's a primary result, too.
+                bool pass0 = options->passFilter(reads[0], results[i].status[0], false);
+                bool pass1 = options->passFilter(reads[1], results[i].status[1], false);
+                bool pass = (options->filterFlags & AlignerOptions::FilterBothMatesMatch)
+                    ? (pass0 && pass1) : (pass0 || pass1);
+
+                if (!pass) {
+                    //
+                    // Remove this one from the list by copying the last one here.
+                    //
+                    results[i] = results[nSecondaryResults];
+                    nSecondaryResults--;
+                    if (0 == i) {
+                        firstIsPrimary = false;
+                    }
+                    i--;    // re-examine the entry that was just moved into slot i
+                }
+            }
+
+            //
+            // Now check the single secondary alignments
+            //
+            SingleAlignmentResult *singleResults[2] = { singleSecondaryResults, singleSecondaryResults + nSingleSecondaryResults[0] };
+            for (int whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+                for (int whichAlignment = 0; whichAlignment < nSingleSecondaryResults[whichRead]; whichAlignment++) {
+                    if (!options->passFilter(reads[whichRead], singleResults[whichRead][whichAlignment].status, false)) {
+                        singleResults[whichRead][whichAlignment] = singleResults[whichRead][nSingleSecondaryResults[whichRead] - 1];
+                        nSingleSecondaryResults[whichRead]--;
+                        whichAlignment--;
+                    }
+                }
+            }
+
+            readWriter->writePairs(readerContext, reads, results, nSecondaryResults + 1, singleResults, nSingleSecondaryResults, firstIsPrimary);
+        }
+
+        updateStats((PairedAlignerStats*) stats, reads[0], reads[1], &results[0], useful0, useful1);
+    }
+
+    stats->lvCalls = aligner->getLocationsScored();
+
+    allocator->checkCanaries();
+
+    // Both aligners were placement-constructed in the allocator, so they are destroyed explicitly
+    // and their memory goes away with the allocator.
+    aligner->~ChimericPairedEndAligner();
+    delete supplier;
+
+    intersectingAligner->~IntersectingPairedEndAligner();
+    delete allocator;
+}
+//
+// Fold the primary result of one read pair into the paired-end statistics: per-read hit
+// classification (useful reads only), MAPQ histogram, and the pair-level counters.
+//
+void PairedAlignerContext::updateStats(PairedAlignerStats* stats, Read* read0, Read* read1, PairedAlignmentResult* result, bool useful0, bool useful1)
+{
+    const bool useful[2] = { useful0, useful1 };
+
+    // Classify each useful read as single hit, multiple hits or not found.
+    for (int whichRead = 0; whichRead < 2; whichRead++) {
+        if (!useful[whichRead]) {
+            continue;
+        }
+        if (isOneLocation(result->status[whichRead])) {
+            stats->singleHits++;
+        } else if (result->status[whichRead] == MultipleHits) {
+            stats->multiHits++;
+        } else {
+            _ASSERT(result->status[whichRead] == NotFound);
+            stats->notFound++;
+        }
+    }
+
+    // Add in MAPQ stats for every read that was found, useful or not.
+    for (int whichRead = 0; whichRead < 2; whichRead++) {
+        if (result->status[whichRead] == NotFound) {
+            continue;
+        }
+        int mapq = result->mapq[whichRead];
+        _ASSERT(mapq >= 0 && mapq <= AlignerStats::maxMapq);
+        stats->mapqHistogram[mapq]++;
+    }
+
+    // Pair-level counters.
+    if (result->direction[0] == result->direction[1]) {
+        stats->sameComplement++;
+    }
+
+    const bool bothUnique = isOneLocation(result->status[0]) && isOneLocation(result->status[1]);
+    if (bothUnique) {
+        stats->incrementDistance(abs((int) (result->location[0] - result->location[1])));
+        stats->incrementScore(result->score[0], result->score[1]);
+    }
+
+    if (result->fromAlignTogether) {
+        stats->recordAlignTogetherMapqAndTime(__max(result->mapq[0], result->mapq[1]), result->nanosInAlignTogether, result->nSmallHits, result->nLVCalls);
+    }
+
+    if (result->alignedAsPair) {
+        stats->alignedAsPairs += 2; // They are a pair, after all. Hence, +2.
+    }
+}
+ void
+ if (1 == options->nInputs) { // NOTE(review): the function-name line is not present in this text; presumably this is typeSpecificBeginIteration() -- confirm
+ //
+ // We've only got one input, so just connect it directly to the consumer.
+ //
+ pairedReadSupplierGenerator = options->inputs[0].createPairedReadSupplierGenerator(options->numThreads, quicklyDropUnpairedReads, readerContext);
+ } else {
+ //
+ // We've got multiple inputs, so use a MultiInputReadSupplier to combine the individual inputs.
+ //
+ PairedReadSupplierGenerator **generators = new PairedReadSupplierGenerator *[options->nInputs];
+ // use separate context for each supplier, initialized from common
+ for (int i = 0; i < options->nInputs; i++) {
+ ReaderContext context(readerContext); // NOTE(review): this copy goes out of scope at the end of the iteration; verify the generator does not keep a reference to it
+ generators[i] = options->inputs[i].createPairedReadSupplierGenerator(options->numThreads, quicklyDropUnpairedReads, context);
+ }
+ pairedReadSupplierGenerator = new MultiInputPairedReadSupplierGenerator(options->nInputs,generators); // the multi-input generator owns the array and the generators from here on
+ }
+ ReaderContext* context = pairedReadSupplierGenerator->getContext(); // copy the header fields the generator ended up with back into our own readerContext
+ readerContext.header = context->header;
+ readerContext.headerBytes = context->headerBytes;
+ readerContext.headerLength = context->headerLength;
+ readerContext.headerMatchesIndex = context->headerMatchesIndex;
+ void
+ if (readerContext.header != NULL) { // NOTE(review): the function-name line is not present in this text; presumably this is typeSpecificNextIteration() -- confirm
+ delete [] readerContext.header; // the header pointer copied from the generator's context is freed here as an array
+ readerContext.header = NULL;
+ readerContext.headerLength = readerContext.headerBytes = 0;
+ readerContext.headerMatchesIndex = false;
+ }
+ delete pairedReadSupplierGenerator; // drop the generator used for the iteration that just finished
+ pairedReadSupplierGenerator = NULL;
diff --git a/SNAPLib/PairedAligner.h b/SNAPLib/PairedAligner.h
new file mode 100644
index 0000000..d5cf20a
--- /dev/null
+++ b/SNAPLib/PairedAligner.h
@@ -0,0 +1,91 @@
+Module Name:
+ PairedAligner.h
+ Headers for running the paired end aligner sub-program.
+ Matei Zaharia, February, 2012
+ User mode service.
+Revision History:
+ Adapted from cSNAP, which was in turn adapted from the scala prototype
+#pragma once
+#include "stdafx.h"
+#include "AlignerContext.h"
+#include "ReadSupplierQueue.h"
+struct PairedAlignerStats;
+class PairedAlignerContext : public AlignerContext // Aligner context for paired-end runs (isPaired() returns true)
+ PairedAlignerContext(AlignerExtension* i_extension = NULL);
+ // AlignerContext
+ virtual bool initialize(); // copies the paired-end settings out of the options object
+ virtual AlignerStats* newStats(); // returns a new PairedAlignerStats
+ virtual void runTask();
+ virtual void runIterationThread(); // per-thread read/align/write loop
+ // for subclasses
+ virtual void updateStats(PairedAlignerStats* stats, Read* read0, Read* read1, PairedAlignmentResult* result, bool useful0, bool useful1);
+ bool isPaired() {return true;}
+ virtual void typeSpecificBeginIteration();
+ virtual void typeSpecificNextIteration();
+ PairedReadSupplierGenerator *pairedReadSupplierGenerator; // source of the per-thread paired read suppliers
+ int minSpacing; // the fields below mirror the PairedAlignerOptions members of the same name
+ int maxSpacing;
+ bool forceSpacing;
+ unsigned intersectingAlignerMaxHits;
+ unsigned maxCandidatePoolSize;
+ const char *fastqFile1; // NOTE(review): not referenced anywhere in the visible PairedAligner.cpp text -- possibly unused
+ bool ignoreMismatchedIDs; // when false, read IDs of the two mates are checked for a match
+ bool quicklyDropUnpairedReads;
+ friend class AlignerContext2;
+struct PairedAlignerOptions : public AlignerOptions // Command-line options specific to the paired-end aligner
+ PairedAlignerOptions(const char* i_commandLine);
+ virtual void usageMessage();
+ virtual bool parse(const char** argv, int argc, int& n, bool *done); // handles -s, -fs, -H, -mcp, -ku and "-F b"; everything else goes to AlignerOptions::parse
+ virtual bool isPaired() { return true; }
+ int minSpacing; // -s, first value
+ int maxSpacing; // -s, second value
+ bool forceSpacing; // -fs
+ unsigned intersectingAlignerMaxHits; // -H
+ unsigned maxCandidatePoolSize; // -mcp
+ bool quicklyDropUnpairedReads; // cleared by -ku
diff --git a/SNAPLib/PairedEndAligner.h b/SNAPLib/PairedEndAligner.h
new file mode 100644
index 0000000..0f29d6f
--- /dev/null
+++ b/SNAPLib/PairedEndAligner.h
@@ -0,0 +1,62 @@
+Module Name:
+ PairedEndAligner.h
+ Superclass for implementations of paired end aligners
+ Matei Zaharia, December, 2011
+ User mode service.
+Revision History:
+#pragma once
+#include "AlignmentResult.h"
+#include "directions.h"
+#include "LandauVishkin.h"
+#include "Read.h"
+ * Abstract interface for paired-end aligners.
+ */
+class PairedEndAligner
+ virtual ~PairedEndAligner() {}
+ virtual void align( // pure virtual: every concrete paired-end aligner must implement it
+ Read *read0,
+ Read *read1,
+ PairedAlignmentResult *result, // NOTE(review): presumably receives the primary result -- confirm
+ int maxEditDistanceForSecondaryResults,
+ int secondaryResultBufferSize,
+ int *nSecondaryResults, // NOTE(review): presumably set to the number of entries written to secondaryResults -- confirm
+ PairedAlignmentResult *secondaryResults, // The caller passes in a buffer of secondaryResultBufferSize and it's filled in by align()
+ int singleSecondaryBufferSize,
+ int maxSecondaryAlignmentsToReturn,
+ int *nSingleEndSecondaryResultsForFirstRead,
+ int *nSingleEndSecondaryResultsForSecondRead,
+ SingleAlignmentResult *singleEndSecondaryResults // Single-end secondary alignments for when the paired-end alignment didn't work properly
+ ) = 0;
+ virtual void setLandauVishkin( // default implementation does nothing; subclasses may override
+ LandauVishkin<1> *landauVishkin,
+ LandauVishkin<-1> *reverseLandauVishkin)
+ {
+ }
+ virtual _int64 getLocationsScored() const = 0;
diff --git a/SNAPLib/PairedReadMatcher.cpp b/SNAPLib/PairedReadMatcher.cpp
new file mode 100644
index 0000000..cd0cb55
--- /dev/null
+++ b/SNAPLib/PairedReadMatcher.cpp
@@ -0,0 +1,433 @@
+Module Name:
+ PairedReadMatcher.cpp
+ Match paired-end reads coming in from different streams
+ Bill Bolosky, November, 2012
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include <map>
+#include "Compat.h"
+#include "Util.h"
+#include "Read.h"
+#include "DataReader.h"
+#include "VariableSizeMap.h"
+#include "PairedEndAligner.h"
+#include "SAM.h"
+#include "Error.h"
+// turn on to debug matching process
+// turn on to gather paired stats
+//#define STATISTICS
+using std::pair;
+class PairedReadMatcher: public PairedReadReader // Pairs up reads delivered one at a time by a single ReadReader, matching them by a hash of the read id
+ PairedReadMatcher(ReadReader* i_single, bool i_quicklyDropUnpairedReads);
+ // PairedReadReader
+ virtual ~PairedReadMatcher();
+ virtual bool getNextReadPair(Read *read1, Read *read2);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) // forwarded unchanged to the wrapped reader
+ { single->reinit(startingOffset, amountOfFileToProcess); }
+ virtual void holdBatch(DataBatch batch) // forwarded unchanged to the wrapped reader
+ { single->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch);
+ virtual ReaderContext* getContext() // forwarded unchanged to the wrapped reader
+ { return single->getContext(); }
+ ReadWithOwnMemory* allocOverflowRead();
+ void freeOverflowRead(ReadWithOwnMemory* read);
+ ReadReader* single; // reader for single reads
+ typedef _uint64 StringHash; // key type: 64-bit hash of the truncated read id
+ typedef VariableSizeMap<StringHash,Read> ReadMap;
+ DataBatch currentBatch; // for dropped reads
+ bool allDroppedInCurrentBatch;
+ DataBatch batch[2]; // 0 = current, 1 = previous
+ ReadMap unmatched[2]; // read id hash -> Read still waiting for its mate; index matches batch[]
+ typedef VariableSizeMap<PairedReadMatcher::StringHash,ReadWithOwnMemory*,150,MapNumericHash<PairedReadMatcher::StringHash>,80,0,true> OverflowMap;
+ OverflowMap overflow; // read id hash -> copy of a read whose batch has already rolled out of unmatched[1]
+ typedef VariableSizeVector<ReadWithOwnMemory*> OverflowReadVector;
+ OverflowReadVector blocks; // BigAlloc blocks
+ static const int BlockSize = 10000; // # ReadWithOwnMemory per block
+ ExclusiveLock blockLock; // protects adding to blocks list
+ ReadWithOwnMemory* freeList; // head of free list, NULL if empty, use interlocked ops to update
+ typedef VariableSizeMap<_uint64,OverflowReadVector*> OverflowReadReleaseMap;
+ OverflowReadReleaseMap overflowRelease; // batch key -> overflow reads to return to the free list when that batch is released
+ typedef VariableSizeMap<StringHash,char*> StringMap;
+ StringMap strings; // read id hash -> id text; used for hash-collision checks and diagnostic output
+ typedef VariableSizeMap<StringHash,int> HashSet;
+ HashSet overflowUsed; // hashes of reads that were later matched out of the overflow map
+ int overflowTotal, overflowPeak;
+ bool quicklyDropUnpairedReads;
+ _uint64 nReadsQuicklyDropped;
+ Read localRead; // scratch Read filled by single->getNextRead()
+ typedef struct
+ {
+ _int64 oldPairs; // # pairs matched from overflow
+ _int64 oldBatches; // # distinct batches matched from overflow
+ _int64 internalPairs; // #pairs matched within batch
+ _int64 previousPairs; // #pairs matched with previous batch
+ _int64 overflowPairs; // #pairs left over
+ _int64 totalReads; // total reads in batch
+ void clear() { memset(this, 0, sizeof(*this)); } // zeroes every counter
+ } BatchStats;
+ BatchStats currentStats, totalStats;
+ VariableSizeMap<_int64,int> currentBatches;
+ ReadReader* i_single, // constructor parameter list (the qualified-name line is not visible in this excerpt)
+ bool i_quicklyDropUnpairedReads)
+ : single(i_single),
+ overflowTotal(0), overflowPeak(0),
+ quicklyDropUnpairedReads(i_quicklyDropUnpairedReads),
+ nReadsQuicklyDropped(0), freeList(NULL),
+ currentBatch(0, 0), allDroppedInCurrentBatch(false)
+ new (&unmatched[0]) VariableSizeMap<StringHash,Read>(10000); // placement-new over the member to re-create it with a constructor argument of 10000
+ new (&unmatched[1]) VariableSizeMap<StringHash,Read>(10000);
+ InitializeExclusiveLock(&blockLock);
+ currentStats.clear();
+ totalStats.clear();
+ for (OverflowReadVector::iterator i = blocks.begin(); i != blocks.end(); i++) { // cleanup: give back every BigAlloc'd block of overflow reads
+ BigDealloc(*i);
+ }
+ delete single; // the wrapped reader is deleted here
+ DestroyExclusiveLock(&blockLock);
+ ReadWithOwnMemory* // allocOverflowRead: take one ReadWithOwnMemory slot off the free list, growing the list by a block when empty
+ while (true) { // retry until a compare-exchange on freeList succeeds
+ ReadWithOwnMemory* next = freeList;
+ if (next == NULL) {
+ // alloc & init a new block of reads
+ ReadWithOwnMemory* block = (ReadWithOwnMemory*) BigAlloc(BlockSize * sizeof(ReadWithOwnMemory)); // NOTE(review): result is used without a NULL check -- confirm BigAlloc cannot return NULL
+ AcquireExclusiveLock(&blockLock);
+ blocks.push_back(block);
+ ReleaseExclusiveLock(&blockLock);
+ for (int i = 0; i < BlockSize - 1; i++) {
+ *(ReadWithOwnMemory**)&block[i] = &block[i+1]; // the first pointer-sized bytes of each free slot hold the link to the next slot
+ }
+ while (true) {
+ ReadWithOwnMemory* head = freeList;
+ *(ReadWithOwnMemory**)&block[BlockSize-1] = head; // last slot of the new block links to the current head
+ if (InterlockedCompareExchangePointerAndReturnOldValue((void*volatile*)&freeList, block, head) == head) {
+ break; // block published; the outer loop now pops a slot from it
+ }
+ }
+ } else {
+ ReadWithOwnMemory* nextHead = *(ReadWithOwnMemory**)next;
+ if (InterlockedCompareExchangePointerAndReturnOldValue((void*volatile*)&freeList, nextHead, next) == next) {
+ return next; // popped; the slot is raw memory for the caller to construct into
+ }
+ }
+ }
+ void // freeOverflowRead: push a slot back onto the free list
+ ReadWithOwnMemory* read)
+ while (true) { // retry until the compare-exchange on freeList succeeds
+ ReadWithOwnMemory* head = freeList;
+ *(ReadWithOwnMemory**)read = head; // overwrite the start of the slot with the link to the old head
+ if (InterlockedCompareExchangePointerAndReturnOldValue((void*volatile*)&freeList, read, head) == head) {
+ return;
+ }
+ }
+ bool // getNextReadPair: returns true with a matched pair copied into read1/read2, false once the underlying reader is exhausted
+ Read *read1,
+ Read *read2)
+ Read *outputReads[NUM_READS_PER_PAIR];
+ outputReads[0] = read1;
+ outputReads[1] = read2;
+ int readOneToOutputRead; // This is used to determine which of the output reads corresponds to one (the read that just came from getNextRead())
+ // That, in turn, is determined by the S/BAM flags in the read saying whether it was first-in-template.
+ int skipped = 0; // reads consumed by this call without completing a pair
+ while (true) {
+ if (skipped++ == 10000) { // fires once per call, when the count reaches exactly 10000
+ WriteErrorMessage( "warning: no matching read pairs in 10,000 reads, input file might be unsorted or have unexpected read id format\n");
+ }
+ if (! single->getNextRead(&localRead)) { // end of input: report leftovers, release both held batches, return false
+ WriteErrorMessage("overflow total %d, peak %d\n", overflowTotal, overflowPeak);
+ int n = unmatched[0].size() + unmatched[1].size() + overflow.size();
+ if (n > 0) {
+ WriteErrorMessage( " warning: PairedReadMatcher discarding %d unpaired reads at eof\n", n);
+ int printed = 0;
+ char buffer[200];
+ for (OverflowMap::iterator i = overflow.begin(); i != overflow.end() && printed < 10; i = overflow.next(i)) {
+ int l = min((unsigned) sizeof(buffer)-1, i->value->getIdLength()); // clamp so the id plus terminator fits in buffer
+ memcpy(buffer, i->value->getId(), l);
+ buffer[l] = 0;
+ WriteErrorMessage("%s\n", buffer);
+ printed++;
+ }
+ for (int i = 0; i < 2; i++) {
+ fprintf(stdout, "unmatched[%d]\n", i);
+ for (ReadMap::iterator j = unmatched[i].begin(); j != unmatched[i].end(); j = unmatched[i].next(j)) {
+ fprintf(stdout, "%s\n", strings[j->key]);
+ }
+ }
+ int printed = 0; // NOTE(review): second declaration of `printed` in this scope as shown; the enclosing preprocessor conditionals appear to be missing from this excerpt
+ fprintf(stdout, "sample of overflow\n");
+ for (OverflowMap::iterator o = overflow.begin(); printed < 500 && o != overflow.end(); o = overflow.next(o)) {
+ if (NULL == overflowUsed.tryFind(o->key)) {
+ printed++;
+ fprintf(stdout, "%s\n", strings[o->key]);
+ }
+ }
+ }
+ if (nReadsQuicklyDropped > 0) {
+ WriteErrorMessage(" warning: PairedReadMatcher dropped %lld reads because they didn't have RNEXT and PNEXT filled in.\n"
+ " If your input file was generated by a single-end alignment (or this seems too big), use the -ku flag\n",
+ nReadsQuicklyDropped);
+ }
+ single->releaseBatch(batch[0]);
+ single->releaseBatch(batch[1]);
+ return false;
+ }
+ if (quicklyDropUnpairedReads) {
+ if (localRead.getOriginalPNEXT() == 0 || localRead.getOriginalRNEXTLength() == 1 && localRead.getOriginalRNEXT()[0] == '*') { // && binds tighter than ||: drop when PNEXT is 0, or when RNEXT is exactly "*"
+ nReadsQuicklyDropped++;
+ skipped--; // dropped reads do not count toward the 10,000-read warning
+ continue;
+ }
+ }
+ if (localRead.getOriginalSAMFlags() & SAM_FIRST_SEGMENT) { // the first-segment flag picks the output slot for localRead
+ readOneToOutputRead = 0;
+ } else {
+ readOneToOutputRead = 1;
+ }
+ // build key for pending read table, removing /1 or /2 at end
+ const char* id = localRead.getId();
+ unsigned idLength = localRead.getIdLength();
+ // truncate at space or slash
+ char* slash = (char*) memchr((void*)id, '/', idLength);
+ if (slash != NULL) {
+ idLength = (unsigned)(slash - id);
+ }
+ char* space = (char*) memchr((void*)id, ' ', idLength); // searched only within the already slash-truncated prefix
+ if (space != NULL) {
+ idLength = (unsigned)(space - id);
+ }
+ StringHash key = util::hash64(id, idLength);
+ char* s = new char[idLength+1]; // NOTE(review): not freed on the path where the key is already present in `strings` -- confirm (likely debug-only code)
+ memcpy(s, id, idLength);
+ s[idLength] = 0;
+ char** p = strings.tryFind(key);
+ if (p != NULL && strcmp(*p, s)) { // same hash, different id text
+ WriteErrorMessage( "hash collision %ld of %s and %s\n", key, *p, s);
+ soft_exit(1);
+ }
+ if (p == NULL) {
+ strings.put(key, s);
+ }
+ if (localRead.getBatch() != batch[0]) { // the read comes from a new batch: fold statistics, then roll the batches over
+ currentStats.oldBatches = currentBatches.size();
+ currentStats.overflowPairs = unmatched[1].size();
+ totalStats.internalPairs += currentStats.internalPairs;
+ totalStats.previousPairs += currentStats.previousPairs;
+ totalStats.oldBatches += currentStats.oldBatches;
+ totalStats.oldPairs += currentStats.oldPairs;
+ totalStats.overflowPairs += currentStats.overflowPairs;
+ totalStats.totalReads += currentStats.totalReads;
+ fprintf(stderr,"batch %d:%d: internal %d pairs, previous %d pairs, old %d pairs from %d batches, overflow %d pairs\n"
+ "cumulative: internal %d pairs, previous %d pairs, old %d pairs from %d batches, overflow %d pairs\n",
+ batch[0].fileID, batch[0].batchID, currentStats.internalPairs, currentStats.previousPairs, currentStats.oldPairs, currentStats.oldBatches, currentStats.overflowPairs,
+ totalStats.internalPairs, totalStats.previousPairs, totalStats.oldPairs, totalStats.oldBatches, totalStats.overflowPairs);
+ currentStats.clear();
+ currentBatches.clear();
+ // roll over batches
+ if (unmatched[1].size() > 0) {
+ // copy remaining reads into overflow map
+ //fprintf(stderr,"warning: PairedReadMatcher overflow %d unpaired reads from %d:%d\n", unmatched[1].size(), batch[1].fileID, batch[1].batchID); //!!
+ //char* buf = (char*) alloca(500);
+ for (ReadMap::iterator r = unmatched[1].begin(); r != unmatched[1].end(); r = unmatched[1].next(r)) {
+ ReadWithOwnMemory* p = allocOverflowRead();
+ new (p) ReadWithOwnMemory(r->value); // construct a copy of the read in the slot taken from the free list
+ _ASSERT(p->getData()[0]);
+ overflow.put(r->key, p);
+ char*s2 = *strings.tryFind(r->key);
+ int len = strlen(s2);
+ _ASSERT(! strncmp(s2, r->value.getId(), len));
+ ReadWithOwnMemory* rd = overflow.tryFind(r->key);
+ _ASSERT(! strncmp(s2, rd->getId(), len));
+ //memcpy(buf, r->value.getId(), r->value.getIdLength());
+ //buf[r->value.getIdLength()] = 0;
+ //fprintf(stderr, "overflow add %d:%d %s\n", batch[1].fileID, batch[1].batchID, buf);
+ }
+ overflowTotal += unmatched[1].size();
+ overflowPeak = max(overflow.size(), overflowPeak);
+ }
+ for (ReadMap::iterator i = unmatched[1].begin(); i != unmatched[1].end(); i = unmatched[1].next(i)) {
+ i->value.dispose();
+ }
+ unmatched[1].exchange(unmatched[0]); // the current map becomes the "previous" map
+ unmatched[0].clear();
+ single->releaseBatch(batch[1]);
+ batch[1] = batch[0];
+ batch[0] = localRead.getBatch();
+ single->holdBatch(batch[0]);
+ currentStats.totalReads++;
+ }
+ ReadMap::iterator found = unmatched[0].find(key); // first look for the mate among the current batch's unmatched reads
+ if (found != unmatched[0].end()) {
+ *outputReads[1-readOneToOutputRead] = found->value;
+ //fprintf(stderr, "current matched %d:%d->%d:%d %s\n", outputReads[1-readOneToOutputRead]->getBatch().fileID, outputReads[1-readOneToOutputRead]->getBatch().batchID, batch[0].fileID, batch[0].batchID, outputReads[1-readOneToOutputRead]->getId()); //!!
+ unmatched[0].erase(found->key);
+ currentStats.internalPairs++;
+ } else {
+ // try previous batch
+ found = unmatched[1].find(key);
+ if (found == unmatched[1].end()) {
+ // try overflow
+ OverflowMap::iterator found2 = overflow.find(key);
+ if (found2 == overflow.end()) {
+ // no match, remember it for later matching
+ unmatched[0].put(key, localRead);
+ _ASSERT(localRead.getData()[0] && unmatched[0][key].getData()[0]);
+ //fprintf(stderr, "unmatched add %d:%d %lx\n", batch[0].fileID, batch[0].batchID, key); //!!
+ continue;
+ } else {
+ // copy data into read, move from overflow table to release vector for current batch
+ found2->value->setBatch(batch[0]); // overwrite batch to match current
+ *outputReads[1-readOneToOutputRead] = * (Read*) found2->value;
+ _ASSERT(outputReads[1-readOneToOutputRead]->getData()[0]);
+ OverflowReadVector* v;
+ if (! overflowRelease.tryGet(batch[0].asKey(), &v)) {
+ v = new OverflowReadVector();
+ overflowRelease.put(batch[0].asKey(), v);
+ //fprintf(stderr,"overflow fetch into %d:%d\n", batch[0].fileID, batch[0].batchID);
+ }
+ v->push_back(found2->value); // returned to the free list when releaseBatch() is called for batch[0]
+ overflow.erase(key);
+ //fprintf(stderr,"overflow matched %d:%d %s\n", read2->getBatch().fileID, read2->getBatch().batchID, read2->getId()); //!!
+ overflowUsed.put(key, 1);
+ currentStats.oldPairs++;
+ currentBatches.put(read2->getBatch().asKey(), 1); // NOTE(review): reads read2 rather than outputReads[1-readOneToOutputRead] -- confirm this is intended
+ }
+ } else {
+ // found a match in preceding batch
+ *outputReads[1-readOneToOutputRead] = found->value;
+ //fprintf(stderr,"prior matched %d:%d->%d:%d %s\n", found->value.getBatch().fileID, found->gvalue.etBatch().batchID, batch[0].fileID, batch[0].batchID, found->value.getId()); //!!
+ unmatched[1].erase(found->key);
+ currentStats.previousPairs++;
+ }
+ }
+ // found a match
+ *outputReads[readOneToOutputRead] = localRead;
+ return true;
+ }
+ bool // releaseBatch: release a batch on the wrapped reader and recycle overflow reads that were parked against it
+ DataBatch batch)
+ if (batch.asKey() == 0) { // key 0 is treated as "nothing to release" and reported as success
+ return true;
+ } else if (single->releaseBatch(batch)) {
+ OverflowReadVector* v = NULL;
+ if (overflowRelease.tryGet(batch.asKey(), &v)) {
+ // free memory for overflow reads
+ //fprintf(stderr, "PairedReadMatcher release %d overflow reads for batch %d:%d\n", v->size(), batch.fileID, batch.batchID);
+ for (OverflowReadVector::iterator i = v->begin(); i != v->end(); i++) {
+ freeOverflowRead(*i); // slot goes back on the lock-free free list
+ }
+ delete v;
+ overflowRelease.erase(batch.asKey());
+ }
+ return true;
+ } else {
+ return false; // the wrapped reader did not release the batch, so overflow reads stay parked
+ }
+// define static factory function
+ PairedReadReader* // factory: wraps `single` in a heap-allocated PairedReadMatcher (the function-name line is not visible in this excerpt)
+ ReadReader* single,
+ bool quicklyDropUnpairedReads)
+ return new PairedReadMatcher(single, quicklyDropUnpairedReads);
diff --git a/SNAPLib/ParallelTask.cpp b/SNAPLib/ParallelTask.cpp
new file mode 100644
index 0000000..c473ad8
--- /dev/null
+++ b/SNAPLib/ParallelTask.cpp
@@ -0,0 +1,144 @@
+Module Name:
+ ParallelTask.cpp
+ Parallel task management
+ User mode service.
+#include "stdafx.h"
+#include "ParallelTask.h"
+#include "Error.h"
+using std::max;
+// Sets up a coworker pool: one workReady/workDone event pair and one worker
+// (created and configured through i_manager) per thread, plus the `finished`
+// waiter that stop() blocks on.  No threads are launched here; start() does that.
+//
+// i_numThreads        number of worker threads (and event pairs) to prepare
+// i_bindToProcessors  copied into the WorkerContext by start()
+// i_manager           supplies workers and receives beginStep()/finishStep()
+// i_callback          if non-NULL, step() is asynchronous and this is invoked
+//                     by thread 0 when a step completes
+// i_parameter         opaque argument handed to i_callback
+ParallelCoworker::ParallelCoworker(int i_numThreads, bool i_bindToProcessors, ParallelWorkerManager* i_manager, Callback i_callback, void* i_parameter)
+    : stopped(false), numThreads(i_numThreads), bindToProcessors(i_bindToProcessors), manager(i_manager), callback(i_callback), parameter(i_parameter),
+      // context and task are only assigned in start(), but the destructor
+      // deletes both unconditionally; start them as NULL so destroying a
+      // never-started coworker does not delete indeterminate pointers.
+      context(NULL), task(NULL)
+{
+    workReady = new EventObject[numThreads];
+    workDone = new EventObject[numThreads];
+    // always allocate at least one slot so the array new is never zero-length
+    workers = new ParallelWorker*[max(numThreads, 1)];
+    for (int i = 0; i < numThreads; i++) {
+        // both events start closed: workers block until step(), and step()
+        // blocks until each worker reports completion
+        CreateEventObject(&workReady[i]);
+        PreventEventWaitersFromProceeding(&workReady[i]);
+        CreateEventObject(&workDone[i]);
+        PreventEventWaitersFromProceeding(&workDone[i]);
+        workers[i] = manager->createWorker();
+        workers[i]->configure(manager, i, numThreads);
+    }
+    CreateSingleWaiterObject(&finished);
+}
+ for (int i = 0; i < numThreads; i++) { // teardown: destroy each thread's event pair and its worker
+ DestroyEventObject(&workReady[i]);
+ DestroyEventObject(&workDone[i]);
+ delete workers[i];
+ }
+ delete [] workReady;
+ delete [] workDone;
+ delete [] workers;
+ delete task; // task and context are assigned in start(); NOTE(review): `finished` is created in the constructor but no matching destroy call is visible here -- confirm
+ delete context;
+void ParallelCoworker::start() // builds the shared WorkerContext and forks a ParallelTask that runs the worker threads
+ context = new WorkerContext();
+ context->shared = this; // gives every worker thread access to this coworker's events and workers
+ context->totalThreads = numThreads;
+ context->bindToProcessors = bindToProcessors;
+#ifdef _MSC_VER
+ context->useTimingBarrier = false; // field exists only under _MSC_VER; the closing #endif is not visible in this excerpt
+ task = new ParallelTask<WorkerContext>(context);
+ task->fork(); // run() executes on a separate thread, so start() returns immediately
+void ParallelCoworker::step() // releases every worker for one unit of work; blocks until completion only when no callback was supplied
+ manager->beginStep();
+ for (int i = 0; i < numThreads; i++) {
+ PreventEventWaitersFromProceeding(&workDone[i]); // re-arm the completion event before letting the worker go
+ AllowEventWaitersToProceed(&workReady[i]);
+ }
+ // if async, thread 0 will callback when all workers finish
+ // if sync, wait for all workers to finish
+ if (callback == NULL) {
+ for (int i = 0; i < numThreads; i++) {
+ WaitForEvent(&workDone[i]);
+ }
+ manager->finishStep();
+ }
+void ParallelCoworker::stop() // sets the stop flag, wakes every worker so it can observe it, then waits for the `finished` signal
+ stopped = true;
+ for (int i = 0; i < numThreads; i++) {
+ AllowEventWaitersToProceed(&workReady[i]); // workers check `stopped` right after waking on workReady
+ }
+ if (!WaitForSingleWaiterObject(&finished)) {
+ WriteErrorMessage("Waiting for all threads to finish failed\n");
+ soft_exit(1);
+ }
+ void // presumably the body of WorkerContext::initializeThread (its name line is not visible in this excerpt)
+ shared->workers[threadNum]->initialize(); // per-thread worker initialisation hook
+ void // presumably the body of WorkerContext::runThread (its name line is not visible in this excerpt): per-thread step loop
+ while (true) {
+ //fprintf(stderr, "worker task thread %d waiting to begin\n", GetCurrentThreadId());
+ WaitForEvent(&shared->workReady[threadNum]);
+ PreventEventWaitersFromProceeding(&shared->workReady[threadNum]); // re-arm so the next iteration blocks until the next step()/stop()
+ if (shared->stopped) {
+ return; // stop() sets the flag and then opens workReady
+ }
+ //fprintf(stderr, "worker task thread %d begin\n", GetCurrentThreadId());
+ _int64 start = timeInMillis(); // only referenced by the commented-out trace below
+ shared->workers[threadNum]->step();
+ //fprintf(stderr, "zip task thread %d done %lld ms\n", GetCurrentThreadId(), timeInMillis() - start);
+ AllowEventWaitersToProceed(&shared->workDone[threadNum]);
+ // if async, thread 0 will wait for everyone to finish and invoke callback
+ if (threadNum == 0 && shared->callback != NULL) {
+ for (int i = 1; i < shared->numThreads; i++) { // starts at 1: thread 0 has just finished its own step
+ WaitForEvent(&shared->workDone[i]);
+ }
+ shared->manager->finishStep();
+ shared->callback(shared->parameter);
+ }
+ }
+ void
+WorkerContext::finishThread(WorkerContext* common) // `common` is not used by this body
+ if (threadNum == totalThreads - 1) { // only the context with the highest thread number signals, so `finished` fires once
+ SignalSingleWaiterObject(&shared->finished); // this is what ParallelCoworker::stop() waits on
+ }
+ void // presumably the body of ParallelWorkerManager::configure (its name line is not visible in this excerpt)
+ ParallelWorker* worker,
+ int threadNum,
+ int totalThreads)
+ worker->configure(this, threadNum, totalThreads); // forwards to the worker, passing this manager as its owner
diff --git a/SNAPLib/ParallelTask.h b/SNAPLib/ParallelTask.h
new file mode 100644
index 0000000..b08cd3d
--- /dev/null
+++ b/SNAPLib/ParallelTask.h
@@ -0,0 +1,299 @@
+Module Name:
+ ParallelTask.h
+ Simple parallel task manager
+ Ravi Pandya, May 2012
+ User mode service.
+Revision History:
+#pragma once
+#include "stdafx.h"
+#include "Compat.h"
+#include "exit.h"
+#include "Error.h"
+ Simple class to handle parallelized algorithms.
+ TContext should extend TContextBase, and provide the following methods:
+ void initializeThread()
+ Called once on main thread after TContext has been assigned from common,
+ and threadNum set.
+ void runThread()
+ Called to run the thread's work until termination.
+ May use something like RangeSplitter to get work.
+ void finishThread(TContext* common)
+ Called once on main thread after all threads have finished,
+ to write results back to common.
+ template <class TContext>
+class ParallelTask // Runs totalThreads copies of a TContext, one per thread, and folds results back into the common context
+ inline TContext* getCommonContext() { return common; }
+ // i_common should have totalThreads & bindToProcessors set
+ ParallelTask(TContext* i_common);
+ // run all threads until completion, gather results in common
+ void run();
+ // run all tasks on a separate thread
+ void fork();
+ // initial & final context
+ TContext* common;
+ // array of per-thread contexts
+ TContext* contexts;
+ static void threadWorker(void* threadContext); // thread entry point for one per-thread context
+ static void forkWorker(void* threadContext); // thread entry point used by fork(); calls run()
+ Base for type parameter to parallel task
+struct TaskContextBase // Fields ParallelTask reads and writes on every context; WorkerContext derives from this
+ // should be set before passing to ParallelTask constructor
+ int totalThreads;
+ bool bindToProcessors;
+ // time taken to run in millis
+ _int64 time;
+ // for internal use:
+ int threadNum; // current thread number, 0...totalThreads-1
+ SingleWaiterObject *doneWaiter; // Gets notified when the last thread ends.
+ volatile int runningThreads; // set to totalThreads on the common context by run()
+ volatile int *pRunningThreads; // points at the common context's runningThreads, so every copy decrements the same counter
+#ifdef _MSC_VER
+ volatile int *nThreadsAllocatingMemory;
+ EventObject *memoryAllocationCompleteBarrier;
+ bool useTimingBarrier; // when true, run() restarts its timer after the allocation barrier clears
+#endif // _MSC_VER
+ template <class TContext>
+ TContext* i_common) // constructor parameter (the qualified-name line is not visible in this excerpt)
+ : common(i_common), contexts(NULL)
+ _ASSERT(i_common->totalThreads > 0); // debug-only check that at least one thread was requested
+ template <class TContext>
+ void // presumably the body of ParallelTask<TContext>::run (its name line is not visible in this excerpt)
+ _int64 start = timeInMillis();
+ SingleWaiterObject doneWaiter;
+ if (!CreateSingleWaiterObject(&doneWaiter)) {
+ WriteErrorMessage( "Failed to create single waiter object for thread completion.\n");
+ soft_exit(1);
+ }
+#ifdef _MSC_VER
+ int nThreadsAllocatingMemory = common->totalThreads;
+ EventObject memoryAllocationCompleteBarrier;
+ CreateEventObject(&memoryAllocationCompleteBarrier);
+ common->nThreadsAllocatingMemory = &nThreadsAllocatingMemory;
+ common->memoryAllocationCompleteBarrier = &memoryAllocationCompleteBarrier;
+#endif // _MSC_VER
+ common->doneWaiter = &doneWaiter;
+ common->runningThreads = common->totalThreads;
+ common->pRunningThreads = &common->runningThreads; // per-thread copies inherit this pointer, so all of them share one counter
+ contexts = new TContext[common->totalThreads]; // NOTE(review): no matching delete[] is visible in this excerpt -- confirm ownership
+ for (int i = 0; i < common->totalThreads; i++) {
+ contexts[i] = *common; // each thread starts from a copy of the common context
+ contexts[i].threadNum = i;
+ contexts[i].initializeThread(); // called here, on the launching thread, before the worker thread starts
+ if (!StartNewThread(ParallelTask<TContext>::threadWorker, &contexts[i])) {
+ WriteErrorMessage( "Unable to start worker thread.\n");
+ soft_exit(1);
+ }
+ }
+#ifdef _MSC_VER
+ if (common->useTimingBarrier) {
+ WaitForEvent(&memoryAllocationCompleteBarrier);
+ WriteStatusMessage("Cleared timing barrier.\n");
+ start = timeInMillis(); // restart the clock so the reported time excludes work done before the barrier
+ }
+#endif // _MSC_VER
+ if (!WaitForSingleWaiterObject(&doneWaiter)) { // signalled by the last thread to decrement the running count to 0
+ WriteErrorMessage( "Waiting for all threads to finish failed\n");
+ soft_exit(1);
+ }
+ DestroySingleWaiterObject(&doneWaiter);
+#ifdef _MSC_VER
+ DestroyEventObject(&memoryAllocationCompleteBarrier);
+#endif // _MSC_VER
+ for (int i = 0; i < common->totalThreads; i++) {
+ contexts[i].finishThread(common); // runs sequentially on this thread, after all workers have ended
+ }
+ common->time = timeInMillis() - start;
+ template <class TContext>
+ void // presumably the body of ParallelTask<TContext>::fork (its name line is not visible in this excerpt)
+ if (!StartNewThread(ParallelTask<TContext>::forkWorker, this)) { // run() executes on the new thread; this call returns without waiting
+ WriteErrorMessage( "Unable to fork task thread.\n");
+ soft_exit(1);
+ }
+ template <class TContext>
+ void // thread entry used by fork(): the argument is the ParallelTask itself
+ void* forkArg)
+ ((ParallelTask<TContext>*) forkArg)->run();
+ template <class TContext>
+ void // thread entry for one worker: the argument is that thread's TContext
+ void* threadArg)
+ TContext* context = (TContext*) threadArg;
+ if (context->bindToProcessors) {
+ BindThreadToProcessor(context->threadNum); // the thread number is passed as the processor argument
+ }
+ context->runThread();
+ // Decrement the running thread count and wake up the waiter if it hits 0.
+ if (0 == InterlockedDecrementAndReturnNewValue(context->pRunningThreads)) {
+ SignalSingleWaiterObject(context->doneWaiter);
+ }
+struct WorkerContext;
+class ParallelWorker;
+class ParallelWorkerManager;
+// coroutined parallel workers
+// does code inline if numThreads = 0
+// can either callback when done, or synchronously wait for all to complete
+class ParallelCoworker // Drives a fixed set of worker threads through repeated step() rounds
+ typedef void (*Callback)(void*); // completion callback; receives the `parameter` given to the constructor
+ ParallelCoworker(int i_numThreads, bool i_bindToProcessors, ParallelWorkerManager* supplier, Callback callback = NULL, void* parameter = NULL);
+ ~ParallelCoworker();
+ // start forked thread running
+ void start();
+ // do one unit of work, asynchronously if callback, else synchronously
+ void step();
+ // stop everything, waits until all threads exit
+ void stop();
+ ParallelWorkerManager* getManager() { return manager; }
+ EventObject *workReady; // One per worker thread
+ EventObject *workDone; // One per worker thread
+ ParallelWorker** workers; // one per worker thread
+ ParallelWorkerManager* manager;
+ volatile bool stopped; // set by stop(); checked by each worker after it wakes on workReady
+ const int numThreads;
+ const bool bindToProcessors;
+ Callback callback;
+ void* parameter;
+ WorkerContext* context; // allocated in start()
+ ParallelTask<WorkerContext>* task; // allocated in start()
+ SingleWaiterObject finished; // signalled from WorkerContext::finishThread; stop() waits on it
+ friend struct WorkerContext;
+// abstract classes for specifying the actual work
+// creates new per-thread workers
+class ParallelWorker;
+class ParallelWorkerManager // Factory for per-thread workers, plus hooks called around each step
+ // todo: using void* context pointers to avoid pain of templates but should really be made typesafe
+ virtual void initialize(void* context) {} // default: no-op
+ virtual ParallelWorker* createWorker() = 0; // called once per thread by the ParallelCoworker constructor
+ virtual void beginStep() {} // called at the start of ParallelCoworker::step(); default: no-op
+ virtual void finishStep() {} // called once all workers have completed a step; default: no-op
+ void configure(ParallelWorker* worker, int threadNum, int totalThreads); // special case
+// per-thread worker
+class ParallelWorker // One unit of per-thread work; subclasses implement step()
+ ParallelWorker() {}
+ virtual ~ParallelWorker() {}
+ virtual void initialize() {} // default: no-op
+ virtual void step() = 0; // one unit of work, invoked each time the coworker releases this thread
+ ParallelWorkerManager* getManager() { return manager; }
+ int getThreadNum() { return threadNum; }
+ int getNumThreads() { return numThreads; }
+ friend class ParallelCoworker;
+ friend class ParallelWorkerManager;
+ void configure(ParallelWorkerManager* i_manager, int i_threadNum, int i_numThreads) // stores the three values; the members are not set by the constructor
+ { manager = i_manager; threadNum = i_threadNum; numThreads = i_numThreads; }
+ ParallelWorkerManager* manager;
+ int threadNum;
+ int numThreads;
+struct WorkerContext : public TaskContextBase // TContext used by ParallelCoworker: supplies the three methods ParallelTask calls
+ ParallelCoworker* shared; // the coworker whose events and workers this thread uses
+ void initializeThread();
+ void runThread();
+ void finishThread(WorkerContext* common);
diff --git a/SNAPLib/PriorityQueue.h b/SNAPLib/PriorityQueue.h
new file mode 100644
index 0000000..61ae85b
--- /dev/null
+++ b/SNAPLib/PriorityQueue.h
@@ -0,0 +1,117 @@
+#pragma once
+#include "Compat.h"
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "VariableSizeMap.h"
+using std::max;
+using std::min;
+// adapted from http://visualstudiomagazine.com/articles/2012/11/01/priority-queues-with-c.aspx
+template <typename P, typename V>
+class PriorityQueue
+ struct Entry // one heap element: a value together with its priority
+ {
+ Entry() : value(), priority(0) {} // value-initialised value, priority 0
+ Entry(V v, P p) : value(v), priority(p) {}
+ void operator=(const Entry& e) { value = e.value; priority = e.priority; } // returns void, so assignments cannot be chained
+ V value;
+ P priority;
+ };
+ typedef VariableSizeVector<Entry> EntryVector;
+#if 0
+ inline void check()
+ { _ASSERT(validate()); }
+ inline void check() {}
+ // add an element with specific priority
+ void add(V value, P pri)
+ {
+ queue.push_back(Entry(value, pri));
+ _int64 ci = queue.size() - 1;
+ while (ci > 0) {
+ _int64 pi = (ci - 1) / 2; // parent index
+ if (queue[ci].priority >= queue[pi].priority) {
+ break; // child >= parent so stop
+ }
+ Entry tmp = queue[ci]; queue[ci] = queue[pi]; queue[pi] = tmp;
+ ci = pi;
+ }
+ check();
+ }
+    // Remove and return the value with the smallest priority.
+    //   o_priority  if non-NULL, receives the priority of the removed entry
+    // The queue must not be empty (checked in debug builds).
+    // All heap indices are _int64, matching add() and size(), so they cannot
+    // truncate or overflow on queues larger than an int can index.
+    V pop(P* o_priority = NULL)
+    {
+        _ASSERT(queue.size() > 0);     // queue[0] below is out of bounds on an empty queue
+        _int64 li = queue.size() - 1;  // last index (before removal)
+        Entry frontItem = queue[0];    // fetch the front
+        queue[0] = queue[li];          // move the last entry to the root...
+        queue.erase(li);
+        --li;                          // last index (after removal)
+        _int64 pi = 0;                 // parent index. start at front of pq
+        while (true) {                 // ...and sift it down
+            _int64 ci = pi * 2 + 1;    // left child index of parent
+            if (ci > li) {
+                break;                 // no children so done
+            }
+            _int64 rc = ci + 1;        // right child
+            if (rc <= li && queue[rc].priority < queue[ci].priority) {
+                // if there is a rc (ci + 1), and it is smaller than left child, use the rc instead
+                ci = rc;
+            }
+            if (queue[pi].priority <= queue[ci].priority) {
+                break;                 // parent is smaller than (or equal to) smallest child so done
+            }
+            Entry tmp = queue[pi]; queue[pi] = queue[ci]; queue[ci] = tmp; // swap parent and child
+            pi = ci;
+        }
+        check();
+        if (o_priority != NULL) {
+            *o_priority = frontItem.priority;
+        }
+        return frontItem.value;
+    }
+ V peek(P* o_priority = NULL) const // returns the front value without removing it; reads queue[0], so the queue must be non-empty
+ {
+ if (o_priority != NULL) {
+ *o_priority = queue[0].priority;
+ }
+ return queue[0].value;
+ }
+ void clear() // removes every entry
+ { queue.clear(); }
+ _int64 size() const // number of entries currently queued
+ { return queue.size(); }
+    // Returns true when every parent's priority is <= the priorities of its
+    // children (the min-heap property); an empty queue is trivially valid.
+    // Indices are _int64 so queue.size() is not truncated on very large queues.
+    bool validate()
+    {
+        // is the heap property true for all queue?
+        if (queue.size() == 0) {
+            return true;
+        }
+        _int64 li = queue.size() - 1; // last index
+        for (_int64 pi = 0; pi <= li; ++pi) {
+            _int64 lci = 2 * pi + 1; // left child index
+            _int64 rci = 2 * pi + 2; // right child index
+            if (lci <= li && queue[pi].priority > queue[lci].priority) return false; // if lc exists and it's greater than parent then bad.
+            if (rci <= li && queue[pi].priority > queue[rci].priority) return false; // check the right child too.
+        }
+        return true;
+    }
+ EntryVector queue; // sorted list
diff --git a/SNAPLib/ProbabilityDistance.cpp b/SNAPLib/ProbabilityDistance.cpp
new file mode 100644
index 0000000..6900124
--- /dev/null
+++ b/SNAPLib/ProbabilityDistance.cpp
@@ -0,0 +1,135 @@
+#include "stdafx.h"
+#include "ProbabilityDistance.h"
+#include "Compat.h"
+#define TRACE printf
+#define TRACE(...) {}
+namespace {
+    // Largest of three doubles, chosen with strict '>' comparisons: first the
+    // larger of d1 and d2 (d2 on a tie), then that against d3 (d3 on a tie).
+    inline double max3(double d1, double d2, double d3) {
+        const double larger = (d1 > d2) ? d1 : d2;
+        return (larger > d3) ? larger : d3;
+    }
+ProbabilityDistance::ProbabilityDistance(double snpProb, double gapOpenProb, double gapExtensionProb) // stores the logs of the three probabilities and fills the per-quality match/mismatch tables
+ snpLogProb = log(snpProb);
+ gapOpenLogProb = log(gapOpenProb);
+ gapExtensionLogProb = log(gapExtensionProb);
+ // Fill in the matchLogProb and mismatchLogProb tables for base quality values; assumes Phred+33 encoding
+ for (int q = 0; q < 256; q++) {
+ if (q < 33) { // characters below '!' are not valid Phred+33 qualities
+ matchLogProb[q] = NO_PROB;
+ mismatchLogProb[q] = NO_PROB;
+ } else {
+ // Probability that we misread this base
+ double errorProb = pow(10.0, -(q - 33) / 10.0);
+ // A match occurs if we didn't misread the base and it wasn't a SNP (technically it could also
+ // be that we misread it and it *was* a SNP, but that's pretty unlikely)
+ double matchProb = (1.0 - errorProb) * (1.0 - snpProb);
+ double mismatchProb = 1.0 - matchProb;
+ matchLogProb[q] = log(matchProb); // at q == 33 errorProb is 1, so matchProb is 0 and this is log(0)
+ mismatchLogProb[q] = log(mismatchProb);
+ if (q >= 33 && q <= 'J') { // q >= 33 always holds in this branch; only the upper bound limits the trace output
+ TRACE("q=%c: match %.04f, mismatch %.04f\n", q, matchProb, mismatchProb);
+ }
+ // TODO: Might be good to check the numerical stability of this (1 - small) stuff
+ }
+ }
+// Dynamic program over (read position, shift, gap status) that finds the most
+// probable way of generating `read` from `reference` under the error model set
+// up by the constructor.
+//
+//   reference         reference bases; indexed at [r-1+s] for every shift s in
+//                     -maxShift..maxShift, so the caller must supply maxShift
+//                     valid bases on either side of the aligned window
+//   read, quality     read bases and their Phred+33 quality characters
+//   readLen           number of bases in read; must be < MAX_READ
+//   maxStartShift     largest shift allowed at read position 0
+//   maxShift          largest shift allowed anywhere; must be < MAX_SHIFT
+//   matchProbability  receives exp(best log probability)
+//
+// Returns a placeholder score (always 5); see the TODO near the end.
+int ProbabilityDistance::compute(
+        const char *reference,
+        const char *read,
+        const char *quality,
+        int readLen,
+        int maxStartShift,
+        int maxShift,       // Maximum overall shift to consider
+        double *matchProbability)
+{
+    _ASSERT(maxStartShift < MAX_SHIFT);
+    _ASSERT(maxShift < MAX_SHIFT);
+    _ASSERT(maxStartShift <= maxShift);
+    _ASSERT(readLen < MAX_READ);    // rows 0..readLen of d[] are written below
+
+    // Fill in the readPos = 0 row to allow us to start only at -maxStartShift..+maxStartShift
+    for (int s = -maxShift-1; s <= maxShift+1; s++) {
+        if (s < -maxStartShift || s > maxStartShift) {
+            d[0][MAX_SHIFT+s][NO_GAP] = NO_PROB;
+        } else {
+            d[0][MAX_SHIFT+s][NO_GAP] = log(1.0);
+        }
+        // No gap can be open before the first base.  The r == 1 recurrence
+        // reads these two planes of row 0, so they must be set explicitly;
+        // otherwise it picks up uninitialised storage or values left over
+        // from a previous call.
+        d[0][MAX_SHIFT+s][READ_GAP] = NO_PROB;
+        d[0][MAX_SHIFT+s][REF_GAP] = NO_PROB;
+    }
+
+    // Now go through each readPos from 1 to readLen and compute how to best get there
+    for (int r = 1; r <= readLen; r++) {
+        // Add sentinels at the end of the array
+        d[r][MAX_SHIFT-maxShift-1][READ_GAP] = NO_PROB;
+        d[r][MAX_SHIFT+maxShift+1][READ_GAP] = NO_PROB;
+        d[r][MAX_SHIFT-maxShift-1][REF_GAP] = NO_PROB;
+        d[r][MAX_SHIFT+maxShift+1][REF_GAP] = NO_PROB;
+        d[r][MAX_SHIFT-maxShift-1][NO_GAP] = NO_PROB;
+        d[r][MAX_SHIFT+maxShift+1][NO_GAP] = NO_PROB;
+
+        // Index the 256-entry tables with an unsigned value: plain char may be
+        // signed, which would turn quality bytes above 127 into negative indices.
+        const unsigned char q = (unsigned char) quality[r-1];
+
+        // Fill in the rest of the values using dynamic program recurrence
+        for (int s = -maxShift; s <= maxShift; s++) {
+            // The NO_GAP case; we get here either from a previous NO_GAP or by closing a gap from the
+            // previous readPos, and in either case, we need to match the current base
+            double thisBaseProb = (read[r-1] == reference[r-1+s]) ? matchLogProb[q] : mismatchLogProb[q];
+            d[r][MAX_SHIFT+s][NO_GAP] = max3(d[r-1][MAX_SHIFT+s][NO_GAP] + thisBaseProb,
+                                             d[r-1][MAX_SHIFT+s][REF_GAP] + thisBaseProb,
+                                             d[r-1][MAX_SHIFT+s][READ_GAP] + thisBaseProb);
+
+            // The READ_GAP case; we can either open a new gap from the previous NO_GAP or REF_GAP cases, or
+            // extend a gap computed in the previous READ_GAP case
+            d[r][MAX_SHIFT+s][READ_GAP] = max3(d[r-1][MAX_SHIFT+s+1][NO_GAP] + gapOpenLogProb,
+                                               d[r-1][MAX_SHIFT+s+1][REF_GAP] + gapOpenLogProb,
+                                               d[r-1][MAX_SHIFT+s+1][READ_GAP] + gapExtensionLogProb);
+
+            // The REF_GAP case; we can either open a new gap from NO_GAP/READ_GAP, or extend one
+            d[r][MAX_SHIFT+s][REF_GAP] = max3(d[r][MAX_SHIFT+s-1][NO_GAP] + gapOpenLogProb,
+                                              d[r][MAX_SHIFT+s-1][REF_GAP] + gapExtensionLogProb,
+                                              d[r][MAX_SHIFT+s-1][READ_GAP] + gapOpenLogProb);
+        }
+    }
+
+    // NOTE(review): this dump is unconditional in the text as visible here; the
+    // upstream file may wrap it in a preprocessor conditional that is not
+    // visible in this excerpt -- confirm before relying on it being silent.
+    printf("Here is the final matrix:\n");
+    for (int r = 0; r <= readLen; r++) {
+        printf("%d: ", r);
+        for (int g = 0; g < 3; g++) {
+            for (int s = -maxShift; s <= maxShift; s++) {
+                printf("%7.2g ", d[r][MAX_SHIFT+s][g]);
+            }
+            if (g < 2) {
+                printf("| ");
+            }
+        }
+        printf("\n");
+    }
+
+    // Return the best probability, and a somewhat arbitrary score for it (TODO: need to actually compute # of edits)
+    double best = NO_PROB;
+    for (int s = -maxShift; s <= maxShift; s++) {
+        for (int g = 0; g < 3; g++) {
+            best = __max(best, d[readLen][MAX_SHIFT+s][g]);
+        }
+    }
+    *matchProbability = exp(best);
+    TRACE("Best match probability: %g (log: %.2g)\n", exp(best), best);
+    return 5;
+}
diff --git a/SNAPLib/ProbabilityDistance.h b/SNAPLib/ProbabilityDistance.h
new file mode 100644
index 0000000..db08aca
--- /dev/null
+++ b/SNAPLib/ProbabilityDistance.h
@@ -0,0 +1,56 @@
+#pragma once
+#include "Read.h"
+// Similar to BoundedStringDistance and LandauVishkin, but computes the probability of a read
+// string being generated from a reference sequence given an error model, mutation model and
+// the quality scores of the bases in the read. For now, we assume that the statistical model
+// for errors is one with a "gap open probability" and a fixed "gap extension probability"
+// per base. For substitutions, we could have different probabilities for each transition, but
+// we assume that they all have the same probability right now.
+class ProbabilityDistance {
+ static const int MAX_READ = MAX_READ_LENGTH;
+ static const int MAX_SHIFT = 20;
+ ProbabilityDistance(double snpProb, double gapOpenProb, double gapExtensionProb);
+ int compute(
+ const char *reference,
+ const char *read,
+ const char *quality,
+ int readLen,
+ int maxStartShift,
+ int maxTotalShift,
+ double *matchProbability);
+ double snpLogProb;
+ double gapOpenLogProb;
+ double gapExtensionLogProb;
+ double matchLogProb[256]; // [baseQuality]
+ double mismatchLogProb[256]; // [baseQuality]
+#define NO_PROB (-1000000.0) // A really negative log probability -- basically zero. VC compiler won't allow static const double in a class. No trailing ';' and parenthesized, so the macro is safe inside expressions and argument lists.
+ enum GapStatus { NO_GAP, READ_GAP, REF_GAP };
+ // d[readPos][shift][gapStatus] is the best possible log probability for aligning the
+ // substring read[0..readPos] to reference[?..readPos + shift]. The "?" in reference is
+ // because we allow starting an alignment from reference[-maxStartShift..maxStartShift]
+ // instead of just reference[0], to deal with indels toward the start of the read.
+ double d[MAX_READ][2*MAX_SHIFT+1][3]; // [readPos][shift][gapStatus]
+ // A state in the D array, used for backtracking pointers
+ struct State {
+ int readPos;
+ int shift;
+ int gapStatus;
+ };
+ // Previous state in our dynamic program, for backtracking to print CIGAR strings
+ State prev[MAX_READ][2*MAX_SHIFT+1][3]; // [readPos][shift][gapStatus]
diff --git a/SNAPLib/RangeSplitter.cpp b/SNAPLib/RangeSplitter.cpp
new file mode 100644
index 0000000..99e0a2a
--- /dev/null
+++ b/SNAPLib/RangeSplitter.cpp
@@ -0,0 +1,260 @@
+Module Name:
+ RangeSplitter.cpp
+ Code to split a range into pieces for multiple cores to process. It's designed
+ to handle cores that proceed at varying rates.
+ Bill Bolosky, 2011
+ User mode service.
+Revision History:
+ Pulled out of cSNAP.cpp to make it useful for various versions of the aligner.
+ Generalized from FileSplitter to ranges, e.g. for scanning the genome /ravip/5/2012/
+#include "stdafx.h"
+#include "RangeSplitter.h"
+#include "SAM.h"
+#include "FASTQ.h"
+using std::max;
+using std::min;
+RangeSplitter::RangeSplitter(_int64 rangeEnd_, int numThreads_, unsigned divisionSize_, _int64 rangeBegin_, unsigned minMillis_, unsigned minRangeSize_)
+ numThreads = numThreads_;
+ rangeEnd = rangeEnd_;
+ rangeBegin = rangeBegin_;
+ position = rangeBegin_;
+ startTime = 0; // We'll initialize it when getNextRange() is first called
+ divisionSize = divisionSize_;
+ minMillis = minMillis_;
+ minRangeSize = minRangeSize_;
+bool RangeSplitter::getNextRange(_int64 *rangeStart, _int64 *rangeLength)
+ // Hands the calling thread its next chunk: on success writes *rangeStart / *rangeLength and returns true; returns false once nothing is left.
+ // If there are multiple threads, start each of them off with ((rangeEnd - rangeBegin) / divisionSize / numThreads),
+ // and then keep giving everyone 1 / (divisionSize * numThreads) of the remaining data or the amount
+ // of units processed per thread in minMillis ms, whichever is bigger (and never less than minRangeSize).
+ if (startTime == 0) {
+ // There's a possible "race" here if multiple threads start at the same time, but that's
+ // actually OK; we just want a rough idea of when we started the processing.
+ startTime = timeInMillis();
+ }
+ _int64 amountLeft = rangeEnd - position;
+ if (amountLeft <= 0) {
+ return false;
+ }
+ _int64 amountToTake;
+ if (numThreads == 1) {
+ amountToTake = rangeEnd; // Single thread takes everything; this may overshoot, but it is clamped to rangeEnd - startOffset below.
+ } else if (amountLeft >= (rangeEnd - rangeBegin) / divisionSize) {
+ amountToTake = (rangeEnd - rangeBegin) / divisionSize / numThreads;
+ if (amountToTake == 0) {
+ amountToTake = amountLeft; // Range too small to split: hand out all that remains.
+ } else {
+ amountToTake = min(amountLeft, max(amountToTake, (_int64) minRangeSize));
+ }
+ } else {
+ // Figure out units processed in minMillis ms per thread (keeping in mind we ran numThreads in total).
+ _int64 unitsInMinms = (position - rangeBegin) * minMillis / max((_int64) (timeInMillis() - startTime) * numThreads, (_int64) 1);
+ amountToTake = max(amountLeft / divisionSize / numThreads, unitsInMinms);
+ amountToTake = max(amountToTake, (_int64) minRangeSize); // Avoid getting really tiny amounts at the end.
+ }
+ _ASSERT(amountToTake > 0);
+ _int64 oldPosition = position; // for debugging
+ _int64 startOffset = InterlockedAdd64AndReturnNewValue(&position, amountToTake) - amountToTake; // The call returns the advanced position, so subtracting gives where our chunk starts.
+ _ASSERT(position >= rangeBegin);
+ if (startOffset >= rangeEnd) {
+ // No work left to allocate.
+ return false;
+ }
+ // Don't run past EOF if there was a race above (threads looking at amountLeft at the same time).
+ amountToTake = min(amountToTake, rangeEnd - startOffset);
+ _ASSERT(amountToTake > 0);
+ *rangeStart = startOffset;
+ *rangeLength = amountToTake;
+ _ASSERT(startOffset >= rangeBegin && startOffset + amountToTake <= rangeEnd);
+ return true;
+ const char *i_fileName,
+ bool i_isSAM,
+ unsigned i_numThreads,
+ const ReaderContext& i_context)
+ : isSAM(i_isSAM), context(i_context), numThreads(i_numThreads)
+ fileName = new char[strlen(i_fileName) + 1];
+ strcpy(fileName, i_fileName);
+ //
+ // Figure out the header size based on file type. We set up the range splitter to skip the header. This both makes the work allocation more even,
+ // and also assures that in the case where the header is bigger than what would otherwise be the first work unit that the second guy in doesn't see
+ // header.
+ //
+ _int64 headerSize;
+ if (isSAM) {
+ SAMReader *reader = SAMReader::create(DataSupplier::Default, fileName, ReadSupplierQueue::BufferCount(numThreads), context, 0, 0);
+ if (!reader) {
+ WriteErrorMessage("Unable to create reader for SAM file '%s'\n", fileName);
+ soft_exit(1);
+ }
+ headerSize = reader->getContext()->headerBytes;
+ delete reader;
+ reader = NULL;
+ } else {
+ // FASTQ has no header.
+ headerSize = 0;
+ }
+ splitter = new RangeSplitter(QueryFileSize(fileName), numThreads, 5, headerSize, 200, 10 * MAX_READ_LENGTH);
+ReadSupplier *
+ _int64 rangeStart, rangeLength;
+ if (!splitter->getNextRange(&rangeStart, &rangeLength)) {
+ return NULL;
+ }
+ ReadReader *underlyingReader;
+ // todo: implement layered factory model
+ if (isSAM) {
+ underlyingReader = SAMReader::create(DataSupplier::Default, fileName, 2, context, rangeStart, rangeLength);
+ } else {
+ underlyingReader = FASTQReader::create(DataSupplier::Default, fileName, 2, rangeStart, rangeLength, context);
+ }
+ return new RangeSplittingReadSupplier(splitter,underlyingReader);
+ Read *
+ if (underlyingReader->getNextRead(&read)) {
+ return &read;
+ }
+ _int64 rangeStart, rangeLength;
+ if (!splitter->getNextRange(&rangeStart, &rangeLength)) {
+ return NULL;
+ }
+ underlyingReader->reinit(rangeStart,rangeLength);
+ if (!underlyingReader->getNextRead(&read)) {
+ return NULL;
+ }
+ return &read;
+ bool
+RangeSplittingPairedReadSupplier::getNextReadPair(Read **read1, Read **read2)
+ *read1 = &internalRead1; // Callers always get pointers to this supplier's own two Read objects.
+ *read2 = &internalRead2;
+ if (underlyingReader->getNextReadPair(&internalRead1,&internalRead2)) {
+ return true;
+ }
+ // The current range is exhausted; fetch another range from the splitter and retry once.
+ // We need to clear out the reads, because they may contain references to the buffers in the readers.
+ // These buffer reference counts get reset to 0 at reinit time, which causes problems when they're
+ // still live in read.
+ // NOTE(review): nothing visible here actually clears internalRead1/internalRead2 before reinit() -- confirm whether a clearing step is missing.
+ _int64 rangeStart, rangeLength;
+ if (!splitter->getNextRange(&rangeStart, &rangeLength)) {
+ return false;
+ }
+ underlyingReader->reinit(rangeStart,rangeLength);
+ return underlyingReader->getNextReadPair(&internalRead1, &internalRead2);
+ const char *i_fileName1, const char *i_fileName2, FileType i_fileType, unsigned i_numThreads,
+ bool i_quicklyDropUnpairedReads, const ReaderContext& i_context) :
+ fileType(i_fileType), numThreads(i_numThreads), context(i_context), quicklyDropUnpairedReads(i_quicklyDropUnpairedReads)
+ _ASSERT(strcmp(i_fileName1, "-") && (NULL == i_fileName2 || strcmp(i_fileName2, "-"))); // Can't use range splitter on stdin, because you can't seek or query size
+ fileName1 = new char[strlen(i_fileName1) + 1];
+ strcpy(fileName1, i_fileName1);
+ if (FASTQFile == fileType) {
+ fileName2 = new char[strlen(i_fileName2) + 1];
+ strcpy(fileName2, i_fileName2);
+ } else {
+ fileName2 = NULL;
+ }
+ splitter = new RangeSplitter(QueryFileSize(fileName1),numThreads);
+ delete [] fileName1;
+ delete [] fileName2;
+ delete splitter;
+ PairedReadSupplier *
+ _int64 rangeStart, rangeLength;
+ if (!splitter->getNextRange(&rangeStart, &rangeLength)) {
+ return NULL;
+ }
+ PairedReadReader *underlyingReader;
+ switch (fileType) {
+ case SAMFile:
+ underlyingReader = SAMReader::createPairedReader(DataSupplier::Default, fileName1, 2, rangeStart, rangeLength, quicklyDropUnpairedReads, context);
+ break;
+ case FASTQFile:
+ underlyingReader = PairedFASTQReader::create(DataSupplier::Default, fileName1, fileName2, 2, rangeStart, rangeLength, context);
+ break;
+ case InterleavedFASTQFile:
+ underlyingReader = PairedInterleavedFASTQReader::create(DataSupplier::Default, fileName1, 2, rangeStart, rangeLength, context);
+ break;
+ default:
+ WriteErrorMessage("RangeSplittingPairedReadSupplierGenerator::generateNewPairedReadSupplier(): unknown file type %d\n", fileType);
+ soft_exit(1);
+ }
+ return new RangeSplittingPairedReadSupplier(splitter,underlyingReader);
diff --git a/SNAPLib/RangeSplitter.h b/SNAPLib/RangeSplitter.h
new file mode 100644
index 0000000..dfc74a4
--- /dev/null
+++ b/SNAPLib/RangeSplitter.h
@@ -0,0 +1,132 @@
+Module Name:
+ RangeSplitter.h
+ Headers for code to split a range into pieces for multiple cores to process. It's designed
+ to handle cores that proceed at varying rates.
+ Bill Bolosky, 2011
+ User mode service.
+Revision History:
+ Pulled out of cSNAP.cpp to make it useful for various versions of the aligner
+ Generalized from FileSplitter to ranges, e.g. for scanning the genome /ravip/5/2012/
+#pragma once
+#include "Compat.h"
+#include "Read.h"
+#include "Genome.h"
+#include "AlignerOptions.h"
+// Utility class for letting multiple threads split chunks of a range to process.
+// This is used by the parallel versions of the aligners.
+class RangeSplitter
+ RangeSplitter(_int64 rangeEnd_, int numThreads_, unsigned divisonSize_ = 5, _int64 rangeBegin_ = 0, unsigned minMillis_ = 200, unsigned minRangeSize_ = 32768);
+ // Get the next range for a thread to process, or return false if the whole range is done.
+ bool getNextRange(_int64 *rangeStart, _int64 *rangeLength);
+ int numThreads;
+ _int64 rangeBegin;
+ _int64 rangeEnd;
+ unsigned divisionSize;
+ unsigned minMillis;
+ unsigned minRangeSize;
+ volatile _int64 position;
+ volatile _int64 startTime;
+class RangeSplittingReadSupplier : public ReadSupplier {
+ RangeSplittingReadSupplier(RangeSplitter *i_splitter, ReadReader *i_underlyingReader) :
+ splitter(i_splitter), underlyingReader(i_underlyingReader), read() {}
+ virtual ~RangeSplittingReadSupplier();
+ Read *getNextRead();
+ virtual void holdBatch(DataBatch batch)
+ { underlyingReader->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch)
+ { return underlyingReader->releaseBatch(batch); }
+ RangeSplitter *splitter;
+ ReadReader *underlyingReader;
+ Read read;
+class RangeSplittingReadSupplierGenerator: public ReadSupplierGenerator {
+ RangeSplittingReadSupplierGenerator(const char *i_fileName, bool i_isSAM, unsigned numThreads, const ReaderContext& context);
+ ~RangeSplittingReadSupplierGenerator() {delete splitter; delete [] fileName;}
+ ReadSupplier *generateNewReadSupplier();
+ ReaderContext* getContext() { return &context; }
+ RangeSplitter *splitter;
+ char *fileName;
+ const bool isSAM;
+ const int numThreads;
+ ReaderContext context;
+class RangeSplittingPairedReadSupplier : public PairedReadSupplier {
+ RangeSplittingPairedReadSupplier(RangeSplitter *i_splitter, PairedReadReader *i_underlyingReader) : splitter(i_splitter), underlyingReader(i_underlyingReader) {}
+ virtual ~RangeSplittingPairedReadSupplier();
+ virtual bool getNextReadPair(Read **read1, Read **read2);
+ // Take a hold on the batch by forwarding to the underlying reader's holdBatch.
+ // (This previously forwarded to releaseBatch, so a hold request was passed on as a release.)
+ virtual void holdBatch(DataBatch batch)
+ { underlyingReader->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch)
+ { return underlyingReader->releaseBatch(batch); }
+ private:
+ PairedReadReader *underlyingReader;
+ RangeSplitter *splitter;
+ Read internalRead1;
+ Read internalRead2;
+ };
+class RangeSplittingPairedReadSupplierGenerator: public PairedReadSupplierGenerator {
+ RangeSplittingPairedReadSupplierGenerator(const char *i_fileName1, const char *i_fileName2, enum FileType i_fileType, unsigned numThreads, bool i_quicklyDropUnpairedReads, const ReaderContext& context);
+ ~RangeSplittingPairedReadSupplierGenerator();
+ PairedReadSupplier *generateNewPairedReadSupplier();
+ ReaderContext* getContext() { return &context; }
+ RangeSplitter *splitter;
+ char *fileName1;
+ char *fileName2;
+ const int numThreads;
+ enum FileType fileType;
+ ReaderContext context;
+ bool quicklyDropUnpairedReads;
diff --git a/SNAPLib/Read.cpp b/SNAPLib/Read.cpp
new file mode 100644
index 0000000..a2e4703
--- /dev/null
+++ b/SNAPLib/Read.cpp
@@ -0,0 +1,53 @@
+Module Name:
+ Read.cpp
+ Read class for the SNAP sequencer
+ Bill Bolosky, May, 2012
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Read.h"
+#include "SAM.h"
+#include "Error.h"
+ bool
+ AlignmentResult result)
+ return result == NotFound || result == SingleHit || result == MultipleHits || result == UnknownAlignment;
+ void
+ Read* read0,
+ Read* read1)
+ if (!readIdsMatch(read0, read1)) {
+ unsigned n[2] = {min(read0->getIdLength(), 200u), min(read1->getIdLength(), 200u)};
+ char* p[2] = {(char*) alloca(n[0] + 1), (char*) alloca(n[1] + 1)};
+ memcpy(p[0], read0->getId(), n[0]); p[0][n[0]] = 0;
+ memcpy(p[1], read1->getId(), n[1]); p[1][n[1]] = 0;
+ WriteErrorMessage("Unmatched read IDs '%s' and '%s'. Use the -I option to ignore this.\n", p[0], p[1]);
+ soft_exit(1);
+ }
+const unsigned Read::localBufferLength = 3 * MAX_READ_LENGTH;
+const unsigned DEFAULT_MIN_READ_LENGTH = 50;
diff --git a/SNAPLib/Read.h b/SNAPLib/Read.h
new file mode 100644
index 0000000..a784b59
--- /dev/null
+++ b/SNAPLib/Read.h
@@ -0,0 +1,860 @@
+Module Name:
+ Read.h
+ Headers for the Read class for the SNAP sequencer
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#include "Compat.h"
+#include "Tables.h"
+#include "DataReader.h"
+#include "DataWriter.h"
+#include "directions.h"
+#include "Error.h"
+#include "Genome.h"
+#include "AlignmentResult.h"
+class FileFormat;
+class Genome;
+struct PairedAlignmentResult;
+//#define LONG_READS
+#ifdef LONG_READS
+#define MAX_READ_LENGTH 400000
+#define MAX_READ_LENGTH 400
+// Here's a brief description of the classes for input in SNAP:
+// Read:
+// A Read is some data that's come from a NGS machine. It includes some bases and associated quality score, as well as an ID.
+// Reads may be clipped (because the sequencing machine was unsure of some bases). They may be switched between forward and
+// reverse complement sense. They may or may not own their own memory for the various fields.
+// ReadReader:
+// A ReadReader understands how to generate reads from some input source (i.e., a FASTQ, SAM, BAM or CRAM file, for instance).
+// It owns the storage for the read's information (i.e., the base string), but does not own the Read object itself. It is responsible
+// for assuring that the memory for the read data is valid for the lifetime of the ReadReader (which, in practice, means it needs
+// to use mapped files). ReadReaders may assume that they will only be called from one thread.
+// PairedReadReader:
+// Similar to a ReadReader, except that it gets mate pairs of Reads.
+// ReadSupplier:
+// A class that supplies reads to a consumer. It looks similar to a ReadReader, except that it owns the storage for the
+// Read object. The idea here is to allow the supplier to manage the memory that the Read object lives in so that a supplier
+// can be implemented by a parallel queue with batches of reads in it. Supplier may, of course, also be implemented in
+// different ways, such as range splitters. Like ReadReaders, ReadSuppliers will be called from only one thread. In
+// practice, ReadSuppliers will have underlying ReadReaders (which might be behind a shared queue, for example).
+// ReadSupplierGenerator:
+// A class that creates a ReadSupplier. This has to be thread safe. The usual pattern is that the initialization code will
+// create a read supplier generator, which will then be called on each of the threads to create a supplier, which will supply
+// the reads to be aligned.
+// PairedReadSupplierGenerator:
+// The paired version of a ReadSupplierGenerator.
+const int MaxReadLength = MAX_READ_LENGTH;
+class Read;
+enum ReadClippingType {NoClipping, ClipFront, ClipBack, ClipFrontAndBack};
+struct ReaderContext
+ const Genome* genome;
+ const char* defaultReadGroup;
+ const char* defaultReadGroupAux; // SAM or BAM depending on output format
+ int defaultReadGroupAuxLen;
+ ReadClippingType clipping;
+ bool paired;
+ bool ignoreSecondaryAlignments; // Should we just ignore reads with the Secondary Alignment bit set?
+ bool ignoreSupplementaryAlignments; // Should we just ignore reads with the Supplementary Alignment bit set?
+ const char* header; // allocated buffer for header
+ size_t headerLength; // length of string
+ size_t headerBytes; // bytes used for header in file
+ bool headerMatchesIndex; // header refseq matches current index
+class ReadReader {
+ ReadReader(const ReaderContext& i_context) : context(i_context) {}
+ virtual ~ReadReader() {}
+ // reading
+ virtual bool getNextRead(Read *readToUpdate) = 0;
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) = 0;
+ // if you keep a read after the next call to getNextRead, you must call holdBatch
+ // this increments the reference count to the batch
+ virtual void holdBatch(DataBatch batch) = 0;
+ // decrements the hold refcount; when all holds are released the batch is no longer valid
+ virtual bool releaseBatch(DataBatch batch) = 0;
+ ReaderContext* getContext() { return &context; }
+ ReaderContext context;
+class PairedReadReader {
+ virtual ~PairedReadReader() {}
+ // reading
+ virtual bool getNextReadPair(Read *read1, Read *read2) = 0;
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) = 0;
+ virtual void holdBatch(DataBatch batch) = 0;
+ virtual bool releaseBatch(DataBatch batch) = 0;
+ virtual ReaderContext* getContext() = 0;
+ // wrap a single read source with a matcher that buffers reads until their mate is found
+ static PairedReadReader* PairMatcher(ReadReader* single, bool quicklyDropUnpairedReads);
+ static const int MatchBuffers = 2;
+class ReadSupplier {
+ virtual Read *getNextRead() = 0; // This read is valid until you call getNextRead, then it's done. Don't worry about deallocating it.
+ virtual ~ReadSupplier() {}
+ virtual void holdBatch(DataBatch batch) = 0;
+ virtual bool releaseBatch(DataBatch batch) = 0;
+class PairedReadSupplier {
+ // These reads are valid until you call getNextReadPair again, then they're done. Don't worry about deallocating them.
+ virtual bool getNextReadPair(Read **read0, Read **read1) = 0;
+ virtual ~PairedReadSupplier() {}
+ virtual void holdBatch(DataBatch batch) = 0;
+ virtual bool releaseBatch(DataBatch batch) = 0;
+class ReadSupplierGenerator {
+ virtual ReadSupplier *generateNewReadSupplier() = 0;
+ virtual ReaderContext* getContext() = 0;
+ virtual ~ReadSupplierGenerator() {}
+class PairedReadSupplierGenerator {
+ virtual PairedReadSupplier *generateNewPairedReadSupplier() = 0;
+ virtual ReaderContext* getContext() = 0;
+ virtual ~PairedReadSupplierGenerator() {}
+class ReadWriter {
+ virtual ~ReadWriter() {}
+ // write out header
+ virtual bool writeHeader(const ReaderContext& context, bool sorted, int argc, const char **argv, const char *version, const char *rgLine, bool omitSQLines) = 0;
+ //
+ // write a batch of single reads, the first one of which is a primary alignment and the rest secondary.
+ //
+ virtual bool writeReads(const ReaderContext& context, Read *read, SingleAlignmentResult *results, int nResults, bool firstIsPrimary) = 0;
+ //
+ // Write a batch of paired alignments, including some single secondary alignments. reads needs to be exactly two reads, singleAlignmentResult is a pointer to two arrays of
+ // SingleAlignmentResults for single alignments of the respective reads, and the number of results is given by the two ints pointed to by nSingleResults. The first paired
+ // result is primary, all others are secondary.
+ //
+ virtual bool writePairs(const ReaderContext& context, Read **reads /* array of size 2 */, PairedAlignmentResult *result, int nResults,
+ SingleAlignmentResult **singleResults /* array of size 2*/, int *nSingleResults /* array of size 2*/, bool firstIsPrimary) = 0;
+ // close out this thread
+ virtual void close() = 0;
+class DataWriterSupplier;
+class ReadWriterSupplier
+ virtual ReadWriter* getWriter() = 0;
+ virtual void close() = 0;
+ static ReadWriterSupplier* create(const FileFormat* format, DataWriterSupplier* dataSupplier,
+ const Genome* genome);
+#define READ_GROUP_FROM_AUX ((const char*) -1)
+class Read {
+ Read() :
+ id(NULL), data(NULL), quality(NULL),
+ localBufferAllocationOffset(0),
+ clippingState(NoClipping), currentReadDirection(FORWARD),
+ upcaseForwardRead(NULL), auxiliaryData(NULL), auxiliaryDataLength(0),
+ readGroup(NULL), originalAlignedLocation(-1), originalMAPQ(-1), originalSAMFlags(0),
+ originalFrontClipping(0), originalBackClipping(0), originalFrontHardClipping(0), originalBackHardClipping(0),
+ originalRNEXT(NULL), originalRNEXTLength(0), originalPNEXT(0), additionalFrontClipping(0)
+ {}
+ Read(const Read& other) : localBufferAllocationOffset(0)
+ {
+ copyFromOtherRead(other);
+ }
+ ~Read()
+ {
+ }
+ void dispose()
+ {
+ localBufferAllocationOffset = 0;
+ data = quality = unclippedData = unclippedQuality = externalData = NULL;
+ }
+ void operator=(const Read& other)
+ {
+ copyFromOtherRead(other);
+ }
+ void copyFromOtherRead(const Read& other)
+ {
+ id = other.id;
+ idLength = other.idLength;
+ frontClippedLength = other.frontClippedLength;
+ dataLength = other.dataLength;
+ externalData = other.externalData;
+ externalQuality = other.externalQuality;
+ currentReadDirection = other.currentReadDirection;
+ localBufferAllocationOffset = 0; // Clears out any allocations that might previously have been in the buffer
+ upcaseForwardRead = rcData = rcQuality = NULL;
+ unclippedLength = other.unclippedLength;
+ if (other.localBufferAllocationOffset != 0) {
+ //
+ // Copy the other read's local buffer to us.
+ //
+ assureLocalBufferLargeEnough();
+ _ASSERT(other.localBufferAllocationOffset <= localBufferLength);
+ memcpy(localBuffer, other.localBuffer, other.localBufferAllocationOffset);
+ localBufferAllocationOffset = other.localBufferAllocationOffset;
+ if (NULL != other.upcaseForwardRead) {
+ //
+ // Assert that it's in the other read's local buffer.
+ //
+ _ASSERT(other.upcaseForwardRead >= other.localBuffer && other.upcaseForwardRead <= other.localBuffer + other.localBufferAllocationOffset - unclippedLength);
+ //
+ // And put ours at the same offset in our local buffer.
+ //
+ upcaseForwardRead = localBuffer + (other.upcaseForwardRead - other.localBuffer);
+ }
+ if (NULL != other.rcData) {
+ //
+ // Assert that it's in the other read's local buffer.
+ //
+ _ASSERT(other.rcData >= other.localBuffer && other.rcData <= other.localBuffer + other.localBufferAllocationOffset - unclippedLength);
+ //
+ // And put ours at the same offset in our local buffer.
+ //
+ rcData = localBuffer + (other.rcData - other.localBuffer);
+ //
+ // And the same for RC quality.
+ //
+ _ASSERT(other.rcQuality >= other.localBuffer && other.rcQuality <= other.localBuffer + other.localBufferAllocationOffset - unclippedLength);
+ rcQuality = localBuffer + (other.rcQuality - other.localBuffer);
+ } else {
+ _ASSERT(NULL == other.rcQuality);
+ }
+ } else {
+ _ASSERT(other.upcaseForwardRead == NULL && other.rcData == NULL && other.rcQuality == NULL);
+ }
+ //
+ // Now set up the data, unclippedData, quality and unclippedQuality pointers.
+ //
+ if (NULL == other.localBuffer || other.data < other.localBuffer || other.data >= other.localBuffer + other.localBufferAllocationOffset - dataLength) {
+ //
+ // Not in the other read's local buffer, so it must be external. Copy it.
+ //
+ data = other.data;
+ _ASSERT(NULL == other.localBuffer || other.quality < other.localBuffer || other.quality >= other.localBuffer + other.localBufferAllocationOffset);
+ quality = other.quality;
+ _ASSERT(NULL == other.localBuffer || other.unclippedData < other.localBuffer || other.unclippedData >= other.localBuffer + other.localBufferAllocationOffset);
+ unclippedData = other.unclippedData;
+ _ASSERT(NULL == other.localBuffer || other.unclippedQuality < other.localBuffer || other.unclippedQuality >= other.localBuffer + other.localBufferAllocationOffset);
+ unclippedQuality = other.unclippedQuality;
+ } else {
+ //
+ // It is in the other read's local buffer. Copy the local buffer offsets from the other read into this one.
+ //
+ data = localBuffer + (other.data - other.localBuffer);
+ _ASSERT(other.quality >= other.localBuffer && other.quality <= other.localBuffer + other.localBufferAllocationOffset - dataLength);
+ quality = localBuffer + (other.quality - other.localBuffer);
+ _ASSERT(other.unclippedData >= other.localBuffer && other.unclippedData <= other.localBuffer + other.localBufferAllocationOffset - unclippedLength);
+ unclippedData = localBuffer + (other.unclippedData - other.localBuffer);
+ _ASSERT(other.unclippedQuality >= other.localBuffer && other.unclippedQuality <= other.localBuffer + other.localBufferAllocationOffset - unclippedLength);
+ unclippedQuality = localBuffer + (other.unclippedQuality - other.localBuffer);
+ }
+ clippingState = other.clippingState;
+ batch = other.batch;
+ readGroup = other.readGroup;
+ auxiliaryData = other.auxiliaryData;
+ auxiliaryDataLength = other.auxiliaryDataLength;
+ originalAlignedLocation = other.originalAlignedLocation;
+ originalMAPQ = other.originalMAPQ;
+ originalSAMFlags = other.originalSAMFlags;
+ originalFrontClipping = other.originalFrontClipping;
+ originalBackClipping = other.originalBackClipping;
+ originalFrontHardClipping = other.originalFrontHardClipping;
+ originalBackHardClipping = other.originalBackHardClipping;
+ originalRNEXT = other.originalRNEXT;
+ originalRNEXTLength = other.originalRNEXTLength;
+ originalPNEXT = other.originalPNEXT;
+ additionalFrontClipping = other.additionalFrontClipping;
+ }
+ //
+ // Initialize the Read. Reads do NOT take ownership of the memory to which they
+ // point, and it's the caller's responsibility to make sure that it continues to
+ // exist as long as the Read does. This is so that the caller can read a bunch of
+ // read data into a buffer, and then carve Reads out of it without doing further
+ // memory allocations, which would slow down the sequencing.
+ //
+ void init(
+ const char *i_id,
+ unsigned i_idLength,
+ const char *i_data,
+ const char *i_quality,
+ unsigned i_dataLength)
+ {
+ init(i_id, i_idLength, i_data, i_quality, i_dataLength, InvalidGenomeLocation, -1, 0, 0, 0, 0, 0, NULL, 0, 0); // Convenience overload for reads without prior alignment info: location is InvalidGenomeLocation, -1 goes to the unsigned i_originalMAPQ parameter, and the other original* values are zero/NULL.
+ }
+ void init(
+ const char * i_id,
+ unsigned i_idLength,
+ const char * i_data,
+ const char * i_quality,
+ unsigned i_dataLength,
+ GenomeLocation i_originalAlignedLocation,
+ unsigned i_originalMAPQ,
+ unsigned i_originalSAMFlags,
+ unsigned i_originalFrontClipping,
+ unsigned i_originalBackClipping,
+ unsigned i_originalFrontHardClipping,
+ unsigned i_originalBackHardClipping,
+ const char * i_originalRNEXT,
+ unsigned i_originalRNEXTLength,
+ unsigned i_originalPNEXT,
+ bool allUpper = false) // Pass true to skip the lower-case/'.' scan below.
+ {
+ id = i_id;
+ idLength = i_idLength;
+ data = unclippedData = externalData = i_data; // All three start out pointing at the caller's buffer; nothing is copied here.
+ quality = unclippedQuality = externalQuality = i_quality;
+ dataLength = i_dataLength;
+ unclippedLength = dataLength;
+ frontClippedLength = 0;
+ clippingState = NoClipping;
+ additionalFrontClipping = 0;
+ originalAlignedLocation = i_originalAlignedLocation;
+ originalMAPQ = i_originalMAPQ;
+ originalSAMFlags = i_originalSAMFlags;
+ originalFrontClipping = i_originalFrontClipping;
+ originalBackClipping = i_originalBackClipping;
+ originalFrontHardClipping = i_originalFrontHardClipping;
+ originalBackHardClipping = i_originalBackHardClipping;
+ originalRNEXT = i_originalRNEXT;
+ originalRNEXTLength = i_originalRNEXTLength;
+ originalPNEXT = i_originalPNEXT;
+ currentReadDirection = FORWARD;
+ localBufferAllocationOffset = 0; // Clears out any allocations that might previously have been in the buffer
+ upcaseForwardRead = rcData = rcQuality = NULL;
+ //
+ // Check for lower case letters in the data, and convert to upper case if there are any. Also convert
+ // '.' to N. The converted copy lives in localBuffer; quality keeps pointing at the caller's buffer.
+ //
+ if (! allUpper) {
+ unsigned anyLowerCase = 0;
+ for (unsigned i = 0; i < dataLength; i++) {
+ anyLowerCase |= IS_LOWER_CASE_OR_DOT[data[i]]; // NOTE(review): data[i] is a plain char; if char is signed and the input contains bytes >= 0x80 this index is negative -- confirm how the table is declared.
+ }
+ if (anyLowerCase) {
+ assureLocalBufferLargeEnough();
+ upcaseForwardRead = localBuffer;
+ localBufferAllocationOffset += unclippedLength; // Reserve unclippedLength bytes at the start of the local buffer for the upper-cased copy.
+ for (unsigned i = 0; i < dataLength; i++) {
+ upcaseForwardRead[i] = TO_UPPER_CASE_DOT_TO_N[data[i]];
+ }
+ unclippedData = data = upcaseForwardRead; // From here on the read's bases come from the local copy, not the caller's buffer.
+ }
+ }
+ }
+ // For efficiency, this class holds id, data and quality pointers that are
+ // *NOT* guaranteed to be to null-terminated strings; use the length fields
+ // to figure out how far to read into these strings.
+ inline const char *getId() const {return id;}
+ inline unsigned getIdLength() const {return idLength;}
+ inline const char *getData() const {return data;}
+ inline const char *getUnclippedData() const {return unclippedData;}
+ inline const char *getQuality() const {return quality;}
+ inline const char *getUnclippedQuality() const {return unclippedQuality;}
+ inline unsigned getDataLength() const {return dataLength;}
+ inline unsigned getUnclippedLength() const {return unclippedLength;}
+ inline unsigned getFrontClippedLength() const {return (unsigned)(data - unclippedData);} // number of bases clipped from the front of the read
+ inline unsigned getBackClippedLength() const {return unclippedLength - dataLength - getFrontClippedLength();}
+ inline void setUnclippedLength(unsigned length) {unclippedLength = length;}
+ inline ReadClippingType getClippingState() const {return clippingState;}
+ inline DataBatch getBatch() { return batch; }
+ inline void setBatch(DataBatch b) { batch = b; }
+ inline const char* getReadGroup() const { return readGroup; }
+ inline void setReadGroup(const char* rg) { readGroup = rg; }
+ inline GenomeLocation getOriginalAlignedLocation() {return originalAlignedLocation;}
+ inline unsigned getOriginalMAPQ() {return originalMAPQ;}
+ inline unsigned getOriginalSAMFlags() {return originalSAMFlags;}
+ inline unsigned getOriginalFrontClipping() {return originalFrontClipping;}
+ inline unsigned getOriginalBackClipping() {return originalBackClipping;}
+ inline unsigned getOriginalFrontHardClipping() {return originalFrontHardClipping;}
+ inline unsigned getOriginalBackHardClipping() {return originalBackHardClipping;}
+ inline const char *getOriginalRNEXT() {return originalRNEXT;}
+ inline unsigned getOriginalRNEXTLength() {return originalRNEXTLength;}
+ inline unsigned getOriginalPNEXT() {return originalPNEXT;}
+ inline void setAdditionalFrontClipping(int clipping) // Sets the additional front clipping to an absolute amount, replacing any earlier value.
+ {
+ data += clipping - additionalFrontClipping; // Only the difference from the previously applied amount is used, so repeated calls do not accumulate.
+ dataLength -= clipping - additionalFrontClipping;
+ quality += clipping - additionalFrontClipping;
+ additionalFrontClipping = clipping;
+ }
+ //
+ // Return the auxiliary data pointer. *o_length receives its length; *o_isSAM is true when the data
+ // is present, at least 5 bytes long and has ':' as its third byte (the SAM text form of a tag).
+ //
+ inline char* getAuxiliaryData(unsigned* o_length, bool * o_isSAM) const
+ {
+     *o_length = auxiliaryDataLength;
+     if (auxiliaryData && auxiliaryDataLength >= 5) {
+         *o_isSAM = auxiliaryData[2] == ':';
+     } else {
+         *o_isSAM = false;
+     }
+     return auxiliaryData;
+ }
+ // Record the auxiliary data pointer and its length; the bytes themselves are not copied.
+ inline void setAuxiliaryData(char* data, unsigned len)
+ {
+     auxiliaryDataLength = len;
+     auxiliaryData = data;
+ }
+ //
+ // Put the read into the requested clipping state. Bases whose quality is '#' are clipped from the
+ // back and/or the front, depending on `clipping`. When maintainOriginalClipping is set, at least
+ // originalBackClipping / originalFrontClipping bases are clipped at the respective end as well.
+ //
+ // The original clipping amounts are clamped to the number of bases that are actually left, so an
+ // original clip larger than the read can no longer wrap the unsigned dataLength around.
+ //
+ void clip(ReadClippingType clipping, bool maintainOriginalClipping = false) {
+     if (clipping == clippingState) {
+         //
+         // Already in the right state.
+         //
+         return;
+     }
+     //
+     // Revert to unclipped, then clip to the correct state.
+     //
+     dataLength = unclippedLength;
+     frontClippedLength = 0;
+     data = unclippedData;
+     quality = unclippedQuality;
+     //
+     // First clip from the back.
+     //
+     if (ClipBack == clipping || ClipFrontAndBack == clipping) {
+         unsigned backClipping = 0;
+         while (dataLength > 0 && quality[dataLength - 1] == '#') {
+             dataLength--;
+             backClipping++;
+         }
+         if (maintainOriginalClipping && backClipping < originalBackClipping) {
+             //
+             // Clip the remainder of the original back clipping, but never more than what is left.
+             //
+             unsigned extraBackClipping = originalBackClipping - backClipping;
+             if (extraBackClipping > dataLength) {
+                 extraBackClipping = dataLength;
+             }
+             dataLength -= extraBackClipping;
+         }
+     }
+     //
+     // Then clip from the beginning.
+     //
+     if (ClipFront == clipping || ClipFrontAndBack == clipping) {
+         frontClippedLength = 0;
+         while (frontClippedLength < dataLength && quality[frontClippedLength] == '#') {
+             frontClippedLength++;
+         }
+         if (maintainOriginalClipping && frontClippedLength < originalFrontClipping) {
+             frontClippedLength = originalFrontClipping;
+         }
+         if (frontClippedLength > dataLength) {
+             //
+             // The original front clipping covers more than is left of the read; clip everything.
+             //
+             frontClippedLength = dataLength;
+         }
+     }
+     _ASSERT(frontClippedLength <= dataLength);
+     dataLength -= frontClippedLength;
+     data += frontClippedLength;
+     quality += frontClippedLength;
+     clippingState = clipping;
+ }
+ // Number of consecutive '#' quality values (quality 2 in Phred+33) at the end of the clipped read.
+ unsigned countOfTrailing2sInQuality() const {
+     unsigned remaining = dataLength;
+     while (remaining > 0 && quality[remaining - 1] == '#') {
+         remaining--;
+     }
+     return dataLength - remaining;
+ }
+ //
+ // Sum of IS_N[] over the bases of the clipped read.
+ // The base is converted to unsigned char before it is used as a table index: plain char may be
+ // signed, and a byte >= 0x80 in the input would otherwise index in front of the table.
+ //
+ unsigned countOfNs() const {
+     unsigned count = 0;
+     for (unsigned i = 0; i < dataLength; i++) {
+         count += IS_N[(unsigned char)data[i]];
+     }
+     return count;
+ }
+ //
+ // Write the reverse complement of the clipped read into outputBuffer (no terminator is written).
+ // The base is converted to unsigned char before it indexes COMPLEMENT[], so a byte >= 0x80 cannot
+ // produce a negative index where plain char is signed.
+ //
+ void computeReverseCompliment(char *outputBuffer) { // Caller guarantees that outputBuffer is at least getDataLength() bytes
+     for (unsigned i = 0; i < dataLength; i++) {
+         outputBuffer[i] = COMPLEMENT[(unsigned char)data[dataLength - i - 1]];
+     }
+ }
+ //
+ // Toggle the read between its forward and its reverse-complement form. The reverse complement is
+ // computed once into localBuffer and reused on later toggles. Front/back clipping and the original
+ // (soft and hard) clipping values swap ends each time.
+ //
+ void becomeRC()
+ {
+     if (RC == currentReadDirection) {
+         //
+         // We've already RCed ourself. Switch back.
+         //
+         if (NULL != upcaseForwardRead) {
+             unclippedData = upcaseForwardRead;
+         } else {
+             unclippedData = externalData;
+         }
+         unclippedQuality = externalQuality;
+         currentReadDirection = FORWARD;
+     } else {
+         if (rcData != NULL) {
+             //
+             // The reverse complement was computed on an earlier call; reuse it.
+             //
+             unclippedData = rcData;
+             unclippedQuality = rcQuality;
+         } else {
+             assureLocalBufferLargeEnough();
+             rcData = localBuffer + localBufferAllocationOffset;
+             localBufferAllocationOffset += unclippedLength;
+             rcQuality = localBuffer + localBufferAllocationOffset;
+             localBufferAllocationOffset += unclippedLength;
+             _ASSERT(localBufferAllocationOffset <= localBufferLength);
+             for (unsigned i = 0; i < unclippedLength; i++) {
+                 // Index through unsigned char so that a byte >= 0x80 cannot become a negative index.
+                 rcData[i] = COMPLEMENT[(unsigned char)unclippedData[unclippedLength - i - 1]];
+                 rcQuality[unclippedLength-i-1] = unclippedQuality[i];
+             }
+             unclippedData = rcData;
+             unclippedQuality = rcQuality;
+         }
+         currentReadDirection = RC;
+     }
+     //
+     // The clipping reverses as we go to/from RC.
+     //
+     frontClippedLength = unclippedLength - dataLength - frontClippedLength;
+     data = unclippedData + frontClippedLength;
+     quality = unclippedQuality + frontClippedLength;
+     unsigned temp = originalFrontClipping;
+     originalFrontClipping = originalBackClipping;
+     originalBackClipping = temp;
+     temp = originalFrontHardClipping;
+     originalFrontHardClipping = originalBackHardClipping;
+     originalBackHardClipping = temp;
+ }
+ static void checkIdMatch(Read* read0, Read* read1);
+ //
+ // Extract the hard ('H') and soft ('S') clipping counts from the two ends of a cigar string that is
+ // terminated by a NUL or a tab. Exits the program when no terminator is found within 1000 characters.
+ //
+ static void computeClippingFromCigar(const char *cigarBuffer, unsigned *originalFrontClipping, unsigned *originalBackClipping, unsigned *originalFrontHardClipping, unsigned *originalBackHardClipping)
+ {
+     const size_t cigarLimit = 1000;
+     size_t cigarSize = 0;
+     while (cigarSize < cigarLimit && cigarBuffer[cigarSize] != '\0' && cigarBuffer[cigarSize] != '\t') {
+         cigarSize++;
+     }
+     if (cigarSize == cigarLimit) {
+         WriteErrorMessage( "Absurdly long cigar string.\n");
+         soft_exit(1);
+     }
+     size_t hardFrontChars, hardBackChars, softFrontChars, softBackChars;
+     //
+     // Hard clipping is outermost, so take it off first.
+     //
+     ExtractClipping(cigarBuffer, cigarSize, originalFrontHardClipping, originalBackHardClipping, 'H', &hardFrontChars, &hardBackChars);
+     _ASSERT(hardFrontChars + hardBackChars <= cigarSize);
+     //
+     // Whatever lies between the hard clips may start and/or end with soft clipping.
+     //
+     const char *softStart = cigarBuffer + hardFrontChars;
+     const size_t softSize = cigarSize - hardFrontChars - hardBackChars;
+     ExtractClipping(softStart, softSize, originalFrontClipping, originalBackClipping, 'S', &softFrontChars, &softBackChars);
+ }
+ const char *id;
+ const char *data; // Bases in the current clipping state; points into the buffer that unclippedData points to.
+ const char *unclippedData; // All bases of the read in the current direction (forward or RC).
+ const char *unclippedQuality; // All quality values, in the same direction as unclippedData.
+ const char *quality; // Quality values in the current clipping state.
+ const char *readGroup;
+ unsigned idLength;
+ unsigned dataLength; // Number of bases left after clipping.
+ unsigned unclippedLength;
+ unsigned frontClippedLength; // Bases clipped from the front by clip(); mirrored by becomeRC().
+ ReadClippingType clippingState;
+ int additionalFrontClipping; // Last value passed to setAdditionalFrontClipping().
+ //
+ // Alignment data that was in the read when it was read from a file. While this should probably also be the place to put
+ // information that'll be used by the read writer, for now it's not. Hence, they're all called "original."
+ //
+ GenomeLocation originalAlignedLocation;
+ unsigned originalMAPQ;
+ unsigned originalSAMFlags;
+ unsigned originalFrontClipping; // Swapped with originalBackClipping by becomeRC().
+ unsigned originalBackClipping;
+ unsigned originalFrontHardClipping; // Swapped with originalBackHardClipping by becomeRC().
+ unsigned originalBackHardClipping;
+ const char *originalRNEXT;
+ unsigned originalRNEXTLength;
+ unsigned originalPNEXT;
+ //
+ // Memory that's local to this read and that is used to contain an upcased version of the read as well as
+ // RC read & quality strings. It survives init() so as to avoid memory allocation overhead.
+ //
+ char localBuffer[MAX_READ_LENGTH * 3];
+ static const unsigned localBufferLength;
+ unsigned localBufferAllocationOffset; // The next location to allocate in the local buffer.
+ char *upcaseForwardRead; // Either NULL or points into localBuffer. Used when the incoming read isn't all capitalized. Unclipped.
+ char *rcData; // Either NULL or points into localBuffer. Used when we've computed a reverse complement of the read, whether we're using it or not. Unclipped.
+ char *rcQuality; // Ditto for quality.
+ const char *externalData; // The data that was passed in at init() time, memory doesn't belong to this.
+ const char *externalQuality; // The quality that was passed in at init() time, memory doesn't belong to this.
+ Direction currentReadDirection; // FORWARD or RC; toggled by becomeRC().
+ inline void assureLocalBufferLargeEnough() // Currently a no-op: the resizing code below is compiled out because localBuffer is a fixed-size array member.
+ {
+#if 0 // Always true with static allocation. The code below dates from a dynamically allocated localBuffer and is kept for reference only.
+ if (localBufferLength < 3 * unclippedLength) {
+ _ASSERT(0 == localBufferAllocationOffset); // Can only do this when the buffer is empty
+ if (NULL != localBuffer) {
+ BigDealloc(localBuffer);
+ }
+ localBufferLength = RoundUpToPageSize(3 * unclippedLength);
+ localBuffer = (char *)BigAlloc(localBufferLength);
+ }
+#endif // 0
+ }
+ // batch for managing lifetime during input
+ DataBatch batch;
+ // auxiliary data in BAM or SAM format (can tell by looking at 3rd byte), if available
+ char* auxiliaryData;
+ unsigned auxiliaryDataLength;
+ //
+ // Pull the clipping info from the front and back of a cigar string.
+ //
+ // cigarBuffer/cigarSize describe the cigar text to look at. clippingChar is the operation to look
+ // for ('H' or 'S'). On return *frontClipping / *backClipping hold the clip counts (0 when there is
+ // no such clip at that end) and *frontClippingChars / *backClippingChars hold how many characters of
+ // the cigar text the respective clip occupies (digits plus the operation character).
+ //
+ // A cigar that consists of nothing but one clip ("100S") is reported as front clipping only.
+ //
+ // All indices are size_t and the backward scan stops explicitly at index 0. (With an unsigned index
+ // and an "i >= 0" loop condition the scan would wrap around at the start of the string and read out
+ // of bounds.) The digit copies are bounded by the size of the local buffer.
+ //
+ static void ExtractClipping(const char *cigarBuffer, size_t cigarSize, unsigned *frontClipping, unsigned *backClipping, char clippingChar, size_t *frontClippingChars, size_t *backClippingChars)
+ {
+     *frontClipping = 0;
+     *frontClippingChars = 0;
+     *backClipping = 0;
+     *backClippingChars = 0;
+     const size_t bufferSize = 20;
+     char buffer[bufferSize+1]; // +1 for trailing null
+     //
+     // Front: a run of digits immediately followed by clippingChar.
+     //
+     size_t i;
+     for (i = 0; i < bufferSize && i < cigarSize && cigarBuffer[i] >= '0' && cigarBuffer[i] <= '9'; i++) {
+         buffer[i] = cigarBuffer[i];
+     }
+     if (i < cigarSize && cigarBuffer[i] == clippingChar) {
+         buffer[i] = '\0';
+         *frontClipping = atoi(buffer);
+         *frontClippingChars = i + 1;
+     }
+     //
+     // Find the end of the cigar string by looking for either the end of the string or a tab. Just start where we
+     // were.
+     //
+     size_t end = i;
+     while (end < cigarSize && cigarBuffer[end] != '\t' && cigarBuffer[end] != '\0') {
+         end++;
+     }
+     if (end <= 1 || cigarBuffer[end - 1] != clippingChar) {
+         return; // The string does not end in the clipping operation.
+     }
+     //
+     // Walk back over the digits in front of the trailing clippingChar.
+     //
+     const size_t opIndex = end - 1;
+     size_t numberStart = opIndex;
+     while (numberStart > 0 && cigarBuffer[numberStart - 1] >= '0' && cigarBuffer[numberStart - 1] <= '9') {
+         numberStart--;
+     }
+     //
+     // If we've gotten back to the beginning of the string, then the whole thing is one big clip. We arbitrarily
+     // select that to be front clipping, and so leave the back clipping alone. (A number that starts at index 1
+     // would mean the string begins with a non-digit; that is not treated as back clipping either.)
+     //
+     if (numberStart <= 1) {
+         return;
+     }
+     const size_t nDigits = opIndex - numberStart;
+     const size_t nToCopy = (nDigits < bufferSize) ? nDigits : bufferSize; // never overrun buffer
+     for (size_t j = 0; j < nToCopy; j++) {
+         buffer[j] = cigarBuffer[numberStart + j];
+     }
+     buffer[nToCopy] = '\0';
+     *backClipping = atoi(buffer);
+     *backClippingChars = nDigits + 1;
+ }
+// Reads that copy the memory for their strings. They're less efficient than the base
+// Read class, but you can keep them around without holding references to the IO buffers
+// and eventually stopping the IO.
+class ReadWithOwnMemory : public Read {
+ ReadWithOwnMemory() : Read(), extraBuffer(NULL), dataBuffer(NULL), idBuffer(NULL), qualityBuffer(NULL), auxBuffer(NULL) {} // Empty read; all buffer pointers start out NULL.
+ ReadWithOwnMemory(const Read &baseRead) { // Copying constructor: set() assigns every buffer pointer, so they are not initialized separately here.
+ set(baseRead);
+ }
+ // must manually call destructor!
+ //
+ // Free the heap buffer that set() may have allocated. The pointer is reset afterwards so that a
+ // second call to dispose() is harmless instead of freeing the same memory twice.
+ //
+ void dispose() {
+     delete [] extraBuffer; // delete [] of a NULL pointer does nothing
+     extraBuffer = NULL;
+ }
+ void set(const Read &baseRead) // Make this read a private copy of baseRead's id, unclipped bases/qualities, clipping state, read group and auxiliary data.
+ {
+ // allocate space in ownBuffer if possible; id/aux might need extraBuffer
+ dataBuffer = ownBuffer;
+ int ownBufferUsed = baseRead.getUnclippedLength() + 1; // +1 for the terminating NUL written below
+ qualityBuffer = ownBuffer + ownBufferUsed;
+ ownBufferUsed += baseRead.getUnclippedLength() + 1; // NOTE(review): nothing here checks that data + quality fit into ownBuffer; presumably the unclipped length never exceeds MAX_READ_LENGTH - confirm
+ unsigned auxLen;
+ bool auxSam;
+ char* aux = baseRead.getAuxiliaryData(&auxLen, &auxSam);
+ if (baseRead.getIdLength() + 1 < sizeof(ownBuffer) - ownBufferUsed) { // the id (plus NUL) goes into ownBuffer only if it fits in what is left
+ idBuffer = ownBuffer + ownBufferUsed;
+ ownBufferUsed += baseRead.getIdLength() + 1;
+ } else {
+ idBuffer = NULL;
+ }
+ if (auxLen > 0 && auxLen < sizeof(ownBuffer) - ownBufferUsed) { // same for the auxiliary data
+ auxBuffer = ownBuffer + ownBufferUsed;
+ ownBufferUsed += auxLen;
+ } else {
+ auxBuffer = NULL;
+ }
+ if (idBuffer == NULL || (auxLen > 0 && auxBuffer == NULL)) { // something did not fit: put it into one heap allocation
+ extraBuffer = new char[(idBuffer == NULL ? baseRead.getIdLength() + 1 : 0) + auxLen]; // NOTE(review): a previously allocated extraBuffer is not freed here; callers presumably call dispose() before re-using the object - confirm
+ int extraBufferUsed = 0;
+ if (idBuffer == NULL) {
+ idBuffer = extraBuffer;
+ extraBufferUsed += baseRead.getIdLength() + 1;
+ }
+ if (auxLen > 0 && auxBuffer == NULL) {
+ auxBuffer = extraBuffer + extraBufferUsed;
+ }
+ } else {
+ extraBuffer = NULL;
+ }
+ // copy data into buffers
+ memcpy(idBuffer,baseRead.getId(),baseRead.getIdLength());
+ idBuffer[baseRead.getIdLength()] = '\0'; // Even though it doesn't need to be null terminated, it seems like a good idea.
+ memcpy(dataBuffer,baseRead.getUnclippedData(),baseRead.getUnclippedLength());
+ dataBuffer[baseRead.getUnclippedLength()] = '\0';
+ memcpy(qualityBuffer,baseRead.getUnclippedQuality(),baseRead.getUnclippedLength());
+ qualityBuffer[baseRead.getUnclippedLength()] = '\0';
+ init(idBuffer,baseRead.getIdLength(),dataBuffer,qualityBuffer,baseRead.getUnclippedLength()); // re-initialize the base Read on top of the private copies
+ clip(baseRead.getClippingState()); // reproduce the source's clipping state (without maintainOriginalClipping)
+ setReadGroup(baseRead.getReadGroup()); // the read group string itself is not copied, only the pointer
+ if (aux != NULL && auxLen > 0) {
+ memcpy(auxBuffer, aux, auxLen);
+ setAuxiliaryData(auxBuffer, auxLen);
+ } else {
+ setAuxiliaryData(NULL, 0);
+ }
+ }
+ char ownBuffer[MAX_READ_LENGTH * 2 + 1000]; // internal buffer for copied data
+ char* extraBuffer; // extra buffer if internal buffer not big enough
+ // should all point into ownBuffer or extraBuffer
+ char *idBuffer;
+ char *dataBuffer;
+ char *qualityBuffer;
+ char *auxBuffer;
+extern const unsigned DEFAULT_MIN_READ_LENGTH;
diff --git a/SNAPLib/ReadReader.cpp b/SNAPLib/ReadReader.cpp
new file mode 100644
index 0000000..58592d1
--- /dev/null
+++ b/SNAPLib/ReadReader.cpp
@@ -0,0 +1,57 @@
+Module Name:
+ ReadReader.cpp
+ Concrete file reading classes
+ User mode service.
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "Read.h"
+#include "AlignmentResult.h"
+#include "FileFormat.h"
+class SimpleReadReader : public ReadReader
+ SimpleReadReader(const FileFormat* i_format, DataReader* i_data, const ReaderContext& i_context) // Stores the format and data reader pointers; headerSize starts at 0.
+ : ReadReader(i_context), format(i_format), data(i_data), headerSize(0)
+ {}
+ virtual ~SimpleReadReader() // The reader deletes the DataReader it was given.
+ {
+ delete data;
+ }
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ virtual bool getNextRead(Read *readToUpdate);
+ //
+ // Variant that would also return the alignment stored with the read. It is not implemented: the
+ // forwarding call is commented out. Flowing off the end of a function that returns bool is
+ // undefined behavior, so report "no read" explicitly.
+ //
+ virtual bool getNextRead(Read *read, AlignmentResult *alignmentResult, unsigned *genomeLocation, bool *isRC, unsigned *mapQ,
+ unsigned *flag, const char **cigar)
+ {
+     // return getNextRead(read,alignmentResult,genomeLocation,isRC,mapQ,flag,false,cigar);
+     return false;
+ }
+ virtual void holdBatch(DataBatch batch) // Forwarded unchanged to the underlying DataReader.
+ { data->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch) // Forwarded unchanged to the underlying DataReader; returns its result.
+ { return data->releaseBatch(batch); }
+ const FileFormat* format;
+ DataReader* data;
+ _int64 headerSize;
diff --git a/SNAPLib/ReadSupplierQueue.cpp b/SNAPLib/ReadSupplierQueue.cpp
new file mode 100644
index 0000000..b8bddfe
--- /dev/null
+++ b/SNAPLib/ReadSupplierQueue.cpp
@@ -0,0 +1,729 @@
+Module Name:
+ ReadSupplierQueue.cpp
+ Code for parallel queue of reads
+ Bill Bolosky, November, 2012
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Read.h"
+#include "Compat.h"
+#include "ReadSupplierQueue.h"
+#include "exit.h"
+#include "SAM.h"
+ ReadSupplierQueue::ReadSupplierQueue(ReadReader *reader)
+ : tracker(64)
+ commonInit();
+ singleReader[0] = reader;
+ReadSupplierQueue::ReadSupplierQueue(ReadReader *firstHalfReader, ReadReader *secondHalfReader)
+ : tracker(64)
+ commonInit();
+ singleReader[0] = firstHalfReader;
+ singleReader[1] = secondHalfReader;
+ReadSupplierQueue::ReadSupplierQueue(PairedReadReader *i_pairedReader)
+ : tracker(128)
+ commonInit();
+ pairedReader = i_pairedReader;
+ nReadersRunning = 0;
+ nSuppliersRunning = 0;
+ allReadsQueued = false;
+ balance = 0;
+ emptyQueue->next = emptyQueue->prev = emptyQueue;
+ readyQueue[0].next = readyQueue[0].prev = &readyQueue[0];
+ readyQueue[1].next = readyQueue[1].prev = &readyQueue[1];
+ InitializeExclusiveLock(&lock);
+ CreateEventObject(&readsReady);
+ CreateEventObject(&emptyBuffersAvailable);
+ CreateEventObject(&allReadsConsumed);
+ //
+ // Create 2 buffers for the reader. We'll add more buffers as we add suppliers.
+ //
+ for (int i = 0 ; i < 2; i++) {
+ ReadQueueElement *element = new ReadQueueElement;
+ element->addToTail(emptyQueue);
+ }
+ AllowEventWaitersToProceed(&emptyBuffersAvailable);
+ for (int i = 0; i < 2; i++) {
+ CreateEventObject(&throttle[i]);
+ AllowEventWaitersToProceed(&throttle[i]);
+ singleReader[i] = NULL;
+ }
+ pairedReader = NULL;
+ elementSize = ReadQueueElement::MaxReadsPerElement;
+ delete singleReader[0];
+ delete singleReader[1];
+ delete pairedReader;
+ DestroyEventObject(&throttle[0]);
+ DestroyEventObject(&throttle[1]);
+ DestroyExclusiveLock(&lock);
+ bool
+ bool worked = true;
+ if (singleReader[1] == NULL) {
+ nReadersRunning = 1;
+ } else {
+ nReadersRunning = 2;
+ }
+ ReaderThreadParams *readerParams = new ReaderThreadParams;
+ readerParams->isSecondReader = false;
+ readerParams->queue = this;
+ if (!StartNewThread(ReaderThreadMain, readerParams)) {
+ return false;
+ }
+ if (singleReader[1] == NULL) {
+ return true;
+ }
+ readerParams = new ReaderThreadParams;
+ readerParams->isSecondReader = true;
+ readerParams->queue = this;
+ return (StartNewThread(ReaderThreadMain, readerParams));
+ void
+ WaitForEvent(&allReadsConsumed);
+ ReadSupplier *
+ AcquireExclusiveLock(&lock);
+ nSuppliersRunning++;
+ //
+ // Add more queue elements for this supplier.
+ //
+ for (int i = 0; i < 2; i++) {
+ ReadQueueElement *element = new ReadQueueElement;
+ element->addToTail(emptyQueue);
+ }
+ AllowEventWaitersToProceed(&emptyBuffersAvailable);
+ ReleaseExclusiveLock(&lock);
+ return new ReadSupplierFromQueue(this);
+ PairedReadSupplier *
+ const int addElements = (singleReader[1] == NULL) ? 2 : 4 + MaxImbalance;
+ ReadQueueElement * newElements[MaxImbalance + 4];
+ for (int i = 0 ; i < addElements; i++) {
+ newElements[i] = new ReadQueueElement;
+ }
+ AcquireExclusiveLock(&lock);
+ nSuppliersRunning++;
+ //
+ // Add two more queue elements (4+MaxImbalance for paired-end, double file).
+ //
+ for (int i = 0; i < addElements; i++) {
+ ReadQueueElement *element = newElements[i];
+ element->addToTail(emptyQueue);
+ }
+ AllowEventWaitersToProceed(&emptyBuffersAvailable);
+ ReleaseExclusiveLock(&lock);
+ return new PairedReadSupplierFromQueue(this, singleReader[1] != NULL);
+ ReaderContext*
+ return singleReader[0] != NULL ? singleReader[0]->getContext() : pairedReader->getContext();
+ ReadQueueElement *
+ _ASSERT(singleReader[1] == NULL); // i.e., we're doing file (but possibly single or paired end) reads
+ //WriteErrorMessage("Thread %u: getElement wait acquire lock\n", GetThreadId());
+ AcquireExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElement acquired lock\n", GetThreadId());
+ while (!areAnyReadsReady()) {
+ //WriteErrorMessage("Thread %u: getElement loop releasing lock\n", GetThreadId());
+ ReleaseExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElement loop released lock\n", GetThreadId());
+ if (allReadsQueued) {
+ //
+ // Everything's queued and the queue is empty. No more work.
+ //
+ //WriteErrorMessage("Thread %u: getElement loop exit allReadsQueued\n", GetThreadId());
+ return NULL;
+ }
+ //WriteErrorMessage("Thread %u: getElement loop wait readsReady\n", GetThreadId());
+ WaitForEvent(&readsReady);
+ //WriteErrorMessage("Thread %u: getElement loop wait acquire lock\n", GetThreadId());
+ AcquireExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElement loop acquired lock\n", GetThreadId());
+ }
+ ReadQueueElement *element = readyQueue[0].next;
+ _ASSERT(element != &readyQueue[0]);
+ element->removeFromQueue();
+ if (!areAnyReadsReady() && !allReadsQueued) {
+ //WriteErrorMessage("Thread %u: getElement block readsReady\n", GetThreadId());
+ PreventEventWaitersFromProceeding(&readsReady);
+ }
+ ReleaseExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElement released lock\n", GetThreadId());
+ return element;
+ bool
+ReadSupplierQueue::getElements(ReadQueueElement **element1, ReadQueueElement **element2)
+ _ASSERT(singleReader[1] != NULL); // i.e., we're doing paired file reads
+ //WriteErrorMessage("Thread %u: getElements wait acquire lock\n", GetThreadId());
+ AcquireExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElements acquired lock\n", GetThreadId());
+ while (!areAnyReadsReady()) {
+ //WriteErrorMessage("Thread %u: getElements loop releasing lock\n", GetThreadId());
+ ReleaseExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElements loop released lock\n", GetThreadId());
+ if (allReadsQueued) {
+ //
+ // Everything's queued and the queue is empty. No more work.
+ //
+ //WriteErrorMessage("Thread %u: getElement loop exit allReadsQueued\n", GetThreadId());
+ return NULL;
+ }
+ //WriteErrorMessage("Thread %u: getElements loop wait readsReady\n", GetThreadId());
+ WaitForEvent(&readsReady);
+ //WriteErrorMessage("Thread %u: getElements loop wait acquire lock\n", GetThreadId());
+ AcquireExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElements loop acquired lock\n", GetThreadId());
+ }
+ ReadQueueElement* copyOut = NULL; // for adjusting sizes
+ *element1 = readyQueue[0].next;
+ *element2 = readyQueue[1].next;
+ if ((*element1)->totalReads != (*element2)->totalReads) {
+ copyOut = getEmptyElement(); // might release/acquire lock so state could change!
+ *element1 = readyQueue[0].next;
+ *element2 = readyQueue[1].next;
+ if ((*element1)->totalReads == (*element2)->totalReads) {
+ doneWithElement(copyOut);
+ }
+ }
+ if ((*element1)->totalReads == (*element2)->totalReads) {
+ (*element1)->removeFromQueue();
+ (*element2)->removeFromQueue();
+ } else {
+ //fprintf(stderr,"getElements different sizes %d %d\n", (*element1)->totalReads, (*element2)->totalReads);
+ // need to balance out reads between the two
+ // make a copy of the min# of reads from larger element
+ // shrink the larger element and leave it there
+ ReadQueueElement* elements[2] = {*element1, *element2};
+ int sizes[2] = {elements[0]->totalReads, elements[1]->totalReads};
+ int largerOne = elements[1]->totalReads > elements[0]->totalReads;
+ int minReads = elements[1-largerOne]->totalReads;
+ memcpy(copyOut->reads, elements[largerOne]->reads, minReads * sizeof(Read));
+ _ASSERT(elements[0]->totalReads == sizes[0] && elements[1]->totalReads == sizes[1] && elements[largerOne]->totalReads > elements[1-largerOne]->totalReads);
+ copyOut->totalReads = minReads;
+ _ASSERT(elements[0]->totalReads == sizes[0] && elements[1]->totalReads == sizes[1] && elements[largerOne]->totalReads > elements[1-largerOne]->totalReads);
+ memmove(elements[largerOne]->reads, &elements[largerOne]->reads[minReads],
+ (elements[largerOne]->totalReads - minReads) * sizeof(Read));
+ elements[largerOne]->totalReads -= minReads;
+ copyOut->batches.append(&elements[largerOne]->batches);
+ for (BatchVector::iterator i = copyOut->batches.begin(); i != copyOut->batches.end(); i++) {
+ holdBatch(*i);
+ }
+ if (largerOne == 0) {
+ *element1 = copyOut;
+ (*element2)->removeFromQueue();
+ } else {
+ (*element1)->removeFromQueue();
+ *element2 = copyOut;
+ }
+ //WriteErrorMessage("Thread %u: balanced sizes %d %d\n", GetThreadId(), sizes[0], sizes[1]);
+ }
+ //fprintf(stderr,"getElements %x/%x with %d/%d reads\n", (int) (*element1), (int) (*element2), (*element1)->totalReads, (*element2)->totalReads);
+ if (!areAnyReadsReady() && !allReadsQueued) {
+ //WriteErrorMessage("Thread %u: getElements block readsReady\n", GetThreadId());
+ PreventEventWaitersFromProceeding(&readsReady);
+ }
+ //WriteErrorMessage("Thread %u: getElements releasing lock\n", GetThreadId());
+ ReleaseExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getElements released lock", GetThreadId());
+ return true;
+ bool
+ReadSupplierQueue::areAnyReadsReady() // must hold the lock to call this.
+ //
+ // Reads are ready when the first ready queue is non-empty and, if there is a second reader, the
+ // second ready queue is non-empty as well.
+ //
+ const bool firstQueueHasReads = readyQueue[0].next != &readyQueue[0];
+ return firstQueueHasReads && (singleReader[1] == NULL || readyQueue[1].next != &readyQueue[1]);
+ void
+ReadSupplierQueue::doneWithElement(ReadQueueElement *element)
+ //WriteErrorMessage("Thread %u: doneWithElement wait acquire lock\n", GetThreadId());
+ AcquireExclusiveLock(&lock); // NOTE(review): acquires the queue lock itself, so callers presumably must not already hold it - confirm against the lock implementation
+ //WriteErrorMessage("Thread %u: doneWithElement acquired lock\n", GetThreadId());
+ _ASSERT(element->totalReads > 0);
+ VariableSizeVector<DataBatch> batches = element->batches; // copy the batch list so the batches can be released after the lock has been dropped
+ element->batches.clear();
+ element->addToTail(emptyQueue); // the element goes back to the pool of empty elements
+ AllowEventWaitersToProceed(&emptyBuffersAvailable);
+ //WriteErrorMessage("Thread %u: doneWithElement releasing lock\n", GetThreadId());
+ ReleaseExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: doneWithElement released lock\n", GetThreadId());
+ for (VariableSizeVector<DataBatch>::iterator b = batches.begin(); b != batches.end(); b++) {
+ releaseBatch(*b);
+ }
+ void
+ //WriteErrorMessage("Thread %u: supplierFinished wait acquire lock\n", GetThreadId());
+ AcquireExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: supplierFinished acquired lock\n", GetThreadId());
+ _ASSERT(allReadsQueued);
+ _ASSERT(nSuppliersRunning > 0);
+ nSuppliersRunning--;
+ if (0 == nSuppliersRunning) {
+ AllowEventWaitersToProceed(&allReadsConsumed);
+ }
+ //WriteErrorMessage("Thread %u: supplierFinished releasing lock\n", GetThreadId());
+ ReleaseExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: supplierFinished released lock\n", GetThreadId());
+ void
+ DataBatch batch)
+ if (pairedReader != NULL) {
+ pairedReader->holdBatch(batch);
+ } else if (singleReader[1] == NULL) {
+ singleReader[0]->holdBatch(batch);
+ } else {
+ singleReader[batch.fileID % 2]->holdBatch(DataBatch(batch.batchID, batch.fileID / 2));
+ }
+ bool
+ DataBatch batch)
+ if (pairedReader != NULL) {
+ return pairedReader->releaseBatch(batch);
+ } else if (singleReader[1] == NULL) {
+ return singleReader[0]->releaseBatch(batch);
+ } else {
+ return singleReader[batch.fileID % 2]->releaseBatch(DataBatch(batch.batchID, batch.fileID / 2));
+ }
+ void
+ReadSupplierQueue::ReaderThreadMain(void *param)
+ ReaderThreadParams *params = (ReaderThreadParams *)param; // param is the ReaderThreadParams that was passed to StartNewThread
+ params->queue->ReaderThread(params); // runs until the reader has no more reads
+ delete params; // the thread owns its parameter block
+ ReadQueueElement*
+ while (emptyQueue->next == emptyQueue) {
+ // Wait for a buffer.
+ //WriteErrorMessage("Thread %u: getEmptyElement releasing lock\n", GetThreadId());
+ ReleaseExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getEmptyElement released lock\n", GetThreadId());
+ WaitForEvent(&emptyBuffersAvailable);
+ //WriteErrorMessage("Thread %u: getEmptyElement acquiring lock\n", GetThreadId());
+ AcquireExclusiveLock(&lock);
+ //WriteErrorMessage("Thread %u: getEmptyElement acquired lock\n", GetThreadId());
+ }
+ ReadQueueElement *element = emptyQueue->next;
+ element->removeFromQueue();
+ if (emptyQueue->next == emptyQueue) {
+ PreventEventWaitersFromProceeding(&emptyBuffersAvailable);
+ }
+ return element;
+ void
+ReadSupplierQueue::ReaderThread(ReaderThreadParams *params)
+ AcquireExclusiveLock(&lock); // the lock is held from here on except while waiting and while filling an element
+ bool done = false;
+ ReadReader *reader;
+ if (params->isSecondReader) {
+ reader = singleReader[1];
+ } else {
+ reader = singleReader[0]; // In the pairedReader case, this will just be NULL
+ }
+ int increment = (NULL == reader) ? 2 : 1; // reads consumed per loop iteration: 2 when reading pairs through pairedReader
+ int balanceIncrement = params->isSecondReader ? -1 : 1; // the first reader pushes balance up, the second pushes it down
+ int firstOrSecond = params->isSecondReader ? 1 : 0; // index of this thread's ready queue and throttle
+ bool isSingleReader = (NULL == singleReader[1]);
+ _int64 balanceTime = 0;
+ _int64 bufferWaitTime = 0;
+ _int64 processingTime = 0;
+ _int64 startTime = timeInNanos();
+ // may pass reads forward from element loops to maintain batching or element size
+ Read firstReadForNextElement[2];
+ bool hasFirstReadForNextElement = false;
+ while (!done) {
+ if ((!isSingleReader) && balance * balanceIncrement > MaxImbalance) {
+ //
+ // We're over full. Wait to get back in balance.
+ //
+ ReleaseExclusiveLock(&lock);
+ _int64 now = timeInNanos();
+ processingTime += now - startTime;
+ startTime = now;
+ WaitForEvent(&throttle[firstOrSecond]);
+ now = timeInNanos();
+ balanceTime += now - startTime;
+ startTime = now;
+ AcquireExclusiveLock(&lock);
+ _ASSERT(balance * balanceIncrement <= MaxImbalance);
+ }
+ // pull an empty element from the queue
+ _int64 now = timeInNanos();
+ processingTime += now - startTime;
+ startTime = now;
+ ReadQueueElement* element = getEmptyElement();
+ now = timeInNanos();
+ bufferWaitTime += now - startTime;
+ startTime = now;
+ //
+ // Now fill in the reads from the reader into the element until it's
+ // full or the reader finishes or it exceeds batch count
+ //
+ ReleaseExclusiveLock(&lock);
+ element->totalReads = 0;
+ for (; element->totalReads <= (int) elementSize - increment; element->totalReads += increment) {
+ if (NULL != reader) {
+ Read* read = &element->reads[element->totalReads];
+ if (hasFirstReadForNextElement) {
+ _ASSERT(element->totalReads == 0);
+ *read = firstReadForNextElement[0];
+ // already called holdBatch when firstReadForNextElement set
+ element->batches.push_back(read->getBatch());
+ hasFirstReadForNextElement = false;
+ } else {
+ done = ! reader->getNextRead(read);
+ if (done) {
+ break;
+ }
+ if (! isSingleReader) {
+ read->setBatch(DataBatch(read->getBatch().batchID, read->getBatch().fileID * 2 + firstOrSecond)); // fold the reader index into fileID; holdBatch/releaseBatch undo this with % 2 and / 2
+ }
+ bool newBatch = element->totalReads == 0 ||
+ (read->getBatch() != read[-1].getBatch() && element->batches.search(read->getBatch()) == element->batches.end()); // read[-1] is only evaluated when totalReads > 0
+ if (element->batches.size() + newBatch <= BatchesPerElement) {
+ // won't exceed limit for this element
+ if (newBatch && element->batches.add(read->getBatch())) {
+ holdBatch(read->getBatch());
+ }
+ } else {
+ // too many batches, hold for next queue element
+ firstReadForNextElement[0] = *read;
+ hasFirstReadForNextElement = true;
+ holdBatch(read->getBatch());
+ break;
+ }
+ }
+ } else if (NULL != pairedReader) {
+ Read* read = &element->reads[element->totalReads];
+ if (hasFirstReadForNextElement) {
+ _ASSERT(element->totalReads == 0);
+ read[0] = firstReadForNextElement[0];
+ read[1] = firstReadForNextElement[1];
+ // already called holdBatch when firstReadForNextElement set
+ element->batches.push_back(read[0].getBatch());
+ if (read[1].getBatch() != read[0].getBatch()) {
+ element->batches.push_back(read[1].getBatch());
+ }
+ hasFirstReadForNextElement = false;
+ } else {
+ done = !pairedReader->getNextReadPair(&read[0], &read[1]);
+ if (done) {
+ break;
+ }
+ DataBatch b[2] = {read[0].getBatch(), read[1].getBatch()};
+ bool newBatch[2] =
+ {(element->totalReads == 0 || read[-2].getBatch() != b[0]) &&
+ element->batches.search(b[0]) == element->batches.end(),
+ b[0] != b[1] && (element->totalReads == 0 || read[-1].getBatch() != b[1]) &&
+ element->batches.search(b[1]) == element->batches.end()};
+ if (element->batches.size() + newBatch[0] + newBatch[1] <= BatchesPerElement) {
+ if (newBatch[0] && element->batches.add(b[0])) {
+ holdBatch(b[0]);
+ }
+ if (newBatch[1] && element->batches.add(b[1])) {
+ holdBatch(b[1]);
+ }
+ } else {
+ firstReadForNextElement[0] = read[0];
+ firstReadForNextElement[1] = read[1];
+ holdBatch(b[0]);
+ if (b[1] != b[0]) {
+ holdBatch(b[1]);
+ }
+ hasFirstReadForNextElement = true;
+ break;
+ }
+ //WriteErrorMessage("ReadSupplierQueue::ReaderThread element %x batches %d:%d, %d:%d\n", (int) element, b[0].fileID, b[0].batchID, b[1].fileID, b[1].batchID);
+ }
+ }
+ }
+ //WriteErrorMessage("ReadSupplierQueue element[%d] %x with %d reads %d batches\n", firstOrSecond, (int) element, element->totalReads, element->batches.size());
+ AcquireExclusiveLock(&lock);
+ // do this before AllowEventWaitersToProceed to avoid race condition
+ if (done && 1 == nReadersRunning) {
+ //WriteErrorMessage("Thread %u: set allReadsQueued (%d) in ReaderThread...\n", GetThreadId(), element->totalReads);
+ allReadsQueued = true;
+ AllowEventWaitersToProceed(&readsReady); // Even if we have nothing to queue, allow the consumers to wake up so they can exit
+ }
+ if (element->totalReads > 0) { // NOTE(review): an element that ends up with no reads is not put back on the empty queue here
+ element->addToTail(&readyQueue[firstOrSecond]);
+ if (isSingleReader || &readyQueue[1-firstOrSecond] != readyQueue[1-firstOrSecond].next) {
+ //
+ // Signal that an element is ready.
+ //
+ //WriteErrorMessage("Thread %u: signal readsReady in ReaderThread...\n", GetThreadId());
+ AllowEventWaitersToProceed(&readsReady);
+ }
+ if (!isSingleReader) {
+ //WriteErrorMessage("Thread %u: balance %d %+d = %d...\n", GetThreadId(), balance, balanceIncrement, balance + balanceIncrement);
+ balance += balanceIncrement;
+ if (balance * balanceIncrement > MaxImbalance) {
+ _ASSERT(balance * balanceIncrement == MaxImbalance + 1); // We can get at most one past the limit
+ //
+ // We're too far ahead. Close our throttle.
+ //
+ //WriteErrorMessage("Thread %u: close throttle %d in ReaderThread...\n", GetThreadId(), firstOrSecond);
+ PreventEventWaitersFromProceeding(&throttle[firstOrSecond]);
+ } else if (balance * -1 * balanceIncrement == MaxImbalance) {
+ //
+ // We just pushed it back into balance (barely) for the other guy. Allow him to
+ // proceed.
+ //
+ //WriteErrorMessage("Thread %u: release throttle %d in ReaderThread...\n", GetThreadId(), 1-firstOrSecond);
+ AllowEventWaitersToProceed(&throttle[1-firstOrSecond]);
+ }
+ }
+ }
+ } // While ! done
+ processingTime += timeInNanos() - startTime;
+ //WriteErrorMessage("ReadSupplier: %llds processing, %llds waiting for balance, %llds waiting for buffer\n", processingTime / 1000000000, balanceTime / 1000000000, bufferWaitTime / 1000000000);
+ _ASSERT(nReadersRunning > 0);
+ nReadersRunning--;
+ ReleaseExclusiveLock(&lock);
+ ReadSupplierQueue *i_queue)
+ :
+ queue(i_queue),
+ outOfReads(false),
+ currentElement(NULL),
+ nextReadIndex(0),
+ done(false)
+ Read *
+ //
+ // Return the next read from the queue, or NULL once the queue has run dry. The returned pointer
+ // refers to storage inside the current queue element.
+ //
+ if (done) {
+     return NULL;
+ }
+ ReadQueueElement* doneElement = NULL;
+ if (NULL != currentElement && nextReadIndex >= currentElement->totalReads) {
+     // every read of the current element has been handed out
+     doneElement = currentElement;
+     currentElement = NULL;
+ }
+ if (NULL == currentElement) {
+     // the next element is fetched before the finished one is given back
+     currentElement = queue->getElement();
+     if (doneElement != NULL) {
+         queue->doneWithElement(doneElement);
+     }
+     if (NULL == currentElement) {
+         done = true;
+         queue->supplierFinished();
+         return NULL;
+     }
+     nextReadIndex = 0;
+ }
+ return &currentElement->reads[nextReadIndex++]; // Note the post increment.
+PairedReadSupplierFromQueue::PairedReadSupplierFromQueue(ReadSupplierQueue *i_queue, bool i_twoFiles) : // i_twoFiles selects getElements() (one element per file) over getElement() in getNextReadPair()
+ queue(i_queue), twoFiles(i_twoFiles), done(false),
+ currentElement(NULL), currentSecondElement(NULL), nextReadIndex(0) {}
+ bool
+PairedReadSupplierFromQueue::getNextReadPair(Read **read0, Read **read1)
+ //
+ // Return the next pair of reads in *read0 / *read1. Returns false (and sets both to NULL) once the
+ // queue has run dry. With two files the mates come from two parallel elements; with one file they
+ // are adjacent reads of a single element.
+ //
+ // The ids of all pairs in an element are checked once, when the element is fetched, rather than on
+ // every call.
+ //
+ if (done) {
+     *read0 = NULL;
+     *read1 = NULL;
+     return false;
+ }
+ if (NULL != currentElement && nextReadIndex >= currentElement->totalReads) {
+     queue->doneWithElement(currentElement);
+     currentElement = NULL;
+     if (twoFiles) {
+         queue->doneWithElement(currentSecondElement);
+         currentSecondElement = NULL;
+     }
+ }
+ if (NULL == currentElement) {
+     if ((twoFiles && !queue->getElements(&currentElement, &currentSecondElement)) ||
+         (!twoFiles && NULL == (currentElement = queue->getElement()))) {
+         done = true;
+         queue->supplierFinished();
+         *read0 = NULL;
+         *read1 = NULL;
+         return false;
+     }
+     nextReadIndex = 0;
+     if (twoFiles) {
+         // Assert that both elements match.
+         _ASSERT(currentSecondElement->totalReads == currentElement->totalReads);
+         for (int i = 0; i < currentElement->totalReads; i++) {
+             Read::checkIdMatch(&currentElement->reads[i], &currentSecondElement->reads[i]);
+         }
+     } else {
+         //
+         // Assert that there are an even number of reads (since they're in pairs)
+         //
+         _ASSERT(currentElement->totalReads % 2 == 0);
+         for (int i = 0; i < currentElement->totalReads; i += 2) {
+             Read::checkIdMatch(&currentElement->reads[i], &currentElement->reads[i+1]);
+         }
+     }
+ }
+ if (twoFiles) {
+     *read0 = &currentElement->reads[nextReadIndex];
+     *read1 = &currentSecondElement->reads[nextReadIndex];
+     Read::checkIdMatch(*read0, *read1);
+     nextReadIndex++;
+ } else {
+     *read0 = &currentElement->reads[nextReadIndex];
+     *read1 = &currentElement->reads[nextReadIndex+1];
+     Read::checkIdMatch(*read0, *read1);
+     nextReadIndex += 2;
+ }
+ return true;
diff --git a/SNAPLib/ReadSupplierQueue.h b/SNAPLib/ReadSupplierQueue.h
new file mode 100644
index 0000000..4f5628e
--- /dev/null
+++ b/SNAPLib/ReadSupplierQueue.h
@@ -0,0 +1,221 @@
+Module Name:
+ ReadSupplierQueue.h
+ Headers for parallel queue of reads
+ Bill Bolosky, November, 2012
+ User mode service.
+Revision History:
+#pragma once
+#include "Read.h"
+#include "Compat.h"
+#include "VariableSizeVector.h"
+#include "VariableSizeMap.h"
+using std::pair;
+class ReadSupplierFromQueue;
+class PairedReadSupplierFromQueue;
+typedef VariableSizeVector<DataBatch> BatchVector;
+//
+// One node of a doubly linked read queue.  Each element owns a BigAlloc'ed array of
+// MaxReadsPerElement Read objects plus the list of data batches those reads came from.
+//
+struct ReadQueueElement {
+    ReadQueueElement()
+        : next(NULL), prev(NULL), totalReads(0)
+    {
+        reads = (Read*) BigAlloc(MaxReadsPerElement * sizeof(Read));
+    }
+
+    ~ReadQueueElement()
+    {
+        BigDealloc(reads);
+        reads = NULL;
+    }
+
+    // note this should be about read buffer size for input reads
+#ifdef LONG_READS
+    static const int MaxReadsPerElement = 400;
+#else
+    static const int MaxReadsPerElement = 5000;
+#endif
+
+    ReadQueueElement *next;
+    ReadQueueElement *prev;
+    int totalReads;         // number of valid entries in reads[]; starts at 0 for a fresh element
+    Read* reads;
+    BatchVector batches;
+
+    //
+    // Link this element in immediately before queueHead, i.e., at the tail when queueHead is
+    // the list's head node.  queueHead->prev must be non-NULL.
+    //
+    void addToTail(ReadQueueElement *queueHead) {
+        next = queueHead;
+        prev = queueHead->prev;
+        prev->next = this;
+        next->prev = this;
+    }
+
+    //
+    // Unlink this element from whatever list it is on and clear its own links.
+    //
+    void removeFromQueue() {
+        prev->next = next;
+        next->prev = prev;
+        prev = next = NULL;
+    }
+};
+// Parallel queue that sits between reader threads (which fill ReadQueueElements) and
+// supplier threads (which drain them).  It implements both the single-ended and the
+// paired supplier-generator interfaces.
+// NOTE(review): the access specifiers (public:/private:) and the closing "};" of this class
+// are not present in this copy of the patch text.
+class ReadSupplierQueue: public ReadSupplierGenerator, public PairedReadSupplierGenerator {
+ //
+ // This queue can handle several different kinds of inputs and outputs. It will do either single
+ // ended or paired reads. In both cases, it can accept multiple independent readers (typically
+ // one per (pair of) input file(s)). For paired reads that come from pairs of input files (think
+ // FASTQ) it will run them independently and then combine the results as they're extracted. For
+ // paired reads that come from single files (SAM/BAM/CRAM, etc.) it still uses two queues internally,
+ // but they're both written by a single PairedReadReader.
+ //
+ //
+ // The version for single ended reads. This is useful for formats that can't be divided by the
+ // RangeSplitter, like BAM (though that's theoretically possible, so maybe..) It takes a set
+ // of readers (presumably for different files), each of which runs independently and in parallel.
+ //
+ ReadSupplierQueue(ReadReader *i_reader);
+ //
+ // The version for paired reads for which each end comes from a different Reader (and presumably
+ // file, think FASTQ). This is mostly useful for cases where the RangeSplitter can't handle
+ // the files, probably because they're FASTQ files with unequal length reads.
+ //
+ ReadSupplierQueue(ReadReader *i_firstHalfReader, ReadReader *i_secondHalfReader);
+ //
+ // The version for paired reads that come from a single file but for which RangeSplitter won't
+ // work (BAM, CRAM, compressed FASTQ, maybe SRA).
+ //
+ ReadSupplierQueue(PairedReadReader *pairedReader);
+ virtual ~ReadSupplierQueue();
+ bool startReaders();
+ void waitUntilFinished();
+ ReadSupplier *generateNewReadSupplier();
+ PairedReadSupplier *generateNewPairedReadSupplier();
+ ReaderContext* getContext();
+ ReadQueueElement *getElement(); // Called from the supplier threads
+ bool getElements(ReadQueueElement **element1, ReadQueueElement **element2); // Called from supplier threads
+ void doneWithElement(ReadQueueElement *element);
+ void supplierFinished();
+ void holdBatch(DataBatch batch);
+ bool releaseBatch(DataBatch batch);
+ // Number of buffers to allocate for numThreads threads: (max(numThreads, 2) + 1) elements' worth of batches.
+ static int BufferCount(int numThreads)
+ { return (__max(numThreads,2) + 1) * BatchesPerElement; }
+ static const int BatchesPerElement = 4;
+ void commonInit();
+ ReadReader *singleReader[2]; // Only [0] is filled in for single ended reads
+ PairedReadReader *pairedReader; // This is filled in iff there are no single readers
+ ReadQueueElement readyQueue[2]; // Queue [1] is used only when there are two single end readers
+ BatchTracker tracker; // track batches used in queues, use refcount per element (not per read)
+ EventObject throttle[2]; // Two throttles, one for each of the readers. At least one must be open at all times.
+ int balance; // The size of readyQueue[0] - the size of readyQueue[1]. This is used to throttle.
+ static const int MaxImbalance = 5; // Engage the throttle when |balance| > MaxImbalance
+ volatile unsigned elementSize; // reads per element, used to ensure paired single readers use same size that is ~ buffer size
+ int nReadersRunning;
+ int nSuppliersRunning;
+ volatile bool allReadsQueued;
+ ReadQueueElement* getEmptyElement(); // must hold the lock to call this
+ bool areAnyReadsReady(); // must hold the lock to call this.
+ //
+ // Empty buffers waiting for the readers.
+ //
+ ReadQueueElement emptyQueue[1];
+ //
+ // Just one lock for all of the shared objects (the queues and Waiter objects, and counts of
+ // readers and suppliers running, as well as allReadsQueued).
+ //
+ ExclusiveLock lock;
+ EventObject readsReady;
+ EventObject emptyBuffersAvailable;
+ EventObject allReadsConsumed;
+ // Argument block handed to a reader thread: the owning queue and which of the two readers it drives.
+ struct ReaderThreadParams {
+ ReadSupplierQueue *queue;
+ bool isSecondReader;
+ };
+ static void ReaderThreadMain(void *);
+ void ReaderThread(ReaderThreadParams *params);
+// A read supplier that takes its data from a ReadSupplierQueue.
+// Batch hold/release requests are forwarded unchanged to the owning queue.
+// NOTE(review): the access specifiers and the closing "};" of this class are not present
+// in this copy of the patch text.
+class ReadSupplierFromQueue: public ReadSupplier {
+ ReadSupplierFromQueue(ReadSupplierQueue *i_queue);
+ ~ReadSupplierFromQueue() {}
+ Read *getNextRead();
+ virtual void holdBatch(DataBatch batch)
+ { queue->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch)
+ { return queue->releaseBatch(batch); }
+ bool done;
+ ReadSupplierQueue *queue; // the queue this supplier draws from
+ bool outOfReads;
+ ReadQueueElement *currentElement; // presumably the element currently being handed out -- confirm in the .cpp
+ int nextReadIndex; // presumably the index into currentElement->reads of the next read to return -- confirm in the .cpp
+// A paired-read supplier that takes its data from a ReadSupplierQueue.
+// Batch hold/release requests are forwarded unchanged to the owning queue.
+// NOTE(review): the access specifiers and the closing "};" of this class are not present
+// in this copy of the patch text.
+class PairedReadSupplierFromQueue: public PairedReadSupplier {
+ PairedReadSupplierFromQueue(ReadSupplierQueue *i_queue, bool i_twoFiles);
+ ~PairedReadSupplierFromQueue();
+ bool getNextReadPair(Read **read0, Read **read1);
+ virtual void holdBatch(DataBatch batch)
+ { queue->holdBatch(batch); }
+ virtual bool releaseBatch(DataBatch batch)
+ { return queue->releaseBatch(batch); }
+ ReadSupplierQueue *queue;
+ bool done; // set once the queue reports that no more elements are available
+ bool twoFiles; // true: mates come from two parallel elements; false: mates are adjacent reads in one element
+ ReadQueueElement *currentElement;
+ ReadQueueElement *currentSecondElement; // only used when twoFiles is set
+ int nextReadIndex; // index of the next read in the current element(s); advances by 1 (twoFiles) or 2
diff --git a/SNAPLib/ReadWriter.cpp b/SNAPLib/ReadWriter.cpp
new file mode 100644
index 0000000..ec38495
--- /dev/null
+++ b/SNAPLib/ReadWriter.cpp
@@ -0,0 +1,548 @@
+Module Name:
+ ReadWriter.cpp
+ General file writer.
+ User mode service.
+ Not thread safe.
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "Read.h"
+#include "SAM.h"
+#include "Tables.h"
+#include "RangeSplitter.h"
+#include "ParallelTask.h"
+#include "Util.h"
+#include "ReadSupplierQueue.h"
+#include "FileFormat.h"
+#include "exit.h"
+#include "Error.h"
+#include "Genome.h"
+// ReadWriter implementation that formats reads with a FileFormat and pushes the bytes
+// into a single DataWriter.  The DataWriter is deleted by the destructor; the format and
+// genome pointers are only stored.
+// NOTE(review): the opening "{", access specifiers and closing "};" of this class are not
+// present in this copy of the patch text.
+class SimpleReadWriter : public ReadWriter
+ SimpleReadWriter(const FileFormat* i_format, DataWriter* i_writer, const Genome* i_genome)
+ : format(i_format), writer(i_writer), genome(i_genome)
+ {}
+ virtual ~SimpleReadWriter()
+ {
+ delete writer;
+ }
+ virtual bool writeHeader(const ReaderContext& context, bool sorted, int argc, const char **argv, const char *version, const char *rgLine, bool omitSQLines);
+ virtual bool writeReads(const ReaderContext& context, Read *read, SingleAlignmentResult *results, int nResults, bool firstIsPrimary);
+ virtual bool writePairs(const ReaderContext& context, Read **reads /* array of size 2 */, PairedAlignmentResult *result, int nResults,
+ SingleAlignmentResult **singleResults /* array of size 2*/, int *nSingleResults /* array of size 2*/, bool firstIsPrimary);
+ virtual void close();
+ const FileFormat* format;
+ DataWriter* writer; // deleted in the destructor
+ const Genome* genome;
+ LandauVishkinWithCigar lvc; // passed to format->writeRead() on every call
+//
+// Format the output header and hand it to the DataWriter.
+//
+// The header is first formatted directly into the writer's current buffer.  If it doesn't fit,
+// it is formatted into a heap buffer that is doubled until it does, and then copied out to the
+// writer one writer-buffer-sized chunk at a time.
+//
+// Returns false if the writer cannot supply a buffer, true otherwise.
+//
+    bool
+SimpleReadWriter::writeHeader(
+    const ReaderContext& context,
+    bool sorted,
+    int argc,
+    const char **argv,
+    const char *version,
+    const char *rgLine,
+    bool omitSQLines)
+{
+    char* buffer;
+    size_t size;
+    size_t used;
+    char *localBuffer = NULL;
+
+    writer->inHeader(true);
+    if (! writer->getBuffer(&buffer, &size)) {
+        return false;
+    }
+
+    char *writerBuffer = buffer;
+    size_t writerBufferSize = size;
+
+    while (!format->writeHeader(context, buffer, size, &used, sorted, argc, argv, version, rgLine, omitSQLines)) {
+        // Didn't fit: retry in a private buffer twice as large as the last attempt.
+        delete[] localBuffer;
+        size = 2 * size;
+        localBuffer = new char[size];
+        buffer = localBuffer;
+    }
+
+    if (NULL == localBuffer) {
+        // The header went straight into the writer's buffer.
+        _ASSERT(writerBuffer == buffer);
+        writer->advance((unsigned)used, 0);
+        writer->nextBatch();
+    } else {
+        // Copy the privately formatted header out in writer-buffer-sized pieces.
+        size_t bytesRemainingToWrite = used;
+        size_t bytesWritten = 0;
+        while (bytesRemainingToWrite > 0) {
+            size_t bytesToWrite = __min(bytesRemainingToWrite, writerBufferSize);
+            memcpy(writerBuffer, localBuffer + bytesWritten, bytesToWrite);
+            writer->advance(bytesToWrite);
+            writer->nextBatch();
+            if (!writer->getBuffer(&writerBuffer, &writerBufferSize)) {
+                delete[] localBuffer;   // don't leak the private buffer on the failure path
+                return false;
+            }
+            bytesWritten += bytesToWrite;
+            bytesRemainingToWrite -= bytesToWrite;
+        }
+        delete[] localBuffer;
+    }
+
+    writer->inHeader(false);
+    return true;
+}
+//
+// Write all nResults alignments of one single-ended read into the output.
+//
+// All of the alignments are formatted into the same writer buffer.  Two passes are made: the
+// first uses whatever space is left in the current buffer, the second a fresh buffer.  If the
+// alignments don't fit in a fresh buffer the run is aborted.  Results whose status is NotFound
+// get their location set to InvalidGenomeLocation; a result whose location would have to be
+// moved across a contig boundary is turned into NotFound.
+//
+// Returns true if everything was written, false if the writer couldn't supply a buffer or
+// start a new batch.
+//
+    bool
+SimpleReadWriter::writeReads(
+    const ReaderContext& context,
+    Read *read,
+    SingleAlignmentResult *results,
+    int nResults,
+    bool firstIsPrimary)
+{
+    char* buffer;
+    size_t size;
+    size_t used;
+    bool result = false;
+
+    for (int i = 0; i < nResults; i++) {
+        if (results[i].status == NotFound) {
+            results[i].location = InvalidGenomeLocation;
+        }
+    }
+
+    //
+    // We need to keep track of the offsets of all of the alignments in the output buffer so we can commit them. However,
+    // we want to avoid dynamic memory allocation as much as possible. So, we have a static buffer on the stack that's big enough
+    // for the great majority of cases, and then allocate dynamically if that's too small. Makes for annoying, but efficient
+    // code.
+    //
+    const int staticUsedBufferSize = 2000;
+    size_t staticUsedBuffer[staticUsedBufferSize];
+    GenomeLocation staticFinalLocationsBuffer[staticUsedBufferSize];
+    size_t *usedBuffer;
+    GenomeLocation *finalLocations;
+
+    if (nResults <= staticUsedBufferSize) {
+        usedBuffer = staticUsedBuffer;
+        finalLocations = staticFinalLocationsBuffer;
+    } else {
+        usedBuffer = new size_t[nResults];
+        finalLocations = new GenomeLocation[nResults];
+    }
+
+    for (int pass = 0; pass < 2; pass++) { // Make two passes, one with whatever buffer space is left and one with a clean buffer.
+        bool blewBuffer = false;
+
+        if (!writer->getBuffer(&buffer, &size)) {
+            goto done;
+        }
+
+        used = 0;
+
+        for (int whichResult = 0; whichResult < nResults; whichResult++) {
+            int addFrontClipping = 0;
+            read->setAdditionalFrontClipping(0);
+            int cumulativeAddFrontClipping = 0;
+            finalLocations[whichResult] = results[whichResult].location;
+
+            while (!format->writeRead(context, &lvc, buffer + used, size - used, &usedBuffer[whichResult], read->getIdLength(), read, results[whichResult].status,
+                results[whichResult].mapq, finalLocations[whichResult], results[whichResult].direction, (whichResult > 0) || !firstIsPrimary, &addFrontClipping)) {
+
+                if (0 == addFrontClipping) {
+                    // The formatter failed without asking for a location change: out of buffer space.
+                    blewBuffer = true;
+                    break;
+                }
+
+                // redo if read modified (e.g. to add soft clipping, or move alignment for a leading I.
+                const Genome::Contig *originalContig = results[whichResult].status == NotFound ? NULL
+                    : genome->getContigAtLocation(results[whichResult].location);
+                const Genome::Contig *newContig = results[whichResult].status == NotFound ? NULL
+                    : genome->getContigAtLocation(results[whichResult].location + addFrontClipping);
+
+                if (newContig == NULL || newContig != originalContig || finalLocations[whichResult] + addFrontClipping > originalContig->beginningLocation + originalContig->length - genome->getChromosomePadding()) {
+                    //
+                    // Altering this would push us over a contig boundary. Just give up on the read.
+                    //
+                    results[whichResult].status = NotFound;
+                    results[whichResult].location = InvalidGenomeLocation;
+                    finalLocations[whichResult] = InvalidGenomeLocation;
+                } else {
+                    cumulativeAddFrontClipping += addFrontClipping;
+                    if (addFrontClipping > 0) {
+                        read->setAdditionalFrontClipping(cumulativeAddFrontClipping);
+                    }
+                    finalLocations[whichResult] = results[whichResult].location + cumulativeAddFrontClipping;
+                }
+            } // while formatting doesn't work
+
+            if (blewBuffer) {
+                break;
+            }
+
+            used += usedBuffer[whichResult];
+            _ASSERT(used <= size);
+
+            if (used > 0xffffffff) {
+                WriteErrorMessage("SimpleReadWriter:writeReads: used too big\n");
+                soft_exit(1);
+            }
+        } // for each result.
+
+        if (!blewBuffer) {
+            //
+            // Everything worked OK.
+            //
+            for (int whichResult = 0; whichResult < nResults; whichResult++) {
+                writer->advance((unsigned)usedBuffer[whichResult], finalLocations[whichResult]);
+            }
+            result = true;
+            goto done;
+        }
+
+        if (pass == 1) {
+            WriteErrorMessage("Failed to write into fresh buffer; trying providing the -wbs switch with a larger value\n");
+            soft_exit(1);
+        }
+
+        if (!writer->nextBatch()) {
+            goto done;
+        }
+    } // for each pass (i.e., not empty, empty buffer)
+
+done:
+    // Common exit: free the bookkeeping arrays if they were heap allocated and undo any clipping we added.
+    if (usedBuffer != staticUsedBuffer) {
+        delete[] usedBuffer;
+        usedBuffer = NULL;
+        delete[] finalLocations;
+        finalLocations = NULL;
+    }
+
+    read->setAdditionalFrontClipping(0);
+    return result;
+}
+//
+// Write all alignments for one read pair: first the nResults paired alignments, then the
+// nSingleResults[i] single-end alignments of each read.
+//
+// Everything for the pair goes into one writer buffer.  Two passes are made: the first into
+// whatever is left of the current buffer, the second into a fresh one; if the second pass also
+// runs out of space the run is aborted with a message asking for a bigger write buffer.
+// A read whose location would have to be moved across a contig boundary is turned into NotFound.
+//
+// Returns true if everything was written, false if the writer couldn't supply a buffer or
+// start a new batch.
+//
+    bool
+SimpleReadWriter::writePairs(
+    const ReaderContext& context,
+    Read **reads /* array of size NUM_READS_PER_PAIR */,
+    PairedAlignmentResult *result,
+    int nResults,
+    SingleAlignmentResult **singleResults /* array of size NUM_READS_PER_PAIR*/,
+    int *nSingleResults /* array of size NUM_READS_PER_PAIR*/,
+    bool firstIsPrimary)
+{
+    bool retVal = false;
+    //
+    // We need to write all alignments for the pair into the same buffer, so that a write from
+    // some other thread doesn't separate them. We make two passes, trying to write into the
+    // existing buffer, and then into a clean one. If that doesn't work, abort the alignment
+    // run and ask for a bigger write buffer.
+    //
+    const int staticUsedBufferSize = 2000;
+    size_t staticUsedBuffer[NUM_READS_PER_PAIR][staticUsedBufferSize];
+    GenomeLocation staticLocationBuffer[NUM_READS_PER_PAIR][staticUsedBufferSize];
+    GenomeLocation *finalLocations[NUM_READS_PER_PAIR];
+    size_t *usedBuffer[NUM_READS_PER_PAIR];
+
+    if (nResults + nSingleResults[0] <= staticUsedBufferSize && nResults + nSingleResults[1] <= staticUsedBufferSize) {
+        usedBuffer[0] = staticUsedBuffer[0];
+        usedBuffer[1] = staticUsedBuffer[1];
+        finalLocations[0] = staticLocationBuffer[0];
+        finalLocations[1] = staticLocationBuffer[1];
+    } else {
+        // One allocation per kind; the second read's slots follow the first read's.
+        usedBuffer[0] = new size_t[nResults * NUM_READS_PER_PAIR + nSingleResults[0] + nSingleResults[1]];
+        usedBuffer[1] = usedBuffer[0] + nResults + nSingleResults[0];
+        finalLocations[0] = new GenomeLocation[nResults * NUM_READS_PER_PAIR + nSingleResults[0] + nSingleResults[1]];
+        finalLocations[1] = finalLocations[0] + nResults + nSingleResults[0];
+    }
+
+    //
+    // For paired reads, we need to have the same QNAME for both of them, and it needs to be unique among all other
+    // reads in the dataset. For now, all we do is see if the read names end in /1 and /2, and if so truncate them.
+    //
+    size_t idLengths[NUM_READS_PER_PAIR];
+    idLengths[0] = reads[0]->getIdLength();
+    idLengths[1] = reads[1]->getIdLength();
+    if (idLengths[0] == idLengths[1] && idLengths[0] > 2 && reads[0]->getId()[idLengths[0]-2] == '/' && reads[1]->getId()[idLengths[0]-2] == '/') {
+        char lastChar0, lastChar1;
+        lastChar0 = reads[0]->getId()[idLengths[0] - 1];
+        lastChar1 = reads[1]->getId()[idLengths[1] - 1];
+        if ((lastChar0 == '1' || lastChar0 == '2') && (lastChar1 == '1' || lastChar1 == '2') &&
+            lastChar0 != lastChar1) {
+            idLengths[0] -= 2;
+            idLengths[1] -= 2;
+        }
+    }
+
+    for (int pass = 0; pass < 2; pass++) {
+        char* buffer;
+        size_t size;
+        size_t used = 0;
+        bool fitInBuffer = true;
+
+        if (!writer->getBuffer(&buffer, &size)) {
+            goto done;
+        }
+
+        //
+        // Write all of the pair alignments into the buffer.
+        //
+        for (int whichAlignmentPair = 0; whichAlignmentPair < nResults; whichAlignmentPair++) {
+            reads[0]->setAdditionalFrontClipping(0);
+            reads[1]->setAdditionalFrontClipping(0);
+
+            GenomeLocation locations[2];
+            locations[0] = result[whichAlignmentPair].status[0] != NotFound ? result[whichAlignmentPair].location[0] : InvalidGenomeLocation;
+            locations[1] = result[whichAlignmentPair].status[1] != NotFound ? result[whichAlignmentPair].location[1] : InvalidGenomeLocation;
+
+            int writeOrder[2]; // The order in which we write the reads, which is just numerical by genome location. SO writeOrder[0] gets written first, and writeOrder[1] second.
+            if (locations[0] <= locations[1]) {
+                writeOrder[0] = 0;
+                writeOrder[1] = 1;
+            } else {
+                writeOrder[0] = 1;
+                writeOrder[1] = 0;
+            }
+
+            bool secondReadLocationChanged;
+            int cumulativePositiveAddFrontClipping[NUM_READS_PER_PAIR] = { 0, 0 };
+
+            do {
+                size_t tentativeUsed = 0;
+                secondReadLocationChanged = false;
+
+                for (int firstOrSecond = 0; firstOrSecond < NUM_READS_PER_PAIR; firstOrSecond++) { // looping over the order in which the reads are written, not the order in which they arrived
+                    int whichRead = writeOrder[firstOrSecond];
+                    //
+                    // Loop until we get a write with no additional front clipping.
+                    //
+                    int addFrontClipping = 0;
+
+                    while (!format->writeRead(context, &lvc, buffer + used + tentativeUsed, size - used - tentativeUsed, &usedBuffer[firstOrSecond][whichAlignmentPair],
+                        idLengths[whichRead], reads[whichRead], result[whichAlignmentPair].status[whichRead], result[whichAlignmentPair].mapq[whichRead], locations[whichRead], result[whichAlignmentPair].direction[whichRead],
+                        whichAlignmentPair != 0 || !firstIsPrimary, &addFrontClipping, true, writeOrder[firstOrSecond] == 0,
+                        reads[1 - whichRead], result[whichAlignmentPair].status[1 - whichRead], locations[1 - whichRead], result[whichAlignmentPair].direction[1 - whichRead],
+                        result[whichAlignmentPair].alignedAsPair)) {
+
+                        if (0 == addFrontClipping || locations[whichRead] == InvalidGenomeLocation) {
+                            //
+                            // We failed because we ran out of buffer.
+                            //
+                            goto blownBuffer;
+                        }
+
+                        if (1 == firstOrSecond) {
+                            //
+                            // If the location of the second read changed, we need to redo the first one as well, because it includes an offset to the second read
+                            //
+                            secondReadLocationChanged = true;
+                        }
+
+                        const Genome::Contig *originalContig = genome->getContigAtLocation(locations[whichRead]);
+                        const Genome::Contig *newContig = genome->getContigAtLocation(locations[whichRead] + addFrontClipping);
+                        if (newContig != originalContig || NULL == newContig || locations[whichRead] + addFrontClipping > originalContig->beginningLocation + originalContig->length - genome->getChromosomePadding()) {
+                            //
+                            // Altering this would push us over a contig boundary. Just give up on the read.
+                            //
+                            result[whichAlignmentPair].status[whichRead] = NotFound;
+                            result[whichAlignmentPair].location[whichRead] = InvalidGenomeLocation;
+                            locations[whichRead] = InvalidGenomeLocation;
+                        } else {
+                            if (addFrontClipping > 0) {
+                                cumulativePositiveAddFrontClipping[firstOrSecond] += addFrontClipping;
+                                reads[whichRead]->setAdditionalFrontClipping(cumulativePositiveAddFrontClipping[firstOrSecond]);
+                            }
+                            locations[whichRead] += addFrontClipping;
+                        }
+                    } // While formatting didn't work
+
+                    tentativeUsed += usedBuffer[firstOrSecond][whichAlignmentPair];
+                } // for first or second read
+            } while (secondReadLocationChanged);
+
+            used += usedBuffer[0][whichAlignmentPair] + usedBuffer[1][whichAlignmentPair];
+
+            //
+            // Both reads are written into the buffer. Save the final locations we used for when we commit.
+            //
+            for (int whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+                finalLocations[whichRead][whichAlignmentPair] = locations[whichRead];
+            }
+        } // for each pair.
+
+        //
+        // Now write the single alignments.
+        //
+        for (int whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+            for (int whichAlignment = 0; whichAlignment < nSingleResults[whichRead]; whichAlignment++) {
+                // Start at 0 (as the other write loops do) so a failed write that doesn't touch
+                // it is recognized as "out of buffer" rather than reading an indeterminate value.
+                int addFrontClipping = 0;
+                reads[whichRead]->setAdditionalFrontClipping(0);
+                GenomeLocation location = singleResults[whichRead][whichAlignment].status != NotFound ? singleResults[whichRead][whichAlignment].location : InvalidGenomeLocation;
+                int cumulativePositiveAddFrontClipping = 0;
+
+                while (!format->writeRead(context, &lvc, buffer + used, size - used, &usedBuffer[whichRead][nResults + whichAlignment], reads[whichRead]->getIdLength(),
+                    reads[whichRead], singleResults[whichRead][whichAlignment].status, singleResults[whichRead][whichAlignment].mapq, location, singleResults[whichRead][whichAlignment].direction,
+                    true, &addFrontClipping)) {
+
+                    if (0 == addFrontClipping) {
+                        goto blownBuffer;
+                    }
+
+                    const Genome::Contig *originalContig = genome->getContigAtLocation(location);
+                    const Genome::Contig *newContig = genome->getContigAtLocation(location + addFrontClipping);
+                    if (newContig != originalContig || NULL == newContig || location + addFrontClipping > originalContig->beginningLocation + originalContig->length - genome->getChromosomePadding()) {
+                        //
+                        // Altering this would push us over a contig boundary. Just give up on the read.
+                        //
+                        singleResults[whichRead][whichAlignment].status = NotFound;
+                        location = InvalidGenomeLocation;
+                    } else {
+                        if (addFrontClipping > 0) {
+                            cumulativePositiveAddFrontClipping += addFrontClipping;
+                            reads[whichRead]->setAdditionalFrontClipping(cumulativePositiveAddFrontClipping);
+                        }
+                        location += addFrontClipping;
+                    }
+                }
+
+                finalLocations[whichRead][nResults + whichAlignment] = location;
+                used += usedBuffer[whichRead][nResults + whichAlignment];
+            } // For each single alignment of a read
+        } // For each read
+
+        //
+        // They all fit into the buffer.
+        //
+
+        //
+        // Commit the updates for the pairs.
+        //
+        for (int whichReadPair = 0; whichReadPair < nResults; whichReadPair++) {
+            for (int firstOrSecond = 0; firstOrSecond < NUM_READS_PER_PAIR; firstOrSecond++) {
+                // adjust for write order
+                int writeFirstOrSecond = (!!firstOrSecond) ^ (finalLocations[0][whichReadPair] > finalLocations[1][whichReadPair]); // goofy looking !! converts int to bool
+                writer->advance((unsigned)usedBuffer[firstOrSecond][whichReadPair],
+                    finalLocations[writeFirstOrSecond][whichReadPair] == InvalidGenomeLocation ? finalLocations[1 - writeFirstOrSecond][whichReadPair] : finalLocations[writeFirstOrSecond][whichReadPair]);
+            }
+        }
+
+        //
+        // Now commit the updates for the single reads.
+        //
+        for (int whichRead = 0; whichRead < NUM_READS_PER_PAIR; whichRead++) {
+            for (int whichAlignment = 0; whichAlignment < nSingleResults[whichRead]; whichAlignment++) {
+                writer->advance((unsigned)usedBuffer[whichRead][nResults + whichAlignment], finalLocations[whichRead][nResults + whichAlignment]);
+            }
+        }
+
+        retVal = true;
+        break;
+
+blownBuffer:
+        // Ran out of space.  On the first pass move to a fresh buffer and retry; on the second, give up.
+        if (pass > 0) {
+            WriteErrorMessage("Unable to fit all alignments for one read pair into a single write buffer. Increase the size of the write buffer with -wbs, or reduce the number of alignments with -om or -omax\n");
+            WriteErrorMessage("Read id: '%.*s'\n", reads[0]->getIdLength(), reads[0]->getId());
+            soft_exit(1);
+        }
+
+        if (!writer->nextBatch()) {
+            goto done;
+        }
+    } // For each buffer full pass
+
+done:
+    // Common exit: free the bookkeeping arrays if they were heap allocated and undo any clipping we added.
+    if (usedBuffer[0] != staticUsedBuffer[0]) {
+        delete[] usedBuffer[0];
+        usedBuffer[0] = usedBuffer[1] = NULL;
+        delete[] finalLocations[0];
+        finalLocations[0] = finalLocations[1] = NULL;
+    }
+
+    reads[0]->setAdditionalFrontClipping(0);
+    reads[1]->setAdditionalFrontClipping(0);
+    return retVal;
+}
+//
+// Close the underlying DataWriter.
+//
+    void
+SimpleReadWriter::close()
+{
+    writer->close();
+}
+// ReadWriterSupplier that hands out SimpleReadWriters, each wrapping a DataWriter obtained
+// from the DataWriterSupplier.  The DataWriterSupplier is deleted by the destructor; the format
+// and genome pointers are only stored.
+// NOTE(review): the opening "{", access specifiers and closing "};" of this class are not
+// present in this copy of the patch text.
+class SimpleReadWriterSupplier : public ReadWriterSupplier
+ SimpleReadWriterSupplier(const FileFormat* i_format, DataWriterSupplier* i_dataSupplier, const Genome* i_genome)
+ :
+ format(i_format),
+ dataSupplier(i_dataSupplier),
+ genome(i_genome)
+ {}
+ ~SimpleReadWriterSupplier()
+ {
+ delete dataSupplier;
+ }
+ // Returns a newly allocated SimpleReadWriter around a DataWriter from dataSupplier.
+ virtual ReadWriter* getWriter()
+ {
+ return new SimpleReadWriter(format, dataSupplier->getWriter(), genome);
+ }
+ // Closes the underlying DataWriterSupplier.
+ virtual void close()
+ {
+ dataSupplier->close();
+ }
+ const FileFormat* format;
+ DataWriterSupplier* dataSupplier; // deleted in the destructor
+ const Genome* genome;
+// Factory: wraps the given format, data-writer supplier and genome in a new
+// SimpleReadWriterSupplier and returns it as a ReadWriterSupplier*.
+// NOTE(review): the line carrying the qualified function name and the enclosing braces are
+// not present in this copy of the patch text; presumably this is a static create() on
+// ReadWriterSupplier -- confirm against the header.
+ ReadWriterSupplier*
+ const FileFormat* format,
+ DataWriterSupplier* dataSupplier,
+ const Genome* genome)
+ return new SimpleReadWriterSupplier(format, dataSupplier, genome);
diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp
new file mode 100644
index 0000000..073cf93
--- /dev/null
+++ b/SNAPLib/SAM.cpp
@@ -0,0 +1,1721 @@
+Module Name:
+ SAM.cpp
+ Sequence Alignment Map (SAM) file writer and reader.
+ User mode service.
+ SamWriter and SamReader (and their subclasses) aren't thread safe.
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "Read.h"
+#include "SAM.h"
+#include "Bam.h"
+#include "Tables.h"
+#include "RangeSplitter.h"
+#include "ParallelTask.h"
+#include "Util.h"
+#include "ReadSupplierQueue.h"
+#include "FileFormat.h"
+#include "AlignerOptions.h"
+#include "directions.h"
+#include "exit.h"
+using std::max;
+using std::min;
+using util::strnchr;
+//
+// Compare two NUL-terminated read IDs.  They match if they are identical up to and including
+// the first NUL, space or slash; anything after a space or slash is treated as metadata
+// (e.g., which half of the mate pair the read is) and ignored.
+//
+bool readIdsMatch(const char* id0, const char* id1)
+{
+    for (unsigned i = 0; ; i++) {
+        char c0 = id0[i];
+        char c1 = id1[i];
+
+        if (c0 != c1) return false;
+
+        // don't parse the read ID after the first space or slash, which can represent metadata (or which half of the mate pair the read is).
+        if (c0 == 0 || c0 == ' ' || c0 == '/') return true;
+    }
+    return true;    // not reached; the loop only exits via return
+}
+//
+// Compare the IDs of two Reads.  The IDs must have the same length and be identical up to and
+// including the first space or slash (or up to the end if there is none); anything after a
+// space or slash is treated as metadata and ignored.
+//
+bool readIdsMatch(Read *read0, Read *read1)
+{
+    if (read0->getIdLength() != read1->getIdLength()) {
+        return false;
+    }
+
+    // Fetch the accessors once instead of on every iteration.
+    const unsigned idLength = read0->getIdLength();
+    const char *id0 = read0->getId();
+    const char *id1 = read1->getId();
+
+    for (unsigned i = 0; i < idLength; i++) {
+        char c0 = id0[i];
+        char c1 = id1[i];
+
+        if (c0 != c1) return false;
+
+        // don't parse the read ID after the first space or slash, which can represent metadata (or which half of the mate pair the read is).
+        if (c0 == ' ' || c0 == '/') return true;
+    }
+    return true;
+}
+//
+// Like strnchr, but looks for either of two characters.  Scans at most maxLen characters of
+// str and returns a pointer to the first occurrence of charToFind or charToFind2, or NULL if a
+// NUL is reached (or maxLen characters are examined) first.
+//
+    char *
+strnchrs(char *str, char charToFind, char charToFind2, size_t maxLen) // Hokey version that looks for either of two chars
+{
+    for (size_t i = 0; i < maxLen; i++) {
+        if (str[i] == charToFind || str[i] == charToFind2) {
+            return str + i;
+        }
+        if (str[i] == 0) {
+            return NULL;
+        }
+    }
+    return NULL;
+}
+//
+// Advance from str past the current field and the tab(s)/CR(s) that follow it.
+//
+// str                         start of the current field (NULL is passed through as NULL)
+// endOfBuffer                 one past the last character that may be examined
+// o_charsUntilFirstSeparator  if non-NULL, receives the length of the current field
+//
+// Returns a pointer to the first character of the next field, or NULL if the current field is
+// terminated by a newline or by the end of the buffer.
+//
+    char *
+SAMReader::skipToBeyondNextFieldSeparator(char *str, const char *endOfBuffer, size_t *o_charsUntilFirstSeparator)
+{
+    if (NULL == str) return NULL;
+
+    char *nextChar = str;
+    while (nextChar < endOfBuffer && *nextChar != '\n' && *nextChar != '\t' && *nextChar != '\r' /* for Windows CRLF text */) {
+        nextChar++;
+    }
+
+    if (NULL != o_charsUntilFirstSeparator) {
+        *o_charsUntilFirstSeparator = nextChar - str;
+    }
+
+    if (nextChar >= endOfBuffer || *nextChar == '\n') {
+        return NULL;
+    }
+
+    // Skip the separator run itself.
+    while (nextChar < endOfBuffer && ('\t' == *nextChar || '\r' == *nextChar)) {
+        nextChar++;
+    }
+
+    if (nextChar >= endOfBuffer) {
+        return NULL;
+    }
+
+    return nextChar;
+}
+// Factory: obtains a DataReader from the supplier, wraps it in a new SAMReader, initializes
+// the reader on the given file range and returns it.
+// NOTE(review): the line carrying the qualified function name and the enclosing braces are
+// not present in this copy of the patch text; presumably this is SAMReader::create -- confirm
+// against SAM.h.  maxLineLen is defined outside this block.
+ SAMReader *
+ DataSupplier* supplier,
+ const char *fileName,
+ int bufferCount,
+ const ReaderContext& context,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess)
+ DataReader* data = supplier->getDataReader(bufferCount, maxLineLen, 0.0, 0);
+ SAMReader *reader = new SAMReader(data, context);
+ reader->init(fileName, startingOffset, amountOfFileToProcess);
+ return reader;
+//
+// Read and parse the SAM header of fileName, then store a NUL-terminated copy of it in the
+// reader context.
+//
+// The header is read with a buffer that starts at 1MB and is doubled until parseHeader()
+// reports that it saw the whole header.  If a read returns no more data than the previous one
+// (end of file), the function returns without touching the context.  A parse failure aborts
+// the run.
+//
+    void
+SAMReader::readHeader(const char *fileName)
+{
+    // todo: allow for larger headers
+    _int64 headerSize = 512 * 1024; // 1M header initially (it's doubled before we use it)
+    _int64 oldHeaderSize = 0;
+    char* buffer;
+    bool sawWholeHeader;
+
+    do {
+        headerSize *= 2;
+        buffer = data->readHeader(&headerSize);
+
+        if (oldHeaderSize >= headerSize) {
+            //
+            // No new data, we hit EOF
+            //
+            return;
+        }
+        oldHeaderSize = headerSize;
+
+        if (!parseHeader(fileName, buffer, buffer + headerSize, context.genome, &headerSize, &context.headerMatchesIndex, &sawWholeHeader)) {
+            WriteErrorMessage("SAMReader: failed to parse header on '%s'\n", fileName);
+            soft_exit(1);
+        }
+    } while (!sawWholeHeader);
+
+    // headerSize now holds the parsed header length; keep a private NUL-terminated copy.
+    _ASSERT(context.header == NULL);
+    char* p = new char[headerSize + 1];
+    memcpy(p, buffer, headerSize);
+    p[headerSize] = 0;
+    context.header = p;
+    context.headerBytes = context.headerLength = headerSize;
+}
+//
+// Construct a SAMReader over an already created DataReader.  The header size starts out as -1
+// and the clipping mode is taken from the reader context.
+//
+SAMReader::SAMReader(
+    DataReader* i_data,
+    const ReaderContext& i_context)
+    : ReadReader(i_context), data(i_data), headerSize(-1), clipping(i_context.clipping)
+{
+}
+//
+// Implement the ReadReader form of getNextRead, which doesn't include the
+// alignment results, by simply throwing them away: every optional output of the
+// full overload is passed as NULL.
+//
+    bool
+SAMReader::getNextRead(Read *readToUpdate)
+{
+    return getNextRead(readToUpdate, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+//
+// Parse the '@' header lines at the start of a SAM file.
+//
+// fileName              used only in error messages
+// firstLine             start of the buffer holding the header
+// endOfBuffer           one past the last valid byte of that buffer
+// genome                reference to check @SQ contig names against; may be NULL
+// o_headerSize          receives the number of bytes of header consumed
+// o_headerMatchesIndex  receives true iff every @SQ contig is in genome and the number of @SQ
+//                       lines equals the genome's contig count
+// o_sawWholeHeader      if non-NULL, receives false when the buffer ended before the header did
+//
+// Returns false (after writing an error message) on a malformed or unrecognized header line,
+// true otherwise -- including when the header is merely incomplete.
+//
+    bool
+SAMReader::parseHeader(
+    const char *fileName,
+    char *firstLine,
+    char *endOfBuffer,
+    const Genome *genome,
+    _int64 *o_headerSize,
+    bool *o_headerMatchesIndex,
+    bool *o_sawWholeHeader)
+{
+    char *nextLineToProcess = firstLine;
+    *o_headerMatchesIndex = true;
+    int numSQLines = 0;
+
+    while (NULL != nextLineToProcess && nextLineToProcess < endOfBuffer && '@' == *nextLineToProcess) {
+        //
+        // Make sure we have the complete line.
+        //
+        bool foundCompleteLine = false;
+        for (char *c = nextLineToProcess; c < endOfBuffer; c++) {
+            if (*c == '\n') {
+                foundCompleteLine = true;
+                break;
+            }
+        }
+
+        if (!foundCompleteLine) {
+            // o_sawWholeHeader is optional (see the check at the end of this function).
+            if (NULL != o_sawWholeHeader) {
+                *o_sawWholeHeader = false;
+            }
+            return true; // Parsed OK, but incomplete
+        }
+
+        if (!strncmp("@SQ",nextLineToProcess,3)) {
+            //
+            // These lines represent sequences in the reference genome, what are
+            // called "contigs" in the Genome class. (Roughly, chromosomes or major
+            // variants like some versions of the MHC genes on chr6; or more
+            // particularly the things that come in different FASTA files from the
+            // reference assembly).
+            //
+            // Verify that they actually match what's in our reference genome.
+            //
+            numSQLines++;
+            if (nextLineToProcess + 3 >= endOfBuffer || (' ' != nextLineToProcess[3] && '\t' != nextLineToProcess[3])) {
+                WriteErrorMessage("Malformed SAM file '%s' has @SQ without a following space or tab.\n",fileName);
+                return false;
+            }
+
+            char *snStart = nextLineToProcess + 4;
+            while (snStart < endOfBuffer && strncmp(snStart,"SN:",__min(3,endOfBuffer-snStart)) && *snStart != '\n' && *snStart != 0) {
+                snStart++;
+            }
+
+            if (snStart >= endOfBuffer || *snStart == '\n' || *snStart == 0) {
+                WriteErrorMessage("Malformed @SQ line doesn't have 'SN:' in file '%s'\n",fileName);
+                return false;
+            }
+
+            //
+            // Copy the contig name (everything after "SN:" up to the next space, tab, newline or
+            // NUL) into a NUL-terminated buffer, truncating it if it is too long.  The terminator
+            // is always written right after the last copied character, so the buffer is well
+            // formed even if we stop at endOfBuffer.
+            //
+            const size_t contigNameBufferSize = 512;
+            char contigName[contigNameBufferSize];
+            const char *name = snStart + 3;
+            size_t nameLength = 0;
+            while (nameLength < contigNameBufferSize - 1 && name + nameLength < endOfBuffer) {
+                char c = name[nameLength];
+                if (c == ' ' || c == '\t' || c == '\n' || c == 0) {
+                    break;
+                }
+                contigName[nameLength++] = c;
+            }
+            contigName[nameLength] = '\0';
+
+            if (genome == NULL || !genome->getLocationOfContig(contigName, NULL)) {
+                *o_headerMatchesIndex = false;
+            }
+        } else if (!strncmp("@HD",nextLineToProcess,3) || !strncmp("@RG",nextLineToProcess,3) || !strncmp("@PG",nextLineToProcess,3) ||
+            !strncmp("@CO",nextLineToProcess,3)) {
+            //
+            // Ignore these lines.
+            //
+        } else {
+            WriteErrorMessage("Unrecognized header line in SAM file.\n");
+            return false;
+        }
+
+        char * p = strnchr(nextLineToProcess,'\n',endOfBuffer-nextLineToProcess);
+        if (p == NULL) {
+            // no newline, look for null to truncate buffer
+            p = (char*) memchr(nextLineToProcess, 0, endOfBuffer - nextLineToProcess);
+            nextLineToProcess = p != NULL ? p + 1 : endOfBuffer;
+            break;
+        }
+        nextLineToProcess = p + 1;
+    }
+
+    *o_headerMatchesIndex &= genome != NULL && numSQLines == genome->getNumContigs();
+    *o_headerSize = nextLineToProcess - firstLine;
+    if (NULL != o_sawWholeHeader) {
+        // We saw the whole header iff something other than a header line follows it in the buffer.
+        *o_sawWholeHeader = nextLineToProcess < endOfBuffer;
+    }
+    return true;
+}
+//
+// Split one SAM alignment line into its fields.
+//
+// line          start of the line
+// endOfBuffer   one past the last valid byte of the buffer holding it
+// result        receives a pointer to the start of each of the nSAMFields fields; result[OPT] is
+//               NULL when the line has no optional fields
+// linelength    receives the length of the line including its newline (0 on failure)
+// fieldLengths  receives the length of each field; the OPT entry covers everything from the
+//               first optional field to the end of the line (0 when there are none)
+//
+// Returns false if no newline is found before endOfBuffer or if a mandatory field is missing.
+//
+    bool
+SAMReader::parseLine(char *line, char *endOfBuffer, char *result[], size_t *linelength, size_t fieldLengths[])
+{
+    *linelength = 0;
+
+    char *next = line;
+    char *endOfLine = strnchr(line,'\n',endOfBuffer-line);
+    if (NULL == endOfLine) {
+        return false;
+    }
+
+    //
+    // Skip over any leading spaces and tabs
+    //
+    while (next < endOfLine && (*next == ' ' || *next == '\t')) {
+        next++;
+    }
+
+    for (unsigned i = 0; i < nSAMFields; i++) {
+        if (NULL == next || next >= endOfLine) {
+            if (i == OPT) {
+                // no optional fields
+                result[OPT] = NULL;
+                fieldLengths[OPT] = 0;  // don't leave the caller's length entry indeterminate
+                break;
+            } else {
+                //
+                // Too few fields.
+                //
+                return false;
+            }
+        }
+
+        result[i] = next;
+        if (i == OPT) {
+            // OPT field is actually all fields until end of line
+            fieldLengths[OPT] = endOfLine - next;
+            break;
+        }
+
+        next = skipToBeyondNextFieldSeparator(next,endOfLine,&fieldLengths[i]);
+    }
+
+    *linelength = endOfLine - line + 1; // +1 skips over the \n
+    return true;
+}
+ //
+ // Parse one SAM line and hand back whichever pieces the caller asked for.
+ //
+ // genome - used to resolve RNAME to a genome location (may be NULL, see parseContigName)
+ // line / endOfBuffer - the text to parse and one past its last readable byte
+ // read - if non-NULL, initialized from the line (then reverse-complemented, clipped and
+ // given its auxiliary data / read group as described below)
+ // alignmentResult - if non-NULL, NotFound when the unmapped flag is set, otherwise SingleHit
+ // out_genomeLocation - if non-NULL, the location computed from RNAME and POS
+ // direction - if non-NULL, RC or FORWARD according to the reverse-complement flag
+ // mapQ - if non-NULL, the MAPQ value (values above 255 are fatal)
+ // lineLength - receives the length of the line including its newline; must not be NULL
+ // flag - if non-NULL, the numeric FLAG field
+ // cigar - if non-NULL, a pointer to the CIGAR field inside the line (not null terminated)
+ // clipping - passed to Read::clip()
+ //
+ // Malformed input is fatal: an error is written and soft_exit(1) is called.
+ //
+ void
+ const Genome *genome,
+ char *line,
+ char *endOfBuffer,
+ Read *read,
+ AlignmentResult *alignmentResult,
+ GenomeLocation *out_genomeLocation,
+ Direction *direction,
+ unsigned *mapQ,
+ size_t *lineLength,
+ unsigned * flag,
+ const char ** cigar,
+ ReadClippingType clipping
+ )
+ char *field[nSAMFields];
+ size_t fieldLength[nSAMFields];
+ if (!parseLine(line, endOfBuffer, field, lineLength, fieldLength)) {
+ //
+ // parseLine() leaves *lineLength at 0 when it fails, and %.*s needs an int precision (not a
+ // size_t pointer), so show a bounded excerpt of whatever is left in the buffer instead.
+ //
+ _int64 excerptLength = endOfBuffer - line;
+ if (excerptLength > 200) {
+ excerptLength = 200;
+ }
+ if (excerptLength < 0) {
+ excerptLength = 0;
+ }
+ WriteErrorMessage( "Failed to parse SAM line:\n%.*s\n", (int) excerptLength, line);
+ soft_exit(1);
+ }
+ //
+ // We have to copy the contig name (RNAME) into its own buffer because the code in Genome expects
+ // it to be a null-terminated string, while all we've got is one that's space delimited.
+ //
+ const size_t contigNameBufferSize = 512;
+ char contigName[contigNameBufferSize];
+ GenomeLocation locationOfContig;
+ parseContigName(genome, contigName, contigNameBufferSize, &locationOfContig, NULL, field, fieldLength);
+ GenomeLocation genomeLocation = parseLocation(locationOfContig, field, fieldLength);
+ if (NULL != out_genomeLocation) {
+ *out_genomeLocation = genomeLocation;
+ }
+ if (fieldLength[SEQ] != fieldLength[QUAL]) {
+ WriteErrorMessage("SAMReader: QUAL string unequal in length to SEQ string.\n");
+ soft_exit(1);
+ }
+ //
+ // Copy FLAG into a small null-terminated buffer before scanning it, so sscanf never runs
+ // past the end of the field.
+ //
+ unsigned _flag;
+ const size_t flagBufferSize = 20; // More than enough
+ char flagBuffer[flagBufferSize];
+ if (fieldLength[FLAG] >= flagBufferSize) {
+ WriteErrorMessage("SAMReader: flag field is too long.\n");
+ soft_exit(1);
+ }
+ memcpy(flagBuffer,field[FLAG],fieldLength[FLAG]);
+ flagBuffer[fieldLength[FLAG]] = '\0';
+ if (1 != sscanf(flagBuffer,"%u",&_flag)) { // %u matches the unsigned destination
+ WriteErrorMessage("SAMReader: couldn't parse FLAG field.\n");
+ soft_exit(1);
+ }
+ if (NULL != read) {
+ //
+ // Clip reads where the quality strings end in '#'
+ //
+ unsigned originalFrontClipping, originalBackClipping, originalFrontHardClipping, originalBackHardClipping;
+ Read::computeClippingFromCigar(field[CIGAR], &originalFrontClipping, &originalBackClipping, &originalFrontHardClipping, &originalBackHardClipping);
+ unsigned pnext = atoi(field[PNEXT]); // Relies on atoi() returning 0 for non-numeric fields (i.e., *)
+ read->init(field[QNAME],(unsigned)fieldLength[QNAME],field[SEQ],field[QUAL],(unsigned)fieldLength[SEQ], genomeLocation, atoi(field[MAPQ]), _flag,
+ originalFrontClipping, originalBackClipping, originalFrontHardClipping, originalBackHardClipping, field[RNEXT], (unsigned)fieldLength[RNEXT], pnext);
+ //
+ // If this read is RC in the SAM file, we need to reverse it here, since Reads are always the sense that they were as they came
+ // out of the base caller.
+ //
+ // NOTE(review): the guard below is missing from this copy of the patch (its closing brace is
+ // still present); it is restored from the comment above - confirm against upstream.
+ //
+ if (_flag & SAM_REVERSE_COMPLEMENT) {
+ read->becomeRC();
+ }
+ read->clip(clipping);
+ if (field[OPT] != NULL) {
+ // Hand the optional fields to the read, minus any trailing line terminators.
+ unsigned n = (unsigned) fieldLength[OPT];
+ while (n > 0 && (field[OPT][n-1] == '\n' || field[OPT][n-1] == '\r')) {
+ n--;
+ }
+ read->setAuxiliaryData(field[OPT], n);
+ // If any optional field is an RG:Z: tag, mark the read group as coming from the aux data.
+ for (char* p = field[OPT]; p != NULL && p < field[OPT] + fieldLength[OPT]; p = SAMReader::skipToBeyondNextFieldSeparator(p, field[OPT] + fieldLength[OPT])) {
+ if (strncmp(p, "RG:Z:", 5) == 0) {
+ read->setReadGroup(READ_GROUP_FROM_AUX);
+ break;
+ }
+ }
+ }
+ }
+ if (NULL != alignmentResult) {
+ if (_flag & SAM_UNMAPPED) {
+ *alignmentResult = NotFound;
+ } else {
+ if ('*' == contigName[0]) {
+ WriteErrorMessage("SAMReader: mapped read didn't have RNAME filled in.\n");
+ soft_exit(1);
+ }
+ *alignmentResult = SingleHit; // NB: This isn't quite right, we should look at MAPQ.
+ }
+ }
+ if (NULL != direction) {
+ *direction = (_flag & SAM_REVERSE_COMPLEMENT) ? RC : FORWARD;
+ }
+ if (NULL != mapQ) {
+ *mapQ = atoi(field[MAPQ]);
+ if (*mapQ > 255) {
+ WriteErrorMessage("SAMReader: MAPQ field has bogus value\n");
+ soft_exit(1);
+ }
+ }
+ if (NULL != flag) {
+ *flag = _flag;
+ }
+ if (NULL != cigar) {
+ *cigar = field[CIGAR];
+ }
+ //
+ // Copy the contig-name field selected by 'rfield' into a null-terminated buffer and look it up
+ // in the genome.
+ //
+ // genome - genome to search; the lookup is skipped when this is NULL
+ // contigName - caller-supplied buffer that receives the null-terminated name
+ // contigNameBufferSize - size of contigName in bytes; a field that does not fit is fatal
+ // o_locationOfContig - set to 0, then filled in by Genome::getLocationOfContig
+ // o_indexOfContig - passed through to Genome::getLocationOfContig
+ // field / fieldLength - field pointers and lengths as produced by parseLine
+ // rfield - index of the field holding the contig name
+ //
+ // A name starting with '*' is not looked up. A failed lookup is silently ignored (the error
+ // report below is commented out), leaving *o_locationOfContig as the lookup left it.
+ //
+ void
+ const Genome* genome,
+ char* contigName,
+ size_t contigNameBufferSize,
+ GenomeLocation* o_locationOfContig,
+ int* o_indexOfContig,
+ char* field[],
+ size_t fieldLength[],
+ unsigned rfield)
+ if (fieldLength[rfield] >= contigNameBufferSize) { // >= because we need a byte for the \0
+ WriteErrorMessage("SAMReader: too long an RNAME. Can't parse.\n");
+ soft_exit(1);
+ }
+ memcpy(contigName,field[rfield],fieldLength[rfield]);
+ contigName[fieldLength[rfield]] = '\0';
+ *o_locationOfContig = 0;
+ if ('*' != contigName[0] && genome != NULL && !genome->getLocationOfContig(contigName, o_locationOfContig, o_indexOfContig)) {
+ //WriteErrorMessage("Unable to find contig '%s' in genome. SAM file malformed.\n",contigName);
+ //soft_exit(1);
+ }
+ //
+ // Turn the 1-based position field selected by 'posfield' into a 0-based genome location.
+ //
+ // locationOfContig - genome location at which the contig begins
+ // field / fieldLength - field pointers and lengths as produced by parseLine
+ // rfield - index of the contig-name field
+ // posfield - index of the position field
+ //
+ // Returns InvalidGenomeLocation when either field starts with '*'. A position field that is
+ // too long, cannot be parsed, or parses as 0 is fatal (error message, then soft_exit(1)).
+ //
+ GenomeLocation
+ GenomeLocation locationOfContig,
+ char* field[],
+ size_t fieldLength[],
+ unsigned rfield,
+ unsigned posfield)
+ if ('*' == field[rfield][0] || '*' == field[posfield][0]) {
+ return InvalidGenomeLocation;
+ }
+ //
+ // We can't call sscanf directly into the mapped file, because it reads to the end of the
+ // string even when it's satisfied all of its fields. Since this can be gigabytes, it's not
+ // really good for perf. Instead, copy the POS field into a local buffer and null terminate it.
+ //
+ const unsigned posBufferSize = 20;
+ char posBuffer[posBufferSize];
+ if (fieldLength[posfield] >= posBufferSize) {
+ WriteErrorMessage("SAMReader: POS field too long.\n");
+ soft_exit(1);
+ }
+ memcpy(posBuffer,field[posfield],fieldLength[posfield]);
+ posBuffer[fieldLength[posfield]] = '\0';
+ //
+ // Require exactly one conversion: sscanf returns EOF (not 0) on empty input, and %u is the
+ // conversion that matches an unsigned destination.
+ //
+ unsigned oneBasedOffsetWithinContig = 0;
+ if (1 != sscanf(posBuffer,"%u",&oneBasedOffsetWithinContig)) {
+ WriteErrorMessage("SAMReader: Unable to parse position when it was expected.\n");
+ soft_exit(1);
+ }
+ if (0 == oneBasedOffsetWithinContig) {
+ WriteErrorMessage("SAMReader: Position parsed as 0 when it was expected.\n");
+ soft_exit(1);
+ }
+ return locationOfContig + oneBasedOffsetWithinContig - 1; // -1 is because our offset is 0 based, while SAM is 1 based.
+ //
+ // Open 'fileName' on the underlying data reader and position this reader on its share of the file.
+ //
+ // fileName - file to read; failure to open it is fatal
+ // startingOffset - byte offset at which this reader's range begins
+ // amountOfFileToProcess - size of the range in bytes; 0 is passed through to reinit() unchanged
+ //
+ // The header is read only when the range starts at offset 0. The range handed to reinit() never
+ // starts inside the header: a range that begins before context.headerBytes is moved up to the
+ // end of the header and shortened by the number of header bytes it overlapped.
+ //
+ void
+ const char *fileName,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess)
+ if (! data->init(fileName)) {
+ WriteErrorMessage( "Unable to read file %s\n", fileName);
+ soft_exit(1);
+ }
+ if (0 == startingOffset) {
+ readHeader(fileName);
+ }
+ headerSize = context.headerBytes;
+ reinit(max(startingOffset, (_int64) context.headerBytes),
+ amountOfFileToProcess == 0 || startingOffset >= (_int64) context.headerBytes ? amountOfFileToProcess
+ : amountOfFileToProcess - (context.headerBytes - startingOffset));
+ //
+ // Reposition the reader on the byte range [startingOffset, startingOffset + amountOfFileToProcess)
+ // and advance to the first line that begins inside it.
+ //
+ // Unless the range starts exactly at the end of the header, it is widened by one byte at the
+ // front and everything up to and including the first newline is discarded, so that a partial
+ // line at the start of the range is skipped. Returns without advancing if no data is available
+ // or the first buffer contains no newline.
+ //
+ void
+SAMReader::reinit(_int64 startingOffset, _int64 amountOfFileToProcess)
+ _ASSERT(-1 != headerSize && startingOffset >= headerSize); // Must call init() before reinit()
+ //
+ // There's no way to tell if we start at the very beginning of a read, we need to see the previous newline.
+ // So, read one byte before our assigned read in case that was the terminating newline of the previous read.
+ //
+ if (startingOffset > headerSize) {
+ startingOffset--;
+ amountOfFileToProcess++;
+ }
+ data->reinit(startingOffset, amountOfFileToProcess);
+ char* buffer;
+ _int64 validBytes;
+ if (!data->getData(&buffer, &validBytes)) {
+ return;
+ }
+ // Only a range that starts past the header can begin mid-line.
+ if (startingOffset != headerSize) {
+ char *firstNewline = strnchr(buffer,'\n',validBytes);
+ if (NULL == firstNewline) {
+ return;
+ }
+ data->advance((unsigned)(firstNewline - buffer + 1)); // +1 skips over the newline.
+ }
+ //
+ // Read the next SAM line from the underlying data reader and parse it into 'read' and the
+ // optional output parameters (see getReadFromLine for their meaning).
+ //
+ // Lines whose flag has SAM_SECONDARY or SAM_SUPPLEMENTARY set are skipped when the context
+ // asks for them to be ignored. Returns false when no more data is available, even after moving
+ // on to the next batch. A buffer with no newline is fatal.
+ //
+ // NOTE(review): 'read' is dereferenced unconditionally here, so it must not be NULL;
+ // 'ignoreEndOfRange' is not used in the lines visible here.
+ //
+ bool
+ Read *read,
+ AlignmentResult *alignmentResult,
+ GenomeLocation *genomeLocation,
+ Direction *direction,
+ unsigned *mapQ,
+ unsigned *flag,
+ bool ignoreEndOfRange,
+ const char **cigar)
+ // The loop condition below needs the flag even if the caller did not ask for it.
+ unsigned local_flag;
+ if (NULL == flag) {
+ flag = &local_flag;
+ }
+ do {
+ char* buffer;
+ _int64 bytes;
+ if (! data->getData(&buffer, &bytes)) {
+ data->nextBatch();
+ if (! data->getData(&buffer, &bytes)) {
+ return false;
+ }
+ }
+ char *newLine = strnchr(buffer, '\n', bytes);
+ if (NULL == newLine) {
+ //
+ // There is no newline, so the line crosses the end of the buffer.
+ // This should never happen since underlying reader manages overflow between chunks.
+ //
+ WriteErrorMessage("SAM file has too long a line, or doesn't end with a newline! Failing. fileOffset = %lld\n", data->getFileOffset());
+ soft_exit(1);
+ }
+ size_t lineLength;
+ // Start from the default read group; getReadFromLine is given the read and may change it.
+ read->setReadGroup(context.defaultReadGroup);
+ getReadFromLine(context.genome, buffer,buffer + bytes, read, alignmentResult, genomeLocation, direction, mapQ, &lineLength, flag, cigar, clipping);
+ read->setBatch(data->getBatch());
+ data->advance((newLine + 1) - buffer);
+ } while ((context.ignoreSecondaryAlignments && ((*flag) & SAM_SECONDARY)) ||
+ (context.ignoreSupplementaryAlignments && ((*flag) & SAM_SUPPLEMENTARY)));
+ return true;
+ //
+ // Create the object that supplies single-ended reads from a SAM file to 'numThreads' threads.
+ //
+ // fileName - SAM file to read, or "-" for stdin
+ // numThreads - number of consumer threads
+ // context - reader context passed through to the reader / generator
+ //
+ // Returns NULL if the stdin reader could not be created.
+ //
+ ReadSupplierGenerator *
+ const char *fileName,
+ int numThreads,
+ const ReaderContext& context)
+ //
+ // single-ended SAM files always can be read with the range splitter, unless reading from stdin, which needs a queue
+ //
+ if (!strcmp(fileName, "-")) {
+ //
+ // Because we can only have one stdin reader, we need to use a queue if we're reading from stdin
+ //
+ ReadReader* reader = SAMReader::create(DataSupplier::Stdio, "-", ReadSupplierQueue::BufferCount(numThreads), context, 0, 0);
+ if (reader == NULL) {
+ return NULL;
+ }
+ ReadSupplierQueue *queue = new ReadSupplierQueue(reader);
+ queue->startReaders();
+ return queue;
+ }
+ //
+ // The generator is built from the file name alone. The RangeSplitter that used to be
+ // allocated here was never used or freed, so it has been removed.
+ //
+ return new RangeSplittingReadSupplierGenerator(fileName, true, numThreads, context);
+ //
+ // Create a reader that returns mate pairs from a SAM file by wrapping a single-read SAM reader
+ // in PairedReadReader::PairMatcher.
+ //
+ // fileName - SAM file to read; "-" selects DataSupplier::Stdio, anything else the default
+ // bufferCount - buffers requested by the caller; PairedReadReader::MatchBuffers more are added
+ // quicklyDropUnpairedReads - passed through to the pair matcher
+ // context - reader context passed to the underlying reader
+ //
+ // Returns NULL if the underlying reader could not be created.
+ //
+ // NOTE(review): in the lines visible here 'supplier', 'startingOffset' and
+ // 'amountOfFileToProcess' are not used; the underlying reader is always created with 0, 0.
+ //
+ PairedReadReader*
+ const DataSupplier* supplier,
+ const char *fileName,
+ int bufferCount,
+ _int64 startingOffset,
+ _int64 amountOfFileToProcess,
+ bool quicklyDropUnpairedReads,
+ const ReaderContext& context)
+ DataSupplier *data;
+ if (!strcmp("-", fileName)) {
+ data = DataSupplier::Stdio;
+ } else {
+ data = DataSupplier::Default;
+ }
+ SAMReader* reader = SAMReader::create(data, fileName, bufferCount + PairedReadReader::MatchBuffers, context, 0, 0);
+ if (reader == NULL) {
+ return NULL;
+ }
+ return PairedReadReader::PairMatcher(reader, quicklyDropUnpairedReads);
+ //
+ // Create the object that supplies paired reads from a SAM file to 'numThreads' threads.
+ //
+ // A paired reader is created over the whole file and wrapped in a ReadSupplierQueue, whose
+ // readers are started before it is returned. Failure to create the paired reader is fatal.
+ //
+ PairedReadSupplierGenerator *
+ const char *fileName,
+ int numThreads,
+ bool quicklyDropUnpairedReads,
+ const ReaderContext& context)
+ //
+ // need to use a queue so that pairs can be matched
+ //
+ PairedReadReader* paired = SAMReader::createPairedReader(DataSupplier::Default, fileName,
+ ReadSupplierQueue::BufferCount(numThreads), 0, 0, quicklyDropUnpairedReads, context);
+ if (paired == NULL) {
+ WriteErrorMessage( "Cannot create reader on %s\n", fileName);
+ soft_exit(1);
+ }
+ ReadSupplierQueue* queue = new ReadSupplierQueue(paired);
+ queue->startReaders();
+ return queue;
+// The two SAM format singletons, indexed 0 and 1; they differ only in the bool passed to the
+// SAMFormat constructor (presumably the useM setting - TODO confirm against SAMFormat's declaration).
+const FileFormat* FileFormat::SAM[] = { new SAMFormat(false), new SAMFormat(true) };
+ //
+ // Extract the sort key of the SAM line at the start of 'buffer'.
+ //
+ // genome - used to resolve contig names to genome locations
+ // buffer / bytes - text to parse and the number of readable bytes
+ // o_location - if non-NULL, receives the location taken from RNAME/POS, or from RNEXT/PNEXT
+ // when POS is empty or '*', or UINT32_MAX when PNEXT is empty or '*' as well
+ // o_readBytes - if non-NULL, receives the length of the line including its newline
+ // o_refID - receives the contig index from the lookup, or -1 (if non-NULL) when there is
+ // no position at all
+ // o_pos - if non-NULL, set to 0 when there is no position at all; not written otherwise
+ // in the lines visible here
+ //
+ // A line that cannot be parsed is fatal (error message, then soft_exit(1)). Previously the
+ // result of parseLine() was ignored and the field arrays were read uninitialized.
+ //
+ void
+ const Genome* genome,
+ char* buffer,
+ _int64 bytes,
+ GenomeLocation* o_location,
+ GenomeDistance* o_readBytes,
+ int* o_refID,
+ int* o_pos) const
+ char* fields[SAMReader::nSAMFields];
+ size_t lengths[SAMReader::nSAMFields];
+ size_t lineLength;
+ if (!SAMReader::parseLine(buffer, buffer + bytes, fields, &lineLength, lengths)) {
+ WriteErrorMessage("Failed to parse SAM line while extracting sort information.\n");
+ soft_exit(1);
+ }
+ _ASSERT(lineLength < UINT32_MAX);
+ if (o_readBytes != NULL) {
+ *o_readBytes = (unsigned) lineLength;
+ }
+ const size_t contigNameBufferSize = 512;
+ char contigName[contigNameBufferSize];
+ GenomeLocation locationOfContig;
+ if (lengths[SAMReader::POS] == 0 || fields[SAMReader::POS][0] == '*') {
+ if (lengths[SAMReader::PNEXT] == 0 || fields[SAMReader::PNEXT][0] == '*') {
+ // Neither this read nor its mate has a position.
+ if (o_location != NULL) {
+ *o_location = UINT32_MAX;
+ }
+ if (o_refID != NULL) {
+ *o_refID = -1;
+ }
+ if (o_pos != NULL) {
+ *o_pos = 0;
+ }
+ } else {
+ // No position of our own: sort by the mate's contig and position.
+ SAMReader::parseContigName(genome, contigName, contigNameBufferSize, &locationOfContig, o_refID, fields, lengths, SAMReader::RNEXT);
+ if (o_location != NULL) {
+ *o_location = SAMReader::parseLocation(locationOfContig, fields, lengths, SAMReader::RNEXT, SAMReader::PNEXT);
+ }
+ }
+ } else {
+ SAMReader::parseContigName(genome, contigName, contigNameBufferSize, &locationOfContig, o_refID, fields, lengths);
+ if (o_location != NULL) {
+ *o_location = SAMReader::parseLocation(locationOfContig, fields, lengths);
+ }
+ }
+// Which @RG line fields to put in the aux data of every read: a concatenation of two-letter
+// tags (ID, LB, PL, PU, SM) that is scanned two characters at a time.
+const char* FileFormat::RGLineToAux = "IDLBPLPUSM";
+ //
+ // Precompute the auxiliary-data text (or BAM aux records) that corresponds to the default
+ // @RG line, and store it in the reader context.
+ //
+ // options - options->rgLineContents is the @RG line to convert; NULL or empty yields empty aux data
+ // readerContext - receives defaultReadGroupAux and defaultReadGroupAuxLen
+ // bam - true to emit BAMAlignAux records, false to emit SAM text of the form \tXX:Z:value
+ //
+ // Only the tags listed in RGLineToAux are kept, and the ID tag is renamed to RG. A field that
+ // is not of the form \tXX:... is fatal. The buffer allocated here is handed to the context
+ // and is not freed by this function.
+ //
+ void
+ AlignerOptions* options,
+ ReaderContext* readerContext,
+ bool bam)
+ if (options->rgLineContents == NULL || *options->rgLineContents == '\0') {
+ readerContext->defaultReadGroupAux = "";
+ readerContext->defaultReadGroupAuxLen = 0;
+ return;
+ }
+ char* buffer = new char[strlen(options->rgLineContents) * 3]; // can't expand > 2x
+ const char* from = options->rgLineContents;
+ char* to = buffer;
+ // skip @RG
+ _ASSERT(strncmp(from, "@RG", 3) == 0);
+ while (*from && *from != '\t') {
+ from++;
+ }
+ // Each iteration consumes one \tXX:value field.
+ while (*from) {
+ if (!(from[0] == '\t' && from[1] && from[1] != '\t' && from[2] && from[2] != '\t' && from[3] == ':')) {
+ WriteErrorMessage("Invalid @RG line: %s\n", options->rgLineContents);
+ soft_exit(1);
+ }
+ // Decide whether this tag is one of the two-letter tags listed in RGLineToAux.
+ bool keep = false;
+ bool isID = false;
+ for (const char* a = RGLineToAux; *a; a += 2) {
+ if (from[1] == a[0] && from[2] == a[1]) {
+ keep = true;
+ isID = from[1] == 'I' && from[2] == 'D';
+ break;
+ }
+ }
+ if (keep) {
+ if (bam) {
+ // BAM: tag, 'Z' type, then the null-terminated value.
+ BAMAlignAux* aux = (BAMAlignAux*)to;
+ aux->tag[0] = isID ? 'R' : from[1];
+ aux->tag[1] = isID ? 'G' : from[2];
+ aux->val_type = 'Z';
+ from += 4; // skip \tXX:
+ to = (char*)aux->value();
+ while (*from && *from != '\t') {
+ *to++ = *from++;
+ }
+ *to++ = 0;
+ } else {
+ // turn \tXX: into \tXX:Z:, change ID to RG
+ *to++ = *from++;
+ if (isID) {
+ *to++ = 'R';
+ *to++ = 'G';
+ from += 2;
+ } else {
+ *to++ = *from++;
+ *to++ = *from++;
+ }
+ *to++ = *from++;
+ *to++ = 'Z';
+ *to++ = ':';
+ // copy string attribute
+ while (*from && *from != '\t') {
+ *to++ = *from++;
+ }
+ }
+ } else {
+ // Not a tag we keep: skip \tXX: and its value.
+ from += 4;
+ while (*from && *from != '\t') {
+ from++;
+ }
+ }
+ }
+ readerContext->defaultReadGroupAux = buffer;
+ readerContext->defaultReadGroupAuxLen = (int) (to - buffer);
+ //
+ // Create the writer supplier for the output file named in 'options'.
+ //
+ // When options->sortOutput is set, output goes through a sorted data writer that uses
+ // "<outputFile>.tmp" as its temporary file and options->sortMemory (scaled by 2^30) as its
+ // memory budget; otherwise a plain data writer on the output file is used.
+ //
+ ReadWriterSupplier*
+ AlignerOptions* options,
+ const Genome* genome) const
+ DataWriterSupplier* dataSupplier;
+ if (options->sortOutput) {
+ size_t len = strlen(options->outputFile.fileName);
+ // todo: this is going to leak, but there's no easy way to free it, and it's small...
+ char* tempFileName = (char*) malloc(5 + len); // 4 characters for ".tmp" plus the terminating null
+ strcpy(tempFileName, options->outputFile.fileName);
+ strcpy(tempFileName + len, ".tmp");
+ dataSupplier = DataWriterSupplier::sorted(this, genome, tempFileName, options->sortMemory * (1ULL << 30),
+ options->numThreads, options->outputFile.fileName, NULL, options->writeBufferSize);
+ } else {
+ dataSupplier = DataWriterSupplier::create(options->outputFile.fileName, options->writeBufferSize);
+ }
+ return ReadWriterSupplier::create(this, dataSupplier, genome);
+ //
+ // Write a SAM header into 'header'.
+ //
+ // context - supplies the input file's header (if any), whether it matches the index, and the genome
+ // header - buffer that receives the header text
+ // headerBufferSize - size of 'header' in bytes
+ // headerActualSize - receives the number of bytes written
+ // sorted - selects SO:coordinate versus SO:unsorted on the @HD line
+ // argc / argv - command line, recorded in the CL: field of the @PG line
+ // version - recorded in the VN: field of the @PG line
+ // rgLine - @RG line to emit when the input header has none; NULL selects a built-in default
+ // omitSQLines - suppress the generated @SQ lines
+ //
+ // Returns false, without setting *headerActualSize, when the buffer is too small.
+ //
+ bool
+ const ReaderContext& context,
+ char *header,
+ size_t headerBufferSize,
+ size_t *headerActualSize,
+ bool sorted,
+ int argc,
+ const char **argv,
+ const char *version,
+ const char *rgLine,
+ bool omitSQLines) // Hacky option for Charles
+ const
+ //
+ // Join argv with single spaces. The size starts at 1 for the terminating null so that the
+ // buffer is never empty, even when argc is 0.
+ //
+ char *commandLine;
+ size_t commandLineSize = 1;
+ for (int i = 0; i < argc; i++) {
+ commandLineSize += strlen(argv[i]) + 1; // +1 is the separating space
+ }
+ commandLine = new char[commandLineSize];
+ commandLine[0] = '\0';
+ for (int i = 0; i < argc; i++) {
+ strcat(commandLine,argv[i]);
+ if (i != argc-1) {
+ strcat(commandLine," ");
+ }
+ }
+ //
+ // @HD line, then (only when there is no input header to copy from) the @RG line, then our @PG line.
+ //
+ size_t bytesConsumed = snprintf(header, headerBufferSize, "@HD\tVN:1.4\tSO:%s\n%s%s@PG\tID:SNAP\tPN:SNAP\tCL:%s\tVN:%s\n",
+ sorted ? "coordinate" : "unsorted",
+ context.header == NULL ? (rgLine == NULL ? "@RG\tID:FASTQ\tSM:sample" : rgLine) : "",
+ context.header == NULL ? "\n" : "",
+ commandLine,version);
+ delete [] commandLine;
+ commandLine = NULL;
+ if (bytesConsumed >= headerBufferSize) {
+ //WriteErrorMessage("SAMWriter: header buffer too small\n");
+ return false;
+ }
+ if (context.header != NULL) {
+ //
+ // Copy the input header line by line, dropping the lines we regenerate ourselves.
+ //
+ bool hasRG = false;
+ for (const char* p = context.header; p < context.header + context.headerLength; ) {
+ const char* newline = strnchr(p, '\n', (context.header + context.headerLength) - p);
+ if (newline == NULL) {
+ newline = context.header + context.headerLength;
+ }
+ _ASSERT(newline - p >= 3);
+ // skip @HD lines, and also @SQ lines if header does not match index
+ hasRG |= strncmp(p, "@RG", 3) == 0;
+ if (strncmp(p, "@HD", 3) != 0 &&
+ (context.headerMatchesIndex || strncmp(p, "@SQ", 3) != 0) &&
+ strncmp(p, "@PG\tID:SNAP\t", 12) != 0) {
+ if (bytesConsumed + (newline - p) + 1 >= headerBufferSize) {
+ //WriteErrorMessage("SAMWriter: header buffer too small\n");
+ return false;
+ }
+ memcpy(header + bytesConsumed, p, (newline - p));
+ * (header + bytesConsumed + (newline - p)) = '\n';
+ bytesConsumed += (newline - p) + 1;
+ }
+ p = newline + 1;
+ }
+ if (! hasRG) {
+ int n = snprintf(header + bytesConsumed, headerBufferSize - bytesConsumed, "%s\n",
+ rgLine == NULL ? "@RG\tID:FASTQ\tSM:sample" : rgLine);
+ // snprintf returns the untruncated length, so n == remaining space already means truncation.
+ if (n < 0 || (size_t) n >= headerBufferSize - bytesConsumed) {
+ //WriteErrorMessage( "SAMWriter: header buffer too small\n");
+ return false;
+ }
+ bytesConsumed += n;
+ }
+ }
+#ifndef SKIP_SQ_LINES
+ if ((context.header == NULL || ! context.headerMatchesIndex) && context.genome != NULL && !omitSQLines) {
+ // Write an @SQ line for each chromosome / contig in the genome
+ const Genome::Contig *contigs = context.genome->getContigs();
+ int numContigs = context.genome->getNumContigs();
+ GenomeDistance genomeLen = context.genome->getCountOfBases();
+ for (int i = 0; i < numContigs; i++) {
+ // A contig runs up to the start of the next one (or the end of the genome), less the padding.
+ GenomeLocation start = contigs[i].beginningLocation;
+ GenomeLocation end = ((i + 1 < numContigs) ? contigs[i+1].beginningLocation : genomeLen) - context.genome->getChromosomePadding();
+ // NOTE(review): %u is paired with a GenomeLocation difference here - confirm the types agree.
+ bytesConsumed += snprintf(header + bytesConsumed, headerBufferSize - bytesConsumed, "@SQ\tSN:%s\tLN:%u\n", contigs[i].name, end - start);
+ if (bytesConsumed >= headerBufferSize) {
+ // todo: increase buffer size (or change to write in batch
+ //WriteErrorMessage("SAMWriter: header buffer too small, skipping @SQ lines\n");
+ return false;
+ }
+ }
+ }
+#endif // SKIP_SQ_LINES
+ *headerActualSize = bytesConsumed;
+ return true;
+ //
+ // Compute the field values of one SAM/BAM record for 'read' (everything except the CIGAR
+ // string), writing them through the reference parameters.
+ //
+ // The unclipped bases and qualities are copied into 'data' and 'quality' (reverse-complemented
+ // and reversed, respectively, when direction is RC). Returns false if the unclipped read is
+ // longer than 'dataSize', true otherwise.
+ //
+ // 'flags' and 'mapQuality' are both read and updated. 'qnameLen' is filled in from the read
+ // when it is 0 on entry. *extraBasesClippedBefore receives the clipping reported by
+ // Genome::getContigForRead for a mapped read and 0 for an unmapped one, so it must not be NULL.
+ // 'lv', 'useM' and 'mateResult' are not used in the lines visible here.
+ //
+ bool
+ const Genome * genome,
+ LandauVishkinWithCigar * lv,
+ // output data
+ char* data,
+ char* quality,
+ GenomeDistance dataSize,
+ const char*& contigName,
+ int& contigIndex,
+ int& flags,
+ GenomeDistance& positionInContig,
+ int& mapQuality,
+ const char*& matecontigName,
+ int& mateContigIndex,
+ GenomeDistance& matePositionInContig,
+ _int64& templateLength,
+ unsigned& fullLength,
+ const char*& clippedData,
+ unsigned& clippedLength,
+ unsigned& basesClippedBefore,
+ unsigned& basesClippedAfter,
+ // input data
+ size_t& qnameLen,
+ Read * read,
+ AlignmentResult result,
+ GenomeLocation genomeLocation,
+ Direction direction,
+ bool secondaryAlignment,
+ bool useM,
+ bool hasMate,
+ bool firstInPair,
+ bool alignedAsPair,
+ Read * mate,
+ AlignmentResult mateResult,
+ GenomeLocation mateLocation,
+ Direction mateDirection,
+ GenomeDistance *extraBasesClippedBefore)
+ contigName = "*";
+ positionInContig = 0;
+ const char *cigar = "*";
+ templateLength = 0;
+ if (secondaryAlignment) {
+ flags |= SAM_SECONDARY;
+ }
+ if (0 == qnameLen) {
+ qnameLen = read->getIdLength();
+ }
+ //
+ // If the aligner said it didn't find anything, treat it as such. Sometimes it will emit the
+ // best match that it found, even if it's not within the maximum edit distance limit (but will
+ // then say NotFound). Here, we force that to be SAM_UNMAPPED.
+ //
+ if (NotFound == result) {
+ genomeLocation = InvalidGenomeLocation;
+ }
+ if (InvalidGenomeLocation == genomeLocation) {
+ //
+ // If it's unmapped, then always emit it in the forward direction. This is necessary because we don't even include
+ // the SAM_REVERSE_COMPLEMENT flag for unmapped reads, so there's no way to tell that we reversed it.
+ //
+ direction = FORWARD;
+ }
+ // Write the data and quality strings. If the read is reverse complemented, these need to
+ // be backwards from the original read. Also, both need to be unclipped.
+ clippedLength = read->getDataLength();
+ fullLength = read->getUnclippedLength();
+ if (fullLength > dataSize) {
+ return false;
+ }
+ if (direction == RC) {
+ for (unsigned i = 0; i < fullLength; i++) {
+ data[fullLength - 1 - i] = COMPLEMENT[read->getUnclippedData()[i]];
+ quality[fullLength - 1 - i] = read->getUnclippedQuality()[i];
+ }
+ // Reversing the read swaps the roles of its front and back clipping.
+ clippedData = &data[fullLength - clippedLength - read->getFrontClippedLength()];
+ basesClippedBefore = fullLength - clippedLength - read->getFrontClippedLength();
+ basesClippedAfter = read->getFrontClippedLength();
+ } else {
+ memcpy(data, read->getUnclippedData(), read->getUnclippedLength());
+ memcpy(quality, read->getUnclippedQuality(), read->getUnclippedLength());
+ clippedData = read->getData();
+ basesClippedBefore = read->getFrontClippedLength();
+ basesClippedAfter = fullLength - clippedLength - basesClippedBefore;
+ }
+ int editDistance = -1;
+ if (genomeLocation != InvalidGenomeLocation) {
+ // NOTE(review): this branch is empty in this copy; presumably a flag update was lost - confirm against upstream.
+ if (direction == RC) {
+ }
+ const Genome::Contig *contig = genome->getContigForRead(genomeLocation, read->getDataLength(), extraBasesClippedBefore);
+ _ASSERT(NULL != contig && contig->length > genome->getChromosomePadding());
+ genomeLocation += *extraBasesClippedBefore;
+ contigName = contig->name;
+ contigIndex = (int)(contig - genome->getContigs());
+ positionInContig = genomeLocation - contig->beginningLocation + 1; // SAM is 1-based
+ mapQuality = max(0, min(70, mapQuality)); // FIXME: manifest constant.
+ } else {
+ flags |= SAM_UNMAPPED;
+ mapQuality = 0;
+ *extraBasesClippedBefore = 0;
+ }
+ if (hasMate) {
+ flags |= (firstInPair ? SAM_FIRST_SEGMENT : SAM_LAST_SEGMENT);
+ if (mateLocation != InvalidGenomeLocation) {
+ GenomeDistance mateExtraBasesClippedBefore;
+ const Genome::Contig *mateContig = genome->getContigForRead(mateLocation, mate->getDataLength(), &mateExtraBasesClippedBefore);
+ mateLocation += mateExtraBasesClippedBefore;
+ matecontigName = mateContig->name;
+ mateContigIndex = (int)(mateContig - genome->getContigs());
+ matePositionInContig = mateLocation - mateContig->beginningLocation + 1;
+ // NOTE(review): this branch is empty in this copy; presumably a flag update was lost - confirm against upstream.
+ if (mateDirection == RC) {
+ }
+ if (genomeLocation == InvalidGenomeLocation) {
+ //
+ // The SAM spec says that for paired reads where exactly one end is unmapped that the unmapped
+ // half should just have RNAME and POS copied from the mate.
+ //
+ contigName = matecontigName;
+ contigIndex = mateContigIndex;
+ matecontigName = "=";
+ positionInContig = matePositionInContig;
+ }
+ } else {
+ //
+ // The mate's unmapped, so point it at us.
+ //
+ matecontigName = "=";
+ mateContigIndex = contigIndex;
+ matePositionInContig = positionInContig;
+ }
+ if (genomeLocation != InvalidGenomeLocation && mateLocation != InvalidGenomeLocation) {
+ if (alignedAsPair) {
+ flags |= SAM_ALL_ALIGNED;
+ }
+ // Also compute the length of the whole paired-end string whose ends we saw. This is slightly
+ // tricky because (a) we may have clipped some bases before/after each end and (b) we need to
+ // give a signed result based on whether our read is first or second in the pair.
+ GenomeLocation myStart = genomeLocation - basesClippedBefore;
+ GenomeLocation myEnd = genomeLocation + clippedLength + basesClippedAfter;
+ _int64 mateBasesClippedBefore = mate->getFrontClippedLength();
+ _int64 mateBasesClippedAfter = mate->getUnclippedLength() - mate->getDataLength() - mateBasesClippedBefore;
+ GenomeLocation mateStart = mateLocation - (mateDirection == RC ? mateBasesClippedAfter : mateBasesClippedBefore);
+ GenomeLocation mateEnd = mateLocation + mate->getDataLength() + (mateDirection == FORWARD ? mateBasesClippedAfter : mateBasesClippedBefore);
+ if (contigName == matecontigName) { // pointer (not value) comparison, but that's OK.
+ if (myStart < mateStart) {
+ templateLength = mateEnd - myStart;
+ } else {
+ templateLength = -(myEnd - mateStart);
+ }
+ } // otherwise leave TLEN as zero.
+ }
+ if (contigName == matecontigName) {
+ matecontigName = "="; // SAM Spec says to do this when they're equal (and not *, which won't happen because this is a pointer, not string, compare)
+ }
+ }
+ return true;
+ //
+ // Format one alignment of 'read' as a line of SAM text in 'buffer'.
+ //
+ // buffer / bufferSpace - destination and its size in bytes
+ // spaceUsed - if non-NULL, receives the number of characters in the line
+ // o_addFrontClipping - set to 0 on entry; a non-zero value from the CIGAR computation makes
+ // this function return false without writing a line
+ // (the remaining parameters are passed to createSAMLine / computeCigarString)
+ //
+ // Returns false when the record cannot be built, when front clipping must be added, or when
+ // the line does not fit in the buffer.
+ //
+ bool
+ const ReaderContext& context,
+ LandauVishkinWithCigar * lv,
+ char * buffer,
+ size_t bufferSpace,
+ size_t * spaceUsed,
+ size_t qnameLen,
+ Read * read,
+ AlignmentResult result,
+ int mapQuality,
+ GenomeLocation genomeLocation,
+ Direction direction,
+ bool secondaryAlignment,
+ int * o_addFrontClipping,
+ bool hasMate,
+ bool firstInPair,
+ Read * mate,
+ AlignmentResult mateResult,
+ GenomeLocation mateLocation,
+ Direction mateDirection,
+ bool alignedAsPair
+ ) const
+ const int cigarBufSize = MAX_READ * 2;
+ char cigarBuf[cigarBufSize];
+ const int cigarBufWithClippingSize = MAX_READ * 2 + 32;
+ char cigarBufWithClipping[cigarBufWithClippingSize];
+ int flags = 0;
+ const char *contigName = "*";
+ int contigIndex = -1;
+ GenomeDistance positionInContig = 0;
+ const char *cigar = "*";
+ const char *matecontigName = "*";
+ int mateContigIndex = -1;
+ GenomeDistance matePositionInContig = 0;
+ _int64 templateLength = 0;
+ char data[MAX_READ];
+ char quality[MAX_READ];
+ const char* clippedData;
+ unsigned fullLength;
+ unsigned clippedLength;
+ unsigned basesClippedBefore;
+ GenomeDistance extraBasesClippedBefore; // Clipping added if we align before the beginning of a chromosome
+ unsigned basesClippedAfter;
+ int editDistance = -1;
+ *o_addFrontClipping = 0;
+ if (!createSAMLine(context.genome, lv, data, quality, MAX_READ, contigName, contigIndex,
+ flags, positionInContig, mapQuality, matecontigName, mateContigIndex, matePositionInContig, templateLength,
+ fullLength, clippedData, clippedLength, basesClippedBefore, basesClippedAfter,
+ qnameLen, read, result, genomeLocation, direction, secondaryAlignment, useM,
+ hasMate, firstInPair, alignedAsPair, mate, mateResult, mateLocation, mateDirection,
+ &extraBasesClippedBefore))
+ {
+ return false;
+ }
+ // The CIGAR string (and edit distance) is only computed for mapped reads; otherwise it stays "*".
+ if (genomeLocation != InvalidGenomeLocation) {
+ cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize,
+ clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter,
+ read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM,
+ &editDistance, o_addFrontClipping);
+ if (*o_addFrontClipping != 0) {
+ return false;
+ }
+ }
+ // Write the SAM entry, which requires the following fields:
+ //
+ // 1. QNAME: Query name of the read or the read pair
+ // 2. FLAG: Bitwise flag (pairing, strand, mate strand, etc.)
+ // 3. RNAME: Reference sequence name
+ // 4. POS: 1-Based leftmost position of clipped alignment
+ // 5. MAPQ: Mapping quality (Phred-scaled)
+ // 6. CIGAR: Extended CIGAR string (operations: MIDNSHP)
+ // 7. MRNM: Mate reference name (‘=’ if same as RNAME)
+ // 8. MPOS: 1-based leftmost mate position
+ // 9. ISIZE: Inferred insert size
+ // 10. SEQQuery: Sequence on the same strand as the reference
+ // 11. QUAL: Query quality (ASCII-33=Phred base quality)
+ //
+ // Some FASTQ files have spaces in their ID strings, which is illegal in SAM. Just truncate them at the space.
+ //
+ const char *firstSpace = strnchr(read->getId(),' ',qnameLen);
+ if (NULL != firstSpace) {
+ qnameLen = (unsigned)(firstSpace - read->getId());
+ }
+ const int nmStringSize = 30;// Big enough that it won't buffer overflow regardless of the value of editDistance
+ char nmString[nmStringSize];
+ snprintf(nmString, nmStringSize, "\tNM:i:%d",editDistance);
+ unsigned auxLen;
+ bool auxSAM;
+ char* aux = read->getAuxiliaryData(&auxLen, &auxSAM);
+ static bool warningPrinted = false;
+ const char* readGroupSeparator = "";
+ const char* readGroupString = "";
+ // Aux data that is not in SAM form is dropped, except that an RG:Z tag is carried over.
+ if (aux != NULL && (! auxSAM)) {
+ if (! warningPrinted) {
+ WriteErrorMessage( "warning: translating optional fields from BAM->SAM not yet implemented, optional fields will not be included in output\n");
+ warningPrinted = true;
+ }
+ if (read->getReadGroup() == READ_GROUP_FROM_AUX) {
+ for (BAMAlignAux* bamAux = (BAMAlignAux*) aux; (char*) bamAux < aux + auxLen; bamAux = bamAux->next()) {
+ if (bamAux->tag[0] == 'R' && bamAux->tag[1] == 'G' && bamAux->val_type == 'Z') {
+ readGroupSeparator = "\tRG:Z:";
+ readGroupString = (char*) bamAux->value();
+ break;
+ }
+ }
+ }
+ aux = NULL;
+ auxLen = 0;
+ }
+ // A read group that did not come from the aux data is written either as the precomputed
+ // default read-group aux text or as an explicit RG:Z: tag.
+ const char* rglineAux = "";
+ int rglineAuxLen = 0;
+ if (read->getReadGroup() != NULL && read->getReadGroup() != READ_GROUP_FROM_AUX) {
+ if (*readGroupString == 0 || strcmp(readGroupString, context.defaultReadGroup) == 0) {
+ readGroupSeparator = "";
+ readGroupString = "";
+ rglineAux = context.defaultReadGroupAux;
+ rglineAuxLen = context.defaultReadGroupAuxLen;
+ } else {
+ readGroupSeparator = "\tRG:Z:";
+ readGroupString = read->getReadGroup();
+ }
+ }
+ // NOTE(review): several arguments (qnameLen as a %.*s precision, the GenomeDistance values
+ // printed with %u) may not match their conversions exactly - confirm the typedefs.
+ int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s%.*s\n",
+ qnameLen, read->getId(),
+ flags,
+ contigName,
+ positionInContig,
+ mapQuality,
+ cigar,
+ matecontigName,
+ matePositionInContig,
+ templateLength,
+ fullLength, data,
+ fullLength, quality,
+ aux != NULL ? "\t" : "", auxLen, aux != NULL ? aux : "",
+ readGroupSeparator, readGroupString,
+ nmString, rglineAuxLen, rglineAux);
+ if (charsInString > bufferSpace) {
+ //
+ // Out of buffer space.
+ //
+ return false;
+ } else if (charsInString == bufferSpace) {
+ buffer[bufferSpace-1] = '\n'; // overwrite trailing null with newline
+ }
+ if (NULL != spaceUsed) {
+ *spaceUsed = charsInString;
+ }
+ return true;
+// Common cigar string computation between SAM and BAM formats.
+ void
+ CigarFormat cigarFormat,
+ const Genome * genome,
+ LandauVishkinWithCigar * lv,
+ char * cigarBuf,
+ int cigarBufLen,
+ const char * data,
+ GenomeDistance dataLength,
+ unsigned basesClippedBefore,
+ GenomeDistance extraBasesClippedBefore,
+ unsigned basesClippedAfter,
+ GenomeDistance *o_extraBasesClippedAfter,
+ GenomeLocation genomeLocation,
+ bool useM,
+ int * o_editDistance,
+ int *o_cigarBufUsed,
+ int * o_addFrontClipping)
+ if (dataLength > INT32_MAX - MAX_K) {
+ dataLength = INT32_MAX - MAX_K;
+ }
+ int netIndel;
+ *o_extraBasesClippedAfter = 0;
+ //
+ // Apply the extra clipping.
+ //
+ genomeLocation += extraBasesClippedBefore;
+ data += extraBasesClippedBefore;
+ dataLength -= extraBasesClippedBefore;
+ const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation);
+ if (genomeLocation + dataLength > contig->beginningLocation + contig->length - genome->getChromosomePadding()) {
+ //
+ // The read hangs off the end of the contig. Soft clip it at the end. This is a tentative amount that assumes no net indels in the
+ // mapping, we'll refine it later if needed.
+ //
+ *o_extraBasesClippedAfter = genomeLocation + dataLength - (contig->beginningLocation + contig->length - genome->getChromosomePadding());
+ } else {
+ *o_extraBasesClippedAfter = 0;
+ }
+ const char *reference = genome->getSubstring(genomeLocation, dataLength);
+ if (NULL == reference) {
+ //
+ // Fell off the end of the contig.
+ //
+ *o_editDistance = 0;
+ *o_addFrontClipping = 0;
+ *o_cigarBufUsed = 0;
+ *cigarBuf = '*';
+ return;
+ }
+ *o_editDistance = lv->computeEditDistanceNormalized(
+ reference,
+ (int)(dataLength - *o_extraBasesClippedAfter + MAX_K), // Add space incase of indels. We know there's enough, because the reference is padded.
+ data,
+ (int)(dataLength - *o_extraBasesClippedAfter),
+ MAX_K - 1,
+ cigarBuf,
+ cigarBufLen,
+ useM,
+ cigarFormat,
+ o_cigarBufUsed,
+ o_addFrontClipping,
+ &netIndel);
+ if (*o_addFrontClipping != 0) {
+ //
+ // On this path, there really isn't a returned cigar string, it's sort of like an exception. We're going up a level and
+ // trying a different alignment.
+ //
+ return;
+ }
+ //
+ // Normally, we'd be done. However, if the amount that we would clip at the end of the read because of hanging off of the end
+ // of the contig changed, then we need to recompute. In some cases this is an iterative process as we add or remove bits
+ // of read.
+ //
+ GenomeDistance newExtraBasesClippedAfter = __max(0, genomeLocation + dataLength + netIndel - (contig->beginningLocation + contig->length - genome->getChromosomePadding()));
+ for (GenomeDistance pass = 0; pass < dataLength; pass++) {
+ if (newExtraBasesClippedAfter == *o_extraBasesClippedAfter) {
+ *o_extraBasesClippedAfter = newExtraBasesClippedAfter;
+ return;
+ }
+ *o_extraBasesClippedAfter = newExtraBasesClippedAfter;
+ *o_editDistance = lv->computeEditDistanceNormalized(
+ reference,
+ (int)(dataLength - *o_extraBasesClippedAfter + MAX_K), // Add space incase of indels. We know there's enough, because the reference is padded.
+ data,
+ (int)(dataLength - *o_extraBasesClippedAfter),
+ MAX_K - 1,
+ cigarBuf,
+ cigarBufLen,
+ useM,
+ cigarFormat,
+ o_cigarBufUsed,
+ o_addFrontClipping,
+ &netIndel);
+ newExtraBasesClippedAfter = __max(0, genomeLocation + dataLength + netIndel - (contig->beginningLocation + contig->length - genome->getChromosomePadding()));
+ }
+ _ASSERT(!"cigar computation didn't converge");
+ *o_extraBasesClippedAfter = newExtraBasesClippedAfter;
+// Compute the CIGAR edit sequence string for a read against a given genome location,
+// including the soft (S) and hard (H) clipping operations around the aligned part.
+//
+// Returns:
+//   - cigarBufWithClipping on success (so the result is only valid until the caller
+//     reuses that buffer, e.g. by calling computeCigarString again);
+//   - "*" if the edit distance computation reported failure (-1 or -2), or if the
+//     clipped string does not fit into cigarBufWithClipping;
+//   - NULL if computeCigar set *o_addFrontClipping to a non-zero value, in which case
+//     no cigar string was produced.
+//
+// *o_editDistance and *o_addFrontClipping are filled in by computeCigar.
+    const char *
+SAMFormat::computeCigarString(
+    const Genome *              genome,
+    LandauVishkinWithCigar *    lv,
+    char *                      cigarBuf,
+    int                         cigarBufLen,
+    char *                      cigarBufWithClipping,
+    int                         cigarBufWithClippingLen,
+    const char *                data,
+    GenomeDistance              dataLength,
+    unsigned                    basesClippedBefore,
+    GenomeDistance              extraBasesClippedBefore,
+    unsigned                    basesClippedAfter,
+    unsigned                    frontHardClipping,
+    unsigned                    backHardClipping,
+    GenomeLocation              genomeLocation,
+    Direction                   direction,
+    bool                        useM,
+    int *                       o_editDistance,
+    int *                       o_addFrontClipping
+)
+{
+    GenomeDistance extraBasesClippedAfter;
+    int cigarBufUsed;
+    computeCigar(COMPACT_CIGAR_STRING, genome, lv, cigarBuf, cigarBufLen, data, dataLength, basesClippedBefore,
+        extraBasesClippedBefore, basesClippedAfter, &extraBasesClippedAfter, genomeLocation, useM,
+        o_editDistance, &cigarBufUsed, o_addFrontClipping);
+    if (*o_addFrontClipping != 0) {
+        //
+        // computeCigar did not produce a cigar string on this path.
+        //
+        return NULL;
+    }
+    if (*o_editDistance == -2) {
+        WriteErrorMessage( "WARNING: computeEditDistance returned -2; cigarBuf may be too small\n");
+        return "*";
+    } else if (*o_editDistance == -1) {
+        // Only warn once per process for this case.
+        static bool warningPrinted = false;
+        if (!warningPrinted) {
+            WriteErrorMessage( "WARNING: computeEditDistance returned -1; this shouldn't happen\n");
+            warningPrinted = true;
+        }
+        return "*";
+    } else {
+        // Add some CIGAR instructions for soft-clipping if we've ignored some bases in the read.
+        char clipBefore[16] = {'\0'};
+        char clipAfter[16] = {'\0'};
+        char hardClipBefore[16] = {'\0'};
+        char hardClipAfter[16] = {'\0'};
+        if (frontHardClipping > 0) {
+            snprintf(hardClipBefore, sizeof(hardClipBefore), "%uH", frontHardClipping);
+        }
+        if (basesClippedBefore + extraBasesClippedBefore > 0) {
+            snprintf(clipBefore, sizeof(clipBefore), "%lluS", basesClippedBefore + extraBasesClippedBefore);
+        }
+        if (basesClippedAfter + extraBasesClippedAfter > 0) {
+            snprintf(clipAfter, sizeof(clipAfter), "%lluS", basesClippedAfter + extraBasesClippedAfter);
+        }
+        if (backHardClipping > 0) {
+            snprintf(hardClipAfter, sizeof(hardClipAfter), "%uH", backHardClipping);
+        }
+        //
+        // Assemble <hard clip><soft clip><core cigar><soft clip><hard clip>. A negative result (some
+        // non-C99 snprintf implementations) or one that is >= the buffer size means the output was
+        // truncated; a truncated cigar is invalid, so report failure the same way as the other error paths.
+        //
+        int charsFormatted = snprintf(cigarBufWithClipping, cigarBufWithClippingLen, "%s%s%s%s%s", hardClipBefore, clipBefore, cigarBuf, clipAfter, hardClipAfter);
+        if (charsFormatted < 0 || charsFormatted >= cigarBufWithClippingLen) {
+            WriteErrorMessage("WARNING: computeCigarString: cigar string with clipping doesn't fit in a buffer of %d bytes\n", cigarBufWithClippingLen);
+            return "*";
+        }
+        validateCigarString(genome, cigarBufWithClipping, cigarBufWithClippingLen,
+            data - basesClippedBefore, dataLength + (basesClippedBefore + basesClippedAfter), genomeLocation + extraBasesClippedBefore, direction, useM);
+        return cigarBufWithClipping;
+    }
+}
+#ifdef _DEBUG
+//
+// Debug-only consistency check of a complete cigar string (including clipping) against the read
+// and the reference. On the first violation found it writes an error message and calls soft_exit(1).
+//
+//  genome          genome used to look up the reference bases and the contig of genomeLocation
+//  cigarBuf        cigar string to check; must be null terminated within cigarBufLen bytes
+//  cigarBufLen     size of cigarBuf in bytes
+//  data            read bases, including any soft-clipped bases
+//  dataLength      number of bases in data; the cigar string must consume exactly this many
+//  genomeLocation  location at which the first non-clipped base is aligned
+//  direction       not used by the checks in this function
+//  useM            true if the cigar string is expected to use M, false if it is expected to use = and X
+//
+    void
+SAMFormat::validateCigarString(
+    const Genome *genome, const char * cigarBuf, int cigarBufLen, const char *data, GenomeDistance dataLength, GenomeLocation genomeLocation, Direction direction, bool useM)
+{
+    const char *nextChunkOfCigar = cigarBuf;
+    GenomeDistance offsetInData = 0;
+    const char *reference = genome->getSubstring(genomeLocation, dataLength);
+    if (NULL == reference) {
+        WriteErrorMessage("validateCigarString: couldn't look up genome data for location %lld\n", GenomeLocationAsInt64(genomeLocation));
+        soft_exit(1);
+    }
+    GenomeDistance offsetInReference = 0;
+    bool sawNonH = false;   // This is to make sure that the clipping types (H & S) occur only at the beginning or end of the cigar string.
+    bool sawTailS = false;  // Did we see an S after some non-H op (i.e., the trailing soft clip)?
+    bool sawTrailingH = false;
+    char previousOp = '\0'; // Make sure that we don't have two consecutive ops of the same type that should be merged
+    bool sawXorM = false;
+    bool lastItemWasIndel = false;
+    //
+    // First check to see that it's null terminated
+    //
+    bool nullTerminated = false;
+    for (int offset = 0; offset < cigarBufLen; offset++) {
+        if ('\0' == cigarBuf[offset]) {
+            nullTerminated = true;
+            break;
+        }
+    }
+    if (!nullTerminated) {
+        WriteErrorMessage("validateCigarString: non-null-terminated or overflow cigar string: '%.*s'\n", cigarBufLen, cigarBuf);
+        soft_exit(1);
+    }
+    const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation);
+    if (NULL == contig) {
+        WriteErrorMessage("validateCigarString: read alignment location isn't in a chromosome, genomeLocation %lld\n", GenomeLocationAsInt64(genomeLocation));
+        soft_exit(1);
+    }
+    if (genomeLocation >= contig->beginningLocation + contig->length - genome->getChromosomePadding()) {
+        WriteErrorMessage("validateCigarString: alignment location is in genome padding: %lld, contig name %s, base %lld, len %lld, padding size %d\n",
+            GenomeLocationAsInt64(genomeLocation), contig->name, GenomeLocationAsInt64(contig->beginningLocation), contig->length, genome->getChromosomePadding());
+        soft_exit(1);
+    }
+    //
+    // Walk the cigar string one <length><op> pair at a time.
+    //
+    while ('\0' != *nextChunkOfCigar) {
+        unsigned len;
+        char op;
+        int fieldsScanned = sscanf(nextChunkOfCigar, "%u%c", &len, &op);    // %u because len is unsigned
+        if (2 != fieldsScanned) {
+            WriteErrorMessage("validateCigarString: didn't scan two fields here '%s' in overall cigar string '%s'\n", nextChunkOfCigar, cigarBuf);
+            soft_exit(1);
+        }
+        if (0 == len) {
+            WriteErrorMessage("validateCigarString: got zero length field here '%s' in overall cigar string '%s'\n", nextChunkOfCigar, cigarBuf);
+            soft_exit(1);
+        }
+        if (op != 'H' && sawTailS) {
+            WriteErrorMessage("validateCigarString: saw incorrect op type after what should have been the terminal soft or hard clipping here '%s', in overall cigar string '%s'\n",
+                nextChunkOfCigar, cigarBuf);
+            soft_exit(1);
+        }
+        if (sawTrailingH) {
+            WriteErrorMessage("validateCigarString: saw op after what should have been the terminal hard clip here '%s' in overall cigar '%s'\n", nextChunkOfCigar, cigarBuf);
+            soft_exit(1);
+        }
+        if (op == previousOp) {
+            WriteErrorMessage("validateCigarString: saw consecutive ops of the same type '%c' here '%s' in overall cigar '%s'\n", op, nextChunkOfCigar, cigarBuf);
+            soft_exit(1);
+        }
+        switch (op) {
+            case 'M':
+            {
+                if (!useM) {
+                    WriteErrorMessage("validateCigarString: generated an M when we were supposed to use X and = here '%s' in overall cigar string '%s'\n", nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                //
+                // M uses up bases in both the read and the reference, so both offsets advance (otherwise the
+                // padding check at the end would under-count the reference span).
+                //
+                offsetInData += len;
+                offsetInReference += len;
+                sawNonH = true;
+                sawXorM = true;
+                lastItemWasIndel = false;
+                break;
+            }
+            case 'X':
+            case '=':
+            {
+                if (useM) {
+                    WriteErrorMessage("validateCigarString: generated an %c when were supposed to use M here '%s' in overall cigar string '%s'\n", op, nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                if (len + offsetInData > dataLength) {
+                    WriteErrorMessage("validateCigarString: cigar string overflowed read length, here '%s', overall cigar '%s'\n", nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                //
+                // Every base in an = range must match the reference, and every base in an X range must differ.
+                //
+                for (unsigned offset = 0; offset < len; offset++) {
+                    if ((data[offset + offsetInData] == reference[offset + offsetInReference]) == ('X' == op)) {
+                        // The precision argument of %.*s must be an int, hence the casts of dataLength.
+                        WriteErrorMessage("validateCigarString: saw a (non-)matching base in an %c range, offset %u, offsetInData %lld, offsetInReference %lld, data '%.*s', reference '%.*s', here '%s', overall cigar '%s'\n",
+                            op, offset, offsetInData, offsetInReference, (int)dataLength, data, (int)dataLength, reference, nextChunkOfCigar, cigarBuf);
+                        soft_exit(1);
+                    }
+                }
+                offsetInData += len;
+                offsetInReference += len;
+                sawNonH = true;
+                sawXorM = true;
+                lastItemWasIndel = false;
+                break;
+            }
+            case 'I':
+            {
+                //
+                // Insertion uses up bases in the read but not in the reference.
+                //
+                if (len + offsetInData > dataLength) {
+                    WriteErrorMessage("validateCigarString: insertion pushes cigar string overlength, here '%s' in overall cigar '%s'\n", nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                if (!sawXorM) {
+                    WriteErrorMessage("validateCigarString: cigar string started with I (after clipping) here '%s' in overall cigar '%s'\n", nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                if (previousOp == 'D') {
+                    WriteErrorMessage("validateCigarString: cigar string had D immediately followed by I here '%s' in overall cigar '%s'\n", nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                offsetInData += len;
+                sawNonH = true;
+                lastItemWasIndel = true;
+                break;
+            }
+            case 'D':
+            {
+                if (!sawXorM) {
+                    WriteErrorMessage("validateCigarString: cigar string started with D (after clipping) here '%s' in overall cigar '%s'\n", nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                if (previousOp == 'I') {
+                    WriteErrorMessage("validateCigarString: cigar string had I immediately followed by D here '%s' in overall cigar '%s'\n", nextChunkOfCigar, cigarBuf);
+                    soft_exit(1);
+                }
+                //
+                // D uses up bases in the reference but not the read.
+                //
+                offsetInReference += len;
+                sawNonH = true;
+                lastItemWasIndel = true;
+                break;
+            }
+            case 'N':
+            case 'P':
+            {
+                WriteErrorMessage("validateCigarString: saw valid op type '%c' that SNAP shouldn't generate, here '%s' in overall cigar string '%s'\n", op, nextChunkOfCigar, cigarBuf);
+                soft_exit(1);
+                // NOTE(review): no break here; this relies on soft_exit not returning - confirm.
+            }
+            case 'H':
+            {
+                //
+                // Hard clip bases do not occur in the read string at all. All we can validate is that this is the first or last thing in the cigar string.
+                //
+                if (nextChunkOfCigar == cigarBuf) {
+                    //
+                    // First thing, this is OK.
+                    //
+                    break;
+                }
+                sawTrailingH = true;
+                break;
+            }
+            case 'S':
+            {
+                //
+                // An S that follows any non-H op is the trailing soft clip; only an H may follow it.
+                //
+                if (sawNonH) {
+                    sawTailS = true;
+                }
+                sawNonH = true;
+                offsetInData += len;
+                break;
+            }
+            default: {
+                WriteErrorMessage("validateCigarString: got unrecognized cigar op '%c', here '%s' in overall string '%s'\n", op, nextChunkOfCigar, cigarBuf);
+                soft_exit(1);
+            }
+        }
+        previousOp = op;
+        //
+        // Now scan over the current op.
+        //
+        while ('0' <= *nextChunkOfCigar && '9' >= *nextChunkOfCigar) {
+            nextChunkOfCigar++;
+        }
+        if (*nextChunkOfCigar != op) {
+            WriteErrorMessage("validateCigarString: bug in validation code; expected op '%c', got '%c' at '%s' in '%s'\n", op, *nextChunkOfCigar, nextChunkOfCigar, cigarBuf);
+            soft_exit(1);
+        }
+        nextChunkOfCigar++;
+    }
+    if (offsetInData != dataLength) {
+        WriteErrorMessage("validateCigarString: Didn't consume entire read data, got %lld of %lld, cigar '%s'\n", offsetInData, dataLength, cigarBuf);
+        soft_exit(1);
+    }
+    if (lastItemWasIndel) {
+        WriteErrorMessage("validateCigarString: cigar string ended with indel '%s'\n", cigarBuf);
+        soft_exit(1);
+    }
+    //
+    // Make sure none of the non-soft-clipped part of the read is mapped onto padding.
+    //
+    if (genomeLocation + offsetInReference > contig->beginningLocation + contig->length - genome->getChromosomePadding()) {
+        WriteErrorMessage("validateCigarString: alignment runs into contig padding: %lld, contig name %s, base %lld, len %lld, padding size %d, offsetInReference %lld\n",
+            GenomeLocationAsInt64(genomeLocation), contig->name, GenomeLocationAsInt64(contig->beginningLocation), contig->length, genome->getChromosomePadding(), offsetInReference);
+        soft_exit(1);
+    }
+}
+#endif // _DEBUG
diff --git a/SNAPLib/SAM.h b/SNAPLib/SAM.h
new file mode 100644
index 0000000..b09cf94
--- /dev/null
+++ b/SNAPLib/SAM.h
@@ -0,0 +1,236 @@
+/*
+Module Name:
+ SAM.h
+ Sequence Alignment Map (SAM) file writer.
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+*/
+#pragma once
+#include "Compat.h"
+#include "LandauVishkin.h"
+#include "PairedEndAligner.h"
+#include "VariableSizeVector.h"
+#include "BufferedAsync.h"
+#include "directions.h"
+#include "Read.h"
+#include "DataReader.h"
+#include "FileFormat.h"
+bool readIdsMatch(const char* id0, const char* id1);
+bool readIdsMatch(Read *read0, Read *read1);
+/*
+ * Flags for the SAM file format; see http://samtools.sourceforge.net/SAM1.pdf for details.
+ */
+const int SAM_MULTI_SEGMENT = 0x001; // Read had multiple segments (i.e., paired ends).
+const int SAM_ALL_ALIGNED = 0x002; // All segments of a multi-segment read were aligned.
+const int SAM_UNMAPPED = 0x004; // This segment of the read is unmapped.
+const int SAM_NEXT_UNMAPPED = 0x008; // Next segment of the read is unmapped.
+const int SAM_REVERSE_COMPLEMENT = 0x010; // This segment of the read is reverse complemented.
+const int SAM_NEXT_REVERSED = 0x020; // Next segment of the read is reverse complemented.
+const int SAM_FIRST_SEGMENT = 0x040; // This is the first segment in the read.
+const int SAM_LAST_SEGMENT = 0x080; // This is the last segment in the read.
+const int SAM_SECONDARY = 0x100; // Secondary alignment for a read with multiple hits.
+const int SAM_FAILED_QC = 0x200; // Not passing quality controls.
+const int SAM_DUPLICATE = 0x400; // PCR or optical duplicate.
+const int SAM_SUPPLEMENTARY = 0x800; // Supplementary alignment
+class SAMReader : public ReadReader { // ReadReader that produces reads (and optionally their alignment info) from SAM text supplied by a DataReader.
+ virtual ~SAMReader() {}
+ SAMReader(DataReader* i_data, const ReaderContext& i_context);
+ virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess);
+ virtual bool getNextRead(Read *readToUpdate);
+ virtual bool getNextRead(Read *read, AlignmentResult *alignmentResult, GenomeLocation *genomeLocation, Direction *direction, unsigned *mapQ,
+ unsigned *flag, const char **cigar)
+ {
+ return getNextRead(read, alignmentResult, genomeLocation, direction, mapQ, flag, false, cigar); // forwards to the overload below with ignoreEndOfRange == false
+ }
+ virtual void holdBatch(DataBatch batch)
+ { data->holdBatch(batch); } // delegates to the underlying DataReader
+ virtual bool releaseBatch(DataBatch batch)
+ { return data->releaseBatch(batch); } // delegates to the underlying DataReader
+ static SAMReader* create(DataSupplier* supplier, const char *fileName,
+ int bufferCount, const ReaderContext& i_context,
+ _int64 startingOffset, _int64 amountOfFileToProcess);
+ static PairedReadReader* createPairedReader(const DataSupplier* supplier,
+ const char *fileName, int bufferCount, _int64 startingOffset, _int64 amountOfFileToProcess,
+ bool quicklyDropUnpairedReads, const ReaderContext& context);
+ static ReadSupplierGenerator *createReadSupplierGenerator(
+ const char *fileName, int numThreads, const ReaderContext& context);
+ static PairedReadSupplierGenerator *createPairedReadSupplierGenerator(
+ const char *fileName, int numThreads, bool quicklyDropUnpairedReads, const ReaderContext& context);
+ // NOTE(review): parseHeader takes no result/fieldLengths arguments; the size requirement (arrays of nSAMFields entries) presumably applies to parseLine below.
+ static bool parseHeader(const char *fileName, char *firstLine, char *endOfBuffer, const Genome *genome, _int64 *o_headerSize, bool* o_headerMatchesIndex, bool *o_sawWholeHeader = NULL);
+ static char* skipToBeyondNextFieldSeparator(char *str, const char *endOfBuffer, size_t *o_charsUntilFirstSeparator = NULL);
+ //
+ // 0-based field numbers for the fields within a SAM line.
+ //
+ static const unsigned QNAME = 0;
+ static const unsigned FLAG = 1;
+ static const unsigned RNAME = 2;
+ static const unsigned POS = 3;
+ static const unsigned MAPQ = 4;
+ static const unsigned CIGAR = 5;
+ static const unsigned RNEXT = 6;
+ static const unsigned PNEXT = 7;
+ static const unsigned TLEN = 8;
+ static const unsigned SEQ = 9;
+ static const unsigned QUAL = 10;
+ static const unsigned OPT = 11;
+ static const unsigned nSAMFields = 12; // number of field indices defined above (QNAME..OPT)
+ static const int maxLineLen = MAX_READ_LENGTH * 5; // line length limit, defined as five times the maximum read length
+ static bool parseLine(char *line, char *endOfBuffer, char *result[],
+ size_t *lineLength, size_t fieldLengths[]);
+ static void parseContigName(const Genome* genome, char* contigName,
+ size_t contigNameBufferSize, GenomeLocation * o_locationOfContig, int* o_indexOfContig,
+ char* field[], size_t fieldLength[], unsigned rfield = RNAME); // rfield: index into field[] of the contig name (defaults to RNAME)
+ static GenomeLocation parseLocation(GenomeLocation locationOfContig, char* field[], size_t fieldLength[], unsigned rfield = RNAME, unsigned posfield = POS);
+ virtual bool getNextRead(Read *read, AlignmentResult *alignmentResult,
+ GenomeLocation *genomeLocation, Direction *direction, unsigned *mapQ, unsigned *flag, bool ignoreEndOfRange, const char **cigar);
+ static void getReadFromLine(const Genome *genome, char *line, char *endOfBuffer, Read *read, AlignmentResult *alignmentResult,
+ GenomeLocation *genomeLocation, Direction *direction, unsigned *mapQ,
+ size_t *lineLength, unsigned *flag, const char **cigar, ReadClippingType clipping);
+ void readHeader(const char* fileName);
+ bool skipPartialHeader(_int64 *o_headerBytes);
+ void init(const char *fileName, _int64 startingOffset, _int64 amountOfFileToProcess);
+ DataReader* data; // underlying data source; holdBatch/releaseBatch are forwarded to it
+ _int64 headerSize;
+ ReadClippingType clipping;
+ bool didInitialSkip; // Have we skipped to the beginning of the first SAM line? We may start in the middle of one.
+ friend class SAMFormat; // SAMFormat is granted access to this class's non-public members
+class SAMFormat : public FileFormat // FileFormat implementation for SAM; useM selects whether cigar strings use M or =/X.
+ SAMFormat(bool i_useM) : useM(i_useM) {}
+ virtual void getSortInfo(const Genome* genome, char* buffer, _int64 bytes, GenomeLocation* o_location, GenomeDistance* o_readBytes, int* o_refID, int* o_pos) const;
+ virtual void setupReaderContext(AlignerOptions* options, ReaderContext* readerContext) const
+ { FileFormat::setupReaderContext(options, readerContext, false); } // forwards to the base class with a final argument of false
+ virtual ReadWriterSupplier* getWriterSupplier(AlignerOptions* options, const Genome* genome) const;
+ virtual bool writeHeader(
+ const ReaderContext& context, char *header, size_t headerBufferSize, size_t *headerActualSize,
+ bool sorted, int argc, const char **argv, const char *version, const char *rgLine, bool omitSQLines) const;
+ virtual bool writeRead(
+ const ReaderContext& context, LandauVishkinWithCigar * lv, char * buffer, size_t bufferSpace,
+ size_t * spaceUsed, size_t qnameLen, Read * read, AlignmentResult result,
+ int mapQuality, GenomeLocation genomeLocation, Direction direction, bool secondaryAlignment, int* o_addFrontClipping,
+ bool hasMate = false, bool firstInPair = false, Read * mate = NULL,
+ AlignmentResult mateResult = NotFound, GenomeLocation mateLocation = 0, Direction mateDirection = FORWARD,
+ bool alignedAsPair = false) const;
+ // Calculate the data needed to write a SAM/BAM record.
+ // The argument list is very long because this was extracted from the
+ // original SAM record writing routine so that it could be shared with BAM.
+ static bool
+ createSAMLine(
+ const Genome * genome,
+ LandauVishkinWithCigar * lv,
+ // output data
+ char* data,
+ char* quality,
+ GenomeDistance dataSize,
+ const char*& contigName,
+ int& contigIndex,
+ int& flags,
+ GenomeDistance& positionInContig,
+ int& mapQuality,
+ const char*& mateContigName,
+ int& mateContigIndex,
+ GenomeDistance& matePositionInContig,
+ _int64& templateLength,
+ unsigned& fullLength,
+ const char*& clippedData,
+ unsigned& clippedLength,
+ unsigned& basesClippedBefore,
+ unsigned& basesClippedAfter,
+ // input data
+ size_t& qnameLen,
+ Read * read,
+ AlignmentResult result,
+ GenomeLocation genomeLocation,
+ Direction direction,
+ bool secondaryAlignment,
+ bool useM,
+ bool hasMate,
+ bool firstInPair,
+ bool alignedAsPair,
+ Read * mate,
+ AlignmentResult mateResult,
+ GenomeLocation mateLocation,
+ Direction mateDirection,
+ GenomeDistance *extraBasesClippedBefore);
+ static void computeCigar(CigarFormat cigarFormat, const Genome * genome, LandauVishkinWithCigar * lv,
+ char * cigarBuf, int cigarBufLen,
+ const char * data, GenomeDistance dataLength, unsigned basesClippedBefore, GenomeDistance extraBasesClippedBefore, unsigned basesClippedAfter,
+ GenomeDistance *o_extraBasesClippedAfter,
+ GenomeLocation genomeLocation, bool useM, int * o_editDistance, int *o_cigarBufUsed, int * o_addFrontClipping);
+ static const char * computeCigarString(const Genome * genome, LandauVishkinWithCigar * lv,
+ char * cigarBuf, int cigarBufLen, char * cigarBufWithClipping, int cigarBufWithClippingLen,
+ const char * data, GenomeDistance dataLength, unsigned basesClippedBefore, GenomeDistance extraBasesClippedBefore, unsigned basesClippedAfter,
+ unsigned frontHardClipped, unsigned backHardClipped,
+ GenomeLocation genomeLocation, Direction direction, bool useM, int * o_editDistance, int * o_addFrontClipping);
+#ifdef _DEBUG
+ static void validateCigarString(const Genome *genome, const char * cigarBuf, int cigarBufLen, const char *data, GenomeDistance dataLength, GenomeLocation genomeLocation, Direction direction, bool useM);
+#else // !_DEBUG
+ inline static void validateCigarString(const Genome *genome, const char * cigarBuf, int cigarBufLen, const char *data, GenomeDistance dataLength, GenomeLocation genomeLocation, Direction direction, bool useM) {} // validation is a no-op when _DEBUG is not defined
+#endif // _DEBUG
+ const bool useM; // initialized from the constructor argument i_useM
diff --git a/SNAPLib/SNAPLib.vcxproj b/SNAPLib/SNAPLib.vcxproj
new file mode 100644
index 0000000..7eb93b2
--- /dev/null
+++ b/SNAPLib/SNAPLib.vcxproj
@@ -0,0 +1,276 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>SNAPLib</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <OutDir>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)obj\obj\snaplib\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <OutDir>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)obj\obj\snaplib\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <OutDir>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)obj\obj\snaplib\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <OutDir>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)obj\obj\snaplib\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Full</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\import\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_LIB;SNAP_HDFS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\import\;..\import\pdclibhdfs\inc</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>Full</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+ <AdditionalIncludeDirectories>..\import\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>Full</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions);SNAP_HDFS</PreprocessorDefinitions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
+ <AdditionalIncludeDirectories>..\import\;..\import\pdclibhdfs\inc</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="AlignerContext.h" />
+ <ClInclude Include="AlignerOptions.h" />
+ <ClInclude Include="AlignerStats.h" />
+ <ClInclude Include="AlignmentResult.h" />
+ <ClInclude Include="ApproximateCounter.h" />
+ <ClInclude Include="Bam.h" />
+ <ClInclude Include="BaseAligner.h" />
+ <ClInclude Include="BigAlloc.h" />
+ <ClInclude Include="BufferedAsync.h" />
+ <ClInclude Include="ChimericPairedEndAligner.h" />
+ <ClInclude Include="CommandProcessor.h" />
+ <ClInclude Include="Compat.h" />
+ <ClInclude Include="DataReader.h" />
+ <ClInclude Include="DataWriter.h" />
+ <ClInclude Include="directions.h" />
+ <ClInclude Include="Error.h" />
+ <ClInclude Include="exit.h" />
+ <ClInclude Include="FASTA.h" />
+ <ClInclude Include="FASTQ.h" />
+ <ClInclude Include="FileFormat.h" />
+ <ClInclude Include="FixedSizeMap.h" />
+ <ClInclude Include="FixedSizeSet.h" />
+ <ClInclude Include="FixedSizeVector.h" />
+ <ClInclude Include="GenericFile.h" />
+ <ClInclude Include="GenericFile_Blob.h" />
+ <ClInclude Include="GenericFile_HDFS.h" />
+ <ClInclude Include="GenericFile_map.h" />
+ <ClInclude Include="GenericFile_stdio.h" />
+ <ClInclude Include="Genome.h" />
+ <ClInclude Include="GenomeIndex.h" />
+ <ClInclude Include="GzipDataWriter.h" />
+ <ClInclude Include="HashTable.h" />
+ <ClInclude Include="Histogram.h" />
+ <ClInclude Include="IntersectingPairedEndAligner.h" />
+ <ClInclude Include="LandauVishkin.h" />
+ <ClInclude Include="mapq.h" />
+ <ClInclude Include="MultiInputReadSupplier.h" />
+ <ClInclude Include="options.h" />
+ <ClInclude Include="PairedAligner.h" />
+ <ClInclude Include="PairedEndAligner.h" />
+ <ClInclude Include="ParallelTask.h" />
+ <ClInclude Include="PriorityQueue.h" />
+ <ClInclude Include="ProbabilityDistance.h" />
+ <ClInclude Include="RangeSplitter.h" />
+ <ClInclude Include="Read.h" />
+ <ClInclude Include="ReadSupplierQueue.h" />
+ <ClInclude Include="SAM.h" />
+ <ClInclude Include="Seed.h" />
+ <ClInclude Include="SeedSequencer.h" />
+ <ClInclude Include="SingleAligner.h" />
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="Tables.h" />
+ <ClInclude Include="targetver.h" />
+ <ClInclude Include="Util.h" />
+ <ClInclude Include="VariableSizeMap.h" />
+ <ClInclude Include="VariableSizeVector.h" />
+ <ClInclude Include="WindowsFileMapper.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="AlignerContext.cpp" />
+ <ClCompile Include="AlignerOptions.cpp" />
+ <ClCompile Include="AlignerStats.cpp" />
+ <ClCompile Include="AlignmentResult.cpp" />
+ <ClCompile Include="ApproximateCounter.cpp" />
+ <ClCompile Include="Bam.cpp" />
+ <ClCompile Include="BaseAligner.cpp" />
+ <ClCompile Include="BiasTables.cpp" />
+ <ClCompile Include="BigAlloc.cpp" />
+ <ClCompile Include="BufferedAsync.cpp" />
+ <ClCompile Include="ChimericPairedEndAligner.cpp" />
+ <ClCompile Include="CommandProcessor.cpp" />
+ <ClCompile Include="Compat.cpp" />
+ <ClCompile Include="DataReader.cpp" />
+ <ClCompile Include="DataWriter.cpp" />
+ <ClCompile Include="Error.cpp" />
+ <ClCompile Include="exit.cpp" />
+ <ClCompile Include="FASTA.cpp" />
+ <ClCompile Include="FASTQ.cpp" />
+ <ClCompile Include="GenericFile.cpp" />
+ <ClCompile Include="GenericFile_Blob.cpp" />
+ <ClCompile Include="GenericFile_HDFS.cpp" />
+ <ClCompile Include="GenericFile_map.cpp" />
+ <ClCompile Include="GenericFile_stdio.cpp" />
+ <ClCompile Include="Genome.cpp" />
+ <ClCompile Include="GenomeIndex.cpp" />
+ <ClCompile Include="GzipDataWriter.cpp" />
+ <ClCompile Include="HashTable.cpp" />
+ <ClCompile Include="Histogram.cpp" />
+ <ClCompile Include="IntersectingPairedEndAligner.cpp" />
+ <ClCompile Include="LandauVishkin.cpp" />
+ <ClCompile Include="mapq.cpp" />
+ <ClCompile Include="MultiInputReadSupplier.cpp" />
+ <ClCompile Include="PairedAligner.cpp" />
+ <ClCompile Include="PairedReadMatcher.cpp" />
+ <ClCompile Include="ParallelTask.cpp" />
+ <ClCompile Include="ProbabilityDistance.cpp" />
+ <ClCompile Include="RangeSplitter.cpp" />
+ <ClCompile Include="Read.cpp" />
+ <ClCompile Include="ReadReader.cpp" />
+ <ClCompile Include="ReadSupplierQueue.cpp" />
+ <ClCompile Include="ReadWriter.cpp" />
+ <ClCompile Include="SAM.cpp" />
+ <ClCompile Include="Seed.cpp" />
+ <ClCompile Include="SeedSequencer.cpp" />
+ <ClCompile Include="SingleAligner.cpp" />
+ <ClCompile Include="SortedDataWriter.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="Tables.cpp" />
+ <ClCompile Include="Util.cpp" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/SNAPLib/SNAPLib.vcxproj.filters b/SNAPLib/SNAPLib.vcxproj.filters
new file mode 100644
index 0000000..18fe612
--- /dev/null
+++ b/SNAPLib/SNAPLib.vcxproj.filters
@@ -0,0 +1,342 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="AlignerContext.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="AlignerOptions.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="AlignerStats.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="ApproximateCounter.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Bam.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="BaseAligner.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="BigAlloc.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="BufferedAsync.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Compat.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="DataReader.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="DataWriter.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="directions.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="exit.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="FASTA.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="FASTQ.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="FileFormat.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="FixedSizeMap.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="FixedSizeSet.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="FixedSizeVector.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Genome.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GenomeIndex.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GzipDataWriter.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="HashTable.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Histogram.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="IntersectingPairedEndAligner.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="LandauVishkin.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="mapq.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="MultiInputReadSupplier.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="options.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="PairedAligner.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="PairedEndAligner.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="ParallelTask.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="ProbabilityDistance.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="RangeSplitter.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Read.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="ReadSupplierQueue.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="SAM.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Seed.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="SeedSequencer.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="SingleAligner.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Tables.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Util.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="VariableSizeMap.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="VariableSizeVector.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="WindowsFileMapper.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="PriorityQueue.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="ChimericPairedEndAligner.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="Error.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GenericFile.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GenericFile_HDFS.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GenericFile_stdio.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GenericFile_Blob.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="AlignmentResult.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GenericFile_map.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="CommandProcessor.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="AlignerContext.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="AlignerOptions.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="AlignerStats.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ApproximateCounter.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Bam.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="BaseAligner.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="BiasTables.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="BigAlloc.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="BufferedAsync.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Compat.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="DataReader.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="DataWriter.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="exit.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="FASTA.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="FASTQ.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Genome.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GenomeIndex.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GzipDataWriter.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="HashTable.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Histogram.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="IntersectingPairedEndAligner.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="LandauVishkin.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="mapq.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="MultiInputReadSupplier.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="PairedAligner.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="PairedReadMatcher.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ProbabilityDistance.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="RangeSplitter.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ReadReader.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ReadSupplierQueue.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ReadWriter.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="SAM.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Seed.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="SingleAligner.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="SortedDataWriter.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Tables.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Util.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Read.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ChimericPairedEndAligner.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="SeedSequencer.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ParallelTask.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Error.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GenericFile.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GenericFile_HDFS.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GenericFile_stdio.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GenericFile_Blob.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GenericFile_map.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="CommandProcessor.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="AlignmentResult.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/SNAPLib/Seed.cpp b/SNAPLib/Seed.cpp
new file mode 100644
index 0000000..ff253b3
--- /dev/null
+++ b/SNAPLib/Seed.cpp
@@ -0,0 +1,56 @@
+Module Name:
+ Seed.cpp
+ Code to handle seeds in the SNAP sequencer.
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#include "stdafx.h"
+#include "Seed.h"
+ bool
+Seed::DoesTextRepresentASeed(const char *textBases, unsigned seedLen) // True iff each of the first seedLen characters is exactly 'A', 'C', 'G' or 'T'.
+ for (unsigned i = 0; i < seedLen; i++) {
+ switch (textBases[i]) {
+ case 'A':
+ case 'G':
+ case 'C':
+ case 'T':
+ break; // Upper-case base: acceptable, keep scanning.
+ default: return false; // Anything else (e.g. 'N' or a lower-case letter) disqualifies the seed.
+ }
+ }
+ return true; // Also reached when seedLen is 0, since the loop body never runs.
+ Seed
+ _int64 bases, // Seed already packed at 2 bits per base.
+ int seedLength) // Number of 2-bit bases to read from `bases`.
+ _int64 rc = 0;
+ _int64 b = bases;
+ for (int i = 0; i < seedLength; i++) {
+ rc = (rc << 2) | ((b & 3) ^ 3); // Take the lowest 2-bit base, complement it (XOR 3) and append it to rc, which reverses the base order.
+ b = b >> 2; // Move on to the next base up.
+ }
+ return Seed(bases, rc); // The two-argument constructor stores rc as the reverse complement.
\ No newline at end of file
diff --git a/SNAPLib/Seed.h b/SNAPLib/Seed.h
new file mode 100644
index 0000000..10eeb6a
--- /dev/null
+++ b/SNAPLib/Seed.h
@@ -0,0 +1,200 @@
+Module Name:
+ Seed.h
+ Headers for code to handle seeds in the SNAP sequencer.
+ Bill Bolosky, August, 2011
+ User mode service.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#include "Compat.h"
+#include "Tables.h"
+#include "Util.h"
+const unsigned LargestSeedSize = 32;
+struct Seed {
+ //
+ // We exclude seeds with "N" in them. This checks for that (or other garbage).
+ //
+ static bool DoesTextRepresentASeed(const char *textBases, unsigned seedLen);
+ inline Seed(const char *textBases, unsigned seedLen) // Packs seedLen text bases at 2 bits each and builds the reverse complement in the same pass.
+ {
+ bases = 0;
+ reverseComplement = 0;
+ for (unsigned i = 0; i < seedLen; i++) {
+ _uint64 encodedBase = BASE_VALUE[textBases[i]]; // NOTE(review): indexed with a plain char; presumably callers pass only ASCII bases -- confirm against the BASE_VALUE table.
+ _ASSERT(255 != encodedBase); // NOTE(review): 255 looks like the table's "not a base" marker -- confirm in Tables.h.
+ bases |= encodedBase << ((seedLen - i - 1) * 2); // The first text base lands in the highest-order 2-bit slot.
+ reverseComplement |= (encodedBase ^ 0x3) << (i * 2); // The complemented base goes in the mirrored slot (first base -> lowest slot).
+ }
+ }
+ inline Seed() {}
+ inline Seed(_int64 i_bases, _int64 i_reverseComplement)
+ : bases(i_bases), reverseComplement(i_reverseComplement)
+ {
+ }
+ inline _uint64 getLowBases(unsigned keySizeInBytes) const { // Returns the low keySizeInBytes bytes of the packed bases as an unsigned 64-bit value.
+ if (keySizeInBytes == 8) {
+ return bases; // Whole word; presumably special-cased because the mask below would need a 64-bit shift.
+ } else {
+ return bases & (((_uint64)1 << (keySizeInBytes * 8)) - 1); // Mask off everything above the low keySizeInBytes bytes.
+ }
+ }
+ inline unsigned getHighBases(unsigned keySizeInBytes) const { // Returns any high bases (the bits above the low keySizeInBytes bytes) as an unsigned. If seedLen <= 16, returns 0.
+ if (keySizeInBytes == 8) {
+ return 0; // The low part already covers the whole 64-bit word.
+ } else {
+ return (unsigned)(bases >> (keySizeInBytes * 8)); // The cast keeps only as many bits as fit in an unsigned.
+ }
+ }
+ inline _uint64 getBases() const {
+ return bases;
+ }
+ inline _uint64 getRCBases() const {
+ return reverseComplement;
+ }
+ inline Seed operator~() const
+ //
+ // Compute the reverse complement of this. We just copy it from when our constructor ran.
+ //
+ {
+ Seed rc;
+ rc.bases = reverseComplement;
+ rc.reverseComplement = bases;
+ return rc;
+ }
+ inline bool isBiggerThanItsReverseComplement() const {
+ return bases > reverseComplement;
+ }
+ inline bool isOwnReverseComplement() const {
+ return bases == reverseComplement;
+ }
+ inline bool operator>(Seed &peer) const { // All six comparisons look only at `bases`; reverseComplement is never compared.
+ return bases > peer.bases;
+ }
+ inline bool operator>=(Seed &peer) const { // NOTE(review): peer is a non-const reference in all six operators, so a const Seed or a temporary cannot be the right-hand operand.
+ return bases >= peer.bases;
+ }
+ inline bool operator<(Seed &peer) const {
+ return bases < peer.bases;
+ }
+ inline bool operator<=(Seed &peer) const {
+ return bases <= peer.bases;
+ }
+ inline bool operator==(Seed &peer) const {
+ return bases == peer.bases;
+ }
+ inline bool operator!=(Seed &peer) const {
+ return bases != peer.bases;
+ }
+ inline void setBase(int i, int seedLen, int value) { // Overwrites base i (0 = first text base) with the 2-bit `value`, keeping reverseComplement in step.
+ int shift = (seedLen - i - 1) * 2; // Slot of base i in `bases`: the first base sits in the highest slot.
+ _int64 mask = (_int64) 3 << shift;
+ bases = (bases & ~mask) | (((_int64) value << shift) & mask); // Clear the slot, then insert the new value.
+ int shift2 = i * 2; // Mirrored slot in reverseComplement.
+ _int64 mask2 = (_int64) 3 << shift2;
+ reverseComplement = (reverseComplement & ~mask2) | (((_int64) (value ^ 3) << shift2) & mask2); // Store the complemented value (XOR 3).
+ }
+ inline int getBase(int i, int seedLen) { // Returns the 2-bit value of base i, where i == 0 is the first (highest-order) base.
+ return (int) (bases >> ((seedLen - i - 1) * 2)) & 3; // The cast applies before the "& 3"; the result is the same because only the low 2 bits are kept.
+ }
+ inline void shiftIn(int b, int seedLen) { // Slides the seed one base along: drops the first base and appends base b (low 2 bits) at the end.
+ int shift = (seedLen - 1) * 2; // Bit position of the highest-order base slot.
+ bases = ((bases << 2) & ~((_uint64) 3 << (shift + 2))) | (b & 3); // The mask clears the base pushed past slot seedLen-1. NOTE(review): for seedLen == 32 the mask shift count is 64, which is undefined for a 64-bit operand -- confirm.
+ reverseComplement = ((_uint64) reverseComplement >> 2) | (((_uint64) (b ^ 3) & 3) << shift); // The complement of b enters at the top slot as the oldest entry falls off the bottom.
+ }
+ inline void toString(char* o_bases, int seedLength) { // Writes seedLength characters, first base first, into o_bases; no terminator is written here.
+ for (int i = (seedLength - 1) * 2; i >= 0; i -= 2) { // i is the bit offset of each base, walking from the highest slot down to 0.
+ *o_bases++ = VALUE_BASE[(bases >> i) & 3]; // Translate the 2-bit code through the VALUE_BASE table.
+ }
+ }
+ static Seed fromBases(_int64 bases, int seedLength);
+ inline _uint64 hash64() // Hashes the smaller of bases and reverseComplement, so a seed and its reverse complement get the same value.
+ {
+ return 1+util::hash64(min(bases, reverseComplement)); // NOTE(review): the purpose of the +1 is not visible here (presumably to steer away from a reserved value) -- confirm.
+ }
+ inline unsigned hash()
+ {
+ return (unsigned) hash64();
+ }
+ static _uint64 hash64(const char* sequence, int length) // Orientation-independent hash of a text sequence: hashes whichever of the sequence / its reverse complement compares smaller.
+ {
+ if (length <= MaxBases) { // Short enough to pack into a Seed, so reuse the packed hash.
+ Seed s(sequence, length);
+ return s.hash64();
+ } else {
+ // string compare seq & reverse, hash smallest one
+ for (int i = 0; i < length / 2; i++) { // Only the first half is examined; the first differing position decides.
+ char c = sequence[i];
+ char r = COMPLEMENT[sequence[length - 1 - i]]; // Character i of the reverse complement.
+ if (c < r) {
+ return util::hash(sequence, length); // NOTE(review): this path calls util::hash while the branch below calls util::hash64; if the two differ, a sequence and its reverse complement hash differently -- confirm in Util.h.
+ } else if (r < c) {
+ char* rc = (char*) alloca(length); // Stack buffer of `length` bytes, released when this function returns.
+ util::toComplement(rc, sequence, length); // NOTE(review): presumably fills rc with the reverse complement (the comparison above is against the reversed complement) -- confirm.
+ return util::hash64(rc, length);
+ }
+ }
+ return util::hash(sequence, length); // rc palindrome
+ }
+ }
+ static const int MaxBases = 32;
+ _uint64 bases;
+ //
+ // Since we pretty much always compute the reverse complement of a seed, we just keep it
+ // here. That way we only execute the loop once: when the constructor runs.
+ //
+ _uint64 reverseComplement;
diff --git a/SNAPLib/SeedSequencer.cpp b/SNAPLib/SeedSequencer.cpp
new file mode 100644
index 0000000..b27e5b2
--- /dev/null
+++ b/SNAPLib/SeedSequencer.cpp
@@ -0,0 +1,109 @@
+Module Name:
+ SeedSequencer.cpp
+ Code for determining the order of seeds in a read.
+ Bill Bolosky, August, 2013
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "SeedSequencer.h"
+static SeedSequencer *Sequencers[LargestSeedSize + 1];
+void InitializeSeedSequencers() // Builds one SeedSequencer for each seed size from 1 to LargestSeedSize; Sequencers[0] is never assigned.
+ for (unsigned i = 1; i <= LargestSeedSize; i++) {
+ Sequencers[i] = new SeedSequencer(i); // No matching delete appears in the code visible here.
+ }
+SeedSequencer::SeedSequencer(unsigned i_seedSize) : seedSize(i_seedSize) // Fills offsets[] by repeatedly bisecting the range 1..seedSize-1, breadth first.
+ offsets = new unsigned[seedSize];
+ for (unsigned i = 0; i < seedSize; i++) {
+ offsets[i] = 0; // 0 doubles as "not yet assigned" for positions >= 1 (see the assert in the loop).
+ }
+ if (seedSize == 1) return; // Not that seed size 1 makes any sense or is in any way supported. But it's in the array, so we generate it.
+ struct WorkItem { // One still-unassigned inclusive range [lowerBound, upperBound], chained into a FIFO list.
+ unsigned lowerBound;
+ unsigned upperBound;
+ WorkItem *next;
+ };
+ unsigned nFilledOffsets = 1; // Position 0 keeps value 0 and counts as already filled.
+ WorkItem *workItems = new WorkItem; // Head of the queue, seeded with the full range.
+ WorkItem *tail = NULL; // Stays NULL until the first item is appended after a pop.
+ workItems->next = NULL;
+ workItems->lowerBound = 1;
+ workItems->upperBound = seedSize - 1;
+ while (NULL != workItems) {
+ WorkItem *itemToProcess = workItems; // Pop the head of the queue.
+ workItems = itemToProcess->next;
+ if (NULL == workItems) {
+ tail = NULL; // The queue is now empty.
+ }
+ unsigned selectedLocation = (itemToProcess->lowerBound + itemToProcess->upperBound) / 2; // Midpoint, rounded down.
+ _ASSERT(offsets[selectedLocation] == 0);
+ offsets[selectedLocation] = nFilledOffsets; // Stores the visit order at the chosen position. NOTE(review): the disabled switch table in SeedSequencer.h maps wrapCount -> position instead -- confirm which direction is intended.
+ nFilledOffsets++;
+ //
+ // Add the upper half to the tail. It will be the larger of the two if they're not equal, since / 2 rounds down.
+ //
+ if (itemToProcess->upperBound > selectedLocation) {
+ WorkItem *upperItem = new WorkItem;
+ upperItem->next = NULL;
+ upperItem->upperBound = itemToProcess->upperBound;
+ upperItem->lowerBound = selectedLocation + 1;
+ //
+ // Add it to the tail.
+ //
+ if (NULL != tail) {
+ tail->next = upperItem;
+ tail = upperItem;
+ } else {
+ _ASSERT(workItems == NULL);
+ workItems = tail = upperItem;
+ }
+ }
+ if (itemToProcess->lowerBound < selectedLocation) {
+ _ASSERT(workItems != NULL); // We must have already added an upper half.
+ itemToProcess->upperBound = selectedLocation - 1; // Reuse the popped node for the lower half rather than allocating a new one.
+ itemToProcess->next = NULL;
+ tail->next = itemToProcess;
+ tail = itemToProcess;
+ } else {
+ delete itemToProcess; // No lower half remains, so the node is finished with.
+ }
+ }
+ _ASSERT(nFilledOffsets == seedSize); // Every position 1..seedSize-1 was assigned exactly once.
+unsigned GetWrappedNextSeedToTest(unsigned seedLen, unsigned wrapCount) // Looks wrapCount up in the sequencer built for seedLen by InitializeSeedSequencers().
+ _ASSERT(seedLen <= LargestSeedSize); // NOTE(review): seedLen == 0 passes this check although Sequencers[0] is never assigned.
+ return Sequencers[seedLen]->GetWrappedNextSeedToTest(wrapCount);
diff --git a/SNAPLib/SeedSequencer.h b/SNAPLib/SeedSequencer.h
new file mode 100644
index 0000000..cc12fdf
--- /dev/null
+++ b/SNAPLib/SeedSequencer.h
@@ -0,0 +1,388 @@
+Module Name:
+ SeedSequencer.h
+ Code for determining the order of seeds in a read.
+ Bill Bolosky, February, 2013
+ User mode service.
+Revision History:
+ Factored out of BaseAligner
+#pragma once
+#include "exit.h"
+#include "Seed.h"
+#include "Error.h"
+class SeedSequencer { // Holds a per-seed-size lookup table (offsets) that is filled in by the constructor.
+ SeedSequencer(unsigned i_seedSize);
+ inline unsigned SeedOffset(unsigned wrapCount) { // Same lookup as GetWrappedNextSeedToTest below.
+ _ASSERT(wrapCount < seedSize);
+ return offsets[wrapCount];
+ }
+ inline unsigned GetWrappedNextSeedToTest(unsigned wrapCount) { // Returns offsets[wrapCount]; the bound is checked by the assert only.
+ _ASSERT(wrapCount < seedSize);
+ return(offsets[wrapCount]);
+ }
+ unsigned seedSize; // Number of entries in offsets.
+ unsigned *offsets; // Allocated with new[] in the constructor.
+void InitializeSeedSequencers();
+unsigned GetWrappedNextSeedToTest(unsigned seedLen, unsigned wrapCount) // Declaration only while the "#if 1" branch below is active.
+#if 1
+ ; // The definition is in SeedSequencer.cpp
+#else // 1 -- old style with switch/case (compiled out by the "#if 1" above)
+ if (0 == wrapCount) {
+ return 0;
+ }
+ switch (seedLen) {
+ case 32: {
+ switch (wrapCount) {
+ case 1: return 16;
+ case 2: return 8;
+ case 3: return 24;
+ case 4: return 4;
+ case 5: return 20;
+ case 6: return 12;
+ case 7: return 28;
+ case 8: return 6;
+ case 9: return 18;
+ case 10: return 10;
+ case 11: return 26;
+ case 12: return 2;
+ case 13: return 14;
+ case 14: return 22;
+ case 15: return 30;
+ case 16: return 3;
+ case 17: return 17;
+ case 18: return 9;
+ case 19: return 23;
+ case 20: return 29;
+ case 21: return 7;
+ case 22: return 19;
+ case 23: return 27;
+ case 24: return 5;
+ case 25: return 25;
+ case 26: return 11;
+ case 27: return 21;
+ case 28: return 31;
+ case 29: return 15;
+ case 30: return 1;
+ case 31: return 13;
+ }
+ }
+ case 31: {
+ switch (wrapCount) {
+ case 1: return 15;
+ case 2: return 23;
+ case 3: return 8;
+ case 4: return 19;
+ case 5: return 4;
+ case 6: return 27;
+ case 7: return 11;
+ case 8: return 17;
+ case 9: return 6;
+ case 10: return 25;
+ case 11: return 2;
+ case 12: return 29;
+ case 13: return 13;
+ case 14: return 21;
+ case 15: return 9;
+ case 16: return 24;
+ case 17: return 3;
+ case 18: return 18;
+ case 19: return 10;
+ case 20: return 26;
+ case 21: return 5;
+ case 22: return 30;
+ case 23: return 14;
+ case 24: return 7;
+ case 25: return 22;
+ case 26: return 1;
+ case 27: return 16;
+ case 28: return 28;
+ case 29: return 12;
+ case 30: return 20;
+ }
+ }
+ case 25: {
+ switch (wrapCount) {
+ case 1: return 13;
+ case 2: return 6;
+ case 3: return 19;
+ case 4: return 3;
+ case 5: return 16;
+ case 6: return 22;
+ case 7: return 9;
+ case 8: return 11;
+ case 9: return 1;
+ case 10: return 14;
+ case 11: return 7;
+ case 12: return 20;
+ case 13: return 4;
+ case 14: return 17;
+ case 15: return 23;
+ case 16: return 2;
+ case 17: return 15;
+ case 18: return 5;
+ case 19: return 21;
+ case 20: return 8;
+ case 21: return 24;
+ case 22: return 10;
+ case 23: return 18;
+ case 24: return 12;
+ }
+ }
+ case 24:{
+ switch (wrapCount) {
+ case 1: return 12;
+ case 2: return 6;
+ case 3: return 18;
+ case 4: return 3;
+ case 5: return 15;
+ case 6: return 21;
+ case 7: return 9;
+ case 8: return 1;
+ case 9: return 13;
+ case 10: return 19;
+ case 11: return 7;
+ case 12: return 16;
+ case 13: return 4;
+ case 14: return 22;
+ case 15: return 10;
+ case 16: return 2;
+ case 17: return 14;
+ case 18: return 20;
+ case 19: return 5;
+ case 20: return 17;
+ case 21: return 8;
+ case 22: return 23;
+ case 23: return 11;
+ }
+ }
+ case 23: { // Part of the compiled-out table: order for seed length 23.
+ switch (wrapCount) {
+ case 1: return 12;
+ case 2: return 6;
+ case 3: return 17;
+ case 4: return 3;
+ case 5: return 9;
+ case 6: return 20;
+ case 7: return 14;
+ case 8: return 1;
+ case 9: return 4; // NOTE(review): 4 is also returned for case 15, and 13 is never returned by this table -- one of the two looks like a typo.
+ case 10: return 7;
+ case 11: return 10;
+ case 12: return 15;
+ case 13: return 18;
+ case 14: return 21;
+ case 15: return 4; // NOTE(review): duplicate of case 9 (see the note there).
+ case 16: return 2;
+ case 17: return 5;
+ case 18: return 11;
+ case 19: return 16;
+ case 20: return 19;
+ case 21: return 22;
+ case 22: return 8;
+ }
+ }
+ case 22: {
+ switch (wrapCount) {
+ case 1: return 11;
+ case 2: return 6;
+ case 3: return 16;
+ case 4: return 3;
+ case 5: return 9;
+ case 6: return 14;
+ case 7: return 19;
+ case 8: return 2;
+ case 9: return 7;
+ case 10: return 12;
+ case 11: return 17;
+ case 12: return 20;
+ case 13: return 4;
+ case 14: return 1;
+ case 15: return 10;
+ case 16: return 13;
+ case 17: return 15;
+ case 18: return 18;
+ case 19: return 21;
+ case 20: return 5;
+ case 21: return 8;
+ default: _ASSERT(!"NOTREACHED");
+ }
+ }
+ case 21: {
+ switch (wrapCount) {
+ case 1: return 11;
+ case 2: return 6;
+ case 3: return 16;
+ case 4: return 3;
+ case 5: return 9;
+ case 6: return 13;
+ case 7: return 17;
+ case 8: return 18;
+ case 9: return 2;
+ case 10: return 5;
+ case 11: return 8;
+ case 12: return 15;
+ case 13: return 20;
+ case 14: return 1;
+ case 15: return 4;
+ case 16: return 7;
+ case 17: return 10;
+ case 18: return 12;
+ case 19: return 14;
+ case 20: return 19;
+ default: _ASSERT(!"NOTREACHED");
+ }
+ }
+ case 20: {
+ switch (wrapCount) {
+ case 1: return 10;
+ case 2: return 5;
+ case 3: return 15;
+ case 4: return 2;
+ case 5: return 7;
+ case 6: return 12;
+ case 7: return 17;
+ case 8: return 3;
+ case 9: return 9;
+ case 10: return 11;
+ case 11: return 13;
+ case 12: return 19;
+ case 13: return 1;
+ case 14: return 4;
+ case 15: return 6;
+ case 16: return 8;
+ case 17: return 14;
+ case 18: return 18;
+ case 19: return 16;
+ default: _ASSERT(!"NOTREACHED");
+ }
+ }
+ case 19: {
+ switch (wrapCount) {
+ case 1: return 10;
+ case 2: return 4;
+ case 3: return 14;
+ case 4: return 2;
+ case 5: return 6;
+ case 6: return 8;
+ case 7: return 12;
+ case 8: return 16;
+ case 9: return 18;
+ case 10: return 1;
+ case 11: return 3;
+ case 12: return 5;
+ case 13: return 7;
+ case 14: return 9;
+ case 15: return 11;
+ case 16: return 13;
+ case 17: return 15;
+ case 18: return 17;
+ default: _ASSERT(!"NOTREACHED");
+ }
+ }
+ case 18: {
+ switch (wrapCount) {
+ case 1: return 9;
+ case 2: return 4;
+ case 3: return 13;
+ case 4: return 2;
+ case 5: return 6;
+ case 6: return 11;
+ case 7: return 15;
+ case 8: return 1;
+ case 9: return 3;
+ case 10: return 5;
+ case 11: return 7;
+ case 12: return 8;
+ case 13: return 10;
+ case 14: return 12;
+ case 15: return 14;
+ case 16: return 16;
+ case 17: return 17;
+ default: _ASSERT(!"NOTREACHED");
+ }
+ }
+ case 17: {
+ switch (wrapCount) {
+ case 1: return 8;
+ case 2: return 4;
+ case 3: return 12;
+ case 4: return 2;
+ case 5: return 6;
+ case 6: return 10;
+ case 7: return 14;
+ case 8: return 1;
+ case 9: return 3;
+ case 10: return 5;
+ case 11: return 7;
+ case 12: return 9;
+ case 13: return 11;
+ case 14: return 13;
+ case 15: return 15;
+ case 16: return 16;
+ default: _ASSERT(!"NOTREACHED");
+ }
+ }
+ case 16: {
+ switch (wrapCount) {
+ case 1: return 8;
+ case 2: return 4;
+ case 3: return 12;
+ case 4: return 2;
+ case 5: return 6;
+ case 6: return 10;
+ case 7: return 14;
+ case 8: return 1;
+ case 9: return 3;
+ case 10: return 5;
+ case 11: return 7;
+ case 12: return 9;
+ case 13: return 11;
+ case 14: return 13;
+ case 15: return 15;
+ default: _ASSERT(!"NOTREACHED");
+ }
+ } // inner switch
+ default: WriteErrorMessage("SeedSequencer: Not set up to run with this seed size\n"); soft_exit(1);
+ } // outer switch
+ return 0;
+#endif // 1
\ No newline at end of file
diff --git a/SNAPLib/SingleAligner.cpp b/SNAPLib/SingleAligner.cpp
new file mode 100644
index 0000000..9921e72
--- /dev/null
+++ b/SNAPLib/SingleAligner.cpp
@@ -0,0 +1,304 @@
+Module Name:
+ SingleAligner.cpp
+ Functions for running the single end aligner sub-program.
+ Matei Zaharia, February, 2012
+ User mode service.
+Revision History:
+ Adapted from cSNAP, which was in turn adapted from the scala prototype
+#include "stdafx.h"
+#include "options.h"
+#include "BaseAligner.h"
+#include "Compat.h"
+#include "RangeSplitter.h"
+#include "GenomeIndex.h"
+#include "SAM.h"
+#include "Tables.h"
+#include "AlignerContext.h"
+#include "AlignerOptions.h"
+#include "FASTQ.h"
+#include "Util.h"
+#include "SingleAligner.h"
+#include "MultiInputReadSupplier.h"
+using namespace std;
+using util::stringEndsWith;
+SingleAlignerContext::SingleAlignerContext(AlignerExtension* i_extension)
+ : AlignerContext(0, NULL, NULL, i_extension)
+ AlignerStats*
+ return new AlignerStats();
+ void
+ ParallelTask<SingleAlignerContext> task(this);
+ task.run();
+ void
+ PreventMachineHibernationWhileThisThreadIsAlive(); // First statement of the per-thread worker (presumably SingleAlignerContext::runIterationThread; the signature line is missing from this listing).
+ ReadSupplier *supplier = readSupplierGenerator->generateNewReadSupplier();
+ if (NULL == supplier) {
+ //
+ // No work for this thread to do.
+ //
+ return;
+ }
+ if (extension->runIterationThread(supplier, this)) { // A true return from the extension ends this thread's work here.
+ delete supplier;
+ return;
+ }
+ if (index == NULL) {
+ // no alignment, just input/output
+ Read *read;
+ while (NULL != (read = supplier->getNextRead())) {
+ stats->totalReads++;
+ SingleAlignmentResult result; // Every read is reported as NotFound with zero mapq and score.
+ result.status = NotFound;
+ result.direction = FORWARD;
+ result.mapq = 0;
+ result.score = 0;
+ result.location = InvalidGenomeLocation;
+ if (NULL != readWriter && options->passFilter(read, NotFound, false)) {
+ readWriter->writeReads(readerContext, read, &result, 1, true);
+ }
+ }
+ delete supplier;
+ return;
+ }
+ int maxReadSize = MAX_READ_LENGTH;
+ SingleAlignmentResult *alignmentResults = NULL;
+ unsigned alignmentResultBufferCount;
+ if (maxSecondaryAlignmentAdditionalEditDistance < 0) { // A negative value means only the primary result is kept.
+ alignmentResultBufferCount = 1; // For the primary alignment
+ } else {
+ alignmentResultBufferCount = BaseAligner::getMaxSecondaryResults(numSeedsFromCommandLine, seedCoverage, maxReadSize, maxHits, index->getSeedLength()) + 1; // +1 for the primary alignment
+ }
+ size_t alignmentResultBufferSize = sizeof(*alignmentResults) * (alignmentResultBufferCount + 1); // +1 is for primary result -- NOTE(review): alignmentResultBufferCount already includes the primary slot in both branches above, so this reserves one extra entry; confirm that is intended.
+ BigAllocator *allocator = new BigAllocator(BaseAligner::getBigAllocatorReservation(index, true, maxHits, maxReadSize, index->getSeedLength(), numSeedsFromCommandLine, seedCoverage, maxSecondaryAlignmentsPerContig)
+ + alignmentResultBufferSize);
+ BaseAligner *aligner = new (allocator) BaseAligner( // Constructed through the allocator; see the explicit destructor call at the end.
+ index,
+ maxHits,
+ maxDist,
+ maxReadSize,
+ numSeedsFromCommandLine,
+ seedCoverage,
+ minWeightToCheck,
+ extraSearchDepth,
+ noUkkonen,
+ noOrderedEvaluation,
+ noTruncation,
+ maxSecondaryAlignmentsPerContig,
+ NULL, // LV (no need to cache in the single aligner)
+ NULL, // reverse LV
+ stats,
+ allocator);
+ alignmentResults = (SingleAlignmentResult *)allocator->allocate(alignmentResultBufferSize); // The result array comes from the same allocator as the aligner.
+ allocator->checkCanaries();
+ aligner->setExplorePopularSeeds(options->explorePopularSeeds);
+ aligner->setStopOnFirstHit(options->stopOnFirstHit);
+#ifdef _MSC_VER
+ if (options->useTimingBarrier) {
+ if (0 == InterlockedDecrementAndReturnNewValue(nThreadsAllocatingMemory)) { // The thread that brings the counter to zero releases the others.
+ AllowEventWaitersToProceed(memoryAllocationCompleteBarrier);
+ } else {
+ WaitForEvent(memoryAllocationCompleteBarrier);
+ }
+ }
+#endif // _MSC_VER
+ // Align the reads.
+ Read *read;
+ _uint64 lastReportTime = timeInMillis();
+ _uint64 readsWhenLastReported = 0;
+ while (NULL != (read = supplier->getNextRead())) {
+ stats->totalReads++;
+ if (AlignerOptions::useHadoopErrorMessages && stats->totalReads % 10000 == 0 && timeInMillis() - lastReportTime > 10000) { // Progress counter: checked every 10000 reads, emitted when more than 10000 ms have passed.
+ fprintf(stderr,"reporter:counter:SNAP,readsAligned,%lu\n",stats->totalReads - readsWhenLastReported); // NOTE(review): %lu is paired with an expression involving a _uint64; where unsigned long is 32 bits this is a format mismatch -- confirm.
+ readsWhenLastReported = stats->totalReads;
+ lastReportTime = timeInMillis();
+ }
+ // Skip the read if it has too many Ns or trailing 2 quality scores.
+ if (read->getDataLength() < minReadLength || read->countOfNs() > maxDist) {
+ if (readWriter != NULL && options->passFilter(read, NotFound, true)) {
+ SingleAlignmentResult result; // NOTE(review): result.score is not assigned on this path, unlike the no-index path above -- confirm whether SingleAlignmentResult initialises it.
+ result.status = NotFound;
+ result.location = InvalidGenomeLocation;
+ result.mapq = 0;
+ result.direction = FORWARD;
+ readWriter->writeReads(readerContext, read, &result, 1, true);
+ }
+ continue;
+ } else {
+ stats->usefulReads++;
+ }
+ _int64 startTime = timeInNanos();
+ int nSecondaryResults = 0;
+#ifdef LONG_READS
+ int oldMaxK = aligner->getMaxK(); // Saved so the per-read limit set below can be undone after alignment.
+ if (options->maxDistFraction > 0.0) {
+ aligner->setMaxK(min(MAX_K, (int)(read->getDataLength() * options->maxDistFraction)));
+ }
+ aligner->AlignRead(read, alignmentResults, maxSecondaryAlignmentAdditionalEditDistance, alignmentResultBufferCount - 1, &nSecondaryResults, maxSecondaryAlignments, alignmentResults + 1); // Slot 0 is the primary result; secondary results start at slot 1.
+#ifdef LONG_READS
+ aligner->setMaxK(oldMaxK);
+ _int64 runTime = timeInNanos() - startTime;
+ int timeBucket = min(30, cheezyLogBase2(runTime)); // Bucket index is capped at 30.
+ stats->countByTimeBucket[timeBucket]++;
+ stats->nanosByTimeBucket[timeBucket] += runTime;
+ allocator->checkCanaries();
+ updateStats(stats, read, alignmentResults[0].status, alignmentResults[0].score, alignmentResults[0].mapq); // Statistics use the primary result only.
+ if (NULL != readWriter) {
+ //
+ // Remove any reads that don't pass the filter, then send the remainder down to the writer.
+ //
+ bool containsPrimary = true;
+ for (int i = 0; i <= nSecondaryResults; i++) {
+ if (!options->passFilter(read, alignmentResults[i].status, false)) {
+ if (i == 0) {
+ containsPrimary = false; // Slot 0 failed the filter; the writer is told via this flag.
+ }
+ //
+ // Copy the last result here.
+ //
+ alignmentResults[i] = alignmentResults[nSecondaryResults]; // Swap-remove: the order of the remaining results is not preserved.
+ nSecondaryResults--;
+ //
+ // And back up so it gets checked.
+ //
+ i--;
+ }
+ } // For each result
+ readWriter->writeReads(readerContext, read, alignmentResults, nSecondaryResults + 1, containsPrimary);
+ }
+ }
+ aligner->~BaseAligner(); // This calls the destructor without calling operator delete, allocator owns the memory.
+ if (supplier != NULL) { // supplier was already checked for NULL at the top, so this always holds here.
+ delete supplier;
+ }
+ delete allocator; // This is what actually frees the memory.
+ void
+ AlignerStats* stats, // Counters to update.
+ Read* read, // Not referenced in the body shown here.
+ AlignmentResult result, // Outcome for the primary alignment.
+ int score, // Not referenced in the body shown here.
+ int mapq) // Histogram bucket; asserted to be within 0..AlignerStats::maxMapq.
+ if (isOneLocation(result)) {
+ stats->singleHits++;
+ } else if (result == MultipleHits) {
+ stats->multiHits++;
+ } else {
+ _ASSERT(result == NotFound); // Any other status is unexpected; it is still counted as notFound.
+ stats->notFound++;
+ }
+ if (result != NotFound) {
+ _ASSERT(mapq >= 0 && mapq <= AlignerStats::maxMapq); // Checked by assert only, so not enforced where _ASSERT compiles away.
+ stats->mapqHistogram[mapq]++;
+ }
+ void
+ if (1 == options->nInputs) { // Builds readSupplierGenerator for this iteration (presumably typeSpecificBeginIteration; the signature line is missing from this listing).
+ //
+ // We've only got one input, so just connect it directly to the consumer.
+ //
+ readSupplierGenerator = options->inputs[0].createReadSupplierGenerator(options->numThreads, readerContext);
+ } else {
+ //
+ // We've got multiple inputs, so use a MultiInputReadSupplier to combine the individual inputs.
+ //
+ ReadSupplierGenerator **generators = new ReadSupplierGenerator *[options->nInputs]; // Not freed here; presumably owned by the MultiInputReadSupplierGenerator below -- confirm.
+ // use separate context for each supplier, initialized from common
+ for (int i = 0; i < options->nInputs; i++) {
+ ReaderContext context(readerContext); // Per-input copy of the shared context.
+ generators[i] = options->inputs[i].createReadSupplierGenerator(options->numThreads, context);
+ }
+ readSupplierGenerator = new MultiInputReadSupplierGenerator(options->nInputs,generators);
+ }
+ ReaderContext* context = readSupplierGenerator->getContext();
+ readerContext.header = context->header; // Copy the header fields from the generator's context back into the shared readerContext.
+ readerContext.headerBytes = context->headerBytes;
+ readerContext.headerLength = context->headerLength;
+ readerContext.headerMatchesIndex = context->headerMatchesIndex;
+ void
+ {
+ if (readerContext.header != NULL) { // Discard the header stored in readerContext and reset the related fields.
+ delete [] readerContext.header; // NOTE(review): assumes the header buffer was allocated with new[] -- confirm where it is created.
+ readerContext.header = NULL;
+ readerContext.headerLength = readerContext.headerBytes = 0;
+ readerContext.headerMatchesIndex = false;
+ }
+ delete readSupplierGenerator; // Dispose of this iteration's generator.
+ readSupplierGenerator = NULL;
diff --git a/SNAPLib/SingleAligner.h b/SNAPLib/SingleAligner.h
new file mode 100644
index 0000000..f673f41
--- /dev/null
+++ b/SNAPLib/SingleAligner.h
@@ -0,0 +1,63 @@
+Module Name:
+ SingleAligner.h
+ Functions for running the single end aligner sub-program.
+ Matei Zaharia, February, 2012
+ User mode service.
+Revision History:
+ Adapted from cSNAP, which was in turn adapted from the scala prototype
+#pragma once
+#include "stdafx.h"
+#include "AlignerContext.h"
+#include "AlignerStats.h"
+#include "ReadSupplierQueue.h"
+#include "AlignmentResult.h"
+class SingleAlignerContext : public AlignerContext // AlignerContext specialization for single-end alignment (isPaired() returns false)
+ SingleAlignerContext(AlignerExtension* i_extension = NULL);
+ // AlignerContext overrides
+ virtual AlignerStats* newStats();
+ virtual void runTask();
+ virtual void runIterationThread();
+ virtual void typeSpecificBeginIteration();
+ virtual void typeSpecificNextIteration();
+ // for subclasses
+ virtual void updateStats(AlignerStats* stats, Read* read, AlignmentResult result, int score, int mapq); // records one alignment outcome in stats
+ //RangeSplittingReadSupplierGenerator *readSupplierGenerator;
+ ReadSupplierGenerator *readSupplierGenerator; // source of reads for the aligner threads
+ friend class AlignerContext2;
+ bool isPaired() {return false;}
diff --git a/SNAPLib/SortedDataWriter.cpp b/SNAPLib/SortedDataWriter.cpp
new file mode 100644
index 0000000..85ad336
--- /dev/null
+++ b/SNAPLib/SortedDataWriter.cpp
@@ -0,0 +1,507 @@
+Module Name:
+ SortedDataWriter.cpp
+ File writer that sorts records using a temporary file.
+ User mode service.
+ Not thread safe.
+#include "stdafx.h"
+#include "BigAlloc.h"
+#include "Compat.h"
+#include "Util.h"
+#include "DataWriter.h"
+#include "BufferedAsync.h"
+#include "VariableSizeVector.h"
+#include "FileFormat.h"
+#include "PriorityQueue.h"
+#include "exit.h"
+#include "Bam.h"
+#include "Error.h"
+//#define VALIDATE_SORT 1
+using std::max;
+#pragma pack(push, 4)
+struct SortEntry // One buffered record: where it sits, how long it is, and its genome location (the sort key).
+ SortEntry() : offset(0), length(0), location(0) {}
+ SortEntry(size_t i_offset, GenomeDistance i_length, GenomeLocation i_location)
+ : offset(i_offset), length(i_length), location(i_location) {}
+ size_t offset; // offset in file
+ GenomeDistance length; // number of bytes
+ GenomeLocation location; // location in genome
+ static bool comparator(const SortEntry& e1, const SortEntry& e2) // strict "less than" on location only; used with std::stable_sort
+ {
+ return e1.location < e2.location;
+ }
+#pragma pack(pop)
+typedef VariableSizeVector<SortEntry,150,true> SortVector;
+// One sorted run in the temp file, plus the cursor state used while merging it.
+// The two default constructors below are the two arms of a conditional-compilation
+// block whose preprocessor lines are not present in this extract.
+struct SortBlock
+ SortBlock() : start(0), bytes(0), location(0), length(0), reader(NULL), data(NULL), minLocation(0), maxLocation(0) {}
+ SortBlock() : start(0), bytes(0), location(0), length(0), reader(NULL), data(NULL) {}
+ SortBlock(const SortBlock& other) { *this = other; }
+ void operator=(const SortBlock& other);
+ size_t start; // offset of the run in the temp file
+ size_t bytes; // size of the run in bytes
+ GenomeLocation minLocation, maxLocation;
+ // for mergesort phase
+ DataReader* reader;
+ GenomeLocation location; // genome location of current read
+ char* data; // read data in read buffer
+ GenomeDistance length; // length in bytes
+ // Member-wise assignment. 'data' is copied too: previously it was skipped, so a
+ // copied block (including one made by the copy constructor) carried an
+ // indeterminate pointer.
+ void
+ const SortBlock& other)
+ start = other.start;
+ bytes = other.bytes;
+ location = other.location;
+ length = other.length;
+ reader = other.reader;
+ data = other.data;
+ minLocation = other.minLocation;
+ maxLocation = other.maxLocation;
+typedef VariableSizeVector<SortBlock> SortBlockVector;
+class SortedDataFilterSupplier;
+class SortedDataFilter : public DataWriter::Filter // Copy filter that remembers each record's location (onAdvance) and rewrites every batch in sorted order (onNextBatch).
+ SortedDataFilter(SortedDataFilterSupplier* i_parent)
+ : Filter(DataWriter::CopyFilter), parent(i_parent), locations(10000000) // 10000000 is passed to the SortVector constructor (presumably an initial capacity - confirm)
+ {}
+ virtual ~SortedDataFilter() {}
+ virtual void onAdvance(DataWriter* writer, size_t batchOffset, char* data, GenomeDistance bytes, GenomeLocation location);
+ virtual size_t onNextBatch(DataWriter* writer, size_t offset, size_t bytes);
+ SortedDataFilterSupplier* parent; // supplier that collects the block list; not owned (never deleted here)
+ SortVector locations; // entries recorded since the last onNextBatch
+// Filter supplier for the temp-file writer. It hands out SortedDataFilter instances,
+// collects the extent of every sorted block they produce (addBlock), and when the
+// temp writer has closed (onClosed) produces the final sorted file.
+// The two addBlock declarations are the two arms of a conditional-compilation block
+// whose preprocessor lines are not present in this extract.
+class SortedDataFilterSupplier : public DataWriter::FilterSupplier
+ SortedDataFilterSupplier(
+ const FileFormat* i_fileFormat,
+ const Genome* i_genome,
+ const char* i_tempFileName,
+ const char* i_sortedFileName,
+ DataWriter::FilterSupplier* i_sortedFilterSupplier,
+ size_t i_bufferSize,
+ size_t i_bufferSpace,
+ FileEncoder* i_encoder = NULL)
+ :
+ // Initializers are written in declaration order (base class first), which is the
+ // order C++ executes them in regardless of how the list is written.
+ FilterSupplier(DataWriter::CopyFilter),
+ genome(i_genome),
+ format(i_fileFormat),
+ tempFileName(i_tempFileName),
+ sortedFileName(i_sortedFileName),
+ sortedFilterSupplier(i_sortedFilterSupplier),
+ encoder(i_encoder),
+ headerSize(0), // mergeSort() reads this even when setHeaderSize() was never called
+ blocks(),
+ bufferSize(i_bufferSize),
+ bufferSpace(i_bufferSpace)
+ {
+ InitializeExclusiveLock(&lock);
+ }
+ virtual ~SortedDataFilterSupplier()
+ {
+ DestroyExclusiveLock(&lock);
+ }
+ virtual DataWriter::Filter* getFilter();
+ virtual void onClosing(DataWriterSupplier* supplier) {}
+ virtual void onClosed(DataWriterSupplier* supplier);
+ // Records how many leading bytes of the temp file are header rather than reads.
+ void setHeaderSize(size_t bytes)
+ { headerSize = bytes; }
+ void addBlock(size_t start, size_t bytes);
+ void addBlock(size_t start, size_t bytes, GenomeLocation minLocation, GenomeLocation maxLocation);
+ bool mergeSort();
+ const Genome* genome;
+ const FileFormat* format;
+ const char* tempFileName;
+ const char* sortedFileName;
+ DataWriter::FilterSupplier* sortedFilterSupplier;
+ FileEncoder* encoder;
+ size_t headerSize; // bytes of header at the start of the temp file; 0 if none was recorded
+ ExclusiveLock lock; // for adding blocks
+ SortBlockVector blocks;
+ size_t bufferSize;
+ size_t bufferSpace;
+ friend class SortedDataFilter;
+ void // Presumably SortedDataFilter::onAdvance (name line missing): remembers where the just-written record sits so onNextBatch can reorder the batch.
+ DataWriter* writer, // not used by the visible body
+ size_t batchOffset, // stored as the entry's offset
+ char* data, // record bytes; only inspected by the validation block below
+ GenomeDistance bytes, // record length
+ GenomeLocation location) // sort key for the record
+ SortEntry entry(batchOffset, bytes, location);
+ if (memcmp(data, "BAM", 3) != 0 && memcmp(data, "@HD", 3) != 0) { // skip header block
+ GenomeLocation loc;
+ GenomeDistance len;
+ parent->format->getSortInfo(parent->genome, data, bytes, &loc, &len);
+ _ASSERT(loc == location); // validation only: location recomputed from the data must match the caller's
+ }
+ locations.push_back(entry);
+ size_t // Presumably SortedDataFilter::onNextBatch (name line missing): copies the finished batch into the current one in location order, registers the block with the parent, and returns the number of bytes copied.
+ DataWriter* writer,
+ size_t offset, // offset passed on to addBlock; offset == 0 marks the batch whose first entry is treated as the header
+ size_t bytes, // size passed on to addBlock (minus any header)
+ // sort buffered reads by location for later merge sort
+ std::stable_sort(locations.begin(), locations.end(), SortEntry::comparator); // stable: entries with equal location keep their recorded order
+ // copy from previous buffer into current in sorted order
+ char* fromBuffer;
+ size_t fromSize, fromUsed;
+ char* toBuffer;
+ size_t toSize, toUsed;
+ if (! (writer->getBatch(-1, &fromBuffer, &fromSize, &fromUsed) &&
+ writer->getBatch(0, &toBuffer, &toSize, &toUsed)))
+ {
+ WriteErrorMessage( "SortedDataFilter::onNextBatch getBatch failed\n");
+ soft_exit(1);
+ }
+ size_t target = 0; // running write position in toBuffer; also the return value
+ GenomeLocation previous = 0;
+ for (VariableSizeVector<SortEntry>::iterator i = locations.begin(); i != locations.end(); i++) {
+ if (locations.size() > 1) { // skip header block
+ GenomeLocation loc;
+ GenomeDistance len;
+ parent->format->getSortInfo(parent->genome, fromBuffer + i->offset, i->length, &loc, &len);
+ _ASSERT(loc >= previous); // validation only: entries must now be in non-decreasing location order
+ previous = loc;
+ }
+ memcpy(toBuffer + target, fromBuffer + i->offset, i->length);
+ target += i->length;
+ }
+ // remember block extent for later merge sort
+ SortBlock block; // not used by the visible code below
+ // handle header specially
+ size_t header = offset > 0 ? 0 : locations[0].length; // NOTE(review): indexes locations[0] without checking that locations is non-empty
+ if (header > 0) {
+ parent->setHeaderSize(header);
+ }
+ int first = offset == 0; // index of the first non-header entry (1 in the batch at offset 0, else 0)
+ GenomeLocation minLocation = locations.size() > first ? locations[first].location : 0;
+ GenomeLocation maxLocation = locations.size() > first ? locations[locations.size() - 1].location : UINT32_MAX;
+ parent->addBlock(offset + header, bytes - header, minLocation, maxLocation); // NOTE(review): this call and the next look like the two arms of a conditional-compilation block whose directives are missing here
+ parent->addBlock(offset + header, bytes - header);
+ locations.clear();
+ return target;
+ DataWriter::Filter*
+ return new SortedDataFilter(this);
+ void // Presumably SortedDataFilterSupplier::onClosed (name line missing): turns the temp file into the final sorted file.
+ DataWriterSupplier* supplier) // not used by the visible body
+ if (blocks.size() == 1 && sortedFileName == NULL) {
+ // just rename/move temp file to real file, we're done
+ DeleteSingleFile(sortedFileName); // if it exists
+ if (! MoveSingleFile(tempFileName, sortedFileName)) {
+ WriteErrorMessage( "unable to move temp file %s to final sorted file %s\n", tempFileName, sortedFileName);
+ soft_exit(1);
+ }
+ return;
+ }
+ // merge sort into final file
+ if (! mergeSort()) {
+ WriteErrorMessage( "merge sort failed\n");
+ soft_exit(1);
+ }
+ // SortedDataFilterSupplier::addBlock (name line missing from this extract).
+ // Registers one sorted run [start, start + bytes) of the temp file for the later merge.
+ // Empty runs are ignored. Takes 'lock' while touching the block list; the class
+ // comments that lock as being for adding blocks.
+ // The min/max location parameters and assignments belong to a conditional-compilation
+ // variant whose preprocessor lines are not present in this extract.
+ void
+ size_t start,
+ size_t bytes
+ , GenomeLocation minLocation
+ , GenomeLocation maxLocation
+ )
+ if (bytes > 0) {
+ AcquireExclusiveLock(&lock);
+ for (SortBlockVector::iterator i = blocks.begin(); i != blocks.end(); i++) {
+ // Runs must not overlap. Compare against the run's extent (i->bytes); the
+ // per-read i->length is still 0 at this point and made the check vacuous
+ // whenever the existing run started at or before the new one.
+ _ASSERT(i->start + i->bytes <= start || start + bytes <= i->start);
+ }
+ SortBlock block;
+ block.start = start;
+ block.bytes = bytes;
+ block.minLocation = minLocation;
+ block.maxLocation = maxLocation;
+ blocks.push_back(block);
+ ReleaseExclusiveLock(&lock);
+ }
+ bool // Presumably SortedDataFilterSupplier::mergeSort (name line missing): merges the sorted blocks of the temp file into the sorted output file; returns false if the output cannot be opened or a record does not fit in a write buffer.
+ // merge sort from temp file into sorted file
+ WriteStatusMessage("sorting...");
+ _int64 start = timeInMillis();
+ _int64 startReadWaitTime = DataReader::ReadWaitTime; // snapshots so the final status line can report merge-phase deltas
+ _int64 startReleaseWaitTime = DataReader::ReleaseWaitTime;
+ _int64 startWriteWaitTime = DataWriter::WaitTime;
+ _int64 startWriteFilterTime = DataWriter::FilterTime;
+ // set up buffered output
+ DataWriterSupplier* writerSupplier = DataWriterSupplier::create(sortedFileName, bufferSize, sortedFilterSupplier,
+ encoder, encoder != NULL ? 6 : 4); // use more buffers to let encoder run async
+ DataWriter* writer = writerSupplier->getWriter();
+ if (writer == NULL) {
+ WriteErrorMessage( "open sorted file for write failed\n");
+ return false; // NOTE(review): writerSupplier is not closed or deleted on this path
+ }
+ DataSupplier* readerSupplier = DataSupplier::Default; // autorelease
+ // setup - open all files, read first block, begin read for second
+ if (blocks.size() > 5000) {
+ WriteErrorMessage("warning: merging %d blocks could be slow, try increasing sort memory with -sm option\n", blocks.size()); // NOTE(review): %d is paired with blocks.size(); confirm the types match
+ }
+ for (SortBlockVector::iterator i = blocks.begin(); i != blocks.end(); i++) { // one reader per block, each restricted to that block's byte range of the temp file
+ i->reader = readerSupplier->getDataReader(1, MAX_READ_LENGTH * 8, 0.0,
+ min(1UL << 23, max(1UL << 17, bufferSpace / blocks.size()))); // 128kB to 8MB buffer space per block
+ i->reader->init(tempFileName);
+ i->reader->reinit(i->start, i->bytes);
+ }
+ // write out header
+ if (headerSize > 0xffffffff) {
+ WriteErrorMessage("SortedDataFilterSupplier: headerSize too big\n");
+ soft_exit(1);
+ }
+ if (headerSize > 0) {
+ blocks[0].reader->reinit(0, headerSize); // borrows block 0's reader to read the header; NOTE(review): assumes blocks is non-empty
+ writer->inHeader(true);
+ char* rbuffer;
+ _int64 rbytes;
+ char* wbuffer;
+ size_t wbytes;
+ for (size_t left = headerSize; left > 0; ) { // copy the header in chunks limited by what the reader and the writer each have available
+ if ((! blocks[0].reader->getData(&rbuffer, &rbytes)) || rbytes == 0) {
+ blocks[0].reader->nextBatch();
+ if (! blocks[0].reader->getData(&rbuffer, &rbytes)) {
+ WriteErrorMessage( "read header failed\n");
+ soft_exit(1);
+ }
+ }
+ if ((! writer->getBuffer(&wbuffer, &wbytes)) || wbytes == 0) {
+ writer->nextBatch();
+ if (! writer->getBuffer(&wbuffer, &wbytes)) {
+ WriteErrorMessage( "write header failed\n");
+ soft_exit(1);
+ }
+ }
+ size_t xfer = min(left, min((size_t) rbytes, wbytes));
+ _ASSERT(xfer > 0 && xfer <= UINT32_MAX);
+ memcpy(wbuffer, rbuffer, xfer);
+ blocks[0].reader->advance(xfer);
+ writer->advance((unsigned) xfer);
+ left -= xfer;
+ }
+ blocks[0].reader->reinit(blocks[0].start, blocks[0].bytes); // point block 0's reader back at its own data
+ writer->nextBatch();
+ writer->inHeader(false);
+ }
+ // merge temp blocks into output
+ _int64 total = 0; // number of records written
+ // get initial merge sort data
+ typedef PriorityQueue<GenomeLocation, _int64> BlockQueue;
+ BlockQueue queue; // holds block indices keyed by the location of each block's current record
+ for (SortBlockVector::iterator b = blocks.begin(); b != blocks.end(); b++) {
+ _int64 bytes;
+ b->reader->getData(&b->data, &bytes); // NOTE(review): return value is not checked
+ format->getSortInfo(genome, b->data, bytes, &b->location, &b->length);
+ queue.add((_uint32) (b - blocks.begin()), b->location);
+ }
+ GenomeLocation current = 0; // current location for validation
+ int lastRefID = -1, lastPos = 0;
+ while (queue.size() > 0) {
+ GenomeLocation check;
+ queue.peek(&check);
+ _ASSERT(check >= current);
+ GenomeLocation secondLocation;
+ _int64 smallestIndex = queue.pop();
+ _int64 secondIndex = queue.size() > 0 ? queue.peek(&secondLocation) : -1;
+ GenomeLocation limit = secondIndex != -1 ? secondLocation : InvalidGenomeLocation; // keep draining this block until its location passes the next block's
+ SortBlock* b = &blocks[smallestIndex];
+ char* writeBuffer;
+ size_t writeBytes;
+ writer->getBuffer(&writeBuffer, &writeBytes);
+ const int NBLOCKS = 20;
+ SortBlock oldBlocks[NBLOCKS]; // ring of recent block states; written below but never read in the visible code
+ int oldBlockIndex = 0;
+ while (b->location <= limit) {
+ _ASSERT(b->location >= b->minLocation && b->location <= b->maxLocation);
+ if (writeBytes < (size_t)b->length) { // record does not fit: move to a fresh write buffer and try once more
+ writer->nextBatch();
+ writer->getBuffer(&writeBuffer, &writeBytes);
+ if (writeBytes < (size_t)b->length) {
+ WriteErrorMessage( "mergeSort: buffer size too small\n");
+ return false; // NOTE(review): writer, writerSupplier and the block readers are not released on this path
+ }
+ }
+ memcpy(writeBuffer, b->data, b->length);
+ if (format == FileFormat::BAM[0] || format == FileFormat::BAM[1]) {
+ ((BAMAlignment*)b->data)->validate();
+ }
+ int refID, pos;
+ format->getSortInfo(genome, b->data, b->length, NULL, NULL, &refID, &pos);
+ _ASSERT(refID == -1 || refID > lastRefID || (refID == lastRefID && pos >= lastPos));
+ if (refID != -1) {
+ lastRefID = refID;
+ lastPos = pos;
+ }
+ total++;
+ writer->advance(b->length);
+ writeBytes -= b->length;
+ writeBuffer += b->length;
+ oldBlocks[oldBlockIndex] = *b;
+ oldBlockIndex = (oldBlockIndex + 1) % NBLOCKS;
+ b->reader->advance(b->length);
+ _ASSERT(b->location >= current);
+ current = b->location;
+ _int64 readBytes;
+ if (! b->reader->getData(&b->data, &readBytes)) {
+ b->reader->nextBatch();
+ if (! b->reader->getData(&b->data, &readBytes)) { // nothing left in this block: drop its reader and stop draining it
+ _ASSERT(b->reader->isEOF());
+ delete b->reader;
+ b->reader = NULL;
+ break;
+ }
+ }
+ GenomeLocation previous = b->location;
+ format->getSortInfo(genome, b->data, readBytes, &b->location, &b->length);
+ _ASSERT(b->length <= readBytes && b->location >= previous);
+ }
+ if (b->reader != NULL) { // block still has records: put it back with its new current location
+ queue.add(smallestIndex, b->location);
+ }
+ }
+ // close everything
+ writer->close();
+ delete writer;
+ writerSupplier->close();
+ delete writerSupplier;
+ if (! DeleteSingleFile(tempFileName)) {
+ WriteErrorMessage( "warning: failure deleting temp file %s\n", tempFileName);
+ }
+ WriteStatusMessage("sorted %lld reads in %u blocks, %lld s\n"
+ "read wait align %.3f s + merge %.3f s, read release align %.3f s + merge %.3f s\n"
+ "write wait %.3f s align + %.3f s merge, write filter %.3f s align + %.3f s merge\n",
+ total, blocks.size(), (timeInMillis() - start)/1000, // NOTE(review): %u is paired with blocks.size(); confirm the types match
+ startReadWaitTime * 1e-9, (DataReader::ReadWaitTime - startReadWaitTime) * 1e-9,
+ startReleaseWaitTime * 1e-9, (DataReader::ReleaseWaitTime - startReleaseWaitTime) * 1e-9,
+ startWriteWaitTime * 1e-9, (DataWriter::WaitTime - startWriteWaitTime) * 1e-9,
+ startWriteFilterTime * 1e-9, (DataWriter::FilterTime - startWriteFilterTime) * 1e-9);
+ return true;
+ DataWriterSupplier* // Factory (name line missing from this extract): returns a supplier that writes to tempFileName through a SortedDataFilterSupplier, which later produces sortedFileName.
+ const FileFormat* format,
+ const Genome* genome,
+ const char* tempFileName,
+ size_t tempBufferMemory, // total buffer memory to use; 0 selects the per-thread default below
+ int numThreads,
+ const char* sortedFileName,
+ DataWriter::FilterSupplier* sortedFilterSuppler,
+ size_t maxBufferSize, // not used by the visible body
+ FileEncoder* encoder)
+ const int bufferCount = 3;
+ const size_t bufferSpace = tempBufferMemory > 0 ? tempBufferMemory : (numThreads * (size_t)1 << 30); // default: numThreads << 30 bytes, i.e. 1 GiB per thread
+ const size_t bufferSize = bufferSpace / (bufferCount * numThreads); // NOTE(review): divides by zero if numThreads is 0
+ DataWriter::FilterSupplier* filterSupplier =
+ new SortedDataFilterSupplier(format, genome, tempFileName, sortedFileName, sortedFilterSuppler, bufferSize, bufferSpace, encoder);
+ return DataWriterSupplier::create(tempFileName, bufferSize, filterSupplier, NULL, bufferCount);
diff --git a/SNAPLib/Tables.cpp b/SNAPLib/Tables.cpp
new file mode 100644
index 0000000..a04e4ed
--- /dev/null
+++ b/SNAPLib/Tables.cpp
@@ -0,0 +1,94 @@
+#include "stdafx.h"
+#include "Tables.h"
+static const Tables tables;
+const char *COMPLEMENT = tables.getComplement();
+const char *IS_N = tables.getIsN();
+const int *BASE_VALUE = tables.getBaseValue();
+const int *BASE_VALUE_NO_N = tables.getBaseValueNoN();
+const char *VALUE_BASE = tables.getValueBase();
+const unsigned char *VALUE4_RC = tables.getValue4RC();
+const char *PACKED_BASE_VALUE = tables.getPackedBaseValue();
+const char *PACKED_QUALITY_MASK = tables.getPackedQualityMask();
+const char *PACKED_VALUE_BASE = tables.getPackedValueBase();
+const unsigned *IS_LOWER_CASE_OR_DOT = tables.getIsLowerCaseOrDot();
+const char *TO_UPPER_CASE_DOT_TO_N = tables.getToUpperCaseDotToN();
+const char *PACKED_VALUE_BASE_RC = tables.getPackedValueBaseRC();
+const char *CIGAR_QUAL_TO_SAM = tables.getCigarQualToSam();
+ memset(complement, 0, sizeof(complement)); // Body of the Tables constructor (header line missing from this extract): fills every lookup table once.
+ memset(isN, 0, sizeof(isN));
+ complement['A'] = 'T'; // only upper-case A/C/G/T and N/n get a complement; every other entry stays 0
+ complement['C'] = 'G';
+ complement['G'] = 'C';
+ complement['T'] = 'A';
+ complement['N'] = 'N';
+ complement['n'] = 'n';
+ isN['N'] = 1;
+ isN['n'] = 1;
+ // Base values chosen so that complements are bitwise opposites.
+ for (unsigned i = 0; i < 256; i++) {
+ baseValue[i] = 4;// Everything's an N unless it's not
+ }
+ baseValue['A'] = 0; // A=0/T=3 and G=1/C=2 are 2-bit complements of each other
+ baseValue['G'] = 1;
+ baseValue['C'] = 2;
+ baseValue['T'] = 3;
+ // inverse of BASE_VALUE
+ valueBase[0] = 'A';
+ valueBase[1] = 'G';
+ valueBase[2] = 'C';
+ valueBase[3] = 'T';
+ valueBase[4] = 'N';
+ // Version that maps N's value to 0 instead of 4
+ memset(baseValueNoN, 0, sizeof(baseValueNoN));
+ baseValueNoN['A'] = 0;
+ baseValueNoN['G'] = 1;
+ baseValueNoN['C'] = 2;
+ baseValueNoN['T'] = 3;
+ // reverse complement of a byte of 4x2-bit values
+ for (int i = 0; i < 256; i++) {
+ value4RC[i] = 0xff ^ (((i & 0x03) << 6) | ((i & 0x0c) << 2) | ((i & 0x30) >> 2) | ((i & 0xc0) >> 6)); // reverse the order of the four 2-bit fields, then flip every bit to complement
+ }
+ // packed base tables
+ for (int i = 0; i < 256; i++) {
+ packedValueBase[i] = i < 4 ? 'N' : "AGCT"[i >> 6]; // byte values 0..3 decode to 'N'; otherwise the top two bits select the base
+ packedValueBaseRC[i] = i < 4 ? 'N' : "TCGA"[i >> 6]; // same decoding, but yielding the complementary base
+ }
+ memset(packedBaseValue, 0, sizeof(packedBaseValue));
+ packedBaseValue['A'] = packedBaseValue['a'] = 0x00;
+ packedBaseValue['G'] = packedBaseValue['g'] = 0x40;
+ packedBaseValue['C'] = packedBaseValue['c'] = (char) 0x80;
+ packedBaseValue['T'] = packedBaseValue['t'] = (char) 0xc0;
+ memset(packedQualityMask, 0, 4); // first four entries are 0, all remaining entries are 0x3f
+ memset(packedQualityMask + 4, 0x3f, sizeof(packedQualityMask) - 4);
+ for (unsigned i = 0; i < 256; i++) { // start from the identity mapping, then patch a-z and '.'
+ isLowerCaseOrDot[i] = 0;
+ toUpperCaseDotToN[i] = i;
+ }
+ for (unsigned i = 0x61; i <= 0x7a; i++) { // 0x61..0x7a is 'a'..'z' in ASCII
+ isLowerCaseOrDot[i] = 1;
+ toUpperCaseDotToN[i] = i - 0x20;
+ }
+ isLowerCaseOrDot['.'] = 1;
+ toUpperCaseDotToN['.'] = 'N';
+ for (unsigned i = 0; i < 256; i++) {
+ cigarQualToSam[i] = i > ('~' - '!') ? '!' : '!' + i; // offset by '!'; values that would land past '~' map to '!' instead
+ }
diff --git a/SNAPLib/Tables.h b/SNAPLib/Tables.h
new file mode 100644
index 0000000..ca235be
--- /dev/null
+++ b/SNAPLib/Tables.h
@@ -0,0 +1,64 @@
+// Lookup tables that are too annoying to initialize with array expressions.
+// These includes things like complements of bases, numerical values, etc.
+// To avoid having an init() function that everyone must call, we stow these
+// in a special class and have one static instance of that class and variables
+// pointing to it to ensure that the initializer (class constructor) is run.
+#pragma once
+class Tables
+ char complement[256];
+ char isN[256];
+ int baseValue[256];
+ int baseValueNoN[256]; // Same as above but N maps to 0 instead of 4
+ char valueBase[5];
+ unsigned char value4RC[256]; // reverse complement of 4 bases/byte
+ unsigned isLowerCaseOrDot[256];
+ char toUpperCaseDotToN[256];
+ char packedBaseValue[256];
+ char packedQualityMask[256];
+ char packedValueBase[256];
+ char packedValueBaseRC[256];
+ char cigarQualToSam[256];
+ Tables();
+ const char *getComplement() const { return complement; }
+ const char *getIsN() const { return isN; }
+ const int *getBaseValue() const { return baseValue; }
+ const int *getBaseValueNoN() const { return baseValueNoN; }
+ const char *getValueBase() const { return valueBase; }
+ const unsigned char *getValue4RC() const { return value4RC; }
+ const char* getPackedBaseValue() const { return packedBaseValue; }
+ const char* getPackedQualityMask() const { return packedQualityMask; }
+ const char* getPackedValueBase() const { return packedValueBase; }
+ const char* getPackedValueBaseRC() const { return packedValueBaseRC; }
+ const unsigned *getIsLowerCaseOrDot() const {return isLowerCaseOrDot; }
+ const char *getToUpperCaseDotToN() const { return toUpperCaseDotToN; }
+ const char *getCigarQualToSam() const { return cigarQualToSam; }
+extern const char *COMPLEMENT;
+extern const char *IS_N;
+extern const int *BASE_VALUE;
+extern const char *VALUE_BASE;
+extern const unsigned char *VALUE4_RC;
+extern const char *PACKED_BASE_VALUE;
+extern const char *PACKED_QUALITY_MASK;
+extern const char *PACKED_VALUE_BASE;
+extern const char *PACKED_VALUE_BASE_RC;
+extern const int *BASE_VALUE_NO_N;
+extern const unsigned *IS_LOWER_CASE_OR_DOT;
+extern const char *TO_UPPER_CASE_DOT_TO_N;
+extern const char *CIGAR_QUAL_TO_SAM;
diff --git a/SNAPLib/Util.cpp b/SNAPLib/Util.cpp
new file mode 100644
index 0000000..3647071
--- /dev/null
+++ b/SNAPLib/Util.cpp
@@ -0,0 +1,177 @@
+Module Name:
+ Util.cpp
+ Generic support routines that don't seem to belong elsewhere.
+ Bill Bolosky, March, 2013
+ User mode service.
+Revision History:
+ Factored from other places
+#include "stdafx.h"
+#include "Util.h"
+#include "Error.h"
+// Returns the smallest power of two that is >= value.
+// Values <= 1 (including zero and negatives) yield 1, since 2^0 = 1.
+// The largest result representable in a signed 64-bit integer is 2^62; a larger
+// value is reported as an error and terminates via soft_exit(1).
+// The previous bit-scanning version shifted by -1 for value == 0 and by 63/64 for
+// value >= 2^62 (undefined behaviour), and could not return 2^62 for value == 2^62.
+_int64 FirstPowerOf2GreaterThanOrEqualTo(_int64 value)
+ if (value <= 1) {
+ return 1;
+ }
+ const _int64 largestPower = ((_int64)1) << 62;
+ if (value > largestPower) {
+ WriteErrorMessage("FirstPowerOf2GreaterThanOrEqualTo: value %lld has no power of two >= it that fits in 63 bits\n", (long long)value);
+ soft_exit(1);
+ }
+ _int64 power = 1;
+ // The second condition keeps the shift in range even if soft_exit() were to return.
+ while (power < value && power < largestPower) {
+ power <<= 1;
+ }
+ return power;
+int cheezyLogBase2(_int64 value) // Counts how many times value can be halved (integer division) before it reaches 0 - floor(log2(value)) for value >= 1, and 0 for value <= 0.
+ int retVal = 0;
+ value /= 2; // Since 2^0 = 1; we'll also define cheezyLogBase2(x) = 0 where x<= 0.
+ while (value > 0) {
+ retVal++;
+ value /= 2;
+ }
+ return retVal;
+ void // Presumably util::memrevcpy (name line missing; the signature matches the declaration in Util.h): copies 'bytes' bytes from src to dst in reverse byte order.
+ void* dst,
+ const void* src,
+ size_t bytes)
+ size_t dwords = bytes >> 3; // number of whole 8-byte words (despite the name)
+ _uint64* p = (_uint64*) dst; // NOTE(review): dst/src are accessed as _uint64 without any alignment check
+ const _uint64* q = (const _uint64*) ((const char*)src + bytes - 8); // last 8 bytes of src; only dereferenced when dwords > 0
+ for (size_t i = 0; i < dwords; i++) {
+ *p++ = ByteSwapUI64(*q--); // walk src backwards a word at a time; ByteSwapUI64 presumably reverses the bytes within the word - confirm
+ }
+ int left = (int) (bytes & 7);
+ if (left > 0) { // the first 'left' bytes of src were not covered above; emit them reversed at the end of dst
+ char* p2 = (char*) p;
+ const char* q2 = (left - 1) + (const char*) src;
+ for (int i = 0; i < left; i++) {
+ *p2++ = *q2--;
+ }
+ }
+NWaiter::NWaiter(size_t n)
+ _signalsRequired = n;
+ _signalsReceived = 0;
+ InitializeExclusiveLock(&_lock);
+ CreateEventObject(&_waiter);
+ DestroyExclusiveLock(&_lock);
+ DestroyEventObject(&_waiter);
+void NWaiter::wait() // Returns once _signalsReceived has reached _signalsRequired; otherwise waits on the event and re-checks.
+ while (true) {
+ bool done;
+ AcquireExclusiveLock(&_lock); // the counter is read under the same lock signal() uses to update it
+ done = (_signalsReceived >= _signalsRequired);
+ ReleaseExclusiveLock(&_lock);
+ if (done)
+ return;
+ else {
+ WaitForEvent(&_waiter); // NOTE(review): whether this blocks again after the first signal depends on the event's reset semantics, which are not visible here
+ }
+ }
+void NWaiter::signal() // Records one signal and wakes waiters so they can re-check the count.
+ AcquireExclusiveLock(&_lock);
+ _signalsReceived += 1;
+ ReleaseExclusiveLock(&_lock);
+ AllowEventWaitersToProceed(&_waiter); // done after releasing the lock
+// Formats val in decimal with a comma between each group of three digits
+// (1234567 -> "1,234,567") into outputBuffer and returns outputBuffer.
+// The result is null-terminated. If the buffer is too small an error is logged and
+// the buffer is set to the empty string; a zero-length buffer terminates via soft_exit(1).
+char *FormatUIntWithCommas(_uint64 val, char *outputBuffer, size_t outputBufferSize)
+ //
+ // First, figure out the number of digits.
+ //
+ unsigned nDigits = 0;
+ _uint64 x = val;
+ while (x > 0) {
+ nDigits++;
+ x = x / 10;
+ }
+ if (0 == nDigits) {
+ //
+ // Special case for the value 0, which the loop above counts as having no digits.
+ //
+ _ASSERT(0 == val);
+ nDigits = 1;
+ }
+ int nCommas = (nDigits - 1) / 3;
+ if (outputBufferSize < nDigits + nCommas + 1) {
+ // Both values are unsigned; cast so the arguments match %llu on every platform.
+ WriteErrorMessage("Internal error: too small buffer for FormatUIntWithCommas, value %llu, outputBufferSize %llu\n",
+ (unsigned long long)val, (unsigned long long)outputBufferSize);
+ if (outputBufferSize > 0) {
+ *outputBuffer = 0;
+ } else {
+ soft_exit(1);
+ }
+ return outputBuffer;
+ }
+ //
+ // Now build up the string backwards, three digits at a time.
+ //
+ size_t offset = nDigits + nCommas;
+ outputBuffer[offset] = '\0';
+ if (0 == val) {
+ outputBuffer[0] = '0';
+ return outputBuffer;
+ }
+ x = val;
+ while (x > 0) {
+ char tempBuf[5]; // at most ",ddd" plus the terminator
+ if (x > 999) {
+ // Not the leading group: a comma followed by exactly three zero-padded digits.
+ sprintf(tempBuf, ",%03llu", (unsigned long long)(x % 1000));
+ _ASSERT(strlen(tempBuf) == 4);
+ } else {
+ // Leading group: one to three digits, no padding. x is 64 bits wide, so it
+ // must not be passed for a plain %d.
+ sprintf(tempBuf, "%llu", (unsigned long long)x);
+ }
+ size_t groupLength = strlen(tempBuf);
+ _ASSERT(offset >= groupLength);
+ offset -= groupLength;
+ memcpy(outputBuffer + offset, tempBuf, groupLength);
+ x /= 1000;
+ }
+ return outputBuffer;
diff --git a/SNAPLib/Util.h b/SNAPLib/Util.h
new file mode 100644
index 0000000..d5076da
--- /dev/null
+++ b/SNAPLib/Util.h
@@ -0,0 +1,538 @@
+#pragma once
+#include <map>
+#include "stdafx.h"
+#include "Compat.h"
+#include "Tables.h"
+#include "exit.h"
+using std::max;
+using std::min;
+// General utilities.
+namespace util {
+#define BEGEND(container) (container).begin(), (container).end()
+inline double ratio(double a, double b=1)
+ return a / (a + b);
+// Turn the value into a string with comma formatting (so 1,234,567 instead of 1234567).
+// Produces a null-terminated string.
+extern char *FormatUIntWithCommas(_uint64 val, char *outputBuffer, size_t outputBufferSize);
+const int MAXLINE = 1024;
+// You'd think this would be in the C library.
+// Like strchr, but with a max length so it doesn't
+// run over the end of the buffer. Basically,
+// strings suck in C.
+ inline char *
+strnchr(char *str, char charToFind, size_t maxLen)
+ for (size_t i = 0; i < maxLen; i++) {
+ if (str[i] == charToFind) {
+ return str + i;
+ }
+ if (str[i] == 0) {
+ return NULL;
+ }
+ }
+ return NULL;
+ inline const char *
+strnchr(const char *str, char charToFind, size_t maxLen)
+ for (size_t i = 0; i < maxLen; i++) {
+ if (str[i] == charToFind) {
+ return str + i;
+ }
+ if (str[i] == 0) {
+ return NULL;
+ }
+ }
+ return NULL;
+// Check whether a string str ends with a given pattern
+ inline bool
+stringEndsWith(const char* str, const char* pattern)
+ if (strlen(str) < strlen(pattern)) {
+ return false;
+ } else {
+#ifdef _MSC_VER
+ return _stricmp(str + (strlen(str) - strlen(pattern)), pattern) == 0;
+ return strcmp(str + (strlen(str) - strlen(pattern)), pattern) == 0;
+ }
+template <class Key, class T> // Iterator form of getOrElse: yields the mapped value p points at, or d when p is m.end().
+const T &getOrElse(const std::map<Key, T> &m, typename std::map<Key, T>::const_iterator p,
+ const T &d=T()) // NOTE(review): when d is the defaulted temporary, the returned reference is only valid within the calling full-expression - copy the result rather than binding it to a reference
+ return p == m.end() ? d : p->second;
+// Scala map method: lookup with default if not found.
+template <class Key, class T>
+const T &getOrElse(const std::map<Key, T> &m, const Key &k, const T &d=T())
+ return getOrElse(m, m.find(k), d);
+template <class RandIter, class T>
+size_t findIndex(RandIter first, RandIter last, const T &value)
+ return std::find(first, last, value) - first;
+// Analogue of Scala's 'mkString' method.
+template <class InIter>
+std::string joinWithSep(InIter first, InIter last, char sep)
+ std::string joined;
+ if (first == last)
+ return joined;
+ joined += *first++;
+ while (first != last)
+ (joined += sep) += *first++;
+ return joined;
+template <class T>
+void addIfAbsent(std::vector<T> *v, const T &e)
+ if (!std::count(v->begin(), v->end(), e))
+ v->push_back(e);
+// True if s begins with prefix (an empty prefix always matches).
+// Compares only the first strlen(prefix) characters instead of searching all of s
+// for an occurrence of prefix, as the strstr-based form did.
+inline bool startsWith(const char *s, const char *prefix)
+ return strncmp(s, prefix, strlen(prefix)) == 0;
+inline const char *strstrAfter(const char *s, const char *t)
+ const char *tins = strstr(s, t);
+ return tins ? tins + strlen(t) : NULL;
+ inline void // Reverse-complement helper (name line missing from this extract): writes the reverse complement of 'bases' into rc, or reverse-complements rc in place when bases is NULL or equal to rc.
+ char* rc, // output buffer; also the input in the in-place case
+ const char* bases = NULL, // source sequence, or NULL for in-place operation
+ int length = -1, // number of bases; a negative value means "use strlen"
+ bool toLower = false) // when true, each output character is passed through tolower()
+ if (length < 0) {
+ length = (int) strlen(bases != NULL ? bases : rc);
+ if (bases != NULL) {
+ rc[length] = '\0'; // rc is only null-terminated here, i.e. when the length was computed from bases
+ }
+ }
+ if (bases != NULL && bases != rc) {
+ if (! toLower) {
+ for (int i = 0; i < length; i++) {
+ rc[i] = COMPLEMENT[bases[length - i - 1]]; // NOTE(review): COMPLEMENT is indexed with a plain char; confirm inputs cannot be negative where char is signed
+ }
+ } else {
+ for (int i = 0; i < length; i++) {
+ rc[i] = tolower(COMPLEMENT[bases[length - i - 1]]);
+ }
+ }
+ } else {
+ // reverse complement in place
+ for (int i = 0; i < length / 2; i++) { // swap the complemented ends, working inwards
+ char t = COMPLEMENT[rc[i]];
+ rc[i] = toLower ? tolower(COMPLEMENT[rc[length - i - 1]]) : COMPLEMENT[rc[length - i - 1]];
+ rc[length - i - 1] = toLower ? tolower(t) : t;
+ }
+ if (length % 2 == 1) { // odd length: the middle base is complemented where it stands
+ rc[length / 2] = toLower ? tolower(COMPLEMENT[rc[length / 2]]) : COMPLEMENT[rc[length / 2]];
+ }
+ }
+ inline void
+ char* io_sequence,
+ int length,
+ const char* reference,
+ int refLength,
+ int offset) // offset of sequence from reference
+ int overlap = min(offset + length, refLength) - max(offset, 0); // # bases offset
+ int start = max(-offset, 0); // index in sequence
+ int last = 0;
+ for (int i = 0; i < overlap; i++) {
+ int is = start + i;
+ int ir = start + i + offset;
+ int found = 0;
+ for (int j = 0; j < 3; j++) {
+ int delta = j == 0 ? last :
+ j == 1 ? (last ? 0 : -1) :
+ (last ? -last : +1);
+ if (ir + delta >= 0 && ir + delta < refLength && toupper(reference[ir + delta]) == toupper(io_sequence[is])) {
+ io_sequence[is] = (islower(io_sequence[is]) ? "-,+" : "-.+")[1 + delta];
+ found = delta;
+ break;
+ }
+ }
+ last = found;
+ }
+ inline void
+ char* buffer,
+ int length)
+ for (int i = 0; i < length; i++) {
+ buffer[i] = tolower(buffer[i]);
+ }
+ inline unsigned
+ unsigned n)
+ unsigned factor = 1;
+ while (n >= 10) {
+ n /= 10;
+ factor *= 10;
+ }
+ return n * factor;
+ // int overload of the leading-digit rounding helper (name line missing from this
+ // extract): keeps the most significant decimal digit of n and zeroes the rest,
+ // preserving the sign (1234 -> 1000, -1234 -> -1000, 7 -> 7).
+ // Written like the _int64 overload: n keeps its sign through the truncating
+ // divisions, so no negation (undefined for INT_MIN) and no unsigned wraparound
+ // of 'factor' is needed.
+ inline int
+ int n)
+ int factor = 1;
+ while (n >= 10 || n <= -10) {
+ n /= 10;
+ factor *= 10;
+ }
+ return n * factor;
+ // _int64 overload of the leading-digit rounding helper (name line missing from
+ // this extract): keeps the most significant decimal digit of n and zeroes the
+ // rest, preserving the sign.
+ // The magnitude test is spelled out instead of calling abs(): an unqualified abs()
+ // can resolve to int abs(int) and truncate a 64-bit argument, and abs() of the
+ // most negative value is undefined.
+ inline _int64
+ _int64 n)
+ _int64 factor = 1;
+ while (n >= 10 || n <= -10) {
+ n /= 10;
+ factor *= 10;
+ }
+ return n * factor;
+ // from MurmurHash3, public domain, http://code.google.com/p/smhasher/wiki/MurmurHash3
+#define ROTL32(x,y) bit_rotate_left(x,y)
+#define ROTL64(x,y) bit_rotate_left64(x,y)
+#ifdef _MSC_VER
+#define BIG_CONSTANT(x) (x)
+#define BIG_CONSTANT(x) (x##LLU)
+ inline _uint32
+ _uint32 h)
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+ return h;
+ inline _uint64
+ _uint64 k)
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+ k ^= k >> 33;
+ return k;
+ // 32-bit MurmurHash3 of len bytes at key, with a fixed seed (name line missing
+ // from this extract).
+ // NOTE: the tail pointer is now byte-typed, as in the reference MurmurHash3. It
+ // was previously a _uint32 pointer, so tail[1] and tail[2] read 32-bit words
+ // starting 4 and 8 bytes past the tail - beyond the end of the key. Hash values
+ // for keys whose length is not a multiple of 4 therefore change; confirm that no
+ // persisted data depends on the old values.
+ inline _uint32
+ const void* key,
+ int len)
+ const _uint8* data = (const _uint8*) key;
+ const int nblocks = len / 4;
+ _uint32 h1 = 0x811f6d67; // seed, const from a random guid for now
+ const _uint32 c1 = 0xcc9e2d51;
+ const _uint32 c2 = 0x1b873593;
+ //----------
+ // body: whole 4-byte blocks, indexed backwards from the end of the block area
+ const _uint32 * blocks = (const _uint32 *)(data + nblocks*4);
+ for(int i = -nblocks; i; i++)
+ {
+ _uint32 k1 = blocks[i];
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+ h1 = ROTL32(h1,13);
+ h1 = h1*5+0xe6546b64;
+ }
+ //----------
+ // tail: the remaining 0-3 bytes, read one byte at a time
+ const _uint8 * tail = (const _uint8*)(data + nblocks*4);
+ _uint32 k1 = 0;
+ switch(len & 3)
+ {
+ // each case deliberately falls through to the next
+ case 3: k1 ^= tail[2] << 16;
+ case 2: k1 ^= tail[1] << 8;
+ case 1: k1 ^= tail[0];
+ k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ };
+ //----------
+ // finalization
+ h1 ^= len;
+ h1 = fmix32(h1);
+ return h1;
+ inline _uint64
+ _uint64 x)
+ return fmix64(x);
+ inline _uint64
+ const void* key,
+ int len)
+ const _uint8* data = (const _uint8*) key;
+ const int nblocks = len / 16;
+ _uint64 h1 = 0x460a5856818aaba3LL;
+ _uint64 h2 = h1;
+ const _uint64 c1 = BIG_CONSTANT(0x87c37b91114253d5);
+ const _uint64 c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+ //----------
+ // body
+ const _uint64 * blocks = (const _uint64 *)(data);
+ for(int i = 0; i < nblocks; i++)
+ {
+ _uint64 k1 = blocks[i*2+0];
+ _uint64 k2 = blocks[i*2+1];
+ k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+ h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+ k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+ h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+ }
+ //----------
+ // tail
+ const _uint8 * tail = (const _uint8*)(data + nblocks*16);
+ _uint64 k1 = 0;
+ _uint64 k2 = 0;
+ switch(len & 15)
+ {
+ case 15: k2 ^= ((_uint64)tail[14]) << 48;
+ case 14: k2 ^= ((_uint64)tail[13]) << 40;
+ case 13: k2 ^= ((_uint64)tail[12]) << 32;
+ case 12: k2 ^= ((_uint64)tail[11]) << 24;
+ case 11: k2 ^= ((_uint64)tail[10]) << 16;
+ case 10: k2 ^= ((_uint64)tail[ 9]) << 8;
+ case 9: k2 ^= ((_uint64)tail[ 8]) << 0;
+ k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+ case 8: k1 ^= ((_uint64)tail[ 7]) << 56;
+ case 7: k1 ^= ((_uint64)tail[ 6]) << 48;
+ case 6: k1 ^= ((_uint64)tail[ 5]) << 40;
+ case 5: k1 ^= ((_uint64)tail[ 4]) << 32;
+ case 4: k1 ^= ((_uint64)tail[ 3]) << 24;
+ case 3: k1 ^= ((_uint64)tail[ 2]) << 16;
+ case 2: k1 ^= ((_uint64)tail[ 1]) << 8;
+ case 1: k1 ^= ((_uint64)tail[ 0]) << 0;
+ k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+ };
+ //----------
+ // finalization
+ h1 ^= len; h2 ^= len;
+ h1 += h2;
+ h2 += h1;
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+ h1 += h2;
+ h2 += h1;
+ return h2;
+struct IdPair
+ unsigned id, value;
+ bool operator==(const IdPair& b) const
+ {
+ return id == b.id && value == b.value;
+ }
+ static bool comparator(const IdPair& a, const IdPair& b)
+ {
+ return a.id < b.id || (a.id == b.id && a.value < b.value);
+ }
+ static bool valueComparator(const IdPair& a, const IdPair& b)
+ {
+ return a.value < b.value || (a.value == b.value && a.id < b.id);
+ }
+ static bool valueComparatorDescending(const IdPair& a, const IdPair& b)
+ {
+ return a.value > b.value;
+ }
+ IdPair() : id(0), value(0) {}
+ IdPair(unsigned i_id, unsigned i_value) : id(i_id), value(i_value){}
+ // for use as key in VariableSizeMap
+ IdPair(int i) : id((unsigned) i), value(0) {}
+ bool operator==(int x) const
+ { return id == (unsigned) x && value == 0; }
+ bool operator!=(int x) const
+ { return id != (unsigned) x || value != 0; }
+ operator _uint64()
+ { return (((_uint64) id) << 32) | (_uint32) value; }
+struct IdIntPair
+ unsigned id;
+ int value;
+ bool operator==(const IdIntPair& b) const
+ {
+ return id == b.id && value == b.value;
+ }
+ static bool comparator(const IdIntPair& a, const IdIntPair& b)
+ {
+ return a.id < b.id || (a.id == b.id && a.value < b.value);
+ }
+ static bool valueComparator(const IdIntPair& a, const IdIntPair& b)
+ {
+ return a.value < b.value;
+ }
+ IdIntPair()
+ : id(0), value(0)
+ {}
+ IdIntPair(unsigned i_id, int i_value)
+ : id(i_id), value(i_value)
+ {}
+ IdIntPair(_uint64 x)
+ : id((unsigned) (x >> 32)), value((int) x)
+ {}
+ // for use as key in VariableSizeMap
+ IdIntPair(int i) : id((unsigned) i), value(0) {}
+ bool operator==(int x) const
+ { return id == (unsigned) x && value == 0; }
+ bool operator!=(int x) const
+ { return id != (unsigned) x || value != 0; }
+ operator _uint64()
+ { return (((_uint64) id) << 32) | (_uint32) value; }
+void memrevcpy(void* dst, const void* src, size_t bytes);
+} // namespace util
+_int64 FirstPowerOf2GreaterThanOrEqualTo(_int64 value);
+int cheezyLogBase2(_int64 value);
+// Check if a is within distance of b, coping properly with the vagaries of unsigneds.
+// There's a similar function for GenomeLocations defined in Genome.h.
+// The gap is computed by subtracting the smaller value from the larger one, so
+// nothing can wrap around; the earlier form added distance to a or b, which
+// overflows near UINT_MAX and then reported "not within" for values that were.
+inline bool isWithin(unsigned a, unsigned b, unsigned distance)
+ {
+ return a <= b ? (b - a <= distance) : (a - b <= distance);
+inline int getSignBit64(_int64 value)
+ return (value >> 63) & 1;
+inline int getSignBit32(int value)
+ return (value >> 31) & 1;
+// Utility class for synchronization: NWaiter.
+// This class is initialized with a number, n.
+// It has two public methods: wait() and signal().
+// wait() will block until signal() has been called n times.
+class NWaiter
+ void wait();
+ void signal();
+ NWaiter(size_t n);
+ ~NWaiter();
+ size_t _signalsRequired;
+ size_t _signalsReceived;
+ EventObject _waiter;
+ ExclusiveLock _lock;
diff --git a/SNAPLib/VariableSizeMap.h b/SNAPLib/VariableSizeMap.h
new file mode 100644
index 0000000..218d2b7
--- /dev/null
+++ b/SNAPLib/VariableSizeMap.h
@@ -0,0 +1,665 @@
+#pragma once
+#include "Compat.h"
+#include "BigAlloc.h"
+#include "VariableSizeVector.h"
+// A hash function for numeric types.
+template<typename T>
+class MapNumericHash
+ inline _uint64 operator() (T value) const {
+ return ((_uint64)value * 131);
+ }
+template<typename K, typename V>
+struct VariableSizeMapEntry
+ VariableSizeMapEntry() : key(), value() {}
+ VariableSizeMapEntry(K k, V v) : key(k), value(v) {}
+ K key;
+ V value;
+// A variable-size hash map that allows automatic growth
+// and does not perform any memory allocation except when growing.
+// Allows multi-threaded put, as long as growth=0 (i.e. fixed-size)
+// Shared base class for single- and multi-valued maps
+using std::max;
+using std::min;
+ typename K,
+ typename V,
+ int growth = 150,
+ typename Hash = MapNumericHash<K>,
+ int fill = 80,
+ int _empty = 0,
+ int _tombstone = -1,
+ bool multi = false,
+ bool _big = false>
+class VariableSizeMapBase
+ VariableSizeMapBase(int i_capacity = 16)
+ : entries(NULL), count(0), capacity(i_capacity), occupied(0)
+ {
+ reserve(max(16,i_capacity));
+ }
+ // Constructs a map over an already-serialized image located at *data:
+ // three _int64 header words (count, limit, occupied - the order in which
+ // writeFile() emits them) followed by i_capacity entries. The entries are
+ // used in place rather than copied, and *data is advanced past the image.
+ // NOTE(review): the destructor frees entries whenever it is non-NULL -
+ // confirm how callers prevent it from freeing memory this object did not allocate.
+ VariableSizeMapBase(void** data, unsigned i_capacity)
+ : entries((Entry*) (3 + (_int64*) *data)),
+ capacity(i_capacity),
+ count((int) ((_int64*)*data)[0]),
+ limit((int) ((_int64*)*data)[1]),
+ occupied((int) ((_int64*)*data)[2])
+ {
+ // skip the entry array plus the three header words
+ *data = ((char*)*data) + (size_t) i_capacity * sizeof(Entry) + 3 * sizeof(_int64);
+ }
+ // Enlarges the table to capacity * growth / 100 slots and rehashes every
+ // live entry (via reserve). Only meaningful when growth > 100; a
+ // fixed-size map (growth == 0) must never reach this.
+ inline void grow()
+ {
+ _ASSERT(growth > 100);
+ // computed in 64 bits so the multiplication cannot overflow an int
+ _int64 larger = ((_int64) capacity * growth) / 100;
+ _ASSERT(larger < INT32_MAX);
+ reserve((int) larger);
+ }
+ // Takes over other's table (move semantics): frees this map's storage,
+ // adopts other's entries, capacity, counters and hash, and leaves other
+ // with no entries and a count of 0. other keeps its capacity, so init()
+ // can lazily reallocate it on the next access.
+ inline void assign(VariableSizeMapBase<K,V>* other)
+ {
+ // Self-assignment used to free the table and then drop the dangling
+ // pointer, silently emptying the map; treat it as a no-op instead.
+ // Compared through void* because the parameter type may name a
+ // different instantiation than *this.
+ if ((void*) other == (void*) this) {
+ return;
+ }
+ if (entries != NULL) {
+ if (_big) {
+ BigDealloc(entries);
+ } else {
+ delete [] entries;
+ }
+ }
+ entries = other->entries;
+ capacity = other->capacity;
+ count = other->count;
+ limit = other->limit;
+ hash = other->hash;
+ occupied = other->occupied;
+ other->entries = NULL;
+ other->count = 0;
+ }
+ inline int size()
+ { return count; }
+ inline int getCapacity()
+ { return capacity; }
+ ~VariableSizeMapBase()
+ {
+ if (entries != NULL) {
+ if (_big) {
+ BigDealloc(entries);
+ } else {
+ delete [] entries;
+ }
+ }
+ entries = NULL;
+ count = 0;
+ }
+ // Replaces the table with a freshly allocated one of `larger` slots and
+ // re-inserts every live entry of the old table into it. Tombstones are not
+ // carried over, so occupied == count after a rehash. Also recomputes limit.
+ void reserve(int larger)
+ {
+ Entry* old = entries;
+ int small = capacity; // capacity of the old table, needed to walk it below
+ capacity = larger;
+ if (_big) {
+ // raw allocation: no Entry constructors run, clear() below sets every key
+ entries = (Entry*) BigAlloc(larger * sizeof(Entry));
+ } else {
+ entries = new Entry[larger];
+ }
+ _ASSERT(entries != NULL);
+ clear();
+ count = 0;
+ // grow before it gets to a certain fraction; always leave 1 slot for empty sentinel
+ limit = growth == 0 ? capacity - 1 : min(capacity - 1, (int) (((_int64) capacity * fill) / 100));
+ _ASSERT(limit > 0);
+ if (old != NULL) {
+ for (int i = 0; i < small; i++) {
+ K k = old[i].key;
+ if (k != _empty && k != _tombstone) {
+ // scan() already probes the new table: entries and capacity were replaced above
+ Entry* p = this->scan(k, true);
+ _ASSERT(p != NULL);
+ p->key = k;
+ p->value = old[i].value;
+ count++;
+ }
+ }
+ occupied = count;
+ if (_big) {
+ BigDealloc(old);
+ } else {
+ delete [] old;
+ }
+ }
+ }
+ // Marks every slot empty and resets count and occupied to 0. Capacity and
+ // the allocation itself are kept.
+ void clear()
+ {
+ if (entries != NULL) {
+ if (_empty == 0 && sizeof(Entry) < 4 * sizeof(K) && ! _big) {
+ // optimize zero case
+ // (zeroes the whole entry, values included, in a single pass)
+ memset(entries, 0, capacity * sizeof(Entry));
+ } else {
+ // general case: only the keys are reset, values are left as they were
+ const K e(_empty);
+ for (int i = 0; i < capacity; i++) {
+ entries[i].key = e;
+ }
+ }
+ }
+ count = occupied = 0;
+ }
+ typedef VariableSizeMapEntry<K,V> Entry;
+ typedef Entry* iterator;
+ // Returns an iterator to the first live entry (key neither _empty nor
+ // _tombstone), or end() when there is none.
+ iterator begin()
+ {
+ Entry* final = end();
+ // entries is NULL once assign() has moved the table away; there is
+ // nothing to walk, and &entries[-1] would not compare below end().
+ if (entries == NULL) {
+ return final;
+ }
+ // Start at slot 0 directly instead of forming &entries[-1] (a pointer
+ // before the start of the array) just so next() can pre-increment it.
+ Entry* x = entries;
+ while (x < final && (x->key == _empty || x->key == _tombstone)) {
+ x++;
+ }
+ return x;
+ }
+ // Steps x forward to the following live entry, skipping empty slots and
+ // tombstones; yields the one-past-the-end pointer when none remains. An
+ // iterator that is already at (or beyond) the end is returned unchanged.
+ iterator next(iterator x)
+ {
+ Entry* const stop = &entries[capacity];
+ if (! (x < stop)) {
+ return x;
+ }
+ for (x++; x < stop; x++) {
+ if (! (x->key == _empty || x->key == _tombstone)) {
+ break;
+ }
+ }
+ return x;
+ }
+ iterator end()
+ {
+ return &entries[capacity];
+ }
+ iterator find(K key)
+ {
+ Entry* p = this->scan(key, false);
+ return p != NULL ? p : end();
+ }
+ void writeFile(LargeFileHandle* file)
+ {
+ _int64 x = (_int64) count;
+ WriteLargeFile(file, &x, sizeof(_int64));
+ x = (_int64) limit;
+ WriteLargeFile(file, &x, sizeof(_int64));
+ x = (_int64) occupied;
+ WriteLargeFile(file, &x, sizeof(_int64));
+ WriteLargeFile(file, entries, sizeof(Entry) * (size_t) capacity);
+ }
+ static const int MaxQuadraticProbes = 3;
+ // Starts a probe sequence for key: pos receives the home slot
+ // (hash(key) % capacity) and i the probe counter, starting at 1.
+ // Also allocates the table if it is currently missing.
+ void init(int& pos, int& i, K key)
+ {
+ _ASSERT(key != _empty && key != _tombstone);
+ pos = hash(key) % capacity;
+ i = 1;
+ if (entries == NULL) {
+ // the table was moved away by assign(); recreate it at the same capacity
+ reserve(capacity);
+ }
+ }
+ // Moves pos to the next slot of the probe sequence and bumps i. The first
+ // MaxQuadraticProbes steps use increasing strides (1, 2, 3); after that
+ // the stride is 1. Returns false, with pos set to capacity, once
+ // capacity + MaxQuadraticProbes probes have been made. key is not used.
+ bool advance(int& pos, int& i, K key) const
+ {
+ if (i >= capacity + MaxQuadraticProbes) {
+ pos = capacity;
+ return false;
+ }
+ pos = (pos + (i <= MaxQuadraticProbes ? i : 1)) % capacity;
+ i++;
+ return true;
+ }
+ // Core probe loop shared by lookups and insertions.
+ // add == false: returns the slot holding key, or NULL if an empty slot is
+ // reached first or the probe sequence is exhausted.
+ // add == true: returns the slot an insertion should use - the slot already
+ // holding key (single-valued maps only), the first tombstone
+ // (single-valued maps only) or the first empty slot; NULL if
+ // the probe sequence is exhausted.
+ Entry* scan(K key, bool add)
+ {
+ int pos;
+ int i;
+ init(pos, i, key);
+ if (pos == capacity) {
+ return NULL;
+ }
+ while (true) {
+ Entry* p = &entries[pos];
+ K k = p->key;
+ if (k == key && ! (multi && add)) {
+ // a multimap insertion never stops at an equal key: it always takes a new slot
+ return p;
+ } else if (k == _empty) {
+ return add ? p : NULL;
+ } else if (add && k == _tombstone && ! multi) {
+ return p;
+ } else if (! advance(pos, i, key)) {
+ return NULL;
+ }
+ }
+ }
+ Entry *entries;
+ int capacity;
+ int count;
+ int occupied; // number of non-empty slots (includes tombstones)
+ int limit; // current limit (capacity * fill / 100)
+ Hash hash;
+// Single-valued map
+template< typename K, typename V, int growth = 150, typename Hash = MapNumericHash<K>,
+ int fill = 80, int _empty = 0, int _tombstone = -1, bool _big = false >
+class VariableSizeMap
+ : public VariableSizeMapBase<K,V,growth,Hash,fill,_empty,_tombstone,false,_big>
+ VariableSizeMap(int i_capacity = 16)
+ : VariableSizeMapBase<K,V,growth,Hash,fill,_empty,_tombstone,false,_big>(i_capacity)
+ {}
+ // "Copy" constructor that actually transfers ownership: the cast strips
+ // const and assign() takes over other's table, leaving other without
+ // entries and with count 0.
+ VariableSizeMap(const VariableSizeMap<K,V>& other)
+ {
+ this->assign((VariableSizeMapBase<K,V>*)&other);
+ }
+ // Constructs a map over a serialized image at *data (see the base class
+ // constructor taking the same arguments). The base initializer passes
+ // _big so that it names this class's actual base for every
+ // instantiation, not only for _big == false.
+ VariableSizeMap(void** data, unsigned i_capacity)
+ : VariableSizeMapBase<K,V,growth,Hash,fill,_empty,_tombstone,false,_big>(data, i_capacity)
+ {
+ }
+ typedef VariableSizeMapEntry<K,V> Entry;
+ inline void operator=(const VariableSizeMap<K,V>& other)
+ {
+ this->assign((VariableSizeMapBase<K,V>*)&other);
+ }
+ ~VariableSizeMap()
+ {}
+ inline bool tryGet(K key, V* o_value)
+ {
+ Entry* p = this->scan(key, false);
+ if (p != NULL) {
+ *o_value = p->value;
+ }
+ return p != NULL;
+ }
+ inline V* tryFind(K key)
+ {
+ Entry* p = this->scan(key, false);
+ return p != NULL ? &p->value : NULL;
+ }
+ inline V get(K key)
+ {
+ Entry* p = this->scan(key, false);
+ _ASSERT(p != NULL);
+ return p->value;
+ }
+ bool erase(K key)
+ {
+ Entry* p = this->scan(key, false);
+ if (p != NULL) {
+ p->key = K(_tombstone);
+ this->count--;
+ }
+ return p != NULL;
+ }
+ // Returns a reference to the value stored for key. Unlike std::map this
+ // never inserts: the key must already be present (asserted); without
+ // assertions a missing key dereferences NULL.
+ inline V& operator[](K key)
+ {
+ Entry* p = this->scan(key, false);
+ _ASSERT(p != NULL);
+ return p->value;
+ }
+ inline void put(K key, V value)
+ {
+ V* p;
+ if (! tryAdd(key, value, &p)) {
+ *p = value;
+ }
+ }
+ inline V* getOrAdd(K key)
+ {
+ V* p = tryFind(key);
+ if (p == NULL) {
+ tryAdd(key, V(), &p);
+ }
+ return p;
+ }
+ // Inserts key -> value unless key is already present.
+ // Returns false and points *o_pvalue at the existing value when key is
+ // present (the stored value is left untouched); otherwise stores the
+ // pair, points *o_pvalue at the stored value and returns true.
+ // The table is grown when no slot can be found, or when taking a fresh
+ // empty slot would reach the fill limit.
+ inline bool tryAdd(K key, V value, V** o_pvalue)
+ {
+ while (true) {
+ Entry* p = this->scan(key, true);
+ if (p == NULL) {
+ // probe sequence exhausted: enlarge and retry
+ this->grow();
+ p = this->scan(key, true);
+ _ASSERT(p != NULL);
+ }
+ K prior = p->key;
+ if (prior == key) {
+ *o_pvalue = &p->value;
+ return false;
+ }
+ if (prior == _empty || prior == _tombstone) {
+ // single-threaded
+ p->key = key;
+ p->value = value;
+ this->count++;
+ // hack!! to get around gcc bug
+ int o = this->occupied;
+ int l = this->limit;
+ // reusing a tombstone does not raise the number of occupied slots
+ bool occupy = prior == _empty;
+ if (o < l || ! occupy) {
+ this->occupied += occupy;
+ *o_pvalue = &p->value;
+ } else {
+ this->grow();
+ *o_pvalue = &this->scan(key, false)->value; // lookup again after rehashing
+ }
+ return true;
+ }
+ }
+ }
+ // Swaps the contents of this map with other by exchanging the table
+ // pointer, capacity, count, limit and occupied. The hash member is not
+ // exchanged.
+ void exchange(VariableSizeMap& other)
+ {
+ Entry* e = this->entries; this->entries = other.entries; other.entries = e;
+ int x = this->capacity; this->capacity = other.capacity; other.capacity = x;
+ x = this->count; this->count = other.count; other.count = x;
+ x = this->limit; this->limit = other.limit; other.limit = x;
+ x = this->occupied; this->occupied = other.occupied; other.occupied = x;
+ }
+template< typename K, typename V>
+class VariableSizeMapBig
+ : public VariableSizeMap<K,V,150,MapNumericHash<K>,80,0,-1,true>
+ VariableSizeMapBig(int n = 10000) : VariableSizeMap<K,V,150,MapNumericHash<K>,80,0,-1,true>(n) {}
+ void assign(VariableSizeMapBig<K,V>* other)
+ {
+ // todo: avoid copying from base class, c++ inheritance is nonsensical
+ if (this->entries != NULL) {
+ BigDealloc(this->entries);
+ }
+ this->entries = other->entries;
+ this->capacity = other->capacity;
+ this->count = other->count;
+ this->limit = other->limit;
+ this->hash = other->hash;
+ this->occupied = other->occupied;
+ other->entries = NULL;
+ other->count = 0;
+ }
+typedef VariableSizeMap<unsigned,unsigned> IdMap;
+typedef VariableSizeMap<unsigned,int> IdIntMap;
+// Multi-valued map: a key may be associated with any number of values
+template< typename K, typename V, int growth = 150, typename Hash = MapNumericHash<K>, int fill = 80, int _empty = 0, int _tombstone = -1, bool _big = false >
+class VariableSizeMultiMap
+ : public VariableSizeMapBase<K,V,growth,Hash,fill,_empty,_tombstone,true, _big >
+ // Constructors. Every base initializer passes _big so that it names this
+ // class's actual base for all instantiations, not only for _big == false.
+
+ // Creates an empty multimap with room for at least i_capacity slots.
+ VariableSizeMultiMap(int i_capacity = 16)
+ : VariableSizeMapBase<K,V,growth,Hash,fill,_empty,_tombstone,true,_big>(i_capacity)
+ {}
+ // Transfers ownership: assign() takes over other's table and leaves other
+ // without entries.
+ VariableSizeMultiMap(VariableSizeMultiMap& other)
+ : VariableSizeMapBase<K,V,growth,Hash,fill,_empty,_tombstone,true,_big>(other.capacity)
+ {
+ this->assign(&other);
+ }
+ // Constructs a multimap over a serialized image at *data (see the base
+ // class constructor taking the same arguments).
+ VariableSizeMultiMap(void** data, unsigned i_capacity)
+ : VariableSizeMapBase<K,V,growth,Hash,fill,_empty,_tombstone,true,_big>(data, i_capacity)
+ {
+ }
+ typedef VariableSizeMapEntry<K,V> Entry;
+ inline void operator=(VariableSizeMultiMap<K,V> other)
+ {
+ this->assign(&other);
+ }
+ ~VariableSizeMultiMap()
+ {}
+ // Iterator over all entries stored under one key. It follows the same
+ // probe sequence as the map (init/advance) and stops at each slot holding
+ // the key. Typical use:
+ // for (valueIterator i = getAll(key); i.hasValue(); i.next()) { ... }
+ class valueIterator
+ {
+ public:
+ // True while the iterator rests on a slot that is in range and not empty.
+ bool hasValue()
+ { return pos < map->capacity && map->entries[pos].key != _empty; }
+ // Both accessors yield a pointer to the current entry.
+ Entry* operator*() const
+ { _ASSERT(pos < map->capacity); return &map->entries[pos]; }
+ Entry* operator->() const
+ { _ASSERT(pos < map->capacity); return &map->entries[pos]; }
+ // Advances to the next slot holding key, or to an empty slot (end of
+ // the chain). pos becomes capacity when the probe sequence is exhausted.
+ void next()
+ {
+ if (hasValue()) {
+ K k;
+ do {
+ if (! map->advance(pos, i, key)) {
+ pos = map->capacity;
+ return;
+ }
+ } while ((k = map->entries[pos].key) != key && k != _empty);
+ }
+ }
+ // Default-constructed iterators have no map and must not be used
+ // before being assigned from a real one.
+ valueIterator()
+ : map(NULL), pos(0), i(0), key()
+ {
+ }
+ valueIterator(const valueIterator& other)
+ : map(other.map), pos(other.pos), i(other.i), key(other.key)
+ {}
+ void operator= (const valueIterator& other)
+ {
+ map = other.map;
+ pos = other.pos;
+ i = other.i;
+ key = other.key;
+ }
+ private:
+ // Positions the iterator on the first slot holding i_key, if any.
+ valueIterator(VariableSizeMultiMap* i_map, K i_key)
+ : map(i_map), key(i_key)
+ {
+ map->init(pos, i, key);
+ K k = map->entries[pos].key;
+ if (k != key && k != _empty) {
+ next(); // skip tombstones & other keys
+ }
+ }
+ friend class VariableSizeMultiMap;
+ VariableSizeMultiMap* map; // map being iterated
+ int pos; // current slot index; capacity means exhausted
+ int i; // probe counter handed to advance()
+ K key; // key whose values are enumerated
+ };
+ friend class valueIterator;
+ inline valueIterator getAll(K key)
+ {
+ return valueIterator(this, key);
+ }
+ inline bool hasKey(K key)
+ {
+ return getAll(key).hasValue(); // todo: optimize
+ }
+ inline bool contains(K key, V value)
+ {
+ for (valueIterator i = getAll(key); i.hasValue(); i.next()) {
+ if (i->value == value) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // always add even if value exists for key
+ // Appends key -> value in a new slot without checking for duplicates.
+ // Grows the table first when the fill limit has been reached, and again
+ // if no empty slot can be found afterwards.
+ inline void add(K key, V value)
+ {
+ if (this->occupied >= this->limit) {
+ this->grow();
+ }
+ Entry* p = this->scan(key, true);
+ if (p == NULL) {
+ // full of tombstones
+ this->grow();
+ p = this->scan(key, true);
+ _ASSERT(p != NULL);
+ }
+ // only a previously empty slot raises the occupied count
+ this->occupied += p->key == _empty;
+ p->key = key;
+ p->value = value;
+ this->count++;
+ }
+ // if key-value exists, return false; else add & return true
+ // Adds key -> value unless that exact pair is already present.
+ // Walks the probe sequence for key, remembering the first reusable slot
+ // (tombstone or empty) in case no matching pair is found. Returns false
+ // if the pair exists, true after inserting it.
+ inline bool put(K key, V value)
+ {
+ int pos; int i;
+ this->init(pos, i, key);
+ if (pos == this->capacity) {
+ this->add(key, value);
+ return true;
+ }
+ Entry* slot = NULL;
+ while (pos != this->capacity) {
+ Entry* p = &this->entries[pos];
+ if (p->key == key) {
+ if (p->value == value) {
+ return false;
+ }
+ // keep looking...
+ } else if (p->key == _tombstone) {
+ if (slot == NULL) {
+ slot = p; // remember in case we don't find a match
+ }
+ // keep looking...
+ } else if (p->key == _empty) {
+ if (slot == NULL) {
+ slot = p;
+ }
+ break; // got to end with no match
+ }
+ if (! this->advance(pos, i, key)) {
+ break;
+ }
+ }
+ if (slot != NULL) {
+ _ASSERT(slot->key == _empty || slot->key == _tombstone);
+ // hack!! to get around gcc bug
+ int o = this->occupied;
+ int l = this->limit;
+ // reusing a tombstone does not raise the number of occupied slots
+ bool occupy = slot->key == _empty;
+ if (o < l || ! occupy) {
+ slot->key = key;
+ slot->value = value;
+ this->count++;
+ this->occupied += occupy;
+ } else {
+ // at the fill limit: enlarge, then redo the whole search in the new table
+ this->grow();
+ return this->put(key, value);
+ }
+ } else {
+ // probe sequence exhausted without a reusable slot; add() grows if needed
+ this->add(key, value);
+ }
+ _ASSERT(this->contains(key, value));
+ return true;
+ }
+ inline bool erase(K key, V value)
+ {
+ for (valueIterator i = getAll(key); i.hasValue(); i.next()) {
+ if (i->value == value) {
+ i->key = _tombstone;
+ this->count--;
+ return true;
+ }
+ }
+ return false;
+ }
+ inline int eraseAll(K key)
+ {
+ int n = 0;
+ for (valueIterator i = getAll(key); i.hasValue(); i.next()) {
+ i->key = _tombstone;
+ this->count--;
+ n++;
+ }
+ return n;
+ }
+ // whether a's values are a subset of b's values
+ inline bool isSubset(K a, K b)
+ {
+ for (valueIterator i = getAll(a); i.hasValue(); i.next()) {
+ if (! contains(b, i->value)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ inline bool intersects(K a, K b)
+ {
+ for (valueIterator i = getAll(a); i.hasValue(); i.next()) {
+ if (contains(b, i->value)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // Appends every value stored under key to o_result.
+ // NOTE(review): o_result is declared as a vector of K although values of
+ // type V are pushed into it; this only lines up when K and V are the same
+ // type (as in IdMultiMap) - confirm whether VariableSizeVector<V> was intended.
+ inline void getAll(K key, VariableSizeVector<K>& o_result)
+ {
+ for (valueIterator i = getAll(key); i.hasValue(); i.next()) {
+ o_result.push_back(i->value);
+ }
+ }
+typedef VariableSizeMultiMap<unsigned,unsigned> IdMultiMap;
diff --git a/SNAPLib/VariableSizeVector.h b/SNAPLib/VariableSizeVector.h
new file mode 100644
index 0000000..e32d3dc
--- /dev/null
+++ b/SNAPLib/VariableSizeVector.h
@@ -0,0 +1,250 @@
+#pragma once
+#include "Util.h"
+// A variable-size vector that does not perform any memory allocation except to grow.
+template<typename V, int grow = 150, bool big = false>
+class VariableSizeVector
+ // Allocates bytes of raw storage, from BigAlloc when big is set and from
+ // malloc otherwise. Requests above 8MB (1 << 23) that are not served by
+ // BigAlloc are reported through WriteErrorMessage but still satisfied.
+ inline static void* allocate(size_t bytes)
+ {
+ if (bytes > (1L << 23) && ! big) {
+ // %lld expects a long long; passing a size_t unconverted is a format/argument mismatch
+ WriteErrorMessage("%s: allocate %lld - consider using BigAlloc\n", __FUNCTION__, (long long) bytes);
+ }
+ return big ? BigAlloc(bytes) : malloc(bytes);
+ }
+ inline static void deallocate(void* p)
+ {
+ if (big) { BigDealloc(p); } else { free(p); }
+ }
+ VariableSizeVector(int i_capacity = 16)
+ : entries(NULL), count(0), capacity(i_capacity)
+ {}
+ // "Copy" constructor that actually transfers ownership: the new vector
+ // takes other's buffer, and other is left with no buffer and count 0
+ // (its capacity is kept).
+ VariableSizeVector(VariableSizeVector& other)
+ : entries(other.entries), count(other.count), capacity(other.capacity)
+ {
+ other.count = 0;
+ other.entries = NULL;
+ }
+ ~VariableSizeVector()
+ {
+ if (entries != NULL) {
+ deallocate(entries);
+ entries = NULL;
+ count = 0;
+ }
+ }
+ // Guarantees room for at least one more element: allocates the buffer on
+ // first use and enlarges it by the grow percentage once it is full.
+ inline void increase()
+ {
+ if (entries == NULL) {
+ reserve(capacity);
+ }
+ if (count >= capacity) {
+ // count * grow / 100 equals count for very small vectors (0 or 1
+ // elements with grow == 150), which used to leave the buffer full and
+ // let the following store run past it; always gain at least one slot.
+ // Kept in 64 bits: reserve() takes an _int64, so no truncation to int.
+ _int64 larger = (count * grow) / 100;
+ if (larger <= count) {
+ larger = count + 1;
+ }
+ reserve(larger);
+ }
+ }
+ // Assignment with transfer of ownership: this vector takes other's buffer
+ // and other is left with no buffer and count 0.
+ void operator=(VariableSizeVector<V>& other)
+ {
+ // Self-assignment would otherwise drop our own buffer below.
+ if ((void*) &other == (void*) this) {
+ return;
+ }
+ // Release the buffer we currently own; it used to be overwritten and leaked.
+ if (entries != NULL) {
+ deallocate(entries);
+ }
+ entries = other.entries;
+ capacity = other.capacity;
+ count = other.count;
+ other.entries = NULL;
+ other.count = 0;
+ }
+ // Ensures the buffer exists and holds at least newCapacity elements. Never
+ // shrinks. Existing elements are relocated with memcpy, i.e. bytewise,
+ // without running copy constructors.
+ void reserve(_int64 newCapacity)
+ {
+ _ASSERT(newCapacity >= 0);
+ if (newCapacity <= capacity && entries != NULL) {
+ return;
+ }
+ V* old = entries;
+ // when entries is NULL the stored capacity still acts as a lower bound
+ capacity = __max(newCapacity, capacity);
+ // NOTE(review): the result of allocate() is not checked for NULL before use
+ entries = (V*) allocate(capacity * sizeof(V));
+ if (old != NULL) {
+ memcpy(entries, old, count * sizeof(V));
+ deallocate(old);
+ }
+ }
+ inline void clear()
+ {
+ count = 0;
+ }
+ inline void clean()
+ {
+ if (entries != NULL) {
+ deallocate(entries);
+ entries = NULL;
+ count = 0;
+ }
+ }
+ inline _int64 size() const
+ {
+ return count;
+ }
+ void truncate(int newCount)
+ {
+ if (newCount < count) {
+ count = newCount;
+ }
+ }
+ // Appends a copy of value, allocating or enlarging the buffer as needed.
+ inline void push_back(V& value)
+ {
+ if (entries == NULL) {
+ reserve(capacity);
+ }
+ if (count >= capacity) {
+ // count * grow / 100 equals count for very small vectors (0 or 1
+ // elements with grow == 150), which used to leave the buffer full and
+ // let the store below run past it; always gain at least one slot.
+ // Kept in 64 bits: reserve() takes an _int64, so no truncation to int.
+ _int64 larger = (count * grow) / 100;
+ if (larger <= count) {
+ larger = count + 1;
+ }
+ reserve(larger);
+ }
+ _ASSERT(count < capacity);
+ entries[count++] = value;
+ }
+ inline void push_back(const V& value)
+ {
+ increase();
+ _ASSERT(count < capacity);
+ entries[count++] = value;
+ }
+ // Appends all elements of other to this vector. other is not modified.
+ // Elements are copied bytewise with memcpy.
+ inline void append(VariableSizeVector<V>* other)
+ {
+ if (other->count == 0) {
+ return;
+ }
+ // grows to exactly the combined size if the current buffer is too small
+ reserve(count + other->count);
+ // todo: allow for operator assign/copy constructor?
+ memcpy(&entries[count], other->entries, other->count * sizeof(V));
+ count += other->count;
+ }
+ typedef bool comparator(const V& a, const V& b);
+ inline int insertionIndex(const V& value, comparator compare, bool before = false)
+ {
+ V* p = before ? std::lower_bound(entries, entries + count, value, compare)
+ : std::upper_bound(entries, entries + count, value, compare);
+ int index = (int) (p - entries);
+ _ASSERT(index >= 0 && index <= count);
+ return index;
+ }
+ // insert into sorted list, AFTER existing elements with same value
+ // Inserts value into a vector that is already sorted according to compare
+ // and returns the index at which it was placed. With before == false (the
+ // default) it goes after existing equal elements, with before == true in
+ // front of them. Later elements are shifted up bytewise with memmove.
+ inline int insert(const V& value, comparator compare, bool before = false)
+ {
+ // the index is computed before any reallocation; it stays valid because it is an offset, not a pointer
+ int index = insertionIndex( value, compare, before);
+ increase(); // todo: could fold memmove into new array copy to save time...
+ _ASSERT(count < capacity);
+ if (index < count) {
+ memmove(entries + (index + 1), entries + index, (count - index) * sizeof(V));
+ }
+ entries[index] = value;
+ count++;
+ return index;
+ }
+ inline bool add(const V& value)
+ {
+ for (int i = 0; i < count; i++) {
+ if (entries[i] == value) {
+ return false;
+ }
+ }
+ push_back(value);
+ return true;
+ }
+ // Removes the element at index, closing the gap by moving the following
+ // elements down one position. An out-of-range index trips the assertion
+ // and is otherwise ignored.
+ inline void erase(_int64 index)
+ {
+ _ASSERT(index >= 0 && index < count);
+ if (! (index >= 0 && index < count)) {
+ return;
+ }
+ const _int64 trailing = count - index - 1; // elements located after the erased one
+ if (trailing > 0) {
+ memmove(&entries[index], &entries[index + 1], trailing * sizeof(V));
+ }
+ count--;
+ }
+ // Grows the vector to size elements, filling the new elements with zero
+ // bytes. Does nothing if the vector already holds at least size elements.
+ inline void extend(int size)
+ {
+ if (count < size) {
+ reserve(size);
+ memset(((V*)entries) + count, 0, sizeof(V) * (size - count));
+ count = size;
+ }
+ }
+ inline V& operator[](_int64 index) const
+ {
+ _ASSERT(index >= 0 && index < count);
+ return entries[index];
+ }
+ typedef V* iterator;
+ inline iterator findRange(const V& low, const V& high, comparator compare, iterator* o_end)
+ {
+ *o_end = entries + insertionIndex(high, compare, true);
+ return entries + insertionIndex(low, compare, true);
+ }
+ // unsorted search
+ inline iterator search(const V& value)
+ {
+ for (iterator i = begin(); i != end(); i++) {
+ if (*i == value) {
+ return i;
+ }
+ }
+ return end();
+ }
+ iterator begin()
+ {
+ return entries;
+ }
+ iterator end()
+ {
+ return &entries[count];
+ }
+ // Removes the element p points at, moving the following elements down one
+ // position bytewise. p must point into the live range of this vector; this
+ // is only checked by the assertion.
+ inline void remove(iterator p)
+ {
+ _ASSERT(p >= entries && p < entries + count);
+ if (p < entries + count - 1) {
+ memmove(p, p + 1, (count - (p - entries) - 1) * sizeof(V));
+ }
+ count--;
+ }
+ V *entries;
+ _int64 capacity;
+ _int64 count;
+typedef VariableSizeVector<unsigned> IdVector;
+typedef VariableSizeVector<int> IntVector;
+using util::IdPair;
+using util::IdIntPair;
+typedef VariableSizeVector<IdPair> IdPairVector;
+typedef VariableSizeVector<IdIntPair> IdIntPairVector;
diff --git a/SNAPLib/WindowsFileMapper.h b/SNAPLib/WindowsFileMapper.h
new file mode 100644
index 0000000..10a07ca
--- /dev/null
+++ b/SNAPLib/WindowsFileMapper.h
@@ -0,0 +1,36 @@
+Module Name:
+ WindowsFileMapper.h
+ Header for support code for file mapping on Windows
+ Bill Bolosky, November, 2012
+ User mode service.
+Revision History:
+#pragma once
+#ifdef _MSC_VER
+class WindowsFileMapper {
+ WindowsFileMapper();
+ bool init(const char *fileName);
+ const _int64 getFileSize();
+ char *createMapping(size_t offset);
+ void deleteMapping();
+#endif // _MSC_VER
\ No newline at end of file
diff --git a/SNAPLib/directions.h b/SNAPLib/directions.h
new file mode 100644
index 0000000..f3ca5f4
--- /dev/null
+++ b/SNAPLib/directions.h
@@ -0,0 +1,36 @@
+Module Name:
+ directions.h
+ Definitions for basic read directions (forward & reverse complement)
+ Bill Bolosky, January, 2013
+ User mode service.
+Revision History:
+#pragma once
+const int NUM_DIRECTIONS = 2; // Forward and reverse complement
+// A Direction is one of the two constants below.
+typedef int Direction;
+const int FORWARD = 0;
+const int RC = 1; // reverse complement
+inline Direction OppositeDirection(Direction direction) {
+ _ASSERT(FORWARD == direction || RC == direction);
+ return 1-direction;
\ No newline at end of file
diff --git a/SNAPLib/exit.cpp b/SNAPLib/exit.cpp
new file mode 100644
index 0000000..2695bda
--- /dev/null
+++ b/SNAPLib/exit.cpp
@@ -0,0 +1,42 @@
+Module Name:
+ exit.cpp
+ SNAP soft exit function
+ Bill Bolosky, February, 2013
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "exit.h"
+#include "Error.h"
+// This exists solely as a place to set a breakpoint when debugging SNAP. It gets called both from the
+// soft_exit function and also from a few places in the code where we want to exit without printing the
+// warning message (like after printing the usage string), which was causing confusion.
+void soft_exit_no_print(int n)
+ exit(n);
+void soft_exit_function(int n, const char *fileName, int lineNum)
+ WriteErrorMessage("SNAP exited with exit code %d from line %d of file %s\n", n, lineNum, fileName);
+ soft_exit_no_print(n);
diff --git a/SNAPLib/exit.h b/SNAPLib/exit.h
new file mode 100644
index 0000000..2cd86e4
--- /dev/null
+++ b/SNAPLib/exit.h
@@ -0,0 +1,30 @@
+Module Name:
+ exit.h
+ Header for SNAP soft exit function
+ Bill Bolosky, February, 2013
+ User mode service.
+Revision History:
+#pragma once
+#define soft_exit(n) soft_exit_function(n, __FILE__, __LINE__)
+void soft_exit_no_print(int n);
+void soft_exit_function(int n, const char *fileName, int lineNum);
diff --git a/SNAPLib/mapq.cpp b/SNAPLib/mapq.cpp
new file mode 100644
index 0000000..d4b831e
--- /dev/null
+++ b/SNAPLib/mapq.cpp
@@ -0,0 +1,45 @@
+Module Name:
+ mapq.cpp
+ Support functions for mapping quality
+ Bill Bolosky, December, 2012
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Compat.h"
+#include "mapq.h"
+const int maxMAPQ = 70;
+static double mapqToProbabilityTable[maxMAPQ+1];
+void initializeMapqTables()
+ mapqToProbabilityTable[0] = .1; // This should technically be 0, but in practice it's a little better than that, so leave some chance here.
+ for (int i = 1; i <= maxMAPQ; i++) {
+ mapqToProbabilityTable[i] = 1- pow(10.0,((double)i) / -10.0);
+ }
+double mapqToProbability(int mapq)
+ _ASSERT(mapq >= 0 && mapq <= maxMAPQ);
+ return mapqToProbabilityTable[mapq];
diff --git a/SNAPLib/mapq.h b/SNAPLib/mapq.h
new file mode 100644
index 0000000..d288878
--- /dev/null
+++ b/SNAPLib/mapq.h
@@ -0,0 +1,68 @@
+Module Name:
+ mapq.h
+ Support functions for mapping quality
+ Bill Bolosky, December, 2012
+ User mode service.
+Revision History:
+#pragma once
+#include "directions.h"
+void initializeMapqTables();
+double mapqToProbability(int mapq); // The probability of a match for the given MAPQ
+// Computes the mapping quality for an alignment:
+// -10 * log10(1 - probabilityOfBestCandidate / probabilityOfAllCandidates),
+// capped at 70, then reduced by (popularSeedsSkipped - 10) / 2 when more than
+// 10 popular seeds were skipped, and never below 0. score is only used by
+// the trace output.
+inline int computeMAPQ(
+ double probabilityOfAllCandidates,
+ double probabilityOfBestCandidate,
+ int score,
+ int popularSeedsSkipped)
+ probabilityOfAllCandidates = __max(probabilityOfAllCandidates, probabilityOfBestCandidate); // You'd think this wouldn't be necessary, but floating point limited precision causes it to be.
+ _ASSERT(probabilityOfBestCandidate >= 0.0);
+ // Special case for MAPQ 70, which we generate only if there is no evidence of a mismatch at all.
+ // cheese is off, so no special casing MAPQ 70. If you want to turn it back on, return these three lines and then change the two instances of 70 to 69 in the
+ // next set below (baseMAPQ =, twice).
+ //
+// if (probabilityOfAllCandidates == probabilityOfBestCandidate && popularSeedsSkipped == 0 && score < 5) {
+// return 70;
+// }
+ // NOTE(review): if both probabilities are 0 this divides 0 by 0 - confirm callers never pass that.
+ double correctnessProbability = probabilityOfBestCandidate / probabilityOfAllCandidates;
+ int baseMAPQ;
+ if (correctnessProbability >= 1) {
+ baseMAPQ = 70;
+ } else {
+ baseMAPQ = __min(70, (int)(-10 * log10(1 - correctnessProbability)));
+ }
+ //
+ // Apply a penalty based on the number of overly popular seeds in the read
+ //
+ baseMAPQ = __max(0, baseMAPQ - __max(0, popularSeedsSkipped-10) / 2);
+ // NOTE(review): location is not declared anywhere in the visible text - presumably this trace output sat inside a conditional-compilation block; verify against the real header.
+ printf("computeMAPQ called at %u: score %d, pThis %g, pAll %g, result %d\n",
+ location, score, probabilityOfBestCandidate, probabilityOfAllCandidates, baseMAPQ);
+ return baseMAPQ;
diff --git a/SNAPLib/options.h b/SNAPLib/options.h
new file mode 100644
index 0000000..a47243e
--- /dev/null
+++ b/SNAPLib/options.h
@@ -0,0 +1,32 @@
+Module Name:
+ options.h
+ Compile time options for cSNAP
+ Bill Bolosky, September, 2011
+ User mode service.
+ This class is NOT thread safe. It's the caller's responsibility to ensure that
+ at most one thread uses an instance at any time.
+Revision History:
+ Adapted from Matei Zaharia's Scala implementation.
+#pragma once
+#define USE_DEVTEAM_OPTIONS 1 // Options only for the development team, not for the release version
diff --git a/SNAPLib/stdafx.cpp b/SNAPLib/stdafx.cpp
new file mode 100644
index 0000000..e392466
--- /dev/null
+++ b/SNAPLib/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// SNAPLib.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/SNAPLib/stdafx.h b/SNAPLib/stdafx.h
new file mode 100644
index 0000000..4968795
--- /dev/null
+++ b/SNAPLib/stdafx.h
@@ -0,0 +1,53 @@
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+#pragma once
+#ifdef _MSC_VER
+#include "targetver.h"
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <ctype.h>
+#include <errno.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#ifdef _MSC_VER
+#include <tchar.h>
+#include <crtdbg.h>
+#include <windows.h>
+#include <direct.h>
+#include <wincrypt.h>
+#include <fcntl.h>
+#include <io.h>
+#else // _MSC_VER
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+// MAP_ANONYMOUS is called MAP_ANON on OS X
+#endif // _MSC_VER
diff --git a/SNAPLib/targetver.h b/SNAPLib/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/SNAPLib/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/apps/ComputeROC/.gitignore b/apps/ComputeROC/.gitignore
new file mode 100644
index 0000000..0f3a6b1
--- /dev/null
+++ b/apps/ComputeROC/.gitignore
@@ -0,0 +1,2 @@
diff --git a/apps/ComputeROC/ComputeROC.cpp b/apps/ComputeROC/ComputeROC.cpp
new file mode 100644
index 0000000..451e001
--- /dev/null
+++ b/apps/ComputeROC/ComputeROC.cpp
@@ -0,0 +1,431 @@
+Module Name:
+ ComputeROC.cpp
+ Take a SAM file with simulated reads and compute a ROC curve from it.
+ Bill Bolosky, December, 2012
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "SAM.h"
+#include "Bam.h"
+#include "Genome.h"
+#include "Compat.h"
+#include "Read.h"
+#include "RangeSplitter.h"
+#include "BigAlloc.h"
+//
+// Print the command-line synopsis and the meaning of each option to stderr, then
+// terminate the process with exit status 1.  Never returns.
+//
+void usage()
+{
+    // The synopsis lists every switch that main() recognizes, not just -b.
+    fprintf(stderr,"usage: ComputeROC genomeDirectory inputFile {-b | -c} {-v} {-e} {-70}\n");
+    fprintf(stderr," -b means to accept reads that match either end of the range regardless of RC\n");
+    fprintf(stderr," -c means to just count the number of reads that are aligned, not to worry about correctness\n");
+    fprintf(stderr," -v means to correct for the error in generating the wgsim coordinates in the Venter data\n");
+    fprintf(stderr," -e means to print out misaligned reads where the aligned location has a lower edit distance than the 'correct' one.\n");
+    fprintf(stderr," -70 means to print out any misaligned reads with MAPQ 70.\n");
+    fprintf(stderr,"You can specify only one of -b or -c\n");
+    exit(1);
+}
+ReadSupplierGenerator *readSupplierGenerator;
+volatile _int64 nRunningThreads;
+SingleWaiterObject allThreadsDone;
+const char *inputFileName;
+const Genome *genome;
+bool matchBothWays = false;
+bool justCount = false;
+bool venter = false;
+unsigned slackAmount = 151;
+bool printBetterErrors = false;
+bool printErrorsAtMAPQ70 = false;
+static const int MaxMAPQ = 70;
+const unsigned MaxEditDistance = 100;
+//
+// Per-worker tallies.  Each worker thread increments only its own ThreadContext;
+// main() sums the per-thread values after all workers have finished.
+//
+struct ThreadContext {
+    unsigned whichThread;       // Index of the owning worker; main() overwrites this before starting the thread.
+    // Counters indexed by MAPQ (0..MaxMAPQ).
+    _int64 countOfReads[MaxMAPQ+1];
+    _int64 countOfMisalignments[MaxMAPQ+1];
+    _int64 countOfMisalignetsWithBetterEditDistance[MaxMAPQ+1];
+    _int64 nUnaligned;
+    _int64 totalReads;
+    // The same counters, additionally broken down by edit distance (0..MaxEditDistance).
+    _int64 countOfReadsByEditDistance[MaxMAPQ+1][MaxEditDistance+1];
+    _int64 countOfMisalignmentsByEditDistance[MaxMAPQ+1][MaxEditDistance+1];
+    _int64 countOfMisalignetsWithBetterEditDistanceByEditDistance[MaxMAPQ+1][MaxEditDistance+1];
+    //
+    // Zero every counter.  whichThread is given a defined value too, so a context
+    // that has not yet been handed to a thread never holds an indeterminate index.
+    //
+    ThreadContext() : whichThread(0), nUnaligned(0), totalReads(0) {
+        for (int mapq = 0; mapq <= MaxMAPQ; mapq++) {
+            countOfReads[mapq] = 0;
+            countOfMisalignments[mapq] = 0;
+            countOfMisalignetsWithBetterEditDistance[mapq] = 0;
+            // MaxEditDistance is unsigned, so the index is unsigned as well (no mixed-sign comparison).
+            for (unsigned dist = 0; dist <= MaxEditDistance; dist++) {
+                countOfReadsByEditDistance[mapq][dist] = 0;
+                countOfMisalignmentsByEditDistance[mapq][dist] = 0;
+                countOfMisalignetsWithBetterEditDistanceByEditDistance[mapq][dist] = 0;
+            }
+        }
+    }
+};
+// Returns true exactly when x is one of the characters '0' through '9'.
+inline bool isADigit(char x) {
+    return '0' <= x && '9' >= x;
+}
+WorkerThreadMain(void *param) // param is the ThreadContext that this worker accumulates its counts into
+ ThreadContext *context = (ThreadContext *)param;
+ ReadSupplier *readSupplier = readSupplierGenerator->generateNewReadSupplier();
+ Read *read;
+ LandauVishkinWithCigar lv;
+ while (NULL != (read = readSupplier->getNextRead())) {
+ unsigned mapQ = read->getOriginalMAPQ();
+ unsigned genomeLocation = read->getOriginalAlignedLocation();
+ unsigned flag = read->getOriginalSAMFlags();
+ if (flag & SAM_UNMAPPED) {
+ genomeLocation = 0xffffffff; // sentinel: tested below to count the read as unaligned
+ }
+ if (mapQ < 0 || mapQ > MaxMAPQ) { // mapQ is unsigned, so only the upper-bound test can ever fire
+ fprintf(stderr,"Invalid MAPQ: %d\n",mapQ); // NOTE(review): %d is handed an unsigned argument
+ exit(1);
+ }
+ context->totalReads++;
+ if (0xffffffff == genomeLocation) {
+ context->nUnaligned++;
+ } else if (justCount) {
+ context->countOfReads[mapQ]++;
+ } else if (!justCount) {
+ read->becomeRC();
+ }
+ const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation);
+ if (NULL == contig) {
+ fprintf(stderr,"couldn't find genome contig for offset %u\n",genomeLocation);
+ exit(1);
+ }
+ unsigned offsetA, offsetB;
+ bool matched;
+ const unsigned cigarBufLen = 1000;
+ char cigarForAligned[cigarBufLen];
+ const char *alignedGenomeData = genome->getSubstring(genomeLocation, 1);
+ int editDistance = lv.computeEditDistance(alignedGenomeData, read->getDataLength() + 20, read->getData(), read->getDataLength(), 30, cigarForAligned, cigarBufLen, false);
+ if (editDistance == -1 || editDistance > MaxEditDistance) {
+ editDistance = MaxEditDistance; // a -1 result and over-large distances share the last by-edit-distance slot
+ }
+ //
+ // Parse the read ID. The format is ChrName_OffsetA_OffsetB_?:<more stuff>. This would be simple to parse, except that
+ // ChrName can include "_". So, we parse it by looking for the first : and then working backward.
+ //
+ char idBuffer[10000]; // Hopefully big enough. I'm not worried about malicious input data here.
+ memcpy(idBuffer,read->getId(),read->getIdLength()); // NOTE(review): copy length is not checked against sizeof(idBuffer)
+ idBuffer[read->getIdLength()] = 0;
+ const char *firstColon = strchr(idBuffer,':');
+ bool badParse = true;
+ size_t chrNameLen;
+ const char *beginningOfSecondNumber;
+ const char *beginningOfFirstNumber; int stage = 0;
+ unsigned offsetOfCorrectChromosome;
+ if (NULL != firstColon && firstColon - 3 > idBuffer && (*(firstColon-1) == '?' || isADigit(*(firstColon - 1)))) {
+ //
+ // We've parsed backwards to see that we have at least #: or ?: where '#' is a digit and ? is literal. If it's
+ // a digit, then scan backwards through that number.
+ //
+ const char *underscoreBeforeFirstColon = firstColon - 2;
+ while (underscoreBeforeFirstColon > idBuffer && isADigit(*underscoreBeforeFirstColon)) {
+ underscoreBeforeFirstColon--;
+ }
+ if (*underscoreBeforeFirstColon == '_' && (isADigit(*(underscoreBeforeFirstColon - 1)) || *(underscoreBeforeFirstColon - 1) == '_')) {
+ stage = 1;
+ if (isADigit(*(underscoreBeforeFirstColon - 1))) {
+ beginningOfSecondNumber = firstColon - 3;
+ while (beginningOfSecondNumber > idBuffer && isADigit(*beginningOfSecondNumber)) {
+ beginningOfSecondNumber--;
+ }
+ beginningOfSecondNumber++; // That loop actually moved us back one char before the beginning;
+ } else {
+ //
+ // There's only one number, we have two consecutive underscores.
+ //
+ beginningOfSecondNumber = underscoreBeforeFirstColon;
+ }
+ if (beginningOfSecondNumber - 2 > idBuffer && *(beginningOfSecondNumber - 1) == '_' && isADigit(*(beginningOfSecondNumber - 2))) {
+ stage = 2;
+ beginningOfFirstNumber = beginningOfSecondNumber - 2;
+ while (beginningOfFirstNumber > idBuffer && isADigit(*beginningOfFirstNumber)) {
+ beginningOfFirstNumber--;
+ }
+ beginningOfFirstNumber++; // Again, we went one too far.
+ offsetA = -1; // the offsets are unsigned, so -1 wraps to the maximum value; compared against -1 again further down
+ offsetB = -1;
+ if (*(beginningOfFirstNumber - 1) == '_' && 1 == sscanf(beginningOfFirstNumber,"%u",&offsetA) &&
+ ('_' == *beginningOfSecondNumber || 1 == sscanf(beginningOfSecondNumber,"%u", &offsetB))) {
+ stage = 3;
+ chrNameLen = (beginningOfFirstNumber - 1) - idBuffer;
+ char correctChromosomeName[1000];
+ memcpy(correctChromosomeName, idBuffer, chrNameLen); // NOTE(review): chrNameLen is not checked against the 1000-byte buffer
+ correctChromosomeName[chrNameLen] = '\0';
+ if (venter && offsetB >= read->getDataLength()) {
+ offsetB -= read->getDataLength();
+ }
+ if (!genome->getOffsetOfContig(correctChromosomeName, &offsetOfCorrectChromosome)) {
+ fprintf(stderr, "Couldn't parse chromosome name '%s' from read id\n", correctChromosomeName);
+ } else {
+ badParse = false;
+ }
+ }
+ }
+ }
+ if (badParse) {
+ fprintf(stderr,"Unable to parse read ID '%s', perhaps this isn't simulated data. contiglen = %d, contigName = '%s', contig offset = %u, genome offset = %u\n", idBuffer, strlen(contig->name), contig->name, contig->beginningOffset, genomeLocation); // NOTE(review): %d receives strlen()'s size_t result
+ exit(1);
+ }
+ bool match0 = false;
+ bool match1 = false;
+ if (-1 == offsetA || -1 == offsetB) {
+ matched = false;
+ } else if(strncmp(contig->name, idBuffer, __min(read->getIdLength(), chrNameLen))) { // nonzero result: contig name differs from the ID's leading chromosome name
+ matched = false;
+ } else {
+ if (isWithin(offsetA, genomeLocation - contig->beginningOffset, slackAmount)) {
+ matched = true;
+ match0 = true;
+ } else if (isWithin(offsetB, genomeLocation - contig->beginningOffset, slackAmount)) {
+ matched = true;
+ match1 = true;
+ } else {
+ matched = false;
+ if (flag & SAM_FIRST_SEGMENT) {
+ match0 = true;
+ } else {
+ match1 = true;
+ }
+ }
+ }
+ context->countOfReads[mapQ]++;
+ context->countOfReadsByEditDistance[mapQ][editDistance]++;
+ if (!matched) {
+ context->countOfMisalignments[mapQ]++;
+ context->countOfMisalignmentsByEditDistance[mapQ][editDistance]++;
+ if ((70 == mapQ && printErrorsAtMAPQ70) || printBetterErrors) {
+ //
+ // We don't know which offset is correct, because neither one matched. Just take the one with the lower edit distance.
+ //
+ unsigned correctLocationA = offsetOfCorrectChromosome + offsetA;
+ unsigned correctLocationB = offsetOfCorrectChromosome + offsetB;
+ unsigned correctLocation = 0;
+ const char *correctData = NULL;
+ const char *dataA = genome->getSubstring(correctLocationA, 1);
+ const char *dataB = genome->getSubstring(correctLocationB, 1);
+ int distanceA, distanceB;
+ char cigarA[cigarBufLen];
+ char cigarB[cigarBufLen];
+ cigarA[0] = '*'; cigarA[1] = '\0';
+ cigarB[0] = '*'; cigarB[1] = '\0';
+ if (dataA == NULL) {
+ distanceA = -1;
+ } else {
+ distanceA = lv.computeEditDistance(dataA, read->getDataLength() + 20, read->getData(), read->getDataLength(), 30, cigarA, cigarBufLen, false);
+ }
+ if (dataB == NULL) {
+ distanceB = -1;
+ } else {
+ distanceB = lv.computeEditDistance(dataB, read->getDataLength() + 20, read->getData(), read->getDataLength(), 30, cigarB, cigarBufLen, false);
+ }
+ const char *correctGenomeData;
+ char *cigarForCorrect;
+ if (distanceA != -1 && distanceA <= distanceB || distanceB == -1) { // && binds tighter than ||: (A usable and A <= B) or B unusable
+ correctGenomeData = dataA;
+ correctLocation = correctLocationA;
+ cigarForCorrect = cigarA;
+ } else {
+ correctGenomeData = dataB;
+ correctLocation = correctLocationB;
+ cigarForCorrect = cigarB;
+ }
+ bool betterEditDistance = ((distanceA > editDistance && distanceB > editDistance) || (-1 == distanceA && -1 == distanceB));
+ if (betterEditDistance) {
+ context->countOfMisalignetsWithBetterEditDistanceByEditDistance[mapQ][editDistance]++;
+ context->countOfMisalignetsWithBetterEditDistance[mapQ]++;
+ }
+ // if (!printBetterErrors || (printBetterErrors && betterEditDistance)) {
+ // printf("%s\t%d\t%s\t%u\t%d\t%s\t*\t*\t100\t%.*s\t%.*s\tAlignedGenomeLocation:%u\tCorrectGenomeLocation: %u\tCigarForCorrect: %s\tCorrectData: %.*s\tAlignedData: %.*s\n",
+ // idBuffer, flag, contig->name, genomeLocation - contig->beginningOffset, mapQ, cigarForAligned, read.getDataLength(), read.getData(),
+ // read.getDataLength(), read.getQuality(), genomeLocation, correctLocation, cigarForCorrect, read.getDataLength(),
+ // correctGenomeData, read.getDataLength(), alignedGenomeData);
+ //}
+ }
+ }
+ }
+ } // if it was mapped
+ } // for each read from the sam reader
+ if (0 == InterlockedAdd64AndReturnNewValue(&nRunningThreads, -1)) { // the thread that brings the count to zero wakes main()
+ SignalSingleWaiterObject(&allThreadsDone);
+ }
+//
+// Entry point for ComputeROC.
+//
+// argv[1] is the genome index directory, argv[2] the SAM/BAM input file; any further
+// arguments are option switches (see usage()).  Loads the genome, starts one worker
+// per processor (one in _DEBUG builds), waits for them all, then prints the overall
+// unaligned count followed by one line per MAPQ value.
+//
+// Returns 0 on success and -1 if the genome or the input file cannot be opened.
+//
+int main(int argc, char * argv[])
+{
+    BigAllocUseHugePages = false;
+    if (argc < 3) usage();
+    for (int i = 3; i < argc; i++) {
+        if (!strcmp(argv[i], "-b")) {
+            matchBothWays = true;
+        } else if (!strcmp(argv[i], "-c")) {
+            justCount = true;
+        } else if (!strcmp(argv[i], "-v")) {
+            venter = true;
+        } else if (!strcmp(argv[i], "-e")) {
+            printBetterErrors = true;
+        } else if (!strcmp(argv[i], "-70")) {
+            printErrorsAtMAPQ70 = true;
+        } else {
+            usage();
+        }
+    }
+    // Build "<genomeDirectory><PATH_SEP>Genome".  The length is computed once and
+    // used for both the allocation and the snprintf bound so the two cannot drift apart.
+    static const char *genomeSuffix = "Genome";
+    size_t filenameLen = strlen(argv[1]) + 1 + strlen(genomeSuffix) + 1;
+    char *fileName = new char[filenameLen];
+    snprintf(fileName,filenameLen,"%s%c%s",argv[1],PATH_SEP,genomeSuffix);
+    genome = Genome::loadFromFile(fileName, 0);
+    if (NULL == genome) {
+        fprintf(stderr,"Unable to load genome from file '%s'\n",fileName);
+        delete [] fileName;     // don't leak the path on the failure exit
+        return -1;
+    }
+    delete [] fileName;
+    fileName = NULL;
+    inputFileName = argv[2];
+    unsigned nThreads;
+#ifdef _DEBUG
+    nThreads = 1;
+#else // _DEBUG
+    nThreads = GetNumberOfProcessors();
+#endif // _DEBUG
+    DataSupplier::ThreadCount = nThreads;
+    nRunningThreads = nThreads;
+    ReaderContext readerContext;
+    readerContext.clipping = NoClipping;
+    readerContext.defaultReadGroup = "";
+    readerContext.genome = genome;
+    readerContext.ignoreSecondaryAlignments = true;
+    readerContext.ignoreSupplementaryAlignments = true;
+    readerContext.header = NULL;
+    readerContext.headerLength = 0;
+    readerContext.headerBytes = 0;
+    // Pick the reader from the file extension: ".bam" (case-insensitive) is BAM, anything else is SAM.
+    const char *extension = strrchr(inputFileName, '.');
+    if (NULL != extension && !_stricmp(extension, ".bam")) {
+        readSupplierGenerator = BAMReader::createReadSupplierGenerator(inputFileName, nThreads, readerContext);
+    } else {
+        readSupplierGenerator = SAMReader::createReadSupplierGenerator(inputFileName, nThreads, readerContext);
+    }
+    // The workers dereference readSupplierGenerator unconditionally, so fail here with a message instead.
+    if (NULL == readSupplierGenerator) {
+        fprintf(stderr,"Unable to open file '%s' to get reads\n", inputFileName);
+        return -1;
+    }
+    CreateSingleWaiterObject(&allThreadsDone);
+    ThreadContext *contexts = new ThreadContext[nThreads];
+    for (unsigned i = 0; i < nThreads; i++) {
+        contexts[i].whichThread = i;
+        StartNewThread(WorkerThreadMain, &contexts[i]);
+    }
+    WaitForSingleWaiterObject(&allThreadsDone);
+    _int64 nUnaligned = 0;
+    _int64 totalReads = 0;
+    for (unsigned i = 0; i < nThreads; i++) {
+        nUnaligned += contexts[i].nUnaligned;
+        totalReads += contexts[i].totalReads;
+    }
+    // An empty input would otherwise divide 0 by 0 and print NaN.
+    double percentUnaligned = (0 == totalReads) ? 0.0 : 100. * (double)nUnaligned / (double)totalReads;
+    printf("%lld reads, %lld unaligned (%0.2f%%)\n", (long long)totalReads, (long long)nUnaligned, percentUnaligned);
+    printf("MAPQ\tnReads\tnMisaligned");
+    if (printBetterErrors) {
+        printf("\tBetterMisaligned");
+    }
+    printf("\n");
+    for (int i = 0; i <= MaxMAPQ; i++) {
+        _int64 nReads = 0;
+        _int64 nMisaligned = 0;
+        _int64 betterMisaligned = 0;
+        for (unsigned j = 0; j < nThreads; j++) {
+            nReads += contexts[j].countOfReads[i];
+            nMisaligned += contexts[j].countOfMisalignments[i];
+            betterMisaligned += contexts[j].countOfMisalignetsWithBetterEditDistance[i];
+        }
+        printf("%d\t%lld\t%lld", i, (long long)nReads, (long long)nMisaligned);
+        if (printBetterErrors) {
+            printf("\t%lld", (long long)betterMisaligned);
+        }
+        printf("\n");
+    }
+    int maxEditDistanceSeen = 0;
+    for (unsigned i = 0; i < nThreads; i++) {
+    }
+    return 0;
+}
diff --git a/apps/ComputeROC/ComputeROC.vcxproj b/apps/ComputeROC/ComputeROC.vcxproj
new file mode 100644
index 0000000..c439a7b
--- /dev/null
+++ b/apps/ComputeROC/ComputeROC.vcxproj
@@ -0,0 +1,173 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{EB694CE8-E805-41A0-9D08-C8BEED857166}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>ComputeROC</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ComputeROC\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ComputeROC\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="targetver.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="ComputeROC.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/ComputeROC/ComputeROC.vcxproj.filters b/apps/ComputeROC/ComputeROC.vcxproj.filters
new file mode 100644
index 0000000..db4af98
--- /dev/null
+++ b/apps/ComputeROC/ComputeROC.vcxproj.filters
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ComputeROC.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/ComputeROC/stdafx.cpp b/apps/ComputeROC/stdafx.cpp
new file mode 100644
index 0000000..ef4f015
--- /dev/null
+++ b/apps/ComputeROC/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// snap.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/ComputeROC/stdafx.h b/apps/ComputeROC/stdafx.h
new file mode 100644
index 0000000..7c82d2b
--- /dev/null
+++ b/apps/ComputeROC/stdafx.h
@@ -0,0 +1,5 @@
+#ifdef _MSC_VER
+#include "..\..\SNAPLib\stdafx.h"
+#include "../../SNAPLib/stdafx.h"
diff --git a/apps/ComputeROC/targetver.h b/apps/ComputeROC/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/apps/ComputeROC/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/apps/DistanceHist/DistanceHist.cpp b/apps/DistanceHist/DistanceHist.cpp
new file mode 100644
index 0000000..b578b83
--- /dev/null
+++ b/apps/DistanceHist/DistanceHist.cpp
@@ -0,0 +1,234 @@
+Module Name:
+ DistanceHist.cpp
+ Compute a histogram of the edit distances between simulated reads and their correct
+ alignments.
+ Bill Bolosky, May 2013
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "Compat.h"
+#include "Genome.h"
+#include "exit.h"
+#include "SAM.h"
+#include "FASTQ.h"
+#include "WGsim.h"
+#include "LandauVishkin.h"
+#include "Tables.h"
+const Genome *genome = NULL;
+//
+// Fixed-size histogram of edit distances.  counts[] has MaxDistance+2 slots; main()
+// prints slots 0..MaxDistance by index and the final slot under the label "More".
+//
+struct DistHistogram {
+    static const unsigned MaxDistance = 100;
+    unsigned counts[MaxDistance+2];
+    // Start with every slot at zero.
+    DistHistogram() {
+        unsigned slot = 0;
+        while (slot < MaxDistance+2) {
+            counts[slot] = 0;
+            slot++;
+        }
+    }
+    // Add each of peer's slots into the matching slot of this histogram.
+    void addIn(const DistHistogram &peer) {
+        unsigned *mine = counts;
+        const unsigned *theirs = peer.counts;
+        for (unsigned remaining = MaxDistance+2; remaining != 0; remaining--) {
+            *mine++ += *theirs++;
+        }
+    }
+};
+ReadSupplierGenerator *readSupplierGenerator = NULL;
+volatile int nRunningThreads;
+SingleWaiterObject threadsDone;
+ fprintf(stderr,"usage: DistanceHist index inputFile\n");
+ soft_exit(1);
+//
+// Worker thread body.  context points at this thread's DistHistogram slot.
+//
+// For each read whose quality characters are all >= '?', computes the edit distance
+// against the genome at the two offsets taken from the wgsim read name, for both the
+// read and its reverse complement, and tallies the smallest distance found.  Reads
+// whose best alignment has an insertion or deletion in its CIGAR are skipped.
+//
+void workerThreadMain(void *context)
+{
+    DistHistogram histogram; // Don't use the context one until the end to avoid false sharing
+    LandauVishkinWithCigar lv;
+    ReadSupplier *readSupplier = readSupplierGenerator->generateNewReadSupplier();
+    const unsigned maxReadLen = MAX_READ_LENGTH;
+    char *rcBuffer = new char[maxReadLen];
+    Read *read;
+    while (NULL != (read = readSupplier->getNextRead())) {
+        unsigned readLen = read->getDataLength();
+        unsigned highOffset, lowOffset;
+        const char *readData = read->getData();
+        const char *quality = read->getQuality();
+        bool lowQual = false;
+        for (unsigned i = 0 ; i < readLen; i++) {
+            if (quality[i] < '?') {
+                lowQual = true;
+                break;
+            }
+        }
+        if (lowQual) {
+            continue;
+        }
+        char cigar[4][100];
+        const char *genomeData[4];
+        int edit[4];
+        //
+        // We don't care if it's misaligned (or in fact if it's aligned at all). Just get the
+        // offsets from the wgsim name.
+        //
+        wgsimReadMisaligned(read, 0, genome, 0, &lowOffset, &highOffset);
+        unsigned bestAt = 0;
+        cigar[0][0] = '\0';
+        int bestDistance = read->getDataLength();
+        // Candidates 0/1: the read as given, at the low and high offsets.  Each genome
+        // substring is fetched once and reused for the distance computation.
+        genomeData[0] = genome->getSubstring(lowOffset, readLen + 20);
+        int dist = edit[0] = lv.computeEditDistance(genomeData[0], readLen + 20, readData, readLen, MAX_K - 1, cigar[0], 100, false);
+        if (dist >= 0) {
+            bestDistance = dist;
+        }
+        genomeData[1] = genome->getSubstring(highOffset, readLen + 20);
+        edit[1] = dist = lv.computeEditDistance(genomeData[1], readLen + 20, readData, readLen, MAX_K - 1, cigar[1], 100, false);
+        if (dist >= 0 && dist < bestDistance) {
+            bestDistance = dist;
+            bestAt = 1;
+        }
+        // Candidates 2/3: the reverse complement of the read at the same two offsets.
+        for (unsigned i = 0; i < readLen; i++) {
+            rcBuffer[readLen - i - 1] = COMPLEMENT[readData[i]];
+        }
+        genomeData[2] = genomeData[0];
+        edit[2] = dist = lv.computeEditDistance(genomeData[2], readLen + 20, rcBuffer, readLen, MAX_K - 1, cigar[2], 100, false);
+        if (dist >= 0 && dist < bestDistance) {
+            bestDistance = dist;
+            bestAt = 2;
+        }
+        genomeData[3] = genomeData[1];
+        edit[3] = dist = lv.computeEditDistance(genomeData[3], readLen + 20, rcBuffer, readLen, MAX_K - 1, cigar[3], 100, false);
+        if (dist >= 0 && dist < bestDistance) {
+            bestDistance = dist;
+            bestAt = 3;
+        }
+        bool containsIndels = false;
+        for (size_t i = 0; i < strlen(cigar[bestAt]); i++) {
+            if (cigar[bestAt][i] == 'I' || cigar[bestAt][i] == 'D') {
+                containsIndels = true;
+                break;
+            }
+        }
+        if (containsIndels) {
+            continue;
+        }
+        if (bestDistance < 0 || bestDistance > DistHistogram::MaxDistance) {
+            // Out-of-range distances belong in the final slot, which main() reports as
+            // "More"; putting them in counts[MaxDistance] mixed them with genuine
+            // distance-MaxDistance reads and left the "More" slot permanently empty.
+            histogram.counts[DistHistogram::MaxDistance + 1]++;
+        } else {
+            histogram.counts[bestDistance]++;
+        }
+    }
+    delete [] rcBuffer; // Allocated with new[], so it must be released with delete[].
+    ((DistHistogram *)context)->addIn(histogram);
+    if (0 == InterlockedDecrementAndReturnNewValue(&nRunningThreads)) {
+        SignalSingleWaiterObject(&threadsDone);
+    }
+}
+//
+// Entry point for DistanceHist.
+//
+// argv[1] is the index directory, argv[2] the read file (SAM if its extension is
+// ".sam", otherwise FASTQ).  Loads the genome, runs one worker per processor (one in
+// _DEBUG builds), merges the per-thread histograms and prints one line per edit
+// distance, a "More" line if the overflow slot is non-zero, and a throughput summary.
+//
+// NOTE(review): the void return type is kept as in the original; standard C++ requires int.
+//
+void main(int argc, char * argv[])
+{
+    if (3 != argc) usage();
+    BigAllocUseHugePages = false;
+    const char *genomeFileName = "Genome";
+    char *pathname = new char[strlen(argv[1]) + 1 /* for directory separator */ + strlen(genomeFileName) + 1 /* for null */];
+    sprintf(pathname, "%s%c%s", argv[1], PATH_SEP, genomeFileName);
+    _int64 start = timeInMillis();
+    printf("Loading genome...");
+    genome = Genome::loadFromFile(pathname, 0);
+    if (NULL == genome) {
+        fprintf(stderr,"Unable to load genome from file '%s'\n",pathname);
+        soft_exit(1);
+    }
+    delete [] pathname;     // only needed for loading and for the error message above
+    pathname = NULL;
+    printf("%llds.\n", (timeInMillis() + 500 - start) / 1000);
+    unsigned threadCount = GetNumberOfProcessors();
+#ifdef _DEBUG
+    threadCount = 1; // BJB
+#endif // _DEBUG
+    // Use the LAST dot so that names such as "./reads.sam" or "sample.v2.sam" are still
+    // recognized; strchr would stop at the first dot and compare the wrong suffix.
+    const char *lastDot = strrchr(argv[2], '.');
+    if (NULL != lastDot && !_stricmp(lastDot,".sam")) {
+        readSupplierGenerator = SAMReader::createReadSupplierGenerator(argv[2], threadCount, genome);
+    } else {
+        readSupplierGenerator = FASTQReader::createReadSupplierGenerator(argv[2], threadCount);
+    }
+    if (NULL == readSupplierGenerator) {
+        fprintf(stderr,"Unable to open file '%s' to get reads\n", argv[2]);
+        soft_exit(1);
+    }
+    nRunningThreads = threadCount;
+    DistHistogram *histograms = new DistHistogram[threadCount];
+    CreateSingleWaiterObject(&threadsDone);
+    for (unsigned i = 0; i < threadCount; i++) {
+        StartNewThread(workerThreadMain, &histograms[i]);
+    }
+    WaitForSingleWaiterObject(&threadsDone);
+    // Fold every other thread's histogram into histograms[0].
+    for (unsigned i = 1; i < threadCount; i++) {
+        histograms[0].addIn(histograms[i]);
+    }
+    unsigned totalReads = 0;
+    for (unsigned i = 0; i < DistHistogram::MaxDistance+1; i++) {
+        printf("%u\t%u\n",i,histograms[0].counts[i]);   // both values are unsigned
+        totalReads += histograms[0].counts[i];
+    }
+    if (histograms[0].counts[DistHistogram::MaxDistance+1] != 0) {
+        printf("More\t%u\n", histograms[0].counts[DistHistogram::MaxDistance+1]);
+        totalReads += histograms[0].counts[DistHistogram::MaxDistance+1];
+    }
+    _int64 stop = timeInMillis();
+    // A run that finishes within the same millisecond would otherwise divide by zero.
+    _int64 elapsedMillis = stop - start;
+    if (elapsedMillis <= 0) {
+        elapsedMillis = 1;
+    }
+    printf("\nProcessed %u reads in %llds, %lld reads/s\n", totalReads, (stop + 500 - start) / 1000, ((_int64) totalReads) * 1000 / elapsedMillis);
+}
diff --git a/apps/DistanceHist/DistanceHist.vcxproj b/apps/DistanceHist/DistanceHist.vcxproj
new file mode 100644
index 0000000..830ef91
--- /dev/null
+++ b/apps/DistanceHist/DistanceHist.vcxproj
@@ -0,0 +1,173 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{029221BC-7BC0-448C-9A68-12D94A25F412}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>DistanceHist</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="targetver.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="DistanceHist.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/DistanceHist/DistanceHist.vcxproj.filters b/apps/DistanceHist/DistanceHist.vcxproj.filters
new file mode 100644
index 0000000..6dd986b
--- /dev/null
+++ b/apps/DistanceHist/DistanceHist.vcxproj.filters
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="DistanceHist.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/DistanceHist/stdafx.cpp b/apps/DistanceHist/stdafx.cpp
new file mode 100644
index 0000000..8d96bae
--- /dev/null
+++ b/apps/DistanceHist/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// DistanceHist.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/DistanceHist/stdafx.h b/apps/DistanceHist/stdafx.h
new file mode 100644
index 0000000..45028a6
--- /dev/null
+++ b/apps/DistanceHist/stdafx.h
@@ -0,0 +1,50 @@
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+#pragma once
+#ifdef _MSC_VER
+#include "targetver.h"
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <ctype.h>
+#include <errno.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _MSC_VER
+#include <tchar.h>
+#include <crtdbg.h>
+#include <windows.h>
+#include <direct.h>
+#include <wincrypt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+// MAP_ANONYMOUS is called MAP_ANON on OS X
diff --git a/apps/DistanceHist/targetver.h b/apps/DistanceHist/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/apps/DistanceHist/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/apps/ExtractReads/ExtractReads.cpp b/apps/ExtractReads/ExtractReads.cpp
new file mode 100644
index 0000000..f432509
--- /dev/null
+++ b/apps/ExtractReads/ExtractReads.cpp
@@ -0,0 +1,109 @@
+Module Name:
+ ExtractReads.cpp
+ Take a BAM file and extract the reads aligned to a given chromosome into a SAM or BAM file.
+ Bill Bolosky, December, 2012
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "SAM.h"
+#include "BAM.h"
+#include "Genome.h"
+#include "Compat.h"
+#include "Read.h"
+#include "RangeSplitter.h"
+#include "BigAlloc.h"
+// Emit the command-line synopsis on stderr and terminate the process with exit status 1.
+void usage()
+{
+    static const char synopsis[] = "usage: ExtractReads genomeDirectory input.bam output.sam chromosome\n";
+    fputs(synopsis, stderr);
+    exit(1);
+}
+// Entry point: load the genome, read the input BAM, and write the reads that fall on the
+// requested contig to the output file.
+//   argv[1]  genome directory; the genome is loaded from <directory><PATH_SEP>Genome
+//   argv[2]  input BAM file
+//   argv[3]  output file; the output format is picked with FileFormat::isFormatOf() on this name
+//   argv[4]  name of the contig (chromosome) whose reads are emitted
+// Returns 0 on success, -1 if the genome cannot be loaded, the input cannot be opened, or the
+// output format cannot be determined.  Calls usage() (which exits) on a wrong argument count.
+int main(int argc, char * argv[])
+{
+    BigAllocUseHugePages = false;
+
+    if (argc != 5) usage();
+
+    static const char *genomeSuffix = "Genome";
+    // directory + path separator + suffix + terminating NUL
+    size_t filenameLen = strlen(argv[1]) + 1 + strlen(genomeSuffix) + 1;
+    char *fileName = new char[filenameLen];
+    snprintf(fileName,filenameLen,"%s%c%s",argv[1],PATH_SEP,genomeSuffix);
+    const Genome *genome = Genome::loadFromFile(fileName, 0);
+    if (NULL == genome) {
+        fprintf(stderr,"Unable to load genome from file '%s'\n",fileName);
+        delete [] fileName;
+        return -1;
+    }
+    delete [] fileName;
+    fileName = NULL;
+
+    ReaderContext readerContext;
+    readerContext.header = NULL;
+    readerContext.genome = genome;
+    readerContext.clipping = NoClipping;
+    readerContext.paired = false;
+    readerContext.ignoreSecondaryAlignments = true;
+    readerContext.ignoreSupplementaryAlignments = true;
+    readerContext.defaultReadGroup = "";
+
+    // The input BAM is argv[2].  (fileName held the genome path and has already been freed,
+    // so it must not be used here.)
+    ReadSupplierGenerator *readSupplierGenerator = BAMReader::createReadSupplierGenerator(argv[2],1, readerContext);
+    if (NULL == readSupplierGenerator) {
+        fprintf(stderr,"Unable to open file '%s' to get reads\n", argv[2]);
+        return -1;
+    }
+    ReadSupplier *readSupplier = readSupplierGenerator->generateNewReadSupplier();
+
+    AlignerOptions options("");
+    // The output file is argv[3]; argv[4] is the contig name used for filtering below.
+    options.outputFileTemplate = argv[3];
+    options.sortOutput = false;
+
+    const FileFormat* format =
+        FileFormat::SAM[0]->isFormatOf(argv[3]) ? FileFormat::SAM[false] :
+        FileFormat::BAM[0]->isFormatOf(argv[3]) ? FileFormat::BAM[false] :
+        NULL;
+    if (NULL == format) {
+        fprintf(stderr,"Can't determine format for output file (does it end in .sam or .bam?)\n");
+        return -1;  // this is a failure, so do not report success to the caller
+    }
+
+    ReadWriterSupplier *writerSupplier = format->getWriterSupplier(&options, readerContext.genome);
+    ReadWriter* writer = writerSupplier->getWriter();
+    writer->writeHeader(readerContext, options.sortOutput, argc, (const char **)argv, "", "");
+
+    Read *read;
+    AlignmentResult alignmentResult;
+    unsigned genomeLocation;
+    bool isRC;
+    unsigned mapQ;
+    _int64 totalReads = 0;
+    _int64 emittedReads = 0;
+
+    // NOTE(review): as written, the value returned by getNextRead() is discarded and read,
+    // alignmentResult, genomeLocation, isRC and mapQ are never assigned before they are used
+    // below.  Confirm how the per-read alignment fields are meant to be obtained from the
+    // reader before relying on this loop.
+    while (readSupplier->getNextRead()) {
+        totalReads++;
+        const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation);
+        if (NULL != contig && !strcmp(contig->name, argv[4])) {
+            emittedReads++;
+            writer->writeRead(read, alignmentResult, mapQ, genomeLocation, isRC ? RC : FORWARD);
+        }
+    }
+
+    writer->close();
+    printf("Processed %lld reads, of which %lld were emitted\n", totalReads, emittedReads);
+    return 0;
+}
diff --git a/apps/ExtractReads/ExtractReads.vcxproj b/apps/ExtractReads/ExtractReads.vcxproj
new file mode 100644
index 0000000..7c719af
--- /dev/null
+++ b/apps/ExtractReads/ExtractReads.vcxproj
@@ -0,0 +1,171 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{FD52DC05-194B-4BD8-828E-A5679777D6C8}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>ExtractReads</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ComputeROC\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ComputeROC\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ComputeROC\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ComputeROC\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+   <!-- Debug|Win32: unoptimized console build that links snaplib.lib and zlibstat.lib. -->
+   <ClCompile>
+     <PrecompiledHeader>
+     </PrecompiledHeader>
+     <WarningLevel>Level3</WarningLevel>
+     <Optimization>Disabled</Optimization>
+     <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+     <!-- ExtractReads.cpp includes headers from SNAPLib (SAM.h, Genome.h, ...), as the other configurations already allow for. -->
+     <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+   </ClCompile>
+   <Link>
+     <SubSystem>Console</SubSystem>
+     <GenerateDebugInformation>true</GenerateDebugInformation>
+     <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+     <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+   </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+   <!-- Debug|x64: unoptimized console build that links snaplib.lib and zlibstat.lib. -->
+   <ClCompile>
+     <PrecompiledHeader>
+     </PrecompiledHeader>
+     <WarningLevel>Level3</WarningLevel>
+     <Optimization>Disabled</Optimization>
+     <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+     <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+   </ClCompile>
+   <Link>
+     <SubSystem>Console</SubSystem>
+     <GenerateDebugInformation>true</GenerateDebugInformation>
+     <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+     <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+   </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+   <!-- Release|Win32: optimized console build that links snaplib.lib and zlibstat.lib. -->
+   <ClCompile>
+     <WarningLevel>Level3</WarningLevel>
+     <PrecompiledHeader>
+     </PrecompiledHeader>
+     <Optimization>MaxSpeed</Optimization>
+     <FunctionLevelLinking>true</FunctionLevelLinking>
+     <IntrinsicFunctions>true</IntrinsicFunctions>
+     <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+     <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+   </ClCompile>
+   <Link>
+     <SubSystem>Console</SubSystem>
+     <GenerateDebugInformation>true</GenerateDebugInformation>
+     <EnableCOMDATFolding>true</EnableCOMDATFolding>
+     <OptimizeReferences>true</OptimizeReferences>
+     <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+     <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+   </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+   <!-- Release|x64: optimized console build that links snaplib.lib and zlibstat.lib. -->
+   <ClCompile>
+     <WarningLevel>Level3</WarningLevel>
+     <PrecompiledHeader>
+     </PrecompiledHeader>
+     <Optimization>MaxSpeed</Optimization>
+     <FunctionLevelLinking>true</FunctionLevelLinking>
+     <IntrinsicFunctions>true</IntrinsicFunctions>
+     <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+     <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+   </ClCompile>
+   <Link>
+     <SubSystem>Console</SubSystem>
+     <GenerateDebugInformation>true</GenerateDebugInformation>
+     <EnableCOMDATFolding>true</EnableCOMDATFolding>
+     <OptimizeReferences>true</OptimizeReferences>
+     <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+     <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+   </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="targetver.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="ExtractReads.cpp" />
+ <ClCompile Include="stdafx.cpp" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/ExtractReads/ExtractReads.vcxproj.filters b/apps/ExtractReads/ExtractReads.vcxproj.filters
new file mode 100644
index 0000000..719f5d6
--- /dev/null
+++ b/apps/ExtractReads/ExtractReads.vcxproj.filters
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ExtractReads.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/ExtractReads/stdafx.cpp b/apps/ExtractReads/stdafx.cpp
new file mode 100644
index 0000000..f4641da
--- /dev/null
+++ b/apps/ExtractReads/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// ExtractReads.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/ExtractReads/stdafx.h b/apps/ExtractReads/stdafx.h
new file mode 100644
index 0000000..7c82d2b
--- /dev/null
+++ b/apps/ExtractReads/stdafx.h
@@ -0,0 +1,5 @@
+#ifdef _MSC_VER
+#include "..\..\SNAPLib\stdafx.h"
+#include "../../SNAPLib/stdafx.h"
diff --git a/apps/ExtractReads/targetver.h b/apps/ExtractReads/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/apps/ExtractReads/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/apps/RandomizePIfastq/GoodRandom.cpp b/apps/RandomizePIfastq/GoodRandom.cpp
new file mode 100644
index 0000000..e251b06
--- /dev/null
+++ b/apps/RandomizePIfastq/GoodRandom.cpp
@@ -0,0 +1,159 @@
+Module Name:
+ GoodRandom.cpp
+ Cryptographically secure random number generator that's fast because it batches getting its random
+ source from CryptGenRandom(). It's careful not to bias results when the requested range isn't a power
+ of two.
+ In Linux, uses the Mersenne Twister to get the raw numbers and mostly just applies the anti-biasing
+ related to random number ranges that aren't powers of two.
+ Bill Bolosky, April, 2011
+#include "stdafx.h"
+#include "GoodRandom.h"
+//#include "mt64.h"
+// Returns the smallest number of bytes (0 through 8) whose unsigned range can represent
+// maxValue: 0 for maxValue == 0, 1 for values below 0x100, 2 below 0x10000, and so on up to 8.
+unsigned MinBytesToStore(_uint64 maxValue)
+{
+    unsigned bytesNeeded = 0;
+    // Strip one byte per iteration; the number of iterations until nothing is left is the
+    // number of bytes the value occupies.
+    for (_uint64 remaining = maxValue; 0 != remaining; remaining >>= 8) {
+        bytesNeeded++;
+    }
+    return bytesNeeded;
+}
+#ifdef _MSC_VER
+const unsigned GFRandomBufferSize = 10 * 1024 * 1024;
+struct GFRandomState {
+ char *buffer;
+ unsigned bufferUsed;
+__declspec(thread) GFRandomState *g_randomState = NULL;
+__declspec(thread) HCRYPTPROV g_hRandomCryptProv;
+_uint64 GoodFastRandom(_uint64 maxValue)
+ if (0 == maxValue) {
+ return 0;
+ }
+#ifdef _MSC_VER
+ if (maxValue > 0xffffffffffffff && maxValue != 0xffffffffffffffff) {
+ fprintf(stderr,"GoodFastRandom: writeme\n");
+ exit(1);
+ // This case requires different math in the roundoff error check below, because as written
+ // it would use 1 << 64, which isn't representable. I'm too lazy to write it now.
+ }
+ if (NULL == g_randomState) {
+ g_randomState= new GFRandomState;
+ g_randomState->buffer = new char[GFRandomBufferSize];
+ g_randomState->bufferUsed = GFRandomBufferSize; // Forces us to get new random data the first time through.
+ BOOL worked = CryptAcquireContext(
+ &g_hRandomCryptProv,
+ "BillKeyContainer",
+ NULL, // default provider
+ if (!worked && NTE_BAD_KEYSET == GetLastError()) {
+ worked = CryptAcquireContext(
+ &g_hRandomCryptProv,
+ "BillKeyContainer",
+ NULL, // default provider
+ }
+ if (!worked) {
+ fprintf(stderr,"Unable to get crypt provider, %d\n",GetLastError());
+ exit(1);
+ }
+ }
+#endif // _MSC_VER
+ unsigned bytesToGet = MinBytesToStore(maxValue);
+ for (;;) {
+ _uint64 rawValue = 0;
+#ifdef _MSC_VER
+ if (g_randomState->bufferUsed + bytesToGet > GFRandomBufferSize) {
+ if (!CryptGenRandom(g_hRandomCryptProv,GFRandomBufferSize,(PBYTE)g_randomState->buffer)) {
+ fprintf(stderr,"CryptGenRandom failed, %d\b\n",GetLastError());
+ exit(1);
+ }
+ g_randomState->bufferUsed = 0;
+ }
+ memcpy(&rawValue,g_randomState->buffer + g_randomState->bufferUsed,bytesToGet);
+ g_randomState->bufferUsed += bytesToGet;
+#else // _MSC_VER
+ #ifdef RELEASE
+ #if RAND_MAX % 0x100 != 0xff
+ fprintf(stderr,"Jesse was too lazy to correct random bias on your platform.\n");
+ exit(1);
+ #endif
+ for (unsigned b = 0; b < bytesToGet; ++b) {
+ // RAND_MAX could in theory be less than 0xffff.
+ _uint64 randomByte = rand() & 0xff;
+ rawValue |= randomByte << (b * 8);
+ }
+ #else
+ rawValue = genrand64_int64() >> (8 - bytesToGet) * 8;
+ #endif // RELEASE
+#endif // _MSC_VER
+ //
+ // Be careful here not to bias the result. If maxValue + 1 doesn't go evenly into 256^bytesToGet
+ // then there would be a bias toward the lower values, since they get one extra representation.
+ // So, if the value is in the last part, throw it away and try again.
+ //
+ if (0xffffffffffffffff == maxValue) {
+ //
+ // Special case for full range, because it wouldn't work with the code below.
+ //
+ return rawValue;
+ }
+ _uint64 maxRawValuePlusOne = ((_uint64)1) << (bytesToGet * 8);
+ if (rawValue < maxRawValuePlusOne - maxRawValuePlusOne % (maxValue+1)) {
+ return rawValue % (maxValue+1);
+ }
+ }
diff --git a/apps/RandomizePIfastq/GoodRandom.h b/apps/RandomizePIfastq/GoodRandom.h
new file mode 100644
index 0000000..bb9ddc0
--- /dev/null
+++ b/apps/RandomizePIfastq/GoodRandom.h
@@ -0,0 +1,33 @@
+Module Name:
+ GoodRandom.h
+ Cryptographically secure random number generator that's fast because it batches getting its random
+ source from CryptGenRandom(). It's careful not to bias results when the requested range isn't a power
+ of two.
+ Bill Bolosky, April, 2011
+ Not thread safe.
+#pragma once
+#include "Compat.h"
+unsigned MinBytesToStore(_uint64 maxValue);
+// This returns a cryptographically secure random number from {0..maxValue} (NOT {0..maxValue-1}!)
+// It's computationally efficient, too.
+_uint64 GoodFastRandom(_uint64 maxValue);
diff --git a/apps/RandomizePIfastq/RandomizePIfastq.cpp b/apps/RandomizePIfastq/RandomizePIfastq.cpp
new file mode 100644
index 0000000..fa291a9
--- /dev/null
+++ b/apps/RandomizePIfastq/RandomizePIfastq.cpp
@@ -0,0 +1,177 @@
+Module Name:
+ RandomizePIfastq.cpp
+ Cheesy little app that takes a paired, interleaved FASTQ file and more-or-less randomizes the order of its read pairs.
+ Bill Bolosky, September, 2013
+Revision History:
+#include "stdafx.h"
+#include "GoodRandom.h"
+class IntermediateFile {
+ IntermediateFile(const char *fileName, size_t i_bufferSize);
+ ~IntermediateFile();
+ void addLines(char *lines, size_t size);
+ void close();
+ void flush();
+ HANDLE hFile;
+ size_t bufferSize;
+ size_t bufferUsed;
+ char *buffer;
+IntermediateFile::IntermediateFile(const char *fileName, size_t i_bufferSize) : bufferSize(i_bufferSize)
+ if (INVALID_HANDLE_VALUE == hFile) {
+ fprintf(stderr,"Unable to open intermediate file '%s', %d\n", fileName, GetLastError());
+ return;
+ }
+ buffer = new char[bufferSize];
+ bufferUsed = 0;
+ close();
+ void
+IntermediateFile::addLines(char *lines, size_t size)
+ if (size + bufferUsed > bufferSize) {
+ flush();
+ }
+ memcpy(buffer + bufferUsed, lines, size);
+ bufferUsed += size;
+ void
+ flush();
+ CloseHandle(hFile);
+ hFile = NULL;
+ void
+ DWORD bytesWritten;
+ if (!WriteFile(hFile, buffer, bufferUsed, &bytesWritten, NULL)) {
+ fprintf(stderr,"WriteFile failed, %d\n", GetLastError());
+ return;
+ }
+ bufferUsed = 0;
+ void
+ fprintf(stderr,"usage: RandomizePIfastq inputFile outputFile\n");
+const int inputBufferSize = 100 * 1024 * 1024;
+char *inputBuffer[2];
+ void
+main(int argc, const char **argv)
+ for (int i = 0; i < 2; i++) {
+ inputBuffer[i] = new char[inputBufferSize];
+ }
+ const int nFiles = 1000;
+ if (3 != argc) {
+ usage();
+ return;
+ }
+ if (INVALID_HANDLE_VALUE == hInputFile) {
+ fprintf(stderr,"Unable to open '%s' for input, error %d\n", argv[1], GetLastError());
+ return;
+ }
+ IntermediateFile **files = new IntermediateFile *[nFiles];
+ for (int i = 0; i < nFiles; i++) {
+ char fileName[100];
+ sprintf(fileName,"piFQ.piece.%04d", i);
+ files[i] = new IntermediateFile(fileName, 10 * 1024 * 1024);
+ }
+ const int nLinesPerItem = 8;
+ int whichInputBuffer = 0;
+ DWORD usedBytes = 0;
+ DWORD validBytes = 0;
+ bool done = false;
+ for (;;) {
+ DWORD leftoverBytes = validBytes - usedBytes; // i.e., how much was in the old buffer minus what we used.
+ memcpy(inputBuffer[whichInputBuffer], inputBuffer[1 - whichInputBuffer] + usedBytes, leftoverBytes);
+ DWORD bytesRead;
+ if (!ReadFile(hInputFile, inputBuffer[whichInputBuffer] + leftoverBytes, inputBufferSize - leftoverBytes, &bytesRead, NULL)) {
+ fprintf(stderr,"Read error on input file %d\n", GetLastError());
+ return;
+ }
+ if (0 == bytesRead) {
+ if (0 != leftoverBytes) {
+ fprintf(stderr,"Input file doesn't seem to have a multiple of %d lines\n", nLinesPerItem);
+ }
+ break;
+ }
+ validBytes = leftoverBytes + bytesRead; // i.e., what we copied from the last one, plus what's new
+ usedBytes = 0;
+ for (;;) {
+ char *nextItem = inputBuffer[whichInputBuffer] + usedBytes;
+ int nNewLines = 0;
+ while (nNewLines < nLinesPerItem && usedBytes < validBytes) {
+ if (inputBuffer[whichInputBuffer][usedBytes] == '\n') {
+ nNewLines++;
+ }
+ usedBytes++; // NB: deliberately eating the \n
+ }
+ if (nNewLines < nLinesPerItem) {
+ usedBytes = nextItem - inputBuffer[whichInputBuffer];
+ break;
+ }
+ _uint64 whichFile = GoodFastRandom(nFiles - 1);
+ files[whichFile]->addLines(nextItem, usedBytes - (nextItem - inputBuffer[whichInputBuffer]));
+ }
+ whichInputBuffer = 1 - whichInputBuffer;
+ }
+ for (int i = 0; i < nFiles; i++) {
+ delete files[i]; // This also flushes them
+ }
\ No newline at end of file
diff --git a/apps/RandomizePIfastq/RandomizePIfastq.vcxproj b/apps/RandomizePIfastq/RandomizePIfastq.vcxproj
new file mode 100644
index 0000000..3528ef9
--- /dev/null
+++ b/apps/RandomizePIfastq/RandomizePIfastq.vcxproj
@@ -0,0 +1,175 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{1C797460-0410-4DE3-AA1A-8ECA471BA8C4}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>RandomizePIfastq</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="GoodRandom.h" />
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="targetver.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="GoodRandom.cpp" />
+ <ClCompile Include="RandomizePIfastq.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/RandomizePIfastq/RandomizePIfastq.vcxproj.filters b/apps/RandomizePIfastq/RandomizePIfastq.vcxproj.filters
new file mode 100644
index 0000000..bf7eb0f
--- /dev/null
+++ b/apps/RandomizePIfastq/RandomizePIfastq.vcxproj.filters
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="GoodRandom.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="RandomizePIfastq.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GoodRandom.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/RandomizePIfastq/stdafx.cpp b/apps/RandomizePIfastq/stdafx.cpp
new file mode 100644
index 0000000..2e33b3b
--- /dev/null
+++ b/apps/RandomizePIfastq/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// RandomizePIfastq.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/RandomizePIfastq/stdafx.h b/apps/RandomizePIfastq/stdafx.h
new file mode 100644
index 0000000..7c82d2b
--- /dev/null
+++ b/apps/RandomizePIfastq/stdafx.h
@@ -0,0 +1,5 @@
+#ifdef _MSC_VER
+#include "..\..\SNAPLib\stdafx.h"
+#include "../../SNAPLib/stdafx.h"
diff --git a/apps/RandomizePIfastq/targetver.h b/apps/RandomizePIfastq/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/apps/RandomizePIfastq/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/apps/SNAPCommand/SNAPCommand.cpp b/apps/SNAPCommand/SNAPCommand.cpp
new file mode 100644
index 0000000..f3d9b7a
--- /dev/null
+++ b/apps/SNAPCommand/SNAPCommand.cpp
@@ -0,0 +1,105 @@
+Module Name:
+Send a command to SNAP running in daemon mode
+Matei Zaharia & Bill Bolosky, February, 2012
+User mode service.
+Revision History:
+Adapted from cSNAP, which was in turn adapted from the Scala prototype
+#include "stdafx.h"
+#include "Compat.h"
+#include "exit.h"
+#include "CommandProcessor.h"
+ fprintf(stderr, "usage: SNAPCommand {-p PipeName} <command to send to SNAP>\n");
+ fprintf(stderr, "Send command 'exit' to SNAP to have the server process exit.\n");
+ soft_exit_no_print(1);
+int main(int argc, const char **argv)
+ if (argc < 2) {
+ usage();
+ }
+ const char *pipeName;
+ int startingArg;
+ if (strcmp(argv[1], "-p") == 0) {
+ if (argc < 4) usage();
+ pipeName = argv[2];
+ startingArg = 3;
+ } else {
+ startingArg = 1;
+ }
+ NamedPipe *serverPipe = OpenNamedPipe(pipeName, false);
+ if (NULL == serverPipe) {
+ fprintf(stderr, "Unable to open pipe to server\n");
+ soft_exit(1);
+ }
+ //
+ // Send down the args.
+ //
+ char argcBuffer[100];
+ sprintf(argcBuffer, "%d", argc - startingArg + 1); // +1 is for the command name, argv[0]
+ if (!WriteToNamedPipe(serverPipe, argcBuffer)) {
+ fprintf(stderr, "Unable to send arg count to server\n");
+ soft_exit(1);
+ }
+ if (!WriteToNamedPipe(serverPipe, argv[0])) {
+ fprintf(stderr, "Error sending arg '%s' to server\n", argv[0]);
+ soft_exit(1);
+ }
+ for (int i = startingArg; i < argc; i++) {
+ if (!WriteToNamedPipe(serverPipe, argv[i])) {
+ fprintf(stderr, "Error sending arg '%s' to server\n", argv[i]);
+ soft_exit(1);
+ }
+ }
+ //
+ // Now process the results from SNAP, printing them out and waiting for the terminator.
+ //
+ const size_t outputBufferSize = 100000;
+ char outputBuffer[outputBufferSize];
+ while (ReadFromNamedPipe(serverPipe, outputBuffer, outputBufferSize)) {
+ if (strcmp(outputBuffer, CommandExecutedString) == 0) {
+ soft_exit_no_print(0);
+ }
+ printf("%s", outputBuffer);
+ fflush(stdout);
+ }
+ fprintf(stderr, "Error reading from server pipe\n");
+ soft_exit(1);
+ return 1;
diff --git a/apps/SNAPCommand/SNAPCommand.vcxproj b/apps/SNAPCommand/SNAPCommand.vcxproj
new file mode 100644
index 0000000..d1a07d8
--- /dev/null
+++ b/apps/SNAPCommand/SNAPCommand.vcxproj
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{F555A574-597E-4C0E-ADFD-FC4C897B2085}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>SNAPCommand</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\SNAPCommand\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\SNAPCommand\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="SNAPCommand.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/SNAPCommand/SNAPCommand.vcxproj.filters b/apps/SNAPCommand/SNAPCommand.vcxproj.filters
new file mode 100644
index 0000000..c85781f
--- /dev/null
+++ b/apps/SNAPCommand/SNAPCommand.vcxproj.filters
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="SNAPCommand.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/SNAPCommand/stdafx.cpp b/apps/SNAPCommand/stdafx.cpp
new file mode 100644
index 0000000..e9217b3
--- /dev/null
+++ b/apps/SNAPCommand/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// SNAPCommand.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/SNAPCommand/stdafx.h b/apps/SNAPCommand/stdafx.h
new file mode 100644
index 0000000..45028a6
--- /dev/null
+++ b/apps/SNAPCommand/stdafx.h
@@ -0,0 +1,50 @@
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+#pragma once
+#ifdef _MSC_VER
+#include "targetver.h"
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <ctype.h>
+#include <errno.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _MSC_VER
+#include <tchar.h>
+#include <crtdbg.h>
+#include <windows.h>
+#include <direct.h>
+#include <wincrypt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+// MAP_ANONYMOUS is called MAP_ANON on OS X
diff --git a/apps/ToFASTQ/ToFASTQ.cpp b/apps/ToFASTQ/ToFASTQ.cpp
new file mode 100644
index 0000000..2f79c52
--- /dev/null
+++ b/apps/ToFASTQ/ToFASTQ.cpp
@@ -0,0 +1,223 @@
+Module Name:
+ ToFASTQ.cpp
+ Take a set of reads in SAM or BAM format and convert them to FASTQ
+ Bill Bolosky, January, 2014
+ User mode service.
+Revision History:
+#include "stdafx.h"
+#include "SAM.h"
+#include "Bam.h"
+#include "Genome.h"
+#include "Compat.h"
+#include "Read.h"
+#include "RangeSplitter.h"
+#include "BigAlloc.h"
+#include "FASTQ.h"
+void usage() // Print the command-line help for ToFASTQ to stderr, then call soft_exit(1).
+ fprintf(stderr,"usage: ToFASTQ genomeIndex inputFile outputFile {outputFile2}\n");
+ fprintf(stderr," Specifying two output files means that the input is paired. If you specify only one output file, then\n");
+ fprintf(stderr," ToFASTQ will generate a single-ended FASTQ even for a paired input.\n");
+ fprintf(stderr," The genomeIndex must contain the same set of contigs used to align the input file.\n");
+ fprintf(stderr," To produce interleaved paired-end FASTQ, specify outputFile2 as '-i'.\n");
+ soft_exit(1); // exit status 1: the arguments were not usable
+ReadSupplierGenerator *readSupplierGenerator = NULL; // set by main() on the single-output path; each worker thread asks it for its own ReadSupplier
+volatile _int64 nRunningThreads; // set to nThreads in main(); each worker decrements it when it runs out of reads
+SingleWaiterObject allThreadsDone; // signalled by the worker that brings nRunningThreads to 0
+const char *inputFileName; // argv[2]
+const Genome *genome; // loaded in main() from "<genomeIndex><PATH_SEP>Genome"
+FASTQWriter *fastqWriter[2] = {NULL, NULL}; // [0] is opened on argv[3]; [1] is opened on argv[4], or aliases [0] when argv[4] is "-i"
+struct ThreadContext { // Per-worker state handed to WorkerThreadMain through its void* parameter.
+ unsigned whichThread; // index of this worker, assigned in main()
+ _int64 totalReads; // number of reads this worker has written; summed across workers in main()
+ ThreadContext() {
+ totalReads = 0; // whichThread is left for main() to fill in
+ }
+bool inline isADigit(char x) { // True when x is one of the characters '0' through '9'.
+ return x >= '0' && x <= '9'; // plain range comparison on the character value
+WorkerThreadMain(void *param) // Worker entry point for the single-output path; param is the ThreadContext* passed to StartNewThread.
+ ThreadContext *context = (ThreadContext *)param;
+ ReadSupplier *readSupplier = readSupplierGenerator->generateNewReadSupplier(); // each worker pulls from its own supplier
+ Read *read;
+ while (NULL != (read = readSupplier->getNextRead())) { // a NULL read ends the loop
+ context->totalReads++;
+ fastqWriter[0]->writeRead(read); // NOTE(review): every worker writes through the same writer; presumably writeRead serializes callers - confirm
+ } // while the supplier still returns reads
+ if (0 == InterlockedAdd64AndReturnNewValue(&nRunningThreads, -1)) { // the last worker to finish sees the count reach 0
+ SignalSingleWaiterObject(&allThreadsDone); // wakes main(), which is blocked in WaitForSingleWaiterObject
+ }
+ProcessPairedInput(PairedReadSupplierGenerator *pairedReadSupplierGenerator) // Writes every read pair to fastqWriter[0] / fastqWriter[1] and returns the number of reads written.
+ //
+ // This runs single-threaded, because that is the only easy way to ensure that the two output files stay in matching order.
+ //
+ PairedReadSupplier *readSupplier = pairedReadSupplierGenerator->generateNewPairedReadSupplier();
+ Read *read[NUM_READS_PER_PAIR];
+ const size_t idBufferSize = 10000; // capacity of the scratch buffer used to build the suffixed read ID
+ char idBuffer[idBufferSize];
+ _int64 totalReads = 0;
+ while (readSupplier->getNextReadPair(&read[0], &read[1])) { // loop ends when the supplier reports no more pairs
+ for (int i = 0; i < NUM_READS_PER_PAIR; i++) { // NOTE(review): fastqWriter has 2 slots, so this assumes NUM_READS_PER_PAIR is 2 - confirm
+ //
+ // Build a temporary read whose ID is the original ID followed by "/1" or "/2". Only the ID, bases,
+ // qualities and length are filled in; the remaining init() arguments are passed as zero/NULL.
+ //
+ snprintf(idBuffer, idBufferSize-1,"%.*s/%d", read[i]->getIdLength(), read[i]->getId(), i+1); // "%.*s" bounds the ID by getIdLength()
+ Read local;
+ local.init(idBuffer, (unsigned)strlen(idBuffer),read[i]->getUnclippedData(), read[i]->getUnclippedQuality(), read[i]->getUnclippedLength(),
+ 0,0,0,0,0,0,0,NULL,0,0); // unclipped data is used, so the complete read is emitted
+ fastqWriter[i]->writeRead(&local); // mate i goes to writer i (the same writer for both when interleaving)
+ }
+ totalReads += 2; // one pair == two reads
+ }
+ return totalReads;
+int main(int argc, char * argv[]) // Entry point: ToFASTQ genomeIndex inputFile outputFile {outputFile2}
+ BigAllocUseHugePages = false;
+ if (4 != argc && 5 != argc) usage(); // usage() prints the help text and calls soft_exit(1)
+ static const char *genomeSuffix = "Genome";
+ size_t filenameLen = strlen(argv[1]) + 1 + strlen(genomeSuffix) + 1; // index directory + PATH_SEP + "Genome" + terminating NUL
+ char *fileName = new char[strlen(argv[1]) + 1 + strlen(genomeSuffix) + 1]; // NOTE(review): same expression as filenameLen, computed a second time
+ snprintf(fileName,filenameLen,"%s%c%s",argv[1],PATH_SEP,genomeSuffix);
+ genome = Genome::loadFromFile(fileName, 0);
+ if (NULL == genome) {
+ fprintf(stderr,"Unable to load genome from file '%s'\n",fileName);
+ return -1; // NOTE(review): fileName is not freed on this path
+ }
+ delete [] fileName;
+ fileName = NULL;
+ inputFileName = argv[2];
+ fastqWriter[0] = FASTQWriter::Factory(argv[3]); // first (or only) output file
+ if (NULL == fastqWriter[0]) {
+ fprintf(stderr,"Unable to open FASTQ writer for output file '%s'\n", argv[3]);
+ soft_exit(1);
+ }
+ _int64 totalReads = 0;
+ unsigned nThreads;
+#ifdef _DEBUG
+ nThreads = 1;
+#else // _DEBUG
+ if (5 == argc) {
+ nThreads = 1; // paired output is produced by ProcessPairedInput on this thread only
+ } else {
+ nThreads = GetNumberOfProcessors();
+ }
+#endif // _DEBUG
+ DataSupplier::ThreadCount = nThreads;
+ nRunningThreads = nThreads; // counted down by WorkerThreadMain on the single-output path
+ ReaderContext readerContext;
+ readerContext.clipping = NoClipping;
+ readerContext.defaultReadGroup = "";
+ readerContext.genome = genome;
+ readerContext.ignoreSecondaryAlignments = true;
+ readerContext.ignoreSupplementaryAlignments = true;
+ readerContext.header = NULL;
+ readerContext.headerLength = 0;
+ readerContext.headerBytes = 0;
+ if (5 == argc) { // a second output argument was given: treat the input as paired
+ if (!strcmp(argv[4], "-i")) {
+ fastqWriter[1] = fastqWriter[0]; // Interleaved: both mates go through the same writer
+ } else {
+ fastqWriter[1] = FASTQWriter::Factory(argv[4]);
+ if (NULL == fastqWriter[1]) {
+ fprintf(stderr,"Unable to open FASTQ writer for output file '%s'\n", argv[4]);
+ soft_exit(1);
+ }
+ }
+ PairedReadSupplierGenerator *pairedReadSupplierGenerator;
+ if (NULL != strrchr(inputFileName, '.') && !_stricmp(strrchr(inputFileName, '.'), ".bam")) { // a final extension of ".bam" selects the BAM reader
+ pairedReadSupplierGenerator = BAMReader::createPairedReadSupplierGenerator(inputFileName, nThreads, true, readerContext);
+ } else { // anything else is read as SAM
+ pairedReadSupplierGenerator = SAMReader::createPairedReadSupplierGenerator(inputFileName, nThreads, true, readerContext);
+ }
+ totalReads = ProcessPairedInput(pairedReadSupplierGenerator);
+ if (fastqWriter[1] != fastqWriter[0]) { // delete the second writer only when it is a distinct object
+ delete fastqWriter[1];
+ } else {
+ fastqWriter[1] = NULL; // interleaved: the shared writer is deleted once, through fastqWriter[0], below
+ }
+ } else { // single output file: one worker thread per nThreads
+ if (NULL != strrchr(inputFileName, '.') && !_stricmp(strrchr(inputFileName, '.'), ".bam")) { // same extension test as the paired branch
+ readSupplierGenerator = BAMReader::createReadSupplierGenerator(inputFileName, nThreads, readerContext);
+ } else {
+ readSupplierGenerator = SAMReader::createReadSupplierGenerator(inputFileName, nThreads, readerContext);
+ }
+ CreateSingleWaiterObject(&allThreadsDone);
+ ThreadContext *contexts = new ThreadContext[nThreads]; // NOTE(review): never deleted before main() returns
+ for (unsigned i = 0; i < nThreads; i++) {
+ contexts[i].whichThread = i;
+ StartNewThread(WorkerThreadMain, &contexts[i]);
+ }
+ WaitForSingleWaiterObject(&allThreadsDone); // signalled by the last worker to finish
+ for (unsigned i = 0; i < nThreads; i++) {
+ totalReads += contexts[i].totalReads; // add up the per-thread counters
+ }
+ }
+ printf("%lld reads\n", totalReads);
+ delete fastqWriter[0];
+ return 0;
diff --git a/apps/ToFASTQ/ToFASTQ.vcxproj b/apps/ToFASTQ/ToFASTQ.vcxproj
new file mode 100644
index 0000000..53f266a
--- /dev/null
+++ b/apps/ToFASTQ/ToFASTQ.vcxproj
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{E58705DB-9DFF-4892-B3BD-C76A352F9F37}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>ToFASTQ</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ToFASTQ\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\ToFASTQ\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link> <!-- Library list matches the Release|x64 configuration; libhdfs.lib is listed once -->
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="ToFASTQ.cpp" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/ToFASTQ/ToFASTQ.vcxproj.filters b/apps/ToFASTQ/ToFASTQ.vcxproj.filters
new file mode 100644
index 0000000..a6de0a3
--- /dev/null
+++ b/apps/ToFASTQ/ToFASTQ.vcxproj.filters
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ToFASTQ.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/ToFASTQ/stdafx.cpp b/apps/ToFASTQ/stdafx.cpp
new file mode 100644
index 0000000..0a5e0b1
--- /dev/null
+++ b/apps/ToFASTQ/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// ToFASTQ.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/ToFASTQ/stdafx.h b/apps/ToFASTQ/stdafx.h
new file mode 100644
index 0000000..7c82d2b
--- /dev/null
+++ b/apps/ToFASTQ/stdafx.h
@@ -0,0 +1,5 @@
+#ifdef _MSC_VER
+#include "..\..\SNAPLib\stdafx.h"
+#include "../../SNAPLib/stdafx.h"
diff --git a/apps/snap/Main.cpp b/apps/snap/Main.cpp
new file mode 100644
index 0000000..eaa3d5e
--- /dev/null
+++ b/apps/snap/Main.cpp
@@ -0,0 +1,33 @@
+Module Name:
+ Main.cpp
+ Just call into the command processor from SNAPLib
+ Matei Zaharia & Bill Bolosky, February, 2012
+ User mode service.
+Revision History:
+ Adapted from cSNAP, which was in turn adapted from the scala prototype
+#include "stdafx.h"
+#include "CommandProcessor.h"
+//The SNAP_VERSION string moved to SNAPLib\CommandProcessor.cpp
+int main(int argc, const char **argv) // Entry point of the snap executable; all work is delegated to SNAPLib.
+ ProcessTopLevelCommands(argc, argv); // declared in CommandProcessor.h
diff --git a/apps/snap/snap.vcxproj b/apps/snap/snap.vcxproj
new file mode 100644
index 0000000..61b6c7a
--- /dev/null
+++ b/apps/snap/snap.vcxproj
@@ -0,0 +1,184 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{76E127A5-2247-4A10-9DC8-59518C7C6636}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>snap</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ <StackReserveSize>8000000</StackReserveSize>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ <StackReserveSize>8000000</StackReserveSize>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="Main.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/snap/snap.vcxproj.filters b/apps/snap/snap.vcxproj.filters
new file mode 100644
index 0000000..b82949c
--- /dev/null
+++ b/apps/snap/snap.vcxproj.filters
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="Main.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/snap/stdafx.cpp b/apps/snap/stdafx.cpp
new file mode 100644
index 0000000..e392466
--- /dev/null
+++ b/apps/snap/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// snap.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/snap/stdafx.h b/apps/snap/stdafx.h
new file mode 100644
index 0000000..45028a6
--- /dev/null
+++ b/apps/snap/stdafx.h
@@ -0,0 +1,50 @@
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+#pragma once
+#ifdef _MSC_VER
+#include "targetver.h"
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <ctype.h>
+#include <errno.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _MSC_VER
+#include <tchar.h>
+#include <crtdbg.h>
+#include <windows.h>
+#include <direct.h>
+#include <wincrypt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+// MAP_ANONYMOUS is called MAP_ANON on OS X
diff --git a/apps/stringz/GoodRandom.cpp b/apps/stringz/GoodRandom.cpp
new file mode 100644
index 0000000..e251b06
--- /dev/null
+++ b/apps/stringz/GoodRandom.cpp
@@ -0,0 +1,159 @@
+Module Name:
+ GoodRandom.cpp
+ Cryptographically secure random number generator that's fast because it batches getting its random
+ source from CryptGenRandom(). It's careful not to bias results when the requested range isn't a power
+ of two.
+ On Linux, uses the Mersenne Twister (or rand() in RELEASE builds) to get the raw numbers and mostly just
+ applies the anti-biasing related to random number ranges that aren't powers of two.
+ Bill Bolosky, April, 2011
+#include "stdafx.h"
+#include "GoodRandom.h"
+//#include "mt64.h"
+unsigned MinBytesToStore(_uint64 maxValue)
+{
+    //
+    // Returns the smallest number of bytes (0 through 8) that can represent maxValue.  Zero needs no
+    // bytes at all; every additional non-zero byte of the value adds one.
+    //
+    unsigned nBytes = 0;
+    for (_uint64 remaining = maxValue; 0 != remaining; remaining >>= 8) {
+        nBytes++;
+    }
+    return nBytes;
+}
+#ifdef _MSC_VER // State used only by the CryptGenRandom-backed path of GoodFastRandom()
+const unsigned GFRandomBufferSize = 10 * 1024 * 1024; // Bytes of random data requested per CryptGenRandom() call
+struct GFRandomState { // Cache of random bytes that have been fetched but not yet consumed
+ char *buffer; // GFRandomBufferSize bytes, allocated on first use in GoodFastRandom()
+ unsigned bufferUsed; // Number of bytes of buffer already handed out
+__declspec(thread) GFRandomState *g_randomState = NULL; // Thread-local; stays NULL until this thread's first GoodFastRandom() call
+__declspec(thread) HCRYPTPROV g_hRandomCryptProv; // Thread-local crypto provider handle, acquired when g_randomState is created
+_uint64 GoodFastRandom(_uint64 maxValue) // Returns a value in {0..maxValue} (inclusive), rejecting raw draws that would bias the result
+ if (0 == maxValue) { // Only one possible answer, so don't consume any random data
+ return 0;
+ }
+#ifdef _MSC_VER
+ if (maxValue > 0xffffffffffffff && maxValue != 0xffffffffffffffff) { // Needs 8 bytes but isn't the full-range special case
+ fprintf(stderr,"GoodFastRandom: writeme\n");
+ exit(1);
+ // This case requires different math in the roundoff error check below, because as written
+ // it would use 1 << 64, which isn't representable. I'm too lazy to write it now.
+ } // NOTE(review): this guard is compiled only under _MSC_VER; other builds reach the 1 << 64 shift below for such maxValue
+ if (NULL == g_randomState) { // First call on this thread: set up the byte cache and the crypto provider
+ g_randomState= new GFRandomState;
+ g_randomState->buffer = new char[GFRandomBufferSize];
+ g_randomState->bufferUsed = GFRandomBufferSize; // Forces us to get new random data the first time through.
+ BOOL worked = CryptAcquireContext( // NOTE(review): the remaining arguments of this call are not visible in this excerpt
+ &g_hRandomCryptProv,
+ "BillKeyContainer",
+ NULL, // default provider
+ if (!worked && NTE_BAD_KEYSET == GetLastError()) { // Retry once when the failure was a bad/missing key set
+ worked = CryptAcquireContext( // NOTE(review): the remaining arguments of this call are not visible in this excerpt
+ &g_hRandomCryptProv,
+ "BillKeyContainer",
+ NULL, // default provider
+ }
+ if (!worked) {
+ fprintf(stderr,"Unable to get crypt provider, %d\n",GetLastError());
+ exit(1);
+ }
+ }
+#endif // _MSC_VER
+ unsigned bytesToGet = MinBytesToStore(maxValue); // Draw only as many random bytes as maxValue needs
+ for (;;) { // Rejection loop: repeats until a draw lands in the unbiased part of the range
+ _uint64 rawValue = 0;
+#ifdef _MSC_VER
+ if (g_randomState->bufferUsed + bytesToGet > GFRandomBufferSize) { // Not enough cached bytes left: refill the whole buffer
+ if (!CryptGenRandom(g_hRandomCryptProv,GFRandomBufferSize,(PBYTE)g_randomState->buffer)) {
+ fprintf(stderr,"CryptGenRandom failed, %d\b\n",GetLastError()); // NOTE(review): the \b (backspace) before \n looks like a typo
+ exit(1);
+ }
+ g_randomState->bufferUsed = 0;
+ }
+ memcpy(&rawValue,g_randomState->buffer + g_randomState->bufferUsed,bytesToGet); // Copies bytesToGet cached bytes into the start of rawValue's storage
+ g_randomState->bufferUsed += bytesToGet;
+#else // _MSC_VER
+ #ifdef RELEASE
+ #if RAND_MAX % 0x100 != 0xff // Low byte of rand() is only taken as-is when RAND_MAX ends in 0xff
+ fprintf(stderr,"Jesse was too lazy to correct random bias on your platform.\n");
+ exit(1);
+ #endif
+ for (unsigned b = 0; b < bytesToGet; ++b) { // Assemble rawValue one byte at a time from successive rand() calls
+ // RAND_MAX could in theory be less than 0xffff.
+ _uint64 randomByte = rand() & 0xff;
+ rawValue |= randomByte << (b * 8);
+ }
+ #else
+ rawValue = genrand64_int64() >> (8 - bytesToGet) * 8; // Keep the top bytesToGet bytes of a 64-bit draw
+ #endif // RELEASE
+#endif // _MSC_VER
+ //
+ // Be careful here not to bias the result. If maxValue + 1 doesn't go evenly into 256^bytesToGet
+ // then there would be a bias toward the lower values, since they get one extra representation.
+ // So, if the value is in the last part, throw it away and try again.
+ //
+ if (0xffffffffffffffff == maxValue) {
+ //
+ // Special case for full range, because it wouldn't work with the code below.
+ //
+ return rawValue;
+ }
+ _uint64 maxRawValuePlusOne = ((_uint64)1) << (bytesToGet * 8); // 256^bytesToGet, the number of distinct raw values
+ if (rawValue < maxRawValuePlusOne - maxRawValuePlusOne % (maxValue+1)) { // Accept only draws below the largest multiple of maxValue+1
+ return rawValue % (maxValue+1);
+ }
+ }
diff --git a/apps/stringz/GoodRandom.h b/apps/stringz/GoodRandom.h
new file mode 100644
index 0000000..bb9ddc0
--- /dev/null
+++ b/apps/stringz/GoodRandom.h
@@ -0,0 +1,33 @@
+Module Name:
+ GoodRandom.h
+ Cryptographically secure random number generator that's fast because it batches getting its random
+ source from CryptGenRandom(). It's careful not to bias results when the requested range isn't a power
+ of two.
+ Bill Bolosky, April, 2011
+ Not thread safe.
+#pragma once
+#include "Compat.h"
+unsigned MinBytesToStore(_uint64 maxValue);
+// This returns a cryptographically secure random number from {0..maxValue} (NOT {0..maxValue-1}!)
+// It's computationally efficient, too.
+_uint64 GoodFastRandom(_uint64 maxValue);
diff --git a/apps/stringz/stdafx.cpp b/apps/stringz/stdafx.cpp
new file mode 100644
index 0000000..d875710
--- /dev/null
+++ b/apps/stringz/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// stringz.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/stringz/stdafx.h b/apps/stringz/stdafx.h
new file mode 100644
index 0000000..7c82d2b
--- /dev/null
+++ b/apps/stringz/stdafx.h
@@ -0,0 +1,5 @@
+#ifdef _MSC_VER
+#include "..\..\SNAPLib\stdafx.h"
+#include "../../SNAPLib/stdafx.h"
diff --git a/apps/stringz/stringz.cpp b/apps/stringz/stringz.cpp
new file mode 100644
index 0000000..7349b47
--- /dev/null
+++ b/apps/stringz/stringz.cpp
@@ -0,0 +1,245 @@
+// stringz.cpp : Defines the entry point for the console application.
+#include "stdafx.h"
+#include "GoodRandom.h"
+#include <math.h>
+size_t bigStringSize = 1000000; // Length of the random reference string; main() overrides it from argv[3]
+unsigned smallStringSize = 11; // Length of each sampled substring; main() overrides it from argv[2]
+char *bigString = NULL; // The reference string of bigStringSize random alphabet characters
+unsigned alphabetSize = 4; // Number of distinct symbols; main() overrides it from argv[5]
+double differenceProb = .001; // Per-character mutation probability; main() overrides it from argv[4]
+unsigned unitDifferenceTable[256]; // Indexed by the XOR of two characters: entry 0 is 0, every other entry is 1
+char *alphabetTable = NULL; // Maps a symbol index (0..alphabetSize-1) to its character
+double *probByEditDistance = NULL; // probByEditDistance[d] is the match probability given to a candidate at Hamming distance d
+ bigString = new char[bigStringSize]; // NOTE(review): presumably the body of GenerateBigString(), which main() calls; its signature line is not visible here
+ alphabetTable = new char[alphabetSize]; // One character per symbol index
+ if (alphabetSize == 4) {
+ //
+ // Special case 4 to be the genetic bases.
+ //
+ alphabetTable[0] = 'A';
+ alphabetTable[1] = 'C';
+ alphabetTable[2] = 'G';
+ alphabetTable[3] = 'T';
+ } else {
+ for (unsigned i = 0; i < alphabetSize; i++) { // Otherwise use consecutive letters starting at 'A'
+ alphabetTable[i] = 'A' + i;
+ }
+ }
+ for (size_t i = 0; i < bigStringSize; i++) { // Fill the reference string with independently chosen random symbols
+ bigString[i] = alphabetTable[GoodFastRandom(alphabetSize-1)]; // GoodFastRandom's upper bound is inclusive, hence alphabetSize-1
+ }
+void Hamming(const char *str1, const char *str2, size_t len, double *matchProb, unsigned *editDistance)
+{
+    //
+    // Computes the Hamming distance between the first len characters of str1 and str2 and looks up the
+    // match probability that corresponds to that distance.
+    //
+    //  str1, str2    - the strings to compare; each must have at least len readable characters
+    //  len           - number of characters to compare; must not exceed smallStringSize, because
+    //                  probByEditDistance has only smallStringSize + 1 entries
+    //  matchProb     - receives probByEditDistance[*editDistance]
+    //  editDistance  - receives the number of positions at which the two strings differ
+    //
+    unsigned distance = 0;
+    for (size_t i = 0; i < len; i++) {
+        //
+        // The XOR is 0 if the characters are equal, and nonzero if they're not.  It has to be held in an
+        // unsigned char: plain char may be signed, and a negative value would index before the start of
+        // unitDifferenceTable.
+        //
+        unsigned char xorValue = (unsigned char)(str1[i] ^ str2[i]);
+        distance += unitDifferenceTable[xorValue];
+    }
+    *editDistance = distance;
+    *matchProb = probByEditDistance[distance];
+}
+ fprintf(stderr,"usage: stringz nStrings [smallStringSize [bigStringSize [differenceProb [alphabetSize]]]]\n"); // NOTE(review): presumably the body of usage(), which main() calls; its signature line is not visible here
+ exit(1); // Bad command line: terminate with a failure status
+const unsigned max_MAPQ = 70; // Highest MAPQ bucket; runTest() assigns it only when the best candidate holds all of the probability
+struct PerMAPQ { // Tallies for a single MAPQ value
+ size_t count; // Number of trials that were assigned this MAPQ
+ size_t errors; // Of those trials, how many chose a location other than the true one
+ PerMAPQ() : count(0), errors(0) {}
+struct MAPQHistogram {
+    // One tally per MAPQ value, 0 through max_MAPQ inclusive.
+    PerMAPQ perMAPQ[max_MAPQ+1];
+    // Adds peer's counts and errors into this histogram, bucket by bucket.
+    void operator+=(MAPQHistogram &peer)
+    {
+        PerMAPQ *mine = perMAPQ;
+        PerMAPQ *theirs = peer.perMAPQ;
+        for (unsigned remaining = max_MAPQ + 1; remaining != 0; remaining--, mine++, theirs++) {
+            mine->count += theirs->count;
+            mine->errors += theirs->errors;
+        }
+    }
+};
+void runTest(size_t count, MAPQHistogram *result)
+{
+    //
+    // Runs count trials.  Each trial copies a random smallStringSize-long window of bigString, mutates
+    // each character with probability differenceProb, then scans every placement in bigString for the
+    // one with the lowest Hamming distance.  A MAPQ is computed from the share of the total match
+    // probability held by that best placement, and the trial is tallied in that MAPQ's bucket, as an
+    // error if the best placement is not where the window really came from.
+    //
+    //  count   - number of trials to run
+    //  result  - receives the histogram for these trials (overwritten, not accumulated into)
+    //
+    MAPQHistogram histogram;
+    //
+    // Allocate one extra byte for the terminator.  The pointer itself must not be offset, or the writes
+    // below would run past the end of the allocation.
+    //
+    char *smallString = new char[smallStringSize + 1];
+    smallString[smallStringSize] = 0; // Not that anything looks at this.
+    unsigned oneHundredPer = (unsigned)(100/differenceProb); // The odds of a difference are 100 per this many. It lets us use GoodFastRandom to determine whether to introduce a difference.
+    //
+    // Windows may start at offsets 0 through lastOffset inclusive.  The same bound is used to pick the
+    // true offset and to search, so the true location is always among the candidates.
+    //
+    const size_t lastOffset = bigStringSize - smallStringSize;
+    for (size_t i = 0; i < count; i++) {
+        //
+        // Select a random offset in the big string.
+        //
+        size_t bigStringOffset = (size_t)GoodFastRandom(lastOffset);
+        //
+        // Mutate the string
+        //
+        for (unsigned j = 0; j < smallStringSize; j++) {
+            if (GoodFastRandom(oneHundredPer-1) < 100) {
+                //
+                // NOTE(review): the replacement is chosen from the character code, not the symbol index,
+                // so for some alphabet sizes (e.g. 3) it can equal the original character.
+                //
+                smallString[j] = alphabetTable[(bigString[j+bigStringOffset] + 1)%alphabetSize];
+            } else {
+                smallString[j] = bigString[j+bigStringOffset];
+            }
+        }
+        unsigned bestEditDistance = smallStringSize + 1; // Worse than any real distance, so the first candidate wins
+        double bestProbability = 0;
+        double totalProbability = 0;
+        size_t bestLocation = bigStringSize; // Not a valid offset; replaced by the first candidate
+        for (size_t j = 0; j <= lastOffset; j++) {
+            double matchProb;
+            unsigned editDistance;
+            Hamming(smallString, bigString + j, smallStringSize, &matchProb, &editDistance);
+            if (editDistance < bestEditDistance) {
+                bestEditDistance = editDistance;
+                bestProbability = matchProb;
+                bestLocation = j;
+            }
+            totalProbability += matchProb;
+        }
+        //
+        // Compute MAPQ and update the histogram
+        //
+        unsigned MAPQ;
+        if (bestProbability == totalProbability) {
+            MAPQ = max_MAPQ;
+        } else {
+            MAPQ = __min(max_MAPQ-1, (int)(-10 * log10(1 - bestProbability/totalProbability)));
+        }
+        histogram.perMAPQ[MAPQ].count++;
+        if (bestLocation != bigStringOffset) {
+            histogram.perMAPQ[MAPQ].errors++;
+        }
+    }
+    delete [] smallString;
+    *result = histogram;
+}
+struct ThreadContext { // Input and output of one worker thread
+ MAPQHistogram histogram; // Filled in by the worker through runTest()
+ size_t count; // Number of trials this worker runs
+volatile int nThreadsRunning; // Workers still running; the one that decrements it to 0 signals threadsDone
+SingleWaiterObject threadsDone; // Signalled by the last worker to finish; main() waits on it
+void WorkerThreadMain(void *param)
+{
+    //
+    // Thread entry point: runs this worker's share of the trials into its own histogram, and wakes the
+    // waiter if this was the last worker still running.  param is the worker's ThreadContext.
+    //
+    ThreadContext *work = static_cast<ThreadContext *>(param);
+    runTest(work->count, &work->histogram);
+    const bool lastOneOut = (InterlockedDecrementAndReturnNewValue(&nThreadsRunning) == 0);
+    if (lastOneOut) {
+        SignalSingleWaiterObject(&threadsDone);
+    }
+}
+int main(int argc, char* argv[])
+{
+    //
+    // Entry point for stringz.  Parses
+    //     stringz nStrings [smallStringSize [bigStringSize [differenceProb [alphabetSize]]]]
+    // builds the lookup tables and the random big string, splits the nStrings trials across one worker
+    // thread per processor, then prints the merged MAPQ histogram followed by the throughput.
+    //
+    // Integer arguments are scanned into a long long and range-checked before being narrowed, so every
+    // scanf conversion matches the type of the object it writes regardless of the width of size_t.
+    //
+    if (argc < 2 || argc > 6) usage();
+    long long parsed;
+    if (1 != sscanf(argv[1], "%lld", &parsed) || parsed <= 0 || (long long)(size_t)parsed != parsed) {
+        usage();
+    }
+    size_t nStringz = (size_t)parsed;
+    if (argc > 2) {
+        if (1 != sscanf(argv[2], "%lld", &parsed) || parsed <= 0 || (long long)(unsigned)parsed != parsed) usage();
+        smallStringSize = (unsigned)parsed;
+        if (argc > 3) {
+            if (1 != sscanf(argv[3], "%lld", &parsed) || parsed <= 0 || (long long)(size_t)parsed != parsed) usage();
+            bigStringSize = (size_t)parsed;
+            if (argc > 4) {
+                if (1 != sscanf(argv[4], "%lf", &differenceProb) || differenceProb <= 0.0) usage();
+                if (argc > 5) {
+                    if (1 != sscanf(argv[5], "%lld", &parsed) || parsed <= 0 || parsed >= 26) usage();
+                    alphabetSize = (unsigned)parsed;
+                }
+            }
+        }
+    }
+    if (smallStringSize > bigStringSize) usage();
+    //
+    // Any nonzero XOR of two characters counts as one difference.
+    //
+    for (int i = 1; i < 256; i++) {
+        unitDifferenceTable[i] = 1;
+    }
+    unitDifferenceTable[0] = 0;
+    //
+    // probByEditDistance[i] is the probability of seeing exactly i specific substitutions: i factors for
+    // the mismatches and smallStringSize - i factors for the matches.
+    //
+    probByEditDistance = new double[smallStringSize+1];
+    for (unsigned i = 0; i <= smallStringSize; i++) {
+        probByEditDistance[i] = 1.0;
+        for (unsigned j = 0; j < i; j++) {
+            probByEditDistance[i] *= differenceProb/(alphabetSize - 1);
+        }
+        for (unsigned j = i+1; j <= smallStringSize; j++) {
+            probByEditDistance[i] *= (1 - differenceProb)/*/(alphabetSize - 1)*/;
+        }
+    }
+#ifdef _MSC_VER
+    if (!SetPriorityClass(GetCurrentProcess(), IDLE_PRIORITY_CLASS)) {
+        fprintf(stderr,"SetPriorityClass failed, %d\n", GetLastError());
+    }
+#endif // _MSC_VER
+    GenerateBigString();
+    unsigned nThreads = GetNumberOfProcessors();
+    ThreadContext *threadContexts = new ThreadContext[nThreads];
+    _int64 start = timeInMillis();
+    nThreadsRunning = nThreads;
+    CreateSingleWaiterObject(&threadsDone);
+    //
+    // Every worker but the last gets an equal share; the last one also takes the remainder.
+    //
+    for (unsigned i = 0; i < nThreads - 1; i++) {
+        threadContexts[i].count = nStringz / nThreads;
+        StartNewThread(WorkerThreadMain, &threadContexts[i]);
+    }
+    threadContexts[nThreads-1].count = nStringz - (nStringz/nThreads) * (nThreads - 1);
+    StartNewThread(WorkerThreadMain, &threadContexts[nThreads - 1]);
+    WaitForSingleWaiterObject(&threadsDone);
+    MAPQHistogram result;
+    for (unsigned i = 0; i < nThreads; i++) {
+        result += threadContexts[i].histogram;
+    }
+    for (unsigned i = 0; i <= max_MAPQ; i++) {
+        printf("%u\t%lld\t%lld\n", i, (long long)result.perMAPQ[i].count, (long long)result.perMAPQ[i].errors);
+    }
+    _int64 stop = timeInMillis();
+    //
+    // Treat a run that finished within the timer's resolution as taking 1ms, so the rate below never
+    // divides by zero.
+    //
+    long long elapsedMillis = (long long)(stop - start);
+    if (elapsedMillis <= 0) {
+        elapsedMillis = 1;
+    }
+    printf("\nProcessed %lld stringz in %llds, %lld stringz/s\n", (long long)nStringz, (elapsedMillis + 500) / 1000, ((long long)nStringz * 1000) / elapsedMillis);
+    return 0;
+}
diff --git a/apps/stringz/stringz.vcxproj b/apps/stringz/stringz.vcxproj
new file mode 100644
index 0000000..3707f21
--- /dev/null
+++ b/apps/stringz/stringz.vcxproj
@@ -0,0 +1,177 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{A587E829-823D-4CA9-9CD4-C563A4617474}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>stringz</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <CharacterSet>MultiByte</CharacterSet>
+ <PlatformToolset>v120</PlatformToolset>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <CharacterSet>MultiByte</CharacterSet>
+ <PlatformToolset>v120</PlatformToolset>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ <PlatformToolset>v120</PlatformToolset>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ <PlatformToolset>v120</PlatformToolset>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>zlibstat.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>zlibstat.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>zlibstat.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\;..\..\import</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)\import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>zlibstat.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <None Include="ReadMe.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="targetver.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="GoodRandom.cpp" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="stringz.cpp" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/stringz/stringz.vcxproj.filters b/apps/stringz/stringz.vcxproj.filters
new file mode 100644
index 0000000..ab37645
--- /dev/null
+++ b/apps/stringz/stringz.vcxproj.filters
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="ReadMe.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="stringz.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="GoodRandom.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/apps/stringz/targetver.h b/apps/stringz/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/apps/stringz/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/apps/wc/stdafx.cpp b/apps/wc/stdafx.cpp
new file mode 100644
index 0000000..cda96d8
--- /dev/null
+++ b/apps/wc/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// wc.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/apps/wc/stdafx.h b/apps/wc/stdafx.h
new file mode 100644
index 0000000..7c82d2b
--- /dev/null
+++ b/apps/wc/stdafx.h
@@ -0,0 +1,5 @@
+#ifdef _MSC_VER
+#include "..\..\SNAPLib\stdafx.h"
+#include "../../SNAPLib/stdafx.h"
diff --git a/apps/wc/targetver.h b/apps/wc/targetver.h
new file mode 100644
index 0000000..87c0086
--- /dev/null
+++ b/apps/wc/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/apps/wc/wc.cpp b/apps/wc/wc.cpp
new file mode 100644
index 0000000..a116dde
--- /dev/null
+++ b/apps/wc/wc.cpp
@@ -0,0 +1,259 @@
+Module Name:
+ wc.cpp
+ Version of the standard wc (word count) program that uses 64 bit counters.
+ Bill Bolosky, September, 2013
+Revision History:
+#include "stdafx.h"
+#include "Compat.h"
+#include "exit.h"
+void usage() // Reports the command-line synopsis on stderr, then calls soft_exit(1)
+ fprintf(stderr,"usage: wc [-lwc] [files]\n");
+ soft_exit(1); // NOTE(review): soft_exit is declared in exit.h; presumably it terminates the process - confirm
+struct InputFile {
+    //
+    // One node in the singly linked list of inputs.  It carries both the input's name and, once its
+    // worker thread has finished, the counts for that input.
+    //
+    InputFile() : fileName(NULL), lines(0), words(0), chars(0), next(NULL) {}
+    const char *fileName;           // Name as given on the command line, or "-" for stdin.  Not owned by the node.
+    _uint64 lines, words, chars;    // Results, written by WorkerThreadMain
+    InputFile *next;                // Next input in command-line order, or NULL for the last one
+};
+SingleWaiterObject allThreadsDone; // Signalled by the worker that brings nRunningThreads to 0; main() waits on it
+volatile _int64 nRunningThreads; // Set by main() to the number of inputs; each worker decrements it when done
+void WorkerThreadMain(void *context)
+{
+    //
+    // Thread entry point: counts the lines, words and characters of one input and stores them in its
+    // InputFile (passed as context).  The name "-" means stdin.  Wakes the waiter if this was the last
+    // worker still running.
+    //
+    InputFile *input = (InputFile *)context;
+    FILE *stream = stdin;
+    if (0 != strcmp(input->fileName, "-")) {
+        stream = fopen(input->fileName, "rb");
+        if (NULL == stream) {
+            fprintf(stderr,"wc: unable to open input file '%s'\n", input->fileName);
+            soft_exit(1);
+        }
+    }
+    const size_t bufferSize = 8 * 1024 * 1024; // A decent disk IO size
+    unsigned char *buffer = new unsigned char[bufferSize];
+    _uint64 lineCount = 0, wordCount = 0, charCount = 0;
+    //
+    // Classification is done with lookup tables rather than conditional branches.  All three start out
+    // all-zero.
+    //
+    int isSeparator[256] = {0};
+    int isLineBreak[256] = {0};
+    int isWordPart[256] = {0};
+    //
+    // Characters that come between words.  Punctuation is deliberately not included, to match the old wc.
+    //
+    const unsigned char separators[] = {' ', '\t', '\n', '\r'};
+    for (size_t s = 0; s < sizeof(separators); s++) {
+        isSeparator[separators[s]] = 1;
+    }
+    //
+    // Only "\n" ends a line; counting "\r" too would double the line count of CRLF text.
+    //
+    isLineBreak['\n'] = 1;
+    //
+    // Letters and digits are the parts of words.
+    //
+    for (int c = 0; c < 256; c++) {
+        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
+            isWordPart[c] = 1;
+        }
+    }
+    //
+    // Now process the input, one buffer at a time.
+    //
+    _uint64 outsideWord = 1;
+    for (;;) {
+        const size_t got = fread(buffer, 1, bufferSize, stream);
+        if (0 == got) {
+            break;
+        }
+        charCount += got;
+        const unsigned char *end = buffer + got;
+        for (const unsigned char *p = buffer; p != end; p++) {
+            const unsigned char c = *p;
+            lineCount += isLineBreak[c];
+            //
+            // Branch-free word counting: a word starts when a word character is seen while outside a
+            // word.  Afterwards we are outside a word if (separator seen || already outside) && the
+            // character is not itself part of a word.
+            //
+            wordCount += outsideWord * isWordPart[c];
+            outsideWord = (1 - ((1 - isSeparator[c]) * (1 - outsideWord))) * (1 - isWordPart[c]);
+        }
+    }
+    if (!feof(stream)) {
+        fprintf(stderr,"Error reading file '%s'\n", input->fileName);
+        soft_exit(1);
+    }
+    input->chars = charCount;
+    input->words = wordCount;
+    input->lines = lineCount;
+    fclose(stream);
+    delete [] buffer;
+    if (0 == InterlockedAdd64AndReturnNewValue(&nRunningThreads, -1)) {
+        SignalSingleWaiterObject(&allThreadsDone);
+    }
+}
+void printOutputLine(
+    _uint64 chars,
+    _uint64 words,
+    _uint64 lines,
+    const char *fileName,
+    bool printChars,
+    bool printWords,
+    bool printLines)
+{
+    //
+    // Prints one tab-separated result row: a leading tab, then whichever of lines, words and chars are
+    // enabled (always in that order), then the file name.
+    //
+    const _uint64 values[3] = {lines, words, chars};
+    const bool enabled[3] = {printLines, printWords, printChars};
+    printf("\t"); // Not sure why, but unix wc does this, so I'll keep it.
+    for (int column = 0; column < 3; column++) {
+        if (enabled[column]) {
+            printf("%llu\t", values[column]);
+        }
+    }
+    printf("%s\n", fileName);
+}
+int main(int argc, char* argv[])
+{
+    //
+    // Entry point for wc.  Every argument that starts with '-' and has more characters is a bundle of
+    // options (l, w, c); everything else, including a bare "-" for stdin, is an input.  With no inputs,
+    // stdin is read.  One worker thread counts each input; results are printed in command-line order,
+    // followed by a "Totals" row when there is more than one input.  With no options, all three counts
+    // are printed.
+    //
+    static char stdinName[] = "-"; // Writable storage, so it can be stored in InputFile::fileName without converting a string literal
+    bool cmdLinePrintChars = false, cmdLinePrintWords = false, cmdLinePrintLines = false, seenStdin = false;
+    InputFile *inputFiles = NULL;
+    InputFile *lastInputFile = NULL;
+    _uint64 nInputFiles = 0;
+    for (int i = 1; i < argc; i++) {
+        if (argv[i][0] == '-' && argv[i][1] != '\0') {
+            //
+            // Option.  Walk to the terminator rather than re-measuring the string on every iteration.
+            //
+            for (const char *option = argv[i] + 1; *option != '\0'; option++) {
+                switch (*option) {
+                    case 'l': cmdLinePrintLines = true; break;
+                    case 'w': cmdLinePrintWords = true; break;
+                    case 'c': cmdLinePrintChars = true; break;
+                    default: usage();
+                } // switch
+            } // for each char in options
+        } else {
+            nInputFiles++;
+            if (!strcmp(argv[i], "-")) {
+                if (seenStdin) {
+                    fprintf(stderr,"Can't specify '-' (read input from stdin) more than once\n");
+                    soft_exit(1);
+                }
+                seenStdin = true;
+            }
+            InputFile *inputFile = new InputFile;
+            inputFile->fileName = argv[i];
+            if (lastInputFile == NULL) {
+                inputFiles = lastInputFile = inputFile;
+            } else {
+                lastInputFile->next = inputFile;
+                lastInputFile = inputFile;
+            }
+        }
+    } // for all args
+    if (0 == nInputFiles) {
+        //
+        // Read from stdin
+        //
+        InputFile *inputFile = new InputFile;
+        inputFile->fileName = stdinName;
+        inputFiles = lastInputFile = inputFile;
+        nInputFiles = 1;
+    }
+    //
+    // Fire up a thread for each file.  The running count is set before any thread starts, so no worker
+    // can see it reach zero early.
+    //
+    CreateSingleWaiterObject(&allThreadsDone);
+    nRunningThreads = nInputFiles;
+    InputFile *inputFile = inputFiles;
+    while (NULL != inputFile) {
+        StartNewThread(WorkerThreadMain, inputFile);
+        inputFile = inputFile->next;
+    }
+    WaitForSingleWaiterObject(&allThreadsDone);
+    _uint64 chars = 0, words = 0, lines = 0;
+    bool printChars, printWords, printLines;
+    if (cmdLinePrintChars || cmdLinePrintWords || cmdLinePrintLines) {
+        printChars = cmdLinePrintChars;
+        printWords = cmdLinePrintWords;
+        printLines = cmdLinePrintLines;
+    } else {
+        printChars = printWords = printLines = true;
+    }
+    inputFile = inputFiles;
+    while (NULL != inputFile) {
+        printOutputLine(inputFile->chars, inputFile->words, inputFile->lines, inputFile->fileName, printChars, printWords, printLines);
+        chars += inputFile->chars;
+        words += inputFile->words;
+        lines += inputFile->lines;
+        inputFile = inputFile->next;
+    }
+    if (nInputFiles > 1) {
+        printOutputLine(chars, words, lines, "Totals", printChars, printWords, printLines);
+    }
+    //
+    // All workers have finished, so the list can be released.
+    //
+    while (NULL != inputFiles) {
+        InputFile *finished = inputFiles;
+        inputFiles = inputFiles->next;
+        delete finished;
+    }
+    return 0;
+}
diff --git a/apps/wc/wc.vcxproj b/apps/wc/wc.vcxproj
new file mode 100644
index 0000000..a005639
--- /dev/null
+++ b/apps/wc/wc.vcxproj
@@ -0,0 +1,173 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{70D9DA2A-E423-4705-BC71-0198C365A730}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>wc</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ <AdditionalDependencies>snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="targetver.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="wc.cpp" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
\ No newline at end of file
diff --git a/apps/wc/wc.vcxproj.filters b/apps/wc/wc.vcxproj.filters
new file mode 100644
index 0000000..3b6cf66
--- /dev/null
+++ b/apps/wc/wc.vcxproj.filters
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="wc.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
\ No newline at end of file
diff --git a/docs/Manual.docx b/docs/Manual.docx
new file mode 100644
index 0000000..2675062
Binary files /dev/null and b/docs/Manual.docx differ
diff --git a/docs/Manual.pdf b/docs/Manual.pdf
new file mode 100644
index 0000000..d854f8a
Binary files /dev/null and b/docs/Manual.pdf differ
diff --git a/docs/QuickStart.docx b/docs/QuickStart.docx
new file mode 100644
index 0000000..d9f4fa9
Binary files /dev/null and b/docs/QuickStart.docx differ
diff --git a/docs/QuickStart.pdf b/docs/QuickStart.pdf
new file mode 100644
index 0000000..431a7bd
Binary files /dev/null and b/docs/QuickStart.pdf differ
diff --git a/import/libhdfs.lib b/import/libhdfs.lib
new file mode 100755
index 0000000..cbe8a6f
Binary files /dev/null and b/import/libhdfs.lib differ
diff --git a/import/pdclibhdfs/inc/exception.h b/import/pdclibhdfs/inc/exception.h
new file mode 100755
index 0000000..33e9807
--- /dev/null
+++ b/import/pdclibhdfs/inc/exception.h
@@ -0,0 +1,172 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ * Exception handling routines for libhdfs.
+ *
+ * The convention we follow here is to clear pending exceptions as soon as they
+ * are raised. Never assume that the caller of your function will clean up
+ * after you-- do it yourself. Unhandled exceptions can lead to memory leaks
+ * and other undefined behavior.
+ *
+ * If you encounter an exception, return a local reference to it. The caller is
+ * responsible for freeing the local reference, by calling a function like
+ * PrintExceptionAndFree. (You can also free exceptions directly by calling
+ * DeleteLocalRef. However, that would not produce an error message, so it's
+ * usually not what you want.)
+ */
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#ifndef ENOLINK
+#define ENOLINK 67
+#ifndef ENOTSUP
+# define ENOTSUP 95
+ * Exception noprint flags
+ *
+ * These flags determine which exceptions should NOT be printed to stderr by
+ * the exception printing routines. For example, if you expect to see
+ * FileNotFound, you might use NOPRINT_EXC_FILE_NOT_FOUND, to avoid filling the
+ * logs with messages about routine events.
+ *
+ * On the other hand, if you don't expect any failures, you might pass PRINT_EXC_ALL.
+ *
+ * You can OR these flags together to avoid printing multiple classes of
+ * exceptions.
+ */
+#define PRINT_EXC_ALL 0x00
+/* If we're not using GNU C, elide __attribute__ */
+#ifndef __GNUC__
+# define __attribute__(x) /*NOTHING*/
+ * Get information about an exception.
+ *
+ * @param excName The Exception name.
+ * This is a Java class name in JNI format.
+ * @param noPrintFlags Flags which determine which exceptions we should NOT
+ * print.
+ * @param excErrno (out param) The POSIX error number associated with the
+ * exception.
+ * @param shouldPrint (out param) Nonzero if we should print this exception,
+ * based on the noPrintFlags and its name.
+ */
+void getExceptionInfo ( const char *excName, int noPrintFlags,
+ int *excErrno, int *shouldPrint );
+ * Print out information about an exception and free it.
+ *
+ * @param env The JNI environment
+ * @param exc The exception to print and free
+ * @param noPrintFlags Flags which determine which exceptions we should NOT
+ * print.
+ * @param fmt Printf-style format list
+ * @param ap Printf-style varargs
+ *
+ * @return The POSIX error number associated with the exception
+ * object.
+ */
+int printExceptionAndFreeV ( JNIEnv *env,
+ jthrowable exc,
+ int noPrintFlags,
+ const char *fmt, va_list ap );
+ * Print out information about an exception and free it.
+ *
+ * @param env The JNI environment
+ * @param exc The exception to print and free
+ * @param noPrintFlags Flags which determine which exceptions we should NOT
+ * print.
+ * @param fmt Printf-style format list
+ * @param ... Printf-style varargs
+ *
+ * @return The POSIX error number associated with the exception
+ * object.
+ */
+int printExceptionAndFree ( JNIEnv *env,
+ jthrowable exc,
+ int noPrintFlags,
+ const char *fmt, ... ) __attribute__((format(printf, 4, 5)));
+ * Print out information about the pending exception and free it.
+ *
+ * @param env The JNI environment
+ * @param noPrintFlags Flags which determine which exceptions we should NOT
+ * print.
+ * @param fmt Printf-style format list
+ * @param ... Printf-style varargs
+ *
+ * @return The POSIX error number associated with the exception
+ * object.
+ */
+int printPendingExceptionAndFree ( JNIEnv *env,
+ int noPrintFlags,
+ const char *fmt, ... ) __attribute__((format(printf, 3, 4)));
+ * Get a local reference to the pending exception and clear it.
+ *
+ * Once it is cleared, the exception will no longer be pending. The caller will
+ * have to decide what to do with the exception object.
+ *
+ * @param env The JNI environment
+ *
+ * @return The exception, or NULL if there was no exception
+ */
+jthrowable getPendingExceptionAndClear ( JNIEnv *env );
+ * Create a new runtime error.
+ *
+ * This creates (but does not throw) a new RuntimeError.
+ *
+ * @param env The JNI environment
+ * @param fmt Printf-style format list
+ * @param ... Printf-style varargs
+ *
+ * @return A local reference to a RuntimeError
+ */
+jthrowable newRuntimeError ( JNIEnv *env,
+ const char *fmt, ... ) __attribute__((format(printf, 2, 3)));
diff --git a/import/pdclibhdfs/inc/expect.h b/import/pdclibhdfs/inc/expect.h
new file mode 100755
index 0000000..83bfb6b
--- /dev/null
+++ b/import/pdclibhdfs/inc/expect.h
@@ -0,0 +1,120 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdio.h>
+#define EXPECT_ZERO(x) \
+ do { \
+ int __my_ret__ = x; \
+ if (__my_ret__) { \
+ int __my_errno__ = errno; \
+ fprintf(stderr, "TEST_ERROR: failed on line %d with return " \
+ "code %d (errno: %d): got nonzero from %s\n", \
+ __LINE__, __my_ret__, __my_errno__, #x); \
+ return __my_ret__; \
+ } \
+ } while (0);
+#define EXPECT_NULL(x) \
+ do { \
+ void* __my_ret__ = x; \
+ int __my_errno__ = errno; \
+ if (__my_ret__ != NULL) { \
+ fprintf(stderr, "TEST_ERROR: failed on line %d (errno: %d): " \
+ "got non-NULL value %p from %s\n", \
+ __LINE__, __my_errno__, __my_ret__, #x); \
+ return -1; \
+ } \
+ } while (0);
+#define EXPECT_NONNULL(x) \
+ do { \
+ void* __my_ret__ = x; \
+ int __my_errno__ = errno; \
+ if (__my_ret__ == NULL) { \
+ fprintf(stderr, "TEST_ERROR: failed on line %d (errno: %d): " \
+ "got NULL from %s\n", __LINE__, __my_errno__, #x); \
+ return -1; \
+ } \
+ } while (0);
+ do { \
+ int __my_ret__ = x; \
+ int __my_errno__ = errno; \
+ if (__my_ret__ != -1) { \
+ fprintf(stderr, "TEST_ERROR: failed on line %d with return " \
+ "code %d (errno: %d): expected -1 from %s\n", __LINE__, \
+ __my_ret__, __my_errno__, #x); \
+ return -1; \
+ } \
+ if (__my_errno__ != e) { \
+ fprintf(stderr, "TEST_ERROR: failed on line %d with return " \
+ "code %d (errno: %d): expected errno = %d from %s\n", \
+ __LINE__, __my_ret__, __my_errno__, e, #x); \
+ return -1; \
+ } \
+ } while (0);
+#define EXPECT_NONZERO(x) \
+ do { \
+ int __my_ret__ = x; \
+ int __my_errno__ = errno; \
+ if (!__my_ret__) { \
+ fprintf(stderr, "TEST_ERROR: failed on line %d with return " \
+ "code %d (errno: %d): got zero from %s\n", __LINE__, \
+ __my_ret__, __my_errno__, #x); \
+ return -1; \
+ } \
+ } while (0);
+ do { \
+ int __my_ret__ = x; \
+ int __my_errno__ = errno; \
+ if (__my_ret__ < 0) { \
+ fprintf(stderr, "TEST_ERROR: failed on line %d with return " \
+ "code %d (errno: %d): got negative return from %s\n", \
+ __LINE__, __my_ret__, __my_errno__, #x); \
+ return __my_ret__; \
+ } \
+ } while (0);
+#define EXPECT_INT_EQ(x, y) \
+ do { \
+ int __my_ret__ = y; \
+ int __my_errno__ = errno; \
+ if (__my_ret__ != (x)) { \
+ fprintf(stderr, "TEST_ERROR: failed on line %d with return " \
+ "code %d (errno: %d): expected %d\n", \
+ __LINE__, __my_ret__, __my_errno__, (x)); \
+ return -1; \
+ } \
+ } while (0);
+#define RETRY_ON_EINTR_GET_ERRNO(ret, expr) do { \
+ ret = expr; \
+ if (!ret) \
+ break; \
+ ret = -errno; \
+ } while (ret == -EINTR);
diff --git a/import/pdclibhdfs/inc/hdfs.h b/import/pdclibhdfs/inc/hdfs.h
new file mode 100755
index 0000000..f887247
--- /dev/null
+++ b/import/pdclibhdfs/inc/hdfs.h
@@ -0,0 +1,768 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <fcntl.h> /* for O_RDONLY, O_WRONLY */
+#include <time.h> /* for time_t */
+#ifdef WIN32
+# ifdef WIN32_EXP
+# define WIN32EXP extern __declspec(dllexport)
+# else
+# define WIN32EXP extern __declspec(dllimport)
+# endif
+#ifndef O_ACCMODE
+#if _MSC_VER >= 1600
+#include <stdint.h>
+#if (_MSC_VER < 1300)
+ typedef signed char int8_t;
+ typedef signed short int16_t;
+ typedef signed int int32_t;
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+ typedef signed __int8 int8_t;
+ typedef signed __int16 int16_t;
+ typedef signed __int32 int32_t;
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int16 uint16_t;
+ typedef unsigned __int32 uint32_t;
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#define WIN32EXP
+#include <stdint.h>
+#ifndef WIN32
+ typedef int32_t tSize; /* size of data for read/write io ops */
+ typedef int64_t tOffset; /* offset within the file */
+ typedef uint16_t tPort; /* port */
+ typedef int tSize; /* size of data for read/write io ops */
+ typedef long long tOffset; /* offset within the file */
+ typedef unsigned short tPort; /* port */
+#ifndef PRId64
+#define PRId64 "I64d"
+#ifndef O_RDONLY
+#define O_RDONLY 1
+#ifndef O_WRONLY
+#define O_WRONLY 2
+#ifndef EINTERNAL
+#define EINTERNAL 255
+#undef WIN32EXP
+#define WIN32EXP
+/** All APIs set errno to meaningful values */
+#ifdef __cplusplus
+extern "C" {
+ // jelson
+ void staticLibInit();
+ /**
+ * Some utility decls used in libhdfs.
+ */
+ struct hdfsBuilder;
+ typedef time_t tTime; /* time type in seconds */
+ typedef enum tObjectKind {
+ kObjectKindFile = 'F',
+ kObjectKindDirectory = 'D',
+ } tObjectKind;
+ /**
+ * The C reflection of org.apache.hadoop.fs.FileSystem.
+ */
+ struct hdfs_internal;
+ typedef struct hdfs_internal* hdfsFS;
+ struct hdfsFile_internal;
+ typedef struct hdfsFile_internal* hdfsFile;
+ WIN32EXP int hdfsLibInit ( void * );
+ /**
+ * Disable the direct read optimization for a file.
+ *
+ * This is mainly provided for unit testing purposes.
+ *
+ * @param file The HDFS file
+ */
+ WIN32EXP void hdfsFileDisableDirectRead(hdfsFile file);
+ /**
+ * Determine if a file is using the "direct read" optimization.
+ *
+ * @param file The HDFS file
+ * @return 1 if the file is using the direct read optimization,
+ * 0 otherwise.
+ */
+ WIN32EXP int hdfsFileUsesDirectRead(hdfsFile file);
+ /**
+ * Determine if a file is open for read.
+ *
+ * @param file The HDFS file
+ * @return 1 if the file is open for read; 0 otherwise
+ */
+ WIN32EXP int hdfsFileIsOpenForRead(hdfsFile file);
+ /**
+ * Determine if a file is open for write.
+ *
+ * @param file The HDFS file
+ * @return 1 if the file is open for write; 0 otherwise
+ */
+ WIN32EXP int hdfsFileIsOpenForWrite(hdfsFile file);
+ struct hdfsReadStatistics {
+ uint64_t totalBytesRead;
+ uint64_t totalLocalBytesRead;
+ uint64_t totalShortCircuitBytesRead;
+ };
+ /**
+ * Get read statistics about a file. This is only applicable to files
+ * opened for reading.
+ *
+ * @param file The HDFS file
+ * @param stats (out parameter) on a successful return, the read
+ * statistics. Unchanged otherwise. You must free the
+ * returned statistics with hdfsFileFreeReadStatistics.
+ * @return 0 if the statistics were successfully returned,
+ * -1 otherwise. On a failure, please check errno against
+ * ENOTSUP. webhdfs, LocalFilesystem, and so forth may
+ * not support read statistics.
+ */
+ WIN32EXP int hdfsFileGetReadStatistics(hdfsFile file,
+ struct hdfsReadStatistics **stats);
+ /**
+ * @param stats HDFS read statistics for a file.
+ *
+ * @return the number of remote bytes read.
+ */
+ WIN32EXP int64_t hdfsReadStatisticsGetRemoteBytesRead(
+ const struct hdfsReadStatistics *stats);
+ /**
+ * Free some HDFS read statistics.
+ *
+ * @param stats The HDFS read statistics to free.
+ */
+ WIN32EXP void hdfsFileFreeReadStatistics(struct hdfsReadStatistics *stats);
+ /**
+ * hdfsConnectAsUser - Connect to a hdfs file system as a specific user
+ * Connect to the hdfs.
+ * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+ * @param port The port on which the server is listening.
+ * @param user the user name (this is the hadoop domain user). Passing NULL is equivalent to calling hdfsConnect(nn, port).
+ * @return Returns a handle to the filesystem or NULL on error.
+ * @deprecated Use hdfsBuilderConnect instead.
+ */
+ WIN32EXP hdfsFS hdfsConnectAsUser(const char* nn, tPort port, const char *user);
+ /**
+ * hdfsConnect - Connect to a hdfs file system.
+ * Connect to the hdfs.
+ * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+ * @param port The port on which the server is listening.
+ * @return Returns a handle to the filesystem or NULL on error.
+ * @deprecated Use hdfsBuilderConnect instead.
+ */
+ WIN32EXP hdfsFS hdfsConnect(const char* nn, tPort port);
+ /**
+ * hdfsConnectAsUserNewInstance - Connect to an hdfs file system as a specific user.
+ *
+ * Forces a new instance to be created
+ *
+ * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+ * @param port The port on which the server is listening.
+ * @param user The user name to use when connecting
+ * @return Returns a handle to the filesystem or NULL on error.
+ * @deprecated Use hdfsBuilderConnect instead.
+ */
+ WIN32EXP hdfsFS hdfsConnectAsUserNewInstance ( const char* nn, tPort port,
+ const char *user );
+ /**
+ * hdfsConnectNewInstance - Connect to an hdfs file system.
+ *
+ * Forces a new instance to be created
+ *
+ * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+ * @param port The port on which the server is listening.
+ * @return Returns a handle to the filesystem or NULL on error.
+ * @deprecated Use hdfsBuilderConnect instead.
+ */
+ WIN32EXP hdfsFS hdfsConnectNewInstance(const char* nn, tPort port);
+ /**
+ * Connect to HDFS using the parameters defined by the builder.
+ *
+ * The HDFS builder will be freed, whether or not the connection was
+ * successful.
+ *
+ * Every successful call to hdfsBuilderConnect should be matched with a call
+ * to hdfsDisconnect, when the hdfsFS is no longer needed.
+ *
+ * @param bld The HDFS builder
+ * @return Returns a handle to the filesystem, or NULL on error.
+ */
+ WIN32EXP hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld);
+ /**
+ * Create an HDFS builder.
+ *
+ * @return The HDFS builder, or NULL on error.
+ */
+ WIN32EXP struct hdfsBuilder *hdfsNewBuilder(void);
+ /**
+ * Force the builder to always create a new instance of the FileSystem,
+ * rather than possibly finding one in the cache.
+ *
+ * @param bld The HDFS builder
+ */
+ WIN32EXP void hdfsBuilderSetForceNewInstance(struct hdfsBuilder *bld);
+ /**
+ * Set the HDFS NameNode to connect to.
+ *
+ * @param bld The HDFS builder
+ * @param nn The NameNode to use.
+ *
+ * If the string given is 'default', the default NameNode
+ * configuration will be used (from the XML configuration files)
+ *
+ * If NULL is given, a LocalFileSystem will be created.
+ *
+ * If the string starts with a protocol type such as file:// or
+ * hdfs://, this protocol type will be used. If not, the
+ * hdfs:// protocol type will be used.
+ *
+ * You may specify a NameNode port in the usual way by
+ * passing a string of the format hdfs://<hostname>:<port>.
+ * Alternately, you may set the port with
+ * hdfsBuilderSetNameNodePort. However, you must not pass the
+ * port in two different ways.
+ */
+ WIN32EXP void hdfsBuilderSetNameNode(struct hdfsBuilder *bld, const char *nn);
+ /**
+ * Set the port of the HDFS NameNode to connect to.
+ *
+ * @param bld The HDFS builder
+ * @param port The port.
+ */
+ WIN32EXP void hdfsBuilderSetNameNodePort(struct hdfsBuilder *bld, tPort port);
+ /**
+ * Set the username to use when connecting to the HDFS cluster.
+ *
+ * @param bld The HDFS builder
+ * @param userName The user name. The string will be shallow-copied.
+ */
+ WIN32EXP void hdfsBuilderSetUserName(struct hdfsBuilder *bld, const char *userName);
+ /**
+ * Set the path to the Kerberos ticket cache to use when connecting to
+ * the HDFS cluster.
+ *
+ * @param bld The HDFS builder
+ * @param kerbTicketCachePath The Kerberos ticket cache path. The string
+ * will be shallow-copied.
+ */
+ WIN32EXP void hdfsBuilderSetKerbTicketCachePath(struct hdfsBuilder *bld,
+ const char *kerbTicketCachePath);
+ /**
+ * Free an HDFS builder.
+ *
+ * It is normally not necessary to call this function since
+ * hdfsBuilderConnect frees the builder.
+ *
+ * @param bld The HDFS builder
+ */
+ WIN32EXP void hdfsFreeBuilder(struct hdfsBuilder *bld);
+ /**
+ * Set a configuration string for an HdfsBuilder.
+ *
+ * @param key The key to set.
+ * @param val The value, or NULL to set no value.
+ * This will be shallow-copied. You are responsible for
+ * ensuring that it remains valid until the builder is
+ * freed.
+ *
+ * @return 0 on success; nonzero error code otherwise.
+ */
+ WIN32EXP int hdfsBuilderConfSetStr(struct hdfsBuilder *bld, const char *key,
+ const char *val);
+ /**
+ * Get a configuration string.
+ *
+ * @param key The key to find
+ * @param val (out param) The value. This will be set to NULL if the
+ * key isn't found. You must free this string with
+ * hdfsConfStrFree.
+ *
+ * @return 0 on success; nonzero error code otherwise.
+ * Failure to find the key is not an error.
+ */
+ WIN32EXP int hdfsConfGetStr(const char *key, char **val);
+ /**
+ * Get a configuration integer.
+ *
+ * @param key The key to find
+ * @param val (out param) The value. This will NOT be changed if the
+ * key isn't found.
+ *
+ * @return 0 on success; nonzero error code otherwise.
+ * Failure to find the key is not an error.
+ */
+ WIN32EXP int hdfsConfGetInt(const char *key, int32_t *val);
+ /**
+ * Free a configuration string found with hdfsConfGetStr.
+ *
+ * @param val A configuration string obtained from hdfsConfGetStr
+ */
+ WIN32EXP void hdfsConfStrFree(char *val);
+ /**
+ * hdfsDisconnect - Disconnect from the hdfs file system.
+ * Disconnect from hdfs.
+ * @param fs The configured filesystem handle.
+ * @return Returns 0 on success, -1 on error.
+ * Even if there is an error, the resources associated with the
+ * hdfsFS will be freed.
+ */
+ WIN32EXP int hdfsDisconnect(hdfsFS fs);
+ /**
+ * hdfsOpenFile - Open a hdfs file in given mode.
+ * @param fs The configured filesystem handle.
+ * @param path The full path to the file.
+ * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNC),
+ * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno to ENOTSUP.
+ * @param bufferSize Size of buffer for read/write - pass 0 if you want
+ * to use the default configured values.
+ * @param replication Block replication - pass 0 if you want to use
+ * the default configured values.
+ * @param blocksize Size of block - pass 0 if you want to use the
+ * default configured values.
+ * @return Returns the handle to the open file or NULL on error.
+ */
+ WIN32EXP hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
+ int bufferSize, short replication, tSize blocksize);
+ /**
+ * hdfsCloseFile - Close an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Returns 0 on success, -1 on error.
+ * On error, errno will be set appropriately.
+ * If the hdfs file was valid, the memory associated with it will
+ * be freed at the end of this call, even if there was an I/O
+ * error.
+ */
+ WIN32EXP int hdfsCloseFile(hdfsFS fs, hdfsFile file);
+ /**
+ * hdfsExists - Checks if a given path exists on the filesystem
+ * @param fs The configured filesystem handle.
+ * @param path The path to look for
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsExists(hdfsFS fs, const char *path);
+ /**
+ * hdfsSeek - Seek to given offset in file.
+ * This works only for files opened in read-only mode.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param desiredPos Offset into the file to seek into.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos);
+ /**
+ * hdfsTell - Get the current offset in the file, in bytes.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Current offset, -1 on error.
+ */
+ WIN32EXP tOffset hdfsTell(hdfsFS fs, hdfsFile file);
+ /**
+ * hdfsRead - Read data from an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param buffer The buffer to copy read bytes into.
+ * @param length The length of the buffer.
+ * @return On success, a positive number indicating how many bytes
+ * were read.
+ * On end-of-file, 0.
+ * On error, -1. Errno will be set to the error code.
+ * Just like the POSIX read function, hdfsRead will return -1
+ * and set errno to EINTR if data is temporarily unavailable,
+ * but we are not yet at the end of the file.
+ */
+ WIN32EXP tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
+ /**
+ * hdfsPread - Positional read of data from an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param position Position from which to read
+ * @param buffer The buffer to copy read bytes into.
+ * @param length The length of the buffer.
+ * @return See hdfsRead
+ */
+ WIN32EXP tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
+ void* buffer, tSize length);
+ /**
+ * hdfsWrite - Write data into an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param buffer The data.
+ * @param length The no. of bytes to write.
+ * @return Returns the number of bytes written, -1 on error.
+ */
+ WIN32EXP tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
+ tSize length);
+ /**
+ * hdfsFlush - Flush the data.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsFlush(hdfsFS fs, hdfsFile file);
+ /**
+ * hdfsHFlush - Flush out the data in client's user buffer. After the
+ * return of this call, new readers will see the data.
+ * @param fs configured filesystem handle
+ * @param file file handle
+ * @return 0 on success, -1 on error and sets errno
+ */
+ WIN32EXP int hdfsHFlush(hdfsFS fs, hdfsFile file);
+ /**
+ * hdfsHSync - Similar to posix fsync: flush out the data in client's
+ * user buffer all the way to the disk device (but the disk may have
+ * it in its cache).
+ * @param fs configured filesystem handle
+ * @param file file handle
+ * @return 0 on success, -1 on error and sets errno
+ */
+ WIN32EXP int hdfsHSync(hdfsFS fs, hdfsFile file);
+ /**
+ * hdfsAvailable - Number of bytes that can be read from this
+ * input stream without blocking.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Returns available bytes; -1 on error.
+ */
+ WIN32EXP int hdfsAvailable(hdfsFS fs, hdfsFile file);
+ /**
+ * hdfsCopy - Copy file from one filesystem to another.
+ * @param srcFS The handle to source filesystem.
+ * @param src The path of source file.
+ * @param dstFS The handle to destination filesystem.
+ * @param dst The path of destination file.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+ /**
+ * hdfsMove - Move file from one filesystem to another.
+ * @param srcFS The handle to source filesystem.
+ * @param src The path of source file.
+ * @param dstFS The handle to destination filesystem.
+ * @param dst The path of destination file.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+ /**
+ * hdfsDelete - Delete file.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @param recursive If path is a directory and recursive is set to
+ * non-zero, the directory is deleted; otherwise an exception is thrown.
+ * In case of a file the recursive argument is irrelevant.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsDelete(hdfsFS fs, const char* path, int recursive);
+ /**
+ * hdfsRename - Rename file.
+ * @param fs The configured filesystem handle.
+ * @param oldPath The path of the source file.
+ * @param newPath The path of the destination file.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
+ /**
+ * hdfsGetWorkingDirectory - Get the current working directory for
+ * the given filesystem.
+ * @param fs The configured filesystem handle.
+ * @param buffer The user-buffer to copy path of cwd into.
+ * @param bufferSize The length of user-buffer.
+ * @return Returns buffer, NULL on error.
+ */
+ WIN32EXP char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
+ /**
+ * hdfsSetWorkingDirectory - Set the working directory. All relative
+ * paths will be resolved relative to it.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the new 'cwd'.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsSetWorkingDirectory(hdfsFS fs, const char* path);
+ /**
+ * hdfsCreateDirectory - Make the given file and all non-existent
+ * parents into directories.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the directory.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsCreateDirectory(hdfsFS fs, const char* path);
+ /**
+ * hdfsSetReplication - Set the replication of the specified
+ * file to the supplied value
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @param replication The replication value to set for the file.
+ * @return Returns 0 on success, -1 on error.
+ */
+ WIN32EXP int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication);
+ /**
+ * hdfsFileInfo - Information about a file/directory.
+ */
+ typedef struct {
+ tObjectKind mKind; /* file or directory */
+ char *mName; /* the name of the file */
+ tTime mLastMod; /* the last modification time for the file in seconds */
+ tOffset mSize; /* the size of the file in bytes */
+ short mReplication; /* the count of replicas */
+ tOffset mBlockSize; /* the block size for the file */
+ char *mOwner; /* the owner of the file */
+ char *mGroup; /* the group associated with the file */
+ short mPermissions; /* the permissions associated with the file */
+ tTime mLastAccess; /* the last access time for the file in seconds */
+ } hdfsFileInfo;
+ /**
+ * hdfsListDirectory - Get list of files/directories for a given
+ * directory-path. hdfsFreeFileInfo should be called to deallocate memory.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the directory.
+ * @param numEntries Set to the number of files/directories in path.
+ * @return Returns a dynamically-allocated array of hdfsFileInfo
+ * objects; NULL on error.
+ */
+ WIN32EXP hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path,
+ int *numEntries);
+ /**
+ * hdfsGetPathInfo - Get information about a path as a (dynamically
+ * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
+ * called when the pointer is no longer needed.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @return Returns a dynamically-allocated hdfsFileInfo object;
+ * NULL on error.
+ */
+ WIN32EXP hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
+ /**
+ * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields)
+ * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+ * objects.
+ * @param numEntries The size of the array.
+ */
+ WIN32EXP void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
+ /**
+ * hdfsGetHosts - Get hostnames where a particular block (determined by
+ * start & length) of a file is stored. The last element in the array
+ * is NULL. Due to replication, a single block could be present on
+ * multiple hosts.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @param start The start of the block.
+ * @param length The length of the block.
+ * @return Returns a dynamically-allocated 2-d array of blocks-hosts;
+ * NULL on error.
+ */
+ WIN32EXP char*** hdfsGetHosts(hdfsFS fs, const char* path,
+ tOffset start, tOffset length);
+ /**
+ * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
+ * @param blockHosts The dynamically-allocated 2-d array of blocks-hosts
+ * returned by hdfsGetHosts.
+ */
+ WIN32EXP void hdfsFreeHosts(char ***blockHosts);
+ /**
+ * hdfsGetDefaultBlockSize - Get the default blocksize.
+ *
+ * @param fs The configured filesystem handle.
+ * @deprecated Use hdfsGetDefaultBlockSizeAtPath instead.
+ *
+ * @return Returns the default blocksize, or -1 on error.
+ */
+ WIN32EXP tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
+ /**
+ * hdfsGetDefaultBlockSizeAtPath - Get the default blocksize at the
+ * filesystem indicated by a given path.
+ *
+ * @param fs The configured filesystem handle.
+ * @param path The given path will be used to locate the actual
+ * filesystem. The full path does not have to exist.
+ *
+ * @return Returns the default blocksize, or -1 on error.
+ */
+ WIN32EXP tOffset hdfsGetDefaultBlockSizeAtPath(hdfsFS fs, const char *path);
+ /**
+ * hdfsGetCapacity - Return the raw capacity of the filesystem.
+ * @param fs The configured filesystem handle.
+ * @return Returns the raw-capacity; -1 on error.
+ */
+ WIN32EXP tOffset hdfsGetCapacity(hdfsFS fs);
+ /**
+ * hdfsGetUsed - Return the total raw size of all files in the filesystem.
+ * @param fs The configured filesystem handle.
+ * @return Returns the total-size; -1 on error.
+ */
+ WIN32EXP tOffset hdfsGetUsed(hdfsFS fs);
+ /**
+ * Change the user and/or group of a file or directory.
+ *
+ * @param fs The configured filesystem handle.
+ * @param path the path to the file or directory
+ * @param owner User string. Set to NULL for 'no change'
+ * @param group Group string. Set to NULL for 'no change'
+ * @return 0 on success else -1
+ */
+ WIN32EXP int hdfsChown(hdfsFS fs, const char* path, const char *owner,
+ const char *group);
+ /**
+ * hdfsChmod
+ * @param fs The configured filesystem handle.
+ * @param path the path to the file or directory
+ * @param mode the bitmask to set it to
+ * @return 0 on success else -1
+ */
+ WIN32EXP int hdfsChmod(hdfsFS fs, const char* path, short mode);
+ /**
+ * hdfsUtime
+ * @param fs The configured filesystem handle.
+ * @param path the path to the file or directory
+ * @param mtime new modification time or -1 for no change
+ * @param atime new access time or -1 for no change
+ * @return 0 on success else -1
+ */
+ WIN32EXP int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
+#ifdef __cplusplus
+#endif /*LIBHDFS_HDFS_H*/
+ * vim: ts=4: sw=4: et
+ */
diff --git a/import/pdclibhdfs/inc/hdfs_test.h b/import/pdclibhdfs/inc/hdfs_test.h
new file mode 100755
index 0000000..95b25b7
--- /dev/null
+++ b/import/pdclibhdfs/inc/hdfs_test.h
@@ -0,0 +1,39 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+struct hdfsFile_internal;
+ * Some functions that are visible only for testing.
+ *
+ * This header is not meant to be exported or used outside of the libhdfs unit
+ * tests.
+ */
+#ifdef __cplusplus
+extern "C" {
+#ifdef __cplusplus
diff --git a/import/pdclibhdfs/inc/jni_helper.h b/import/pdclibhdfs/inc/jni_helper.h
new file mode 100755
index 0000000..f42779f
--- /dev/null
+++ b/import/pdclibhdfs/inc/jni_helper.h
@@ -0,0 +1,128 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#ifdef WIN32
+#include <windows.h>
+#ifndef WIN32
+#include <search.h>
+#include <pthread.h>
+#include <errno.h>
+#define PATH_SEPARATOR ':'
+/** Denote the method we want to invoke as STATIC or INSTANCE */
+typedef enum {
+} MethType;
+ * Create a new malloc'ed C string from a Java string.
+ *
+ * @param env The JNI environment
+ * @param jstr The Java string
+ * @param out (out param) the malloc'ed C string
+ *
+ * @return NULL on success; the exception otherwise
+ */
+jthrowable newCStr(JNIEnv *env, jstring jstr, char **out);
+ * Create a new Java string from a C string.
+ *
+ * @param env The JNI environment
+ * @param str The C string
+ * @param out (out param) the java string
+ *
+ * @return NULL on success; the exception otherwise
+ */
+jthrowable newJavaStr(JNIEnv *env, const char *str, jstring *out);
+ * Helper function to destroy a local reference of java.lang.Object
+ * @param env: The JNIEnv pointer.
+ * @param jObject: The local reference of java.lang.Object object
+ * @return None.
+ */
+void destroyLocalReference(JNIEnv *env, jobject jObject);
+/** invokeMethod: Invoke a Static or Instance method.
+ * className: Name of the class where the method can be found
+ * methName: Name of the method
+ * methSignature: the signature of the method "(arg-types)ret-type"
+ * methType: The type of the method (STATIC or INSTANCE)
+ * instObj: Required if the methType is INSTANCE. The object to invoke
+ the method on.
+ * env: The JNIEnv pointer
+ * retval: The pointer to a union type which will contain the result of the
+ method invocation, e.g. if the method returns an Object, retval will be
+ set to that, if the method returns boolean, retval will be set to the
+ value (JNI_TRUE or JNI_FALSE), etc.
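+ * exc: If the method throws any exception, this will contain the reference
+ * Arguments (the method arguments) must be passed after methSignature
+ * RETURNS: -1 on error and 0 on success. If -1 is returned, exc will have
+ a valid exception reference, and the result stored at retval is undefined.
+ * NOTE(review): the declaration below has no exc parameter and returns
+ * jthrowable, so the exc/RETURNS description above looks stale - confirm
+ * against the implementation.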
+ */
+jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType,
+ jobject instObj, const char *className, const char *methName,
+ const char *methSignature, ...);
+jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, const char *className,
+ const char *ctorSignature, ...);
+jthrowable methodIdFromClass(const char *className, const char *methName,
+ const char *methSignature, MethType methType,
+ JNIEnv *env, jmethodID *out);
+jthrowable globalClassReference(const char *className, JNIEnv *env, jclass *out);
+/** classNameOfObject: Get an object's class name.
+ * @param jobj: The object.
+ * @param env: The JNIEnv pointer.
+ * @param name: (out param) On success, will contain a string containing the
+ * class name. This string must be freed by the caller.
+ * @return NULL on success, or the exception
+ */
+jthrowable classNameOfObject(jobject jobj, JNIEnv *env, char **name);
+/** getJNIEnv: A helper function to get the JNIEnv* for the given thread.
+ * If no JVM exists, then one will be created. JVM command line arguments
+ * are obtained from the LIBHDFS_OPTS environment variable.
+ * @param: None.
+ * @return The JNIEnv* corresponding to the thread.
+ * */
+JNIEnv* getJNIEnv(void);
+ * vim: ts=4: sw=4: et:
+ */
diff --git a/import/pdclibhdfs/inc/native_mini_dfs.h b/import/pdclibhdfs/inc/native_mini_dfs.h
new file mode 100755
index 0000000..0508fa9
--- /dev/null
+++ b/import/pdclibhdfs/inc/native_mini_dfs.h
@@ -0,0 +1,104 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <jni.h> /* for jboolean */
+struct NativeMiniDfsCluster;
+ * Represents a configuration to use for creating a Native MiniDFSCluster
+ */
+struct NativeMiniDfsConf {
+ /**
+ * Nonzero if the cluster should be formatted prior to startup
+ */
+ jboolean doFormat;
+ /**
+ * Whether or not to enable webhdfs in MiniDfsCluster
+ */
+ jboolean webhdfsEnabled;
+ /**
+ * The http port of the namenode in MiniDfsCluster
+ */
+ jint namenodeHttpPort;
+ * Create a NativeMiniDfsBuilder
+ *
+ * @param conf (inout) The cluster configuration
+ *
+ * @return a NativeMiniDfsBuilder, or a NULL pointer on error.
+ */
+struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf);
+ * Wait until a MiniDFSCluster comes out of safe mode.
+ *
+ * @param cl The cluster
+ *
+ * @return 0 on success; a non-zero error code if the cluster fails to
+ * come out of safe mode.
+ */
+int nmdWaitClusterUp(struct NativeMiniDfsCluster *cl);
+ * Shut down a NativeMiniDFS cluster
+ *
+ * @param cl The cluster
+ *
+ * @return 0 on success; a non-zero error code if an exception is
+ * thrown.
+ */
+int nmdShutdown(struct NativeMiniDfsCluster *cl);
+ * Destroy a Native MiniDFSCluster
+ *
+ * @param cl The cluster to destroy
+ */
+void nmdFree(struct NativeMiniDfsCluster* cl);
+ * Get the port that's in use by the given (non-HA) nativeMiniDfs
+ *
+ * @param cl The initialized NativeMiniDfsCluster
+ *
+ * @return the port, or a negative error code
+ */
+int nmdGetNameNodePort(const struct NativeMiniDfsCluster *cl);
+ * Get the http address that's in use by the given (non-HA) nativeMiniDfs
+ *
+ * @param cl The initialized NativeMiniDfsCluster
+ * @param port Used to capture the http port of the NameNode
+ * of the NativeMiniDfsCluster
+ * @param hostName Used to capture the http hostname of the NameNode
+ * of the NativeMiniDfsCluster
+ *
+ * @return 0 on success; a non-zero error code if failing to
+ * get the information.
+ */
+int nmdGetNameNodeHttpAddress(const struct NativeMiniDfsCluster *cl,
+ int *port, const char **hostName);
diff --git a/import/pdclibhdfs/inc/stdint.h-xx b/import/pdclibhdfs/inc/stdint.h-xx
new file mode 100755
index 0000000..f8c7d47
--- /dev/null
+++ b/import/pdclibhdfs/inc/stdint.h-xx
@@ -0,0 +1,259 @@
+// ISO C9x compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+// Copyright (c) 2006-2013 Alexander Chemeris
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 3. Neither the name of the product nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+#if _MSC_VER > 1000
+#pragma once
+#if _MSC_VER >= 1600 // [
+#include <stdint.h>
+#else // ] _MSC_VER >= 1600 [
+#include <limits.h>
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
+// or the compiler gives many errors like this:
+// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+# include <wchar.h>
+#ifdef __cplusplus
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+# define _W64 __w64
+# else
+# define _W64
+# endif
+// 7.18.1 Integer types
+// Exact-width integer types
+// Visual Studio 6 and Embedded Visual C++ 4 don't
+// realize that, e.g. char has the same size as __int8
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+ typedef signed char int8_t;
+ typedef signed short int16_t;
+ typedef signed int int32_t;
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+ typedef signed __int8 int8_t;
+ typedef signed __int16 int16_t;
+ typedef signed __int32 int32_t;
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int16 uint16_t;
+ typedef unsigned __int32 uint32_t;
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+// Minimum-width integer types
+typedef int8_t int_least8_t;
+typedef int16_t int_least16_t;
+typedef int32_t int_least32_t;
+typedef int64_t int_least64_t;
+typedef uint8_t uint_least8_t;
+typedef uint16_t uint_least16_t;
+typedef uint32_t uint_least32_t;
+typedef uint64_t uint_least64_t;
+// Fastest minimum-width integer types
+typedef int8_t int_fast8_t;
+typedef int16_t int_fast16_t;
+typedef int32_t int_fast32_t;
+typedef int64_t int_fast64_t;
+typedef uint8_t uint_fast8_t;
+typedef uint16_t uint_fast16_t;
+typedef uint32_t uint_fast32_t;
+typedef uint64_t uint_fast64_t;
+// Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+ typedef signed __int64 intptr_t;
+ typedef unsigned __int64 uintptr_t;
+#else // _WIN64 ][
+ typedef _W64 signed int intptr_t;
+ typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+// Greatest-width integer types
+typedef int64_t intmax_t;
+typedef uint64_t uintmax_t;
+// 7.18.2 Limits of specified-width integer types
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
+// Limits of exact-width integer types
+#define INT8_MIN ((int8_t)_I8_MIN)
+#define INT8_MAX _I8_MAX
+#define INT16_MIN ((int16_t)_I16_MIN)
+#define INT16_MAX _I16_MAX
+#define INT32_MIN ((int32_t)_I32_MIN)
+#define INT32_MAX _I32_MAX
+#define INT64_MIN ((int64_t)_I64_MIN)
+#define INT64_MAX _I64_MAX
+#define UINT8_MAX _UI8_MAX
+#define UINT16_MAX _UI16_MAX
+#define UINT32_MAX _UI32_MAX
+#define UINT64_MAX _UI64_MAX
+// Limits of minimum-width integer types
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MIN INT64_MIN
+#define INT_LEAST64_MAX INT64_MAX
+// Limits of fastest minimum-width integer types
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MIN INT64_MIN
+#define INT_FAST64_MAX INT64_MAX
+// Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+# define INTPTR_MIN INT64_MIN
+# define INTPTR_MAX INT64_MAX
+#else // _WIN64 ][
+# define INTPTR_MIN INT32_MIN
+# define INTPTR_MAX INT32_MAX
+#endif // _WIN64 ]
+// Limits of greatest-width integer types
+// 7.18.3 Limits of other integer types
+#ifdef _WIN64 // [
+# define PTRDIFF_MIN _I64_MIN
+# define PTRDIFF_MAX _I64_MAX
+#else // _WIN64 ][
+# define PTRDIFF_MIN _I32_MIN
+# define PTRDIFF_MAX _I32_MAX
+#endif // _WIN64 ]
+#ifndef SIZE_MAX // [
+# ifdef _WIN64 // [
+# define SIZE_MAX _UI64_MAX
+# else // _WIN64 ][
+# define SIZE_MAX _UI32_MAX
+# endif // _WIN64 ]
+#endif // SIZE_MAX ]
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+# define WCHAR_MIN 0
+#endif // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+# define WCHAR_MAX _UI16_MAX
+#endif // WCHAR_MAX ]
+#define WINT_MIN 0
+#define WINT_MAX _UI16_MAX
+#endif // __STDC_LIMIT_MACROS ]
+// 7.18.4 Macros for integer constants
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
+// Macros for minimum-width integer constants
+#define INT8_C(val) val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+#define UINT8_C(val) val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+// Macros for greatest-width integer constants
+// These #ifndef's are needed to prevent collisions with <boost/cstdint.hpp>.
+// Check out Issue 9 for the details.
+#ifndef INTMAX_C // [
+# define INTMAX_C INT64_C
+#endif // INTMAX_C ]
+#ifndef UINTMAX_C // [
+# define UINTMAX_C UINT64_C
+#endif // UINTMAX_C ]
+#endif // _MSC_VER >= 1600 ]
+#endif // _MSC_STDINT_H_ ]
diff --git a/import/pdclibhdfs/inc/uthash.h b/import/pdclibhdfs/inc/uthash.h
new file mode 100755
index 0000000..72acf11
--- /dev/null
+++ b/import/pdclibhdfs/inc/uthash.h
@@ -0,0 +1,948 @@
+Copyright (c) 2003-2013, Troy D. Hanson http://troydhanson.github.com/uthash/
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+#ifndef UTHASH_H
+#define UTHASH_H
+#include <string.h> /* memcmp,strlen */
+#include <stddef.h> /* ptrdiff_t */
+#include <stdlib.h> /* exit() */
+/* These macros use decltype or the earlier __typeof GNU extension.
+ As decltype is only available in newer compilers (VS2010 or gcc 4.3+
+ when compiling c++ source) this code uses whatever method is needed
+ or, for VS2008 where neither is available, uses casting workarounds. */
+#ifdef _MSC_VER /* MS compiler */
+#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */
+#define DECLTYPE(x) (decltype(x))
+#else /* VS2008 or older (or VS2010 in C mode) */
+#define NO_DECLTYPE
+#define DECLTYPE(x)
+#else /* GNU, Sun and other compilers */
+#define DECLTYPE(x) (__typeof(x))
+#define DECLTYPE_ASSIGN(dst,src) \
+do { \
+ char **_da_dst = (char**)(&(dst)); \
+ *_da_dst = (char*)(src); \
+} while(0)
+#define DECLTYPE_ASSIGN(dst,src) \
+do { \
+ (dst) = DECLTYPE(dst)(src); \
+} while(0)
+/* a number of the hash functions use uint32_t, which isn't defined on win32 */
+#ifdef _MSC_VER
+typedef unsigned int uint32_t;
+typedef unsigned char uint8_t;
+#include <inttypes.h> /* uint32_t */
+#define UTHASH_VERSION 1.9.8
+#ifndef uthash_fatal
+#define uthash_fatal(msg) exit(-1) /* fatal error (out of memory,etc) */
+#ifndef uthash_malloc
+#define uthash_malloc(sz) malloc(sz) /* malloc fcn */
+#ifndef uthash_free
+#define uthash_free(ptr,sz) free(ptr) /* free fcn */
+#ifndef uthash_noexpand_fyi
+#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */
+#ifndef uthash_expand_fyi
+#define uthash_expand_fyi(tbl) /* can be defined to log expands */
+/* initial number of buckets */
+#define HASH_INITIAL_NUM_BUCKETS 32 /* initial number of buckets */
+#define HASH_INITIAL_NUM_BUCKETS_LOG2 5 /* lg2 of initial number of buckets */
+#define HASH_BKT_CAPACITY_THRESH 10 /* expand when bucket count reaches */
+/* calculate the element whose hash handle address is hhp:
+ * step back from the handle by tbl->hho, the byte offset of the hash handle
+ * within the enclosing element (recorded by HASH_MAKE_TABLE) */
+#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho)))
+#define HASH_FIND(hh,head,keyptr,keylen,out) \
+do { \
+ unsigned _hf_bkt,_hf_hashv; \
+ out=NULL; \
+ if (head) { \
+ HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt); \
+ if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) { \
+ HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], \
+ keyptr,keylen,out); \
+ } \
+ } \
+} while (0)
+#ifdef HASH_BLOOM
+#define HASH_BLOOM_MAKE(tbl) \
+do { \
+ (tbl)->bloom_nbits = HASH_BLOOM; \
+ (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \
+ if (!((tbl)->bloom_bv)) { uthash_fatal( "out of memory"); } \
+ memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN); \
+ (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \
+} while (0)
+#define HASH_BLOOM_FREE(tbl) \
+do { \
+ uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \
+} while (0)
+#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
+#define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
+#define HASH_BLOOM_ADD(tbl,hashv) \
+ HASH_BLOOM_BITSET((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
+#define HASH_BLOOM_TEST(tbl,hashv) \
+ HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
+#define HASH_BLOOM_MAKE(tbl)
+#define HASH_BLOOM_FREE(tbl)
+#define HASH_BLOOM_ADD(tbl,hashv)
+#define HASH_BLOOM_TEST(tbl,hashv) (1)
+#define HASH_MAKE_TABLE(hh,head) \
+do { \
+ (head)->hh.tbl = (UT_hash_table*)uthash_malloc( \
+ sizeof(UT_hash_table)); \
+ if (!((head)->hh.tbl)) { uthash_fatal( "out of memory"); } \
+ memset((head)->hh.tbl, 0, sizeof(UT_hash_table)); \
+ (head)->hh.tbl->tail = &((head)->hh); \
+ (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \
+ (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \
+ (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \
+ (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \
+ HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \
+ if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); } \
+ memset((head)->hh.tbl->buckets, 0, \
+ HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \
+ HASH_BLOOM_MAKE((head)->hh.tbl); \
+ (head)->hh.tbl->signature = HASH_SIGNATURE; \
+} while(0)
+#define HASH_ADD(hh,head,fieldname,keylen_in,add) \
+ HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add)
+#define HASH_REPLACE(hh,head,fieldname,keylen_in,add,replaced) \
+do { \
+ replaced=NULL; \
+ HASH_FIND(hh,head,&((add)->fieldname),keylen_in,replaced); \
+ if (replaced!=NULL) { \
+ HASH_DELETE(hh,head,replaced); \
+ }; \
+ HASH_ADD(hh,head,fieldname,keylen_in,add); \
+} while(0)
+#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \
+do { \
+ unsigned _ha_bkt; \
+ (add)->hh.next = NULL; \
+ (add)->hh.key = (char*)(keyptr); \
+ (add)->hh.keylen = (unsigned)(keylen_in); \
+ if (!(head)) { \
+ head = (add); \
+ (head)->hh.prev = NULL; \
+ HASH_MAKE_TABLE(hh,head); \
+ } else { \
+ (head)->hh.tbl->tail->next = (add); \
+ (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \
+ (head)->hh.tbl->tail = &((add)->hh); \
+ } \
+ (head)->hh.tbl->num_items++; \
+ (add)->hh.tbl = (head)->hh.tbl; \
+ HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets, \
+ (add)->hh.hashv, _ha_bkt); \
+ HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh); \
+ HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv); \
+ HASH_EMIT_KEY(hh,head,keyptr,keylen_in); \
+ HASH_FSCK(hh,head); \
+} while(0)
+#define HASH_TO_BKT( hashv, num_bkts, bkt ) /* bkt = low bits of hashv; the mask relies on num_bkts being a power of two */ \
+do { \
+ bkt = ((hashv) & ((num_bkts) - 1)); \
+} while(0)
+/* delete "delptr" from the hash table.
+ * "the usual" patch-up process for the app-order doubly-linked-list.
+ * The use of _hd_hh_del below deserves special explanation.
+ * These used to be expressed using (delptr) but that led to a bug
+ * if someone used the same symbol for the head and deletee, like
+ * HASH_DELETE(hh,users,users);
+ * We want that to work, but by changing the head (users) below
+ * we were forfeiting our ability to further refer to the deletee (users)
+ * in the patch-up process. Solution: use scratch space to
+ * copy the deletee pointer, then the latter references are via that
+ * scratch pointer rather than through the repointed (users) symbol.
+ */
+#define HASH_DELETE(hh,head,delptr) \
+do { \
+ unsigned _hd_bkt; \
+ struct UT_hash_handle *_hd_hh_del; \
+ if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) ) { \
+ uthash_free((head)->hh.tbl->buckets, \
+ (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
+ HASH_BLOOM_FREE((head)->hh.tbl); \
+ uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \
+ head = NULL; \
+ } else { \
+ _hd_hh_del = &((delptr)->hh); \
+ if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) { \
+ (head)->hh.tbl->tail = \
+ (UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \
+ (head)->hh.tbl->hho); \
+ } \
+ if ((delptr)->hh.prev) { \
+ ((UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \
+ (head)->hh.tbl->hho))->next = (delptr)->hh.next; \
+ } else { \
+ DECLTYPE_ASSIGN(head,(delptr)->hh.next); \
+ } \
+ if (_hd_hh_del->next) { \
+ ((UT_hash_handle*)((ptrdiff_t)_hd_hh_del->next + \
+ (head)->hh.tbl->hho))->prev = \
+ _hd_hh_del->prev; \
+ } \
+ HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \
+ HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \
+ (head)->hh.tbl->num_items--; \
+ } \
+ HASH_FSCK(hh,head); \
+} while (0)
+/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */
+#define HASH_FIND_STR(head,findstr,out) \
+ HASH_FIND(hh,head,findstr,strlen(findstr),out)
+#define HASH_ADD_STR(head,strfield,add) \
+ HASH_ADD(hh,head,strfield,strlen(add->strfield),add)
+#define HASH_REPLACE_STR(head,strfield,add,replaced) \
+ HASH_REPLACE(hh,head,strfield,strlen(add->strfield),add,replaced)
+#define HASH_FIND_INT(head,findint,out) \
+ HASH_FIND(hh,head,findint,sizeof(int),out)
+#define HASH_ADD_INT(head,intfield,add) \
+ HASH_ADD(hh,head,intfield,sizeof(int),add)
+#define HASH_REPLACE_INT(head,intfield,add,replaced) \
+ HASH_REPLACE(hh,head,intfield,sizeof(int),add,replaced)
+#define HASH_FIND_PTR(head,findptr,out) \
+ HASH_FIND(hh,head,findptr,sizeof(void *),out)
+#define HASH_ADD_PTR(head,ptrfield,add) \
+ HASH_ADD(hh,head,ptrfield,sizeof(void *),add)
+#define HASH_REPLACE_PTR(head,ptrfield,add) \
+ HASH_REPLACE(hh,head,ptrfield,sizeof(void *),add,replaced) /* NOTE(review): `replaced` is not a parameter of this macro (unlike HASH_REPLACE_STR and HASH_REPLACE_INT), so it binds to whatever identifier named `replaced` is in scope at the call site */
+#define HASH_DEL(head,delptr) \
+ HASH_DELETE(hh,head,delptr)
+/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined.
+ * This is for uthash developers only; it compiles away if HASH_DEBUG isn't defined.
+ */
+#ifdef HASH_DEBUG
+#define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0)
+#define HASH_FSCK(hh,head) \
+do { \
+ unsigned _bkt_i; \
+ unsigned _count, _bkt_count; \
+ char *_prev; \
+ struct UT_hash_handle *_thh; \
+ if (head) { \
+ _count = 0; \
+ for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) { \
+ _bkt_count = 0; \
+ _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \
+ _prev = NULL; \
+ while (_thh) { \
+ if (_prev != (char*)(_thh->hh_prev)) { \
+ HASH_OOPS("invalid hh_prev %p, actual %p\n", \
+ _thh->hh_prev, _prev ); \
+ } \
+ _bkt_count++; \
+ _prev = (char*)(_thh); \
+ _thh = _thh->hh_next; \
+ } \
+ _count += _bkt_count; \
+ if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \
+ HASH_OOPS("invalid bucket count %d, actual %d\n", \
+ (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \
+ } \
+ } \
+ if (_count != (head)->hh.tbl->num_items) { \
+ HASH_OOPS("invalid hh item count %d, actual %d\n", \
+ (head)->hh.tbl->num_items, _count ); \
+ } \
+ /* traverse hh in app order; check next/prev integrity, count */ \
+ _count = 0; \
+ _prev = NULL; \
+ _thh = &(head)->hh; \
+ while (_thh) { \
+ _count++; \
+ if (_prev !=(char*)(_thh->prev)) { \
+ HASH_OOPS("invalid prev %p, actual %p\n", \
+ _thh->prev, _prev ); \
+ } \
+ _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \
+ _thh = ( _thh->next ? (UT_hash_handle*)((char*)(_thh->next) + \
+ (head)->hh.tbl->hho) : NULL ); \
+ } \
+ if (_count != (head)->hh.tbl->num_items) { \
+ HASH_OOPS("invalid app item count %d, actual %d\n", \
+ (head)->hh.tbl->num_items, _count ); \
+ } \
+ } \
+} while (0)
+#define HASH_FSCK(hh,head)
+/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to
+ * the descriptor to which this macro is defined for tuning the hash function.
+ * The app can #include <unistd.h> to get the prototype for write(2). */
+#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \
+do { \
+ unsigned _klen = fieldlen; \
+ write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \
+ write(HASH_EMIT_KEYS, keyptr, fieldlen); \
+} while (0)
+#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)
+/* default to Jenkins' hash unless overridden, e.g. -DHASH_FUNCTION=HASH_SAX */
+/* The Bernstein hash function, used in Perl prior to v5.6.
+ * Starts hashv at 0 and folds in each of the keylen key bytes as
+ * hashv = hashv*33 + byte, then derives the bucket index bkt by masking
+ * hashv with (num_bkts-1). */
+#define HASH_BER(key,keylen,num_bkts,hashv,bkt) \
+do { \
+ unsigned _hb_keylen=keylen; \
+ char *_hb_key=(char*)(key); \
+ (hashv) = 0; \
+ while (_hb_keylen--) { (hashv) = ((hashv) * 33) + *_hb_key++; } \
+ bkt = (hashv) & (num_bkts-1); \
+} while (0)
+/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at
+ * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */
+#define HASH_SAX(key,keylen,num_bkts,hashv,bkt) \
+do { \
+ unsigned _sx_i; \
+ char *_hs_key=(char*)(key); \
+ hashv = 0; \
+ for(_sx_i=0; _sx_i < keylen; _sx_i++) \
+ hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \
+ bkt = hashv & (num_bkts-1); \
+} while (0)
+#define HASH_FNV(key,keylen,num_bkts,hashv,bkt) \
+do { \
+ unsigned _fn_i; \
+ char *_hf_key=(char*)(key); \
+ hashv = 2166136261UL; \
+ for(_fn_i=0; _fn_i < keylen; _fn_i++) \
+ hashv = (hashv * 16777619) ^ _hf_key[_fn_i]; \
+ bkt = hashv & (num_bkts-1); \
+} while(0)
+#define HASH_OAT(key,keylen,num_bkts,hashv,bkt) \
+do { \
+ unsigned _ho_i; \
+ char *_ho_key=(char*)(key); \
+ hashv = 0; \
+ for(_ho_i=0; _ho_i < keylen; _ho_i++) { \
+ hashv += _ho_key[_ho_i]; \
+ hashv += (hashv << 10); \
+ hashv ^= (hashv >> 6); \
+ } \
+ hashv += (hashv << 3); \
+ hashv ^= (hashv >> 11); \
+ hashv += (hashv << 15); \
+ bkt = hashv & (num_bkts-1); \
+} while(0)
+#define HASH_JEN_MIX(a,b,c) \
+do { \
+ a -= b; a -= c; a ^= ( c >> 13 ); \
+ b -= c; b -= a; b ^= ( a << 8 ); \
+ c -= a; c -= b; c ^= ( b >> 13 ); \
+ a -= b; a -= c; a ^= ( c >> 12 ); \
+ b -= c; b -= a; b ^= ( a << 16 ); \
+ c -= a; c -= b; c ^= ( b >> 5 ); \
+ a -= b; a -= c; a ^= ( c >> 3 ); \
+ b -= c; b -= a; b ^= ( a << 10 ); \
+ c -= a; c -= b; c ^= ( b >> 15 ); \
+} while (0)
+#define HASH_JEN(key,keylen,num_bkts,hashv,bkt) \
+do { \
+ unsigned _hj_i,_hj_j,_hj_k; \
+ unsigned char *_hj_key=(unsigned char*)(key); \
+ hashv = 0xfeedbeef; \
+ _hj_i = _hj_j = 0x9e3779b9; \
+ _hj_k = (unsigned)(keylen); \
+ while (_hj_k >= 12) { \
+ _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \
+ + ( (unsigned)_hj_key[2] << 16 ) \
+ + ( (unsigned)_hj_key[3] << 24 ) ); \
+ _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \
+ + ( (unsigned)_hj_key[6] << 16 ) \
+ + ( (unsigned)_hj_key[7] << 24 ) ); \
+ hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \
+ + ( (unsigned)_hj_key[10] << 16 ) \
+ + ( (unsigned)_hj_key[11] << 24 ) ); \
+ \
+ HASH_JEN_MIX(_hj_i, _hj_j, hashv); \
+ \
+ _hj_key += 12; \
+ _hj_k -= 12; \
+ } \
+ hashv += keylen; \
+ switch ( _hj_k ) { \
+ case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); \
+ case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); \
+ case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); \
+ case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); \
+ case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); \
+ case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); \
+ case 5: _hj_j += _hj_key[4]; \
+ case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); \
+ case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); \
+ case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); \
+ case 1: _hj_i += _hj_key[0]; \
+ } \
+ HASH_JEN_MIX(_hj_i, _hj_j, hashv); \
+ bkt = hashv & (num_bkts-1); \
+} while(0)
+/* The Paul Hsieh hash function */
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
+ || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \
+ +(uint32_t)(((const uint8_t *)(d))[0]) )
+#define HASH_SFH(key,keylen,num_bkts,hashv,bkt) \
+do { \
+ unsigned char *_sfh_key=(unsigned char*)(key); \
+ uint32_t _sfh_tmp, _sfh_len = keylen; \
+ \
+ int _sfh_rem = _sfh_len & 3; \
+ _sfh_len >>= 2; \
+ hashv = 0xcafebabe; \
+ \
+ /* Main loop */ \
+ for (;_sfh_len > 0; _sfh_len--) { \
+ hashv += get16bits (_sfh_key); \
+ _sfh_tmp = (uint32_t)(get16bits (_sfh_key+2)) << 11 ^ hashv; \
+ hashv = (hashv << 16) ^ _sfh_tmp; \
+ _sfh_key += 2*sizeof (uint16_t); \
+ hashv += hashv >> 11; \
+ } \
+ \
+ /* Handle end cases */ \
+ switch (_sfh_rem) { \
+ case 3: hashv += get16bits (_sfh_key); \
+ hashv ^= hashv << 16; \
+ hashv ^= (uint32_t)(_sfh_key[sizeof (uint16_t)] << 18); \
+ hashv += hashv >> 11; \
+ break; \
+ case 2: hashv += get16bits (_sfh_key); \
+ hashv ^= hashv << 11; \
+ hashv += hashv >> 17; \
+ break; \
+ case 1: hashv += *_sfh_key; \
+ hashv ^= hashv << 10; \
+ hashv += hashv >> 1; \
+ } \
+ \
+ /* Force "avalanching" of final 127 bits */ \
+ hashv ^= hashv << 3; \
+ hashv += hashv >> 5; \
+ hashv ^= hashv << 4; \
+ hashv += hashv >> 17; \
+ hashv ^= hashv << 25; \
+ hashv += hashv >> 6; \
+ bkt = hashv & (num_bkts-1); \
+} while(0)
+/* The MurmurHash exploits some CPU's (x86,x86_64) tolerance for unaligned reads.
+ * For other types of CPU's (e.g. Sparc) an unaligned read causes a bus error.
+ * MurmurHash uses the faster approach only on CPU's where we know it's safe.
+ *
+ * Note the preprocessor built-in defines can be emitted using:
+ *
+ * gcc -m64 -dM -E - < /dev/null (on gcc)
+ * cc -## a.c (where a.c is a simple test file) (Sun Studio)
+ */
+#if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86))
+#define MUR_GETBLOCK(p,i) p[i]
+#else /* non intel */
+#define MUR_PLUS0_ALIGNED(p) (((unsigned long)p & 0x3) == 0)
+#define MUR_PLUS1_ALIGNED(p) (((unsigned long)p & 0x3) == 1)
+#define MUR_PLUS2_ALIGNED(p) (((unsigned long)p & 0x3) == 2)
+#define MUR_PLUS3_ALIGNED(p) (((unsigned long)p & 0x3) == 3)
+#define WP(p) ((uint32_t*)((unsigned long)(p) & ~3UL))
+#if (defined(__BIG_ENDIAN__) || defined(SPARC) || defined(__ppc__) || defined(__ppc64__))
+#define MUR_THREE_ONE(p) ((((*WP(p))&0x00ffffff) << 8) | (((*(WP(p)+1))&0xff000000) >> 24))
+#define MUR_TWO_TWO(p) ((((*WP(p))&0x0000ffff) <<16) | (((*(WP(p)+1))&0xffff0000) >> 16))
+#define MUR_ONE_THREE(p) ((((*WP(p))&0x000000ff) <<24) | (((*(WP(p)+1))&0xffffff00) >> 8))
+#else /* assume little endian non-intel */
+#define MUR_THREE_ONE(p) ((((*WP(p))&0xffffff00) >> 8) | (((*(WP(p)+1))&0x000000ff) << 24))
+#define MUR_TWO_TWO(p) ((((*WP(p))&0xffff0000) >>16) | (((*(WP(p)+1))&0x0000ffff) << 16))
+#define MUR_ONE_THREE(p) ((((*WP(p))&0xff000000) >>24) | (((*(WP(p)+1))&0x00ffffff) << 8))
+#define MUR_GETBLOCK(p,i) (MUR_PLUS0_ALIGNED(p) ? ((p)[i]) : \
+#define MUR_ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#define MUR_FMIX(_h) \
+do { \
+ _h ^= _h >> 16; \
+ _h *= 0x85ebca6b; \
+ _h ^= _h >> 13; \
+ _h *= 0xc2b2ae35l; \
+ _h ^= _h >> 16; \
+} while(0)
+#define HASH_MUR(key,keylen,num_bkts,hashv,bkt) \
+do { \
+ const uint8_t *_mur_data = (const uint8_t*)(key); \
+ const int _mur_nblocks = (keylen) / 4; \
+ uint32_t _mur_h1 = 0xf88D5353; \
+ uint32_t _mur_c1 = 0xcc9e2d51; \
+ uint32_t _mur_c2 = 0x1b873593; \
+ uint32_t _mur_k1 = 0; \
+ const uint8_t *_mur_tail; \
+ const uint32_t *_mur_blocks = (const uint32_t*)(_mur_data+_mur_nblocks*4); \
+ int _mur_i; \
+ for(_mur_i = -_mur_nblocks; _mur_i; _mur_i++) { \
+ _mur_k1 = MUR_GETBLOCK(_mur_blocks,_mur_i); \
+ _mur_k1 *= _mur_c1; \
+ _mur_k1 = MUR_ROTL32(_mur_k1,15); \
+ _mur_k1 *= _mur_c2; \
+ \
+ _mur_h1 ^= _mur_k1; \
+ _mur_h1 = MUR_ROTL32(_mur_h1,13); \
+ _mur_h1 = _mur_h1*5+0xe6546b64; \
+ } \
+ _mur_tail = (const uint8_t*)(_mur_data + _mur_nblocks*4); \
+ _mur_k1=0; \
+ switch((keylen) & 3) { \
+ case 3: _mur_k1 ^= _mur_tail[2] << 16; \
+ case 2: _mur_k1 ^= _mur_tail[1] << 8; \
+ case 1: _mur_k1 ^= _mur_tail[0]; \
+ _mur_k1 *= _mur_c1; \
+ _mur_k1 = MUR_ROTL32(_mur_k1,15); \
+ _mur_k1 *= _mur_c2; \
+ _mur_h1 ^= _mur_k1; \
+ } \
+ _mur_h1 ^= (keylen); \
+ MUR_FMIX(_mur_h1); \
+ hashv = _mur_h1; \
+ bkt = hashv & (num_bkts-1); \
+} while(0)
+/* key comparison function; return 0 if keys equal */
+#define HASH_KEYCMP(a,b,len) memcmp(a,b,len)
+/* iterate over items in a known bucket to find desired item.
+ * Walks the bucket chain through hh_next, converting each hash handle back
+ * to its element with ELMT_FROM_HH. An element matches when its stored
+ * keylen equals keylen_in and HASH_KEYCMP reports the keys equal. On exit
+ * `out` points at the matching element, or is NULL if the chain was
+ * exhausted (or the bucket was empty). */
+#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out) \
+do { \
+ if (head.hh_head) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head)); \
+ else out=NULL; \
+ while (out) { \
+ if ((out)->hh.keylen == keylen_in) { \
+ if ((HASH_KEYCMP((out)->hh.key,keyptr,keylen_in)) == 0) break; \
+ } \
+ if ((out)->hh.hh_next) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,(out)->hh.hh_next)); \
+ else out = NULL; \
+ } \
+} while(0)
+/* add an item to a bucket.
+ * The handle addhh is pushed at the front of the bucket's chain and the
+ * bucket's count is incremented. If the count has then reached
+ * (expand_mult+1) * HASH_BKT_CAPACITY_THRESH and the table's noexpand flag
+ * is not 1, HASH_EXPAND_BUCKETS is run on the handle's table. */
+#define HASH_ADD_TO_BKT(head,addhh) \
+do { \
+ head.count++; \
+ (addhh)->hh_next = head.hh_head; \
+ (addhh)->hh_prev = NULL; \
+ if (head.hh_head) { (head).hh_head->hh_prev = (addhh); } \
+ (head).hh_head=addhh; \
+ if (head.count >= ((head.expand_mult+1) * HASH_BKT_CAPACITY_THRESH) \
+ && (addhh)->tbl->noexpand != 1) { \
+ HASH_EXPAND_BUCKETS((addhh)->tbl); \
+ } \
+} while(0)
+/* remove an item from a given bucket.
+ * Decrements the bucket's count, advances the bucket head past hh_del if
+ * hh_del was the head, and unlinks hh_del from its hh_prev/hh_next
+ * neighbours in the bucket chain.
+ * NOTE: unlike the other macros here, this one expands to several bare
+ * statements rather than a do { } while(0) block, and hh_del is used
+ * unparenthesized, so it must be invoked as a full statement inside braces
+ * with a simple pointer expression for hh_del. */
+#define HASH_DEL_IN_BKT(hh,head,hh_del) \
+ (head).count--; \
+ if ((head).hh_head == hh_del) { \
+ (head).hh_head = hh_del->hh_next; \
+ } \
+ if (hh_del->hh_prev) { \
+ hh_del->hh_prev->hh_next = hh_del->hh_next; \
+ } \
+ if (hh_del->hh_next) { \
+ hh_del->hh_next->hh_prev = hh_del->hh_prev; \
+ }
+/* Bucket expansion has the effect of doubling the number of buckets
+ * and redistributing the items into the new buckets. Ideally the
+ * items will distribute more or less evenly into the new buckets
+ * (the extent to which this is true is a measure of the quality of
+ * the hash function as it applies to the key domain).
+ *
+ * With the items distributed into more buckets, the chain length
+ * (item count) in each bucket is reduced. Thus by expanding buckets
+ * the hash keeps a bound on the chain length. This bounded chain
+ * length is the essence of how a hash provides constant time lookup.
+ *
+ * The calculation of tbl->ideal_chain_maxlen below deserves some
+ * explanation. First, keep in mind that we're calculating the ideal
+ * maximum chain length based on the *new* (doubled) bucket count.
+ * In fractions this is just n/b (n=number of items,b=new num buckets).
+ * Since the ideal chain length is an integer, we want to calculate
+ * ceil(n/b). We don't depend on floating point arithmetic in this
+ * hash, so to calculate ceil(n/b) with integers we could write
+ *
+ * ceil(n/b) = (n/b) + ((n%b)?1:0)
+ *
+ * and in fact a previous version of this hash did just that.
+ * But now we have improved things a bit by recognizing that b is
+ * always a power of two. We keep its base 2 log handy (call it lb),
+ * so now we can write this with a bit shift and logical AND:
+ *
+ * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0)
+ *
+ */
+#define HASH_EXPAND_BUCKETS(tbl) \
+do { \
+ unsigned _he_bkt; \
+ unsigned _he_bkt_i; \
+ struct UT_hash_handle *_he_thh, *_he_hh_nxt; \
+ UT_hash_bucket *_he_new_buckets, *_he_newbkt; \
+ _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \
+ 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \
+ if (!_he_new_buckets) { uthash_fatal( "out of memory"); } \
+ memset(_he_new_buckets, 0, \
+ 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \
+ tbl->ideal_chain_maxlen = \
+ (tbl->num_items >> (tbl->log2_num_buckets+1)) + \
+ ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 1 : 0); \
+ tbl->nonideal_items = 0; \
+ for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++) \
+ { \
+ _he_thh = tbl->buckets[ _he_bkt_i ].hh_head; \
+ while (_he_thh) { \
+ _he_hh_nxt = _he_thh->hh_next; \
+ HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt); \
+ _he_newbkt = &(_he_new_buckets[ _he_bkt ]); \
+ if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) { \
+ tbl->nonideal_items++; \
+ _he_newbkt->expand_mult = _he_newbkt->count / \
+ tbl->ideal_chain_maxlen; \
+ } \
+ _he_thh->hh_prev = NULL; \
+ _he_thh->hh_next = _he_newbkt->hh_head; \
+ if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev = \
+ _he_thh; \
+ _he_newbkt->hh_head = _he_thh; \
+ _he_thh = _he_hh_nxt; \
+ } \
+ } \
+ uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
+ tbl->num_buckets *= 2; \
+ tbl->log2_num_buckets++; \
+ tbl->buckets = _he_new_buckets; \
+ tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ? \
+ (tbl->ineff_expands+1) : 0; \
+ if (tbl->ineff_expands > 1) { \
+ tbl->noexpand=1; \
+ uthash_noexpand_fyi(tbl); \
+ } \
+ uthash_expand_fyi(tbl); \
+} while(0)
+/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */
+/* Note that HASH_SORT assumes the hash handle name to be hh.
+ * HASH_SRT was added to allow the hash handle name to be passed in. */
+#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn)
+#define HASH_SRT(hh,head,cmpfcn) \
+do { \
+ unsigned _hs_i; \
+ unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \
+ struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \
+ if (head) { \
+ _hs_insize = 1; \
+ _hs_looping = 1; \
+ _hs_list = &((head)->hh); \
+ while (_hs_looping) { \
+ _hs_p = _hs_list; \
+ _hs_list = NULL; \
+ _hs_tail = NULL; \
+ _hs_nmerges = 0; \
+ while (_hs_p) { \
+ _hs_nmerges++; \
+ _hs_q = _hs_p; \
+ _hs_psize = 0; \
+ for ( _hs_i = 0; _hs_i < _hs_insize; _hs_i++ ) { \
+ _hs_psize++; \
+ _hs_q = (UT_hash_handle*)((_hs_q->next) ? \
+ ((void*)((char*)(_hs_q->next) + \
+ (head)->hh.tbl->hho)) : NULL); \
+ if (! (_hs_q) ) break; \
+ } \
+ _hs_qsize = _hs_insize; \
+ while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) { \
+ if (_hs_psize == 0) { \
+ _hs_e = _hs_q; \
+ _hs_q = (UT_hash_handle*)((_hs_q->next) ? \
+ ((void*)((char*)(_hs_q->next) + \
+ (head)->hh.tbl->hho)) : NULL); \
+ _hs_qsize--; \
+ } else if ( (_hs_qsize == 0) || !(_hs_q) ) { \
+ _hs_e = _hs_p; \
+ if (_hs_p){ \
+ _hs_p = (UT_hash_handle*)((_hs_p->next) ? \
+ ((void*)((char*)(_hs_p->next) + \
+ (head)->hh.tbl->hho)) : NULL); \
+ } \
+ _hs_psize--; \
+ } else if (( \
+ cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \
+ DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \
+ ) <= 0) { \
+ _hs_e = _hs_p; \
+ if (_hs_p){ \
+ _hs_p = (UT_hash_handle*)((_hs_p->next) ? \
+ ((void*)((char*)(_hs_p->next) + \
+ (head)->hh.tbl->hho)) : NULL); \
+ } \
+ _hs_psize--; \
+ } else { \
+ _hs_e = _hs_q; \
+ _hs_q = (UT_hash_handle*)((_hs_q->next) ? \
+ ((void*)((char*)(_hs_q->next) + \
+ (head)->hh.tbl->hho)) : NULL); \
+ _hs_qsize--; \
+ } \
+ if ( _hs_tail ) { \
+ _hs_tail->next = ((_hs_e) ? \
+ ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL); \
+ } else { \
+ _hs_list = _hs_e; \
+ } \
+ if (_hs_e) { \
+ _hs_e->prev = ((_hs_tail) ? \
+ ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL); \
+ } \
+ _hs_tail = _hs_e; \
+ } \
+ _hs_p = _hs_q; \
+ } \
+ if (_hs_tail){ \
+ _hs_tail->next = NULL; \
+ } \
+ if ( _hs_nmerges <= 1 ) { \
+ _hs_looping=0; \
+ (head)->hh.tbl->tail = _hs_tail; \
+ DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \
+ } \
+ _hs_insize *= 2; \
+ } \
+ HASH_FSCK(hh,head); \
+ } \
+} while (0)
+/* This function selects items from one hash into another hash.
+ * The end result is that the selected items have dual presence
+ * in both hashes. There is no copy of the items made; rather
+ * they are added into the new hash through a secondary
+ * hash handle that must be present in the structure. */
+#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \
+do { \
+ unsigned _src_bkt, _dst_bkt; \
+ void *_last_elt=NULL, *_elt; \
+ UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \
+ ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \
+ if (src) { \
+ for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \
+ for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \
+ _src_hh; \
+ _src_hh = _src_hh->hh_next) { \
+ _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \
+ if (cond(_elt)) { \
+ _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho); \
+ _dst_hh->key = _src_hh->key; \
+ _dst_hh->keylen = _src_hh->keylen; \
+ _dst_hh->hashv = _src_hh->hashv; \
+ _dst_hh->prev = _last_elt; \
+ _dst_hh->next = NULL; \
+ if (_last_elt_hh) { _last_elt_hh->next = _elt; } \
+ if (!dst) { \
+ DECLTYPE_ASSIGN(dst,_elt); \
+ HASH_MAKE_TABLE(hh_dst,dst); \
+ } else { \
+ _dst_hh->tbl = (dst)->hh_dst.tbl; \
+ } \
+ HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \
+ HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh); \
+ (dst)->hh_dst.tbl->num_items++; \
+ _last_elt = _elt; \
+ _last_elt_hh = _dst_hh; \
+ } \
+ } \
+ } \
+ } \
+ HASH_FSCK(hh_dst,dst); \
+} while (0)
+#define HASH_CLEAR(hh,head) \
+do { \
+ if (head) { \
+ uthash_free((head)->hh.tbl->buckets, \
+ (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \
+ HASH_BLOOM_FREE((head)->hh.tbl); \
+ uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \
+ (head)=NULL; \
+ } \
+} while(0)
+#define HASH_OVERHEAD(hh,head) \
+ (size_t)((((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \
+ ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \
+ (sizeof(UT_hash_table)) + \
+#define HASH_ITER(hh,head,el,tmp) \
+for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL); \
+ el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL))
+#define HASH_ITER(hh,head,el,tmp) \
+for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL); \
+ el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL))
+/* obtain a count of items in the hash; evaluates to 0 when head is NULL.
+ * HASH_COUNT assumes the hash handle field is named hh; HASH_CNT takes the
+ * handle name as its first argument. */
+#define HASH_COUNT(head) HASH_CNT(hh,head)
+#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0)
+typedef struct UT_hash_bucket {
+ struct UT_hash_handle *hh_head;
+ unsigned count;
+ /* expand_mult is normally set to 0. In this situation, the max chain length
+ * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If
+ * the bucket's chain exceeds this length, bucket expansion is triggered).
+ * However, setting expand_mult to a non-zero value delays bucket expansion
+ * (that would be triggered by additions to this particular bucket)
+ * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH.
+ * (The multiplier is simply expand_mult+1). The whole idea of this
+ * multiplier is to reduce bucket expansions, since they are expensive, in
+ * situations where we know that a particular bucket tends to be overused.
+ * It is better to let its chain length grow to a longer yet-still-bounded
+ * value, than to do an O(n) bucket expansion too often.
+ */
+ unsigned expand_mult;
+} UT_hash_bucket;
+/* random signature used only to find hash tables in external analysis */
+#define HASH_SIGNATURE 0xa0111fe1
+#define HASH_BLOOM_SIGNATURE 0xb12220f2
+typedef struct UT_hash_table {
+ UT_hash_bucket *buckets;
+ unsigned num_buckets, log2_num_buckets;
+ unsigned num_items;
+ struct UT_hash_handle *tail; /* tail hh in app order, for fast append */
+ ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element) */
+ /* in an ideal situation (all buckets used equally), no bucket would have
+ * more than ceil(#items/#buckets) items. that's the ideal chain length. */
+ unsigned ideal_chain_maxlen;
+ /* nonideal_items is the number of items in the hash whose chain position
+ * exceeds the ideal chain maxlen. these items pay the penalty for an uneven
+ * hash distribution; reaching them in a chain traversal takes >ideal steps */
+ unsigned nonideal_items;
+ /* ineffective expands occur when a bucket doubling was performed, but
+ * afterward, more than half the items in the hash had nonideal chain
+ * positions. If this happens on two consecutive expansions we inhibit any
+ * further expansion, as it's not helping; this happens when the hash
+ * function isn't a good fit for the key domain. When expansion is inhibited
+ * the hash will still work, albeit no longer in constant time. */
+ unsigned ineff_expands, noexpand;
+ uint32_t signature; /* used only to find hash tables in external analysis */
+#ifdef HASH_BLOOM
+ uint32_t bloom_sig; /* used only to test bloom exists in external analysis */
+ uint8_t *bloom_bv; /* bloom filter bit vector, allocated by HASH_BLOOM_MAKE */
+ char bloom_nbits; /* set to HASH_BLOOM; hash values are masked to this many low bits to index bloom_bv */
+} UT_hash_table;
+typedef struct UT_hash_handle {
+ struct UT_hash_table *tbl;
+ void *prev; /* prev element in app order */
+ void *next; /* next element in app order */
+ struct UT_hash_handle *hh_prev; /* previous hh in bucket order */
+ struct UT_hash_handle *hh_next; /* next hh in bucket order */
+ void *key; /* ptr to enclosing struct's key */
+ unsigned keylen; /* enclosing struct's key len */
+ unsigned hashv; /* result of hash-fcn(key) */
+} UT_hash_handle;
+#endif /* UTHASH_H */
diff --git a/import/pdclibhdfs/libhdfs.sln b/import/pdclibhdfs/libhdfs.sln
new file mode 100755
index 0000000..5f49967
--- /dev/null
+++ b/import/pdclibhdfs/libhdfs.sln
@@ -0,0 +1,26 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libhdfs", "libhdfs\libhdfs.vcxproj", "{5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}"
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Debug|Win32.ActiveCfg = Debug|Win32
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Debug|Win32.Build.0 = Debug|Win32
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Debug|x64.ActiveCfg = Debug|x64
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Debug|x64.Build.0 = Debug|x64
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Release|Win32.ActiveCfg = Release|Win32
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Release|Win32.Build.0 = Release|Win32
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Release|x64.ActiveCfg = Release|x64
+ {5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
diff --git a/import/pdclibhdfs/libhdfs/ReadMe.txt b/import/pdclibhdfs/libhdfs/ReadMe.txt
new file mode 100755
index 0000000..5c886e1
--- /dev/null
+++ b/import/pdclibhdfs/libhdfs/ReadMe.txt
@@ -0,0 +1,37 @@
+ STATIC LIBRARY : libhdfs Project Overview
+AppWizard has created this libhdfs library project for you.
+This file contains a summary of what you will find in each of the files that
+make up your libhdfs application.
+libhdfs.vcxproj
+ This is the main project file for VC++ projects generated using an Application Wizard.
+ It contains information about the version of Visual C++ that generated the file, and
+ information about the platforms, configurations, and project features selected with the
+ Application Wizard.
+libhdfs.vcxproj.filters
+ This is the filters file for VC++ projects generated using an Application Wizard.
+ It contains information about the association between the files in your project
+ and the filters. This association is used in the IDE to show grouping of files with
+ similar extensions under a specific node (e.g., ".cpp" files are associated with the
+ "Source Files" filter).
+StdAfx.h, StdAfx.cpp
+ These files are used to build a precompiled header (PCH) file
+ named libhdfs.pch and a precompiled types file named StdAfx.obj.
+Other notes:
+AppWizard uses "TODO:" comments to indicate parts of the source code you
+should add to or customize.
diff --git a/import/pdclibhdfs/libhdfs/libhdfs.vcxproj b/import/pdclibhdfs/libhdfs/libhdfs.vcxproj
new file mode 100755
index 0000000..ae37376
--- /dev/null
+++ b/import/pdclibhdfs/libhdfs/libhdfs.vcxproj
@@ -0,0 +1,165 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{5B359A57-EC8A-4577-9F43-B3A00D9E5ECD}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>libhdfs</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v110</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v110</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v110</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>StaticLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v110</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <OutDir>$(SolutionDir)..</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\libhdfs\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
+ <AdditionalIncludeDirectories>C:\Program Files\Java\jdk1.7.0_51\include;C:\Program Files\Java\jdk1.7.0_51\include\win32;..\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
+ <AdditionalIncludeDirectories>..\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <SDLCheck>true</SDLCheck>
+ <AdditionalIncludeDirectories>C:\Program Files\Java\jdk1.7.0_51\include\win32;C:\Program Files\Java\jdk1.7.0_51\include;..\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Windows</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <Text Include="ReadMe.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\inc\exception.h" />
+ <ClInclude Include="..\inc\expect.h" />
+ <ClInclude Include="..\inc\hdfs.h" />
+ <ClInclude Include="..\inc\hdfs_test.h" />
+ <ClInclude Include="..\inc\jni_helper.h" />
+ <ClInclude Include="..\inc\native_mini_dfs.h" />
+ <ClInclude Include="..\inc\uthash.h" />
+ <ClInclude Include="stdafx.h" />
+ <ClInclude Include="targetver.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\src\exception.c" />
+ <ClCompile Include="..\src\hdfs.c" />
+ <ClCompile Include="..\src\jni_helper.c" />
+ <ClCompile Include="..\src\native_mini_dfs.c" />
+ <ClCompile Include="stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/import/pdclibhdfs/libhdfs/libhdfs.vcxproj.filters b/import/pdclibhdfs/libhdfs/libhdfs.vcxproj.filters
new file mode 100755
index 0000000..e5dd724
--- /dev/null
+++ b/import/pdclibhdfs/libhdfs/libhdfs.vcxproj.filters
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <Text Include="ReadMe.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="stdafx.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="targetver.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\inc\exception.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\inc\expect.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\inc\hdfs.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\inc\hdfs_test.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\inc\jni_helper.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\inc\native_mini_dfs.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\inc\uthash.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="stdafx.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\src\exception.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\src\hdfs.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\src\jni_helper.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\src\native_mini_dfs.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/import/pdclibhdfs/libhdfs/stdafx.cpp b/import/pdclibhdfs/libhdfs/stdafx.cpp
new file mode 100755
index 0000000..075e31d
--- /dev/null
+++ b/import/pdclibhdfs/libhdfs/stdafx.cpp
@@ -0,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// libhdfs.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+#include "stdafx.h"
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/import/pdclibhdfs/libhdfs/stdafx.h b/import/pdclibhdfs/libhdfs/stdafx.h
new file mode 100755
index 0000000..c4ddfed
--- /dev/null
+++ b/import/pdclibhdfs/libhdfs/stdafx.h
@@ -0,0 +1,14 @@
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+#pragma once
+#include "targetver.h"
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+// TODO: reference additional headers your program requires here
diff --git a/import/pdclibhdfs/libhdfs/targetver.h b/import/pdclibhdfs/libhdfs/targetver.h
new file mode 100755
index 0000000..90e767b
--- /dev/null
+++ b/import/pdclibhdfs/libhdfs/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+// Including SDKDDKVer.h defines the highest available Windows platform.
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+#include <SDKDDKVer.h>
diff --git a/import/pdclibhdfs/src/.cmake.state b/import/pdclibhdfs/src/.cmake.state
new file mode 100755
index 0000000..d363c9d
Binary files /dev/null and b/import/pdclibhdfs/src/.cmake.state differ
diff --git a/import/pdclibhdfs/src/LICENSE.txt b/import/pdclibhdfs/src/LICENSE.txt
new file mode 100755
index 0000000..07b79c2
--- /dev/null
+++ b/import/pdclibhdfs/src/LICENSE.txt
@@ -0,0 +1,271 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+ 1. Definitions.
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+ APPENDIX: How to apply the Apache License to your work.
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+ Copyright [yyyy] [name of copyright owner]
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+The Apache Hadoop project contains subcomponents with separate copyright
+notices and license terms. Your use of the source code for these
+subcomponents is subject to the terms and conditions of the following
+licenses.
+For the org.apache.hadoop.util.bloom.* classes:
+ *
+ * Copyright (c) 2005, European Commission project OneLab under contract
+ * 034819 (http://www.one-lab.org)
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ */
+For src/main/native/util/tree.h:
+ * Copyright 2002 Niels Provos <provos at citi.umich.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ */
diff --git a/import/pdclibhdfs/src/NOTICE.txt b/import/pdclibhdfs/src/NOTICE.txt
new file mode 100755
index 0000000..fa3bb79
--- /dev/null
+++ b/import/pdclibhdfs/src/NOTICE.txt
@@ -0,0 +1,2 @@
+This product includes software developed by The Apache Software
+Foundation (http://www.apache.org/).
diff --git a/import/pdclibhdfs/src/TstOpsHdfs.tdprj.xml b/import/pdclibhdfs/src/TstOpsHdfs.tdprj.xml
new file mode 100755
index 0000000..752cdf5
--- /dev/null
+++ b/import/pdclibhdfs/src/TstOpsHdfs.tdprj.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="US-ASCII" ?>
+<!--
+* *
+* TITLE: TstOpsHdfs.tdprj.xml *
+* *
+* Copyright 2009-2010 by Teradata Corporation. *
+* *
+* *
+* *
+* Purpose: To create makefiles used by SCM build process. *
+* *
+* Description: An XML file. *
+* *
+* Revision Date DR DID Comments *
+* =========== ======== ===== ======== ===================================== *
+* 08052010 116734 JR185072 Initial version *
+* *
+-->
+<Project Name="TstOpsHdfs"
+ ProductGroupName="tstlibhdfs">
+ <Package Package="No"/>
+ <ToolClassConfiguration ToolClass="MicrosoftResourceOtbePreprocessor">
+ <Set SetName="Defines" Name="TWBResource">
+ BUILDPRODUCTNAME="\"Teradata Parallel Transporter\""
+ BUILDPROJECT="\"Test libhdfs Read\""
+ </Set>
+ </ToolClassConfiguration>
+ <FileDefinitions>
+ <File Path="test_libhdfs_ops.c"/>
+ </FileDefinitions>
+</Project>
diff --git a/import/pdclibhdfs/src/TstReadHdfs.tdprj.xml b/import/pdclibhdfs/src/TstReadHdfs.tdprj.xml
new file mode 100755
index 0000000..23ef93f
--- /dev/null
+++ b/import/pdclibhdfs/src/TstReadHdfs.tdprj.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="US-ASCII" ?>
+<!--
+* *
+* TITLE: TstReadHdfs.tdprj.xml *
+* *
+* Copyright 2009-2010 by Teradata Corporation. *
+* *
+* *
+* *
+* Purpose: To create makefiles used by SCM build process. *
+* *
+* Description: An XML file. *
+* *
+* Revision Date DR DID Comments *
+* =========== ======== ===== ======== ===================================== *
+* 08052010 116734 JR185072 Initial version *
+* *
+-->
+<Project Name="TstReadHdfs"
+ ProductGroupName="tstlibhdfs">
+ <Package Package="No"/>
+ <ToolClassConfiguration ToolClass="MicrosoftResourceOtbePreprocessor">
+ <Set SetName="Defines" Name="TWBResource">
+ BUILDPRODUCTNAME="\"Teradata Parallel Transporter\""
+ BUILDPROJECT="\"Test libhdfs Read\""
+ </Set>
+ </ToolClassConfiguration>
+ <FileDefinitions>
+ <File Path="test_libhdfs_read.c"/>
+ </FileDefinitions>
+</Project>
diff --git a/import/pdclibhdfs/src/TstWriteHdfs.tdprj.xml b/import/pdclibhdfs/src/TstWriteHdfs.tdprj.xml
new file mode 100755
index 0000000..92ada5e
--- /dev/null
+++ b/import/pdclibhdfs/src/TstWriteHdfs.tdprj.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="US-ASCII" ?>
+<!--
+* *
+* TITLE: TstWriteHdfs.tdprj.xml *
+* *
+* Copyright 2009-2010 by Teradata Corporation. *
+* *
+* *
+* *
+* Purpose: To create makefiles used by SCM build process. *
+* *
+* Description: An XML file. *
+* *
+* Revision Date DR DID Comments *
+* =========== ======== ===== ======== ===================================== *
+* 08052010 116734 JR185072 Initial version *
+* *
+-->
+<Project Name="TstWriteHdfs"
+ ProductGroupName="tstlibhdfs">
+ <Package Package="No"/>
+ <ToolClassConfiguration ToolClass="MicrosoftResourceOtbePreprocessor">
+ <Set SetName="Defines" Name="TWBResource">
+ BUILDPRODUCTNAME="\"Teradata Parallel Transporter\""
+ BUILDPROJECT="\"Test libhdfs Read\""
+ </Set>
+ </ToolClassConfiguration>
+ <FileDefinitions>
+ <File Path="test_libhdfs_write.c"/>
+ </FileDefinitions>
+</Project>
diff --git a/import/pdclibhdfs/src/exception.c b/import/pdclibhdfs/src/exception.c
new file mode 100755
index 0000000..25f0144
--- /dev/null
+++ b/import/pdclibhdfs/src/exception.c
@@ -0,0 +1,232 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "exception.h"
+#include "hdfs.h"
+#include "jni_helper.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* Number of rows in the gExceptionInfo table. */
+#define EXCEPTION_INFO_LEN (sizeof(gExceptionInfo)/sizeof(gExceptionInfo[0]))
+/* One row of the exception lookup table used by the functions in this file. */
+struct ExceptionInfo {
+ /* Java class name in slash-separated form, e.g. "java/io/FileNotFoundException". */
+ const char * const name;
+ /* Bit mask ANDed with the caller's noPrintFlags; a non-zero result suppresses printing. */
+ int noPrintFlag;
+ /* Error number handed back to the caller when this row matches. */
+ int excErrno;
+/*
+ * Table of Java exception classes that get special handling. It is scanned
+ * linearly by getExceptionInfo() and printExceptionAndFreeV().
+ * Initializers are positional: name, noPrintFlag, excErrno.
+ * NOTE(review): most rows show only the class name in this excerpt; confirm the
+ * flag and errno initializers against the complete source file.
+ */
+static const struct ExceptionInfo gExceptionInfo[] = {
+ {
+ "java/io/FileNotFoundException",
+ },
+ {
+ "org/apache/hadoop/security/AccessControlException",
+ },
+ {
+ "org/apache/hadoop/fs/UnresolvedLinkException",
+ },
+ {
+ "org/apache/hadoop/fs/ParentNotDirectoryException",
+ },
+ {
+ "java/lang/IllegalArgumentException",
+ },
+ {
+ "java/lang/OutOfMemoryError",
+ 0,
+ }
+/*
+ * Look up an exception name in gExceptionInfo and report how to handle it.
+ *
+ * excName      name to look up; matching uses strstr(), so the first row whose
+ *              name CONTAINS excName as a substring wins (not an exact match)
+ * noPrintFlags mask ANDed with the matching row's noPrintFlag
+ * excErrno     (out) the row's excErrno, or EINTERNAL in the fallback branch
+ * shouldPrint  (out) 0 when the row's noPrintFlag intersects noPrintFlags, else 1
+ *
+ * NOTE(review): the condition guarding the else-branch below is not visible in
+ * this excerpt; presumably it is "i < EXCEPTION_INFO_LEN" - confirm.
+ */
+void getExceptionInfo ( const char *excName, int noPrintFlags,
+ int *excErrno, int *shouldPrint )
+ int i;
+ /* Stop at the first row whose class name contains excName. */
+ for (i = 0; i < EXCEPTION_INFO_LEN; i++) {
+ if (strstr(gExceptionInfo[i].name, excName)) {
+ break;
+ }
+ }
+ *shouldPrint = !(gExceptionInfo[i].noPrintFlag & noPrintFlags);
+ *excErrno = gExceptionInfo[i].excErrno;
+ } else {
+ /* Fallback: always print and report the generic internal error. */
+ *shouldPrint = 1;
+ *excErrno = EINTERNAL;
+ }
+/*
+ * Print a caller-formatted message plus the Java stack trace of an exception
+ * (unless suppressed), release the exception's local reference, and return
+ * the error number associated with the exception class.
+ *
+ * env          JNI environment
+ * exc          the exception; its local reference is destroyed before returning
+ * noPrintFlags mask ANDed with the matching table row's noPrintFlag; a non-zero
+ *              result suppresses all output
+ * fmt, ap      printf-style message written to stderr ahead of the stack trace
+ *
+ * Returns the excErrno of the table row whose name equals the exception's class
+ * name (exact strcmp match), or EINTERNAL in the fallback branch.
+ *
+ * NOTE(review): the condition guarding the else-branch after the lookup loop is
+ * not visible in this excerpt; presumably "i < EXCEPTION_INFO_LEN" - confirm.
+ */
+int printExceptionAndFreeV ( JNIEnv *env,
+ jthrowable exc,
+ int noPrintFlags,
+ const char *fmt, va_list ap )
+ int i, noPrint, excErrno;
+ char *className = NULL;
+ jstring jStr = NULL;
+ jvalue jVal;
+ jthrowable jthr;
+ const char *stackTrace;
+ jthr = classNameOfObject(exc, env, &className);
+ if (jthr) {
+ fprintf(stderr, "PrintExceptionAndFree: error determining class name "
+ "of exception.\n");
+ /* NOTE(review): the _strdup result is not checked; a NULL here would reach strcmp below. */
+ className = _strdup("(unknown)");
+ destroyLocalReference(env, jthr);
+ }
+ /* Exact class-name lookup in the handling table. */
+ for (i = 0; i < EXCEPTION_INFO_LEN; i++) {
+ if (!strcmp(gExceptionInfo[i].name, className)) {
+ break;
+ }
+ }
+ noPrint = (gExceptionInfo[i].noPrintFlag & noPrintFlags);
+ excErrno = gExceptionInfo[i].excErrno;
+ } else {
+ noPrint = 0;
+ excErrno = EINTERNAL;
+ }
+ if (!noPrint) {
+ vfprintf(stderr, fmt, ap);
+ fprintf(stderr, " error:\n");
+ /* We don't want to use ExceptionDescribe here, because that requires a
+ pending exception. Instead, use ExceptionUtils. */
+ jthr = invokeMethod(env, &jVal, STATIC, NULL,
+ "org/apache/commons/lang/exception/ExceptionUtils",
+ "getStackTrace", "(Ljava/lang/Throwable;)Ljava/lang/String;", exc);
+ if (jthr) {
+ fprintf(stderr, "(unable to get stack trace for %s exception: "
+ "ExceptionUtils::getStackTrace error.)\n", className);
+ destroyLocalReference(env, jthr);
+ } else {
+ jStr = jVal.l;
+ stackTrace = (*env)->GetStringUTFChars(env, jStr, NULL);
+ if (!stackTrace) {
+ fprintf(stderr, "(unable to get stack trace for %s exception: "
+ "GetStringUTFChars error.)\n", className);
+ } else {
+ fprintf(stderr, "%s", stackTrace);
+ (*env)->ReleaseStringUTFChars(env, jStr, stackTrace);
+ }
+ }
+ }
+ /* Cleanup runs whether or not anything was printed. */
+ destroyLocalReference(env, jStr);
+ destroyLocalReference(env, exc);
+ free(className);
+ return excErrno;
+/*
+ * Variadic front end for printExceptionAndFreeV(): packs the trailing
+ * arguments into a va_list and forwards everything unchanged.
+ *
+ * Returns whatever printExceptionAndFreeV() returns.
+ */
+int printExceptionAndFree ( JNIEnv *env,
+                            jthrowable exc,
+                            int noPrintFlags,
+                            const char *fmt, ... )
+{
+    va_list args;
+    int excErrno;
+
+    va_start(args, fmt);
+    excErrno = printExceptionAndFreeV(env, exc, noPrintFlags, fmt, args);
+    va_end(args);
+    return excErrno;
+}
+/*
+ * Handle whatever exception is currently pending on this JNI environment.
+ *
+ * With a pending exception: clear it, then print and free it through
+ * printExceptionAndFreeV() and return that function's result.
+ * With none pending: print the formatted message followed by
+ * " error: (no exception)" and return 0.
+ */
+int printPendingExceptionAndFree ( JNIEnv *env,
+                                   int noPrintFlags,
+                                   const char *fmt, ... )
+{
+    va_list args;
+    int excErrno;
+    jthrowable pending;
+
+    pending = (*env)->ExceptionOccurred(env);
+    va_start(args, fmt);
+    if (pending) {
+        (*env)->ExceptionClear(env);
+        excErrno = printExceptionAndFreeV(env, pending, noPrintFlags, fmt, args);
+    } else {
+        vfprintf(stderr, fmt, args);
+        fprintf(stderr, " error: (no exception)");
+        excErrno = 0;
+    }
+    va_end(args);
+    return excErrno;
+}
+/*
+ * Fetch the pending exception, if any, and clear the pending state.
+ *
+ * Returns the exception reference, or NULL when nothing was pending (in which
+ * case ExceptionClear is not called).
+ */
+jthrowable getPendingExceptionAndClear ( JNIEnv *env )
+{
+    jthrowable pending = (*env)->ExceptionOccurred(env);
+
+    if (pending) {
+        (*env)->ExceptionClear(env);
+    }
+    return pending;
+}
+/*
+ * Create a java.lang.RuntimeException whose message is built printf-style.
+ *
+ * env  JNI environment
+ * fmt  printf-style format followed by its arguments; the formatted message is
+ *      truncated to fit a 512-byte buffer
+ *
+ * Returns the new exception object, or - if creating the message string or the
+ * exception itself failed - the exception raised by that failure.
+ */
+jthrowable newRuntimeError ( JNIEnv *env, const char *fmt, ... )
+{
+    char buf[512];
+    jobject out, exc;
+    jstring jstr;
+    va_list ap;
+
+    va_start(ap, fmt);
+    vsnprintf_s(buf, sizeof(buf), sizeof(buf)-1, fmt, ap);
+    va_end(ap);
+    jstr = (*env)->NewStringUTF(env, buf);
+    if (!jstr) {
+        /* We got an out of memory exception rather than a RuntimeException.
+           Too bad... */
+        return getPendingExceptionAndClear(env);
+    }
+    /* JNI class lookups need the fully qualified, slash-separated class name,
+       and object types in a method signature are written "L<class>;".  The
+       previous "RuntimeException" / "(java/lang/String;)V" pair satisfied
+       neither rule, so the constructor lookup could not succeed. */
+    exc = constructNewObjectOfClass(env, &out, "java/lang/RuntimeException",
+        "(Ljava/lang/String;)V", jstr);
+    (*env)->DeleteLocalRef(env, jstr);
+    /* Again, we'll either get an out of memory exception or the
+       RuntimeException we wanted. */
+    return (exc) ? exc : out;
+}
diff --git a/import/pdclibhdfs/src/hdfs.c b/import/pdclibhdfs/src/hdfs.c
new file mode 100755
index 0000000..c5dde87
--- /dev/null
+++ b/import/pdclibhdfs/src/hdfs.c
@@ -0,0 +1,3016 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "exception.h"
+#include "hdfs.h"
+#include "jni_helper.h"
+#include <stdio.h>
+#include <string.h>
+/* Some frequently used Java paths */
+#define HADOOP_CONF "org/apache/hadoop/conf/Configuration"
+#define HADOOP_PATH "org/apache/hadoop/fs/Path"
+#define HADOOP_LOCALFS "org/apache/hadoop/fs/LocalFileSystem"
+#define HADOOP_FS "org/apache/hadoop/fs/FileSystem"
+#define HADOOP_FSSTATUS "org/apache/hadoop/fs/FsStatus"
+#define HADOOP_BLK_LOC "org/apache/hadoop/fs/BlockLocation"
+#define HADOOP_DFS "org/apache/hadoop/hdfs/DistributedFileSystem"
+#define HADOOP_ISTRM "org/apache/hadoop/fs/FSDataInputStream"
+#define HADOOP_OSTRM "org/apache/hadoop/fs/FSDataOutputStream"
+#define HADOOP_STAT "org/apache/hadoop/fs/FileStatus"
+#define HADOOP_FSPERM "org/apache/hadoop/fs/permission/FsPermission"
+#define JAVA_NET_ISA "java/net/InetSocketAddress"
+#define JAVA_NET_URI "java/net/URI"
+#define JAVA_STRING "java/lang/String"
+#define JAVA_VOID "V"
+/* Macros for constructing method signatures */
+#define JPARAM(X) "L" X ";"
+#define JARRPARAM(X) "[L" X ";"
+#define JMETHOD1(X, R) "(" X ")" R
+#define JMETHOD2(X, Y, R) "(" X Y ")" R
+#define JMETHOD3(X, Y, Z, R) "(" X Y Z")" R
+#define KERBEROS_TICKET_CACHE_PATH "hadoop.security.kerberos.ticket.cache.path"
+/* Bit fields for hdfsFile_internal flags */
+static tSize readDirect(hdfsFS fs, hdfsFile f, void* buffer, tSize length);
+static void hdfsFreeFileInfoEntry(hdfsFileInfo *hdfsFileInfo);
+/**
+ * hdfsJniEnv: A wrapper struct to be used as 'value'
+ * while saving thread -> JNIEnv* mappings
+ */
+typedef struct
+ /* The JNI environment pointer being wrapped. */
+ JNIEnv* env;
+} hdfsJniEnv;
+/**
+ * The C equivalent of org.apache.hadoop.fs.FSData(Input|Output)Stream:
+ * tags an open file handle as an input stream (TYPEIN) or an output
+ * stream (TYPEOUT).
+ */
+enum hdfsStreamType
+ TYPEIN = 1,
+ TYPEOUT = 2,
+/**
+ * The 'file-handle' to a file in hdfs.
+ */
+struct hdfsFile_internal {
+ /* JNI global reference to the Java stream object (created with NewGlobalRef in hdfsOpenFile). */
+ void* file;
+ /* Whether the handle wraps an input or an output stream. */
+ enum hdfsStreamType type;
+ /* Bit flags; HDFS_FILE_SUPPORTS_DIRECT_READ is tested by hdfsFileUsesDirectRead(). */
+ int flags;
+/* One key/value configuration override, kept in a singly linked list on the builder. */
+struct hdfsBuilderConfOpt {
+ /* Next option in the list, or NULL at the end. */
+ struct hdfsBuilderConfOpt *next;
+ /* Configuration key; the pointer is stored as given, the string is not copied. */
+ const char *key;
+ /* Configuration value; the pointer is stored as given, the string is not copied. */
+ const char *val;
+/* Connection parameters collected before calling hdfsBuilderConnect(). */
+struct hdfsBuilder {
+ /* Non-zero to request a new FileSystem instance (set by hdfsBuilderSetForceNewInstance). */
+ int forceNewInstance;
+ /* NameNode host or URI; NULL selects the local filesystem, "default" the configured default URI. */
+ const char *nn;
+ /* NameNode port; 0 means no port suffix is appended to the URI. */
+ tPort port;
+ /* Kerberos ticket cache path, or NULL to leave the configuration untouched. */
+ const char *kerbTicketCachePath;
+ /* User name to connect as; may be NULL. */
+ const char *userName;
+ /* Head of the linked list of configuration overrides (most recently added first). */
+ struct hdfsBuilderConfOpt *opts;
+/**
+ * Tell whether a file handle was opened as an input stream.
+ *
+ * @param file The open file handle (dereferenced without a NULL check)
+ * @return 1 when the handle's type is TYPEIN, 0 otherwise
+ */
+int hdfsFileIsOpenForRead (hdfsFile file)
+{
+    return (TYPEIN == file->type) ? 1 : 0;
+}
+/*
+ * Fetch read statistics for an input stream into a newly allocated
+ * hdfsReadStatistics structure.
+ *
+ * file   an open file handle; must be of type TYPEIN (dereferenced without a NULL check)
+ * stats  (out) receives the malloc'ed statistics on success; release with
+ *        hdfsFileFreeReadStatistics()
+ *
+ * Returns 0 on success, or -1 with errno set (EINTERNAL when no JNI environment
+ * is available, EINVAL for a non-input handle, ENOMEM, or the errno mapped from
+ * a Java exception).
+ *
+ * NOTE(review): the target label of the "goto done" jumps is not visible in
+ * this excerpt; presumably it sits just before the cleanup calls near the end.
+ */
+int hdfsFileGetReadStatistics ( hdfsFile file,
+ struct hdfsReadStatistics **stats )
+ jthrowable jthr;
+ jobject readStats = NULL;
+ jvalue jVal;
+ struct hdfsReadStatistics *s = NULL;
+ int ret;
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ if (file->type != TYPEIN) {
+ ret = EINVAL;
+ goto done;
+ }
+ /* Ask the Java stream for its ReadStatistics object. */
+ jthr = invokeMethod(env, &jVal, INSTANCE, file->file,
+ "org/apache/hadoop/hdfs/client/HdfsDataInputStream",
+ "getReadStatistics",
+ "()Lorg/apache/hadoop/hdfs/DFSInputStream$ReadStatistics;");
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsFileGetReadStatistics: getReadStatistics failed");
+ goto done;
+ }
+ readStats = jVal.l;
+ s = malloc(sizeof(struct hdfsReadStatistics));
+ if (!s) {
+ ret = ENOMEM;
+ goto done;
+ }
+ /* Copy each counter out of the Java object, one getter call per field. */
+ jthr = invokeMethod(env, &jVal, INSTANCE, readStats,
+ "org/apache/hadoop/hdfs/DFSInputStream$ReadStatistics",
+ "getTotalBytesRead", "()J");
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsFileGetReadStatistics: getTotalBytesRead failed");
+ goto done;
+ }
+ s->totalBytesRead = jVal.j;
+ jthr = invokeMethod(env, &jVal, INSTANCE, readStats,
+ "org/apache/hadoop/hdfs/DFSInputStream$ReadStatistics",
+ "getTotalLocalBytesRead", "()J");
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsFileGetReadStatistics: getTotalLocalBytesRead failed");
+ goto done;
+ }
+ s->totalLocalBytesRead = jVal.j;
+ jthr = invokeMethod(env, &jVal, INSTANCE, readStats,
+ "org/apache/hadoop/hdfs/DFSInputStream$ReadStatistics",
+ "getTotalShortCircuitBytesRead", "()J");
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsFileGetReadStatistics: getTotalShortCircuitBytesRead failed");
+ goto done;
+ }
+ s->totalShortCircuitBytesRead = jVal.j;
+ /* Hand ownership to the caller; clearing s makes the free() below a no-op on success. */
+ *stats = s;
+ s = NULL;
+ ret = 0;
+ destroyLocalReference(env, readStats);
+ free(s);
+ if (ret) {
+ errno = ret;
+ return -1;
+ }
+ return 0;
+/**
+ * Compute how many bytes were read remotely.
+ *
+ * @param stats Statistics previously filled in for an input stream
+ * @return totalBytesRead minus totalLocalBytesRead
+ */
+int64_t hdfsReadStatisticsGetRemoteBytesRead(
+            const struct hdfsReadStatistics *stats)
+{
+    int64_t remote = stats->totalBytesRead - stats->totalLocalBytesRead;
+
+    return remote;
+}
+/**
+ * Release a statistics structure obtained from hdfsFileGetReadStatistics().
+ *
+ * @param stats The structure to free; passed straight to free()
+ */
+void hdfsFileFreeReadStatistics(struct hdfsReadStatistics *stats)
+{
+    free(stats);
+}
+/**
+ * Tell whether a file handle was opened as an output stream.
+ *
+ * @param file The open file handle (dereferenced without a NULL check)
+ * @return 1 when the handle's type is TYPEOUT, 0 otherwise
+ */
+int hdfsFileIsOpenForWrite(hdfsFile file)
+{
+    return (TYPEOUT == file->type) ? 1 : 0;
+}
+/**
+ * Report whether the direct-read flag is set on a file handle.
+ *
+ * @param file The open file handle (dereferenced without a NULL check)
+ * @return 1 when HDFS_FILE_SUPPORTS_DIRECT_READ is set in file->flags, else 0
+ */
+int hdfsFileUsesDirectRead(hdfsFile file)
+{
+    return ((file->flags & HDFS_FILE_SUPPORTS_DIRECT_READ) != 0);
+}
+void hdfsFileDisableDirectRead(hdfsFile file)
+/**
+ * Helper that builds an org.apache.hadoop.fs.Path object from a C string.
+ *
+ * @param env  The JNIEnv pointer.
+ * @param path The file path to wrap.
+ * @param out  (out param) receives the new Path object; written only on success.
+ *
+ * @return NULL on success; otherwise the exception raised while creating
+ *         the Java string or the Path object.
+ */
+static jthrowable constructNewObjectOfPath(JNIEnv *env, const char *path,
+                                           jobject *out)
+{
+    jstring jPathStr;
+    jobject pathObj;
+    jthrowable err;
+
+    /* First turn the C string into a java.lang.String ... */
+    err = newJavaStr(env, path, &jPathStr);
+    if (err) {
+        return err;
+    }
+    /* ... then pass it to the Path(String) constructor. */
+    err = constructNewObjectOfClass(env, &pathObj, "org/apache/hadoop/fs/Path",
+                                    "(Ljava/lang/String;)V", jPathStr);
+    destroyLocalReference(env, jPathStr);
+    if (!err) {
+        *out = pathObj;
+    }
+    return err;
+}
+/**
+ * Set a configuration value.
+ *
+ * @param env The JNI environment
+ * @param jConfiguration The configuration object to modify
+ * @param key The key to modify
+ * @param value The value to set the key to
+ *
+ * @return NULL on success; exception otherwise
+ */
+/* NOTE(review): the class name, method name and signature arguments of the
+   invokeMethod call, and the target label of "goto done", are not visible in
+   this excerpt - confirm against the complete source. */
+static jthrowable hadoopConfSetStr(JNIEnv *env, jobject jConfiguration,
+ const char *key, const char *value)
+ jthrowable jthr;
+ jstring jkey = NULL, jvalue = NULL;
+ /* Convert both C strings to Java strings before the call. */
+ jthr = newJavaStr(env, key, &jkey);
+ if (jthr)
+ goto done;
+ jthr = newJavaStr(env, value, &jvalue);
+ if (jthr)
+ goto done;
+ jthr = invokeMethod(env, NULL, INSTANCE, jConfiguration,
+ jkey, jvalue);
+ /* Both local references are released whether or not the call succeeded. */
+ destroyLocalReference(env, jkey);
+ destroyLocalReference(env, jvalue);
+ return jthr;
+/**
+ * Get a configuration value.
+ *
+ * @param env The JNI environment
+ * @param jConfiguration The configuration object to read
+ * @param key The key to fetch
+ * @param value The value to be returned
+ *
+ * @return NULL on success; exception otherwise
+ */
+/* NOTE(review): the remainder of the invokeMethod call (class, method,
+   signature, arguments) and the target label of "goto done" are not visible in
+   this excerpt - confirm against the complete source. */
+static jthrowable hadoopConfGetStr(JNIEnv *env, jobject jConfiguration,
+ const char *key, char **val)
+ jthrowable jthr;
+ jvalue jVal;
+ jstring jkey = NULL, jRet = NULL;
+ jthr = newJavaStr(env, key, &jkey);
+ if (jthr)
+ goto done;
+ jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration,
+ if (jthr)
+ goto done;
+ jRet = jVal.l;
+ /* Convert the returned Java string into a C string stored in *val. */
+ jthr = newCStr(env, jRet, val);
+ destroyLocalReference(env, jkey);
+ destroyLocalReference(env, jRet);
+ return jthr;
+/*
+ * Read a string value from a freshly constructed Hadoop Configuration.
+ *
+ * key  configuration key to look up
+ * val  (out) receives the value as produced by hadoopConfGetStr(); release it
+ *      with hdfsConfStrFree()
+ *
+ * Returns 0 on success. On failure it returns the error number itself (not -1)
+ * and also stores it in errno.
+ *
+ * NOTE(review): the target label of the "goto done" jumps is not visible in
+ * this excerpt; presumably it sits just before the cleanup call.
+ */
+int hdfsConfGetStr(const char *key, char **val)
+ JNIEnv *env;
+ int ret;
+ jthrowable jthr;
+ jobject jConfiguration = NULL;
+ env = getJNIEnv();
+ if (env == NULL) {
+ ret = EINTERNAL;
+ goto done;
+ }
+ /* jConfiguration = new Configuration(); */
+ jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V");
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsConfGetStr(%s): new Configuration", key);
+ goto done;
+ }
+ jthr = hadoopConfGetStr(env, jConfiguration, key, val);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsConfGetStr(%s): hadoopConfGetStr", key);
+ goto done;
+ }
+ ret = 0;
+ destroyLocalReference(env, jConfiguration);
+ if (ret)
+ errno = ret;
+ return ret;
+/**
+ * Release a string returned through hdfsConfGetStr().
+ *
+ * @param val The string to free; passed straight to free()
+ */
+void hdfsConfStrFree(char *val)
+{
+    free(val);
+}
+/*
+ * Read an integer from a Configuration object.
+ *
+ * env             the JNI environment
+ * jConfiguration  the configuration object to read
+ * key             the key to fetch
+ * val             (in/out) the current *val is passed into the Java call and is
+ *                 overwritten with the call's result on success
+ *
+ * Returns NULL on success; the exception otherwise.
+ *
+ * NOTE(review): the class name, method name and signature arguments of the
+ * invokeMethod call are not visible in this excerpt; the incoming *val is
+ * presumably the default returned when the key is unset - confirm.
+ */
+static jthrowable hadoopConfGetInt(JNIEnv *env, jobject jConfiguration,
+ const char *key, int32_t *val)
+ jthrowable jthr = NULL;
+ jvalue jVal;
+ jstring jkey = NULL;
+ jthr = newJavaStr(env, key, &jkey);
+ if (jthr)
+ return jthr;
+ jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration,
+ jkey, (jint)(*val));
+ destroyLocalReference(env, jkey);
+ if (jthr)
+ return jthr;
+ *val = jVal.i;
+ return NULL;
+/*
+ * Read an integer value from a freshly constructed Hadoop Configuration.
+ *
+ * key  configuration key to look up
+ * val  (in/out) forwarded to hadoopConfGetInt(), which reads the incoming value
+ *      and overwrites it with the result
+ *
+ * Returns 0 on success. On failure it returns the error number itself (not -1)
+ * and also stores it in errno.
+ *
+ * NOTE(review): the target label of the "goto done" jumps is not visible in
+ * this excerpt; presumably it sits just before the cleanup call.
+ */
+int hdfsConfGetInt(const char *key, int32_t *val)
+ JNIEnv *env;
+ int ret;
+ jobject jConfiguration = NULL;
+ jthrowable jthr;
+ env = getJNIEnv();
+ if (env == NULL) {
+ ret = EINTERNAL;
+ goto done;
+ }
+ /* jConfiguration = new Configuration(); */
+ jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V");
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsConfGetInt(%s): new Configuration", key);
+ goto done;
+ }
+ jthr = hadoopConfGetInt(env, jConfiguration, key, val);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsConfGetInt(%s): hadoopConfGetInt", key);
+ goto done;
+ }
+ ret = 0;
+ destroyLocalReference(env, jConfiguration);
+ if (ret)
+ errno = ret;
+ return ret;
+/**
+ * Allocate a zero-initialised connection builder.
+ *
+ * @return The new builder, or NULL with errno set to ENOMEM when the
+ *         allocation fails.
+ */
+struct hdfsBuilder *hdfsNewBuilder(void)
+{
+    struct hdfsBuilder *builder = calloc(1, sizeof(*builder));
+
+    if (builder == NULL) {
+        errno = ENOMEM;
+    }
+    return builder;
+}
+/**
+ * Queue a configuration override on a builder.
+ *
+ * The new option is pushed onto the front of the builder's option list. The
+ * key and value pointers are stored as given; the strings are not copied.
+ *
+ * @param bld The builder to modify
+ * @param key Configuration key
+ * @param val Configuration value
+ * @return 0 on success, -ENOMEM when the list node cannot be allocated
+ */
+int hdfsBuilderConfSetStr(struct hdfsBuilder *bld, const char *key,
+                          const char *val)
+{
+    struct hdfsBuilderConfOpt *node = calloc(1, sizeof(*node));
+
+    if (node == NULL) {
+        return -ENOMEM;
+    }
+    node->key = key;
+    node->val = val;
+    node->next = bld->opts;
+    bld->opts = node;
+    return 0;
+}
+/**
+ * Free a builder together with every option node queued on it.
+ *
+ * Only the list nodes and the builder itself are freed; the key and value
+ * strings the nodes point at are left untouched.
+ *
+ * @param bld The builder to free (dereferenced without a NULL check)
+ */
+void hdfsFreeBuilder(struct hdfsBuilder *bld)
+{
+    struct hdfsBuilderConfOpt *node = bld->opts;
+
+    while (node != NULL) {
+        struct hdfsBuilderConfOpt *following = node->next;
+
+        free(node);
+        node = following;
+    }
+    free(bld);
+}
+/** Mark the builder so that connecting requests a new FileSystem instance. */
+void hdfsBuilderSetForceNewInstance(struct hdfsBuilder *bld)
+{
+    bld->forceNewInstance = 1;
+}
+
+/** Record the NameNode host or URI; the pointer is stored, the string is not copied. */
+void hdfsBuilderSetNameNode(struct hdfsBuilder *bld, const char *nn)
+{
+    bld->nn = nn;
+}
+
+/** Record the NameNode port. */
+void hdfsBuilderSetNameNodePort(struct hdfsBuilder *bld, tPort port)
+{
+    bld->port = port;
+}
+
+/** Record the user name to connect as; the pointer is stored, the string is not copied. */
+void hdfsBuilderSetUserName(struct hdfsBuilder *bld, const char *userName)
+{
+    bld->userName = userName;
+}
+
+/** Record the Kerberos ticket cache path; the pointer is stored, the string is not copied. */
+void hdfsBuilderSetKerbTicketCachePath(struct hdfsBuilder *bld,
+                                       const char *kerbTicketCachePath)
+{
+    bld->kerbTicketCachePath = kerbTicketCachePath;
+}
+/**
+ * Connect to a filesystem using only a host and a port.
+ *
+ * @param host NameNode host or URI, forwarded to hdfsBuilderSetNameNode()
+ * @param port NameNode port, forwarded to hdfsBuilderSetNameNodePort()
+ * @return The result of hdfsBuilderConnect(), or NULL when no builder could
+ *         be allocated
+ */
+hdfsFS hdfsConnect(const char* host, tPort port)
+{
+    struct hdfsBuilder *builder = hdfsNewBuilder();
+
+    if (builder == NULL) {
+        return NULL;
+    }
+    hdfsBuilderSetNameNodePort(builder, port);
+    hdfsBuilderSetNameNode(builder, host);
+    return hdfsBuilderConnect(builder);
+}
+/**
+ * Connect using a host and a port, always requesting a new FileSystem handle.
+ *
+ * @param host NameNode host or URI
+ * @param port NameNode port
+ * @return The result of hdfsBuilderConnect(), or NULL when no builder could
+ *         be allocated
+ */
+hdfsFS hdfsConnectNewInstance(const char* host, tPort port)
+{
+    struct hdfsBuilder *builder = hdfsNewBuilder();
+
+    if (builder == NULL) {
+        return NULL;
+    }
+    hdfsBuilderSetForceNewInstance(builder);
+    hdfsBuilderSetNameNodePort(builder, port);
+    hdfsBuilderSetNameNode(builder, host);
+    return hdfsBuilderConnect(builder);
+}
+/**
+ * Connect using a host, a port and an explicit user name.
+ *
+ * @param host NameNode host or URI
+ * @param port NameNode port
+ * @param user User name, forwarded to hdfsBuilderSetUserName()
+ * @return The result of hdfsBuilderConnect(), or NULL when no builder could
+ *         be allocated
+ */
+hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user)
+{
+    struct hdfsBuilder *builder = hdfsNewBuilder();
+
+    if (builder == NULL) {
+        return NULL;
+    }
+    hdfsBuilderSetUserName(builder, user);
+    hdfsBuilderSetNameNodePort(builder, port);
+    hdfsBuilderSetNameNode(builder, host);
+    return hdfsBuilderConnect(builder);
+}
+/**
+ * Connect using a host, a port and an explicit user name, always requesting
+ * a new FileSystem handle.
+ *
+ * @param host NameNode host or URI
+ * @param port NameNode port
+ * @param user User name
+ * @return The result of hdfsBuilderConnect(), or NULL when no builder could
+ *         be allocated
+ */
+hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port,
+                                    const char *user)
+{
+    struct hdfsBuilder *builder = hdfsNewBuilder();
+
+    if (builder == NULL) {
+        return NULL;
+    }
+    hdfsBuilderSetUserName(builder, user);
+    hdfsBuilderSetForceNewInstance(builder);
+    hdfsBuilderSetNameNodePort(builder, port);
+    hdfsBuilderSetNameNode(builder, host);
+    return hdfsBuilderConnect(builder);
+}
+/**
+ * Work out the URI to connect to from a builder's NameNode and port.
+ *
+ * "hdfs://" is prepended when the NameNode string contains no "://".
+ * When the builder's port is non-zero, ":<port>" is appended - unless the text
+ * after the last ':' of the NameNode string consists only of digits, which is
+ * treated as "a port is already present" and rejected.
+ *
+ * @param bld The hdfs builder object
+ * @param uri (out param) dynamically allocated string holding the effective
+ *            URI; written only on success
+ *
+ * @return 0 on success; EINVAL for a missing NameNode or a duplicate port;
+ *         ENOMEM when the result cannot be allocated
+ */
+static int calcEffectiveURI ( struct hdfsBuilder *bld, char ** uri )
+{
+    const char *prefix;
+    const char *colon;
+    char portSuffix[64];
+    char *result;
+    size_t total;
+
+    if (bld->nn == NULL) {
+        return EINVAL;
+    }
+    prefix = (strstr(bld->nn, "://") != NULL) ? "" : "hdfs://";
+    portSuffix[0] = '\0';
+    if (bld->port != 0) {
+        colon = strrchr(bld->nn, ':');
+        if (colon != NULL) {
+            const char *tail = colon + 1;
+
+            /* An all-digit tail means the NameNode string already ends in a port. */
+            if (strspn(tail, "0123456789") == strlen(tail)) {
+                fprintf(stderr, "port %d was given, but URI '%s' already "
+                        "contains a port!\n", bld->port, bld->nn);
+                return EINVAL;
+            }
+        }
+        sprintf_s(portSuffix, sizeof(portSuffix), ":%d", bld->port);
+    }
+    total = strlen(prefix) + strlen(bld->nn) + strlen(portSuffix);
+    result = malloc(total + 1);
+    if (result == NULL) {
+        fprintf(stderr, "calcEffectiveURI: out of memory");
+        return ENOMEM;
+    }
+    sprintf_s(result, total + 1, "%s%s%s", prefix, bld->nn, portSuffix);
+    *uri = result;
+    return 0;
+}
+/**
+ * Make a possibly-NULL string safe to print.
+ *
+ * @param str Any string pointer, or NULL
+ * @return str itself when non-NULL, otherwise the literal "(NULL)"
+ */
+static const char *maybeNull(const char *str)
+{
+    if (str == NULL) {
+        return "(NULL)";
+    }
+    return str;
+}
+/**
+ * Render a builder's settings into a caller-supplied buffer for log messages.
+ *
+ * NULL string fields are shown as "(NULL)".
+ *
+ * @param bld    The builder to describe
+ * @param buf    Destination buffer
+ * @param bufLen Size of buf in bytes
+ * @return buf
+ */
+static const char *hdfsBuilderToStr(const struct hdfsBuilder *bld,
+                                    char *buf, size_t bufLen)
+{
+    const char *nameNode = maybeNull(bld->nn);
+    const char *cachePath = maybeNull(bld->kerbTicketCachePath);
+    const char *user = maybeNull(bld->userName);
+
+    sprintf_s(buf, bufLen, "forceNewInstance=%d, nn=%s, port=%d, "
+              "kerbTicketCachePath=%s, userName=%s",
+              bld->forceNewInstance, nameNode, bld->port, cachePath, user);
+    return buf;
+}
+/*
+ * Connect to a filesystem as described by a builder.
+ *
+ * A new Configuration is created and every queued option is applied to it.
+ * With bld->nn == NULL a local filesystem is obtained; otherwise a URI is taken
+ * from FileSystem.getDefaultUri (nn == "default") or computed by
+ * calcEffectiveURI(), and a FileSystem is requested for that URI, the
+ * configuration and the user name.
+ *
+ * bld  the builder; hdfsFreeBuilder(bld) is called in the cleanup section, so
+ *      the caller must not use it afterwards
+ *
+ * Returns a global reference to the FileSystem object cast to hdfsFS, or NULL
+ * with errno set when ret is non-zero.
+ *
+ * NOTE(review): several invokeMethod calls below have lost their method-name /
+ * signature argument lines in this excerpt, and the target label of the
+ * "goto done" jumps is not visible; presumably the label sits just before the
+ * "Release unnecessary local references" section - confirm.
+ */
+hdfsFS hdfsBuilderConnect ( struct hdfsBuilder *bld )
+ JNIEnv *env = 0;
+ jobject jConfiguration = NULL, jFS = NULL, jURI = NULL, jCachePath = NULL;
+ jstring jURIString = NULL, jUserString = NULL;
+ jvalue jVal;
+ jthrowable jthr = NULL;
+ char *cURI = 0, buf[512];
+ int ret;
+ jobject jRet = NULL;
+ struct hdfsBuilderConfOpt *opt;
+ /*Get the JNIEnv* corresponding to current thread */
+ env = getJNIEnv();
+ if (env == NULL) {
+ ret = EINTERNAL;
+ goto done;
+ }
+ /* jConfiguration = new Configuration(); */
+ jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V");
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ /* set configuration value */
+ for (opt = bld->opts; opt; opt = opt->next) {
+ jthr = hadoopConfSetStr(env, jConfiguration, opt->key, opt->val);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s): error setting conf '%s' to '%s'",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)), opt->key, opt->val);
+ goto done;
+ }
+ }
+ /*Check what type of FileSystem the caller wants... */
+ if (bld->nn == NULL) {
+ /* Get a local filesystem. */
+ if (bld->forceNewInstance) {
+ /* fs = FileSytem#newInstanceLocal(conf); */
+ jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS,
+ "newInstanceLocal", JMETHOD1(JPARAM(HADOOP_CONF),
+ JPARAM(HADOOP_LOCALFS)), jConfiguration);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ jFS = jVal.l;
+ } else {
+ /* fs = FileSytem#getLocal(conf); */
+ jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "getLocal",
+ jConfiguration);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ jFS = jVal.l;
+ }
+ } else {
+ if (!strcmp(bld->nn, "default")) {
+ /* jURI = FileSystem.getDefaultUri(conf) */
+ jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS,
+ "getDefaultUri",
+ "(Lorg/apache/hadoop/conf/Configuration;)Ljava/net/URI;",
+ jConfiguration);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ jURI = jVal.l;
+ } else {
+ /* fs = FileSystem#get(URI, conf, ugi); */
+ ret = calcEffectiveURI(bld, &cURI);
+ if (ret)
+ goto done;
+ jthr = newJavaStr(env, cURI, &jURIString);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ jthr = invokeMethod(env, &jVal, STATIC, NULL, JAVA_NET_URI,
+ "create", "(Ljava/lang/String;)Ljava/net/URI;",
+ jURIString);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ jURI = jVal.l;
+ }
+ /* Apply the Kerberos ticket cache path, when one was given, before connecting. */
+ if (bld->kerbTicketCachePath) {
+ jthr = hadoopConfSetStr(env, jConfiguration,
+ KERBEROS_TICKET_CACHE_PATH, bld->kerbTicketCachePath);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ }
+ /* NOTE(review): bld->userName may be NULL here; presumably newJavaStr accepts that - confirm. */
+ jthr = newJavaStr(env, bld->userName, &jUserString);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ if (bld->forceNewInstance) {
+ jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS,
+ jURI, jConfiguration, jUserString);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ jFS = jVal.l;
+ } else {
+ jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "get",
+ jURI, jConfiguration, jUserString);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ jFS = jVal.l;
+ }
+ }
+ /* Promote the FileSystem to a global reference; that is what the caller receives. */
+ jRet = (*env)->NewGlobalRef(env, jFS);
+ if (!jRet) {
+ /* NOTE(review): printPendingExceptionAndFree returns 0 when no exception is
+ pending, which would make this path return NULL without setting errno. */
+ ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+ "hdfsBuilderConnect(%s)",
+ hdfsBuilderToStr(bld, buf, sizeof(buf)));
+ goto done;
+ }
+ ret = 0;
+ /* Release unnecessary local references */
+ destroyLocalReference(env, jConfiguration);
+ destroyLocalReference(env, jFS);
+ destroyLocalReference(env, jURI);
+ destroyLocalReference(env, jCachePath);
+ destroyLocalReference(env, jURIString);
+ destroyLocalReference(env, jUserString);
+ free(cURI);
+ /* The builder is consumed by this call, on success and on failure alike. */
+ hdfsFreeBuilder(bld);
+ if (ret) {
+ errno = ret;
+ return NULL;
+ }
+ return (hdfsFS)jRet;
+/**
+ * Close a filesystem handle and drop its global reference.
+ *
+ * The global reference is deleted even when FileSystem#close raised an
+ * exception.
+ *
+ * @param fs The handle returned by one of the connect functions
+ * @return 0 on success; -1 with errno set to EINTERNAL (no JNI environment),
+ *         EBADF (NULL handle) or the errno mapped from the close exception
+ */
+int hdfsDisconnect(hdfsFS fs)
+{
+    /* JAVA EQUIVALENT: fs.close() */
+    JNIEnv *env;
+    jobject fsRef = (jobject) fs;
+    jthrowable err;
+    int status = 0;
+
+    /* Get the JNIEnv* corresponding to the current thread */
+    env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+    /* Sanity check */
+    if (fs == NULL) {
+        errno = EBADF;
+        return -1;
+    }
+    err = invokeMethod(env, NULL, INSTANCE, fsRef, HADOOP_FS,
+                       "close", "()V");
+    if (err) {
+        status = printExceptionAndFree(env, err, PRINT_EXC_ALL,
+                                       "hdfsDisconnect: FileSystem#close");
+    }
+    (*env)->DeleteGlobalRef(env, fsRef);
+    if (status != 0) {
+        errno = status;
+        return -1;
+    }
+    return 0;
+}
+/**
+ * Ask a FileSystem object for the default block size at a path.
+ *
+ * @param env   The Java env
+ * @param jFS   The FileSystem object
+ * @param jPath The path to query
+ * @param out   (out param) the default block size; written only on success
+ *
+ * @return NULL on success; or the exception
+ */
+static jthrowable getDefaultBlockSize(JNIEnv *env, jobject jFS,
+                                      jobject jPath, jlong *out)
+{
+    jvalue result;
+    jthrowable err;
+
+    err = invokeMethod(env, &result, INSTANCE, jFS, HADOOP_FS,
+                       "getDefaultBlockSize", JMETHOD1(JPARAM(HADOOP_PATH), "J"), jPath);
+    if (!err) {
+        *out = result.j;
+    }
+    return err;
+}
+/*
+ * Open a file on an HDFS filesystem for reading, writing or appending.
+ *
+ * fs          filesystem handle
+ * path        path of the file
+ * flags       open(2)-style flags; the access mode must be O_RDONLY or
+ *             O_WRONLY (O_RDWR fails with ENOTSUP, anything else with EINVAL);
+ *             O_APPEND selects FileSystem#append, otherwise a write uses
+ *             FileSystem#create
+ * bufferSize  0 means: look up "io.file.buffer.size" (default 4096)
+ * replication 0 means: look up "dfs.replication" (default 1); only consulted
+ *             for non-append writes
+ * blockSize   0 means: use getDefaultBlockSize(); only used for non-append writes
+ *
+ * Returns a calloc'ed handle holding a global reference to the Java stream, or
+ * NULL with errno set.
+ *
+ * NOTE(review): this excerpt has lost several lines - the assignments to
+ * "signature", the continuation of one printExceptionAndFree call, the "#endif"
+ * matching "#if 0", and the "done:" label - confirm against the full source.
+ */
+hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
+ int bufferSize, short replication, tSize blockSize)
+ /*
+ File f = new File(path);
+ FSData{Input|Output}Stream f{is|os} = fs.create(f);
+ return f{is|os};
+ */
+ /* Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ int accmode = flags & O_ACCMODE;
+ jstring jStrBufferSize = NULL, jStrReplication = NULL;
+ jobject jConfiguration = NULL, jPath = NULL, jFile = NULL;
+ jobject jFS = (jobject)fs;
+ jthrowable jthr;
+ jvalue jVal;
+ hdfsFile file = NULL;
+ int ret;
+ const char* method = NULL;
+ const char* signature = NULL;
+ /* NOTE(review): in the visible text these two are assigned only inside the
+ "!bufferSize" / "!replication" branches below; confirm they are initialised
+ from the bufferSize / replication parameters otherwise. */
+ jint jBufferSize;
+ jshort jReplication;
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return NULL;
+ }
+ if (accmode == O_RDONLY || accmode == O_WRONLY) {
+ /* yay */
+ } else if (accmode == O_RDWR) {
+ fprintf(stderr, "ERROR: cannot open an hdfs file in O_RDWR mode\n");
+ errno = ENOTSUP;
+ return NULL;
+ } else {
+ fprintf(stderr, "ERROR: cannot open an hdfs file in mode 0x%x\n", accmode);
+ errno = EINVAL;
+ return NULL;
+ }
+ if ((flags & O_CREAT) && (flags & O_EXCL)) {
+ fprintf(stderr, "WARN: hdfs does not truly support O_CREATE && O_EXCL\n");
+ }
+ /* The hadoop java api/signature */
+ if (accmode == O_RDONLY) {
+ method = "open";
+ } else if (flags & O_APPEND) {
+ method = "append";
+ } else {
+ method = "create";
+ }
+ /* Create an object of org.apache.hadoop.fs.Path */
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsOpenFile(%s): constructNewObjectOfPath", path);
+ goto done;
+ }
+ /* Get the Configuration object from the FileSystem object */
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ "getConf", JMETHOD1("", JPARAM(HADOOP_CONF)));
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsOpenFile(%s): FileSystem#getConf", path);
+ goto done;
+ }
+ jConfiguration = jVal.l;
+ /* Java strings for the two configuration keys that may be looked up below. */
+ jStrBufferSize = (*env)->NewStringUTF(env, "io.file.buffer.size");
+ if (!jStrBufferSize) {
+ ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "OOM");
+ goto done;
+ }
+ jStrReplication = (*env)->NewStringUTF(env, "dfs.replication");
+ if (!jStrReplication) {
+ ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "OOM");
+ goto done;
+ }
+ if (!bufferSize) {
+ jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration,
+ HADOOP_CONF, "getInt", "(Ljava/lang/String;I)I",
+ jStrBufferSize, 4096);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_FILE_NOT_FOUND |
+ "hdfsOpenFile(%s): Configuration#getInt(io.file.buffer.size)",
+ path);
+ goto done;
+ }
+ jBufferSize = jVal.i;
+ }
+ if ((accmode == O_WRONLY) && (flags & O_APPEND) == 0) {
+ if (!replication) {
+ jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration,
+ HADOOP_CONF, "getInt", "(Ljava/lang/String;I)I",
+ jStrReplication, 1);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsOpenFile(%s): Configuration#getInt(dfs.replication)",
+ path);
+ goto done;
+ }
+ jReplication = jVal.i;
+ }
+ }
+ /* Create and return either the FSDataInputStream or
+ FSDataOutputStream references jobject jStream */
+ /* READ? */
+ if (accmode == O_RDONLY) {
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ method, signature, jPath, jBufferSize);
+ } else if ((accmode == O_WRONLY) && (flags & O_APPEND)) {
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ method, signature, jPath);
+ } else {
+ jboolean jOverWrite = 1;
+ jlong jBlockSize = blockSize;
+ if (jBlockSize == 0) {
+ jthr = getDefaultBlockSize(env, jFS, jPath, &jBlockSize);
+ if (jthr) {
+ /* NOTE(review): unlike the other error paths, jthr is neither printed nor released here. */
+ ret = EIO;
+ goto done;
+ }
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ method, signature, jPath, jOverWrite,
+ jBufferSize, jReplication, jBlockSize);
+ }
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsOpenFile(%s): FileSystem#%s(%s)", path, method, signature);
+ goto done;
+ }
+ jFile = jVal.l;
+ file = calloc(1, sizeof(struct hdfsFile_internal));
+ if (!file) {
+ fprintf(stderr, "hdfsOpenFile(%s): OOM create hdfsFile\n", path);
+ ret = ENOMEM;
+ goto done;
+ }
+ /* The handle keeps a global reference so the stream outlives this call's local frame. */
+ file->file = (*env)->NewGlobalRef(env, jFile);
+ if (!file->file) {
+ ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+ "hdfsOpenFile(%s): NewGlobalRef", path);
+ goto done;
+ }
+ file->type = (((flags & O_WRONLY) == 0) ? TYPEIN : TYPEOUT);
+ file->flags = 0;
+ if ((flags & O_WRONLY) == 0) {
+ /* Try a test read to see if we can do direct reads */
+#if 0
+ char buf;
+ if (readDirect(fs, file, &buf, 0) == 0) {
+ /* Success - 0-byte read should return 0 */
+ } else if (errno != ENOTSUP) {
+ /* Unexpected error. Clear it, don't set the direct flag.*/
+ fprintf(stderr,
+ "hdfsOpenFile(%s): WARN: Unexpected error %d when testing "
+ "for direct read compatibility\n", path, errno);
+ }
+ }
+ ret = 0;
+ destroyLocalReference(env, jStrBufferSize);
+ destroyLocalReference(env, jStrReplication);
+ destroyLocalReference(env, jConfiguration);
+ destroyLocalReference(env, jPath);
+ destroyLocalReference(env, jFile);
+ /* On failure, undo a partially built handle before reporting the error. */
+ if (ret) {
+ if (file) {
+ if (file->file) {
+ (*env)->DeleteGlobalRef(env, file->file);
+ }
+ free(file);
+ }
+ errno = ret;
+ return NULL;
+ }
+ return file;
+/**
+ * Close an open file handle and release everything it owns.
+ *
+ * The Java stream's close() is invoked on FSDataInputStream or
+ * FSDataOutputStream depending on the handle type. The global reference and
+ * the handle itself are released even when close() raised an exception.
+ *
+ * @param fs   The filesystem handle (not used by this function)
+ * @param file The file handle to close; invalid after this call returns
+ * @return 0 on success; -1 with errno set to EINTERNAL (no JNI environment),
+ *         EBADF (NULL or uninitialised handle) or the errno mapped from the
+ *         close exception
+ */
+int hdfsCloseFile(hdfsFS fs, hdfsFile file)
+{
+    /* JAVA EQUIVALENT: file.close() */
+    JNIEnv *env = getJNIEnv();
+    jthrowable err;
+    const char *streamClass;
+    int isInput;
+    int status = 0;
+
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+    /* Sanity check */
+    if (file == NULL || file->type == UNINITIALIZED) {
+        errno = EBADF;
+        return -1;
+    }
+    /* Pick the class whose 'close' method is to be called */
+    isInput = (file->type == TYPEIN);
+    streamClass = isInput ? HADOOP_ISTRM : HADOOP_OSTRM;
+    err = invokeMethod(env, NULL, INSTANCE, file->file,
+                       streamClass,
+                       "close", "()V");
+    if (err) {
+        status = printExceptionAndFree(env, err, PRINT_EXC_ALL,
+                                       "%s#close",
+                                       isInput ? "FSDataInputStream" : "FSDataOutputStream");
+    }
+    /* De-allocate memory */
+    (*env)->DeleteGlobalRef(env, file->file);
+    free(file);
+    if (status != 0) {
+        errno = status;
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Check whether a path exists on the filesystem.
+ *
+ * fs    filesystem handle
+ * path  path to test; NULL is rejected with EINVAL
+ *
+ * Returns 0 when FileSystem#exists reports true; otherwise -1 with errno set
+ * (ENOENT when the path does not exist, EINTERNAL when no JNI environment is
+ * available, or the errno mapped from a Java exception).
+ *
+ * NOTE(review): the final argument line of the second printExceptionAndFree
+ * call is not visible in this excerpt.
+ */
+int hdfsExists(hdfsFS fs, const char *path)
+ jobject jPath;
+ jvalue jVal;
+ jobject jFS = (jobject)fs;
+ jthrowable jthr;
+ JNIEnv *env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ if (path == NULL) {
+ errno = EINVAL;
+ return -1;
+ }
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsExists: constructNewObjectOfPath");
+ return -1;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ "exists", JMETHOD1(JPARAM(HADOOP_PATH), "Z"), jPath);
+ /* The Path object is no longer needed once the call has returned. */
+ destroyLocalReference(env, jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsExists: invokeMethod(%s)",
+ return -1;
+ }
+ if (jVal.z) {
+ return 0;
+ } else {
+ errno = ENOENT;
+ return -1;
+ }
+/*
+ * Checks input file for readiness for reading.
+ *
+ * Stores f->file (or NULL when f is NULL) in *jInputStream, then validates
+ * the handle. `env` and `fs` are accepted but not used by the visible code.
+ *
+ * Returns 0 when `f` is a TYPEIN handle. Returns -1 with errno = EBADF when
+ * `f` is NULL or UNINITIALIZED, or with errno = EINVAL (after printing a
+ * message to stderr) when `f` is not an input stream.
+ */
+static int readPrepare(JNIEnv* env, hdfsFS fs, hdfsFile f,
+ jobject* jInputStream)
+ *jInputStream = (jobject)(f ? f->file : NULL);
+ /*Sanity check */
+ if (!f || f->type == UNINITIALIZED) {
+ errno = EBADF;
+ return -1;
+ }
+ /*Error checking... make sure that this file is 'readable' */
+ if (f->type != TYPEIN) {
+ fprintf(stderr, "Cannot read from a non-InputStream object!\n");
+ errno = EINVAL;
+ return -1;
+ }
+ return 0;
+/*
+ * Read up to `length` bytes from the current position of an HDFS input
+ * stream into `buffer`.
+ *
+ * When the handle supports direct reads the request is forwarded to
+ * readDirect(); otherwise a temporary Java byte[] is filled through
+ * FSDataInputStream#read(byte[]) and the bytes actually read are copied
+ * into `buffer`.
+ *
+ * Returns the number of bytes read, 0 at end of file (or when length is
+ * 0), or -1 with errno set: EINVAL (negative length or non-input handle),
+ * EBADF (NULL/UNINITIALIZED handle), EINTERNAL (no JNIEnv), EINTR (the
+ * Java read returned 0 bytes), or the value returned by the exception
+ * helpers.
+ */
+tSize hdfsRead(hdfsFS fs, hdfsFile f, void* buffer, tSize length)
+{
+    JNIEnv* env;
+    jobject jInputStream;
+    jbyteArray jbRarray;
+    jvalue jVal;
+    jthrowable jthr;
+
+    if (length == 0) {
+        return 0;
+    } else if (length < 0) {
+        fprintf(stderr, "hdfsRead: trying to read negative length of %d\n", length);
+        errno = EINVAL;
+        return -1;
+    }
+    /* NOTE(review): the condition guarding this fast path was missing from
+       this copy of the patch (only its closing brace survived). It is
+       restored here as a test of the direct-read flag that hdfsOpenFile
+       probes for - confirm the flag name against hdfs.c. `f` is
+       NULL-checked because readPrepare(), which rejects NULL handles,
+       only runs on the slow path below. */
+    if (f && (f->flags & HDFS_FILE_SUPPORTS_DIRECT_READ)) {
+        return readDirect(fs, f, buffer, length);
+    }
+
+    /* JAVA EQUIVALENT:
+       byte [] bR = new byte[length];
+       fis.read(bR); */
+
+    /*Get the JNIEnv* corresponding to current thread */
+    env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Parameters */
+    if (readPrepare(env, fs, f, &jInputStream) == -1) {
+        return -1;
+    }
+
+    /*Read the requisite bytes */
+    jbRarray = (*env)->NewByteArray(env, length);
+    if (!jbRarray) {
+        errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+            "hdfsRead: NewByteArray");
+        return -1;
+    }
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, HADOOP_ISTRM,
+                        "read", "([B)I", jbRarray);
+    if (jthr) {
+        destroyLocalReference(env, jbRarray);
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsRead: FSDataInputStream#read");
+        return -1;
+    }
+    if (jVal.i < 0) {
+        /* EOF */
+        destroyLocalReference(env, jbRarray);
+        return 0;
+    } else if (jVal.i == 0) {
+        destroyLocalReference(env, jbRarray);
+        errno = EINTR;
+        return -1;
+    }
+    /* Copy only the bytes that were actually read (jVal.i), as hdfsPread
+       does, so a short read leaves the rest of the caller's buffer
+       untouched instead of overwriting it with unread array contents. */
+    (*env)->GetByteArrayRegion(env, jbRarray, 0, jVal.i, buffer);
+    destroyLocalReference(env, jbRarray);
+    if ((*env)->ExceptionCheck(env)) {
+        errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+            "hdfsRead: GetByteArrayRegion");
+        return -1;
+    }
+    return jVal.i;
+}
+/*
+ * Reads using the read(ByteBuffer) API, which does fewer copies.
+ *
+ * Wraps the caller's `buffer` (of `length` bytes) in a direct ByteBuffer
+ * via NewDirectByteBuffer and passes it to the stream's
+ * read(java.nio.ByteBuffer) method, so no intermediate Java byte[] is
+ * allocated by this function.
+ *
+ * Returns the value reported by the Java read, with negative results
+ * mapped to 0. Returns -1 with errno set when no JNIEnv is available
+ * (EINTERNAL), when readPrepare() rejects the handle, when the ByteBuffer
+ * cannot be created, or when the Java call throws.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "fis.read(bbuffer); */" appear to be missing from this copy.
+ */
+tSize readDirect(hdfsFS fs, hdfsFile f, void* buffer, tSize length)
+ ByteBuffer bbuffer = ByteBuffer.allocateDirect(length) // wraps C buffer
+ fis.read(bbuffer); */
+ /*Get the JNIEnv* corresponding to current thread */
+ jobject jInputStream;
+ JNIEnv* env = getJNIEnv();
+ jvalue jVal;
+ jthrowable jthr;
+ jobject bb;
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ if (readPrepare(env, fs, f, &jInputStream) == -1) {
+ return -1;
+ }
+ /*Read the requisite bytes */
+ bb = (*env)->NewDirectByteBuffer(env, buffer, length);
+ if (bb == NULL) {
+ errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+ "readDirect: NewDirectByteBuffer");
+ return -1;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream,
+ HADOOP_ISTRM, "read", "(Ljava/nio/ByteBuffer;)I", bb);
+ /* The wrapper object is released as soon as the call returns */
+ destroyLocalReference(env, bb);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "readDirect: FSDataInputStream#read");
+ return -1;
+ }
+ /* A negative count from the Java read is reported to the caller as 0 */
+ return (jVal.i < 0) ? 0 : jVal.i;
+/*
+ * Positional read: read up to `length` bytes starting at offset `position`
+ * of an HDFS input stream into `buffer`.
+ *
+ * Calls the stream's read(long, byte[], int, int) method on a temporary
+ * Java byte[] and copies the jVal.i bytes actually read into `buffer`.
+ *
+ * Returns the number of bytes read, 0 at end of file (or when length is
+ * 0), or -1 with errno set: EINVAL (negative length or non-input handle),
+ * EBADF (NULL/UNINITIALIZED handle), EINTERNAL (no JNIEnv), EINTR (the
+ * Java read returned 0), or the value returned by the exception helpers.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "fis.read(pos, bR, 0, length); */" appear to be missing from
+ * this copy.
+ */
+tSize hdfsPread(hdfsFS fs, hdfsFile f, tOffset position,
+ void* buffer, tSize length)
+ JNIEnv* env;
+ jbyteArray jbRarray;
+ jvalue jVal;
+ jthrowable jthr;
+ if (length == 0) {
+ return 0;
+ } else if (length < 0) {
+ errno = EINVAL;
+ return -1;
+ }
+ if (!f || f->type == UNINITIALIZED) {
+ errno = EBADF;
+ return -1;
+ }
+ env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ /*Error checking... make sure that this file is 'readable' */
+ if (f->type != TYPEIN) {
+ fprintf(stderr, "Cannot read from a non-InputStream object!\n");
+ errno = EINVAL;
+ return -1;
+ }
+ byte [] bR = new byte[length];
+ fis.read(pos, bR, 0, length); */
+ jbRarray = (*env)->NewByteArray(env, length);
+ if (!jbRarray) {
+ errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+ "hdfsPread: NewByteArray");
+ return -1;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, f->file, HADOOP_ISTRM,
+ "read", "(J[BII)I", position, jbRarray, 0, length);
+ if (jthr) {
+ destroyLocalReference(env, jbRarray);
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsPread: FSDataInputStream#read");
+ return -1;
+ }
+ if (jVal.i < 0) {
+ /* EOF */
+ destroyLocalReference(env, jbRarray);
+ return 0;
+ } else if (jVal.i == 0) {
+ destroyLocalReference(env, jbRarray);
+ errno = EINTR;
+ return -1;
+ }
+ /* Copy out only the bytes the Java side reported as read */
+ (*env)->GetByteArrayRegion(env, jbRarray, 0, jVal.i, buffer);
+ destroyLocalReference(env, jbRarray);
+ if ((*env)->ExceptionCheck(env)) {
+ errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+ "hdfsPread: GetByteArrayRegion");
+ return -1;
+ }
+ return jVal.i;
+/*
+ * Write `length` bytes from `buffer` to an HDFS file opened for writing.
+ *
+ * The bytes are copied into a temporary Java byte[] and passed to the
+ * stream's write(byte[]) method.
+ *
+ * Returns `length` on success (0 for an empty write), or -1 with errno
+ * set: EINTERNAL (no JNIEnv), EBADF (NULL/UNINITIALIZED handle), EINVAL
+ * (negative length or handle not opened for output), or the value
+ * returned by the exception helpers.
+ */
+tSize hdfsWrite(hdfsFS fs, hdfsFile f, const void* buffer, tSize length)
+{
+    /* JAVA EQUIVALENT:
+       byte b[] = str.getBytes();
+       fso.write(b); */
+    jobject jOutputStream;
+    jbyteArray jbWarray;
+    jthrowable jthr;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Sanity check */
+    if (!f || f->type == UNINITIALIZED) {
+        errno = EBADF;
+        return -1;
+    }
+    /* Read f->file only after f is known to be non-NULL; the original
+       initialiser dereferenced f before the check above. */
+    jOutputStream = f->file;
+
+    if (length < 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    /*Error checking... make sure that this file is 'writable' */
+    if (f->type != TYPEOUT) {
+        fprintf(stderr, "Cannot write into a non-OutputStream object!\n");
+        errno = EINVAL;
+        return -1;
+    }
+
+    if (length == 0) {
+        return 0;
+    }
+
+    /*Write the requisite bytes into the file */
+    jbWarray = (*env)->NewByteArray(env, length);
+    if (!jbWarray) {
+        errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+            "hdfsWrite: NewByteArray");
+        return -1;
+    }
+    (*env)->SetByteArrayRegion(env, jbWarray, 0, length, buffer);
+    if ((*env)->ExceptionCheck(env)) {
+        destroyLocalReference(env, jbWarray);
+        errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+            "hdfsWrite(length = %d): SetByteArrayRegion", length);
+        return -1;
+    }
+    jthr = invokeMethod(env, NULL, INSTANCE, jOutputStream,
+            HADOOP_OSTRM, "write", "([B)V", jbWarray);
+    destroyLocalReference(env, jbWarray);
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsWrite: FSDataOutputStream#write");
+        return -1;
+    }
+    /* Unlike most Java streams, FSDataOutputStream never does partial writes.
+       If we succeeded, all the data was written. */
+    return length;
+}
+/*
+ * Move the read position of an HDFS input stream to `desiredPos`.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv),
+ * EBADF (NULL handle or handle that is not TYPEIN), or the value returned
+ * by printExceptionAndFree() when the stream's seek(long) throws.
+ */
+int hdfsSeek(hdfsFS fs, hdfsFile f, tOffset desiredPos)
+{
+    /* JAVA EQUIVALENT:
+       fis.seek(pos); */
+    jthrowable jthr;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Sanity check (done before any use of f->file; the original read
+      f->file in an initialiser ahead of this NULL test) */
+    if (!f || f->type != TYPEIN) {
+        errno = EBADF;
+        return -1;
+    }
+
+    jthr = invokeMethod(env, NULL, INSTANCE, f->file,
+            HADOOP_ISTRM, "seek", "(J)V", desiredPos);
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsSeek(desiredPos=%" PRId64 ")"
+            ": FSDataInputStream#seek", desiredPos);
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Report the current position of an open HDFS stream.
+ *
+ * Calls getPos() through HADOOP_ISTRM for TYPEIN handles and through
+ * HADOOP_OSTRM for every other initialised handle.
+ *
+ * Returns the position on success, or -1 with errno set: EINTERNAL (no
+ * JNIEnv), EBADF (NULL/UNINITIALIZED handle), or the value returned by
+ * printExceptionAndFree() when getPos() throws.
+ */
+tOffset hdfsTell(hdfsFS fs, hdfsFile f)
+{
+    /* JAVA EQUIVALENT:
+       pos = f.getPos(); */
+    const char* mInterface;
+    jvalue jVal;
+    jthrowable jthr;
+
+    /* Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /* Sanity check (done before any use of f->file; the original read
+       f->file in an initialiser ahead of this NULL test) */
+    if (!f || f->type == UNINITIALIZED) {
+        errno = EBADF;
+        return -1;
+    }
+
+    /* The interface whose 'getPos' method is to be called */
+    if (f->type == TYPEIN)
+        mInterface = HADOOP_ISTRM;
+    else
+        mInterface = HADOOP_OSTRM;
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, f->file,
+                     mInterface, "getPos", "()J");
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsTell: %s#getPos",
+            ((f->type == TYPEIN) ? "FSDataInputStream" :
+                                 "FSDataOutputStream"));
+        return -1;
+    }
+    return jVal.j;
+}
+/*
+ * Flush an HDFS output stream by calling its flush() method.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv),
+ * EBADF (NULL handle or handle that is not TYPEOUT), or the value returned
+ * by printExceptionAndFree() when flush() throws.
+ */
+int hdfsFlush(hdfsFS fs, hdfsFile f)
+{
+    /* JAVA EQUIVALENT:
+       fos.flush(); */
+    jthrowable jthr;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Sanity check */
+    if (!f || f->type != TYPEOUT) {
+        errno = EBADF;
+        return -1;
+    }
+
+    jthr = invokeMethod(env, NULL, INSTANCE, f->file,
+                     HADOOP_OSTRM, "flush", "()V");
+    if (jthr) {
+        /* The call goes through HADOOP_OSTRM on a TYPEOUT handle, so the
+           diagnostic names the output stream class (it previously said
+           FSDataInputStream). */
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsFlush: FSDataOutputStream#flush");
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Call hflush() on an HDFS output stream.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv),
+ * EBADF (NULL handle or handle that is not TYPEOUT), or the value returned
+ * by printExceptionAndFree() when hflush() throws.
+ */
+int hdfsHFlush(hdfsFS fs, hdfsFile f)
+{
+    jthrowable jthr;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Sanity check (done before any use of f->file; the original read
+      f->file in an initialiser ahead of this NULL test) */
+    if (!f || f->type != TYPEOUT) {
+        errno = EBADF;
+        return -1;
+    }
+
+    jthr = invokeMethod(env, NULL, INSTANCE, f->file,
+                     HADOOP_OSTRM, "hflush", "()V");
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsHFlush: FSDataOutputStream#hflush");
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Call hsync() on an HDFS output stream.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv),
+ * EBADF (NULL handle or handle that is not TYPEOUT), or the value returned
+ * by printExceptionAndFree() when hsync() throws.
+ */
+int hdfsHSync(hdfsFS fs, hdfsFile f)
+{
+    jthrowable jthr;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Sanity check (done before any use of f->file; the original read
+      f->file in an initialiser ahead of this NULL test) */
+    if (!f || f->type != TYPEOUT) {
+        errno = EBADF;
+        return -1;
+    }
+
+    jthr = invokeMethod(env, NULL, INSTANCE, f->file,
+                     HADOOP_OSTRM, "hsync", "()V");
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsHSync: FSDataOutputStream#hsync");
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Return the value of available() for an HDFS input stream.
+ *
+ * Returns that value on success, or -1 with errno set: EINTERNAL (no
+ * JNIEnv), EBADF (NULL handle or handle that is not TYPEIN), or the value
+ * returned by printExceptionAndFree() when available() throws.
+ */
+int hdfsAvailable(hdfsFS fs, hdfsFile f)
+{
+    /* JAVA EQUIVALENT:
+       fis.available(); */
+    jvalue jVal;
+    jthrowable jthr;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Sanity check (done before any use of f->file; the original read
+      f->file in an initialiser ahead of this NULL test) */
+    if (!f || f->type != TYPEIN) {
+        errno = EBADF;
+        return -1;
+    }
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, f->file,
+                     HADOOP_ISTRM, "available", "()I");
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsAvailable: FSDataInputStream#available");
+        return -1;
+    }
+    return jVal.i;
+}
+/*
+ * Shared implementation of hdfsCopy and hdfsMove.
+ *
+ * Builds Path objects for `src` and `dst`, creates a fresh Hadoop
+ * Configuration, and calls the static FileUtil#copy(srcFS, srcPath, dstFS,
+ * dstPath, deleteSource, conf).
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv), EIO
+ * (FileUtil#copy returned false), or the value returned by
+ * printExceptionAndFree() for a thrown exception.
+ */
+static int hdfsCopyImpl(hdfsFS srcFS, const char* src, hdfsFS dstFS,
+        const char* dst, jboolean deleteSource)
+{
+    /* JAVA EQUIVALENT
+       FileUtil#copy(srcFS, srcPath, dstFS, dstPath,
+                     deleteSource, conf) */
+
+    /*Parameters */
+    jobject jSrcFS = (jobject)srcFS;
+    jobject jDstFS = (jobject)dstFS;
+    jobject jConfiguration = NULL, jSrcPath = NULL, jDstPath = NULL;
+    jthrowable jthr;
+    jvalue jVal;
+    int ret;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    jthr = constructNewObjectOfPath(env, src, &jSrcPath);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsCopyImpl(src=%s): constructNewObjectOfPath", src);
+        goto done;
+    }
+    jthr = constructNewObjectOfPath(env, dst, &jDstPath);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsCopyImpl(dst=%s): constructNewObjectOfPath", dst);
+        goto done;
+    }
+
+    /*Create the org.apache.hadoop.conf.Configuration object */
+    jthr = constructNewObjectOfClass(env, &jConfiguration,
+                                     HADOOP_CONF, "()V");
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsCopyImpl: Configuration constructor");
+        goto done;
+    }
+
+    /*FileUtil#copy */
+    jthr = invokeMethod(env, &jVal, STATIC,
+            NULL, "org/apache/hadoop/fs/FileUtil", "copy",
+            "(Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;"
+            "Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;"
+            "ZLorg/apache/hadoop/conf/Configuration;)Z",
+            jSrcFS, jSrcPath, jDstFS, jDstPath, deleteSource,
+            jConfiguration);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsCopyImpl(src=%s, dst=%s, deleteSource=%d): "
+            "FileUtil#copy", src, dst, deleteSource);
+        goto done;
+    }
+    if (!jVal.z) {
+        ret = EIO;
+        goto done;
+    }
+    ret = 0;
+
+    /* Single exit: every "goto done" above lands here so the local
+       references are released on all paths (the label itself was missing
+       from this copy of the patch). */
+done:
+    destroyLocalReference(env, jConfiguration);
+    destroyLocalReference(env, jSrcPath);
+    destroyLocalReference(env, jDstPath);
+
+    if (ret) {
+        errno = ret;
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Copy `src` on srcFS to `dst` on dstFS by delegating to hdfsCopyImpl with
+ * deleteSource = 0. The return value is whatever hdfsCopyImpl returns.
+ */
+int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst)
+ return hdfsCopyImpl(srcFS, src, dstFS, dst, 0);
+/*
+ * Move `src` on srcFS to `dst` on dstFS by delegating to hdfsCopyImpl with
+ * deleteSource = 1. The return value is whatever hdfsCopyImpl returns.
+ */
+int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst)
+ return hdfsCopyImpl(srcFS, src, dstFS, dst, 1);
+/*
+ * Delete `path` from the file system by calling FileSystem#delete(Path,
+ * boolean); any non-zero `recursive` is passed to Java as JNI_TRUE.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv), EIO
+ * (delete returned false), or the value returned by
+ * printExceptionAndFree() for a thrown exception.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "fs.delete(p, recursive); */" appear to be missing from this
+ * copy.
+ */
+int hdfsDelete(hdfsFS fs, const char* path, int recursive)
+ jobject jFS = (jobject)fs;
+ jthrowable jthr;
+ jobject jPath;
+ jvalue jVal;
+ jboolean jRecursive = recursive ? JNI_TRUE : JNI_FALSE;
+ Path p = new Path(path);
+ bool retval = fs.delete(p, recursive); */
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsDelete(path=%s): constructNewObjectOfPath", path);
+ return -1;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ "delete", "(Lorg/apache/hadoop/fs/Path;Z)Z",
+ jPath, jRecursive);
+ /* The Path object is released whether or not delete() threw */
+ destroyLocalReference(env, jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsDelete(path=%s, recursive=%d): "
+ "FileSystem#delete", path, recursive);
+ return -1;
+ }
+ /* A false result without an exception is reported as EIO */
+ if (!jVal.z) {
+ errno = EIO;
+ return -1;
+ }
+ return 0;
+/*
+ * Rename `oldPath` to `newPath` through FileSystem#rename(Path, Path).
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv), EIO
+ * (rename returned false), or the value returned by
+ * printExceptionAndFree() for a thrown exception.
+ */
+int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath)
+{
+    /* JAVA EQUIVALENT:
+       Path old = new Path(oldPath);
+       Path new = new Path(newPath);
+       fs.rename(old, new); */
+    jobject jFS = (jobject)fs;
+    jthrowable jthr;
+    jobject jOldPath = NULL, jNewPath = NULL;
+    int ret = -1;
+    jvalue jVal;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    jthr = constructNewObjectOfPath(env, oldPath, &jOldPath );
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsRename: constructNewObjectOfPath(%s)", oldPath);
+        goto done;
+    }
+    jthr = constructNewObjectOfPath(env, newPath, &jNewPath);
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsRename: constructNewObjectOfPath(%s)", newPath);
+        goto done;
+    }
+
+    /* Rename the file
+       TODO: use rename2 here?  (See HDFS-3592)
+       The JNI signature argument (two Path parameters, boolean result -
+       jVal.z is tested below) was missing from the call in this copy. */
+    jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "rename",
+            "(Lorg/apache/hadoop/fs/Path;Lorg/apache/hadoop/fs/Path;)Z",
+            jOldPath, jNewPath);
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsRename(oldPath=%s, newPath=%s): FileSystem#rename",
+            oldPath, newPath);
+        goto done;
+    }
+    if (!jVal.z) {
+        errno = EIO;
+        goto done;
+    }
+    ret = 0;
+
+    /* Common exit: ret stays -1 unless the rename succeeded */
+done:
+    destroyLocalReference(env, jOldPath);
+    destroyLocalReference(env, jNewPath);
+    return ret;
+}
+/*
+ * Copy the file system's current working directory, as a string, into the
+ * caller-supplied `buffer` of `bufferSize` bytes.
+ *
+ * Calls FileSystem#getWorkingDirectory followed by Path#toString.
+ *
+ * Returns `buffer` on success. Returns NULL with errno set otherwise:
+ * EINVAL (NULL buffer or zero size), EINTERNAL (no JNIEnv), EIO
+ * (getWorkingDirectory returned NULL), ENAMETOOLONG (the path does not fit
+ * in the buffer or formatting failed), or the value returned by the
+ * exception helpers.
+ */
+char* hdfsGetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize)
+{
+    /* JAVA EQUIVALENT:
+       Path p = fs.getWorkingDirectory();
+       return p.toString() */
+    jobject jPath = NULL;
+    jstring jPathString = NULL;
+    jobject jFS = (jobject)fs;
+    jvalue jVal;
+    jthrowable jthr;
+    int ret;
+    const char *jPathChars = NULL;
+    JNIEnv* env;
+
+    if (buffer == NULL || bufferSize == 0) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    /*Get the JNIEnv* corresponding to current thread */
+    env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return NULL;
+    }
+
+    /*FileSystem#getWorkingDirectory() */
+    jthr = invokeMethod(env, &jVal, INSTANCE, jFS,
+                     HADOOP_FS, "getWorkingDirectory",
+                     "()Lorg/apache/hadoop/fs/Path;");
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsGetWorkingDirectory: FileSystem#getWorkingDirectory");
+        goto done;
+    }
+    jPath = jVal.l;
+    if (!jPath) {
+        fprintf(stderr, "hdfsGetWorkingDirectory: "
+            "FileSystem#getWorkingDirectory returned NULL");
+        /* ret is assigned to errno below, so it must be a positive errno
+           value (the original stored -EIO). */
+        ret = EIO;
+        goto done;
+    }
+
+    /*Path#toString() */
+    jthr = invokeMethod(env, &jVal, INSTANCE, jPath,
+                     "org/apache/hadoop/fs/Path", "toString",
+                     "()Ljava/lang/String;");
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsGetWorkingDirectory: Path#toString");
+        goto done;
+    }
+    jPathString = jVal.l;
+    jPathChars = (*env)->GetStringUTFChars(env, jPathString, NULL);
+    if (!jPathChars) {
+        ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+            "hdfsGetWorkingDirectory: GetStringUTFChars");
+        goto done;
+    }
+
+    /*Copy to user-provided buffer */
+    /* ret = snprintf(buffer, bufferSize, "%s", jPathChars); */
+    ret = sprintf_s (buffer, bufferSize, "%s", jPathChars);
+    /* A negative result (formatting failure) or a length that does not fit
+       must not leak into errno as-is; report both as ENAMETOOLONG. The
+       explicit sign test also avoids comparing a negative int against the
+       unsigned bufferSize.
+       NOTE(review): with the default CRT handler sprintf_s may abort
+       instead of returning when the buffer is too small - confirm. */
+    if (ret < 0 || (size_t)ret >= bufferSize) {
+        ret = ENAMETOOLONG;
+        goto done;
+    }
+    ret = 0;
+
+done:
+    if (jPathChars) {
+        (*env)->ReleaseStringUTFChars(env, jPathString, jPathChars);
+    }
+    destroyLocalReference(env, jPath);
+    destroyLocalReference(env, jPathString);
+
+    if (ret) {
+        errno = ret;
+        return NULL;
+    }
+    return buffer;
+}
+/*
+ * Set the file system's working directory to `path` by calling
+ * FileSystem#setWorkingDirectory(Path).
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv) or
+ * the value returned by printExceptionAndFree() for a thrown exception.
+ * The setWorkingDirectory failure is reported with
+ * NOPRINT_EXC_ILLEGAL_ARGUMENT rather than PRINT_EXC_ALL.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "fs.setWorkingDirectory(Path(path)); */" appear to be missing
+ * from this copy.
+ */
+int hdfsSetWorkingDirectory(hdfsFS fs, const char* path)
+ jobject jFS = (jobject)fs;
+ jthrowable jthr;
+ jobject jPath;
+ fs.setWorkingDirectory(Path(path)); */
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ /*Create an object of org.apache.hadoop.fs.Path */
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsSetWorkingDirectory(%s): constructNewObjectOfPath",
+ path);
+ return -1;
+ }
+ /*FileSystem#setWorkingDirectory() */
+ jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS,
+ "setWorkingDirectory",
+ "(Lorg/apache/hadoop/fs/Path;)V", jPath);
+ /* The Path object is released whether or not the call threw */
+ destroyLocalReference(env, jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, NOPRINT_EXC_ILLEGAL_ARGUMENT,
+ "hdfsSetWorkingDirectory(%s): FileSystem#setWorkingDirectory",
+ path);
+ return -1;
+ }
+ return 0;
+/*
+ * Create the directory `path` by calling FileSystem#mkdirs(Path).
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv), EIO
+ * (mkdirs returned false), or the value returned by
+ * printExceptionAndFree() for a thrown exception.
+ */
+int hdfsCreateDirectory(hdfsFS fs, const char* path)
+{
+    /* JAVA EQUIVALENT:
+       fs.mkdirs(new Path(path)); */
+    jobject jFS = (jobject)fs;
+    jobject jPath;
+    jthrowable jthr;
+    jvalue jVal;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Create an object of org.apache.hadoop.fs.Path */
+    jthr = constructNewObjectOfPath(env, path, &jPath);
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsCreateDirectory(%s): constructNewObjectOfPath", path);
+        return -1;
+    }
+
+    /*Create the directory */
+    jVal.z = 0;
+    jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+                     "mkdirs", "(Lorg/apache/hadoop/fs/Path;)Z",
+                     jPath);
+    destroyLocalReference(env, jPath);
+    if (jthr) {
+        /* NOTE(review): the print-flags argument was missing from this
+           call in this copy of the patch. PRINT_EXC_ALL is used because it
+           is the flag every sibling call passes; the original may have
+           suppressed printing for some exception classes - confirm
+           against the original hdfs.c. */
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsCreateDirectory(%s): FileSystem#mkdirs", path);
+        return -1;
+    }
+    if (!jVal.z) {
+        /* It's unclear under exactly which conditions FileSystem#mkdirs
+           is supposed to return false (as opposed to throwing an
+           exception.) It seems like the current code never actually
+           returns false. So we're going to translate this to EIO, since
+           there seems to be nothing more specific we can do with it. */
+        errno = EIO;
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Set the replication of `path` by calling
+ * FileSystem#setReplication(Path, short).
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv),
+ * ENOENT (setReplication returned false), or the value returned by
+ * printExceptionAndFree() for a thrown exception.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "fs.setReplication(new Path(path), replication); */" appear to
+ * be missing from this copy.
+ */
+int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication)
+ jobject jFS = (jobject)fs;
+ jthrowable jthr;
+ jvalue jVal;
+ /*Create an object of org.apache.hadoop.fs.Path */
+ jobject jPath;
+ fs.setReplication(new Path(path), replication); */
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsSetReplication(path=%s): constructNewObjectOfPath", path);
+ return -1;
+ }
+ /*FileSystem#setReplication */
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ "setReplication", "(Lorg/apache/hadoop/fs/Path;S)Z",
+ jPath, replication);
+ destroyLocalReference(env, jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsSetReplication(path=%s, replication=%d): "
+ "FileSystem#setReplication", path, replication);
+ return -1;
+ }
+ if (!jVal.z) {
+ /* setReplication returns false "if file does not exist or is a
+ directory." So the nearest translation to that is ENOENT. */
+ errno = ENOENT;
+ return -1;
+ }
+ return 0;
+/*
+ * Change the owner and/or group of `path` through
+ * FileSystem#setOwner(Path, String, String).
+ *
+ * When both `owner` and `group` are NULL there is nothing to change and
+ * the function returns 0 without calling into Java.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv) or
+ * the value returned by printExceptionAndFree() for a thrown exception.
+ */
+int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group)
+{
+    /* JAVA EQUIVALENT:
+       fs.setOwner(path, owner, group) */
+    jobject jFS = (jobject)fs;
+    jobject jPath = NULL;
+    jstring jOwner = NULL, jGroup = NULL;
+    jthrowable jthr;
+    int ret;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    if (owner == NULL && group == NULL) {
+        return 0;
+    }
+
+    jthr = constructNewObjectOfPath(env, path, &jPath);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsChown(path=%s): constructNewObjectOfPath", path);
+        goto done;
+    }
+
+    jthr = newJavaStr(env, owner, &jOwner);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsChown(path=%s): newJavaStr(%s)", path, owner);
+        goto done;
+    }
+    jthr = newJavaStr(env, group, &jGroup);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsChown(path=%s): newJavaStr(%s)", path, group);
+        goto done;
+    }
+
+    /* FileSystem#setOwner. The method name and JNI signature were missing
+       from the call in this copy; the name comes from the error message
+       below and the signature from the (Path, String, String) arguments
+       passed. */
+    jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS,
+            "setOwner",
+            "(Lorg/apache/hadoop/fs/Path;Ljava/lang/String;"
+            "Ljava/lang/String;)V",
+            jPath, jOwner, jGroup);
+    if (jthr) {
+        /* NOTE(review): the print-flags argument was also missing here;
+           PRINT_EXC_ALL matches the sibling calls - confirm against the
+           original hdfs.c. */
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsChown(path=%s, owner=%s, group=%s): "
+            "FileSystem#setOwner", path, owner, group);
+        goto done;
+    }
+    ret = 0;
+
+done:
+    destroyLocalReference(env, jPath);
+    destroyLocalReference(env, jOwner);
+    destroyLocalReference(env, jGroup);
+
+    if (ret) {
+        errno = ret;
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Change the permissions of `path` through
+ * FileSystem#setPermission(Path, FsPermission), building the FsPermission
+ * object from the short `mode`.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv) or
+ * the value returned by printExceptionAndFree() for a thrown exception.
+ */
+int hdfsChmod(hdfsFS fs, const char* path, short mode)
+{
+    /* JAVA EQUIVALENT:
+       fs.setPermission(path, FsPermission) */
+    int ret;
+    jthrowable jthr;
+    jobject jPath = NULL, jPermObj = NULL;
+    jobject jFS = (jobject)fs;
+    jshort jmode = mode;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /* construct jPerm = FsPermission.createImmutable(short mode);*/
+    jthr = constructNewObjectOfClass(env, &jPermObj,
+                HADOOP_FSPERM,"(S)V",jmode);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "constructNewObjectOfClass(%s)", HADOOP_FSPERM);
+        /* Leave through the common exit so errno is set from ret; the
+           original returned -1 here without setting errno. */
+        goto done;
+    }
+
+    /*Create an object of org.apache.hadoop.fs.Path */
+    jthr = constructNewObjectOfPath(env, path, &jPath);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsChmod(%s): constructNewObjectOfPath", path);
+        goto done;
+    }
+
+    /* FileSystem#setPermission. The JNI signature argument was missing
+       from the call in this copy.
+       NOTE(review): the FsPermission class path in the signature is
+       written out literally - confirm it matches HADOOP_FSPERM. */
+    jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS,
+            "setPermission",
+            "(Lorg/apache/hadoop/fs/Path;"
+            "Lorg/apache/hadoop/fs/permission/FsPermission;)V",
+            jPath, jPermObj);
+    if (jthr) {
+        /* NOTE(review): the print-flags argument was also missing here;
+           PRINT_EXC_ALL matches the sibling calls - confirm against the
+           original hdfs.c. */
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsChmod(%s): FileSystem#setPermission", path);
+        goto done;
+    }
+    ret = 0;
+
+done:
+    destroyLocalReference(env, jPath);
+    destroyLocalReference(env, jPermObj);
+
+    if (ret) {
+        errno = ret;
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Set the modification and access times of `path` through
+ * FileSystem#setTimes(Path, long, long).
+ *
+ * `mtime` and `atime` are multiplied by 1000 before being passed to Java;
+ * a value of -1 is passed through unchanged as -1.
+ *
+ * Returns 0 on success, or -1 with errno set: EINTERNAL (no JNIEnv) or
+ * the value returned by printExceptionAndFree() for a thrown exception.
+ */
+int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime)
+{
+    /* JAVA EQUIVALENT:
+       fs.setTimes(src, mtime, atime) */
+    jthrowable jthr;
+    jobject jFS = (jobject)fs;
+    jobject jPath;
+    const tTime NO_CHANGE = -1;
+    jlong jmtime = (mtime == NO_CHANGE) ? -1 : (mtime * (jlong)1000);
+    jlong jatime = (atime == NO_CHANGE) ? -1 : (atime * (jlong)1000);
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return -1;
+    }
+
+    /*Create an object of org.apache.hadoop.fs.Path */
+    jthr = constructNewObjectOfPath(env, path, &jPath);
+    if (jthr) {
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsUtime(path=%s): constructNewObjectOfPath", path);
+        return -1;
+    }
+
+    /* FileSystem#setTimes. The method name and JNI signature were missing
+       from the call in this copy; the name comes from the error message
+       below and the signature from the (Path, jlong, jlong) arguments. */
+    jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS,
+            "setTimes", "(Lorg/apache/hadoop/fs/Path;JJ)V",
+            jPath, jmtime, jatime);
+    destroyLocalReference(env, jPath);
+    if (jthr) {
+        /* NOTE(review): the print-flags argument was also missing here;
+           PRINT_EXC_ALL matches the sibling calls - confirm against the
+           original hdfs.c. */
+        errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsUtime(path=%s): FileSystem#setTimes", path);
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Return the host names that store each block of `path` overlapping the
+ * byte range [start, start + length).
+ *
+ * Calls FileSystem#getFileStatus and FileSystem#getFileBlockLocations,
+ * then BlockLocation#getHosts for every block returned.
+ *
+ * On success returns a NULL-terminated array with one entry per block;
+ * each entry is itself a NULL-terminated array of heap-allocated host
+ * name strings. Release the result with hdfsFreeHosts().
+ *
+ * On failure returns NULL with errno set: EINTERNAL (no JNIEnv or a NULL
+ * array returned by Java), ENOMEM (allocation failure), or the value
+ * returned by the exception helpers.
+ */
+char***
+hdfsGetHosts(hdfsFS fs, const char* path, tOffset start, tOffset length)
+{
+    /* JAVA EQUIVALENT:
+       fs.getFileBlockLoctions(new Path(path), start, length); */
+    jthrowable jthr;
+    jobject jPath = NULL;
+    jobject jFileStatus = NULL;
+    jobject jFileBlock = NULL;
+    jvalue jFSVal, jVal;
+    jobjectArray jBlockLocations = NULL, jFileBlockHosts = NULL;
+    jstring jHost = NULL;
+    char*** blockHosts = NULL;
+    int i, j, ret;
+    jsize jNumFileBlocks = 0;
+    jobject jFS = (jobject)fs;
+
+    /*Get the JNIEnv* corresponding to current thread */
+    JNIEnv* env = getJNIEnv();
+    if (env == NULL) {
+        errno = EINTERNAL;
+        return NULL;
+    }
+
+    /*Create an object of org.apache.hadoop.fs.Path */
+    jthr = constructNewObjectOfPath(env, path, &jPath);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "hdfsGetHosts(path=%s): constructNewObjectOfPath", path);
+        goto done;
+    }
+    jthr = invokeMethod(env, &jFSVal, INSTANCE, jFS,
+            HADOOP_FS, "getFileStatus", "(Lorg/apache/hadoop/fs/Path;)"
+            "Lorg/apache/hadoop/fs/FileStatus;", jPath);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_FILE_NOT_FOUND,
+                "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):"
+                "FileSystem#getFileStatus", path, start, length);
+        /* jPath is released once, at done; the original also released it
+           here and so deleted the same local reference twice. */
+        goto done;
+    }
+    jFileStatus = jFSVal.l;
+
+    /*org.apache.hadoop.fs.FileSystem#getFileBlockLocations */
+    jthr = invokeMethod(env, &jVal, INSTANCE, jFS,
+                     HADOOP_FS, "getFileBlockLocations",
+                     "(Lorg/apache/hadoop/fs/FileStatus;JJ)"
+                     "[Lorg/apache/hadoop/fs/BlockLocation;",
+                     jFileStatus, start, length);
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):"
+                "FileSystem#getFileBlockLocations", path, start, length);
+        goto done;
+    }
+    jBlockLocations = jVal.l;
+    if (!jBlockLocations) {
+        /* GetArrayLength must not be called on a NULL array */
+        fprintf(stderr,
+            "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):"
+            "FileSystem#getFileBlockLocations returned NULL\n",
+            path, start, length);
+        ret = EINTERNAL;
+        goto done;
+    }
+
+    /*Figure out no of entries in jBlockLocations
+      Allocate memory and add NULL at the end */
+    jNumFileBlocks = (*env)->GetArrayLength(env, jBlockLocations);
+
+    blockHosts = calloc(jNumFileBlocks + 1, sizeof(char**));
+    if (blockHosts == NULL) {
+        ret = ENOMEM;
+        goto done;
+    }
+    if (jNumFileBlocks == 0) {
+        ret = 0;
+        goto done;
+    }
+
+    /*Now parse each block to get hostnames */
+    for (i = 0; i < jNumFileBlocks; ++i) {
+        jsize jNumBlockHosts;
+        const char *hostName;
+
+        jFileBlock =
+            (*env)->GetObjectArrayElement(env, jBlockLocations, i);
+        if (!jFileBlock) {
+            ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+                "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):"
+                "GetObjectArrayElement(%d)", path, start, length, i);
+            goto done;
+        }
+
+        jthr = invokeMethod(env, &jVal, INSTANCE, jFileBlock, HADOOP_BLK_LOC,
+                         "getHosts", "()[Ljava/lang/String;");
+        if (jthr) {
+            ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):"
+                "BlockLocation#getHosts", path, start, length);
+            goto done;
+        }
+        jFileBlockHosts = jVal.l;
+        if (!jFileBlockHosts) {
+            fprintf(stderr,
+                "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):"
+                "BlockLocation#getHosts returned NULL", path, start, length);
+            ret = EINTERNAL;
+            goto done;
+        }
+
+        /*Figure out no of hosts in jFileBlockHosts, and allocate the memory */
+        jNumBlockHosts = (*env)->GetArrayLength(env, jFileBlockHosts);
+        blockHosts[i] = calloc(jNumBlockHosts + 1, sizeof(char*));
+        if (!blockHosts[i]) {
+            ret = ENOMEM;
+            goto done;
+        }
+
+        /*Now parse each hostname */
+        for (j = 0; j < jNumBlockHosts; ++j) {
+            jHost = (*env)->GetObjectArrayElement(env, jFileBlockHosts, j);
+            if (!jHost) {
+                /* The failing JNI call is GetObjectArrayElement; the
+                   original message named NewByteArray. */
+                ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+                    "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"): "
+                    "GetObjectArrayElement", path, start, length);
+                goto done;
+            }
+            hostName =
+                (const char*)((*env)->GetStringUTFChars(env, jHost, NULL));
+            if (!hostName) {
+                ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+                    "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64", "
+                    "j=%d out of %d): GetStringUTFChars",
+                    path, start, length, j, jNumBlockHosts);
+                goto done;
+            }
+            blockHosts[i][j] = _strdup(hostName);
+            (*env)->ReleaseStringUTFChars(env, jHost, hostName);
+            if (!blockHosts[i][j]) {
+                ret = ENOMEM;
+                goto done;
+            }
+            destroyLocalReference(env, jHost);
+            jHost = NULL;
+        } /*end host name loop */
+
+        destroyLocalReference(env, jFileBlockHosts);
+        jFileBlockHosts = NULL;
+        /* Release the per-block reference each iteration so local
+           references do not accumulate with the number of blocks. */
+        destroyLocalReference(env, jFileBlock);
+        jFileBlock = NULL;
+    } /*end block loop */
+    ret = 0;
+
+done:
+    destroyLocalReference(env, jPath);
+    destroyLocalReference(env, jFileStatus);
+    destroyLocalReference(env, jBlockLocations);
+    destroyLocalReference(env, jFileBlock);
+    destroyLocalReference(env, jFileBlockHosts);
+    destroyLocalReference(env, jHost);
+    if (ret) {
+        if (blockHosts) {
+            hdfsFreeHosts(blockHosts);
+        }
+        /* Report the failure reason like the other entry points do; set
+           after the cleanup so free() cannot disturb it. */
+        errno = ret;
+        return NULL;
+    }
+    return blockHosts;
+}
+/*
+ * Free a host table returned by hdfsGetHosts().
+ *
+ * `blockHosts` is a NULL-terminated array of NULL-terminated string
+ * arrays; every string, every inner array and the outer array are freed.
+ * Passing NULL is a no-op, mirroring free().
+ */
+void hdfsFreeHosts(char ***blockHosts)
+{
+    int i, j;
+
+    if (blockHosts == NULL) {
+        return;
+    }
+    for (i=0; blockHosts[i]; i++) {
+        for (j=0; blockHosts[i][j]; j++) {
+            free(blockHosts[i][j]);
+        }
+        free(blockHosts[i]);
+    }
+    free(blockHosts);
+}
+/*
+ * Return the value of FileSystem#getDefaultBlockSize() for `fs`.
+ *
+ * Returns -1 with errno set on failure: EINTERNAL (no JNIEnv) or the
+ * value returned by printExceptionAndFree() for a thrown exception.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "fs.getDefaultBlockSize(); */" appear to be missing from this
+ * copy.
+ */
+tOffset hdfsGetDefaultBlockSize(hdfsFS fs)
+ jobject jFS = (jobject)fs;
+ /*FileSystem#getDefaultBlockSize() */
+ jvalue jVal;
+ jthrowable jthr;
+ fs.getDefaultBlockSize(); */
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ "getDefaultBlockSize", "()J");
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetDefaultBlockSize: FileSystem#getDefaultBlockSize");
+ return -1;
+ }
+ return jVal.j;
+/*
+ * Return the default block size for `path`, obtained by building a Path
+ * object and passing it to the getDefaultBlockSize() helper.
+ *
+ * Returns -1 with errno set on failure: EINTERNAL (no JNIEnv) or the
+ * value returned by printExceptionAndFree() for a thrown exception. Note
+ * that the diagnostics below are prefixed "hdfsGetDefaultBlockSize", not
+ * with this function's own name.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "fs.getDefaultBlockSize(path); */" appear to be missing from
+ * this copy.
+ */
+tOffset hdfsGetDefaultBlockSizeAtPath(hdfsFS fs, const char *path)
+ fs.getDefaultBlockSize(path); */
+ jthrowable jthr;
+ jobject jFS = (jobject)fs;
+ jobject jPath;
+ tOffset blockSize;
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetDefaultBlockSize(path=%s): constructNewObjectOfPath",
+ path);
+ return -1;
+ }
+ jthr = getDefaultBlockSize(env, jFS, jPath, &blockSize);
+ /* The Path object is released whether or not the helper failed */
+ (*env)->DeleteLocalRef(env, jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetDefaultBlockSize(path=%s): "
+ "FileSystem#getDefaultBlockSize", path);
+ return -1;
+ }
+ return blockSize;
+/*
+ * Return the file system capacity: FileSystem#getStatus() followed by
+ * FsStatus#getCapacity().
+ *
+ * Returns -1 with errno set on failure: EINTERNAL (no JNIEnv) or the
+ * value returned by printExceptionAndFree() for a thrown exception.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "return Fss.getCapacity(); */" appear to be missing from this
+ * copy.
+ */
+tOffset hdfsGetCapacity(hdfsFS fs)
+ jobject jFS = (jobject)fs;
+ /*FileSystem#getStatus */
+ jvalue jVal;
+ jthrowable jthr;
+ jobject fss;
+ FsStatus fss = fs.getStatus();
+ return Fss.getCapacity(); */
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ "getStatus", "()Lorg/apache/hadoop/fs/FsStatus;");
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetCapacity: FileSystem#getStatus");
+ return -1;
+ }
+ fss = (jobject)jVal.l;
+ jthr = invokeMethod(env, &jVal, INSTANCE, fss, HADOOP_FSSTATUS,
+ "getCapacity", "()J");
+ /* The FsStatus object is released whether or not getCapacity() threw */
+ destroyLocalReference(env, fss);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetCapacity: FsStatus#getCapacity");
+ return -1;
+ }
+ return jVal.j;
+/*
+ * Return the used space of the file system: FileSystem#getStatus()
+ * followed by FsStatus#getUsed().
+ *
+ * Returns -1 with errno set on failure: EINTERNAL (no JNIEnv) or the
+ * value returned by printExceptionAndFree() for a thrown exception.
+ *
+ * NOTE(review): the function braces and the opening of the comment that
+ * ends at "return Fss.getUsed(); */" appear to be missing from this copy.
+ */
+tOffset hdfsGetUsed(hdfsFS fs)
+ jobject jFS = (jobject)fs;
+ /*FileSystem#getStatus */
+ jvalue jVal;
+ jthrowable jthr;
+ jobject fss;
+ FsStatus fss = fs.getStatus();
+ return Fss.getUsed(); */
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return -1;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ "getStatus", "()Lorg/apache/hadoop/fs/FsStatus;");
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetUsed: FileSystem#getStatus");
+ return -1;
+ }
+ fss = (jobject)jVal.l;
+ jthr = invokeMethod(env, &jVal, INSTANCE, fss, HADOOP_FSSTATUS,
+ "getUsed", "()J");
+ /* The FsStatus object is released whether or not getUsed() threw */
+ destroyLocalReference(env, fss);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetUsed: FsStatus#getUsed");
+ return -1;
+ }
+ return jVal.j;
+/*
+ * Fill a caller-supplied hdfsFileInfo from a Java
+ * org.apache.hadoop.fs.FileStatus object.
+ *
+ * @param env       JNI environment of the calling thread.
+ * @param jStat     Local reference to the FileStatus object to read.
+ * @param fileInfo  Entry to populate.  On failure it is passed to
+ *                  hdfsFreeFileInfoEntry(), so any string fields the caller
+ *                  left in it must be NULL or heap pointers.
+ *                  NOTE(review): the callers visible here hand in
+ *                  calloc()ed entries - confirm for any other caller.
+ *
+ * @return NULL on success; otherwise the exception describing the failure,
+ *         which the caller owns.
+ */
+static jthrowable
+getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo)
+{
+    jvalue jVal;
+    jthrowable jthr;
+    jobject jPath = NULL;
+    jstring jPathName = NULL;
+    jstring jUserName = NULL;
+    jstring jGroupName = NULL;
+    jobject jPermission = NULL;
+    const char *cPathName;
+    const char *cUserName;
+    const char *cGroupName;
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat,
+                     HADOOP_STAT, "isDir", "()Z");
+    if (jthr)
+        goto done;
+    fileInfo->mKind = jVal.z ? kObjectKindDirectory : kObjectKindFile;
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat,
+                     HADOOP_STAT, "getReplication", "()S");
+    if (jthr)
+        goto done;
+    fileInfo->mReplication = jVal.s;
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat,
+                     HADOOP_STAT, "getBlockSize", "()J");
+    if (jthr)
+        goto done;
+    fileInfo->mBlockSize = jVal.j;
+
+    /* The Java side reports times in milliseconds; store seconds. */
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat,
+                     HADOOP_STAT, "getModificationTime", "()J");
+    if (jthr)
+        goto done;
+    fileInfo->mLastMod = jVal.j / 1000;
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat,
+                     HADOOP_STAT, "getAccessTime", "()J");
+    if (jthr)
+        goto done;
+    fileInfo->mLastAccess = (tTime) (jVal.j / 1000);
+
+    /* The length is only queried for regular files. */
+    if (fileInfo->mKind == kObjectKindFile) {
+        jthr = invokeMethod(env, &jVal, INSTANCE, jStat,
+                         HADOOP_STAT, "getLen", "()J");
+        if (jthr)
+            goto done;
+        fileInfo->mSize = jVal.j;
+    }
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT,
+                     "getPath", "()Lorg/apache/hadoop/fs/Path;");
+    if (jthr)
+        goto done;
+    jPath = jVal.l;
+    if (jPath == NULL) {
+        jthr = newRuntimeError(env, "org.apache.hadoop.fs.FileStatus#"
+            "getPath returned NULL!");
+        goto done;
+    }
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jPath, HADOOP_PATH,
+                     "toString", "()Ljava/lang/String;");
+    if (jthr)
+        goto done;
+    jPathName = jVal.l;
+    cPathName =
+        (const char*) ((*env)->GetStringUTFChars(env, jPathName, NULL));
+    if (!cPathName) {
+        jthr = getPendingExceptionAndClear(env);
+        goto done;
+    }
+    fileInfo->mName = _strdup(cPathName);
+    (*env)->ReleaseStringUTFChars(env, jPathName, cPathName);
+    if (!fileInfo->mName) {
+        /* Report the failed copy instead of returning success with NULL. */
+        jthr = newRuntimeError(env, "getFileInfoFromStat: out of memory "
+            "copying path name");
+        goto done;
+    }
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT,
+                    "getOwner", "()Ljava/lang/String;");
+    if (jthr)
+        goto done;
+    jUserName = jVal.l;
+    cUserName =
+        (const char*) ((*env)->GetStringUTFChars(env, jUserName, NULL));
+    if (!cUserName) {
+        jthr = getPendingExceptionAndClear(env);
+        goto done;
+    }
+    fileInfo->mOwner = _strdup(cUserName);
+    (*env)->ReleaseStringUTFChars(env, jUserName, cUserName);
+    if (!fileInfo->mOwner) {
+        jthr = newRuntimeError(env, "getFileInfoFromStat: out of memory "
+            "copying owner name");
+        goto done;
+    }
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT,
+                    "getGroup", "()Ljava/lang/String;");
+    if (jthr)
+        goto done;
+    jGroupName = jVal.l;
+    cGroupName = (const char*) ((*env)->GetStringUTFChars(env, jGroupName, NULL));
+    if (!cGroupName) {
+        jthr = getPendingExceptionAndClear(env);
+        goto done;
+    }
+    fileInfo->mGroup = _strdup(cGroupName);
+    (*env)->ReleaseStringUTFChars(env, jGroupName, cGroupName);
+    if (!fileInfo->mGroup) {
+        jthr = newRuntimeError(env, "getFileInfoFromStat: out of memory "
+            "copying group name");
+        goto done;
+    }
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT,
+            "getPermission",
+            "()Lorg/apache/hadoop/fs/permission/FsPermission;");
+    if (jthr)
+        goto done;
+    if (jVal.l == NULL) {
+        /* The format string takes the class name as its %s argument. */
+        jthr = newRuntimeError(env, "%s#getPermission returned NULL!",
+            HADOOP_STAT);
+        goto done;
+    }
+    jPermission = jVal.l;
+    jthr = invokeMethod(env, &jVal, INSTANCE, jPermission, HADOOP_FSPERM,
+                         "toShort", "()S");
+    if (jthr)
+        goto done;
+    fileInfo->mPermissions = jVal.s;
+    jthr = NULL;
+
+done:
+    /* On any failure hand back a clean entry, not a half-filled one. */
+    if (jthr)
+        hdfsFreeFileInfoEntry(fileInfo);
+    /* Each local reference is released exactly once. */
+    destroyLocalReference(env, jPath);
+    destroyLocalReference(env, jPathName);
+    destroyLocalReference(env, jUserName);
+    destroyLocalReference(env, jGroupName);
+    destroyLocalReference(env, jPermission);
+    return jthr;
+}
+static jthrowable
+getFileInfo(JNIEnv *env, jobject jFS, jobject jPath, hdfsFileInfo **fileInfo)
+ fs.isDirectory(f)
+ fs.getModificationTime()
+ fs.getAccessTime()
+ fs.getLength(f)
+ f.getPath()
+ f.getOwner()
+ f.getGroup()
+ f.getPermission().toShort() */
+ jobject jStat;
+ jvalue jVal;
+ jthrowable jthr;
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS,
+ jPath);
+ if (jthr)
+ return jthr;
+ if (jVal.z == 0) {
+ *fileInfo = NULL;
+ return NULL;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS,
+ HADOOP_FS, "getFileStatus",
+ if (jthr)
+ return jthr;
+ jStat = jVal.l;
+ *fileInfo = calloc(1, sizeof(hdfsFileInfo));
+ if (!*fileInfo) {
+ destroyLocalReference(env, jStat);
+ return newRuntimeError(env, "getFileInfo: OOM allocating hdfsFileInfo");
+ }
+ jthr = getFileInfoFromStat(env, jStat, *fileInfo);
+ destroyLocalReference(env, jStat);
+ return jthr;
+hdfsFileInfo* hdfsListDirectory(hdfsFS fs, const char* path, int *numEntries)
+ Path p(path);
+ Path []pathList = fs.listPaths(p)
+ foreach path in pathList
+ getFileInfo(path) */
+ jthrowable jthr;
+ jobject jPath = NULL;
+ hdfsFileInfo *pathList = NULL;
+ jobjectArray jPathList = NULL;
+ jvalue jVal;
+ jsize jPathListSize = 0;
+ int ret;
+ jobject jFS = (jobject)fs;
+ /*Save path information in pathList */
+ jsize i;
+ jobject tmpStat;
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return NULL;
+ }
+ /*Create an object of org.apache.hadoop.fs.Path */
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsListDirectory(%s): constructNewObjectOfPath", path);
+ goto done;
+ }
+ jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_DFS, "listStatus",
+ jPath);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr,
+ "hdfsListDirectory(%s): FileSystem#listStatus", path);
+ goto done;
+ }
+ jPathList = jVal.l;
+ /*Figure out the number of entries in that directory */
+ jPathListSize = (*env)->GetArrayLength(env, jPathList);
+ if (jPathListSize == 0) {
+ ret = 0;
+ goto done;
+ }
+ /*Allocate memory */
+ pathList = calloc(jPathListSize, sizeof(hdfsFileInfo));
+ if (pathList == NULL) {
+ ret = ENOMEM;
+ goto done;
+ }
+ for (i=0; i < jPathListSize; ++i) {
+ tmpStat = (*env)->GetObjectArrayElement(env, jPathList, i);
+ if (!tmpStat) {
+ ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+ "hdfsListDirectory(%s): GetObjectArrayElement(%d out of %d)",
+ path, i, jPathListSize);
+ goto done;
+ }
+ jthr = getFileInfoFromStat(env, tmpStat, &pathList[i]);
+ destroyLocalReference(env, tmpStat);
+ if (jthr) {
+ ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsListDirectory(%s): getFileInfoFromStat(%d out of %d)",
+ path, i, jPathListSize);
+ goto done;
+ }
+ } /*end path list loop */
+ ret = 0;
+ destroyLocalReference(env, jPath);
+ destroyLocalReference(env, jPathList);
+ if (ret) {
+ hdfsFreeFileInfo(pathList, jPathListSize);
+ errno = ret;
+ return NULL;
+ }
+ *numEntries = jPathListSize;
+ return pathList;
+hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path)
+ jobject jFS = (jobject)fs;
+ /*Create an object of org.apache.hadoop.fs.Path */
+ jobject jPath;
+ jthrowable jthr;
+ hdfsFileInfo *fileInfo;
+ File f(path);
+ fs.isDirectory(f)
+ fs.lastModified() ??
+ fs.getLength(f)
+ f.getPath() */
+ /*Get the JNIEnv* corresponding to current thread */
+ JNIEnv* env = getJNIEnv();
+ if (env == NULL) {
+ errno = EINTERNAL;
+ return NULL;
+ }
+ jthr = constructNewObjectOfPath(env, path, &jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "hdfsGetPathInfo(%s): constructNewObjectOfPath", path);
+ return NULL;
+ }
+ jthr = getFileInfo(env, jFS, jPath, &fileInfo);
+ destroyLocalReference(env, jPath);
+ if (jthr) {
+ errno = printExceptionAndFree(env, jthr,
+ "hdfsGetPathInfo(%s): getFileInfo", path);
+ return NULL;
+ }
+ if (!fileInfo) {
+ errno = ENOENT;
+ return NULL;
+ }
+ return fileInfo;
+/*
+ * Free the heap strings owned by a single hdfsFileInfo entry and zero the
+ * whole entry, so that freeing it a second time is harmless.
+ *
+ * @param hdfsFileInfo  Entry to clear.  The entry itself is not freed.
+ */
+static void hdfsFreeFileInfoEntry(hdfsFileInfo *hdfsFileInfo)
+{
+    free(hdfsFileInfo->mName);
+    free(hdfsFileInfo->mOwner);
+    free(hdfsFileInfo->mGroup);
+    /*
+     * The parameter shadows the type name, so sizeof(hdfsFileInfo) would be
+     * the size of a pointer.  Size the pointed-to struct instead, otherwise
+     * the freed string pointers are left dangling.
+     */
+    memset(hdfsFileInfo, 0, sizeof(*hdfsFileInfo));
+}
+/*
+ * Free an array of hdfsFileInfo entries and the strings each entry owns.
+ *
+ * @param hdfsFileInfo  Array to free.  May be NULL, in which case nothing
+ *                      is done.
+ * @param numEntries    Number of entries in the array.
+ */
+void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries)
+{
+    int i;
+
+    /*
+     * hdfsListDirectory() calls this with a NULL array and a non-zero count
+     * when its allocation fails; do not dereference NULL in that case.
+     */
+    if (hdfsFileInfo == NULL) {
+        return;
+    }
+    /*Free the mName, mOwner, and mGroup */
+    for (i=0; i < numEntries; ++i) {
+        hdfsFreeFileInfoEntry(hdfsFileInfo + i);
+    }
+    /*Free entire block */
+    free(hdfsFileInfo);
+}
+ * vim: ts=4: sw=4: et:
+ */
diff --git a/import/pdclibhdfs/src/jni_helper.c b/import/pdclibhdfs/src/jni_helper.c
new file mode 100755
index 0000000..520a3ae
--- /dev/null
+++ b/import/pdclibhdfs/src/jni_helper.c
@@ -0,0 +1,1181 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*#include "config.h"*/
+#include "exception.h"
+#include "jni_helper.h"
+#include <stdio.h>
+#include <string.h>
+#ifdef WIN32
+#include "uthash.h"
+static void hdfsThreadDestructor(void *v);
+/** Pthreads thread-local storage for each library thread. */
+typedef struct HdfsTls {
+ JNIEnv *env;
+static JavaVM * hdfs_JVM = NULL;
+static short hdfs_InitLib = 0;
+#ifndef WIN32
+static void* hdfs_dl_handle = NULL;
+/*static pthread_rwlock_t hdfs_HashLock;*/
+static pthread_mutex_t hdfs_HashMutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t hdfs_JvmMutex = PTHREAD_MUTEX_INITIALIZER;
+static int hdfs_hashTableInited = 0;
+/* SetUp Control variables */
+static pthread_once_t hdfs_threadInit_Once = PTHREAD_ONCE_INIT;
+static pthread_once_t hdfs_hashTable_Once = PTHREAD_ONCE_INIT;
+/** nonzero if we succeeded in initializing gTlsKey */
+static int hdfs_gTlsKeyInitialized = 0;
+/** Key that allows us to retrieve thread-local storage */
+static pthread_key_t hdfs_gTlsKey = 0;
+#define LOCK_HASH_TABLE() pthread_mutex_lock(&hdfs_HashMutex)
+#define UNLOCK_HASH_TABLE() pthread_mutex_unlock(&hdfs_HashMutex)
+typedef struct {
+ const char * key;
+ void * cls;
+ UT_hash_handle hh;
+PHASHITEM hdfs_HashTls = NULL;
+static DWORD hdfs_dwTlsIndex1 = 0;
+static HINSTANCE hdfs_hinstLib = 0;
+static HANDLE hdfs_JvmMutex = 0;
+static PSRWLOCK hdfs_HashLock = NULL;
+#ifdef WIN32
+#define LOCK_JVM_MUTEX() \
+dwWaitResult = WaitForSingleObject(hdfs_JvmMutex,INFINITE)
+#define LOCK_JVM_MUTEX() \
+#ifdef WIN32
+#define UNLOCK_JVM_MUTEX() \
+ ReleaseMutex(hdfs_JvmMutex)
+#define UNLOCK_JVM_MUTEX() \
+ pthread_mutex_unlock(&hdfs_JvmMutex)
+/** The Native return types that methods could return */
+#define JVOID 'V'
+#define JOBJECT 'L'
+#define JARRAYOBJECT '['
+#define JBOOLEAN 'Z'
+#define JBYTE 'B'
+#define JCHAR 'C'
+#define JSHORT 'S'
+#define JINT 'I'
+#define JLONG 'J'
+#define JFLOAT 'F'
+#define JDOUBLE 'D'
+ * MAX_HASH_TABLE_ELEM: The maximum no. of entries in the hashtable.
+ * It's set to 4096 to account for (classNames + No. of threads)
+ */
+#define MAX_HASH_TABLE_ELEM 4096
+#ifndef WIN32
+/* Invoked By "pthread_once" to create the thread key */
+static void Make_Thread_Key()
+ int ret = pthread_key_create(&hdfs_gTlsKey, hdfsThreadDestructor);
+ if (ret) {
+ fprintf(stderr, "getJNIEnv: pthread_key_create failed with "
+ "error %d\n", ret);
+ return;
+ }
+ hdfs_gTlsKeyInitialized = 1;
+/* Invoked By "pthread_once" to create the Hash Table */
+static void hashTableInit ()
+ /* if ( pthread_rwlock_init (&hdfs_HashLock, NULL) != 0 ) {
+ fprintf (stderr, "can't create rwlock for hash table");
+ return;
+ } */
+ if ( hcreate(MAX_HASH_TABLE_ELEM) == 0 ) {
+ fprintf ( stderr, "error creating hashtable, <%d>: %s\n",
+ errno, strerror(errno) );
+ return;
+ }
+ hdfs_hashTableInited = 1;
+/*
+ * Add a key/data pair to the hsearch() table, creating the table on first
+ * use.
+ *
+ * @param key   Entry key.  The pointer itself is stored, not a copy.
+ *              NOTE(review): this assumes the string outlives the table -
+ *              confirm against callers.
+ * @param data  Value to associate with the key.
+ *
+ * @return 0 if the arguments were NULL (nothing stored) or the insert was
+ *         attempted; -1 if the table could not be initialised.  A failed
+ *         insert is only logged.
+ */
+static int insertEntryIntoTable ( const char *key, void *data )
+{
+    ENTRY e, *ep = NULL;
+    int savedErrno = 0;
+
+    if (key == NULL || data == NULL) {
+        return 0;
+    }
+    pthread_once ( &hdfs_hashTable_Once, hashTableInit );
+    if ( !hdfs_hashTableInited ) {
+        return -1;
+    }
+    e.data = data;
+    e.key = (char*) key;
+    /* hsearch() works on one shared table, so serialise access to it. */
+    LOCK_HASH_TABLE();
+    ep = hsearch(e, ENTER);
+    savedErrno = errno;
+    UNLOCK_HASH_TABLE();
+    if (ep == NULL) {
+        fprintf(stderr, "warn adding key (%s) to hash table, <%d>: %s\n",
+                key, savedErrno, strerror(savedErrno));
+    }
+    return 0;
+}
+/*
+ * Look up a key in the hsearch() table, creating the table on first use.
+ *
+ * @param key  Key to look up; NULL yields NULL.
+ *
+ * @return The data stored for the key, or NULL if the key is absent or the
+ *         table could not be initialised.
+ */
+static void* searchEntryFromTable ( const char *key )
+{
+    ENTRY e, *ep = NULL;
+
+    if (key == NULL) {
+        return NULL;
+    }
+    pthread_once ( &hdfs_hashTable_Once, hashTableInit );
+    if ( !hdfs_hashTableInited ) {
+        return NULL;
+    }
+    e.key = (char*)key;
+    e.data = NULL;
+    /* hsearch() works on one shared table, so serialise access to it. */
+    LOCK_HASH_TABLE();
+    ep = hsearch(e, FIND);
+    UNLOCK_HASH_TABLE();
+    if (ep != NULL) {
+        return ep->data;
+    }
+    return NULL;
+}
+/*
+ * Add a key/class pair to the uthash table under the exclusive side of
+ * hdfs_HashLock.
+ *
+ * @param key  Entry key.  The pointer itself is stored, not a copy.
+ *             NOTE(review): this assumes the string outlives the table -
+ *             confirm against callers.
+ * @param cls  Value to associate with the key.
+ *
+ * @return 0 if the arguments were NULL (nothing stored) or the entry was
+ *         added; -1 if the entry could not be allocated.
+ */
+static int insertEntryIntoTable ( const char *key, void *cls )
+{
+    PHASHITEM item;
+
+    if (key == NULL || cls == NULL) {
+        return 0;
+    }
+    item = (PHASHITEM) calloc ( 1, sizeof(HASHITEM) );
+    if (item == NULL) {
+        /* Without this check the stores below would write through NULL. */
+        fprintf (stderr, "insertEntryIntoTable: out of memory adding key (%s)\n",
+                 key);
+        return -1;
+    }
+    item->key = key;
+    item->cls = cls;
+    AcquireSRWLockExclusive ( hdfs_HashLock );
+    HASH_ADD_KEYPTR ( hh, hdfs_HashTls, item->key, strlen(item->key), item );
+    ReleaseSRWLockExclusive ( hdfs_HashLock );
+    return 0;
+}
+static void* searchEntryFromTable ( const char *key )
+ DWORD dwWaitResult;
+ if (key == NULL) {
+ return NULL;
+ }
+ AcquireSRWLockShared ( hdfs_HashLock );
+ if (!hdfs_HashTls) {
+ // char *mykey = "dummy";
+ ReleaseSRWLockShared ( hdfs_HashLock );
+ /* add dummy entry for testing */
+ // insertEntryIntoTable ( mykey, -1 );
+ // HASH_FIND_STR( hdfs_HashTls, mykey, item);
+ // printf ("Test Hash mykey: %s\n", item->key);
+ return NULL;
+ }
+ HASH_FIND_STR( hdfs_HashTls, key, item);
+ ReleaseSRWLockShared ( hdfs_HashLock );
+ if (item != NULL) {
+ return item->cls;
+ }
+ return NULL;
+ * The function that is called whenever a thread with libhdfs thread local data
+ * is destroyed.
+ *
+ * @param v The thread-local data
+ */
+static void hdfsThreadDestructor(void *v)
+ HDFSTLS *tls = v;
+ JavaVM *vm;
+ JNIEnv *env = tls->env;
+ jint ret;
+ ret = (*env)->GetJavaVM(env, &vm);
+ if (ret) {
+ fprintf(stderr, "hdfsThreadDestructor: GetJavaVM failed with "
+ "error %d\n", ret);
+ (*env)->ExceptionDescribe(env);
+ } else {
+ (*vm)->DetachCurrentThread(vm);
+ }
+ free(tls);
+#ifdef WIN32
+ HINSTANCE hinstDLL, // handle to DLL module
+ DWORD fdwReason, // reason for calling function
+ LPVOID lpReserved ) // reserved
+ HDFSTLS *tls = NULL;
+ // Perform actions based on the reason for calling.
+ switch( fdwReason )
+ {
+ if ((hdfs_dwTlsIndex1 = TlsAlloc()) == TLS_OUT_OF_INDEXES)
+ return FALSE;
+ hdfs_HashLock = (PSRWLOCK) calloc ( 1, sizeof ( SRWLOCK ) );
+ if (!hdfs_HashLock) {
+ fprintf (stderr, " Could not allocate Hash Table Lock\n" );
+ return FALSE;
+ }
+ InitializeSRWLock ( hdfs_HashLock );
+ hdfs_JvmMutex = CreateMutex (
+ NULL, // default security attributes
+ FALSE, // initially not owned
+ NULL ); // unnamed mutex
+ if (hdfs_JvmMutex == NULL)
+ {
+ fprintf (stderr, "Create JVM Mutex error: %d\n", GetLastError() );
+ return FALSE;
+ }
+ fprintf (stderr, "dll attached\n" );
+ fprintf (stderr, "dll: tls1=%d\n", hdfs_dwTlsIndex1 );
+ // Initialize once for each new process.
+ // Return FALSE to fail DLL load.
+ break;
+ // Do thread-specific initialization.
+ fprintf (stderr, "dll: thread attach\n" );
+ break;
+ // Do thread-specific cleanup.
+ fprintf (stderr, "dll: detach thread\n" );
+ tls = TlsGetValue(hdfs_dwTlsIndex1);
+ if (tls) {
+ fprintf (stderr, "dll thread: invoke thread destructor\n" );
+ hdfsThreadDestructor(tls);
+ TlsSetValue(hdfs_dwTlsIndex1, NULL);
+ }
+ break;
+ // Perform any necessary cleanup.
+ fprintf (stderr, "dll: detach process\n" );
+ if (hdfs_HashTls) {
+ PHASHITEM item, tmp;
+ fprintf (stderr, "dll: clean up hash table\n" );
+ HASH_ITER ( hh, hdfs_HashTls, item, tmp ) {
+ fprintf (stderr, "dll: delete key %s\n", item->key );
+ HASH_DEL ( hdfs_HashTls, item );
+ free ( item );
+ }
+ }
+ tls = TlsGetValue(hdfs_dwTlsIndex1);
+ if (tls) {
+ fprintf (stderr, "dll: invoke thread destructor\n" );
+ hdfsThreadDestructor(tls);
+ }
+ // Release the TLS index.
+ TlsFree(hdfs_dwTlsIndex1);
+ if (hdfs_JvmMutex) CloseHandle(hdfs_JvmMutex);
+ if (hdfs_HashLock) free(hdfs_HashLock);
+ fprintf (stderr, "dll detached\n" );
+ break;
+ }
+ return TRUE; // Successful DLL_PROCESS_ATTACH.
+// jelson changes: start
+BOOL CALLBACK staticLibInit(PINIT_ONCE InitOnce, PVOID Parameter, PVOID *lpContext)
+void maybePerformStaticLibInit()
+ InitOnceExecuteOnce(&g_InitOnce, staticLibInit, NULL, NULL);
+// jelson changes: end
+int hdfsLibInit ( void * parms )
+ JNIEnv* env = getJNIEnv();
+ if (!env) return 1;
+ hdfs_InitLib = 1;
+ return 0;
+// test compiler
+void destroyLocalReference(JNIEnv *env, jobject jObject)
+ if (jObject)
+ (*env)->DeleteLocalRef(env, jObject);
+static jthrowable validateMethodType(JNIEnv *env, MethType methType)
+ if (methType != STATIC && methType != INSTANCE) {
+ return newRuntimeError(env, "validateMethodType(methType=%d): "
+ "illegal method type.\n", methType);
+ }
+ return NULL;
+jthrowable newJavaStr(JNIEnv *env, const char *str, jstring *out)
+ jstring jstr;
+ if (!str) {
+ /* Can't pass NULL to NewStringUTF: the result would be
+ * implementation-defined. */
+ *out = NULL;
+ return NULL;
+ }
+ jstr = (*env)->NewStringUTF(env, str);
+ if (!jstr) {
+ /* If NewStringUTF returns NULL, an exception has been thrown,
+ * which we need to handle. Probaly an OOM. */
+ return getPendingExceptionAndClear(env);
+ }
+ *out = jstr;
+ return NULL;
+/*
+ * Copy a Java string into a newly allocated C string.
+ *
+ * @param env   JNI environment of the calling thread.
+ * @param jstr  Java string to copy; NULL yields a NULL C string.
+ * @param out   (out) Receives the copy, which the caller must free().
+ *              Not written when an exception is returned.
+ *
+ * @return NULL on success; otherwise the exception describing the failure.
+ */
+jthrowable newCStr(JNIEnv *env, jstring jstr, char **out)
+{
+    const char *tmp;
+    char *copy;
+
+    if (!jstr) {
+        *out = NULL;
+        return NULL;
+    }
+    tmp = (*env)->GetStringUTFChars(env, jstr, NULL);
+    if (!tmp) {
+        return getPendingExceptionAndClear(env);
+    }
+    copy = _strdup(tmp);
+    (*env)->ReleaseStringUTFChars(env, jstr, tmp);
+    if (!copy) {
+        /* A failed copy must not look like a NULL Java string. */
+        return newRuntimeError(env, "newCStr: out of memory");
+    }
+    *out = copy;
+    return NULL;
+}
+jthrowable invokeMethod ( JNIEnv *env, jvalue *retval, MethType methType,
+ jobject instObj, const char *className,
+ const char *methName, const char *methSignature, ... )
+ va_list args;
+ jclass cls;
+ jmethodID mid = NULL;
+ jthrowable jthr;
+ const char *str;
+ char returnType;
+ jthr = validateMethodType(env, methType);
+ if (jthr)
+ return jthr;
+ jthr = globalClassReference(className, env, &cls);
+ if (jthr)
+ return jthr;
+ jthr = methodIdFromClass(className, methName, methSignature,
+ methType, env, &mid);
+ if (jthr) {
+ return jthr;
+ }
+ str = methSignature;
+ while (*str != ')') str++;
+ str++;
+ returnType = *str;
+// printf ("Begin Method Invokation:%s ## %s\n", className, methName );
+ va_start(args, methSignature);
+ if (returnType == JOBJECT || returnType == JARRAYOBJECT) {
+ jobject jobj = NULL;
+ if (methType == STATIC) {
+ jobj = (*env)->CallStaticObjectMethodV(env, cls, mid, args);
+ }
+ else if (methType == INSTANCE) {
+ jobj = (*env)->CallObjectMethodV(env, instObj, mid, args);
+ }
+ retval->l = jobj;
+ }
+ else if (returnType == JVOID) {
+ if (methType == STATIC) {
+ (*env)->CallStaticVoidMethodV(env, cls, mid, args);
+ }
+ else if (methType == INSTANCE) {
+ (*env)->CallVoidMethodV(env, instObj, mid, args);
+ }
+ }
+ else if (returnType == JBOOLEAN) {
+ jboolean jbool = 0;
+ if (methType == STATIC) {
+ jbool = (*env)->CallStaticBooleanMethodV(env, cls, mid, args);
+ }
+ else if (methType == INSTANCE) {
+ jbool = (*env)->CallBooleanMethodV(env, instObj, mid, args);
+ }
+ retval->z = jbool;
+ }
+ else if (returnType == JSHORT) {
+ jshort js = 0;
+ if (methType == STATIC) {
+ js = (*env)->CallStaticShortMethodV(env, cls, mid, args);
+ }
+ else if (methType == INSTANCE) {
+ js = (*env)->CallShortMethodV(env, instObj, mid, args);
+ }
+ retval->s = js;
+ }
+ else if (returnType == JLONG) {
+ jlong jl = -1;
+ if (methType == STATIC) {
+ jl = (*env)->CallStaticLongMethodV(env, cls, mid, args);
+ }
+ else if (methType == INSTANCE) {
+ jl = (*env)->CallLongMethodV(env, instObj, mid, args);
+ }
+ retval->j = jl;
+ }
+ else if (returnType == JINT) {
+ jint ji = -1;
+ if (methType == STATIC) {
+ ji = (*env)->CallStaticIntMethodV(env, cls, mid, args);
+ }
+ else if (methType == INSTANCE) {
+ ji = (*env)->CallIntMethodV(env, instObj, mid, args);
+ }
+ retval->i = ji;
+ }
+ va_end(args);
+// printf ("End Method Invokation\n");
+ jthr = (*env)->ExceptionOccurred(env);
+ if (jthr) {
+ (*env)->ExceptionClear(env);
+ return jthr;
+ }
+// printf ("Method success\n");
+ return NULL;
+jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, const char *className,
+ const char *ctorSignature, ...)
+ va_list args;
+ jclass cls;
+ jmethodID mid = NULL;
+ jobject jobj;
+ jthrowable jthr;
+ jthr = globalClassReference(className, env, &cls);
+ if (jthr)
+ return jthr;
+ jthr = methodIdFromClass(className, "<init>", ctorSignature,
+ INSTANCE, env, &mid);
+ if (jthr)
+ return jthr;
+ va_start(args, ctorSignature);
+ jobj = (*env)->NewObjectV(env, cls, mid, args);
+ va_end(args);
+ if (!jobj)
+ return getPendingExceptionAndClear(env);
+ *out = jobj;
+ return NULL;
+jthrowable methodIdFromClass(const char *className, const char *methName,
+ const char *methSignature, MethType methType,
+ JNIEnv *env, jmethodID *out)
+ jclass cls;
+ jthrowable jthr;
+ jmethodID mid = NULL;
+ jthr = validateMethodType(env, methType);
+ if (jthr)
+ return jthr;
+ jthr = globalClassReference(className, env, &cls);
+ if (jthr)
+ return jthr;
+ if (methType == STATIC) {
+ mid = (*env)->GetStaticMethodID(env, cls, methName, methSignature);
+ }
+ else if (methType == INSTANCE) {
+ mid = (*env)->GetMethodID(env, cls, methName, methSignature);
+ }
+ if (mid == NULL) {
+ fprintf(stderr, "could not find method %s from class %s with "
+ "signature %s\n", methName, className, methSignature);
+ return getPendingExceptionAndClear(env);
+ }
+ *out = mid;
+ return NULL;
+jthrowable globalClassReference(const char *className, JNIEnv *env, jclass *out)
+ jclass clsLocalRef;
+ jclass cls = searchEntryFromTable(className);
+ if (cls) {
+ *out = cls;
+ return NULL;
+ }
+ clsLocalRef = (*env)->FindClass(env,className);
+ if (clsLocalRef == NULL) {
+ return getPendingExceptionAndClear(env);
+ }
+ cls = (*env)->NewGlobalRef(env, clsLocalRef);
+ if (cls == NULL) {
+ (*env)->DeleteLocalRef(env, clsLocalRef);
+ return getPendingExceptionAndClear(env);
+ }
+ (*env)->DeleteLocalRef(env, clsLocalRef);
+ insertEntryIntoTable(className, cls);
+ *out = cls;
+ return NULL;
+jthrowable classNameOfObject(jobject jobj, JNIEnv *env, char **name)
+ jthrowable jthr;
+ jclass cls, clsClass = NULL;
+ jmethodID mid = NULL;
+ jstring str = NULL;
+ const char *cstr = NULL;
+ char *newstr;
+ cls = (*env)->GetObjectClass(env, jobj);
+ if (cls == NULL) {
+ jthr = getPendingExceptionAndClear(env);
+ goto done;
+ }
+ clsClass = (*env)->FindClass(env, "java/lang/Class");
+ if (clsClass == NULL) {
+ jthr = getPendingExceptionAndClear(env);
+ goto done;
+ }
+ mid = (*env)->GetMethodID(env, clsClass, "getName", "()Ljava/lang/String;");
+ if (mid == NULL) {
+ jthr = getPendingExceptionAndClear(env);
+ goto done;
+ }
+ str = (*env)->CallObjectMethod(env, cls, mid);
+ if (str == NULL) {
+ jthr = getPendingExceptionAndClear(env);
+ goto done;
+ }
+ cstr = (*env)->GetStringUTFChars(env, str, NULL);
+ if (!cstr) {
+ jthr = getPendingExceptionAndClear(env);
+ goto done;
+ }
+ newstr = _strdup(cstr);
+ if (newstr == NULL) {
+ jthr = newRuntimeError(env, "classNameOfObject: out of memory");
+ goto done;
+ }
+ *name = newstr;
+ jthr = NULL;
+ destroyLocalReference(env, cls);
+ destroyLocalReference(env, clsClass);
+ if (str) {
+ if (cstr)
+ (*env)->ReleaseStringUTFChars(env, str, cstr);
+ (*env)->DeleteLocalRef(env, str);
+ }
+ return jthr;
+ * Get the global JNI environment.
+ *
+ * We only have to create the JVM once. After that, we can use it in
+ * every thread. You must be holding the jvmMutex when you call this
+ * function.
+ *
+ * @return The JNIEnv on success; NULL otherwise
+ */
+static JNIEnv* getGlobalJNIEnv(void)
+ const jsize vmBufLength = 1;
+ JavaVM* vmBuf[1];
+ JNIEnv *env = NULL;
+ jint rv = 0;
+ jint noVMs = 0;
+ jthrowable jthr;
+ /*JavaVM *vm = NULL; */
+ char *error = NULL;
+ typedef jint (*FGetVMS) (JavaVM**, const jsize, jint* );
+ FGetVMS fpGetVM = NULL;
+ typedef jint (*FCreateVM) (JavaVM**, void**, JavaVMInitArgs* );
+ FCreateVM fpCreateVM = NULL;
+// printf ( "Get Global JNI\n" );
+ #ifndef WIN32
+ char *JVMPath = getenv("LIBHDFS_JVM_PATH");
+ char jvmPath [2000] = "";
+ if (JVMPath) {
+ strcpy (jvmPath, JVMPath);
+ } else {
+ strcpy (jvmPath, "libjvm.so");
+ }
+ if (!hdfs_dl_handle) {
+ hdfs_dl_handle = (void*) dlopen( jvmPath, 0 );
+ if (!hdfs_dl_handle) {
+ printf( "!!! %s\n", dlerror() );
+ return NULL;
+ }
+ }
+ if (hdfs_dl_handle) {
+ fpGetVM = (FGetVMS) dlsym( hdfs_dl_handle, "JNI_GetCreatedJavaVMs" );
+ error = (char*) dlerror();
+ if (error != NULL) {
+ fprintf(stderr, "!!! %s\n", error );
+ return NULL;
+ }
+ fpCreateVM = (FCreateVM) dlsym( hdfs_dl_handle, "JNI_CreateJavaVM" );
+ error = (char*) dlerror();
+ if (error != NULL) {
+ fprintf(stderr, "!!! %s\n", error );
+ return NULL;
+ }
+ }
+ #else
+ if (hdfs_hinstLib == NULL) {
+ wchar_t *env_libhdfs_jvm_path;
+ size_t env_libhdfs_jvm_path_size;
+ wchar_t jvmPath[2000];
+ fprintf (stderr, "libhdfs: loading jvm\n" );
+ _wdupenv_s(&env_libhdfs_jvm_path, &env_libhdfs_jvm_path_size, L"LIBHDFS_JVM_PATH");
+ if (env_libhdfs_jvm_path != NULL) {
+ fprintf(stderr, "using value from LIBHDFS_JVM_PATH: %S\n", env_libhdfs_jvm_path);
+ wcscpy_s (jvmPath, _countof(jvmPath), env_libhdfs_jvm_path);
+ } else {
+ wchar_t *env_java_home;
+ size_t env_java_home_size;
+ _wdupenv_s(&env_java_home, &env_java_home_size, L"JAVA_HOME");
+ if (env_java_home != NULL) {
+ _snwprintf(jvmPath, _countof(jvmPath), L"%s\\jre\\bin\\server\\jvm.dll", env_java_home);
+ fprintf(stderr, "Found JAVA_HOME of %S; trying jvm path of '%S'\n", env_java_home, jvmPath);
+ } else {
+ wcscpy_s (jvmPath, _countof(jvmPath), L"c:\\program files\\java\\jre\\bin\\server\\jvm.dll");
+ fprintf(stderr, "LIBHDFS_JVM_PATH and JAVA_HOME both not set; blindingly trying %S\n", jvmPath);
+ }
+ }
+ hdfs_hinstLib = LoadLibrary ( jvmPath );
+ if (!hdfs_hinstLib) {
+ LPVOID lpMsgBuf;
+ DWORD dw = GetLastError();
+ FormatMessage (
+ dw,
+ (LPTSTR) &lpMsgBuf,
+ 0, NULL );
+ fprintf (stderr, "jvm load failed\n" );
+ fprintf (stderr, "Error Code:%d %s\n", dw, lpMsgBuf );
+ LocalFree(lpMsgBuf);
+ return NULL;
+ }
+ } /* endif load the dll */
+ if (hdfs_hinstLib)
+ {
+ // fprintf (stderr, "dll: get proc addresses\n" );
+ fpGetVM = (FGetVMS) GetProcAddress ( hdfs_hinstLib,
+ "JNI_GetCreatedJavaVMs" );
+ if (!fpGetVM) {
+ fprintf (stderr, "dll: could not get proc 'JNI_GetCreatedJavaVMs'\n" );
+ return NULL;
+ }
+ fpCreateVM = (FCreateVM) GetProcAddress ( hdfs_hinstLib,
+ "JNI_CreateJavaVM" );
+ if (!fpCreateVM) {
+ fprintf (stderr, "dll: could not get proc 'JNI_CreateJavaVM'\n" );
+ return NULL;
+ }
+ }
+ #endif
+ /*rv = JNI_GetCreatedJavaVMs(vmBuf, vmBufLength, &noVMs);*/
+ rv = (fpGetVM) (vmBuf, vmBufLength, &noVMs);
+ if (rv != 0) {
+ fprintf(stderr, "JNI_GetCreatedJavaVMs failed with error: %d\n", rv);
+ return NULL;
+ }
+ if (noVMs == 0) {
+ /*Get the environment variables for initializing the JVM */
+ char *hadoopClassPath = getenv("LIBHDFS_CLASSPATH");
+ char *hadoopClassPathVMArg = "-Djava.class.path=";
+ /*char *hadoopClassPathVMArg = "-cp ";*/
+ int optHadoopClassPathLen;
+ char *optHadoopClassPath = NULL;
+ int noArgs = 1, cnt = 0;
+ char *hadoopJvmArgs = NULL;
+ char jvmArgDelims[] = " ";
+ char *str, *token, *savePtr;
+ JavaVMOption *options = NULL;
+ JavaVMInitArgs vm_args;
+ if (hadoopClassPath == NULL) {
+ fprintf(stderr, "libhdfs: Environment variable LIBHDFS_CLASSPATH not set!\n");
+ return NULL;
+ }
+ optHadoopClassPathLen = strlen(hadoopClassPath) +
+ strlen(hadoopClassPathVMArg) + 1;
+ optHadoopClassPath = malloc(sizeof(char)*optHadoopClassPathLen);
+ /* snprintf ( optHadoopClassPath, optHadoopClassPathLen,
+ "%s%s", hadoopClassPathVMArg, hadoopClassPath );*/
+ #if 0
+ savePtr = strdup(hadoopClassPath);
+ str = strdup(hadoopClassPath);
+ savePtr[0] = 0;
+ cnt = strlen (str);
+ token = strtok (str, "\\");
+ while (token) {
+ strcat (savePtr,token);
+ if (strlen(savePtr) == cnt) break;
+ strcat (savePtr,"/");
+ token = strtok (NULL, "\\");
+ }
+ sprintf ( optHadoopClassPath,
+ "%s%s", hadoopClassPathVMArg, savePtr );
+ free (savePtr);
+ free (str);
+ #endif
+ sprintf ( optHadoopClassPath,
+ "%s%s", hadoopClassPathVMArg, hadoopClassPath );
+ /* Determine the # of LIBHDFS_OPTS args */
+ hadoopJvmArgs = getenv("LIBHDFS_OPTS");
+ if (hadoopJvmArgs != NULL) {
+ hadoopJvmArgs = _strdup(hadoopJvmArgs);
+ for (noArgs = 1, str = hadoopJvmArgs; ; noArgs++, str = NULL) {
+ /* token = strtok_r(str, jvmArgDelims, &savePtr);*/
+ token = strtok (str, jvmArgDelims);
+ if (NULL == token) {
+ break;
+ }
+ }
+ free(hadoopJvmArgs);
+ }
+ /* Now that we know the # args, populate the options array */
+ options = calloc(noArgs, sizeof(JavaVMOption));
+ options[0].optionString = optHadoopClassPath;
+ hadoopJvmArgs = getenv("LIBHDFS_OPTS");
+ if (hadoopJvmArgs != NULL) {
+ hadoopJvmArgs = _strdup(hadoopJvmArgs);
+ for (noArgs = 1, str = hadoopJvmArgs; ; noArgs++, str = NULL) {
+ /* token = strtok_r(str, jvmArgDelims, &savePtr);*/
+ token = strtok (str, jvmArgDelims);
+ if (NULL == token) {
+ break;
+ }
+ options[noArgs].optionString = token;
+ }
+ }
+ /*Create the VM */
+ vm_args.version = JNI_VERSION_1_2;
+ vm_args.options = options;
+ vm_args.nOptions = noArgs;
+ vm_args.ignoreUnrecognized = 1;
+ /*rv = JNI_CreateJavaVM(&vm, (void**) &env, &vm_args);*/
+ rv = (fpCreateVM) (&hdfs_JVM, (void**) &env, &vm_args);
+ if (hadoopJvmArgs != NULL) {
+ free(hadoopJvmArgs);
+ }
+ free(optHadoopClassPath);
+ free(options);
+ if (rv != 0) {
+ fprintf(stderr, "Call to JNI_CreateJavaVM failed "
+ "with error: %d\n", rv);
+ return NULL;
+ }
+ fprintf (stderr, "dll: jvm created\n" );
+ /*This is not backwards comaptible */
+ /*
+ jthr = invokeMethod ( env, NULL, STATIC, NULL,
+ "org/apache/hadoop/fs/FileSystem",
+ "loadFileSystems", "()V" );
+ if (jthr) {
+ printExceptionAndFree ( env, jthr, PRINT_EXC_ALL,
+ "loadFileSystems" );
+ return NULL;
+ }
+ printf ( "dll: return from GetEnv\n" ); */
+ return env;
+ }
+ // fprintf (stderr, "dll: attach current thread \n" );
+ /*Attach this thread to the VM */
+ /*vm = vmBuf[0];
+ rv = (*vm)->AttachCurrentThread(vm, (void**) &env, 0); */
+ if (!hdfs_JVM) hdfs_JVM = vmBuf[0];
+ rv = (*hdfs_JVM)->AttachCurrentThread(hdfs_JVM, (void**) &env, 0);
+ if (rv != 0) {
+ fprintf(stderr, "Call to AttachCurrentThread "
+ "failed with error: %d\n", rv);
+ return NULL;
+ }
+ // fprintf (stderr, "dll: return from GetEnv attach \n" );
+ return env;
+ * getJNIEnv: A helper function to get the JNIEnv* for the given thread.
+ * If no JVM exists, then one will be created. JVM command line arguments
+ * are obtained from the LIBHDFS_OPTS environment variable.
+ *
+ * Implementation note: we rely on POSIX thread-local storage (tls).
+ * This allows us to associate a destructor function with each thread, that
+ * will detach the thread from the Java VM when the thread terminates. If we
+ * fail to do this, it will cause a memory leak.
+ *
+ * However, POSIX TLS is not the most efficient way to do things. It requires a
+ * key to be initialized before it can be used. Since we don't know if this key
+ * is initialized at the start of this function, we have to lock a mutex first
+ * and check. Luckily, most operating systems support the more efficient
+ * __thread construct, which is initialized by the linker.
+ *
+ * @param: None.
+ * @return The JNIEnv* corresponding to the thread.
+ */
+JNIEnv* getJNIEnv(void)
+ JNIEnv *env = NULL;
+ HDFSTLS *tls = NULL;
+ int ret = 0;
+ jint rv = 0;
+#ifdef WIN32
+ DWORD dwWaitResult;
+ maybePerformStaticLibInit();
+ tls = TlsGetValue(hdfs_dwTlsIndex1);
+ if (tls) return tls->env;
+ static __thread HDFSTLS *quickTls = NULL;
+ if (quickTls) return quickTls->env;
+#ifndef WIN32
+ pthread_once(&hdfs_threadInit_Once, Make_Thread_Key);
+ if (!hdfs_gTlsKeyInitialized)
+ return NULL;
+ tls = pthread_getspecific(hdfs_gTlsKey);
+ if (tls) {
+ return tls->env;
+ }
+ if (!hdfs_InitLib) {
+ env = getGlobalJNIEnv();
+ } else {
+ rv = (*hdfs_JVM)->AttachCurrentThread(hdfs_JVM, (void**) &env, 0);
+ if (rv != 0) {
+ fprintf(stderr, "Call to AttachCurrentThread "
+ "failed with error: %d\n", rv);
+ return NULL;
+ }
+ }
+ if (!env) {
+ fprintf(stderr, "getJNIEnv: getGlobalJNIEnv failed\n");
+ return NULL;
+ }
+ tls = calloc ( 1, sizeof(HDFSTLS) );
+ if (!tls) {
+ fprintf(stderr, "getJNIEnv: OOM allocating %zd bytes\n",
+ sizeof(HDFSTLS) );
+ return NULL;
+ }
+ tls->env = env;
+#ifdef WIN32
+ // fprintf (stderr, "dll: save environment\n" );
+ if (!TlsSetValue(hdfs_dwTlsIndex1, tls))
+ return NULL;
+ return env;
+ quickTls = tls;
+ return env;
+#ifndef WIN32
+ ret = pthread_setspecific(hdfs_gTlsKey, tls);
+ if (ret) {
+ fprintf(stderr, "getJNIEnv: pthread_setspecific failed with "
+ "error code %d\n", ret);
+ hdfsThreadDestructor(tls);
+ return NULL;
+ }
+ return env;
diff --git a/import/pdclibhdfs/src/makefile.twb b/import/pdclibhdfs/src/makefile.twb
new file mode 100755
index 0000000..35f080d
--- /dev/null
+++ b/import/pdclibhdfs/src/makefile.twb
@@ -0,0 +1,88 @@
+CNFP_TOPDIR := ../../
+CNFP_CURDIR := pdclibhdfs/src/
+PRODUCT_CONFIG_XML_FILES := config/twb.tdprod.xml
+include $(CNFP_TOPDIR)../tdv/etc/build_common.mk
+LOCAL_LEAFMKFILE := $(CNFP_TOPDIR)pdclibhdfs/src/makefile.twb
+ifeq ($(IDEBUILD_PROJID),TstOpsHdfs)
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectIde/TstOpsHdfs.tdsubmk
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectLocal/TstOpsHdfs.tdsubmk
+ifeq ($(LAST_INCLUDED_PROJID),TstOpsHdfs)
+ifeq ($(IDEBUILD_PROJID),TstReadHdfs)
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectIde/TstReadHdfs.tdsubmk
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectLocal/TstReadHdfs.tdsubmk
+ifeq ($(LAST_INCLUDED_PROJID),TstReadHdfs)
+ifeq ($(IDEBUILD_PROJID),TstWriteHdfs)
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectIde/TstWriteHdfs.tdsubmk
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectLocal/TstWriteHdfs.tdsubmk
+ifeq ($(LAST_INCLUDED_PROJID),TstWriteHdfs)
+ifeq ($(IDEBUILD_PROJID),pdclibhdfs)
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectIde/pdclibhdfs.tdsubmk
+sinclude $(CNFP_TOPDIR)config/build.twb/$(HATAVARDIR)/ProjectLocal/pdclibhdfs.tdsubmk
+ifeq ($(LAST_INCLUDED_PROJID),pdclibhdfs)
+$(CNFP_pdclibhdfs_GLOBALEMITDONES) $(CNFP_pdclibhdfs_EXES) $(CNFP_pdclibhdfs_LIBS) $(CNFP_pdclibhdfs_OBJS): $(CNFP_pdclibhdfs_DRVSRC) $(CNFP_PUBLICDRVSRC)
+.PHONY: preppkg all globalemitdones exes libs objs $(LOCAL_PHONY_FILETARGS) drvsrc publicdrvsrc drvdir clean clobber
+all: globalemitdones exes libs objs drvsrc
+ $(MKDIR) -p $@
+ifneq ($(CNFP_SNAPSHOT_CONTEXT),ThisShouldBeNull)
+preppkg all globalemitdones exes libs objs $(LOCAL_PHONY_FILETARGS) drvsrc publicdrvsrc clean clobber : $(CNFP_OUTDIRLIST)
+preppkg all globalemitdones exes libs objs drvsrc publicdrvsrc drvdir clean clobber : $(GLOBAL_BLDENV_FILES) $(LOCAL_LEAFMKFILE) $(LOCAL_GLBLPRJSNIPPETS)
diff --git a/import/pdclibhdfs/src/makefile.twb.options b/import/pdclibhdfs/src/makefile.twb.options
new file mode 100755
index 0000000..ba5a1ca
--- /dev/null
+++ b/import/pdclibhdfs/src/makefile.twb.options
@@ -0,0 +1,5 @@
+sinclude makefile.twb.options_myown
diff --git a/import/pdclibhdfs/src/native_mini_dfs.c b/import/pdclibhdfs/src/native_mini_dfs.c
new file mode 100755
index 0000000..7249ce9
--- /dev/null
+++ b/import/pdclibhdfs/src/native_mini_dfs.c
@@ -0,0 +1,274 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "exception.h"
+#include "jni_helper.h"
+#include "native_mini_dfs.h"
+#include <errno.h>
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define MINIDFS_CLUSTER_BUILDER "org/apache/hadoop/hdfs/MiniDFSCluster$Builder"
+#define MINIDFS_CLUSTER "org/apache/hadoop/hdfs/MiniDFSCluster"
+#define HADOOP_CONF "org/apache/hadoop/conf/Configuration"
+#define HADOOP_NAMENODE "org/apache/hadoop/hdfs/server/namenode/NameNode"
+#define JAVA_INETSOCKETADDRESS "java/net/InetSocketAddress"
+#define DFS_WEBHDFS_ENABLED_KEY "dfs.webhdfs.enabled"
+struct NativeMiniDfsCluster {
+ /**
+ * JNI global reference to the Java MiniDFSCluster instance returned by
+ * MiniDFSCluster.Builder#build (stored by nmdCreate, deleted by nmdFree).
+ */
+ jobject obj;
+/**
+ * Create a MiniDFSCluster through JNI and wrap it in a native handle.
+ *
+ * Steps: construct a Hadoop Configuration; when conf->webhdfsEnabled is set,
+ * store it under dfs.webhdfs.enabled; construct a MiniDFSCluster.Builder from
+ * that Configuration; apply Builder#format(conf->doFormat) and, for WebHDFS,
+ * Builder#nameNodeHttpPort(conf->namenodeHttpPort); call Builder#build and
+ * keep a JNI global reference to the resulting cluster.
+ *
+ * @param conf  Cluster options; dereferenced unconditionally, so it must not
+ *              be NULL.
+ * @return      A calloc'ed handle, or NULL on failure after a diagnostic has
+ *              been written to stderr.
+ */
+struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf)
+{
+    struct NativeMiniDfsCluster* cl = NULL;
+    jobject bld = NULL, cobj = NULL, cluster = NULL;
+    jvalue val;
+    JNIEnv *env = getJNIEnv();
+    jthrowable jthr;
+    jstring jconfStr = NULL;
+
+    if (!env) {
+        fprintf(stderr, "nmdCreate: unable to construct JNIEnv.\n");
+        return NULL;
+    }
+    cl = calloc(1, sizeof(struct NativeMiniDfsCluster));
+    if (!cl) {
+        /* Terminated with a newline like every other diagnostic in this file. */
+        fprintf(stderr, "nmdCreate: OOM\n");
+        goto error;
+    }
+    jthr = constructNewObjectOfClass(env, &cobj, HADOOP_CONF, "()V");
+    if (jthr) {
+        printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "nmdCreate: new Configuration");
+        goto error;
+    }
+    if (conf->webhdfsEnabled) {
+        /* Configuration#setBoolean(DFS_WEBHDFS_ENABLED_KEY, webhdfsEnabled) */
+        jthr = newJavaStr(env, DFS_WEBHDFS_ENABLED_KEY, &jconfStr);
+        if (jthr) {
+            printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                "nmdCreate: new String");
+            goto error;
+        }
+        jthr = invokeMethod(env, NULL, INSTANCE, cobj, HADOOP_CONF,
+                            "setBoolean", "(Ljava/lang/String;Z)V",
+                            jconfStr, conf->webhdfsEnabled);
+        if (jthr) {
+            printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                "nmdCreate: Configuration::setBoolean");
+            goto error;
+        }
+    }
+    jthr = constructNewObjectOfClass(env, &bld, MINIDFS_CLUSTER_BUILDER,
+                    "(L"HADOOP_CONF";)V", cobj);
+    if (jthr) {
+        printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "nmdCreate: NativeMiniDfsCluster#Builder#Builder");
+        goto error;
+    }
+    /* Each Builder setter returns an object reference (see the method
+     * signatures); that local reference is not needed and is dropped. */
+    jthr = invokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER,
+            "format", "(Z)L" MINIDFS_CLUSTER_BUILDER ";", conf->doFormat);
+    if (jthr) {
+        printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "nmdCreate: "
+                              "Builder::format");
+        goto error;
+    }
+    (*env)->DeleteLocalRef(env, val.l);
+    if (conf->webhdfsEnabled) {
+        jthr = invokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER,
+                        "nameNodeHttpPort", "(I)L" MINIDFS_CLUSTER_BUILDER ";",
+                        conf->namenodeHttpPort);
+        if (jthr) {
+            printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "nmdCreate: "
+                                  "Builder::nameNodeHttpPort");
+            goto error;
+        }
+        (*env)->DeleteLocalRef(env, val.l);
+    }
+    jthr = invokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER,
+            "build", "()L" MINIDFS_CLUSTER ";");
+    if (jthr) {
+        printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                              "nmdCreate: Builder#build");
+        goto error;
+    }
+    /* Promote the cluster to a global reference so the handle can outlive
+     * the local references created in this call. */
+    cluster = val.l;
+    cl->obj = (*env)->NewGlobalRef(env, val.l);
+    if (!cl->obj) {
+        printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+            "nmdCreate: NewGlobalRef");
+        goto error;
+    }
+    (*env)->DeleteLocalRef(env, cluster);
+    (*env)->DeleteLocalRef(env, bld);
+    (*env)->DeleteLocalRef(env, cobj);
+    (*env)->DeleteLocalRef(env, jconfStr);
+    return cl;
+
+error:
+    /* Shared failure path: drop the local references and free the handle. */
+    (*env)->DeleteLocalRef(env, cluster);
+    (*env)->DeleteLocalRef(env, bld);
+    (*env)->DeleteLocalRef(env, cobj);
+    (*env)->DeleteLocalRef(env, jconfStr);
+    free(cl);
+    return NULL;
+}
+/**
+ * Release a handle created by nmdCreate.
+ *
+ * Deletes the JNI global reference held in cl->obj and frees the handle. If
+ * no JNIEnv can be obtained, the handle memory is still freed but the global
+ * reference cannot be deleted. A NULL handle is ignored.
+ *
+ * @param cl  Handle to release; may be NULL.
+ */
+void nmdFree(struct NativeMiniDfsCluster* cl)
+{
+    JNIEnv *env;
+
+    if (!cl) {
+        /* Nothing to release; avoids dereferencing cl->obj below. */
+        return;
+    }
+    env = getJNIEnv();
+    if (!env) {
+        fprintf(stderr, "nmdFree: getJNIEnv failed\n");
+        free(cl);
+        return;
+    }
+    (*env)->DeleteGlobalRef(env, cl->obj);
+    free(cl);
+}
+int nmdShutdown(struct NativeMiniDfsCluster* cl) /* Invokes MiniDFSCluster#shutdown on the wrapped cluster; returns 0 on success, -EIO on any failure. */
+ JNIEnv *env = getJNIEnv();
+ jthrowable jthr;
+ if (!env) {
+ fprintf(stderr, "nmdShutdown: getJNIEnv failed\n");
+ return -EIO;
+ }
+ jthr = invokeMethod(env, NULL, INSTANCE, cl->obj, /* NULL result slot: the Java method is declared "()V" */
+ MINIDFS_CLUSTER, "shutdown", "()V");
+ if (jthr) {
+ printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "nmdShutdown: MiniDFSCluster#shutdown");
+ return -EIO; /* a Java exception is reported to the caller as -EIO */
+ }
+ return 0;
+int nmdWaitClusterUp(struct NativeMiniDfsCluster *cl) /* Invokes MiniDFSCluster#waitClusterUp on the wrapped cluster; returns 0 on success, -EIO on any failure. */
+ jthrowable jthr;
+ JNIEnv *env = getJNIEnv();
+ if (!env) {
+ fprintf(stderr, "nmdWaitClusterUp: getJNIEnv failed\n");
+ return -EIO;
+ }
+ jthr = invokeMethod(env, NULL, INSTANCE, cl->obj, /* NULL result slot: the Java method is declared "()V" */
+ MINIDFS_CLUSTER, "waitClusterUp", "()V");
+ if (jthr) {
+ printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+ "nmdWaitClusterUp: MiniDFSCluster#waitClusterUp ");
+ return -EIO; /* a Java exception is reported to the caller as -EIO */
+ }
+ return 0;
+/**
+ * Return the port reported by MiniDFSCluster#getNameNodePort.
+ *
+ * @param cl  Handle returned by nmdCreate.
+ * @return    The port number on success, -EIO on failure (after a
+ *            diagnostic has been printed).
+ */
+int nmdGetNameNodePort(const struct NativeMiniDfsCluster *cl)
+{
+    JNIEnv *env = getJNIEnv();
+    jvalue jVal;
+    jthrowable jthr;
+
+    if (!env) {
+        /* Diagnostics carry this function's own name, as elsewhere in the file. */
+        fprintf(stderr, "nmdGetNameNodePort: getJNIEnv failed\n");
+        return -EIO;
+    }
+    // Note: this will have to be updated when HA nativeMiniDfs clusters are
+    // supported
+    jthr = invokeMethod(env, &jVal, INSTANCE, cl->obj,
+            MINIDFS_CLUSTER, "getNameNodePort", "()I");
+    if (jthr) {
+        printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+            "nmdGetNameNodePort: MiniDFSCluster#getNameNodePort");
+        return -EIO;
+    }
+    return jVal.i;
+}
+/**
+ * Look up the HTTP address of the cluster's (first) NameNode.
+ *
+ * Calls MiniDFSCluster#getNameNode, NameNode#getHttpAddress and then
+ * InetSocketAddress#getPort / #getHostName.
+ *
+ * @param cl        Handle returned by nmdCreate.
+ * @param port      (out) Port of the NameNode HTTP address.
+ * @param hostName  (out) Heap-allocated copy of the host name; the caller
+ *                  owns it and must free() it. Only valid when 0 is returned.
+ * @return          0 on success; otherwise -EIO, -ENOMEM, or the code
+ *                  returned by printExceptionAndFree.
+ */
+int nmdGetNameNodeHttpAddress(const struct NativeMiniDfsCluster *cl,
+                               int *port, const char **hostName)
+{
+    JNIEnv *env = getJNIEnv();
+    jvalue jVal;
+    jobject jNameNode, jAddress;
+    jstring jHost;
+    jthrowable jthr;
+    int ret = 0;
+    const char *host;
+
+    if (!env) {
+        fprintf(stderr, "nmdGetNameNodeHttpAddress: getJNIEnv failed\n");
+        return -EIO;
+    }
+    // First get the (first) NameNode of the cluster
+    jthr = invokeMethod(env, &jVal, INSTANCE, cl->obj, MINIDFS_CLUSTER,
+                        "getNameNode", "()L" HADOOP_NAMENODE ";");
+    if (jthr) {
+        printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                              "nmdGetNameNodeHttpAddress: "
+                              "MiniDFSCluster#getNameNode");
+        return -EIO;
+    }
+    jNameNode = jVal.l;
+
+    // Then get the http address (InetSocketAddress) of the NameNode
+    jthr = invokeMethod(env, &jVal, INSTANCE, jNameNode, HADOOP_NAMENODE,
+                        "getHttpAddress", "()L" JAVA_INETSOCKETADDRESS ";");
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                                    "nmdGetNameNodeHttpAddress: "
+                                    "NameNode#getHttpAddress");
+        goto error_dlr_nn;
+    }
+    jAddress = jVal.l;
+
+    /* InetSocketAddress#getPort returns an int, read back through jVal.i. */
+    jthr = invokeMethod(env, &jVal, INSTANCE, jAddress,
+                        JAVA_INETSOCKETADDRESS, "getPort", "()I");
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                                    "nmdGetNameNodeHttpAddress: "
+                                    "InetSocketAddress#getPort");
+        goto error_dlr_addr;
+    }
+    *port = jVal.i;
+
+    jthr = invokeMethod(env, &jVal, INSTANCE, jAddress, JAVA_INETSOCKETADDRESS,
+                        "getHostName", "()Ljava/lang/String;");
+    if (jthr) {
+        ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL,
+                                    "nmdGetNameNodeHttpAddress: "
+                                    "InetSocketAddress#getHostName");
+        goto error_dlr_addr;
+    }
+    jHost = (jstring)jVal.l;
+
+    /* GetStringUTFChars returns NULL when the JVM cannot provide the
+     * characters; do not hand that NULL to _strdup. */
+    host = (*env)->GetStringUTFChars(env, jHost, NULL);
+    if (!host) {
+        printPendingExceptionAndFree(env, PRINT_EXC_ALL,
+            "nmdGetNameNodeHttpAddress: GetStringUTFChars");
+        ret = -EIO;
+        goto error_dlr_host;
+    }
+    *hostName = _strdup(host);
+    (*env)->ReleaseStringUTFChars(env, jHost, host);
+    if (!*hostName) {
+        fprintf(stderr, "nmdGetNameNodeHttpAddress: OOM copying host name\n");
+        ret = -ENOMEM;
+    }
+
+    /* Cleanup ladder: each label releases one more local reference. */
+error_dlr_host:
+    (*env)->DeleteLocalRef(env, jHost);
+error_dlr_addr:
+    (*env)->DeleteLocalRef(env, jAddress);
+error_dlr_nn:
+    (*env)->DeleteLocalRef(env, jNameNode);
+    return ret;
+}
diff --git a/import/pdclibhdfs/src/pdclibhdfs.tdprj.xml b/import/pdclibhdfs/src/pdclibhdfs.tdprj.xml
new file mode 100755
index 0000000..6b0534f
--- /dev/null
+++ b/import/pdclibhdfs/src/pdclibhdfs.tdprj.xml
@@ -0,0 +1,196 @@
+<?xml version="1.0" encoding="US-ASCII" ?>
+* *
+* TITLE: pdclibhdfs.tdprj.xml *
+* *
+* Copyright 2005-2006, 2008-2010 by Teradata Corporation. *
+* *
+* *
+* *
+* Purpose: To create makefiles used by SCM build process. *
+* *
+* Description: An XML file. *
+* *
+* Revision Date DR DID Comments *
+* =========== ======== ========= ======== ================================== *
+* 11062013 TPT17170 SJB Initial version *
+* *
+<Project Name="dclibhdfs"
+ ProductGroupName="Msg_Build"
+ >
+ <Package Package="No"/>
+ <Set SetName="Defines" Name="TWBResource">
+ BUILDPRODUCTNAME="\"Teradata Parallel Transporter\""
+ BUILDPROJECT="\"Teradata PT Hdfs Library\""
+ </Set>
+ <ToolClassConfiguration ToolClass="CCompiler">
+ <HaTaVar OsType="Unix">
+ <OptionValues
+ PositionIndependentCode="Yes"
+ />
+ </HaTaVar>
+ <HaTaVar OsType="Windows">
+ <Set SetName="Defines" Order="10">
+ </Set>
+ <OptionValues
+ RequiredLibraryType="MultiThreadedShared"
+ />
+ </HaTaVar>
+ <HaTaVar Ta="aix-power.32">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/aix-power.32/include
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="aix-power">
+ <Set SetName="IncludePath" Order="10" Name="64inc">
+ $(CNFP_TOPDIR)../java/aix-power/include
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="hpux-pa.32,hpux-pa">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/hpux-pa/include
+ $(CNFP_TOPDIR)../java/hpux-pa/include/hp-ux
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="hpux-pa">
+ <Set SetName="IncludePath" Order="10" Name="64inc">
+ $(CNFP_TOPDIR)../java/hpux-pa/include
+ $(CNFP_TOPDIR)../java/hpux-pa/include/hp-ux
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="hpux-ia64.32">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/hpux-ia64/include
+ $(CNFP_TOPDIR)../java/hpux-ia64/include/hp-ux
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="hpux-ia64">
+ <Set SetName="IncludePath" Order="10" Name="64inc">
+ $(CNFP_TOPDIR)../java/hpux-ia64/include
+ $(CNFP_TOPDIR)../java/hpux-ia64/include/hp-ux
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="redhatlinux-i386,redhatlinux-ia64,suselinux-i386">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/redhatlinux-i386/include
+ $(CNFP_TOPDIR)../java/redhatlinux-i386/include/linux
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="redhatlinux-x8664,suselinux-x8664">
+ <Set SetName="IncludePath" Order="10" Name="64inc">
+ $(CNFP_TOPDIR)../java/redhatlinux-x8664/include
+ $(CNFP_TOPDIR)../java/redhatlinux-x8664/include/linux
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="macos-x8664,macos-x64,macos-i386">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/macos-x8664/include
+ $(CNFP_TOPDIR)../java/macos-x8664/include/linux
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="suselinux-390.32">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/suselinux-390.32/include
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="suselinux-390">
+ <Set SetName="IncludePath" Order="10" Name="64inc">
+ $(CNFP_TOPDIR)../java/suselinux-390/include
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="solaris-sparc.32">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/solaris-sparc/include
+ $(CNFP_TOPDIR)../java/solaris-sparc/include/include
+ $(CNFP_TOPDIR)../java/solaris-sparc/include/solaris
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="solaris-sparc">
+ <Set SetName="IncludePath" Order="10" Name="64inc">
+ $(CNFP_TOPDIR)../java/solaris-sparc/include
+ $(CNFP_TOPDIR)../java/solaris-sparc/include/include
+ $(CNFP_TOPDIR)../java/solaris-sparc/include/solaris
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="solaris-i386">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/solaris-i386/include
+ $(CNFP_TOPDIR)../java/solaris-i386/include/solaris
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="solaris-x8664">
+ <Set SetName="IncludePath" Order="10" Name="64inc">
+ $(CNFP_TOPDIR)../java/solaris-x8664/include
+ $(CNFP_TOPDIR)../java/solaris-x8664/include/solaris
+ </Set>
+ </HaTaVar>
+ <HaTaVar OsType="Windows">
+ <Set SetName="IncludePath" Order="10">
+ $(CNFP_TOPDIR)../java/$(TARGET_ARCH)/include
+ $(CNFP_TOPDIR)../java/$(TARGET_ARCH)/include/win32
+ </Set>
+ </HaTaVar>
+ </ToolClassConfiguration>
+ <ToolClassConfiguration ToolClass="MicrosoftResourceCompiler">
+ <Set SetName="IncludePath" Order="50">
+ $(CNFP_TOPDIR)../java/$(TARGET_ARCH)/include
+ $(CNFP_TOPDIR)../java/$(TARGET_ARCH)/include/win32
+ </Set>
+ </ToolClassConfiguration>
+ <ToolClassConfiguration ToolClass="Linker">
+ <HaTaVar OsType="Windows">
+ <Set SetName="LinkLibraries" Order="20">
+ oldnames
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="hpux-pa,hpux-ia64">
+ <Set SetName="LinkLibraries" Order="20">
+ dld
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="redhatlinux-i386,suselinux-390,suselinux-390.32,suselinux-i386,suselinux-x8664">
+ <Set SetName="LinkLibraries" Order="20">
+ dl
+ pthread
+ </Set>
+ </HaTaVar>
+ <HaTaVar Ta="solaris-sparc,solaris-x8664">
+ <Set SetName="LinkLibraries" Order="20">
+ dl
+ </Set>
+ </HaTaVar>
+ </ToolClassConfiguration>
+ <FileDefinitions>
+ <File Path="hdfs.c"/>
+ <File Path="exception.c"/>
+ <File Path="jni_helper.c"/>
+ <HaTaVar OsType="Windows">
+ <File Path="$(CNFP_TOPDIR)pcommon/src/version.opprc"/>
+ </HaTaVar>
+ </FileDefinitions>
diff --git a/import/pdclibhdfs/src/test_libhdfs_ops.c b/import/pdclibhdfs/src/test_libhdfs_ops.c
new file mode 100755
index 0000000..ebb7984
--- /dev/null
+++ b/import/pdclibhdfs/src/test_libhdfs_ops.c
@@ -0,0 +1,582 @@
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "hdfs.h"
+#include "hdfs_test.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+/**
+ * Render the low nine permission bits as a nine-character "rwxrwxrwx"
+ * style string.
+ *
+ * @param permissions  Permission bits; only bits 0-8 are examined.
+ * @param rtr          Output buffer of at least 10 bytes; receives nine
+ *                     characters followed by a terminating NUL.
+ */
+void permission_disp(short permissions, char *rtr) {
+    /* Text for each possible 3-bit group, indexed by the group's value. */
+    static const char *const triads[8] = {
+        "---", "--x", "-w-", "-wx", "r--", "r-x", "rw-", "rwx"
+    };
+    int shift;
+
+    rtr[9] = '\0';
+    /* Highest group first: bits 8-6, then 5-3, then 2-0. */
+    for (shift = 6; shift >= 0; shift -= 3) {
+        strncpy(rtr, triads[(permissions >> shift) & 7], 3);
+        rtr += 3;
+    }
+}
+int main(int argc, char **argv) { /* libhdfs operations test driver; this first part sets up shared state and the two filesystem connections */
+ char buffer[32]; /* scratch buffer shared by the read tests */
+ int num_written_bytes;
+ const char* writePath = "/tmp/testfile.txt"; /* path used on the "default" filesystem */
+ #ifdef WIN32
+ const char* writePathLcl = "C:\\tmp\\testfile.txt"; /* path used on the local filesystem */
+ #else
+ const char* writePathLcl = "/tmp/testfile.txt"; /* path used on the local filesystem */
+ #endif
+ const char* fileContents = "Hello, World!";
+ hdfsFS fs; /* connection opened with host "default" */
+ hdfsFS lfs; /* connection opened with a NULL host; the messages below call it 'local' */
+ int totalResult = 0; /* failure counter; non-zero at the end makes main return -1 */
+ int result = 0;
+ /*fs = hdfsConnectNewInstance("default", 0);*/
+ fs = hdfsConnect("default", 0);
+ if(!fs) {
+ fprintf(stderr, "Oops! Failed to connect to hdfs!\n");
+ exit(-1);
+ }
+ /*lfs = hdfsConnectNewInstance(NULL, 0);*/
+ lfs = hdfsConnect(NULL, 0);
+ if(!lfs) {
+ fprintf(stderr, "Oops! Failed to connect to 'local' hdfs!\n");
+ exit(-1);
+ }
+ {
+ /* Write tests: create writePath on the "default" filesystem, write
+  * fileContents including its terminating NUL, then check the reported
+  * position and flush. Any failure terminates the program with exit(-1). */
+ int currentPos = -1;
+ hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0);
+ if(!writeFile) {
+ fprintf(stderr, "Failed to open %s for writing!\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Opened %s for writing successfully...\n", writePath);
+ num_written_bytes =
+ hdfsWrite(fs, writeFile, (void*)fileContents, strlen(fileContents)+1);
+ if (num_written_bytes != strlen(fileContents) + 1) {
+ fprintf(stderr, "Failed to write correct number of bytes - expected %d, got %d\n",
+ (int)(strlen(fileContents) + 1), (int)num_written_bytes);
+ exit(-1);
+ }
+ fprintf(stderr, "Wrote %d bytes\n", num_written_bytes);
+ /* currentPos is an int; cast it so the argument matches the %ld conversion */
+ if ((currentPos = hdfsTell(fs, writeFile)) == -1) {
+ fprintf(stderr,
+ "Failed to get current file position correctly! Got %ld!\n",
+ (long)currentPos);
+ exit(-1);
+ }
+ fprintf(stderr, "Current position: %ld\n", (long)currentPos);
+ if (hdfsFlush(fs, writeFile)) {
+ fprintf(stderr, "Failed to 'flush' %s\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Flushed %s successfully!\n", writePath);
+ /*if (hdfsHFlush(fs, writeFile)) {
+ fprintf(stderr, "Failed to 'hflush' %s\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "HFlushed %s successfully!\n", writePath); */
+ hdfsCloseFile(fs, writeFile);
+ }
+ {
+ /* Read tests: re-open the file written above and exercise hdfsExists,
+  * hdfsSeek/hdfsTell, hdfsRead (direct and regular paths) and hdfsPread,
+  * then run a write/re-open cycle on the local filesystem. */
+ const char* readPath = "/tmp/testfile.txt";
+ int exists = hdfsExists(fs, readPath);
+ hdfsFile readFile;
+ int seekPos = 1;
+ int currentPos = -1;
+ int num_read_bytes;
+ hdfsFile localFile;
+ /* a non-zero result from hdfsExists is treated as "path not found" */
+ if (exists) {
+ fprintf(stderr, "Failed to validate existence of %s\n", readPath);
+ exit(-1);
+ }
+ readFile = hdfsOpenFile(fs, readPath, O_RDONLY, 0, 0, 0);
+ if (!readFile) {
+ fprintf(stderr, "Failed to open %s for reading!\n", readPath);
+ exit(-1);
+ }
+ if (!hdfsFileIsOpenForRead(readFile)) {
+ fprintf(stderr, "hdfsFileIsOpenForRead: we just opened a file "
+ "with O_RDONLY, and it did not show up as 'open for "
+ "read'\n");
+ exit(-1);
+ }
+ fprintf(stderr, "hdfsAvailable: %d\n", hdfsAvailable(fs, readFile));
+ if(hdfsSeek(fs, readFile, seekPos)) {
+ fprintf(stderr, "Failed to seek %s for reading!\n", readPath);
+ exit(-1);
+ }
+ /* currentPos is an int; cast it so the argument matches the %ld conversion */
+ if((currentPos = hdfsTell(fs, readFile)) != seekPos) {
+ fprintf(stderr,
+ "Failed to get current file position correctly! Got %ld!\n",
+ (long)currentPos);
+ exit(-1);
+ }
+ fprintf(stderr, "Current position: %ld\n", (long)currentPos);
+ if (!hdfsFileUsesDirectRead(readFile)) {
+ fprintf(stderr, "Direct read support not detected "
+ "for HDFS filesystem\n");
+ } else {
+ fprintf(stderr, "Direct read support detected for HDFS\n");
+ /* Test the direct read path */
+ if(hdfsSeek(fs, readFile, 0)) {
+ fprintf(stderr, "Failed to seek %s for reading!\n", readPath);
+ exit(-1);
+ }
+ memset(buffer, 0, sizeof(buffer));
+ num_read_bytes = hdfsRead(fs, readFile, (void*)buffer,
+ sizeof(buffer));
+ if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) {
+ fprintf(stderr, "Failed to read (direct). Expected %s but got %s (%d bytes)\n",
+ fileContents, buffer, num_read_bytes);
+ exit(-1);
+ }
+ fprintf(stderr, "Read (direct) following %d bytes:\n%s\n",
+ num_read_bytes, buffer);
+ if (hdfsSeek(fs, readFile, 0L)) {
+ fprintf(stderr, "Failed to seek to file start!\n");
+ exit(-1);
+ }
+ /* Disable the direct read path so that we really go through the slow
+ read path */
+ hdfsFileDisableDirectRead(readFile);
+ }
+ /* Clear the whole buffer before each read so the %s output below is
+  * NUL-terminated and shows only what that read delivered. */
+ memset(buffer, 0, sizeof(buffer));
+ num_read_bytes = hdfsRead(fs, readFile, (void*)buffer,
+ sizeof(buffer));
+ fprintf(stderr, "Read following %d bytes:\n%s\n",
+ num_read_bytes, buffer);
+ memset(buffer, 0, sizeof(buffer));
+ num_read_bytes = hdfsPread(fs, readFile, 0, (void*)buffer,
+ sizeof(buffer));
+ fprintf(stderr, "Read following %d bytes:\n%s\n",
+ num_read_bytes, buffer);
+ hdfsCloseFile(fs, readFile);
+ fprintf(stderr,"Test Local File System %s\n", writePathLcl );
+ /* Test correct behaviour for unsupported filesystems */
+ localFile = hdfsOpenFile(lfs, writePathLcl, O_WRONLY|O_CREAT, 0, 0, 0);
+ if(!localFile) {
+ fprintf(stderr, "Failed to open %s for writing!\n", writePathLcl);
+ exit(-1);
+ }
+ num_written_bytes = hdfsWrite(lfs, localFile, (void*)fileContents,
+ strlen(fileContents) + 1);
+ hdfsCloseFile(lfs, localFile);
+ localFile = hdfsOpenFile(lfs, writePathLcl, O_RDONLY, 0, 0, 0);
+ /* the re-opened handle is used right away, so check it first */
+ if(!localFile) {
+ fprintf(stderr, "Failed to open %s for reading!\n", writePathLcl);
+ exit(-1);
+ }
+ /* this branch runs when direct read IS reported for the local file */
+ if (hdfsFileUsesDirectRead(localFile)) {
+ fprintf(stderr, "Direct read support incorrectly detected for local "
+ "filesystem\n");
+ }
+ hdfsCloseFile(lfs, localFile);
+ }
+ {
+ /* Generic file-system operations: copy/move/rename, directory and
+  * replication calls, metadata queries, chown/chmod/utime round trips and
+  * clean-up. Each failed step adds to totalResult instead of aborting. */
+ const char* srcPath = "/tmp/testfile.txt";
+ const char* dstPath = "/tmp/testfile2.txt";
+ const char* slashTmp = "/tmp";
+ const char* newDirectory = "/tmp/newdir";
+ char buffer[256];
+ const char *resp;
+ hdfsFileInfo *fileInfo = NULL;
+ hdfsFileInfo *fileList = 0;
+ int numEntries = 0;
+ char*** hosts;
+ char *newOwner = "root";
+ /* mode applied to writePath by the hdfsChmod test below and compared
+  * against the value read back; /tmp/ itself is opened up to 0777 further down */
+ short newPerm = 0666;
+ tTime newMtime = time(NULL);
+ tTime newAtime = time(NULL);
+ hdfsFileInfo *finfo;
+ fprintf(stderr, "hdfsCopy(remote-local): %s\n", ((result = hdfsCopy(fs, srcPath, lfs, srcPath)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsCopy(remote-remote): %s\n", ((result = hdfsCopy(fs, srcPath, fs, dstPath)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsMove(local-local): %s\n", ((result = hdfsMove(lfs, srcPath, lfs, dstPath)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsMove(remote-local): %s\n", ((result = hdfsMove(fs, srcPath, lfs, srcPath)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsRename: %s\n", ((result = hdfsRename(fs, dstPath, srcPath)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsCopy(remote-remote): %s\n", ((result = hdfsCopy(fs, srcPath, fs, dstPath)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsCreateDirectory: %s\n", ((result = hdfsCreateDirectory(fs, newDirectory)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsSetReplication: %s\n", ((result = hdfsSetReplication(fs, srcPath, 2)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsGetWorkingDirectory: %s\n", ((resp = hdfsGetWorkingDirectory(fs, buffer, sizeof(buffer))) ? buffer : "Failed!"));
+ totalResult += (resp ? 0 : 1);
+ fprintf(stderr, "hdfsSetWorkingDirectory: %s\n", ((result = hdfsSetWorkingDirectory(fs, slashTmp)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsGetWorkingDirectory: %s\n", ((resp = hdfsGetWorkingDirectory(fs, buffer, sizeof(buffer))) ? buffer : "Failed!"));
+ totalResult += (resp ? 0 : 1);
+ /* Offsets, sizes and times are cast to long long and printed with %lld so
+  * the conversion matches its argument whatever width the libhdfs typedefs
+  * have on the target platform. */
+ fprintf(stderr, "hdfsGetDefaultBlockSize: %lld\n", (long long)hdfsGetDefaultBlockSize(fs));
+ fprintf(stderr, "hdfsGetCapacity: %lld\n", (long long)hdfsGetCapacity(fs));
+ fprintf(stderr, "hdfsGetUsed: %lld\n", (long long)hdfsGetUsed(fs));
+ if((fileInfo = hdfsGetPathInfo(fs, slashTmp)) != NULL) {
+ char permissions[10];
+ fprintf(stderr, "hdfsGetPathInfo - SUCCESS!\n");
+ fprintf(stderr, "Name: %s, ", fileInfo->mName);
+ fprintf(stderr, "Type: %c, ", (char)(fileInfo->mKind));
+ fprintf(stderr, "Replication: %d, ", fileInfo->mReplication);
+ fprintf(stderr, "BlockSize: %lld, ", (long long)fileInfo->mBlockSize);
+ fprintf(stderr, "Size: %lld, ", (long long)fileInfo->mSize);
+ fprintf(stderr, "LastMod: %s", ctime(&fileInfo->mLastMod));
+ fprintf(stderr, "Owner: %s, ", fileInfo->mOwner);
+ fprintf(stderr, "Group: %s, ", fileInfo->mGroup);
+ permission_disp(fileInfo->mPermissions, permissions);
+ fprintf(stderr, "Permissions: %d (%s)\n", fileInfo->mPermissions, permissions);
+ hdfsFreeFileInfo(fileInfo, 1);
+ } else {
+ totalResult++;
+ fprintf(stderr, "waah! hdfsGetPathInfo for %s - FAILED!\n", slashTmp);
+ }
+ if((fileList = hdfsListDirectory(fs, slashTmp, &numEntries)) != NULL) {
+ int i = 0;
+ char permissions[10];
+ for(i=0; i < numEntries; ++i) {
+ fprintf(stderr, "Name: %s, ", fileList[i].mName);
+ fprintf(stderr, "Type: %c, ", (char)fileList[i].mKind);
+ fprintf(stderr, "Replication: %d, ", fileList[i].mReplication);
+ fprintf(stderr, "BlockSize: %lld, ", (long long)fileList[i].mBlockSize);
+ fprintf(stderr, "Size: %lld, ", (long long)fileList[i].mSize);
+ fprintf(stderr, "LastMod: %s", ctime(&fileList[i].mLastMod));
+ fprintf(stderr, "Owner: %s, ", fileList[i].mOwner);
+ fprintf(stderr, "Group: %s, ", fileList[i].mGroup);
+ permission_disp(fileList[i].mPermissions, permissions);
+ fprintf(stderr, "Permissions: %d (%s)\n", fileList[i].mPermissions, permissions);
+ }
+ hdfsFreeFileInfo(fileList, numEntries);
+ } else {
+ /* a NULL list with errno unset is reported as an empty directory */
+ if (errno) {
+ totalResult++;
+ fprintf(stderr, "waah! hdfsListDirectory - FAILED!\n");
+ } else {
+ fprintf(stderr, "Empty directory!\n");
+ }
+ }
+ /* NOTE(review): hosts is never released in this block; libhdfs presumably
+  * provides a matching free routine - confirm and call it. */
+ hosts = hdfsGetHosts(fs, srcPath, 0, 1);
+ if(hosts) {
+ int i=0;
+ fprintf(stderr, "hdfsGetHosts - SUCCESS! ... \n");
+ while(hosts[i]) {
+ int j = 0;
+ while(hosts[i][j]) {
+ fprintf(stderr,
+ "\thosts[%d][%d] - %s\n", i, j, hosts[i][j]);
+ ++j;
+ }
+ ++i;
+ }
+ } else {
+ totalResult++;
+ fprintf(stderr, "waah! hdfsGetHosts - FAILED!\n");
+ }
+ /* chown write */
+ fprintf(stderr, "hdfsChown: %s\n", ((result = hdfsChown(fs, writePath, NULL, "users")) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsChown: %s\n", ((result = hdfsChown(fs, writePath, newOwner, NULL)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ /* chmod write */
+ fprintf(stderr, "hdfsChmod: %s\n", ((result = hdfsChmod(fs, writePath, newPerm)) ? "Failed!" : "Success!"));
+ totalResult += result;
+#ifndef WIN32
+ sleep(2);
+#endif
+ /* utime write */
+ fprintf(stderr, "hdfsUtime: %s\n", ((result = hdfsUtime(fs, writePath, newMtime, newAtime)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ /* chown/chmod/utime read */
+ finfo = hdfsGetPathInfo(fs, writePath);
+ if (!finfo) {
+ /* without file info none of the read-back checks can run */
+ totalResult++;
+ fprintf(stderr, "waah! hdfsGetPathInfo for %s - FAILED!\n", writePath);
+ } else {
+ fprintf(stderr, "hdfsChown read: %s\n", ((result = (strcmp(finfo->mOwner, newOwner) != 0)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsChmod read: %s\n", ((result = (finfo->mPermissions != newPerm)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ }
+ /* will later use /tmp/ as a different user so enable it */
+ fprintf(stderr, "hdfsChmod: %s\n", ((result = hdfsChmod(fs, "/tmp/", 0777)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ if (finfo) {
+ fprintf(stderr,"newMTime=%lld\n",(long long)newMtime);
+ fprintf(stderr,"curMTime=%lld\n",(long long)finfo->mLastMod);
+ fprintf(stderr, "hdfsUtime read (mtime): %s\n", ((result = (finfo->mLastMod != newMtime)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ /* No easy way to turn on access times from hdfs_test right now
+ // fprintf(stderr, "hdfsUtime read (atime): %s\n", ((result = (finfo->mLastAccess != newAtime)) ? "Failed!" : "Success!"));
+ // totalResult += result; */
+ hdfsFreeFileInfo(finfo, 1);
+ }
+ /* Clean up */
+ fprintf(stderr, "hdfsDelete: %s\n", ((result = hdfsDelete(fs, newDirectory, 1)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsDelete: %s\n", ((result = hdfsDelete(fs, srcPath, 1)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsDelete: %s\n", ((result = hdfsDelete(lfs, srcPath, 1)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ fprintf(stderr, "hdfsDelete: %s\n", ((result = hdfsDelete(lfs, dstPath, 1)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ /* the directory was just deleted, so a non-zero hdfsExists result is the success case */
+ fprintf(stderr, "hdfsExists: %s\n", ((result = hdfsExists(fs, newDirectory)) ? "Success!" : "Failed!"));
+ totalResult += (result ? 0 : 1);
+ }
+ {
+ /* Append test: write "Hello," to a new file, re-open it with O_APPEND
+  * and add " World" plus a terminating NUL, then verify both the reported
+  * size and the bytes read back. */
+ const char *writePath = "/tmp/appends";
+ char* buffer = "Hello,";
+ int num_written_bytes;
+ hdfsFileInfo *finfo;
+ hdfsFile readFile;
+ char rdbuffer[32];
+ int num_read_bytes;
+ /* CREATE */
+ hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY, 0, 0, 0);
+ if(!writeFile) {
+ fprintf(stderr, "Failed to open %s for writing!\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Opened %s for writing successfully...\n", writePath);
+ num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer));
+ fprintf(stderr, "Wrote %d bytes\n", num_written_bytes);
+ if (hdfsFlush(fs, writeFile)) {
+ fprintf(stderr, "Failed to 'flush' %s\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Flushed %s successfully!\n", writePath);
+ hdfsCloseFile(fs, writeFile);
+ fprintf(stderr,"Open file in append mode:%s\n", writePath );
+ /* RE-OPEN */
+ writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_APPEND, 0, 0, 0);
+ if(!writeFile) {
+ fprintf(stderr, "Failed to open %s for writing!\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Opened %s for writing successfully...\n", writePath);
+ buffer = " World";
+ num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer) + 1);
+ fprintf(stderr, "Wrote %d bytes\n", num_written_bytes);
+ if (hdfsFlush(fs, writeFile)) {
+ fprintf(stderr, "Failed to 'flush' %s\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Flushed %s successfully!\n", writePath);
+ hdfsCloseFile(fs, writeFile);
+ /* CHECK size */
+ finfo = hdfsGetPathInfo(fs, writePath);
+ if (!finfo) {
+ /* count a missing file info as a failure instead of dereferencing NULL */
+ totalResult++;
+ fprintf(stderr, "waah! hdfsGetPathInfo for %s - FAILED!\n", writePath);
+ } else {
+ fprintf(stderr, "fileinfo->mSize: == total %s\n", ((result = (finfo->mSize == strlen("Hello, World") + 1)) ? "Success!" : "Failed!"));
+ totalResult += (result ? 0 : 1);
+ hdfsFreeFileInfo(finfo, 1);
+ }
+ /* READ and check data */
+ readFile = hdfsOpenFile(fs, writePath, O_RDONLY, 0, 0, 0);
+ if (!readFile) {
+ fprintf(stderr, "Failed to open %s for reading!\n", writePath);
+ exit(-1);
+ }
+ /* zero the buffer and leave its last byte untouched so rdbuffer is always
+  * a valid C string for the %s and strcmp uses below */
+ memset(rdbuffer, 0, sizeof(rdbuffer));
+ num_read_bytes = hdfsRead(fs, readFile, (void*)rdbuffer, sizeof(rdbuffer) - 1);
+ fprintf(stderr, "Read following %d bytes:\n%s\n",
+ num_read_bytes, rdbuffer);
+ fprintf(stderr, "read == Hello, World %s\n", (result = (strcmp(rdbuffer, "Hello, World") == 0)) ? "Success!" : "Failed!");
+ /* a content mismatch must influence the exit status like every other check */
+ totalResult += (result ? 0 : 1);
+ hdfsCloseFile(fs, readFile);
+ /* DONE test appends */
+ }
+ totalResult += (hdfsDisconnect(fs) != 0);
+ {
+ /* Now test as connecting as a specific user
+ // This is only meant to test that we connected as that user, not to test
+ // the actual fs user capabilities. Thus just create a file and read
+ // the owner is correct. */
+ const char *tuser = "nobody";
+ char* buffer = "Hello, World!";
+ const char* writePath = "/tmp/usertestfile.txt";
+ hdfsFile writeFile;
+ int num_written_bytes;
+ hdfsFileInfo *finfo;
+ fs = hdfsConnectAsUser("default", 0, tuser);
+ /*fs = hdfsConnectAsUserNewInstance("default", 0, tuser);*/
+ if(!fs) {
+ fprintf(stderr, "Oops! Failed to connect to hdfs as user %s!\n",tuser);
+ exit(-1);
+ }
+ writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0);
+ if(!writeFile) {
+ fprintf(stderr, "Failed to open %s for writing!\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Opened %s for writing successfully...\n", writePath);
+ num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer)+1);
+ fprintf(stderr, "Wrote %d bytes\n", num_written_bytes);
+ if (hdfsFlush(fs, writeFile)) {
+ fprintf(stderr, "Failed to 'flush' %s\n", writePath);
+ exit(-1);
+ }
+ fprintf(stderr, "Flushed %s successfully!\n", writePath);
+ hdfsCloseFile(fs, writeFile);
+ finfo = hdfsGetPathInfo(fs, writePath);
+ if (!finfo) {
+ /* count a missing file info as a failure instead of dereferencing NULL */
+ totalResult++;
+ fprintf(stderr, "waah! hdfsGetPathInfo for %s - FAILED!\n", writePath);
+ } else {
+ fprintf(stderr, "hdfs new file user is correct: %s\n", ((result = (strcmp(finfo->mOwner, tuser) != 0)) ? "Failed!" : "Success!"));
+ totalResult += result;
+ hdfsFreeFileInfo(finfo, 1);
+ }
+ }
+ totalResult += (hdfsDisconnect(fs) != 0);
+ /* exit status: -1 if any recorded check failed, 0 otherwise */
+ if (totalResult != 0) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+ * vim: ts=4: sw=4: et:
+ */
diff --git a/import/pdclibhdfs/src/test_libhdfs_read.c b/import/pdclibhdfs/src/test_libhdfs_read.c
new file mode 100755
index 0000000..8613424
--- /dev/null
+++ b/import/pdclibhdfs/src/test_libhdfs_read.c
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef WIN32
+#include <windows.h>
+#include "hdfs.h"
+#include <stdio.h>
+#include <stdlib.h>
+/**
+ * hdfs_read: read an HDFS file sequentially in fixed-size chunks.
+ *
+ * Usage: hdfs_read <filename> <filesize> <buffersize>
+ *   argv[1]  path of the file to read
+ *   argv[2]  file size; accepted on the command line but not used here
+ *   argv[3]  chunk size in bytes requested from every hdfsRead call
+ *
+ * Returns 0 when the whole file was read, non-zero on a usage, argument,
+ * connection, open, allocation or read failure.
+ */
+int main(int argc, char **argv) {
+    /* bufferSize is an int and libhdfs chunk lengths are 32-bit signed. */
+    const unsigned long maxChunk = 0x7fffffffUL;
+    hdfsFS fs;
+    char* rfile;
+    int bufferSize;
+    hdfsFile readFile;
+    char* buffer;
+    int curSize;
+    int status = 0;
+    unsigned long requestedSize;
+    char* endPtr = NULL;
+
+    if (argc != 4) {
+        fprintf(stderr, "Usage: hdfs_read <filename> <filesize> <buffersize>\n");
+        exit(-1);
+    }
+
+    /* Validate the chunk size before touching the cluster: a non-numeric,
+     * zero or oversized value would otherwise yield a useless (or endless)
+     * read loop. */
+    requestedSize = strtoul(argv[3], &endPtr, 10);
+    if (endPtr == argv[3] || *endPtr != '\0' ||
+        requestedSize == 0 || requestedSize > maxChunk) {
+        fprintf(stderr, "invalid buffer size %s - must be between 1 and %lu\n",
+                argv[3], maxChunk);
+        exit(-3);
+    }
+    bufferSize = (int)requestedSize;
+
+    fs = hdfsConnect("default", 0);
+    if (!fs) {
+        fprintf(stderr, "Oops! Failed to connect to hdfs!\n");
+        exit(-1);
+    }
+
+    rfile = argv[1];
+    readFile = hdfsOpenFile(fs, rfile, O_RDONLY, bufferSize, 0, 0);
+    if (!readFile) {
+        fprintf(stderr, "Failed to open %s for reading!\n", rfile);
+        hdfsDisconnect(fs);
+        exit(-2);
+    }
+
+    /* buffer that receives the data read from the file */
+    buffer = malloc(sizeof(char) * bufferSize);
+    if (buffer == NULL) {
+        fprintf(stderr, "Could not allocate buffer of size %d\n", bufferSize);
+        hdfsCloseFile(fs, readFile);
+        hdfsDisconnect(fs);
+        return -2;
+    }
+
+    /* Read until end of file (0) or error (negative). A short read is not
+     * treated as end of file, so the loop keeps going until hdfsRead says
+     * there is nothing left. */
+    do {
+        curSize = hdfsRead(fs, readFile, (void*)buffer, bufferSize);
+    } while (curSize > 0);
+
+    if (curSize < 0) {
+        fprintf(stderr, "ERROR: hdfsRead returned an error on read: %d\n", curSize);
+        status = -3;
+    }
+
+    free(buffer);
+    hdfsCloseFile(fs, readFile);
+    hdfsDisconnect(fs);
+    return status;
+}
+/**
+ * vim: ts=4: sw=4: et:
+ */
diff --git a/import/pdclibhdfs/src/test_libhdfs_threaded.c b/import/pdclibhdfs/src/test_libhdfs_threaded.c
new file mode 100755
index 0000000..73d40af
--- /dev/null
+++ b/import/pdclibhdfs/src/test_libhdfs_threaded.c
@@ -0,0 +1,310 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "expect.h"
+#include "hdfs.h"
+#include "native_mini_dfs.h"
+#include <errno.h>
+#include <stdint.h>
+#include <semaphore.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* Stringify the argument exactly as written (no macro expansion). */
+#define TO_STR_HELPER(X) #X
+
+/* Upper bound on the number of worker threads the test may start. */
+#define TLH_MAX_THREADS 100
+
+/* Block size the test expects the cluster to report: 128 MiB. */
+#define TLH_DEFAULT_BLOCK_SIZE 134217728
+
+/* Semaphore initialised and destroyed by main(). */
+static sem_t tlhSem;
+
+/* Mini DFS cluster shared by every worker thread. */
+static struct NativeMiniDfsCluster* tlhCluster;
+
+/** Per-thread bookkeeping for the threaded libhdfs test. */
+struct tlhThreadInfo {
+    /** Thread index */
+    int threadIdx;
+    /** 0 = thread was successful; error code otherwise */
+    int success;
+    /** pthread identifier */
+    pthread_t thread;
+};
+static int hdfsSingleNameNodeConnect(struct NativeMiniDfsCluster *cl, hdfsFS *fs)
+ int ret, port;
+ hdfsFS hdfs;
+ struct hdfsBuilder *bld;
+ port = nmdGetNameNodePort(cl);
+ if (port < 0) {
+ fprintf(stderr, "hdfsSingleNameNodeConnect: nmdGetNameNodePort "
+ "returned error %d\n", port);
+ return port;
+ }
+ bld = hdfsNewBuilder();
+ if (!bld)
+ return -ENOMEM;
+ hdfsBuilderSetForceNewInstance(bld);
+ hdfsBuilderSetNameNode(bld, "localhost");
+ hdfsBuilderSetNameNodePort(bld, port);
+ hdfsBuilderConfSetStr(bld, "dfs.block.size",
+ hdfsBuilderConfSetStr(bld, "dfs.blocksize",
+ hdfs = hdfsBuilderConnect(bld);
+ if (!hdfs) {
+ ret = -errno;
+ return ret;
+ }
+ *fs = hdfs;
+ return 0;
+/**
+ * Check that the filesystem reports TLH_DEFAULT_BLOCK_SIZE as its default
+ * block size, both globally and for the given path.
+ *
+ * @param fs    The filesystem to query.
+ * @param path  Path passed to hdfsGetDefaultBlockSizeAtPath.
+ *
+ * @return      0 on success; the errno of a failed query; EIO when a query
+ *              succeeds but reports an unexpected block size.
+ */
+static int doTestGetDefaultBlockSize(hdfsFS fs, const char *path)
+{
+    /* Signed on purpose: the error checks below test for a negative result,
+     * which an unsigned variable could never hold. */
+    int64_t blockSize;
+    int ret;
+
+    blockSize = hdfsGetDefaultBlockSize(fs);
+    if (blockSize < 0) {
+        ret = errno;
+        fprintf(stderr, "hdfsGetDefaultBlockSize failed with error %d\n", ret);
+        return ret;
+    } else if (blockSize != TLH_DEFAULT_BLOCK_SIZE) {
+        fprintf(stderr, "hdfsGetDefaultBlockSize got %"PRId64", but we "
+                "expected %d\n", blockSize, TLH_DEFAULT_BLOCK_SIZE);
+        return EIO;
+    }
+
+    blockSize = hdfsGetDefaultBlockSizeAtPath(fs, path);
+    if (blockSize < 0) {
+        ret = errno;
+        fprintf(stderr, "hdfsGetDefaultBlockSizeAtPath(%s) failed with "
+                "error %d\n", path, ret);
+        return ret;
+    } else if (blockSize != TLH_DEFAULT_BLOCK_SIZE) {
+        fprintf(stderr, "hdfsGetDefaultBlockSizeAtPath(%s) got "
+                "%"PRId64", but we expected %d\n",
+                path, blockSize, TLH_DEFAULT_BLOCK_SIZE);
+        return EIO;
+    }
+    return 0;
+}
+/**
+ * Exercise basic libhdfs operations inside a directory private to one thread.
+ *
+ * Steps: (re)create /tlhDataNNNN, check the default block size, verify that
+ * invalid opens fail, write the directory name into <dir>/file, read it back
+ * and compare, check the read statistics, change owner and group several
+ * times, and finally delete the directory recursively.
+ *
+ * @param ti    Thread info; only threadIdx is read, to name the directory.
+ * @param fs    Connected filesystem to operate on.
+ *
+ * @return      0 on success, non-zero on the first failed step. The EXPECT_*
+ *              macros come from expect.h and presumably return from this
+ *              function when their check fails -- confirm against expect.h.
+ */
+static int doTestHdfsOperations(struct tlhThreadInfo *ti, hdfsFS fs)
+{
+    char prefix[256], tmp[256];
+    hdfsFile file;
+    int ret, expected;
+    hdfsFileInfo *fileInfo;
+    struct hdfsReadStatistics *readStats = NULL;
+
+    /* Start from a clean per-thread directory. */
+    snprintf(prefix, sizeof(prefix), "/tlhData%04d", ti->threadIdx);
+    if (hdfsExists(fs, prefix) == 0) {
+        EXPECT_ZERO(hdfsDelete(fs, prefix, 1));
+    }
+    EXPECT_ZERO(hdfsCreateDirectory(fs, prefix));
+    snprintf(tmp, sizeof(tmp), "%s/file", prefix);
+
+    EXPECT_ZERO(doTestGetDefaultBlockSize(fs, prefix));
+
+    /* There should not be any file to open for reading. */
+    EXPECT_NULL(hdfsOpenFile(fs, tmp, O_RDONLY, 0, 0, 0));
+
+    /* hdfsOpenFile should not accept mode = 3 */
+    EXPECT_NULL(hdfsOpenFile(fs, tmp, 3, 0, 0, 0));
+
+    file = hdfsOpenFile(fs, tmp, O_WRONLY, 0, 0, 0);
+    /* Fail cleanly instead of handing a NULL handle to hdfsWrite. */
+    EXPECT_NONNULL(file);
+
+    /* TODO: implement writeFully and use it here */
+    expected = strlen(prefix);
+    ret = hdfsWrite(fs, file, prefix, expected);
+    if (ret < 0) {
+        ret = errno;
+        fprintf(stderr, "hdfsWrite failed and set errno %d\n", ret);
+        return ret;
+    }
+    if (ret != expected) {
+        fprintf(stderr, "hdfsWrite was supposed to write %d bytes, but "
+                "it wrote %d\n", expected, ret);
+        return EIO;
+    }
+    EXPECT_ZERO(hdfsFlush(fs, file));
+    EXPECT_ZERO(hdfsHSync(fs, file));
+    EXPECT_ZERO(hdfsCloseFile(fs, file));
+
+    /* Let's re-open the file for reading */
+    file = hdfsOpenFile(fs, tmp, O_RDONLY, 0, 0, 0);
+    EXPECT_NONNULL(file);
+
+    /* Nothing has been read yet, so every counter must still be zero. */
+    EXPECT_ZERO(hdfsFileGetReadStatistics(file, &readStats));
+    errno = 0;
+    EXPECT_ZERO(readStats->totalBytesRead);
+    EXPECT_ZERO(readStats->totalLocalBytesRead);
+    EXPECT_ZERO(readStats->totalShortCircuitBytesRead);
+    hdfsFileFreeReadStatistics(readStats);
+
+    /* TODO: implement readFully and use it here */
+    ret = hdfsRead(fs, file, tmp, sizeof(tmp));
+    if (ret < 0) {
+        ret = errno;
+        fprintf(stderr, "hdfsRead failed and set errno %d\n", ret);
+        return ret;
+    }
+    if (ret != expected) {
+        fprintf(stderr, "hdfsRead was supposed to read %d bytes, but "
+                "it read %d\n", expected, ret);
+        return EIO;
+    }
+    EXPECT_ZERO(hdfsFileGetReadStatistics(file, &readStats));
+    errno = 0;
+    EXPECT_INT_EQ(expected, readStats->totalBytesRead);
+    hdfsFileFreeReadStatistics(readStats);
+    EXPECT_ZERO(memcmp(prefix, tmp, expected));
+    EXPECT_ZERO(hdfsCloseFile(fs, file));
+
+    // TODO: Non-recursive delete should fail?
+    //EXPECT_NONZERO(hdfsDelete(fs, prefix, 0));
+
+    /* tmp was overwritten by the read above; rebuild the file path. */
+    snprintf(tmp, sizeof(tmp), "%s/file", prefix);
+
+    /* Change group only, then owner and group, then owner only. */
+    EXPECT_ZERO(hdfsChown(fs, tmp, NULL, NULL));
+    EXPECT_ZERO(hdfsChown(fs, tmp, NULL, "doop"));
+    fileInfo = hdfsGetPathInfo(fs, tmp);
+    EXPECT_NONNULL(fileInfo);
+    EXPECT_ZERO(strcmp("doop", fileInfo->mGroup));
+    hdfsFreeFileInfo(fileInfo, 1);
+
+    EXPECT_ZERO(hdfsChown(fs, tmp, "ha", "doop2"));
+    fileInfo = hdfsGetPathInfo(fs, tmp);
+    EXPECT_NONNULL(fileInfo);
+    EXPECT_ZERO(strcmp("ha", fileInfo->mOwner));
+    EXPECT_ZERO(strcmp("doop2", fileInfo->mGroup));
+    hdfsFreeFileInfo(fileInfo, 1);
+
+    EXPECT_ZERO(hdfsChown(fs, tmp, "ha2", NULL));
+    fileInfo = hdfsGetPathInfo(fs, tmp);
+    EXPECT_NONNULL(fileInfo);
+    EXPECT_ZERO(strcmp("ha2", fileInfo->mOwner));
+    EXPECT_ZERO(strcmp("doop2", fileInfo->mGroup));
+    hdfsFreeFileInfo(fileInfo, 1);
+
+    EXPECT_ZERO(hdfsDelete(fs, prefix, 1));
+    return 0;
+}
+/**
+ * Thread entry point: connect to the shared mini cluster, run the HDFS
+ * operation test and record the outcome in the thread's info structure.
+ *
+ * @param v     Pointer to this thread's struct tlhThreadInfo.
+ *
+ * @return      Always NULL; the result is stored in ti->success
+ *              (0 on success, an error code otherwise).
+ */
+static void *testHdfsOperations(void *v)
+{
+    struct tlhThreadInfo *ti = (struct tlhThreadInfo*)v;
+    hdfsFS fs = NULL;
+    int ret;
+
+    fprintf(stderr, "testHdfsOperations(threadIdx=%d): starting\n",
+        ti->threadIdx);
+    ret = hdfsSingleNameNodeConnect(tlhCluster, &fs);
+    if (ret) {
+        fprintf(stderr, "testHdfsOperations(threadIdx=%d): "
+            "hdfsSingleNameNodeConnect failed with error %d.\n",
+            ti->threadIdx, ret);
+        ti->success = EIO;
+        return NULL;
+    }
+    ti->success = doTestHdfsOperations(ti, fs);
+    if (hdfsDisconnect(fs)) {
+        ret = errno;
+        fprintf(stderr, "hdfsDisconnect error %d\n", ret);
+        /* A failed disconnect must count as a failure even when the
+         * library left errno at 0. */
+        ti->success = ret ? ret : EIO;
+    }
+    return NULL;
+}
+/**
+ * Summarise the per-thread results on stderr.
+ *
+ * @param ti             Array of thread info records.
+ * @param tlhNumThreads  Number of valid entries in ti.
+ *
+ * @return               EXIT_SUCCESS when every thread recorded success == 0,
+ *                       EXIT_FAILURE otherwise (after listing the indices of
+ *                       the failed threads).
+ */
+static int checkFailures(struct tlhThreadInfo *ti, int tlhNumThreads)
+{
+    int i, threadsFailed = 0;
+    const char *sep = "";
+
+    for (i = 0; i < tlhNumThreads; i++) {
+        if (ti[i].success != 0) {
+            threadsFailed = 1;
+        }
+    }
+    if (!threadsFailed) {
+        fprintf(stderr, "testLibHdfs: all threads succeeded.  SUCCESS.\n");
+        return EXIT_SUCCESS;
+    }
+    fprintf(stderr, "testLibHdfs: some threads failed: [");
+    for (i = 0; i < tlhNumThreads; i++) {
+        if (ti[i].success != 0) {
+            fprintf(stderr, "%s%d", sep, i);
+            /* Every index after the first is preceded by a separator. */
+            sep = ", ";
+        }
+    }
+    fprintf(stderr, "].  FAILURE.\n");
+    return EXIT_FAILURE;
+}
+/**
+ * Test that we can write a file with libhdfs and then read it back
+ *
+ * The number of worker threads is taken from the TLH_NUM_THREADS environment
+ * variable (default 3) and must lie between 1 and TLH_MAX_THREADS.
+ *
+ * Returns EXIT_SUCCESS when every thread succeeded, EXIT_FAILURE otherwise.
+ */
+int main(void)
+{
+    int i, tlhNumThreads;
+    const char *tlhNumThreadsStr;
+    struct tlhThreadInfo ti[TLH_MAX_THREADS];
+    struct NativeMiniDfsConf conf = {
+        .doFormat = 1,
+    };
+
+    tlhNumThreadsStr = getenv("TLH_NUM_THREADS");
+    if (!tlhNumThreadsStr) {
+        tlhNumThreadsStr = "3";
+    }
+    tlhNumThreads = atoi(tlhNumThreadsStr);
+    if ((tlhNumThreads <= 0) || (tlhNumThreads > TLH_MAX_THREADS)) {
+        fprintf(stderr, "testLibHdfs: must have a number of threads "
+                "between 1 and %d inclusive, not %d\n",
+                TLH_MAX_THREADS, tlhNumThreads);
+        return EXIT_FAILURE;
+    }
+    memset(&ti[0], 0, sizeof(ti));
+    for (i = 0; i < tlhNumThreads; i++) {
+        ti[i].threadIdx = i;
+    }
+
+    /* Bring up the shared cluster before any worker starts. */
+    EXPECT_ZERO(sem_init(&tlhSem, 0, tlhNumThreads));
+    tlhCluster = nmdCreate(&conf);
+    EXPECT_NONNULL(tlhCluster);
+    EXPECT_ZERO(nmdWaitClusterUp(tlhCluster));
+
+    for (i = 0; i < tlhNumThreads; i++) {
+        EXPECT_ZERO(pthread_create(&ti[i].thread, NULL,
+            testHdfsOperations, &ti[i]));
+    }
+    for (i = 0; i < tlhNumThreads; i++) {
+        EXPECT_ZERO(pthread_join(ti[i].thread, NULL));
+    }
+
+    EXPECT_ZERO(nmdShutdown(tlhCluster));
+    nmdFree(tlhCluster);
+    EXPECT_ZERO(sem_destroy(&tlhSem));
+    return checkFailures(ti, tlhNumThreads);
+}
diff --git a/import/pdclibhdfs/src/test_libhdfs_write.c b/import/pdclibhdfs/src/test_libhdfs_write.c
new file mode 100755
index 0000000..376cfac
--- /dev/null
+++ b/import/pdclibhdfs/src/test_libhdfs_write.c
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "hdfs.h"
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+/**
+ * hdfs_write: create an HDFS file of a given size filled with a repeating
+ * 'a'..'z' pattern, written in fixed-size chunks.
+ *
+ * Usage: hdfs_write <filename> <filesize> <buffersize>
+ *   argv[1]  path of the file to write
+ *   argv[2]  total number of bytes to write (0 .. INT_MAX)
+ *   argv[3]  chunk size in bytes for each hdfsWrite call (1 .. INT_MAX)
+ *
+ * Returns 0 on success, non-zero on a usage, argument, connection, open,
+ * allocation, write or close failure.
+ */
+int main(int argc, char **argv) {
+    hdfsFS fs;
+    char* writeFileName;
+    int fileTotalSize;
+    long long tmpFileSize;
+    long long tmpBufferSize;
+    int bufferSize;
+    hdfsFile writeFile;
+    char* buffer;
+    char* endPtr = NULL;
+    int i = 0;
+    int nrRemaining;
+    int status = 0;
+
+    if (argc != 4) {
+        fprintf(stderr, "Usage: hdfs_write <filename> <filesize> <buffersize>\n");
+        exit(-1);
+    }
+
+    /* Validate both sizes before touching the cluster. The values end up in
+     * int variables, so anything outside the int range would be silently
+     * truncated otherwise. */
+    tmpFileSize = strtoll(argv[2], &endPtr, 10);
+    if (endPtr == argv[2] || *endPtr != '\0' ||
+        tmpFileSize < 0 || tmpFileSize > INT_MAX) {
+        fprintf(stderr, "invalid file size %s - must be between 0 and %d\n",
+                argv[2], INT_MAX);
+        exit(-3);
+    }
+    fileTotalSize = (int)tmpFileSize;
+
+    /* currently libhdfs writes are of tSize which is int32; a chunk size of
+     * zero would make the write loop below spin forever */
+    tmpBufferSize = strtoll(argv[3], &endPtr, 10);
+    if (endPtr == argv[3] || *endPtr != '\0' ||
+        tmpBufferSize <= 0 || tmpBufferSize > INT_MAX) {
+        fprintf(stderr, "invalid buffer size %s - libhdfs API write chunks "
+                "must be between 1 and %d\n", argv[3], INT_MAX);
+        exit(-3);
+    }
+    bufferSize = (int)tmpBufferSize;
+
+    fs = hdfsConnect("default", 0);
+    if (!fs) {
+        fprintf(stderr, "Oops! Failed to connect to hdfs!\n");
+        exit(-1);
+    }
+
+    writeFileName = argv[1];
+    writeFile = hdfsOpenFile(fs, writeFileName, O_WRONLY, bufferSize, 0, 0);
+    if (!writeFile) {
+        fprintf(stderr, "Failed to open %s for writing!\n", writeFileName);
+        hdfsDisconnect(fs);
+        exit(-2);
+    }
+
+    /* data to be written to the file */
+    buffer = malloc(sizeof(char) * bufferSize);
+    if (buffer == NULL) {
+        fprintf(stderr, "Could not allocate buffer of size %d\n", bufferSize);
+        hdfsCloseFile(fs, writeFile);
+        hdfsDisconnect(fs);
+        return -2;
+    }
+    for (i = 0; i < bufferSize; ++i) {
+        buffer[i] = 'a' + (i % 26);
+    }
+
+    /* write to the file, one chunk at a time; the last chunk may be short */
+    for (nrRemaining = fileTotalSize; nrRemaining > 0; nrRemaining -= bufferSize) {
+        tSize curSize = (bufferSize < nrRemaining) ? bufferSize : (tSize)nrRemaining;
+        tSize written = hdfsWrite(fs, writeFile, (void*)buffer, curSize);
+        if (written != curSize) {
+            fprintf(stderr, "ERROR: hdfsWrite returned an error on write: %d\n", written);
+            status = -3;
+            break;
+        }
+    }
+
+    free(buffer);
+    /* A failed close can mean the data never reached the cluster. */
+    if (hdfsCloseFile(fs, writeFile) != 0 && status == 0) {
+        fprintf(stderr, "ERROR: failed to close %s\n", writeFileName);
+        status = -3;
+    }
+    hdfsDisconnect(fs);
+    return status;
+}
+/**
+ * vim: ts=4: sw=4: et:
+ */
diff --git a/import/pdclibhdfs/src/test_native_mini_dfs.c b/import/pdclibhdfs/src/test_native_mini_dfs.c
new file mode 100755
index 0000000..ae4f868
--- /dev/null
+++ b/import/pdclibhdfs/src/test_native_mini_dfs.c
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "expect.h"
+#include "native_mini_dfs.h"
+#include <errno.h>
+/* Cluster configuration used by this test; only doFormat is set, every
+ * other field is zero-initialised. */
+static struct NativeMiniDfsConf conf = {
+    .doFormat = 1,
+};
+/**
+ * Test that we can create a MiniDFSCluster and shut it down.
+ *
+ * Returns 0 on success. The EXPECT_* macros come from expect.h and
+ * presumably return a non-zero value from main when a step fails --
+ * confirm against expect.h.
+ */
+int main(void) {
+    struct NativeMiniDfsCluster* cl;
+
+    cl = nmdCreate(&conf);
+    /* Do not hand a NULL cluster to the calls below. */
+    EXPECT_NONNULL(cl);
+    EXPECT_ZERO(nmdWaitClusterUp(cl));
+    EXPECT_ZERO(nmdShutdown(cl));
+    nmdFree(cl);
+    return 0;
+}
diff --git a/import/zconf.h b/import/zconf.h
new file mode 100644
index 0000000..9987a77
--- /dev/null
+++ b/import/zconf.h
@@ -0,0 +1,511 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-2013 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+/* @(#) $Id$ */
+#ifndef ZCONF_H
+#define ZCONF_H
+/*
+ * If you *really* need a unique prefix for all types and library functions,
+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ * Even better than compiling with -DZ_PREFIX would be to use configure to set
+ * this permanently in zconf.h using "./configure --zprefix".
+ */
+#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */
+# define Z_PREFIX_SET
+/* all linked symbols */
+# define _dist_code z__dist_code
+# define _length_code z__length_code
+# define _tr_align z__tr_align
+# define _tr_flush_bits z__tr_flush_bits
+# define _tr_flush_block z__tr_flush_block
+# define _tr_init z__tr_init
+# define _tr_stored_block z__tr_stored_block
+# define _tr_tally z__tr_tally
+# define adler32 z_adler32
+# define adler32_combine z_adler32_combine
+# define adler32_combine64 z_adler32_combine64
+# ifndef Z_SOLO
+# define compress z_compress
+# define compress2 z_compress2
+# define compressBound z_compressBound
+# endif
+# define crc32 z_crc32
+# define crc32_combine z_crc32_combine
+# define crc32_combine64 z_crc32_combine64
+# define deflate z_deflate
+# define deflateBound z_deflateBound
+# define deflateCopy z_deflateCopy
+# define deflateEnd z_deflateEnd
+# define deflateInit2_ z_deflateInit2_
+# define deflateInit_ z_deflateInit_
+# define deflateParams z_deflateParams
+# define deflatePending z_deflatePending
+# define deflatePrime z_deflatePrime
+# define deflateReset z_deflateReset
+# define deflateResetKeep z_deflateResetKeep
+# define deflateSetDictionary z_deflateSetDictionary
+# define deflateSetHeader z_deflateSetHeader
+# define deflateTune z_deflateTune
+# define deflate_copyright z_deflate_copyright
+# define get_crc_table z_get_crc_table
+# ifndef Z_SOLO
+# define gz_error z_gz_error
+# define gz_intmax z_gz_intmax
+# define gz_strwinerror z_gz_strwinerror
+# define gzbuffer z_gzbuffer
+# define gzclearerr z_gzclearerr
+# define gzclose z_gzclose
+# define gzclose_r z_gzclose_r
+# define gzclose_w z_gzclose_w
+# define gzdirect z_gzdirect
+# define gzdopen z_gzdopen
+# define gzeof z_gzeof
+# define gzerror z_gzerror
+# define gzflush z_gzflush
+# define gzgetc z_gzgetc
+# define gzgetc_ z_gzgetc_
+# define gzgets z_gzgets
+# define gzoffset z_gzoffset
+# define gzoffset64 z_gzoffset64
+# define gzopen z_gzopen
+# define gzopen64 z_gzopen64
+# ifdef _WIN32
+# define gzopen_w z_gzopen_w
+# endif
+# define gzprintf z_gzprintf
+# define gzvprintf z_gzvprintf
+# define gzputc z_gzputc
+# define gzputs z_gzputs
+# define gzread z_gzread
+# define gzrewind z_gzrewind
+# define gzseek z_gzseek
+# define gzseek64 z_gzseek64
+# define gzsetparams z_gzsetparams
+# define gztell z_gztell
+# define gztell64 z_gztell64
+# define gzungetc z_gzungetc
+# define gzwrite z_gzwrite
+# endif
+# define inflate z_inflate
+# define inflateBack z_inflateBack
+# define inflateBackEnd z_inflateBackEnd
+# define inflateBackInit_ z_inflateBackInit_
+# define inflateCopy z_inflateCopy
+# define inflateEnd z_inflateEnd
+# define inflateGetHeader z_inflateGetHeader
+# define inflateInit2_ z_inflateInit2_
+# define inflateInit_ z_inflateInit_
+# define inflateMark z_inflateMark
+# define inflatePrime z_inflatePrime
+# define inflateReset z_inflateReset
+# define inflateReset2 z_inflateReset2
+# define inflateSetDictionary z_inflateSetDictionary
+# define inflateGetDictionary z_inflateGetDictionary
+# define inflateSync z_inflateSync
+# define inflateSyncPoint z_inflateSyncPoint
+# define inflateUndermine z_inflateUndermine
+# define inflateResetKeep z_inflateResetKeep
+# define inflate_copyright z_inflate_copyright
+# define inflate_fast z_inflate_fast
+# define inflate_table z_inflate_table
+# ifndef Z_SOLO
+# define uncompress z_uncompress
+# endif
+# define zError z_zError
+# ifndef Z_SOLO
+# define zcalloc z_zcalloc
+# define zcfree z_zcfree
+# endif
+# define zlibCompileFlags z_zlibCompileFlags
+# define zlibVersion z_zlibVersion
+/* all zlib typedefs in zlib.h and zconf.h */
+# define Byte z_Byte
+# define Bytef z_Bytef
+# define alloc_func z_alloc_func
+# define charf z_charf
+# define free_func z_free_func
+# ifndef Z_SOLO
+# define gzFile z_gzFile
+# endif
+# define gz_header z_gz_header
+# define gz_headerp z_gz_headerp
+# define in_func z_in_func
+# define intf z_intf
+# define out_func z_out_func
+# define uInt z_uInt
+# define uIntf z_uIntf
+# define uLong z_uLong
+# define uLongf z_uLongf
+# define voidp z_voidp
+# define voidpc z_voidpc
+# define voidpf z_voidpf
+/* all zlib structs in zlib.h and zconf.h */
+# define gz_header_s z_gz_header_s
+# define internal_state z_internal_state
+#if defined(__MSDOS__) && !defined(MSDOS)
+# define MSDOS
+#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2)
+# define OS2
+#if defined(_WINDOWS) && !defined(WINDOWS)
+# define WINDOWS
+#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__)
+# ifndef WIN32
+# define WIN32
+# endif
+#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32)
+# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__)
+# ifndef SYS16BIT
+# define SYS16BIT
+# endif
+# endif
+/*
+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
+ * than 64k bytes at a time (needed on systems with 16-bit int).
+ */
+#ifdef SYS16BIT
+# define MAXSEG_64K
+#ifdef MSDOS
+# define UNALIGNED_OK
+#ifdef __STDC_VERSION__
+# ifndef STDC
+# define STDC
+# endif
+# if __STDC_VERSION__ >= 199901L
+# ifndef STDC99
+# define STDC99
+# endif
+# endif
+#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus))
+# define STDC
+#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__))
+# define STDC
+#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32))
+# define STDC
+#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__))
+# define STDC
+#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */
+# define STDC
+#ifndef STDC
+# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
+# define const /* note: need a more gentle solution here */
+# endif
+#if defined(ZLIB_CONST) && !defined(z_const)
+# define z_const const
+# define z_const
+/* Some Mac compilers merge all .h files incorrectly: */
+#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__)
+# define NO_DUMMY_DECL
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+# ifdef MAXSEG_64K
+# define MAX_MEM_LEVEL 8
+# else
+# define MAX_MEM_LEVEL 9
+# endif
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MAX_WBITS
+# define MAX_WBITS 15 /* 32K LZ77 window */
+/* The memory requirements for deflate are (in bytes):
+ (1 << (windowBits+2)) + (1 << (memLevel+9))
+ that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+     make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+ The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus a few kilobytes
+ for small objects.
+*/
+ /* Type declarations */
+#ifndef OF /* function prototypes */
+# ifdef STDC
+# define OF(args) args
+# else
+# define OF(args) ()
+# endif
+#ifndef Z_ARG /* function prototypes for stdarg */
+# if defined(STDC) || defined(Z_HAVE_STDARG_H)
+# define Z_ARG(args) args
+# else
+# define Z_ARG(args) ()
+# endif
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h. If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#ifdef SYS16BIT
+# if defined(M_I86SM) || defined(M_I86MM)
+ /* MSC small or medium model */
+# define SMALL_MEDIUM
+# ifdef _MSC_VER
+# define FAR _far
+# else
+# define FAR far
+# endif
+# endif
+# if (defined(__SMALL__) || defined(__MEDIUM__))
+ /* Turbo C small or medium model */
+# define SMALL_MEDIUM
+# ifdef __BORLANDC__
+# define FAR _far
+# else
+# define FAR far
+# endif
+# endif
+#if defined(WINDOWS) || defined(WIN32)
+ /* If building or using zlib as a DLL, define ZLIB_DLL.
+ * This is not mandatory, but it offers a little performance increase.
+ */
+# ifdef ZLIB_DLL
+# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
+# define ZEXTERN extern __declspec(dllexport)
+# else
+# define ZEXTERN extern __declspec(dllimport)
+# endif
+# endif
+# endif /* ZLIB_DLL */
+ /* If building or using zlib with the WINAPI/WINAPIV calling convention,
+ * define ZLIB_WINAPI.
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+ */
+# ifdef ZLIB_WINAPI
+# ifdef FAR
+# undef FAR
+# endif
+# include <windows.h>
+ /* No need for _export, use ZLIB.DEF instead. */
+ /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+# ifdef WIN32
+# else
+# endif
+# endif
+#if defined (__BEOS__)
+# ifdef ZLIB_DLL
+# define ZEXPORT __declspec(dllexport)
+# define ZEXPORTVA __declspec(dllexport)
+# else
+# define ZEXPORT __declspec(dllimport)
+# define ZEXPORTVA __declspec(dllimport)
+# endif
+# endif
+#ifndef ZEXTERN
+# define ZEXTERN extern
+#ifndef ZEXPORT
+# define ZEXPORT
+#ifndef ZEXPORTVA
+# define ZEXPORTVA
+#ifndef FAR
+# define FAR
+#if !defined(__MACTYPES__)
+typedef unsigned char Byte; /* 8 bits */
+typedef unsigned int uInt; /* 16 bits or more */
+typedef unsigned long uLong; /* 32 bits or more */
+ /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */
+# define Bytef Byte FAR
+ typedef Byte FAR Bytef;
+typedef char FAR charf;
+typedef int FAR intf;
+typedef uInt FAR uIntf;
+typedef uLong FAR uLongf;
+#ifdef STDC
+ typedef void const *voidpc;
+ typedef void FAR *voidpf;
+ typedef void *voidp;
+ typedef Byte const *voidpc;
+ typedef Byte FAR *voidpf;
+ typedef Byte *voidp;
+#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC)
+# include <limits.h>
+# if (UINT_MAX == 0xffffffffUL)
+# define Z_U4 unsigned
+# elif (ULONG_MAX == 0xffffffffUL)
+# define Z_U4 unsigned long
+# elif (USHRT_MAX == 0xffffffffUL)
+# define Z_U4 unsigned short
+# endif
+#ifdef Z_U4
+ typedef Z_U4 z_crc_t;
+ typedef unsigned long z_crc_t;
+#ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */
+# define Z_HAVE_UNISTD_H
+#ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */
+# define Z_HAVE_STDARG_H
+#ifdef STDC
+# ifndef Z_SOLO
+# include <sys/types.h> /* for off_t */
+# endif
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+# ifndef Z_SOLO
+# include <stdarg.h> /* for va_list */
+# endif
+#ifdef _WIN32
+# ifndef Z_SOLO
+# include <stddef.h> /* for wchar_t */
+# endif
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H)
+# define Z_HAVE_UNISTD_H
+#ifndef Z_SOLO
+# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE)
+# include <unistd.h> /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+# ifdef VMS
+# include <unixio.h> /* for off_t */
+# endif
+# ifndef z_off_t
+# define z_off_t off_t
+# endif
+# endif
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+# define Z_LFS64
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+# define Z_LARGE64
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+# define Z_WANT64
+#if !defined(SEEK_SET) && !defined(Z_SOLO)
+# define SEEK_SET 0 /* Seek from beginning of file. */
+# define SEEK_CUR 1 /* Seek from current position. */
+# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */
+#ifndef z_off_t
+# define z_off_t long
+#if !defined(_WIN32) && defined(Z_LARGE64)
+# define z_off64_t off64_t
+# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO)
+# define z_off64_t __int64
+# else
+# define z_off64_t z_off_t
+# endif
+/* MVS linker does not support external names larger than 8 bytes */
+#if defined(__MVS__)
+ #pragma map(deflateInit_,"DEIN")
+ #pragma map(deflateInit2_,"DEIN2")
+ #pragma map(deflateEnd,"DEEND")
+ #pragma map(deflateBound,"DEBND")
+ #pragma map(inflateInit_,"ININ")
+ #pragma map(inflateInit2_,"ININ2")
+ #pragma map(inflateEnd,"INEND")
+ #pragma map(inflateSync,"INSY")
+ #pragma map(inflateSetDictionary,"INSEDI")
+ #pragma map(compressBound,"CMBND")
+ #pragma map(inflate_table,"INTABL")
+ #pragma map(inflate_fast,"INFA")
+ #pragma map(inflate_copyright,"INCOPY")
+#endif /* ZCONF_H */
diff --git a/import/zlib.h b/import/zlib.h
new file mode 100644
index 0000000..3e0c767
--- /dev/null
+++ b/import/zlib.h
@@ -0,0 +1,1768 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.8, April 28th, 2013
+ Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ Jean-loup Gailly Mark Adler
+ jloup at gzip.org madler at alumni.caltech.edu
+ The data format used by the zlib library is described by RFCs (Request for
+ Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950
+ (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format).
+#ifndef ZLIB_H
+#define ZLIB_H
+#include "zconf.h"
+#ifdef __cplusplus
+extern "C" {
+#define ZLIB_VERSION "1.2.8"
+#define ZLIB_VERNUM 0x1280
+#define ZLIB_VER_MAJOR 1
+#define ZLIB_VER_MINOR 2
+ The 'zlib' compression library provides in-memory compression and
+ decompression functions, including integrity checks of the uncompressed data.
+ This version of the library supports only one compression method (deflation)
+ but other algorithms will be added later and will have the same stream
+ interface.
+ Compression can be done in a single step if the buffers are large enough,
+ or can be done by repeated calls of the compression function. In the latter
+ case, the application must provide more input and/or consume the output
+ (providing more output space) before each call.
+ The compressed data format used by default by the in-memory functions is
+ the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+ around a deflate stream, which is itself documented in RFC 1951.
+ The library also supports reading and writing files in gzip (.gz) format
+ with an interface similar to that of stdio using the functions that start
+ with "gz". The gzip format is different from the zlib format. gzip is a
+ gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+ This library can optionally read and write gzip streams in memory as well.
+ The zlib format was designed to be compact and fast for use in memory
+ and on communications channels. The gzip format was designed for single-
+ file compression on file systems, has a larger header than zlib to maintain
+ directory information, and uses a different, slower check method than zlib.
+ The library does not install any signal handler. The decoder checks
+ the consistency of the compressed data, so the library should never crash
+ even in case of corrupted input.
+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
+typedef void (*free_func) OF((voidpf opaque, voidpf address));
+struct internal_state;
+typedef struct z_stream_s { /* per-stream input/output cursors, running totals, last error message, private state and allocator hooks */
+ z_const Bytef *next_in; /* next input byte */
+ uInt avail_in; /* number of bytes available at next_in */
+ uLong total_in; /* total number of input bytes read so far */
+ Bytef *next_out; /* next output byte should be put there */
+ uInt avail_out; /* remaining free space at next_out */
+ uLong total_out; /* total number of bytes output so far */
+ z_const char *msg; /* last error message, NULL if no error */
+ struct internal_state FAR *state; /* not visible by applications */
+ alloc_func zalloc; /* used to allocate the internal state */
+ free_func zfree; /* used to free the internal state */
+ voidpf opaque; /* private data object passed to zalloc and zfree */
+ int data_type; /* best guess about the data type: binary or text */
+ uLong adler; /* adler32 value of the uncompressed data */
+ uLong reserved; /* reserved for future use */
+} z_stream;
+typedef z_stream FAR *z_streamp;
+ gzip header information passed to and from zlib routines. See RFC 1952
+ for more details on the meanings of these fields.
+typedef struct gz_header_s { /* gzip header fields; extra_max, name_max and comm_max give buffer space and apply only when reading a header */
+ int text; /* true if compressed data believed to be text */
+ uLong time; /* modification time */
+ int xflags; /* extra flags (not used when writing a gzip file) */
+ int os; /* operating system */
+ Bytef *extra; /* pointer to extra field or Z_NULL if none */
+ uInt extra_len; /* extra field length (valid if extra != Z_NULL) */
+ uInt extra_max; /* space at extra (only when reading header) */
+ Bytef *name; /* pointer to zero-terminated file name or Z_NULL */
+ uInt name_max; /* space at name (only when reading header) */
+ Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */
+ uInt comm_max; /* space at comment (only when reading header) */
+ int hcrc; /* true if there was or will be a header crc */
+ int done; /* true when done reading gzip header (not used
+ when writing a gzip file) */
+} gz_header;
+typedef gz_header FAR *gz_headerp;
+ The application must update next_in and avail_in when avail_in has dropped
+ to zero. It must update next_out and avail_out when avail_out has dropped
+ to zero. The application must initialize zalloc, zfree and opaque before
+ calling the init function. All other fields are set by the compression
+ library and must not be updated by the application.
+ The opaque value provided by the application will be passed as the first
+ parameter for calls of zalloc and zfree. This can be useful for custom
+ memory management. The compression library attaches no meaning to the
+ opaque value.
+ zalloc must return Z_NULL if there is not enough memory for the object.
+ If zlib is used in a multi-threaded application, zalloc and zfree must be
+ thread safe.
+ On 16-bit systems, the functions zalloc and zfree must be able to allocate
+ exactly 65536 bytes, but will not be required to allocate more than this if
+ the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, pointers
+ returned by zalloc for objects of exactly 65536 bytes *must* have their
+ offset normalized to zero. The default allocation function provided by this
+ library ensures this (see zutil.c). To reduce memory requirements and avoid
+ any allocation of 64K objects, at the expense of compression ratio, compile
+ the library with -DMAX_WBITS=14 (see zconf.h).
+ The fields total_in and total_out can be used for statistics or progress
+ reports. After compression, total_in holds the total size of the
+ uncompressed data and may be saved for use in the decompressor (particularly
+ if the decompressor wants to decompress everything in a single step).
+ /* constants */
+#define Z_NO_FLUSH 0
+#define Z_PARTIAL_FLUSH 1
+#define Z_SYNC_FLUSH 2
+#define Z_FULL_FLUSH 3
+#define Z_FINISH 4
+#define Z_BLOCK 5
+#define Z_TREES 6
+/* Allowed flush values; see deflate() and inflate() below for details */
+#define Z_OK 0
+#define Z_STREAM_END 1
+#define Z_NEED_DICT 2
+#define Z_ERRNO (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR (-3)
+#define Z_MEM_ERROR (-4)
+#define Z_BUF_ERROR (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative values
+ * are errors, positive values are used for special but normal events.
+ */
+#define Z_BEST_SPEED 1
+/* compression levels */
+#define Z_FILTERED 1
+#define Z_HUFFMAN_ONLY 2
+#define Z_RLE 3
+#define Z_FIXED 4
+/* compression strategy; see deflateInit2() below for details */
+#define Z_BINARY 0
+#define Z_TEXT 1
+#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN 2
+/* Possible values of the data_type field (though see inflate()) */
+#define Z_DEFLATED 8
+/* The deflate compression method (the only one supported in this version) */
+#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+ /* basic functions */
+ZEXTERN const char * ZEXPORT zlibVersion OF((void));
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+ If the first character differs, the library code actually used is not
+ compatible with the zlib.h header file used by the application. This check
+ is automatically made by deflateInit and inflateInit.
+ */
+ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
+ Initializes the internal stream state for compression. The fields
+ zalloc, zfree and opaque must be initialized before by the caller. If
+ zalloc and zfree are set to Z_NULL, deflateInit updates them to use default
+ allocation functions.
+ The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+ 1 gives best speed, 9 gives best compression, 0 gives no compression at all
+ (the input data is simply copied a block at a time). Z_DEFAULT_COMPRESSION
+ requests a default compromise between speed and compression (currently
+ equivalent to level 6).
+ deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if level is not a valid compression level, or
+ Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+ with the version assumed by the caller (ZLIB_VERSION). msg is set to null
+ if there is no error message. deflateInit does not perform any compression:
+ this will be done by deflate().
+ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
+ deflate compresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
+ forced to flush.
+ The detailed semantics are as follows. deflate performs one or both of the
+ following actions:
+ - Compress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in and avail_in are updated and
+ processing will resume at this point for the next call of deflate().
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. This action is forced if the parameter flush is non zero.
+ Forcing flush frequently degrades the compression ratio, so this parameter
+ should be set only when necessary (in interactive applications). Some
+ output may be provided even if flush is not set.
+ Before the call of deflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming more
+ output, and updating avail_in or avail_out accordingly; avail_out should
+ never be zero before the call. The application can consume the compressed
+ output when it wants, for example when the output buffer is full (avail_out
+ == 0), or after each call of deflate(). If deflate returns Z_OK and with
+ zero avail_out, it must be called again after making room in the output
+ buffer because there might be more output pending.
+ Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+ decide how much data to accumulate before producing output, in order to
+ maximize compression.
+ If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+ flushed to the output buffer and the output is aligned on a byte boundary, so
+ that the decompressor can get all input data available so far. (In
+ particular avail_in is zero after the call if enough output space has been
+ provided before the call.) Flushing may degrade compression for some
+ compression algorithms and so it should be used only when necessary. This
+ completes the current deflate block and follows it with an empty stored block
+ that is three bits plus filler bits to the next byte, followed by four bytes
+ (00 00 ff ff).
+ If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the
+ output buffer, but the output is not aligned to a byte boundary. All of the
+ input data so far will be available to the decompressor, as for Z_SYNC_FLUSH.
+ This completes the current deflate block and follows it with an empty fixed
+ codes block that is 10 bits long. This assures that enough bytes are output
+ in order for the decompressor to finish the block before the empty fixed code
+ block.
+ If flush is set to Z_BLOCK, a deflate block is completed and emitted, as
+ for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to
+ seven bits of the current block are held to be written as the next byte after
+ the next deflate block is completed. In this case, the decompressor may not
+ be provided enough bits at this point in order to complete decompression of
+ the data provided so far to the compressor. It may need to wait for the next
+ block to be emitted. This is for advanced applications that need to control
+ the emission of deflate blocks.
+ If flush is set to Z_FULL_FLUSH, all output is flushed as with
+ Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+ restart from this point if previous compressed data has been damaged or if
+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+ compression.
+ If deflate returns with avail_out == 0, this function must be called again
+ with the same value of the flush parameter and more output space (updated
+ avail_out), until the flush is complete (deflate returns with non-zero
+ avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+ avail_out is greater than six to avoid repeated flush markers due to
+ avail_out == 0 on return.
+ If the parameter flush is set to Z_FINISH, pending input is processed,
+ pending output is flushed and deflate returns with Z_STREAM_END if there was
+ enough output space; if deflate returns with Z_OK, this function must be
+ called again with Z_FINISH and more output space (updated avail_out) but no
+ more input data, until it returns with Z_STREAM_END or an error. After
+ deflate has returned Z_STREAM_END, the only possible operations on the stream
+ are deflateReset or deflateEnd.
+ Z_FINISH can be used immediately after deflateInit if all the compression
+ is to be done in a single step. In this case, avail_out must be at least the
+ value returned by deflateBound (see below). Then deflate is guaranteed to
+ return Z_STREAM_END. If not enough output space is provided, deflate will
+ not return Z_STREAM_END, and it must be called again as described above.
+ deflate() sets strm->adler to the adler32 checksum of all input read
+ so far (that is, total_in bytes).
+ deflate() may update strm->data_type if it can make a good guess about
+ the input data type (Z_BINARY or Z_TEXT). When in doubt, the data is considered
+ binary. This field is only for information purposes and does not affect the
+ compression algorithm in any manner.
+ deflate() returns Z_OK if some progress has been made (more input
+ processed or more output produced), Z_STREAM_END if all input has been
+ consumed and all output has been produced (only when flush is set to
+ Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+ if next_in or next_out was Z_NULL), Z_BUF_ERROR if no progress is possible
+ (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
+ fatal, and deflate() can be called again with more input and more output
+ space to continue compressing.
+ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any pending
+ output.
+ deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+ stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+ prematurely (some input or output was discarded). In the error case, msg
+ may be set but then points to a static string (which must not be
+ deallocated).
+ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
+ Initializes the internal stream state for decompression. The fields
+ next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+ the caller. If next_in is not Z_NULL and avail_in is large enough (the
+ exact value depends on the compression method), inflateInit determines the
+ compression method from the zlib header and allocates all data structures
+ accordingly; otherwise the allocation will be deferred to the first call of
+ inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to
+ use default allocation functions.
+ inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+ invalid, such as a null pointer to the structure. msg is set to null if
+ there is no error message. inflateInit does not perform any decompression
+ apart from possibly reading the zlib header if present: actual decompression
+ will be done by inflate(). (So next_in and avail_in may be modified, but
+ next_out and avail_out are unused and unchanged.) The current implementation
+ of inflateInit() does not process any header information -- that is deferred
+ until inflate() is called.
+ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
+ inflate decompresses as much data as possible, and stops when the input
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
+ forced to flush.
+ The detailed semantics are as follows. inflate performs one or both of the
+ following actions:
+ - Decompress more input starting at next_in and update next_in and avail_in
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in is updated and processing will
+ resume at this point for the next call of inflate().
+ - Provide more output starting at next_out and update next_out and avail_out
+ accordingly. inflate() provides as much output as possible, until there is
+ no more input data or no more space in the output buffer (see below about
+ the flush parameter).
+ Before the call of inflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming more
+ output, and updating the next_* and avail_* values accordingly. The
+ application can consume the uncompressed output when it wants, for example
+ when the output buffer is full (avail_out == 0), or after each call of
+ inflate(). If inflate returns Z_OK and with zero avail_out, it must be
+ called again after making room in the output buffer because there might be
+ more output pending.
+ The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH,
+ Z_BLOCK, or Z_TREES. Z_SYNC_FLUSH requests that inflate() flush as much
+ output as possible to the output buffer. Z_BLOCK requests that inflate()
+ stop if and when it gets to the next deflate block boundary. When decoding
+ the zlib or gzip format, this will cause inflate() to return immediately
+ after the header and before the first block. When doing a raw inflate,
+ inflate() will go ahead and process the first block, and will return when it
+ gets to the end of that block, or when it runs out of data.
+ The Z_BLOCK option assists in appending to or combining deflate streams.
+ Also to assist in this, on return inflate() will set strm->data_type to the
+ number of unused bits in the last byte taken from strm->next_in, plus 64 if
+ inflate() is currently decoding the last block in the deflate stream, plus
+ 128 if inflate() returned immediately after decoding an end-of-block code or
+ decoding the complete header up to just before the first byte of the deflate
+ stream. The end-of-block will not be indicated until all of the uncompressed
+ data from that block has been written to strm->next_out. The number of
+ unused bits may in general be greater than seven, except when bit 7 of
+ data_type is set, in which case the number of unused bits will be less than
+ eight. data_type is set as noted here every time inflate() returns for all
+ flush options, and so can be used to determine the amount of currently
+ consumed input in bits.
+ The Z_TREES option behaves as Z_BLOCK does, but it also returns when the
+ end of each deflate block header is reached, before any actual data in that
+ block is decoded. This allows the caller to determine the length of the
+ deflate block header for later use in random access within a deflate block.
+ 256 is added to the value of strm->data_type when inflate() returns
+ immediately after reaching the end of the deflate block header.
+ inflate() should normally be called until it returns Z_STREAM_END or an
+ error. However if all decompression is to be performed in a single step (a
+ single call of inflate), the parameter flush should be set to Z_FINISH. In
+ this case all pending input is processed and all pending output is flushed;
+ avail_out must be large enough to hold all of the uncompressed data for the
+ operation to complete. (The size of the uncompressed data may have been
+ saved by the compressor for this purpose.) The use of Z_FINISH is not
+ required to perform an inflation in one step. However it may be used to
+ inform inflate that a faster approach can be used for the single inflate()
+ call. Z_FINISH also informs inflate to not maintain a sliding window if the
+ stream completes, which reduces inflate's memory footprint. If the stream
+ does not complete, either because not all of the stream is provided or not
+ enough output space is provided, then a sliding window will be allocated and
+ inflate() can be called again to continue the operation as if Z_NO_FLUSH had
+ been used.
+ In this implementation, inflate() always flushes as much output as
+ possible to the output buffer, and always uses the faster approach on the
+ first call. So the effects of the flush parameter in this implementation are
+ on the return value of inflate() as noted below, when inflate() returns early
+ when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of
+ memory for a sliding window when Z_FINISH is used.
+ If a preset dictionary is needed after this call (see inflateSetDictionary
+ below), inflate sets strm->adler to the Adler-32 checksum of the dictionary
+ chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+ strm->adler to the Adler-32 checksum of all output produced so far (that is,
+ total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+ below. At the end of the stream, inflate() checks that its computed adler32
+ checksum is equal to that saved by the compressor and returns Z_STREAM_END
+ only if the checksum is correct.
+ inflate() can decompress and check either zlib-wrapped or gzip-wrapped
+ deflate data. The header type is detected automatically, if requested when
+ initializing with inflateInit2(). Any information contained in the gzip
+ header is not retained, so applications that need that information should
+ instead use raw inflate, see inflateInit2() below, or inflateBack() and
+ perform their own processing of the gzip header and trailer. When processing
+ gzip-wrapped deflate data, strm->adler is set to the CRC-32 of the output
+ produced so far. The CRC-32 is checked against the gzip trailer.
+ inflate() returns Z_OK if some progress has been made (more input processed
+ or more output produced), Z_STREAM_END if the end of the compressed data has
+ been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+ preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+ corrupted (input stream not conforming to the zlib format or incorrect check
+ value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+ next_in or next_out was Z_NULL), Z_MEM_ERROR if there was not enough memory,
+ Z_BUF_ERROR if no progress is possible or if there was not enough room in the
+ output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+ inflate() can be called again with more input and more output space to
+ continue decompressing. If Z_DATA_ERROR is returned, the application may
+ then call inflateSync() to look for a good compression block if a partial
+ recovery of the data is desired.
+ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
+ All dynamically allocated data structures for this stream are freed.
+ This function discards any unprocessed input and does not flush any pending
+ output.
+ inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
+ was inconsistent. In the error case, msg may be set but then points to a
+ static string (which must not be deallocated).
+ /* Advanced functions */
+ The following functions are needed only in some special applications.
+ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
+ int level,
+ int method,
+ int windowBits,
+ int memLevel,
+ int strategy));
+ This is another version of deflateInit with more compression options. The
+ fields next_in, zalloc, zfree and opaque must be initialized before by the
+ caller.
+ The method parameter is the compression method. It must be Z_DEFLATED in
+ this version of the library.
+ The windowBits parameter is the base two logarithm of the window size
+ (the size of the history buffer). It should be in the range 8..15 for this
+ version of the library. Larger values of this parameter result in better
+ compression at the expense of memory usage. The default value is 15 if
+ deflateInit is used instead.
+ windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
+ determines the window size. deflate() will then generate raw deflate data
+ with no zlib header or trailer, and will not compute an adler32 check value.
+ windowBits can also be greater than 15 for optional gzip encoding. Add
+ 16 to windowBits to write a simple gzip header and trailer around the
+ compressed data instead of a zlib wrapper. The gzip header will have no
+ file name, no extra data, no comment, no modification time (set to zero), no
+ header crc, and the operating system will be set to 255 (unknown). If a
+ gzip stream is being written, strm->adler is a crc32 instead of an adler32.
+ The memLevel parameter specifies how much memory should be allocated
+ for the internal compression state. memLevel=1 uses minimum memory but is
+ slow and reduces compression ratio; memLevel=9 uses maximum memory for
+ optimal speed. The default value is 8. See zconf.h for total memory usage
+ as a function of windowBits and memLevel.
+ The strategy parameter is used to tune the compression algorithm. Use the
+ value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+ filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+ string match), or Z_RLE to limit match distances to one (run-length
+ encoding). Filtered data consists mostly of small values with a somewhat
+ random distribution. In this case, the compression algorithm is tuned to
+ compress them better. The effect of Z_FILTERED is to force more Huffman
+ coding and less string matching; it is somewhat intermediate between
+ Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as
+ fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The
+ strategy parameter only affects the compression ratio but not the
+ correctness of the compressed output even if it is not set appropriately.
+ Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+ decoder for special applications.
+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid
+ method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is
+ incompatible with the version assumed by the caller (ZLIB_VERSION). msg is
+ set to null if there is no error message. deflateInit2 does not perform any
+ compression: this will be done by deflate().
+ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+ Initializes the compression dictionary from the given byte sequence
+ without producing any compressed output. When using the zlib format, this
+ function must be called immediately after deflateInit, deflateInit2 or
+ deflateReset, and before any call of deflate. When doing raw deflate, this
+ function must be called either before any call of deflate, or immediately
+ after the completion of a deflate block, i.e. after all input has been
+ consumed and all output has been delivered when using any of the flush
+ options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH. The
+ compressor and decompressor must use exactly the same dictionary (see
+ inflateSetDictionary).
+ The dictionary should consist of strings (byte sequences) that are likely
+ to be encountered later in the data to be compressed, with the most commonly
+ used strings preferably put towards the end of the dictionary. Using a
+ dictionary is most useful when the data to be compressed is short and can be
+ predicted with good accuracy; the data can then be compressed better than
+ with the default empty dictionary.
+ Depending on the size of the compression data structures selected by
+ deflateInit or deflateInit2, a part of the dictionary may in effect be
+ discarded, for example if the dictionary is larger than the window size
+ provided in deflateInit or deflateInit2. Thus the strings most likely to be
+ useful should be put at the end of the dictionary, not at the front. In
+ addition, the current implementation of deflate will use at most the window
+ size minus 262 bytes of the provided dictionary.
+ Upon return of this function, strm->adler is set to the adler32 value
+ of the dictionary; the decompressor may later use this value to determine
+ which dictionary has been used by the compressor. (The adler32 value
+ applies to the whole dictionary even if only a subset of the dictionary is
+ actually used by the compressor.) If a raw deflate was requested, then the
+ adler32 value is not computed and strm->adler is not set.
+ deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+ parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is
+ inconsistent (for example if deflate has already been called for this stream
+ or if not at a block boundary for raw deflate). deflateSetDictionary does
+ not perform any compression: this will be done by deflate().
+ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
+ z_streamp source));
+ Sets the destination stream as a complete copy of the source stream.
+ This function can be useful when several compression strategies will be
+ tried, for example when there are several ways of pre-processing the input
+ data with a filter. The streams that will be discarded should then be freed
+ by calling deflateEnd. Note that deflateCopy duplicates the internal
+ compression state which can be quite large, so this strategy is slow and can
+ consume lots of memory.
+ deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being Z_NULL). msg is left unchanged in both source and
+ destination.
+ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
+ This function is equivalent to deflateEnd followed by deflateInit,
+ but does not free and reallocate all the internal compression state. The
+ stream will keep the same compression level and any other attributes that
+ may have been set by deflateInit2.
+ deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL).
+ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
+ int level,
+ int strategy));
+ Dynamically update the compression level and compression strategy. The
+ interpretation of level and strategy is as in deflateInit2. This can be
+ used to switch between compression and straight copy of the input data, or
+ to switch to a different kind of input data requiring a different strategy.
+ If the compression level is changed, the input available so far is
+ compressed with the old level (and may be flushed); the new level will take
+ effect only at the next call of deflate().
+ Before the call of deflateParams, the stream state must be set as for
+ a call of deflate(), since the currently available input may have to be
+ compressed and flushed. In particular, strm->avail_out must be non-zero.
+ deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
+ stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR if
+ strm->avail_out was zero.
+ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
+ int good_length,
+ int max_lazy,
+ int nice_length,
+ int max_chain));
+ Fine tune deflate's internal compression parameters. This should only be
+ used by someone who understands the algorithm used by zlib's deflate for
+ searching for the best matching string, and even then only by the most
+ fanatic optimizer trying to squeeze out the last compressed bit for their
+ specific input data. Read the deflate.c source code for the meaning of the
+ max_lazy, good_length, nice_length, and max_chain parameters.
+ deflateTune() can be called after deflateInit() or deflateInit2(), and
+ returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
+ uLong sourceLen));
+ deflateBound() returns an upper bound on the compressed size after
+ deflation of sourceLen bytes. It must be called after deflateInit() or
+ deflateInit2(), and after deflateSetHeader(), if used. This would be used
+ to allocate an output buffer for deflation in a single pass, and so would be
+ called before deflate(). If that first deflate() call is provided the
+ sourceLen input bytes, an output buffer allocated to the size returned by
+ deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed
+ to return Z_STREAM_END. Note that it is possible for the compressed size to
+ be larger than the value returned by deflateBound() if flush options other
+ than Z_FINISH or Z_NO_FLUSH are used.
+ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm,
+ unsigned *pending,
+ int *bits));
+ deflatePending() returns the number of bytes and bits of output that have
+ been generated, but not yet provided in the available output. The bytes not
+ provided would be due to the available output space having been consumed.
+ The number of bits of output not provided are between 0 and 7, where they
+ await more bits to join them in order to fill out a full byte. If pending
+ or bits are Z_NULL, then those values are not set.
+ deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+ */
+ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+ deflatePrime() inserts bits in the deflate output stream. The intent
+ is that this function is used to start off the deflate output with the bits
+ leftover from a previous deflate stream when appending to it. As such, this
+ function can only be used for raw deflate, and must be used before the first
+ deflate() call after a deflateInit2() or deflateReset(). bits must be less
+ than or equal to 16, and that many of the least significant bits of value
+ will be inserted in the output.
+ deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough
+ room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the
+ source stream state was inconsistent.
+ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
+ gz_headerp head));
+ deflateSetHeader() provides gzip header information for when a gzip
+ stream is requested by deflateInit2(). deflateSetHeader() may be called
+ after deflateInit2() or deflateReset() and before the first call of
+ deflate(). The text, time, os, extra field, name, and comment information
+ in the provided gz_header structure are written to the gzip header (xflag is
+ ignored -- the extra flags are set according to the compression level). The
+ caller must assure that, if not Z_NULL, name and comment are terminated with
+ a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+ available there. If hcrc is true, a gzip header crc is included. Note that
+ the current versions of the command-line version of gzip (up through version
+ 1.3.x) do not support header crc's, and will report that it is a "multi-part
+ gzip file" and give up.
+ If deflateSetHeader is not used, the default gzip header has text false,
+ the time set to zero, and os set to 255, with no extra, name, or comment
+ fields. The gzip header is returned to the default state by deflateReset().
+ deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
+ int windowBits));
+ This is another version of inflateInit with an extra parameter. The
+ fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+ before by the caller.
+ The windowBits parameter is the base two logarithm of the maximum window
+ size (the size of the history buffer). It should be in the range 8..15 for
+ this version of the library. The default value is 15 if inflateInit is used
+ instead. windowBits must be greater than or equal to the windowBits value
+ provided to deflateInit2() while compressing, or it must be equal to 15 if
+ deflateInit2() was not used. If a compressed stream with a larger window
+ size is given as input, inflate() will return with the error code
+ Z_DATA_ERROR instead of trying to allocate a larger window.
+ windowBits can also be zero to request that inflate use the window size in
+ the zlib header of the compressed stream.
+ windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+ determines the window size. inflate() will then process raw deflate data,
+ not looking for a zlib or gzip header, not generating a check value, and not
+ looking for any check values for comparison at the end of the stream. This
+ is for use with other formats that use the deflate compressed data format
+ such as zip. Those formats provide their own check values. If a custom
+ format is developed using the raw deflate format for compressed data, it is
+ recommended that a check value such as an adler32 or a crc32 be applied to
+ the uncompressed data as is done in the zlib, gzip, and zip formats. For
+ most applications, the zlib format should be used as is. Note that comments
+ above on the use in deflateInit2() applies to the magnitude of windowBits.
+ windowBits can also be greater than 15 for optional gzip decoding. Add
+ 32 to windowBits to enable zlib and gzip decoding with automatic header
+ detection, or add 16 to decode only the gzip format (the zlib format will
+ return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is a
+ crc32 instead of an adler32.
+ inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+ invalid, such as a null pointer to the structure. msg is set to null if
+ there is no error message. inflateInit2 does not perform any decompression
+ apart from possibly reading the zlib header if present: actual decompression
+ will be done by inflate(). (So next_in and avail_in may be modified, but
+ next_out and avail_out are unused and unchanged.) The current implementation
+ of inflateInit2() does not process any header information -- that is
+ deferred until inflate() is called.
+ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
+ const Bytef *dictionary,
+ uInt dictLength));
+ Initializes the decompression dictionary from the given uncompressed byte
+ sequence. This function must be called immediately after a call of inflate,
+ if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+ can be determined from the adler32 value returned by that call of inflate.
+ The compressor and decompressor must use exactly the same dictionary (see
+ deflateSetDictionary). For raw inflate, this function can be called at any
+ time to set the dictionary. If the provided dictionary is smaller than the
+ window and there is already data in the window, then the provided dictionary
+ will amend what's there. The application must ensure that the dictionary
+ that was used for compression is provided.
+ inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+ parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is
+ inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+ expected one (incorrect adler32 value). inflateSetDictionary does not
+ perform any decompression: this will be done by subsequent calls of
+ inflate().
+ZEXTERN int ZEXPORT inflateGetDictionary OF((z_streamp strm,
+ Bytef *dictionary,
+ uInt *dictLength));
+ Returns the sliding dictionary being maintained by inflate. dictLength is
+ set to the number of bytes in the dictionary, and that many bytes are copied
+ to dictionary. dictionary must have enough space, where 32768 bytes is
+ always enough. If inflateGetDictionary() is called with dictionary equal to
+ Z_NULL, then only the dictionary length is returned, and nothing is copied.
+ Similarly, if dictLength is Z_NULL, then it is not set.
+ inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+ stream state is inconsistent.
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+ Skips invalid compressed data until a possible full flush point (see above
+ for the description of deflate with Z_FULL_FLUSH) can be found, or until all
+ available input is skipped. No output is provided.
+ inflateSync searches for a 00 00 FF FF pattern in the compressed data.
+ All full flush points have this pattern, but not all occurrences of this
+ pattern are full flush points.
+ inflateSync returns Z_OK if a possible full flush point has been found,
+ Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point
+ has been found, or Z_STREAM_ERROR if the stream structure was inconsistent.
+ In the success case, the application may save the current value of
+ total_in which indicates where valid compressed data was found. In the
+ error case, the application may repeatedly call inflateSync, providing more
+ input each time, until success or end of the input data.
+ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
+ z_streamp source));
+ Sets the destination stream as a complete copy of the source stream.
+ This function can be useful when randomly accessing a large stream. The
+ first pass through the stream can periodically record the inflate state,
+ allowing restarting inflate at those points when randomly accessing the
+ stream.
+ inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+ (such as zalloc being Z_NULL). msg is left unchanged in both source and
+ destination.
+ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+ This function is equivalent to inflateEnd followed by inflateInit,
+ but does not free and reallocate all the internal decompression state. The
+ stream will keep attributes that may have been set by inflateInit2.
+ inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL).
+ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm,
+ int windowBits));
+ This function is the same as inflateReset, but it also permits changing
+ the wrap and window size requests. The windowBits parameter is interpreted
+ the same as it is for inflateInit2.
+ inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL), or if
+ the windowBits parameter is invalid.
+ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
+ int bits,
+ int value));
+ This function inserts bits in the inflate input stream. The intent is
+ that this function is used to start inflating at a bit position in the
+ middle of a byte. The provided bits will be used before any bytes are used
+ from next_in. This function should only be used with raw inflate, and
+ should be used before the first inflate() call after inflateInit2() or
+ inflateReset(). bits must be less than or equal to 16, and that many of the
+ least significant bits of value will be inserted in the input.
+ If bits is negative, then the input stream bit buffer is emptied. Then
+ inflatePrime() can be called again to put bits in the buffer. This is used
+ to clear out bits leftover after feeding inflate a block description prior
+ to feeding inflate codes.
+ inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm));
+ This function returns two values, one in the lower 16 bits of the return
+ value, and the other in the remaining upper bits, obtained by shifting the
+ return value down 16 bits. If the upper value is -1 and the lower value is
+ zero, then inflate() is currently decoding information outside of a block.
+ If the upper value is -1 and the lower value is non-zero, then inflate is in
+ the middle of a stored block, with the lower value equaling the number of
+ bytes from the input remaining to copy. If the upper value is not -1, then
+ it is the number of bits back from the current bit position in the input of
+ the code (literal or length/distance pair) currently being processed. In
+ that case the lower value is the number of bytes already emitted for that
+ code.
+ A code is being processed if inflate is waiting for more input to complete
+ decoding of the code, or if it has completed decoding but is waiting for
+ more output space to write the literal or match data.
+ inflateMark() is used to mark locations in the input data for random
+ access, which may be at bit positions, and to note those cases where the
+ output of a code may span boundaries of random access blocks. The current
+ location in the input stream can be determined from avail_in and data_type
+ as noted in the description for the Z_BLOCK flush parameter for inflate.
+ inflateMark returns the value noted above or -1 << 16 if the provided
+ source stream state was inconsistent.
+ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
+ gz_headerp head));
+ inflateGetHeader() requests that gzip header information be stored in the
+ provided gz_header structure. inflateGetHeader() may be called after
+ inflateInit2() or inflateReset(), and before the first call of inflate().
+ As inflate() processes the gzip stream, head->done is zero until the header
+ is completed, at which time head->done is set to one. If a zlib stream is
+ being decoded, then head->done is set to -1 to indicate that there will be
+ no gzip header information forthcoming. Note that Z_BLOCK or Z_TREES can be
+ used to force inflate() to return immediately after header processing is
+ complete and before any actual data is decompressed.
+ The text, time, xflags, and os fields are filled in with the gzip header
+ contents. hcrc is set to true if there is a header CRC. (The header CRC
+ was valid if done is set to one.) If extra is not Z_NULL, then extra_max
+ contains the maximum number of bytes to write to extra. Once done is true,
+ extra_len contains the actual extra field length, and extra contains the
+ extra field, or that field truncated if extra_max is less than extra_len.
+ If name is not Z_NULL, then up to name_max characters are written there,
+ terminated with a zero unless the length is greater than name_max. If
+ comment is not Z_NULL, then up to comm_max characters are written there,
+ terminated with a zero unless the length is greater than comm_max. When any
+ of extra, name, or comment are not Z_NULL and the respective field is not
+ present in the header, then that field is set to Z_NULL to signal its
+ absence. This allows the use of deflateSetHeader() with the returned
+ structure to duplicate the header. However if those fields are set to
+ allocated memory, then the application will need to save those pointers
+ elsewhere so that they can be eventually freed.
+ If inflateGetHeader is not used, then the header information is simply
+ discarded. The header is always checked for validity, including the header
+ CRC if present. inflateReset() will reset the process to discard the header
+ information. The application would need to call inflateGetHeader() again to
+ retrieve the header from the next gzip stream.
+ inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window));
+ Initialize the internal stream state for decompression using inflateBack()
+ calls. The fields zalloc, zfree and opaque in strm must be initialized
+ before the call. If zalloc and zfree are Z_NULL, then the default library-
+ derived memory allocation routines are used. windowBits is the base two
+ logarithm of the window size, in the range 8..15. window is a caller
+ supplied buffer of that size. Except for special applications where it is
+ assured that deflate was used with small window sizes, windowBits must be 15
+ and a 32K byte window must be supplied to be able to decompress general
+ deflate streams.
+ See inflateBack() for the usage of these routines.
+ inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+ the parameters are invalid, Z_MEM_ERROR if the internal state could not be
+ allocated, or Z_VERSION_ERROR if the version of the library does not match
+ the version of the header file.
+typedef unsigned (*in_func) OF((void FAR *,
+ z_const unsigned char FAR * FAR *));
+typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
+ in_func in, void FAR *in_desc,
+ out_func out, void FAR *out_desc));
+ inflateBack() does a raw inflate with a single call using a call-back
+ interface for input and output. This is potentially more efficient than
+ inflate() for file i/o applications, in that it avoids copying between the
+ output and the sliding window by simply making the window itself the output
+ buffer. inflate() can be faster on modern CPUs when used with large
+ buffers. inflateBack() trusts the application to not change the output
+ buffer passed by the output function, at least until inflateBack() returns.
+ inflateBackInit() must be called first to allocate the internal state
+ and to initialize the state with the user-provided window buffer.
+ inflateBack() may then be used multiple times to inflate a complete, raw
+ deflate stream with each call. inflateBackEnd() is then called to free the
+ allocated state.
+ A raw deflate stream is one with no zlib or gzip header or trailer.
+ This routine would normally be used in a utility that reads zip or gzip
+ files and writes out uncompressed files. The utility would decode the
+ header and process the trailer on its own, hence this routine expects only
+ the raw deflate stream to decompress. This is different from the normal
+ behavior of inflate(), which expects either a zlib or gzip header and
+ trailer around the deflate stream.
+ inflateBack() uses two subroutines supplied by the caller that are then
+ called by inflateBack() for input and output. inflateBack() calls those
+ routines until it reads a complete deflate stream and writes out all of the
+ uncompressed data, or until it encounters an error. The function's
+ parameters and return types are defined above in the in_func and out_func
+ typedefs. inflateBack() will call in(in_desc, &buf) which should return the
+ number of bytes of provided input, and a pointer to that input in buf. If
+ there is no input available, in() must return zero--buf is ignored in that
+ case--and inflateBack() will return a buffer error. inflateBack() will call
+ out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out()
+ should return zero on success, or non-zero on failure. If out() returns
+ non-zero, inflateBack() will return with an error. Neither in() nor out()
+ is permitted to change the contents of the window provided to
+ inflateBackInit(), which is also the buffer that out() uses to write from.
+ The length written by out() will be at most the window size. Any non-zero
+ amount of input may be provided by in().
+ For convenience, inflateBack() can be provided input on the first call by
+ setting strm->next_in and strm->avail_in. If that input is exhausted, then
+ in() will be called. Therefore strm->next_in must be initialized before
+ calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
+ immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
+ must also be initialized, and then if strm->avail_in is not zero, input will
+ initially be taken from strm->next_in[0 .. strm->avail_in - 1].
+ The in_desc and out_desc parameters of inflateBack() are passed as the
+ first parameter of in() and out() respectively when they are called. These
+ descriptors can be optionally used to pass any information that the caller-
+ supplied in() and out() functions need to do their job.
+ On return, inflateBack() will set strm->next_in and strm->avail_in to
+ pass back any unused input that was provided by the last in() call. The
+ return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+ if in() or out() returned an error, Z_DATA_ERROR if there was a format error
+ in the deflate stream (in which case strm->msg is set to indicate the nature
+ of the error), or Z_STREAM_ERROR if the stream was not properly initialized.
+ In the case of Z_BUF_ERROR, an input or output error can be distinguished
+ using strm->next_in which will be Z_NULL only if in() returned an error. If
+ strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning
+ non-zero. (in() will always be called before out(), so strm->next_in is
+ assured to be defined if out() returns non-zero.) Note that inflateBack()
+ cannot return Z_OK.
+ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+ All memory allocated by inflateBackInit() is freed.
+ inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+ state was inconsistent.
+ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+/* Return flags indicating compile-time options.
+ Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+ 1.0: size of uInt
+ 3.2: size of uLong
+ 5.4: size of voidpf (pointer)
+ 7.6: size of z_off_t
+ Compiler, assembler, and debug options:
+ 8: DEBUG
+ 9: ASMV or ASMINF -- use ASM code
+ 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+ 11: 0 (reserved)
+ One-time table building (smaller code, but not thread-safe if true):
+ 12: BUILDFIXED -- build static block decoding tables when needed
+ 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+ 14,15: 0 (reserved)
+ Library content (indicates missing functionality):
+ 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+ deflate code when not needed)
+ 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+ and decode gzip streams (to avoid linking crc code)
+ 18-19: 0 (reserved)
+ Operation variations (changes in library functionality):
+ 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+ 21: FASTEST -- deflate algorithm with only one, lowest compression level
+ 22,23: 0 (reserved)
+ The sprintf variant used by gzprintf (zero is best):
+ 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+ 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+ 26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+ Remainder:
+ 27-31: 0 (reserved)
+ */
+#ifndef Z_SOLO
+ /* utility functions */
+ The following utility functions are implemented on top of the basic
+ stream-oriented functions. To simplify the interface, some default options
+ are assumed (compression level and memory usage, standard memory allocation
+ functions). The source code of these utility functions can be modified if
+ you need special options.
+ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+ Compresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total size
+ of the destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+ compress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer.
+ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen,
+ int level));
+ Compresses the source buffer into the destination buffer. The level
+ parameter has the same meaning as in deflateInit. sourceLen is the byte
+ length of the source buffer. Upon entry, destLen is the total size of the
+ destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressed buffer.
+ compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ Z_STREAM_ERROR if the level parameter is invalid.
+ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+ compressBound() returns an upper bound on the compressed size after
+ compress() or compress2() on sourceLen bytes. It would be used before a
+ compress() or compress2() call to allocate the destination buffer.
+ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
+ const Bytef *source, uLong sourceLen));
+ Decompresses the source buffer into the destination buffer. sourceLen is
+ the byte length of the source buffer. Upon entry, destLen is the total size
+ of the destination buffer, which must be large enough to hold the entire
+ uncompressed data. (The size of the uncompressed data must have been saved
+ previously by the compressor and transmitted to the decompressor by some
+ mechanism outside the scope of this compression library.) Upon exit, destLen
+ is the actual size of the uncompressed buffer.
+ uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ enough memory, Z_BUF_ERROR if there was not enough room in the output
+ buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. In
+ the case where there is not enough room, uncompress() will fill the output
+ buffer with the uncompressed data up to that point.
+ /* gzip file access functions */
+ This library supports reading and writing files in gzip (.gz) format with
+ an interface similar to that of stdio, using the functions that start with
+ "gz". The gzip format is different from the zlib format. The gzip format
+ is a wrapper, documented in RFC 1952, around a deflate stream.
+typedef struct gzFile_s *gzFile; /* semi-opaque gzip file descriptor */
+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+ Opens a gzip (.gz) file for reading or writing. The mode parameter is as
+ in fopen ("rb" or "wb") but can also include a compression level ("wb9") or
+ a strategy: 'f' for filtered data as in "wb6f", 'h' for Huffman-only
+ compression as in "wb1h", 'R' for run-length encoding as in "wb1R", or 'F'
+ for fixed code compression as in "wb9F". (See the description of
+ deflateInit2 for more information about the strategy parameter.) 'T' will
+ request transparent writing or appending with no compression and not using
+ the gzip format.
+ "a" can be used instead of "w" to request that the gzip stream that will
+ be written be appended to the file. "+" will result in an error, since
+ reading and writing to the same gzip file is not supported. The addition of
+ "x" when writing will create the file exclusively, which fails if the file
+ already exists. On systems that support it, the addition of "e" when
+ reading or writing will set the flag to close the file on an execve() call.
+ These functions, as well as gzip, will read and decode a sequence of gzip
+ streams in a file. The append function of gzopen() can be used to create
+ such a file. (Also see gzflush() for another way to do this.) When
+ appending, gzopen does not test whether the file begins with a gzip stream,
+ nor does it look for the end of the gzip streams to begin appending. gzopen
+ will simply append a gzip stream to the existing file.
+ gzopen can be used to read a file which is not in gzip format; in this
+ case gzread will directly read from the file without decompression. When
+ reading, this will be detected automatically by looking for the magic two-
+ byte gzip header.
+ gzopen returns NULL if the file could not be opened, if there was
+ insufficient memory to allocate the gzFile state, or if an invalid mode was
+ specified (an 'r', 'w', or 'a' was not provided, or '+' was provided).
+ errno can be checked to determine if the reason gzopen failed was that the
+ file could not be opened.
+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+ gzdopen associates a gzFile with the file descriptor fd. File descriptors
+ are obtained from calls like open, dup, creat, pipe or fileno (if the file
+ has been previously opened with fopen). The mode parameter is as in gzopen.
+ The next call of gzclose on the returned gzFile will also close the file
+ descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor
+ fd. If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd,
+ mode);. The duplicated descriptor should be saved to avoid a leak, since
+ gzdopen does not close fd if it fails. If you are using fileno() to get the
+ file descriptor from a FILE *, then you will have to use dup() to avoid
+ double-close()ing the file descriptor. Both gzclose() and fclose() will
+ close the associated file descriptor, so they need to have different file
+ descriptors.
+ gzdopen returns NULL if there was insufficient memory to allocate the
+ gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not
+ provided, or '+' was provided), or if fd is -1. The file descriptor is not
+ used until the next gz* read, write, seek, or close operation, so gzdopen
+ will not detect if fd is invalid (unless fd is -1).
+ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size));
+ Set the internal buffer size used by this library's functions. The
+ default buffer size is 8192 bytes. This function must be called after
+ gzopen() or gzdopen(), and before any other calls that read or write the
+ file. The buffer memory allocation is always deferred to the first read or
+ write. Two buffers are allocated, either both of the specified size when
+ writing, or one of the specified size and the other twice that size when
+ reading. A larger buffer size of, for example, 64K or 128K bytes will
+ noticeably increase the speed of decompression (reading).
+ The new buffer size also affects the maximum length for gzprintf().
+ gzbuffer() returns 0 on success, or -1 on failure, such as being called
+ too late.
+ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+ Dynamically update the compression level or strategy. See the description
+ of deflateInit2 for the meaning of these parameters.
+ gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
+ opened for writing.
+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+ Reads the given number of uncompressed bytes from the compressed file. If
+ the input file is not in gzip format, gzread copies the given number of
+ bytes into the buffer directly from the file.
+ After reaching the end of a gzip stream in the input, gzread will continue
+ to read, looking for another gzip stream. Any number of gzip streams may be
+ concatenated in the input file, and will all be decompressed by gzread().
+ If something other than a gzip stream is encountered after a gzip stream,
+ that remaining trailing garbage is ignored (and no error is returned).
+ gzread can be used to read a gzip file that is being concurrently written.
+ Upon reaching the end of the input, gzread will return with the available
+ data. If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then
+ gzclearerr can be used to clear the end of file indicator in order to permit
+ gzread to be tried again. Z_OK indicates that a gzip stream was completed
+ on the last gzread. Z_BUF_ERROR indicates that the input file ended in the
+ middle of a gzip stream. Note that gzread does not return -1 in the event
+ of an incomplete gzip stream. This error is deferred until gzclose(), which
+ will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip
+ stream. Alternatively, gzerror can be used before gzclose to detect this
+ case.
+ gzread returns the number of uncompressed bytes actually read, less than
+ len for end of file, or -1 for error.
+ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
+ voidpc buf, unsigned len));
+ Writes the given number of uncompressed bytes into the compressed file.
+ gzwrite returns the number of uncompressed bytes written or 0 in case of
+ error.
+ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...));
+ Converts, formats, and writes the arguments to the compressed file under
+ control of the format string, as in fprintf. gzprintf returns the number of
+ uncompressed bytes actually written, or 0 in case of error. The number of
+ uncompressed bytes written is limited to 8191, or one less than the buffer
+ size given to gzbuffer(). The caller should ensure that this limit is not
+ exceeded. If it is exceeded, then gzprintf() will return an error (0) with
+ nothing written. In this case, there may also be a buffer overflow with
+ unpredictable consequences, which is possible only if zlib was compiled with
+ the insecure functions sprintf() or vsprintf() because the secure snprintf()
+ or vsnprintf() functions were not available. This can be determined using
+ zlibCompileFlags().
+ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+ Writes the given null-terminated string to the compressed file, excluding
+ the terminating null character.
+ gzputs returns the number of characters written, or -1 in case of error.
+ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+ Reads bytes from the compressed file until len-1 characters are read, or a
+ newline character is read and transferred to buf, or an end-of-file
+ condition is encountered. If any characters are read or if len == 1, the
+ string is terminated with a null character. If no characters are read due
+ to an end-of-file or len < 1, then the buffer is left untouched.
+ gzgets returns buf which is a null-terminated string, or it returns NULL
+ for end-of-file or in case of error. If there was an error, the contents at
+ buf are indeterminate.
+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+ Writes c, converted to an unsigned char, into the compressed file. gzputc
+ returns the value that was written, or -1 in case of error.
+ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+ Reads one byte from the compressed file. gzgetc returns this byte or -1
+ in case of end of file or error. This is implemented as a macro for speed.
+ As such, it does not do all of the checking the other functions do. I.e.
+ it does not check to see if file is NULL, nor whether the structure file
+ points to has been clobbered or not.
+ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
+ Push one character back onto the stream to be read as the first character
+ on the next read. At least one character of push-back is allowed.
+ gzungetc() returns the character pushed, or -1 on failure. gzungetc() will
+ fail if c is -1, and may fail if a character has been pushed but not read
+ yet. If gzungetc is used immediately after gzopen or gzdopen, at least the
+ output buffer size of pushed characters is allowed. (See gzbuffer above.)
+ The pushed character will be discarded if the stream is repositioned with
+ gzseek() or gzrewind().
+ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
+ Flushes all pending output into the compressed file. The parameter flush
+ is as in the deflate() function. The return value is the zlib error number
+ (see function gzerror below). gzflush is only permitted when writing.
+ If the flush parameter is Z_FINISH, the remaining data is written and the
+ gzip stream is completed in the output. If gzwrite() is called again, a new
+ gzip stream will be started in the output. gzread() is able to read such
+ concatenated gzip streams.
+ gzflush should be called only when strictly necessary because it will
+ degrade compression if called too often.
+ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
+ z_off_t offset, int whence));
+ Sets the starting position for the next gzread or gzwrite on the given
+ compressed file. The offset represents a number of bytes in the
+ uncompressed data stream. The whence parameter is defined as in lseek(2);
+ the value SEEK_END is not supported.
+ If the file is opened for reading, this function is emulated but can be
+ extremely slow. If the file is opened for writing, only forward seeks are
+ supported; gzseek then compresses a sequence of zeroes up to the new
+ starting position.
+ gzseek returns the resulting offset location as measured in bytes from
+ the beginning of the uncompressed stream, or -1 in case of error, in
+ particular if the file is opened for writing and the new starting position
+ would be before the current position.
+ZEXTERN int ZEXPORT gzrewind OF((gzFile file));
+ Rewinds the given file. This function is supported only for reading.
+ gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
+ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file));
+ Returns the starting position for the next gzread or gzwrite on the given
+ compressed file. This position represents a number of bytes in the
+ uncompressed data stream, and is zero when starting, even if appending or
+ reading a gzip stream from the middle of a file using gzdopen().
+ gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file));
+ Returns the current offset in the file being read or written. This offset
+ includes the count of bytes that precede the gzip stream, for example when
+ appending or when using gzdopen() for reading. When reading, the offset
+ does not include as yet unused buffered input. This information can be used
+ for a progress indicator. On error, gzoffset() returns -1.
+ZEXTERN int ZEXPORT gzeof OF((gzFile file));
+ Returns true (1) if the end-of-file indicator has been set while reading,
+ false (0) otherwise. Note that the end-of-file indicator is set only if the
+ read tried to go past the end of the input, but came up short. Therefore,
+ just like feof(), gzeof() may return false even if there is no more data to
+ read, in the event that the last read request was for the exact number of
+ bytes remaining in the input file. This will happen if the input file size
+ is an exact multiple of the buffer size.
+ If gzeof() returns true, then the read functions will return no more data,
+ unless the end-of-file indicator is reset by gzclearerr() and the input file
+ has grown since the previous end of file was detected.
+ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
+ Returns true (1) if file is being copied directly while reading, or false
+ (0) if file is a gzip stream being decompressed.
+ If the input file is empty, gzdirect() will return true, since the input
+ does not contain a gzip stream.
+ If gzdirect() is used immediately after gzopen() or gzdopen() it will
+ cause buffers to be allocated to allow reading the file to determine if it
+ is a gzip file. Therefore if gzbuffer() is used, it should be called before
+ gzdirect().
+ When writing, gzdirect() returns true (1) if transparent writing was
+ requested ("wT" for the gzopen() mode), or false (0) otherwise. (Note:
+ gzdirect() is not needed when writing. Transparent writing must be
+ explicitly requested, so the application already knows the answer. When
+ linking statically, using gzdirect() will include all of the zlib code for
+ gzip file reading and decompression, which may not be desired.)
+ZEXTERN int ZEXPORT gzclose OF((gzFile file));
+ Flushes all pending output if necessary, closes the compressed file and
+ deallocates the (de)compression state. Note that once file is closed, you
+ cannot call gzerror with file, since its structures have been deallocated.
+ gzclose must not be called more than once on the same file, just as free
+ must not be called more than once on the same allocation.
+ gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a
+ file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the
+ last read ended in the middle of a gzip stream, or Z_OK on success.
+ZEXTERN int ZEXPORT gzclose_r OF((gzFile file));
+ZEXTERN int ZEXPORT gzclose_w OF((gzFile file));
+ Same as gzclose(), but gzclose_r() is only for use when reading, and
+ gzclose_w() is only for use when writing or appending. The advantage to
+ using these instead of gzclose() is that they avoid linking in zlib
+ compression or decompression code that is not used when only reading or only
+ writing respectively. If gzclose() is used, then both compression and
+ decompression code will be included in the application when linking to a static
+ zlib library.
+ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
+ Returns the error message for the last error which occurred on the given
+ compressed file. errnum is set to zlib error number. If an error occurred
+ in the file system and not in the compression library, errnum is set to
+ Z_ERRNO and the application may consult errno to get the exact error code.
+ The application must not modify the returned string. Future calls to
+ this function may invalidate the previously returned string. If file is
+ closed, then the string previously returned by gzerror will no longer be
+ available.
+ gzerror() should be used to distinguish errors from end-of-file for those
+ functions above that do not distinguish those cases in their return values.
+ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
+ Clears the error and end-of-file flags for file. This is analogous to the
+ clearerr() function in stdio. This is useful for continuing to read a gzip
+ file that is being written concurrently.
+#endif /* !Z_SOLO */
+ /* checksum functions */
+ These functions are not related to compression but are exported
+ anyway because they might be useful in applications using the compression
+ library.
+ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+ Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+ return the updated checksum. If buf is Z_NULL, this function returns the
+ required initial value for the checksum.
+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+ much faster.
+ Usage example:
+ uLong adler = adler32(0L, Z_NULL, 0);
+ while (read_buffer(buffer, length) != EOF) {
+ adler = adler32(adler, buffer, length);
+ }
+ if (adler != original_adler) error();
+ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
+ z_off_t len2));
+ Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
+ and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+ each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
+ seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. Note
+ that the z_off_t type (like off_t) is a signed integer. If len2 is
+ negative, the result has no meaning or utility.
+ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
+ Update a running CRC-32 with the bytes buf[0..len-1] and return the
+ updated CRC-32. If buf is Z_NULL, this function returns the required
+ initial value for the crc. Pre- and post-conditioning (one's complement) is
+ performed within this function so it shouldn't be done by the application.
+ Usage example:
+ uLong crc = crc32(0L, Z_NULL, 0);
+ while (read_buffer(buffer, length) != EOF) {
+ crc = crc32(crc, buffer, length);
+ }
+ if (crc != original_crc) error();
+ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+ Combine two CRC-32 check values into one. For two sequences of bytes,
+ seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+ calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
+ check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+ len2.
+ /* various hacks, don't look :) */
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method,
+ int windowBits, int memLevel,
+ int strategy, const char *version,
+ int stream_size));
+ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits,
+ const char *version, int stream_size));
+ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
+ unsigned char FAR *window,
+ const char *version,
+ int stream_size));
+#define deflateInit(strm, level) \
+ deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
+#define inflateInit(strm) \
+ inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+ deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+ (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
+#define inflateInit2(strm, windowBits) \
+ inflateInit2_((strm), (windowBits), ZLIB_VERSION, \
+ (int)sizeof(z_stream))
+#define inflateBackInit(strm, windowBits, window) \
+ inflateBackInit_((strm), (windowBits), (window), \
+ ZLIB_VERSION, (int)sizeof(z_stream))
+#ifndef Z_SOLO
+/* gzgetc() macro and its supporting function and exposed data structure. Note
+ * that the real internal state is much larger than the exposed structure.
+ * This abbreviated structure exposes just enough for the gzgetc() macro. The
+ * user should not mess with these exposed elements, since their names or
+ * behavior could change in the future, perhaps even capriciously. They can
+ * only be used by the gzgetc() macro. You have been warned.
+ */
+struct gzFile_s {
+ unsigned have;
+ unsigned char *next;
+ z_off64_t pos;
+ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */
+#ifdef Z_PREFIX_SET
+# undef z_gzgetc
+# define z_gzgetc(g) \
+ ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g))
+# define gzgetc(g) \
+ ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g))
+/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or
+ * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if
+ * both are true, the application gets the *64 functions, and the regular
+ * functions are changed to 64 bits) -- in case these are set on systems
+ * without large file support, _LFS64_LARGEFILE must also be true
+ */
+#ifdef Z_LARGE64
+ ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+ ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
+ ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
+ ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t));
+ ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t));
+#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64)
+# ifdef Z_PREFIX_SET
+# define z_gzopen z_gzopen64
+# define z_gzseek z_gzseek64
+# define z_gztell z_gztell64
+# define z_gzoffset z_gzoffset64
+# define z_adler32_combine z_adler32_combine64
+# define z_crc32_combine z_crc32_combine64
+# else
+# define gzopen gzopen64
+# define gzseek gzseek64
+# define gztell gztell64
+# define gzoffset gzoffset64
+# define adler32_combine adler32_combine64
+# define crc32_combine crc32_combine64
+# endif
+# ifndef Z_LARGE64
+ ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+ ZEXTERN z_off_t ZEXPORT gzseek64 OF((gzFile, z_off_t, int));
+ ZEXTERN z_off_t ZEXPORT gztell64 OF((gzFile));
+ ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
+# endif
+ ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *));
+ ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile, z_off_t, int));
+ ZEXTERN z_off_t ZEXPORT gztell OF((gzFile));
+ ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+#else /* Z_SOLO */
+ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+#endif /* !Z_SOLO */
+/* hack for buggy compilers */
+#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL)
+ struct internal_state {int dummy;};
+/* undocumented functions */
+ZEXTERN const char * ZEXPORT zError OF((int));
+ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp));
+ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table OF((void));
+ZEXTERN int ZEXPORT inflateUndermine OF((z_streamp, int));
+ZEXTERN int ZEXPORT inflateResetKeep OF((z_streamp));
+ZEXTERN int ZEXPORT deflateResetKeep OF((z_streamp));
+#if defined(_WIN32) && !defined(Z_SOLO)
+ZEXTERN gzFile ZEXPORT gzopen_w OF((const wchar_t *path,
+ const char *mode));
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+# ifndef Z_SOLO
+ZEXTERN int ZEXPORTVA gzvprintf Z_ARG((gzFile file,
+ const char *format,
+ va_list va));
+# endif
+#ifdef __cplusplus
+#endif /* ZLIB_H */
diff --git a/import/zlibstat.lib b/import/zlibstat.lib
new file mode 100644
index 0000000..538e247
Binary files /dev/null and b/import/zlibstat.lib differ
diff --git a/scripts/testhdfs.bat b/scripts/testhdfs.bat
new file mode 100755
index 0000000..005b7bd
--- /dev/null
+++ b/scripts/testhdfs.bat
@@ -0,0 +1,28 @@
+rem This script demonstrates which env variables need to be set under windows
+rem to get SNAP to run with HDFS support.
+rem To run, ensure that JAVA_HOME and HADOOP_HOME are set in your shell.
+rem If not, you can set them here.
+rem set JAVA_HOME=c:\program files\java\jre6
+rem set HADOOP_HOME=c:\hadoop\hadoop-
+rem set LIBHDFS_OPTS=-verbose:jni
+for %%i in (%HADOOP_HOME%\lib\*.jar) do (
+for %%i in (%HADOOP_HOME%\*.jar) do (
+..\obj\bin\Debug\x64\snap paired hdfs:///scratch/moonshot/indices/hg19-20-5 example.bam
diff --git a/snap.sln b/snap.sln
new file mode 100644
index 0000000..9831a82
--- /dev/null
+++ b/snap.sln
@@ -0,0 +1,189 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2013
+VisualStudioVersion = 12.0.21005.1
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "snap", "apps\snap\snap.vcxproj", "{76E127A5-2247-4A10-9DC8-59518C7C6636}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SNAPLib", "SNAPLib\SNAPLib.vcxproj", "{E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DistanceHist", "apps\DistanceHist\DistanceHist.vcxproj", "{029221BC-7BC0-448C-9A68-12D94A25F412}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stringz", "apps\stringz\stringz.vcxproj", "{A587E829-823D-4CA9-9CD4-C563A4617474}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ComputeROC", "apps\ComputeROC\ComputeROC.vcxproj", "{EB694CE8-E805-41A0-9D08-C8BEED857166}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "wc", "apps\wc\wc.vcxproj", "{70D9DA2A-E423-4705-BC71-0198C365A730}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ExtractReads", "apps\ExtractReads\ExtractReads.vcxproj", "{FD52DC05-194B-4BD8-828E-A5679777D6C8}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ToFASTQ", "apps\ToFASTQ\ToFASTQ.vcxproj", "{E58705DB-9DFF-4892-B3BD-C76A352F9F37}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RandomizePIfastq", "apps\RandomizePIfastq\RandomizePIfastq.vcxproj", "{1C797460-0410-4DE3-AA1A-8ECA471BA8C4}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tests", "tests\tests.vcxproj", "{CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SNAPCommand", "apps\SNAPCommand\SNAPCommand.vcxproj", "{F555A574-597E-4C0E-ADFD-FC4C897B2085}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC} = {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}
+ EndProjectSection
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Debug|Mixed Platforms = Debug|Mixed Platforms
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Any CPU = Release|Any CPU
+ Release|Mixed Platforms = Release|Mixed Platforms
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Debug|Win32.ActiveCfg = Debug|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Debug|Win32.Build.0 = Debug|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Debug|x64.ActiveCfg = Debug|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Debug|x64.Build.0 = Debug|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Release|Any CPU.ActiveCfg = Release|Win32
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Release|Mixed Platforms.Build.0 = Release|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Release|Win32.ActiveCfg = Release|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Release|Win32.Build.0 = Release|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Release|x64.ActiveCfg = Release|x64
+ {76E127A5-2247-4A10-9DC8-59518C7C6636}.Release|x64.Build.0 = Release|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Debug|Win32.ActiveCfg = Debug|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Debug|Win32.Build.0 = Debug|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Debug|x64.ActiveCfg = Debug|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Debug|x64.Build.0 = Debug|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Release|Any CPU.ActiveCfg = Release|Win32
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Release|Mixed Platforms.Build.0 = Release|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Release|Win32.ActiveCfg = Release|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Release|Win32.Build.0 = Release|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Release|x64.ActiveCfg = Release|x64
+ {E620DC13-195C-41EF-B33B-8FE7DE9F8ADC}.Release|x64.Build.0 = Release|x64
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Debug|Win32.ActiveCfg = Debug|x64
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Debug|x64.ActiveCfg = Debug|x64
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Release|Any CPU.ActiveCfg = Release|Win32
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Release|Win32.ActiveCfg = Release|x64
+ {029221BC-7BC0-448C-9A68-12D94A25F412}.Release|x64.ActiveCfg = Release|x64
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Debug|Win32.ActiveCfg = Debug|x64
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Debug|x64.ActiveCfg = Debug|x64
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Release|Any CPU.ActiveCfg = Release|Win32
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Release|Win32.ActiveCfg = Release|x64
+ {A587E829-823D-4CA9-9CD4-C563A4617474}.Release|x64.ActiveCfg = Release|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Debug|Win32.ActiveCfg = Debug|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Debug|Win32.Build.0 = Debug|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Debug|x64.ActiveCfg = Debug|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Release|Any CPU.ActiveCfg = Release|Win32
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Release|Win32.ActiveCfg = Release|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Release|Win32.Build.0 = Release|x64
+ {EB694CE8-E805-41A0-9D08-C8BEED857166}.Release|x64.ActiveCfg = Release|x64
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Debug|Win32.ActiveCfg = Debug|x64
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Debug|x64.ActiveCfg = Debug|x64
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Release|Any CPU.ActiveCfg = Release|Win32
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Release|Win32.ActiveCfg = Release|x64
+ {70D9DA2A-E423-4705-BC71-0198C365A730}.Release|x64.ActiveCfg = Release|x64
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Debug|Win32.Build.0 = Debug|Win32
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Debug|x64.ActiveCfg = Debug|x64
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Release|Any CPU.ActiveCfg = Release|Win32
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Release|Win32.ActiveCfg = Release|Win32
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Release|Win32.Build.0 = Release|Win32
+ {FD52DC05-194B-4BD8-828E-A5679777D6C8}.Release|x64.ActiveCfg = Release|x64
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Debug|Win32.Build.0 = Debug|Win32
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Debug|x64.ActiveCfg = Debug|x64
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Release|Any CPU.ActiveCfg = Release|Win32
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Release|Win32.ActiveCfg = Release|Win32
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Release|Win32.Build.0 = Release|Win32
+ {E58705DB-9DFF-4892-B3BD-C76A352F9F37}.Release|x64.ActiveCfg = Release|x64
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Debug|Win32.ActiveCfg = Debug|Win32
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Debug|Win32.Build.0 = Debug|Win32
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Debug|x64.ActiveCfg = Debug|x64
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Release|Any CPU.ActiveCfg = Release|Win32
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Release|Win32.ActiveCfg = Release|Win32
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Release|Win32.Build.0 = Release|Win32
+ {1C797460-0410-4DE3-AA1A-8ECA471BA8C4}.Release|x64.ActiveCfg = Release|x64
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Debug|Win32.ActiveCfg = Debug|Win32
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Debug|Win32.Build.0 = Debug|Win32
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Debug|x64.ActiveCfg = Debug|x64
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Debug|x64.Build.0 = Debug|x64
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Release|Any CPU.ActiveCfg = Release|Win32
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Release|Win32.ActiveCfg = Release|Win32
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Release|Win32.Build.0 = Release|Win32
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Release|x64.ActiveCfg = Release|x64
+ {CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}.Release|x64.Build.0 = Release|x64
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Debug|Win32.ActiveCfg = Debug|Win32
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Debug|Win32.Build.0 = Debug|Win32
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Debug|x64.ActiveCfg = Debug|x64
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Debug|x64.Build.0 = Debug|x64
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Release|Any CPU.ActiveCfg = Release|Win32
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Release|Win32.ActiveCfg = Release|Win32
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Release|Win32.Build.0 = Release|Win32
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Release|x64.ActiveCfg = Release|x64
+ {F555A574-597E-4C0E-ADFD-FC4C897B2085}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 0000000..1269488
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1 @@
diff --git a/tests/EventTest.cpp b/tests/EventTest.cpp
new file mode 100644
index 0000000..fc29042
--- /dev/null
+++ b/tests/EventTest.cpp
@@ -0,0 +1,137 @@
+#include "stdafx.h"
+#include "TestLib.h"
+#include "LandauVishkin.h"
+struct EventTest;
+struct TestContext {
+ EventTest* parent;
+ int index;
+ SingleWaiterObject singleWaiterObject;
+ void init(EventTest* parent, int i);
+struct EventTest {
+ EventObject event;
+ TestContext* contexts;
+ bool bind;
+ volatile int started;
+ volatile int proceeded;
+ void testManyWaiters(int threads, bool bind);
+ void testSingleWaiters(int threads, bool bind);
+void TestContext::init(EventTest* i_parent, int i) {
+ this->parent = i_parent;
+ index = i;
+ CreateSingleWaiterObject(&singleWaiterObject);
+void waitEqual(int value, volatile int* variable, const char* message)
+ for (int i = 0; i < 1000; i++) {
+ if (*variable == value) {
+ return;
+ }
+ SleepForMillis(5);
+ }
+ ASSERT_EQ_M(value, *variable, message);
+void testManyWaitersMain(void* c)
+ TestContext* context = (TestContext*) c;
+ if (context->parent->bind) {
+ BindThreadToProcessor(context->index%GetNumberOfProcessors());
+ }
+ //printf("start thread %d%s\n", context->index, context->parent->bind ? " bind" : "");
+ InterlockedIncrementAndReturnNewValue(&context->parent->started);
+ WaitForEvent(&context->parent->event);
+ //printf("proceed thread %d\n", context->index);
+ InterlockedIncrementAndReturnNewValue(&context->parent->proceeded);
+void EventTest::testManyWaiters(int threads, bool i_bind)
+ //printf("testing many %d threads%s\n", threads, i_bind ? " (bind)" : "");
+ contexts = new TestContext[threads];
+ CreateEventObject(&event);
+ PreventEventWaitersFromProceeding(&event);
+ started = proceeded = 0;
+ bind = i_bind;
+ for (int i = 0; i < threads; i++) {
+ contexts[i].init(this, i);
+ StartNewThread(testManyWaitersMain, &contexts[i]);
+ }
+ char buf[100];
+ sprintf(buf, "many started %d%s\n", threads, i_bind ? " bind" : "");
+ waitEqual(threads, &started, buf);
+ AllowEventWaitersToProceed(&event);
+ sprintf(buf, "many proceeded %d%s\n", threads, i_bind ? " bind" : "");
+ waitEqual(threads, &proceeded, buf);
+ DestroyEventObject(&event);
+TEST_F(EventTest, "many waiters") {
+ for (int i = 0; i < 100; i++) {
+ for (int threads = 1; threads <= 64; threads *= 2) {
+ for (int bind = 0; bind < (threads <= 16 ? 2 : 1); bind++) {
+ testManyWaiters(threads, bind);
+ }
+ }
+ }
+void testSingleWaitersMain(void* c)
+ TestContext* context = (TestContext*) c;
+ if (context->parent->bind) {
+ BindThreadToProcessor(context->index%GetNumberOfProcessors());
+ }
+ //printf("start thread %d%s\n", context->index, context->parent->bind ? " bind" : "");
+ InterlockedIncrementAndReturnNewValue(&context->parent->started);
+ WaitForSingleWaiterObject(&context->singleWaiterObject);
+ //printf("proceed thread %d\n", context->index);
+ InterlockedIncrementAndReturnNewValue(&context->parent->proceeded);
+ ResetSingleWaiterObject(&context->singleWaiterObject);
+void EventTest::testSingleWaiters(int threads, bool i_bind)
+ //printf("testing single %d threads%s\n", threads, i_bind ? " (bind)" : "");
+ contexts = new TestContext[threads];
+ started = proceeded = 0;
+ bind = i_bind;
+ for (int i = 0; i < threads; i++) {
+ contexts[i].init(this, i);
+ StartNewThread(testSingleWaitersMain, &contexts[i]);
+ }
+ //SleepForMillis(50);
+ char buf[100];
+ sprintf(buf, "single started %d%s\n", threads, i_bind ? " bind" : "");
+ waitEqual(threads, &started, buf);
+ for (int i = 0; i < threads; i++) {
+ SignalSingleWaiterObject(&contexts[i].singleWaiterObject);
+ sprintf(buf, "single proceeded %d of %d%s\n", i, threads, i_bind ? " bind" : "");
+ waitEqual(i + 1, &proceeded, buf);
+ //SleepForMillis(10); // allow more threads to release
+ sprintf(buf, "single after proceeded %d of %d%s\n", i, threads, i_bind ? " bind" : "");
+ ASSERT_EQ_M(i + 1, proceeded, buf);
+ }
+ for (int i = 0; i < threads; i++) {
+ DestroySingleWaiterObject(&contexts[i].singleWaiterObject);
+ }
+TEST_F(EventTest, "single waiters") {
+ for (int i = 0; i < 50; i++) {
+ for (int threads = 1; threads <= 64; threads *= 2) {
+ for (int bind = 0; bind < (threads <= 16 ? 2 : 1); bind++) {
+ testSingleWaiters(threads, bind);
+ }
+ }
+ }
diff --git a/tests/LandauVishkinTest.cpp b/tests/LandauVishkinTest.cpp
new file mode 100644
index 0000000..87920b4
--- /dev/null
+++ b/tests/LandauVishkinTest.cpp
@@ -0,0 +1,129 @@
+#include "stdafx.h"
+#include "TestLib.h"
+#include "LandauVishkin.h"
+// Test fixture for all the Landau-Vishkin Tests
+struct LandauVishkinTest {
+ LandauVishkin<> lv;
+ LandauVishkinWithCigar lvc;
+TEST_F(LandauVishkinTest, "equal strings") {
+ ASSERT_EQ(0, lv.computeEditDistance("abcde", 5, "abcde", 5, 2));
+TEST_F(LandauVishkinTest, "prefixes") {
+ ASSERT_EQ(0, lv.computeEditDistance("abcde", 5, "abcd", 4, 2));
+ ASSERT_EQ(0, lv.computeEditDistance("abcde", 5, "abc", 3, 2));
+ ASSERT_EQ(0, lv.computeEditDistance("abcde", 5, "ab", 2, 2));
+TEST_F(LandauVishkinTest, "non-equal strings") {
+ ASSERT_EQ(1, lv.computeEditDistance("abcde", 5, "abcdX", 5, 2));
+ ASSERT_EQ(1, lv.computeEditDistance("abcde", 5, "abde", 4, 2));
+ ASSERT_EQ(1, lv.computeEditDistance("abcde", 5, "bcde", 4, 2));
+ ASSERT_EQ(1, lv.computeEditDistance("abcde", 5, "abcXde", 6, 2));
+ ASSERT_EQ(2, lv.computeEditDistance("abcde", 5, "abXXe", 5, 2));
+ ASSERT_EQ(2, lv.computeEditDistance("abcde", 5, "abcXXde", 7, 2));
+TEST_F(LandauVishkinTest, "overly distant strings") {
+ ASSERT_EQ(-1, lv.computeEditDistance("abcde", 5, "XXXXX", 5, 2));
+TEST_F(LandauVishkinTest, "CIGAR strings") {
+ char cigarBuf[1024];
+ int bufLen = sizeof(cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abcde", 5, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("5=", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abcde", 5, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
+ lvc.computeEditDistance("abcdef", 6, "abcde", 5, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("5=", cigarBuf);
+ lvc.computeEditDistance("abcdef", 6, "abcde", 5, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abcdX", 5, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("4=1X", cigarBuf); // This used to give 4=1I before
+ lvc.computeEditDistance("abcde", 5, "abcdX", 5, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf); // This used to give 4=1I before
+ lvc.computeEditDistance("abcde", 5, "Xbcde", 5, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("1X4=", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "Xbcde", 5, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abde", 4, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("2=1D2=", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abde", 4, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("2M1D2M", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "bcde", 4, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("1D4=", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "bcde", 4, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("1D4M", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abcXde", 6, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("3=1I2=", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abcXde", 6, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("3M1I2M", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abXXe", 5, 2, cigarBuf, bufLen, false);
+ ASSERT_STREQ("2=2X1=", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abXXe", 5, 2, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abcXXde", 7, 3, cigarBuf, bufLen, false);
+ ASSERT_STREQ("3=2I2=", cigarBuf);
+ lvc.computeEditDistance("abcde", 5, "abcXXde", 7, 3, cigarBuf, bufLen, true);
+ ASSERT_STREQ("3M2I2M", cigarBuf);
+ lvc.computeEditDistance("ttttc", 5, "tttc", 4, 3, cigarBuf, bufLen, false);
+ ASSERT_STREQ("3=1X", cigarBuf);
+ lvc.computeEditDistance("ttttc", 5, "tttc", 4, 3, cigarBuf, bufLen, true);
+ ASSERT_STREQ("4M", cigarBuf);
+ lvc.computeEditDistance("tttcc", 5, "ttttc", 5, 3, cigarBuf, bufLen, false);
+ ASSERT_STREQ("3=1X1=", cigarBuf);
+ lvc.computeEditDistance("tttcc", 5, "ttttc", 5, 3, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
+ lvc.computeEditDistance("tttcc", 5, "tttaa", 5, 3, cigarBuf, bufLen, false);
+ ASSERT_STREQ("3=2X", cigarBuf);
+ lvc.computeEditDistance("tttcc", 5, "tttaa", 5, 3, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
+ // A real example where we used to give 1D and later 1I instead of 2X
+ lvc.computeEditDistance("atctcag", 7, "acttcag", 7, 3, cigarBuf, bufLen, false);
+ ASSERT_STREQ("1=2X4=", cigarBuf);
+ lvc.computeEditDistance("atctcag", 7, "acttcag", 7, 3, cigarBuf, bufLen, true);
+ ASSERT_STREQ("7M", cigarBuf);
+ // Edge cases when pattern is longer than available text
+ lvc.computeEditDistance("abc", 3, "abcde", 5, 3, cigarBuf, bufLen, false);
+ ASSERT_STREQ("3=2X", cigarBuf);
+ lvc.computeEditDistance("abc", 3, "abcde", 5, 3, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
+ lvc.computeEditDistance("abc", 3, "abXde", 5, 3, cigarBuf, bufLen, false);
+ ASSERT_STREQ("2=3X", cigarBuf);
+ lvc.computeEditDistance("abc", 3, "abXde", 5, 3, cigarBuf, bufLen, true);
+ ASSERT_STREQ("5M", cigarBuf);
diff --git a/tests/ProbabilityDistanceTest.cpp b/tests/ProbabilityDistanceTest.cpp
new file mode 100644
index 0000000..f0a63f6
--- /dev/null
+++ b/tests/ProbabilityDistanceTest.cpp
@@ -0,0 +1,70 @@
+#include "stdafx.h"
+#include "Compat.h"
+#include "TestLib.h"
+#include "ProbabilityDistance.h"
+// Test fixture for all the ProbabilityDistance tests
+struct ProbabilityDistanceTest {
+ ProbabilityDistance dist;
+ double prob;
+ ProbabilityDistanceTest(): dist(0.1, 0.01, 0.2) {}
+TEST_F(ProbabilityDistanceTest, "basic probabilities") {
+ dist.compute("A", "A", "I", 1, 0, 0, &prob);
+ ASSERT_NEAR(0.9, prob);
+ dist.compute("A", "C", "I", 1, 0, 0, &prob);
+ ASSERT_NEAR(0.1, prob);
+ char quality10[2] = {43, 0};
+ dist.compute("A", "C", quality10, 1, 0, 0, &prob);
+ ASSERT_NEAR(0.19, prob); // 1 - (1 - 0.9) * (1 - 0.9)
+ // Check that allowing a shift at the start doesn't change it
+ dist.compute("A", "A", "I", 1, 1, 2, &prob);
+ ASSERT_NEAR(0.9, prob);
+ dist.compute("A", "C", "I", 1, 1, 2, &prob);
+ ASSERT_NEAR(0.1, prob);
+ dist.compute("A", "C", quality10, 1, 1, 2, &prob);
+ ASSERT_NEAR(0.19, prob); // 1 - (1 - 0.9) * (1 - 0.9)
+ dist.compute("AAAAA", "AAAAA", "IIIII", 5, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 5), prob);
+ dist.compute("AAAAA", "AACAA", "IIIII", 5, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 4) * 0.1, prob);
+TEST_F(ProbabilityDistanceTest, "indels") {
+ dist.compute("ACGTA", "ACGGTA", "IIIIII", 6, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 5) * 0.01, prob);
+ // Here it's better to count things as two substitutions than an indel and two mismatches
+ dist.compute("ACGTA", "ACTA", "IIII", 4, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 2) * pow(0.1, 2), prob);
+ dist.compute("ACGTACGT", "ACGTTACGT", "IIIIIIIII", 9, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 8) * 0.01, prob);
+ dist.compute("ACGTACGT", "ACGACGT", "IIIIIII", 7, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 7) * 0.01, prob);
+ dist.compute("ACGTACGT", "ACTACGT", "IIIIIII", 7, 0, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 7) * 0.01, prob);
+ // Here we can start at shift 1 and get a better probability with substitutions than indels
+ dist.compute("ACGTACGT", "ACTACGT", "IIIIIII", 7, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 5) * pow(0.1, 2), prob);
+ dist.compute("ACGTACGT", "ACGTTTACGT", "IIIIIIIIII", 10, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 8) * 0.01 * 0.2, prob);
+ dist.compute("ACGTTTACGT", "ACGTACGT", "IIIIIIII", 8, 1, 2, &prob);
+ ASSERT_NEAR(pow(0.9, 8) * 0.01 * 0.2, prob);
diff --git a/tests/TestLib.cpp b/tests/TestLib.cpp
new file mode 100644
index 0000000..f96ce4d
--- /dev/null
+++ b/tests/TestLib.cpp
@@ -0,0 +1,43 @@
+#include <iostream>
+#include <cstring>
+#include "TestLib.h"
+using namespace std;
+using namespace test;
+int test::runAllTests(char *filter) {
+ const std::vector<TestCase*> &testCases = TestCase::getCases();
+ int tested = 0;
+ int passed = 0;
+ const char *prevFixture = "";
+ for (int i = 0; i < testCases.size(); i++) {
+ TestCase *tc = testCases[i];
+ if (filter != NULL && strstr(tc->fixture, filter) == NULL && strstr(tc->name, filter) == NULL) {
+ // Test name does not pass filter
+ continue;
+ }
+ tested++;
+ if (strcmp(tc->fixture, prevFixture) != 0) {
+ if (strlen(prevFixture) != 0) {
+ cout << endl;
+ }
+ cout << tc->fixture << ":" << endl;
+ prevFixture = tc->fixture;
+ }
+ cout << "- " << tc->name << ": " << flush;
+ try {
+ tc->run();
+ cout << "[OK]" << endl;
+ passed++;
+ } catch (TestFailedException &e) {
+ cout << "[FAILED]" << endl;
+ cout << " " << e.message << endl;
+ cout << " (" << e.file << ":" << e.line << ")" << endl;
+ }
+ }
+ cout << endl << passed << " / " << tested << " tests passed." << endl;
+ return (passed == tested ? 0 : 1);
diff --git a/tests/TestLib.h b/tests/TestLib.h
new file mode 100644
index 0000000..69065da
--- /dev/null
+++ b/tests/TestLib.h
@@ -0,0 +1,152 @@
+#pragma once
+ * A tiny unit testing library in the spirit of Google Test.
+ *
+ * To implement a standalone test, write:
+ *
+ * TEST("description") { body }
+ *
+ * For fixture-based tests, define a struct Fixture with the fields you want
+ * available (all public) and any setup and teardown code, then use TEST_F:
+ *
+ * struct MyFixture {
+ * int field1;
+ * MyFixture() {} // Optional setup code
+ * ~MyFixture() {} // Optional teardown code
+ * }
+ *
+ * TEST_F(MyFixture, "description") { body }
+ *
+ * TEST_F(MyFixture, "description 2") { another body }
+ *
+ * In the body of a test, you can use the following macros and assertions:
+ *
+ * ASSERT(expression)
+ * ASSERT_M(expression, message)
+ * ASSERT_EQ(expected, actualValue)
+ * ASSERT_NE(expected, actualValue)
+ * ASSERT_STREQ(expected, actualValue) (for C strings)
+ * ASSERT_STRNE(expected, actualValue)
+ * ASSERT_NEAR(expected, actualValue) (for floats/doubles)
+ * FAIL(message)
+ */
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <vector>
+namespace test {
+struct TestCase;
+typedef void (*FunctionPtr)();
+struct TestCase {
+ TestCase(const char *fixture_, const char *name_, FunctionPtr func_)
+ : fixture(fixture_), name(name_), func(func_) {
+ getCases().push_back(this);
+ }
+ void run() { func(); };
+ const char *fixture;
+ const char *name;
+ FunctionPtr func;
+ static std::vector<TestCase*>& getCases() {
+ static std::vector<TestCase*> cases;
+ return cases;
+ };
+struct TestFailedException {
+ TestFailedException(const char *file_, int line_, const std::string& message_)
+ : file(file_), line(line_), message(message_) {}
+ const char *file;
+ int line;
+ std::string message;
+int runAllTests(char *filter);
+#define CONCAT1( x, y ) x ## y
+#define CONCAT2( x, y ) CONCAT1( x, y ) /* To escape weird macro expansion rules */
+#define TEST_FUNC(line) CONCAT2(_test_func_, line)
+#define TEST_CASE(line) CONCAT2(_test_case_, line)
+#define TEST_CLASS(line) CONCAT2(_test_class_, line)
+#define TEST(name) \
+ static void TEST_FUNC(__LINE__) (); \
+ static test::TestCase TEST_CASE(__LINE__) (__FILE__, name, &TEST_FUNC(__LINE__)); \
+ static void TEST_FUNC(__LINE__) () /* body follows */
+#define TEST_F(fixture, name) \
+ namespace { struct TEST_CLASS(__LINE__) : public fixture { void _run(); }; } \
+ static void TEST_FUNC(__LINE__) () { TEST_CLASS(__LINE__) cls; cls._run(); } \
+ static test::TestCase TEST_CASE(__LINE__) (#fixture, name, &TEST_FUNC(__LINE__)); \
+ void TEST_CLASS(__LINE__)::_run() /* body follows */
+#define ASSERT(expr) \
+ if (!(expr)) { \
+ std::ostringstream oss; \
+ oss << "assertion failed: " #expr; \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define ASSERT_M(expr, message) \
+ if (!(expr)) { \
+ std::ostringstream oss; \
+ oss << "assertion failed: " << message; \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define ASSERT_EQ(expected, actual) \
+ if (!((expected) == (actual))) { \
+ std::ostringstream oss; \
+ oss << #actual << " was " << (actual) << ", expected " << (expected); \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define ASSERT_EQ_M(expected, actual, message) \
+ if (!((expected) == (actual))) { \
+ std::ostringstream oss; \
+ oss << #actual << " was " << (actual) << ", expected " << (expected) << ": " << (message); \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define ASSERT_NE(expected, actual) \
+ if (!((expected) != (actual))) { \
+ std::ostringstream oss; \
+ oss << #actual << " was " << (expected); \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define ASSERT_STREQ(expected, actual) \
+ if (strcmp((expected), (actual)) != 0) { \
+ std::ostringstream oss; \
+ oss << #actual << " was \"" << (actual) << "\", expected \"" << (expected) << "\""; \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define ASSERT_STRNE(expected, actual) \
+ if (strcmp((expected), (actual)) == 0) { \
+ std::ostringstream oss; \
+ oss << #actual << " was \"" << (expected) << "\""; \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define ASSERT_NEAR(expected, actual) \
+ if ((expected) < 0.99 * (actual) || (expected) > 1.01 * (actual)) { \
+ std::ostringstream oss; \
+ oss << #actual << " was " << (actual) << ", expected near " << (expected); \
+ throw test::TestFailedException(__FILE__, __LINE__, oss.str()); \
+ }
+#define FAIL(message) \
+ throw test::TestFailedException(__FILE__, __LINE__, message);
diff --git a/tests/bin/ValidateSamFile.jar b/tests/bin/ValidateSamFile.jar
new file mode 100644
index 0000000..eafc722
Binary files /dev/null and b/tests/bin/ValidateSamFile.jar differ
diff --git a/tests/bin/diff.exe b/tests/bin/diff.exe
new file mode 100644
index 0000000..bc4df3f
Binary files /dev/null and b/tests/bin/diff.exe differ
diff --git a/tests/bin/grep.exe b/tests/bin/grep.exe
new file mode 100644
index 0000000..745bb80
Binary files /dev/null and b/tests/bin/grep.exe differ
diff --git a/tests/bin/gzip.exe b/tests/bin/gzip.exe
new file mode 100644
index 0000000..6ff816f
Binary files /dev/null and b/tests/bin/gzip.exe differ
diff --git a/tests/bin/msys-1.0.dll b/tests/bin/msys-1.0.dll
new file mode 100644
index 0000000..4aa4878
Binary files /dev/null and b/tests/bin/msys-1.0.dll differ
diff --git a/tests/bin/samtools.exe b/tests/bin/samtools.exe
new file mode 100644
index 0000000..f425d4f
Binary files /dev/null and b/tests/bin/samtools.exe differ
diff --git a/tests/bin/sort.exe b/tests/bin/sort.exe
new file mode 100644
index 0000000..554ea8b
Binary files /dev/null and b/tests/bin/sort.exe differ
diff --git a/tests/datatest.py b/tests/datatest.py
new file mode 100644
index 0000000..8230421
--- /dev/null
+++ b/tests/datatest.py
@@ -0,0 +1,109 @@
+# datatest.py
+# Run data i/o tests on SNAP
+# There are 3 possibilities for input:
+# FQ |SAM | BAM
+# There are 2 references datatest.fa and datatest2.fa (with an extra refseq)
+# There are 2 possibilities for output:
+# SAM | BAM
+# There are four possible reference files for output
+# FQ | SAM input file (BAM is like SAM)
+# datatest | datatest2 reference file
+# Input & reference files are stored in datatest
+# Temp files are put in datatest/temp
+import sys
+import os
+import shutil
+import subprocess
+if len(sys.argv) != 4:
+ print "usage: %s data_dir snap bin_dir" % sys.argv[0]
+ exit(1)
+data = sys.argv[1]
+snap = sys.argv[2]
+bin = sys.argv[3]
+quick = False
+run = "" # declare global
+def _f(name):
+ return os.path.normpath(data + "/" + name.replace("#", run))
+def _ff(names):
+ return [_f(x) for x in names]
+def runit(args, tag, strict=False, stdout=None, stdin=None):
+ print "> %s" % ' '.join(args)
+ fout = _f(("temp/stdout-%s" % tag)) if stdout == None else stdout
+ ferr = _f("temp/stderr-%s" % tag)
+ retcode = subprocess.call(args, stdout=open(fout, "w"), stderr=open(ferr, "w"))
+ if retcode != 0:
+ print "Run %s exited with %d" % (tag.replace("#", run), retcode)
+ print open(ferr, "r").read(),
+ if strict:
+ exit(1)
+ return False
+ else:
+ return True
+# setup data & temp directories with all needed input files
+temp = os.path.normpath(data + "/temp")
+if not quick:
+ if os.path.exists(temp):
+ shutil.rmtree(temp)
+ os.mkdir(temp)
+ # create indexes
+ runit([snap, "index", _f("datatest.fa"), _f("temp/datatest.idx"), "-c"], "snap-index", strict=True)
+ runit([snap, "index", _f("datatest2.fa"), _f("temp/datatest2.idx"), "-c", "-O500"], "snap-index", strict=True)
+runs = 0
+succeeded = 0
+for input_format in ["fq", "bam", "sam"]:
+ for index in ["datatest", "datatest2"]:
+ for output_format in ["sam", "bam"]:
+ runs += 1
+ temps = [] # temporary files, deleted on success
+ # build & run snap command line
+ outfile = _f("temp/%s-%s.%s" % (input_format, index, output_format))
+ args = [snap, "single", _f("temp/%s.idx" % index), _f("datatest." + input_format), "-t", "1", "-rg", "group1", "-o", outfile]
+ run = "%s-%s-%s" % (input_format, index, output_format)
+ ok = runit(args, "snap-#")
+ temps.append(outfile)
+ if not ok: continue
+ # validate output
+ ok = runit(["java", "-jar", _f("ValidateSamFile.jar"), "input=" + outfile, "output=" + _f("temp/validate-#")], "validate-#")
+ temps.append(_f("temp/validate-#"))
+ if not ok: continue
+ # translate output to sam if needed
+ check_output = outfile
+ if output_format == "bam":
+ check_output = outfile + ".sam"
+ ok = runit([bin + "samtools", "view", "-h", outfile, "-o", check_output], "bam2sam-#")
+ temps.append(check_output)
+ if not ok: continue
+ # compare with reference file
+ # don't yet translate attrs from bam<->sam
+ if input_format == "fq" or input_format == output_format:
+ # remove @PG line from output file
+ checkfile = _f("temp/nopg-#.sam")
+ temps.append(checkfile)
+ ok = runit([bin + "grep", "-v", "@PG", check_output],"grep-#", stdout=checkfile)
+ if not ok: continue
+ reffile = _f("correct-%s-%s.sam" % ((input_format if input_format != "bam" else "sam"), index))
+ ok = runit([bin + "diff", checkfile, reffile], "diff-#")
+ if not ok: continue
+ # delete temp files
+ for f in temps:
+ os.remove(f)
+ succeeded += 1
+print "completed %d runs, %d failures" % (runs, runs - succeeded)
diff --git a/tests/datatest/.gitignore b/tests/datatest/.gitignore
new file mode 100644
index 0000000..3602361
--- /dev/null
+++ b/tests/datatest/.gitignore
@@ -0,0 +1 @@
\ No newline at end of file
diff --git a/tests/datatest/ValidateSamFile.jar b/tests/datatest/ValidateSamFile.jar
new file mode 100644
index 0000000..0a57ffd
Binary files /dev/null and b/tests/datatest/ValidateSamFile.jar differ
diff --git a/tests/datatest/correct-fq-datatest.sam b/tests/datatest/correct-fq-datatest.sam
new file mode 100644
index 0000000..6f7444f
--- /dev/null
+++ b/tests/datatest/correct-fq-datatest.sam
@@ -0,0 +1,5 @@
+@HD VN:1.4 SO:unsorted
+@RG ID:group1 SM:sample
+@SQ SN:ref1 LN:202
+read1 0 ref1 1 70 101= * 0 0 GTCACAAATGCCACAGAGCAAATGGTCCTGAACAAGCAAACAGAACAGGCCCAGAACACGCCAACCTGTTGAAGACAGAAAGTAGCTTCGTGGCCGGGGGG -@4>3.>,;$B;A>@&A<<5:@5A?<6<1,>='=7A99=<;7;61></'3+5(<&5,0)30%/=:(&(842&54-+,578)776;.*,&/538)/%$(1,- RG:Z:group1 PG:Z:SNAP NM:i:0
diff --git a/tests/datatest/correct-fq-datatest2.sam b/tests/datatest/correct-fq-datatest2.sam
new file mode 100644
index 0000000..7b256cc
--- /dev/null
+++ b/tests/datatest/correct-fq-datatest2.sam
@@ -0,0 +1,6 @@
+@HD VN:1.4 SO:unsorted
+@RG ID:group1 SM:sample
+@SQ SN:ref1 LN:202
+@SQ SN:ref2 LN:202
+read1 0 ref1 1 70 101= * 0 0 GTCACAAATGCCACAGAGCAAATGGTCCTGAACAAGCAAACAGAACAGGCCCAGAACACGCCAACCTGTTGAAGACAGAAAGTAGCTTCGTGGCCGGGGGG -@4>3.>,;$B;A>@&A<<5:@5A?<6<1,>='=7A99=<;7;61></'3+5(<&5,0)30%/=:(&(842&54-+,578)776;.*,&/538)/%$(1,- RG:Z:group1 PG:Z:SNAP NM:i:0
diff --git a/tests/datatest/correct-sam-datatest.sam b/tests/datatest/correct-sam-datatest.sam
new file mode 100644
index 0000000..ae95ab5
--- /dev/null
+++ b/tests/datatest/correct-sam-datatest.sam
@@ -0,0 +1,6 @@
+@HD VN:1.4 SO:unsorted
+@SQ SN:ref1 LN:202 SP:random
+@RG ID:group1 SM:sample
+@CO sample input file for testing header and attribute processing
+read1 0 ref1 1 70 101= * 0 0 GTCACAAATGCCACAGAGCAAATGGTCCTGAACAAGCAAACAGAACAGGCCCAGAACACGCCAACCTGTTGAAGACAGAAAGTAGCTTCGTGGCCGGGGGG -@4>3.>,;$B;A>@&A<<5:@5A?<6<1,>='=7A99=<;7;61></'3+5(<&5,0)30%/=:(&(842&54-+,578)776;.*,&/538)/%$(1,- X0:Z:value0 X1:Z:value1 RG:Z:group1 PG:Z:SNAP NM:i:0
diff --git a/tests/datatest/correct-sam-datatest2.sam b/tests/datatest/correct-sam-datatest2.sam
new file mode 100644
index 0000000..ed62f19
--- /dev/null
+++ b/tests/datatest/correct-sam-datatest2.sam
@@ -0,0 +1,7 @@
+@HD VN:1.4 SO:unsorted
+@RG ID:group1 SM:sample
+@CO sample input file for testing header and attribute processing
+@SQ SN:ref1 LN:202
+@SQ SN:ref2 LN:202
+read1 0 ref1 1 70 101= * 0 0 GTCACAAATGCCACAGAGCAAATGGTCCTGAACAAGCAAACAGAACAGGCCCAGAACACGCCAACCTGTTGAAGACAGAAAGTAGCTTCGTGGCCGGGGGG -@4>3.>,;$B;A>@&A<<5:@5A?<6<1,>='=7A99=<;7;61></'3+5(<&5,0)30%/=:(&(842&54-+,578)776;.*,&/538)/%$(1,- X0:Z:value0 X1:Z:value1 RG:Z:group1 PG:Z:SNAP NM:i:0
diff --git a/tests/datatest/datatest.bam b/tests/datatest/datatest.bam
new file mode 100644
index 0000000..15780f5
Binary files /dev/null and b/tests/datatest/datatest.bam differ
diff --git a/tests/datatest/datatest.fa b/tests/datatest/datatest.fa
new file mode 100644
index 0000000..c17ea31
--- /dev/null
+++ b/tests/datatest/datatest.fa
@@ -0,0 +1,3 @@
diff --git a/tests/datatest/datatest.fq b/tests/datatest/datatest.fq
new file mode 100644
index 0000000..5d5ac24
--- /dev/null
+++ b/tests/datatest/datatest.fq
@@ -0,0 +1,8 @@
+@read1
+-@4>3.>,;$B;A>@&A<<5:@5A?<6<1,>='=7A99=<;7;61></'3+5(<&5,0)30%/=:(&(842&54-+,578)776;.*,&/538)/%$(1,-
+@read1
+9;6@;:>:2(5.293?+,72$78:974?C>382;A?=:83;96:AB1>=D@4A;C=AD+<E9=;CBC$<EB2@A;BF=EE.FA5>ECE(%FFCDBB1A??F
diff --git a/tests/datatest/datatest.sam b/tests/datatest/datatest.sam
new file mode 100644
index 0000000..f8262fa
--- /dev/null
+++ b/tests/datatest/datatest.sam
@@ -0,0 +1,6 @@
+@HD VN:1.4 SO:coordinate
+@SQ SN:ref1 LN:202 SP:random
+@RG ID:group1 SM:sample
+@CO sample input file for testing header and attribute processing
+read1 99 ref1 3908 69 15=1X42=1X2=1X31=1X2=2X3= = 4442 635 GTCACAAATGCCACAGAGCAAATGGTCCTGAACAAGCAAACAGAACAGGCCCAGAACACGCCAACCTGTTGAAGACAGAAAGTAGCTTCGTGGCCGGGGGG -@4>3.>,;$B;A>@&A<<5:@5A?<6<1,>='=7A99=<;7;61></'3+5(<&5,0)30%/=:(&(842&54-+,578)776;.*,&/538)/%$(1,- X0:Z:value0 X1:Z:value1 RG:Z:group1
+read1 147 ref1 4442 70 20=1X67=1X12= = 3908 -635 CCACAGCTCTGACTCCTGCATCCTTCTCCTGTGAAGGGGAGGGAGGTGGTGCTGCAGGGGAGGGGAGGGGGCTAGGAGATGTCACTGGGAGCGGAAACGGC 9;6@;:>:2(5.293?+,72$78:974?C>382;A?=:83;96:AB1>=D@4A;C=AD+<E9=;CBC$<EB2@A;BF=EE.FA5>ECE(%FFCDBB1A??F X0:Z:value0 X1:Z:value1
diff --git a/tests/datatest/datatest2.fa b/tests/datatest/datatest2.fa
new file mode 100644
index 0000000..7a961d0
--- /dev/null
+++ b/tests/datatest/datatest2.fa
@@ -0,0 +1,6 @@
diff --git a/tests/dup_reads.py b/tests/dup_reads.py
new file mode 100644
index 0000000..13eb353
--- /dev/null
+++ b/tests/dup_reads.py
@@ -0,0 +1,42 @@
+# dup_reads.py
+# create duplicate reads
+import sys
+import random
+def readread(f):
+ result = [f.readline(),f.readline(),f.readline(),f.readline()]
+ if (result[0] and (result[0][0] != "@" or result[2][0] != "+" or len(result[1]) != len(result[3]))):
+ sys.stderr.write("invalid fasta file near %s" % (result[0]))
+ exit(1)
+ return result
+def writeread(f, r):
+ for i in range(4):
+ f.write(r[i])
+if (len(sys.argv) < 4 or len(sys.argv) > 5):
+ print "usage: %s <# of duplicate reads> <max duplication> read1.fq [read2.fq]" % sys.argv[p]
+ exit(1)
+dupcount = int(sys.argv[1])
+maxdup = int(sys.argv[2])
+in1 = open(sys.argv[3], "r")
+out1 = open("dup_" + sys.argv[3], "w")
+paired = len(sys.argv) >= 5
+if paired:
+ in2 = open(sys.argv[4], "r")
+ out2 = open("dup_" + sys.argv[4], "w")
+for i in range(0, dupcount):
+ r1 = readread(in1)
+ if paired:
+ r2 = readread(in2)
+ ndup = random.randint(2,maxdup)
+ for j in range(0, ndup):
+ writeread(out1, ["@dup%d_%s" % (j, r1[0][1:]), r1[1], r1[2], r1[3]])
+ if paired:
+ writeread(out2, ["@dup%d_%s" % (j, r2[0][1:]), r2[1], r2[2], r2[3]])
diff --git a/tests/filetest.py b/tests/filetest.py
new file mode 100644
index 0000000..e997d3f
--- /dev/null
+++ b/tests/filetest.py
@@ -0,0 +1,175 @@
+# filetest.py
+# Run file i/o tests on SNAP
+# There are 4x2 possibilities for input:
+# FQ | FQZ | SAM | BAM
+# Single | paired
+# There is a compressed reference in data/xx.fa.gz
+# There are 3 input files stored:
+# xx_in1.fq.gz xx_in2.fq.gz xx_in12.bam
+# The others are created from these using gunzip and samtools
+# FQ/FQZ single : in1
+# FQ/FQZ paired : in1 & in2
+# BAM/SAM single/paired : in12
+# There are 2x2 possibilities for output:
+# SAM | BAM
+# Unsorted | sorted
+# These are validated against 4 output files using samtools & diff
+# xx_ref1.bam xx_sorted_ref1.bam xx_ref12.bam xx_sorted_ref12.bam
+# Input & output files are stored in data
+# Temp files are put in data/temp
+import sys
+import os
+import shutil
+import subprocess
+# Command line: data_dir file_base snap bin_dir [-quick]
+if (len(sys.argv) < 5 or len(sys.argv) > 6):
+    print "usage: %s data_dir file_base snap bin_dir [-quick]" % sys.argv[0]
+    exit(1)
+data = sys.argv[1]
+template = sys.argv[2]
+snap = sys.argv[3]
+# Tool paths are built by plain concatenation (bin + "gzip"), so make sure a
+# non-empty directory ends with a path separator.  os.path.join(x, "") appends
+# one only when it is missing and leaves an empty string empty.
+bin = os.path.join(sys.argv[4], "")
+# any sixth argument enables quick mode
+quick = len(sys.argv) == 6
+run = "" # declare global
+def _f(name):
+    # Expand the placeholders in name ("^" -> file base, "#" -> current run id)
+    # and return the normalized path under the data directory.
+    expanded = name.replace("^", template)
+    expanded = expanded.replace("#", run)
+    return os.path.normpath(data + "/" + expanded)
+def _ff(names):
+    # Apply _f to every entry of names and return the resulting list of paths.
+    paths = []
+    for entry in names:
+        paths.append(_f(entry))
+    return paths
+def runit(args, tag, strict=False, stdout=None, stdin=None):
+    """Run an external command and report whether it exited with status 0.
+
+    args   -- command line as a list of strings
+    tag    -- label used to name the captured stdout/stderr files under temp/
+              and in the failure message ("#" is replaced by the current run)
+    strict -- when true, a non-zero exit status terminates the script
+    stdout -- path to receive standard output; defaults to temp/stdout-<tag>
+    stdin  -- path to feed to standard input; None leaves stdin untouched
+
+    Returns True on exit status 0, otherwise prints the captured stderr and
+    returns False (or exits with status 1 when strict is set).
+    """
+    print "$ %s%s%s" % (' '.join(args), "" if stdin is None else " < "+stdin, "" if stdout is None else " > "+stdout)
+    fout = _f(("temp/stdout-%s" % tag)) if stdout is None else stdout
+    ferr = _f("temp/stderr-%s" % tag)
+    # Open the redirection targets explicitly so they are closed (and their
+    # contents flushed) before any later step reads the files.
+    handles = []
+    try:
+        out_handle = open(fout, "w")
+        handles.append(out_handle)
+        err_handle = open(ferr, "w")
+        handles.append(err_handle)
+        in_handle = None
+        if stdin is not None:
+            in_handle = open(stdin, "r")
+            handles.append(in_handle)
+        retcode = subprocess.call(args, stdout=out_handle, stderr=err_handle, stdin=in_handle)
+    finally:
+        for handle in handles:
+            handle.close()
+    if retcode != 0:
+        print "Run %s exited with %d" % (tag.replace("#", run), retcode)
+        with open(ferr, "r") as err_log:
+            print err_log.read(),
+        if strict:
+            exit(1)
+        return False
+    else:
+        return True
+# Prepare data/temp with every derived input and reference file (skipped in quick mode).
+temp = os.path.normpath(data + "/temp")
+if not quick:
+    # start from an empty temp directory
+    if os.path.exists(temp):
+        shutil.rmtree(temp)
+    os.mkdir(temp)
+    # decompress the reference and both FASTQ inputs
+    for packed, step, unpacked in [
+            ("^.fa.gz", "gunzip-ref", "temp/^.fa"),
+            ("^_in1.fq.gz", "gunzip1", "temp/^_in1.fq"),
+            ("^_in2.fq.gz", "gunzip2", "temp/^_in2.fq")]:
+        runit([bin + "gzip", "-d", "-c", _f(packed)], step, strict=True, stdout=_f(unpacked))
+    # SAM copy of the combined BAM input, header included (-h)
+    runit([bin + "samtools", "view", "-h", _f("^_in12.bam"), "-o", _f("temp/^_in12.sam")], "samtools1", strict=True)
+    # header-less SAM versions of the reference outputs; the unsorted
+    # references are additionally line-sorted and their intermediate removed
+    for suffix, view_step, sort_step, sorted_step in [
+            ("1", "samtools2", "namesort-ref1", "samtools3"),
+            ("12", "samtools4", "namesort-ref12", "samtools5")]:
+        unsorted_sam = _f("temp/^_ref%su.sam" % suffix)
+        runit([bin + "samtools", "view", _f("^_ref%s.bam" % suffix), "-o", unsorted_sam], view_step, strict=True)
+        runit([bin + "sort", unsorted_sam], sort_step, stdout=_f("temp/^_ref%s.sam" % suffix), strict=True)
+        os.remove(unsorted_sam)
+        runit([bin + "samtools", "view", _f("^_sorted_ref%s.bam" % suffix), "-o", _f("temp/^_sorted_ref%s.sam" % suffix)], sorted_step, strict=True)
+    # create index
+    runit([snap, "index", _f("temp/^.fa"), _f("temp/^.idx")], "snap-index", strict=True)
+# Input files per format, as [single-end file list, paired-end file list];
+# the inner list is selected with the 0/1 `paired` flag.  "^" is expanded to
+# the file base name by _f.
+inputs = {
+    "fq": [["temp/^_in1.fq"], ["temp/^_in1.fq", "temp/^_in2.fq"]],
+    "fq.gz": [["^_in1.fq.gz"], ["^_in1.fq.gz", "^_in2.fq.gz"]],
+    "sam": [["temp/^_in12.sam"], ["temp/^_in12.sam"]],
+    "bam": [["^_in12.bam"], ["^_in12.bam"]]
+}
+# snap command-line switch naming each format; used when data is passed
+# through stdin/stdout ("-") and the format cannot be taken from a file name.
+formats = {
+    "fq": "-fastq",
+    "fq.gz": "-compressedFastq",
+    "sam": "-sam",
+    "bam": "-bam"
+}
+# Test matrix: one snap invocation per combination of input format,
+# single/paired mode, optional second input format, output format, sorted
+# flag, pipe flag and thread count.  `runs` counts every combination tried,
+# `failed` counts those where snap or the validator exited non-zero.
+# NOTE(review): indentation was flattened in this copy of the patch; confirm
+# block nesting against the upstream file before relying on it.
+runs = 0
+failed = 0
+for input_format in ["fq", "fq.gz", "bam", "sam"]:
+ for paired in [0, 1]:
+ for input_format_2 in ["", "fq", "fq.gz", "bam", "sam"]:
+ # the full set of output formats / sort modes is only exercised for a
+ # subset of inputs; other combinations use unsorted BAM output only
+ test_output_format = ["sam", "bam"] if input_format == "fq" and input_format_2 == "" else ["bam"]
+ test_sorted = [0, 1] if (input_format == "fq" or input_format == "bam") and input_format_2 == "" else [0]
+ for output_format in test_output_format:
+ # note: the loop variable below shadows the builtin sorted()
+ for sorted in test_sorted:
+ test_pipe = [0, 1] if sorted == 0 and (input_format == "bam" or input_format == "sam" or paired == 0) and input_format_2 == "" else [0]
+ for pipe in test_pipe:
+ test_threads = ["1", "4"] if input_format == "fq" and output_format == "bam" and sorted == 1 and pipe == 0 else ["4"]
+ for threads in test_threads:
+ runs += 1
+ temps = [] # temporary files, deleted on success
+ # build & run snap command line
+ args = [snap, ["single", "paired"][paired], _f("temp/^.idx")]
+ args = args + ["-t", threads, "-b", "-="]
+ if pipe == 0:
+ args = args + _ff(inputs[input_format][paired])
+ if input_format_2 != "":
+ args = args + _ff(inputs[input_format_2][paired])
+ else:
+ # pipe mode: name the format explicitly and read from stdin ("-")
+ args = args + [formats[input_format], "-"]
+ if sorted:
+ args.append("-so")
+ # `run` is the module-level id that _f substitutes for "#"
+ run = "%s-%s-%s-%s-%s-%s-%s" % (input_format, input_format_2, ["single", "paired"][paired], output_format, ["unsorted", "sorted"][sorted], threads, ["file", "pipe"][pipe])
+ outfile = _f("temp/output-#." + output_format)
+ if pipe == 0:
+ args = args + ["-o", outfile]
+ else:
+ args = args + ["-o", formats[output_format], "-"]
+ ok = runit(args, "snap-#") if pipe ==0 else runit(args, "snap-#", stdin=_f(inputs[input_format][paired][0]), stdout=outfile)
+ temps.append(outfile)
+ if not ok:
+ failed += 1
+ continue
+ # validate output
+ ok = runit(["java", "-jar", _f("ValidateSamFile.jar"), "input=" + outfile, "output=" + _f("temp/validate-#")], "validate-#")
+ temps.append(_f("temp/validate-#"))
+ if not ok:
+ failed += 1
+ continue
+ # NOTE(review): the constant-false condition disables the code that
+ # follows it; with indentation lost, the exact extent of the disabled
+ # region (presumably through the reference diff) needs confirming.
+ if False:
+ # todo: need more sophisticated diff
+ # remove header, convert to SAM if needed
+ samfile = _f("temp/output-nh-#.sam")
+ ok = runit([bin + "samtools", "view"] + {"bam":[], "sam":["-S"]}[output_format] + [outfile, "-o", samfile], "samtools-view-#")
+ temps.append(samfile)
+ if not ok:
+ failed += 1
+ continue
+ outfile = samfile
+ # sort by name if not sorted by coordinate
+ if sorted == 0:
+ sortfile = _f("temp/output-namesort-#.sam")
+ ok = runit([bin + "sort", outfile], "namesort-#", stdout=sortfile)
+ temps.append(sortfile)
+ if not ok:
+ failed += 1
+ continue
+ outfile = sortfile
+ # compare with reference file
+ use12 = "12" if paired or input_format == "sam" or input_format == "bam" else "1"
+ reffile = _f("temp/^%s_ref%s.sam" % (["", "_sorted"][sorted], use12))
+ ok = runit([bin + "diff", outfile, reffile], "diff-#")
+ if not ok:
+ failed += 1
+ continue
+ # delete temp files
+ # NOTE(review): presumably runs for every successful combination, i.e.
+ # outside the disabled region above - confirm against upstream.
+ for f in temps:
+ os.remove(f)
+print "completed %d runs, %d failures" % (runs, failed)
diff --git a/tests/main.cpp b/tests/main.cpp
new file mode 100644
index 0000000..743505b
--- /dev/null
+++ b/tests/main.cpp
@@ -0,0 +1,7 @@
+#include "TestLib.h"
+// Test-runner entry point: runs the registered tests, optionally restricted
+// by a name filter, and returns the value of test::runAllTests as the
+// process exit code.
+int main(int argc, char **argv) {
+    // Allow passing in a substring to search for in test names;
+    // with any other argument count no filter (NULL) is passed.
+    char *filter = (argc == 2 ? argv[1] : NULL);
+    return test::runAllTests(filter);
+}
diff --git a/tests/tests.vcxproj b/tests/tests.vcxproj
new file mode 100644
index 0000000..44f1735
--- /dev/null
+++ b/tests/tests.vcxproj
@@ -0,0 +1,172 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="EventTest.cpp" />
+ <ClCompile Include="LandauVishkinTest.cpp" />
+ <ClCompile Include="main.cpp" />
+ <ClCompile Include="ProbabilityDistanceTest.cpp" />
+ <ClCompile Include="TestLib.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="TestLib.h" />
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{CC0CF065-B3A9-46E4-829C-9386F8FE0A0E}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>tests</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <PlatformToolset>v120</PlatformToolset>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\test\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\snap\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <OutDir>$(SolutionDir)\obj\bin\$(Configuration)\$(Platform)\</OutDir>
+ <IntDir>$(SolutionDir)\obj\obj\test\$(Configuration)\$(Platform)\</IntDir>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\snaplib\</AdditionalIncludeDirectories>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalDependencies>libhdfs.lib;snaplib.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);zlibstat.lib</AdditionalDependencies>
+ <AdditionalLibraryDirectories>$(SolutionDir)obj\lib\$(Configuration)\$(Platform)\;$(SolutionDir)import</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/tests/tests.vcxproj.filters b/tests/tests.vcxproj.filters
new file mode 100644
index 0000000..aa69803
--- /dev/null
+++ b/tests/tests.vcxproj.filters
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+ <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+ <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+ <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="EventTest.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="LandauVishkinTest.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="main.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="ProbabilityDistanceTest.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="TestLib.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="TestLib.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+</Project>
\ No newline at end of file
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/snap-aligner.git
More information about the debian-med-commit
mailing list